Diffstat (limited to 'numpy/core/src/umath')
35 files changed, 4761 insertions, 3078 deletions
diff --git a/numpy/core/src/umath/_rational_tests.c b/numpy/core/src/umath/_rational_tests.c index bf50a2226..391c5f4e1 100644 --- a/numpy/core/src/umath/_rational_tests.c +++ b/numpy/core/src/umath/_rational_tests.c @@ -49,7 +49,7 @@ set_zero_divide(void) { /* Integer arithmetic utilities */ -static NPY_INLINE npy_int32 +static inline npy_int32 safe_neg(npy_int32 x) { if (x==(npy_int32)1<<31) { set_overflow(); @@ -57,7 +57,7 @@ safe_neg(npy_int32 x) { return -x; } -static NPY_INLINE npy_int32 +static inline npy_int32 safe_abs32(npy_int32 x) { npy_int32 nx; if (x>=0) { @@ -70,7 +70,7 @@ safe_abs32(npy_int32 x) { return nx; } -static NPY_INLINE npy_int64 +static inline npy_int64 safe_abs64(npy_int64 x) { npy_int64 nx; if (x>=0) { @@ -83,7 +83,7 @@ safe_abs64(npy_int64 x) { return nx; } -static NPY_INLINE npy_int64 +static inline npy_int64 gcd(npy_int64 x, npy_int64 y) { x = safe_abs64(x); y = safe_abs64(y); @@ -102,7 +102,7 @@ gcd(npy_int64 x, npy_int64 y) { return x; } -static NPY_INLINE npy_int64 +static inline npy_int64 lcm(npy_int64 x, npy_int64 y) { npy_int64 lcm; if (!x || !y) { @@ -128,7 +128,7 @@ typedef struct { npy_int32 dmm; } rational; -static NPY_INLINE rational +static inline rational make_rational_int(npy_int64 n) { rational r = {(npy_int32)n,0}; if (r.n != n) { @@ -164,7 +164,7 @@ make_rational_slow(npy_int64 n_, npy_int64 d_) { return r; } -static NPY_INLINE npy_int32 +static inline npy_int32 d(rational r) { return r.dmm+1; } @@ -184,7 +184,7 @@ make_rational_fast(npy_int64 n_, npy_int64 d_) { return r; } -static NPY_INLINE rational +static inline rational rational_negative(rational r) { rational x; x.n = safe_neg(r.n); @@ -192,7 +192,7 @@ rational_negative(rational r) { return x; } -static NPY_INLINE rational +static inline rational rational_add(rational x, rational y) { /* * Note that the numerator computation can never overflow int128_t, @@ -202,25 +202,25 @@ rational_add(rational x, rational y) { (npy_int64)d(x)*d(y)); } -static NPY_INLINE rational +static inline rational rational_subtract(rational x, rational y) { /* We're safe from overflow as with + */ return make_rational_fast((npy_int64)x.n*d(y)-(npy_int64)d(x)*y.n, (npy_int64)d(x)*d(y)); } -static NPY_INLINE rational +static inline rational rational_multiply(rational x, rational y) { /* We're safe from overflow as with + */ return make_rational_fast((npy_int64)x.n*y.n,(npy_int64)d(x)*d(y)); } -static NPY_INLINE rational +static inline rational rational_divide(rational x, rational y) { return make_rational_slow((npy_int64)x.n*d(y),(npy_int64)d(x)*y.n); } -static NPY_INLINE npy_int64 +static inline npy_int64 rational_floor(rational x) { /* Always round down */ if (x.n>=0) { @@ -233,18 +233,18 @@ rational_floor(rational x) { return -((-(npy_int64)x.n+d(x)-1)/d(x)); } -static NPY_INLINE npy_int64 +static inline npy_int64 rational_ceil(rational x) { return -rational_floor(rational_negative(x)); } -static NPY_INLINE rational +static inline rational rational_remainder(rational x, rational y) { return rational_subtract(x, rational_multiply(y,make_rational_int( rational_floor(rational_divide(x,y))))); } -static NPY_INLINE rational +static inline rational rational_abs(rational x) { rational y; y.n = safe_abs32(x.n); @@ -252,7 +252,7 @@ rational_abs(rational x) { return y; } -static NPY_INLINE npy_int64 +static inline npy_int64 rational_rint(rational x) { /* * Round towards nearest integer, moving exact half integers towards @@ -262,12 +262,12 @@ rational_rint(rational x) { return 
(2*(npy_int64)x.n+(x.n<0?-d_:d_))/(2*(npy_int64)d_); } -static NPY_INLINE int +static inline int rational_sign(rational x) { return x.n<0?-1:x.n==0?0:1; } -static NPY_INLINE rational +static inline rational rational_inverse(rational x) { rational y = {0}; if (!x.n) { @@ -286,7 +286,7 @@ rational_inverse(rational x) { return y; } -static NPY_INLINE int +static inline int rational_eq(rational x, rational y) { /* * Since we enforce d > 0, and store fractions in reduced form, @@ -295,42 +295,42 @@ rational_eq(rational x, rational y) { return x.n==y.n && x.dmm==y.dmm; } -static NPY_INLINE int +static inline int rational_ne(rational x, rational y) { return !rational_eq(x,y); } -static NPY_INLINE int +static inline int rational_lt(rational x, rational y) { return (npy_int64)x.n*d(y) < (npy_int64)y.n*d(x); } -static NPY_INLINE int +static inline int rational_gt(rational x, rational y) { return rational_lt(y,x); } -static NPY_INLINE int +static inline int rational_le(rational x, rational y) { return !rational_lt(y,x); } -static NPY_INLINE int +static inline int rational_ge(rational x, rational y) { return !rational_lt(x,y); } -static NPY_INLINE npy_int32 +static inline npy_int32 rational_int(rational x) { return x.n/d(x); } -static NPY_INLINE double +static inline double rational_double(rational x) { return (double)x.n/d(x); } -static NPY_INLINE int +static inline int rational_nonzero(rational x) { return x.n!=0; } @@ -367,7 +367,7 @@ typedef struct { static PyTypeObject PyRational_Type; -static NPY_INLINE int +static inline int PyRational_Check(PyObject* object) { return PyObject_IsInstance(object,(PyObject*)&PyRational_Type); } @@ -753,7 +753,7 @@ npyrational_setitem(PyObject* item, void* data, void* arr) { return 0; } -static NPY_INLINE void +static inline void byteswap(npy_int32* x) { char* p = (char*)x; size_t i; @@ -996,7 +996,7 @@ UNARY_UFUNC(reciprocal,rational,rational_inverse(x)) UNARY_UFUNC(numerator,npy_int64,x.n) UNARY_UFUNC(denominator,npy_int64,d(x)) -static NPY_INLINE void +static inline void rational_matrix_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps) { /* pointers to data for input and output arrays */ diff --git a/numpy/core/src/umath/_scaled_float_dtype.c b/numpy/core/src/umath/_scaled_float_dtype.c index a214b32aa..c26ace9f1 100644 --- a/numpy/core/src/umath/_scaled_float_dtype.c +++ b/numpy/core/src/umath/_scaled_float_dtype.c @@ -26,6 +26,14 @@ #include "dispatching.h" +/* TODO: from wrapping_array_method.c, use proper public header eventually */ +NPY_NO_EXPORT int +PyUFunc_AddWrappingLoop(PyObject *ufunc_obj, + PyArray_DTypeMeta *new_dtypes[], PyArray_DTypeMeta *wrapped_dtypes[], + translate_given_descrs_func *translate_given_descrs, + translate_loop_descrs_func *translate_loop_descrs); + + typedef struct { PyArray_Descr base; double scaling; @@ -125,9 +133,9 @@ sfloat_setitem(PyObject *obj, char *data, PyArrayObject *arr) /* Special DType methods and the descr->f slot storage */ NPY_DType_Slots sfloat_slots = { - .default_descr = &sfloat_default_descr, .discover_descr_from_pyobject = &sfloat_discover_from_pyobject, .is_known_scalar_type = &sfloat_is_known_scalar_type, + .default_descr = &sfloat_default_descr, .common_dtype = &sfloat_common_dtype, .common_instance = &sfloat_common_instance, .f = { @@ -136,14 +144,13 @@ NPY_DType_Slots sfloat_slots = { } }; - static PyArray_SFloatDescr SFloatSingleton = {{ - .elsize = sizeof(double), - .alignment = _ALIGN(double), + .byteorder = '|', /* do not bother with byte-swapping... 
*/ .flags = NPY_USE_GETITEM|NPY_USE_SETITEM, .type_num = -1, + .elsize = sizeof(double), + .alignment = _ALIGN(double), .f = &sfloat_slots.f, - .byteorder = '|', /* do not bother with byte-swapping... */ }, .scaling = 1, }; @@ -233,15 +240,15 @@ sfloat_repr(PyArray_SFloatDescr *self) static PyArray_DTypeMeta PyArray_SFloatDType = {{{ PyVarObject_HEAD_INIT(NULL, 0) .tp_name = "numpy._ScaledFloatTestDType", - .tp_methods = sfloat_methods, - .tp_new = sfloat_new, + .tp_basicsize = sizeof(PyArray_SFloatDescr), .tp_repr = (reprfunc)sfloat_repr, .tp_str = (reprfunc)sfloat_repr, - .tp_basicsize = sizeof(PyArray_SFloatDescr), + .tp_methods = sfloat_methods, + .tp_new = sfloat_new, }}, .type_num = -1, .scalar_type = NULL, - .flags = NPY_DT_PARAMETRIC, + .flags = NPY_DT_PARAMETRIC | NPY_DT_NUMERIC, .dt_slots = &sfloat_slots, }; @@ -440,7 +447,7 @@ sfloat_to_bool_resolve_descriptors( static int -init_casts(void) +sfloat_init_casts(void) { PyArray_DTypeMeta *dtypes[2] = {&PyArray_SFloatDType, &PyArray_SFloatDType}; PyType_Slot slots[4] = {{0, NULL}}; @@ -448,11 +455,11 @@ init_casts(void) .name = "sfloat_to_sfloat_cast", .nin = 1, .nout = 1, + /* minimal guaranteed casting */ + .casting = NPY_SAME_KIND_CASTING, .flags = NPY_METH_SUPPORTS_UNALIGNED, .dtypes = dtypes, .slots = slots, - /* minimal guaranteed casting */ - .casting = NPY_SAME_KIND_CASTING, }; slots[0].slot = NPY_METH_resolve_descriptors; @@ -646,13 +653,55 @@ add_sfloats_resolve_descriptors( } +/* + * We define the hypot loop using the "PyUFunc_AddWrappingLoop" API. + * We use this very narrowly for mapping to the double hypot loop currently. + */ static int -add_loop(const char *ufunc_name, - PyArray_DTypeMeta *dtypes[3], PyObject *meth_or_promoter) +translate_given_descrs_to_double( + int nin, int nout, PyArray_DTypeMeta *wrapped_dtypes[], + PyArray_Descr *given_descrs[], PyArray_Descr *new_descrs[]) +{ + assert(nin == 2 && nout == 1); + for (int i = 0; i < 3; i++) { + if (given_descrs[i] == NULL) { + new_descrs[i] = NULL; + } + else { + new_descrs[i] = PyArray_DescrFromType(NPY_DOUBLE); + } + } + return 0; +} + + +static int +translate_loop_descrs( + int nin, int nout, PyArray_DTypeMeta *new_dtypes[], + PyArray_Descr *given_descrs[], + PyArray_Descr *NPY_UNUSED(original_descrs[]), + PyArray_Descr *loop_descrs[]) +{ + assert(nin == 2 && nout == 1); + loop_descrs[0] = sfloat_common_instance( + given_descrs[0], given_descrs[1]); + if (loop_descrs[0] == 0) { + return -1; + } + Py_INCREF(loop_descrs[0]); + loop_descrs[1] = loop_descrs[0]; + Py_INCREF(loop_descrs[0]); + loop_descrs[2] = loop_descrs[0]; + return 0; +} + + +static PyObject * +sfloat_get_ufunc(const char *ufunc_name) { PyObject *mod = PyImport_ImportModule("numpy"); if (mod == NULL) { - return -1; + return NULL; } PyObject *ufunc = PyObject_GetAttrString(mod, ufunc_name); Py_DECREF(mod); @@ -660,6 +709,18 @@ add_loop(const char *ufunc_name, Py_DECREF(ufunc); PyErr_Format(PyExc_TypeError, "numpy.%s was not a ufunc!", ufunc_name); + return NULL; + } + return ufunc; +} + + +static int +sfloat_add_loop(const char *ufunc_name, + PyArray_DTypeMeta *dtypes[3], PyObject *meth_or_promoter) +{ + PyObject *ufunc = sfloat_get_ufunc(ufunc_name); + if (ufunc == NULL) { return -1; } PyObject *dtype_tup = PyArray_TupleFromItems(3, (PyObject **)dtypes, 1); @@ -680,6 +741,24 @@ add_loop(const char *ufunc_name, } +static int +sfloat_add_wrapping_loop(const char *ufunc_name, PyArray_DTypeMeta *dtypes[3]) +{ + PyObject *ufunc = sfloat_get_ufunc(ufunc_name); + if (ufunc == NULL) { + return -1; + } + 
PyArray_DTypeMeta *double_dt = PyArray_DTypeFromTypeNum(NPY_DOUBLE); + PyArray_DTypeMeta *wrapped_dtypes[3] = {double_dt, double_dt, double_dt}; + int res = PyUFunc_AddWrappingLoop( + ufunc, dtypes, wrapped_dtypes, &translate_given_descrs_to_double, + &translate_loop_descrs); + Py_DECREF(ufunc); + Py_DECREF(double_dt); + + return res; +} + /* * We add some very basic promoters to allow multiplying normal and scaled @@ -707,7 +786,7 @@ promote_to_sfloat(PyUFuncObject *NPY_UNUSED(ufunc), * get less so with the introduction of public API). */ static int -init_ufuncs(void) { +sfloat_init_ufuncs(void) { PyArray_DTypeMeta *dtypes[3] = { &PyArray_SFloatDType, &PyArray_SFloatDType, &PyArray_SFloatDType}; PyType_Slot slots[3] = {{0, NULL}}; @@ -728,7 +807,7 @@ init_ufuncs(void) { if (bmeth == NULL) { return -1; } - int res = add_loop("multiply", + int res = sfloat_add_loop("multiply", bmeth->dtypes, (PyObject *)bmeth->method); Py_DECREF(bmeth); if (res < 0) { @@ -746,13 +825,18 @@ init_ufuncs(void) { if (bmeth == NULL) { return -1; } - res = add_loop("add", + res = sfloat_add_loop("add", bmeth->dtypes, (PyObject *)bmeth->method); Py_DECREF(bmeth); if (res < 0) { return -1; } + /* N.B.: Wrapping isn't actually correct if scaling can be negative */ + if (sfloat_add_wrapping_loop("hypot", dtypes) < 0) { + return -1; + } + /* * Add a promoter for both directions of multiply with double. */ @@ -767,14 +851,14 @@ init_ufuncs(void) { if (promoter == NULL) { return -1; } - res = add_loop("multiply", promoter_dtypes, promoter); + res = sfloat_add_loop("multiply", promoter_dtypes, promoter); if (res < 0) { Py_DECREF(promoter); return -1; } promoter_dtypes[0] = double_DType; promoter_dtypes[1] = &PyArray_SFloatDType; - res = add_loop("multiply", promoter_dtypes, promoter); + res = sfloat_add_loop("multiply", promoter_dtypes, promoter); Py_DECREF(promoter); if (res < 0) { return -1; @@ -815,11 +899,11 @@ get_sfloat_dtype(PyObject *NPY_UNUSED(mod), PyObject *NPY_UNUSED(args)) return NULL; } - if (init_casts() < 0) { + if (sfloat_init_casts() < 0) { return NULL; } - if (init_ufuncs() < 0) { + if (sfloat_init_ufuncs() < 0) { return NULL; } diff --git a/numpy/core/src/umath/_umath_tests.c.src b/numpy/core/src/umath/_umath_tests.c.src index 1bf459ce6..b427991e5 100644 --- a/numpy/core/src/umath/_umath_tests.c.src +++ b/numpy/core/src/umath/_umath_tests.c.src @@ -9,6 +9,9 @@ #include <Python.h> #define NPY_NO_DEPRECATED_API NPY_API_VERSION +#if defined(NPY_INTERNAL_BUILD) +#undef NPY_INTERNAL_BUILD +#endif #include "numpy/arrayobject.h" #include "numpy/ufuncobject.h" #include "numpy/npy_math.h" @@ -19,6 +22,9 @@ #include "npy_cpu_features.h" #include "npy_cpu_dispatch.h" #include "numpy/npy_cpu.h" +#include "npy_import.h" +#include "numpy/experimental_dtype_api.h" + /* ***************************************************************************** @@ -300,7 +306,7 @@ static void ptr_this += stride_d; ptr_that += stride_d; } - *(@typ@ *)data_out = npy_@sqrt_func@(out); + *(@typ@ *)data_out = @sqrt_func@(out); data_that += stride_n; data_out += stride_p; } @@ -343,6 +349,50 @@ static void /**end repeat**/ +static int +INT32_negative(PyArrayMethod_Context *NPY_UNUSED(context), + char **args, npy_intp const *dimensions, + npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) +{ + npy_intp di = dimensions[0]; + npy_intp i; + npy_intp is=steps[0], os=steps[1]; + char *ip=args[0], *op=args[1]; + for (i = 0; i < di; i++, ip += is, op += os) { + if (i == 3) { + *(int32_t *)op = - 100; + } else { + *(int32_t *)op = - *(int32_t 
*)ip; + } + } + return 0; +} + + +static int +INT32_negative_indexed(PyArrayMethod_Context *NPY_UNUSED(context), + char * const*args, npy_intp const *dimensions, + npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) +{ + char *ip1 = args[0]; + char *indx = args[1]; + npy_intp is1 = steps[0], isindex = steps[1]; + npy_intp n = dimensions[0]; + npy_intp i; + int32_t *indexed; + for(i = 0; i < n; i++, indx += isindex) { + indexed = (int32_t *)(ip1 + is1 * *(npy_intp *)indx); + if (i == 3) { + *indexed = -200; + } else { + *indexed = - *indexed; + } + } + return 0; +} + + + /* The following lines were generated using a slightly modified version of code_generators/generate_umath.py and adding these lines to defdict: @@ -671,6 +721,43 @@ err: return NULL; } +static int +add_INT32_negative_indexed(PyObject *module, PyObject *dict) { + if (import_experimental_dtype_api(__EXPERIMENTAL_DTYPE_API_VERSION) < 0) { + return -1; + } + + PyObject * negative = PyUFunc_FromFuncAndData(NULL, NULL, NULL, 0, 1, 1, + PyUFunc_Zero, "indexed_negative", NULL, 0); + if (negative == NULL) { + return -1; + } + PyArray_DTypeMeta *dtypes[] = {&PyArray_Int32DType, &PyArray_Int32DType}; + + PyType_Slot slots[] = { + {NPY_METH_contiguous_indexed_loop, INT32_negative_indexed}, + {NPY_METH_strided_loop, INT32_negative}, + {0, NULL} + }; + + PyArrayMethod_Spec spec = { + .name = "negative_indexed_loop", + .nin = 1, + .nout = 1, + .dtypes = dtypes, + .slots = slots, + .flags = NPY_METH_NO_FLOATINGPOINT_ERRORS + }; + + if (PyUFunc_AddLoopFromSpec(negative, &spec) < 0) { + Py_DECREF(negative); + return -1; + } + PyDict_SetItemString(dict, "indexed_negative", negative); + Py_DECREF(negative); + return 0; +} + static PyMethodDef UMath_TestsMethods[] = { {"test_signature", UMath_Tests_test_signature, METH_VARARGS, "Test signature parsing of ufunc. \n" @@ -733,5 +820,13 @@ PyMODINIT_FUNC PyInit__umath_tests(void) { "cannot load _umath_tests module."); return NULL; } + + if (add_INT32_negative_indexed(m, d) < 0) { + Py_DECREF(m); + PyErr_Print(); + PyErr_SetString(PyExc_RuntimeError, + "cannot load _umath_tests module."); + return NULL; + } return m; } diff --git a/numpy/core/src/umath/dispatching.c b/numpy/core/src/umath/dispatching.c index 79de6c3c8..fee77b2d6 100644 --- a/numpy/core/src/umath/dispatching.c +++ b/numpy/core/src/umath/dispatching.c @@ -43,6 +43,7 @@ #include <convert_datatype.h> #include "numpy/ndarraytypes.h" +#include "numpy/npy_3kcompat.h" #include "common.h" #include "dispatching.h" @@ -58,7 +59,7 @@ /* forward declaration */ -static NPY_INLINE PyObject * +static inline PyObject * promote_and_get_info_and_ufuncimpl(PyUFuncObject *ufunc, PyArrayObject *const ops[], PyArray_DTypeMeta *signature[], @@ -667,12 +668,9 @@ legacy_promote_using_legacy_type_resolver(PyUFuncObject *ufunc, Py_DECREF(out_descrs[i]); } /* - * The PyUFunc_SimpleBinaryComparisonTypeResolver has a deprecation - * warning (ignoring `dtype=`) and cannot be cached. - * All datetime ones *should* have a warning, but currently don't, - * but ignore all signature passing also. So they can also - * not be cached, and they mutate the signature which of course is wrong, - * but not doing it would confuse the code later. + * datetime legacy resolvers ignore the signature, which should be + * warn/raise (when used). In such cases, the signature is (incorrectly) + * mutated, and caching is not possible. 
*/ for (int i = 0; i < nargs; i++) { if (signature[i] != NULL && signature[i] != operation_DTypes[i]) { @@ -728,7 +726,7 @@ add_and_return_legacy_wrapping_ufunc_loop(PyUFuncObject *ufunc, * If value-based promotion is necessary, this is handled ahead of time by * `promote_and_get_ufuncimpl`. */ -static NPY_INLINE PyObject * +static inline PyObject * promote_and_get_info_and_ufuncimpl(PyUFuncObject *ufunc, PyArrayObject *const ops[], PyArray_DTypeMeta *signature[], @@ -916,8 +914,8 @@ promote_and_get_ufuncimpl(PyUFuncObject *ufunc, int nin = ufunc->nin, nargs = ufunc->nargs; /* - * Get the actual DTypes we operate with by mixing the operand array - * ones with the passed signature. + * Get the actual DTypes we operate with by setting op_dtypes[i] from + * signature[i]. */ for (int i = 0; i < nargs; i++) { if (signature[i] != NULL) { @@ -950,7 +948,7 @@ promote_and_get_ufuncimpl(PyUFuncObject *ufunc, int cacheable = 1; /* unused, as we modify the original `op_dtypes` */ if (legacy_promote_using_legacy_type_resolver(ufunc, ops, signature, op_dtypes, &cacheable, NPY_FALSE) < 0) { - return NULL; + goto handle_error; } } @@ -962,10 +960,7 @@ promote_and_get_ufuncimpl(PyUFuncObject *ufunc, npy_promotion_state = old_promotion_state; if (info == NULL) { - if (!PyErr_Occurred()) { - raise_no_loop_found_error(ufunc, (PyObject **)op_dtypes); - } - return NULL; + goto handle_error; } PyArrayMethodObject *method = (PyArrayMethodObject *)PyTuple_GET_ITEM(info, 1); @@ -987,7 +982,7 @@ promote_and_get_ufuncimpl(PyUFuncObject *ufunc, /* Reset the promotion state: */ npy_promotion_state = NPY_USE_WEAK_PROMOTION_AND_WARN; if (res < 0) { - return NULL; + goto handle_error; } } @@ -1021,12 +1016,29 @@ promote_and_get_ufuncimpl(PyUFuncObject *ufunc, * If signature is forced the cache may contain an incompatible * loop found via promotion (signature not enforced). Reject it. */ - raise_no_loop_found_error(ufunc, (PyObject **)op_dtypes); - return NULL; + goto handle_error; } } return method; + + handle_error: + /* We only set the "no loop found error here" */ + if (!PyErr_Occurred()) { + raise_no_loop_found_error(ufunc, (PyObject **)op_dtypes); + } + /* + * Otherwise an error occurred, but if the error was DTypePromotionError + * then we chain it, because DTypePromotionError effectively means that there + * is no loop available. (We failed finding a loop by using promotion.) + */ + else if (PyErr_ExceptionMatches(npy_DTypePromotionError)) { + PyObject *err_type = NULL, *err_value = NULL, *err_traceback = NULL; + PyErr_Fetch(&err_type, &err_value, &err_traceback); + raise_no_loop_found_error(ufunc, (PyObject **)op_dtypes); + npy_PyErr_ChainExceptionsCause(err_type, err_value, err_traceback); + } + return NULL; } @@ -1042,13 +1054,6 @@ default_ufunc_promoter(PyUFuncObject *ufunc, PyArray_DTypeMeta *op_dtypes[], PyArray_DTypeMeta *signature[], PyArray_DTypeMeta *new_op_dtypes[]) { - if (ufunc->type_resolver == &PyUFunc_SimpleBinaryComparisonTypeResolver - && signature[0] == NULL && signature[1] == NULL - && signature[2] != NULL && signature[2]->type_num != NPY_BOOL) { - /* bail out, this is _only_ to give future/deprecation warning! */ - return -1; - } - /* If nin < 2 promotion is a no-op, so it should not be registered */ assert(ufunc->nin > 1); if (op_dtypes[0] == NULL) { @@ -1235,3 +1240,40 @@ install_logical_ufunc_promoter(PyObject *ufunc) return PyUFunc_AddLoop((PyUFuncObject *)ufunc, info, 0); } + +/* + * Return the PyArrayMethodObject or PyCapsule that matches a registered + * tuple of identical dtypes. 
Return a borrowed ref of the first match. + */ +NPY_NO_EXPORT PyObject * +get_info_no_cast(PyUFuncObject *ufunc, PyArray_DTypeMeta *op_dtype, + int ndtypes) +{ + PyObject *t_dtypes = PyTuple_New(ndtypes); + if (t_dtypes == NULL) { + return NULL; + } + for (int i=0; i < ndtypes; i++) { + PyTuple_SetItem(t_dtypes, i, (PyObject *)op_dtype); + } + PyObject *loops = ufunc->_loops; + Py_ssize_t length = PyList_Size(loops); + for (Py_ssize_t i = 0; i < length; i++) { + PyObject *item = PyList_GetItem(loops, i); + PyObject *cur_DType_tuple = PyTuple_GetItem(item, 0); + int cmp = PyObject_RichCompareBool(cur_DType_tuple, + t_dtypes, Py_EQ); + if (cmp < 0) { + Py_DECREF(t_dtypes); + return NULL; + } + if (cmp == 0) { + continue; + } + /* Got the match */ + Py_DECREF(t_dtypes); + return PyTuple_GetItem(item, 1); + } + Py_DECREF(t_dtypes); + Py_RETURN_NONE; +} diff --git a/numpy/core/src/umath/fast_loop_macros.h b/numpy/core/src/umath/fast_loop_macros.h index cbd1f04aa..b8c1926b2 100644 --- a/numpy/core/src/umath/fast_loop_macros.h +++ b/numpy/core/src/umath/fast_loop_macros.h @@ -12,6 +12,19 @@ #include <assert.h> +#include "simd/simd.h" + +/* + * largest simd vector size in bytes numpy supports + * it is currently a extremely large value as it is only used for memory + * overlap checks + */ +#if NPY_SIMD > 0 + // Enough for compiler unroll + #define AUTOVEC_OVERLAP_SIZE NPY_SIMD_WIDTH*4 +#else + #define AUTOVEC_OVERLAP_SIZE 1024 +#endif /* * MAX_STEP_SIZE is used to determine if we need to use SIMD version of the ufunc. * Very large step size can be as slow as processing it using scalar. The @@ -27,7 +40,7 @@ */ #define MAX_STEP_SIZE 2097152 -static NPY_INLINE npy_uintp +static inline npy_uintp abs_ptrdiff(char *a, char *b) { return (a > b) ? (a - b) : (b - a); @@ -219,11 +232,11 @@ abs_ptrdiff(char *a, char *b) /* condition allows compiler to optimize the generic macro */ \ if (IS_BINARY_CONT(tin, tout)) { \ if (abs_ptrdiff(args[2], args[0]) == 0 && \ - abs_ptrdiff(args[2], args[1]) >= NPY_MAX_SIMD_SIZE) { \ + abs_ptrdiff(args[2], args[1]) >= AUTOVEC_OVERLAP_SIZE) { \ BASE_BINARY_LOOP_INP(tin, tout, op) \ } \ else if (abs_ptrdiff(args[2], args[1]) == 0 && \ - abs_ptrdiff(args[2], args[0]) >= NPY_MAX_SIMD_SIZE) { \ + abs_ptrdiff(args[2], args[0]) >= AUTOVEC_OVERLAP_SIZE) { \ BASE_BINARY_LOOP_INP(tin, tout, op) \ } \ else { \ @@ -378,19 +391,6 @@ abs_ptrdiff(char *a, char *b) #undef abs_ptrdiff -#define IS_BLOCKABLE_BINARY_BOOL(esize, vsize) \ - (steps[0] == (esize) && steps[0] == steps[1] && steps[2] == (1) && \ - npy_is_aligned(args[1], (esize)) && \ - npy_is_aligned(args[0], (esize))) - -#define IS_BLOCKABLE_BINARY_SCALAR1_BOOL(esize, vsize) \ - (steps[0] == 0 && steps[1] == (esize) && steps[2] == (1) && \ - npy_is_aligned(args[1], (esize))) - -#define IS_BLOCKABLE_BINARY_SCALAR2_BOOL(esize, vsize) \ - (steps[0] == (esize) && steps[1] == 0 && steps[2] == (1) && \ - npy_is_aligned(args[0], (esize))) - /* align var to alignment */ #define LOOP_BLOCK_ALIGN_VAR(var, type, alignment)\ npy_intp i, peel = npy_aligned_block_offset(var, sizeof(type),\ diff --git a/numpy/core/src/umath/legacy_array_method.c b/numpy/core/src/umath/legacy_array_method.c index c3d421d9b..965a0eb83 100644 --- a/numpy/core/src/umath/legacy_array_method.c +++ b/numpy/core/src/umath/legacy_array_method.c @@ -13,10 +13,13 @@ #include "convert_datatype.h" #include "array_method.h" +#include "array_coercion.h" #include "dtype_transfer.h" #include "legacy_array_method.h" #include "dtypemeta.h" +#include "ufunc_object.h" + typedef 
struct { NpyAuxData base; @@ -91,7 +94,6 @@ generic_wrapped_legacy_loop(PyArrayMethod_Context *NPY_UNUSED(context), return 0; } - /* * Signal that the old type-resolution function must be used to resolve * the descriptors (mainly/only used for datetimes due to the unit). @@ -234,6 +236,97 @@ get_wrapped_legacy_ufunc_loop(PyArrayMethod_Context *context, } + +/* + * We can shave off a bit of time by just caching the initial and this is + * trivial for all internal numeric types. (Wrapped ufuncs never use + * byte-swapping.) + */ +static int +copy_cached_initial( + PyArrayMethod_Context *context, npy_bool NPY_UNUSED(reduction_is_empty), + char *initial) +{ + memcpy(initial, context->method->legacy_initial, + context->descriptors[0]->elsize); + return 1; +} + + +/* + * The default `get_reduction_initial` attempts to look up the identity + * from the calling ufunc. This might fail, so we only call it when necessary. + * + * For internal number dtypes, we can easily cache it, so do so after the + * first call by overriding the function with `copy_cache_initial`. + * This path is not publicly available. That could be added, and for a + * custom initial getter it should be static/compile time data anyway. + */ +static int +get_initial_from_ufunc( + PyArrayMethod_Context *context, npy_bool reduction_is_empty, + char *initial) +{ + if (context->caller == NULL + || !PyObject_TypeCheck(context->caller, &PyUFunc_Type)) { + /* Impossible in NumPy 1.24; guard in case it becomes possible. */ + PyErr_SetString(PyExc_ValueError, + "getting initial failed because it can only done for legacy " + "ufunc loops when the ufunc is provided."); + return -1; + } + npy_bool reorderable; + PyObject *identity_obj = PyUFunc_GetDefaultIdentity( + (PyUFuncObject *)context->caller, &reorderable); + if (identity_obj == NULL) { + return -1; + } + if (identity_obj == Py_None) { + /* UFunc has no idenity (should not happen) */ + Py_DECREF(identity_obj); + return 0; + } + if (PyTypeNum_ISUNSIGNED(context->descriptors[1]->type_num) + && PyLong_CheckExact(identity_obj)) { + /* + * This is a bit of a hack until we have truly loop specific + * identities. Python -1 cannot be cast to unsigned so convert + * it to a NumPy scalar, but we use -1 for bitwise functions to + * signal all 1s. + * (A builtin identity would not overflow here, although we may + * unnecessary convert 0 and 1.) + */ + Py_SETREF(identity_obj, PyObject_CallFunctionObjArgs( + (PyObject *)&PyLongArrType_Type, identity_obj, NULL)); + if (identity_obj == NULL) { + return -1; + } + } + else if (context->descriptors[0]->type_num == NPY_OBJECT + && !reduction_is_empty) { + /* Allows `sum([object()])` to work, but use 0 when empty. */ + Py_DECREF(identity_obj); + return 0; + } + + int res = PyArray_Pack(context->descriptors[0], initial, identity_obj); + Py_DECREF(identity_obj); + if (res < 0) { + return -1; + } + + if (PyTypeNum_ISNUMBER(context->descriptors[0]->type_num)) { + /* For numbers we can cache to avoid going via Python ints */ + memcpy(context->method->legacy_initial, initial, + context->descriptors[0]->elsize); + context->method->get_reduction_initial = ©_cached_initial; + } + + /* Reduction can use the initial value */ + return 1; +} + + /* * Get the unbound ArrayMethod which wraps the instances of the ufunc. 
* Note that this function stores the result on the ufunc and then only @@ -273,6 +366,27 @@ PyArray_NewLegacyWrappingArrayMethod(PyUFuncObject *ufunc, flags = _NPY_METH_FORCE_CAST_INPUTS; } + get_reduction_initial_function *get_reduction_intial = NULL; + if (ufunc->nin == 2 && ufunc->nout == 1) { + npy_bool reorderable = NPY_FALSE; + PyObject *identity_obj = PyUFunc_GetDefaultIdentity( + ufunc, &reorderable); + if (identity_obj == NULL) { + return NULL; + } + /* + * TODO: For object, "reorderable" is needed(?), because otherwise + * we disable multi-axis reductions `arr.sum(0, 1)`. But for + * `arr = array([["a", "b"], ["c", "d"]], dtype="object")` + * it isn't actually reorderable (order changes result). + */ + if (reorderable) { + flags |= NPY_METH_IS_REORDERABLE; + } + if (identity_obj != Py_None) { + get_reduction_intial = &get_initial_from_ufunc; + } + } for (int i = 0; i < ufunc->nin+ufunc->nout; i++) { if (signature[i]->singleton->flags & ( NPY_ITEM_REFCOUNT | NPY_ITEM_IS_POINTER | NPY_NEEDS_PYAPI)) { @@ -283,9 +397,10 @@ PyArray_NewLegacyWrappingArrayMethod(PyUFuncObject *ufunc, } } - PyType_Slot slots[3] = { - {NPY_METH_get_loop, &get_wrapped_legacy_ufunc_loop}, + PyType_Slot slots[4] = { + {_NPY_METH_get_loop, &get_wrapped_legacy_ufunc_loop}, {NPY_METH_resolve_descriptors, &simple_legacy_resolve_descriptors}, + {NPY_METH_get_reduction_initial, get_reduction_intial}, {0, NULL}, }; if (any_output_flexible) { @@ -297,10 +412,10 @@ PyArray_NewLegacyWrappingArrayMethod(PyUFuncObject *ufunc, .name = method_name, .nin = ufunc->nin, .nout = ufunc->nout, - .dtypes = signature, + .casting = NPY_NO_CASTING, .flags = flags, + .dtypes = signature, .slots = slots, - .casting = NPY_NO_CASTING, }; PyBoundArrayMethodObject *bound_res = PyArrayMethod_FromSpec_int(&spec, 1); diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index fe5aa9374..97a74b425 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -13,6 +13,7 @@ #include "numpy/npy_math.h" #include "numpy/halffloat.h" #include "lowlevel_strided_loops.h" +#include "loops_utils.h" #include "npy_pycompat.h" @@ -31,27 +32,9 @@ */ #define PW_BLOCKSIZE 128 - -/* - * largest simd vector size in bytes numpy supports - * it is currently a extremely large value as it is only used for memory - * overlap checks - */ -#ifndef NPY_MAX_SIMD_SIZE -#define NPY_MAX_SIMD_SIZE 1024 -#endif - /** Provides the various *_LOOP macros */ #include "fast_loop_macros.h" -/* - * include vectorized functions and dispatchers - * this file is safe to include also for generic builds - * platform specific instructions are either masked via the proprocessor or - * runtime detected - */ -#include "simd.inc" - /****************************************************************************** ** GENERIC FLOAT LOOPS ** *****************************************************************************/ @@ -416,98 +399,6 @@ PyUFunc_On_Om(char **args, npy_intp const *dimensions, npy_intp const *steps, vo ***************************************************************************** */ -/**begin repeat - * #kind = logical_and, logical_or# - * #OP = &&, ||# - * #SC = ==, !=# - * #and = 1, 0# - **/ - -NPY_NO_EXPORT void -BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - if(IS_BINARY_REDUCE) { -#ifdef NPY_HAVE_SSE2_INTRINSICS - /* - * stick with our variant for more reliable performance, only known - * platform which outperforms it by ~20% is an i7 with glibc 2.17 - */ - if 
(run_reduce_simd_@kind@_BOOL(args, dimensions, steps)) { - return; - } -#else - /* for now only use libc on 32-bit/non-x86 */ - if (steps[1] == 1) { - npy_bool * op = (npy_bool *)args[0]; -#if @and@ - /* np.all(), search for a zero (false) */ - if (*op) { - *op = memchr(args[1], 0, dimensions[0]) == NULL; - } -#else - /* - * np.any(), search for a non-zero (true) via comparing against - * zero blocks, memcmp is faster than memchr on SSE4 machines - * with glibc >= 2.12 and memchr can only check for equal 1 - */ - static const npy_bool zero[4096]; /* zero by C standard */ - npy_uintp i, n = dimensions[0]; - - for (i = 0; !*op && i < n - (n % sizeof(zero)); i += sizeof(zero)) { - *op = memcmp(&args[1][i], zero, sizeof(zero)) != 0; - } - if (!*op && n - i > 0) { - *op = memcmp(&args[1][i], zero, n - i) != 0; - } -#endif - return; - } -#endif - else { - BINARY_REDUCE_LOOP(npy_bool) { - const npy_bool in2 = *(npy_bool *)ip2; - io1 = io1 @OP@ in2; - if (io1 @SC@ 0) { - break; - } - } - *((npy_bool *)iop1) = io1; - } - } - else { - if (run_binary_simd_@kind@_BOOL(args, dimensions, steps)) { - return; - } - else { - BINARY_LOOP { - const npy_bool in1 = *(npy_bool *)ip1; - const npy_bool in2 = *(npy_bool *)ip2; - *((npy_bool *)op1) = in1 @OP@ in2; - } - } - } -} -/**end repeat**/ - -/**begin repeat - * #kind = absolute, logical_not# - * #OP = !=, ==# - **/ -NPY_NO_EXPORT void -BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - if (run_unary_simd_@kind@_BOOL(args, dimensions, steps)) { - return; - } - else { - UNARY_LOOP { - npy_bool in1 = *(npy_bool *)ip1; - *((npy_bool *)op1) = in1 @OP@ 0; - } - } -} -/**end repeat**/ - NPY_NO_EXPORT void BOOL__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) { @@ -516,24 +407,6 @@ BOOL__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, } } - -/**begin repeat - * #kind = isnan, isinf, isfinite# - * #func = npy_isnan, npy_isinf, npy_isfinite# - * #val = NPY_FALSE, NPY_FALSE, NPY_TRUE# - **/ -NPY_NO_EXPORT void -BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - /* - * The (void)in; suppresses an unused variable warning raised by gcc and allows - * us to re-use this macro even though we do not depend on in - */ - UNARY_LOOP_FAST(npy_bool, npy_bool, (void)in; *out = @val@); -} - -/**end repeat**/ - /* ***************************************************************************** ** INTEGER LOOPS @@ -552,8 +425,11 @@ BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void */ #define @TYPE@_floor_divide @TYPE@_divide +#define @TYPE@_floor_divide_indexed @TYPE@_divide_indexed #define @TYPE@_fmax @TYPE@_maximum +#define @TYPE@_fmax_indexed @TYPE@_maximum_indexed #define @TYPE@_fmin @TYPE@_minimum +#define @TYPE@_fmin_indexed @TYPE@_minimum_indexed NPY_NO_EXPORT void @TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) @@ -562,163 +438,30 @@ NPY_NO_EXPORT void *((@type@ *)op1) = 1; } } - -NPY_NO_EXPORT void -@TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - UNARY_LOOP_FAST(@type@, @type@, *out = +in); -} - /**begin repeat1 - * #isa = , _avx2# - * #CHK = 1, defined(HAVE_ATTRIBUTE_TARGET_AVX2)# - * #ATTR = , NPY_GCC_TARGET_AVX2# - */ - -#if @CHK@ -NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void -@TYPE@_square@isa@(char **args, npy_intp const *dimensions, npy_intp const 
*steps, void *NPY_UNUSED(data)) -{ - UNARY_LOOP_FAST(@type@, @type@, *out = in * in); -} -#endif - -#if @CHK@ -NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void -@TYPE@_reciprocal@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) -{ - UNARY_LOOP_FAST(@type@, @type@, *out = 1.0 / in); -} -#endif - -#if @CHK@ -NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void -@TYPE@_conjugate@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - UNARY_LOOP_FAST(@type@, @type@, *out = in); -} -#endif - -#if @CHK@ -NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void -@TYPE@_negative@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - UNARY_LOOP_FAST(@type@, @type@, *out = -in); -} -#endif - -#if @CHK@ -NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void -@TYPE@_logical_not@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - UNARY_LOOP_FAST(@type@, npy_bool, *out = !in); -} -#endif - -#if @CHK@ -NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void -@TYPE@_invert@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - UNARY_LOOP_FAST(@type@, @type@, *out = ~in); -} -#endif - -/**begin repeat2 * Arithmetic * #kind = add, subtract, multiply, bitwise_and, bitwise_or, bitwise_xor# * #OP = +, -, *, &, |, ^# */ -#if @CHK@ -NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void -@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - if (IS_BINARY_REDUCE) { - BINARY_REDUCE_LOOP_FAST(@type@, io1 @OP@= in2); - } - else { - BINARY_LOOP_FAST(@type@, @type@, *out = in1 @OP@ in2); - } -} -#endif - -/**end repeat2**/ - -/* - * Arithmetic bit shift operations. - * - * Intel hardware masks bit shift values, so large shifts wrap around - * and can produce surprising results. The special handling ensures that - * behavior is independent of compiler or hardware. - * TODO: We could implement consistent behavior for negative shifts, - * which is undefined in C. - */ - -#define INT_left_shift_needs_clear_floatstatus -#define UINT_left_shift_needs_clear_floatstatus - -#if @CHK@ -NPY_NO_EXPORT NPY_GCC_OPT_3 void -@TYPE@_left_shift@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, - void *NPY_UNUSED(func)) -{ - BINARY_LOOP_FAST(@type@, @type@, *out = npy_lshift@c@(in1, in2)); - -#ifdef @TYPE@_left_shift_needs_clear_floatstatus - // For some reason, our macOS CI sets an "invalid" flag here, but only - // for some types. 
- npy_clear_floatstatus_barrier((char*)dimensions); -#endif -} -#endif - -#undef INT_left_shift_needs_clear_floatstatus -#undef UINT_left_shift_needs_clear_floatstatus - -#if @CHK@ -NPY_NO_EXPORT -#ifndef NPY_DO_NOT_OPTIMIZE_@TYPE@_right_shift -NPY_GCC_OPT_3 -#endif -void -@TYPE@_right_shift@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, - void *NPY_UNUSED(func)) -{ - BINARY_LOOP_FAST(@type@, @type@, *out = npy_rshift@c@(in1, in2)); -} -#endif - -/**begin repeat2 - * #kind = logical_and, logical_or# - * #OP = &&, ||# - */ - -#if @CHK@ -NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void -@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - /* - * gcc vectorization of this is not good (PR60575) but manual integer - * vectorization is too tedious to be worthwhile - */ - BINARY_LOOP_FAST(@type@, npy_bool, *out = in1 @OP@ in2); -} -#endif - -/**end repeat2**/ - -#if @CHK@ -NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void -@TYPE@_logical_xor@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +NPY_NO_EXPORT NPY_GCC_OPT_3 int +@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), + char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)) { - BINARY_LOOP { - const int t1 = !!*(@type@ *)ip1; - const int t2 = !!*(@type@ *)ip2; - *((npy_bool *)op1) = (t1 != t2); + char *ip1 = args[0]; + char *indx = args[1]; + char *value = args[2]; + npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; + npy_intp n = dimensions[0]; + npy_intp i; + @type@ *indexed; + for(i = 0; i < n; i++, indx += isindex, value += isb) { + indexed = (@type@ *)(ip1 + is1 * *(npy_intp *)indx); + *indexed = *indexed @OP@ *(@type@ *)value; } + return 0; } -#endif - /**end repeat1**/ NPY_NO_EXPORT void @@ -760,23 +503,6 @@ NPY_NO_EXPORT void *((@type@ *) op1) = out; } } - -/**begin repeat1 - * #kind = isnan, isinf, isfinite# - * #func = npy_isnan, npy_isinf, npy_isfinite# - * #val = NPY_FALSE, NPY_FALSE, NPY_TRUE# - **/ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - /* - * The (void)in; suppresses an unused variable warning raised by gcc and allows - * us to re-use this macro even though we do not depend on in - */ - UNARY_LOOP_FAST(@type@, npy_bool, (void)in; *out = @val@); -} -/**end repeat1**/ - /**end repeat**/ /**begin repeat @@ -784,19 +510,6 @@ NPY_NO_EXPORT void * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong# * #c = ,,,l,ll# */ - -NPY_NO_EXPORT NPY_GCC_OPT_3 void -@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - UNARY_LOOP_FAST(@type@, @type@, *out = (in >= 0) ? in : -in); -} - -NPY_NO_EXPORT NPY_GCC_OPT_3 void -@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : (in < 0 ? 
-1 : 0)); -} - /**begin repeat1 * #kind = gcd, lcm# **/ @@ -810,7 +523,6 @@ NPY_NO_EXPORT void } } /**end repeat1**/ - /**end repeat**/ /**begin repeat @@ -818,19 +530,6 @@ NPY_NO_EXPORT void * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong# * #c = u,u,u,ul,ull# */ - -NPY_NO_EXPORT NPY_GCC_OPT_3 void -@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - UNARY_LOOP_FAST(@type@, @type@, *out = in); -} - -NPY_NO_EXPORT NPY_GCC_OPT_3 void -@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : 0); -} - /**begin repeat1 * #kind = gcd, lcm# **/ @@ -844,9 +543,50 @@ NPY_NO_EXPORT void } } /**end repeat1**/ +/**end repeat**/ + +/* + * NOTE: It may be nice to vectorize these, OTOH, these are still faster + * than the cast we used to do. + */ + +/**begin repeat + * #kind = equal, not_equal, less, less_equal, greater, greater_equal# + * #OP = ==, !=, <, <=, >, >=# + */ +NPY_NO_EXPORT void +LONGLONG_Qq_bool_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + BINARY_LOOP { + const npy_ulonglong in1 = *(npy_ulonglong *)ip1; + const npy_longlong in2 = *(npy_longlong *)ip2; + if (in2 < 0) { + *(npy_bool *)op1 = 0 @OP@ in2; + } + else { + *(npy_bool *)op1 = in1 @OP@ (npy_ulonglong)in2; + } + } +} + +NPY_NO_EXPORT void +LONGLONG_qQ_bool_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + BINARY_LOOP { + const npy_longlong in1 = *(npy_longlong *)ip1; + const npy_ulonglong in2 = *(npy_ulonglong *)ip2; + if (in1 < 0) { + *(npy_bool *)op1 = in1 @OP@ 0; + } + else { + *(npy_bool *)op1 = (npy_ulonglong)in1 @OP@ in2; + } + } +} /**end repeat**/ + /* ***************************************************************************** ** DATETIME LOOPS ** @@ -923,12 +663,6 @@ NPY_NO_EXPORT void } NPY_NO_EXPORT void -@TYPE@_isinf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - UNARY_LOOP_FAST(npy_bool, npy_bool, (void)in; *out = NPY_FALSE); -} - -NPY_NO_EXPORT void @TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) { OUTPUT_LOOP { @@ -987,6 +721,7 @@ NPY_NO_EXPORT void } } } + /**end repeat1**/ /**begin repeat1 @@ -1406,6 +1141,8 @@ TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const * * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# * #c = f, , l# * #C = F, , L# + * #fd = 1, 1, 0# + * #VCHK = 1, 1, 0# */ /**begin repeat1 * #kind = logical_and, logical_or# @@ -1442,32 +1179,22 @@ NPY_NO_EXPORT void } } +#if !@fd@ /**begin repeat1 * #kind = isnan, isinf, isfinite, signbit# * #func = npy_isnan, npy_isinf, npy_isfinite, npy_signbit# **/ - -/**begin repeat2 - * #ISA = , _avx512_skx# - * #isa = simd, avx512_skx# - * #CHK = 1, defined(HAVE_ATTRIBUTE_TARGET_AVX512_SKX)# - **/ - -#if @CHK@ NPY_NO_EXPORT void -@TYPE@_@kind@@ISA@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { - if (!run_@kind@_@isa@_@TYPE@(args, dimensions, steps)) { - UNARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - *((npy_bool *)op1) = @func@(in1) != 0; - } + UNARY_LOOP { + const @type@ in1 = *(@type@ *)ip1; + *((npy_bool *)op1) = @func@(in1) != 0; } npy_clear_floatstatus_barrier((char*)dimensions); } -#endif -/**end repeat2**/ /**end 
repeat1**/ +#endif NPY_NO_EXPORT void @TYPE@_spacing(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) @@ -1508,6 +1235,25 @@ NPY_NO_EXPORT void } } +NPY_NO_EXPORT int +@TYPE@_floor_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), + char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)) +{ + char *ip1 = args[0]; + char *indx = args[1]; + char *value = args[2]; + npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; + npy_intp n = dimensions[0]; + npy_intp i; + @type@ *indexed; + for(i = 0; i < n; i++, indx += isindex, value += isb) { + indexed = (@type@ *)(ip1 + is1 * *(npy_intp *)indx); + *indexed = npy_floor_divide@c@(*indexed, *(@type@ *)value); + } + return 0; +} + NPY_NO_EXPORT void @TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { @@ -1546,17 +1292,6 @@ NPY_NO_EXPORT void } NPY_NO_EXPORT void -@TYPE@_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - if (!run_unary_simd_negative_@TYPE@(args, dimensions, steps)) { - UNARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - *((@type@ *)op1) = -in1; - } - } -} - -NPY_NO_EXPORT void @TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { UNARY_LOOP { @@ -1653,6 +1388,26 @@ LONGDOUBLE_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps } } } + +NPY_NO_EXPORT int +LONGDOUBLE_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), + char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)) +{ + char *ip1 = args[0]; + char *indx = args[1]; + char *value = args[2]; + npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; + npy_intp n = dimensions[0]; + npy_intp i; + npy_longdouble *indexed; + for(i = 0; i < n; i++, indx += isindex, value += isb) { + indexed = (npy_longdouble *)(ip1 + is1 * *(npy_intp *)indx); + *indexed = *indexed @OP@ *(npy_longdouble *)value; + } + return 0; +} + /**end repeat**/ /**begin repeat @@ -1758,6 +1513,26 @@ HALF_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void } } } + +NPY_NO_EXPORT int +HALF_@kind@_indexed(void *NPY_UNUSED(context), + char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)) +{ + char *ip1 = args[0]; + char *indx = args[1]; + char *value = args[2]; + npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; + npy_intp n = dimensions[0]; + npy_intp i; + npy_half *indexed; + for(i = 0; i < n; i++, indx += isindex, value += isb) { + indexed = (npy_half *)(ip1 + is1 * *(npy_intp *)indx); + const float v = npy_half_to_float(*(npy_half *)value); + *indexed = npy_float_to_half(npy_half_to_float(*indexed) @OP@ v); + } + return 0; +} /**end repeat**/ #define _HALF_LOGICAL_AND(a,b) (!npy_half_iszero(a) && !npy_half_iszero(b)) @@ -1859,6 +1634,27 @@ HALF_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void } /* npy_half_isnan will never set floatstatus_invalid, so do not clear */ } + +NPY_NO_EXPORT int +HALF_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), + char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)) +{ + char *ip1 = args[0]; + char *indx = args[1]; + char *value = args[2]; + npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; + npy_intp n = dimensions[0]; + npy_intp i; + npy_half *indexed; + for(i = 0; i < n; i++, indx += isindex, value += isb) { + indexed = 
(npy_half *)(ip1 + is1 * *(npy_intp *)indx); + npy_half v = *(npy_half *)value; + *indexed = (@OP@(*indexed, v) || npy_half_isnan(*indexed)) ? *indexed : v; + } + return 0; +} + /**end repeat**/ /**begin repeat @@ -1874,8 +1670,29 @@ HALF_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void const npy_half in2 = *(npy_half *)ip2; *((npy_half *)op1) = (@OP@(in1, in2) || npy_half_isnan(in2)) ? in1 : in2; } - /* npy_half_isnan will never set floatstatus_invalid, so do not clear */ + /* no need to clear floatstatus_invalid */ +} + +NPY_NO_EXPORT int +HALF_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), + char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)) +{ + char *ip1 = args[0]; + char *indx = args[1]; + char *value = args[2]; + npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; + npy_intp n = dimensions[0]; + npy_intp i; + npy_half *indexed; + for (i = 0; i < n; i++, indx += isindex, value += isb) { + indexed = (npy_half *)(ip1 + is1 * *(npy_intp *)indx); + npy_half v = *(npy_half *)value; + *indexed = (@OP@(*indexed, v) || npy_half_isnan(v)) ? *indexed: v; + } + return 0; } + /**end repeat**/ NPY_NO_EXPORT void @@ -1894,6 +1711,27 @@ HALF_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps } } +NPY_NO_EXPORT int +HALF_floor_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), + char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)) +{ + char *ip1 = args[0]; + char *indx = args[1]; + char *value = args[2]; + npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; + npy_intp n = dimensions[0]; + npy_intp i; + npy_half *indexed; + for(i = 0; i < n; i++, indx += isindex, value += isb) { + indexed = (npy_half *)(ip1 + is1 * *(npy_intp *)indx); + float v = npy_half_to_float(*(npy_half *)value); + float div = npy_floor_dividef(npy_half_to_float(*indexed), v); + *indexed = npy_float_to_half(div); + } + return 0; +} + NPY_NO_EXPORT void HALF_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { @@ -1953,12 +1791,6 @@ HALF_conjugate(char **args, npy_intp const *dimensions, npy_intp const *steps, v } } -NPY_NO_EXPORT NPY_GCC_OPT_3 void -HALF_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - UNARY_LOOP_FAST(npy_half, npy_half, *out = in&0x7fffu); -} - NPY_NO_EXPORT void HALF_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { @@ -2110,6 +1942,26 @@ NPY_NO_EXPORT void } } } + +NPY_NO_EXPORT int @TYPE@_@kind@_indexed +(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) +{ + char *ip1 = args[0]; + char *indx = args[1]; + char *value = args[2]; + npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; + npy_intp n = dimensions[0]; + npy_intp i; + @ftype@ *indexed; + for(i = 0; i < n; i++, indx += isindex, value += isb) { + indexed = (@ftype@ *)(ip1 + is1 * *(npy_intp *)indx); + const @ftype@ b_r = ((@ftype@ *)value)[0]; + const @ftype@ b_i = ((@ftype@ *)value)[1]; + indexed[0] @OP@= b_r; + indexed[1] @OP@= b_i; + } + return 0; +} /**end repeat1**/ NPY_NO_EXPORT void @@ -2124,6 +1976,28 @@ NPY_NO_EXPORT void ((@ftype@ *)op1)[1] = in1r*in2i + in1i*in2r; } } + +NPY_NO_EXPORT int @TYPE@_multiply_indexed +(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, 
NpyAuxData *NPY_UNUSED(func)) +{ + char *ip1 = args[0]; + char *indx = args[1]; + char *value = args[2]; + npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; + npy_intp n = dimensions[0]; + npy_intp i; + @ftype@ *indexed; + for(i = 0; i < n; i++, indx += isindex, value += isb) { + indexed = (@ftype@ *)(ip1 + is1 * *(npy_intp *)indx); + const @ftype@ a_r = indexed[0]; + const @ftype@ a_i = indexed[1]; + const @ftype@ b_r = ((@ftype@ *)value)[0]; + const @ftype@ b_i = ((@ftype@ *)value)[1]; + indexed[0] = a_r*b_r - a_i*b_i; + indexed[1] = a_r*b_i + a_i*b_r; + } + return 0; +} #endif // !SIMD NPY_NO_EXPORT void @@ -2235,6 +2109,8 @@ NPY_NO_EXPORT void } /**end repeat1**/ +#if !@SIMD@ +// CFLOAT & CDOUBLE defined by 'loops_arithm_fp.dispatch.c.src' NPY_NO_EXPORT void @TYPE@_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) { @@ -2245,6 +2121,7 @@ NPY_NO_EXPORT void ((@ftype@ *)op1)[1] = in1r*in1i + in1i*in1r; } } +#endif NPY_NO_EXPORT void @TYPE@_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) @@ -2275,6 +2152,8 @@ NPY_NO_EXPORT void } } +#if !@SIMD@ +// CFLOAT & CDOUBLE defined by 'loops_arithm_fp.dispatch.c.src' NPY_NO_EXPORT void @TYPE@_conjugate(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { UNARY_LOOP { @@ -2294,20 +2173,6 @@ NPY_NO_EXPORT void *((@ftype@ *)op1) = npy_hypot@c@(in1r, in1i); } } - -#if @SIMD@ && defined(HAVE_ATTRIBUTE_TARGET_AVX512F) -/**begin repeat1 - * arithmetic - * #kind = conjugate, square, absolute# - */ -NPY_NO_EXPORT void -@TYPE@_@kind@_avx512f(char **args, const npy_intp *dimensions, const npy_intp *steps, void *func) -{ - if (!run_unary_avx512f_@kind@_@TYPE@(args, dimensions, steps)) { - @TYPE@_@kind@(args, dimensions, steps, func); - } -} -/**end repeat1**/ #endif NPY_NO_EXPORT void diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src index 424e204c1..cce73aff8 100644 --- a/numpy/core/src/umath/loops.h.src +++ b/numpy/core/src/umath/loops.h.src @@ -10,24 +10,32 @@ #define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN #endif -#define BOOL_invert BOOL_logical_not -#define BOOL_add BOOL_logical_or -#define BOOL_bitwise_and BOOL_logical_and -#define BOOL_bitwise_or BOOL_logical_or -#define BOOL_logical_xor BOOL_not_equal -#define BOOL_bitwise_xor BOOL_logical_xor -#define BOOL_multiply BOOL_logical_and -#define BOOL_maximum BOOL_logical_or -#define BOOL_minimum BOOL_logical_and -#define BOOL_fmax BOOL_maximum -#define BOOL_fmin BOOL_minimum - /* ***************************************************************************** ** BOOLEAN LOOPS ** ***************************************************************************** */ +/* + * Following functions are defined by umath generator + * to enable runtime dispatching without the need + * to redefine them within dsipatch-able sources. 
+ */ +// #define BOOL_invert BOOL_logical_not +// #define BOOL_add BOOL_logical_or +// #define BOOL_bitwise_and BOOL_logical_and +// #define BOOL_bitwise_or BOOL_logical_or +// #define BOOL_logical_xor BOOL_not_equal +// #define BOOL_bitwise_xor BOOL_logical_xor +// #define BOOL_multiply BOOL_logical_and +// #define BOOL_maximum BOOL_logical_or +// #define BOOL_minimum BOOL_logical_and +// #define BOOL_fmax BOOL_maximum +// #define BOOL_fmin BOOL_minimum + +typedef struct PyArrayMethod_Context_tag PyArrayMethod_Context; +typedef struct NpyAuxData_tag NpyAuxData; + #ifndef NPY_DISABLE_OPTIMIZATION #include "loops_comparison.dispatch.h" #endif @@ -39,11 +47,15 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_@kind@, (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) /**end repeat**/ +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_logical.dispatch.h" +#endif + /**begin repeat - * #kind = logical_and, logical_or, absolute, logical_not# - **/ -NPY_NO_EXPORT void -BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + * #kind = logical_and, logical_or, logical_not, absolute# + */ + NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) /**end repeat**/ NPY_NO_EXPORT void @@ -56,6 +68,17 @@ NPY_NO_EXPORT void BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); /**end repeat**/ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_autovec.dispatch.h" +#endif +/**begin repeat + * #kind = isnan, isinf, isfinite# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) +/**end repeat**/ + /* ***************************************************************************** ** INTEGER LOOPS @@ -72,6 +95,11 @@ BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void */ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_divide, (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) + +NPY_NO_EXPORT int +@TYPE@_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)); + +/**end repeat3**/ /**end repeat**/ #ifndef NPY_DISABLE_OPTIMIZATION @@ -106,19 +134,40 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, /**end repeat1**/ /**end repeat**/ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_autovec.dispatch.h" +#endif /**begin repeat - * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG# + * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG, + BYTE, SHORT, INT, LONG, LONGLONG# */ +/**begin repeat1 + * #kind = invert, logical_not, conjugate, reciprocal, square, add, + * subtract, multiply, bitwise_and, bitwise_or, bitwise_xor, + * left_shift, right_shift, logical_and, logical_or, + * logical_xor, isnan, isinf, isfinite, + * absolute, sign# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) +/**end repeat1**/ +/**end repeat**/ +/**begin repeat + * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG# + */ /**begin repeat1 * both signed and unsigned integer types * #s = , u# * #S = , U# */ - #define @S@@TYPE@_floor_divide @S@@TYPE@_divide +#define @S@@TYPE@_floor_divide_indexed @S@@TYPE@_divide_indexed #define @S@@TYPE@_fmax 
@S@@TYPE@_maximum #define @S@@TYPE@_fmin @S@@TYPE@_minimum +#define @S@@TYPE@_fmax_indexed @S@@TYPE@_maximum_indexed +#define @S@@TYPE@_fmin_indexed @S@@TYPE@_minimum_indexed NPY_NO_EXPORT void @S@@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); @@ -127,85 +176,69 @@ NPY_NO_EXPORT void @S@@TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); /**begin repeat2 - * #isa = , _avx2# - */ - -NPY_NO_EXPORT void -@S@@TYPE@_square@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); - -NPY_NO_EXPORT void -@S@@TYPE@_reciprocal@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); - -NPY_NO_EXPORT void -@S@@TYPE@_conjugate@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@S@@TYPE@_negative@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@S@@TYPE@_logical_not@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@S@@TYPE@_invert@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -/**begin repeat3 * Arithmetic * #kind = add, subtract, multiply, bitwise_and, bitwise_or, bitwise_xor, * left_shift, right_shift# * #OP = +, -,*, &, |, ^, <<, >># */ -NPY_NO_EXPORT void -@S@@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -/**end repeat3**/ - -/**begin repeat3 - * #kind = logical_and, logical_or# - * #OP = &&, ||# - */ -NPY_NO_EXPORT void -@S@@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -/**end repeat3**/ +NPY_NO_EXPORT int +@S@@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, + npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)); -NPY_NO_EXPORT void -@S@@TYPE@_logical_xor@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); /**end repeat2**/ /**begin repeat2 * #kind = maximum, minimum# - * #OP = >, <# **/ NPY_NO_EXPORT void @S@@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -/**end repeat2**/ -NPY_NO_EXPORT void -@S@@TYPE@_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +NPY_NO_EXPORT int +@S@@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)); -NPY_NO_EXPORT void -@S@@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat2**/ NPY_NO_EXPORT void -@S@@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +@S@@TYPE@_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); NPY_NO_EXPORT void @S@@TYPE@_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); NPY_NO_EXPORT void @S@@TYPE@_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -/**begin repeat2 - * #kind = isnan, isinf, isfinite# - **/ -NPY_NO_EXPORT void -@S@@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); 
/**end repeat2**/ /**end repeat1**/ +/**end repeat**/ + +/**begin repeat + * #kind = equal, not_equal, less, less_equal, greater, greater_equal# + * #OP = ==, !=, <, <=, >, >=# + */ +NPY_NO_EXPORT void +LONGLONG_Qq_bool_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +NPY_NO_EXPORT void +LONGLONG_qQ_bool_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); /**end repeat**/ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_unary.dispatch.h" +#endif +/**begin repeat + * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG, + * BYTE, SHORT, INT, LONG, LONGLONG# + */ +/**begin repeat1 + * #kind = negative# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) +/**end repeat1**/ +/**end repeat**/ + + /* ***************************************************************************** ** FLOAT LOOPS ** @@ -226,6 +259,34 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, /**end repeat**/ #ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_unary_fp_le.dispatch.h" +#endif +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + */ +/**begin repeat1 + * #kind = isnan, isinf, isfinite, signbit# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) +/**end repeat1**/ +/**end repeat**/ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_unary.dispatch.h" +#endif +/**begin repeat + * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# + */ +/**begin repeat1 + * #kind = negative# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) +/**end repeat1**/ +/**end repeat**/ + +#ifndef NPY_DISABLE_OPTIMIZATION #include "loops_arithm_fp.dispatch.h" #endif /**begin repeat @@ -234,10 +295,13 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, /**begin repeat1 * Arithmetic * # kind = add, subtract, multiply, divide# - * # OP = +, -, *, /# */ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) + +NPY_NO_EXPORT int +@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)); + /**end repeat1**/ /**end repeat**/ @@ -283,15 +347,6 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_@func@, /**end repeat**/ /**begin repeat - * #func = sin, cos# - */ - -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_@func@, - (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) - -/**end repeat**/ - -/**begin repeat * #TYPE = FLOAT, DOUBLE# */ /**begin repeat1 @@ -304,27 +359,20 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@func@, /**end repeat1**/ /**end repeat**/ -/**begin repeat - * #TYPE = FLOAT, DOUBLE# - */ -/**begin repeat1 - * #func = maximum, minimum# - */ -NPY_NO_EXPORT void -@TYPE@_@func@_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -/**end repeat1**/ -/**end repeat**/ - #ifndef NPY_DISABLE_OPTIMIZATION #include "loops_trigonometric.dispatch.h" #endif + /**begin repeat + * #TYPE = FLOAT, DOUBLE# + */ +/**begin repeat1 * #func = sin, cos# */ -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_@func@, ( +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void 
@TYPE@_@func@, ( char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func) )) +/**end repeat1**/ /**end repeat**/ #ifndef NPY_DISABLE_OPTIMIZATION @@ -362,6 +410,8 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, ( * #TYPE = HALF, FLOAT, DOUBLE, LONGDOUBLE# * #c = f, f, , l# * #C = F, F, , L# + * #half = 1, 0, 0, 0# + * #fd = 0, 1, 1, 0# */ /**begin repeat1 @@ -371,6 +421,11 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, ( */ NPY_NO_EXPORT void @TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT int +@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)); + +/**end repeat1**/ /**end repeat1**/ /**begin repeat1 @@ -390,35 +445,44 @@ NPY_NO_EXPORT void /**begin repeat1 * #kind = isnan, isinf, isfinite, signbit, copysign, nextafter, spacing# * #func = npy_isnan, npy_isinf, npy_isfinite, npy_signbit, npy_copysign, nextafter, spacing# + * #dispatched = 1, 1, 1, 1, 0, 0, 0# **/ -/**begin repeat2 - * #ISA = , _avx512_skx# - **/ +#if !@fd@ || !@dispatched@ NPY_NO_EXPORT void -@TYPE@_@kind@@ISA@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +#endif /**end repeat2**/ /**end repeat1**/ /**begin repeat1 * #kind = maximum, minimum# - * #OP = >=, <=# **/ NPY_NO_EXPORT void @TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT int +@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)); + /**end repeat1**/ /**begin repeat1 * #kind = fmax, fmin# - * #OP = >=, <=# **/ NPY_NO_EXPORT void @TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT int +@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)); + /**end repeat1**/ NPY_NO_EXPORT void @TYPE@_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +NPY_NO_EXPORT int +@TYPE@_floor_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)); + NPY_NO_EXPORT void @TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); @@ -440,8 +504,10 @@ NPY_NO_EXPORT void NPY_NO_EXPORT void @TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +#if @half@ NPY_NO_EXPORT void @TYPE@_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +#endif NPY_NO_EXPORT void @TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); @@ -473,6 +539,19 @@ NPY_NO_EXPORT void /**end repeat1**/ /**end repeat**/ +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_autovec.dispatch.h" +#endif +/**begin repeat + * #TYPE = HALF# + */ +/**begin repeat1 + * #kind = absolute# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) +/**end 
repeat1**/ +/**end repeat**/ /* ***************************************************************************** ** COMPLEX LOOPS ** @@ -485,7 +564,21 @@ NPY_NO_EXPORT void * #TYPE = CFLOAT, CDOUBLE# */ /**begin repeat1 - * #kind = add, subtract, multiply# + * #kind = add, subtract, multiply, conjugate, square# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) +/**end repeat1**/ +/**end repeat**/ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_unary_complex.dispatch.h" +#endif +/**begin repeat + * #TYPE = CFLOAT, CDOUBLE# + */ +/**begin repeat1 + * #kind = absolute# */ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) @@ -512,6 +605,9 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, */ NPY_NO_EXPORT void C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT int +C@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)); /**end repeat1**/ NPY_NO_EXPORT void @@ -554,19 +650,14 @@ C@TYPE@_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *step NPY_NO_EXPORT void C@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); -/**begin repeat1 - * #isa = , _avx512f# - */ - NPY_NO_EXPORT void -C@TYPE@_conjugate@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); +C@TYPE@_conjugate(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); NPY_NO_EXPORT void -C@TYPE@_absolute@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); +C@TYPE@_absolute(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); NPY_NO_EXPORT void -C@TYPE@_square@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)); -/**end repeat1**/ +C@TYPE@_square(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)); NPY_NO_EXPORT void C@TYPE@__arg(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); @@ -627,9 +718,6 @@ NPY_NO_EXPORT void NPY_NO_EXPORT void @TYPE@_isfinite(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -NPY_NO_EXPORT void -@TYPE@_isinf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - #define @TYPE@_isnan @TYPE@_isnat NPY_NO_EXPORT void @@ -705,6 +793,21 @@ TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const * #define TIMEDELTA_md_m_floor_divide TIMEDELTA_md_m_divide /* #define TIMEDELTA_mm_d_floor_divide TIMEDELTA_mm_d_divide */ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_autovec.dispatch.h" +#endif +/**begin repeat + * #TYPE = TIMEDELTA, DATETIME# + */ +/**begin repeat1 + * #kind = isinf# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) +/**end repeat1**/ +/**end repeat**/ + /* ***************************************************************************** ** OBJECT LOOPS ** diff --git a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src 
index bf8142880..3ab5a968d 100644 --- a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src +++ b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src @@ -1,6 +1,8 @@ /*@targets ** $maxopt baseline - ** sse2 avx2 avx512f + ** sse2 (avx2 fma3) + ** neon asimd + ** vsx2 vsx3 ** vx vxe **/ #define _UMATHMODULE @@ -14,707 +16,392 @@ // Provides the various *_LOOP macros #include "fast_loop_macros.h" -// TODO: replace raw SIMD with NPYV +/** + * TODO: + * - Improve the implementation of SIMD complex absolute, + * the current one is kind of slow and can be optimized by + * at least avoiding the division and keeping the sqrt. + * - Vectorize reductions + * - Add support for ASIMD/VCMLA through universal intrinsics. + */ + //############################################################################### //## Real Single/Double precision //############################################################################### /******************************************************************************** - ** Defining the SIMD kernels + ** Defining ufunc inner functions ********************************************************************************/ -#ifdef NPY_HAVE_SSE2 + +/* + * clang has a bug that's present at -O1 or greater. When partially loading a + * vector register for a divide operation, the remaining elements are set + * to 1 to avoid divide-by-zero. The partial load is paired with a partial + * store after the divide operation. clang notices that the entire register + * is not needed for the store and optimizes out the fill of 1 to the remaining + * elements. This causes either a divide-by-zero or 0/0 with invalid exception + * that we were trying to avoid by filling. + * + * Using a dummy variable marked 'volatile' convinces clang not to ignore + * the explicit fill of remaining elements. If `-ftrapping-math` is + * supported, then it'll also avoid the bug. `-ftrapping-math` is supported + * on Apple clang v12+ for x86_64. It is not currently supported for arm64. + * `-ftrapping-math` is set by default for NumPy builds in + * numpy/distutils/ccompiler.py.
+ * + * Note: Apple clang and clang upstream have different versions that overlap + */ +#if defined(__clang__) + #if defined(__apple_build_version__) + // Apple Clang + #if __apple_build_version__ < 12000000 + // Apple Clang before v12 + #define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 1 + #elif defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64) + // Apple Clang after v12, targeting i386 or x86_64 + #define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 0 + #else + // Apple Clang after v12, not targeting i386 or x86_64 + #define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 1 + #endif + #else + // Clang, not Apple Clang + #if __clang_major__ < 10 + // Clang before v10 + #define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 1 + #elif defined(_MSC_VER) + // clang-cl has the same bug + #define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 1 + #elif defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64) + // Clang v10+, targeting i386 or x86_64 + #define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 0 + #else + // Clang v10+, not targeting i386 or x86_64 + #define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 1 + #endif + #endif +#else +// Not a Clang compiler +#define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 0 +#endif + /**begin repeat + * Float types * #type = npy_float, npy_double# * #TYPE = FLOAT, DOUBLE# - * #scalarf = npy_sqrtf, npy_sqrt# + * #sfx = f32, f64# * #c = f, # - * #vtype = __m128, __m128d# - * #vtype256 = __m256, __m256d# - * #vtype512 = __m512, __m512d# - * #vpre = _mm, _mm# - * #vpre256 = _mm256, _mm256# - * #vpre512 = _mm512, _mm512# - * #vsuf = ps, pd# - * #vsufs = ss, sd# - * #nan = NPY_NANF, NPY_NAN# - * #double = 0, 1# - * #cast = _mm_castps_si128, _mm_castpd_si128# + * #C = F, # + * #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64# */ /**begin repeat1 -* Arithmetic -* # kind = add, subtract, multiply, divide# -* # OP = +, -, *, /# -* # VOP = add, sub, mul, div# -*/ -static void -sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) + * Arithmetic + * # kind = add, subtract, multiply, divide# + * # intrin = add, sub, mul, div# + * # OP = +, -, *, /# + * # PW = 1, 0, 0, 0# + * # is_div = 0*3, 1# + * # is_mul = 0*2, 1, 0# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { -#ifdef NPY_HAVE_AVX512F - const npy_intp vector_size_bytes = 64; - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[i] @OP@ ip2[i]; - /* lots of specializations, to squeeze out max performance */ - if (npy_is_aligned(&ip1[i], vector_size_bytes) && npy_is_aligned(&ip2[i], vector_size_bytes)) { - if (ip1 == ip2) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, a); - @vpre512@_store_@vsuf@(&op[i], c); + npy_intp len = dimensions[0]; + char *src0 = args[0], *src1 = args[1], *dst = args[2]; + npy_intp ssrc0 = steps[0], ssrc1 = steps[1], sdst = steps[2]; + // reduce + if (ssrc0 == 0 && ssrc0 == sdst && src0 == dst) { + #if @PW@ + *((@type@*)src0) @OP@= @TYPE@_pairwise_sum(src1, len, ssrc1); + #else + @type@ acc = *((@type@*)src0); + if (ssrc1 == sizeof(@type@)) { + for (; len > 0; --len, src1 += sizeof(@type@)) { + acc @OP@= *(@type@ *)src1; } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]); - @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); - @vpre512@_store_@vsuf@(&op[i], c); + } else { + for (; len > 0; --len, src1 += ssrc1) { + acc @OP@= *(@type@ *)src1; } } + *((@type@*)src0) = acc; 
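Illustrative aside, not part of the patch: the clang workaround described above reduces to a single `volatile` qualifier on the partially loaded operand. A simplified float32 tail iteration of the divide loop, reusing the universal-intrinsic calls and the `src0`/`src1`/`dst`/`len` loop variables from this hunk, would look roughly like:

    npyv_f32 a = npyv_load_till_f32((const npy_float*)src0, len, 1.0f);
#if WORKAROUND_CLANG_PARTIAL_LOAD_BUG
    /* 'volatile' keeps clang from discarding the fill of 1.0 in the unused lanes */
    volatile npyv_f32 b = npyv_load_till_f32((const npy_float*)src1, len, 1.0f);
#else
    npyv_f32 b = npyv_load_till_f32((const npy_float*)src1, len, 1.0f);
#endif
    npyv_f32 r = npyv_div_f32(a, b);
    npyv_store_till_f32((npy_float*)dst, len, r);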
+ #endif + return; } - else if (npy_is_aligned(&ip1[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]); - @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); - @vpre512@_store_@vsuf@(&op[i], c); - } - } - else if (npy_is_aligned(&ip2[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]); - @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); - @vpre512@_store_@vsuf@(&op[i], c); - } - } - else { - if (ip1 == ip2) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, a); - @vpre512@_store_@vsuf@(&op[i], c); +#if @VECTOR@ + if (len > npyv_nlanes_@sfx@*2 && + !is_mem_overlap(src0, ssrc0, dst, sdst, len) && + !is_mem_overlap(src1, ssrc1, dst, sdst, len) + ) { + const int vstep = npyv_nlanes_u8; + const int wstep = vstep * 2; + const int hstep = npyv_nlanes_@sfx@; + const int lstep = hstep * 2; + // lots of specializations, to squeeze out max performance + if (ssrc0 == sizeof(@type@) && ssrc0 == ssrc1 && ssrc0 == sdst) { + for (; len >= lstep; len -= lstep, src0 += wstep, src1 += wstep, dst += wstep) { + npyv_@sfx@ a0 = npyv_load_@sfx@((const @type@*)src0); + npyv_@sfx@ a1 = npyv_load_@sfx@((const @type@*)(src0 + vstep)); + npyv_@sfx@ b0 = npyv_load_@sfx@((const @type@*)src1); + npyv_@sfx@ b1 = npyv_load_@sfx@((const @type@*)(src1 + vstep)); + npyv_@sfx@ r0 = npyv_@intrin@_@sfx@(a0, b0); + npyv_@sfx@ r1 = npyv_@intrin@_@sfx@(a1, b1); + npyv_store_@sfx@((@type@*)dst, r0); + npyv_store_@sfx@((@type@*)(dst + vstep), r1); } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]); - @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); - @vpre512@_store_@vsuf@(&op[i], c); + #if @is_div@ && WORKAROUND_CLANG_PARTIAL_LOAD_BUG + const int vstop = hstep - 1; + #else + const int vstop = 0; + #endif // #if @is_div@ && WORKAROUND_CLANG_PARTIAL_LOAD_BUG + for (; len > vstop; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) { + #if @is_div@ + npyv_@sfx@ a = npyv_load_till_@sfx@((const @type@*)src0, len, 1.0@c@); + npyv_@sfx@ b = npyv_load_till_@sfx@((const @type@*)src1, len, 1.0@c@); + #else + npyv_@sfx@ a = npyv_load_tillz_@sfx@((const @type@*)src0, len); + npyv_@sfx@ b = npyv_load_tillz_@sfx@((const @type@*)src1, len); + #endif + npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b); + npyv_store_till_@sfx@((@type@*)dst, len, r); } - } - } -#elif defined NPY_HAVE_AVX2 - const npy_intp vector_size_bytes = 32; - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[i] @OP@ ip2[i]; - /* lots of specializations, to squeeze out max performance */ - if (npy_is_aligned(&ip1[i], vector_size_bytes) && - npy_is_aligned(&ip2[i], vector_size_bytes)) { - if (ip1 == ip2) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, a); - @vpre256@_store_@vsuf@(&op[i], c); + #if @is_div@ && WORKAROUND_CLANG_PARTIAL_LOAD_BUG + // last partial iteration for divide and working around clang partial load bug + if(len > 0){ + npyv_@sfx@ a = npyv_load_till_@sfx@((const @type@*)src0, len, 1.0@c@); + volatile npyv_@sfx@ b = npyv_load_till_@sfx@((const @type@*)src1, len, 1.0@c@); + npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b); + npyv_store_till_@sfx@((@type@*)dst, len, 
r); } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]); - @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); - @vpre256@_store_@vsuf@(&op[i], c); + #endif // #if @is_div@ && WORKAROUND_CLANG_PARTIAL_LOAD_BUG + } + else if (ssrc0 == 0 && ssrc1 == sizeof(@type@) && sdst == ssrc1) { + npyv_@sfx@ a = npyv_setall_@sfx@(*((@type@*)src0)); + for (; len >= lstep; len -= lstep, src1 += wstep, dst += wstep) { + npyv_@sfx@ b0 = npyv_load_@sfx@((const @type@*)src1); + npyv_@sfx@ b1 = npyv_load_@sfx@((const @type@*)(src1 + vstep)); + npyv_@sfx@ r0 = npyv_@intrin@_@sfx@(a, b0); + npyv_@sfx@ r1 = npyv_@intrin@_@sfx@(a, b1); + npyv_store_@sfx@((@type@*)dst, r0); + npyv_store_@sfx@((@type@*)(dst + vstep), r1); } - } - } - else if (npy_is_aligned(&ip1[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]); - @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); - @vpre256@_store_@vsuf@(&op[i], c); - } - } - else if (npy_is_aligned(&ip2[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]); - @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); - @vpre256@_store_@vsuf@(&op[i], c); - } - } - else { - if (ip1 == ip2) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, a); - @vpre256@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]); - @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); - @vpre256@_store_@vsuf@(&op[i], c); + #if (@is_div@ || @is_mul@) && WORKAROUND_CLANG_PARTIAL_LOAD_BUG + const int vstop = hstep - 1; + #else + const int vstop = 0; + #endif // #if (@is_div@ || @is_mul@) && WORKAROUND_CLANG_PARTIAL_LOAD_BUG + for (; len > vstop; len -= hstep, src1 += vstep, dst += vstep) { + #if @is_div@ || @is_mul@ + npyv_@sfx@ b = npyv_load_till_@sfx@((const @type@*)src1, len, 1.0@c@); + #else + npyv_@sfx@ b = npyv_load_tillz_@sfx@((const @type@*)src1, len); + #endif + npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b); + npyv_store_till_@sfx@((@type@*)dst, len, r); } - } - } -#else - const npy_intp vector_size_bytes = 16; - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[i] @OP@ ip2[i]; - /* lots of specializations, to squeeze out max performance */ - if (npy_is_aligned(&ip1[i], vector_size_bytes) && - npy_is_aligned(&ip2[i], vector_size_bytes)) { - if (ip1 == ip2) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, a); - @vpre@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]); - @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); - @vpre@_store_@vsuf@(&op[i], c); + #if (@is_div@ || @is_mul@) && WORKAROUND_CLANG_PARTIAL_LOAD_BUG + // last partial iteration for multiply / divide and working around clang partial load bug + if(len > 0){ + volatile npyv_@sfx@ b = npyv_load_till_@sfx@((const @type@*)src1, len, 1.0@c@); + npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b); + npyv_store_till_@sfx@((@type@*)dst, len, r); } - } - } - else if (npy_is_aligned(&ip1[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, 
vector_size_bytes) { - @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]); - @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); - @vpre@_store_@vsuf@(&op[i], c); - } - } - else if (npy_is_aligned(&ip2[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]); - @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); - @vpre@_store_@vsuf@(&op[i], c); - } - } - else { - if (ip1 == ip2) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, a); - @vpre@_store_@vsuf@(&op[i], c); + #endif // #if (@is_div@ || @is_mul@) && WORKAROUND_CLANG_PARTIAL_LOAD_BUG + } + else if (ssrc1 == 0 && ssrc0 == sizeof(@type@) && sdst == ssrc0) { + npyv_@sfx@ b = npyv_setall_@sfx@(*((@type@*)src1)); + for (; len >= lstep; len -= lstep, src0 += wstep, dst += wstep) { + npyv_@sfx@ a0 = npyv_load_@sfx@((const @type@*)src0); + npyv_@sfx@ a1 = npyv_load_@sfx@((const @type@*)(src0 + vstep)); + npyv_@sfx@ r0 = npyv_@intrin@_@sfx@(a0, b); + npyv_@sfx@ r1 = npyv_@intrin@_@sfx@(a1, b); + npyv_store_@sfx@((@type@*)dst, r0); + npyv_store_@sfx@((@type@*)(dst + vstep), r1); } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]); - @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); - @vpre@_store_@vsuf@(&op[i], c); + for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) { + #if @is_div@ || @is_mul@ + npyv_@sfx@ a = npyv_load_till_@sfx@((const @type@*)src0, len, 1.0@c@); + #else + npyv_@sfx@ a = npyv_load_tillz_@sfx@((const @type@*)src0, len); + #endif + npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b); + npyv_store_till_@sfx@((@type@*)dst, len, r); } + } else { + goto loop_scalar; } + npyv_cleanup(); + return; } +loop_scalar: #endif - LOOP_BLOCKED_END { - op[i] = ip1[i] @OP@ ip2[i]; - } -} - -static void -sse2_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) -{ -#ifdef NPY_HAVE_AVX512F - const npy_intp vector_size_bytes = 64; - const @vtype512@ a = @vpre512@_set1_@vsuf@(ip1[0]); - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[0] @OP@ ip2[i]; - if (npy_is_aligned(&ip2[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); - @vpre512@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); - @vpre512@_store_@vsuf@(&op[i], c); - } - } - - -#elif defined NPY_HAVE_AVX2 - const npy_intp vector_size_bytes = 32; - const @vtype256@ a = @vpre256@_set1_@vsuf@(ip1[0]); - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[0] @OP@ ip2[i]; - if (npy_is_aligned(&ip2[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); - @vpre256@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); - @vpre256@_store_@vsuf@(&op[i], c); - } - } -#else - const npy_intp vector_size_bytes = 16; - const @vtype@ a = @vpre@_set1_@vsuf@(ip1[0]); - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[0] @OP@ ip2[i]; - if (npy_is_aligned(&ip2[i], vector_size_bytes)) { - 
LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); - @vpre@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); - @vpre@_store_@vsuf@(&op[i], c); - } - } -#endif - LOOP_BLOCKED_END { - op[i] = ip1[0] @OP@ ip2[i]; + for (; len > 0; --len, src0 += ssrc0, src1 += ssrc1, dst += sdst) { + const @type@ a = *((@type@*)src0); + const @type@ b = *((@type@*)src1); + *((@type@*)dst) = a @OP@ b; } } -static void -sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) +NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@_indexed) +(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) { -#ifdef NPY_HAVE_AVX512F - const npy_intp vector_size_bytes = 64; - const @vtype512@ b = @vpre512@_set1_@vsuf@(ip2[0]); - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[i] @OP@ ip2[0]; - if (npy_is_aligned(&ip1[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); - @vpre512@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); - @vpre512@_store_@vsuf@(&op[i], c); - } - } - -#elif defined NPY_HAVE_AVX2 - const npy_intp vector_size_bytes = 32; - const @vtype256@ b = @vpre256@_set1_@vsuf@(ip2[0]); - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[i] @OP@ ip2[0]; - if (npy_is_aligned(&ip1[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); - @vpre256@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); - @vpre256@_store_@vsuf@(&op[i], c); - } - } -#else - const npy_intp vector_size_bytes = 16; - const @vtype@ b = @vpre@_set1_@vsuf@(ip2[0]); - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[i] @OP@ ip2[0]; - if (npy_is_aligned(&ip1[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); - @vpre@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); - @vpre@_store_@vsuf@(&op[i], c); - } - } -#endif - LOOP_BLOCKED_END { - op[i] = ip1[i] @OP@ ip2[0]; + char *ip1 = args[0]; + char *indx = args[1]; + char *value = args[2]; + npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; + npy_intp n = dimensions[0]; + npy_intp i; + @type@ *indexed; + for(i = 0; i < n; i++, indx += isindex, value += isb) { + indexed = (@type@ *)(ip1 + is1 * *(npy_intp *)indx); + *indexed = *indexed @OP@ *(@type@ *)value; } + return 0; } /**end repeat1**/ /**end repeat**/ -#else // NPY_HAVE_SSE2 +#undef WORKAROUND_CLANG_PARTIAL_LOAD_BUG -/**begin repeat - * #type = npy_float, npy_double# - * #TYPE = FLOAT, DOUBLE# - * #sfx = f32, f64# - * #CHK = _F32, _F64# - */ -#if NPY_SIMD@CHK@ -/**begin repeat1 -* Arithmetic -* # kind = add, subtract, multiply, divide# -* # OP = +, -, *, /# -* # VOP = add, 
sub, mul, div# -*/ - -static void -simd_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) -{ - LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) { - op[i] = ip1[i] @OP@ ip2[i]; - } - /* lots of specializations, to squeeze out max performance */ - if (ip1 == ip2) { - LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) { - npyv_@sfx@ a = npyv_load_@sfx@(&ip1[i]); - npyv_@sfx@ c = npyv_@VOP@_@sfx@(a, a); - npyv_store_@sfx@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) { - npyv_@sfx@ a = npyv_load_@sfx@(&ip1[i]); - npyv_@sfx@ b = npyv_load_@sfx@(&ip2[i]); - npyv_@sfx@ c = npyv_@VOP@_@sfx@(a, b); - npyv_store_@sfx@(&op[i], c); - } - } - LOOP_BLOCKED_END { - op[i] = ip1[i] @OP@ ip2[i]; - } -} +//############################################################################### +//## Complex Single/Double precision +//############################################################################### -static void -simd_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) -{ - const npyv_@sfx@ v1 = npyv_setall_@sfx@(ip1[0]); - LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) { - op[i] = ip1[0] @OP@ ip2[i]; - } - LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) { - npyv_@sfx@ v2 = npyv_load_@sfx@(&ip2[i]); - npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2); - npyv_store_@sfx@(&op[i], v3); - } - LOOP_BLOCKED_END { - op[i] = ip1[0] @OP@ ip2[i]; - } -} +/******************************************************************************** + ** op intrinics + ********************************************************************************/ -static void -simd_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) +#if NPY_SIMD_F32 +NPY_FINLINE npyv_f32x2 simd_set2_f32(const float *a) { - const npyv_@sfx@ v2 = npyv_setall_@sfx@(ip2[0]); - LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) { - op[i] = ip1[i] @OP@ ip2[0]; - } - LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) { - npyv_@sfx@ v1 = npyv_load_@sfx@(&ip1[i]); - npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2); - npyv_store_@sfx@(&op[i], v3); - } - LOOP_BLOCKED_END { - op[i] = ip1[i] @OP@ ip2[0]; - } + npyv_f32 fill = npyv_reinterpret_f32_u64(npyv_setall_u64(*(npy_uint64*)a)); + npyv_f32x2 r; + r.val[0] = fill; + r.val[1] = fill; + return r; } -/**end repeat1**/ -#endif /* NPY_SIMD@CHK@ */ -/**end repeat**/ -#endif // NPY_HAVE_SSE2 -/**begin repeat - * Float types - * #type = npy_float, npy_double, npy_longdouble# - * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# - * #vector = 1, 1, 0# - * #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64, 0 # - */ -/**begin repeat1 - * Arithmetic - * # kind = add, subtract, multiply, divide# - */ -static NPY_INLINE int -run_binary_simd_@kind@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps) +NPY_FINLINE npyv_f32 +simd_cconjugate_f32(npyv_f32 x) { -#if @vector@ && defined NPY_HAVE_SSE2 - @type@ * ip1 = (@type@ *)args[0]; - @type@ * ip2 = (@type@ *)args[1]; - @type@ * op = (@type@ *)args[2]; - npy_intp n = dimensions[0]; -#if defined NPY_HAVE_AVX512F - const npy_uintp vector_size_bytes = 64; -#elif defined NPY_HAVE_AVX2 - const npy_uintp vector_size_bytes = 32; +#if NPY_SIMD_BIGENDIAN + const npyv_f32 mask = npyv_reinterpret_f32_u64(npyv_setall_u64(0x80000000)); #else - const npy_uintp vector_size_bytes = 32; -#endif - /* argument one scalar */ - if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), vector_size_bytes)) { - sse2_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n); - return 1; - } - /* argument two scalar */ - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), 
vector_size_bytes)) { - sse2_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n); - return 1; - } - else if (IS_BLOCKABLE_BINARY(sizeof(@type@), vector_size_bytes)) { - sse2_binary_@kind@_@TYPE@(op, ip1, ip2, n); - return 1; - } -#elif @VECTOR@ - @type@ * ip1 = (@type@ *)args[0]; - @type@ * ip2 = (@type@ *)args[1]; - @type@ * op = (@type@ *)args[2]; - npy_intp n = dimensions[0]; - /* argument one scalar */ - if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), NPY_SIMD_WIDTH)) { - simd_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n); - return 1; - } - /* argument two scalar */ - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH)) { - simd_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n); - return 1; - } - else if (IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH)) { - simd_binary_@kind@_@TYPE@(op, ip1, ip2, n); - return 1; - } + const npyv_f32 mask = npyv_reinterpret_f32_u64(npyv_setall_u64(0x8000000000000000ULL)); #endif - return 0; + return npyv_xor_f32(x, mask); } -/**end repeat1**/ -/**end repeat**/ -/******************************************************************************** - ** Defining ufunc inner functions - ********************************************************************************/ -/**begin repeat - * Float types - * #type = npy_float, npy_double# - * #TYPE = FLOAT, DOUBLE# - * #c = f, # - * #C = F, # - */ -/**begin repeat1 - * Arithmetic - * # kind = add, subtract, multiply, divide# - * # OP = +, -, *, /# - * # PW = 1, 0, 0, 0# - */ -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +NPY_FINLINE npyv_f32 +simd_cmul_f32(npyv_f32 a, npyv_f32 b) { - if (IS_BINARY_REDUCE) { -#if @PW@ - @type@ * iop1 = (@type@ *)args[0]; - npy_intp n = dimensions[0]; - - *iop1 @OP@= @TYPE@_pairwise_sum(args[1], n, steps[1]); -#else - BINARY_REDUCE_LOOP(@type@) { - io1 @OP@= *(@type@ *)ip2; - } - *((@type@ *)iop1) = io1; -#endif - } - else if (!run_binary_simd_@kind@_@TYPE@(args, dimensions, steps)) { - BINARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - *((@type@ *)op1) = in1 @OP@ in2; - } - } + npyv_f32 b_rev = npyv_permi128_f32(b, 1, 0, 3, 2); + npyv_f32 a_re = npyv_permi128_f32(a, 0, 0, 2, 2); + npyv_f32 a_im = npyv_permi128_f32(a, 1, 1, 3, 3); + // a_im * b_im, a_im * b_re + npyv_f32 ab_iiir = npyv_mul_f32(a_im, b_rev); + return npyv_muladdsub_f32(a_re, b, ab_iiir); } -/**end repeat1**/ -/**end repeat**/ -//############################################################################### -//## Complex Single/Double precision -//############################################################################### -/******************************************************************************** - ** Defining the SIMD kernels - ********************************************************************************/ -#if !defined(_MSC_VER) && defined(NPY_HAVE_AVX512F) - /** - * For somehow MSVC commit aggressive optimization lead - * to raises 'RuntimeWarning: invalid value encountered in multiply' - * - * the issue mainly caused by '_mm512_maskz_loadu_ps', we need to - * investigate about it while moving to NPYV. 
- */ - #define AVX512F_NOMSVC +NPY_FINLINE npyv_f32 +simd_csquare_f32(npyv_f32 x) +{ return simd_cmul_f32(x, x); } #endif -#ifdef AVX512F_NOMSVC -NPY_FINLINE __mmask16 -avx512_get_full_load_mask_ps(void) -{ - return 0xFFFF; -} - -NPY_FINLINE __mmask8 -avx512_get_full_load_mask_pd(void) -{ - return 0xFF; -} -NPY_FINLINE __m512 -avx512_masked_load_ps(__mmask16 mask, npy_float* addr) -{ - return _mm512_maskz_loadu_ps(mask, (__m512 *)addr); -} +#if NPY_SIMD_F64 -NPY_FINLINE __m512d -avx512_masked_load_pd(__mmask8 mask, npy_double* addr) +NPY_FINLINE npyv_f64x2 simd_set2_f64(const double *a) { - return _mm512_maskz_loadu_pd(mask, (__m512d *)addr); + npyv_f64 r = npyv_setall_f64(a[0]); + npyv_f64 i = npyv_setall_f64(a[1]); + return npyv_zip_f64(r, i); } -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16 -avx512_get_partial_load_mask_ps(const npy_int num_elem, const npy_int total_elem) +NPY_FINLINE npyv_f64 +simd_cconjugate_f64(npyv_f64 x) { - return (0x0001 << num_elem) - 0x0001; + const npyv_f64 mask = npyv_reinterpret_f64_u64(npyv_set_u64( + 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL, + 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL, + 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL, + 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL, + 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL, + 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL, + 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL, + 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL + )); + return npyv_xor_f64(x, mask); } -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8 -avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem) -{ - return (0x01 << num_elem) - 0x01; -} -/**begin repeat - * #vsub = ps, pd# - * #type= npy_float, npy_double# - * #epi_vsub = epi32, epi64# - * #vtype = __m512, __m512d# - * #mask = __mmask16, __mmask8# - * #and_const = 0x7fffffff, 0x7fffffffffffffffLL# - * #neg_mask = 0x80000000, 0x8000000000000000# - * #perm_ = 0xb1, 0x55# - * #cmpx_img_mask = 0xAAAA, 0xAA# - * #cmpx_re_mask = 0x5555, 0x55# - * #INF = NPY_INFINITYF, NPY_INFINITY# - * #NAN = NPY_NANF, NPY_NAN# - */ -NPY_FINLINE @vtype@ -avx512_hadd_@vsub@(const @vtype@ x) +NPY_FINLINE npyv_f64 +simd_cmul_f64(npyv_f64 a, npyv_f64 b) { - return _mm512_add_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@)); + npyv_f64 b_rev = npyv_permi128_f64(b, 1, 0); + npyv_f64 a_re = npyv_permi128_f64(a, 0, 0); + npyv_f64 a_im = npyv_permi128_f64(a, 1, 1); + // a_im * b_im, a_im * b_re + npyv_f64 ab_iiir = npyv_mul_f64(a_im, b_rev); + return npyv_muladdsub_f64(a_re, b, ab_iiir); } -NPY_FINLINE @vtype@ -avx512_hsub_@vsub@(const @vtype@ x) -{ - return _mm512_sub_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@)); -} -NPY_FINLINE @vtype@ -avx512_cmul_@vsub@(@vtype@ x1, @vtype@ x2) -{ - // x1 = r1, i1 - // x2 = r2, i2 - @vtype@ x3 = _mm512_permute_@vsub@(x2, @perm_@); // i2, r2 - @vtype@ x12 = _mm512_mul_@vsub@(x1, x2); // r1*r2, i1*i2 - @vtype@ x13 = _mm512_mul_@vsub@(x1, x3); // r1*i2, r2*i1 - @vtype@ outreal = avx512_hsub_@vsub@(x12); // r1*r2 - i1*i2, r1*r2 - i1*i2 - @vtype@ outimg = avx512_hadd_@vsub@(x13); // r1*i2 + i1*r2, r1*i2 + i1*r2 - return _mm512_mask_blend_@vsub@(@cmpx_img_mask@, outreal, outimg); -} -/**end repeat**/ +NPY_FINLINE npyv_f64 +simd_csquare_f64(npyv_f64 x) +{ return simd_cmul_f64(x, x); } #endif /**begin repeat - * #TYPE = CFLOAT, CDOUBLE# * #type = npy_float, npy_double# - * #num_lanes = 16, 8# - * #vsuffix = ps, pd# - * #epi_vsub = epi32, epi64# - * #mask = __mmask16, __mmask8# - * #vtype = 
__m512, __m512d# - * #scale = 4, 8# - * #vindextype = __m512i, __m256i# - * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256# - * #storemask = 0xFF, 0xF# - * #IS_FLOAT = 1, 0# - */ -/**begin repeat1 - * #func = add, subtract, multiply# - * #vectorf = _mm512_add, _mm512_sub, avx512_cmul# - */ -#if defined AVX512F_NOMSVC -static NPY_INLINE void -AVX512F_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps) -{ - const npy_intp array_size = dimensions[0]; - npy_intp num_remaining_elements = 2*array_size; - @type@* ip1 = (@type@*) args[0]; - @type@* ip2 = (@type@*) args[1]; - @type@* op = (@type@*) args[2]; - - @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@(); - - while (num_remaining_elements > 0) { - if (num_remaining_elements < @num_lanes@) { - load_mask = avx512_get_partial_load_mask_@vsuffix@( - num_remaining_elements, @num_lanes@); - } - @vtype@ x1, x2; - x1 = avx512_masked_load_@vsuffix@(load_mask, ip1); - x2 = avx512_masked_load_@vsuffix@(load_mask, ip2); - - @vtype@ out = @vectorf@_@vsuffix@(x1, x2); - - _mm512_mask_storeu_@vsuffix@(op, load_mask, out); - - ip1 += @num_lanes@; - ip2 += @num_lanes@; - op += @num_lanes@; - num_remaining_elements -= @num_lanes@; - } -} -#endif // AVX512F_NOMSVC -/**end repeat1**/ -/**end repeat**/ - -/**begin repeat - * #TYPE = CFLOAT, CDOUBLE# - * #type= npy_float, npy_double# - * #esize = 8, 16# - */ -/**begin repeat1 - * #func = add, subtract, multiply# + * #sfx = f32, f64# + * #bsfx = b32, b64# + * #usfx = b32, u64# + * #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64# + * #is_double = 0, 1# + * #c = f, # + * #INF = NPY_INFINITYF, NPY_INFINITY# + * #NAN = NPY_NANF, NPY_NAN# */ -static NPY_INLINE int -run_binary_avx512f_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps) +#if @VECTOR@ +NPY_FINLINE npyv_@sfx@ +simd_cabsolute_@sfx@(npyv_@sfx@ re, npyv_@sfx@ im) { -#if defined AVX512F_NOMSVC - if (IS_BINARY_STRIDE_ONE(@esize@, 64)) { - AVX512F_@func@_@TYPE@(args, dimensions, steps); - return 1; - } - else - return 0; -#endif - return 0; + const npyv_@sfx@ inf = npyv_setall_@sfx@(@INF@); + const npyv_@sfx@ nan = npyv_setall_@sfx@(@NAN@); + + re = npyv_abs_@sfx@(re); + im = npyv_abs_@sfx@(im); + /* + * If real or imag = INF, then convert it to inf + j*inf + * Handles: inf + j*nan, nan + j*inf + */ + npyv_@bsfx@ re_infmask = npyv_cmpeq_@sfx@(re, inf); + npyv_@bsfx@ im_infmask = npyv_cmpeq_@sfx@(im, inf); + im = npyv_select_@sfx@(re_infmask, inf, im); + re = npyv_select_@sfx@(im_infmask, inf, re); + /* + * If real or imag = NAN, then convert it to nan + j*nan + * Handles: x + j*nan, nan + j*x + */ + npyv_@bsfx@ re_nnanmask = npyv_notnan_@sfx@(re); + npyv_@bsfx@ im_nnanmask = npyv_notnan_@sfx@(im); + im = npyv_select_@sfx@(re_nnanmask, im, nan); + re = npyv_select_@sfx@(im_nnanmask, re, nan); + + npyv_@sfx@ larger = npyv_max_@sfx@(re, im); + npyv_@sfx@ smaller = npyv_min_@sfx@(im, re); + /* + * Calculate div_mask to prevent 0./0. 
and inf/inf operations in div + */ + npyv_@bsfx@ zeromask = npyv_cmpeq_@sfx@(larger, npyv_zero_@sfx@()); + npyv_@bsfx@ infmask = npyv_cmpeq_@sfx@(smaller, inf); + npyv_@bsfx@ div_mask = npyv_not_@bsfx@(npyv_or_@bsfx@(zeromask, infmask)); + + npyv_@sfx@ ratio = npyv_ifdivz_@sfx@(div_mask, smaller, larger); + npyv_@sfx@ hypot = npyv_sqrt_@sfx@( + npyv_muladd_@sfx@(ratio, ratio, npyv_setall_@sfx@(1.0@c@) + )); + return npyv_mul_@sfx@(hypot, larger); } -/**end repeat1**/ +#endif // VECTOR /**end repeat**/ /******************************************************************************** @@ -724,55 +411,345 @@ run_binary_avx512f_@func@_@TYPE@(char **args, const npy_intp *dimensions, const * complex types * #TYPE = CFLOAT, CDOUBLE# * #ftype = npy_float, npy_double# + * #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64# + * #sfx = f32, f64# * #c = f, # * #C = F, # */ /**begin repeat1 * arithmetic - * #kind = add, subtract# - * #OP = +, -# - * #PW = 1, 0# + * #kind = add, subtract, multiply# + * #vectorf = npyv_add, npyv_sub, simd_cmul# + * #OP = +, -, *# + * #PW = 1, 0, 0# + * #is_mul = 0*2, 1# */ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { - // Parenthesis around @PW@ tells clang dead code is intentional - if (IS_BINARY_REDUCE && (@PW@)) { - npy_intp n = dimensions[0]; - @ftype@ * or = ((@ftype@ *)args[0]); - @ftype@ * oi = ((@ftype@ *)args[0]) + 1; + npy_intp len = dimensions[0]; + char *b_src0 = args[0], *b_src1 = args[1], *b_dst = args[2]; + npy_intp b_ssrc0 = steps[0], b_ssrc1 = steps[1], b_sdst = steps[2]; +#if @PW@ + // reduce + if (b_ssrc0 == 0 && b_ssrc0 == b_sdst && b_src0 == b_dst && + b_ssrc1 % (sizeof(@ftype@)*2) == 0 + ) { + @ftype@ *rl_im = (@ftype@ *)b_src0; @ftype@ rr, ri; - - @TYPE@_pairwise_sum(&rr, &ri, args[1], n * 2, steps[1] / 2); - *or @OP@= rr; - *oi @OP@= ri; + @TYPE@_pairwise_sum(&rr, &ri, b_src1, len * 2, b_ssrc1 / 2); + rl_im[0] @OP@= rr; + rl_im[1] @OP@= ri; return; } - if (!run_binary_avx512f_@kind@_@TYPE@(args, dimensions, steps)) { - BINARY_LOOP { - const @ftype@ in1r = ((@ftype@ *)ip1)[0]; - const @ftype@ in1i = ((@ftype@ *)ip1)[1]; - const @ftype@ in2r = ((@ftype@ *)ip2)[0]; - const @ftype@ in2i = ((@ftype@ *)ip2)[1]; - ((@ftype@ *)op1)[0] = in1r @OP@ in2r; - ((@ftype@ *)op1)[1] = in1i @OP@ in2i; +#endif +#if @VECTOR@ + if (is_mem_overlap(b_src0, b_ssrc0, b_dst, b_sdst, len) || + is_mem_overlap(b_src1, b_ssrc1, b_dst, b_sdst, len) || + b_sdst % sizeof(@ftype@) != 0 || b_sdst == 0 || + b_ssrc0 % sizeof(@ftype@) != 0 || + b_ssrc1 % sizeof(@ftype@) != 0 + ) { + goto loop_scalar; + } + const @ftype@ *src0 = (@ftype@*)b_src0; + const @ftype@ *src1 = (@ftype@*)b_src1; + @ftype@ *dst = (@ftype@*)b_dst; + + const npy_intp ssrc0 = b_ssrc0 / sizeof(@ftype@); + const npy_intp ssrc1 = b_ssrc1 / sizeof(@ftype@); + const npy_intp sdst = b_sdst / sizeof(@ftype@); + + const int vstep = npyv_nlanes_@sfx@; + const int wstep = vstep * 2; + const int hstep = vstep / 2; + + const int loadable0 = npyv_loadable_stride_s64(ssrc0); + const int loadable1 = npyv_loadable_stride_s64(ssrc1); + const int storable = npyv_storable_stride_s64(sdst); + + // lots**lots of specializations, to squeeze out max performance + // contig + if (ssrc0 == 2 && ssrc0 == ssrc1 && ssrc0 == sdst) { + for (; len >= vstep; len -= vstep, src0 += wstep, src1 += wstep, dst += wstep) { + npyv_@sfx@ a0 = npyv_load_@sfx@(src0); + npyv_@sfx@ a1 = npyv_load_@sfx@(src0 + vstep); + npyv_@sfx@ b0 = npyv_load_@sfx@(src1); + npyv_@sfx@ 
b1 = npyv_load_@sfx@(src1 + vstep); + npyv_@sfx@ r0 = @vectorf@_@sfx@(a0, b0); + npyv_@sfx@ r1 = @vectorf@_@sfx@(a1, b1); + npyv_store_@sfx@(dst, r0); + npyv_store_@sfx@(dst + vstep, r1); + } + for (; len > 0; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) { + npyv_@sfx@ a = npyv_load2_tillz_@sfx@(src0, len); + npyv_@sfx@ b = npyv_load2_tillz_@sfx@(src1, len); + npyv_@sfx@ r = @vectorf@_@sfx@(a, b); + npyv_store2_till_@sfx@(dst, len, r); + } + } + // scalar 0 + else if (ssrc0 == 0) { + npyv_@sfx@x2 a = simd_set2_@sfx@(src0); + // contig + if (ssrc1 == 2 && sdst == ssrc1) { + for (; len >= vstep; len -= vstep, src1 += wstep, dst += wstep) { + npyv_@sfx@ b0 = npyv_load_@sfx@(src1); + npyv_@sfx@ b1 = npyv_load_@sfx@(src1 + vstep); + npyv_@sfx@ r0 = @vectorf@_@sfx@(a.val[0], b0); + npyv_@sfx@ r1 = @vectorf@_@sfx@(a.val[1], b1); + npyv_store_@sfx@(dst, r0); + npyv_store_@sfx@(dst + vstep, r1); + } + for (; len > 0; len -= hstep, src1 += vstep, dst += vstep) { + #if @is_mul@ + npyv_@sfx@ b = npyv_load2_till_@sfx@(src1, len, 1.0@c@, 1.0@c@); + #else + npyv_@sfx@ b = npyv_load2_tillz_@sfx@(src1, len); + #endif + npyv_@sfx@ r = @vectorf@_@sfx@(a.val[0], b); + npyv_store2_till_@sfx@(dst, len, r); + } + } + // non-contig + else if (loadable1 && storable) { + for (; len >= vstep; len -= vstep, src1 += ssrc1*vstep, dst += sdst*vstep) { + npyv_@sfx@ b0 = npyv_loadn2_@sfx@(src1, ssrc1); + npyv_@sfx@ b1 = npyv_loadn2_@sfx@(src1 + ssrc1*hstep, ssrc1); + npyv_@sfx@ r0 = @vectorf@_@sfx@(a.val[0], b0); + npyv_@sfx@ r1 = @vectorf@_@sfx@(a.val[1], b1); + npyv_storen2_@sfx@(dst, sdst, r0); + npyv_storen2_@sfx@(dst + sdst*hstep, sdst, r1); + } + for (; len > 0; len -= hstep, src1 += ssrc1*hstep, dst += sdst*hstep) { + #if @is_mul@ + npyv_@sfx@ b = npyv_loadn2_till_@sfx@(src1, ssrc1, len, 1.0@c@, 1.0@c@); + #else + npyv_@sfx@ b = npyv_loadn2_tillz_@sfx@(src1, ssrc1, len); + #endif + npyv_@sfx@ r = @vectorf@_@sfx@(a.val[0], b); + npyv_storen2_till_@sfx@(dst, sdst, len, r); + } + } + else { + goto loop_scalar; + } + } + // scalar 1 + else if (ssrc1 == 0) { + npyv_@sfx@x2 b = simd_set2_@sfx@(src1); + if (ssrc0 == 2 && sdst == ssrc0) { + for (; len >= vstep; len -= vstep, src0 += wstep, dst += wstep) { + npyv_@sfx@ a0 = npyv_load_@sfx@(src0); + npyv_@sfx@ a1 = npyv_load_@sfx@(src0 + vstep); + npyv_@sfx@ r0 = @vectorf@_@sfx@(a0, b.val[0]); + npyv_@sfx@ r1 = @vectorf@_@sfx@(a1, b.val[1]); + npyv_store_@sfx@(dst, r0); + npyv_store_@sfx@(dst + vstep, r1); + } + for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) { + #if @is_mul@ + npyv_@sfx@ a = npyv_load2_till_@sfx@(src0, len, 1.0@c@, 1.0@c@); + #else + npyv_@sfx@ a = npyv_load2_tillz_@sfx@(src0, len); + #endif + npyv_@sfx@ r = @vectorf@_@sfx@(a, b.val[0]); + npyv_store2_till_@sfx@(dst, len, r); + } + } + // non-contig + else if (loadable0 && storable) { + for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep, dst += sdst*vstep) { + npyv_@sfx@ a0 = npyv_loadn2_@sfx@(src0, ssrc0); + npyv_@sfx@ a1 = npyv_loadn2_@sfx@(src0 + ssrc0*hstep, ssrc0); + npyv_@sfx@ r0 = @vectorf@_@sfx@(a0, b.val[0]); + npyv_@sfx@ r1 = @vectorf@_@sfx@(a1, b.val[1]); + npyv_storen2_@sfx@(dst, sdst, r0); + npyv_storen2_@sfx@(dst + sdst*hstep, sdst, r1); + } + for (; len > 0; len -= hstep, src0 += ssrc0*hstep, dst += sdst*hstep) { + #if @is_mul@ + npyv_@sfx@ a = npyv_loadn2_till_@sfx@(src0, ssrc0, len, 1.0@c@, 1.0@c@); + #else + npyv_@sfx@ a = npyv_loadn2_tillz_@sfx@(src0, ssrc0, len); + #endif + npyv_@sfx@ r = @vectorf@_@sfx@(a, b.val[0]); + npyv_storen2_till_@sfx@(dst, sdst, len, 
r); + } } + else { + goto loop_scalar; + } + } + #if @is_mul@ + // non-contig + else if (loadable0 && loadable1 && storable) { + for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep, + src1 += ssrc1*vstep, dst += sdst*vstep + ) { + npyv_@sfx@ a0 = npyv_loadn2_@sfx@(src0, ssrc0); + npyv_@sfx@ a1 = npyv_loadn2_@sfx@(src0 + ssrc0*hstep, ssrc0); + npyv_@sfx@ b0 = npyv_loadn2_@sfx@(src1, ssrc1); + npyv_@sfx@ b1 = npyv_loadn2_@sfx@(src1 + ssrc1*hstep, ssrc1); + npyv_@sfx@ r0 = @vectorf@_@sfx@(a0, b0); + npyv_@sfx@ r1 = @vectorf@_@sfx@(a1, b1); + npyv_storen2_@sfx@(dst, sdst, r0); + npyv_storen2_@sfx@(dst + sdst*hstep, sdst, r1); + } + for (; len > 0; len -= hstep, src0 += ssrc0*hstep, + src1 += ssrc1*hstep, dst += sdst*hstep + ) { + #if @is_mul@ + npyv_@sfx@ a = npyv_loadn2_till_@sfx@(src0, ssrc0, len, 1.0@c@, 1.0@c@); + npyv_@sfx@ b = npyv_loadn2_till_@sfx@(src1, ssrc1, len, 1.0@c@, 1.0@c@); + #else + npyv_@sfx@ a = npyv_loadn2_tillz_@sfx@(src0, ssrc0, len); + npyv_@sfx@ b = npyv_loadn2_tillz_@sfx@(src1, ssrc1, len); + #endif + npyv_@sfx@ r = @vectorf@_@sfx@(a, b); + npyv_storen2_till_@sfx@(dst, sdst, len, r); + } + } + #endif + else { + goto loop_scalar; } + npyv_cleanup(); + return; +loop_scalar: +#endif + for (; len > 0; --len, b_src0 += b_ssrc0, b_src1 += b_ssrc1, b_dst += b_sdst) { + const @ftype@ a_r = ((@ftype@ *)b_src0)[0]; + const @ftype@ a_i = ((@ftype@ *)b_src0)[1]; + const @ftype@ b_r = ((@ftype@ *)b_src1)[0]; + const @ftype@ b_i = ((@ftype@ *)b_src1)[1]; + #if @is_mul@ + ((@ftype@ *)b_dst)[0] = a_r*b_r - a_i*b_i; + ((@ftype@ *)b_dst)[1] = a_r*b_i + a_i*b_r; + #else + ((@ftype@ *)b_dst)[0] = a_r @OP@ b_r; + ((@ftype@ *)b_dst)[1] = a_i @OP@ b_i; + #endif + } +} + +NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@_indexed) +(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) +{ + char *ip1 = args[0]; + char *indx = args[1]; + char *value = args[2]; + npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; + npy_intp n = dimensions[0]; + npy_intp i; + @ftype@ *indexed; + for(i = 0; i < n; i++, indx += isindex, value += isb) { + indexed = (@ftype@ *)(ip1 + is1 * *(npy_intp *)indx); + const @ftype@ b_r = ((@ftype@ *)value)[0]; + const @ftype@ b_i = ((@ftype@ *)value)[1]; + #if @is_mul@ + const @ftype@ a_r = indexed[0]; + const @ftype@ a_i = indexed[1]; + indexed[0] = a_r*b_r - a_i*b_i; + indexed[1] = a_r*b_i + a_i*b_r; + #else + indexed[0] @OP@= b_r; + indexed[1] @OP@= b_i; + #endif + } + return 0; } /**end repeat1**/ -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_multiply) +/**begin repeat1 + * #kind = conjugate, square# + * #is_square = 0, 1# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { - if (!run_binary_avx512f_multiply_@TYPE@(args, dimensions, steps)) { - BINARY_LOOP { - const @ftype@ in1r = ((@ftype@ *)ip1)[0]; - const @ftype@ in1i = ((@ftype@ *)ip1)[1]; - const @ftype@ in2r = ((@ftype@ *)ip2)[0]; - const @ftype@ in2i = ((@ftype@ *)ip2)[1]; - ((@ftype@ *)op1)[0] = in1r*in2r - in1i*in2i; - ((@ftype@ *)op1)[1] = in1r*in2i + in1i*in2r; + npy_intp len = dimensions[0]; + char *b_src = args[0], *b_dst = args[1]; + npy_intp b_ssrc = steps[0], b_sdst = steps[1]; +#if @VECTOR@ + if (is_mem_overlap(b_src, b_ssrc, b_dst, b_sdst, len) || + b_sdst % sizeof(@ftype@) != 0 || + b_ssrc % sizeof(@ftype@) != 0 + ) { + goto loop_scalar; + } + const @ftype@ *src = (@ftype@*)b_src; + @ftype@ 
*dst = (@ftype@*)b_dst; + const npy_intp ssrc = b_ssrc / sizeof(@ftype@); + const npy_intp sdst = b_sdst / sizeof(@ftype@); + + const int vstep = npyv_nlanes_@sfx@; + const int wstep = vstep * 2; + const int hstep = vstep / 2; + + if (ssrc == 2 && ssrc == sdst) { + for (; len >= vstep; len -= vstep, src += wstep, dst += wstep) { + npyv_@sfx@ a0 = npyv_load_@sfx@(src); + npyv_@sfx@ a1 = npyv_load_@sfx@(src + vstep); + npyv_@sfx@ r0 = simd_c@kind@_@sfx@(a0); + npyv_@sfx@ r1 = simd_c@kind@_@sfx@(a1); + npyv_store_@sfx@(dst, r0); + npyv_store_@sfx@(dst + vstep, r1); + } + for (; len > 0; len -= hstep, src += vstep, dst += vstep) { + npyv_@sfx@ a = npyv_load2_tillz_@sfx@(src, len); + npyv_@sfx@ r = simd_c@kind@_@sfx@(a); + npyv_store2_till_@sfx@(dst, len, r); + } + } + else if (ssrc == 2 && npyv_storable_stride_s64(sdst)) { + for (; len >= vstep; len -= vstep, src += wstep, dst += sdst*vstep) { + npyv_@sfx@ a0 = npyv_load_@sfx@(src); + npyv_@sfx@ a1 = npyv_load_@sfx@(src + vstep); + npyv_@sfx@ r0 = simd_c@kind@_@sfx@(a0); + npyv_@sfx@ r1 = simd_c@kind@_@sfx@(a1); + npyv_storen2_@sfx@(dst, sdst, r0); + npyv_storen2_@sfx@(dst + sdst*hstep, sdst, r1); + } + for (; len > 0; len -= hstep, src += vstep, dst += sdst*hstep) { + npyv_@sfx@ a = npyv_load2_tillz_@sfx@(src, len); + npyv_@sfx@ r = simd_c@kind@_@sfx@(a); + npyv_storen2_till_@sfx@(dst, sdst, len, r); + } + } + else if (sdst == 2 && npyv_loadable_stride_s64(ssrc)) { + for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += wstep) { + npyv_@sfx@ a0 = npyv_loadn2_@sfx@(src, ssrc); + npyv_@sfx@ a1 = npyv_loadn2_@sfx@(src + ssrc*hstep, ssrc); + npyv_@sfx@ r0 = simd_c@kind@_@sfx@(a0); + npyv_@sfx@ r1 = simd_c@kind@_@sfx@(a1); + npyv_store_@sfx@(dst, r0); + npyv_store_@sfx@(dst + vstep, r1); + } + for (; len > 0; len -= hstep, src += ssrc*hstep, dst += vstep) { + npyv_@sfx@ a = npyv_loadn2_tillz_@sfx@((@ftype@*)src, ssrc, len); + npyv_@sfx@ r = simd_c@kind@_@sfx@(a); + npyv_store2_till_@sfx@(dst, len, r); } } + else { + goto loop_scalar; + } + npyv_cleanup(); + return; +loop_scalar: +#endif + for (; len > 0; --len, b_src += b_ssrc, b_dst += b_sdst) { + const @ftype@ rl = ((@ftype@ *)b_src)[0]; + const @ftype@ im = ((@ftype@ *)b_src)[1]; + #if @is_square@ + ((@ftype@ *)b_dst)[0] = rl*rl - im*im; + ((@ftype@ *)b_dst)[1] = rl*im + im*rl; + #else + ((@ftype@ *)b_dst)[0] = rl; + ((@ftype@ *)b_dst)[1] = -im; + #endif + } } +/**end repeat1**/ /**end repeat**/ diff --git a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src index 5b5f13ad1..b6f126298 100644 --- a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src +++ b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src @@ -42,7 +42,7 @@ * #sfx = s8, s16, s32, s64# * #len = 8, 16, 32, 64# */ -static NPY_INLINE void +static inline void simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) { npyv_lanetype_@sfx@ *src = (npyv_lanetype_@sfx@ *) args[0]; @@ -108,7 +108,7 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) * #sfx = u8, u16, u32, u64# * #len = 8, 16, 32, 64# */ -static NPY_INLINE void +static inline void simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) { npyv_lanetype_@sfx@ *src = (npyv_lanetype_@sfx@ *) args[0]; @@ -207,7 +207,7 @@ vsx4_div_@t@16(npyv_@t@16 a, npyv_@t@16 b) * #sfx = u8, u16, u32, u64# * #len = 8, 16, 32, 64# */ -static NPY_INLINE void +static inline void vsx4_simd_divide_contig_@sfx@(char **args, npy_intp len) { npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0]; @@ -246,7 
+246,7 @@ vsx4_simd_divide_contig_@sfx@(char **args, npy_intp len) * #sfx = s8, s16, s32, s64# * #len = 8, 16, 32, 64# */ -static NPY_INLINE void +static inline void vsx4_simd_divide_contig_@sfx@(char **args, npy_intp len) { npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0]; @@ -395,6 +395,24 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide) } } } + +NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_divide_indexed) +(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) +{ + char *ip1 = args[0]; + char *indx = args[1]; + char *value = args[2]; + npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; + npy_intp n = dimensions[0]; + npy_intp i; + @type@ *indexed; + for(i = 0; i < n; i++, indx += isindex, value += isb) { + indexed = (@type@ *)(ip1 + is1 * *(npy_intp *)indx); + *indexed = floor_div_@TYPE@(*indexed, *(@type@ *)value); + } + return 0; +} + /**end repeat**/ /**begin repeat @@ -463,4 +481,28 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide) } } } + +NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_divide_indexed) +(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) +{ + char *ip1 = args[0]; + char *indx = args[1]; + char *value = args[2]; + npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; + npy_intp n = dimensions[0]; + npy_intp i; + @type@ *indexed; + for(i = 0; i < n; i++, indx += isindex, value += isb) { + indexed = (@type@ *)(ip1 + is1 * *(npy_intp *)indx); + @type@ in2 = *(@type@ *)value; + if (NPY_UNLIKELY(in2 == 0)) { + npy_set_floatstatus_divbyzero(); + *indexed = 0; + } else { + *indexed = *indexed / in2; + } + } + return 0; +} + /**end repeat**/ diff --git a/numpy/core/src/umath/loops_autovec.dispatch.c.src b/numpy/core/src/umath/loops_autovec.dispatch.c.src new file mode 100644 index 000000000..bdbfa0f86 --- /dev/null +++ b/numpy/core/src/umath/loops_autovec.dispatch.c.src @@ -0,0 +1,287 @@ +/*@targets + ** $maxopt $autovec baseline + ** sse2 avx2 + ** neon + ** vsx2 + ** vx + **/ +#define _UMATHMODULE +#define _MULTIARRAYMODULE +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#include "simd/simd.h" +#include "loops_utils.h" +#include "loops.h" +// Provides the various *_LOOP macros +#include "fast_loop_macros.h" + +/* + ***************************************************************************** + ** INTEGER LOOPS + ***************************************************************************** + */ +/* + * Arithmetic bit shift operations. + * + * Intel hardware masks bit shift values, so large shifts wrap around + * and can produce surprising results. The special handling ensures that + * behavior is independent of compiler or hardware. + * TODO: We could implement consistent behavior for negative shifts, + * which is undefined in C. 
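 *
 * As an illustration only (a sketch, not the actual npy_lshift helper):
 * x86 masks a 32-bit shift count to its low 5 bits, so `x << 32` behaves
 * like `x << 0`, while a guarded shift returns 0 for out-of-range counts:
 *
 *   npy_uint32 guarded_lshift_u32(npy_uint32 x, npy_uint32 count)
 *   {
 *       return count < 32 ? (x << count) : 0;
 *   }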
+ */ +#define INT_left_shift_needs_clear_floatstatus +#define UINT_left_shift_needs_clear_floatstatus + +/**begin repeat + * #TYPE = BYTE, UBYTE, SHORT, USHORT, INT, UINT, + * LONG, ULONG, LONGLONG, ULONGLONG# + * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint, + * npy_long, npy_ulong, npy_longlong, npy_ulonglong# + * #ftype = npy_float, npy_float, npy_float, npy_float, npy_double, npy_double, + * npy_double, npy_double, npy_double, npy_double# + * #SIGNED = 1, 0, 1, 0, 1, 0, 1, 0, 1, 0# + * #c = hh,uhh,h,uh,,u,l,ul,ll,ull# + */ + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_positive) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + UNARY_LOOP_FAST(@type@, @type@, *out = +in); +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_square) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) +{ + UNARY_LOOP_FAST(@type@, @type@, *out = in * in); +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_reciprocal) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) +{ + UNARY_LOOP_FAST(@type@, @type@, *out = 1.0 / in); +} + +/**begin repeat1 + * Arithmetic + * #kind = add, subtract, multiply# + * #OP = +, -, *# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + if (IS_BINARY_REDUCE) { + BINARY_REDUCE_LOOP_FAST(@type@, io1 @OP@= in2); + } + else { + BINARY_LOOP_FAST(@type@, @type@, *out = in1 @OP@ in2); + } +} +/**end repeat1**/ + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_left_shift) +(char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)) +{ + BINARY_LOOP_FAST(@type@, @type@, *out = npy_lshift@c@(in1, in2)); +#ifdef @TYPE@_left_shift_needs_clear_floatstatus + // For some reason, our macOS CI sets an "invalid" flag here, but only + // for some types. + npy_clear_floatstatus_barrier((char*)dimensions); +#endif +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_right_shift) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#ifndef NPY_DO_NOT_OPTIMIZE_@TYPE@_right_shift + BINARY_LOOP_FAST(@type@, @type@, *out = npy_rshift@c@(in1, in2)); +#else + BINARY_LOOP { + @type@ in1 = *(@type@ *)ip1; + @type@ in2 = *(@type@ *)ip2; + *(@type@ *)op1 = npy_rshift@c@(in1, in2); + } +#endif +} +/**end repeat**/ + +/* + ***************************************************************************** + ** UNSIGNED INTEGER LOOPS + ***************************************************************************** + */ +/**begin repeat + * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG# + * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong# + * #c = u,u,u,ul,ull# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_absolute) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + UNARY_LOOP_FAST(@type@, @type@, *out = in); +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_sign) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 
1 : 0); +} + +/**begin repeat1 + * Arithmetic + * #kind = bitwise_and, bitwise_or, bitwise_xor# + * #OP = &, |, ^# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + if (IS_BINARY_REDUCE) { + BINARY_REDUCE_LOOP_FAST(@type@, io1 @OP@= in2); + } + else { + BINARY_LOOP_FAST(@type@, @type@, *out = in1 @OP@ in2); + } +} +/**end repeat1**/ + +/**begin repeat1 + * #kind = logical_and, logical_or# + * #OP = &&, ||# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + /* + * gcc vectorization of this is not good (PR60575) but manual integer + * vectorization is too tedious to be worthwhile + */ + BINARY_LOOP_FAST(@type@, npy_bool, *out = in1 @OP@ in2); +} +/**end repeat1**/ + +NPY_FINLINE npy_bool @TYPE@_logical_xor_(@type@ in1, @type@ in2) +{ return (!!in1) != (!!in2); } + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_logical_xor) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + BINARY_LOOP_FAST(@type@, npy_bool, *out = @TYPE@_logical_xor_(in1, in2)); +} + +/**begin repeat1 + * #kind = isnan, isinf, isfinite# + * #func = npy_isnan, npy_isinf, npy_isfinite# + * #val = NPY_FALSE, NPY_FALSE, NPY_TRUE# + **/ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + /* + * The (void)in; suppresses an unused variable warning raised by gcc and allows + * us to re-use this macro even though we do not depend on in + */ + UNARY_LOOP_FAST(@type@, npy_bool, (void)in; *out = @val@); +} +/**end repeat1**/ + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_conjugate) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + UNARY_LOOP_FAST(@type@, @type@, *out = in); +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_logical_not) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + UNARY_LOOP_FAST(@type@, npy_bool, *out = !in); +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_invert) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + UNARY_LOOP_FAST(@type@, @type@, *out = ~in); +} +/**end repeat**/ + +/* + ***************************************************************************** + ** SIGNED! INTEGER LOOPS + ***************************************************************************** + */ + +/**begin repeat + * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG# + * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong# + * #c = ,,,l,ll# + */ + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_absolute) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + UNARY_LOOP_FAST(@type@, @type@, *out = (in >= 0) ? in : -in); +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_sign) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : (in < 0 ? 
-1 : 0)); +} + +/**begin repeat1 + * #kind = conjugate, invert, isnan, isinf, isfinite, + * logical_and, logical_or, logical_xor, logical_not, + * bitwise_and, bitwise_or, bitwise_xor# + **/ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func) +{ + NPY_CPU_DISPATCH_CURFX(U@TYPE@_@kind@)(args, dimensions, steps, func); +} +/**end repeat1**/ +/**end repeat**/ + +/* + ***************************************************************************** + ** BOOLEAN LOOPS ** + ***************************************************************************** + */ +/**begin repeat + * #kind = isnan, isinf, isfinite# + * #func = npy_isnan, npy_isinf, npy_isfinite# + * #val = NPY_FALSE, NPY_FALSE, NPY_TRUE# + **/ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func) +{ + NPY_CPU_DISPATCH_CURFX(UBYTE_@kind@)(args, dimensions, steps, func); +} +/**end repeat**/ + +/* + ***************************************************************************** + ** HALF-FLOAT LOOPS ** + ***************************************************************************** + */ + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(HALF_absolute) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + UNARY_LOOP_FAST(npy_half, npy_half, *out = in&0x7fffu); +} + +/* + ***************************************************************************** + ** DATETIME LOOPS ** + ***************************************************************************** + */ + +/**begin repeat + * #type = npy_datetime, npy_timedelta# + * #TYPE = DATETIME, TIMEDELTA# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_isinf) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func) +{ + NPY_CPU_DISPATCH_CURFX(ULONGLONG_isinf)(args, dimensions, steps, func); +} +/**end repeat**/ diff --git a/numpy/core/src/umath/loops_comparison.dispatch.c.src b/numpy/core/src/umath/loops_comparison.dispatch.c.src index 2f75593a5..751080871 100644 --- a/numpy/core/src/umath/loops_comparison.dispatch.c.src +++ b/numpy/core/src/umath/loops_comparison.dispatch.c.src @@ -234,7 +234,7 @@ static void simd_binary_@kind@_b8(char **args, npy_intp len) npyv_b8 a = npyv_cmpeq_u8(npyv_load_u8(src1), vzero); npyv_b8 b = npyv_cmpeq_u8(npyv_load_u8(src2), vzero); npyv_b8 c = npyv_@VOP@_b8(a, b); - npyv_store_u8(dst, npyv_andc_u8(npyv_cvt_u8_b8(c), truemask)); + npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask)); } for (; len > 0; --len, ++src1, ++src2, ++dst) { @@ -258,7 +258,7 @@ static void simd_binary_scalar1_@kind@_b8(char **args, npy_intp len) for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { npyv_b8 b = npyv_cmpeq_u8(npyv_load_u8(src), vzero); npyv_b8 c = npyv_@VOP@_b8(a, b); - npyv_store_u8(dst, npyv_andc_u8(npyv_cvt_u8_b8(c), truemask)); + npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask)); } for (; len > 0; --len, ++src, ++dst) { @@ -281,7 +281,7 @@ static void simd_binary_scalar2_@kind@_b8(char **args, npy_intp len) for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { npyv_b8 a = npyv_cmpeq_u8(npyv_load_u8(src), vzero); npyv_b8 c = npyv_@VOP@_b8(a, b); - npyv_store_u8(dst, npyv_andc_u8(npyv_cvt_u8_b8(c), truemask)); + npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask)); } for (; len > 0; --len, ++src, ++dst) { @@ -308,23 +308,27 @@ static void simd_binary_scalar2_@kind@_b8(char **args, npy_intp len) * #OP = ==, !=, <, 
<=# */ #if !((@eq@ || @neq@) && @signed@) -static NPY_INLINE void +static inline void run_binary_simd_@kind@_@sfx@(char **args, npy_intp const *dimensions, npy_intp const *steps) { #if @VECTOR@ - /* argument one scalar */ - if (IS_BLOCKABLE_BINARY_SCALAR1_BOOL(sizeof(@type@), NPY_SIMD_WIDTH)) { - simd_binary_scalar1_@kind@_@sfx@(args, dimensions[0]); - return; - } - /* argument two scalar */ - else if (IS_BLOCKABLE_BINARY_SCALAR2_BOOL(sizeof(@type@), NPY_SIMD_WIDTH)) { - simd_binary_scalar2_@kind@_@sfx@(args, dimensions[0]); - return; - } - else if (IS_BLOCKABLE_BINARY_BOOL(sizeof(@type@), NPY_SIMD_WIDTH)) { - simd_binary_@kind@_@sfx@(args, dimensions[0]); - return; + if (!is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) && + !is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]) + ) { + /* argument one scalar */ + if (IS_BINARY_CONT_S1(@type@, npy_bool)) { + simd_binary_scalar1_@kind@_@sfx@(args, dimensions[0]); + return; + } + /* argument two scalar */ + else if (IS_BINARY_CONT_S2(@type@, npy_bool)) { + simd_binary_scalar2_@kind@_@sfx@(args, dimensions[0]); + return; + } + else if (IS_BINARY_CONT(@type@, npy_bool)) { + simd_binary_@kind@_@sfx@(args, dimensions[0]); + return; + } } #endif diff --git a/numpy/core/src/umath/loops_exponent_log.dispatch.c.src b/numpy/core/src/umath/loops_exponent_log.dispatch.c.src index 8f123a48b..1fac3c150 100644 --- a/numpy/core/src/umath/loops_exponent_log.dispatch.c.src +++ b/numpy/core/src/umath/loops_exponent_log.dispatch.c.src @@ -239,7 +239,7 @@ fma_scalef_ps(__m256 poly, __m256 quadrant) #ifdef SIMD_AVX512F -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16 +NPY_FINLINE __mmask16 avx512_get_full_load_mask_ps(void) { return 0xFFFF; @@ -1146,7 +1146,7 @@ AVX512F_log_DOUBLE(npy_double * op, * #vtype2_scatter = _mm512_mask_i32scatter_epi32, _mm256_mask_i32scatter_epi32# * #setzero = _mm512_setzero_epi32, _mm256_setzero_si256# */ -static NPY_INLINE void +static inline void AVX512_SKX_ldexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps) { const npy_intp stride_ip1 = steps[0]/(npy_intp)sizeof(@type@); @@ -1215,7 +1215,7 @@ AVX512_SKX_ldexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const } } -static NPY_INLINE void +static inline void AVX512_SKX_frexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps) { const npy_intp stride_ip1 = steps[0]/(npy_intp)sizeof(@type@); diff --git a/numpy/core/src/umath/loops_logical.dispatch.c.src b/numpy/core/src/umath/loops_logical.dispatch.c.src new file mode 100644 index 000000000..c07525be4 --- /dev/null +++ b/numpy/core/src/umath/loops_logical.dispatch.c.src @@ -0,0 +1,377 @@ +/*@targets + ** $maxopt baseline + ** neon asimd + ** sse2 avx2 avx512_skx + ** vsx2 + ** vx + **/ +#define _UMATHMODULE +#define _MULTIARRAYMODULE +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#include "simd/simd.h" +#include "loops_utils.h" +#include "loops.h" +#include "lowlevel_strided_loops.h" +// Provides the various *_LOOP macros +#include "fast_loop_macros.h" + +/******************************************************************************* + ** Defining the SIMD kernels + ******************************************************************************/ + +#if NPY_SIMD +/* + * convert any bit set to boolean true so vectorized and normal operations are + * consistent, should not be required if bool is used correctly everywhere but + * you never know + */ +NPY_FINLINE npyv_u8 byte_to_true(npyv_u8 v) +{ + const npyv_u8 zero = npyv_zero_u8(); 
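    /*
     * Lane-wise this computes: v == 0 ? 0x00 : 0x01, i.e. any non-zero byte
     * is mapped to the canonical boolean true. A scalar sketch of the same
     * mapping, for illustration only:
     *   npy_uint8 byte_to_true_scalar(npy_uint8 v) { return v != 0; }
     */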
+ const npyv_u8 truemask = npyv_setall_u8(1 == 1); + // cmpeq(v, 0) turns 0x00 -> 0xff and non-zero -> 0x00 + npyv_u8 tmp = npyv_cvt_u8_b8(npyv_cmpeq_u8(v, zero)); + // tmp is filled with 0xff/0x00, negate and mask to boolean true + return npyv_andc_u8(truemask, tmp); +} +/* + * convert mask vector (0xff/0x00) to boolean true. similar to byte_to_true(), + * but we've already got a mask and can skip negation. + */ +NPY_FINLINE npyv_u8 mask_to_true(npyv_b8 v) +{ + const npyv_u8 truemask = npyv_setall_u8(1 == 1); + return npyv_and_u8(truemask, npyv_cvt_u8_b8(v)); +} +/* + * For logical_and, we have to be careful to handle non-bool inputs where + * bits of each operand might not overlap. Example: a = 0x01, b = 0x80 + * Both evaluate to boolean true, however, a & b is false. Return value + * should be consistent with byte_to_true(). + */ +NPY_FINLINE npyv_u8 simd_logical_and_u8(npyv_u8 a, npyv_u8 b) +{ + const npyv_u8 zero = npyv_zero_u8(); + const npyv_u8 truemask = npyv_setall_u8(1 == 1); + npyv_b8 ma = npyv_cmpeq_u8(a, zero); + npyv_b8 mb = npyv_cmpeq_u8(b, zero); + npyv_u8 r = npyv_cvt_u8_b8(npyv_or_b8(ma, mb)); + return npyv_andc_u8(truemask, r); +} +/* + * We don't really need the following, but it simplifies the templating code + * below since it is paired with simd_logical_and_u8() above. + */ +NPY_FINLINE npyv_u8 simd_logical_or_u8(npyv_u8 a, npyv_u8 b) +{ + npyv_u8 r = npyv_or_u8(a, b); + return byte_to_true(r); +} + + +/**begin repeat + * #kind = logical_and, logical_or# + * #and = 1, 0# + * #scalar_op = &&, ||# + * #intrin = and, or# + * #reduce = min, max# + * #scalar_cmp = ==, !=# + * #anyall = all, any# + */ +static void +simd_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2, npy_intp len) +{ + #define UNROLL 16 + + const int vstep = npyv_nlanes_u8; + const int wstep = vstep * UNROLL; + + // Unrolled vectors loop + for (; len >= wstep; len -= wstep, ip1 += wstep, ip2 += wstep, op += wstep) { + /**begin repeat1 + * #unroll = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15# + */ + #if UNROLL > @unroll@ + npyv_u8 a@unroll@ = npyv_load_u8(ip1 + vstep * @unroll@); + npyv_u8 b@unroll@ = npyv_load_u8(ip2 + vstep * @unroll@); + npyv_u8 r@unroll@ = simd_logical_@intrin@_u8(a@unroll@, b@unroll@); + npyv_store_u8(op + vstep * @unroll@, r@unroll@); + #endif + /**end repeat1**/ + } + #undef UNROLL + + // Single vectors loop + for (; len >= vstep; len -= vstep, ip1 += vstep, ip2 += vstep, op += vstep) { + npyv_u8 a = npyv_load_u8(ip1); + npyv_u8 b = npyv_load_u8(ip2); + npyv_u8 r = simd_logical_@intrin@_u8(a, b); + npyv_store_u8(op, r); + } + + // Scalar loop to finish off + for (; len > 0; len--, ip1++, ip2++, op++) { + *op = *ip1 @scalar_op@ *ip2; + } +} + +static void +simd_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, npy_intp len) +{ + #define UNROLL 8 + + const int vstep = npyv_nlanes_u8; + const int wstep = vstep * UNROLL; + + // Unrolled vectors loop + for (; len >= wstep; len -= wstep, ip += wstep) { + #if defined(NPY_HAVE_SSE2) + NPY_PREFETCH(ip + wstep, 0, 3); + #endif + npyv_u8 v0 = npyv_load_u8(ip + vstep * 0); + npyv_u8 v1 = npyv_load_u8(ip + vstep * 1); + npyv_u8 v2 = npyv_load_u8(ip + vstep * 2); + npyv_u8 v3 = npyv_load_u8(ip + vstep * 3); + npyv_u8 v4 = npyv_load_u8(ip + vstep * 4); + npyv_u8 v5 = npyv_load_u8(ip + vstep * 5); + npyv_u8 v6 = npyv_load_u8(ip + vstep * 6); + npyv_u8 v7 = npyv_load_u8(ip + vstep * 7); + + npyv_u8 m01 = npyv_@reduce@_u8(v0, v1); + npyv_u8 m23 = npyv_@reduce@_u8(v2, v3); + npyv_u8 m45 = npyv_@reduce@_u8(v4, v5); + npyv_u8 
m67 = npyv_@reduce@_u8(v6, v7); + + npyv_u8 m0123 = npyv_@reduce@_u8(m01, m23); + npyv_u8 m4567 = npyv_@reduce@_u8(m45, m67); + + npyv_u8 mv = npyv_@reduce@_u8(m0123, m4567); + + if(npyv_@anyall@_u8(mv) @scalar_cmp@ 0){ + *op = !@and@; + return; + } + } + + // Single vectors loop + for (; len >= vstep; len -= vstep, ip += vstep) { + npyv_u8 v0 = npyv_load_u8(ip); + if(npyv_@anyall@_u8(v0) @scalar_cmp@ 0){ + *op = !@and@; + return; + } + } + + // Scalar loop to finish off + for (; len > 0; --len, ++ip) { + *op = *op @scalar_op@ *ip; + if (*op @scalar_cmp@ 0) { + return; + } + } +#undef UNROLL +} +/**end repeat**/ + +/**begin repeat + * #kind = logical_not, absolute# + * #op = ==, !=# + * #not = 1, 0# + */ +static void +simd_@kind@_BOOL(npy_bool * op, npy_bool * ip, npy_intp len) +{ + #define UNROLL 16 + + const int vstep = npyv_nlanes_u8; + const int wstep = vstep * UNROLL; + + #if @not@ + const npyv_u8 zero = npyv_zero_u8(); + #endif + + // Unrolled vectors loop + for (; len >= wstep; len -= wstep, ip += wstep, op += wstep) { + /**begin repeat1 + * #unroll = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15# + */ + #if UNROLL > @unroll@ + npyv_u8 v@unroll@ = npyv_load_u8(ip + vstep * @unroll@); +#if @not@ + npyv_u8 r@unroll@ = mask_to_true(npyv_cmpeq_u8(v@unroll@, zero)); +#else + npyv_u8 r@unroll@ = byte_to_true(v@unroll@); +#endif + npyv_store_u8(op + vstep * @unroll@, r@unroll@); + #endif + /**end repeat1**/ + } + #undef UNROLL + + // Single vectors loop + for (; len >= vstep; len -= vstep, ip += vstep, op += vstep) { + npyv_u8 v = npyv_load_u8(ip); +#if @not@ + npyv_u8 r = mask_to_true(npyv_cmpeq_u8(v, zero)); +#else + npyv_u8 r = byte_to_true(v); +#endif + npyv_store_u8(op, r); + } + + // Scalar loop to finish off + for (; len > 0; --len, ++ip, ++op) { + *op = (*ip @op@ 0); + } +} +/**end repeat**/ + +#endif // NPY_SIMD + +/******************************************************************************* + ** Defining ufunc inner functions + ******************************************************************************/ + +/**begin repeat + * # kind = logical_or, logical_and# + */ +static NPY_INLINE int +run_binary_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps) +{ +#if NPY_SIMD + if (sizeof(npy_bool) == 1 && + IS_BLOCKABLE_BINARY(sizeof(npy_bool), NPY_SIMD_WIDTH)) { + simd_binary_@kind@_BOOL((npy_bool*)args[2], (npy_bool*)args[0], + (npy_bool*)args[1], dimensions[0]); + return 1; + } +#endif + return 0; +} + + +static NPY_INLINE int +run_reduce_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps) +{ +#if NPY_SIMD + if (sizeof(npy_bool) == 1 && + IS_BLOCKABLE_REDUCE(sizeof(npy_bool), NPY_SIMD_WIDTH)) { + simd_reduce_@kind@_BOOL((npy_bool*)args[0], (npy_bool*)args[1], + dimensions[0]); + return 1; + } +#endif + return 0; +} +/**end repeat**/ + +/**begin repeat + * #kind = logical_not, absolute# + */ +static NPY_INLINE int +run_unary_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps) +{ +#if NPY_SIMD + if (sizeof(npy_bool) == 1 && + IS_BLOCKABLE_UNARY(sizeof(npy_bool), NPY_SIMD_WIDTH)) { + simd_@kind@_BOOL((npy_bool*)args[1], (npy_bool*)args[0], dimensions[0]); + return 1; + } +#endif + return 0; +} +/**end repeat**/ + + +/**begin repeat + * #kind = logical_and, logical_or# + * #OP = &&, ||# + * #SC = ==, !=# + * #and = 1, 0# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + 
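    /*
     * Reduction calls (np.all / np.any) can stop as soon as the result is
     * decided, which is what the SIMD min/max + any/all path above exploits.
     * A scalar sketch of the same early-exit idea, assuming 1-byte bools
     * (illustration only, not the dispatched loop itself):
     *
     *   npy_bool all_true(const npy_bool *p, npy_intp n)
     *   {
     *       for (npy_intp i = 0; i < n; ++i) {
     *           if (!p[i]) { return 0; }  // exit once the result is known
     *       }
     *       return 1;
     *   }
     */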
if(IS_BINARY_REDUCE) { +#if NPY_SIMD + /* + * stick with our variant for more reliable performance, only known + * platform which outperforms it by ~20% is an i7 with glibc 2.17 + */ + if (run_reduce_simd_@kind@_BOOL(args, dimensions, steps)) { + return; + } +#else + /* for now only use libc on 32-bit/non-x86 */ + if (steps[1] == 1) { + npy_bool * op = (npy_bool *)args[0]; +#if @and@ + /* np.all(), search for a zero (false) */ + if (*op) { + *op = memchr(args[1], 0, dimensions[0]) == NULL; + } +#else + /* + * np.any(), search for a non-zero (true) via comparing against + * zero blocks, memcmp is faster than memchr on SSE4 machines + * with glibc >= 2.12 and memchr can only check for equal 1 + */ + static const npy_bool zero[4096]; /* zero by C standard */ + npy_uintp i, n = dimensions[0]; + + for (i = 0; !*op && i < n - (n % sizeof(zero)); i += sizeof(zero)) { + *op = memcmp(&args[1][i], zero, sizeof(zero)) != 0; + } + if (!*op && n - i > 0) { + *op = memcmp(&args[1][i], zero, n - i) != 0; + } +#endif + return; + } +#endif + else { + BINARY_REDUCE_LOOP(npy_bool) { + const npy_bool in2 = *(npy_bool *)ip2; + io1 = io1 @OP@ in2; + if (io1 @SC@ 0) { + break; + } + } + *((npy_bool *)iop1) = io1; + } + } + else { + if (run_binary_simd_@kind@_BOOL(args, dimensions, steps)) { + return; + } + else { + BINARY_LOOP { + const npy_bool in1 = *(npy_bool *)ip1; + const npy_bool in2 = *(npy_bool *)ip2; + *((npy_bool *)op1) = in1 @OP@ in2; + } + } + } +} +/**end repeat**/ + +/**begin repeat + * #kind = logical_not, absolute# + * #OP = ==, !=# + **/ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + if (run_unary_simd_@kind@_BOOL(args, dimensions, steps)) { + return; + } + else { + UNARY_LOOP { + npy_bool in1 = *(npy_bool *)ip1; + *((npy_bool *)op1) = in1 @OP@ 0; + } + } +} +/**end repeat**/ + diff --git a/numpy/core/src/umath/loops_minmax.dispatch.c.src b/numpy/core/src/umath/loops_minmax.dispatch.c.src index 237c8e933..9d8667d38 100644 --- a/numpy/core/src/umath/loops_minmax.dispatch.c.src +++ b/numpy/core/src/umath/loops_minmax.dispatch.c.src @@ -451,6 +451,24 @@ clear_fp: #endif } + +NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@_indexed) +(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) +{ + char *ip1 = args[0]; + char *indx = args[1]; + char *value = args[2]; + npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; + npy_intp n = dimensions[0]; + npy_intp i; + @type@ *indexed; + for(i = 0; i < n; i++, indx += isindex, value += isb) { + indexed = (@type@ *)(ip1 + is1 * *(npy_intp *)indx); + *indexed = SCALAR_OP(*indexed, *(@type@ *)value); + } + return 0; +} + #undef SCALAR_OP #endif // !fp_only || (is_fp && fp_only) diff --git a/numpy/core/src/umath/loops_modulo.dispatch.c.src b/numpy/core/src/umath/loops_modulo.dispatch.c.src index 53b7da289..25edffb1e 100644 --- a/numpy/core/src/umath/loops_modulo.dispatch.c.src +++ b/numpy/core/src/umath/loops_modulo.dispatch.c.src @@ -171,7 +171,7 @@ vsx4_divisor_@sfx@(const npyv_@sfx@ vscalar) * #func = fmod, remainder, divmod# * #id = 0, 1, 2# */ -static NPY_INLINE void +static inline void vsx4_simd_@func@_contig_@sfx@(char **args, npy_intp len) { npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0]; @@ -239,7 +239,7 @@ vsx4_simd_@func@_contig_@sfx@(char **args, npy_intp len) npyv_cleanup(); } -static NPY_INLINE void +static inline void 
vsx4_simd_@func@_by_scalar_contig_@sfx@(char **args, npy_intp len) { npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0]; @@ -292,7 +292,7 @@ vsx4_simd_@func@_by_scalar_contig_@sfx@(char **args, npy_intp len) * #func = fmod, remainder, divmod# * #id = 0, 1, 2# */ -static NPY_INLINE void +static inline void vsx4_simd_@func@_contig_@sfx@(char **args, npy_intp len) { npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0]; @@ -410,7 +410,7 @@ vsx4_simd_@func@_contig_@sfx@(char **args, npy_intp len) npyv_cleanup(); } -static NPY_INLINE void +static inline void vsx4_simd_@func@_by_scalar_contig_@sfx@(char **args, npy_intp len) { npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0]; diff --git a/numpy/core/src/umath/loops_trigonometric.dispatch.c.src b/numpy/core/src/umath/loops_trigonometric.dispatch.c.src index 78685e807..43eb58ffe 100644 --- a/numpy/core/src/umath/loops_trigonometric.dispatch.c.src +++ b/numpy/core/src/umath/loops_trigonometric.dispatch.c.src @@ -9,12 +9,18 @@ #include "simd/simd.h" #include "loops_utils.h" #include "loops.h" +#include "fast_loop_macros.h" /* * TODO: * - use vectorized version of Payne-Hanek style reduction for large elements or * when there's no native FUSED support instead of fallback to libc */ -#if NPY_SIMD_F32 && NPY_SIMD_FMA3 // native support +#if NPY_SIMD_FMA3 // native support +/**begin repeat + * #check = F64, F32# + * #sfx = f64, f32# + */ +#if NPY_SIMD_@check@ /* * Vectorized Cody-Waite range reduction technique * Performs the reduction step x* = x - y*C in three steps: @@ -23,14 +29,189 @@ * 3) x* = x - y*c3 * c1, c2 are exact floating points, c3 = C - c1 - c2 simulates higher precision */ -NPY_FINLINE npyv_f32 -simd_range_reduction_f32(npyv_f32 x, npyv_f32 y, npyv_f32 c1, npyv_f32 c2, npyv_f32 c3) +NPY_FINLINE npyv_@sfx@ +simd_range_reduction_@sfx@(npyv_@sfx@ x, npyv_@sfx@ y, npyv_@sfx@ c1, npyv_@sfx@ c2, npyv_@sfx@ c3) { - npyv_f32 reduced_x = npyv_muladd_f32(y, c1, x); - reduced_x = npyv_muladd_f32(y, c2, reduced_x); - reduced_x = npyv_muladd_f32(y, c3, reduced_x); + npyv_@sfx@ reduced_x = npyv_muladd_@sfx@(y, c1, x); + reduced_x = npyv_muladd_@sfx@(y, c2, reduced_x); + reduced_x = npyv_muladd_@sfx@(y, c3, reduced_x); return reduced_x; } +#endif +/**end repeat**/ + +#if NPY_SIMD_F64 +/**begin repeat + * #op = cos, sin# + */ +#if defined(NPY_OS_WIN32) || defined(NPY_OS_CYGWIN) +NPY_FINLINE npyv_f64 +#else +NPY_NOINLINE npyv_f64 +#endif +simd_@op@_scalar_f64(npyv_f64 out, npy_uint64 cmp_bits) +{ + // MSVC doesn't compile with direct vector access, so we copy it here + // as we have no npyv_get_lane/npyv_set_lane intrinsics + npy_double NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) out_copy[npyv_nlanes_f64]; + npyv_storea_f64(out_copy, out); + + for (unsigned i = 0; i < npyv_nlanes_f64; ++i) { + if (cmp_bits & (1 << i)) { + out_copy[i] = npy_@op@(out_copy[i]); + } + } + + return npyv_loada_f64(out_copy); +} +/**end repeat**/ + +/* + * Approximate sine algorithm for x \in [-pi/2, pi/2] + * worst-case error is 3.5 ulp. + * abs error: 0x1.be222a58p-53 in [-pi/2, pi/2]. 
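 *
 * The constants below form an odd polynomial evaluated in Horner form over
 * r2 = r*r; the evaluation order used by the code is (sketch):
 *
 *   y = poly1*r2 + poly2
 *   y = y*r2 + poly3, ..., y = y*r2 + poly7
 *   sin(r) ~= (y*r2)*r + r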
+ */ +NPY_FINLINE npyv_f64 +simd_approx_sine_poly_f64(npyv_f64 r) +{ + const npyv_f64 poly1 = npyv_setall_f64(-0x1.9f4a9c8b21dc9p-41); + const npyv_f64 poly2 = npyv_setall_f64(0x1.60e88a10163f2p-33); + const npyv_f64 poly3 = npyv_setall_f64(-0x1.ae6361b7254e7p-26); + const npyv_f64 poly4 = npyv_setall_f64(0x1.71de382e8d62bp-19); + const npyv_f64 poly5 = npyv_setall_f64(-0x1.a01a019aeb4ffp-13); + const npyv_f64 poly6 = npyv_setall_f64(0x1.111111110b25ep-7); + const npyv_f64 poly7 = npyv_setall_f64(-0x1.55555555554c3p-3); + + npyv_f64 r2 = npyv_mul_f64(r, r); + npyv_f64 y = npyv_muladd_f64(poly1, r2, poly2); + y = npyv_muladd_f64(y, r2, poly3); + y = npyv_muladd_f64(y, r2, poly4); + y = npyv_muladd_f64(y, r2, poly5); + y = npyv_muladd_f64(y, r2, poly6); + y = npyv_muladd_f64(y, r2, poly7); + y = npyv_muladd_f64(npyv_mul_f64(y, r2), r, r); + + return y; +} + +/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ +NPY_FINLINE npyv_f64 +simd_range_reduction_pi2(npyv_f64 r, npyv_f64 n) { + const npyv_f64 pi1 = npyv_setall_f64(-0x1.921fb54442d18p+1); + const npyv_f64 pi2 = npyv_setall_f64(-0x1.1a62633145c06p-53); + const npyv_f64 pi3 = npyv_setall_f64(-0x1.c1cd129024e09p-106); + + return simd_range_reduction_f64(r, n, pi1, pi2, pi3); +} + +NPY_FINLINE npyv_b64 simd_sin_range_check_f64(npyv_u64 ir) { + const npyv_u64 tiny_bound = npyv_setall_u64(0x202); /* top12 (asuint64 (0x1p-509)). */ + const npyv_u64 simd_thresh = npyv_setall_u64(0x214); /* top12 (asuint64 (RangeVal)) - SIMD_TINY_BOUND. */ + + return npyv_cmpge_u64(npyv_sub_u64(npyv_shri_u64(ir, 52), tiny_bound), simd_thresh); +} + +NPY_FINLINE npyv_b64 simd_cos_range_check_f64(npyv_u64 ir) { + const npyv_f64 range_val = npyv_setall_f64(0x1p23); + + return npyv_cmpge_u64(ir, npyv_reinterpret_u64_f64(range_val)); +} + +NPY_FINLINE npyv_f64 +simd_cos_poly_f64(npyv_f64 r, npyv_u64 ir, npyv_u64 sign) +{ + const npyv_f64 inv_pi = npyv_setall_f64(0x1.45f306dc9c883p-2); + const npyv_f64 half_pi = npyv_setall_f64(0x1.921fb54442d18p+0); + const npyv_f64 shift = npyv_setall_f64(0x1.8p52); + + /* n = rint((|x|+pi/2)/pi) - 0.5. */ + npyv_f64 n = npyv_muladd_f64(inv_pi, npyv_add_f64(r, half_pi), shift); + npyv_u64 odd = npyv_shli_u64(npyv_reinterpret_u64_f64(n), 63); + n = npyv_sub_f64(n, shift); + n = npyv_sub_f64(n, npyv_setall_f64(0.5)); + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ + r = simd_range_reduction_pi2(r, n); + + /* sin(r) poly approx. */ + npyv_f64 y = simd_approx_sine_poly_f64(r); + + /* sign. */ + return npyv_reinterpret_f64_u64(npyv_xor_u64(npyv_reinterpret_u64_f64(y), odd)); +} + +NPY_FINLINE npyv_f64 +simd_sin_poly_f64(npyv_f64 r, npyv_u64 ir, npyv_u64 sign) +{ + const npyv_f64 inv_pi = npyv_setall_f64(0x1.45f306dc9c883p-2); + const npyv_f64 shift = npyv_setall_f64(0x1.8p52); + + /* n = rint(|x|/pi). */ + npyv_f64 n = npyv_muladd_f64(inv_pi, r, shift); + npyv_u64 odd = npyv_shli_u64(npyv_reinterpret_u64_f64(n), 63); + n = npyv_sub_f64(n, shift); + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ + r = simd_range_reduction_pi2(r, n); + + /* sin(r) poly approx. */ + npyv_f64 y = simd_approx_sine_poly_f64(r); + + /* sign. 
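 * XOR with `sign` restores the sign of the original input (sin is odd),
 * and XOR with `odd` (the parity of n moved into bit 63) flips the result
 * for odd n, since sin(|x|) = sin(r + n*pi) = (-1)^n * sin(r).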
*/ + return npyv_reinterpret_f64_u64(npyv_xor_u64(npyv_xor_u64(npyv_reinterpret_u64_f64(y), sign), odd)); +} + +/**begin repeat + * #op = cos, sin# + */ +NPY_FINLINE void +simd_@op@_f64(const double *src, npy_intp ssrc, double *dst, npy_intp sdst, npy_intp len) +{ + const npyv_u64 abs_mask = npyv_setall_u64(0x7fffffffffffffff); + const int vstep = npyv_nlanes_f64; + + npyv_f64 out = npyv_zero_f64(); + npyv_f64 x_in; + + for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) { + if (ssrc == 1) { + x_in = npyv_load_tillz_f64(src, len); + } else { + x_in = npyv_loadn_tillz_f64(src, ssrc, len); + } + + npyv_u64 ir = npyv_and_u64(npyv_reinterpret_u64_f64(x_in), abs_mask); + npyv_f64 r = npyv_reinterpret_f64_u64(ir); + npyv_u64 sign = npyv_and_u64(npyv_reinterpret_u64_f64(x_in), npyv_not_u64(abs_mask)); + + npyv_b64 cmp = simd_@op@_range_check_f64(ir); + /* If fenv exceptions are to be triggered correctly, set any special lanes + to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by + scalar loop later. */ + r = npyv_select_f64(cmp, npyv_setall_f64(1.0), r); + + // Some in range, at least one calculation is useful + if (!npyv_all_b64(cmp)) { + out = simd_@op@_poly_f64(r, ir, sign); + } + + if (npyv_any_b64(cmp)) { + out = npyv_select_f64(cmp, x_in, out); + out = simd_@op@_scalar_f64(out, npyv_tobits_b64(cmp)); + } + + if (sdst == 1) { + npyv_store_till_f64(dst, len, out); + } else { + npyv_storen_till_f64(dst, sdst, len, out); + } + } + npyv_cleanup(); +} +/**end repeat**/ +#endif // NPY_SIMD_F64 + +#if NPY_SIMD_F32 /* * Approximate cosine algorithm for x \in [-PI/4, PI/4] * Maximum ULP across all 32-bit floats = 0.875 @@ -124,6 +305,11 @@ simd_sincos_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst, } else { x_in = npyv_loadn_tillz_f32(src, ssrc, len); } + npyv_b32 nnan_mask = npyv_notnan_f32(x_in); + #if NPY_SIMD_CMPSIGNAL + // Eliminate NaN to avoid FP invalid exception + x_in = npyv_and_f32(x_in, npyv_reinterpret_f32_u32(npyv_cvt_u32_b32(nnan_mask))); + #endif npyv_b32 simd_mask = npyv_cmple_f32(npyv_abs_f32(x_in), max_cody); npy_uint64 simd_maski = npyv_tobits_b32(simd_mask); /* @@ -132,7 +318,6 @@ simd_sincos_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst, * these numbers */ if (simd_maski != 0) { - npyv_b32 nnan_mask = npyv_notnan_f32(x_in); npyv_f32 x = npyv_select_f32(npyv_and_b32(nnan_mask, simd_mask), x_in, zerosf); npyv_f32 quadrant = npyv_mul_f32(x, two_over_pi); @@ -194,24 +379,58 @@ simd_sincos_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst, } npyv_cleanup(); } -#endif // NPY_SIMD_FMA3 +#endif // NPY_SIMD_FP32 +#endif // NYP_SIMD_FMA3 /**begin repeat * #func = cos, sin# - * #enum = SIMD_COMPUTE_COS, SIMD_COMPUTE_SIN# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_@func@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) +{ +#if NPY_SIMD_F64 && NPY_SIMD_FMA3 + const double *src = (double*)args[0]; + double *dst = (double*)args[1]; + const int lsize = sizeof(src[0]); + const npy_intp ssrc = steps[0] / lsize; + const npy_intp sdst = steps[1] / lsize; + npy_intp len = dimensions[0]; + assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0)); + + if (is_mem_overlap(src, steps[0], dst, steps[1], len) || + !npyv_loadable_stride_f64(ssrc) || !npyv_storable_stride_f64(sdst) + ) { + for (; len > 0; --len, src += ssrc, dst += sdst) { + simd_@func@_f64(src, 1, dst, 1, 1); + } + } else { + simd_@func@_f64(src, ssrc, dst, sdst, len); + } +#else + UNARY_LOOP { + 
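        /* Scalar fallback when the SIMD f64 path is not compiled in:
         * each element is handled by libm through npy_@func@. */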
const npy_double in1 = *(npy_double *)ip1; + *(npy_double *)op1 = npy_@func@(in1); + } +#endif +} +/**end repeat**/ + +/**begin repeat + * #func = sin, cos# + * #enum = SIMD_COMPUTE_SIN, SIMD_COMPUTE_COS# */ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_@func@) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) { - const float *src = (float*)args[0]; - float *dst = (float*)args[1]; +#if NPY_SIMD_F32 && NPY_SIMD_FMA3 + const npy_float *src = (npy_float*)args[0]; + npy_float *dst = (npy_float*)args[1]; const int lsize = sizeof(src[0]); const npy_intp ssrc = steps[0] / lsize; const npy_intp sdst = steps[1] / lsize; npy_intp len = dimensions[0]; assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0)); -#if NPY_SIMD_F32 && NPY_SIMD_FMA3 if (is_mem_overlap(src, steps[0], dst, steps[1], len) || !npyv_loadable_stride_f32(ssrc) || !npyv_storable_stride_f32(sdst) ) { @@ -222,9 +441,9 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_@func@) simd_sincos_f32(src, ssrc, dst, sdst, len, @enum@); } #else - for (; len > 0; --len, src += ssrc, dst += sdst) { - const float src0 = *src; - *dst = npy_@func@f(src0); + UNARY_LOOP { + const npy_float in1 = *(npy_float *)ip1; + *(npy_float *)op1 = npy_@func@f(in1); } #endif } diff --git a/numpy/core/src/umath/loops_umath_fp.dispatch.c.src b/numpy/core/src/umath/loops_umath_fp.dispatch.c.src index 46ce51824..89999e879 100644 --- a/numpy/core/src/umath/loops_umath_fp.dispatch.c.src +++ b/numpy/core/src/umath/loops_umath_fp.dispatch.c.src @@ -12,10 +12,12 @@ /**begin repeat * #sfx = f32, f64# * #func_suffix = f16, 8# + * #len = 32, 64# */ /**begin repeat1 * #func = exp2, log2, log10, expm1, log1p, cbrt, tan, asin, acos, atan, sinh, cosh, asinh, acosh, atanh# * #default_val = 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0# + * #fxnan = 0, 0, 0, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0# */ static void simd_@func@_@sfx@(const npyv_lanetype_@sfx@ *src, npy_intp ssrc, @@ -37,7 +39,15 @@ simd_@func@_@sfx@(const npyv_lanetype_@sfx@ *src, npy_intp ssrc, x = npyv_loadn_tillz_@sfx@(src, ssrc, len); } #endif + #if @fxnan@ == @len@ + // Workaround, invalid value encountered when x is set to nan + npyv_b@len@ nnan_mask = npyv_notnan_@sfx@(x); + npyv_@sfx@ x_exnan = npyv_select_@sfx@(nnan_mask, x, npyv_setall_@sfx@(@default_val@)); + npyv_@sfx@ out = __svml_@func@@func_suffix@(x_exnan); + out = npyv_select_@sfx@(nnan_mask, out, x); + #else npyv_@sfx@ out = __svml_@func@@func_suffix@(x); + #endif if (sdst == 1) { npyv_store_till_@sfx@(dst, len, out); } else { @@ -50,32 +60,6 @@ simd_@func@_@sfx@(const npyv_lanetype_@sfx@ *src, npy_intp ssrc, /**end repeat**/ /**begin repeat - * #func = sin, cos# - */ -static void -simd_@func@_f64(const double *src, npy_intp ssrc, - double *dst, npy_intp sdst, npy_intp len) -{ - const int vstep = npyv_nlanes_f64; - for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) { - npyv_f64 x; - if (ssrc == 1) { - x = npyv_load_tillz_f64(src, len); - } else { - x = npyv_loadn_tillz_f64(src, ssrc, len); - } - npyv_f64 out = __svml_@func@8(x); - if (sdst == 1) { - npyv_store_till_f64(dst, len, out); - } else { - npyv_storen_till_f64(dst, sdst, len, out); - } - } - npyv_cleanup(); -} -/**end repeat**/ - -/**begin repeat * #sfx = f32, f64# * #func_suffix = f16, 8# */ @@ -267,31 +251,3 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@func@) } /**end repeat1**/ /**end repeat**/ - -/**begin repeat - * #func = sin, cos# - */ -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_@func@) -(char **args, 
npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) -{ -#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML) - const double *src = (double*)args[0]; - double *dst = (double*)args[1]; - const int lsize = sizeof(src[0]); - const npy_intp ssrc = steps[0] / lsize; - const npy_intp sdst = steps[1] / lsize; - const npy_intp len = dimensions[0]; - assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0)); - if (!is_mem_overlap(src, steps[0], dst, steps[1], len) && - npyv_loadable_stride_f64(ssrc) && - npyv_storable_stride_f64(sdst)) { - simd_@func@_f64(src, ssrc, dst, sdst, len); - return; - } -#endif - UNARY_LOOP { - const npy_double in1 = *(npy_double *)ip1; - *(npy_double *)op1 = npy_@func@(in1); - } -} -/**end repeat**/ diff --git a/numpy/core/src/umath/loops_unary.dispatch.c.src b/numpy/core/src/umath/loops_unary.dispatch.c.src new file mode 100644 index 000000000..1e2a81d20 --- /dev/null +++ b/numpy/core/src/umath/loops_unary.dispatch.c.src @@ -0,0 +1,364 @@ +/*@targets + ** $maxopt baseline + ** neon asimd + ** sse2 avx2 avx512_skx + ** vsx2 + ** vx vxe + **/ + +#define _UMATHMODULE +#define _MULTIARRAYMODULE +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#include "numpy/npy_math.h" +#include "simd/simd.h" +#include "loops_utils.h" +#include "loops.h" +#include "lowlevel_strided_loops.h" +// Provides the various *_LOOP macros +#include "fast_loop_macros.h" + +/******************************************************************************* + ** Scalar ops + ******************************************************************************/ +#define scalar_negative(X) (-X) + +/******************************************************************************* + ** extra SIMD intrinsics + ******************************************************************************/ + +#if NPY_SIMD + +/**begin repeat + * #sfx = s8, u8, s16, u16, s32, u32, s64, u64# + * #ssfx = 8, 8, 16, 16, 32, 32, 64, 64# + */ +static NPY_INLINE npyv_@sfx@ +npyv_negative_@sfx@(npyv_@sfx@ v) +{ +#if defined(NPY_HAVE_NEON) && (defined(__aarch64__) || @ssfx@ < 64) + return npyv_reinterpret_@sfx@_s@ssfx@(vnegq_s@ssfx@(npyv_reinterpret_s@ssfx@_@sfx@(v))); +#else + // (x ^ -1) + 1 + const npyv_@sfx@ m1 = npyv_setall_@sfx@((npyv_lanetype_@sfx@)-1); + return npyv_sub_@sfx@(npyv_xor_@sfx@(v, m1), m1); +#endif +} +/**end repeat**/ + +/**begin repeat + * #sfx = f32, f64# + * #VCHK = NPY_SIMD_F32, NPY_SIMD_F64# + * #fd = f, # + */ +#if @VCHK@ +static NPY_INLINE npyv_@sfx@ +npyv_negative_@sfx@(npyv_@sfx@ v) +{ +#if defined(NPY_HAVE_NEON) + return vnegq_@sfx@(v); +#else + // (v ^ signmask) + const npyv_@sfx@ signmask = npyv_setall_@sfx@(-0.@fd@); + return npyv_xor_@sfx@(v, signmask); +#endif +} +#endif // @VCHK@ +/**end repeat**/ + +#endif // NPY_SIMD + +/******************************************************************************** + ** Defining the SIMD kernels + ********************************************************************************/ +/**begin repeat + * #sfx = s8, u8, s16, u16, s32, u32, s64, u64, f32, f64# + * #simd_chk = NPY_SIMD*8, NPY_SIMD_F32, NPY_SIMD_F64# + * #is_fp = 0*8, 1*2# + * #supports_ncontig = 0*4,1*6# + */ +/**begin repeat1 + * #kind = negative# + * #intrin = negative# + * #unroll = 4# + */ +#if @simd_chk@ +#if @unroll@ < 1 +#error "Unroll must be at least 1" +#elif NPY_SIMD != 128 && @unroll@ > 2 +// Avoid memory bandwidth bottleneck for larger SIMD +#define UNROLL 2 +#else +#define UNROLL @unroll@ +#endif +// contiguous inputs and output. 
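// The layout-specialized kernels below follow the naming _cc_/_cn_/_nc_/_nn_
// for contiguous (c) or strided (n) input/output, in that order. The generic
// (non-NEON) integer path above uses the two's-complement identity
// -x == (x ^ -1) + 1; a scalar sketch, for illustration only:
//   npy_int32 neg32(npy_int32 x)
//   { return (npy_int32)(((npy_uint32)x ^ (npy_uint32)-1) + 1u); }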
+static NPY_INLINE void +simd_unary_cc_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip, + npyv_lanetype_@sfx@ *op, + npy_intp len) +{ + const int vstep = npyv_nlanes_@sfx@; + const int wstep = vstep * UNROLL; + + // unrolled vector loop + for (; len >= wstep; len -= wstep, ip += wstep, op += wstep) { + /**begin repeat2 + * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15# + */ + #if UNROLL > @U@ + npyv_@sfx@ v_@U@ = npyv_load_@sfx@(ip + @U@ * vstep); + npyv_@sfx@ r_@U@ = npyv_@intrin@_@sfx@(v_@U@); + npyv_store_@sfx@(op + @U@ * vstep, r_@U@); + #endif + /**end repeat2**/ + } + // single vector loop + for (; len >= vstep; len -= vstep, ip += vstep, op +=vstep) { + npyv_@sfx@ v = npyv_load_@sfx@(ip); + npyv_@sfx@ r = npyv_@intrin@_@sfx@(v); + npyv_store_@sfx@(op, r); + } + // scalar finish up any remaining iterations + for (; len > 0; --len, ++ip, ++op) { + *op = scalar_@intrin@(*ip); + } +} + +#if @supports_ncontig@ +// contiguous input, non-contiguous output +static NPY_INLINE void +simd_unary_cn_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip, + npyv_lanetype_@sfx@ *op, npy_intp ostride, + npy_intp len) +{ + const int vstep = npyv_nlanes_@sfx@; + const int wstep = vstep * UNROLL; + + // unrolled vector loop + for (; len >= wstep; len -= wstep, ip += wstep, op += ostride*wstep) { + /**begin repeat2 + * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15# + */ + #if UNROLL > @U@ + npyv_@sfx@ v_@U@ = npyv_load_@sfx@(ip + @U@ * vstep); + npyv_@sfx@ r_@U@ = npyv_@intrin@_@sfx@(v_@U@); + npyv_storen_@sfx@(op + @U@ * vstep * ostride, ostride, r_@U@); + #endif + /**end repeat2**/ + } + // single vector loop + for (; len >= vstep; len -= vstep, ip += vstep, op += ostride*vstep) { + npyv_@sfx@ v = npyv_load_@sfx@(ip); + npyv_@sfx@ r = npyv_@intrin@_@sfx@(v); + npyv_storen_@sfx@(op, ostride, r); + } + // scalar finish up any remaining iterations + for (; len > 0; --len, ++ip, op += ostride) { + *op = scalar_@intrin@(*ip); + } +} +// non-contiguous input, contiguous output +static NPY_INLINE void +simd_unary_nc_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip, npy_intp istride, + npyv_lanetype_@sfx@ *op, + npy_intp len) +{ + const int vstep = npyv_nlanes_@sfx@; + const int wstep = vstep * UNROLL; + + // unrolled vector loop + for (; len >= wstep; len -= wstep, ip += istride*wstep, op += wstep) { + /**begin repeat2 + * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15# + */ + #if UNROLL > @U@ + npyv_@sfx@ v_@U@ = npyv_loadn_@sfx@(ip + @U@ * vstep * istride, istride); + npyv_@sfx@ r_@U@ = npyv_@intrin@_@sfx@(v_@U@); + npyv_store_@sfx@(op + @U@ * vstep, r_@U@); + #endif + /**end repeat2**/ + } + // single vector loop + for (; len >= vstep; len -= vstep, ip += istride*vstep, op += vstep) { + npyv_@sfx@ v = npyv_loadn_@sfx@(ip, istride); + npyv_@sfx@ r = npyv_@intrin@_@sfx@(v); + npyv_store_@sfx@(op, r); + } + // scalar finish up any remaining iterations + for (; len > 0; --len, ip += istride, ++op) { + *op = scalar_@intrin@(*ip); + } +} +// non-contiguous input and output +// limit unroll to 2x +#if UNROLL > 2 +#undef UNROLL +#define UNROLL 2 +#endif +static NPY_INLINE void +simd_unary_nn_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip, npy_intp istride, + npyv_lanetype_@sfx@ *op, npy_intp ostride, + npy_intp len) +{ + const int vstep = npyv_nlanes_@sfx@; + const int wstep = vstep * UNROLL; + + // unrolled vector loop + for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) { + /**begin repeat2 + * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15# + */ + #if 
UNROLL > @U@ + npyv_@sfx@ v_@U@ = npyv_loadn_@sfx@(ip + @U@ * vstep * istride, istride); + npyv_@sfx@ r_@U@ = npyv_@intrin@_@sfx@(v_@U@); + npyv_storen_@sfx@(op + @U@ * vstep * ostride, ostride, r_@U@); + #endif + /**end repeat2**/ + } + // single vector loop + for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) { + npyv_@sfx@ v = npyv_loadn_@sfx@(ip, istride); + npyv_@sfx@ r = npyv_@intrin@_@sfx@(v); + npyv_storen_@sfx@(op, ostride, r); + } + // scalar finish up any remaining iterations + for (; len > 0; --len, ip += istride, op += ostride) { + *op = scalar_@intrin@(*ip); + } +} +#endif // @supports_ncontig@ +#undef UNROLL +#endif // @simd_chk@ +/*end repeat1**/ +/**end repeat**/ + +/******************************************************************************** + ** Defining ufunc inner functions + ********************************************************************************/ +/**begin repeat + * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG, + * BYTE, SHORT, INT, LONG, LONGLONG, + * FLOAT, DOUBLE, LONGDOUBLE# + * + * #BTYPE = BYTE, SHORT, INT, LONG, LONGLONG, + * BYTE, SHORT, INT, LONG, LONGLONG, + * FLOAT, DOUBLE, LONGDOUBLE# + * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong, + * npy_byte, npy_short, npy_int, npy_long, npy_longlong, + * npy_float, npy_double, npy_longdouble# + * + * #is_fp = 0*10, 1*3# + * #is_unsigned = 1*5, 0*5, 0*3# + * #supports_ncontig = 0*2, 1*3, 0*2, 1*3, 1*3# + */ +#undef TO_SIMD_SFX +#if 0 +/**begin repeat1 + * #len = 8, 16, 32, 64# + */ +#elif NPY_SIMD && NPY_BITSOF_@BTYPE@ == @len@ + #if @is_fp@ + #define TO_SIMD_SFX(X) X##_f@len@ + #if NPY_BITSOF_@BTYPE@ == 32 && !NPY_SIMD_F32 + #undef TO_SIMD_SFX + #endif + #if NPY_BITSOF_@BTYPE@ == 64 && !NPY_SIMD_F64 + #undef TO_SIMD_SFX + #endif + #elif @is_unsigned@ + #define TO_SIMD_SFX(X) X##_u@len@ + #else + #define TO_SIMD_SFX(X) X##_s@len@ + #endif +/**end repeat1**/ +#endif + +/**begin repeat1 + * #kind = negative# + * #intrin = negative# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + char *ip = args[0], *op = args[1]; + npy_intp istep = steps[0], ostep = steps[1], + len = dimensions[0]; +#ifdef TO_SIMD_SFX + #undef STYPE + #define STYPE TO_SIMD_SFX(npyv_lanetype) + if (!is_mem_overlap(ip, istep, op, ostep, len)) { + if (IS_UNARY_CONT(@type@, @type@)) { + // no overlap and operands are contiguous + TO_SIMD_SFX(simd_unary_cc_@intrin@)( + (STYPE*)ip, (STYPE*)op, len + ); + goto clear; + } + #if @supports_ncontig@ + const npy_intp istride = istep / sizeof(STYPE); + const npy_intp ostride = ostep / sizeof(STYPE); + if (TO_SIMD_SFX(npyv_loadable_stride)(istride) && + TO_SIMD_SFX(npyv_storable_stride)(ostride)) + { + if (istride == 1 && ostride != 1) { + // contiguous input, non-contiguous output + TO_SIMD_SFX(simd_unary_cn_@intrin@)( + (STYPE*)ip, (STYPE*)op, ostride, len + ); + goto clear; + } + else if (istride != 1 && ostride == 1) { + // non-contiguous input, contiguous output + TO_SIMD_SFX(simd_unary_nc_@intrin@)( + (STYPE*)ip, istride, (STYPE*)op, len + ); + goto clear; + } + // SSE2 does better with unrolled scalar for heavy non-contiguous + #if !defined(NPY_HAVE_SSE2) + else if (istride != 1 && ostride != 1) { + // non-contiguous input and output + TO_SIMD_SFX(simd_unary_nn_@intrin@)( + (STYPE*)ip, istride, (STYPE*)op, ostride, len + ); + goto clear; + } + #endif + } + #endif // @supports_ncontig@ + } +#endif // TO_SIMD_SFX +#ifndef 
NPY_DISABLE_OPTIMIZATION + /* + * scalar unrolls + * 8x unroll performed best on + * - Apple M1 Native / arm64 + * - Apple M1 Rosetta / SSE42 + * - iMacPro / AVX512 + */ + #define UNROLL 8 + for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) { + /**begin repeat2 + * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15# + */ + #if UNROLL > @U@ + const @type@ in_@U@ = *((const @type@ *)(ip + @U@ * istep)); + *((@type@ *)(op + @U@ * ostep)) = scalar_@intrin@(in_@U@); + #endif + /**end repeat2**/ + } +#endif // NPY_DISABLE_OPTIMIZATION + for (; len > 0; --len, ip += istep, op += ostep) { + *((@type@ *)op) = scalar_@intrin@(*(const @type@ *)ip); + } +#ifdef TO_SIMD_SFX +clear: + npyv_cleanup(); +#endif +#if @is_fp@ + npy_clear_floatstatus_barrier((char*)dimensions); +#endif +} +/**end repeat**/ + +#undef NEGATIVE_CONTIG_ONLY diff --git a/numpy/core/src/umath/loops_unary_complex.dispatch.c.src b/numpy/core/src/umath/loops_unary_complex.dispatch.c.src new file mode 100644 index 000000000..052ad464c --- /dev/null +++ b/numpy/core/src/umath/loops_unary_complex.dispatch.c.src @@ -0,0 +1,139 @@ +/*@targets + ** $maxopt baseline + ** sse2 (avx2 fma3) avx512f + ** neon asimd + ** vsx2 vsx3 + ** vx vxe + **/ +#define _UMATHMODULE +#define _MULTIARRAYMODULE +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#include "simd/simd.h" +#include "loops_utils.h" +#include "loops.h" +#include "lowlevel_strided_loops.h" +// Provides the various *_LOOP macros +#include "fast_loop_macros.h" + +/**begin repeat + * #type = npy_float, npy_double# + * #sfx = f32, f64# + * #bsfx = b32, b64# + * #usfx = b32, u64# + * #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64# + * #is_double = 0, 1# + * #c = f, # + * #INF = NPY_INFINITYF, NPY_INFINITY# + * #NAN = NPY_NANF, NPY_NAN# + */ +#if @VECTOR@ +NPY_FINLINE npyv_@sfx@ +simd_cabsolute_@sfx@(npyv_@sfx@ re, npyv_@sfx@ im) +{ + const npyv_@sfx@ inf = npyv_setall_@sfx@(@INF@); + const npyv_@sfx@ nan = npyv_setall_@sfx@(@NAN@); + + re = npyv_abs_@sfx@(re); + im = npyv_abs_@sfx@(im); + /* + * If real or imag = INF, then convert it to inf + j*inf + * Handles: inf + j*nan, nan + j*inf + */ + npyv_@bsfx@ re_infmask = npyv_cmpeq_@sfx@(re, inf); + npyv_@bsfx@ im_infmask = npyv_cmpeq_@sfx@(im, inf); + im = npyv_select_@sfx@(re_infmask, inf, im); + re = npyv_select_@sfx@(im_infmask, inf, re); + /* + * If real or imag = NAN, then convert it to nan + j*nan + * Handles: x + j*nan, nan + j*x + */ + npyv_@bsfx@ re_nnanmask = npyv_notnan_@sfx@(re); + npyv_@bsfx@ im_nnanmask = npyv_notnan_@sfx@(im); + im = npyv_select_@sfx@(re_nnanmask, im, nan); + re = npyv_select_@sfx@(im_nnanmask, re, nan); + + npyv_@sfx@ larger = npyv_max_@sfx@(re, im); + npyv_@sfx@ smaller = npyv_min_@sfx@(im, re); + /* + * Calculate div_mask to prevent 0./0. 
and inf/inf operations in div + */ + npyv_@bsfx@ zeromask = npyv_cmpeq_@sfx@(larger, npyv_zero_@sfx@()); + npyv_@bsfx@ infmask = npyv_cmpeq_@sfx@(smaller, inf); + npyv_@bsfx@ div_mask = npyv_not_@bsfx@(npyv_or_@bsfx@(zeromask, infmask)); + + npyv_@sfx@ ratio = npyv_ifdivz_@sfx@(div_mask, smaller, larger); + npyv_@sfx@ hypot = npyv_sqrt_@sfx@( + npyv_muladd_@sfx@(ratio, ratio, npyv_setall_@sfx@(1.0@c@) + )); + return npyv_mul_@sfx@(hypot, larger); +} +#endif // VECTOR +/**end repeat**/ + +/******************************************************************************** + ** Defining ufunc inner functions + ********************************************************************************/ +/**begin repeat + * complex types + * #TYPE = CFLOAT, CDOUBLE# + * #ftype = npy_float, npy_double# + * #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64# + * #sfx = f32, f64# + * #c = f, # + * #C = F, # + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_absolute) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#if @VECTOR@ + npy_intp len = dimensions[0]; + npy_intp ssrc = steps[0] / sizeof(@ftype@); + npy_intp sdst = steps[1] / sizeof(@ftype@); + + if (!is_mem_overlap(args[0], steps[0], args[1], steps[1], len) && + npyv_loadable_stride_@sfx@(ssrc) && npyv_storable_stride_@sfx@(sdst) + && steps[0] % sizeof(@ftype@) == 0 + && steps[1] % sizeof(@ftype@) == 0 + ) { + const @ftype@ *src = (@ftype@*)args[0]; + @ftype@ *dst = (@ftype@*)args[1]; + + const int vstep = npyv_nlanes_@sfx@; + const int wstep = vstep * 2; + const int hstep = vstep / 2; + + if (ssrc == 2 && sdst == 1) { + for (; len >= vstep; len -= vstep, src += wstep, dst += vstep) { + npyv_@sfx@x2 ab = npyv_load_@sfx@x2(src); + npyv_@sfx@ r = simd_cabsolute_@sfx@(ab.val[0], ab.val[1]); + npyv_store_@sfx@(dst, r); + } + } + else { + for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) { + npyv_@sfx@ re_im0 = npyv_loadn2_@sfx@(src, ssrc); + npyv_@sfx@ re_im1 = npyv_loadn2_@sfx@(src + ssrc*hstep, ssrc); + npyv_@sfx@x2 ab = npyv_unzip_@sfx@(re_im0, re_im1); + npyv_@sfx@ r = simd_cabsolute_@sfx@(ab.val[0], ab.val[1]); + npyv_storen_@sfx@(dst, sdst, r); + } + } + for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) { + npyv_@sfx@ rl = npyv_loadn_tillz_@sfx@(src, ssrc, len); + npyv_@sfx@ im = npyv_loadn_tillz_@sfx@(src + 1, ssrc, len); + npyv_@sfx@ r = simd_cabsolute_@sfx@(rl, im); + npyv_storen_till_@sfx@(dst, sdst, len, r); + } + npyv_cleanup(); + npy_clear_floatstatus_barrier((char*)&len); + return; + } +#endif + UNARY_LOOP { + const @ftype@ re = ((@ftype@ *)ip1)[0]; + const @ftype@ im = ((@ftype@ *)ip1)[1]; + *((@ftype@ *)op1) = npy_hypot@c@(re, im); + } +} +/**end repeat**/ diff --git a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src index 0ac39a9b1..c4e7b8929 100644 --- a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src +++ b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src @@ -301,7 +301,7 @@ no_unroll: #endif // @VCHK@ for (; len > 0; --len, src += src_step, dst += dst_step) { #if @VCHK@ - // to guarantee the same precsion and fp/domain errors for both scalars and vectors + // to guarantee the same precision and fp/domain errors for both scalars and vectors simd_@TYPE@_@kind@_CONTIG_CONTIG(src, 0, dst, 0, 1); #else const npyv_lanetype_@sfx@ src0 = *(npyv_lanetype_@sfx@*)src; diff --git a/numpy/core/src/umath/loops_unary_fp_le.dispatch.c.src b/numpy/core/src/umath/loops_unary_fp_le.dispatch.c.src new file mode 100644 index 
000000000..ba133dc1e --- /dev/null +++ b/numpy/core/src/umath/loops_unary_fp_le.dispatch.c.src @@ -0,0 +1,565 @@ +/*@targets + ** $maxopt baseline + ** sse2 sse41 + ** vsx2 + ** neon asimd + **/ + +/** + * Force use SSE only on x86, even if AVX2 or AVX512F are enabled + * through the baseline, since scatter(AVX512F) and gather very costly + * to handle non-contiguous memory access comparing with SSE for + * such small operations that this file covers. + */ +#define NPY_SIMD_FORCE_128 +#define _UMATHMODULE +#define _MULTIARRAYMODULE +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#include <float.h> +#include "numpy/npy_math.h" +#include "simd/simd.h" +#include "loops_utils.h" +#include "loops.h" +#include "lowlevel_strided_loops.h" +// Provides the various *_LOOP macros +#include "fast_loop_macros.h" + +/** + * This code should really be merged into loops_unary_fp.dispatch.c.src + * However there is an issue with enabling the code here for VX and VXE + * as the shifts don't behave as expected. + * See the code below that references NPY__CPU_TARGET_VX and + * NPY_BIG_ENDIAN. Suspect that this is a big endian vector issue. + * + * Splitting the files out allows us to keep loops_unary_fp.dispatch.c.src + * building for VX and VXE so we don't regress performance while adding this + * code for other platforms. + */ +// TODO(@seiko2plus): add support for big-endian +#if NPY_SIMD_BIGENDIAN + #undef NPY_SIMD + #undef NPY_SIMD_F32 + #undef NPY_SIMD_F64 + #define NPY_SIMD 0 + #define NPY_SIMD_F32 0 + #define NPY_SIMD_F64 0 +#endif + +/******************************************************************************* + ** extra SIMD intrinsics + ******************************************************************************/ + +#if NPY_SIMD + +/** + * We define intrinsics for isnan, isinf, isfinite, and signbit below. There's + * a few flavors of each. We'll use f32 as an example although f64 versions + * are also defined. + * + * npyv_u32 npyv_KIND_f32(npyv_f32 v) + * These are mainly used for the single vector loops. As such, result should + * be bool true / false, ready to write back. + * + * npyv_b32 _npyv_KIND_f32(npyv_f32 v) + * These are used by the geneal intrinsics above as well as the multi-vector + * packing intrinsics. The multi-vector packing intrinsics are the ones + * utilized in the unrolled vector loops. Results should be vector masks + * of 0x00/0xff. + * + * npyv_u8 npyv_pack_KIND_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3) + * These are the multi-vector packing intrinsics utilized by unrolled vector + * loops. They perform the operation on all input vectors and pack the + * results to a single npyv_u8. Assuming NPY_SIMD == 128, that means we + * can pack results from 4x npyv_f32 or 8x npyv_64 in a single npyv_u8. + * Result should be bool true / false, ready to write back. 
+ */ + +#if NPY_SIMD_F32 +NPY_FINLINE npyv_u32 +npyv_isnan_f32(npyv_f32 v) +{ + const npyv_u8 truemask = npyv_reinterpret_u8_u32(npyv_setall_u32(1==1)); + npyv_u8 notnan = npyv_reinterpret_u8_u32(npyv_cvt_u32_b32(npyv_notnan_f32(v))); + return npyv_reinterpret_u32_u8(npyv_andc_u8(truemask, notnan)); +} +NPY_FINLINE npyv_u8 +npyv_pack_isnan_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3) +{ + const npyv_u8 truemask = npyv_setall_u8(1==1); + npyv_b32 b0 = npyv_notnan_f32(v0); + npyv_b32 b1 = npyv_notnan_f32(v1); + npyv_b32 b2 = npyv_notnan_f32(v2); + npyv_b32 b3 = npyv_notnan_f32(v3); + npyv_b8 notnan = npyv_pack_b8_b32(b0, b1, b2, b3); + return npyv_andc_u8(truemask, npyv_cvt_u8_b8(notnan)); +} +#endif +#if NPY_SIMD_F64 +NPY_FINLINE npyv_u64 +npyv_isnan_f64(npyv_f64 v) +{ + const npyv_u8 truemask = npyv_reinterpret_u8_u64(npyv_setall_u64(1==1)); + npyv_u8 notnan = npyv_reinterpret_u8_u64(npyv_cvt_u64_b64(npyv_notnan_f64(v))); + return npyv_reinterpret_u64_u8(npyv_andc_u8(truemask, notnan)); +} +NPY_FINLINE npyv_u8 +npyv_pack_isnan_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3, + npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7) +{ + const npyv_u8 truemask = npyv_setall_u8(1==1); + npyv_b64 b0 = npyv_notnan_f64(v0); + npyv_b64 b1 = npyv_notnan_f64(v1); + npyv_b64 b2 = npyv_notnan_f64(v2); + npyv_b64 b3 = npyv_notnan_f64(v3); + npyv_b64 b4 = npyv_notnan_f64(v4); + npyv_b64 b5 = npyv_notnan_f64(v5); + npyv_b64 b6 = npyv_notnan_f64(v6); + npyv_b64 b7 = npyv_notnan_f64(v7); + npyv_b8 notnan = npyv_pack_b8_b64(b0, b1, b2, b3, b4, b5, b6, b7); + return npyv_andc_u8(truemask, npyv_cvt_u8_b8(notnan)); +} +#endif + +#if NPY_SIMD_F32 +NPY_FINLINE npyv_b32 +_npyv_isinf_f32(npyv_f32 v) +{ +#if defined(NPY_HAVE_NEON) + // abs(v) > FLT_MAX + const npyv_f32 fltmax = npyv_setall_f32(FLT_MAX); + return vcagtq_f32(v, fltmax); +#else + // cast out the sign and check if all exponent bits are set. + const npyv_u32 exp_mask = npyv_setall_u32(0xff000000); + npyv_u32 bits = npyv_shli_u32(npyv_reinterpret_u32_f32(v), 1); + return npyv_cmpeq_u32(bits, exp_mask); +#endif +} +NPY_FINLINE npyv_u32 +npyv_isinf_f32(npyv_f32 v) +{ + const npyv_u32 truemask = npyv_setall_u32(1==1); + return npyv_and_u32(truemask, npyv_cvt_u32_b32(_npyv_isinf_f32(v))); +} +NPY_FINLINE npyv_u8 +npyv_pack_isinf_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3) +{ + const npyv_u8 truemask = npyv_setall_u8(1==1); + npyv_b32 b0 = _npyv_isinf_f32(v0); + npyv_b32 b1 = _npyv_isinf_f32(v1); + npyv_b32 b2 = _npyv_isinf_f32(v2); + npyv_b32 b3 = _npyv_isinf_f32(v3); + npyv_b8 isinf = npyv_pack_b8_b32(b0, b1, b2, b3); + return npyv_and_u8(truemask, npyv_cvt_u8_b8(isinf)); +} +#endif // NPY_SIMD_F32 +#if NPY_SIMD_F64 +NPY_FINLINE npyv_b64 +_npyv_isinf_f64(npyv_f64 v) +{ +#if defined(NPY_HAVE_NEON) + // abs(v) > DBL_MAX + const npyv_f64 fltmax = npyv_setall_f64(DBL_MAX); + return vcagtq_f64(v, fltmax); +#else + // cast out the sign and check if all exponent bits are set. 
+ const npyv_u64 exp_mask = npyv_setall_u64(0xffe0000000000000); + npyv_u64 bits = npyv_shli_u64(npyv_reinterpret_u64_f64(v), 1); + return npyv_cmpeq_u64(bits, exp_mask); +#endif +} +NPY_FINLINE npyv_u64 +npyv_isinf_f64(npyv_f64 v) +{ + const npyv_u64 truemask = npyv_setall_u64(1==1); + return npyv_and_u64(truemask, npyv_cvt_u64_b64(_npyv_isinf_f64(v))); +} +NPY_FINLINE npyv_u8 +npyv_pack_isinf_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3, + npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7) +{ + const npyv_u8 truemask = npyv_setall_u8(1==1); + npyv_b64 b0 = _npyv_isinf_f64(v0); + npyv_b64 b1 = _npyv_isinf_f64(v1); + npyv_b64 b2 = _npyv_isinf_f64(v2); + npyv_b64 b3 = _npyv_isinf_f64(v3); + npyv_b64 b4 = _npyv_isinf_f64(v4); + npyv_b64 b5 = _npyv_isinf_f64(v5); + npyv_b64 b6 = _npyv_isinf_f64(v6); + npyv_b64 b7 = _npyv_isinf_f64(v7); + npyv_b8 isinf = npyv_pack_b8_b64(b0, b1, b2, b3, b4, b5, b6, b7); + return npyv_and_u8(truemask, npyv_cvt_u8_b8(isinf)); +} +#endif // NPY_SIMD_F64 + + +#if NPY_SIMD_F32 +NPY_FINLINE npyv_b32 +npyv_notfinite_f32(npyv_f32 v) +{ + // cast out the sign and check if all exponent bits are set + // no matter the mentissa is. + const npyv_u32 exp_mask = npyv_setall_u32(0x7f800000); + npyv_u32 bits = npyv_reinterpret_u32_f32(v); + bits = npyv_and_u32(bits, exp_mask); + return npyv_cmpeq_u32(bits, exp_mask); +} +NPY_FINLINE npyv_u32 +npyv_isfinite_f32(npyv_f32 v) +{ + const npyv_u8 truemask = npyv_reinterpret_u8_u32(npyv_setall_u32(1==1)); + npyv_u8 notfinite = npyv_reinterpret_u8_u32(npyv_cvt_u32_b32(npyv_notfinite_f32(v))); + return npyv_reinterpret_u32_u8(npyv_andc_u8(truemask, notfinite)); +} +NPY_FINLINE npyv_u8 +npyv_pack_isfinite_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3) +{ +#if defined(NPY_HAVE_NEON) && defined(__aarch64__) + // F32 exponent is 8-bits, which means we can pack multiple into + // a single vector. We shift out sign bit so that we're left + // with only exponent in high byte. If not all bits are set, + // then we've got a finite number. + uint8x16x4_t tbl; + tbl.val[0] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v0), 1)); + tbl.val[1] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v1), 1)); + tbl.val[2] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v2), 1)); + tbl.val[3] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v3), 1)); + + const npyv_u8 permute = {3,7,11,15, 19,23,27,31, 35,39,43,47, 51,55,59,63}; + npyv_u8 r = vqtbl4q_u8(tbl, permute); + + const npyv_u8 expmask = npyv_setall_u8(0xff); + r = npyv_cmpneq_u8(r, expmask); + r = vshrq_n_u8(r, 7); + return r; +#else + const npyv_u8 truemask = npyv_setall_u8(1==1); + npyv_b32 b0 = npyv_notfinite_f32(v0); + npyv_b32 b1 = npyv_notfinite_f32(v1); + npyv_b32 b2 = npyv_notfinite_f32(v2); + npyv_b32 b3 = npyv_notfinite_f32(v3); + npyv_b8 notfinite = npyv_pack_b8_b32(b0, b1, b2, b3); + return npyv_andc_u8(truemask, npyv_cvt_u8_b8(notfinite)); +#endif +} +#endif // NPY_SIMD_F32 +#if NPY_SIMD_F64 +NPY_FINLINE npyv_b64 +npyv_notfinite_f64(npyv_f64 v) +{ + // cast out the sign and check if all exponent bits are set + // no matter the mantissa is. 
+ const npyv_u64 exp_mask = npyv_setall_u64(0x7ff0000000000000); + npyv_u64 bits = npyv_reinterpret_u64_f64(v); + bits = npyv_and_u64(bits, exp_mask); + return npyv_cmpeq_u64(bits, exp_mask); +} +NPY_FINLINE npyv_u64 +npyv_isfinite_f64(npyv_f64 v) +{ + const npyv_u8 truemask = npyv_reinterpret_u8_u64(npyv_setall_u64(1==1)); + npyv_u8 notfinite = npyv_reinterpret_u8_u64(npyv_cvt_u64_b64(npyv_notfinite_f64(v))); + return npyv_reinterpret_u64_u8(npyv_andc_u8(truemask, notfinite)); +} +NPY_FINLINE npyv_u8 +npyv_pack_isfinite_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3, + npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7) +{ +#if defined(NPY_HAVE_NEON) && defined(__aarch64__) + // F64 exponent is 11-bits, which means we can pack multiple into + // a single vector. We'll need to use u16 to fit all exponent + // bits. If not all bits are set, then we've got a finite number. + uint8x16x4_t t0123, t4567; + t0123.val[0] = npyv_reinterpret_u8_f64(v0); + t0123.val[1] = npyv_reinterpret_u8_f64(v1); + t0123.val[2] = npyv_reinterpret_u8_f64(v2); + t0123.val[3] = npyv_reinterpret_u8_f64(v3); + t4567.val[0] = npyv_reinterpret_u8_f64(v4); + t4567.val[1] = npyv_reinterpret_u8_f64(v5); + t4567.val[2] = npyv_reinterpret_u8_f64(v6); + t4567.val[3] = npyv_reinterpret_u8_f64(v7); + + const npyv_u8 permute = {6,7,14,15, 22,23,30,31, 38,39,46,47, 54,55,62,63}; + npyv_u16 r0 = npyv_reinterpret_u16_u8(vqtbl4q_u8(t0123, permute)); + npyv_u16 r1 = npyv_reinterpret_u16_u8(vqtbl4q_u8(t4567, permute)); + + const npyv_u16 expmask = npyv_setall_u16(0x7ff0); + r0 = npyv_and_u16(r0, expmask); + r0 = npyv_cmpneq_u16(r0, expmask); + r0 = npyv_shri_u16(r0, 15); + r1 = npyv_and_u16(r1, expmask); + r1 = npyv_cmpneq_u16(r1, expmask); + r1 = npyv_shri_u16(r1, 15); + + npyv_u8 r = npyv_pack_b8_b16(r0, r1); + return r; +#else + const npyv_u8 truemask = npyv_setall_u8(1==1); + npyv_b64 b0 = npyv_notfinite_f64(v0); + npyv_b64 b1 = npyv_notfinite_f64(v1); + npyv_b64 b2 = npyv_notfinite_f64(v2); + npyv_b64 b3 = npyv_notfinite_f64(v3); + npyv_b64 b4 = npyv_notfinite_f64(v4); + npyv_b64 b5 = npyv_notfinite_f64(v5); + npyv_b64 b6 = npyv_notfinite_f64(v6); + npyv_b64 b7 = npyv_notfinite_f64(v7); + npyv_b8 notfinite = npyv_pack_b8_b64(b0, b1, b2, b3, b4, b5, b6, b7); + return npyv_andc_u8(truemask, npyv_cvt_u8_b8(notfinite)); +#endif +} +#endif // NPY_SIMD_F64 + +#if NPY_SIMD_F32 +NPY_FINLINE npyv_u32 +npyv_signbit_f32(npyv_f32 v) +{ + return npyv_shri_u32(npyv_reinterpret_u32_f32(v), (sizeof(npyv_lanetype_f32)*8)-1); +} +NPY_FINLINE npyv_u8 +npyv_pack_signbit_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3) +{ +#if defined(NPY_HAVE_NEON) && defined(__aarch64__) + // We only need high byte for signbit, which means we can pack + // multiple inputs into a single vector. 
+ uint8x16x4_t tbl; + tbl.val[0] = npyv_reinterpret_u8_f32(v0); + tbl.val[1] = npyv_reinterpret_u8_f32(v1); + tbl.val[2] = npyv_reinterpret_u8_f32(v2); + tbl.val[3] = npyv_reinterpret_u8_f32(v3); + + const npyv_u8 permute = {3,7,11,15, 19,23,27,31, 35,39,43,47, 51,55,59,63}; + npyv_u8 r = vqtbl4q_u8(tbl, permute); + r = vshrq_n_u8(r, 7); + return r; +#else + npyv_b32 b0 = npyv_cvt_b32_u32(npyv_signbit_f32(v0)); + npyv_b32 b1 = npyv_cvt_b32_u32(npyv_signbit_f32(v1)); + npyv_b32 b2 = npyv_cvt_b32_u32(npyv_signbit_f32(v2)); + npyv_b32 b3 = npyv_cvt_b32_u32(npyv_signbit_f32(v3)); + npyv_b8 signbit = npyv_pack_b8_b32(b0, b1, b2, b3); + return npyv_cvt_u8_b8(signbit); +#endif +} +#endif // NPY_SIMD_F32 +#if NPY_SIMD_F64 +NPY_FINLINE npyv_u64 +npyv_signbit_f64(npyv_f64 v) +{ + return npyv_shri_u64(npyv_reinterpret_u64_f64(v), (sizeof(npyv_lanetype_f64)*8)-1); +} +NPY_FINLINE npyv_u8 +npyv_pack_signbit_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3, + npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7) +{ +#if defined(NPY_HAVE_NEON) && defined(__aarch64__) + // We only need high byte for signbit, which means we can pack + // multiple inputs into a single vector. + + // vuzp2 faster than vtbl for f64 + npyv_u32 v01 = vuzp2q_u32(npyv_reinterpret_u32_f64(v0), npyv_reinterpret_u32_f64(v1)); + npyv_u32 v23 = vuzp2q_u32(npyv_reinterpret_u32_f64(v2), npyv_reinterpret_u32_f64(v3)); + npyv_u32 v45 = vuzp2q_u32(npyv_reinterpret_u32_f64(v4), npyv_reinterpret_u32_f64(v5)); + npyv_u32 v67 = vuzp2q_u32(npyv_reinterpret_u32_f64(v6), npyv_reinterpret_u32_f64(v7)); + + npyv_u16 v0123 = vuzp2q_u16(npyv_reinterpret_u16_u32(v01), npyv_reinterpret_u16_u32(v23)); + npyv_u16 v4567 = vuzp2q_u16(npyv_reinterpret_u16_u32(v45), npyv_reinterpret_u16_u32(v67)); + + npyv_u8 r = vuzp2q_u8(npyv_reinterpret_u8_u16(v0123), npyv_reinterpret_u8_u16(v4567)); + r = vshrq_n_u8(r, 7); + return r; +#else + npyv_b64 b0 = npyv_cvt_b64_u64(npyv_signbit_f64(v0)); + npyv_b64 b1 = npyv_cvt_b64_u64(npyv_signbit_f64(v1)); + npyv_b64 b2 = npyv_cvt_b64_u64(npyv_signbit_f64(v2)); + npyv_b64 b3 = npyv_cvt_b64_u64(npyv_signbit_f64(v3)); + npyv_b64 b4 = npyv_cvt_b64_u64(npyv_signbit_f64(v4)); + npyv_b64 b5 = npyv_cvt_b64_u64(npyv_signbit_f64(v5)); + npyv_b64 b6 = npyv_cvt_b64_u64(npyv_signbit_f64(v6)); + npyv_b64 b7 = npyv_cvt_b64_u64(npyv_signbit_f64(v7)); + npyv_b8 signbit = npyv_pack_b8_b64(b0, b1, b2, b3, b4, b5, b6, b7); + return npyv_cvt_u8_b8(signbit); +#endif +} +#endif // NPY_SIMD_F64 + +#endif // NPY_SIMD + +/******************************************************************************** + ** Defining the SIMD kernels + ********************************************************************************/ +/** Notes: + * - avoid the use of libmath to unify fp/domain errors + * for both scalars and vectors among all compilers/architectures. + * - use intrinsic npyv_load_till_* instead of npyv_load_tillz_ + * to fill the remind lanes with 1.0 to avoid divide by zero fp + * exception in reciprocal. 
+ */ +#define CONTIG 0 +#define NCONTIG 1 + +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + * #sfx = f32, f64# + * #VCHK = NPY_SIMD_F32, NPY_SIMD_F64# + * #ssfx = 32, 64# + */ +#if @VCHK@ +/**begin repeat1 + * #kind = isnan, isinf, isfinite, signbit# + */ +/**begin repeat2 + * #STYPE = CONTIG, NCONTIG, CONTIG, NCONTIG# + * #DTYPE = CONTIG, CONTIG, NCONTIG, NCONTIG# + */ +static void simd_unary_@kind@_@TYPE@_@STYPE@_@DTYPE@ +(const void *src, npy_intp istride, void *dst, npy_intp ostride, npy_intp len) +{ + const npyv_lanetype_@sfx@ *ip = src; + npy_bool *op = dst; + + // How many vectors can be packed into a u8 / bool vector? + #define PACK_FACTOR (NPY_SIMD_WIDTH / npyv_nlanes_@sfx@) + assert(PACK_FACTOR == 4 || PACK_FACTOR == 8); + + const int vstep = npyv_nlanes_@sfx@; + const int wstep = vstep * PACK_FACTOR; + + // unrolled iterations + for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) { + // Load vectors + #if @STYPE@ == CONTIG + // contiguous input + npyv_@sfx@ v0 = npyv_load_@sfx@(ip + vstep * 0); + npyv_@sfx@ v1 = npyv_load_@sfx@(ip + vstep * 1); + npyv_@sfx@ v2 = npyv_load_@sfx@(ip + vstep * 2); + npyv_@sfx@ v3 = npyv_load_@sfx@(ip + vstep * 3); + #if PACK_FACTOR == 8 + npyv_@sfx@ v4 = npyv_load_@sfx@(ip + vstep * 4); + npyv_@sfx@ v5 = npyv_load_@sfx@(ip + vstep * 5); + npyv_@sfx@ v6 = npyv_load_@sfx@(ip + vstep * 6); + npyv_@sfx@ v7 = npyv_load_@sfx@(ip + vstep * 7); + #endif + #else + // non-contiguous input + npyv_@sfx@ v0 = npyv_loadn_@sfx@(ip + istride * vstep * 0, istride); + npyv_@sfx@ v1 = npyv_loadn_@sfx@(ip + istride * vstep * 1, istride); + npyv_@sfx@ v2 = npyv_loadn_@sfx@(ip + istride * vstep * 2, istride); + npyv_@sfx@ v3 = npyv_loadn_@sfx@(ip + istride * vstep * 3, istride); + #if PACK_FACTOR == 8 + npyv_@sfx@ v4 = npyv_loadn_@sfx@(ip + istride * vstep * 4, istride); + npyv_@sfx@ v5 = npyv_loadn_@sfx@(ip + istride * vstep * 5, istride); + npyv_@sfx@ v6 = npyv_loadn_@sfx@(ip + istride * vstep * 6, istride); + npyv_@sfx@ v7 = npyv_loadn_@sfx@(ip + istride * vstep * 7, istride); + #endif + #endif + + #if PACK_FACTOR == 4 + npyv_u8 r = npyv_pack_@kind@_@sfx@(v0, v1, v2, v3); + #elif PACK_FACTOR == 8 + npyv_u8 r = npyv_pack_@kind@_@sfx@(v0, v1, v2, v3, v4, v5, v6, v7); + #endif + + #if @DTYPE@ == CONTIG + npyv_store_u8(op, r); + #else // @DTYPE@ == CONTIG + // Results are packed, so we can just loop over them + npy_uint8 lane[npyv_nlanes_u8]; + npyv_store_u8(lane, r); + for (int ln=0; (ln * sizeof(npyv_lanetype_@sfx@)) < npyv_nlanes_u8; ++ln){ + op[ln * ostride] = lane[ln * sizeof(npyv_lanetype_@sfx@)]; + } + #endif // @DTYPE@ == CONTIG + } + + // vector-sized iterations + for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) { + #if @STYPE@ == CONTIG + npyv_@sfx@ v = npyv_load_@sfx@(ip); + #else + npyv_@sfx@ v = npyv_loadn_@sfx@(ip, istride); + #endif + + npyv_u@ssfx@ r = npyv_@kind@_@sfx@(v); + + npy_uint8 lane[npyv_nlanes_u8]; + npyv_store_u8(lane, npyv_reinterpret_u8_u@ssfx@(r)); + + op[0 * ostride] = lane[0 * sizeof(npyv_lanetype_@sfx@)]; + op[1 * ostride] = lane[1 * sizeof(npyv_lanetype_@sfx@)]; + #if npyv_nlanes_@sfx@ == 4 + op[2 * ostride] = lane[2 * sizeof(npyv_lanetype_@sfx@)]; + op[3 * ostride] = lane[3 * sizeof(npyv_lanetype_@sfx@)]; + #endif + } + + #undef PACK_FACTOR + + // Scalar loop to finish off + for (; len > 0; --len, ip += istride, op += ostride) { + *op = (npy_@kind@(*ip) != 0); + } + + npyv_cleanup(); +} +/**end repeat2**/ +/**end repeat1**/ + +#endif // @VCHK@ +/**end repeat**/ + 
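[Reference sketch, not part of the patch] The pack_* kernels above classify floats purely from their IEEE-754 bit patterns: an all-ones exponent field means inf (zero mantissa) or nan, shifting out the sign and comparing against 0xff000000 detects inf, and the top bit is the sign. A minimal standalone scalar equivalent in plain C is shown below; the function names and the test program are illustrative only, and note the vector code uses npyv_notnan/andc for NaN rather than an explicit bit compare.

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

/* Reinterpret a float's bits, as the SIMD kernels do with npyv_reinterpret_u32_f32. */
static uint32_t f32_bits(float x) {
    uint32_t u;
    memcpy(&u, &x, sizeof(u));
    return u;
}

/* Exponent field is bits 23..30; all-ones exponent means inf (mantissa 0) or nan. */
static int f32_isinf_bits(float x)    { return (f32_bits(x) << 1) == 0xff000000u; }
static int f32_isnan_bits(float x)    { return (f32_bits(x) << 1) >  0xff000000u; }
static int f32_isfinite_bits(float x) { return (f32_bits(x) & 0x7f800000u) != 0x7f800000u; }
static int f32_signbit_bits(float x)  { return (int)(f32_bits(x) >> 31); }

int main(void) {
    float vals[] = {1.0f, -0.0f, INFINITY, -INFINITY, NAN};
    for (size_t i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
        printf("%g: isnan=%d isinf=%d isfinite=%d signbit=%d\n", (double)vals[i],
               f32_isnan_bits(vals[i]), f32_isinf_bits(vals[i]),
               f32_isfinite_bits(vals[i]), f32_signbit_bits(vals[i]));
    }
    return 0;
}

The u64 kernels follow the same pattern with exponent mask 0x7ff0000000000000 and a shift by 63 for the sign bit.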
+/******************************************************************************** + ** Defining ufunc inner functions + ********************************************************************************/ +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + * #sfx = f32, f64# + * #VCHK = NPY_SIMD_F32, NPY_SIMD_F64# + */ + +/**begin repeat1 + * #kind = isnan, isinf, isfinite, signbit# + **/ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#if @VCHK@ + const char *ip = args[0]; + char *op = args[1]; + const npy_intp istep = steps[0]; + const npy_intp ostep = steps[1]; + npy_intp len = dimensions[0]; + const int ilsize = sizeof(npyv_lanetype_@sfx@); + const int olsize = sizeof(npy_bool); + const npy_intp istride = istep / ilsize; + const npy_intp ostride = ostep / olsize; + assert(len <= 1 || ostep % olsize == 0); + + if ((istep % ilsize == 0) && + !is_mem_overlap(ip, istep, op, ostep, len) && + npyv_loadable_stride_@sfx@(istride) && + npyv_storable_stride_@sfx@(ostride)) + { + if (istride == 1 && ostride == 1) { + simd_unary_@kind@_@TYPE@_CONTIG_CONTIG(ip, 1, op, 1, len); + } + else if (ostride == 1) { + simd_unary_@kind@_@TYPE@_NCONTIG_CONTIG(ip, istride, op, 1, len); + } + else if (istride == 1) { + simd_unary_@kind@_@TYPE@_CONTIG_NCONTIG(ip, 1, op, ostride, len); + } else { + simd_unary_@kind@_@TYPE@_NCONTIG_NCONTIG(ip, istride, op, ostride, len); + } + } else +#endif // @VCHK@ + { + UNARY_LOOP { + const npyv_lanetype_@sfx@ in = *(npyv_lanetype_@sfx@ *)ip1; + *((npy_bool *)op1) = (npy_@kind@(in) != 0); + } + } + + npy_clear_floatstatus_barrier((char*)dimensions); +} +/**end repeat1**/ +/**end repeat**/ diff --git a/numpy/core/src/umath/loops_utils.h.src b/numpy/core/src/umath/loops_utils.h.src index df92bc315..5640a1f0b 100644 --- a/numpy/core/src/umath/loops_utils.h.src +++ b/numpy/core/src/umath/loops_utils.h.src @@ -74,7 +74,7 @@ is_mem_overlap(const void *src, npy_intp src_step, const void *dst, npy_intp dst * The recursion depth is O(lg n) as well. * when updating also update similar complex floats summation */ -static NPY_INLINE @type@ +static inline @type@ @TYPE@_pairwise_sum(char *a, npy_intp n, npy_intp stride) { if (n < 8) { @@ -152,7 +152,7 @@ static NPY_INLINE @type@ * #SIMD = 1, 1, 0# */ /* similar to pairwise sum of real floats */ -static NPY_INLINE void +static inline void @TYPE@_pairwise_sum(@ftype@ *rr, @ftype@ * ri, char * a, npy_intp n, npy_intp stride) { diff --git a/numpy/core/src/umath/matmul.c.src b/numpy/core/src/umath/matmul.c.src index 4dd0c4759..127bb5e27 100644 --- a/numpy/core/src/umath/matmul.c.src +++ b/numpy/core/src/umath/matmul.c.src @@ -45,7 +45,7 @@ * 3. The slower (first) axis stride, in unit steps, must be larger than * the faster axis dimension */ -static NPY_INLINE npy_bool +static inline npy_bool is_blasable2d(npy_intp byte_stride1, npy_intp byte_stride2, npy_intp d1, npy_intp d2, npy_intp itemsize) { diff --git a/numpy/core/src/umath/override.c b/numpy/core/src/umath/override.c index d247c2639..167164163 100644 --- a/numpy/core/src/umath/override.c +++ b/numpy/core/src/umath/override.c @@ -23,18 +23,19 @@ * Returns -1 on failure. 
*/ static int -get_array_ufunc_overrides(PyObject *in_args, PyObject *out_args, +get_array_ufunc_overrides(PyObject *in_args, PyObject *out_args, PyObject *wheremask_obj, PyObject **with_override, PyObject **methods) { int i; int num_override_args = 0; - int narg, nout; + int narg, nout, nwhere; narg = (int)PyTuple_GET_SIZE(in_args); /* It is valid for out_args to be NULL: */ nout = (out_args != NULL) ? (int)PyTuple_GET_SIZE(out_args) : 0; + nwhere = (wheremask_obj != NULL) ? 1: 0; - for (i = 0; i < narg + nout; ++i) { + for (i = 0; i < narg + nout + nwhere; ++i) { PyObject *obj; int j; int new_class = 1; @@ -42,9 +43,12 @@ get_array_ufunc_overrides(PyObject *in_args, PyObject *out_args, if (i < narg) { obj = PyTuple_GET_ITEM(in_args, i); } - else { + else if (i < narg + nout){ obj = PyTuple_GET_ITEM(out_args, i - narg); } + else { + obj = wheremask_obj; + } /* * Have we seen this class before? If so, ignore. */ @@ -208,7 +212,7 @@ copy_positional_args_to_kwargs(const char **keywords, */ NPY_NO_EXPORT int PyUFunc_CheckOverride(PyUFuncObject *ufunc, char *method, - PyObject *in_args, PyObject *out_args, + PyObject *in_args, PyObject *out_args, PyObject *wheremask_obj, PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames, PyObject **result) { @@ -227,7 +231,7 @@ PyUFunc_CheckOverride(PyUFuncObject *ufunc, char *method, * Check inputs for overrides */ num_override_args = get_array_ufunc_overrides( - in_args, out_args, with_override, array_ufunc_methods); + in_args, out_args, wheremask_obj, with_override, array_ufunc_methods); if (num_override_args == -1) { goto fail; } diff --git a/numpy/core/src/umath/override.h b/numpy/core/src/umath/override.h index 4e9a323ca..20621bb19 100644 --- a/numpy/core/src/umath/override.h +++ b/numpy/core/src/umath/override.h @@ -6,7 +6,7 @@ NPY_NO_EXPORT int PyUFunc_CheckOverride(PyUFuncObject *ufunc, char *method, - PyObject *in_args, PyObject *out_args, + PyObject *in_args, PyObject *out_args, PyObject *wheremask_obj, PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames, PyObject **result); diff --git a/numpy/core/src/umath/reduction.c b/numpy/core/src/umath/reduction.c index 817f99a04..9416e9a29 100644 --- a/numpy/core/src/umath/reduction.c +++ b/numpy/core/src/umath/reduction.c @@ -17,6 +17,9 @@ #include "numpy/arrayobject.h" #include "npy_pycompat.h" +#include "array_assign.h" +#include "array_coercion.h" +#include "array_method.h" #include "ctors.h" #include "numpy/ufuncobject.h" @@ -148,24 +151,15 @@ PyArray_CopyInitialReduceValues( * context : The ArrayMethod context (with ufunc, method, and descriptors). * operand : The array to be reduced. * out : NULL, or the array into which to place the result. - * wheremask : NOT YET SUPPORTED, but this parameter is placed here - * so that support can be added in the future without breaking - * API compatibility. Pass in NULL. + * wheremask : Reduction mask of valid values used for `where=`. * axis_flags : Flags indicating the reduction axes of 'operand'. - * reorderable : If True, the reduction being done is reorderable, which - * means specifying multiple axes of reduction at once is ok, - * and the reduction code may calculate the reduction in an - * arbitrary order. The calculation may be reordered because - * of cache behavior or multithreading requirements. * keepdims : If true, leaves the reduction dimensions in the result * with size one. * subok : If true, the result uses the subclass of operand, otherwise * it is always a base class ndarray. 
- * identity : If Py_None, PyArray_CopyInitialReduceValues is used, otherwise - * this value is used to initialize the result to - * the reduction's unit. + * initial : Initial value, if NULL the default is fetched from the + * ArrayMethod (typically as the default from the ufunc). * loop : `reduce_loop` from `ufunc_object.c`. TODO: Refactor - * data : Data which is passed to the inner loop. * buffersize : Buffer size for the iterator. For the default, pass in 0. * funcname : The name of the reduction function, for error messages. * errormask : forwarded from _get_bufsize_errmask @@ -182,9 +176,9 @@ PyArray_CopyInitialReduceValues( NPY_NO_EXPORT PyArrayObject * PyUFunc_ReduceWrapper(PyArrayMethod_Context *context, PyArrayObject *operand, PyArrayObject *out, PyArrayObject *wheremask, - npy_bool *axis_flags, int reorderable, int keepdims, - PyObject *identity, PyArray_ReduceLoopFunc *loop, - void *data, npy_intp buffersize, const char *funcname, int errormask) + npy_bool *axis_flags, int keepdims, + PyObject *initial, PyArray_ReduceLoopFunc *loop, + npy_intp buffersize, const char *funcname, int errormask) { assert(loop != NULL); PyArrayObject *result = NULL; @@ -198,38 +192,35 @@ PyUFunc_ReduceWrapper(PyArrayMethod_Context *context, /* Loop auxdata (must be freed on error) */ NpyAuxData *auxdata = NULL; - /* More than one axis means multiple orders are possible */ - if (!reorderable && count_axes(PyArray_NDIM(operand), axis_flags) > 1) { - PyErr_Format(PyExc_ValueError, - "reduction operation '%s' is not reorderable, " - "so at most one axis may be specified", - funcname); - return NULL; - } - /* Can only use where with an initial ( from identity or argument) */ - if (wheremask != NULL && identity == Py_None) { - PyErr_Format(PyExc_ValueError, - "reduction operation '%s' does not have an identity, " - "so to use a where mask one has to specify 'initial'", - funcname); - return NULL; - } - - /* Set up the iterator */ op[0] = out; op[1] = operand; op_dtypes[0] = context->descriptors[0]; op_dtypes[1] = context->descriptors[1]; + /* Buffer to use when we need an initial value */ + char *initial_buf = NULL; + + /* More than one axis means multiple orders are possible */ + if (!(context->method->flags & NPY_METH_IS_REORDERABLE) + && count_axes(PyArray_NDIM(operand), axis_flags) > 1) { + PyErr_Format(PyExc_ValueError, + "reduction operation '%s' is not reorderable, " + "so at most one axis may be specified", + funcname); + goto fail; + } + it_flags = NPY_ITER_BUFFERED | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_GROWINNER | - NPY_ITER_DONT_NEGATE_STRIDES | NPY_ITER_ZEROSIZE_OK | NPY_ITER_REFS_OK | NPY_ITER_DELAY_BUFALLOC | NPY_ITER_COPY_IF_OVERLAP; + if (!(context->method->flags & NPY_METH_IS_REORDERABLE)) { + it_flags |= NPY_ITER_DONT_NEGATE_STRIDES; + } op_flags[0] = NPY_ITER_READWRITE | NPY_ITER_ALIGNED | NPY_ITER_ALLOCATE | @@ -297,8 +288,52 @@ PyUFunc_ReduceWrapper(PyArrayMethod_Context *context, goto fail; } + npy_bool empty_iteration = NpyIter_GetIterSize(iter) == 0; result = NpyIter_GetOperandArray(iter)[0]; + /* + * Get the initial value (if it exists). If the iteration is empty + * then we assume the reduction is also empty. The reason is that when + * the outer iteration is empty we just won't use the initial value + * in any case. (`np.sum(np.zeros((0, 3)), axis=0)` is a length 3 + * reduction but has an empty result.) 
+ */ + if ((initial == NULL && context->method->get_reduction_initial == NULL) + || initial == Py_None) { + /* There is no initial value, or initial value was explicitly unset */ + } + else { + /* Not all functions will need initialization, but init always: */ + initial_buf = PyMem_Calloc(1, op_dtypes[0]->elsize); + if (initial_buf == NULL) { + PyErr_NoMemory(); + goto fail; + } + if (initial != NULL) { + /* must use user provided initial value */ + if (PyArray_Pack(op_dtypes[0], initial_buf, initial) < 0) { + goto fail; + } + } + else { + /* + * Fetch initial from ArrayMethod, we pretend the reduction is + * empty when the iteration is. This may be wrong, but when it is, + * we will not need the identity as the result is also empty. + */ + int has_initial = context->method->get_reduction_initial( + context, empty_iteration, initial_buf); + if (has_initial < 0) { + goto fail; + } + if (!has_initial) { + /* We have no initial value available, free buffer to indicate */ + PyMem_FREE(initial_buf); + initial_buf = NULL; + } + } + } + PyArrayMethod_StridedLoop *strided_loop; NPY_ARRAYMETHOD_FLAGS flags = 0; @@ -313,12 +348,27 @@ PyUFunc_ReduceWrapper(PyArrayMethod_Context *context, * Initialize the result to the reduction unit if possible, * otherwise copy the initial values and get a view to the rest. */ - if (identity != Py_None) { - if (PyArray_FillWithScalar(result, identity) < 0) { + if (initial_buf != NULL) { + /* Loop provided an identity or default value, assign to result. */ + int ret = raw_array_assign_scalar( + PyArray_NDIM(result), PyArray_DIMS(result), + PyArray_DESCR(result), + PyArray_BYTES(result), PyArray_STRIDES(result), + op_dtypes[0], initial_buf); + if (ret < 0) { goto fail; } } else { + /* Can only use where with an initial (from identity or argument) */ + if (wheremask != NULL) { + PyErr_Format(PyExc_ValueError, + "reduction operation '%s' does not have an identity, " + "so to use a where mask one has to specify 'initial'", + funcname); + return NULL; + } + /* * For 1-D skip_first_count could be optimized to 0, but no-identity * reductions are not super common. 
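[Reference sketch, not part of the patch] The rewritten PyUFunc_ReduceWrapper only allows a `where=` mask when an initial value is available (user-supplied or fetched from the ArrayMethod), because a masked reduction has nothing to seed the output with when the leading elements are masked out. A minimal scalar model in plain C, with illustrative names:

#include <stdio.h>
#include <stddef.h>

/* Masked sum over n values: only elements with where[i] != 0 contribute.
 * The accumulator must start from an identity/initial value, because the
 * first unmasked element may appear anywhere, or nowhere at all. */
static double masked_sum(const double *val, const unsigned char *where,
                         size_t n, double initial)
{
    double acc = initial;               /* identity of '+' is 0.0 */
    for (size_t i = 0; i < n; i++) {
        if (where[i]) {
            acc += val[i];
        }
    }
    return acc;
}

int main(void) {
    double v[] = {1.0, 2.0, 3.0, 4.0};
    unsigned char w[] = {0, 1, 0, 1};   /* first element masked out */
    printf("%g\n", masked_sum(v, w, 4, 0.0));   /* prints 6 */
    return 0;
}

For an operation without an identity (e.g. minimum), no such seed exists, which is exactly the case the ValueError below guards against.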
@@ -354,7 +404,7 @@ PyUFunc_ReduceWrapper(PyArrayMethod_Context *context, } } - if (NpyIter_GetIterSize(iter) != 0) { + if (!empty_iteration) { NpyIter_IterNextFunc *iternext; char **dataptr; npy_intp *strideptr; @@ -387,6 +437,10 @@ PyUFunc_ReduceWrapper(PyArrayMethod_Context *context, } Py_INCREF(result); + if (initial_buf != NULL && PyDataType_REFCHK(PyArray_DESCR(result))) { + PyArray_Item_XDECREF(initial_buf, PyArray_DESCR(result)); + } + PyMem_FREE(initial_buf); NPY_AUXDATA_FREE(auxdata); if (!NpyIter_Deallocate(iter)) { Py_DECREF(result); @@ -395,6 +449,10 @@ PyUFunc_ReduceWrapper(PyArrayMethod_Context *context, return result; fail: + if (initial_buf != NULL && PyDataType_REFCHK(op_dtypes[0])) { + PyArray_Item_XDECREF(initial_buf, op_dtypes[0]); + } + PyMem_FREE(initial_buf); NPY_AUXDATA_FREE(auxdata); if (iter != NULL) { NpyIter_Deallocate(iter); diff --git a/numpy/core/src/umath/reduction.h b/numpy/core/src/umath/reduction.h index 2170e27a7..d2cbe4849 100644 --- a/numpy/core/src/umath/reduction.h +++ b/numpy/core/src/umath/reduction.h @@ -64,8 +64,8 @@ typedef int (PyArray_ReduceLoopFunc)(PyArrayMethod_Context *context, NPY_NO_EXPORT PyArrayObject * PyUFunc_ReduceWrapper(PyArrayMethod_Context *context, PyArrayObject *operand, PyArrayObject *out, PyArrayObject *wheremask, - npy_bool *axis_flags, int reorderable, int keepdims, - PyObject *identity, PyArray_ReduceLoopFunc *loop, - void *data, npy_intp buffersize, const char *funcname, int errormask); + npy_bool *axis_flags, int keepdims, + PyObject *initial, PyArray_ReduceLoopFunc *loop, + npy_intp buffersize, const char *funcname, int errormask); #endif diff --git a/numpy/core/src/umath/scalarmath.c.src b/numpy/core/src/umath/scalarmath.c.src index 7c63ac0f1..a159fdc12 100644 --- a/numpy/core/src/umath/scalarmath.c.src +++ b/numpy/core/src/umath/scalarmath.c.src @@ -57,7 +57,7 @@ * #name = byte, short, int, long, longlong# * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong# */ -static NPY_INLINE int +static inline int @name@_ctype_add(@type@ a, @type@ b, @type@ *out) { *out = a + b; if ((*out^a) >= 0 || (*out^b) >= 0) { @@ -66,7 +66,7 @@ static NPY_INLINE int return NPY_FPE_OVERFLOW; } -static NPY_INLINE int +static inline int @name@_ctype_subtract(@type@ a, @type@ b, @type@ *out) { *out = a - b; if ((*out^a) >= 0 || (*out^~b) >= 0) { @@ -80,7 +80,7 @@ static NPY_INLINE int * #name = ubyte, ushort, uint, ulong, ulonglong# * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong# */ -static NPY_INLINE int +static inline int @name@_ctype_add(@type@ a, @type@ b, @type@ *out) { *out = a + b; if (*out >= a && *out >= b) { @@ -89,7 +89,7 @@ static NPY_INLINE int return NPY_FPE_OVERFLOW; } -static NPY_INLINE int +static inline int @name@_ctype_subtract(@type@ a, @type@ b, @type@ *out) { *out = a - b; if (a >= b) { @@ -118,7 +118,7 @@ static NPY_INLINE int * #neg = (1,0)*4# */ #if NPY_SIZEOF_@SIZE@ > NPY_SIZEOF_@SIZENAME@ -static NPY_INLINE int +static inline int @name@_ctype_multiply(@type@ a, @type@ b, @type@ *out) { @big@ temp; temp = ((@big@) a) * ((@big@) b); @@ -144,7 +144,7 @@ static NPY_INLINE int * #SIZE = INT*2, LONG*2, LONGLONG*2# */ #if NPY_SIZEOF_LONGLONG == NPY_SIZEOF_@SIZE@ -static NPY_INLINE int +static inline int @name@_ctype_multiply(@type@ a, @type@ b, @type@ *out) { if (npy_mul_with_overflow_@name@(out, a, b)) { return NPY_FPE_OVERFLOW; @@ -171,7 +171,7 @@ static NPY_INLINE int #define DIVIDEBYZERO_CHECK (b == 0) #endif -static NPY_INLINE int +static inline int @name@_ctype_divide(@type@ a, @type@ 
b, @type@ *out) { if (b == 0) { *out = 0; @@ -200,7 +200,7 @@ static NPY_INLINE int #define @name@_ctype_floor_divide @name@_ctype_divide -static NPY_INLINE int +static inline int @name@_ctype_remainder(@type@ a, @type@ b, @type@ *out) { if (DIVIDEBYZERO_CHECK) { *out = 0; @@ -232,7 +232,7 @@ static NPY_INLINE int * ulong, longlong, ulonglong# */ -static NPY_INLINE int +static inline int @name@_ctype_true_divide(npy_@name@ a, npy_@name@ b, npy_double *out) { *out = (npy_double)a / (npy_double)b; @@ -251,7 +251,7 @@ static NPY_INLINE int * #upc = BYTE, UBYTE, SHORT, USHORT, INT, UINT, * LONG, ULONG, LONGLONG, ULONGLONG# */ -static NPY_INLINE int +static inline int @name@_ctype_power(@type@ a, @type@ b, @type@ *out) { @type@ tmp; @@ -292,7 +292,7 @@ static NPY_INLINE int * #op = &, ^, |# */ -static NPY_INLINE int +static inline int @name@_ctype_@oper@(@type@ arg1, @type@ arg2, @type@ *out) { *out = arg1 @op@ arg2; @@ -301,14 +301,14 @@ static NPY_INLINE int /**end repeat1**/ -static NPY_INLINE int +static inline int @name@_ctype_lshift(@type@ arg1, @type@ arg2, @type@ *out) { *out = npy_lshift@suffix@(arg1, arg2); return 0; } -static NPY_INLINE int +static inline int @name@_ctype_rshift(@type@ arg1, @type@ arg2, @type@ *out) { *out = npy_rshift@suffix@(arg1, arg2); @@ -328,7 +328,7 @@ static NPY_INLINE int * #oper = add, subtract, multiply, divide# */ -static NPY_INLINE int +static inline int @name@_ctype_@oper@(@type@ a, @type@ b, @type@ *out) { *out = a @OP@ b; @@ -340,21 +340,21 @@ static NPY_INLINE int #define @name@_ctype_true_divide @name@_ctype_divide -static NPY_INLINE int +static inline int @name@_ctype_floor_divide(@type@ a, @type@ b, @type@ *out) { *out = npy_floor_divide@c@(a, b); return 0; } -static NPY_INLINE int +static inline int @name@_ctype_remainder(@type@ a, @type@ b, @type@ *out) { *out = npy_remainder@c@(a, b); return 0; } -static NPY_INLINE int +static inline int @name@_ctype_divmod(@type@ a, @type@ b, @type@ *out1, @type@ *out2) { *out1 = npy_divmod@c@(a, b, out2); return 0; @@ -368,7 +368,7 @@ static NPY_INLINE int * #oper = add, subtract, multiply, divide# */ -static NPY_INLINE int +static inline int half_ctype_@oper@(npy_half a, npy_half b, npy_half *out) { float res = npy_half_to_float(a) @OP@ npy_half_to_float(b); @@ -380,7 +380,7 @@ half_ctype_@oper@(npy_half a, npy_half b, npy_half *out) #define half_ctype_true_divide half_ctype_divide -static NPY_INLINE int +static inline int half_ctype_floor_divide(npy_half a, npy_half b, npy_half *out) { npy_half mod; @@ -396,7 +396,7 @@ half_ctype_floor_divide(npy_half a, npy_half b, npy_half *out) } -static NPY_INLINE int +static inline int half_ctype_remainder(npy_half a, npy_half b, npy_half *out) { npy_half_divmod(a, b, out); @@ -404,7 +404,7 @@ half_ctype_remainder(npy_half a, npy_half b, npy_half *out) } -static NPY_INLINE int +static inline int half_ctype_divmod(npy_half a, npy_half b, npy_half *out1, npy_half *out2) { *out1 = npy_half_divmod(a, b, out2); @@ -419,7 +419,7 @@ half_ctype_divmod(npy_half a, npy_half b, npy_half *out1, npy_half *out2) * #rtype = npy_float, npy_double, npy_longdouble# * #c = f,,l# */ -static NPY_INLINE int +static inline int @name@_ctype_add(@type@ a, @type@ b, @type@ *out) { out->real = a.real + b.real; @@ -427,7 +427,7 @@ static NPY_INLINE int return 0; } -static NPY_INLINE int +static inline int @name@_ctype_subtract(@type@ a, @type@ b, @type@ *out) { out->real = a.real - b.real; @@ -440,7 +440,7 @@ static NPY_INLINE int * TODO: Mark as to work around FPEs not being issues on clang 12. 
* This should be removed when possible. */ -static NPY_INLINE int +static inline int @name@_ctype_multiply( @type@ a, @type@ b, @type@ *out) { out->real = a.real * b.real - a.imag * b.imag; @@ -449,11 +449,11 @@ static NPY_INLINE int } /* Use the ufunc loop directly to avoid duplicating the complicated logic */ -static NPY_INLINE int +static inline int @name@_ctype_divide(@type@ a, @type@ b, @type@ *out) { char *args[3] = {(char *)&a, (char *)&b, (char *)out}; - npy_intp steps[3]; + npy_intp steps[3] = {0, 0, 0}; npy_intp size = 1; @TYPE@_divide(args, &size, steps, NULL); return 0; @@ -470,7 +470,7 @@ static NPY_INLINE int * longlong, ulonglong# */ -static NPY_INLINE int +static inline int @name@_ctype_divmod(npy_@name@ a, npy_@name@ b, npy_@name@ *out, npy_@name@ *out2) { int res = @name@_ctype_floor_divide(a, b, out); @@ -487,7 +487,7 @@ static NPY_INLINE int * #c = f,,l# */ -static NPY_INLINE int +static inline int @name@_ctype_power(@type@ a, @type@ b, @type@ *out) { *out = npy_pow@c@(a, b); @@ -495,7 +495,7 @@ static NPY_INLINE int } /**end repeat**/ -static NPY_INLINE int +static inline int half_ctype_power(npy_half a, npy_half b, npy_half *out) { const npy_float af = npy_half_to_float(a); @@ -518,7 +518,7 @@ half_ctype_power(npy_half a, npy_half b, npy_half *out) * #uns = (0,1)*5,0*3# * #int = 1*10,0*3# */ -static NPY_INLINE int +static inline int @name@_ctype_negative(@type@ a, @type@ *out) { #if @uns@ @@ -541,7 +541,7 @@ static NPY_INLINE int } /**end repeat**/ -static NPY_INLINE int +static inline int half_ctype_negative(npy_half a, npy_half *out) { *out = a^0x8000u; @@ -553,7 +553,7 @@ half_ctype_negative(npy_half a, npy_half *out) * #name = cfloat, cdouble, clongdouble# * #type = npy_cfloat, npy_cdouble, npy_clongdouble# */ -static NPY_INLINE int +static inline int @name@_ctype_negative(@type@ a, @type@ *out) { out->real = -a.real; @@ -570,7 +570,7 @@ static NPY_INLINE int * npy_long, npy_ulong, npy_longlong, npy_ulonglong, * npy_half, npy_float, npy_double, npy_longdouble# */ -static NPY_INLINE int +static inline int @name@_ctype_positive(@type@ a, @type@ *out) { *out = a; @@ -583,7 +583,7 @@ static NPY_INLINE int * #type = npy_cfloat, npy_cdouble, npy_clongdouble# * #c = f,,l# */ -static NPY_INLINE int +static inline int @name@_ctype_positive(@type@ a, @type@ *out) { out->real = a.real; @@ -591,7 +591,7 @@ static NPY_INLINE int return 0; } -static NPY_INLINE int +static inline int @name@_ctype_power(@type@ a, @type@ b, @type@ *out) { *out = npy_cpow@c@(a, b); @@ -614,7 +614,7 @@ static NPY_INLINE int * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong# * #NAME = BYTE, SHORT, INT, LONG, LONGLONG# */ -static NPY_INLINE int +static inline int @name@_ctype_absolute(@type@ a, @type@ *out) { if (a == NPY_MIN_@NAME@) { @@ -631,7 +631,7 @@ static NPY_INLINE int * #type = npy_float, npy_double, npy_longdouble# * #c = f,,l# */ -static NPY_INLINE int +static inline int @name@_ctype_absolute(@type@ a, @type@ *out) { *out = npy_fabs@c@(a); @@ -639,7 +639,7 @@ static NPY_INLINE int } /**end repeat**/ -static NPY_INLINE int +static inline int half_ctype_absolute(npy_half a, npy_half *out) { *out = a&0x7fffu; @@ -652,7 +652,7 @@ half_ctype_absolute(npy_half a, npy_half *out) * #rtype = npy_float, npy_double, npy_longdouble# * #c = f,,l# */ -static NPY_INLINE int +static inline int @name@_ctype_absolute(@type@ a, @rtype@ *out) { *out = npy_cabs@c@(a); @@ -665,7 +665,7 @@ static NPY_INLINE int * ulong, longlong, ulonglong# */ -static NPY_INLINE int +static inline int 
@name@_ctype_invert(npy_@name@ a, npy_@name@ *out) { *out = ~a; @@ -806,7 +806,7 @@ typedef enum { */ CONVERT_PYSCALAR, /* - * Other object is an unkown scalar or array-like, we (typically) use + * Other object is an unknown scalar or array-like, we (typically) use * the generic path, which normally ends up in the ufunc machinery. */ OTHER_IS_UNKNOWN_OBJECT, @@ -929,7 +929,7 @@ typedef enum { * @result The result value indicating what we did with `value` or what type * of object it is (see `conversion_result`). */ -static NPY_INLINE conversion_result +static inline conversion_result convert_to_@name@(PyObject *value, @type@ *result, npy_bool *may_need_deferring) { PyArray_Descr *descr; @@ -1004,7 +1004,7 @@ convert_to_@name@(PyObject *value, @type@ *result, npy_bool *may_need_deferring) if (overflow) { /* handle as if "unsafe" */ if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) { - return PROMOTION_REQUIRED; + return OTHER_IS_UNKNOWN_OBJECT; } return CONVERT_PYSCALAR; } @@ -1179,6 +1179,11 @@ convert_to_@name@(PyObject *value, @type@ *result, npy_bool *may_need_deferring) * (Half, Float, Double, LongDouble, * CFloat, CDouble, CLongDouble)*4, * (Half, Float, Double, LongDouble)*3# + * #NAME = (BYTE, UBYTE, SHORT, USHORT, INT, UINT, + * LONG, ULONG, LONGLONG, ULONGLONG)*12, + * (HALF, FLOAT, DOUBLE, LONGDOUBLE, + * CFLOAT, CDOUBLE, CLONGDOUBLE)*4, + * (HALF, FLOAT, DOUBLE, LONGDOUBLE)*3# * #type = (npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint, * npy_long, npy_ulong, npy_longlong, npy_ulonglong)*12, * (npy_half, npy_float, npy_double, npy_longdouble, @@ -1202,24 +1207,12 @@ convert_to_@name@(PyObject *value, @type@ *result, npy_bool *may_need_deferring) * (npy_half, npy_float, npy_double, npy_longdouble, * npy_cfloat, npy_cdouble, npy_clongdouble)*4, * (npy_half, npy_float, npy_double, npy_longdouble)*3# - * #oname = (byte, ubyte, short, ushort, int, uint, - * long, ulong, longlong, ulonglong)*11, - * double*10, - * (half, float, double, longdouble, - * cfloat, cdouble, clongdouble)*4, - * (half, float, double, longdouble)*3# * #OName = (Byte, UByte, Short, UShort, Int, UInt, * Long, ULong, LongLong, ULongLong)*11, * Double*10, * (Half, Float, Double, LongDouble, * CFloat, CDouble, CLongDouble)*4, * (Half, Float, Double, LongDouble)*3# - * #ONAME = (BYTE, UBYTE, SHORT, USHORT, INT, UINT, - * LONG, ULONG, LONGLONG, ULONGLONG)*11, - * DOUBLE*10, - * (HALF, FLOAT, DOUBLE, LONGDOUBLE, - * CFLOAT, CDOUBLE, CLONGDOUBLE)*4, - * (HALF, FLOAT, DOUBLE, LONGDOUBLE)*3# */ #define IS_@name@ /* drop the "true_" from "true_divide" for floating point warnings: */ @@ -1234,7 +1227,7 @@ static PyObject * @name@_@oper@(PyObject *a, PyObject *b) { PyObject *ret; - @otype@ arg1, arg2, other_val; + @type@ arg1, arg2, other_val; /* * Check if this operation may be considered forward. Note `is_forward` @@ -1263,7 +1256,7 @@ static PyObject * PyObject *other = is_forward ? 
b : a; npy_bool may_need_deferring; - conversion_result res = convert_to_@oname@( + conversion_result res = convert_to_@name@( other, &other_val, &may_need_deferring); if (res == CONVERSION_ERROR) { return NULL; /* an error occurred (should never happen) */ @@ -1305,7 +1298,7 @@ static PyObject * */ return PyGenericArrType_Type.tp_as_number->nb_@oper@(a,b); case CONVERT_PYSCALAR: - if (@ONAME@_setitem(other, (char *)&other_val, NULL) < 0) { + if (@NAME@_setitem(other, (char *)&other_val, NULL) < 0) { return NULL; } break; @@ -1345,7 +1338,7 @@ static PyObject * #if @twoout@ int retstatus = @name@_ctype_@oper@(arg1, arg2, &out, &out2); #else - int retstatus = @oname@_ctype_@oper@(arg1, arg2, &out); + int retstatus = @name@_ctype_@oper@(arg1, arg2, &out); #endif #if @fperr@ @@ -1549,7 +1542,7 @@ static PyObject * * */ -/* +/* * Complex numbers do not support remainder so we manually make sure that the * operation is not defined. This is/was especially important for longdoubles * due to their tendency to recurse for some operations, see gh-18548. @@ -1711,7 +1704,7 @@ static int emit_complexwarning(void) { static PyObject *cls = NULL; - npy_cache_import("numpy.core", "ComplexWarning", &cls); + npy_cache_import("numpy.exceptions", "ComplexWarning", &cls); if (cls == NULL) { return -1; } diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src deleted file mode 100644 index d6c9a7e65..000000000 --- a/numpy/core/src/umath/simd.inc.src +++ /dev/null @@ -1,1215 +0,0 @@ - - -/* - * This file is for the definitions of simd vectorized operations. - * - * Currently contains sse2 functions that are built on amd64, x32 or - * non-generic builds (CFLAGS=-march=...) - * In future it may contain other instruction sets like AVX or NEON detected - * at runtime in which case it needs to be included indirectly via a file - * compiled with special options (or use gcc target attributes) so the binary - * stays portable. 
- */ - - -#ifndef __NPY_SIMD_INC -#define __NPY_SIMD_INC - -#include "lowlevel_strided_loops.h" -#include "numpy/npy_common.h" -#include "numpy/npy_math.h" -#include "npy_simd_data.h" -#ifdef NPY_HAVE_SSE2_INTRINSICS -#include <emmintrin.h> -#if !defined(_MSC_VER) || _MSC_VER >= 1600 -#include <immintrin.h> -#else -#undef __AVX2__ -#undef __AVX512F__ -#endif -#endif -#include "loops_utils.h" // nomemoverlap -#include <assert.h> -#include <stdlib.h> -#include <float.h> -#include <string.h> /* for memcpy */ - -#define VECTOR_SIZE_BYTES 16 - -/* - * Dispatcher functions - * decide whether the operation can be vectorized and run it - * if it was run returns true and false if nothing was done - */ - -/* - ***************************************************************************** - ** CMPLX DISPATCHERS - ***************************************************************************** - */ - -/**begin repeat - * #TYPE = CFLOAT, CDOUBLE# - * #type= npy_float, npy_double# - * #esize = 8, 16# - */ - -/**begin repeat1 - * #func = square, absolute, conjugate# - * #outsize = 1, 2, 1# - * #max_stride = 2, 8, 8# - */ - -#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS -static NPY_INLINE NPY_GCC_TARGET_AVX512F void -AVX512F_@func@_@TYPE@(@type@*, @type@*, const npy_intp n, const npy_intp stride); -#endif - -static NPY_INLINE int -run_unary_avx512f_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps) -{ -#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS - if ((IS_OUTPUT_BLOCKABLE_UNARY(@esize@, (npy_uint)(@esize@/@outsize@), 64)) && (labs(steps[0]) < 2*@max_stride@*@esize@)) { - AVX512F_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0], steps[0]); - return 1; - } - else - return 0; -#endif - return 0; -} - -/**end repeat1**/ -/**end repeat**/ - -/* - ***************************************************************************** - ** FLOAT DISPATCHERS - ***************************************************************************** - */ - -/**begin repeat - * #type = npy_float, npy_double, npy_longdouble# - * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# - * #EXISTS = 1, 1, 0# - */ - -/**begin repeat1 - * #func = isnan, isfinite, isinf, signbit# - */ - -#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@ -static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void -AVX512_SKX_@func@_@TYPE@(npy_bool*, @type@*, const npy_intp n, const npy_intp stride); -#endif - -static NPY_INLINE int -run_@func@_avx512_skx_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps) -{ -#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@ - if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(@type@), sizeof(npy_bool), 64)) { - AVX512_SKX_@func@_@TYPE@((npy_bool*)args[1], (@type@*)args[0], dimensions[0], steps[0]); - return 1; - } - else { - return 0; - } -#endif - return 0; -} - - -/**end repeat1**/ -/**end repeat**/ - -/**begin repeat - * Float types - * #type = npy_float, npy_double, npy_longdouble# - * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# - * #vector = 1, 1, 0# - * #VECTOR = NPY_SIMD, NPY_SIMD_F64, 0 # - */ - -/**begin repeat1 - * #func = negative# - * #check = IS_BLOCKABLE_UNARY# - * #name = unary# - */ - -#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS - -/* prototypes */ -static void -sse2_@func@_@TYPE@(@type@ *, @type@ *, const npy_intp n); - -#endif - -static NPY_INLINE int -run_@name@_simd_@func@_@TYPE@(char 
**args, npy_intp const *dimensions, npy_intp const *steps) -{ -#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS - if (@check@(sizeof(@type@), VECTOR_SIZE_BYTES)) { - sse2_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0]); - return 1; - } -#endif - return 0; -} - -/**end repeat1**/ - -/**begin repeat1 - * #kind = isnan, isfinite, isinf, signbit# - */ - -#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS - -static void -sse2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n); - -#endif - -static NPY_INLINE int -run_@kind@_simd_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps) -{ -#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS - if (steps[0] == sizeof(@type@) && steps[1] == 1 && - npy_is_aligned(args[0], sizeof(@type@))) { - sse2_@kind@_@TYPE@((npy_bool*)args[1], (@type@*)args[0], dimensions[0]); - return 1; - } -#endif - return 0; -} - -/**end repeat1**/ - -/**end repeat**/ - -/* - ***************************************************************************** - ** BOOL DISPATCHERS - ***************************************************************************** - */ - -/**begin repeat - * # kind = logical_or, logical_and# - */ - -#if defined NPY_HAVE_SSE2_INTRINSICS -static void -sse2_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2, - npy_intp n); - -static void -sse2_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, npy_intp n); -#endif - -static NPY_INLINE int -run_binary_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps) -{ -#if defined NPY_HAVE_SSE2_INTRINSICS - if (sizeof(npy_bool) == 1 && - IS_BLOCKABLE_BINARY(sizeof(npy_bool), VECTOR_SIZE_BYTES)) { - sse2_binary_@kind@_BOOL((npy_bool*)args[2], (npy_bool*)args[0], - (npy_bool*)args[1], dimensions[0]); - return 1; - } -#endif - return 0; -} - - -static NPY_INLINE int -run_reduce_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps) -{ -#if defined NPY_HAVE_SSE2_INTRINSICS - if (sizeof(npy_bool) == 1 && - IS_BLOCKABLE_REDUCE(sizeof(npy_bool), VECTOR_SIZE_BYTES)) { - sse2_reduce_@kind@_BOOL((npy_bool*)args[0], (npy_bool*)args[1], - dimensions[0]); - return 1; - } -#endif - return 0; -} - -/**end repeat**/ - -/**begin repeat - * # kind = absolute, logical_not# - */ - -#if defined NPY_HAVE_SSE2_INTRINSICS -static void -sse2_@kind@_BOOL(npy_bool *, npy_bool *, const npy_intp n); -#endif - -static NPY_INLINE int -run_unary_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps) -{ -#if defined NPY_HAVE_SSE2_INTRINSICS - if (sizeof(npy_bool) == 1 && - IS_BLOCKABLE_UNARY(sizeof(npy_bool), VECTOR_SIZE_BYTES)) { - sse2_@kind@_BOOL((npy_bool*)args[1], (npy_bool*)args[0], dimensions[0]); - return 1; - } -#endif - return 0; -} - -/**end repeat**/ - -#ifdef NPY_HAVE_SSE2_INTRINSICS - -/* - * Vectorized operations - */ -/* - ***************************************************************************** - ** FLOAT LOOPS - ***************************************************************************** - */ - -/**begin repeat -* horizontal reductions on a vector -* # VOP = min, max# -*/ - -NPY_FINLINE npy_float sse2_horizontal_@VOP@___m128(__m128 v) -{ - npy_float r; - __m128 tmp = _mm_movehl_ps(v, v); /* c d ... */ - __m128 m = _mm_@VOP@_ps(v, tmp); /* m(ac) m(bd) ... */ - tmp = _mm_shuffle_ps(m, m, _MM_SHUFFLE(1, 1, 1, 1));/* m(bd) m(bd) ... */ - _mm_store_ss(&r, _mm_@VOP@_ps(tmp, m)); /* m(acbd) ... 
*/ - return r; -} - -NPY_FINLINE npy_double sse2_horizontal_@VOP@___m128d(__m128d v) -{ - npy_double r; - __m128d tmp = _mm_unpackhi_pd(v, v); /* b b */ - _mm_store_sd(&r, _mm_@VOP@_pd(tmp, v)); /* m(ab) m(bb) */ - return r; -} -/**end repeat**/ - -/**begin repeat - * #type = npy_float, npy_double# - * #TYPE = FLOAT, DOUBLE# - * #scalarf = npy_sqrtf, npy_sqrt# - * #c = f, # - * #vtype = __m128, __m128d# - * #vtype256 = __m256, __m256d# - * #vtype512 = __m512, __m512d# - * #vpre = _mm, _mm# - * #vpre256 = _mm256, _mm256# - * #vpre512 = _mm512, _mm512# - * #vsuf = ps, pd# - * #vsufs = ss, sd# - * #nan = NPY_NANF, NPY_NAN# - * #double = 0, 1# - * #cast = _mm_castps_si128, _mm_castpd_si128# - */ -/* - * compress 4 vectors to 4/8 bytes in op with filled with 0 or 1 - * the last vector is passed as a pointer as MSVC 2010 is unable to ignore the - * calling convention leading to C2719 on 32 bit, see #4795 - */ -NPY_FINLINE void -sse2_compress4_to_byte_@TYPE@(@vtype@ r1, @vtype@ r2, @vtype@ r3, @vtype@ * r4, - npy_bool * op) -{ - const __m128i mask = @vpre@_set1_epi8(0x1); - __m128i ir1 = @vpre@_packs_epi32(@cast@(r1), @cast@(r2)); - __m128i ir2 = @vpre@_packs_epi32(@cast@(r3), @cast@(*r4)); - __m128i rr = @vpre@_packs_epi16(ir1, ir2); -#if @double@ - rr = @vpre@_packs_epi16(rr, rr); - rr = @vpre@_and_si128(rr, mask); - @vpre@_storel_epi64((__m128i*)op, rr); -#else - rr = @vpre@_and_si128(rr, mask); - @vpre@_storeu_si128((__m128i*)op, rr); -#endif -} - -static void -sse2_signbit_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n) -{ - LOOP_BLOCK_ALIGN_VAR(ip1, @type@, VECTOR_SIZE_BYTES) { - op[i] = npy_signbit(ip1[i]) != 0; - } - LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { - @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]); - int r = @vpre@_movemask_@vsuf@(a); - if (sizeof(@type@) == 8) { - op[i] = r & 1; - op[i + 1] = (r >> 1); - } - else { - op[i] = r & 1; - op[i + 1] = (r >> 1) & 1; - op[i + 2] = (r >> 2) & 1; - op[i + 3] = (r >> 3); - } - } - LOOP_BLOCKED_END { - op[i] = npy_signbit(ip1[i]) != 0; - } -} - -/**begin repeat1 - * #kind = isnan, isfinite, isinf# - * #var = 0, 1, 2# - */ - -static void -sse2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n) -{ -#if @var@ != 0 /* isinf/isfinite */ - /* signbit mask 0x7FFFFFFF after andnot */ - const @vtype@ mask = @vpre@_set1_@vsuf@(-0.@c@); - const @vtype@ ones = @vpre@_cmpeq_@vsuf@(@vpre@_setzero_@vsuf@(), - @vpre@_setzero_@vsuf@()); -#if @double@ - const @vtype@ fltmax = @vpre@_set1_@vsuf@(DBL_MAX); -#else - const @vtype@ fltmax = @vpre@_set1_@vsuf@(FLT_MAX); -#endif -#endif - LOOP_BLOCK_ALIGN_VAR(ip1, @type@, VECTOR_SIZE_BYTES) { - op[i] = npy_@kind@(ip1[i]) != 0; - } - LOOP_BLOCKED(@type@, 4 * VECTOR_SIZE_BYTES) { - @vtype@ a = @vpre@_load_@vsuf@(&ip1[i + 0 * VECTOR_SIZE_BYTES / sizeof(@type@)]); - @vtype@ b = @vpre@_load_@vsuf@(&ip1[i + 1 * VECTOR_SIZE_BYTES / sizeof(@type@)]); - @vtype@ c = @vpre@_load_@vsuf@(&ip1[i + 2 * VECTOR_SIZE_BYTES / sizeof(@type@)]); - @vtype@ d = @vpre@_load_@vsuf@(&ip1[i + 3 * VECTOR_SIZE_BYTES / sizeof(@type@)]); - @vtype@ r1, r2, r3, r4; -#if @var@ != 0 /* isinf/isfinite */ - /* fabs via masking of sign bit */ - r1 = @vpre@_andnot_@vsuf@(mask, a); - r2 = @vpre@_andnot_@vsuf@(mask, b); - r3 = @vpre@_andnot_@vsuf@(mask, c); - r4 = @vpre@_andnot_@vsuf@(mask, d); -#if @var@ == 1 /* isfinite */ - /* negative compare against max float, nan is always true */ - r1 = @vpre@_cmpnle_@vsuf@(r1, fltmax); - r2 = @vpre@_cmpnle_@vsuf@(r2, fltmax); - r3 = @vpre@_cmpnle_@vsuf@(r3, fltmax); - r4 = @vpre@_cmpnle_@vsuf@(r4, fltmax); -#else /* 
isinf */ - r1 = @vpre@_cmpnlt_@vsuf@(fltmax, r1); - r2 = @vpre@_cmpnlt_@vsuf@(fltmax, r2); - r3 = @vpre@_cmpnlt_@vsuf@(fltmax, r3); - r4 = @vpre@_cmpnlt_@vsuf@(fltmax, r4); -#endif - /* flip results to what we want (andnot as there is no sse not) */ - r1 = @vpre@_andnot_@vsuf@(r1, ones); - r2 = @vpre@_andnot_@vsuf@(r2, ones); - r3 = @vpre@_andnot_@vsuf@(r3, ones); - r4 = @vpre@_andnot_@vsuf@(r4, ones); -#endif -#if @var@ == 0 /* isnan */ - r1 = @vpre@_cmpneq_@vsuf@(a, a); - r2 = @vpre@_cmpneq_@vsuf@(b, b); - r3 = @vpre@_cmpneq_@vsuf@(c, c); - r4 = @vpre@_cmpneq_@vsuf@(d, d); -#endif - sse2_compress4_to_byte_@TYPE@(r1, r2, r3, &r4, &op[i]); - } - LOOP_BLOCKED_END { - op[i] = npy_@kind@(ip1[i]) != 0; - } -} - -/**end repeat1**/ - -static void -sse2_negative_@TYPE@(@type@ * op, @type@ * ip, const npy_intp n) -{ - /* - * get 0x7FFFFFFF mask (everything but signbit set) - * float & ~mask will remove the sign, float ^ mask flips the sign - * this is equivalent to how the compiler implements fabs on amd64 - */ - const @vtype@ mask = @vpre@_set1_@vsuf@(-0.@c@); - - /* align output to VECTOR_SIZE_BYTES bytes */ - LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES) { - op[i] = -ip[i]; - } - assert((npy_uintp)n < (VECTOR_SIZE_BYTES / sizeof(@type@)) || - npy_is_aligned(&op[i], VECTOR_SIZE_BYTES)); - if (npy_is_aligned(&ip[i], VECTOR_SIZE_BYTES)) { - LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { - @vtype@ a = @vpre@_load_@vsuf@(&ip[i]); - @vpre@_store_@vsuf@(&op[i], @vpre@_xor_@vsuf@(mask, a)); - } - } - else { - LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { - @vtype@ a = @vpre@_loadu_@vsuf@(&ip[i]); - @vpre@_store_@vsuf@(&op[i], @vpre@_xor_@vsuf@(mask, a)); - } - } - LOOP_BLOCKED_END { - op[i] = -ip[i]; - } -} -/**end repeat1**/ - -/**end repeat**/ - -/* bunch of helper functions used in ISA_exp/log_FLOAT*/ - -#if defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256 -fma_get_full_load_mask_ps(void) -{ - return _mm256_set1_ps(-1.0); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i -fma_get_full_load_mask_pd(void) -{ - return _mm256_castpd_si256(_mm256_set1_pd(-1.0)); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256 -fma_get_partial_load_mask_ps(const npy_int num_elem, const npy_int num_lanes) -{ - float maskint[16] = {-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0, - 1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; - float* addr = maskint + num_lanes - num_elem; - return _mm256_loadu_ps(addr); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i -fma_get_partial_load_mask_pd(const npy_int num_elem, const npy_int num_lanes) -{ - npy_int maskint[16] = {-1,-1,-1,-1,-1,-1,-1,-1,1,1,1,1,1,1,1,1}; - npy_int* addr = maskint + 2*num_lanes - 2*num_elem; - return _mm256_loadu_si256((__m256i*) addr); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256 -fma_masked_gather_ps(__m256 src, - npy_float* addr, - __m256i vindex, - __m256 mask) -{ - return _mm256_mask_i32gather_ps(src, addr, vindex, mask, 4); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d -fma_masked_gather_pd(__m256d src, - npy_double* addr, - __m128i vindex, - __m256d mask) -{ - return _mm256_mask_i32gather_pd(src, addr, vindex, mask, 8); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256 -fma_masked_load_ps(__m256 mask, npy_float* addr) -{ - return _mm256_maskload_ps(addr, _mm256_cvtps_epi32(mask)); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d -fma_masked_load_pd(__m256i mask, npy_double* addr) -{ - return _mm256_maskload_pd(addr, mask); -} - -NPY_FINLINE 
NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256 -fma_set_masked_lanes_ps(__m256 x, __m256 val, __m256 mask) -{ - return _mm256_blendv_ps(x, val, mask); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d -fma_set_masked_lanes_pd(__m256d x, __m256d val, __m256d mask) -{ - return _mm256_blendv_pd(x, val, mask); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256 -fma_blend(__m256 x, __m256 y, __m256 ymask) -{ - return _mm256_blendv_ps(x, y, ymask); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256 -fma_invert_mask_ps(__m256 ymask) -{ - return _mm256_andnot_ps(ymask, _mm256_set1_ps(-1.0)); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i -fma_invert_mask_pd(__m256i ymask) -{ - return _mm256_andnot_si256(ymask, _mm256_set1_epi32(0xFFFFFFFF)); -} - -/**begin repeat - * #vsub = ps, pd# - * #vtype = __m256, __m256d# - */ -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@ -fma_abs_@vsub@(@vtype@ x) -{ - return _mm256_andnot_@vsub@(_mm256_set1_@vsub@(-0.0), x); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@ -fma_reciprocal_@vsub@(@vtype@ x) -{ - return _mm256_div_@vsub@(_mm256_set1_@vsub@(1.0f), x); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@ -fma_rint_@vsub@(@vtype@ x) -{ - return _mm256_round_@vsub@(x, _MM_FROUND_TO_NEAREST_INT); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@ -fma_floor_@vsub@(@vtype@ x) -{ - return _mm256_round_@vsub@(x, _MM_FROUND_TO_NEG_INF); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@ -fma_trunc_@vsub@(@vtype@ x) -{ - return _mm256_round_@vsub@(x, _MM_FROUND_TO_ZERO); -} -/**end repeat**/ -#endif - -#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16 -avx512_get_full_load_mask_ps(void) -{ - return 0xFFFF; -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8 -avx512_get_full_load_mask_pd(void) -{ - return 0xFF; -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16 -avx512_get_partial_load_mask_ps(const npy_int num_elem, const npy_int total_elem) -{ - return (0x0001 << num_elem) - 0x0001; -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8 -avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem) -{ - return (0x01 << num_elem) - 0x01; -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512 -avx512_masked_gather_ps(__m512 src, - npy_float* addr, - __m512i vindex, - __mmask16 kmask) -{ - return _mm512_mask_i32gather_ps(src, kmask, vindex, addr, 4); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d -avx512_masked_gather_pd(__m512d src, - npy_double* addr, - __m256i vindex, - __mmask8 kmask) -{ - return _mm512_mask_i32gather_pd(src, kmask, vindex, addr, 8); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512 -avx512_masked_load_ps(__mmask16 mask, npy_float* addr) -{ - return _mm512_maskz_loadu_ps(mask, (__m512 *)addr); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d -avx512_masked_load_pd(__mmask8 mask, npy_double* addr) -{ - return _mm512_maskz_loadu_pd(mask, (__m512d *)addr); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512 -avx512_set_masked_lanes_ps(__m512 x, __m512 val, __mmask16 mask) -{ - return _mm512_mask_blend_ps(mask, x, val); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d -avx512_set_masked_lanes_pd(__m512d x, __m512d val, __mmask8 mask) -{ - return _mm512_mask_blend_pd(mask, x, val); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512 -avx512_blend(__m512 x, __m512 y, __mmask16 ymask) 
-{ - return _mm512_mask_mov_ps(x, ymask, y); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16 -avx512_invert_mask_ps(__mmask16 ymask) -{ - return _mm512_knot(ymask); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8 -avx512_invert_mask_pd(__mmask8 ymask) -{ - return _mm512_knot(ymask); -} - -/**begin repeat - * #vsub = ps, pd# - * #type= npy_float, npy_double# - * #epi_vsub = epi32, epi64# - * #vtype = __m512, __m512d# - * #mask = __mmask16, __mmask8# - * #and_const = 0x7fffffff, 0x7fffffffffffffffLL# - * #neg_mask = 0x80000000, 0x8000000000000000# - * #perm_ = 0xb1, 0x55# - * #cmpx_img_mask = 0xAAAA, 0xAA# - * #cmpx_re_mask = 0x5555, 0x55# - * #INF = NPY_INFINITYF, NPY_INFINITY# - * #NAN = NPY_NANF, NPY_NAN# - */ -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@ -avx512_abs_@vsub@(@vtype@ x) -{ - return (@vtype@) _mm512_and_@epi_vsub@((__m512i) x, - _mm512_set1_@epi_vsub@ (@and_const@)); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@ -avx512_reciprocal_@vsub@(@vtype@ x) -{ - return _mm512_div_@vsub@(_mm512_set1_@vsub@(1.0f), x); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@ -avx512_rint_@vsub@(@vtype@ x) -{ - return _mm512_roundscale_@vsub@(x, 0x08); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@ -avx512_floor_@vsub@(@vtype@ x) -{ - return _mm512_roundscale_@vsub@(x, 0x09); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@ -avx512_trunc_@vsub@(@vtype@ x) -{ - return _mm512_roundscale_@vsub@(x, 0x0B); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@ -avx512_hadd_@vsub@(const @vtype@ x) -{ - return _mm512_add_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@)); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@ -avx512_hsub_@vsub@(const @vtype@ x) -{ - return _mm512_sub_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@)); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@ -avx512_cabsolute_@vsub@(const @vtype@ x1, - const @vtype@ x2, - const __m512i re_indices, - const __m512i im_indices) -{ - @vtype@ inf = _mm512_set1_@vsub@(@INF@); - @vtype@ nan = _mm512_set1_@vsub@(@NAN@); - @vtype@ x1_abs = avx512_abs_@vsub@(x1); - @vtype@ x2_abs = avx512_abs_@vsub@(x2); - @vtype@ re = _mm512_permutex2var_@vsub@(x1_abs, re_indices, x2_abs); - @vtype@ im = _mm512_permutex2var_@vsub@(x1_abs, im_indices , x2_abs); - /* - * If real or imag = INF, then convert it to inf + j*inf - * Handles: inf + j*nan, nan + j*inf - */ - @mask@ re_infmask = _mm512_cmp_@vsub@_mask(re, inf, _CMP_EQ_OQ); - @mask@ im_infmask = _mm512_cmp_@vsub@_mask(im, inf, _CMP_EQ_OQ); - im = _mm512_mask_mov_@vsub@(im, re_infmask, inf); - re = _mm512_mask_mov_@vsub@(re, im_infmask, inf); - - /* - * If real or imag = NAN, then convert it to nan + j*nan - * Handles: x + j*nan, nan + j*x - */ - @mask@ re_nanmask = _mm512_cmp_@vsub@_mask(re, re, _CMP_NEQ_UQ); - @mask@ im_nanmask = _mm512_cmp_@vsub@_mask(im, im, _CMP_NEQ_UQ); - im = _mm512_mask_mov_@vsub@(im, re_nanmask, nan); - re = _mm512_mask_mov_@vsub@(re, im_nanmask, nan); - - @vtype@ larger = _mm512_max_@vsub@(re, im); - @vtype@ smaller = _mm512_min_@vsub@(im, re); - - /* - * Calculate div_mask to prevent 0./0. 
and inf/inf operations in div - */ - @mask@ zeromask = _mm512_cmp_@vsub@_mask(larger, _mm512_setzero_@vsub@(), _CMP_EQ_OQ); - @mask@ infmask = _mm512_cmp_@vsub@_mask(smaller, inf, _CMP_EQ_OQ); - @mask@ div_mask = _mm512_knot(_mm512_kor(zeromask, infmask)); - @vtype@ ratio = _mm512_maskz_div_@vsub@(div_mask, smaller, larger); - @vtype@ hypot = _mm512_sqrt_@vsub@(_mm512_fmadd_@vsub@( - ratio, ratio, _mm512_set1_@vsub@(1.0f))); - return _mm512_mul_@vsub@(hypot, larger); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@ -avx512_conjugate_@vsub@(const @vtype@ x) -{ - /* - * __mm512_mask_xor_ps/pd requires AVX512DQ. We cast it to __m512i and - * use the xor_epi32/64 uinstruction instead. Cast is a zero latency instruction - */ - __m512i cast_x = _mm512_cast@vsub@_si512(x); - __m512i res = _mm512_mask_xor_@epi_vsub@(cast_x, @cmpx_img_mask@, - cast_x, _mm512_set1_@epi_vsub@(@neg_mask@)); - return _mm512_castsi512_@vsub@(res); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@ -avx512_cmul_@vsub@(@vtype@ x1, @vtype@ x2) -{ - // x1 = r1, i1 - // x2 = r2, i2 - @vtype@ x3 = _mm512_permute_@vsub@(x2, @perm_@); // i2, r2 - @vtype@ x12 = _mm512_mul_@vsub@(x1, x2); // r1*r2, i1*i2 - @vtype@ x13 = _mm512_mul_@vsub@(x1, x3); // r1*i2, r2*i1 - @vtype@ outreal = avx512_hsub_@vsub@(x12); // r1*r2 - i1*i2, r1*r2 - i1*i2 - @vtype@ outimg = avx512_hadd_@vsub@(x13); // r1*i2 + i1*r2, r1*i2 + i1*r2 - return _mm512_mask_blend_@vsub@(@cmpx_img_mask@, outreal, outimg); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@ -avx512_csquare_@vsub@(@vtype@ x) -{ - return avx512_cmul_@vsub@(x, x); -} - -/**end repeat**/ -#endif - -/**begin repeat - * #ISA = FMA, AVX512F# - * #isa = fma, avx512# - * #vtype = __m256, __m512# - * #vsize = 256, 512# - * #or = or_ps, kor# - * #vsub = , _mask# - * #mask = __m256, __mmask16# - * #fmadd = _mm256_fmadd_ps, _mm512_fmadd_ps# - * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS# - **/ - -#if defined @CHK@ - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@ -@isa@_sqrt_ps(@vtype@ x) -{ - return _mm@vsize@_sqrt_ps(x); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d -@isa@_sqrt_pd(@vtype@d x) -{ - return _mm@vsize@_sqrt_pd(x); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@ -@isa@_square_ps(@vtype@ x) -{ - return _mm@vsize@_mul_ps(x,x); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d -@isa@_square_pd(@vtype@d x) -{ - return _mm@vsize@_mul_pd(x,x); -} - -#endif -/**end repeat**/ - -/**begin repeat - * #type = npy_float, npy_double# - * #TYPE = FLOAT, DOUBLE# - * #num_lanes = 16, 8# - * #vsuffix = ps, pd# - * #mask = __mmask16, __mmask8# - * #vtype = __m512, __m512d# - * #scale = 4, 8# - * #vindextype = __m512i, __m256i# - * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256# - * #episize = epi32, epi64# - */ - -/**begin repeat1 - * #func = isnan, isfinite, isinf, signbit# - * #IMM8 = 0x81, 0x99, 0x18, 0x04# - * #is_finite = 0, 1, 0, 0# - * #is_signbit = 0, 0, 0, 1# - */ - -#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS -static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void -AVX512_SKX_@func@_@TYPE@(npy_bool* op, @type@* ip, const npy_intp array_size, const npy_intp steps) -{ - const npy_intp stride_ip = steps/(npy_intp)sizeof(@type@); - npy_intp num_remaining_elements = array_size; - - @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@(); -#if @is_signbit@ - @vtype@ signbit = _mm512_set1_@vsuffix@(-0.0); -#endif 
- - /* - * Note: while generally indices are npy_intp, we ensure that our maximum - * index will fit in an int32 as a precondition for this function via - * IS_OUTPUT_BLOCKABLE_UNARY - */ - - npy_int32 index_ip[@num_lanes@]; - for (npy_int32 ii = 0; ii < @num_lanes@; ii++) { - index_ip[ii] = ii*stride_ip; - } - @vindextype@ vindex_ip = @vindexload@((@vindextype@*)&index_ip[0]); - @vtype@ zeros_f = _mm512_setzero_@vsuffix@(); - __m512i ones = _mm512_set1_@episize@(1); - - while (num_remaining_elements > 0) { - if (num_remaining_elements < @num_lanes@) { - load_mask = avx512_get_partial_load_mask_@vsuffix@( - num_remaining_elements, @num_lanes@); - } - @vtype@ x1; - if (stride_ip == 1) { - x1 = avx512_masked_load_@vsuffix@(load_mask, ip); - } - else { - x1 = avx512_masked_gather_@vsuffix@(zeros_f, ip, vindex_ip, load_mask); - } -#if @is_signbit@ - x1 = _mm512_and_@vsuffix@(x1,signbit); -#endif - - @mask@ fpclassmask = _mm512_fpclass_@vsuffix@_mask(x1, @IMM8@); -#if @is_finite@ - fpclassmask = _mm512_knot(fpclassmask); -#endif - - __m128i out =_mm512_maskz_cvts@episize@_epi8(fpclassmask, ones); - _mm_mask_storeu_epi8(op, load_mask, out); - - ip += @num_lanes@*stride_ip; - op += @num_lanes@; - num_remaining_elements -= @num_lanes@; - } -} -#endif -/**end repeat1**/ -/**end repeat**/ - -/**begin repeat - * #TYPE = CFLOAT, CDOUBLE# - * #type = npy_float, npy_double# - * #num_lanes = 16, 8# - * #vsuffix = ps, pd# - * #epi_vsub = epi32, epi64# - * #mask = __mmask16, __mmask8# - * #vtype = __m512, __m512d# - * #scale = 4, 8# - * #vindextype = __m512i, __m256i# - * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256# - * #storemask = 0xFF, 0xF# - * #IS_FLOAT = 1, 0# - */ - -/**begin repeat1 - * #func = square, conjugate# - * #vectorf = avx512_csquare, avx512_conjugate# - */ - -#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS -static NPY_GCC_OPT_3 NPY_INLINE NPY_GCC_TARGET_AVX512F void -AVX512F_@func@_@TYPE@(@type@ * op, - @type@ * ip, - const npy_intp array_size, - const npy_intp steps) -{ - npy_intp num_remaining_elements = 2*array_size; - const npy_intp stride_ip1 = steps/(npy_intp)sizeof(@type@)/2; - - /* - * Note: while generally indices are npy_intp, we ensure that our maximum index - * will fit in an int32 as a precondition for this function via max_stride - */ - npy_int32 index_ip1[16]; - for (npy_int32 ii = 0; ii < @num_lanes@; ii=ii+2) { - index_ip1[ii] = ii*stride_ip1; - index_ip1[ii+1] = ii*stride_ip1 + 1; - } - @vindextype@ vindex = @vindexload@((@vindextype@*)index_ip1); - @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@(); - @vtype@ zeros = _mm512_setzero_@vsuffix@(); - - while (num_remaining_elements > 0) { - if (num_remaining_elements < @num_lanes@) { - load_mask = avx512_get_partial_load_mask_@vsuffix@( - num_remaining_elements, @num_lanes@); - } - @vtype@ x1; - if (stride_ip1 == 1) { - x1 = avx512_masked_load_@vsuffix@(load_mask, ip); - } - else { - x1 = avx512_masked_gather_@vsuffix@(zeros, ip, vindex, load_mask); - } - - @vtype@ out = @vectorf@_@vsuffix@(x1); - - _mm512_mask_storeu_@vsuffix@(op, load_mask, out); - op += @num_lanes@; - ip += @num_lanes@*stride_ip1; - num_remaining_elements -= @num_lanes@; - } -} -#endif -/**end repeat1**/ - -#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS -static NPY_GCC_OPT_3 NPY_INLINE NPY_GCC_TARGET_AVX512F void -AVX512F_absolute_@TYPE@(@type@ * op, - @type@ * ip, - const npy_intp array_size, - const npy_intp steps) -{ - npy_intp 
num_remaining_elements = 2*array_size; - const npy_intp stride_ip1 = steps/(npy_intp)sizeof(@type@)/2; - - /* - * Note: while generally indices are npy_intp, we ensure that our maximum index - * will fit in an int32 as a precondition for this function via max_stride - */ - npy_int32 index_ip[32]; - for (npy_int32 ii = 0; ii < 2*@num_lanes@; ii=ii+2) { - index_ip[ii] = ii*stride_ip1; - index_ip[ii+1] = ii*stride_ip1 + 1; - } - @vindextype@ vindex1 = @vindexload@((@vindextype@*)index_ip); - @vindextype@ vindex2 = @vindexload@((@vindextype@*)(index_ip+@num_lanes@)); - - @mask@ load_mask1 = avx512_get_full_load_mask_@vsuffix@(); - @mask@ load_mask2 = avx512_get_full_load_mask_@vsuffix@(); - @mask@ store_mask = avx512_get_full_load_mask_@vsuffix@(); - @vtype@ zeros = _mm512_setzero_@vsuffix@(); - -#if @IS_FLOAT@ - __m512i re_index = _mm512_set_epi32(30,28,26,24,22,20,18,16,14,12,10,8,6,4,2,0); - __m512i im_index = _mm512_set_epi32(31,29,27,25,23,21,19,17,15,13,11,9,7,5,3,1); -#else - __m512i re_index = _mm512_set_epi64(14,12,10,8,6,4,2,0); - __m512i im_index = _mm512_set_epi64(15,13,11,9,7,5,3,1); -#endif - - while (num_remaining_elements > 0) { - if (num_remaining_elements < @num_lanes@) { - load_mask1 = avx512_get_partial_load_mask_@vsuffix@( - num_remaining_elements, @num_lanes@); - load_mask2 = 0x0000; - store_mask = avx512_get_partial_load_mask_@vsuffix@( - num_remaining_elements/2, @num_lanes@); - } else if (num_remaining_elements < 2*@num_lanes@) { - load_mask1 = avx512_get_full_load_mask_@vsuffix@(); - load_mask2 = avx512_get_partial_load_mask_@vsuffix@( - num_remaining_elements - @num_lanes@, @num_lanes@); - store_mask = avx512_get_partial_load_mask_@vsuffix@( - num_remaining_elements/2, @num_lanes@); - } - @vtype@ x1, x2; - if (stride_ip1 == 1) { - x1 = avx512_masked_load_@vsuffix@(load_mask1, ip); - x2 = avx512_masked_load_@vsuffix@(load_mask2, ip+@num_lanes@); - } - else { - x1 = avx512_masked_gather_@vsuffix@(zeros, ip, vindex1, load_mask1); - x2 = avx512_masked_gather_@vsuffix@(zeros, ip, vindex2, load_mask2); - } - - @vtype@ out = avx512_cabsolute_@vsuffix@(x1, x2, re_index, im_index); - - _mm512_mask_storeu_@vsuffix@(op, store_mask, out); - op += @num_lanes@; - ip += 2*@num_lanes@*stride_ip1; - num_remaining_elements -= 2*@num_lanes@; - } - npy_clear_floatstatus_barrier((char*)&num_remaining_elements); -} - -#endif -/**end repeat**/ - -/* - ***************************************************************************** - ** BOOL LOOPS - ***************************************************************************** - */ - -/**begin repeat - * # kind = logical_or, logical_and# - * # and = 0, 1# - * # op = ||, &&# - * # sc = !=, ==# - * # vpre = _mm*2# - * # vsuf = si128*2# - * # vtype = __m128i*2# - * # type = npy_bool*2# - * # vload = _mm_load_si128*2# - * # vloadu = _mm_loadu_si128*2# - * # vstore = _mm_store_si128*2# - */ - -/* - * convert any bit set to boolean true so vectorized and normal operations are - * consistent, should not be required if bool is used correctly everywhere but - * you never know - */ -#if !@and@ -NPY_FINLINE @vtype@ byte_to_true(@vtype@ v) -{ - const @vtype@ zero = @vpre@_setzero_@vsuf@(); - const @vtype@ truemask = @vpre@_set1_epi8(1 == 1); - /* get 0xFF for zeros */ - @vtype@ tmp = @vpre@_cmpeq_epi8(v, zero); - /* filled with 0xFF/0x00, negate and mask to boolean true */ - return @vpre@_andnot_@vsuf@(tmp, truemask); -} -#endif - -static void -sse2_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2, npy_intp n) -{ - LOOP_BLOCK_ALIGN_VAR(op, 
@type@, VECTOR_SIZE_BYTES) - op[i] = ip1[i] @op@ ip2[i]; - LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { - @vtype@ a = @vloadu@((@vtype@*)&ip1[i]); - @vtype@ b = @vloadu@((@vtype@*)&ip2[i]); -#if @and@ - const @vtype@ zero = @vpre@_setzero_@vsuf@(); - /* get 0xFF for non zeros*/ - @vtype@ tmp = @vpre@_cmpeq_epi8(a, zero); - /* andnot -> 0x00 for zeros xFF for non zeros, & with ip2 */ - tmp = @vpre@_andnot_@vsuf@(tmp, b); -#else - @vtype@ tmp = @vpre@_or_@vsuf@(a, b); -#endif - - @vstore@((@vtype@*)&op[i], byte_to_true(tmp)); - } - LOOP_BLOCKED_END { - op[i] = (ip1[i] @op@ ip2[i]); - } -} - - -static void -sse2_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, const npy_intp n) -{ - const @vtype@ zero = @vpre@_setzero_@vsuf@(); - LOOP_BLOCK_ALIGN_VAR(ip, npy_bool, VECTOR_SIZE_BYTES) { - *op = *op @op@ ip[i]; - if (*op @sc@ 0) { - return; - } - } - /* unrolled once to replace a slow movmsk with a fast pmaxb */ - LOOP_BLOCKED(npy_bool, 2 * VECTOR_SIZE_BYTES) { - @vtype@ v = @vload@((@vtype@*)&ip[i]); - @vtype@ v2 = @vload@((@vtype@*)&ip[i + VECTOR_SIZE_BYTES]); - v = @vpre@_cmpeq_epi8(v, zero); - v2 = @vpre@_cmpeq_epi8(v2, zero); -#if @and@ - if ((@vpre@_movemask_epi8(@vpre@_max_epu8(v, v2)) != 0)) { - *op = 0; -#else - if ((@vpre@_movemask_epi8(@vpre@_min_epu8(v, v2)) != 0xFFFF)) { - *op = 1; -#endif - return; - } - } - LOOP_BLOCKED_END { - *op = *op @op@ ip[i]; - if (*op @sc@ 0) { - return; - } - } -} - -/**end repeat**/ - -/**begin repeat - * # kind = absolute, logical_not# - * # op = !=, ==# - * # not = 0, 1# - * # vpre = _mm*2# - * # vsuf = si128*2# - * # vtype = __m128i*2# - * # type = npy_bool*2# - * # vloadu = _mm_loadu_si128*2# - * # vstore = _mm_store_si128*2# - */ - -static void -sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n) -{ - LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES) - op[i] = (ip[i] @op@ 0); - LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { - @vtype@ a = @vloadu@((@vtype@*)&ip[i]); -#if @not@ - const @vtype@ zero = @vpre@_setzero_@vsuf@(); - const @vtype@ truemask = @vpre@_set1_epi8(1 == 1); - /* equivalent to byte_to_true but can skip the negation */ - a = @vpre@_cmpeq_epi8(a, zero); - a = @vpre@_and_@vsuf@(a, truemask); -#else - /* abs is kind of pointless but maybe its used for byte_to_true */ - a = byte_to_true(a); -#endif - @vstore@((@vtype@*)&op[i], a); - } - LOOP_BLOCKED_END { - op[i] = (ip[i] @op@ 0); - } -} - -/**end repeat**/ - -#undef VECTOR_SIZE_BYTES -#endif /* NPY_HAVE_SSE2_INTRINSICS */ -#endif - diff --git a/numpy/core/src/umath/string_ufuncs.cpp b/numpy/core/src/umath/string_ufuncs.cpp index 5a35c318b..5d82be6db 100644 --- a/numpy/core/src/umath/string_ufuncs.cpp +++ b/numpy/core/src/umath/string_ufuncs.cpp @@ -16,7 +16,7 @@ template <typename character> -static NPY_INLINE int +static inline int character_cmp(character a, character b) { if (a == b) { @@ -37,7 +37,7 @@ character_cmp(character a, character b) * is always padded with zeros). 
*/ template <bool rstrip, typename character> -static NPY_INLINE int +static inline int string_cmp(int len1, const character *str1, int len2, const character *str2) { if (rstrip) { diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c index 693a6d6c9..39e64decb 100644 --- a/numpy/core/src/umath/ufunc_object.c +++ b/numpy/core/src/umath/ufunc_object.c @@ -52,6 +52,7 @@ #include "arrayobject.h" #include "common.h" +#include "ctors.h" #include "dtypemeta.h" #include "numpyos.h" #include "dispatching.h" @@ -1391,7 +1392,7 @@ try_trivial_single_output_loop(PyArrayMethod_Context *context, * or pass in the full cast information. But this can special case * the logical functions and prints a better error message. */ -static NPY_INLINE int +static inline int validate_casting(PyArrayMethodObject *method, PyUFuncObject *ufunc, PyArrayObject *ops[], PyArray_Descr *descriptors[], NPY_CASTING casting) @@ -1781,6 +1782,8 @@ _check_keepdims_support(PyUFuncObject *ufunc) { static int _parse_axes_arg(PyUFuncObject *ufunc, int op_core_num_dims[], PyObject *axes, PyArrayObject **op, int broadcast_ndim, int **remap_axis) { + static PyObject *AxisError_cls = NULL; + int nin = ufunc->nin; int nop = ufunc->nargs; int iop, list_size; @@ -1825,16 +1828,17 @@ _parse_axes_arg(PyUFuncObject *ufunc, int op_core_num_dims[], PyObject *axes, op_axes_tuple = PyList_GET_ITEM(axes, iop); if (PyTuple_Check(op_axes_tuple)) { if (PyTuple_Size(op_axes_tuple) != op_ncore) { - if (op_ncore == 1) { - PyErr_Format(PyExc_ValueError, - "axes item %d should be a tuple with a " - "single element, or an integer", iop); - } - else { - PyErr_Format(PyExc_ValueError, - "axes item %d should be a tuple with %d " - "elements", iop, op_ncore); + /* must have been a tuple with too many entries. */ + npy_cache_import( + "numpy.exceptions", "AxisError", &AxisError_cls); + if (AxisError_cls == NULL) { + return -1; } + PyErr_Format(AxisError_cls, + "%s: operand %d has %d core dimensions, " + "but %zd dimensions are specified by axes tuple.", + ufunc_get_name_cstr(ufunc), iop, op_ncore, + PyTuple_Size(op_axes_tuple)); return -1; } Py_INCREF(op_axes_tuple); @@ -1846,8 +1850,22 @@ _parse_axes_arg(PyUFuncObject *ufunc, int op_core_num_dims[], PyObject *axes, } } else { - PyErr_Format(PyExc_TypeError, "axes item %d should be a tuple", - iop); + /* If input is not an integer tell user that a tuple is needed */ + if (error_converting(PyArray_PyIntAsInt(op_axes_tuple))) { + PyErr_Format(PyExc_TypeError, + "%s: axes item %d should be a tuple.", + ufunc_get_name_cstr(ufunc), iop); + return -1; + } + /* If it is a single integer, inform user that more are needed */ + npy_cache_import("numpy.exceptions", "AxisError", &AxisError_cls); + if (AxisError_cls == NULL) { + return -1; + } + PyErr_Format(AxisError_cls, + "%s: operand %d has %d core dimensions, " + "but the axes item is a single integer.", + ufunc_get_name_cstr(ufunc), iop, op_ncore); return -1; } /* @@ -2067,12 +2085,16 @@ _get_coredim_sizes(PyUFuncObject *ufunc, PyArrayObject **op, } /* - * Returns a new reference + * Returns a new reference to the ufunc identity. Note that this identity + * is only a default identity value stored on the ufunc, since the invidiual + * ufunc loop (ArrayMethod) is queried for the actual identity. 
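+ * For the builtin cases, PyUFunc_Zero, PyUFunc_One and PyUFunc_MinusOne map
+ * to the Python ints 0, 1 and -1, while PyUFunc_None and
+ * PyUFunc_ReorderableNone both return None and differ only in the value
+ * written to *reorderable.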
+ * * TODO: store a reference in the ufunc object itself, rather than * constructing one each time */ -static PyObject * -_get_identity(PyUFuncObject *ufunc, npy_bool *reorderable) { +NPY_NO_EXPORT PyObject * +PyUFunc_GetDefaultIdentity(PyUFuncObject *ufunc, npy_bool *reorderable) +{ switch(ufunc->identity) { case PyUFunc_One: *reorderable = 1; @@ -2739,8 +2761,41 @@ reducelike_promote_and_resolve(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out, PyArray_DTypeMeta *signature[3], npy_bool enforce_uniform_args, PyArray_Descr *out_descrs[3], - char *method) + NPY_CASTING casting, char *method) { + /* + * If no dtype is specified and out is not specified, we override the + * integer and bool dtype used for add and multiply. + * + * TODO: The following should be handled by a promoter! + */ + if (signature[0] == NULL && out == NULL) { + /* + * For integer types --- make sure at least a long + * is used for add and multiply reduction to avoid overflow + */ + int typenum = PyArray_TYPE(arr); + if ((PyTypeNum_ISBOOL(typenum) || PyTypeNum_ISINTEGER(typenum)) + && ((strcmp(ufunc->name, "add") == 0) + || (strcmp(ufunc->name, "multiply") == 0))) { + if (PyTypeNum_ISBOOL(typenum)) { + typenum = NPY_LONG; + } + else if ((size_t)PyArray_DESCR(arr)->elsize < sizeof(long)) { + if (PyTypeNum_ISUNSIGNED(typenum)) { + typenum = NPY_ULONG; + } + else { + typenum = NPY_LONG; + } + } + signature[0] = PyArray_DTypeFromTypeNum(typenum); + } + } + assert(signature[2] == NULL); /* we always fill it here */ + Py_XINCREF(signature[0]); + signature[2] = signature[0]; + /* * Note that the `ops` is not really correct. But legacy resolution * cannot quite handle the correct ops (e.g. a NULL first item if `out` @@ -2802,7 +2857,7 @@ reducelike_promote_and_resolve(PyUFuncObject *ufunc, * (although this should possibly happen through a deprecation) */ if (resolve_descriptors(3, ufunc, ufuncimpl, - ops, out_descrs, signature, NPY_UNSAFE_CASTING) < 0) { + ops, out_descrs, signature, casting) < 0) { return NULL; } @@ -2825,8 +2880,7 @@ reducelike_promote_and_resolve(PyUFuncObject *ufunc, goto fail; } /* TODO: This really should _not_ be unsafe casting (same above)! */ - if (validate_casting(ufuncimpl, - ufunc, ops, out_descrs, NPY_UNSAFE_CASTING) < 0) { + if (validate_casting(ufuncimpl, ufunc, ops, out_descrs, casting) < 0) { goto fail; } @@ -2834,7 +2888,7 @@ reducelike_promote_and_resolve(PyUFuncObject *ufunc, fail: for (int i = 0; i < 3; ++i) { - Py_DECREF(out_descrs[i]); + Py_CLEAR(out_descrs[i]); } return NULL; } @@ -2956,10 +3010,8 @@ PyUFunc_Reduce(PyUFuncObject *ufunc, PyObject *initial, PyArrayObject *wheremask) { int iaxes, ndim; - npy_bool reorderable; npy_bool axis_flags[NPY_MAXDIMS]; - PyArrayObject *result = NULL; - PyObject *identity; + const char *ufunc_name = ufunc_get_name_cstr(ufunc); /* These parameters come from a TLS global */ int buffersize = 0, errormask = 0; @@ -2984,72 +3036,26 @@ PyUFunc_Reduce(PyUFuncObject *ufunc, return NULL; } - /* - * Promote and fetch ufuncimpl (currently needed to fix up the identity). - */ PyArray_Descr *descrs[3]; PyArrayMethodObject *ufuncimpl = reducelike_promote_and_resolve(ufunc, - arr, out, signature, NPY_FALSE, descrs, "reduce"); + arr, out, signature, NPY_FALSE, descrs, NPY_UNSAFE_CASTING, "reduce"); if (ufuncimpl == NULL) { return NULL; } - /* Get the identity */ - /* TODO: Both of these should be provided by the ArrayMethod! 
*/ - identity = _get_identity(ufunc, &reorderable); - if (identity == NULL) { - goto finish; - } - - /* Get the initial value */ - if (initial == NULL) { - initial = identity; - - /* - * The identity for a dynamic dtype like - * object arrays can't be used in general - */ - if (initial != Py_None && PyArray_ISOBJECT(arr) && PyArray_SIZE(arr) != 0) { - Py_DECREF(initial); - initial = Py_None; - Py_INCREF(initial); - } - else if (PyTypeNum_ISUNSIGNED(descrs[2]->type_num) - && PyLong_CheckExact(initial)) { - /* - * This is a bit of a hack until we have truly loop specific - * identities. Python -1 cannot be cast to unsigned so convert - * it to a NumPy scalar, but we use -1 for bitwise functions to - * signal all 1s. - * (A builtin identity would not overflow here, although we may - * unnecessary convert 0 and 1.) - */ - Py_SETREF(initial, PyObject_CallFunctionObjArgs( - (PyObject *)&PyLongArrType_Type, initial, NULL)); - if (initial == NULL) { - goto finish; - } - } - } else { - Py_DECREF(identity); - Py_INCREF(initial); /* match the reference count in the if above */ - } - PyArrayMethod_Context context = { .caller = (PyObject *)ufunc, .method = ufuncimpl, .descriptors = descrs, }; - result = PyUFunc_ReduceWrapper(&context, - arr, out, wheremask, axis_flags, reorderable, keepdims, - initial, reduce_loop, ufunc, buffersize, ufunc_name, errormask); + PyArrayObject *result = PyUFunc_ReduceWrapper(&context, + arr, out, wheremask, axis_flags, keepdims, + initial, reduce_loop, buffersize, ufunc_name, errormask); - finish: for (int i = 0; i < 3; i++) { Py_DECREF(descrs[i]); } - Py_XDECREF(initial); return result; } @@ -3094,7 +3100,8 @@ PyUFunc_Accumulate(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out, PyArray_Descr *descrs[3]; PyArrayMethodObject *ufuncimpl = reducelike_promote_and_resolve(ufunc, - arr, out, signature, NPY_TRUE, descrs, "accumulate"); + arr, out, signature, NPY_TRUE, descrs, NPY_UNSAFE_CASTING, + "accumulate"); if (ufuncimpl == NULL) { return NULL; } @@ -3511,7 +3518,8 @@ PyUFunc_Reduceat(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *ind, PyArray_Descr *descrs[3]; PyArrayMethodObject *ufuncimpl = reducelike_promote_and_resolve(ufunc, - arr, out, signature, NPY_TRUE, descrs, "reduceat"); + arr, out, signature, NPY_TRUE, descrs, NPY_UNSAFE_CASTING, + "reduceat"); if (ufuncimpl == NULL) { return NULL; } @@ -4063,7 +4071,7 @@ PyUFunc_GenericReduction(PyUFuncObject *ufunc, /* We now have all the information required to check for Overrides */ PyObject *override = NULL; int errval = PyUFunc_CheckOverride(ufunc, _reduce_type[operation], - full_args.in, full_args.out, args, len_args, kwnames, &override); + full_args.in, full_args.out, wheremask_obj, args, len_args, kwnames, &override); if (errval) { return NULL; } @@ -4169,38 +4177,6 @@ PyUFunc_GenericReduction(PyUFuncObject *ufunc, } } - /* - * If no dtype is specified and out is not specified, we override the - * integer and bool dtype used for add and multiply. - * - * TODO: The following should be handled by a promoter! 
- */ - if (signature[0] == NULL && out == NULL) { - /* - * For integer types --- make sure at least a long - * is used for add and multiply reduction to avoid overflow - */ - int typenum = PyArray_TYPE(mp); - if ((PyTypeNum_ISBOOL(typenum) || PyTypeNum_ISINTEGER(typenum)) - && ((strcmp(ufunc->name, "add") == 0) - || (strcmp(ufunc->name, "multiply") == 0))) { - if (PyTypeNum_ISBOOL(typenum)) { - typenum = NPY_LONG; - } - else if ((size_t)PyArray_DESCR(mp)->elsize < sizeof(long)) { - if (PyTypeNum_ISUNSIGNED(typenum)) { - typenum = NPY_ULONG; - } - else { - typenum = NPY_LONG; - } - } - signature[0] = PyArray_DTypeFromTypeNum(typenum); - } - } - Py_XINCREF(signature[0]); - signature[2] = signature[0]; - switch(operation) { case UFUNC_REDUCE: ret = PyUFunc_Reduce(ufunc, @@ -4361,23 +4337,16 @@ _get_dtype(PyObject *dtype_obj) { } else if (NPY_UNLIKELY(out->singleton != descr)) { /* This does not warn about `metadata`, but units is important. */ - if (!PyArray_EquivTypes(out->singleton, descr)) { - /* Deprecated NumPy 1.21.2 (was an accidental error in 1.21) */ - if (DEPRECATE( + if (out->singleton == NULL + || !PyArray_EquivTypes(out->singleton, descr)) { + PyErr_SetString(PyExc_TypeError, "The `dtype` and `signature` arguments to " "ufuncs only select the general DType and not details " - "such as the byte order or time unit (with rare " - "exceptions see release notes). To avoid this warning " - "please use the scalar types `np.float64`, or string " - "notation.\n" - "In rare cases where the time unit was preserved, " - "either cast the inputs or provide an output array. " - "In the future NumPy may transition to allow providing " - "`dtype=` to denote the outputs `dtype` as well. " - "(Deprecated NumPy 1.21)") < 0) { - Py_DECREF(descr); - return NULL; - } + "such as the byte order or time unit. 
" + "You can avoid this error by using the scalar types " + "`np.float64` or the dtype string notation."); + Py_DECREF(descr); + return NULL; } } Py_INCREF(out); @@ -4874,7 +4843,7 @@ ufunc_generic_fastcall(PyUFuncObject *ufunc, /* We now have all the information required to check for Overrides */ PyObject *override = NULL; errval = PyUFunc_CheckOverride(ufunc, method, - full_args.in, full_args.out, + full_args.in, full_args.out, where_obj, args, len_args, kwnames, &override); if (errval) { goto fail; @@ -5066,14 +5035,11 @@ ufunc_generic_vectorcall(PyObject *ufunc, NPY_NO_EXPORT PyObject * -ufunc_geterr(PyObject *NPY_UNUSED(dummy), PyObject *args) +ufunc_geterr(PyObject *NPY_UNUSED(dummy), PyObject *NPY_UNUSED(arg)) { PyObject *thedict; PyObject *res; - if (!PyArg_ParseTuple(args, "")) { - return NULL; - } thedict = PyThreadState_GetDict(); if (thedict == NULL) { thedict = PyEval_GetBuiltins(); @@ -5099,16 +5065,13 @@ ufunc_geterr(PyObject *NPY_UNUSED(dummy), PyObject *args) NPY_NO_EXPORT PyObject * -ufunc_seterr(PyObject *NPY_UNUSED(dummy), PyObject *args) +ufunc_seterr(PyObject *NPY_UNUSED(dummy), PyObject *arg) { PyObject *thedict; int res; - PyObject *val; + PyObject *val = arg; static char *msg = "Error object must be a list of length 3"; - if (!PyArg_ParseTuple(args, "O:seterrobj", &val)) { - return NULL; - } if (!PyList_CheckExact(val) || PyList_GET_SIZE(val) != 3) { PyErr_SetString(PyExc_ValueError, msg); return NULL; @@ -5379,7 +5342,7 @@ cmp_arg_types(int *arg1, int *arg2, int n) * This frees the linked-list structure when the CObject * is destroyed (removed from the internal dictionary) */ -static NPY_INLINE void +static inline void _free_loop1d_list(PyUFunc_Loop1d *data) { int i; @@ -5920,10 +5883,11 @@ ufunc_reduceat(PyUFuncObject *ufunc, } /* Helper for ufunc_at, below */ -static NPY_INLINE PyArrayObject * +static inline PyArrayObject * new_array_op(PyArrayObject *op_array, char *data) { npy_intp dims[1] = {1}; + Py_INCREF(PyArray_DESCR(op_array)); /* NewFromDescr steals a reference */ PyObject *r = PyArray_NewFromDescr(&PyArray_Type, PyArray_DESCR(op_array), 1, dims, NULL, data, NPY_ARRAY_WRITEABLE, NULL); @@ -5931,206 +5895,171 @@ new_array_op(PyArrayObject *op_array, char *data) } /* - * Call ufunc only on selected array items and store result in first operand. - * For add ufunc, method call is equivalent to op1[idx] += op2 with no - * buffering of the first operand. - * Arguments: - * op1 - First operand to ufunc - * idx - Indices that are applied to first operand. Equivalent to op1[idx]. - * op2 - Second operand to ufunc (if needed). Must be able to broadcast - * over first operand. 
+ * Use an indexed loop to do the work + * Returns 0 if successful */ -static PyObject * -ufunc_at(PyUFuncObject *ufunc, PyObject *args) +static int +trivial_at_loop(PyArrayMethodObject *ufuncimpl, NPY_ARRAYMETHOD_FLAGS flags, + PyArrayMapIterObject *iter, + PyArrayObject *op1_array, PyArrayObject *op2_array, + PyArrayMethod_Context *context) { - PyObject *op1 = NULL; - PyObject *idx = NULL; - PyObject *op2 = NULL; - PyArrayObject *op1_array = NULL; - PyArrayObject *op2_array = NULL; - PyArrayMapIterObject *iter = NULL; - PyArrayIterObject *iter2 = NULL; - PyArrayObject *operands[3] = {NULL, NULL, NULL}; - PyArrayObject *array_operands[3] = {NULL, NULL, NULL}; - - PyArray_DTypeMeta *signature[3] = {NULL, NULL, NULL}; - PyArray_DTypeMeta *operand_DTypes[3] = {NULL, NULL, NULL}; - PyArray_Descr *operation_descrs[3] = {NULL, NULL, NULL}; - - int nop; - - /* override vars */ - int errval; - PyObject *override = NULL; - - NpyIter *iter_buffer; - NpyIter_IterNextFunc *iternext; - npy_uint32 op_flags[NPY_MAXARGS]; - int buffersize; - int errormask = 0; - char * err_msg = NULL; - - PyArrayMethod_StridedLoop *strided_loop; - NpyAuxData *auxdata = NULL; + int buffersize=0, errormask = 0; + int res; + char *args[3]; + npy_intp steps[3]; + args[0] = (char *) iter->baseoffset; + steps[0] = iter->fancy_strides[0]; + if (ufuncimpl->nin == 1) { + args[2] = NULL; + steps[2] = 0; + } else { + args[2] = (char *)PyArray_DATA(op2_array); + if (PyArray_NDIM(op2_array) == 0 + || PyArray_DIM(op2_array, 0) <= 1) { + steps[2] = 0; + } else { + steps[2] = PyArray_STRIDE(op2_array, 0); + } + } - NPY_BEGIN_THREADS_DEF; + npy_intp *inner_size = NpyIter_GetInnerLoopSizePtr(iter->outer); - if (ufunc->core_enabled) { - PyErr_Format(PyExc_TypeError, - "%s.at does not support ufunc with non-trivial signature: %s has signature %s.", - ufunc->name, ufunc->name, ufunc->core_signature); - return NULL; + if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { + npy_clear_floatstatus_barrier((char *)context); } - if (ufunc->nin > 2) { - PyErr_SetString(PyExc_ValueError, - "Only unary and binary ufuncs supported at this time"); - return NULL; - } + do { + args[1] = (char *) iter->outer_ptrs[0]; + steps[1] = iter->outer_strides[0]; - if (ufunc->nout != 1) { - PyErr_SetString(PyExc_ValueError, - "Only single output ufuncs supported at this time"); - return NULL; - } + res = ufuncimpl->contiguous_indexed_loop( + context, args, inner_size, steps, NULL); + if (args[2] != NULL) { + args[2] += (*inner_size) * steps[2]; + } + } while (res == 0 && iter->outer_next(iter->outer)); - if (!PyArg_ParseTuple(args, "OO|O:at", &op1, &idx, &op2)) { - return NULL; + if (res == 0 && !(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { + const char * ufunc_name = + ufunc_get_name_cstr((PyUFuncObject *)context->caller); + if (_get_bufsize_errmask(NULL, ufunc_name, + &buffersize, &errormask) < 0) { + return -1; + } + res = _check_ufunc_fperr(errormask, NULL, ufunc_name); } + return res; +} - if (ufunc->nin == 2 && op2 == NULL) { - PyErr_SetString(PyExc_ValueError, - "second operand needed for ufunc"); - return NULL; - } - errval = PyUFunc_CheckOverride(ufunc, "at", - args, NULL, NULL, 0, NULL, &override); +static int +ufunc_at__fast_iter(PyUFuncObject *ufunc, NPY_ARRAYMETHOD_FLAGS flags, + PyArrayMapIterObject *iter, PyArrayIterObject *iter2, + PyArrayObject *op1_array, PyArrayObject *op2_array, + PyArrayMethod_StridedLoop *strided_loop, + PyArrayMethod_Context *context, + NpyAuxData *auxdata + ) +{ + int buffersize; + int errormask = 0; + int res = 0; + 
NPY_BEGIN_THREADS_DEF; - if (errval) { - return NULL; - } - else if (override) { - return override; + if (_get_bufsize_errmask(NULL, ufunc->name, &buffersize, &errormask) < 0) { + return -1; } - - if (!PyArray_Check(op1)) { - PyErr_SetString(PyExc_TypeError, - "first operand must be array"); - return NULL; + int needs_api = (flags & NPY_METH_REQUIRES_PYAPI) != 0; + if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { + /* Start with the floating-point exception flags cleared */ + npy_clear_floatstatus_barrier((char*)&iter); } - op1_array = (PyArrayObject *)op1; - - /* Create second operand from number array if needed. */ - if (op2 != NULL) { - op2_array = (PyArrayObject *)PyArray_FromAny(op2, NULL, - 0, 0, 0, NULL); - if (op2_array == NULL) { - goto fail; - } + if (!needs_api) { + NPY_BEGIN_THREADS; } - /* Create map iterator */ - iter = (PyArrayMapIterObject *)PyArray_MapIterArrayCopyIfOverlap( - op1_array, idx, 1, op2_array); - if (iter == NULL) { - goto fail; - } - op1_array = iter->array; /* May be updateifcopied on overlap */ + npy_intp strides[3] = {0, 0, 0}; + /* + * Iterate over first and second operands and call ufunc + * for each pair of inputs + */ + for (npy_intp i = iter->size; i > 0; i--) + { + char *dataptr[3]; + /* one element at a time, no stride required but read by innerloop */ + npy_intp count = 1; - if (op2 != NULL) { /* - * May need to swap axes so that second operand is - * iterated over correctly + * Set up data pointers for either one or two input operands. + * The output data pointer points to the first operand data. */ - if ((iter->subspace != NULL) && (iter->consec)) { - PyArray_MapIterSwapAxes(iter, &op2_array, 0); - if (op2_array == NULL) { - goto fail; - } + dataptr[0] = iter->dataptr; + if (iter2 != NULL) { + dataptr[1] = PyArray_ITER_DATA(iter2); + dataptr[2] = iter->dataptr; + } + else { + dataptr[1] = iter->dataptr; + dataptr[2] = NULL; } - /* - * Create array iter object for second operand that - * "matches" the map iter object for the first operand. - * Then we can just iterate over the first and second - * operands at the same time and not have to worry about - * picking the correct elements from each operand to apply - * the ufunc to. - */ - if ((iter2 = (PyArrayIterObject *)\ - PyArray_BroadcastToShape((PyObject *)op2_array, - iter->dimensions, iter->nd))==NULL) { - goto fail; + res = strided_loop(context, dataptr, &count, strides, auxdata); + if (res != 0) { + break; } - } - /* - * Create dtypes array for either one or two input operands. - * Compare to the logic in `convert_ufunc_arguments`. - * TODO: It may be good to review some of this behaviour, since the - * operand array is special (it is written to) similar to reductions. - * Using unsafe-casting as done here, is likely not desirable. 
- */ - operands[0] = op1_array; - operand_DTypes[0] = NPY_DTYPE(PyArray_DESCR(op1_array)); - Py_INCREF(operand_DTypes[0]); - int force_legacy_promotion = 0; - int allow_legacy_promotion = NPY_DT_is_legacy(operand_DTypes[0]); + PyArray_MapIterNext(iter); + if (iter2 != NULL) { + PyArray_ITER_NEXT(iter2); + } + } - if (op2_array != NULL) { - operands[1] = op2_array; - operand_DTypes[1] = NPY_DTYPE(PyArray_DESCR(op2_array)); - Py_INCREF(operand_DTypes[1]); - allow_legacy_promotion &= NPY_DT_is_legacy(operand_DTypes[1]); - operands[2] = operands[0]; - operand_DTypes[2] = operand_DTypes[0]; - Py_INCREF(operand_DTypes[2]); + NPY_END_THREADS; - nop = 3; - if (allow_legacy_promotion && ((PyArray_NDIM(op1_array) == 0) - != (PyArray_NDIM(op2_array) == 0))) { - /* both are legacy and only one is 0-D: force legacy */ - force_legacy_promotion = should_use_min_scalar(2, operands, 0, NULL); - } - } - else { - operands[1] = operands[0]; - operand_DTypes[1] = operand_DTypes[0]; - Py_INCREF(operand_DTypes[1]); - operands[2] = NULL; - nop = 2; + if (res == 0 && !(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { + /* NOTE: We could check float errors even when `res < 0` */ + res = _check_ufunc_fperr(errormask, NULL, "at"); } + return res; +} - PyArrayMethodObject *ufuncimpl = promote_and_get_ufuncimpl(ufunc, - operands, signature, operand_DTypes, - force_legacy_promotion, allow_legacy_promotion, - NPY_FALSE, NPY_FALSE); - if (ufuncimpl == NULL) { - goto fail; - } +static int +ufunc_at__slow_iter(PyUFuncObject *ufunc, NPY_ARRAYMETHOD_FLAGS flags, + PyArrayMapIterObject *iter, PyArrayIterObject *iter2, + PyArrayObject *op1_array, PyArrayObject *op2_array, + PyArray_Descr *operation_descrs[3], + PyArrayMethod_StridedLoop *strided_loop, + PyArrayMethod_Context *context, + NpyAuxData *auxdata + ) +{ + NpyIter *iter_buffer = NULL; + PyArrayObject *array_operands[3] = {NULL, NULL, NULL}; + int buffersize; + int errormask = 0; + int res = 0; + int nop = 0; + NpyIter_IterNextFunc *iternext; + char * err_msg = NULL; + NPY_BEGIN_THREADS_DEF; - /* Find the correct descriptors for the operation */ - if (resolve_descriptors(nop, ufunc, ufuncimpl, - operands, operation_descrs, signature, NPY_UNSAFE_CASTING) < 0) { - goto fail; + if (_get_bufsize_errmask(NULL, ufunc->name, &buffersize, &errormask) < 0) { + return -1; } - - Py_INCREF(PyArray_DESCR(op1_array)); array_operands[0] = new_array_op(op1_array, iter->dataptr); if (iter2 != NULL) { - Py_INCREF(PyArray_DESCR(op2_array)); array_operands[1] = new_array_op(op2_array, PyArray_ITER_DATA(iter2)); - Py_INCREF(PyArray_DESCR(op1_array)); array_operands[2] = new_array_op(op1_array, iter->dataptr); + nop = 3; } else { - Py_INCREF(PyArray_DESCR(op1_array)); array_operands[1] = new_array_op(op1_array, iter->dataptr); array_operands[2] = NULL; + nop = 2; } - /* Set up the flags */ + npy_uint32 op_flags[3]; op_flags[0] = NPY_ITER_READONLY| NPY_ITER_ALIGNED; @@ -6150,11 +6079,6 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args) NPY_ITER_NO_BROADCAST| NPY_ITER_NO_SUBTYPE; } - - if (_get_bufsize_errmask(NULL, ufunc->name, &buffersize, &errormask) < 0) { - goto fail; - } - /* * Create NpyIter object to "iterate" over single element of each input * operand. 
This is an easy way to reuse the NpyIter logic for dealing @@ -6177,33 +6101,23 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args) -1, NULL, NULL, buffersize); if (iter_buffer == NULL) { - goto fail; + /* will fail only on memory allocation errors */ + for (int i = 0; i < 3; i++) { + Py_XDECREF(array_operands[i]); + } + return -1; } iternext = NpyIter_GetIterNext(iter_buffer, NULL); if (iternext == NULL) { + /* can not really happen, iter_buffer creation is tightly controlled */ NpyIter_Deallocate(iter_buffer); - goto fail; - } - - PyArrayMethod_Context context = { - .caller = (PyObject *)ufunc, - .method = ufuncimpl, - .descriptors = operation_descrs, - }; - - NPY_ARRAYMETHOD_FLAGS flags; - /* Use contiguous strides; if there is such a loop it may be faster */ - npy_intp strides[3] = { - operation_descrs[0]->elsize, operation_descrs[1]->elsize, 0}; - if (nop == 3) { - strides[2] = operation_descrs[2]->elsize; + for (int i = 0; i < 3; i++) { + Py_XDECREF(array_operands[i]); + } + return -1; } - if (ufuncimpl->get_strided_loop(&context, 1, 0, strides, - &strided_loop, &auxdata, &flags) < 0) { - goto fail; - } int needs_api = (flags & NPY_METH_REQUIRES_PYAPI) != 0; needs_api |= NpyIter_IterationNeedsAPI(iter_buffer); if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { @@ -6211,6 +6125,7 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args) npy_clear_floatstatus_barrier((char*)&iter); } + npy_intp strides[3] = {0, 0, 0}; if (!needs_api) { NPY_BEGIN_THREADS; } @@ -6219,7 +6134,6 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args) * Iterate over first and second operands and call ufunc * for each pair of inputs */ - int res = 0; for (npy_intp i = iter->size; i > 0; i--) { char *dataptr[3]; @@ -6250,7 +6164,7 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args) buffer_dataptr = NpyIter_GetDataPtrArray(iter_buffer); - res = strided_loop(&context, buffer_dataptr, &count, strides, auxdata); + res = strided_loop(context, buffer_dataptr, &count, strides, auxdata); if (res != 0) { break; } @@ -6276,18 +6190,284 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args) /* NOTE: We could check float errors even when `res < 0` */ res = _check_ufunc_fperr(errormask, NULL, "at"); } + NpyIter_Deallocate(iter_buffer); + for (int i = 0; i < 3; i++) { + Py_XDECREF(array_operands[i]); + } + return res; +} +/* + * Call ufunc only on selected array items and store result in first operand. + * For add ufunc, method call is equivalent to op1[idx] += op2 with no + * buffering of the first operand. + * Arguments: + * op1 - First operand to ufunc + * idx - Indices that are applied to first operand. Equivalent to op1[idx]. + * op2 - Second operand to ufunc (if needed). Must be able to broadcast + * over first operand. 
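+ * For example, because the first operand is not buffered, duplicate indices
+ * accumulate:  a = np.zeros(3); np.add.at(a, [0, 0, 1], 1.)  leaves
+ * a == [2., 1., 0.], whereas the buffered  a[[0, 0, 1]] += 1.  adds to a[0]
+ * only once.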
+ */ +static PyObject * +ufunc_at(PyUFuncObject *ufunc, PyObject *args) +{ + PyObject *op1 = NULL; + PyObject *idx = NULL; + PyObject *op2 = NULL; + PyArrayObject *op1_array = NULL; + PyArrayObject *op2_array = NULL; + PyArrayMapIterObject *iter = NULL; + PyArrayIterObject *iter2 = NULL; + PyArray_Descr *operation_descrs[3] = {NULL, NULL, NULL}; + + int nop; + + /* override vars */ + int errval; + PyObject *override = NULL; + int res = -1; /* start with fail condition so "goto fail" will error */ + + PyArrayMethod_StridedLoop *strided_loop; + NpyAuxData *auxdata = NULL; + + if (ufunc->core_enabled) { + PyErr_Format(PyExc_TypeError, + "%s.at does not support ufunc with non-trivial signature: %s has signature %s.", + ufunc->name, ufunc->name, ufunc->core_signature); + return NULL; + } + + if (ufunc->nin > 2) { + PyErr_SetString(PyExc_ValueError, + "Only unary and binary ufuncs supported at this time"); + return NULL; + } + + if (ufunc->nout != 1) { + PyErr_SetString(PyExc_ValueError, + "Only single output ufuncs supported at this time"); + return NULL; + } + + if (!PyArg_ParseTuple(args, "OO|O:at", &op1, &idx, &op2)) { + return NULL; + } + + if (ufunc->nin == 2 && op2 == NULL) { + PyErr_SetString(PyExc_ValueError, + "second operand needed for ufunc"); + return NULL; + } + + if (ufunc->nin == 1 && op2 != NULL) { + PyErr_SetString(PyExc_ValueError, + "second operand provided when ufunc is unary"); + return NULL; + } + errval = PyUFunc_CheckOverride(ufunc, "at", + args, NULL, NULL, NULL, 0, NULL, &override); + + if (errval) { + return NULL; + } + else if (override) { + return override; + } + + if (!PyArray_Check(op1)) { + PyErr_SetString(PyExc_TypeError, + "first operand must be array"); + return NULL; + } + + op1_array = (PyArrayObject *)op1; + + /* Create second operand from number array if needed. */ + if (op2 == NULL) { + nop = 2; + } + else { + nop = 3; + op2_array = (PyArrayObject *)PyArray_FromAny(op2, NULL, + 0, 0, 0, NULL); + if (op2_array == NULL) { + goto fail; + } + } + + PyArrayMethodObject *ufuncimpl = NULL; + { + /* Do all the dtype handling and find the correct ufuncimpl */ + + PyArrayObject *tmp_operands[3] = {NULL, NULL, NULL}; + PyArray_DTypeMeta *signature[3] = {NULL, NULL, NULL}; + PyArray_DTypeMeta *operand_DTypes[3] = {NULL, NULL, NULL}; + /* + * Create dtypes array for either one or two input operands. + * Compare to the logic in `convert_ufunc_arguments`. + * TODO: It may be good to review some of this behaviour, since the + * operand array is special (it is written to) similar to reductions. + * Using unsafe-casting as done here, is likely not desirable. 
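+ * The descriptors resolved here also decide further below whether the
+ * unbuffered fast paths can be taken at all: those require the operand
+ * descriptors to match operation_descrs exactly and the operands to be
+ * aligned.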
+ */ + tmp_operands[0] = op1_array; + operand_DTypes[0] = NPY_DTYPE(PyArray_DESCR(op1_array)); + Py_INCREF(operand_DTypes[0]); + int force_legacy_promotion = 0; + int allow_legacy_promotion = NPY_DT_is_legacy(operand_DTypes[0]); + + if (op2_array != NULL) { + tmp_operands[1] = op2_array; + operand_DTypes[1] = NPY_DTYPE(PyArray_DESCR(op2_array)); + Py_INCREF(operand_DTypes[1]); + allow_legacy_promotion &= NPY_DT_is_legacy(operand_DTypes[1]); + tmp_operands[2] = tmp_operands[0]; + operand_DTypes[2] = operand_DTypes[0]; + Py_INCREF(operand_DTypes[2]); + + if (allow_legacy_promotion && ((PyArray_NDIM(op1_array) == 0) + != (PyArray_NDIM(op2_array) == 0))) { + /* both are legacy and only one is 0-D: force legacy */ + force_legacy_promotion = should_use_min_scalar(2, tmp_operands, 0, NULL); + } + } + else { + tmp_operands[1] = tmp_operands[0]; + operand_DTypes[1] = operand_DTypes[0]; + Py_INCREF(operand_DTypes[1]); + tmp_operands[2] = NULL; + } + + ufuncimpl = promote_and_get_ufuncimpl(ufunc, tmp_operands, signature, + operand_DTypes, force_legacy_promotion, + allow_legacy_promotion, NPY_FALSE, NPY_FALSE); + if (ufuncimpl == NULL) { + for (int i = 0; i < 3; i++) { + Py_XDECREF(signature[i]); + Py_XDECREF(operand_DTypes[i]); + } + goto fail; + } + + /* Find the correct operation_descrs for the operation */ + int resolve_result = resolve_descriptors(nop, ufunc, ufuncimpl, + tmp_operands, operation_descrs, signature, NPY_UNSAFE_CASTING); + for (int i = 0; i < 3; i++) { + Py_XDECREF(signature[i]); + Py_XDECREF(operand_DTypes[i]); + } + if (resolve_result < 0) { + goto fail; + } + } + + iter = (PyArrayMapIterObject *)PyArray_MapIterArrayCopyIfOverlap( + op1_array, idx, 1, op2_array); + if (iter == NULL) { + goto fail; + } + op1_array = iter->array; /* May be updateifcopied on overlap */ + + if (op2_array != NULL) { + /* + * May need to swap axes so that second operand is + * iterated over correctly + */ + if ((iter->subspace != NULL) && (iter->consec)) { + PyArray_MapIterSwapAxes(iter, &op2_array, 0); + if (op2_array == NULL) { + /* only on memory allocation failure */ + goto fail; + } + } + + /* + * Create array iter object for second operand that + * "matches" the map iter object for the first operand. + * Then we can just iterate over the first and second + * operands at the same time and not have to worry about + * picking the correct elements from each operand to apply + * the ufunc to. 
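+ * PyArray_BroadcastToShape below gives op2 the iteration shape of the
+ * map iterator (iter->dimensions, iter->nd), so the two iterators can
+ * simply be advanced in lock-step.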
+ */ + if ((iter2 = (PyArrayIterObject *)\ + PyArray_BroadcastToShape((PyObject *)op2_array, + iter->dimensions, iter->nd))==NULL) { + goto fail; + } + } + + PyArrayMethod_Context context = { + .caller = (PyObject *)ufunc, + .method = ufuncimpl, + .descriptors = operation_descrs, + }; + + /* Use contiguous strides; if there is such a loop it may be faster */ + npy_intp strides[3] = { + operation_descrs[0]->elsize, operation_descrs[1]->elsize, 0}; + if (nop == 3) { + strides[2] = operation_descrs[2]->elsize; + } + + NPY_ARRAYMETHOD_FLAGS flags; + if (ufuncimpl->get_strided_loop(&context, 1, 0, strides, + &strided_loop, &auxdata, &flags) < 0) { + goto fail; + } + int fast_path = 1; + /* check no casting, alignment */ + if (PyArray_DESCR(op1_array) != operation_descrs[0]) { + fast_path = 0; + } + if (PyArray_DESCR(op1_array) != operation_descrs[nop - 1]) { + /* output casting */ + fast_path = 0; + } + if (!PyArray_ISALIGNED(op1_array)) { + fast_path = 0; + } + if (nop >2) { + if (PyArray_DESCR(op2_array) != operation_descrs[1]) { + fast_path = 0; + } + if (!PyArray_ISALIGNED(op2_array)) { + fast_path = 0; + } + } + if (fast_path == 1) { + /* + * Try to use trivial loop (1d, no casting, aligned) if + * - the matching info has a indexed loop + * - idx must be exactly one integer index array + * - all operands are 1d + * A future enhancement could loosen the restriction on 1d operands + * by adding an iteration loop inside trivial_at_loop + */ + if ((ufuncimpl->contiguous_indexed_loop != NULL) && + (PyArray_NDIM(op1_array) == 1) && + (op2_array == NULL || PyArray_NDIM(op2_array) <= 1) && + (iter->subspace_iter == NULL) && (iter->numiter == 1)) { + res = trivial_at_loop(ufuncimpl, flags, iter, op1_array, + op2_array, &context); + + } + else { + /* Couldn't use the fastest path, use the faster path */ + res = ufunc_at__fast_iter(ufunc, flags, iter, iter2, op1_array, + op2_array, strided_loop, &context, auxdata); + } + } else { + res = ufunc_at__slow_iter(ufunc, flags, iter, iter2, op1_array, + op2_array, operation_descrs, strided_loop, &context, + auxdata); + } + +fail: NPY_AUXDATA_FREE(auxdata); - NpyIter_Deallocate(iter_buffer); Py_XDECREF(op2_array); Py_XDECREF(iter); Py_XDECREF(iter2); for (int i = 0; i < nop; i++) { - Py_DECREF(signature[i]); - Py_XDECREF(operand_DTypes[i]); Py_XDECREF(operation_descrs[i]); - Py_XDECREF(array_operands[i]); } /* @@ -6296,29 +6476,406 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args) * (e.g. `power` released the GIL but manually set an Exception). 
*/ if (res != 0 || PyErr_Occurred()) { + /* iter_buffer has already been deallocated, don't use NpyIter_Dealloc */ + if (PyArray_FLAGS(op1_array) & NPY_ARRAY_WRITEBACKIFCOPY) { + PyArray_DiscardWritebackIfCopy(op1_array); + } return NULL; } else { Py_RETURN_NONE; } +} -fail: - /* iter_buffer has already been deallocated, don't use NpyIter_Dealloc */ - if (op1_array != (PyArrayObject*)op1) { - PyArray_DiscardWritebackIfCopy(op1_array); + +typedef struct { + PyArrayMethod_StridedLoop *strided_loop; + PyArrayMethod_Context *context; + NpyAuxData *auxdata; + /* Should move to flags, but lets keep it bools for now: */ + npy_bool requires_pyapi; + npy_bool no_floatingpoint_errors; + PyArrayMethod_Context _full_context; + PyArray_Descr *_descrs[]; +} ufunc_call_info; + + +void +free_ufunc_call_info(PyObject *self) +{ + ufunc_call_info *call_info = PyCapsule_GetPointer( + self, "numpy_1.24_ufunc_call_info"); + + PyArrayMethod_Context *context = call_info->context; + + int nargs = context->method->nin + context->method->nout; + for (int i = 0; i < nargs; i++) { + Py_DECREF(context->descriptors[i]); } - Py_XDECREF(op2_array); - Py_XDECREF(iter); - Py_XDECREF(iter2); - for (int i = 0; i < 3; i++) { + Py_DECREF(context->caller); + Py_DECREF(context->method); + NPY_AUXDATA_FREE(call_info->auxdata); + + PyObject_Free(call_info); +} + + +/* + * Python entry-point to ufunc promotion and dtype/descr resolution. + * + * This function does most of the work required to execute ufunc without + * actually executing it. + * This can be very useful for downstream libraries that reimplement NumPy + * functionality, such as Numba or Dask. + */ +static PyObject * +py_resolve_dtypes_generic(PyUFuncObject *ufunc, npy_bool return_context, + PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames) +{ + NPY_PREPARE_ARGPARSER; + + PyObject *descrs_tuple; + PyObject *signature_obj = NULL; + NPY_CASTING casting = NPY_DEFAULT_ASSIGN_CASTING; + npy_bool reduction = NPY_FALSE; + + if (npy_parse_arguments("resolve_dtypes", args, len_args, kwnames, + "", NULL, &descrs_tuple, + "$signature", NULL, &signature_obj, + "$casting", &PyArray_CastingConverter, &casting, + "$reduction", &PyArray_BoolConverter, &reduction, + NULL, NULL, NULL) < 0) { + return NULL; + } + + if (reduction && (ufunc->nin != 2 || ufunc->nout != 1)) { + PyErr_SetString(PyExc_ValueError, + "ufunc is not compatible with reduction operations."); + return NULL; + } + + /* + * Legacy type resolvers expect NumPy arrays as input. Until NEP 50 is + * adopted, it is most convenient to ensure that we have an "array" object + * before calling the type promotion. Eventually, this hack may be moved + * into the legacy type resolution code itself (probably after NumPy stops + * using legacy type resolution itself for the most part). + * + * We make the pretty safe assumptions here that: + * - Nobody will actually do anything with the array objects besides + * checking the descriptor or calling CanCast. + * - No type resolver will cause weird paths that mess with our promotion + * state (or mind us messing with it). 
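The `resolve_dtypes` entry point introduced above is the Python face of this promotion-plus-descriptor step. A minimal usage sketch (NumPy 1.24 or later; the printed dtypes are what standard promotion is expected to produce):

    import numpy as np

    # Ask which loop np.add would pick, without executing anything.
    dts = np.add.resolve_dtypes((np.dtype("int32"), np.dtype("float64"), None))
    print(dts)   # expected: (dtype('float64'), dtype('float64'), dtype('float64'))
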
+ */ + PyObject *result = NULL; + PyObject *result_dtype_tuple = NULL; + + PyArrayObject *dummy_arrays[NPY_MAXARGS] = {NULL}; + PyArray_DTypeMeta *DTypes[NPY_MAXARGS] = {NULL}; + PyArray_DTypeMeta *signature[NPY_MAXARGS] = {NULL}; + PyArray_Descr *operation_descrs[NPY_MAXARGS] = {NULL}; + + /* This entry-point to promotion lives in the NEP 50 future: */ + int original_promotion_state = npy_promotion_state; + npy_promotion_state = NPY_USE_WEAK_PROMOTION; + + npy_bool promoting_pyscalars = NPY_FALSE; + npy_bool allow_legacy_promotion = NPY_TRUE; + + if (_get_fixed_signature(ufunc, NULL, signature_obj, signature) < 0) { + goto finish; + } + + if (!PyTuple_CheckExact(descrs_tuple) + || PyTuple_Size(descrs_tuple) != ufunc->nargs) { + PyErr_SetString(PyExc_TypeError, + "resolve_dtypes: The dtypes must be a tuple of " + "`ufunc.nargs` length."); + goto finish; + } + for (int i=0; i < ufunc->nargs; i++) { + /* + * We create dummy arrays for now. It should be OK to make this + * truly "dummy" (not even proper objects), but that is a hack better + * left for the legacy_type_resolution wrapper when NEP 50 is done. + */ + PyObject *descr_obj = PyTuple_GET_ITEM(descrs_tuple, i); + PyArray_Descr *descr; + + if (PyArray_DescrCheck(descr_obj)) { + descr = (PyArray_Descr *)descr_obj; + Py_INCREF(descr); + dummy_arrays[i] = (PyArrayObject *)PyArray_NewFromDescr_int( + &PyArray_Type, descr, 0, NULL, NULL, NULL, + 0, NULL, NULL, _NPY_ARRAY_ENSURE_DTYPE_IDENTITY); + if (dummy_arrays[i] == NULL) { + goto finish; + } + DTypes[i] = NPY_DTYPE(descr); + Py_INCREF(DTypes[i]); + if (!NPY_DT_is_legacy(DTypes[i])) { + allow_legacy_promotion = NPY_FALSE; + } + } + /* Explicitly allow int, float, and complex for the "weak" types. */ + else if (descr_obj == (PyObject *)&PyLong_Type) { + descr = PyArray_DescrFromType(NPY_LONG); + dummy_arrays[i] = (PyArrayObject *)PyArray_Empty(0, NULL, descr, 0); + if (dummy_arrays[i] == NULL) { + goto finish; + } + PyArray_ENABLEFLAGS(dummy_arrays[i], NPY_ARRAY_WAS_PYTHON_INT); + Py_INCREF(&PyArray_PyIntAbstractDType); + DTypes[i] = &PyArray_PyIntAbstractDType; + promoting_pyscalars = NPY_TRUE; + } + else if (descr_obj == (PyObject *)&PyFloat_Type) { + descr = PyArray_DescrFromType(NPY_DOUBLE); + dummy_arrays[i] = (PyArrayObject *)PyArray_Empty(0, NULL, descr, 0); + if (dummy_arrays[i] == NULL) { + goto finish; + } + PyArray_ENABLEFLAGS(dummy_arrays[i], NPY_ARRAY_WAS_PYTHON_FLOAT); + Py_INCREF(&PyArray_PyFloatAbstractDType); + DTypes[i] = &PyArray_PyFloatAbstractDType; + promoting_pyscalars = NPY_TRUE; + } + else if (descr_obj == (PyObject *)&PyComplex_Type) { + descr = PyArray_DescrFromType(NPY_CDOUBLE); + dummy_arrays[i] = (PyArrayObject *)PyArray_Empty(0, NULL, descr, 0); + if (dummy_arrays[i] == NULL) { + goto finish; + } + PyArray_ENABLEFLAGS(dummy_arrays[i], NPY_ARRAY_WAS_PYTHON_COMPLEX); + Py_INCREF(&PyArray_PyComplexAbstractDType); + DTypes[i] = &PyArray_PyComplexAbstractDType; + promoting_pyscalars = NPY_TRUE; + } + else if (descr_obj == Py_None) { + if (i < ufunc->nin && !(reduction && i == 0)) { + PyErr_SetString(PyExc_TypeError, + "All input dtypes must be provided " + "(except the first one in reductions)"); + goto finish; + } + } + else { + PyErr_SetString(PyExc_TypeError, + "Provided dtype must be a valid NumPy dtype, " + "int, float, complex, or None."); + goto finish; + } + } + + PyArrayMethodObject *ufuncimpl; + if (!reduction) { + ufuncimpl = promote_and_get_ufuncimpl(ufunc, + dummy_arrays, signature, DTypes, NPY_FALSE, + allow_legacy_promotion, promoting_pyscalars, 
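As the branches above show, the Python types `int`, `float`, and `complex` may be passed in place of a dtype to mark that argument as a "weak" Python scalar under the NEP 50 rules this entry point always uses. A sketch of the difference, assuming NEP 50 weak-promotion semantics:

    import numpy as np

    # A Python float does not upcast float32 ...
    print(np.add.resolve_dtypes((np.dtype("float32"), float, None)))
    # expected: (dtype('float32'), dtype('float32'), dtype('float32'))

    # ... while a concrete float64 dtype promotes as usual.
    print(np.add.resolve_dtypes((np.dtype("float32"), np.dtype("float64"), None)))
    # expected: (dtype('float64'), dtype('float64'), dtype('float64'))
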
NPY_FALSE); + if (ufuncimpl == NULL) { + goto finish; + } + + /* Find the correct descriptors for the operation */ + if (resolve_descriptors(ufunc->nargs, ufunc, ufuncimpl, + dummy_arrays, operation_descrs, signature, casting) < 0) { + goto finish; + } + + if (validate_casting( + ufuncimpl, ufunc, dummy_arrays, operation_descrs, casting) < 0) { + goto finish; + } + } + else { /* reduction */ + if (signature[2] != NULL) { + PyErr_SetString(PyExc_ValueError, + "Reduction signature must end with None, instead pass " + "the first DType in the signature."); + goto finish; + } + + if (dummy_arrays[2] != NULL) { + PyErr_SetString(PyExc_TypeError, + "Output dtype must not be passed for reductions, " + "pass the first input instead."); + goto finish; + } + + ufuncimpl = reducelike_promote_and_resolve(ufunc, + dummy_arrays[1], dummy_arrays[0], signature, NPY_FALSE, + operation_descrs, casting, "resolve_dtypes"); + + if (ufuncimpl == NULL) { + goto finish; + } + } + + result = PyArray_TupleFromItems( + ufunc->nargs, (PyObject **)operation_descrs, 0); + + if (result == NULL || !return_context) { + goto finish; + } + /* Result will be (dtype_tuple, call_info), so move it and clear result */ + result_dtype_tuple = result; + result = NULL; + + /* We may have to return the context: */ + ufunc_call_info *call_info; + call_info = PyObject_Malloc(sizeof(ufunc_call_info) + + ufunc->nargs * sizeof(PyArray_Descr *)); + if (call_info == NULL) { + PyErr_NoMemory(); + goto finish; + } + call_info->strided_loop = NULL; + call_info->auxdata = NULL; + call_info->context = &call_info->_full_context; + + /* + * We create a capsule with NumPy 1.24 in the name to signal that it is + * prone to change in version updates (it doesn't have to). + * This capsule is documented in the `ufunc._resolve_dtypes_and_context` + * docstring. 
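In the `reduction=True` branch above, only the second entry of the dtype tuple (the array being reduced) must be a real dtype; the first entry may be `None` (it optionally pins the output/accumulator dtype) and the last entry has to stay `None`. A hedged sketch; the resolved descriptors depend on which reduction loops the build provides:

    import numpy as np

    # Resolve np.add.reduce over float32 without running the reduction;
    # typically this yields (float32, float32, float32) on default builds.
    print(np.add.resolve_dtypes((None, np.dtype("float32"), None), reduction=True))
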
+ */ + PyObject *capsule = PyCapsule_New( + call_info, "numpy_1.24_ufunc_call_info", &free_ufunc_call_info); + if (capsule == NULL) { + PyObject_Free(call_info); + goto finish; + } + + PyArrayMethod_Context *context = call_info->context; + + Py_INCREF(ufunc); + context->caller = (PyObject *)ufunc; + Py_INCREF(ufuncimpl); + context->method = ufuncimpl; + context->descriptors = call_info->_descrs; + for (int i=0; i < ufunc->nargs; i++) { + Py_INCREF(operation_descrs[i]); + context->descriptors[i] = operation_descrs[i]; + } + + result = PyTuple_Pack(2, result_dtype_tuple, capsule); + /* cleanup and return */ + Py_DECREF(capsule); + + finish: + npy_promotion_state = original_promotion_state; + + Py_XDECREF(result_dtype_tuple); + for (int i = 0; i < ufunc->nargs; i++) { Py_XDECREF(signature[i]); - Py_XDECREF(operand_DTypes[i]); + Py_XDECREF(dummy_arrays[i]); Py_XDECREF(operation_descrs[i]); - Py_XDECREF(array_operands[i]); + Py_XDECREF(DTypes[i]); } - NPY_AUXDATA_FREE(auxdata); - return NULL; + return result; +} + + +static PyObject * +py_resolve_dtypes(PyUFuncObject *ufunc, + PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames) +{ + return py_resolve_dtypes_generic(ufunc, NPY_FALSE, args, len_args, kwnames); +} + + +static PyObject * +py_resolve_dtypes_and_context(PyUFuncObject *ufunc, + PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames) +{ + return py_resolve_dtypes_generic(ufunc, NPY_TRUE, args, len_args, kwnames); +} + + +static PyObject * +py_get_strided_loop(PyUFuncObject *ufunc, + PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames) +{ + NPY_PREPARE_ARGPARSER; + + PyObject *call_info_obj; + PyObject *fixed_strides_obj = Py_None; + npy_intp fixed_strides[NPY_MAXARGS]; + + if (npy_parse_arguments("_get_strided_loop", args, len_args, kwnames, + "", NULL, &call_info_obj, + "$fixed_strides", NULL, &fixed_strides_obj, + NULL, NULL, NULL) < 0) { + return NULL; + } + + ufunc_call_info *call_info = PyCapsule_GetPointer( + call_info_obj, "numpy_1.24_ufunc_call_info"); + if (call_info == NULL) { + /* Cannot have a context with NULL inside... */ + assert(PyErr_Occurred()); + return NULL; + } + if (call_info->strided_loop != NULL) { + PyErr_SetString(PyExc_TypeError, + "ufunc call_info has already been filled/used!"); + return NULL; + } + + if (call_info->context->caller != (PyObject *)ufunc) { + PyErr_SetString(PyExc_TypeError, + "calling get_strided_loop with incompatible context"); + return NULL; + } + + /* + * Strict conversion of fixed_strides, None, or tuple of int or None. 
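The capsule built above is what `_resolve_dtypes_and_context` returns next to the dtype tuple, and `_get_strided_loop` later fills in the inner-loop pointer; its `fixed_strides` argument (parsed just below) maps `None` entries to the internal "unknown stride" value `NPY_MAX_INTP`, while integers request a loop specialized for those byte strides. A sketch of the intended round trip; these underscored methods are deliberately low-level and may change between releases:

    import numpy as np

    dts = (np.dtype("float64"),) * 3
    loop_dtypes, call_info = np.add._resolve_dtypes_and_context(dts)

    # Attach a strided loop to the capsule; one byte stride per operand,
    # with None meaning "not known in advance".
    np.add._get_strided_loop(call_info, fixed_strides=(8, 8, None))

    # The capsule is single-use: a second _get_strided_loop call on it is
    # expected to raise TypeError ("already been filled/used").
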
+ */ + if (fixed_strides_obj == Py_None) { + for (int i = 0; i < ufunc->nargs; i++) { + fixed_strides[i] = NPY_MAX_INTP; + } + } + else if (PyTuple_CheckExact(fixed_strides_obj) + && PyTuple_Size(fixed_strides_obj) == ufunc->nargs) { + for (int i = 0; i < ufunc->nargs; i++) { + PyObject *stride = PyTuple_GET_ITEM(fixed_strides_obj, i); + if (PyLong_CheckExact(stride)) { + fixed_strides[i] = PyLong_AsSsize_t(stride); + if (error_converting(fixed_strides[i])) { + return NULL; + } + } + else if (stride == Py_None) { + fixed_strides[i] = NPY_MAX_INTP; + } + else { + PyErr_SetString(PyExc_TypeError, + "_get_strided_loop(): fixed_strides tuple must contain " + "Python ints or None"); + return NULL; + } + } + } + else { + PyErr_SetString(PyExc_TypeError, + "_get_strided_loop(): fixed_strides must be a tuple or None"); + return NULL; + } + + NPY_ARRAYMETHOD_FLAGS flags; + if (call_info->context->method->get_strided_loop(call_info->context, + 1, 0, fixed_strides, &call_info->strided_loop, &call_info->auxdata, + &flags) < 0) { + return NULL; + } + + call_info->requires_pyapi = flags & NPY_METH_REQUIRES_PYAPI; + call_info->no_floatingpoint_errors = ( + flags & NPY_METH_NO_FLOATINGPOINT_ERRORS); + + Py_RETURN_NONE; } @@ -6338,6 +6895,21 @@ static struct PyMethodDef ufunc_methods[] = { {"at", (PyCFunction)ufunc_at, METH_VARARGS, NULL}, + /* Lower level methods: */ + {"resolve_dtypes", + (PyCFunction)py_resolve_dtypes, + METH_FASTCALL | METH_KEYWORDS, NULL}, + /* + * The following two functions are public API, but underscored since they + * are C-user specific and allow direct access to the core of ufunc loops. + * (See their documentation for API stability.) + */ + {"_resolve_dtypes_and_context", + (PyCFunction)py_resolve_dtypes_and_context, + METH_FASTCALL | METH_KEYWORDS, NULL}, + {"_get_strided_loop", + (PyCFunction)py_get_strided_loop, + METH_FASTCALL | METH_KEYWORDS, NULL}, {NULL, NULL, 0, NULL} /* sentinel */ }; @@ -6459,7 +7031,7 @@ static PyObject * ufunc_get_identity(PyUFuncObject *ufunc, void *NPY_UNUSED(ignored)) { npy_bool reorderable; - return _get_identity(ufunc, &reorderable); + return PyUFunc_GetDefaultIdentity(ufunc, &reorderable); } static PyObject * @@ -6518,6 +7090,7 @@ NPY_NO_EXPORT PyTypeObject PyUFunc_Type = { .tp_name = "numpy.ufunc", .tp_basicsize = sizeof(PyUFuncObject), .tp_dealloc = (destructor)ufunc_dealloc, + .tp_vectorcall_offset = offsetof(PyUFuncObject, vectorcall), .tp_repr = (reprfunc)ufunc_repr, .tp_call = &PyVectorcall_Call, .tp_str = (reprfunc)ufunc_repr, @@ -6527,7 +7100,6 @@ NPY_NO_EXPORT PyTypeObject PyUFunc_Type = { .tp_traverse = (traverseproc)ufunc_traverse, .tp_methods = ufunc_methods, .tp_getset = ufunc_getset, - .tp_vectorcall_offset = offsetof(PyUFuncObject, vectorcall), }; /* End of code for ufunc objects */ diff --git a/numpy/core/src/umath/ufunc_object.h b/numpy/core/src/umath/ufunc_object.h index 32af6c58e..1b80bb2df 100644 --- a/numpy/core/src/umath/ufunc_object.h +++ b/numpy/core/src/umath/ufunc_object.h @@ -4,14 +4,17 @@ #include <numpy/ufuncobject.h> NPY_NO_EXPORT PyObject * -ufunc_geterr(PyObject *NPY_UNUSED(dummy), PyObject *args); +ufunc_geterr(PyObject *NPY_UNUSED(dummy), PyObject *NPY_UNUSED(arg)); NPY_NO_EXPORT PyObject * -ufunc_seterr(PyObject *NPY_UNUSED(dummy), PyObject *args); +ufunc_seterr(PyObject *NPY_UNUSED(dummy), PyObject *arg); NPY_NO_EXPORT const char* ufunc_get_name_cstr(PyUFuncObject *ufunc); +NPY_NO_EXPORT PyObject * +PyUFunc_GetDefaultIdentity(PyUFuncObject *ufunc, npy_bool *reorderable); + /* strings from umathmodule.c that 
are interned on umath import */ NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_array_ufunc; NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_array_prepare; diff --git a/numpy/core/src/umath/ufunc_type_resolution.c b/numpy/core/src/umath/ufunc_type_resolution.c index 94338e031..decd26580 100644 --- a/numpy/core/src/umath/ufunc_type_resolution.c +++ b/numpy/core/src/umath/ufunc_type_resolution.c @@ -358,20 +358,51 @@ PyUFunc_SimpleBinaryComparisonTypeResolver(PyUFuncObject *ufunc, } if (type_tup == NULL) { + if (PyArray_ISDATETIME(operands[0]) + && PyArray_ISDATETIME(operands[1]) + && type_num1 != type_num2) { + /* + * Reject mixed datetime and timedelta explicitly, this was always + * implicitly rejected because casting fails (except with + * `casting="unsafe"` admittedly). + * This is required to ensure that `==` and `!=` can correctly + * detect that they should return a result array of False/True. + */ + return raise_binary_type_reso_error(ufunc, operands); + } /* - * DEPRECATED NumPy 1.20, 2020-12. - * This check is required to avoid the FutureWarning that - * ResultType will give for number->string promotions. + * This check is required to avoid a potential FutureWarning that + * ResultType would give for number->string promotions. * (We never supported flexible dtypes here.) */ - if (!PyArray_ISFLEXIBLE(operands[0]) && + else if (!PyArray_ISFLEXIBLE(operands[0]) && !PyArray_ISFLEXIBLE(operands[1])) { out_dtypes[0] = PyArray_ResultType(2, operands, 0, NULL); if (out_dtypes[0] == NULL) { return -1; } - out_dtypes[1] = out_dtypes[0]; - Py_INCREF(out_dtypes[1]); + if (PyArray_ISINTEGER(operands[0]) + && PyArray_ISINTEGER(operands[1]) + && !PyDataType_ISINTEGER(out_dtypes[0])) { + /* + * NumPy promotion allows uint+int to go to float, avoid it + * (input must have been a mix of signed and unsigned) + */ + if (PyArray_ISSIGNED(operands[0])) { + Py_SETREF(out_dtypes[0], PyArray_DescrFromType(NPY_LONGLONG)); + out_dtypes[1] = PyArray_DescrFromType(NPY_ULONGLONG); + Py_INCREF(out_dtypes[1]); + } + else { + Py_SETREF(out_dtypes[0], PyArray_DescrFromType(NPY_ULONGLONG)); + out_dtypes[1] = PyArray_DescrFromType(NPY_LONGLONG); + Py_INCREF(out_dtypes[1]); + } + } + else { + out_dtypes[1] = out_dtypes[0]; + Py_INCREF(out_dtypes[1]); + } } else { /* Not doing anything will lead to a loop no found error. */ @@ -382,64 +413,13 @@ PyUFunc_SimpleBinaryComparisonTypeResolver(PyUFuncObject *ufunc, } } else { - PyArray_Descr *descr; - /* - * DEPRECATED 2021-03, NumPy 1.20 - * - * If the type tuple was originally a single element (probably), - * issue a deprecation warning, but otherwise accept it. Since the - * result dtype is always boolean, this is not actually valid unless it - * is `object` (but if there is an object input we already deferred). - * - * TODO: Once this deprecation is gone, the special case for - * `PyUFunc_SimpleBinaryComparisonTypeResolver` in dispatching.c - * can be removed. 
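Two user-visible changes are encoded in the comparison resolver above: mixed datetime64/timedelta64 comparisons are rejected instead of being forced through a cast, and mixed signed/unsigned integer comparisons stay in integer space (long long paired with unsigned long long) rather than drifting to float64. A sketch of the expected effect, assuming a build that ships the matching mixed-integer comparison loops:

    import numpy as np

    a = np.array([2**63 - 1], dtype=np.int64)
    b = np.array([2**63], dtype=np.uint64)
    # Exact comparison; a float64 round trip would wrongly report equality.
    print(np.equal(a, b))        # expected: [False]

    # Ordered comparisons across datetime64/timedelta64 no longer resolve.
    try:
        np.less(np.datetime64("2020-01-01"), np.timedelta64(1, "D"))
    except TypeError as exc:
        print("rejected:", exc)
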
-         */
-        if (PyTuple_Check(type_tup) && PyTuple_GET_SIZE(type_tup) == 3 &&
-                PyTuple_GET_ITEM(type_tup, 0) == Py_None &&
-                PyTuple_GET_ITEM(type_tup, 1) == Py_None &&
-                PyArray_DescrCheck(PyTuple_GET_ITEM(type_tup, 2))) {
-            descr = (PyArray_Descr *)PyTuple_GET_ITEM(type_tup, 2);
-            if (descr->type_num == NPY_OBJECT) {
-                if (DEPRECATE_FUTUREWARNING(
-                        "using `dtype=object` (or equivalent signature) will "
-                        "return object arrays in the future also when the "
-                        "inputs do not already have `object` dtype.") < 0) {
-                    return -1;
-                }
-            }
-            else if (descr->type_num != NPY_BOOL) {
-                if (DEPRECATE(
-                        "using `dtype=` in comparisons is only useful for "
-                        "`dtype=object` (and will do nothing for bool). "
-                        "This operation will fail in the future.") < 0) {
-                    return -1;
-                }
-            }
-        }
-        else {
-            /* Usually a failure, but let the default version handle it */
-            return PyUFunc_DefaultTypeResolver(ufunc, casting,
-                    operands, type_tup, out_dtypes);
-        }
-
-        out_dtypes[0] = NPY_DT_CALL_ensure_canonical(descr);
-        if (out_dtypes[0] == NULL) {
-            return -1;
-        }
-        out_dtypes[1] = out_dtypes[0];
-        Py_INCREF(out_dtypes[1]);
+        /* Usually a failure, but let the default version handle it */
+        return PyUFunc_DefaultTypeResolver(ufunc, casting,
+                operands, type_tup, out_dtypes);
     }

-    /* Output type is always boolean */
+    /* Output type is always boolean (cannot fail for builtins) */
     out_dtypes[2] = PyArray_DescrFromType(NPY_BOOL);
-    if (out_dtypes[2] == NULL) {
-        for (i = 0; i < 2; ++i) {
-            Py_DECREF(out_dtypes[i]);
-            out_dtypes[i] = NULL;
-        }
-        return -1;
-    }

     /* Check against the casting rules */
     if (PyUFunc_ValidateCasting(ufunc, casting, operands, out_dtypes) < 0) {
@@ -694,7 +674,7 @@ PyUFunc_IsNaTTypeResolver(PyUFuncObject *ufunc,
 {
     if (!PyTypeNum_ISDATETIME(PyArray_DESCR(operands[0])->type_num)) {
         PyErr_SetString(PyExc_TypeError,
-                "ufunc 'isnat' is only defined for datetime and timedelta.");
+                "ufunc 'isnat' is only defined for np.datetime64 and np.timedelta64.");
         return -1;
     }
diff --git a/numpy/core/src/umath/wrapping_array_method.c b/numpy/core/src/umath/wrapping_array_method.c
index 9f8f036e8..64fea7aeb 100644
--- a/numpy/core/src/umath/wrapping_array_method.c
+++ b/numpy/core/src/umath/wrapping_array_method.c
@@ -177,6 +177,39 @@ wrapping_method_get_loop(
 }


+/*
+ * Wraps the original identity function, needs to translate the descriptors
+ * back to the original ones and provide an "original" context (identically to
+ * `get_loop`).
+ * We assume again that translating the descriptors is quick.
+ */
+static int
+wrapping_method_get_identity_function(
+        PyArrayMethod_Context *context, npy_bool reduction_is_empty,
+        char *item)
+{
+    /* Copy the context, and replace descriptors: */
+    PyArrayMethod_Context orig_context = *context;
+    PyArray_Descr *orig_descrs[NPY_MAXARGS];
+    orig_context.descriptors = orig_descrs;
+    orig_context.method = context->method->wrapped_meth;
+
+    int nin = context->method->nin, nout = context->method->nout;
+    PyArray_DTypeMeta **dtypes = context->method->wrapped_dtypes;
+
+    if (context->method->translate_given_descrs(
+            nin, nout, dtypes, context->descriptors, orig_descrs) < 0) {
+        return -1;
+    }
+    int res = context->method->wrapped_meth->get_reduction_initial(
+            &orig_context, reduction_is_empty, item);
+    for (int i = 0; i < nin + nout; i++) {
+        Py_DECREF(orig_descrs[i]);
+    }
+    return res;
+}
+
+
 /**
  * Allows creating of a fairly lightweight wrapper around an existing ufunc
  * loop.
The idea is mainly for units, as this is currently slightly limited @@ -235,14 +268,17 @@ PyUFunc_AddWrappingLoop(PyObject *ufunc_obj, break; } if (wrapped_meth == NULL) { - PyErr_SetString(PyExc_TypeError, - "Did not find the to-be-wrapped loop in the ufunc."); + PyErr_Format(PyExc_TypeError, + "Did not find the to-be-wrapped loop in the ufunc with given " + "DTypes. Received wrapping types: %S", wrapped_dt_tuple); goto finish; } PyType_Slot slots[] = { {NPY_METH_resolve_descriptors, &wrapping_method_resolve_descriptors}, - {NPY_METH_get_loop, &wrapping_method_get_loop}, + {_NPY_METH_get_loop, &wrapping_method_get_loop}, + {NPY_METH_get_reduction_initial, + &wrapping_method_get_identity_function}, {0, NULL} }; |
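The `NPY_METH_get_reduction_initial` slot registered above is what lets a wrapping loop (for example a unit-aware one) report the same reduction identity as the loop it wraps. Its Python-visible counterpart is the value empty reductions fall back to; a small sketch using plain builtin loops rather than the wrapping machinery itself:

    import numpy as np

    print(np.add.reduce(np.array([], dtype=np.float64)))        # 0.0
    print(np.multiply.reduce(np.array([], dtype=np.float64)))   # 1.0

    # Loops without a usable initial value need one supplied explicitly.
    print(np.minimum.reduce(np.array([], dtype=np.float64), initial=np.inf))  # inf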