Diffstat (limited to 'numpy/core/src/umath')
-rw-r--r--  numpy/core/src/umath/_rational_tests.c | 62
-rw-r--r--  numpy/core/src/umath/_scaled_float_dtype.c | 128
-rw-r--r--  numpy/core/src/umath/_umath_tests.c.src | 97
-rw-r--r--  numpy/core/src/umath/dispatching.c | 92
-rw-r--r--  numpy/core/src/umath/fast_loop_macros.h | 32
-rw-r--r--  numpy/core/src/umath/legacy_array_method.c | 125
-rw-r--r--  numpy/core/src/umath/loops.c.src | 613
-rw-r--r--  numpy/core/src/umath/loops.h.src | 321
-rw-r--r--  numpy/core/src/umath/loops_arithm_fp.dispatch.c.src | 1299
-rw-r--r--  numpy/core/src/umath/loops_arithmetic.dispatch.c.src | 50
-rw-r--r--  numpy/core/src/umath/loops_autovec.dispatch.c.src | 287
-rw-r--r--  numpy/core/src/umath/loops_comparison.dispatch.c.src | 38
-rw-r--r--  numpy/core/src/umath/loops_exponent_log.dispatch.c.src | 6
-rw-r--r--  numpy/core/src/umath/loops_logical.dispatch.c.src | 377
-rw-r--r--  numpy/core/src/umath/loops_minmax.dispatch.c.src | 18
-rw-r--r--  numpy/core/src/umath/loops_modulo.dispatch.c.src | 8
-rw-r--r--  numpy/core/src/umath/loops_trigonometric.dispatch.c.src | 249
-rw-r--r--  numpy/core/src/umath/loops_umath_fp.dispatch.c.src | 64
-rw-r--r--  numpy/core/src/umath/loops_unary.dispatch.c.src | 364
-rw-r--r--  numpy/core/src/umath/loops_unary_complex.dispatch.c.src | 139
-rw-r--r--  numpy/core/src/umath/loops_unary_fp.dispatch.c.src | 2
-rw-r--r--  numpy/core/src/umath/loops_unary_fp_le.dispatch.c.src | 565
-rw-r--r--  numpy/core/src/umath/loops_utils.h.src | 4
-rw-r--r--  numpy/core/src/umath/matmul.c.src | 2
-rw-r--r--  numpy/core/src/umath/override.c | 16
-rw-r--r--  numpy/core/src/umath/override.h | 2
-rw-r--r--  numpy/core/src/umath/reduction.c | 132
-rw-r--r--  numpy/core/src/umath/reduction.h | 6
-rw-r--r--  numpy/core/src/umath/scalarmath.c.src | 115
-rw-r--r--  numpy/core/src/umath/simd.inc.src | 1215
-rw-r--r--  numpy/core/src/umath/string_ufuncs.cpp | 4
-rw-r--r--  numpy/core/src/umath/ufunc_object.c | 1254
-rw-r--r--  numpy/core/src/umath/ufunc_object.h | 7
-rw-r--r--  numpy/core/src/umath/ufunc_type_resolution.c | 104
-rw-r--r--  numpy/core/src/umath/wrapping_array_method.c | 42
35 files changed, 4761 insertions(+), 3078 deletions(-)
diff --git a/numpy/core/src/umath/_rational_tests.c b/numpy/core/src/umath/_rational_tests.c
index bf50a2226..391c5f4e1 100644
--- a/numpy/core/src/umath/_rational_tests.c
+++ b/numpy/core/src/umath/_rational_tests.c
@@ -49,7 +49,7 @@ set_zero_divide(void) {
/* Integer arithmetic utilities */
-static NPY_INLINE npy_int32
+static inline npy_int32
safe_neg(npy_int32 x) {
if (x==(npy_int32)1<<31) {
set_overflow();
@@ -57,7 +57,7 @@ safe_neg(npy_int32 x) {
return -x;
}
-static NPY_INLINE npy_int32
+static inline npy_int32
safe_abs32(npy_int32 x) {
npy_int32 nx;
if (x>=0) {
@@ -70,7 +70,7 @@ safe_abs32(npy_int32 x) {
return nx;
}
-static NPY_INLINE npy_int64
+static inline npy_int64
safe_abs64(npy_int64 x) {
npy_int64 nx;
if (x>=0) {
@@ -83,7 +83,7 @@ safe_abs64(npy_int64 x) {
return nx;
}
-static NPY_INLINE npy_int64
+static inline npy_int64
gcd(npy_int64 x, npy_int64 y) {
x = safe_abs64(x);
y = safe_abs64(y);
@@ -102,7 +102,7 @@ gcd(npy_int64 x, npy_int64 y) {
return x;
}
-static NPY_INLINE npy_int64
+static inline npy_int64
lcm(npy_int64 x, npy_int64 y) {
npy_int64 lcm;
if (!x || !y) {
@@ -128,7 +128,7 @@ typedef struct {
npy_int32 dmm;
} rational;
-static NPY_INLINE rational
+static inline rational
make_rational_int(npy_int64 n) {
rational r = {(npy_int32)n,0};
if (r.n != n) {
@@ -164,7 +164,7 @@ make_rational_slow(npy_int64 n_, npy_int64 d_) {
return r;
}
-static NPY_INLINE npy_int32
+static inline npy_int32
d(rational r) {
return r.dmm+1;
}
@@ -184,7 +184,7 @@ make_rational_fast(npy_int64 n_, npy_int64 d_) {
return r;
}
-static NPY_INLINE rational
+static inline rational
rational_negative(rational r) {
rational x;
x.n = safe_neg(r.n);
@@ -192,7 +192,7 @@ rational_negative(rational r) {
return x;
}
-static NPY_INLINE rational
+static inline rational
rational_add(rational x, rational y) {
/*
* Note that the numerator computation can never overflow int128_t,
@@ -202,25 +202,25 @@ rational_add(rational x, rational y) {
(npy_int64)d(x)*d(y));
}
-static NPY_INLINE rational
+static inline rational
rational_subtract(rational x, rational y) {
/* We're safe from overflow as with + */
return make_rational_fast((npy_int64)x.n*d(y)-(npy_int64)d(x)*y.n,
(npy_int64)d(x)*d(y));
}
-static NPY_INLINE rational
+static inline rational
rational_multiply(rational x, rational y) {
/* We're safe from overflow as with + */
return make_rational_fast((npy_int64)x.n*y.n,(npy_int64)d(x)*d(y));
}
-static NPY_INLINE rational
+static inline rational
rational_divide(rational x, rational y) {
return make_rational_slow((npy_int64)x.n*d(y),(npy_int64)d(x)*y.n);
}
-static NPY_INLINE npy_int64
+static inline npy_int64
rational_floor(rational x) {
/* Always round down */
if (x.n>=0) {
@@ -233,18 +233,18 @@ rational_floor(rational x) {
return -((-(npy_int64)x.n+d(x)-1)/d(x));
}
-static NPY_INLINE npy_int64
+static inline npy_int64
rational_ceil(rational x) {
return -rational_floor(rational_negative(x));
}
-static NPY_INLINE rational
+static inline rational
rational_remainder(rational x, rational y) {
return rational_subtract(x, rational_multiply(y,make_rational_int(
rational_floor(rational_divide(x,y)))));
}
-static NPY_INLINE rational
+static inline rational
rational_abs(rational x) {
rational y;
y.n = safe_abs32(x.n);
@@ -252,7 +252,7 @@ rational_abs(rational x) {
return y;
}
-static NPY_INLINE npy_int64
+static inline npy_int64
rational_rint(rational x) {
/*
* Round towards nearest integer, moving exact half integers towards
@@ -262,12 +262,12 @@ rational_rint(rational x) {
return (2*(npy_int64)x.n+(x.n<0?-d_:d_))/(2*(npy_int64)d_);
}
-static NPY_INLINE int
+static inline int
rational_sign(rational x) {
return x.n<0?-1:x.n==0?0:1;
}
-static NPY_INLINE rational
+static inline rational
rational_inverse(rational x) {
rational y = {0};
if (!x.n) {
@@ -286,7 +286,7 @@ rational_inverse(rational x) {
return y;
}
-static NPY_INLINE int
+static inline int
rational_eq(rational x, rational y) {
/*
* Since we enforce d > 0, and store fractions in reduced form,
@@ -295,42 +295,42 @@ rational_eq(rational x, rational y) {
return x.n==y.n && x.dmm==y.dmm;
}
-static NPY_INLINE int
+static inline int
rational_ne(rational x, rational y) {
return !rational_eq(x,y);
}
-static NPY_INLINE int
+static inline int
rational_lt(rational x, rational y) {
return (npy_int64)x.n*d(y) < (npy_int64)y.n*d(x);
}
-static NPY_INLINE int
+static inline int
rational_gt(rational x, rational y) {
return rational_lt(y,x);
}
-static NPY_INLINE int
+static inline int
rational_le(rational x, rational y) {
return !rational_lt(y,x);
}
-static NPY_INLINE int
+static inline int
rational_ge(rational x, rational y) {
return !rational_lt(x,y);
}
-static NPY_INLINE npy_int32
+static inline npy_int32
rational_int(rational x) {
return x.n/d(x);
}
-static NPY_INLINE double
+static inline double
rational_double(rational x) {
return (double)x.n/d(x);
}
-static NPY_INLINE int
+static inline int
rational_nonzero(rational x) {
return x.n!=0;
}
@@ -367,7 +367,7 @@ typedef struct {
static PyTypeObject PyRational_Type;
-static NPY_INLINE int
+static inline int
PyRational_Check(PyObject* object) {
return PyObject_IsInstance(object,(PyObject*)&PyRational_Type);
}
@@ -753,7 +753,7 @@ npyrational_setitem(PyObject* item, void* data, void* arr) {
return 0;
}
-static NPY_INLINE void
+static inline void
byteswap(npy_int32* x) {
char* p = (char*)x;
size_t i;
@@ -996,7 +996,7 @@ UNARY_UFUNC(reciprocal,rational,rational_inverse(x))
UNARY_UFUNC(numerator,npy_int64,x.n)
UNARY_UFUNC(denominator,npy_int64,d(x))
-static NPY_INLINE void
+static inline void
rational_matrix_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
/* pointers to data for input and output arrays */
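The d() helper touched throughout the hunk above depends on the test dtype's storage convention: the denominator is stored as dmm (denominator minus one), so zero-filled memory is already the valid rational 0/1 and d(r) is always positive. A minimal standalone sketch of that convention (plain C, not NumPy code):

#include <stdint.h>
#include <stdio.h>

typedef struct {
    int32_t n;    /* numerator */
    int32_t dmm;  /* denominator minus one, so an all-zero struct is 0/1 */
} rational_sketch;

static inline int32_t
denominator(rational_sketch r)
{
    return r.dmm + 1;
}

int main(void)
{
    rational_sketch zero = {0, 0};   /* 0/1 */
    rational_sketch half = {1, 1};   /* 1/2 */
    printf("%d/%d  %d/%d\n", zero.n, denominator(zero),
           half.n, denominator(half));
    return 0;
}

Because d is guaranteed positive and fractions are kept in reduced form, rational_eq above can compare the two members directly.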
diff --git a/numpy/core/src/umath/_scaled_float_dtype.c b/numpy/core/src/umath/_scaled_float_dtype.c
index a214b32aa..c26ace9f1 100644
--- a/numpy/core/src/umath/_scaled_float_dtype.c
+++ b/numpy/core/src/umath/_scaled_float_dtype.c
@@ -26,6 +26,14 @@
#include "dispatching.h"
+/* TODO: from wrapping_array_method.c, use proper public header eventually */
+NPY_NO_EXPORT int
+PyUFunc_AddWrappingLoop(PyObject *ufunc_obj,
+ PyArray_DTypeMeta *new_dtypes[], PyArray_DTypeMeta *wrapped_dtypes[],
+ translate_given_descrs_func *translate_given_descrs,
+ translate_loop_descrs_func *translate_loop_descrs);
+
+
typedef struct {
PyArray_Descr base;
double scaling;
@@ -125,9 +133,9 @@ sfloat_setitem(PyObject *obj, char *data, PyArrayObject *arr)
/* Special DType methods and the descr->f slot storage */
NPY_DType_Slots sfloat_slots = {
- .default_descr = &sfloat_default_descr,
.discover_descr_from_pyobject = &sfloat_discover_from_pyobject,
.is_known_scalar_type = &sfloat_is_known_scalar_type,
+ .default_descr = &sfloat_default_descr,
.common_dtype = &sfloat_common_dtype,
.common_instance = &sfloat_common_instance,
.f = {
@@ -136,14 +144,13 @@ NPY_DType_Slots sfloat_slots = {
}
};
-
static PyArray_SFloatDescr SFloatSingleton = {{
- .elsize = sizeof(double),
- .alignment = _ALIGN(double),
+ .byteorder = '|', /* do not bother with byte-swapping... */
.flags = NPY_USE_GETITEM|NPY_USE_SETITEM,
.type_num = -1,
+ .elsize = sizeof(double),
+ .alignment = _ALIGN(double),
.f = &sfloat_slots.f,
- .byteorder = '|', /* do not bother with byte-swapping... */
},
.scaling = 1,
};
@@ -233,15 +240,15 @@ sfloat_repr(PyArray_SFloatDescr *self)
static PyArray_DTypeMeta PyArray_SFloatDType = {{{
PyVarObject_HEAD_INIT(NULL, 0)
.tp_name = "numpy._ScaledFloatTestDType",
- .tp_methods = sfloat_methods,
- .tp_new = sfloat_new,
+ .tp_basicsize = sizeof(PyArray_SFloatDescr),
.tp_repr = (reprfunc)sfloat_repr,
.tp_str = (reprfunc)sfloat_repr,
- .tp_basicsize = sizeof(PyArray_SFloatDescr),
+ .tp_methods = sfloat_methods,
+ .tp_new = sfloat_new,
}},
.type_num = -1,
.scalar_type = NULL,
- .flags = NPY_DT_PARAMETRIC,
+ .flags = NPY_DT_PARAMETRIC | NPY_DT_NUMERIC,
.dt_slots = &sfloat_slots,
};
@@ -440,7 +447,7 @@ sfloat_to_bool_resolve_descriptors(
static int
-init_casts(void)
+sfloat_init_casts(void)
{
PyArray_DTypeMeta *dtypes[2] = {&PyArray_SFloatDType, &PyArray_SFloatDType};
PyType_Slot slots[4] = {{0, NULL}};
@@ -448,11 +455,11 @@ init_casts(void)
.name = "sfloat_to_sfloat_cast",
.nin = 1,
.nout = 1,
+ /* minimal guaranteed casting */
+ .casting = NPY_SAME_KIND_CASTING,
.flags = NPY_METH_SUPPORTS_UNALIGNED,
.dtypes = dtypes,
.slots = slots,
- /* minimal guaranteed casting */
- .casting = NPY_SAME_KIND_CASTING,
};
slots[0].slot = NPY_METH_resolve_descriptors;
@@ -646,13 +653,55 @@ add_sfloats_resolve_descriptors(
}
+/*
+ * We define the hypot loop using the "PyUFunc_AddWrappingLoop" API.
+ * We use this very narrowly for mapping to the double hypot loop currently.
+ */
static int
-add_loop(const char *ufunc_name,
- PyArray_DTypeMeta *dtypes[3], PyObject *meth_or_promoter)
+translate_given_descrs_to_double(
+ int nin, int nout, PyArray_DTypeMeta *wrapped_dtypes[],
+ PyArray_Descr *given_descrs[], PyArray_Descr *new_descrs[])
+{
+ assert(nin == 2 && nout == 1);
+ for (int i = 0; i < 3; i++) {
+ if (given_descrs[i] == NULL) {
+ new_descrs[i] = NULL;
+ }
+ else {
+ new_descrs[i] = PyArray_DescrFromType(NPY_DOUBLE);
+ }
+ }
+ return 0;
+}
+
+
+static int
+translate_loop_descrs(
+ int nin, int nout, PyArray_DTypeMeta *new_dtypes[],
+ PyArray_Descr *given_descrs[],
+ PyArray_Descr *NPY_UNUSED(original_descrs[]),
+ PyArray_Descr *loop_descrs[])
+{
+ assert(nin == 2 && nout == 1);
+ loop_descrs[0] = sfloat_common_instance(
+ given_descrs[0], given_descrs[1]);
+ if (loop_descrs[0] == 0) {
+ return -1;
+ }
+ Py_INCREF(loop_descrs[0]);
+ loop_descrs[1] = loop_descrs[0];
+ Py_INCREF(loop_descrs[0]);
+ loop_descrs[2] = loop_descrs[0];
+ return 0;
+}
+
+
+static PyObject *
+sfloat_get_ufunc(const char *ufunc_name)
{
PyObject *mod = PyImport_ImportModule("numpy");
if (mod == NULL) {
- return -1;
+ return NULL;
}
PyObject *ufunc = PyObject_GetAttrString(mod, ufunc_name);
Py_DECREF(mod);
@@ -660,6 +709,18 @@ add_loop(const char *ufunc_name,
Py_DECREF(ufunc);
PyErr_Format(PyExc_TypeError,
"numpy.%s was not a ufunc!", ufunc_name);
+ return NULL;
+ }
+ return ufunc;
+}
+
+
+static int
+sfloat_add_loop(const char *ufunc_name,
+ PyArray_DTypeMeta *dtypes[3], PyObject *meth_or_promoter)
+{
+ PyObject *ufunc = sfloat_get_ufunc(ufunc_name);
+ if (ufunc == NULL) {
return -1;
}
PyObject *dtype_tup = PyArray_TupleFromItems(3, (PyObject **)dtypes, 1);
@@ -680,6 +741,24 @@ add_loop(const char *ufunc_name,
}
+static int
+sfloat_add_wrapping_loop(const char *ufunc_name, PyArray_DTypeMeta *dtypes[3])
+{
+ PyObject *ufunc = sfloat_get_ufunc(ufunc_name);
+ if (ufunc == NULL) {
+ return -1;
+ }
+ PyArray_DTypeMeta *double_dt = PyArray_DTypeFromTypeNum(NPY_DOUBLE);
+ PyArray_DTypeMeta *wrapped_dtypes[3] = {double_dt, double_dt, double_dt};
+ int res = PyUFunc_AddWrappingLoop(
+ ufunc, dtypes, wrapped_dtypes, &translate_given_descrs_to_double,
+ &translate_loop_descrs);
+ Py_DECREF(ufunc);
+ Py_DECREF(double_dt);
+
+ return res;
+}
+
/*
* We add some very basic promoters to allow multiplying normal and scaled
@@ -707,7 +786,7 @@ promote_to_sfloat(PyUFuncObject *NPY_UNUSED(ufunc),
* get less so with the introduction of public API).
*/
static int
-init_ufuncs(void) {
+sfloat_init_ufuncs(void) {
PyArray_DTypeMeta *dtypes[3] = {
&PyArray_SFloatDType, &PyArray_SFloatDType, &PyArray_SFloatDType};
PyType_Slot slots[3] = {{0, NULL}};
@@ -728,7 +807,7 @@ init_ufuncs(void) {
if (bmeth == NULL) {
return -1;
}
- int res = add_loop("multiply",
+ int res = sfloat_add_loop("multiply",
bmeth->dtypes, (PyObject *)bmeth->method);
Py_DECREF(bmeth);
if (res < 0) {
@@ -746,13 +825,18 @@ init_ufuncs(void) {
if (bmeth == NULL) {
return -1;
}
- res = add_loop("add",
+ res = sfloat_add_loop("add",
bmeth->dtypes, (PyObject *)bmeth->method);
Py_DECREF(bmeth);
if (res < 0) {
return -1;
}
+ /* N.B.: Wrapping isn't actually correct if scaling can be negative */
+ if (sfloat_add_wrapping_loop("hypot", dtypes) < 0) {
+ return -1;
+ }
+
/*
* Add a promoter for both directions of multiply with double.
*/
@@ -767,14 +851,14 @@ init_ufuncs(void) {
if (promoter == NULL) {
return -1;
}
- res = add_loop("multiply", promoter_dtypes, promoter);
+ res = sfloat_add_loop("multiply", promoter_dtypes, promoter);
if (res < 0) {
Py_DECREF(promoter);
return -1;
}
promoter_dtypes[0] = double_DType;
promoter_dtypes[1] = &PyArray_SFloatDType;
- res = add_loop("multiply", promoter_dtypes, promoter);
+ res = sfloat_add_loop("multiply", promoter_dtypes, promoter);
Py_DECREF(promoter);
if (res < 0) {
return -1;
@@ -815,11 +899,11 @@ get_sfloat_dtype(PyObject *NPY_UNUSED(mod), PyObject *NPY_UNUSED(args))
return NULL;
}
- if (init_casts() < 0) {
+ if (sfloat_init_casts() < 0) {
return NULL;
}
- if (init_ufuncs() < 0) {
+ if (sfloat_init_ufuncs() < 0) {
return NULL;
}
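The new hypot registration above maps ScaledFloat descriptors onto plain double descriptors and reuses NumPy's double loop on the raw storage. A rough standalone model (simplified; not the actual ArrayMethod machinery, and assuming the logical value is the stored double times the scaling) of why that is valid for positive scaling:

#include <math.h>
#include <stdio.h>

/* assumption for this sketch: logical value == stored double * scaling */
typedef double sfloat_storage;

static void
hypot_on_storage(const sfloat_storage *a, const sfloat_storage *b,
                 sfloat_storage *out, int n)
{
    for (int i = 0; i < n; i++) {
        /* identical to the double loop; correct because
         * hypot(s*x, s*y) == s * hypot(x, y) holds only for s >= 0 */
        out[i] = hypot(a[i], b[i]);
    }
}

int main(void)
{
    sfloat_storage a[1] = {3.0}, b[1] = {4.0}, out[1];
    hypot_on_storage(a, b, out, 1);
    printf("%f\n", out[0]);   /* 5.0, in the same storage units */
    return 0;
}

For a negative scaling factor, hypot(s*x, s*y) equals |s|*hypot(x, y) rather than s*hypot(x, y), which is exactly the caveat noted next to the sfloat_add_wrapping_loop("hypot", ...) call.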
diff --git a/numpy/core/src/umath/_umath_tests.c.src b/numpy/core/src/umath/_umath_tests.c.src
index 1bf459ce6..b427991e5 100644
--- a/numpy/core/src/umath/_umath_tests.c.src
+++ b/numpy/core/src/umath/_umath_tests.c.src
@@ -9,6 +9,9 @@
#include <Python.h>
#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#if defined(NPY_INTERNAL_BUILD)
+#undef NPY_INTERNAL_BUILD
+#endif
#include "numpy/arrayobject.h"
#include "numpy/ufuncobject.h"
#include "numpy/npy_math.h"
@@ -19,6 +22,9 @@
#include "npy_cpu_features.h"
#include "npy_cpu_dispatch.h"
#include "numpy/npy_cpu.h"
+#include "npy_import.h"
+#include "numpy/experimental_dtype_api.h"
+
/*
*****************************************************************************
@@ -300,7 +306,7 @@ static void
ptr_this += stride_d;
ptr_that += stride_d;
}
- *(@typ@ *)data_out = npy_@sqrt_func@(out);
+ *(@typ@ *)data_out = @sqrt_func@(out);
data_that += stride_n;
data_out += stride_p;
}
@@ -343,6 +349,50 @@ static void
/**end repeat**/
+static int
+INT32_negative(PyArrayMethod_Context *NPY_UNUSED(context),
+ char **args, npy_intp const *dimensions,
+ npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+ npy_intp di = dimensions[0];
+ npy_intp i;
+ npy_intp is=steps[0], os=steps[1];
+ char *ip=args[0], *op=args[1];
+ for (i = 0; i < di; i++, ip += is, op += os) {
+ if (i == 3) {
+ *(int32_t *)op = - 100;
+ } else {
+ *(int32_t *)op = - *(int32_t *)ip;
+ }
+ }
+ return 0;
+}
+
+
+static int
+INT32_negative_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+ char * const*args, npy_intp const *dimensions,
+ npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+ char *ip1 = args[0];
+ char *indx = args[1];
+ npy_intp is1 = steps[0], isindex = steps[1];
+ npy_intp n = dimensions[0];
+ npy_intp i;
+ int32_t *indexed;
+ for(i = 0; i < n; i++, indx += isindex) {
+ indexed = (int32_t *)(ip1 + is1 * *(npy_intp *)indx);
+ if (i == 3) {
+ *indexed = -200;
+ } else {
+ *indexed = - *indexed;
+ }
+ }
+ return 0;
+}
+
+
+
/* The following lines were generated using a slightly modified
version of code_generators/generate_umath.py and adding these
lines to defdict:
@@ -671,6 +721,43 @@ err:
return NULL;
}
+static int
+add_INT32_negative_indexed(PyObject *module, PyObject *dict) {
+ if (import_experimental_dtype_api(__EXPERIMENTAL_DTYPE_API_VERSION) < 0) {
+ return -1;
+ }
+
+ PyObject * negative = PyUFunc_FromFuncAndData(NULL, NULL, NULL, 0, 1, 1,
+ PyUFunc_Zero, "indexed_negative", NULL, 0);
+ if (negative == NULL) {
+ return -1;
+ }
+ PyArray_DTypeMeta *dtypes[] = {&PyArray_Int32DType, &PyArray_Int32DType};
+
+ PyType_Slot slots[] = {
+ {NPY_METH_contiguous_indexed_loop, INT32_negative_indexed},
+ {NPY_METH_strided_loop, INT32_negative},
+ {0, NULL}
+ };
+
+ PyArrayMethod_Spec spec = {
+ .name = "negative_indexed_loop",
+ .nin = 1,
+ .nout = 1,
+ .dtypes = dtypes,
+ .slots = slots,
+ .flags = NPY_METH_NO_FLOATINGPOINT_ERRORS
+ };
+
+ if (PyUFunc_AddLoopFromSpec(negative, &spec) < 0) {
+ Py_DECREF(negative);
+ return -1;
+ }
+ PyDict_SetItemString(dict, "indexed_negative", negative);
+ Py_DECREF(negative);
+ return 0;
+}
+
static PyMethodDef UMath_TestsMethods[] = {
{"test_signature", UMath_Tests_test_signature, METH_VARARGS,
"Test signature parsing of ufunc. \n"
@@ -733,5 +820,13 @@ PyMODINIT_FUNC PyInit__umath_tests(void) {
"cannot load _umath_tests module.");
return NULL;
}
+
+ if (add_INT32_negative_indexed(m, d) < 0) {
+ Py_DECREF(m);
+ PyErr_Print();
+ PyErr_SetString(PyExc_RuntimeError,
+ "cannot load _umath_tests module.");
+ return NULL;
+ }
return m;
}
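INT32_negative above follows the standard strided inner-loop convention: data arrives as char * pointers plus byte steps, and each element is reached with pointer arithmetic and a cast. A self-contained sketch of that convention (illustrative only, outside the NumPy API):

#include <stdint.h>
#include <stdio.h>

static void
negate_strided(char *in, char *out, intptr_t n,
               intptr_t in_step, intptr_t out_step)
{
    for (intptr_t i = 0; i < n; i++, in += in_step, out += out_step) {
        *(int32_t *)out = - *(int32_t *)in;
    }
}

int main(void)
{
    int32_t src[4] = {1, 2, 3, 4}, dst[4];
    /* contiguous int32 arrays, so the byte step is sizeof(int32_t) */
    negate_strided((char *)src, (char *)dst, 4,
                   sizeof(int32_t), sizeof(int32_t));
    printf("%d %d %d %d\n", dst[0], dst[1], dst[2], dst[3]);
    return 0;
}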
diff --git a/numpy/core/src/umath/dispatching.c b/numpy/core/src/umath/dispatching.c
index 79de6c3c8..fee77b2d6 100644
--- a/numpy/core/src/umath/dispatching.c
+++ b/numpy/core/src/umath/dispatching.c
@@ -43,6 +43,7 @@
#include <convert_datatype.h>
#include "numpy/ndarraytypes.h"
+#include "numpy/npy_3kcompat.h"
#include "common.h"
#include "dispatching.h"
@@ -58,7 +59,7 @@
/* forward declaration */
-static NPY_INLINE PyObject *
+static inline PyObject *
promote_and_get_info_and_ufuncimpl(PyUFuncObject *ufunc,
PyArrayObject *const ops[],
PyArray_DTypeMeta *signature[],
@@ -667,12 +668,9 @@ legacy_promote_using_legacy_type_resolver(PyUFuncObject *ufunc,
Py_DECREF(out_descrs[i]);
}
/*
- * The PyUFunc_SimpleBinaryComparisonTypeResolver has a deprecation
- * warning (ignoring `dtype=`) and cannot be cached.
- * All datetime ones *should* have a warning, but currently don't,
- * but ignore all signature passing also. So they can also
- * not be cached, and they mutate the signature which of course is wrong,
- * but not doing it would confuse the code later.
+ * datetime legacy resolvers ignore the signature, which should be
+ * warn/raise (when used). In such cases, the signature is (incorrectly)
+ * mutated, and caching is not possible.
*/
for (int i = 0; i < nargs; i++) {
if (signature[i] != NULL && signature[i] != operation_DTypes[i]) {
@@ -728,7 +726,7 @@ add_and_return_legacy_wrapping_ufunc_loop(PyUFuncObject *ufunc,
* If value-based promotion is necessary, this is handled ahead of time by
* `promote_and_get_ufuncimpl`.
*/
-static NPY_INLINE PyObject *
+static inline PyObject *
promote_and_get_info_and_ufuncimpl(PyUFuncObject *ufunc,
PyArrayObject *const ops[],
PyArray_DTypeMeta *signature[],
@@ -916,8 +914,8 @@ promote_and_get_ufuncimpl(PyUFuncObject *ufunc,
int nin = ufunc->nin, nargs = ufunc->nargs;
/*
- * Get the actual DTypes we operate with by mixing the operand array
- * ones with the passed signature.
+ * Get the actual DTypes we operate with by setting op_dtypes[i] from
+ * signature[i].
*/
for (int i = 0; i < nargs; i++) {
if (signature[i] != NULL) {
@@ -950,7 +948,7 @@ promote_and_get_ufuncimpl(PyUFuncObject *ufunc,
int cacheable = 1; /* unused, as we modify the original `op_dtypes` */
if (legacy_promote_using_legacy_type_resolver(ufunc,
ops, signature, op_dtypes, &cacheable, NPY_FALSE) < 0) {
- return NULL;
+ goto handle_error;
}
}
@@ -962,10 +960,7 @@ promote_and_get_ufuncimpl(PyUFuncObject *ufunc,
npy_promotion_state = old_promotion_state;
if (info == NULL) {
- if (!PyErr_Occurred()) {
- raise_no_loop_found_error(ufunc, (PyObject **)op_dtypes);
- }
- return NULL;
+ goto handle_error;
}
PyArrayMethodObject *method = (PyArrayMethodObject *)PyTuple_GET_ITEM(info, 1);
@@ -987,7 +982,7 @@ promote_and_get_ufuncimpl(PyUFuncObject *ufunc,
/* Reset the promotion state: */
npy_promotion_state = NPY_USE_WEAK_PROMOTION_AND_WARN;
if (res < 0) {
- return NULL;
+ goto handle_error;
}
}
@@ -1021,12 +1016,29 @@ promote_and_get_ufuncimpl(PyUFuncObject *ufunc,
* If signature is forced the cache may contain an incompatible
* loop found via promotion (signature not enforced). Reject it.
*/
- raise_no_loop_found_error(ufunc, (PyObject **)op_dtypes);
- return NULL;
+ goto handle_error;
}
}
return method;
+
+ handle_error:
+    /* We only set the "no loop found" error here */
+ if (!PyErr_Occurred()) {
+ raise_no_loop_found_error(ufunc, (PyObject **)op_dtypes);
+ }
+ /*
+ * Otherwise an error occurred, but if the error was DTypePromotionError
+ * then we chain it, because DTypePromotionError effectively means that there
+ * is no loop available. (We failed finding a loop by using promotion.)
+ */
+ else if (PyErr_ExceptionMatches(npy_DTypePromotionError)) {
+ PyObject *err_type = NULL, *err_value = NULL, *err_traceback = NULL;
+ PyErr_Fetch(&err_type, &err_value, &err_traceback);
+ raise_no_loop_found_error(ufunc, (PyObject **)op_dtypes);
+ npy_PyErr_ChainExceptionsCause(err_type, err_value, err_traceback);
+ }
+ return NULL;
}
@@ -1042,13 +1054,6 @@ default_ufunc_promoter(PyUFuncObject *ufunc,
PyArray_DTypeMeta *op_dtypes[], PyArray_DTypeMeta *signature[],
PyArray_DTypeMeta *new_op_dtypes[])
{
- if (ufunc->type_resolver == &PyUFunc_SimpleBinaryComparisonTypeResolver
- && signature[0] == NULL && signature[1] == NULL
- && signature[2] != NULL && signature[2]->type_num != NPY_BOOL) {
- /* bail out, this is _only_ to give future/deprecation warning! */
- return -1;
- }
-
/* If nin < 2 promotion is a no-op, so it should not be registered */
assert(ufunc->nin > 1);
if (op_dtypes[0] == NULL) {
@@ -1235,3 +1240,40 @@ install_logical_ufunc_promoter(PyObject *ufunc)
return PyUFunc_AddLoop((PyUFuncObject *)ufunc, info, 0);
}
+
+/*
+ * Return the PyArrayMethodObject or PyCapsule that matches a registered
+ * tuple of identical dtypes. Return a borrowed ref of the first match.
+ */
+NPY_NO_EXPORT PyObject *
+get_info_no_cast(PyUFuncObject *ufunc, PyArray_DTypeMeta *op_dtype,
+ int ndtypes)
+{
+ PyObject *t_dtypes = PyTuple_New(ndtypes);
+ if (t_dtypes == NULL) {
+ return NULL;
+ }
+ for (int i=0; i < ndtypes; i++) {
+ PyTuple_SetItem(t_dtypes, i, (PyObject *)op_dtype);
+ }
+ PyObject *loops = ufunc->_loops;
+ Py_ssize_t length = PyList_Size(loops);
+ for (Py_ssize_t i = 0; i < length; i++) {
+ PyObject *item = PyList_GetItem(loops, i);
+ PyObject *cur_DType_tuple = PyTuple_GetItem(item, 0);
+ int cmp = PyObject_RichCompareBool(cur_DType_tuple,
+ t_dtypes, Py_EQ);
+ if (cmp < 0) {
+ Py_DECREF(t_dtypes);
+ return NULL;
+ }
+ if (cmp == 0) {
+ continue;
+ }
+ /* Got the match */
+ Py_DECREF(t_dtypes);
+ return PyTuple_GetItem(item, 1);
+ }
+ Py_DECREF(t_dtypes);
+ Py_RETURN_NONE;
+}
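The handle_error block above stashes a pending DTypePromotionError, raises the "no loop found" error, and then re-attaches the original exception so both are visible. A minimal sketch of that CPython chaining pattern (the function name is illustrative; the real code uses raise_no_loop_found_error and npy_PyErr_ChainExceptionsCause, which chains via __cause__ rather than __context__):

#include <Python.h>

static void
raise_chained_lookup_error(const char *ufunc_name)
{
    PyObject *err_type = NULL, *err_value = NULL, *err_traceback = NULL;
    /* take ownership of whatever exception is currently pending, if any */
    PyErr_Fetch(&err_type, &err_value, &err_traceback);
    PyErr_Format(PyExc_TypeError,
                 "no matching loop was found for ufunc '%s'", ufunc_name);
    if (err_type != NULL) {
        /* attach the original exception as __context__ of the new one;
         * this CPython helper steals the three references */
        _PyErr_ChainExceptions(err_type, err_value, err_traceback);
    }
}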
diff --git a/numpy/core/src/umath/fast_loop_macros.h b/numpy/core/src/umath/fast_loop_macros.h
index cbd1f04aa..b8c1926b2 100644
--- a/numpy/core/src/umath/fast_loop_macros.h
+++ b/numpy/core/src/umath/fast_loop_macros.h
@@ -12,6 +12,19 @@
#include <assert.h>
+#include "simd/simd.h"
+
+/*
+ * largest simd vector size in bytes numpy supports
+ * it is currently an extremely large value as it is only used for memory
+ * overlap checks
+ */
+#if NPY_SIMD > 0
+ // Enough for compiler unroll
+ #define AUTOVEC_OVERLAP_SIZE NPY_SIMD_WIDTH*4
+#else
+ #define AUTOVEC_OVERLAP_SIZE 1024
+#endif
/*
* MAX_STEP_SIZE is used to determine if we need to use SIMD version of the ufunc.
* Very large step size can be as slow as processing it using scalar. The
@@ -27,7 +40,7 @@
*/
#define MAX_STEP_SIZE 2097152
-static NPY_INLINE npy_uintp
+static inline npy_uintp
abs_ptrdiff(char *a, char *b)
{
return (a > b) ? (a - b) : (b - a);
@@ -219,11 +232,11 @@ abs_ptrdiff(char *a, char *b)
/* condition allows compiler to optimize the generic macro */ \
if (IS_BINARY_CONT(tin, tout)) { \
if (abs_ptrdiff(args[2], args[0]) == 0 && \
- abs_ptrdiff(args[2], args[1]) >= NPY_MAX_SIMD_SIZE) { \
+ abs_ptrdiff(args[2], args[1]) >= AUTOVEC_OVERLAP_SIZE) { \
BASE_BINARY_LOOP_INP(tin, tout, op) \
} \
else if (abs_ptrdiff(args[2], args[1]) == 0 && \
- abs_ptrdiff(args[2], args[0]) >= NPY_MAX_SIMD_SIZE) { \
+ abs_ptrdiff(args[2], args[0]) >= AUTOVEC_OVERLAP_SIZE) { \
BASE_BINARY_LOOP_INP(tin, tout, op) \
} \
else { \
@@ -378,19 +391,6 @@ abs_ptrdiff(char *a, char *b)
#undef abs_ptrdiff
-#define IS_BLOCKABLE_BINARY_BOOL(esize, vsize) \
- (steps[0] == (esize) && steps[0] == steps[1] && steps[2] == (1) && \
- npy_is_aligned(args[1], (esize)) && \
- npy_is_aligned(args[0], (esize)))
-
-#define IS_BLOCKABLE_BINARY_SCALAR1_BOOL(esize, vsize) \
- (steps[0] == 0 && steps[1] == (esize) && steps[2] == (1) && \
- npy_is_aligned(args[1], (esize)))
-
-#define IS_BLOCKABLE_BINARY_SCALAR2_BOOL(esize, vsize) \
- (steps[0] == (esize) && steps[1] == 0 && steps[2] == (1) && \
- npy_is_aligned(args[0], (esize)))
-
/* align var to alignment */
#define LOOP_BLOCK_ALIGN_VAR(var, type, alignment)\
npy_intp i, peel = npy_aligned_block_offset(var, sizeof(type),\
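The AUTOVEC_OVERLAP_SIZE change above feeds the check that lets the binary fast loops use the in-place-friendly BASE_BINARY_LOOP_INP variant: that is only safe when the output exactly aliases one input and lies far enough from the other input that auto-vectorized stores cannot clobber values not yet read. A standalone sketch of that decision (the constant and helpers are stand-ins, not the real macros):

#include <stddef.h>
#include <stdio.h>

#define OVERLAP_SIZE 64  /* stand-in for AUTOVEC_OVERLAP_SIZE */

static size_t
abs_ptrdiff(char *a, char *b)
{
    return (a > b) ? (size_t)(a - b) : (size_t)(b - a);
}

static int
can_vectorize_inplace(char *out, char *in1, char *in2)
{
    return (abs_ptrdiff(out, in1) == 0 &&
            abs_ptrdiff(out, in2) >= OVERLAP_SIZE)
        || (abs_ptrdiff(out, in2) == 0 &&
            abs_ptrdiff(out, in1) >= OVERLAP_SIZE);
}

int main(void)
{
    char buf[256];
    /* out aliases in1 exactly and in2 is far away: safe */
    printf("%d\n", can_vectorize_inplace(buf, buf, buf + 128));
    /* in2 starts only 8 bytes past out, vector stores could clobber it */
    printf("%d\n", can_vectorize_inplace(buf, buf, buf + 8));
    return 0;
}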
diff --git a/numpy/core/src/umath/legacy_array_method.c b/numpy/core/src/umath/legacy_array_method.c
index c3d421d9b..965a0eb83 100644
--- a/numpy/core/src/umath/legacy_array_method.c
+++ b/numpy/core/src/umath/legacy_array_method.c
@@ -13,10 +13,13 @@
#include "convert_datatype.h"
#include "array_method.h"
+#include "array_coercion.h"
#include "dtype_transfer.h"
#include "legacy_array_method.h"
#include "dtypemeta.h"
+#include "ufunc_object.h"
+
typedef struct {
NpyAuxData base;
@@ -91,7 +94,6 @@ generic_wrapped_legacy_loop(PyArrayMethod_Context *NPY_UNUSED(context),
return 0;
}
-
/*
* Signal that the old type-resolution function must be used to resolve
* the descriptors (mainly/only used for datetimes due to the unit).
@@ -234,6 +236,97 @@ get_wrapped_legacy_ufunc_loop(PyArrayMethod_Context *context,
}
+
+/*
+ * We can shave off a bit of time by just caching the initial and this is
+ * trivial for all internal numeric types. (Wrapped ufuncs never use
+ * byte-swapping.)
+ */
+static int
+copy_cached_initial(
+ PyArrayMethod_Context *context, npy_bool NPY_UNUSED(reduction_is_empty),
+ char *initial)
+{
+ memcpy(initial, context->method->legacy_initial,
+ context->descriptors[0]->elsize);
+ return 1;
+}
+
+
+/*
+ * The default `get_reduction_initial` attempts to look up the identity
+ * from the calling ufunc. This might fail, so we only call it when necessary.
+ *
+ * For internal number dtypes, we can easily cache it, so do so after the
+ * first call by overriding the function with `copy_cache_initial`.
+ * This path is not publicly available. That could be added, and for a
+ * custom initial getter it should be static/compile time data anyway.
+ */
+static int
+get_initial_from_ufunc(
+ PyArrayMethod_Context *context, npy_bool reduction_is_empty,
+ char *initial)
+{
+ if (context->caller == NULL
+ || !PyObject_TypeCheck(context->caller, &PyUFunc_Type)) {
+ /* Impossible in NumPy 1.24; guard in case it becomes possible. */
+ PyErr_SetString(PyExc_ValueError,
+ "getting initial failed because it can only done for legacy "
+ "ufunc loops when the ufunc is provided.");
+ return -1;
+ }
+ npy_bool reorderable;
+ PyObject *identity_obj = PyUFunc_GetDefaultIdentity(
+ (PyUFuncObject *)context->caller, &reorderable);
+ if (identity_obj == NULL) {
+ return -1;
+ }
+ if (identity_obj == Py_None) {
+        /* UFunc has no identity (should not happen) */
+ Py_DECREF(identity_obj);
+ return 0;
+ }
+ if (PyTypeNum_ISUNSIGNED(context->descriptors[1]->type_num)
+ && PyLong_CheckExact(identity_obj)) {
+ /*
+ * This is a bit of a hack until we have truly loop specific
+ * identities. Python -1 cannot be cast to unsigned so convert
+ * it to a NumPy scalar, but we use -1 for bitwise functions to
+ * signal all 1s.
+ * (A builtin identity would not overflow here, although we may
+     * unnecessarily convert 0 and 1.)
+ */
+ Py_SETREF(identity_obj, PyObject_CallFunctionObjArgs(
+ (PyObject *)&PyLongArrType_Type, identity_obj, NULL));
+ if (identity_obj == NULL) {
+ return -1;
+ }
+ }
+ else if (context->descriptors[0]->type_num == NPY_OBJECT
+ && !reduction_is_empty) {
+ /* Allows `sum([object()])` to work, but use 0 when empty. */
+ Py_DECREF(identity_obj);
+ return 0;
+ }
+
+ int res = PyArray_Pack(context->descriptors[0], initial, identity_obj);
+ Py_DECREF(identity_obj);
+ if (res < 0) {
+ return -1;
+ }
+
+ if (PyTypeNum_ISNUMBER(context->descriptors[0]->type_num)) {
+ /* For numbers we can cache to avoid going via Python ints */
+ memcpy(context->method->legacy_initial, initial,
+ context->descriptors[0]->elsize);
+ context->method->get_reduction_initial = &copy_cached_initial;
+ }
+
+ /* Reduction can use the initial value */
+ return 1;
+}
+
+
/*
* Get the unbound ArrayMethod which wraps the instances of the ufunc.
* Note that this function stores the result on the ufunc and then only
@@ -273,6 +366,27 @@ PyArray_NewLegacyWrappingArrayMethod(PyUFuncObject *ufunc,
flags = _NPY_METH_FORCE_CAST_INPUTS;
}
+ get_reduction_initial_function *get_reduction_intial = NULL;
+ if (ufunc->nin == 2 && ufunc->nout == 1) {
+ npy_bool reorderable = NPY_FALSE;
+ PyObject *identity_obj = PyUFunc_GetDefaultIdentity(
+ ufunc, &reorderable);
+ if (identity_obj == NULL) {
+ return NULL;
+ }
+ /*
+ * TODO: For object, "reorderable" is needed(?), because otherwise
+ * we disable multi-axis reductions `arr.sum(0, 1)`. But for
+ * `arr = array([["a", "b"], ["c", "d"]], dtype="object")`
+ * it isn't actually reorderable (order changes result).
+ */
+ if (reorderable) {
+ flags |= NPY_METH_IS_REORDERABLE;
+ }
+ if (identity_obj != Py_None) {
+ get_reduction_intial = &get_initial_from_ufunc;
+ }
+ }
for (int i = 0; i < ufunc->nin+ufunc->nout; i++) {
if (signature[i]->singleton->flags & (
NPY_ITEM_REFCOUNT | NPY_ITEM_IS_POINTER | NPY_NEEDS_PYAPI)) {
@@ -283,9 +397,10 @@ PyArray_NewLegacyWrappingArrayMethod(PyUFuncObject *ufunc,
}
}
- PyType_Slot slots[3] = {
- {NPY_METH_get_loop, &get_wrapped_legacy_ufunc_loop},
+ PyType_Slot slots[4] = {
+ {_NPY_METH_get_loop, &get_wrapped_legacy_ufunc_loop},
{NPY_METH_resolve_descriptors, &simple_legacy_resolve_descriptors},
+ {NPY_METH_get_reduction_initial, get_reduction_intial},
{0, NULL},
};
if (any_output_flexible) {
@@ -297,10 +412,10 @@ PyArray_NewLegacyWrappingArrayMethod(PyUFuncObject *ufunc,
.name = method_name,
.nin = ufunc->nin,
.nout = ufunc->nout,
- .dtypes = signature,
+ .casting = NPY_NO_CASTING,
.flags = flags,
+ .dtypes = signature,
.slots = slots,
- .casting = NPY_NO_CASTING,
};
PyBoundArrayMethodObject *bound_res = PyArrayMethod_FromSpec_int(&spec, 1);
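get_initial_from_ufunc above is deliberately self-replacing: the first call resolves the identity through the ufunc, packs it into legacy_initial, and then rebinds get_reduction_initial to copy_cached_initial so later reductions skip the Python-level lookup. A standalone sketch of that function-pointer memoization pattern (plain C structs, not the NumPy ones):

#include <stdio.h>
#include <string.h>

struct method;
typedef int (*get_initial_fn)(struct method *m, char *initial);

struct method {
    get_initial_fn get_initial;
    char cached[8];              /* stand-in for legacy_initial */
};

static int
copy_cached_initial(struct method *m, char *initial)
{
    memcpy(initial, m->cached, sizeof(m->cached));
    return 1;
}

static int
get_initial_slow(struct method *m, char *initial)
{
    double identity = 0.0;       /* pretend this came from the ufunc */
    memcpy(initial, &identity, sizeof(identity));
    /* cache the bytes and self-replace so the slow path runs only once */
    memcpy(m->cached, initial, sizeof(m->cached));
    m->get_initial = copy_cached_initial;
    return 1;
}

int main(void)
{
    struct method m = {get_initial_slow, {0}};
    char buf[8];
    m.get_initial(&m, buf);      /* slow path, fills the cache */
    m.get_initial(&m, buf);      /* hits copy_cached_initial */
    printf("cached after first call: %s\n",
           m.get_initial == copy_cached_initial ? "yes" : "no");
    return 0;
}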
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index fe5aa9374..97a74b425 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -13,6 +13,7 @@
#include "numpy/npy_math.h"
#include "numpy/halffloat.h"
#include "lowlevel_strided_loops.h"
+#include "loops_utils.h"
#include "npy_pycompat.h"
@@ -31,27 +32,9 @@
*/
#define PW_BLOCKSIZE 128
-
-/*
- * largest simd vector size in bytes numpy supports
- * it is currently a extremely large value as it is only used for memory
- * overlap checks
- */
-#ifndef NPY_MAX_SIMD_SIZE
-#define NPY_MAX_SIMD_SIZE 1024
-#endif
-
/** Provides the various *_LOOP macros */
#include "fast_loop_macros.h"
-/*
- * include vectorized functions and dispatchers
- * this file is safe to include also for generic builds
- * platform specific instructions are either masked via the proprocessor or
- * runtime detected
- */
-#include "simd.inc"
-
/******************************************************************************
** GENERIC FLOAT LOOPS **
*****************************************************************************/
@@ -416,98 +399,6 @@ PyUFunc_On_Om(char **args, npy_intp const *dimensions, npy_intp const *steps, vo
*****************************************************************************
*/
-/**begin repeat
- * #kind = logical_and, logical_or#
- * #OP = &&, ||#
- * #SC = ==, !=#
- * #and = 1, 0#
- **/
-
-NPY_NO_EXPORT void
-BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- if(IS_BINARY_REDUCE) {
-#ifdef NPY_HAVE_SSE2_INTRINSICS
- /*
- * stick with our variant for more reliable performance, only known
- * platform which outperforms it by ~20% is an i7 with glibc 2.17
- */
- if (run_reduce_simd_@kind@_BOOL(args, dimensions, steps)) {
- return;
- }
-#else
- /* for now only use libc on 32-bit/non-x86 */
- if (steps[1] == 1) {
- npy_bool * op = (npy_bool *)args[0];
-#if @and@
- /* np.all(), search for a zero (false) */
- if (*op) {
- *op = memchr(args[1], 0, dimensions[0]) == NULL;
- }
-#else
- /*
- * np.any(), search for a non-zero (true) via comparing against
- * zero blocks, memcmp is faster than memchr on SSE4 machines
- * with glibc >= 2.12 and memchr can only check for equal 1
- */
- static const npy_bool zero[4096]; /* zero by C standard */
- npy_uintp i, n = dimensions[0];
-
- for (i = 0; !*op && i < n - (n % sizeof(zero)); i += sizeof(zero)) {
- *op = memcmp(&args[1][i], zero, sizeof(zero)) != 0;
- }
- if (!*op && n - i > 0) {
- *op = memcmp(&args[1][i], zero, n - i) != 0;
- }
-#endif
- return;
- }
-#endif
- else {
- BINARY_REDUCE_LOOP(npy_bool) {
- const npy_bool in2 = *(npy_bool *)ip2;
- io1 = io1 @OP@ in2;
- if (io1 @SC@ 0) {
- break;
- }
- }
- *((npy_bool *)iop1) = io1;
- }
- }
- else {
- if (run_binary_simd_@kind@_BOOL(args, dimensions, steps)) {
- return;
- }
- else {
- BINARY_LOOP {
- const npy_bool in1 = *(npy_bool *)ip1;
- const npy_bool in2 = *(npy_bool *)ip2;
- *((npy_bool *)op1) = in1 @OP@ in2;
- }
- }
- }
-}
-/**end repeat**/
-
-/**begin repeat
- * #kind = absolute, logical_not#
- * #OP = !=, ==#
- **/
-NPY_NO_EXPORT void
-BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- if (run_unary_simd_@kind@_BOOL(args, dimensions, steps)) {
- return;
- }
- else {
- UNARY_LOOP {
- npy_bool in1 = *(npy_bool *)ip1;
- *((npy_bool *)op1) = in1 @OP@ 0;
- }
- }
-}
-/**end repeat**/
-
NPY_NO_EXPORT void
BOOL__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
@@ -516,24 +407,6 @@ BOOL__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps,
}
}
-
-/**begin repeat
- * #kind = isnan, isinf, isfinite#
- * #func = npy_isnan, npy_isinf, npy_isfinite#
- * #val = NPY_FALSE, NPY_FALSE, NPY_TRUE#
- **/
-NPY_NO_EXPORT void
-BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- /*
- * The (void)in; suppresses an unused variable warning raised by gcc and allows
- * us to re-use this macro even though we do not depend on in
- */
- UNARY_LOOP_FAST(npy_bool, npy_bool, (void)in; *out = @val@);
-}
-
-/**end repeat**/
-
/*
*****************************************************************************
** INTEGER LOOPS
@@ -552,8 +425,11 @@ BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void
*/
#define @TYPE@_floor_divide @TYPE@_divide
+#define @TYPE@_floor_divide_indexed @TYPE@_divide_indexed
#define @TYPE@_fmax @TYPE@_maximum
+#define @TYPE@_fmax_indexed @TYPE@_maximum_indexed
#define @TYPE@_fmin @TYPE@_minimum
+#define @TYPE@_fmin_indexed @TYPE@_minimum_indexed
NPY_NO_EXPORT void
@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
@@ -562,163 +438,30 @@ NPY_NO_EXPORT void
*((@type@ *)op1) = 1;
}
}
-
-NPY_NO_EXPORT void
-@TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- UNARY_LOOP_FAST(@type@, @type@, *out = +in);
-}
-
/**begin repeat1
- * #isa = , _avx2#
- * #CHK = 1, defined(HAVE_ATTRIBUTE_TARGET_AVX2)#
- * #ATTR = , NPY_GCC_TARGET_AVX2#
- */
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_square@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
- UNARY_LOOP_FAST(@type@, @type@, *out = in * in);
-}
-#endif
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_reciprocal@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
- UNARY_LOOP_FAST(@type@, @type@, *out = 1.0 / in);
-}
-#endif
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_conjugate@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- UNARY_LOOP_FAST(@type@, @type@, *out = in);
-}
-#endif
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_negative@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- UNARY_LOOP_FAST(@type@, @type@, *out = -in);
-}
-#endif
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_logical_not@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- UNARY_LOOP_FAST(@type@, npy_bool, *out = !in);
-}
-#endif
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_invert@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- UNARY_LOOP_FAST(@type@, @type@, *out = ~in);
-}
-#endif
-
-/**begin repeat2
* Arithmetic
* #kind = add, subtract, multiply, bitwise_and, bitwise_or, bitwise_xor#
* #OP = +, -, *, &, |, ^#
*/
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- if (IS_BINARY_REDUCE) {
- BINARY_REDUCE_LOOP_FAST(@type@, io1 @OP@= in2);
- }
- else {
- BINARY_LOOP_FAST(@type@, @type@, *out = in1 @OP@ in2);
- }
-}
-#endif
-
-/**end repeat2**/
-
-/*
- * Arithmetic bit shift operations.
- *
- * Intel hardware masks bit shift values, so large shifts wrap around
- * and can produce surprising results. The special handling ensures that
- * behavior is independent of compiler or hardware.
- * TODO: We could implement consistent behavior for negative shifts,
- * which is undefined in C.
- */
-
-#define INT_left_shift_needs_clear_floatstatus
-#define UINT_left_shift_needs_clear_floatstatus
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_left_shift@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps,
- void *NPY_UNUSED(func))
-{
- BINARY_LOOP_FAST(@type@, @type@, *out = npy_lshift@c@(in1, in2));
-
-#ifdef @TYPE@_left_shift_needs_clear_floatstatus
- // For some reason, our macOS CI sets an "invalid" flag here, but only
- // for some types.
- npy_clear_floatstatus_barrier((char*)dimensions);
-#endif
-}
-#endif
-
-#undef INT_left_shift_needs_clear_floatstatus
-#undef UINT_left_shift_needs_clear_floatstatus
-
-#if @CHK@
-NPY_NO_EXPORT
-#ifndef NPY_DO_NOT_OPTIMIZE_@TYPE@_right_shift
-NPY_GCC_OPT_3
-#endif
-void
-@TYPE@_right_shift@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps,
- void *NPY_UNUSED(func))
-{
- BINARY_LOOP_FAST(@type@, @type@, *out = npy_rshift@c@(in1, in2));
-}
-#endif
-
-/**begin repeat2
- * #kind = logical_and, logical_or#
- * #OP = &&, ||#
- */
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- /*
- * gcc vectorization of this is not good (PR60575) but manual integer
- * vectorization is too tedious to be worthwhile
- */
- BINARY_LOOP_FAST(@type@, npy_bool, *out = in1 @OP@ in2);
-}
-#endif
-
-/**end repeat2**/
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_logical_xor@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+ char **args, npy_intp const *dimensions, npy_intp const *steps,
+ void *NPY_UNUSED(func))
{
- BINARY_LOOP {
- const int t1 = !!*(@type@ *)ip1;
- const int t2 = !!*(@type@ *)ip2;
- *((npy_bool *)op1) = (t1 != t2);
+ char *ip1 = args[0];
+ char *indx = args[1];
+ char *value = args[2];
+ npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+ npy_intp n = dimensions[0];
+ npy_intp i;
+ @type@ *indexed;
+ for(i = 0; i < n; i++, indx += isindex, value += isb) {
+ indexed = (@type@ *)(ip1 + is1 * *(npy_intp *)indx);
+ *indexed = *indexed @OP@ *(@type@ *)value;
}
+ return 0;
}
-#endif
-
/**end repeat1**/
NPY_NO_EXPORT void
@@ -760,23 +503,6 @@ NPY_NO_EXPORT void
*((@type@ *) op1) = out;
}
}
-
-/**begin repeat1
- * #kind = isnan, isinf, isfinite#
- * #func = npy_isnan, npy_isinf, npy_isfinite#
- * #val = NPY_FALSE, NPY_FALSE, NPY_TRUE#
- **/
-NPY_NO_EXPORT void
-@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- /*
- * The (void)in; suppresses an unused variable warning raised by gcc and allows
- * us to re-use this macro even though we do not depend on in
- */
- UNARY_LOOP_FAST(@type@, npy_bool, (void)in; *out = @val@);
-}
-/**end repeat1**/
-
/**end repeat**/
/**begin repeat
@@ -784,19 +510,6 @@ NPY_NO_EXPORT void
* #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong#
* #c = ,,,l,ll#
*/
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- UNARY_LOOP_FAST(@type@, @type@, *out = (in >= 0) ? in : -in);
-}
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : (in < 0 ? -1 : 0));
-}
-
/**begin repeat1
* #kind = gcd, lcm#
**/
@@ -810,7 +523,6 @@ NPY_NO_EXPORT void
}
}
/**end repeat1**/
-
/**end repeat**/
/**begin repeat
@@ -818,19 +530,6 @@ NPY_NO_EXPORT void
* #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong#
* #c = u,u,u,ul,ull#
*/
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- UNARY_LOOP_FAST(@type@, @type@, *out = in);
-}
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : 0);
-}
-
/**begin repeat1
* #kind = gcd, lcm#
**/
@@ -844,9 +543,50 @@ NPY_NO_EXPORT void
}
}
/**end repeat1**/
+/**end repeat**/
+
+/*
+ * NOTE: It may be nice to vectorize these, OTOH, these are still faster
+ * than the cast we used to do.
+ */
+
+/**begin repeat
+ * #kind = equal, not_equal, less, less_equal, greater, greater_equal#
+ * #OP = ==, !=, <, <=, >, >=#
+ */
+NPY_NO_EXPORT void
+LONGLONG_Qq_bool_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const npy_ulonglong in1 = *(npy_ulonglong *)ip1;
+ const npy_longlong in2 = *(npy_longlong *)ip2;
+ if (in2 < 0) {
+ *(npy_bool *)op1 = 0 @OP@ in2;
+ }
+ else {
+ *(npy_bool *)op1 = in1 @OP@ (npy_ulonglong)in2;
+ }
+ }
+}
+
+NPY_NO_EXPORT void
+LONGLONG_qQ_bool_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP {
+ const npy_longlong in1 = *(npy_longlong *)ip1;
+ const npy_ulonglong in2 = *(npy_ulonglong *)ip2;
+ if (in1 < 0) {
+ *(npy_bool *)op1 = in1 @OP@ 0;
+ }
+ else {
+ *(npy_bool *)op1 = (npy_ulonglong)in1 @OP@ in2;
+ }
+ }
+}
/**end repeat**/
+
/*
*****************************************************************************
** DATETIME LOOPS **
@@ -923,12 +663,6 @@ NPY_NO_EXPORT void
}
NPY_NO_EXPORT void
-@TYPE@_isinf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- UNARY_LOOP_FAST(npy_bool, npy_bool, (void)in; *out = NPY_FALSE);
-}
-
-NPY_NO_EXPORT void
@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
OUTPUT_LOOP {
@@ -987,6 +721,7 @@ NPY_NO_EXPORT void
}
}
}
+
/**end repeat1**/
/**begin repeat1
@@ -1406,6 +1141,8 @@ TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const *
* #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
* #c = f, , l#
* #C = F, , L#
+ * #fd = 1, 1, 0#
+ * #VCHK = 1, 1, 0#
*/
/**begin repeat1
* #kind = logical_and, logical_or#
@@ -1442,32 +1179,22 @@ NPY_NO_EXPORT void
}
}
+#if !@fd@
/**begin repeat1
* #kind = isnan, isinf, isfinite, signbit#
* #func = npy_isnan, npy_isinf, npy_isfinite, npy_signbit#
**/
-
-/**begin repeat2
- * #ISA = , _avx512_skx#
- * #isa = simd, avx512_skx#
- * #CHK = 1, defined(HAVE_ATTRIBUTE_TARGET_AVX512_SKX)#
- **/
-
-#if @CHK@
NPY_NO_EXPORT void
-@TYPE@_@kind@@ISA@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
- if (!run_@kind@_@isa@_@TYPE@(args, dimensions, steps)) {
- UNARY_LOOP {
- const @type@ in1 = *(@type@ *)ip1;
- *((npy_bool *)op1) = @func@(in1) != 0;
- }
+ UNARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ *((npy_bool *)op1) = @func@(in1) != 0;
}
npy_clear_floatstatus_barrier((char*)dimensions);
}
-#endif
-/**end repeat2**/
/**end repeat1**/
+#endif
NPY_NO_EXPORT void
@TYPE@_spacing(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
@@ -1508,6 +1235,25 @@ NPY_NO_EXPORT void
}
}
+NPY_NO_EXPORT int
+@TYPE@_floor_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+ char **args, npy_intp const *dimensions, npy_intp const *steps,
+ void *NPY_UNUSED(func))
+{
+ char *ip1 = args[0];
+ char *indx = args[1];
+ char *value = args[2];
+ npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+ npy_intp n = dimensions[0];
+ npy_intp i;
+ @type@ *indexed;
+ for(i = 0; i < n; i++, indx += isindex, value += isb) {
+ indexed = (@type@ *)(ip1 + is1 * *(npy_intp *)indx);
+ *indexed = npy_floor_divide@c@(*indexed, *(@type@ *)value);
+ }
+ return 0;
+}
+
NPY_NO_EXPORT void
@TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
@@ -1546,17 +1292,6 @@ NPY_NO_EXPORT void
}
NPY_NO_EXPORT void
-@TYPE@_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- if (!run_unary_simd_negative_@TYPE@(args, dimensions, steps)) {
- UNARY_LOOP {
- const @type@ in1 = *(@type@ *)ip1;
- *((@type@ *)op1) = -in1;
- }
- }
-}
-
-NPY_NO_EXPORT void
@TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
UNARY_LOOP {
@@ -1653,6 +1388,26 @@ LONGDOUBLE_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps
}
}
}
+
+NPY_NO_EXPORT int
+LONGDOUBLE_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+ char **args, npy_intp const *dimensions, npy_intp const *steps,
+ void *NPY_UNUSED(func))
+{
+ char *ip1 = args[0];
+ char *indx = args[1];
+ char *value = args[2];
+ npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+ npy_intp n = dimensions[0];
+ npy_intp i;
+ npy_longdouble *indexed;
+ for(i = 0; i < n; i++, indx += isindex, value += isb) {
+ indexed = (npy_longdouble *)(ip1 + is1 * *(npy_intp *)indx);
+ *indexed = *indexed @OP@ *(npy_longdouble *)value;
+ }
+ return 0;
+}
+
/**end repeat**/
/**begin repeat
@@ -1758,6 +1513,26 @@ HALF_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void
}
}
}
+
+NPY_NO_EXPORT int
+HALF_@kind@_indexed(void *NPY_UNUSED(context),
+ char **args, npy_intp const *dimensions, npy_intp const *steps,
+ void *NPY_UNUSED(func))
+{
+ char *ip1 = args[0];
+ char *indx = args[1];
+ char *value = args[2];
+ npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+ npy_intp n = dimensions[0];
+ npy_intp i;
+ npy_half *indexed;
+ for(i = 0; i < n; i++, indx += isindex, value += isb) {
+ indexed = (npy_half *)(ip1 + is1 * *(npy_intp *)indx);
+ const float v = npy_half_to_float(*(npy_half *)value);
+ *indexed = npy_float_to_half(npy_half_to_float(*indexed) @OP@ v);
+ }
+ return 0;
+}
/**end repeat**/
#define _HALF_LOGICAL_AND(a,b) (!npy_half_iszero(a) && !npy_half_iszero(b))
@@ -1859,6 +1634,27 @@ HALF_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void
}
/* npy_half_isnan will never set floatstatus_invalid, so do not clear */
}
+
+NPY_NO_EXPORT int
+HALF_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+ char **args, npy_intp const *dimensions, npy_intp const *steps,
+ void *NPY_UNUSED(func))
+{
+ char *ip1 = args[0];
+ char *indx = args[1];
+ char *value = args[2];
+ npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+ npy_intp n = dimensions[0];
+ npy_intp i;
+ npy_half *indexed;
+ for(i = 0; i < n; i++, indx += isindex, value += isb) {
+ indexed = (npy_half *)(ip1 + is1 * *(npy_intp *)indx);
+ npy_half v = *(npy_half *)value;
+ *indexed = (@OP@(*indexed, v) || npy_half_isnan(*indexed)) ? *indexed : v;
+ }
+ return 0;
+}
+
/**end repeat**/
/**begin repeat
@@ -1874,8 +1670,29 @@ HALF_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void
const npy_half in2 = *(npy_half *)ip2;
*((npy_half *)op1) = (@OP@(in1, in2) || npy_half_isnan(in2)) ? in1 : in2;
}
- /* npy_half_isnan will never set floatstatus_invalid, so do not clear */
+ /* no need to clear floatstatus_invalid */
+}
+
+NPY_NO_EXPORT int
+HALF_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+ char **args, npy_intp const *dimensions, npy_intp const *steps,
+ void *NPY_UNUSED(func))
+{
+ char *ip1 = args[0];
+ char *indx = args[1];
+ char *value = args[2];
+ npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+ npy_intp n = dimensions[0];
+ npy_intp i;
+ npy_half *indexed;
+ for (i = 0; i < n; i++, indx += isindex, value += isb) {
+ indexed = (npy_half *)(ip1 + is1 * *(npy_intp *)indx);
+ npy_half v = *(npy_half *)value;
+ *indexed = (@OP@(*indexed, v) || npy_half_isnan(v)) ? *indexed: v;
+ }
+ return 0;
}
+
/**end repeat**/
NPY_NO_EXPORT void
@@ -1894,6 +1711,27 @@ HALF_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps
}
}
+NPY_NO_EXPORT int
+HALF_floor_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+ char **args, npy_intp const *dimensions, npy_intp const *steps,
+ void *NPY_UNUSED(func))
+{
+ char *ip1 = args[0];
+ char *indx = args[1];
+ char *value = args[2];
+ npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+ npy_intp n = dimensions[0];
+ npy_intp i;
+ npy_half *indexed;
+ for(i = 0; i < n; i++, indx += isindex, value += isb) {
+ indexed = (npy_half *)(ip1 + is1 * *(npy_intp *)indx);
+ float v = npy_half_to_float(*(npy_half *)value);
+ float div = npy_floor_dividef(npy_half_to_float(*indexed), v);
+ *indexed = npy_float_to_half(div);
+ }
+ return 0;
+}
+
NPY_NO_EXPORT void
HALF_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
@@ -1953,12 +1791,6 @@ HALF_conjugate(char **args, npy_intp const *dimensions, npy_intp const *steps, v
}
}
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-HALF_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- UNARY_LOOP_FAST(npy_half, npy_half, *out = in&0x7fffu);
-}
-
NPY_NO_EXPORT void
HALF_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
@@ -2110,6 +1942,26 @@ NPY_NO_EXPORT void
}
}
}
+
+NPY_NO_EXPORT int @TYPE@_@kind@_indexed
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+ char *ip1 = args[0];
+ char *indx = args[1];
+ char *value = args[2];
+ npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+ npy_intp n = dimensions[0];
+ npy_intp i;
+ @ftype@ *indexed;
+ for(i = 0; i < n; i++, indx += isindex, value += isb) {
+ indexed = (@ftype@ *)(ip1 + is1 * *(npy_intp *)indx);
+ const @ftype@ b_r = ((@ftype@ *)value)[0];
+ const @ftype@ b_i = ((@ftype@ *)value)[1];
+ indexed[0] @OP@= b_r;
+ indexed[1] @OP@= b_i;
+ }
+ return 0;
+}
/**end repeat1**/
NPY_NO_EXPORT void
@@ -2124,6 +1976,28 @@ NPY_NO_EXPORT void
((@ftype@ *)op1)[1] = in1r*in2i + in1i*in2r;
}
}
+
+NPY_NO_EXPORT int @TYPE@_multiply_indexed
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+ char *ip1 = args[0];
+ char *indx = args[1];
+ char *value = args[2];
+ npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+ npy_intp n = dimensions[0];
+ npy_intp i;
+ @ftype@ *indexed;
+ for(i = 0; i < n; i++, indx += isindex, value += isb) {
+ indexed = (@ftype@ *)(ip1 + is1 * *(npy_intp *)indx);
+ const @ftype@ a_r = indexed[0];
+ const @ftype@ a_i = indexed[1];
+ const @ftype@ b_r = ((@ftype@ *)value)[0];
+ const @ftype@ b_i = ((@ftype@ *)value)[1];
+ indexed[0] = a_r*b_r - a_i*b_i;
+ indexed[1] = a_r*b_i + a_i*b_r;
+ }
+ return 0;
+}
#endif // !SIMD
NPY_NO_EXPORT void
@@ -2235,6 +2109,8 @@ NPY_NO_EXPORT void
}
/**end repeat1**/
+#if !@SIMD@
+// CFLOAT & CDOUBLE defined by 'loops_arithm_fp.dispatch.c.src'
NPY_NO_EXPORT void
@TYPE@_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
@@ -2245,6 +2121,7 @@ NPY_NO_EXPORT void
((@ftype@ *)op1)[1] = in1r*in1i + in1i*in1r;
}
}
+#endif
NPY_NO_EXPORT void
@TYPE@_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
@@ -2275,6 +2152,8 @@ NPY_NO_EXPORT void
}
}
+#if !@SIMD@
+// CFLOAT & CDOUBLE defined by 'loops_arithm_fp.dispatch.c.src'
NPY_NO_EXPORT void
@TYPE@_conjugate(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) {
UNARY_LOOP {
@@ -2294,20 +2173,6 @@ NPY_NO_EXPORT void
*((@ftype@ *)op1) = npy_hypot@c@(in1r, in1i);
}
}
-
-#if @SIMD@ && defined(HAVE_ATTRIBUTE_TARGET_AVX512F)
-/**begin repeat1
- * arithmetic
- * #kind = conjugate, square, absolute#
- */
-NPY_NO_EXPORT void
-@TYPE@_@kind@_avx512f(char **args, const npy_intp *dimensions, const npy_intp *steps, void *func)
-{
- if (!run_unary_avx512f_@kind@_@TYPE@(args, dimensions, steps)) {
- @TYPE@_@kind@(args, dimensions, steps, func);
- }
-}
-/**end repeat1**/
#endif
NPY_NO_EXPORT void
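The many *_indexed loops added above all share one shape: args[0] is the base of the operand being updated in place, args[1] a strided array of npy_intp indices into it, and args[2] the values, so repeated indices accumulate instead of being buffered (the unbuffered behavior ufunc.at relies on). A concrete standalone sketch for a double "add" case (plain C, not the generated template):

#include <stdint.h>
#include <stdio.h>

static int
double_add_indexed(char *base, char *indices, char *values,
                   intptr_t base_step, intptr_t index_step,
                   intptr_t value_step, intptr_t n)
{
    for (intptr_t i = 0; i < n; i++, indices += index_step,
                                     values += value_step) {
        /* base_step * index gives the byte offset of the element to update */
        double *indexed = (double *)(base + base_step * *(intptr_t *)indices);
        *indexed = *indexed + *(double *)values;
    }
    return 0;
}

int main(void)
{
    double data[5] = {0, 0, 0, 0, 0};
    intptr_t idx[3] = {4, 0, 4};     /* index 4 repeats: both adds land */
    double vals[3] = {1.0, 2.0, 3.0};
    double_add_indexed((char *)data, (char *)idx, (char *)vals,
                       sizeof(double), sizeof(intptr_t), sizeof(double), 3);
    printf("%g %g\n", data[0], data[4]);   /* prints: 2 4 */
    return 0;
}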
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 424e204c1..cce73aff8 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -10,24 +10,32 @@
#define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN
#endif
-#define BOOL_invert BOOL_logical_not
-#define BOOL_add BOOL_logical_or
-#define BOOL_bitwise_and BOOL_logical_and
-#define BOOL_bitwise_or BOOL_logical_or
-#define BOOL_logical_xor BOOL_not_equal
-#define BOOL_bitwise_xor BOOL_logical_xor
-#define BOOL_multiply BOOL_logical_and
-#define BOOL_maximum BOOL_logical_or
-#define BOOL_minimum BOOL_logical_and
-#define BOOL_fmax BOOL_maximum
-#define BOOL_fmin BOOL_minimum
-
/*
*****************************************************************************
** BOOLEAN LOOPS **
*****************************************************************************
*/
+/*
+ * The following functions are defined by the umath generator
+ * to enable runtime dispatching without the need
+ * to redefine them within dispatch-able sources.
+ */
+// #define BOOL_invert BOOL_logical_not
+// #define BOOL_add BOOL_logical_or
+// #define BOOL_bitwise_and BOOL_logical_and
+// #define BOOL_bitwise_or BOOL_logical_or
+// #define BOOL_logical_xor BOOL_not_equal
+// #define BOOL_bitwise_xor BOOL_logical_xor
+// #define BOOL_multiply BOOL_logical_and
+// #define BOOL_maximum BOOL_logical_or
+// #define BOOL_minimum BOOL_logical_and
+// #define BOOL_fmax BOOL_maximum
+// #define BOOL_fmin BOOL_minimum
+
+typedef struct PyArrayMethod_Context_tag PyArrayMethod_Context;
+typedef struct NpyAuxData_tag NpyAuxData;
+
#ifndef NPY_DISABLE_OPTIMIZATION
#include "loops_comparison.dispatch.h"
#endif
@@ -39,11 +47,15 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_@kind@,
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
/**end repeat**/
+#ifndef NPY_DISABLE_OPTIMIZATION
+ #include "loops_logical.dispatch.h"
+#endif
+
/**begin repeat
- * #kind = logical_and, logical_or, absolute, logical_not#
- **/
-NPY_NO_EXPORT void
-BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+ * #kind = logical_and, logical_or, logical_not, absolute#
+ */
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_@kind@,
+ (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
/**end repeat**/
NPY_NO_EXPORT void
@@ -56,6 +68,17 @@ NPY_NO_EXPORT void
BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
/**end repeat**/
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+ #include "loops_autovec.dispatch.h"
+#endif
+/**begin repeat
+ * #kind = isnan, isinf, isfinite#
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_@kind@,
+ (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+/**end repeat**/
+
/*
*****************************************************************************
** INTEGER LOOPS
@@ -72,6 +95,11 @@ BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void
*/
NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_divide,
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+NPY_NO_EXPORT int
+@TYPE@_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+/**end repeat3**/
/**end repeat**/
#ifndef NPY_DISABLE_OPTIMIZATION
@@ -106,19 +134,40 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
/**end repeat1**/
/**end repeat**/
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+ #include "loops_autovec.dispatch.h"
+#endif
/**begin repeat
- * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG#
+ * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+ BYTE, SHORT, INT, LONG, LONGLONG#
*/
+/**begin repeat1
+ * #kind = invert, logical_not, conjugate, reciprocal, square, add,
+ * subtract, multiply, bitwise_and, bitwise_or, bitwise_xor,
+ * left_shift, right_shift, logical_and, logical_or,
+ * logical_xor, isnan, isinf, isfinite,
+ * absolute, sign#
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
+ (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+/**end repeat1**/
+/**end repeat**/
+/**begin repeat
+ * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG#
+ */
/**begin repeat1
* both signed and unsigned integer types
* #s = , u#
* #S = , U#
*/
-
#define @S@@TYPE@_floor_divide @S@@TYPE@_divide
+#define @S@@TYPE@_floor_divide_indexed @S@@TYPE@_divide_indexed
#define @S@@TYPE@_fmax @S@@TYPE@_maximum
#define @S@@TYPE@_fmin @S@@TYPE@_minimum
+#define @S@@TYPE@_fmax_indexed @S@@TYPE@_maximum_indexed
+#define @S@@TYPE@_fmin_indexed @S@@TYPE@_minimum_indexed
NPY_NO_EXPORT void
@S@@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
@@ -127,85 +176,69 @@ NPY_NO_EXPORT void
@S@@TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
/**begin repeat2
- * #isa = , _avx2#
- */
-
-NPY_NO_EXPORT void
-@S@@TYPE@_square@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
-
-NPY_NO_EXPORT void
-@S@@TYPE@_reciprocal@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
-
-NPY_NO_EXPORT void
-@S@@TYPE@_conjugate@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-NPY_NO_EXPORT void
-@S@@TYPE@_negative@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-NPY_NO_EXPORT void
-@S@@TYPE@_logical_not@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-NPY_NO_EXPORT void
-@S@@TYPE@_invert@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-/**begin repeat3
* Arithmetic
* #kind = add, subtract, multiply, bitwise_and, bitwise_or, bitwise_xor,
* left_shift, right_shift#
* #OP = +, -,*, &, |, ^, <<, >>#
*/
-NPY_NO_EXPORT void
-@S@@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-/**end repeat3**/
-
-/**begin repeat3
- * #kind = logical_and, logical_or#
- * #OP = &&, ||#
- */
-NPY_NO_EXPORT void
-@S@@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-/**end repeat3**/
+NPY_NO_EXPORT int
+@S@@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+ npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
-NPY_NO_EXPORT void
-@S@@TYPE@_logical_xor@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
/**end repeat2**/
/**begin repeat2
* #kind = maximum, minimum#
- * #OP = >, <#
**/
NPY_NO_EXPORT void
@S@@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-/**end repeat2**/
-NPY_NO_EXPORT void
-@S@@TYPE@_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+NPY_NO_EXPORT int
+@S@@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
-NPY_NO_EXPORT void
-@S@@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/**end repeat2**/
NPY_NO_EXPORT void
-@S@@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+@S@@TYPE@_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
NPY_NO_EXPORT void
@S@@TYPE@_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
NPY_NO_EXPORT void
@S@@TYPE@_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-/**begin repeat2
- * #kind = isnan, isinf, isfinite#
- **/
-NPY_NO_EXPORT void
-@S@@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
/**end repeat2**/
/**end repeat1**/
+/**end repeat**/
+
+/**begin repeat
+ * #kind = equal, not_equal, less, less_equal, greater, greater_equal#
+ * #OP = ==, !=, <, <=, >, >=#
+ */
+NPY_NO_EXPORT void
+LONGLONG_Qq_bool_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+NPY_NO_EXPORT void
+LONGLONG_qQ_bool_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
/**end repeat**/
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+ #include "loops_unary.dispatch.h"
+#endif
+/**begin repeat
+ * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+ * BYTE, SHORT, INT, LONG, LONGLONG#
+ */
+/**begin repeat1
+ * #kind = negative#
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
+ (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+/**end repeat1**/
+/**end repeat**/
+
+
/*
*****************************************************************************
** FLOAT LOOPS **
@@ -226,6 +259,34 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
/**end repeat**/
#ifndef NPY_DISABLE_OPTIMIZATION
+ #include "loops_unary_fp_le.dispatch.h"
+#endif
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE#
+ */
+/**begin repeat1
+ * #kind = isnan, isinf, isfinite, signbit#
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
+ (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+/**end repeat1**/
+/**end repeat**/
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+ #include "loops_unary.dispatch.h"
+#endif
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
+ */
+/**begin repeat1
+ * #kind = negative#
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
+ (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+/**end repeat1**/
+/**end repeat**/
+
+#ifndef NPY_DISABLE_OPTIMIZATION
#include "loops_arithm_fp.dispatch.h"
#endif
/**begin repeat
@@ -234,10 +295,13 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
/**begin repeat1
* Arithmetic
* # kind = add, subtract, multiply, divide#
- * # OP = +, -, *, /#
*/
NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+NPY_NO_EXPORT int
+@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
/**end repeat1**/
/**end repeat**/
@@ -283,15 +347,6 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_@func@,
/**end repeat**/
/**begin repeat
- * #func = sin, cos#
- */
-
-NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_@func@,
- (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
-
-/**end repeat**/
-
-/**begin repeat
* #TYPE = FLOAT, DOUBLE#
*/
/**begin repeat1
@@ -304,27 +359,20 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@func@,
/**end repeat1**/
/**end repeat**/
-/**begin repeat
- * #TYPE = FLOAT, DOUBLE#
- */
-/**begin repeat1
- * #func = maximum, minimum#
- */
-NPY_NO_EXPORT void
-@TYPE@_@func@_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-/**end repeat1**/
-/**end repeat**/
-
#ifndef NPY_DISABLE_OPTIMIZATION
#include "loops_trigonometric.dispatch.h"
#endif
+
/**begin repeat
+ * #TYPE = FLOAT, DOUBLE#
+ */
+/**begin repeat1
* #func = sin, cos#
*/
-NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_@func@, (
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@func@, (
char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
))
+/**end repeat1**/
/**end repeat**/
#ifndef NPY_DISABLE_OPTIMIZATION
@@ -362,6 +410,8 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, (
* #TYPE = HALF, FLOAT, DOUBLE, LONGDOUBLE#
* #c = f, f, , l#
* #C = F, F, , L#
+ * #half = 1, 0, 0, 0#
+ * #fd = 0, 1, 1, 0#
*/
/**begin repeat1
@@ -371,6 +421,11 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, (
*/
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+/**end repeat1**/
/**end repeat1**/
/**begin repeat1
@@ -390,35 +445,44 @@ NPY_NO_EXPORT void
/**begin repeat1
* #kind = isnan, isinf, isfinite, signbit, copysign, nextafter, spacing#
* #func = npy_isnan, npy_isinf, npy_isfinite, npy_signbit, npy_copysign, nextafter, spacing#
+ * #dispatched = 1, 1, 1, 1, 0, 0, 0#
**/
-/**begin repeat2
- * #ISA = , _avx512_skx#
- **/
+#if !@fd@ || !@dispatched@
NPY_NO_EXPORT void
-@TYPE@_@kind@@ISA@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
/**end repeat2**/
/**end repeat1**/
/**begin repeat1
* #kind = maximum, minimum#
- * #OP = >=, <=#
**/
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
/**end repeat1**/
/**begin repeat1
* #kind = fmax, fmin#
- * #OP = >=, <=#
**/
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
/**end repeat1**/
NPY_NO_EXPORT void
@TYPE@_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+NPY_NO_EXPORT int
+@TYPE@_floor_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
NPY_NO_EXPORT void
@TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
@@ -440,8 +504,10 @@ NPY_NO_EXPORT void
NPY_NO_EXPORT void
@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#if @half@
NPY_NO_EXPORT void
@TYPE@_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
NPY_NO_EXPORT void
@TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
@@ -473,6 +539,19 @@ NPY_NO_EXPORT void
/**end repeat1**/
/**end repeat**/
+#ifndef NPY_DISABLE_OPTIMIZATION
+ #include "loops_autovec.dispatch.h"
+#endif
+/**begin repeat
+ * #TYPE = HALF#
+ */
+/**begin repeat1
+ * #kind = absolute#
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
+ (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+/**end repeat1**/
+/**end repeat**/
/*
*****************************************************************************
** COMPLEX LOOPS **
@@ -485,7 +564,21 @@ NPY_NO_EXPORT void
* #TYPE = CFLOAT, CDOUBLE#
*/
/**begin repeat1
- * #kind = add, subtract, multiply#
+ * #kind = add, subtract, multiply, conjugate, square#
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
+ (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+/**end repeat1**/
+/**end repeat**/
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+ #include "loops_unary_complex.dispatch.h"
+#endif
+/**begin repeat
+ * #TYPE = CFLOAT, CDOUBLE#
+ */
+/**begin repeat1
+ * #kind = absolute#
*/
NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
@@ -512,6 +605,9 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
*/
NPY_NO_EXPORT void
C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+C@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
/**end repeat1**/
NPY_NO_EXPORT void
@@ -554,19 +650,14 @@ C@TYPE@_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *step
NPY_NO_EXPORT void
C@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
-/**begin repeat1
- * #isa = , _avx512f#
- */
-
NPY_NO_EXPORT void
-C@TYPE@_conjugate@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func));
+C@TYPE@_conjugate(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func));
NPY_NO_EXPORT void
-C@TYPE@_absolute@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func));
+C@TYPE@_absolute(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func));
NPY_NO_EXPORT void
-C@TYPE@_square@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data));
-/**end repeat1**/
+C@TYPE@_square(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data));
NPY_NO_EXPORT void
C@TYPE@__arg(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
@@ -627,9 +718,6 @@ NPY_NO_EXPORT void
NPY_NO_EXPORT void
@TYPE@_isfinite(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-NPY_NO_EXPORT void
-@TYPE@_isinf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
#define @TYPE@_isnan @TYPE@_isnat
NPY_NO_EXPORT void
@@ -705,6 +793,21 @@ TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const *
#define TIMEDELTA_md_m_floor_divide TIMEDELTA_md_m_divide
/* #define TIMEDELTA_mm_d_floor_divide TIMEDELTA_mm_d_divide */
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+ #include "loops_autovec.dispatch.h"
+#endif
+/**begin repeat
+ * #TYPE = TIMEDELTA, DATETIME#
+ */
+/**begin repeat1
+ * #kind = isinf#
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
+ (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+/**end repeat1**/
+/**end repeat**/
+
/*
*****************************************************************************
** OBJECT LOOPS **
diff --git a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src
index bf8142880..3ab5a968d 100644
--- a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src
+++ b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src
@@ -1,6 +1,8 @@
/*@targets
** $maxopt baseline
- ** sse2 avx2 avx512f
+ ** sse2 (avx2 fma3)
+ ** neon asimd
+ ** vsx2 vsx3
** vx vxe
**/
#define _UMATHMODULE
@@ -14,707 +16,392 @@
// Provides the various *_LOOP macros
#include "fast_loop_macros.h"
-// TODO: replace raw SIMD with NPYV
+/**
+ * TODO:
+ * - Improve the implementation of SIMD complex absolute;
+ *   the current one is somewhat slow and can be optimized by
+ *   at least avoiding the division and keeping the sqrt.
+ * - Vectorize reductions
+ * - Add support for ASIMD/VCMLA through universal intrinsics.
+ */
+
//###############################################################################
//## Real Single/Double precision
//###############################################################################
/********************************************************************************
- ** Defining the SIMD kernels
+ ** Defining ufunc inner functions
********************************************************************************/
-#ifdef NPY_HAVE_SSE2
+
+/*
+ * clang has a bug that's present at -O1 or greater. When partially loading a
+ * vector register for a divide operation, the remaining elements are set
+ * to 1 to avoid divide-by-zero. The partial load is paired with a partial
+ * store after the divide operation. clang notices that the entire register
+ * is not needed for the store and optimizes out the fill of 1 to the remaining
+ * elements. This causes either a divide-by-zero or 0/0 with invalid exception
+ * that we were trying to avoid by filling.
+ *
+ * Using a dummy variable marked 'volatile' convinces clang not to ignore
+ * the explicit fill of remaining elements. If `-ftrapping-math` is
+ * supported, then it'll also avoid the bug. `-ftrapping-math` is supported
+ * on Apple clang v12+ for x86_64. It is not currently supported for arm64.
+ * `-ftrapping-math` is set by default for NumPy builds in
+ * numpy/distutils/ccompiler.py.
+ *
+ * Note: Apple clang and upstream clang use different version numbers that overlap.
+ */
+#if defined(__clang__)
+ #if defined(__apple_build_version__)
+ // Apple Clang
+ #if __apple_build_version__ < 12000000
+ // Apple Clang before v12
+ #define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 1
+ #elif defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64)
+ // Apple Clang after v12, targeting i386 or x86_64
+ #define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 0
+ #else
+ // Apple Clang after v12, not targeting i386 or x86_64
+ #define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 1
+ #endif
+ #else
+ // Clang, not Apple Clang
+ #if __clang_major__ < 10
+ // Clang before v10
+ #define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 1
+ #elif defined(_MSC_VER)
+ // clang-cl has the same bug
+ #define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 1
+ #elif defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64)
+ // Clang v10+, targeting i386 or x86_64
+ #define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 0
+ #else
+ // Clang v10+, not targeting i386 or x86_64
+ #define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 1
+ #endif
+ #endif
+#else
+// Not a Clang compiler
+#define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 0
+#endif
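A hedged, scalar-only sketch of the workaround pattern described above; it does not reproduce the compiler bug, it only illustrates padding the divisor tail with 1.0 and keeping that fill observable through a volatile-qualified view, as the loops further down do with a dummy volatile vector.

#include <stdio.h>

#define LANES 4   /* stand-in for npyv_nlanes_f64 */

/* stand-in for npyv_load_till_f64: load `len` lanes, pad the rest with `fill` */
static void load_till(double *dst, const double *src, int len, double fill)
{
    for (int i = 0; i < LANES; i++) {
        dst[i] = (i < len) ? src[i] : fill;
    }
}

int main(void)
{
    const double a[3] = {6.0, 9.0, 12.0};
    const double b[3] = {2.0, 3.0, 4.0};
    double pa[LANES], pb[LANES], r[LANES];

    load_till(pa, a, 3, 1.0);
    load_till(pb, b, 3, 1.0);
    /* The volatile view mirrors the dummy 'volatile npyv_@sfx@ b' used below:
     * it stops the compiler from discarding the 1.0 fill, so the unused tail
     * lane computes 1.0/1.0 instead of raising divide-by-zero. */
    volatile double *vb = pb;
    for (int i = 0; i < LANES; i++) {
        r[i] = pa[i] / vb[i];
    }
    for (int i = 0; i < 3; i++) {
        printf("%g ", r[i]);   /* 3 3 3 */
    }
    printf("\n");
    return 0;
}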
+
/**begin repeat
+ * Float types
* #type = npy_float, npy_double#
* #TYPE = FLOAT, DOUBLE#
- * #scalarf = npy_sqrtf, npy_sqrt#
+ * #sfx = f32, f64#
* #c = f, #
- * #vtype = __m128, __m128d#
- * #vtype256 = __m256, __m256d#
- * #vtype512 = __m512, __m512d#
- * #vpre = _mm, _mm#
- * #vpre256 = _mm256, _mm256#
- * #vpre512 = _mm512, _mm512#
- * #vsuf = ps, pd#
- * #vsufs = ss, sd#
- * #nan = NPY_NANF, NPY_NAN#
- * #double = 0, 1#
- * #cast = _mm_castps_si128, _mm_castpd_si128#
+ * #C = F, #
+ * #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64#
*/
/**begin repeat1
-* Arithmetic
-* # kind = add, subtract, multiply, divide#
-* # OP = +, -, *, /#
-* # VOP = add, sub, mul, div#
-*/
-static void
-sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
+ * Arithmetic
+ * # kind = add, subtract, multiply, divide#
+ * # intrin = add, sub, mul, div#
+ * # OP = +, -, *, /#
+ * # PW = 1, 0, 0, 0#
+ * # is_div = 0*3, 1#
+ * # is_mul = 0*2, 1, 0#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
-#ifdef NPY_HAVE_AVX512F
- const npy_intp vector_size_bytes = 64;
- LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
- op[i] = ip1[i] @OP@ ip2[i];
- /* lots of specializations, to squeeze out max performance */
- if (npy_is_aligned(&ip1[i], vector_size_bytes) && npy_is_aligned(&ip2[i], vector_size_bytes)) {
- if (ip1 == ip2) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
- @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, a);
- @vpre512@_store_@vsuf@(&op[i], c);
+ npy_intp len = dimensions[0];
+ char *src0 = args[0], *src1 = args[1], *dst = args[2];
+ npy_intp ssrc0 = steps[0], ssrc1 = steps[1], sdst = steps[2];
+ // reduce
+ if (ssrc0 == 0 && ssrc0 == sdst && src0 == dst) {
+ #if @PW@
+ *((@type@*)src0) @OP@= @TYPE@_pairwise_sum(src1, len, ssrc1);
+ #else
+ @type@ acc = *((@type@*)src0);
+ if (ssrc1 == sizeof(@type@)) {
+ for (; len > 0; --len, src1 += sizeof(@type@)) {
+ acc @OP@= *(@type@ *)src1;
}
- }
- else {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
- @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]);
- @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
- @vpre512@_store_@vsuf@(&op[i], c);
+ } else {
+ for (; len > 0; --len, src1 += ssrc1) {
+ acc @OP@= *(@type@ *)src1;
}
}
+ *((@type@*)src0) = acc;
+ #endif
+ return;
}
- else if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
- @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]);
- @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
- @vpre512@_store_@vsuf@(&op[i], c);
- }
- }
- else if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
- @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]);
- @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
- @vpre512@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- if (ip1 == ip2) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
- @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, a);
- @vpre512@_store_@vsuf@(&op[i], c);
+#if @VECTOR@
+ if (len > npyv_nlanes_@sfx@*2 &&
+ !is_mem_overlap(src0, ssrc0, dst, sdst, len) &&
+ !is_mem_overlap(src1, ssrc1, dst, sdst, len)
+ ) {
+ const int vstep = npyv_nlanes_u8;
+ const int wstep = vstep * 2;
+ const int hstep = npyv_nlanes_@sfx@;
+ const int lstep = hstep * 2;
+ // lots of specializations, to squeeze out max performance
+ if (ssrc0 == sizeof(@type@) && ssrc0 == ssrc1 && ssrc0 == sdst) {
+ for (; len >= lstep; len -= lstep, src0 += wstep, src1 += wstep, dst += wstep) {
+ npyv_@sfx@ a0 = npyv_load_@sfx@((const @type@*)src0);
+ npyv_@sfx@ a1 = npyv_load_@sfx@((const @type@*)(src0 + vstep));
+ npyv_@sfx@ b0 = npyv_load_@sfx@((const @type@*)src1);
+ npyv_@sfx@ b1 = npyv_load_@sfx@((const @type@*)(src1 + vstep));
+ npyv_@sfx@ r0 = npyv_@intrin@_@sfx@(a0, b0);
+ npyv_@sfx@ r1 = npyv_@intrin@_@sfx@(a1, b1);
+ npyv_store_@sfx@((@type@*)dst, r0);
+ npyv_store_@sfx@((@type@*)(dst + vstep), r1);
}
- }
- else {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
- @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]);
- @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
- @vpre512@_store_@vsuf@(&op[i], c);
+ #if @is_div@ && WORKAROUND_CLANG_PARTIAL_LOAD_BUG
+ const int vstop = hstep - 1;
+ #else
+ const int vstop = 0;
+ #endif // #if @is_div@ && WORKAROUND_CLANG_PARTIAL_LOAD_BUG
+ for (; len > vstop; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) {
+ #if @is_div@
+ npyv_@sfx@ a = npyv_load_till_@sfx@((const @type@*)src0, len, 1.0@c@);
+ npyv_@sfx@ b = npyv_load_till_@sfx@((const @type@*)src1, len, 1.0@c@);
+ #else
+ npyv_@sfx@ a = npyv_load_tillz_@sfx@((const @type@*)src0, len);
+ npyv_@sfx@ b = npyv_load_tillz_@sfx@((const @type@*)src1, len);
+ #endif
+ npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b);
+ npyv_store_till_@sfx@((@type@*)dst, len, r);
}
- }
- }
-#elif defined NPY_HAVE_AVX2
- const npy_intp vector_size_bytes = 32;
- LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
- op[i] = ip1[i] @OP@ ip2[i];
- /* lots of specializations, to squeeze out max performance */
- if (npy_is_aligned(&ip1[i], vector_size_bytes) &&
- npy_is_aligned(&ip2[i], vector_size_bytes)) {
- if (ip1 == ip2) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
- @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, a);
- @vpre256@_store_@vsuf@(&op[i], c);
+ #if @is_div@ && WORKAROUND_CLANG_PARTIAL_LOAD_BUG
+ // last partial iteration for divide, working around the clang partial-load bug
+ if (len > 0) {
+ npyv_@sfx@ a = npyv_load_till_@sfx@((const @type@*)src0, len, 1.0@c@);
+ volatile npyv_@sfx@ b = npyv_load_till_@sfx@((const @type@*)src1, len, 1.0@c@);
+ npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b);
+ npyv_store_till_@sfx@((@type@*)dst, len, r);
}
- }
- else {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
- @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]);
- @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
- @vpre256@_store_@vsuf@(&op[i], c);
+ #endif // #if @is_div@ && WORKAROUND_CLANG_PARTIAL_LOAD_BUG
+ }
+ else if (ssrc0 == 0 && ssrc1 == sizeof(@type@) && sdst == ssrc1) {
+ npyv_@sfx@ a = npyv_setall_@sfx@(*((@type@*)src0));
+ for (; len >= lstep; len -= lstep, src1 += wstep, dst += wstep) {
+ npyv_@sfx@ b0 = npyv_load_@sfx@((const @type@*)src1);
+ npyv_@sfx@ b1 = npyv_load_@sfx@((const @type@*)(src1 + vstep));
+ npyv_@sfx@ r0 = npyv_@intrin@_@sfx@(a, b0);
+ npyv_@sfx@ r1 = npyv_@intrin@_@sfx@(a, b1);
+ npyv_store_@sfx@((@type@*)dst, r0);
+ npyv_store_@sfx@((@type@*)(dst + vstep), r1);
}
- }
- }
- else if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
- @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]);
- @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
- @vpre256@_store_@vsuf@(&op[i], c);
- }
- }
- else if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
- @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]);
- @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
- @vpre256@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- if (ip1 == ip2) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
- @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, a);
- @vpre256@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
- @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]);
- @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
- @vpre256@_store_@vsuf@(&op[i], c);
+ #if (@is_div@ || @is_mul@) && WORKAROUND_CLANG_PARTIAL_LOAD_BUG
+ const int vstop = hstep - 1;
+ #else
+ const int vstop = 0;
+ #endif // #if (@is_div@ || @is_mul@) && WORKAROUND_CLANG_PARTIAL_LOAD_BUG
+ for (; len > vstop; len -= hstep, src1 += vstep, dst += vstep) {
+ #if @is_div@ || @is_mul@
+ npyv_@sfx@ b = npyv_load_till_@sfx@((const @type@*)src1, len, 1.0@c@);
+ #else
+ npyv_@sfx@ b = npyv_load_tillz_@sfx@((const @type@*)src1, len);
+ #endif
+ npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b);
+ npyv_store_till_@sfx@((@type@*)dst, len, r);
}
- }
- }
-#else
- const npy_intp vector_size_bytes = 16;
- LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
- op[i] = ip1[i] @OP@ ip2[i];
- /* lots of specializations, to squeeze out max performance */
- if (npy_is_aligned(&ip1[i], vector_size_bytes) &&
- npy_is_aligned(&ip2[i], vector_size_bytes)) {
- if (ip1 == ip2) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
- @vtype@ c = @vpre@_@VOP@_@vsuf@(a, a);
- @vpre@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
- @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]);
- @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
- @vpre@_store_@vsuf@(&op[i], c);
+ #if (@is_div@ || @is_mul@) && WORKAROUND_CLANG_PARTIAL_LOAD_BUG
+ // last partial iteration for multiply/divide, working around the clang partial-load bug
+ if (len > 0) {
+ volatile npyv_@sfx@ b = npyv_load_till_@sfx@((const @type@*)src1, len, 1.0@c@);
+ npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b);
+ npyv_store_till_@sfx@((@type@*)dst, len, r);
}
- }
- }
- else if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
- @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]);
- @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
- @vpre@_store_@vsuf@(&op[i], c);
- }
- }
- else if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
- @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]);
- @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
- @vpre@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- if (ip1 == ip2) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
- @vtype@ c = @vpre@_@VOP@_@vsuf@(a, a);
- @vpre@_store_@vsuf@(&op[i], c);
+ #endif // #if (@is_div@ || @is_mul@) && WORKAROUND_CLANG_PARTIAL_LOAD_BUG
+ }
+ else if (ssrc1 == 0 && ssrc0 == sizeof(@type@) && sdst == ssrc0) {
+ npyv_@sfx@ b = npyv_setall_@sfx@(*((@type@*)src1));
+ for (; len >= lstep; len -= lstep, src0 += wstep, dst += wstep) {
+ npyv_@sfx@ a0 = npyv_load_@sfx@((const @type@*)src0);
+ npyv_@sfx@ a1 = npyv_load_@sfx@((const @type@*)(src0 + vstep));
+ npyv_@sfx@ r0 = npyv_@intrin@_@sfx@(a0, b);
+ npyv_@sfx@ r1 = npyv_@intrin@_@sfx@(a1, b);
+ npyv_store_@sfx@((@type@*)dst, r0);
+ npyv_store_@sfx@((@type@*)(dst + vstep), r1);
}
- }
- else {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
- @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]);
- @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
- @vpre@_store_@vsuf@(&op[i], c);
+ for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) {
+ #if @is_div@ || @is_mul@
+ npyv_@sfx@ a = npyv_load_till_@sfx@((const @type@*)src0, len, 1.0@c@);
+ #else
+ npyv_@sfx@ a = npyv_load_tillz_@sfx@((const @type@*)src0, len);
+ #endif
+ npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b);
+ npyv_store_till_@sfx@((@type@*)dst, len, r);
}
+ } else {
+ goto loop_scalar;
}
+ npyv_cleanup();
+ return;
}
+loop_scalar:
#endif
- LOOP_BLOCKED_END {
- op[i] = ip1[i] @OP@ ip2[i];
- }
-}
-
-static void
-sse2_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
-{
-#ifdef NPY_HAVE_AVX512F
- const npy_intp vector_size_bytes = 64;
- const @vtype512@ a = @vpre512@_set1_@vsuf@(ip1[0]);
- LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
- op[i] = ip1[0] @OP@ ip2[i];
- if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]);
- @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
- @vpre512@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]);
- @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
- @vpre512@_store_@vsuf@(&op[i], c);
- }
- }
-
-
-#elif defined NPY_HAVE_AVX2
- const npy_intp vector_size_bytes = 32;
- const @vtype256@ a = @vpre256@_set1_@vsuf@(ip1[0]);
- LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
- op[i] = ip1[0] @OP@ ip2[i];
- if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]);
- @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
- @vpre256@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]);
- @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
- @vpre256@_store_@vsuf@(&op[i], c);
- }
- }
-#else
- const npy_intp vector_size_bytes = 16;
- const @vtype@ a = @vpre@_set1_@vsuf@(ip1[0]);
- LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
- op[i] = ip1[0] @OP@ ip2[i];
- if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]);
- @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
- @vpre@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]);
- @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
- @vpre@_store_@vsuf@(&op[i], c);
- }
- }
-#endif
- LOOP_BLOCKED_END {
- op[i] = ip1[0] @OP@ ip2[i];
+ for (; len > 0; --len, src0 += ssrc0, src1 += ssrc1, dst += sdst) {
+ const @type@ a = *((@type@*)src0);
+ const @type@ b = *((@type@*)src1);
+ *((@type@*)dst) = a @OP@ b;
}
}
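The specializations above key off the strides the ufunc machinery passes in: a stride of 0 marks a broadcast scalar operand, which is why one scalar fallback loop covers array+array, scalar+array and array+scalar. A minimal sketch with illustrative names:

#include <stdio.h>
#include <stddef.h>

/* Generic strided add over raw byte pointers, in the spirit of the scalar
 * fallback loop: a stride of 0 simply re-reads the same element each
 * iteration, which is how a broadcast scalar operand is represented. */
static void strided_add_f64(char *src0, ptrdiff_t ssrc0,
                            char *src1, ptrdiff_t ssrc1,
                            char *dst,  ptrdiff_t sdst, ptrdiff_t len)
{
    for (; len > 0; --len, src0 += ssrc0, src1 += ssrc1, dst += sdst) {
        *(double *)dst = *(double *)src0 + *(double *)src1;
    }
}

int main(void)
{
    double a[4] = {1, 2, 3, 4};
    double scalar = 10;
    double out[4];

    /* array + array: both source strides are sizeof(double) */
    strided_add_f64((char *)a, sizeof(double), (char *)a, sizeof(double),
                    (char *)out, sizeof(double), 4);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  /* 2 4 6 8 */

    /* scalar + array: stride 0 broadcasts the scalar */
    strided_add_f64((char *)&scalar, 0, (char *)a, sizeof(double),
                    (char *)out, sizeof(double), 4);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  /* 11 12 13 14 */
    return 0;
}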
-static void
-sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
-#ifdef NPY_HAVE_AVX512F
- const npy_intp vector_size_bytes = 64;
- const @vtype512@ b = @vpre512@_set1_@vsuf@(ip2[0]);
- LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
- op[i] = ip1[i] @OP@ ip2[0];
- if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
- @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
- @vpre512@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
- @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
- @vpre512@_store_@vsuf@(&op[i], c);
- }
- }
-
-#elif defined NPY_HAVE_AVX2
- const npy_intp vector_size_bytes = 32;
- const @vtype256@ b = @vpre256@_set1_@vsuf@(ip2[0]);
- LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
- op[i] = ip1[i] @OP@ ip2[0];
- if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
- @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
- @vpre256@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
- @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
- @vpre256@_store_@vsuf@(&op[i], c);
- }
- }
-#else
- const npy_intp vector_size_bytes = 16;
- const @vtype@ b = @vpre@_set1_@vsuf@(ip2[0]);
- LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
- op[i] = ip1[i] @OP@ ip2[0];
- if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
- @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
- @vpre@_store_@vsuf@(&op[i], c);
- }
- }
- else {
- LOOP_BLOCKED(@type@, vector_size_bytes) {
- @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
- @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
- @vpre@_store_@vsuf@(&op[i], c);
- }
- }
-#endif
- LOOP_BLOCKED_END {
- op[i] = ip1[i] @OP@ ip2[0];
+ char *ip1 = args[0];
+ char *indx = args[1];
+ char *value = args[2];
+ npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+ npy_intp n = dimensions[0];
+ npy_intp i;
+ @type@ *indexed;
+ for(i = 0; i < n; i++, indx += isindex, value += isb) {
+ indexed = (@type@ *)(ip1 + is1 * *(npy_intp *)indx);
+ *indexed = *indexed @OP@ *(@type@ *)value;
}
+ return 0;
}
/**end repeat1**/
/**end repeat**/
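The `_indexed` variants generated above are the unbuffered inner loops used for in-place indexed updates; because every iteration reads and writes through the resolved element pointer, repeated indices accumulate rather than overwrite. A hedged scalar illustration with hypothetical names:

#include <stdio.h>

/* Sketch of the expanded DOUBLE add indexed loop: element indexing
 * instead of byte strides, plain long instead of npy_intp. */
static int double_add_indexed_sketch(double *data, const long *index,
                                     const double *value, long n)
{
    for (long i = 0; i < n; i++) {
        double *indexed = data + index[i];
        *indexed = *indexed + value[i];
    }
    return 0;
}

int main(void)
{
    double data[3]  = {0.0, 0.0, 0.0};
    long   index[4] = {0, 2, 0, 0};      /* index 0 appears three times */
    double value[4] = {1.0, 5.0, 1.0, 1.0};
    double_add_indexed_sketch(data, index, value, 4);
    printf("%g %g %g\n", data[0], data[1], data[2]);  /* 3 0 5 */
    return 0;
}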
-#else // NPY_HAVE_SSE2
+#undef WORKAROUND_CLANG_PARTIAL_LOAD_BUG
-/**begin repeat
- * #type = npy_float, npy_double#
- * #TYPE = FLOAT, DOUBLE#
- * #sfx = f32, f64#
- * #CHK = _F32, _F64#
- */
-#if NPY_SIMD@CHK@
-/**begin repeat1
-* Arithmetic
-* # kind = add, subtract, multiply, divide#
-* # OP = +, -, *, /#
-* # VOP = add, sub, mul, div#
-*/
-
-static void
-simd_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
-{
- LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) {
- op[i] = ip1[i] @OP@ ip2[i];
- }
- /* lots of specializations, to squeeze out max performance */
- if (ip1 == ip2) {
- LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
- npyv_@sfx@ a = npyv_load_@sfx@(&ip1[i]);
- npyv_@sfx@ c = npyv_@VOP@_@sfx@(a, a);
- npyv_store_@sfx@(&op[i], c);
- }
- }
- else {
- LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
- npyv_@sfx@ a = npyv_load_@sfx@(&ip1[i]);
- npyv_@sfx@ b = npyv_load_@sfx@(&ip2[i]);
- npyv_@sfx@ c = npyv_@VOP@_@sfx@(a, b);
- npyv_store_@sfx@(&op[i], c);
- }
- }
- LOOP_BLOCKED_END {
- op[i] = ip1[i] @OP@ ip2[i];
- }
-}
+//###############################################################################
+//## Complex Single/Double precision
+//###############################################################################
-static void
-simd_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
-{
- const npyv_@sfx@ v1 = npyv_setall_@sfx@(ip1[0]);
- LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) {
- op[i] = ip1[0] @OP@ ip2[i];
- }
- LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
- npyv_@sfx@ v2 = npyv_load_@sfx@(&ip2[i]);
- npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2);
- npyv_store_@sfx@(&op[i], v3);
- }
- LOOP_BLOCKED_END {
- op[i] = ip1[0] @OP@ ip2[i];
- }
-}
+/********************************************************************************
+ ** op intrinsics
+ ********************************************************************************/
-static void
-simd_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
+#if NPY_SIMD_F32
+NPY_FINLINE npyv_f32x2 simd_set2_f32(const float *a)
{
- const npyv_@sfx@ v2 = npyv_setall_@sfx@(ip2[0]);
- LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) {
- op[i] = ip1[i] @OP@ ip2[0];
- }
- LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
- npyv_@sfx@ v1 = npyv_load_@sfx@(&ip1[i]);
- npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2);
- npyv_store_@sfx@(&op[i], v3);
- }
- LOOP_BLOCKED_END {
- op[i] = ip1[i] @OP@ ip2[0];
- }
+ npyv_f32 fill = npyv_reinterpret_f32_u64(npyv_setall_u64(*(npy_uint64*)a));
+ npyv_f32x2 r;
+ r.val[0] = fill;
+ r.val[1] = fill;
+ return r;
}
-/**end repeat1**/
-#endif /* NPY_SIMD@CHK@ */
-/**end repeat**/
-#endif // NPY_HAVE_SSE2
-/**begin repeat
- * Float types
- * #type = npy_float, npy_double, npy_longdouble#
- * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
- * #vector = 1, 1, 0#
- * #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64, 0 #
- */
-/**begin repeat1
- * Arithmetic
- * # kind = add, subtract, multiply, divide#
- */
-static NPY_INLINE int
-run_binary_simd_@kind@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
+NPY_FINLINE npyv_f32
+simd_cconjugate_f32(npyv_f32 x)
{
-#if @vector@ && defined NPY_HAVE_SSE2
- @type@ * ip1 = (@type@ *)args[0];
- @type@ * ip2 = (@type@ *)args[1];
- @type@ * op = (@type@ *)args[2];
- npy_intp n = dimensions[0];
-#if defined NPY_HAVE_AVX512F
- const npy_uintp vector_size_bytes = 64;
-#elif defined NPY_HAVE_AVX2
- const npy_uintp vector_size_bytes = 32;
+#if NPY_SIMD_BIGENDIAN
+ const npyv_f32 mask = npyv_reinterpret_f32_u64(npyv_setall_u64(0x80000000));
#else
- const npy_uintp vector_size_bytes = 32;
-#endif
- /* argument one scalar */
- if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), vector_size_bytes)) {
- sse2_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n);
- return 1;
- }
- /* argument two scalar */
- else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), vector_size_bytes)) {
- sse2_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n);
- return 1;
- }
- else if (IS_BLOCKABLE_BINARY(sizeof(@type@), vector_size_bytes)) {
- sse2_binary_@kind@_@TYPE@(op, ip1, ip2, n);
- return 1;
- }
-#elif @VECTOR@
- @type@ * ip1 = (@type@ *)args[0];
- @type@ * ip2 = (@type@ *)args[1];
- @type@ * op = (@type@ *)args[2];
- npy_intp n = dimensions[0];
- /* argument one scalar */
- if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), NPY_SIMD_WIDTH)) {
- simd_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n);
- return 1;
- }
- /* argument two scalar */
- else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH)) {
- simd_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n);
- return 1;
- }
- else if (IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH)) {
- simd_binary_@kind@_@TYPE@(op, ip1, ip2, n);
- return 1;
- }
+ const npyv_f32 mask = npyv_reinterpret_f32_u64(npyv_setall_u64(0x8000000000000000ULL));
#endif
- return 0;
+ return npyv_xor_f32(x, mask);
}
-/**end repeat1**/
-/**end repeat**/
-/********************************************************************************
- ** Defining ufunc inner functions
- ********************************************************************************/
-/**begin repeat
- * Float types
- * #type = npy_float, npy_double#
- * #TYPE = FLOAT, DOUBLE#
- * #c = f, #
- * #C = F, #
- */
-/**begin repeat1
- * Arithmetic
- * # kind = add, subtract, multiply, divide#
- * # OP = +, -, *, /#
- * # PW = 1, 0, 0, 0#
- */
-NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
-(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+NPY_FINLINE npyv_f32
+simd_cmul_f32(npyv_f32 a, npyv_f32 b)
{
- if (IS_BINARY_REDUCE) {
-#if @PW@
- @type@ * iop1 = (@type@ *)args[0];
- npy_intp n = dimensions[0];
-
- *iop1 @OP@= @TYPE@_pairwise_sum(args[1], n, steps[1]);
-#else
- BINARY_REDUCE_LOOP(@type@) {
- io1 @OP@= *(@type@ *)ip2;
- }
- *((@type@ *)iop1) = io1;
-#endif
- }
- else if (!run_binary_simd_@kind@_@TYPE@(args, dimensions, steps)) {
- BINARY_LOOP {
- const @type@ in1 = *(@type@ *)ip1;
- const @type@ in2 = *(@type@ *)ip2;
- *((@type@ *)op1) = in1 @OP@ in2;
- }
- }
+ npyv_f32 b_rev = npyv_permi128_f32(b, 1, 0, 3, 2);
+ npyv_f32 a_re = npyv_permi128_f32(a, 0, 0, 2, 2);
+ npyv_f32 a_im = npyv_permi128_f32(a, 1, 1, 3, 3);
+ // a_im * b_im, a_im * b_re
+ npyv_f32 ab_iiir = npyv_mul_f32(a_im, b_rev);
+ return npyv_muladdsub_f32(a_re, b, ab_iiir);
}
-/**end repeat1**/
-/**end repeat**/
-//###############################################################################
-//## Complex Single/Double precision
-//###############################################################################
-/********************************************************************************
- ** Defining the SIMD kernels
- ********************************************************************************/
-#if !defined(_MSC_VER) && defined(NPY_HAVE_AVX512F)
- /**
- * For somehow MSVC commit aggressive optimization lead
- * to raises 'RuntimeWarning: invalid value encountered in multiply'
- *
- * the issue mainly caused by '_mm512_maskz_loadu_ps', we need to
- * investigate about it while moving to NPYV.
- */
- #define AVX512F_NOMSVC
+NPY_FINLINE npyv_f32
+simd_csquare_f32(npyv_f32 x)
+{ return simd_cmul_f32(x, x); }
#endif
-#ifdef AVX512F_NOMSVC
-NPY_FINLINE __mmask16
-avx512_get_full_load_mask_ps(void)
-{
- return 0xFFFF;
-}
-
-NPY_FINLINE __mmask8
-avx512_get_full_load_mask_pd(void)
-{
- return 0xFF;
-}
-NPY_FINLINE __m512
-avx512_masked_load_ps(__mmask16 mask, npy_float* addr)
-{
- return _mm512_maskz_loadu_ps(mask, (__m512 *)addr);
-}
+#if NPY_SIMD_F64
-NPY_FINLINE __m512d
-avx512_masked_load_pd(__mmask8 mask, npy_double* addr)
+NPY_FINLINE npyv_f64x2 simd_set2_f64(const double *a)
{
- return _mm512_maskz_loadu_pd(mask, (__m512d *)addr);
+ npyv_f64 r = npyv_setall_f64(a[0]);
+ npyv_f64 i = npyv_setall_f64(a[1]);
+ return npyv_zip_f64(r, i);
}
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
-avx512_get_partial_load_mask_ps(const npy_int num_elem, const npy_int total_elem)
+NPY_FINLINE npyv_f64
+simd_cconjugate_f64(npyv_f64 x)
{
- return (0x0001 << num_elem) - 0x0001;
+ const npyv_f64 mask = npyv_reinterpret_f64_u64(npyv_set_u64(
+ 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL,
+ 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL,
+ 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL,
+ 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL,
+ 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL,
+ 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL,
+ 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL,
+ 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL
+ ));
+ return npyv_xor_f64(x, mask);
}
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
-avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem)
-{
- return (0x01 << num_elem) - 0x01;
-}
-/**begin repeat
- * #vsub = ps, pd#
- * #type= npy_float, npy_double#
- * #epi_vsub = epi32, epi64#
- * #vtype = __m512, __m512d#
- * #mask = __mmask16, __mmask8#
- * #and_const = 0x7fffffff, 0x7fffffffffffffffLL#
- * #neg_mask = 0x80000000, 0x8000000000000000#
- * #perm_ = 0xb1, 0x55#
- * #cmpx_img_mask = 0xAAAA, 0xAA#
- * #cmpx_re_mask = 0x5555, 0x55#
- * #INF = NPY_INFINITYF, NPY_INFINITY#
- * #NAN = NPY_NANF, NPY_NAN#
- */
-NPY_FINLINE @vtype@
-avx512_hadd_@vsub@(const @vtype@ x)
+NPY_FINLINE npyv_f64
+simd_cmul_f64(npyv_f64 a, npyv_f64 b)
{
- return _mm512_add_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@));
+ npyv_f64 b_rev = npyv_permi128_f64(b, 1, 0);
+ npyv_f64 a_re = npyv_permi128_f64(a, 0, 0);
+ npyv_f64 a_im = npyv_permi128_f64(a, 1, 1);
+ // a_im * b_im, a_im * b_re
+ npyv_f64 ab_iiir = npyv_mul_f64(a_im, b_rev);
+ return npyv_muladdsub_f64(a_re, b, ab_iiir);
}
-NPY_FINLINE @vtype@
-avx512_hsub_@vsub@(const @vtype@ x)
-{
- return _mm512_sub_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@));
-}
-NPY_FINLINE @vtype@
-avx512_cmul_@vsub@(@vtype@ x1, @vtype@ x2)
-{
- // x1 = r1, i1
- // x2 = r2, i2
- @vtype@ x3 = _mm512_permute_@vsub@(x2, @perm_@); // i2, r2
- @vtype@ x12 = _mm512_mul_@vsub@(x1, x2); // r1*r2, i1*i2
- @vtype@ x13 = _mm512_mul_@vsub@(x1, x3); // r1*i2, r2*i1
- @vtype@ outreal = avx512_hsub_@vsub@(x12); // r1*r2 - i1*i2, r1*r2 - i1*i2
- @vtype@ outimg = avx512_hadd_@vsub@(x13); // r1*i2 + i1*r2, r1*i2 + i1*r2
- return _mm512_mask_blend_@vsub@(@cmpx_img_mask@, outreal, outimg);
-}
-/**end repeat**/
+NPY_FINLINE npyv_f64
+simd_csquare_f64(npyv_f64 x)
+{ return simd_cmul_f64(x, x); }
#endif
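As a sanity check on the helpers above: npyv_muladdsub multiplies and then subtracts on the even (real) lanes and adds on the odd (imaginary) lanes, so simd_cmul_* yields the usual complex product (a_r*b_r - a_i*b_i, a_r*b_i + a_i*b_r). A hedged scalar reference with illustrative names:

#include <stdio.h>

/* Scalar reference for the lane-pair math done by simd_cmul_f32/f64:
 * even lane: a_r*b_r - a_i*b_i, odd lane: a_r*b_i + a_i*b_r. */
static void cmul_ref(const double a[2], const double b[2], double out[2])
{
    out[0] = a[0]*b[0] - a[1]*b[1];
    out[1] = a[0]*b[1] + a[1]*b[0];
}

/* conjugate flips the sign of the imaginary lane; square is cmul(x, x) */
static void cconj_ref(const double a[2], double out[2])
{
    out[0] = a[0];
    out[1] = -a[1];
}

int main(void)
{
    const double a[2] = {1.0, 2.0};   /* 1 + 2j */
    const double b[2] = {3.0, 4.0};   /* 3 + 4j */
    double m[2], c[2], s[2];
    cmul_ref(a, b, m);                /* -5 + 10j */
    cconj_ref(a, c);                  /*  1 -  2j */
    cmul_ref(a, a, s);                /* -3 +  4j  (square) */
    printf("mul: %g%+gj  conj: %g%+gj  square: %g%+gj\n",
           m[0], m[1], c[0], c[1], s[0], s[1]);
    return 0;
}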
/**begin repeat
- * #TYPE = CFLOAT, CDOUBLE#
* #type = npy_float, npy_double#
- * #num_lanes = 16, 8#
- * #vsuffix = ps, pd#
- * #epi_vsub = epi32, epi64#
- * #mask = __mmask16, __mmask8#
- * #vtype = __m512, __m512d#
- * #scale = 4, 8#
- * #vindextype = __m512i, __m256i#
- * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256#
- * #storemask = 0xFF, 0xF#
- * #IS_FLOAT = 1, 0#
- */
-/**begin repeat1
- * #func = add, subtract, multiply#
- * #vectorf = _mm512_add, _mm512_sub, avx512_cmul#
- */
-#if defined AVX512F_NOMSVC
-static NPY_INLINE void
-AVX512F_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps)
-{
- const npy_intp array_size = dimensions[0];
- npy_intp num_remaining_elements = 2*array_size;
- @type@* ip1 = (@type@*) args[0];
- @type@* ip2 = (@type@*) args[1];
- @type@* op = (@type@*) args[2];
-
- @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
-
- while (num_remaining_elements > 0) {
- if (num_remaining_elements < @num_lanes@) {
- load_mask = avx512_get_partial_load_mask_@vsuffix@(
- num_remaining_elements, @num_lanes@);
- }
- @vtype@ x1, x2;
- x1 = avx512_masked_load_@vsuffix@(load_mask, ip1);
- x2 = avx512_masked_load_@vsuffix@(load_mask, ip2);
-
- @vtype@ out = @vectorf@_@vsuffix@(x1, x2);
-
- _mm512_mask_storeu_@vsuffix@(op, load_mask, out);
-
- ip1 += @num_lanes@;
- ip2 += @num_lanes@;
- op += @num_lanes@;
- num_remaining_elements -= @num_lanes@;
- }
-}
-#endif // AVX512F_NOMSVC
-/**end repeat1**/
-/**end repeat**/
-
-/**begin repeat
- * #TYPE = CFLOAT, CDOUBLE#
- * #type= npy_float, npy_double#
- * #esize = 8, 16#
- */
-/**begin repeat1
- * #func = add, subtract, multiply#
+ * #sfx = f32, f64#
+ * #bsfx = b32, b64#
+ * #usfx = b32, u64#
+ * #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64#
+ * #is_double = 0, 1#
+ * #c = f, #
+ * #INF = NPY_INFINITYF, NPY_INFINITY#
+ * #NAN = NPY_NANF, NPY_NAN#
*/
-static NPY_INLINE int
-run_binary_avx512f_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps)
+#if @VECTOR@
+NPY_FINLINE npyv_@sfx@
+simd_cabsolute_@sfx@(npyv_@sfx@ re, npyv_@sfx@ im)
{
-#if defined AVX512F_NOMSVC
- if (IS_BINARY_STRIDE_ONE(@esize@, 64)) {
- AVX512F_@func@_@TYPE@(args, dimensions, steps);
- return 1;
- }
- else
- return 0;
-#endif
- return 0;
+ const npyv_@sfx@ inf = npyv_setall_@sfx@(@INF@);
+ const npyv_@sfx@ nan = npyv_setall_@sfx@(@NAN@);
+
+ re = npyv_abs_@sfx@(re);
+ im = npyv_abs_@sfx@(im);
+ /*
+ * If real or imag = INF, then convert it to inf + j*inf
+ * Handles: inf + j*nan, nan + j*inf
+ */
+ npyv_@bsfx@ re_infmask = npyv_cmpeq_@sfx@(re, inf);
+ npyv_@bsfx@ im_infmask = npyv_cmpeq_@sfx@(im, inf);
+ im = npyv_select_@sfx@(re_infmask, inf, im);
+ re = npyv_select_@sfx@(im_infmask, inf, re);
+ /*
+ * If real or imag = NAN, then convert it to nan + j*nan
+ * Handles: x + j*nan, nan + j*x
+ */
+ npyv_@bsfx@ re_nnanmask = npyv_notnan_@sfx@(re);
+ npyv_@bsfx@ im_nnanmask = npyv_notnan_@sfx@(im);
+ im = npyv_select_@sfx@(re_nnanmask, im, nan);
+ re = npyv_select_@sfx@(im_nnanmask, re, nan);
+
+ npyv_@sfx@ larger = npyv_max_@sfx@(re, im);
+ npyv_@sfx@ smaller = npyv_min_@sfx@(im, re);
+ /*
+ * Calculate div_mask to prevent 0./0. and inf/inf operations in div
+ */
+ npyv_@bsfx@ zeromask = npyv_cmpeq_@sfx@(larger, npyv_zero_@sfx@());
+ npyv_@bsfx@ infmask = npyv_cmpeq_@sfx@(smaller, inf);
+ npyv_@bsfx@ div_mask = npyv_not_@bsfx@(npyv_or_@bsfx@(zeromask, infmask));
+
+ npyv_@sfx@ ratio = npyv_ifdivz_@sfx@(div_mask, smaller, larger);
+ npyv_@sfx@ hypot = npyv_sqrt_@sfx@(
+ npyv_muladd_@sfx@(ratio, ratio, npyv_setall_@sfx@(1.0@c@)
+ ));
+ return npyv_mul_@sfx@(hypot, larger);
}
-/**end repeat1**/
+#endif // VECTOR
/**end repeat**/
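The helper above computes the complex magnitude in the overflow-safe form |z| = larger * sqrt(1 + (smaller/larger)^2), with the masks handling 0/0 and inf/inf in the ratio. A hedged scalar rendition of the same flow:

#include <stdio.h>
#include <math.h>

/* Scalar rendition of simd_cabsolute_*: scale by the larger magnitude so the
 * intermediate square cannot overflow, and route inf/nan like the SIMD path. */
static double cabs_ref(double re, double im)
{
    re = fabs(re);
    im = fabs(im);
    /* inf wins over nan: inf + j*nan and nan + j*inf both give inf */
    if (isinf(re) || isinf(im)) {
        return INFINITY;
    }
    if (isnan(re) || isnan(im)) {
        return NAN;
    }
    double larger  = re > im ? re : im;
    double smaller = re > im ? im : re;
    if (larger == 0.0) {
        return 0.0;        /* avoid 0/0 in the ratio */
    }
    double ratio = smaller / larger;
    return larger * sqrt(1.0 + ratio * ratio);
}

int main(void)
{
    printf("%g\n", cabs_ref(3.0, 4.0));       /* 5 */
    printf("%g\n", cabs_ref(1e300, 1e300));   /* ~1.41421e+300, no overflow */
    printf("%g\n", cabs_ref(INFINITY, NAN));  /* inf */
    return 0;
}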
/********************************************************************************
@@ -724,55 +411,345 @@ run_binary_avx512f_@func@_@TYPE@(char **args, const npy_intp *dimensions, const
* complex types
* #TYPE = CFLOAT, CDOUBLE#
* #ftype = npy_float, npy_double#
+ * #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64#
+ * #sfx = f32, f64#
* #c = f, #
* #C = F, #
*/
/**begin repeat1
* arithmetic
- * #kind = add, subtract#
- * #OP = +, -#
- * #PW = 1, 0#
+ * #kind = add, subtract, multiply#
+ * #vectorf = npyv_add, npyv_sub, simd_cmul#
+ * #OP = +, -, *#
+ * #PW = 1, 0, 0#
+ * #is_mul = 0*2, 1#
*/
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
- // Parenthesis around @PW@ tells clang dead code is intentional
- if (IS_BINARY_REDUCE && (@PW@)) {
- npy_intp n = dimensions[0];
- @ftype@ * or = ((@ftype@ *)args[0]);
- @ftype@ * oi = ((@ftype@ *)args[0]) + 1;
+ npy_intp len = dimensions[0];
+ char *b_src0 = args[0], *b_src1 = args[1], *b_dst = args[2];
+ npy_intp b_ssrc0 = steps[0], b_ssrc1 = steps[1], b_sdst = steps[2];
+#if @PW@
+ // reduce
+ if (b_ssrc0 == 0 && b_ssrc0 == b_sdst && b_src0 == b_dst &&
+ b_ssrc1 % (sizeof(@ftype@)*2) == 0
+ ) {
+ @ftype@ *rl_im = (@ftype@ *)b_src0;
@ftype@ rr, ri;
-
- @TYPE@_pairwise_sum(&rr, &ri, args[1], n * 2, steps[1] / 2);
- *or @OP@= rr;
- *oi @OP@= ri;
+ @TYPE@_pairwise_sum(&rr, &ri, b_src1, len * 2, b_ssrc1 / 2);
+ rl_im[0] @OP@= rr;
+ rl_im[1] @OP@= ri;
return;
}
- if (!run_binary_avx512f_@kind@_@TYPE@(args, dimensions, steps)) {
- BINARY_LOOP {
- const @ftype@ in1r = ((@ftype@ *)ip1)[0];
- const @ftype@ in1i = ((@ftype@ *)ip1)[1];
- const @ftype@ in2r = ((@ftype@ *)ip2)[0];
- const @ftype@ in2i = ((@ftype@ *)ip2)[1];
- ((@ftype@ *)op1)[0] = in1r @OP@ in2r;
- ((@ftype@ *)op1)[1] = in1i @OP@ in2i;
+#endif
+#if @VECTOR@
+ if (is_mem_overlap(b_src0, b_ssrc0, b_dst, b_sdst, len) ||
+ is_mem_overlap(b_src1, b_ssrc1, b_dst, b_sdst, len) ||
+ b_sdst % sizeof(@ftype@) != 0 || b_sdst == 0 ||
+ b_ssrc0 % sizeof(@ftype@) != 0 ||
+ b_ssrc1 % sizeof(@ftype@) != 0
+ ) {
+ goto loop_scalar;
+ }
+ const @ftype@ *src0 = (@ftype@*)b_src0;
+ const @ftype@ *src1 = (@ftype@*)b_src1;
+ @ftype@ *dst = (@ftype@*)b_dst;
+
+ const npy_intp ssrc0 = b_ssrc0 / sizeof(@ftype@);
+ const npy_intp ssrc1 = b_ssrc1 / sizeof(@ftype@);
+ const npy_intp sdst = b_sdst / sizeof(@ftype@);
+
+ const int vstep = npyv_nlanes_@sfx@;
+ const int wstep = vstep * 2;
+ const int hstep = vstep / 2;
+
+ const int loadable0 = npyv_loadable_stride_s64(ssrc0);
+ const int loadable1 = npyv_loadable_stride_s64(ssrc1);
+ const int storable = npyv_storable_stride_s64(sdst);
+
+ // lots**lots of specializations, to squeeze out max performance
+ // contig
+ if (ssrc0 == 2 && ssrc0 == ssrc1 && ssrc0 == sdst) {
+ for (; len >= vstep; len -= vstep, src0 += wstep, src1 += wstep, dst += wstep) {
+ npyv_@sfx@ a0 = npyv_load_@sfx@(src0);
+ npyv_@sfx@ a1 = npyv_load_@sfx@(src0 + vstep);
+ npyv_@sfx@ b0 = npyv_load_@sfx@(src1);
+ npyv_@sfx@ b1 = npyv_load_@sfx@(src1 + vstep);
+ npyv_@sfx@ r0 = @vectorf@_@sfx@(a0, b0);
+ npyv_@sfx@ r1 = @vectorf@_@sfx@(a1, b1);
+ npyv_store_@sfx@(dst, r0);
+ npyv_store_@sfx@(dst + vstep, r1);
+ }
+ for (; len > 0; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) {
+ npyv_@sfx@ a = npyv_load2_tillz_@sfx@(src0, len);
+ npyv_@sfx@ b = npyv_load2_tillz_@sfx@(src1, len);
+ npyv_@sfx@ r = @vectorf@_@sfx@(a, b);
+ npyv_store2_till_@sfx@(dst, len, r);
+ }
+ }
+ // scalar 0
+ else if (ssrc0 == 0) {
+ npyv_@sfx@x2 a = simd_set2_@sfx@(src0);
+ // contig
+ if (ssrc1 == 2 && sdst == ssrc1) {
+ for (; len >= vstep; len -= vstep, src1 += wstep, dst += wstep) {
+ npyv_@sfx@ b0 = npyv_load_@sfx@(src1);
+ npyv_@sfx@ b1 = npyv_load_@sfx@(src1 + vstep);
+ npyv_@sfx@ r0 = @vectorf@_@sfx@(a.val[0], b0);
+ npyv_@sfx@ r1 = @vectorf@_@sfx@(a.val[1], b1);
+ npyv_store_@sfx@(dst, r0);
+ npyv_store_@sfx@(dst + vstep, r1);
+ }
+ for (; len > 0; len -= hstep, src1 += vstep, dst += vstep) {
+ #if @is_mul@
+ npyv_@sfx@ b = npyv_load2_till_@sfx@(src1, len, 1.0@c@, 1.0@c@);
+ #else
+ npyv_@sfx@ b = npyv_load2_tillz_@sfx@(src1, len);
+ #endif
+ npyv_@sfx@ r = @vectorf@_@sfx@(a.val[0], b);
+ npyv_store2_till_@sfx@(dst, len, r);
+ }
+ }
+ // non-contig
+ else if (loadable1 && storable) {
+ for (; len >= vstep; len -= vstep, src1 += ssrc1*vstep, dst += sdst*vstep) {
+ npyv_@sfx@ b0 = npyv_loadn2_@sfx@(src1, ssrc1);
+ npyv_@sfx@ b1 = npyv_loadn2_@sfx@(src1 + ssrc1*hstep, ssrc1);
+ npyv_@sfx@ r0 = @vectorf@_@sfx@(a.val[0], b0);
+ npyv_@sfx@ r1 = @vectorf@_@sfx@(a.val[1], b1);
+ npyv_storen2_@sfx@(dst, sdst, r0);
+ npyv_storen2_@sfx@(dst + sdst*hstep, sdst, r1);
+ }
+ for (; len > 0; len -= hstep, src1 += ssrc1*hstep, dst += sdst*hstep) {
+ #if @is_mul@
+ npyv_@sfx@ b = npyv_loadn2_till_@sfx@(src1, ssrc1, len, 1.0@c@, 1.0@c@);
+ #else
+ npyv_@sfx@ b = npyv_loadn2_tillz_@sfx@(src1, ssrc1, len);
+ #endif
+ npyv_@sfx@ r = @vectorf@_@sfx@(a.val[0], b);
+ npyv_storen2_till_@sfx@(dst, sdst, len, r);
+ }
+ }
+ else {
+ goto loop_scalar;
+ }
+ }
+ // scalar 1
+ else if (ssrc1 == 0) {
+ npyv_@sfx@x2 b = simd_set2_@sfx@(src1);
+ if (ssrc0 == 2 && sdst == ssrc0) {
+ for (; len >= vstep; len -= vstep, src0 += wstep, dst += wstep) {
+ npyv_@sfx@ a0 = npyv_load_@sfx@(src0);
+ npyv_@sfx@ a1 = npyv_load_@sfx@(src0 + vstep);
+ npyv_@sfx@ r0 = @vectorf@_@sfx@(a0, b.val[0]);
+ npyv_@sfx@ r1 = @vectorf@_@sfx@(a1, b.val[1]);
+ npyv_store_@sfx@(dst, r0);
+ npyv_store_@sfx@(dst + vstep, r1);
+ }
+ for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) {
+ #if @is_mul@
+ npyv_@sfx@ a = npyv_load2_till_@sfx@(src0, len, 1.0@c@, 1.0@c@);
+ #else
+ npyv_@sfx@ a = npyv_load2_tillz_@sfx@(src0, len);
+ #endif
+ npyv_@sfx@ r = @vectorf@_@sfx@(a, b.val[0]);
+ npyv_store2_till_@sfx@(dst, len, r);
+ }
+ }
+ // non-contig
+ else if (loadable0 && storable) {
+ for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep, dst += sdst*vstep) {
+ npyv_@sfx@ a0 = npyv_loadn2_@sfx@(src0, ssrc0);
+ npyv_@sfx@ a1 = npyv_loadn2_@sfx@(src0 + ssrc0*hstep, ssrc0);
+ npyv_@sfx@ r0 = @vectorf@_@sfx@(a0, b.val[0]);
+ npyv_@sfx@ r1 = @vectorf@_@sfx@(a1, b.val[1]);
+ npyv_storen2_@sfx@(dst, sdst, r0);
+ npyv_storen2_@sfx@(dst + sdst*hstep, sdst, r1);
+ }
+ for (; len > 0; len -= hstep, src0 += ssrc0*hstep, dst += sdst*hstep) {
+ #if @is_mul@
+ npyv_@sfx@ a = npyv_loadn2_till_@sfx@(src0, ssrc0, len, 1.0@c@, 1.0@c@);
+ #else
+ npyv_@sfx@ a = npyv_loadn2_tillz_@sfx@(src0, ssrc0, len);
+ #endif
+ npyv_@sfx@ r = @vectorf@_@sfx@(a, b.val[0]);
+ npyv_storen2_till_@sfx@(dst, sdst, len, r);
+ }
}
+ else {
+ goto loop_scalar;
+ }
+ }
+ #if @is_mul@
+ // non-contig
+ else if (loadable0 && loadable1 && storable) {
+ for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep,
+ src1 += ssrc1*vstep, dst += sdst*vstep
+ ) {
+ npyv_@sfx@ a0 = npyv_loadn2_@sfx@(src0, ssrc0);
+ npyv_@sfx@ a1 = npyv_loadn2_@sfx@(src0 + ssrc0*hstep, ssrc0);
+ npyv_@sfx@ b0 = npyv_loadn2_@sfx@(src1, ssrc1);
+ npyv_@sfx@ b1 = npyv_loadn2_@sfx@(src1 + ssrc1*hstep, ssrc1);
+ npyv_@sfx@ r0 = @vectorf@_@sfx@(a0, b0);
+ npyv_@sfx@ r1 = @vectorf@_@sfx@(a1, b1);
+ npyv_storen2_@sfx@(dst, sdst, r0);
+ npyv_storen2_@sfx@(dst + sdst*hstep, sdst, r1);
+ }
+ for (; len > 0; len -= hstep, src0 += ssrc0*hstep,
+ src1 += ssrc1*hstep, dst += sdst*hstep
+ ) {
+ #if @is_mul@
+ npyv_@sfx@ a = npyv_loadn2_till_@sfx@(src0, ssrc0, len, 1.0@c@, 1.0@c@);
+ npyv_@sfx@ b = npyv_loadn2_till_@sfx@(src1, ssrc1, len, 1.0@c@, 1.0@c@);
+ #else
+ npyv_@sfx@ a = npyv_loadn2_tillz_@sfx@(src0, ssrc0, len);
+ npyv_@sfx@ b = npyv_loadn2_tillz_@sfx@(src1, ssrc1, len);
+ #endif
+ npyv_@sfx@ r = @vectorf@_@sfx@(a, b);
+ npyv_storen2_till_@sfx@(dst, sdst, len, r);
+ }
+ }
+ #endif
+ else {
+ goto loop_scalar;
}
+ npyv_cleanup();
+ return;
+loop_scalar:
+#endif
+ for (; len > 0; --len, b_src0 += b_ssrc0, b_src1 += b_ssrc1, b_dst += b_sdst) {
+ const @ftype@ a_r = ((@ftype@ *)b_src0)[0];
+ const @ftype@ a_i = ((@ftype@ *)b_src0)[1];
+ const @ftype@ b_r = ((@ftype@ *)b_src1)[0];
+ const @ftype@ b_i = ((@ftype@ *)b_src1)[1];
+ #if @is_mul@
+ ((@ftype@ *)b_dst)[0] = a_r*b_r - a_i*b_i;
+ ((@ftype@ *)b_dst)[1] = a_r*b_i + a_i*b_r;
+ #else
+ ((@ftype@ *)b_dst)[0] = a_r @OP@ b_r;
+ ((@ftype@ *)b_dst)[1] = a_i @OP@ b_i;
+ #endif
+ }
+}
+
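The scalar tail above is a direct transcription of the complex product
(a_r + i*a_i)(b_r + i*b_i) = (a_r*b_r - a_i*b_i) + i*(a_r*b_i + a_i*b_r). A minimal
standalone sketch of that identity (the helper name cmul_ref is illustrative, not part
of the patch) can be used to cross-check the vectorized path:

#include <stdio.h>

/* reference complex multiply: (ar + i*ai) * (br + i*bi) */
static void cmul_ref(double ar, double ai, double br, double bi,
                     double *cr, double *ci)
{
    *cr = ar*br - ai*bi;   /* real part */
    *ci = ar*bi + ai*br;   /* imaginary part */
}

int main(void)
{
    double cr, ci;
    cmul_ref(1.0, 2.0, 3.0, 4.0, &cr, &ci);
    printf("(1+2i)*(3+4i) = %g%+gi\n", cr, ci);   /* -5+10i */
    return 0;
}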
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+ char *ip1 = args[0];
+ char *indx = args[1];
+ char *value = args[2];
+ npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+ npy_intp n = dimensions[0];
+ npy_intp i;
+ @ftype@ *indexed;
+ for(i = 0; i < n; i++, indx += isindex, value += isb) {
+ indexed = (@ftype@ *)(ip1 + is1 * *(npy_intp *)indx);
+ const @ftype@ b_r = ((@ftype@ *)value)[0];
+ const @ftype@ b_i = ((@ftype@ *)value)[1];
+ #if @is_mul@
+ const @ftype@ a_r = indexed[0];
+ const @ftype@ a_i = indexed[1];
+ indexed[0] = a_r*b_r - a_i*b_i;
+ indexed[1] = a_r*b_i + a_i*b_r;
+ #else
+ indexed[0] @OP@= b_r;
+ indexed[1] @OP@= b_i;
+ #endif
+ }
+ return 0;
}
/**end repeat1**/
-NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_multiply)
+/**begin repeat1
+ * #kind = conjugate, square#
+ * #is_square = 0, 1#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
- if (!run_binary_avx512f_multiply_@TYPE@(args, dimensions, steps)) {
- BINARY_LOOP {
- const @ftype@ in1r = ((@ftype@ *)ip1)[0];
- const @ftype@ in1i = ((@ftype@ *)ip1)[1];
- const @ftype@ in2r = ((@ftype@ *)ip2)[0];
- const @ftype@ in2i = ((@ftype@ *)ip2)[1];
- ((@ftype@ *)op1)[0] = in1r*in2r - in1i*in2i;
- ((@ftype@ *)op1)[1] = in1r*in2i + in1i*in2r;
+ npy_intp len = dimensions[0];
+ char *b_src = args[0], *b_dst = args[1];
+ npy_intp b_ssrc = steps[0], b_sdst = steps[1];
+#if @VECTOR@
+ if (is_mem_overlap(b_src, b_ssrc, b_dst, b_sdst, len) ||
+ b_sdst % sizeof(@ftype@) != 0 ||
+ b_ssrc % sizeof(@ftype@) != 0
+ ) {
+ goto loop_scalar;
+ }
+ const @ftype@ *src = (@ftype@*)b_src;
+ @ftype@ *dst = (@ftype@*)b_dst;
+ const npy_intp ssrc = b_ssrc / sizeof(@ftype@);
+ const npy_intp sdst = b_sdst / sizeof(@ftype@);
+
+ const int vstep = npyv_nlanes_@sfx@;
+ const int wstep = vstep * 2;
+ const int hstep = vstep / 2;
+
+ if (ssrc == 2 && ssrc == sdst) {
+ for (; len >= vstep; len -= vstep, src += wstep, dst += wstep) {
+ npyv_@sfx@ a0 = npyv_load_@sfx@(src);
+ npyv_@sfx@ a1 = npyv_load_@sfx@(src + vstep);
+ npyv_@sfx@ r0 = simd_c@kind@_@sfx@(a0);
+ npyv_@sfx@ r1 = simd_c@kind@_@sfx@(a1);
+ npyv_store_@sfx@(dst, r0);
+ npyv_store_@sfx@(dst + vstep, r1);
+ }
+ for (; len > 0; len -= hstep, src += vstep, dst += vstep) {
+ npyv_@sfx@ a = npyv_load2_tillz_@sfx@(src, len);
+ npyv_@sfx@ r = simd_c@kind@_@sfx@(a);
+ npyv_store2_till_@sfx@(dst, len, r);
+ }
+ }
+ else if (ssrc == 2 && npyv_storable_stride_s64(sdst)) {
+ for (; len >= vstep; len -= vstep, src += wstep, dst += sdst*vstep) {
+ npyv_@sfx@ a0 = npyv_load_@sfx@(src);
+ npyv_@sfx@ a1 = npyv_load_@sfx@(src + vstep);
+ npyv_@sfx@ r0 = simd_c@kind@_@sfx@(a0);
+ npyv_@sfx@ r1 = simd_c@kind@_@sfx@(a1);
+ npyv_storen2_@sfx@(dst, sdst, r0);
+ npyv_storen2_@sfx@(dst + sdst*hstep, sdst, r1);
+ }
+ for (; len > 0; len -= hstep, src += vstep, dst += sdst*hstep) {
+ npyv_@sfx@ a = npyv_load2_tillz_@sfx@(src, len);
+ npyv_@sfx@ r = simd_c@kind@_@sfx@(a);
+ npyv_storen2_till_@sfx@(dst, sdst, len, r);
+ }
+ }
+ else if (sdst == 2 && npyv_loadable_stride_s64(ssrc)) {
+ for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += wstep) {
+ npyv_@sfx@ a0 = npyv_loadn2_@sfx@(src, ssrc);
+ npyv_@sfx@ a1 = npyv_loadn2_@sfx@(src + ssrc*hstep, ssrc);
+ npyv_@sfx@ r0 = simd_c@kind@_@sfx@(a0);
+ npyv_@sfx@ r1 = simd_c@kind@_@sfx@(a1);
+ npyv_store_@sfx@(dst, r0);
+ npyv_store_@sfx@(dst + vstep, r1);
+ }
+ for (; len > 0; len -= hstep, src += ssrc*hstep, dst += vstep) {
+ npyv_@sfx@ a = npyv_loadn2_tillz_@sfx@((@ftype@*)src, ssrc, len);
+ npyv_@sfx@ r = simd_c@kind@_@sfx@(a);
+ npyv_store2_till_@sfx@(dst, len, r);
}
}
+ else {
+ goto loop_scalar;
+ }
+ npyv_cleanup();
+ return;
+loop_scalar:
+#endif
+ for (; len > 0; --len, b_src += b_ssrc, b_dst += b_sdst) {
+ const @ftype@ rl = ((@ftype@ *)b_src)[0];
+ const @ftype@ im = ((@ftype@ *)b_src)[1];
+ #if @is_square@
+ ((@ftype@ *)b_dst)[0] = rl*rl - im*im;
+ ((@ftype@ *)b_dst)[1] = rl*im + im*rl;
+ #else
+ ((@ftype@ *)b_dst)[0] = rl;
+ ((@ftype@ *)b_dst)[1] = -im;
+ #endif
+ }
}
+/**end repeat1**/
/**end repeat**/
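For the conjugate/square scalar tail just above: conjugate only flips the sign of the
imaginary part, and square reuses the product identity with both operands equal, so
(rl + i*im)^2 = (rl*rl - im*im) + i*(2*rl*im). A tiny reference sketch, with
illustrative names, assuming double precision:

#include <stdio.h>

static void csquare_ref(double rl, double im, double *out_r, double *out_i)
{
    *out_r = rl*rl - im*im;
    *out_i = rl*im + im*rl;   /* == 2*rl*im */
}

static void cconj_ref(double rl, double im, double *out_r, double *out_i)
{
    *out_r = rl;
    *out_i = -im;
}

int main(void)
{
    double r, i;
    csquare_ref(1.0, 2.0, &r, &i);
    printf("(1+2i)^2   = %g%+gi\n", r, i);   /* -3+4i */
    cconj_ref(1.0, 2.0, &r, &i);
    printf("conj(1+2i) = %g%+gi\n", r, i);   /* 1-2i */
    return 0;
}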
diff --git a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
index 5b5f13ad1..b6f126298 100644
--- a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
+++ b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
@@ -42,7 +42,7 @@
* #sfx = s8, s16, s32, s64#
* #len = 8, 16, 32, 64#
*/
-static NPY_INLINE void
+static inline void
simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
{
npyv_lanetype_@sfx@ *src = (npyv_lanetype_@sfx@ *) args[0];
@@ -108,7 +108,7 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
* #sfx = u8, u16, u32, u64#
* #len = 8, 16, 32, 64#
*/
-static NPY_INLINE void
+static inline void
simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
{
npyv_lanetype_@sfx@ *src = (npyv_lanetype_@sfx@ *) args[0];
@@ -207,7 +207,7 @@ vsx4_div_@t@16(npyv_@t@16 a, npyv_@t@16 b)
* #sfx = u8, u16, u32, u64#
* #len = 8, 16, 32, 64#
*/
-static NPY_INLINE void
+static inline void
vsx4_simd_divide_contig_@sfx@(char **args, npy_intp len)
{
npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0];
@@ -246,7 +246,7 @@ vsx4_simd_divide_contig_@sfx@(char **args, npy_intp len)
* #sfx = s8, s16, s32, s64#
* #len = 8, 16, 32, 64#
*/
-static NPY_INLINE void
+static inline void
vsx4_simd_divide_contig_@sfx@(char **args, npy_intp len)
{
npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0];
@@ -395,6 +395,24 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide)
}
}
}
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_divide_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+ char *ip1 = args[0];
+ char *indx = args[1];
+ char *value = args[2];
+ npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+ npy_intp n = dimensions[0];
+ npy_intp i;
+ @type@ *indexed;
+ for(i = 0; i < n; i++, indx += isindex, value += isb) {
+ indexed = (@type@ *)(ip1 + is1 * *(npy_intp *)indx);
+ *indexed = floor_div_@TYPE@(*indexed, *(@type@ *)value);
+ }
+ return 0;
+}
+
/**end repeat**/
/**begin repeat
@@ -463,4 +481,28 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide)
}
}
}
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_divide_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+ char *ip1 = args[0];
+ char *indx = args[1];
+ char *value = args[2];
+ npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+ npy_intp n = dimensions[0];
+ npy_intp i;
+ @type@ *indexed;
+ for(i = 0; i < n; i++, indx += isindex, value += isb) {
+ indexed = (@type@ *)(ip1 + is1 * *(npy_intp *)indx);
+ @type@ in2 = *(@type@ *)value;
+ if (NPY_UNLIKELY(in2 == 0)) {
+ npy_set_floatstatus_divbyzero();
+ *indexed = 0;
+ } else {
+ *indexed = *indexed / in2;
+ }
+ }
+ return 0;
+}
+
/**end repeat**/
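The *_indexed inner loops added above all share one calling convention: args[0] is the
base pointer of the operand being updated, args[1] is an array of npy_intp indices,
args[2] holds the update values, and each step applies the operation in place at
ip1 + is1 * index, as used for in-place indexed updates (e.g. ufunc.at). A simplified
sketch of that access pattern, assuming contiguous element strides instead of the byte
strides the real loops use, with illustrative names and addition standing in for the
templated op:

#include <stddef.h>
#include <stdio.h>

static void add_indexed_ref(double *arr, const ptrdiff_t *idx,
                            const double *val, ptrdiff_t n)
{
    for (ptrdiff_t i = 0; i < n; i++) {
        arr[idx[i]] += val[i];   /* arr[idx[i]] op= val[i] */
    }
}

int main(void)
{
    double a[4] = {0, 0, 0, 0};
    ptrdiff_t idx[3] = {2, 0, 2};
    double v[3] = {1.5, 2.0, 0.5};
    add_indexed_ref(a, idx, v, 3);
    printf("%g %g %g %g\n", a[0], a[1], a[2], a[3]);   /* 2 0 2 0 */
    return 0;
}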
diff --git a/numpy/core/src/umath/loops_autovec.dispatch.c.src b/numpy/core/src/umath/loops_autovec.dispatch.c.src
new file mode 100644
index 000000000..bdbfa0f86
--- /dev/null
+++ b/numpy/core/src/umath/loops_autovec.dispatch.c.src
@@ -0,0 +1,287 @@
+/*@targets
+ ** $maxopt $autovec baseline
+ ** sse2 avx2
+ ** neon
+ ** vsx2
+ ** vx
+ **/
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
+
+/*
+ *****************************************************************************
+ ** INTEGER LOOPS
+ *****************************************************************************
+ */
+/*
+ * Arithmetic bit shift operations.
+ *
+ * Intel hardware masks bit shift values, so large shifts wrap around
+ * and can produce surprising results. The special handling ensures that
+ * behavior is independent of compiler or hardware.
+ * TODO: We could implement consistent behavior for negative shifts,
+ * which is undefined in C.
+ */
+#define INT_left_shift_needs_clear_floatstatus
+#define UINT_left_shift_needs_clear_floatstatus
+
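A concrete illustration of the shift issue described in the comment above: in C,
shifting a 32-bit value by 32 or more is undefined, and x86 masks the shift count
(so it silently wraps), which is why the loops go through the npy_lshift/npy_rshift
helpers. The sketch below shows the kind of well-defined behaviour those helpers aim
for; the helper name is illustrative, not the actual npy_math implementation:

#include <stdint.h>
#include <stdio.h>

/* a well-defined left shift: counts >= the operand width yield 0 */
static uint32_t safe_lshift_u32(uint32_t v, uint32_t count)
{
    return count < 32 ? (v << count) : 0;
}

int main(void)
{
    /* "1u << 33" is undefined in C; x86 masks the count to 5 bits, so the
       hardware would behave like "1u << 1" instead */
    printf("%u\n", safe_lshift_u32(1u, 33));   /* prints 0 */
    return 0;
}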
+/**begin repeat
+ * #TYPE = BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ * LONG, ULONG, LONGLONG, ULONGLONG#
+ * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ * npy_long, npy_ulong, npy_longlong, npy_ulonglong#
+ * #ftype = npy_float, npy_float, npy_float, npy_float, npy_double, npy_double,
+ * npy_double, npy_double, npy_double, npy_double#
+ * #SIGNED = 1, 0, 1, 0, 1, 0, 1, 0, 1, 0#
+ * #c = hh,uhh,h,uh,,u,l,ul,ll,ull#
+ */
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_positive)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP_FAST(@type@, @type@, *out = +in);
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_square)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+ UNARY_LOOP_FAST(@type@, @type@, *out = in * in);
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_reciprocal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+ UNARY_LOOP_FAST(@type@, @type@, *out = 1.0 / in);
+}
+
+/**begin repeat1
+ * Arithmetic
+ * #kind = add, subtract, multiply#
+ * #OP = +, -, *#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ if (IS_BINARY_REDUCE) {
+ BINARY_REDUCE_LOOP_FAST(@type@, io1 @OP@= in2);
+ }
+ else {
+ BINARY_LOOP_FAST(@type@, @type@, *out = in1 @OP@ in2);
+ }
+}
+/**end repeat1**/
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_left_shift)
+(char **args, npy_intp const *dimensions, npy_intp const *steps,
+ void *NPY_UNUSED(func))
+{
+ BINARY_LOOP_FAST(@type@, @type@, *out = npy_lshift@c@(in1, in2));
+#ifdef @TYPE@_left_shift_needs_clear_floatstatus
+ // For some reason, our macOS CI sets an "invalid" flag here, but only
+ // for some types.
+ npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_right_shift)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#ifndef NPY_DO_NOT_OPTIMIZE_@TYPE@_right_shift
+ BINARY_LOOP_FAST(@type@, @type@, *out = npy_rshift@c@(in1, in2));
+#else
+ BINARY_LOOP {
+ @type@ in1 = *(@type@ *)ip1;
+ @type@ in2 = *(@type@ *)ip2;
+ *(@type@ *)op1 = npy_rshift@c@(in1, in2);
+ }
+#endif
+}
+/**end repeat**/
+
+/*
+ *****************************************************************************
+ ** UNSIGNED INTEGER LOOPS
+ *****************************************************************************
+ */
+/**begin repeat
+ * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG#
+ * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong#
+ * #c = u,u,u,ul,ull#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_absolute)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP_FAST(@type@, @type@, *out = in);
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_sign)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : 0);
+}
+
+/**begin repeat1
+ * Arithmetic
+ * #kind = bitwise_and, bitwise_or, bitwise_xor#
+ * #OP = &, |, ^#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ if (IS_BINARY_REDUCE) {
+ BINARY_REDUCE_LOOP_FAST(@type@, io1 @OP@= in2);
+ }
+ else {
+ BINARY_LOOP_FAST(@type@, @type@, *out = in1 @OP@ in2);
+ }
+}
+/**end repeat1**/
+
+/**begin repeat1
+ * #kind = logical_and, logical_or#
+ * #OP = &&, ||#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ /*
+ * gcc vectorization of this is not good (PR60575) but manual integer
+ * vectorization is too tedious to be worthwhile
+ */
+ BINARY_LOOP_FAST(@type@, npy_bool, *out = in1 @OP@ in2);
+}
+/**end repeat1**/
+
+NPY_FINLINE npy_bool @TYPE@_logical_xor_(@type@ in1, @type@ in2)
+{ return (!!in1) != (!!in2); }
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_logical_xor)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ BINARY_LOOP_FAST(@type@, npy_bool, *out = @TYPE@_logical_xor_(in1, in2));
+}
+
+/**begin repeat1
+ * #kind = isnan, isinf, isfinite#
+ * #func = npy_isnan, npy_isinf, npy_isfinite#
+ * #val = NPY_FALSE, NPY_FALSE, NPY_TRUE#
+ **/
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ /*
+ * The (void)in; suppresses an unused variable warning raised by gcc and allows
+ * us to re-use this macro even though we do not depend on in
+ */
+ UNARY_LOOP_FAST(@type@, npy_bool, (void)in; *out = @val@);
+}
+/**end repeat1**/
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_conjugate)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP_FAST(@type@, @type@, *out = in);
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_logical_not)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP_FAST(@type@, npy_bool, *out = !in);
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_invert)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP_FAST(@type@, @type@, *out = ~in);
+}
+/**end repeat**/
+
+/*
+ *****************************************************************************
+ ** SIGNED! INTEGER LOOPS
+ *****************************************************************************
+ */
+
+/**begin repeat
+ * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG#
+ * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong#
+ * #c = ,,,l,ll#
+ */
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_absolute)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP_FAST(@type@, @type@, *out = (in >= 0) ? in : -in);
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_sign)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : (in < 0 ? -1 : 0));
+}
+
+/**begin repeat1
+ * #kind = conjugate, invert, isnan, isinf, isfinite,
+ * logical_and, logical_or, logical_xor, logical_not,
+ * bitwise_and, bitwise_or, bitwise_xor#
+ **/
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+ NPY_CPU_DISPATCH_CURFX(U@TYPE@_@kind@)(args, dimensions, steps, func);
+}
+/**end repeat1**/
+/**end repeat**/
+
+/*
+ *****************************************************************************
+ ** BOOLEAN LOOPS **
+ *****************************************************************************
+ */
+/**begin repeat
+ * #kind = isnan, isinf, isfinite#
+ * #func = npy_isnan, npy_isinf, npy_isfinite#
+ * #val = NPY_FALSE, NPY_FALSE, NPY_TRUE#
+ **/
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+ NPY_CPU_DISPATCH_CURFX(UBYTE_@kind@)(args, dimensions, steps, func);
+}
+/**end repeat**/
+
+/*
+ *****************************************************************************
+ ** HALF-FLOAT LOOPS **
+ *****************************************************************************
+ */
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(HALF_absolute)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP_FAST(npy_half, npy_half, *out = in&0x7fffu);
+}
+
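HALF_absolute above operates on the raw IEEE binary16 bit pattern (npy_half is stored
as an unsigned 16-bit integer), so absolute value is just clearing the sign bit,
bit 15. A small self-contained check of that masking, using a hard-coded half bit
pattern:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint16_t h = 0xC500;            /* -5.0 encoded as IEEE binary16 */
    uint16_t abs_h = h & 0x7fffu;   /* clear the sign bit */
    printf("0x%04x\n", abs_h);      /* 0x4500 == +5.0 */
    return 0;
}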
+/*
+ *****************************************************************************
+ ** DATETIME LOOPS **
+ *****************************************************************************
+ */
+
+/**begin repeat
+ * #type = npy_datetime, npy_timedelta#
+ * #TYPE = DATETIME, TIMEDELTA#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_isinf)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+ NPY_CPU_DISPATCH_CURFX(ULONGLONG_isinf)(args, dimensions, steps, func);
+}
+/**end repeat**/
diff --git a/numpy/core/src/umath/loops_comparison.dispatch.c.src b/numpy/core/src/umath/loops_comparison.dispatch.c.src
index 2f75593a5..751080871 100644
--- a/numpy/core/src/umath/loops_comparison.dispatch.c.src
+++ b/numpy/core/src/umath/loops_comparison.dispatch.c.src
@@ -234,7 +234,7 @@ static void simd_binary_@kind@_b8(char **args, npy_intp len)
npyv_b8 a = npyv_cmpeq_u8(npyv_load_u8(src1), vzero);
npyv_b8 b = npyv_cmpeq_u8(npyv_load_u8(src2), vzero);
npyv_b8 c = npyv_@VOP@_b8(a, b);
- npyv_store_u8(dst, npyv_andc_u8(npyv_cvt_u8_b8(c), truemask));
+ npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask));
}
for (; len > 0; --len, ++src1, ++src2, ++dst) {
@@ -258,7 +258,7 @@ static void simd_binary_scalar1_@kind@_b8(char **args, npy_intp len)
for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
npyv_b8 b = npyv_cmpeq_u8(npyv_load_u8(src), vzero);
npyv_b8 c = npyv_@VOP@_b8(a, b);
- npyv_store_u8(dst, npyv_andc_u8(npyv_cvt_u8_b8(c), truemask));
+ npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask));
}
for (; len > 0; --len, ++src, ++dst) {
@@ -281,7 +281,7 @@ static void simd_binary_scalar2_@kind@_b8(char **args, npy_intp len)
for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
npyv_b8 a = npyv_cmpeq_u8(npyv_load_u8(src), vzero);
npyv_b8 c = npyv_@VOP@_b8(a, b);
- npyv_store_u8(dst, npyv_andc_u8(npyv_cvt_u8_b8(c), truemask));
+ npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask));
}
for (; len > 0; --len, ++src, ++dst) {
@@ -308,23 +308,27 @@ static void simd_binary_scalar2_@kind@_b8(char **args, npy_intp len)
* #OP = ==, !=, <, <=#
*/
#if !((@eq@ || @neq@) && @signed@)
-static NPY_INLINE void
+static inline void
run_binary_simd_@kind@_@sfx@(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if @VECTOR@
- /* argument one scalar */
- if (IS_BLOCKABLE_BINARY_SCALAR1_BOOL(sizeof(@type@), NPY_SIMD_WIDTH)) {
- simd_binary_scalar1_@kind@_@sfx@(args, dimensions[0]);
- return;
- }
- /* argument two scalar */
- else if (IS_BLOCKABLE_BINARY_SCALAR2_BOOL(sizeof(@type@), NPY_SIMD_WIDTH)) {
- simd_binary_scalar2_@kind@_@sfx@(args, dimensions[0]);
- return;
- }
- else if (IS_BLOCKABLE_BINARY_BOOL(sizeof(@type@), NPY_SIMD_WIDTH)) {
- simd_binary_@kind@_@sfx@(args, dimensions[0]);
- return;
+ if (!is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) &&
+ !is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0])
+ ) {
+ /* argument one scalar */
+ if (IS_BINARY_CONT_S1(@type@, npy_bool)) {
+ simd_binary_scalar1_@kind@_@sfx@(args, dimensions[0]);
+ return;
+ }
+ /* argument two scalar */
+ else if (IS_BINARY_CONT_S2(@type@, npy_bool)) {
+ simd_binary_scalar2_@kind@_@sfx@(args, dimensions[0]);
+ return;
+ }
+ else if (IS_BINARY_CONT(@type@, npy_bool)) {
+ simd_binary_@kind@_@sfx@(args, dimensions[0]);
+ return;
+ }
}
#endif
diff --git a/numpy/core/src/umath/loops_exponent_log.dispatch.c.src b/numpy/core/src/umath/loops_exponent_log.dispatch.c.src
index 8f123a48b..1fac3c150 100644
--- a/numpy/core/src/umath/loops_exponent_log.dispatch.c.src
+++ b/numpy/core/src/umath/loops_exponent_log.dispatch.c.src
@@ -239,7 +239,7 @@ fma_scalef_ps(__m256 poly, __m256 quadrant)
#ifdef SIMD_AVX512F
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
+NPY_FINLINE __mmask16
avx512_get_full_load_mask_ps(void)
{
return 0xFFFF;
@@ -1146,7 +1146,7 @@ AVX512F_log_DOUBLE(npy_double * op,
* #vtype2_scatter = _mm512_mask_i32scatter_epi32, _mm256_mask_i32scatter_epi32#
* #setzero = _mm512_setzero_epi32, _mm256_setzero_si256#
*/
-static NPY_INLINE void
+static inline void
AVX512_SKX_ldexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
const npy_intp stride_ip1 = steps[0]/(npy_intp)sizeof(@type@);
@@ -1215,7 +1215,7 @@ AVX512_SKX_ldexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const
}
}
-static NPY_INLINE void
+static inline void
AVX512_SKX_frexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
const npy_intp stride_ip1 = steps[0]/(npy_intp)sizeof(@type@);
diff --git a/numpy/core/src/umath/loops_logical.dispatch.c.src b/numpy/core/src/umath/loops_logical.dispatch.c.src
new file mode 100644
index 000000000..c07525be4
--- /dev/null
+++ b/numpy/core/src/umath/loops_logical.dispatch.c.src
@@ -0,0 +1,377 @@
+/*@targets
+ ** $maxopt baseline
+ ** neon asimd
+ ** sse2 avx2 avx512_skx
+ ** vsx2
+ ** vx
+ **/
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+#include "lowlevel_strided_loops.h"
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
+
+/*******************************************************************************
+ ** Defining the SIMD kernels
+ ******************************************************************************/
+
+#if NPY_SIMD
+/*
+ * Convert any set bit to boolean true so that vectorized and normal operations
+ * are consistent. This should not be required if bool is used correctly
+ * everywhere, but you never know.
+ */
+NPY_FINLINE npyv_u8 byte_to_true(npyv_u8 v)
+{
+ const npyv_u8 zero = npyv_zero_u8();
+ const npyv_u8 truemask = npyv_setall_u8(1 == 1);
+ // cmpeq(v, 0) turns 0x00 -> 0xff and non-zero -> 0x00
+ npyv_u8 tmp = npyv_cvt_u8_b8(npyv_cmpeq_u8(v, zero));
+ // tmp is filled with 0xff/0x00, negate and mask to boolean true
+ return npyv_andc_u8(truemask, tmp);
+}
+/*
+ * convert mask vector (0xff/0x00) to boolean true. similar to byte_to_true(),
+ * but we've already got a mask and can skip negation.
+ */
+NPY_FINLINE npyv_u8 mask_to_true(npyv_b8 v)
+{
+ const npyv_u8 truemask = npyv_setall_u8(1 == 1);
+ return npyv_and_u8(truemask, npyv_cvt_u8_b8(v));
+}
+/*
+ * For logical_and, we have to be careful to handle non-bool inputs where
+ * bits of each operand might not overlap. Example: a = 0x01, b = 0x80
+ * Both evaluate to boolean true, however, a & b is false. Return value
+ * should be consistent with byte_to_true().
+ */
+NPY_FINLINE npyv_u8 simd_logical_and_u8(npyv_u8 a, npyv_u8 b)
+{
+ const npyv_u8 zero = npyv_zero_u8();
+ const npyv_u8 truemask = npyv_setall_u8(1 == 1);
+ npyv_b8 ma = npyv_cmpeq_u8(a, zero);
+ npyv_b8 mb = npyv_cmpeq_u8(b, zero);
+ npyv_u8 r = npyv_cvt_u8_b8(npyv_or_b8(ma, mb));
+ return npyv_andc_u8(truemask, r);
+}
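The bit-overlap pitfall described above is easy to reproduce in scalar code; it is the
reason the kernel compares each operand against zero instead of AND-ing the raw bytes:

#include <stdio.h>

int main(void)
{
    unsigned char a = 0x01, b = 0x80;                     /* both logically true */
    printf("a & b  -> %d\n", a & b);                      /* 0: bits don't overlap */
    printf("a && b -> %d\n", (a != 0) && (b != 0));       /* 1: the required answer */
    return 0;
}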
+/*
+ * We don't really need the following, but it simplifies the templating code
+ * below since it is paired with simd_logical_and_u8() above.
+ */
+NPY_FINLINE npyv_u8 simd_logical_or_u8(npyv_u8 a, npyv_u8 b)
+{
+ npyv_u8 r = npyv_or_u8(a, b);
+ return byte_to_true(r);
+}
+
+
+/**begin repeat
+ * #kind = logical_and, logical_or#
+ * #and = 1, 0#
+ * #scalar_op = &&, ||#
+ * #intrin = and, or#
+ * #reduce = min, max#
+ * #scalar_cmp = ==, !=#
+ * #anyall = all, any#
+ */
+static void
+simd_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2, npy_intp len)
+{
+ #define UNROLL 16
+
+ const int vstep = npyv_nlanes_u8;
+ const int wstep = vstep * UNROLL;
+
+ // Unrolled vectors loop
+ for (; len >= wstep; len -= wstep, ip1 += wstep, ip2 += wstep, op += wstep) {
+ /**begin repeat1
+ * #unroll = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+ */
+ #if UNROLL > @unroll@
+ npyv_u8 a@unroll@ = npyv_load_u8(ip1 + vstep * @unroll@);
+ npyv_u8 b@unroll@ = npyv_load_u8(ip2 + vstep * @unroll@);
+ npyv_u8 r@unroll@ = simd_logical_@intrin@_u8(a@unroll@, b@unroll@);
+ npyv_store_u8(op + vstep * @unroll@, r@unroll@);
+ #endif
+ /**end repeat1**/
+ }
+ #undef UNROLL
+
+ // Single vectors loop
+ for (; len >= vstep; len -= vstep, ip1 += vstep, ip2 += vstep, op += vstep) {
+ npyv_u8 a = npyv_load_u8(ip1);
+ npyv_u8 b = npyv_load_u8(ip2);
+ npyv_u8 r = simd_logical_@intrin@_u8(a, b);
+ npyv_store_u8(op, r);
+ }
+
+ // Scalar loop to finish off
+ for (; len > 0; len--, ip1++, ip2++, op++) {
+ *op = *ip1 @scalar_op@ *ip2;
+ }
+}
+
+static void
+simd_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, npy_intp len)
+{
+ #define UNROLL 8
+
+ const int vstep = npyv_nlanes_u8;
+ const int wstep = vstep * UNROLL;
+
+ // Unrolled vectors loop
+ for (; len >= wstep; len -= wstep, ip += wstep) {
+ #if defined(NPY_HAVE_SSE2)
+ NPY_PREFETCH(ip + wstep, 0, 3);
+ #endif
+ npyv_u8 v0 = npyv_load_u8(ip + vstep * 0);
+ npyv_u8 v1 = npyv_load_u8(ip + vstep * 1);
+ npyv_u8 v2 = npyv_load_u8(ip + vstep * 2);
+ npyv_u8 v3 = npyv_load_u8(ip + vstep * 3);
+ npyv_u8 v4 = npyv_load_u8(ip + vstep * 4);
+ npyv_u8 v5 = npyv_load_u8(ip + vstep * 5);
+ npyv_u8 v6 = npyv_load_u8(ip + vstep * 6);
+ npyv_u8 v7 = npyv_load_u8(ip + vstep * 7);
+
+ npyv_u8 m01 = npyv_@reduce@_u8(v0, v1);
+ npyv_u8 m23 = npyv_@reduce@_u8(v2, v3);
+ npyv_u8 m45 = npyv_@reduce@_u8(v4, v5);
+ npyv_u8 m67 = npyv_@reduce@_u8(v6, v7);
+
+ npyv_u8 m0123 = npyv_@reduce@_u8(m01, m23);
+ npyv_u8 m4567 = npyv_@reduce@_u8(m45, m67);
+
+ npyv_u8 mv = npyv_@reduce@_u8(m0123, m4567);
+
+ if(npyv_@anyall@_u8(mv) @scalar_cmp@ 0){
+ *op = !@and@;
+ return;
+ }
+ }
+
+ // Single vectors loop
+ for (; len >= vstep; len -= vstep, ip += vstep) {
+ npyv_u8 v0 = npyv_load_u8(ip);
+ if(npyv_@anyall@_u8(v0) @scalar_cmp@ 0){
+ *op = !@and@;
+ return;
+ }
+ }
+
+ // Scalar loop to finish off
+ for (; len > 0; --len, ++ip) {
+ *op = *op @scalar_op@ *ip;
+ if (*op @scalar_cmp@ 0) {
+ return;
+ }
+ }
+#undef UNROLL
+}
+/**end repeat**/
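The reduce kernels above lean on a simple observation: for an AND-style reduction
(np.all), combining blocks with an elementwise minimum preserves any zero lane, so a
single npyv_all test per group detects a false early; for an OR-style reduction
(np.any), an elementwise maximum preserves any non-zero lane and npyv_any detects a
true early. A scalar sketch of the np.all variant, treating each byte as one "lane"
and groups of 8 as one unrolled block:

#include <stddef.h>
#include <stdio.h>

static int all_true(const unsigned char *p, size_t n)
{
    size_t i = 0;
    for (; i + 8 <= n; i += 8) {
        unsigned char m = p[i];           /* min over the block */
        for (int k = 1; k < 8; k++) {
            if (p[i + k] < m) {
                m = p[i + k];
            }
        }
        if (m == 0) {                     /* at least one false: stop early */
            return 0;
        }
    }
    for (; i < n; i++) {                  /* scalar tail */
        if (p[i] == 0) {
            return 0;
        }
    }
    return 1;
}

int main(void)
{
    unsigned char buf[10] = {1, 1, 1, 1, 1, 1, 1, 1, 0, 1};
    printf("%d\n", all_true(buf, 10));    /* 0 */
    return 0;
}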
+
+/**begin repeat
+ * #kind = logical_not, absolute#
+ * #op = ==, !=#
+ * #not = 1, 0#
+ */
+static void
+simd_@kind@_BOOL(npy_bool * op, npy_bool * ip, npy_intp len)
+{
+ #define UNROLL 16
+
+ const int vstep = npyv_nlanes_u8;
+ const int wstep = vstep * UNROLL;
+
+ #if @not@
+ const npyv_u8 zero = npyv_zero_u8();
+ #endif
+
+ // Unrolled vectors loop
+ for (; len >= wstep; len -= wstep, ip += wstep, op += wstep) {
+ /**begin repeat1
+ * #unroll = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+ */
+ #if UNROLL > @unroll@
+ npyv_u8 v@unroll@ = npyv_load_u8(ip + vstep * @unroll@);
+#if @not@
+ npyv_u8 r@unroll@ = mask_to_true(npyv_cmpeq_u8(v@unroll@, zero));
+#else
+ npyv_u8 r@unroll@ = byte_to_true(v@unroll@);
+#endif
+ npyv_store_u8(op + vstep * @unroll@, r@unroll@);
+ #endif
+ /**end repeat1**/
+ }
+ #undef UNROLL
+
+ // Single vectors loop
+ for (; len >= vstep; len -= vstep, ip += vstep, op += vstep) {
+ npyv_u8 v = npyv_load_u8(ip);
+#if @not@
+ npyv_u8 r = mask_to_true(npyv_cmpeq_u8(v, zero));
+#else
+ npyv_u8 r = byte_to_true(v);
+#endif
+ npyv_store_u8(op, r);
+ }
+
+ // Scalar loop to finish off
+ for (; len > 0; --len, ++ip, ++op) {
+ *op = (*ip @op@ 0);
+ }
+}
+/**end repeat**/
+
+#endif // NPY_SIMD
+
+/*******************************************************************************
+ ** Defining ufunc inner functions
+ ******************************************************************************/
+
+/**begin repeat
+ * # kind = logical_or, logical_and#
+ */
+static NPY_INLINE int
+run_binary_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+ if (sizeof(npy_bool) == 1 &&
+ IS_BLOCKABLE_BINARY(sizeof(npy_bool), NPY_SIMD_WIDTH)) {
+ simd_binary_@kind@_BOOL((npy_bool*)args[2], (npy_bool*)args[0],
+ (npy_bool*)args[1], dimensions[0]);
+ return 1;
+ }
+#endif
+ return 0;
+}
+
+
+static NPY_INLINE int
+run_reduce_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+ if (sizeof(npy_bool) == 1 &&
+ IS_BLOCKABLE_REDUCE(sizeof(npy_bool), NPY_SIMD_WIDTH)) {
+ simd_reduce_@kind@_BOOL((npy_bool*)args[0], (npy_bool*)args[1],
+ dimensions[0]);
+ return 1;
+ }
+#endif
+ return 0;
+}
+/**end repeat**/
+
+/**begin repeat
+ * #kind = logical_not, absolute#
+ */
+static NPY_INLINE int
+run_unary_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+ if (sizeof(npy_bool) == 1 &&
+ IS_BLOCKABLE_UNARY(sizeof(npy_bool), NPY_SIMD_WIDTH)) {
+ simd_@kind@_BOOL((npy_bool*)args[1], (npy_bool*)args[0], dimensions[0]);
+ return 1;
+ }
+#endif
+ return 0;
+}
+/**end repeat**/
+
+
+/**begin repeat
+ * #kind = logical_and, logical_or#
+ * #OP = &&, ||#
+ * #SC = ==, !=#
+ * #and = 1, 0#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ if(IS_BINARY_REDUCE) {
+#if NPY_SIMD
+ /*
+ * stick with our variant for more reliable performance, only known
+ * platform which outperforms it by ~20% is an i7 with glibc 2.17
+ */
+ if (run_reduce_simd_@kind@_BOOL(args, dimensions, steps)) {
+ return;
+ }
+#else
+ /* for now only use libc on 32-bit/non-x86 */
+ if (steps[1] == 1) {
+ npy_bool * op = (npy_bool *)args[0];
+#if @and@
+ /* np.all(), search for a zero (false) */
+ if (*op) {
+ *op = memchr(args[1], 0, dimensions[0]) == NULL;
+ }
+#else
+ /*
+             * np.any(): search for a non-zero (true) by comparing blocks
+             * against zero; memcmp is faster than memchr on SSE4 machines
+             * with glibc >= 2.12, and memchr can only search for one byte value
+ */
+ static const npy_bool zero[4096]; /* zero by C standard */
+ npy_uintp i, n = dimensions[0];
+
+ for (i = 0; !*op && i < n - (n % sizeof(zero)); i += sizeof(zero)) {
+ *op = memcmp(&args[1][i], zero, sizeof(zero)) != 0;
+ }
+ if (!*op && n - i > 0) {
+ *op = memcmp(&args[1][i], zero, n - i) != 0;
+ }
+#endif
+ return;
+ }
+#endif
+ else {
+ BINARY_REDUCE_LOOP(npy_bool) {
+ const npy_bool in2 = *(npy_bool *)ip2;
+ io1 = io1 @OP@ in2;
+ if (io1 @SC@ 0) {
+ break;
+ }
+ }
+ *((npy_bool *)iop1) = io1;
+ }
+ }
+ else {
+ if (run_binary_simd_@kind@_BOOL(args, dimensions, steps)) {
+ return;
+ }
+ else {
+ BINARY_LOOP {
+ const npy_bool in1 = *(npy_bool *)ip1;
+ const npy_bool in2 = *(npy_bool *)ip2;
+ *((npy_bool *)op1) = in1 @OP@ in2;
+ }
+ }
+ }
+}
+/**end repeat**/
+
+/**begin repeat
+ * #kind = logical_not, absolute#
+ * #OP = ==, !=#
+ **/
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ if (run_unary_simd_@kind@_BOOL(args, dimensions, steps)) {
+ return;
+ }
+ else {
+ UNARY_LOOP {
+ npy_bool in1 = *(npy_bool *)ip1;
+ *((npy_bool *)op1) = in1 @OP@ 0;
+ }
+ }
+}
+/**end repeat**/
+
diff --git a/numpy/core/src/umath/loops_minmax.dispatch.c.src b/numpy/core/src/umath/loops_minmax.dispatch.c.src
index 237c8e933..9d8667d38 100644
--- a/numpy/core/src/umath/loops_minmax.dispatch.c.src
+++ b/numpy/core/src/umath/loops_minmax.dispatch.c.src
@@ -451,6 +451,24 @@ clear_fp:
#endif
}
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+ char *ip1 = args[0];
+ char *indx = args[1];
+ char *value = args[2];
+ npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+ npy_intp n = dimensions[0];
+ npy_intp i;
+ @type@ *indexed;
+ for(i = 0; i < n; i++, indx += isindex, value += isb) {
+ indexed = (@type@ *)(ip1 + is1 * *(npy_intp *)indx);
+ *indexed = SCALAR_OP(*indexed, *(@type@ *)value);
+ }
+ return 0;
+}
+
#undef SCALAR_OP
#endif // !fp_only || (is_fp && fp_only)
diff --git a/numpy/core/src/umath/loops_modulo.dispatch.c.src b/numpy/core/src/umath/loops_modulo.dispatch.c.src
index 53b7da289..25edffb1e 100644
--- a/numpy/core/src/umath/loops_modulo.dispatch.c.src
+++ b/numpy/core/src/umath/loops_modulo.dispatch.c.src
@@ -171,7 +171,7 @@ vsx4_divisor_@sfx@(const npyv_@sfx@ vscalar)
* #func = fmod, remainder, divmod#
* #id = 0, 1, 2#
*/
-static NPY_INLINE void
+static inline void
vsx4_simd_@func@_contig_@sfx@(char **args, npy_intp len)
{
npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0];
@@ -239,7 +239,7 @@ vsx4_simd_@func@_contig_@sfx@(char **args, npy_intp len)
npyv_cleanup();
}
-static NPY_INLINE void
+static inline void
vsx4_simd_@func@_by_scalar_contig_@sfx@(char **args, npy_intp len)
{
npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0];
@@ -292,7 +292,7 @@ vsx4_simd_@func@_by_scalar_contig_@sfx@(char **args, npy_intp len)
* #func = fmod, remainder, divmod#
* #id = 0, 1, 2#
*/
-static NPY_INLINE void
+static inline void
vsx4_simd_@func@_contig_@sfx@(char **args, npy_intp len)
{
npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0];
@@ -410,7 +410,7 @@ vsx4_simd_@func@_contig_@sfx@(char **args, npy_intp len)
npyv_cleanup();
}
-static NPY_INLINE void
+static inline void
vsx4_simd_@func@_by_scalar_contig_@sfx@(char **args, npy_intp len)
{
npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0];
diff --git a/numpy/core/src/umath/loops_trigonometric.dispatch.c.src b/numpy/core/src/umath/loops_trigonometric.dispatch.c.src
index 78685e807..43eb58ffe 100644
--- a/numpy/core/src/umath/loops_trigonometric.dispatch.c.src
+++ b/numpy/core/src/umath/loops_trigonometric.dispatch.c.src
@@ -9,12 +9,18 @@
#include "simd/simd.h"
#include "loops_utils.h"
#include "loops.h"
+#include "fast_loop_macros.h"
/*
* TODO:
* - use vectorized version of Payne-Hanek style reduction for large elements or
* when there's no native FUSED support instead of fallback to libc
*/
-#if NPY_SIMD_F32 && NPY_SIMD_FMA3 // native support
+#if NPY_SIMD_FMA3 // native support
+/**begin repeat
+ * #check = F64, F32#
+ * #sfx = f64, f32#
+ */
+#if NPY_SIMD_@check@
/*
* Vectorized Cody-Waite range reduction technique
* Performs the reduction step x* = x - y*C in three steps:
@@ -23,14 +29,189 @@
* 3) x* = x - y*c3
* c1, c2 are exact floating points, c3 = C - c1 - c2 simulates higher precision
*/
-NPY_FINLINE npyv_f32
-simd_range_reduction_f32(npyv_f32 x, npyv_f32 y, npyv_f32 c1, npyv_f32 c2, npyv_f32 c3)
+NPY_FINLINE npyv_@sfx@
+simd_range_reduction_@sfx@(npyv_@sfx@ x, npyv_@sfx@ y, npyv_@sfx@ c1, npyv_@sfx@ c2, npyv_@sfx@ c3)
{
- npyv_f32 reduced_x = npyv_muladd_f32(y, c1, x);
- reduced_x = npyv_muladd_f32(y, c2, reduced_x);
- reduced_x = npyv_muladd_f32(y, c3, reduced_x);
+ npyv_@sfx@ reduced_x = npyv_muladd_@sfx@(y, c1, x);
+ reduced_x = npyv_muladd_@sfx@(y, c2, reduced_x);
+ reduced_x = npyv_muladd_@sfx@(y, c3, reduced_x);
return reduced_x;
}
+#endif
+/**end repeat**/
+
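The Cody-Waite helper above relies on splitting the constant C as c1 + c2 + c3, where
c1 and c2 are exactly representable, so the chain of fused multiply-adds removes y*C
one slice at a time and keeps the small remainder accurate. A scalar sketch of the
same idea, reusing the pi split that simd_range_reduction_pi2 uses further down (the
constants are the negated pieces of pi, so each step is effectively x - y*pi_i); the
function name is illustrative:

#include <math.h>
#include <stdio.h>

static double range_reduce(double x, double y,
                           double c1, double c2, double c3)
{
    /* the vector code uses npyv_muladd (FMA); fma() is the scalar analogue */
    double r = fma(y, c1, x);
    r = fma(y, c2, r);
    r = fma(y, c3, r);
    return r;
}

int main(void)
{
    double x = 10.0, n = 3.0;   /* reduce x - n*pi into [-pi/2, pi/2] */
    double r = range_reduce(x, n,
                            -0x1.921fb54442d18p+1,     /* -pi, high part */
                            -0x1.1a62633145c06p-53,    /* -pi, middle part */
                            -0x1.c1cd129024e09p-106);  /* -pi, low part */
    printf("%.17g\n", r);       /* ~0.57522..., i.e. 10 - 3*pi */
    return 0;
}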
+#if NPY_SIMD_F64
+/**begin repeat
+ * #op = cos, sin#
+ */
+#if defined(NPY_OS_WIN32) || defined(NPY_OS_CYGWIN)
+NPY_FINLINE npyv_f64
+#else
+NPY_NOINLINE npyv_f64
+#endif
+simd_@op@_scalar_f64(npyv_f64 out, npy_uint64 cmp_bits)
+{
+ // MSVC doesn't compile with direct vector access, so we copy it here
+ // as we have no npyv_get_lane/npyv_set_lane intrinsics
+ npy_double NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) out_copy[npyv_nlanes_f64];
+ npyv_storea_f64(out_copy, out);
+
+ for (unsigned i = 0; i < npyv_nlanes_f64; ++i) {
+ if (cmp_bits & (1 << i)) {
+ out_copy[i] = npy_@op@(out_copy[i]);
+ }
+ }
+
+ return npyv_loada_f64(out_copy);
+}
+/**end repeat**/
+
+/*
+ * Approximate sine algorithm for x \in [-pi/2, pi/2]
+ * worst-case error is 3.5 ulp.
+ * abs error: 0x1.be222a58p-53 in [-pi/2, pi/2].
+ */
+NPY_FINLINE npyv_f64
+simd_approx_sine_poly_f64(npyv_f64 r)
+{
+ const npyv_f64 poly1 = npyv_setall_f64(-0x1.9f4a9c8b21dc9p-41);
+ const npyv_f64 poly2 = npyv_setall_f64(0x1.60e88a10163f2p-33);
+ const npyv_f64 poly3 = npyv_setall_f64(-0x1.ae6361b7254e7p-26);
+ const npyv_f64 poly4 = npyv_setall_f64(0x1.71de382e8d62bp-19);
+ const npyv_f64 poly5 = npyv_setall_f64(-0x1.a01a019aeb4ffp-13);
+ const npyv_f64 poly6 = npyv_setall_f64(0x1.111111110b25ep-7);
+ const npyv_f64 poly7 = npyv_setall_f64(-0x1.55555555554c3p-3);
+
+ npyv_f64 r2 = npyv_mul_f64(r, r);
+ npyv_f64 y = npyv_muladd_f64(poly1, r2, poly2);
+ y = npyv_muladd_f64(y, r2, poly3);
+ y = npyv_muladd_f64(y, r2, poly4);
+ y = npyv_muladd_f64(y, r2, poly5);
+ y = npyv_muladd_f64(y, r2, poly6);
+ y = npyv_muladd_f64(y, r2, poly7);
+ y = npyv_muladd_f64(npyv_mul_f64(y, r2), r, r);
+
+ return y;
+}
+
+/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
+NPY_FINLINE npyv_f64
+simd_range_reduction_pi2(npyv_f64 r, npyv_f64 n) {
+ const npyv_f64 pi1 = npyv_setall_f64(-0x1.921fb54442d18p+1);
+ const npyv_f64 pi2 = npyv_setall_f64(-0x1.1a62633145c06p-53);
+ const npyv_f64 pi3 = npyv_setall_f64(-0x1.c1cd129024e09p-106);
+
+ return simd_range_reduction_f64(r, n, pi1, pi2, pi3);
+}
+
+NPY_FINLINE npyv_b64 simd_sin_range_check_f64(npyv_u64 ir) {
+ const npyv_u64 tiny_bound = npyv_setall_u64(0x202); /* top12 (asuint64 (0x1p-509)). */
+ const npyv_u64 simd_thresh = npyv_setall_u64(0x214); /* top12 (asuint64 (RangeVal)) - SIMD_TINY_BOUND. */
+
+ return npyv_cmpge_u64(npyv_sub_u64(npyv_shri_u64(ir, 52), tiny_bound), simd_thresh);
+}
+
+NPY_FINLINE npyv_b64 simd_cos_range_check_f64(npyv_u64 ir) {
+ const npyv_f64 range_val = npyv_setall_f64(0x1p23);
+
+ return npyv_cmpge_u64(ir, npyv_reinterpret_u64_f64(range_val));
+}
+
+NPY_FINLINE npyv_f64
+simd_cos_poly_f64(npyv_f64 r, npyv_u64 ir, npyv_u64 sign)
+{
+ const npyv_f64 inv_pi = npyv_setall_f64(0x1.45f306dc9c883p-2);
+ const npyv_f64 half_pi = npyv_setall_f64(0x1.921fb54442d18p+0);
+ const npyv_f64 shift = npyv_setall_f64(0x1.8p52);
+
+ /* n = rint((|x|+pi/2)/pi) - 0.5. */
+ npyv_f64 n = npyv_muladd_f64(inv_pi, npyv_add_f64(r, half_pi), shift);
+ npyv_u64 odd = npyv_shli_u64(npyv_reinterpret_u64_f64(n), 63);
+ n = npyv_sub_f64(n, shift);
+ n = npyv_sub_f64(n, npyv_setall_f64(0.5));
+
+ /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
+ r = simd_range_reduction_pi2(r, n);
+
+ /* sin(r) poly approx. */
+ npyv_f64 y = simd_approx_sine_poly_f64(r);
+
+ /* sign. */
+ return npyv_reinterpret_f64_u64(npyv_xor_u64(npyv_reinterpret_u64_f64(y), odd));
+}
+
+NPY_FINLINE npyv_f64
+simd_sin_poly_f64(npyv_f64 r, npyv_u64 ir, npyv_u64 sign)
+{
+ const npyv_f64 inv_pi = npyv_setall_f64(0x1.45f306dc9c883p-2);
+ const npyv_f64 shift = npyv_setall_f64(0x1.8p52);
+
+ /* n = rint(|x|/pi). */
+ npyv_f64 n = npyv_muladd_f64(inv_pi, r, shift);
+ npyv_u64 odd = npyv_shli_u64(npyv_reinterpret_u64_f64(n), 63);
+ n = npyv_sub_f64(n, shift);
+
+ /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
+ r = simd_range_reduction_pi2(r, n);
+
+ /* sin(r) poly approx. */
+ npyv_f64 y = simd_approx_sine_poly_f64(r);
+
+ /* sign. */
+ return npyv_reinterpret_f64_u64(npyv_xor_u64(npyv_xor_u64(npyv_reinterpret_u64_f64(y), sign), odd));
+}
+
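Both polynomial kernels above extract the parity of the rounded quotient n through the
shift constant 0x1.8p52: adding it forces the sum to round to an integer whose value
lands in the low mantissa bits, so bit 0 of the raw bit pattern is the parity of n,
and shifting it into bit 63 turns it into a sign mask. A small sketch of that rounding
trick (valid for non-negative n well below 2^51):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    const double shift = 0x1.8p52;        /* 1.5 * 2^52 */
    double x = 5.3;
    double n = x + shift;                 /* rounds x to the nearest integer */
    uint64_t bits;
    memcpy(&bits, &n, sizeof bits);
    printf("rounded = %g, parity bit = %u\n",
           n - shift, (unsigned)(bits & 1));   /* rounded = 5, parity = 1 */
    return 0;
}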
+/**begin repeat
+ * #op = cos, sin#
+ */
+NPY_FINLINE void
+simd_@op@_f64(const double *src, npy_intp ssrc, double *dst, npy_intp sdst, npy_intp len)
+{
+ const npyv_u64 abs_mask = npyv_setall_u64(0x7fffffffffffffff);
+ const int vstep = npyv_nlanes_f64;
+
+ npyv_f64 out = npyv_zero_f64();
+ npyv_f64 x_in;
+
+ for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+ if (ssrc == 1) {
+ x_in = npyv_load_tillz_f64(src, len);
+ } else {
+ x_in = npyv_loadn_tillz_f64(src, ssrc, len);
+ }
+
+ npyv_u64 ir = npyv_and_u64(npyv_reinterpret_u64_f64(x_in), abs_mask);
+ npyv_f64 r = npyv_reinterpret_f64_u64(ir);
+ npyv_u64 sign = npyv_and_u64(npyv_reinterpret_u64_f64(x_in), npyv_not_u64(abs_mask));
+
+ npyv_b64 cmp = simd_@op@_range_check_f64(ir);
+ /* If fenv exceptions are to be triggered correctly, set any special lanes
+ to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
+ scalar loop later. */
+ r = npyv_select_f64(cmp, npyv_setall_f64(1.0), r);
+
+ // Some in range, at least one calculation is useful
+ if (!npyv_all_b64(cmp)) {
+ out = simd_@op@_poly_f64(r, ir, sign);
+ }
+
+ if (npyv_any_b64(cmp)) {
+ out = npyv_select_f64(cmp, x_in, out);
+ out = simd_@op@_scalar_f64(out, npyv_tobits_b64(cmp));
+ }
+
+ if (sdst == 1) {
+ npyv_store_till_f64(dst, len, out);
+ } else {
+ npyv_storen_till_f64(dst, sdst, len, out);
+ }
+ }
+ npyv_cleanup();
+}
+/**end repeat**/
+#endif // NPY_SIMD_F64
+
+#if NPY_SIMD_F32
/*
* Approximate cosine algorithm for x \in [-PI/4, PI/4]
* Maximum ULP across all 32-bit floats = 0.875
@@ -124,6 +305,11 @@ simd_sincos_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst,
} else {
x_in = npyv_loadn_tillz_f32(src, ssrc, len);
}
+ npyv_b32 nnan_mask = npyv_notnan_f32(x_in);
+ #if NPY_SIMD_CMPSIGNAL
+ // Eliminate NaN to avoid FP invalid exception
+ x_in = npyv_and_f32(x_in, npyv_reinterpret_f32_u32(npyv_cvt_u32_b32(nnan_mask)));
+ #endif
npyv_b32 simd_mask = npyv_cmple_f32(npyv_abs_f32(x_in), max_cody);
npy_uint64 simd_maski = npyv_tobits_b32(simd_mask);
/*
@@ -132,7 +318,6 @@ simd_sincos_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst,
* these numbers
*/
if (simd_maski != 0) {
- npyv_b32 nnan_mask = npyv_notnan_f32(x_in);
npyv_f32 x = npyv_select_f32(npyv_and_b32(nnan_mask, simd_mask), x_in, zerosf);
npyv_f32 quadrant = npyv_mul_f32(x, two_over_pi);
@@ -194,24 +379,58 @@ simd_sincos_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst,
}
npyv_cleanup();
}
-#endif // NPY_SIMD_FMA3
+#endif // NPY_SIMD_F32
+#endif // NPY_SIMD_FMA3
/**begin repeat
* #func = cos, sin#
- * #enum = SIMD_COMPUTE_COS, SIMD_COMPUTE_SIN#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_@func@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD_F64 && NPY_SIMD_FMA3
+ const double *src = (double*)args[0];
+ double *dst = (double*)args[1];
+ const int lsize = sizeof(src[0]);
+ const npy_intp ssrc = steps[0] / lsize;
+ const npy_intp sdst = steps[1] / lsize;
+ npy_intp len = dimensions[0];
+ assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));
+
+ if (is_mem_overlap(src, steps[0], dst, steps[1], len) ||
+ !npyv_loadable_stride_f64(ssrc) || !npyv_storable_stride_f64(sdst)
+ ) {
+ for (; len > 0; --len, src += ssrc, dst += sdst) {
+ simd_@func@_f64(src, 1, dst, 1, 1);
+ }
+ } else {
+ simd_@func@_f64(src, ssrc, dst, sdst, len);
+ }
+#else
+ UNARY_LOOP {
+ const npy_double in1 = *(npy_double *)ip1;
+ *(npy_double *)op1 = npy_@func@(in1);
+ }
+#endif
+}
+/**end repeat**/
+
+/**begin repeat
+ * #func = sin, cos#
+ * #enum = SIMD_COMPUTE_SIN, SIMD_COMPUTE_COS#
*/
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_@func@)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
- const float *src = (float*)args[0];
- float *dst = (float*)args[1];
+#if NPY_SIMD_F32 && NPY_SIMD_FMA3
+ const npy_float *src = (npy_float*)args[0];
+ npy_float *dst = (npy_float*)args[1];
const int lsize = sizeof(src[0]);
const npy_intp ssrc = steps[0] / lsize;
const npy_intp sdst = steps[1] / lsize;
npy_intp len = dimensions[0];
assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));
-#if NPY_SIMD_F32 && NPY_SIMD_FMA3
if (is_mem_overlap(src, steps[0], dst, steps[1], len) ||
!npyv_loadable_stride_f32(ssrc) || !npyv_storable_stride_f32(sdst)
) {
@@ -222,9 +441,9 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_@func@)
simd_sincos_f32(src, ssrc, dst, sdst, len, @enum@);
}
#else
- for (; len > 0; --len, src += ssrc, dst += sdst) {
- const float src0 = *src;
- *dst = npy_@func@f(src0);
+ UNARY_LOOP {
+ const npy_float in1 = *(npy_float *)ip1;
+ *(npy_float *)op1 = npy_@func@f(in1);
}
#endif
}
diff --git a/numpy/core/src/umath/loops_umath_fp.dispatch.c.src b/numpy/core/src/umath/loops_umath_fp.dispatch.c.src
index 46ce51824..89999e879 100644
--- a/numpy/core/src/umath/loops_umath_fp.dispatch.c.src
+++ b/numpy/core/src/umath/loops_umath_fp.dispatch.c.src
@@ -12,10 +12,12 @@
/**begin repeat
* #sfx = f32, f64#
* #func_suffix = f16, 8#
+ * #len = 32, 64#
*/
/**begin repeat1
* #func = exp2, log2, log10, expm1, log1p, cbrt, tan, asin, acos, atan, sinh, cosh, asinh, acosh, atanh#
* #default_val = 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0#
+ * #fxnan = 0, 0, 0, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0#
*/
static void
simd_@func@_@sfx@(const npyv_lanetype_@sfx@ *src, npy_intp ssrc,
@@ -37,7 +39,15 @@ simd_@func@_@sfx@(const npyv_lanetype_@sfx@ *src, npy_intp ssrc,
x = npyv_loadn_tillz_@sfx@(src, ssrc, len);
}
#endif
+ #if @fxnan@ == @len@
+    // Workaround: avoid the "invalid value" FP exception raised when x is NaN
+ npyv_b@len@ nnan_mask = npyv_notnan_@sfx@(x);
+ npyv_@sfx@ x_exnan = npyv_select_@sfx@(nnan_mask, x, npyv_setall_@sfx@(@default_val@));
+ npyv_@sfx@ out = __svml_@func@@func_suffix@(x_exnan);
+ out = npyv_select_@sfx@(nnan_mask, out, x);
+ #else
npyv_@sfx@ out = __svml_@func@@func_suffix@(x);
+ #endif
if (sdst == 1) {
npyv_store_till_@sfx@(dst, len, out);
} else {
@@ -50,32 +60,6 @@ simd_@func@_@sfx@(const npyv_lanetype_@sfx@ *src, npy_intp ssrc,
/**end repeat**/
/**begin repeat
- * #func = sin, cos#
- */
-static void
-simd_@func@_f64(const double *src, npy_intp ssrc,
- double *dst, npy_intp sdst, npy_intp len)
-{
- const int vstep = npyv_nlanes_f64;
- for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
- npyv_f64 x;
- if (ssrc == 1) {
- x = npyv_load_tillz_f64(src, len);
- } else {
- x = npyv_loadn_tillz_f64(src, ssrc, len);
- }
- npyv_f64 out = __svml_@func@8(x);
- if (sdst == 1) {
- npyv_store_till_f64(dst, len, out);
- } else {
- npyv_storen_till_f64(dst, sdst, len, out);
- }
- }
- npyv_cleanup();
-}
-/**end repeat**/
-
-/**begin repeat
* #sfx = f32, f64#
* #func_suffix = f16, 8#
*/
@@ -267,31 +251,3 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@func@)
}
/**end repeat1**/
/**end repeat**/
-
-/**begin repeat
- * #func = sin, cos#
- */
-NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_@func@)
-(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
-#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
- const double *src = (double*)args[0];
- double *dst = (double*)args[1];
- const int lsize = sizeof(src[0]);
- const npy_intp ssrc = steps[0] / lsize;
- const npy_intp sdst = steps[1] / lsize;
- const npy_intp len = dimensions[0];
- assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));
- if (!is_mem_overlap(src, steps[0], dst, steps[1], len) &&
- npyv_loadable_stride_f64(ssrc) &&
- npyv_storable_stride_f64(sdst)) {
- simd_@func@_f64(src, ssrc, dst, sdst, len);
- return;
- }
-#endif
- UNARY_LOOP {
- const npy_double in1 = *(npy_double *)ip1;
- *(npy_double *)op1 = npy_@func@(in1);
- }
-}
-/**end repeat**/
diff --git a/numpy/core/src/umath/loops_unary.dispatch.c.src b/numpy/core/src/umath/loops_unary.dispatch.c.src
new file mode 100644
index 000000000..1e2a81d20
--- /dev/null
+++ b/numpy/core/src/umath/loops_unary.dispatch.c.src
@@ -0,0 +1,364 @@
+/*@targets
+ ** $maxopt baseline
+ ** neon asimd
+ ** sse2 avx2 avx512_skx
+ ** vsx2
+ ** vx vxe
+ **/
+
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "numpy/npy_math.h"
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+#include "lowlevel_strided_loops.h"
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
+
+/*******************************************************************************
+ ** Scalar ops
+ ******************************************************************************/
+#define scalar_negative(X) (-X)
+
+/*******************************************************************************
+ ** extra SIMD intrinsics
+ ******************************************************************************/
+
+#if NPY_SIMD
+
+/**begin repeat
+ * #sfx = s8, u8, s16, u16, s32, u32, s64, u64#
+ * #ssfx = 8, 8, 16, 16, 32, 32, 64, 64#
+ */
+static NPY_INLINE npyv_@sfx@
+npyv_negative_@sfx@(npyv_@sfx@ v)
+{
+#if defined(NPY_HAVE_NEON) && (defined(__aarch64__) || @ssfx@ < 64)
+ return npyv_reinterpret_@sfx@_s@ssfx@(vnegq_s@ssfx@(npyv_reinterpret_s@ssfx@_@sfx@(v)));
+#else
+ // (x ^ -1) + 1
+ const npyv_@sfx@ m1 = npyv_setall_@sfx@((npyv_lanetype_@sfx@)-1);
+ return npyv_sub_@sfx@(npyv_xor_@sfx@(v, m1), m1);
+#endif
+}
+/**end repeat**/
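The non-NEON fallback above computes integer negation from the two's-complement
identity -x == ~x + 1, spelled as (x ^ -1) - (-1) so it maps onto the universal
xor/sub intrinsics. A quick scalar check:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    int32_t x = 42;
    int32_t neg = (x ^ -1) - (-1);   /* == ~x + 1 == -x */
    printf("%d\n", neg);             /* -42 */
    return 0;
}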
+
+/**begin repeat
+ * #sfx = f32, f64#
+ * #VCHK = NPY_SIMD_F32, NPY_SIMD_F64#
+ * #fd = f, #
+ */
+#if @VCHK@
+static NPY_INLINE npyv_@sfx@
+npyv_negative_@sfx@(npyv_@sfx@ v)
+{
+#if defined(NPY_HAVE_NEON)
+ return vnegq_@sfx@(v);
+#else
+ // (v ^ signmask)
+ const npyv_@sfx@ signmask = npyv_setall_@sfx@(-0.@fd@);
+ return npyv_xor_@sfx@(v, signmask);
+#endif
+}
+#endif // @VCHK@
+/**end repeat**/
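The floating-point fallback above negates by XOR-ing the sign bit, using -0.0 as the
sign mask, which avoids any arithmetic and therefore any FP exceptions. A scalar
sketch of the same bit trick:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    float x = 3.5f, signmask = -0.0f, out;
    uint32_t xb, sb;
    memcpy(&xb, &x, sizeof xb);
    memcpy(&sb, &signmask, sizeof sb);
    xb ^= sb;                        /* flip only the sign bit */
    memcpy(&out, &xb, sizeof out);
    printf("%g\n", out);             /* -3.5 */
    return 0;
}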
+
+#endif // NPY_SIMD
+
+/********************************************************************************
+ ** Defining the SIMD kernels
+ ********************************************************************************/
+/**begin repeat
+ * #sfx = s8, u8, s16, u16, s32, u32, s64, u64, f32, f64#
+ * #simd_chk = NPY_SIMD*8, NPY_SIMD_F32, NPY_SIMD_F64#
+ * #is_fp = 0*8, 1*2#
+ * #supports_ncontig = 0*4,1*6#
+ */
+/**begin repeat1
+ * #kind = negative#
+ * #intrin = negative#
+ * #unroll = 4#
+ */
+#if @simd_chk@
+#if @unroll@ < 1
+#error "Unroll must be at least 1"
+#elif NPY_SIMD != 128 && @unroll@ > 2
+// Avoid memory bandwidth bottleneck for larger SIMD
+#define UNROLL 2
+#else
+#define UNROLL @unroll@
+#endif
+// contiguous inputs and output.
+static NPY_INLINE void
+simd_unary_cc_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip,
+ npyv_lanetype_@sfx@ *op,
+ npy_intp len)
+{
+ const int vstep = npyv_nlanes_@sfx@;
+ const int wstep = vstep * UNROLL;
+
+ // unrolled vector loop
+ for (; len >= wstep; len -= wstep, ip += wstep, op += wstep) {
+ /**begin repeat2
+ * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+ */
+ #if UNROLL > @U@
+ npyv_@sfx@ v_@U@ = npyv_load_@sfx@(ip + @U@ * vstep);
+ npyv_@sfx@ r_@U@ = npyv_@intrin@_@sfx@(v_@U@);
+ npyv_store_@sfx@(op + @U@ * vstep, r_@U@);
+ #endif
+ /**end repeat2**/
+ }
+ // single vector loop
+ for (; len >= vstep; len -= vstep, ip += vstep, op +=vstep) {
+ npyv_@sfx@ v = npyv_load_@sfx@(ip);
+ npyv_@sfx@ r = npyv_@intrin@_@sfx@(v);
+ npyv_store_@sfx@(op, r);
+ }
+ // scalar finish up any remaining iterations
+ for (; len > 0; --len, ++ip, ++op) {
+ *op = scalar_@intrin@(*ip);
+ }
+}
+
+#if @supports_ncontig@
+// contiguous input, non-contiguous output
+static NPY_INLINE void
+simd_unary_cn_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip,
+ npyv_lanetype_@sfx@ *op, npy_intp ostride,
+ npy_intp len)
+{
+ const int vstep = npyv_nlanes_@sfx@;
+ const int wstep = vstep * UNROLL;
+
+ // unrolled vector loop
+ for (; len >= wstep; len -= wstep, ip += wstep, op += ostride*wstep) {
+ /**begin repeat2
+ * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+ */
+ #if UNROLL > @U@
+ npyv_@sfx@ v_@U@ = npyv_load_@sfx@(ip + @U@ * vstep);
+ npyv_@sfx@ r_@U@ = npyv_@intrin@_@sfx@(v_@U@);
+ npyv_storen_@sfx@(op + @U@ * vstep * ostride, ostride, r_@U@);
+ #endif
+ /**end repeat2**/
+ }
+ // single vector loop
+ for (; len >= vstep; len -= vstep, ip += vstep, op += ostride*vstep) {
+ npyv_@sfx@ v = npyv_load_@sfx@(ip);
+ npyv_@sfx@ r = npyv_@intrin@_@sfx@(v);
+ npyv_storen_@sfx@(op, ostride, r);
+ }
+ // scalar finish up any remaining iterations
+ for (; len > 0; --len, ++ip, op += ostride) {
+ *op = scalar_@intrin@(*ip);
+ }
+}
+// non-contiguous input, contiguous output
+static NPY_INLINE void
+simd_unary_nc_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip, npy_intp istride,
+ npyv_lanetype_@sfx@ *op,
+ npy_intp len)
+{
+ const int vstep = npyv_nlanes_@sfx@;
+ const int wstep = vstep * UNROLL;
+
+ // unrolled vector loop
+ for (; len >= wstep; len -= wstep, ip += istride*wstep, op += wstep) {
+ /**begin repeat2
+ * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+ */
+ #if UNROLL > @U@
+ npyv_@sfx@ v_@U@ = npyv_loadn_@sfx@(ip + @U@ * vstep * istride, istride);
+ npyv_@sfx@ r_@U@ = npyv_@intrin@_@sfx@(v_@U@);
+ npyv_store_@sfx@(op + @U@ * vstep, r_@U@);
+ #endif
+ /**end repeat2**/
+ }
+ // single vector loop
+ for (; len >= vstep; len -= vstep, ip += istride*vstep, op += vstep) {
+ npyv_@sfx@ v = npyv_loadn_@sfx@(ip, istride);
+ npyv_@sfx@ r = npyv_@intrin@_@sfx@(v);
+ npyv_store_@sfx@(op, r);
+ }
+ // scalar finish up any remaining iterations
+ for (; len > 0; --len, ip += istride, ++op) {
+ *op = scalar_@intrin@(*ip);
+ }
+}
+// non-contiguous input and output
+// limit unroll to 2x
+#if UNROLL > 2
+#undef UNROLL
+#define UNROLL 2
+#endif
+static NPY_INLINE void
+simd_unary_nn_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip, npy_intp istride,
+ npyv_lanetype_@sfx@ *op, npy_intp ostride,
+ npy_intp len)
+{
+ const int vstep = npyv_nlanes_@sfx@;
+ const int wstep = vstep * UNROLL;
+
+ // unrolled vector loop
+ for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
+ /**begin repeat2
+ * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+ */
+ #if UNROLL > @U@
+ npyv_@sfx@ v_@U@ = npyv_loadn_@sfx@(ip + @U@ * vstep * istride, istride);
+ npyv_@sfx@ r_@U@ = npyv_@intrin@_@sfx@(v_@U@);
+ npyv_storen_@sfx@(op + @U@ * vstep * ostride, ostride, r_@U@);
+ #endif
+ /**end repeat2**/
+ }
+ // single vector loop
+ for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) {
+ npyv_@sfx@ v = npyv_loadn_@sfx@(ip, istride);
+ npyv_@sfx@ r = npyv_@intrin@_@sfx@(v);
+ npyv_storen_@sfx@(op, ostride, r);
+ }
+ // scalar finish up any remaining iterations
+ for (; len > 0; --len, ip += istride, op += ostride) {
+ *op = scalar_@intrin@(*ip);
+ }
+}
+#endif // @supports_ncontig@
+#undef UNROLL
+#endif // @simd_chk@
+/**end repeat1**/
+/**end repeat**/
+
+/********************************************************************************
+ ** Defining ufunc inner functions
+ ********************************************************************************/
+/**begin repeat
+ * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+ * BYTE, SHORT, INT, LONG, LONGLONG,
+ * FLOAT, DOUBLE, LONGDOUBLE#
+ *
+ * #BTYPE = BYTE, SHORT, INT, LONG, LONGLONG,
+ * BYTE, SHORT, INT, LONG, LONGLONG,
+ * FLOAT, DOUBLE, LONGDOUBLE#
+ * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
+ * npy_byte, npy_short, npy_int, npy_long, npy_longlong,
+ * npy_float, npy_double, npy_longdouble#
+ *
+ * #is_fp = 0*10, 1*3#
+ * #is_unsigned = 1*5, 0*5, 0*3#
+ * #supports_ncontig = 0*2, 1*3, 0*2, 1*3, 1*3#
+ */
+#undef TO_SIMD_SFX
+#if 0
+/**begin repeat1
+ * #len = 8, 16, 32, 64#
+ */
+#elif NPY_SIMD && NPY_BITSOF_@BTYPE@ == @len@
+ #if @is_fp@
+ #define TO_SIMD_SFX(X) X##_f@len@
+ #if NPY_BITSOF_@BTYPE@ == 32 && !NPY_SIMD_F32
+ #undef TO_SIMD_SFX
+ #endif
+ #if NPY_BITSOF_@BTYPE@ == 64 && !NPY_SIMD_F64
+ #undef TO_SIMD_SFX
+ #endif
+ #elif @is_unsigned@
+ #define TO_SIMD_SFX(X) X##_u@len@
+ #else
+ #define TO_SIMD_SFX(X) X##_s@len@
+ #endif
+/**end repeat1**/
+#endif
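+
+/*
+ * Illustrative expansion of the suffix selection above (a sketch, assuming a
+ * build where NPY_SIMD_F32 is available): for FLOAT, NPY_BITSOF_FLOAT == 32
+ * and the type is floating point, so TO_SIMD_SFX(X) becomes X##_f32 and the
+ * dispatcher below resolves, e.g.
+ *
+ *   TO_SIMD_SFX(simd_unary_cc_negative) -> simd_unary_cc_negative_f32
+ *   TO_SIMD_SFX(npyv_lanetype)          -> npyv_lanetype_f32
+ */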
+
+/**begin repeat1
+ * #kind = negative#
+ * #intrin = negative#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ char *ip = args[0], *op = args[1];
+ npy_intp istep = steps[0], ostep = steps[1],
+ len = dimensions[0];
+#ifdef TO_SIMD_SFX
+ #undef STYPE
+ #define STYPE TO_SIMD_SFX(npyv_lanetype)
+ if (!is_mem_overlap(ip, istep, op, ostep, len)) {
+ if (IS_UNARY_CONT(@type@, @type@)) {
+ // no overlap and operands are contiguous
+ TO_SIMD_SFX(simd_unary_cc_@intrin@)(
+ (STYPE*)ip, (STYPE*)op, len
+ );
+ goto clear;
+ }
+ #if @supports_ncontig@
+ const npy_intp istride = istep / sizeof(STYPE);
+ const npy_intp ostride = ostep / sizeof(STYPE);
+ if (TO_SIMD_SFX(npyv_loadable_stride)(istride) &&
+ TO_SIMD_SFX(npyv_storable_stride)(ostride))
+ {
+ if (istride == 1 && ostride != 1) {
+ // contiguous input, non-contiguous output
+ TO_SIMD_SFX(simd_unary_cn_@intrin@)(
+ (STYPE*)ip, (STYPE*)op, ostride, len
+ );
+ goto clear;
+ }
+ else if (istride != 1 && ostride == 1) {
+ // non-contiguous input, contiguous output
+ TO_SIMD_SFX(simd_unary_nc_@intrin@)(
+ (STYPE*)ip, istride, (STYPE*)op, len
+ );
+ goto clear;
+ }
+ // SSE2 does better with unrolled scalar for heavy non-contiguous
+ #if !defined(NPY_HAVE_SSE2)
+ else if (istride != 1 && ostride != 1) {
+ // non-contiguous input and output
+ TO_SIMD_SFX(simd_unary_nn_@intrin@)(
+ (STYPE*)ip, istride, (STYPE*)op, ostride, len
+ );
+ goto clear;
+ }
+ #endif
+ }
+ #endif // @supports_ncontig@
+ }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+ /*
+ * scalar unrolls
+ * 8x unroll performed best on
+ * - Apple M1 Native / arm64
+ * - Apple M1 Rosetta / SSE42
+ * - iMacPro / AVX512
+ */
+ #define UNROLL 8
+ for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) {
+ /**begin repeat2
+ * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+ */
+ #if UNROLL > @U@
+ const @type@ in_@U@ = *((const @type@ *)(ip + @U@ * istep));
+ *((@type@ *)(op + @U@ * ostep)) = scalar_@intrin@(in_@U@);
+ #endif
+ /**end repeat2**/
+ }
+#endif // NPY_DISABLE_OPTIMIZATION
+ for (; len > 0; --len, ip += istep, op += ostep) {
+ *((@type@ *)op) = scalar_@intrin@(*(const @type@ *)ip);
+ }
+#ifdef TO_SIMD_SFX
+clear:
+ npyv_cleanup();
+#endif
+#if @is_fp@
+ npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+/**end repeat1**/
+/**end repeat**/
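+
+/*
+ * Illustrative stride dispatch (a sketch for the FLOAT loop, assuming
+ * NPY_SIMD_F32 and no memory overlap):
+ *
+ *   steps[0] == 8, steps[1] == 4        // byte steps
+ *   istride  == 2, ostride == 1         // in units of npy_float
+ *   -> simd_unary_nc_negative_f32(ip, 2, op, len)
+ *
+ * Fully contiguous operands (both steps equal to sizeof(npy_float)) take the
+ * simd_unary_cc_negative_f32 path instead.
+ */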
+
+#undef NEGATIVE_CONTIG_ONLY
diff --git a/numpy/core/src/umath/loops_unary_complex.dispatch.c.src b/numpy/core/src/umath/loops_unary_complex.dispatch.c.src
new file mode 100644
index 000000000..052ad464c
--- /dev/null
+++ b/numpy/core/src/umath/loops_unary_complex.dispatch.c.src
@@ -0,0 +1,139 @@
+/*@targets
+ ** $maxopt baseline
+ ** sse2 (avx2 fma3) avx512f
+ ** neon asimd
+ ** vsx2 vsx3
+ ** vx vxe
+ **/
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+#include "lowlevel_strided_loops.h"
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
+
+/**begin repeat
+ * #type = npy_float, npy_double#
+ * #sfx = f32, f64#
+ * #bsfx = b32, b64#
+ * #usfx = b32, u64#
+ * #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64#
+ * #is_double = 0, 1#
+ * #c = f, #
+ * #INF = NPY_INFINITYF, NPY_INFINITY#
+ * #NAN = NPY_NANF, NPY_NAN#
+ */
+#if @VECTOR@
+NPY_FINLINE npyv_@sfx@
+simd_cabsolute_@sfx@(npyv_@sfx@ re, npyv_@sfx@ im)
+{
+ const npyv_@sfx@ inf = npyv_setall_@sfx@(@INF@);
+ const npyv_@sfx@ nan = npyv_setall_@sfx@(@NAN@);
+
+ re = npyv_abs_@sfx@(re);
+ im = npyv_abs_@sfx@(im);
+ /*
+ * If real or imag = INF, then convert it to inf + j*inf
+ * Handles: inf + j*nan, nan + j*inf
+ */
+ npyv_@bsfx@ re_infmask = npyv_cmpeq_@sfx@(re, inf);
+ npyv_@bsfx@ im_infmask = npyv_cmpeq_@sfx@(im, inf);
+ im = npyv_select_@sfx@(re_infmask, inf, im);
+ re = npyv_select_@sfx@(im_infmask, inf, re);
+ /*
+ * If real or imag = NAN, then convert it to nan + j*nan
+ * Handles: x + j*nan, nan + j*x
+ */
+ npyv_@bsfx@ re_nnanmask = npyv_notnan_@sfx@(re);
+ npyv_@bsfx@ im_nnanmask = npyv_notnan_@sfx@(im);
+ im = npyv_select_@sfx@(re_nnanmask, im, nan);
+ re = npyv_select_@sfx@(im_nnanmask, re, nan);
+
+ npyv_@sfx@ larger = npyv_max_@sfx@(re, im);
+ npyv_@sfx@ smaller = npyv_min_@sfx@(im, re);
+ /*
+ * Calculate div_mask to prevent 0./0. and inf/inf operations in div
+ */
+ npyv_@bsfx@ zeromask = npyv_cmpeq_@sfx@(larger, npyv_zero_@sfx@());
+ npyv_@bsfx@ infmask = npyv_cmpeq_@sfx@(smaller, inf);
+ npyv_@bsfx@ div_mask = npyv_not_@bsfx@(npyv_or_@bsfx@(zeromask, infmask));
+
+ npyv_@sfx@ ratio = npyv_ifdivz_@sfx@(div_mask, smaller, larger);
+ npyv_@sfx@ hypot = npyv_sqrt_@sfx@(
+ npyv_muladd_@sfx@(ratio, ratio, npyv_setall_@sfx@(1.0@c@)
+ ));
+ return npyv_mul_@sfx@(hypot, larger);
+}
+#endif // VECTOR
+/**end repeat**/
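+
+/*
+ * The kernel above avoids the naive sqrt(re*re + im*im), which can overflow
+ * for large inputs. With larger = max(|re|, |im|) and smaller = min(|re|, |im|)
+ * it computes
+ *
+ *   |re + j*im| = larger * sqrt(1 + (smaller/larger)^2)
+ *
+ * and masks out the division when larger == 0 or smaller == inf, so no
+ * spurious 0/0 or inf/inf operation is performed.
+ */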
+
+/********************************************************************************
+ ** Defining ufunc inner functions
+ ********************************************************************************/
+/**begin repeat
+ * complex types
+ * #TYPE = CFLOAT, CDOUBLE#
+ * #ftype = npy_float, npy_double#
+ * #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64#
+ * #sfx = f32, f64#
+ * #c = f, #
+ * #C = F, #
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_absolute)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#if @VECTOR@
+ npy_intp len = dimensions[0];
+ npy_intp ssrc = steps[0] / sizeof(@ftype@);
+ npy_intp sdst = steps[1] / sizeof(@ftype@);
+
+ if (!is_mem_overlap(args[0], steps[0], args[1], steps[1], len) &&
+ npyv_loadable_stride_@sfx@(ssrc) && npyv_storable_stride_@sfx@(sdst)
+ && steps[0] % sizeof(@ftype@) == 0
+ && steps[1] % sizeof(@ftype@) == 0
+ ) {
+ const @ftype@ *src = (@ftype@*)args[0];
+ @ftype@ *dst = (@ftype@*)args[1];
+
+ const int vstep = npyv_nlanes_@sfx@;
+ const int wstep = vstep * 2;
+ const int hstep = vstep / 2;
+
+ if (ssrc == 2 && sdst == 1) {
+ for (; len >= vstep; len -= vstep, src += wstep, dst += vstep) {
+ npyv_@sfx@x2 ab = npyv_load_@sfx@x2(src);
+ npyv_@sfx@ r = simd_cabsolute_@sfx@(ab.val[0], ab.val[1]);
+ npyv_store_@sfx@(dst, r);
+ }
+ }
+ else {
+ for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+ npyv_@sfx@ re_im0 = npyv_loadn2_@sfx@(src, ssrc);
+ npyv_@sfx@ re_im1 = npyv_loadn2_@sfx@(src + ssrc*hstep, ssrc);
+ npyv_@sfx@x2 ab = npyv_unzip_@sfx@(re_im0, re_im1);
+ npyv_@sfx@ r = simd_cabsolute_@sfx@(ab.val[0], ab.val[1]);
+ npyv_storen_@sfx@(dst, sdst, r);
+ }
+ }
+ for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+ npyv_@sfx@ rl = npyv_loadn_tillz_@sfx@(src, ssrc, len);
+ npyv_@sfx@ im = npyv_loadn_tillz_@sfx@(src + 1, ssrc, len);
+ npyv_@sfx@ r = simd_cabsolute_@sfx@(rl, im);
+ npyv_storen_till_@sfx@(dst, sdst, len, r);
+ }
+ npyv_cleanup();
+ npy_clear_floatstatus_barrier((char*)&len);
+ return;
+ }
+#endif
+ UNARY_LOOP {
+ const @ftype@ re = ((@ftype@ *)ip1)[0];
+ const @ftype@ im = ((@ftype@ *)ip1)[1];
+ *((@ftype@ *)op1) = npy_hypot@c@(re, im);
+ }
+}
+/**end repeat**/
diff --git a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
index 0ac39a9b1..c4e7b8929 100644
--- a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
+++ b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
@@ -301,7 +301,7 @@ no_unroll:
#endif // @VCHK@
for (; len > 0; --len, src += src_step, dst += dst_step) {
#if @VCHK@
- // to guarantee the same precsion and fp/domain errors for both scalars and vectors
+ // to guarantee the same precision and fp/domain errors for both scalars and vectors
simd_@TYPE@_@kind@_CONTIG_CONTIG(src, 0, dst, 0, 1);
#else
const npyv_lanetype_@sfx@ src0 = *(npyv_lanetype_@sfx@*)src;
diff --git a/numpy/core/src/umath/loops_unary_fp_le.dispatch.c.src b/numpy/core/src/umath/loops_unary_fp_le.dispatch.c.src
new file mode 100644
index 000000000..ba133dc1e
--- /dev/null
+++ b/numpy/core/src/umath/loops_unary_fp_le.dispatch.c.src
@@ -0,0 +1,565 @@
+/*@targets
+ ** $maxopt baseline
+ ** sse2 sse41
+ ** vsx2
+ ** neon asimd
+ **/
+
+/**
+ * Force use of SSE only on x86, even if AVX2 or AVX512F are enabled
+ * through the baseline, since scatter (AVX512F) and gather are very costly
+ * for handling non-contiguous memory access compared with SSE for
+ * the small operations that this file covers.
+ */
+#define NPY_SIMD_FORCE_128
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#include <float.h>
+#include "numpy/npy_math.h"
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+#include "lowlevel_strided_loops.h"
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
+
+/**
+ * This code should really be merged into loops_unary_fp.dispatch.c.src.
+ * However, there is an issue with enabling the code here for VX and VXE,
+ * as the shifts don't behave as expected.
+ * See the code below that references NPY__CPU_TARGET_VX and
+ * NPY_BIG_ENDIAN. Suspect that this is a big endian vector issue.
+ *
+ * Splitting the files out allows us to keep loops_unary_fp.dispatch.c.src
+ * building for VX and VXE so we don't regress performance while adding this
+ * code for other platforms.
+ */
+// TODO(@seiko2plus): add support for big-endian
+#if NPY_SIMD_BIGENDIAN
+ #undef NPY_SIMD
+ #undef NPY_SIMD_F32
+ #undef NPY_SIMD_F64
+ #define NPY_SIMD 0
+ #define NPY_SIMD_F32 0
+ #define NPY_SIMD_F64 0
+#endif
+
+/*******************************************************************************
+ ** extra SIMD intrinsics
+ ******************************************************************************/
+
+#if NPY_SIMD
+
+/**
+ * We define intrinsics for isnan, isinf, isfinite, and signbit below. There
+ * are a few flavors of each. We'll use f32 as an example, although f64 versions
+ * are also defined.
+ *
+ * npyv_u32 npyv_KIND_f32(npyv_f32 v)
+ * These are mainly used for the single vector loops. As such, result should
+ * be bool true / false, ready to write back.
+ *
+ * npyv_b32 _npyv_KIND_f32(npyv_f32 v)
+ *    These are used by the general intrinsics above as well as the multi-vector
+ * packing intrinsics. The multi-vector packing intrinsics are the ones
+ * utilized in the unrolled vector loops. Results should be vector masks
+ * of 0x00/0xff.
+ *
+ * npyv_u8 npyv_pack_KIND_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3)
+ * These are the multi-vector packing intrinsics utilized by unrolled vector
+ * loops. They perform the operation on all input vectors and pack the
+ * results to a single npyv_u8. Assuming NPY_SIMD == 128, that means we
+ *    can pack results from 4x npyv_f32 or 8x npyv_f64 in a single npyv_u8.
+ * Result should be bool true / false, ready to write back.
+ */
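+
+/*
+ * Illustrative use (a sketch, assuming a 128-bit SIMD build where
+ * npyv_nlanes_f32 == 4 and npyv_nlanes_u8 == 16):
+ *
+ *   npyv_u32 r1 = npyv_isnan_f32(v);                   // 4 lanes of 0 / 1
+ *   npyv_u8  r4 = npyv_pack_isnan_f32(v0, v1, v2, v3); // 16 bool bytes of 0 / 1
+ *   npyv_store_u8(out, r4);                            // ready to write back
+ */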
+
+#if NPY_SIMD_F32
+NPY_FINLINE npyv_u32
+npyv_isnan_f32(npyv_f32 v)
+{
+ const npyv_u8 truemask = npyv_reinterpret_u8_u32(npyv_setall_u32(1==1));
+ npyv_u8 notnan = npyv_reinterpret_u8_u32(npyv_cvt_u32_b32(npyv_notnan_f32(v)));
+ return npyv_reinterpret_u32_u8(npyv_andc_u8(truemask, notnan));
+}
+NPY_FINLINE npyv_u8
+npyv_pack_isnan_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3)
+{
+ const npyv_u8 truemask = npyv_setall_u8(1==1);
+ npyv_b32 b0 = npyv_notnan_f32(v0);
+ npyv_b32 b1 = npyv_notnan_f32(v1);
+ npyv_b32 b2 = npyv_notnan_f32(v2);
+ npyv_b32 b3 = npyv_notnan_f32(v3);
+ npyv_b8 notnan = npyv_pack_b8_b32(b0, b1, b2, b3);
+ return npyv_andc_u8(truemask, npyv_cvt_u8_b8(notnan));
+}
+#endif
+#if NPY_SIMD_F64
+NPY_FINLINE npyv_u64
+npyv_isnan_f64(npyv_f64 v)
+{
+ const npyv_u8 truemask = npyv_reinterpret_u8_u64(npyv_setall_u64(1==1));
+ npyv_u8 notnan = npyv_reinterpret_u8_u64(npyv_cvt_u64_b64(npyv_notnan_f64(v)));
+ return npyv_reinterpret_u64_u8(npyv_andc_u8(truemask, notnan));
+}
+NPY_FINLINE npyv_u8
+npyv_pack_isnan_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3,
+ npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7)
+{
+ const npyv_u8 truemask = npyv_setall_u8(1==1);
+ npyv_b64 b0 = npyv_notnan_f64(v0);
+ npyv_b64 b1 = npyv_notnan_f64(v1);
+ npyv_b64 b2 = npyv_notnan_f64(v2);
+ npyv_b64 b3 = npyv_notnan_f64(v3);
+ npyv_b64 b4 = npyv_notnan_f64(v4);
+ npyv_b64 b5 = npyv_notnan_f64(v5);
+ npyv_b64 b6 = npyv_notnan_f64(v6);
+ npyv_b64 b7 = npyv_notnan_f64(v7);
+ npyv_b8 notnan = npyv_pack_b8_b64(b0, b1, b2, b3, b4, b5, b6, b7);
+ return npyv_andc_u8(truemask, npyv_cvt_u8_b8(notnan));
+}
+#endif
+
+#if NPY_SIMD_F32
+NPY_FINLINE npyv_b32
+_npyv_isinf_f32(npyv_f32 v)
+{
+#if defined(NPY_HAVE_NEON)
+ // abs(v) > FLT_MAX
+ const npyv_f32 fltmax = npyv_setall_f32(FLT_MAX);
+ return vcagtq_f32(v, fltmax);
+#else
+ // cast out the sign and check if all exponent bits are set.
+ const npyv_u32 exp_mask = npyv_setall_u32(0xff000000);
+ npyv_u32 bits = npyv_shli_u32(npyv_reinterpret_u32_f32(v), 1);
+ return npyv_cmpeq_u32(bits, exp_mask);
+#endif
+}
+NPY_FINLINE npyv_u32
+npyv_isinf_f32(npyv_f32 v)
+{
+ const npyv_u32 truemask = npyv_setall_u32(1==1);
+ return npyv_and_u32(truemask, npyv_cvt_u32_b32(_npyv_isinf_f32(v)));
+}
+NPY_FINLINE npyv_u8
+npyv_pack_isinf_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3)
+{
+ const npyv_u8 truemask = npyv_setall_u8(1==1);
+ npyv_b32 b0 = _npyv_isinf_f32(v0);
+ npyv_b32 b1 = _npyv_isinf_f32(v1);
+ npyv_b32 b2 = _npyv_isinf_f32(v2);
+ npyv_b32 b3 = _npyv_isinf_f32(v3);
+ npyv_b8 isinf = npyv_pack_b8_b32(b0, b1, b2, b3);
+ return npyv_and_u8(truemask, npyv_cvt_u8_b8(isinf));
+}
+#endif // NPY_SIMD_F32
+#if NPY_SIMD_F64
+NPY_FINLINE npyv_b64
+_npyv_isinf_f64(npyv_f64 v)
+{
+#if defined(NPY_HAVE_NEON)
+ // abs(v) > DBL_MAX
+ const npyv_f64 fltmax = npyv_setall_f64(DBL_MAX);
+ return vcagtq_f64(v, fltmax);
+#else
+ // cast out the sign and check if all exponent bits are set.
+ const npyv_u64 exp_mask = npyv_setall_u64(0xffe0000000000000);
+ npyv_u64 bits = npyv_shli_u64(npyv_reinterpret_u64_f64(v), 1);
+ return npyv_cmpeq_u64(bits, exp_mask);
+#endif
+}
+NPY_FINLINE npyv_u64
+npyv_isinf_f64(npyv_f64 v)
+{
+ const npyv_u64 truemask = npyv_setall_u64(1==1);
+ return npyv_and_u64(truemask, npyv_cvt_u64_b64(_npyv_isinf_f64(v)));
+}
+NPY_FINLINE npyv_u8
+npyv_pack_isinf_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3,
+ npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7)
+{
+ const npyv_u8 truemask = npyv_setall_u8(1==1);
+ npyv_b64 b0 = _npyv_isinf_f64(v0);
+ npyv_b64 b1 = _npyv_isinf_f64(v1);
+ npyv_b64 b2 = _npyv_isinf_f64(v2);
+ npyv_b64 b3 = _npyv_isinf_f64(v3);
+ npyv_b64 b4 = _npyv_isinf_f64(v4);
+ npyv_b64 b5 = _npyv_isinf_f64(v5);
+ npyv_b64 b6 = _npyv_isinf_f64(v6);
+ npyv_b64 b7 = _npyv_isinf_f64(v7);
+ npyv_b8 isinf = npyv_pack_b8_b64(b0, b1, b2, b3, b4, b5, b6, b7);
+ return npyv_and_u8(truemask, npyv_cvt_u8_b8(isinf));
+}
+#endif // NPY_SIMD_F64
+
+
+#if NPY_SIMD_F32
+NPY_FINLINE npyv_b32
+npyv_notfinite_f32(npyv_f32 v)
+{
+ // cast out the sign and check if all exponent bits are set
+    // no matter what the mantissa is.
+ const npyv_u32 exp_mask = npyv_setall_u32(0x7f800000);
+ npyv_u32 bits = npyv_reinterpret_u32_f32(v);
+ bits = npyv_and_u32(bits, exp_mask);
+ return npyv_cmpeq_u32(bits, exp_mask);
+}
+NPY_FINLINE npyv_u32
+npyv_isfinite_f32(npyv_f32 v)
+{
+ const npyv_u8 truemask = npyv_reinterpret_u8_u32(npyv_setall_u32(1==1));
+ npyv_u8 notfinite = npyv_reinterpret_u8_u32(npyv_cvt_u32_b32(npyv_notfinite_f32(v)));
+ return npyv_reinterpret_u32_u8(npyv_andc_u8(truemask, notfinite));
+}
+NPY_FINLINE npyv_u8
+npyv_pack_isfinite_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3)
+{
+#if defined(NPY_HAVE_NEON) && defined(__aarch64__)
+    // The F32 exponent is 8 bits, which means we can pack multiple into
+    // a single vector. We shift out the sign bit so that we're left
+    // with only the exponent in the high byte. If not all bits are set,
+ // then we've got a finite number.
+ uint8x16x4_t tbl;
+ tbl.val[0] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v0), 1));
+ tbl.val[1] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v1), 1));
+ tbl.val[2] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v2), 1));
+ tbl.val[3] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v3), 1));
+
+ const npyv_u8 permute = {3,7,11,15, 19,23,27,31, 35,39,43,47, 51,55,59,63};
+ npyv_u8 r = vqtbl4q_u8(tbl, permute);
+
+ const npyv_u8 expmask = npyv_setall_u8(0xff);
+ r = npyv_cmpneq_u8(r, expmask);
+ r = vshrq_n_u8(r, 7);
+ return r;
+#else
+ const npyv_u8 truemask = npyv_setall_u8(1==1);
+ npyv_b32 b0 = npyv_notfinite_f32(v0);
+ npyv_b32 b1 = npyv_notfinite_f32(v1);
+ npyv_b32 b2 = npyv_notfinite_f32(v2);
+ npyv_b32 b3 = npyv_notfinite_f32(v3);
+ npyv_b8 notfinite = npyv_pack_b8_b32(b0, b1, b2, b3);
+ return npyv_andc_u8(truemask, npyv_cvt_u8_b8(notfinite));
+#endif
+}
+#endif // NPY_SIMD_F32
+#if NPY_SIMD_F64
+NPY_FINLINE npyv_b64
+npyv_notfinite_f64(npyv_f64 v)
+{
+ // cast out the sign and check if all exponent bits are set
+    // no matter what the mantissa is.
+ const npyv_u64 exp_mask = npyv_setall_u64(0x7ff0000000000000);
+ npyv_u64 bits = npyv_reinterpret_u64_f64(v);
+ bits = npyv_and_u64(bits, exp_mask);
+ return npyv_cmpeq_u64(bits, exp_mask);
+}
+NPY_FINLINE npyv_u64
+npyv_isfinite_f64(npyv_f64 v)
+{
+ const npyv_u8 truemask = npyv_reinterpret_u8_u64(npyv_setall_u64(1==1));
+ npyv_u8 notfinite = npyv_reinterpret_u8_u64(npyv_cvt_u64_b64(npyv_notfinite_f64(v)));
+ return npyv_reinterpret_u64_u8(npyv_andc_u8(truemask, notfinite));
+}
+NPY_FINLINE npyv_u8
+npyv_pack_isfinite_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3,
+ npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7)
+{
+#if defined(NPY_HAVE_NEON) && defined(__aarch64__)
+    // The F64 exponent is 11 bits, which means we can pack multiple into
+ // a single vector. We'll need to use u16 to fit all exponent
+ // bits. If not all bits are set, then we've got a finite number.
+ uint8x16x4_t t0123, t4567;
+ t0123.val[0] = npyv_reinterpret_u8_f64(v0);
+ t0123.val[1] = npyv_reinterpret_u8_f64(v1);
+ t0123.val[2] = npyv_reinterpret_u8_f64(v2);
+ t0123.val[3] = npyv_reinterpret_u8_f64(v3);
+ t4567.val[0] = npyv_reinterpret_u8_f64(v4);
+ t4567.val[1] = npyv_reinterpret_u8_f64(v5);
+ t4567.val[2] = npyv_reinterpret_u8_f64(v6);
+ t4567.val[3] = npyv_reinterpret_u8_f64(v7);
+
+ const npyv_u8 permute = {6,7,14,15, 22,23,30,31, 38,39,46,47, 54,55,62,63};
+ npyv_u16 r0 = npyv_reinterpret_u16_u8(vqtbl4q_u8(t0123, permute));
+ npyv_u16 r1 = npyv_reinterpret_u16_u8(vqtbl4q_u8(t4567, permute));
+
+ const npyv_u16 expmask = npyv_setall_u16(0x7ff0);
+ r0 = npyv_and_u16(r0, expmask);
+ r0 = npyv_cmpneq_u16(r0, expmask);
+ r0 = npyv_shri_u16(r0, 15);
+ r1 = npyv_and_u16(r1, expmask);
+ r1 = npyv_cmpneq_u16(r1, expmask);
+ r1 = npyv_shri_u16(r1, 15);
+
+ npyv_u8 r = npyv_pack_b8_b16(r0, r1);
+ return r;
+#else
+ const npyv_u8 truemask = npyv_setall_u8(1==1);
+ npyv_b64 b0 = npyv_notfinite_f64(v0);
+ npyv_b64 b1 = npyv_notfinite_f64(v1);
+ npyv_b64 b2 = npyv_notfinite_f64(v2);
+ npyv_b64 b3 = npyv_notfinite_f64(v3);
+ npyv_b64 b4 = npyv_notfinite_f64(v4);
+ npyv_b64 b5 = npyv_notfinite_f64(v5);
+ npyv_b64 b6 = npyv_notfinite_f64(v6);
+ npyv_b64 b7 = npyv_notfinite_f64(v7);
+ npyv_b8 notfinite = npyv_pack_b8_b64(b0, b1, b2, b3, b4, b5, b6, b7);
+ return npyv_andc_u8(truemask, npyv_cvt_u8_b8(notfinite));
+#endif
+}
+#endif // NPY_SIMD_F64
+
+#if NPY_SIMD_F32
+NPY_FINLINE npyv_u32
+npyv_signbit_f32(npyv_f32 v)
+{
+ return npyv_shri_u32(npyv_reinterpret_u32_f32(v), (sizeof(npyv_lanetype_f32)*8)-1);
+}
+NPY_FINLINE npyv_u8
+npyv_pack_signbit_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3)
+{
+#if defined(NPY_HAVE_NEON) && defined(__aarch64__)
+ // We only need high byte for signbit, which means we can pack
+ // multiple inputs into a single vector.
+ uint8x16x4_t tbl;
+ tbl.val[0] = npyv_reinterpret_u8_f32(v0);
+ tbl.val[1] = npyv_reinterpret_u8_f32(v1);
+ tbl.val[2] = npyv_reinterpret_u8_f32(v2);
+ tbl.val[3] = npyv_reinterpret_u8_f32(v3);
+
+ const npyv_u8 permute = {3,7,11,15, 19,23,27,31, 35,39,43,47, 51,55,59,63};
+ npyv_u8 r = vqtbl4q_u8(tbl, permute);
+ r = vshrq_n_u8(r, 7);
+ return r;
+#else
+ npyv_b32 b0 = npyv_cvt_b32_u32(npyv_signbit_f32(v0));
+ npyv_b32 b1 = npyv_cvt_b32_u32(npyv_signbit_f32(v1));
+ npyv_b32 b2 = npyv_cvt_b32_u32(npyv_signbit_f32(v2));
+ npyv_b32 b3 = npyv_cvt_b32_u32(npyv_signbit_f32(v3));
+ npyv_b8 signbit = npyv_pack_b8_b32(b0, b1, b2, b3);
+ return npyv_cvt_u8_b8(signbit);
+#endif
+}
+#endif // NPY_SIMD_F32
+#if NPY_SIMD_F64
+NPY_FINLINE npyv_u64
+npyv_signbit_f64(npyv_f64 v)
+{
+ return npyv_shri_u64(npyv_reinterpret_u64_f64(v), (sizeof(npyv_lanetype_f64)*8)-1);
+}
+NPY_FINLINE npyv_u8
+npyv_pack_signbit_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3,
+ npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7)
+{
+#if defined(NPY_HAVE_NEON) && defined(__aarch64__)
+ // We only need high byte for signbit, which means we can pack
+ // multiple inputs into a single vector.
+
+ // vuzp2 faster than vtbl for f64
+ npyv_u32 v01 = vuzp2q_u32(npyv_reinterpret_u32_f64(v0), npyv_reinterpret_u32_f64(v1));
+ npyv_u32 v23 = vuzp2q_u32(npyv_reinterpret_u32_f64(v2), npyv_reinterpret_u32_f64(v3));
+ npyv_u32 v45 = vuzp2q_u32(npyv_reinterpret_u32_f64(v4), npyv_reinterpret_u32_f64(v5));
+ npyv_u32 v67 = vuzp2q_u32(npyv_reinterpret_u32_f64(v6), npyv_reinterpret_u32_f64(v7));
+
+ npyv_u16 v0123 = vuzp2q_u16(npyv_reinterpret_u16_u32(v01), npyv_reinterpret_u16_u32(v23));
+ npyv_u16 v4567 = vuzp2q_u16(npyv_reinterpret_u16_u32(v45), npyv_reinterpret_u16_u32(v67));
+
+ npyv_u8 r = vuzp2q_u8(npyv_reinterpret_u8_u16(v0123), npyv_reinterpret_u8_u16(v4567));
+ r = vshrq_n_u8(r, 7);
+ return r;
+#else
+ npyv_b64 b0 = npyv_cvt_b64_u64(npyv_signbit_f64(v0));
+ npyv_b64 b1 = npyv_cvt_b64_u64(npyv_signbit_f64(v1));
+ npyv_b64 b2 = npyv_cvt_b64_u64(npyv_signbit_f64(v2));
+ npyv_b64 b3 = npyv_cvt_b64_u64(npyv_signbit_f64(v3));
+ npyv_b64 b4 = npyv_cvt_b64_u64(npyv_signbit_f64(v4));
+ npyv_b64 b5 = npyv_cvt_b64_u64(npyv_signbit_f64(v5));
+ npyv_b64 b6 = npyv_cvt_b64_u64(npyv_signbit_f64(v6));
+ npyv_b64 b7 = npyv_cvt_b64_u64(npyv_signbit_f64(v7));
+ npyv_b8 signbit = npyv_pack_b8_b64(b0, b1, b2, b3, b4, b5, b6, b7);
+ return npyv_cvt_u8_b8(signbit);
+#endif
+}
+#endif // NPY_SIMD_F64
+
+#endif // NPY_SIMD
+
+/********************************************************************************
+ ** Defining the SIMD kernels
+ ********************************************************************************/
+/** Notes:
+ * - avoid the use of libmath to unify fp/domain errors
+ * for both scalars and vectors among all compilers/architectures.
+ * - use intrinsic npyv_load_till_* instead of npyv_load_tillz_
+ *    to fill the remaining lanes with 1.0 to avoid divide by zero fp
+ * exception in reciprocal.
+ */
+#define CONTIG 0
+#define NCONTIG 1
+
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE#
+ * #sfx = f32, f64#
+ * #VCHK = NPY_SIMD_F32, NPY_SIMD_F64#
+ * #ssfx = 32, 64#
+ */
+#if @VCHK@
+/**begin repeat1
+ * #kind = isnan, isinf, isfinite, signbit#
+ */
+/**begin repeat2
+ * #STYPE = CONTIG, NCONTIG, CONTIG, NCONTIG#
+ * #DTYPE = CONTIG, CONTIG, NCONTIG, NCONTIG#
+ */
+static void simd_unary_@kind@_@TYPE@_@STYPE@_@DTYPE@
+(const void *src, npy_intp istride, void *dst, npy_intp ostride, npy_intp len)
+{
+ const npyv_lanetype_@sfx@ *ip = src;
+ npy_bool *op = dst;
+
+ // How many vectors can be packed into a u8 / bool vector?
+ #define PACK_FACTOR (NPY_SIMD_WIDTH / npyv_nlanes_@sfx@)
+ assert(PACK_FACTOR == 4 || PACK_FACTOR == 8);
+
+ const int vstep = npyv_nlanes_@sfx@;
+ const int wstep = vstep * PACK_FACTOR;
+
+ // unrolled iterations
+ for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
+ // Load vectors
+ #if @STYPE@ == CONTIG
+ // contiguous input
+ npyv_@sfx@ v0 = npyv_load_@sfx@(ip + vstep * 0);
+ npyv_@sfx@ v1 = npyv_load_@sfx@(ip + vstep * 1);
+ npyv_@sfx@ v2 = npyv_load_@sfx@(ip + vstep * 2);
+ npyv_@sfx@ v3 = npyv_load_@sfx@(ip + vstep * 3);
+ #if PACK_FACTOR == 8
+ npyv_@sfx@ v4 = npyv_load_@sfx@(ip + vstep * 4);
+ npyv_@sfx@ v5 = npyv_load_@sfx@(ip + vstep * 5);
+ npyv_@sfx@ v6 = npyv_load_@sfx@(ip + vstep * 6);
+ npyv_@sfx@ v7 = npyv_load_@sfx@(ip + vstep * 7);
+ #endif
+ #else
+ // non-contiguous input
+ npyv_@sfx@ v0 = npyv_loadn_@sfx@(ip + istride * vstep * 0, istride);
+ npyv_@sfx@ v1 = npyv_loadn_@sfx@(ip + istride * vstep * 1, istride);
+ npyv_@sfx@ v2 = npyv_loadn_@sfx@(ip + istride * vstep * 2, istride);
+ npyv_@sfx@ v3 = npyv_loadn_@sfx@(ip + istride * vstep * 3, istride);
+ #if PACK_FACTOR == 8
+ npyv_@sfx@ v4 = npyv_loadn_@sfx@(ip + istride * vstep * 4, istride);
+ npyv_@sfx@ v5 = npyv_loadn_@sfx@(ip + istride * vstep * 5, istride);
+ npyv_@sfx@ v6 = npyv_loadn_@sfx@(ip + istride * vstep * 6, istride);
+ npyv_@sfx@ v7 = npyv_loadn_@sfx@(ip + istride * vstep * 7, istride);
+ #endif
+ #endif
+
+ #if PACK_FACTOR == 4
+ npyv_u8 r = npyv_pack_@kind@_@sfx@(v0, v1, v2, v3);
+ #elif PACK_FACTOR == 8
+ npyv_u8 r = npyv_pack_@kind@_@sfx@(v0, v1, v2, v3, v4, v5, v6, v7);
+ #endif
+
+ #if @DTYPE@ == CONTIG
+ npyv_store_u8(op, r);
+ #else // @DTYPE@ == CONTIG
+ // Results are packed, so we can just loop over them
+ npy_uint8 lane[npyv_nlanes_u8];
+ npyv_store_u8(lane, r);
+ for (int ln=0; (ln * sizeof(npyv_lanetype_@sfx@)) < npyv_nlanes_u8; ++ln){
+ op[ln * ostride] = lane[ln * sizeof(npyv_lanetype_@sfx@)];
+ }
+ #endif // @DTYPE@ == CONTIG
+ }
+
+ // vector-sized iterations
+ for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) {
+ #if @STYPE@ == CONTIG
+ npyv_@sfx@ v = npyv_load_@sfx@(ip);
+ #else
+ npyv_@sfx@ v = npyv_loadn_@sfx@(ip, istride);
+ #endif
+
+ npyv_u@ssfx@ r = npyv_@kind@_@sfx@(v);
+
+ npy_uint8 lane[npyv_nlanes_u8];
+ npyv_store_u8(lane, npyv_reinterpret_u8_u@ssfx@(r));
+
+ op[0 * ostride] = lane[0 * sizeof(npyv_lanetype_@sfx@)];
+ op[1 * ostride] = lane[1 * sizeof(npyv_lanetype_@sfx@)];
+ #if npyv_nlanes_@sfx@ == 4
+ op[2 * ostride] = lane[2 * sizeof(npyv_lanetype_@sfx@)];
+ op[3 * ostride] = lane[3 * sizeof(npyv_lanetype_@sfx@)];
+ #endif
+ }
+
+ #undef PACK_FACTOR
+
+ // Scalar loop to finish off
+ for (; len > 0; --len, ip += istride, op += ostride) {
+ *op = (npy_@kind@(*ip) != 0);
+ }
+
+ npyv_cleanup();
+}
+/**end repeat2**/
+/**end repeat1**/
+
+#endif // @VCHK@
+/**end repeat**/
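+
+/*
+ * Packing arithmetic for the kernels above (given the 128-bit SIMD forced at
+ * the top of this file, NPY_SIMD_WIDTH == 16 bytes):
+ *
+ *   f32: vstep == 4, PACK_FACTOR == 4, wstep == 16  // 16 floats  per unrolled step
+ *   f64: vstep == 2, PACK_FACTOR == 8, wstep == 16  // 16 doubles per unrolled step
+ *
+ * Either way, each unrolled iteration produces exactly npyv_nlanes_u8 (16)
+ * npy_bool results packed in one npyv_u8 vector.
+ */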
+
+/********************************************************************************
+ ** Defining ufunc inner functions
+ ********************************************************************************/
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE#
+ * #sfx = f32, f64#
+ * #VCHK = NPY_SIMD_F32, NPY_SIMD_F64#
+ */
+
+/**begin repeat1
+ * #kind = isnan, isinf, isfinite, signbit#
+ **/
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#if @VCHK@
+ const char *ip = args[0];
+ char *op = args[1];
+ const npy_intp istep = steps[0];
+ const npy_intp ostep = steps[1];
+ npy_intp len = dimensions[0];
+ const int ilsize = sizeof(npyv_lanetype_@sfx@);
+ const int olsize = sizeof(npy_bool);
+ const npy_intp istride = istep / ilsize;
+ const npy_intp ostride = ostep / olsize;
+ assert(len <= 1 || ostep % olsize == 0);
+
+ if ((istep % ilsize == 0) &&
+ !is_mem_overlap(ip, istep, op, ostep, len) &&
+ npyv_loadable_stride_@sfx@(istride) &&
+ npyv_storable_stride_@sfx@(ostride))
+ {
+ if (istride == 1 && ostride == 1) {
+ simd_unary_@kind@_@TYPE@_CONTIG_CONTIG(ip, 1, op, 1, len);
+ }
+ else if (ostride == 1) {
+ simd_unary_@kind@_@TYPE@_NCONTIG_CONTIG(ip, istride, op, 1, len);
+ }
+ else if (istride == 1) {
+ simd_unary_@kind@_@TYPE@_CONTIG_NCONTIG(ip, 1, op, ostride, len);
+ } else {
+ simd_unary_@kind@_@TYPE@_NCONTIG_NCONTIG(ip, istride, op, ostride, len);
+ }
+ } else
+#endif // @VCHK@
+ {
+ UNARY_LOOP {
+ const npyv_lanetype_@sfx@ in = *(npyv_lanetype_@sfx@ *)ip1;
+ *((npy_bool *)op1) = (npy_@kind@(in) != 0);
+ }
+ }
+
+ npy_clear_floatstatus_barrier((char*)dimensions);
+}
+/**end repeat1**/
+/**end repeat**/
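+
+/*
+ * Illustrative dispatch (a sketch for DOUBLE_isnan, assuming NPY_SIMD_F64 and
+ * no memory overlap): with steps[0] == 16 bytes and a contiguous npy_bool
+ * output (steps[1] == 1), istride == 2 and ostride == 1, so
+ * simd_unary_isnan_DOUBLE_NCONTIG_CONTIG(ip, 2, op, 1, len) is called;
+ * otherwise the scalar UNARY_LOOP fallback evaluates npy_isnan per element.
+ */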
diff --git a/numpy/core/src/umath/loops_utils.h.src b/numpy/core/src/umath/loops_utils.h.src
index df92bc315..5640a1f0b 100644
--- a/numpy/core/src/umath/loops_utils.h.src
+++ b/numpy/core/src/umath/loops_utils.h.src
@@ -74,7 +74,7 @@ is_mem_overlap(const void *src, npy_intp src_step, const void *dst, npy_intp dst
* The recursion depth is O(lg n) as well.
* when updating also update similar complex floats summation
*/
-static NPY_INLINE @type@
+static inline @type@
@TYPE@_pairwise_sum(char *a, npy_intp n, npy_intp stride)
{
if (n < 8) {
@@ -152,7 +152,7 @@ static NPY_INLINE @type@
* #SIMD = 1, 1, 0#
*/
/* similar to pairwise sum of real floats */
-static NPY_INLINE void
+static inline void
@TYPE@_pairwise_sum(@ftype@ *rr, @ftype@ * ri, char * a, npy_intp n,
npy_intp stride)
{
diff --git a/numpy/core/src/umath/matmul.c.src b/numpy/core/src/umath/matmul.c.src
index 4dd0c4759..127bb5e27 100644
--- a/numpy/core/src/umath/matmul.c.src
+++ b/numpy/core/src/umath/matmul.c.src
@@ -45,7 +45,7 @@
* 3. The slower (first) axis stride, in unit steps, must be larger than
* the faster axis dimension
*/
-static NPY_INLINE npy_bool
+static inline npy_bool
is_blasable2d(npy_intp byte_stride1, npy_intp byte_stride2,
npy_intp d1, npy_intp d2, npy_intp itemsize)
{
diff --git a/numpy/core/src/umath/override.c b/numpy/core/src/umath/override.c
index d247c2639..167164163 100644
--- a/numpy/core/src/umath/override.c
+++ b/numpy/core/src/umath/override.c
@@ -23,18 +23,19 @@
* Returns -1 on failure.
*/
static int
-get_array_ufunc_overrides(PyObject *in_args, PyObject *out_args,
+get_array_ufunc_overrides(PyObject *in_args, PyObject *out_args, PyObject *wheremask_obj,
PyObject **with_override, PyObject **methods)
{
int i;
int num_override_args = 0;
- int narg, nout;
+ int narg, nout, nwhere;
narg = (int)PyTuple_GET_SIZE(in_args);
/* It is valid for out_args to be NULL: */
nout = (out_args != NULL) ? (int)PyTuple_GET_SIZE(out_args) : 0;
+ nwhere = (wheremask_obj != NULL) ? 1: 0;
- for (i = 0; i < narg + nout; ++i) {
+ for (i = 0; i < narg + nout + nwhere; ++i) {
PyObject *obj;
int j;
int new_class = 1;
@@ -42,9 +43,12 @@ get_array_ufunc_overrides(PyObject *in_args, PyObject *out_args,
if (i < narg) {
obj = PyTuple_GET_ITEM(in_args, i);
}
- else {
+ else if (i < narg + nout){
obj = PyTuple_GET_ITEM(out_args, i - narg);
}
+ else {
+ obj = wheremask_obj;
+ }
/*
* Have we seen this class before? If so, ignore.
*/
@@ -208,7 +212,7 @@ copy_positional_args_to_kwargs(const char **keywords,
*/
NPY_NO_EXPORT int
PyUFunc_CheckOverride(PyUFuncObject *ufunc, char *method,
- PyObject *in_args, PyObject *out_args,
+ PyObject *in_args, PyObject *out_args, PyObject *wheremask_obj,
PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames,
PyObject **result)
{
@@ -227,7 +231,7 @@ PyUFunc_CheckOverride(PyUFuncObject *ufunc, char *method,
* Check inputs for overrides
*/
num_override_args = get_array_ufunc_overrides(
- in_args, out_args, with_override, array_ufunc_methods);
+ in_args, out_args, wheremask_obj, with_override, array_ufunc_methods);
if (num_override_args == -1) {
goto fail;
}
diff --git a/numpy/core/src/umath/override.h b/numpy/core/src/umath/override.h
index 4e9a323ca..20621bb19 100644
--- a/numpy/core/src/umath/override.h
+++ b/numpy/core/src/umath/override.h
@@ -6,7 +6,7 @@
NPY_NO_EXPORT int
PyUFunc_CheckOverride(PyUFuncObject *ufunc, char *method,
- PyObject *in_args, PyObject *out_args,
+ PyObject *in_args, PyObject *out_args, PyObject *wheremask_obj,
PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames,
PyObject **result);
diff --git a/numpy/core/src/umath/reduction.c b/numpy/core/src/umath/reduction.c
index 817f99a04..9416e9a29 100644
--- a/numpy/core/src/umath/reduction.c
+++ b/numpy/core/src/umath/reduction.c
@@ -17,6 +17,9 @@
#include "numpy/arrayobject.h"
#include "npy_pycompat.h"
+#include "array_assign.h"
+#include "array_coercion.h"
+#include "array_method.h"
#include "ctors.h"
#include "numpy/ufuncobject.h"
@@ -148,24 +151,15 @@ PyArray_CopyInitialReduceValues(
* context : The ArrayMethod context (with ufunc, method, and descriptors).
* operand : The array to be reduced.
* out : NULL, or the array into which to place the result.
- * wheremask : NOT YET SUPPORTED, but this parameter is placed here
- * so that support can be added in the future without breaking
- * API compatibility. Pass in NULL.
+ * wheremask : Reduction mask of valid values used for `where=`.
* axis_flags : Flags indicating the reduction axes of 'operand'.
- * reorderable : If True, the reduction being done is reorderable, which
- * means specifying multiple axes of reduction at once is ok,
- * and the reduction code may calculate the reduction in an
- * arbitrary order. The calculation may be reordered because
- * of cache behavior or multithreading requirements.
* keepdims : If true, leaves the reduction dimensions in the result
* with size one.
* subok : If true, the result uses the subclass of operand, otherwise
* it is always a base class ndarray.
- * identity : If Py_None, PyArray_CopyInitialReduceValues is used, otherwise
- * this value is used to initialize the result to
- * the reduction's unit.
+ * initial : Initial value, if NULL the default is fetched from the
+ * ArrayMethod (typically as the default from the ufunc).
* loop : `reduce_loop` from `ufunc_object.c`. TODO: Refactor
- * data : Data which is passed to the inner loop.
* buffersize : Buffer size for the iterator. For the default, pass in 0.
* funcname : The name of the reduction function, for error messages.
* errormask : forwarded from _get_bufsize_errmask
@@ -182,9 +176,9 @@ PyArray_CopyInitialReduceValues(
NPY_NO_EXPORT PyArrayObject *
PyUFunc_ReduceWrapper(PyArrayMethod_Context *context,
PyArrayObject *operand, PyArrayObject *out, PyArrayObject *wheremask,
- npy_bool *axis_flags, int reorderable, int keepdims,
- PyObject *identity, PyArray_ReduceLoopFunc *loop,
- void *data, npy_intp buffersize, const char *funcname, int errormask)
+ npy_bool *axis_flags, int keepdims,
+ PyObject *initial, PyArray_ReduceLoopFunc *loop,
+ npy_intp buffersize, const char *funcname, int errormask)
{
assert(loop != NULL);
PyArrayObject *result = NULL;
@@ -198,38 +192,35 @@ PyUFunc_ReduceWrapper(PyArrayMethod_Context *context,
/* Loop auxdata (must be freed on error) */
NpyAuxData *auxdata = NULL;
- /* More than one axis means multiple orders are possible */
- if (!reorderable && count_axes(PyArray_NDIM(operand), axis_flags) > 1) {
- PyErr_Format(PyExc_ValueError,
- "reduction operation '%s' is not reorderable, "
- "so at most one axis may be specified",
- funcname);
- return NULL;
- }
- /* Can only use where with an initial ( from identity or argument) */
- if (wheremask != NULL && identity == Py_None) {
- PyErr_Format(PyExc_ValueError,
- "reduction operation '%s' does not have an identity, "
- "so to use a where mask one has to specify 'initial'",
- funcname);
- return NULL;
- }
-
-
/* Set up the iterator */
op[0] = out;
op[1] = operand;
op_dtypes[0] = context->descriptors[0];
op_dtypes[1] = context->descriptors[1];
+ /* Buffer to use when we need an initial value */
+ char *initial_buf = NULL;
+
+ /* More than one axis means multiple orders are possible */
+ if (!(context->method->flags & NPY_METH_IS_REORDERABLE)
+ && count_axes(PyArray_NDIM(operand), axis_flags) > 1) {
+ PyErr_Format(PyExc_ValueError,
+ "reduction operation '%s' is not reorderable, "
+ "so at most one axis may be specified",
+ funcname);
+ goto fail;
+ }
+
it_flags = NPY_ITER_BUFFERED |
NPY_ITER_EXTERNAL_LOOP |
NPY_ITER_GROWINNER |
- NPY_ITER_DONT_NEGATE_STRIDES |
NPY_ITER_ZEROSIZE_OK |
NPY_ITER_REFS_OK |
NPY_ITER_DELAY_BUFALLOC |
NPY_ITER_COPY_IF_OVERLAP;
+ if (!(context->method->flags & NPY_METH_IS_REORDERABLE)) {
+ it_flags |= NPY_ITER_DONT_NEGATE_STRIDES;
+ }
op_flags[0] = NPY_ITER_READWRITE |
NPY_ITER_ALIGNED |
NPY_ITER_ALLOCATE |
@@ -297,8 +288,52 @@ PyUFunc_ReduceWrapper(PyArrayMethod_Context *context,
goto fail;
}
+ npy_bool empty_iteration = NpyIter_GetIterSize(iter) == 0;
result = NpyIter_GetOperandArray(iter)[0];
+ /*
+ * Get the initial value (if it exists). If the iteration is empty
+ * then we assume the reduction is also empty. The reason is that when
+ * the outer iteration is empty we just won't use the initial value
+ * in any case. (`np.sum(np.zeros((0, 3)), axis=0)` is a length 3
+ * reduction but has an empty result.)
+ */
+ if ((initial == NULL && context->method->get_reduction_initial == NULL)
+ || initial == Py_None) {
+ /* There is no initial value, or initial value was explicitly unset */
+ }
+ else {
+        /* Not all functions need an initial value, but we always initialize the buffer: */
+ initial_buf = PyMem_Calloc(1, op_dtypes[0]->elsize);
+ if (initial_buf == NULL) {
+ PyErr_NoMemory();
+ goto fail;
+ }
+ if (initial != NULL) {
+ /* must use user provided initial value */
+ if (PyArray_Pack(op_dtypes[0], initial_buf, initial) < 0) {
+ goto fail;
+ }
+ }
+ else {
+ /*
+ * Fetch initial from ArrayMethod, we pretend the reduction is
+ * empty when the iteration is. This may be wrong, but when it is,
+ * we will not need the identity as the result is also empty.
+ */
+ int has_initial = context->method->get_reduction_initial(
+ context, empty_iteration, initial_buf);
+ if (has_initial < 0) {
+ goto fail;
+ }
+ if (!has_initial) {
+ /* We have no initial value available, free buffer to indicate */
+ PyMem_FREE(initial_buf);
+ initial_buf = NULL;
+ }
+ }
+ }
+
PyArrayMethod_StridedLoop *strided_loop;
NPY_ARRAYMETHOD_FLAGS flags = 0;
@@ -313,12 +348,27 @@ PyUFunc_ReduceWrapper(PyArrayMethod_Context *context,
* Initialize the result to the reduction unit if possible,
* otherwise copy the initial values and get a view to the rest.
*/
- if (identity != Py_None) {
- if (PyArray_FillWithScalar(result, identity) < 0) {
+ if (initial_buf != NULL) {
+ /* Loop provided an identity or default value, assign to result. */
+ int ret = raw_array_assign_scalar(
+ PyArray_NDIM(result), PyArray_DIMS(result),
+ PyArray_DESCR(result),
+ PyArray_BYTES(result), PyArray_STRIDES(result),
+ op_dtypes[0], initial_buf);
+ if (ret < 0) {
goto fail;
}
}
else {
+ /* Can only use where with an initial (from identity or argument) */
+ if (wheremask != NULL) {
+ PyErr_Format(PyExc_ValueError,
+ "reduction operation '%s' does not have an identity, "
+ "so to use a where mask one has to specify 'initial'",
+ funcname);
+ return NULL;
+ }
+
/*
* For 1-D skip_first_count could be optimized to 0, but no-identity
* reductions are not super common.
@@ -354,7 +404,7 @@ PyUFunc_ReduceWrapper(PyArrayMethod_Context *context,
}
}
- if (NpyIter_GetIterSize(iter) != 0) {
+ if (!empty_iteration) {
NpyIter_IterNextFunc *iternext;
char **dataptr;
npy_intp *strideptr;
@@ -387,6 +437,10 @@ PyUFunc_ReduceWrapper(PyArrayMethod_Context *context,
}
Py_INCREF(result);
+ if (initial_buf != NULL && PyDataType_REFCHK(PyArray_DESCR(result))) {
+ PyArray_Item_XDECREF(initial_buf, PyArray_DESCR(result));
+ }
+ PyMem_FREE(initial_buf);
NPY_AUXDATA_FREE(auxdata);
if (!NpyIter_Deallocate(iter)) {
Py_DECREF(result);
@@ -395,6 +449,10 @@ PyUFunc_ReduceWrapper(PyArrayMethod_Context *context,
return result;
fail:
+ if (initial_buf != NULL && PyDataType_REFCHK(op_dtypes[0])) {
+ PyArray_Item_XDECREF(initial_buf, op_dtypes[0]);
+ }
+ PyMem_FREE(initial_buf);
NPY_AUXDATA_FREE(auxdata);
if (iter != NULL) {
NpyIter_Deallocate(iter);
diff --git a/numpy/core/src/umath/reduction.h b/numpy/core/src/umath/reduction.h
index 2170e27a7..d2cbe4849 100644
--- a/numpy/core/src/umath/reduction.h
+++ b/numpy/core/src/umath/reduction.h
@@ -64,8 +64,8 @@ typedef int (PyArray_ReduceLoopFunc)(PyArrayMethod_Context *context,
NPY_NO_EXPORT PyArrayObject *
PyUFunc_ReduceWrapper(PyArrayMethod_Context *context,
PyArrayObject *operand, PyArrayObject *out, PyArrayObject *wheremask,
- npy_bool *axis_flags, int reorderable, int keepdims,
- PyObject *identity, PyArray_ReduceLoopFunc *loop,
- void *data, npy_intp buffersize, const char *funcname, int errormask);
+ npy_bool *axis_flags, int keepdims,
+ PyObject *initial, PyArray_ReduceLoopFunc *loop,
+ npy_intp buffersize, const char *funcname, int errormask);
#endif
diff --git a/numpy/core/src/umath/scalarmath.c.src b/numpy/core/src/umath/scalarmath.c.src
index 7c63ac0f1..a159fdc12 100644
--- a/numpy/core/src/umath/scalarmath.c.src
+++ b/numpy/core/src/umath/scalarmath.c.src
@@ -57,7 +57,7 @@
* #name = byte, short, int, long, longlong#
* #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong#
*/
-static NPY_INLINE int
+static inline int
@name@_ctype_add(@type@ a, @type@ b, @type@ *out) {
*out = a + b;
if ((*out^a) >= 0 || (*out^b) >= 0) {
@@ -66,7 +66,7 @@ static NPY_INLINE int
return NPY_FPE_OVERFLOW;
}
-static NPY_INLINE int
+static inline int
@name@_ctype_subtract(@type@ a, @type@ b, @type@ *out) {
*out = a - b;
if ((*out^a) >= 0 || (*out^~b) >= 0) {
@@ -80,7 +80,7 @@ static NPY_INLINE int
* #name = ubyte, ushort, uint, ulong, ulonglong#
* #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong#
*/
-static NPY_INLINE int
+static inline int
@name@_ctype_add(@type@ a, @type@ b, @type@ *out) {
*out = a + b;
if (*out >= a && *out >= b) {
@@ -89,7 +89,7 @@ static NPY_INLINE int
return NPY_FPE_OVERFLOW;
}
-static NPY_INLINE int
+static inline int
@name@_ctype_subtract(@type@ a, @type@ b, @type@ *out) {
*out = a - b;
if (a >= b) {
@@ -118,7 +118,7 @@ static NPY_INLINE int
* #neg = (1,0)*4#
*/
#if NPY_SIZEOF_@SIZE@ > NPY_SIZEOF_@SIZENAME@
-static NPY_INLINE int
+static inline int
@name@_ctype_multiply(@type@ a, @type@ b, @type@ *out) {
@big@ temp;
temp = ((@big@) a) * ((@big@) b);
@@ -144,7 +144,7 @@ static NPY_INLINE int
* #SIZE = INT*2, LONG*2, LONGLONG*2#
*/
#if NPY_SIZEOF_LONGLONG == NPY_SIZEOF_@SIZE@
-static NPY_INLINE int
+static inline int
@name@_ctype_multiply(@type@ a, @type@ b, @type@ *out) {
if (npy_mul_with_overflow_@name@(out, a, b)) {
return NPY_FPE_OVERFLOW;
@@ -171,7 +171,7 @@ static NPY_INLINE int
#define DIVIDEBYZERO_CHECK (b == 0)
#endif
-static NPY_INLINE int
+static inline int
@name@_ctype_divide(@type@ a, @type@ b, @type@ *out) {
if (b == 0) {
*out = 0;
@@ -200,7 +200,7 @@ static NPY_INLINE int
#define @name@_ctype_floor_divide @name@_ctype_divide
-static NPY_INLINE int
+static inline int
@name@_ctype_remainder(@type@ a, @type@ b, @type@ *out) {
if (DIVIDEBYZERO_CHECK) {
*out = 0;
@@ -232,7 +232,7 @@ static NPY_INLINE int
* ulong, longlong, ulonglong#
*/
-static NPY_INLINE int
+static inline int
@name@_ctype_true_divide(npy_@name@ a, npy_@name@ b, npy_double *out)
{
*out = (npy_double)a / (npy_double)b;
@@ -251,7 +251,7 @@ static NPY_INLINE int
* #upc = BYTE, UBYTE, SHORT, USHORT, INT, UINT,
* LONG, ULONG, LONGLONG, ULONGLONG#
*/
-static NPY_INLINE int
+static inline int
@name@_ctype_power(@type@ a, @type@ b, @type@ *out) {
@type@ tmp;
@@ -292,7 +292,7 @@ static NPY_INLINE int
* #op = &, ^, |#
*/
-static NPY_INLINE int
+static inline int
@name@_ctype_@oper@(@type@ arg1, @type@ arg2, @type@ *out)
{
*out = arg1 @op@ arg2;
@@ -301,14 +301,14 @@ static NPY_INLINE int
/**end repeat1**/
-static NPY_INLINE int
+static inline int
@name@_ctype_lshift(@type@ arg1, @type@ arg2, @type@ *out)
{
*out = npy_lshift@suffix@(arg1, arg2);
return 0;
}
-static NPY_INLINE int
+static inline int
@name@_ctype_rshift(@type@ arg1, @type@ arg2, @type@ *out)
{
*out = npy_rshift@suffix@(arg1, arg2);
@@ -328,7 +328,7 @@ static NPY_INLINE int
* #oper = add, subtract, multiply, divide#
*/
-static NPY_INLINE int
+static inline int
@name@_ctype_@oper@(@type@ a, @type@ b, @type@ *out)
{
*out = a @OP@ b;
@@ -340,21 +340,21 @@ static NPY_INLINE int
#define @name@_ctype_true_divide @name@_ctype_divide
-static NPY_INLINE int
+static inline int
@name@_ctype_floor_divide(@type@ a, @type@ b, @type@ *out) {
*out = npy_floor_divide@c@(a, b);
return 0;
}
-static NPY_INLINE int
+static inline int
@name@_ctype_remainder(@type@ a, @type@ b, @type@ *out) {
*out = npy_remainder@c@(a, b);
return 0;
}
-static NPY_INLINE int
+static inline int
@name@_ctype_divmod(@type@ a, @type@ b, @type@ *out1, @type@ *out2) {
*out1 = npy_divmod@c@(a, b, out2);
return 0;
@@ -368,7 +368,7 @@ static NPY_INLINE int
* #oper = add, subtract, multiply, divide#
*/
-static NPY_INLINE int
+static inline int
half_ctype_@oper@(npy_half a, npy_half b, npy_half *out)
{
float res = npy_half_to_float(a) @OP@ npy_half_to_float(b);
@@ -380,7 +380,7 @@ half_ctype_@oper@(npy_half a, npy_half b, npy_half *out)
#define half_ctype_true_divide half_ctype_divide
-static NPY_INLINE int
+static inline int
half_ctype_floor_divide(npy_half a, npy_half b, npy_half *out)
{
npy_half mod;
@@ -396,7 +396,7 @@ half_ctype_floor_divide(npy_half a, npy_half b, npy_half *out)
}
-static NPY_INLINE int
+static inline int
half_ctype_remainder(npy_half a, npy_half b, npy_half *out)
{
npy_half_divmod(a, b, out);
@@ -404,7 +404,7 @@ half_ctype_remainder(npy_half a, npy_half b, npy_half *out)
}
-static NPY_INLINE int
+static inline int
half_ctype_divmod(npy_half a, npy_half b, npy_half *out1, npy_half *out2)
{
*out1 = npy_half_divmod(a, b, out2);
@@ -419,7 +419,7 @@ half_ctype_divmod(npy_half a, npy_half b, npy_half *out1, npy_half *out2)
* #rtype = npy_float, npy_double, npy_longdouble#
* #c = f,,l#
*/
-static NPY_INLINE int
+static inline int
@name@_ctype_add(@type@ a, @type@ b, @type@ *out)
{
out->real = a.real + b.real;
@@ -427,7 +427,7 @@ static NPY_INLINE int
return 0;
}
-static NPY_INLINE int
+static inline int
@name@_ctype_subtract(@type@ a, @type@ b, @type@ *out)
{
out->real = a.real - b.real;
@@ -440,7 +440,7 @@ static NPY_INLINE int
* TODO: Mark as to work around FPEs not being issues on clang 12.
* This should be removed when possible.
*/
-static NPY_INLINE int
+static inline int
@name@_ctype_multiply( @type@ a, @type@ b, @type@ *out)
{
out->real = a.real * b.real - a.imag * b.imag;
@@ -449,11 +449,11 @@ static NPY_INLINE int
}
/* Use the ufunc loop directly to avoid duplicating the complicated logic */
-static NPY_INLINE int
+static inline int
@name@_ctype_divide(@type@ a, @type@ b, @type@ *out)
{
char *args[3] = {(char *)&a, (char *)&b, (char *)out};
- npy_intp steps[3];
+ npy_intp steps[3] = {0, 0, 0};
npy_intp size = 1;
@TYPE@_divide(args, &size, steps, NULL);
return 0;
@@ -470,7 +470,7 @@ static NPY_INLINE int
* longlong, ulonglong#
*/
-static NPY_INLINE int
+static inline int
@name@_ctype_divmod(npy_@name@ a, npy_@name@ b, npy_@name@ *out, npy_@name@ *out2)
{
int res = @name@_ctype_floor_divide(a, b, out);
@@ -487,7 +487,7 @@ static NPY_INLINE int
* #c = f,,l#
*/
-static NPY_INLINE int
+static inline int
@name@_ctype_power(@type@ a, @type@ b, @type@ *out)
{
*out = npy_pow@c@(a, b);
@@ -495,7 +495,7 @@ static NPY_INLINE int
}
/**end repeat**/
-static NPY_INLINE int
+static inline int
half_ctype_power(npy_half a, npy_half b, npy_half *out)
{
const npy_float af = npy_half_to_float(a);
@@ -518,7 +518,7 @@ half_ctype_power(npy_half a, npy_half b, npy_half *out)
* #uns = (0,1)*5,0*3#
* #int = 1*10,0*3#
*/
-static NPY_INLINE int
+static inline int
@name@_ctype_negative(@type@ a, @type@ *out)
{
#if @uns@
@@ -541,7 +541,7 @@ static NPY_INLINE int
}
/**end repeat**/
-static NPY_INLINE int
+static inline int
half_ctype_negative(npy_half a, npy_half *out)
{
*out = a^0x8000u;
@@ -553,7 +553,7 @@ half_ctype_negative(npy_half a, npy_half *out)
* #name = cfloat, cdouble, clongdouble#
* #type = npy_cfloat, npy_cdouble, npy_clongdouble#
*/
-static NPY_INLINE int
+static inline int
@name@_ctype_negative(@type@ a, @type@ *out)
{
out->real = -a.real;
@@ -570,7 +570,7 @@ static NPY_INLINE int
* npy_long, npy_ulong, npy_longlong, npy_ulonglong,
* npy_half, npy_float, npy_double, npy_longdouble#
*/
-static NPY_INLINE int
+static inline int
@name@_ctype_positive(@type@ a, @type@ *out)
{
*out = a;
@@ -583,7 +583,7 @@ static NPY_INLINE int
* #type = npy_cfloat, npy_cdouble, npy_clongdouble#
* #c = f,,l#
*/
-static NPY_INLINE int
+static inline int
@name@_ctype_positive(@type@ a, @type@ *out)
{
out->real = a.real;
@@ -591,7 +591,7 @@ static NPY_INLINE int
return 0;
}
-static NPY_INLINE int
+static inline int
@name@_ctype_power(@type@ a, @type@ b, @type@ *out)
{
*out = npy_cpow@c@(a, b);
@@ -614,7 +614,7 @@ static NPY_INLINE int
* #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong#
* #NAME = BYTE, SHORT, INT, LONG, LONGLONG#
*/
-static NPY_INLINE int
+static inline int
@name@_ctype_absolute(@type@ a, @type@ *out)
{
if (a == NPY_MIN_@NAME@) {
@@ -631,7 +631,7 @@ static NPY_INLINE int
* #type = npy_float, npy_double, npy_longdouble#
* #c = f,,l#
*/
-static NPY_INLINE int
+static inline int
@name@_ctype_absolute(@type@ a, @type@ *out)
{
*out = npy_fabs@c@(a);
@@ -639,7 +639,7 @@ static NPY_INLINE int
}
/**end repeat**/
-static NPY_INLINE int
+static inline int
half_ctype_absolute(npy_half a, npy_half *out)
{
*out = a&0x7fffu;
@@ -652,7 +652,7 @@ half_ctype_absolute(npy_half a, npy_half *out)
* #rtype = npy_float, npy_double, npy_longdouble#
* #c = f,,l#
*/
-static NPY_INLINE int
+static inline int
@name@_ctype_absolute(@type@ a, @rtype@ *out)
{
*out = npy_cabs@c@(a);
@@ -665,7 +665,7 @@ static NPY_INLINE int
* ulong, longlong, ulonglong#
*/
-static NPY_INLINE int
+static inline int
@name@_ctype_invert(npy_@name@ a, npy_@name@ *out)
{
*out = ~a;
@@ -806,7 +806,7 @@ typedef enum {
*/
CONVERT_PYSCALAR,
/*
- * Other object is an unkown scalar or array-like, we (typically) use
+ * Other object is an unknown scalar or array-like, we (typically) use
* the generic path, which normally ends up in the ufunc machinery.
*/
OTHER_IS_UNKNOWN_OBJECT,
@@ -929,7 +929,7 @@ typedef enum {
* @result The result value indicating what we did with `value` or what type
* of object it is (see `conversion_result`).
*/
-static NPY_INLINE conversion_result
+static inline conversion_result
convert_to_@name@(PyObject *value, @type@ *result, npy_bool *may_need_deferring)
{
PyArray_Descr *descr;
@@ -1004,7 +1004,7 @@ convert_to_@name@(PyObject *value, @type@ *result, npy_bool *may_need_deferring)
if (overflow) {
/* handle as if "unsafe" */
if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
- return PROMOTION_REQUIRED;
+ return OTHER_IS_UNKNOWN_OBJECT;
}
return CONVERT_PYSCALAR;
}
@@ -1179,6 +1179,11 @@ convert_to_@name@(PyObject *value, @type@ *result, npy_bool *may_need_deferring)
* (Half, Float, Double, LongDouble,
* CFloat, CDouble, CLongDouble)*4,
* (Half, Float, Double, LongDouble)*3#
+ * #NAME = (BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ * LONG, ULONG, LONGLONG, ULONGLONG)*12,
+ * (HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE)*4,
+ * (HALF, FLOAT, DOUBLE, LONGDOUBLE)*3#
* #type = (npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
* npy_long, npy_ulong, npy_longlong, npy_ulonglong)*12,
* (npy_half, npy_float, npy_double, npy_longdouble,
@@ -1202,24 +1207,12 @@ convert_to_@name@(PyObject *value, @type@ *result, npy_bool *may_need_deferring)
* (npy_half, npy_float, npy_double, npy_longdouble,
* npy_cfloat, npy_cdouble, npy_clongdouble)*4,
* (npy_half, npy_float, npy_double, npy_longdouble)*3#
- * #oname = (byte, ubyte, short, ushort, int, uint,
- * long, ulong, longlong, ulonglong)*11,
- * double*10,
- * (half, float, double, longdouble,
- * cfloat, cdouble, clongdouble)*4,
- * (half, float, double, longdouble)*3#
* #OName = (Byte, UByte, Short, UShort, Int, UInt,
* Long, ULong, LongLong, ULongLong)*11,
* Double*10,
* (Half, Float, Double, LongDouble,
* CFloat, CDouble, CLongDouble)*4,
* (Half, Float, Double, LongDouble)*3#
- * #ONAME = (BYTE, UBYTE, SHORT, USHORT, INT, UINT,
- * LONG, ULONG, LONGLONG, ULONGLONG)*11,
- * DOUBLE*10,
- * (HALF, FLOAT, DOUBLE, LONGDOUBLE,
- * CFLOAT, CDOUBLE, CLONGDOUBLE)*4,
- * (HALF, FLOAT, DOUBLE, LONGDOUBLE)*3#
*/
#define IS_@name@
/* drop the "true_" from "true_divide" for floating point warnings: */
@@ -1234,7 +1227,7 @@ static PyObject *
@name@_@oper@(PyObject *a, PyObject *b)
{
PyObject *ret;
- @otype@ arg1, arg2, other_val;
+ @type@ arg1, arg2, other_val;
/*
* Check if this operation may be considered forward. Note `is_forward`
@@ -1263,7 +1256,7 @@ static PyObject *
PyObject *other = is_forward ? b : a;
npy_bool may_need_deferring;
- conversion_result res = convert_to_@oname@(
+ conversion_result res = convert_to_@name@(
other, &other_val, &may_need_deferring);
if (res == CONVERSION_ERROR) {
return NULL; /* an error occurred (should never happen) */
@@ -1305,7 +1298,7 @@ static PyObject *
*/
return PyGenericArrType_Type.tp_as_number->nb_@oper@(a,b);
case CONVERT_PYSCALAR:
- if (@ONAME@_setitem(other, (char *)&other_val, NULL) < 0) {
+ if (@NAME@_setitem(other, (char *)&other_val, NULL) < 0) {
return NULL;
}
break;
@@ -1345,7 +1338,7 @@ static PyObject *
#if @twoout@
int retstatus = @name@_ctype_@oper@(arg1, arg2, &out, &out2);
#else
- int retstatus = @oname@_ctype_@oper@(arg1, arg2, &out);
+ int retstatus = @name@_ctype_@oper@(arg1, arg2, &out);
#endif
#if @fperr@
@@ -1549,7 +1542,7 @@ static PyObject *
*
*/
-/*
+/*
* Complex numbers do not support remainder so we manually make sure that the
* operation is not defined. This is/was especially important for longdoubles
* due to their tendency to recurse for some operations, see gh-18548.
@@ -1711,7 +1704,7 @@ static int
emit_complexwarning(void)
{
static PyObject *cls = NULL;
- npy_cache_import("numpy.core", "ComplexWarning", &cls);
+ npy_cache_import("numpy.exceptions", "ComplexWarning", &cls);
if (cls == NULL) {
return -1;
}
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
deleted file mode 100644
index d6c9a7e65..000000000
--- a/numpy/core/src/umath/simd.inc.src
+++ /dev/null
@@ -1,1215 +0,0 @@
-
-
-/*
- * This file is for the definitions of simd vectorized operations.
- *
- * Currently contains sse2 functions that are built on amd64, x32 or
- * non-generic builds (CFLAGS=-march=...)
- * In future it may contain other instruction sets like AVX or NEON detected
- * at runtime in which case it needs to be included indirectly via a file
- * compiled with special options (or use gcc target attributes) so the binary
- * stays portable.
- */
-
-
-#ifndef __NPY_SIMD_INC
-#define __NPY_SIMD_INC
-
-#include "lowlevel_strided_loops.h"
-#include "numpy/npy_common.h"
-#include "numpy/npy_math.h"
-#include "npy_simd_data.h"
-#ifdef NPY_HAVE_SSE2_INTRINSICS
-#include <emmintrin.h>
-#if !defined(_MSC_VER) || _MSC_VER >= 1600
-#include <immintrin.h>
-#else
-#undef __AVX2__
-#undef __AVX512F__
-#endif
-#endif
-#include "loops_utils.h" // nomemoverlap
-#include <assert.h>
-#include <stdlib.h>
-#include <float.h>
-#include <string.h> /* for memcpy */
-
-#define VECTOR_SIZE_BYTES 16
-
-/*
- * Dispatcher functions
- * decide whether the operation can be vectorized and run it
- * if it was run returns true and false if nothing was done
- */
-
-/*
- *****************************************************************************
- ** CMPLX DISPATCHERS
- *****************************************************************************
- */
-
-/**begin repeat
- * #TYPE = CFLOAT, CDOUBLE#
- * #type= npy_float, npy_double#
- * #esize = 8, 16#
- */
-
-/**begin repeat1
- * #func = square, absolute, conjugate#
- * #outsize = 1, 2, 1#
- * #max_stride = 2, 8, 8#
- */
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
-static NPY_INLINE NPY_GCC_TARGET_AVX512F void
-AVX512F_@func@_@TYPE@(@type@*, @type@*, const npy_intp n, const npy_intp stride);
-#endif
-
-static NPY_INLINE int
-run_unary_avx512f_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps)
-{
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
- if ((IS_OUTPUT_BLOCKABLE_UNARY(@esize@, (npy_uint)(@esize@/@outsize@), 64)) && (labs(steps[0]) < 2*@max_stride@*@esize@)) {
- AVX512F_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0], steps[0]);
- return 1;
- }
- else
- return 0;
-#endif
- return 0;
-}
-
-/**end repeat1**/
-/**end repeat**/
-
-/*
- *****************************************************************************
- ** FLOAT DISPATCHERS
- *****************************************************************************
- */
-
-/**begin repeat
- * #type = npy_float, npy_double, npy_longdouble#
- * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
- * #EXISTS = 1, 1, 0#
- */
-
-/**begin repeat1
- * #func = isnan, isfinite, isinf, signbit#
- */
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
-static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void
-AVX512_SKX_@func@_@TYPE@(npy_bool*, @type@*, const npy_intp n, const npy_intp stride);
-#endif
-
-static NPY_INLINE int
-run_@func@_avx512_skx_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
- if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(@type@), sizeof(npy_bool), 64)) {
- AVX512_SKX_@func@_@TYPE@((npy_bool*)args[1], (@type@*)args[0], dimensions[0], steps[0]);
- return 1;
- }
- else {
- return 0;
- }
-#endif
- return 0;
-}
-
-
-/**end repeat1**/
-/**end repeat**/
-
-/**begin repeat
- * Float types
- * #type = npy_float, npy_double, npy_longdouble#
- * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
- * #vector = 1, 1, 0#
- * #VECTOR = NPY_SIMD, NPY_SIMD_F64, 0 #
- */
-
-/**begin repeat1
- * #func = negative#
- * #check = IS_BLOCKABLE_UNARY#
- * #name = unary#
- */
-
-#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
-
-/* prototypes */
-static void
-sse2_@func@_@TYPE@(@type@ *, @type@ *, const npy_intp n);
-
-#endif
-
-static NPY_INLINE int
-run_@name@_simd_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
- if (@check@(sizeof(@type@), VECTOR_SIZE_BYTES)) {
- sse2_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0]);
- return 1;
- }
-#endif
- return 0;
-}
-
-/**end repeat1**/
-
-/**begin repeat1
- * #kind = isnan, isfinite, isinf, signbit#
- */
-
-#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
-
-static void
-sse2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n);
-
-#endif
-
-static NPY_INLINE int
-run_@kind@_simd_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
- if (steps[0] == sizeof(@type@) && steps[1] == 1 &&
- npy_is_aligned(args[0], sizeof(@type@))) {
- sse2_@kind@_@TYPE@((npy_bool*)args[1], (@type@*)args[0], dimensions[0]);
- return 1;
- }
-#endif
- return 0;
-}
-
-/**end repeat1**/
-
-/**end repeat**/
-
-/*
- *****************************************************************************
- ** BOOL DISPATCHERS
- *****************************************************************************
- */
-
-/**begin repeat
- * # kind = logical_or, logical_and#
- */
-
-#if defined NPY_HAVE_SSE2_INTRINSICS
-static void
-sse2_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2,
- npy_intp n);
-
-static void
-sse2_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, npy_intp n);
-#endif
-
-static NPY_INLINE int
-run_binary_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-#if defined NPY_HAVE_SSE2_INTRINSICS
- if (sizeof(npy_bool) == 1 &&
- IS_BLOCKABLE_BINARY(sizeof(npy_bool), VECTOR_SIZE_BYTES)) {
- sse2_binary_@kind@_BOOL((npy_bool*)args[2], (npy_bool*)args[0],
- (npy_bool*)args[1], dimensions[0]);
- return 1;
- }
-#endif
- return 0;
-}
-
-
-static NPY_INLINE int
-run_reduce_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-#if defined NPY_HAVE_SSE2_INTRINSICS
- if (sizeof(npy_bool) == 1 &&
- IS_BLOCKABLE_REDUCE(sizeof(npy_bool), VECTOR_SIZE_BYTES)) {
- sse2_reduce_@kind@_BOOL((npy_bool*)args[0], (npy_bool*)args[1],
- dimensions[0]);
- return 1;
- }
-#endif
- return 0;
-}
-
-/**end repeat**/
-
-/**begin repeat
- * # kind = absolute, logical_not#
- */
-
-#if defined NPY_HAVE_SSE2_INTRINSICS
-static void
-sse2_@kind@_BOOL(npy_bool *, npy_bool *, const npy_intp n);
-#endif
-
-static NPY_INLINE int
-run_unary_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-#if defined NPY_HAVE_SSE2_INTRINSICS
- if (sizeof(npy_bool) == 1 &&
- IS_BLOCKABLE_UNARY(sizeof(npy_bool), VECTOR_SIZE_BYTES)) {
- sse2_@kind@_BOOL((npy_bool*)args[1], (npy_bool*)args[0], dimensions[0]);
- return 1;
- }
-#endif
- return 0;
-}
-
-/**end repeat**/
-
-#ifdef NPY_HAVE_SSE2_INTRINSICS
-
-/*
- * Vectorized operations
- */
-/*
- *****************************************************************************
- ** FLOAT LOOPS
- *****************************************************************************
- */
-
-/**begin repeat
-* horizontal reductions on a vector
-* # VOP = min, max#
-*/
-
-NPY_FINLINE npy_float sse2_horizontal_@VOP@___m128(__m128 v)
-{
- npy_float r;
- __m128 tmp = _mm_movehl_ps(v, v); /* c d ... */
- __m128 m = _mm_@VOP@_ps(v, tmp); /* m(ac) m(bd) ... */
- tmp = _mm_shuffle_ps(m, m, _MM_SHUFFLE(1, 1, 1, 1));/* m(bd) m(bd) ... */
- _mm_store_ss(&r, _mm_@VOP@_ps(tmp, m)); /* m(acbd) ... */
- return r;
-}
-
-NPY_FINLINE npy_double sse2_horizontal_@VOP@___m128d(__m128d v)
-{
- npy_double r;
- __m128d tmp = _mm_unpackhi_pd(v, v); /* b b */
- _mm_store_sd(&r, _mm_@VOP@_pd(tmp, v)); /* m(ab) m(bb) */
- return r;
-}
-/**end repeat**/
-
-/**begin repeat
- * #type = npy_float, npy_double#
- * #TYPE = FLOAT, DOUBLE#
- * #scalarf = npy_sqrtf, npy_sqrt#
- * #c = f, #
- * #vtype = __m128, __m128d#
- * #vtype256 = __m256, __m256d#
- * #vtype512 = __m512, __m512d#
- * #vpre = _mm, _mm#
- * #vpre256 = _mm256, _mm256#
- * #vpre512 = _mm512, _mm512#
- * #vsuf = ps, pd#
- * #vsufs = ss, sd#
- * #nan = NPY_NANF, NPY_NAN#
- * #double = 0, 1#
- * #cast = _mm_castps_si128, _mm_castpd_si128#
- */
-/*
- * compress 4 vectors to 4/8 bytes in op, filled with 0 or 1
- * the last vector is passed as a pointer as MSVC 2010 is unable to ignore the
- * calling convention leading to C2719 on 32 bit, see #4795
- */
-NPY_FINLINE void
-sse2_compress4_to_byte_@TYPE@(@vtype@ r1, @vtype@ r2, @vtype@ r3, @vtype@ * r4,
- npy_bool * op)
-{
- const __m128i mask = @vpre@_set1_epi8(0x1);
- __m128i ir1 = @vpre@_packs_epi32(@cast@(r1), @cast@(r2));
- __m128i ir2 = @vpre@_packs_epi32(@cast@(r3), @cast@(*r4));
- __m128i rr = @vpre@_packs_epi16(ir1, ir2);
-#if @double@
- rr = @vpre@_packs_epi16(rr, rr);
- rr = @vpre@_and_si128(rr, mask);
- @vpre@_storel_epi64((__m128i*)op, rr);
-#else
- rr = @vpre@_and_si128(rr, mask);
- @vpre@_storeu_si128((__m128i*)op, rr);
-#endif
-}
-
-static void
-sse2_signbit_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n)
-{
- LOOP_BLOCK_ALIGN_VAR(ip1, @type@, VECTOR_SIZE_BYTES) {
- op[i] = npy_signbit(ip1[i]) != 0;
- }
- LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
- @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
- int r = @vpre@_movemask_@vsuf@(a);
- if (sizeof(@type@) == 8) {
- op[i] = r & 1;
- op[i + 1] = (r >> 1);
- }
- else {
- op[i] = r & 1;
- op[i + 1] = (r >> 1) & 1;
- op[i + 2] = (r >> 2) & 1;
- op[i + 3] = (r >> 3);
- }
- }
- LOOP_BLOCKED_END {
- op[i] = npy_signbit(ip1[i]) != 0;
- }
-}
-
-/**begin repeat1
- * #kind = isnan, isfinite, isinf#
- * #var = 0, 1, 2#
- */
-
-static void
-sse2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n)
-{
-#if @var@ != 0 /* isinf/isfinite */
- /* signbit mask 0x7FFFFFFF after andnot */
- const @vtype@ mask = @vpre@_set1_@vsuf@(-0.@c@);
- const @vtype@ ones = @vpre@_cmpeq_@vsuf@(@vpre@_setzero_@vsuf@(),
- @vpre@_setzero_@vsuf@());
-#if @double@
- const @vtype@ fltmax = @vpre@_set1_@vsuf@(DBL_MAX);
-#else
- const @vtype@ fltmax = @vpre@_set1_@vsuf@(FLT_MAX);
-#endif
-#endif
- LOOP_BLOCK_ALIGN_VAR(ip1, @type@, VECTOR_SIZE_BYTES) {
- op[i] = npy_@kind@(ip1[i]) != 0;
- }
- LOOP_BLOCKED(@type@, 4 * VECTOR_SIZE_BYTES) {
- @vtype@ a = @vpre@_load_@vsuf@(&ip1[i + 0 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
- @vtype@ b = @vpre@_load_@vsuf@(&ip1[i + 1 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
- @vtype@ c = @vpre@_load_@vsuf@(&ip1[i + 2 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
- @vtype@ d = @vpre@_load_@vsuf@(&ip1[i + 3 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
- @vtype@ r1, r2, r3, r4;
-#if @var@ != 0 /* isinf/isfinite */
- /* fabs via masking of sign bit */
- r1 = @vpre@_andnot_@vsuf@(mask, a);
- r2 = @vpre@_andnot_@vsuf@(mask, b);
- r3 = @vpre@_andnot_@vsuf@(mask, c);
- r4 = @vpre@_andnot_@vsuf@(mask, d);
-#if @var@ == 1 /* isfinite */
- /* negative compare against max float, nan is always true */
- r1 = @vpre@_cmpnle_@vsuf@(r1, fltmax);
- r2 = @vpre@_cmpnle_@vsuf@(r2, fltmax);
- r3 = @vpre@_cmpnle_@vsuf@(r3, fltmax);
- r4 = @vpre@_cmpnle_@vsuf@(r4, fltmax);
-#else /* isinf */
- r1 = @vpre@_cmpnlt_@vsuf@(fltmax, r1);
- r2 = @vpre@_cmpnlt_@vsuf@(fltmax, r2);
- r3 = @vpre@_cmpnlt_@vsuf@(fltmax, r3);
- r4 = @vpre@_cmpnlt_@vsuf@(fltmax, r4);
-#endif
- /* flip results to what we want (andnot as there is no sse not) */
- r1 = @vpre@_andnot_@vsuf@(r1, ones);
- r2 = @vpre@_andnot_@vsuf@(r2, ones);
- r3 = @vpre@_andnot_@vsuf@(r3, ones);
- r4 = @vpre@_andnot_@vsuf@(r4, ones);
-#endif
-#if @var@ == 0 /* isnan */
- r1 = @vpre@_cmpneq_@vsuf@(a, a);
- r2 = @vpre@_cmpneq_@vsuf@(b, b);
- r3 = @vpre@_cmpneq_@vsuf@(c, c);
- r4 = @vpre@_cmpneq_@vsuf@(d, d);
-#endif
- sse2_compress4_to_byte_@TYPE@(r1, r2, r3, &r4, &op[i]);
- }
- LOOP_BLOCKED_END {
- op[i] = npy_@kind@(ip1[i]) != 0;
- }
-}
-
-/**end repeat1**/
-
-static void
-sse2_negative_@TYPE@(@type@ * op, @type@ * ip, const npy_intp n)
-{
- /*
- * get 0x7FFFFFFF mask (everything but signbit set)
- * float & ~mask will remove the sign, float ^ mask flips the sign
- * this is equivalent to how the compiler implements fabs on amd64
- */
- const @vtype@ mask = @vpre@_set1_@vsuf@(-0.@c@);
-
- /* align output to VECTOR_SIZE_BYTES bytes */
- LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES) {
- op[i] = -ip[i];
- }
- assert((npy_uintp)n < (VECTOR_SIZE_BYTES / sizeof(@type@)) ||
- npy_is_aligned(&op[i], VECTOR_SIZE_BYTES));
- if (npy_is_aligned(&ip[i], VECTOR_SIZE_BYTES)) {
- LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
- @vtype@ a = @vpre@_load_@vsuf@(&ip[i]);
- @vpre@_store_@vsuf@(&op[i], @vpre@_xor_@vsuf@(mask, a));
- }
- }
- else {
- LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
- @vtype@ a = @vpre@_loadu_@vsuf@(&ip[i]);
- @vpre@_store_@vsuf@(&op[i], @vpre@_xor_@vsuf@(mask, a));
- }
- }
- LOOP_BLOCKED_END {
- op[i] = -ip[i];
- }
-}
-/**end repeat1**/
-
-/**end repeat**/
-
-/* bunch of helper functions used in ISA_exp/log_FLOAT*/
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_get_full_load_mask_ps(void)
-{
- return _mm256_set1_ps(-1.0);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
-fma_get_full_load_mask_pd(void)
-{
- return _mm256_castpd_si256(_mm256_set1_pd(-1.0));
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_get_partial_load_mask_ps(const npy_int num_elem, const npy_int num_lanes)
-{
- float maskint[16] = {-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,
- 1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
- float* addr = maskint + num_lanes - num_elem;
- return _mm256_loadu_ps(addr);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
-fma_get_partial_load_mask_pd(const npy_int num_elem, const npy_int num_lanes)
-{
- npy_int maskint[16] = {-1,-1,-1,-1,-1,-1,-1,-1,1,1,1,1,1,1,1,1};
- npy_int* addr = maskint + 2*num_lanes - 2*num_elem;
- return _mm256_loadu_si256((__m256i*) addr);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_masked_gather_ps(__m256 src,
- npy_float* addr,
- __m256i vindex,
- __m256 mask)
-{
- return _mm256_mask_i32gather_ps(src, addr, vindex, mask, 4);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
-fma_masked_gather_pd(__m256d src,
- npy_double* addr,
- __m128i vindex,
- __m256d mask)
-{
- return _mm256_mask_i32gather_pd(src, addr, vindex, mask, 8);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_masked_load_ps(__m256 mask, npy_float* addr)
-{
- return _mm256_maskload_ps(addr, _mm256_cvtps_epi32(mask));
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
-fma_masked_load_pd(__m256i mask, npy_double* addr)
-{
- return _mm256_maskload_pd(addr, mask);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_set_masked_lanes_ps(__m256 x, __m256 val, __m256 mask)
-{
- return _mm256_blendv_ps(x, val, mask);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
-fma_set_masked_lanes_pd(__m256d x, __m256d val, __m256d mask)
-{
- return _mm256_blendv_pd(x, val, mask);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_blend(__m256 x, __m256 y, __m256 ymask)
-{
- return _mm256_blendv_ps(x, y, ymask);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_invert_mask_ps(__m256 ymask)
-{
- return _mm256_andnot_ps(ymask, _mm256_set1_ps(-1.0));
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
-fma_invert_mask_pd(__m256i ymask)
-{
- return _mm256_andnot_si256(ymask, _mm256_set1_epi32(0xFFFFFFFF));
-}
-
-/**begin repeat
- * #vsub = ps, pd#
- * #vtype = __m256, __m256d#
- */
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
-fma_abs_@vsub@(@vtype@ x)
-{
- return _mm256_andnot_@vsub@(_mm256_set1_@vsub@(-0.0), x);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
-fma_reciprocal_@vsub@(@vtype@ x)
-{
- return _mm256_div_@vsub@(_mm256_set1_@vsub@(1.0f), x);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
-fma_rint_@vsub@(@vtype@ x)
-{
- return _mm256_round_@vsub@(x, _MM_FROUND_TO_NEAREST_INT);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
-fma_floor_@vsub@(@vtype@ x)
-{
- return _mm256_round_@vsub@(x, _MM_FROUND_TO_NEG_INF);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
-fma_trunc_@vsub@(@vtype@ x)
-{
- return _mm256_round_@vsub@(x, _MM_FROUND_TO_ZERO);
-}
-/**end repeat**/
-#endif
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
-avx512_get_full_load_mask_ps(void)
-{
- return 0xFFFF;
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
-avx512_get_full_load_mask_pd(void)
-{
- return 0xFF;
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
-avx512_get_partial_load_mask_ps(const npy_int num_elem, const npy_int total_elem)
-{
- return (0x0001 << num_elem) - 0x0001;
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
-avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem)
-{
- return (0x01 << num_elem) - 0x01;
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
-avx512_masked_gather_ps(__m512 src,
- npy_float* addr,
- __m512i vindex,
- __mmask16 kmask)
-{
- return _mm512_mask_i32gather_ps(src, kmask, vindex, addr, 4);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
-avx512_masked_gather_pd(__m512d src,
- npy_double* addr,
- __m256i vindex,
- __mmask8 kmask)
-{
- return _mm512_mask_i32gather_pd(src, kmask, vindex, addr, 8);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
-avx512_masked_load_ps(__mmask16 mask, npy_float* addr)
-{
- return _mm512_maskz_loadu_ps(mask, (__m512 *)addr);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
-avx512_masked_load_pd(__mmask8 mask, npy_double* addr)
-{
- return _mm512_maskz_loadu_pd(mask, (__m512d *)addr);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
-avx512_set_masked_lanes_ps(__m512 x, __m512 val, __mmask16 mask)
-{
- return _mm512_mask_blend_ps(mask, x, val);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
-avx512_set_masked_lanes_pd(__m512d x, __m512d val, __mmask8 mask)
-{
- return _mm512_mask_blend_pd(mask, x, val);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
-avx512_blend(__m512 x, __m512 y, __mmask16 ymask)
-{
- return _mm512_mask_mov_ps(x, ymask, y);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
-avx512_invert_mask_ps(__mmask16 ymask)
-{
- return _mm512_knot(ymask);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
-avx512_invert_mask_pd(__mmask8 ymask)
-{
- return _mm512_knot(ymask);
-}
-
-/**begin repeat
- * #vsub = ps, pd#
- * #type= npy_float, npy_double#
- * #epi_vsub = epi32, epi64#
- * #vtype = __m512, __m512d#
- * #mask = __mmask16, __mmask8#
- * #and_const = 0x7fffffff, 0x7fffffffffffffffLL#
- * #neg_mask = 0x80000000, 0x8000000000000000#
- * #perm_ = 0xb1, 0x55#
- * #cmpx_img_mask = 0xAAAA, 0xAA#
- * #cmpx_re_mask = 0x5555, 0x55#
- * #INF = NPY_INFINITYF, NPY_INFINITY#
- * #NAN = NPY_NANF, NPY_NAN#
- */
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_abs_@vsub@(@vtype@ x)
-{
- return (@vtype@) _mm512_and_@epi_vsub@((__m512i) x,
- _mm512_set1_@epi_vsub@ (@and_const@));
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_reciprocal_@vsub@(@vtype@ x)
-{
- return _mm512_div_@vsub@(_mm512_set1_@vsub@(1.0f), x);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_rint_@vsub@(@vtype@ x)
-{
- return _mm512_roundscale_@vsub@(x, 0x08);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_floor_@vsub@(@vtype@ x)
-{
- return _mm512_roundscale_@vsub@(x, 0x09);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_trunc_@vsub@(@vtype@ x)
-{
- return _mm512_roundscale_@vsub@(x, 0x0B);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_hadd_@vsub@(const @vtype@ x)
-{
- return _mm512_add_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@));
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_hsub_@vsub@(const @vtype@ x)
-{
- return _mm512_sub_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@));
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_cabsolute_@vsub@(const @vtype@ x1,
- const @vtype@ x2,
- const __m512i re_indices,
- const __m512i im_indices)
-{
- @vtype@ inf = _mm512_set1_@vsub@(@INF@);
- @vtype@ nan = _mm512_set1_@vsub@(@NAN@);
- @vtype@ x1_abs = avx512_abs_@vsub@(x1);
- @vtype@ x2_abs = avx512_abs_@vsub@(x2);
- @vtype@ re = _mm512_permutex2var_@vsub@(x1_abs, re_indices, x2_abs);
- @vtype@ im = _mm512_permutex2var_@vsub@(x1_abs, im_indices , x2_abs);
- /*
- * If real or imag = INF, then convert it to inf + j*inf
- * Handles: inf + j*nan, nan + j*inf
- */
- @mask@ re_infmask = _mm512_cmp_@vsub@_mask(re, inf, _CMP_EQ_OQ);
- @mask@ im_infmask = _mm512_cmp_@vsub@_mask(im, inf, _CMP_EQ_OQ);
- im = _mm512_mask_mov_@vsub@(im, re_infmask, inf);
- re = _mm512_mask_mov_@vsub@(re, im_infmask, inf);
-
- /*
- * If real or imag = NAN, then convert it to nan + j*nan
- * Handles: x + j*nan, nan + j*x
- */
- @mask@ re_nanmask = _mm512_cmp_@vsub@_mask(re, re, _CMP_NEQ_UQ);
- @mask@ im_nanmask = _mm512_cmp_@vsub@_mask(im, im, _CMP_NEQ_UQ);
- im = _mm512_mask_mov_@vsub@(im, re_nanmask, nan);
- re = _mm512_mask_mov_@vsub@(re, im_nanmask, nan);
-
- @vtype@ larger = _mm512_max_@vsub@(re, im);
- @vtype@ smaller = _mm512_min_@vsub@(im, re);
-
- /*
- * Calculate div_mask to prevent 0./0. and inf/inf operations in div
- */
- @mask@ zeromask = _mm512_cmp_@vsub@_mask(larger, _mm512_setzero_@vsub@(), _CMP_EQ_OQ);
- @mask@ infmask = _mm512_cmp_@vsub@_mask(smaller, inf, _CMP_EQ_OQ);
- @mask@ div_mask = _mm512_knot(_mm512_kor(zeromask, infmask));
- @vtype@ ratio = _mm512_maskz_div_@vsub@(div_mask, smaller, larger);
- @vtype@ hypot = _mm512_sqrt_@vsub@(_mm512_fmadd_@vsub@(
- ratio, ratio, _mm512_set1_@vsub@(1.0f)));
- return _mm512_mul_@vsub@(hypot, larger);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_conjugate_@vsub@(const @vtype@ x)
-{
- /*
- * __mm512_mask_xor_ps/pd requires AVX512DQ. We cast it to __m512i and
-     * use the xor_epi32/64 instruction instead. Cast is a zero latency instruction
- */
- __m512i cast_x = _mm512_cast@vsub@_si512(x);
- __m512i res = _mm512_mask_xor_@epi_vsub@(cast_x, @cmpx_img_mask@,
- cast_x, _mm512_set1_@epi_vsub@(@neg_mask@));
- return _mm512_castsi512_@vsub@(res);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_cmul_@vsub@(@vtype@ x1, @vtype@ x2)
-{
- // x1 = r1, i1
- // x2 = r2, i2
- @vtype@ x3 = _mm512_permute_@vsub@(x2, @perm_@); // i2, r2
- @vtype@ x12 = _mm512_mul_@vsub@(x1, x2); // r1*r2, i1*i2
- @vtype@ x13 = _mm512_mul_@vsub@(x1, x3); // r1*i2, r2*i1
- @vtype@ outreal = avx512_hsub_@vsub@(x12); // r1*r2 - i1*i2, r1*r2 - i1*i2
- @vtype@ outimg = avx512_hadd_@vsub@(x13); // r1*i2 + i1*r2, r1*i2 + i1*r2
- return _mm512_mask_blend_@vsub@(@cmpx_img_mask@, outreal, outimg);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_csquare_@vsub@(@vtype@ x)
-{
- return avx512_cmul_@vsub@(x, x);
-}
-
-/**end repeat**/
-#endif
-
-/**begin repeat
- * #ISA = FMA, AVX512F#
- * #isa = fma, avx512#
- * #vtype = __m256, __m512#
- * #vsize = 256, 512#
- * #or = or_ps, kor#
- * #vsub = , _mask#
- * #mask = __m256, __mmask16#
- * #fmadd = _mm256_fmadd_ps, _mm512_fmadd_ps#
- * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
- **/
-
-#if defined @CHK@
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
-@isa@_sqrt_ps(@vtype@ x)
-{
- return _mm@vsize@_sqrt_ps(x);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
-@isa@_sqrt_pd(@vtype@d x)
-{
- return _mm@vsize@_sqrt_pd(x);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
-@isa@_square_ps(@vtype@ x)
-{
- return _mm@vsize@_mul_ps(x,x);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
-@isa@_square_pd(@vtype@d x)
-{
- return _mm@vsize@_mul_pd(x,x);
-}
-
-#endif
-/**end repeat**/
-
-/**begin repeat
- * #type = npy_float, npy_double#
- * #TYPE = FLOAT, DOUBLE#
- * #num_lanes = 16, 8#
- * #vsuffix = ps, pd#
- * #mask = __mmask16, __mmask8#
- * #vtype = __m512, __m512d#
- * #scale = 4, 8#
- * #vindextype = __m512i, __m256i#
- * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256#
- * #episize = epi32, epi64#
- */
-
-/**begin repeat1
- * #func = isnan, isfinite, isinf, signbit#
- * #IMM8 = 0x81, 0x99, 0x18, 0x04#
- * #is_finite = 0, 1, 0, 0#
- * #is_signbit = 0, 0, 0, 1#
- */
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
-static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void
-AVX512_SKX_@func@_@TYPE@(npy_bool* op, @type@* ip, const npy_intp array_size, const npy_intp steps)
-{
- const npy_intp stride_ip = steps/(npy_intp)sizeof(@type@);
- npy_intp num_remaining_elements = array_size;
-
- @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
-#if @is_signbit@
- @vtype@ signbit = _mm512_set1_@vsuffix@(-0.0);
-#endif
-
- /*
- * Note: while generally indices are npy_intp, we ensure that our maximum
- * index will fit in an int32 as a precondition for this function via
- * IS_OUTPUT_BLOCKABLE_UNARY
- */
-
- npy_int32 index_ip[@num_lanes@];
- for (npy_int32 ii = 0; ii < @num_lanes@; ii++) {
- index_ip[ii] = ii*stride_ip;
- }
- @vindextype@ vindex_ip = @vindexload@((@vindextype@*)&index_ip[0]);
- @vtype@ zeros_f = _mm512_setzero_@vsuffix@();
- __m512i ones = _mm512_set1_@episize@(1);
-
- while (num_remaining_elements > 0) {
- if (num_remaining_elements < @num_lanes@) {
- load_mask = avx512_get_partial_load_mask_@vsuffix@(
- num_remaining_elements, @num_lanes@);
- }
- @vtype@ x1;
- if (stride_ip == 1) {
- x1 = avx512_masked_load_@vsuffix@(load_mask, ip);
- }
- else {
- x1 = avx512_masked_gather_@vsuffix@(zeros_f, ip, vindex_ip, load_mask);
- }
-#if @is_signbit@
- x1 = _mm512_and_@vsuffix@(x1,signbit);
-#endif
-
- @mask@ fpclassmask = _mm512_fpclass_@vsuffix@_mask(x1, @IMM8@);
-#if @is_finite@
- fpclassmask = _mm512_knot(fpclassmask);
-#endif
-
- __m128i out =_mm512_maskz_cvts@episize@_epi8(fpclassmask, ones);
- _mm_mask_storeu_epi8(op, load_mask, out);
-
- ip += @num_lanes@*stride_ip;
- op += @num_lanes@;
- num_remaining_elements -= @num_lanes@;
- }
-}
-#endif
-/**end repeat1**/
-/**end repeat**/
-
-/**begin repeat
- * #TYPE = CFLOAT, CDOUBLE#
- * #type = npy_float, npy_double#
- * #num_lanes = 16, 8#
- * #vsuffix = ps, pd#
- * #epi_vsub = epi32, epi64#
- * #mask = __mmask16, __mmask8#
- * #vtype = __m512, __m512d#
- * #scale = 4, 8#
- * #vindextype = __m512i, __m256i#
- * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256#
- * #storemask = 0xFF, 0xF#
- * #IS_FLOAT = 1, 0#
- */
-
-/**begin repeat1
- * #func = square, conjugate#
- * #vectorf = avx512_csquare, avx512_conjugate#
- */
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
-static NPY_GCC_OPT_3 NPY_INLINE NPY_GCC_TARGET_AVX512F void
-AVX512F_@func@_@TYPE@(@type@ * op,
- @type@ * ip,
- const npy_intp array_size,
- const npy_intp steps)
-{
- npy_intp num_remaining_elements = 2*array_size;
- const npy_intp stride_ip1 = steps/(npy_intp)sizeof(@type@)/2;
-
- /*
- * Note: while generally indices are npy_intp, we ensure that our maximum index
- * will fit in an int32 as a precondition for this function via max_stride
- */
- npy_int32 index_ip1[16];
- for (npy_int32 ii = 0; ii < @num_lanes@; ii=ii+2) {
- index_ip1[ii] = ii*stride_ip1;
- index_ip1[ii+1] = ii*stride_ip1 + 1;
- }
- @vindextype@ vindex = @vindexload@((@vindextype@*)index_ip1);
- @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
- @vtype@ zeros = _mm512_setzero_@vsuffix@();
-
- while (num_remaining_elements > 0) {
- if (num_remaining_elements < @num_lanes@) {
- load_mask = avx512_get_partial_load_mask_@vsuffix@(
- num_remaining_elements, @num_lanes@);
- }
- @vtype@ x1;
- if (stride_ip1 == 1) {
- x1 = avx512_masked_load_@vsuffix@(load_mask, ip);
- }
- else {
- x1 = avx512_masked_gather_@vsuffix@(zeros, ip, vindex, load_mask);
- }
-
- @vtype@ out = @vectorf@_@vsuffix@(x1);
-
- _mm512_mask_storeu_@vsuffix@(op, load_mask, out);
- op += @num_lanes@;
- ip += @num_lanes@*stride_ip1;
- num_remaining_elements -= @num_lanes@;
- }
-}
-#endif
-/**end repeat1**/
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
-static NPY_GCC_OPT_3 NPY_INLINE NPY_GCC_TARGET_AVX512F void
-AVX512F_absolute_@TYPE@(@type@ * op,
- @type@ * ip,
- const npy_intp array_size,
- const npy_intp steps)
-{
- npy_intp num_remaining_elements = 2*array_size;
- const npy_intp stride_ip1 = steps/(npy_intp)sizeof(@type@)/2;
-
- /*
- * Note: while generally indices are npy_intp, we ensure that our maximum index
- * will fit in an int32 as a precondition for this function via max_stride
- */
- npy_int32 index_ip[32];
- for (npy_int32 ii = 0; ii < 2*@num_lanes@; ii=ii+2) {
- index_ip[ii] = ii*stride_ip1;
- index_ip[ii+1] = ii*stride_ip1 + 1;
- }
- @vindextype@ vindex1 = @vindexload@((@vindextype@*)index_ip);
- @vindextype@ vindex2 = @vindexload@((@vindextype@*)(index_ip+@num_lanes@));
-
- @mask@ load_mask1 = avx512_get_full_load_mask_@vsuffix@();
- @mask@ load_mask2 = avx512_get_full_load_mask_@vsuffix@();
- @mask@ store_mask = avx512_get_full_load_mask_@vsuffix@();
- @vtype@ zeros = _mm512_setzero_@vsuffix@();
-
-#if @IS_FLOAT@
- __m512i re_index = _mm512_set_epi32(30,28,26,24,22,20,18,16,14,12,10,8,6,4,2,0);
- __m512i im_index = _mm512_set_epi32(31,29,27,25,23,21,19,17,15,13,11,9,7,5,3,1);
-#else
- __m512i re_index = _mm512_set_epi64(14,12,10,8,6,4,2,0);
- __m512i im_index = _mm512_set_epi64(15,13,11,9,7,5,3,1);
-#endif
-
- while (num_remaining_elements > 0) {
- if (num_remaining_elements < @num_lanes@) {
- load_mask1 = avx512_get_partial_load_mask_@vsuffix@(
- num_remaining_elements, @num_lanes@);
- load_mask2 = 0x0000;
- store_mask = avx512_get_partial_load_mask_@vsuffix@(
- num_remaining_elements/2, @num_lanes@);
- } else if (num_remaining_elements < 2*@num_lanes@) {
- load_mask1 = avx512_get_full_load_mask_@vsuffix@();
- load_mask2 = avx512_get_partial_load_mask_@vsuffix@(
- num_remaining_elements - @num_lanes@, @num_lanes@);
- store_mask = avx512_get_partial_load_mask_@vsuffix@(
- num_remaining_elements/2, @num_lanes@);
- }
- @vtype@ x1, x2;
- if (stride_ip1 == 1) {
- x1 = avx512_masked_load_@vsuffix@(load_mask1, ip);
- x2 = avx512_masked_load_@vsuffix@(load_mask2, ip+@num_lanes@);
- }
- else {
- x1 = avx512_masked_gather_@vsuffix@(zeros, ip, vindex1, load_mask1);
- x2 = avx512_masked_gather_@vsuffix@(zeros, ip, vindex2, load_mask2);
- }
-
- @vtype@ out = avx512_cabsolute_@vsuffix@(x1, x2, re_index, im_index);
-
- _mm512_mask_storeu_@vsuffix@(op, store_mask, out);
- op += @num_lanes@;
- ip += 2*@num_lanes@*stride_ip1;
- num_remaining_elements -= 2*@num_lanes@;
- }
- npy_clear_floatstatus_barrier((char*)&num_remaining_elements);
-}
-
-#endif
-/**end repeat**/
-
-/*
- *****************************************************************************
- ** BOOL LOOPS
- *****************************************************************************
- */
-
-/**begin repeat
- * # kind = logical_or, logical_and#
- * # and = 0, 1#
- * # op = ||, &&#
- * # sc = !=, ==#
- * # vpre = _mm*2#
- * # vsuf = si128*2#
- * # vtype = __m128i*2#
- * # type = npy_bool*2#
- * # vload = _mm_load_si128*2#
- * # vloadu = _mm_loadu_si128*2#
- * # vstore = _mm_store_si128*2#
- */
-
-/*
- * convert any bit set to boolean true so vectorized and normal operations are
- * consistent, should not be required if bool is used correctly everywhere but
- * you never know
- */
-#if !@and@
-NPY_FINLINE @vtype@ byte_to_true(@vtype@ v)
-{
- const @vtype@ zero = @vpre@_setzero_@vsuf@();
- const @vtype@ truemask = @vpre@_set1_epi8(1 == 1);
- /* get 0xFF for zeros */
- @vtype@ tmp = @vpre@_cmpeq_epi8(v, zero);
- /* filled with 0xFF/0x00, negate and mask to boolean true */
- return @vpre@_andnot_@vsuf@(tmp, truemask);
-}
-#endif
-
-static void
-sse2_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2, npy_intp n)
-{
- LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
- op[i] = ip1[i] @op@ ip2[i];
- LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
- @vtype@ a = @vloadu@((@vtype@*)&ip1[i]);
- @vtype@ b = @vloadu@((@vtype@*)&ip2[i]);
-#if @and@
- const @vtype@ zero = @vpre@_setzero_@vsuf@();
- /* get 0xFF for non zeros*/
- @vtype@ tmp = @vpre@_cmpeq_epi8(a, zero);
- /* andnot -> 0x00 for zeros xFF for non zeros, & with ip2 */
- tmp = @vpre@_andnot_@vsuf@(tmp, b);
-#else
- @vtype@ tmp = @vpre@_or_@vsuf@(a, b);
-#endif
-
- @vstore@((@vtype@*)&op[i], byte_to_true(tmp));
- }
- LOOP_BLOCKED_END {
- op[i] = (ip1[i] @op@ ip2[i]);
- }
-}
-
-
-static void
-sse2_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, const npy_intp n)
-{
- const @vtype@ zero = @vpre@_setzero_@vsuf@();
- LOOP_BLOCK_ALIGN_VAR(ip, npy_bool, VECTOR_SIZE_BYTES) {
- *op = *op @op@ ip[i];
- if (*op @sc@ 0) {
- return;
- }
- }
- /* unrolled once to replace a slow movmsk with a fast pmaxb */
- LOOP_BLOCKED(npy_bool, 2 * VECTOR_SIZE_BYTES) {
- @vtype@ v = @vload@((@vtype@*)&ip[i]);
- @vtype@ v2 = @vload@((@vtype@*)&ip[i + VECTOR_SIZE_BYTES]);
- v = @vpre@_cmpeq_epi8(v, zero);
- v2 = @vpre@_cmpeq_epi8(v2, zero);
-#if @and@
- if ((@vpre@_movemask_epi8(@vpre@_max_epu8(v, v2)) != 0)) {
- *op = 0;
-#else
- if ((@vpre@_movemask_epi8(@vpre@_min_epu8(v, v2)) != 0xFFFF)) {
- *op = 1;
-#endif
- return;
- }
- }
- LOOP_BLOCKED_END {
- *op = *op @op@ ip[i];
- if (*op @sc@ 0) {
- return;
- }
- }
-}
-
-/**end repeat**/
-
-/**begin repeat
- * # kind = absolute, logical_not#
- * # op = !=, ==#
- * # not = 0, 1#
- * # vpre = _mm*2#
- * # vsuf = si128*2#
- * # vtype = __m128i*2#
- * # type = npy_bool*2#
- * # vloadu = _mm_loadu_si128*2#
- * # vstore = _mm_store_si128*2#
- */
-
-static void
-sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n)
-{
- LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
- op[i] = (ip[i] @op@ 0);
- LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
- @vtype@ a = @vloadu@((@vtype@*)&ip[i]);
-#if @not@
- const @vtype@ zero = @vpre@_setzero_@vsuf@();
- const @vtype@ truemask = @vpre@_set1_epi8(1 == 1);
- /* equivalent to byte_to_true but can skip the negation */
- a = @vpre@_cmpeq_epi8(a, zero);
- a = @vpre@_and_@vsuf@(a, truemask);
-#else
-        /* abs is kind of pointless but maybe it's used for byte_to_true */
- a = byte_to_true(a);
-#endif
- @vstore@((@vtype@*)&op[i], a);
- }
- LOOP_BLOCKED_END {
- op[i] = (ip[i] @op@ 0);
- }
-}
-
-/**end repeat**/
-
-#undef VECTOR_SIZE_BYTES
-#endif /* NPY_HAVE_SSE2_INTRINSICS */
-#endif
-
diff --git a/numpy/core/src/umath/string_ufuncs.cpp b/numpy/core/src/umath/string_ufuncs.cpp
index 5a35c318b..5d82be6db 100644
--- a/numpy/core/src/umath/string_ufuncs.cpp
+++ b/numpy/core/src/umath/string_ufuncs.cpp
@@ -16,7 +16,7 @@
template <typename character>
-static NPY_INLINE int
+static inline int
character_cmp(character a, character b)
{
if (a == b) {
@@ -37,7 +37,7 @@ character_cmp(character a, character b)
* is always padded with zeros).
*/
template <bool rstrip, typename character>
-static NPY_INLINE int
+static inline int
string_cmp(int len1, const character *str1, int len2, const character *str2)
{
if (rstrip) {
diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c
index 693a6d6c9..39e64decb 100644
--- a/numpy/core/src/umath/ufunc_object.c
+++ b/numpy/core/src/umath/ufunc_object.c
@@ -52,6 +52,7 @@
#include "arrayobject.h"
#include "common.h"
+#include "ctors.h"
#include "dtypemeta.h"
#include "numpyos.h"
#include "dispatching.h"
@@ -1391,7 +1392,7 @@ try_trivial_single_output_loop(PyArrayMethod_Context *context,
* or pass in the full cast information. But this can special case
* the logical functions and prints a better error message.
*/
-static NPY_INLINE int
+static inline int
validate_casting(PyArrayMethodObject *method, PyUFuncObject *ufunc,
PyArrayObject *ops[], PyArray_Descr *descriptors[],
NPY_CASTING casting)
@@ -1781,6 +1782,8 @@ _check_keepdims_support(PyUFuncObject *ufunc) {
static int
_parse_axes_arg(PyUFuncObject *ufunc, int op_core_num_dims[], PyObject *axes,
PyArrayObject **op, int broadcast_ndim, int **remap_axis) {
+ static PyObject *AxisError_cls = NULL;
+
int nin = ufunc->nin;
int nop = ufunc->nargs;
int iop, list_size;
@@ -1825,16 +1828,17 @@ _parse_axes_arg(PyUFuncObject *ufunc, int op_core_num_dims[], PyObject *axes,
op_axes_tuple = PyList_GET_ITEM(axes, iop);
if (PyTuple_Check(op_axes_tuple)) {
if (PyTuple_Size(op_axes_tuple) != op_ncore) {
- if (op_ncore == 1) {
- PyErr_Format(PyExc_ValueError,
- "axes item %d should be a tuple with a "
- "single element, or an integer", iop);
- }
- else {
- PyErr_Format(PyExc_ValueError,
- "axes item %d should be a tuple with %d "
- "elements", iop, op_ncore);
+ /* must have been a tuple with too many entries. */
+ npy_cache_import(
+ "numpy.exceptions", "AxisError", &AxisError_cls);
+ if (AxisError_cls == NULL) {
+ return -1;
}
+ PyErr_Format(AxisError_cls,
+ "%s: operand %d has %d core dimensions, "
+ "but %zd dimensions are specified by axes tuple.",
+ ufunc_get_name_cstr(ufunc), iop, op_ncore,
+ PyTuple_Size(op_axes_tuple));
return -1;
}
Py_INCREF(op_axes_tuple);
@@ -1846,8 +1850,22 @@ _parse_axes_arg(PyUFuncObject *ufunc, int op_core_num_dims[], PyObject *axes,
}
}
else {
- PyErr_Format(PyExc_TypeError, "axes item %d should be a tuple",
- iop);
+ /* If input is not an integer tell user that a tuple is needed */
+ if (error_converting(PyArray_PyIntAsInt(op_axes_tuple))) {
+ PyErr_Format(PyExc_TypeError,
+ "%s: axes item %d should be a tuple.",
+ ufunc_get_name_cstr(ufunc), iop);
+ return -1;
+ }
+ /* If it is a single integer, inform user that more are needed */
+ npy_cache_import("numpy.exceptions", "AxisError", &AxisError_cls);
+ if (AxisError_cls == NULL) {
+ return -1;
+ }
+ PyErr_Format(AxisError_cls,
+ "%s: operand %d has %d core dimensions, "
+ "but the axes item is a single integer.",
+ ufunc_get_name_cstr(ufunc), iop, op_ncore);
return -1;
}
/*
@@ -2067,12 +2085,16 @@ _get_coredim_sizes(PyUFuncObject *ufunc, PyArrayObject **op,
}
/*
- * Returns a new reference
+ * Returns a new reference to the ufunc identity. Note that this identity
+ * is only a default identity value stored on the ufunc, since the individual
+ * ufunc loop (ArrayMethod) is queried for the actual identity.
+ *
* TODO: store a reference in the ufunc object itself, rather than
* constructing one each time
*/
-static PyObject *
-_get_identity(PyUFuncObject *ufunc, npy_bool *reorderable) {
+NPY_NO_EXPORT PyObject *
+PyUFunc_GetDefaultIdentity(PyUFuncObject *ufunc, npy_bool *reorderable)
+{
switch(ufunc->identity) {
case PyUFunc_One:
*reorderable = 1;
@@ -2739,8 +2761,41 @@ reducelike_promote_and_resolve(PyUFuncObject *ufunc,
PyArrayObject *arr, PyArrayObject *out,
PyArray_DTypeMeta *signature[3],
npy_bool enforce_uniform_args, PyArray_Descr *out_descrs[3],
- char *method)
+ NPY_CASTING casting, char *method)
{
+ /*
+ * If no dtype is specified and out is not specified, we override the
+ * integer and bool dtype used for add and multiply.
+ *
+ * TODO: The following should be handled by a promoter!
+ */
+ if (signature[0] == NULL && out == NULL) {
+ /*
+ * For integer types --- make sure at least a long
+ * is used for add and multiply reduction to avoid overflow
+ */
+ int typenum = PyArray_TYPE(arr);
+ if ((PyTypeNum_ISBOOL(typenum) || PyTypeNum_ISINTEGER(typenum))
+ && ((strcmp(ufunc->name, "add") == 0)
+ || (strcmp(ufunc->name, "multiply") == 0))) {
+ if (PyTypeNum_ISBOOL(typenum)) {
+ typenum = NPY_LONG;
+ }
+ else if ((size_t)PyArray_DESCR(arr)->elsize < sizeof(long)) {
+ if (PyTypeNum_ISUNSIGNED(typenum)) {
+ typenum = NPY_ULONG;
+ }
+ else {
+ typenum = NPY_LONG;
+ }
+ }
+ signature[0] = PyArray_DTypeFromTypeNum(typenum);
+ }
+ }
+ assert(signature[2] == NULL); /* we always fill it here */
+ Py_XINCREF(signature[0]);
+ signature[2] = signature[0];
+
/*
* Note that the `ops` is not really correct. But legacy resolution
* cannot quite handle the correct ops (e.g. a NULL first item if `out`
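
The widening logic moved into reducelike_promote_and_resolve above exists because summing or multiplying many small integers in their own width overflows quickly; promoting the reduction dtype to at least long sidesteps that. A tiny self-contained illustration of the failure mode (plain C, unrelated to NumPy's code paths):

/* Illustrative only: why add/multiply reductions widen small integer types. */
#include <stdio.h>

int main(void)
{
    signed char data[200];
    for (int i = 0; i < 200; i++) {
        data[i] = 100;
    }
    signed char narrow = 0;   /* int8-sized accumulator */
    long wide = 0;            /* at-least-long accumulator, as above */
    for (int i = 0; i < 200; i++) {
        narrow = (signed char)(narrow + data[i]);  /* wraps on typical targets */
        wide += data[i];                           /* 20000, as expected */
    }
    printf("int8 accumulator: %d, long accumulator: %ld\n", narrow, wide);
    return 0;
}
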
@@ -2802,7 +2857,7 @@ reducelike_promote_and_resolve(PyUFuncObject *ufunc,
* (although this should possibly happen through a deprecation)
*/
if (resolve_descriptors(3, ufunc, ufuncimpl,
- ops, out_descrs, signature, NPY_UNSAFE_CASTING) < 0) {
+ ops, out_descrs, signature, casting) < 0) {
return NULL;
}
@@ -2825,8 +2880,7 @@ reducelike_promote_and_resolve(PyUFuncObject *ufunc,
goto fail;
}
/* TODO: This really should _not_ be unsafe casting (same above)! */
- if (validate_casting(ufuncimpl,
- ufunc, ops, out_descrs, NPY_UNSAFE_CASTING) < 0) {
+ if (validate_casting(ufuncimpl, ufunc, ops, out_descrs, casting) < 0) {
goto fail;
}
@@ -2834,7 +2888,7 @@ reducelike_promote_and_resolve(PyUFuncObject *ufunc,
fail:
for (int i = 0; i < 3; ++i) {
- Py_DECREF(out_descrs[i]);
+ Py_CLEAR(out_descrs[i]);
}
return NULL;
}
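
The fail path now uses Py_CLEAR so each out_descrs slot is both released and reset to NULL, which keeps partially filled arrays safe to clean up again and leaves no dangling pointers behind. The general idiom, sketched outside of any NumPy specifics:

/* Illustrative only: Py_CLEAR nulls the slot and then releases the reference,
 * so cleanup code can run over the array more than once without touching a
 * freed object. It is a no-op on slots that are already NULL. */
static void
clear_descriptors(PyArray_Descr *descrs[3])
{
    for (int i = 0; i < 3; i++) {
        Py_CLEAR(descrs[i]);
    }
}
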
@@ -2956,10 +3010,8 @@ PyUFunc_Reduce(PyUFuncObject *ufunc,
PyObject *initial, PyArrayObject *wheremask)
{
int iaxes, ndim;
- npy_bool reorderable;
npy_bool axis_flags[NPY_MAXDIMS];
- PyArrayObject *result = NULL;
- PyObject *identity;
+
const char *ufunc_name = ufunc_get_name_cstr(ufunc);
/* These parameters come from a TLS global */
int buffersize = 0, errormask = 0;
@@ -2984,72 +3036,26 @@ PyUFunc_Reduce(PyUFuncObject *ufunc,
return NULL;
}
- /*
- * Promote and fetch ufuncimpl (currently needed to fix up the identity).
- */
PyArray_Descr *descrs[3];
PyArrayMethodObject *ufuncimpl = reducelike_promote_and_resolve(ufunc,
- arr, out, signature, NPY_FALSE, descrs, "reduce");
+ arr, out, signature, NPY_FALSE, descrs, NPY_UNSAFE_CASTING, "reduce");
if (ufuncimpl == NULL) {
return NULL;
}
- /* Get the identity */
- /* TODO: Both of these should be provided by the ArrayMethod! */
- identity = _get_identity(ufunc, &reorderable);
- if (identity == NULL) {
- goto finish;
- }
-
- /* Get the initial value */
- if (initial == NULL) {
- initial = identity;
-
- /*
- * The identity for a dynamic dtype like
- * object arrays can't be used in general
- */
- if (initial != Py_None && PyArray_ISOBJECT(arr) && PyArray_SIZE(arr) != 0) {
- Py_DECREF(initial);
- initial = Py_None;
- Py_INCREF(initial);
- }
- else if (PyTypeNum_ISUNSIGNED(descrs[2]->type_num)
- && PyLong_CheckExact(initial)) {
- /*
- * This is a bit of a hack until we have truly loop specific
- * identities. Python -1 cannot be cast to unsigned so convert
- * it to a NumPy scalar, but we use -1 for bitwise functions to
- * signal all 1s.
- * (A builtin identity would not overflow here, although we may
- * unnecessary convert 0 and 1.)
- */
- Py_SETREF(initial, PyObject_CallFunctionObjArgs(
- (PyObject *)&PyLongArrType_Type, initial, NULL));
- if (initial == NULL) {
- goto finish;
- }
- }
- } else {
- Py_DECREF(identity);
- Py_INCREF(initial); /* match the reference count in the if above */
- }
-
PyArrayMethod_Context context = {
.caller = (PyObject *)ufunc,
.method = ufuncimpl,
.descriptors = descrs,
};
- result = PyUFunc_ReduceWrapper(&context,
- arr, out, wheremask, axis_flags, reorderable, keepdims,
- initial, reduce_loop, ufunc, buffersize, ufunc_name, errormask);
+ PyArrayObject *result = PyUFunc_ReduceWrapper(&context,
+ arr, out, wheremask, axis_flags, keepdims,
+ initial, reduce_loop, buffersize, ufunc_name, errormask);
- finish:
for (int i = 0; i < 3; i++) {
Py_DECREF(descrs[i]);
}
- Py_XDECREF(initial);
return result;
}
@@ -3094,7 +3100,8 @@ PyUFunc_Accumulate(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out,
PyArray_Descr *descrs[3];
PyArrayMethodObject *ufuncimpl = reducelike_promote_and_resolve(ufunc,
- arr, out, signature, NPY_TRUE, descrs, "accumulate");
+ arr, out, signature, NPY_TRUE, descrs, NPY_UNSAFE_CASTING,
+ "accumulate");
if (ufuncimpl == NULL) {
return NULL;
}
@@ -3511,7 +3518,8 @@ PyUFunc_Reduceat(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *ind,
PyArray_Descr *descrs[3];
PyArrayMethodObject *ufuncimpl = reducelike_promote_and_resolve(ufunc,
- arr, out, signature, NPY_TRUE, descrs, "reduceat");
+ arr, out, signature, NPY_TRUE, descrs, NPY_UNSAFE_CASTING,
+ "reduceat");
if (ufuncimpl == NULL) {
return NULL;
}
@@ -4063,7 +4071,7 @@ PyUFunc_GenericReduction(PyUFuncObject *ufunc,
/* We now have all the information required to check for Overrides */
PyObject *override = NULL;
int errval = PyUFunc_CheckOverride(ufunc, _reduce_type[operation],
- full_args.in, full_args.out, args, len_args, kwnames, &override);
+ full_args.in, full_args.out, wheremask_obj, args, len_args, kwnames, &override);
if (errval) {
return NULL;
}
@@ -4169,38 +4177,6 @@ PyUFunc_GenericReduction(PyUFuncObject *ufunc,
}
}
- /*
- * If no dtype is specified and out is not specified, we override the
- * integer and bool dtype used for add and multiply.
- *
- * TODO: The following should be handled by a promoter!
- */
- if (signature[0] == NULL && out == NULL) {
- /*
- * For integer types --- make sure at least a long
- * is used for add and multiply reduction to avoid overflow
- */
- int typenum = PyArray_TYPE(mp);
- if ((PyTypeNum_ISBOOL(typenum) || PyTypeNum_ISINTEGER(typenum))
- && ((strcmp(ufunc->name, "add") == 0)
- || (strcmp(ufunc->name, "multiply") == 0))) {
- if (PyTypeNum_ISBOOL(typenum)) {
- typenum = NPY_LONG;
- }
- else if ((size_t)PyArray_DESCR(mp)->elsize < sizeof(long)) {
- if (PyTypeNum_ISUNSIGNED(typenum)) {
- typenum = NPY_ULONG;
- }
- else {
- typenum = NPY_LONG;
- }
- }
- signature[0] = PyArray_DTypeFromTypeNum(typenum);
- }
- }
- Py_XINCREF(signature[0]);
- signature[2] = signature[0];
-
switch(operation) {
case UFUNC_REDUCE:
ret = PyUFunc_Reduce(ufunc,
@@ -4361,23 +4337,16 @@ _get_dtype(PyObject *dtype_obj) {
}
else if (NPY_UNLIKELY(out->singleton != descr)) {
/* This does not warn about `metadata`, but units is important. */
- if (!PyArray_EquivTypes(out->singleton, descr)) {
- /* Deprecated NumPy 1.21.2 (was an accidental error in 1.21) */
- if (DEPRECATE(
+ if (out->singleton == NULL
+ || !PyArray_EquivTypes(out->singleton, descr)) {
+ PyErr_SetString(PyExc_TypeError,
"The `dtype` and `signature` arguments to "
"ufuncs only select the general DType and not details "
- "such as the byte order or time unit (with rare "
- "exceptions see release notes). To avoid this warning "
- "please use the scalar types `np.float64`, or string "
- "notation.\n"
- "In rare cases where the time unit was preserved, "
- "either cast the inputs or provide an output array. "
- "In the future NumPy may transition to allow providing "
- "`dtype=` to denote the outputs `dtype` as well. "
- "(Deprecated NumPy 1.21)") < 0) {
- Py_DECREF(descr);
- return NULL;
- }
+ "such as the byte order or time unit. "
+ "You can avoid this error by using the scalar types "
+ "`np.float64` or the dtype string notation.");
+ Py_DECREF(descr);
+ return NULL;
}
}
Py_INCREF(out);
@@ -4874,7 +4843,7 @@ ufunc_generic_fastcall(PyUFuncObject *ufunc,
/* We now have all the information required to check for Overrides */
PyObject *override = NULL;
errval = PyUFunc_CheckOverride(ufunc, method,
- full_args.in, full_args.out,
+ full_args.in, full_args.out, where_obj,
args, len_args, kwnames, &override);
if (errval) {
goto fail;
@@ -5066,14 +5035,11 @@ ufunc_generic_vectorcall(PyObject *ufunc,
NPY_NO_EXPORT PyObject *
-ufunc_geterr(PyObject *NPY_UNUSED(dummy), PyObject *args)
+ufunc_geterr(PyObject *NPY_UNUSED(dummy), PyObject *NPY_UNUSED(arg))
{
PyObject *thedict;
PyObject *res;
- if (!PyArg_ParseTuple(args, "")) {
- return NULL;
- }
thedict = PyThreadState_GetDict();
if (thedict == NULL) {
thedict = PyEval_GetBuiltins();
@@ -5099,16 +5065,13 @@ ufunc_geterr(PyObject *NPY_UNUSED(dummy), PyObject *args)
NPY_NO_EXPORT PyObject *
-ufunc_seterr(PyObject *NPY_UNUSED(dummy), PyObject *args)
+ufunc_seterr(PyObject *NPY_UNUSED(dummy), PyObject *arg)
{
PyObject *thedict;
int res;
- PyObject *val;
+ PyObject *val = arg;
static char *msg = "Error object must be a list of length 3";
- if (!PyArg_ParseTuple(args, "O:seterrobj", &val)) {
- return NULL;
- }
if (!PyList_CheckExact(val) || PyList_GET_SIZE(val) != 3) {
PyErr_SetString(PyExc_ValueError, msg);
return NULL;
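
Both error-state accessors above stop parsing an argument tuple themselves: ufunc_geterr now ignores its argument entirely and ufunc_seterr receives the single object directly. That matches the METH_NOARGS and METH_O calling conventions; whether the method table entries are switched accordingly is not visible in this hunk and is assumed in the sketch below.

/* Illustrative only: the two CPython calling conventions implied above. */
#include <Python.h>

static PyObject *
geterr_like(PyObject *self, PyObject *ignored)        /* METH_NOARGS */
{
    (void)self; (void)ignored;                         /* arg is always NULL */
    return PyLong_FromLong(0);
}

static PyObject *
seterr_like(PyObject *self, PyObject *arg)             /* METH_O */
{
    (void)self;
    if (!PyList_CheckExact(arg) || PyList_GET_SIZE(arg) != 3) {
        PyErr_SetString(PyExc_ValueError,
                        "Error object must be a list of length 3");
        return NULL;
    }
    Py_RETURN_NONE;
}

static PyMethodDef example_methods[] = {
    {"geterr_like", (PyCFunction)geterr_like, METH_NOARGS, NULL},
    {"seterr_like", (PyCFunction)seterr_like, METH_O, NULL},
    {NULL, NULL, 0, NULL}
};
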
@@ -5379,7 +5342,7 @@ cmp_arg_types(int *arg1, int *arg2, int n)
* This frees the linked-list structure when the CObject
* is destroyed (removed from the internal dictionary)
*/
-static NPY_INLINE void
+static inline void
_free_loop1d_list(PyUFunc_Loop1d *data)
{
int i;
@@ -5920,10 +5883,11 @@ ufunc_reduceat(PyUFuncObject *ufunc,
}
/* Helper for ufunc_at, below */
-static NPY_INLINE PyArrayObject *
+static inline PyArrayObject *
new_array_op(PyArrayObject *op_array, char *data)
{
npy_intp dims[1] = {1};
+ Py_INCREF(PyArray_DESCR(op_array)); /* NewFromDescr steals a reference */
PyObject *r = PyArray_NewFromDescr(&PyArray_Type, PyArray_DESCR(op_array),
1, dims, NULL, data,
NPY_ARRAY_WRITEABLE, NULL);
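
The Py_INCREF added to new_array_op is needed because PyArray_NewFromDescr steals a reference to the descriptor it is given, so a descriptor borrowed from another array must be INCREF'd first. The pattern in isolation, as a hypothetical helper mirroring the hunk above:

/* Illustrative only: pay the stolen descriptor reference before calling a
 * reference-stealing constructor such as PyArray_NewFromDescr. */
static PyObject *
view_one_element(PyArrayObject *src, char *data)
{
    npy_intp dims[1] = {1};
    PyArray_Descr *descr = PyArray_DESCR(src);   /* borrowed from src */

    Py_INCREF(descr);                            /* NewFromDescr will consume it */
    return PyArray_NewFromDescr(&PyArray_Type, descr,
                                1, dims, NULL, data,
                                NPY_ARRAY_WRITEABLE, NULL);
}
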
@@ -5931,206 +5895,171 @@ new_array_op(PyArrayObject *op_array, char *data)
}
/*
- * Call ufunc only on selected array items and store result in first operand.
- * For add ufunc, method call is equivalent to op1[idx] += op2 with no
- * buffering of the first operand.
- * Arguments:
- * op1 - First operand to ufunc
- * idx - Indices that are applied to first operand. Equivalent to op1[idx].
- * op2 - Second operand to ufunc (if needed). Must be able to broadcast
- * over first operand.
+ * Use an indexed loop to do the work
+ * Returns 0 if successful
*/
-static PyObject *
-ufunc_at(PyUFuncObject *ufunc, PyObject *args)
+static int
+trivial_at_loop(PyArrayMethodObject *ufuncimpl, NPY_ARRAYMETHOD_FLAGS flags,
+ PyArrayMapIterObject *iter,
+ PyArrayObject *op1_array, PyArrayObject *op2_array,
+ PyArrayMethod_Context *context)
{
- PyObject *op1 = NULL;
- PyObject *idx = NULL;
- PyObject *op2 = NULL;
- PyArrayObject *op1_array = NULL;
- PyArrayObject *op2_array = NULL;
- PyArrayMapIterObject *iter = NULL;
- PyArrayIterObject *iter2 = NULL;
- PyArrayObject *operands[3] = {NULL, NULL, NULL};
- PyArrayObject *array_operands[3] = {NULL, NULL, NULL};
-
- PyArray_DTypeMeta *signature[3] = {NULL, NULL, NULL};
- PyArray_DTypeMeta *operand_DTypes[3] = {NULL, NULL, NULL};
- PyArray_Descr *operation_descrs[3] = {NULL, NULL, NULL};
-
- int nop;
-
- /* override vars */
- int errval;
- PyObject *override = NULL;
-
- NpyIter *iter_buffer;
- NpyIter_IterNextFunc *iternext;
- npy_uint32 op_flags[NPY_MAXARGS];
- int buffersize;
- int errormask = 0;
- char * err_msg = NULL;
-
- PyArrayMethod_StridedLoop *strided_loop;
- NpyAuxData *auxdata = NULL;
+ int buffersize=0, errormask = 0;
+ int res;
+ char *args[3];
+ npy_intp steps[3];
+ args[0] = (char *) iter->baseoffset;
+ steps[0] = iter->fancy_strides[0];
+ if (ufuncimpl->nin == 1) {
+ args[2] = NULL;
+ steps[2] = 0;
+ } else {
+ args[2] = (char *)PyArray_DATA(op2_array);
+ if (PyArray_NDIM(op2_array) == 0
+ || PyArray_DIM(op2_array, 0) <= 1) {
+ steps[2] = 0;
+ } else {
+ steps[2] = PyArray_STRIDE(op2_array, 0);
+ }
+ }
- NPY_BEGIN_THREADS_DEF;
+ npy_intp *inner_size = NpyIter_GetInnerLoopSizePtr(iter->outer);
- if (ufunc->core_enabled) {
- PyErr_Format(PyExc_TypeError,
- "%s.at does not support ufunc with non-trivial signature: %s has signature %s.",
- ufunc->name, ufunc->name, ufunc->core_signature);
- return NULL;
+ if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
+ npy_clear_floatstatus_barrier((char *)context);
}
- if (ufunc->nin > 2) {
- PyErr_SetString(PyExc_ValueError,
- "Only unary and binary ufuncs supported at this time");
- return NULL;
- }
+ do {
+ args[1] = (char *) iter->outer_ptrs[0];
+ steps[1] = iter->outer_strides[0];
- if (ufunc->nout != 1) {
- PyErr_SetString(PyExc_ValueError,
- "Only single output ufuncs supported at this time");
- return NULL;
- }
+ res = ufuncimpl->contiguous_indexed_loop(
+ context, args, inner_size, steps, NULL);
+ if (args[2] != NULL) {
+ args[2] += (*inner_size) * steps[2];
+ }
+ } while (res == 0 && iter->outer_next(iter->outer));
- if (!PyArg_ParseTuple(args, "OO|O:at", &op1, &idx, &op2)) {
- return NULL;
+ if (res == 0 && !(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
+ const char * ufunc_name =
+ ufunc_get_name_cstr((PyUFuncObject *)context->caller);
+ if (_get_bufsize_errmask(NULL, ufunc_name,
+ &buffersize, &errormask) < 0) {
+ return -1;
+ }
+ res = _check_ufunc_fperr(errormask, NULL, ufunc_name);
}
+ return res;
+}
- if (ufunc->nin == 2 && op2 == NULL) {
- PyErr_SetString(PyExc_ValueError,
- "second operand needed for ufunc");
- return NULL;
- }
- errval = PyUFunc_CheckOverride(ufunc, "at",
- args, NULL, NULL, 0, NULL, &override);
+static int
+ufunc_at__fast_iter(PyUFuncObject *ufunc, NPY_ARRAYMETHOD_FLAGS flags,
+ PyArrayMapIterObject *iter, PyArrayIterObject *iter2,
+ PyArrayObject *op1_array, PyArrayObject *op2_array,
+ PyArrayMethod_StridedLoop *strided_loop,
+ PyArrayMethod_Context *context,
+ NpyAuxData *auxdata
+ )
+{
+ int buffersize;
+ int errormask = 0;
+ int res = 0;
+ NPY_BEGIN_THREADS_DEF;
- if (errval) {
- return NULL;
- }
- else if (override) {
- return override;
+ if (_get_bufsize_errmask(NULL, ufunc->name, &buffersize, &errormask) < 0) {
+ return -1;
}
-
- if (!PyArray_Check(op1)) {
- PyErr_SetString(PyExc_TypeError,
- "first operand must be array");
- return NULL;
+ int needs_api = (flags & NPY_METH_REQUIRES_PYAPI) != 0;
+ if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
+ /* Start with the floating-point exception flags cleared */
+ npy_clear_floatstatus_barrier((char*)&iter);
}
- op1_array = (PyArrayObject *)op1;
-
- /* Create second operand from number array if needed. */
- if (op2 != NULL) {
- op2_array = (PyArrayObject *)PyArray_FromAny(op2, NULL,
- 0, 0, 0, NULL);
- if (op2_array == NULL) {
- goto fail;
- }
+ if (!needs_api) {
+ NPY_BEGIN_THREADS;
}
- /* Create map iterator */
- iter = (PyArrayMapIterObject *)PyArray_MapIterArrayCopyIfOverlap(
- op1_array, idx, 1, op2_array);
- if (iter == NULL) {
- goto fail;
- }
- op1_array = iter->array; /* May be updateifcopied on overlap */
+ npy_intp strides[3] = {0, 0, 0};
+ /*
+ * Iterate over first and second operands and call ufunc
+ * for each pair of inputs
+ */
+ for (npy_intp i = iter->size; i > 0; i--)
+ {
+ char *dataptr[3];
+ /* one element at a time, no stride required but read by innerloop */
+ npy_intp count = 1;
- if (op2 != NULL) {
/*
- * May need to swap axes so that second operand is
- * iterated over correctly
+ * Set up data pointers for either one or two input operands.
+ * The output data pointer points to the first operand data.
*/
- if ((iter->subspace != NULL) && (iter->consec)) {
- PyArray_MapIterSwapAxes(iter, &op2_array, 0);
- if (op2_array == NULL) {
- goto fail;
- }
+ dataptr[0] = iter->dataptr;
+ if (iter2 != NULL) {
+ dataptr[1] = PyArray_ITER_DATA(iter2);
+ dataptr[2] = iter->dataptr;
+ }
+ else {
+ dataptr[1] = iter->dataptr;
+ dataptr[2] = NULL;
}
- /*
- * Create array iter object for second operand that
- * "matches" the map iter object for the first operand.
- * Then we can just iterate over the first and second
- * operands at the same time and not have to worry about
- * picking the correct elements from each operand to apply
- * the ufunc to.
- */
- if ((iter2 = (PyArrayIterObject *)\
- PyArray_BroadcastToShape((PyObject *)op2_array,
- iter->dimensions, iter->nd))==NULL) {
- goto fail;
+ res = strided_loop(context, dataptr, &count, strides, auxdata);
+ if (res != 0) {
+ break;
}
- }
- /*
- * Create dtypes array for either one or two input operands.
- * Compare to the logic in `convert_ufunc_arguments`.
- * TODO: It may be good to review some of this behaviour, since the
- * operand array is special (it is written to) similar to reductions.
- * Using unsafe-casting as done here, is likely not desirable.
- */
- operands[0] = op1_array;
- operand_DTypes[0] = NPY_DTYPE(PyArray_DESCR(op1_array));
- Py_INCREF(operand_DTypes[0]);
- int force_legacy_promotion = 0;
- int allow_legacy_promotion = NPY_DT_is_legacy(operand_DTypes[0]);
+ PyArray_MapIterNext(iter);
+ if (iter2 != NULL) {
+ PyArray_ITER_NEXT(iter2);
+ }
+ }
- if (op2_array != NULL) {
- operands[1] = op2_array;
- operand_DTypes[1] = NPY_DTYPE(PyArray_DESCR(op2_array));
- Py_INCREF(operand_DTypes[1]);
- allow_legacy_promotion &= NPY_DT_is_legacy(operand_DTypes[1]);
- operands[2] = operands[0];
- operand_DTypes[2] = operand_DTypes[0];
- Py_INCREF(operand_DTypes[2]);
+ NPY_END_THREADS;
- nop = 3;
- if (allow_legacy_promotion && ((PyArray_NDIM(op1_array) == 0)
- != (PyArray_NDIM(op2_array) == 0))) {
- /* both are legacy and only one is 0-D: force legacy */
- force_legacy_promotion = should_use_min_scalar(2, operands, 0, NULL);
- }
- }
- else {
- operands[1] = operands[0];
- operand_DTypes[1] = operand_DTypes[0];
- Py_INCREF(operand_DTypes[1]);
- operands[2] = NULL;
- nop = 2;
+ if (res == 0 && !(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
+ /* NOTE: We could check float errors even when `res < 0` */
+ res = _check_ufunc_fperr(errormask, NULL, "at");
}
+ return res;
+}
- PyArrayMethodObject *ufuncimpl = promote_and_get_ufuncimpl(ufunc,
- operands, signature, operand_DTypes,
- force_legacy_promotion, allow_legacy_promotion,
- NPY_FALSE, NPY_FALSE);
- if (ufuncimpl == NULL) {
- goto fail;
- }
+static int
+ufunc_at__slow_iter(PyUFuncObject *ufunc, NPY_ARRAYMETHOD_FLAGS flags,
+ PyArrayMapIterObject *iter, PyArrayIterObject *iter2,
+ PyArrayObject *op1_array, PyArrayObject *op2_array,
+ PyArray_Descr *operation_descrs[3],
+ PyArrayMethod_StridedLoop *strided_loop,
+ PyArrayMethod_Context *context,
+ NpyAuxData *auxdata
+ )
+{
+ NpyIter *iter_buffer = NULL;
+ PyArrayObject *array_operands[3] = {NULL, NULL, NULL};
+ int buffersize;
+ int errormask = 0;
+ int res = 0;
+ int nop = 0;
+ NpyIter_IterNextFunc *iternext;
+ char * err_msg = NULL;
+ NPY_BEGIN_THREADS_DEF;
- /* Find the correct descriptors for the operation */
- if (resolve_descriptors(nop, ufunc, ufuncimpl,
- operands, operation_descrs, signature, NPY_UNSAFE_CASTING) < 0) {
- goto fail;
+ if (_get_bufsize_errmask(NULL, ufunc->name, &buffersize, &errormask) < 0) {
+ return -1;
}
-
- Py_INCREF(PyArray_DESCR(op1_array));
array_operands[0] = new_array_op(op1_array, iter->dataptr);
if (iter2 != NULL) {
- Py_INCREF(PyArray_DESCR(op2_array));
array_operands[1] = new_array_op(op2_array, PyArray_ITER_DATA(iter2));
- Py_INCREF(PyArray_DESCR(op1_array));
array_operands[2] = new_array_op(op1_array, iter->dataptr);
+ nop = 3;
}
else {
- Py_INCREF(PyArray_DESCR(op1_array));
array_operands[1] = new_array_op(op1_array, iter->dataptr);
array_operands[2] = NULL;
+ nop = 2;
}
-
/* Set up the flags */
+ npy_uint32 op_flags[3];
op_flags[0] = NPY_ITER_READONLY|
NPY_ITER_ALIGNED;
@@ -6150,11 +6079,6 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args)
NPY_ITER_NO_BROADCAST|
NPY_ITER_NO_SUBTYPE;
}
-
- if (_get_bufsize_errmask(NULL, ufunc->name, &buffersize, &errormask) < 0) {
- goto fail;
- }
-
/*
* Create NpyIter object to "iterate" over single element of each input
* operand. This is an easy way to reuse the NpyIter logic for dealing
@@ -6177,33 +6101,23 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args)
-1, NULL, NULL, buffersize);
if (iter_buffer == NULL) {
- goto fail;
+ /* will fail only on memory allocation errors */
+ for (int i = 0; i < 3; i++) {
+ Py_XDECREF(array_operands[i]);
+ }
+ return -1;
}
iternext = NpyIter_GetIterNext(iter_buffer, NULL);
if (iternext == NULL) {
+        /* cannot really happen, iter_buffer creation is tightly controlled */
NpyIter_Deallocate(iter_buffer);
- goto fail;
- }
-
- PyArrayMethod_Context context = {
- .caller = (PyObject *)ufunc,
- .method = ufuncimpl,
- .descriptors = operation_descrs,
- };
-
- NPY_ARRAYMETHOD_FLAGS flags;
- /* Use contiguous strides; if there is such a loop it may be faster */
- npy_intp strides[3] = {
- operation_descrs[0]->elsize, operation_descrs[1]->elsize, 0};
- if (nop == 3) {
- strides[2] = operation_descrs[2]->elsize;
+ for (int i = 0; i < 3; i++) {
+ Py_XDECREF(array_operands[i]);
+ }
+ return -1;
}
- if (ufuncimpl->get_strided_loop(&context, 1, 0, strides,
- &strided_loop, &auxdata, &flags) < 0) {
- goto fail;
- }
int needs_api = (flags & NPY_METH_REQUIRES_PYAPI) != 0;
needs_api |= NpyIter_IterationNeedsAPI(iter_buffer);
if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
@@ -6211,6 +6125,7 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args)
npy_clear_floatstatus_barrier((char*)&iter);
}
+ npy_intp strides[3] = {0, 0, 0};
if (!needs_api) {
NPY_BEGIN_THREADS;
}
@@ -6219,7 +6134,6 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args)
* Iterate over first and second operands and call ufunc
* for each pair of inputs
*/
- int res = 0;
for (npy_intp i = iter->size; i > 0; i--)
{
char *dataptr[3];
@@ -6250,7 +6164,7 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args)
buffer_dataptr = NpyIter_GetDataPtrArray(iter_buffer);
- res = strided_loop(&context, buffer_dataptr, &count, strides, auxdata);
+ res = strided_loop(context, buffer_dataptr, &count, strides, auxdata);
if (res != 0) {
break;
}
@@ -6276,18 +6190,284 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args)
/* NOTE: We could check float errors even when `res < 0` */
res = _check_ufunc_fperr(errormask, NULL, "at");
}
+ NpyIter_Deallocate(iter_buffer);
+ for (int i = 0; i < 3; i++) {
+ Py_XDECREF(array_operands[i]);
+ }
+ return res;
+}
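
A hedged Python-level illustration of when the buffered fallback above should be taken: if an operand's dtype does not match the resolved loop descriptors (so a cast is needed), the fast paths added below are skipped. Which internal path runs is an implementation detail; the result is the same either way.

    >>> import numpy as np
    >>> a = np.zeros(4, dtype=np.float32)
    >>> np.add.at(a, [0, 2], np.float64(1.5))   # float64 operand forces a cast
    >>> a
    array([1.5, 0. , 1.5, 0. ], dtype=float32)
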
+/*
+ * Call ufunc only on selected array items and store result in first operand.
+ * For the add ufunc, this method call is equivalent to op1[idx] += op2,
+ * with no buffering of the first operand.
+ * Arguments:
+ * op1 - First operand to the ufunc
+ * idx - Indices that are applied to the first operand. Equivalent to op1[idx].
+ * op2 - Second operand to the ufunc (if needed). Must be able to broadcast
+ *       over the first operand.
+ */
+static PyObject *
+ufunc_at(PyUFuncObject *ufunc, PyObject *args)
+{
+ PyObject *op1 = NULL;
+ PyObject *idx = NULL;
+ PyObject *op2 = NULL;
+ PyArrayObject *op1_array = NULL;
+ PyArrayObject *op2_array = NULL;
+ PyArrayMapIterObject *iter = NULL;
+ PyArrayIterObject *iter2 = NULL;
+ PyArray_Descr *operation_descrs[3] = {NULL, NULL, NULL};
+
+ int nop;
+
+ /* override vars */
+ int errval;
+ PyObject *override = NULL;
+ int res = -1; /* start with fail condition so "goto fail" will error */
+
+ PyArrayMethod_StridedLoop *strided_loop;
+ NpyAuxData *auxdata = NULL;
+
+ if (ufunc->core_enabled) {
+ PyErr_Format(PyExc_TypeError,
+ "%s.at does not support ufunc with non-trivial signature: %s has signature %s.",
+ ufunc->name, ufunc->name, ufunc->core_signature);
+ return NULL;
+ }
+
+ if (ufunc->nin > 2) {
+ PyErr_SetString(PyExc_ValueError,
+ "Only unary and binary ufuncs supported at this time");
+ return NULL;
+ }
+
+ if (ufunc->nout != 1) {
+ PyErr_SetString(PyExc_ValueError,
+ "Only single output ufuncs supported at this time");
+ return NULL;
+ }
+
+ if (!PyArg_ParseTuple(args, "OO|O:at", &op1, &idx, &op2)) {
+ return NULL;
+ }
+
+ if (ufunc->nin == 2 && op2 == NULL) {
+ PyErr_SetString(PyExc_ValueError,
+ "second operand needed for ufunc");
+ return NULL;
+ }
+
+ if (ufunc->nin == 1 && op2 != NULL) {
+ PyErr_SetString(PyExc_ValueError,
+ "second operand provided when ufunc is unary");
+ return NULL;
+ }
+ errval = PyUFunc_CheckOverride(ufunc, "at",
+ args, NULL, NULL, NULL, 0, NULL, &override);
+
+ if (errval) {
+ return NULL;
+ }
+ else if (override) {
+ return override;
+ }
+
+ if (!PyArray_Check(op1)) {
+ PyErr_SetString(PyExc_TypeError,
+ "first operand must be array");
+ return NULL;
+ }
+
+ op1_array = (PyArrayObject *)op1;
+
+ /* Create second operand from number array if needed. */
+ if (op2 == NULL) {
+ nop = 2;
+ }
+ else {
+ nop = 3;
+ op2_array = (PyArrayObject *)PyArray_FromAny(op2, NULL,
+ 0, 0, 0, NULL);
+ if (op2_array == NULL) {
+ goto fail;
+ }
+ }
+
+ PyArrayMethodObject *ufuncimpl = NULL;
+ {
+ /* Do all the dtype handling and find the correct ufuncimpl */
+
+ PyArrayObject *tmp_operands[3] = {NULL, NULL, NULL};
+ PyArray_DTypeMeta *signature[3] = {NULL, NULL, NULL};
+ PyArray_DTypeMeta *operand_DTypes[3] = {NULL, NULL, NULL};
+ /*
+ * Create dtypes array for either one or two input operands.
+ * Compare to the logic in `convert_ufunc_arguments`.
+ * TODO: It may be good to review some of this behaviour, since the
+ * operand array is special (it is written to) similar to reductions.
+ * Using unsafe casting as done here is likely not desirable.
+ */
+ tmp_operands[0] = op1_array;
+ operand_DTypes[0] = NPY_DTYPE(PyArray_DESCR(op1_array));
+ Py_INCREF(operand_DTypes[0]);
+ int force_legacy_promotion = 0;
+ int allow_legacy_promotion = NPY_DT_is_legacy(operand_DTypes[0]);
+
+ if (op2_array != NULL) {
+ tmp_operands[1] = op2_array;
+ operand_DTypes[1] = NPY_DTYPE(PyArray_DESCR(op2_array));
+ Py_INCREF(operand_DTypes[1]);
+ allow_legacy_promotion &= NPY_DT_is_legacy(operand_DTypes[1]);
+ tmp_operands[2] = tmp_operands[0];
+ operand_DTypes[2] = operand_DTypes[0];
+ Py_INCREF(operand_DTypes[2]);
+
+ if (allow_legacy_promotion && ((PyArray_NDIM(op1_array) == 0)
+ != (PyArray_NDIM(op2_array) == 0))) {
+ /* both are legacy and only one is 0-D: force legacy */
+ force_legacy_promotion = should_use_min_scalar(2, tmp_operands, 0, NULL);
+ }
+ }
+ else {
+ tmp_operands[1] = tmp_operands[0];
+ operand_DTypes[1] = operand_DTypes[0];
+ Py_INCREF(operand_DTypes[1]);
+ tmp_operands[2] = NULL;
+ }
+
+ ufuncimpl = promote_and_get_ufuncimpl(ufunc, tmp_operands, signature,
+ operand_DTypes, force_legacy_promotion,
+ allow_legacy_promotion, NPY_FALSE, NPY_FALSE);
+ if (ufuncimpl == NULL) {
+ for (int i = 0; i < 3; i++) {
+ Py_XDECREF(signature[i]);
+ Py_XDECREF(operand_DTypes[i]);
+ }
+ goto fail;
+ }
+
+ /* Find the correct operation_descrs for the operation */
+ int resolve_result = resolve_descriptors(nop, ufunc, ufuncimpl,
+ tmp_operands, operation_descrs, signature, NPY_UNSAFE_CASTING);
+ for (int i = 0; i < 3; i++) {
+ Py_XDECREF(signature[i]);
+ Py_XDECREF(operand_DTypes[i]);
+ }
+ if (resolve_result < 0) {
+ goto fail;
+ }
+ }
+
+ iter = (PyArrayMapIterObject *)PyArray_MapIterArrayCopyIfOverlap(
+ op1_array, idx, 1, op2_array);
+ if (iter == NULL) {
+ goto fail;
+ }
+ op1_array = iter->array; /* May be updateifcopied on overlap */
+
+ if (op2_array != NULL) {
+ /*
+ * May need to swap axes so that second operand is
+ * iterated over correctly
+ */
+ if ((iter->subspace != NULL) && (iter->consec)) {
+ PyArray_MapIterSwapAxes(iter, &op2_array, 0);
+ if (op2_array == NULL) {
+ /* only on memory allocation failure */
+ goto fail;
+ }
+ }
+
+ /*
+ * Create array iter object for second operand that
+ * "matches" the map iter object for the first operand.
+ * Then we can just iterate over the first and second
+ * operands at the same time and not have to worry about
+ * picking the correct elements from each operand to apply
+ * the ufunc to.
+ */
+ if ((iter2 = (PyArrayIterObject *)\
+ PyArray_BroadcastToShape((PyObject *)op2_array,
+ iter->dimensions, iter->nd))==NULL) {
+ goto fail;
+ }
+ }
+
+ PyArrayMethod_Context context = {
+ .caller = (PyObject *)ufunc,
+ .method = ufuncimpl,
+ .descriptors = operation_descrs,
+ };
+
+ /* Use contiguous strides; if there is such a loop it may be faster */
+ npy_intp strides[3] = {
+ operation_descrs[0]->elsize, operation_descrs[1]->elsize, 0};
+ if (nop == 3) {
+ strides[2] = operation_descrs[2]->elsize;
+ }
+
+ NPY_ARRAYMETHOD_FLAGS flags;
+ if (ufuncimpl->get_strided_loop(&context, 1, 0, strides,
+ &strided_loop, &auxdata, &flags) < 0) {
+ goto fail;
+ }
+ int fast_path = 1;
+ /* check that no casting is needed and that the operands are aligned */
+ if (PyArray_DESCR(op1_array) != operation_descrs[0]) {
+ fast_path = 0;
+ }
+ if (PyArray_DESCR(op1_array) != operation_descrs[nop - 1]) {
+ /* output casting */
+ fast_path = 0;
+ }
+ if (!PyArray_ISALIGNED(op1_array)) {
+ fast_path = 0;
+ }
+ if (nop > 2) {
+ if (PyArray_DESCR(op2_array) != operation_descrs[1]) {
+ fast_path = 0;
+ }
+ if (!PyArray_ISALIGNED(op2_array)) {
+ fast_path = 0;
+ }
+ }
+ if (fast_path == 1) {
+ /*
+ * Try to use the trivial loop (1-D, no casting, aligned) if
+ * - the matching info has an indexed loop
+ * - idx is exactly one integer index array
+ * - all operands are 1-D
+ * A future enhancement could loosen the restriction on 1-D operands
+ * by adding an iteration loop inside trivial_at_loop.
+ */
+ if ((ufuncimpl->contiguous_indexed_loop != NULL) &&
+ (PyArray_NDIM(op1_array) == 1) &&
+ (op2_array == NULL || PyArray_NDIM(op2_array) <= 1) &&
+ (iter->subspace_iter == NULL) && (iter->numiter == 1)) {
+ res = trivial_at_loop(ufuncimpl, flags, iter, op1_array,
+ op2_array, &context);
+
+ }
+ else {
+ /* Couldn't use the fastest (indexed) path; fall back to the fast path */
+ res = ufunc_at__fast_iter(ufunc, flags, iter, iter2, op1_array,
+ op2_array, strided_loop, &context, auxdata);
+ }
+ } else {
+ res = ufunc_at__slow_iter(ufunc, flags, iter, iter2, op1_array,
+ op2_array, operation_descrs, strided_loop, &context,
+ auxdata);
+ }
+
+fail:
NPY_AUXDATA_FREE(auxdata);
- NpyIter_Deallocate(iter_buffer);
Py_XDECREF(op2_array);
Py_XDECREF(iter);
Py_XDECREF(iter2);
for (int i = 0; i < nop; i++) {
- Py_DECREF(signature[i]);
- Py_XDECREF(operand_DTypes[i]);
Py_XDECREF(operation_descrs[i]);
- Py_XDECREF(array_operands[i]);
}
/*
@@ -6296,29 +6476,406 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args)
* (e.g. `power` released the GIL but manually set an Exception).
*/
if (res != 0 || PyErr_Occurred()) {
+ /* iter_buffer has already been deallocated, don't use NpyIter_Dealloc */
+ if (PyArray_FLAGS(op1_array) & NPY_ARRAY_WRITEBACKIFCOPY) {
+ PyArray_DiscardWritebackIfCopy(op1_array);
+ }
return NULL;
}
else {
Py_RETURN_NONE;
}
+}
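
For reference, a minimal sketch of the Python-level behaviour that the rewritten `ufunc_at` above implements: unbuffered, in-place application, so repeated indices accumulate (unlike plain fancy-index assignment), and unary ufuncs simply omit the second operand. When the operands are 1-D, aligned, and need no casting, the new indexed fast path can be used; the choice of path is internal.

    >>> import numpy as np
    >>> a = np.zeros(5)
    >>> np.add.at(a, [0, 1, 1, 3], 1.0)    # index 1 is accumulated twice
    >>> a
    array([1., 2., 0., 1., 0.])
    >>> b = np.arange(5.0)
    >>> np.negative.at(b, [0, 4])          # unary ufunc: no second operand
    >>> b
    array([-0.,  1.,  2.,  3., -4.])
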
-fail:
- /* iter_buffer has already been deallocated, don't use NpyIter_Dealloc */
- if (op1_array != (PyArrayObject*)op1) {
- PyArray_DiscardWritebackIfCopy(op1_array);
+
+typedef struct {
+ PyArrayMethod_StridedLoop *strided_loop;
+ PyArrayMethod_Context *context;
+ NpyAuxData *auxdata;
+ /* Should move to flags, but lets keep it bools for now: */
+ npy_bool requires_pyapi;
+ npy_bool no_floatingpoint_errors;
+ PyArrayMethod_Context _full_context;
+ PyArray_Descr *_descrs[];
+} ufunc_call_info;
+
+
+void
+free_ufunc_call_info(PyObject *self)
+{
+ ufunc_call_info *call_info = PyCapsule_GetPointer(
+ self, "numpy_1.24_ufunc_call_info");
+
+ PyArrayMethod_Context *context = call_info->context;
+
+ int nargs = context->method->nin + context->method->nout;
+ for (int i = 0; i < nargs; i++) {
+ Py_DECREF(context->descriptors[i]);
}
- Py_XDECREF(op2_array);
- Py_XDECREF(iter);
- Py_XDECREF(iter2);
- for (int i = 0; i < 3; i++) {
+ Py_DECREF(context->caller);
+ Py_DECREF(context->method);
+ NPY_AUXDATA_FREE(call_info->auxdata);
+
+ PyObject_Free(call_info);
+}
+
+
+/*
+ * Python entry-point to ufunc promotion and dtype/descr resolution.
+ *
+ * This function does most of the work required to execute a ufunc without
+ * actually executing it.
+ * This can be very useful for downstream libraries that reimplement NumPy
+ * functionality, such as Numba or Dask.
+ */
+static PyObject *
+py_resolve_dtypes_generic(PyUFuncObject *ufunc, npy_bool return_context,
+ PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames)
+{
+ NPY_PREPARE_ARGPARSER;
+
+ PyObject *descrs_tuple;
+ PyObject *signature_obj = NULL;
+ NPY_CASTING casting = NPY_DEFAULT_ASSIGN_CASTING;
+ npy_bool reduction = NPY_FALSE;
+
+ if (npy_parse_arguments("resolve_dtypes", args, len_args, kwnames,
+ "", NULL, &descrs_tuple,
+ "$signature", NULL, &signature_obj,
+ "$casting", &PyArray_CastingConverter, &casting,
+ "$reduction", &PyArray_BoolConverter, &reduction,
+ NULL, NULL, NULL) < 0) {
+ return NULL;
+ }
+
+ if (reduction && (ufunc->nin != 2 || ufunc->nout != 1)) {
+ PyErr_SetString(PyExc_ValueError,
+ "ufunc is not compatible with reduction operations.");
+ return NULL;
+ }
+
+ /*
+ * Legacy type resolvers expect NumPy arrays as input. Until NEP 50 is
+ * adopted, it is most convenient to ensure that we have an "array" object
+ * before calling the type promotion. Eventually, this hack may be moved
+ * into the legacy type resolution code itself (probably after NumPy stops
+ * using legacy type resolution itself for the most part).
+ *
+ * We make the pretty safe assumptions here that:
+ * - Nobody will actually do anything with the array objects besides
+ * checking the descriptor or calling CanCast.
+ * - No type resolver will cause weird paths that mess with our promotion
+ * state (or mind us messing with it).
+ */
+ PyObject *result = NULL;
+ PyObject *result_dtype_tuple = NULL;
+
+ PyArrayObject *dummy_arrays[NPY_MAXARGS] = {NULL};
+ PyArray_DTypeMeta *DTypes[NPY_MAXARGS] = {NULL};
+ PyArray_DTypeMeta *signature[NPY_MAXARGS] = {NULL};
+ PyArray_Descr *operation_descrs[NPY_MAXARGS] = {NULL};
+
+ /* This entry-point to promotion lives in the NEP 50 future: */
+ int original_promotion_state = npy_promotion_state;
+ npy_promotion_state = NPY_USE_WEAK_PROMOTION;
+
+ npy_bool promoting_pyscalars = NPY_FALSE;
+ npy_bool allow_legacy_promotion = NPY_TRUE;
+
+ if (_get_fixed_signature(ufunc, NULL, signature_obj, signature) < 0) {
+ goto finish;
+ }
+
+ if (!PyTuple_CheckExact(descrs_tuple)
+ || PyTuple_Size(descrs_tuple) != ufunc->nargs) {
+ PyErr_SetString(PyExc_TypeError,
+ "resolve_dtypes: The dtypes must be a tuple of "
+ "`ufunc.nargs` length.");
+ goto finish;
+ }
+ for (int i=0; i < ufunc->nargs; i++) {
+ /*
+ * We create dummy arrays for now. It should be OK to make this
+ * truly "dummy" (not even proper objects), but that is a hack better
+ * left for the legacy_type_resolution wrapper when NEP 50 is done.
+ */
+ PyObject *descr_obj = PyTuple_GET_ITEM(descrs_tuple, i);
+ PyArray_Descr *descr;
+
+ if (PyArray_DescrCheck(descr_obj)) {
+ descr = (PyArray_Descr *)descr_obj;
+ Py_INCREF(descr);
+ dummy_arrays[i] = (PyArrayObject *)PyArray_NewFromDescr_int(
+ &PyArray_Type, descr, 0, NULL, NULL, NULL,
+ 0, NULL, NULL, _NPY_ARRAY_ENSURE_DTYPE_IDENTITY);
+ if (dummy_arrays[i] == NULL) {
+ goto finish;
+ }
+ DTypes[i] = NPY_DTYPE(descr);
+ Py_INCREF(DTypes[i]);
+ if (!NPY_DT_is_legacy(DTypes[i])) {
+ allow_legacy_promotion = NPY_FALSE;
+ }
+ }
+ /* Explicitly allow int, float, and complex for the "weak" types. */
+ else if (descr_obj == (PyObject *)&PyLong_Type) {
+ descr = PyArray_DescrFromType(NPY_LONG);
+ dummy_arrays[i] = (PyArrayObject *)PyArray_Empty(0, NULL, descr, 0);
+ if (dummy_arrays[i] == NULL) {
+ goto finish;
+ }
+ PyArray_ENABLEFLAGS(dummy_arrays[i], NPY_ARRAY_WAS_PYTHON_INT);
+ Py_INCREF(&PyArray_PyIntAbstractDType);
+ DTypes[i] = &PyArray_PyIntAbstractDType;
+ promoting_pyscalars = NPY_TRUE;
+ }
+ else if (descr_obj == (PyObject *)&PyFloat_Type) {
+ descr = PyArray_DescrFromType(NPY_DOUBLE);
+ dummy_arrays[i] = (PyArrayObject *)PyArray_Empty(0, NULL, descr, 0);
+ if (dummy_arrays[i] == NULL) {
+ goto finish;
+ }
+ PyArray_ENABLEFLAGS(dummy_arrays[i], NPY_ARRAY_WAS_PYTHON_FLOAT);
+ Py_INCREF(&PyArray_PyFloatAbstractDType);
+ DTypes[i] = &PyArray_PyFloatAbstractDType;
+ promoting_pyscalars = NPY_TRUE;
+ }
+ else if (descr_obj == (PyObject *)&PyComplex_Type) {
+ descr = PyArray_DescrFromType(NPY_CDOUBLE);
+ dummy_arrays[i] = (PyArrayObject *)PyArray_Empty(0, NULL, descr, 0);
+ if (dummy_arrays[i] == NULL) {
+ goto finish;
+ }
+ PyArray_ENABLEFLAGS(dummy_arrays[i], NPY_ARRAY_WAS_PYTHON_COMPLEX);
+ Py_INCREF(&PyArray_PyComplexAbstractDType);
+ DTypes[i] = &PyArray_PyComplexAbstractDType;
+ promoting_pyscalars = NPY_TRUE;
+ }
+ else if (descr_obj == Py_None) {
+ if (i < ufunc->nin && !(reduction && i == 0)) {
+ PyErr_SetString(PyExc_TypeError,
+ "All input dtypes must be provided "
+ "(except the first one in reductions)");
+ goto finish;
+ }
+ }
+ else {
+ PyErr_SetString(PyExc_TypeError,
+ "Provided dtype must be a valid NumPy dtype, "
+ "int, float, complex, or None.");
+ goto finish;
+ }
+ }
+
+ PyArrayMethodObject *ufuncimpl;
+ if (!reduction) {
+ ufuncimpl = promote_and_get_ufuncimpl(ufunc,
+ dummy_arrays, signature, DTypes, NPY_FALSE,
+ allow_legacy_promotion, promoting_pyscalars, NPY_FALSE);
+ if (ufuncimpl == NULL) {
+ goto finish;
+ }
+
+ /* Find the correct descriptors for the operation */
+ if (resolve_descriptors(ufunc->nargs, ufunc, ufuncimpl,
+ dummy_arrays, operation_descrs, signature, casting) < 0) {
+ goto finish;
+ }
+
+ if (validate_casting(
+ ufuncimpl, ufunc, dummy_arrays, operation_descrs, casting) < 0) {
+ goto finish;
+ }
+ }
+ else { /* reduction */
+ if (signature[2] != NULL) {
+ PyErr_SetString(PyExc_ValueError,
+ "Reduction signature must end with None, instead pass "
+ "the first DType in the signature.");
+ goto finish;
+ }
+
+ if (dummy_arrays[2] != NULL) {
+ PyErr_SetString(PyExc_TypeError,
+ "Output dtype must not be passed for reductions, "
+ "pass the first input instead.");
+ goto finish;
+ }
+
+ ufuncimpl = reducelike_promote_and_resolve(ufunc,
+ dummy_arrays[1], dummy_arrays[0], signature, NPY_FALSE,
+ operation_descrs, casting, "resolve_dtypes");
+
+ if (ufuncimpl == NULL) {
+ goto finish;
+ }
+ }
+
+ result = PyArray_TupleFromItems(
+ ufunc->nargs, (PyObject **)operation_descrs, 0);
+
+ if (result == NULL || !return_context) {
+ goto finish;
+ }
+ /* Result will be (dtype_tuple, call_info), so move it and clear result */
+ result_dtype_tuple = result;
+ result = NULL;
+
+ /* We may have to return the context: */
+ ufunc_call_info *call_info;
+ call_info = PyObject_Malloc(sizeof(ufunc_call_info)
+ + ufunc->nargs * sizeof(PyArray_Descr *));
+ if (call_info == NULL) {
+ PyErr_NoMemory();
+ goto finish;
+ }
+ call_info->strided_loop = NULL;
+ call_info->auxdata = NULL;
+ call_info->context = &call_info->_full_context;
+
+ /*
+ * We create a capsule with NumPy 1.24 in the name to signal that it may
+ * change in future versions (although it does not have to).
+ * This capsule is documented in the `ufunc._resolve_dtypes_and_context`
+ * docstring.
+ */
+ PyObject *capsule = PyCapsule_New(
+ call_info, "numpy_1.24_ufunc_call_info", &free_ufunc_call_info);
+ if (capsule == NULL) {
+ PyObject_Free(call_info);
+ goto finish;
+ }
+
+ PyArrayMethod_Context *context = call_info->context;
+
+ Py_INCREF(ufunc);
+ context->caller = (PyObject *)ufunc;
+ Py_INCREF(ufuncimpl);
+ context->method = ufuncimpl;
+ context->descriptors = call_info->_descrs;
+ for (int i=0; i < ufunc->nargs; i++) {
+ Py_INCREF(operation_descrs[i]);
+ context->descriptors[i] = operation_descrs[i];
+ }
+
+ result = PyTuple_Pack(2, result_dtype_tuple, capsule);
+ /* cleanup and return */
+ Py_DECREF(capsule);
+
+ finish:
+ npy_promotion_state = original_promotion_state;
+
+ Py_XDECREF(result_dtype_tuple);
+ for (int i = 0; i < ufunc->nargs; i++) {
Py_XDECREF(signature[i]);
- Py_XDECREF(operand_DTypes[i]);
+ Py_XDECREF(dummy_arrays[i]);
Py_XDECREF(operation_descrs[i]);
- Py_XDECREF(array_operands[i]);
+ Py_XDECREF(DTypes[i]);
}
- NPY_AUXDATA_FREE(auxdata);
- return NULL;
+ return result;
+}
+
+
+static PyObject *
+py_resolve_dtypes(PyUFuncObject *ufunc,
+ PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames)
+{
+ return py_resolve_dtypes_generic(ufunc, NPY_FALSE, args, len_args, kwnames);
+}
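
A hedged sketch of the new `resolve_dtypes` method exposed by the wrapper above: it takes one positional tuple of length `ufunc.nargs` (output entries may be None) plus the keyword-only `signature`, `casting`, and `reduction` arguments parsed in the generic implementation. The results below assume default casting and the promotion rules as of this change.

    >>> import numpy as np
    >>> np.add.resolve_dtypes((np.dtype(np.int32), np.dtype(np.float64), None))
    (dtype('float64'), dtype('float64'), dtype('float64'))
    >>> # Python scalar types are treated as "weak" (NEP 50 style), so a plain
    >>> # float does not upcast a float32 operand:
    >>> np.add.resolve_dtypes((np.dtype(np.float32), float, None))
    (dtype('float32'), dtype('float32'), dtype('float32'))
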
+
+
+static PyObject *
+py_resolve_dtypes_and_context(PyUFuncObject *ufunc,
+ PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames)
+{
+ return py_resolve_dtypes_generic(ufunc, NPY_TRUE, args, len_args, kwnames);
+}
+
+
+static PyObject *
+py_get_strided_loop(PyUFuncObject *ufunc,
+ PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames)
+{
+ NPY_PREPARE_ARGPARSER;
+
+ PyObject *call_info_obj;
+ PyObject *fixed_strides_obj = Py_None;
+ npy_intp fixed_strides[NPY_MAXARGS];
+
+ if (npy_parse_arguments("_get_strided_loop", args, len_args, kwnames,
+ "", NULL, &call_info_obj,
+ "$fixed_strides", NULL, &fixed_strides_obj,
+ NULL, NULL, NULL) < 0) {
+ return NULL;
+ }
+
+ ufunc_call_info *call_info = PyCapsule_GetPointer(
+ call_info_obj, "numpy_1.24_ufunc_call_info");
+ if (call_info == NULL) {
+ /* Cannot have a context with NULL inside... */
+ assert(PyErr_Occurred());
+ return NULL;
+ }
+ if (call_info->strided_loop != NULL) {
+ PyErr_SetString(PyExc_TypeError,
+ "ufunc call_info has already been filled/used!");
+ return NULL;
+ }
+
+ if (call_info->context->caller != (PyObject *)ufunc) {
+ PyErr_SetString(PyExc_TypeError,
+ "calling get_strided_loop with incompatible context");
+ return NULL;
+ }
+
+ /*
+ * Strict conversion of fixed_strides: either None or a tuple of ints and Nones.
+ */
+ if (fixed_strides_obj == Py_None) {
+ for (int i = 0; i < ufunc->nargs; i++) {
+ fixed_strides[i] = NPY_MAX_INTP;
+ }
+ }
+ else if (PyTuple_CheckExact(fixed_strides_obj)
+ && PyTuple_Size(fixed_strides_obj) == ufunc->nargs) {
+ for (int i = 0; i < ufunc->nargs; i++) {
+ PyObject *stride = PyTuple_GET_ITEM(fixed_strides_obj, i);
+ if (PyLong_CheckExact(stride)) {
+ fixed_strides[i] = PyLong_AsSsize_t(stride);
+ if (error_converting(fixed_strides[i])) {
+ return NULL;
+ }
+ }
+ else if (stride == Py_None) {
+ fixed_strides[i] = NPY_MAX_INTP;
+ }
+ else {
+ PyErr_SetString(PyExc_TypeError,
+ "_get_strided_loop(): fixed_strides tuple must contain "
+ "Python ints or None");
+ return NULL;
+ }
+ }
+ }
+ else {
+ PyErr_SetString(PyExc_TypeError,
+ "_get_strided_loop(): fixed_strides must be a tuple or None");
+ return NULL;
+ }
+
+ NPY_ARRAYMETHOD_FLAGS flags;
+ if (call_info->context->method->get_strided_loop(call_info->context,
+ 1, 0, fixed_strides, &call_info->strided_loop, &call_info->auxdata,
+ &flags) < 0) {
+ return NULL;
+ }
+
+ call_info->requires_pyapi = flags & NPY_METH_REQUIRES_PYAPI;
+ call_info->no_floatingpoint_errors = (
+ flags & NPY_METH_NO_FLOATINGPOINT_ERRORS);
+
+ Py_RETURN_NONE;
}
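
A hedged sketch of how the two private hooks above are meant to be used together. The returned capsule is the `numpy_1.24_ufunc_call_info` structure defined earlier and is explicitly subject to change, so it should only be treated as an opaque handle to pass on to C code.

    >>> import numpy as np
    >>> dtypes, call_info = np.add._resolve_dtypes_and_context(
    ...     (np.dtype(np.float64), np.dtype(np.float64), None))
    >>> dtypes
    (dtype('float64'), dtype('float64'), dtype('float64'))
    >>> # Fill the strided loop into the capsule; fixed_strides may be None or
    >>> # a tuple of ints/None with one entry per ufunc argument.
    >>> np.add._get_strided_loop(call_info, fixed_strides=(8, 8, 8))
    >>> # The filled capsule can now be handed to C (e.g. from a Cython module);
    >>> # calling _get_strided_loop a second time on the same capsule raises.
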
@@ -6338,6 +6895,21 @@ static struct PyMethodDef ufunc_methods[] = {
{"at",
(PyCFunction)ufunc_at,
METH_VARARGS, NULL},
+ /* Lower level methods: */
+ {"resolve_dtypes",
+ (PyCFunction)py_resolve_dtypes,
+ METH_FASTCALL | METH_KEYWORDS, NULL},
+ /*
+ * The following two functions are public API, but underscored since they
+ * are C-user specific and allow direct access to the core of ufunc loops.
+ * (See their documentation for API stability.)
+ */
+ {"_resolve_dtypes_and_context",
+ (PyCFunction)py_resolve_dtypes_and_context,
+ METH_FASTCALL | METH_KEYWORDS, NULL},
+ {"_get_strided_loop",
+ (PyCFunction)py_get_strided_loop,
+ METH_FASTCALL | METH_KEYWORDS, NULL},
{NULL, NULL, 0, NULL} /* sentinel */
};
@@ -6459,7 +7031,7 @@ static PyObject *
ufunc_get_identity(PyUFuncObject *ufunc, void *NPY_UNUSED(ignored))
{
npy_bool reorderable;
- return _get_identity(ufunc, &reorderable);
+ return PyUFunc_GetDefaultIdentity(ufunc, &reorderable);
}
static PyObject *
@@ -6518,6 +7090,7 @@ NPY_NO_EXPORT PyTypeObject PyUFunc_Type = {
.tp_name = "numpy.ufunc",
.tp_basicsize = sizeof(PyUFuncObject),
.tp_dealloc = (destructor)ufunc_dealloc,
+ .tp_vectorcall_offset = offsetof(PyUFuncObject, vectorcall),
.tp_repr = (reprfunc)ufunc_repr,
.tp_call = &PyVectorcall_Call,
.tp_str = (reprfunc)ufunc_repr,
@@ -6527,7 +7100,6 @@ NPY_NO_EXPORT PyTypeObject PyUFunc_Type = {
.tp_traverse = (traverseproc)ufunc_traverse,
.tp_methods = ufunc_methods,
.tp_getset = ufunc_getset,
- .tp_vectorcall_offset = offsetof(PyUFuncObject, vectorcall),
};
/* End of code for ufunc objects */
diff --git a/numpy/core/src/umath/ufunc_object.h b/numpy/core/src/umath/ufunc_object.h
index 32af6c58e..1b80bb2df 100644
--- a/numpy/core/src/umath/ufunc_object.h
+++ b/numpy/core/src/umath/ufunc_object.h
@@ -4,14 +4,17 @@
#include <numpy/ufuncobject.h>
NPY_NO_EXPORT PyObject *
-ufunc_geterr(PyObject *NPY_UNUSED(dummy), PyObject *args);
+ufunc_geterr(PyObject *NPY_UNUSED(dummy), PyObject *NPY_UNUSED(arg));
NPY_NO_EXPORT PyObject *
-ufunc_seterr(PyObject *NPY_UNUSED(dummy), PyObject *args);
+ufunc_seterr(PyObject *NPY_UNUSED(dummy), PyObject *arg);
NPY_NO_EXPORT const char*
ufunc_get_name_cstr(PyUFuncObject *ufunc);
+NPY_NO_EXPORT PyObject *
+PyUFunc_GetDefaultIdentity(PyUFuncObject *ufunc, npy_bool *reorderable);
+
/* strings from umathmodule.c that are interned on umath import */
NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_array_ufunc;
NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_array_prepare;
diff --git a/numpy/core/src/umath/ufunc_type_resolution.c b/numpy/core/src/umath/ufunc_type_resolution.c
index 94338e031..decd26580 100644
--- a/numpy/core/src/umath/ufunc_type_resolution.c
+++ b/numpy/core/src/umath/ufunc_type_resolution.c
@@ -358,20 +358,51 @@ PyUFunc_SimpleBinaryComparisonTypeResolver(PyUFuncObject *ufunc,
}
if (type_tup == NULL) {
+ if (PyArray_ISDATETIME(operands[0])
+ && PyArray_ISDATETIME(operands[1])
+ && type_num1 != type_num2) {
+ /*
+ * Reject mixed datetime and timedelta explicitly; this was always
+ * implicitly rejected because casting fails (except with
+ * `casting="unsafe"` admittedly).
+ * This is required to ensure that `==` and `!=` can correctly
+ * detect that they should return a result array of False/True.
+ */
+ return raise_binary_type_reso_error(ufunc, operands);
+ }
/*
- * DEPRECATED NumPy 1.20, 2020-12.
- * This check is required to avoid the FutureWarning that
- * ResultType will give for number->string promotions.
+ * This check is required to avoid a potential FutureWarning that
+ * ResultType would give for number->string promotions.
* (We never supported flexible dtypes here.)
*/
- if (!PyArray_ISFLEXIBLE(operands[0]) &&
+ else if (!PyArray_ISFLEXIBLE(operands[0]) &&
!PyArray_ISFLEXIBLE(operands[1])) {
out_dtypes[0] = PyArray_ResultType(2, operands, 0, NULL);
if (out_dtypes[0] == NULL) {
return -1;
}
- out_dtypes[1] = out_dtypes[0];
- Py_INCREF(out_dtypes[1]);
+ if (PyArray_ISINTEGER(operands[0])
+ && PyArray_ISINTEGER(operands[1])
+ && !PyDataType_ISINTEGER(out_dtypes[0])) {
+ /*
+ * NumPy promotion allows uint+int to go to float, avoid it
+ * (input must have been a mix of signed and unsigned)
+ */
+ if (PyArray_ISSIGNED(operands[0])) {
+ Py_SETREF(out_dtypes[0], PyArray_DescrFromType(NPY_LONGLONG));
+ out_dtypes[1] = PyArray_DescrFromType(NPY_ULONGLONG);
+ Py_INCREF(out_dtypes[1]);
+ }
+ else {
+ Py_SETREF(out_dtypes[0], PyArray_DescrFromType(NPY_ULONGLONG));
+ out_dtypes[1] = PyArray_DescrFromType(NPY_LONGLONG);
+ Py_INCREF(out_dtypes[1]);
+ }
+ }
+ else {
+ out_dtypes[1] = out_dtypes[0];
+ Py_INCREF(out_dtypes[1]);
+ }
}
else {
/* Not doing anything will lead to a "no loop found" error. */
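
A hedged illustration of the effect of the mixed signed/unsigned branch above: resolving the inputs to (long long, unsigned long long) instead of a common float64 keeps the comparison exact, assuming the corresponding LONGLONG/ULONGLONG comparison loops added elsewhere in this change get selected.

    >>> import numpy as np
    >>> i = np.array([2**63 - 1], dtype=np.int64)   # INT64_MAX
    >>> u = np.array([2**63], dtype=np.uint64)      # INT64_MAX + 1
    >>> i == u        # both round to the same float64, yet compare unequal here
    array([False])
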
@@ -382,64 +413,13 @@ PyUFunc_SimpleBinaryComparisonTypeResolver(PyUFuncObject *ufunc,
}
}
else {
- PyArray_Descr *descr;
- /*
- * DEPRECATED 2021-03, NumPy 1.20
- *
- * If the type tuple was originally a single element (probably),
- * issue a deprecation warning, but otherwise accept it. Since the
- * result dtype is always boolean, this is not actually valid unless it
- * is `object` (but if there is an object input we already deferred).
- *
- * TODO: Once this deprecation is gone, the special case for
- * `PyUFunc_SimpleBinaryComparisonTypeResolver` in dispatching.c
- * can be removed.
- */
- if (PyTuple_Check(type_tup) && PyTuple_GET_SIZE(type_tup) == 3 &&
- PyTuple_GET_ITEM(type_tup, 0) == Py_None &&
- PyTuple_GET_ITEM(type_tup, 1) == Py_None &&
- PyArray_DescrCheck(PyTuple_GET_ITEM(type_tup, 2))) {
- descr = (PyArray_Descr *)PyTuple_GET_ITEM(type_tup, 2);
- if (descr->type_num == NPY_OBJECT) {
- if (DEPRECATE_FUTUREWARNING(
- "using `dtype=object` (or equivalent signature) will "
- "return object arrays in the future also when the "
- "inputs do not already have `object` dtype.") < 0) {
- return -1;
- }
- }
- else if (descr->type_num != NPY_BOOL) {
- if (DEPRECATE(
- "using `dtype=` in comparisons is only useful for "
- "`dtype=object` (and will do nothing for bool). "
- "This operation will fail in the future.") < 0) {
- return -1;
- }
- }
- }
- else {
- /* Usually a failure, but let the default version handle it */
- return PyUFunc_DefaultTypeResolver(ufunc, casting,
- operands, type_tup, out_dtypes);
- }
-
- out_dtypes[0] = NPY_DT_CALL_ensure_canonical(descr);
- if (out_dtypes[0] == NULL) {
- return -1;
- }
- out_dtypes[1] = out_dtypes[0];
- Py_INCREF(out_dtypes[1]);
+ /* Usually a failure, but let the default version handle it */
+ return PyUFunc_DefaultTypeResolver(ufunc, casting,
+ operands, type_tup, out_dtypes);
}
- /* Output type is always boolean */
+ /* Output type is always boolean (cannot fail for builtins) */
out_dtypes[2] = PyArray_DescrFromType(NPY_BOOL);
- if (out_dtypes[2] == NULL) {
- for (i = 0; i < 2; ++i) {
- Py_DECREF(out_dtypes[i]);
- out_dtypes[i] = NULL;
- }
- return -1;
- }
/* Check against the casting rules */
if (PyUFunc_ValidateCasting(ufunc, casting, operands, out_dtypes) < 0) {
@@ -694,7 +674,7 @@ PyUFunc_IsNaTTypeResolver(PyUFuncObject *ufunc,
{
if (!PyTypeNum_ISDATETIME(PyArray_DESCR(operands[0])->type_num)) {
PyErr_SetString(PyExc_TypeError,
- "ufunc 'isnat' is only defined for datetime and timedelta.");
+ "ufunc 'isnat' is only defined for np.datetime64 and np.timedelta64.");
return -1;
}
diff --git a/numpy/core/src/umath/wrapping_array_method.c b/numpy/core/src/umath/wrapping_array_method.c
index 9f8f036e8..64fea7aeb 100644
--- a/numpy/core/src/umath/wrapping_array_method.c
+++ b/numpy/core/src/umath/wrapping_array_method.c
@@ -177,6 +177,39 @@ wrapping_method_get_loop(
}
+/*
+ * Wraps the original identity (reduction initial) function; it needs to translate
+ * the descriptors back to the original ones and provide an "original" context
+ * (just as `get_loop` does).
+ * We assume again that translating the descriptors is quick.
+ */
+static int
+wrapping_method_get_identity_function(
+ PyArrayMethod_Context *context, npy_bool reduction_is_empty,
+ char *item)
+{
+ /* Copy the context, and replace descriptors: */
+ PyArrayMethod_Context orig_context = *context;
+ PyArray_Descr *orig_descrs[NPY_MAXARGS];
+ orig_context.descriptors = orig_descrs;
+ orig_context.method = context->method->wrapped_meth;
+
+ int nin = context->method->nin, nout = context->method->nout;
+ PyArray_DTypeMeta **dtypes = context->method->wrapped_dtypes;
+
+ if (context->method->translate_given_descrs(
+ nin, nout, dtypes, context->descriptors, orig_descrs) < 0) {
+ return -1;
+ }
+ int res = context->method->wrapped_meth->get_reduction_initial(
+ &orig_context, reduction_is_empty, item);
+ for (int i = 0; i < nin + nout; i++) {
+ Py_DECREF(orig_descrs[i]);
+ }
+ return res;
+}
+
+
/**
* Allows creating a fairly lightweight wrapper around an existing ufunc
* loop. The idea is mainly for units, as this is currently slightly limited
@@ -235,14 +268,17 @@ PyUFunc_AddWrappingLoop(PyObject *ufunc_obj,
break;
}
if (wrapped_meth == NULL) {
- PyErr_SetString(PyExc_TypeError,
- "Did not find the to-be-wrapped loop in the ufunc.");
+ PyErr_Format(PyExc_TypeError,
+ "Did not find the to-be-wrapped loop in the ufunc with given "
+ "DTypes. Received wrapping types: %S", wrapped_dt_tuple);
goto finish;
}
PyType_Slot slots[] = {
{NPY_METH_resolve_descriptors, &wrapping_method_resolve_descriptors},
- {NPY_METH_get_loop, &wrapping_method_get_loop},
+ {_NPY_METH_get_loop, &wrapping_method_get_loop},
+ {NPY_METH_get_reduction_initial,
+ &wrapping_method_get_identity_function},
{0, NULL}
};