summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Julian Taylor <jtaylor.debian@googlemail.com> 2013-05-19 17:04:27 +0200
committer: Julian Taylor <jtaylor.debian@googlemail.com> 2013-05-25 17:36:00 +0200
commit: 0adccaaa910ab495e993f453956fd983775604f3 (patch)
tree: 575e6b1bc7066bbe24ade1fee8576e4e31f2f7ef
parent: 8ff5e37bff03925da4c1b121b38188f9fd779b4d (diff)
download: numpy-0adccaaa910ab495e993f453956fd983775604f3.tar.gz
ENH: vectorize sqrt ufunc using SSE2
Specialize the sqrt ufunc for float and double and vectorize it using SSE2. This improves performance by a factor of 4/2 for float/double when the operation is not memory bound due to non-cached data. Performance is always better on all tested machines (AMD Phenom X2, Intel Xeon 5xxx/7xxx, Core 2 Duo, Core i7). This version will not set errno on invalid input, but numpy only checks the FPU flags, so the behavior is the same. In principle the compiler could autovectorize the loop when -ffast-math is set (so that errno need not be set), the loop is specialized for the vectorizable strides, and it is given some hints (restrict, __builtin_assume_aligned, etc.), but it's simpler and more reliable to simply vectorize it by hand.
-rw-r--r-- numpy/core/code_generators/generate_umath.py 2
-rw-r--r-- numpy/core/setup.py 5
-rw-r--r-- numpy/core/setup_common.py 4
-rw-r--r-- numpy/core/src/private/lowlevel_strided_loops.h 46
-rw-r--r-- numpy/core/src/scalarmathmodule.c.src 10
-rw-r--r-- numpy/core/src/umath/loops.c.src 81
-rw-r--r-- numpy/core/src/umath/loops.h 356
-rw-r--r-- numpy/core/src/umath/loops.h.src 6
-rw-r--r-- numpy/core/tests/test_umath.py 16
-rw-r--r-- numpy/testing/utils.py 67
10 files changed, 417 insertions, 176 deletions
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index ad711f888..1bc22d777 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -207,6 +207,7 @@ cmplx = 'FDG'
cmplxO = cmplx + O
cmplxP = cmplx + P
inexact = flts + cmplx
+inexactvec = 'fd'
noint = inexact+O
nointP = inexact+P
allP = bints+times+flts+cmplxP
@@ -686,6 +687,7 @@ defdict = {
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.sqrt'),
None,
+ TD(inexactvec),
TD(inexact, f='sqrt', astype={'e':'f'}),
TD(P, f='sqrt'),
),
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 5a1e6cf6e..a6ddfb843 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -161,6 +161,11 @@ def check_math_capabilities(config, moredefs, mathlibs):
check_funcs(OPTIONAL_STDFUNCS)
+
+ for h in OPTIONAL_HEADERS:
+ if config.check_func("", decl=False, call=False, headers=[h]):
+ moredefs.append((fname2def(h).replace(".", "_"), 1))
+
for f, args in OPTIONAL_INTRINSICS:
if config.check_func(f, decl=False, call=True, call_args=args):
moredefs.append((fname2def(f), 1))
diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py
index e778e507b..847c4a624 100644
--- a/numpy/core/setup_common.py
+++ b/numpy/core/setup_common.py
@@ -97,6 +97,10 @@ OPTIONAL_STDFUNCS = ["expm1", "log1p", "acosh", "asinh", "atanh",
"rint", "trunc", "exp2", "log2", "hypot", "atan2", "pow",
"copysign", "nextafter"]
+
+OPTIONAL_HEADERS = ["emmintrin.h" # SSE2, amd64 only
+]
+
# optional gcc compiler builtins and their call arguments
# call arguments are required as the compiler will do strict signature checking
OPTIONAL_INTRINSICS = [("__builtin_isnan", '5.'),
diff --git a/numpy/core/src/private/lowlevel_strided_loops.h b/numpy/core/src/private/lowlevel_strided_loops.h
index c9fd1248f..742882a92 100644
--- a/numpy/core/src/private/lowlevel_strided_loops.h
+++ b/numpy/core/src/private/lowlevel_strided_loops.h
@@ -396,6 +396,52 @@ PyArray_PrepareThreeRawArrayIter(int ndim, npy_intp *shape,
char **out_dataB, npy_intp *out_stridesB,
char **out_dataC, npy_intp *out_stridesC);
+
+/*
+ * Return the number of leading elements (each 'esize' bytes, at most
+ * 'nvals') that must be handled one at a time, starting at 'addr', before
+ * the address is aligned to 'alignment' bytes (a power of two).
+ * See npy_blocked_end for an example.
+ */
+static NPY_INLINE npy_intp
+npy_aligned_block_offset(const void * addr, const npy_intp esize,
+ const npy_intp alignment, const npy_intp nvals)
+{
+ const npy_intp offset = (npy_intp)addr & (alignment - 1);
+ npy_intp peel = offset ? (alignment - offset) / esize : 0;
+ peel = nvals < peel ? nvals : peel;
+ return peel;
+}
+
+/*
+ * Return the upper loop bound for the vectorized part of a loop over
+ * 'nvals' elements of 'esize' bytes each, after 'offset' elements have
+ * been peeled, when processing blocks of 'vsz' bytes at a time.
+ *
+ * example usage:
+ * npy_intp i, n = 101;
+ * double v[101];
+ * npy_intp esize = sizeof(v[0]);
+ * npy_intp peel = npy_aligned_block_offset(v, esize, 16, n);
+ * // peel to alignment 16
+ * for (i = 0; i < peel; i++)
+ * <scalar-op>
+ * // simd vectorized operation
+ * for (; i < npy_blocked_end(peel, esize, 16, n); i += 16 / esize)
+ * <blocked-op>
+ * // handle scalar rest
+ * for(; i < n; i++)
+ * <scalar-op>
+ */
+static NPY_INLINE npy_intp
+npy_blocked_end(const npy_intp offset, const npy_intp esize,
+ const npy_intp vsz, const npy_intp nvals)
+{
+ return nvals - offset - (nvals - offset) % (vsz / esize);
+}
+
+
+
/* Start raw iteration */
#define NPY_RAW_ITER_START(idim, ndim, coord, shape) \
memset((coord), 0, (ndim) * sizeof(coord[0])); \
diff --git a/numpy/core/src/scalarmathmodule.c.src b/numpy/core/src/scalarmathmodule.c.src
index b87d9b405..47ad9de61 100644
--- a/numpy/core/src/scalarmathmodule.c.src
+++ b/numpy/core/src/scalarmathmodule.c.src
@@ -1766,8 +1766,14 @@ get_functions(void)
}
funcdata = ((PyUFuncObject *)obj)->data;
signatures = ((PyUFuncObject *)obj)->types;
- i = 0;
- j = 0;
+ /*
+ * The sqrt ufunc is specialized for float and double loops in
+ * generate_umath.py; the first two loops are FLOAT_sqrt/DOUBLE_sqrt.
+ * They have the same signature as the scalar variants, so we need to
+ * skip over them.
+ */
+ i = 4;
+ j = 2;
while (signatures[i] != NPY_FLOAT) {
i += 2; j++;
}
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 040486a32..e307faa46 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -15,10 +15,16 @@
#include "numpy/ufuncobject.h"
#include "numpy/npy_math.h"
#include "numpy/halffloat.h"
+#include "lowlevel_strided_loops.h"
#include "npy_pycompat.h"
#include "ufunc_object.h"
+#include <assert.h>
+
+#ifdef HAVE_EMMINTRIN_H
+#include <emmintrin.h>
+#endif
/*
@@ -27,10 +33,25 @@
*****************************************************************************
*/
+
+static NPY_INLINE int npy_is_aligned(const void * p, const npy_intp alignment)
+{
+ return ((npy_intp)(p) & ((alignment) - 1)) == 0;
+}
+
#define IS_BINARY_REDUCE ((args[0] == args[2])\
&& (steps[0] == steps[2])\
&& (steps[0] == 0))
+/*
+ * stride is equal to element size, input and output are element-aligned
+ * and are either identical or do not overlap within one vector register
+ */
+#define IS_BLOCKABLE_UNARY(esize, vsize) \
+ (steps[0] == (esize) && steps[0] == steps[1] && \
+ (npy_is_aligned(args[0], esize) && npy_is_aligned(args[1], esize)) && \
+ ((abs(args[1] - args[0]) >= (vsize)) || ((abs(args[1] - args[0]) == 0))))
+
#define OUTPUT_LOOP\
char *op1 = args[1];\
npy_intp os1 = steps[1];\
@@ -52,6 +73,22 @@
npy_intp i;\
for(i = 0; i < n; i++, ip1 += is1, op1 += os1, op2 += os2)
+/* align output to alignment
+ * store alignment is usually more important than load alignment */
+#define UNARY_LOOP_BLOCK_ALIGN_OUT(type, alignment)\
+ type *ip1 = (type *)args[0], *op1 = (type *)args[1];\
+ npy_intp n = dimensions[0];\
+ npy_intp i, peel = npy_aligned_block_offset(op1, sizeof(type),\
+ alignment, n);\
+ for(i = 0; i < peel; i++)
+
+#define UNARY_LOOP_BLOCKED(type, vsize)\
+ for(; i < npy_blocked_end(peel, sizeof(type), vsize, n);\
+ i += (vsize / sizeof(type)))
+
+#define UNARY_LOOP_BLOCKED_END\
+ for (; i < n; i++)
+
#define BINARY_LOOP\
char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];\
npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2];\
@@ -1268,6 +1305,50 @@ TIMEDELTA_mm_d_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *
/**begin repeat
+ * #type = npy_float, npy_double#
+ * #TYPE = FLOAT, DOUBLE#
+ * #scalarf = npy_sqrtf, npy_sqrt#
+ * #vtype = __m128, __m128d#
+ * #vsuf = ps, pd#
+ */
+NPY_NO_EXPORT void
+@TYPE@_sqrt(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+#ifdef HAVE_EMMINTRIN_H
+ if (IS_BLOCKABLE_UNARY(sizeof(@type@), 16)) {
+ UNARY_LOOP_BLOCK_ALIGN_OUT(@type@, 16) {
+ op1[i] = @scalarf@(ip1[i]);
+ }
+ assert(npy_is_aligned(&op1[i], 16));
+ if (npy_is_aligned(&ip1[i], 16)) {
+ UNARY_LOOP_BLOCKED(@type@, 16) {
+ @vtype@ d = _mm_load_@vsuf@(&ip1[i]);
+ _mm_store_@vsuf@(&op1[i], _mm_sqrt_@vsuf@(d));
+ }
+ }
+ else {
+ UNARY_LOOP_BLOCKED(@type@, 16) {
+ @vtype@ d = _mm_loadu_@vsuf@(&ip1[i]);
+ _mm_store_@vsuf@(&op1[i], _mm_sqrt_@vsuf@(d));
+ }
+ }
+ UNARY_LOOP_BLOCKED_END {
+ op1[i] = @scalarf@(ip1[i]);
+ }
+ }
+ else
+#endif
+ {
+ UNARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ *(@type@ *)op1 = @scalarf@(in1);
+ }
+ }
+}
+/**end repeat**/
+
+
+/**begin repeat
* Float types
* #type = npy_float, npy_double, npy_longdouble#
* #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
diff --git a/numpy/core/src/umath/loops.h b/numpy/core/src/umath/loops.h
index c2123459b..7f49f1956 100644
--- a/numpy/core/src/umath/loops.h
+++ b/numpy/core/src/umath/loops.h
@@ -1541,56 +1541,64 @@ ULONGLONG_remainder(char **args, npy_intp *dimensions, npy_intp *steps, void *NP
*****************************************************************************
*/
+#line 187
+NPY_NO_EXPORT void
+FLOAT_sqrt(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+#line 187
+NPY_NO_EXPORT void
+DOUBLE_sqrt(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
-#line 191
+#line 197
-#line 198
+#line 204
NPY_NO_EXPORT void
HALF_add(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 198
+#line 204
NPY_NO_EXPORT void
HALF_subtract(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 198
+#line 204
NPY_NO_EXPORT void
HALF_multiply(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 198
+#line 204
NPY_NO_EXPORT void
HALF_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 207
+#line 213
NPY_NO_EXPORT void
HALF_equal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 207
+#line 213
NPY_NO_EXPORT void
HALF_not_equal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 207
+#line 213
NPY_NO_EXPORT void
HALF_less(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 207
+#line 213
NPY_NO_EXPORT void
HALF_less_equal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 207
+#line 213
NPY_NO_EXPORT void
HALF_greater(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 207
+#line 213
NPY_NO_EXPORT void
HALF_greater_equal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 207
+#line 213
NPY_NO_EXPORT void
HALF_logical_and(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 207
+#line 213
NPY_NO_EXPORT void
HALF_logical_or(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
@@ -1601,49 +1609,49 @@ HALF_logical_xor(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_U
NPY_NO_EXPORT void
HALF_logical_not(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 221
+#line 227
NPY_NO_EXPORT void
HALF_isnan(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 221
+#line 227
NPY_NO_EXPORT void
HALF_isinf(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 221
+#line 227
NPY_NO_EXPORT void
HALF_isfinite(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 221
+#line 227
NPY_NO_EXPORT void
HALF_signbit(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 221
+#line 227
NPY_NO_EXPORT void
HALF_copysign(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 221
+#line 227
NPY_NO_EXPORT void
HALF_nextafter(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 221
+#line 227
NPY_NO_EXPORT void
HALF_spacing(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 229
+#line 235
NPY_NO_EXPORT void
HALF_maximum(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 229
+#line 235
NPY_NO_EXPORT void
HALF_minimum(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 237
+#line 243
NPY_NO_EXPORT void
HALF_fmax(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 237
+#line 243
NPY_NO_EXPORT void
HALF_fmin(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
@@ -1696,55 +1704,55 @@ HALF_ldexp_long(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UN
#define HALF_true_divide HALF_divide
-#line 191
+#line 197
-#line 198
+#line 204
NPY_NO_EXPORT void
FLOAT_add(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 198
+#line 204
NPY_NO_EXPORT void
FLOAT_subtract(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 198
+#line 204
NPY_NO_EXPORT void
FLOAT_multiply(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 198
+#line 204
NPY_NO_EXPORT void
FLOAT_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 207
+#line 213
NPY_NO_EXPORT void
FLOAT_equal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 207
+#line 213
NPY_NO_EXPORT void
FLOAT_not_equal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 207
+#line 213
NPY_NO_EXPORT void
FLOAT_less(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 207
+#line 213
NPY_NO_EXPORT void
FLOAT_less_equal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 207
+#line 213
NPY_NO_EXPORT void
FLOAT_greater(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 207
+#line 213
NPY_NO_EXPORT void
FLOAT_greater_equal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 207
+#line 213
NPY_NO_EXPORT void
FLOAT_logical_and(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 207
+#line 213
NPY_NO_EXPORT void
FLOAT_logical_or(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
@@ -1755,49 +1763,49 @@ FLOAT_logical_xor(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_
NPY_NO_EXPORT void
FLOAT_logical_not(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 221
+#line 227
NPY_NO_EXPORT void
FLOAT_isnan(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 221
+#line 227
NPY_NO_EXPORT void
FLOAT_isinf(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 221
+#line 227
NPY_NO_EXPORT void
FLOAT_isfinite(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 221
+#line 227
NPY_NO_EXPORT void
FLOAT_signbit(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 221
+#line 227
NPY_NO_EXPORT void
FLOAT_copysign(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 221
+#line 227
NPY_NO_EXPORT void
FLOAT_nextafter(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 221
+#line 227
NPY_NO_EXPORT void
FLOAT_spacing(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 229
+#line 235
NPY_NO_EXPORT void
FLOAT_maximum(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 229
+#line 235
NPY_NO_EXPORT void
FLOAT_minimum(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 237
+#line 243
NPY_NO_EXPORT void
FLOAT_fmax(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 237
+#line 243
NPY_NO_EXPORT void
FLOAT_fmin(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
@@ -1850,55 +1858,55 @@ FLOAT_ldexp_long(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_U
#define FLOAT_true_divide FLOAT_divide
-#line 191
+#line 197
-#line 198
+#line 204
NPY_NO_EXPORT void
DOUBLE_add(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 198
+#line 204
NPY_NO_EXPORT void
DOUBLE_subtract(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 198
+#line 204
NPY_NO_EXPORT void
DOUBLE_multiply(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 198
+#line 204
NPY_NO_EXPORT void
DOUBLE_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 207
+#line 213
NPY_NO_EXPORT void
DOUBLE_equal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 207
+#line 213
NPY_NO_EXPORT void
DOUBLE_not_equal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 207
+#line 213
NPY_NO_EXPORT void
DOUBLE_less(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 207
+#line 213
NPY_NO_EXPORT void
DOUBLE_less_equal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 207
+#line 213
NPY_NO_EXPORT void
DOUBLE_greater(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 207
+#line 213
NPY_NO_EXPORT void
DOUBLE_greater_equal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 207
+#line 213
NPY_NO_EXPORT void
DOUBLE_logical_and(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 207
+#line 213
NPY_NO_EXPORT void
DOUBLE_logical_or(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
@@ -1909,49 +1917,49 @@ DOUBLE_logical_xor(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY
NPY_NO_EXPORT void
DOUBLE_logical_not(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 221
+#line 227
NPY_NO_EXPORT void
DOUBLE_isnan(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 221
+#line 227
NPY_NO_EXPORT void
DOUBLE_isinf(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 221
+#line 227
NPY_NO_EXPORT void
DOUBLE_isfinite(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 221
+#line 227
NPY_NO_EXPORT void
DOUBLE_signbit(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 221
+#line 227
NPY_NO_EXPORT void
DOUBLE_copysign(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 221
+#line 227
NPY_NO_EXPORT void
DOUBLE_nextafter(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 221
+#line 227
NPY_NO_EXPORT void
DOUBLE_spacing(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 229
+#line 235
NPY_NO_EXPORT void
DOUBLE_maximum(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 229
+#line 235
NPY_NO_EXPORT void
DOUBLE_minimum(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 237
+#line 243
NPY_NO_EXPORT void
DOUBLE_fmax(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 237
+#line 243
NPY_NO_EXPORT void
DOUBLE_fmin(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
@@ -2004,55 +2012,55 @@ DOUBLE_ldexp_long(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_
#define DOUBLE_true_divide DOUBLE_divide
-#line 191
+#line 197
-#line 198
+#line 204
NPY_NO_EXPORT void
LONGDOUBLE_add(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 198
+#line 204
NPY_NO_EXPORT void
LONGDOUBLE_subtract(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 198
+#line 204
NPY_NO_EXPORT void
LONGDOUBLE_multiply(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 198
+#line 204
NPY_NO_EXPORT void
LONGDOUBLE_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 207
+#line 213
NPY_NO_EXPORT void
LONGDOUBLE_equal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 207
+#line 213
NPY_NO_EXPORT void
LONGDOUBLE_not_equal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 207
+#line 213
NPY_NO_EXPORT void
LONGDOUBLE_less(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 207
+#line 213
NPY_NO_EXPORT void
LONGDOUBLE_less_equal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 207
+#line 213
NPY_NO_EXPORT void
LONGDOUBLE_greater(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 207
+#line 213
NPY_NO_EXPORT void
LONGDOUBLE_greater_equal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 207
+#line 213
NPY_NO_EXPORT void
LONGDOUBLE_logical_and(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 207
+#line 213
NPY_NO_EXPORT void
LONGDOUBLE_logical_or(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
@@ -2063,49 +2071,49 @@ LONGDOUBLE_logical_xor(char **args, npy_intp *dimensions, npy_intp *steps, void
NPY_NO_EXPORT void
LONGDOUBLE_logical_not(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 221
+#line 227
NPY_NO_EXPORT void
LONGDOUBLE_isnan(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 221
+#line 227
NPY_NO_EXPORT void
LONGDOUBLE_isinf(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 221
+#line 227
NPY_NO_EXPORT void
LONGDOUBLE_isfinite(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 221
+#line 227
NPY_NO_EXPORT void
LONGDOUBLE_signbit(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 221
+#line 227
NPY_NO_EXPORT void
LONGDOUBLE_copysign(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 221
+#line 227
NPY_NO_EXPORT void
LONGDOUBLE_nextafter(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 221
+#line 227
NPY_NO_EXPORT void
LONGDOUBLE_spacing(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 229
+#line 235
NPY_NO_EXPORT void
LONGDOUBLE_maximum(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 229
+#line 235
NPY_NO_EXPORT void
LONGDOUBLE_minimum(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 237
+#line 243
NPY_NO_EXPORT void
LONGDOUBLE_fmax(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 237
+#line 243
NPY_NO_EXPORT void
LONGDOUBLE_fmin(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
@@ -2173,14 +2181,14 @@ LONGDOUBLE_ldexp_long(char **args, npy_intp *dimensions, npy_intp *steps, void *
#define CEQ(xr,xi,yr,yi) (xr == yr && xi == yi);
#define CNE(xr,xi,yr,yi) (xr != yr || xi != yi);
-#line 310
-
#line 316
+
+#line 322
NPY_NO_EXPORT void
CFLOAT_add(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 316
+#line 322
NPY_NO_EXPORT void
CFLOAT_subtract(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
@@ -2195,36 +2203,36 @@ CFLOAT_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUS
NPY_NO_EXPORT void
CFLOAT_floor_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 334
+#line 340
NPY_NO_EXPORT void
CFLOAT_greater(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 334
+#line 340
NPY_NO_EXPORT void
CFLOAT_greater_equal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 334
+#line 340
NPY_NO_EXPORT void
CFLOAT_less(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 334
+#line 340
NPY_NO_EXPORT void
CFLOAT_less_equal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 334
+#line 340
NPY_NO_EXPORT void
CFLOAT_equal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 334
+#line 340
NPY_NO_EXPORT void
CFLOAT_not_equal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 343
+#line 349
NPY_NO_EXPORT void
CFLOAT_logical_and(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 343
+#line 349
NPY_NO_EXPORT void
CFLOAT_logical_or(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
@@ -2234,15 +2242,15 @@ CFLOAT_logical_xor(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY
NPY_NO_EXPORT void
CFLOAT_logical_not(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 357
+#line 363
NPY_NO_EXPORT void
CFLOAT_isnan(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 357
+#line 363
NPY_NO_EXPORT void
CFLOAT_isinf(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 357
+#line 363
NPY_NO_EXPORT void
CFLOAT_isfinite(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
@@ -2268,20 +2276,20 @@ CFLOAT__arg(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED
NPY_NO_EXPORT void
CFLOAT_sign(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 386
+#line 392
NPY_NO_EXPORT void
CFLOAT_maximum(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 386
+#line 392
NPY_NO_EXPORT void
CFLOAT_minimum(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 394
+#line 400
NPY_NO_EXPORT void
CFLOAT_fmax(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 394
+#line 400
NPY_NO_EXPORT void
CFLOAT_fmin(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
@@ -2289,14 +2297,14 @@ CFLOAT_fmin(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED
#define CFLOAT_true_divide CFLOAT_divide
-#line 310
-
#line 316
+
+#line 322
NPY_NO_EXPORT void
CDOUBLE_add(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 316
+#line 322
NPY_NO_EXPORT void
CDOUBLE_subtract(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
@@ -2311,36 +2319,36 @@ CDOUBLE_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNU
NPY_NO_EXPORT void
CDOUBLE_floor_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 334
+#line 340
NPY_NO_EXPORT void
CDOUBLE_greater(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 334
+#line 340
NPY_NO_EXPORT void
CDOUBLE_greater_equal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 334
+#line 340
NPY_NO_EXPORT void
CDOUBLE_less(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 334
+#line 340
NPY_NO_EXPORT void
CDOUBLE_less_equal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 334
+#line 340
NPY_NO_EXPORT void
CDOUBLE_equal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 334
+#line 340
NPY_NO_EXPORT void
CDOUBLE_not_equal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 343
+#line 349
NPY_NO_EXPORT void
CDOUBLE_logical_and(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 343
+#line 349
NPY_NO_EXPORT void
CDOUBLE_logical_or(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
@@ -2350,15 +2358,15 @@ CDOUBLE_logical_xor(char **args, npy_intp *dimensions, npy_intp *steps, void *NP
NPY_NO_EXPORT void
CDOUBLE_logical_not(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 357
+#line 363
NPY_NO_EXPORT void
CDOUBLE_isnan(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 357
+#line 363
NPY_NO_EXPORT void
CDOUBLE_isinf(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 357
+#line 363
NPY_NO_EXPORT void
CDOUBLE_isfinite(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
@@ -2384,20 +2392,20 @@ CDOUBLE__arg(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSE
NPY_NO_EXPORT void
CDOUBLE_sign(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 386
+#line 392
NPY_NO_EXPORT void
CDOUBLE_maximum(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 386
+#line 392
NPY_NO_EXPORT void
CDOUBLE_minimum(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 394
+#line 400
NPY_NO_EXPORT void
CDOUBLE_fmax(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 394
+#line 400
NPY_NO_EXPORT void
CDOUBLE_fmin(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
@@ -2405,14 +2413,14 @@ CDOUBLE_fmin(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSE
#define CDOUBLE_true_divide CDOUBLE_divide
-#line 310
-
#line 316
+
+#line 322
NPY_NO_EXPORT void
CLONGDOUBLE_add(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 316
+#line 322
NPY_NO_EXPORT void
CLONGDOUBLE_subtract(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
@@ -2427,36 +2435,36 @@ CLONGDOUBLE_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY
NPY_NO_EXPORT void
CLONGDOUBLE_floor_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 334
+#line 340
NPY_NO_EXPORT void
CLONGDOUBLE_greater(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 334
+#line 340
NPY_NO_EXPORT void
CLONGDOUBLE_greater_equal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 334
+#line 340
NPY_NO_EXPORT void
CLONGDOUBLE_less(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 334
+#line 340
NPY_NO_EXPORT void
CLONGDOUBLE_less_equal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 334
+#line 340
NPY_NO_EXPORT void
CLONGDOUBLE_equal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 334
+#line 340
NPY_NO_EXPORT void
CLONGDOUBLE_not_equal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 343
+#line 349
NPY_NO_EXPORT void
CLONGDOUBLE_logical_and(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 343
+#line 349
NPY_NO_EXPORT void
CLONGDOUBLE_logical_or(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
@@ -2466,15 +2474,15 @@ CLONGDOUBLE_logical_xor(char **args, npy_intp *dimensions, npy_intp *steps, void
NPY_NO_EXPORT void
CLONGDOUBLE_logical_not(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 357
+#line 363
NPY_NO_EXPORT void
CLONGDOUBLE_isnan(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 357
+#line 363
NPY_NO_EXPORT void
CLONGDOUBLE_isinf(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 357
+#line 363
NPY_NO_EXPORT void
CLONGDOUBLE_isfinite(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
@@ -2500,20 +2508,20 @@ CLONGDOUBLE__arg(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_U
NPY_NO_EXPORT void
CLONGDOUBLE_sign(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 386
+#line 392
NPY_NO_EXPORT void
CLONGDOUBLE_maximum(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 386
+#line 392
NPY_NO_EXPORT void
CLONGDOUBLE_minimum(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 394
+#line 400
NPY_NO_EXPORT void
CLONGDOUBLE_fmax(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 394
+#line 400
NPY_NO_EXPORT void
CLONGDOUBLE_fmin(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
@@ -2544,81 +2552,81 @@ TIMEDELTA_absolute(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY
NPY_NO_EXPORT void
TIMEDELTA_sign(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 427
+#line 433
NPY_NO_EXPORT void
DATETIME__ones_like(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data));
-#line 435
+#line 441
NPY_NO_EXPORT void
DATETIME_equal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 435
+#line 441
NPY_NO_EXPORT void
DATETIME_not_equal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 435
+#line 441
NPY_NO_EXPORT void
DATETIME_greater(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 435
+#line 441
NPY_NO_EXPORT void
DATETIME_greater_equal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 435
+#line 441
NPY_NO_EXPORT void
DATETIME_less(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 435
+#line 441
NPY_NO_EXPORT void
DATETIME_less_equal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 443
+#line 449
NPY_NO_EXPORT void
DATETIME_maximum(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 443
+#line 449
NPY_NO_EXPORT void
DATETIME_minimum(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 427
+#line 433
NPY_NO_EXPORT void
TIMEDELTA__ones_like(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data));
-#line 435
+#line 441
NPY_NO_EXPORT void
TIMEDELTA_equal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 435
+#line 441
NPY_NO_EXPORT void
TIMEDELTA_not_equal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 435
+#line 441
NPY_NO_EXPORT void
TIMEDELTA_greater(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 435
+#line 441
NPY_NO_EXPORT void
TIMEDELTA_greater_equal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 435
+#line 441
NPY_NO_EXPORT void
TIMEDELTA_less(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 435
+#line 441
NPY_NO_EXPORT void
TIMEDELTA_less_equal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 443
+#line 449
NPY_NO_EXPORT void
TIMEDELTA_maximum(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 443
+#line 449
NPY_NO_EXPORT void
TIMEDELTA_minimum(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
@@ -2683,27 +2691,27 @@ TIMEDELTA_mm_d_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *
*****************************************************************************
*/
-#line 511
+#line 517
NPY_NO_EXPORT void
OBJECT_equal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 511
+#line 517
NPY_NO_EXPORT void
OBJECT_not_equal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 511
+#line 517
NPY_NO_EXPORT void
OBJECT_greater(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 511
+#line 517
NPY_NO_EXPORT void
OBJECT_greater_equal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 511
+#line 517
NPY_NO_EXPORT void
OBJECT_less(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
-#line 511
+#line 517
NPY_NO_EXPORT void
OBJECT_less_equal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 87b47a754..a8a58c5de 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -181,6 +181,12 @@ U@TYPE@_remainder(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_
*****************************************************************************
*/
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE#
+ */
+NPY_NO_EXPORT void
+@TYPE@_sqrt(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+/**end repeat**/
/**begin repeat
* Float types
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index ea92ce7de..1967db547 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -4,6 +4,7 @@ import sys
import platform
from numpy.testing import *
+from numpy.testing.utils import gen_alignment_data
import numpy.core.umath as ncu
import numpy as np
@@ -100,6 +101,21 @@ class TestPower(TestCase):
assert_almost_equal(x**(-1), [1., 0.5, 1./3])
assert_almost_equal(x**(0.5), [1., ncu.sqrt(2), ncu.sqrt(3)])
+ for out, inp, msg in gen_alignment_data(dtype=np.float32,
+ type='unary'):
+ exp = [ncu.sqrt(i) for i in inp]
+ assert_almost_equal(inp**(0.5), exp, err_msg=msg)
+ np.sqrt(inp, out=out)
+ assert_equal(out, exp, err_msg=msg)
+
+ for out, inp, msg in gen_alignment_data(dtype=np.float64,
+ type='unary'):
+ exp = [ncu.sqrt(i) for i in inp]
+ assert_almost_equal(inp**(0.5), exp, err_msg=msg)
+ np.sqrt(inp, out=out)
+ assert_equal(out, exp, err_msg=msg)
+
+
def test_power_complex(self):
x = np.array([1+2j, 2+3j, 3+4j])
assert_equal(x**0, [1., 1., 1.])
diff --git a/numpy/testing/utils.py b/numpy/testing/utils.py
index 40c569c0f..7a3ea7a1c 100644
--- a/numpy/testing/utils.py
+++ b/numpy/testing/utils.py
@@ -10,6 +10,7 @@ import re
import operator
import warnings
from .nosetester import import_nose
+from numpy.core import float32, empty, arange
if sys.version_info[0] >= 3:
from io import StringIO
@@ -1523,3 +1524,89 @@ def assert_no_warnings(func, *args, **kw):
finally:
ctx.__exit__()
return result
+
+
+def gen_alignment_data(dtype=float32, type='binary', max_size=24):
+    """
+    Generator producing arrays with different alignments and offsets,
+    used to test SIMD-vectorized loops.
+
+    Every array is a slice ``[o:]`` (``o`` in 0, 1, 2) of a freshly
+    allocated buffer of ``s`` elements, in some cases shortened by one
+    element at the front or the back.  Inputs are created with `arange`,
+    out-of-place outputs with `empty`.
+
+    Parameters
+    ----------
+    dtype : dtype
+        data type to produce
+    type : string
+        'unary': create data for unary operations, creates one input
+                 and output array
+        'binary': create data for binary operations, creates two input
+                 and output array
+        any other value yields nothing
+    max_size : integer
+        exclusive upper bound on the buffer size ``s``; at least one
+        size is produced for every offset regardless of this value
+
+    Returns
+    -------
+    if type is 'unary' yields one output, one input array and a message
+    containing information on the data
+    if type is 'binary' yields one output array, two input array and a message
+    containing information on the data
+
+    Notes
+    -----
+    In the 'in place' cases the output is the very same array object as
+    one of the inputs.  The 'aliased' cases use separately allocated
+    arrays with shifted offsets; they do not share memory.
+
+    """
+    ufmt = 'unary offset=(%d, %d), size=%d, dtype=%r, %s'
+    bfmt = 'binary offset=(%d, %d, %d), size=%d, dtype=%r, %s'
+    for o in range(3):
+        for s in range(o + 2, max(o + 3, max_size)):
+            if type == 'unary':
+                inp = lambda: arange(s, dtype=dtype)[o:]
+                out = empty((s,), dtype=dtype)[o:]
+                yield out, inp(), ufmt % (o, o, s, dtype, 'out of place')
+                # in place: output and input must be the same array
+                d = inp()
+                yield d, d, ufmt % (o, o, s, dtype, 'in place')
+                yield out[1:], inp()[:-1], ufmt % \
+                    (o + 1, o, s - 1, dtype, 'out of place')
+                yield out[:-1], inp()[1:], ufmt % \
+                    (o, o + 1, s - 1, dtype, 'out of place')
+                yield inp()[:-1], inp()[1:], ufmt % \
+                    (o, o + 1, s - 1, dtype, 'aliased')
+                yield inp()[1:], inp()[:-1], ufmt % \
+                    (o + 1, o, s - 1, dtype, 'aliased')
+            if type == 'binary':
+                inp1 = lambda: arange(s, dtype=dtype)[o:]
+                inp2 = lambda: arange(s, dtype=dtype)[o:]
+                out = empty((s,), dtype=dtype)[o:]
+                yield out, inp1(), inp2(), bfmt % \
+                    (o, o, o, s, dtype, 'out of place')
+                # in place: the output is the first / second input itself
+                d = inp1()
+                yield d, d, inp2(), bfmt % \
+                    (o, o, o, s, dtype, 'in place1')
+                d = inp2()
+                yield d, inp1(), d, bfmt % \
+                    (o, o, o, s, dtype, 'in place2')
+                yield out[1:], inp1()[:-1], inp2()[:-1], bfmt % \
+                    (o + 1, o, o, s - 1, dtype, 'out of place')
+                # [:-1] drops the last element so all operands keep
+                # equal length ([-1:] would select a single element)
+                yield out[:-1], inp1()[1:], inp2()[:-1], bfmt % \
+                    (o, o + 1, o, s - 1, dtype, 'out of place')
+                yield out[:-1], inp1()[:-1], inp2()[1:], bfmt % \
+                    (o, o, o + 1, s - 1, dtype, 'out of place')
+                yield inp1()[1:], inp1()[:-1], inp2()[:-1], bfmt % \
+                    (o + 1, o, o, s - 1, dtype, 'aliased')
+                yield inp1()[:-1], inp1()[1:], inp2()[:-1], bfmt % \
+                    (o, o + 1, o, s - 1, dtype, 'aliased')
+                yield inp1()[:-1], inp1()[:-1], inp2()[1:], bfmt % \
+                    (o, o, o + 1, s - 1, dtype, 'aliased')