commit    2e2bd93641aa505da8206d16d87271e606bb5192
tree      ecb217a655eb5db209f59d6c41bbdbfccc9b99a3
parent    7571ca733d58d2eef0b8cd5f550cab2959b852a9
parent    37967930e05700af8ee3b66bdb35c5b8a02d17dd
author    Charles Harris <charlesr.harris@gmail.com>  2014-03-05 14:14:29 -0700
committer Charles Harris <charlesr.harris@gmail.com>  2014-03-05 14:14:29 -0700
Merge pull request #4449 from juliantaylor/vectorize-isnan
ENH: vectorize isnan
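
The whole optimization rests on one IEEE-754 fact: NaN is the only value
that compares unequal to itself, so isnan(x) is equivalent to x != x and
can reuse the already-vectorized not_equal kernel. A minimal standalone
sketch of that property (illustration only, not part of the patch):

#include <assert.h>
#include <math.h>

int main(void)
{
    double qnan = nan("");   /* quiet NaN */
    assert(qnan != qnan);    /* NaN compares unequal to itself */
    assert(!(1.5 != 1.5));   /* ordinary values compare equal to themselves */
    assert(isnan(qnan) && !isnan(1.5));
    return 0;
}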
 numpy/core/src/umath/loops.c.src  |  11
 numpy/core/src/umath/simd.inc.src | 101
 numpy/core/tests/test_numeric.py  |  11
 3 files changed, 71 insertions(+), 52 deletions(-)
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 0fa03f343..3f5048592 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1514,13 +1514,18 @@ NPY_NO_EXPORT void
/**begin repeat1
* #kind = isnan, isinf, isfinite, signbit#
* #func = npy_isnan, npy_isinf, npy_isfinite, npy_signbit#
+ * #isnan = 1, 0*3#
**/
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
- UNARY_LOOP {
- const @type@ in1 = *(@type@ *)ip1;
- *((npy_bool *)op1) = @func@(in1) != 0;
+ char * margs[] = {args[0], args[0], args[1]};
+ npy_intp msteps[] = {steps[0], steps[0], steps[1]};
+ if (!@isnan@ || !run_binary_simd_not_equal_@TYPE@(margs, dimensions, msteps)) {
+ UNARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ *((npy_bool *)op1) = @func@(in1) != 0;
+ }
}
}
/**end repeat1**/
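
For readers unfamiliar with the ufunc inner-loop convention: args holds
the input/output pointers and steps the matching byte strides, so a
binary kernel can be pointed at one input twice. A hedged sketch of the
dispatch above with the @TYPE@ templating expanded for double; the stub
and stand-in typedefs are there only so the sketch compiles on its own,
the real definitions live in the NumPy headers and generated simd code:

#include <math.h>
#include <stddef.h>
#include <stdio.h>

typedef ptrdiff_t npy_intp;       /* stand-in for the NumPy typedef */
typedef unsigned char npy_bool;   /* stand-in for the NumPy typedef */
#define npy_isnan(x) ((x) != (x))

static int
run_binary_simd_not_equal_DOUBLE(char **args, npy_intp *dimensions,
                                 npy_intp *steps)
{
    (void)args; (void)dimensions; (void)steps;
    return 0;   /* stub: pretend the SIMD path declined */
}

/* expansion of the templated isnan loop above for @TYPE@ = DOUBLE */
static void
DOUBLE_isnan_sketch(char **args, npy_intp *dimensions, npy_intp *steps)
{
    /* binary kernels expect {in1, in2, out}: alias the input as both
       operands so not_equal computes x != x, i.e. isnan(x) */
    char *margs[] = {args[0], args[0], args[1]};
    npy_intp msteps[] = {steps[0], steps[0], steps[1]};
    if (!run_binary_simd_not_equal_DOUBLE(margs, dimensions, msteps)) {
        npy_intp i;   /* scalar fallback, as in the UNARY_LOOP body */
        for (i = 0; i < dimensions[0]; i++) {
            const double in1 = *(double *)(args[0] + i * steps[0]);
            *(npy_bool *)(args[1] + i * steps[1]) = npy_isnan(in1) != 0;
        }
    }
}

int main(void)
{
    double x[3] = {1.0, 0.0, 3.0};
    npy_bool out[3];
    char *args[] = {(char *)x, (char *)out};
    npy_intp dims[] = {3};
    npy_intp steps[] = {sizeof(double), sizeof(npy_bool)};
    x[1] = nan("");   /* middle element is NaN */
    DOUBLE_isnan_sketch(args, dims, steps);
    printf("%d%d%d\n", out[0], out[1], out[2]);   /* prints 010 */
    return 0;
}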
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 646c60339..92dc0c659 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -480,10 +480,30 @@ sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_i
/**end repeat1**/
+/* compress 4 vector compares to 8 (double) or 16 (float) bytes of op, each 0 or 1 */
+static NPY_INLINE void
+sse2_compress4_to_byte_@TYPE@(@vtype@ r1, @vtype@ r2, @vtype@ r3, @vtype@ r4,
+ npy_bool * op)
+{
+ const __m128i mask = @vpre@_set1_epi8(0x1);
+ __m128i ir1 = @vpre@_packs_epi32(@cast@(r1), @cast@(r2));
+ __m128i ir2 = @vpre@_packs_epi32(@cast@(r3), @cast@(r4));
+ __m128i rr = @vpre@_packs_epi16(ir1, ir2);
+#if @double@
+ rr = @vpre@_packs_epi16(rr, rr);
+ rr = @vpre@_and_si128(rr, mask);
+ @vpre@_storel_epi64((__m128i*)op, rr);
+#else
+ rr = @vpre@_and_si128(rr, mask);
+ @vpre@_storeu_si128((__m128i*)op, rr);
+#endif
+}
+
/**begin repeat1
* #kind = equal, not_equal, less, less_equal, greater, greater_equal#
* #OP = ==, !=, <, <=, >, >=#
* #VOP = cmpeq, cmpneq, cmplt, cmple, cmpgt, cmpge#
+ * #neq = 0, 1, 0*4#
*/
/* sets invalid fpu flag on QNaN for consistency with packed compare */
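
The helper introduced above exists because SSE2 compares produce
full-width lane masks (all ones or all zeros), while the npy_bool output
wants one byte per element holding 0 or 1. Two rounds of saturating
packs narrow the lanes to bytes (all-ones saturates to -1 at every
width, zero stays zero), and the final AND with 0x1 turns -1 into 1. A
self-contained float-case sketch with the @vpre@/@cast@ parameters
spelled out as plain SSE2 intrinsics (illustration, not the templated
source):

#include <emmintrin.h>  /* SSE2 */
#include <stdio.h>

/* float case: pack four 4-lane comparison masks into 16 bytes of 0/1 */
static void
compress4_to_byte_float(__m128 r1, __m128 r2, __m128 r3, __m128 r4,
                        unsigned char *op)
{
    const __m128i mask = _mm_set1_epi8(0x1);
    /* saturating packs: 32-bit lanes -> 16-bit -> 8-bit */
    __m128i ir1 = _mm_packs_epi32(_mm_castps_si128(r1), _mm_castps_si128(r2));
    __m128i ir2 = _mm_packs_epi32(_mm_castps_si128(r3), _mm_castps_si128(r4));
    __m128i rr  = _mm_packs_epi16(ir1, ir2);
    rr = _mm_and_si128(rr, mask);             /* -1 -> 1, npy_bool style */
    _mm_storeu_si128((__m128i *)op, rr);
}

int main(void)
{
    float a[16] = {0};
    unsigned char out[16];
    int i;
    a[0] = a[5] = a[10] = a[15] = 1.0f;
    __m128 z = _mm_setzero_ps();
    /* compare 16 floats against zero, four vectors at a time */
    compress4_to_byte_float(_mm_cmpneq_ps(_mm_loadu_ps(a +  0), z),
                            _mm_cmpneq_ps(_mm_loadu_ps(a +  4), z),
                            _mm_cmpneq_ps(_mm_loadu_ps(a +  8), z),
                            _mm_cmpneq_ps(_mm_loadu_ps(a + 12), z),
                            out);
    for (i = 0; i < 16; i++)
        printf("%d", out[i]);                 /* prints 1000010000100001 */
    printf("\n");
    return 0;
}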
@@ -501,34 +521,39 @@ sse2_ordered_cmp_@kind@_@TYPE@(const @type@ a, const @type@ b)
static void
sse2_binary_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
- const __m128i mask = @vpre@_set1_epi8(0x1);
LOOP_BLOCK_ALIGN_VAR(ip1, @type@, 16) {
op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[i]);
}
- LOOP_BLOCKED(@type@, 64) {
- @vtype@ a1 = @vpre@_load_@vsuf@(&ip1[i + 0 * 16 / sizeof(@type@)]);
- @vtype@ b1 = @vpre@_load_@vsuf@(&ip1[i + 1 * 16 / sizeof(@type@)]);
- @vtype@ c1 = @vpre@_load_@vsuf@(&ip1[i + 2 * 16 / sizeof(@type@)]);
- @vtype@ d1 = @vpre@_load_@vsuf@(&ip1[i + 3 * 16 / sizeof(@type@)]);
- @vtype@ a2 = @vpre@_loadu_@vsuf@(&ip2[i + 0 * 16 / sizeof(@type@)]);
- @vtype@ b2 = @vpre@_loadu_@vsuf@(&ip2[i + 1 * 16 / sizeof(@type@)]);
- @vtype@ c2 = @vpre@_loadu_@vsuf@(&ip2[i + 2 * 16 / sizeof(@type@)]);
- @vtype@ d2 = @vpre@_loadu_@vsuf@(&ip2[i + 3 * 16 / sizeof(@type@)]);
- @vtype@ r1 = @vpre@_@VOP@_@vsuf@(a1, a2);
- @vtype@ r2 = @vpre@_@VOP@_@vsuf@(b1, b2);
- @vtype@ r3 = @vpre@_@VOP@_@vsuf@(c1, c2);
- @vtype@ r4 = @vpre@_@VOP@_@vsuf@(d1, d2);
- __m128i ir1 = @vpre@_packs_epi32(@cast@(r1), @cast@(r2));
- __m128i ir2 = @vpre@_packs_epi32(@cast@(r3), @cast@(r4));
- __m128i rr = @vpre@_packs_epi16(ir1, ir2);
-#if @double@
- rr = @vpre@_packs_epi16(rr, rr);
- rr = @vpre@_and_si128(rr, mask);
- @vpre@_storel_epi64((__m128i*)&op[i], rr);
-#else
- rr = @vpre@_and_si128(rr, mask);
- @vpre@_storeu_si128((__m128i*)&op[i], rr);
-#endif
+ /* isnan special case: both operands alias the same input */
+ if (@neq@ && ip1 == ip2) {
+ LOOP_BLOCKED(@type@, 64) {
+ @vtype@ a = @vpre@_load_@vsuf@(&ip1[i + 0 * 16 / sizeof(@type@)]);
+ @vtype@ b = @vpre@_load_@vsuf@(&ip1[i + 1 * 16 / sizeof(@type@)]);
+ @vtype@ c = @vpre@_load_@vsuf@(&ip1[i + 2 * 16 / sizeof(@type@)]);
+ @vtype@ d = @vpre@_load_@vsuf@(&ip1[i + 3 * 16 / sizeof(@type@)]);
+ @vtype@ r1 = @vpre@_@VOP@_@vsuf@(a, a);
+ @vtype@ r2 = @vpre@_@VOP@_@vsuf@(b, b);
+ @vtype@ r3 = @vpre@_@VOP@_@vsuf@(c, c);
+ @vtype@ r4 = @vpre@_@VOP@_@vsuf@(d, d);
+ sse2_compress4_to_byte_@TYPE@(r1, r2, r3, r4, &op[i]);
+ }
+ }
+ else {
+ LOOP_BLOCKED(@type@, 64) {
+ @vtype@ a1 = @vpre@_load_@vsuf@(&ip1[i + 0 * 16 / sizeof(@type@)]);
+ @vtype@ b1 = @vpre@_load_@vsuf@(&ip1[i + 1 * 16 / sizeof(@type@)]);
+ @vtype@ c1 = @vpre@_load_@vsuf@(&ip1[i + 2 * 16 / sizeof(@type@)]);
+ @vtype@ d1 = @vpre@_load_@vsuf@(&ip1[i + 3 * 16 / sizeof(@type@)]);
+ @vtype@ a2 = @vpre@_loadu_@vsuf@(&ip2[i + 0 * 16 / sizeof(@type@)]);
+ @vtype@ b2 = @vpre@_loadu_@vsuf@(&ip2[i + 1 * 16 / sizeof(@type@)]);
+ @vtype@ c2 = @vpre@_loadu_@vsuf@(&ip2[i + 2 * 16 / sizeof(@type@)]);
+ @vtype@ d2 = @vpre@_loadu_@vsuf@(&ip2[i + 3 * 16 / sizeof(@type@)]);
+ @vtype@ r1 = @vpre@_@VOP@_@vsuf@(a1, a2);
+ @vtype@ r2 = @vpre@_@VOP@_@vsuf@(b1, b2);
+ @vtype@ r3 = @vpre@_@VOP@_@vsuf@(c1, c2);
+ @vtype@ r4 = @vpre@_@VOP@_@vsuf@(d1, d2);
+ sse2_compress4_to_byte_@TYPE@(r1, r2, r3, r4, &op[i]);
+ }
}
LOOP_BLOCKED_END {
op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[i]);
@@ -540,7 +565,6 @@ static void
sse2_binary_scalar1_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
@vtype@ s = @vpre@_set1_@vsuf@(ip1[0]);
- const __m128i mask = @vpre@_set1_epi8(0x1);
LOOP_BLOCK_ALIGN_VAR(ip2, @type@, 16) {
op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[0], ip2[i]);
}
@@ -553,17 +577,7 @@ sse2_binary_scalar1_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy
@vtype@ r2 = @vpre@_@VOP@_@vsuf@(s, b);
@vtype@ r3 = @vpre@_@VOP@_@vsuf@(s, c);
@vtype@ r4 = @vpre@_@VOP@_@vsuf@(s, d);
- __m128i ir1 = @vpre@_packs_epi32(@cast@(r1), @cast@(r2));
- __m128i ir2 = @vpre@_packs_epi32(@cast@(r3), @cast@(r4));
- __m128i rr = @vpre@_packs_epi16(ir1, ir2);
-#if @double@
- rr = @vpre@_packs_epi16(rr, rr);
- rr = @vpre@_and_si128(rr, mask);
- @vpre@_storel_epi64((__m128i*)&op[i], rr);
-#else
- rr = @vpre@_and_si128(rr, mask);
- @vpre@_storeu_si128((__m128i*)&op[i], rr);
-#endif
+ sse2_compress4_to_byte_@TYPE@(r1, r2, r3, r4, &op[i]);
}
LOOP_BLOCKED_END {
op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[0], ip2[i]);
@@ -575,7 +589,6 @@ static void
sse2_binary_scalar2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
@vtype@ s = @vpre@_set1_@vsuf@(ip2[0]);
- const __m128i mask = @vpre@_set1_epi8(0x1);
LOOP_BLOCK_ALIGN_VAR(ip1, @type@, 16) {
op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[0]);
}
@@ -588,17 +601,7 @@ sse2_binary_scalar2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy
@vtype@ r2 = @vpre@_@VOP@_@vsuf@(b, s);
@vtype@ r3 = @vpre@_@VOP@_@vsuf@(c, s);
@vtype@ r4 = @vpre@_@VOP@_@vsuf@(d, s);
- __m128i ir1 = @vpre@_packs_epi32(@cast@(r1), @cast@(r2));
- __m128i ir2 = @vpre@_packs_epi32(@cast@(r3), @cast@(r4));
- __m128i rr = @vpre@_packs_epi16(ir1, ir2);
-#if @double@
- rr = @vpre@_packs_epi16(rr, rr);
- rr = @vpre@_and_si128(rr, mask);
- @vpre@_storel_epi64((__m128i*)&op[i], rr);
-#else
- rr = @vpre@_and_si128(rr, mask);
- @vpre@_storeu_si128((__m128i*)&op[i], rr);
-#endif
+ sse2_compress4_to_byte_@TYPE@(r1, r2, r3, r4, &op[i]);
}
LOOP_BLOCKED_END {
op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[0]);
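
Taken together, the aliased branch above is a vectorized isnan: each
aligned block is loaded once and compared with itself, so only NaN lanes
come back all-ones. A hedged end-to-end sketch for doubles that uses the
same x != x idea but extracts results with movemask instead of the
patch's pack-based compression (SSE2 only, unaligned loads, scalar
remainder):

#include <emmintrin.h>
#include <math.h>
#include <stdio.h>

/* out[i] = 1 if x[i] is NaN else 0; n need not be a multiple of 2 */
static void
isnan_d_sketch(const double *x, unsigned char *out, size_t n)
{
    size_t i = 0;
    for (; i + 2 <= n; i += 2) {
        __m128d a = _mm_loadu_pd(x + i);
        __m128d m = _mm_cmpneq_pd(a, a);      /* all-ones lane iff NaN */
        /* movmskpd gathers the two lane sign bits into bits 0 and 1 */
        int bits = _mm_movemask_pd(m);
        out[i]     = (unsigned char)(bits & 1);
        out[i + 1] = (unsigned char)((bits >> 1) & 1);
    }
    for (; i < n; i++)                        /* scalar remainder */
        out[i] = x[i] != x[i];
}

int main(void)
{
    double x[5] = {1.0, 0.0, 3.0, 0.0, 5.0};
    unsigned char out[5];
    size_t i;
    x[1] = x[3] = nan("");
    isnan_d_sketch(x, out, 5);
    for (i = 0; i < 5; i++)
        printf("%d", out[i]);                 /* prints 01010 */
    printf("\n");
    return 0;
}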
diff --git a/numpy/core/tests/test_numeric.py b/numpy/core/tests/test_numeric.py
index a089d44dc..3a708d9e8 100644
--- a/numpy/core/tests/test_numeric.py
+++ b/numpy/core/tests/test_numeric.py
@@ -346,6 +346,11 @@ class TestBoolCmp(TestCase):
self.ed[s:s+4] = [(i & 2**x) != 0 for x in range(4)]
s += 4
+ self.nf = self.f.copy()
+ self.nd = self.d.copy()
+ self.nf[self.ef] = np.nan
+ self.nd[self.ed] = np.nan
+
def test_float(self):
# offset for alignment test
for i in range(4):
@@ -365,6 +370,9 @@ class TestBoolCmp(TestCase):
assert_array_equal(r2.view(np.int8), r2.astype(np.int8))
assert_array_equal(r3.view(np.int8), r3.astype(np.int8))
+ # isnan on amd64 takes the same codepath
+ assert_array_equal(np.isnan(self.nf[i:]), self.ef[i:])
+
def test_double(self):
# offset for alignment test
for i in range(2):
@@ -384,6 +392,9 @@ class TestBoolCmp(TestCase):
assert_array_equal(r2.view(np.int8), r2.astype(np.int8))
assert_array_equal(r3.view(np.int8), r3.astype(np.int8))
+ # isnan on amd64 takes the same codepath
+ assert_array_equal(np.isnan(self.nd[i:]), self.ed[i:])
+
class TestSeterr(TestCase):
def test_default(self):