ENH: vectorize isnan

On amd64, isnan(x) is equivalent to x != x, so the existing not_equal SIMD code can be reused; only a specialization for unary loops needs to be added.
author: Julian Taylor <jtaylor.debian@googlemail.com> 2014-03-05 20:34:57 +0100
committer: Julian Taylor <jtaylor.debian@googlemail.com> 2014-03-05 20:48:17 +0100
commit: 37967930e05700af8ee3b66bdb35c5b8a02d17dd (patch)
tree: 4cbe9d5d43e249706d6e323d1b612e23f81a7e70
parent: 342a935d24adf0746a01bfa37fd1cfe696ddc571 (diff)
download: numpy-37967930e05700af8ee3b66bdb35c5b8a02d17dd.tar.gz
3 files changed, 50 insertions, 17 deletions
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 0fa03f343..3f5048592 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1514,13 +1514,18 @@ NPY_NO_EXPORT void
 /**begin repeat1
  * #kind = isnan, isinf, isfinite, signbit#
  * #func = npy_isnan, npy_isinf, npy_isfinite, npy_signbit#
+ * #isnan = 1, 0*3#
  **/
 NPY_NO_EXPORT void
 @TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
 {
-    UNARY_LOOP {
-        const @type@ in1 = *(@type@ *)ip1;
-        *((npy_bool *)op1) = @func@(in1) != 0;
+    char * margs[] = {args[0], args[0], args[1]};
+    npy_intp msteps[] = {steps[0], steps[0], steps[1]};
+    if (!@isnan@ || !run_binary_simd_not_equal_@TYPE@(margs, dimensions, msteps)) {
+        UNARY_LOOP {
+            const @type@ in1 = *(@type@ *)ip1;
+            *((npy_bool *)op1) = @func@(in1) != 0;
+        }
     }
 }
 /**end repeat1**/
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 6db0c7b0e..92dc0c659 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -503,6 +503,7 @@ sse2_compress4_to_byte_@TYPE@(@vtype@ r1, @vtype@ r2, @vtype@ r3, @vtype@ r4,
  * #kind = equal, not_equal, less, less_equal, greater, greater_equal#
  * #OP = ==, !=, <, <=, >, >=#
  * #VOP = cmpeq, cmpneq, cmplt, cmple, cmpgt, cmpge#
+ * #neq = 0, 1, 0*4#
 */
 
 /* sets invalid fpu flag on QNaN for consistency with packed compare */
@@ -523,20 +524,36 @@ sse2_binary_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy_intp n)
     LOOP_BLOCK_ALIGN_VAR(ip1, @type@, 16) {
         op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[i]);
     }
-    LOOP_BLOCKED(@type@, 64) {
-        @vtype@ a1 = @vpre@_load_@vsuf@(&ip1[i + 0 * 16 / sizeof(@type@)]);
-        @vtype@ b1 = @vpre@_load_@vsuf@(&ip1[i + 1 * 16 / sizeof(@type@)]);
-        @vtype@ c1 = @vpre@_load_@vsuf@(&ip1[i + 2 * 16 / sizeof(@type@)]);
-        @vtype@ d1 = @vpre@_load_@vsuf@(&ip1[i + 3 * 16 / sizeof(@type@)]);
-        @vtype@ a2 = @vpre@_loadu_@vsuf@(&ip2[i + 0 * 16 / sizeof(@type@)]);
-        @vtype@ b2 = @vpre@_loadu_@vsuf@(&ip2[i + 1 * 16 / sizeof(@type@)]);
-        @vtype@ c2 = @vpre@_loadu_@vsuf@(&ip2[i + 2 * 16 / sizeof(@type@)]);
-        @vtype@ d2 = @vpre@_loadu_@vsuf@(&ip2[i + 3 * 16 / sizeof(@type@)]);
-        @vtype@ r1 = @vpre@_@VOP@_@vsuf@(a1, a2);
-        @vtype@ r2 = @vpre@_@VOP@_@vsuf@(b1, b2);
-        @vtype@ r3 = @vpre@_@VOP@_@vsuf@(c1, c2);
-        @vtype@ r4 = @vpre@_@VOP@_@vsuf@(d1, d2);
-        sse2_compress4_to_byte_@TYPE@(r1, r2, r3, r4, &op[i]);
+    /* isnan special unary case */
+    if (@neq@ && ip1 == ip2) {
+        LOOP_BLOCKED(@type@, 64) {
+            @vtype@ a = @vpre@_load_@vsuf@(&ip1[i + 0 * 16 / sizeof(@type@)]);
+            @vtype@ b = @vpre@_load_@vsuf@(&ip1[i + 1 * 16 / sizeof(@type@)]);
+            @vtype@ c = @vpre@_load_@vsuf@(&ip1[i + 2 * 16 / sizeof(@type@)]);
+            @vtype@ d = @vpre@_load_@vsuf@(&ip1[i + 3 * 16 / sizeof(@type@)]);
+            @vtype@ r1 = @vpre@_@VOP@_@vsuf@(a, a);
+            @vtype@ r2 = @vpre@_@VOP@_@vsuf@(b, b);
+            @vtype@ r3 = @vpre@_@VOP@_@vsuf@(c, c);
+            @vtype@ r4 = @vpre@_@VOP@_@vsuf@(d, d);
+            sse2_compress4_to_byte_@TYPE@(r1, r2, r3, r4, &op[i]);
+        }
+    }
+    else {
+        LOOP_BLOCKED(@type@, 64) {
+            @vtype@ a1 = @vpre@_load_@vsuf@(&ip1[i + 0 * 16 / sizeof(@type@)]);
+            @vtype@ b1 = @vpre@_load_@vsuf@(&ip1[i + 1 * 16 / sizeof(@type@)]);
+            @vtype@ c1 = @vpre@_load_@vsuf@(&ip1[i + 2 * 16 / sizeof(@type@)]);
+            @vtype@ d1 = @vpre@_load_@vsuf@(&ip1[i + 3 * 16 / sizeof(@type@)]);
+            @vtype@ a2 = @vpre@_loadu_@vsuf@(&ip2[i + 0 * 16 / sizeof(@type@)]);
+            @vtype@ b2 = @vpre@_loadu_@vsuf@(&ip2[i + 1 * 16 / sizeof(@type@)]);
+            @vtype@ c2 = @vpre@_loadu_@vsuf@(&ip2[i + 2 * 16 / sizeof(@type@)]);
+            @vtype@ d2 = @vpre@_loadu_@vsuf@(&ip2[i + 3 * 16 / sizeof(@type@)]);
+            @vtype@ r1 = @vpre@_@VOP@_@vsuf@(a1, a2);
+            @vtype@ r2 = @vpre@_@VOP@_@vsuf@(b1, b2);
+            @vtype@ r3 = @vpre@_@VOP@_@vsuf@(c1, c2);
+            @vtype@ r4 = @vpre@_@VOP@_@vsuf@(d1, d2);
+            sse2_compress4_to_byte_@TYPE@(r1, r2, r3, r4, &op[i]);
+        }
     }
     LOOP_BLOCKED_END {
         op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[i]);
diff --git a/numpy/core/tests/test_numeric.py b/numpy/core/tests/test_numeric.py
index a089d44dc..3a708d9e8 100644
--- a/numpy/core/tests/test_numeric.py
+++ b/numpy/core/tests/test_numeric.py
@@ -346,6 +346,11 @@ class TestBoolCmp(TestCase):
             self.ed[s:s+4] = [(i & 2**x) != 0 for x in range(4)]
             s += 4
 
+        self.nf = self.f.copy()
+        self.nd = self.d.copy()
+        self.nf[self.ef] = np.nan
+        self.nd[self.ed] = np.nan
+
     def test_float(self):
         # offset for alignment test
         for i in range(4):
@@ -365,6 +370,9 @@ class TestBoolCmp(TestCase):
             assert_array_equal(r2.view(np.int8), r2.astype(np.int8))
             assert_array_equal(r3.view(np.int8), r3.astype(np.int8))
 
+            # isnan on amd64 takes the same codepath
+            assert_array_equal(np.isnan(self.nf[i:]), self.ef[i:])
+
     def test_double(self):
         # offset for alignment test
         for i in range(2):
@@ -384,6 +392,9 @@ class TestBoolCmp(TestCase):
             assert_array_equal(r2.view(np.int8), r2.astype(np.int8))
             assert_array_equal(r3.view(np.int8), r3.astype(np.int8))
 
+            # isnan on amd64 takes the same codepath
+            assert_array_equal(np.isnan(self.nd[i:]), self.ed[i:])
+
 
 class TestSeterr(TestCase):
     def test_default(self):
author	Julian Taylor <jtaylor.debian@googlemail.com>	2014-03-05 20:34:57 +0100
committer	Julian Taylor <jtaylor.debian@googlemail.com>	2014-03-05 20:48:17 +0100
commit	37967930e05700af8ee3b66bdb35c5b8a02d17dd (patch)
tree	4cbe9d5d43e249706d6e323d1b612e23f81a7e70
parent	342a935d24adf0746a01bfa37fd1cfe696ddc571 (diff)
download	numpy-37967930e05700af8ee3b66bdb35c5b8a02d17dd.tar.gz