Merge pull request #3517 from juliantaylor/memchr-all-any

ENH: use memchr for unit stride all/any
author: Charles Harris <charlesr.harris@gmail.com> 2013-08-10 10:53:34 -0700
committer: Charles Harris <charlesr.harris@gmail.com> 2013-08-10 10:53:34 -0700
commit: c6da120806f70d417619e1a34512f38dbd0dcc8d (patch)
tree: 277f03b5f7134c281d36f9323eb90dfe82fa9b87 /numpy
parent: 78801c50ad314edae040968e5ffcd0d27dd70a45 (diff)
parent: 7819817653003fdae4554cbfab4cdbedf824c305 (diff)
download: numpy-c6da120806f70d417619e1a34512f38dbd0dcc8d.tar.gz
3 files changed, 57 insertions, 3 deletions
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 0559fb416..d99fafaf2 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -20,6 +20,8 @@
 
 #include "ufunc_object.h"
 
+#include <string.h> /* for memchr */
+
 
 /*
  * include vectorized functions and dispatchers
@@ -555,15 +557,47 @@ BOOL_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED
  * #kind = logical_and, logical_or#
  * #OP =  &&, ||#
  * #SC =  ==, !=#
+ * #and = 1, 0#
  **/
 
 NPY_NO_EXPORT void
 BOOL_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
 {
     if(IS_BINARY_REDUCE) {
+#ifdef HAVE_EMMINTRIN_H
+        /*
+         * Stick with our SIMD variant for more reliable performance; the only
+         * known platform where libc is faster (~20%) is an i7 with glibc 2.17.
+         */
         if (run_reduce_simd_@kind@_BOOL(args, dimensions, steps)) {
             return;
         }
+#else
+        /* for now only use libc on 32-bit/non-x86 */
+        if (steps[1] == 1) {
+            npy_bool * op = (npy_bool *)args[0];
+#if @and@
+            /* np.all(), search for a zero (false) */
+            if (*op) {
+                *op = memchr(args[1], 0, dimensions[0]) == NULL;
+            }
+#else
+            /*
+             * np.any(): search for a non-zero (true) byte by comparing with
+             * blocks of zeros; memcmp is faster than memchr on SSE4 machines
+             * with glibc >= 2.12, and memchr could only find bytes equal to 1
+             */
+            static const npy_bool zero[4096]; /* zero by C standard */
+            npy_uintp i, n = dimensions[0];
+            for (i = 0; !*op && i < n - (n % sizeof(zero)); i += sizeof(zero)) {
+                *op = memcmp(&args[1][i], zero, sizeof(zero)) != 0;
+            }
+            if (!*op && n - i > 0)
+                *op = memcmp(&args[1][i], zero, n - i) != 0;
+#endif
+            return;
+        }
+#endif
         BINARY_REDUCE_LOOP(npy_bool) {
             const npy_bool in2 = *(npy_bool *)ip2;
             io1 = io1 @OP@ in2;
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 98e2beb30..2f1c3055b 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -779,14 +779,17 @@ sse2_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, const npy_intp n)
             return;
         }
     }
-    LOOP_BLOCKED(npy_bool, 16) {
+    /* unrolled once: a fast pmaxub/pminub merges two vectors per slow movmsk */
+    LOOP_BLOCKED(npy_bool, 32) {
         @vtype@ v = @vload@((@vtype@*)&ip[i]);
+        @vtype@ v2 = @vload@((@vtype@*)&ip[i + 16]);
         v = @vpre@_cmpeq_epi8(v, zero);
+        v2 = @vpre@_cmpeq_epi8(v2, zero);
 #if @and@
-        if ((@vpre@_movemask_epi8(v) != 0)) {
+        if ((@vpre@_movemask_epi8(@vpre@_max_epu8(v, v2)) != 0)) {
             *op = 0;
 #else
-        if ((@vpre@_movemask_epi8(v) != 0xFFFF)) {
+        if ((@vpre@_movemask_epi8(@vpre@_min_epu8(v, v2)) != 0xFFFF)) {
             *op = 1;
 #endif
             return;
diff --git a/numpy/core/tests/test_numeric.py b/numpy/core/tests/test_numeric.py
index 1be0f4105..782ddd687 100644
--- a/numpy/core/tests/test_numeric.py
+++ b/numpy/core/tests/test_numeric.py
@@ -245,6 +245,23 @@ class TestBoolArray(TestCase):
         self.assertTrue(self.im.any())
         self.assertFalse(self.nm.all())
         self.assertFalse(self.im.all())
+        # check bad element in all positions
+        for i in range(256 - 7):
+            d = array([False] * 256, dtype=np.bool)[7::]
+            d[i] = True
+            self.assertTrue(np.any(d))
+            e = array([True] * 256, dtype=np.bool)[7::]
+            e[i] = False
+            self.assertFalse(np.all(e))
+            assert_array_equal(e, ~d)
+        # big array test for blocked libc loops
+        for i in list(range(9, 6000, 507)) + [7764, 90021, -10]:
+            d = array([False] * 100043, dtype=np.bool)
+            d[i] = True
+            self.assertTrue(np.any(d), msg="%r" % i)
+            e = array([True] * 100043, dtype=np.bool)
+            e[i] = False
+            self.assertFalse(np.all(e), msg="%r" % i)
 
     def test_logical_not_abs(self):
         assert_array_equal(~self.t, self.f)
author	Charles Harris <charlesr.harris@gmail.com>	2013-08-10 10:53:34 -0700
committer	Charles Harris <charlesr.harris@gmail.com>	2013-08-10 10:53:34 -0700
commit	c6da120806f70d417619e1a34512f38dbd0dcc8d (patch)
tree	277f03b5f7134c281d36f9323eb90dfe82fa9b87 /numpy
parent	78801c50ad314edae040968e5ffcd0d27dd70a45 (diff)
parent	7819817653003fdae4554cbfab4cdbedf824c305 (diff)
download	numpy-c6da120806f70d417619e1a34512f38dbd0dcc8d.tar.gz