Merge pull request #3436 from juliantaylor/vectorize-bool

ENH: vectorize boolean logical &&, ||, abs and not
author: Charles Harris <charlesr.harris@gmail.com> 2013-06-16 06:48:13 -0700
committer: Charles Harris <charlesr.harris@gmail.com> 2013-06-16 06:48:13 -0700
commit: 2a5c2c8227b600654f31ed346c73cce77bef554d (patch)
tree: a712e09e278e01797a60ac562ac56f5b72b5ae9b /numpy
parent: bb8c89db8bc5afd39dbe42d6f1f6657e769165d7 (diff)
parent: 4b4a7365aeff3dfd19d4b72765db5358963614e9 (diff)
download: numpy-2a5c2c8227b600654f31ed346c73cce77bef554d.tar.gz
3 files changed, 242 insertions, 3 deletions
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 068ecde7c..59d144569 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -571,6 +571,9 @@ BOOL_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED
         *((npy_bool *)iop1) = io1;
     }
     else {
+        if (run_binary_simd_@kind@_BOOL(args, dimensions, steps)) {
+            return;
+        }
         BINARY_LOOP {
             const npy_bool in1 = *(npy_bool *)ip1;
             const npy_bool in2 = *(npy_bool *)ip2;
@@ -613,6 +616,9 @@ BOOL_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED
 NPY_NO_EXPORT void
 BOOL_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
 {
+    if (run_unary_simd_@kind@_BOOL(args, dimensions, steps)) {
+        return;
+    }
     UNARY_LOOP {
         npy_bool in1 = *(npy_bool *)ip1;
         *((npy_bool *)op1) = in1 @OP@ 0;
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 746943097..0382f2cf7 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -19,6 +19,9 @@
 #include "npy_config.h"
 /* for NO_FLOATING_POINT_SUPPORT */
 #include "numpy/ufuncobject.h"
+#ifdef HAVE_EMMINTRIN_H
+#include <emmintrin.h>
+#endif
 #include <assert.h>
 #include <stdlib.h>
 
@@ -75,6 +78,12 @@ void PyUFunc_clearfperr(void);
  * if it was run returns true and false if nothing was done
  */
 
+/*
+ *****************************************************************************
+ **                           FLOAT DISPATCHERS
+ *****************************************************************************
+ */
+
 /**begin repeat
  * Float types
  *  #type = npy_float, npy_double, npy_longdouble#
@@ -161,13 +170,66 @@ run_binary_simd_@kind@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps
 
 /**end repeat**/
 
-
 /*
- * Vectorized operations
+ *****************************************************************************
+ **                           BOOL DISPATCHERS
+ *****************************************************************************
+ */
+
+/**begin repeat
+ * # kind = logical_or, logical_and#
  */
 
+static void
+sse2_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2,
+                        npy_intp n);
+
+static NPY_INLINE int
+run_binary_simd_@kind@_BOOL(char **args, npy_intp *dimensions, npy_intp *steps)
+{
+#if defined HAVE_EMMINTRIN_H
+    if (sizeof(npy_bool) == 1 && IS_BLOCKABLE_BINARY(sizeof(npy_bool), 16)) {
+        sse2_binary_@kind@_BOOL((npy_bool*)args[2], (npy_bool*)args[0],
+                               (npy_bool*)args[1], dimensions[0]);
+        return 1;
+    }
+#endif
+    return 0;
+}
+
+/**end repeat**/
+
+/**begin repeat
+ * # kind = absolute, logical_not#
+ */
+
+static void
+sse2_@kind@_BOOL(npy_bool *, npy_bool *, const npy_intp n);
+
+static NPY_INLINE int
+run_unary_simd_@kind@_BOOL(char **args, npy_intp *dimensions, npy_intp *steps)
+{
+#if defined HAVE_EMMINTRIN_H
+    if (sizeof(npy_bool) == 1 && IS_BLOCKABLE_UNARY(sizeof(npy_bool), 16)) {
+        sse2_@kind@_BOOL((npy_bool*)args[1], (npy_bool*)args[0], dimensions[0]);
+        return 1;
+    }
+#endif
+    return 0;
+}
+
+/**end repeat**/
+
 #ifdef HAVE_EMMINTRIN_H
-#include <emmintrin.h>
+
+/*
+ * Vectorized operations
+ */
+/*
+ *****************************************************************************
+ **                           FLOAT LOOPS
+ *****************************************************************************
+ */
 
 /**begin repeat
 * horizontal reductions on a vector
@@ -446,6 +508,107 @@ sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n)
 
 /**end repeat**/
 
+/*
+ *****************************************************************************
+ **                           BOOL LOOPS
+ *****************************************************************************
+ */
+
+/**begin repeat
+ * # kind = logical_or, logical_and#
+ * # and = 0, 1#
+ * # op = ||, &&#
+ * # vop = or, and#
+ * # vpre = _mm*2#
+ * # vsuf = si128*2#
+ * # vtype = __m128i*2#
+ * # type = npy_bool*2#
+ * # vloadu = _mm_loadu_si128*2#
+ * # vstore = _mm_store_si128*2#
+ */
+
+/*
+ * Map every non-zero byte of v to 1 (and zero bytes to 0) so vectorized and
+ * scalar results agree; only matters if a bool byte holds something other
+ * than 0 or 1. Emitted only in the first (logical_or) pass of this repeat.
+ */
+#if !@and@
+static NPY_INLINE @vtype@ byte_to_true(@vtype@ v)
+{
+    const @vtype@ zero = @vpre@_setzero_@vsuf@();
+    const @vtype@ truemask = @vpre@_set1_epi8(1 == 1);
+    /* 0xFF where the input byte is zero, 0x00 elsewhere */
+    @vtype@ tmp = @vpre@_cmpeq_epi8(v, zero);
+    /* ~tmp & truemask: 0x01 for non-zero input bytes, 0x00 for zero bytes */
+    return @vpre@_andnot_@vsuf@(tmp, truemask);
+}
+#endif
+
+static void
+sse2_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2, npy_intp n)
+{
+    LOOP_BLOCK_ALIGN_VAR(op, @type@, 16)
+        op[i] = ip1[i] @op@ ip2[i];
+    LOOP_BLOCKED(@type@, 16) {
+        @vtype@ a = @vloadu@((__m128i*)&ip1[i]);
+        @vtype@ b = @vloadu@((__m128i*)&ip2[i]);
+#if @and@
+        const @vtype@ zero = @vpre@_setzero_@vsuf@();
+        /* get 0xFF for the zero bytes of ip1, 0x00 for the non zero ones */
+        @vtype@ tmp = @vpre@_cmpeq_epi8(a, zero);
+        /* andnot -> 0x00 where ip1 is zero, the ip2 byte where it is non zero */
+        tmp = @vpre@_andnot_@vsuf@(tmp, b);
+#else
+        @vtype@ tmp = @vpre@_or_@vsuf@(a, b);
+#endif
+
+        @vstore@((__m128i*)&op[i], byte_to_true(tmp));
+    }
+    LOOP_BLOCKED_END {
+        op[i] = (ip1[i] @op@ ip2[i]);
+    }
+}
+
+/**end repeat**/
+
+/**begin repeat
+ * # kind = absolute, logical_not#
+ * # op = !=, ==#
+ * # not = 0, 1#
+ * # vpre = _mm*2#
+ * # vsuf = si128*2#
+ * # vtype = __m128i*2#
+ * # type = npy_bool*2#
+ * # vloadu = _mm_loadu_si128*2#
+ * # vstore = _mm_store_si128*2#
+ */
+
+static void
+sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n)
+{
+    LOOP_BLOCK_ALIGN_VAR(op, @type@, 16)
+        op[i] = (ip[i] @op@ 0);
+    LOOP_BLOCKED(@type@, 16) {
+        @vtype@ a = @vloadu@((__m128i*)&ip[i]);
+#if @not@
+        const @vtype@ zero = @vpre@_setzero_@vsuf@();
+        const @vtype@ truemask = @vpre@_set1_epi8(1 == 1);
+        /* like byte_to_true without the negation: 0x01 exactly for zero bytes */
+        a = @vpre@_cmpeq_epi8(a, zero);
+        a = @vpre@_and_@vsuf@(a, truemask);
+#else
+        /* abs of a bool looks pointless, but it may be used to normalize to 0/1 */
+        a = byte_to_true(a);
+#endif
+        @vstore@((__m128i*)&op[i], a);
+    }
+    LOOP_BLOCKED_END {
+        op[i] = (ip[i] @op@ 0);
+    }
+}
+
+/**end repeat**/
+
 #endif /* HAVE_EMMINTRIN_H */
 
 #endif
diff --git a/numpy/core/tests/test_numeric.py b/numpy/core/tests/test_numeric.py
index 5c8de3734..ed4e0b79e 100644
--- a/numpy/core/tests/test_numeric.py
+++ b/numpy/core/tests/test_numeric.py
@@ -223,6 +223,76 @@ class TestBoolScalar(TestCase):
         self.assertTrue((f ^ f) is f)
 
 
+class TestBoolArray(TestCase):
+    def setUp(self):
+        # views start 1-2 elements into their buffers; presumably to misalign them for simd
+        self.t = array([True] * 41, dtype=np.bool)[1::]
+        self.f = array([False] * 41, dtype=np.bool)[1::]
+        self.o = array([False] * 42, dtype=np.bool)[2::]
+        self.nm = self.f.copy()
+        self.im = self.t.copy()
+        self.nm[3] = True
+        self.nm[-2] = True
+        self.im[3] = False
+        self.im[-2] = False
+
+    def test_all_any(self):
+        self.assertTrue(self.t.all())
+        self.assertTrue(self.t.any())
+        self.assertFalse(self.f.all())
+        self.assertFalse(self.f.any())
+        self.assertTrue(self.nm.any())
+        self.assertTrue(self.im.any())
+        self.assertFalse(self.nm.all())
+        self.assertFalse(self.im.all())
+
+    def test_logical_not_abs(self):
+        assert_array_equal(~self.t, self.f)
+        assert_array_equal(np.abs(~self.t), self.f)
+        assert_array_equal(np.abs(~self.f), self.t)
+        assert_array_equal(np.abs(self.f), self.f)
+        assert_array_equal(~np.abs(self.f), self.t)
+        assert_array_equal(~np.abs(self.t), self.f)
+        assert_array_equal(np.abs(~self.nm), self.im)
+        np.logical_not(self.t, out=self.o)
+        assert_array_equal(self.o, self.f)
+        np.abs(self.t, out=self.o)
+        assert_array_equal(self.o, self.t)
+
+    def test_logical_and_or_xor(self):
+        assert_array_equal(self.t | self.t, self.t)
+        assert_array_equal(self.f | self.f, self.f)
+        assert_array_equal(self.t | self.f, self.t)
+        assert_array_equal(self.f | self.t, self.t)
+        np.logical_or(self.t, self.t, out=self.o)
+        assert_array_equal(self.o, self.t)
+        assert_array_equal(self.t & self.t, self.t)
+        assert_array_equal(self.f & self.f, self.f)
+        assert_array_equal(self.t & self.f, self.f)
+        assert_array_equal(self.f & self.t, self.f)
+        np.logical_and(self.t, self.t, out=self.o)
+        assert_array_equal(self.o, self.t)
+        assert_array_equal(self.t ^ self.t, self.f)
+        assert_array_equal(self.f ^ self.f, self.f)
+        assert_array_equal(self.t ^ self.f, self.t)
+        assert_array_equal(self.f ^ self.t, self.t)
+        np.logical_xor(self.t, self.t, out=self.o)
+        assert_array_equal(self.o, self.f)
+
+        assert_array_equal(self.nm & self.t, self.nm)
+        assert_array_equal(self.im & self.f, False)
+        assert_array_equal(self.nm & True, self.nm)
+        assert_array_equal(self.im & False, self.f)
+        assert_array_equal(self.nm | self.t, self.t)
+        assert_array_equal(self.im | self.f, self.im)
+        assert_array_equal(self.nm | True, self.t)
+        assert_array_equal(self.im | False, self.im)
+        assert_array_equal(self.nm ^ self.t, self.im)
+        assert_array_equal(self.im ^ self.f, self.im)
+        assert_array_equal(self.nm ^ True, self.im)
+        assert_array_equal(self.im ^ False, self.im)
+
+
 class TestSeterr(TestCase):
     def test_default(self):
         err = geterr()
author	Charles Harris <charlesr.harris@gmail.com>	2013-06-16 06:48:13 -0700
committer	Charles Harris <charlesr.harris@gmail.com>	2013-06-16 06:48:13 -0700
commit	2a5c2c8227b600654f31ed346c73cce77bef554d (patch)
tree	a712e09e278e01797a60ac562ac56f5b72b5ae9b /numpy
parent	bb8c89db8bc5afd39dbe42d6f1f6657e769165d7 (diff)
parent	4b4a7365aeff3dfd19d4b72765db5358963614e9 (diff)
download	numpy-2a5c2c8227b600654f31ed346c73cce77bef554d.tar.gz