ENH: vectorize boolean logical &&, ||, abs and not

The code ensures the result is identical to that of the scalar boolean operation, even though this might be unnecessary if bools are used correctly everywhere. The overhead doesn't matter much, as vectorizing single-byte operations hits the memory bandwidth limit very quickly. Improves performance by about a factor of 5 to 10, depending on the CPU. These operations currently can't be autovectorized by GCC 4.8.
author: Julian Taylor <jtaylor.debian@googlemail.com> 2013-06-14 20:23:18 +0200
committer: Julian Taylor <jtaylor.debian@googlemail.com> 2013-06-14 20:33:48 +0200
commit: 4b4a7365aeff3dfd19d4b72765db5358963614e9 (patch)
tree: 03d7c97b1e5cae4da6a3b04ffba785077c1bff25 /numpy
parent: 3b7d3aef83bd914ff20fdc2a3d680591e69f61de (diff)
download: numpy-4b4a7365aeff3dfd19d4b72765db5358963614e9.tar.gz
3 files changed, 242 insertions, 3 deletions
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 068ecde7c..59d144569 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -571,6 +571,9 @@ BOOL_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED
         *((npy_bool *)iop1) = io1;
     }
     else {
+        if (run_binary_simd_@kind@_BOOL(args, dimensions, steps)) {
+            return;
+        }
         BINARY_LOOP {
             const npy_bool in1 = *(npy_bool *)ip1;
             const npy_bool in2 = *(npy_bool *)ip2;
@@ -613,6 +616,9 @@ BOOL_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED
 NPY_NO_EXPORT void
 BOOL_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
 {
+    if (run_unary_simd_@kind@_BOOL(args, dimensions, steps)) {
+        return;
+    }
     UNARY_LOOP {
         npy_bool in1 = *(npy_bool *)ip1;
         *((npy_bool *)op1) = in1 @OP@ 0;
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 746943097..0382f2cf7 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -19,6 +19,9 @@
 #include "npy_config.h"
 /* for NO_FLOATING_POINT_SUPPORT */
 #include "numpy/ufuncobject.h"
+#ifdef HAVE_EMMINTRIN_H
+#include <emmintrin.h>
+#endif
 #include <assert.h>
 #include <stdlib.h>
 
@@ -75,6 +78,12 @@ void PyUFunc_clearfperr(void);
  * if it was run returns true and false if nothing was done
  */
 
+/*
+ *****************************************************************************
+ **                           FLOAT DISPATCHERS
+ *****************************************************************************
+ */
+
 /**begin repeat
  * Float types
  *  #type = npy_float, npy_double, npy_longdouble#
@@ -161,13 +170,66 @@ run_binary_simd_@kind@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps
 
 /**end repeat**/
 
-
 /*
- * Vectorized operations
+ *****************************************************************************
+ **                           BOOL DISPATCHERS
+ *****************************************************************************
+ */
+
+/**begin repeat
+ * # kind = logical_or, logical_and#
  */
 
+static void
+sse2_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2,
+                        npy_intp n);
+
+static NPY_INLINE int
+run_binary_simd_@kind@_BOOL(char **args, npy_intp *dimensions, npy_intp *steps)
+{
+#if defined HAVE_EMMINTRIN_H
+    if (sizeof(npy_bool) == 1 && IS_BLOCKABLE_BINARY(sizeof(npy_bool), 16)) {
+        sse2_binary_@kind@_BOOL((npy_bool*)args[2], (npy_bool*)args[0],
+                               (npy_bool*)args[1], dimensions[0]);
+        return 1;
+    }
+#endif
+    return 0;
+}
+
+/**end repeat**/
+
+/**begin repeat
+ * # kind = absolute, logical_not#
+ */
+
+static void
+sse2_@kind@_BOOL(npy_bool *, npy_bool *, const npy_intp n);
+
+static NPY_INLINE int
+run_unary_simd_@kind@_BOOL(char **args, npy_intp *dimensions, npy_intp *steps)
+{
+#if defined HAVE_EMMINTRIN_H
+    if (sizeof(npy_bool) == 1 && IS_BLOCKABLE_UNARY(sizeof(npy_bool), 16)) {
+        sse2_@kind@_BOOL((npy_bool*)args[1], (npy_bool*)args[0], dimensions[0]);
+        return 1;
+    }
+#endif
+    return 0;
+}
+
+/**end repeat**/
+
 #ifdef HAVE_EMMINTRIN_H
-#include <emmintrin.h>
+
+/*
+ * Vectorized operations
+ */
+/*
+ *****************************************************************************
+ **                           FLOAT LOOPS
+ *****************************************************************************
+ */
 
 /**begin repeat
 * horizontal reductions on a vector
@@ -446,6 +508,107 @@ sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n)
 
 /**end repeat**/
 
+/*
+ *****************************************************************************
+ **                           BOOL LOOPS
+ *****************************************************************************
+ */
+
+/**begin repeat
+ * # kind = logical_or, logical_and#
+ * # and = 0, 1#
+ * # op = ||, &&#
+ * # vop = or, and#
+ * # vpre = _mm*2#
+ * # vsuf = si128*2#
+ * # vtype = __m128i*2#
+ * # type = npy_bool*2#
+ * # vloadu = _mm_loadu_si128*2#
+ * # vstore = _mm_store_si128*2#
+ */
+
+/*
+ * Convert any byte with a bit set to boolean true (0x01) so that vectorized
+ * and scalar operations are consistent. This should not be required if bool
+ * is used correctly everywhere, but you never know.
+ */
+#if !@and@
+static NPY_INLINE @vtype@ byte_to_true(@vtype@ v)
+{
+    const @vtype@ zero = @vpre@_setzero_@vsuf@();
+    const @vtype@ truemask = @vpre@_set1_epi8(1 == 1);
+    /* get 0xFF for zeros */
+    @vtype@ tmp = @vpre@_cmpeq_epi8(v, zero);
+    /* filled with 0xFF/0x00, negate and mask to boolean true */
+    return @vpre@_andnot_@vsuf@(tmp, truemask);
+}
+#endif
+
+static void
+sse2_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2, npy_intp n)
+{
+    LOOP_BLOCK_ALIGN_VAR(op, @type@, 16)
+        op[i] = ip1[i] @op@ ip2[i];
+    LOOP_BLOCKED(@type@, 16) {
+        @vtype@ a = @vloadu@((__m128i*)&ip1[i]);
+        @vtype@ b = @vloadu@((__m128i*)&ip2[i]);
+#if @and@
+        const @vtype@ zero = @vpre@_setzero_@vsuf@();
+        /* get 0xFF for zeros */
+        @vtype@ tmp = @vpre@_cmpeq_epi8(a, zero);
+        /* andnot -> 0x00 for zeros, 0xFF for non zeros, & with ip2 */
+        tmp = @vpre@_andnot_@vsuf@(tmp, b);
+#else
+        @vtype@ tmp = @vpre@_or_@vsuf@(a, b);
+#endif
+
+        @vstore@((__m128i*)&op[i], byte_to_true(tmp));
+    }
+    LOOP_BLOCKED_END {
+        op[i] = (ip1[i] @op@ ip2[i]);
+    }
+}
+
+/**end repeat**/
+
+/**begin repeat
+ * # kind = absolute, logical_not#
+ * # op = !=, ==#
+ * # not = 0, 1#
+ * # vpre = _mm*2#
+ * # vsuf = si128*2#
+ * # vtype = __m128i*2#
+ * # type = npy_bool*2#
+ * # vloadu = _mm_loadu_si128*2#
+ * # vstore = _mm_store_si128*2#
+ */
+
+static void
+sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n)
+{
+    LOOP_BLOCK_ALIGN_VAR(op, @type@, 16)
+        op[i] = (ip[i] @op@ 0);
+    LOOP_BLOCKED(@type@, 16) {
+        @vtype@ a = @vloadu@((__m128i*)&ip[i]);
+#if @not@
+        const @vtype@ zero = @vpre@_setzero_@vsuf@();
+        const @vtype@ truemask = @vpre@_set1_epi8(1 == 1);
+        /* equivalent to byte_to_true but can skip the negation */
+        a = @vpre@_cmpeq_epi8(a, zero);
+        a = @vpre@_and_@vsuf@(a, truemask);
+#else
+        /* abs is kind of pointless but maybe it's used for byte_to_true */
+        a = byte_to_true(a);
+#endif
+        @vstore@((__m128i*)&op[i], a);
+    }
+    LOOP_BLOCKED_END {
+        op[i] = (ip[i] @op@ 0);
+    }
+}
+
+/**end repeat**/
+
 #endif /* HAVE_EMMINTRIN_H */
 
 #endif
diff --git a/numpy/core/tests/test_numeric.py b/numpy/core/tests/test_numeric.py
index 5c8de3734..ed4e0b79e 100644
--- a/numpy/core/tests/test_numeric.py
+++ b/numpy/core/tests/test_numeric.py
@@ -223,6 +223,76 @@ class TestBoolScalar(TestCase):
         self.assertTrue((f ^ f) is f)
 
 
+class TestBoolArray(TestCase):
+    def setUp(self):
+        # offset for simd tests
+        self.t = array([True] * 41, dtype=np.bool)[1::]
+        self.f = array([False] * 41, dtype=np.bool)[1::]
+        self.o = array([False] * 42, dtype=np.bool)[2::]
+        self.nm = self.f.copy()
+        self.im = self.t.copy()
+        self.nm[3] = True
+        self.nm[-2] = True
+        self.im[3] = False
+        self.im[-2] = False
+
+    def test_all_any(self):
+        self.assertTrue(self.t.all())
+        self.assertTrue(self.t.any())
+        self.assertFalse(self.f.all())
+        self.assertFalse(self.f.any())
+        self.assertTrue(self.nm.any())
+        self.assertTrue(self.im.any())
+        self.assertFalse(self.nm.all())
+        self.assertFalse(self.im.all())
+
+    def test_logical_not_abs(self):
+        assert_array_equal(~self.t, self.f)
+        assert_array_equal(np.abs(~self.t), self.f)
+        assert_array_equal(np.abs(~self.f), self.t)
+        assert_array_equal(np.abs(self.f), self.f)
+        assert_array_equal(~np.abs(self.f), self.t)
+        assert_array_equal(~np.abs(self.t), self.f)
+        assert_array_equal(np.abs(~self.nm), self.im)
+        np.logical_not(self.t, out=self.o)
+        assert_array_equal(self.o, self.f)
+        np.abs(self.t, out=self.o)
+        assert_array_equal(self.o, self.t)
+
+    def test_logical_and_or_xor(self):
+        assert_array_equal(self.t | self.t, self.t)
+        assert_array_equal(self.f | self.f, self.f)
+        assert_array_equal(self.t | self.f, self.t)
+        assert_array_equal(self.f | self.t, self.t)
+        np.logical_or(self.t, self.t, out=self.o)
+        assert_array_equal(self.o, self.t)
+        assert_array_equal(self.t & self.t, self.t)
+        assert_array_equal(self.f & self.f, self.f)
+        assert_array_equal(self.t & self.f, self.f)
+        assert_array_equal(self.f & self.t, self.f)
+        np.logical_and(self.t, self.t, out=self.o)
+        assert_array_equal(self.o, self.t)
+        assert_array_equal(self.t ^ self.t, self.f)
+        assert_array_equal(self.f ^ self.f, self.f)
+        assert_array_equal(self.t ^ self.f, self.t)
+        assert_array_equal(self.f ^ self.t, self.t)
+        np.logical_xor(self.t, self.t, out=self.o)
+        assert_array_equal(self.o, self.f)
+
+        assert_array_equal(self.nm & self.t, self.nm)
+        assert_array_equal(self.im & self.f, False)
+        assert_array_equal(self.nm & True, self.nm)
+        assert_array_equal(self.im & False, self.f)
+        assert_array_equal(self.nm | self.t, self.t)
+        assert_array_equal(self.im | self.f, self.im)
+        assert_array_equal(self.nm | True, self.t)
+        assert_array_equal(self.im | False, self.im)
+        assert_array_equal(self.nm ^ self.t, self.im)
+        assert_array_equal(self.im ^ self.f, self.im)
+        assert_array_equal(self.nm ^ True, self.im)
+        assert_array_equal(self.im ^ False, self.im)
+
+
 class TestSeterr(TestCase):
     def test_default(self):
         err = geterr()
author	Julian Taylor <jtaylor.debian@googlemail.com>	2013-06-14 20:23:18 +0200
committer	Julian Taylor <jtaylor.debian@googlemail.com>	2013-06-14 20:33:48 +0200
commit	4b4a7365aeff3dfd19d4b72765db5358963614e9 (patch)
tree	03d7c97b1e5cae4da6a3b04ffba785077c1bff25 /numpy
parent	3b7d3aef83bd914ff20fdc2a3d680591e69f61de (diff)
download	numpy-4b4a7365aeff3dfd19d4b72765db5358963614e9.tar.gz