diff options
author | Julian Taylor <jtaylor.debian@googlemail.com> | 2013-06-14 20:23:18 +0200 |
---|---|---|
committer | Julian Taylor <jtaylor.debian@googlemail.com> | 2013-06-14 20:33:48 +0200 |
commit | 4b4a7365aeff3dfd19d4b72765db5358963614e9 (patch) | |
tree | 03d7c97b1e5cae4da6a3b04ffba785077c1bff25 /numpy | |
parent | 3b7d3aef83bd914ff20fdc2a3d680591e69f61de (diff) | |
download | numpy-4b4a7365aeff3dfd19d4b72765db5358963614e9.tar.gz |
ENH: vectorize boolean logical &&, ||, abs and not
The vectorized code normalizes any non-zero byte to boolean true, so its
results are identical to the scalar boolean operations, even though this
might be unnecessary if bools are used correctly everywhere.
The overhead doesn't matter much, as vectorized single-byte operations
hit the memory bandwidth limit very quickly.
Improves performance by about a factor of 5 to 10, depending on the CPU.
These operations currently can't be auto-vectorized by GCC 4.8.
Diffstat (limited to 'numpy')
-rw-r--r-- | numpy/core/src/umath/loops.c.src | 6 | ||||
-rw-r--r-- | numpy/core/src/umath/simd.inc.src | 169 | ||||
-rw-r--r-- | numpy/core/tests/test_numeric.py | 70 |
3 files changed, 242 insertions, 3 deletions
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 068ecde7c..59d144569 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -571,6 +571,9 @@ BOOL_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED *((npy_bool *)iop1) = io1; } else { + if (run_binary_simd_@kind@_BOOL(args, dimensions, steps)) { + return; + } BINARY_LOOP { const npy_bool in1 = *(npy_bool *)ip1; const npy_bool in2 = *(npy_bool *)ip2; @@ -613,6 +616,9 @@ BOOL_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED NPY_NO_EXPORT void BOOL_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) { + if (run_unary_simd_@kind@_BOOL(args, dimensions, steps)) { + return; + } UNARY_LOOP { npy_bool in1 = *(npy_bool *)ip1; *((npy_bool *)op1) = in1 @OP@ 0; diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index 746943097..0382f2cf7 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -19,6 +19,9 @@ #include "npy_config.h" /* for NO_FLOATING_POINT_SUPPORT */ #include "numpy/ufuncobject.h" +#ifdef HAVE_EMMINTRIN_H +#include <emmintrin.h> +#endif #include <assert.h> #include <stdlib.h> @@ -75,6 +78,12 @@ void PyUFunc_clearfperr(void); * if it was run returns true and false if nothing was done */ +/* + ***************************************************************************** + ** FLOAT DISPATCHERS + ***************************************************************************** + */ + /**begin repeat * Float types * #type = npy_float, npy_double, npy_longdouble# @@ -161,13 +170,66 @@ run_binary_simd_@kind@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps /**end repeat**/ - /* - * Vectorized operations + ***************************************************************************** + ** BOOL DISPATCHERS + ***************************************************************************** + */ + +/**begin 
repeat + * # kind = logical_or, logical_and# */ +static void +sse2_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2, + npy_intp n); + +static NPY_INLINE int +run_binary_simd_@kind@_BOOL(char **args, npy_intp *dimensions, npy_intp *steps) +{ +#if defined HAVE_EMMINTRIN_H + if (sizeof(npy_bool) == 1 && IS_BLOCKABLE_BINARY(sizeof(npy_bool), 16)) { + sse2_binary_@kind@_BOOL((npy_bool*)args[2], (npy_bool*)args[0], + (npy_bool*)args[1], dimensions[0]); + return 1; + } +#endif + return 0; +} + +/**end repeat**/ + +/**begin repeat + * # kind = absolute, logical_not# + */ + +static void +sse2_@kind@_BOOL(npy_bool *, npy_bool *, const npy_intp n); + +static NPY_INLINE int +run_unary_simd_@kind@_BOOL(char **args, npy_intp *dimensions, npy_intp *steps) +{ +#if defined HAVE_EMMINTRIN_H + if (sizeof(npy_bool) == 1 && IS_BLOCKABLE_UNARY(sizeof(npy_bool), 16)) { + sse2_@kind@_BOOL((npy_bool*)args[1], (npy_bool*)args[0], dimensions[0]); + return 1; + } +#endif + return 0; +} + +/**end repeat**/ + #ifdef HAVE_EMMINTRIN_H -#include <emmintrin.h> + +/* + * Vectorized operations + */ +/* + ***************************************************************************** + ** FLOAT LOOPS + ***************************************************************************** + */ /**begin repeat * horizontal reductions on a vector @@ -446,6 +508,107 @@ sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n) /**end repeat**/ +/* + ***************************************************************************** + ** BOOL LOOPS + ***************************************************************************** + */ + +/**begin repeat + * # kind = logical_or, logical_and# + * # and = 0, 1# + * # op = ||, &&# + * # vop = or, and# + * # vpre = _mm*2# + * # vsuf = si128*2# + * # vtype = __m128i*2# + * # type = npy_bool*2# + * # vloadu = _mm_loadu_si128*2# + * # vstore = _mm_store_si128*2# + */ + +/* + * convert any bit set to boolean true so vectorized and normal operations are + * 
consistent, should not be required if bool is used correctly everywhere but + * you never know + */ +#if !@and@ +static NPY_INLINE @vtype@ byte_to_true(@vtype@ v) +{ + const @vtype@ zero = @vpre@_setzero_@vsuf@(); + const @vtype@ truemask = @vpre@_set1_epi8(1 == 1); + /* get 0xFF for zeros */ + @vtype@ tmp = @vpre@_cmpeq_epi8(v, zero); + /* filled with 0xFF/0x00, negate and mask to boolean true */ + return @vpre@_andnot_@vsuf@(tmp, truemask); +} +#endif + +static void +sse2_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2, npy_intp n) +{ + LOOP_BLOCK_ALIGN_VAR(op, @type@, 16) + op[i] = ip1[i] @op@ ip2[i]; + LOOP_BLOCKED(@type@, 16) { + @vtype@ a = @vloadu@((__m128i*)&ip1[i]); + @vtype@ b = @vloadu@((__m128i*)&ip2[i]); +#if @and@ + const @vtype@ zero = @vpre@_setzero_@vsuf@(); + /* get 0xFF for non zeros*/ + @vtype@ tmp = @vpre@_cmpeq_epi8(a, zero); + /* andnot -> 0x00 for zeros xFF for non zeros, & with ip2 */ + tmp = @vpre@_andnot_@vsuf@(tmp, b); +#else + @vtype@ tmp = @vpre@_or_@vsuf@(a, b); +#endif + + @vstore@((__m128i*)&op[i], byte_to_true(tmp)); + } + LOOP_BLOCKED_END { + op[i] = (ip1[i] @op@ ip2[i]); + } +} + +/**end repeat**/ + +/**begin repeat + * # kind = absolute, logical_not# + * # op = !=, ==# + * # not = 0, 1# + * # vpre = _mm*2# + * # vsuf = si128*2# + * # vtype = __m128i*2# + * # type = npy_bool*2# + * # vloadu = _mm_loadu_si128*2# + * # vstore = _mm_store_si128*2# + */ + +static void +sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n) +{ + LOOP_BLOCK_ALIGN_VAR(op, @type@, 16) + op[i] = (ip[i] @op@ 0); + LOOP_BLOCKED(@type@, 16) { + @vtype@ a = @vloadu@((__m128i*)&ip[i]); +#if @not@ + const @vtype@ zero = @vpre@_setzero_@vsuf@(); + const @vtype@ truemask = @vpre@_set1_epi8(1 == 1); + /* equivalent to byte_to_true but can skip the negation */ + a = @vpre@_cmpeq_epi8(a, zero); + a = @vpre@_and_@vsuf@(a, truemask); +#else + /* abs is kind of pointless but maybe its used for byte_to_true */ + a = byte_to_true(a); +#endif + 
@vstore@((__m128i*)&op[i], a); + } + LOOP_BLOCKED_END { + op[i] = (ip[i] @op@ 0); + } +} + +/**end repeat**/ + #endif /* HAVE_EMMINTRIN_H */ #endif diff --git a/numpy/core/tests/test_numeric.py b/numpy/core/tests/test_numeric.py index 5c8de3734..ed4e0b79e 100644 --- a/numpy/core/tests/test_numeric.py +++ b/numpy/core/tests/test_numeric.py @@ -223,6 +223,76 @@ class TestBoolScalar(TestCase): self.assertTrue((f ^ f) is f) +class TestBoolArray(TestCase): + def setUp(self): + # offset for simd tests + self.t = array([True] * 41, dtype=np.bool)[1::] + self.f = array([False] * 41, dtype=np.bool)[1::] + self.o = array([False] * 42, dtype=np.bool)[2::] + self.nm = self.f.copy() + self.im = self.t.copy() + self.nm[3] = True + self.nm[-2] = True + self.im[3] = False + self.im[-2] = False + + def test_all_any(self): + self.assertTrue(self.t.all()) + self.assertTrue(self.t.any()) + self.assertFalse(self.f.all()) + self.assertFalse(self.f.any()) + self.assertTrue(self.nm.any()) + self.assertTrue(self.im.any()) + self.assertFalse(self.nm.all()) + self.assertFalse(self.im.all()) + + def test_logical_not_abs(self): + assert_array_equal(~self.t, self.f) + assert_array_equal(np.abs(~self.t), self.f) + assert_array_equal(np.abs(~self.f), self.t) + assert_array_equal(np.abs(self.f), self.f) + assert_array_equal(~np.abs(self.f), self.t) + assert_array_equal(~np.abs(self.t), self.f) + assert_array_equal(np.abs(~self.nm), self.im) + np.logical_not(self.t, out=self.o) + assert_array_equal(self.o, self.f) + np.abs(self.t, out=self.o) + assert_array_equal(self.o, self.t) + + def test_logical_and_or_xor(self): + assert_array_equal(self.t | self.t, self.t) + assert_array_equal(self.f | self.f, self.f) + assert_array_equal(self.t | self.f, self.t) + assert_array_equal(self.f | self.t, self.t) + np.logical_or(self.t, self.t, out=self.o) + assert_array_equal(self.o, self.t) + assert_array_equal(self.t & self.t, self.t) + assert_array_equal(self.f & self.f, self.f) + assert_array_equal(self.t & 
self.f, self.f) + assert_array_equal(self.f & self.t, self.f) + np.logical_and(self.t, self.t, out=self.o) + assert_array_equal(self.o, self.t) + assert_array_equal(self.t ^ self.t, self.f) + assert_array_equal(self.f ^ self.f, self.f) + assert_array_equal(self.t ^ self.f, self.t) + assert_array_equal(self.f ^ self.t, self.t) + np.logical_xor(self.t, self.t, out=self.o) + assert_array_equal(self.o, self.f) + + assert_array_equal(self.nm & self.t, self.nm) + assert_array_equal(self.im & self.f, False) + assert_array_equal(self.nm & True, self.nm) + assert_array_equal(self.im & False, self.f) + assert_array_equal(self.nm | self.t, self.t) + assert_array_equal(self.im | self.f, self.im) + assert_array_equal(self.nm | True, self.t) + assert_array_equal(self.im | False, self.im) + assert_array_equal(self.nm ^ self.t, self.im) + assert_array_equal(self.im ^ self.f, self.im) + assert_array_equal(self.nm ^ True, self.im) + assert_array_equal(self.im ^ False, self.im) + + class TestSeterr(TestCase): def test_default(self): err = geterr() |