diff options
author | Charles Harris <charlesr.harris@gmail.com> | 2013-08-10 10:53:34 -0700 |
---|---|---|
committer | Charles Harris <charlesr.harris@gmail.com> | 2013-08-10 10:53:34 -0700 |
commit | c6da120806f70d417619e1a34512f38dbd0dcc8d (patch) | |
tree | 277f03b5f7134c281d36f9323eb90dfe82fa9b87 /numpy | |
parent | 78801c50ad314edae040968e5ffcd0d27dd70a45 (diff) | |
parent | 7819817653003fdae4554cbfab4cdbedf824c305 (diff) | |
download | numpy-c6da120806f70d417619e1a34512f38dbd0dcc8d.tar.gz |
Merge pull request #3517 from juliantaylor/memchr-all-any
ENH: use memchr for unit stride all/any
Diffstat (limited to 'numpy')
-rw-r--r-- | numpy/core/src/umath/loops.c.src | 34 |
-rw-r--r-- | numpy/core/src/umath/simd.inc.src | 9 |
-rw-r--r-- | numpy/core/tests/test_numeric.py | 17 |
3 files changed, 57 insertions, 3 deletions
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 0559fb416..d99fafaf2 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -20,6 +20,8 @@ #include "ufunc_object.h" +#include <string.h> /* for memchr */ + /* * include vectorized functions and dispatchers @@ -555,15 +557,47 @@ BOOL_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED * #kind = logical_and, logical_or# * #OP = &&, ||# * #SC = ==, !=# + * #and = 1, 0# **/ NPY_NO_EXPORT void BOOL_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) { if(IS_BINARY_REDUCE) { +#ifdef HAVE_EMMINTRIN_H + /* + * stick with our variant for more reliable performance, only known + * platform which outperforms it by ~20% is an i7 with glibc 2.17 + */ if (run_reduce_simd_@kind@_BOOL(args, dimensions, steps)) { return; } +#else + /* for now only use libc on 32-bit/non-x86 */ + if (steps[1] == 1) { + npy_bool * op = (npy_bool *)args[0]; +#if @and@ + /* np.all(), search for a zero (false) */ + if (*op) { + *op = memchr(args[1], 0, dimensions[0]) == NULL; + } +#else + /* + * np.any(), search for a non-zero (true) via comparing against + * zero blocks, memcmp is faster than memchr on SSE4 machines + * with glibc >= 2.12 and memchr can only check for equal 1 + */ + static const npy_bool zero[4096]; /* zero by C standard */ + npy_uintp i, n = dimensions[0]; + for (i = 0; !*op && i < n - (n % sizeof(zero)); i += sizeof(zero)) { + *op = memcmp(&args[1][i], zero, sizeof(zero)) != 0; + } + if (!*op && n - i > 0) + *op = memcmp(&args[1][i], zero, n - i) != 0; +#endif + return; + } +#endif BINARY_REDUCE_LOOP(npy_bool) { const npy_bool in2 = *(npy_bool *)ip2; io1 = io1 @OP@ in2; diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index 98e2beb30..2f1c3055b 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -779,14 +779,17 @@ 
sse2_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, const npy_intp n) return; } } - LOOP_BLOCKED(npy_bool, 16) { + /* unrolled once to replace a slow movmsk with a fast pmaxb */ + LOOP_BLOCKED(npy_bool, 32) { @vtype@ v = @vload@((@vtype@*)&ip[i]); + @vtype@ v2 = @vload@((@vtype@*)&ip[i + 16]); v = @vpre@_cmpeq_epi8(v, zero); + v2 = @vpre@_cmpeq_epi8(v2, zero); #if @and@ - if ((@vpre@_movemask_epi8(v) != 0)) { + if ((@vpre@_movemask_epi8(@vpre@_max_epu8(v, v2)) != 0)) { *op = 0; #else - if ((@vpre@_movemask_epi8(v) != 0xFFFF)) { + if ((@vpre@_movemask_epi8(@vpre@_min_epu8(v, v2)) != 0xFFFF)) { *op = 1; #endif return; diff --git a/numpy/core/tests/test_numeric.py b/numpy/core/tests/test_numeric.py index 1be0f4105..782ddd687 100644 --- a/numpy/core/tests/test_numeric.py +++ b/numpy/core/tests/test_numeric.py @@ -245,6 +245,23 @@ class TestBoolArray(TestCase): self.assertTrue(self.im.any()) self.assertFalse(self.nm.all()) self.assertFalse(self.im.all()) + # check bad element in all positions + for i in range(256 - 7): + d = array([False] * 256, dtype=np.bool)[7::] + d[i] = True + self.assertTrue(np.any(d)) + e = array([True] * 256, dtype=np.bool)[7::] + e[i] = False + self.assertFalse(np.all(e)) + assert_array_equal(e, ~d) + # big array test for blocked libc loops + for i in list(range(9, 6000, 507)) + [7764, 90021, -10]: + d = array([False] * 100043, dtype=np.bool) + d[i] = True + self.assertTrue(np.any(d), msg="%r" % i) + e = array([True] * 100043, dtype=np.bool) + e[i] = False + self.assertFalse(np.all(e), msg="%r" % i) def test_logical_not_abs(self): assert_array_equal(~self.t, self.f) |