summary refs log tree commit diff
path: root/numpy
diff options
context:
space:
mode:
author Charles Harris <charlesr.harris@gmail.com> 2013-08-10 10:53:34 -0700
committer Charles Harris <charlesr.harris@gmail.com> 2013-08-10 10:53:34 -0700
commit c6da120806f70d417619e1a34512f38dbd0dcc8d (patch)
tree 277f03b5f7134c281d36f9323eb90dfe82fa9b87 /numpy
parent 78801c50ad314edae040968e5ffcd0d27dd70a45 (diff)
parent 7819817653003fdae4554cbfab4cdbedf824c305 (diff)
download numpy-c6da120806f70d417619e1a34512f38dbd0dcc8d.tar.gz
Merge pull request #3517 from juliantaylor/memchr-all-any
ENH: use memchr for unit stride all/any
Diffstat (limited to 'numpy')
-rw-r--r-- numpy/core/src/umath/loops.c.src 34
-rw-r--r-- numpy/core/src/umath/simd.inc.src 9
-rw-r--r-- numpy/core/tests/test_numeric.py 17
3 files changed, 57 insertions, 3 deletions
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 0559fb416..d99fafaf2 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -20,6 +20,8 @@
#include "ufunc_object.h"
+#include <string.h> /* for memchr */
+
/*
* include vectorized functions and dispatchers
@@ -555,15 +557,47 @@ BOOL_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED
* #kind = logical_and, logical_or#
* #OP = &&, ||#
* #SC = ==, !=#
+ * #and = 1, 0#
**/
NPY_NO_EXPORT void
BOOL_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
if(IS_BINARY_REDUCE) {
+#ifdef HAVE_EMMINTRIN_H
+ /*
+ * stick with our variant for more reliable performance, only known
+ * platform which outperforms it by ~20% is an i7 with glibc 2.17
+ */
if (run_reduce_simd_@kind@_BOOL(args, dimensions, steps)) {
return;
}
+#else
+ /* for now only use libc on 32-bit/non-x86 */
+ if (steps[1] == 1) {
+ npy_bool * op = (npy_bool *)args[0];
+#if @and@
+ /* np.all(), search for a zero (false) */
+ if (*op) {
+ *op = memchr(args[1], 0, dimensions[0]) == NULL;
+ }
+#else
+ /*
+ * np.any(), search for a non-zero (true) via comparing against
+ * zero blocks, memcmp is faster than memchr on SSE4 machines
+ * with glibc >= 2.12 and memchr can only check for equal 1
+ */
+ static const npy_bool zero[4096]; /* zero by C standard */
+ npy_uintp i, n = dimensions[0];
+ for (i = 0; !*op && i < n - (n % sizeof(zero)); i += sizeof(zero)) {
+ *op = memcmp(&args[1][i], zero, sizeof(zero)) != 0;
+ }
+ if (!*op && n - i > 0)
+ *op = memcmp(&args[1][i], zero, n - i) != 0;
+#endif
+ return;
+ }
+#endif
BINARY_REDUCE_LOOP(npy_bool) {
const npy_bool in2 = *(npy_bool *)ip2;
io1 = io1 @OP@ in2;
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 98e2beb30..2f1c3055b 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -779,14 +779,17 @@ sse2_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, const npy_intp n)
return;
}
}
- LOOP_BLOCKED(npy_bool, 16) {
+ /* unrolled once to replace a slow movmsk with a fast pmaxb */
+ LOOP_BLOCKED(npy_bool, 32) {
@vtype@ v = @vload@((@vtype@*)&ip[i]);
+ @vtype@ v2 = @vload@((@vtype@*)&ip[i + 16]);
v = @vpre@_cmpeq_epi8(v, zero);
+ v2 = @vpre@_cmpeq_epi8(v2, zero);
#if @and@
- if ((@vpre@_movemask_epi8(v) != 0)) {
+ if ((@vpre@_movemask_epi8(@vpre@_max_epu8(v, v2)) != 0)) {
*op = 0;
#else
- if ((@vpre@_movemask_epi8(v) != 0xFFFF)) {
+ if ((@vpre@_movemask_epi8(@vpre@_min_epu8(v, v2)) != 0xFFFF)) {
*op = 1;
#endif
return;
diff --git a/numpy/core/tests/test_numeric.py b/numpy/core/tests/test_numeric.py
index 1be0f4105..782ddd687 100644
--- a/numpy/core/tests/test_numeric.py
+++ b/numpy/core/tests/test_numeric.py
@@ -245,6 +245,23 @@ class TestBoolArray(TestCase):
self.assertTrue(self.im.any())
self.assertFalse(self.nm.all())
self.assertFalse(self.im.all())
+ # check bad element in all positions
+ for i in range(256 - 7):
+ d = array([False] * 256, dtype=np.bool)[7::]
+ d[i] = True
+ self.assertTrue(np.any(d))
+ e = array([True] * 256, dtype=np.bool)[7::]
+ e[i] = False
+ self.assertFalse(np.all(e))
+ assert_array_equal(e, ~d)
+ # big array test for blocked libc loops
+ for i in list(range(9, 6000, 507)) + [7764, 90021, -10]:
+ d = array([False] * 100043, dtype=np.bool)
+ d[i] = True
+ self.assertTrue(np.any(d), msg="%r" % i)
+ e = array([True] * 100043, dtype=np.bool)
+ e[i] = False
+ self.assertFalse(np.all(e), msg="%r" % i)
def test_logical_not_abs(self):
assert_array_equal(~self.t, self.f)