author     Julian Taylor <jtaylor.debian@googlemail.com>  2013-10-16 19:49:47 +0200
committer  Julian Taylor <jtaylor.debian@googlemail.com>  2013-10-17 01:01:59 +0200
commit     7d4ea165817fc613c79bb92ccb3844df94d1beed (patch)
tree       708b52ceaefb48fc41343cf0d81ac8fdd9fab4f0 /numpy
parent     935017cbc3a336f66a110b53d2b595fffc8adbd7 (diff)
download   numpy-7d4ea165817fc613c79bb92ccb3844df94d1beed.tar.gz
ENH: improve npy_memchr(p, 0) by using __builtin_ctz (tzcnt on x86)
Improves sparse mask performance by about a factor of three; the worst
case of no consecutive mask elements slows down by about 10%-15%.
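For readers unfamiliar with the trick, the fast path added below consumes all-zero mask bytes one machine word at a time and uses __builtin_ctz to locate the first non-zero byte inside the terminating word. A minimal standalone sketch of the idea (not the committed code, which follows in the diff; skip_zero_bytes is a hypothetical name, and a GCC/Clang compiler plus a little-endian target are assumed):

    #include <stddef.h>
    #include <string.h>

    /*
     * Offset of the first non-zero byte in buf[0..size), or size if all
     * bytes are zero. Mirrors the commit's word-at-a-time skip scan.
     */
    static size_t
    skip_zero_bytes(const char *buf, size_t size)
    {
        const char *p = buf;
        const char *end = buf + size;
        while ((size_t)(end - p) >= sizeof(unsigned int)) {
            unsigned int v;
            memcpy(&v, p, sizeof(v));  /* alignment-safe word load */
            if (v != 0) {
                /* lowest set byte of a little-endian word = first in memory */
                return (size_t)(p - buf) + __builtin_ctz(v) / 8;
            }
            p += sizeof(unsigned int);  /* whole word is zero, skip it */
        }
        while (p < end && *p == 0) {  /* byte-wise tail */
            p++;
        }
        return (size_t)(p - buf);
    }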
Diffstat (limited to 'numpy')
-rw-r--r--  numpy/core/setup_common.py           1
-rw-r--r--  numpy/core/src/multiarray/common.h  21
2 files changed, 22 insertions, 0 deletions
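The setup_common.py hunk registers __builtin_ctz in OPTIONAL_INTRINSICS, whose entries pair an intrinsic with the argument text to call it with; the build probes each entry by compiling a tiny program and, when it succeeds, defines a HAVE_ macro (here HAVE___BUILTIN_CTZ, the one tested in common.h). Roughly what the probe for this entry compiles, sketched rather than quoted from the build machinery:

    /* approximate configure-time probe for ("__builtin_ctz", '5') */
    int
    main(void)
    {
        __builtin_ctz(5);
        return 0;
    }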
diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py
index bad3607fa..4633aef84 100644
--- a/numpy/core/setup_common.py
+++ b/numpy/core/setup_common.py
@@ -116,6 +116,7 @@ OPTIONAL_INTRINSICS = [("__builtin_isnan", '5.'),
                        ("__builtin_bswap32", '5u'),
                        ("__builtin_bswap64", '5u'),
                        ("__builtin_expect", '5, 0'),
+                       ("__builtin_ctz", '5'),
                        ("_mm_load_ps", '(float*)0', "xmmintrin.h"),  # SSE
                        ("_mm_load_pd", '(double*)0', "emmintrin.h"),  # SSE2
                        ]
diff --git a/numpy/core/src/multiarray/common.h b/numpy/core/src/multiarray/common.h
index f94bd07d5..3e060de3d 100644
--- a/numpy/core/src/multiarray/common.h
+++ b/numpy/core/src/multiarray/common.h
@@ -1,6 +1,7 @@
 #ifndef _NPY_PRIVATE_COMMON_H_
 #define _NPY_PRIVATE_COMMON_H_
 #include <numpy/npy_common.h>
+#include <numpy/npy_cpu.h>
 
 #define error_converting(x) (((x) == -1) && PyErr_Occurred())
 
@@ -109,11 +110,31 @@ npy_memchr(char * haystack, char needle,
     }
 
     if (!invert) {
+        /*
+         * this is usually the path to determine elements to process,
+         * performance less important here.
+         * memchr has large setup cost if 0 byte is close to start.
+         */
         while (p < end && *p != needle) {
             p += stride;
         }
     }
     else {
+        /* usually find elements to skip path */
+#if (defined HAVE___BUILTIN_CTZ && defined NPY_CPU_HAVE_UNALIGNED_ACCESS)
+        if (needle == 0 && stride == 1) {
+            while (p < end - ((npy_uintp)end % sizeof(unsigned int))) {
+                unsigned int v = *(unsigned int*)p;
+                if (v == 0) {
+                    p += sizeof(unsigned int);
+                    continue;
+                }
+                p += __builtin_ctz(v) / 8;
+                *subloopsize = (p - haystack) / stride;
+                return p;
+            }
+        }
+#endif
         while (p < end && *p == needle) {
             p += stride;
         }
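The core of the fast path is p += __builtin_ctz(v) / 8: after a little-endian load, the byte at offset k within the word occupies bits 8k through 8k+7, so the count of trailing zero bits divided by eight is exactly the offset of the first non-zero byte. A throwaway demonstration of that arithmetic (GCC/Clang and a little-endian machine assumed; not part of the commit):

    #include <stdio.h>
    #include <string.h>

    int
    main(void)
    {
        /* bytes in memory: 00 00 05 00; a little-endian load gives 0x00050000 */
        unsigned char bytes[4] = {0x00, 0x00, 0x05, 0x00};
        unsigned int v;
        memcpy(&v, bytes, sizeof(v));
        /* 16 trailing zero bits / 8 = byte offset 2, the index of the 0x05 */
        printf("first non-zero byte at offset %d\n", __builtin_ctz(v) / 8);
        return 0;
    }

This also explains the #if guard: the branch dereferences p at arbitrary alignment, and NPY_CPU_HAVE_UNALIGNED_ACCESS is, as far as I can tell, only defined for x86/amd64 in this tree, which are little-endian; a big-endian port of the trick would need a leading-zero count instead.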