author     Julian Taylor <jtaylor.debian@googlemail.com>  2015-11-11 19:34:23 +0100
committer  Julian Taylor <jtaylor.debian@googlemail.com>  2015-11-16 21:10:46 +0100
commit     904da7c202384c8a2a6ec88cece378f70e2dd956 (patch)
tree       92bdf5542fc7e4483ebbd4d94135450a0ccf5a0c /numpy/core
parent     1d511429ac04d137c3d9ec7da9160bec7baa2829 (diff)
download   numpy-904da7c202384c8a2a6ec88cece378f70e2dd956.tar.gz
ENH: use prefetching for summation
It seems the small blocksize (128) messes up the hardware prefetcher, which would otherwise handle this iteration pattern fine. Fix this by using software prefetching. Improves performance of large sums by 15%-30%. Tested on Core 2 Duo, Xeon E5-4620, i5-3470 and AMD Phenom II X4. Prefer __builtin_prefetch since, unlike SSE2's _mm_prefetch, it also works on capable non-x86 CPUs.
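A minimal standalone sketch of the idea, assuming GCC/Clang and a hypothetical helper name (this is not the NumPy loop itself):

#include <stddef.h>

/*
 * Sum a strided array while prefetching data 512 bytes ahead of the
 * current element, so the cache lines are already in flight when the
 * loop reaches them.  Prefetching past the end of the array is safe:
 * the prefetch instruction does not fault on invalid addresses.
 */
double
strided_sum_prefetch(const double *a, size_t n, ptrdiff_t stride)
{
    double s = 0.0;
    size_t i;

    for (i = 0; i < n; i++) {
        /* rw = 0 (read), loc = 3 (keep in all cache levels) */
        __builtin_prefetch(&a[(i + 512 / sizeof(a[0])) * stride], 0, 3);
        s += a[i * stride];
    }
    return s;
}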
Diffstat (limited to 'numpy/core')
-rw-r--r--   numpy/core/include/numpy/npy_common.h   15
-rw-r--r--   numpy/core/setup_common.py                3
-rw-r--r--   numpy/core/src/umath/loops.c.src          4
3 files changed, 22 insertions, 0 deletions
diff --git a/numpy/core/include/numpy/npy_common.h b/numpy/core/include/numpy/npy_common.h
index eff5dd339..47ef94c92 100644
--- a/numpy/core/include/numpy/npy_common.h
+++ b/numpy/core/include/numpy/npy_common.h
@@ -61,6 +61,21 @@
#define NPY_UNLIKELY(x) (x)
#endif
+#ifdef HAVE___BUILTIN_PREFETCH
+/* unlike _mm_prefetch also works on non-x86 */
+#define NPY_PREFETCH(x, rw, loc) __builtin_prefetch((x), (rw), (loc))
+#else
+#ifdef HAVE__MM_PREFETCH
+/* _MM_HINT_ET[01] (rw = 1) unsupported, only available in gcc >= 4.9 */
+#define NPY_PREFETCH(x, rw, loc) _mm_prefetch((x), loc == 0 ? _MM_HINT_NTA : \
+ (loc == 1 ? _MM_HINT_T2 : \
+ (loc == 2 ? _MM_HINT_T1 : \
+ (loc == 3 ? _MM_HINT_T0 : -1))))
+#else
+#define NPY_PREFETCH(x, rw, loc)
+#endif
+#endif
+
#if defined(_MSC_VER)
#define NPY_INLINE __inline
#elif defined(__GNUC__)
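For reference, a sketch (DEMO_PREFETCH is a hypothetical name, not part of the patch) of what the call used later in loops.c.src, NPY_PREFETCH(p, 0, 3), reduces to on each branch:

/* The HAVE_* macros come from the configure checks added in setup_common.py. */
#if defined(HAVE___BUILTIN_PREFETCH)
/* rw = 0 (read access), loc = 3 (keep in all cache levels) */
#define DEMO_PREFETCH(p) __builtin_prefetch((p), 0, 3)
#elif defined(HAVE__MM_PREFETCH)
#include <xmmintrin.h>
/* loc == 3 selects _MM_HINT_T0; rw is dropped because _MM_HINT_ET* needs gcc >= 4.9 */
#define DEMO_PREFETCH(p) _mm_prefetch((p), _MM_HINT_T0)
#else
/* no prefetch intrinsic available: the macro expands to nothing */
#define DEMO_PREFETCH(p)
#endif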
diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py
index 68efd1791..d93e475e3 100644
--- a/numpy/core/setup_common.py
+++ b/numpy/core/setup_common.py
@@ -125,7 +125,10 @@ OPTIONAL_INTRINSICS = [("__builtin_isnan", '5.'),
("__builtin_expect", '5, 0'),
("__builtin_mul_overflow", '5, 5, (int*)5'),
("_mm_load_ps", '(float*)0', "xmmintrin.h"), # SSE
+ ("_mm_prefetch", '(float*)0, _MM_HINT_NTA',
+ "xmmintrin.h"), # SSE
("_mm_load_pd", '(double*)0', "emmintrin.h"), # SSE2
+ ("__builtin_prefetch", "(float*)0, 0, 3"),
]
# function attributes
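These entries are compile-time probes: each one is tried in its own small test program, roughly like the sketch below, and a HAVE_<NAME> macro (e.g. HAVE___BUILTIN_PREFETCH) is defined on success so npy_common.h can pick the matching NPY_PREFETCH branch (the exact generated source is an assumption):

/* probe for __builtin_prefetch, using the argument string from the table;
 * a similar one-liner, including xmmintrin.h, is built for _mm_prefetch
 * with '(float*)0, _MM_HINT_NTA' */
int main(void)
{
    __builtin_prefetch((float *)0, 0, 3);
    return 0;
}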
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 854c1e17a..aff6180c7 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1444,6 +1444,8 @@ pairwise_sum_@TYPE@(@dtype@ *a, npy_uintp n, npy_intp stride)
r[7] = @trf@(a[7 * stride]);
for (i = 8; i < n - (n % 8); i += 8) {
+ /* small blocksizes seem to mess with hardware prefetch */
+ NPY_PREFETCH(&a[(i + 512 / sizeof(a[0])) * stride], 0, 3);
r[0] += @trf@(a[(i + 0) * stride]);
r[1] += @trf@(a[(i + 1) * stride]);
r[2] += @trf@(a[(i + 2) * stride]);
@@ -2190,6 +2192,8 @@ pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, @ftype@ * a, npy_uintp n,
r[7] = a[6 * stride + 1];
for (i = 8; i < n - (n % 8); i += 8) {
+ /* small blocksizes seem to mess with hardware prefetch */
+ NPY_PREFETCH(&a[(i + 512 / sizeof(a[0])) * stride], 0, 3);
r[0] += a[(i + 0) * stride];
r[1] += a[(i + 0) * stride + 1];
r[2] += a[(i + 2) * stride];
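The lookahead in both hunks is written in bytes and divided down to elements, so the prefetch always runs a fixed 512 bytes ahead regardless of element size; a small sanity check of that arithmetic (illustrative only, not part of the patch):

#include <stdio.h>

int main(void)
{
    /* 512 bytes ahead of index i, converted to element counts */
    printf("double: %zu elements ahead\n", 512 / sizeof(double)); /* 64  */
    printf("float:  %zu elements ahead\n", 512 / sizeof(float));  /* 128 */
    return 0;
}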