From 904da7c202384c8a2a6ec88cece378f70e2dd956 Mon Sep 17 00:00:00 2001
From: Julian Taylor <jtaylor.debian@googlemail.com>
Date: Wed, 11 Nov 2015 19:34:23 +0100
Subject: ENH: use prefetching for summation

It seems the small block size (128) messes up the hardware prefetcher,
which would usually work fine on this iteration pattern.
Fix this by using software prefetching. This improves performance for large
sums by 15%-30%. Tested on Core 2 Duo, Xeon E5-4620, i5-3470 and AMD Phenom II X4.

Prefer __builtin_prefetch because, unlike SSE2 _mm_prefetch, it also works
on capable non-x86 CPUs.
---
 numpy/core/src/umath/loops.c.src | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'numpy/core/src')

diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 854c1e17a..aff6180c7 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1444,6 +1444,8 @@ pairwise_sum_@TYPE@(@dtype@ *a, npy_uintp n, npy_intp stride)
         r[7] = @trf@(a[7 * stride]);
 
         for (i = 8; i < n - (n % 8); i += 8) {
+            /* small blocksizes seems to mess with hardware prefetch */
+            NPY_PREFETCH(&a[(i + 512 / sizeof(a[0])) * stride], 0, 3);
             r[0] += @trf@(a[(i + 0) * stride]);
             r[1] += @trf@(a[(i + 1) * stride]);
             r[2] += @trf@(a[(i + 2) * stride]);
@@ -2190,6 +2192,8 @@ pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, @ftype@ * a, npy_uintp n,
         r[7] = a[6 * stride + 1];
 
         for (i = 8; i < n - (n % 8); i += 8) {
+            /* small blocksizes seems to mess with hardware prefetch */
+            NPY_PREFETCH(&a[(i + 512 / sizeof(a[0])) * stride], 0, 3);
             r[0] += a[(i + 0) * stride];
             r[1] += a[(i + 0) * stride + 1];
             r[2] += a[(i + 2) * stride];
-- 
cgit v1.2.1