author     Julian Taylor <jtaylor.debian@googlemail.com>    2013-06-10 23:02:36 +0200
committer  Julian Taylor <jtaylor.debian@googlemail.com>    2013-06-13 18:52:31 +0200
commit     75df68b5ca238eabc2de144dce08d481c59ffcaf (patch)
tree       c52fe52ada823c87ce5b264e5fecedecf56566a2 /numpy/core
parent     266a968d5d9b3cb5be59e30b697f4e9876c3a00c (diff)
download   numpy-75df68b5ca238eabc2de144dce08d481c59ffcaf.tar.gz
ENH: tell gcc to unroll strided copy loops
The strided copy loops profit a lot from unrolling because the number of
operations executed in each iteration is very small.
GCC needs to be told explicitly to unroll them, even at -O3.
Unrolling is only profitable if the move can be done in a single
instruction; otherwise the increased code size makes it slower, so the
flag is only used for operations on element sizes less than or equal to
the native pointer size.
Tested to improve performance by 20-50% on Intel Core2 Duo, Xeon
5xxx/7xxx and AMD Phenom X4.
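
To make the mechanism described above concrete, here is a minimal standalone
sketch (the macro and function names are hypothetical, not the ones introduced
by this commit): a GCC function attribute requests unrolling for one strided
copy routine whose per-iteration work is a single 4-byte move.

    #include <stddef.h>
    #include <stdio.h>
    #include <string.h>

    /* GCC-only: ask for loop unrolling on this one function even at plain -O3;
     * expands to nothing on other compilers. */
    #if defined(__GNUC__)
    #define UNROLL_LOOPS __attribute__((optimize("unroll-loops")))
    #else
    #define UNROLL_LOOPS
    #endif

    /* Strided copy of 4-byte elements: each iteration is a single aligned
     * 4-byte move plus two pointer bumps, the case where unrolling pays off. */
    void UNROLL_LOOPS
    strided_copy_size4(char *dst, ptrdiff_t dst_stride,
                       const char *src, ptrdiff_t src_stride, size_t n)
    {
        while (n--) {
            memcpy(dst, src, 4);            /* one mov on x86 for size 4 */
            dst += dst_stride;
            src += src_stride;
        }
    }

    int main(void)
    {
        int mat[3][4] = {{0, 1, 2, 3}, {4, 5, 6, 7}, {8, 9, 10, 11}};
        int col[3];

        /* Gather column 2 of the 3x4 matrix: source stride is one row. */
        strided_copy_size4((char *)col, sizeof(int),
                           (const char *)&mat[0][2], sizeof(mat[0]), 3);
        printf("%d %d %d\n", col[0], col[1], col[2]);   /* prints: 2 6 10 */
        return 0;
    }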
Diffstat (limited to 'numpy/core')
-rw-r--r--  numpy/core/include/numpy/npy_common.h                   | 14
-rw-r--r--  numpy/core/setup.py                                     |  7
-rw-r--r--  numpy/core/setup_common.py                              |  7
-rw-r--r--  numpy/core/src/multiarray/lowlevel_strided_loops.c.src  | 14
4 files changed, 40 insertions, 2 deletions
diff --git a/numpy/core/include/numpy/npy_common.h b/numpy/core/include/numpy/npy_common.h
index 30829f929..2dccc575e 100644
--- a/numpy/core/include/numpy/npy_common.h
+++ b/numpy/core/include/numpy/npy_common.h
@@ -3,6 +3,20 @@
 /* numpconfig.h is auto-generated */
 #include "numpyconfig.h"
+#ifdef HAVE_NPY_CONFIG_H
+#include <npy_config.h>
+#endif
+
+/*
+ * gcc does not unroll even with -O3
+ * use with care, unrolling on modern cpus rarely speeds things up
+ */
+#ifdef HAVE_ATTRIBUTE_OPTIMIZE_UNROLL_LOOPS
+#define NPY_GCC_UNROLL_LOOPS \
+        __attribute__((optimize("unroll-loops")))
+#else
+#define NPY_GCC_UNROLL_LOOPS
+#endif
 
 #if defined(_MSC_VER)
         #define NPY_INLINE __inline
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 926142b55..b48414c2d 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -161,7 +161,6 @@ def check_math_capabilities(config, moredefs, mathlibs):
 
     check_funcs(OPTIONAL_STDFUNCS)
 
-
     for h in OPTIONAL_HEADERS:
         if config.check_func("", decl=False, call=False, headers=[h]):
             moredefs.append((fname2def(h).replace(".", "_"), 1))
@@ -170,6 +169,12 @@ def check_math_capabilities(config, moredefs, mathlibs):
         if config.check_func(f, decl=False, call=True, call_args=args):
             moredefs.append((fname2def(f), 1))
 
+    for dec, fn in OPTIONAL_GCC_ATTRIBUTES:
+        if config.check_funcs_once([fn],
+                                   decl=dict((('%s %s' % (dec, fn), True),)),
+                                   call=False):
+            moredefs.append((fname2def(fn), 1))
+
     # C99 functions: float and long double versions
     check_funcs(C99_FUNCS_SINGLE)
     check_funcs(C99_FUNCS_EXTENDED)
diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py
index 284acfe21..cb30c83c9 100644
--- a/numpy/core/setup_common.py
+++ b/numpy/core/setup_common.py
@@ -113,6 +113,13 @@ OPTIONAL_INTRINSICS = [("__builtin_isnan", '5.'),
                        ("__builtin_bswap64", '5u'),
                        ]
 
+# gcc function attributes
+# (attribute as understood by gcc, function name),
+# function name will be converted to HAVE_<upper-case-name> preprocessor macro
+OPTIONAL_GCC_ATTRIBUTES = [('__attribute__((optimize("unroll-loops")))',
+                            'attribute_optimize_unroll_loops'),
+                           ]
+
 # Subset of OPTIONAL_STDFUNCS which may alreay have HAVE_* defined by Python.h
 OPTIONAL_STDFUNCS_MAYBE = ["expm1", "log1p", "acosh", "atanh", "asinh", "hypot",
                            "copysign"]
diff --git a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
index b0770168f..bc2279a98 100644
--- a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
+++ b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
@@ -99,7 +99,15 @@
 
 #if @is_swap@ || @src_contig@ == 0 || @dst_contig@ == 0
 
+/*
+ * unrolling gains about 20-50% if the copy can be done in one mov instruction
+ * if not it can decrease performance
+ * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
+ */
 static void
+#if @is_aligned@ && @is_swap@ == 0 && @elsize@ <= NPY_SIZEOF_INTP
+    NPY_GCC_UNROLL_LOOPS
+#endif
 @prefix@_@oper@_size@elsize@(char *dst, npy_intp dst_stride,
                         char *src, npy_intp src_stride,
                         npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
@@ -156,7 +164,11 @@ static void
 
 #endif
 
-/* specialized copy and swap for source stride 0 */
+/*
+ * specialized copy and swap for source stride 0,
+ * interestingly unrolling here is like above is only marginally profitable for
+ * small types and detrimental for >= 8byte moves on x86
+ */
 #if (@src_contig@ == 0) && @is_aligned@
 static void
 @prefix@_@oper@_size@elsize@_srcstride0(char *dst,
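
For reference, the configure probe added in setup_common.py/setup.py amounts to
checking that a declaration carrying the attribute compiles; a rough standalone
approximation (not the exact source numpy.distutils generates) is:

    /* Compiles only if the compiler accepts the attribute; a successful
     * build is what ends up defining HAVE_ATTRIBUTE_OPTIMIZE_UNROLL_LOOPS. */
    int __attribute__((optimize("unroll-loops")))
    attribute_optimize_unroll_loops(void)
    {
        return 0;
    }

    int main(void)
    {
        return attribute_optimize_unroll_loops();
    }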