author     Julian Taylor <jtaylor.debian@googlemail.com>    2013-06-10 23:02:36 +0200
committer  Julian Taylor <jtaylor.debian@googlemail.com>    2013-06-13 18:52:31 +0200
commit     75df68b5ca238eabc2de144dce08d481c59ffcaf (patch)
tree       c52fe52ada823c87ce5b264e5fecedecf56566a2 /numpy/core
parent     266a968d5d9b3cb5be59e30b697f4e9876c3a00c (diff)
download   numpy-75df68b5ca238eabc2de144dce08d481c59ffcaf.tar.gz
ENH: tell gcc to unroll strided copy loops
The strided copy loops profit a lot from unrolling because the number of
operations executed in each iteration is very small.
GCC needs to be told explicitly to unroll them, even at -O3.
Unrolling is only profitable if the move can be done in a single
instruction; otherwise the increased code size makes it slower, so the
flag is only used for operations on element sizes less than or equal to
the native pointer size.
Tested to improve performance by 20-50% on Intel Core2 Duo, Xeon
5xxx/7xxx and AMD Phenom X4.
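
To make the mechanism described above concrete, here is a minimal standalone
sketch (the macro and function names are hypothetical, not the ones introduced
by this commit): a GCC function attribute requests unrolling for one strided
copy routine whose per-iteration work is a single 4-byte move.

    #include <stddef.h>
    #include <stdio.h>
    #include <string.h>

    /* GCC-only: ask for loop unrolling on this one function even at plain -O3;
     * expands to nothing on other compilers. */
    #if defined(__GNUC__)
    #define UNROLL_LOOPS __attribute__((optimize("unroll-loops")))
    #else
    #define UNROLL_LOOPS
    #endif

    /* Strided copy of 4-byte elements: each iteration is a single aligned
     * 4-byte move plus two pointer bumps, the case where unrolling pays off. */
    void UNROLL_LOOPS
    strided_copy_size4(char *dst, ptrdiff_t dst_stride,
                       const char *src, ptrdiff_t src_stride, size_t n)
    {
        while (n--) {
            memcpy(dst, src, 4);            /* one mov on x86 for size 4 */
            dst += dst_stride;
            src += src_stride;
        }
    }

    int main(void)
    {
        int mat[3][4] = {{0, 1, 2, 3}, {4, 5, 6, 7}, {8, 9, 10, 11}};
        int col[3];

        /* Gather column 2 of the 3x4 matrix: source stride is one row. */
        strided_copy_size4((char *)col, sizeof(int),
                           (const char *)&mat[0][2], sizeof(mat[0]), 3);
        printf("%d %d %d\n", col[0], col[1], col[2]);   /* prints: 2 6 10 */
        return 0;
    }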
Diffstat (limited to 'numpy/core')
-rw-r--r--  numpy/core/include/numpy/npy_common.h                   | 14
-rw-r--r--  numpy/core/setup.py                                     |  7
-rw-r--r--  numpy/core/setup_common.py                              |  7
-rw-r--r--  numpy/core/src/multiarray/lowlevel_strided_loops.c.src  | 14
4 files changed, 40 insertions, 2 deletions
diff --git a/numpy/core/include/numpy/npy_common.h b/numpy/core/include/numpy/npy_common.h
index 30829f929..2dccc575e 100644
--- a/numpy/core/include/numpy/npy_common.h
+++ b/numpy/core/include/numpy/npy_common.h
@@ -3,6 +3,20 @@
 /* numpconfig.h is auto-generated */
 #include "numpyconfig.h"
+#ifdef HAVE_NPY_CONFIG_H
+#include <npy_config.h>
+#endif
+
+/*
+ * gcc does not unroll even with -O3
+ * use with care, unrolling on modern cpus rarely speeds things up
+ */
+#ifdef HAVE_ATTRIBUTE_OPTIMIZE_UNROLL_LOOPS
+#define NPY_GCC_UNROLL_LOOPS \
+        __attribute__((optimize("unroll-loops")))
+#else
+#define NPY_GCC_UNROLL_LOOPS
+#endif
 
 #if defined(_MSC_VER)
         #define NPY_INLINE __inline
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 926142b55..b48414c2d 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -161,7 +161,6 @@ def check_math_capabilities(config, moredefs, mathlibs):
 
     check_funcs(OPTIONAL_STDFUNCS)
 
-
     for h in OPTIONAL_HEADERS:
         if config.check_func("", decl=False, call=False, headers=[h]):
             moredefs.append((fname2def(h).replace(".", "_"), 1))
@@ -170,6 +169,12 @@ def check_math_capabilities(config, moredefs, mathlibs):
         if config.check_func(f, decl=False, call=True, call_args=args):
             moredefs.append((fname2def(f), 1))
 
+    for dec, fn in OPTIONAL_GCC_ATTRIBUTES:
+        if config.check_funcs_once([fn],
+                                   decl=dict((('%s %s' % (dec, fn), True),)),
+                                   call=False):
+            moredefs.append((fname2def(fn), 1))
+
     # C99 functions: float and long double versions
     check_funcs(C99_FUNCS_SINGLE)
     check_funcs(C99_FUNCS_EXTENDED)
diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py
index 284acfe21..cb30c83c9 100644
--- a/numpy/core/setup_common.py
+++ b/numpy/core/setup_common.py
@@ -113,6 +113,13 @@ OPTIONAL_INTRINSICS = [("__builtin_isnan", '5.'),
                        ("__builtin_bswap64", '5u'),
                        ]
 
+# gcc function attributes
+# (attribute as understood by gcc, function name),
+# function name will be converted to HAVE_<upper-case-name> preprocessor macro
+OPTIONAL_GCC_ATTRIBUTES = [('__attribute__((optimize("unroll-loops")))',
+                            'attribute_optimize_unroll_loops'),
+                           ]
+
 # Subset of OPTIONAL_STDFUNCS which may alreay have HAVE_* defined by Python.h
 OPTIONAL_STDFUNCS_MAYBE = ["expm1", "log1p", "acosh", "atanh", "asinh", "hypot",
                            "copysign"]
diff --git a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
index b0770168f..bc2279a98 100644
--- a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
+++ b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
@@ -99,7 +99,15 @@
 
 #if @is_swap@ || @src_contig@ == 0 || @dst_contig@ == 0
 
+/*
+ * unrolling gains about 20-50% if the copy can be done in one mov instruction
+ * if not it can decrease performance
+ * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
+ */
 static void
+#if @is_aligned@ && @is_swap@ == 0 && @elsize@ <= NPY_SIZEOF_INTP
+    NPY_GCC_UNROLL_LOOPS
+#endif
 @prefix@_@oper@_size@elsize@(char *dst, npy_intp dst_stride,
                         char *src, npy_intp src_stride,
                         npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
@@ -156,7 +164,11 @@ static void
 
 #endif
 
-/* specialized copy and swap for source stride 0 */
+/*
+ * specialized copy and swap for source stride 0,
+ * interestingly unrolling here is like above is only marginally profitable for
+ * small types and detrimental for >= 8byte moves on x86
+ */
 #if (@src_contig@ == 0) && @is_aligned@
 static void
 @prefix@_@oper@_size@elsize@_srcstride0(char *dst,
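
For reference, the configure probe added in setup_common.py/setup.py amounts to
checking that a declaration carrying the attribute compiles; a rough standalone
approximation (not the exact source numpy.distutils generates) is:

    /* Compiles only if the compiler accepts the attribute; a successful
     * build is what ends up defining HAVE_ATTRIBUTE_OPTIMIZE_UNROLL_LOOPS. */
    int __attribute__((optimize("unroll-loops")))
    attribute_optimize_unroll_loops(void)
    {
        return 0;
    }

    int main(void)
    {
        return attribute_optimize_unroll_loops();
    }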