diff options
author | Julian Taylor <jtaylor.debian@googlemail.com> | 2013-10-24 23:07:19 +0200 |
---|---|---|
committer | Julian Taylor <jtaylor.debian@googlemail.com> | 2013-11-06 20:29:13 +0100 |
commit | dcfe3260e2d6d9de7a2c3fb0022ecf124faa2ae2 (patch) | |
tree | 54199a2f2da24be4d82553fd38a9d890934c649f | |
parent | 8988715e1aea7b10b230115aecae029cf6e80d66 (diff) | |
download | numpy-dcfe3260e2d6d9de7a2c3fb0022ecf124faa2ae2.tar.gz |
ENH: avoid expensive clears in fenv functions
Clearing is 50-100 times more expensive than checking on x86, so check if there
is anything needs to be cleared first. This speeds up scalar operations
by 10%-20%.
Based on Arink Verma code in #3739.
Implement the functions as new C-API functions npy_get_floatstatus and
npy_clear_floatstatus in npy_math.
-rw-r--r-- | doc/source/reference/c-api.coremath.rst | 41 | ||||
-rw-r--r-- | doc/source/reference/c-api.ufunc.rst | 2 | ||||
-rw-r--r-- | numpy/core/include/numpy/npy_math.h | 11 | ||||
-rw-r--r-- | numpy/core/include/numpy/ufuncobject.h | 149 | ||||
-rw-r--r-- | numpy/core/src/npymath/ieee754.c.src | 133 | ||||
-rw-r--r-- | numpy/core/src/umath/simd.inc.src | 8 | ||||
-rw-r--r-- | numpy/core/src/umath/ufunc_object.c | 17 | ||||
-rw-r--r-- | numpy/linalg/umath_linalg.c.src | 6 |
8 files changed, 226 insertions, 141 deletions
diff --git a/doc/source/reference/c-api.coremath.rst b/doc/source/reference/c-api.coremath.rst index 2d5aedc73..5c76bd601 100644 --- a/doc/source/reference/c-api.coremath.rst +++ b/doc/source/reference/c-api.coremath.rst @@ -150,6 +150,47 @@ Those can be useful for precise floating point comparison. .. versionadded:: 1.4.0 +.. cfunction:: void npy_set_floatstatus_divbyzero() + + Set the divide by zero floating point exception + + .. versionadded:: 1.6.0 + +.. cfunction:: void npy_set_floatstatus_overflow() + + Set the overflow floating point exception + + .. versionadded:: 1.6.0 + +.. cfunction:: void npy_set_floatstatus_underflow() + + Set the underflow floating point exception + + .. versionadded:: 1.6.0 + +.. cfunction:: void npy_set_floatstatus_invalid() + + Set the invalid floating point exception + + .. versionadded:: 1.6.0 + +.. cfunction:: int npy_get_floatstatus() + + Get floating point status. Returns a bitmask with following possible flags: + + * NPY_FPE_DIVIDEBYZERO + * NPY_FPE_OVERFLOW + * NPY_FPE_UNDERFLOW + * NPY_FPE_INVALID + + .. versionadded:: 1.9.0 + +.. cfunction:: int npy_clear_floatstatus() + + Clears the floating point status. Returns the previous status mask. + + .. versionadded:: 1.9.0 + Complex functions ~~~~~~~~~~~~~~~~~ diff --git a/doc/source/reference/c-api.ufunc.rst b/doc/source/reference/c-api.ufunc.rst index d4de28188..71abffd04 100644 --- a/doc/source/reference/c-api.ufunc.rst +++ b/doc/source/reference/c-api.ufunc.rst @@ -54,6 +54,8 @@ Macros .. cfunction:: UFUNC_CHECK_STATUS(ret) + Deprecated: use npy_clear_floatstatus from npy_math.h instead. + A macro that expands to platform-dependent code. The *ret* variable can can be any integer. The :cdata:`UFUNC_FPE_{ERR}` bits are set in *ret* according to the status of the corresponding error diff --git a/numpy/core/include/numpy/npy_math.h b/numpy/core/include/numpy/npy_math.h index 094286181..b7920460d 100644 --- a/numpy/core/include/numpy/npy_math.h +++ b/numpy/core/include/numpy/npy_math.h @@ -456,6 +456,17 @@ npy_clongdouble npy_csinl(npy_clongdouble z); * status word. */ +/* + * platform-dependent code translates floating point + * status to an integer sum of these values + */ +#define NPY_FPE_DIVIDEBYZERO 1 +#define NPY_FPE_OVERFLOW 2 +#define NPY_FPE_UNDERFLOW 4 +#define NPY_FPE_INVALID 8 + +int npy_get_floatstatus(void); +int npy_clear_floatstatus(void); void npy_set_floatstatus_divbyzero(void); void npy_set_floatstatus_overflow(void); void npy_set_floatstatus_underflow(void); diff --git a/numpy/core/include/numpy/ufuncobject.h b/numpy/core/include/numpy/ufuncobject.h index a6307df91..38e3dcf0f 100644 --- a/numpy/core/include/numpy/ufuncobject.h +++ b/numpy/core/include/numpy/ufuncobject.h @@ -2,6 +2,7 @@ #define Py_UFUNCOBJECT_H #include <numpy/npy_math.h> +#include <numpy/npy_common.h> #ifdef __cplusplus extern "C" { @@ -250,14 +251,6 @@ typedef struct _tagPyUFuncObject { #define UFUNC_SHIFT_INVALID 9 -/* platform-dependent code translates floating point - status to an integer sum of these values -*/ -#define UFUNC_FPE_DIVIDEBYZERO 1 -#define UFUNC_FPE_OVERFLOW 2 -#define UFUNC_FPE_UNDERFLOW 4 -#define UFUNC_FPE_INVALID 8 - #define UFUNC_OBJ_ISOBJECT 1 #define UFUNC_OBJ_NEEDS_API 2 @@ -333,141 +326,47 @@ typedef struct _loop1d_info { &(arg)->first))) \ goto fail;} while (0) -/* This code checks the IEEE status flags in a platform-dependent way */ -/* Adapted from Numarray */ - -#if (defined(__unix__) || defined(unix)) && !defined(USG) -#include <sys/param.h> -#endif - -/* OSF/Alpha (Tru64) ---------------------------------------------*/ -#if defined(__osf__) && defined(__alpha) - -#include <machine/fpu.h> -#define UFUNC_CHECK_STATUS(ret) { \ - unsigned long fpstatus; \ - \ - fpstatus = ieee_get_fp_control(); \ - /* clear status bits as well as disable exception mode if on */ \ - ieee_set_fp_control( 0 ); \ - ret = ((IEEE_STATUS_DZE & fpstatus) ? UFUNC_FPE_DIVIDEBYZERO : 0) \ - | ((IEEE_STATUS_OVF & fpstatus) ? UFUNC_FPE_OVERFLOW : 0) \ - | ((IEEE_STATUS_UNF & fpstatus) ? UFUNC_FPE_UNDERFLOW : 0) \ - | ((IEEE_STATUS_INV & fpstatus) ? UFUNC_FPE_INVALID : 0); \ - } - -/* MS Windows -----------------------------------------------------*/ -#elif defined(_MSC_VER) - -#include <float.h> - -/* Clear the floating point exception default of Borland C++ */ -#if defined(__BORLANDC__) -#define UFUNC_NOFPE _control87(MCW_EM, MCW_EM); -#endif - -#if defined(_WIN64) -#define UFUNC_CHECK_STATUS(ret) { \ - int fpstatus = (int) _clearfp(); \ - \ - ret = ((SW_ZERODIVIDE & fpstatus) ? UFUNC_FPE_DIVIDEBYZERO : 0) \ - | ((SW_OVERFLOW & fpstatus) ? UFUNC_FPE_OVERFLOW : 0) \ - | ((SW_UNDERFLOW & fpstatus) ? UFUNC_FPE_UNDERFLOW : 0) \ - | ((SW_INVALID & fpstatus) ? UFUNC_FPE_INVALID : 0); \ - } -#else -/* windows enables sse on 32 bit, so check both flags */ -#define UFUNC_CHECK_STATUS(ret) { \ - int fpstatus, fpstatus2; \ - _statusfp2(&fpstatus, &fpstatus2); \ - _clearfp(); \ - fpstatus |= fpstatus2; \ - \ - ret = ((SW_ZERODIVIDE & fpstatus) ? UFUNC_FPE_DIVIDEBYZERO : 0) \ - | ((SW_OVERFLOW & fpstatus) ? UFUNC_FPE_OVERFLOW : 0) \ - | ((SW_UNDERFLOW & fpstatus) ? UFUNC_FPE_UNDERFLOW : 0) \ - | ((SW_INVALID & fpstatus) ? UFUNC_FPE_INVALID : 0); \ - } -#endif - -/* Solaris --------------------------------------------------------*/ -/* --------ignoring SunOS ieee_flags approach, someone else can -** deal with that! */ -#elif defined(sun) || defined(__BSD__) || defined(__OpenBSD__) || \ +/* keep in sync with ieee754.c.src */ +#if defined(sun) || defined(__BSD__) || defined(__OpenBSD__) || \ (defined(__FreeBSD__) && (__FreeBSD_version < 502114)) || \ - defined(__NetBSD__) -#include <ieeefp.h> - -#define UFUNC_CHECK_STATUS(ret) { \ - int fpstatus; \ - \ - fpstatus = (int) fpgetsticky(); \ - ret = ((FP_X_DZ & fpstatus) ? UFUNC_FPE_DIVIDEBYZERO : 0) \ - | ((FP_X_OFL & fpstatus) ? UFUNC_FPE_OVERFLOW : 0) \ - | ((FP_X_UFL & fpstatus) ? UFUNC_FPE_UNDERFLOW : 0) \ - | ((FP_X_INV & fpstatus) ? UFUNC_FPE_INVALID : 0); \ - (void) fpsetsticky(0); \ - } - -#elif defined(__GLIBC__) || defined(__APPLE__) || \ + defined(__NetBSD__) || \ + defined(__GLIBC__) || defined(__APPLE__) || \ defined(__CYGWIN__) || defined(__MINGW32__) || \ - (defined(__FreeBSD__) && (__FreeBSD_version >= 502114)) - -#if defined(__GLIBC__) || defined(__APPLE__) || \ - defined(__MINGW32__) || defined(__FreeBSD__) -#include <fenv.h> -#elif defined(__CYGWIN__) -#include "numpy/fenv/fenv.h" -#endif - -#define UFUNC_CHECK_STATUS(ret) { \ - int fpstatus = (int) fetestexcept(FE_DIVBYZERO | FE_OVERFLOW | \ - FE_UNDERFLOW | FE_INVALID); \ - ret = ((FE_DIVBYZERO & fpstatus) ? UFUNC_FPE_DIVIDEBYZERO : 0) \ - | ((FE_OVERFLOW & fpstatus) ? UFUNC_FPE_OVERFLOW : 0) \ - | ((FE_UNDERFLOW & fpstatus) ? UFUNC_FPE_UNDERFLOW : 0) \ - | ((FE_INVALID & fpstatus) ? UFUNC_FPE_INVALID : 0); \ - (void) feclearexcept(FE_DIVBYZERO | FE_OVERFLOW | \ - FE_UNDERFLOW | FE_INVALID); \ -} - -#elif defined(_AIX) - -#include <float.h> -#include <fpxcp.h> - -#define UFUNC_CHECK_STATUS(ret) { \ - fpflag_t fpstatus; \ - \ - fpstatus = fp_read_flag(); \ - ret = ((FP_DIV_BY_ZERO & fpstatus) ? UFUNC_FPE_DIVIDEBYZERO : 0) \ - | ((FP_OVERFLOW & fpstatus) ? UFUNC_FPE_OVERFLOW : 0) \ - | ((FP_UNDERFLOW & fpstatus) ? UFUNC_FPE_UNDERFLOW : 0) \ - | ((FP_INVALID & fpstatus) ? UFUNC_FPE_INVALID : 0); \ - fp_swap_flag(0); \ -} - + (defined(__FreeBSD__) && (__FreeBSD_version >= 502114)) || \ + defined(_AIX) || \ + defined(_MSC_VER) || \ + defined(__osf__) && defined(__alpha) #else - #define NO_FLOATING_POINT_SUPPORT -#define UFUNC_CHECK_STATUS(ret) { \ - ret = 0; \ - } - #endif + /* * THESE MACROS ARE DEPRECATED. * Use npy_set_floatstatus_* in the npymath library. */ +#define UFUNC_FPE_DIVIDEBYZERO NPY_FPE_DIVIDEBYZERO +#define UFUNC_FPE_OVERFLOW NPY_FPE_OVERFLOW +#define UFUNC_FPE_UNDERFLOW NPY_FPE_UNDERFLOW +#define UFUNC_FPE_INVALID NPY_FPE_INVALID + +#define UFUNC_CHECK_STATUS(ret) \ + { \ + ret = npy_clear_floatstatus(); \ + } #define generate_divbyzero_error() npy_set_floatstatus_divbyzero() #define generate_overflow_error() npy_set_floatstatus_overflow() /* Make sure it gets defined if it isn't already */ #ifndef UFUNC_NOFPE +/* Clear the floating point exception default of Borland C++ */ +#if defined(__BORLANDC__) +#define UFUNC_NOFPE _control87(MCW_EM, MCW_EM); +#else #define UFUNC_NOFPE #endif +#endif #ifdef __cplusplus diff --git a/numpy/core/src/npymath/ieee754.c.src b/numpy/core/src/npymath/ieee754.c.src index 2aba8be8f..4b007f18d 100644 --- a/numpy/core/src/npymath/ieee754.c.src +++ b/numpy/core/src/npymath/ieee754.c.src @@ -558,13 +558,38 @@ npy_longdouble npy_nextafterl(npy_longdouble x, npy_longdouble y) /* * Functions to set the floating point status word. + * keep in sync with NO_FLOATING_POINT_SUPPORT in ufuncobject.h */ +#if (defined(__unix__) || defined(unix)) && !defined(USG) +#include <sys/param.h> +#endif + +/* Solaris --------------------------------------------------------*/ +/* --------ignoring SunOS ieee_flags approach, someone else can +** deal with that! */ #if defined(sun) || defined(__BSD__) || defined(__OpenBSD__) || \ (defined(__FreeBSD__) && (__FreeBSD_version < 502114)) || \ defined(__NetBSD__) #include <ieeefp.h> +int npy_get_floatstatus(void) +{ + int fpstatus = fpgetsticky(); + return ((FP_X_DZ & fpstatus) ? NPY_FPE_DIVIDEBYZERO : 0) | + ((FP_X_OFL & fpstatus) ? NPY_FPE_OVERFLOW : 0) | + ((FP_X_UFL & fpstatus) ? NPY_FPE_UNDERFLOW : 0) | + ((FP_X_INV & fpstatus) ? NPY_FPE_INVALID : 0); +} + +int npy_clear_floatstatus(void) +{ + int fpstatus = npy_get_floatstatus(); + fpsetsticky(0); + + return fpstatus; +} + void npy_set_floatstatus_divbyzero(void) { fpsetsticky(FP_X_DZ); @@ -597,6 +622,30 @@ void npy_set_floatstatus_invalid(void) # include "numpy/fenv/fenv.h" # endif +int npy_get_floatstatus(void) +{ + int fpstatus = fetestexcept(FE_DIVBYZERO | FE_OVERFLOW | + FE_UNDERFLOW | FE_INVALID); + + return ((FE_DIVBYZERO & fpstatus) ? NPY_FPE_DIVIDEBYZERO : 0) | + ((FE_OVERFLOW & fpstatus) ? NPY_FPE_OVERFLOW : 0) | + ((FE_UNDERFLOW & fpstatus) ? NPY_FPE_UNDERFLOW : 0) | + ((FE_INVALID & fpstatus) ? NPY_FPE_INVALID : 0); +} + +int npy_clear_floatstatus(void) +{ + /* testing float status is 50-100 times faster than clearing on x86 */ + int fpstatus = npy_get_floatstatus(); + if (fpstatus != 0) { + feclearexcept(FE_DIVBYZERO | FE_OVERFLOW | + FE_UNDERFLOW | FE_INVALID); + } + + return fpstatus; +} + + void npy_set_floatstatus_divbyzero(void) { feraiseexcept(FE_DIVBYZERO); @@ -621,6 +670,23 @@ void npy_set_floatstatus_invalid(void) #include <float.h> #include <fpxcp.h> +int npy_get_floatstatus(void) +{ + int fpstatus = fp_read_flag(); + return ((FP_DIV_BY_ZERO & fpstatus) ? NPY_FPE_DIVIDEBYZERO : 0) | + ((FP_OVERFLOW & fpstatus) ? NPY_FPE_OVERFLOW : 0) | + ((FP_UNDERFLOW & fpstatus) ? NPY_FPE_UNDERFLOW : 0) | + ((FP_INVALID & fpstatus) ? NPY_FPE_INVALID : 0); +} + +int npy_clear_floatstatus(void) +{ + int fpstatus = npy_get_floatstatus(); + fp_swap_flag(0); + + return fpstatus; +} + void npy_set_floatstatus_divbyzero(void) { fp_raise_xcp(FP_DIV_BY_ZERO); @@ -643,6 +709,73 @@ void npy_set_floatstatus_invalid(void) #else +/* MS Windows -----------------------------------------------------*/ +#if defined(_MSC_VER) + +#include <float.h> + + +int npy_get_floatstatus(void) +{ +#if defined(_WIN64) + int fpstatus = _statusfp(); +#else + /* windows enables sse on 32 bit, so check both flags */ + int fpstatus, fpstatus2; + _statusfp2(&fpstatus, &fpstatus2); + fpstatus |= fpstatus2; +#endif + return ((SW_ZERODIVIDE & fpstatus) ? NPY_FPE_DIVIDEBYZERO : 0) | + ((SW_OVERFLOW & fpstatus) ? NPY_FPE_OVERFLOW : 0) | + ((SW_UNDERFLOW & fpstatus) ? NPY_FPE_UNDERFLOW : 0) | + ((SW_INVALID & fpstatus) ? NPY_FPE_INVALID : 0); +} + +int npy_clear_floatstatus(void) +{ + int fpstatus = npy_get_floatstatus(); + _clearfp(); + + return fpstatus; +} + +/* OSF/Alpha (Tru64) ---------------------------------------------*/ +#elif defined(__osf__) && defined(__alpha) + +#include <machine/fpu.h> + +int npy_get_floatstatus(void) +{ + unsigned long fpstatus = ieee_get_fp_control(); + return ((IEEE_STATUS_DZE & fpstatus) ? NPY_FPE_DIVIDEBYZERO : 0) | + ((IEEE_STATUS_OVF & fpstatus) ? NPY_FPE_OVERFLOW : 0) | + ((IEEE_STATUS_UNF & fpstatus) ? NPY_FPE_UNDERFLOW : 0) | + ((IEEE_STATUS_INV & fpstatus) ? NPY_FPE_INVALID : 0); +} + +int npy_clear_floatstatus(void) +{ + long fpstatus = npy_get_floatstatus(); + /* clear status bits as well as disable exception mode if on */ + ieee_set_fp_control(0); + + return fpstatus; +} + +#else + +int npy_get_floatstatus(void) +{ + return 0; +} + +int npy_clear_floatstatus(void) +{ + return 0; +} + +#endif + /* * By using a volatile floating point value, * the compiler is forced to actually do the requested diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index e5c4e9335..3a7c4f75b 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -19,6 +19,7 @@ #include "numpy/npy_common.h" /* for NO_FLOATING_POINT_SUPPORT */ #include "numpy/ufuncobject.h" +#include "numpy/npy_math.h" #ifdef NPY_HAVE_SSE2_INTRINSICS #include <emmintrin.h> #endif @@ -26,9 +27,6 @@ #include <stdlib.h> #include <string.h> /* for memcpy */ -int PyUFunc_getfperr(void); -void PyUFunc_clearfperr(void); - /* * stride is equal to element size and input and destination are equal or * don't overlap within one register @@ -677,7 +675,7 @@ sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n) i += 2 * stride; /* minps/minpd will set invalid flag if nan is encountered */ - PyUFunc_clearfperr(); + npy_clear_floatstatus(); LOOP_BLOCKED(@type@, 32) { @vtype@ v1 = @vpre@_load_@vsuf@((@type@*)&ip[i]); @vtype@ v2 = @vpre@_load_@vsuf@((@type@*)&ip[i + stride]); @@ -686,7 +684,7 @@ sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n) } c1 = @vpre@_@VOP@_@vsuf@(c1, c2); - if (PyUFunc_getfperr() & UFUNC_FPE_INVALID) { + if (npy_get_floatstatus() & NPY_FPE_INVALID) { *op = @nan@; } else { diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c index 38091d5ff..3d7ea2cf5 100644 --- a/numpy/core/src/umath/ufunc_object.c +++ b/numpy/core/src/umath/ufunc_object.c @@ -180,12 +180,14 @@ fail: NPY_NO_EXPORT int PyUFunc_getfperr(void) { - int retstatus; - UFUNC_CHECK_STATUS(retstatus); - return retstatus; + /* + * non-clearing get was only added in 1.9 so this function always cleared + * keep it so just in case third party code relied on the clearing + */ + return npy_clear_floatstatus(); } -#define HANDLEIT(NAME, str) {if (retstatus & UFUNC_FPE_##NAME) { \ +#define HANDLEIT(NAME, str) {if (retstatus & NPY_FPE_##NAME) { \ handle = errmask & UFUNC_MASK_##NAME; \ if (handle && \ _error_handler(handle >> UFUNC_SHIFT_##NAME, \ @@ -214,10 +216,9 @@ PyUFunc_handlefperr(int errmask, PyObject *errobj, int retstatus, int *first) NPY_NO_EXPORT int PyUFunc_checkfperr(int errmask, PyObject *errobj, int *first) { - int retstatus; + /* clearing is done for backward compatiblity */ + int retstatus = npy_clear_floatstatus(); - /* 1. check hardware flag --- this is platform dependent code */ - retstatus = PyUFunc_getfperr(); return PyUFunc_handlefperr(errmask, errobj, retstatus, first); } @@ -227,7 +228,7 @@ PyUFunc_checkfperr(int errmask, PyObject *errobj, int *first) NPY_NO_EXPORT void PyUFunc_clearfperr() { - PyUFunc_getfperr(); + npy_clear_floatstatus(); } diff --git a/numpy/linalg/umath_linalg.c.src b/numpy/linalg/umath_linalg.c.src index 35cf545bb..4b50bf1f6 100644 --- a/numpy/linalg/umath_linalg.c.src +++ b/numpy/linalg/umath_linalg.c.src @@ -378,8 +378,8 @@ static inline int get_fp_invalid_and_clear(void) { int status; - status = PyUFunc_getfperr(); - return !!(status & UFUNC_FPE_INVALID); + status = npy_clear_floatstatus(); + return !!(status & NPY_FPE_INVALID); } static inline void @@ -389,7 +389,7 @@ set_fp_invalid_or_clear(int error_occurred) npy_set_floatstatus_invalid(); } else { - PyUFunc_getfperr(); + npy_clear_floatstatus(); } } |