summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJulian Taylor <jtaylor.debian@googlemail.com>2013-10-24 23:07:19 +0200
committerJulian Taylor <jtaylor.debian@googlemail.com>2013-11-06 20:29:13 +0100
commitdcfe3260e2d6d9de7a2c3fb0022ecf124faa2ae2 (patch)
tree54199a2f2da24be4d82553fd38a9d890934c649f
parent8988715e1aea7b10b230115aecae029cf6e80d66 (diff)
downloadnumpy-dcfe3260e2d6d9de7a2c3fb0022ecf124faa2ae2.tar.gz
ENH: avoid expensive clears in fenv functions
Clearing is 50-100 times more expensive than checking on x86, so check if there is anything needs to be cleared first. This speeds up scalar operations by 10%-20%. Based on Arink Verma code in #3739. Implement the functions as new C-API functions npy_get_floatstatus and npy_clear_floatstatus in npy_math.
-rw-r--r--doc/source/reference/c-api.coremath.rst41
-rw-r--r--doc/source/reference/c-api.ufunc.rst2
-rw-r--r--numpy/core/include/numpy/npy_math.h11
-rw-r--r--numpy/core/include/numpy/ufuncobject.h149
-rw-r--r--numpy/core/src/npymath/ieee754.c.src133
-rw-r--r--numpy/core/src/umath/simd.inc.src8
-rw-r--r--numpy/core/src/umath/ufunc_object.c17
-rw-r--r--numpy/linalg/umath_linalg.c.src6
8 files changed, 226 insertions, 141 deletions
diff --git a/doc/source/reference/c-api.coremath.rst b/doc/source/reference/c-api.coremath.rst
index 2d5aedc73..5c76bd601 100644
--- a/doc/source/reference/c-api.coremath.rst
+++ b/doc/source/reference/c-api.coremath.rst
@@ -150,6 +150,47 @@ Those can be useful for precise floating point comparison.
.. versionadded:: 1.4.0
+.. cfunction:: void npy_set_floatstatus_divbyzero()
+
+ Set the divide by zero floating point exception
+
+ .. versionadded:: 1.6.0
+
+.. cfunction:: void npy_set_floatstatus_overflow()
+
+ Set the overflow floating point exception
+
+ .. versionadded:: 1.6.0
+
+.. cfunction:: void npy_set_floatstatus_underflow()
+
+ Set the underflow floating point exception
+
+ .. versionadded:: 1.6.0
+
+.. cfunction:: void npy_set_floatstatus_invalid()
+
+ Set the invalid floating point exception
+
+ .. versionadded:: 1.6.0
+
+.. cfunction:: int npy_get_floatstatus()
+
+ Get floating point status. Returns a bitmask with following possible flags:
+
+ * NPY_FPE_DIVIDEBYZERO
+ * NPY_FPE_OVERFLOW
+ * NPY_FPE_UNDERFLOW
+ * NPY_FPE_INVALID
+
+ .. versionadded:: 1.9.0
+
+.. cfunction:: int npy_clear_floatstatus()
+
+ Clears the floating point status. Returns the previous status mask.
+
+ .. versionadded:: 1.9.0
+
Complex functions
~~~~~~~~~~~~~~~~~
diff --git a/doc/source/reference/c-api.ufunc.rst b/doc/source/reference/c-api.ufunc.rst
index d4de28188..71abffd04 100644
--- a/doc/source/reference/c-api.ufunc.rst
+++ b/doc/source/reference/c-api.ufunc.rst
@@ -54,6 +54,8 @@ Macros
.. cfunction:: UFUNC_CHECK_STATUS(ret)
+ Deprecated: use npy_clear_floatstatus from npy_math.h instead.
+
A macro that expands to platform-dependent code. The *ret*
variable can can be any integer. The :cdata:`UFUNC_FPE_{ERR}` bits are
set in *ret* according to the status of the corresponding error
diff --git a/numpy/core/include/numpy/npy_math.h b/numpy/core/include/numpy/npy_math.h
index 094286181..b7920460d 100644
--- a/numpy/core/include/numpy/npy_math.h
+++ b/numpy/core/include/numpy/npy_math.h
@@ -456,6 +456,17 @@ npy_clongdouble npy_csinl(npy_clongdouble z);
* status word.
*/
+/*
+ * platform-dependent code translates floating point
+ * status to an integer sum of these values
+ */
+#define NPY_FPE_DIVIDEBYZERO 1
+#define NPY_FPE_OVERFLOW 2
+#define NPY_FPE_UNDERFLOW 4
+#define NPY_FPE_INVALID 8
+
+int npy_get_floatstatus(void);
+int npy_clear_floatstatus(void);
void npy_set_floatstatus_divbyzero(void);
void npy_set_floatstatus_overflow(void);
void npy_set_floatstatus_underflow(void);
diff --git a/numpy/core/include/numpy/ufuncobject.h b/numpy/core/include/numpy/ufuncobject.h
index a6307df91..38e3dcf0f 100644
--- a/numpy/core/include/numpy/ufuncobject.h
+++ b/numpy/core/include/numpy/ufuncobject.h
@@ -2,6 +2,7 @@
#define Py_UFUNCOBJECT_H
#include <numpy/npy_math.h>
+#include <numpy/npy_common.h>
#ifdef __cplusplus
extern "C" {
@@ -250,14 +251,6 @@ typedef struct _tagPyUFuncObject {
#define UFUNC_SHIFT_INVALID 9
-/* platform-dependent code translates floating point
- status to an integer sum of these values
-*/
-#define UFUNC_FPE_DIVIDEBYZERO 1
-#define UFUNC_FPE_OVERFLOW 2
-#define UFUNC_FPE_UNDERFLOW 4
-#define UFUNC_FPE_INVALID 8
-
#define UFUNC_OBJ_ISOBJECT 1
#define UFUNC_OBJ_NEEDS_API 2
@@ -333,141 +326,47 @@ typedef struct _loop1d_info {
&(arg)->first))) \
goto fail;} while (0)
-/* This code checks the IEEE status flags in a platform-dependent way */
-/* Adapted from Numarray */
-
-#if (defined(__unix__) || defined(unix)) && !defined(USG)
-#include <sys/param.h>
-#endif
-
-/* OSF/Alpha (Tru64) ---------------------------------------------*/
-#if defined(__osf__) && defined(__alpha)
-
-#include <machine/fpu.h>
-#define UFUNC_CHECK_STATUS(ret) { \
- unsigned long fpstatus; \
- \
- fpstatus = ieee_get_fp_control(); \
- /* clear status bits as well as disable exception mode if on */ \
- ieee_set_fp_control( 0 ); \
- ret = ((IEEE_STATUS_DZE & fpstatus) ? UFUNC_FPE_DIVIDEBYZERO : 0) \
- | ((IEEE_STATUS_OVF & fpstatus) ? UFUNC_FPE_OVERFLOW : 0) \
- | ((IEEE_STATUS_UNF & fpstatus) ? UFUNC_FPE_UNDERFLOW : 0) \
- | ((IEEE_STATUS_INV & fpstatus) ? UFUNC_FPE_INVALID : 0); \
- }
-
-/* MS Windows -----------------------------------------------------*/
-#elif defined(_MSC_VER)
-
-#include <float.h>
-
-/* Clear the floating point exception default of Borland C++ */
-#if defined(__BORLANDC__)
-#define UFUNC_NOFPE _control87(MCW_EM, MCW_EM);
-#endif
-
-#if defined(_WIN64)
-#define UFUNC_CHECK_STATUS(ret) { \
- int fpstatus = (int) _clearfp(); \
- \
- ret = ((SW_ZERODIVIDE & fpstatus) ? UFUNC_FPE_DIVIDEBYZERO : 0) \
- | ((SW_OVERFLOW & fpstatus) ? UFUNC_FPE_OVERFLOW : 0) \
- | ((SW_UNDERFLOW & fpstatus) ? UFUNC_FPE_UNDERFLOW : 0) \
- | ((SW_INVALID & fpstatus) ? UFUNC_FPE_INVALID : 0); \
- }
-#else
-/* windows enables sse on 32 bit, so check both flags */
-#define UFUNC_CHECK_STATUS(ret) { \
- int fpstatus, fpstatus2; \
- _statusfp2(&fpstatus, &fpstatus2); \
- _clearfp(); \
- fpstatus |= fpstatus2; \
- \
- ret = ((SW_ZERODIVIDE & fpstatus) ? UFUNC_FPE_DIVIDEBYZERO : 0) \
- | ((SW_OVERFLOW & fpstatus) ? UFUNC_FPE_OVERFLOW : 0) \
- | ((SW_UNDERFLOW & fpstatus) ? UFUNC_FPE_UNDERFLOW : 0) \
- | ((SW_INVALID & fpstatus) ? UFUNC_FPE_INVALID : 0); \
- }
-#endif
-
-/* Solaris --------------------------------------------------------*/
-/* --------ignoring SunOS ieee_flags approach, someone else can
-** deal with that! */
-#elif defined(sun) || defined(__BSD__) || defined(__OpenBSD__) || \
+/* keep in sync with ieee754.c.src */
+#if defined(sun) || defined(__BSD__) || defined(__OpenBSD__) || \
(defined(__FreeBSD__) && (__FreeBSD_version < 502114)) || \
- defined(__NetBSD__)
-#include <ieeefp.h>
-
-#define UFUNC_CHECK_STATUS(ret) { \
- int fpstatus; \
- \
- fpstatus = (int) fpgetsticky(); \
- ret = ((FP_X_DZ & fpstatus) ? UFUNC_FPE_DIVIDEBYZERO : 0) \
- | ((FP_X_OFL & fpstatus) ? UFUNC_FPE_OVERFLOW : 0) \
- | ((FP_X_UFL & fpstatus) ? UFUNC_FPE_UNDERFLOW : 0) \
- | ((FP_X_INV & fpstatus) ? UFUNC_FPE_INVALID : 0); \
- (void) fpsetsticky(0); \
- }
-
-#elif defined(__GLIBC__) || defined(__APPLE__) || \
+ defined(__NetBSD__) || \
+ defined(__GLIBC__) || defined(__APPLE__) || \
defined(__CYGWIN__) || defined(__MINGW32__) || \
- (defined(__FreeBSD__) && (__FreeBSD_version >= 502114))
-
-#if defined(__GLIBC__) || defined(__APPLE__) || \
- defined(__MINGW32__) || defined(__FreeBSD__)
-#include <fenv.h>
-#elif defined(__CYGWIN__)
-#include "numpy/fenv/fenv.h"
-#endif
-
-#define UFUNC_CHECK_STATUS(ret) { \
- int fpstatus = (int) fetestexcept(FE_DIVBYZERO | FE_OVERFLOW | \
- FE_UNDERFLOW | FE_INVALID); \
- ret = ((FE_DIVBYZERO & fpstatus) ? UFUNC_FPE_DIVIDEBYZERO : 0) \
- | ((FE_OVERFLOW & fpstatus) ? UFUNC_FPE_OVERFLOW : 0) \
- | ((FE_UNDERFLOW & fpstatus) ? UFUNC_FPE_UNDERFLOW : 0) \
- | ((FE_INVALID & fpstatus) ? UFUNC_FPE_INVALID : 0); \
- (void) feclearexcept(FE_DIVBYZERO | FE_OVERFLOW | \
- FE_UNDERFLOW | FE_INVALID); \
-}
-
-#elif defined(_AIX)
-
-#include <float.h>
-#include <fpxcp.h>
-
-#define UFUNC_CHECK_STATUS(ret) { \
- fpflag_t fpstatus; \
- \
- fpstatus = fp_read_flag(); \
- ret = ((FP_DIV_BY_ZERO & fpstatus) ? UFUNC_FPE_DIVIDEBYZERO : 0) \
- | ((FP_OVERFLOW & fpstatus) ? UFUNC_FPE_OVERFLOW : 0) \
- | ((FP_UNDERFLOW & fpstatus) ? UFUNC_FPE_UNDERFLOW : 0) \
- | ((FP_INVALID & fpstatus) ? UFUNC_FPE_INVALID : 0); \
- fp_swap_flag(0); \
-}
-
+ (defined(__FreeBSD__) && (__FreeBSD_version >= 502114)) || \
+ defined(_AIX) || \
+ defined(_MSC_VER) || \
+ defined(__osf__) && defined(__alpha)
#else
-
#define NO_FLOATING_POINT_SUPPORT
-#define UFUNC_CHECK_STATUS(ret) { \
- ret = 0; \
- }
-
#endif
+
/*
* THESE MACROS ARE DEPRECATED.
* Use npy_set_floatstatus_* in the npymath library.
*/
+#define UFUNC_FPE_DIVIDEBYZERO NPY_FPE_DIVIDEBYZERO
+#define UFUNC_FPE_OVERFLOW NPY_FPE_OVERFLOW
+#define UFUNC_FPE_UNDERFLOW NPY_FPE_UNDERFLOW
+#define UFUNC_FPE_INVALID NPY_FPE_INVALID
+
+#define UFUNC_CHECK_STATUS(ret) \
+ { \
+ ret = npy_clear_floatstatus(); \
+ }
#define generate_divbyzero_error() npy_set_floatstatus_divbyzero()
#define generate_overflow_error() npy_set_floatstatus_overflow()
/* Make sure it gets defined if it isn't already */
#ifndef UFUNC_NOFPE
+/* Clear the floating point exception default of Borland C++ */
+#if defined(__BORLANDC__)
+#define UFUNC_NOFPE _control87(MCW_EM, MCW_EM);
+#else
#define UFUNC_NOFPE
#endif
+#endif
#ifdef __cplusplus
diff --git a/numpy/core/src/npymath/ieee754.c.src b/numpy/core/src/npymath/ieee754.c.src
index 2aba8be8f..4b007f18d 100644
--- a/numpy/core/src/npymath/ieee754.c.src
+++ b/numpy/core/src/npymath/ieee754.c.src
@@ -558,13 +558,38 @@ npy_longdouble npy_nextafterl(npy_longdouble x, npy_longdouble y)
/*
* Functions to set the floating point status word.
+ * keep in sync with NO_FLOATING_POINT_SUPPORT in ufuncobject.h
*/
+#if (defined(__unix__) || defined(unix)) && !defined(USG)
+#include <sys/param.h>
+#endif
+
+/* Solaris --------------------------------------------------------*/
+/* --------ignoring SunOS ieee_flags approach, someone else can
+** deal with that! */
#if defined(sun) || defined(__BSD__) || defined(__OpenBSD__) || \
(defined(__FreeBSD__) && (__FreeBSD_version < 502114)) || \
defined(__NetBSD__)
#include <ieeefp.h>
+int npy_get_floatstatus(void)
+{
+ int fpstatus = fpgetsticky();
+ return ((FP_X_DZ & fpstatus) ? NPY_FPE_DIVIDEBYZERO : 0) |
+ ((FP_X_OFL & fpstatus) ? NPY_FPE_OVERFLOW : 0) |
+ ((FP_X_UFL & fpstatus) ? NPY_FPE_UNDERFLOW : 0) |
+ ((FP_X_INV & fpstatus) ? NPY_FPE_INVALID : 0);
+}
+
+int npy_clear_floatstatus(void)
+{
+ int fpstatus = npy_get_floatstatus();
+ fpsetsticky(0);
+
+ return fpstatus;
+}
+
void npy_set_floatstatus_divbyzero(void)
{
fpsetsticky(FP_X_DZ);
@@ -597,6 +622,30 @@ void npy_set_floatstatus_invalid(void)
# include "numpy/fenv/fenv.h"
# endif
+int npy_get_floatstatus(void)
+{
+ int fpstatus = fetestexcept(FE_DIVBYZERO | FE_OVERFLOW |
+ FE_UNDERFLOW | FE_INVALID);
+
+ return ((FE_DIVBYZERO & fpstatus) ? NPY_FPE_DIVIDEBYZERO : 0) |
+ ((FE_OVERFLOW & fpstatus) ? NPY_FPE_OVERFLOW : 0) |
+ ((FE_UNDERFLOW & fpstatus) ? NPY_FPE_UNDERFLOW : 0) |
+ ((FE_INVALID & fpstatus) ? NPY_FPE_INVALID : 0);
+}
+
+int npy_clear_floatstatus(void)
+{
+ /* testing float status is 50-100 times faster than clearing on x86 */
+ int fpstatus = npy_get_floatstatus();
+ if (fpstatus != 0) {
+ feclearexcept(FE_DIVBYZERO | FE_OVERFLOW |
+ FE_UNDERFLOW | FE_INVALID);
+ }
+
+ return fpstatus;
+}
+
+
void npy_set_floatstatus_divbyzero(void)
{
feraiseexcept(FE_DIVBYZERO);
@@ -621,6 +670,23 @@ void npy_set_floatstatus_invalid(void)
#include <float.h>
#include <fpxcp.h>
+int npy_get_floatstatus(void)
+{
+ int fpstatus = fp_read_flag();
+ return ((FP_DIV_BY_ZERO & fpstatus) ? NPY_FPE_DIVIDEBYZERO : 0) |
+ ((FP_OVERFLOW & fpstatus) ? NPY_FPE_OVERFLOW : 0) |
+ ((FP_UNDERFLOW & fpstatus) ? NPY_FPE_UNDERFLOW : 0) |
+ ((FP_INVALID & fpstatus) ? NPY_FPE_INVALID : 0);
+}
+
+int npy_clear_floatstatus(void)
+{
+ int fpstatus = npy_get_floatstatus();
+ fp_swap_flag(0);
+
+ return fpstatus;
+}
+
void npy_set_floatstatus_divbyzero(void)
{
fp_raise_xcp(FP_DIV_BY_ZERO);
@@ -643,6 +709,73 @@ void npy_set_floatstatus_invalid(void)
#else
+/* MS Windows -----------------------------------------------------*/
+#if defined(_MSC_VER)
+
+#include <float.h>
+
+
+int npy_get_floatstatus(void)
+{
+#if defined(_WIN64)
+ int fpstatus = _statusfp();
+#else
+ /* windows enables sse on 32 bit, so check both flags */
+ int fpstatus, fpstatus2;
+ _statusfp2(&fpstatus, &fpstatus2);
+ fpstatus |= fpstatus2;
+#endif
+ return ((SW_ZERODIVIDE & fpstatus) ? NPY_FPE_DIVIDEBYZERO : 0) |
+ ((SW_OVERFLOW & fpstatus) ? NPY_FPE_OVERFLOW : 0) |
+ ((SW_UNDERFLOW & fpstatus) ? NPY_FPE_UNDERFLOW : 0) |
+ ((SW_INVALID & fpstatus) ? NPY_FPE_INVALID : 0);
+}
+
+int npy_clear_floatstatus(void)
+{
+ int fpstatus = npy_get_floatstatus();
+ _clearfp();
+
+ return fpstatus;
+}
+
+/* OSF/Alpha (Tru64) ---------------------------------------------*/
+#elif defined(__osf__) && defined(__alpha)
+
+#include <machine/fpu.h>
+
+int npy_get_floatstatus(void)
+{
+ unsigned long fpstatus = ieee_get_fp_control();
+ return ((IEEE_STATUS_DZE & fpstatus) ? NPY_FPE_DIVIDEBYZERO : 0) |
+ ((IEEE_STATUS_OVF & fpstatus) ? NPY_FPE_OVERFLOW : 0) |
+ ((IEEE_STATUS_UNF & fpstatus) ? NPY_FPE_UNDERFLOW : 0) |
+ ((IEEE_STATUS_INV & fpstatus) ? NPY_FPE_INVALID : 0);
+}
+
+int npy_clear_floatstatus(void)
+{
+ long fpstatus = npy_get_floatstatus();
+ /* clear status bits as well as disable exception mode if on */
+ ieee_set_fp_control(0);
+
+ return fpstatus;
+}
+
+#else
+
+int npy_get_floatstatus(void)
+{
+ return 0;
+}
+
+int npy_clear_floatstatus(void)
+{
+ return 0;
+}
+
+#endif
+
/*
* By using a volatile floating point value,
* the compiler is forced to actually do the requested
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index e5c4e9335..3a7c4f75b 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -19,6 +19,7 @@
#include "numpy/npy_common.h"
/* for NO_FLOATING_POINT_SUPPORT */
#include "numpy/ufuncobject.h"
+#include "numpy/npy_math.h"
#ifdef NPY_HAVE_SSE2_INTRINSICS
#include <emmintrin.h>
#endif
@@ -26,9 +27,6 @@
#include <stdlib.h>
#include <string.h> /* for memcpy */
-int PyUFunc_getfperr(void);
-void PyUFunc_clearfperr(void);
-
/*
* stride is equal to element size and input and destination are equal or
* don't overlap within one register
@@ -677,7 +675,7 @@ sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n)
i += 2 * stride;
/* minps/minpd will set invalid flag if nan is encountered */
- PyUFunc_clearfperr();
+ npy_clear_floatstatus();
LOOP_BLOCKED(@type@, 32) {
@vtype@ v1 = @vpre@_load_@vsuf@((@type@*)&ip[i]);
@vtype@ v2 = @vpre@_load_@vsuf@((@type@*)&ip[i + stride]);
@@ -686,7 +684,7 @@ sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n)
}
c1 = @vpre@_@VOP@_@vsuf@(c1, c2);
- if (PyUFunc_getfperr() & UFUNC_FPE_INVALID) {
+ if (npy_get_floatstatus() & NPY_FPE_INVALID) {
*op = @nan@;
}
else {
diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c
index 38091d5ff..3d7ea2cf5 100644
--- a/numpy/core/src/umath/ufunc_object.c
+++ b/numpy/core/src/umath/ufunc_object.c
@@ -180,12 +180,14 @@ fail:
NPY_NO_EXPORT int
PyUFunc_getfperr(void)
{
- int retstatus;
- UFUNC_CHECK_STATUS(retstatus);
- return retstatus;
+ /*
+ * non-clearing get was only added in 1.9 so this function always cleared
+ * keep it so just in case third party code relied on the clearing
+ */
+ return npy_clear_floatstatus();
}
-#define HANDLEIT(NAME, str) {if (retstatus & UFUNC_FPE_##NAME) { \
+#define HANDLEIT(NAME, str) {if (retstatus & NPY_FPE_##NAME) { \
handle = errmask & UFUNC_MASK_##NAME; \
if (handle && \
_error_handler(handle >> UFUNC_SHIFT_##NAME, \
@@ -214,10 +216,9 @@ PyUFunc_handlefperr(int errmask, PyObject *errobj, int retstatus, int *first)
NPY_NO_EXPORT int
PyUFunc_checkfperr(int errmask, PyObject *errobj, int *first)
{
- int retstatus;
+ /* clearing is done for backward compatiblity */
+ int retstatus = npy_clear_floatstatus();
- /* 1. check hardware flag --- this is platform dependent code */
- retstatus = PyUFunc_getfperr();
return PyUFunc_handlefperr(errmask, errobj, retstatus, first);
}
@@ -227,7 +228,7 @@ PyUFunc_checkfperr(int errmask, PyObject *errobj, int *first)
NPY_NO_EXPORT void
PyUFunc_clearfperr()
{
- PyUFunc_getfperr();
+ npy_clear_floatstatus();
}
diff --git a/numpy/linalg/umath_linalg.c.src b/numpy/linalg/umath_linalg.c.src
index 35cf545bb..4b50bf1f6 100644
--- a/numpy/linalg/umath_linalg.c.src
+++ b/numpy/linalg/umath_linalg.c.src
@@ -378,8 +378,8 @@ static inline int
get_fp_invalid_and_clear(void)
{
int status;
- status = PyUFunc_getfperr();
- return !!(status & UFUNC_FPE_INVALID);
+ status = npy_clear_floatstatus();
+ return !!(status & NPY_FPE_INVALID);
}
static inline void
@@ -389,7 +389,7 @@ set_fp_invalid_or_clear(int error_occurred)
npy_set_floatstatus_invalid();
}
else {
- PyUFunc_getfperr();
+ npy_clear_floatstatus();
}
}