diff options
author | Julian Taylor <jtaylor.debian@googlemail.com> | 2016-08-31 20:15:16 +0200 |
---|---|---|
committer | Julian Taylor <jtaylor.debian@googlemail.com> | 2016-09-01 09:58:24 +0200 |
commit | fd298a341ddeb05c471c8dfc16f4cc641d08f8a7 (patch) | |
tree | 65ff792e70bd848d90250195ee2f03b50618d905 /numpy/core | |
parent | a93d9f7a97358e618aa52b2bbfa119317ee56d08 (diff) | |
download | numpy-fd298a341ddeb05c471c8dfc16f4cc641d08f8a7.tar.gz |
ENH: add inplace cases to fast ufunc loop macros
Neither gcc nor clang automatically specializes the in-place case, so
add extra conditions to the loop macros to get the compilers to emit
decent code.
Without them, in-place code ends up much slower than out-of-place
code.
Diffstat (limited to 'numpy/core')
-rw-r--r-- | numpy/core/src/umath/loops.c.src | 75 | ||||
-rw-r--r-- | numpy/core/tests/test_scalarmath.py | 8 |
2 files changed, 50 insertions, 33 deletions
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 157b30e70..2720c361f 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -87,22 +87,25 @@ * combine with NPY_GCC_OPT_3 to allow autovectorization * should only be used where its worthwhile to avoid code bloat */ +#define BASE_UNARY_LOOP(tin, tout, op) \ + UNARY_LOOP { \ + const tin in = *(tin *)ip1; \ + tout * out = (tout *)op1; \ + op; \ + } #define UNARY_LOOP_FAST(tin, tout, op) \ do { \ /* condition allows compiler to optimize the generic macro */ \ if (IS_UNARY_CONT(tin, tout)) { \ - UNARY_LOOP { \ - const tin in = *(tin *)ip1; \ - tout * out = (tout *)op1; \ - op; \ + if (args[0] == args[1]) { \ + BASE_UNARY_LOOP(tin, tout, op) \ + } \ + else { \ + BASE_UNARY_LOOP(tin, tout, op) \ } \ } \ else { \ - UNARY_LOOP { \ - const tin in = *(tin *)ip1; \ - tout * out = (tout *)op1; \ - op; \ - } \ + BASE_UNARY_LOOP(tin, tout, op) \ } \ } \ while (0) @@ -128,40 +131,52 @@ * combine with NPY_GCC_OPT_3 to allow autovectorization * should only be used where its worthwhile to avoid code bloat */ +#define BASE_BINARY_LOOP(tin, tout, op) \ + BINARY_LOOP { \ + const tin in1 = *(tin *)ip1; \ + const tin in2 = *(tin *)ip2; \ + tout * out = (tout *)op1; \ + op; \ + } +#define BASE_BINARY_LOOP_S(tin, tout, cin, cinp, vin, vinp, op) \ + const tin cin = *(tin *)cinp; \ + BINARY_LOOP { \ + const tin vin = *(tin *)vinp; \ + tout * out = (tout *)op1; \ + op; \ + } #define BINARY_LOOP_FAST(tin, tout, op) \ do { \ /* condition allows compiler to optimize the generic macro */ \ if (IS_BINARY_CONT(tin, tout)) { \ - BINARY_LOOP { \ - const tin in1 = *(tin *)ip1; \ - const tin in2 = *(tin *)ip2; \ - tout * out = (tout *)op1; \ - op; \ + if (args[2] == args[0]) { \ + BASE_BINARY_LOOP(tin, tout, op) \ + } \ + else if (args[2] == args[1]) { \ + BASE_BINARY_LOOP(tin, tout, op) \ + } \ + else { \ + BASE_BINARY_LOOP(tin, tout, op) \ } \ } \ else if (IS_BINARY_CONT_S1(tin, 
tout)) { \ - const tin in1 = *(tin *)args[0]; \ - BINARY_LOOP { \ - const tin in2 = *(tin *)ip2; \ - tout * out = (tout *)op1; \ - op; \ + if (args[1] == args[2]) { \ + BASE_BINARY_LOOP_S(tin, tout, in1, args[0], in2, ip2, op) \ + } \ + else { \ + BASE_BINARY_LOOP_S(tin, tout, in1, args[0], in2, ip2, op) \ } \ } \ else if (IS_BINARY_CONT_S2(tin, tout)) { \ - const tin in2 = *(tin *)args[1]; \ - BINARY_LOOP { \ - const tin in1 = *(tin *)ip1; \ - tout * out = (tout *)op1; \ - op; \ + if (args[0] == args[2]) { \ + BASE_BINARY_LOOP_S(tin, tout, in2, args[1], in1, ip1, op) \ } \ + else { \ + BASE_BINARY_LOOP_S(tin, tout, in2, args[1], in1, ip1, op) \ + }\ } \ else { \ - BINARY_LOOP { \ - const tin in1 = *(tin *)ip1; \ - const tin in2 = *(tin *)ip2; \ - tout * out = (tout *)op1; \ - op; \ - } \ + BASE_BINARY_LOOP(tin, tout, op) \ } \ } \ while (0) diff --git a/numpy/core/tests/test_scalarmath.py b/numpy/core/tests/test_scalarmath.py index 1c71565f4..f9aeb6382 100644 --- a/numpy/core/tests/test_scalarmath.py +++ b/numpy/core/tests/test_scalarmath.py @@ -65,7 +65,7 @@ class TestBaseMath(TestCase): def test_blocked(self): # test alignments offsets for simd instructions # alignments for vz + 2 * (vs - 1) + 1 - for dt, sz in [(np.float32, 11), (np.float64, 7)]: + for dt, sz in [(np.float32, 11), (np.float64, 7), (np.int32, 11)]: for out, inp1, inp2, msg in _gen_alignment_data(dtype=dt, type='binary', max_size=sz): @@ -82,8 +82,10 @@ class TestBaseMath(TestCase): inp2[...] += np.arange(inp2.size, dtype=dt) + 1 assert_almost_equal(np.square(inp2), np.multiply(inp2, inp2), err_msg=msg) - assert_almost_equal(np.reciprocal(inp2), - np.divide(1, inp2), err_msg=msg) + # skip true divide for ints + if dt != np.int32 or sys.version_info.major < 3: + assert_almost_equal(np.reciprocal(inp2), + np.divide(1, inp2), err_msg=msg) inp1[...] = np.ones_like(inp1) inp2[...] = np.zeros_like(inp2) |