diff options
author | Julian Taylor <jtaylor.debian@googlemail.com> | 2013-05-19 17:04:27 +0200 |
---|---|---|
committer | Julian Taylor <jtaylor.debian@googlemail.com> | 2013-05-25 17:36:00 +0200 |
commit | 0adccaaa910ab495e993f453956fd983775604f3 (patch) | |
tree | 575e6b1bc7066bbe24ade1fee8576e4e31f2f7ef /numpy/testing | |
parent | 8ff5e37bff03925da4c1b121b38188f9fd779b4d (diff) | |
download | numpy-0adccaaa910ab495e993f453956fd983775604f3.tar.gz |
ENH: vectorize sqrt ufunc using SSE2
specialize the sqrt ufunc for float and double and vectorize it using
SSE2.
Improves performance by a factor of 4/2 for float/double when the operation
is not memory bound due to non-cached data.
performance is always better on all tested machines (amd phenom X2,
intel xeon 5xxx/7xxx, core2duo, corei7)
This version will not set errno on invalid input, but numpy only checks
the fpu flags so the behavior is the same.
In principle the compiler could autovectorize it when setting -ffast-math
(for no errno), specializing the loop for the vectorizable strides,
and giving it some hints (restrict, __builtin_assume_aligned, etc.),
but it's simpler and more reliable to simply vectorize it by hand.
Diffstat (limited to 'numpy/testing')
-rw-r--r-- | numpy/testing/utils.py | 67 |
1 files changed, 67 insertions, 0 deletions
# Reformatted from the flattened diff of numpy/testing/utils.py
# (diff --git a/numpy/testing/utils.py b/numpy/testing/utils.py,
#  index 40c569c0f..7a3ea7a1c 100644).
#
# Hunk @@ -10,6 +10,7 @@ adds one import after
# ``from .nosetester import import_nose``:
#     from numpy.core import float32, empty, arange
#
# Hunk @@ -1523,3 +1524,69 @@ appends the function below after
# ``assert_no_warnings``.


def gen_alignment_data(dtype=float32, type='binary', max_size=24):
    """
    Generator producing data with different alignment and offsets
    to test simd vectorization.

    Parameters
    ----------
    dtype : dtype
        data type to produce
    type : string
        'unary': create data for unary operations, creates one input
                 and output array
        'binary': create data for binary operations, creates two input
                 and output array
    max_size : integer
        maximum size of data to produce

    Returns
    -------
    if type is 'unary' yields one output, one input array and a message
        containing information on the data
    if type is 'binary' yields one output array, two input array and a message
        containing information on the data

    Notes
    -----
    Within one yielded tuple all arrays have the same shape.

    For the 'in place' cases the output *is* the input array (same
    object), so an operation writing to the output overwrites its operand.

    For the 'aliased' cases every operand is a separately created array;
    only the start offsets differ, the operands do not share memory.

    Any other value of `type` yields nothing.

    """
    ufmt = 'unary offset=(%d, %d), size=%d, dtype=%r, %s'
    bfmt = 'binary offset=(%d, %d, %d), size=%d, dtype=%r, %s'
    for o in range(3):
        for s in range(o + 2, max(o + 3, max_size)):

            def new_input():
                # Fresh array on every call; dropping the first ``o``
                # elements moves the data start off the allocation start.
                return arange(s, dtype=dtype)[o:]

            if type == 'unary':
                out = empty((s,), dtype=dtype)[o:]
                yield out, new_input(), ufmt % (o, o, s, dtype, 'out of place')
                # Output and input must be the same array to be in place.
                d = new_input()
                yield d, d, ufmt % (o, o, s, dtype, 'in place')
                # Shift output and input relative to each other by one element.
                yield out[1:], new_input()[:-1], ufmt % \
                    (o + 1, o, s - 1, dtype, 'out of place')
                yield out[:-1], new_input()[1:], ufmt % \
                    (o, o + 1, s - 1, dtype, 'out of place')
                yield new_input()[:-1], new_input()[1:], ufmt % \
                    (o, o + 1, s - 1, dtype, 'aliased')
                yield new_input()[1:], new_input()[:-1], ufmt % \
                    (o + 1, o, s - 1, dtype, 'aliased')
            elif type == 'binary':
                # Both operands are produced the same way.
                inp1 = new_input
                inp2 = new_input
                out = empty((s,), dtype=dtype)[o:]
                yield out, inp1(), inp2(), bfmt % \
                    (o, o, o, s, dtype, 'out of place')
                # Output is the first operand.
                d = inp1()
                yield d, d, inp2(), bfmt % \
                    (o, o, o, s, dtype, 'in place1')
                # Output is the second operand.
                d = inp2()
                yield d, inp1(), d, bfmt % \
                    (o, o, o, s, dtype, 'in place2')
                # Shift exactly one of the three arrays by one element; the
                # others drop their last element so all lengths stay equal.
                yield out[1:], inp1()[:-1], inp2()[:-1], bfmt % \
                    (o + 1, o, o, s - 1, dtype, 'out of place')
                yield out[:-1], inp1()[1:], inp2()[:-1], bfmt % \
                    (o, o + 1, o, s - 1, dtype, 'out of place')
                yield out[:-1], inp1()[:-1], inp2()[1:], bfmt % \
                    (o, o, o + 1, s - 1, dtype, 'out of place')
                yield inp1()[1:], inp1()[:-1], inp2()[:-1], bfmt % \
                    (o + 1, o, o, s - 1, dtype, 'aliased')
                yield inp1()[:-1], inp1()[1:], inp2()[:-1], bfmt % \
                    (o, o + 1, o, s - 1, dtype, 'aliased')
                yield inp1()[:-1], inp1()[:-1], inp2()[1:], bfmt % \
                    (o, o, o + 1, s - 1, dtype, 'aliased')