ENH: vectorize sqrt ufunc using SSE2

Specialize the sqrt ufunc for float and double and vectorize it using SSE2. This improves performance by a factor of 4/2 for float/double as long as the operation is not memory bound due to non-cached data. Performance is always better on all tested machines (AMD Phenom X2, Intel Xeon 5xxx/7xxx, Core 2 Duo, Core i7). This version will not set errno on invalid input, but numpy only checks the FPU flags, so the behavior is the same. In principle the compiler could autovectorize it when setting -ffast-math (for no errno), specializing the loop for the vectorizable strides and giving it some hints (restrict, __builtin_assume_aligned, etc.), but it's simpler and more reliable to simply vectorize it by hand.
author: Julian Taylor <jtaylor.debian@googlemail.com> 2013-05-19 17:04:27 +0200
committer: Julian Taylor <jtaylor.debian@googlemail.com> 2013-05-25 17:36:00 +0200
commit: 0adccaaa910ab495e993f453956fd983775604f3 (patch)
tree: 575e6b1bc7066bbe24ade1fee8576e4e31f2f7ef /numpy/testing
parent: 8ff5e37bff03925da4c1b121b38188f9fd779b4d (diff)
download: numpy-0adccaaa910ab495e993f453956fd983775604f3.tar.gz
1 files changed, 67 insertions, 0 deletions
diff --git a/numpy/testing/utils.py b/numpy/testing/utils.py
index 40c569c0f..7a3ea7a1c 100644
--- a/numpy/testing/utils.py
+++ b/numpy/testing/utils.py
@@ -10,6 +10,7 @@ import re
 import operator
 import warnings
 from .nosetester import import_nose
+from numpy.core import float32, empty, arange
 
 if sys.version_info[0] >= 3:
     from io import StringIO
@@ -1523,3 +1524,69 @@ def assert_no_warnings(func, *args, **kw):
     finally:
         ctx.__exit__()
     return result
+
+
def gen_alignment_data(dtype=float32, type='binary', max_size=24):
    """
    Generate arrays with different alignments and offsets to test SIMD
    vectorized loops.

    For every start offset ``o`` in 0..2 and every buffer size ``s`` in
    ``range(o + 2, max(o + 3, max_size))`` a fixed set of output/input
    combinations is produced.  Inputs are ``arange(s)[o:]`` (optionally
    shifted by one more element), outputs are uninitialized buffers or,
    for the in-place cases, the input array itself.

    Parameters
    ----------
    dtype : dtype
        data type to produce
    type : string
        'unary': create data for unary operations, creates one input
                 and output array
        'binary': create data for binary operations, creates two input
                 and output array
        Any other value makes the generator yield nothing.
    max_size : integer
        maximum size of data to produce

    Yields
    ------
    if type is 'unary' yields one output, one input array and a message
    containing information on the data
    if type is 'binary' yields one output array, two input array and a message
    containing information on the data

    Notes
    -----
    In every yielded tuple the output has the same shape as the inputs.

    For the 'in place' cases the output is the very same array object as
    the corresponding input, so an operation writing to it really runs in
    place.

    The cases labelled 'aliased' are built from separately allocated
    buffers; they exercise the same offset combinations as overlapping
    operands would, but the yielded arrays do not share memory.

    The ``size`` reported in the message is the length of the buffer
    before the start offset ``o`` is sliced off, not the length of the
    yielded views.

    """
    ufmt = 'unary offset=(%d, %d), size=%d, dtype=%r, %s'
    bfmt = 'binary offset=(%d, %d, %d), size=%d, dtype=%r, %s'

    def new_input(size, offset):
        # Every call allocates a fresh buffer, so operands obtained from
        # separate calls never share memory.
        return arange(size, dtype=dtype)[offset:]

    for o in range(3):
        for s in range(o + 2, max(o + 3, max_size)):
            if type == 'unary':
                out = empty((s,), dtype=dtype)[o:]
                yield out, new_input(s, o), ufmt % \
                    (o, o, s, dtype, 'out of place')
                # output and input must be the same object to be in place
                d = new_input(s, o)
                yield d, d, ufmt % (o, o, s, dtype, 'in place')
                yield out[1:], new_input(s, o)[:-1], ufmt % \
                    (o + 1, o, s - 1, dtype, 'out of place')
                yield out[:-1], new_input(s, o)[1:], ufmt % \
                    (o, o + 1, s - 1, dtype, 'out of place')
                yield new_input(s, o)[:-1], new_input(s, o)[1:], ufmt % \
                    (o, o + 1, s - 1, dtype, 'aliased')
                yield new_input(s, o)[1:], new_input(s, o)[:-1], ufmt % \
                    (o + 1, o, s - 1, dtype, 'aliased')
            elif type == 'binary':
                out = empty((s,), dtype=dtype)[o:]
                yield out, new_input(s, o), new_input(s, o), bfmt % \
                    (o, o, o, s, dtype, 'out of place')
                # output is the first input itself
                d = new_input(s, o)
                yield d, d, new_input(s, o), bfmt % \
                    (o, o, o, s, dtype, 'in place1')
                # output is the second input itself
                d = new_input(s, o)
                yield d, new_input(s, o), d, bfmt % \
                    (o, o, o, s, dtype, 'in place2')
                yield out[1:], new_input(s, o)[:-1], new_input(s, o)[:-1], \
                    bfmt % (o + 1, o, o, s - 1, dtype, 'out of place')
                # [:-1] (not [-1:]) keeps the output as long as the inputs
                yield out[:-1], new_input(s, o)[1:], new_input(s, o)[:-1], \
                    bfmt % (o, o + 1, o, s - 1, dtype, 'out of place')
                yield out[:-1], new_input(s, o)[:-1], new_input(s, o)[1:], \
                    bfmt % (o, o, o + 1, s - 1, dtype, 'out of place')
                yield new_input(s, o)[1:], new_input(s, o)[:-1], \
                    new_input(s, o)[:-1], bfmt % \
                    (o + 1, o, o, s - 1, dtype, 'aliased')
                yield new_input(s, o)[:-1], new_input(s, o)[1:], \
                    new_input(s, o)[:-1], bfmt % \
                    (o, o + 1, o, s - 1, dtype, 'aliased')
                yield new_input(s, o)[:-1], new_input(s, o)[:-1], \
                    new_input(s, o)[1:], bfmt % \
                    (o, o, o + 1, s - 1, dtype, 'aliased')
author	Julian Taylor <jtaylor.debian@googlemail.com>	2013-05-19 17:04:27 +0200
committer	Julian Taylor <jtaylor.debian@googlemail.com>	2013-05-25 17:36:00 +0200
commit	0adccaaa910ab495e993f453956fd983775604f3 (patch)
tree	575e6b1bc7066bbe24ade1fee8576e4e31f2f7ef /numpy/testing
parent	8ff5e37bff03925da4c1b121b38188f9fd779b4d (diff)
download	numpy-0adccaaa910ab495e993f453956fd983775604f3.tar.gz