diff options
author | Charles Harris <charlesr.harris@gmail.com> | 2017-01-20 19:55:16 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-01-20 19:55:16 -0700 |
commit | d25e7332ab1579aa71d481fdb382c477bf4a2b22 (patch) | |
tree | 13536f06df9b5f5cfd6f55b41e25ffcc0dd59c84 /numpy | |
parent | a621a2b700415a5c155546f9cb1f064c6099579e (diff) | |
parent | f2a47486a4ccaa141ceeb585e24b2be17b359998 (diff) | |
download | numpy-d25e7332ab1579aa71d481fdb382c477bf4a2b22.tar.gz |
Merge pull request #8326 from juliantaylor/vectorize-packbits
ENH: Vectorize packbits with SSE2
Diffstat (limited to 'numpy')
-rw-r--r-- | numpy/core/setup_common.py | 2 | ||||
-rw-r--r-- | numpy/core/src/multiarray/compiled_base.c | 84 | ||||
-rw-r--r-- | numpy/lib/tests/test_packbits.py | 179 |
3 files changed, 255 insertions, 10 deletions
diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py index 18066d991..596b3996c 100644 --- a/numpy/core/setup_common.py +++ b/numpy/core/setup_common.py @@ -130,6 +130,8 @@ OPTIONAL_INTRINSICS = [("__builtin_isnan", '5.'), # broken on OSX 10.11, make sure its not optimized away ("volatile int r = __builtin_cpu_supports", '"sse"', "stdio.h", "__BUILTIN_CPU_SUPPORTS"), + # MMX only needed for icc, but some clangs don't have it + ("_m_from_int64", '0', "emmintrin.h"), ("_mm_load_ps", '(float*)0', "xmmintrin.h"), # SSE ("_mm_prefetch", '(float*)0, _MM_HINT_NTA', "xmmintrin.h"), # SSE diff --git a/numpy/core/src/multiarray/compiled_base.c b/numpy/core/src/multiarray/compiled_base.c index f2323d9e2..6c76dbd4b 100644 --- a/numpy/core/src/multiarray/compiled_base.c +++ b/numpy/core/src/multiarray/compiled_base.c @@ -9,6 +9,7 @@ #include "numpy/npy_math.h" #include "npy_config.h" #include "templ_common.h" /* for npy_mul_with_overflow_intp */ +#include "lowlevel_strided_loops.h" /* for npy_bswap8 */ /* @@ -1475,6 +1476,9 @@ arr_add_docstring(PyObject *NPY_UNUSED(dummy), PyObject *args) Py_RETURN_NONE; } +#if defined NPY_HAVE_SSE2_INTRINSICS +#include <emmintrin.h> +#endif /* * This function packs boolean values in the input array into the bits of a @@ -1497,13 +1501,41 @@ pack_inner(const char *inptr, * No: move on * Every 8th value, set the value of build and increment the outptr */ - npy_intp index; + npy_intp index = 0; int remain = n_in % 8; /* uneven bits */ +#if defined NPY_HAVE_SSE2_INTRINSICS && defined HAVE__M_FROM_INT64 + if (in_stride == 1 && element_size == 1 && n_out > 2) { + __m128i zero = _mm_setzero_si128(); + /* don't handle non-full 8-byte remainder */ + npy_intp vn_out = n_out - (remain ? 1 : 0); + vn_out -= (vn_out & 2); + for (index = 0; index < vn_out; index += 2) { + unsigned int r; + /* swap as packbits is "big endian", note x86 can load unaligned */ + npy_uint64 a = npy_bswap8(*(npy_uint64*)inptr); + npy_uint64 b = npy_bswap8(*(npy_uint64*)(inptr + 8)); + __m128i v = _mm_set_epi64(_m_from_int64(b), _m_from_int64(a)); + /* false -> 0x00 and true -> 0xFF (there is no cmpneq) */ + v = _mm_cmpeq_epi8(v, zero); + v = _mm_cmpeq_epi8(v, zero); + /* extract msb of 16 bytes and pack it into 16 bit */ + r = _mm_movemask_epi8(v); + /* store result */ + memcpy(outptr, &r, 1); + outptr += out_stride; + memcpy(outptr, (char*)&r + 1, 1); + outptr += out_stride; + inptr += 16; + } + } +#endif + if (remain == 0) { /* assumes n_in > 0 */ remain = 8; } - for (index = 0; index < n_out; index++) { + /* don't reset index to handle remainder of above block */ + for (; index < n_out; index++) { char build = 0; int i, maxi; npy_intp j; @@ -1628,6 +1660,8 @@ fail: static PyObject * unpack_bits(PyObject *input, int axis) { + static int unpack_init = 0; + static char unpack_lookup[256][8]; PyArrayObject *inp; PyArrayObject *new = NULL; PyArrayObject *out = NULL; @@ -1693,6 +1727,28 @@ unpack_bits(PyObject *input, int axis) goto fail; } + /* setup lookup table under GIL, big endian 0..256 as bytes */ + if (unpack_init == 0) { + npy_uint64 j; + npy_uint64 * unpack_lookup_64 = (npy_uint64 *)unpack_lookup; + for (j=0; j < 256; j++) { + npy_uint64 v = 0; + v |= (npy_uint64)((j & 1) == 1); + v |= (npy_uint64)((j & 2) == 2) << 8; + v |= (npy_uint64)((j & 4) == 4) << 16; + v |= (npy_uint64)((j & 8) == 8) << 24; + v |= (npy_uint64)((j & 16) == 16) << 32; + v |= (npy_uint64)((j & 32) == 32) << 40; + v |= (npy_uint64)((j & 64) == 64) << 48; + v |= (npy_uint64)((j & 128) == 128) << 56; +#if NPY_BYTE_ORDER == NPY_LITTLE_ENDIAN + v = npy_bswap8(v); +#endif + unpack_lookup_64[j] = v; + } + unpack_init = 1; + } + NPY_BEGIN_THREADS_THRESHOLDED(PyArray_DIM(new, axis)); n_in = PyArray_DIM(new, axis); @@ -1704,15 +1760,25 @@ unpack_bits(PyObject *input, int axis) unsigned const char *inptr = PyArray_ITER_DATA(it); char *outptr = PyArray_ITER_DATA(ot); - for (index = 0; index < n_in; index++) { - unsigned char mask = 128; + if (out_stride == 1) { + /* for unity stride we can just copy out of the lookup table */ + for (index = 0; index < n_in; index++) { + memcpy(outptr, unpack_lookup[*inptr], 8); + outptr += 8; + inptr += in_stride; + } + } + else { + for (index = 0; index < n_in; index++) { + unsigned char mask = 128; - for (i = 0; i < 8; i++) { - *outptr = ((mask & (*inptr)) != 0); - outptr += out_stride; - mask >>= 1; + for (i = 0; i < 8; i++) { + *outptr = ((mask & (*inptr)) != 0); + outptr += out_stride; + mask >>= 1; + } + inptr += in_stride; } - inptr += in_stride; } PyArray_ITER_NEXT(it); PyArray_ITER_NEXT(ot); diff --git a/numpy/lib/tests/test_packbits.py b/numpy/lib/tests/test_packbits.py index 5d8ac861b..4bf505f56 100644 --- a/numpy/lib/tests/test_packbits.py +++ b/numpy/lib/tests/test_packbits.py @@ -1,7 +1,9 @@ from __future__ import division, absolute_import, print_function import numpy as np -from numpy.testing import assert_array_equal, assert_equal, assert_raises +from numpy.testing import ( + assert_array_equal, assert_equal, assert_raises, run_module_suite +) def test_packbits(): @@ -51,6 +53,166 @@ def test_packbits_empty_with_axis(): assert_equal(b.shape, out_shape) +def test_packbits_large(): + # test data large enough for 16 byte vectorization + a = np.array([1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, + 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, + 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, + 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, + 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, + 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, + 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, + 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, + 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, + 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, + 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, + 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, + 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, + 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0]) + a = a.repeat(3) + for dtype in '?bBhHiIlLqQ': + arr = np.array(a, dtype=dtype) + b = np.packbits(arr, axis=None) + assert_equal(b.dtype, np.uint8) + r = [252, 127, 192, 3, 254, 7, 252, 0, 7, 31, 240, 0, 28, 1, 255, 252, + 113, 248, 3, 255, 192, 28, 15, 192, 28, 126, 0, 224, 127, 255, + 227, 142, 7, 31, 142, 63, 28, 126, 56, 227, 240, 0, 227, 128, 63, + 224, 14, 56, 252, 112, 56, 255, 241, 248, 3, 240, 56, 224, 112, + 63, 255, 255, 199, 224, 14, 0, 31, 143, 192, 3, 255, 199, 0, 1, + 255, 224, 1, 255, 252, 126, 63, 0, 1, 192, 252, 14, 63, 0, 15, + 199, 252, 113, 255, 3, 128, 56, 252, 14, 7, 0, 113, 255, 255, 142, 56, 227, + 129, 248, 227, 129, 199, 31, 128] + assert_array_equal(b, r) + # equal for size being multiple of 8 + assert_array_equal(np.unpackbits(b)[:-4], a) + + # check last byte of different remainders (16 byte vectorization) + b = [np.packbits(arr[:-i], axis=None)[-1] for i in range(1, 16)] + assert_array_equal(b, [128, 128, 128, 31, 30, 28, 24, 16, 0, 0, 0, 199, + 198, 196, 192]) + + + arr = arr.reshape(36, 25) + b = np.packbits(arr, axis=0) + assert_equal(b.dtype, np.uint8) + assert_array_equal(b, [[190, 186, 178, 178, 150, 215, 87, 83, 83, 195, + 199, 206, 204, 204, 140, 140, 136, 136, 8, 40, 105, + 107, 75, 74, 88], + [72, 216, 248, 241, 227, 195, 202, 90, 90, 83, + 83, 119, 127, 109, 73, 64, 208, 244, 189, 45, + 41, 104, 122, 90, 18], + [113, 120, 248, 216, 152, 24, 60, 52, 182, 150, + 150, 150, 146, 210, 210, 246, 255, 255, 223, + 151, 21, 17, 17, 131, 163], + [214, 210, 210, 64, 68, 5, 5, 1, 72, 88, 92, + 92, 78, 110, 39, 181, 149, 220, 222, 218, 218, + 202, 234, 170, 168], + [0, 128, 128, 192, 80, 112, 48, 160, 160, 224, + 240, 208, 144, 128, 160, 224, 240, 208, 144, + 144, 176, 240, 224, 192, 128]]) + + b = np.packbits(arr, axis=1) + assert_equal(b.dtype, np.uint8) + assert_array_equal(b, [[252, 127, 192, 0], + [ 7, 252, 15, 128], + [240, 0, 28, 0], + [255, 128, 0, 128], + [192, 31, 255, 128], + [142, 63, 0, 0], + [255, 240, 7, 0], + [ 7, 224, 14, 0], + [126, 0, 224, 0], + [255, 255, 199, 0], + [ 56, 28, 126, 0], + [113, 248, 227, 128], + [227, 142, 63, 0], + [ 0, 28, 112, 0], + [ 15, 248, 3, 128], + [ 28, 126, 56, 0], + [ 56, 255, 241, 128], + [240, 7, 224, 0], + [227, 129, 192, 128], + [255, 255, 254, 0], + [126, 0, 224, 0], + [ 3, 241, 248, 0], + [ 0, 255, 241, 128], + [128, 0, 255, 128], + [224, 1, 255, 128], + [248, 252, 126, 0], + [ 0, 7, 3, 128], + [224, 113, 248, 0], + [ 0, 252, 127, 128], + [142, 63, 224, 0], + [224, 14, 63, 0], + [ 7, 3, 128, 0], + [113, 255, 255, 128], + [ 28, 113, 199, 0], + [ 7, 227, 142, 0], + [ 14, 56, 252, 0]]) + + arr = arr.T.copy() + b = np.packbits(arr, axis=0) + assert_equal(b.dtype, np.uint8) + assert_array_equal(b, [[252, 7, 240, 255, 192, 142, 255, 7, 126, 255, + 56, 113, 227, 0, 15, 28, 56, 240, 227, 255, + 126, 3, 0, 128, 224, 248, 0, 224, 0, 142, 224, + 7, 113, 28, 7, 14], + [127, 252, 0, 128, 31, 63, 240, 224, 0, 255, + 28, 248, 142, 28, 248, 126, 255, 7, 129, 255, + 0, 241, 255, 0, 1, 252, 7, 113, 252, 63, 14, + 3, 255, 113, 227, 56], + [192, 15, 28, 0, 255, 0, 7, 14, 224, 199, 126, + 227, 63, 112, 3, 56, 241, 224, 192, 254, 224, + 248, 241, 255, 255, 126, 3, 248, 127, 224, 63, + 128, 255, 199, 142, 252], + [0, 128, 0, 128, 128, 0, 0, 0, 0, 0, 0, 128, 0, + 0, 128, 0, 128, 0, 128, 0, 0, 0, 128, 128, + 128, 0, 128, 0, 128, 0, 0, 0, 128, 0, 0, 0]]) + + b = np.packbits(arr, axis=1) + assert_equal(b.dtype, np.uint8) + assert_array_equal(b, [[190, 72, 113, 214, 0], + [186, 216, 120, 210, 128], + [178, 248, 248, 210, 128], + [178, 241, 216, 64, 192], + [150, 227, 152, 68, 80], + [215, 195, 24, 5, 112], + [ 87, 202, 60, 5, 48], + [ 83, 90, 52, 1, 160], + [ 83, 90, 182, 72, 160], + [195, 83, 150, 88, 224], + [199, 83, 150, 92, 240], + [206, 119, 150, 92, 208], + [204, 127, 146, 78, 144], + [204, 109, 210, 110, 128], + [140, 73, 210, 39, 160], + [140, 64, 246, 181, 224], + [136, 208, 255, 149, 240], + [136, 244, 255, 220, 208], + [ 8, 189, 223, 222, 144], + [ 40, 45, 151, 218, 144], + [105, 41, 21, 218, 176], + [107, 104, 17, 202, 240], + [ 75, 122, 17, 234, 224], + [ 74, 90, 131, 170, 192], + [ 88, 18, 163, 168, 128]]) + + + # result is the same if input is multiplied with a nonzero value + for dtype in 'bBhHiIlLqQ': + arr = np.array(a, dtype=dtype) + rnd = np.random.randint(low=np.iinfo(dtype).min, + high=np.iinfo(dtype).max, size=arr.size, + dtype=dtype) + rnd[rnd == 0] = 1 + arr *= rnd.astype(dtype) + b = np.packbits(arr, axis=-1) + assert_array_equal(np.unpackbits(b)[:-4], a) + + assert_raises(TypeError, np.packbits, np.array(a, dtype=float)) + + def test_unpackbits(): # Copied from the docstring. a = np.array([[2], [7], [23]], dtype=np.uint8) @@ -86,3 +248,18 @@ def test_unpackbits_empty_with_axis(): b = np.unpackbits(a, axis=ax) assert_equal(b.dtype, np.uint8) assert_equal(b.shape, out_shape) + + +def test_unpackbits_large(): + # test all possible numbers via comparison to already tested packbits + d = np.arange(277, dtype=np.uint8) + assert_array_equal(np.packbits(np.unpackbits(d)), d) + assert_array_equal(np.packbits(np.unpackbits(d[::2])), d[::2]) + d = np.tile(d, (3, 1)) + assert_array_equal(np.packbits(np.unpackbits(d, axis=1), axis=1), d) + d = d.T.copy() + assert_array_equal(np.packbits(np.unpackbits(d, axis=0), axis=0), d) + + +if __name__ == "__main__": + run_module_suite() |