Merge pull request #8326 from juliantaylor/vectorize-packbits

ENH: Vectorize packbits with SSE2
author: Charles Harris <charlesr.harris@gmail.com> 2017-01-20 19:55:16 -0700
committer: GitHub <noreply@github.com> 2017-01-20 19:55:16 -0700
commit: d25e7332ab1579aa71d481fdb382c477bf4a2b22 (patch)
tree: 13536f06df9b5f5cfd6f55b41e25ffcc0dd59c84 /numpy
parent: a621a2b700415a5c155546f9cb1f064c6099579e (diff)
parent: f2a47486a4ccaa141ceeb585e24b2be17b359998 (diff)
download: numpy-d25e7332ab1579aa71d481fdb382c477bf4a2b22.tar.gz
3 files changed, 255 insertions, 10 deletions
diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py
index 18066d991..596b3996c 100644
--- a/numpy/core/setup_common.py
+++ b/numpy/core/setup_common.py
@@ -130,6 +130,8 @@ OPTIONAL_INTRINSICS = [("__builtin_isnan", '5.'),
                        # broken on OSX 10.11, make sure its not optimized away
                        ("volatile int r = __builtin_cpu_supports", '"sse"',
                         "stdio.h", "__BUILTIN_CPU_SUPPORTS"),
+                       # MMX only needed for icc, but some clangs don't have it
+                       ("_m_from_int64", '0', "emmintrin.h"),
                        ("_mm_load_ps", '(float*)0', "xmmintrin.h"),  # SSE
                        ("_mm_prefetch", '(float*)0, _MM_HINT_NTA',
                         "xmmintrin.h"),  # SSE
diff --git a/numpy/core/src/multiarray/compiled_base.c b/numpy/core/src/multiarray/compiled_base.c
index f2323d9e2..6c76dbd4b 100644
--- a/numpy/core/src/multiarray/compiled_base.c
+++ b/numpy/core/src/multiarray/compiled_base.c
@@ -9,6 +9,7 @@
 #include "numpy/npy_math.h"
 #include "npy_config.h"
 #include "templ_common.h" /* for npy_mul_with_overflow_intp */
+#include "lowlevel_strided_loops.h" /* for npy_bswap8 */
 
 
 /*
@@ -1475,6 +1476,9 @@ arr_add_docstring(PyObject *NPY_UNUSED(dummy), PyObject *args)
     Py_RETURN_NONE;
 }
 
+#if defined NPY_HAVE_SSE2_INTRINSICS
+#include <emmintrin.h>
+#endif
 
 /*
  * This function packs boolean values in the input array into the bits of a
@@ -1497,13 +1501,41 @@ pack_inner(const char *inptr,
      *  No:  move on
      * Every 8th value, set the value of build and increment the outptr
      */
-    npy_intp index;
+    npy_intp index = 0;
     int remain = n_in % 8;              /* uneven bits */
 
+#if defined NPY_HAVE_SSE2_INTRINSICS && defined HAVE__M_FROM_INT64
+    if (in_stride == 1 && element_size == 1 && n_out > 2) {
+        __m128i zero = _mm_setzero_si128();
+        /* don't handle non-full 8-byte remainder */
+        npy_intp vn_out = n_out - (remain ? 1 : 0);
+        vn_out -= (vn_out & 2);
+        for (index = 0; index < vn_out; index += 2) {
+            unsigned int r;
+            /* swap as packbits is "big endian", note x86 can load unaligned */
+            npy_uint64 a = npy_bswap8(*(npy_uint64*)inptr);
+            npy_uint64 b = npy_bswap8(*(npy_uint64*)(inptr + 8));
+            __m128i v = _mm_set_epi64(_m_from_int64(b), _m_from_int64(a));
+            /* false -> 0x00 and true -> 0xFF (there is no cmpneq) */
+            v = _mm_cmpeq_epi8(v, zero);
+            v = _mm_cmpeq_epi8(v, zero);
+            /* extract msb of 16 bytes and pack it into 16 bit */
+            r = _mm_movemask_epi8(v);
+            /* store result */
+            memcpy(outptr, &r, 1);
+            outptr += out_stride;
+            memcpy(outptr, (char*)&r + 1, 1);
+            outptr += out_stride;
+            inptr += 16;
+        }
+    }
+#endif
+
     if (remain == 0) {                  /* assumes n_in > 0 */
         remain = 8;
     }
-    for (index = 0; index < n_out; index++) {
+    /* don't reset index to handle remainder of above block */
+    for (; index < n_out; index++) {
         char build = 0;
         int i, maxi;
         npy_intp j;
@@ -1628,6 +1660,8 @@ fail:
 static PyObject *
 unpack_bits(PyObject *input, int axis)
 {
+    static int unpack_init = 0;
+    static char unpack_lookup[256][8];
     PyArrayObject *inp;
     PyArrayObject *new = NULL;
     PyArrayObject *out = NULL;
@@ -1693,6 +1727,28 @@ unpack_bits(PyObject *input, int axis)
         goto fail;
     }
 
+    /* setup lookup table under GIL, big endian 0..256 as bytes */
+    if (unpack_init == 0) {
+        npy_uint64 j;
+        npy_uint64 * unpack_lookup_64 = (npy_uint64 *)unpack_lookup;
+        for (j=0; j < 256; j++) {
+            npy_uint64 v = 0;
+            v |= (npy_uint64)((j &   1) ==   1);
+            v |= (npy_uint64)((j &   2) ==   2) << 8;
+            v |= (npy_uint64)((j &   4) ==   4) << 16;
+            v |= (npy_uint64)((j &   8) ==   8) << 24;
+            v |= (npy_uint64)((j &  16) ==  16) << 32;
+            v |= (npy_uint64)((j &  32) ==  32) << 40;
+            v |= (npy_uint64)((j &  64) ==  64) << 48;
+            v |= (npy_uint64)((j & 128) == 128) << 56;
+#if NPY_BYTE_ORDER == NPY_LITTLE_ENDIAN
+            v = npy_bswap8(v);
+#endif
+            unpack_lookup_64[j] = v;
+        }
+        unpack_init = 1;
+    }
+
     NPY_BEGIN_THREADS_THRESHOLDED(PyArray_DIM(new, axis));
 
     n_in = PyArray_DIM(new, axis);
@@ -1704,15 +1760,25 @@ unpack_bits(PyObject *input, int axis)
         unsigned const char *inptr = PyArray_ITER_DATA(it);
         char *outptr = PyArray_ITER_DATA(ot);
 
-        for (index = 0; index < n_in; index++) {
-            unsigned char mask = 128;
+        if (out_stride == 1) {
+            /* for unity stride we can just copy out of the lookup table */
+            for (index = 0; index < n_in; index++) {
+                memcpy(outptr, unpack_lookup[*inptr], 8);
+                outptr += 8;
+                inptr += in_stride;
+            }
+        }
+        else {
+            for (index = 0; index < n_in; index++) {
+                unsigned char mask = 128;
 
-            for (i = 0; i < 8; i++) {
-                *outptr = ((mask & (*inptr)) != 0);
-                outptr += out_stride;
-                mask >>= 1;
+                for (i = 0; i < 8; i++) {
+                    *outptr = ((mask & (*inptr)) != 0);
+                    outptr += out_stride;
+                    mask >>= 1;
+                }
+                inptr += in_stride;
             }
-            inptr += in_stride;
         }
         PyArray_ITER_NEXT(it);
         PyArray_ITER_NEXT(ot);
diff --git a/numpy/lib/tests/test_packbits.py b/numpy/lib/tests/test_packbits.py
index 5d8ac861b..4bf505f56 100644
--- a/numpy/lib/tests/test_packbits.py
+++ b/numpy/lib/tests/test_packbits.py
@@ -1,7 +1,9 @@
 from __future__ import division, absolute_import, print_function
 
 import numpy as np
-from numpy.testing import assert_array_equal, assert_equal, assert_raises
+from numpy.testing import (
+    assert_array_equal, assert_equal, assert_raises, run_module_suite
+)
 
 
 def test_packbits():
@@ -51,6 +53,166 @@ def test_packbits_empty_with_axis():
                 assert_equal(b.shape, out_shape)
 
 
+def test_packbits_large():
+    # test data large enough for 16 byte vectorization
+    a = np.array([1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0,
+                  0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
+                  1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0,
+                  1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
+                  1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1,
+                  1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1,
+                  1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1,
+                  0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1,
+                  1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
+                  1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1,
+                  1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0,
+                  0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1,
+                  1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0,
+                  1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0,
+                  1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0])
+    a = a.repeat(3)
+    for dtype in '?bBhHiIlLqQ':
+        arr = np.array(a, dtype=dtype)
+        b = np.packbits(arr, axis=None)
+        assert_equal(b.dtype, np.uint8)
+        r = [252, 127, 192, 3, 254, 7, 252, 0, 7, 31, 240, 0, 28, 1, 255, 252,
+             113, 248, 3, 255, 192, 28, 15, 192, 28, 126, 0, 224, 127, 255,
+             227, 142, 7, 31, 142, 63, 28, 126, 56, 227, 240, 0, 227, 128, 63,
+             224, 14, 56, 252, 112, 56, 255, 241, 248, 3, 240, 56, 224, 112,
+             63, 255, 255, 199, 224, 14, 0, 31, 143, 192, 3, 255, 199, 0, 1,
+             255, 224, 1, 255, 252, 126, 63, 0, 1, 192, 252, 14, 63, 0, 15,
+             199, 252, 113, 255, 3, 128, 56, 252, 14, 7, 0, 113, 255, 255, 142, 56, 227,
+             129, 248, 227, 129, 199, 31, 128]
+        assert_array_equal(b, r)
+        # equal for size being multiple of 8
+        assert_array_equal(np.unpackbits(b)[:-4], a)
+
+        # check last byte of different remainders (16 byte vectorization)
+        b = [np.packbits(arr[:-i], axis=None)[-1] for i in range(1, 16)]
+        assert_array_equal(b, [128, 128, 128, 31, 30, 28, 24, 16, 0, 0, 0, 199,
+                               198, 196, 192])
+
+
+        arr = arr.reshape(36, 25)
+        b = np.packbits(arr, axis=0)
+        assert_equal(b.dtype, np.uint8)
+        assert_array_equal(b, [[190, 186, 178, 178, 150, 215, 87, 83, 83, 195,
+                                199, 206, 204, 204, 140, 140, 136, 136, 8, 40, 105,
+                                107, 75, 74, 88],
+                               [72, 216, 248, 241, 227, 195, 202, 90, 90, 83,
+                                83, 119, 127, 109, 73, 64, 208, 244, 189, 45,
+                                41, 104, 122, 90, 18],
+                               [113, 120, 248, 216, 152, 24, 60, 52, 182, 150,
+                                150, 150, 146, 210, 210, 246, 255, 255, 223,
+                                151, 21, 17, 17, 131, 163],
+                               [214, 210, 210, 64, 68, 5, 5, 1, 72, 88, 92,
+                                92, 78, 110, 39, 181, 149, 220, 222, 218, 218,
+                                202, 234, 170, 168],
+                               [0, 128, 128, 192, 80, 112, 48, 160, 160, 224,
+                                240, 208, 144, 128, 160, 224, 240, 208, 144,
+                                144, 176, 240, 224, 192, 128]])
+
+        b = np.packbits(arr, axis=1)
+        assert_equal(b.dtype, np.uint8)
+        assert_array_equal(b, [[252, 127, 192,   0],
+                               [  7, 252,  15, 128],
+                               [240,   0,  28,   0],
+                               [255, 128,   0, 128],
+                               [192,  31, 255, 128],
+                               [142,  63,   0,   0],
+                               [255, 240,   7,   0],
+                               [  7, 224,  14,   0],
+                               [126,   0, 224,   0],
+                               [255, 255, 199,   0],
+                               [ 56,  28, 126,   0],
+                               [113, 248, 227, 128],
+                               [227, 142,  63,   0],
+                               [  0,  28, 112,   0],
+                               [ 15, 248,   3, 128],
+                               [ 28, 126,  56,   0],
+                               [ 56, 255, 241, 128],
+                               [240,   7, 224,   0],
+                               [227, 129, 192, 128],
+                               [255, 255, 254,   0],
+                               [126,   0, 224,   0],
+                               [  3, 241, 248,   0],
+                               [  0, 255, 241, 128],
+                               [128,   0, 255, 128],
+                               [224,   1, 255, 128],
+                               [248, 252, 126,   0],
+                               [  0,   7,   3, 128],
+                               [224, 113, 248,   0],
+                               [  0, 252, 127, 128],
+                               [142,  63, 224,   0],
+                               [224,  14,  63,   0],
+                               [  7,   3, 128,   0],
+                               [113, 255, 255, 128],
+                               [ 28, 113, 199,   0],
+                               [  7, 227, 142,   0],
+                               [ 14,  56, 252,   0]])
+
+        arr = arr.T.copy()
+        b = np.packbits(arr, axis=0)
+        assert_equal(b.dtype, np.uint8)
+        assert_array_equal(b, [[252, 7, 240, 255, 192, 142, 255, 7, 126, 255,
+                                56, 113, 227, 0, 15, 28, 56, 240, 227, 255,
+                                126, 3, 0, 128, 224, 248, 0, 224, 0, 142, 224,
+                                7, 113, 28, 7, 14],
+                                [127, 252, 0, 128, 31, 63, 240, 224, 0, 255,
+                                 28, 248, 142, 28, 248, 126, 255, 7, 129, 255,
+                                 0, 241, 255, 0, 1, 252, 7, 113, 252, 63, 14,
+                                 3, 255, 113, 227, 56],
+                                [192, 15, 28, 0, 255, 0, 7, 14, 224, 199, 126,
+                                 227, 63, 112, 3, 56, 241, 224, 192, 254, 224,
+                                 248, 241, 255, 255, 126, 3, 248, 127, 224, 63,
+                                 128, 255, 199, 142, 252],
+                                [0, 128, 0, 128, 128, 0, 0, 0, 0, 0, 0, 128, 0,
+                                 0, 128, 0, 128, 0, 128, 0, 0, 0, 128, 128,
+                                 128, 0, 128, 0, 128, 0, 0, 0, 128, 0, 0, 0]])
+
+        b = np.packbits(arr, axis=1)
+        assert_equal(b.dtype, np.uint8)
+        assert_array_equal(b, [[190,  72, 113, 214,   0],
+                               [186, 216, 120, 210, 128],
+                               [178, 248, 248, 210, 128],
+                               [178, 241, 216,  64, 192],
+                               [150, 227, 152,  68,  80],
+                               [215, 195,  24,   5, 112],
+                               [ 87, 202,  60,   5,  48],
+                               [ 83,  90,  52,   1, 160],
+                               [ 83,  90, 182,  72, 160],
+                               [195,  83, 150,  88, 224],
+                               [199,  83, 150,  92, 240],
+                               [206, 119, 150,  92, 208],
+                               [204, 127, 146,  78, 144],
+                               [204, 109, 210, 110, 128],
+                               [140,  73, 210,  39, 160],
+                               [140,  64, 246, 181, 224],
+                               [136, 208, 255, 149, 240],
+                               [136, 244, 255, 220, 208],
+                               [  8, 189, 223, 222, 144],
+                               [ 40,  45, 151, 218, 144],
+                               [105,  41,  21, 218, 176],
+                               [107, 104,  17, 202, 240],
+                               [ 75, 122,  17, 234, 224],
+                               [ 74,  90, 131, 170, 192],
+                               [ 88,  18, 163, 168, 128]])
+
+
+    # result is the same if input is multiplied with a nonzero value
+    for dtype in 'bBhHiIlLqQ':
+        arr = np.array(a, dtype=dtype)
+        rnd = np.random.randint(low=np.iinfo(dtype).min,
+                                high=np.iinfo(dtype).max, size=arr.size,
+                                dtype=dtype)
+        rnd[rnd == 0] = 1
+        arr *= rnd.astype(dtype)
+        b = np.packbits(arr, axis=-1)
+        assert_array_equal(np.unpackbits(b)[:-4], a)
+
+    assert_raises(TypeError, np.packbits, np.array(a, dtype=float))
+
+
 def test_unpackbits():
     # Copied from the docstring.
     a = np.array([[2], [7], [23]], dtype=np.uint8)
@@ -86,3 +248,18 @@ def test_unpackbits_empty_with_axis():
             b = np.unpackbits(a, axis=ax)
             assert_equal(b.dtype, np.uint8)
             assert_equal(b.shape, out_shape)
+
+
+def test_unpackbits_large():
+    # test all possible numbers via comparison to already tested packbits
+    d = np.arange(277, dtype=np.uint8)
+    assert_array_equal(np.packbits(np.unpackbits(d)), d)
+    assert_array_equal(np.packbits(np.unpackbits(d[::2])), d[::2])
+    d = np.tile(d, (3, 1))
+    assert_array_equal(np.packbits(np.unpackbits(d, axis=1), axis=1), d)
+    d = d.T.copy()
+    assert_array_equal(np.packbits(np.unpackbits(d, axis=0), axis=0), d)
+
+
+if __name__ == "__main__":
+    run_module_suite()
author	Charles Harris <charlesr.harris@gmail.com>	2017-01-20 19:55:16 -0700
committer	GitHub <noreply@github.com>	2017-01-20 19:55:16 -0700
commit	d25e7332ab1579aa71d481fdb382c477bf4a2b22 (patch)
tree	13536f06df9b5f5cfd6f55b41e25ffcc0dd59c84 /numpy
parent	a621a2b700415a5c155546f9cb1f064c6099579e (diff)
parent	f2a47486a4ccaa141ceeb585e24b2be17b359998 (diff)
download	numpy-d25e7332ab1579aa71d481fdb382c477bf4a2b22.tar.gz