diff options
Diffstat (limited to 'numpy')
-rw-r--r--  numpy/core/include/numpy/npy_cpu.h                      |  4
-rw-r--r--  numpy/core/src/multiarray/common.h                      |  4
-rw-r--r--  numpy/core/src/multiarray/item_selection.c              | 17
-rw-r--r--  numpy/core/src/multiarray/lowlevel_strided_loops.c.src  |  2
4 files changed, 14 insertions, 13 deletions
diff --git a/numpy/core/include/numpy/npy_cpu.h b/numpy/core/include/numpy/npy_cpu.h index 6773d3258..80bd5fc56 100644 --- a/numpy/core/include/numpy/npy_cpu.h +++ b/numpy/core/include/numpy/npy_cpu.h @@ -109,7 +109,9 @@ #endif #if (defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64)) -#define NPY_CPU_HAVE_UNALIGNED_ACCESS +#define NPY_CPU_HAVE_UNALIGNED_ACCESS 1 +#else +#define NPY_CPU_HAVE_UNALIGNED_ACCESS 0 #endif #endif diff --git a/numpy/core/src/multiarray/common.h b/numpy/core/src/multiarray/common.h index b9316dc5c..e2a3b006d 100644 --- a/numpy/core/src/multiarray/common.h +++ b/numpy/core/src/multiarray/common.h @@ -140,8 +140,7 @@ npy_memchr(char * haystack, char needle, } else { /* usually find elements to skip path */ -#if defined NPY_CPU_HAVE_UNALIGNED_ACCESS - if (needle == 0 && stride == 1) { + if (NPY_CPU_HAVE_UNALIGNED_ACCESS && needle == 0 && stride == 1) { /* iterate until last multiple of 4 */ char * block_end = haystack + size - (size % sizeof(unsigned int)); while (p < block_end) { @@ -154,7 +153,6 @@ npy_memchr(char * haystack, char needle, /* handle rest */ subloopsize = (p - haystack); } -#endif while (subloopsize < size && *p == needle) { subloopsize++; p += stride; diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c index bc076ba83..3a4b7cbcc 100644 --- a/numpy/core/src/multiarray/item_selection.c +++ b/numpy/core/src/multiarray/item_selection.c @@ -8,6 +8,7 @@ #include "numpy/arrayscalars.h" #include "numpy/npy_math.h" +#include "numpy/npy_cpu.h" #include "npy_config.h" @@ -2401,11 +2402,10 @@ PyArray_Compress(PyArrayObject *self, PyObject *condition, int axis, * but a 32 bit type version would make it even faster on these platforms */ static NPY_INLINE int -count_nonzero_bytes_128(const char *w) +count_nonzero_bytes_128(const npy_uint64 * w) { - const npy_uint64 *w64 = (const npy_uint64 *)w; - npy_uint64 w1 = w64[0]; - npy_uint64 w2 = w64[1]; + const npy_uint64 w1 = w[0]; + const npy_uint64 w2 
= w[1]; /* * bytes not exclusively 0 or 1, sum them individually. @@ -2414,7 +2414,7 @@ count_nonzero_bytes_128(const char *w) */ if (NPY_UNLIKELY(((w1 | w2) & 0xFEFEFEFEFEFEFEFEULL) != 0)) { /* reload from pointer to avoid a unnecessary stack spill with gcc */ - const char *c = w; + const char * c = (const char *)w; npy_uintp i, count = 0; for (i = 0; i < 16; i++) { count += (c[i] != 0); @@ -2466,10 +2466,11 @@ count_boolean_trues(int ndim, char *data, npy_intp *ashape, npy_intp *astrides) /* Process the innermost dimension */ const char *d = data; const char *e = data + shape[0]; - if (npy_is_aligned(data, sizeof(npy_uint64))) { + if (NPY_CPU_HAVE_UNALIGNED_ACCESS || + npy_is_aligned(d, sizeof(npy_uint64))) { npy_uintp stride = 2 * sizeof(npy_uint64); - for (; d < e - (shape[0] % stride); d += stride) { - count += count_nonzero_bytes_128((npy_uint64 *)d); + for (; d < e - (shape[0] % stride); d += stride) { + count += count_nonzero_bytes_128((const npy_uint64 *)d); } } for (; d < e; ++d) { diff --git a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src index 007ec8b9b..638ae4d1e 100644 --- a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src +++ b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src @@ -39,7 +39,7 @@ * instructions (16 byte). * So this flag can only be enabled if autovectorization is disabled. */ -#ifdef NPY_CPU_HAVE_UNALIGNED_ACCESS +#if NPY_CPU_HAVE_UNALIGNED_ACCESS # define NPY_USE_UNALIGNED_ACCESS 0 #else # define NPY_USE_UNALIGNED_ACCESS 0 |