summary | refs | log | tree | commit | diff
path: root/numpy
diff options
context:
space:
mode:
Diffstat (limited to 'numpy')
-rw-r--r-- numpy/core/include/numpy/npy_cpu.h 4
-rw-r--r-- numpy/core/src/multiarray/common.h 4
-rw-r--r-- numpy/core/src/multiarray/item_selection.c 17
-rw-r--r-- numpy/core/src/multiarray/lowlevel_strided_loops.c.src 2
4 files changed, 14 insertions, 13 deletions
diff --git a/numpy/core/include/numpy/npy_cpu.h b/numpy/core/include/numpy/npy_cpu.h
index 6773d3258..80bd5fc56 100644
--- a/numpy/core/include/numpy/npy_cpu.h
+++ b/numpy/core/include/numpy/npy_cpu.h
@@ -109,7 +109,9 @@
#endif
#if (defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64))
-#define NPY_CPU_HAVE_UNALIGNED_ACCESS
+#define NPY_CPU_HAVE_UNALIGNED_ACCESS 1
+#else
+#define NPY_CPU_HAVE_UNALIGNED_ACCESS 0
#endif
#endif
diff --git a/numpy/core/src/multiarray/common.h b/numpy/core/src/multiarray/common.h
index b9316dc5c..e2a3b006d 100644
--- a/numpy/core/src/multiarray/common.h
+++ b/numpy/core/src/multiarray/common.h
@@ -140,8 +140,7 @@ npy_memchr(char * haystack, char needle,
}
else {
/* this is usually the path taken to find elements to skip */
-#if defined NPY_CPU_HAVE_UNALIGNED_ACCESS
- if (needle == 0 && stride == 1) {
+ if (NPY_CPU_HAVE_UNALIGNED_ACCESS && needle == 0 && stride == 1) {
/* iterate until the last multiple of sizeof(unsigned int) */
char * block_end = haystack + size - (size % sizeof(unsigned int));
while (p < block_end) {
@@ -154,7 +153,6 @@ npy_memchr(char * haystack, char needle,
/* handle rest */
subloopsize = (p - haystack);
}
-#endif
while (subloopsize < size && *p == needle) {
subloopsize++;
p += stride;
diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c
index bc076ba83..3a4b7cbcc 100644
--- a/numpy/core/src/multiarray/item_selection.c
+++ b/numpy/core/src/multiarray/item_selection.c
@@ -8,6 +8,7 @@
#include "numpy/arrayscalars.h"
#include "numpy/npy_math.h"
+#include "numpy/npy_cpu.h"
#include "npy_config.h"
@@ -2401,11 +2402,10 @@ PyArray_Compress(PyArrayObject *self, PyObject *condition, int axis,
* but a 32 bit type version would make it even faster on these platforms
*/
static NPY_INLINE int
-count_nonzero_bytes_128(const char *w)
+count_nonzero_bytes_128(const npy_uint64 * w)
{
- const npy_uint64 *w64 = (const npy_uint64 *)w;
- npy_uint64 w1 = w64[0];
- npy_uint64 w2 = w64[1];
+ const npy_uint64 w1 = w[0];
+ const npy_uint64 w2 = w[1];
/*
* bytes not exclusively 0 or 1, sum them individually.
@@ -2414,7 +2414,7 @@ count_nonzero_bytes_128(const char *w)
*/
if (NPY_UNLIKELY(((w1 | w2) & 0xFEFEFEFEFEFEFEFEULL) != 0)) {
/* reload from pointer to avoid an unnecessary stack spill with gcc */
- const char *c = w;
+ const char * c = (const char *)w;
npy_uintp i, count = 0;
for (i = 0; i < 16; i++) {
count += (c[i] != 0);
@@ -2466,10 +2466,11 @@ count_boolean_trues(int ndim, char *data, npy_intp *ashape, npy_intp *astrides)
/* Process the innermost dimension */
const char *d = data;
const char *e = data + shape[0];
- if (npy_is_aligned(data, sizeof(npy_uint64))) {
+ if (NPY_CPU_HAVE_UNALIGNED_ACCESS ||
+ npy_is_aligned(d, sizeof(npy_uint64))) {
npy_uintp stride = 2 * sizeof(npy_uint64);
- for (; d < e - (shape[0] % stride); d += stride) {
- count += count_nonzero_bytes_128((npy_uint64 *)d);
+ for (; d < e - (shape[0] % stride); d += stride) {
+ count += count_nonzero_bytes_128((const npy_uint64 *)d);
}
}
for (; d < e; ++d) {
diff --git a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
index 007ec8b9b..638ae4d1e 100644
--- a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
+++ b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
@@ -39,7 +39,7 @@
* instructions (16 byte).
* So this flag can only be enabled if autovectorization is disabled.
*/
-#ifdef NPY_CPU_HAVE_UNALIGNED_ACCESS
+#if NPY_CPU_HAVE_UNALIGNED_ACCESS
# define NPY_USE_UNALIGNED_ACCESS 0
#else
# define NPY_USE_UNALIGNED_ACCESS 0