diff options
Diffstat (limited to 'numpy')
-rw-r--r--  numpy/core/include/numpy/npy_cpu.h                      |  4
-rw-r--r--  numpy/core/src/multiarray/common.h                      |  4
-rw-r--r--  numpy/core/src/multiarray/item_selection.c              | 17
-rw-r--r--  numpy/core/src/multiarray/lowlevel_strided_loops.c.src  |  2
4 files changed, 14 insertions, 13 deletions
diff --git a/numpy/core/include/numpy/npy_cpu.h b/numpy/core/include/numpy/npy_cpu.h index 6773d3258..80bd5fc56 100644 --- a/numpy/core/include/numpy/npy_cpu.h +++ b/numpy/core/include/numpy/npy_cpu.h @@ -109,7 +109,9 @@ #endif #if (defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64)) -#define NPY_CPU_HAVE_UNALIGNED_ACCESS +#define NPY_CPU_HAVE_UNALIGNED_ACCESS 1 +#else +#define NPY_CPU_HAVE_UNALIGNED_ACCESS 0 #endif #endif diff --git a/numpy/core/src/multiarray/common.h b/numpy/core/src/multiarray/common.h index b9316dc5c..e2a3b006d 100644 --- a/numpy/core/src/multiarray/common.h +++ b/numpy/core/src/multiarray/common.h @@ -140,8 +140,7 @@ npy_memchr(char * haystack, char needle, } else { /* usually find elements to skip path */ -#if defined NPY_CPU_HAVE_UNALIGNED_ACCESS - if (needle == 0 && stride == 1) { + if (NPY_CPU_HAVE_UNALIGNED_ACCESS && needle == 0 && stride == 1) { /* iterate until last multiple of 4 */ char * block_end = haystack + size - (size % sizeof(unsigned int)); while (p < block_end) { @@ -154,7 +153,6 @@ npy_memchr(char * haystack, char needle, /* handle rest */ subloopsize = (p - haystack); } -#endif while (subloopsize < size && *p == needle) { subloopsize++; p += stride; diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c index bc076ba83..3a4b7cbcc 100644 --- a/numpy/core/src/multiarray/item_selection.c +++ b/numpy/core/src/multiarray/item_selection.c @@ -8,6 +8,7 @@ #include "numpy/arrayscalars.h" #include "numpy/npy_math.h" +#include "numpy/npy_cpu.h" #include "npy_config.h" @@ -2401,11 +2402,10 @@ PyArray_Compress(PyArrayObject *self, PyObject *condition, int axis, * but a 32 bit type version would make it even faster on these platforms */ static NPY_INLINE int -count_nonzero_bytes_128(const char *w) +count_nonzero_bytes_128(const npy_uint64 * w) { - const npy_uint64 *w64 = (const npy_uint64 *)w; - npy_uint64 w1 = w64[0]; - npy_uint64 w2 = w64[1]; + const npy_uint64 w1 = w[0]; + const npy_uint64 w2 
= w[1]; /* * bytes not exclusively 0 or 1, sum them individually. @@ -2414,7 +2414,7 @@ count_nonzero_bytes_128(const char *w) */ if (NPY_UNLIKELY(((w1 | w2) & 0xFEFEFEFEFEFEFEFEULL) != 0)) { /* reload from pointer to avoid a unnecessary stack spill with gcc */ - const char *c = w; + const char * c = (const char *)w; npy_uintp i, count = 0; for (i = 0; i < 16; i++) { count += (c[i] != 0); @@ -2466,10 +2466,11 @@ count_boolean_trues(int ndim, char *data, npy_intp *ashape, npy_intp *astrides) /* Process the innermost dimension */ const char *d = data; const char *e = data + shape[0]; - if (npy_is_aligned(data, sizeof(npy_uint64))) { + if (NPY_CPU_HAVE_UNALIGNED_ACCESS || + npy_is_aligned(d, sizeof(npy_uint64))) { npy_uintp stride = 2 * sizeof(npy_uint64); - for (; d < e - (shape[0] % stride); d += stride) { - count += count_nonzero_bytes_128((npy_uint64 *)d); + for (; d < e - (shape[0] % stride); d += stride) { + count += count_nonzero_bytes_128((const npy_uint64 *)d); } } for (; d < e; ++d) { diff --git a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src index 007ec8b9b..638ae4d1e 100644 --- a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src +++ b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src @@ -39,7 +39,7 @@ * instructions (16 byte). * So this flag can only be enabled if autovectorization is disabled. */ -#ifdef NPY_CPU_HAVE_UNALIGNED_ACCESS +#if NPY_CPU_HAVE_UNALIGNED_ACCESS # define NPY_USE_UNALIGNED_ACCESS 0 #else # define NPY_USE_UNALIGNED_ACCESS 0 |