author    Charles Harris <charlesr.harris@gmail.com>    2017-02-16 15:41:27 -0700
committer GitHub <noreply@github.com>                   2017-02-16 15:41:27 -0700
commit    d55f40b1e48d5a5fbed80e00a140c9db6e19732f (patch)
tree      733aa0ac1abdcc1aadf6452be5729b7b83095a09 /numpy
parent    ba1ddc421eac7b5c3accb99a4c633490a1814838 (diff)
parent    8e8ce442e8449916a93951093cdce16cec006bcc (diff)
Merge pull request #8043 from pv/ufunc-copy-overlap
ENH: umath: ensure ufuncs are well-defined with memory overlapping inputs
Diffstat (limited to 'numpy')
-rw-r--r--  numpy/add_newdocs.py                               7
-rw-r--r--  numpy/core/code_generators/cversions.txt           3
-rw-r--r--  numpy/core/code_generators/numpy_api.py            2
-rw-r--r--  numpy/core/include/numpy/ndarraytypes.h            8
-rw-r--r--  numpy/core/setup.py                                5
-rw-r--r--  numpy/core/setup_common.py                         3
-rw-r--r--  numpy/core/src/multiarray/mapping.c               88
-rw-r--r--  numpy/core/src/multiarray/nditer_constr.c        101
-rw-r--r--  numpy/core/src/multiarray/nditer_impl.h            2
-rw-r--r--  numpy/core/src/multiarray/nditer_pywrap.c         10
-rw-r--r--  numpy/core/src/private/lowlevel_strided_loops.h  125
-rw-r--r--  numpy/core/src/umath/reduction.c                  46
-rw-r--r--  numpy/core/src/umath/ufunc_object.c              179
-rw-r--r--  numpy/core/tests/test_mem_overlap.py             466
-rw-r--r--  numpy/core/tests/test_nditer.py                  100
-rw-r--r--  numpy/core/tests/test_umath.py                    16
16 files changed, 1035 insertions, 126 deletions
diff --git a/numpy/add_newdocs.py b/numpy/add_newdocs.py
index 41c560074..09f4e40c4 100644
--- a/numpy/add_newdocs.py
+++ b/numpy/add_newdocs.py
@@ -169,6 +169,10 @@ add_newdoc('numpy.core', 'nditer',
with one per iteration dimension, to be tracked.
* "common_dtype" causes all the operands to be converted to
a common data type, with copying or buffering as necessary.
+ * "copy_if_overlap" causes the iterator to determine if read
+ operands have overlap with write operands, and make temporary
+ copies as necessary to avoid overlap. False positives (needless
+ copying) are possible in some cases.
* "delay_bufalloc" delays allocation of the buffers until
a reset() call is made. Allows "allocate" operands to
be initialized before their values are copied into the buffers.
@@ -208,6 +212,9 @@ add_newdoc('numpy.core', 'nditer',
copies those elements indicated by this mask.
* 'writemasked' indicates that only elements where the chosen
'arraymask' operand is True will be written to.
+ * "overlap_assume_elementwise" can be used to mark operands that are
+ accessed only in the iterator order, to allow less conservative
+ copying when "copy_if_overlap" is present.
op_dtypes : dtype or tuple of dtype(s), optional
The required data type(s) of the operands. If copying or buffering
is enabled, the data will be converted to/from their original types.
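For illustration, a minimal Python sketch of the two new flags documented above, mirroring the behavior exercised in test_iter_copy_if_overlap further down (array shapes are arbitrary):

    import numpy as np

    x = np.arange(10)
    a, b = x[1:], x[:-1]   # overlapping views of the same buffer

    # "copy_if_overlap" makes the iterator copy the write operand, so
    # the read operand is no longer aliased during iteration.
    it = np.nditer([a, b], ['copy_if_overlap'],
                   [['readonly'], ['readwrite']])
    assert not np.shares_memory(*it.operands)

    # With views of exactly the same data, "overlap_assume_elementwise"
    # promises access in iterator order only, so no copy is made.
    it = np.nditer([x, x], ['copy_if_overlap'],
                   [['readonly', 'overlap_assume_elementwise'],
                    ['readwrite', 'overlap_assume_elementwise']])
    assert it.operands[0] is x and it.operands[1] is x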
diff --git a/numpy/core/code_generators/cversions.txt b/numpy/core/code_generators/cversions.txt
index 9ade153f5..54140f24a 100644
--- a/numpy/core/code_generators/cversions.txt
+++ b/numpy/core/code_generators/cversions.txt
@@ -34,3 +34,6 @@
# Version 10 (NumPy 1.11) No change.
# Version 10 (NumPy 1.12) No change.
0x0000000a = 9b8bce614655d3eb02acddcb508203cb
+
+# Version 11 (NumPy 1.13) Added PyArray_MapIterArrayCopyIfOverlap
+0x0000000b = edb1ba83730c650fd9bc5772a919cda7
diff --git a/numpy/core/code_generators/numpy_api.py b/numpy/core/code_generators/numpy_api.py
index 972966627..d1406e3b2 100644
--- a/numpy/core/code_generators/numpy_api.py
+++ b/numpy/core/code_generators/numpy_api.py
@@ -344,6 +344,8 @@ multiarray_funcs_api = {
# End 1.9 API
'PyArray_CheckAnyScalarExact': (300, NonNull(1)),
# End 1.10 API
+ 'PyArray_MapIterArrayCopyIfOverlap': (301,),
+ # End 1.13 API
}
ufunc_types_api = {
diff --git a/numpy/core/include/numpy/ndarraytypes.h b/numpy/core/include/numpy/ndarraytypes.h
index a9848f434..7f6fe6524 100644
--- a/numpy/core/include/numpy/ndarraytypes.h
+++ b/numpy/core/include/numpy/ndarraytypes.h
@@ -1008,6 +1008,12 @@ typedef void (NpyIter_GetMultiIndexFunc)(NpyIter *iter,
#define NPY_ITER_DELAY_BUFALLOC 0x00000800
/* When NPY_KEEPORDER is specified, disable reversing negative-stride axes */
#define NPY_ITER_DONT_NEGATE_STRIDES 0x00001000
+/*
+ * If output operands overlap with other operands (based on heuristics that
+ * have false positives but no false negatives), make temporary copies to
+ * eliminate overlap.
+ */
+#define NPY_ITER_COPY_IF_OVERLAP 0x00002000
/*** Per-operand flags that may be passed to the iterator constructors ***/
@@ -1039,6 +1045,8 @@ typedef void (NpyIter_GetMultiIndexFunc)(NpyIter *iter,
#define NPY_ITER_WRITEMASKED 0x10000000
/* This array is the mask for all WRITEMASKED operands */
#define NPY_ITER_ARRAYMASK 0x20000000
+/* Assume iterator order data access for COPY_IF_OVERLAP */
+#define NPY_ITER_OVERLAP_ASSUME_ELEMENTWISE 0x40000000
#define NPY_ITER_GLOBAL_FLAGS 0x0000ffff
#define NPY_ITER_PER_OP_FLAGS 0xffff0000
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 01070cc74..f45042bec 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -901,7 +901,8 @@ def configuration(parent_package='',top_path=None):
join('src', 'umath', 'loops.c.src'),
join('src', 'umath', 'ufunc_object.c'),
join('src', 'umath', 'scalarmath.c.src'),
- join('src', 'umath', 'ufunc_type_resolution.c')]
+ join('src', 'umath', 'ufunc_type_resolution.c'),
+ join('src', 'private', 'mem_overlap.c')]
umath_deps = [
generate_umath_py,
@@ -911,6 +912,8 @@ def configuration(parent_package='',top_path=None):
join('src', 'private', 'templ_common.h.src'),
join('src', 'umath', 'simd.inc.src'),
join(codegen_dir, 'generate_ufunc_api.py'),
+ join('src', 'private', 'lowlevel_strided_loops.h'),
+ join('src', 'private', 'mem_overlap.h'),
join('src', 'private', 'ufunc_override.h')] + npymath_sources
config.add_extension('umath',
diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py
index 596b3996c..357051cdb 100644
--- a/numpy/core/setup_common.py
+++ b/numpy/core/setup_common.py
@@ -38,7 +38,8 @@ C_ABI_VERSION = 0x01000009
# 0x0000000a - 1.10.x
# 0x0000000a - 1.11.x
# 0x0000000a - 1.12.x
-C_API_VERSION = 0x0000000a
+# 0x0000000b - 1.13.x
+C_API_VERSION = 0x0000000b
class MismatchCAPIWarning(Warning):
pass
diff --git a/numpy/core/src/multiarray/mapping.c b/numpy/core/src/multiarray/mapping.c
index 70affbc9b..7a009d237 100644
--- a/numpy/core/src/multiarray/mapping.c
+++ b/numpy/core/src/multiarray/mapping.c
@@ -19,6 +19,7 @@
#include "mapping.h"
#include "lowlevel_strided_loops.h"
#include "item_selection.h"
+#include "mem_overlap.h"
#define HAS_INTEGER 1
@@ -704,6 +705,39 @@ prepare_index(PyArrayObject *self, PyObject *index,
/**
+ * Check if self has memory overlap with one of the index arrays, or with extra_op.
+ *
+ * @returns 1 if memory overlap found, 0 if not.
+ */
+NPY_NO_EXPORT int
+index_has_memory_overlap(PyArrayObject *self,
+ int index_type, npy_index_info *indices, int num,
+ PyObject *extra_op)
+{
+ int i;
+
+ if (index_type & (HAS_FANCY | HAS_BOOL)) {
+ for (i = 0; i < num; ++i) {
+ if (indices[i].object != NULL &&
+ PyArray_Check(indices[i].object) &&
+ solve_may_share_memory(self,
+ (PyArrayObject *)indices[i].object,
+ 1) != 0) {
+ return 1;
+ }
+ }
+ }
+
+ if (extra_op != NULL && PyArray_Check(extra_op) &&
+ solve_may_share_memory(self, (PyArrayObject *)extra_op, 1) != 0) {
+ return 1;
+ }
+
+ return 0;
+}
+
+
+/**
* Get pointer for an integer index.
*
* For a purely integer index, set ptr to the memory address.
@@ -1923,7 +1957,9 @@ array_assign_subscript(PyArrayObject *self, PyObject *ind, PyObject *op)
* Either they are equivalent, or the values must
* be a scalar
*/
- (PyArray_EQUIVALENTLY_ITERABLE(ind, tmp_arr) ||
+ (PyArray_EQUIVALENTLY_ITERABLE(ind, tmp_arr,
+ PyArray_TRIVIALLY_ITERABLE_OP_READ,
+ PyArray_TRIVIALLY_ITERABLE_OP_READ) ||
(PyArray_NDIM(tmp_arr) == 0 &&
PyArray_TRIVIALLY_ITERABLE(tmp_arr))) &&
/* Check if the type is equivalent to INTP */
@@ -3130,19 +3166,22 @@ PyArray_MapIterNew(npy_index_info *indices , int index_num, int index_type,
/*NUMPY_API
*
- * Use advanced indexing to iterate an array. Please note
- * that most of this public API is currently not guaranteed
- * to stay the same between versions. If you plan on using
- * it, please consider adding more utility functions here
- * to accommodate new features.
+ * Same as PyArray_MapIterArray, but:
+ *
+ * If copy_if_overlap != 0, check if `a` has memory overlap with any of the
+ * arrays in `index` and with `extra_op`. If yes, make copies as appropriate
+ * to avoid problems if `a` is modified during the iteration.
+ * `iter->array` may contain a copied array (with UPDATEIFCOPY set).
*/
NPY_NO_EXPORT PyObject *
-PyArray_MapIterArray(PyArrayObject * a, PyObject * index)
+PyArray_MapIterArrayCopyIfOverlap(PyArrayObject * a, PyObject * index,
+ int copy_if_overlap, PyArrayObject *extra_op)
{
PyArrayMapIterObject * mit = NULL;
PyArrayObject *subspace = NULL;
npy_index_info indices[NPY_MAXDIMS * 2 + 1];
int i, index_num, ndim, fancy_ndim, index_type;
+ PyArrayObject *a_copy = NULL;
index_type = prepare_index(a, index, indices, &index_num,
&ndim, &fancy_ndim, 0);
@@ -3151,6 +3190,28 @@ PyArray_MapIterArray(PyArrayObject * a, PyObject * index)
return NULL;
}
+ if (copy_if_overlap && index_has_memory_overlap(a, index_type, indices,
+ index_num,
+ (PyObject *)extra_op)) {
+ /* Make a copy of the input array */
+ a_copy = (PyArrayObject *)PyArray_NewLikeArray(a, NPY_ANYORDER,
+ NULL, 0);
+ if (a_copy == NULL) {
+ goto fail;
+ }
+
+ if (PyArray_CopyInto(a_copy, a) != 0) {
+ goto fail;
+ }
+
+ Py_INCREF(a);
+ if (PyArray_SetUpdateIfCopyBase(a_copy, a) < 0) {
+ goto fail;
+ }
+
+ a = a_copy;
+ }
+
/* If it is not a pure fancy index, need to get the subspace */
if (index_type != HAS_FANCY) {
if (get_view_from_index(a, &subspace, indices, index_num, 1) < 0) {
@@ -3178,6 +3239,7 @@ PyArray_MapIterArray(PyArrayObject * a, PyObject * index)
goto fail;
}
+ Py_XDECREF(a_copy);
Py_XDECREF(subspace);
PyArray_MapIterReset(mit);
@@ -3188,6 +3250,7 @@ PyArray_MapIterArray(PyArrayObject * a, PyObject * index)
return (PyObject *)mit;
fail:
+ Py_XDECREF(a_copy);
Py_XDECREF(subspace);
Py_XDECREF((PyObject *)mit);
for (i=0; i < index_num; i++) {
@@ -3197,6 +3260,17 @@ PyArray_MapIterArray(PyArrayObject * a, PyObject * index)
}
+/*NUMPY_API
+ *
+ * Use advanced indexing to iterate an array.
+ */
+NPY_NO_EXPORT PyObject *
+PyArray_MapIterArray(PyArrayObject * a, PyObject * index)
+{
+ return PyArray_MapIterArrayCopyIfOverlap(a, index, 0, NULL);
+}
+
+
#undef HAS_INTEGER
#undef HAS_NEWAXIS
#undef HAS_SLICE
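The new entry point is reachable from Python through ufunc.at, which now passes copy_if_overlap (see the ufunc_object.c changes below). A small sketch of the resulting guarantee, mirroring test_ufunc_at_manual from the new tests:

    import numpy as np

    # The index array overlaps the array being updated; MapIter now
    # copies the operand, so the result matches the unaliased case.
    x = np.arange(10000, dtype=np.int16)
    a, ind = x[::-1], x

    a0 = a.copy()
    np.invert.at(a0, ind.copy())   # reference result, no overlap
    np.invert.at(a, ind)           # overlapping index and operand
    assert (a == a0).all()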
diff --git a/numpy/core/src/multiarray/nditer_constr.c b/numpy/core/src/multiarray/nditer_constr.c
index 48f4928db..b74aca01c 100644
--- a/numpy/core/src/multiarray/nditer_constr.c
+++ b/numpy/core/src/multiarray/nditer_constr.c
@@ -17,6 +17,8 @@
#include "arrayobject.h"
#include "templ_common.h"
+#include "mem_overlap.h"
+
/* Internal helper functions private to this file */
static int
@@ -2708,6 +2710,93 @@ npyiter_allocate_arrays(NpyIter *iter,
bufferdata = NIT_BUFFERDATA(iter);
}
+ if (flags & NPY_ITER_COPY_IF_OVERLAP) {
+ /*
+ * Perform operand memory overlap checks, if requested.
+ *
+ * If any write operand has memory overlap with any read operand,
+ * eliminate all overlap by making temporary copies, by enabling
+ * NPY_OP_ITFLAG_FORCECOPY for the write operand to force UPDATEIFCOPY.
+ *
+ * Operands with NPY_ITER_OVERLAP_ASSUME_ELEMENTWISE enabled are not
+ * considered overlapping if the arrays are exactly the same. In this
+ * case, the iterator loops through them in the same order element by
+ * element. (As usual, the user-provided inner loop is assumed to be
+ * able to deal with this level of simple aliasing.)
+ */
+ for (iop = 0; iop < nop; ++iop) {
+ int may_share_memory = 0;
+ int iother;
+
+ if (op[iop] == NULL) {
+ /* Iterator will always allocate */
+ continue;
+ }
+
+ if (!(op_itflags[iop] & NPY_OP_ITFLAG_WRITE)) {
+ /*
+ * Copy output operands only, not inputs.
+ * A more sophisticated heuristic could be
+ * substituted here later.
+ */
+ continue;
+ }
+
+ for (iother = 0; iother < nop; ++iother) {
+ if (iother == iop || op[iother] == NULL) {
+ continue;
+ }
+
+ if (!(op_itflags[iother] & NPY_OP_ITFLAG_READ)) {
+ /* No data dependence for arrays not read from */
+ continue;
+ }
+
+ if (op_itflags[iother] & NPY_OP_ITFLAG_FORCECOPY) {
+ /* Already copied */
+ continue;
+ }
+
+ /*
+ * If the arrays are views to exactly the same data, no need
+ * to make copies, if the caller (eg ufunc) says it accesses
+ * data only in the iterator order.
+ *
+ * However, if there is internal overlap (e.g. a zero stride on
+ * a non-unit dimension), a copy cannot be avoided.
+ */
+ if ((op_flags[iop] & NPY_ITER_OVERLAP_ASSUME_ELEMENTWISE) &&
+ (op_flags[iother] & NPY_ITER_OVERLAP_ASSUME_ELEMENTWISE) &&
+ PyArray_BYTES(op[iop]) == PyArray_BYTES(op[iother]) &&
+ PyArray_NDIM(op[iop]) == PyArray_NDIM(op[iother]) &&
+ PyArray_CompareLists(PyArray_DIMS(op[iop]),
+ PyArray_DIMS(op[iother]),
+ PyArray_NDIM(op[iop])) &&
+ PyArray_CompareLists(PyArray_STRIDES(op[iop]),
+ PyArray_STRIDES(op[iother]),
+ PyArray_NDIM(op[iop])) &&
+ PyArray_DESCR(op[iop]) == PyArray_DESCR(op[iother]) &&
+ solve_may_have_internal_overlap(op[iop], 1) == 0) {
+
+ continue;
+ }
+
+ /*
+ * Use max work = 1. If the arrays are large, it might
+ * make sense to go further.
+ */
+ may_share_memory = solve_may_share_memory(op[iop],
+ op[iother],
+ 1);
+
+ if (may_share_memory) {
+ op_itflags[iop] |= NPY_OP_ITFLAG_FORCECOPY;
+ break;
+ }
+ }
+ }
+ }
+
for (iop = 0; iop < nop; ++iop) {
/*
* Check whether there are any WRITEMASKED REDUCE operands
@@ -2797,9 +2886,15 @@ npyiter_allocate_arrays(NpyIter *iter,
NBF_STRIDES(bufferdata)[iop] = 0;
}
}
- /* If casting is required and permitted */
- else if ((op_itflags[iop] & NPY_OP_ITFLAG_CAST) &&
- (op_flags[iop] & (NPY_ITER_COPY|NPY_ITER_UPDATEIFCOPY))) {
+ /*
+ * Make a temporary copy if,
+ * 1. If casting is required and permitted, or,
+ * 2. If force-copy is requested
+ */
+ else if (((op_itflags[iop] & NPY_OP_ITFLAG_CAST) &&
+ (op_flags[iop] &
+ (NPY_ITER_COPY|NPY_ITER_UPDATEIFCOPY))) ||
+ (op_itflags[iop] & NPY_OP_ITFLAG_FORCECOPY)) {
PyArrayObject *temp;
int ondim = PyArray_NDIM(op[iop]);
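The overlap check used here (solve_may_share_memory with max work 1) is the same machinery exposed in Python as np.may_share_memory and np.shares_memory; with a small work budget it can report false positives but never false negatives. A rough sketch of that asymmetry:

    import numpy as np

    x = np.zeros((17, 34), dtype=np.int16)
    a, b = x[::2], x[1::2]    # interleaved rows, no element in common

    # Cheap bounds check: the extents overlap, so "maybe" is returned.
    assert np.may_share_memory(a, b)
    # Exact solve: proves there is no actual overlap.
    assert not np.shares_memory(a, b)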
diff --git a/numpy/core/src/multiarray/nditer_impl.h b/numpy/core/src/multiarray/nditer_impl.h
index ae24f46e6..7788d327b 100644
--- a/numpy/core/src/multiarray/nditer_impl.h
+++ b/numpy/core/src/multiarray/nditer_impl.h
@@ -122,6 +122,8 @@
#define NPY_OP_ITFLAG_WRITEMASKED 0x0080
/* The operand's data pointer is pointing into its buffer */
#define NPY_OP_ITFLAG_USINGBUFFER 0x0100
+/* The operand must be copied (with UPDATEIFCOPY if also ITFLAG_WRITE) */
+#define NPY_OP_ITFLAG_FORCECOPY 0x0200
/*
* The data layout of the iterator is fully specified by
diff --git a/numpy/core/src/multiarray/nditer_pywrap.c b/numpy/core/src/multiarray/nditer_pywrap.c
index 3d035390c..5f921fda0 100644
--- a/numpy/core/src/multiarray/nditer_pywrap.c
+++ b/numpy/core/src/multiarray/nditer_pywrap.c
@@ -148,6 +148,11 @@ NpyIter_GlobalFlagsConverter(PyObject *flags_in, npy_uint32 *flags)
flag = NPY_ITER_C_INDEX;
}
break;
+ case 'i':
+ if (strcmp(str, "copy_if_overlap") == 0) {
+ flag = NPY_ITER_COPY_IF_OVERLAP;
+ }
+ break;
case 'n':
if (strcmp(str, "common_dtype") == 0) {
flag = NPY_ITER_COMMON_DTYPE;
@@ -355,6 +360,11 @@ NpyIter_OpFlagsConverter(PyObject *op_flags_in,
break;
}
break;
+ case 'o':
+ if (strcmp(str, "overlap_assume_elementwise") == 0) {
+ flag = NPY_ITER_OVERLAP_ASSUME_ELEMENTWISE;
+ }
+ break;
case 'r':
if (length > 4) switch (str[4]) {
case 'o':
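The string-to-constant mapping added here is what makes the Python spellings of the new flags work; a minimal sketch, matching the first case in test_iter_copy_if_overlap below:

    import numpy as np

    a = np.arange(10)
    # 'copy_if_overlap' maps to NPY_ITER_COPY_IF_OVERLAP; a single
    # read-only operand never needs a copy.
    it = np.nditer([a], ['copy_if_overlap'], [['readonly']])
    assert it.operands[0] is a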
diff --git a/numpy/core/src/private/lowlevel_strided_loops.h b/numpy/core/src/private/lowlevel_strided_loops.h
index 02b8c73c1..f66de14b5 100644
--- a/numpy/core/src/private/lowlevel_strided_loops.h
+++ b/numpy/core/src/private/lowlevel_strided_loops.h
@@ -2,6 +2,7 @@
#define __LOWLEVEL_STRIDED_LOOPS_H
#include "common.h"
#include <npy_config.h>
+#include "mem_overlap.h"
/*
* NOTE: This API should remain private for the time being, to allow
@@ -662,7 +663,24 @@ npy_bswap8_unaligned(char * x)
* Note: Equivalently iterable macro requires one of arr1 or arr2 be
* trivially iterable to be valid.
*/
-#define PyArray_EQUIVALENTLY_ITERABLE(arr1, arr2) ( \
+
+/**
+ * Determine whether two arrays are safe for trivial iteration in cases where
+ * some of the arrays may be modified.
+ *
+ * In-place iteration is safe if one of the following is true:
+ *
+ * - Both arrays are read-only
+ * - The arrays do not have overlapping memory (based on a check that may be too
+ * strict)
+ * - The strides match, and the non-read-only array base addresses are equal or
+ * before the read-only one, ensuring correct data dependency.
+ */
+
+#define PyArray_TRIVIALLY_ITERABLE_OP_NOREAD 0
+#define PyArray_TRIVIALLY_ITERABLE_OP_READ 1
+
+#define PyArray_EQUIVALENTLY_ITERABLE_BASE(arr1, arr2) ( \
PyArray_NDIM(arr1) == PyArray_NDIM(arr2) && \
PyArray_CompareLists(PyArray_DIMS(arr1), \
PyArray_DIMS(arr2), \
@@ -673,6 +691,67 @@ npy_bswap8_unaligned(char * x)
NPY_ARRAY_F_CONTIGUOUS)) \
)
+#define PyArray_TRIVIAL_PAIR_ITERATION_STRIDE(size, arr) ( \
+ size == 1 ? 0 : ((PyArray_NDIM(arr) == 1) ? \
+ PyArray_STRIDE(arr, 0) : \
+ PyArray_ITEMSIZE(arr)))
+
+static NPY_INLINE int
+PyArray_EQUIVALENTLY_ITERABLE_OVERLAP_OK(PyArrayObject *arr1, PyArrayObject *arr2,
+ int arr1_read, int arr2_read)
+{
+ npy_intp size1, size2, stride1, stride2;
+ int arr1_ahead = 0, arr2_ahead = 0;
+
+ if (arr1_read && arr2_read) {
+ return 1;
+ }
+
+ if (solve_may_share_memory(arr1, arr2, 1) == 0) {
+ return 1;
+ }
+
+ /*
+ * Arrays overlapping in memory may be equivalently iterable if input
+ * arrays stride ahead faster than output arrays.
+ */
+
+ size1 = PyArray_SIZE(arr1);
+ size2 = PyArray_SIZE(arr2);
+
+ stride1 = PyArray_TRIVIAL_PAIR_ITERATION_STRIDE(size1, arr1);
+ stride2 = PyArray_TRIVIAL_PAIR_ITERATION_STRIDE(size2, arr2);
+
+ /*
+ * Arrays with zero stride are never "ahead" since the element is reused
+ * (at this point we know the array extents overlap).
+ */
+
+ if (stride1 > 0) {
+ arr1_ahead = (stride1 >= stride2 &&
+ PyArray_BYTES(arr1) >= PyArray_BYTES(arr2));
+ }
+ else if (stride1 < 0) {
+ arr1_ahead = (stride1 <= stride2 &&
+ PyArray_BYTES(arr1) <= PyArray_BYTES(arr2));
+ }
+
+ if (stride2 > 0) {
+ arr2_ahead = (stride2 >= stride1 &&
+ PyArray_BYTES(arr2) >= PyArray_BYTES(arr1));
+ }
+ else if (stride2 < 0) {
+ arr2_ahead = (stride2 <= stride1 &&
+ PyArray_BYTES(arr2) <= PyArray_BYTES(arr1));
+ }
+
+ return (!arr1_read || arr1_ahead) && (!arr2_read || arr2_ahead);
+}
+
+#define PyArray_EQUIVALENTLY_ITERABLE(arr1, arr2, arr1_read, arr2_read) ( \
+ PyArray_EQUIVALENTLY_ITERABLE_BASE(arr1, arr2) && \
+ PyArray_EQUIVALENTLY_ITERABLE_OVERLAP_OK( \
+ arr1, arr2, arr1_read, arr2_read))
#define PyArray_TRIVIALLY_ITERABLE(arr) ( \
PyArray_NDIM(arr) <= 1 || \
PyArray_CHKFLAGS(arr, NPY_ARRAY_C_CONTIGUOUS) || \
@@ -687,15 +766,16 @@ npy_bswap8_unaligned(char * x)
PyArray_ITEMSIZE(arr)));
-#define PyArray_TRIVIALLY_ITERABLE_PAIR(arr1, arr2) (\
+#define PyArray_TRIVIALLY_ITERABLE_PAIR(arr1, arr2, arr1_read, arr2_read) ( \
PyArray_TRIVIALLY_ITERABLE(arr1) && \
(PyArray_NDIM(arr2) == 0 || \
- PyArray_EQUIVALENTLY_ITERABLE(arr1, arr2) || \
+ PyArray_EQUIVALENTLY_ITERABLE_BASE(arr1, arr2) || \
(PyArray_NDIM(arr1) == 0 && \
PyArray_TRIVIALLY_ITERABLE(arr2) \
) \
- ) \
- )
+ ) && \
+ PyArray_EQUIVALENTLY_ITERABLE_OVERLAP_OK(arr1, arr2, arr1_read, arr2_read) \
+ )
#define PyArray_PREPARE_TRIVIAL_PAIR_ITERATION(arr1, arr2, \
count, \
data1, data2, \
@@ -705,33 +785,32 @@ npy_bswap8_unaligned(char * x)
count = ((size1 > size2) || size1 == 0) ? size1 : size2; \
data1 = PyArray_BYTES(arr1); \
data2 = PyArray_BYTES(arr2); \
- stride1 = (size1 == 1 ? 0 : ((PyArray_NDIM(arr1) == 1) ? \
- PyArray_STRIDE(arr1, 0) : \
- PyArray_ITEMSIZE(arr1))); \
- stride2 = (size2 == 1 ? 0 : ((PyArray_NDIM(arr2) == 1) ? \
- PyArray_STRIDE(arr2, 0) : \
- PyArray_ITEMSIZE(arr2))); \
+ stride1 = PyArray_TRIVIAL_PAIR_ITERATION_STRIDE(size1, arr1); \
+ stride2 = PyArray_TRIVIAL_PAIR_ITERATION_STRIDE(size2, arr2); \
}
-#define PyArray_TRIVIALLY_ITERABLE_TRIPLE(arr1, arr2, arr3) (\
+#define PyArray_TRIVIALLY_ITERABLE_TRIPLE(arr1, arr2, arr3, arr1_read, arr2_read, arr3_read) ( \
PyArray_TRIVIALLY_ITERABLE(arr1) && \
((PyArray_NDIM(arr2) == 0 && \
(PyArray_NDIM(arr3) == 0 || \
- PyArray_EQUIVALENTLY_ITERABLE(arr1, arr3) \
+ PyArray_EQUIVALENTLY_ITERABLE_BASE(arr1, arr3) \
) \
) || \
- (PyArray_EQUIVALENTLY_ITERABLE(arr1, arr2) && \
+ (PyArray_EQUIVALENTLY_ITERABLE_BASE(arr1, arr2) && \
(PyArray_NDIM(arr3) == 0 || \
- PyArray_EQUIVALENTLY_ITERABLE(arr1, arr3) \
+ PyArray_EQUIVALENTLY_ITERABLE_BASE(arr1, arr3) \
) \
) || \
(PyArray_NDIM(arr1) == 0 && \
PyArray_TRIVIALLY_ITERABLE(arr2) && \
(PyArray_NDIM(arr3) == 0 || \
- PyArray_EQUIVALENTLY_ITERABLE(arr2, arr3) \
+ PyArray_EQUIVALENTLY_ITERABLE_BASE(arr2, arr3) \
) \
) \
- ) \
+ ) && \
+ PyArray_EQUIVALENTLY_ITERABLE_OVERLAP_OK(arr1, arr2, arr1_read, arr2_read) && \
+ PyArray_EQUIVALENTLY_ITERABLE_OVERLAP_OK(arr1, arr3, arr1_read, arr3_read) && \
+ PyArray_EQUIVALENTLY_ITERABLE_OVERLAP_OK(arr2, arr3, arr2_read, arr3_read) \
)
#define PyArray_PREPARE_TRIVIAL_TRIPLE_ITERATION(arr1, arr2, arr3, \
@@ -746,15 +825,9 @@ npy_bswap8_unaligned(char * x)
data1 = PyArray_BYTES(arr1); \
data2 = PyArray_BYTES(arr2); \
data3 = PyArray_BYTES(arr3); \
- stride1 = (size1 == 1 ? 0 : ((PyArray_NDIM(arr1) == 1) ? \
- PyArray_STRIDE(arr1, 0) : \
- PyArray_ITEMSIZE(arr1))); \
- stride2 = (size2 == 1 ? 0 : ((PyArray_NDIM(arr2) == 1) ? \
- PyArray_STRIDE(arr2, 0) : \
- PyArray_ITEMSIZE(arr2))); \
- stride3 = (size3 == 1 ? 0 : ((PyArray_NDIM(arr3) == 1) ? \
- PyArray_STRIDE(arr3, 0) : \
- PyArray_ITEMSIZE(arr3))); \
+ stride1 = PyArray_TRIVIAL_PAIR_ITERATION_STRIDE(size1, arr1); \
+ stride2 = PyArray_TRIVIAL_PAIR_ITERATION_STRIDE(size2, arr2); \
+ stride3 = PyArray_TRIVIAL_PAIR_ITERATION_STRIDE(size3, arr3); \
}
#endif
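The "ahead" condition above is visible from Python for 1-D views with equal strides: when the input's base address is ahead of the output's, trivial in-place iteration is safe, and either way the result must match writing into a fresh array. A minimal sketch (cf. test_unary_ufunc_1d_manual below):

    import numpy as np

    v = np.arange(1, 2011, dtype=np.int16)
    x, y = v[10:1010], v[:1000]    # x's base address is ahead of y's

    expected = np.invert(x, out=y.copy())   # reference, no aliasing
    got = np.invert(x, out=y)               # overlapping views
    assert (got == expected).all()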
diff --git a/numpy/core/src/umath/reduction.c b/numpy/core/src/umath/reduction.c
index 8079f7e0f..47598bed9 100644
--- a/numpy/core/src/umath/reduction.c
+++ b/numpy/core/src/umath/reduction.c
@@ -83,7 +83,8 @@ allocate_reduce_result(PyArrayObject *arr, npy_bool *axis_flags,
*/
static PyArrayObject *
conform_reduce_result(int ndim, npy_bool *axis_flags,
- PyArrayObject *out, int keepdims, const char *funcname)
+ PyArrayObject *out, int keepdims, const char *funcname,
+ int need_copy)
{
npy_intp strides[NPY_MAXDIMS], shape[NPY_MAXDIMS];
npy_intp *strides_out = PyArray_STRIDES(out);
@@ -151,6 +152,7 @@ conform_reduce_result(int ndim, npy_bool *axis_flags,
/* Allocate the view */
dtype = PyArray_DESCR(out);
Py_INCREF(dtype);
+
ret = (PyArrayObject_fields *)PyArray_NewFromDescr(&PyArray_Type,
dtype,
ndim, shape,
@@ -161,13 +163,41 @@ conform_reduce_result(int ndim, npy_bool *axis_flags,
if (ret == NULL) {
return NULL;
}
+
Py_INCREF(out);
if (PyArray_SetBaseObject((PyArrayObject *)ret, (PyObject *)out) < 0) {
Py_DECREF(ret);
return NULL;
}
- return (PyArrayObject *)ret;
+ if (need_copy) {
+ PyArrayObject *ret_copy;
+
+ ret_copy = (PyArrayObject *)PyArray_NewLikeArray(
+ (PyArrayObject *)ret, NPY_ANYORDER, NULL, 0);
+ if (ret_copy == NULL) {
+ Py_DECREF(ret);
+ return NULL;
+ }
+
+ if (PyArray_CopyInto(ret_copy, (PyArrayObject *)ret) != 0) {
+ Py_DECREF(ret);
+ Py_DECREF(ret_copy);
+ return NULL;
+ }
+
+ Py_INCREF(ret);
+ if (PyArray_SetUpdateIfCopyBase(ret_copy, (PyArrayObject *)ret) < 0) {
+ Py_DECREF(ret);
+ Py_DECREF(ret_copy);
+ return NULL;
+ }
+
+ return ret_copy;
+ }
+ else {
+ return (PyArrayObject *)ret;
+ }
}
/*
@@ -201,11 +231,16 @@ PyArray_CreateReduceResult(PyArrayObject *operand, PyArrayObject *out,
result = allocate_reduce_result(operand, axis_flags, dtype, subok);
}
else {
+ int need_copy = 0;
+
+ if (solve_may_share_memory(operand, out, 1) != 0) {
+ need_copy = 1;
+ }
+
/* Steal the dtype reference */
Py_XDECREF(dtype);
-
result = conform_reduce_result(PyArray_NDIM(operand), axis_flags,
- out, keepdims, funcname);
+ out, keepdims, funcname, need_copy);
}
return result;
@@ -445,6 +480,9 @@ PyUFunc_ReduceWrapper(PyArrayObject *operand, PyArrayObject *out,
/*
* This either conforms 'out' to the ndim of 'operand', or allocates
* a new array appropriate for this reduction.
+ *
+ * A new array with UPDATEIFCOPY is allocated if operand and out have memory
+ * overlap.
*/
Py_INCREF(result_dtype);
result = PyArray_CreateReduceResult(operand, out,
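The UPDATEIFCOPY temporary made here is observable whenever out overlaps the reduction operand: the reduction must produce the same values as writing into a fresh array. A small sketch in the spirit of assert_copy_equivalent from the new tests:

    import numpy as np

    x = np.arange(16, dtype=np.int16).reshape(4, 4)
    out = x[0]                    # out aliases the first row of x

    expected = np.add.reduce(x, axis=0, out=out.copy())
    got = np.add.reduce(x, axis=0, out=out)
    assert (got == expected).all()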
diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c
index 62024d2e3..06ce22464 100644
--- a/numpy/core/src/umath/ufunc_object.c
+++ b/numpy/core/src/umath/ufunc_object.c
@@ -41,6 +41,7 @@
#include "lowlevel_strided_loops.h"
#include "ufunc_type_resolution.h"
#include "reduction.h"
+#include "mem_overlap.h"
#include "ufunc_object.h"
#include "ufunc_override.h"
@@ -1416,7 +1417,8 @@ iterator_loop(PyUFuncObject *ufunc,
/* Set up the flags */
for (i = 0; i < nin; ++i) {
op_flags[i] = NPY_ITER_READONLY |
- NPY_ITER_ALIGNED;
+ NPY_ITER_ALIGNED |
+ NPY_ITER_OVERLAP_ASSUME_ELEMENTWISE;
/*
* If READWRITE flag has been set for this operand,
* then clear default READONLY flag
@@ -1431,7 +1433,8 @@ iterator_loop(PyUFuncObject *ufunc,
NPY_ITER_ALIGNED |
NPY_ITER_ALLOCATE |
NPY_ITER_NO_BROADCAST |
- NPY_ITER_NO_SUBTYPE;
+ NPY_ITER_NO_SUBTYPE |
+ NPY_ITER_OVERLAP_ASSUME_ELEMENTWISE;
}
iter_flags = ufunc->iter_flags |
@@ -1440,7 +1443,22 @@ iterator_loop(PyUFuncObject *ufunc,
NPY_ITER_ZEROSIZE_OK |
NPY_ITER_BUFFERED |
NPY_ITER_GROWINNER |
- NPY_ITER_DELAY_BUFALLOC;
+ NPY_ITER_DELAY_BUFALLOC |
+ NPY_ITER_COPY_IF_OVERLAP;
+
+ /* Call the __array_prepare__ functions for already existing output arrays.
+ * Do this before creating the iterator, as the iterator may UPDATEIFCOPY
+ * some of them.
+ */
+ for (i = 0; i < nout; ++i) {
+ if (op[nin+i] == NULL) {
+ continue;
+ }
+ if (prepare_ufunc_output(ufunc, &op[nin+i],
+ arr_prep[i], arr_prep_args, i) < 0) {
+ return -1;
+ }
+ }
/*
* Allocate the iterator. Because the types of the inputs
@@ -1458,32 +1476,41 @@ iterator_loop(PyUFuncObject *ufunc,
/* Copy any allocated outputs */
op_it = NpyIter_GetOperandArray(iter);
- for (i = nin; i < nop; ++i) {
- if (op[i] == NULL) {
- op[i] = op_it[i];
- Py_INCREF(op[i]);
- }
- }
-
- /* Call the __array_prepare__ functions where necessary */
for (i = 0; i < nout; ++i) {
- if (prepare_ufunc_output(ufunc, &op[nin+i],
- arr_prep[i], arr_prep_args, i) < 0) {
- NpyIter_Deallocate(iter);
- return -1;
+ if (op[nin+i] == NULL) {
+ op[nin+i] = op_it[nin+i];
+ Py_INCREF(op[nin+i]);
+
+ /* Call the __array_prepare__ functions for the new array */
+ if (prepare_ufunc_output(ufunc, &op[nin+i],
+ arr_prep[i], arr_prep_args, i) < 0) {
+ NpyIter_Deallocate(iter);
+ return -1;
+ }
+
+ /*
+ * In case __array_prepare__ returned a different array, put the
+ * results directly there, ignoring the array allocated by the
+ * iterator.
+ *
+ * Here, we assume the user-provided __array_prepare__ behaves
+ * sensibly and doesn't return an array overlapping in memory
+ * with other operands --- the op[nin+i] array passed to it is newly
+ * allocated and doesn't have any overlap.
+ */
+ baseptrs[nin+i] = PyArray_BYTES(op[nin+i]);
+ }
+ else {
+ baseptrs[nin+i] = PyArray_BYTES(op_it[nin+i]);
}
}
/* Only do the loop if the iteration size is non-zero */
if (NpyIter_GetIterSize(iter) != 0) {
-
- /* Reset the iterator with the base pointers from the wrapped outputs */
+ /* Reset the iterator with the base pointers from possible __array_prepare__ */
for (i = 0; i < nin; ++i) {
baseptrs[i] = PyArray_BYTES(op_it[i]);
}
- for (i = nin; i < nop; ++i) {
- baseptrs[i] = PyArray_BYTES(op[i]);
- }
if (NpyIter_ResetBasePointers(iter, baseptrs, NULL) != NPY_SUCCEED) {
NpyIter_Deallocate(iter);
return -1;
@@ -1581,7 +1608,9 @@ execute_legacy_ufunc_loop(PyUFuncObject *ufunc,
}
else if (op[1] != NULL &&
PyArray_NDIM(op[1]) >= PyArray_NDIM(op[0]) &&
- PyArray_TRIVIALLY_ITERABLE_PAIR(op[0], op[1])) {
+ PyArray_TRIVIALLY_ITERABLE_PAIR(op[0], op[1],
+ PyArray_TRIVIALLY_ITERABLE_OP_READ,
+ PyArray_TRIVIALLY_ITERABLE_OP_NOREAD)) {
/* Call the __array_prepare__ if necessary */
if (prepare_ufunc_output(ufunc, &op[1],
@@ -1598,7 +1627,9 @@ execute_legacy_ufunc_loop(PyUFuncObject *ufunc,
else if (nin == 2 && nout == 1) {
if (op[2] == NULL &&
(order == NPY_ANYORDER || order == NPY_KEEPORDER) &&
- PyArray_TRIVIALLY_ITERABLE_PAIR(op[0], op[1])) {
+ PyArray_TRIVIALLY_ITERABLE_PAIR(op[0], op[1],
+ PyArray_TRIVIALLY_ITERABLE_OP_READ,
+ PyArray_TRIVIALLY_ITERABLE_OP_READ)) {
PyArrayObject *tmp;
/*
* Have to choose the input with more dimensions to clone, as
@@ -1637,7 +1668,10 @@ execute_legacy_ufunc_loop(PyUFuncObject *ufunc,
else if (op[2] != NULL &&
PyArray_NDIM(op[2]) >= PyArray_NDIM(op[0]) &&
PyArray_NDIM(op[2]) >= PyArray_NDIM(op[1]) &&
- PyArray_TRIVIALLY_ITERABLE_TRIPLE(op[0], op[1], op[2])) {
+ PyArray_TRIVIALLY_ITERABLE_TRIPLE(op[0], op[1], op[2],
+ PyArray_TRIVIALLY_ITERABLE_OP_READ,
+ PyArray_TRIVIALLY_ITERABLE_OP_READ,
+ PyArray_TRIVIALLY_ITERABLE_OP_NOREAD)) {
/* Call the __array_prepare__ if necessary */
if (prepare_ufunc_output(ufunc, &op[2],
@@ -1701,7 +1735,6 @@ execute_fancy_ufunc_loop(PyUFuncObject *ufunc,
npy_intp *strides;
npy_intp *countptr;
- PyArrayObject **op_it;
npy_uint32 iter_flags;
if (wheremask != NULL) {
@@ -1719,7 +1752,8 @@ execute_fancy_ufunc_loop(PyUFuncObject *ufunc,
for (i = 0; i < nin; ++i) {
op_flags[i] = default_op_in_flags |
NPY_ITER_READONLY |
- NPY_ITER_ALIGNED;
+ NPY_ITER_ALIGNED |
+ NPY_ITER_OVERLAP_ASSUME_ELEMENTWISE;
/*
* If READWRITE flag has been set for this operand,
* then clear default READONLY flag
@@ -1730,12 +1764,19 @@ execute_fancy_ufunc_loop(PyUFuncObject *ufunc,
}
}
for (i = nin; i < nop; ++i) {
+ /*
+ * We don't write to all elements, and the iterator may make
+ * UPDATEIFCOPY temporary copies. The output arrays must be considered
+ * READWRITE by the iterator, so that the elements we don't write to are
+ * copied to the possible temporary array.
+ */
op_flags[i] = default_op_out_flags |
- NPY_ITER_WRITEONLY |
+ NPY_ITER_READWRITE |
NPY_ITER_ALIGNED |
NPY_ITER_ALLOCATE |
NPY_ITER_NO_BROADCAST |
- NPY_ITER_NO_SUBTYPE;
+ NPY_ITER_NO_SUBTYPE |
+ NPY_ITER_OVERLAP_ASSUME_ELEMENTWISE;
}
if (wheremask != NULL) {
op_flags[nop] = NPY_ITER_READONLY | NPY_ITER_ARRAYMASK;
@@ -1748,7 +1789,8 @@ execute_fancy_ufunc_loop(PyUFuncObject *ufunc,
NPY_ITER_REFS_OK |
NPY_ITER_ZEROSIZE_OK |
NPY_ITER_BUFFERED |
- NPY_ITER_GROWINNER;
+ NPY_ITER_GROWINNER |
+ NPY_ITER_COPY_IF_OVERLAP;
/*
* Allocate the iterator. Because the types of the inputs
@@ -1768,22 +1810,37 @@ execute_fancy_ufunc_loop(PyUFuncObject *ufunc,
needs_api = NpyIter_IterationNeedsAPI(iter);
- /* Copy any allocated outputs */
- op_it = NpyIter_GetOperandArray(iter);
+ /* Call the __array_prepare__ functions where necessary */
for (i = nin; i < nop; ++i) {
- if (op[i] == NULL) {
- op[i] = op_it[i];
- Py_INCREF(op[i]);
+ PyArrayObject *op_tmp;
+
+ /* prepare_ufunc_output may decref & replace pointer */
+ op_tmp = op[i];
+ Py_INCREF(op_tmp);
+
+ if (prepare_ufunc_output(ufunc, &op_tmp,
+ arr_prep[i], arr_prep_args, i) < 0) {
+ NpyIter_Deallocate(iter);
+ return -1;
}
- }
- /* Call the __array_prepare__ functions where necessary */
- for (i = 0; i < nout; ++i) {
- if (prepare_ufunc_output(ufunc, &op[nin+i],
- arr_prep[i], arr_prep_args, i) < 0) {
+ /* Validate that the prepare_ufunc_output didn't mess with pointers */
+ if (PyArray_BYTES(op_tmp) != PyArray_BYTES(op[i])) {
+ PyErr_SetString(PyExc_ValueError,
+ "The __array_prepare__ functions modified the data "
+ "pointer addresses in an invalid fashion");
+ Py_DECREF(op_tmp);
NpyIter_Deallocate(iter);
return -1;
}
+
+ /*
+ * Put the updated operand back and undo the DECREF above. If
+ * COPY_IF_OVERLAP made a temporary copy, the output will be copied in
+ * by UPDATEIFCOPY even if op[i] was changed.
+ */
+ op[i] = op_tmp;
+ Py_DECREF(op_tmp);
}
/* Only do the loop if the iteration size is non-zero */
@@ -1794,17 +1851,6 @@ execute_fancy_ufunc_loop(PyUFuncObject *ufunc,
PyArray_Descr **iter_dtypes;
NPY_BEGIN_THREADS_DEF;
- /* Validate that the prepare_ufunc_output didn't mess with pointers */
- for (i = nin; i < nop; ++i) {
- if (PyArray_BYTES(op[i]) != PyArray_BYTES(op_it[i])) {
- PyErr_SetString(PyExc_ValueError,
- "The __array_prepare__ functions modified the data "
- "pointer addresses in an invalid fashion");
- NpyIter_Deallocate(iter);
- return -1;
- }
- }
-
/*
* Get the inner loop, with the possibility of specialization
* based on the fixed strides.
@@ -2265,7 +2311,8 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *ufunc,
for (i = 0; i < nin; ++i) {
op_flags[i] = NPY_ITER_READONLY |
NPY_ITER_COPY |
- NPY_ITER_ALIGNED;
+ NPY_ITER_ALIGNED |
+ NPY_ITER_OVERLAP_ASSUME_ELEMENTWISE;
/*
* If READWRITE flag has been set for this operand,
* then clear default READONLY flag
@@ -2280,14 +2327,16 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *ufunc,
NPY_ITER_UPDATEIFCOPY|
NPY_ITER_ALIGNED|
NPY_ITER_ALLOCATE|
- NPY_ITER_NO_BROADCAST;
+ NPY_ITER_NO_BROADCAST|
+ NPY_ITER_OVERLAP_ASSUME_ELEMENTWISE;
}
iter_flags = ufunc->iter_flags |
NPY_ITER_MULTI_INDEX |
NPY_ITER_REFS_OK |
NPY_ITER_REDUCE_OK |
- NPY_ITER_ZEROSIZE_OK;
+ NPY_ITER_ZEROSIZE_OK |
+ NPY_ITER_COPY_IF_OVERLAP;
/* Create the iterator */
iter = NpyIter_AdvancedNew(nop, op, iter_flags,
@@ -3174,11 +3223,16 @@ PyUFunc_Accumulate(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out,
!PyArray_EquivTypes(op_dtypes[0], PyArray_DESCR(out)))) {
need_outer_iterator = 1;
}
+ /* If input and output overlap in memory, use iterator to figure it out */
+ else if (out != NULL && solve_may_share_memory(out, arr, NPY_MAY_SHARE_BOUNDS) != 0) {
+ need_outer_iterator = 1;
+ }
if (need_outer_iterator) {
int ndim_iter = 0;
npy_uint32 flags = NPY_ITER_ZEROSIZE_OK|
- NPY_ITER_REFS_OK;
+ NPY_ITER_REFS_OK|
+ NPY_ITER_COPY_IF_OVERLAP;
PyArray_Descr **op_dtypes_param = NULL;
/*
@@ -3592,7 +3646,8 @@ PyUFunc_Reduceat(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *ind,
if (need_outer_iterator) {
npy_uint32 flags = NPY_ITER_ZEROSIZE_OK|
NPY_ITER_REFS_OK|
- NPY_ITER_MULTI_INDEX;
+ NPY_ITER_MULTI_INDEX|
+ NPY_ITER_COPY_IF_OVERLAP;
/*
* The way reduceat is set up, we can't do buffering,
@@ -3635,6 +3690,7 @@ PyUFunc_Reduceat(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *ind,
/* In case COPY or UPDATEIFCOPY occurred */
op[0] = NpyIter_GetOperandArray(iter)[0];
op[1] = NpyIter_GetOperandArray(iter)[1];
+ op[2] = NpyIter_GetOperandArray(iter)[2];
if (out == NULL) {
out = op[0];
@@ -5259,11 +5315,6 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args)
op1_array = (PyArrayObject *)op1;
- iter = (PyArrayMapIterObject *)PyArray_MapIterArray(op1_array, idx);
- if (iter == NULL) {
- goto fail;
- }
-
/* Create second operand from number array if needed. */
if (op2 != NULL) {
op2_array = (PyArrayObject *)PyArray_FromAny(op2, NULL,
@@ -5271,7 +5322,17 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args)
if (op2_array == NULL) {
goto fail;
}
+ }
+ /* Create map iterator */
+ iter = (PyArrayMapIterObject *)PyArray_MapIterArrayCopyIfOverlap(
+ op1_array, idx, 1, op2_array);
+ if (iter == NULL) {
+ goto fail;
+ }
+ op1_array = iter->array; /* May be updateifcopied on overlap */
+
+ if (op2 != NULL) {
/*
* May need to swap axes so that second operand is
* iterated over correctly
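With these changes, the ufunc entry points route read/write overlap through the iterator or MapIter. The headline behaviors, sketched (cf. test_binary_ufunc_accumulate_fuzz and test_inplace_op_simple_manual below):

    import numpy as np

    # accumulate with an output that aliases its input
    a = np.arange(16, dtype=np.int16)
    out = a[::-1]
    expected = np.add.accumulate(a, out=out.copy())
    got = np.add.accumulate(a, out=out)
    assert (got == expected).all()

    # in-place binary op with a transposed view (bigger than bufsize)
    x = np.random.rand(200, 200)
    x += x.T
    assert (x == x.T).all()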
diff --git a/numpy/core/tests/test_mem_overlap.py b/numpy/core/tests/test_mem_overlap.py
index acca53856..a95a03fa4 100644
--- a/numpy/core/tests/test_mem_overlap.py
+++ b/numpy/core/tests/test_mem_overlap.py
@@ -4,9 +4,11 @@ import sys
import itertools
import numpy as np
-from numpy.testing import run_module_suite, assert_, assert_raises, assert_equal
+from numpy.testing import (run_module_suite, assert_, assert_raises, assert_equal,
+ assert_array_equal, assert_allclose)
from numpy.core.multiarray_tests import solve_diophantine, internal_overlap
+from numpy.core import umath_tests
from numpy.lib.stride_tricks import as_strided
from numpy.compat import long
@@ -84,7 +86,7 @@ def _check_assignment(srcidx, dstidx):
def test_overlapping_assignments():
- """Test automatically generated assignments which overlap in memory."""
+ # Test automatically generated assignments which overlap in memory.
inds = _indices(ndims)
@@ -107,7 +109,6 @@ def test_diophantine_fuzz():
min_count = 500//(ndim + 1)
- numbers = []
while min(feasible_count, infeasible_count) < min_count:
# Ensure big and small integer problems
A_max = 1 + rng.randint(0, 11, dtype=np.intp)**6
@@ -252,13 +253,12 @@ def test_may_share_memory_manual():
check_may_share_memory_exact(x, x.copy())
-def check_may_share_memory_easy_fuzz(get_max_work, same_steps, min_count):
- # Check that overlap problems with common strides are solved with
- # little work.
- x = np.zeros([17,34,71,97], dtype=np.int16)
-
+def iter_random_view_pairs(x, same_steps=True, equal_size=False):
rng = np.random.RandomState(1234)
+ if equal_size and same_steps:
+ raise ValueError()
+
def random_slice(n, step):
start = rng.randint(0, n+1, dtype=np.intp)
stop = rng.randint(start, n+1, dtype=np.intp)
@@ -267,31 +267,93 @@ def check_may_share_memory_easy_fuzz(get_max_work, same_steps, min_count):
step *= -1
return slice(start, stop, step)
- feasible = 0
- infeasible = 0
+ def random_slice_fixed_size(n, step, size):
+ start = rng.randint(0, n+1 - size*step)
+ stop = start + (size-1)*step + 1
+ if rng.randint(0, 2) == 0:
+ stop, start = start-1, stop-1
+ if stop < 0:
+ stop = None
+ step *= -1
+ return slice(start, stop, step)
- while min(feasible, infeasible) < min_count:
+ # First a few regular views
+ yield x, x
+ for j in range(1, 7, 3):
+ yield x[j:], x[:-j]
+ yield x[...,j:], x[...,:-j]
+
+ # An array with zero stride internal overlap
+ strides = list(x.strides)
+ strides[0] = 0
+ xp = as_strided(x, shape=x.shape, strides=strides)
+ yield x, xp
+ yield xp, xp
+
+ # An array with non-zero stride internal overlap
+ strides = list(x.strides)
+ if strides[0] > 1:
+ strides[0] = 1
+ xp = as_strided(x, shape=x.shape, strides=strides)
+ yield x, xp
+ yield xp, xp
+
+ # Then discontiguous views
+ while True:
steps = tuple(rng.randint(1, 11, dtype=np.intp)
if rng.randint(0, 5, dtype=np.intp) == 0 else 1
for j in range(x.ndim))
- if same_steps:
+ s1 = tuple(random_slice(p, s) for p, s in zip(x.shape, steps))
+
+ t1 = np.arange(x.ndim)
+ rng.shuffle(t1)
+
+ if equal_size:
+ t2 = t1
+ else:
+ t2 = np.arange(x.ndim)
+ rng.shuffle(t2)
+
+ a = x[s1]
+
+ if equal_size:
+ if a.size == 0:
+ continue
+
+ steps2 = tuple(rng.randint(1, max(2, p//(1+pa)))
+ if rng.randint(0, 5) == 0 else 1
+ for p, s, pa in zip(x.shape, s1, a.shape))
+ s2 = tuple(random_slice_fixed_size(p, s, pa)
+ for p, s, pa in zip(x.shape, steps2, a.shape))
+ elif same_steps:
steps2 = steps
else:
steps2 = tuple(rng.randint(1, 11, dtype=np.intp)
if rng.randint(0, 5, dtype=np.intp) == 0 else 1
for j in range(x.ndim))
- t1 = np.arange(x.ndim)
- rng.shuffle(t1)
-
- t2 = np.arange(x.ndim)
- rng.shuffle(t2)
+ if not equal_size:
+ s2 = tuple(random_slice(p, s) for p, s in zip(x.shape, steps2))
- s1 = tuple(random_slice(p, s) for p, s in zip(x.shape, steps))
- s2 = tuple(random_slice(p, s) for p, s in zip(x.shape, steps2))
- a = x[s1].transpose(t1)
+ a = a.transpose(t1)
b = x[s2].transpose(t2)
+ yield a, b
+
+
+def check_may_share_memory_easy_fuzz(get_max_work, same_steps, min_count):
+ # Check that overlap problems with common strides are solved with
+ # little work.
+ x = np.zeros([17,34,71,97], dtype=np.int16)
+
+ feasible = 0
+ infeasible = 0
+
+ pair_iter = iter_random_view_pairs(x, same_steps)
+
+ while min(feasible, infeasible) < min_count:
+ a, b = next(pair_iter)
+
bounds_overlap = np.may_share_memory(a, b)
may_share_answer = np.may_share_memory(a, b)
easy_answer = np.may_share_memory(a, b, max_work=get_max_work(a, b))
@@ -299,11 +361,10 @@ def check_may_share_memory_easy_fuzz(get_max_work, same_steps, min_count):
if easy_answer != exact_answer:
# assert_equal is slow...
- assert_equal(easy_answer, exact_answer, err_msg=repr((s1, s2)))
+ assert_equal(easy_answer, exact_answer)
if may_share_answer != bounds_overlap:
- assert_equal(may_share_answer, bounds_overlap,
- err_msg=repr((s1, s2)))
+ assert_equal(may_share_answer, bounds_overlap)
if bounds_overlap:
if exact_answer:
@@ -524,5 +585,364 @@ def test_non_ndarray_inputs():
assert_(np.may_share_memory(cls(x[1::3]), x[::2]))
+def view_element_first_byte(x):
+ """Construct an array viewing the first byte of each element of `x`"""
+ from numpy.lib.stride_tricks import DummyArray
+ interface = dict(x.__array_interface__)
+ interface['typestr'] = '|b1'
+ interface['descr'] = [('', '|b1')]
+ return np.asarray(DummyArray(interface, x))
+
+
+def assert_copy_equivalent(operation, args, out, **kwargs):
+ """
+ Check that operation(*args, out=out) produces results
+ equivalent to out[...] = operation(*args, out=out.copy())
+ """
+
+ kwargs['out'] = out
+ kwargs2 = dict(kwargs)
+ kwargs2['out'] = out.copy()
+
+ out_orig = out.copy()
+ out[...] = operation(*args, **kwargs2)
+ expected = out.copy()
+ out[...] = out_orig
+
+ got = operation(*args, **kwargs).copy()
+
+ if (got != expected).any():
+ assert_equal(got, expected)
+
+
+class TestUFunc(object):
+ """
+ Test ufunc call memory overlap handling
+ """
+
+ def check_unary_fuzz(self, operation, get_out_axis_size, dtype=np.int16,
+ count=5000):
+ shapes = [7, 13, 8, 21, 29, 32]
+
+ rng = np.random.RandomState(1234)
+
+ for ndim in range(1, 6):
+ x = rng.randint(0, 2**16, size=shapes[:ndim]).astype(dtype)
+
+ it = iter_random_view_pairs(x, same_steps=False, equal_size=True)
+
+ min_count = count // (ndim + 1)**2
+
+ overlapping = 0
+ while overlapping < min_count:
+ a, b = next(it)
+
+ a_orig = a.copy()
+ b_orig = b.copy()
+
+ if get_out_axis_size is None:
+ assert_copy_equivalent(operation, [a], out=b)
+
+ if np.shares_memory(a, b):
+ overlapping += 1
+ else:
+ for axis in itertools.chain(range(ndim), [None]):
+ a[...] = a_orig
+ b[...] = b_orig
+
+ # Determine size for reduction axis (None if scalar)
+ outsize, scalarize = get_out_axis_size(a, b, axis)
+ if outsize == 'skip':
+ continue
+
+ # Slice b to get an output array of the correct size
+ sl = [slice(None)] * ndim
+ if axis is None:
+ if outsize is None:
+ sl = [slice(0, 1)] + [0]*(ndim - 1)
+ else:
+ sl = [slice(0, outsize)] + [0]*(ndim - 1)
+ else:
+ if outsize is None:
+ k = b.shape[axis]//2
+ if ndim == 1:
+ sl[axis] = slice(k, k + 1)
+ else:
+ sl[axis] = k
+ else:
+ assert b.shape[axis] >= outsize
+ sl[axis] = slice(0, outsize)
+ b_out = b[tuple(sl)]
+
+ if scalarize:
+ b_out = b_out.reshape([])
+
+ if np.shares_memory(a, b_out):
+ overlapping += 1
+
+ # Check result
+ assert_copy_equivalent(operation, [a], out=b_out, axis=axis)
+
+ def test_unary_ufunc_call_fuzz(self):
+ self.check_unary_fuzz(np.invert, None, np.int16)
+
+ def test_binary_ufunc_accumulate_fuzz(self):
+ def get_out_axis_size(a, b, axis):
+ if axis is None:
+ if a.ndim == 1:
+ return a.size, False
+ else:
+ return 'skip', False # accumulate doesn't support this
+ else:
+ return a.shape[axis], False
+
+ self.check_unary_fuzz(np.add.accumulate, get_out_axis_size,
+ dtype=np.int16, count=500)
+
+ def test_binary_ufunc_reduce_fuzz(self):
+ def get_out_axis_size(a, b, axis):
+ return None, (axis is None or a.ndim == 1)
+
+ self.check_unary_fuzz(np.add.reduce, get_out_axis_size,
+ dtype=np.int16, count=500)
+
+ def test_binary_ufunc_reduceat_fuzz(self):
+ def get_out_axis_size(a, b, axis):
+ if axis is None:
+ if a.ndim == 1:
+ return a.size, False
+ else:
+ return 'skip', False # reduceat doesn't support this
+ else:
+ return a.shape[axis], False
+
+ def do_reduceat(a, out, axis):
+ if axis is None:
+ size = len(a)
+ step = size//len(out)
+ else:
+ size = a.shape[axis]
+ step = a.shape[axis] // out.shape[axis]
+ idx = np.arange(0, size, step)
+ return np.add.reduceat(a, idx, out=out, axis=axis)
+
+ self.check_unary_fuzz(do_reduceat, get_out_axis_size,
+ dtype=np.int16, count=500)
+
+ def test_binary_ufunc_reduceat_manual(self):
+ def check(ufunc, a, ind, out):
+ c1 = ufunc.reduceat(a.copy(), ind.copy(), out=out.copy())
+ c2 = ufunc.reduceat(a, ind, out=out)
+ assert_array_equal(c1, c2)
+
+ # Exactly same input/output arrays
+ a = np.arange(10000, dtype=np.int16)
+ check(np.add, a, a[::-1].copy(), a)
+
+ # Overlap with index
+ a = np.arange(10000, dtype=np.int16)
+ check(np.add, a, a[::-1], a)
+
+ def test_unary_gufunc_fuzz(self):
+ shapes = [7, 13, 8, 21, 29, 32]
+ gufunc = umath_tests.euclidean_pdist
+
+ rng = np.random.RandomState(1234)
+
+ for ndim in range(2, 6):
+ x = rng.rand(*shapes[:ndim])
+
+ it = iter_random_view_pairs(x, same_steps=False, equal_size=True)
+
+ min_count = 500 // (ndim + 1)**2
+
+ overlapping = 0
+ while overlapping < min_count:
+ a, b = next(it)
+
+ if min(a.shape[-2:]) < 2 or min(b.shape[-2:]) < 2 or a.shape[-1] < 2:
+ continue
+
+ # Ensure the shapes are so that euclidean_pdist is happy
+ if b.shape[-1] > b.shape[-2]:
+ b = b[...,0,:]
+ else:
+ b = b[...,:,0]
+
+ n = a.shape[-2]
+ p = n * (n - 1) // 2
+ if p <= b.shape[-1] and p > 0:
+ b = b[...,:p]
+ else:
+ n = max(2, int(np.sqrt(b.shape[-1]))//2)
+ p = n * (n - 1) // 2
+ a = a[...,:n,:]
+ b = b[...,:p]
+
+ # Call
+ if np.shares_memory(a, b):
+ overlapping += 1
+
+ with np.errstate(over='ignore', invalid='ignore'):
+ assert_copy_equivalent(gufunc, [a], out=b)
+
+ def test_ufunc_at_manual(self):
+ def check(ufunc, a, ind, b=None):
+ a0 = a.copy()
+ if b is None:
+ ufunc.at(a0, ind.copy())
+ c1 = a0.copy()
+ ufunc.at(a, ind)
+ c2 = a.copy()
+ else:
+ ufunc.at(a0, ind.copy(), b.copy())
+ c1 = a0.copy()
+ ufunc.at(a, ind, b)
+ c2 = a.copy()
+ assert_array_equal(c1, c2)
+
+ # Overlap with index
+ a = np.arange(10000, dtype=np.int16)
+ check(np.invert, a[::-1], a)
+
+ # Overlap with second data array
+ a = np.arange(100, dtype=np.int16)
+ ind = np.arange(0, 100, 2, dtype=np.int16)
+ check(np.add, a, ind, a[25:75])
+
+ def test_unary_ufunc_1d_manual(self):
+ # Exercise branches in PyArray_EQUIVALENTLY_ITERABLE
+
+ def check(a, b):
+ a_orig = a.copy()
+ b_orig = b.copy()
+
+ b0 = b.copy()
+ c1 = ufunc(a, out=b0)
+ c2 = ufunc(a, out=b)
+ assert_array_equal(c1, c2)
+
+ # Trigger "fancy ufunc loop" code path
+ mask = view_element_first_byte(b).view(np.bool_)
+
+ a[...] = a_orig
+ b[...] = b_orig
+ c1 = ufunc(a, out=b.copy(), where=mask.copy()).copy()
+
+ a[...] = a_orig
+ b[...] = b_orig
+ c2 = ufunc(a, out=b, where=mask.copy()).copy()
+
+ # Also, mask overlapping with output
+ a[...] = a_orig
+ b[...] = b_orig
+ c3 = ufunc(a, out=b, where=mask).copy()
+
+ assert_array_equal(c1, c2)
+ assert_array_equal(c1, c3)
+
+ dtypes = [np.int8, np.int16, np.int32, np.int64, np.float32,
+ np.float64, np.complex64, np.complex128]
+ dtypes = [np.dtype(x) for x in dtypes]
+
+ for dtype in dtypes:
+ if np.issubdtype(dtype, np.integer):
+ ufunc = np.invert
+ else:
+ ufunc = np.reciprocal
+
+ n = 1000
+ k = 10
+ indices = [
+ np.index_exp[:n],
+ np.index_exp[k:k+n],
+ np.index_exp[n-1::-1],
+ np.index_exp[k+n-1:k-1:-1],
+ np.index_exp[:2*n:2],
+ np.index_exp[k:k+2*n:2],
+ np.index_exp[2*n-1::-2],
+ np.index_exp[k+2*n-1:k-1:-2],
+ ]
+
+ for xi, yi in itertools.product(indices, indices):
+ v = np.arange(1, 1 + n*2 + k, dtype=dtype)
+ x = v[xi]
+ y = v[yi]
+
+ with np.errstate(all='ignore'):
+ check(x, y)
+
+ # Scalar cases
+ check(x[:1], y)
+ check(x[-1:], y)
+ check(x[:1].reshape([]), y)
+ check(x[-1:].reshape([]), y)
+
+ def test_unary_ufunc_where_same(self):
+ # Check behavior at wheremask overlap
+ ufunc = np.invert
+
+ def check(a, out, mask):
+ c1 = ufunc(a, out=out.copy(), where=mask.copy())
+ c2 = ufunc(a, out=out, where=mask)
+ assert_array_equal(c1, c2)
+
+ # Check behavior with same input and output arrays
+ x = np.arange(100).astype(np.bool_)
+ check(x, x, x)
+ check(x, x.copy(), x)
+ check(x, x, x.copy())
+
+ def test_binary_ufunc_1d_manual(self):
+ ufunc = np.add
+
+ def check(a, b, c):
+ c0 = c.copy()
+ c1 = ufunc(a, b, out=c0)
+ c2 = ufunc(a, b, out=c)
+ assert_array_equal(c1, c2)
+
+ for dtype in [np.int8, np.int16, np.int32, np.int64,
+ np.float32, np.float64, np.complex64, np.complex128]:
+ # Check different data dependency orders
+
+ n = 1000
+ k = 10
+
+ indices = []
+ for p in [1, 2]:
+ indices.extend([
+ np.index_exp[:p*n:p],
+ np.index_exp[k:k+p*n:p],
+ np.index_exp[p*n-1::-p],
+ np.index_exp[k+p*n-1:k-1:-p],
+ ])
+
+ for x, y, z in itertools.product(indices, indices, indices):
+ v = np.arange(6*n).astype(dtype)
+ x = v[x]
+ y = v[y]
+ z = v[z]
+
+ check(x, y, z)
+
+ # Scalar cases
+ check(x[:1], y, z)
+ check(x[-1:], y, z)
+ check(x[:1].reshape([]), y, z)
+ check(x[-1:].reshape([]), y, z)
+ check(x, y[:1], z)
+ check(x, y[-1:], z)
+ check(x, y[:1].reshape([]), z)
+ check(x, y[-1:].reshape([]), z)
+
+ def test_inplace_op_simple_manual(self):
+ rng = np.random.RandomState(1234)
+ x = rng.rand(200, 200) # bigger than bufsize
+
+ x += x.T
+ assert_array_equal(x - x.T, 0)
+
+
if __name__ == "__main__":
run_module_suite()
diff --git a/numpy/core/tests/test_nditer.py b/numpy/core/tests/test_nditer.py
index f5096e023..1863f36f7 100644
--- a/numpy/core/tests/test_nditer.py
+++ b/numpy/core/tests/test_nditer.py
@@ -1139,6 +1139,94 @@ def test_iter_common_dtype():
assert_equal(i.dtypes[1], np.dtype('c16'))
assert_equal(i.dtypes[2], np.dtype('c16'))
+def test_iter_copy_if_overlap():
+ # Ensure the iterator makes copies on read/write overlap, if requested
+
+ # Copy not needed, 1 op
+ for flag in ['readonly', 'writeonly', 'readwrite']:
+ a = arange(10)
+ i = nditer([a], ['copy_if_overlap'], [[flag]])
+ assert_(i.operands[0] is a)
+
+ # Copy needed, 2 ops, read-write overlap
+ x = arange(10)
+ a = x[1:]
+ b = x[:-1]
+ i = nditer([a, b], ['copy_if_overlap'], [['readonly'], ['readwrite']])
+ assert_(not np.shares_memory(*i.operands))
+
+ # Copy not needed with elementwise, 2 ops, exactly same arrays
+ x = arange(10)
+ a = x
+ b = x
+ i = nditer([a, b], ['copy_if_overlap'], [['readonly', 'overlap_assume_elementwise'],
+ ['readwrite', 'overlap_assume_elementwise']])
+ assert_(i.operands[0] is a and i.operands[1] is b)
+ i = nditer([a, b], ['copy_if_overlap'], [['readonly'], ['readwrite']])
+ assert_(i.operands[0] is a and not np.shares_memory(i.operands[1], b))
+
+ # Copy not needed, 2 ops, no overlap
+ x = arange(10)
+ a = x[::2]
+ b = x[1::2]
+ i = nditer([a, b], ['copy_if_overlap'], [['readonly'], ['writeonly']])
+ assert_(i.operands[0] is a and i.operands[1] is b)
+
+ # Copy needed, 2 ops, read-write overlap
+ x = arange(4, dtype=np.int8)
+ a = x[3:]
+ b = x.view(np.int32)[:1]
+ i = nditer([a, b], ['copy_if_overlap'], [['readonly'], ['writeonly']])
+ assert_(not np.shares_memory(*i.operands))
+
+ # Copy needed, 3 ops, read-write overlap
+ for flag in ['writeonly', 'readwrite']:
+ x = np.ones([10, 10])
+ a = x
+ b = x.T
+ c = x
+ i = nditer([a, b, c], ['copy_if_overlap'],
+ [['readonly'], ['readonly'], [flag]])
+ a2, b2, c2 = i.operands
+ assert_(not np.shares_memory(a2, c2))
+ assert_(not np.shares_memory(b2, c2))
+
+ # Copy not needed, 3 ops, read-only overlap
+ x = np.ones([10, 10])
+ a = x
+ b = x.T
+ c = x
+ i = nditer([a, b, c], ['copy_if_overlap'],
+ [['readonly'], ['readonly'], ['readonly']])
+ a2, b2, c2 = i.operands
+ assert_(a is a2)
+ assert_(b is b2)
+ assert_(c is c2)
+
+ # Copy not needed, 3 ops, read-only overlap
+ x = np.ones([10, 10])
+ a = x
+ b = np.ones([10, 10])
+ c = x.T
+ i = nditer([a, b, c], ['copy_if_overlap'],
+ [['readonly'], ['writeonly'], ['readonly']])
+ a2, b2, c2 = i.operands
+ assert_(a is a2)
+ assert_(b is b2)
+ assert_(c is c2)
+
+ # Copy not needed, 3 ops, write-only overlap
+ x = np.arange(7)
+ a = x[:3]
+ b = x[3:6]
+ c = x[4:7]
+ i = nditer([a, b, c], ['copy_if_overlap'],
+ [['readonly'], ['writeonly'], ['writeonly']])
+ a2, b2, c2 = i.operands
+ assert_(a is a2)
+ assert_(b is b2)
+ assert_(c is c2)
+
def test_iter_op_axes():
# Check that custom axes work
@@ -2320,7 +2408,11 @@ def test_iter_reduction():
assert_equal(i[1].strides, (0,))
# Do the reduction
for x, y in i:
- y[...] += x
+ # Use a for loop instead of ``y[...] += x``
+ # (equivalent to ``y[...] = y[...].copy() + x``),
+ # because y has the zero strides we rely on for the reduction
+ for j in range(len(y)):
+ y[j] += x[j]
# Since no axes were specified, should have allocated a scalar
assert_equal(i.operands[1].ndim, 0)
assert_equal(i.operands[1], np.sum(a))
@@ -2371,7 +2463,11 @@ def test_iter_buffering_reduction():
assert_equal(i[1].strides, (0,))
# Do the reduction
for x, y in i:
- y[...] += x
+ # Use a for loop instead of ``y[...] += x``
+ # (equivalent to ``y[...] = y[...].copy() + x``),
+ # because y has the zero strides we rely on for the reduction
+ for j in range(len(y)):
+ y[j] += x[j]
assert_equal(b, np.sum(a, axis=1))
# Iterator inner double loop was wrong on this one
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 436cb0355..48d8f9379 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -1452,6 +1452,22 @@ class TestSpecialMethods(TestCase):
assert_equal(x, np.array(2))
assert_equal(type(x), with_prepare)
+ def test_prepare_out(self):
+
+ class with_prepare(np.ndarray):
+ __array_priority__ = 10
+
+ def __array_prepare__(self, arr, context):
+ return np.array(arr).view(type=with_prepare)
+
+ a = np.array([1]).view(type=with_prepare)
+ x = np.add(a, a, a)
+ # Returned array is new, because of the strange
+ # __array_prepare__ above
+ assert_(not np.shares_memory(x, a))
+ assert_equal(x, np.array([2]))
+ assert_equal(type(x), with_prepare)
+
def test_failing_prepare(self):
class A(object):