18 files changed, 766 insertions, 393 deletions
diff --git a/numpy/core/_add_newdocs.py b/numpy/core/_add_newdocs.py
index 11d1810f9..a800143a8 100644
--- a/numpy/core/_add_newdocs.py
+++ b/numpy/core/_add_newdocs.py
@@ -2265,10 +2265,6 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('__array_interface__',
     """Array protocol: Python side."""))
 
 
-add_newdoc('numpy.core.multiarray', 'ndarray', ('__array_finalize__',
-    """None."""))
-
-
 add_newdoc('numpy.core.multiarray', 'ndarray', ('__array_priority__',
     """Array priority."""))
 
@@ -2278,12 +2274,12 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('__array_struct__',
 
 add_newdoc('numpy.core.multiarray', 'ndarray', ('__dlpack__',
     """a.__dlpack__(*, stream=None)
-    
+
     DLPack Protocol: Part of the Array API."""))
 
 add_newdoc('numpy.core.multiarray', 'ndarray', ('__dlpack_device__',
     """a.__dlpack_device__()
-    
+
     DLPack Protocol: Part of the Array API."""))
 
 add_newdoc('numpy.core.multiarray', 'ndarray', ('base',
@@ -2811,6 +2807,14 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('__array__',
     """))
 
 
+add_newdoc('numpy.core.multiarray', 'ndarray', ('__array_finalize__',
+    """a.__array_finalize__(obj, /)
+
+    Present so subclasses can call super. Does nothing.
+
+    """))
+
+
 add_newdoc('numpy.core.multiarray', 'ndarray', ('__array_prepare__',
     """a.__array_prepare__(array[, context], /)
 
diff --git a/numpy/core/_asarray.pyi b/numpy/core/_asarray.pyi
index 0da2de912..51b794130 100644
--- a/numpy/core/_asarray.pyi
+++ b/numpy/core/_asarray.pyi
@@ -2,7 +2,7 @@ from collections.abc import Iterable
 from typing import TypeVar, Union, overload, Literal
 
 from numpy import ndarray
-from numpy.typing import ArrayLike, DTypeLike
+from numpy.typing import DTypeLike, _SupportsArrayFunc
 
 _ArrayType = TypeVar("_ArrayType", bound=ndarray)
 
@@ -22,7 +22,7 @@ def require(
     dtype: None = ...,
     requirements: None | _Requirements | Iterable[_Requirements] = ...,
     *,
-    like: ArrayLike = ...
+    like: _SupportsArrayFunc = ...
 ) -> _ArrayType: ...
 @overload
 def require(
@@ -30,7 +30,7 @@ def require(
     dtype: DTypeLike = ...,
     requirements: _E | Iterable[_RequirementsWithE] = ...,
     *,
-    like: ArrayLike = ...
+    like: _SupportsArrayFunc = ...
 ) -> ndarray: ...
 @overload
 def require(
@@ -38,5 +38,5 @@ def require(
     dtype: DTypeLike = ...,
     requirements: None | _Requirements | Iterable[_Requirements] = ...,
     *,
-    like: ArrayLike = ...
+    like: _SupportsArrayFunc = ...
 ) -> ndarray: ...
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index 1844ab96a..054150b28 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -522,14 +522,14 @@ defdict = {
     Ufunc(2, 1, ReorderableNone,
           docstrings.get('numpy.core.umath.maximum'),
           'PyUFunc_SimpleUniformOperationTypeResolver',
-          TD(noobj, simd=[('avx512f', 'fd')]),
+          TD(noobj, dispatch=[('loops_minmax', ints+'fdg')]),
           TD(O, f='npy_ObjectMax')
           ),
 'minimum':
     Ufunc(2, 1, ReorderableNone,
           docstrings.get('numpy.core.umath.minimum'),
           'PyUFunc_SimpleUniformOperationTypeResolver',
-          TD(noobj, simd=[('avx512f', 'fd')]),
+          TD(noobj, dispatch=[('loops_minmax', ints+'fdg')]),
           TD(O, f='npy_ObjectMin')
           ),
 'clip':
@@ -543,6 +543,7 @@ defdict = {
     Ufunc(2, 1, ReorderableNone,
           docstrings.get('numpy.core.umath.fmax'),
           'PyUFunc_SimpleUniformOperationTypeResolver',
+          TD('fdg', dispatch=[('loops_minmax', 'fdg')]),
           TD(noobj),
           TD(O, f='npy_ObjectMax')
           ),
@@ -550,6 +551,7 @@ defdict = {
     Ufunc(2, 1, ReorderableNone,
           docstrings.get('numpy.core.umath.fmin'),
           'PyUFunc_SimpleUniformOperationTypeResolver',
+          TD('fdg', dispatch=[('loops_minmax', 'fdg')]),
           TD(noobj),
           TD(O, f='npy_ObjectMin')
           ),
diff --git a/numpy/core/multiarray.pyi b/numpy/core/multiarray.pyi
index 423aed85e..f2d3622d2 100644
--- a/numpy/core/multiarray.pyi
+++ b/numpy/core/multiarray.pyi
@@ -61,6 +61,7 @@ from numpy.typing import (
     NDArray,
     ArrayLike,
     _SupportsArray,
+    _SupportsArrayFunc,
     _NestedSequence,
     _FiniteNestedSequence,
     _ArrayLikeBool_co,
@@ -177,7 +178,7 @@ def array(
     order: _OrderKACF = ...,
     subok: L[True],
     ndmin: int = ...,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> _ArrayType: ...
 @overload
 def array(
@@ -188,7 +189,7 @@ def array(
     order: _OrderKACF = ...,
     subok: bool = ...,
     ndmin: int = ...,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[_SCT]: ...
 @overload
 def array(
@@ -199,7 +200,7 @@ def array(
     order: _OrderKACF = ...,
     subok: bool = ...,
     ndmin: int = ...,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[Any]: ...
 @overload
 def array(
@@ -210,7 +211,7 @@ def array(
     order: _OrderKACF = ...,
     subok: bool = ...,
     ndmin: int = ...,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[_SCT]: ...
 @overload
 def array(
@@ -221,7 +222,7 @@ def array(
     order: _OrderKACF = ...,
     subok: bool = ...,
     ndmin: int = ...,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[Any]: ...
 
 @overload
@@ -230,7 +231,7 @@ def zeros(
     dtype: None = ...,
     order: _OrderCF = ...,
     *,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[float64]: ...
 @overload
 def zeros(
@@ -238,7 +239,7 @@ def zeros(
     dtype: _DTypeLike[_SCT],
     order: _OrderCF = ...,
     *,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[_SCT]: ...
 @overload
 def zeros(
@@ -246,7 +247,7 @@ def zeros(
     dtype: DTypeLike,
     order: _OrderCF = ...,
     *,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[Any]: ...
 
 @overload
@@ -255,7 +256,7 @@ def empty(
     dtype: None = ...,
     order: _OrderCF = ...,
     *,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[float64]: ...
 @overload
 def empty(
@@ -263,7 +264,7 @@ def empty(
     dtype: _DTypeLike[_SCT],
     order: _OrderCF = ...,
     *,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[_SCT]: ...
 @overload
 def empty(
@@ -271,7 +272,7 @@ def empty(
     dtype: DTypeLike,
     order: _OrderCF = ...,
     *,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[Any]: ...
 
 @overload
@@ -468,7 +469,7 @@ def asarray(
     dtype: None = ...,
     order: _OrderKACF = ...,
     *,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[_SCT]: ...
 @overload
 def asarray(
@@ -476,7 +477,7 @@ def asarray(
     dtype: None = ...,
     order: _OrderKACF = ...,
     *,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[Any]: ...
 @overload
 def asarray(
@@ -484,7 +485,7 @@ def asarray(
     dtype: _DTypeLike[_SCT],
     order: _OrderKACF = ...,
     *,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[_SCT]: ...
 @overload
 def asarray(
@@ -492,7 +493,7 @@ def asarray(
     dtype: DTypeLike,
     order: _OrderKACF = ...,
     *,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[Any]: ...
 
 @overload
@@ -501,7 +502,7 @@ def asanyarray(
     dtype: None = ...,
     order: _OrderKACF = ...,
     *,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> _ArrayType: ...
 @overload
 def asanyarray(
@@ -509,7 +510,7 @@ def asanyarray(
     dtype: None = ...,
     order: _OrderKACF = ...,
     *,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[_SCT]: ...
 @overload
 def asanyarray(
@@ -517,7 +518,7 @@ def asanyarray(
     dtype: None = ...,
     order: _OrderKACF = ...,
     *,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[Any]: ...
 @overload
 def asanyarray(
@@ -525,7 +526,7 @@ def asanyarray(
     dtype: _DTypeLike[_SCT],
     order: _OrderKACF = ...,
     *,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[_SCT]: ...
 @overload
 def asanyarray(
@@ -533,7 +534,7 @@ def asanyarray(
     dtype: DTypeLike,
     order: _OrderKACF = ...,
     *,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[Any]: ...
 
 @overload
@@ -541,28 +542,28 @@ def ascontiguousarray(
     a: _ArrayLike[_SCT],
     dtype: None = ...,
     *,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[_SCT]: ...
 @overload
 def ascontiguousarray(
     a: object,
     dtype: None = ...,
     *,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[Any]: ...
 @overload
 def ascontiguousarray(
     a: Any,
     dtype: _DTypeLike[_SCT],
     *,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[_SCT]: ...
 @overload
 def ascontiguousarray(
     a: Any,
     dtype: DTypeLike,
     *,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[Any]: ...
 
 @overload
@@ -570,28 +571,28 @@ def asfortranarray(
     a: _ArrayLike[_SCT],
     dtype: None = ...,
     *,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[_SCT]: ...
 @overload
 def asfortranarray(
     a: object,
     dtype: None = ...,
     *,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[Any]: ...
 @overload
 def asfortranarray(
     a: Any,
     dtype: _DTypeLike[_SCT],
     *,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[_SCT]: ...
 @overload
 def asfortranarray(
     a: Any,
     dtype: DTypeLike,
     *,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[Any]: ...
 
 # In practice `list[Any]` is list with an int, int and a valid
@@ -609,7 +610,7 @@ def fromstring(
     count: SupportsIndex = ...,
     *,
     sep: str,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[float64]: ...
 @overload
 def fromstring(
@@ -618,7 +619,7 @@ def fromstring(
     count: SupportsIndex = ...,
     *,
     sep: str,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[_SCT]: ...
 @overload
 def fromstring(
@@ -627,7 +628,7 @@ def fromstring(
     count: SupportsIndex = ...,
     *,
     sep: str,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[Any]: ...
 
 def frompyfunc(
@@ -646,7 +647,7 @@ def fromfile(
     sep: str = ...,
     offset: SupportsIndex = ...,
     *,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[float64]: ...
 @overload
 def fromfile(
@@ -656,7 +657,7 @@ def fromfile(
     sep: str = ...,
     offset: SupportsIndex = ...,
     *,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[_SCT]: ...
 @overload
 def fromfile(
@@ -666,7 +667,7 @@ def fromfile(
     sep: str = ...,
     offset: SupportsIndex = ...,
     *,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[Any]: ...
 
 @overload
@@ -675,7 +676,7 @@ def fromiter(
     dtype: _DTypeLike[_SCT],
     count: SupportsIndex = ...,
     *,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[_SCT]: ...
 @overload
 def fromiter(
@@ -683,7 +684,7 @@ def fromiter(
     dtype: DTypeLike,
     count: SupportsIndex = ...,
     *,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[Any]: ...
 
 @overload
@@ -693,7 +694,7 @@ def frombuffer(
     count: SupportsIndex = ...,
     offset: SupportsIndex = ...,
     *,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[float64]: ...
 @overload
 def frombuffer(
@@ -702,7 +703,7 @@ def frombuffer(
     count: SupportsIndex = ...,
     offset: SupportsIndex = ...,
     *,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[_SCT]: ...
 @overload
 def frombuffer(
@@ -711,7 +712,7 @@ def frombuffer(
     count: SupportsIndex = ...,
     offset: SupportsIndex = ...,
     *,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[Any]: ...
 
 @overload
@@ -719,7 +720,7 @@ def arange(  # type: ignore[misc]
     stop: _IntLike_co,
     /, *,
     dtype: None = ...,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[signedinteger[Any]]: ...
 @overload
 def arange(  # type: ignore[misc]
@@ -728,14 +729,14 @@ def arange(  # type: ignore[misc]
     step: _IntLike_co = ...,
     dtype: None = ...,
     *,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[signedinteger[Any]]: ...
 @overload
 def arange(  # type: ignore[misc]
     stop: _FloatLike_co,
     /, *,
     dtype: None = ...,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[floating[Any]]: ...
 @overload
 def arange(  # type: ignore[misc]
@@ -744,14 +745,14 @@ def arange(  # type: ignore[misc]
     step: _FloatLike_co = ...,
     dtype: None = ...,
     *,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[floating[Any]]: ...
 @overload
 def arange(
     stop: _TD64Like_co,
     /, *,
     dtype: None = ...,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[timedelta64]: ...
 @overload
 def arange(
@@ -760,7 +761,7 @@ def arange(
     step: _TD64Like_co = ...,
     dtype: None = ...,
     *,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[timedelta64]: ...
 @overload
 def arange(  # both start and stop must always be specified for datetime64
@@ -769,14 +770,14 @@ def arange(  # both start and stop must always be specified for datetime64
     step: datetime64 = ...,
     dtype: None = ...,
     *,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[datetime64]: ...
 @overload
 def arange(
     stop: Any,
     /, *,
     dtype: _DTypeLike[_SCT],
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[_SCT]: ...
 @overload
 def arange(
@@ -785,14 +786,14 @@ def arange(
     step: Any = ...,
     dtype: _DTypeLike[_SCT] = ...,
     *,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[_SCT]: ...
 @overload
 def arange(
     stop: Any, /,
     *,
     dtype: DTypeLike,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[Any]: ...
 @overload
 def arange(
@@ -801,7 +802,7 @@ def arange(
     step: Any = ...,
     dtype: DTypeLike = ...,
     *,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[Any]: ...
 
 def datetime_data(
diff --git a/numpy/core/numeric.pyi b/numpy/core/numeric.pyi
index 8b92abab4..d5e28d24c 100644
--- a/numpy/core/numeric.pyi
+++ b/numpy/core/numeric.pyi
@@ -37,6 +37,7 @@ from numpy.typing import (
     _SupportsDType,
     _FiniteNestedSequence,
     _SupportsArray,
+    _SupportsArrayFunc,
     _ScalarLike_co,
     _ArrayLikeBool_co,
     _ArrayLikeUInt_co,
@@ -108,7 +109,7 @@ def ones(
     dtype: None = ...,
     order: _OrderCF = ...,
     *,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[float64]: ...
 @overload
 def ones(
@@ -116,7 +117,7 @@ def ones(
     dtype: _DTypeLike[_SCT],
     order: _OrderCF = ...,
     *,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[_SCT]: ...
 @overload
 def ones(
@@ -124,7 +125,7 @@ def ones(
     dtype: DTypeLike,
     order: _OrderCF = ...,
     *,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[Any]: ...
 
 @overload
@@ -175,7 +176,7 @@ def full(
     dtype: None = ...,
     order: _OrderCF = ...,
     *,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[Any]: ...
 @overload
 def full(
@@ -184,7 +185,7 @@ def full(
     dtype: _DTypeLike[_SCT],
     order: _OrderCF = ...,
     *,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[_SCT]: ...
 @overload
 def full(
@@ -193,7 +194,7 @@ def full(
     dtype: DTypeLike,
     order: _OrderCF = ...,
     *,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[Any]: ...
 
 @overload
@@ -563,7 +564,7 @@ def fromfunction(
     shape: Sequence[int],
     *,
     dtype: DTypeLike = ...,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
     **kwargs: Any,
 ) -> _T: ...
 
@@ -584,21 +585,21 @@ def identity(
     n: int,
     dtype: None = ...,
     *,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[float64]: ...
 @overload
 def identity(
     n: int,
     dtype: _DTypeLike[_SCT],
     *,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[_SCT]: ...
 @overload
 def identity(
     n: int,
     dtype: DTypeLike,
     *,
-    like: ArrayLike = ...,
+    like: _SupportsArrayFunc = ...,
 ) -> NDArray[Any]: ...
 
 def allclose(
diff --git a/numpy/core/numerictypes.py b/numpy/core/numerictypes.py
index 8e5de852b..3d1cb6fd1 100644
--- a/numpy/core/numerictypes.py
+++ b/numpy/core/numerictypes.py
@@ -516,7 +516,7 @@ def _scalar_type_key(typ):
     return (dt.kind.lower(), dt.itemsize)
 
 
-ScalarType = [int, float, complex, int, bool, bytes, str, memoryview]
+ScalarType = [int, float, complex, bool, bytes, str, memoryview]
 ScalarType += sorted(_concrete_types, key=_scalar_type_key)
 ScalarType = tuple(ScalarType)
 
diff --git a/numpy/core/numerictypes.pyi b/numpy/core/numerictypes.pyi
index f8c49b957..09c180a0f 100644
--- a/numpy/core/numerictypes.pyi
+++ b/numpy/core/numerictypes.pyi
@@ -137,7 +137,6 @@ ScalarType: tuple[
     type[int],
     type[float],
     type[complex],
-    type[int],
     type[bool],
     type[bytes],
     type[str],
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index a67a4cab6..22cac1e9a 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -999,6 +999,7 @@ def configuration(parent_package='',top_path=None):
             join('src', 'umath', 'loops_unary_fp.dispatch.c.src'),
             join('src', 'umath', 'loops_arithm_fp.dispatch.c.src'),
             join('src', 'umath', 'loops_arithmetic.dispatch.c.src'),
+            join('src', 'umath', 'loops_minmax.dispatch.c.src'),
             join('src', 'umath', 'loops_trigonometric.dispatch.c.src'),
             join('src', 'umath', 'loops_umath_fp.dispatch.c.src'),
             join('src', 'umath', 'loops_exponent_log.dispatch.c.src'),
diff --git a/numpy/core/src/multiarray/ctors.c b/numpy/core/src/multiarray/ctors.c
index e60cacc18..25eb91977 100644
--- a/numpy/core/src/multiarray/ctors.c
+++ b/numpy/core/src/multiarray/ctors.c
@@ -875,16 +875,39 @@ PyArray_NewFromDescr_int(
     /*
      * call the __array_finalize__ method if a subtype was requested.
      * If obj is NULL use Py_None for the Python callback.
+     * For speed, we skip if __array_finalize__ is inherited from ndarray
+     * (since that function does nothing), or, for backward compatibility,
+     * if it is None.
      */
     if (subtype != &PyArray_Type) {
         PyObject *res, *func;
-
-        func = PyObject_GetAttr((PyObject *)fa, npy_ma_str_array_finalize);
+        static PyObject *ndarray_array_finalize = NULL;
+        /* First time, cache ndarray's __array_finalize__ */
+        if (ndarray_array_finalize == NULL) {
+            ndarray_array_finalize = PyObject_GetAttr(
+                (PyObject *)&PyArray_Type, npy_ma_str_array_finalize);
+        }
+        func = PyObject_GetAttr((PyObject *)subtype, npy_ma_str_array_finalize);
         if (func == NULL) {
             goto fail;
         }
+        else if (func == ndarray_array_finalize) {
+            Py_DECREF(func);
+        }
         else if (func == Py_None) {
             Py_DECREF(func);
+            /*
+             * 2022-01-08, NumPy 1.23; when deprecation period is over, remove this
+             * whole stanza so one gets a "NoneType object is not callable" TypeError.
+             */
+            if (DEPRECATE(
+                    "Setting __array_finalize__ = None to indicate no finalization"
+                    "should be done is deprecated.  Instead, just inherit from "
+                    "ndarray or, if that is not possible, explicitly set to "
+                    "ndarray.__array_function__; this will raise a TypeError "
+                    "in the future. (Deprecated since NumPy 1.23)") < 0) {
+                goto fail;
+            }
         }
         else {
             if (PyCapsule_CheckExact(func)) {
@@ -903,7 +926,7 @@ PyArray_NewFromDescr_int(
                 if (obj == NULL) {
                     obj = Py_None;
                 }
-                res = PyObject_CallFunctionObjArgs(func, obj, NULL);
+                res = PyObject_CallFunctionObjArgs(func, (PyObject *)fa, obj, NULL);
                 Py_DECREF(func);
                 if (res == NULL) {
                     goto fail;
diff --git a/numpy/core/src/multiarray/getset.c b/numpy/core/src/multiarray/getset.c
index ac6465acd..a4f972ba4 100644
--- a/numpy/core/src/multiarray/getset.c
+++ b/numpy/core/src/multiarray/getset.c
@@ -926,15 +926,6 @@ array_transpose_get(PyArrayObject *self, void *NPY_UNUSED(ignored))
     return PyArray_Transpose(self, NULL);
 }
 
-/* If this is None, no function call is made
-   --- default sub-class behavior
-*/
-static PyObject *
-array_finalize_get(PyArrayObject *NPY_UNUSED(self), void *NPY_UNUSED(ignored))
-{
-    Py_RETURN_NONE;
-}
-
 NPY_NO_EXPORT PyGetSetDef array_getsetlist[] = {
     {"ndim",
         (getter)array_ndim_get,
@@ -1008,10 +999,6 @@ NPY_NO_EXPORT PyGetSetDef array_getsetlist[] = {
         (getter)array_priority_get,
         NULL,
         NULL, NULL},
-    {"__array_finalize__",
-        (getter)array_finalize_get,
-        NULL,
-        NULL, NULL},
     {NULL, NULL, NULL, NULL, NULL},  /* Sentinel */
 };
 
diff --git a/numpy/core/src/multiarray/methods.c b/numpy/core/src/multiarray/methods.c
index 8dd7c112f..33f78dff2 100644
--- a/numpy/core/src/multiarray/methods.c
+++ b/numpy/core/src/multiarray/methods.c
@@ -859,7 +859,7 @@ array_astype(PyArrayObject *self,
      * and it's not a subtype if subok is False, then we
      * can skip the copy.
      */
-    if (forcecopy != NPY_COPY_ALWAYS && 
+    if (forcecopy != NPY_COPY_ALWAYS &&
                     (order == NPY_KEEPORDER ||
                     (order == NPY_ANYORDER &&
                         (PyArray_IS_C_CONTIGUOUS(self) ||
@@ -881,7 +881,7 @@ array_astype(PyArrayObject *self,
         Py_DECREF(dtype);
         return NULL;
     }
-    
+
     if (!PyArray_CanCastArrayTo(self, dtype, casting)) {
         PyErr_Clear();
         npy_set_invalid_cast_error(
@@ -926,6 +926,13 @@ array_astype(PyArrayObject *self,
 
 
 static PyObject *
+array_finalizearray(PyArrayObject *self, PyObject *obj)
+{
+    Py_RETURN_NONE;
+}
+
+
+static PyObject *
 array_wraparray(PyArrayObject *self, PyObject *args)
 {
     PyArrayObject *arr;
@@ -2777,6 +2784,9 @@ NPY_NO_EXPORT PyMethodDef array_methods[] = {
     {"__array_prepare__",
         (PyCFunction)array_preparearray,
         METH_VARARGS, NULL},
+    {"__array_finalize__",
+        (PyCFunction)array_finalizearray,
+        METH_O, NULL},
     {"__array_wrap__",
         (PyCFunction)array_wraparray,
         METH_VARARGS, NULL},
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index aaa694f34..5f054d0a9 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -724,32 +724,6 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
 
 /**end repeat1**/
 
-/**begin repeat1
- * #kind = maximum, minimum#
- * #OP =  >, <#
- **/
-
-NPY_NO_EXPORT void
-@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    if (IS_BINARY_REDUCE) {
-        BINARY_REDUCE_LOOP(@type@) {
-            const @type@ in2 = *(@type@ *)ip2;
-            io1 = (io1 @OP@ in2) ? io1 : in2;
-        }
-        *((@type@ *)iop1) = io1;
-    }
-    else {
-        BINARY_LOOP {
-            const @type@ in1 = *(@type@ *)ip1;
-            const @type@ in2 = *(@type@ *)ip2;
-            *((@type@ *)op1) = (in1 @OP@ in2) ? in1 : in2;
-        }
-    }
-}
-
-/**end repeat1**/
-
 NPY_NO_EXPORT void
 @TYPE@_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
@@ -1684,93 +1658,6 @@ NPY_NO_EXPORT void
     }
 }
 
-/**begin repeat1
- * #kind = maximum, minimum#
- * #OP =  >=, <=#
- **/
-NPY_NO_EXPORT void
-@TYPE@_@kind@_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    /*  */
-    if (IS_BINARY_REDUCE) {
-        if (!run_unary_reduce_simd_@kind@_@TYPE@(args, dimensions, steps)) {
-            BINARY_REDUCE_LOOP(@type@) {
-                const @type@ in2 = *(@type@ *)ip2;
-                /* Order of operations important for MSVC 2015 */
-                io1 = (io1 @OP@ in2 || npy_isnan(io1)) ? io1 : in2;
-            }
-            *((@type@ *)iop1) = io1;
-        }
-    }
-    else {
-        if (!run_binary_avx512f_@kind@_@TYPE@(args, dimensions, steps)) {
-            BINARY_LOOP {
-                @type@ in1 = *(@type@ *)ip1;
-                const @type@ in2 = *(@type@ *)ip2;
-                /* Order of operations important for MSVC 2015 */
-                in1 = (in1 @OP@ in2 || npy_isnan(in1)) ? in1 : in2;
-                *((@type@ *)op1) = in1;
-            }
-        }
-    }
-    npy_clear_floatstatus_barrier((char*)dimensions);
-}
-
-NPY_NO_EXPORT void
-@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    /*  */
-    if (IS_BINARY_REDUCE) {
-        if (!run_unary_reduce_simd_@kind@_@TYPE@(args, dimensions, steps)) {
-            BINARY_REDUCE_LOOP(@type@) {
-                const @type@ in2 = *(@type@ *)ip2;
-                /* Order of operations important for MSVC 2015 */
-                io1 = (io1 @OP@ in2 || npy_isnan(io1)) ? io1 : in2;
-            }
-            *((@type@ *)iop1) = io1;
-        }
-    }
-    else {
-        BINARY_LOOP {
-            @type@ in1 = *(@type@ *)ip1;
-            const @type@ in2 = *(@type@ *)ip2;
-            /* Order of operations important for MSVC 2015 */
-            in1 = (in1 @OP@ in2 || npy_isnan(in1)) ? in1 : in2;
-            *((@type@ *)op1) = in1;
-        }
-    }
-    npy_clear_floatstatus_barrier((char*)dimensions);
-}
-/**end repeat1**/
-
-/**begin repeat1
- * #kind = fmax, fmin#
- * #OP =  >=, <=#
- **/
-NPY_NO_EXPORT void
-@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    /*  */
-    if (IS_BINARY_REDUCE) {
-        BINARY_REDUCE_LOOP(@type@) {
-            const @type@ in2 = *(@type@ *)ip2;
-            /* Order of operations important for MSVC 2015 */
-            io1 = (io1 @OP@ in2 || npy_isnan(in2)) ? io1 : in2;
-        }
-        *((@type@ *)iop1) = io1;
-    }
-    else {
-        BINARY_LOOP {
-            const @type@ in1 = *(@type@ *)ip1;
-            const @type@ in2 = *(@type@ *)ip2;
-            /* Order of operations important for MSVC 2015 */
-            *((@type@ *)op1) = (in1 @OP@ in2 || npy_isnan(in2)) ? in1 : in2;
-        }
-    }
-    npy_clear_floatstatus_barrier((char*)dimensions);
-}
-/**end repeat1**/
-
 NPY_NO_EXPORT void
 @TYPE@_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 081ca9957..3eafbdf66 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -22,7 +22,6 @@
 #define BOOL_fmax BOOL_maximum
 #define BOOL_fmin BOOL_minimum
 
-
 /*
  *****************************************************************************
  **                             BOOLEAN LOOPS                               **
@@ -660,6 +659,43 @@ PyUFunc_OOO_O(char **args, npy_intp const *dimensions, npy_intp const *steps, vo
 
 /*
  *****************************************************************************
+ **                            MIN/MAX LOOPS                                **
+ *****************************************************************************
+ */
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_minmax.dispatch.h"
+#endif
+
+//---------- Integers ----------
+
+/**begin repeat
+ * #TYPE = BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ *         LONG, ULONG, LONGLONG, ULONGLONG#
+ */
+/**begin repeat1
+ * #kind = maximum, minimum#
+ */
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+/**end repeat1**/
+/**end repeat**/
+
+//---------- Float ----------
+
+ /**begin repeat
+  * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
+  */
+/**begin repeat1
+ * #kind = maximum, minimum, fmax, fmin#
+ */
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+/**end repeat1**/
+/**end repeat**/
+
+/*
+ *****************************************************************************
  **                              END LOOPS                                  **
  *****************************************************************************
  */
diff --git a/numpy/core/src/umath/loops_minmax.dispatch.c.src b/numpy/core/src/umath/loops_minmax.dispatch.c.src
new file mode 100644
index 000000000..dbd158db9
--- /dev/null
+++ b/numpy/core/src/umath/loops_minmax.dispatch.c.src
@@ -0,0 +1,551 @@
+/*@targets
+ ** $maxopt baseline
+ ** neon asimd
+ ** sse2 avx2 avx512_skx
+ ** vsx2
+ **/
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+#include "lowlevel_strided_loops.h"
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
+
+/*******************************************************************************
+ ** Scalar intrinsics
+ ******************************************************************************/
+// signed/unsigned int
+#define scalar_max_i(A, B) ((A > B) ? A : B)
+#define scalar_min_i(A, B) ((A < B) ? A : B)
+// fp, propagates NaNs
+#define scalar_max_f(A, B) ((A >= B || npy_isnan(A)) ? A : B)
+#define scalar_max_d scalar_max_f
+#define scalar_max_l scalar_max_f
+#define scalar_min_f(A, B) ((A <= B || npy_isnan(A)) ? A : B)
+#define scalar_min_d scalar_min_f
+#define scalar_min_l scalar_min_f
+// fp, ignores NaNs
+#define scalar_maxp_f fmaxf
+#define scalar_maxp_d fmax
+#define scalar_maxp_l fmaxl
+#define scalar_minp_f fminf
+#define scalar_minp_d fmin
+#define scalar_minp_l fminl
+
+// special optimization for fp scalars propagates NaNs
+// since there're no C99 support for it
+#ifndef NPY_DISABLE_OPTIMIZATION
+/**begin repeat
+ * #type = npy_float, npy_double#
+ * #sfx = f32, f64#
+ * #c_sfx = f, d#
+ * #isa_sfx = s, d#
+ * #sse_type = __m128, __m128d#
+ */
+/**begin repeat1
+ * #op = max, min#
+ * #neon_instr = fmax, fmin#
+ */
+#ifdef NPY_HAVE_SSE2
+#undef scalar_@op@_@c_sfx@
+NPY_FINLINE @type@ scalar_@op@_@c_sfx@(@type@ a, @type@ b) {
+    @sse_type@ va = _mm_set_s@isa_sfx@(a);
+    @sse_type@ vb = _mm_set_s@isa_sfx@(b);
+    @sse_type@ rv = _mm_@op@_s@isa_sfx@(va, vb);
+    // X86 handel second operand
+    @sse_type@ nn = _mm_cmpord_s@isa_sfx@(va, va);
+    #ifdef NPY_HAVE_SSE41
+    rv = _mm_blendv_p@isa_sfx@(va, rv, nn);
+    #else
+    rv = _mm_xor_p@isa_sfx@(va, _mm_and_p@isa_sfx@(_mm_xor_p@isa_sfx@(va, rv), nn));
+    #endif
+    return _mm_cvts@isa_sfx@_@sfx@(rv);
+}
+#endif // SSE2
+#ifdef __aarch64__
+#undef scalar_@op@_@c_sfx@
+NPY_FINLINE @type@ scalar_@op@_@c_sfx@(@type@ a, @type@ b) {
+    @type@ result = 0;
+    __asm(
+        "@neon_instr@ %@isa_sfx@[result], %@isa_sfx@[a], %@isa_sfx@[b]"
+        : [result] "=w" (result)
+        : [a] "w" (a), [b] "w" (b)
+    );
+    return result;
+}
+#endif // __aarch64__
+/**end repeat1**/
+/**end repeat**/
+#endif // NPY_DISABLE_OPTIMIZATION
+// mapping to double if its possible
+#if NPY_BITSOF_DOUBLE == NPY_BITSOF_LONGDOUBLE
+/**begin repeat
+ * #op = max, min, maxp, minp#
+ */
+    #undef scalar_@op@_l
+    #define scalar_@op@_l scalar_@op@_d
+/**end repeat**/
+#endif
+
+/*******************************************************************************
+ ** extra SIMD intrinsics
+ ******************************************************************************/
+
+#if NPY_SIMD
+/**begin repeat
+ * #sfx = s8, u8, s16, u16, s32, u32, s64, u64#
+ * #is_64 = 0*6, 1*2#
+ */
+#if defined(NPY_HAVE_ASIMD) && defined(__aarch64__)
+    #if !@is_64@
+        #define npyv_reduce_min_@sfx@ vminvq_@sfx@
+        #define npyv_reduce_max_@sfx@ vmaxvq_@sfx@
+    #else
+        NPY_FINLINE npyv_lanetype_@sfx@ npyv_reduce_min_@sfx@(npyv_@sfx@ v)
+        {
+            npyv_lanetype_@sfx@ a = vgetq_lane_@sfx@(v, 0);
+            npyv_lanetype_@sfx@ b = vgetq_lane_@sfx@(v, 1);
+            npyv_lanetype_@sfx@ result = (a < b) ? a : b;
+            return result;
+        }
+        NPY_FINLINE npyv_lanetype_@sfx@ npyv_reduce_max_@sfx@(npyv_@sfx@ v)
+        {
+            npyv_lanetype_@sfx@ a = vgetq_lane_@sfx@(v, 0);
+            npyv_lanetype_@sfx@ b = vgetq_lane_@sfx@(v, 1);
+            npyv_lanetype_@sfx@ result = (a > b) ? a : b;
+            return result;
+        }
+    #endif // !@is_64@
+#else
+    /**begin repeat1
+     * #intrin = min, max#
+     */
+    NPY_FINLINE npyv_lanetype_@sfx@ npyv_reduce_@intrin@_@sfx@(npyv_@sfx@ v)
+    {
+        npyv_lanetype_@sfx@ NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) s[npyv_nlanes_@sfx@];
+        npyv_storea_@sfx@(s, v);
+        npyv_lanetype_@sfx@ result = s[0];
+        for(int i=1; i<npyv_nlanes_@sfx@; ++i){
+            result = scalar_@intrin@_i(result, s[i]);
+        }
+        return result;
+    }
+    /**end repeat1**/
+#endif
+/**end repeat**/
+#endif // NPY_SIMD
+
+/**begin repeat
+ * #sfx = f32, f64#
+ * #bsfx = b32, b64#
+ * #simd_chk = NPY_SIMD, NPY_SIMD_F64#
+ * #scalar_sfx = f, d#
+ */
+#if @simd_chk@
+#if defined(NPY_HAVE_ASIMD) && defined(__aarch64__)
+    #define npyv_minn_@sfx@ vminq_@sfx@
+    #define npyv_maxn_@sfx@ vmaxq_@sfx@
+    #define npyv_reduce_minn_@sfx@ vminvq_@sfx@
+    #define npyv_reduce_maxn_@sfx@ vmaxvq_@sfx@
+    #define npyv_reduce_minp_@sfx@ vminnmvq_@sfx@
+    #define npyv_reduce_maxp_@sfx@ vmaxnmvq_@sfx@
+#else
+    /**begin repeat1
+     * #intrin = min, max#
+     */
+    // propagates NaNs
+    NPY_FINLINE npyv_@sfx@ npyv_@intrin@n_@sfx@(npyv_@sfx@ a, npyv_@sfx@ b)
+    {
+        npyv_@sfx@ result = npyv_@intrin@_@sfx@(a, b);
+        // result = npyv_select_@sfx@(npyv_notnan_@sfx@(b), result, b);
+     // X86 handle second operand
+    #ifndef NPY_HAVE_SSE2
+        result = npyv_select_@sfx@(npyv_notnan_@sfx@(b), result, b);
+    #endif
+        result = npyv_select_@sfx@(npyv_notnan_@sfx@(a), result, a);
+        return result;
+    }
+    /**end repeat1**/
+    /**begin repeat1
+     * #intrin = minn, maxn, minp, maxp#
+     * #scalar_intrin = min, max, minp, maxp#
+     */
+    NPY_FINLINE npyv_lanetype_@sfx@ npyv_reduce_@intrin@_@sfx@(npyv_@sfx@ v)
+    {
+        npyv_lanetype_@sfx@ NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) s[npyv_nlanes_@sfx@];
+        npyv_storea_@sfx@(s, v);
+        npyv_lanetype_@sfx@ result = s[0];
+        for(int i=1; i<npyv_nlanes_@sfx@; ++i){
+            result = scalar_@scalar_intrin@_@scalar_sfx@(result, s[i]);
+        }
+        return result;
+    }
+    /**end repeat1**/
+#endif
+#endif // simd_chk
+/**end repeat**/
+
+/*******************************************************************************
+ ** Defining the SIMD kernels
+ ******************************************************************************/
+/**begin repeat
+ * #sfx = s8, u8, s16, u16, s32, u32, s64, u64, f32, f64#
+ * #simd_chk = NPY_SIMD*9, NPY_SIMD_F64#
+ * #is_fp = 0*8, 1, 1#
+ * #scalar_sfx = i*8, f, d#
+ */
+/**begin repeat1
+ * # intrin = max, min, maxp, minp#
+ * # fp_only = 0, 0, 1, 1#
+ */
+#define SCALAR_OP scalar_@intrin@_@scalar_sfx@
+#if @simd_chk@ && (!@fp_only@ || (@is_fp@ && @fp_only@))
+
+#if @is_fp@ && !@fp_only@
+    #define V_INTRIN npyv_@intrin@n_@sfx@ // propagates NaNs
+    #define V_REDUCE_INTRIN npyv_reduce_@intrin@n_@sfx@
+#else
+    #define V_INTRIN npyv_@intrin@_@sfx@
+    #define V_REDUCE_INTRIN npyv_reduce_@intrin@_@sfx@
+#endif
+
+// contiguous input.
+static inline void
+simd_reduce_c_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip, npyv_lanetype_@sfx@ *op1, npy_intp len)
+{
+    if (len < 1) {
+        return;
+    }
+    const int vstep = npyv_nlanes_@sfx@;
+    const int wstep = vstep*8;
+    npyv_@sfx@ acc = npyv_setall_@sfx@(op1[0]);
+    for (; len >= wstep; len -= wstep, ip += wstep) {
+    #ifdef NPY_HAVE_SSE2
+        NPY_PREFETCH(ip + wstep, 0, 3);
+    #endif
+        npyv_@sfx@ v0 = npyv_load_@sfx@(ip + vstep * 0);
+        npyv_@sfx@ v1 = npyv_load_@sfx@(ip + vstep * 1);
+        npyv_@sfx@ v2 = npyv_load_@sfx@(ip + vstep * 2);
+        npyv_@sfx@ v3 = npyv_load_@sfx@(ip + vstep * 3);
+
+        npyv_@sfx@ v4 = npyv_load_@sfx@(ip + vstep * 4);
+        npyv_@sfx@ v5 = npyv_load_@sfx@(ip + vstep * 5);
+        npyv_@sfx@ v6 = npyv_load_@sfx@(ip + vstep * 6);
+        npyv_@sfx@ v7 = npyv_load_@sfx@(ip + vstep * 7);
+
+        npyv_@sfx@ r01 = V_INTRIN(v0, v1);
+        npyv_@sfx@ r23 = V_INTRIN(v2, v3);
+        npyv_@sfx@ r45 = V_INTRIN(v4, v5);
+        npyv_@sfx@ r67 = V_INTRIN(v6, v7);
+        acc = V_INTRIN(acc, V_INTRIN(V_INTRIN(r01, r23), V_INTRIN(r45, r67)));
+    }
+    for (; len >= vstep; len -= vstep, ip += vstep) {
+        acc = V_INTRIN(acc, npyv_load_@sfx@(ip));
+    }
+    npyv_lanetype_@sfx@ r = V_REDUCE_INTRIN(acc);
+    // Scalar - finish up any remaining iterations
+    for (; len > 0; --len, ++ip) {
+        const npyv_lanetype_@sfx@ in2 = *ip;
+        r = SCALAR_OP(r, in2);
+    }
+    op1[0] = r;
+}
+
+// contiguous inputs and output.
+static inline void
+simd_binary_ccc_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip1, const npyv_lanetype_@sfx@ *ip2,
+                                     npyv_lanetype_@sfx@ *op1, npy_intp len)
+{
+#if NPY_SIMD_WIDTH == 128
+    // Note, 6x unroll was chosen for best results on Apple M1
+    const int vectorsPerLoop = 6;
+#else
+    // To avoid memory bandwidth bottleneck
+    const int vectorsPerLoop = 2;
+#endif
+    const int elemPerVector = npyv_nlanes_@sfx@;
+    int elemPerLoop = vectorsPerLoop * elemPerVector;
+
+    npy_intp i = 0;
+
+    for (; (i+elemPerLoop) <= len; i += elemPerLoop) {
+        npyv_@sfx@ v0 = npyv_load_@sfx@(&ip1[i + 0 * elemPerVector]);
+        npyv_@sfx@ v1 = npyv_load_@sfx@(&ip1[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_@sfx@ v2 = npyv_load_@sfx@(&ip1[i + 2 * elemPerVector]);
+        npyv_@sfx@ v3 = npyv_load_@sfx@(&ip1[i + 3 * elemPerVector]);
+        npyv_@sfx@ v4 = npyv_load_@sfx@(&ip1[i + 4 * elemPerVector]);
+        npyv_@sfx@ v5 = npyv_load_@sfx@(&ip1[i + 5 * elemPerVector]);
+    #endif
+        npyv_@sfx@ u0 = npyv_load_@sfx@(&ip2[i + 0 * elemPerVector]);
+        npyv_@sfx@ u1 = npyv_load_@sfx@(&ip2[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_@sfx@ u2 = npyv_load_@sfx@(&ip2[i + 2 * elemPerVector]);
+        npyv_@sfx@ u3 = npyv_load_@sfx@(&ip2[i + 3 * elemPerVector]);
+        npyv_@sfx@ u4 = npyv_load_@sfx@(&ip2[i + 4 * elemPerVector]);
+        npyv_@sfx@ u5 = npyv_load_@sfx@(&ip2[i + 5 * elemPerVector]);
+    #endif
+        npyv_@sfx@ m0 = V_INTRIN(v0, u0);
+        npyv_@sfx@ m1 = V_INTRIN(v1, u1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_@sfx@ m2 = V_INTRIN(v2, u2);
+        npyv_@sfx@ m3 = V_INTRIN(v3, u3);
+        npyv_@sfx@ m4 = V_INTRIN(v4, u4);
+        npyv_@sfx@ m5 = V_INTRIN(v5, u5);
+    #endif
+        npyv_store_@sfx@(&op1[i + 0 * elemPerVector], m0);
+        npyv_store_@sfx@(&op1[i + 1 * elemPerVector], m1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_store_@sfx@(&op1[i + 2 * elemPerVector], m2);
+        npyv_store_@sfx@(&op1[i + 3 * elemPerVector], m3);
+        npyv_store_@sfx@(&op1[i + 4 * elemPerVector], m4);
+        npyv_store_@sfx@(&op1[i + 5 * elemPerVector], m5);
+    #endif
+    }
+    for (; (i+elemPerVector) <= len; i += elemPerVector) {
+        npyv_@sfx@ v0 = npyv_load_@sfx@(ip1 + i);
+        npyv_@sfx@ u0 = npyv_load_@sfx@(ip2 + i);
+        npyv_@sfx@ m0 = V_INTRIN(v0, u0);
+        npyv_store_@sfx@(op1 + i, m0);
+    }
+    // Scalar - finish up any remaining iterations
+    for (; i < len; ++i) {
+        const npyv_lanetype_@sfx@ in1 = ip1[i];
+        const npyv_lanetype_@sfx@ in2 = ip2[i];
+        op1[i] = SCALAR_OP(in1, in2);
+    }
+}
+// non-contiguous for float 32/64-bit memory access
+#if @is_fp@
+static inline void
+simd_binary_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip1, npy_intp sip1,
+                           const npyv_lanetype_@sfx@ *ip2, npy_intp sip2,
+                                 npyv_lanetype_@sfx@ *op1, npy_intp sop1,
+                                 npy_intp len)
+{
+    const int vstep = npyv_nlanes_@sfx@;
+    for (; len >= vstep; len -= vstep, ip1 += sip1*vstep,
+                         ip2 += sip2*vstep, op1 += sop1*vstep
+    ) {
+        npyv_@sfx@ a, b;
+        if (sip1 == 1) {
+            a = npyv_load_@sfx@(ip1);
+        } else {
+            a = npyv_loadn_@sfx@(ip1, sip1);
+        }
+        if (sip2 == 1) {
+            b = npyv_load_@sfx@(ip2);
+        } else {
+            b = npyv_loadn_@sfx@(ip2, sip2);
+        }
+        npyv_@sfx@ r = V_INTRIN(a, b);
+        if (sop1 == 1) {
+            npyv_store_@sfx@(op1, r);
+        } else {
+            npyv_storen_@sfx@(op1, sop1, r);
+        }
+    }
+    for (; len > 0; --len, ip1 += sip1, ip2 += sip2, op1 += sop1) {
+        const npyv_lanetype_@sfx@ a = *ip1;
+        const npyv_lanetype_@sfx@ b = *ip2;
+        *op1 = SCALAR_OP(a, b);
+    }
+}
+#endif
+
+#undef V_INTRIN
+#undef V_REDUCE_INTRIN
+
+#endif // simd_chk && (!fp_only || (is_fp && fp_only))
+
+#undef SCALAR_OP
+/**end repeat1**/
+/**end repeat**/
+
+/*******************************************************************************
+ ** Defining ufunc inner functions
+ ******************************************************************************/
+/**begin repeat
+ * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+ *         BYTE, SHORT, INT, LONG, LONGLONG,
+ *         FLOAT, DOUBLE, LONGDOUBLE#
+ *
+ * #BTYPE = BYTE, SHORT, INT,  LONG, LONGLONG,
+ *          BYTE, SHORT, INT, LONG, LONGLONG,
+ *          FLOAT, DOUBLE, LONGDOUBLE#
+ * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
+ *         npy_byte, npy_short, npy_int, npy_long, npy_longlong,
+ *         npy_float, npy_double, npy_longdouble#
+ *
+ * #is_fp = 0*10, 1*3#
+ * #is_unsigned = 1*5, 0*5, 0*3#
+ * #scalar_sfx = i*10, f, d, l#
+ */
+#undef TO_SIMD_SFX
+#if 0
+/**begin repeat1
+ * #len = 8, 16, 32, 64#
+ */
+#elif NPY_SIMD && NPY_BITSOF_@BTYPE@ == @len@
+    #if @is_fp@
+        #define TO_SIMD_SFX(X) X##_f@len@
+        #if NPY_BITSOF_@BTYPE@ == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif @is_unsigned@
+        #define TO_SIMD_SFX(X) X##_u@len@
+    #else
+        #define TO_SIMD_SFX(X) X##_s@len@
+    #endif
+/**end repeat1**/
+#endif
+
+/**begin repeat1
+ * # kind = maximum, minimum, fmax, fmin#
+ * # intrin = max, min, maxp, minp#
+ * # fp_only = 0, 0, 1, 1#
+ */
+#if !@fp_only@ || (@is_fp@ && @fp_only@)
+#define SCALAR_OP scalar_@intrin@_@scalar_sfx@
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
+             len = dimensions[0];
+    npy_intp i = 0;
+#ifdef TO_SIMD_SFX
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (IS_BINARY_REDUCE) {
+        // reduce and contiguous
+        if (is2 == sizeof(@type@)) {
+            TO_SIMD_SFX(simd_reduce_c_@intrin@)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // no overlap and operands are binary contiguous
+        if (IS_BINARY_CONT(@type@, @type@)) {
+            TO_SIMD_SFX(simd_binary_ccc_@intrin@)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    #if !defined(NPY_HAVE_NEON) && @is_fp@
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_@intrin@)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolls
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){
+            @type@ m0 = *((@type@ *)(ip2 + (i + 0) * is2));
+            @type@ m1 = *((@type@ *)(ip2 + (i + 1) * is2));
+            @type@ m2 = *((@type@ *)(ip2 + (i + 2) * is2));
+            @type@ m3 = *((@type@ *)(ip2 + (i + 3) * is2));
+            @type@ m4 = *((@type@ *)(ip2 + (i + 4) * is2));
+            @type@ m5 = *((@type@ *)(ip2 + (i + 5) * is2));
+            @type@ m6 = *((@type@ *)(ip2 + (i + 6) * is2));
+            @type@ m7 = *((@type@ *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop;
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+                @type@ v0 = *((@type@ *)(ip2 + (i + 0) * is2));
+                @type@ v1 = *((@type@ *)(ip2 + (i + 1) * is2));
+                @type@ v2 = *((@type@ *)(ip2 + (i + 2) * is2));
+                @type@ v3 = *((@type@ *)(ip2 + (i + 3) * is2));
+                @type@ v4 = *((@type@ *)(ip2 + (i + 4) * is2));
+                @type@ v5 = *((@type@ *)(ip2 + (i + 5) * is2));
+                @type@ v6 = *((@type@ *)(ip2 + (i + 6) * is2));
+                @type@ v7 = *((@type@ *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+
+            m0 = SCALAR_OP(m0, m1);
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);
+
+             *((@type@ *)op1) = SCALAR_OP(*((@type@ *)op1), m0);
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            /**begin repeat2
+             * #unroll = 0, 1, 2, 3#
+             */
+            @type@ v@unroll@ = *((@type@ *)(ip1 + (i + @unroll@) * is1));
+            @type@ u@unroll@ = *((@type@ *)(ip2 + (i + @unroll@) * is2));
+            *((@type@ *)(op1 + (i + @unroll@) * os1)) = SCALAR_OP(v@unroll@, u@unroll@);
+            /**end repeat2**/
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    ip1 += is1 * i;
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
+        const @type@ in1 = *(@type@ *)ip1;
+        const @type@ in2 = *(@type@ *)ip2;
+        *((@type@ *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+clear_fp:
+    npyv_cleanup();
+#endif
+#if @is_fp@
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+/**end repeat1**/
+/**end repeat**/
+
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 0e2c1ab8b..8b833ee56 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -95,38 +95,6 @@ run_unary_avx512f_@func@_@TYPE@(char **args, const npy_intp *dimensions, const n
  */
 
 /**begin repeat1
- *  #func = maximum, minimum#
- */
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
-static NPY_INLINE NPY_GCC_TARGET_AVX512F void
-AVX512F_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps);
-#endif
-
-static NPY_INLINE int
-run_binary_avx512f_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
-    if (IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP) {
-        AVX512F_@func@_@TYPE@(args, dimensions, steps);
-        return 1;
-    }
-    else
-        return 0;
-#endif
-    return 0;
-}
-/**end repeat1**/
-
-/**end repeat**/
-
-/**begin repeat
- * #type = npy_float, npy_double, npy_longdouble#
- * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
- * #EXISTS = 1, 1, 0#
- */
-
-/**begin repeat1
  * #func = isnan, isfinite, isinf, signbit#
  */
 
@@ -204,9 +172,9 @@ run_unary_@isa@_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp
  */
 
 /**begin repeat1
- * #func = negative, minimum, maximum#
- * #check = IS_BLOCKABLE_UNARY, IS_BLOCKABLE_REDUCE*2 #
- * #name = unary, unary_reduce*2#
+ * #func = negative#
+ * #check = IS_BLOCKABLE_UNARY#
+ * #name = unary#
  */
 
 #if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
@@ -678,55 +646,6 @@ sse2_negative_@TYPE@(@type@ * op, @type@ * ip, const npy_intp n)
 }
 /**end repeat1**/
 
-
-/**begin repeat1
- * #kind = maximum, minimum#
- * #VOP = max, min#
- * #OP = >=, <=#
- **/
-/* arguments swapped as unary reduce has the swapped compared to unary */
-static void
-sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n)
-{
-    const npy_intp stride = VECTOR_SIZE_BYTES / (npy_intp)sizeof(@type@);
-    LOOP_BLOCK_ALIGN_VAR(ip, @type@, VECTOR_SIZE_BYTES) {
-        /* Order of operations important for MSVC 2015 */
-        *op = (*op @OP@ ip[i] || npy_isnan(*op)) ? *op : ip[i];
-    }
-    assert(n < stride || npy_is_aligned(&ip[i], VECTOR_SIZE_BYTES));
-    if (i + 3 * stride <= n) {
-        /* load the first elements */
-        @vtype@ c1 = @vpre@_load_@vsuf@((@type@*)&ip[i]);
-        @vtype@ c2 = @vpre@_load_@vsuf@((@type@*)&ip[i + stride]);
-        i += 2 * stride;
-
-        /* minps/minpd will set invalid flag if nan is encountered */
-        npy_clear_floatstatus_barrier((char*)&c1);
-        LOOP_BLOCKED(@type@, 2 * VECTOR_SIZE_BYTES) {
-            @vtype@ v1 = @vpre@_load_@vsuf@((@type@*)&ip[i]);
-            @vtype@ v2 = @vpre@_load_@vsuf@((@type@*)&ip[i + stride]);
-            c1 = @vpre@_@VOP@_@vsuf@(c1, v1);
-            c2 = @vpre@_@VOP@_@vsuf@(c2, v2);
-        }
-        c1 = @vpre@_@VOP@_@vsuf@(c1, c2);
-
-        if (npy_get_floatstatus_barrier((char*)&c1) & NPY_FPE_INVALID) {
-            *op = @nan@;
-        }
-        else {
-            @type@ tmp = sse2_horizontal_@VOP@_@vtype@(c1);
-            /* Order of operations important for MSVC 2015 */
-            *op  = (*op @OP@ tmp || npy_isnan(*op)) ? *op : tmp;
-        }
-    }
-    LOOP_BLOCKED_END {
-        /* Order of operations important for MSVC 2015 */
-        *op  = (*op @OP@ ip[i] || npy_isnan(*op)) ? *op : ip[i];
-    }
-    npy_clear_floatstatus_barrier((char*)op);
-}
-/**end repeat1**/
-
 /**end repeat**/
 
 /* bunch of helper functions used in ISA_exp/log_FLOAT*/
@@ -1200,107 +1119,6 @@ AVX512_SKX_@func@_@TYPE@(npy_bool* op, @type@* ip, const npy_intp array_size, co
 /**end repeat**/
 
 /**begin repeat
- * #type = npy_float, npy_double#
- * #TYPE = FLOAT, DOUBLE#
- * #num_lanes = 16, 8#
- * #vsuffix = ps, pd#
- * #mask = __mmask16, __mmask8#
- * #vtype1 = __m512, __m512d#
- * #vtype2 = __m512i, __m256i#
- * #scale = 4, 8#
- * #vindextype = __m512i, __m256i#
- * #vindexsize = 512, 256#
- * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256#
- * #vtype2_load = _mm512_maskz_loadu_epi32, _mm256_maskz_loadu_epi32#
- * #vtype2_gather = _mm512_mask_i32gather_epi32, _mm256_mmask_i32gather_epi32#
- * #vtype2_store = _mm512_mask_storeu_epi32, _mm256_mask_storeu_epi32#
- * #vtype2_scatter = _mm512_mask_i32scatter_epi32, _mm256_mask_i32scatter_epi32#
- * #setzero = _mm512_setzero_epi32, _mm256_setzero_si256#
- */
-/**begin repeat1
- *  #func = maximum, minimum#
- *  #vectorf = max, min#
- */
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
-static NPY_INLINE NPY_GCC_TARGET_AVX512F void
-AVX512F_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-    const npy_intp stride_ip1 = steps[0]/(npy_intp)sizeof(@type@);
-    const npy_intp stride_ip2 = steps[1]/(npy_intp)sizeof(@type@);
-    const npy_intp stride_op = steps[2]/(npy_intp)sizeof(@type@);
-    const npy_intp array_size = dimensions[0];
-    npy_intp num_remaining_elements = array_size;
-    @type@* ip1 = (@type@*) args[0];
-    @type@* ip2 = (@type@*) args[1];
-    @type@* op  = (@type@*) args[2];
-
-    @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
-
-    /*
-     * Note: while generally indices are npy_intp, we ensure that our maximum index
-     * will fit in an int32 as a precondition for this function via
-     * IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP
-     */
-
-    npy_int32 index_ip1[@num_lanes@], index_ip2[@num_lanes@], index_op[@num_lanes@];
-    for (npy_int32 ii = 0; ii < @num_lanes@; ii++) {
-        index_ip1[ii] = ii*stride_ip1;
-        index_ip2[ii] = ii*stride_ip2;
-        index_op[ii] = ii*stride_op;
-    }
-    @vindextype@ vindex_ip1 = @vindexload@((@vindextype@*)&index_ip1[0]);
-    @vindextype@ vindex_ip2 = @vindexload@((@vindextype@*)&index_ip2[0]);
-    @vindextype@ vindex_op  = @vindexload@((@vindextype@*)&index_op[0]);
-    @vtype1@ zeros_f = _mm512_setzero_@vsuffix@();
-
-    while (num_remaining_elements > 0) {
-        if (num_remaining_elements < @num_lanes@) {
-            load_mask = avx512_get_partial_load_mask_@vsuffix@(
-                                    num_remaining_elements, @num_lanes@);
-        }
-        @vtype1@ x1, x2;
-        if (stride_ip1 == 1) {
-            x1 = avx512_masked_load_@vsuffix@(load_mask, ip1);
-        }
-        else {
-            x1 = avx512_masked_gather_@vsuffix@(zeros_f, ip1, vindex_ip1, load_mask);
-        }
-        if (stride_ip2 == 1) {
-            x2 = avx512_masked_load_@vsuffix@(load_mask, ip2);
-        }
-        else {
-            x2 = avx512_masked_gather_@vsuffix@(zeros_f, ip2, vindex_ip2, load_mask);
-        }
-
-        /*
-         * when only one of the argument is a nan, the maxps/maxpd instruction
-         * returns the second argument. The additional blend instruction fixes
-         * this issue to conform with NumPy behaviour.
-         */
-        @mask@ nan_mask = _mm512_cmp_@vsuffix@_mask(x1, x1, _CMP_NEQ_UQ);
-        @vtype1@ out = _mm512_@vectorf@_@vsuffix@(x1, x2);
-        out = _mm512_mask_blend_@vsuffix@(nan_mask, out, x1);
-
-        if (stride_op == 1) {
-            _mm512_mask_storeu_@vsuffix@(op, load_mask, out);
-        }
-        else {
-            /* scatter! */
-            _mm512_mask_i32scatter_@vsuffix@(op, load_mask, vindex_op, out, @scale@);
-        }
-
-        ip1 += @num_lanes@*stride_ip1;
-        ip2 += @num_lanes@*stride_ip2;
-        op += @num_lanes@*stride_op;
-        num_remaining_elements -= @num_lanes@;
-    }
-}
-#endif
-/**end repeat1**/
-/**end repeat**/
-
-/**begin repeat
  * #ISA = FMA, AVX512F#
  * #isa = fma, avx512#
  * #vsize = 256, 512#
diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c
index 27a2cfcd5..7b0db9e04 100644
--- a/numpy/core/src/umath/ufunc_object.c
+++ b/numpy/core/src/umath/ufunc_object.c
@@ -2720,6 +2720,21 @@ reducelike_promote_and_resolve(PyUFuncObject *ufunc,
      * is NULL) so we pass `arr` instead in that case.
      */
     PyArrayObject *ops[3] = {out ? out : arr, arr, out};
+
+    /*
+     * TODO: This is a dangerous hack, that works by relying on the GIL, it is
+     *       terrible, terrifying, and trusts that nobody does crazy stuff
+     *       in their type-resolvers.
+     *       By mutating the `out` dimension, we ensure that reduce-likes
+     *       live in a future without value-based promotion even when legacy
+     *       promotion has to be used.
+     */
+    npy_bool evil_ndim_mutating_hack = NPY_FALSE;
+    if (out != NULL && PyArray_NDIM(out) == 0 && PyArray_NDIM(arr) != 0) {
+        evil_ndim_mutating_hack = NPY_TRUE;
+        ((PyArrayObject_fields *)out)->nd = 1;
+    }
+
     /*
      * TODO: If `out` is not provided, arguably `initial` could define
      *       the first DType (and maybe also the out one), that way
@@ -2740,6 +2755,9 @@ reducelike_promote_and_resolve(PyUFuncObject *ufunc,
 
     PyArrayMethodObject *ufuncimpl = promote_and_get_ufuncimpl(ufunc,
             ops, signature, operation_DTypes, NPY_FALSE, NPY_TRUE, NPY_TRUE);
+    if (evil_ndim_mutating_hack) {
+        ((PyArrayObject_fields *)out)->nd = 0;
+    }
     /* DTypes may currently get filled in fallbacks and XDECREF for error: */
     Py_XDECREF(operation_DTypes[0]);
     Py_XDECREF(operation_DTypes[1]);
diff --git a/numpy/core/tests/test_deprecations.py b/numpy/core/tests/test_deprecations.py
index 76486f755..d2a69d4cf 100644
--- a/numpy/core/tests/test_deprecations.py
+++ b/numpy/core/tests/test_deprecations.py
@@ -1239,3 +1239,14 @@ class TestMemEventHook(_DeprecationTestCase):
         with pytest.warns(DeprecationWarning,
                           match='PyDataMem_SetEventHook is deprecated'):
             ma_tests.test_pydatamem_seteventhook_end()
+
+
+class TestArrayFinalizeNone(_DeprecationTestCase):
+    message = "Setting __array_finalize__ = None"
+
+    def test_use_none_is_deprecated(self):
+        # Deprecated way that ndarray itself showed nothing needs finalizing.
+        class NoFinalize(np.ndarray):
+            __array_finalize__ = None
+
+        self.assert_deprecated(lambda: np.array(1).view(NoFinalize))
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index 2529705d5..708e82910 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -8954,12 +8954,28 @@ class TestArrayFinalize:
         a = np.array(1).view(SavesBase)
         assert_(a.saved_base is a.base)
 
-    def test_bad_finalize(self):
+    def test_bad_finalize1(self):
         class BadAttributeArray(np.ndarray):
             @property
             def __array_finalize__(self):
                 raise RuntimeError("boohoo!")
 
+        with pytest.raises(TypeError, match="not callable"):
+            np.arange(10).view(BadAttributeArray)
+
+    def test_bad_finalize2(self):
+        class BadAttributeArray(np.ndarray):
+            def __array_finalize__(self):
+                raise RuntimeError("boohoo!")
+
+        with pytest.raises(TypeError, match="takes 1 positional"):
+            np.arange(10).view(BadAttributeArray)
+
+    def test_bad_finalize3(self):
+        class BadAttributeArray(np.ndarray):
+            def __array_finalize__(self, obj):
+                raise RuntimeError("boohoo!")
+
         with pytest.raises(RuntimeError, match="boohoo!"):
             np.arange(10).view(BadAttributeArray)
 
@@ -8997,6 +9013,14 @@ class TestArrayFinalize:
         break_cycles()
         assert_(obj_ref() is None, "no references should remain")
 
+    def test_can_use_super(self):
+        class SuperFinalize(np.ndarray):
+            def __array_finalize__(self, obj):
+                self.saved_result = super().__array_finalize__(obj)
+
+        a = np.array(1).view(SuperFinalize)
+        assert_(a.saved_result is None)
+
 
 def test_orderconverter_with_nonASCII_unicode_ordering():
     # gh-7475
@@ -9201,7 +9225,7 @@ class TestViewDtype:
         # x is non-contiguous
         x = np.arange(10, dtype='<i4')[::2]
         with pytest.raises(ValueError,
-                           match='the last axis must be contiguous'): 
+                           match='the last axis must be contiguous'):
             x.view('<i2')
         expected = [[0, 0], [2, 0], [4, 0], [6, 0], [8, 0]]
         assert_array_equal(x[:, np.newaxis].view('<i2'), expected)