129 files changed, 3270 insertions, 1738 deletions
diff --git a/numpy/__init__.py b/numpy/__init__.py
index aae5c95ac..83487dc97 100644
--- a/numpy/__init__.py
+++ b/numpy/__init__.py
@@ -274,6 +274,7 @@ else:
     def __getattr__(attr):
         # Warn for expired attributes, and return a dummy function
         # that always raises an exception.
+        import warnings
         try:
             msg = __expired_functions__[attr]
         except KeyError:
@@ -312,7 +313,11 @@ else:
                              "{!r}".format(__name__, attr))
 
     def __dir__():
-        return list(globals().keys() | {'Tester', 'testing'})
+        public_symbols = globals().keys() | {'Tester', 'testing'}
+        public_symbols -= {
+            "core", "matrixlib",
+        }
+        return list(public_symbols)
 
     # Pytest testing
     from numpy._pytesttester import PytestTester
@@ -358,7 +363,6 @@ else:
         except ValueError:
             pass
 
-    import sys
     if sys.platform == "darwin":
         with warnings.catch_warnings(record=True) as w:
             _mac_os_check()
@@ -414,6 +418,12 @@ else:
         from pathlib import Path
         return [str(Path(__file__).with_name("_pyinstaller").resolve())]
 
+    # Remove symbols imported for internal use
+    del os
+
 
 # get the version using versioneer
 from .version import __version__, git_revision as __git_version__
+
+# Remove symbols imported for internal use
+del sys, warnings
diff --git a/numpy/__init__.pyi b/numpy/__init__.pyi
index 2eb4a0634..d6faa9ca3 100644
--- a/numpy/__init__.pyi
+++ b/numpy/__init__.pyi
@@ -203,7 +203,6 @@ from numpy import (
     lib as lib,
     linalg as linalg,
     ma as ma,
-    matrixlib as matrixlib,
     polynomial as polynomial,
     random as random,
     testing as testing,
diff --git a/numpy/_pytesttester.py b/numpy/_pytesttester.py
index 8decb9dd7..01ddaaf98 100644
--- a/numpy/_pytesttester.py
+++ b/numpy/_pytesttester.py
@@ -33,7 +33,6 @@ import os
 __all__ = ['PytestTester']
 
 
-
 def _show_numpy_info():
     import numpy as np
 
@@ -44,7 +43,6 @@ def _show_numpy_info():
     print("NumPy CPU features: ", (info if info else 'nothing enabled'))
 
 
-
 class PytestTester:
     """
     Pytest test runner.
@@ -167,7 +165,7 @@ class PytestTester:
             ]
 
         if doctests:
-            raise ValueError("Doctests not supported")
+            pytest_args += ["--doctest-modules"]
 
         if extra_argv:
             pytest_args += list(extra_argv)
diff --git a/numpy/array_api/linalg.py b/numpy/array_api/linalg.py
index f422e1c27..a4a2f23e4 100644
--- a/numpy/array_api/linalg.py
+++ b/numpy/array_api/linalg.py
@@ -1,8 +1,11 @@
 from __future__ import annotations
 
 from ._dtypes import _floating_dtypes, _numeric_dtypes
+from ._manipulation_functions import reshape
 from ._array_object import Array
 
+from ..core.numeric import normalize_axis_tuple
+
 from typing import TYPE_CHECKING
 if TYPE_CHECKING:
     from ._typing import Literal, Optional, Sequence, Tuple, Union
@@ -395,18 +398,38 @@ def vector_norm(x: Array, /, *, axis: Optional[Union[int, Tuple[int, ...]]] = No
     if x.dtype not in _floating_dtypes:
         raise TypeError('Only floating-point dtypes are allowed in norm')
 
+    # np.linalg.norm tries to do a matrix norm whenever axis is a 2-tuple or
+    # when axis=None and the input is 2-D, so to force a vector norm, we make
+    # it so the input is 1-D (for axis=None), or reshape so that norm is done
+    # on a single dimension.
     a = x._array
     if axis is None:
-        a = a.flatten()
-        axis = 0
+        # Note: np.linalg.norm() doesn't handle 0-D arrays
+        a = a.ravel()
+        _axis = 0
     elif isinstance(axis, tuple):
-        # Note: The axis argument supports any number of axes, whereas norm()
-        # only supports a single axis for vector norm.
-        rest = tuple(i for i in range(a.ndim) if i not in axis)
+        # Note: The axis argument supports any number of axes, whereas
+        # np.linalg.norm() only supports a single axis for vector norm.
+        normalized_axis = normalize_axis_tuple(axis, x.ndim)
+        rest = tuple(i for i in range(a.ndim) if i not in normalized_axis)
         newshape = axis + rest
-        a = np.transpose(a, newshape).reshape((np.prod([a.shape[i] for i in axis]), *[a.shape[i] for i in rest]))
-        axis = 0
-    return Array._new(np.linalg.norm(a, axis=axis, keepdims=keepdims, ord=ord))
+        a = np.transpose(a, newshape).reshape(
+            (np.prod([a.shape[i] for i in axis], dtype=int), *[a.shape[i] for i in rest]))
+        _axis = 0
+    else:
+        _axis = axis
+
+    res = Array._new(np.linalg.norm(a, axis=_axis, ord=ord))
+
+    if keepdims:
+        # We can't reuse np.linalg.norm(keepdims) because of the reshape hacks
+        # above to avoid matrix norm logic.
+        shape = list(x.shape)
+        _axis = normalize_axis_tuple(range(x.ndim) if axis is None else axis, x.ndim)
+        for i in _axis:
+            shape[i] = 1
+        res = reshape(res, tuple(shape))
 
+    return res
 
 __all__ = ['cholesky', 'cross', 'det', 'diagonal', 'eigh', 'eigvalsh', 'inv', 'matmul', 'matrix_norm', 'matrix_power', 'matrix_rank', 'matrix_transpose', 'outer', 'pinv', 'qr', 'slogdet', 'solve', 'svd', 'svdvals', 'tensordot', 'trace', 'vecdot', 'vector_norm']
diff --git a/numpy/core/_add_newdocs.py b/numpy/core/_add_newdocs.py
index fb9c30d93..3e8df6d46 100644
--- a/numpy/core/_add_newdocs.py
+++ b/numpy/core/_add_newdocs.py
@@ -3437,6 +3437,24 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('fill',
     >>> a
     array([1.,  1.])
 
+    Fill expects a scalar value and always behaves the same as assigning
+    to a single array element.  The following is a rare example where this
+    distinction is important:
+
+    >>> a = np.array([None, None], dtype=object)
+    >>> a[0] = np.array(3)
+    >>> a
+    array([array(3), None], dtype=object)
+    >>> a.fill(np.array(3))
+    >>> a
+    array([array(3), array(3)], dtype=object)
+
+    In contrast, other forms of assignment unpack the array being assigned:
+
+    >>> a[...] = np.array(3)
+    >>> a
+    array([3, 3], dtype=object)
+
     """))
 
 
diff --git a/numpy/core/_asarray.py b/numpy/core/_asarray.py
index 89d422e99..cbaab8c3f 100644
--- a/numpy/core/_asarray.py
+++ b/numpy/core/_asarray.py
@@ -14,6 +14,15 @@ from .multiarray import array, asanyarray
 __all__ = ["require"]
 
 
+POSSIBLE_FLAGS = {
+    'C': 'C', 'C_CONTIGUOUS': 'C', 'CONTIGUOUS': 'C',
+    'F': 'F', 'F_CONTIGUOUS': 'F', 'FORTRAN': 'F',
+    'A': 'A', 'ALIGNED': 'A',
+    'W': 'W', 'WRITEABLE': 'W',
+    'O': 'O', 'OWNDATA': 'O',
+    'E': 'E', 'ENSUREARRAY': 'E'
+}
+
 
 def _require_dispatcher(a, dtype=None, requirements=None, *, like=None):
     return (like,)
@@ -36,7 +45,7 @@ def require(a, dtype=None, requirements=None, *, like=None):
        The required data-type. If None preserve the current dtype. If your
        application requires the data to be in native byteorder, include
        a byteorder specification as a part of the dtype specification.
-    requirements : str or list of str
+    requirements : str or sequence of str
        The requirements list can be any of the following
 
        * 'F_CONTIGUOUS' ('F') - ensure a Fortran-contiguous array
@@ -97,16 +106,10 @@ def require(a, dtype=None, requirements=None, *, like=None):
             like=like,
         )
 
-    possible_flags = {'C': 'C', 'C_CONTIGUOUS': 'C', 'CONTIGUOUS': 'C',
-                      'F': 'F', 'F_CONTIGUOUS': 'F', 'FORTRAN': 'F',
-                      'A': 'A', 'ALIGNED': 'A',
-                      'W': 'W', 'WRITEABLE': 'W',
-                      'O': 'O', 'OWNDATA': 'O',
-                      'E': 'E', 'ENSUREARRAY': 'E'}
     if not requirements:
         return asanyarray(a, dtype=dtype)
-    else:
-        requirements = {possible_flags[x.upper()] for x in requirements}
+
+    requirements = {POSSIBLE_FLAGS[x.upper()] for x in requirements}
 
     if 'E' in requirements:
         requirements.remove('E')
@@ -128,8 +131,7 @@ def require(a, dtype=None, requirements=None, *, like=None):
 
     for prop in requirements:
         if not arr.flags[prop]:
-            arr = arr.copy(order)
-            break
+            return arr.copy(order)
     return arr
 
 
diff --git a/numpy/core/code_generators/generate_numpy_api.py b/numpy/core/code_generators/generate_numpy_api.py
index 37975966f..a966be57d 100644
--- a/numpy/core/code_generators/generate_numpy_api.py
+++ b/numpy/core/code_generators/generate_numpy_api.py
@@ -89,19 +89,22 @@ _import_array(void)
    */
   st = PyArray_GetEndianness();
   if (st == NPY_CPU_UNKNOWN_ENDIAN) {
-      PyErr_Format(PyExc_RuntimeError, "FATAL: module compiled as unknown endian");
+      PyErr_SetString(PyExc_RuntimeError,
+                      "FATAL: module compiled as unknown endian");
       return -1;
   }
 #if NPY_BYTE_ORDER == NPY_BIG_ENDIAN
   if (st != NPY_CPU_BIG) {
-      PyErr_Format(PyExc_RuntimeError, "FATAL: module compiled as "\
-             "big endian, but detected different endianness at runtime");
+      PyErr_SetString(PyExc_RuntimeError,
+                      "FATAL: module compiled as big endian, but "
+                      "detected different endianness at runtime");
       return -1;
   }
 #elif NPY_BYTE_ORDER == NPY_LITTLE_ENDIAN
   if (st != NPY_CPU_LITTLE) {
-      PyErr_Format(PyExc_RuntimeError, "FATAL: module compiled as "\
-             "little endian, but detected different endianness at runtime");
+      PyErr_SetString(PyExc_RuntimeError,
+                      "FATAL: module compiled as little endian, but "
+                      "detected different endianness at runtime");
       return -1;
   }
 #endif
diff --git a/numpy/core/code_generators/ufunc_docstrings.py b/numpy/core/code_generators/ufunc_docstrings.py
index 24b707a12..7c020fa2e 100644
--- a/numpy/core/code_generators/ufunc_docstrings.py
+++ b/numpy/core/code_generators/ufunc_docstrings.py
@@ -2011,7 +2011,7 @@ add_newdoc('numpy.core.umath', 'log',
     -----
     Logarithm is a multivalued function: for each `x` there is an infinite
     number of `z` such that `exp(z) = x`. The convention is to return the
-    `z` whose imaginary part lies in `[-pi, pi]`.
+    `z` whose imaginary part lies in `(-pi, pi]`.
 
     For real-valued input data types, `log` always returns real output. For
     each value that cannot be expressed as a real number or infinity, it
@@ -2021,6 +2021,10 @@ add_newdoc('numpy.core.umath', 'log',
     has a branch cut `[-inf, 0]` and is continuous from above on it. `log`
     handles the floating-point negative zero as an infinitesimal negative
     number, conforming to the C99 standard.
+
+    In the cases where the input has a negative real part and a very small
+    negative imaginary part (approaching 0), the imaginary part of the result
+    is so close to `-pi` that it evaluates to exactly `-pi`.
 
     References
     ----------
@@ -2061,7 +2065,7 @@ add_newdoc('numpy.core.umath', 'log10',
     -----
     Logarithm is a multivalued function: for each `x` there is an infinite
     number of `z` such that `10**z = x`. The convention is to return the
-    `z` whose imaginary part lies in `[-pi, pi]`.
+    `z` whose imaginary part lies in `(-pi, pi]`.
 
     For real-valued input data types, `log10` always returns real output.
     For each value that cannot be expressed as a real number or infinity,
@@ -2072,6 +2076,10 @@ add_newdoc('numpy.core.umath', 'log10',
     `log10` handles the floating-point negative zero as an infinitesimal
     negative number, conforming to the C99 standard.
 
+    In the cases where the input has a negative real part and a very small
+    negative imaginary part (approaching 0), the result is so close to `-pi`
+    that it evaluates to exactly `-pi`.
+
     References
     ----------
     .. [1] M. Abramowitz and I.A. Stegun, "Handbook of Mathematical Functions",
@@ -2112,7 +2120,7 @@ add_newdoc('numpy.core.umath', 'log2',
 
     Logarithm is a multivalued function: for each `x` there is an infinite
     number of `z` such that `2**z = x`. The convention is to return the `z`
-    whose imaginary part lies in `[-pi, pi]`.
+    whose imaginary part lies in `(-pi, pi]`.
 
     For real-valued input data types, `log2` always returns real output.
     For each value that cannot be expressed as a real number or infinity,
@@ -2123,6 +2131,10 @@ add_newdoc('numpy.core.umath', 'log2',
     handles the floating-point negative zero as an infinitesimal negative
     number, conforming to the C99 standard.
 
+    In the cases where the input has a negative real part and a very small
+    negative imaginary part (approaching 0), the result is so close to `-pi`
+    that it evaluates to exactly `-pi`.
+
     Examples
     --------
     >>> x = np.array([0, 1, 2, 2**4])
diff --git a/numpy/core/include/numpy/experimental_dtype_api.h b/numpy/core/include/numpy/experimental_dtype_api.h
index 1dd6215e6..23e9a8d21 100644
--- a/numpy/core/include/numpy/experimental_dtype_api.h
+++ b/numpy/core/include/numpy/experimental_dtype_api.h
@@ -214,7 +214,7 @@ typedef struct {
 } PyArrayMethod_Spec;
 
 
-typedef PyObject *_ufunc_addloop_fromspec_func(
+typedef int _ufunc_addloop_fromspec_func(
         PyObject *ufunc, PyArrayMethod_Spec *spec);
 /*
  * The main ufunc registration function.  This adds a new implementation/loop
diff --git a/numpy/core/include/numpy/ndarraytypes.h b/numpy/core/include/numpy/ndarraytypes.h
index c295f34bb..97e0f4e2a 100644
--- a/numpy/core/include/numpy/ndarraytypes.h
+++ b/numpy/core/include/numpy/ndarraytypes.h
@@ -1380,7 +1380,10 @@ typedef struct {
         int                   nd_fancy;
         npy_intp              fancy_dims[NPY_MAXDIMS];
 
-        /* Whether the iterator (any of the iterators) requires API */
+        /*
+         * Whether the iterator (any of the iterators) requires API.  This is
+         * unused by NumPy itself; ArrayMethod flags are more precise.
+         */
         int                   needs_api;
 
         /*
diff --git a/numpy/core/overrides.py b/numpy/core/overrides.py
index cb550152e..663436a4c 100644
--- a/numpy/core/overrides.py
+++ b/numpy/core/overrides.py
@@ -2,6 +2,7 @@
 import collections
 import functools
 import os
+import sys
 
 from numpy.core._multiarray_umath import (
     add_docstring, implement_array_function, _get_implementing_args)
@@ -176,7 +177,27 @@ def array_function_dispatch(dispatcher, module=None, verify=True,
 
         @functools.wraps(implementation)
         def public_api(*args, **kwargs):
-            relevant_args = dispatcher(*args, **kwargs)
+            try:
+                relevant_args = dispatcher(*args, **kwargs)
+            except TypeError as exc:
+                # Try to clean up a signature related TypeError.  Such an
+                # error will be something like:
+                #     dispatcher.__name__() got an unexpected keyword argument
+                #
+                # So replace the dispatcher name in this case.  In principle
+                # TypeErrors may be raised from _within_ the dispatcher, so
+                # we check that the error message is a string that starts
+                # with the name.  (In principle we could also check the
+                # traceback length, as it would be deeper.)
+                msg = exc.args[0]
+                disp_name = dispatcher.__name__
+                if not isinstance(msg, str) or not msg.startswith(disp_name):
+                    raise
+
+                # Replace with the correct name and re-raise:
+                new_msg = msg.replace(disp_name, public_api.__name__)
+                raise TypeError(new_msg) from None
+
             return implement_array_function(
                 implementation, public_api, relevant_args, args, kwargs)
 
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 7d072c15c..543b6ae39 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -1,9 +1,9 @@
 import os
 import sys
+import sysconfig
 import pickle
 import copy
 import warnings
-import platform
 import textwrap
 import glob
 from os.path import join
@@ -79,9 +79,8 @@ def can_link_svml():
     """
     if NPY_DISABLE_SVML:
         return False
-    machine = platform.machine()
-    system = platform.system()
-    return "x86_64" in machine and system == "Linux"
+    platform = sysconfig.get_platform()
+    return "x86_64" in platform and "linux" in platform
 
 def check_svml_submodule(svmlpath):
     if not os.path.exists(svmlpath + "/README.md"):
@@ -1081,6 +1080,7 @@ def configuration(parent_package='',top_path=None):
             join('src', 'umath', 'scalarmath.c.src'),
             join('src', 'umath', 'ufunc_type_resolution.c'),
             join('src', 'umath', 'override.c'),
+            join('src', 'umath', 'string_ufuncs.cpp'),
             # For testing. Eventually, should use public API and be separate:
             join('src', 'umath', '_scaled_float_dtype.c'),
             ]
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src
index 0f3e4fc8f..997205957 100644
--- a/numpy/core/src/_simd/_simd.dispatch.c.src
+++ b/numpy/core/src/_simd/_simd.dispatch.c.src
@@ -18,7 +18,7 @@
  * #esfx      = u16,s8, u32, s16, u32, s32, u64, s64, f32, f64#
  * #size      = 8,  8,  16,  16,  32,  32,  64,  64,  32,  64#
  * #expand_sup= 1,  0,  1,   0,   0,   0,   0,   0,   0,   0#
- * #simd_sup  = 1,  1,  1,   1,   1,   1,   1,   1,   1,   NPY_SIMD_F64#
+ * #simd_sup  = 1,  1,  1,   1,   1,   1,   1,   1,   NPY_SIMD_F32, NPY_SIMD_F64#
  * #fp_only   = 0,  0,  0,   0,   0,   0,   0,   0,   1,   1#
  * #sat_sup   = 1,  1,  1,   1,   0,   0,   0,   0,   0,   0#
  * #mul_sup   = 1,  1,  1,   1,   1,   1,   0,   0,   1,   1#
@@ -252,7 +252,7 @@ SIMD_IMPL_INTRIN_3(select_@sfx@, v@sfx@, v@bsfx@, v@sfx@, v@sfx@)
 
 /**begin repeat1
  * #sfx_to     = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
- * #simd_sup2  = 1,  1,  1,   1,   1,   1,   1,   1,   1,   NPY_SIMD_F64#
+ * #simd_sup2  = 1,  1,  1,   1,   1,   1,   1,   1,   NPY_SIMD_F32, NPY_SIMD_F64#
  */
 #if @simd_sup2@
 SIMD_IMPL_INTRIN_1(reinterpret_@sfx_to@_@sfx@, v@sfx_to@, v@sfx@)
@@ -442,7 +442,9 @@ SIMD_IMPL_INTRIN_0N(cleanup)
  * Operators
  ***************************/
 // check special cases
-SIMD_IMPL_INTRIN_1(notnan_f32, vb32, vf32)
+#if NPY_SIMD_F32
+    SIMD_IMPL_INTRIN_1(notnan_f32, vb32, vf32)
+#endif
 #if NPY_SIMD_F64
     SIMD_IMPL_INTRIN_1(notnan_f64, vb64, vf64)
 #endif
@@ -450,7 +452,9 @@ SIMD_IMPL_INTRIN_1(notnan_f32, vb32, vf32)
  * Conversions
  ***************************/
 // round to nearest integer (assume even)
-SIMD_IMPL_INTRIN_1(round_s32_f32, vs32, vf32)
+#if NPY_SIMD_F32
+    SIMD_IMPL_INTRIN_1(round_s32_f32, vs32, vf32)
+#endif
 #if NPY_SIMD_F64
     SIMD_IMPL_INTRIN_2(round_s32_f64, vs32, vf64, vf64)
 #endif
@@ -492,10 +496,10 @@ static PyMethodDef simd__intrinsics_methods[] = {
 /**begin repeat
  * #sfx       = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
  * #bsfx      = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64#
- * #esfx      = u16,s8, u32, s16, u32, s32, u64, s64, f32, f64#
  * #size      = 8,  8,  16,  16,  32,  32,  64,  64,  32,  64#
+ * #esfx      = u16, s8, u32,s16, u32, s32, u64, s64, f32, f64#
  * #expand_sup= 1,  0,  1,   0,   0,   0,   0,   0,   0,   0#
- * #simd_sup  = 1,  1,  1,   1,   1,   1,   1,   1,   1,   NPY_SIMD_F64#
+ * #simd_sup  = 1,  1,  1,   1,   1,   1,   1,   1,   NPY_SIMD_F32, NPY_SIMD_F64#
  * #fp_only   = 0,  0,  0,   0,   0,   0,   0,   0,   1,   1#
  * #sat_sup   = 1,  1,  1,   1,   0,   0,   0,   0,   0,   0#
  * #mul_sup   = 1,  1,  1,   1,   1,   1,   0,   0,   1,   1#
@@ -547,7 +551,7 @@ SIMD_INTRIN_DEF(lut16_@sfx@)
  ***************************/
 /**begin repeat1
  * #sfx_to     = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
- * #simd_sup2  = 1,  1,  1,   1,   1,   1,   1,   1,   1,   NPY_SIMD_F64#
+ * #simd_sup2  = 1,  1,  1,   1,   1,   1,   1,   1,   NPY_SIMD_F32, NPY_SIMD_F64#
  */
 #if @simd_sup2@
 SIMD_INTRIN_DEF(reinterpret_@sfx_to@_@sfx@)
@@ -698,7 +702,9 @@ SIMD_INTRIN_DEF(cleanup)
  * Operators
  ***************************/
 // check special cases
-SIMD_INTRIN_DEF(notnan_f32)
+#if NPY_SIMD_F32
+    SIMD_INTRIN_DEF(notnan_f32)
+#endif
 #if NPY_SIMD_F64
     SIMD_INTRIN_DEF(notnan_f64)
 #endif
@@ -706,7 +712,9 @@ SIMD_INTRIN_DEF(notnan_f32)
  * Conversions
  ***************************/
 // round to nearest integer (assume even)
-SIMD_INTRIN_DEF(round_s32_f32)
+#if NPY_SIMD_F32
+    SIMD_INTRIN_DEF(round_s32_f32)
+#endif
 #if NPY_SIMD_F64
     SIMD_INTRIN_DEF(round_s32_f64)
 #endif
@@ -777,12 +785,18 @@ NPY_CPU_DISPATCH_CURFX(simd_create_module)(void)
     if (PyModule_AddIntConstant(m, "simd_f64", NPY_SIMD_F64)) {
         goto err;
     }
+    if (PyModule_AddIntConstant(m, "simd_f32", NPY_SIMD_F32)) {
+        goto err;
+    }
     if (PyModule_AddIntConstant(m, "simd_fma3", NPY_SIMD_FMA3)) {
         goto err;
     }
     if (PyModule_AddIntConstant(m, "simd_width", NPY_SIMD_WIDTH)) {
         goto err;
     }
+    if (PyModule_AddIntConstant(m, "simd_bigendian", NPY_SIMD_BIGENDIAN)) {
+        goto err;
+    }
 #if NPY_SIMD
     if (PySIMDVectorType_Init(m)) {
         goto err;
diff --git a/numpy/core/src/_simd/_simd_convert.inc b/numpy/core/src/_simd/_simd_convert.inc
index 46e044479..58eb90d69 100644
--- a/numpy/core/src/_simd/_simd_convert.inc
+++ b/numpy/core/src/_simd/_simd_convert.inc
@@ -20,6 +20,10 @@ simd_scalar_from_number(PyObject *obj, simd_data_type dtype)
         }
     } else {
         data.u64 = PyLong_AsUnsignedLongLongMask(obj);
+    #if NPY_SIMD_BIGENDIAN
+        int leftb = (sizeof(npyv_lanetype_u64) - info->lane_size) * 8;
+        data.u64 <<= leftb;
+    #endif
     }
     return data;
 }
@@ -36,7 +40,9 @@ simd_scalar_to_number(simd_data data, simd_data_type dtype)
         return PyFloat_FromDouble(data.f64);
     }
     int leftb = (sizeof(npyv_lanetype_u64) - info->lane_size) * 8;
+#if !NPY_SIMD_BIGENDIAN
     data.u64 <<= leftb;
+#endif
     if (info->is_signed) {
         return PyLong_FromLongLong(data.s64 >> leftb);
     }
diff --git a/numpy/core/src/_simd/_simd_inc.h.src b/numpy/core/src/_simd/_simd_inc.h.src
index fbdf982c2..887545414 100644
--- a/numpy/core/src/_simd/_simd_inc.h.src
+++ b/numpy/core/src/_simd/_simd_inc.h.src
@@ -27,22 +27,27 @@ typedef union
     /**end repeat**/
     // vectors
     /**begin repeat
-     * #sfx  = u8, u16, u32, u64, s8, s16, s32, s64, f32, b8, b16, b32, b64#
+     * #sfx  = u8, u16, u32, u64, s8, s16, s32, s64, b8, b16, b32, b64#
      */
     npyv_@sfx@ v@sfx@;
     /**end repeat**/
     // multi-vectors x2
     /**begin repeat
-     * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32#
+     * #sfx = u8, u16, u32, u64, s8, s16, s32, s64#
      */
     npyv_@sfx@x2 v@sfx@x2;
     /**end repeat**/
     // multi-vectors x3
     /**begin repeat
-     * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32#
+     * #sfx = u8, u16, u32, u64, s8, s16, s32, s64#
      */
     npyv_@sfx@x3 v@sfx@x3;
     /**end repeat**/
+#if NPY_SIMD_F32
+    npyv_f32    vf32;
+    npyv_f32x2  vf32x2;
+    npyv_f32x3  vf32x3;
+#endif
 #if NPY_SIMD_F64
     npyv_f64    vf64;
     npyv_f64x2  vf64x2;
diff --git a/numpy/core/src/common/lowlevel_strided_loops.h b/numpy/core/src/common/lowlevel_strided_loops.h
index 118ce9cb1..924a34db5 100644
--- a/numpy/core/src/common/lowlevel_strided_loops.h
+++ b/numpy/core/src/common/lowlevel_strided_loops.h
@@ -196,7 +196,7 @@ PyArray_GetDTypeTransferFunction(int aligned,
                             PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
                             int move_references,
                             NPY_cast_info *cast_info,
-                            int *out_needs_api);
+                            NPY_ARRAYMETHOD_FLAGS *out_flags);
 
 NPY_NO_EXPORT int
 get_fields_transfer_function(int aligned,
@@ -205,7 +205,7 @@ get_fields_transfer_function(int aligned,
         int move_references,
         PyArrayMethod_StridedLoop **out_stransfer,
         NpyAuxData **out_transferdata,
-        int *out_needs_api);
+        NPY_ARRAYMETHOD_FLAGS *out_flags);
 
 NPY_NO_EXPORT int
 get_subarray_transfer_function(int aligned,
@@ -214,7 +214,7 @@ get_subarray_transfer_function(int aligned,
         int move_references,
         PyArrayMethod_StridedLoop **out_stransfer,
         NpyAuxData **out_transferdata,
-        int *out_needs_api);
+        NPY_ARRAYMETHOD_FLAGS *out_flags);
 
 /*
  * This is identical to PyArray_GetDTypeTransferFunction, but returns a
@@ -241,7 +241,7 @@ PyArray_GetMaskedDTypeTransferFunction(int aligned,
                             PyArray_Descr *mask_dtype,
                             int move_references,
                             NPY_cast_info *cast_info,
-                            int *out_needs_api);
+                            NPY_ARRAYMETHOD_FLAGS *out_flags);
 
 /*
  * Casts the specified number of elements from 'src' with data type
@@ -336,10 +336,14 @@ mapiter_trivial_set(PyArrayObject *self, PyArrayObject *ind,
                        PyArrayObject *result);
 
 NPY_NO_EXPORT int
-mapiter_get(PyArrayMapIterObject *mit);
+mapiter_get(
+        PyArrayMapIterObject *mit, NPY_cast_info *cast_info,
+        NPY_ARRAYMETHOD_FLAGS flags, int is_aligned);
 
 NPY_NO_EXPORT int
-mapiter_set(PyArrayMapIterObject *mit);
+mapiter_set(
+        PyArrayMapIterObject *mit, NPY_cast_info *cast_info,
+        NPY_ARRAYMETHOD_FLAGS flags, int is_aligned);
 
 /*
  * Prepares shape and strides for a simple raw array iteration.
diff --git a/numpy/core/src/common/npy_cpu_dispatch.h b/numpy/core/src/common/npy_cpu_dispatch.h
index e814cd425..4d5addec8 100644
--- a/numpy/core/src/common/npy_cpu_dispatch.h
+++ b/numpy/core/src/common/npy_cpu_dispatch.h
@@ -22,7 +22,7 @@
  * which is explicitly disabling the module ccompiler_opt.
  */
 #ifndef NPY_DISABLE_OPTIMIZATION
-    #if defined(__powerpc64__) && !defined(__cplusplus) && defined(bool)
+    #if (defined(__s390x__) || defined(__powerpc64__)) && !defined(__cplusplus) && defined(bool)
         /**
          * "altivec.h" header contains the definitions(bool, vector, pixel),
          * usually in c++ we undefine them after including the header.
@@ -34,7 +34,7 @@
         typedef bool npy__dispatch_bkbool;
     #endif
     #include "npy_cpu_dispatch_config.h"
-    #ifdef NPY_HAVE_VSX
+    #if defined(NPY_HAVE_VSX) || defined(NPY_HAVE_VX)
         #undef bool
         #undef vector
         #undef pixel
diff --git a/numpy/core/src/common/numpyos.h b/numpy/core/src/common/numpyos.h
index ce49cbea7..6e526af17 100644
--- a/numpy/core/src/common/numpyos.h
+++ b/numpy/core/src/common/numpyos.h
@@ -1,6 +1,10 @@
 #ifndef NUMPY_CORE_SRC_COMMON_NPY_NUMPYOS_H_
 #define NUMPY_CORE_SRC_COMMON_NPY_NUMPYOS_H_
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 NPY_NO_EXPORT char*
 NumPyOS_ascii_formatd(char *buffer, size_t buf_size,
                       const char *format,
@@ -39,4 +43,8 @@ NumPyOS_strtoll(const char *str, char **endptr, int base);
 NPY_NO_EXPORT npy_ulonglong
 NumPyOS_strtoull(const char *str, char **endptr, int base);
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif  /* NUMPY_CORE_SRC_COMMON_NPY_NUMPYOS_H_ */
diff --git a/numpy/core/src/common/simd/avx2/avx2.h b/numpy/core/src/common/simd/avx2/avx2.h
index 02ff536fb..8cb74df2b 100644
--- a/numpy/core/src/common/simd/avx2/avx2.h
+++ b/numpy/core/src/common/simd/avx2/avx2.h
@@ -3,12 +3,14 @@
 #endif
 #define NPY_SIMD 256
 #define NPY_SIMD_WIDTH 32
+#define NPY_SIMD_F32 1
 #define NPY_SIMD_F64 1
 #ifdef NPY_HAVE_FMA3
     #define NPY_SIMD_FMA3 1 // native support
 #else
     #define NPY_SIMD_FMA3 0 // fast emulated
 #endif
+#define NPY_SIMD_BIGENDIAN 0
 // Enough limit to allow us to use _mm256_i32gather_*
 #define NPY_SIMD_MAXLOAD_STRIDE32 (0x7fffffff / 8)
 
diff --git a/numpy/core/src/common/simd/avx512/avx512.h b/numpy/core/src/common/simd/avx512/avx512.h
index f38686834..0946e6443 100644
--- a/numpy/core/src/common/simd/avx512/avx512.h
+++ b/numpy/core/src/common/simd/avx512/avx512.h
@@ -3,8 +3,10 @@
 #endif
 #define NPY_SIMD 512
 #define NPY_SIMD_WIDTH 64
+#define NPY_SIMD_F32 1
 #define NPY_SIMD_F64 1
 #define NPY_SIMD_FMA3 1 // native support
+#define NPY_SIMD_BIGENDIAN 0
 // Enough limit to allow us to use _mm512_i32gather_* and _mm512_i32scatter_*
 #define NPY_SIMD_MAXLOAD_STRIDE32  (0x7fffffff / 16)
 #define NPY_SIMD_MAXSTORE_STRIDE32 (0x7fffffff / 16)
diff --git a/numpy/core/src/common/simd/emulate_maskop.h b/numpy/core/src/common/simd/emulate_maskop.h
index 41e397c2d..2a808a153 100644
--- a/numpy/core/src/common/simd/emulate_maskop.h
+++ b/numpy/core/src/common/simd/emulate_maskop.h
@@ -36,7 +36,9 @@ NPYV_IMPL_EMULATE_MASK_ADDSUB(u32, b32)
 NPYV_IMPL_EMULATE_MASK_ADDSUB(s32, b32)
 NPYV_IMPL_EMULATE_MASK_ADDSUB(u64, b64)
 NPYV_IMPL_EMULATE_MASK_ADDSUB(s64, b64)
-NPYV_IMPL_EMULATE_MASK_ADDSUB(f32, b32)
+#if NPY_SIMD_F32
+    NPYV_IMPL_EMULATE_MASK_ADDSUB(f32, b32)
+#endif
 #if NPY_SIMD_F64
     NPYV_IMPL_EMULATE_MASK_ADDSUB(f64, b64)
 #endif
diff --git a/numpy/core/src/common/simd/intdiv.h b/numpy/core/src/common/simd/intdiv.h
index 8b65b3a76..f5066b59b 100644
--- a/numpy/core/src/common/simd/intdiv.h
+++ b/numpy/core/src/common/simd/intdiv.h
@@ -89,7 +89,9 @@ NPY_FINLINE unsigned npyv__bitscan_revnz_u32(npy_uint32 a)
     unsigned long rl;
     (void)_BitScanReverse(&rl, (unsigned long)a);
     r = (unsigned)rl;
-#elif defined(NPY_HAVE_SSE2) && (defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER))
+
+#elif defined(NPY_HAVE_SSE2) && (defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)) \
+    &&  (defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64))
     __asm__("bsr %1, %0" : "=r" (r) : "r"(a));
 #elif defined(__GNUC__) || defined(__clang__)
     r = 31 - __builtin_clz(a); // performs on arm -> clz, ppc -> cntlzw
@@ -206,7 +208,7 @@ NPY_FINLINE npyv_u8x3 npyv_divisor_u8(npy_uint8 d)
     divisor.val[0] = npyv_setall_u16(m);
     divisor.val[1] = npyv_set_u8(sh1);
     divisor.val[2] = npyv_set_u8(sh2);
-#elif defined(NPY_HAVE_VSX2)
+#elif defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX)
     divisor.val[0] = npyv_setall_u8(m);
     divisor.val[1] = npyv_setall_u8(sh1);
     divisor.val[2] = npyv_setall_u8(sh2);
@@ -247,7 +249,7 @@ NPY_FINLINE npyv_s8x3 npyv_divisor_s8(npy_int8 d)
     npyv_s8x3 divisor;
     divisor.val[0] = npyv_setall_s8(m);
     divisor.val[2] = npyv_setall_s8(d < 0 ? -1 : 0);
-    #ifdef NPY_HAVE_VSX2
+    #if defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX)
         divisor.val[1] = npyv_setall_s8(sh);
     #elif defined(NPY_HAVE_NEON)
         divisor.val[1] = npyv_setall_s8(-sh);
@@ -283,7 +285,7 @@ NPY_FINLINE npyv_u16x3 npyv_divisor_u16(npy_uint16 d)
 #ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512
     divisor.val[1] = npyv_set_u16(sh1);
     divisor.val[2] = npyv_set_u16(sh2);
-#elif defined(NPY_HAVE_VSX2)
+#elif defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX)
     divisor.val[1] = npyv_setall_u16(sh1);
     divisor.val[2] = npyv_setall_u16(sh2);
 #elif defined(NPY_HAVE_NEON)
@@ -315,7 +317,7 @@ NPY_FINLINE npyv_s16x3 npyv_divisor_s16(npy_int16 d)
     divisor.val[2] = npyv_setall_s16(d < 0 ? -1 : 0); // sign of divisor
 #ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512
     divisor.val[1] = npyv_set_s16(sh);
-#elif defined(NPY_HAVE_VSX2)
+#elif defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX)
     divisor.val[1] = npyv_setall_s16(sh);
 #elif defined(NPY_HAVE_NEON)
     divisor.val[1] = npyv_setall_s16(-sh);
@@ -350,7 +352,7 @@ NPY_FINLINE npyv_u32x3 npyv_divisor_u32(npy_uint32 d)
 #ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512
     divisor.val[1] = npyv_set_u32(sh1);
     divisor.val[2] = npyv_set_u32(sh2);
-#elif defined(NPY_HAVE_VSX2)
+#elif defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX)
     divisor.val[1] = npyv_setall_u32(sh1);
     divisor.val[2] = npyv_setall_u32(sh2);
 #elif defined(NPY_HAVE_NEON)
@@ -387,7 +389,7 @@ NPY_FINLINE npyv_s32x3 npyv_divisor_s32(npy_int32 d)
     divisor.val[2] = npyv_setall_s32(d < 0 ? -1 : 0); // sign of divisor
 #ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512
     divisor.val[1] = npyv_set_s32(sh);
-#elif defined(NPY_HAVE_VSX2)
+#elif defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX)
     divisor.val[1] = npyv_setall_s32(sh);
 #elif defined(NPY_HAVE_NEON)
     divisor.val[1] = npyv_setall_s32(-sh);
@@ -400,7 +402,7 @@ NPY_FINLINE npyv_s32x3 npyv_divisor_s32(npy_int32 d)
 NPY_FINLINE npyv_u64x3 npyv_divisor_u64(npy_uint64 d)
 {
     npyv_u64x3 divisor;
-#if defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_NEON)
+#if defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX) || defined(NPY_HAVE_NEON)
     divisor.val[0] = npyv_setall_u64(d);
 #else
     npy_uint64 l, l2, sh1, sh2, m;
@@ -435,7 +437,7 @@ NPY_FINLINE npyv_u64x3 npyv_divisor_u64(npy_uint64 d)
 NPY_FINLINE npyv_s64x3 npyv_divisor_s64(npy_int64 d)
 {
     npyv_s64x3 divisor;
-#if defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_NEON)
+#if defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX) || defined(NPY_HAVE_NEON)
     divisor.val[0] = npyv_setall_s64(d);
     divisor.val[1] = npyv_cvt_s64_b64(
         npyv_cmpeq_s64(npyv_setall_s64(-1), divisor.val[0])
diff --git a/numpy/core/src/common/simd/neon/math.h b/numpy/core/src/common/simd/neon/math.h
index 4607d6f27..8f4680c8f 100644
--- a/numpy/core/src/common/simd/neon/math.h
+++ b/numpy/core/src/common/simd/neon/math.h
@@ -161,7 +161,7 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a)
 #else
     // ARMv7 NEON only supports fp to int truncate conversion.
     // a magic trick of adding 1.5 * 2**23 is used for rounding
-    // to nearest even and then substract this magic number to get
+    // to nearest even and then subtract this magic number to get
     // the integer.
     const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f));
     const npyv_f32 magic = vdupq_n_f32(12582912.0f); // 1.5 * 2**23
diff --git a/numpy/core/src/common/simd/neon/neon.h b/numpy/core/src/common/simd/neon/neon.h
index e6f6a7324..b08071527 100644
--- a/numpy/core/src/common/simd/neon/neon.h
+++ b/numpy/core/src/common/simd/neon/neon.h
@@ -4,7 +4,7 @@
 
 #define NPY_SIMD 128
 #define NPY_SIMD_WIDTH 16
-
+#define NPY_SIMD_F32 1
 #ifdef __aarch64__
     #define NPY_SIMD_F64 1
 #else
@@ -15,6 +15,7 @@
 #else
     #define NPY_SIMD_FMA3 0  // HW emulated
 #endif
+#define NPY_SIMD_BIGENDIAN 0
 
 typedef uint8x16_t  npyv_u8;
 typedef int8x16_t   npyv_s8;
diff --git a/numpy/core/src/common/simd/simd.h b/numpy/core/src/common/simd/simd.h
index 08b2a7d00..b1492500f 100644
--- a/numpy/core/src/common/simd/simd.h
+++ b/numpy/core/src/common/simd/simd.h
@@ -34,7 +34,7 @@ typedef double     npyv_lanetype_f64;
  * They had bad impact on the generated instructions,
  * sometimes the compiler deal with them without the respect
  * of 32-bit mode which lead to crush due to execute 64-bit
- * instructions and other times generate bad emulated instructions. 
+ * instructions and other times generate bad emulated instructions.
  */
     #undef _mm512_set1_epi64
     #undef _mm256_set1_epi64x
@@ -54,9 +54,9 @@ typedef double     npyv_lanetype_f64;
     #include "sse/sse.h"
 #endif
 
-// TODO: Add support for VSX(2.06) and BE Mode
-#if defined(NPY_HAVE_VSX2) && defined(__LITTLE_ENDIAN__)
-    #include "vsx/vsx.h"
+// TODO: Add support for VSX(2.06) and BE Mode for VSX
+#if defined(NPY_HAVE_VX) || (defined(NPY_HAVE_VSX2) && defined(__LITTLE_ENDIAN__))
+    #include "vec/vec.h"
 #endif
 
 #ifdef NPY_HAVE_NEON
@@ -64,10 +64,20 @@ typedef double     npyv_lanetype_f64;
 #endif
 
 #ifndef NPY_SIMD
+    /// SIMD width in bits or 0 if there's no SIMD extension available.
     #define NPY_SIMD 0
+    /// SIMD width in bytes or 0 if there's no SIMD extension available.
     #define NPY_SIMD_WIDTH 0
+    /// 1 if the enabled SIMD extension supports single-precision otherwise 0.
+    #define NPY_SIMD_F32 0
+    /// 1 if the enabled SIMD extension supports double-precision otherwise 0.
     #define NPY_SIMD_F64 0
+    /// 1 if the enabled SIMD extension supports native FMA otherwise 0.
+    /// note: we still emulate(fast) FMA intrinsics even if they
+    /// aren't supported but they shouldn't be used if precision matters.
     #define NPY_SIMD_FMA3 0
+    /// 1 if the enabled SIMD extension is running on big-endian mode otherwise 0.
+    #define NPY_SIMD_BIGENDIAN 0
 #endif
 
 // enable emulated mask operations for all SIMD extension except for AVX512
diff --git a/numpy/core/src/common/simd/sse/sse.h b/numpy/core/src/common/simd/sse/sse.h
index 0bb404312..c21bbfda7 100644
--- a/numpy/core/src/common/simd/sse/sse.h
+++ b/numpy/core/src/common/simd/sse/sse.h
@@ -4,12 +4,15 @@
 
 #define NPY_SIMD 128
 #define NPY_SIMD_WIDTH 16
+#define NPY_SIMD_F32 1
 #define NPY_SIMD_F64 1
 #if defined(NPY_HAVE_FMA3) || defined(NPY_HAVE_FMA4)
     #define NPY_SIMD_FMA3 1  // native support
 #else
     #define NPY_SIMD_FMA3 0  // fast emulated
 #endif
+#define NPY_SIMD_BIGENDIAN 0
+
 typedef __m128i npyv_u8;
 typedef __m128i npyv_s8;
 typedef __m128i npyv_u16;
diff --git a/numpy/core/src/common/simd/vsx/arithmetic.h b/numpy/core/src/common/simd/vec/arithmetic.h
index 01dbf5480..a2e9d07eb 100644
--- a/numpy/core/src/common/simd/vsx/arithmetic.h
+++ b/numpy/core/src/common/simd/vec/arithmetic.h
@@ -2,8 +2,8 @@
     #error "Not a standalone header"
 #endif
 
-#ifndef _NPY_SIMD_VSX_ARITHMETIC_H
-#define _NPY_SIMD_VSX_ARITHMETIC_H
+#ifndef _NPY_SIMD_VEC_ARITHMETIC_H
+#define _NPY_SIMD_VEC_ARITHMETIC_H
 
 /***************************
  * Addition
@@ -17,15 +17,32 @@
 #define npyv_add_s32 vec_add
 #define npyv_add_u64 vec_add
 #define npyv_add_s64 vec_add
+#if NPY_SIMD_F32
 #define npyv_add_f32 vec_add
+#endif
 #define npyv_add_f64 vec_add
 
 // saturated
-#define npyv_adds_u8  vec_adds
-#define npyv_adds_s8  vec_adds
-#define npyv_adds_u16 vec_adds
-#define npyv_adds_s16 vec_adds
+#ifdef NPY_HAVE_VX
+    #define NPYV_IMPL_VX_ADDS(SFX, PSFX) \
+        NPY_FINLINE npyv_##SFX npyv_adds_##SFX(npyv_##SFX a, npyv_##SFX b)\
+        {                                                                 \
+            return vec_pack##PSFX(                                        \
+                vec_add(vec_unpackh(a), vec_unpackh(b)),                  \
+                vec_add(vec_unpackl(a), vec_unpackl(b))                   \
+            );                                                            \
+        }
 
+    NPYV_IMPL_VX_ADDS(u8, su)
+    NPYV_IMPL_VX_ADDS(s8, s)
+    NPYV_IMPL_VX_ADDS(u16, su)
+    NPYV_IMPL_VX_ADDS(s16, s)
+#else // VSX
+    #define npyv_adds_u8  vec_adds
+    #define npyv_adds_s8  vec_adds
+    #define npyv_adds_u16 vec_adds
+    #define npyv_adds_s16 vec_adds
+#endif
 /***************************
  * Subtraction
  ***************************/
@@ -38,21 +55,39 @@
 #define npyv_sub_s32 vec_sub
 #define npyv_sub_u64 vec_sub
 #define npyv_sub_s64 vec_sub
+#if NPY_SIMD_F32
 #define npyv_sub_f32 vec_sub
+#endif
 #define npyv_sub_f64 vec_sub
 
 // saturated
-#define npyv_subs_u8  vec_subs
-#define npyv_subs_s8  vec_subs
-#define npyv_subs_u16 vec_subs
-#define npyv_subs_s16 vec_subs
+#ifdef NPY_HAVE_VX
+    #define NPYV_IMPL_VX_SUBS(SFX, PSFX)                                  \
+        NPY_FINLINE npyv_##SFX npyv_subs_##SFX(npyv_##SFX a, npyv_##SFX b)\
+        {                                                                 \
+            return vec_pack##PSFX(                                        \
+                vec_sub(vec_unpackh(a), vec_unpackh(b)),                  \
+                vec_sub(vec_unpackl(a), vec_unpackl(b))                   \
+            );                                                            \
+        }
+
+    NPYV_IMPL_VX_SUBS(u8, su)
+    NPYV_IMPL_VX_SUBS(s8, s)
+    NPYV_IMPL_VX_SUBS(u16, su)
+    NPYV_IMPL_VX_SUBS(s16, s)
+#else // VSX
+    #define npyv_subs_u8  vec_subs
+    #define npyv_subs_s8  vec_subs
+    #define npyv_subs_u16 vec_subs
+    #define npyv_subs_s16 vec_subs
+#endif
 
 /***************************
  * Multiplication
  ***************************/
 // non-saturated
 // up to GCC 6 vec_mul only supports precisions and llong
-#if defined(__GNUC__) && __GNUC__ < 7
+#if defined(NPY_HAVE_VSX) && defined(__GNUC__) && __GNUC__ < 7
     #define NPYV_IMPL_VSX_MUL(T_VEC, SFX, ...)              \
         NPY_FINLINE T_VEC npyv_mul_##SFX(T_VEC a, T_VEC b)  \
         {                                                   \
@@ -91,7 +126,9 @@
     #define npyv_mul_u32 vec_mul
     #define npyv_mul_s32 vec_mul
 #endif
+#if NPY_SIMD_F32
 #define npyv_mul_f32 vec_mul
+#endif
 #define npyv_mul_f64 vec_mul
 
 /***************************
@@ -101,6 +138,9 @@
 // divide each unsigned 8-bit element by a precomputed divisor
 NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor)
 {
+#ifdef NPY_HAVE_VX
+    npyv_u8  mulhi    = vec_mulh(a, divisor.val[0]);
+#else // VSX
     const npyv_u8 mergeo_perm = {
         1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31
     };
@@ -108,6 +148,7 @@ NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor)
     npyv_u16 mul_even = vec_mule(a, divisor.val[0]);
     npyv_u16 mul_odd  = vec_mulo(a, divisor.val[0]);
     npyv_u8  mulhi    = (npyv_u8)vec_perm(mul_even, mul_odd, mergeo_perm);
+#endif
     // floor(a/d)     = (mulhi + ((a-mulhi) >> sh1)) >> sh2
     npyv_u8 q         = vec_sub(a, mulhi);
             q         = vec_sr(q, divisor.val[1]);
@@ -118,6 +159,9 @@ NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor)
 // divide each signed 8-bit element by a precomputed divisor
 NPY_FINLINE npyv_s8 npyv_divc_s8(npyv_s8 a, const npyv_s8x3 divisor)
 {
+#ifdef NPY_HAVE_VX
+    npyv_s8  mulhi    = vec_mulh(a, divisor.val[0]);
+#else
     const npyv_u8 mergeo_perm = {
         1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31
     };
@@ -125,16 +169,20 @@ NPY_FINLINE npyv_s8 npyv_divc_s8(npyv_s8 a, const npyv_s8x3 divisor)
     npyv_s16 mul_even = vec_mule(a, divisor.val[0]);
     npyv_s16 mul_odd  = vec_mulo(a, divisor.val[0]);
     npyv_s8  mulhi    = (npyv_s8)vec_perm(mul_even, mul_odd, mergeo_perm);
+#endif
     // q              = ((a + mulhi) >> sh1) - XSIGN(a)
     // trunc(a/d)     = (q ^ dsign) - dsign
-    npyv_s8 q         = vec_sra(vec_add(a, mulhi), (npyv_u8)divisor.val[1]);
-            q         = vec_sub(q, vec_sra(a, npyv_setall_u8(7)));
+    npyv_s8 q         = vec_sra_s8(vec_add(a, mulhi), (npyv_u8)divisor.val[1]);
+            q         = vec_sub(q, vec_sra_s8(a, npyv_setall_u8(7)));
             q         = vec_sub(vec_xor(q, divisor.val[2]), divisor.val[2]);
     return  q;
 }
 // divide each unsigned 16-bit element by a precomputed divisor
 NPY_FINLINE npyv_u16 npyv_divc_u16(npyv_u16 a, const npyv_u16x3 divisor)
 {
+#ifdef NPY_HAVE_VX
+    npyv_u16 mulhi    = vec_mulh(a, divisor.val[0]);
+#else // VSX
     const npyv_u8 mergeo_perm = {
         2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31
     };
@@ -142,6 +190,7 @@ NPY_FINLINE npyv_u16 npyv_divc_u16(npyv_u16 a, const npyv_u16x3 divisor)
     npyv_u32 mul_even = vec_mule(a, divisor.val[0]);
     npyv_u32 mul_odd  = vec_mulo(a, divisor.val[0]);
     npyv_u16 mulhi    = (npyv_u16)vec_perm(mul_even, mul_odd, mergeo_perm);
+#endif
     // floor(a/d)     = (mulhi + ((a-mulhi) >> sh1)) >> sh2
     npyv_u16 q        = vec_sub(a, mulhi);
              q        = vec_sr(q, divisor.val[1]);
@@ -152,6 +201,9 @@ NPY_FINLINE npyv_u16 npyv_divc_u16(npyv_u16 a, const npyv_u16x3 divisor)
 // divide each signed 16-bit element by a precomputed divisor (round towards zero)
 NPY_FINLINE npyv_s16 npyv_divc_s16(npyv_s16 a, const npyv_s16x3 divisor)
 {
+#ifdef NPY_HAVE_VX
+    npyv_s16 mulhi    = vec_mulh(a, divisor.val[0]);
+#else // VSX
     const npyv_u8 mergeo_perm = {
         2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31
     };
@@ -159,30 +211,31 @@ NPY_FINLINE npyv_s16 npyv_divc_s16(npyv_s16 a, const npyv_s16x3 divisor)
     npyv_s32 mul_even = vec_mule(a, divisor.val[0]);
     npyv_s32 mul_odd  = vec_mulo(a, divisor.val[0]);
     npyv_s16 mulhi    = (npyv_s16)vec_perm(mul_even, mul_odd, mergeo_perm);
+#endif
     // q              = ((a + mulhi) >> sh1) - XSIGN(a)
     // trunc(a/d)     = (q ^ dsign) - dsign
-    npyv_s16 q        = vec_sra(vec_add(a, mulhi), (npyv_u16)divisor.val[1]);
-             q        = vec_sub(q, vec_sra(a, npyv_setall_u16(15)));
+    npyv_s16 q        = vec_sra_s16(vec_add(a, mulhi), (npyv_u16)divisor.val[1]);
+             q        = vec_sub(q, vec_sra_s16(a, npyv_setall_u16(15)));
              q        = vec_sub(vec_xor(q, divisor.val[2]), divisor.val[2]);
     return   q;
 }
 // divide each unsigned 32-bit element by a precomputed divisor
 NPY_FINLINE npyv_u32 npyv_divc_u32(npyv_u32 a, const npyv_u32x3 divisor)
 {
-#if defined(NPY_HAVE_VSX4)
+#if defined(NPY_HAVE_VSX4) || defined(NPY_HAVE_VX)
     // high part of unsigned multiplication
     npyv_u32 mulhi    = vec_mulh(a, divisor.val[0]);
-#else
-#if defined(__GNUC__) && __GNUC__ < 8
-    // Doubleword integer wide multiplication supported by GCC 8+
-    npyv_u64 mul_even, mul_odd;
-    __asm__ ("vmulouw %0,%1,%2" : "=v" (mul_even) : "v" (a), "v" (divisor.val[0]));
-    __asm__ ("vmuleuw %0,%1,%2" : "=v" (mul_odd)  : "v" (a), "v" (divisor.val[0]));
-#else
-    // Doubleword integer wide multiplication supported by GCC 8+
-    npyv_u64 mul_even = vec_mule(a, divisor.val[0]);
-    npyv_u64 mul_odd  = vec_mulo(a, divisor.val[0]);
-#endif
+#else // VSX
+    #if defined(__GNUC__) && __GNUC__ < 8
+        // Doubleword integer wide multiplication supported by GCC 8+
+        npyv_u64 mul_even, mul_odd;
+        __asm__ ("vmulouw %0,%1,%2" : "=v" (mul_even) : "v" (a), "v" (divisor.val[0]));
+        __asm__ ("vmuleuw %0,%1,%2" : "=v" (mul_odd)  : "v" (a), "v" (divisor.val[0]));
+    #else
+        // Doubleword integer wide multiplication supported by GCC 8+
+        npyv_u64 mul_even = vec_mule(a, divisor.val[0]);
+        npyv_u64 mul_odd  = vec_mulo(a, divisor.val[0]);
+    #endif
     // high part of unsigned multiplication
     npyv_u32 mulhi    = vec_mergeo((npyv_u32)mul_even, (npyv_u32)mul_odd);
 #endif
@@ -196,27 +249,27 @@ NPY_FINLINE npyv_u32 npyv_divc_u32(npyv_u32 a, const npyv_u32x3 divisor)
 // divide each signed 32-bit element by a precomputed divisor (round towards zero)
 NPY_FINLINE npyv_s32 npyv_divc_s32(npyv_s32 a, const npyv_s32x3 divisor)
 {
-#if defined(NPY_HAVE_VSX4)
+#if defined(NPY_HAVE_VSX4) || defined(NPY_HAVE_VX)
     // high part of signed multiplication
     npyv_s32 mulhi    = vec_mulh(a, divisor.val[0]);
 #else
-#if defined(__GNUC__) && __GNUC__ < 8
-    // Doubleword integer wide multiplication supported by GCC8+
-    npyv_s64 mul_even, mul_odd;
-    __asm__ ("vmulosw %0,%1,%2" : "=v" (mul_even) : "v" (a), "v" (divisor.val[0]));
-    __asm__ ("vmulesw %0,%1,%2" : "=v" (mul_odd)  : "v" (a), "v" (divisor.val[0]));
-#else
-    // Doubleword integer wide multiplication supported by GCC8+
-    npyv_s64 mul_even = vec_mule(a, divisor.val[0]);
-    npyv_s64 mul_odd  = vec_mulo(a, divisor.val[0]);
-#endif
+    #if defined(__GNUC__) && __GNUC__ < 8
+        // Doubleword integer wide multiplication supported by GCC8+
+        npyv_s64 mul_even, mul_odd;
+        __asm__ ("vmulosw %0,%1,%2" : "=v" (mul_even) : "v" (a), "v" (divisor.val[0]));
+        __asm__ ("vmulesw %0,%1,%2" : "=v" (mul_odd)  : "v" (a), "v" (divisor.val[0]));
+    #else
+        // Doubleword integer wide multiplication supported by GCC8+
+        npyv_s64 mul_even = vec_mule(a, divisor.val[0]);
+        npyv_s64 mul_odd  = vec_mulo(a, divisor.val[0]);
+    #endif
     // high part of signed multiplication
     npyv_s32 mulhi    = vec_mergeo((npyv_s32)mul_even, (npyv_s32)mul_odd);
 #endif
     // q              = ((a + mulhi) >> sh1) - XSIGN(a)
     // trunc(a/d)     = (q ^ dsign) - dsign
-    npyv_s32 q        = vec_sra(vec_add(a, mulhi), (npyv_u32)divisor.val[1]);
-             q        = vec_sub(q, vec_sra(a, npyv_setall_u32(31)));
+    npyv_s32 q        = vec_sra_s32(vec_add(a, mulhi), (npyv_u32)divisor.val[1]);
+             q        = vec_sub(q, vec_sra_s32(a, npyv_setall_u32(31)));
              q        = vec_sub(vec_xor(q, divisor.val[2]), divisor.val[2]);
     return   q;
 }
@@ -240,45 +293,67 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
 /***************************
  * Division
  ***************************/
-#define npyv_div_f32 vec_div
+#if NPY_SIMD_F32
+    #define npyv_div_f32 vec_div
+#endif
 #define npyv_div_f64 vec_div
 
 /***************************
  * FUSED
  ***************************/
 // multiply and add, a*b + c
-#define npyv_muladd_f32 vec_madd
 #define npyv_muladd_f64 vec_madd
 // multiply and subtract, a*b - c
-#define npyv_mulsub_f32 vec_msub
 #define npyv_mulsub_f64 vec_msub
-// negate multiply and add, -(a*b) + c
-#define npyv_nmuladd_f32 vec_nmsub // equivalent to -(a*b - c)
-#define npyv_nmuladd_f64 vec_nmsub
-// negate multiply and subtract, -(a*b) - c
-#define npyv_nmulsub_f32 vec_nmadd // equivalent to -(a*b + c)
-#define npyv_nmulsub_f64 vec_nmadd
-
+#if NPY_SIMD_F32
+    #define npyv_muladd_f32 vec_madd
+    #define npyv_mulsub_f32 vec_msub
+#endif
+#if defined(NPY_HAVE_VXE) || defined(NPY_HAVE_VSX)
+    // negate multiply and add, -(a*b) + c
+    #define npyv_nmuladd_f32 vec_nmsub // equivalent to -(a*b - c)
+    #define npyv_nmuladd_f64 vec_nmsub
+    // negate multiply and subtract, -(a*b) - c
+    #define npyv_nmulsub_f64 vec_nmadd
+    #define npyv_nmulsub_f32 vec_nmadd // equivalent to -(a*b + c)
+#else
+    NPY_FINLINE npyv_f64 npyv_nmuladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+    { return vec_neg(vec_msub(a, b, c)); }
+    NPY_FINLINE npyv_f64 npyv_nmulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+    { return vec_neg(vec_madd(a, b, c)); }
+#endif
 /***************************
  * Summation
  ***************************/
 // reduce sum across vector
 NPY_FINLINE npy_uint64 npyv_sum_u64(npyv_u64 a)
 {
+#ifdef NPY_HAVE_VX
+    const npyv_u64 zero = npyv_zero_u64();
+    return vec_extract((npyv_u64)vec_sum_u128(a, zero), 1);
+#else
     return vec_extract(vec_add(a, vec_mergel(a, a)), 0);
+#endif
 }
 
 NPY_FINLINE npy_uint32 npyv_sum_u32(npyv_u32 a)
 {
+#ifdef NPY_HAVE_VX
+    const npyv_u32 zero = npyv_zero_u32();
+    return vec_extract((npyv_u32)vec_sum_u128(a, zero), 3);
+#else
     const npyv_u32 rs = vec_add(a, vec_sld(a, a, 8));
     return vec_extract(vec_add(rs, vec_sld(rs, rs, 4)), 0);
+#endif
 }
 
+#if NPY_SIMD_F32
 NPY_FINLINE float npyv_sum_f32(npyv_f32 a)
 {
     npyv_f32 sum = vec_add(a, npyv_combineh_f32(a, a));
     return vec_extract(sum, 0) + vec_extract(sum, 1);
 }
+#endif
 
 NPY_FINLINE double npyv_sum_f64(npyv_f64 a)
 {
@@ -288,19 +363,30 @@ NPY_FINLINE double npyv_sum_f64(npyv_f64 a)
 // expand the source vector and performs sum reduce
 NPY_FINLINE npy_uint16 npyv_sumup_u8(npyv_u8 a)
 {
+#ifdef NPY_HAVE_VX
+    const npyv_u8 zero = npyv_zero_u8();
+    npyv_u32 sum4 = vec_sum4(a, zero);
+    return (npy_uint16)npyv_sum_u32(sum4);
+#else
     const npyv_u32 zero = npyv_zero_u32();
     npyv_u32 four = vec_sum4s(a, zero);
     npyv_s32 one  = vec_sums((npyv_s32)four, (npyv_s32)zero);
     return (npy_uint16)vec_extract(one, 3);
+#endif
 }
 
 NPY_FINLINE npy_uint32 npyv_sumup_u16(npyv_u16 a)
 {
+#ifdef NPY_HAVE_VX
+    npyv_u64 sum = vec_sum2(a, npyv_zero_u16());
+    return (npy_uint32)npyv_sum_u64(sum);
+#else // VSX
     const npyv_s32 zero = npyv_zero_s32();
     npyv_u32x2 eight = npyv_expand_u32_u16(a);
     npyv_u32   four  = vec_add(eight.val[0], eight.val[1]);
     npyv_s32   one   = vec_sums((npyv_s32)four, zero);
     return (npy_uint32)vec_extract(one, 3);
+#endif
 }
 
-#endif // _NPY_SIMD_VSX_ARITHMETIC_H
+#endif // _NPY_SIMD_VEC_ARITHMETIC_H
diff --git a/numpy/core/src/common/simd/vec/conversion.h b/numpy/core/src/common/simd/vec/conversion.h
new file mode 100644
index 000000000..f0d625c55
--- /dev/null
+++ b/numpy/core/src/common/simd/vec/conversion.h
@@ -0,0 +1,228 @@
+#ifndef NPY_SIMD
+    #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_VEC_CVT_H
+#define _NPY_SIMD_VEC_CVT_H
+
+// convert boolean vectors to integer vectors
+#define npyv_cvt_u8_b8(BL)   ((npyv_u8)  BL)
+#define npyv_cvt_s8_b8(BL)   ((npyv_s8)  BL)
+#define npyv_cvt_u16_b16(BL) ((npyv_u16) BL)
+#define npyv_cvt_s16_b16(BL) ((npyv_s16) BL)
+#define npyv_cvt_u32_b32(BL) ((npyv_u32) BL)
+#define npyv_cvt_s32_b32(BL) ((npyv_s32) BL)
+#define npyv_cvt_u64_b64(BL) ((npyv_u64) BL)
+#define npyv_cvt_s64_b64(BL) ((npyv_s64) BL)
+#if NPY_SIMD_F32
+    #define npyv_cvt_f32_b32(BL) ((npyv_f32) BL)
+#endif
+#define npyv_cvt_f64_b64(BL) ((npyv_f64) BL)
+
+// convert integer vectors to boolean vectors
+#define npyv_cvt_b8_u8(A)   ((npyv_b8)  A)
+#define npyv_cvt_b8_s8(A)   ((npyv_b8)  A)
+#define npyv_cvt_b16_u16(A) ((npyv_b16) A)
+#define npyv_cvt_b16_s16(A) ((npyv_b16) A)
+#define npyv_cvt_b32_u32(A) ((npyv_b32) A)
+#define npyv_cvt_b32_s32(A) ((npyv_b32) A)
+#define npyv_cvt_b64_u64(A) ((npyv_b64) A)
+#define npyv_cvt_b64_s64(A) ((npyv_b64) A)
+#if NPY_SIMD_F32
+    #define npyv_cvt_b32_f32(A) ((npyv_b32) A)
+#endif
+#define npyv_cvt_b64_f64(A) ((npyv_b64) A)
+
+//expand
+NPY_FINLINE npyv_u16x2 npyv_expand_u16_u8(npyv_u8 data)
+{
+    npyv_u16x2 r;
+#ifdef NPY_HAVE_VX
+    r.val[0] = vec_unpackh(data);
+    r.val[1] = vec_unpackl(data);
+#else
+    npyv_u8 zero = npyv_zero_u8();
+    r.val[0] = (npyv_u16)vec_mergeh(data, zero);
+    r.val[1] = (npyv_u16)vec_mergel(data, zero);
+#endif
+    return r;
+}
+
+NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data)
+{
+    npyv_u32x2 r;
+#ifdef NPY_HAVE_VX
+    r.val[0] = vec_unpackh(data);
+    r.val[1] = vec_unpackl(data);
+#else
+    npyv_u16 zero = npyv_zero_u16();
+    r.val[0] = (npyv_u32)vec_mergeh(data, zero);
+    r.val[1] = (npyv_u32)vec_mergel(data, zero);
+#endif
+    return r;
+}
+
+// pack two 16-bit boolean into one 8-bit boolean vector
+NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b) {
+    return vec_pack(a, b);
+}
+
+// pack four 32-bit boolean vectors into one 8-bit boolean vector
+NPY_FINLINE npyv_b8 npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d) {
+    npyv_b16 ab = vec_pack(a, b);
+    npyv_b16 cd = vec_pack(c, d);
+    return npyv_pack_b8_b16(ab, cd);
+}
+
+// pack eight 64-bit boolean vectors into one 8-bit boolean vector
+NPY_FINLINE npyv_b8
+npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
+                 npyv_b64 e, npyv_b64 f, npyv_b64 g, npyv_b64 h) {
+    npyv_b32 ab = vec_pack(a, b);
+    npyv_b32 cd = vec_pack(c, d);
+    npyv_b32 ef = vec_pack(e, f);
+    npyv_b32 gh = vec_pack(g, h);
+    return npyv_pack_b8_b32(ab, cd, ef, gh);
+}
+
+// convert boolean vector to integer bitfield
+#if defined(NPY_HAVE_VXE) || defined(NPY_HAVE_VSX2)
+    NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
+    {
+        const npyv_u8 qperm = npyv_set_u8(120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0);
+        npyv_u16 r = (npyv_u16)vec_vbpermq((npyv_u8)a, qperm);
+    #ifdef NPY_HAVE_VXE
+        return vec_extract(r, 3);
+    #else
+        return vec_extract(r, 4);
+    #endif
+    }
+    NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a)
+    {
+        const npyv_u8 qperm = npyv_setf_u8(128, 112, 96, 80, 64, 48, 32, 16, 0);
+        npyv_u8 r = (npyv_u8)vec_vbpermq((npyv_u8)a, qperm);
+    #ifdef NPY_HAVE_VXE
+        return vec_extract(r, 6);
+    #else
+        return vec_extract(r, 8);
+    #endif
+    }
+    NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
+    {
+    #ifdef NPY_HAVE_VXE
+        const npyv_u8 qperm = npyv_setf_u8(128, 128, 128, 128, 128, 96, 64, 32, 0);
+    #else
+        const npyv_u8 qperm = npyv_setf_u8(128, 96, 64, 32, 0);
+    #endif
+        npyv_u8 r = (npyv_u8)vec_vbpermq((npyv_u8)a, qperm);
+    #ifdef NPY_HAVE_VXE
+        return vec_extract(r, 6);
+    #else
+        return vec_extract(r, 8);
+    #endif
+    }
+    NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
+    {
+    #ifdef NPY_HAVE_VXE
+        const npyv_u8 qperm = npyv_setf_u8(128, 128, 128, 128, 128, 128, 128, 64, 0);
+    #else
+        const npyv_u8 qperm = npyv_setf_u8(128, 64, 0);
+    #endif
+        npyv_u8 r = (npyv_u8)vec_vbpermq((npyv_u8)a, qperm);
+    #ifdef NPY_HAVE_VXE
+        return vec_extract(r, 6);
+    #else
+        return vec_extract(r, 8);
+    #endif
+    }
+#else
+    NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
+    {
+        const npyv_u8 scale = npyv_set_u8(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
+        npyv_u8 seq_scale = vec_and((npyv_u8)a, scale);
+        npyv_u64 sum = vec_sum2(vec_sum4(seq_scale, npyv_zero_u8()), npyv_zero_u32());
+        return vec_extract(sum, 0) + ((int)vec_extract(sum, 1) << 8);
+    }
+    NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a)
+    {
+        const npyv_u16 scale = npyv_set_u16(1, 2, 4, 8, 16, 32, 64, 128);
+        npyv_u16 seq_scale = vec_and((npyv_u16)a, scale);
+        npyv_u64 sum = vec_sum2(seq_scale, npyv_zero_u16());
+        return vec_extract(vec_sum_u128(sum, npyv_zero_u64()), 15);
+    }
+    NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
+    {
+        const npyv_u32 scale = npyv_set_u32(1, 2, 4, 8);
+        npyv_u32 seq_scale = vec_and((npyv_u32)a, scale);
+        return vec_extract(vec_sum_u128(seq_scale, npyv_zero_u32()), 15);
+    }
+    NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
+    {
+        const npyv_u64 scale = npyv_set_u64(1, 2);
+        npyv_u64 seq_scale = vec_and((npyv_u64)a, scale);
+        return vec_extract(vec_sum_u128(seq_scale, npyv_zero_u64()), 15);
+    }
+#endif
+// truncate compatible with all compilers(internal use for now)
+#if NPY_SIMD_F32
+    NPY_FINLINE npyv_s32 npyv__trunc_s32_f32(npyv_f32 a)
+    {
+    #ifdef NPY_HAVE_VXE2
+        return vec_signed(a);
+    #elif defined(NPY_HAVE_VXE)
+        return vec_packs(vec_signed(npyv_doublee(a)), vec_signed(npyv_doublee(vec_mergel(a, a))));
+    // VSX
+    #elif defined(__IBMC__)
+        return vec_cts(a, 0);
+    #elif defined(__clang__)
+        /**
+         * old versions of CLANG don't support %x<n> in the inline asm template
+         * which fixes register number when using any of the register constraints wa, wd, wf.
+         * therefore, we count on built-in functions.
+         */
+        return __builtin_convertvector(a, npyv_s32);
+    #else // gcc
+        npyv_s32 ret;
+        __asm__ ("xvcvspsxws %x0,%x1" : "=wa" (ret) : "wa" (a));
+        return ret;
+    #endif
+    }
+#endif
+
+NPY_FINLINE npyv_s32 npyv__trunc_s32_f64(npyv_f64 a, npyv_f64 b)
+{
+#ifdef NPY_HAVE_VX
+    return vec_packs(vec_signed(a), vec_signed(b));
+// VSX
+#elif defined(__IBMC__)
+    const npyv_u8 seq_even = npyv_set_u8(0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27);
+    // unfortunately, XLC is missing the asm register vsx fixer
+    // hopefully, xlc can optimize around big-endian compatibility
+    npyv_s32 lo_even = vec_cts(a, 0);
+    npyv_s32 hi_even = vec_cts(b, 0);
+    return vec_perm(lo_even, hi_even, seq_even);
+#else
+    const npyv_u8 seq_odd = npyv_set_u8(4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31);
+    #ifdef __clang__
+        // __builtin_convertvector doesn't support this conversion on a wide range of versions
+        // fortunately, almost all versions have direct builtin of 'xvcvdpsxws'
+        npyv_s32 lo_odd = __builtin_vsx_xvcvdpsxws(a);
+        npyv_s32 hi_odd = __builtin_vsx_xvcvdpsxws(b);
+    #else // gcc
+        npyv_s32 lo_odd, hi_odd;
+        __asm__ ("xvcvdpsxws %x0,%x1" : "=wa" (lo_odd) : "wa" (a));
+        __asm__ ("xvcvdpsxws %x0,%x1" : "=wa" (hi_odd) : "wa" (b));
+    #endif
+    return vec_perm(lo_odd, hi_odd, seq_odd);
+#endif
+}
+
+// round to nearest integer (assuming even)
+#if NPY_SIMD_F32
+    NPY_FINLINE npyv_s32 npyv_round_s32_f32(npyv_f32 a)
+    { return npyv__trunc_s32_f32(vec_rint(a)); }
+#endif
+NPY_FINLINE npyv_s32 npyv_round_s32_f64(npyv_f64 a, npyv_f64 b)
+{ return npyv__trunc_s32_f64(vec_rint(a), vec_rint(b)); }
+
+#endif // _NPY_SIMD_VEC_CVT_H
diff --git a/numpy/core/src/common/simd/vsx/math.h b/numpy/core/src/common/simd/vec/math.h
index 444bc9e54..7714a612d 100644
--- a/numpy/core/src/common/simd/vsx/math.h
+++ b/numpy/core/src/common/simd/vec/math.h
@@ -2,21 +2,25 @@
     #error "Not a standalone header"
 #endif
 
-#ifndef _NPY_SIMD_VSX_MATH_H
-#define _NPY_SIMD_VSX_MATH_H
+#ifndef _NPY_SIMD_VEC_MATH_H
+#define _NPY_SIMD_VEC_MATH_H
 /***************************
  * Elementary
  ***************************/
 // Square root
-#define npyv_sqrt_f32 vec_sqrt
+#if NPY_SIMD_F32
+    #define npyv_sqrt_f32 vec_sqrt
+#endif
 #define npyv_sqrt_f64 vec_sqrt
 
 // Reciprocal
-NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a)
-{
-    const npyv_f32 one = npyv_setall_f32(1.0f);
-    return vec_div(one, a);
-}
+#if NPY_SIMD_F32
+    NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a)
+    {
+        const npyv_f32 one = npyv_setall_f32(1.0f);
+        return vec_div(one, a);
+    }
+#endif
 NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a)
 {
     const npyv_f64 one = npyv_setall_f64(1.0);
@@ -24,23 +28,41 @@ NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a)
 }
 
 // Absolute
-#define npyv_abs_f32 vec_abs
+#if NPY_SIMD_F32
+    #define npyv_abs_f32 vec_abs
+#endif
 #define npyv_abs_f64 vec_abs
 
 // Square
-NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a)
-{ return vec_mul(a, a); }
+#if NPY_SIMD_F32
+    NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a)
+    { return vec_mul(a, a); }
+#endif
 NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
 { return vec_mul(a, a); }
 
 // Maximum, natively mapping with no guarantees to handle NaN.
-#define npyv_max_f32 vec_max
+#if NPY_SIMD_F32
+    #define npyv_max_f32 vec_max
+#endif
 #define npyv_max_f64 vec_max
 // Maximum, supports IEEE floating-point arithmetic (IEC 60559),
 // - If one of the two vectors contains NaN, the equivalent element of the other vector is set
 // - Only if both corresponded elements are NaN, NaN is set.
-#define npyv_maxp_f32 vec_max
-#define npyv_maxp_f64 vec_max
+#if NPY_SIMD_F32
+    #define npyv_maxp_f32 vec_max
+#endif
+#if defined(NPY_HAVE_VXE) || defined(NPY_HAVE_VSX)
+    #define npyv_maxp_f64 vec_max
+#else
+    // vfmindb & vfmaxdb appear in zarch12
+    NPY_FINLINE npyv_f64 npyv_maxp_f64(npyv_f64 a, npyv_f64 b)
+    {
+        npyv_b64 nn_a = npyv_notnan_f64(a);
+        npyv_b64 nn_b = npyv_notnan_f64(b);
+        return vec_max(vec_sel(b, a, nn_a), vec_sel(a, b, nn_b));
+    }
+#endif
 // Maximum, integer operations
 #define npyv_max_u8 vec_max
 #define npyv_max_s8 vec_max
@@ -52,13 +74,27 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
 #define npyv_max_s64 vec_max
 
 // Minimum, natively mapping with no guarantees to handle NaN.
-#define npyv_min_f32 vec_min
+#if NPY_SIMD_F32
+    #define npyv_min_f32 vec_min
+#endif
 #define npyv_min_f64 vec_min
 // Minimum, supports IEEE floating-point arithmetic (IEC 60559),
 // - If one of the two vectors contains NaN, the equivalent element of the other vector is set
 // - Only if both corresponded elements are NaN, NaN is set.
-#define npyv_minp_f32 vec_min
-#define npyv_minp_f64 vec_min
+#if NPY_SIMD_F32
+    #define npyv_minp_f32 vec_min
+#endif
+#if defined(NPY_HAVE_VXE) || defined(NPY_HAVE_VSX)
+    #define npyv_minp_f64 vec_min
+#else
+    // vfmindb & vfmaxdb appear in zarch12
+    NPY_FINLINE npyv_f64 npyv_minp_f64(npyv_f64 a, npyv_f64 b)
+    {
+        npyv_b64 nn_a = npyv_notnan_f64(a);
+        npyv_b64 nn_b = npyv_notnan_f64(b);
+        return vec_min(vec_sel(b, a, nn_a), vec_sel(a, b, nn_b));
+    }
+#endif
 // Minimum, integer operations
 #define npyv_min_u8 vec_min
 #define npyv_min_s8 vec_min
@@ -70,19 +106,18 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
 #define npyv_min_s64 vec_min
 
 // round to nearest int even
-#define npyv_rint_f32 vec_rint
 #define npyv_rint_f64 vec_rint
-
 // ceil
-#define npyv_ceil_f32 vec_ceil
 #define npyv_ceil_f64 vec_ceil
-
 // trunc
-#define npyv_trunc_f32 vec_trunc
 #define npyv_trunc_f64 vec_trunc
-
 // floor
-#define npyv_floor_f32 vec_floor
 #define npyv_floor_f64 vec_floor
+#if NPY_SIMD_F32
+    #define npyv_rint_f32 vec_rint
+    #define npyv_ceil_f32 vec_ceil
+    #define npyv_trunc_f32 vec_trunc
+    #define npyv_floor_f32 vec_floor
+#endif
 
-#endif // _NPY_SIMD_VSX_MATH_H
+#endif // _NPY_SIMD_VEC_MATH_H
diff --git a/numpy/core/src/common/simd/vsx/memory.h b/numpy/core/src/common/simd/vec/memory.h
index 3007584ef..e8f588ef2 100644
--- a/numpy/core/src/common/simd/vsx/memory.h
+++ b/numpy/core/src/common/simd/vec/memory.h
@@ -2,8 +2,8 @@
     #error "Not a standalone header"
 #endif
 
-#ifndef _NPY_SIMD_VSX_MEMORY_H
-#define _NPY_SIMD_VSX_MEMORY_H
+#ifndef _NPY_SIMD_VEC_MEMORY_H
+#define _NPY_SIMD_VEC_MEMORY_H
 
 #include "misc.h"
 
@@ -19,19 +19,32 @@
      * CLANG fails to load unaligned addresses via vec_xl, vec_xst
      * so we failback to vec_vsx_ld, vec_vsx_st
      */
-    #if (defined(__GNUC__) && !defined(vec_xl)) || (defined(__clang__) && !defined(__IBMC__))
+    #if defined (NPY_HAVE_VSX2) && ( \
+        (defined(__GNUC__) && !defined(vec_xl)) || (defined(__clang__) && !defined(__IBMC__)) \
+    )
         #define npyv__load(T_VEC, PTR) vec_vsx_ld(0, PTR)
-    #else
+    #else // VX
         #define npyv__load(T_VEC, PTR) vec_xl(0, PTR)
     #endif
 #endif
 // unaligned store
-#if (defined(__GNUC__) && !defined(vec_xl)) || (defined(__clang__) && !defined(__IBMC__))
+#if defined (NPY_HAVE_VSX2) && ( \
+    (defined(__GNUC__) && !defined(vec_xl)) || (defined(__clang__) && !defined(__IBMC__)) \
+)
     #define npyv__store(PTR, VEC) vec_vsx_st(VEC, 0, PTR)
-#else
+#else // VX
     #define npyv__store(PTR, VEC) vec_xst(VEC, 0, PTR)
 #endif
 
+// aligned load/store
+#if defined (NPY_HAVE_VSX)
+    #define npyv__loada(PTR) vec_ld(0, PTR)
+    #define npyv__storea(PTR, VEC) vec_st(VEC, 0, PTR)
+#else // VX
+    #define npyv__loada(PTR) vec_xl(0, PTR)
+    #define npyv__storea(PTR, VEC) vec_xst(VEC, 0, PTR)
+#endif
+
 // avoid aliasing rules
 #ifdef __cplusplus
     template<typename T_PTR>
@@ -45,12 +58,16 @@
 // load lower part
 NPY_FINLINE npyv_u64 npyv__loadl(const void *ptr)
 {
+#ifdef NPY_HAVE_VSX
     #if defined(__clang__) && !defined(__IBMC__)
         // vec_promote doesn't support doubleword on clang
         return npyv_setall_u64(*npyv__ptr2u64(ptr));
     #else
         return vec_promote(*npyv__ptr2u64(ptr), 0);
     #endif
+#else // VX
+    return vec_load_len((const unsigned long long*)ptr, 7); // 7: last byte index, i.e. 8 bytes
+#endif
 }
 // store lower part
 #define npyv__storel(PTR, VEC) \
@@ -62,11 +79,11 @@ NPY_FINLINE npyv_u64 npyv__loadl(const void *ptr)
 /****************************
  * load/store
  ****************************/
-#define NPYV_IMPL_VSX_MEM(SFX, DW_CAST)                                                 \
+#define NPYV_IMPL_VEC_MEM(SFX, DW_CAST)                                                 \
     NPY_FINLINE npyv_##SFX npyv_load_##SFX(const npyv_lanetype_##SFX *ptr)              \
     { return (npyv_##SFX)npyv__load(npyv_##SFX, (const npyv_lanetype_##DW_CAST*)ptr); } \
     NPY_FINLINE npyv_##SFX npyv_loada_##SFX(const npyv_lanetype_##SFX *ptr)             \
-    { return (npyv_##SFX)vec_ld(0, (const npyv_lanetype_u32*)ptr); }                    \
+    { return (npyv_##SFX)npyv__loada((const npyv_lanetype_u32*)ptr); }                  \
     NPY_FINLINE npyv_##SFX npyv_loads_##SFX(const npyv_lanetype_##SFX *ptr)             \
     { return npyv_loada_##SFX(ptr); }                                                   \
     NPY_FINLINE npyv_##SFX npyv_loadl_##SFX(const npyv_lanetype_##SFX *ptr)             \
@@ -74,7 +91,7 @@ NPY_FINLINE npyv_u64 npyv__loadl(const void *ptr)
     NPY_FINLINE void npyv_store_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec)         \
     { npyv__store((npyv_lanetype_##DW_CAST*)ptr, (npyv_##DW_CAST)vec); }                \
     NPY_FINLINE void npyv_storea_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec)        \
-    { vec_st((npyv_u32)vec, 0, (npyv_lanetype_u32*)ptr); }                              \
+    { npyv__storea((npyv_lanetype_##DW_CAST*)ptr, (npyv_##DW_CAST)vec); }               \
     NPY_FINLINE void npyv_stores_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec)        \
     { npyv_storea_##SFX(ptr, vec); }                                                    \
     NPY_FINLINE void npyv_storel_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec)        \
@@ -82,16 +99,18 @@ NPY_FINLINE npyv_u64 npyv__loadl(const void *ptr)
     NPY_FINLINE void npyv_storeh_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec)        \
     { npyv__storeh(ptr, vec); }
 
-NPYV_IMPL_VSX_MEM(u8,  u8)
-NPYV_IMPL_VSX_MEM(s8,  s8)
-NPYV_IMPL_VSX_MEM(u16, u16)
-NPYV_IMPL_VSX_MEM(s16, s16)
-NPYV_IMPL_VSX_MEM(u32, u32)
-NPYV_IMPL_VSX_MEM(s32, s32)
-NPYV_IMPL_VSX_MEM(u64, f64)
-NPYV_IMPL_VSX_MEM(s64, f64)
-NPYV_IMPL_VSX_MEM(f32, f32)
-NPYV_IMPL_VSX_MEM(f64, f64)
+NPYV_IMPL_VEC_MEM(u8,  u8)
+NPYV_IMPL_VEC_MEM(s8,  s8)
+NPYV_IMPL_VEC_MEM(u16, u16)
+NPYV_IMPL_VEC_MEM(s16, s16)
+NPYV_IMPL_VEC_MEM(u32, u32)
+NPYV_IMPL_VEC_MEM(s32, s32)
+NPYV_IMPL_VEC_MEM(u64, f64)
+NPYV_IMPL_VEC_MEM(s64, f64)
+#if NPY_SIMD_F32
+NPYV_IMPL_VEC_MEM(f32, f32)
+#endif
+NPYV_IMPL_VEC_MEM(f64, f64)
 
 /***************************
  * Non-contiguous Load
@@ -106,8 +125,10 @@ NPY_FINLINE npyv_u32 npyv_loadn_u32(const npy_uint32 *ptr, npy_intp stride)
 }
 NPY_FINLINE npyv_s32 npyv_loadn_s32(const npy_int32 *ptr, npy_intp stride)
 { return (npyv_s32)npyv_loadn_u32((const npy_uint32*)ptr, stride); }
+#if NPY_SIMD_F32
 NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride)
 { return (npyv_f32)npyv_loadn_u32((const npy_uint32*)ptr, stride); }
+#endif
 //// 64
 NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
 { return npyv_set_u64(ptr[0], ptr[stride]); }
@@ -128,8 +149,10 @@ NPY_FINLINE void npyv_storen_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a)
 }
 NPY_FINLINE void npyv_storen_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a)
 { npyv_storen_u32((npy_uint32*)ptr, stride, (npyv_u32)a); }
+#if NPY_SIMD_F32
 NPY_FINLINE void npyv_storen_f32(float *ptr, npy_intp stride, npyv_f32 a)
 { npyv_storen_u32((npy_uint32*)ptr, stride, (npyv_u32)a); }
+#endif
 //// 64
 NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
 {
@@ -149,6 +172,14 @@ NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, n
 {
     assert(nlane > 0);
     npyv_s32 vfill = npyv_setall_s32(fill);
+#ifdef NPY_HAVE_VX
+    const unsigned blane = (nlane > 4) ? 4 : (unsigned)nlane; // clamp, never truncate nlane
+    const npyv_u32 steps = npyv_set_u32(0, 1, 2, 3);
+    const npyv_u32 vlane = npyv_setall_u32(blane);
+    const npyv_b32 mask  = vec_cmpgt(vlane, steps); // lane i is taken from memory iff i < blane
+    npyv_s32 a = vec_load_len(ptr, blane*4-1); // 2nd arg: index of the last byte to load
+    return vec_sel(vfill, a, mask); // remaining lanes keep the fill value
+#else
     switch(nlane) {
     case 1:
         return vec_insert(ptr[0], vfill, 0);
@@ -164,10 +195,18 @@ NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, n
     default:
         return npyv_load_s32(ptr);
     }
+#endif
 }
 // fill zero to rest lanes
 NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
-{ return npyv_load_till_s32(ptr, nlane, 0); }
+{
+#ifdef NPY_HAVE_VX
+    const unsigned blane = (nlane > 4) ? 4 : (unsigned)nlane; // clamp, never truncate nlane
+    return vec_load_len(ptr, blane*4-1); // 2nd arg: index of the last byte to load
+#else
+    return npyv_load_till_s32(ptr, nlane, 0);
+#endif
+}
 //// 64
 NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill)
 {
@@ -179,7 +218,14 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n
 }
 // fill zero to rest lanes
 NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
-{  return npyv_load_till_s64(ptr, nlane, 0); }
+{
+#ifdef NPY_HAVE_VX
+    const unsigned blane = (nlane > 2) ? 2 : (unsigned)nlane; // clamp, never truncate nlane
+    return vec_load_len((const signed long long*)ptr, blane*8-1); // last byte index to load
+#else
+    return npyv_load_till_s64(ptr, nlane, 0);
+#endif
+}
 /*********************************
  * Non-contiguous partial load
  *********************************/
@@ -226,6 +272,10 @@ NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride,
 NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a)
 {
     assert(nlane > 0);
+#ifdef NPY_HAVE_VX
+    const unsigned blane = (nlane > 4) ? 4 : (unsigned)nlane; // clamp, never truncate nlane
+    vec_store_len(a, ptr, blane*4-1); // 3rd arg: index of the last byte to store
+#else
     switch(nlane) {
     case 1:
         *ptr = vec_extract(a, 0);
@@ -240,16 +290,22 @@ NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a
     default:
         npyv_store_s32(ptr, a);
     }
+#endif
 }
 //// 64
 NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
 {
     assert(nlane > 0);
+#ifdef NPY_HAVE_VX
+    const unsigned blane = (nlane > 2) ? 2 : (unsigned)nlane; // clamp, never truncate nlane
+    vec_store_len(a, (signed long long*)ptr, blane*8-1); // 3rd arg: last byte index to store
+#else
     if (nlane == 1) {
         npyv_storel_s64(ptr, a);
         return;
     }
     npyv_store_s64(ptr, a);
+#endif
 }
 /*********************************
  * Non-contiguous partial store
@@ -283,7 +339,7 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
 /*****************************************************************
  * Implement partial load/store for u32/f32/u64/f64... via casting
  *****************************************************************/
-#define NPYV_IMPL_VSX_REST_PARTIAL_TYPES(F_SFX, T_SFX)                                      \
+#define NPYV_IMPL_VEC_REST_PARTIAL_TYPES(F_SFX, T_SFX)                                      \
     NPY_FINLINE npyv_##F_SFX npyv_load_till_##F_SFX                                         \
     (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_lanetype_##F_SFX fill)         \
     {                                                                                       \
@@ -338,39 +394,47 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
         );                                                                                  \
     }
 
-NPYV_IMPL_VSX_REST_PARTIAL_TYPES(u32, s32)
-NPYV_IMPL_VSX_REST_PARTIAL_TYPES(f32, s32)
-NPYV_IMPL_VSX_REST_PARTIAL_TYPES(u64, s64)
-NPYV_IMPL_VSX_REST_PARTIAL_TYPES(f64, s64)
+NPYV_IMPL_VEC_REST_PARTIAL_TYPES(u32, s32)
+#if NPY_SIMD_F32
+NPYV_IMPL_VEC_REST_PARTIAL_TYPES(f32, s32)
+#endif
+NPYV_IMPL_VEC_REST_PARTIAL_TYPES(u64, s64)
+NPYV_IMPL_VEC_REST_PARTIAL_TYPES(f64, s64)
 
 /*********************************
  * Lookup table
  *********************************/
 // uses vector as indexes into a table
 // that contains 32 elements of float32.
-NPY_FINLINE npyv_f32 npyv_lut32_f32(const float *table, npyv_u32 idx)
+NPY_FINLINE npyv_u32 npyv_lut32_u32(const npy_uint32 *table, npyv_u32 idx)
 {
     const unsigned i0 = vec_extract(idx, 0);
     const unsigned i1 = vec_extract(idx, 1);
     const unsigned i2 = vec_extract(idx, 2);
     const unsigned i3 = vec_extract(idx, 3);
-    npyv_f32 r = vec_promote(table[i0], 0);
+    npyv_u32 r = vec_promote(table[i0], 0);
              r = vec_insert(table[i1], r, 1);
              r = vec_insert(table[i2], r, 2);
              r = vec_insert(table[i3], r, 3);
     return r;
 }
-NPY_FINLINE npyv_u32 npyv_lut32_u32(const npy_uint32 *table, npyv_u32 idx)
-{ return npyv_reinterpret_u32_f32(npyv_lut32_f32((const float*)table, idx)); }
 NPY_FINLINE npyv_s32 npyv_lut32_s32(const npy_int32 *table, npyv_u32 idx)
-{ return npyv_reinterpret_s32_f32(npyv_lut32_f32((const float*)table, idx)); }
-
+{ return (npyv_s32)npyv_lut32_u32((const npy_uint32*)table, idx); }
+#if NPY_SIMD_F32
+    NPY_FINLINE npyv_f32 npyv_lut32_f32(const float *table, npyv_u32 idx)
+    { return (npyv_f32)npyv_lut32_u32((const npy_uint32*)table, idx); }
+#endif
 // uses vector as indexes into a table
 // that contains 16 elements of float64.
 NPY_FINLINE npyv_f64 npyv_lut16_f64(const double *table, npyv_u64 idx)
 {
+#ifdef NPY_HAVE_VX
+    const unsigned i0 = vec_extract((npyv_u32)idx, 1);
+    const unsigned i1 = vec_extract((npyv_u32)idx, 3);
+#else
     const unsigned i0 = vec_extract((npyv_u32)idx, 0);
     const unsigned i1 = vec_extract((npyv_u32)idx, 2);
+#endif
     npyv_f64 r = vec_promote(table[i0], 0);
              r = vec_insert(table[i1], r, 1);
     return r;
@@ -380,4 +444,4 @@ NPY_FINLINE npyv_u64 npyv_lut16_u64(const npy_uint64 *table, npyv_u64 idx)
 NPY_FINLINE npyv_s64 npyv_lut16_s64(const npy_int64 *table, npyv_u64 idx)
 { return npyv_reinterpret_s64_f64(npyv_lut16_f64((const double*)table, idx)); }
 
-#endif // _NPY_SIMD_VSX_MEMORY_H
+#endif // _NPY_SIMD_VEC_MEMORY_H
diff --git a/numpy/core/src/common/simd/vsx/misc.h b/numpy/core/src/common/simd/vec/misc.h
index f7a0cdd5c..c4f35cfc0 100644
--- a/numpy/core/src/common/simd/vsx/misc.h
+++ b/numpy/core/src/common/simd/vec/misc.h
@@ -2,8 +2,8 @@
     #error "Not a standalone header"
 #endif
 
-#ifndef _NPY_SIMD_VSX_MISC_H
-#define _NPY_SIMD_VSX_MISC_H
+#ifndef _NPY_SIMD_VEC_MISC_H
+#define _NPY_SIMD_VEC_MISC_H
 
 // vector with zero lanes
 #define npyv_zero_u8()  ((npyv_u8)   npyv_setall_s32(0))
@@ -14,26 +14,30 @@
 #define npyv_zero_s32() npyv_setall_s32(0)
 #define npyv_zero_u64() ((npyv_u64) npyv_setall_s32(0))
 #define npyv_zero_s64() ((npyv_s64) npyv_setall_s32(0))
-#define npyv_zero_f32() npyv_setall_f32(0.0f)
+#if NPY_SIMD_F32
+    #define npyv_zero_f32() npyv_setall_f32(0.0f)
+#endif
 #define npyv_zero_f64() npyv_setall_f64(0.0)
 
 // vector with a specific value set to all lanes
 // the safest way to generate vsplti* and vsplt* instructions
-#define NPYV_IMPL_VSX_SPLTB(T_VEC, V) ((T_VEC){V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V})
-#define NPYV_IMPL_VSX_SPLTH(T_VEC, V) ((T_VEC){V, V, V, V, V, V, V, V})
-#define NPYV_IMPL_VSX_SPLTW(T_VEC, V) ((T_VEC){V, V, V, V})
-#define NPYV_IMPL_VSX_SPLTD(T_VEC, V) ((T_VEC){V, V})
-
-#define npyv_setall_u8(VAL)  NPYV_IMPL_VSX_SPLTB(npyv_u8,  (unsigned char)VAL)
-#define npyv_setall_s8(VAL)  NPYV_IMPL_VSX_SPLTB(npyv_s8,  (signed char)VAL)
-#define npyv_setall_u16(VAL) NPYV_IMPL_VSX_SPLTH(npyv_u16, (unsigned short)VAL)
-#define npyv_setall_s16(VAL) NPYV_IMPL_VSX_SPLTH(npyv_s16, (short)VAL)
-#define npyv_setall_u32(VAL) NPYV_IMPL_VSX_SPLTW(npyv_u32, (unsigned int)VAL)
-#define npyv_setall_s32(VAL) NPYV_IMPL_VSX_SPLTW(npyv_s32, (int)VAL)
-#define npyv_setall_f32(VAL) NPYV_IMPL_VSX_SPLTW(npyv_f32, VAL)
-#define npyv_setall_u64(VAL) NPYV_IMPL_VSX_SPLTD(npyv_u64, (npy_uint64)VAL)
-#define npyv_setall_s64(VAL) NPYV_IMPL_VSX_SPLTD(npyv_s64, (npy_int64)VAL)
-#define npyv_setall_f64(VAL) NPYV_IMPL_VSX_SPLTD(npyv_f64, VAL)
+#define NPYV_IMPL_VEC_SPLTB(T_VEC, V) ((T_VEC){V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V})
+#define NPYV_IMPL_VEC_SPLTH(T_VEC, V) ((T_VEC){V, V, V, V, V, V, V, V})
+#define NPYV_IMPL_VEC_SPLTW(T_VEC, V) ((T_VEC){V, V, V, V})
+#define NPYV_IMPL_VEC_SPLTD(T_VEC, V) ((T_VEC){V, V})
+
+#define npyv_setall_u8(VAL)  NPYV_IMPL_VEC_SPLTB(npyv_u8,  (unsigned char)VAL)
+#define npyv_setall_s8(VAL)  NPYV_IMPL_VEC_SPLTB(npyv_s8,  (signed char)VAL)
+#define npyv_setall_u16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_u16, (unsigned short)VAL)
+#define npyv_setall_s16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_s16, (short)VAL)
+#define npyv_setall_u32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_u32, (unsigned int)VAL)
+#define npyv_setall_s32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_s32, (int)VAL)
+#if NPY_SIMD_F32
+    #define npyv_setall_f32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_f32, VAL)
+#endif
+#define npyv_setall_u64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_u64, (npy_uint64)VAL)
+#define npyv_setall_s64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_s64, (npy_int64)VAL)
+#define npyv_setall_f64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_f64, VAL)
 
 // vector with specific values set to each lane and
 // set a specific value to all remained lanes
@@ -45,7 +49,9 @@
 #define npyv_setf_s32(FILL, ...) ((npyv_s32){NPYV__SET_FILL_4(int, FILL, __VA_ARGS__)})
 #define npyv_setf_u64(FILL, ...) ((npyv_u64){NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__)})
 #define npyv_setf_s64(FILL, ...) ((npyv_s64){NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__)})
-#define npyv_setf_f32(FILL, ...) ((npyv_f32){NPYV__SET_FILL_4(float, FILL, __VA_ARGS__)})
+#if NPY_SIMD_F32
+    #define npyv_setf_f32(FILL, ...) ((npyv_f32){NPYV__SET_FILL_4(float, FILL, __VA_ARGS__)})
+#endif
 #define npyv_setf_f64(FILL, ...) ((npyv_f64){NPYV__SET_FILL_2(double, FILL, __VA_ARGS__)})
 
 // vector with specific values set to each lane and
@@ -58,7 +64,9 @@
 #define npyv_set_s32(...) npyv_setf_s32(0, __VA_ARGS__)
 #define npyv_set_u64(...) npyv_setf_u64(0, __VA_ARGS__)
 #define npyv_set_s64(...) npyv_setf_s64(0, __VA_ARGS__)
-#define npyv_set_f32(...) npyv_setf_f32(0, __VA_ARGS__)
+#if NPY_SIMD_F32
+    #define npyv_set_f32(...) npyv_setf_f32(0, __VA_ARGS__)
+#endif
 #define npyv_set_f64(...) npyv_setf_f64(0, __VA_ARGS__)
 
 // Per lane select
@@ -70,7 +78,9 @@
 #define npyv_select_s32 npyv_select_u8
 #define npyv_select_u64 npyv_select_u8
 #define npyv_select_s64 npyv_select_u8
-#define npyv_select_f32 npyv_select_u8
+#if NPY_SIMD_F32
+    #define npyv_select_f32 npyv_select_u8
+#endif
 #define npyv_select_f64 npyv_select_u8
 
 // Reinterpret
@@ -82,7 +92,9 @@
 #define npyv_reinterpret_u8_s32 npyv_reinterpret_u8_s8
 #define npyv_reinterpret_u8_u64 npyv_reinterpret_u8_s8
 #define npyv_reinterpret_u8_s64 npyv_reinterpret_u8_s8
-#define npyv_reinterpret_u8_f32 npyv_reinterpret_u8_s8
+#if NPY_SIMD_F32
+    #define npyv_reinterpret_u8_f32 npyv_reinterpret_u8_s8
+#endif
 #define npyv_reinterpret_u8_f64 npyv_reinterpret_u8_s8
 
 #define npyv_reinterpret_s8_s8(X) X
@@ -93,7 +105,9 @@
 #define npyv_reinterpret_s8_s32 npyv_reinterpret_s8_u8
 #define npyv_reinterpret_s8_u64 npyv_reinterpret_s8_u8
 #define npyv_reinterpret_s8_s64 npyv_reinterpret_s8_u8
-#define npyv_reinterpret_s8_f32 npyv_reinterpret_s8_u8
+#if NPY_SIMD_F32
+    #define npyv_reinterpret_s8_f32 npyv_reinterpret_s8_u8
+#endif
 #define npyv_reinterpret_s8_f64 npyv_reinterpret_s8_u8
 
 #define npyv_reinterpret_u16_u16(X) X
@@ -104,7 +118,9 @@
 #define npyv_reinterpret_u16_s32 npyv_reinterpret_u16_u8
 #define npyv_reinterpret_u16_u64 npyv_reinterpret_u16_u8
 #define npyv_reinterpret_u16_s64 npyv_reinterpret_u16_u8
-#define npyv_reinterpret_u16_f32 npyv_reinterpret_u16_u8
+#if NPY_SIMD_F32
+    #define npyv_reinterpret_u16_f32 npyv_reinterpret_u16_u8
+#endif
 #define npyv_reinterpret_u16_f64 npyv_reinterpret_u16_u8
 
 #define npyv_reinterpret_s16_s16(X) X
@@ -115,7 +131,9 @@
 #define npyv_reinterpret_s16_s32 npyv_reinterpret_s16_u8
 #define npyv_reinterpret_s16_u64 npyv_reinterpret_s16_u8
 #define npyv_reinterpret_s16_s64 npyv_reinterpret_s16_u8
-#define npyv_reinterpret_s16_f32 npyv_reinterpret_s16_u8
+#if NPY_SIMD_F32
+    #define npyv_reinterpret_s16_f32 npyv_reinterpret_s16_u8
+#endif
 #define npyv_reinterpret_s16_f64 npyv_reinterpret_s16_u8
 
 #define npyv_reinterpret_u32_u32(X) X
@@ -126,7 +144,9 @@
 #define npyv_reinterpret_u32_s32 npyv_reinterpret_u32_u8
 #define npyv_reinterpret_u32_u64 npyv_reinterpret_u32_u8
 #define npyv_reinterpret_u32_s64 npyv_reinterpret_u32_u8
-#define npyv_reinterpret_u32_f32 npyv_reinterpret_u32_u8
+#if NPY_SIMD_F32
+    #define npyv_reinterpret_u32_f32 npyv_reinterpret_u32_u8
+#endif
 #define npyv_reinterpret_u32_f64 npyv_reinterpret_u32_u8
 
 #define npyv_reinterpret_s32_s32(X) X
@@ -137,7 +157,9 @@
 #define npyv_reinterpret_s32_u32 npyv_reinterpret_s32_u8
 #define npyv_reinterpret_s32_u64 npyv_reinterpret_s32_u8
 #define npyv_reinterpret_s32_s64 npyv_reinterpret_s32_u8
-#define npyv_reinterpret_s32_f32 npyv_reinterpret_s32_u8
+#if NPY_SIMD_F32
+    #define npyv_reinterpret_s32_f32 npyv_reinterpret_s32_u8
+#endif
 #define npyv_reinterpret_s32_f64 npyv_reinterpret_s32_u8
 
 #define npyv_reinterpret_u64_u64(X) X
@@ -148,7 +170,9 @@
 #define npyv_reinterpret_u64_u32 npyv_reinterpret_u64_u8
 #define npyv_reinterpret_u64_s32 npyv_reinterpret_u64_u8
 #define npyv_reinterpret_u64_s64 npyv_reinterpret_u64_u8
-#define npyv_reinterpret_u64_f32 npyv_reinterpret_u64_u8
+#if NPY_SIMD_F32
+    #define npyv_reinterpret_u64_f32 npyv_reinterpret_u64_u8
+#endif
 #define npyv_reinterpret_u64_f64 npyv_reinterpret_u64_u8
 
 #define npyv_reinterpret_s64_s64(X) X
@@ -159,19 +183,23 @@
 #define npyv_reinterpret_s64_u32 npyv_reinterpret_s64_u8
 #define npyv_reinterpret_s64_s32 npyv_reinterpret_s64_u8
 #define npyv_reinterpret_s64_u64 npyv_reinterpret_s64_u8
-#define npyv_reinterpret_s64_f32 npyv_reinterpret_s64_u8
+#if NPY_SIMD_F32
+    #define npyv_reinterpret_s64_f32 npyv_reinterpret_s64_u8
+#endif
 #define npyv_reinterpret_s64_f64 npyv_reinterpret_s64_u8
 
-#define npyv_reinterpret_f32_f32(X) X
-#define npyv_reinterpret_f32_u8(X) ((npyv_f32)X)
-#define npyv_reinterpret_f32_s8  npyv_reinterpret_f32_u8
-#define npyv_reinterpret_f32_u16 npyv_reinterpret_f32_u8
-#define npyv_reinterpret_f32_s16 npyv_reinterpret_f32_u8
-#define npyv_reinterpret_f32_u32 npyv_reinterpret_f32_u8
-#define npyv_reinterpret_f32_s32 npyv_reinterpret_f32_u8
-#define npyv_reinterpret_f32_u64 npyv_reinterpret_f32_u8
-#define npyv_reinterpret_f32_s64 npyv_reinterpret_f32_u8
-#define npyv_reinterpret_f32_f64 npyv_reinterpret_f32_u8
+#if NPY_SIMD_F32
+    #define npyv_reinterpret_f32_f32(X) X
+    #define npyv_reinterpret_f32_u8(X) ((npyv_f32)X)
+    #define npyv_reinterpret_f32_s8  npyv_reinterpret_f32_u8
+    #define npyv_reinterpret_f32_u16 npyv_reinterpret_f32_u8
+    #define npyv_reinterpret_f32_s16 npyv_reinterpret_f32_u8
+    #define npyv_reinterpret_f32_u32 npyv_reinterpret_f32_u8
+    #define npyv_reinterpret_f32_s32 npyv_reinterpret_f32_u8
+    #define npyv_reinterpret_f32_u64 npyv_reinterpret_f32_u8
+    #define npyv_reinterpret_f32_s64 npyv_reinterpret_f32_u8
+    #define npyv_reinterpret_f32_f64 npyv_reinterpret_f32_u8
+#endif
 
 #define npyv_reinterpret_f64_f64(X) X
 #define npyv_reinterpret_f64_u8(X) ((npyv_f64)X)
@@ -182,9 +210,10 @@
 #define npyv_reinterpret_f64_s32 npyv_reinterpret_f64_u8
 #define npyv_reinterpret_f64_u64 npyv_reinterpret_f64_u8
 #define npyv_reinterpret_f64_s64 npyv_reinterpret_f64_u8
-#define npyv_reinterpret_f64_f32 npyv_reinterpret_f64_u8
-
+#if NPY_SIMD_F32
+    #define npyv_reinterpret_f64_f32 npyv_reinterpret_f64_u8
+#endif
 // Only required by AVX2/AVX512
 #define npyv_cleanup() ((void)0)
 
-#endif // _NPY_SIMD_VSX_MISC_H
+#endif // _NPY_SIMD_VEC_MISC_H
diff --git a/numpy/core/src/common/simd/vsx/operators.h b/numpy/core/src/common/simd/vec/operators.h
index b01d85321..8b58676e7 100644
--- a/numpy/core/src/common/simd/vsx/operators.h
+++ b/numpy/core/src/common/simd/vec/operators.h
@@ -2,8 +2,8 @@
     #error "Not a standalone header"
 #endif
 
-#ifndef _NPY_SIMD_VSX_OPERATORS_H
-#define _NPY_SIMD_VSX_OPERATORS_H
+#ifndef _NPY_SIMD_VEC_OPERATORS_H
+#define _NPY_SIMD_VEC_OPERATORS_H
 
 /***************************
  * Shifting
@@ -11,11 +11,11 @@
 
 // Left
 #define npyv_shl_u16(A, C) vec_sl(A, npyv_setall_u16(C))
-#define npyv_shl_s16(A, C) vec_sl(A, npyv_setall_u16(C))
+#define npyv_shl_s16(A, C) vec_sl_s16(A, npyv_setall_u16(C))
 #define npyv_shl_u32(A, C) vec_sl(A, npyv_setall_u32(C))
-#define npyv_shl_s32(A, C) vec_sl(A, npyv_setall_u32(C))
+#define npyv_shl_s32(A, C) vec_sl_s32(A, npyv_setall_u32(C))
 #define npyv_shl_u64(A, C) vec_sl(A, npyv_setall_u64(C))
-#define npyv_shl_s64(A, C) vec_sl(A, npyv_setall_u64(C))
+#define npyv_shl_s64(A, C) vec_sl_s64(A, npyv_setall_u64(C))
 
 // Left by an immediate constant
 #define npyv_shli_u16 npyv_shl_u16
@@ -27,11 +27,11 @@
 
 // Right
 #define npyv_shr_u16(A, C) vec_sr(A,  npyv_setall_u16(C))
-#define npyv_shr_s16(A, C) vec_sra(A, npyv_setall_u16(C))
+#define npyv_shr_s16(A, C) vec_sra_s16(A, npyv_setall_u16(C))
 #define npyv_shr_u32(A, C) vec_sr(A,  npyv_setall_u32(C))
-#define npyv_shr_s32(A, C) vec_sra(A, npyv_setall_u32(C))
+#define npyv_shr_s32(A, C) vec_sra_s32(A, npyv_setall_u32(C))
 #define npyv_shr_u64(A, C) vec_sr(A,  npyv_setall_u64(C))
-#define npyv_shr_s64(A, C) vec_sra(A, npyv_setall_u64(C))
+#define npyv_shr_s64(A, C) vec_sra_s64(A, npyv_setall_u64(C))
 
 // Right by an immediate constant
 #define npyv_shri_u16 npyv_shr_u16
@@ -44,15 +44,15 @@
 /***************************
  * Logical
  ***************************/
-#define NPYV_IMPL_VSX_BIN_CAST(INTRIN, SFX, CAST) \
+#define NPYV_IMPL_VEC_BIN_CAST(INTRIN, SFX, CAST) \
     NPY_FINLINE npyv_##SFX npyv_##INTRIN##_##SFX(npyv_##SFX a, npyv_##SFX b) \
     { return (npyv_##SFX)vec_##INTRIN((CAST)a, (CAST)b); }
 
 // Up to GCC 6 logical intrinsics don't support bool long long
 #if defined(__GNUC__) && __GNUC__ <= 6
-    #define NPYV_IMPL_VSX_BIN_B64(INTRIN) NPYV_IMPL_VSX_BIN_CAST(INTRIN, b64, npyv_u64)
+    #define NPYV_IMPL_VEC_BIN_B64(INTRIN) NPYV_IMPL_VEC_BIN_CAST(INTRIN, b64, npyv_u64)
 #else
-    #define NPYV_IMPL_VSX_BIN_B64(INTRIN) NPYV_IMPL_VSX_BIN_CAST(INTRIN, b64, npyv_b64)
+    #define NPYV_IMPL_VEC_BIN_B64(INTRIN) NPYV_IMPL_VEC_BIN_CAST(INTRIN, b64, npyv_b64)
 #endif
 // AND
 #define npyv_and_u8  vec_and
@@ -63,12 +63,14 @@
 #define npyv_and_s32 vec_and
 #define npyv_and_u64 vec_and
 #define npyv_and_s64 vec_and
-#define npyv_and_f32 vec_and
+#if NPY_SIMD_F32
+    #define npyv_and_f32 vec_and
+#endif
 #define npyv_and_f64 vec_and
 #define npyv_and_b8  vec_and
 #define npyv_and_b16 vec_and
 #define npyv_and_b32 vec_and
-NPYV_IMPL_VSX_BIN_B64(and)
+NPYV_IMPL_VEC_BIN_B64(and)
 
 // OR
 #define npyv_or_u8  vec_or
@@ -79,12 +81,14 @@ NPYV_IMPL_VSX_BIN_B64(and)
 #define npyv_or_s32 vec_or
 #define npyv_or_u64 vec_or
 #define npyv_or_s64 vec_or
-#define npyv_or_f32 vec_or
+#if NPY_SIMD_F32
+    #define npyv_or_f32 vec_or
+#endif
 #define npyv_or_f64 vec_or
 #define npyv_or_b8  vec_or
 #define npyv_or_b16 vec_or
 #define npyv_or_b32 vec_or
-NPYV_IMPL_VSX_BIN_B64(or)
+NPYV_IMPL_VEC_BIN_B64(or)
 
 // XOR
 #define npyv_xor_u8  vec_xor
@@ -95,16 +99,18 @@ NPYV_IMPL_VSX_BIN_B64(or)
 #define npyv_xor_s32 vec_xor
 #define npyv_xor_u64 vec_xor
 #define npyv_xor_s64 vec_xor
-#define npyv_xor_f32 vec_xor
+#if NPY_SIMD_F32
+    #define npyv_xor_f32 vec_xor
+#endif
 #define npyv_xor_f64 vec_xor
 #define npyv_xor_b8  vec_xor
 #define npyv_xor_b16 vec_xor
 #define npyv_xor_b32 vec_xor
-NPYV_IMPL_VSX_BIN_B64(xor)
+NPYV_IMPL_VEC_BIN_B64(xor)
 
 // NOT
 // note: we implement npyv_not_b*(boolean types) for internal use*/
-#define NPYV_IMPL_VSX_NOT_INT(VEC_LEN)                                 \
+#define NPYV_IMPL_VEC_NOT_INT(VEC_LEN)                                 \
     NPY_FINLINE npyv_u##VEC_LEN npyv_not_u##VEC_LEN(npyv_u##VEC_LEN a) \
     { return vec_nor(a, a); }                                          \
     NPY_FINLINE npyv_s##VEC_LEN npyv_not_s##VEC_LEN(npyv_s##VEC_LEN a) \
@@ -112,13 +118,13 @@ NPYV_IMPL_VSX_BIN_B64(xor)
     NPY_FINLINE npyv_b##VEC_LEN npyv_not_b##VEC_LEN(npyv_b##VEC_LEN a) \
     { return vec_nor(a, a); }
 
-NPYV_IMPL_VSX_NOT_INT(8)
-NPYV_IMPL_VSX_NOT_INT(16)
-NPYV_IMPL_VSX_NOT_INT(32)
+NPYV_IMPL_VEC_NOT_INT(8)
+NPYV_IMPL_VEC_NOT_INT(16)
+NPYV_IMPL_VEC_NOT_INT(32)
 
-// up to gcc5 vec_nor doesn't support bool long long
-#if defined(__GNUC__) && __GNUC__ > 5
-    NPYV_IMPL_VSX_NOT_INT(64)
+// on ppc64, up to gcc5 vec_nor doesn't support bool long long
+#if defined(NPY_HAVE_VSX) && defined(__GNUC__) && __GNUC__ > 5
+    NPYV_IMPL_VEC_NOT_INT(64)
 #else
     NPY_FINLINE npyv_u64 npyv_not_u64(npyv_u64 a)
     { return vec_nor(a, a); }
@@ -128,16 +134,23 @@ NPYV_IMPL_VSX_NOT_INT(32)
     { return (npyv_b64)vec_nor((npyv_u64)a, (npyv_u64)a); }
 #endif
 
-NPY_FINLINE npyv_f32 npyv_not_f32(npyv_f32 a)
-{ return vec_nor(a, a); }
+#if NPY_SIMD_F32
+    NPY_FINLINE npyv_f32 npyv_not_f32(npyv_f32 a)
+    { return vec_nor(a, a); }
+#endif
 NPY_FINLINE npyv_f64 npyv_not_f64(npyv_f64 a)
 { return vec_nor(a, a); }
 
 // ANDC, ORC and XNOR
 #define npyv_andc_u8 vec_andc
 #define npyv_andc_b8 vec_andc
-#define npyv_orc_b8 vec_orc
-#define npyv_xnor_b8 vec_eqv
+#if defined(NPY_HAVE_VXE) || defined(NPY_HAVE_VSX)
+    #define npyv_orc_b8 vec_orc
+    #define npyv_xnor_b8 vec_eqv
+#else
+    #define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(B), A)
+    #define npyv_xnor_b8(A, B) npyv_not_b8(npyv_xor_b8(B, A))
+#endif
 
 /***************************
  * Comparison
@@ -152,7 +165,9 @@ NPY_FINLINE npyv_f64 npyv_not_f64(npyv_f64 a)
 #define npyv_cmpeq_s32 vec_cmpeq
 #define npyv_cmpeq_u64 vec_cmpeq
 #define npyv_cmpeq_s64 vec_cmpeq
-#define npyv_cmpeq_f32 vec_cmpeq
+#if NPY_SIMD_F32
+    #define npyv_cmpeq_f32 vec_cmpeq
+#endif
 #define npyv_cmpeq_f64 vec_cmpeq
 
 // Int Not Equal
@@ -177,7 +192,9 @@ NPY_FINLINE npyv_f64 npyv_not_f64(npyv_f64 a)
     #define npyv_cmpneq_s32(A, B) npyv_not_b32(vec_cmpeq(A, B))
     #define npyv_cmpneq_u64(A, B) npyv_not_b64(vec_cmpeq(A, B))
     #define npyv_cmpneq_s64(A, B) npyv_not_b64(vec_cmpeq(A, B))
-    #define npyv_cmpneq_f32(A, B) npyv_not_b32(vec_cmpeq(A, B))
+    #if NPY_SIMD_F32
+        #define npyv_cmpneq_f32(A, B) npyv_not_b32(vec_cmpeq(A, B))
+    #endif
     #define npyv_cmpneq_f64(A, B) npyv_not_b64(vec_cmpeq(A, B))
 #endif
 
@@ -190,12 +207,14 @@ NPY_FINLINE npyv_f64 npyv_not_f64(npyv_f64 a)
 #define npyv_cmpgt_s32 vec_cmpgt
 #define npyv_cmpgt_u64 vec_cmpgt
 #define npyv_cmpgt_s64 vec_cmpgt
-#define npyv_cmpgt_f32 vec_cmpgt
+#if NPY_SIMD_F32
+    #define npyv_cmpgt_f32 vec_cmpgt
+#endif
 #define npyv_cmpgt_f64 vec_cmpgt
 
 // Greater than or equal
-// up to gcc5 vec_cmpge only supports single and double precision
-#if defined(__GNUC__) && __GNUC__ > 5
+// On ppc64le, up to gcc5 vec_cmpge only supports single and double precision
+#if defined(NPY_HAVE_VX) || (defined(__GNUC__) && __GNUC__ > 5)
     #define npyv_cmpge_u8  vec_cmpge
     #define npyv_cmpge_s8  vec_cmpge
     #define npyv_cmpge_u16 vec_cmpge
@@ -214,7 +233,9 @@ NPY_FINLINE npyv_f64 npyv_not_f64(npyv_f64 a)
     #define npyv_cmpge_u64(A, B) npyv_not_b64(vec_cmpgt(B, A))
     #define npyv_cmpge_s64(A, B) npyv_not_b64(vec_cmpgt(B, A))
 #endif
-#define npyv_cmpge_f32 vec_cmpge
+#if NPY_SIMD_F32
+    #define npyv_cmpge_f32 vec_cmpge
+#endif
 #define npyv_cmpge_f64 vec_cmpge
 
 // Less than
@@ -226,7 +247,9 @@ NPY_FINLINE npyv_f64 npyv_not_f64(npyv_f64 a)
 #define npyv_cmplt_s32(A, B) npyv_cmpgt_s32(B, A)
 #define npyv_cmplt_u64(A, B) npyv_cmpgt_u64(B, A)
 #define npyv_cmplt_s64(A, B) npyv_cmpgt_s64(B, A)
-#define npyv_cmplt_f32(A, B) npyv_cmpgt_f32(B, A)
+#if NPY_SIMD_F32
+    #define npyv_cmplt_f32(A, B) npyv_cmpgt_f32(B, A)
+#endif
 #define npyv_cmplt_f64(A, B) npyv_cmpgt_f64(B, A)
 
 // Less than or equal
@@ -238,13 +261,17 @@ NPY_FINLINE npyv_f64 npyv_not_f64(npyv_f64 a)
 #define npyv_cmple_s32(A, B) npyv_cmpge_s32(B, A)
 #define npyv_cmple_u64(A, B) npyv_cmpge_u64(B, A)
 #define npyv_cmple_s64(A, B) npyv_cmpge_s64(B, A)
-#define npyv_cmple_f32(A, B) npyv_cmpge_f32(B, A)
+#if NPY_SIMD_F32
+    #define npyv_cmple_f32(A, B) npyv_cmpge_f32(B, A)
+#endif
 #define npyv_cmple_f64(A, B) npyv_cmpge_f64(B, A)
 
 // check special cases
-NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a)
-{ return vec_cmpeq(a, a); }
+#if NPY_SIMD_F32
+    NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a)
+    { return vec_cmpeq(a, a); }
+#endif
 NPY_FINLINE npyv_b64 npyv_notnan_f64(npyv_f64 a)
 { return vec_cmpeq(a, a); }
 
-#endif // _NPY_SIMD_VSX_OPERATORS_H
+#endif // _NPY_SIMD_VEC_OPERATORS_H
diff --git a/numpy/core/src/common/simd/vsx/reorder.h b/numpy/core/src/common/simd/vec/reorder.h
index 6533e5093..b60b9287d 100644
--- a/numpy/core/src/common/simd/vsx/reorder.h
+++ b/numpy/core/src/common/simd/vec/reorder.h
@@ -2,8 +2,8 @@
     #error "Not a standalone header"
 #endif
 
-#ifndef _NPY_SIMD_VSX_REORDER_H
-#define _NPY_SIMD_VSX_REORDER_H
+#ifndef _NPY_SIMD_VEC_REORDER_H
+#define _NPY_SIMD_VEC_REORDER_H
 
 // combine lower part of two vectors
 #define npyv__combinel(A, B) vec_mergeh((npyv_u64)(A), (npyv_u64)(B))
@@ -15,7 +15,9 @@
 #define npyv_combinel_s32(A, B) ((npyv_s32)npyv__combinel(A, B))
 #define npyv_combinel_u64       vec_mergeh
 #define npyv_combinel_s64       vec_mergeh
-#define npyv_combinel_f32(A, B) ((npyv_f32)npyv__combinel(A, B))
+#if NPY_SIMD_F32
+    #define npyv_combinel_f32(A, B) ((npyv_f32)npyv__combinel(A, B))
+#endif
 #define npyv_combinel_f64       vec_mergeh
 
 // combine higher part of two vectors
@@ -28,14 +30,16 @@
 #define npyv_combineh_s32(A, B) ((npyv_s32)npyv__combineh(A, B))
 #define npyv_combineh_u64       vec_mergel
 #define npyv_combineh_s64       vec_mergel
-#define npyv_combineh_f32(A, B) ((npyv_f32)npyv__combineh(A, B))
+#if NPY_SIMD_F32
+    #define npyv_combineh_f32(A, B) ((npyv_f32)npyv__combineh(A, B))
+#endif
 #define npyv_combineh_f64       vec_mergel
 
 /*
  * combine: combine two vectors from lower and higher parts of two other vectors
  * zip: interleave two vectors
 */
-#define NPYV_IMPL_VSX_COMBINE_ZIP(T_VEC, SFX)                  \
+#define NPYV_IMPL_VEC_COMBINE_ZIP(T_VEC, SFX)                  \
     NPY_FINLINE T_VEC##x2 npyv_combine_##SFX(T_VEC a, T_VEC b) \
     {                                                          \
         T_VEC##x2 r;                                           \
@@ -51,16 +55,18 @@
         return r;                                              \
     }
 
-NPYV_IMPL_VSX_COMBINE_ZIP(npyv_u8,  u8)
-NPYV_IMPL_VSX_COMBINE_ZIP(npyv_s8,  s8)
-NPYV_IMPL_VSX_COMBINE_ZIP(npyv_u16, u16)
-NPYV_IMPL_VSX_COMBINE_ZIP(npyv_s16, s16)
-NPYV_IMPL_VSX_COMBINE_ZIP(npyv_u32, u32)
-NPYV_IMPL_VSX_COMBINE_ZIP(npyv_s32, s32)
-NPYV_IMPL_VSX_COMBINE_ZIP(npyv_u64, u64)
-NPYV_IMPL_VSX_COMBINE_ZIP(npyv_s64, s64)
-NPYV_IMPL_VSX_COMBINE_ZIP(npyv_f32, f32)
-NPYV_IMPL_VSX_COMBINE_ZIP(npyv_f64, f64)
+NPYV_IMPL_VEC_COMBINE_ZIP(npyv_u8,  u8)
+NPYV_IMPL_VEC_COMBINE_ZIP(npyv_s8,  s8)
+NPYV_IMPL_VEC_COMBINE_ZIP(npyv_u16, u16)
+NPYV_IMPL_VEC_COMBINE_ZIP(npyv_s16, s16)
+NPYV_IMPL_VEC_COMBINE_ZIP(npyv_u32, u32)
+NPYV_IMPL_VEC_COMBINE_ZIP(npyv_s32, s32)
+NPYV_IMPL_VEC_COMBINE_ZIP(npyv_u64, u64)
+NPYV_IMPL_VEC_COMBINE_ZIP(npyv_s64, s64)
+#if NPY_SIMD_F32
+    NPYV_IMPL_VEC_COMBINE_ZIP(npyv_f32, f32)
+#endif
+NPYV_IMPL_VEC_COMBINE_ZIP(npyv_f64, f64)
 
 // Reverse elements of each 64-bit lane
 NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a)
@@ -100,7 +106,9 @@ NPY_FINLINE npyv_u32 npyv_rev64_u32(npyv_u32 a)
 }
 NPY_FINLINE npyv_s32 npyv_rev64_s32(npyv_s32 a)
 { return (npyv_s32)npyv_rev64_u32((npyv_u32)a); }
-NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a)
-{ return (npyv_f32)npyv_rev64_u32((npyv_u32)a); }
+#if NPY_SIMD_F32
+    NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a)
+    { return (npyv_f32)npyv_rev64_u32((npyv_u32)a); }
+#endif
 
-#endif // _NPY_SIMD_VSX_REORDER_H
+#endif // _NPY_SIMD_VEC_REORDER_H
diff --git a/numpy/core/src/common/simd/vec/utils.h b/numpy/core/src/common/simd/vec/utils.h
new file mode 100644
index 000000000..f8b28cfeb
--- /dev/null
+++ b/numpy/core/src/common/simd/vec/utils.h
@@ -0,0 +1,84 @@
+#ifndef NPY_SIMD
+    #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_VEC_UTILS_H
+#define _NPY_SIMD_VEC_UTILS_H
+
+// some or all of the following intrinsics may be missing from the zvector API on gcc/clang
+#ifdef NPY_HAVE_VX
+    #ifndef vec_neg
+        #define vec_neg(a) (-(a)) // Vector Negate
+    #endif
+    #ifndef vec_add
+        #define vec_add(a, b) ((a) + (b)) // Vector Add
+    #endif
+    #ifndef vec_sub
+        #define vec_sub(a, b) ((a) - (b)) // Vector Subtract
+    #endif
+    #ifndef vec_mul
+        #define vec_mul(a, b) ((a) * (b)) // Vector Multiply
+    #endif
+    #ifndef vec_div
+        #define vec_div(a, b) ((a) / (b)) // Vector Divide
+    #endif
+    #ifndef vec_neg
+        #define vec_neg(a) (-(a))
+    #endif
+    #ifndef vec_and
+        #define vec_and(a, b) ((a) & (b)) // Vector AND
+    #endif
+    #ifndef vec_or
+        #define vec_or(a, b) ((a) | (b)) // Vector OR
+    #endif
+    #ifndef vec_xor
+        #define vec_xor(a, b) ((a) ^ (b)) // Vector XOR
+    #endif
+    #ifndef vec_sl
+        #define vec_sl(a, b) ((a) << (b)) // Vector Shift Left
+    #endif
+    #ifndef vec_sra
+        #define vec_sra(a, b) ((a) >> (b)) // Vector Shift Right Algebraic
+    #endif
+    #ifndef vec_sr
+        #define vec_sr(a, b) ((a) >> (b)) // Vector Shift Right
+    #endif
+    #ifndef vec_slo
+        #define vec_slo(a, b) vec_slb(a, (b) << 64) // Vector Shift Left by Octet
+    #endif
+    #ifndef vec_sro
+        #define vec_sro(a, b) vec_srb(a, (b) << 64) // Vector Shift Right by Octet
+    #endif
+    // vec_doublee maps to wrong intrin "vfll".
+    // see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100871
+    #if defined(__GNUC__) && !defined(__clang__)
+        #define npyv_doublee __builtin_s390_vflls
+    #else
+        #define npyv_doublee vec_doublee
+    #endif
+    // compatibility with vsx
+    #ifndef vec_vbpermq
+        #define vec_vbpermq vec_bperm_u128
+    #endif
+    // zvector requires the second operand to be signed while the vsx api expects
+    // it to be unsigned; the following macros are set to remove this conflict
+    #define vec_sl_s8(a, b)   vec_sl(a, (npyv_s8)(b))
+    #define vec_sl_s16(a, b)  vec_sl(a, (npyv_s16)(b))
+    #define vec_sl_s32(a, b)  vec_sl(a, (npyv_s32)(b))
+    #define vec_sl_s64(a, b)  vec_sl(a, (npyv_s64)(b))
+    #define vec_sra_s8(a, b)  vec_sra(a, (npyv_s8)(b))
+    #define vec_sra_s16(a, b) vec_sra(a, (npyv_s16)(b))
+    #define vec_sra_s32(a, b) vec_sra(a, (npyv_s32)(b))
+    #define vec_sra_s64(a, b) vec_sra(a, (npyv_s64)(b))
+#else
+    #define vec_sl_s8 vec_sl
+    #define vec_sl_s16 vec_sl
+    #define vec_sl_s32 vec_sl
+    #define vec_sl_s64 vec_sl
+    #define vec_sra_s8 vec_sra
+    #define vec_sra_s16 vec_sra
+    #define vec_sra_s32 vec_sra
+    #define vec_sra_s64 vec_sra
+#endif
+
+#endif // _NPY_SIMD_VEC_UTILS_H
diff --git a/numpy/core/src/common/simd/vsx/vsx.h b/numpy/core/src/common/simd/vec/vec.h
index b4d8172a2..abcd33ce1 100644
--- a/numpy/core/src/common/simd/vsx/vsx.h
+++ b/numpy/core/src/common/simd/vec/vec.h
@@ -1,7 +1,22 @@
+/**
+ * branch /vec(altivec-like) provides the SIMD operations for
+ * both IBM VSX(Power) and VX(ZArch).
+*/
 #ifndef _NPY_SIMD_H_
     #error "Not a standalone header"
 #endif
 
+#if !defined(NPY_HAVE_VX) && !defined(NPY_HAVE_VSX2)
+    #error "require minimum support VX(zarch11) or VSX2(Power8/ISA2.07)"
+#endif
+
+#if defined(NPY_HAVE_VSX) && !defined(__LITTLE_ENDIAN__)
+    #error "VSX support doesn't cover big-endian mode yet, only zarch."
+#endif
+#if defined(NPY_HAVE_VX) && defined(__LITTLE_ENDIAN__)
+    #error "VX(zarch) support doesn't cover little-endian mode."
+#endif
+
 #if defined(__GNUC__) && __GNUC__ <= 7
     /**
       * GCC <= 7 produces ambiguous warning caused by -Werror=maybe-uninitialized,
@@ -15,8 +30,19 @@
 #define NPY_SIMD 128
 #define NPY_SIMD_WIDTH 16
 #define NPY_SIMD_F64 1
+#if defined(NPY_HAVE_VXE) || defined(NPY_HAVE_VSX)
+    #define NPY_SIMD_F32 1
+#else
+    #define NPY_SIMD_F32 0
+#endif
 #define NPY_SIMD_FMA3 1 // native support
 
+#ifdef NPY_HAVE_VX
+    #define NPY_SIMD_BIGENDIAN 1
+#else
+    #define NPY_SIMD_BIGENDIAN 0
+#endif
+
 typedef __vector unsigned char      npyv_u8;
 typedef __vector signed char        npyv_s8;
 typedef __vector unsigned short     npyv_u16;
@@ -25,7 +51,9 @@ typedef __vector unsigned int       npyv_u32;
 typedef __vector signed int         npyv_s32;
 typedef __vector unsigned long long npyv_u64;
 typedef __vector signed long long   npyv_s64;
+#if NPY_SIMD_F32
 typedef __vector float              npyv_f32;
+#endif
 typedef __vector double             npyv_f64;
 
 typedef struct { npyv_u8  val[2]; } npyv_u8x2;
@@ -36,7 +64,9 @@ typedef struct { npyv_u32 val[2]; } npyv_u32x2;
 typedef struct { npyv_s32 val[2]; } npyv_s32x2;
 typedef struct { npyv_u64 val[2]; } npyv_u64x2;
 typedef struct { npyv_s64 val[2]; } npyv_s64x2;
+#if NPY_SIMD_F32
 typedef struct { npyv_f32 val[2]; } npyv_f32x2;
+#endif
 typedef struct { npyv_f64 val[2]; } npyv_f64x2;
 
 typedef struct { npyv_u8  val[3]; } npyv_u8x3;
@@ -47,7 +77,9 @@ typedef struct { npyv_u32 val[3]; } npyv_u32x3;
 typedef struct { npyv_s32 val[3]; } npyv_s32x3;
 typedef struct { npyv_u64 val[3]; } npyv_u64x3;
 typedef struct { npyv_s64 val[3]; } npyv_s64x3;
+#if NPY_SIMD_F32
 typedef struct { npyv_f32 val[3]; } npyv_f32x3;
+#endif
 typedef struct { npyv_f64 val[3]; } npyv_f64x3;
 
 #define npyv_nlanes_u8  16
@@ -67,6 +99,7 @@ typedef struct { npyv_f64 val[3]; } npyv_f64x3;
 #define npyv_b32 __vector __bool int
 #define npyv_b64 __vector __bool long long
 
+#include "utils.h"
 #include "memory.h"
 #include "misc.h"
 #include "reorder.h"
diff --git a/numpy/core/src/common/simd/vsx/conversion.h b/numpy/core/src/common/simd/vsx/conversion.h
deleted file mode 100644
index a599f3950..000000000
--- a/numpy/core/src/common/simd/vsx/conversion.h
+++ /dev/null
@@ -1,146 +0,0 @@
-#ifndef NPY_SIMD
-    #error "Not a standalone header"
-#endif
-
-#ifndef _NPY_SIMD_VSX_CVT_H
-#define _NPY_SIMD_VSX_CVT_H
-
-// convert boolean vectors to integer vectors
-#define npyv_cvt_u8_b8(BL)   ((npyv_u8)  BL)
-#define npyv_cvt_s8_b8(BL)   ((npyv_s8)  BL)
-#define npyv_cvt_u16_b16(BL) ((npyv_u16) BL)
-#define npyv_cvt_s16_b16(BL) ((npyv_s16) BL)
-#define npyv_cvt_u32_b32(BL) ((npyv_u32) BL)
-#define npyv_cvt_s32_b32(BL) ((npyv_s32) BL)
-#define npyv_cvt_u64_b64(BL) ((npyv_u64) BL)
-#define npyv_cvt_s64_b64(BL) ((npyv_s64) BL)
-#define npyv_cvt_f32_b32(BL) ((npyv_f32) BL)
-#define npyv_cvt_f64_b64(BL) ((npyv_f64) BL)
-
-// convert integer vectors to boolean vectors
-#define npyv_cvt_b8_u8(A)   ((npyv_b8)  A)
-#define npyv_cvt_b8_s8(A)   ((npyv_b8)  A)
-#define npyv_cvt_b16_u16(A) ((npyv_b16) A)
-#define npyv_cvt_b16_s16(A) ((npyv_b16) A)
-#define npyv_cvt_b32_u32(A) ((npyv_b32) A)
-#define npyv_cvt_b32_s32(A) ((npyv_b32) A)
-#define npyv_cvt_b64_u64(A) ((npyv_b64) A)
-#define npyv_cvt_b64_s64(A) ((npyv_b64) A)
-#define npyv_cvt_b32_f32(A) ((npyv_b32) A)
-#define npyv_cvt_b64_f64(A) ((npyv_b64) A)
-
-//expand
-NPY_FINLINE npyv_u16x2 npyv_expand_u16_u8(npyv_u8 data)
-{
-    npyv_u16x2 r;
-    npyv_u8 zero = npyv_zero_u8();
-    r.val[0] = (npyv_u16)vec_mergeh(data, zero);
-    r.val[1] = (npyv_u16)vec_mergel(data, zero);
-    return r;
-}
-
-NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data)
-{
-    npyv_u32x2 r;
-    npyv_u16 zero = npyv_zero_u16();
-    r.val[0] = (npyv_u32)vec_mergeh(data, zero);
-    r.val[1] = (npyv_u32)vec_mergel(data, zero);
-    return r;
-}
-
-// pack two 16-bit boolean into one 8-bit boolean vector
-NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b) {
-    return vec_pack(a, b);
-}
-
-// pack four 32-bit boolean vectors into one 8-bit boolean vector
-NPY_FINLINE npyv_b8 npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d) {
-    npyv_b16 ab = vec_pack(a, b);
-    npyv_b16 cd = vec_pack(c, d);
-    return npyv_pack_b8_b16(ab, cd);
-}
-
-// pack eight 64-bit boolean vectors into one 8-bit boolean vector
-NPY_FINLINE npyv_b8
-npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
-                 npyv_b64 e, npyv_b64 f, npyv_b64 g, npyv_b64 h) {
-    npyv_b32 ab = vec_pack(a, b);
-    npyv_b32 cd = vec_pack(c, d);
-    npyv_b32 ef = vec_pack(e, f);
-    npyv_b32 gh = vec_pack(g, h);
-    return npyv_pack_b8_b32(ab, cd, ef, gh);
-}
-
-// convert boolean vector to integer bitfield
-NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
-{
-    const npyv_u8 qperm = npyv_set_u8(120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0);
-    return vec_extract((npyv_u32)vec_vbpermq((npyv_u8)a, qperm), 2);
-}
-NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a)
-{
-    const npyv_u8 qperm = npyv_setf_u8(128, 112, 96, 80, 64, 48, 32, 16, 0);
-    return vec_extract((npyv_u32)vec_vbpermq((npyv_u8)a, qperm), 2);
-}
-NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
-{
-    const npyv_u8 qperm = npyv_setf_u8(128, 96, 64, 32, 0);
-    return vec_extract((npyv_u32)vec_vbpermq((npyv_u8)a, qperm), 2);
-}
-NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
-{
-    npyv_u64 bit = npyv_shri_u64((npyv_u64)a, 63);
-    return vec_extract(bit, 0) | (int)vec_extract(bit, 1) << 1;
-}
-
-// truncate compatible with all compilers(internal use for now)
-NPY_FINLINE npyv_s32 npyv__trunc_s32_f32(npyv_f32 a)
-{
-#ifdef __IBMC__
-    return vec_cts(a, 0);
-#elif defined(__clang__)
-    /**
-     * old versions of CLANG doesn't support %x<n> in the inline asm template
-     * which fixes register number when using any of the register constraints wa, wd, wf.
-     * therefore, we count on built-in functions.
-     */
-    return __builtin_convertvector(a, npyv_s32);
-#else // gcc
-    npyv_s32 ret;
-    __asm__ ("xvcvspsxws %x0,%x1" : "=wa" (ret) : "wa" (a));
-    return ret;
-#endif
-}
-NPY_FINLINE npyv_s32 npyv__trunc_s32_f64(npyv_f64 a, npyv_f64 b)
-{
-#ifdef __IBMC__
-    const npyv_u8 seq_even = npyv_set_u8(0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27);
-    // unfortunately, XLC missing asm register vsx fixer
-    // hopefully, xlc can optimize around big-endian compatibility
-    npyv_s32 lo_even = vec_cts(a, 0);
-    npyv_s32 hi_even = vec_cts(b, 0);
-    return vec_perm(lo_even, hi_even, seq_even);
-#else
-    const npyv_u8 seq_odd = npyv_set_u8(4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31);
-    #ifdef __clang__
-        // __builtin_convertvector doesn't support this conversion on wide range of versions
-        // fortunately, almost all versions have direct builtin of 'xvcvdpsxws'
-        npyv_s32 lo_odd = __builtin_vsx_xvcvdpsxws(a);
-        npyv_s32 hi_odd = __builtin_vsx_xvcvdpsxws(b);
-    #else // gcc
-        npyv_s32 lo_odd, hi_odd;
-        __asm__ ("xvcvdpsxws %x0,%x1" : "=wa" (lo_odd) : "wa" (a));
-        __asm__ ("xvcvdpsxws %x0,%x1" : "=wa" (hi_odd) : "wa" (b));
-    #endif
-    return vec_perm(lo_odd, hi_odd, seq_odd);
-#endif
-}
-
-// round to nearest integer (assuming even)
-NPY_FINLINE npyv_s32 npyv_round_s32_f32(npyv_f32 a)
-{ return npyv__trunc_s32_f32(vec_rint(a)); }
-
-NPY_FINLINE npyv_s32 npyv_round_s32_f64(npyv_f64 a, npyv_f64 b)
-{ return npyv__trunc_s32_f64(vec_rint(a), vec_rint(b)); }
-
-#endif // _NPY_SIMD_VSX_CVT_H
diff --git a/numpy/core/src/common/umathmodule.h b/numpy/core/src/common/umathmodule.h
index fe44fe403..0c69f8f54 100644
--- a/numpy/core/src/common/umathmodule.h
+++ b/numpy/core/src/common/umathmodule.h
@@ -7,8 +7,14 @@
 NPY_NO_EXPORT PyObject *
 get_sfloat_dtype(PyObject *NPY_UNUSED(mod), PyObject *NPY_UNUSED(args));
 
+/* Defined in umath/extobj.c */
+NPY_NO_EXPORT int
+PyUFunc_GiveFloatingpointErrors(const char *name, int fpe_errors);
+
 PyObject * add_newdoc_ufunc(PyObject *NPY_UNUSED(dummy), PyObject *args);
 PyObject * ufunc_frompyfunc(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *NPY_UNUSED(kwds));
+
+
 int initumath(PyObject *m);
 
 #endif  /* NUMPY_CORE_SRC_COMMON_UMATHMODULE_H_ */
diff --git a/numpy/core/src/multiarray/argfunc.dispatch.c.src b/numpy/core/src/multiarray/argfunc.dispatch.c.src
index cbfaebdb4..1d7753275 100644
--- a/numpy/core/src/multiarray/argfunc.dispatch.c.src
+++ b/numpy/core/src/multiarray/argfunc.dispatch.c.src
@@ -4,6 +4,7 @@
  ** sse2 sse42 xop avx2 avx512_skx
  ** vsx2
  ** neon asimd
+ ** vx vxe
  **/
 
 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
@@ -123,7 +124,7 @@ simd_@func@_@sfx@(npyv_lanetype_@sfx@ *ip, npy_intp len)
  * #bsfx = b32, b32, b64, b64, b32, b64#
  * #is_fp = 0*4, 1*2#
  * #is_idx32 = 1*2, 0*2, 1, 0#
- * #chk_simd = NPY_SIMD*5, NPY_SIMD_F64#
+ * #chk_simd = NPY_SIMD*4, NPY_SIMD_F32, NPY_SIMD_F64#
  */
 #if @chk_simd@
 /**begin repeat1
@@ -298,6 +299,9 @@ scalar_loop:
         #if NPY_BITSOF_@BTYPE@ == 64 && !NPY_SIMD_F64
             #undef TO_SIMD_SFX
         #endif
+        #if NPY_BITSOF_@BTYPE@ == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
     #elif @is_unsigned@
         #define TO_SIMD_SFX(X) X##_u@len@
     #else
diff --git a/numpy/core/src/multiarray/array_assign_array.c b/numpy/core/src/multiarray/array_assign_array.c
index 020a7f29a..9d5bf6875 100644
--- a/numpy/core/src/multiarray/array_assign_array.c
+++ b/numpy/core/src/multiarray/array_assign_array.c
@@ -8,11 +8,13 @@
  */
 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
 #define _MULTIARRAYMODULE
+#define _UMATHMODULE
 
 #define PY_SSIZE_T_CLEAN
 #include <Python.h>
 
 #include "numpy/ndarraytypes.h"
+#include "numpy/npy_math.h"
 
 #include "npy_config.h"
 #include "npy_pycompat.h"
@@ -25,6 +27,8 @@
 #include "array_assign.h"
 #include "dtype_transfer.h"
 
+#include "umathmodule.h"
+
 /*
  * Check that array data is both uint-aligned and true-aligned for all array
  * elements, as required by the copy/casting code in lowlevel_strided_loops.c
@@ -83,7 +87,7 @@ raw_array_assign_array(int ndim, npy_intp const *shape,
     npy_intp src_strides_it[NPY_MAXDIMS];
     npy_intp coord[NPY_MAXDIMS];
 
-    int aligned, needs_api = 0;
+    int aligned;
 
     NPY_BEGIN_THREADS_DEF;
 
@@ -116,15 +120,19 @@ raw_array_assign_array(int ndim, npy_intp const *shape,
 
     /* Get the function to do the casting */
     NPY_cast_info cast_info;
+    NPY_ARRAYMETHOD_FLAGS flags;
     if (PyArray_GetDTypeTransferFunction(aligned,
                         src_strides_it[0], dst_strides_it[0],
                         src_dtype, dst_dtype,
                         0,
-                        &cast_info, &needs_api) != NPY_SUCCEED) {
+                        &cast_info, &flags) != NPY_SUCCEED) {
         return -1;
     }
 
-    if (!needs_api) {
+    if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
+        npy_clear_floatstatus_barrier(src_data);
+    }
+    if (!(flags & NPY_METH_REQUIRES_PYAPI)) {
         NPY_BEGIN_THREADS;
     }
 
@@ -143,6 +151,14 @@ raw_array_assign_array(int ndim, npy_intp const *shape,
 
     NPY_END_THREADS;
     NPY_cast_info_xfree(&cast_info);
+
+    if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
+        int fpes = npy_get_floatstatus_barrier(src_data);
+        if (fpes && PyUFunc_GiveFloatingpointErrors("cast", fpes) < 0) {
+            return -1;
+        }
+    }
+
     return 0;
 fail:
     NPY_END_THREADS;
@@ -170,7 +186,7 @@ raw_array_wheremasked_assign_array(int ndim, npy_intp const *shape,
     npy_intp wheremask_strides_it[NPY_MAXDIMS];
     npy_intp coord[NPY_MAXDIMS];
 
-    int aligned, needs_api = 0;
+    int aligned;
 
     NPY_BEGIN_THREADS_DEF;
 
@@ -207,17 +223,21 @@ raw_array_wheremasked_assign_array(int ndim, npy_intp const *shape,
 
     /* Get the function to do the casting */
     NPY_cast_info cast_info;
+    NPY_ARRAYMETHOD_FLAGS flags;
     if (PyArray_GetMaskedDTypeTransferFunction(aligned,
                         src_strides_it[0],
                         dst_strides_it[0],
                         wheremask_strides_it[0],
                         src_dtype, dst_dtype, wheremask_dtype,
                         0,
-                        &cast_info, &needs_api) != NPY_SUCCEED) {
+                        &cast_info, &flags) != NPY_SUCCEED) {
         return -1;
     }
 
-    if (!needs_api) {
+    if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
+        npy_clear_floatstatus_barrier(src_data);
+    }
+    if (!(flags & NPY_METH_REQUIRES_PYAPI)) {
         NPY_BEGIN_THREADS;
     }
     npy_intp strides[2] = {src_strides_it[0], dst_strides_it[0]};
@@ -232,7 +252,7 @@ raw_array_wheremasked_assign_array(int ndim, npy_intp const *shape,
                 args, &shape_it[0], strides,
                 (npy_bool *)wheremask_data, wheremask_strides_it[0],
                 cast_info.auxdata) < 0) {
-            break;
+            goto fail;
         }
     } NPY_RAW_ITER_THREE_NEXT(idim, ndim, coord, shape_it,
                             dst_data, dst_strides_it,
@@ -241,7 +261,20 @@ raw_array_wheremasked_assign_array(int ndim, npy_intp const *shape,
 
     NPY_END_THREADS;
     NPY_cast_info_xfree(&cast_info);
-    return (needs_api && PyErr_Occurred()) ? -1 : 0;
+
+    if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
+        int fpes = npy_get_floatstatus_barrier(src_data);
+        if (fpes && PyUFunc_GiveFloatingpointErrors("cast", fpes) < 0) {
+            return -1;
+        }
+    }
+
+    return 0;
+
+fail:
+    NPY_END_THREADS;
+    NPY_cast_info_xfree(&cast_info);
+    return -1;
 }
 
 /*
diff --git a/numpy/core/src/multiarray/array_assign_scalar.c b/numpy/core/src/multiarray/array_assign_scalar.c
index 4ffef7ecc..ba964b86d 100644
--- a/numpy/core/src/multiarray/array_assign_scalar.c
+++ b/numpy/core/src/multiarray/array_assign_scalar.c
@@ -8,11 +8,13 @@
  */
 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
 #define _MULTIARRAYMODULE
+#define _UMATHMODULE
 
 #define PY_SSIZE_T_CLEAN
 #include <Python.h>
 
 #include <numpy/ndarraytypes.h>
+#include "numpy/npy_math.h"
 
 #include "npy_config.h"
 #include "npy_pycompat.h"
@@ -25,6 +27,8 @@
 #include "array_assign.h"
 #include "dtype_transfer.h"
 
+#include "umathmodule.h"
+
 /*
  * Assigns the scalar value to every element of the destination raw array.
  *
@@ -39,7 +43,7 @@ raw_array_assign_scalar(int ndim, npy_intp const *shape,
     npy_intp shape_it[NPY_MAXDIMS], dst_strides_it[NPY_MAXDIMS];
     npy_intp coord[NPY_MAXDIMS];
 
-    int aligned, needs_api = 0;
+    int aligned;
 
     NPY_BEGIN_THREADS_DEF;
 
@@ -62,15 +66,19 @@ raw_array_assign_scalar(int ndim, npy_intp const *shape,
 
     /* Get the function to do the casting */
     NPY_cast_info cast_info;
+    NPY_ARRAYMETHOD_FLAGS flags;
     if (PyArray_GetDTypeTransferFunction(aligned,
                         0, dst_strides_it[0],
                         src_dtype, dst_dtype,
                         0,
-                        &cast_info, &needs_api) != NPY_SUCCEED) {
+                        &cast_info, &flags) != NPY_SUCCEED) {
         return -1;
     }
 
-    if (!needs_api) {
+    if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
+        npy_clear_floatstatus_barrier(src_data);
+    }
+    if (!(flags & NPY_METH_REQUIRES_PYAPI)) {
         npy_intp nitems = 1, i;
         for (i = 0; i < ndim; i++) {
             nitems *= shape_it[i];
@@ -92,6 +100,14 @@ raw_array_assign_scalar(int ndim, npy_intp const *shape,
 
     NPY_END_THREADS;
     NPY_cast_info_xfree(&cast_info);
+
+    if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
+        int fpes = npy_get_floatstatus_barrier(src_data);
+        if (fpes && PyUFunc_GiveFloatingpointErrors("cast", fpes) < 0) {
+            return -1;
+        }
+    }
+
     return 0;
 fail:
     NPY_END_THREADS;
@@ -117,7 +133,7 @@ raw_array_wheremasked_assign_scalar(int ndim, npy_intp const *shape,
     npy_intp wheremask_strides_it[NPY_MAXDIMS];
     npy_intp coord[NPY_MAXDIMS];
 
-    int aligned, needs_api = 0;
+    int aligned;
 
     NPY_BEGIN_THREADS_DEF;
 
@@ -142,15 +158,19 @@ raw_array_wheremasked_assign_scalar(int ndim, npy_intp const *shape,
 
     /* Get the function to do the casting */
     NPY_cast_info cast_info;
+    NPY_ARRAYMETHOD_FLAGS flags;
     if (PyArray_GetMaskedDTypeTransferFunction(aligned,
                         0, dst_strides_it[0], wheremask_strides_it[0],
                         src_dtype, dst_dtype, wheremask_dtype,
                         0,
-                        &cast_info, &needs_api) != NPY_SUCCEED) {
+                        &cast_info, &flags) != NPY_SUCCEED) {
         return -1;
     }
 
-    if (!needs_api) {
+    if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
+        npy_clear_floatstatus_barrier(src_data);
+    }
+    if (!(flags & NPY_METH_REQUIRES_PYAPI)) {
         npy_intp nitems = 1, i;
         for (i = 0; i < ndim; i++) {
             nitems *= shape_it[i];
@@ -170,7 +190,7 @@ raw_array_wheremasked_assign_scalar(int ndim, npy_intp const *shape,
                 args, &shape_it[0], strides,
                 (npy_bool *)wheremask_data, wheremask_strides_it[0],
                 cast_info.auxdata) < 0) {
-            break;
+            goto fail;
         }
     } NPY_RAW_ITER_TWO_NEXT(idim, ndim, coord, shape_it,
                             dst_data, dst_strides_it,
@@ -178,7 +198,20 @@ raw_array_wheremasked_assign_scalar(int ndim, npy_intp const *shape,
 
     NPY_END_THREADS;
     NPY_cast_info_xfree(&cast_info);
-    return (needs_api && PyErr_Occurred()) ? -1 : 0;
+
+    if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
+        int fpes = npy_get_floatstatus_barrier(src_data);
+        if (fpes && PyUFunc_GiveFloatingpointErrors("cast", fpes) < 0) {
+            return -1;
+        }
+    }
+
+    return 0;
+
+fail:
+    NPY_END_THREADS;
+    NPY_cast_info_xfree(&cast_info);
+    return -1;
 }
 
 /*
diff --git a/numpy/core/src/multiarray/array_coercion.c b/numpy/core/src/multiarray/array_coercion.c
index 1559f3485..e703e7382 100644
--- a/numpy/core/src/multiarray/array_coercion.c
+++ b/numpy/core/src/multiarray/array_coercion.c
@@ -9,6 +9,7 @@
 
 #include "lowlevel_strided_loops.h"
 #include "numpy/arrayobject.h"
+#include "numpy/npy_math.h"
 
 #include "descriptor.h"
 #include "convert_datatype.h"
@@ -22,6 +23,7 @@
 #include "_datetime.h"
 #include "npy_import.h"
 
+#include "umathmodule.h"
 
 /*
  * This file defines helpers for some of the ctors.c functions which
@@ -378,6 +380,49 @@ find_scalar_descriptor(
 }
 
 
+/*
+ * Helper function for casting a raw value from one descriptor to another.
+ * This helper uses the normal casting machinery, but e.g. does not care about
+ * checking cast safety.
+ */
+static int
+cast_raw_scalar_item(
+        PyArray_Descr *from_descr, char *from_item,
+        PyArray_Descr *to_descr, char *to_item)
+{
+    NPY_cast_info cast_info;
+    NPY_ARRAYMETHOD_FLAGS flags;
+    if (PyArray_GetDTypeTransferFunction(
+            0, 0, 0, from_descr, to_descr, 0, &cast_info,
+            &flags) == NPY_FAIL) {
+        return -1;
+    }
+
+    if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
+        npy_clear_floatstatus_barrier(from_item);
+    }
+
+    char *args[2] = {from_item, to_item};
+    const npy_intp strides[2] = {0, 0};
+    const npy_intp length = 1;
+    if (cast_info.func(&cast_info.context,
+            args, &length, strides, cast_info.auxdata) < 0) {
+        NPY_cast_info_xfree(&cast_info);
+        return -1;
+    }
+    NPY_cast_info_xfree(&cast_info);
+
+    if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
+        int fpes = npy_get_floatstatus_barrier(to_item);
+        if (fpes && PyUFunc_GiveFloatingpointErrors("cast", fpes) < 0) {
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
+
 /**
  * Assign a single element in an array from a python value.
  *
@@ -388,26 +433,35 @@ find_scalar_descriptor(
  * This function handles the cast, which is for example hit when assigning
  * a float128 to complex128.
  *
- * At this time, this function does not support arrays (historically we
- * mainly supported arrays through `__float__()`, etc.). Such support should
- * possibly be added (although when called from `PyArray_AssignFromCache`
- * the input cannot be an array).
- * Note that this is also problematic for some array-likes, such as
- * `astropy.units.Quantity` and `np.ma.masked`.  These are used to us calling
- * `__float__`/`__int__` for 0-D instances in many cases.
- * Eventually, we may want to define this as wrong: They must use DTypes
- * instead of (only) subclasses.  Until then, here as well as in
- * `PyArray_AssignFromCache` (which already does this), we need to special
- * case 0-D array-likes to behave like arbitrary (unknown!) Python objects.
+ * TODO: This function probably needs to be passed an "owner" for the sake of
+ *       future HPy (non CPython) support
+ *
+ * NOTE: We do support 0-D exact NumPy arrays correctly via casting here.
+ *       There be dragons, because we must NOT support generic array-likes.
+ *       The problem is that some (e.g. astropy's Quantity and our masked
+ *       arrays) have divergent behaviour for `__array__` as opposed to
+ *       `__float__`.  And they rely on that.
+ *       That is arguably bad as it limits the things that work seamlessly
+ *       because `__float__`, etc. cannot even begin to cover all of casting.
+ *       However, we have no choice.  We simply CANNOT support array-likes
+ *       here without finding a solution for this first.
+ *       And the only plausible one I see currently, is expanding protocols
+ *       in some form, either to indicate that we want a scalar or to indicate
+ *       that we want the unsafe version that `__array__` currently gives
+ *       for both objects.
+ *
+ *       If we ever figure out how to expand this to other array-likes, care
+ *       may need to be taken. `PyArray_FromAny`/`PyArray_AssignFromCache`
+ *       use this function but know whether the input is an array, array-like,
+ *       or scalar.  Relaxing things here should be OK, but looks a bit
+ *       like possible recursion, so it may make sense to make a "scalars only"
+ *       version of this function.
  *
  * @param descr
  * @param item
  * @param value
  * @return 0 on success -1 on failure.
  */
-/*
- * TODO: This function should possibly be public API.
- */
 NPY_NO_EXPORT int
 PyArray_Pack(PyArray_Descr *descr, char *item, PyObject *value)
 {
@@ -433,6 +487,29 @@ PyArray_Pack(PyArray_Descr *descr, char *item, PyObject *value)
     if (DType == NULL) {
         return -1;
     }
+    if (DType == (PyArray_DTypeMeta *)Py_None && PyArray_CheckExact(value)
+            && PyArray_NDIM((PyArrayObject *)value) == 0) {
+        /*
+         * WARNING: Do NOT relax the above `PyArray_CheckExact`, unless you
+         *          read the function doc NOTE carefully and understood it.
+         *
+         * NOTE: The ndim == 0 check should probably be an error, but
+         *       unfortunately `arr.__float__()` works for 1 element arrays,
+         *       so in some contexts we must let it be handled like a scalar.
+         *       (If we manage to deprecate the above, we can do that.)
+         */
+        Py_DECREF(DType);
+
+        PyArrayObject *arr = (PyArrayObject *)value;
+        if (PyArray_DESCR(arr) == descr && !PyDataType_REFCHK(descr)) {
+            /* light-weight fast-path for when the descrs obviously matches */
+            memcpy(item, PyArray_BYTES(arr), descr->elsize);
+            return 0;  /* success (it was an array-like) */
+        }
+        return cast_raw_scalar_item(
+                PyArray_DESCR(arr), PyArray_BYTES(arr), descr, item);
+
+    }
     if (DType == NPY_DTYPE(descr) || DType == (PyArray_DTypeMeta *)Py_None) {
         /* We can set the element directly (or at least will try to) */
         Py_XDECREF(DType);
@@ -461,30 +538,8 @@ PyArray_Pack(PyArray_Descr *descr, char *item, PyObject *value)
         Py_DECREF(tmp_descr);
         return -1;
     }
-    if (PyDataType_REFCHK(tmp_descr)) {
-        /* We could probably use move-references above */
-        PyArray_Item_INCREF(data, tmp_descr);
-    }
-
-    int res = 0;
-    int needs_api = 0;
-    NPY_cast_info cast_info;
-    if (PyArray_GetDTypeTransferFunction(
-            0, 0, 0, tmp_descr, descr, 0, &cast_info,
-            &needs_api) == NPY_FAIL) {
-        res = -1;
-        goto finish;
-    }
-    char *args[2] = {data, item};
-    const npy_intp strides[2] = {0, 0};
-    const npy_intp length = 1;
-    if (cast_info.func(&cast_info.context,
-            args, &length, strides, cast_info.auxdata) < 0) {
-        res = -1;
-    }
-    NPY_cast_info_xfree(&cast_info);
+    int res = cast_raw_scalar_item(tmp_descr, data, descr, item);
 
-  finish:
     if (PyDataType_REFCHK(tmp_descr)) {
         /* We could probably use move-references above */
         PyArray_Item_XDECREF(data, tmp_descr);
diff --git a/numpy/core/src/multiarray/array_method.h b/numpy/core/src/multiarray/array_method.h
index 30dd94a80..c9ec8903d 100644
--- a/numpy/core/src/multiarray/array_method.h
+++ b/numpy/core/src/multiarray/array_method.h
@@ -7,6 +7,9 @@
 #include <Python.h>
 #include <numpy/ndarraytypes.h>
 
+#ifdef __cplusplus
+extern "C" {
+#endif
 
 typedef enum {
     /* Flag for whether the GIL is required */
@@ -17,7 +20,11 @@ typedef enum {
      * setup/check. No function should set error flags and ignore them
      * since it would interfere with chaining operations (e.g. casting).
      */
-    /* TODO: Change this into a positive flag */
+    /*
+     * TODO: Change this into a positive flag?  That would make "combining"
+     *       multiple methods easier. OTOH, if we add more flags, the default
+     *       would be 0 just like it is here.
+     */
     NPY_METH_NO_FLOATINGPOINT_ERRORS = 1 << 2,
     /* Whether the method supports unaligned access (not runtime) */
     NPY_METH_SUPPORTS_UNALIGNED = 1 << 3,
@@ -40,6 +47,20 @@ typedef enum {
 } NPY_ARRAYMETHOD_FLAGS;
 
 
+/*
+ * Combining flags with a plain `|` is not sufficient: a 0 bit should mean
+ * "default", so NPY_METH_NO_FLOATINGPOINT_ERRORS must only survive when BOTH
+ * inputs set it, while every other flag is set when EITHER input sets it.
+ *
+ * NOTE: If made public, this should maybe be a function (easier to extend).
+ */
+#define PyArrayMethod_MINIMAL_FLAGS NPY_METH_NO_FLOATINGPOINT_ERRORS
+#define PyArrayMethod_COMBINED_FLAGS(flags1, flags2)  \
+        ((NPY_ARRAYMETHOD_FLAGS)(  \
+            (((flags1) | (flags2)) & ~PyArrayMethod_MINIMAL_FLAGS)  \
+            | ((flags1) & (flags2))))
+
+
 struct PyArrayMethodObject_tag;
 
 /*
@@ -249,6 +270,10 @@ PyArrayMethod_FromSpec(PyArrayMethod_Spec *spec);
  *       need better tests when a public version is exposed.
  */
 NPY_NO_EXPORT PyBoundArrayMethodObject *
-PyArrayMethod_FromSpec_int(PyArrayMethod_Spec *spec, int private);
+PyArrayMethod_FromSpec_int(PyArrayMethod_Spec *spec, int priv);
+
+#ifdef __cplusplus
+}
+#endif
 
 #endif  /* NUMPY_CORE_SRC_MULTIARRAY_ARRAY_METHOD_H_ */
diff --git a/numpy/core/src/multiarray/arrayobject.c b/numpy/core/src/multiarray/arrayobject.c
index a1f0e2d5b..d18fe1b10 100644
--- a/numpy/core/src/multiarray/arrayobject.c
+++ b/numpy/core/src/multiarray/arrayobject.c
@@ -641,375 +641,11 @@ PyArray_FailUnlessWriteable(PyArrayObject *obj, const char *name)
     return 0;
 }
 
-/* This also handles possibly mis-aligned data */
-/* Compare s1 and s2 which are not necessarily NULL-terminated.
-   s1 is of length len1
-   s2 is of length len2
-   If they are NULL terminated, then stop comparison.
-*/
-static int
-_myunincmp(npy_ucs4 const *s1, npy_ucs4 const *s2, int len1, int len2)
-{
-    npy_ucs4 const *sptr;
-    npy_ucs4 *s1t = NULL;
-    npy_ucs4 *s2t = NULL;
-    int val;
-    npy_intp size;
-    int diff;
-
-    /* Replace `s1` and `s2` with aligned copies if needed */
-    if ((npy_intp)s1 % sizeof(npy_ucs4) != 0) {
-        size = len1*sizeof(npy_ucs4);
-        s1t = malloc(size);
-        memcpy(s1t, s1, size);
-        s1 = s1t;
-    }
-    if ((npy_intp)s2 % sizeof(npy_ucs4) != 0) {
-        size = len2*sizeof(npy_ucs4);
-        s2t = malloc(size);
-        memcpy(s2t, s2, size);
-        s2 = s1t;
-    }
-
-    val = PyArray_CompareUCS4(s1, s2, PyArray_MIN(len1,len2));
-    if ((val != 0) || (len1 == len2)) {
-        goto finish;
-    }
-    if (len2 > len1) {
-        sptr = s2+len1;
-        val = -1;
-        diff = len2-len1;
-    }
-    else {
-        sptr = s1+len2;
-        val = 1;
-        diff=len1-len2;
-    }
-    while (diff--) {
-        if (*sptr != 0) {
-            goto finish;
-        }
-        sptr++;
-    }
-    val = 0;
-
- finish:
-    /* Cleanup the aligned copies */
-    if (s1t) {
-        free(s1t);
-    }
-    if (s2t) {
-        free(s2t);
-    }
-    return val;
-}
-
-
-
-
-/*
- * Compare s1 and s2 which are not necessarily NULL-terminated.
- * s1 is of length len1
- * s2 is of length len2
- * If they are NULL terminated, then stop comparison.
- */
-static int
-_mystrncmp(char const *s1, char const *s2, int len1, int len2)
-{
-    char const *sptr;
-    int val;
-    int diff;
-
-    val = memcmp(s1, s2, PyArray_MIN(len1, len2));
-    if ((val != 0) || (len1 == len2)) {
-        return val;
-    }
-    if (len2 > len1) {
-        sptr = s2 + len1;
-        val = -1;
-        diff = len2 - len1;
-    }
-    else {
-        sptr = s1 + len2;
-        val = 1;
-        diff = len1 - len2;
-    }
-    while (diff--) {
-        if (*sptr != 0) {
-            return val;
-        }
-        sptr++;
-    }
-    return 0; /* Only happens if NULLs are everywhere */
-}
-
-/* Borrowed from Numarray */
-
-#define SMALL_STRING 2048
-
-static void _rstripw(char *s, int n)
-{
-    int i;
-    for (i = n - 1; i >= 1; i--) { /* Never strip to length 0. */
-        int c = s[i];
-
-        if (!c || NumPyOS_ascii_isspace((int)c)) {
-            s[i] = 0;
-        }
-        else {
-            break;
-        }
-    }
-}
-
-static void _unistripw(npy_ucs4 *s, int n)
-{
-    int i;
-    for (i = n - 1; i >= 1; i--) { /* Never strip to length 0. */
-        npy_ucs4 c = s[i];
-        if (!c || NumPyOS_ascii_isspace((int)c)) {
-            s[i] = 0;
-        }
-        else {
-            break;
-        }
-    }
-}
-
-
-static char *
-_char_copy_n_strip(char const *original, char *temp, int nc)
-{
-    if (nc > SMALL_STRING) {
-        temp = malloc(nc);
-        if (!temp) {
-            PyErr_NoMemory();
-            return NULL;
-        }
-    }
-    memcpy(temp, original, nc);
-    _rstripw(temp, nc);
-    return temp;
-}
-
-static void
-_char_release(char *ptr, int nc)
-{
-    if (nc > SMALL_STRING) {
-        free(ptr);
-    }
-}
-
-static char *
-_uni_copy_n_strip(char const *original, char *temp, int nc)
-{
-    if (nc*sizeof(npy_ucs4) > SMALL_STRING) {
-        temp = malloc(nc*sizeof(npy_ucs4));
-        if (!temp) {
-            PyErr_NoMemory();
-            return NULL;
-        }
-    }
-    memcpy(temp, original, nc*sizeof(npy_ucs4));
-    _unistripw((npy_ucs4 *)temp, nc);
-    return temp;
-}
-
-static void
-_uni_release(char *ptr, int nc)
-{
-    if (nc*sizeof(npy_ucs4) > SMALL_STRING) {
-        free(ptr);
-    }
-}
-
-
-/* End borrowed from numarray */
-
-#define _rstrip_loop(CMP) {                                     \
-        void *aptr, *bptr;                                      \
-        char atemp[SMALL_STRING], btemp[SMALL_STRING];          \
-        while(size--) {                                         \
-            aptr = stripfunc(iself->dataptr, atemp, N1);        \
-            if (!aptr) return -1;                               \
-            bptr = stripfunc(iother->dataptr, btemp, N2);       \
-            if (!bptr) {                                        \
-                relfunc(aptr, N1);                              \
-                return -1;                                      \
-            }                                                   \
-            val = compfunc(aptr, bptr, N1, N2);                 \
-            *dptr = (val CMP 0);                                \
-            PyArray_ITER_NEXT(iself);                           \
-            PyArray_ITER_NEXT(iother);                          \
-            dptr += 1;                                          \
-            relfunc(aptr, N1);                                  \
-            relfunc(bptr, N2);                                  \
-        }                                                       \
-    }
-
-#define _reg_loop(CMP) {                                \
-        while(size--) {                                 \
-            val = compfunc((void *)iself->dataptr,      \
-                          (void *)iother->dataptr,      \
-                          N1, N2);                      \
-            *dptr = (val CMP 0);                        \
-            PyArray_ITER_NEXT(iself);                   \
-            PyArray_ITER_NEXT(iother);                  \
-            dptr += 1;                                  \
-        }                                               \
-    }
-
-static int
-_compare_strings(PyArrayObject *result, PyArrayMultiIterObject *multi,
-                 int cmp_op, void *func, int rstrip)
-{
-    PyArrayIterObject *iself, *iother;
-    npy_bool *dptr;
-    npy_intp size;
-    int val;
-    int N1, N2;
-    int (*compfunc)(void *, void *, int, int);
-    void (*relfunc)(char *, int);
-    char* (*stripfunc)(char const *, char *, int);
-
-    compfunc = func;
-    dptr = (npy_bool *)PyArray_DATA(result);
-    iself = multi->iters[0];
-    iother = multi->iters[1];
-    size = multi->size;
-    N1 = PyArray_DESCR(iself->ao)->elsize;
-    N2 = PyArray_DESCR(iother->ao)->elsize;
-    if ((void *)compfunc == (void *)_myunincmp) {
-        N1 >>= 2;
-        N2 >>= 2;
-        stripfunc = _uni_copy_n_strip;
-        relfunc = _uni_release;
-    }
-    else {
-        stripfunc = _char_copy_n_strip;
-        relfunc = _char_release;
-    }
-    switch (cmp_op) {
-    case Py_EQ:
-        if (rstrip) {
-            _rstrip_loop(==);
-        } else {
-            _reg_loop(==);
-        }
-        break;
-    case Py_NE:
-        if (rstrip) {
-            _rstrip_loop(!=);
-        } else {
-            _reg_loop(!=);
-        }
-        break;
-    case Py_LT:
-        if (rstrip) {
-            _rstrip_loop(<);
-        } else {
-            _reg_loop(<);
-        }
-        break;
-    case Py_LE:
-        if (rstrip) {
-            _rstrip_loop(<=);
-        } else {
-            _reg_loop(<=);
-        }
-        break;
-    case Py_GT:
-        if (rstrip) {
-            _rstrip_loop(>);
-        } else {
-            _reg_loop(>);
-        }
-        break;
-    case Py_GE:
-        if (rstrip) {
-            _rstrip_loop(>=);
-        } else {
-            _reg_loop(>=);
-        }
-        break;
-    default:
-        PyErr_SetString(PyExc_RuntimeError, "bad comparison operator");
-        return -1;
-    }
-    return 0;
-}
-
-#undef _reg_loop
-#undef _rstrip_loop
-#undef SMALL_STRING
 
+/* From umath/string_ufuncs.cpp/h */
 NPY_NO_EXPORT PyObject *
-_strings_richcompare(PyArrayObject *self, PyArrayObject *other, int cmp_op,
-                     int rstrip)
-{
-    PyArrayObject *result;
-    PyArrayMultiIterObject *mit;
-    int val;
-
-    if (PyArray_TYPE(self) != PyArray_TYPE(other)) {
-        /*
-         * Comparison between Bytes and Unicode is not defined in Py3K;
-         * we follow.
-         */
-        Py_INCREF(Py_NotImplemented);
-        return Py_NotImplemented;
-    }
-    if (PyArray_ISNOTSWAPPED(self) != PyArray_ISNOTSWAPPED(other)) {
-        /* Cast `other` to the same byte order as `self` (both unicode here) */
-        PyArray_Descr* unicode = PyArray_DescrNew(PyArray_DESCR(self));
-        if (unicode == NULL) {
-            return NULL;
-        }
-        unicode->elsize = PyArray_DESCR(other)->elsize;
-        PyObject *new = PyArray_FromAny((PyObject *)other,
-                unicode, 0, 0, 0, NULL);
-        if (new == NULL) {
-            return NULL;
-        }
-        other = (PyArrayObject *)new;
-    }
-    else {
-        Py_INCREF(other);
-    }
-
-    /* Broad-cast the arrays to a common shape */
-    mit = (PyArrayMultiIterObject *)PyArray_MultiIterNew(2, self, other);
-    Py_DECREF(other);
-    if (mit == NULL) {
-        return NULL;
-    }
-
-    result = (PyArrayObject *)PyArray_NewFromDescr(&PyArray_Type,
-                                  PyArray_DescrFromType(NPY_BOOL),
-                                  mit->nd,
-                                  mit->dimensions,
-                                  NULL, NULL, 0,
-                                  NULL);
-    if (result == NULL) {
-        goto finish;
-    }
-
-    if (PyArray_TYPE(self) == NPY_UNICODE) {
-        val = _compare_strings(result, mit, cmp_op, _myunincmp, rstrip);
-    }
-    else {
-        val = _compare_strings(result, mit, cmp_op, _mystrncmp, rstrip);
-    }
-
-    if (val < 0) {
-        Py_DECREF(result);
-        result = NULL;
-    }
-
- finish:
-    Py_DECREF(mit);
-    return (PyObject *)result;
-}
+_umath_strings_richcompare(
+        PyArrayObject *self, PyArrayObject *other, int cmp_op, int rstrip);
 
 /*
  * VOID-type arrays can only be compared equal and not-equal
@@ -1130,7 +766,15 @@ _void_compare(PyArrayObject *self, PyArrayObject *other, int cmp_op)
                         memcpy(dimensions, PyArray_DIMS((PyArrayObject *)temp),
                                sizeof(npy_intp)*result_ndim);
                     }
-                    dimensions[result_ndim] = -1;
+
+                    /*
+                     * Compute the new dimension size manually, as reshaping
+                     * with -1 does not work on empty arrays.
+                     */
+                    dimensions[result_ndim] = PyArray_MultiplyList(
+                        PyArray_DIMS((PyArrayObject *)temp) + result_ndim,
+                        PyArray_NDIM((PyArrayObject *)temp) - result_ndim);
+
                     temp2 = PyArray_Newshape((PyArrayObject *)temp,
                                              &newdims, NPY_ANYORDER);
                     if (temp2 == NULL) {
@@ -1207,7 +851,7 @@ _void_compare(PyArrayObject *self, PyArrayObject *other, int cmp_op)
             return NULL;
         }
         /* compare as a string. Assumes self and other have same descr->type */
-        return _strings_richcompare(self, other, cmp_op, 0);
+        return _umath_strings_richcompare(self, other, cmp_op, 0);
     }
 }
 
@@ -1341,36 +985,6 @@ array_richcompare(PyArrayObject *self, PyObject *other, int cmp_op)
     PyObject *obj_self = (PyObject *)self;
     PyObject *result = NULL;
 
-    /* Special case for string arrays (which don't and currently can't have
-     * ufunc loops defined, so there's no point in trying).
-     */
-    if (PyArray_ISSTRING(self)) {
-        array_other = (PyArrayObject *)PyArray_FromObject(other,
-                                                          NPY_NOTYPE, 0, 0);
-        if (array_other == NULL) {
-            PyErr_Clear();
-            /* Never mind, carry on, see what happens */
-        }
-        else if (!PyArray_ISSTRING(array_other)) {
-            Py_DECREF(array_other);
-            /* Never mind, carry on, see what happens */
-        }
-        else {
-            result = _strings_richcompare(self, array_other, cmp_op, 0);
-            Py_DECREF(array_other);
-            return result;
-        }
-        /* If we reach this point, it means that we are not comparing
-         * string-to-string. It's possible that this will still work out,
-         * e.g. if the other array is an object array, then both will be cast
-         * to object or something? I don't know how that works actually, but
-         * it does, b/c this works:
-         *   l = ["a", "b"]
-         *   assert np.array(l, dtype="S1") == np.array(l, dtype="O")
-         * So we fall through and see what happens.
-         */
-    }
-
     switch (cmp_op) {
     case Py_LT:
         RICHCMP_GIVE_UP_IF_NEEDED(obj_self, other);
diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src
index ee4f5f312..a9f8dfdd2 100644
--- a/numpy/core/src/multiarray/arraytypes.c.src
+++ b/numpy/core/src/multiarray/arraytypes.c.src
@@ -7,6 +7,7 @@
 
 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
 #define _MULTIARRAYMODULE
+#define _UMATHMODULE
 #define _NPY_NO_DEPRECATIONS /* for NPY_CHAR */
 
 #include "numpy/npy_common.h"
@@ -37,6 +38,9 @@
 #include "npy_buffer.h"
 
 #include "arraytypes.h"
+
+#include "umathmodule.h"
+
 /*
  * Define a stack allocated dummy array with only the minimum information set:
  *   1. The descr, the main field interesting here.
@@ -96,10 +100,32 @@ MyPyFloat_AsDouble(PyObject *obj)
     return ret;
 }
 
+/* Float variant of MyPyFloat_AsDouble that reports overflow in the cast. */
+static float
+MyPyFloat_AsFloat(PyObject *obj)
+{
+    double d_val = MyPyFloat_AsDouble(obj);
+    float res = (float)d_val;
+    if (NPY_UNLIKELY(npy_isinf(res) && !npy_isinf(d_val))) {
+        if (PyUFunc_GiveFloatingpointErrors("cast", NPY_FPE_OVERFLOW) < 0) {
+            return -1;
+        }
+    }
+    return res;
+}
+
+
 static npy_half
 MyPyFloat_AsHalf(PyObject *obj)
 {
-    return npy_double_to_half(MyPyFloat_AsDouble(obj));
+    double d_val = MyPyFloat_AsDouble(obj);
+    npy_half res = npy_double_to_half(d_val);
+    if (NPY_UNLIKELY(npy_half_isinf(res) && !npy_isinf(d_val))) {
+        if (PyUFunc_GiveFloatingpointErrors("cast", NPY_FPE_OVERFLOW) < 0) {
+            return npy_double_to_half(-1.);
+        }
+    }
+    return res;
 }
 
 static PyObject *
@@ -200,7 +226,7 @@ MyPyLong_AsUnsigned@Type@ (PyObject *obj)
  *          MyPyFloat_FromHalf, PyFloat_FromDouble*2#
  * #func2 = PyObject_IsTrue, MyPyLong_AsLong*6, MyPyLong_AsUnsignedLong*2,
  *          MyPyLong_AsLongLong, MyPyLong_AsUnsignedLongLong,
- *          MyPyFloat_AsHalf, MyPyFloat_AsDouble*2#
+ *          MyPyFloat_AsHalf, MyPyFloat_AsFloat, MyPyFloat_AsDouble#
  * #type = npy_bool,
  *         npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int,
  *         npy_long, npy_uint, npy_ulong, npy_longlong, npy_ulonglong,
@@ -363,6 +389,26 @@ static int
         }
         temp.real = (@ftype@) oop.real;
         temp.imag = (@ftype@) oop.imag;
+
+#if NPY_SIZEOF_@NAME@ < NPY_SIZEOF_CDOUBLE  /* really just float... */
+        /* Overflow could have occurred converting double to float */
+        if (NPY_UNLIKELY((npy_isinf(temp.real) && !npy_isinf(oop.real)) ||
+                         (npy_isinf(temp.imag) && !npy_isinf(oop.imag)))) {
+            int bufsize, errmask;
+            PyObject *errobj;
+
+            if (PyUFunc_GetPyValues("assignment", &bufsize, &errmask,
+                    &errobj) < 0) {
+                return -1;
+            }
+            int first = 1;
+            if (PyUFunc_handlefperr(errmask, errobj, NPY_FPE_OVERFLOW, &first)) {
+                Py_XDECREF(errobj);
+                return -1;
+            }
+            Py_XDECREF(errobj);
+        }
+#endif
     }
 
     memcpy(ov, &temp, PyArray_DESCR(ap)->elsize);
@@ -1151,13 +1197,22 @@ static void
     @totype@ *op = output;
 
     while (n--) {
-        @fromtype@ f = *ip++;
-        @totype@ t = (@totype@)f;
 #if @supports_nat@ && @floatingpoint@
-        /* Avoid undefined behaviour for NaN -> NaT */
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile @fromtype@ f = *ip++;
+        @totype@ t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
         if (npy_isnan(f)) {
             t = (@totype@)NPY_DATETIME_NAT;
         }
+        else {
+            t = (@totype@)f;
+        }
+#else
+        @totype@ t = (@totype@)*ip++;
 #endif
         *op++ = t;
     }
@@ -1177,13 +1232,22 @@ static void
     @totype@ *op = output;
 
     while (n--) {
-        @fromtype@ f = *ip;
-        @totype@ t = (@totype@)f;
 #if @supports_nat@
-        /* Avoid undefined behaviour for NaN -> NaT */
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile @fromtype@ f = *ip;
+        @totype@ t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
         if (npy_isnan(f)) {
             t = (@totype@)NPY_DATETIME_NAT;
         }
+        else {
+            t = (@totype@)f;
+        }
+#else
+        @totype@ t = (@totype@)*ip;
 #endif
         *op++ = t;
         ip += 2;
diff --git a/numpy/core/src/multiarray/common_dtype.h b/numpy/core/src/multiarray/common_dtype.h
index 13d38ddf8..9f25fc14e 100644
--- a/numpy/core/src/multiarray/common_dtype.h
+++ b/numpy/core/src/multiarray/common_dtype.h
@@ -7,6 +7,10 @@
 #include <numpy/ndarraytypes.h>
 #include "dtypemeta.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 NPY_NO_EXPORT PyArray_DTypeMeta *
 PyArray_CommonDType(PyArray_DTypeMeta *dtype1, PyArray_DTypeMeta *dtype2);
 
@@ -14,4 +18,8 @@ NPY_NO_EXPORT PyArray_DTypeMeta *
 PyArray_PromoteDTypeSequence(
         npy_intp length, PyArray_DTypeMeta **dtypes_in);
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif  /* NUMPY_CORE_SRC_MULTIARRAY_COMMON_DTYPE_H_ */
diff --git a/numpy/core/src/multiarray/convert.c b/numpy/core/src/multiarray/convert.c
index 630253e38..2aed0bbb4 100644
--- a/numpy/core/src/multiarray/convert.c
+++ b/numpy/core/src/multiarray/convert.c
@@ -20,6 +20,7 @@
 #include "array_assign.h"
 
 #include "convert.h"
+#include "array_coercion.h"
 
 int
 fallocate(int fd, int mode, off_t offset, off_t len);
@@ -358,151 +359,42 @@ PyArray_ToString(PyArrayObject *self, NPY_ORDER order)
 NPY_NO_EXPORT int
 PyArray_FillWithScalar(PyArrayObject *arr, PyObject *obj)
 {
-    PyArray_Descr *dtype = NULL;
-    npy_longlong value_buffer[4];
-    char *value = NULL;
-    int retcode = 0;
-
     /*
-     * If 'arr' is an object array, copy the object as is unless
-     * 'obj' is a zero-dimensional array, in which case we copy
-     * the element in that array instead.
+     * If we knew that the output array had at least one element, we would
+     * not actually need a helper buffer.  It is always zeroed, just in case.
+     *
+     * (The longlong here should help with alignment.)
      */
-    if (PyArray_DESCR(arr)->type_num == NPY_OBJECT &&
-                        !(PyArray_Check(obj) &&
-                          PyArray_NDIM((PyArrayObject *)obj) == 0)) {
-        value = (char *)&obj;
-
-        dtype = PyArray_DescrFromType(NPY_OBJECT);
-        if (dtype == NULL) {
-            return -1;
-        }
-    }
-    /* NumPy scalar */
-    else if (PyArray_IsScalar(obj, Generic)) {
-        dtype = PyArray_DescrFromScalar(obj);
-        if (dtype == NULL) {
-            return -1;
-        }
-        value = scalar_value(obj, dtype);
-        if (value == NULL) {
-            Py_DECREF(dtype);
-            return -1;
-        }
-    }
-    /* Python boolean */
-    else if (PyBool_Check(obj)) {
-        value = (char *)value_buffer;
-        *value = (obj == Py_True);
-
-        dtype = PyArray_DescrFromType(NPY_BOOL);
-        if (dtype == NULL) {
-            return -1;
-        }
-    }
-    /* Python integer */
-    else if (PyLong_Check(obj)) {
-        /* Try long long before unsigned long long */
-        npy_longlong ll_v = PyLong_AsLongLong(obj);
-        if (error_converting(ll_v)) {
-            /* Long long failed, try unsigned long long */
-            npy_ulonglong ull_v;
-            PyErr_Clear();
-            ull_v = PyLong_AsUnsignedLongLong(obj);
-            if (ull_v == (unsigned long long)-1 && PyErr_Occurred()) {
-                return -1;
-            }
-            value = (char *)value_buffer;
-            *(npy_ulonglong *)value = ull_v;
-
-            dtype = PyArray_DescrFromType(NPY_ULONGLONG);
-            if (dtype == NULL) {
-                return -1;
-            }
-        }
-        else {
-            /* Long long succeeded */
-            value = (char *)value_buffer;
-            *(npy_longlong *)value = ll_v;
-
-            dtype = PyArray_DescrFromType(NPY_LONGLONG);
-            if (dtype == NULL) {
-                return -1;
-            }
-        }
-    }
-    /* Python float */
-    else if (PyFloat_Check(obj)) {
-        npy_double v = PyFloat_AsDouble(obj);
-        if (error_converting(v)) {
-            return -1;
-        }
-        value = (char *)value_buffer;
-        *(npy_double *)value = v;
-
-        dtype = PyArray_DescrFromType(NPY_DOUBLE);
-        if (dtype == NULL) {
+    npy_longlong value_buffer_stack[4] = {0};
+    char *value_buffer_heap = NULL;
+    char *value = (char *)value_buffer_stack;
+    PyArray_Descr *descr = PyArray_DESCR(arr);
+
+    if ((size_t)descr->elsize > sizeof(value_buffer_stack)) {
+        /* Large temporary buffer; PyMem_Calloc pairs with PyMem_FREE below. */
+        value_buffer_heap = PyMem_Calloc(1, descr->elsize);
+        if (value_buffer_heap == NULL) {
+            PyErr_NoMemory();
             return -1;
         }
+        value = value_buffer_heap;
     }
-    /* Python complex */
-    else if (PyComplex_Check(obj)) {
-        npy_double re, im;
-
-        re = PyComplex_RealAsDouble(obj);
-        if (error_converting(re)) {
-            return -1;
-        }
-        im = PyComplex_ImagAsDouble(obj);
-        if (error_converting(im)) {
-            return -1;
-        }
-        value = (char *)value_buffer;
-        ((npy_double *)value)[0] = re;
-        ((npy_double *)value)[1] = im;
-
-        dtype = PyArray_DescrFromType(NPY_CDOUBLE);
-        if (dtype == NULL) {
-            return -1;
-        }
-    }
-
-    /* Use the value pointer we got if possible */
-    if (value != NULL) {
-        /* TODO: switch to SAME_KIND casting */
-        retcode = PyArray_AssignRawScalar(arr, dtype, value,
-                                NULL, NPY_UNSAFE_CASTING);
-        Py_DECREF(dtype);
-        return retcode;
+    if (PyArray_Pack(descr, value, obj) < 0) {
+        PyMem_FREE(value_buffer_heap);
+        return -1;
     }
-    /* Otherwise convert to an array to do the assignment */
-    else {
-        PyArrayObject *src_arr;
 
-        /**
-         * The dtype of the destination is used when converting
-         * from the pyobject, so that for example a tuple gets
-         * recognized as a struct scalar of the required type.
-         */
-        Py_INCREF(PyArray_DTYPE(arr));
-        src_arr = (PyArrayObject *)PyArray_FromAny(obj,
-                        PyArray_DTYPE(arr), 0, 0, 0, NULL);
-        if (src_arr == NULL) {
-            return -1;
-        }
-
-        if (PyArray_NDIM(src_arr) != 0) {
-            PyErr_SetString(PyExc_ValueError,
-                    "Input object to FillWithScalar is not a scalar");
-            Py_DECREF(src_arr);
-            return -1;
-        }
-
-        retcode = PyArray_CopyInto(arr, src_arr);
+    /*
+     * There is no cast anymore, the above already coerced using scalar
+     * coercion rules
+     */
+    int retcode = raw_array_assign_scalar(
+            PyArray_NDIM(arr), PyArray_DIMS(arr), descr,
+            PyArray_BYTES(arr), PyArray_STRIDES(arr),
+            descr, value);
 
-        Py_DECREF(src_arr);
-        return retcode;
-    }
+    PyMem_FREE(value_buffer_heap);
+    return retcode;
 }
 
 /*
diff --git a/numpy/core/src/multiarray/convert_datatype.c b/numpy/core/src/multiarray/convert_datatype.c
index 8d0a4cd56..bc8a3bf88 100644
--- a/numpy/core/src/multiarray/convert_datatype.c
+++ b/numpy/core/src/multiarray/convert_datatype.c
@@ -1691,8 +1691,12 @@ PyArray_ResultType(
             all_DTypes[i_all] = &PyArray_PyComplexAbstractDType;
         }
         else {
-            /* N.B.: Could even be an object dtype here for large ints */
+            /* This could even be an object dtype here for large ints */
             all_DTypes[i_all] = &PyArray_PyIntAbstractDType;
+            if (PyArray_TYPE(arrs[i]) != NPY_LONG) {
+                /* Not a "normal" scalar, so we cannot avoid the legacy path */
+                all_pyscalar = 0;
+            }
         }
         Py_INCREF(all_DTypes[i_all]);
         /*
@@ -3042,26 +3046,22 @@ nonstructured_to_structured_get_loop(
         NPY_ARRAYMETHOD_FLAGS *flags)
 {
     if (context->descriptors[1]->names != NULL) {
-        int needs_api = 0;
         if (get_fields_transfer_function(
                 aligned, strides[0], strides[1],
                 context->descriptors[0], context->descriptors[1],
                 move_references, out_loop, out_transferdata,
-                &needs_api) == NPY_FAIL) {
+                flags) == NPY_FAIL) {
             return -1;
         }
-        *flags = needs_api ? NPY_METH_REQUIRES_PYAPI : 0;
     }
     else if (context->descriptors[1]->subarray != NULL) {
-        int needs_api = 0;
         if (get_subarray_transfer_function(
                 aligned, strides[0], strides[1],
                 context->descriptors[0], context->descriptors[1],
                 move_references, out_loop, out_transferdata,
-                &needs_api) == NPY_FAIL) {
+                flags) == NPY_FAIL) {
             return -1;
         }
-        *flags = needs_api ? NPY_METH_REQUIRES_PYAPI : 0;
     }
     else {
         /*
@@ -3204,26 +3204,22 @@ structured_to_nonstructured_get_loop(
         NPY_ARRAYMETHOD_FLAGS *flags)
 {
     if (context->descriptors[0]->names != NULL) {
-        int needs_api = 0;
         if (get_fields_transfer_function(
                 aligned, strides[0], strides[1],
                 context->descriptors[0], context->descriptors[1],
                 move_references, out_loop, out_transferdata,
-                &needs_api) == NPY_FAIL) {
+                flags) == NPY_FAIL) {
             return -1;
         }
-        *flags = needs_api ? NPY_METH_REQUIRES_PYAPI : 0;
     }
     else if (context->descriptors[0]->subarray != NULL) {
-        int needs_api = 0;
         if (get_subarray_transfer_function(
                 aligned, strides[0], strides[1],
                 context->descriptors[0], context->descriptors[1],
                 move_references, out_loop, out_transferdata,
-                &needs_api) == NPY_FAIL) {
+                flags) == NPY_FAIL) {
             return -1;
         }
-        *flags = needs_api ? NPY_METH_REQUIRES_PYAPI : 0;
     }
     else {
         /*
@@ -3513,27 +3509,23 @@ void_to_void_get_loop(
 {
     if (context->descriptors[0]->names != NULL ||
             context->descriptors[1]->names != NULL) {
-        int needs_api = 0;
         if (get_fields_transfer_function(
                 aligned, strides[0], strides[1],
                 context->descriptors[0], context->descriptors[1],
                 move_references, out_loop, out_transferdata,
-                &needs_api) == NPY_FAIL) {
+                flags) == NPY_FAIL) {
             return -1;
         }
-        *flags = needs_api ? NPY_METH_REQUIRES_PYAPI : 0;
     }
     else if (context->descriptors[0]->subarray != NULL ||
              context->descriptors[1]->subarray != NULL) {
-        int needs_api = 0;
         if (get_subarray_transfer_function(
                 aligned, strides[0], strides[1],
                 context->descriptors[0], context->descriptors[1],
                 move_references, out_loop, out_transferdata,
-                &needs_api) == NPY_FAIL) {
+                flags) == NPY_FAIL) {
             return -1;
         }
-        *flags = needs_api ? NPY_METH_REQUIRES_PYAPI : 0;
     }
     else {
         /*
@@ -3546,7 +3538,7 @@ void_to_void_get_loop(
                 out_loop, out_transferdata) == NPY_FAIL) {
             return -1;
         }
-        *flags = 0;
+        *flags = PyArrayMethod_MINIMAL_FLAGS;
     }
     return 0;
 }
diff --git a/numpy/core/src/multiarray/convert_datatype.h b/numpy/core/src/multiarray/convert_datatype.h
index d1865d1c2..af6d790cf 100644
--- a/numpy/core/src/multiarray/convert_datatype.h
+++ b/numpy/core/src/multiarray/convert_datatype.h
@@ -3,6 +3,10 @@
 
 #include "array_method.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 extern NPY_NO_EXPORT npy_intp REQUIRED_STR_LEN[];
 
 NPY_NO_EXPORT PyObject *
@@ -34,7 +38,7 @@ dtype_kind_to_ordering(char kind);
 /* Used by PyArray_CanCastArrayTo and in the legacy ufunc type resolution */
 NPY_NO_EXPORT npy_bool
 can_cast_scalar_to(PyArray_Descr *scal_type, char *scal_data,
-                    PyArray_Descr *to, NPY_CASTING casting);
+                   PyArray_Descr *to, NPY_CASTING casting);
 
 NPY_NO_EXPORT int
 should_use_min_scalar(npy_intp narrs, PyArrayObject **arr,
@@ -59,7 +63,7 @@ NPY_NO_EXPORT int
 PyArray_AddCastingImplementation(PyBoundArrayMethodObject *meth);
 
 NPY_NO_EXPORT int
-PyArray_AddCastingImplementation_FromSpec(PyArrayMethod_Spec *spec, int private);
+PyArray_AddCastingImplementation_FromSpec(PyArrayMethod_Spec *spec, int private_);
 
 NPY_NO_EXPORT NPY_CASTING
 PyArray_MinCastSafety(NPY_CASTING casting1, NPY_CASTING casting2);
@@ -99,4 +103,8 @@ simple_cast_resolve_descriptors(
 NPY_NO_EXPORT int
 PyArray_InitializeCasts(void);
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif  /* NUMPY_CORE_SRC_MULTIARRAY_CONVERT_DATATYPE_H_ */
diff --git a/numpy/core/src/multiarray/ctors.c b/numpy/core/src/multiarray/ctors.c
index c780f4b2b..c3d66dd6b 100644
--- a/numpy/core/src/multiarray/ctors.c
+++ b/numpy/core/src/multiarray/ctors.c
@@ -1,5 +1,6 @@
 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
 #define _MULTIARRAYMODULE
+#define _UMATHMODULE
 
 #define PY_SSIZE_T_CLEAN
 #include <Python.h>
@@ -33,6 +34,8 @@
 #include "get_attr_string.h"
 #include "array_coercion.h"
 
+#include "umathmodule.h"
+
 /*
  * Reading from a file or a string.
  *
@@ -465,55 +468,12 @@ PyArray_AssignFromCache_Recursive(
         PyArrayObject *self, const int ndim, coercion_cache_obj **cache)
 {
     /* Consume first cache element by extracting information and freeing it */
-    PyObject *original_obj = (*cache)->converted_obj;
     PyObject *obj = (*cache)->arr_or_sequence;
     Py_INCREF(obj);
     npy_bool sequence = (*cache)->sequence;
     int depth = (*cache)->depth;
     *cache = npy_unlink_coercion_cache(*cache);
 
-    /*
-     * The maximum depth is special (specifically for objects), but usually
-     * unrolled in the sequence branch below.
-     */
-    if (NPY_UNLIKELY(depth == ndim)) {
-        /*
-         * We have reached the maximum depth. We should simply assign to the
-         * element in principle. There is one exception. If this is a 0-D
-         * array being stored into a 0-D array (but we do not reach here then).
-         */
-        if (PyArray_ISOBJECT(self)) {
-            assert(ndim != 0);  /* guaranteed by PyArray_AssignFromCache */
-            assert(PyArray_NDIM(self) == 0);
-            Py_DECREF(obj);
-            return PyArray_Pack(PyArray_DESCR(self), PyArray_BYTES(self),
-                                original_obj);
-        }
-        if (sequence) {
-            /*
-             * Sanity check which may be removed, the error is raised already
-             * in `PyArray_DiscoverDTypeAndShape`.
-             */
-            assert(0);
-            PyErr_SetString(PyExc_RuntimeError,
-                    "setting an array element with a sequence");
-            goto fail;
-        }
-        else if (original_obj != obj || !PyArray_CheckExact(obj)) {
-            /*
-             * If the leave node is an array-like, but not a numpy array,
-             * we pretend it is an arbitrary scalar.  This means that in
-             * most cases (where the dtype is int or float), we will end
-             * up using float(array-like), or int(array-like).  That does
-             * not support general casting, but helps Quantity and masked
-             * arrays, because it allows them to raise an error when
-             * `__float__()` or `__int__()` is called.
-             */
-            Py_DECREF(obj);
-            return PyArray_SETITEM(self, PyArray_BYTES(self), original_obj);
-        }
-    }
-
     /* The element is either a sequence, or an array */
     if (!sequence) {
         /* Straight forward array assignment */
@@ -535,20 +495,24 @@ PyArray_AssignFromCache_Recursive(
         for (npy_intp i = 0; i < length; i++) {
             PyObject *value = PySequence_Fast_GET_ITEM(obj, i);
 
-            if (*cache == NULL || (*cache)->converted_obj != value ||
-                        (*cache)->depth != depth + 1) {
-                if (ndim != depth + 1) {
-                    PyErr_SetString(PyExc_RuntimeError,
-                            "Inconsistent object during array creation? "
-                            "Content of sequences changed (now too shallow).");
-                    goto fail;
-                }
-                /* Straight forward assignment of elements */
+            if (ndim == depth + 1) {
+                /*
+                 * Straight forward assignment of elements.  Note that it is
+                 * possible for such an element to be a 0-D array or array-like.
+                 * `PyArray_Pack` supports arrays as well as we want: We
+                 * support exact NumPy arrays, but at this point ignore others.
+                 * (Please see the `PyArray_Pack` function comment if this
+                 * rightly confuses you.)
+                 */
                 char *item;
                 item = (PyArray_BYTES(self) + i * PyArray_STRIDES(self)[0]);
                 if (PyArray_Pack(PyArray_DESCR(self), item, value) < 0) {
                     goto fail;
                 }
+                /* If this was an array(-like) we still need to unlink it: */
+                if (*cache != NULL && (*cache)->converted_obj == value) {
+                    *cache = npy_unlink_coercion_cache(*cache);
+                }
             }
             else {
                 PyArrayObject *view;
@@ -2780,18 +2744,22 @@ PyArray_CopyAsFlat(PyArrayObject *dst, PyArrayObject *src, NPY_ORDER order)
      * contiguous strides, etc.
      */
     NPY_cast_info cast_info;
+    NPY_ARRAYMETHOD_FLAGS flags;
     if (PyArray_GetDTypeTransferFunction(
                     IsUintAligned(src) && IsAligned(src) &&
                     IsUintAligned(dst) && IsAligned(dst),
                     src_stride, dst_stride,
                     PyArray_DESCR(src), PyArray_DESCR(dst),
                     0,
-                    &cast_info, &needs_api) != NPY_SUCCEED) {
+                    &cast_info, &flags) != NPY_SUCCEED) {
         NpyIter_Deallocate(dst_iter);
         NpyIter_Deallocate(src_iter);
         return -1;
     }
-
+    needs_api |= (flags & NPY_METH_REQUIRES_PYAPI) != 0;
+    if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
+        npy_clear_floatstatus_barrier((char *)src_iter);
+    }
     if (!needs_api) {
         NPY_BEGIN_THREADS;
     }
@@ -2843,8 +2811,20 @@ PyArray_CopyAsFlat(PyArrayObject *dst, PyArrayObject *src, NPY_ORDER order)
     NPY_END_THREADS;
 
     NPY_cast_info_xfree(&cast_info);
-    NpyIter_Deallocate(dst_iter);
-    NpyIter_Deallocate(src_iter);
+    if (!NpyIter_Deallocate(dst_iter)) {
+        res = -1;
+    }
+    if (!NpyIter_Deallocate(src_iter)) {
+        res = -1;
+    }
+
+    if (res == 0 && !(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
+        int fpes = npy_get_floatstatus_barrier((char *)src_iter);
+        if (fpes && PyUFunc_GiveFloatingpointErrors("cast", fpes) < 0) {
+            return -1;
+        }
+    }
+
     return res;
 }
 
diff --git a/numpy/core/src/multiarray/dtype_transfer.c b/numpy/core/src/multiarray/dtype_transfer.c
index 18de5d132..f8458d2d7 100644
--- a/numpy/core/src/multiarray/dtype_transfer.c
+++ b/numpy/core/src/multiarray/dtype_transfer.c
@@ -11,12 +11,14 @@
  */
 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
 #define _MULTIARRAYMODULE
+#define _UMATHMODULE
 
 #define PY_SSIZE_T_CLEAN
 #include <Python.h>
 #include <structmember.h>
 
 #include "numpy/arrayobject.h"
+#include "numpy/npy_math.h"
 
 #include "lowlevel_strided_loops.h"
 #include "npy_pycompat.h"
@@ -35,6 +37,8 @@
 #include "array_method.h"
 #include "array_coercion.h"
 
+#include "umathmodule.h"
+
 #define NPY_LOWLEVEL_BUFFER_BLOCKSIZE  128
 
 /********** PRINTF DEBUG TRACING **************/
@@ -1506,7 +1510,7 @@ get_one_to_n_transfer_function(int aligned,
                             npy_intp N,
                             PyArrayMethod_StridedLoop **out_stransfer,
                             NpyAuxData **out_transferdata,
-                            int *out_needs_api)
+                            NPY_ARRAYMETHOD_FLAGS *out_flags)
 {
     _one_to_n_data *data = PyMem_Malloc(sizeof(_one_to_n_data));
     if (data == NULL) {
@@ -1530,18 +1534,19 @@ get_one_to_n_transfer_function(int aligned,
                     src_dtype, dst_dtype,
                     0,
                     &data->wrapped,
-                    out_needs_api) != NPY_SUCCEED) {
+                    out_flags) != NPY_SUCCEED) {
         NPY_AUXDATA_FREE((NpyAuxData *)data);
         return NPY_FAIL;
     }
 
     /* If the src object will need a DECREF, set src_dtype */
     if (move_references && PyDataType_REFCHK(src_dtype)) {
+        *out_flags |= NPY_METH_REQUIRES_PYAPI;
         if (get_decref_transfer_function(aligned,
                             src_stride,
                             src_dtype,
                             &data->decref_src,
-                            out_needs_api) != NPY_SUCCEED) {
+                            NULL) != NPY_SUCCEED) {
             NPY_AUXDATA_FREE((NpyAuxData *)data);
             return NPY_FAIL;
         }
@@ -1667,7 +1672,7 @@ get_n_to_n_transfer_function(int aligned,
                             npy_intp N,
                             PyArrayMethod_StridedLoop **out_stransfer,
                             NpyAuxData **out_transferdata,
-                            int *out_needs_api)
+                            NPY_ARRAYMETHOD_FLAGS *out_flags)
 {
     _n_to_n_data *data = PyMem_Malloc(sizeof(_n_to_n_data));
     if (data == NULL) {
@@ -1699,7 +1704,7 @@ get_n_to_n_transfer_function(int aligned,
                     src_dtype, dst_dtype,
                     move_references,
                     &data->wrapped,
-                    out_needs_api) != NPY_SUCCEED) {
+                    out_flags) != NPY_SUCCEED) {
         NPY_AUXDATA_FREE((NpyAuxData *)data);
         return NPY_FAIL;
     }
@@ -1913,7 +1918,7 @@ get_subarray_broadcast_transfer_function(int aligned,
                             int move_references,
                             PyArrayMethod_StridedLoop **out_stransfer,
                             NpyAuxData **out_transferdata,
-                            int *out_needs_api)
+                            NPY_ARRAYMETHOD_FLAGS *out_flags)
 {
     _subarray_broadcast_data *data;
     npy_intp structsize, loop_index, run, run_size,
@@ -1946,7 +1951,7 @@ get_subarray_broadcast_transfer_function(int aligned,
                     src_dtype, dst_dtype,
                     0,
                     &data->wrapped,
-                    out_needs_api) != NPY_SUCCEED) {
+                    out_flags) != NPY_SUCCEED) {
         NPY_AUXDATA_FREE((NpyAuxData *)data);
         return NPY_FAIL;
     }
@@ -1958,7 +1963,7 @@ get_subarray_broadcast_transfer_function(int aligned,
                         src_dtype, NULL,
                         1,
                         &data->decref_src,
-                        out_needs_api) != NPY_SUCCEED) {
+                        out_flags) != NPY_SUCCEED) {
             NPY_AUXDATA_FREE((NpyAuxData *)data);
             return NPY_FAIL;
         }
@@ -1971,7 +1976,7 @@ get_subarray_broadcast_transfer_function(int aligned,
                         dst_dtype, NULL,
                         1,
                         &data->decref_dst,
-                        out_needs_api) != NPY_SUCCEED) {
+                        out_flags) != NPY_SUCCEED) {
             NPY_AUXDATA_FREE((NpyAuxData *)data);
             return NPY_FAIL;
         }
@@ -2087,7 +2092,7 @@ get_subarray_transfer_function(int aligned,
                             int move_references,
                             PyArrayMethod_StridedLoop **out_stransfer,
                             NpyAuxData **out_transferdata,
-                            int *out_needs_api)
+                            NPY_ARRAYMETHOD_FLAGS *out_flags)
 {
     PyArray_Dims src_shape = {NULL, -1}, dst_shape = {NULL, -1};
     npy_intp src_size = 1, dst_size = 1;
@@ -2132,7 +2137,7 @@ get_subarray_transfer_function(int aligned,
                         move_references,
                         src_size,
                         out_stransfer, out_transferdata,
-                        out_needs_api);
+                        out_flags);
     }
     /* Copy the src value to all the dst values */
     else if (src_size == 1) {
@@ -2145,7 +2150,7 @@ get_subarray_transfer_function(int aligned,
                 move_references,
                 dst_size,
                 out_stransfer, out_transferdata,
-                out_needs_api);
+                out_flags);
     }
     /*
      * Copy the subarray with broadcasting, truncating, and zero-padding
@@ -2159,7 +2164,7 @@ get_subarray_transfer_function(int aligned,
                         src_shape, dst_shape,
                         move_references,
                         out_stransfer, out_transferdata,
-                        out_needs_api);
+                        out_flags);
 
         npy_free_cache_dim_obj(src_shape);
         npy_free_cache_dim_obj(dst_shape);
@@ -2277,7 +2282,7 @@ get_fields_transfer_function(int NPY_UNUSED(aligned),
                             int move_references,
                             PyArrayMethod_StridedLoop **out_stransfer,
                             NpyAuxData **out_transferdata,
-                            int *out_needs_api)
+                            NPY_ARRAYMETHOD_FLAGS *out_flags)
 {
     PyObject *key, *tup, *title;
     PyArray_Descr *src_fld_dtype, *dst_fld_dtype;
@@ -2308,6 +2313,7 @@ get_fields_transfer_function(int NPY_UNUSED(aligned),
         data->base.clone = &_field_transfer_data_clone;
         data->field_count = 0;
 
+        *out_flags = PyArrayMethod_MINIMAL_FLAGS;
         for (i = 0; i < field_count; ++i) {
             key = PyTuple_GET_ITEM(dst_dtype->names, i);
             tup = PyDict_GetItem(dst_dtype->fields, key);
@@ -2316,15 +2322,17 @@ get_fields_transfer_function(int NPY_UNUSED(aligned),
                 PyMem_Free(data);
                 return NPY_FAIL;
             }
+            NPY_ARRAYMETHOD_FLAGS field_flags;
             if (PyArray_GetDTypeTransferFunction(0,
                                     src_stride, dst_stride,
                                     src_dtype, dst_fld_dtype,
                                     0,
                                     &data->fields[i].info,
-                                    out_needs_api) != NPY_SUCCEED) {
+                                    &field_flags) != NPY_SUCCEED) {
                 NPY_AUXDATA_FREE((NpyAuxData *)data);
                 return NPY_FAIL;
             }
+            *out_flags = PyArrayMethod_COMBINED_FLAGS(*out_flags, field_flags);
             data->fields[i].src_offset = 0;
             data->fields[i].dst_offset = dst_offset;
             data->field_count++;
@@ -2336,11 +2344,12 @@ get_fields_transfer_function(int NPY_UNUSED(aligned),
          * input, the second one (normally output) just does not matter here.
          */
         if (move_references && PyDataType_REFCHK(src_dtype)) {
+            *out_flags |= NPY_METH_REQUIRES_PYAPI;
             if (get_decref_transfer_function(0,
                                     src_stride,
                                     src_dtype,
                                     &data->fields[field_count].info,
-                                    out_needs_api) != NPY_SUCCEED) {
+                                    NULL) != NPY_SUCCEED) {
                 NPY_AUXDATA_FREE((NpyAuxData *)data);
                 return NPY_FAIL;
             }
@@ -2388,7 +2397,7 @@ get_fields_transfer_function(int NPY_UNUSED(aligned),
                                              src_fld_dtype, dst_dtype,
                                              move_references,
                                              &data->fields[0].info,
-                                             out_needs_api) != NPY_SUCCEED) {
+                                             out_flags) != NPY_SUCCEED) {
             PyMem_Free(data);
             return NPY_FAIL;
         }
@@ -2423,6 +2432,7 @@ get_fields_transfer_function(int NPY_UNUSED(aligned),
     data->base.clone = &_field_transfer_data_clone;
     data->field_count = 0;
 
+    *out_flags = PyArrayMethod_MINIMAL_FLAGS;
     /* set up the transfer function for each field */
     for (i = 0; i < field_count; ++i) {
         key = PyTuple_GET_ITEM(dst_dtype->names, i);
@@ -2440,15 +2450,17 @@ get_fields_transfer_function(int NPY_UNUSED(aligned),
             return NPY_FAIL;
         }
 
+        NPY_ARRAYMETHOD_FLAGS field_flags;
         if (PyArray_GetDTypeTransferFunction(0,
                                              src_stride, dst_stride,
                                              src_fld_dtype, dst_fld_dtype,
                                              move_references,
                                              &data->fields[i].info,
-                                             out_needs_api) != NPY_SUCCEED) {
+                                             &field_flags) != NPY_SUCCEED) {
             NPY_AUXDATA_FREE((NpyAuxData *)data);
             return NPY_FAIL;
         }
+        *out_flags = PyArrayMethod_COMBINED_FLAGS(*out_flags, field_flags);
         data->fields[i].src_offset = src_offset;
         data->fields[i].dst_offset = dst_offset;
         data->field_count++;
@@ -2748,11 +2760,12 @@ get_decref_transfer_function(int aligned,
         src_size = PyArray_MultiplyList(src_shape.ptr, src_shape.len);
         npy_free_cache_dim_obj(src_shape);
 
+        NPY_ARRAYMETHOD_FLAGS ignored_flags;
         if (get_n_to_n_transfer_function(aligned,
                 src_stride, 0,
                 src_dtype->subarray->base, NULL, 1, src_size,
                 &cast_info->func, &cast_info->auxdata,
-                out_needs_api) != NPY_SUCCEED) {
+                &ignored_flags) != NPY_SUCCEED) {
             return NPY_FAIL;
         }
 
@@ -3098,7 +3111,7 @@ define_cast_for_descrs(
         npy_intp src_stride, npy_intp dst_stride,
         PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
         int move_references,
-        NPY_cast_info *cast_info, int *out_needs_api)
+        NPY_cast_info *cast_info, NPY_ARRAYMETHOD_FLAGS *out_flags)
 {
     /* Storage for all cast info in case multi-step casting is necessary */
     _multistep_castdata castdata;
@@ -3109,6 +3122,7 @@ define_cast_for_descrs(
     /* `view_offset` passed to `init_cast_info` but unused for the main cast */
     npy_intp view_offset = NPY_MIN_INTP;
     NPY_CASTING casting = -1;
+    *out_flags = PyArrayMethod_MINIMAL_FLAGS;
 
     if (init_cast_info(
             cast_info, &casting, &view_offset, src_dtype, dst_dtype, 1) < 0) {
@@ -3159,7 +3173,7 @@ define_cast_for_descrs(
             }
             assert(castdata.from.func != NULL);
 
-            *out_needs_api |= (flags & NPY_METH_REQUIRES_PYAPI) != 0;
+            *out_flags = PyArrayMethod_COMBINED_FLAGS(*out_flags, flags);
             /* The main cast now uses a buffered input: */
             src_stride = strides[1];
             move_references = 1;  /* main cast has to clear the buffer */
@@ -3198,7 +3212,7 @@ define_cast_for_descrs(
             }
             assert(castdata.to.func != NULL);
 
-            *out_needs_api |= (flags & NPY_METH_REQUIRES_PYAPI) != 0;
+            *out_flags = PyArrayMethod_COMBINED_FLAGS(*out_flags, flags);
             /* The main cast now uses a buffered input: */
             dst_stride = strides[0];
             if (castdata.from.func != NULL) {
@@ -3219,7 +3233,7 @@ define_cast_for_descrs(
         goto fail;
     }
 
-    *out_needs_api |= (flags & NPY_METH_REQUIRES_PYAPI) != 0;
+    *out_flags = PyArrayMethod_COMBINED_FLAGS(*out_flags, flags);
 
     if (castdata.from.func == NULL && castdata.to.func == NULL) {
         /* Most of the time, there will be only one step required. */
@@ -3256,7 +3270,7 @@ PyArray_GetDTypeTransferFunction(int aligned,
                             PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
                             int move_references,
                             NPY_cast_info *cast_info,
-                            int *out_needs_api)
+                            NPY_ARRAYMETHOD_FLAGS *out_flags)
 {
     assert(src_dtype != NULL);
 
@@ -3271,17 +3285,24 @@ PyArray_GetDTypeTransferFunction(int aligned,
      */
     if (dst_dtype == NULL) {
         assert(move_references);
-        return get_decref_transfer_function(aligned,
+        int needs_api = 0;
+        int res = get_decref_transfer_function(aligned,
                                 src_dtype->elsize,
                                 src_dtype,
                                 cast_info,
-                                out_needs_api);
+                                &needs_api);
+        /* decref'ing never creates floating point errors, so just ignore it */
+        *out_flags = PyArrayMethod_MINIMAL_FLAGS;
+        if (needs_api) {
+            *out_flags |= NPY_METH_REQUIRES_PYAPI;
+        }
+        return res;
     }
 
     if (define_cast_for_descrs(aligned,
             src_stride, dst_stride,
             src_dtype, dst_dtype, move_references,
-            cast_info, out_needs_api) < 0) {
+            cast_info, out_flags) < 0) {
         return NPY_FAIL;
     }
 
@@ -3353,21 +3374,29 @@ wrap_aligned_transferfunction(
      *       have an explicit implementation instead if we want performance.
      */
     if (must_wrap || src_wrapped_dtype != src_dtype) {
+        NPY_ARRAYMETHOD_FLAGS flags;
         if (PyArray_GetDTypeTransferFunction(aligned,
                 src_stride, castdata.main.descriptors[0]->elsize,
                 src_dtype, castdata.main.descriptors[0], 0,
-                &castdata.from, out_needs_api) != NPY_SUCCEED) {
+                &castdata.from, &flags) != NPY_SUCCEED) {
             goto fail;
         }
+        if (flags & NPY_METH_REQUIRES_PYAPI) {
+            *out_needs_api = 1;
+        }
     }
     if (must_wrap || dst_wrapped_dtype != dst_dtype) {
+        NPY_ARRAYMETHOD_FLAGS flags;
         if (PyArray_GetDTypeTransferFunction(aligned,
                 castdata.main.descriptors[1]->elsize, dst_stride,
                 castdata.main.descriptors[1], dst_dtype,
                 1,  /* clear buffer if it includes references */
-                &castdata.to, out_needs_api) != NPY_SUCCEED) {
+                &castdata.to, &flags) != NPY_SUCCEED) {
             goto fail;
         }
+        if (flags & NPY_METH_REQUIRES_PYAPI) {
+            *out_needs_api = 1;
+        }
     }
 
     *out_transferdata = _multistep_cast_auxdata_clone_int(&castdata, 1);
@@ -3492,7 +3521,7 @@ PyArray_GetMaskedDTypeTransferFunction(int aligned,
                             PyArray_Descr *mask_dtype,
                             int move_references,
                             NPY_cast_info *cast_info,
-                            int *out_needs_api)
+                            NPY_ARRAYMETHOD_FLAGS *out_flags)
 {
     NPY_cast_info_init(cast_info);
 
@@ -3520,18 +3549,19 @@ PyArray_GetMaskedDTypeTransferFunction(int aligned,
                                 src_dtype, dst_dtype,
                                 move_references,
                                 &data->wrapped,
-                                out_needs_api) != NPY_SUCCEED) {
+                                out_flags) != NPY_SUCCEED) {
         PyMem_Free(data);
         return NPY_FAIL;
     }
 
     /* If the src object will need a DECREF, get a function to handle that */
     if (move_references && PyDataType_REFCHK(src_dtype)) {
+        *out_flags |= NPY_METH_REQUIRES_PYAPI;
         if (get_decref_transfer_function(aligned,
                             src_stride,
                             src_dtype,
                             &data->decref_src,
-                            out_needs_api) != NPY_SUCCEED) {
+                            NULL) != NPY_SUCCEED) {
             NPY_AUXDATA_FREE((NpyAuxData *)data);
             return NPY_FAIL;
         }
@@ -3562,7 +3592,7 @@ PyArray_CastRawArrays(npy_intp count,
                       PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
                       int move_references)
 {
-    int aligned = 1, needs_api = 0;
+    int aligned;
 
     /* Make sure the copy is reasonable */
     if (dst_stride == 0 && count > 1) {
@@ -3586,15 +3616,20 @@ PyArray_CastRawArrays(npy_intp count,
 
     /* Get the function to do the casting */
     NPY_cast_info cast_info;
+    NPY_ARRAYMETHOD_FLAGS flags;
     if (PyArray_GetDTypeTransferFunction(aligned,
                         src_stride, dst_stride,
                         src_dtype, dst_dtype,
                         move_references,
                         &cast_info,
-                        &needs_api) != NPY_SUCCEED) {
+                        &flags) != NPY_SUCCEED) {
         return NPY_FAIL;
     }
 
+    if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
+        npy_clear_floatstatus_barrier((char*)&cast_info);
+    }
+
     /* Cast */
     char *args[2] = {src, dst};
     npy_intp strides[2] = {src_stride, dst_stride};
@@ -3603,8 +3638,16 @@ PyArray_CastRawArrays(npy_intp count,
     /* Cleanup */
     NPY_cast_info_xfree(&cast_info);
 
-    /* If needs_api was set to 1, it may have raised a Python exception */
-    return (needs_api && PyErr_Occurred()) ? NPY_FAIL : NPY_SUCCEED;
+    if (flags & NPY_METH_REQUIRES_PYAPI && PyErr_Occurred()) {
+        return NPY_FAIL;
+    }
+    if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
+        int fpes = npy_get_floatstatus_barrier(*args);
+        if (fpes && PyUFunc_GiveFloatingpointErrors("cast", fpes) < 0) {
+            return NPY_FAIL;
+        }
+    }
+    return NPY_SUCCEED;
 }
 
 /*
diff --git a/numpy/core/src/multiarray/dtypemeta.c b/numpy/core/src/multiarray/dtypemeta.c
index 577478d2a..cc99a3eca 100644
--- a/numpy/core/src/multiarray/dtypemeta.c
+++ b/numpy/core/src/multiarray/dtypemeta.c
@@ -613,6 +613,7 @@ string_unicode_common_dtype(PyArray_DTypeMeta *cls, PyArray_DTypeMeta *other)
     return cls;
 }
 
+
 static PyArray_DTypeMeta *
 datetime_common_dtype(PyArray_DTypeMeta *cls, PyArray_DTypeMeta *other)
 {
diff --git a/numpy/core/src/multiarray/dtypemeta.h b/numpy/core/src/multiarray/dtypemeta.h
index e7d5505d8..618491c98 100644
--- a/numpy/core/src/multiarray/dtypemeta.h
+++ b/numpy/core/src/multiarray/dtypemeta.h
@@ -1,6 +1,9 @@
 #ifndef NUMPY_CORE_SRC_MULTIARRAY_DTYPEMETA_H_
 #define NUMPY_CORE_SRC_MULTIARRAY_DTYPEMETA_H_
 
+#ifdef __cplusplus
+extern "C" {
+#endif
 
 /* DType flags, currently private, since we may just expose functions */
 #define NPY_DT_LEGACY 1 << 0
@@ -126,4 +129,8 @@ python_builtins_are_known_scalar_types(
 NPY_NO_EXPORT int
 dtypemeta_wrap_legacy_descriptor(PyArray_Descr *dtypem);
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif  /* NUMPY_CORE_SRC_MULTIARRAY_DTYPEMETA_H_ */
diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src
index 3114a5896..e7b2f2c2c 100644
--- a/numpy/core/src/multiarray/einsum_sumprod.c.src
+++ b/numpy/core/src/multiarray/einsum_sumprod.c.src
@@ -68,7 +68,7 @@
  *            0*3#
  * #NPYV_CHK = 0*5,
  *             0*5,
- *             0, NPY_SIMD, NPY_SIMD_F64, 0,
+ *             0, NPY_SIMD_F32, NPY_SIMD_F64, 0,
  *             0*3#
  */
 
diff --git a/numpy/core/src/multiarray/experimental_public_dtype_api.c b/numpy/core/src/multiarray/experimental_public_dtype_api.c
index cf5f152ab..441dbdc1f 100644
--- a/numpy/core/src/multiarray/experimental_public_dtype_api.c
+++ b/numpy/core/src/multiarray/experimental_public_dtype_api.c
@@ -300,37 +300,13 @@ PyArrayInitDTypeMeta_FromSpec(
 }
 
 
-/* Function is defined in umath/dispatching.c (same/one compilation unit) */
+/* Functions defined in umath/dispatching.c (same/one compilation unit) */
 NPY_NO_EXPORT int
 PyUFunc_AddLoop(PyUFuncObject *ufunc, PyObject *info, int ignore_duplicate);
 
-static int
-PyUFunc_AddLoopFromSpec(PyObject *ufunc, PyArrayMethod_Spec *spec)
-{
-    if (!PyObject_TypeCheck(ufunc, &PyUFunc_Type)) {
-        PyErr_SetString(PyExc_TypeError,
-                "ufunc object passed is not a ufunc!");
-        return -1;
-    }
-    PyBoundArrayMethodObject *bmeth =
-            (PyBoundArrayMethodObject *)PyArrayMethod_FromSpec(spec);
-    if (bmeth == NULL) {
-        return -1;
-    }
-    int nargs = bmeth->method->nin + bmeth->method->nout;
-    PyObject *dtypes = PyArray_TupleFromItems(
-            nargs, (PyObject **)bmeth->dtypes, 1);
-    if (dtypes == NULL) {
-        return -1;
-    }
-    PyObject *info = PyTuple_Pack(2, dtypes, bmeth->method);
-    Py_DECREF(bmeth);
-    Py_DECREF(dtypes);
-    if (info == NULL) {
-        return -1;
-    }
-    return PyUFunc_AddLoop((PyUFuncObject *)ufunc, info, 0);
-}
+NPY_NO_EXPORT int
+PyUFunc_AddLoopFromSpec(PyUFuncObject *ufunc, PyObject *info, int ignore_duplicate);
+
 
 /*
  * Function is defined in umath/wrapping_array_method.c
diff --git a/numpy/core/src/multiarray/iterators.c b/numpy/core/src/multiarray/iterators.c
index f959162fd..95aa11d2d 100644
--- a/numpy/core/src/multiarray/iterators.c
+++ b/numpy/core/src/multiarray/iterators.c
@@ -827,7 +827,8 @@ iter_ass_subscript(PyArrayIterObject *self, PyObject *ind, PyObject *val)
     if (PyBool_Check(ind)) {
         retval = 0;
         if (PyObject_IsTrue(ind)) {
-            retval = PyArray_Pack(PyArray_DESCR(self->ao), self->dataptr, val);
+            retval = PyArray_Pack(
+                    PyArray_DESCR(self->ao), self->dataptr, val);
         }
         goto finish;
     }
diff --git a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
index e313d2447..8e3afd3cc 100644
--- a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
+++ b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
@@ -13,6 +13,7 @@
 
 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
 #define _MULTIARRAYMODULE
+#define _UMATHMODULE
 #include <numpy/arrayobject.h>
 #include <numpy/npy_cpu.h>
 #include <numpy/halffloat.h>
@@ -22,6 +23,7 @@
 #include "array_method.h"
 #include "usertypes.h"
 
+#include "umathmodule.h"
 
 /*
  * x86 platform works with unaligned access but the compiler is allowed to
@@ -1557,14 +1559,16 @@ mapiter_trivial_@name@(PyArrayObject *self, PyArrayObject *ind,
  * General advanced indexing iteration.
  */
 NPY_NO_EXPORT int
-mapiter_@name@(PyArrayMapIterObject *mit)
+mapiter_@name@(
+        PyArrayMapIterObject *mit, NPY_cast_info *cast_info,
+        NPY_ARRAYMETHOD_FLAGS flags, int is_aligned)
 {
     npy_intp *counter, count;
-    int i, is_aligned;
+    int i;
 
     /* Cached mit info */
     int numiter = mit->numiter;
-    int needs_api = mit->needs_api;
+    int needs_api = (flags & NPY_METH_REQUIRES_PYAPI) != 0;
     /* Constant information */
     npy_intp fancy_dims[NPY_MAXDIMS];
     npy_intp fancy_strides[NPY_MAXDIMS];
@@ -1586,13 +1590,6 @@ mapiter_@name@(PyArrayMapIterObject *mit)
         fancy_strides[i] = mit->fancy_strides[i];
     }
 
-    /*
-     * Alignment information (swapping is never needed, since we buffer),
-     * could also check extra_op is buffered, but it should rarely matter.
-     */
-
-    is_aligned = IsUintAligned(array) && IsUintAligned(mit->extra_op);
-
     if (mit->size == 0) {
        return 0;
     }
@@ -1600,9 +1597,11 @@ mapiter_@name@(PyArrayMapIterObject *mit)
     if (mit->subspace_iter == NULL) {
         /*
          * Item by item copy situation, the operand is buffered
-         * so use copyswap.
+         * so use copyswap.  The iterator may not do any transfers, so may
+         * not have set `needs_api` yet, set it if necessary:
          */
-         PyArray_CopySwapFunc *copyswap = PyArray_DESCR(array)->f->copyswap;
+        needs_api |= PyDataType_REFCHK(PyArray_DESCR(array));
+        PyArray_CopySwapFunc *copyswap = PyArray_DESCR(array)->f->copyswap;
 
         /* We have only one iterator handling everything */
         counter = NpyIter_GetInnerLoopSizePtr(mit->outer);
@@ -1715,28 +1714,9 @@ mapiter_@name@(PyArrayMapIterObject *mit)
         int is_subiter_trivial = 0; /* has three states */
         npy_intp reset_offsets[2] = {0, 0};
 
-        /* Use strided transfer functions for the inner loop */
-        npy_intp fixed_strides[2];
-
-        /*
-         * Get a dtype transfer function, since there are no
-         * buffers, this is safe.
-         */
-        NpyIter_GetInnerFixedStrideArray(mit->subspace_iter, fixed_strides);
-
-        NPY_cast_info cast_info;
-        if (PyArray_GetDTypeTransferFunction(is_aligned,
-#if @isget@
-                        fixed_strides[0], fixed_strides[1],
-                        PyArray_DESCR(array), PyArray_DESCR(mit->extra_op),
-#else
-                        fixed_strides[1], fixed_strides[0],
-                         PyArray_DESCR(mit->extra_op), PyArray_DESCR(array),
-#endif
-                        0,
-                        &cast_info,
-                        &needs_api) != NPY_SUCCEED) {
-            return -1;
+        /* Note: it may make sense to refactor `needs_api` out in this branch */
+        if (flags & NPY_METH_REQUIRES_PYAPI) {
+            needs_api = 1;
         }
 
         counter = NpyIter_GetInnerLoopSizePtr(mit->subspace_iter);
@@ -1771,7 +1751,6 @@ mapiter_@name@(PyArrayMapIterObject *mit)
 #if @isget@ && @one_iter@
                     if (check_and_adjust_index(&indval, fancy_dims[i],
                                                iteraxis, _save) < 0 ) {
-                        NPY_cast_info_xfree(&cast_info);
                         return -1;
                     }
 #else
@@ -1803,7 +1782,6 @@ mapiter_@name@(PyArrayMapIterObject *mit)
                                                    &errmsg)) {
                         NPY_END_THREADS;
                         PyErr_SetString(PyExc_ValueError, errmsg);
-                        NPY_cast_info_xfree(&cast_info);
                         return -1;
                     }
                     if (is_subiter_trivial != 0) {
@@ -1833,7 +1811,6 @@ mapiter_@name@(PyArrayMapIterObject *mit)
                  *       not at all...
                  */
                 if (needs_api && PyErr_Occurred()) {
-                    NPY_cast_info_xfree(&cast_info);
                     return -1;
                 }
 #endif
@@ -1841,21 +1818,19 @@ mapiter_@name@(PyArrayMapIterObject *mit)
                 do {
 
 #if @isget@
-                    if (NPY_UNLIKELY(cast_info.func(&cast_info.context,
+                    if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
                             subspace_ptrs, counter, subspace_strides,
-                            cast_info.auxdata) < 0)) {
+                            cast_info->auxdata) < 0)) {
                         NPY_END_THREADS;
-                        NPY_cast_info_xfree(&cast_info);
                         return -1;
                     }
 #else
                     /* The operand order is reversed here */
                     char *args[2] = {subspace_ptrs[1], subspace_ptrs[0]};
                     npy_intp strides[2] = {subspace_strides[1], subspace_strides[0]};
-                    if (NPY_UNLIKELY(cast_info.func(&cast_info.context,
-                            args, counter, strides, cast_info.auxdata) < 0)) {
+                    if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                            args, counter, strides, cast_info->auxdata) < 0)) {
                         NPY_END_THREADS;
-                        NPY_cast_info_xfree(&cast_info);
                         return -1;
                     }
 #endif
@@ -1866,8 +1841,6 @@ mapiter_@name@(PyArrayMapIterObject *mit)
             NPY_END_THREADS;
         }
 /**end repeat1**/
-
-        NPY_cast_info_xfree(&cast_info);
     }
     return 0;
 }
diff --git a/numpy/core/src/multiarray/mapping.c b/numpy/core/src/multiarray/mapping.c
index 1a2ade11b..98c2d7eda 100644
--- a/numpy/core/src/multiarray/mapping.c
+++ b/numpy/core/src/multiarray/mapping.c
@@ -1,11 +1,14 @@
 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
 #define _MULTIARRAYMODULE
+#define _UMATHMODULE
 
 #define PY_SSIZE_T_CLEAN
 #include <Python.h>
 #include <structmember.h>
 
 #include "numpy/arrayobject.h"
+#include "numpy/npy_math.h"
+
 #include "arrayobject.h"
 
 #include "npy_config.h"
@@ -23,6 +26,11 @@
 #include "mem_overlap.h"
 #include "array_assign.h"
 #include "array_coercion.h"
+/* TODO: Only for `NpyIter_GetTransferFlags` until it is public */
+#define NPY_ITERATOR_IMPLEMENTATION_CODE
+#include "nditer_impl.h"
+
+#include "umathmodule.h"
 
 
 #define HAS_INTEGER 1
@@ -914,7 +922,6 @@ array_boolean_subscript(PyArrayObject *self,
     char *ret_data;
     PyArray_Descr *dtype;
     PyArrayObject *ret;
-    int needs_api = 0;
 
     size = count_boolean_trues(PyArray_NDIM(bmask), PyArray_DATA(bmask),
                                 PyArray_DIMS(bmask), PyArray_STRIDES(bmask));
@@ -962,13 +969,18 @@ array_boolean_subscript(PyArrayObject *self,
         /* Get a dtype transfer function */
         NpyIter_GetInnerFixedStrideArray(iter, fixed_strides);
         NPY_cast_info cast_info;
+        /*
+         * TODO: Ignoring cast flags, since this is only ever a copy. In
+         *       principle that may not be quite right in some future?
+         */
+        NPY_ARRAYMETHOD_FLAGS cast_flags;
         if (PyArray_GetDTypeTransferFunction(
                         IsUintAligned(self) && IsAligned(self),
                         fixed_strides[0], itemsize,
                         dtype, dtype,
                         0,
                         &cast_info,
-                        &needs_api) != NPY_SUCCEED) {
+                        &cast_flags) != NPY_SUCCEED) {
             Py_DECREF(ret);
             NpyIter_Deallocate(iter);
             return NULL;
@@ -1068,7 +1080,6 @@ array_assign_boolean_subscript(PyArrayObject *self,
 {
     npy_intp size, v_stride;
     char *v_data;
-    int needs_api = 0;
     npy_intp bmask_size;
 
     if (PyArray_DESCR(bmask)->type_num != NPY_BOOL) {
@@ -1164,6 +1175,7 @@ array_assign_boolean_subscript(PyArrayObject *self,
         /* Get a dtype transfer function */
         NpyIter_GetInnerFixedStrideArray(iter, fixed_strides);
         NPY_cast_info cast_info;
+        NPY_ARRAYMETHOD_FLAGS cast_flags;
         if (PyArray_GetDTypeTransferFunction(
                  IsUintAligned(self) && IsAligned(self) &&
                         IsUintAligned(v) && IsAligned(v),
@@ -1171,14 +1183,17 @@ array_assign_boolean_subscript(PyArrayObject *self,
                         PyArray_DESCR(v), PyArray_DESCR(self),
                         0,
                         &cast_info,
-                        &needs_api) != NPY_SUCCEED) {
+                        &cast_flags) != NPY_SUCCEED) {
             NpyIter_Deallocate(iter);
             return -1;
         }
 
-        if (!needs_api) {
+        if (!(cast_flags & NPY_METH_REQUIRES_PYAPI)) {
             NPY_BEGIN_THREADS_NDITER(iter);
         }
+        if (!(cast_flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
+            npy_clear_floatstatus_barrier((char *)self);
+        }
 
         npy_intp strides[2] = {v_stride, self_stride};
 
@@ -1209,7 +1224,7 @@ array_assign_boolean_subscript(PyArrayObject *self,
             }
         } while (iternext(iter));
 
-        if (!needs_api) {
+        if (!(cast_flags & NPY_METH_REQUIRES_PYAPI)) {
             NPY_END_THREADS;
         }
 
@@ -1217,6 +1232,12 @@ array_assign_boolean_subscript(PyArrayObject *self,
         if (!NpyIter_Deallocate(iter)) {
             res = -1;
         }
+        if (res == 0 && !(cast_flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
+            int fpes = npy_get_floatstatus_barrier((char *)self);
+            if (fpes && PyUFunc_GiveFloatingpointErrors("cast", fpes) < 0) {
+                return -1;
+            }
+        }
     }
 
     return res;
@@ -1414,6 +1435,8 @@ array_subscript(PyArrayObject *self, PyObject *op)
     int index_type;
     int index_num;
     int i, ndim, fancy_ndim;
+    NPY_cast_info cast_info = {.func = NULL};
+
     /*
      * Index info array. We can have twice as many indices as dimensions
      * (because of None). The + 1 is to not need to check as much.
@@ -1579,7 +1602,43 @@ array_subscript(PyArrayObject *self, PyObject *op)
         goto finish;
     }
 
-    if (mapiter_get(mit) < 0) {
+    /*
+     * Alignment information (swapping is never needed, since we buffer),
+     * could also check extra_op is buffered, but it should rarely matter.
+     */
+    int is_aligned = IsUintAligned(self) && IsUintAligned(mit->extra_op);
+    /*
+     * NOTE: Getting never actually casts, so we currently do not bother to do
+     *       the full checks (floating point errors) here (unlike assignment).
+     */
+    int meth_flags = NpyIter_GetTransferFlags(mit->outer);
+    if (mit->extra_op_iter) {
+        int extra_op_flags = NpyIter_GetTransferFlags(mit->extra_op_iter);
+        meth_flags = PyArrayMethod_COMBINED_FLAGS(meth_flags, extra_op_flags);
+    }
+
+    if (mit->subspace_iter != NULL) {
+        int extra_op_flags = NpyIter_GetTransferFlags(mit->subspace_iter);
+        meth_flags = PyArrayMethod_COMBINED_FLAGS(meth_flags, extra_op_flags);
+
+        NPY_ARRAYMETHOD_FLAGS transfer_flags;
+        npy_intp fixed_strides[2];
+        /*
+         * Get a dtype transfer function, since there are no
+         * buffers, this is safe.
+         */
+        NpyIter_GetInnerFixedStrideArray(mit->subspace_iter, fixed_strides);
+
+        if (PyArray_GetDTypeTransferFunction(is_aligned,
+                fixed_strides[0], fixed_strides[1],
+                PyArray_DESCR(self), PyArray_DESCR(mit->extra_op),
+                0, &cast_info, &transfer_flags) != NPY_SUCCEED) {
+            goto finish;
+        }
+        meth_flags = PyArrayMethod_COMBINED_FLAGS(meth_flags, transfer_flags);
+    }
+
+    if (mapiter_get(mit, &cast_info, meth_flags, is_aligned) < 0) {
         goto finish;
     }
 
@@ -1614,6 +1673,7 @@ array_subscript(PyArrayObject *self, PyObject *op)
     }
 
   finish:
+    NPY_cast_info_xfree(&cast_info);
     Py_XDECREF(mit);
     Py_XDECREF(view);
     /* Clean up indices */
@@ -1699,6 +1759,9 @@ array_assign_subscript(PyArrayObject *self, PyObject *ind, PyObject *op)
 
     PyArrayMapIterObject *mit = NULL;
 
+    /* When a subspace is used, casting is done manually. */
+    NPY_cast_info cast_info = {.func = NULL};
+
     if (op == NULL) {
         PyErr_SetString(PyExc_ValueError,
                         "cannot delete array elements");
@@ -1871,7 +1934,6 @@ array_assign_subscript(PyArrayObject *self, PyObject *ind, PyObject *op)
             index_num == 1 && tmp_arr) {
         /* The array being indexed has one dimension and it is a fancy index */
         PyArrayObject *ind = (PyArrayObject*)indices[0].object;
-
         /* Check if the type is equivalent */
         if (PyArray_EquivTypes(PyArray_DESCR(self),
                                    PyArray_DESCR(tmp_arr)) &&
@@ -1935,12 +1997,50 @@ array_assign_subscript(PyArrayObject *self, PyObject *ind, PyObject *op)
         }
     }
 
-    /* Can now reset the outer iterator (delayed bufalloc) */
-    if (NpyIter_Reset(mit->outer, NULL) < 0) {
+    if (PyArray_MapIterCheckIndices(mit) < 0) {
         goto fail;
     }
 
-    if (PyArray_MapIterCheckIndices(mit) < 0) {
+    /*
+     * Alignment information (swapping is never needed, since we buffer),
+     * could also check extra_op is buffered, but it should rarely matter.
+     */
+    int is_aligned = IsUintAligned(self) && IsUintAligned(mit->extra_op);
+    int meth_flags = NpyIter_GetTransferFlags(mit->outer);
+
+    if (mit->extra_op_iter) {
+        int extra_op_flags = NpyIter_GetTransferFlags(mit->extra_op_iter);
+        meth_flags = PyArrayMethod_COMBINED_FLAGS(meth_flags, extra_op_flags);
+    }
+
+    if (mit->subspace_iter != NULL) {
+        int extra_op_flags = NpyIter_GetTransferFlags(mit->subspace_iter);
+        meth_flags = PyArrayMethod_COMBINED_FLAGS(meth_flags, extra_op_flags);
+
+        NPY_ARRAYMETHOD_FLAGS transfer_flags;
+        npy_intp fixed_strides[2];
+
+        /*
+         * Get a dtype transfer function, since there are no
+         * buffers, this is safe.
+         */
+        NpyIter_GetInnerFixedStrideArray(mit->subspace_iter, fixed_strides);
+
+        if (PyArray_GetDTypeTransferFunction(is_aligned,
+                fixed_strides[1], fixed_strides[0],
+                PyArray_DESCR(mit->extra_op), PyArray_DESCR(self),
+                0, &cast_info, &transfer_flags) != NPY_SUCCEED) {
+            goto fail;
+        }
+        meth_flags = PyArrayMethod_COMBINED_FLAGS(meth_flags, transfer_flags);
+    }
+
+    if (!(meth_flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
+        npy_clear_floatstatus_barrier((char *)mit);
+    }
+
+    /* Can now reset the outer iterator (delayed bufalloc); NPY_FAIL is 0 */
+    if (NpyIter_Reset(mit->outer, NULL) != NPY_SUCCEED) {
         goto fail;
     }
 
@@ -1948,11 +2048,17 @@ array_assign_subscript(PyArrayObject *self, PyObject *ind, PyObject *op)
      * Could add a casting check, but apparently most assignments do
      * not care about safe casting.
      */
-
-    if (mapiter_set(mit) < 0) {
+    if (mapiter_set(mit, &cast_info, meth_flags, is_aligned) < 0) {
         goto fail;
     }
 
+    if (!(meth_flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
+        int fpes = npy_get_floatstatus_barrier((char *)mit);
+        if (fpes && PyUFunc_GiveFloatingpointErrors("cast", fpes) < 0) {
+            goto fail;
+        }
+    }
+
     Py_DECREF(mit);
     goto success;
 
@@ -1961,6 +2067,8 @@ array_assign_subscript(PyArrayObject *self, PyObject *ind, PyObject *op)
     Py_XDECREF((PyObject *)view);
     Py_XDECREF((PyObject *)tmp_arr);
     Py_XDECREF((PyObject *)mit);
+    NPY_cast_info_xfree(&cast_info);
+
     for (i=0; i < index_num; i++) {
         Py_XDECREF(indices[i].object);
     }
@@ -1969,6 +2077,8 @@ array_assign_subscript(PyArrayObject *self, PyObject *ind, PyObject *op)
   success:
     Py_XDECREF((PyObject *)view);
     Py_XDECREF((PyObject *)tmp_arr);
+    NPY_cast_info_xfree(&cast_info);
+
     for (i=0; i < index_num; i++) {
         Py_XDECREF(indices[i].object);
     }
@@ -2089,7 +2199,7 @@ _nonzero_indices(PyObject *myBool, PyArrayObject **arrays)
 
 
 /* Reset the map iterator to the beginning */
-NPY_NO_EXPORT void
+NPY_NO_EXPORT int
 PyArray_MapIterReset(PyArrayMapIterObject *mit)
 {
     npy_intp indval;
@@ -2097,12 +2207,16 @@ PyArray_MapIterReset(PyArrayMapIterObject *mit)
     int i;
 
     if (mit->size == 0) {
-        return;
+        return 0;
     }
 
-    NpyIter_Reset(mit->outer, NULL);
+    if (!NpyIter_Reset(mit->outer, NULL)) {
+        return -1;
+    }
     if (mit->extra_op_iter) {
-        NpyIter_Reset(mit->extra_op_iter, NULL);
+        if (!NpyIter_Reset(mit->extra_op_iter, NULL)) {
+            return -1;
+        }
 
         baseptrs[1] = mit->extra_op_ptrs[0];
     }
@@ -2119,14 +2233,16 @@ PyArray_MapIterReset(PyArrayMapIterObject *mit)
     mit->dataptr = baseptrs[0];
 
     if (mit->subspace_iter) {
-        NpyIter_ResetBasePointers(mit->subspace_iter, baseptrs, NULL);
+        if (!NpyIter_ResetBasePointers(mit->subspace_iter, baseptrs, NULL)) {
+            return -1;
+        }
         mit->iter_count = *NpyIter_GetInnerLoopSizePtr(mit->subspace_iter);
     }
     else {
         mit->iter_count = *NpyIter_GetInnerLoopSizePtr(mit->outer);
     }
 
-    return;
+    return 0;
 }
 
 
@@ -2592,13 +2708,14 @@ PyArray_MapIterNew(npy_index_info *indices , int index_num, int index_type,
     }
 
     /* create new MapIter object */
-    mit = (PyArrayMapIterObject *)PyArray_malloc(sizeof(PyArrayMapIterObject));
+    mit = (PyArrayMapIterObject *)PyArray_malloc(
+            sizeof(PyArrayMapIterObject) + sizeof(NPY_cast_info));
     if (mit == NULL) {
         Py_DECREF(intp_descr);
         return NULL;
     }
     /* set all attributes of mapiter to zero */
-    memset(mit, 0, sizeof(PyArrayMapIterObject));
+    memset(mit, 0, sizeof(PyArrayMapIterObject) + sizeof(NPY_cast_info));
     PyObject_Init((PyObject *)mit, &PyArrayMapIter_Type);
 
     Py_INCREF(arr);
@@ -2874,6 +2991,11 @@ PyArray_MapIterNew(npy_index_info *indices , int index_num, int index_type,
 
     /* If external array is iterated, and no subspace is needed */
     nops = mit->numiter;
+
+    if (!uses_subspace) {
+        outer_flags |= NPY_ITER_EXTERNAL_LOOP;
+    }
+
     if (extra_op_flags && !uses_subspace) {
         /*
          * NOTE: This small limitation should practically not matter.
@@ -2921,9 +3043,6 @@ PyArray_MapIterNew(npy_index_info *indices , int index_num, int index_type,
     if (mit->outer == NULL) {
         goto fail;
     }
-    if (!uses_subspace) {
-        NpyIter_EnableExternalLoop(mit->outer);
-    }
 
     mit->outer_next = NpyIter_GetIterNext(mit->outer, NULL);
     if (mit->outer_next == NULL) {
@@ -3061,7 +3180,7 @@ PyArray_MapIterNew(npy_index_info *indices , int index_num, int index_type,
     mit->subspace_ptrs = NpyIter_GetDataPtrArray(mit->subspace_iter);
     mit->subspace_strides = NpyIter_GetInnerStrideArray(mit->subspace_iter);
 
-    if (NpyIter_IterationNeedsAPI(mit->outer)) {
+    if (NpyIter_IterationNeedsAPI(mit->subspace_iter)) {
         mit->needs_api = 1;
         /*
          * NOTE: In this case, need to call PyErr_Occurred() after
@@ -3212,9 +3331,12 @@ PyArray_MapIterArrayCopyIfOverlap(PyArrayObject * a, PyObject * index,
         goto fail;
     }
 
+    if (PyArray_MapIterReset(mit) < 0) {
+        goto fail;
+    }
+
     Py_XDECREF(a_copy);
     Py_XDECREF(subspace);
-    PyArray_MapIterReset(mit);
 
     for (i=0; i < index_num; i++) {
         Py_XDECREF(indices[i].object);
diff --git a/numpy/core/src/multiarray/mapping.h b/numpy/core/src/multiarray/mapping.h
index e929b8b3f..4e5d06238 100644
--- a/numpy/core/src/multiarray/mapping.h
+++ b/numpy/core/src/multiarray/mapping.h
@@ -51,7 +51,7 @@ array_assign_item(PyArrayObject *self, Py_ssize_t i, PyObject *v);
  * Prototypes for Mapping calls --- not part of the C-API
  * because only useful as part of a getitem call.
  */
-NPY_NO_EXPORT void
+NPY_NO_EXPORT int
 PyArray_MapIterReset(PyArrayMapIterObject *mit);
 
 NPY_NO_EXPORT void
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index 5209d6914..96d0c893d 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -85,6 +85,10 @@ NPY_NO_EXPORT int NPY_NUMUSERTYPES = 0;
 
 NPY_NO_EXPORT int initscalarmath(PyObject *);
 NPY_NO_EXPORT int set_matmul_flags(PyObject *d); /* in ufunc_object.c */
+/* From umath/string_ufuncs.cpp/h */
+NPY_NO_EXPORT PyObject *
+_umath_strings_richcompare(
+        PyArrayObject *self, PyArrayObject *other, int cmp_op, int rstrip);
 
 /*
  * global variable to determine if legacy printing is enabled, accessible from
@@ -138,12 +142,12 @@ PyArray_GetPriority(PyObject *obj, double default_)
     }
 
     priority = PyFloat_AsDouble(ret);
+    Py_DECREF(ret);
     if (error_converting(priority)) {
         /* TODO[gh-14801]: propagate crashes for bad priority? */
         PyErr_Clear();
         return default_;
     }
-    Py_DECREF(ret);
     return priority;
 }
 
@@ -3726,6 +3730,12 @@ format_longfloat(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds)
                               TrimMode_LeaveOneZero, -1, -1);
 }
 
+
+/*
+ * The only purpose of this function is that it allows the "rstrip".
+ * From my (@seberg's) perspective, this function should be deprecated
+ * and I do not think it matters if it is not particularly fast.
+ */
 static PyObject *
 compare_chararrays(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds)
 {
@@ -3791,7 +3801,7 @@ compare_chararrays(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds)
         return NULL;
     }
     if (PyArray_ISSTRING(newarr) && PyArray_ISSTRING(newoth)) {
-        res = _strings_richcompare(newarr, newoth, cmp_op, rstrip != 0);
+        res = _umath_strings_richcompare(newarr, newoth, cmp_op, rstrip != 0);
     }
     else {
         PyErr_SetString(PyExc_TypeError,
diff --git a/numpy/core/src/multiarray/nditer_api.c b/numpy/core/src/multiarray/nditer_api.c
index 860c8c1f6..b80312e06 100644
--- a/numpy/core/src/multiarray/nditer_api.c
+++ b/numpy/core/src/multiarray/nditer_api.c
@@ -857,6 +857,13 @@ NpyIter_RequiresBuffering(NpyIter *iter)
  * Whether the iteration loop, and in particular the iternext()
  * function, needs API access.  If this is true, the GIL must
  * be retained while iterating.
+ *
+ * NOTE: Internally (currently), `NpyIter_GetTransferFlags` will
+ *       additionally provide information on whether floating point errors
+ *       may be given during casts.  The flags only require the API use
+ *       necessary for buffering though.  So an iterate which does not require
+ *       buffering may indicate `NpyIter_IterationNeedsAPI`, but not include
+ *       the flag in `NpyIter_GetTransferFlags`.
  */
 NPY_NO_EXPORT npy_bool
 NpyIter_IterationNeedsAPI(NpyIter *iter)
@@ -864,6 +871,21 @@ NpyIter_IterationNeedsAPI(NpyIter *iter)
     return (NIT_ITFLAGS(iter)&NPY_ITFLAG_NEEDSAPI) != 0;
 }
 
+
+/*
+ * Fetch the ArrayMethod (runtime) flags for all "transfer functions" (i.e.
+ * copy to buffer/casts).
+ *
+ * TODO: This should be public API, but that only makes sense when the
+ *       ArrayMethod API is made public.
+ */
+NPY_NO_EXPORT int
+NpyIter_GetTransferFlags(NpyIter *iter)
+{
+    return NIT_ITFLAGS(iter) >> NPY_ITFLAG_TRANSFERFLAGS_SHIFT;
+}
+
+
 /*NUMPY_API
  * Gets the number of dimensions being iterated
  */
diff --git a/numpy/core/src/multiarray/nditer_constr.c b/numpy/core/src/multiarray/nditer_constr.c
index f82a9624e..a383c63e8 100644
--- a/numpy/core/src/multiarray/nditer_constr.c
+++ b/numpy/core/src/multiarray/nditer_constr.c
@@ -3141,7 +3141,9 @@ npyiter_allocate_transfer_functions(NpyIter *iter)
     npy_intp *strides = NAD_STRIDES(axisdata), op_stride;
     NpyIter_TransferInfo *transferinfo = NBF_TRANSFERINFO(bufferdata);
 
-    int needs_api = 0;
+    /* combined cast flags, the new cast flags for each cast: */
+    NPY_ARRAYMETHOD_FLAGS cflags = PyArrayMethod_MINIMAL_FLAGS;
+    NPY_ARRAYMETHOD_FLAGS nc_flags;
 
     for (iop = 0; iop < nop; ++iop) {
         npyiter_opitflags flags = op_itflags[iop];
@@ -3167,10 +3169,11 @@ npyiter_allocate_transfer_functions(NpyIter *iter)
                                         op_dtype[iop],
                                         move_references,
                                         &transferinfo[iop].read,
-                                        &needs_api) != NPY_SUCCEED) {
+                                        &nc_flags) != NPY_SUCCEED) {
                     iop -= 1;  /* This one cannot be cleaned up yet. */
                     goto fail;
                 }
+                cflags = PyArrayMethod_COMBINED_FLAGS(cflags, nc_flags);
             }
             else {
                 transferinfo[iop].read.func = NULL;
@@ -3199,9 +3202,10 @@ npyiter_allocate_transfer_functions(NpyIter *iter)
                             mask_dtype,
                             move_references,
                             &transferinfo[iop].write,
-                            &needs_api) != NPY_SUCCEED) {
+                            &nc_flags) != NPY_SUCCEED) {
                         goto fail;
                     }
+                    cflags = PyArrayMethod_COMBINED_FLAGS(cflags, nc_flags);
                 }
                 else {
                     if (PyArray_GetDTypeTransferFunction(
@@ -3212,9 +3216,10 @@ npyiter_allocate_transfer_functions(NpyIter *iter)
                             PyArray_DESCR(op[iop]),
                             move_references,
                             &transferinfo[iop].write,
-                            &needs_api) != NPY_SUCCEED) {
+                            &nc_flags) != NPY_SUCCEED) {
                         goto fail;
                     }
+                    cflags = PyArrayMethod_COMBINED_FLAGS(cflags, nc_flags);
                 }
             }
             /* If no write back but there are references make a decref fn */
@@ -3230,9 +3235,10 @@ npyiter_allocate_transfer_functions(NpyIter *iter)
                         op_dtype[iop], NULL,
                         1,
                         &transferinfo[iop].write,
-                        &needs_api) != NPY_SUCCEED) {
+                        &nc_flags) != NPY_SUCCEED) {
                     goto fail;
                 }
+                cflags = PyArrayMethod_COMBINED_FLAGS(cflags, nc_flags);
             }
             else {
                 transferinfo[iop].write.func = NULL;
@@ -3244,8 +3250,12 @@ npyiter_allocate_transfer_functions(NpyIter *iter)
         }
     }
 
-    /* If any of the dtype transfer functions needed the API, flag it */
-    if (needs_api) {
+    /* Store the combined transfer flags on the iterator */
+    NIT_ITFLAGS(iter) |= cflags << NPY_ITFLAG_TRANSFERFLAGS_SHIFT;
+    assert(NIT_ITFLAGS(iter) >> NPY_ITFLAG_TRANSFERFLAGS_SHIFT == cflags);
+
+    /* If any of the dtype transfer functions needed the API, flag it. */
+    if (cflags & NPY_METH_REQUIRES_PYAPI) {
         NIT_ITFLAGS(iter) |= NPY_ITFLAG_NEEDSAPI;
     }
 
diff --git a/numpy/core/src/multiarray/nditer_impl.h b/numpy/core/src/multiarray/nditer_impl.h
index 2a82b7e54..459675ea8 100644
--- a/numpy/core/src/multiarray/nditer_impl.h
+++ b/numpy/core/src/multiarray/nditer_impl.h
@@ -76,33 +76,38 @@
 /* Internal iterator flags */
 
 /* The perm is the identity */
-#define NPY_ITFLAG_IDENTPERM    0x0001
+#define NPY_ITFLAG_IDENTPERM    (1 << 0)
 /* The perm has negative entries (indicating flipped axes) */
-#define NPY_ITFLAG_NEGPERM      0x0002
+#define NPY_ITFLAG_NEGPERM      (1 << 1)
 /* The iterator is tracking an index */
-#define NPY_ITFLAG_HASINDEX     0x0004
+#define NPY_ITFLAG_HASINDEX     (1 << 2)
 /* The iterator is tracking a multi-index */
-#define NPY_ITFLAG_HASMULTIINDEX    0x0008
+#define NPY_ITFLAG_HASMULTIINDEX    (1 << 3)
 /* The iteration order was forced on construction */
-#define NPY_ITFLAG_FORCEDORDER  0x0010
+#define NPY_ITFLAG_FORCEDORDER  (1 << 4)
 /* The inner loop is handled outside the iterator */
-#define NPY_ITFLAG_EXLOOP      0x0020
+#define NPY_ITFLAG_EXLOOP      (1 << 5)
 /* The iterator is ranged */
-#define NPY_ITFLAG_RANGE        0x0040
+#define NPY_ITFLAG_RANGE        (1 << 6)
 /* The iterator is buffered */
-#define NPY_ITFLAG_BUFFER       0x0080
+#define NPY_ITFLAG_BUFFER       (1 << 7)
 /* The iterator should grow the buffered inner loop when possible */
-#define NPY_ITFLAG_GROWINNER    0x0100
+#define NPY_ITFLAG_GROWINNER    (1 << 8)
 /* There is just one iteration, can specialize iternext for that */
-#define NPY_ITFLAG_ONEITERATION 0x0200
+#define NPY_ITFLAG_ONEITERATION (1 << 9)
 /* Delay buffer allocation until first Reset* call */
-#define NPY_ITFLAG_DELAYBUF     0x0400
+#define NPY_ITFLAG_DELAYBUF     (1 << 10)
 /* Iteration needs API access during iternext */
-#define NPY_ITFLAG_NEEDSAPI     0x0800
+#define NPY_ITFLAG_NEEDSAPI     (1 << 11)
 /* Iteration includes one or more operands being reduced */
-#define NPY_ITFLAG_REDUCE       0x1000
+#define NPY_ITFLAG_REDUCE       (1 << 12)
 /* Reduce iteration doesn't need to recalculate reduce loops next time */
-#define NPY_ITFLAG_REUSE_REDUCE_LOOPS 0x2000
+#define NPY_ITFLAG_REUSE_REDUCE_LOOPS (1 << 13)
+/*
+ * Offset of (combined) ArrayMethod flags for all transfer functions.
+ * For now, we use the top 8 bits.
+ */
+#define NPY_ITFLAG_TRANSFERFLAGS_SHIFT 24
 
 /* Internal iterator per-operand iterator flags */
 
@@ -356,4 +361,12 @@ npyiter_copy_to_buffers(NpyIter *iter, char **prev_dataptrs);
 NPY_NO_EXPORT void
 npyiter_clear_buffers(NpyIter *iter);
 
+/*
+ * Function to get the ArrayMethod flags of the transfer functions.
+ * TODO: This function should be public and removed from `nditer_impl.h`, but
+ *       this requires making the ArrayMethod flags public API first.
+ */
+NPY_NO_EXPORT int
+NpyIter_GetTransferFlags(NpyIter *iter);
+
 #endif  /* NUMPY_CORE_SRC_MULTIARRAY_NDITER_IMPL_H_ */
diff --git a/numpy/core/src/multiarray/textreading/readtext.c b/numpy/core/src/multiarray/textreading/readtext.c
index 9804fd462..a5db1cb77 100644
--- a/numpy/core/src/multiarray/textreading/readtext.c
+++ b/numpy/core/src/multiarray/textreading/readtext.c
@@ -270,6 +270,10 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod),
         }
         /* Calloc just to not worry about overflow */
         usecols = PyMem_Calloc(num_usecols, sizeof(Py_ssize_t));
+        if (usecols == NULL) {
+            PyErr_NoMemory();
+            return NULL;
+        }
         for (Py_ssize_t i = 0; i < num_usecols; i++) {
             PyObject *tmp = PySequence_GetItem(usecols_obj, i);
             if (tmp == NULL) {
diff --git a/numpy/core/src/multiarray/textreading/rows.c b/numpy/core/src/multiarray/textreading/rows.c
index e30ff835e..a72fb79d9 100644
--- a/numpy/core/src/multiarray/textreading/rows.c
+++ b/numpy/core/src/multiarray/textreading/rows.c
@@ -91,7 +91,7 @@ create_conv_funcs(
             if (column < -num_fields || column >= num_fields) {
                 PyErr_Format(PyExc_ValueError,
                         "converter specified for column %zd, which is invalid "
-                        "for the number of fields %d.", column, num_fields);
+                        "for the number of fields %zd.", column, num_fields);
                 goto error;
             }
             if (column < 0) {
@@ -319,7 +319,7 @@ read_rows(stream *s,
 
         if (!usecols && (actual_num_fields != current_num_fields)) {
             PyErr_Format(PyExc_ValueError,
-                    "the number of columns changed from %d to %d at row %zu; "
+                    "the number of columns changed from %zd to %zd at row %zd; "
                     "use `usecols` to select a subset and avoid this error",
                     actual_num_fields, current_num_fields, row_count+1);
             goto error;
@@ -382,9 +382,9 @@ read_rows(stream *s,
                 }
                 if (NPY_UNLIKELY((col < 0) || (col >= current_num_fields))) {
                     PyErr_Format(PyExc_ValueError,
-                            "invalid column index %d at row %zu with %d "
+                            "invalid column index %zd at row %zd with %zd "
                             "columns",
-                            usecols[i], current_num_fields, row_count+1);
+                            usecols[i], row_count+1, current_num_fields);
                     goto error;
                 }
             }
@@ -419,7 +419,7 @@ read_rows(stream *s,
                 }
                 PyErr_Format(PyExc_ValueError,
                         "could not convert string %.100R to %S at "
-                        "row %zu, column %d.",
+                        "row %zd, column %zd.",
                         string, field_types[f].descr, row_count, col+1);
                 Py_DECREF(string);
                 npy_PyErr_ChainExceptionsCause(exc, val, tb);
@@ -432,7 +432,12 @@ read_rows(stream *s,
     }
 
     tokenizer_clear(&ts);
-    PyMem_FREE(conv_funcs);
+    if (conv_funcs != NULL) {
+        for (Py_ssize_t i = 0; i < actual_num_fields; i++) {
+            Py_XDECREF(conv_funcs[i]);
+        }
+        PyMem_FREE(conv_funcs);
+    }
 
     if (data_array == NULL) {
         assert(row_count == 0 && result_shape[0] == 0);
@@ -474,7 +479,12 @@ read_rows(stream *s,
     return data_array;
 
   error:
-    PyMem_FREE(conv_funcs);
+    if (conv_funcs != NULL) {
+        for (Py_ssize_t i = 0; i < actual_num_fields; i++) {
+            Py_XDECREF(conv_funcs[i]);
+        }
+        PyMem_FREE(conv_funcs);
+    }
     tokenizer_clear(&ts);
     Py_XDECREF(data_array);
     return NULL;
diff --git a/numpy/core/src/npymath/ieee754.c.src b/numpy/core/src/npymath/ieee754.c.src
index 4e6ddb712..5d1ea3a69 100644
--- a/numpy/core/src/npymath/ieee754.c.src
+++ b/numpy/core/src/npymath/ieee754.c.src
@@ -566,228 +566,38 @@ int npy_get_floatstatus() {
     return npy_get_floatstatus_barrier(&x);
 }
 
-/*
- * Functions to set the floating point status word.
- */
-
-#if (defined(__unix__) || defined(unix)) && !defined(USG)
-#include <sys/param.h>
-#endif
-
 
 /*
- * Define floating point status functions. We must define
- * npy_get_floatstatus_barrier, npy_clear_floatstatus_barrier,
- * npy_set_floatstatus_{divbyzero, overflow, underflow, invalid}
- * for all supported platforms.
+ * General C99 code for floating point error handling.  These functions mainly
+ * exist because `fenv.h` was not standardized in C89, so they gave better
+ * portability.  This should be unnecessary with C99/C++11, and further
+ * functionality can be used from `fenv.h` directly.
  */
-
-
-/* Solaris --------------------------------------------------------*/
-/* --------ignoring SunOS ieee_flags approach, someone else can
-**         deal with that! */
-#if defined(sun) || defined(__BSD__) || defined(__OpenBSD__) || \
-    (defined(__FreeBSD__) && (__FreeBSD_version < 502114)) || \
-    defined(__NetBSD__)
-#include <ieeefp.h>
-
-int npy_get_floatstatus_barrier(char * param)
-{
-    int fpstatus = fpgetsticky();
-    /*
-     * By using a volatile, the compiler cannot reorder this call
-     */
-    if (param != NULL) {
-        volatile char NPY_UNUSED(c) = *(char*)param;
-    }
-    return ((FP_X_DZ  & fpstatus) ? NPY_FPE_DIVIDEBYZERO : 0) |
-           ((FP_X_OFL & fpstatus) ? NPY_FPE_OVERFLOW : 0) |
-           ((FP_X_UFL & fpstatus) ? NPY_FPE_UNDERFLOW : 0) |
-           ((FP_X_INV & fpstatus) ? NPY_FPE_INVALID : 0);
-}
-
-int npy_clear_floatstatus_barrier(char * param)
-{
-    int fpstatus = npy_get_floatstatus_barrier(param);
-    fpsetsticky(0);
-
-    return fpstatus;
-}
-
-void npy_set_floatstatus_divbyzero(void)
-{
-    fpsetsticky(FP_X_DZ);
-}
-
-void npy_set_floatstatus_overflow(void)
-{
-    fpsetsticky(FP_X_OFL);
-}
-
-void npy_set_floatstatus_underflow(void)
-{
-    fpsetsticky(FP_X_UFL);
-}
-
-void npy_set_floatstatus_invalid(void)
-{
-    fpsetsticky(FP_X_INV);
-}
-
-#elif defined(_AIX) && !defined(__GNUC__)
-#include <float.h>
-#include <fpxcp.h>
-
-int npy_get_floatstatus_barrier(char *param)
-{
-    int fpstatus = fp_read_flag();
-    /*
-     * By using a volatile, the compiler cannot reorder this call
-     */
-    if (param != NULL) {
-        volatile char NPY_UNUSED(c) = *(char*)param;
-    }
-    return ((FP_DIV_BY_ZERO & fpstatus) ? NPY_FPE_DIVIDEBYZERO : 0) |
-           ((FP_OVERFLOW & fpstatus) ? NPY_FPE_OVERFLOW : 0) |
-           ((FP_UNDERFLOW & fpstatus) ? NPY_FPE_UNDERFLOW : 0) |
-           ((FP_INVALID & fpstatus) ? NPY_FPE_INVALID : 0);
-}
-
-int npy_clear_floatstatus_barrier(char * param)
-{
-    int fpstatus = npy_get_floatstatus_barrier(param);
-    fp_swap_flag(0);
-
-    return fpstatus;
-}
-
-void npy_set_floatstatus_divbyzero(void)
-{
-    fp_raise_xcp(FP_DIV_BY_ZERO);
-}
-
-void npy_set_floatstatus_overflow(void)
-{
-    fp_raise_xcp(FP_OVERFLOW);
-}
-
-void npy_set_floatstatus_underflow(void)
-{
-    fp_raise_xcp(FP_UNDERFLOW);
-}
-
-void npy_set_floatstatus_invalid(void)
-{
-    fp_raise_xcp(FP_INVALID);
-}
-
-#elif defined(_MSC_VER) || (defined(__osf__) && defined(__alpha)) || \
-      defined (__UCLIBC__) || (defined(__arc__) && defined(__GLIBC__))
+#  include <fenv.h>
 
 /*
- * By using a volatile floating point value,
- * the compiler is forced to actually do the requested
- * operations because of potential concurrency.
- *
- * We shouldn't write multiple values to a single
- * global here, because that would cause
- * a race condition.
+ * According to the C99 standard, FE_DIVBYZERO etc. may not be provided when
+ * unsupported.  In such cases NumPy will not report these correctly, but we
+ * should still allow compiling (whether tests pass or not).
+ * By defining them as 0 locally, we make them no-ops.  Unlike these macros,
+ * the functions are still all defined (as no-ops) by, for example, `musl`:
+ *     https://git.musl-libc.org/cgit/musl/tree/src/fenv/fenv.c
+ * and `musl` does a similar replacement in its tests:
+ * http://nsz.repo.hu/git/?p=libc-test;a=blob;f=src/common/mtest.h;h=706c1ba23ea8989b17a2f72ed1a919e187c06b6a;hb=HEAD#l30
  */
-static volatile double _npy_floatstatus_x,
-    _npy_floatstatus_zero = 0.0, _npy_floatstatus_big = 1e300,
-    _npy_floatstatus_small = 1e-300, _npy_floatstatus_inf;
-
-void npy_set_floatstatus_divbyzero(void)
-{
-    _npy_floatstatus_x = 1.0 / _npy_floatstatus_zero;
-}
-
-void npy_set_floatstatus_overflow(void)
-{
-    _npy_floatstatus_x = _npy_floatstatus_big * 1e300;
-}
-
-void npy_set_floatstatus_underflow(void)
-{
-    _npy_floatstatus_x = _npy_floatstatus_small * 1e-300;
-}
-
-void npy_set_floatstatus_invalid(void)
-{
-    _npy_floatstatus_inf = NPY_INFINITY;
-    _npy_floatstatus_x = _npy_floatstatus_inf - NPY_INFINITY;
-}
-
-/* MS Windows -----------------------------------------------------*/
-#if defined(_MSC_VER)
-
-#include <float.h>
-
-int npy_get_floatstatus_barrier(char *param)
-{
-    /*
-     * By using a volatile, the compiler cannot reorder this call
-     */
-#if defined(_WIN64)
-    int fpstatus = _statusfp();
-#else
-    /* windows enables sse on 32 bit, so check both flags */
-    int fpstatus, fpstatus2;
-    _statusfp2(&fpstatus, &fpstatus2);
-    fpstatus |= fpstatus2;
+#ifndef FE_DIVBYZERO
+    #define FE_DIVBYZERO 0
 #endif
-    if (param != NULL) {
-        volatile char NPY_UNUSED(c) = *(char*)param;
-    }
-    return ((SW_ZERODIVIDE & fpstatus) ? NPY_FPE_DIVIDEBYZERO : 0) |
-           ((SW_OVERFLOW & fpstatus) ? NPY_FPE_OVERFLOW : 0) |
-           ((SW_UNDERFLOW & fpstatus) ? NPY_FPE_UNDERFLOW : 0) |
-           ((SW_INVALID & fpstatus) ? NPY_FPE_INVALID : 0);
-}
-
-int npy_clear_floatstatus_barrier(char *param)
-{
-    int fpstatus = npy_get_floatstatus_barrier(param);
-    _clearfp();
-
-    return fpstatus;
-}
-
-/*  OSF/Alpha (Tru64)  ---------------------------------------------*/
-#elif defined(__osf__) && defined(__alpha)
-
-#include <machine/fpu.h>
-
-int npy_get_floatstatus_barrier(char *param)
-{
-    unsigned long fpstatus = ieee_get_fp_control();
-    /*
-     * By using a volatile, the compiler cannot reorder this call
-     */
-    if (param != NULL) {
-        volatile char NPY_UNUSED(c) = *(char*)param;
-    }
-    return  ((IEEE_STATUS_DZE & fpstatus) ? NPY_FPE_DIVIDEBYZERO : 0) |
-            ((IEEE_STATUS_OVF & fpstatus) ? NPY_FPE_OVERFLOW : 0) |
-            ((IEEE_STATUS_UNF & fpstatus) ? NPY_FPE_UNDERFLOW : 0) |
-            ((IEEE_STATUS_INV & fpstatus) ? NPY_FPE_INVALID : 0);
-}
-
-int npy_clear_floatstatus_barrier(char *param)
-{
-    int fpstatus = npy_get_floatstatus_barrier(param);
-    /* clear status bits as well as disable exception mode if on */
-    ieee_set_fp_control(0);
-
-    return fpstatus;
-}
-
+#ifndef FE_OVERFLOW
+    #define FE_OVERFLOW 0
+#endif
+#ifndef FE_UNDERFLOW
+    #define FE_UNDERFLOW 0
+#endif
+#ifndef FE_INVALID
+    #define FE_INVALID 0
 #endif
-/* End of defined(_MSC_VER) || (defined(__osf__) && defined(__alpha)) */
 
-#else
-/* General GCC code, should work on most platforms */
-#  include <fenv.h>
 
 int npy_get_floatstatus_barrier(char* param)
 {
@@ -839,4 +649,3 @@ void npy_set_floatstatus_invalid(void)
     feraiseexcept(FE_INVALID);
 }
 
-#endif
diff --git a/numpy/core/src/npymath/ieee754.cpp b/numpy/core/src/npymath/ieee754.cpp
index 2244004c0..27fcf7c6e 100644
--- a/numpy/core/src/npymath/ieee754.cpp
+++ b/numpy/core/src/npymath/ieee754.cpp
@@ -655,6 +655,30 @@ npy_get_floatstatus()
  */
 #include <fenv.h>
 
+/*
+ * According to the C99 standard, FE_DIVBYZERO etc. may not be provided when
+ * unsupported.  In such cases NumPy will not report these correctly, but we
+ * should still allow compiling (whether tests pass or not).
+ * By defining them as 0 locally, we make them no-ops.  Unlike these macros,
+ * the functions are still all defined (as no-ops) by, for example, `musl`:
+ *     https://git.musl-libc.org/cgit/musl/tree/src/fenv/fenv.c
+ * and `musl` does a similar replacement in its tests:
+ * http://nsz.repo.hu/git/?p=libc-test;a=blob;f=src/common/mtest.h;h=706c1ba23ea8989b17a2f72ed1a919e187c06b6a;hb=HEAD#l30
+ */
+#ifndef FE_DIVBYZERO
+    #define FE_DIVBYZERO 0
+#endif
+#ifndef FE_OVERFLOW
+    #define FE_OVERFLOW 0
+#endif
+#ifndef FE_UNDERFLOW
+    #define FE_UNDERFLOW 0
+#endif
+#ifndef FE_INVALID
+    #define FE_INVALID 0
+#endif
+
+
 extern "C" int
 npy_get_floatstatus_barrier(char *param)
 {
diff --git a/numpy/core/src/umath/dispatching.c b/numpy/core/src/umath/dispatching.c
index b8f102b3d..620335d88 100644
--- a/numpy/core/src/umath/dispatching.c
+++ b/numpy/core/src/umath/dispatching.c
@@ -145,6 +145,38 @@ PyUFunc_AddLoop(PyUFuncObject *ufunc, PyObject *info, int ignore_duplicate)
 }
 
 
+/*
+ * Add a loop to `ufunc` directly from an ArrayMethod `spec` (-1 on failure).
+ */
+NPY_NO_EXPORT int
+PyUFunc_AddLoopFromSpec(PyObject *ufunc, PyArrayMethod_Spec *spec)
+{
+    if (!PyObject_TypeCheck(ufunc, &PyUFunc_Type)) {
+        PyErr_SetString(PyExc_TypeError,
+                "ufunc object passed is not a ufunc!");
+        return -1;
+    }
+    PyBoundArrayMethodObject *bmeth =
+            (PyBoundArrayMethodObject *)PyArrayMethod_FromSpec(spec);
+    if (bmeth == NULL) {
+        return -1;
+    }
+    int nargs = bmeth->method->nin + bmeth->method->nout;
+    PyObject *dtypes = PyArray_TupleFromItems(
+            nargs, (PyObject **)bmeth->dtypes, 1);
+    PyObject *info = NULL;  /* stays NULL if building `dtypes` failed */
+    if (dtypes != NULL) {
+        info = PyTuple_Pack(2, dtypes, bmeth->method);
+        Py_DECREF(dtypes);
+    }
+    Py_DECREF(bmeth);  /* dropped on every path, incl. the dtypes failure */
+    if (info == NULL) {
+        return -1;
+    }
+    return PyUFunc_AddLoop((PyUFuncObject *)ufunc, info, 0);
+}
+
+
 /**
  * Resolves the implementation to use, this uses typical multiple dispatching
  * methods of finding the best matching implementation or resolver.
diff --git a/numpy/core/src/umath/dispatching.h b/numpy/core/src/umath/dispatching.h
index a7e9e88d0..f2ab0be2e 100644
--- a/numpy/core/src/umath/dispatching.h
+++ b/numpy/core/src/umath/dispatching.h
@@ -6,6 +6,9 @@
 #include <numpy/ufuncobject.h>
 #include "array_method.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
 
 typedef int promoter_function(PyUFuncObject *ufunc,
         PyArray_DTypeMeta *op_dtypes[], PyArray_DTypeMeta *signature[],
@@ -14,6 +17,9 @@ typedef int promoter_function(PyUFuncObject *ufunc,
 NPY_NO_EXPORT int
 PyUFunc_AddLoop(PyUFuncObject *ufunc, PyObject *info, int ignore_duplicate);
 
+NPY_NO_EXPORT int
+PyUFunc_AddLoopFromSpec(PyObject *ufunc, PyArrayMethod_Spec *spec);
+
 NPY_NO_EXPORT PyArrayMethodObject *
 promote_and_get_ufuncimpl(PyUFuncObject *ufunc,
         PyArrayObject *const ops[],
@@ -41,5 +47,8 @@ object_only_ufunc_promoter(PyUFuncObject *ufunc,
 NPY_NO_EXPORT int
 install_logical_ufunc_promoter(PyObject *ufunc);
 
+#ifdef __cplusplus
+}
+#endif
 
 #endif  /*_NPY_DISPATCHING_H */
diff --git a/numpy/core/src/umath/extobj.c b/numpy/core/src/umath/extobj.c
index 6b9a27e26..893429107 100644
--- a/numpy/core/src/umath/extobj.c
+++ b/numpy/core/src/umath/extobj.c
@@ -267,6 +267,33 @@ _extract_pyvals(PyObject *ref, const char *name, int *bufsize,
 }
 
 /*
+ * Handler which uses the default `np.errstate` given that `fpe_errors` is
+ * already set.  `fpe_errors` is typically the (nonzero) result of
+ * `npy_get_floatstatus_barrier`.
+ *
+ * Returns -1 on failure (an error was raised) and 0 on success.
+ */
+NPY_NO_EXPORT int
+PyUFunc_GiveFloatingpointErrors(const char *name, int fpe_errors)
+{
+    PyObject *errobj;
+    int bufsize, errmask;
+    int first = 1;
+
+    /* Fetch the buffer size, error mask and error object for `name`. */
+    if (PyUFunc_GetPyValues((char *)name, &bufsize, &errmask,
+                            &errobj) < 0) {
+        return -1;
+    }
+    /* Hand the flagged errors to the handler; nonzero is mapped to -1. */
+    int failed = PyUFunc_handlefperr(errmask, errobj, fpe_errors, &first);
+    /* errobj is released on both the success and the failure path. */
+    Py_XDECREF(errobj);
+    return failed ? -1 : 0;
+}
+
+
+/*
  * check the floating point status
  *  - errmask: mask of status to check
  *  - extobj: ufunc pyvals object
diff --git a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src
index 51b167844..bf8142880 100644
--- a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src
+++ b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src
@@ -1,6 +1,7 @@
 /*@targets
  ** $maxopt baseline
  ** sse2 avx2 avx512f
+ ** vx vxe
  **/
 #define _UMATHMODULE
 #define _MULTIARRAYMODULE
@@ -364,7 +365,7 @@ sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_i
  *  #type = npy_float, npy_double#
  *  #TYPE = FLOAT, DOUBLE#
  *  #sfx = f32, f64#
- *  #CHK =    , _F64#
+ *  #CHK = _F32, _F64#
  */
 #if NPY_SIMD@CHK@
 /**begin repeat1
@@ -444,7 +445,7 @@ simd_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_i
  *  #type = npy_float, npy_double, npy_longdouble#
  *  #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
  *  #vector = 1, 1, 0#
- *  #VECTOR = NPY_SIMD, NPY_SIMD_F64, 0 #
+ *  #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64, 0 #
  */
 /**begin repeat1
  * Arithmetic
diff --git a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
index 16a9eac2e..5b5f13ad1 100644
--- a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
+++ b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
@@ -3,6 +3,7 @@
  ** sse2 sse41 avx2 avx512f avx512_skx
  ** vsx2 vsx4
  ** neon
+ ** vx
  **/
 #define _UMATHMODULE
 #define _MULTIARRAYMODULE
@@ -51,13 +52,14 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
     const npyv_@sfx@x3 divisor = npyv_divisor_@sfx@(scalar);
 
     if (scalar == -1) {
-        npyv_b@len@ noverflow = npyv_cvt_b@len@_@sfx@(npyv_setall_@sfx@(-1));
-        npyv_@sfx@ vzero      = npyv_zero_@sfx@();
+        npyv_b@len@ noverflow  = npyv_cvt_b@len@_@sfx@(npyv_setall_@sfx@(-1));
+        const npyv_@sfx@ vzero = npyv_zero_@sfx@();
+        const npyv_@sfx@ vmin  = npyv_setall_@sfx@(NPY_MIN_INT@len@);
         for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
             npyv_@sfx@ a       = npyv_load_@sfx@(src);
             npyv_b@len@ gt_min = npyv_cmpgt_@sfx@(a, npyv_setall_@sfx@(NPY_MIN_INT@len@));
             noverflow          = npyv_and_b@len@(noverflow, gt_min);
-            npyv_@sfx@ neg     = npyv_ifsub_@sfx@(gt_min, vzero, a, vzero);
+            npyv_@sfx@ neg     = npyv_ifsub_@sfx@(gt_min, vzero, a, vmin);
             npyv_store_@sfx@(dst, neg);
         }
 
@@ -66,13 +68,13 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
             npyv_lanetype_@sfx@ a = *src;
             if (a == NPY_MIN_INT@len@) {
                 raise_err = 1;
-                *dst  = 0;
+                *dst  = NPY_MIN_INT@len@;
             } else {
                 *dst = -a;
             }
         }
         if (raise_err) {
-            npy_set_floatstatus_divbyzero();
+            npy_set_floatstatus_overflow();
         }
     } else {
         for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
@@ -253,7 +255,8 @@ vsx4_simd_divide_contig_@sfx@(char **args, npy_intp len)
     const npyv_@sfx@ vneg_one = npyv_setall_@sfx@(-1);
     const npyv_@sfx@ vzero    = npyv_zero_@sfx@();
     const npyv_@sfx@ vmin     = npyv_setall_@sfx@(NPY_MIN_INT@len@);
-    npyv_b@len@ warn          = npyv_cvt_b@len@_@sfx@(npyv_zero_@sfx@());
+    npyv_b@len@ warn_zero     = npyv_cvt_b@len@_@sfx@(npyv_zero_@sfx@());
+    npyv_b@len@ warn_overflow = npyv_cvt_b@len@_@sfx@(npyv_zero_@sfx@());
     const int vstep           = npyv_nlanes_@sfx@;
 
     for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
@@ -267,10 +270,8 @@ vsx4_simd_divide_contig_@sfx@(char **args, npy_intp len)
         npyv_b@len@ amin     = npyv_cmpeq_@sfx@(a, vmin);
         npyv_b@len@ bneg_one = npyv_cmpeq_@sfx@(b, vneg_one);
         npyv_b@len@ overflow = npyv_and_@sfx@(bneg_one, amin);
-        npyv_b@len@ error    = npyv_or_@sfx@(bzero, overflow);
-        // in case of overflow or b = 0, 'cvtozero' forces quo/rem to be 0
-        npyv_@sfx@ cvtozero  = npyv_select_@sfx@(error, vzero, vneg_one);
-                        warn = npyv_or_@sfx@(error, warn);
+                   warn_zero = npyv_or_@sfx@(bzero, warn_zero);
+               warn_overflow = npyv_or_@sfx@(overflow, warn_overflow);
         // handle mixed case the way Python does
         // ((a > 0) == (b > 0) || rem == 0)
         npyv_b@len@ a_gt_zero  = npyv_cmpgt_@sfx@(a, vzero);
@@ -280,21 +281,30 @@ vsx4_simd_divide_contig_@sfx@(char **args, npy_intp len)
         npyv_b@len@ or         = npyv_or_@sfx@(ab_eq_cond, rem_zero);
         npyv_@sfx@ to_sub = npyv_select_@sfx@(or, vzero, vneg_one);
                       quo = npyv_add_@sfx@(quo, to_sub);
-        npyv_store_@sfx@(dst1, npyv_and_@sfx@(cvtozero, quo));
+                      // Divide by zero
+                      quo = npyv_select_@sfx@(bzero, vzero, quo);
+                      // Overflow
+                      quo = npyv_select_@sfx@(overflow, vmin, quo);
+        npyv_store_@sfx@(dst1, quo);
     }
 
-    if (!vec_all_eq(warn, vzero)) {
+    if (!vec_all_eq(warn_zero, vzero)) {
         npy_set_floatstatus_divbyzero();
     }
+    if (!vec_all_eq(warn_overflow, vzero)) {
+        npy_set_floatstatus_overflow();
+    }
 
     for (; len > 0; --len, ++src1, ++src2, ++dst1) {
         const npyv_lanetype_@sfx@ a = *src1;
         const npyv_lanetype_@sfx@ b = *src2;
-        if (b == 0 || (a == NPY_MIN_INT@len@ && b == -1)) {
+        if (NPY_UNLIKELY(b == 0)) {
             npy_set_floatstatus_divbyzero();
             *dst1 = 0;
-        }
-        else {
+        } else if (NPY_UNLIKELY((a == NPY_MIN_INT@len@) && (b == -1))) {
+            npy_set_floatstatus_overflow();
+            *dst1 = NPY_MIN_INT@len@;
+        } else {
             *dst1 = a / b;
             if (((a > 0) != (b > 0)) && ((*dst1 * b) != a)) {
                 *dst1 -= 1;
@@ -340,8 +350,14 @@ NPY_FINLINE @type@ floor_div_@TYPE@(const @type@ n, const @type@ d)
      * (i.e. a different approach than npy_set_floatstatus_divbyzero()).
      */
     if (NPY_UNLIKELY(d == 0 || (n == NPY_MIN_@TYPE@ && d == -1))) {
-        npy_set_floatstatus_divbyzero();
-        return 0;
+        if (d == 0) {
+            npy_set_floatstatus_divbyzero();
+            return 0;
+        }
+        else {
+            npy_set_floatstatus_overflow();
+            return NPY_MIN_@TYPE@;
+        }
     }
     @type@ r = n / d;
     // Negative quotients needs to be rounded down
diff --git a/numpy/core/src/umath/loops_comparison.dispatch.c.src b/numpy/core/src/umath/loops_comparison.dispatch.c.src
index 01d58fbf9..2f75593a5 100644
--- a/numpy/core/src/umath/loops_comparison.dispatch.c.src
+++ b/numpy/core/src/umath/loops_comparison.dispatch.c.src
@@ -3,6 +3,7 @@
  ** sse2 sse42 avx2 avx512f avx512_skx
  ** vsx2 vsx3
  ** neon
+ ** vx vxe
  **/
 #define _UMATHMODULE
 #define _MULTIARRAYMODULE
@@ -22,7 +23,7 @@
  * #sfx    = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
  * #len    =  8,  8,  16,  16,  32,  32,  64,  64,  32,  64#
  * #signed =  0,  1,   0,   1,   0,   1,   0,   1,   0,   0#
- * #VECTOR = NPY_SIMD*9, NPY_SIMD_F64#
+ * #VECTOR = NPY_SIMD*8, NPY_SIMD_F32, NPY_SIMD_F64#
  */
 /**begin repeat1
  * #kind = equal, not_equal, less, less_equal#
@@ -298,7 +299,7 @@ static void simd_binary_scalar2_@kind@_b8(char **args, npy_intp len)
  * #bool = 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0#
  * #fp = 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
  * #signed = 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0#
- * #VECTOR = NPY_SIMD*10, NPY_SIMD_F64#
+ * #VECTOR = NPY_SIMD*9, NPY_SIMD_F32, NPY_SIMD_F64#
  */
 /**begin repeat1
  * #kind = equal, not_equal, less, less_equal#
diff --git a/numpy/core/src/umath/loops_hyperbolic.dispatch.c.src b/numpy/core/src/umath/loops_hyperbolic.dispatch.c.src
index 8cccc18f0..ce4962ce3 100644
--- a/numpy/core/src/umath/loops_hyperbolic.dispatch.c.src
+++ b/numpy/core/src/umath/loops_hyperbolic.dispatch.c.src
@@ -3,6 +3,7 @@
  ** (avx2 fma3) AVX512_SKX
  ** vsx2 vsx4
  ** neon_vfpv4
+ ** vx vxe
  **/
 #include "numpy/npy_math.h"
 #include "simd/simd.h"
@@ -240,6 +241,8 @@ simd_tanh_f64(const double *src, npy_intp ssrc, double *dst, npy_intp sdst, npy_
     }
 }
 #endif // NPY_SIMD_F64
+
+#if NPY_SIMD_F32
 static void
 simd_tanh_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst, npy_intp len)
 {
@@ -335,6 +338,7 @@ simd_tanh_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst, npy_in
         }
     }
 }
+#endif // NPY_SIMD_F32
 #endif // NPY_SIMD_FMA3
 
 /**begin repeat
@@ -342,7 +346,7 @@ simd_tanh_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst, npy_in
  * #type = float, double#
  * #sfx  = f32,   f64#
  * #ssfx = f,     #
- * #simd = NPY_SIMD_FMA3, NPY_SIMD_FMA3 && NPY_SIMD_F64#
+ * #simd = NPY_SIMD_FMA3 && NPY_SIMD_F32, NPY_SIMD_FMA3 && NPY_SIMD_F64#
  */
 /**begin repeat1
  *  #func = tanh#
diff --git a/numpy/core/src/umath/loops_minmax.dispatch.c.src b/numpy/core/src/umath/loops_minmax.dispatch.c.src
index ba2288f0b..b4fb205a0 100644
--- a/numpy/core/src/umath/loops_minmax.dispatch.c.src
+++ b/numpy/core/src/umath/loops_minmax.dispatch.c.src
@@ -3,6 +3,7 @@
  ** neon asimd
  ** sse2 avx2 avx512_skx
  ** vsx2
+ ** vx vxe
  **/
 #define _UMATHMODULE
 #define _MULTIARRAYMODULE
@@ -144,7 +145,7 @@ NPY_FINLINE @type@ scalar_@op@_@c_sfx@(@type@ a, @type@ b) {
 /**begin repeat
  * #sfx = f32, f64#
  * #bsfx = b32, b64#
- * #simd_chk = NPY_SIMD, NPY_SIMD_F64#
+ * #simd_chk = NPY_SIMD_F32, NPY_SIMD_F64#
  * #scalar_sfx = f, d#
  */
 #if @simd_chk@
@@ -196,7 +197,7 @@ NPY_FINLINE @type@ scalar_@op@_@c_sfx@(@type@ a, @type@ b) {
  ******************************************************************************/
 /**begin repeat
  * #sfx = s8, u8, s16, u16, s32, u32, s64, u64, f32, f64#
- * #simd_chk = NPY_SIMD*9, NPY_SIMD_F64#
+ * #simd_chk = NPY_SIMD*8, NPY_SIMD_F32, NPY_SIMD_F64#
  * #is_fp = 0*8, 1, 1#
  * #scalar_sfx = i*8, f, d#
  */
@@ -395,6 +396,9 @@ simd_binary_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip1, npy_intp sip1,
 #elif NPY_SIMD && NPY_BITSOF_@BTYPE@ == @len@
     #if @is_fp@
         #define TO_SIMD_SFX(X) X##_f@len@
+        #if NPY_BITSOF_@BTYPE@ == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
         #if NPY_BITSOF_@BTYPE@ == 64 && !NPY_SIMD_F64
             #undef TO_SIMD_SFX
         #endif
diff --git a/numpy/core/src/umath/loops_trigonometric.dispatch.c.src b/numpy/core/src/umath/loops_trigonometric.dispatch.c.src
index 44c47d14f..78685e807 100644
--- a/numpy/core/src/umath/loops_trigonometric.dispatch.c.src
+++ b/numpy/core/src/umath/loops_trigonometric.dispatch.c.src
@@ -3,6 +3,7 @@
  ** (avx2 fma3) avx512f
  ** vsx2 vsx3 vsx4
  ** neon_vfpv4
+ ** vxe vxe2
  **/
 #include "numpy/npy_math.h"
 #include "simd/simd.h"
@@ -13,7 +14,7 @@
  * - use vectorized version of Payne-Hanek style reduction for large elements or
  *   when there's no native FUSED support instead of fallback to libc
  */
-#if NPY_SIMD_FMA3 // native support
+#if NPY_SIMD_F32 && NPY_SIMD_FMA3 // native support
 /*
  * Vectorized Cody-Waite range reduction technique
  * Performs the reduction step x* = x - y*C in three steps:
@@ -210,7 +211,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_@func@)
     const npy_intp sdst = steps[1] / lsize;
     npy_intp len = dimensions[0];
     assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));
-#if NPY_SIMD_FMA3
+#if NPY_SIMD_F32 && NPY_SIMD_FMA3
     if (is_mem_overlap(src, steps[0], dst, steps[1], len) ||
         !npyv_loadable_stride_f32(ssrc) || !npyv_storable_stride_f32(sdst)
     ) {
diff --git a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
index 78e231965..0ac39a9b1 100644
--- a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
+++ b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
@@ -3,6 +3,7 @@
  ** sse2 sse41
  ** vsx2
  ** neon asimd
+ ** vx vxe
  **/
 /**
  * Force use SSE only on x86, even if AVX2 or AVX512F are enabled
@@ -18,7 +19,7 @@
 /**********************************************************
  ** Scalars
  **********************************************************/
-#if !NPY_SIMD
+#if !NPY_SIMD_F32
 NPY_FINLINE float c_recip_f32(float a)
 { return 1.0f / a; }
 NPY_FINLINE float c_abs_f32(float a)
@@ -29,7 +30,7 @@ NPY_FINLINE float c_abs_f32(float a)
 }
 NPY_FINLINE float c_square_f32(float a)
 { return a * a; }
-#endif // !NPY_SIMD
+#endif // !NPY_SIMD_F32
 
 #if !NPY_SIMD_F64
 NPY_FINLINE double c_recip_f64(double a)
@@ -147,7 +148,7 @@ NPY_FINLINE double c_square_f64(double a)
 /**begin repeat
  * #TYPE = FLOAT, DOUBLE#
  * #sfx  = f32, f64#
- * #VCHK = NPY_SIMD, NPY_SIMD_F64#
+ * #VCHK = NPY_SIMD_F32, NPY_SIMD_F64#
  */
 #if @VCHK@
 /**begin repeat1
@@ -259,7 +260,7 @@ static void simd_@TYPE@_@kind@_@STYPE@_@DTYPE@
 /**begin repeat
  * #TYPE = FLOAT, DOUBLE#
  * #sfx  = f32, f64#
- * #VCHK = NPY_SIMD, NPY_SIMD_F64#
+ * #VCHK = NPY_SIMD_F32, NPY_SIMD_F64#
  */
 /**begin repeat1
  * #kind  = rint, floor, ceil, trunc, sqrt, absolute, square, reciprocal#
diff --git a/numpy/core/src/umath/scalarmath.c.src b/numpy/core/src/umath/scalarmath.c.src
index 4993546f8..ef608378a 100644
--- a/numpy/core/src/umath/scalarmath.c.src
+++ b/numpy/core/src/umath/scalarmath.c.src
@@ -499,17 +499,26 @@ half_ctype_power(npy_half a, npy_half b, npy_half *out)
  * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
  *         npy_long, npy_ulong, npy_longlong, npy_ulonglong,
  *         npy_float, npy_double, npy_longdouble#
+ * #NAME = BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ *         LONG, ULONG, LONGLONG, ULONGLONG,
+ *         FLOAT, DOUBLE, LONGDOUBLE#
  * #uns = (0,1)*5,0*3#
+ * #int = 1*10,0*3#
  */
 static NPY_INLINE int
 @name@_ctype_negative(@type@ a, @type@ *out)
 {
-    *out = -a;
 #if @uns@
+    *out = -a;  /* unsigned: always reported as overflow */
     return NPY_FPE_OVERFLOW;
-#else
-    return 0;
+#elif @int@
+    if (a == NPY_MIN_@NAME@) {
+        *out = a;  /* signed minimum: result is left at `a`, flag overflow */
+        return NPY_FPE_OVERFLOW;
+    }
 #endif
+    *out = -a;  /* remaining cases: plain negation, no error flag */
+    return 0;
 }
 /**end repeat**/
 
@@ -584,10 +593,15 @@ static NPY_INLINE int
 /**begin repeat
  * #name = byte, short, int, long, longlong#
  * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong#
+ * #NAME = BYTE, SHORT, INT, LONG, LONGLONG#
  */
 static NPY_INLINE int
 @name@_ctype_absolute(@type@ a, @type@ *out)
 {
+    if (a == NPY_MIN_@NAME@) {
+        *out = a;  /* minimum value: result is left at `a`, flag overflow */
+        return NPY_FPE_OVERFLOW;
+    }
     *out = (a < 0 ? -a : a);
     return 0;
 }
@@ -1564,8 +1578,23 @@ static PyObject *
 
 
     val = PyArrayScalar_VAL(a, @Name@);
+    int retstatus = @name@_ctype_@oper@(val, &out);
 
-    @name@_ctype_@oper@(val, &out);
+    if (retstatus) {
+        int bufsize, errmask;
+        PyObject *errobj;
+
+        if (PyUFunc_GetPyValues("@name@_scalars", &bufsize, &errmask,
+                                &errobj) < 0) {
+            return NULL;
+        }
+        int first = 1;
+        if (PyUFunc_handlefperr(errmask, errobj, retstatus, &first)) {
+            Py_XDECREF(errobj);
+            return NULL;
+        }
+        Py_XDECREF(errobj);
+    }
 
     /*
      * TODO: Complex absolute should check floating point flags.
diff --git a/numpy/core/src/umath/string_ufuncs.cpp b/numpy/core/src/umath/string_ufuncs.cpp
new file mode 100644
index 000000000..5a35c318b
--- /dev/null
+++ b/numpy/core/src/umath/string_ufuncs.cpp
@@ -0,0 +1,449 @@
+#include <Python.h>
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+#define _UMATHMODULE
+
+#include "numpy/ndarraytypes.h"
+
+#include "numpyos.h"
+#include "dispatching.h"
+#include "dtypemeta.h"
+#include "common_dtype.h"
+#include "convert_datatype.h"
+
+#include "string_ufuncs.h"
+
+
+template <typename character>
+static NPY_INLINE int
+character_cmp(character a, character b)
+{
+    if (a == b) {
+        return 0;
+    }
+    else if (a < b) {
+        return -1;
+    }
+    else {
+        return 1;
+    }
+}
+
+
+/*
+ * Compare two strings of possibly different length (given in characters).
+ * The shorter one is treated as padded with trailing zeros, and a zero sorts
+ * before any other character.  The result is <0, 0, or >0 like `memcmp`.
+ */
+template <bool rstrip, typename character>
+static NPY_INLINE int
+string_cmp(int len1, const character *str1, int len2, const character *str2)
+{
+    if (rstrip) {
+        /*
+         * Ignore/"trim" trailing whitespace (and 0s).  Note that this function
+         * does not support unicode whitespace (and never has).
+         */
+        while (len1 > 0) {
+            character c = str1[len1-1];
+            if (c != (character)0 && !NumPyOS_ascii_isspace(c)) {
+                break;
+            }
+            len1--;
+        }
+        while (len2 > 0) {
+            character c = str2[len2-1];
+            if (c != (character)0 && !NumPyOS_ascii_isspace(c)) {
+                break;
+            }
+            len2--;
+        }
+    }
+
+    int n = PyArray_MIN(len1, len2);
+
+    if (sizeof(character) == 1) {
+        /*
+         * TODO: `memcmp` makes things 2x faster for longer words that match
+         *       exactly, but at least 2x slower for short or mismatching ones.
+         */
+        int cmp = memcmp(str1, str2, n);
+        if (cmp != 0) {
+            return cmp;
+        }
+        str1 += n;
+        str2 += n;
+    }
+    else {
+        for (int i = 0; i < n; i++) {
+            int cmp = character_cmp(*str1, *str2);
+            if (cmp != 0) {
+                return cmp;
+            }
+            str1++;
+            str2++;
+        }
+    }
+    if (len1 > len2) {
+        for (int i = n; i < len1; i++) {
+            /* Bytes order as unsigned (see memcmp): only 0 equals padding */
+            if (*str1 != (character)0) {
+                return 1;
+            }
+            str1++;
+        }
+    }
+    else if (len2 > len1) {
+        for (int i = n; i < len2; i++) {
+            /* Bytes order as unsigned (see memcmp): only 0 equals padding */
+            if (*str2 != (character)0) {
+                return -1;
+            }
+            str2++;
+        }
+    }
+    return 0;
+}
+
+
+/*
+ * Helper for templating, avoids warnings about uncovered switch paths.
+ */
+enum class COMP {
+    EQ, NE, LT, LE, GT, GE,
+};
+
+static char const *
+comp_name(COMP comp) {
+    switch(comp) {
+        case COMP::EQ: return "equal";
+        case COMP::NE: return "not_equal";
+        case COMP::LT: return "less";
+        case COMP::LE: return "less_equal";
+        case COMP::GT: return "greater";
+        case COMP::GE: return "greater_equal";
+        default:
+            assert(0);
+            return nullptr;
+    }
+}
+
+
+template <bool rstrip, COMP comp, typename character>
+static int
+string_comparison_loop(PyArrayMethod_Context *context,
+        char *const data[], npy_intp const dimensions[],
+        npy_intp const strides[], NpyAuxData *NPY_UNUSED(auxdata))
+{
+    /*
+     * Note, fetching `elsize` from the descriptor is OK even without the GIL,
+     * however it may be that this should be moved into `auxdata` eventually,
+     * which may also be slightly faster/cleaner (but more involved).
+     */
+    int len1 = context->descriptors[0]->elsize / sizeof(character);
+    int len2 = context->descriptors[1]->elsize / sizeof(character);
+
+    char *in1 = data[0];
+    char *in2 = data[1];
+    char *out = data[2];
+
+    npy_intp N = dimensions[0];
+
+    while (N--) {
+        int cmp = string_cmp<rstrip>(
+                len1, (character *)in1, len2, (character *)in2);
+        npy_bool res;
+        switch (comp) {
+            case COMP::EQ:
+                res = cmp == 0;
+                break;
+            case COMP::NE:
+                res = cmp != 0;
+                break;
+            case COMP::LT:
+                res = cmp < 0;
+                break;
+            case COMP::LE:
+                res = cmp <= 0;
+                break;
+            case COMP::GT:
+                res = cmp > 0;
+                break;
+            case COMP::GE:
+                res = cmp >= 0;
+                break;
+        }
+        *(npy_bool *)out = res;
+
+        in1 += strides[0];
+        in2 += strides[1];
+        out += strides[2];
+    }
+    return 0;
+}
+
+
+/*
+ * Machinery to add the string loops to the existing ufuncs.
+ */
+
+/*
+ * This function replaces the strided loop with the passed in one,
+ * and registers it with the given ufunc.
+ */
+static int
+add_loop(PyObject *umath, const char *ufunc_name,
+         PyArrayMethod_Spec *spec, PyArrayMethod_StridedLoop *loop)
+{
+    PyObject *name = PyUnicode_FromString(ufunc_name);
+    if (name == nullptr) {
+        return -1;
+    }
+    PyObject *ufunc = PyObject_GetItem(umath, name);
+    Py_DECREF(name);
+    if (ufunc == nullptr) {
+        return -1;
+    }
+    spec->slots[0].pfunc = (void *)loop;
+
+    int res = PyUFunc_AddLoopFromSpec(ufunc, spec);
+    Py_DECREF(ufunc);
+    return res;
+}
+
+
+template<bool rstrip, typename character, COMP...>
+struct add_loops;
+
+template<bool rstrip, typename character>
+struct add_loops<rstrip, character> {
+    int operator()(PyObject*, PyArrayMethod_Spec*) {
+        return 0;
+    }
+};
+
+template<bool rstrip, typename character, COMP comp, COMP... comps>
+struct add_loops<rstrip, character, comp, comps...> {
+    int operator()(PyObject* umath, PyArrayMethod_Spec* spec) {
+        PyArrayMethod_StridedLoop* loop = string_comparison_loop<rstrip, comp, character>;
+
+        if (add_loop(umath, comp_name(comp), spec, loop) < 0) {
+            return -1;
+        }
+        else {
+            return add_loops<rstrip, character, comps...>()(umath, spec);
+        }
+    }
+};
+
+
+NPY_NO_EXPORT int
+init_string_ufuncs(PyObject *umath)
+{
+    int res = -1;
+    /* NOTE: This should receive global symbols? */
+    PyArray_DTypeMeta *String = PyArray_DTypeFromTypeNum(NPY_STRING);
+    PyArray_DTypeMeta *Unicode = PyArray_DTypeFromTypeNum(NPY_UNICODE);
+    PyArray_DTypeMeta *Bool = PyArray_DTypeFromTypeNum(NPY_BOOL);
+
+    /* We start with the string loops: */
+    PyArray_DTypeMeta *dtypes[] = {String, String, Bool};
+    /*
+     * We only have one loop right now, the strided one.  The default type
+     * resolver ensures native byte order/canonical representation.
+     */
+    PyType_Slot slots[] = {
+        {NPY_METH_strided_loop, nullptr},
+        {0, nullptr}
+    };
+
+    PyArrayMethod_Spec spec = {};
+    spec.name = "templated_string_comparison";
+    spec.nin = 2;
+    spec.nout = 1;
+    spec.dtypes = dtypes;
+    spec.slots = slots;
+    spec.flags = NPY_METH_NO_FLOATINGPOINT_ERRORS;
+
+    /* All String loops */
+    using string_looper = add_loops<false, npy_byte, COMP::EQ, COMP::NE, COMP::LT, COMP::LE, COMP::GT, COMP::GE>;
+    if (string_looper()(umath, &spec) < 0) {
+        goto finish;
+    }
+
+    /* All Unicode loops */
+    using ucs_looper = add_loops<false, npy_ucs4, COMP::EQ, COMP::NE, COMP::LT, COMP::LE, COMP::GT, COMP::GE>;
+    dtypes[0] = Unicode;
+    dtypes[1] = Unicode;
+    if (ucs_looper()(umath, &spec) < 0) {
+        goto finish;
+    }
+
+    res = 0;
+  finish:
+    Py_DECREF(String);
+    Py_DECREF(Unicode);
+    Py_DECREF(Bool);
+    return res;
+}
+
+
+template <bool rstrip, typename character>
+static PyArrayMethod_StridedLoop *
+get_strided_loop(int comp)
+{
+    switch (comp) {
+        case Py_EQ:
+            return string_comparison_loop<rstrip, COMP::EQ, character>;
+        case Py_NE:
+            return string_comparison_loop<rstrip, COMP::NE, character>;
+        case Py_LT:
+            return string_comparison_loop<rstrip, COMP::LT, character>;
+        case Py_LE:
+            return string_comparison_loop<rstrip, COMP::LE, character>;
+        case Py_GT:
+            return string_comparison_loop<rstrip, COMP::GT, character>;
+        case Py_GE:
+            return string_comparison_loop<rstrip, COMP::GE, character>;
+        default:
+            assert(false);  /* caller ensures this */
+    }
+    return nullptr;
+}
+
+
+/*
+ * This function is used for `compare_chararrays` and currently also void
+ * comparisons (unstructured voids).  The first could probably be deprecated
+ * and removed but is used by `np.char.chararray` the latter should also be
+ * moved to the ufunc probably (removing the need for manual looping).
+ *
+ * The `rstrip` mechanism is presumably for some fortran compat, but the
+ * question is whether it would not be better to have/use `rstrip` on such
+ * an array first...
+ *
+ * NOTE: This function is also used for unstructured voids, this works because
+ *       `npy_byte` is correct.
+ */
+NPY_NO_EXPORT PyObject *
+_umath_strings_richcompare(
+        PyArrayObject *self, PyArrayObject *other, int cmp_op, int rstrip)
+{
+    NpyIter *iter = nullptr;
+    PyObject *result = nullptr;
+
+    char **dataptr = nullptr;
+    npy_intp *strides = nullptr;
+    npy_intp *countptr = nullptr;
+    npy_intp size = 0;
+
+    PyArrayMethod_Context context = {};
+    NpyIter_IterNextFunc *iternext = nullptr;
+
+    npy_uint32 it_flags = (
+            NPY_ITER_EXTERNAL_LOOP | NPY_ITER_ZEROSIZE_OK |
+            NPY_ITER_BUFFERED | NPY_ITER_GROWINNER);
+    npy_uint32 op_flags[3] = {
+            NPY_ITER_READONLY | NPY_ITER_ALIGNED,
+            NPY_ITER_READONLY | NPY_ITER_ALIGNED,
+            NPY_ITER_WRITEONLY | NPY_ITER_ALLOCATE | NPY_ITER_ALIGNED};
+
+    PyArrayMethod_StridedLoop *strided_loop = nullptr;
+    NPY_BEGIN_THREADS_DEF;
+
+    if (PyArray_TYPE(self) != PyArray_TYPE(other)) {
+        /*
+         * Comparison between Bytes and Unicode is not defined in Py3K;
+         * we follow.
+         * TODO: This makes no sense at all for `compare_chararrays`, kept
+         *       only under the assumption that we are more likely to deprecate
+         *       than fix it to begin with.
+         */
+        Py_INCREF(Py_NotImplemented);
+        return Py_NotImplemented;
+    }
+
+    PyArrayObject *ops[3] = {self, other, nullptr};
+    PyArray_Descr *descrs[3] = {nullptr, nullptr, PyArray_DescrFromType(NPY_BOOL)};
+    /* TODO: ensuring native byte order is not really necessary for == and != */
+    descrs[0] = NPY_DT_CALL_ensure_canonical(PyArray_DESCR(self));
+    if (descrs[0] == nullptr) {
+        goto finish;
+    }
+    descrs[1] = NPY_DT_CALL_ensure_canonical(PyArray_DESCR(other));
+    if (descrs[1] == nullptr) {
+        goto finish;
+    }
+
+    /*
+     * Create the iterator:
+     */
+    iter = NpyIter_AdvancedNew(
+            3, ops, it_flags, NPY_KEEPORDER, NPY_SAFE_CASTING, op_flags, descrs,
+            -1, nullptr, nullptr, 0);
+    if (iter == nullptr) {
+        goto finish;
+    }
+
+    size = NpyIter_GetIterSize(iter);
+    if (size == 0) {
+        result = (PyObject *)NpyIter_GetOperandArray(iter)[2];
+        Py_INCREF(result);
+        goto finish;
+    }
+
+    iternext = NpyIter_GetIterNext(iter, nullptr);
+    if (iternext == nullptr) {
+        goto finish;
+    }
+
+    /*
+     * Prepare the inner-loop and execute it (we only need descriptors to be
+     * passed in).
+     */
+    context.descriptors = descrs;
+
+    dataptr = NpyIter_GetDataPtrArray(iter);
+    strides = NpyIter_GetInnerStrideArray(iter);
+    countptr = NpyIter_GetInnerLoopSizePtr(iter);
+
+    if (rstrip == 0) {
+        /* NOTE: Also used for VOID, so can be STRING, UNICODE, or VOID: */
+        if (descrs[0]->type_num != NPY_UNICODE) {
+            strided_loop = get_strided_loop<false, npy_byte>(cmp_op);
+        }
+        else {
+            strided_loop = get_strided_loop<false, npy_ucs4>(cmp_op);
+        }
+    }
+    else {
+        if (descrs[0]->type_num != NPY_UNICODE) {
+            strided_loop = get_strided_loop<true, npy_byte>(cmp_op);
+        }
+        else {
+            strided_loop = get_strided_loop<true, npy_ucs4>(cmp_op);
+        }
+    }
+
+    NPY_BEGIN_THREADS_THRESHOLDED(size);
+
+    do {
+         /* We know the loop cannot fail */
+         strided_loop(&context, dataptr, countptr, strides, nullptr);
+    } while (iternext(iter) != 0);
+
+    NPY_END_THREADS;
+
+    result = (PyObject *)NpyIter_GetOperandArray(iter)[2];
+    Py_INCREF(result);
+
+ finish:
+    if (NpyIter_Deallocate(iter) < 0) {
+        Py_CLEAR(result);
+    }
+    Py_XDECREF(descrs[0]);
+    Py_XDECREF(descrs[1]);
+    Py_XDECREF(descrs[2]);
+    return result;
+}
diff --git a/numpy/core/src/umath/string_ufuncs.h b/numpy/core/src/umath/string_ufuncs.h
new file mode 100644
index 000000000..aa1719954
--- /dev/null
+++ b/numpy/core/src/umath/string_ufuncs.h
@@ -0,0 +1,19 @@
+#ifndef _NPY_CORE_SRC_UMATH_STRING_UFUNCS_H_
+#define _NPY_CORE_SRC_UMATH_STRING_UFUNCS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+NPY_NO_EXPORT int
+init_string_ufuncs(PyObject *umath);
+
+NPY_NO_EXPORT PyObject *
+_umath_strings_richcompare(
+        PyArrayObject *self, PyArrayObject *other, int cmp_op, int rstrip);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /* _NPY_CORE_SRC_UMATH_STRING_UFUNCS_H_ */
+\ No newline at end of file
diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c
index fce7d61de..2636396d3 100644
--- a/numpy/core/src/umath/ufunc_object.c
+++ b/numpy/core/src/umath/ufunc_object.c
@@ -57,6 +57,10 @@
 #include "legacy_array_method.h"
 #include "abstractdtypes.h"
 
+/* TODO: Only for `NpyIter_GetTransferFlags` until it is public */
+#define NPY_ITERATOR_IMPLEMENTATION_CODE
+#include "nditer_impl.h"
+
 /********** PRINTF DEBUG TRACING **************/
 #define NPY_UF_DBG_TRACING 0
 
@@ -1544,10 +1548,6 @@ execute_ufunc_loop(PyArrayMethod_Context *context, int masked,
     if (masked) {
         baseptrs[nop] = PyArray_BYTES(op_it[nop]);
     }
-    if (NpyIter_ResetBasePointers(iter, baseptrs, NULL) != NPY_SUCCEED) {
-        NpyIter_Deallocate(iter);
-        return -1;
-    }
 
     /*
      * Get the inner loop, with the possibility of specialization
@@ -1584,17 +1584,25 @@ execute_ufunc_loop(PyArrayMethod_Context *context, int masked,
     char **dataptr = NpyIter_GetDataPtrArray(iter);
     npy_intp *strides = NpyIter_GetInnerStrideArray(iter);
     npy_intp *countptr = NpyIter_GetInnerLoopSizePtr(iter);
-    int needs_api = NpyIter_IterationNeedsAPI(iter);
 
     NPY_BEGIN_THREADS_DEF;
 
+    flags = PyArrayMethod_COMBINED_FLAGS(flags, NpyIter_GetTransferFlags(iter));
+
     if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
         npy_clear_floatstatus_barrier((char *)context);
     }
-    if (!needs_api && !(flags & NPY_METH_REQUIRES_PYAPI)) {
+    if (!(flags & NPY_METH_REQUIRES_PYAPI)) {
         NPY_BEGIN_THREADS_THRESHOLDED(full_size);
     }
 
+    /* The reset may copy the first buffer chunk, which could cause FPEs */
+    if (NpyIter_ResetBasePointers(iter, baseptrs, NULL) != NPY_SUCCEED) {
+        NPY_AUXDATA_FREE(auxdata);
+        NpyIter_Deallocate(iter);
+        return -1;
+    }
+
     NPY_UF_DBG_PRINT("Actual inner loop:\n");
     /* Execute the loop */
     int res;
@@ -2388,7 +2396,8 @@ PyUFunc_GeneralizedFunctionInternal(PyUFuncObject *ufunc,
                  NPY_ITER_MULTI_INDEX |
                  NPY_ITER_REFS_OK |
                  NPY_ITER_ZEROSIZE_OK |
-                 NPY_ITER_COPY_IF_OVERLAP;
+                 NPY_ITER_COPY_IF_OVERLAP |
+                 NPY_ITER_DELAY_BUFALLOC;
 
     /* Create the iterator */
     iter = NpyIter_AdvancedNew(nop, op, iter_flags,
diff --git a/numpy/core/src/umath/umathmodule.c b/numpy/core/src/umath/umathmodule.c
index 49328d19e..17fedec6f 100644
--- a/numpy/core/src/umath/umathmodule.c
+++ b/numpy/core/src/umath/umathmodule.c
@@ -23,11 +23,13 @@
 #include "numpy/npy_math.h"
 #include "number.h"
 #include "dispatching.h"
+#include "string_ufuncs.h"
 
 /* Automatically generated code to define all ufuncs: */
 #include "funcs.inc"
 #include "__umath_generated.c"
 
+
 static PyUFuncGenericFunction pyfunc_functions[] = {PyUFunc_On_Om};
 
 static int
@@ -347,5 +349,10 @@ int initumath(PyObject *m)
     if (install_logical_ufunc_promoter(s) < 0) {
         return -1;
     }
+
+    if (init_string_ufuncs(d) < 0) {
+        return -1;
+    }
+
     return 0;
 }
diff --git a/numpy/core/tests/test_abc.py b/numpy/core/tests/test_abc.py
index 30e5748af..8b12d07ac 100644
--- a/numpy/core/tests/test_abc.py
+++ b/numpy/core/tests/test_abc.py
@@ -20,35 +20,35 @@ class TestABC:
     def test_floats(self):
         for t in sctypes['float']:
             assert_(isinstance(t(), numbers.Real),
-                    "{0} is not instance of Real".format(t.__name__))
+                    f"{t.__name__} is not instance of Real")
             assert_(issubclass(t, numbers.Real),
-                    "{0} is not subclass of Real".format(t.__name__))
+                    f"{t.__name__} is not subclass of Real")
             assert_(not isinstance(t(), numbers.Rational),
-                    "{0} is instance of Rational".format(t.__name__))
+                    f"{t.__name__} is instance of Rational")
             assert_(not issubclass(t, numbers.Rational),
-                    "{0} is subclass of Rational".format(t.__name__))
+                    f"{t.__name__} is subclass of Rational")
 
     def test_complex(self):
         for t in sctypes['complex']:
             assert_(isinstance(t(), numbers.Complex),
-                    "{0} is not instance of Complex".format(t.__name__))
+                    f"{t.__name__} is not instance of Complex")
             assert_(issubclass(t, numbers.Complex),
-                    "{0} is not subclass of Complex".format(t.__name__))
+                    f"{t.__name__} is not subclass of Complex")
             assert_(not isinstance(t(), numbers.Real),
-                    "{0} is instance of Real".format(t.__name__))
+                    f"{t.__name__} is instance of Real")
             assert_(not issubclass(t, numbers.Real),
-                    "{0} is subclass of Real".format(t.__name__))
+                    f"{t.__name__} is subclass of Real")
 
     def test_int(self):
         for t in sctypes['int']:
             assert_(isinstance(t(), numbers.Integral),
-                    "{0} is not instance of Integral".format(t.__name__))
+                    f"{t.__name__} is not instance of Integral")
             assert_(issubclass(t, numbers.Integral),
-                    "{0} is not subclass of Integral".format(t.__name__))
+                    f"{t.__name__} is not subclass of Integral")
 
     def test_uint(self):
         for t in sctypes['uint']:
             assert_(isinstance(t(), numbers.Integral),
-                    "{0} is not instance of Integral".format(t.__name__))
+                    f"{t.__name__} is not instance of Integral")
             assert_(issubclass(t, numbers.Integral),
-                    "{0} is not subclass of Integral".format(t.__name__))
+                    f"{t.__name__} is not subclass of Integral")
diff --git a/numpy/core/tests/test_array_coercion.py b/numpy/core/tests/test_array_coercion.py
index e858cd8b6..ed3ef7e67 100644
--- a/numpy/core/tests/test_array_coercion.py
+++ b/numpy/core/tests/test_array_coercion.py
@@ -373,28 +373,29 @@ class TestScalarDiscovery:
         assert discovered_dtype.itemsize == dtype.itemsize
 
     @pytest.mark.parametrize("dtype", np.typecodes["Integer"])
-    def test_scalar_to_int_coerce_does_not_cast(self, dtype):
+    @pytest.mark.parametrize(["scalar", "error"],
+            [(np.float64(np.nan), ValueError),
+             (np.ulonglong(-1), OverflowError)])
+    def test_scalar_to_int_coerce_does_not_cast(self, dtype, scalar, error):
         """
         Signed integers are currently different in that they do not cast other
         NumPy scalar, but instead use scalar.__int__(). The hardcoded
         exception to this rule is `np.array(scalar, dtype=integer)`.
         """
         dtype = np.dtype(dtype)
-        invalid_int = np.ulonglong(-1)
 
-        float_nan = np.float64(np.nan)
-
-        for scalar in [float_nan, invalid_int]:
-            # This is a special case using casting logic and thus not failing:
+        # This is a special case using casting logic.  It warns for the NaN
+        # but allows the cast (giving undefined behaviour).
+        with np.errstate(invalid="ignore"):
             coerced = np.array(scalar, dtype=dtype)
             cast = np.array(scalar).astype(dtype)
-            assert_array_equal(coerced, cast)
+        assert_array_equal(coerced, cast)
 
-            # However these fail:
-            with pytest.raises((ValueError, OverflowError)):
-                np.array([scalar], dtype=dtype)
-            with pytest.raises((ValueError, OverflowError)):
-                cast[()] = scalar
+        # However these fail:
+        with pytest.raises(error):
+            np.array([scalar], dtype=dtype)
+        with pytest.raises(error):
+            cast[()] = scalar
 
 
 class TestTimeScalars:
@@ -614,8 +615,8 @@ class TestBadSequences:
 
         obj.append([2, 3])
         obj.append(mylist([1, 2]))
-        with pytest.raises(RuntimeError):
-            np.array(obj)
+        # Does not crash:
+        np.array(obj)
 
     def test_replace_0d_array(self):
         # List to coerce, `mylist` will mutate the first element
diff --git a/numpy/core/tests/test_casting_floatingpoint_errors.py b/numpy/core/tests/test_casting_floatingpoint_errors.py
new file mode 100644
index 000000000..4fafc4ed8
--- /dev/null
+++ b/numpy/core/tests/test_casting_floatingpoint_errors.py
@@ -0,0 +1,153 @@
+import pytest
+from pytest import param
+
+import numpy as np
+
+
+def values_and_dtypes():
+    """
+    Generate value+dtype pairs that generate floating point errors during
+    casts.  The invalid casts to integers will generate "invalid" value
+    warnings, the float casts all generate "overflow".
+
+    (The Python int/float paths don't need to get tested in all the same
+    situations, but it does not hurt.)
+    """
+    # Casting to float16:
+    yield param(70000, "float16", id="int-to-f2")
+    yield param("70000", "float16", id="str-to-f2")
+    yield param(70000.0, "float16", id="float-to-f2")
+    yield param(np.longdouble(70000.), "float16", id="longdouble-to-f2")
+    yield param(np.float64(70000.), "float16", id="double-to-f2")
+    yield param(np.float32(70000.), "float16", id="single-to-f2")
+    # Casting to float32:
+    yield param(10**100, "float32", id="int-to-f4")
+    yield param(1e100, "float32", id="float-to-f4")
+    yield param(np.longdouble(1e300), "float32", id="longdouble-to-f4")
+    yield param(np.float64(1e300), "float32", id="double-to-f4")
+    # Casting to float64:
+    # If longdouble is double-double, its max can be rounded down to the double
+    # max.  So we correct the double spacing (a bit weird, admittedly):
+    max_ld = np.finfo(np.longdouble).max
+    spacing = np.spacing(np.nextafter(np.finfo("f8").max, 0))
+    if max_ld - spacing > np.finfo("f8").max:
+        yield param(np.finfo(np.longdouble).max, "float64",
+                    id="longdouble-to-f8")
+
+    # Cast to complex64:
+    yield param(2e300, "complex64", id="float-to-c8")
+    yield param(2e300+0j, "complex64", id="complex-real-to-c8")
+    yield param(2e300j, "complex64", id="complex-imag-to-c8")
+    yield param(np.longdouble(2e300), "complex64", id="longdouble-to-c8")
+
+    # Invalid float to integer casts:
+    with np.errstate(over="ignore"):
+        for to_dt in np.typecodes["AllInteger"]:
+            for value in [np.inf, np.nan]:
+                for from_dt in np.typecodes["AllFloat"]:
+                    from_dt = np.dtype(from_dt)
+                    from_val = from_dt.type(value)
+                    case_id = f"{from_dt.char}-{from_val}-to-{to_dt}"
+                    yield param(from_val, to_dt, id=case_id)
+
+
+def check_operations(dtype, value):
+    """
+    There are many dedicated paths in NumPy which cast and should check for
+    floating point errors which occurred during those casts.
+    """
+    if dtype.kind != 'i':
+        # These assignments use the stricter setitem logic:
+        def assignment():
+            arr = np.empty(3, dtype=dtype)
+            arr[0] = value
+
+        yield assignment
+
+        def fill():
+            arr = np.empty(3, dtype=dtype)
+            arr.fill(value)
+
+        yield fill
+
+    def copyto_scalar():
+        arr = np.empty(3, dtype=dtype)
+        np.copyto(arr, value, casting="unsafe")
+
+    yield copyto_scalar
+
+    def copyto():
+        arr = np.empty(3, dtype=dtype)
+        np.copyto(arr, np.array([value, value, value]), casting="unsafe")
+
+    yield copyto
+
+    def copyto_scalar_masked():
+        arr = np.empty(3, dtype=dtype)
+        np.copyto(arr, value, casting="unsafe",
+                  where=[True, False, True])
+
+    yield copyto_scalar_masked
+
+    def copyto_masked():
+        arr = np.empty(3, dtype=dtype)
+        np.copyto(arr, np.array([value, value, value]), casting="unsafe",
+                  where=[True, False, True])
+
+    yield copyto_masked
+
+    def direct_cast():
+        np.array([value, value, value]).astype(dtype)
+
+    yield direct_cast
+
+    def direct_cast_nd_strided():
+        arr = np.full((5, 5, 5), fill_value=value)[:, ::2, :]
+        arr.astype(dtype)
+
+    yield direct_cast_nd_strided
+
+    def boolean_array_assignment():
+        arr = np.empty(3, dtype=dtype)
+        arr[[True, False, True]] = np.array([value, value])
+
+    yield boolean_array_assignment
+
+    def integer_array_assignment():
+        arr = np.empty(3, dtype=dtype)
+        values = np.array([value, value])
+
+        arr[[0, 1]] = values
+
+    yield integer_array_assignment
+
+    def integer_array_assignment_with_subspace():
+        arr = np.empty((5, 3), dtype=dtype)
+        values = np.array([value, value, value])
+
+        arr[[0, 2]] = values
+
+    yield integer_array_assignment_with_subspace
+
+    def flat_assignment():
+        arr = np.empty((3,), dtype=dtype)
+        values = np.array([value, value, value])
+        arr.flat[:] = values
+
+    yield flat_assignment
+
+@pytest.mark.parametrize(["value", "dtype"], values_and_dtypes())
+@pytest.mark.filterwarnings("ignore::numpy.ComplexWarning")
+def test_floatingpoint_errors_casting(dtype, value):
+    """Every casting path must warn, or raise under errstate(all="raise")."""
+    dtype = np.dtype(dtype)
+    # Integer targets report "invalid"; all other targets report "overflow".
+    match = "invalid" if dtype.kind in 'iu' else "overflow"
+    for operation in check_operations(dtype, value):
+        with pytest.warns(RuntimeWarning, match=match):
+            operation()
+
+        with np.errstate(all="raise"):
+            with pytest.raises(FloatingPointError, match=match):
+                operation()
+
diff --git a/numpy/core/tests/test_deprecations.py b/numpy/core/tests/test_deprecations.py
index 2b7864433..2255cb2a3 100644
--- a/numpy/core/tests/test_deprecations.py
+++ b/numpy/core/tests/test_deprecations.py
@@ -166,7 +166,7 @@ class TestComparisonDeprecations(_DeprecationTestCase):
         # For two string arrays, strings always raised the broadcasting error:
         a = np.array(['a', 'b'])
         b = np.array(['a', 'b', 'c'])
-        assert_raises(ValueError, lambda x, y: x == y, a, b)
+        assert_warns(FutureWarning, lambda x, y: x == y, a, b)
 
         # The empty list is not cast to string, and this used to pass due
         # to dtype mismatch; now (2018-06-21) it correctly leads to a
diff --git a/numpy/core/tests/test_dtype.py b/numpy/core/tests/test_dtype.py
index 32e2c6842..b37bded73 100644
--- a/numpy/core/tests/test_dtype.py
+++ b/numpy/core/tests/test_dtype.py
@@ -1346,6 +1346,16 @@ class TestPromotion:
                 match=r".* no common DType exists for the given inputs"):
             np.result_type(1j, rational(1, 2))
 
+    @pytest.mark.parametrize("val", [2, 2**32, 2**63, 2**64, 2**100])
+    def test_python_integer_promotion(self, val):
+        # If we only pass scalars (mainly python ones!), the result must take
+        # into account that the integer may be considered int32, int64, uint64,
+        # or object depending on the input value.  So test those paths!
+        expected_dtype = np.result_type(np.array(val).dtype, np.array(0).dtype)
+        assert np.result_type(val, 0) == expected_dtype
+        # For completeness sake, also check with a NumPy scalar as second arg:
+        assert np.result_type(val, np.int8(0)) == expected_dtype
+
     @pytest.mark.parametrize(["other", "expected"],
             [(1, rational), (1., np.float64)])
     def test_float_int_pyscalar_promote_rational(self, other, expected):
diff --git a/numpy/core/tests/test_half.py b/numpy/core/tests/test_half.py
index 1b6fd21e1..6743dfb51 100644
--- a/numpy/core/tests/test_half.py
+++ b/numpy/core/tests/test_half.py
@@ -104,9 +104,9 @@ class TestHalf:
 
         # Increase the float by a minimal value:
         if offset == "up":
-            f16s_float = np.nextafter(f16s_float, float_t(1e50))
+            f16s_float = np.nextafter(f16s_float, float_t(np.inf))
         elif offset == "down":
-            f16s_float = np.nextafter(f16s_float, float_t(-1e50))
+            f16s_float = np.nextafter(f16s_float, float_t(-np.inf))
 
         # Convert back to float16 and its bit pattern:
         res_patterns = f16s_float.astype(np.float16).view(np.uint16)
@@ -233,12 +233,14 @@ class TestHalf:
                    np.inf]
 
         # Check float64->float16 rounding
-        b = np.array(a, dtype=float16)
+        with np.errstate(over="ignore"):
+            b = np.array(a, dtype=float16)
         assert_equal(b, rounded)
 
         # Check float32->float16 rounding
         a = np.array(a, dtype=float32)
-        b = np.array(a, dtype=float16)
+        with np.errstate(over="ignore"):
+            b = np.array(a, dtype=float16)
         assert_equal(b, rounded)
 
     def test_half_correctness(self):
diff --git a/numpy/core/tests/test_indexing.py b/numpy/core/tests/test_indexing.py
index efcb92c2e..9ef30eae2 100644
--- a/numpy/core/tests/test_indexing.py
+++ b/numpy/core/tests/test_indexing.py
@@ -1297,11 +1297,10 @@ class TestBooleanIndexing:
     def test_boolean_indexing_weirdness(self):
         # Weird boolean indexing things
         a = np.ones((2, 3, 4))
-        a[False, True, ...].shape == (0, 2, 3, 4)
-        a[True, [0, 1], True, True, [1], [[2]]] == (1, 2)
+        assert a[False, True, ...].shape == (0, 2, 3, 4)
+        assert a[True, [0, 1], True, True, [1], [[2]]].shape == (1, 2)
         assert_raises(IndexError, lambda: a[False, [0, 1], ...])
 
-
     def test_boolean_indexing_fast_path(self):
         # These used to either give the wrong error, or incorrectly give no
         # error.
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index f4454130d..84fdf545f 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -68,8 +68,8 @@ def _aligned_zeros(shape, dtype=float, order="C", align=None):
     # Note: slices producing 0-size arrays do not necessarily change
     # data pointer --- so we use and allocate size+1
     buf = buf[offset:offset+size+1][:-1]
+    buf.fill(0)
     data = np.ndarray(shape, dtype, buf, order=order)
-    data.fill(0)
     return data
 
 
@@ -1244,6 +1244,18 @@ class TestStructured:
         # The main importance is that it does not return True:
         with pytest.raises(TypeError):
             x == y
+ 
+    def test_empty_structured_array_comparison(self):
+        # Check that comparison works on empty arrays whose fields have
+        # nontrivial (subarray) shapes
+        a = np.zeros(0, [('a', '<f8', (1, 1))])
+        assert_equal(a, a)
+        a = np.zeros(0, [('a', '<f8', (1,))])
+        assert_equal(a, a)
+        a = np.zeros((0, 0), [('a', '<f8', (1, 1))])
+        assert_equal(a, a)
+        a = np.zeros((1, 0, 1), [('a', '<f8', (1, 1))])
+        assert_equal(a, a)
 
     def test_structured_comparisons_with_promotion(self):
         # Check that structured arrays can be compared so long as their
diff --git a/numpy/core/tests/test_numeric.py b/numpy/core/tests/test_numeric.py
index 0b03c6576..5b15e29b4 100644
--- a/numpy/core/tests/test_numeric.py
+++ b/numpy/core/tests/test_numeric.py
@@ -2939,7 +2939,9 @@ class TestLikeFuncs:
         self.check_like_function(np.full_like, 1, True)
         self.check_like_function(np.full_like, 1000, True)
         self.check_like_function(np.full_like, 123.456, True)
-        self.check_like_function(np.full_like, np.inf, True)
+        # Inf to integer casts cause invalid-value errors: ignore them.
+        with np.errstate(invalid="ignore"):
+            self.check_like_function(np.full_like, np.inf, True)
 
     @pytest.mark.parametrize('likefunc', [np.empty_like, np.full_like,
                                           np.zeros_like, np.ones_like])
diff --git a/numpy/core/tests/test_overrides.py b/numpy/core/tests/test_overrides.py
index 36970dbc0..e68406ebd 100644
--- a/numpy/core/tests/test_overrides.py
+++ b/numpy/core/tests/test_overrides.py
@@ -355,6 +355,45 @@ class TestArrayFunctionImplementation:
                 TypeError, "no implementation found for 'my.func'"):
             func(MyArray())
 
+    def test_signature_error_message(self):
+        # The dispatcher is named "_dispatcher", but the TypeError raised for
+        # a bad signature should show the name as "func"
+        def _dispatcher():
+            return ()
+
+        @array_function_dispatch(_dispatcher)
+        def func():
+            pass
+
+        try:
+            func(bad_arg=3)
+        except TypeError as e:
+            expected_exception = e
+
+        try:
+            func(bad_arg=3)
+            raise AssertionError("must fail")
+        except TypeError as exc:
+            assert exc.args == expected_exception.args
+
+    @pytest.mark.parametrize("value", [234, "this func is not replaced"])
+    def test_dispatcher_error(self, value):
+        # If the dispatcher raises an error, we must not attempt to mutate it
+        error = TypeError(value)
+
+        def dispatcher():
+            raise error
+
+        @array_function_dispatch(dispatcher)
+        def func():
+            return 3
+
+        try:
+            func()
+            raise AssertionError("must fail")
+        except TypeError as exc:
+            assert exc is error  # unmodified exception
+
 
 class TestNDArrayMethods:
 
diff --git a/numpy/core/tests/test_regression.py b/numpy/core/tests/test_regression.py
index 98e0df9b8..4538c825d 100644
--- a/numpy/core/tests/test_regression.py
+++ b/numpy/core/tests/test_regression.py
@@ -326,20 +326,20 @@ class TestRegression:
         assert_raises(ValueError, bfa)
         assert_raises(ValueError, bfb)
 
-    def test_nonarray_assignment(self):
+    @pytest.mark.parametrize("index",
+            [np.ones(10, dtype=bool), np.arange(10)],
+            ids=["boolean-arr-index", "integer-arr-index"])
+    def test_nonarray_assignment(self, index):
         # See also Issue gh-2870, test for non-array assignment
         # and equivalent unsafe casted array assignment
         a = np.arange(10)
-        b = np.ones(10, dtype=bool)
-        r = np.arange(10)
 
-        def assign(a, b, c):
-            a[b] = c
+        with pytest.raises(ValueError):
+            a[index] = np.nan
 
-        assert_raises(ValueError, assign, a, b, np.nan)
-        a[b] = np.array(np.nan)  # but not this.
-        assert_raises(ValueError, assign, a, r, np.nan)
-        a[r] = np.array(np.nan)
+        with np.errstate(invalid="warn"):
+            with pytest.warns(RuntimeWarning, match="invalid value"):
+                a[index] = np.array(np.nan)  # Only warns
 
     def test_unpickle_dtype_with_object(self):
         # Implemented in r2840
@@ -1496,7 +1496,7 @@ class TestRegression:
             min = np.array([np.iinfo(t).min])
             min //= -1
 
-        with np.errstate(divide="ignore"):
+        with np.errstate(over="ignore"):
             for t in (np.int8, np.int16, np.int32, np.int64, int):
                 test_type(t)
 
diff --git a/numpy/core/tests/test_scalarmath.py b/numpy/core/tests/test_scalarmath.py
index b7fe5183e..8b14284ff 100644
--- a/numpy/core/tests/test_scalarmath.py
+++ b/numpy/core/tests/test_scalarmath.py
@@ -683,8 +683,12 @@ class TestNegative:
             sup.filter(RuntimeWarning)
             for dt in types:
                 a = np.ones((), dtype=dt)[()]
-                assert_equal(operator.neg(a) + a, 0)
-
+                if dt in np.typecodes['UnsignedInteger']:
+                    st = np.dtype(dt).type
+                    max = st(np.iinfo(dt).max)
+                    assert_equal(operator.neg(a), max)
+                else:
+                    assert_equal(operator.neg(a) + a, 0)
 
 class TestSubtract:
     def test_exceptions(self):
@@ -896,9 +900,13 @@ def test_scalar_integer_operation_overflow(dtype, operation):
 
 @pytest.mark.parametrize("dtype", np.typecodes["Integer"])
 @pytest.mark.parametrize("operation", [
+        lambda min, neg_1: -min,
         lambda min, neg_1: abs(min),
-        lambda min, neg_1: min * neg_1,
-        lambda min, neg_1: min // neg_1], ids=["abs", "*", "//"])
+        pytest.param(lambda min, neg_1: min * neg_1,
+            marks=pytest.mark.xfail(reason="broken on some platforms")),
+        pytest.param(lambda min, neg_1: min // neg_1,
+            marks=pytest.mark.skip(reason="broken on some platforms"))],
+        ids=["neg", "abs", "*", "//"])
 def test_scalar_signed_integer_overflow(dtype, operation):
     # The minimum signed integer can "overflow" for some additional operations
     st = np.dtype(dtype).type
@@ -910,8 +918,7 @@ def test_scalar_signed_integer_overflow(dtype, operation):
 
 
 @pytest.mark.parametrize("dtype", np.typecodes["UnsignedInteger"])
-@pytest.mark.xfail  # TODO: the check is quite simply missing!
-def test_scalar_signed_integer_overflow(dtype):
+def test_scalar_unsigned_integer_overflow(dtype):
     val = np.dtype(dtype).type(8)
     with pytest.warns(RuntimeWarning, match="overflow encountered"):
         -val
diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py
index 324948cf2..c4488533a 100644
--- a/numpy/core/tests/test_simd.py
+++ b/numpy/core/tests/test_simd.py
@@ -85,16 +85,13 @@ class _Test_Utility:
             return getattr(self.npyv, cvt_intrin.format(sfx[1:], sfx))(vector)
 
     def _pinfinity(self):
-        v = self.npyv.setall_u32(0x7f800000)
-        return self.npyv.reinterpret_f32_u32(v)[0]
+        return float("inf")
 
     def _ninfinity(self):
-        v = self.npyv.setall_u32(0xff800000)
-        return self.npyv.reinterpret_f32_u32(v)[0]
+        return -float("inf")
 
     def _nan(self):
-        v = self.npyv.setall_u32(0x7fc00000)
-        return self.npyv.reinterpret_f32_u32(v)[0]
+        return float("nan")
 
     def _cpu_features(self):
         target = self.target_name
@@ -170,8 +167,9 @@ class _SIMD_BOOL(_Test_Utility):
         for data in (self._data(), self._data(reverse=True)):
             vdata = self._load_b(data)
             data_bits = data2bits(data)
-            tobits = bin(self.tobits(vdata))
-            assert tobits == bin(data_bits)
+            tobits = self.tobits(vdata)
+            bin_tobits = bin(tobits)
+            assert bin_tobits == bin(data_bits)
 
     def test_pack(self):
         """
@@ -746,9 +744,11 @@ class _SIMD_ALL(_Test_Utility):
         # We're testing the sanity of _simd's type-vector,
         # reinterpret* intrinsics itself are tested via compiler
         # during the build of _simd module
-        sfxes = ["u8", "s8", "u16", "s16", "u32", "s32", "u64", "s64", "f32"]
+        sfxes = ["u8", "s8", "u16", "s16", "u32", "s32", "u64", "s64"]
         if self.npyv.simd_f64:
             sfxes.append("f64")
+        if self.npyv.simd_f32:
+            sfxes.append("f32")
         for sfx in sfxes:
             vec_name = getattr(self, "reinterpret_" + sfx)(vdata_a).__name__
             assert vec_name == "npyv_" + sfx
@@ -1077,8 +1077,13 @@ for target_name, npyv in targets.items():
         skip = f"target '{pretty_name}' isn't supported by current machine"
     elif not npyv.simd:
         skip = f"target '{pretty_name}' isn't supported by NPYV"
-    elif not npyv.simd_f64:
-        skip_sfx["f64"] = f"target '{pretty_name}' doesn't support double-precision"
+    else:
+        if not npyv.simd_f32:
+            skip_sfx["f32"] = f"target '{pretty_name}' "\
+                               "doesn't support single-precision"
+        if not npyv.simd_f64:
+            skip_sfx["f64"] = f"target '{pretty_name}' "\
+                               "doesn't support double-precision"
 
     for sfxes, cls in tests_registry.items():
         for sfx in sfxes:
diff --git a/numpy/core/tests/test_simd_module.py b/numpy/core/tests/test_simd_module.py
index 3d710884a..44dc58dac 100644
--- a/numpy/core/tests/test_simd_module.py
+++ b/numpy/core/tests/test_simd_module.py
@@ -12,7 +12,9 @@ npyv, npyv2 = (npyvs + [None, None])[:2]
 
 unsigned_sfx = ["u8", "u16", "u32", "u64"]
 signed_sfx = ["s8", "s16", "s32", "s64"]
-fp_sfx = ["f32"]
+fp_sfx = []
+if npyv and npyv.simd_f32:
+    fp_sfx.append("f32")
 if npyv and npyv.simd_f64:
     fp_sfx.append("f64")
 
diff --git a/numpy/core/tests/test_strings.py b/numpy/core/tests/test_strings.py
new file mode 100644
index 000000000..2b87ed654
--- /dev/null
+++ b/numpy/core/tests/test_strings.py
@@ -0,0 +1,85 @@
+import pytest
+
+import operator
+import numpy as np
+
+from numpy.testing import assert_array_equal
+
+
+COMPARISONS = [
+    (operator.eq, np.equal, "=="),
+    (operator.ne, np.not_equal, "!="),
+    (operator.lt, np.less, "<"),
+    (operator.le, np.less_equal, "<="),
+    (operator.gt, np.greater, ">"),
+    (operator.ge, np.greater_equal, ">="),
+]
+
+
+@pytest.mark.parametrize(["op", "ufunc", "sym"], COMPARISONS)
+def test_mixed_string_comparison_ufuncs_fail(op, ufunc, sym):
+    arr_string = np.array(["a", "b"], dtype="S")
+    arr_unicode = np.array(["a", "c"], dtype="U")
+
+    with pytest.raises(TypeError, match="did not contain a loop"):
+        ufunc(arr_string, arr_unicode)
+
+    with pytest.raises(TypeError, match="did not contain a loop"):
+        ufunc(arr_unicode, arr_string)
+
+@pytest.mark.parametrize(["op", "ufunc", "sym"], COMPARISONS)
+def test_mixed_string_comparisons_ufuncs_with_cast(op, ufunc, sym):
+    arr_string = np.array(["a", "b"], dtype="S")
+    arr_unicode = np.array(["a", "c"], dtype="U")
+
+    # While there is no loop, manual casting is acceptable:
+    res1 = ufunc(arr_string, arr_unicode, signature="UU->?", casting="unsafe")
+    res2 = ufunc(arr_string, arr_unicode, signature="SS->?", casting="unsafe")
+
+    expected = op(arr_string.astype('U'), arr_unicode)
+    assert_array_equal(res1, expected)
+    assert_array_equal(res2, expected)
+
+
+@pytest.mark.parametrize(["op", "ufunc", "sym"], COMPARISONS)
+@pytest.mark.parametrize("dtypes", [
+        ("S2", "S2"), ("S2", "S10"),
+        ("<U1", "<U1"), ("<U1", ">U1"), (">U1", ">U1"),
+        ("<U1", "<U10"), ("<U1", ">U10")])
+@pytest.mark.parametrize("aligned", [True, False])
+def test_string_comparisons(op, ufunc, sym, dtypes, aligned):
+    # ensure native byte-order for the first view to stay within unicode range
+    native_dt = np.dtype(dtypes[0]).newbyteorder("=")
+    arr = np.arange(2**15).view(native_dt).astype(dtypes[0])
+    if not aligned:
+        # Make `arr` unaligned:
+        new = np.zeros(arr.nbytes + 1, dtype=np.uint8)[1:].view(dtypes[0])
+        new[...] = arr
+        arr = new
+
+    arr2 = arr.astype(dtypes[1], copy=True)
+    np.random.shuffle(arr2)
+    arr[0] = arr2[0]  # make sure one matches
+
+    expected = [op(d1, d2) for d1, d2 in zip(arr.tolist(), arr2.tolist())]
+    assert_array_equal(op(arr, arr2), expected)
+    assert_array_equal(ufunc(arr, arr2), expected)
+    assert_array_equal(np.compare_chararrays(arr, arr2, sym, False), expected)
+
+    expected = [op(d2, d1) for d1, d2 in zip(arr.tolist(), arr2.tolist())]
+    assert_array_equal(op(arr2, arr), expected)
+    assert_array_equal(ufunc(arr2, arr), expected)
+    assert_array_equal(np.compare_chararrays(arr2, arr, sym, False), expected)
+
+
+@pytest.mark.parametrize(["op", "ufunc", "sym"], COMPARISONS)
+@pytest.mark.parametrize("dtypes", [
+        ("S2", "S2"), ("S2", "S10"), ("<U1", "<U1"), ("<U1", ">U10")])
+def test_string_comparisons_empty(op, ufunc, sym, dtypes):
+    arr = np.empty((1, 0, 1, 5), dtype=dtypes[0])
+    arr2 = np.empty((100, 1, 0, 1), dtype=dtypes[1])
+
+    expected = np.empty(np.broadcast_shapes(arr.shape, arr2.shape), dtype=bool)
+    assert_array_equal(op(arr, arr2), expected)
+    assert_array_equal(ufunc(arr, arr2), expected)
+    assert_array_equal(np.compare_chararrays(arr, arr2, sym, False), expected)
diff --git a/numpy/core/tests/test_ufunc.py b/numpy/core/tests/test_ufunc.py
index 852044d32..3466178a3 100644
--- a/numpy/core/tests/test_ufunc.py
+++ b/numpy/core/tests/test_ufunc.py
@@ -620,8 +620,9 @@ class TestUfunc:
                                 atol = max(np.finfo(dtout).tiny, 3e-308)
                             else:
                                 atol = 3e-308
-                        # Some test values result in invalid for float16.
-                        with np.errstate(invalid='ignore'):
+                        # Some test values result in invalid for float16
+                        # and the cast to it may overflow to inf.
+                        with np.errstate(invalid='ignore', over='ignore'):
                             res = np.true_divide(x, y, dtype=dtout)
                         if not np.isfinite(res) and tcout == 'e':
                             continue
@@ -665,20 +666,22 @@ class TestUfunc:
         for dt in (int, np.float16, np.float32, np.float64, np.longdouble):
             for v in (0, 1, 2, 7, 8, 9, 15, 16, 19, 127,
                       128, 1024, 1235):
-                tgt = dt(v * (v + 1) / 2)
-                d = np.arange(1, v + 1, dtype=dt)
-
                 # warning if sum overflows, which it does in float16
-                overflow = not np.isfinite(tgt)
-
                 with warnings.catch_warnings(record=True) as w:
-                    warnings.simplefilter("always")
-                    assert_almost_equal(np.sum(d), tgt)
+                    warnings.simplefilter("always", RuntimeWarning)
+
+                    tgt = dt(v * (v + 1) / 2)
+                    overflow = not np.isfinite(tgt)
                     assert_equal(len(w), 1 * overflow)
 
-                    assert_almost_equal(np.sum(d[::-1]), tgt)
+                    d = np.arange(1, v + 1, dtype=dt)
+
+                    assert_almost_equal(np.sum(d), tgt)
                     assert_equal(len(w), 2 * overflow)
 
+                    assert_almost_equal(np.sum(d[::-1]), tgt)
+                    assert_equal(len(w), 3 * overflow)
+
             d = np.ones(500, dtype=dt)
             assert_almost_equal(np.sum(d[::2]), 250.)
             assert_almost_equal(np.sum(d[1::2]), 250.)
@@ -2454,7 +2457,7 @@ def test_ufunc_warn_with_nan(ufunc):
 
 
 @pytest.mark.skipif(not HAS_REFCOUNT, reason="Python lacks refcounts")
-def test_ufunc_casterrors():
+def test_ufunc_out_casterrors():
     # Tests that casting errors are correctly reported and buffers are
     # cleared.
     # The following array can be added to itself as an object array, but
@@ -2485,6 +2488,28 @@ def test_ufunc_casterrors():
     assert out[-1] == 1
 
 
+@pytest.mark.parametrize("bad_offset", [0, int(np.BUFSIZE * 1.5)])
+def test_ufunc_input_casterrors(bad_offset):
+    value = 123
+    arr = np.array([value] * bad_offset +
+                   ["string"] +
+                   [value] * int(1.5 * np.BUFSIZE), dtype=object)
+    with pytest.raises(ValueError):
+        # Force cast inputs, but the buffered cast of `arr` to intp fails:
+        np.add(arr, arr, dtype=np.intp, casting="unsafe")
+
+
+@pytest.mark.parametrize("bad_offset", [0, int(np.BUFSIZE * 1.5)])
+def test_ufunc_input_floatingpoint_error(bad_offset):
+    value = 123
+    arr = np.array([value] * bad_offset +
+                   [np.nan] +
+                   [value] * int(1.5 * np.BUFSIZE))
+    with np.errstate(invalid="raise"), pytest.raises(FloatingPointError):
+        # Force cast inputs; the NaN in `arr` is an invalid value for intp:
+        np.add(arr, arr, dtype=np.intp, casting="unsafe")
+
+
 def test_trivial_loop_invalid_cast():
     # This tests the fast-path "invalid cast", see gh-19904.
     with pytest.raises(TypeError,
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 7b6e2ee92..a696fceb8 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -327,7 +327,9 @@ class TestDivision:
         a_lst, b_lst = a.tolist(), b.tolist()
 
         c_div = lambda n, d: (
-            0 if d == 0 or (n and n == fo.min and d == -1) else n//d
+            0 if d == 0 else (
+                fo.min if (n and n == fo.min and d == -1) else n//d
+            )
         )
         with np.errstate(divide='ignore'):
             ac = a.copy()
@@ -342,7 +344,7 @@ class TestDivision:
 
         for divisor in divisors:
             ac = a.copy()
-            with np.errstate(divide='ignore'):
+            with np.errstate(divide='ignore', over='ignore'):
                 div_a = a // divisor
                 ac //= divisor
             div_lst = [c_div(i, divisor) for i in a_lst]
@@ -350,21 +352,25 @@ class TestDivision:
             assert all(div_a == div_lst), msg
             assert all(ac == div_lst), msg_eq
 
-        with np.errstate(divide='raise'):
-            if 0 in b or (fo.min and -1 in b and fo.min in a):
+        with np.errstate(divide='raise', over='raise'):
+            if 0 in b:
                 # Verify overflow case
-                with pytest.raises(FloatingPointError):
+                with pytest.raises(FloatingPointError,
+                        match="divide by zero encountered in floor_divide"):
                     a // b
             else:
                 a // b
             if fo.min and fo.min in a:
-                with pytest.raises(FloatingPointError):
+                with pytest.raises(FloatingPointError,
+                        match='overflow encountered in floor_divide'):
                     a // -1
             elif fo.min:
                 a // -1
-            with pytest.raises(FloatingPointError):
+            with pytest.raises(FloatingPointError,
+                    match="divide by zero encountered in floor_divide"):
                 a // 0
-            with pytest.raises(FloatingPointError):
+            with pytest.raises(FloatingPointError,
+                    match="divide by zero encountered in floor_divide"):
                 ac = a.copy()
                 ac //= 0
 
@@ -392,11 +398,13 @@ class TestDivision:
         msg = "Reduce floor integer division check"
         assert div_a == div_lst, msg
 
-        with np.errstate(divide='raise'):
-            with pytest.raises(FloatingPointError):
+        with np.errstate(divide='raise', over='raise'):
+            with pytest.raises(FloatingPointError,
+                    match="divide by zero encountered in reduce"):
                 np.floor_divide.reduce(np.arange(-100, 100, dtype=dtype))
             if fo.min:
-                with pytest.raises(FloatingPointError):
+                with pytest.raises(FloatingPointError,
+                        match='overflow encountered in reduce'):
                     np.floor_divide.reduce(
                         np.array([fo.min, 1, -1], dtype=dtype)
                     )
diff --git a/numpy/core/tests/test_unicode.py b/numpy/core/tests/test_unicode.py
index 8e0dd47cb..12de25771 100644
--- a/numpy/core/tests/test_unicode.py
+++ b/numpy/core/tests/test_unicode.py
@@ -1,3 +1,5 @@
+import pytest
+
 import numpy as np
 from numpy.testing import assert_, assert_equal, assert_array_equal
 
@@ -33,8 +35,11 @@ def test_string_cast():
     uni_arr1 = str_arr.astype('>U')
     uni_arr2 = str_arr.astype('<U')
 
-    assert_(str_arr != uni_arr1)
-    assert_(str_arr != uni_arr2)
+    with pytest.warns(FutureWarning):
+        assert str_arr != uni_arr1
+    with pytest.warns(FutureWarning):
+        assert str_arr != uni_arr2
+
     assert_array_equal(uni_arr1, uni_arr2)
 
 
diff --git a/numpy/distutils/ccompiler_opt.py b/numpy/distutils/ccompiler_opt.py
index befc83c16..2019dcb25 100644
--- a/numpy/distutils/ccompiler_opt.py
+++ b/numpy/distutils/ccompiler_opt.py
@@ -955,51 +955,57 @@ class _CCompiler:
     def __init__(self):
         if hasattr(self, "cc_is_cached"):
             return
-        #      attr                regex
+        #      attr            regex        compiler-expression
         detect_arch = (
-            ("cc_on_x64",      ".*(x|x86_|amd)64.*"),
-            ("cc_on_x86",      ".*(win32|x86|i386|i686).*"),
-            ("cc_on_ppc64le",  ".*(powerpc|ppc)64(el|le).*"),
-            ("cc_on_ppc64",    ".*(powerpc|ppc)64.*"),
-            ("cc_on_aarch64",  ".*(aarch64|arm64).*"),
-            ("cc_on_armhf",    ".*arm.*"),
-            ("cc_on_s390x",    ".*s390x.*"),
+            ("cc_on_x64",      ".*(x|x86_|amd)64.*", ""),
+            ("cc_on_x86",      ".*(win32|x86|i386|i686).*", ""),
+            ("cc_on_ppc64le",  ".*(powerpc|ppc)64(el|le).*", ""),
+            ("cc_on_ppc64",    ".*(powerpc|ppc)64.*", ""),
+            ("cc_on_aarch64",  ".*(aarch64|arm64).*", ""),
+            ("cc_on_armhf",    ".*arm.*", "defined(__ARM_ARCH_7__) || "
+                                          "defined(__ARM_ARCH_7A__)"),
+            ("cc_on_s390x",    ".*s390x.*", ""),
             # undefined platform
-            ("cc_on_noarch",    ""),
+            ("cc_on_noarch",   "", ""),
         )
         detect_compiler = (
-            ("cc_is_gcc",     r".*(gcc|gnu\-g).*"),
-            ("cc_is_clang",    ".*clang.*"),
-            ("cc_is_iccw",     ".*(intelw|intelemw|iccw).*"), # intel msvc like
-            ("cc_is_icc",      ".*(intel|icc).*"), # intel unix like
-            ("cc_is_msvc",     ".*msvc.*"),
+            ("cc_is_gcc",     r".*(gcc|gnu\-g).*", ""),
+            ("cc_is_clang",    ".*clang.*", ""),
+            # intel msvc like
+            ("cc_is_iccw",     ".*(intelw|intelemw|iccw).*", ""),
+            ("cc_is_icc",      ".*(intel|icc).*", ""),  # intel unix like
+            ("cc_is_msvc",     ".*msvc.*", ""),
             # undefined compiler will be treat it as gcc
-            ("cc_is_nocc",     ""),
+            ("cc_is_nocc",     "", ""),
         )
         detect_args = (
-           ("cc_has_debug",  ".*(O0|Od|ggdb|coverage|debug:full).*"),
-           ("cc_has_native", ".*(-march=native|-xHost|/QxHost).*"),
+           ("cc_has_debug",  ".*(O0|Od|ggdb|coverage|debug:full).*", ""),
+           ("cc_has_native", ".*(-march=native|-xHost|/QxHost).*", ""),
            # in case if the class run with -DNPY_DISABLE_OPTIMIZATION
-           ("cc_noopt", ".*DISABLE_OPT.*"),
+           ("cc_noopt", ".*DISABLE_OPT.*", ""),
         )
 
         dist_info = self.dist_info()
         platform, compiler_info, extra_args = dist_info
         # set False to all attrs
         for section in (detect_arch, detect_compiler, detect_args):
-            for attr, rgex in section:
+            for attr, rgex, cexpr in section:
                 setattr(self, attr, False)
 
         for detect, searchin in ((detect_arch, platform), (detect_compiler, compiler_info)):
-            for attr, rgex in detect:
+            for attr, rgex, cexpr in detect:
                 if rgex and not re.match(rgex, searchin, re.IGNORECASE):
                     continue
+                if cexpr and not self.cc_test_cexpr(cexpr):
+                    continue
                 setattr(self, attr, True)
                 break
 
-        for attr, rgex in detect_args:
+        for attr, rgex, cexpr in detect_args:
             if rgex and not re.match(rgex, extra_args, re.IGNORECASE):
                 continue
+            if cexpr and not self.cc_test_cexpr(cexpr):
+                continue
             setattr(self, attr, True)
 
         if self.cc_on_noarch:
@@ -1071,6 +1077,25 @@ class _CCompiler:
             self.dist_log("testing failed", stderr=True)
         return test
 
+    @_Cache.me
+    def cc_test_cexpr(self, cexpr, flags=[]):
+        """
+        Same as the above but supports compile-time expressions.
+        """
+        self.dist_log("testing compiler expression", cexpr)
+        test_path = os.path.join(self.conf_tmp_path, "npy_dist_test_cexpr.c")
+        with open(test_path, "w") as fd:
+            fd.write(textwrap.dedent(f"""\
+               #if !({cexpr})
+                   #error "unsupported expression"
+               #endif
+               int dummy;
+            """))
+        test = self.dist_test(test_path, flags)
+        if not test:
+            self.dist_log("testing failed", stderr=True)
+        return test
+
     def cc_normalize_flags(self, flags):
         """
         Remove the conflicts that caused due gathering implied features flags.
diff --git a/numpy/distutils/checks/cpu_asimd.c b/numpy/distutils/checks/cpu_asimd.c
index 8df556b6c..6bc9022a5 100644
--- a/numpy/distutils/checks/cpu_asimd.c
+++ b/numpy/distutils/checks/cpu_asimd.c
@@ -3,9 +3,10 @@
 #endif
 #include <arm_neon.h>
 
-int main(void)
+int main(int argc, char **argv)
 {
-    float32x4_t v1 = vdupq_n_f32(1.0f), v2 = vdupq_n_f32(2.0f);
+    float *src = (float*)argv[argc-1];
+    float32x4_t v1 = vdupq_n_f32(src[0]), v2 = vdupq_n_f32(src[1]);
     /* MAXMIN */
     int ret  = (int)vgetq_lane_f32(vmaxnmq_f32(v1, v2), 0);
         ret += (int)vgetq_lane_f32(vminnmq_f32(v1, v2), 0);
@@ -13,7 +14,8 @@ int main(void)
     ret += (int)vgetq_lane_f32(vrndq_f32(v1), 0);
 #ifdef __aarch64__
     {
-        float64x2_t vd1 = vdupq_n_f64(1.0), vd2 = vdupq_n_f64(2.0);
+        double *src2 = (double*)argv[argc-1];
+        float64x2_t vd1 = vdupq_n_f64(src2[0]), vd2 = vdupq_n_f64(src2[1]);
         /* MAXMIN */
         ret += (int)vgetq_lane_f64(vmaxnmq_f64(vd1, vd2), 0);
         ret += (int)vgetq_lane_f64(vminnmq_f64(vd1, vd2), 0);
diff --git a/numpy/distutils/checks/cpu_asimddp.c b/numpy/distutils/checks/cpu_asimddp.c
index 0158d1354..e7068ce02 100644
--- a/numpy/distutils/checks/cpu_asimddp.c
+++ b/numpy/distutils/checks/cpu_asimddp.c
@@ -3,9 +3,10 @@
 #endif
 #include <arm_neon.h>
 
-int main(void)
+int main(int argc, char **argv)
 {
-    uint8x16_t v1 = vdupq_n_u8((unsigned char)1), v2 = vdupq_n_u8((unsigned char)2);
+    unsigned char *src = (unsigned char*)argv[argc-1];
+    uint8x16_t v1 = vdupq_n_u8(src[0]), v2 = vdupq_n_u8(src[1]);
     uint32x4_t va = vdupq_n_u32(3);
     int ret = (int)vgetq_lane_u32(vdotq_u32(va, v1, v2), 0);
 #ifdef __aarch64__
diff --git a/numpy/distutils/checks/cpu_asimdfhm.c b/numpy/distutils/checks/cpu_asimdfhm.c
index cb49751c4..54e328098 100644
--- a/numpy/distutils/checks/cpu_asimdfhm.c
+++ b/numpy/distutils/checks/cpu_asimdfhm.c
@@ -3,12 +3,14 @@
 #endif
 #include <arm_neon.h>
 
-int main(void)
+int main(int argc, char **argv)
 {
-    float16x8_t vhp  = vdupq_n_f16((float16_t)1);
-    float16x4_t vlhp = vdup_n_f16((float16_t)1);
-    float32x4_t vf   = vdupq_n_f32(1.0f);
-    float32x2_t vlf  = vdup_n_f32(1.0f);
+    float16_t *src = (float16_t*)argv[argc-1];
+    float *src2 = (float*)argv[argc-2];
+    float16x8_t vhp  = vdupq_n_f16(src[0]);
+    float16x4_t vlhp = vdup_n_f16(src[1]);
+    float32x4_t vf   = vdupq_n_f32(src2[0]);
+    float32x2_t vlf  = vdup_n_f32(src2[1]);
 
     int ret  = (int)vget_lane_f32(vfmlal_low_f16(vlf, vlhp, vlhp), 0);
         ret += (int)vgetq_lane_f32(vfmlslq_high_f16(vf, vhp, vhp), 0);
diff --git a/numpy/distutils/checks/cpu_asimdhp.c b/numpy/distutils/checks/cpu_asimdhp.c
index 80b94000f..e2de0306e 100644
--- a/numpy/distutils/checks/cpu_asimdhp.c
+++ b/numpy/distutils/checks/cpu_asimdhp.c
@@ -3,10 +3,11 @@
 #endif
 #include <arm_neon.h>
 
-int main(void)
+int main(int argc, char **argv)
 {
-    float16x8_t vhp  = vdupq_n_f16((float16_t)-1);
-    float16x4_t vlhp = vdup_n_f16((float16_t)-1);
+    float16_t *src = (float16_t*)argv[argc-1];
+    float16x8_t vhp  = vdupq_n_f16(src[0]);
+    float16x4_t vlhp = vdup_n_f16(src[1]);
 
     int ret  =  (int)vgetq_lane_f16(vabdq_f16(vhp, vhp), 0);
         ret  += (int)vget_lane_f16(vabd_f16(vlhp, vlhp), 0);
diff --git a/numpy/distutils/checks/cpu_neon.c b/numpy/distutils/checks/cpu_neon.c
index 4eab1f384..8c64f864d 100644
--- a/numpy/distutils/checks/cpu_neon.c
+++ b/numpy/distutils/checks/cpu_neon.c
@@ -3,12 +3,16 @@
 #endif
 #include <arm_neon.h>
 
-int main(void)
+int main(int argc, char **argv)
 {
-    float32x4_t v1 = vdupq_n_f32(1.0f), v2 = vdupq_n_f32(2.0f);
+    // Load the operands through pointers the compiler cannot trace, so the
+    // intrinsics are not constant-folded away and the linker is exercised.
+    float *src = (float*)argv[argc-1];
+    float32x4_t v1 = vdupq_n_f32(src[0]), v2 = vdupq_n_f32(src[1]);
     int ret = (int)vgetq_lane_f32(vmulq_f32(v1, v2), 0);
 #ifdef __aarch64__
-    float64x2_t vd1 = vdupq_n_f64(1.0), vd2 = vdupq_n_f64(2.0);
+    double *src2 = (double*)argv[argc-2];
+    float64x2_t vd1 = vdupq_n_f64(src2[0]), vd2 = vdupq_n_f64(src2[1]);
     ret += (int)vgetq_lane_f64(vmulq_f64(vd1, vd2), 0);
 #endif
     return ret;
diff --git a/numpy/distutils/checks/cpu_neon_fp16.c b/numpy/distutils/checks/cpu_neon_fp16.c
index 745d2e793..f3b949770 100644
--- a/numpy/distutils/checks/cpu_neon_fp16.c
+++ b/numpy/distutils/checks/cpu_neon_fp16.c
@@ -3,9 +3,9 @@
 #endif
 #include <arm_neon.h>
 
-int main(void)
+int main(int argc, char **argv)
 {
-    short z4[] = {0, 0, 0, 0, 0, 0, 0, 0};
-    float32x4_t v_z4 = vcvt_f32_f16((float16x4_t)vld1_s16((const short*)z4));
+    short *src = (short*)argv[argc-1];
+    float32x4_t v_z4 = vcvt_f32_f16((float16x4_t)vld1_s16(src));
     return (int)vgetq_lane_f32(v_z4, 0);
 }
diff --git a/numpy/distutils/checks/cpu_neon_vfpv4.c b/numpy/distutils/checks/cpu_neon_vfpv4.c
index 45f7b5d69..a039159dd 100644
--- a/numpy/distutils/checks/cpu_neon_vfpv4.c
+++ b/numpy/distutils/checks/cpu_neon_vfpv4.c
@@ -3,16 +3,18 @@
 #endif
 #include <arm_neon.h>
 
-int main(void)
+int main(int argc, char **argv)
 {
-    float32x4_t v1 = vdupq_n_f32(1.0f);
-    float32x4_t v2 = vdupq_n_f32(2.0f);
-    float32x4_t v3 = vdupq_n_f32(3.0f);
+    float *src = (float*)argv[argc-1];
+    float32x4_t v1 = vdupq_n_f32(src[0]);
+    float32x4_t v2 = vdupq_n_f32(src[1]);
+    float32x4_t v3 = vdupq_n_f32(src[2]);
     int ret = (int)vgetq_lane_f32(vfmaq_f32(v1, v2, v3), 0);
 #ifdef __aarch64__
-    float64x2_t vd1 = vdupq_n_f64(1.0);
-    float64x2_t vd2 = vdupq_n_f64(2.0);
-    float64x2_t vd3 = vdupq_n_f64(3.0);
+    double *src2 = (double*)argv[argc-2];
+    float64x2_t vd1 = vdupq_n_f64(src2[0]);
+    float64x2_t vd2 = vdupq_n_f64(src2[1]);
+    float64x2_t vd3 = vdupq_n_f64(src2[2]);
     ret += (int)vgetq_lane_f64(vfmaq_f64(vd1, vd2, vd3), 0);
 #endif
     return ret;
diff --git a/numpy/distutils/misc_util.py b/numpy/distutils/misc_util.py
index 78665d351..b3916a2c8 100644
--- a/numpy/distutils/misc_util.py
+++ b/numpy/distutils/misc_util.py
@@ -358,7 +358,7 @@ if terminal_has_colors():
             fgcode = 30 + _colour_codes.get(fg.lower(), 0)
             seq.append(str(fgcode))
         if bg:
-            bgcode = 40 + _colour_codes.get(fg.lower(), 7)
+            bgcode = 40 + _colour_codes.get(bg.lower(), 7)
             seq.append(str(bgcode))
         if seq:
             return '\x1b[%sm%s\x1b[0m' % (';'.join(seq), s)
diff --git a/numpy/f2py/capi_maps.py b/numpy/f2py/capi_maps.py
index e5dc2331a..f07066a09 100644
--- a/numpy/f2py/capi_maps.py
+++ b/numpy/f2py/capi_maps.py
@@ -176,6 +176,7 @@ f2cmap_all = {'real': {'': 'float', '4': 'float', '8': 'double',
 
 f2cmap_default = copy.deepcopy(f2cmap_all)
 
+f2cmap_mapped = []
 
 def load_f2cmap_file(f2cmap_file):
     global f2cmap_all
@@ -212,6 +213,7 @@ def load_f2cmap_file(f2cmap_file):
                     f2cmap_all[k][k1] = d[k][k1]
                     outmess('\tMapping "%s(kind=%s)" to "%s"\n' %
                             (k, k1, d[k][k1]))
+                    f2cmap_mapped.append(d[k][k1])
                 else:
                     errmess("\tIgnoring map {'%s':{'%s':'%s'}}: '%s' must be in %s\n" % (
                         k, k1, d[k][k1], d[k][k1], list(c2py_map.keys())))
diff --git a/numpy/f2py/rules.py b/numpy/f2py/rules.py
index c9c3b2383..63c48a878 100755
--- a/numpy/f2py/rules.py
+++ b/numpy/f2py/rules.py
@@ -1323,6 +1323,9 @@ def buildmodule(m, um):
         rd = dictappend(rd, ar)
 
     needs = cfuncs.get_needs()
+    # Add mapped definitions
+    needs['typedefs'] += [cvar for cvar in capi_maps.f2cmap_mapped #
+                          if cvar in typedef_need_dict.values()]
     code = {}
     for n in needs.keys():
         code[n] = []
diff --git a/numpy/f2py/src/fortranobject.h b/numpy/f2py/src/fortranobject.h
index 376b83dad..abd699c2f 100644
--- a/numpy/f2py/src/fortranobject.h
+++ b/numpy/f2py/src/fortranobject.h
@@ -6,7 +6,9 @@ extern "C" {
 
 #include <Python.h>
 
-#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
+#ifndef NPY_NO_DEPRECATED_API
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#endif
 #ifdef FORTRANOBJECT_C
 #define NO_IMPORT_ARRAY
 #endif
diff --git a/numpy/f2py/tests/src/f2cmap/.f2py_f2cmap b/numpy/f2py/tests/src/f2cmap/.f2py_f2cmap
new file mode 100644
index 000000000..a4425f887
--- /dev/null
+++ b/numpy/f2py/tests/src/f2cmap/.f2py_f2cmap
@@ -0,0 +1 @@
+dict(real=dict(real32='float', real64='double'), integer=dict(int64='long_long'))
diff --git a/numpy/f2py/tests/src/f2cmap/isoFortranEnvMap.f90 b/numpy/f2py/tests/src/f2cmap/isoFortranEnvMap.f90
new file mode 100644
index 000000000..3f0e12c76
--- /dev/null
+++ b/numpy/f2py/tests/src/f2cmap/isoFortranEnvMap.f90
@@ -0,0 +1,9 @@
+      subroutine func1(n, x, res)
+        use, intrinsic :: iso_fortran_env, only: int64, real64
+        implicit none
+        integer(int64), intent(in) :: n
+        real(real64), intent(in) :: x(n)
+        real(real64), intent(out) :: res
+Cf2py   intent(hide) :: n
+        res = sum(x)
+      end
diff --git a/numpy/f2py/tests/test_f2cmap.py b/numpy/f2py/tests/test_f2cmap.py
new file mode 100644
index 000000000..d2967e4f7
--- /dev/null
+++ b/numpy/f2py/tests/test_f2cmap.py
@@ -0,0 +1,20 @@
+from . import util
+import numpy as np
+
+class TestF2Cmap(util.F2PyTest):
+    """Build a Fortran source together with a custom ``.f2py_f2cmap`` file."""
+
+    # The kind-map file is listed as a source next to the Fortran file that
+    # uses the ``iso_fortran_env`` kinds it maps.
+    sources = [
+        util.getpath("tests", "src", "f2cmap", "isoFortranEnvMap.f90"),
+        util.getpath("tests", "src", "f2cmap", ".f2py_f2cmap")
+    ]
+
+    # gh-15095
+    def test_long_long_map(self):
+        """``func1`` sums its input, so three ones must give 3."""
+        inp = np.ones(3)
+        out = self.module.func1(inp)
+        exp_out = 3
+        assert out == exp_out
diff --git a/numpy/lib/tests/test_loadtxt.py b/numpy/lib/tests/test_loadtxt.py
index 8839ef0a8..0b8fe3c47 100644
--- a/numpy/lib/tests/test_loadtxt.py
+++ b/numpy/lib/tests/test_loadtxt.py
@@ -5,6 +5,7 @@ These tests complement those found in `test_io.py`.
 """
 
 import sys
+import os
 import pytest
 from tempfile import NamedTemporaryFile, mkstemp
 from io import StringIO
@@ -252,7 +253,7 @@ def test_ragged_usecols():
 
     txt = StringIO("0,0,XXX\n0\n0,XXX,XXX,0,XXX\n")
     with pytest.raises(ValueError,
-                match="invalid column index -2 at row 1 with 2 columns"):
+                match="invalid column index -2 at row 2 with 1 columns"):
         # There is no -2 column in the second row:
         np.loadtxt(txt, dtype=float, delimiter=",", usecols=[0, -2])
 
@@ -960,9 +961,11 @@ def test_parametric_unit_discovery(
 
     # file-obj path
     fd, fname = mkstemp()
+    os.close(fd)
     with open(fname, "w") as fh:
         fh.write("\n".join(data))
     a = np.loadtxt(fname, dtype=unitless_dtype)
+    os.remove(fname)
     assert a.dtype == expected.dtype
     assert_equal(a, expected)
 
@@ -982,9 +985,11 @@ def test_str_dtype_unit_discovery_with_converter():
 
     # file-obj path
     fd, fname = mkstemp()
+    os.close(fd)
     with open(fname, "w") as fh:
         fh.write("\n".join(data))
     a = np.loadtxt(fname, dtype="U", converters=conv, encoding=None)
+    os.remove(fname)
     assert a.dtype == expected.dtype
     assert_equal(a, expected)
 
diff --git a/numpy/linalg/lapack_lite/f2c.c b/numpy/linalg/lapack_lite/f2c.c
index 9a1e9cec1..f1d3fdfbe 100644
--- a/numpy/linalg/lapack_lite/f2c.c
+++ b/numpy/linalg/lapack_lite/f2c.c
@@ -14,9 +14,9 @@
 #include "f2c.h"
 
 
-extern void s_wsfe(cilist *f) {;}
-extern void e_wsfe(void) {;}
-extern void do_fio(integer *c, char *s, ftnlen l) {;}
+extern int s_wsfe(cilist *f) {return 0;}
+extern int e_wsfe(void) {return 0;}
+extern int do_fio(integer *c, char *s, ftnlen l) {return 0;}
 
 /* You'll want this if you redo the f2c_*.c files with the -C option
  * to f2c for checking array subscripts. (It's not suggested you do that
@@ -377,7 +377,7 @@ p->i = p1.i;
 
 #endif /* NO_OVERWRITE */
 
- VOID
+ int
 #ifdef KR_headers
 s_cat(lp, rpp, rnp, np, ll) char *lp, *rpp[]; ftnlen rnp[], *np, ll;
 #else
@@ -485,9 +485,9 @@ return(0);
 /* assign strings:  a = b */
 
 #ifdef KR_headers
-VOID s_copy(a, b, la, lb) register char *a, *b; ftnlen la, lb;
+int s_copy(a, b, la, lb) register char *a, *b; ftnlen la, lb;
 #else
-void s_copy(register char *a, register char *b, ftnlen la, ftnlen lb)
+int s_copy(register char *a, register char *b, ftnlen la, ftnlen lb)
 #endif
 {
 	register char *aend, *bend;
@@ -524,6 +524,7 @@ void s_copy(register char *a, register char *b, ftnlen la, ftnlen lb)
 		while(a < aend)
 			*a++ = ' ';
 		}
+        return 0;
 	}
 
 
diff --git a/numpy/linalg/lapack_lite/f2c.h b/numpy/linalg/lapack_lite/f2c.h
index d3fbfc177..b44aaac44 100644
--- a/numpy/linalg/lapack_lite/f2c.h
+++ b/numpy/linalg/lapack_lite/f2c.h
@@ -263,7 +263,7 @@ extern double d_tan(double *);
 extern double d_tanh(double *);
 extern double derf_(double *);
 extern double derfc_(double *);
-extern void do_fio(ftnint *, char *, ftnlen);
+extern int do_fio(ftnint *, char *, ftnlen);
 extern integer do_lio(ftnint *, ftnint *, char *, ftnlen);
 extern integer do_uio(ftnint *, char *, ftnlen);
 extern integer e_rdfe(void);
@@ -275,7 +275,7 @@ extern integer e_rsli(void);
 extern integer e_rsue(void);
 extern integer e_wdfe(void);
 extern integer e_wdue(void);
-extern void e_wsfe(void);
+extern int e_wsfe(void);
 extern integer e_wsfi(void);
 extern integer e_wsle(void);
 extern integer e_wsli(void);
@@ -350,9 +350,9 @@ extern double r_sinh(float *);
 extern double r_sqrt(float *);
 extern double r_tan(float *);
 extern double r_tanh(float *);
-extern void s_cat(char *, char **, integer *, integer *, ftnlen);
+extern int s_cat(char *, char **, integer *, integer *, ftnlen);
 extern integer s_cmp(char *, char *, ftnlen, ftnlen);
-extern void s_copy(char *, char *, ftnlen, ftnlen);
+extern int s_copy(char *, char *, ftnlen, ftnlen);
 extern int s_paus(char *, ftnlen);
 extern integer s_rdfe(cilist *);
 extern integer s_rdue(cilist *);
@@ -367,7 +367,7 @@ extern integer s_rsue(cilist *);
 extern int s_stop(char *, ftnlen);
 extern integer s_wdfe(cilist *);
 extern integer s_wdue(cilist *);
-extern void s_wsfe(	cilist *);
+extern int s_wsfe(	cilist *);
 extern integer s_wsfi(icilist *);
 extern integer s_wsle(cilist *);
 extern integer s_wsli(icilist *);
diff --git a/numpy/linalg/setup.py b/numpy/linalg/setup.py
index dc62dff8f..1c4e1295e 100644
--- a/numpy/linalg/setup.py
+++ b/numpy/linalg/setup.py
@@ -1,5 +1,6 @@
 import os
 import sys
+import sysconfig
 
 def configuration(parent_package='', top_path=None):
     from numpy.distutils.misc_util import Configuration
@@ -38,7 +39,14 @@ def configuration(parent_package='', top_path=None):
         class numpy_linalg_lapack_lite(system_info):
             def calc_info(self):
                 info = {'language': 'c'}
-                if sys.maxsize > 2**32:
+                size_t_size = sysconfig.get_config_var("SIZEOF_SIZE_T")
+                if size_t_size:
+                    maxsize = 2**(8 * size_t_size - 1) - 1  # bytes -> bits
+                else:
+                    # We prefer using sysconfig as it allows cross-compilation
+                    # but the information may be missing (e.g. on windows).
+                    maxsize = sys.maxsize
+                if maxsize > 2**32:
                     # Build lapack-lite in 64-bit integer mode.
                     # The suffix is arbitrary (lapack_lite symbols follow it),
                     # but use the "64_" convention here.
diff --git a/numpy/ma/core.py b/numpy/ma/core.py
index 78333ed02..d8fd4f389 100644
--- a/numpy/ma/core.py
+++ b/numpy/ma/core.py
@@ -31,6 +31,7 @@ from functools import reduce
 import numpy as np
 import numpy.core.umath as umath
 import numpy.core.numerictypes as ntypes
+from numpy.core import multiarray as mu
 from numpy import ndarray, amax, amin, iscomplexobj, bool_, _NoValue
 from numpy import array as narray
 from numpy.lib.function_base import angle
@@ -5289,14 +5290,22 @@ class MaskedArray(ndarray):
 
         """
         kwargs = {} if keepdims is np._NoValue else {'keepdims': keepdims}
-
         if self._mask is nomask:
             result = super().mean(axis=axis, dtype=dtype, **kwargs)[()]
         else:
+            is_float16_result = False
+            if dtype is None:
+                if issubclass(self.dtype.type, (ntypes.integer, ntypes.bool_)):
+                    dtype = mu.dtype('f8')
+                elif issubclass(self.dtype.type, ntypes.float16):
+                    dtype = mu.dtype('f4')
+                    is_float16_result = True
             dsum = self.sum(axis=axis, dtype=dtype, **kwargs)
             cnt = self.count(axis=axis, **kwargs)
             if cnt.shape == () and (cnt == 0):
                 result = masked
+            elif is_float16_result:
+                result = self.dtype.type(dsum * 1. / cnt)
             else:
                 result = dsum * 1. / cnt
         if out is not None:
diff --git a/numpy/ma/tests/test_core.py b/numpy/ma/tests/test_core.py
index 0dada104d..4fac897de 100644
--- a/numpy/ma/tests/test_core.py
+++ b/numpy/ma/tests/test_core.py
@@ -4036,6 +4036,12 @@ class TestMaskedArrayMathMethods:
         assert_equal(a.max(-1), [3, 6])
         assert_equal(a.max(1), [3, 6])
 
+    def test_mean_overflow(self):
+        # Mean of a uint16 array with an explicit (all-False) mask must not
+        # overflow; see gh-20272.
+        a = masked_array(np.full((10000, 10000), 65535, dtype=np.uint16),
+                         mask=np.zeros((10000, 10000), dtype=bool))
+        assert_equal(a.mean(), 65535.0)
 
 class TestMaskedArrayMathMethodsComplex:
     # Test class for miscellaneous MaskedArrays methods.
@@ -4158,7 +4164,11 @@ class TestMaskedArrayFunctions:
         # test that masked_where on a structured array sets a structured
         # mask (see issue #2972)
         a = np.zeros(10, dtype=[("A", "<f2"), ("B", "<f4")])
-        am = np.ma.masked_where(a["A"] < 5, a)
+        with np.errstate(over="ignore"):
+            # NOTE: The float16 "uses" 1e20 as mask, which overflows to inf
+            #       and warns.  Unrelated to this test, but probably undesired.
+            #       But NumPy previously did not warn for this overflow.
+            am = np.ma.masked_where(a["A"] < 5, a)
         assert_equal(am.mask.dtype.names, am.dtype.names)
         assert_equal(am["A"],
                     np.ma.masked_array(np.zeros(10), np.ones(10)))
@@ -4334,7 +4344,10 @@ class TestMaskedArrayFunctions:
         tmp[(xm <= 2).filled(True)] = True
         assert_equal(d._mask, tmp)
 
-        ixm = xm.astype(int)
+        with np.errstate(invalid="warn"):
+            # The fill value is 1e20, it cannot be converted to `int`:
+            with pytest.warns(RuntimeWarning, match="invalid value"):
+                ixm = xm.astype(int)
         d = where(ixm > 2, ixm, masked)
         assert_equal(d, [-9, -9, -9, -9, -9, 4, -9, -9, 10, -9, -9, 3])
         assert_equal(d.dtype, ixm.dtype)
diff --git a/numpy/polynomial/__init__.py b/numpy/polynomial/__init__.py
index 5a3addf4c..c4e7baf2c 100644
--- a/numpy/polynomial/__init__.py
+++ b/numpy/polynomial/__init__.py
@@ -156,17 +156,17 @@ def set_default_printstyle(style):
     >>> c = np.polynomial.Chebyshev([1, 2, 3])
     >>> np.polynomial.set_default_printstyle('unicode')
     >>> print(p)
-    1.0 + 2.0·x¹ + 3.0·x²
+    1.0 + 2.0·x + 3.0·x²
     >>> print(c)
     1.0 + 2.0·T₁(x) + 3.0·T₂(x)
     >>> np.polynomial.set_default_printstyle('ascii')
     >>> print(p)
-    1.0 + 2.0 x**1 + 3.0 x**2
+    1.0 + 2.0 x + 3.0 x**2
     >>> print(c)
     1.0 + 2.0 T_1(x) + 3.0 T_2(x)
     >>> # Formatting supersedes all class/package-level defaults
     >>> print(f"{p:unicode}")
-    1.0 + 2.0·x¹ + 3.0·x²
+    1.0 + 2.0·x + 3.0·x²
     """
     if style not in ('unicode', 'ascii'):
         raise ValueError(
diff --git a/numpy/polynomial/_polybase.py b/numpy/polynomial/_polybase.py
index 6382732dc..9674dee0b 100644
--- a/numpy/polynomial/_polybase.py
+++ b/numpy/polynomial/_polybase.py
@@ -366,7 +366,7 @@ class ABCPolyBase(abc.ABC):
         linewidth = np.get_printoptions().get('linewidth', 75)
         if linewidth < 1:
             linewidth = 1
-        out = f"{self.coef[0]}"
+        out = pu.format_float(self.coef[0])
         for i, coef in enumerate(self.coef[1:]):
             out += " "
             power = str(i + 1)
@@ -376,9 +376,9 @@ class ABCPolyBase(abc.ABC):
             # complex). In this case, represent the coefficient as-is.
             try:
                 if coef >= 0:
-                    next_term = f"+ {coef}"
+                    next_term = f"+ " + pu.format_float(coef, parens=True)
                 else:
-                    next_term = f"- {-coef}"
+                    next_term = f"- " + pu.format_float(-coef, parens=True)
             except TypeError:
                 next_term = f"+ {coef}"
             # Polynomial term
@@ -432,10 +432,10 @@ class ABCPolyBase(abc.ABC):
         return f"{{{cls.basis_name}}}_{{{i}}}({arg_str})"
 
     @staticmethod
-    def _repr_latex_scalar(x):
+    def _repr_latex_scalar(x, parens=False):
         # TODO: we're stuck with disabling math formatting until we handle
         # exponents in this function
-        return r'\text{{{}}}'.format(x)
+        return r'\text{{{}}}'.format(pu.format_float(x, parens=parens))
 
     def _repr_latex_(self):
         # get the scaled argument string to the basis functions
@@ -466,9 +466,9 @@ class ABCPolyBase(abc.ABC):
             elif not isinstance(c, numbers.Real):
                 coef_str = f" + ({self._repr_latex_scalar(c)})"
             elif not np.signbit(c):
-                coef_str = f" + {self._repr_latex_scalar(c)}"
+                coef_str = f" + {self._repr_latex_scalar(c, parens=True)}"
             else:
-                coef_str = f" - {self._repr_latex_scalar(-c)}"
+                coef_str = f" - {self._repr_latex_scalar(-c, parens=True)}"
 
             # produce the string for the term
             term_str = self._repr_latex_term(i, term, needs_parens)
diff --git a/numpy/polynomial/polynomial.py b/numpy/polynomial/polynomial.py
index b4741355f..8e2c6f002 100644
--- a/numpy/polynomial/polynomial.py
+++ b/numpy/polynomial/polynomial.py
@@ -1512,11 +1512,17 @@ class Polynomial(ABCPolyBase):
 
     @classmethod
     def _str_term_unicode(cls, i, arg_str):
-        return f"·{arg_str}{i.translate(cls._superscript_mapping)}"
+        if i == '1':
+            return f"·{arg_str}"
+        else:
+            return f"·{arg_str}{i.translate(cls._superscript_mapping)}"
 
     @staticmethod
     def _str_term_ascii(i, arg_str):
-        return f" {arg_str}**{i}"
+        if i == '1':
+            return f" {arg_str}"
+        else:
+            return f" {arg_str}**{i}"
 
     @staticmethod
     def _repr_latex_term(i, arg_str, needs_parens):
diff --git a/numpy/polynomial/polyutils.py b/numpy/polynomial/polyutils.py
index a2bc75a4d..482913892 100644
--- a/numpy/polynomial/polyutils.py
+++ b/numpy/polynomial/polyutils.py
@@ -32,9 +32,13 @@ import warnings
 
 import numpy as np
 
+from numpy.core.multiarray import dragon4_positional, dragon4_scientific
+from numpy.core.umath import absolute
+
 __all__ = [
     'RankWarning', 'as_series', 'trimseq',
-    'trimcoef', 'getdomain', 'mapdomain', 'mapparms']
+    'trimcoef', 'getdomain', 'mapdomain', 'mapparms',
+    'format_float']
 
 #
 # Warnings and Exceptions
@@ -748,3 +752,63 @@ def _deprecate_as_int(x, desc):
                 return ix
 
         raise TypeError(f"{desc} must be an integer") from e
+
+
+def format_float(x, parens=False):
+    """
+    Format a polynomial coefficient according to the NumPy print options.
+
+    Parameters
+    ----------
+    x : scalar
+        Coefficient to format. Anything that is not a floating-point
+        scalar is returned as ``str(x)``.
+    parens : bool, optional
+        If True, wrap the result in parentheses when scientific notation
+        is used. Positional output is never parenthesized.
+
+    Returns
+    -------
+    s : str
+        The formatted coefficient.
+
+    Notes
+    -----
+    The ``precision``, ``floatmode``, ``sign``, ``nanstr`` and ``infstr``
+    entries of `numpy.get_printoptions` are honored.
+    """
+    if not np.issubdtype(type(x), np.floating):
+        return str(x)
+
+    opts = np.get_printoptions()
+
+    if np.isnan(x):
+        return opts['nanstr']
+    elif np.isinf(x):
+        # Keep the sign: a leading coefficient of -inf must not print as inf.
+        return ('-' if x < 0 else '') + opts['infstr']
+
+    exp_format = False
+    if x != 0:
+        a = absolute(x)
+        # Switch to scientific notation for magnitudes >= 1e8 or below a
+        # threshold that shrinks as the requested precision grows.
+        if a >= 1.e8 or a < 10**min(0, -(opts['precision']-1)//2):
+            exp_format = True
+
+    trim, unique = '0', True
+    if opts['floatmode'] == 'fixed':
+        trim, unique = 'k', False
+
+    if exp_format:
+        s = dragon4_scientific(x, precision=opts['precision'],
+                               unique=unique, trim=trim,
+                               sign=opts['sign'] == '+')
+        if parens:
+            s = '(' + s + ')'
+    else:
+        s = dragon4_positional(x, precision=opts['precision'],
+                               fractional=True,
+                               unique=unique, trim=trim,
+                               sign=opts['sign'] == '+')
+    return s
diff --git a/numpy/polynomial/polyutils.pyi b/numpy/polynomial/polyutils.pyi
index 06260a9f1..c0bcc6784 100644
--- a/numpy/polynomial/polyutils.pyi
+++ b/numpy/polynomial/polyutils.pyi
@@ -8,3 +8,4 @@ def trimcoef(c, tol=...): ...
 def getdomain(x): ...
 def mapparms(old, new): ...
 def mapdomain(x, old, new): ...
+def format_float(x, parens=...): ...
diff --git a/numpy/polynomial/tests/test_printing.py b/numpy/polynomial/tests/test_printing.py
index 0c4316223..990a0d179 100644
--- a/numpy/polynomial/tests/test_printing.py
+++ b/numpy/polynomial/tests/test_printing.py
@@ -1,3 +1,4 @@
+from math import nan, inf
 import pytest
 from numpy.core import array, arange, printoptions
 import numpy.polynomial as poly
@@ -15,9 +16,9 @@ class TestStrUnicodeSuperSubscripts:
         poly.set_default_printstyle('unicode')
 
     @pytest.mark.parametrize(('inp', 'tgt'), (
-        ([1, 2, 3], "1.0 + 2.0·x¹ + 3.0·x²"),
-        ([-1, 0, 3, -1], "-1.0 + 0.0·x¹ + 3.0·x² - 1.0·x³"),
-        (arange(12), ("0.0 + 1.0·x¹ + 2.0·x² + 3.0·x³ + 4.0·x⁴ + 5.0·x⁵ + "
+        ([1, 2, 3], "1.0 + 2.0·x + 3.0·x²"),
+        ([-1, 0, 3, -1], "-1.0 + 0.0·x + 3.0·x² - 1.0·x³"),
+        (arange(12), ("0.0 + 1.0·x + 2.0·x² + 3.0·x³ + 4.0·x⁴ + 5.0·x⁵ + "
                       "6.0·x⁶ + 7.0·x⁷ +\n8.0·x⁸ + 9.0·x⁹ + 10.0·x¹⁰ + "
                       "11.0·x¹¹")),
     ))
@@ -89,9 +90,9 @@ class TestStrAscii:
         poly.set_default_printstyle('ascii')
 
     @pytest.mark.parametrize(('inp', 'tgt'), (
-        ([1, 2, 3], "1.0 + 2.0 x**1 + 3.0 x**2"),
-        ([-1, 0, 3, -1], "-1.0 + 0.0 x**1 + 3.0 x**2 - 1.0 x**3"),
-        (arange(12), ("0.0 + 1.0 x**1 + 2.0 x**2 + 3.0 x**3 + 4.0 x**4 + "
+        ([1, 2, 3], "1.0 + 2.0 x + 3.0 x**2"),
+        ([-1, 0, 3, -1], "-1.0 + 0.0 x + 3.0 x**2 - 1.0 x**3"),
+        (arange(12), ("0.0 + 1.0 x + 2.0 x**2 + 3.0 x**3 + 4.0 x**4 + "
                       "5.0 x**5 + 6.0 x**6 +\n7.0 x**7 + 8.0 x**8 + "
                       "9.0 x**9 + 10.0 x**10 + 11.0 x**11")),
     ))
@@ -168,51 +169,51 @@ class TestLinebreaking:
 
     def test_single_line_one_less(self):
         # With 'ascii' style, len(str(p)) is default linewidth - 1 (i.e. 74)
-        p = poly.Polynomial([123456789, 123456789, 123456789, 1234, 1])
+        p = poly.Polynomial([12345678, 12345678, 12345678, 12345678, 123])
         assert_equal(len(str(p)), 74)
         assert_equal(str(p), (
-            '123456789.0 + 123456789.0 x**1 + 123456789.0 x**2 + '
-            '1234.0 x**3 + 1.0 x**4'
+            '12345678.0 + 12345678.0 x + 12345678.0 x**2 + '
+            '12345678.0 x**3 + 123.0 x**4'
         ))
 
     def test_num_chars_is_linewidth(self):
         # len(str(p)) == default linewidth == 75
-        p = poly.Polynomial([123456789, 123456789, 123456789, 1234, 10])
+        p = poly.Polynomial([12345678, 12345678, 12345678, 12345678, 1234])
         assert_equal(len(str(p)), 75)
         assert_equal(str(p), (
-            '123456789.0 + 123456789.0 x**1 + 123456789.0 x**2 + '
-            '1234.0 x**3 +\n10.0 x**4'
+            '12345678.0 + 12345678.0 x + 12345678.0 x**2 + '
+            '12345678.0 x**3 +\n1234.0 x**4'
         ))
 
     def test_first_linebreak_multiline_one_less_than_linewidth(self):
         # Multiline str where len(first_line) + len(next_term) == lw - 1 == 74
         p = poly.Polynomial(
-                [123456789, 123456789, 123456789, 12, 1, 123456789]
+                [12345678, 12345678, 12345678, 12345678, 1, 12345678]
             )
         assert_equal(len(str(p).split('\n')[0]), 74)
         assert_equal(str(p), (
-            '123456789.0 + 123456789.0 x**1 + 123456789.0 x**2 + '
-            '12.0 x**3 + 1.0 x**4 +\n123456789.0 x**5'
+            '12345678.0 + 12345678.0 x + 12345678.0 x**2 + '
+            '12345678.0 x**3 + 1.0 x**4 +\n12345678.0 x**5'
         ))
 
     def test_first_linebreak_multiline_on_linewidth(self):
         # First line is one character longer than previous test
         p = poly.Polynomial(
-                [123456789, 123456789, 123456789, 123, 1, 123456789]
+                [12345678, 12345678, 12345678, 12345678.12, 1, 12345678]
             )
         assert_equal(str(p), (
-            '123456789.0 + 123456789.0 x**1 + 123456789.0 x**2 + '
-            '123.0 x**3 +\n1.0 x**4 + 123456789.0 x**5'
+            '12345678.0 + 12345678.0 x + 12345678.0 x**2 + '
+            '12345678.12 x**3 +\n1.0 x**4 + 12345678.0 x**5'
         ))
 
     @pytest.mark.parametrize(('lw', 'tgt'), (
-        (75, ('0.0 + 10.0 x**1 + 200.0 x**2 + 3000.0 x**3 + 40000.0 x**4 +\n'
-              '500000.0 x**5 + 600000.0 x**6 + 70000.0 x**7 + 8000.0 x**8 + '
+        (75, ('0.0 + 10.0 x + 200.0 x**2 + 3000.0 x**3 + 40000.0 x**4 + '
+              '500000.0 x**5 +\n600000.0 x**6 + 70000.0 x**7 + 8000.0 x**8 + '
               '900.0 x**9')),
-        (45, ('0.0 + 10.0 x**1 + 200.0 x**2 + 3000.0 x**3 +\n40000.0 x**4 + '
+        (45, ('0.0 + 10.0 x + 200.0 x**2 + 3000.0 x**3 +\n40000.0 x**4 + '
               '500000.0 x**5 +\n600000.0 x**6 + 70000.0 x**7 + 8000.0 x**8 +\n'
               '900.0 x**9')),
-        (132, ('0.0 + 10.0 x**1 + 200.0 x**2 + 3000.0 x**3 + 40000.0 x**4 + '
+        (132, ('0.0 + 10.0 x + 200.0 x**2 + 3000.0 x**3 + 40000.0 x**4 + '
                '500000.0 x**5 + 600000.0 x**6 + 70000.0 x**7 + 8000.0 x**8 + '
                '900.0 x**9')),
     ))
@@ -230,10 +231,10 @@ def test_set_default_printoptions():
     p = poly.Polynomial([1, 2, 3])
     c = poly.Chebyshev([1, 2, 3])
     poly.set_default_printstyle('ascii')
-    assert_equal(str(p), "1.0 + 2.0 x**1 + 3.0 x**2")
+    assert_equal(str(p), "1.0 + 2.0 x + 3.0 x**2")
     assert_equal(str(c), "1.0 + 2.0 T_1(x) + 3.0 T_2(x)")
     poly.set_default_printstyle('unicode')
-    assert_equal(str(p), "1.0 + 2.0·x¹ + 3.0·x²")
+    assert_equal(str(p), "1.0 + 2.0·x + 3.0·x²")
     assert_equal(str(c), "1.0 + 2.0·T₁(x) + 3.0·T₂(x)")
     with pytest.raises(ValueError):
         poly.set_default_printstyle('invalid_input')
@@ -247,22 +248,22 @@ def test_complex_coefficients():
     # Python complex
     p2 = poly.Polynomial(array(coefs, dtype=object))
     poly.set_default_printstyle('unicode')
-    assert_equal(str(p1), "1j + (1+1j)·x¹ - (2-2j)·x² + (3+0j)·x³")
-    assert_equal(str(p2), "1j + (1+1j)·x¹ + (-2+2j)·x² + (3+0j)·x³")
+    assert_equal(str(p1), "1j + (1+1j)·x - (2-2j)·x² + (3+0j)·x³")
+    assert_equal(str(p2), "1j + (1+1j)·x + (-2+2j)·x² + (3+0j)·x³")
     poly.set_default_printstyle('ascii')
-    assert_equal(str(p1), "1j + (1+1j) x**1 - (2-2j) x**2 + (3+0j) x**3")
-    assert_equal(str(p2), "1j + (1+1j) x**1 + (-2+2j) x**2 + (3+0j) x**3")
+    assert_equal(str(p1), "1j + (1+1j) x - (2-2j) x**2 + (3+0j) x**3")
+    assert_equal(str(p2), "1j + (1+1j) x + (-2+2j) x**2 + (3+0j) x**3")
 
 
 @pytest.mark.parametrize(('coefs', 'tgt'), (
     (array([Fraction(1, 2), Fraction(3, 4)], dtype=object), (
-        "1/2 + 3/4·x¹"
+        "1/2 + 3/4·x"
     )),
     (array([1, 2, Fraction(5, 7)], dtype=object), (
-        "1 + 2·x¹ + 5/7·x²"
+        "1 + 2·x + 5/7·x²"
     )),
     (array([Decimal('1.00'), Decimal('2.2'), 3], dtype=object), (
-        "1.00 + 2.2·x¹ + 3·x²"
+        "1.00 + 2.2·x + 3·x²"
     )),
 ))
 def test_numeric_object_coefficients(coefs, tgt):
@@ -272,8 +273,8 @@ def test_numeric_object_coefficients(coefs, tgt):
 
 
 @pytest.mark.parametrize(('coefs', 'tgt'), (
-    (array([1, 2, 'f'], dtype=object), '1 + 2·x¹ + f·x²'),
-    (array([1, 2, [3, 4]], dtype=object), '1 + 2·x¹ + [3, 4]·x²'),
+    (array([1, 2, 'f'], dtype=object), '1 + 2·x + f·x²'),
+    (array([1, 2, [3, 4]], dtype=object), '1 + 2·x + [3, 4]·x²'),
 ))
 def test_nonnumeric_object_coefficients(coefs, tgt):
     """
@@ -288,20 +289,20 @@ class TestFormat:
     def test_format_unicode(self):
         poly.set_default_printstyle('ascii')
         p = poly.Polynomial([1, 2, 0, -1])
-        assert_equal(format(p, 'unicode'), "1.0 + 2.0·x¹ + 0.0·x² - 1.0·x³")
+        assert_equal(format(p, 'unicode'), "1.0 + 2.0·x + 0.0·x² - 1.0·x³")
 
     def test_format_ascii(self):
         poly.set_default_printstyle('unicode')
         p = poly.Polynomial([1, 2, 0, -1])
         assert_equal(
-            format(p, 'ascii'), "1.0 + 2.0 x**1 + 0.0 x**2 - 1.0 x**3"
+            format(p, 'ascii'), "1.0 + 2.0 x + 0.0 x**2 - 1.0 x**3"
         )
 
     def test_empty_formatstr(self):
         poly.set_default_printstyle('ascii')
         p = poly.Polynomial([1, 2, 3])
-        assert_equal(format(p), "1.0 + 2.0 x**1 + 3.0 x**2")
-        assert_equal(f"{p}", "1.0 + 2.0 x**1 + 3.0 x**2")
+        assert_equal(format(p), "1.0 + 2.0 x + 3.0 x**2")
+        assert_equal(f"{p}", "1.0 + 2.0 x + 3.0 x**2")
 
     def test_bad_formatstr(self):
         p = poly.Polynomial([1, 2, 0, -1])
@@ -310,7 +311,7 @@ class TestFormat:
 
 
 @pytest.mark.parametrize(('poly', 'tgt'), (
-    (poly.Polynomial, '1.0 + 2.0·z¹ + 3.0·z²'),
+    (poly.Polynomial, '1.0 + 2.0·z + 3.0·z²'),
     (poly.Chebyshev, '1.0 + 2.0·T₁(z) + 3.0·T₂(z)'),
     (poly.Hermite, '1.0 + 2.0·H₁(z) + 3.0·H₂(z)'),
     (poly.HermiteE, '1.0 + 2.0·He₁(z) + 3.0·He₂(z)'),
@@ -379,7 +380,7 @@ class TestLatexRepr:
         # right now we ignore the formatting of scalars in our tests, since
         # it makes them too verbose. Ideally, the formatting of scalars will
         # be fixed such that tests below continue to pass
-        obj._repr_latex_scalar = lambda x: str(x)
+        obj._repr_latex_scalar = lambda x, parens=False: str(x)
         try:
             return obj._repr_latex_()
         finally:
@@ -455,3 +456,71 @@ class TestLatexRepr:
                 r'\left(1.0 + 2.0z\right)^{2}$'
             ),
         )
+
+
+SWITCH_TO_EXP = (
+    '1.0 + (1.0e-01) x + (1.0e-02) x**2',
+    '1.2 + (1.2e-01) x + (1.2e-02) x**2',
+    '1.23 + 0.12 x + (1.23e-02) x**2 + (1.23e-03) x**3',
+    '1.235 + 0.123 x + (1.235e-02) x**2 + (1.235e-03) x**3',
+    '1.2346 + 0.1235 x + 0.0123 x**2 + (1.2346e-03) x**3 + (1.2346e-04) x**4',
+    '1.23457 + 0.12346 x + 0.01235 x**2 + (1.23457e-03) x**3 + '
+    '(1.23457e-04) x**4',
+    '1.234568 + 0.123457 x + 0.012346 x**2 + 0.001235 x**3 + '
+    '(1.234568e-04) x**4 + (1.234568e-05) x**5',
+    '1.2345679 + 0.1234568 x + 0.0123457 x**2 + 0.0012346 x**3 + '
+    '(1.2345679e-04) x**4 + (1.2345679e-05) x**5')
+
+class TestPrintOptions:
+    """
+    Test the output is properly configured via printoptions.
+    The exponential notation is enabled automatically when the values 
+    are too small or too large.
+    """
+
+    def test_str(self):
+        p = poly.Polynomial([1/2, 1/7, 1/7*10**8, 1/7*10**9])
+        assert_equal(str(p), '0.5 + 0.14285714 x + 14285714.28571429 x**2 '
+                             '+ (1.42857143e+08) x**3')
+
+        with printoptions(precision=3):
+            assert_equal(str(p), '0.5 + 0.143 x + 14285714.286 x**2 '
+                                 '+ (1.429e+08) x**3')
+
+    def test_latex(self):
+        p = poly.Polynomial([1/2, 1/7, 1/7*10**8, 1/7*10**9])
+        assert_equal(p._repr_latex_(),
+            r'$x \mapsto \text{0.5} + \text{0.14285714}\,x + '
+            r'\text{14285714.28571429}\,x^{2} + '
+            r'\text{(1.42857143e+08)}\,x^{3}$')
+        
+        with printoptions(precision=3):
+            assert_equal(p._repr_latex_(),
+                r'$x \mapsto \text{0.5} + \text{0.143}\,x + '
+                r'\text{14285714.286}\,x^{2} + \text{(1.429e+08)}\,x^{3}$')
+
+    def test_fixed(self):
+        p = poly.Polynomial([1/2])
+        assert_equal(str(p), '0.5')
+        
+        with printoptions(floatmode='fixed'):
+            assert_equal(str(p), '0.50000000')
+        
+        with printoptions(floatmode='fixed', precision=4):
+            assert_equal(str(p), '0.5000')
+
+    def test_switch_to_exp(self):
+        for i, s in enumerate(SWITCH_TO_EXP):
+            with printoptions(precision=i):
+                p = poly.Polynomial([1.23456789*10**-k
+                                     for k in range(i//2+3)])
+                assert str(p).replace('\n', ' ') == s
+    
+    def test_non_finite(self):
+        p = poly.Polynomial([nan, inf])
+        assert str(p) == 'nan + inf x'
+        assert p._repr_latex_() == r'$x \mapsto \text{nan} + \text{inf}\,x$'
+        with printoptions(nanstr='NAN', infstr='INF'):
+            assert str(p) == 'NAN + INF x'
+            assert p._repr_latex_() == \
+                r'$x \mapsto \text{NAN} + \text{INF}\,x$'
diff --git a/numpy/random/_generator.pyx b/numpy/random/_generator.pyx
index b54fe3610..0019c4bcd 100644
--- a/numpy/random/_generator.pyx
+++ b/numpy/random/_generator.pyx
@@ -3660,6 +3660,11 @@ cdef class Generator:
         # Check preconditions on arguments
         mean = np.array(mean)
         cov = np.array(cov)
+
+        if (np.issubdtype(mean.dtype, np.complexfloating) or
+                np.issubdtype(cov.dtype, np.complexfloating)):
+            raise TypeError("mean and cov must not be complex")
+
         if size is None:
             shape = []
         elif isinstance(size, (int, long, np.integer)):
diff --git a/numpy/random/tests/test_generator_mt19937.py b/numpy/random/tests/test_generator_mt19937.py
index 3ccb9103c..fa55ac0ee 100644
--- a/numpy/random/tests/test_generator_mt19937.py
+++ b/numpy/random/tests/test_generator_mt19937.py
@@ -1452,6 +1452,12 @@ class TestRandomDist:
                       mu, np.empty((3, 2)))
         assert_raises(ValueError, random.multivariate_normal,
                       mu, np.eye(3))
+        
+    @pytest.mark.parametrize('mean, cov', [([0], [[1+1j]]), ([0j], [[1]])])
+    def test_multivariate_normal_disallow_complex(self, mean, cov):
+        random = Generator(MT19937(self.seed))
+        with pytest.raises(TypeError, match="must not be complex"):
+            random.multivariate_normal(mean, cov)
 
     @pytest.mark.parametrize("method", ["svd", "eigh", "cholesky"])
     def test_multivariate_normal_basic_stats(self, method):
diff --git a/numpy/typing/tests/data/pass/arithmetic.py b/numpy/typing/tests/data/pass/arithmetic.py
index 4ed69c923..07a990127 100644
--- a/numpy/typing/tests/data/pass/arithmetic.py
+++ b/numpy/typing/tests/data/pass/arithmetic.py
@@ -2,6 +2,7 @@ from __future__ import annotations
 
 from typing import Any
 import numpy as np
+import pytest
 
 c16 = np.complex128(1)
 f8 = np.float64(1)
@@ -330,8 +331,9 @@ AR_O **= AR_LIKE_O
 -f4
 -i8
 -i4
--u8
--u4
+with pytest.warns(RuntimeWarning):
+    -u8
+    -u4
 -td
 -AR_f