Merge branch 'main' into deprecate-find-common-type

author: Charles Harris <charlesr.harris@gmail.com> 2023-05-13 11:02:49 -0600
committer: GitHub <noreply@github.com> 2023-05-13 11:02:49 -0600
commit: 5187067d7ad176ee3614beab2b99a524dd719aa8 (patch)
tree: 907997d0c294f550193322aaa73237c1a7bcfaa6
parent: b786189222ac5bf2f4efbb04399261f7f760bc18 (diff)
parent: 81caed6e3c34c4bf4b22b4f6167e816ba2a3f73c (diff)
download: numpy-5187067d7ad176ee3614beab2b99a524dd719aa8.tar.gz
786 files changed, 31161 insertions, 16889 deletions
diff --git a/.circleci/config.yml b/.circleci/config.yml
index a75f669af..f5bd44798 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -1,23 +1,27 @@
-# Python CircleCI 2.0 configuration file
+# Python CircleCI 2.1 configuration file
 #
-# Check https://circleci.com/docs/2.0/language-python/ for more details
+# Check https://circleci.com/docs/2.1/language-python/ for more details
 #
-version: 2
-jobs:
-  build:
-    docker:
-      # CircleCI maintains a library of pre-built images
-      # documented at https://circleci.com/docs/2.0/circleci-images/
-      # circleci/python3.8 images come with old versions of Doxygen(1.6.x),
-      # therefore a new base image chose instead to guarantee to
-      # have a newer version >= 1.8.10 to avoid warnings
-      # that related to the default behaviors or non-exist config options
-      - image: cimg/base:2021.05
+version: 2.1
+
+# Aliases to reuse
+_defaults: &defaults
+  docker:
+    # CircleCI maintains a library of pre-built images
+    # documented at https://circleci.com/docs/2.1/circleci-images/
+    # circleci/python3.8 images come with old versions of Doxygen(1.6.x),
+    # therefore a new base image chose instead to guarantee to
+    # have a newer version >= 1.8.10 to avoid warnings
+    # that related to the default behaviors or non-exist config options
+    - image: cimg/python:3.9
+  working_directory: ~/repo
 
-    working_directory: ~/repo
 
+jobs:
+  build:
+    <<: *defaults
     steps:
-      - checkout:
+      - checkout
 
       - run:
           name: Check-skip
@@ -47,9 +51,10 @@ jobs:
           name: create virtual environment, install dependencies
           command: |
             sudo apt-get update
-            sudo apt-get install -y python3.8 python3.8-dev python3-venv graphviz texlive-fonts-recommended texlive-latex-recommended \
-              texlive-latex-extra latexmk texlive-xetex doxygen
-            python3.8 -m venv venv
+            #sudo apt-get install -y python3.9 python3.9-dev python3-venv graphviz texlive-fonts-recommended texlive-latex-recommended \
+            sudo apt-get install -y graphviz texlive-fonts-recommended texlive-latex-recommended \
+              texlive-latex-extra latexmk texlive-xetex texlive-lang-chinese doxygen
+            python3.9 -m venv venv
             . venv/bin/activate
 
       - run:
@@ -82,77 +87,92 @@ jobs:
             . venv/bin/activate
             cd doc
             # Don't use -q, show warning summary"
-            SPHINXOPTS="-j4 -n" make -e html || echo "ignoring errors for now, see gh-13114"
-
-      - run:
-          name: build devdocs
-          no_output_timeout: 30m
-          command: |
-            . venv/bin/activate
-            cd doc
-            make clean
-            SPHINXOPTS="-j4 -q" make -e html
+            SPHINXOPTS="-j2 -n" make -e html || echo "ignoring errors for now, see gh-13114"
+            if [[ $(find build/html -type f | wc -l) -lt 1000 ]]; then
+                echo "doc build failed: build/html is empty"
+                exit -1
+            fi
 
       - run:
           name: build neps
           command: |
             . venv/bin/activate
             cd doc/neps
-            SPHINXOPTS="-j4 -q" make -e html
+            SPHINXOPTS="-j2 -q" make -e html
 
       - store_artifacts:
           path: doc/build/html/
 
-
       - store_artifacts:
           path: doc/neps/_build/html/
      #      destination: neps
 
+      - persist_to_workspace:
+          root: ~/repo
+          paths:
+            - doc/build/html
+            - doc/neps/_build
+            - tools/ci/push_docs_to_repo.py
+
+  deploy:
+    <<: *defaults
+    steps:
+      - checkout
+
+      - attach_workspace:
+          at: ~/repo
+
       - add_ssh_keys:
           fingerprints:
-            - "9f:8c:e5:3f:53:40:0b:ee:c9:c3:0f:fd:0f:3c:cc:55"
+            - "45:d8:d1:d6:f7:53:47:c5:d0:9e:35:19:79:e7:ff:24"
 
       -  run:
           name: deploy devdocs
           command: |
-            if [ "${CIRCLE_BRANCH}" == "main" ]; then
-              touch doc/build/html/.nojekyll
-
-              ./tools/ci/push_docs_to_repo.py doc/build/html \
-                  git@github.com:numpy/devdocs.git \
-                  --committer "numpy-circleci-bot" \
-                  --email "numpy-circleci-bot@nomail" \
-                  --message "Docs build of $CIRCLE_SHA1" \
-                  --force
-            else
-              echo "Not on the main branch; skipping deployment"
-            fi
+            touch doc/build/html/.nojekyll
+
+            ./tools/ci/push_docs_to_repo.py doc/build/html \
+                --committer "numpy-circleci-bot" \
+                --email "numpy-circleci-bot@nomail" \
+                --message "Docs build of $CIRCLE_SHA1" \
+                --count 5 \
+                --force \
+                git@github.com:numpy/devdocs.git
 
       - add_ssh_keys:
           fingerprints:
-            - "11:fb:19:69:80:3a:6d:37:9c:d1:ac:20:17:cd:c8:17"
+            - "df:8b:fb:34:2d:38:7d:49:fc:1b:e8:44:4f:bd:2c:0e"
 
       - run:
           name: select SSH key for neps repo
           command: |
-            cat <<\EOF > ~/.ssh/config
+            cat \<<\EOF > ~/.ssh/config
             Host github.com
               IdentitiesOnly yes
-              IdentityFile /home/circleci/.ssh/id_rsa_11fb1969803a6d379cd1ac2017cdc817
+              IdentityFile /home/circleci/.ssh/id_rsa_df8bfb342d387d49fc1be8444fbd2c0e
             EOF
 
       -  run:
           name: deploy neps
           command: |
-            if [ "${CIRCLE_BRANCH}" == "main" ]; then
-              touch doc/neps/_build/html/.nojekyll
-
-              ./tools/ci/push_docs_to_repo.py doc/neps/_build/html \
-                  git@github.com:numpy/neps.git \
-                  --committer "numpy-circleci-bot" \
-                  --email "numpy-circleci-bot@nomail" \
-                  --message "Docs build of $CIRCLE_SHA1" \
-                  --force
-            else
-              echo "Not on the main branch; skipping deployment"
-            fi
+            touch doc/neps/_build/html/.nojekyll
+
+            ./tools/ci/push_docs_to_repo.py doc/neps/_build/html \
+                --committer "numpy-circleci-bot" \
+                --email "numpy-circleci-bot@nomail" \
+                --message "Docs build of $CIRCLE_SHA1" \
+                --count 5 \
+                --force \
+                git@github.com:numpy/neps.git \
+
+workflows:
+  version: 2
+  default:
+    jobs:
+      - build
+      - deploy:
+          requires:
+            - build
+          filters:
+            branches:
+              only: main
diff --git a/.cirrus.star b/.cirrus.star
new file mode 100644
index 000000000..25c7b7dfd
--- /dev/null
+++ b/.cirrus.star
@@ -0,0 +1,39 @@
+# The guide to programming cirrus-ci tasks using starlark is found at
+# https://cirrus-ci.org/guide/programming-tasks/
+#
+# In this simple starlark script we simply check conditions for whether
+# a CI run should go ahead. If the conditions are met, then we just
+# return the yaml containing the tasks to be run.
+
+load("cirrus", "env", "fs", "http")
+
+def main(ctx):
+    ######################################################################
+    # Should wheels be built?
+    # Only test on the numpy/numpy repository
+    ######################################################################
+
+    if env.get("CIRRUS_REPO_FULL_NAME") != "numpy/numpy":
+        return []
+
+    # only run the wheels entry on a cron job
+    if env.get("CIRRUS_CRON", "") == "nightly":
+        return fs.read("tools/ci/cirrus_wheels.yml")
+
+    # Obtain commit message for the event. Unfortunately CIRRUS_CHANGE_MESSAGE
+    # only contains the actual commit message on a non-PR trigger event.
+    # For a PR event it contains the PR title and description.
+    SHA = env.get("CIRRUS_CHANGE_IN_REPO")
+    url = "https://api.github.com/repos/scipy/scipy/git/commits/" + SHA
+    dct = http.get(url).json()
+    # if "[wheel build]" in dct["message"]:
+    #     return fs.read("ci/cirrus_wheels.yml")
+
+    if "[skip cirrus]" in dct["message"] or "[skip ci]" in dct["message"]:
+        return []
+
+    # add extra jobs to the cirrus run by += adding to config
+    config = fs.read("tools/ci/cirrus_wheels.yml")
+    config += fs.read("tools/ci/cirrus_macosx_arm64.yml")
+
+    return config
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
new file mode 100644
index 000000000..a31be7931
--- /dev/null
+++ b/.devcontainer/devcontainer.json
@@ -0,0 +1,17 @@
+{
+	// More info about Features: https://containers.dev/features
+	"image": "mcr.microsoft.com/devcontainers/universal:2",
+	"features": {},
+
+	"onCreateCommand": ".devcontainer/setup.sh",
+	"postCreateCommand": "",
+
+	"customizations": {
+		"vscode": {
+			"extensions": [
+				"ms-python.python"
+			],
+			"settings": {}
+		}
+	}
+}
diff --git a/.devcontainer/setup.sh b/.devcontainer/setup.sh
new file mode 100755
index 000000000..4ea718ec9
--- /dev/null
+++ b/.devcontainer/setup.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+set -e
+
+curl micro.mamba.pm/install.sh | bash
+
+conda init --all
+micromamba shell init -s bash
+micromamba env create -f environment.yml --yes
+# Note that `micromamba activate numpy-dev` doesn't work, it must be run by the
+# user (same applies to `conda activate`)
+
+git submodule update --init
diff --git a/.gitattributes b/.gitattributes
index 911db2b72..537650d39 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -12,6 +12,7 @@ numpy/linalg/lapack_lite/f2c.h linguist-vendored
 tools/npy_tempita/* linguist-vendored
 numpy/core/include/numpy/libdivide/* linguist-vendored
 numpy/core/src/umath/svml/* linguist-vendored
+numpy/core/src/common/dlpack/dlpack.h linguist-vendored
 
 # Mark some files as generated
 numpy/linalg/lapack_lite/f2c_*.c linguist-generated
diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml
index cfb07f450..a15a3a144 100644
--- a/.github/ISSUE_TEMPLATE/bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/bug-report.yml
@@ -45,8 +45,10 @@ body:
 
 - type: textarea
   attributes:
-    label: "NumPy/Python version information:"
-    description: Output from `import sys, numpy; print(numpy.__version__, sys.version)`.
+    label: "Runtime information:"
+    description: >
+      Output from `import sys, numpy; print(numpy.__version__); print(sys.version)`
+      If you are running NumPy 1.24+, also show `print(numpy.show_runtime())`
   validations:
     required: true
 
@@ -58,4 +60,4 @@ body:
     placeholder: |
       << your explanation here >>
   validations:
-    required: false
-\ No newline at end of file
+    required: false
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
new file mode 100644
index 000000000..b59fe181d
--- /dev/null
+++ b/.github/dependabot.yml
@@ -0,0 +1,10 @@
+version: 2
+updates:
+  - package-ecosystem: github-actions
+    directory: /
+    schedule:
+      interval: daily
+    commit-message:
+      prefix: "MAINT"
+    labels:
+      - "03 - Maintenance"
diff --git a/.github/workflows/build_test.yml b/.github/workflows/build_test.yml
index 53fa13f76..70e11a4a4 100644
--- a/.github/workflows/build_test.yml
+++ b/.github/workflows/build_test.yml
@@ -3,8 +3,8 @@ name: Build_Test
 on:
   push:
     branches:
+      # coverage comparison in the "full" step needs to run on main after merges
       - main
-      - maintenance/**
   pull_request:
     branches:
       - main
@@ -16,7 +16,7 @@ defaults:
 
 env:
   DOWNLOAD_OPENBLAS: 1
-  PYTHON_VERSION: 3.8
+  PYTHON_VERSION: 3.9
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -27,15 +27,15 @@ permissions:
 
 jobs:
   lint:
-    if: "github.repository == 'numpy/numpy' && github.ref != 'refs/heads/main' && !contains(github.event.head_commit.message, '[ci skip]') && !contains(github.event.head_commit.message, '[skip ci]') && !contains(github.event.head_commit.message, '[skip github]')"
+    if: github.repository == 'numpy/numpy' && github.event_name != 'push'
     runs-on: ubuntu-latest
     continue-on-error: true
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
       with:
         submodules: recursive
         fetch-depth: 0
-    - uses: actions/setup-python@v4
+    - uses: actions/setup-python@57ded4d7d5e986d7296eab16560982c6dd7c923b # v4.6.0
       with:
         python-version: ${{ env.PYTHON_VERSION }}
     - name: Install linter requirements
@@ -46,16 +46,16 @@ jobs:
         python tools/linter.py --branch origin/${{ github.base_ref }}
 
   smoke_test:
-    if: "github.repository == 'numpy/numpy' && !contains(github.event.head_commit.message, '[ci skip]') && !contains(github.event.head_commit.message, '[skip ci]') && !contains(github.event.head_commit.message, '[skip github]')"
+    if: "github.repository == 'numpy/numpy'"
     runs-on: ubuntu-latest
     env:
       WITHOUT_SIMD: 1
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
       with:
         submodules: recursive
         fetch-depth: 0
-    - uses: actions/setup-python@v4
+    - uses: actions/setup-python@57ded4d7d5e986d7296eab16560982c6dd7c923b # v4.6.0
       with:
         python-version: ${{ env.PYTHON_VERSION }}
     - uses: ./.github/actions
@@ -63,50 +63,45 @@ jobs:
   basic:
     needs: [smoke_test]
     runs-on: ubuntu-latest
+    if: github.event_name != 'push'
     strategy:
       matrix:
-        python-version: ["3.9", "3.10", "3.11-dev"]
+        python-version: ["3.9", "3.10", "3.11", "pypy3.9-v7.3.11"]
     env:
       EXPECT_CPU_FEATURES: "SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C FMA3 AVX2 AVX512F AVX512CD AVX512_KNL AVX512_KNM AVX512_SKX AVX512_CLX AVX512_CNL AVX512_ICL"
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
       with:
         submodules: recursive
         fetch-depth: 0
-    - uses: actions/setup-python@v4
+    - uses: actions/setup-python@57ded4d7d5e986d7296eab16560982c6dd7c923b # v4.6.0
       with:
         python-version: ${{ matrix.python-version }}
     - uses: ./.github/actions
 
   old_gcc:
     needs: [smoke_test]
-    # provides GCC 6, 7, 8
-    runs-on: ubuntu-18.04
+    # provides GCC  7, 8
+    runs-on: ubuntu-20.04
+    if: github.event_name != 'push'
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
       with:
         submodules: recursive
         fetch-depth: 0
     # comes with python3.6
-    - name: Install Python3.8
+    - name: Install Python3.9
       run: |
         sudo apt update
         # for add-apt-repository
         sudo apt install software-properties-common -y
         sudo add-apt-repository ppa:deadsnakes/ppa -y
-        sudo apt install python3.8-dev -y
-        sudo ln -s /usr/bin/python3.8 /usr/bin/pythonx
+        sudo apt install python3.9-dev -y
+        sudo ln -s /usr/bin/python3.9 /usr/bin/pythonx
         pythonx -m pip install --upgrade pip setuptools wheel
         pythonx -m pip install -r test_requirements.txt
     - name: Install Compilers
-      run: sudo apt install g++-6 g++-7 g++-8 -y
-    - name: Build gcc-6
-      run: |
-        export CC=/usr/bin/gcc-6
-        export CXX=/usr/bin/g++-6
-        pythonx setup.py install --user
-    - name: Runtests gcc-6
-      run: pythonx runtests.py -n
+      run: sudo apt install g++-7 g++-8 -y
     - name: Build gcc-7
       run: |
         export CC=/usr/bin/gcc-7
@@ -125,14 +120,31 @@ jobs:
   without_optimizations:
     needs: [smoke_test]
     runs-on: ubuntu-latest
+    if: github.event_name != 'push'
     env:
       WITHOUT_OPTIMIZATIONS: 1
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
       with:
         submodules: recursive
         fetch-depth: 0
-    - uses: actions/setup-python@v4
+    - uses: actions/setup-python@57ded4d7d5e986d7296eab16560982c6dd7c923b # v4.6.0
+      with:
+        python-version: ${{ env.PYTHON_VERSION }}
+    - uses: ./.github/actions
+
+  with_baseline_only:
+    needs: [smoke_test]
+    runs-on: ubuntu-latest
+    if: github.event_name != 'push'
+    env:
+      CPU_DISPATCH: "none"
+    steps:
+    - uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
+      with:
+        submodules: recursive
+        fetch-depth: 0
+    - uses: actions/setup-python@57ded4d7d5e986d7296eab16560982c6dd7c923b # v4.6.0
       with:
         python-version: ${{ env.PYTHON_VERSION }}
     - uses: ./.github/actions
@@ -140,14 +152,15 @@ jobs:
   without_avx512:
     needs: [smoke_test]
     runs-on: ubuntu-latest
+    if: github.event_name != 'push'
     env:
       CPU_DISPATCH: "max -xop -fma4 -avx512f -avx512cd -avx512_knl -avx512_knm -avx512_skx -avx512_clx -avx512_cnl -avx512_icl"
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
       with:
         submodules: recursive
         fetch-depth: 0
-    - uses: actions/setup-python@v4
+    - uses: actions/setup-python@57ded4d7d5e986d7296eab16560982c6dd7c923b # v4.6.0
       with:
         python-version: ${{ env.PYTHON_VERSION }}
     - uses: ./.github/actions
@@ -155,29 +168,31 @@ jobs:
   without_avx512_avx2_fma3:
     needs: [smoke_test]
     runs-on: ubuntu-latest
+    if: github.event_name != 'push'
     env:
       CPU_DISPATCH: "SSSE3 SSE41 POPCNT SSE42 AVX F16C"
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
       with:
         submodules: recursive
         fetch-depth: 0
-    - uses: actions/setup-python@v4
+    - uses: actions/setup-python@57ded4d7d5e986d7296eab16560982c6dd7c923b # v4.6.0
       with:
         python-version: ${{ env.PYTHON_VERSION }}
     - uses: ./.github/actions
 
   debug:
     needs: [smoke_test]
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-latest
+    if: github.event_name != 'push'
     env:
       USE_DEBUG: 1
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
       with:
         submodules: recursive
         fetch-depth: 0
-    - uses: actions/setup-python@v4
+    - uses: actions/setup-python@57ded4d7d5e986d7296eab16560982c6dd7c923b # v4.6.0
       with:
         python-version: ${{ env.PYTHON_VERSION }}
     - uses: ./.github/actions
@@ -185,32 +200,33 @@ jobs:
   blas64:
     needs: [smoke_test]
     runs-on: ubuntu-latest
+    if: github.event_name != 'push'
     env:
       NPY_USE_BLAS_ILP64: 1
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
       with:
         submodules: recursive
         fetch-depth: 0
-    - uses: actions/setup-python@v4
+    - uses: actions/setup-python@57ded4d7d5e986d7296eab16560982c6dd7c923b # v4.6.0
       with:
         python-version: ${{ env.PYTHON_VERSION }}
     - uses: ./.github/actions
 
   full:
     needs: [smoke_test]
-    runs-on: ubuntu-18.04
+    runs-on: ubuntu-22.04
     env:
       USE_WHEEL: 1
       RUN_FULL_TESTS: 1
       RUN_COVERAGE: 1
       INSTALL_PICKLE5: 1
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
       with:
         submodules: recursive
         fetch-depth: 0
-    - uses: actions/setup-python@v4
+    - uses: actions/setup-python@57ded4d7d5e986d7296eab16560982c6dd7c923b # v4.6.0
       with:
         python-version: ${{ env.PYTHON_VERSION }}
     - uses: ./.github/actions
@@ -218,6 +234,7 @@ jobs:
   benchmark:
     needs: [smoke_test]
     runs-on: ubuntu-latest
+    if: github.event_name != 'push'
     env:
       PYTHONOPTIMIZE: 2
       BLAS: None
@@ -227,11 +244,11 @@ jobs:
       NPY_LAPACK_ORDER: MKL,OPENBLAS,ATLAS,LAPACK
       USE_ASV: 1
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
       with:
         submodules: recursive
         fetch-depth: 0
-    - uses: actions/setup-python@v4
+    - uses: actions/setup-python@57ded4d7d5e986d7296eab16560982c6dd7c923b # v4.6.0
       with:
         python-version: ${{ env.PYTHON_VERSION }}
     - uses: ./.github/actions
@@ -239,16 +256,17 @@ jobs:
   relaxed_strides_debug:
     needs: [smoke_test]
     runs-on: ubuntu-latest
+    if: github.event_name != 'push'
     env:
       CHECK_BLAS: 1
       NPY_USE_BLAS_ILP64: 1
       NPY_RELAXED_STRIDES_DEBUG: 1
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
       with:
         submodules: recursive
         fetch-depth: 0
-    - uses: actions/setup-python@v4
+    - uses: actions/setup-python@57ded4d7d5e986d7296eab16560982c6dd7c923b # v4.6.0
       with:
         python-version: ${{ env.PYTHON_VERSION }}
     - uses: ./.github/actions
@@ -256,29 +274,35 @@ jobs:
   use_wheel:
     needs: [smoke_test]
     runs-on: ubuntu-latest
+    if: github.event_name != 'push'
     env:
       USE_WHEEL: 1
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
       with:
         submodules: recursive
         fetch-depth: 0
-    - uses: actions/setup-python@v4
+    - uses: actions/setup-python@57ded4d7d5e986d7296eab16560982c6dd7c923b # v4.6.0
       with:
         python-version: ${{ env.PYTHON_VERSION }}
     - uses: ./.github/actions
 
-  no_array_func:
+  numpy2_flag:
     needs: [smoke_test]
     runs-on: ubuntu-latest
+    if: github.event_name != 'push'
     env:
-      NUMPY_EXPERIMENTAL_ARRAY_FUNCTION: 0
+      # Test for numpy-2.0 feature-flagged behavior.
+      NPY_NUMPY_2_BEHAVIOR: 1
+      # Using the future "weak" state doesn't pass tests
+      # currently unfortunately
+      NPY_PROMOTION_STATE: legacy
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
       with:
         submodules: recursive
         fetch-depth: 0
-    - uses: actions/setup-python@v4
+    - uses: actions/setup-python@57ded4d7d5e986d7296eab16560982c6dd7c923b # v4.6.0
       with:
         python-version: ${{ env.PYTHON_VERSION }}
     - uses: ./.github/actions
@@ -286,55 +310,45 @@ jobs:
   no_openblas:
     needs: [smoke_test]
     runs-on: ubuntu-latest
+    if: github.event_name != 'push'
     env:
       BLAS: None
       LAPACK: None
       ATLAS: None
       DOWNLOAD_OPENBLAS: ''
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
       with:
         submodules: recursive
         fetch-depth: 0
-    - uses: actions/setup-python@v4
+    - uses: actions/setup-python@57ded4d7d5e986d7296eab16560982c6dd7c923b # v4.6.0
       with:
         python-version: ${{ env.PYTHON_VERSION }}
     - uses: ./.github/actions
 
-  pypy38:
-    needs: [smoke_test]
-    runs-on: ubuntu-latest
-    steps:
-    - uses: actions/checkout@v3
-      with:
-        submodules: recursive
-        fetch-depth: 0
-    - uses: actions/setup-python@v4
-      with:
-        python-version: pypy-3.8-v7.3.9
-    - uses: ./.github/actions
-
   sdist:
     needs: [smoke_test]
     runs-on: ubuntu-latest
+    if: github.event_name != 'push'
     env:
       USE_SDIST: 1
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
       with:
         submodules: recursive
         fetch-depth: 0
-    - uses: actions/setup-python@v4
+    - uses: actions/setup-python@57ded4d7d5e986d7296eab16560982c6dd7c923b # v4.6.0
       with:
         python-version: ${{ env.PYTHON_VERSION }}
     - uses: ./.github/actions
 
   armv7_simd_test:
     needs: [smoke_test]
-    # make sure this (20.04) matches the base docker image (focal) below
-    runs-on: ubuntu-20.04
+    # make sure this matches the base docker image below
+    runs-on: ubuntu-22.04
+    if: github.event_name != 'push'
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
       with:
         submodules: recursive
         fetch-depth: 0
@@ -345,19 +359,20 @@ jobs:
       run: |
         # use x86_64 cross-compiler to speed up the build
         sudo apt update
-        sudo apt install -y gcc-arm-linux-gnueabihf g++-arm-linux-gnueabihf
+        sudo apt install -y gcc-arm-linux-gnueabihf g++-arm-linux-gnueabihf gfortran-arm-linux-gnueabihf
 
         # Keep the `test_requirements.txt` dependency-subset synced
-        docker run --name the_container --interactive -v /:/host arm32v7/ubuntu:focal /bin/bash -c "
+        docker run --name the_container --interactive -v /:/host arm32v7/ubuntu:22.04 /bin/bash -c "
           apt update &&
-          apt install -y git python3 python3-dev python3-pip &&
-          pip3 install cython==0.29.30 setuptools\<49.2.0 hypothesis==6.23.3 pytest==6.2.5 'typing_extensions>=4.2.0' &&
+          apt install -y git python3 python3-dev python3-pip  &&
+          python3 -m pip install cython==0.29.34 setuptools\<49.2.0 hypothesis==6.23.3 pytest==6.2.5 'typing_extensions>=4.2.0' &&
           ln -s /host/lib64 /lib64 &&
           ln -s /host/lib/x86_64-linux-gnu /lib/x86_64-linux-gnu &&
           ln -s /host/usr/arm-linux-gnueabihf /usr/arm-linux-gnueabihf &&
           rm -rf /usr/lib/gcc/arm-linux-gnueabihf && ln -s /host/usr/lib/gcc-cross/arm-linux-gnueabihf /usr/lib/gcc/arm-linux-gnueabihf &&
           rm -f /usr/bin/arm-linux-gnueabihf-gcc && ln -s /host/usr/bin/arm-linux-gnueabihf-gcc /usr/bin/arm-linux-gnueabihf-gcc &&
           rm -f /usr/bin/arm-linux-gnueabihf-g++ && ln -s /host/usr/bin/arm-linux-gnueabihf-g++ /usr/bin/arm-linux-gnueabihf-g++ &&
+          rm -f /usr/bin/arm-linux-gnueabihf-gfortran && ln -s /host/usr/bin/arm-linux-gnueabihf-gfortran /usr/bin/arm-linux-gnueabihf-gfortran &&
           rm -f /usr/bin/arm-linux-gnueabihf-ar && ln -s /host/usr/bin/arm-linux-gnueabihf-ar /usr/bin/arm-linux-gnueabihf-ar &&
           rm -f /usr/bin/arm-linux-gnueabihf-as && ln -s /host/usr/bin/arm-linux-gnueabihf-as /usr/bin/arm-linux-gnueabihf-as &&
           rm -f /usr/bin/arm-linux-gnueabihf-ld && ln -s /host/usr/bin/arm-linux-gnueabihf-ld /usr/bin/arm-linux-gnueabihf-ld &&
@@ -370,6 +385,7 @@ jobs:
           uname -a &&
           gcc --version &&
           g++ --version &&
+          arm-linux-gnueabihf-gfortran --version &&
           python3 --version &&
           git config --global --add safe.directory /numpy
           cd /numpy &&
@@ -379,7 +395,7 @@ jobs:
     - name: Run SIMD Tests
       run: |
         docker run --rm --interactive -v $(pwd):/numpy the_build /bin/bash -c "
-          cd /numpy && python3 runtests.py -n -v -- -k test_simd
+          cd /numpy && F90=arm-linux-gnueabihf-gfortran python3 runtests.py -n -v -- -k 'test_simd or test_kind'
         "
 
   sde_simd_avx512_test:
@@ -389,11 +405,11 @@ jobs:
     needs: [smoke_test]
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
       with:
         submodules: recursive
         fetch-depth: 0
-    - uses: actions/setup-python@v4
+    - uses: actions/setup-python@57ded4d7d5e986d7296eab16560982c6dd7c923b # v4.6.0
       with:
         python-version: ${{ env.PYTHON_VERSION }}
     - name: Install Intel SDE
@@ -413,3 +429,39 @@ jobs:
     # ICL implies SKX, CLX and CNL
     - name: Run SIMD tests (Ice Lake)
       run: sde -icl -- python runtests.py -n -v -- -k test_simd
+
+  intel_spr_sde_test:
+    needs: [smoke_test]
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.4.0
+      with:
+        submodules: recursive
+        fetch-depth: 0
+    - uses: actions/setup-python@57ded4d7d5e986d7296eab16560982c6dd7c923b # v4.6.0
+      with:
+        python-version: ${{ env.PYTHON_VERSION }}
+    - name: Install Intel SDE
+      run: |
+        curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/751535/sde-external-9.14.0-2022-10-25-lin.tar.xz
+        mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/
+        sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde
+    - name: Install dependencies
+      run: |
+        python -m pip install -r test_requirements.txt
+        sudo apt install gcc-12 g++-12
+    - name: Build and install NumPy
+      run: |
+        export CC=/usr/bin/gcc-12
+        export CXX=/usr/bin/g++-12
+        python -m pip install -e .
+    - name: Show config
+      run: |
+        python -c "import numpy as np; np.show_config()"
+    # Run only a few tests, running everything in an SDE takes a long time
+    # Using pytest directly, unable to use python runtests.py -n -t ...
+    # Disabled running in the SDE because of an SDE bug
+    - name: Run linalg/ufunc/umath tests
+      run: |
+        python -m pytest numpy/core/tests/test_umath* numpy/core/tests/test_ufunc.py numpy/linalg/tests/test_*
+        #sde -spr -- python -m pytest numpy/core/tests/test_umath* numpy/core/tests/test_ufunc.py numpy/linalg/tests/test_*
diff --git a/.github/workflows/circleci.yml b/.github/workflows/circleci.yml
index 63c666891..75638f6b6 100644
--- a/.github/workflows/circleci.yml
+++ b/.github/workflows/circleci.yml
@@ -2,19 +2,24 @@
 #
 # if: github.repository == 'numpy/numpy'
 
-permissions:
-  contents: read # to fetch code (actions/checkout)
+name: CircleCI artifact redirector
 
 on: [status]
+
+permissions: read-all
+
 jobs:
   circleci_artifacts_redirector_job:
     runs-on: ubuntu-latest
+    if: "github.repository == 'numpy/numpy' && !contains(github.event.head_commit.message, '[circle skip]') && !contains(github.event.head_commit.message, '[skip circle]')  && github.event.context == 'ci/circleci: build'"
     name: Run CircleCI artifacts redirector
-    # if: github.repository == 'numpy/numpy'
+    permissions:
+      statuses: write
     steps:
       - name: GitHub Action step
-        uses: larsoner/circleci-artifacts-redirector-action@master
+        uses: larsoner/circleci-artifacts-redirector-action@0a7552bf8cf99cbd40a8928fa48e858e205b98c8 # master
         with:
           repo-token: ${{ secrets.GITHUB_TOKEN }}
+          api-token: ${{ secrets.CIRCLE_TOKEN }}
           artifact-path: 0/doc/build/html/index.html
           circleci-jobs: build
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
new file mode 100644
index 000000000..05eca6116
--- /dev/null
+++ b/.github/workflows/codeql.yml
@@ -0,0 +1,73 @@
+# For most projects, this workflow file will not need changing; you simply need
+# to commit it to your repository.
+#
+# You may wish to alter this file to override the set of languages analyzed,
+# or to provide custom queries or build logic.
+#
+# ******** NOTE ********
+# We have attempted to detect the languages in your repository. Please check
+# the `language` matrix defined below to confirm you have the correct set of
+# supported CodeQL languages.
+#
+name: "CodeQL"
+
+on:
+  push:
+    branches: ["main"]
+  pull_request:
+    # The branches below must be a subset of the branches above
+    branches: ["main"]
+  schedule:
+    - cron: "0 0 * * 1"
+
+permissions:
+  contents: read
+
+jobs:
+  analyze:
+    name: Analyze
+    runs-on: ubuntu-latest
+    permissions:
+      actions: read
+      contents: read
+      security-events: write
+
+    strategy:
+      fail-fast: false
+      matrix:
+        language: ["python"]
+        # CodeQL supports [ $supported-codeql-languages ]
+        # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
+
+      # Initializes the CodeQL tools for scanning.
+      - name: Initialize CodeQL
+        uses: github/codeql-action/init@29b1f65c5e92e24fe6b6647da1eaabe529cec70f # v2.3.3
+        with:
+          languages: ${{ matrix.language }}
+          # If you wish to specify custom queries, you can do so here or in a config file.
+          # By default, queries listed here will override any specified in a config file.
+          # Prefix the list here with "+" to use these queries and those in the config file.
+
+      # Autobuild attempts to build any compiled languages  (C/C++, C#, or Java).
+      # If this step fails, then you should remove it and run the build manually (see below)
+      - name: Autobuild
+        uses: github/codeql-action/autobuild@29b1f65c5e92e24fe6b6647da1eaabe529cec70f # v2.3.3
+
+      # ℹ️ Command-line programs to run using the OS shell.
+      # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
+
+      #   If the Autobuild fails above, remove it and uncomment the following three lines.
+      #   modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance.
+
+      # - run: |
+      #   echo "Run, Build Application using script"
+      #   ./location_of_script_within_repo/buildscript.sh
+
+      - name: Perform CodeQL Analysis
+        uses: github/codeql-action/analyze@29b1f65c5e92e24fe6b6647da1eaabe529cec70f # v2.3.3
+        with:
+          category: "/language:${{matrix.language}}"
diff --git a/.github/workflows/cygwin.yml b/.github/workflows/cygwin.yml
index c028b2974..1865825c4 100644
--- a/.github/workflows/cygwin.yml
+++ b/.github/workflows/cygwin.yml
@@ -3,10 +3,6 @@
 # if: github.repository == 'numpy/numpy'
 name: Test on Cygwin
 on:
-  push:
-    branches:
-      - main
-      - maintenance/**
   pull_request:
     branches:
       - main
@@ -22,25 +18,25 @@ permissions:
 jobs:
   cygwin_build_test:
     runs-on: windows-latest
-    if: github.repository == 'numpy/numpy'
+    if: "github.repository == 'numpy/numpy'"
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
         with:
           submodules: recursive
           fetch-depth: 0
       - name: Install Cygwin
-        uses: cygwin/cygwin-install-action@v2
+        uses: cygwin/cygwin-install-action@006ad0b0946ca6d0a3ea2d4437677fa767392401 # v4
         with:
           platform: x86_64
           install-dir: 'C:\tools\cygwin'
           packages: >-
-            python38-devel python38-zipp python38-importlib-metadata
-            python38-cython python38-pip python38-wheel python38-cffi
-            python38-pytz python38-setuptools python38-pytest
-            python38-hypothesis liblapack-devel
+            python39-devel python39-zipp python39-importlib-metadata
+            python39-cython python39-pip python39-wheel python39-cffi
+            python39-pytz python39-setuptools python39-pytest
+            python39-hypothesis liblapack-devel
             gcc-fortran gcc-g++ git dash
       - name: Set Windows PATH
-        uses: egor-tensin/cleanup-path@v1
+        uses: egor-tensin/cleanup-path@8469525c8ee3eddabbd3487658621a6235b3c581 # v3
         with:
           dirs: 'C:\tools\cygwin\bin;C:\tools\cygwin\lib\lapack'
       - name: Verify that bash is Cygwin bash
@@ -53,34 +49,35 @@ jobs:
       - name: Verify python version
         # Make sure it's the Cygwin one, not a Windows one
         run: |
-          dash -c "which python3.8; /usr/bin/python3.8 --version -V"
+          dash -c "which python3.9; /usr/bin/python3.9 --version -V"
       - name: Build NumPy wheel
         run: |
-          dash -c "/usr/bin/python3.8 -m pip install 'setuptools<49.2.0' pytest pytz cffi pickle5 importlib_metadata typing_extensions"
-          dash -c "/usr/bin/python3.8 -m pip install -r test_requirements.txt"
-          dash -c "/usr/bin/python3.8 setup.py bdist_wheel"
+          dash -c "/usr/bin/python3.9 -m pip install 'setuptools<49.2.0' pytest pytz cffi pickle5 importlib_metadata typing_extensions"
+          dash -c "/usr/bin/python3.9 -m pip install -r test_requirements.txt"
+          dash -c "/usr/bin/python3.9 setup.py bdist_wheel"
       - name: Install new NumPy
         run: |
-          bash -c "/usr/bin/python3.8 -m pip install dist/numpy-*cp38*.whl"
+          bash -c "/usr/bin/python3.9 -m pip install dist/numpy-*cp39*.whl"
       - name: Rebase NumPy compiled extensions
         run: |
-          dash "tools/rebase_installed_dlls_cygwin.sh" 3.8
+          dash "tools/rebase_installed_dlls_cygwin.sh" 3.9
       - name: Run NumPy test suite
-        run: >-
-          dash -c "/usr/bin/python3.8 runtests.py -n -vv"
+        shell: "C:\\tools\\cygwin\\bin\\bash.exe -o igncr -eo pipefail {0}"
+        run: |
+          /usr/bin/python3.9 runtests.py -n
       - name: Upload wheel if tests fail
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # v3.1.2
         if: failure()
         with:
           name: numpy-cygwin-wheel
-          path: dist/numpy-*cp38*.whl
+          path: dist/numpy-*cp39*.whl
       - name: Check the extension modules on failure
         if: failure()
         run: |
-          dash -c "/usr/bin/python3.8 -m pip show numpy"
-          dash -c "/usr/bin/python3.8 -m pip show -f numpy | grep .dll"
+          dash -c "/usr/bin/python3.9 -m pip show numpy"
+          dash -c "/usr/bin/python3.9 -m pip show -f numpy | grep .dll"
           dash -c "/bin/tr -d '\r' <tools/list_installed_dll_dependencies_cygwin.sh >list_dlls_unix.sh"
-          dash "list_dlls_unix.sh" 3.8
+          dash "list_dlls_unix.sh" 3.9
       - name: Print installed package versions on failure
         if: failure()
         run: |
diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml
new file mode 100644
index 000000000..cc5530fd6
--- /dev/null
+++ b/.github/workflows/dependency-review.yml
@@ -0,0 +1,20 @@
+# Dependency Review Action
+#
+# This Action will scan dependency manifest files that change as part of a Pull Request, surfacing known-vulnerable versions of the packages declared or updated in the PR. Once installed, if the workflow run is marked as required, PRs introducing known-vulnerable packages will be blocked from merging.
+#
+# Source repository: https://github.com/actions/dependency-review-action
+# Public documentation: https://docs.github.com/en/code-security/supply-chain-security/understanding-your-software-supply-chain/about-dependency-review#dependency-review-enforcement
+name: 'Dependency Review'
+on: [pull_request]
+
+permissions:
+  contents: read
+
+jobs:
+  dependency-review:
+    runs-on: ubuntu-latest
+    steps:
+      - name: 'Checkout Repository'
+        uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
+      - name: 'Dependency Review'
+        uses: actions/dependency-review-action@f46c48ed6d4f1227fb2d9ea62bf6bcbed315589e # v3.0.4
diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
deleted file mode 100644
index 694483ed7..000000000
--- a/.github/workflows/docker.yml
+++ /dev/null
@@ -1,60 +0,0 @@
-name: Build Base Docker Image
-
-on:
-  push:
-    branches:
-      - main
-    paths:
-      - "environment.yml"
-
-permissions:
-  contents: read # to fetch code (actions/checkout)
-
-jobs:
-  build_docker:
-    name: Build base Docker image
-    runs-on: ubuntu-latest
-    environment: numpy-dev
-    if: "github.repository_owner == 'numpy' && !contains(github.event.head_commit.message, '[ci skip]') && !contains(github.event.head_commit.message, '[skip ci]') && !contains(github.event.head_commit.message, '[skip github]')"
-    steps:
-      - name: Clone repository
-        uses: actions/checkout@v3
-      - name: Lint Docker
-        uses: brpaz/hadolint-action@v1.2.1
-        with:
-          dockerfile: ./tools/gitpod/Dockerfile
-      - name: Get refs
-        shell: bash
-        run: |
-          export raw_branch=${GITHUB_REF#refs/heads/}
-          echo "branch=${raw_branch//\//-}" >> $GITHUB_OUTPUT
-          echo "date=$(date +'%Y%m%d')" >> $GITHUB_OUTPUT
-          echo "sha8=$(echo ${GITHUB_SHA} | cut -c1-8)" >> $GITHUB_OUTPUT
-        id: getrefs
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v1
-      - name: Cache Docker layers
-        uses: actions/cache@v3
-        with:
-          path: /tmp/.buildx-cache
-          key: ${{ runner.os }}-buildx-${{ github.sha }}
-          restore-keys: ${{ runner.os }}-buildx-
-      - name: Login to Docker Hub
-        uses: docker/login-action@v1
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_TOKEN }}
-      - name: Build and push
-        id: docker_build
-        uses: docker/build-push-action@v2
-        with:
-          context: "."
-          file: "./tools/gitpod/Dockerfile"
-          push: ${{ github.event_name != 'pull_request' }}
-          cache-from: type=local,src=/tmp/.buildx-cache
-          cache-to: type=local,dest=/tmp/.buildx-cache
-          tags: |
-            numpy/numpy-dev:${{ steps.getrefs.outputs.date }}-${{ steps.getrefs.outputs.branch}}-${{ steps.getrefs.outputs.sha8 }}, numpy/numpy-dev:latest
-      - name: Image digest
-        # Return details of the image build: sha and shell
-        run: echo ${{ steps.docker_build.outputs.digest }}
diff --git a/.github/workflows/emscripten.yml b/.github/workflows/emscripten.yml
new file mode 100644
index 000000000..5f01e72b2
--- /dev/null
+++ b/.github/workflows/emscripten.yml
@@ -0,0 +1,75 @@
+# To enable this workflow on a fork, comment out:
+#
+# if: github.repository == 'numpy/numpy'
+name: Test Emscripten/Pyodide build
+
+on:
+  pull_request:
+    branches:
+      - main
+      - maintenance/**
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read # to fetch code (actions/checkout)
+
+jobs:
+  build-wasm-emscripten:
+    runs-on: ubuntu-22.04
+    if: "github.repository == 'numpy/numpy'"
+    env:
+      PYODIDE_VERSION: 0.23.1
+      # PYTHON_VERSION and EMSCRIPTEN_VERSION are determined by PYODIDE_VERSION.
+      # The appropriate versions can be found in the Pyodide repodata.json
+      # "info" field, or in Makefile.envs:
+      # https://github.com/pyodide/pyodide/blob/main/Makefile.envs#L2
+      PYTHON_VERSION: 3.11.2
+      EMSCRIPTEN_VERSION: 3.1.32
+      NODE_VERSION: 18
+    steps:
+      - name: Checkout numpy
+        uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
+        with:
+          submodules: true
+          # versioneer.py requires the latest tag to be reachable. Here we
+          # fetch the complete history to get access to the tags.
+          # A shallow clone can work when the following issue is resolved:
+          # https://github.com/actions/checkout/issues/338
+          fetch-depth: 0
+
+      - name: set up python
+        id: setup-python
+        uses: actions/setup-python@57ded4d7d5e986d7296eab16560982c6dd7c923b # v4.6.0
+        with:
+          python-version: ${{ env.PYTHON_VERSION }}
+
+      - uses: mymindstorm/setup-emsdk@ab889da2abbcbb280f91ec4c215d3bb4f3a8f775 # v12
+        with:
+          version: ${{ env.EMSCRIPTEN_VERSION }}
+          actions-cache-folder: emsdk-cache
+
+      - name: Install pyodide-build
+        run: pip install pyodide-build==$PYODIDE_VERSION
+
+      - name: Build
+        run: CFLAGS=-g2 LDFLAGS=-g2 pyodide build
+
+      - name: set up node
+        uses: actions/setup-node@64ed1c7eab4cce3362f8c340dee64e5eaeef8f7c # v3.6.0
+        with:
+          node-version: ${{ env.NODE_VERSION }}
+
+      - name: Set up Pyodide virtual environment
+        run: |
+          pyodide venv .venv-pyodide
+          source .venv-pyodide/bin/activate
+          pip install dist/*.whl
+          pip install -r test_requirements.txt
+      - name: Test
+        run: |
+          source .venv-pyodide/bin/activate
+          cd ..
+          python numpy/runtests.py -n -vv
diff --git a/.github/workflows/gitpod.yml b/.github/workflows/gitpod.yml
deleted file mode 100644
index 1b521fc63..000000000
--- a/.github/workflows/gitpod.yml
+++ /dev/null
@@ -1,60 +0,0 @@
-name: Build Gitpod Docker image
-
-on:
-  push:
-    branches:
-      - main
-
-permissions:
-  contents: read # to fetch code (actions/checkout)
-
-jobs:
-  build_gitpod:
-    name: Build Gitpod Docker image
-    runs-on: ubuntu-latest
-    environment: numpy-dev
-    if: "github.repository_owner == 'numpy' && !contains(github.event.head_commit.message, '[ci skip]') && !contains(github.event.head_commit.message, '[skip ci]') && !contains(github.event.head_commit.message, '[skip github]')"
-    steps:
-      - name: Clone repository
-        uses: actions/checkout@v3
-        with:
-          fetch-depth: 0
-      - name: Lint Docker
-        uses: brpaz/hadolint-action@v1.2.1
-        with:
-          dockerfile: ./tools/gitpod/gitpod.Dockerfile
-      - name: Get refs
-        shell: bash
-        run: |
-          export raw_branch=${GITHUB_REF#refs/heads/}
-          echo "branch=${raw_branch//\//-}" >> $GITHUB_OUTPUT
-          echo "date=$(date +'%Y%m%d')" >> $GITHUB_OUTPUT
-          echo "sha8=$(echo ${GITHUB_SHA} | cut -c1-8)" >> $GITHUB_OUTPUT
-        id: getrefs
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v1
-      - name: Cache Docker layers
-        uses: actions/cache@v3
-        with:
-          path: /tmp/.buildx-cache
-          key: ${{ runner.os }}-buildx-${{ github.sha }}
-          restore-keys: ${{ runner.os }}-buildx-
-      - name: Login to Docker Hub
-        uses: docker/login-action@v1
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_TOKEN }}
-      - name: Build and push
-        id: docker_build
-        uses: docker/build-push-action@v2
-        with:
-          context: "."
-          file: "./tools/gitpod/gitpod.Dockerfile"
-          push: ${{ github.event_name != 'pull_request' }}
-          cache-from: type=local,src=/tmp/.buildx-cache
-          cache-to: type=local,dest=/tmp/.buildx-cache
-          tags: |
-            numpy/numpy-gitpod:${{ steps.getrefs.outputs.date }}-${{ steps.getrefs.outputs.branch}}-${{ steps.getrefs.outputs.sha8 }}, numpy/numpy-gitpod:latest
-      - name: Image digest
-        # Return details of the image build: sha and shell
-        run: echo ${{ steps.docker_build.outputs.digest }}
diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml
index d673b2faf..9ceafebb7 100644
--- a/.github/workflows/labeler.yml
+++ b/.github/workflows/labeler.yml
@@ -1,19 +1,19 @@
 name: "Pull Request Labeler"
 on:
   pull_request_target:
-    types: [opened, synchronize, reopened, edited]
+    types: [opened]
 
 permissions: {}
 
 jobs:
   pr-labeler:
-    permissions:
-      contents: read  #  to read a configuration file
-      pull-requests: write  #  to add labels to pull requests
-
     runs-on: ubuntu-latest
+    permissions: 
+      pull-requests: write  # to add labels
     steps:
     - name: Label the PR
-      uses: gerrymanoim/pr-prefix-labeler@v3
+      uses: gerrymanoim/pr-prefix-labeler@c8062327f6de59a9ae1c19f7f07cacd0b976b6fa # v3
+      continue-on-error: true
       env:
         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      if: github.repository == 'numpy/numpy'
diff --git a/.github/workflows/linux_meson.yml b/.github/workflows/linux_meson.yml
new file mode 100644
index 000000000..126eb8f29
--- /dev/null
+++ b/.github/workflows/linux_meson.yml
@@ -0,0 +1,57 @@
+name: Test Meson build (Linux)
+
+on:
+  pull_request:
+    branches:
+      - main
+      - maintenance/**
+
+defaults:
+  run:
+    shell: bash
+
+env:
+  PYTHON_VERSION: 3.11
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read # to fetch code (actions/checkout)
+
+jobs:
+  meson_spin:
+    if: "github.repository == 'numpy/numpy'"
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
+      with:
+        submodules: recursive
+        fetch-depth: 0
+    - uses: actions/setup-python@57ded4d7d5e986d7296eab16560982c6dd7c923b # v4.6.0
+      with:
+        python-version: ${{ env.PYTHON_VERSION }}
+    - name: Install dependencies
+      run: |
+        pip install -r build_requirements.txt
+        sudo apt-get install -y libopenblas-serial-dev
+    - name: Build
+      shell: 'script -q -e -c "bash --noprofile --norc -eo pipefail {0}"'
+      env:
+        TERM: xterm-256color
+      run:
+        spin build -- --werror
+    - name: Check build-internal dependencies
+      run:
+        ninja -C build -t missingdeps
+    - name: Check installed test and stub files
+      run:
+        python tools/check_installed_files.py $(find ./build-install -path '*/site-packages/numpy')
+    - name: Test
+      shell: 'script -q -e -c "bash --noprofile --norc -eo pipefail {0}"'
+      env:
+        TERM: xterm-256color
+      run: |
+        pip install pytest hypothesis typing_extensions
+        spin test
diff --git a/.github/workflows/linux_musl.yml b/.github/workflows/linux_musl.yml
new file mode 100644
index 000000000..7d90c20ed
--- /dev/null
+++ b/.github/workflows/linux_musl.yml
@@ -0,0 +1,66 @@
+name: Test musllinux_x86_64
+
+on:
+  pull_request:
+    branches:
+      - main
+      - maintenance/**
+
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+
+permissions:
+  contents: read # to fetch code (actions/checkout)
+
+
+jobs:
+  musllinux_x86_64:
+    runs-on: ubuntu-latest
+    if: "github.repository == 'numpy/numpy'"
+    container:
+      # Use container used for building musllinux wheels
+      # it has git installed, all the pythons, etc
+      image: quay.io/pypa/musllinux_1_1_x86_64
+
+    steps:
+    - name: setup
+      run: |
+        apk update --quiet
+
+        # using git commands to clone because versioneer doesn't work when
+        # actions/checkout is used for the clone step in a container
+
+        git config --global --add safe.directory $PWD 
+        
+        if [ $GITHUB_EVENT_NAME != pull_request ]; then
+            git clone --recursive --branch=$GITHUB_REF_NAME https://github.com/${GITHUB_REPOSITORY}.git $GITHUB_WORKSPACE
+            git reset --hard $GITHUB_SHA
+        else        
+            git clone --recursive https://github.com/${GITHUB_REPOSITORY}.git $GITHUB_WORKSPACE
+            git fetch origin $GITHUB_REF:my_ref_name
+            git checkout $GITHUB_BASE_REF
+            git -c user.email="you@example.com" merge --no-commit my_ref_name
+        fi
+
+        ln -s /usr/local/bin/python3.10 /usr/local/bin/python
+
+    - name: test musllinux_x86_64
+      run: |
+        python -m venv test_env
+        source test_env/bin/activate
+
+        # required for figuring out the system tags in openblas_support
+        pip install packaging       
+        
+        # install openblas by co-opting the CIBW setup script
+        RUNNER_OS=Linux sh tools/wheels/cibw_before_build.sh .
+
+        pip install -r build_requirements.txt
+        pip install pytest hypothesis typing_extensions
+
+        # use meson to build and test 
+        spin build
+        spin test
diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml
index 0cef4f7a9..55fd3ba38 100644
--- a/.github/workflows/scorecards.yml
+++ b/.github/workflows/scorecards.yml
@@ -25,12 +25,12 @@ jobs:
 
     steps:
       - name: "Checkout code"
-        uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8 # v3.1.0
+        uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.1.0
         with:
           persist-credentials: false
 
       - name: "Run analysis"
-        uses: ossf/scorecard-action@99c53751e09b9529366343771cc321ec74e9bd3d # v2.0.6
+        uses: ossf/scorecard-action@80e868c13c90f172d68d1f4501dee99e2479f7af # v2.1.3
         with:
           results_file: results.sarif
           results_format: sarif
@@ -42,7 +42,7 @@ jobs:
       # Upload the results as artifacts (optional). Commenting out will disable
       # uploads of run results in SARIF format to the repository Actions tab.
       - name: "Upload artifact"
-        uses: actions/upload-artifact@3cea5372237819ed00197afe530f5a7ea3e805c8 # v3.1.0
+        uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # v3.1.2
         with:
           name: SARIF file
           path: results.sarif
@@ -50,6 +50,6 @@ jobs:
 
       # Upload the results to GitHub's code scanning dashboard.
       - name: "Upload to code-scanning"
-        uses: github/codeql-action/upload-sarif@807578363a7869ca324a79039e6db9c843e0e100 # v2.1.27
+        uses: github/codeql-action/upload-sarif@29b1f65c5e92e24fe6b6647da1eaabe529cec70f # v2.1.27
         with:
           sarif_file: results.sarif
diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index cca807435..f8f91b49d 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -38,12 +38,12 @@ jobs:
   get_commit_message:
     name: Get commit message
     runs-on: ubuntu-latest
-    if: github.repository == 'numpy/numpy'
+    if: "github.repository == 'numpy/numpy'"
     outputs:
       message: ${{ steps.commit_message.outputs.message }}
     steps:
       - name: Checkout numpy
-        uses: actions/checkout@v3
+        uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
         # Gets the correct commit message for pull request
         with:
           ref: ${{ github.event.pull_request.head.sha }}
@@ -75,18 +75,16 @@ jobs:
         # https://github.com/github/feedback/discussions/7835#discussioncomment-1769026
         buildplat:
           - [ubuntu-20.04, manylinux_x86_64]
-          - [macos-10.15, macosx_*]
+          - [ubuntu-20.04, musllinux_x86_64]
+          - [macos-12, macosx_x86_64]
           - [windows-2019, win_amd64]
           - [windows-2019, win32]
-        # TODO: uncomment PyPy 3.9 builds once PyPy
-        # re-releases a new minor version
-        # NOTE: This needs a bump of cibuildwheel version, also, once that happens.
-        python: ["cp38", "cp39", "cp310", "cp311", "pp38"] #, "pp39"]
+        python: ["cp39", "cp310", "cp311", "pp39"]
         exclude:
           # Don't build PyPy 32-bit windows
           - buildplat: [windows-2019, win32]
-            python: "pp38"
-          - buildplat: [windows-2019, win32]
+            python: "pp39"
+          - buildplat: [ ubuntu-20.04, musllinux_x86_64 ]
             python: "pp39"
     env:
       IS_32_BIT: ${{ matrix.buildplat[1] == 'win32' }}
@@ -94,7 +92,7 @@ jobs:
       IS_SCHEDULE_DISPATCH: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
     steps:
       - name: Checkout numpy
-        uses: actions/checkout@v3
+        uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
         with:
           submodules: true
           # versioneer.py requires the latest tag to be reachable. Here we
@@ -104,36 +102,51 @@ jobs:
           fetch-depth: 0
 
       # Used to push the built wheels
-      - uses: actions/setup-python@v4
+      - uses: actions/setup-python@57ded4d7d5e986d7296eab16560982c6dd7c923b # v4.6.0
         with:
           python-version: "3.x"
 
-      - name: Configure mingw for 32-bit builds
+      # We need rtools 4.0 to have 32 bit support on windows
+      - if: runner.os == 'windows'
+        uses: r-windows/install-rtools@13886bb4048f1b862d33869a18b73cdd446a3961 # main
+
+      - name: setup rtools for 32-bit
         run: |
-          # Force 32-bit mingw
-          choco uninstall mingw
-          choco install -y mingw --forcex86 --force --version=7.3.0
-          echo "C:\ProgramData\chocolatey\lib\mingw\tools\install\mingw32\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-          refreshenv
-        if: ${{ env.IS_32_BIT == 'true' }}
+          echo "PLAT=i686" >> $env:GITHUB_ENV
+          echo "PATH=c:\rtools40\mingw32\bin;$env:PATH" >> $env:GITHUB_ENV
+          gfortran --version
+        if: ${{ matrix.buildplat[1] == 'win32' }}
 
       - name: Build wheels
-        uses: pypa/cibuildwheel@v2.11.2
+        uses: pypa/cibuildwheel@5e15bb25b428e1bf2daf2215f173d2b40135f56f # v2.12.3
         env:
           CIBW_BUILD: ${{ matrix.python }}-${{ matrix.buildplat[1] }}
 
-      - uses: actions/upload-artifact@v3
+      - uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # v3.1.2
         with:
           name: ${{ matrix.python }}-${{ startsWith(matrix.buildplat[1], 'macosx') && 'macosx' || matrix.buildplat[1] }}
           path: ./wheelhouse/*.whl
 
+      - uses: conda-incubator/setup-miniconda@3b0f2504dd76ef23b6d31f291f4913fb60ab5ff3 # v2.2.0
+        with:
+          # for installation of anaconda-client, required for upload to
+          # anaconda.org
+          # default (and activated) environment name is test
+          # Note that this step is *after* specific pythons have been used to
+          # build and test the wheel
+          auto-update-conda: true
+          python-version: "3.10"
+
       - name: Upload wheels
         if: success()
-        shell: bash
+        shell: bash -el {0}
+        # see https://github.com/marketplace/actions/setup-miniconda for why
+        # `-el {0}` is required.
         env:
           NUMPY_STAGING_UPLOAD_TOKEN: ${{ secrets.NUMPY_STAGING_UPLOAD_TOKEN }}
           NUMPY_NIGHTLY_UPLOAD_TOKEN: ${{ secrets.NUMPY_NIGHTLY_UPLOAD_TOKEN }}
         run: |
+          conda install -y anaconda-client
           source tools/wheels/upload_wheels.sh
           set_upload_vars
           # trigger an upload to
@@ -143,6 +156,7 @@ jobs:
           # https://anaconda.org/multibuild-wheels-staging/numpy
           # The tokens were originally generated at anaconda.org
           upload_wheels
+
   build_sdist:
     name: Build sdist
     needs: get_commit_message
@@ -157,10 +171,11 @@ jobs:
     runs-on: ubuntu-latest
     env:
       IS_PUSH: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') }}
-      IS_SCHEDULE_DISPATCH: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
+      # commented out so the sdist doesn't upload to nightly
+      # IS_SCHEDULE_DISPATCH: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
     steps:
       - name: Checkout numpy
-        uses: actions/checkout@v3
+        uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
         with:
           submodules: true
           # versioneer.py requires the latest tag to be reachable. Here we
@@ -169,10 +184,10 @@ jobs:
           # https://github.com/actions/checkout/issues/338
           fetch-depth: 0
       # Used to push the built wheels
-      - uses: actions/setup-python@v4
+      - uses: actions/setup-python@57ded4d7d5e986d7296eab16560982c6dd7c923b # v4.6.0
         with:
           # Build sdist on lowest supported Python
-          python-version: "3.8"
+          python-version: "3.9"
       - name: Build sdist
         run: |
           python setup.py sdist
@@ -183,25 +198,37 @@ jobs:
           python -m pip install dist/*.gz
           pip install -r test_requirements.txt
           cd .. # Can't import numpy within numpy src directory
-          python -c "import numpy; print(numpy.__version__); numpy.test();"
+          python -c "import numpy, sys; print(numpy.__version__); sys.exit(numpy.test() is False)"
 
       - name: Check README rendering for PyPI
         run: |
           python -mpip install twine
           twine check dist/*
 
-      - uses: actions/upload-artifact@v3
+      - uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # v3.1.2
         with:
           name: sdist
           path: ./dist/*
 
+      - uses: conda-incubator/setup-miniconda@3b0f2504dd76ef23b6d31f291f4913fb60ab5ff3 # v2.2.0
+        with:
+          # for installation of anaconda-client, required for upload to
+          # anaconda.org
+          # default (and activated) environment name is test
+          # Note that this step is *after* specific pythons have been used to
+          # build and test
+          auto-update-conda: true
+          python-version: "3.10"
+
       - name: Upload sdist
         if: success()
-        shell: bash
+        shell: bash -el {0}
         env:
           NUMPY_STAGING_UPLOAD_TOKEN: ${{ secrets.NUMPY_STAGING_UPLOAD_TOKEN }}
-          NUMPY_NIGHTLY_UPLOAD_TOKEN: ${{ secrets.NUMPY_NIGHTLY_UPLOAD_TOKEN }}
+          # commented out so the sdist doesn't upload to nightly
+          # NUMPY_NIGHTLY_UPLOAD_TOKEN: ${{ secrets.NUMPY_NIGHTLY_UPLOAD_TOKEN }}
         run: |
+          conda install -y anaconda-client
           source tools/wheels/upload_wheels.sh
           set_upload_vars
           # trigger an upload to
diff --git a/.github/workflows/windows_meson.yml b/.github/workflows/windows_meson.yml
new file mode 100644
index 000000000..d4372bd3c
--- /dev/null
+++ b/.github/workflows/windows_meson.yml
@@ -0,0 +1,88 @@
+name: Test Meson build (Windows)
+
+on:
+  pull_request:
+    branches:
+      - main
+      - maintenance/**
+
+env:
+  PYTHON_VERSION: 3.11
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read # to fetch code (actions/checkout)
+
+jobs:
+  meson:
+    name: Meson windows build/test
+    runs-on: windows-2019
+    # if: "github.repository == 'numpy/numpy'"
+    steps:
+    - name: Checkout
+      uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
+      with:
+        submodules: recursive
+        fetch-depth: 0
+    - name: Setup Python
+      uses: actions/setup-python@57ded4d7d5e986d7296eab16560982c6dd7c923b # v4.6.0
+      with:
+        python-version: ${{ env.PYTHON_VERSION }}
+
+    - name: Install dependencies
+      run: |
+        pip install -r build_requirements.txt
+    - name: openblas-libs
+      run: |
+        # Download and install pre-built OpenBLAS library
+        # with 32-bit interfaces
+        # Unpack it in the pkg-config hardcoded path
+        choco install unzip -y
+        choco install wget -y
+        choco install -y --checksum 6004DF17818F5A6DBF19CB335CC92702 pkgconfiglite
+        wget https://anaconda.org/multibuild-wheels-staging/openblas-libs/v0.3.21/download/openblas-v0.3.21-win_amd64-gcc_10_3_0.zip
+        unzip -d c:\opt openblas-v0.3.21-win_amd64-gcc_10_3_0.zip
+        echo "PKG_CONFIG_PATH=c:\opt\64\lib\pkgconfig;" >> $env:GITHUB_ENV
+    - name: meson-configure
+      run: |
+        meson setup build --prefix=$PWD\build-install -Ddebug=false --optimization 2 --vsenv 
+    - name: meson-build
+      run: |
+        meson compile -C build -v
+
+    - name: meson-install
+      run: |
+        cd build
+        meson install --no-rebuild
+    - name: build-path
+      run: |
+        echo "installed_path=$PWD\build-install\Lib\site-packages" >> $env:GITHUB_ENV
+    - name: post-install
+      run: |
+        $numpy_path = "${env:installed_path}\numpy"
+        $libs_path = "${numpy_path}\.libs"
+        mkdir ${libs_path}
+        $ob_path = "C:/opt/64/bin/"
+        cp $ob_path/*.dll $libs_path
+        # Write _distributor_init.py to load .libs DLLs.
+        python -c "from tools import openblas_support; openblas_support.make_init(r'${numpy_path}')"
+
+    - name: prep-test
+      run: |
+        echo "PYTHONPATH=${env:installed_path}" >> $env:GITHUB_ENV
+        python -m pip install -r test_requirements.txt
+        python -m pip install threadpoolctl
+
+    - name: test
+      run: |
+        mkdir tmp
+        cd tmp
+        echo "============================================"
+        python -c "import numpy; print(numpy.show_runtime())"
+        echo "============================================"
+        echo "LASTEXITCODE is '$LASTEXITCODE'"
+        python -c "import numpy, sys; sys.exit(numpy.test(verbose=3) is False)"
+        echo "LASTEXITCODE is '$LASTEXITCODE'"
diff --git a/.gitignore b/.gitignore
index e2ee6a013..e5784971e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -61,8 +61,11 @@ GTAGS
 
 # Python files #
 ################
-# setup.py working directory
+# meson build/installation directories
 build
+build-install
+# meson python output
+.mesonpy-native-file.ini
 # sphinx build directory
 _build
 # setup.py dist directory
@@ -129,12 +132,7 @@ numpy/core/include/numpy/__ufunc_api.c
 numpy/core/include/numpy/__umath_generated.c
 numpy/core/include/numpy/_umath_doc_generated.h
 numpy/core/include/numpy/config.h
-numpy/core/include/numpy/multiarray_api.txt
-numpy/core/include/numpy/ufunc_api.txt
 numpy/core/lib/
-numpy/core/src/common/npy_binsearch.h
-numpy/core/src/common/npy_cpu_features.c
-numpy/core/src/common/npy_partition.h
 numpy/core/src/common/npy_sort.h
 numpy/core/src/common/templ_common.h
 numpy/core/src/multiarray/_multiarray_tests.c
@@ -160,8 +158,6 @@ numpy/core/src/npysort/sort.c
 numpy/core/src/private/npy_binsearch.h
 numpy/core/src/private/npy_partition.h
 numpy/core/src/private/templ_common.h
-numpy/core/src/umath/_rational_tests.c
-numpy/core/src/umath/_struct_ufunc_tests.c
 numpy/core/src/umath/_umath_tests.c
 numpy/core/src/umath/scalarmath.c
 numpy/core/src/umath/funcs.inc
@@ -183,6 +179,7 @@ benchmarks/html
 benchmarks/env
 benchmarks/numpy
 benchmarks/_asv_compare.conf.json
+test.obj
 # cythonized files
 cythonize.dat
 numpy/random/_mtrand/_mtrand.c
@@ -213,14 +210,18 @@ tools/swig/test/Array.py
 *.dispatch.h
 # wrapped sources of dispatched targets, e.g. *.dispatch.avx2.c
 *.dispatch.*.c
+*.dispatch.*.cpp
 # _simd module
 numpy/core/src/_simd/_simd.dispatch.c
 numpy/core/src/_simd/_simd_data.inc
 numpy/core/src/_simd/_simd_inc.h
 # umath module
+numpy/core/src/umath/loops_unary.dispatch.c
 numpy/core/src/umath/loops_unary_fp.dispatch.c
+numpy/core/src/umath/loops_unary_fp_le.dispatch.c
 numpy/core/src/umath/loops_arithm_fp.dispatch.c
 numpy/core/src/umath/loops_arithmetic.dispatch.c
+numpy/core/src/umath/loops_logical.dispatch.c
 numpy/core/src/umath/loops_minmax.dispatch.c
 numpy/core/src/umath/loops_trigonometric.dispatch.c
 numpy/core/src/umath/loops_exponent_log.dispatch.c
@@ -228,9 +229,8 @@ numpy/core/src/umath/loops_umath_fp.dispatch.c
 numpy/core/src/umath/loops_hyperbolic.dispatch.c
 numpy/core/src/umath/loops_modulo.dispatch.c
 numpy/core/src/umath/loops_comparison.dispatch.c
-# npysort module
-numpy/core/src/npysort/x86-qsort.dispatch.c
-numpy/core/src/npysort/x86-qsort.dispatch.*.cpp
+numpy/core/src/umath/loops_unary_complex.dispatch.c
+numpy/core/src/umath/loops_autovec.dispatch.c
 # multiarray module
 numpy/core/src/multiarray/argfunc.dispatch.c
 numpy/core/src/multiarray/arraytypes.h
diff --git a/.gitmodules b/.gitmodules
index 1ea274daf..d849a3caf 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -4,3 +4,6 @@
 [submodule "numpy/core/src/umath/svml"]
 	path = numpy/core/src/umath/svml
 	url = https://github.com/numpy/SVML.git
+[submodule "numpy/core/src/npysort/x86-simd-sort"]
+	path = numpy/core/src/npysort/x86-simd-sort
+	url = https://github.com/intel/x86-simd-sort
diff --git a/.gitpod.yml b/.gitpod.yml
deleted file mode 100644
index c46752f10..000000000
--- a/.gitpod.yml
+++ /dev/null
@@ -1,65 +0,0 @@
-# Rebuilding NumPy on init - rather than on prebuild: this ensures
-# that even forks do have a usable freshly built NumPy
-# Might delegate this later to prebuild with Q2 improvements on gitpod
-# https://www.gitpod.io/docs/config-start-tasks/#configuring-the-terminal
-# -------------------------------------------------------------------------
-
-image: numpy/numpy-gitpod:latest
-tasks:
-  - name: Prepare development environment
-    init: |
-      mkdir -p .vscode
-      cp tools/gitpod/settings.json .vscode/settings.json
-      rm -f /workspace/numpy/.git/shallow.lock  
-      conda activate numpy-dev
-      git pull --unshallow  # need to force this else the prebuild fails
-      git fetch --tags
-      python setup.py build_ext --inplace
-      echo "🛠 Completed rebuilding NumPy!! 🛠 "
-      echo "📖 Building docs 📖 "
-      cd doc
-      make html
-      echo "✨ Pre-build complete! You can close this terminal ✨ "
-
-# --------------------------------------------------------
-# exposing ports for liveserve
-ports:
-  - port: 5500
-    onOpen: notify
-
-# --------------------------------------------------------
-# some useful extensions to have
-vscode:
-  extensions:
-    - eamodio.gitlens
-    - njpwerner.autodocstring
-    - lextudio.restructuredtext
-    - ritwickdey.liveserver
-    - ms-python.python
-    - yzhang.markdown-all-in-one
-    - bungcip.better-toml
-    - mhutchie.git-graph
-
-# --------------------------------------------------------
-# using prebuilds for the container - note: atm this only
-# works for the NumPy repo
-# With this configuration the prebuild will happen on push to master 
-github:
-  prebuilds:
-    # enable for main/default branch
-    master: true
-    # enable for other branches (defaults to false) 
-    branches: false 
-    # enable for pull requests coming from this repo (defaults to true) 
-    pullRequests: false
-    # enable for pull requests coming from forks (defaults to false)
-    pullRequestsFromForks: false
-    # add a check to pull requests (defaults to true)
-    addCheck: false
-    # add a "Review in Gitpod" button as a comment to pull requests (defaults to false)
-    addComment: false
-    # add a "Review in Gitpod" button to the pull request's description (defaults to false)
-    addBadge: false
-    # add a label once the prebuild is ready to pull requests (defaults to false)
-    addLabel: false
-    
-\ No newline at end of file
diff --git a/.hadolint.yaml b/.hadolint.yaml
deleted file mode 100644
index 0188ba2cf..000000000
--- a/.hadolint.yaml
+++ /dev/null
@@ -1,7 +0,0 @@
----
-ignored:
-  - DL3006
-  - DL3008
-  - SC2016
-  - DL3004
-  - DL3007
-\ No newline at end of file
diff --git a/.mailmap b/.mailmap
index 8ff3c21e8..fd89ecd65 100644
--- a/.mailmap
+++ b/.mailmap
@@ -8,11 +8,14 @@
 # This file is up-to-date if the command git log --format="%aN <%aE>" | sort -u
 # gives no duplicates.
 
+@amagicmuffin <2014wcheng@gmail.com>
 @8bitmp3 <19637339+8bitmp3@users.noreply.github.com>
+@dg3192 <113710955+dg3192@users.noreply.github.com>
 @DWesl <22566757+DWesl@users.noreply.github.com>
 @Endolith <endolith@gmail.com>
 @GalaxySnail <ylc991@163.com>
 @Illviljan <14371165+Illviljan@users.noreply.github.com>
+@juztamau5 <juztamau5@gmail.com>
 @LSchroefl <65246829+LSchroefl@users.noreply.github.com>
 @Lbogula <bogulala7@gmail.com>
 @Lisa <34400837+lyzlisa@users.noreply.github.com>
@@ -34,6 +37,7 @@
 @yetanothercheer <yetanothercheer@protonmail.com>
 Aaron Baecker <abaecker@localhost>
 Aarthi Agurusa <agurusa@gmail.com>
+Adarsh Singh <adarshsng090@gmail.com> ADARSH SINGH <adarshsng090@gmail.com>
 Andrei Batomunkuev <abatomunkuev@myseneca.ca>
 Ajay DS <turingman@protonmail.com>
 Ajay DS <turingman@protonmail.com> <ajayds2001@gmail.com>
@@ -49,13 +53,16 @@ Aerik Pawson <45904740+aerikpawson@users.noreply.github.com>
 Ahmet Can Solak <asolak14@ku.edu.tr>
 Amrit Krishnan <amrit110@gmail.com>
 Amrit Krishnan <amrit110@gmail.com> <amritk@vectorinstitute.ai>
+Alban Desmaison <desmaison.alban@gmail.com>
 Albert Jornet Puig <albert.jornet@ic3.cat>
 Alberto Rubiales <arubiales11@gmail.com>
+Ales Crocaro <alescrocaro@gmail.com>
 Alex Rockhill <aprockhill206@gmail.com>
 Alex Griffing <argriffi@ncsu.edu>
 Alex Griffing <argriffi@ncsu.edu> <argriffing@gmail.com>
 Alex Griffing <argriffi@ncsu.edu> <argriffing@users.noreply.github.com>
 Alex Henrie <alexhenrie24@gmail.com> <alex.henrie@utah.edu>
+Alex Low <aleksandrosansan@gmail.com>
 Alex Rogozhnikov <iamfullofspam@gmail.com> <arogozhnikov@users.noreply.github.com>
 Alex Thomas <alexthomas93@users.noreply.github.com>
 Alexander Belopolsky <abalkin@enlnt.com>
@@ -115,6 +122,9 @@ Bertrand Lefebvre <bertrand.l3f@gmail.com>
 Bharat Raghunathan <bharatraghunthan9767@gmail.com>
 Bharat Raghunathan <bharatraghunthan9767@gmail.com> <bharatr@symphonyai.com>
 Bob Eldering <eldering@jive.eu>
+Bram Ton <bram@cbbg.nl>
+Bram Ton <bram@cbbg.nl> <b.t.ton@saxion.nl>
+Bram Ton <bram@cbbg.nl> <github@cbbg.nl>
 Brent Brewington <brent.brewington@gmail.com>
 Brett R Murphy <bmurphy@enthought.com>
 Brian Soto <iambriansoto@gmail.com>
@@ -122,6 +132,7 @@ Brian Soto <iambriansoto@gmail.com> <Iamsoto@users.noreply.github.com>
 Brian Soto <iambriansoto@gmail.com> <theintrocode@gmail.com>
 Brigitta Sipőcz <bsipocz@gmail.com>
 Brigitta Sipőcz <bsipocz@gmail.com> <b.sipocz@gmail.com>
+Brock Mendel <jbrockmendel@gmail.com>
 Bryan Van de Ven <bryanv@continuum.io> Bryan Van de Ven <bryan@Laptop-3.local>
 Bryan Van de Ven <bryanv@continuum.io> Bryan Van de Ven <bryan@laptop.local>
 Brénainn Woodsend <bwoodsend@gmail.com>
@@ -163,6 +174,7 @@ Daniel Rasmussen <daniel.rasmussen@appliedbrainresearch.com>
 Daniel G. A. Smith <dgasmith@icloud.com>
 Daniel G. A. Smith <dgasmith@icloud.com> <malorian@me.com>
 Dario Mory <daaawx@gmail.com>
+Dave Goel <deego3@gmail.com>
 David Badnar <bdvd001@gmail.com>
 David Cortes <david.cortes.rivera@gmail.com>
 David Huard <david.huard@gmail.com> dhuard <dhuard@localhost>
@@ -179,6 +191,7 @@ Derek Homeier <derek@astro.physik.uni-goettingen.de> <dhomeie@gwdg.de>
 Derek Homeier <derek@astro.physik.uni-goettingen.de> <derek@astro.phsik.uni-goettingen.de>
 Derrick Williams <myutat@gmail.com>
 Devin Shanahan <dshanahan88@gmail.com>
+Digya Acharya <digyaacharyaa@gmail.com>
 Dima Pasechnik <dima@pasechnik.info>
 Dima Pasechnik <dima@pasechnik.info> <dimpase@gmail.com>
 Dimitri Papadopoulos Orfanos <3234522+DimitriPapadopoulos@users.noreply.github.com>
@@ -227,6 +240,7 @@ Gregory R. Lee <grlee77@gmail.com> <gregory.lee@cchmc.org>
 Guo Ci <zguoci@gmail.com> guoci <zguoci@gmail.com>
 Guo Shuai <gs0801@foxmail.com>
 Hameer Abbasi <einstein.edison@gmail.com> <hameerabbasi@yahoo.com>
+Hannah Aizenman <story645@gmail.com>
 Han Genuit <hangenuit@gmail.com>
 Hanno Klemm <hanno.klemm@maerskoil.com> hklemm <hanno.klemm@maerskoil.com>
 Harsh Mishra <erbeusgriffincasper@gmail.com>
@@ -243,6 +257,7 @@ Irina Maria Mocan <28827042+IrinaMaria@users.noreply.github.com>
 Irvin Probst <irvin.probst@ensta-bretagne.fr>
 Ivan Meleshko <vn.mlshk@gmail.com>
 Isabela Presedo-Floyd <irpf.design@gmail.com> <ipresedo@calpoly.edu>
+Ganesh Kathiresan <ganesh3597@gmail.com>
 Gerhard Hobler <gerhard.hobler@tuwien.ac.at>
 Giannis Zapantis <sdi1900059@di.uoa.gr>
 Guillaume Peillex <guillaume.peillex@gmail.com>
@@ -250,6 +265,7 @@ Jack J. Woehr <jwoehr@softwoehr.com>
 Jaime Fernandez <jaime.frio@gmail.com>
 Jaime Fernandez <jaime.frio@gmail.com> <jaime.fernandez@hp.com>
 Jaime Fernandez <jaime.frio@gmail.com> <jaimefrio@google.com>
+Jake Close <jsclose@umich.edu>
 Jakob Jakobson <jakobjakobson13@posteo.de>
 Jakob Jakobson <jakobjakobson13@posteo.de> <43045863+jakobjakobson13@users.noreply.github.com>
 James Bourbeau <jrbourbeau@gmail.com> <jrbourbeau@users.noreply.github.com>
@@ -269,7 +285,10 @@ Jérémie du Boisberranger <jeremie.du-boisberranger@inria.fr> jeremiedbb <34657
 Jérome Eertmans <jeertmans@icloud.com>
 Jérôme Richard <jeromerichard111@msn.com> <ubuntu@ip-172-31-17-195.eu-west-3.compute.internal>
 Jerome Kelleher <jerome.kelleher@ed.ac.uk>
+Jessé Pires <jesserocha@alunos.utfpr.edu.br>
 Jessi J Zhao <35235453+jessijzhao@users.noreply.github.com>
+João Fontes Gonçalves <jfontesgoncalves@gmail.com>
+Johnathon Cusick <jonathan.cusick09@gmail.com>
 Jhong-Ken Chen (陳仲肯) <kenny.kuo.fs@gmail.com>
 Jhong-Ken Chen (陳仲肯) <kenny.kuo.fs@gmail.com> <37182101+kennychenfs@users.noreply.github.com>
 Johannes Hampp <johannes.hampp@zeu.uni-giessen.de> <42553970+euronion@users.noreply.github.com>
@@ -319,9 +338,11 @@ Lars Buitinck <larsmans@gmail.com> Lars Buitinck <l.buitinck@esciencecenter.nl>
 Lars Buitinck <larsmans@gmail.com> Lars Buitinck <L.J.Buitinck@uva.nl>
 Lars Grüter <lagru@mailbox.org>
 Lars Grüter <lagru@mailbox.org> <lagru@users.noreply.github.com>
+Leona Taric <92495067+LeonaTaric@users.noreply.github.com>
 Leonardus Chen <leonardus.chen@gmail.com>
 Licht Takeuchi <licht-t@outlook.jp> <licht-t@math.dis.titech.ac.jp>
 Lorenzo Mammana <mammanalorenzo@outlook.it> <lorenzom96@hotmail.it>
+Lillian Zha <lillianzha@gmail.com>
 Luis Pedro Coelho <luis@luispedro.org> <lpc@cmu.edu>
 Luke Zoltan Kelley <lkelley@cfa.harvard.edu>
 Madhulika Jain Chambers <madhulikajain@gmail.com> <53166646+madhulikajc@users.noreply.github.com>
@@ -346,6 +367,7 @@ Martin Teichmann <martin.teichmann@xfel.eu> <lkb.teichmann@gmail.com>
 Mary Conley <sleeplessinseattle.dev@gmail.com>
 Masashi Kishimoto <drehbleistift@gmail.com>
 Matheus Vieira Portela <matheus.v.portela@gmail.com>
+Matheus Santana Patriarca <matheussantanapatriarca2019@outlook.com>
 Mathieu Lamarre <mlamarre@ea.com> <mathieu@vlam3d.com>
 Matías Ríos <riosm@dickinson.edu>
 Matt Ord <Matthew.ord1@gmail.com>
@@ -375,6 +397,7 @@ Michael Schnaitter <schnaitterm@knights.ucf.edu> <schnaitterm@users.noreply.gith
 Michael Seifert <michaelseifert04@yahoo.de>
 Michel Fruchart <michel.fruchart@ens-lyon.org> <fruchart@users.noreply.github.com>
 Mike Toews <mwtoews@gmail.com>
+Miles Cranmer <miles.cranmer@gmail.com>
 Mircea Akos Bruma <bruma.mircea.a@gmail.com>
 Mircea Akos Bruma <bruma.mircea.a@gmail.com> <akos@debian-gnu-linux-vm.localdomain>
 Mitchell Faas <Faas.Mitchell@gmail.com> <35742861+Mitchell-Faas@users.noreply.github.com>
@@ -391,6 +414,9 @@ Nicholas A. Del Grosso <delgrosso@bio.lmu.de> nickdg <delgrosso@bio.lmu.de>
 Nicholas McKibben <nicholas.bgp@gmail.com>
 Nick Minkyu Lee <mknicklee@protonmail.com> fivemok <9394929+fivemok@users.noreply.github.com>
 Oliver Eberle <oliver_eberle@web.de>
+Oleksiy Kononenko <Oleksiy.S.Kononenko@gmail.com>
+Oleksiy Kononenko <Oleksiy.S.Kononenko@gmail.com> <35204136+oleksiyskononenko@users.noreply.github.com>
+Omar Ali <Omarh90@gmail.com>
 Omid Rajaei <rajaei.net@gmail.com>
 Omid Rajaei <rajaei.net@gmail.com> <89868505+rajaeinet@users.noreply.github.com>
 Ondřej Čertík <ondrej.certik@gmail.com>
@@ -403,6 +429,8 @@ Paul Ivanov <pivanov5@bloomberg.net> <pi@berkeley.edu>
 Paul Ivanov <pivanov5@bloomberg.net> <paul.ivanov@local>
 Paul YS Lee <leeyspaul@gmail.com> Paul <leeyspaul@users.noreply.github.com>
 Paul Jacobson <hpj3@myuw.net>
+Pey Lian Lim <lim@stsci.edu>
+Pey Lian Lim <lim@stsci.edu> <2090236+pllim@users.noreply.github.com>
 Pearu Peterson <pearu.peterson@gmail.com> <pearu@pearu-laptop.(none)>
 Pete Peeradej Tanruangporn <pete.tanru@gmail.com>
 Peter Bell <peterbell10@live.co.uk>
@@ -447,18 +475,22 @@ Samesh Lakhotia <samesh.lakhotia@gmail.com>
 Samesh Lakhotia <samesh.lakhotia@gmail.com> <43701530+sameshl@users.noreply.github.com>
 Sami Salonen <ssalonen@gmail.com> <sami.salonen@eniram.fi>
 Sanchez Gonzalez Alvaro <as12513@imperial.ac.uk>
+Sanya Sinha <83265366+ssanya942@users.noreply.github.com>
 Saransh Chopra <saransh0701@gmail.com>
 Saullo Giovani <saullogiovani@gmail.com>
 Saurabh Mehta <e.samehta@gmail.com>
 Sayantika Banik <sayantikabanik122@gmail.com>
+Schrijvers Luc <begasus@gmail.com>
 Sebastian Berg <sebastian@sipsolutions.net>
 Sebastian Schleehauf <slepton@posteo.de>
 Serge Guelton <serge.guelton@telecom-bretagne.eu>
 Sergei Vorfolomeev <svorfolomeev@vmssoftware.com> <39548292+vorfol@users.noreply.github.com>
+Shuangchi He <yulvchi@qq.com>
 Shubham Gupta <mastershubham@gmail.com>
 Shubham Gupta <mastershubham@gmail.com> <63910248+shubham11941140@users.noreply.github.com>
 Shekhar Prasad Rajak <shekharrajak@live.com>
 Shen Zhou <shen_zhou@u.nus.edu>
+Shreya Singh <shreyas1696@gmail.com>
 Shota Kawabuchi <shota.kawabuchi+GitHub@gmail.com>
 Siavash Eliasi <siavashserver@gmail.com>
 Simon Conseil <contact@saimon.org> <simon.conseil@univ-lyon1.fr>
@@ -503,6 +535,7 @@ Travis Oliphant <teoliphant@gmail.com> <oliphant@enthought.com>
 Travis Oliphant <teoliphant@gmail.com> <travis@continuum.io>
 Valentin Haenel <valentin@haenel.co> <valentin.haenel@gmx.de>
 Valentin Haenel <valentin@haenel.co> <vhaenel@anaconda.com>
+Vardhaman Kalloli <83634399+cyai@users.noreply.github.com>
 Varun Nayyar <nayyarv@gmail.com> <nayyarv@users.noreply.github.com>
 Vinith Kishore <vinith.kishore@unibas.ch>
 Vinith Kishore <vinith.kishore@unibas.ch> <85550536+vinith2@users.noreply.github.com>
diff --git a/.travis.yml b/.travis.yml
index 1c219a6d1..c10e20483 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -26,18 +26,18 @@ cache:
 
 jobs:
   include:
-    - python: "3.8"
+    - python: "3.9"
       os: linux
       arch: ppc64le
       env:
        # use OpenBLAS build, not system ATLAS
        - DOWNLOAD_OPENBLAS=1
-       - NPY_USE_BLAS_ILP64=1
+       # - NPY_USE_BLAS_ILP64=1   # the openblas build fails
        - ATLAS=None
        # VSX4 still not supported by ubuntu/gcc-11
        - EXPECT_CPU_FEATURES="VSX VSX2 VSX3"
 
-    - python: "3.8"
+    - python: "3.9"
       os: linux
       arch: s390x
       # fixes VX assembler ambiguous errors
@@ -50,62 +50,6 @@ jobs:
        - ATLAS=None
        - EXPECT_CPU_FEATURES="VX VXE VXE2"
 
-    # Wheel builders
-    # Note that Ubuntu focal comes with Python 3.8 and CIBW_BUILD determines
-    # the Python used to build the wheels.
-    - python: "3.8"
-      os: linux
-      arch: arm64
-      virt: vm
-      env:
-        - CIBW_BUILD: cp38-manylinux_aarch64
-        - EXPECT_CPU_FEATURES: "NEON NEON_FP16 NEON_VFPV4 ASIMD ASIMDHP ASIMDDP ASIMDFHM"
-      install: python3 -m pip install cibuildwheel==2.11.2
-      script: |
-        cibuildwheel --output-dir wheelhouse
-        source ./tools/wheels/upload_wheels.sh
-        set_travis_vars
-        set_upload_vars
-        upload_wheels # Will be skipped if not a push/tag/scheduled build
-    - python: "3.8"
-      os: linux
-      arch: arm64
-      virt: vm
-      env:
-        - CIBW_BUILD: cp39-manylinux_aarch64
-      install: python3 -m pip install cibuildwheel==2.11.2
-      script: |
-        cibuildwheel --output-dir wheelhouse
-        source ./tools/wheels/upload_wheels.sh
-        set_travis_vars
-        set_upload_vars
-        upload_wheels # Will be skipped if not a push/tag/scheduled build
-    - python: "3.8"
-      os: linux
-      arch: arm64
-      virt: vm
-      env:
-        - CIBW_BUILD: cp310-manylinux_aarch64
-      install: python3 -m pip install cibuildwheel==2.11.2
-      script: |
-        cibuildwheel --output-dir wheelhouse
-        source ./tools/wheels/upload_wheels.sh
-        set_travis_vars
-        set_upload_vars
-        upload_wheels # Will be skipped if not a push/tag/scheduled build
-    - python: "3.8"
-      os: linux
-      arch: arm64
-      virt: vm
-      env:
-        - CIBW_BUILD: cp311-manylinux_aarch64
-      install: python3 -m pip install cibuildwheel==2.11.2
-      script: |
-        cibuildwheel --output-dir wheelhouse
-        source ./tools/wheels/upload_wheels.sh
-        set_travis_vars
-        set_upload_vars
-        upload_wheels # Will be skipped if not a push/tag/scheduled build
 before_install:
   - ./tools/travis-before-install.sh
 
diff --git a/22137.new_feature.rst b/22137.new_feature.rst
new file mode 100644
index 000000000..a095115ec
--- /dev/null
+++ b/22137.new_feature.rst
@@ -0,0 +1,8 @@
+Added `NPY_ENABLE_CPU_FEATURES` environment variable
+----------------------------------------------------
+
+Users may now choose to enable only a subset of the built CPU features at
+runtime by specifying the `NPY_ENABLE_CPU_FEATURES` environment variable.
+Note that these specified features must be outside the baseline, since those
+are always assumed. Errors will be raised if attempting to enable a feature
+that is either not supported by your CPU, or that NumPy was not built with.
+\ No newline at end of file
diff --git a/INSTALL.rst b/INSTALL.rst
index f136b7cfc..9ac3aa526 100644
--- a/INSTALL.rst
+++ b/INSTALL.rst
@@ -14,7 +14,7 @@ Prerequisites
 
 Building NumPy requires the following installed software:
 
-1) Python__ 3.8.x or newer.
+1) Python__ 3.9.x or newer.
 
    Please note that the Python development headers also need to be installed,
    e.g., on Debian/Ubuntu one needs to install both `python3` and
diff --git a/LICENSE.txt b/LICENSE.txt
index e1fb614d6..014d51c98 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -1,4 +1,4 @@
-Copyright (c) 2005-2022, NumPy Developers.
+Copyright (c) 2005-2023, NumPy Developers.
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/README.md b/README.md
index 02f8f17fb..c0e0b7388 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
 <h1 align="center">
-<img src="/branding/logo/primary/numpylogo.svg" width="300">
+<img src="https://raw.githubusercontent.com/numpy/numpy/main/branding/logo/primary/numpylogo.svg" width="300">
 </h1><br>
 
 
@@ -37,7 +37,7 @@ Testing:
 
 NumPy requires `pytest` and `hypothesis`.  Tests can then be run after installation with:
 
-    python -c 'import numpy; numpy.test()'
+    python -c "import numpy, sys; sys.exit(numpy.test() is False)"
 
 Code of Conduct
 ----------------------
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 922b7c8b2..e99ee1002 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -49,14 +49,16 @@ stages:
             if ! `gcc 2>/dev/null`; then
                 sudo apt install gcc
             fi
-            sudo apt install python3
-            sudo apt install python3-dev
+            sudo add-apt-repository ppa:deadsnakes/ppa -y
+            sudo apt install python3.9
+            sudo apt install python3.9-dev
+            sudo apt install python3.9-distutils
             # python3 has no setuptools, so install one to get us going
-            python3 -m pip install --user --upgrade pip 'setuptools<49.2.0'
-            python3 -m pip install --user -r test_requirements.txt
+            python3.9 -m pip install --user --upgrade pip 'setuptools<49.2.0'
+            python3.9 -m pip install --user -r test_requirements.txt
       displayName: 'install python/requirements'
     - script: |
-            python3 runtests.py --show-build-log --cpu-baseline=native --cpu-dispatch=none \
+            python3.9 runtests.py --show-build-log --cpu-baseline=native --cpu-dispatch=none \
             --debug-info --mode=full -- -rsx --junitxml=junit/test-results.xml
       displayName: 'Run native baseline Build / Tests'
     - task: PublishTestResults@2
@@ -74,11 +76,11 @@ stages:
   - job: Lint
     condition: and(succeeded(), eq(variables['Build.Reason'], 'PullRequest'))
     pool:
-      vmImage: 'ubuntu-18.04'
+      vmImage: 'ubuntu-20.04'
     steps:
     - task: UsePythonVersion@0
       inputs:
-        versionSpec: '3.8'
+        versionSpec: '3.9'
         addToPath: true
         architecture: 'x64'
     - script: >-
@@ -91,7 +93,7 @@ stages:
       displayName: 'Run Lint Checks'
       failOnStderr: true
 
-  - job: Linux_Python_38_32bit_full_with_asserts
+  - job: Linux_Python_39_32bit_full_with_asserts
     pool:
       vmImage: 'ubuntu-20.04'
     steps:
@@ -104,7 +106,7 @@ stages:
             /bin/bash -xc " \
             git config --global --add safe.directory /numpy && \
             cd /numpy && \
-            /opt/python/cp38-cp38/bin/python -mvenv venv && \
+            /opt/python/cp39-cp39/bin/python -mvenv venv && \
             source venv/bin/activate && \
             target=\$(python3 tools/openblas_support.py) && \
             cp -r \$target/lib/* /usr/lib && \
@@ -113,6 +115,7 @@ stages:
             echo CFLAGS \$CFLAGS && \
             python3 -m pip install -v . && \
             python3 runtests.py -n --debug-info --mode=full -- -rsx --junitxml=junit/test-results.xml && \
+            python3 -m pip install threadpoolctl && \
             python3 tools/openblas_support.py --check_version"
       displayName: 'Run 32-bit manylinux2014 Docker Build / Tests'
     - task: PublishTestResults@2
@@ -120,7 +123,7 @@ stages:
       inputs:
         testResultsFiles: '**/test-*.xml'
         failTaskOnFailedTests: true
-        testRunTitle: 'Publish test results for Python 3.8-32 bit full Linux'
+        testRunTitle: 'Publish test results for Python 3.9-32 bit full Linux'
 
 
   - job: macOS
@@ -129,14 +132,17 @@ stages:
     strategy:
       maxParallel: 3
       matrix:
-          Python38:
-            PYTHON_VERSION: '3.8'
+          Python39:
+            PYTHON_VERSION: '3.9'
             USE_OPENBLAS: '1'
-          Python38-ILP64:
-            PYTHON_VERSION: '3.8'
+          Python39-ILP64:
+            PYTHON_VERSION: '3.9'
             NPY_USE_BLAS_ILP64: '1'
             USE_OPENBLAS: '1'
     steps:
+    - script: |
+        git submodule update --init
+      displayName: 'Fetch submodules'
     # the @0 refers to the (major) version of the *task* on Microsoft's
     # end, not the order in the build matrix nor anything to do
     # with version of Python selected
@@ -150,28 +156,14 @@ stages:
         [ -n "$USE_XCODE_10" ] && /bin/bash -c "sudo xcode-select -s /Applications/Xcode_10.app/Contents/Developer"
         clang --version
       displayName: 'report clang version'
-    # NOTE: might be better if we could avoid installing
-    # two C compilers, but with homebrew looks like we're
-    # now stuck getting the full gcc toolchain instead of
-    # just pulling in gfortran
+
     - script: |
-        set -xe
-        # same version of gfortran as the open-libs and numpy-wheel builds
-        curl -L https://github.com/MacPython/gfortran-install/raw/master/archives/gfortran-4.9.0-Mavericks.dmg -o gfortran.dmg
-        GFORTRAN_SHA256=$(shasum -a 256 gfortran.dmg)
-        KNOWN_SHA256="d2d5ca5ba8332d63bbe23a07201c4a0a5d7e09ee56f0298a96775f928c3c4b30  gfortran.dmg"
-        if [ "$GFORTRAN_SHA256" != "$KNOWN_SHA256" ]; then
-            echo sha256 mismatch
-            exit 1
+        if [[ $PLATFORM == "macosx-arm64" ]]; then
+            PLAT="arm64"
         fi
-        hdiutil attach -mountpoint /Volumes/gfortran gfortran.dmg
-        sudo installer -pkg /Volumes/gfortran/gfortran.pkg -target /
-        otool -L /usr/local/gfortran/lib/libgfortran.3.dylib
-        # Manually symlink gfortran-4.9 to plain gfortran for f2py.
-        # No longer needed after Feb 13 2020 as gfortran is already present
-        # and the attempted link errors. Keep this for future reference.
-        # ln -s /usr/local/bin/gfortran-4.9 /usr/local/bin/gfortran
-      displayName: 'make libgfortran available on mac os for openblas'
+        source tools/wheels/gfortran_utils.sh
+        install_gfortran
+      displayName: 'install gfortran'
     # use the pre-built openblas binary that most closely
     # matches our MacOS wheel builds -- currently based
     # primarily on file size / name details
@@ -194,6 +186,10 @@ stages:
       displayName: 'Install dependencies; some are optional to avoid test skips'
     - script: /bin/bash -c "! vulture . --min-confidence 100 --exclude doc/,numpy/distutils/ | grep 'unreachable'"
       displayName: 'Check for unreachable code paths in Python modules'
+
+    - script: git submodule update --init
+      displayName: 'Fetch submodules'
+
     # prefer usage of clang over gcc proper
     # to match likely scenario on many user mac machines
     - script: python setup.py build -j 4 build_src --verbose-cfg install
@@ -203,22 +199,16 @@ stages:
         LAPACK: None
         ATLAS: None
         CC: /usr/bin/clang
-      condition: eq(variables['USE_OPENBLAS'], '1')
-    - script: python setup.py build -j 4 build_ext --inplace install
-      displayName: 'Build NumPy without OpenBLAS and new casting'
-      env:
-        BLAS: None
-        LAPACK: None
-        ATLAS: None
-        CC: /usr/bin/clang
-      condition: eq(variables['USE_OPENBLAS'], '0')
+
     # wait until after dev build of NumPy to pip
     # install matplotlib to avoid pip install of older numpy
     - script: python -m pip install matplotlib
       displayName: 'Install matplotlib before refguide run'
+
     - script: python runtests.py -g --refguide-check
       displayName: 'Run Refguide Check'
       condition: eq(variables['USE_OPENBLAS'], '1')
+
     - script: |
         echo LIBRARY_PATH ${LIBRARY_PATH}
         python runtests.py -n --mode=full -- -rsx --junitxml=junit/test-results.xml
@@ -227,9 +217,13 @@ stages:
       env:
         # gfortran installed above adds -lSystem, so this is needed to find it (gh-22043)
         LIBRARY_PATH: /Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/lib
-    - bash: python tools/openblas_support.py --check_version
+
+    - bash: |
+        python -m pip install threadpoolctl
+        python tools/openblas_support.py --check_version
       displayName: 'Verify OpenBLAS version'
       condition: eq(variables['USE_OPENBLAS'], '1')
+
     # import doesn't work when in numpy src directory , so do a pip dev install of build lib to test
     - script: |
         #!/bin/bash -v
@@ -242,43 +236,45 @@ stages:
         if [ $check_output_code == 1 ] && [ $check_message == 0 ]; then exit 0; else exit 1;fi
       displayName: "Check if numpy import fails with accelerate"
       condition: eq(variables['USE_OPENBLAS'], '0')
+
     - task: PublishTestResults@2
       condition: succeededOrFailed()
       inputs:
         testResultsFiles: '**/test-*.xml'
         failTaskOnFailedTests: true
-        testRunTitle: 'Publish test results for Python 3.8 64-bit full Mac OS'
+        testRunTitle: 'Publish test results for Python 3.9 64-bit full Mac OS'
 
 
   - job: Windows
     pool:
       vmImage: 'windows-2019'
     strategy:
-      maxParallel: 6
+      maxParallel: 5
       matrix:
-          Python38-32bit-fast:
-            PYTHON_VERSION: '3.8'
+          Python39-32bit-full:
+            PYTHON_VERSION: '3.9'
             PYTHON_ARCH: 'x86'
-            TEST_MODE: fast
+            TEST_MODE: full
             BITS: 32
-          Python38-64bit-full:
-            PYTHON_VERSION: '3.8'
+          Python310-64bit-fast:
+            PYTHON_VERSION: '3.10'
             PYTHON_ARCH: 'x64'
-            TEST_MODE: full
+            TEST_MODE: fast
             BITS: 64
-          Python39-32bit-fast:
-            PYTHON_VERSION: '3.9'
+          Python311-32bit-fast:
+            PYTHON_VERSION: '3.11'
             PYTHON_ARCH: 'x86'
             TEST_MODE: fast
             BITS: 32
-          Python39-64bit-full:
-            PYTHON_VERSION: '3.9'
+          Python311-64bit-full:
+            PYTHON_VERSION: '3.11'
             PYTHON_ARCH: 'x64'
             TEST_MODE: full
             BITS: 64
             NPY_USE_BLAS_ILP64: '1'
-          PyPy38-64bit-fast:
-            PYTHON_VERSION: 'PyPy'
+
+          PyPy39-64bit-fast:
+            PYTHON_VERSION: 'pypy3.9'
             PYTHON_ARCH: 'x64'
             TEST_MODE: fast
             BITS: 64
diff --git a/azure-steps-windows.yml b/azure-steps-windows.yml
index 671e01c5b..0e2cd6cba 100644
--- a/azure-steps-windows.yml
+++ b/azure-steps-windows.yml
@@ -1,29 +1,11 @@
 steps:
+- script: git submodule update --init
+  displayName: 'Fetch submodules'
 - task: UsePythonVersion@0
   inputs:
     versionSpec: $(PYTHON_VERSION)
     addToPath: true
     architecture: $(PYTHON_ARCH)
-  condition: not(contains(variables['PYTHON_VERSION'], 'PyPy'))
-- powershell: |
-    # UsePythonVersion only supports pypy3.6, we need 3.8
-    # https://github.com/microsoft/azure-pipelines-tasks/pull/15553
-    $url = "https://downloads.python.org/pypy/pypy3.8-v7.3.9-win64.zip"
-    $output = "pypy.zip"
-    $wc = New-Object System.Net.WebClient
-    $wc.DownloadFile($url, $output)
-    echo "downloaded $url to $output"
-    mkdir pypy3
-    Expand-Archive $output -DestinationPath pypy3
-    # move pypy3/pypy-c-*/* pypy3
-    move pypy3/pypy*/* pypy3
-    $pypypath = Join-Path (Get-Item .).FullName pypy3
-    $env:Path = $pypypath + ";" + $env:Path
-    setx PATH $env:Path
-    python -mensurepip
-    echo "##vso[task.prependpath]$pypypath"
-  condition: contains(variables['PYTHON_VERSION'], 'PyPy')
-  displayName: "Install PyPy3.8 "
 
 - script: python -m pip install --upgrade pip wheel
   displayName: 'Install tools'
@@ -32,21 +14,22 @@ steps:
   displayName: 'Install dependencies; some are optional to avoid test skips'
 
 - powershell: |
-    choco install -y mingw --forcex86 --force --version=7.3.0
-    refreshenv
-  displayName: 'Install 32-bit mingw for 32-bit builds'
-  condition: eq(variables['BITS'], 32)
+    # rtools 42+ does not support 32 bits builds.
+    choco install --confirm --no-progress --side-by-side rtools --version=4.0.0.20220206
+    echo "##vso[task.setvariable variable=RTOOLS40_HOME]c:\rtools40"
+  displayName: 'Install rtools'
 
 - powershell: |
     $ErrorActionPreference = "Stop"
-    # Download and get the path to "openblas.a". We cannot copy it
+    # Download and get the path to "openblas". We cannot copy it
     # to $PYTHON_EXE's directory since that is on a different drive which
     # mingw does not like. Instead copy it to a directory and set OPENBLAS,
     # since OPENBLAS will be picked up by the openblas discovery
     $target = $(python tools/openblas_support.py)
     mkdir openblas
     echo "Copying $target to openblas/"
-    cp $target openblas/
+    cp -r $target/* openblas/
+    $env:OPENBLAS = $target
   displayName: 'Download / Install OpenBLAS'
 
 # NOTE: for Windows builds it seems much more tractable to use runtests.py
@@ -57,7 +40,11 @@ steps:
     If ($(BITS) -eq 32) {
         $env:CFLAGS = "-m32"
         $env:LDFLAGS = "-m32"
-        $env:PATH = "C:\\ProgramData\\chocolatey\\lib\\mingw\\tools\\install\\mingw$(BITS)\\bin;" + $env:PATH
+        $env:PATH = "$env:RTOOLS40_HOME\\mingw32\\bin;$env:PATH"
+    }
+    Else
+    {
+        $env:PATH = "$env:RTOOLS40_HOME\\mingw64\\bin;$env:PATH"
     }
     If ( Test-Path env:NPY_USE_BLAS_ILP64 ) {
         $env:OPENBLAS64_ = "openblas"
@@ -72,7 +59,22 @@ steps:
     }
   displayName: 'Build NumPy'
 
-- script: python runtests.py -n --show-build-log --mode=$(TEST_MODE) -- -rsx --junitxml=junit/test-results.xml
+- script: |
+    python -m pip install threadpoolctl
+    python tools/openblas_support.py --check_version
+  displayName: 'Check OpenBLAS version'
+
+- powershell: |
+    If ($(BITS) -eq 32) {
+        $env:CFLAGS = "-m32"
+        $env:LDFLAGS = "-m32"
+        $env:PATH = "$env:RTOOLS40_HOME\\mingw32\\bin;$env:PATH"
+    }
+    Else
+    {
+        $env:PATH = "$env:RTOOLS40_HOME\\mingw64\\bin;$env:PATH"
+    }
+    python runtests.py -n --show-build-log --mode=$(TEST_MODE) -- -rsx --junitxml=junit/test-results.xml
   displayName: 'Run NumPy Test Suite'
 
 - task: PublishTestResults@2
diff --git a/benchmarks/README.rst b/benchmarks/README.rst
index 2700e95e7..ef841a818 100644
--- a/benchmarks/README.rst
+++ b/benchmarks/README.rst
@@ -22,8 +22,8 @@ By default, `asv` ships with support for anaconda and virtualenv::
     pip install asv
     pip install virtualenv
 
-After contributing new benchmarks, you should test them locally
-before submitting a pull request.
+After contributing new benchmarks, you should test them locally before
+submitting a pull request.
 
 To run all benchmarks, navigate to the root NumPy directory at
 the command line and execute::
@@ -31,11 +31,21 @@ the command line and execute::
     python runtests.py --bench
 
 where ``--bench`` activates the benchmark suite instead of the
-test suite. This builds NumPy and runs  all available benchmarks
+test suite. This builds NumPy and runs all available benchmarks
 defined in ``benchmarks/``. (Note: this could take a while. Each
 benchmark is run multiple times to measure the distribution in
 execution times.)
 
+For **testing** benchmarks locally, it may be better to run these without
+replications::
+
+    cd benchmarks/
+    export REGEXP="bench.*Ufunc"
+    asv run --dry-run --show-stderr --python=same --quick -b $REGEXP
+
+Where the regular expression used to match benchmarks is stored in ``$REGEXP``,
+and `--quick` is used to avoid repetitions.
+
 To run benchmarks from a particular benchmark module, such as
 ``bench_core.py``, simply append the filename without the extension::
 
@@ -69,6 +79,27 @@ Command-line help is available as usual via ``asv --help`` and
 
 .. _ASV documentation: https://asv.readthedocs.io/
 
+Benchmarking versions
+---------------------
+
+To benchmark or visualize only releases on different machines locally, the tags with their commits can be generated, before being run with ``asv``, that is::
+
+    cd benchmarks
+    # Get commits for tags
+    # delete tag_commits.txt before re-runs
+    for gtag in $(git tag --list --sort taggerdate | grep "^v"); do
+    git log $gtag --oneline -n1 --decorate=no | awk '{print $1;}' >> tag_commits.txt
+    done
+    # Use the last 20
+    tail --lines=20 tag_commits.txt > 20_vers.txt
+    asv run HASHFILE:20_vers.txt
+    # Publish and view
+    asv publish
+    asv preview
+
+For details on contributing these, see the `benchmark results repository`_.
+
+.. _benchmark results repository: https://github.com/HaoZeke/asv-numpy
 
 Writing benchmarks
 ------------------
diff --git a/benchmarks/asv.conf.json b/benchmarks/asv.conf.json
index b60135524..267450448 100644
--- a/benchmarks/asv.conf.json
+++ b/benchmarks/asv.conf.json
@@ -43,7 +43,8 @@
     // version.
     "matrix": {
         "Cython": [],
-        "setuptools": ["59.2.0"]
+        "setuptools": ["59.2.0"],
+        "packaging": []
     },
 
     // The directory (relative to the current directory) that benchmarks are
diff --git a/benchmarks/asv_compare.conf.json.tpl b/benchmarks/asv_compare.conf.json.tpl
index 01f4e41de..f0ef0bf49 100644
--- a/benchmarks/asv_compare.conf.json.tpl
+++ b/benchmarks/asv_compare.conf.json.tpl
@@ -47,7 +47,8 @@
     // version.
     "matrix": {
         "Cython": [],
-        "setuptools": ["59.2.0"]
+        "setuptools": ["59.2.0"],
+        "packaging": []
     },
 
     // The directory (relative to the current directory) that benchmarks are
diff --git a/benchmarks/benchmarks/__init__.py b/benchmarks/benchmarks/__init__.py
index 7b9f1d3e6..35fc87eac 100644
--- a/benchmarks/benchmarks/__init__.py
+++ b/benchmarks/benchmarks/__init__.py
@@ -26,7 +26,7 @@ def dirty_lock(lock_name, lock_on_count=1):
     lock_path = os.path.abspath(os.path.join(
         os.path.dirname(__file__), "..", "env", lock_name)
     )
-    # ASV load the 'benchmark_dir' to discovering the available benchmarks
+    # ASV loads the 'benchmark_dir' to discover the available benchmarks
     # the issue here is ASV doesn't capture any strings from stdout or stderr
     # during this stage so we escape it and lock on the second increment
     try:
diff --git a/benchmarks/benchmarks/bench_core.py b/benchmarks/benchmarks/bench_core.py
index 4fcd7ace5..fe1cd37b6 100644
--- a/benchmarks/benchmarks/bench_core.py
+++ b/benchmarks/benchmarks/bench_core.py
@@ -45,6 +45,12 @@ class Core(Benchmark):
     def time_array_l_view(self):
         np.array(self.l_view)
 
+    def time_can_cast(self):
+        np.can_cast(self.l10x10, self.float64_dtype)
+
+    def time_can_cast_same_kind(self):
+        np.can_cast(self.l10x10, self.float64_dtype, casting="same_kind")
+
     def time_vstack_l(self):
         np.vstack(self.l)
 
@@ -66,6 +72,9 @@ class Core(Benchmark):
     def time_empty_100(self):
         np.empty(100)
 
+    def time_empty_like(self):
+        np.empty_like(self.l10x10)
+
     def time_eye_100(self):
         np.eye(100)
 
@@ -206,13 +215,41 @@ class Indices(Benchmark):
     def time_indices(self):
         np.indices((1000, 500))
 
-class VarComplex(Benchmark):
-    params = [10**n for n in range(0, 9)]
-    def setup(self, n):
-        self.arr = np.random.randn(n) + 1j * np.random.randn(n)
 
-    def teardown(self, n):
-        del self.arr
+class StatsMethods(Benchmark):
+    # Not testing, but in array_api (redundant)
+    # 8, 16, 32 bit variants, and 128 complexes
+    params = [['int64', 'uint64', 'float64', 'intp',
+               'complex64', 'bool', 'float', 'int',
+               'complex', 'complex256'],
+              [100**n for n in range(0, 2)]]
+    param_names = ['dtype', 'size']
+
+    def setup(self, dtype, size):
+        try:
+            self.data = np.ones(size, dtype=getattr(np, dtype))
+        except AttributeError:  # builtins throw AttributeError after 1.20
+            self.data = np.ones(size, dtype=dtype)
+        if dtype.startswith('complex'):
+            self.data = np.random.randn(size) + 1j * np.random.randn(size)
+
+    def time_min(self, dtype, size):
+        self.data.min()
+
+    def time_max(self, dtype, size):
+        self.data.max()
+
+    def time_mean(self, dtype, size):
+        self.data.mean()
+
+    def time_std(self, dtype, size):
+        self.data.std()
+
+    def time_prod(self, dtype, size):
+        self.data.prod()
+
+    def time_var(self, dtype, size):
+        self.data.var()
 
-    def time_var(self, n):
-        self.arr.var()
+    def time_sum(self, dtype, size):
+        self.data.sum()
diff --git a/benchmarks/benchmarks/bench_creation.py b/benchmarks/benchmarks/bench_creation.py
new file mode 100644
index 000000000..3a577df7a
--- /dev/null
+++ b/benchmarks/benchmarks/bench_creation.py
@@ -0,0 +1,81 @@
+from .common import Benchmark, TYPES1
+
+import numpy as np
+
+
+class MeshGrid(Benchmark):
+    """ Benchmark meshgrid generation
+    """
+    params = [[16, 32],
+              [2, 3, 4],
+              ['ij', 'xy'], TYPES1]
+    param_names = ['size', 'ndims', 'ind', 'ndtype']
+    timeout = 10
+
+    def setup(self, size, ndims, ind, ndtype):
+        self.grid_dims = [(np.random.ranf(size)).astype(ndtype) for
+                          x in range(ndims)]
+
+    def time_meshgrid(self, size, ndims, ind, ndtype):
+        np.meshgrid(*self.grid_dims, indexing=ind)
+
+
+class Create(Benchmark):
+    """ Benchmark for creation functions
+    """
+    # (64, 64), (128, 128), (256, 256)
+    # , (512, 512), (1024, 1024)
+    params = [[16, 32, 128, 256, 512,
+               (16, 16), (32, 32)],
+              ['C', 'F'],
+              TYPES1]
+    param_names = ['shape', 'order', 'npdtypes']
+    timeout = 10
+
+    def setup(self, shape, order, npdtypes):
+        values = get_squares_()
+        self.xarg = values.get(npdtypes)[0]
+
+    def time_full(self, shape, order, npdtypes):
+        np.full(shape, self.xarg[1], dtype=npdtypes, order=order)
+
+    def time_full_like(self, shape, order, npdtypes):
+        np.full_like(self.xarg, self.xarg[0], order=order)
+
+    def time_ones(self, shape, order, npdtypes):
+        np.ones(shape, dtype=npdtypes, order=order)
+
+    def time_ones_like(self, shape, order, npdtypes):
+        np.ones_like(self.xarg, order=order)
+
+    def time_zeros(self, shape, order, npdtypes):
+        np.zeros(shape, dtype=npdtypes, order=order)
+
+    def time_zeros_like(self, shape, order, npdtypes):
+        np.zeros_like(self.xarg, order=order)
+
+    def time_empty(self, shape, order, npdtypes):
+        np.empty(shape, dtype=npdtypes, order=order)
+
+    def time_empty_like(self, shape, order, npdtypes):
+        np.empty_like(self.xarg, order=order)
+
+
+class UfuncsFromDLP(Benchmark):
+    """ Benchmark for creation functions
+    """
+    params = [[16, 32, (16, 16),
+               (32, 32), (64, 64)],
+              TYPES1]
+    param_names = ['shape', 'npdtypes']
+    timeout = 10
+
+    def setup(self, shape, npdtypes):
+        if npdtypes in ['longdouble', 'clongdouble']:
+            raise NotImplementedError(
+                'Only IEEE dtypes are supported')
+        values = get_squares_()
+        self.xarg = values.get(npdtypes)[0]
+
+    def time_from_dlpack(self, shape, npdtypes):
+        np.from_dlpack(self.xarg)
diff --git a/benchmarks/benchmarks/bench_function_base.py b/benchmarks/benchmarks/bench_function_base.py
index 2e44ff76b..cc37bef39 100644
--- a/benchmarks/benchmarks/bench_function_base.py
+++ b/benchmarks/benchmarks/bench_function_base.py
@@ -248,7 +248,7 @@ class Sort(Benchmark):
         # In NumPy 1.17 and newer, 'merge' can be one of several
         # stable sorts, it isn't necessarily merge sort.
         ['quick', 'merge', 'heap'],
-        ['float64', 'int64', 'float32', 'uint32', 'int32', 'int16'],
+        ['float64', 'int64', 'float32', 'uint32', 'int32', 'int16', 'float16'],
         [
             ('random',),
             ('ordered',),
diff --git a/benchmarks/benchmarks/bench_io.py b/benchmarks/benchmarks/bench_io.py
index 357adbb87..e316d07f3 100644
--- a/benchmarks/benchmarks/bench_io.py
+++ b/benchmarks/benchmarks/bench_io.py
@@ -1,7 +1,7 @@
-from .common import Benchmark, get_squares
+from .common import Benchmark, get_squares, get_squares_
 
 import numpy as np
-from io import StringIO
+from io import SEEK_SET, StringIO, BytesIO
 
 
 class Copy(Benchmark):
@@ -67,6 +67,15 @@ class Savez(Benchmark):
         np.savez('tmp.npz', **self.squares)
 
 
+class LoadNpyOverhead(Benchmark):
+    def setup(self):
+        self.buffer = BytesIO()
+        np.save(self.buffer, get_squares_()['float32'])
+
+    def time_loadnpy_overhead(self):
+        self.buffer.seek(0, SEEK_SET)
+        np.load(self.buffer)
+
 class LoadtxtCSVComments(Benchmark):
     # benchmarks for np.loadtxt comment handling
     # when reading in CSV files
diff --git a/benchmarks/benchmarks/bench_itemselection.py b/benchmarks/benchmarks/bench_itemselection.py
index 518258a8f..46a39372c 100644
--- a/benchmarks/benchmarks/bench_itemselection.py
+++ b/benchmarks/benchmarks/bench_itemselection.py
@@ -7,7 +7,7 @@ class Take(Benchmark):
     params = [
         [(1000, 1), (1000, 2), (2, 1000, 1), (1000, 3)],
         ["raise", "wrap", "clip"],
-        TYPES1]
+        TYPES1 + ["O", "i,O"]]
     param_names = ["shape", "mode", "dtype"]
 
     def setup(self, shape, mode, dtype):
@@ -21,7 +21,7 @@ class Take(Benchmark):
 class PutMask(Benchmark):
     params = [
         [True, False],
-        TYPES1]
+        TYPES1 + ["O", "i,O"]]
     param_names = ["values_is_scalar", "dtype"]
 
     def setup(self, values_is_scalar, dtype):
@@ -41,3 +41,21 @@ class PutMask(Benchmark):
     def time_sparse(self, values_is_scalar, dtype):
         np.putmask(self.arr, self.sparse_mask, self.vals)
 
+
+class Put(Benchmark):
+    params = [
+        [True, False],
+        TYPES1 + ["O", "i,O"]]
+    param_names = ["values_is_scalar", "dtype"]
+
+    def setup(self, values_is_scalar, dtype):
+        if values_is_scalar:
+            self.vals = np.array(1., dtype=dtype)
+        else:
+            self.vals = np.ones(1000, dtype=dtype)
+
+        self.arr = np.ones(1000, dtype=dtype)
+        self.indx = np.arange(1000, dtype=np.intp)
+
+    def time_ordered(self, values_is_scalar, dtype):
+        np.put(self.arr, self.indx, self.vals)
diff --git a/benchmarks/benchmarks/bench_lib.py b/benchmarks/benchmarks/bench_lib.py
index b64f8ab17..f792116a6 100644
--- a/benchmarks/benchmarks/bench_lib.py
+++ b/benchmarks/benchmarks/bench_lib.py
@@ -132,11 +132,26 @@ class Unique(Benchmark):
         # produce a randomly shuffled array with the
         # approximate desired percentage np.nan content
         base_array = np.random.uniform(size=array_size)
-        base_array[base_array < percent_nans / 100.] = np.nan
+        n_nan = int(percent_nans * array_size)
+        nan_indices = np.random.choice(np.arange(array_size), size=n_nan)
+        base_array[nan_indices] = np.nan
         self.arr = base_array
 
-    def time_unique(self, array_size, percent_nans):
-        np.unique(self.arr)
+    def time_unique_values(self, array_size, percent_nans):
+        np.unique(self.arr, return_index=False,
+                  return_inverse=False, return_counts=False)
+
+    def time_unique_counts(self, array_size, percent_nans):
+        np.unique(self.arr, return_index=False,
+                  return_inverse=False, return_counts=True)
+
+    def time_unique_inverse(self, array_size, percent_nans):
+        np.unique(self.arr, return_index=False,
+                  return_inverse=True, return_counts=False)
+
+    def time_unique_all(self, array_size, percent_nans):
+        np.unique(self.arr, return_index=True,
+                  return_inverse=True, return_counts=True)
 
 
 class Isin(Benchmark):
diff --git a/benchmarks/benchmarks/bench_linalg.py b/benchmarks/benchmarks/bench_linalg.py
index a94ba1139..b4e39b084 100644
--- a/benchmarks/benchmarks/bench_linalg.py
+++ b/benchmarks/benchmarks/bench_linalg.py
@@ -190,3 +190,27 @@ class Einsum(Benchmark):
     # sum_of_products_contig_outstride0_one：non_contiguous arrays
     def time_einsum_noncon_contig_outstride0(self, dtype):
         np.einsum("i->", self.non_contiguous_dim1, optimize=True)
+
+
+class LinAlgTransposeVdot(Benchmark):
+    # Smaller for speed
+    # , (128, 128), (256, 256), (512, 512),
+    # (1024, 1024)
+    params = [[(16, 16), (32, 32),
+               (64, 64)], TYPES1]
+    param_names = ['shape', 'npdtypes']
+
+    def setup(self, shape, npdtypes):
+        self.xarg = np.random.uniform(-1, 1, np.dot(*shape)).reshape(shape)
+        self.xarg = self.xarg.astype(npdtypes)
+        self.x2arg = np.random.uniform(-1, 1, np.dot(*shape)).reshape(shape)
+        self.x2arg = self.x2arg.astype(npdtypes)
+        if npdtypes.startswith('complex'):
+            self.xarg += self.xarg.T*1j
+            self.x2arg += self.x2arg.T*1j
+
+    def time_transpose(self, shape, npdtypes):
+        np.transpose(self.xarg)
+
+    def time_vdot(self, shape, npdtypes):
+        np.vdot(self.xarg, self.x2arg)
diff --git a/benchmarks/benchmarks/bench_ma.py b/benchmarks/benchmarks/bench_ma.py
index b214c0b86..49ccf92fe 100644
--- a/benchmarks/benchmarks/bench_ma.py
+++ b/benchmarks/benchmarks/bench_ma.py
@@ -17,6 +17,14 @@ class MA(Benchmark):
     def time_masked_array_l100_t100(self):
         np.ma.masked_array(self.l100, self.t100)
 
+class MACreation(Benchmark):
+    param_names = ['data', 'mask']
+    params = [[10, 100, 1000],
+              [True, False, None]]
+
+    def time_ma_creations(self, data, mask):
+        np.ma.array(data=np.zeros(int(data)), mask=mask)
+
 
 class Indexing(Benchmark):
     param_names = ['masked', 'ndim', 'size']
@@ -111,3 +119,149 @@ class Concatenate(Benchmark):
 
     def time_it(self, mode, n):
         np.ma.concatenate(self.args)
+
+
+class MAFunctions1v(Benchmark):
+    param_names = ['mtype', 'func', 'msize']
+    params = [['np', 'np.ma'],
+              ['sin', 'log', 'sqrt'],
+              ['small', 'big']]
+
+    def setup(self, mtype, func, msize):
+        xs = np.random.uniform(-1, 1, 6).reshape(2, 3)
+        m1 = [[True, False, False], [False, False, True]]
+        xl = np.random.uniform(-1, 1, 100*100).reshape(100, 100)
+        maskx = xl > 0.8
+        self.nmxs = np.ma.array(xs, mask=m1)
+        self.nmxl = np.ma.array(xl, mask=maskx)
+
+    def time_functions_1v(self, mtype, func, msize):
+        # fun = {'np.ma.sin': np.ma.sin, 'np.sin': np.sin}[func]
+        fun = eval(f"{mtype}.{func}")
+        if msize == 'small':
+            fun(self.nmxs)
+        elif msize == 'big':
+            fun(self.nmxl)
+
+
+class MAMethod0v(Benchmark):
+    param_names = ['method', 'msize']
+    params = [['ravel', 'transpose', 'compressed', 'conjugate'],
+              ['small', 'big']]
+
+    def setup(self, method, msize):
+        xs = np.random.uniform(-1, 1, 6).reshape(2, 3)
+        m1 = [[True, False, False], [False, False, True]]
+        xl = np.random.uniform(-1, 1, 100*100).reshape(100, 100)
+        maskx = xl > 0.8
+        self.nmxs = np.ma.array(xs, mask=m1)
+        self.nmxl = np.ma.array(xl, mask=maskx)
+
+    def time_methods_0v(self, method, msize):
+        if msize == 'small':
+            mdat = self.nmxs
+        elif msize == 'big':
+            mdat = self.nmxl
+        getattr(mdat, method)()
+
+
+class MAFunctions2v(Benchmark):
+    param_names = ['mtype', 'func', 'msize']
+    params = [['np', 'np.ma'],
+              ['multiply', 'divide', 'power'],
+              ['small', 'big']]
+
+    def setup(self, mtype, func, msize):
+        # Small arrays
+        xs = np.random.uniform(-1, 1, 6).reshape(2, 3)
+        ys = np.random.uniform(-1, 1, 6).reshape(2, 3)
+        m1 = [[True, False, False], [False, False, True]]
+        m2 = [[True, False, True], [False, False, True]]
+        self.nmxs = np.ma.array(xs, mask=m1)
+        self.nmys = np.ma.array(ys, mask=m2)
+        # Big arrays
+        xl = np.random.uniform(-1, 1, 100*100).reshape(100, 100)
+        yl = np.random.uniform(-1, 1, 100*100).reshape(100, 100)
+        maskx = xl > 0.8
+        masky = yl < -0.8
+        self.nmxl = np.ma.array(xl, mask=maskx)
+        self.nmyl = np.ma.array(yl, mask=masky)
+
+    def time_functions_2v(self, mtype, func, msize):
+        fun = eval(f"{mtype}.{func}")
+        if msize == 'small':
+            fun(self.nmxs, self.nmys)
+        elif msize == 'big':
+            fun(self.nmxl, self.nmyl)
+
+
+class MAMethodGetItem(Benchmark):
+    param_names = ['margs', 'msize']
+    params = [[0, (0, 0), [0, -1]],
+              ['small', 'big']]
+
+    def setup(self, margs, msize):
+        xs = np.random.uniform(-1, 1, 6).reshape(2, 3)
+        m1 = [[True, False, False], [False, False, True]]
+        xl = np.random.uniform(-1, 1, 100*100).reshape(100, 100)
+        maskx = xl > 0.8
+        self.nmxs = np.ma.array(xs, mask=m1)
+        self.nmxl = np.ma.array(xl, mask=maskx)
+
+    def time_methods_getitem(self, margs, msize):
+        if msize == 'small':
+            mdat = self.nmxs
+        elif msize == 'big':
+            mdat = self.nmxl
+        getattr(mdat, '__getitem__')(margs)
+
+
+class MAMethodSetItem(Benchmark):
+    param_names = ['margs', 'mset', 'msize']
+    params = [[0, (0, 0), (-1, 0)],
+              [17, np.ma.masked],
+              ['small', 'big']]
+
+    def setup(self, margs, mset, msize):
+        xs = np.random.uniform(-1, 1, 6).reshape(2, 3)
+        m1 = [[True, False, False], [False, False, True]]
+        xl = np.random.uniform(-1, 1, 100*100).reshape(100, 100)
+        maskx = xl > 0.8
+        self.nmxs = np.ma.array(xs, mask=m1)
+        self.nmxl = np.ma.array(xl, mask=maskx)
+
+    def time_methods_setitem(self, margs, mset, msize):
+        if msize == 'small':
+            mdat = self.nmxs
+        elif msize == 'big':
+            mdat = self.nmxl
+        getattr(mdat, '__setitem__')(margs, mset)
+
+
+class Where(Benchmark):
+    param_names = ['mtype', 'msize']
+    params = [['np', 'np.ma'],
+              ['small', 'big']]
+
+    def setup(self, mtype, msize):
+        # Small arrays
+        xs = np.random.uniform(-1, 1, 6).reshape(2, 3)
+        ys = np.random.uniform(-1, 1, 6).reshape(2, 3)
+        m1 = [[True, False, False], [False, False, True]]
+        m2 = [[True, False, True], [False, False, True]]
+        self.nmxs = np.ma.array(xs, mask=m1)
+        self.nmys = np.ma.array(ys, mask=m2)
+        # Big arrays
+        xl = np.random.uniform(-1, 1, 100*100).reshape(100, 100)
+        yl = np.random.uniform(-1, 1, 100*100).reshape(100, 100)
+        maskx = xl > 0.8
+        masky = yl < -0.8
+        self.nmxl = np.ma.array(xl, mask=maskx)
+        self.nmyl = np.ma.array(yl, mask=masky)
+
+    def time_where(self, mtype, msize):
+        fun = eval(f"{mtype}.where")
+        if msize == 'small':
+            fun(self.nmxs > 2, self.nmxs, self.nmys)
+        elif msize == 'big':
+            fun(self.nmxl > 2, self.nmxl, self.nmyl)
diff --git a/benchmarks/benchmarks/bench_manipulate.py b/benchmarks/benchmarks/bench_manipulate.py
new file mode 100644
index 000000000..0a312479c
--- /dev/null
+++ b/benchmarks/benchmarks/bench_manipulate.py
@@ -0,0 +1,107 @@
+from .common import Benchmark, get_squares_, TYPES1, DLPACK_TYPES
+
+import numpy as np
+from collections import deque
+
+class BroadcastArrays(Benchmark):
+    params = [[(16, 32), (32, 64),
+               (64, 128), (128, 256),
+               (256, 512), (512, 1024)],
+              TYPES1]
+    param_names = ['shape', 'ndtype']
+    timeout = 10
+
+    def setup(self, shape, ndtype):
+        self.xarg = np.random.ranf(shape[0]*shape[1]).reshape(shape)
+        self.xarg = self.xarg.astype(ndtype)
+        if ndtype.startswith('complex'):
+            self.xarg += np.random.ranf(1)*1j
+
+    def time_broadcast_arrays(self, shape, ndtype):
+        np.broadcast_arrays(self.xarg, np.ones(1))
+
+
+class BroadcastArraysTo(Benchmark):
+    params = [[16, 32, 64, 128, 256, 512],
+              TYPES1]
+    param_names = ['size', 'ndtype']
+    timeout = 10
+
+    def setup(self, size, ndtype):
+        self.rng = np.random.default_rng()
+        self.xarg = self.rng.random(size)
+        self.xarg = self.xarg.astype(ndtype)
+        if ndtype.startswith('complex'):
+            self.xarg += self.rng.random(1)*1j
+
+    def time_broadcast_to(self, size, ndtype):
+        np.broadcast_to(self.xarg, (size, size))
+
+
+class ConcatenateStackArrays(Benchmark):
+    # (64, 128), (128, 256), (256, 512)
+    params = [[(16, 32), (32, 64)],
+              [2, 3, 4, 5],
+              TYPES1]
+    param_names = ['shape', 'narrays', 'ndtype']
+    timeout = 10
+
+    def setup(self, shape, narrays, ndtype):
+        self.xarg = [np.random.ranf(shape[0]*shape[1]).reshape(shape)
+                     for x in range(narrays)]
+        self.xarg = [x.astype(ndtype) for x in self.xarg]
+        if ndtype.startswith('complex'):
+            [x + np.random.ranf(1)*1j for x in self.xarg]
+
+    def time_concatenate_ax0(self, size, narrays, ndtype):
+        np.concatenate(self.xarg, axis=0)
+
+    def time_concatenate_ax1(self, size, narrays, ndtype):
+        np.concatenate(self.xarg, axis=1)
+
+    def time_stack_ax0(self, size, narrays, ndtype):
+        np.stack(self.xarg, axis=0)
+
+    def time_stack_ax1(self, size, narrays, ndtype):
+        np.stack(self.xarg, axis=1)
+
+
+class DimsManipulations(Benchmark):
+    params = [
+        [(2, 1, 4), (2, 1), (5, 2, 3, 1)],
+    ]
+    param_names = ['shape']
+    timeout = 10
+
+    def setup(self, shape):
+        self.xarg = np.ones(shape=shape)
+        self.reshaped = deque(shape)
+        self.reshaped.rotate(1)
+        self.reshaped = tuple(self.reshaped)
+
+    def time_expand_dims(self, shape):
+        np.expand_dims(self.xarg, axis=1)
+
+    def time_expand_dims_neg(self, shape):
+        np.expand_dims(self.xarg, axis=-1)
+
+    def time_squeeze_dims(self, shape):
+        np.squeeze(self.xarg)
+
+    def time_flip_all(self, shape):
+        np.flip(self.xarg, axis=None)
+
+    def time_flip_one(self, shape):
+        np.flip(self.xarg, axis=1)
+
+    def time_flip_neg(self, shape):
+        np.flip(self.xarg, axis=-1)
+
+    def time_moveaxis(self, shape):
+        np.moveaxis(self.xarg, [0, 1], [-1, -2])
+
+    def time_roll(self, shape):
+        np.roll(self.xarg, 3)
+
+    def time_reshape(self, shape):
+        np.reshape(self.xarg, self.reshaped)
diff --git a/benchmarks/benchmarks/bench_reduce.py b/benchmarks/benchmarks/bench_reduce.py
index ca07bd180..040b5ca73 100644
--- a/benchmarks/benchmarks/bench_reduce.py
+++ b/benchmarks/benchmarks/bench_reduce.py
@@ -45,19 +45,40 @@ class AnyAll(Benchmark):
         self.zeros.any()
 
 
-class MinMax(Benchmark):
-    params = [np.int8, np.uint8, np.int16, np.uint16, np.int32, np.uint32,
-              np.int64, np.uint64, np.float32, np.float64, np.intp]
+class StatsReductions(Benchmark):
+    # Not testing, but in array_api (redundant)
+    # 8, 16, 32 bit variants, and 128 complexes
+    params = ['int64', 'uint64', 'float64', 'intp',
+               'complex64', 'bool', 'float', 'int',
+               'complex', 'complex256'],
     param_names = ['dtype']
 
     def setup(self, dtype):
-        self.d = np.ones(20000, dtype=dtype)
+        try:
+            self.data = np.ones(200, dtype=getattr(np, dtype))
+        except AttributeError:  # builtins throw AttributeError after 1.20
+            self.data = np.ones(200, dtype=dtype)
+        if dtype.startswith('complex'):
+            self.data = self.data * self.data.T*1j
 
     def time_min(self, dtype):
-        np.min(self.d)
+        np.min(self.data)
 
     def time_max(self, dtype):
-        np.max(self.d)
+        np.max(self.data)
+
+    def time_mean(self, dtype):
+        np.mean(self.data)
+
+    def time_std(self, dtype):
+        np.std(self.data)
+
+    def time_prod(self, dtype):
+        np.prod(self.data)
+
+    def time_var(self, dtype):
+        np.var(self.data)
+
 
 class FMinMax(Benchmark):
     params = [np.float32, np.float64]
@@ -72,6 +93,7 @@ class FMinMax(Benchmark):
     def time_max(self, dtype):
         np.fmax.reduce(self.d)
 
+
 class ArgMax(Benchmark):
     params = [np.int8, np.uint8, np.int16, np.uint16, np.int32, np.uint32,
               np.int64, np.uint64, np.float32, np.float64, bool]
@@ -83,6 +105,7 @@ class ArgMax(Benchmark):
     def time_argmax(self, dtype):
         np.argmax(self.d)
 
+
 class ArgMin(Benchmark):
     params = [np.int8, np.uint8, np.int16, np.uint16, np.int32, np.uint32,
               np.int64, np.uint64, np.float32, np.float64, bool]
@@ -94,6 +117,7 @@ class ArgMin(Benchmark):
     def time_argmin(self, dtype):
         np.argmin(self.d)
 
+
 class SmallReduction(Benchmark):
     def setup(self):
         self.d = np.ones(100, dtype=np.float32)
diff --git a/benchmarks/benchmarks/bench_scalar.py b/benchmarks/benchmarks/bench_scalar.py
index 650daa89d..638f66df5 100644
--- a/benchmarks/benchmarks/bench_scalar.py
+++ b/benchmarks/benchmarks/bench_scalar.py
@@ -65,3 +65,15 @@ class ScalarMath(Benchmark):
         other + int32
         other + int32
         other + int32
+
+
+class ScalarStr(Benchmark):
+    # Test scalar to str conversion
+    params = [TYPES1]
+    param_names = ["type"]
+
+    def setup(self, typename):
+        self.a = np.array([100] * 100, dtype=typename)
+
+    def time_str_repr(self, typename):
+        res = [str(x) for x in self.a]
diff --git a/benchmarks/benchmarks/bench_ufunc.py b/benchmarks/benchmarks/bench_ufunc.py
index 36d8621e8..f7c77d90c 100644
--- a/benchmarks/benchmarks/bench_ufunc.py
+++ b/benchmarks/benchmarks/bench_ufunc.py
@@ -1,6 +1,9 @@
-from .common import Benchmark, get_squares_
+from .common import Benchmark, get_squares_, TYPES1, DLPACK_TYPES
 
 import numpy as np
+import itertools
+from packaging import version
+import operator
 
 
 ufuncs = ['abs', 'absolute', 'add', 'arccos', 'arccosh', 'arcsin', 'arcsinh',
@@ -13,11 +16,13 @@ ufuncs = ['abs', 'absolute', 'add', 'arccos', 'arccosh', 'arcsin', 'arcsinh',
           'isinf', 'isnan', 'isnat', 'lcm', 'ldexp', 'left_shift', 'less',
           'less_equal', 'log', 'log10', 'log1p', 'log2', 'logaddexp',
           'logaddexp2', 'logical_and', 'logical_not', 'logical_or',
-          'logical_xor', 'matmul', 'maximum', 'minimum', 'mod', 'modf', 'multiply',
-          'negative', 'nextafter', 'not_equal', 'positive', 'power',
-          'rad2deg', 'radians', 'reciprocal', 'remainder', 'right_shift',
-          'rint', 'sign', 'signbit', 'sin', 'sinh', 'spacing', 'sqrt',
-          'square', 'subtract', 'tan', 'tanh', 'true_divide', 'trunc']
+          'logical_xor', 'matmul', 'maximum', 'minimum', 'mod', 'modf',
+          'multiply', 'negative', 'nextafter', 'not_equal', 'positive',
+          'power', 'rad2deg', 'radians', 'reciprocal', 'remainder',
+          'right_shift', 'rint', 'sign', 'signbit', 'sin',
+          'sinh', 'spacing', 'sqrt', 'square', 'subtract', 'tan', 'tanh',
+          'true_divide', 'trunc']
+arrayfuncdisp = ['real', 'round']
 
 
 for name in dir(np):
@@ -25,6 +30,30 @@ for name in dir(np):
         print("Missing ufunc %r" % (name,))
 
 
+class ArrayFunctionDispatcher(Benchmark):
+    params = [arrayfuncdisp]
+    param_names = ['func']
+    timeout = 10
+
+    def setup(self, ufuncname):
+        np.seterr(all='ignore')
+        try:
+            self.afdn = getattr(np, ufuncname)
+        except AttributeError:
+            raise NotImplementedError()
+        self.args = []
+        for _, aarg in get_squares_().items():
+            arg = (aarg,) * 1  # no nin
+            try:
+                self.afdn(*arg)
+            except TypeError:
+                continue
+            self.args.append(arg)
+
+    def time_afdn_types(self, ufuncname):
+        [self.afdn(*arg) for arg in self.args]
+
+
 class Broadcast(Benchmark):
     def setup(self):
         self.d = np.ones((50000, 100), dtype=np.float64)
@@ -34,6 +63,20 @@ class Broadcast(Benchmark):
         self.d - self.e
 
 
+class At(Benchmark):
+    def setup(self):
+        rng = np.random.default_rng(1)
+        self.vals = rng.random(10_000_000, dtype=np.float64)
+        self.idx = rng.integers(1000, size=10_000_000).astype(np.intp)
+        self.res = np.zeros(1000, dtype=self.vals.dtype)
+
+    def time_sum_at(self):
+        np.add.at(self.res, self.idx, self.vals)
+
+    def time_maximum_at(self):
+        np.maximum.at(self.res, self.idx, self.vals)
+
+
 class UFunc(Benchmark):
     params = [ufuncs]
     param_names = ['ufunc']
@@ -42,23 +85,179 @@ class UFunc(Benchmark):
     def setup(self, ufuncname):
         np.seterr(all='ignore')
         try:
-            self.f = getattr(np, ufuncname)
+            self.ufn = getattr(np, ufuncname)
         except AttributeError:
             raise NotImplementedError()
         self.args = []
-        for t, a in get_squares_().items():
-            arg = (a,) * self.f.nin
+        for _, aarg in get_squares_().items():
+            arg = (aarg,) * self.ufn.nin
             try:
-                self.f(*arg)
+                self.ufn(*arg)
             except TypeError:
                 continue
             self.args.append(arg)
 
     def time_ufunc_types(self, ufuncname):
-        [self.f(*arg) for arg in self.args]
+        [self.ufn(*arg) for arg in self.args]
+
+
+class MethodsV0(Benchmark):
+    """ Benchmark for the methods which do not take any arguments
+    """
+    params = [['__abs__', '__neg__', '__pos__'], TYPES1]
+    param_names = ['methods', 'npdtypes']
+    timeout = 10
+
+    def setup(self, methname, npdtypes):
+        values = get_squares_()
+        self.xarg = values.get(npdtypes)[0]
+
+    def time_ndarray_meth(self, methname, npdtypes):
+        getattr(operator, methname)(self.xarg)
+
+
+class NDArrayLRShifts(Benchmark):
+    """ Benchmark for the shift methods
+    """
+    params = [['__lshift__', '__rshift__'],
+              ['intp', 'int8', 'int16',
+                'int32', 'int64', 'uint8',
+                'uint16', 'uint32', 'uint64']]
+    param_names = ['methods', 'npdtypes']
+    timeout = 10
+
+    def setup(self, methname, npdtypes):
+        self.vals = np.ones(1000,
+                            dtype=getattr(np, npdtypes)) * \
+                            np.random.randint(9)
+
+    def time_ndarray_meth(self, methname, npdtypes):
+        getattr(operator, methname)(*[self.vals, 2])
+
+
+class Methods0D(Benchmark):
+    """Zero dimension array methods
+    """
+    params = [['__bool__', '__complex__', '__invert__',
+               '__float__', '__int__'], TYPES1]
+    param_names = ['methods', 'npdtypes']
+    timeout = 10
+
+    def setup(self, methname, npdtypes):
+        self.xarg = np.array(3, dtype=npdtypes)
+        if (npdtypes.startswith('complex') and
+           methname in ['__float__', '__int__']) or \
+           (npdtypes.startswith('int') and methname == '__invert__'):
+            # Skip
+            raise NotImplementedError
+
+    def time_ndarray__0d__(self, methname, npdtypes):
+        meth = getattr(self.xarg, methname)
+        meth()
+
+
+class MethodsV1(Benchmark):
+    """ Benchmark for the methods which take an argument
+    """
+    params = [['__and__', '__add__', '__eq__', '__floordiv__', '__ge__',
+               '__gt__', '__le__', '__lt__', '__matmul__',
+               '__mod__', '__mul__', '__ne__', '__or__',
+               '__pow__', '__sub__', '__truediv__', '__xor__'],
+              TYPES1]
+    param_names = ['methods', 'npdtypes']
+    timeout = 10
+
+    def setup(self, methname, npdtypes):
+        if (
+            npdtypes.startswith("complex")
+                and methname in ["__floordiv__", "__mod__"]
+        ) or (
+            not npdtypes.startswith("int")
+            and methname in ["__and__", "__or__", "__xor__"]
+        ):
+            raise NotImplementedError  # skip
+        values = get_squares_().get(npdtypes)
+        self.xargs = [values[0], values[1]]
+
+    def time_ndarray_meth(self, methname, npdtypes):
+        getattr(operator, methname)(*self.xargs)
+
+
+class NDArrayGetItem(Benchmark):
+    param_names = ['margs', 'msize']
+    params = [[0, (0, 0), (-1, 0), [0, -1]],
+              ['small', 'big']]
+
+    def setup(self, margs, msize):
+        self.xs = np.random.uniform(-1, 1, 6).reshape(2, 3)
+        self.xl = np.random.uniform(-1, 1, 50*50).reshape(50, 50)
+
+    def time_methods_getitem(self, margs, msize):
+        if msize == 'small':
+            mdat = self.xs
+        elif msize == 'big':
+            mdat = self.xl
+        getattr(mdat, '__getitem__')(margs)
+
+
+class NDArraySetItem(Benchmark):
+    param_names = ['margs', 'msize']
+    params = [[0, (0, 0), (-1, 0), [0, -1]],
+              ['small', 'big']]
+
+    def setup(self, margs, msize):
+        self.xs = np.random.uniform(-1, 1, 6).reshape(2, 3)
+        self.xl = np.random.uniform(-1, 1, 100*100).reshape(100, 100)
+
+    def time_methods_setitem(self, margs, msize):
+        if msize == 'small':
+            mdat = self.xs
+        elif msize == 'big':
+            mdat = self.xl
+            mdat[margs] = 17
+
+
+class DLPMethods(Benchmark):
+    """ Benchmark for DLPACK helpers
+    """
+    params = [['__dlpack__', '__dlpack_device__'], DLPACK_TYPES]
+    param_names = ['methods', 'npdtypes']
+    timeout = 10
+
+    def setup(self, methname, npdtypes):
+        values = get_squares_()
+        if npdtypes == 'bool':
+            if version.parse(np.__version__) > version.parse("1.25"):
+                self.xarg = values.get('int16')[0].astype('bool')
+            else:
+                raise NotImplementedError("Not supported before v1.25")
+        else:
+            self.xarg = values.get('int16')[0]
+
+    def time_ndarray_dlp(self, methname, npdtypes):
+        meth = getattr(self.xarg, methname)
+        meth()
+
+
+class NDArrayAsType(Benchmark):
+    """ Benchmark for type conversion
+    """
+    params = [list(itertools.combinations(TYPES1, 2))]
+    param_names = ['typeconv']
+    timeout = 10
+
+    def setup(self, typeconv):
+        if typeconv[0] == typeconv[1]:
+            raise NotImplementedError(
+                    "Skipping test for converting to the same dtype")
+        self.xarg = get_squares_().get(typeconv[0])
+
+    def time_astype(self, typeconv):
+        self.xarg.astype(typeconv[1])
+
 
 class UFuncSmall(Benchmark):
-    """  Benchmark for a selection of ufuncs on a small arrays and scalars 
+    """  Benchmark for a selection of ufuncs on a small arrays and scalars
 
     Since the arrays and scalars are small, we are benchmarking the overhead 
     of the numpy ufunc functionality
diff --git a/benchmarks/benchmarks/bench_ufunc_strides.py b/benchmarks/benchmarks/bench_ufunc_strides.py
index f80bf90f9..898cc0818 100644
--- a/benchmarks/benchmarks/bench_ufunc_strides.py
+++ b/benchmarks/benchmarks/bench_ufunc_strides.py
@@ -1,156 +1,181 @@
-from .common import Benchmark
+from .common import Benchmark, get_data
 
 import numpy as np
 
-UNARY_UFUNCS = [obj for obj in np.core.umath.__dict__.values() if
-        isinstance(obj, np.ufunc)]
-UNARY_OBJECT_UFUNCS = [uf for uf in UNARY_UFUNCS if "O->O" in uf.types]
-UNARY_OBJECT_UFUNCS.remove(getattr(np, 'invert'))
+UFUNCS = [obj for obj in np.core.umath.__dict__.values() if
+          isinstance(obj, np.ufunc)]
+UFUNCS_UNARY = [uf for uf in UFUNCS if "O->O" in uf.types]
 
-stride = [1, 2, 4]
-stride_out = [1, 2, 4]
-dtype = ['e', 'f', 'd']
-
-class Unary(Benchmark):
-    params = [UNARY_OBJECT_UFUNCS, stride, stride_out, dtype]
-    param_names = ['ufunc', 'stride_in', 'stride_out', 'dtype']
-    timeout = 10
-
-    def setup(self, ufuncname, stride, stride_out, dtype):
-        np.seterr(all='ignore')
-        try:
-            self.f = ufuncname
-        except AttributeError:
-            raise NotImplementedError(f"No ufunc {ufuncname} found") from None
-        N = 100000
-        self.arr_out = np.empty(stride_out*N, dtype)
-        self.arr = np.random.rand(stride*N).astype(dtype)
-        if (ufuncname.__name__ == 'arccosh'):
-            self.arr = 1.0 + self.arr
-
-    def time_ufunc(self, ufuncname, stride, stride_out, dtype):
-        self.f(self.arr[::stride], self.arr_out[::stride_out])
-
-class AVX_UFunc_log(Benchmark):
-    params = [stride, dtype]
-    param_names = ['stride', 'dtype']
-    timeout = 10
-
-    def setup(self, stride, dtype):
-        np.seterr(all='ignore')
-        N = 10000
-        self.arr = np.array(np.random.random_sample(stride*N), dtype=dtype)
-
-    def time_log(self, stride, dtype):
-        np.log(self.arr[::stride])
-
-
-binary_ufuncs = [
-    'maximum', 'minimum', 'fmax', 'fmin'
-]
-binary_dtype = ['f', 'd']
-
-class Binary(Benchmark):
-    param_names = ['ufunc', 'stride_in0', 'stride_in1', 'stride_out', 'dtype']
-    params = [binary_ufuncs, stride, stride, stride_out, binary_dtype]
+class _AbstractBinary(Benchmark):
+    params = []
+    param_names = ['ufunc', 'stride_in0', 'stride_in1' 'stride_out', 'dtype']
     timeout = 10
+    arrlen = 10000
+    data_finite = True
+    data_denormal = False
+    data_zeros = False
+
+    def setup(self, ufunc, stride_in0, stride_in1, stride_out, dtype):
+        ufunc_insig = f'{dtype}{dtype}->'
+        if ufunc_insig+dtype not in ufunc.types:
+            for st_sig in (ufunc_insig, dtype):
+                test = [sig for sig in ufunc.types if sig.startswith(st_sig)]
+                if test:
+                    break
+            if not test:
+                raise NotImplementedError(
+                    f"Ufunc {ufunc} doesn't support "
+                    f"binary input of dtype {dtype}"
+                ) from None
+            tin, tout = test[0].split('->')
+        else:
+            tin = dtype + dtype
+            tout = dtype
+
+        self.ufunc_args = []
+        for i, (dt, stride) in enumerate(zip(tin, (stride_in0, stride_in1))):
+            self.ufunc_args += [get_data(
+                self.arrlen*stride, dt, i,
+                zeros=self.data_zeros,
+                finite=self.data_finite,
+                denormal=self.data_denormal,
+            )[::stride]]
+        for dt in tout:
+            self.ufunc_args += [
+                np.empty(stride_out*self.arrlen, dt)[::stride_out]
+            ]
 
-    def setup(self, ufuncname, stride_in0, stride_in1, stride_out, dtype):
         np.seterr(all='ignore')
-        try:
-            self.f = getattr(np, ufuncname)
-        except AttributeError:
-            raise NotImplementedError(f"No ufunc {ufuncname} found") from None
-        N = 100000
-        self.arr1 = np.array(np.random.rand(stride_in0*N), dtype=dtype)
-        self.arr2 = np.array(np.random.rand(stride_in1*N), dtype=dtype)
-        self.arr_out = np.empty(stride_out*N, dtype)
-
-    def time_ufunc(self, ufuncname, stride_in0, stride_in1, stride_out, dtype):
-        self.f(self.arr1[::stride_in0], self.arr2[::stride_in1],
-               self.arr_out[::stride_out])
-
 
-binary_int_ufuncs = ['maximum', 'minimum']
-binary_int_dtype = ['b', 'B', 'h', 'H', 'i', 'I', 'l', 'L', 'q', 'Q']
+    def time_binary(self, ufunc, stride_in0, stride_in1, stride_out,
+             dtype):
+        ufunc(*self.ufunc_args)
 
-class BinaryInt(Binary):
+    def time_binary_scalar_in0(self, ufunc, stride_in0, stride_in1,
+                        stride_out, dtype):
+        ufunc(self.ufunc_args[0][0], *self.ufunc_args[1:])
 
-    param_names = ['ufunc', 'stride_in0', 'stride_in1', 'stride_out', 'dtype']
-    params = [binary_int_ufuncs, stride, stride, stride_out, binary_int_dtype]
-
-class AVX_ldexp(Benchmark):
-
-    params = [dtype, stride]
-    param_names = ['dtype', 'stride']
-    timeout = 10
+    def time_binary_scalar_in1(self, ufunc, stride_in0, stride_in1,
+                        stride_out, dtype):
+        ufunc(self.ufunc_args[0], self.ufunc_args[1][0], *self.ufunc_args[2:])
 
-    def setup(self, dtype, stride):
-        np.seterr(all='ignore')
-        self.f = getattr(np, 'ldexp')
-        N = 10000
-        self.arr1 = np.array(np.random.rand(stride*N), dtype=dtype)
-        self.arr2 = np.array(np.random.rand(stride*N), dtype='i')
-
-    def time_ufunc(self, dtype, stride):
-        self.f(self.arr1[::stride], self.arr2[::stride])
-
-cmplx_bfuncs = ['add',
-                'subtract',
-                'multiply',
-                'divide']
-cmplxstride = [1, 2, 4]
-cmplxdtype  = ['F', 'D']
-
-class AVX_cmplx_arithmetic(Benchmark):
-    params = [cmplx_bfuncs, cmplxstride, cmplxdtype]
-    param_names = ['bfunc', 'stride', 'dtype']
-    timeout = 10
-
-    def setup(self, bfuncname, stride, dtype):
-        np.seterr(all='ignore')
-        try:
-            self.f = getattr(np, bfuncname)
-        except AttributeError:
-            raise NotImplementedError(f"No bfunc {bfuncname} found") from None
-        N = 10000
-        self.arr1 = np.ones(stride*N, dtype)
-        self.arr2 = np.ones(stride*N, dtype)
-
-    def time_ufunc(self, bfuncname, stride, dtype):
-        self.f(self.arr1[::stride], self.arr2[::stride])
-
-cmplx_ufuncs = ['reciprocal',
-                'absolute',
-                'square',
-                'conjugate']
-
-class AVX_cmplx_funcs(Benchmark):
-    params = [cmplx_ufuncs, cmplxstride, cmplxdtype]
-    param_names = ['bfunc', 'stride', 'dtype']
+class _AbstractUnary(Benchmark):
+    params = []
+    param_names = ['ufunc', 'stride_in', 'stride_out', 'dtype']
     timeout = 10
+    arrlen = 10000
+    data_finite = True
+    data_denormal = False
+    data_zeros = False
+
+    def setup(self, ufunc, stride_in, stride_out, dtype):
+        arr_in = get_data(
+            stride_in*self.arrlen, dtype,
+            zeros=self.data_zeros,
+            finite=self.data_finite,
+            denormal=self.data_denormal,
+        )
+        self.ufunc_args = [arr_in[::stride_in]]
+
+        ufunc_insig = f'{dtype}->'
+        if ufunc_insig+dtype not in ufunc.types:
+            test = [sig for sig in ufunc.types if sig.startswith(ufunc_insig)]
+            if not test:
+                raise NotImplementedError(
+                    f"Ufunc {ufunc} doesn't support "
+                    f"unary input of dtype {dtype}"
+                ) from None
+            tout = test[0].split('->')[1]
+        else:
+            tout = dtype
+
+        for dt in tout:
+            self.ufunc_args += [
+                np.empty(stride_out*self.arrlen, dt)[::stride_out]
+            ]
 
-    def setup(self, bfuncname, stride, dtype):
         np.seterr(all='ignore')
-        try:
-            self.f = getattr(np, bfuncname)
-        except AttributeError:
-            raise NotImplementedError(f"No bfunc {bfuncname} found") from None
-        N = 10000
-        self.arr1 = np.ones(stride*N, dtype)
 
-    def time_ufunc(self, bfuncname, stride, dtype):
-        self.f(self.arr1[::stride])
+    def time_unary(self, ufunc, stride_in, stride_out, dtype):
+        ufunc(*self.ufunc_args)
+
+class UnaryFP(_AbstractUnary):
+    params = [UFUNCS_UNARY, [1, 2, 4], [1, 2, 4], ['e', 'f', 'd']]
+
+    def setup(self, ufunc, stride_in, stride_out, dtype):
+        _AbstractUnary.setup(self, ufunc, stride_in, stride_out, dtype)
+        if (ufunc.__name__ == 'arccosh'):
+            self.ufunc_args[0] += 1.0
+
+class UnaryFPSpecial(UnaryFP):
+    data_finite = False
+    data_denormal = True
+    data_zeros = True
+
+class BinaryFP(_AbstractBinary):
+    params = [
+        [np.maximum, np.minimum, np.fmax, np.fmin, np.ldexp],
+        [1, 2, 4], [1, 2, 4], [1, 2, 4], ['f', 'd']
+    ]
+
+class BinaryFPSpecial(BinaryFP):
+    data_finite = False
+    data_denormal = True
+    data_zeros = True
+
+class BinaryComplex(_AbstractBinary):
+    params = [
+        [np.add, np.subtract, np.multiply, np.divide],
+        [1, 2, 4], [1, 2, 4], [1, 2, 4],
+        ['F', 'D']
+    ]
+
+class UnaryComplex(_AbstractUnary):
+    params = [
+        [np.reciprocal, np.absolute, np.square, np.conjugate],
+        [1, 2, 4], [1, 2, 4], ['F', 'D']
+    ]
+
+class BinaryInt(_AbstractBinary):
+    arrlen = 100000
+    params = [
+        [np.maximum, np.minimum],
+        [1, 2], [1, 2], [1, 2],
+        ['b', 'B', 'h', 'H', 'i', 'I', 'l', 'L', 'q', 'Q']
+    ]
+
+class BinaryIntContig(_AbstractBinary):
+    params = [
+        [getattr(np, uf) for uf in (
+            'add', 'subtract', 'multiply', 'bitwise_and', 'bitwise_or',
+            'bitwise_xor', 'logical_and', 'logical_or', 'logical_xor',
+            'right_shift', 'left_shift',
+        )],
+        [1], [1], [1],
+        ['b', 'B', 'h', 'H', 'i', 'I', 'l', 'L', 'q', 'Q']
+    ]
+
+class UnaryIntContig(_AbstractUnary):
+    arrlen = 100000
+    params = [
+        [getattr(np, uf) for uf in (
+            'positive', 'square', 'reciprocal', 'conjugate', 'logical_not',
+            'invert', 'isnan', 'isinf', 'isfinite',
+            'absolute', 'sign'
+        )],
+        [1], [1],
+        ['b', 'B', 'h', 'H', 'i', 'I', 'l', 'L', 'q', 'Q']
+    ]
 
 class Mandelbrot(Benchmark):
     def f(self,z):
         return np.abs(z) < 4.0
 
     def g(self,z,c):
-        return np.sum(np.multiply(z,z) + c)
+        return np.sum(np.multiply(z, z) + c)
 
     def mandelbrot_numpy(self, c, maxiter):
-        output = np.zeros(c.shape, np.int)
+        output = np.zeros(c.shape, np.int32)
         z = np.empty(c.shape, np.complex64)
         for it in range(maxiter):
             notdone = self.f(z)
diff --git a/benchmarks/benchmarks/common.py b/benchmarks/benchmarks/common.py
index 0c40e85b0..d10fe999d 100644
--- a/benchmarks/benchmarks/common.py
+++ b/benchmarks/benchmarks/common.py
@@ -1,5 +1,8 @@
-import numpy
+import numpy as np
 import random
+import os
+from functools import lru_cache
+from pathlib import Path
 
 # Various pre-crafted datasets/variables for testing
 # !!! Must not be changed -- only appended !!!
@@ -7,7 +10,7 @@ import random
 # sequences
 random.seed(1)
 # but will seed it nevertheless
-numpy.random.seed(1)
+np.random.seed(1)
 
 nx, ny = 1000, 1000
 # reduced squares based on indexes_rand, primarily for testing more
@@ -19,37 +22,37 @@ TYPES1 = [
     'int16', 'float16',
     'int32', 'float32',
     'int64', 'float64',  'complex64',
-    'longfloat', 'complex128',
+    'longdouble', 'complex128',
 ]
-if 'complex256' in numpy.sctypeDict:
-    TYPES1.append('complex256')
+if 'complex256' in np.sctypeDict:
+    TYPES1.append('clongdouble')
 
+DLPACK_TYPES = [
+    'int16', 'float16',
+    'int32', 'float32',
+    'int64', 'float64',  'complex64',
+    'complex128', 'bool',
+]
 
-def memoize(func):
-    result = []
-    def wrapper():
-        if not result:
-            result.append(func())
-        return result[0]
-    return wrapper
-
+# Path for caching
+CACHE_ROOT = Path(__file__).resolve().parent.parent / 'env' / 'numpy_benchdata'
 
 # values which will be used to construct our sample data matrices
 # replicate 10 times to speed up initial imports of this helper
 # and generate some redundancy
 
-@memoize
+@lru_cache(typed=True)
 def get_values():
-    rnd = numpy.random.RandomState(1)
-    values = numpy.tile(rnd.uniform(0, 100, size=nx*ny//10), 10)
+    rnd = np.random.RandomState(1)
+    values = np.tile(rnd.uniform(0, 100, size=nx*ny//10), 10)
     return values
 
 
-@memoize
+@lru_cache(typed=True)
 def get_squares():
     values = get_values()
-    squares = {t: numpy.array(values,
-                              dtype=getattr(numpy, t)).reshape((nx, ny))
+    squares = {t: np.array(values,
+                              dtype=getattr(np, t)).reshape((nx, ny))
                for t in TYPES1}
 
     # adjust complex ones to have non-degenerated imagery part -- use
@@ -60,42 +63,42 @@ def get_squares():
     return squares
 
 
-@memoize
+@lru_cache(typed=True)
 def get_squares_():
     # smaller squares
     squares_ = {t: s[:nxs, :nys] for t, s in get_squares().items()}
     return squares_
 
 
-@memoize
+@lru_cache(typed=True)
 def get_vectors():
     # vectors
     vectors = {t: s[0] for t, s in get_squares().items()}
     return vectors
 
 
-@memoize
+@lru_cache(typed=True)
 def get_indexes():
     indexes = list(range(nx))
     # so we do not have all items
     indexes.pop(5)
     indexes.pop(95)
 
-    indexes = numpy.array(indexes)
+    indexes = np.array(indexes)
     return indexes
 
 
-@memoize
+@lru_cache(typed=True)
 def get_indexes_rand():
     rnd = random.Random(1)
 
     indexes_rand = get_indexes().tolist()       # copy
     rnd.shuffle(indexes_rand)         # in-place shuffle
-    indexes_rand = numpy.array(indexes_rand)
+    indexes_rand = np.array(indexes_rand)
     return indexes_rand
 
 
-@memoize
+@lru_cache(typed=True)
 def get_indexes_():
     # smaller versions
     indexes = get_indexes()
@@ -103,12 +106,112 @@ def get_indexes_():
     return indexes_
 
 
-@memoize
+@lru_cache(typed=True)
 def get_indexes_rand_():
     indexes_rand = get_indexes_rand()
     indexes_rand_ = indexes_rand[indexes_rand < nxs]
     return indexes_rand_
 
 
+@lru_cache(typed=True)
+def get_data(size, dtype, ip_num=0, zeros=False, finite=True, denormal=False):
+    """
+    Generates a cached random array that covers several scenarios that
+    may affect the benchmark for fairness and to stabilize the benchmark.
+
+    Parameters
+    ----------
+    size: int
+        Array length.
+
+    dtype: dtype or dtype specifier
+
+    ip_num: int
+        Input number, to avoid memory overload
+        and to provide unique data for each operand.
+
+    zeros: bool
+        Spreading zeros along with generated data.
+
+    finite: bool
+        Avoid spreading fp special cases nan/inf.
+
+    denormal:
+        Spreading subnormal numbers along with generated data.
+    """
+    dtype = np.dtype(dtype)
+    dname = dtype.name
+    cache_name = f'{dname}_{size}_{ip_num}_{int(zeros)}'
+    if dtype.kind in 'fc':
+        cache_name += f'{int(finite)}{int(denormal)}'
+    cache_name += '.bin'
+    cache_path = CACHE_ROOT / cache_name
+    if cache_path.exists():
+        return np.fromfile(cache_path, dtype)
+
+    array = np.ones(size, dtype)
+    rands = []
+    if dtype.kind == 'i':
+        dinfo = np.iinfo(dtype)
+        scale = 8
+        if zeros:
+            scale += 1
+        lsize = size // scale
+        for low, high in (
+            (-0x80, -1),
+            (1, 0x7f),
+            (-0x8000, -1),
+            (1, 0x7fff),
+            (-0x80000000, -1),
+            (1, 0x7fffffff),
+            (-0x8000000000000000, -1),
+            (1, 0x7fffffffffffffff),
+        ):
+            rands += [np.random.randint(
+                max(low, dinfo.min),
+                min(high, dinfo.max),
+                lsize, dtype
+            )]
+    elif dtype.kind == 'u':
+        dinfo = np.iinfo(dtype)
+        scale = 4
+        if zeros:
+            scale += 1
+        lsize = size // scale
+        for high in (0xff, 0xffff, 0xffffffff, 0xffffffffffffffff):
+            rands += [np.random.randint(1, min(high, dinfo.max), lsize, dtype)]
+    elif dtype.kind in 'fc':
+        scale = 1
+        if zeros:
+            scale += 1
+        if not finite:
+            scale += 2
+        if denormal:
+            scale += 1
+        dinfo = np.finfo(dtype)
+        lsize = size // scale
+        rands = [np.random.rand(lsize).astype(dtype)]
+        if not finite:
+            rands += [
+                np.empty(lsize, dtype=dtype), np.empty(lsize, dtype=dtype)
+            ]
+            rands[1].fill(float('nan'))
+            rands[2].fill(float('inf'))
+        if denormal:
+            rands += [np.empty(lsize, dtype=dtype)]
+            rands[-1].fill(dinfo.smallest_subnormal)
+
+    if rands:
+        if zeros:
+            rands += [np.zeros(lsize, dtype)]
+        stride = len(rands)
+        for start, r in enumerate(rands):
+            array[start:len(r)*stride:stride] = r
+
+    if not CACHE_ROOT.exists():
+        CACHE_ROOT.mkdir(parents=True)
+    array.tofile(cache_path)
+    return array
+
 class Benchmark:
     pass
diff --git a/build_requirements.txt b/build_requirements.txt
new file mode 100644
index 000000000..de57da279
--- /dev/null
+++ b/build_requirements.txt
@@ -0,0 +1,5 @@
+meson-python>=0.10.0
+Cython>=0.29.34,<3.0
+wheel==0.38.1
+ninja
+spin==0.3
diff --git a/building_with_meson.md b/building_with_meson.md
new file mode 100644
index 000000000..259498998
--- /dev/null
+++ b/building_with_meson.md
@@ -0,0 +1,71 @@
+# Building with Meson
+
+_Note: this is for early adopters. It has been tested on Linux and macOS, and
+with Python 3.9-3.12. Windows will be tested soon. There is one CI job to keep
+the build stable. This may have rough edges, please open an issue if you run
+into a problem._
+
+### Developer build
+
+**Install build tools:** Use one of:
+
+- `mamba env create -f environment.yml && mamba activate numpy-dev
+
+- `python -m pip install -r build_requirements.txt
+  # also make sure you have pkg-config and the usual system dependencies for
+  # NumPy`
+
+Then install spin:
+- `python -m pip install spin`
+
+**Compile and install:** `spin build`
+
+This builds in the `build/` directory, and installs into the `build-install` directory.
+
+Then run the test suite or a shell via `spin`:
+```
+spin test
+spin ipython
+```
+
+Alternatively, to use the package, add it to your `PYTHONPATH`:
+```
+export PYTHONPATH=${PWD}/build/lib64/python3.10/site-packages  # may vary
+pytest --pyargs numpy
+```
+
+
+### pip install
+
+Note that `pip` will use the default build system, which is (as of now) still
+`numpy.distutils`. In order to switch that default to Meson, uncomment the
+`build-backend = "mesonpy"` line at the top of `pyproject.toml`.
+
+After that is done, `pip install .` or `pip install --no-build-isolation .`
+will work as expected. As does building an sdist or wheel with `python -m build`.
+Note, however, that `pip install -e .` (in-place developer install) does not!
+Use `spin` instead (see above).
+
+
+
+### Workaround for a hiccup on Fedora
+
+- Fedora does not distribute `openblas.pc`. Install the following file in `~/lib/pkgconfig/openblas.pc`:
+
+```
+prefix=/usr
+includedir=${prefix}/include
+libdir=${prefix}/lib64
+
+Name: openblas
+Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
+Version: 0.3.19
+Cflags: -I${includedir}/openblas
+Libs: -L${libdir} -lopenblas
+```
+
+Then build with:
+
+```
+spin build -- -Dpkg_config_path=${HOME}/lib/pkgconfig
+```
diff --git a/doc/changelog/1.23.5-changelog.rst b/doc/changelog/1.23.5-changelog.rst
new file mode 100644
index 000000000..e24a291e4
--- /dev/null
+++ b/doc/changelog/1.23.5-changelog.rst
@@ -0,0 +1,30 @@
+
+Contributors
+============
+
+A total of 7 people contributed to this release.  People with a "+" by their
+names contributed a patch for the first time.
+
+* @DWesl
+* Aayush Agrawal +
+* Adam Knapp +
+* Charles Harris
+* Navpreet Singh +
+* Sebastian Berg
+* Tania Allard
+
+Pull requests merged
+====================
+
+A total of 10 pull requests were merged for this release.
+
+* `#22489 <https://github.com/numpy/numpy/pull/22489>`__: TST, MAINT: Replace most setup with setup_method (also teardown)
+* `#22490 <https://github.com/numpy/numpy/pull/22490>`__: MAINT, CI: Switch to cygwin/cygwin-install-action@v2
+* `#22494 <https://github.com/numpy/numpy/pull/22494>`__: TST: Make test_partial_iteration_cleanup robust but require leak...
+* `#22592 <https://github.com/numpy/numpy/pull/22592>`__: MAINT: Ensure graceful handling of large header sizes
+* `#22593 <https://github.com/numpy/numpy/pull/22593>`__: TYP: Spelling alignment for array flag literal
+* `#22594 <https://github.com/numpy/numpy/pull/22594>`__: BUG: Fix bounds checking for ``random.logseries``
+* `#22595 <https://github.com/numpy/numpy/pull/22595>`__: DEV: Update GH actions and Dockerfile for Gitpod
+* `#22596 <https://github.com/numpy/numpy/pull/22596>`__: CI: Only fetch in actions/checkout
+* `#22597 <https://github.com/numpy/numpy/pull/22597>`__: BUG: Decrement ref count in gentype_reduce if allocated memory...
+* `#22625 <https://github.com/numpy/numpy/pull/22625>`__: BUG: Histogramdd breaks on big arrays in Windows
diff --git a/doc/changelog/1.24.0-changelog.rst b/doc/changelog/1.24.0-changelog.rst
new file mode 100644
index 000000000..7ae8cd4c9
--- /dev/null
+++ b/doc/changelog/1.24.0-changelog.rst
@@ -0,0 +1,634 @@
+
+Contributors
+============
+
+A total of 177 people contributed to this release.  People with a "+" by their
+names contributed a patch for the first time.
+
+* @DWesl
+* @amagicmuffin +
+* @dg3192 +
+* @h-vetinari
+* @juztamau5 +
+* @swagatip +
+* Aaron Meurer
+* Aayush Agrawal +
+* Adam Knapp +
+* Adarsh Singh +
+* Al-Baraa El-Hag
+* Alban Desmaison +
+* Ales Crocaro +
+* Alex Low +
+* Alexander Grund +
+* Alexander Kanavin +
+* Andras Deak
+* Andrew Nelson
+* Angéllica Araujo +
+* Anselm Hahn +
+* Ari Cooper Davis +
+* Axel Huebl +
+* Bas van Beek
+* Bhavuk Kalra
+* Bram Ton +
+* Brent Tan +
+* Brigitta Sipőcz
+* Callum O'Riley +
+* Charles Harris
+* Chiara Marmo
+* Christoph Reiter
+* Damien Caliste
+* Dan Schult +
+* Daniel da Silva
+* DanielHabenicht +
+* Dave Goel +
+* David Gilbertson +
+* Developer-Ecosystem-Engineering
+* Digya Acharya +
+* Dimitri Papadopoulos Orfanos
+* Eric-Shang +
+* Evgeni Burovski
+* Ewout ter Hoeven
+* Felix Hirwa Nshuti +
+* Frazer McLean +
+* Ganesh Kathiresan
+* Gavin Zhang +
+* Gaëtan de Menten
+* Greg Lucas
+* Greg Sadetsky +
+* Gregory P. Smith [Google LLC] +
+* Hameer Abbasi
+* Hannah Aizenman +
+* Hood Chatham
+* I-Shen Leong
+* Iantra Solari +
+* Ikko Ashimine +
+* Inessa Pawson
+* Jake Bowhay +
+* Jake Close +
+* Jarrod Millman
+* Jason Thai
+* JessePires +
+* Jessé Pires +
+* Jhonatan Cunha +
+* Jingxuan He +
+* Johnathon Cusick +
+* Jon Morris
+* Jordy Williams +
+* Josh Wilson
+* João Fontes Gonçalves +
+* Juan Luis Cano Rodríguez
+* Jyn Spring 琴春 +
+* KIU Shueng Chuan
+* Karl Otness +
+* Kevin Sheppard
+* Kinshuk Dua +
+* Leo Fang
+* Leona Taric +
+* Lev Maximov +
+* Lillian Zha +
+* Loïc Estève
+* M. Eric Irrgang +
+* Mariusz Felisiak
+* Mark Harfouche
+* Martin Zacho +
+* Matheus Santana Patriarca +
+* Matt Williams +
+* Matteo Raso +
+* Matthew Barber
+* Matthew Brett
+* Matthew Sterrett +
+* Matthias Bussonnier
+* Matthias Koeppe +
+* Matti Picus
+* Meekail Zain +
+* Melissa Weber Mendonça
+* Michael Osthege +
+* Michael Siebert +
+* Mike Toews
+* Miki Watanabe +
+* Miles Cranmer +
+* Monika Kubek +
+* Muhammad Jarir Kanji +
+* Mukulika Pahari
+* Namami Shanker
+* Nathan Goldbaum +
+* Nathan Rooy +
+* Navpreet Singh +
+* Noritada Kobayashi +
+* Oleksiy Kononenko +
+* Omar Ali +
+* Pal Barta +
+* Pamphile Roy
+* Patrick Hoefler +
+* Pearu Peterson
+* Pedro Nacht +
+* Petar Mlinarić +
+* Peter Hawkins
+* Pey Lian Lim
+* Pieter Eendebak
+* Pradipta Ghosh
+* Pranab Das +
+* Precision Wordcraft LLC +
+* PrecisionWordcraft +
+* Rafael CF Sousa +
+* Rafael Cardoso Fernandes Sousa
+* Rafael Sousa +
+* Raghuveer Devulapalli
+* Ralf Gommers
+* Rin Cat (鈴猫) +
+* Robert Kern
+* Rohit Davas +
+* Rohit Goswami
+* Ross Barnowski
+* Roy Smart +
+* Ruth Comer +
+* Sabiha Tahsin Soha +
+* Sachin Krishnan T V +
+* Sanjana M Moodbagil +
+* Sanya Sinha +
+* Sarah Coffland +
+* Saransh Chopra +
+* Satish Kumar Mishra +
+* Satish Mishra +
+* Sayed Adel
+* Schrijvers Luc +
+* Sebastian Berg
+* Serge Guelton
+* Seva Lotoshnikov +
+* Shashank Gupta +
+* Shoban Chiddarth +
+* Shreya Singh +
+* Shreyas Joshi +
+* Sicheng Zeng +
+* Simran Makandar +
+* Shuangchi He +
+* Srimukh Sripada +
+* Stefan van der Walt
+* Stefanie Molin +
+* Stuart Archibald
+* Tania Allard
+* Thomas A Caswell
+* Thomas Kastl +
+* Thomas Mansencal +
+* Tony Newton / Teng Liu +
+* Toshiki Kataoka
+* Tyler Reddy
+* Vardhaman Kalloli +
+* Warren Weckesser
+* Will Ayd +
+* William Stein +
+* XinRu Lin +
+* Yin Li +
+* Yunika Bajracharya +
+* Zachary Blackwood +
+* 渡邉 美希 +
+
+Pull requests merged
+====================
+
+A total of 444 pull requests were merged for this release.
+
+* `#12065 <https://github.com/numpy/numpy/pull/12065>`__: API: Optimize np.isin and np.in1d for integer arrays and add...
+* `#15782 <https://github.com/numpy/numpy/pull/15782>`__: ENH: complete the 'vars' list of a module
+* `#16022 <https://github.com/numpy/numpy/pull/16022>`__: ENH: Adding __array_ufunc__ capability to MaskedArrays
+* `#16154 <https://github.com/numpy/numpy/pull/16154>`__: ENH: Add support for symbol to polynomial package
+* `#16507 <https://github.com/numpy/numpy/pull/16507>`__: BUG: Do not allow nditer to reduce the mask
+* `#16971 <https://github.com/numpy/numpy/pull/16971>`__: BUG: Fix three ``complex``- & ``float128``-related issues with ``nd_grid``
+* `#19388 <https://github.com/numpy/numpy/pull/19388>`__: ENH: Support character and character string arrays
+* `#20321 <https://github.com/numpy/numpy/pull/20321>`__: ENH: allow NumPy created .npy files to be appended in-place
+* `#20659 <https://github.com/numpy/numpy/pull/20659>`__: BUG: cross product. Added dtype conversions of inputs. See. #19138
+* `#20913 <https://github.com/numpy/numpy/pull/20913>`__: ENH, SIMD: Extend universal intrinsics to support IBMZ
+* `#20914 <https://github.com/numpy/numpy/pull/20914>`__: BUG: change ``ma.mean`` dtype to be consistent with ``np.mean``
+* `#20924 <https://github.com/numpy/numpy/pull/20924>`__: MAINT: Simplify element setting and use it for filling
+* `#20949 <https://github.com/numpy/numpy/pull/20949>`__: MAINT: Rename INSTALL.rst.txt to INSTALL.rst
+* `#21041 <https://github.com/numpy/numpy/pull/21041>`__: ENH: Implement string comparison ufuncs (or almost)
+* `#21084 <https://github.com/numpy/numpy/pull/21084>`__: MAINT: Fix computation of numpy.array_api.linalg.vector_norm
+* `#21098 <https://github.com/numpy/numpy/pull/21098>`__: DOC: Fix axis naming in ``argpartition`` docs
+* `#21103 <https://github.com/numpy/numpy/pull/21103>`__: NEP: Add NEP 50 draft about fixing scalar promotion rules
+* `#21152 <https://github.com/numpy/numpy/pull/21152>`__: DOC: verifying bug existence and fixes - replacement for PR 17851
+* `#21248 <https://github.com/numpy/numpy/pull/21248>`__: DOC: improve description of the ``data`` entry in ``__array_interface__``
+* `#21308 <https://github.com/numpy/numpy/pull/21308>`__: MAINT: Start testing with Python 3.11.
+* `#21403 <https://github.com/numpy/numpy/pull/21403>`__: MAINT: remove some names from main numpy namespace
+* `#21437 <https://github.com/numpy/numpy/pull/21437>`__: ENH: Add floating point error checking to (almost) all casts
+* `#21468 <https://github.com/numpy/numpy/pull/21468>`__: ENH: Use ``threadpoolctl`` in ``show_runtime`` (a new function)
+* `#21471 <https://github.com/numpy/numpy/pull/21471>`__: DOC: adding docstring to TooHardError class
+* `#21483 <https://github.com/numpy/numpy/pull/21483>`__: SIMD: Use universal intrinsics to implement comparison functions
+* `#21501 <https://github.com/numpy/numpy/pull/21501>`__: DOC: improve ``ascontiguousarray()`` and ``asfortranarray()`` examples
+* `#21504 <https://github.com/numpy/numpy/pull/21504>`__: MAINT: Fix some typos.
+* `#21507 <https://github.com/numpy/numpy/pull/21507>`__: BUG: Better report integer division overflow
+* `#21527 <https://github.com/numpy/numpy/pull/21527>`__: PERF: Fast path for dtype lookup of float and long
+* `#21537 <https://github.com/numpy/numpy/pull/21537>`__: DOC: Add a policy about inactive PRs.
+* `#21564 <https://github.com/numpy/numpy/pull/21564>`__: TST: Enable doctests in IO Howto with testsetup and testcleanup
+* `#21567 <https://github.com/numpy/numpy/pull/21567>`__: BUG: Adding the default pytest doctest instead of the ValueError
+* `#21572 <https://github.com/numpy/numpy/pull/21572>`__: MAINT: allow absolute module names in refguide-check
+* `#21579 <https://github.com/numpy/numpy/pull/21579>`__: DOC: Improve docstring of numpy.testing.assert_allclose
+* `#21581 <https://github.com/numpy/numpy/pull/21581>`__: REL: Prepare main for NumPy 1.24.0 development
+* `#21582 <https://github.com/numpy/numpy/pull/21582>`__: MAINT: Fix some small refcounting and similar issues
+* `#21583 <https://github.com/numpy/numpy/pull/21583>`__: Add ``CODEOWNER`` for the ``array_api`` module
+* `#21587 <https://github.com/numpy/numpy/pull/21587>`__: DOC: update logarithm docs as per theory.
+* `#21591 <https://github.com/numpy/numpy/pull/21591>`__: BUG, STY: Fix doctest failure in EXAMPLE_DOCSTRING.
+* `#21595 <https://github.com/numpy/numpy/pull/21595>`__: ENH: Add strict parameter to assert_array_equal. Fixes #9542
+* `#21596 <https://github.com/numpy/numpy/pull/21596>`__: TYP,MAINT: Allow unsigned integer inplace-ops to accept signed...
+* `#21600 <https://github.com/numpy/numpy/pull/21600>`__: DOC: Add missing links for NEP36 and NEP49
+* `#21601 <https://github.com/numpy/numpy/pull/21601>`__: DOC: update NEP29 to address PEP0602
+* `#21602 <https://github.com/numpy/numpy/pull/21602>`__: BUILD: fix tag name for travis: it is v1.23.0rc1
+* `#21605 <https://github.com/numpy/numpy/pull/21605>`__: MAINT: Adapt npt._GenericAlias to Python 3.11 types.GenericAlias...
+* `#21609 <https://github.com/numpy/numpy/pull/21609>`__: MAINT: Fix some API versions.
+* `#21611 <https://github.com/numpy/numpy/pull/21611>`__: MAINT: make MismatchCAPIWarnining into MismatchCAPIError
+* `#21615 <https://github.com/numpy/numpy/pull/21615>`__: MAINT: Remove compatibility shims for old versions of PyPy
+* `#21616 <https://github.com/numpy/numpy/pull/21616>`__: DOC: Describe the changed Python release cadence better
+* `#21617 <https://github.com/numpy/numpy/pull/21617>`__: MAINT, STY: Make download-wheels download source files.
+* `#21620 <https://github.com/numpy/numpy/pull/21620>`__: MAINT: Fortify masked in-place ops against promotion warnings
+* `#21622 <https://github.com/numpy/numpy/pull/21622>`__: TST: Skip F2PY tests without Fortran compilers
+* `#21623 <https://github.com/numpy/numpy/pull/21623>`__: ENH: Add equals_nans kwarg to np.unique
+* `#21624 <https://github.com/numpy/numpy/pull/21624>`__: DOC: move ``import_array`` and ``import_umath`` above ``PyModule_Create``
+* `#21626 <https://github.com/numpy/numpy/pull/21626>`__: API: Introduce optional (and partial) NEP 50 weak scalar logic
+* `#21627 <https://github.com/numpy/numpy/pull/21627>`__: ENH: adding casting option to numpy.stack.
+* `#21630 <https://github.com/numpy/numpy/pull/21630>`__: MAINT: back out conversion of npymath component to c++
+* `#21632 <https://github.com/numpy/numpy/pull/21632>`__: API: Retain ``arr.base`` more strictly in ``np.frombuffer``
+* `#21639 <https://github.com/numpy/numpy/pull/21639>`__: BUG: use explicit einsum_path whenever it is given
+* `#21641 <https://github.com/numpy/numpy/pull/21641>`__: DOC: Note version-switcher update in the release walkthrough
+* `#21644 <https://github.com/numpy/numpy/pull/21644>`__: MAINT: Fixup ``unique``'s ``equal_nan`` kwarg to match ``np.array_equal``
+* `#21645 <https://github.com/numpy/numpy/pull/21645>`__: DEP: Remove ``normed=`` keyword argument from histograms
+* `#21648 <https://github.com/numpy/numpy/pull/21648>`__: ENH: issue overflow warning when using ``abs`` on ``np.int8(-128)``
+* `#21651 <https://github.com/numpy/numpy/pull/21651>`__: MAINT: Remove unused/not useful CODEOWNERS file again
+* `#21654 <https://github.com/numpy/numpy/pull/21654>`__: MAINT: limit the number of decimals in Polynomial representation
+* `#21658 <https://github.com/numpy/numpy/pull/21658>`__: MAINT: improved overflow check to avoid undefined behavior
+* `#21659 <https://github.com/numpy/numpy/pull/21659>`__: BUG: Fix a bug left after f2py2e refactor
+* `#21661 <https://github.com/numpy/numpy/pull/21661>`__: TYP, ENH: Add annotations for the ``equal_nan`` keyword to ``np.unique``
+* `#21662 <https://github.com/numpy/numpy/pull/21662>`__: DOC: ``np`` in double backticks.
+* `#21663 <https://github.com/numpy/numpy/pull/21663>`__: DEP: Deprecate (rather than remove) the int-via-float parsing...
+* `#21664 <https://github.com/numpy/numpy/pull/21664>`__: DOC: Misc RST reformatting.
+* `#21666 <https://github.com/numpy/numpy/pull/21666>`__: MAINT: Change array API unique_*() functions to not compare nans...
+* `#21675 <https://github.com/numpy/numpy/pull/21675>`__: DOC: Add note about broadcasting support for ``random.Generator.multinomial``
+* `#21677 <https://github.com/numpy/numpy/pull/21677>`__: DOC: RST Titles Underline reordering
+* `#21681 <https://github.com/numpy/numpy/pull/21681>`__: MAINT: Point documentation version switcher at the docs homepage
+* `#21687 <https://github.com/numpy/numpy/pull/21687>`__: BUG: switch _CMP_NEQ_OQ to _CMP_NEQ_UQ for npyv_cmpneq_f[32,64]
+* `#21689 <https://github.com/numpy/numpy/pull/21689>`__: DOC: Fix broadcasting documentation.
+* `#21690 <https://github.com/numpy/numpy/pull/21690>`__: BUG: Prevent attempted broadcasting of 0-D output operands in...
+* `#21691 <https://github.com/numpy/numpy/pull/21691>`__: BUG: Small fixupes found using valgrind
+* `#21692 <https://github.com/numpy/numpy/pull/21692>`__: MAINT: Renamed doc/records.rst.txt to doc/records.rst
+* `#21701 <https://github.com/numpy/numpy/pull/21701>`__: BUG: Enable fortran preprocessing for ifort on Windows
+* `#21704 <https://github.com/numpy/numpy/pull/21704>`__: DOC: Mention positional-only arguments in the array API compatibility...
+* `#21705 <https://github.com/numpy/numpy/pull/21705>`__: BLD, SIMD: Fix detect armhf and hardened the Neon/ASIMD compile-time...
+* `#21707 <https://github.com/numpy/numpy/pull/21707>`__: MAINT: generate_umath.py: do not write full path into output...
+* `#21709 <https://github.com/numpy/numpy/pull/21709>`__: TST: Fixup loadtxt int-via-float tests when in release mode
+* `#21712 <https://github.com/numpy/numpy/pull/21712>`__: BUG: ``.f2py_f2cmap`` doesn't map ``long_long`` and other options
+* `#21715 <https://github.com/numpy/numpy/pull/21715>`__: DOC: Tell people to use only-binary option
+* `#21723 <https://github.com/numpy/numpy/pull/21723>`__: DOC: Update sphinx, numpydoc, and pydata-sphinx-theme
+* `#21727 <https://github.com/numpy/numpy/pull/21727>`__: MAINT, SIMD: Handle overflow gracefully in integer division
+* `#21731 <https://github.com/numpy/numpy/pull/21731>`__: ENH: Ensure dispatcher TypeErrors report original name
+* `#21732 <https://github.com/numpy/numpy/pull/21732>`__: ENH: Add support for platforms with missing fenv flags
+* `#21733 <https://github.com/numpy/numpy/pull/21733>`__: ENH: Fix pointer size determination for cross build
+* `#21734 <https://github.com/numpy/numpy/pull/21734>`__: ENH: Check that we are targeting x86 or x64 architecture before...
+* `#21735 <https://github.com/numpy/numpy/pull/21735>`__: ENH: cross compilation: use sysconfig to determine if x86_64...
+* `#21745 <https://github.com/numpy/numpy/pull/21745>`__: MAINT: Remove FPE helper code that is unnecessary on C99/C++11
+* `#21748 <https://github.com/numpy/numpy/pull/21748>`__: BUG: Fix for npyv_orc_b8 and npyv_xnor_b8 for s390x (z13)
+* `#21749 <https://github.com/numpy/numpy/pull/21749>`__: BUG, SIMD: Fix detecting NEON/ASIMD on aarch64
+* `#21750 <https://github.com/numpy/numpy/pull/21750>`__: CI: Fix CI SIMD build on s390x
+* `#21755 <https://github.com/numpy/numpy/pull/21755>`__: BUG: Do not skip value-based promotion path for large Python...
+* `#21759 <https://github.com/numpy/numpy/pull/21759>`__: BUG: Fix small reference leaks found with pytest-leaks
+* `#21763 <https://github.com/numpy/numpy/pull/21763>`__: MAINT: use f-string format in test_abc.py
+* `#21764 <https://github.com/numpy/numpy/pull/21764>`__: BUG: Fix a potential variable misuse bug
+* `#21766 <https://github.com/numpy/numpy/pull/21766>`__: CI: Guard compile-time CPU features tests
+* `#21771 <https://github.com/numpy/numpy/pull/21771>`__: MAINT: Add a check of the return value of PyMem_Calloc().
+* `#21773 <https://github.com/numpy/numpy/pull/21773>`__: MAINT: fix typo in string_ufuncs.cpp
+* `#21776 <https://github.com/numpy/numpy/pull/21776>`__: CI: add workflow for non-optimized builds
+* `#21778 <https://github.com/numpy/numpy/pull/21778>`__: MAINT, SIMD: remove orphan path vsx/conversion.h
+* `#21779 <https://github.com/numpy/numpy/pull/21779>`__: MAINT: random: Disallow complex inputs in multivariate_normal
+* `#21786 <https://github.com/numpy/numpy/pull/21786>`__: MAINT: fix up use of ``NPY_NO_DEPRECATED_API`` usage in f2py
+* `#21789 <https://github.com/numpy/numpy/pull/21789>`__: ENH: Change f2c declarations with void return type to int
+* `#21790 <https://github.com/numpy/numpy/pull/21790>`__: ENH: add overflow handling for scalar ``negative`` operation
+* `#21793 <https://github.com/numpy/numpy/pull/21793>`__: ENH: Add overflow handling for negative integers scalar multiplication
+* `#21794 <https://github.com/numpy/numpy/pull/21794>`__: BUG: lib: A loadtxt error message had two values reversed.
+* `#21795 <https://github.com/numpy/numpy/pull/21795>`__: ENH: Ensure that assertion of unsigned dtypes does not return...
+* `#21796 <https://github.com/numpy/numpy/pull/21796>`__: BUG: Fix comparison for empty structured arrays
+* `#21797 <https://github.com/numpy/numpy/pull/21797>`__: MAINT: Reduce object construction in np.require
+* `#21798 <https://github.com/numpy/numpy/pull/21798>`__: MAINT: use PyErr_SetString in _import_array where possible
+* `#21807 <https://github.com/numpy/numpy/pull/21807>`__: ENH: Handle the value attribute in F2Py wrappers
+* `#21812 <https://github.com/numpy/numpy/pull/21812>`__: BUG: Define ``<``, ``<=``, ``>``, ``>=`` for masked arrays
+* `#21815 <https://github.com/numpy/numpy/pull/21815>`__: BUG: Fix discovered MachAr (still used within valgrind)
+* `#21817 <https://github.com/numpy/numpy/pull/21817>`__: ENH: Always fill object fields with None rather than NULL
+* `#21823 <https://github.com/numpy/numpy/pull/21823>`__: MAINT: Try fixing broken Anaconda uploads.
+* `#21828 <https://github.com/numpy/numpy/pull/21828>`__: MAINT: Update main after 1.23.0 release.
+* `#21830 <https://github.com/numpy/numpy/pull/21830>`__: DOC: Change substract to subtract in comment
+* `#21832 <https://github.com/numpy/numpy/pull/21832>`__: PERF: Micro optimize np.linspace
+* `#21835 <https://github.com/numpy/numpy/pull/21835>`__: TST: Add f2py CLI tests
+* `#21836 <https://github.com/numpy/numpy/pull/21836>`__: DOC: F2PY documentation improvements
+* `#21837 <https://github.com/numpy/numpy/pull/21837>`__: DOC: Document expectation for object array initialization
+* `#21842 <https://github.com/numpy/numpy/pull/21842>`__: BUG: Fix in1d for empty integer array as input
+* `#21844 <https://github.com/numpy/numpy/pull/21844>`__: DOC: Rephrase dimensionality, size in broadcasting.rst
+* `#21848 <https://github.com/numpy/numpy/pull/21848>`__: BUG: Handle NaNs correctly for float16 during sorting
+* `#21849 <https://github.com/numpy/numpy/pull/21849>`__: MAINT: Update the documentation Makefile
+* `#21851 <https://github.com/numpy/numpy/pull/21851>`__: BUG: Use ``keepdims`` during normalization in ``np.average`` and...
+* `#21853 <https://github.com/numpy/numpy/pull/21853>`__: DOC: Update the docstring for np.around
+* `#21854 <https://github.com/numpy/numpy/pull/21854>`__: DOC: mention changes to ``max_rows`` behaviour in ``np.loadtxt``
+* `#21855 <https://github.com/numpy/numpy/pull/21855>`__: DOC: Replace the mathematical notation N(...) with text.
+* `#21856 <https://github.com/numpy/numpy/pull/21856>`__: DOC: Mention uniform in the np.random.Generator.random function.
+* `#21857 <https://github.com/numpy/numpy/pull/21857>`__: BUG: Reject non integer array-likes with size 1 in delete
+* `#21860 <https://github.com/numpy/numpy/pull/21860>`__: BUG: Fix numpy.isin for timedelta dtype
+* `#21861 <https://github.com/numpy/numpy/pull/21861>`__: DOC: clarify loadtxt input cols requirement
+* `#21863 <https://github.com/numpy/numpy/pull/21863>`__: ENH,MAINT: Improve and simplify scalar floating point warnings
+* `#21872 <https://github.com/numpy/numpy/pull/21872>`__: CI: tools: Remove a long but obsolete comment from travis-test.sh
+* `#21875 <https://github.com/numpy/numpy/pull/21875>`__: ENH: Implement correct scalar and integer overflow errors for...
+* `#21876 <https://github.com/numpy/numpy/pull/21876>`__: DOC: Fixed minor typo in reference
+* `#21877 <https://github.com/numpy/numpy/pull/21877>`__: DOC: dark theme css rules
+* `#21879 <https://github.com/numpy/numpy/pull/21879>`__: CI: skip azp and circleCI logic
+* `#21881 <https://github.com/numpy/numpy/pull/21881>`__: MAINT: Disable checks for Win workaround for GCC
+* `#21884 <https://github.com/numpy/numpy/pull/21884>`__: MAINT: Fix non-void function does not return a value warning
+* `#21885 <https://github.com/numpy/numpy/pull/21885>`__: DOC: Link to PEP-484 in the typing docs
+* `#21886 <https://github.com/numpy/numpy/pull/21886>`__: Fix lib flags for librandom
+* `#21887 <https://github.com/numpy/numpy/pull/21887>`__: BLD: Allow GCC compile on mingw-w64-based systems
+* `#21890 <https://github.com/numpy/numpy/pull/21890>`__: BUG: Fix KeyError in crackfortran operator support
+* `#21894 <https://github.com/numpy/numpy/pull/21894>`__: BUG: Fix datetime_to_timedelta_resolve_descriptors signature
+* `#21895 <https://github.com/numpy/numpy/pull/21895>`__: ENH, CI: Add Emscripten to CI
+* `#21896 <https://github.com/numpy/numpy/pull/21896>`__: BLD: Make can_link_svml return False for 32bit builds on x86_64
+* `#21904 <https://github.com/numpy/numpy/pull/21904>`__: DOC: Add usage example to np.char.center docstring
+* `#21905 <https://github.com/numpy/numpy/pull/21905>`__: DOC: Fix the interpolation formulae for quantile and percentile
+* `#21909 <https://github.com/numpy/numpy/pull/21909>`__: DOC: fix typo in ``numpy._typing._NestedSequence`` docstring example
+* `#21921 <https://github.com/numpy/numpy/pull/21921>`__: DOC: Fix typo on Byte-swapping page
+* `#21922 <https://github.com/numpy/numpy/pull/21922>`__: DOC: fix typo on custom array containers page
+* `#21924 <https://github.com/numpy/numpy/pull/21924>`__: CI: Use OpenBLAS again on Cygwin
+* `#21925 <https://github.com/numpy/numpy/pull/21925>`__: BUG: Fix subarray to object cast ownership details
+* `#21926 <https://github.com/numpy/numpy/pull/21926>`__: MAINT: random: Annotate default_rng with cython.embedsignature
+* `#21927 <https://github.com/numpy/numpy/pull/21927>`__: DOC: Add a note about security and NumPy to the documentation
+* `#21928 <https://github.com/numpy/numpy/pull/21928>`__: BUG: Fix the implementation of numpy.array_api.vecdot
+* `#21943 <https://github.com/numpy/numpy/pull/21943>`__: DOC, MAINT: Document the C-API incompatibility error and point...
+* `#21945 <https://github.com/numpy/numpy/pull/21945>`__: MAINT, DOC: Update release guide
+* `#21946 <https://github.com/numpy/numpy/pull/21946>`__: BUG: Reorder extern "C" to only apply to function declarations...
+* `#21948 <https://github.com/numpy/numpy/pull/21948>`__: [DOC] Double backticks in lagfit.
+* `#21954 <https://github.com/numpy/numpy/pull/21954>`__: TST: Add tests for FP16 umath functions
+* `#21955 <https://github.com/numpy/numpy/pull/21955>`__: ENH: Vectorize FP16 umath functions using AVX512
+* `#21956 <https://github.com/numpy/numpy/pull/21956>`__: MAINT: Update main after 1.23.1 release.
+* `#21957 <https://github.com/numpy/numpy/pull/21957>`__: TYP,ENH: Mark all unhashable classes as such
+* `#21959 <https://github.com/numpy/numpy/pull/21959>`__: BUG: Use ``Popen`` to silently invoke f77 -v
+* `#21960 <https://github.com/numpy/numpy/pull/21960>`__: Revert "ENH: Adding __array_ufunc__ capability to MaskedArrays"
+* `#21963 <https://github.com/numpy/numpy/pull/21963>`__: BLD: remove outdated pin for ``packaging`` on macOS arm64
+* `#21965 <https://github.com/numpy/numpy/pull/21965>`__: Added priority in bug-report issue-template
+* `#21968 <https://github.com/numpy/numpy/pull/21968>`__: ENH: Add ``__array_ufunc__`` typing support to the ``nin=1`` ufuncs
+* `#21973 <https://github.com/numpy/numpy/pull/21973>`__: DOC: correct docstring for numpy.correlate()
+* `#21974 <https://github.com/numpy/numpy/pull/21974>`__: MAINT, TYP: Fix ``np.angle`` dtype-overloads
+* `#21976 <https://github.com/numpy/numpy/pull/21976>`__: ENH: Add the capability to swap the singleton bit generator
+* `#21977 <https://github.com/numpy/numpy/pull/21977>`__: ENH: Adding __array_ufunc__ capability to MaskedArrays
+* `#21979 <https://github.com/numpy/numpy/pull/21979>`__: BUG: Fix experimental dtype slot numbers
+* `#21981 <https://github.com/numpy/numpy/pull/21981>`__: TST: ensure ``np.equal.reduce`` raises a ``TypeError``
+* `#21982 <https://github.com/numpy/numpy/pull/21982>`__: MAINT: Do not let ``_GenericAlias`` wrap the underlying classes'...
+* `#21983 <https://github.com/numpy/numpy/pull/21983>`__: TYP,MAINT: Allow ``einsum`` subscripts to be passed via integer...
+* `#21984 <https://github.com/numpy/numpy/pull/21984>`__: MAINT,TYP: Add object-overloads for the ``np.generic`` rich comparisons
+* `#21986 <https://github.com/numpy/numpy/pull/21986>`__: MAINT: Remove two unused imports
+* `#21991 <https://github.com/numpy/numpy/pull/21991>`__: MAINT: rm old warning
+* `#21992 <https://github.com/numpy/numpy/pull/21992>`__: DOC: cross-reference descriptions of frombuffer and ndarray.tobytes
+* `#21993 <https://github.com/numpy/numpy/pull/21993>`__: BUG: fix ma.minimum.reduce with axis keyword
+* `#21995 <https://github.com/numpy/numpy/pull/21995>`__: BUG: Distinguish exact vs. equivalent dtype for C type aliases.
+* `#21996 <https://github.com/numpy/numpy/pull/21996>`__: BUG: Avoid errors on NULL during deepcopy
+* `#21997 <https://github.com/numpy/numpy/pull/21997>`__: DOC: unify ``np.transpose``, ``np.ndarray.transpose``, and ``np.ndarray.T``
+* `#21999 <https://github.com/numpy/numpy/pull/21999>`__: BUG: Fix masked median bug
+* `#22000 <https://github.com/numpy/numpy/pull/22000>`__: DOC: some improvements in the NumPy/MATLAB comparison
+* `#22002 <https://github.com/numpy/numpy/pull/22002>`__: DOC: add links to ``linalg`` in docs of ``dot`` and ``matmul``
+* `#22004 <https://github.com/numpy/numpy/pull/22004>`__: DEP: Finalize ragged array creation deprecation
+* `#22008 <https://github.com/numpy/numpy/pull/22008>`__: MAINT: remove unneeded ``__future__`` imports
+* `#22009 <https://github.com/numpy/numpy/pull/22009>`__: BUG: fix np.average for Fraction elements
+* `#22010 <https://github.com/numpy/numpy/pull/22010>`__: DOC: Add versionchanged for converter callable behavior.
+* `#22013 <https://github.com/numpy/numpy/pull/22013>`__: DOC: updated masked_print_option and added explanation
+* `#22014 <https://github.com/numpy/numpy/pull/22014>`__: BUG/ENH: Allow bit generators to supply their own constructor
+* `#22016 <https://github.com/numpy/numpy/pull/22016>`__: BUG: Revert using __array_ufunc__ for MaskedArray
+* `#22017 <https://github.com/numpy/numpy/pull/22017>`__: ENH: reorder includes for testing on top of system installations...
+* `#22022 <https://github.com/numpy/numpy/pull/22022>`__: MAINT,TYP: Allow the ``squeeze`` and ``transpose`` method to...
+* `#22024 <https://github.com/numpy/numpy/pull/22024>`__: BUG: Expose string_heapsort algorithm in a shared header
+* `#22043 <https://github.com/numpy/numpy/pull/22043>`__: BLD: use macos-11 image on azure, macos-1015 is deprecated
+* `#22045 <https://github.com/numpy/numpy/pull/22045>`__: ENH: allow importlib.LazyLoader to work with numpy and add test...
+* `#22046 <https://github.com/numpy/numpy/pull/22046>`__: BUG: Make ``mask_invalid`` consistent with ``mask_where`` if ``copy``...
+* `#22053 <https://github.com/numpy/numpy/pull/22053>`__: MAINT: Quiet the anaconda uploads.
+* `#22060 <https://github.com/numpy/numpy/pull/22060>`__: PERF: Improve import time of numpy
+* `#22071 <https://github.com/numpy/numpy/pull/22071>`__: MAINT: fix typo in test_array_object.py
+* `#22081 <https://github.com/numpy/numpy/pull/22081>`__: PERF: Remove numpy.compat._pep440 from default imports
+* `#22083 <https://github.com/numpy/numpy/pull/22083>`__: BUG: Fix skip condition for test_loss_of_precision[complex256]
+* `#22087 <https://github.com/numpy/numpy/pull/22087>`__: ENH: raise TypeError when arange() is called with string dtype
+* `#22090 <https://github.com/numpy/numpy/pull/22090>`__: MAINT: Simplify npymath
+* `#22096 <https://github.com/numpy/numpy/pull/22096>`__: PERF: Improve intrinsics for tobits and pack on Apple silicon
+* `#22099 <https://github.com/numpy/numpy/pull/22099>`__: TST: Remove workaround for gh-9968 from ``test_scalar_methods.test_roundtrip``
+* `#22102 <https://github.com/numpy/numpy/pull/22102>`__: BLD: Try building python3.11 wheels.
+* `#22105 <https://github.com/numpy/numpy/pull/22105>`__: TST: fix test_linear_interpolation_formula_symmetric
+* `#22110 <https://github.com/numpy/numpy/pull/22110>`__: BUG: Address failures in aarch64 gcc builds due to #22096
+* `#22111 <https://github.com/numpy/numpy/pull/22111>`__: BUG: Missed a case in the diff merge for #22110
+* `#22112 <https://github.com/numpy/numpy/pull/22112>`__: MAINT: remove redundant reversal of eigenvalues order in svd...
+* `#22116 <https://github.com/numpy/numpy/pull/22116>`__: MAINT,DOC: Remove sphinx-panels in favor of sphinx-design
+* `#22117 <https://github.com/numpy/numpy/pull/22117>`__: DOC: Reorganize user guide outline
+* `#22118 <https://github.com/numpy/numpy/pull/22118>`__: DOC: Fix documentation for percentile and quantile
+* `#22120 <https://github.com/numpy/numpy/pull/22120>`__: DOC: Explain spawn_key a little more.
+* `#22122 <https://github.com/numpy/numpy/pull/22122>`__: DOC: Explain how to use sequences of integers as seeds.
+* `#22124 <https://github.com/numpy/numpy/pull/22124>`__: MAINT: support IBM i system
+* `#22127 <https://github.com/numpy/numpy/pull/22127>`__: BLD: Add Python 3.11 wheels to aarch64 build
+* `#22130 <https://github.com/numpy/numpy/pull/22130>`__: MAINT: Update main after 1.23.2 release.
+* `#22132 <https://github.com/numpy/numpy/pull/22132>`__: DOC: Add a release note for bit generator reduce changes
+* `#22139 <https://github.com/numpy/numpy/pull/22139>`__: DEP: drop support for msvc<=1900 and Interix
+* `#22142 <https://github.com/numpy/numpy/pull/22142>`__: MAINT: Update setup.py for Python 3.11.
+* `#22143 <https://github.com/numpy/numpy/pull/22143>`__: MAINT: Update the RELEASE_WALKTHROUGH file.
+* `#22144 <https://github.com/numpy/numpy/pull/22144>`__: DOC: Add missing word
+* `#22147 <https://github.com/numpy/numpy/pull/22147>`__: DOC: fix linalg.tensorsolve docstring
+* `#22149 <https://github.com/numpy/numpy/pull/22149>`__: DOC: Copy-edit the 'partition' docstring.
+* `#22150 <https://github.com/numpy/numpy/pull/22150>`__: CI: Test NumPy build against old versions of GCC(6, 7, 8)
+* `#22152 <https://github.com/numpy/numpy/pull/22152>`__: BUG: Support using libunwind for backtrack
+* `#22154 <https://github.com/numpy/numpy/pull/22154>`__: DOC: add more prominent warnings to pin setuptools
+* `#22159 <https://github.com/numpy/numpy/pull/22159>`__: DEP: drop older cygwin compatibility shims
+* `#22161 <https://github.com/numpy/numpy/pull/22161>`__: MAINT: simplify complex math function handling in npymath
+* `#22163 <https://github.com/numpy/numpy/pull/22163>`__: PERF: Eliminate slow check for pypy during numpy import
+* `#22164 <https://github.com/numpy/numpy/pull/22164>`__: ENH: Improve tanh for architectures without efficient gather/scatter...
+* `#22168 <https://github.com/numpy/numpy/pull/22168>`__: ENH: Remove AVX related functions from non x86 based builds
+* `#22169 <https://github.com/numpy/numpy/pull/22169>`__: MAINT: fix defines for universal2 python builds of NumPy
+* `#22171 <https://github.com/numpy/numpy/pull/22171>`__: DOC: Note symmetry requirement in ``multivariate_normal`` error
+* `#22179 <https://github.com/numpy/numpy/pull/22179>`__: TST: Implemented an unused test for np.random.randint
+* `#22189 <https://github.com/numpy/numpy/pull/22189>`__: MAINT: fix an incorrect pointer type usage in f2py
+* `#22193 <https://github.com/numpy/numpy/pull/22193>`__: BUG: change overloads to play nice with pyright.
+* `#22194 <https://github.com/numpy/numpy/pull/22194>`__: BUG: Fix circleci build
+* `#22199 <https://github.com/numpy/numpy/pull/22199>`__: MAINT: Unpin towncrier
+* `#22204 <https://github.com/numpy/numpy/pull/22204>`__: TST,BUG: Use fork context to fix MacOS savez test
+* `#22205 <https://github.com/numpy/numpy/pull/22205>`__: DOC: Clarify that ``like`` is not passed to ``function``
+* `#22206 <https://github.com/numpy/numpy/pull/22206>`__: DOC: Fix typo disutils -> distutils in numpy.distutils migration...
+* `#22210 <https://github.com/numpy/numpy/pull/22210>`__: DOC: Fix typos in cast warning release note
+* `#22212 <https://github.com/numpy/numpy/pull/22212>`__: TYP,BUG: Reduce argument validation in C-based ``__class_getitem__``
+* `#22227 <https://github.com/numpy/numpy/pull/22227>`__: DOC: fix up release note
+* `#22228 <https://github.com/numpy/numpy/pull/22228>`__: MAINT: Remove long deprecated functionality from np.ma
+* `#22235 <https://github.com/numpy/numpy/pull/22235>`__: Remove incorrect comment about checking generated C files in
+* `#22236 <https://github.com/numpy/numpy/pull/22236>`__: BUG: Fix incorrect refcounting in new ``asarray`` path
+* `#22239 <https://github.com/numpy/numpy/pull/22239>`__: MAINT: Update main after 1.23.3 release.
+* `#22240 <https://github.com/numpy/numpy/pull/22240>`__: ENH: Use SVML for fp32 and fp64 power and arctan2
+* `#22242 <https://github.com/numpy/numpy/pull/22242>`__: BLD: add back stdlib.h include in pcg64.h
+* `#22245 <https://github.com/numpy/numpy/pull/22245>`__: MAINT: random: remove ``get_info`` from "extending with Cython"...
+* `#22251 <https://github.com/numpy/numpy/pull/22251>`__: TST: Move new ``asarray`` test to a more appropriate place.
+* `#22253 <https://github.com/numpy/numpy/pull/22253>`__: DOC: Update concatenate exception message.
+* `#22254 <https://github.com/numpy/numpy/pull/22254>`__: DOC: Improve ``converters`` parameter description for loadtxt
+* `#22256 <https://github.com/numpy/numpy/pull/22256>`__: DOC: Add C API example for NPY_ITER_MULTI_INDEX
+* `#22259 <https://github.com/numpy/numpy/pull/22259>`__: DOC: Better report integer division overflow (#21506)
+* `#22261 <https://github.com/numpy/numpy/pull/22261>`__: NEP: Make NEP 51 to propose changing the scalar representation
+* `#22263 <https://github.com/numpy/numpy/pull/22263>`__: DOC: Clarified how finfo works with complex numbers (#22260)
+* `#22272 <https://github.com/numpy/numpy/pull/22272>`__: MAINT, Haiku defines neither __STDC_NO_THREADS__ nor __GLIBC__
+* `#22276 <https://github.com/numpy/numpy/pull/22276>`__: DOC: Update for return value of np.ptp()
+* `#22279 <https://github.com/numpy/numpy/pull/22279>`__: DOC: Add example to msort docstring
+* `#22280 <https://github.com/numpy/numpy/pull/22280>`__: DOC: updated the description for array-like type in histogramdd...
+* `#22282 <https://github.com/numpy/numpy/pull/22282>`__: DOC: Add example for find
+* `#22285 <https://github.com/numpy/numpy/pull/22285>`__: DOC: Add ``isupper`` examples
+* `#22291 <https://github.com/numpy/numpy/pull/22291>`__: DOC: Add examples for isdigit and str_len
+* `#22292 <https://github.com/numpy/numpy/pull/22292>`__: DOC: Add copyto example
+* `#22294 <https://github.com/numpy/numpy/pull/22294>`__: DOC: add examples to np.char.multiply
+* `#22295 <https://github.com/numpy/numpy/pull/22295>`__: DOC: Added an example for isupper() function
+* `#22296 <https://github.com/numpy/numpy/pull/22296>`__: BUG: Memory leaks in numpy.nested_iters
+* `#22297 <https://github.com/numpy/numpy/pull/22297>`__: DOC: Add example to np.prod
+* `#22298 <https://github.com/numpy/numpy/pull/22298>`__: DOC: add example for ma.unique function
+* `#22299 <https://github.com/numpy/numpy/pull/22299>`__: DOC: Add examples for join and index
+* `#22300 <https://github.com/numpy/numpy/pull/22300>`__: DOC: examples for ``np.char.isdecimal`` and ``np.char.isnumeric``
+* `#22302 <https://github.com/numpy/numpy/pull/22302>`__: DOC: Change in the documentation for chebpts2 method
+* `#22304 <https://github.com/numpy/numpy/pull/22304>`__: DOC: default_rng cleanup
+* `#22306 <https://github.com/numpy/numpy/pull/22306>`__: ENH: Implement essential intrinsics required by the upcoming...
+* `#22308 <https://github.com/numpy/numpy/pull/22308>`__: DOC: add examples to numpy.char.replace
+* `#22309 <https://github.com/numpy/numpy/pull/22309>`__: DOC: Correct usage example for np.char.decode docstring
+* `#22312 <https://github.com/numpy/numpy/pull/22312>`__: MAINT: Shorten test output on travis builds
+* `#22313 <https://github.com/numpy/numpy/pull/22313>`__: DEP: Deprecate fastCopyAndTranspose
+* `#22314 <https://github.com/numpy/numpy/pull/22314>`__: MAINT: Shorten wheel test output on travis builds
+* `#22316 <https://github.com/numpy/numpy/pull/22316>`__: ENH: Allow creating structured void scalars by passing dtype
+* `#22319 <https://github.com/numpy/numpy/pull/22319>`__: TST: add functional tests for kron
+* `#22321 <https://github.com/numpy/numpy/pull/22321>`__: MAINT: core: Fix a typo in a datetime error message.
+* `#22324 <https://github.com/numpy/numpy/pull/22324>`__: MAINT: update function's __module__ attribute in deprecate
+* `#22325 <https://github.com/numpy/numpy/pull/22325>`__: SIMD: Improve the performance of NEON vector initializer
+* `#22326 <https://github.com/numpy/numpy/pull/22326>`__: CI, SIMD: Test and build without the support of AVX2 and AVX512
+* `#22327 <https://github.com/numpy/numpy/pull/22327>`__: BUG: Fix complex vector dot with more than NPY_CBLAS_CHUNK elements
+* `#22331 <https://github.com/numpy/numpy/pull/22331>`__: DOC: Adding examples to ``ma.max`` function
+* `#22332 <https://github.com/numpy/numpy/pull/22332>`__: DOC: Minor typo in docs
+* `#22334 <https://github.com/numpy/numpy/pull/22334>`__: DOC: Added missing dtype attribute to ``iinfo`` and ``finfo`` docstring
+* `#22336 <https://github.com/numpy/numpy/pull/22336>`__: MAINT: Rm numpydoc remnant example docstring.
+* `#22342 <https://github.com/numpy/numpy/pull/22342>`__: MAINT: update sde toolkit to 9.0, fix download link
+* `#22343 <https://github.com/numpy/numpy/pull/22343>`__: DOC: fixed two more typos in docstrings
+* `#22344 <https://github.com/numpy/numpy/pull/22344>`__: DOC: fixed minor typo in percentile docstring
+* `#22354 <https://github.com/numpy/numpy/pull/22354>`__: MAINT: switch sponsor link from numfocus to opencollective
+* `#22356 <https://github.com/numpy/numpy/pull/22356>`__: REV: Loosen ``lookfor``'s import try/except again
+* `#22357 <https://github.com/numpy/numpy/pull/22357>`__: TYP,ENH: Mark ``numpy.typing`` protocols as runtime checkable
+* `#22358 <https://github.com/numpy/numpy/pull/22358>`__: ENH,TYP: Add special casing for ``ndarray``-based indexing
+* `#22359 <https://github.com/numpy/numpy/pull/22359>`__: TYP,MAINT: Change more overloads to play nice with pyright
+* `#22360 <https://github.com/numpy/numpy/pull/22360>`__: TST,TYP: Bump mypy to 0.981
+* `#22362 <https://github.com/numpy/numpy/pull/22362>`__: DOC: Updated amin/amax output dimensions for tuple axis
+* `#22363 <https://github.com/numpy/numpy/pull/22363>`__: DOC: Added examples to ``maa.min`` function
+* `#22365 <https://github.com/numpy/numpy/pull/22365>`__: BUG: Add ``__array_api_version__`` to ``numpy.array_api`` namespace
+* `#22367 <https://github.com/numpy/numpy/pull/22367>`__: BUILD: add permissions to github actions
+* `#22371 <https://github.com/numpy/numpy/pull/22371>`__: MAINT: remove permission restrictions for PR labeler [skip ci]
+* `#22372 <https://github.com/numpy/numpy/pull/22372>`__: DOC: Update delimiter param description.
+* `#22373 <https://github.com/numpy/numpy/pull/22373>`__: DOC, MAINT: Remove unused files
+* `#22374 <https://github.com/numpy/numpy/pull/22374>`__: DOC: typo additional colon in beginners tutorial
+* `#22375 <https://github.com/numpy/numpy/pull/22375>`__: DOC: How to partition domains
+* `#22379 <https://github.com/numpy/numpy/pull/22379>`__: ENH: allow explicit ``like=None`` in all array creation functions
+* `#22385 <https://github.com/numpy/numpy/pull/22385>`__: DEP: Deprecate conversion of out-of-bound Python integers
+* `#22393 <https://github.com/numpy/numpy/pull/22393>`__: MAINT: Ensure graceful handling of large header sizes
+* `#22398 <https://github.com/numpy/numpy/pull/22398>`__: DOC: Update broken link to diataxis framework.
+* `#22399 <https://github.com/numpy/numpy/pull/22399>`__: MAINT: Fix typos found by codespell
+* `#22401 <https://github.com/numpy/numpy/pull/22401>`__: MAINT: fix obsolete code comment
+* `#22404 <https://github.com/numpy/numpy/pull/22404>`__: DOC: added ``ma.round`` and ``ma.round_`` examples
+* `#22406 <https://github.com/numpy/numpy/pull/22406>`__: DOC: Add changelog for ``masked_invalid`` change.
+* `#22407 <https://github.com/numpy/numpy/pull/22407>`__: DOC: Fix title level for release note improvements
+* `#22409 <https://github.com/numpy/numpy/pull/22409>`__: DOC: Adopt a harmonic color scheme with the dark mode of pydata-sphinx-theme
+* `#22411 <https://github.com/numpy/numpy/pull/22411>`__: DOC: Remove documentation specific to Python 2
+* `#22418 <https://github.com/numpy/numpy/pull/22418>`__: TST, BLD: Fix failing aarch64 wheel builds.
+* `#22419 <https://github.com/numpy/numpy/pull/22419>`__: MAINT: Remove PyCObject from the SWIG interface
+* `#22421 <https://github.com/numpy/numpy/pull/22421>`__: DOC: Replace CObject with Capsule consistently
+* `#22422 <https://github.com/numpy/numpy/pull/22422>`__: ENH: Expose ``ufunc.resolve_dtypes`` and strided loop access
+* `#22430 <https://github.com/numpy/numpy/pull/22430>`__: MAINT: Update main after 1.23.4 release.
+* `#22432 <https://github.com/numpy/numpy/pull/22432>`__: MAINT: always use sys.base_prefix, never use sys.real_prefix
+* `#22433 <https://github.com/numpy/numpy/pull/22433>`__: DOC: remove reference to Python 2
+* `#22436 <https://github.com/numpy/numpy/pull/22436>`__: DOC: Clarify docstring of ``masked_equal`` and ``masked_values``
+* `#22438 <https://github.com/numpy/numpy/pull/22438>`__: MAINT: Update versioneer 0.19 → 0.26
+* `#22440 <https://github.com/numpy/numpy/pull/22440>`__: MAINT: fix typo in f2c_lapack.c
+* `#22441 <https://github.com/numpy/numpy/pull/22441>`__: MAINT,DOC: Revert "MAINT: fix typo in f2c_lapack.c"
+* `#22442 <https://github.com/numpy/numpy/pull/22442>`__: ENH: unstructured_to_structured converts dtype argument
+* `#22447 <https://github.com/numpy/numpy/pull/22447>`__: TYP: Spelling alignment for array flag literal
+* `#22450 <https://github.com/numpy/numpy/pull/22450>`__: BUG: Fix bounds checking for random.logseries
+* `#22452 <https://github.com/numpy/numpy/pull/22452>`__: DEV: Update GH actions and Dockerfile for Gitpod
+* `#22453 <https://github.com/numpy/numpy/pull/22453>`__: DOC: Update NEP 50 text since integer conversion errors now exist.
+* `#22456 <https://github.com/numpy/numpy/pull/22456>`__: DEP: Proposal to deprecate the ``msort`` convenience function
+* `#22458 <https://github.com/numpy/numpy/pull/22458>`__: ENH: Allow all allocated operands in nditer/NpyIter
+* `#22462 <https://github.com/numpy/numpy/pull/22462>`__: MAINT: refactor mandatory npymath functions to #define macros
+* `#22463 <https://github.com/numpy/numpy/pull/22463>`__: DOC: remove mention of ``ipython -p numpy``.
+* `#22466 <https://github.com/numpy/numpy/pull/22466>`__: MAINT: Ensure that fmin loops do not show up multiple times
+* `#22475 <https://github.com/numpy/numpy/pull/22475>`__: MAINT: remove code specific to Python 2
+* `#22478 <https://github.com/numpy/numpy/pull/22478>`__: BUG: -unsigned_int(0) no overflow warning
+* `#22479 <https://github.com/numpy/numpy/pull/22479>`__: MAINT: remove u-prefix for former Unicode strings
+* `#22480 <https://github.com/numpy/numpy/pull/22480>`__: DOC: fix typo in advanced_debugging.rst [skip ci]
+* `#22481 <https://github.com/numpy/numpy/pull/22481>`__: GitHub Workflows security hardening
+* `#22482 <https://github.com/numpy/numpy/pull/22482>`__: ENH: Add OpenSSF Scorecard GitHub Action
+* `#22483 <https://github.com/numpy/numpy/pull/22483>`__: MAINT: change subprocess arguments from Python>=3.7
+* `#22485 <https://github.com/numpy/numpy/pull/22485>`__: TST: Make test_partial_iteration_cleanup robust but require leak...
+* `#22487 <https://github.com/numpy/numpy/pull/22487>`__: TST, MAINT: Replace most setup with setup_method (also teardown)
+* `#22488 <https://github.com/numpy/numpy/pull/22488>`__: MAINT, CI: Switch to cygwin/cygwin-install-action@v2
+* `#22491 <https://github.com/numpy/numpy/pull/22491>`__: CI: Make cygwin build run for branches.
+* `#22496 <https://github.com/numpy/numpy/pull/22496>`__: MAINT: Use SPDX license expression in project metadata
+* `#22498 <https://github.com/numpy/numpy/pull/22498>`__: REL: readme in PyPI plus test
+* `#22500 <https://github.com/numpy/numpy/pull/22500>`__: DOC: added example in char
+* `#22503 <https://github.com/numpy/numpy/pull/22503>`__: CI: Only fetch in actions/checkout
+* `#22504 <https://github.com/numpy/numpy/pull/22504>`__: DOC: Add code-formatting on install instructions
+* `#22505 <https://github.com/numpy/numpy/pull/22505>`__: DOC: fixed minor typo in f2py docs
+* `#22510 <https://github.com/numpy/numpy/pull/22510>`__: MAINT: Ensure raw dlpack deleter works when called without the...
+* `#22519 <https://github.com/numpy/numpy/pull/22519>`__: DOC: fixed pad_width description in numpy.pad
+* `#22521 <https://github.com/numpy/numpy/pull/22521>`__: DOC: Remove "current" from byte-order note and expand it slightly
+* `#22524 <https://github.com/numpy/numpy/pull/22524>`__: MAINT, BLD: Wheel CI: Update cibuildwheel to 2.11.2
+* `#22525 <https://github.com/numpy/numpy/pull/22525>`__: BLD: update OpenBLAS to 0.3.21 and clean up openblas download...
+* `#22531 <https://github.com/numpy/numpy/pull/22531>`__: BLD, CI: Update setup-python
+* `#22538 <https://github.com/numpy/numpy/pull/22538>`__: DOC: update libnpymath docs on its status and how to consume...
+* `#22540 <https://github.com/numpy/numpy/pull/22540>`__: DEP: Expire deprecation of dtype/signature allowing instances
+* `#22541 <https://github.com/numpy/numpy/pull/22541>`__: DEP: Expire deprecation to ignore bad dtype= in logical ufuncs
+* `#22542 <https://github.com/numpy/numpy/pull/22542>`__: API: Always use BufferError when dlpack export fails
+* `#22543 <https://github.com/numpy/numpy/pull/22543>`__: DOC: Add instruction to initialize git submodules
+* `#22548 <https://github.com/numpy/numpy/pull/22548>`__: MAINT: Fix designator order not matching declaration order
+* `#22550 <https://github.com/numpy/numpy/pull/22550>`__: MAINT: Fix Fortran order flag use (using bool rather than enum)
+* `#22552 <https://github.com/numpy/numpy/pull/22552>`__: MAINT: Do not use temporary struct construct
+* `#22554 <https://github.com/numpy/numpy/pull/22554>`__: MAINT: Match arguments of constant in ``isless()``
+* `#22557 <https://github.com/numpy/numpy/pull/22557>`__: BUG: Decrement ref count in gentype_reduce if allocated memory...
+* `#22561 <https://github.com/numpy/numpy/pull/22561>`__: BUG: Histogramdd breaks on big arrays in Windows
+* `#22566 <https://github.com/numpy/numpy/pull/22566>`__: BUG: Fix use and errorchecking of ObjectType use
+* `#22567 <https://github.com/numpy/numpy/pull/22567>`__: CI: Add PR write permissions to artifact redirector.
+* `#22571 <https://github.com/numpy/numpy/pull/22571>`__: DOC: Mention numpy types in ``isnat`` error message
+* `#22576 <https://github.com/numpy/numpy/pull/22576>`__: BUG: fix issue with broken assert statement in ``templ_common.h.src``
+* `#22578 <https://github.com/numpy/numpy/pull/22578>`__: BLD: fix issue with header includes in dlpack.c
+* `#22579 <https://github.com/numpy/numpy/pull/22579>`__: MAINT: remove ``NPY_RESTRICT`` in favor of C99 ``restrict``
+* `#22580 <https://github.com/numpy/numpy/pull/22580>`__: BLD: remove unused ``ncola`` variables from lapack-lite
+* `#22583 <https://github.com/numpy/numpy/pull/22583>`__: MAINT: Patch to remove ncola variable from f2c_blas.c
+* `#22586 <https://github.com/numpy/numpy/pull/22586>`__: MAINT: Rm meaningless checks in determining typeless data
+* `#22587 <https://github.com/numpy/numpy/pull/22587>`__: TYP: Update type annotations for new 1.24 features
+* `#22588 <https://github.com/numpy/numpy/pull/22588>`__: DOC: Clarify relationship between row_stack and vstack.
+* `#22589 <https://github.com/numpy/numpy/pull/22589>`__: DOC: expand docs on debugging with gdb
+* `#22598 <https://github.com/numpy/numpy/pull/22598>`__: MAINT, CI: Update Ubuntu 18.04 to Ubuntu 20.04
+* `#22601 <https://github.com/numpy/numpy/pull/22601>`__: MAINT: Use C99 flexible struct construct for ``NpyIter_InternalOnly``
+* `#22605 <https://github.com/numpy/numpy/pull/22605>`__: MAINT: (array-coercion) Silence invalid read warning in some...
+* `#22607 <https://github.com/numpy/numpy/pull/22607>`__: DEP: Next step in scalar type alias deprecations/futurewarnings
+* `#22608 <https://github.com/numpy/numpy/pull/22608>`__: MAINT, CI: Enable coverage checking.
+* `#22612 <https://github.com/numpy/numpy/pull/22612>`__: BLD: update wheel builds on macos to macos-12 image
+* `#22614 <https://github.com/numpy/numpy/pull/22614>`__: MAINT: remove macOS specific long double handling in numpyconfig.h
+* `#22615 <https://github.com/numpy/numpy/pull/22615>`__: DOC: Rm ``round_`` from the autosummary for rounding
+* `#22616 <https://github.com/numpy/numpy/pull/22616>`__: TST: Rename setup to setup_method in _locales
+* `#22620 <https://github.com/numpy/numpy/pull/22620>`__: DOC: testing: Fix typo: nulps -> nulp
+* `#22628 <https://github.com/numpy/numpy/pull/22628>`__: DOC: Add example for np.ma.power
+* `#22629 <https://github.com/numpy/numpy/pull/22629>`__: MAINT: Update main after 1.23.5 release.
+* `#22630 <https://github.com/numpy/numpy/pull/22630>`__: BLD: Use cibuildwheel 2.9.0 for Python 3.8 aarch64 builds
+* `#22634 <https://github.com/numpy/numpy/pull/22634>`__: CI: Increase travis timeout to 20 minutes.
+* `#22636 <https://github.com/numpy/numpy/pull/22636>`__: CI: Increase travis timeout to 30 minutes (2).
+* `#22639 <https://github.com/numpy/numpy/pull/22639>`__: DOC: Remove traces of interrupt handling utilities
+* `#22662 <https://github.com/numpy/numpy/pull/22662>`__: rel: prepare for the numpy 1.24.0rc1 release.
+* `#22665 <https://github.com/numpy/numpy/pull/22665>`__: MAINT: Fix 1.24.0 release note markup.
+* `#22697 <https://github.com/numpy/numpy/pull/22697>`__: MAINT: Rename symbols in textreading/ that may clash when statically...
+* `#22698 <https://github.com/numpy/numpy/pull/22698>`__: MAINT: check user-defined dtype has an ensure_canonical implementation
+* `#22699 <https://github.com/numpy/numpy/pull/22699>`__: BUG: Ensure string aliases "int0", etc. remain valid for now
+* `#22700 <https://github.com/numpy/numpy/pull/22700>`__: BLD: revert function() -> #define for 3 npymath functions
+* `#22702 <https://github.com/numpy/numpy/pull/22702>`__: MAINT: pin ubuntu and python for emscripten
+* `#22716 <https://github.com/numpy/numpy/pull/22716>`__: BLD, CI: Use cirrus for building aarch64 wheels
+* `#22717 <https://github.com/numpy/numpy/pull/22717>`__: TST: Skip when numba/numpy compat issues cause SystemError
+* `#22719 <https://github.com/numpy/numpy/pull/22719>`__: CI: Make benchmark asv run quick to only check that benchmarks...
+* `#22729 <https://github.com/numpy/numpy/pull/22729>`__: REL: Prepare for the NumPy 1.24.0rc2 release.
+* `#22743 <https://github.com/numpy/numpy/pull/22743>`__: DOC: Fix release note link to out-of-bounds integer deprecation
+* `#22746 <https://github.com/numpy/numpy/pull/22746>`__: BUG: Fix some valgrind errors (and probably harmless warnings)
+* `#22748 <https://github.com/numpy/numpy/pull/22748>`__: BUG: ``keepdims=True`` is ignored if ``out`` is not ``None``...
+* `#22749 <https://github.com/numpy/numpy/pull/22749>`__: MAINT: check if PyArrayDTypeMeta_Spec->casts is set
+* `#22757 <https://github.com/numpy/numpy/pull/22757>`__: CI: fix CIRRUS_TAG check when tagging. Closes #22730.
+* `#22758 <https://github.com/numpy/numpy/pull/22758>`__: DOC: Some updates to the array_api compat document (#22747)
+* `#22759 <https://github.com/numpy/numpy/pull/22759>`__: BUG, SIMD: Fix rounding large numbers >= 2^52 on SSE2
+* `#22760 <https://github.com/numpy/numpy/pull/22760>`__: CI, SIMD: Add workflow to test default baseline features only
+* `#22761 <https://github.com/numpy/numpy/pull/22761>`__: BUG: Fix deepcopy cleanup on error
+* `#22793 <https://github.com/numpy/numpy/pull/22793>`__: BUG: Fix infinite recursion in longdouble/large integer scalar...
+* `#22795 <https://github.com/numpy/numpy/pull/22795>`__: BUG: Ensure arguments to ``npy_floatstatus_..._barrier()`` can...
+* `#22805 <https://github.com/numpy/numpy/pull/22805>`__: REV: revert change to ``numpyconfig.h`` for sizeof(type) hardcoding...
+* `#22815 <https://github.com/numpy/numpy/pull/22815>`__: BLD: use newer version of delocate
diff --git a/doc/changelog/1.24.1-changelog.rst b/doc/changelog/1.24.1-changelog.rst
new file mode 100644
index 000000000..5650438b6
--- /dev/null
+++ b/doc/changelog/1.24.1-changelog.rst
@@ -0,0 +1,43 @@
+
+Contributors
+============
+
+A total of 12 people contributed to this release.  People with a "+" by their
+names contributed a patch for the first time.
+
+* Andrew Nelson
+* Ben Greiner +
+* Charles Harris
+* Clément Robert
+* Matteo Raso
+* Matti Picus
+* Melissa Weber Mendonça
+* Miles Cranmer
+* Ralf Gommers
+* Rohit Goswami
+* Sayed Adel
+* Sebastian Berg
+
+Pull requests merged
+====================
+
+A total of 18 pull requests were merged for this release.
+
+* `#22820 <https://github.com/numpy/numpy/pull/22820>`__: BLD: add workaround in setup.py for newer setuptools
+* `#22830 <https://github.com/numpy/numpy/pull/22830>`__: BLD: CIRRUS_TAG redux
+* `#22831 <https://github.com/numpy/numpy/pull/22831>`__: DOC: fix a couple typos in 1.23 notes
+* `#22832 <https://github.com/numpy/numpy/pull/22832>`__: BUG: Fix refcounting errors found using pytest-leaks
+* `#22834 <https://github.com/numpy/numpy/pull/22834>`__: BUG, SIMD: Fix invalid value encountered in several ufuncs
+* `#22837 <https://github.com/numpy/numpy/pull/22837>`__: TST: ignore more np.distutils.log imports
+* `#22839 <https://github.com/numpy/numpy/pull/22839>`__: BUG: Do not use getdata() in np.ma.masked_invalid
+* `#22847 <https://github.com/numpy/numpy/pull/22847>`__: BUG: Ensure correct behavior for rows ending in delimiter in...
+* `#22848 <https://github.com/numpy/numpy/pull/22848>`__: BUG, SIMD: Fix the bitmask of the boolean comparison
+* `#22857 <https://github.com/numpy/numpy/pull/22857>`__: BLD: Help raspian arm + clang 13 about __builtin_mul_overflow
+* `#22858 <https://github.com/numpy/numpy/pull/22858>`__: API: Ensure a full mask is returned for masked_invalid
+* `#22866 <https://github.com/numpy/numpy/pull/22866>`__: BUG: Polynomials now copy properly (#22669)
+* `#22867 <https://github.com/numpy/numpy/pull/22867>`__: BUG, SIMD: Fix memory overlap in ufunc comparison loops
+* `#22868 <https://github.com/numpy/numpy/pull/22868>`__: BUG: Fortify string casts against floating point warnings
+* `#22875 <https://github.com/numpy/numpy/pull/22875>`__: TST: Ignore nan-warnings in randomized out tests
+* `#22883 <https://github.com/numpy/numpy/pull/22883>`__: MAINT: restore npymath implementations needed for freebsd
+* `#22884 <https://github.com/numpy/numpy/pull/22884>`__: BUG: Fix integer overflow in in1d for mixed integer dtypes #22877
+* `#22887 <https://github.com/numpy/numpy/pull/22887>`__: BUG: Use whole file for encoding checks with ``charset_normalizer``.
diff --git a/doc/changelog/1.24.2-changelog.rst b/doc/changelog/1.24.2-changelog.rst
new file mode 100644
index 000000000..399092eb0
--- /dev/null
+++ b/doc/changelog/1.24.2-changelog.rst
@@ -0,0 +1,44 @@
+
+Contributors
+============
+
+A total of 14 people contributed to this release.  People with a "+" by their
+names contributed a patch for the first time.
+
+* Bas van Beek
+* Charles Harris
+* Khem Raj +
+* Mark Harfouche
+* Matti Picus
+* Panagiotis Zestanakis +
+* Peter Hawkins
+* Pradipta Ghosh
+* Ross Barnowski
+* Sayed Adel
+* Sebastian Berg
+* Syam Gadde +
+* dmbelov +
+* pkubaj +
+
+Pull requests merged
+====================
+
+A total of 17 pull requests were merged for this release.
+
+* `#22965 <https://github.com/numpy/numpy/pull/22965>`__: MAINT: Update python 3.11-dev to 3.11.
+* `#22966 <https://github.com/numpy/numpy/pull/22966>`__: DOC: Remove dangling deprecation warning
+* `#22967 <https://github.com/numpy/numpy/pull/22967>`__: ENH: Detect CPU features on FreeBSD/powerpc64*
+* `#22968 <https://github.com/numpy/numpy/pull/22968>`__: BUG: np.loadtxt cannot load text file with quoted fields separated...
+* `#22969 <https://github.com/numpy/numpy/pull/22969>`__: TST: Add fixture to avoid issue with randomizing test order.
+* `#22970 <https://github.com/numpy/numpy/pull/22970>`__: BUG: Fix fill violating read-only flag. (#22959)
+* `#22971 <https://github.com/numpy/numpy/pull/22971>`__: MAINT: Add additional information to missing scalar AttributeError
+* `#22972 <https://github.com/numpy/numpy/pull/22972>`__: MAINT: Move export for scipy arm64 helper into main module
+* `#22976 <https://github.com/numpy/numpy/pull/22976>`__: BUG, SIMD: Fix spurious invalid exception for sin/cos on arm64/clang
+* `#22989 <https://github.com/numpy/numpy/pull/22989>`__: BUG: Ensure correct loop order in sin, cos, and arctan2
+* `#23030 <https://github.com/numpy/numpy/pull/23030>`__: DOC: Add version added information for the strict parameter in...
+* `#23031 <https://github.com/numpy/numpy/pull/23031>`__: BUG: use ``_Alignof`` rather than ``offsetof()`` on most compilers
+* `#23147 <https://github.com/numpy/numpy/pull/23147>`__: BUG: Fix for npyv__trunc_s32_f32 (VXE)
+* `#23148 <https://github.com/numpy/numpy/pull/23148>`__: BUG: Fix integer / float scalar promotion
+* `#23149 <https://github.com/numpy/numpy/pull/23149>`__: BUG: Add missing <type_traits> header.
+* `#23150 <https://github.com/numpy/numpy/pull/23150>`__: TYP, MAINT: Add a missing explicit ``Any`` parameter to the ``npt.ArrayLike``...
+* `#23161 <https://github.com/numpy/numpy/pull/23161>`__: BLD: remove redundant definition of npy_nextafter [wheel build]
diff --git a/doc/changelog/1.24.3-changelog.rst b/doc/changelog/1.24.3-changelog.rst
new file mode 100644
index 000000000..ccfdfe8f0
--- /dev/null
+++ b/doc/changelog/1.24.3-changelog.rst
@@ -0,0 +1,42 @@
+
+Contributors
+============
+
+A total of 12 people contributed to this release.  People with a "+" by their
+names contributed a patch for the first time.
+
+* Aleksei Nikiforov +
+* Alexander Heger
+* Bas van Beek
+* Bob Eldering
+* Brock Mendel
+* Charles Harris
+* Kyle Sunden
+* Peter Hawkins
+* Rohit Goswami
+* Sebastian Berg
+* Warren Weckesser
+* dependabot[bot]
+
+Pull requests merged
+====================
+
+A total of 17 pull requests were merged for this release.
+
+* `#23206 <https://github.com/numpy/numpy/pull/23206>`__: BUG: fix for f2py string scalars (#23194)
+* `#23207 <https://github.com/numpy/numpy/pull/23207>`__: BUG: datetime64/timedelta64 comparisons return NotImplemented
+* `#23208 <https://github.com/numpy/numpy/pull/23208>`__: MAINT: Pin matplotlib to version 3.6.3 for refguide checks
+* `#23221 <https://github.com/numpy/numpy/pull/23221>`__: DOC: Fix matplotlib error in documentation
+* `#23226 <https://github.com/numpy/numpy/pull/23226>`__: CI: Ensure submodules are initialized in gitpod.
+* `#23341 <https://github.com/numpy/numpy/pull/23341>`__: TYP: Replace duplicate reduce in ufunc type signature with reduceat.
+* `#23342 <https://github.com/numpy/numpy/pull/23342>`__: TYP: Remove duplicate CLIP/WRAP/RAISE in ``__init__.pyi``.
+* `#23343 <https://github.com/numpy/numpy/pull/23343>`__: TYP: Mark ``d`` argument to fftfreq and rfftfreq as optional...
+* `#23344 <https://github.com/numpy/numpy/pull/23344>`__: TYP: Add type annotations for comparison operators to MaskedArray.
+* `#23345 <https://github.com/numpy/numpy/pull/23345>`__: TYP: Remove some stray type-check-only imports of ``msort``
+* `#23370 <https://github.com/numpy/numpy/pull/23370>`__: BUG: Ensure like is only stripped for ``like=`` dispatched functions
+* `#23543 <https://github.com/numpy/numpy/pull/23543>`__: BUG: fix loading and storing big arrays on s390x
+* `#23544 <https://github.com/numpy/numpy/pull/23544>`__: MAINT: Bump larsoner/circleci-artifacts-redirector-action
+* `#23634 <https://github.com/numpy/numpy/pull/23634>`__: BUG: Ignore invalid and overflow warnings in masked setitem
+* `#23635 <https://github.com/numpy/numpy/pull/23635>`__: BUG: Fix masked array raveling when ``order="A"`` or ``order="K"``
+* `#23636 <https://github.com/numpy/numpy/pull/23636>`__: MAINT: Update conftest for newer hypothesis versions
+* `#23637 <https://github.com/numpy/numpy/pull/23637>`__: BUG: Fix bug in parsing F77 style string arrays.
diff --git a/doc/make.bat b/doc/make.bat
new file mode 100644
index 000000000..974d5c3d8
--- /dev/null
+++ b/doc/make.bat
@@ -0,0 +1,74 @@
+@echo off
+
+pushd %~dp0
+
+:: Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=LANG=C sphinx-build
+)
+set SOURCEDIR=source
+set BUILDDIR=build
+if defined SPHINXOPTS goto skipopts
+set SPHINXOPTS=-W --keep-going -d build/doctrees %SPHINXOPTS% source
+set DOXYGEN=doxygen
+set FILES=
+:skipopts
+
+if "%1" == "" goto help
+if "%1" == "clean" goto clean
+if "%1" == "docenv" goto docenv
+if "%1" == "html" goto html
+if "%1" == "linkcheck" goto linkcheck
+if "%1" == "show" goto show
+
+:help
+	echo.
+	echo Please use "make.bat <target>" where ^<target^> is one of
+	echo.
+	echo    clean     to remove generated doc files and start fresh
+	echo    docenv    make a virtual environment in which to build docs
+	echo    html      to make standalone HTML files
+	echo    linkcheck to check all external links for integrity
+	echo    show      to show the html output in a browser
+goto end
+
+:clean
+if exist "%SOURCEDIR%\build\" (
+	rmdir /s /q "%SOURCEDIR%\build"
+	:: TODO
+	:: find . -name generated -type d -prune -exec rm -rf "{}" ";"
+)
+goto end
+
+:docenv
+echo Not implemented
+Rem 	python -mvenv docenv
+Rem 	( \
+Rem             . docenv/bin/activate; \
+Rem             pip install -q --upgrade pip; \
+Rem             pip install -q  -r ../test_requirements.txt; \
+Rem             pip install -q  -r ../doc_requirements.txt; \
+Rem             pip install -q ..; \
+Rem 	)
+goto end
+
+:html
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:linkcheck
+	md build
+	md build\linkcheck
+	md build\doctrees
+	%SPHINXBUILD% -b linkcheck %SOURCEDIR% build\linkcheck
+	echo.
+	echo Link check complete; look for any errors in the above output
+	echo    or in build\linkcheck\output.txt.
+goto end
+
+:show
+python -m webbrowser -t "%~dp0\build\html\index.html"
+
+:end
+popd
diff --git a/doc/neps/nep-0013-ufunc-overrides.rst b/doc/neps/nep-0013-ufunc-overrides.rst
index c132113db..d69af6924 100644
--- a/doc/neps/nep-0013-ufunc-overrides.rst
+++ b/doc/neps/nep-0013-ufunc-overrides.rst
@@ -20,6 +20,8 @@ NEP 13 — A mechanism for overriding Ufuncs
 :Date: 2017-03-31
 
 :Status: Final
+:Updated: 2023-02-19
+:Author: Roy Smart
 
 Executive summary
 =================
@@ -173,12 +175,12 @@ where in all current cases only a single output makes sense).
 
 The function dispatch proceeds as follows:
 
-- If one of the input or output arguments implements
+- If one of the input, output, or ``where`` arguments implements
   ``__array_ufunc__``, it is executed instead of the ufunc.
 
 - If more than one of the arguments implements ``__array_ufunc__``,
   they are tried in the following order: subclasses before superclasses,
-  inputs before outputs, otherwise left to right.
+  inputs before outputs, outputs before ``where``, otherwise left to right.
 
 - The first ``__array_ufunc__`` method returning something else than
   :obj:`NotImplemented` determines the return value of the Ufunc.
@@ -326,7 +328,10 @@ equivalent to::
     def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
         # Cannot handle items that have __array_ufunc__ (other than our own).
         outputs = kwargs.get('out', ())
-        for item in inputs + outputs:
+        objs = inputs + outputs
+        if "where" in kwargs:
+            objs = objs + (kwargs["where"], )
+        for item in objs:
             if (hasattr(item, '__array_ufunc__') and
                     type(item).__array_ufunc__ is not ndarray.__array_ufunc__):
                 return NotImplemented
diff --git a/doc/neps/nep-0019-rng-policy.rst b/doc/neps/nep-0019-rng-policy.rst
index c5c46603b..44033357a 100644
--- a/doc/neps/nep-0019-rng-policy.rst
+++ b/doc/neps/nep-0019-rng-policy.rst
@@ -17,7 +17,7 @@ Abstract
 For the past decade, NumPy has had a strict backwards compatibility policy for
 the number stream of all of its random number distributions.  Unlike other
 numerical components in ``numpy``, which are usually allowed to return
-different when results when they are modified if they remain correct, we have
+different results when modified if the results remain correct, we have
 obligated the random number distributions to always produce the exact same
 numbers in every version.  The objective of our stream-compatibility guarantee
 was to provide exact reproducibility for simulations across numpy versions in
diff --git a/doc/neps/nep-0021-advanced-indexing.rst b/doc/neps/nep-0021-advanced-indexing.rst
index 7751d309b..4c7a16375 100644
--- a/doc/neps/nep-0021-advanced-indexing.rst
+++ b/doc/neps/nep-0021-advanced-indexing.rst
@@ -143,7 +143,7 @@ exactly.
 
 Vectorized indexing in particular can be challenging to implement with array
 storage backends not based on NumPy. In contrast, indexing by 1D arrays along
-at least one dimension in the style of outer indexing is much more acheivable.
+at least one dimension in the style of outer indexing is much more achievable.
 This has led many libraries (including dask and h5py) to attempt to define a
 safe subset of NumPy-style indexing that is equivalent to outer indexing, e.g.,
 by only allowing indexing with an array along at most one dimension. However,
diff --git a/doc/neps/nep-0029-deprecation_policy.rst b/doc/neps/nep-0029-deprecation_policy.rst
index 4eb2aa50f..ccb566227 100644
--- a/doc/neps/nep-0029-deprecation_policy.rst
+++ b/doc/neps/nep-0029-deprecation_policy.rst
@@ -127,7 +127,10 @@ Apr 14, 2023 3.9+   1.21+
 Jun 23, 2023 3.9+   1.22+
 Jan 01, 2024 3.9+   1.23+
 Apr 05, 2024 3.10+  1.23+
-Apr 04, 2025 3.11+  1.23+
+Jun 22, 2024 3.10+  1.24+
+Dec 18, 2024 3.10+  1.25+
+Apr 04, 2025 3.11+  1.25+
+Apr 24, 2026 3.12+  1.25+
 ============ ====== =====
 
 
@@ -150,7 +153,10 @@ Drop Schedule
   On Jun 23, 2023 drop support for NumPy 1.21 (initially released on Jun 22, 2021)
   On Jan 01, 2024 drop support for NumPy 1.22 (initially released on Dec 31, 2021)
   On Apr 05, 2024 drop support for Python 3.9 (initially released on Oct 05, 2020)
+  On Jun 22, 2024 drop support for NumPy 1.23 (initially released on Jun 22, 2022)
+  On Dec 18, 2024 drop support for NumPy 1.24 (initially released on Dec 18, 2022)
   On Apr 04, 2025 drop support for Python 3.10 (initially released on Oct 04, 2021)
+  On Apr 24, 2026 drop support for Python 3.11 (initially released on Oct 24, 2022)
 
 
 Implementation
@@ -284,6 +290,9 @@ Code to generate support and drop schedule tables ::
   Jun 22, 2021: NumPy 1.21
   Oct 04, 2021: Python 3.10
   Dec 31, 2021: NumPy 1.22
+  Jun 22, 2022: NumPy 1.23
+  Oct 24, 2022: Python 3.11
+  Dec 18, 2022: NumPy 1.24
   """
 
   releases = []
diff --git a/doc/neps/nep-0046-sponsorship-guidelines.rst b/doc/neps/nep-0046-sponsorship-guidelines.rst
index ed54c21d5..746a73f5d 100644
--- a/doc/neps/nep-0046-sponsorship-guidelines.rst
+++ b/doc/neps/nep-0046-sponsorship-guidelines.rst
@@ -108,11 +108,12 @@ About page) will be added to acknowledge all current and previous sponsors,
 partners, and any other entities and individuals who provided $5,000 or more of
 financial or in-kind support. This page will include relevant details of
 support (dates, amounts, names, and purpose); no logos will be used on this
-page. The rationale for the $5,000 minimum level is to keep the amount of work
-maintaining the page reasonable; the level is the equivalent of, e.g., one GSoC
-or a person-week's worth of engineering time in a Western country, which seems
-like a reasonable lower limit.
-
+page. Such support, if provided for a specific enhancements or fix, may be
+acknowledged in the appropriate release note snippet. The rationale for the
+$5,000 minimum level is to keep the amount of work maintaining the page
+reasonable; the level is the equivalent of, e.g., one GSoC or a person-week's
+worth of engineering time in a Western country, which seems like a reasonable
+lower limit.
 
 Implementation
 --------------
@@ -128,7 +129,6 @@ The following content changes need to be made:
 - Update https://numpy.org/about with details on how to get in touch with the
   NumPy project about sponsorship related matters (see next section).
 
-
 NumPy Funding Team
 ~~~~~~~~~~~~~~~~~~
 
diff --git a/doc/neps/nep-0051-scalar-representation.rst b/doc/neps/nep-0051-scalar-representation.rst
index 851f173de..1e6f27a5f 100644
--- a/doc/neps/nep-0051-scalar-representation.rst
+++ b/doc/neps/nep-0051-scalar-representation.rst
@@ -70,7 +70,7 @@ Even ``np.float64``, which is very similar to Python's ``float`` and inherits
 from it, does behave differently for example when dividing by zero.
 
 Another common source of confusion are the NumPy booleans.  Python programmers
-somtimes write ``obj is True`` and will surprised when an object that shows
+sometimes write ``obj is True`` and will surprised when an object that shows
 as ``True`` fails to pass the test.
 It is much easier to understand this behavior when the value is
 shown as ``np.True_``.
@@ -96,7 +96,7 @@ Jupyter notebook cells will often show the type information intact.
 
     np.longdouble('3.0')
 
-to allow round-tripping.  Addtionally to this change, ``float128`` will
+to allow round-tripping.  Additionally to this change, ``float128`` will
 now always be printed as ``longdouble`` since the old name gives a false
 impression of precision.
 
@@ -137,7 +137,7 @@ The proposal is to change the default (back) to use ``str`` rather than
 Detailed description
 ====================
 
-This NEP proposes to change the represenatation for NumPy scalars to:
+This NEP proposes to change the representation for NumPy scalars to:
 
 * ``np.True_`` and ``np.False_`` for booleans (their singleton instances)
 * ``np.scalar(<value>)``, i.e. ``np.float64(3.0)`` for all numerical dtypes.
@@ -202,8 +202,8 @@ to always print as ``longdouble`` and never ``float128`` or ``float96``.
 It does not include deprecating the ``np.float128`` alias.
 However, such a deprecation may occur independently of the NEP.
 
-Integer scalar type name and instance represenatation
------------------------------------------------------
+Integer scalar type name and instance representation
+----------------------------------------------------
 
 One detail is that due to NumPy scalar types being based on the C types,
 NumPy sometimes distinguishes them, for example on most 64 bit systems
@@ -263,7 +263,7 @@ allowing customized formatting for all DTypes.
 
 Making ``get_formatter`` public allows it to be used for ``np.record`` and
 masked arrays.
-Currenlty, the formatters themselves seem semi-public; using a single
+Currently, the formatters themselves seem semi-public; using a single
 entry-point will hopefully provide a clear API for formatting NumPy values.
 
 The large part for the scalar representation changes had previously been done
diff --git a/doc/postprocess.py b/doc/postprocess.py
index 3e066d22e..4b48fa443 100755
--- a/doc/postprocess.py
+++ b/doc/postprocess.py
@@ -2,7 +2,7 @@
 """
 Post-processes HTML and Latex files output by Sphinx.
 """
-import io
+
 
 def main():
     import argparse
@@ -15,13 +15,13 @@ def main():
     mode = args.mode
 
     for fn in args.file:
-        with io.open(fn, 'r', encoding="utf-8") as f:
+        with open(fn, encoding="utf-8") as f:
             if mode == 'html':
                 lines = process_html(fn, f.readlines())
             elif mode == 'tex':
                 lines = process_tex(f.readlines())
 
-        with io.open(fn, 'w', encoding="utf-8") as f:
+        with open(fn, 'w', encoding="utf-8") as f:
             f.write("".join(lines))
 
 def process_html(fn, lines):
diff --git a/doc/preprocess.py b/doc/preprocess.py
index 870d3e123..83980bb2f 100755
--- a/doc/preprocess.py
+++ b/doc/preprocess.py
@@ -32,7 +32,7 @@ def doxy_config(root_path):
     confs = []
     dsrc_path = os.path.join(root_path, "doc", "source")
     sub = dict(ROOT_DIR=root_path)
-    with open(os.path.join(dsrc_path, "doxyfile"), "r") as fd:
+    with open(os.path.join(dsrc_path, "doxyfile")) as fd:
         conf = DoxyTpl(fd.read())
         confs.append(conf.substitute(CUR_DIR=dsrc_path, **sub))
 
@@ -40,7 +40,7 @@ def doxy_config(root_path):
         if ".doxyfile" not in files:
             continue
         conf_path = os.path.join(dpath, ".doxyfile")
-        with open(conf_path, "r") as fd:
+        with open(conf_path) as fd:
             conf = DoxyTpl(fd.read())
             confs.append(conf.substitute(CUR_DIR=dpath, **sub))
     return confs
diff --git a/doc/records.rst b/doc/records.rst
index 3c0d55216..2e9b1251a 100644
--- a/doc/records.rst
+++ b/doc/records.rst
@@ -74,7 +74,7 @@ New possibilities for the "data-type"
 Reference counting for ``PyArray_Descr *`` objects.
 ```````````````````````````````````````````````````
 
-Most functions that take ``PyArary_Descr *`` as arguments and return a
+Most functions that take ``PyArray_Descr *`` as arguments and return a
 ``PyObject *`` steal the reference unless otherwise noted in the code:
 
 Functions that return ``PyArray_Descr *`` objects return a new
diff --git a/doc/release/numpy2_changes/23089.change.rst b/doc/release/numpy2_changes/23089.change.rst
new file mode 100644
index 000000000..fbd87e0aa
--- /dev/null
+++ b/doc/release/numpy2_changes/23089.change.rst
@@ -0,0 +1,2 @@
+* ``np.gradient()`` now returns a tuple rather than list making the
+  return value immutable.
diff --git a/doc/release/numpy2_changes/README.md b/doc/release/numpy2_changes/README.md
new file mode 100644
index 000000000..fb59749a9
--- /dev/null
+++ b/doc/release/numpy2_changes/README.md
@@ -0,0 +1,4 @@
+Same as ``../upcoming_changes`` but for changes that currently
+are opt-in behind ``NPY_NUMPY_2_BEHAVIOR=1``.
+
+This is to get a start on release notes for such changes.
diff --git a/doc/release/upcoming_changes/10615.deprecation.rst b/doc/release/upcoming_changes/10615.deprecation.rst
new file mode 100644
index 000000000..7fa948ea8
--- /dev/null
+++ b/doc/release/upcoming_changes/10615.deprecation.rst
@@ -0,0 +1,14 @@
+Only ndim-0 arrays are treated as scalars
+-----------------------------------------
+NumPy used to treat all arrays of size 1 (e.g., ``np.array([3.14])``) as scalars.
+In the future, this will be limited to arrays of ndim 0 (e.g., ``np.array(3.14)``).
+The following expressions will report a deprecation warning:
+
+.. code-block:: python
+
+    a = np.array([3.14])
+    float(a)  # better: a[0] to get the numpy.float or a.item()
+
+    b = np.array([[3.14]])
+    c = numpy.random.rand(10)
+    c[0] = b  # better: c[0] = b[0, 0]
diff --git a/doc/release/upcoming_changes/12065.performance.rst b/doc/release/upcoming_changes/12065.performance.rst
deleted file mode 100644
index 08d1263b9..000000000
--- a/doc/release/upcoming_changes/12065.performance.rst
+++ /dev/null
@@ -1,6 +0,0 @@
-Faster version of ``np.isin`` and ``np.in1d`` for integer arrays
-----------------------------------------------------------------
-``np.in1d`` (used by ``np.isin``) can now switch to a faster algorithm
-(up to >10x faster) when it is passed two integer arrays.
-This is often automatically used, but you can use ``kind="sort"`` or 
-``kind="table"`` to force the old or new method, respectively.
-\ No newline at end of file
diff --git a/doc/release/upcoming_changes/16154.new_feature.rst b/doc/release/upcoming_changes/16154.new_feature.rst
deleted file mode 100644
index 99d4b1b04..000000000
--- a/doc/release/upcoming_changes/16154.new_feature.rst
+++ /dev/null
@@ -1,25 +0,0 @@
-New attribute ``symbol`` added to polynomial classes
-----------------------------------------------------
-
-The polynomial classes in the ``numpy.polynomial`` package have a new
-``symbol`` attribute which is used to represent the indeterminate
-of the polynomial.
-This can be used to change the value of the variable when printing::
-
-    >>> P_y = np.polynomial.Polynomial([1, 0, -1], symbol="y")
-    >>> print(P_y)
-    1.0 + 0.0·y¹ - 1.0·y²
-
-Note that the polynomial classes only support 1D polynomials, so operations
-that involve polynomials with different symbols are disallowed when the
-result would be multivariate::
-
-    >>> P = np.polynomial.Polynomial([1, -1])  # default symbol is "x"
-    >>> P_z = np.polynomial.Polynomial([1, 1], symbol="z")
-    >>> P * P_z
-    Traceback (most recent call last)
-       ...
-    ValueError: Polynomial symbols differ
-
-The symbol can be any valid Python identifier. The default is ``symbol=x``,
-consistent with existing behavior.
diff --git a/doc/release/upcoming_changes/18053.new_feature.rst b/doc/release/upcoming_changes/18053.new_feature.rst
new file mode 100644
index 000000000..fea04f79a
--- /dev/null
+++ b/doc/release/upcoming_changes/18053.new_feature.rst
@@ -0,0 +1,4 @@
+``np.einsum`` now accepts arrays with ``object`` dtype
+------------------------------------------------------
+The code path will call python operators on object dtype arrays, much
+like ``np.dot`` and ``np.matmul``.
diff --git a/doc/release/upcoming_changes/18535.improvement.rst b/doc/release/upcoming_changes/18535.improvement.rst
new file mode 100644
index 000000000..a0386b3bf
--- /dev/null
+++ b/doc/release/upcoming_changes/18535.improvement.rst
@@ -0,0 +1,7 @@
+Fix power of complex zero
+-------------------------
+``np.power`` now returns a different result for ``0^{non-zero}``
+for complex numbers.  Note that the value is only defined when
+the real part of the exponent is larger than zero.
+Previously, NaN was returned unless the imaginary part was strictly
+zero.  The return value is either ``0+0j`` or ``0-0j``.
diff --git a/doc/release/upcoming_changes/19388.improvement.rst b/doc/release/upcoming_changes/19388.improvement.rst
deleted file mode 100644
index c899f9019..000000000
--- a/doc/release/upcoming_changes/19388.improvement.rst
+++ /dev/null
@@ -1,10 +0,0 @@
-F2PY Improvements
------------------
-
-* The generated extension modules don't use the deprecated NumPy-C API anymore
-* Improved ``f2py`` generated exception messages
-* Numerous bug and ``flake8`` warning fixes
-* various CPP macros that one can use within C-expressions of signature files are prefixed with ``f2py_``. For example, one should use ``f2py_len(x)`` instead of ``len(x)``
-* A new construct ``character(f2py_len=...)`` is introduced to support returning assumed length character strings (e.g. ``character(len=*)``) from wrapper functions
-
-A hook to support rewriting ``f2py`` internal data structures after reading all its input files is introduced. This is required, for instance, for BC of SciPy support where character arguments are treated as character strings arguments in ``C`` expressions.
diff --git a/doc/release/upcoming_changes/19388.new_feature.rst b/doc/release/upcoming_changes/19388.new_feature.rst
deleted file mode 100644
index bffe5753b..000000000
--- a/doc/release/upcoming_changes/19388.new_feature.rst
+++ /dev/null
@@ -1,10 +0,0 @@
-F2PY support for Fortran ``character`` strings
-----------------------------------------------
-F2PY now supports wrapping Fortran functions with:
-
-* character (e.g. ``character x``)
-* character array (e.g. ``character, dimension(n) :: x``)
-* character string (e.g. ``character(len=10) x``)
-* and character string array (e.g. ``character(len=10), dimension(n, m) :: x``)
-
-arguments, including passing Python unicode strings as Fortran character string arguments.
diff --git a/doc/release/upcoming_changes/20913.improvement.rst b/doc/release/upcoming_changes/20913.improvement.rst
deleted file mode 100755
index 335f44831..000000000
--- a/doc/release/upcoming_changes/20913.improvement.rst
+++ /dev/null
@@ -1,12 +0,0 @@
-IBM zSystems Vector Extension Facility (SIMD)
----------------------------------------------
-
-Added support for SIMD extensions of zSystem (z13, z14, z15),
-through the universal intrinsics interface. This support leads
-to performance improvements for all SIMD kernels implemented
-using the universal intrinsics, including the following operations:
-
-rint, floor, trunc, ceil, sqrt, absolute, square, reciprocal, tanh, sin, cos,
-equal, not_equal, greater, greater_equal, less, less_equal,
-maximum, minimum, fmax, fmin, argmax, argmin,
-add, subtract, multiply, divide.
diff --git a/doc/release/upcoming_changes/20924.compatibility.rst b/doc/release/upcoming_changes/20924.compatibility.rst
deleted file mode 100644
index 02f3e9fdb..000000000
--- a/doc/release/upcoming_changes/20924.compatibility.rst
+++ /dev/null
@@ -1,13 +0,0 @@
-``array.fill(scalar)`` may behave slightly different
-----------------------------------------------------
-`~numpy.ndarray.fill` may in some cases behave slightly different
-now due to the fact that the logic is aligned with item assignment::
-
-    arr = np.array([1])  # with any dtype/value
-    arr.fill(scalar)
-    # is now identical to:
-    arr[0] = scalar
-
-Previously casting may have produced slightly different answers when using
-values that could not be represented in the target ``dtype`` or when the
-target had ``object`` dtype.
diff --git a/doc/release/upcoming_changes/21120.new_feature.rst b/doc/release/upcoming_changes/21120.new_feature.rst
new file mode 100644
index 000000000..7d4dbf743
--- /dev/null
+++ b/doc/release/upcoming_changes/21120.new_feature.rst
@@ -0,0 +1,21 @@
+Add support for inplace matrix multiplication
+----------------------------------------------
+It is now possible to perform inplace matrix multiplication
+via the ``@=`` operator.
+
+.. code-block:: python
+
+    >>> import numpy as np
+
+    >>> a = np.arange(6).reshape(3, 2)
+    >>> print(a)
+    [[0 1]
+     [2 3]
+     [4 5]]
+
+    >>> b = np.ones((2, 2), dtype=int)
+    >>> a @= b
+    >>> print(a)
+    [[1 1]
+     [5 5]
+     [9 9]]
diff --git a/doc/release/upcoming_changes/21437.improvement.rst b/doc/release/upcoming_changes/21437.improvement.rst
deleted file mode 100644
index 291483b5f..000000000
--- a/doc/release/upcoming_changes/21437.improvement.rst
+++ /dev/null
@@ -1,33 +0,0 @@
-NumPy now gives floating point errors in casts
-----------------------------------------------
-
-In most cases, NumPy previously did not give floating point
-warnings or errors when these happened during casts.
-For examples, casts like::
-
-    np.array([2e300]).astype(np.float32)  # overflow for float32
-    np.array([np.inf]).astype(np.int64)
-
-Should now generally give floating point warnings.  These warnings
-should warn that floating point overflow occurred.
-For errors when converting floating point values to integers users
-should expect invalid value warnings.
-
-Users can modify the behavior of these warnings using `np.errstate`.
-
-Note that for float to int casts, the exact warnings that are given may
-be platform dependent.  For example::
-
-    arr = np.full(100, value=1000, dtype=np.float64)
-    arr.astype(np.int8)
-
-May give a result equivalent to (the intermediate cast means no warning is given)::
-
-    arr.astype(np.int64).astype(np.int8)
-
-May return an undefined result, with a warning set::
-
-    RuntimeWarning: invalid value encountered in cast
-
-The precise behavior is subject to the C99 standard and its implementation
-in both software and hardware.
diff --git a/doc/release/upcoming_changes/21468.new_feature.rst b/doc/release/upcoming_changes/21468.new_feature.rst
deleted file mode 100644
index c37f05735..000000000
--- a/doc/release/upcoming_changes/21468.new_feature.rst
+++ /dev/null
@@ -1,6 +0,0 @@
-New function ``np.show_runtime``
---------------------------------
-
-A new function `numpy.show_runtime` has been added to display the runtime
-information of the machine in addition to `numpy.show_config` which displays
-the build-related information.
diff --git a/doc/release/upcoming_changes/21483.performance.rst b/doc/release/upcoming_changes/21483.performance.rst
deleted file mode 100644
index f9456d69f..000000000
--- a/doc/release/upcoming_changes/21483.performance.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Faster comparison operators
-----------------------------
-The comparison functions (``numpy.equal``, ``numpy.not_equal``, ``numpy.less``,
-``numpy.less_equal``, ``numpy.greater`` and ``numpy.greater_equal``) are now
-much faster as they are now vectorized with universal intrinsics. For a CPU
-with SIMD extension AVX512BW, the performance gain is up to 2.57x, 1.65x and
-19.15x for integer, float and boolean data types, respectively (with N=50000).
diff --git a/doc/release/upcoming_changes/21506.change.rst b/doc/release/upcoming_changes/21506.change.rst
deleted file mode 100644
index 18bc6cbc2..000000000
--- a/doc/release/upcoming_changes/21506.change.rst
+++ /dev/null
@@ -1,18 +0,0 @@
-Better reporting of integer division overflow
----------------------------------------------
-
-Integer division overflow of scalars and arrays used to provide a ``RuntimeWarning``
-and the return value was undefined leading to crashes at rare occasions::
-
-    >>> np.array([np.iinfo(np.int32).min]*10, dtype=np.int32) // np.int32(-1)
-    <stdin>:1: RuntimeWarning: divide by zero encountered in floor_divide
-    array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)
-
-Integer division overflow now returns the input dtype's minimum value and raise the
-following ``RuntimeWarning``::
-
-    >>> np.array([np.iinfo(np.int32).min]*10, dtype=np.int32) // np.int32(-1)
-    <stdin>:1: RuntimeWarning: overflow encountered in floor_divide
-    array([-2147483648, -2147483648, -2147483648, -2147483648, -2147483648,
-           -2147483648, -2147483648, -2147483648, -2147483648, -2147483648],
-          dtype=int32)
diff --git a/doc/release/upcoming_changes/21595.new_feature.rst b/doc/release/upcoming_changes/21595.new_feature.rst
deleted file mode 100644
index 21b2a746f..000000000
--- a/doc/release/upcoming_changes/21595.new_feature.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-``strict`` option for `testing.assert_array_equal`
---------------------------------------------------
-The ``strict`` option is now available for `testing.assert_array_equal`.
-Setting ``strict=True`` will disable the broadcasting behaviour for scalars and
-ensure that input arrays have the same data type.
diff --git a/doc/release/upcoming_changes/21623.new_feature.rst b/doc/release/upcoming_changes/21623.new_feature.rst
deleted file mode 100644
index d783ef5a9..000000000
--- a/doc/release/upcoming_changes/21623.new_feature.rst
+++ /dev/null
@@ -1,6 +0,0 @@
-New parameter ``equal_nan`` added to `np.unique`
-------------------------------------------------
-
-`np.unique` was changed in 1.21 to treat all ``NaN`` values as equal and return
-a single ``NaN``. Setting ``equal_nan=False`` will restore pre-1.21 behavior
-to treat ``NaNs`` as unique. Defaults to ``True``.
diff --git a/doc/release/upcoming_changes/21627.new_feature.rst b/doc/release/upcoming_changes/21627.new_feature.rst
deleted file mode 100644
index f516ac96d..000000000
--- a/doc/release/upcoming_changes/21627.new_feature.rst
+++ /dev/null
@@ -1,16 +0,0 @@
-``casting`` and ``dtype`` keyword arguments for `numpy.stack`
--------------------------------------------------------------
-The ``casting`` and ``dtype`` keyword arguments are now available for `numpy.stack`. 
-To use them, write ``np.stack(..., dtype=None, casting='same_kind')``.
-
-
-``casting`` and ``dtype`` keyword arguments for `numpy.vstack`
---------------------------------------------------------------
-The ``casting`` and ``dtype`` keyword arguments are now available for `numpy.vstack`. 
-To use them, write ``np.vstack(..., dtype=None, casting='same_kind')``.
-
-
-``casting`` and ``dtype`` keyword arguments for `numpy.hstack`
---------------------------------------------------------------
-The ``casting`` and ``dtype`` keyword arguments are now available for `numpy.hstack`. 
-To use them, write ``np.hstack(..., dtype=None, casting='same_kind')``.
-\ No newline at end of file
diff --git a/doc/release/upcoming_changes/21645.expired.rst b/doc/release/upcoming_changes/21645.expired.rst
deleted file mode 100644
index 4c9933244..000000000
--- a/doc/release/upcoming_changes/21645.expired.rst
+++ /dev/null
@@ -1,4 +0,0 @@
-* The ``normed`` keyword argument has been removed from
-  `np.histogram`, `np.histogram2d`, and `np.histogramdd`.
-  Use ``density`` instead.  If ``normed`` was passed by
-  position, ``density`` is now used.
diff --git a/doc/release/upcoming_changes/21807.improvement.rst b/doc/release/upcoming_changes/21807.improvement.rst
deleted file mode 100644
index bfad55fb7..000000000
--- a/doc/release/upcoming_changes/21807.improvement.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-F2PY supports the value attribute
----------------------------------
-
-The Fortran standard requires that variables declared with the ``value``
-attribute must be passed by value instead of reference. F2PY now supports this
-use pattern correctly. So ``integer, intent(in), value :: x`` in Fortran codes
-will have correct wrappers generated.
diff --git a/doc/release/upcoming_changes/21925.compatibility.rst b/doc/release/upcoming_changes/21925.compatibility.rst
deleted file mode 100644
index af9c47127..000000000
--- a/doc/release/upcoming_changes/21925.compatibility.rst
+++ /dev/null
@@ -1,12 +0,0 @@
-Subarray to object cast now copies
-----------------------------------
-Casting a dtype that includes a subarray to an object will now ensure
-a copy of the subarray.  Previously an unsafe view was returned::
-
-    arr = np.ones(3, dtype=[("f", "i", 3)])
-    subarray_fields = arr.astype(object)[0]
-    subarray = subarray_fields[0]  # "f" field
-
-    np.may_share_memory(subarray, arr)
-
-Is now always false.  While previously it was true for the specific cast.
diff --git a/doc/release/upcoming_changes/21976.new_feature.rst b/doc/release/upcoming_changes/21976.new_feature.rst
deleted file mode 100644
index 387f093dd..000000000
--- a/doc/release/upcoming_changes/21976.new_feature.rst
+++ /dev/null
@@ -1,33 +0,0 @@
-The bit generator underlying the singleton RandomState can be changed
----------------------------------------------------------------------
-The singleton ``RandomState`` instance exposed in the ``numpy.random`` module
-is initialized at startup with the ``MT19937` bit generator. The new
-function ``set_bit_generator`` allows the default bit generator to be
-replaced with a user-provided bit generator. This function has been introduced
-to provide a method allowing seamless integration of a high-quality, modern bit
-generator in new code with existing code that makes use of the
-singleton-provided random variate generating functions. The companion function
-``get_bit_generator`` returns the current bit generator being used by the
-singleton ``RandomState``. This is provided to simplify restoring
-the original source of randomness if required.
-
-The preferred method to generate reproducible random numbers is to use a modern
-bit generator in an instance of ``Generator``. The function ``default_rng``
-simplifies instantization.
-
-   >>> rg = np.random.default_rng(3728973198)
-   >>> rg.random()
-
-The same bit generator can then be shared with the singleton instance so that
-calling functions in the ``random`` module will use the same bit
-generator.
-
-   >>> orig_bit_gen = np.random.get_bit_generator()
-   >>> np.random.set_bit_generator(rg.bit_generator)
-   >>> np.random.normal()
-
-The swap is permanent (until reversed) and so any call to functions
-in the ``random`` module will use the new bit generator. The original
-can be restored if required for code to run correctly.
-
-   >>> np.random.set_bit_generator(orig_bit_gen)
diff --git a/doc/release/upcoming_changes/21995.compatibility.rst b/doc/release/upcoming_changes/21995.compatibility.rst
deleted file mode 100644
index 0ae5e3626..000000000
--- a/doc/release/upcoming_changes/21995.compatibility.rst
+++ /dev/null
@@ -1,21 +0,0 @@
-Returned arrays respect uniqueness of dtype kwarg objects
----------------------------------------------------------
-When the ``dtype`` keyword argument is used with :py:func:`np.array()`
-or :py:func:`asarray()`, the dtype of the returned array now
-always exactly matches the dtype provided by the caller.
-
-In some cases this change means that a *view* rather than the
-input array is returned.
-The following is an example for this on 64bit Linux where ``long``
-and ``longlong`` are the same precision but different ``dtypes``::
-
-    >>> arr = np.array([1, 2, 3], dtype="long")
-    >>> new_dtype = np.dtype("longlong")
-    >>> new = np.asarray(arr, dtype=new_dtype)
-    >>> new.dtype is new_dtype
-    True
-    >>> new is arr
-    False
-
-Before the change, the ``dtype`` did not match because ``new is arr``
-was ``True``.
diff --git a/doc/release/upcoming_changes/22004.expired.rst b/doc/release/upcoming_changes/22004.expired.rst
deleted file mode 100644
index e07dffa84..000000000
--- a/doc/release/upcoming_changes/22004.expired.rst
+++ /dev/null
@@ -1,2 +0,0 @@
-* Ragged array creation will now always raise a ``ValueError`` unless
-  ``dtype=object`` is passed.  This includes very deeply nested sequences.
diff --git a/doc/release/upcoming_changes/22014.improvement.rst b/doc/release/upcoming_changes/22014.improvement.rst
deleted file mode 100644
index b32590c85..000000000
--- a/doc/release/upcoming_changes/22014.improvement.rst
+++ /dev/null
@@ -1,10 +0,0 @@
-Added pickle support for third-party BitGenerators
---------------------------------------------------
-
-The pickle format for bit generators was extended to allow each bit generator
-to supply its own constructor when during pickling. Previous  versions of NumPy
-only supported unpickling ``Generator`` instances created with one of the core set
-of bit generators supplied with NumPy. Attempting to unpickle a ``Generator``
-that used a third-party bit generators would fail since the constructor used during
-the unpickling was only aware of the bit generators included in NumPy.
-
diff --git a/doc/release/upcoming_changes/22046.change.rst b/doc/release/upcoming_changes/22046.change.rst
deleted file mode 100644
index 13a15f9e1..000000000
--- a/doc/release/upcoming_changes/22046.change.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-``masked_invalid`` now modifies the mask in-place
------------------------------------------------------------------
-
-When used with ``copy=False``, `numpy.ma.masked_invalid` now modifies the
-input masked array in-place.
-This makes it now behave identical to ``masked_where`` and better matches the
-documentation.
diff --git a/doc/release/upcoming_changes/22139.expired.rst b/doc/release/upcoming_changes/22139.expired.rst
deleted file mode 100644
index baf19a0b2..000000000
--- a/doc/release/upcoming_changes/22139.expired.rst
+++ /dev/null
@@ -1,4 +0,0 @@
-* Support for Visual Studio 2015 and earlier has been removed.
-
-* Support for the Windows Interix POSIX interop layer has been removed.
-
diff --git a/doc/release/upcoming_changes/22159.expired.rst b/doc/release/upcoming_changes/22159.expired.rst
deleted file mode 100644
index bb8405718..000000000
--- a/doc/release/upcoming_changes/22159.expired.rst
+++ /dev/null
@@ -1 +0,0 @@
-* Support for cygwin < 3.3 has been removed.
diff --git a/doc/release/upcoming_changes/22228.expired.rst b/doc/release/upcoming_changes/22228.expired.rst
deleted file mode 100644
index 220ae2a7b..000000000
--- a/doc/release/upcoming_changes/22228.expired.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-* The mini() method of ``np.ma.MaskedArray`` has been removed. Use either
-  ``np.ma.MaskedArray.min()`` or ``np.ma.minimum.reduce()``.
-
-* The single-argument form of ``np.ma.minimum`` and ``np.ma.maximum`` has been
-  removed. Use ``np.ma.minimum.reduce()`` or ``np.ma.maximum.reduce()`` instead.
diff --git a/doc/release/upcoming_changes/22313.deprecation.rst b/doc/release/upcoming_changes/22313.deprecation.rst
deleted file mode 100644
index c4a0a4a9a..000000000
--- a/doc/release/upcoming_changes/22313.deprecation.rst
+++ /dev/null
@@ -1,10 +0,0 @@
-Deprecate fastCopyAndTranspose and PyArray_CopyAndTranspose
------------------------------------------------------------
-
-The ``numpy.fastCopyAndTranspose`` function has been deprecated. Use the
-corresponding copy and transpose methods directly::
-
-    arr.T.copy()
-
-The underlying C function ``PyArray_CopyAndTranspose`` has also been
-deprecated from the NumPy C-API.
diff --git a/doc/release/upcoming_changes/22315.performance.rst b/doc/release/upcoming_changes/22315.performance.rst
new file mode 100644
index 000000000..a2f623e5a
--- /dev/null
+++ b/doc/release/upcoming_changes/22315.performance.rst
@@ -0,0 +1,7 @@
+Faster ``np.sort`` on AVX-512 enabled processors
+------------------------------------------------
+Quicksort for 16-bit and 64-bit dtypes gain up to 15x and 9x speed up on
+processors that support AVX-512 instruction set.
+
+Thanks to `Intel corporation <https://open.intel.com/>`_ for sponsoring this
+work.
diff --git a/doc/release/upcoming_changes/22357.improvement.rst b/doc/release/upcoming_changes/22357.improvement.rst
deleted file mode 100644
index a0332eaa0..000000000
--- a/doc/release/upcoming_changes/22357.improvement.rst
+++ /dev/null
@@ -1,6 +0,0 @@
-``numpy.typing`` protocols are now runtime checkable
-----------------------------------------------------
-
-The protocols used in `~numpy.typing.ArrayLike` and `~numpy.typing.DTypeLike`
-are now properly marked as runtime checkable, making them easier to use for
-runtime type checkers.
diff --git a/doc/release/upcoming_changes/22393.deprecation.rst b/doc/release/upcoming_changes/22393.deprecation.rst
deleted file mode 100644
index 52099506c..000000000
--- a/doc/release/upcoming_changes/22393.deprecation.rst
+++ /dev/null
@@ -1,17 +0,0 @@
-Conversion of out-of-bound Python integers
-------------------------------------------
-Attempting a conversion from a Python integer to a NumPy
-value will now always check whether the result can be
-represented by NumPy.  This means the following examples will
-fail in the future and give a ``DeprecationWarning`` now::
-
-    np.uint8(-1)
-    np.array([3000], dtype=np.int8)
-
-Many of these did succeed before.  Such code was mainly
-useful for unsigned integers with negative values such as
-`np.uint8(-1)` giving `np.iinfo(np.uint8).max`.
-
-Note that conversion between NumPy integers is unaffected,
-so that `np.array(-1).astype(np.uint8)` continues to work
-and use C integer overflow logic.
diff --git a/doc/release/upcoming_changes/22456.deprecation.rst b/doc/release/upcoming_changes/22456.deprecation.rst
deleted file mode 100644
index ab5e7ee31..000000000
--- a/doc/release/upcoming_changes/22456.deprecation.rst
+++ /dev/null
@@ -1,4 +0,0 @@
-Deprecate ``msort``
--------------------
-
-The ``numpy.msort`` function is deprecated. Use ``np.sort(a, axis=0)`` instead.
diff --git a/doc/release/upcoming_changes/22457.change.rst b/doc/release/upcoming_changes/22457.change.rst
deleted file mode 100644
index 7d787441f..000000000
--- a/doc/release/upcoming_changes/22457.change.rst
+++ /dev/null
@@ -1,8 +0,0 @@
-``nditer``/``NpyIter`` allows all allocating all operands
----------------------------------------------------------
-The NumPy iterator available through `np.nditer` in Python
-and as ``NpyIter`` in C now supports allocating all arrays.
-
-The iterator shape defaults to ``()`` in this case.  The operands
-dtype must be provided, since a "common dtype" cannot be inferred
-from the other inputs.
diff --git a/doc/release/upcoming_changes/22575.compatibility.rst b/doc/release/upcoming_changes/22575.compatibility.rst
new file mode 100644
index 000000000..256a8e933
--- /dev/null
+++ b/doc/release/upcoming_changes/22575.compatibility.rst
@@ -0,0 +1,9 @@
+``np.pad`` with ``mode=wrap`` pads with strict multiples of original data
+-------------------------------------------------------------------------
+
+Code based on earlier version of ``pad`` that uses  ``mode="wrap"`` will return
+different results when the padding size is larger than initial array.
+
+``np.pad`` with ``mode=wrap`` now always fills the space with 
+strict multiples of original data even if the padding size is larger than the
+initial array.
diff --git a/doc/release/upcoming_changes/22637.compatibility.rst b/doc/release/upcoming_changes/22637.compatibility.rst
new file mode 100644
index 000000000..a35b70861
--- /dev/null
+++ b/doc/release/upcoming_changes/22637.compatibility.rst
@@ -0,0 +1,15 @@
+Cython ``long_t`` and ``ulong_t`` removed
+-----------------------------------------
+``long_t`` and ``ulong_t`` were aliases for ``longlong_t`` and ``ulonglong_t``
+and confusing (a remainder from of Python 2).  This change may lead to the errors::
+
+     'long_t' is not a type identifier
+     'ulong_t' is not a type identifier
+
+We recommend use of bit-sized types such as ``cnp.int64_t`` or the use of
+``cnp.intp_t`` which is 32 bits on 32 bit systems and 64 bits on 64 bit
+systems (this is most compatible with indexing).
+If C ``long`` is desired, use plain ``long`` or ``npy_long``.
+``cnp.int_t`` is also ``long`` (NumPy's default integer).  However, ``long``
+is 32 bit on 64 bit windows and we may wish to adjust this even in NumPy.
+(Please do not hesitate to contact NumPy developers if you are curious about this.)
diff --git a/doc/release/upcoming_changes/22638.deprecation.rst b/doc/release/upcoming_changes/22638.deprecation.rst
new file mode 100644
index 000000000..2db186106
--- /dev/null
+++ b/doc/release/upcoming_changes/22638.deprecation.rst
@@ -0,0 +1,2 @@
+* ``np.core.MachAr`` is deprecated.  It is private API.  In names
+  defined in ``np.core`` should generally be considered private.
diff --git a/doc/release/upcoming_changes/22638.expired.rst b/doc/release/upcoming_changes/22638.expired.rst
new file mode 100644
index 000000000..01e87fa9f
--- /dev/null
+++ b/doc/release/upcoming_changes/22638.expired.rst
@@ -0,0 +1 @@
+* ``np.core.machar`` and ``np.finfo.machar`` have been removed.
diff --git a/doc/release/upcoming_changes/22644.new_feature.rst b/doc/release/upcoming_changes/22644.new_feature.rst
new file mode 100644
index 000000000..e903fc7a0
--- /dev/null
+++ b/doc/release/upcoming_changes/22644.new_feature.rst
@@ -0,0 +1,7 @@
+NumPy now has an ``np.exceptions`` namespace
+--------------------------------------------
+NumPy now has a dedicated namespace making most exceptions
+and warnings available.  All of these remain available in the
+main namespace, although some may be moved slowly in the future.
+The main reason for this is to increase discoverably and add
+future exceptions.
diff --git a/doc/release/upcoming_changes/22675.compatibility.rst b/doc/release/upcoming_changes/22675.compatibility.rst
new file mode 100644
index 000000000..f3c08246b
--- /dev/null
+++ b/doc/release/upcoming_changes/22675.compatibility.rst
@@ -0,0 +1,6 @@
+Changed error message and type for bad ``axes`` argument to ``ufunc``
+---------------------------------------------------------------------
+The error message and type when a wrong ``axes`` value is passed to
+``ufunc(..., axes=[...])``` has changed. The message is now more indicative of
+the problem, and if the value is mismatched an ``AxisError`` will be raised.
+A ``TypeError`` will still be raised for invalid input types.
diff --git a/doc/release/upcoming_changes/22707.compatibility.rst b/doc/release/upcoming_changes/22707.compatibility.rst
new file mode 100644
index 000000000..8c9805f37
--- /dev/null
+++ b/doc/release/upcoming_changes/22707.compatibility.rst
@@ -0,0 +1,4 @@
+* When comparing datetimes and timedelta using ``np.equal`` or ``np.not_equal``
+  numpy previously allowed the comparison with ``casting="unsafe"``.
+  This operation now fails. Forcing the output dtype using the ``dtype``
+  kwarg can make the operation succeed, but we do not recommend it.
diff --git a/doc/release/upcoming_changes/22707.expired.rst b/doc/release/upcoming_changes/22707.expired.rst
new file mode 100644
index 000000000..496752e8d
--- /dev/null
+++ b/doc/release/upcoming_changes/22707.expired.rst
@@ -0,0 +1,13 @@
+``==`` and ``!=`` warnings finalized
+------------------------------------
+The ``==`` and ``!=`` operators on arrays now always:
+
+* raise errors that occur during comparisons such as when the arrays
+  have incompatible shapes (``np.array([1, 2]) == np.array([1, 2, 3])``).
+* return an array of all ``True`` or all ``False`` when values are
+  fundamentally not comparable (e.g. have different dtypes).  An example
+  is ``np.array(["a"]) == np.array([1])``.
+
+This mimics the Python behavior of returning ``False`` and ``True``
+when comparing incompatible types like ``"a" == 1`` and ``"a" != 1``.
+For a long time these gave ``DeprecationWarning`` or ``FutureWarning``.
diff --git a/doc/release/upcoming_changes/22707.improvement.rst b/doc/release/upcoming_changes/22707.improvement.rst
new file mode 100644
index 000000000..1b8d4f844
--- /dev/null
+++ b/doc/release/upcoming_changes/22707.improvement.rst
@@ -0,0 +1,8 @@
+New ``DTypePromotionError``
+---------------------------
+NumPy now has a new ``DTypePromotionError`` which is used when two
+dtypes cannot be promoted to a common one, for example::
+
+    np.result_type("M8[s]", np.complex128)
+
+raises this new exception.
diff --git a/doc/release/upcoming_changes/22769.improvement.rst b/doc/release/upcoming_changes/22769.improvement.rst
new file mode 100644
index 000000000..3566648ac
--- /dev/null
+++ b/doc/release/upcoming_changes/22769.improvement.rst
@@ -0,0 +1,5 @@
+`np.show_config` uses information from Meson
+--------------------------------------------
+Build and system information now contains information from Meson.
+`np.show_config` now has a new optional parameter ``mode`` to help
+customize the output.
diff --git a/doc/release/upcoming_changes/22776.improvement.rst b/doc/release/upcoming_changes/22776.improvement.rst
new file mode 100644
index 000000000..669fcff16
--- /dev/null
+++ b/doc/release/upcoming_changes/22776.improvement.rst
@@ -0,0 +1,6 @@
+Fix ``np.ma.diff`` not preserving the mask when called with arguments prepend/append.
+-------------------------------------------------------------------------------------
+Calling ``np.ma.diff`` with arguments prepend and/or append now returns a 
+``MaskedArray`` with the input mask preserved.
+
+Previously, a ``MaskedArray`` without the mask was returned.
+\ No newline at end of file
diff --git a/doc/release/upcoming_changes/22863.new_feature.rst b/doc/release/upcoming_changes/22863.new_feature.rst
new file mode 100644
index 000000000..3f45ed834
--- /dev/null
+++ b/doc/release/upcoming_changes/22863.new_feature.rst
@@ -0,0 +1,4 @@
+String functions in np.char are compatible with NEP 42 custom dtypes
+--------------------------------------------------------------------
+Custom dtypes that represent unicode strings or byte strings can now be
+passed to the string functions in np.char.
diff --git a/doc/release/upcoming_changes/22963.new_feature.rst b/doc/release/upcoming_changes/22963.new_feature.rst
new file mode 100644
index 000000000..88ec3f641
--- /dev/null
+++ b/doc/release/upcoming_changes/22963.new_feature.rst
@@ -0,0 +1,7 @@
+String dtype instances can be created from the string abstract dtype classes
+----------------------------------------------------------------------------
+It is now possible to create a string dtype instance with a size without
+using the string name of the dtype. For example, ``type(np.dtype('U'))(8)``
+will create a dtype that is equivalent to ``np.dtype('U8')``. This feature
+is most useful when writing generic code dealing with string dtype
+classes.
diff --git a/doc/release/upcoming_changes/22982.new_feature.rst b/doc/release/upcoming_changes/22982.new_feature.rst
new file mode 100644
index 000000000..c98f2791c
--- /dev/null
+++ b/doc/release/upcoming_changes/22982.new_feature.rst
@@ -0,0 +1,13 @@
+Fujitsu C/C++ compiler is now supported
+----------------------------------------------
+Support for Fujitsu compiler has been added.
+To build with Fujitsu compiler, run:
+
+    python setup.py build -c fujitsu
+
+
+SSL2 is now supported
+-----------------------------------
+Support for SSL2 has been added. SSL2 is a library that provides OpenBLAS compatible GEMM functions.
+To enable SSL2, it need to edit site.cfg and build with Fujitsu compiler.
+See site.cfg.example.
diff --git a/doc/release/upcoming_changes/22997.improvement.rst b/doc/release/upcoming_changes/22997.improvement.rst
new file mode 100644
index 000000000..156c9dece
--- /dev/null
+++ b/doc/release/upcoming_changes/22997.improvement.rst
@@ -0,0 +1,5 @@
+Corrected error handling for NumPy C-API in Cython
+--------------------------------------------------
+Many NumPy C functions defined for use in Cython were lacking the
+correct error indicator like ``except -1`` or ``except *``.
+These have now been added.
diff --git a/doc/release/upcoming_changes/22998.expired.rst b/doc/release/upcoming_changes/22998.expired.rst
new file mode 100644
index 000000000..a4399b639
--- /dev/null
+++ b/doc/release/upcoming_changes/22998.expired.rst
@@ -0,0 +1,2 @@
+* ``+arr`` will now raise an error when the dtype is not
+  numeric (and positive is undefined).
diff --git a/doc/release/upcoming_changes/23011.deprecation.rst b/doc/release/upcoming_changes/23011.deprecation.rst
new file mode 100644
index 000000000..72168ff87
--- /dev/null
+++ b/doc/release/upcoming_changes/23011.deprecation.rst
@@ -0,0 +1 @@
+* ``np.finfo(None)`` is deprecated.
diff --git a/doc/release/upcoming_changes/23019.expired.rst b/doc/release/upcoming_changes/23019.expired.rst
new file mode 100644
index 000000000..0879c2658
--- /dev/null
+++ b/doc/release/upcoming_changes/23019.expired.rst
@@ -0,0 +1,2 @@
+* A sequence must now be passed into the stacking family of functions
+  (``stack``, ``vstack``, ``hstack``, ``dstack`` and ``column_stack``).
diff --git a/doc/release/upcoming_changes/23020.change.rst b/doc/release/upcoming_changes/23020.change.rst
new file mode 100644
index 000000000..b4198fcba
--- /dev/null
+++ b/doc/release/upcoming_changes/23020.change.rst
@@ -0,0 +1,9 @@
+Most NumPy functions are wrapped into a C-callable
+--------------------------------------------------
+To speed up the ``__array_function__`` dispatching, most NumPy functions
+are now wrapped into C-callables and are not proper Python functions or
+C methods.
+They still look and feel the same as before (like a Python function), and this
+should only improve performance and user experience (cleaner tracebacks).
+However, please inform the NumPy developers if this change confuses your
+program for some reason.
diff --git a/doc/release/upcoming_changes/23020.performance.rst b/doc/release/upcoming_changes/23020.performance.rst
new file mode 100644
index 000000000..db9fcbf3c
--- /dev/null
+++ b/doc/release/upcoming_changes/23020.performance.rst
@@ -0,0 +1,5 @@
+``__array_function__`` machinery is now much faster
+---------------------------------------------------
+The overhead of the majority of functions in NumPy is now smaller
+especially when keyword arguments are used.  This change significantly
+speeds up many simple function calls.
diff --git a/doc/release/upcoming_changes/23041.expired.rst b/doc/release/upcoming_changes/23041.expired.rst
new file mode 100644
index 000000000..9049ea70f
--- /dev/null
+++ b/doc/release/upcoming_changes/23041.expired.rst
@@ -0,0 +1,27 @@
+Nose support has been removed
+-----------------------------
+NumPy switched to using pytest in 2018 and nose has been unmaintained for many
+years. We have kept NumPy's nose support to avoid breaking downstream projects
+who might have been using it and not yet switched to pytest or some other
+testing framework. With the arrival of Python 3.12, unpatched nose will raise
+an error. It is time to move on.
+
+Decorators removed
+^^^^^^^^^^^^^^^^^^
+- raises
+- slow
+- setastest
+- skipif
+- knownfailif
+- deprecated
+- parametrize
+- _needs_refcount
+
+These are not to be confused with pytest versions with similar names, e.g.,
+pytest.mark.slow, pytest.mark.skipif, pytest.mark.parametrize.
+
+Functions removed
+^^^^^^^^^^^^^^^^^
+- Tester
+- import_nose
+- run_module_suite
diff --git a/doc/release/upcoming_changes/23060.expired.rst b/doc/release/upcoming_changes/23060.expired.rst
new file mode 100644
index 000000000..f80ba6fd0
--- /dev/null
+++ b/doc/release/upcoming_changes/23060.expired.rst
@@ -0,0 +1,5 @@
+The ``numpy.testing.utils`` shim has been removed.
+--------------------------------------------------
+Importing from the ``numpy.testing.utils`` shim has been deprecated since 2019,
+the shim has now been removed. All imports should be made directly from
+``numpy.testing``.
diff --git a/doc/release/upcoming_changes/23105.compatibility.rst b/doc/release/upcoming_changes/23105.compatibility.rst
new file mode 100644
index 000000000..8a0b677e5
--- /dev/null
+++ b/doc/release/upcoming_changes/23105.compatibility.rst
@@ -0,0 +1,5 @@
+* When loading data from a file handle using ``np.load``,
+  if the handle is at the end of file, as can happen when reading
+  multiple arrays by calling ``np.load`` repeatedly, numpy previously
+  raised ``ValueError`` if ``allow_pickle=False``, and ``OSError`` if
+  ``allow_pickle=True``. Now it raises ``EOFError`` instead, in both cases.
diff --git a/doc/release/upcoming_changes/23113.improvement.rst b/doc/release/upcoming_changes/23113.improvement.rst
new file mode 100644
index 000000000..299b8f762
--- /dev/null
+++ b/doc/release/upcoming_changes/23113.improvement.rst
@@ -0,0 +1,3 @@
+- The ``NDArrayOperatorsMixin`` class now specifies that it contains no
+  ``__slots__`` ensureing that subclasses can now make use of this feature in
+  Python.
diff --git a/doc/release/upcoming_changes/23136.performance.rst b/doc/release/upcoming_changes/23136.performance.rst
new file mode 100644
index 000000000..1096f8bd1
--- /dev/null
+++ b/doc/release/upcoming_changes/23136.performance.rst
@@ -0,0 +1,18 @@
+``ufunc.at`` can be much faster
+-------------------------------
+Generic ``ufunc.at`` can be up to 9x faster. The conditions for this speedup:
+
+- operands are aligned
+- no casting
+
+If ufuncs with appropriate indexed loops on 1d arguments with the above
+conditions, ``ufunc.at`` can be up to 60x faster (an additional 7x speedup).
+Appropriate indexed loops have been added to ``add``, ``subtract``,
+``multiply``, ``floor_divide``, ``maximum``, ``minimum``, ``fmax``, and
+``fmin``.
+
+The internal logic is similar to the logic used for regular ufuncs, which also
+have fast paths.
+
+Thanks to the `D. E. Shaw group <https://deshaw.com/>`_ for sponsoring this
+work.
diff --git a/doc/release/upcoming_changes/23195.improvement.rst b/doc/release/upcoming_changes/23195.improvement.rst
new file mode 100644
index 000000000..38b33e849
--- /dev/null
+++ b/doc/release/upcoming_changes/23195.improvement.rst
@@ -0,0 +1,20 @@
+Ability to directly spawn random number generators
+--------------------------------------------------
+`numpy.random.Generator.spawn` now allows to directly spawn new
+independent child generators via the `numpy.random.SeedSequence.spawn`
+mechanism.
+`numpy.random.BitGenerator.spawn` does the same for the underlying
+bit generator.
+
+Additionally, `numpy.random.BitGenerator.seed_seq` now gives direct
+access to the seed sequence used for initializing the bit generator.
+This allows for example::
+
+    seed = 0x2e09b90939db40c400f8f22dae617151
+    rng = np.random.default_rng(seed)
+    child_rng1, child_rng2 = rng.spawn(2)
+
+    # safely use rng, child_rng1, and child_rng2
+
+Previously, this was hard to do without passing the ``SeedSequence``
+explicitly.  Please see `numpy.random.SeedSequence` for more information.
diff --git a/doc/release/upcoming_changes/23229.compatibility.rst b/doc/release/upcoming_changes/23229.compatibility.rst
new file mode 100644
index 000000000..284cc06ab
--- /dev/null
+++ b/doc/release/upcoming_changes/23229.compatibility.rst
@@ -0,0 +1,3 @@
+- The ``busday_count`` method now correctly handles cases where the ``begindates`` is later in time
+  than the ``enddates``. Previously, the ``enddates`` was included, even though the documentation states
+  it is always excluded.
diff --git a/doc/release/upcoming_changes/23240.compatibility.rst b/doc/release/upcoming_changes/23240.compatibility.rst
new file mode 100644
index 000000000..28536a020
--- /dev/null
+++ b/doc/release/upcoming_changes/23240.compatibility.rst
@@ -0,0 +1,10 @@
+Array-likes that define ``__array_ufunc__`` can now override ufuncs if used as ``where``
+----------------------------------------------------------------------------------------
+If the ``where`` keyword argument of a :class:`numpy.ufunc` is a subclass of
+:class:`numpy.ndarray` or is a duck type that defines
+:func:`numpy.class.__array_ufunc__` it can override the behavior of the ufunc
+using the same mechanism as the input and output arguments.
+Note that for this to work properly, the ``where.__array_ufunc__``
+implementation will have to unwrap the ``where`` argument to pass it into the
+default implementation of the ``ufunc`` or, for :class:`numpy.ndarray`
+subclasses before using ``super().__array_ufunc__``.
+\ No newline at end of file
diff --git a/doc/release/upcoming_changes/23275.improvement.rst b/doc/release/upcoming_changes/23275.improvement.rst
new file mode 100644
index 000000000..14ed5d9ad
--- /dev/null
+++ b/doc/release/upcoming_changes/23275.improvement.rst
@@ -0,0 +1,4 @@
+``numpy.logspace`` now supports a non-scalar ``base`` argument
+--------------------------------------------------------------
+The ``base`` argument of ``numpy.logspace`` can now be array-like if it's
+broadcastable against the ``start`` and ``stop`` arguments.
+\ No newline at end of file
diff --git a/doc/release/upcoming_changes/23302.deprecation.rst b/doc/release/upcoming_changes/23302.deprecation.rst
new file mode 100644
index 000000000..9e36d658c
--- /dev/null
+++ b/doc/release/upcoming_changes/23302.deprecation.rst
@@ -0,0 +1 @@
+* ``np.round_`` is deprecated. Use `np.round` instead. 
diff --git a/doc/release/upcoming_changes/23314.deprecation.rst b/doc/release/upcoming_changes/23314.deprecation.rst
new file mode 100644
index 000000000..8bed1aef8
--- /dev/null
+++ b/doc/release/upcoming_changes/23314.deprecation.rst
@@ -0,0 +1,4 @@
+* ``np.product`` is deprecated. Use `np.prod` instead.
+* ``np.cumproduct`` is deprecated. Use `np.cumprod` instead.
+* ``np.sometrue`` is deprecated. Use `np.any` instead.
+* ``np.alltrue`` is deprecated. Use `np.all` instead.
diff --git a/doc/release/upcoming_changes/23322.improvement.rst b/doc/release/upcoming_changes/23322.improvement.rst
new file mode 100644
index 000000000..ce5ab8cf5
--- /dev/null
+++ b/doc/release/upcoming_changes/23322.improvement.rst
@@ -0,0 +1,4 @@
+`np.ma.dot()` now supports for non-2d arrays
+--------------------------------------------
+Previously `np.ma.dot()` only worked if `a` and `b` were both 2d.
+Now it works for non-2d arrays as well as `np.dot()`.
diff --git a/doc/release/upcoming_changes/23357.improvement.rst b/doc/release/upcoming_changes/23357.improvement.rst
new file mode 100644
index 000000000..3b474146b
--- /dev/null
+++ b/doc/release/upcoming_changes/23357.improvement.rst
@@ -0,0 +1,9 @@
+Explicitly show keys of .npz file in repr
+-----------------------------------------
+``NpzFile`` shows keys of loaded .npz file when printed.
+
+.. code-block:: python
+
+   >>> npzfile = np.load('arr.npz')
+   >>> npzfile
+   NpzFile 'arr.npz' with keys arr_0, arr_1, arr_2, arr_3, arr_4...
diff --git a/doc/release/upcoming_changes/23358.improvement.rst b/doc/release/upcoming_changes/23358.improvement.rst
new file mode 100644
index 000000000..a8a0e56ef
--- /dev/null
+++ b/doc/release/upcoming_changes/23358.improvement.rst
@@ -0,0 +1,5 @@
+NumPy now exposes DType classes in ``np.dtypes``
+------------------------------------------------
+The new `numpy.dtypes` module now exposes DType classes and
+will contain future dtype related functionality.
+Most users should have no need to use these classes directly.
diff --git a/doc/release/upcoming_changes/23376.expired.rst b/doc/release/upcoming_changes/23376.expired.rst
new file mode 100644
index 000000000..e289b087c
--- /dev/null
+++ b/doc/release/upcoming_changes/23376.expired.rst
@@ -0,0 +1,9 @@
+Environment variable to disable dispatching removed
+---------------------------------------------------
+Support for the ``NUMPY_EXPERIMENTAL_ARRAY_FUNCTION`` environment variable has
+been removed. This variable disabled dispatching with ``__array_function__``.
+
+Support for ``y=`` as an alias of ``out=`` removed
+--------------------------------------------------
+The ``fix``, ``isposinf`` and ``isneginf`` functions allowed using ``y=`` as a
+(deprecated) alias for ``out=``. This is no longer supported.
diff --git a/doc/release/upcoming_changes/23403.expired.rst b/doc/release/upcoming_changes/23403.expired.rst
new file mode 100644
index 000000000..b099eb4e9
--- /dev/null
+++ b/doc/release/upcoming_changes/23403.expired.rst
@@ -0,0 +1,4 @@
+* ``np.clip`` now defaults to same-kind casting. Falling back to
+  unsafe casting was deprecated in NumPy 1.17.
+* ``np.clip`` will now propagate ``np.nan`` values passed as ``min`` or ``max``.
+  Previously, a scalar NaN was usually ignored.  This was deprecated in NumPy 1.17.
diff --git a/doc/release/upcoming_changes/23480.expired.rst b/doc/release/upcoming_changes/23480.expired.rst
new file mode 100644
index 000000000..164677b98
--- /dev/null
+++ b/doc/release/upcoming_changes/23480.expired.rst
@@ -0,0 +1 @@
+* The ``np.dual`` submodule has been removed.
diff --git a/doc/release/upcoming_changes/23528.compatibility.rst b/doc/release/upcoming_changes/23528.compatibility.rst
new file mode 100644
index 000000000..439fb6a27
--- /dev/null
+++ b/doc/release/upcoming_changes/23528.compatibility.rst
@@ -0,0 +1,16 @@
+By default, the exported NumPy C API is now compatible with NumPy 1.19
+----------------------------------------------------------------------
+Starting with NumPy 1.25 when including NumPy headers, NumPy now
+defaults to exposing a backwards compatible API.
+This means that by default binaries such as wheels build against
+NumPy 1.25 will also work with NumPy 1.16 because it has the same API version
+as NumPy 1.19 which is the oldest NumPy version compatible with Python 3.9.
+
+You can customize this behavior using::
+
+    #define NPY_TARGET_VERSION NPY_1_22_API_VERSION
+
+or the equivalent ``-D`` option to the compiler.  For more details
+please see `for-downstream-package-authors`_.
+A change should only be required in rare cases when a package relies on newly
+added C-API.
diff --git a/doc/release/upcoming_changes/23601.change.rst b/doc/release/upcoming_changes/23601.change.rst
new file mode 100644
index 000000000..e09bd50fe
--- /dev/null
+++ b/doc/release/upcoming_changes/23601.change.rst
@@ -0,0 +1,6 @@
+C++ standard library usage
+--------------------------
+
+NumPy builds now depend on the C++ standard library, because
+the ``numpy.core._multiarray_umath`` extension is linked with
+the C++ linker.
diff --git a/doc/release/upcoming_changes/23660.expired.rst b/doc/release/upcoming_changes/23660.expired.rst
new file mode 100644
index 000000000..615367350
--- /dev/null
+++ b/doc/release/upcoming_changes/23660.expired.rst
@@ -0,0 +1,2 @@
+* NumPy now always ignores sequence behavior for an array-like (defining
+  one of the array protocols).  (Deprecation started NumPy 1.20)
diff --git a/doc/release/upcoming_changes/23661.performance.rst b/doc/release/upcoming_changes/23661.performance.rst
new file mode 100644
index 000000000..e3e55f3d5
--- /dev/null
+++ b/doc/release/upcoming_changes/23661.performance.rst
@@ -0,0 +1,4 @@
+Faster membership test on ``NpzFile``
+-------------------------------------
+Membership test on ``NpzFile`` will no longer
+decompress the archive if it is successful.
diff --git a/doc/release/upcoming_changes/23666.expired.rst b/doc/release/upcoming_changes/23666.expired.rst
new file mode 100644
index 000000000..eb51c91f3
--- /dev/null
+++ b/doc/release/upcoming_changes/23666.expired.rst
@@ -0,0 +1,5 @@
+* The niche ``FutureWarning`` when casting to a subarray dtype in ``astype``
+  or the array creation functions such as ``asarray`` is now finalized.
+  The behavior is now always the same as if the subarray dtype was
+  wrapped into a single field (which was the workaround, previously).
+  (FutureWarning since NumPy 1.20)
diff --git a/doc/release/upcoming_changes/23713.improvement.rst b/doc/release/upcoming_changes/23713.improvement.rst
new file mode 100644
index 000000000..15a4f412b
--- /dev/null
+++ b/doc/release/upcoming_changes/23713.improvement.rst
@@ -0,0 +1,9 @@
+Signed and unsigned integers always compare correctly
+-----------------------------------------------------
+When ``uint64`` and ``int64`` are mixed in NumPy, NumPy typically
+promotes both to ``float64``.  This behavior may be argued about
+but is confusing for comparisons ``==``, ``<=``, since the results
+returned can be incorrect but the conversion is hidden since the
+result is a boolean.
+NumPy will now return the correct results for these by avoiding
+the cast to float.
diff --git a/doc/source/conf.py b/doc/source/conf.py
index 9546db5f2..ed5c11781 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -62,6 +62,13 @@ def replace_scalar_type_names():
 
 replace_scalar_type_names()
 
+
+# As of NumPy 1.25, a deprecation of `str`/`bytes` attributes happens.
+# For some reasons, the doc build accesses these, so ignore them.
+import warnings
+warnings.filterwarnings("ignore", "In the future.*NumPy scalar", FutureWarning)
+
+
 # -----------------------------------------------------------------------------
 # General configuration
 # -----------------------------------------------------------------------------
@@ -156,7 +163,6 @@ def setup(app):
 # If we deemed it desirable, we could in future make these real modules, which
 # would make `from numpy.char import split` work.
 sys.modules['numpy.char'] = numpy.char
-sys.modules['numpy.testing.dec'] = numpy.testing.dec
 
 # -----------------------------------------------------------------------------
 # HTML output
@@ -248,25 +254,39 @@ latex_documents = [
 #latex_use_parts = False
 
 latex_elements = {
-    'fontenc': r'\usepackage[LGR,T1]{fontenc}'
 }
 
 # Additional stuff for the LaTeX preamble.
 latex_elements['preamble'] = r'''
+\newfontfamily\FontForChinese{FandolSong-Regular}[Extension=.otf]
+\catcode`琴\active\protected\def琴{{\FontForChinese\string琴}}
+\catcode`春\active\protected\def春{{\FontForChinese\string春}}
+\catcode`鈴\active\protected\def鈴{{\FontForChinese\string鈴}}
+\catcode`猫\active\protected\def猫{{\FontForChinese\string猫}}
+\catcode`傅\active\protected\def傅{{\FontForChinese\string傅}}
+\catcode`立\active\protected\def立{{\FontForChinese\string立}}
+\catcode`业\active\protected\def业{{\FontForChinese\string业}}
+\catcode`（\active\protected\def（{{\FontForChinese\string（}}
+\catcode`）\active\protected\def）{{\FontForChinese\string）}}
+
 % In the parameters section, place a newline after the Parameters
-% header
-\usepackage{xcolor}
+% header.  This is default with Sphinx 5.0.0+, so no need for
+% the old hack then.
+% Unfortunately sphinx.sty 5.0.0 did not bump its version date
+% so we check rather sphinxpackagefootnote.sty (which exists
+% since Sphinx 4.0.0).
+\makeatletter
+\@ifpackagelater{sphinxpackagefootnote}{2022/02/12}
+    {}% Sphinx >= 5.0.0, nothing to do
+    {%
 \usepackage{expdlist}
 \let\latexdescription=\description
 \def\description{\latexdescription{}{} \breaklabel}
 % but expdlist old LaTeX package requires fixes:
 % 1) remove extra space
 \usepackage{etoolbox}
-\makeatletter
 \patchcmd\@item{{\@breaklabel} }{{\@breaklabel}}{}{}
-\makeatother
 % 2) fix bug in expdlist's way of breaking the line after long item label
-\makeatletter
 \def\breaklabel{%
     \def\@breaklabel{%
         \leavevmode\par
@@ -274,6 +294,7 @@ latex_elements['preamble'] = r'''
         \def\leavevmode{\def\leavevmode{\unhbox\voidb@x}}%
     }%
 }
+    }% Sphinx < 5.0.0 (and assumed >= 4.0.0)
 \makeatother
 
 % Make Examples/etc section headers smaller and more compact
diff --git a/doc/source/dev/alignment.rst b/doc/source/dev/alignment.rst
index bb1198ebf..f2321d17b 100644
--- a/doc/source/dev/alignment.rst
+++ b/doc/source/dev/alignment.rst
@@ -3,7 +3,7 @@
 .. _alignment:
 
 ****************
-Memory Alignment
+Memory alignment
 ****************
 
 NumPy alignment goals
diff --git a/doc/source/dev/depending_on_numpy.rst b/doc/source/dev/depending_on_numpy.rst
index 0582048cb..868b44162 100644
--- a/doc/source/dev/depending_on_numpy.rst
+++ b/doc/source/dev/depending_on_numpy.rst
@@ -48,70 +48,80 @@ job, either all warnings or otherwise at least ``DeprecationWarning`` and
 adapt your code.
 
 
+.. _depending_on_numpy:
+
 Adding a dependency on NumPy
 ----------------------------
 
 Build-time dependency
 ~~~~~~~~~~~~~~~~~~~~~
 
+.. note::
+
+    Before NumPy 1.25, the NumPy C-API was *not* backwards compatible.  This
+    means that when compiling with a NumPy version earlier than 1.25 you
+    have to compile with the oldest version you wish to support.
+    This can be done by using
+    `oldest-supported-numpy <https://github.com/scipy/oldest-supported-numpy/>`__.
+    Please see the NumPy 1.24 documentation at
+    `https://numpy.org/doc/1.24/dev/depending_on_numpy.html`__.
+
+
 If a package either uses the NumPy C API directly or it uses some other tool
 that depends on it like Cython or Pythran, NumPy is a *build-time* dependency
-of the package. Because the NumPy ABI is only forward compatible, you must
-build your own binaries (wheels or other package formats) against the lowest
-NumPy version that you support (or an even older version).
-
-Picking the correct NumPy version to build against for each Python version and
-platform can get complicated. There are a couple of ways to do this.
-Build-time dependencies are specified in ``pyproject.toml`` (see PEP 517),
-which is the file used to build wheels by PEP 517 compliant tools (e.g.,
-when using ``pip wheel``).
-
-You can specify everything manually in ``pyproject.toml``, or you can instead
-rely on the `oldest-supported-numpy <https://github.com/scipy/oldest-supported-numpy/>`__
-metapackage. ``oldest-supported-numpy`` will specify the correct NumPy version
-at build time for wheels, taking into account Python version, Python
-implementation (CPython or PyPy), operating system and hardware platform. It
-will specify the oldest NumPy version that supports that combination of
-characteristics.  Note: for platforms for which NumPy provides wheels on PyPI,
-it will be the first version with wheels (even if some older NumPy version
-happens to build).
-
-For conda-forge it's a little less complicated: there's dedicated handling for
-NumPy in build-time and runtime dependencies, so typically this is enough
-(see `here <https://conda-forge.org/docs/maintainer/knowledge_base.html#building-against-numpy>`__ for docs)::
+of the package. 
 
-    host:
-      - numpy
-    run:
-      - {{ pin_compatible('numpy') }}
+By default, NumPy will expose an API that is backwards compatible with the
+oldest NumPy version that supports the currently oldest compatible Python
+version.  NumPy 1.25.0 supports Python 3.9 and higher and NumPy 1.19 is the
+first version to support Python 3.9.  Thus, we guarantee that, when using
+defaults, NumPy 1.25 will expose a C-API compatible with NumPy 1.19.
+(the exact version is set within NumPy-internal header files).
 
-.. note::
+NumPy is also forward compatible for all minor releases, but a major release
+will require recompilation.
+
+The default behavior can be customized for example by adding::
 
-    ``pip`` has ``--no-use-pep517`` and ``--no-build-isolation`` flags that may
-    ignore ``pyproject.toml`` or treat it differently - if users use those
-    flags, they are responsible for installing the correct build dependencies
-    themselves.
+    #define NPY_TARGET_VERSION NPY_1_22_API_VERSION
 
-    ``conda`` will always use ``-no-build-isolation``; dependencies for conda
-    builds are given in the conda recipe (``meta.yaml``), the ones in
-    ``pyproject.toml`` have no effect.
+before including any NumPy headers (or the equivalent ``-D`` compiler flag) in
+every extension module that requires the NumPy C-API.
+This is mainly useful if you need to use newly added API at the cost of not
+being compatible with older versions.
 
-    Please do not use ``setup_requires`` (it is deprecated and may invoke
-    ``easy_install``).
+If for some reason you wish to compile for the currently installed NumPy
+version by default you can add::
 
-Because for NumPy you have to care about ABI compatibility, you
-specify the version with ``==`` to the lowest supported version. For your other
-build dependencies you can probably be looser, however it's still important to
-set lower and upper bounds for each dependency. It's fine to specify either a
-range or a specific version for a dependency like ``wheel`` or ``setuptools``.
+    #ifndef NPY_TARGET_VERSION
+        #define NPY_TARGET_VERSION NPY_API_VERSION
+    #endif
 
-.. warning::
+Which allows a user to override the default via ``-DNPY_TARGET_VERSION``.
+This define must be consistent for each extension module (use of
+``import_array()``) and also applies to the umath module.
+
+When you compile against NumPy, you should add the proper version restrictions
+to your ``pyproject.toml`` (see PEP 517).  Since your extension will not be
+compatible with a new major release of NumPy and may not be compatible with
+very old versions.
+
+For conda-forge packages, please see
+`here <https://conda-forge.org/docs/maintainer/knowledge_base.html#building-against-numpy>`__.
+
+as of now, it is usually as easy as including::
+
+    host:
+      - numpy
+    run:
+      - {{ pin_compatible('numpy') }}
+
+.. note::
 
-    Note that ``setuptools`` does major releases often and those may contain
-    changes that break ``numpy.distutils``, which will *not* be updated anymore
-    for new ``setuptools`` versions. It is therefore recommended to set an
-    upper version bound in your build configuration for the last known version
-    of ``setuptools`` that works with your build.
+    At the time of NumPy 1.25, NumPy 2.0 is expected to be the next release
+    of NumPy.  The NumPy 2.0 release is expected to require a different pin,
+    since NumPy 2+ will be needed in order to be compatible with both NumPy
+    1.x and 2.x.
 
 
 Runtime dependency & version ranges
diff --git a/doc/source/dev/development_advanced_debugging.rst b/doc/source/dev/development_advanced_debugging.rst
index efd78cfad..735ff7328 100644
--- a/doc/source/dev/development_advanced_debugging.rst
+++ b/doc/source/dev/development_advanced_debugging.rst
@@ -1,3 +1,5 @@
+.. _advanced_debugging:
+
 ========================
 Advanced debugging tools
 ========================
@@ -39,12 +41,33 @@ and means you do not have to worry about making reference counting errors,
 which can be intimidating.
 
 
-Python debug build for finding memory leaks
-===========================================
+Python debug build
+==================
+
+Debug builds of Python are easily available for example via the system package
+manager on Linux systems, but are also available on other platforms, possibly in
+a less convenient format. If you cannot easily install a debug build of Python
+from a system package manager, you can build one yourself using `pyenv
+<https://github.com/pyenv/pyenv>`_. For example, to install and globally
+activate a debug build of Python 3.10.8, one would do::
+
+    pyenv install -g 3.10.8
+    pyenv global 3.10.8
 
-Debug builds of Python are easily available for example on ``debian`` systems,
-and can be used on all platforms.
-Running a test or terminal is usually as easy as::
+Note that ``pyenv install`` builds Python from source, so you must ensure that
+Python's dependencies are installed before building, see the pyenv documentation
+for platform-specific installation instructions. You can use ``pip`` to install
+Python dependencies you may need for your debugging session. If there is no
+debug wheel available on `pypi,` you will need to build the dependencies from
+source and ensure that your dependencies are also compiled as debug builds.
+
+Often debug builds of Python name the Python executable ``pythond`` instead of
+``python``. To check if you have a debug build of Python installed, you can run
+e.g. ``pythond -m sysconfig`` to get the build configuration for the Python
+executable. A debug build will be built with debug compiler options in
+``CFLAGS`` (e.g. ``-g -Og``).
+
+Running the Numpy tests or an interactive terminal is usually as easy as::
 
     python3.8d runtests.py
     # or
@@ -63,6 +86,8 @@ A Python debug build will help:
     sys.gettotalrefcount()
     sys.getallocatedblocks()
 
+- Python debug builds allow easier debugging with gdb and other C debuggers.
+
 
 Use together with ``pytest``
 ----------------------------
diff --git a/doc/source/dev/development_environment.rst b/doc/source/dev/development_environment.rst
index 4772366d2..aedc489d6 100644
--- a/doc/source/dev/development_environment.rst
+++ b/doc/source/dev/development_environment.rst
@@ -18,15 +18,36 @@ sources needs some additional steps, which are explained below.  For the rest
 of this chapter we assume that you have set up your git repo as described in
 :ref:`using-git`.
 
-.. note:: If you are having trouble building NumPy from source or setting up
-   your local development environment, you can try
-   to :ref:`build NumPy with Gitpod <development-gitpod>`.
+.. note::
+
+   If you are having trouble building NumPy from source or setting up your
+   local development environment, you can try to build NumPy with GitHub
+   Codespaces. It allows you to create the correct development environment
+   right in your browser, reducing the need to install local development
+   environments and deal with incompatible dependencies.
+
+   If you have good internet connectivity and want a temporary set-up, it is
+   often faster to work on NumPy in a Codespaces environment. For documentation
+   on how to get started with Codespaces, see
+   `the Codespaces docs <https://docs.github.com/en/codespaces>`__.
+   When creating a codespace for the ``numpy/numpy`` repository, the default
+   2-core machine type works; 4-core will build and work a bit faster (but of
+   course at a cost of halving your number of free usage hours). Once your
+   codespace has started, you can run ``conda activate numpy-dev`` and your
+   development environment is completely set up - you can then follow the
+   relevant parts of the NumPy documentation to build, test, develop, write
+   docs, and contribute to NumPy.
+
 
 .. _testing-builds:
 
 Testing builds
 --------------
 
+Before running the tests, first install the test dependencies::
+
+    $ python -m pip install -r test_requirements.txt
+
 To build the development version of NumPy and run tests, spawn
 interactive shells with the Python import paths properly set up etc.,
 do one of::
@@ -267,6 +288,10 @@ Python is running inside gdb to verify your setup::
     >end
     sys.version_info(major=3, minor=7, micro=0, releaselevel='final', serial=0)
 
+Most python builds do not include debug symbols and are built with compiler
+optimizations enabled. To get the best debugging experience using a debug build
+of Python is encouraged, see :ref:`advanced_debugging`.
+
 Next you need to write a Python script that invokes the C code whose execution
 you want to debug. For instance ``mytest.py``::
 
@@ -285,14 +310,28 @@ And then in the debugger::
 
 The execution will now stop at the corresponding C function and you can step
 through it as usual. A number of useful Python-specific commands are available.
-For example to see where in the Python code you are, use ``py-list``.  For more
-details, see `DebuggingWithGdb`_. Here are some commonly used commands:
+For example to see where in the Python code you are, use ``py-list``, to see the
+python traceback, use ``py-bt``.  For more details, see
+`DebuggingWithGdb`_. Here are some commonly used commands:
 
    - ``list``: List specified function or line.
    - ``next``: Step program, proceeding through subroutine calls.
    - ``step``: Continue program being debugged, after signal or breakpoint.
    - ``print``: Print value of expression EXP.
 
+Rich support for Python debugging requires that the ``python-gdb.py`` script
+distributed with Python is installed in a path where gdb can find it. If you
+installed your Python build from your system package manager, you likely do
+not need to manually do anything. However, if you built Python from source,
+you will likely need to create a ``.gdbinit`` file in your home directory
+pointing gdb at the location of your Python installation. For example, a
+version of python installed via `pyenv <https://github.com/pyenv/pyenv>`_
+needs a ``.gdbinit`` file with the following contents:
+
+.. code-block:: text
+
+    add-auto-load-safe-path ~/.pyenv
+
 Instead of plain ``gdb`` you can of course use your favourite
 alternative debugger; run it on the python binary with arguments
 ``runtests.py -g --python mytest.py``.
@@ -300,8 +339,6 @@ alternative debugger; run it on the python binary with arguments
 Building NumPy with a Python built with debug support (on Linux distributions
 typically packaged as ``python-dbg``) is highly recommended.
 
-
-
 .. _DebuggingWithGdb: https://wiki.python.org/moin/DebuggingWithGdb
 .. _tox: https://tox.readthedocs.io/
 .. _virtualenv: http://www.virtualenv.org/
diff --git a/doc/source/dev/development_gitpod.rst b/doc/source/dev/development_gitpod.rst
deleted file mode 100644
index 4e386867d..000000000
--- a/doc/source/dev/development_gitpod.rst
+++ /dev/null
@@ -1,271 +0,0 @@
-.. _development-gitpod:
-
-
-Using Gitpod for NumPy development
-==================================
-
-This section of the documentation will guide you through:
-
-*  using GitPod for your NumPy development environment
-*  creating a personal fork of the NumPy repository on GitHub
-*  a quick tour of Gitpod and VSCode
-*  working on the NumPy documentation in Gitpod
-
-Gitpod
-------
-
-`Gitpod`_  is an open-source platform for automated and ready-to-code 
-development environments. It enables developers to describe their dev 
-environment as code and start instant and fresh development environments for 
-each new task directly from your browser. This reduces the need to install local 
-development environments and deal with incompatible dependencies.
-
-Gitpod GitHub integration
--------------------------
-
-To be able to use Gitpod, you will need to have the Gitpod app installed on your 
-GitHub account, so if
-you do not have an account yet, you will need to create one first.
-
-Head over to the `Gitpod`_ website and click on the **Continue with GitHub** 
-button. You will be redirected to the GitHub authentication page.
-You will then be asked to install the `Gitpod GitHub app <https://github.com/marketplace/gitpod-io>`_.
-
-Make sure to select **All repositories** access option to avoid issues with 
-permissions later on. Click on the green **Install** button
-
-.. image:: ./gitpod-imgs/installing-gitpod-io.png
-   :alt: Gitpod repository access and installation screenshot
-
-This will install the necessary hooks for the integration.
-
-Forking the NumPy repository
-----------------------------
-
-The best way to work on NumPy as a contributor is by making a fork of the 
-repository first.
-
-#. Browse to the `NumPy repository on GitHub`_ and `create your own fork`_.
-#. Browse to your fork. Your fork will have a URL like 
-   https://github.com/melissawm/NumPy, except with your GitHub username in place of ``melissawm``.
-
-Starting Gitpod
----------------
-Once you have authenticated to Gitpod through GitHub, you can install the 
-`Gitpod browser extension <https://www.gitpod.io/docs/browser-extension>`_  
-which will add a **Gitpod** button next to the **Code** button in the 
-repository:
-
-.. image:: ./gitpod-imgs/NumPy-github.png
-   :alt: NumPy repository with Gitpod button screenshot
-
-#. If you install the extension - you can click the **Gitpod** button to start 
-   a new workspace.
-
-#. Alternatively, if you do not want to install the browser extension, you can 
-   visit https://gitpod.io/#https://github.com/USERNAME/NumPy replacing 
-   ``USERNAME`` with your GitHub username.
-
-#. In both cases, this will open a new tab on your web browser and start 
-   building your development environment. Please note this can take a few 
-   minutes.
-
-#. Once the build is complete, you will be directed to your workspace, 
-   including the VSCode editor and all the dependencies you need to work on 
-   NumPy. The first time you start your workspace, you will notice that there 
-   might be some actions running. This will ensure that you have a development 
-   version of NumPy installed and that the docs are being pre-built for you.
-
-#. When your workspace is ready, you can :ref:`test the build<testing-builds>` by 
-   entering::
-
-      $ python runtests.py -v
-
-``runtests.py`` is another script in the NumPy root directory. It runs a suite 
-of tests that make sure NumPy is working as it should, and ``-v`` activates the 
-``--verbose`` option to show all the test output.
-
-Quick workspace tour
---------------------
-Gitpod uses VSCode as the editor. If you have not used this editor before, you 
-can check the Getting started `VSCode docs`_ to familiarize yourself with it.
-
-Your workspace will look similar to the image below:
-
-.. image:: ./gitpod-imgs/gitpod-workspace.png
-   :alt: Gitpod workspace screenshot
-
-.. note::  By default, VSCode initializes with a light theme. You can change to 
-   a dark theme by with the keyboard shortcut :kbd:`Cmd-K Cmd-T` in Mac or 
-   :kbd:`Ctrl-K Ctrl-T` in Linux and Windows.
-
-We have marked some important sections in the editor:
-
-#. Your current Python interpreter - by default, this is ``numpy-dev`` and 
-   should be displayed in the status bar and on your terminal. You do not need 
-   to activate the conda environment as this will always be activated for you.
-#. Your current branch is always displayed in the status bar. You can also use 
-   this button to change or create branches.
-#. GitHub Pull Requests extension - you can use this to work with Pull Requests 
-   from your workspace.
-#. Marketplace extensions - we have added some essential extensions to the NumPy 
-   Gitpod. Still, you can also install other extensions or syntax highlighting 
-   themes for your user, and these will be preserved for you.
-#. Your workspace directory - by default, it is ``/workspace/numpy``. **Do not 
-   change this** as this is the only directory preserved in Gitpod.
-
-We have also pre-installed a few tools and VSCode extensions to help with the 
-development experience:
-
-*  `GitHub CLI <https://cli.github.com/>`_
-*  `VSCode rst extension <https://marketplace.visualstudio.com/items?itemName=lextudio.restructuredtext>`_
-*  `VSCode Live server extension <https://marketplace.visualstudio.com/items?itemName=ritwickdey.LiveServer>`_
-*  `VSCode Gitlens extension <https://marketplace.visualstudio.com/items?itemName=eamodio.gitlens>`_
-*  `VSCode autodocstrings extension <https://marketplace.visualstudio.com/items?itemName=njpwerner.autodocstring>`_
-*  `VSCode Git Graph extension <https://marketplace.visualstudio.com/items?itemName=mhutchie.git-graph>`_
-
-Development workflow with Gitpod
---------------------------------
-The  :ref:`development-workflow` section of this documentation contains 
-information regarding the NumPy development workflow. Make sure to check this 
-before working on your contributions.
-
-When using Gitpod, git is pre configured for you:
-
-#. You do not need to configure your git username, and email as this should be 
-   done for you as you authenticated through GitHub. You can check the git 
-   configuration with the command ``git config --list`` in your terminal.
-#. As you started your workspace from your own NumPy fork, you will by default 
-   have both ``upstream`` and ``origin`` added as remotes. You can verify this by 
-   typing ``git remote`` on your terminal or by clicking on the **branch name** 
-   on the status bar (see image below).
-
-   .. image:: ./gitpod-imgs/NumPy-gitpod-branches.png
-      :alt: Gitpod workspace branches plugin screenshot
-
-Rendering the NumPy documentation
----------------------------------
-You can find the detailed documentation on how rendering the documentation with 
-Sphinx works in the :ref:`howto-build-docs` section.
-
-The documentation is pre-built during your workspace initialization. So once 
-this task is completed, you have two main options to render the documentation 
-in Gitpod.
-
-Option 1: Using Liveserve
-~~~~~~~~~~~~~~~~~~~~~~~~~
-
-#. View the documentation in ``NumPy/doc/build/html``. You can start with 
-   ``index.html`` and browse, or you can jump straight to the file you're 
-   interested in.
-#. To see the rendered version of a page, you can right-click on the ``.html`` 
-   file and click on **Open with Live Serve**. Alternatively, you can open the 
-   file in the editor and click on the **Go live** button on the status bar.
-
-    .. image:: ./gitpod-imgs/vscode-statusbar.png
-        :alt: Gitpod workspace VSCode start live serve screenshot
-
-#. A simple browser will open to the right-hand side of the editor. We recommend 
-   closing it and click on the **Open in browser** button in the pop-up.
-#. To stop the server click on the **Port: 5500** button on the status bar.
-
-Option 2: Using the rst extension
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-A quick and easy way to see live changes in a ``.rst`` file as you work on it 
-uses the rst extension with docutils.
-
-.. note:: This will generate a simple live preview of the document without the 
-    ``html`` theme, and some backlinks might not be added correctly. But it is an 
-    easy and lightweight way to get instant feedback on your work.
-
-#. Open any of the source documentation files located in ``doc/source`` in the 
-   editor.
-#. Open VSCode Command Palette with :kbd:`Cmd-Shift-P` in Mac or 
-   :kbd:`Ctrl-Shift-P` in Linux and Windows. Start typing "restructured" 
-   and choose either "Open preview" or "Open preview to the Side".
-
-    .. image:: ./gitpod-imgs/vscode-rst.png
-        :alt: Gitpod workspace VSCode open rst screenshot
-
-#. As you work on the document, you will see a live rendering of it on the editor.
-
-    .. image:: ./gitpod-imgs/rst-rendering.png
-        :alt: Gitpod workspace VSCode rst rendering screenshot
-
-If you want to see the final output with the ``html`` theme you will need to 
-rebuild the docs with ``make html`` and use Live Serve as described in option 1.
-
-FAQ's and troubleshooting
--------------------------
-
-How long is my Gitpod workspace kept for?
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Your stopped workspace will be kept for 14 days and deleted afterwards if you do 
-not use them.
-
-Can I come back to a previous workspace?
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Yes, let's say you stepped away for a while and you want to carry on working on 
-your NumPy contributions. You need to visit https://gitpod.io/workspaces and 
-click on the workspace you want to spin up again. All your changes will be there 
-as you last left them.
-
-Can I install additional VSCode extensions?
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Absolutely! Any extensions you installed will be installed in your own workspace 
-and preserved.
-
-I registered on Gitpod but I still cannot see a ``Gitpod`` button in my repositories.
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Head to https://gitpod.io/integrations and make sure you are logged in. 
-Hover over GitHub and click on the three buttons that appear on the right. 
-Click on edit permissions and make sure you have ``user:email``, 
-``read:user``, and ``public_repo`` checked. Click on **Update Permissions** 
-and confirm the changes in the GitHub application page.
-
-.. image:: ./gitpod-imgs/gitpod-edit-permissions-gh.png
-   :alt: Gitpod integrations - edit GH permissions screenshot
-
-How long does my workspace stay active if I'm not using it?
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-If you keep your workspace open in a browser tab but don't interact with it, 
-it will shut down after 30 minutes. If you close the browser tab, it will 
-shut down after 3 minutes.
-
-My terminal is blank - there is no cursor and it's completely unresponsive
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Unfortunately this is a known-issue on Gitpod's side. You can sort this 
-issue in two ways:
-
-#. Create a new Gitpod workspace altogether.
-#. Head to your `Gitpod dashboard <https://gitpod.io/workspaces>`_ and locate 
-   the running workspace. Hover on it and click on the **three dots menu** 
-   and then click on **Stop**. When the workspace is completely stopped you 
-   can click on its name to restart it again.   
-
-.. image:: ./gitpod-imgs/gitpod-dashboard-stop.png
-   :alt: Gitpod dashboard and workspace menu screenshot
-
-I authenticated through GitHub but I still cannot commit to the repository through Gitpod. 
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Head to https://gitpod.io/integrations and make sure you are logged in. 
-Hover over GitHub and click on the three buttons that appear on the right. 
-Click on edit permissions and make sure you have ``public_repo`` checked.
-Click on **Update Permissions** and confirm the changes in the 
-GitHub application page.
-
-.. image:: ./gitpod-imgs/gitpod-edit-permissions-repo.png
-   :alt: Gitpod integrations - edit GH repository permissions screenshot
-
-.. _Gitpod: https://www.gitpod.io/
-.. _NumPy repository on GitHub: https://github.com/NumPy/NumPy
-.. _create your own fork: https://help.github.com/en/articles/fork-a-repo
-.. _VSCode docs: https://code.visualstudio.com/docs/getstarted/tips-and-tricks
diff --git a/doc/source/dev/development_workflow.rst b/doc/source/dev/development_workflow.rst
index 33bd0126d..9fa9c5809 100644
--- a/doc/source/dev/development_workflow.rst
+++ b/doc/source/dev/development_workflow.rst
@@ -200,12 +200,46 @@ sequences. In such cases you may explicitly skip CI by including one of these
 fragments in your commit message:
 
 * ``[skip ci]``: skip all CI
-* ``[skip github]``: skip GitHub Actions "build numpy and run tests" jobs
-* ``[skip actions]``: skip all GitHub Actions
+
+  Only recommended if you are still not ready for the checks to run on your PR
+  (for example, if this is only a draft.)
+
+* ``[skip actions]``: skip GitHub Actions jobs
+
+  `GitHub Actions <https://docs.github.com/actions>`__ is where most of the CI
+  checks are run, including the linter, benchmarking, running basic tests for
+  most architectures and OSs, and several compiler and CPU optimization
+  settings.
+  `See the configuration files for these checks. <https://github.com/numpy/numpy/tree/main/.github/workflows>`__
+
 * ``[skip travis]``: skip TravisCI jobs
+
+  `TravisCI <https://www.travis-ci.com/>`__ will test your changes against
+  Python 3.9 on the PowerPC and s390x architectures.
+  `See the configuration file for these checks. <https://github.com/numpy/numpy/blob/main/.travis.yml>`__
+
 * ``[skip azp]``: skip Azure jobs
+
+  `Azure <https://azure.microsoft.com/en-us/products/devops/pipelines>`__ is
+  where all comprehensive tests are run. This is an expensive run, and one you
+  could typically skip if you do documentation-only changes, for example.
+  `See the main configuration file for these checks. <https://github.com/numpy/numpy/blob/main/azure-pipelines.yml>`__
+
 * ``[skip circle]``: skip CircleCI jobs
 
+  `CircleCI <https://circleci.com/>`__ is where we build the documentation and
+  store the generated artifact for preview in each PR. This check will also run
+  all the docstrings examples and verify their results. If you don't make
+  documentation changes, but you make changes to a function's API, for example,
+  you may need to run these tests to verify that the doctests are still valid.
+  `See the configuration file for these checks. <https://github.com/numpy/numpy/blob/main/.circleci/config.yml>`__
+
+* ``[skip cirrus]``: skip Cirrus jobs
+
+  `CirrusCI <https://cirrus-ci.org/>`__ mostly triggers Linux aarch64 and MacOS Arm64 wheels
+  uploads.
+  `See the configuration file for these checks. <https://github.com/numpy/numpy/blob/main/.cirrus.star>`__
+
 Test building wheels
 ~~~~~~~~~~~~~~~~~~~~
 
@@ -481,6 +515,28 @@ usual::
      git commit -am 'ENH - much better code'
      git push origin my-feature-branch # pushes directly into your repo
 
+
+Checkout changes from an existing pull request
+==============================================
+
+If you want to test the changes in a pull request or continue the work in a
+new pull request, the commits are to be cloned into a local branch in your
+forked repository
+
+First ensure your upstream points to the main repo, as from :ref:`linking-to-upstream`
+
+Then, fetch the changes and create a local branch. Assuming ``$ID`` is the pull request number
+and ``$BRANCHNAME`` is the name of the *new local* branch you wish to create::
+
+    git fetch upstream pull/$ID/head:$BRANCHNAME
+
+Checkout the newly created branch::
+
+    git checkout $BRANCHNAME
+
+You now have the changes in the pull request.
+
+
 Exploring your repository
 =========================
 
diff --git a/doc/source/dev/gitpod-imgs/NumPy-github.png b/doc/source/dev/gitpod-imgs/NumPy-github.png
deleted file mode 100644
index 010b0fc5e..000000000
--- a/doc/source/dev/gitpod-imgs/NumPy-github.png
+++ /dev/null
diff --git a/doc/source/dev/gitpod-imgs/NumPy-gitpod-branches.png b/doc/source/dev/gitpod-imgs/NumPy-gitpod-branches.png
deleted file mode 100644
index 3ee6c5f20..000000000
--- a/doc/source/dev/gitpod-imgs/NumPy-gitpod-branches.png
+++ /dev/null
diff --git a/doc/source/dev/gitpod-imgs/gitpod-dashboard-stop.png b/doc/source/dev/gitpod-imgs/gitpod-dashboard-stop.png
deleted file mode 100644
index 40f137745..000000000
--- a/doc/source/dev/gitpod-imgs/gitpod-dashboard-stop.png
+++ /dev/null
diff --git a/doc/source/dev/gitpod-imgs/gitpod-edit-permissions-gh.png b/doc/source/dev/gitpod-imgs/gitpod-edit-permissions-gh.png
deleted file mode 100644
index 8955e907a..000000000
--- a/doc/source/dev/gitpod-imgs/gitpod-edit-permissions-gh.png
+++ /dev/null
diff --git a/doc/source/dev/gitpod-imgs/gitpod-edit-permissions-repo.png b/doc/source/dev/gitpod-imgs/gitpod-edit-permissions-repo.png
deleted file mode 100644
index 8bfaff81c..000000000
--- a/doc/source/dev/gitpod-imgs/gitpod-edit-permissions-repo.png
+++ /dev/null
diff --git a/doc/source/dev/gitpod-imgs/gitpod-workspace.png b/doc/source/dev/gitpod-imgs/gitpod-workspace.png
deleted file mode 100644
index a65c9bd7e..000000000
--- a/doc/source/dev/gitpod-imgs/gitpod-workspace.png
+++ /dev/null
diff --git a/doc/source/dev/gitpod-imgs/installing-gitpod-io.png b/doc/source/dev/gitpod-imgs/installing-gitpod-io.png
deleted file mode 100644
index 97319a729..000000000
--- a/doc/source/dev/gitpod-imgs/installing-gitpod-io.png
+++ /dev/null
diff --git a/doc/source/dev/gitpod-imgs/rst-rendering.png b/doc/source/dev/gitpod-imgs/rst-rendering.png
deleted file mode 100644
index 41cc305f3..000000000
--- a/doc/source/dev/gitpod-imgs/rst-rendering.png
+++ /dev/null
diff --git a/doc/source/dev/gitpod-imgs/vscode-rst.png b/doc/source/dev/gitpod-imgs/vscode-rst.png
deleted file mode 100644
index 5b574c115..000000000
--- a/doc/source/dev/gitpod-imgs/vscode-rst.png
+++ /dev/null
diff --git a/doc/source/dev/gitpod-imgs/vscode-statusbar.png b/doc/source/dev/gitpod-imgs/vscode-statusbar.png
deleted file mode 100644
index 3febbcee0..000000000
--- a/doc/source/dev/gitpod-imgs/vscode-statusbar.png
+++ /dev/null
diff --git a/doc/source/dev/gitwash/development_setup.rst b/doc/source/dev/gitwash/development_setup.rst
index a2fc61d2e..75b7fb2f8 100644
--- a/doc/source/dev/gitwash/development_setup.rst
+++ b/doc/source/dev/gitwash/development_setup.rst
@@ -103,6 +103,14 @@ Make the local copy
     git config branch.main.remote upstream
     git config branch.main.merge refs/heads/main
 
+#. Initialize the required git submodules: ::
+
+    git submodule update --init
+
+#. Pull from upstream to get latest tag information: ::
+
+    git pull
+
 ******************************************************************************
 Look it over
 ******************************************************************************
diff --git a/doc/source/dev/gitwash/git_resources.rst b/doc/source/dev/gitwash/git_resources.rst
index c41af762c..b6930f394 100644
--- a/doc/source/dev/gitwash/git_resources.rst
+++ b/doc/source/dev/gitwash/git_resources.rst
@@ -1,7 +1,7 @@
 .. _git-resources:
 
 =========================
-Additional Git_ Resources
+Additional Git_ resources
 =========================
 
 Tutorials and summaries
diff --git a/doc/source/dev/howto_build_docs.rst b/doc/source/dev/howto_build_docs.rst
index 02a8820c9..b3d2e3055 100644
--- a/doc/source/dev/howto_build_docs.rst
+++ b/doc/source/dev/howto_build_docs.rst
@@ -14,22 +14,13 @@ in several different formats.
 Development environments
 ========================
 
-Before proceeding further it should be noted that the documentation is built with the ``make`` tool,
-which is not natively available on Windows. MacOS or Linux users can jump
-to :ref:`how-todoc.prerequisites`. It is recommended for Windows users to set up their development
-environment on :ref:`Gitpod <development-gitpod>` or `Windows Subsystem
-for Linux (WSL) <https://docs.microsoft.com/en-us/windows/wsl/install-win10>`_. WSL is a good option
-for a persistent local set-up.
-
-Gitpod
-~~~~~~
-Gitpod is an open-source platform that automatically creates the correct development environment right
-in your browser, reducing the need to install local development environments and deal with
-incompatible dependencies.
-
-If you have good internet connectivity and want a temporary set-up,
-it is often faster to build with Gitpod. Here are the in-depth instructions for
-:ref:`building NumPy with Gitpod <development-gitpod>`.
+Before proceeding further it should be noted that the documentation is built
+with the ``make`` tool, which is not natively available on Windows. MacOS or
+Linux users can jump to :ref:`how-todoc.prerequisites`. It is recommended for
+Windows users to set up their development environment on
+GitHub Codespaces (see :ref:`recommended-development-setup`) or
+`Windows Subsystem for Linux (WSL) <https://learn.microsoft.com/en-us/windows/wsl/install>`_.
+WSL is a good option for a persistent local set-up.
 
 
 .. _how-todoc.prerequisites:
@@ -44,7 +35,8 @@ NumPy
 
 Since large parts of the main documentation are obtained from NumPy via
 ``import numpy`` and examining the docstrings, you will need to first
-:ref:`build <building-from-source>` and install it so that the correct version is imported.
+:ref:`build <development-environment>` and install it so that the correct version is
+imported.
 NumPy has to be re-built and re-installed every time you fetch the latest version of the
 repository, before generating the documentation. This ensures that the NumPy version and
 the git repository version are in sync.
diff --git a/doc/source/dev/index.rst b/doc/source/dev/index.rst
index 93b57f7e2..b4479fa0d 100644
--- a/doc/source/dev/index.rst
+++ b/doc/source/dev/index.rst
@@ -45,7 +45,7 @@ Here's the short summary, complete TOC links are below:
 
    * Clone the project to your local computer::
 
-      git clone https://github.com/your-username/numpy.git
+      git clone --recurse-submodules https://github.com/your-username/numpy.git
 
    * Change the directory::
 
@@ -60,12 +60,16 @@ Here's the short summary, complete TOC links are below:
      - ``upstream``, which refers to the ``numpy`` repository
      - ``origin``, which refers to your personal fork
 
-2. Develop your contribution:
-
-   * Pull the latest changes from upstream::
+   * Pull the latest changes from upstream, including tags::
 
       git checkout main
-      git pull upstream main
+      git pull upstream main --tags
+
+   * Initialize numpy's submodules::
+
+      git submodule update --init
+
+2. Develop your contribution:
 
    * Create a branch for the feature you want to work on. Since the
      branch name will appear in the merge message, use a sensible name
@@ -180,7 +184,7 @@ Guidelines
   get no response to your pull request within a week.
 
 .. _stylistic-guidelines:
-  
+
 Stylistic Guidelines
 --------------------
 
@@ -210,7 +214,7 @@ Running NumPy's test suite locally requires some additional packages, such as
 in ``test_requirements.txt`` in the top-level directory, and can conveniently
 be installed with::
 
-    pip install -r test_requirements.txt
+    $ python -m pip install -r test_requirements.txt
 
 Tests for a module should ideally cover all code in that module,
 i.e., statement coverage should be at 100%.
@@ -258,7 +262,6 @@ The rest of the story
 
    Git Basics <gitwash/index>
    development_environment
-   development_gitpod
    howto_build_docs
    development_workflow
    development_advanced_debugging
diff --git a/doc/source/dev/reviewer_guidelines.rst b/doc/source/dev/reviewer_guidelines.rst
index 8d938319e..0ed9bf2e5 100644
--- a/doc/source/dev/reviewer_guidelines.rst
+++ b/doc/source/dev/reviewer_guidelines.rst
@@ -1,7 +1,7 @@
 .. _reviewer-guidelines:
 
 ===================
-Reviewer Guidelines
+Reviewer guidelines
 ===================
 
 Reviewing open pull requests (PRs) helps move the project forward. We encourage
@@ -101,7 +101,34 @@ For maintainers
   If a PR becomes inactive, maintainers may make larger changes. 
   Remember, a PR is a collaboration between a contributor and a reviewer/s, 
   sometimes a direct push is the best way to finish it.
- 
+
+API Changes
+-----------
+As mentioned most public API changes should be discussed ahead of time and
+often with a wider audience (on the mailing list, or even through a NEP).
+
+For changes in the public C-API be aware that the NumPy C-API is backwards
+compatible so that any addition must be ABI compatible with previous versions.
+When it is not the case, you must add a guard.
+
+For example ``PyUnicodeScalarObject`` struct contains the following::
+
+    #if NPY_FEATURE_VERSION >= NPY_1_20_API_VERSION
+        char *buffer_fmt;
+    #endif
+
+Because the ``buffer_fmt`` field was added to its end in NumPy 1.20 (all
+previous fields remained ABI compatible).
+Similarly, any function added to the API table in
+``numpy/core/code_generators/numpy_api.py`` must use the ``MinVersion``
+annotation.
+For example::
+
+    'PyDataMem_SetHandler':                 (304, MinVersion("1.22")),
+
+Header only functionality (such as a new macro) typically does not need to be
+guarded.
+
 GitHub Workflow
 ---------------
 
diff --git a/doc/source/f2py/buildtools/cmake.rst b/doc/source/f2py/buildtools/cmake.rst
index 8c654c73e..db64453b4 100644
--- a/doc/source/f2py/buildtools/cmake.rst
+++ b/doc/source/f2py/buildtools/cmake.rst
@@ -18,7 +18,7 @@ but this `extensive CMake collection`_ of resources is great.
    ``f2py`` is not particularly native or pleasant; and a more natural approach
    is to consider :ref:`f2py-skbuild`
 
-Fibonacci Walkthrough (F77)
+Fibonacci walkthrough (F77)
 ===========================
 
 Returning to the ``fib``  example from :ref:`f2py-getting-started` section.
diff --git a/doc/source/f2py/buildtools/meson.rst b/doc/source/f2py/buildtools/meson.rst
index 7edc6722f..23b454ee5 100644
--- a/doc/source/f2py/buildtools/meson.rst
+++ b/doc/source/f2py/buildtools/meson.rst
@@ -23,13 +23,12 @@ build system like ``meson``. We will acquire this by:
 
 .. code-block:: bash
 
-    python -n numpy.f2py fib1.f -m fib2
+    python -m numpy.f2py fib1.f -m fib2
 
 Now, consider the following ``meson.build`` file for the ``fib`` and ``scalar``
 examples from :ref:`f2py-getting-started` section:
 
-.. literalinclude:: ./../code/meson.build
-    :language: meson
+.. literalinclude:: ../code/meson.build
 
 At this point the build will complete, but the import will fail:
 
@@ -58,7 +57,7 @@ to lowercase the original Fortran file with say:
 .. code-block:: bash
 
    tr "[:upper:]" "[:lower:]" < fib1.f > fib1.f
-   python -n numpy.f2py fib1.f -m fib2
+   python -m numpy.f2py fib1.f -m fib2
    meson --wipe builddir
    meson compile -C builddir
    cd builddir
@@ -69,7 +68,7 @@ possible. The easiest way to solve this is to let ``f2py`` deal with it:
 
 .. code-block:: bash
 
-   python -n numpy.f2py fib1.f -m fib2 --lower
+   python -m numpy.f2py fib1.f -m fib2 --lower
    meson --wipe builddir
    meson compile -C builddir
    cd builddir
@@ -93,8 +92,7 @@ for reasons discussed in :ref:`f2py-bldsys`.
 However, we can augment our workflow in a straightforward to take into account
 files for which the outputs are known when the build system is set up.
 
-.. literalinclude:: ./../code/meson_upd.build
-    :language: meson
+.. literalinclude:: ../code/meson_upd.build
 
 This can be compiled and run as before.
 
diff --git a/doc/source/f2py/code/ftype.f b/doc/source/f2py/code/ftype.f
index cabbb9e2d..67d5a224b 100644
--- a/doc/source/f2py/code/ftype.f
+++ b/doc/source/f2py/code/ftype.f
@@ -4,6 +4,6 @@ C FILE: FTYPE.F
 Cf2py integer optional,intent(in) :: n = 13
       REAL A,X
       COMMON /DATA/ A,X(3)
-      PRINT*, "IN FOO: N=",N," A=",A," X=[",X(1),X(2),X(3),"]"
+C     PRINT*, "IN FOO: N=",N," A=",A," X=[",X(1),X(2),X(3),"]"
       END
 C END OF FTYPE.F
diff --git a/doc/source/f2py/code/meson.build b/doc/source/f2py/code/meson.build
index 04276a176..1d409f2fb 100644
--- a/doc/source/f2py/code/meson.build
+++ b/doc/source/f2py/code/meson.build
@@ -1,31 +1,35 @@
 project('f2py_examples', 'c',
   version : '0.1',
-  default_options : ['warning_level=2'])
+  license: 'BSD-3',
+  meson_version: '>=0.64.0',
+  default_options : ['warning_level=2'],
+)
 
 add_languages('fortran')
 
 py_mod = import('python')
-py3 = py_mod.find_installation('python3')
-py3_dep = py3.dependency()
-message(py3.path())
-message(py3.get_install_dir())
+py = py_mod.find_installation(pure: false)
+py_dep = py.dependency()
 
-incdir_numpy = run_command(py3,
+incdir_numpy = run_command(py,
   ['-c', 'import os; os.chdir(".."); import numpy; print(numpy.get_include())'],
   check : true
 ).stdout().strip()
 
-incdir_f2py = run_command(py3,
+incdir_f2py = run_command(py,
     ['-c', 'import os; os.chdir(".."); import numpy.f2py; print(numpy.f2py.get_include())'],
     check : true
 ).stdout().strip()
 
 inc_np = include_directories(incdir_numpy, incdir_f2py)
 
-py3.extension_module('fib2',
-           'fib1.f',
-           'fib2module.c',
-           incdir_f2py+'/fortranobject.c',
-           include_directories: inc_np,
-           dependencies : py3_dep,
-           install : true)
-\ No newline at end of file
+py.extension_module('fib2',
+  [
+    'fib1.f',
+    'fib2module.c',  # note: this assumes f2py was manually run before!
+  ],
+  incdir_f2py / 'fortranobject.c',
+  include_directories: inc_np,
+  dependencies : py_dep,
+  install : true
+)
diff --git a/doc/source/f2py/code/meson_upd.build b/doc/source/f2py/code/meson_upd.build
index 44d69d182..5e4b13bae 100644
--- a/doc/source/f2py/code/meson_upd.build
+++ b/doc/source/f2py/code/meson_upd.build
@@ -1,37 +1,38 @@
 project('f2py_examples', 'c',
   version : '0.1',
-  default_options : ['warning_level=2'])
+  license: 'BSD-3',
+  meson_version: '>=0.64.0',
+  default_options : ['warning_level=2'],
+)
 
 add_languages('fortran')
 
 py_mod = import('python')
-py3 = py_mod.find_installation('python3')
-py3_dep = py3.dependency()
-message(py3.path())
-message(py3.get_install_dir())
+py = py_mod.find_installation(pure: false)
+py_dep = py.dependency()
 
-incdir_numpy = run_command(py3,
+incdir_numpy = run_command(py,
   ['-c', 'import os; os.chdir(".."); import numpy; print(numpy.get_include())'],
   check : true
 ).stdout().strip()
 
-incdir_f2py = run_command(py3,
+incdir_f2py = run_command(py,
     ['-c', 'import os; os.chdir(".."); import numpy.f2py; print(numpy.f2py.get_include())'],
     check : true
 ).stdout().strip()
 
 fibby_source = custom_target('fibbymodule.c',
-                            input : ['fib1.f'], # .f so no F90 wrappers
-                            output : ['fibbymodule.c', 'fibby-f2pywrappers.f'],
-                            command : [ py3, '-m', 'numpy.f2py', '@INPUT@',
-                            '-m', 'fibby', '--lower'])
+  input : ['fib1.f'],  # .f so no F90 wrappers
+  output : ['fibbymodule.c', 'fibby-f2pywrappers.f'],
+  command : [py, '-m', 'numpy.f2py', '@INPUT@', '-m', 'fibby', '--lower']
+)
 
 inc_np = include_directories(incdir_numpy, incdir_f2py)
 
-py3.extension_module('fibby',
-           'fib1.f',
-           fibby_source,
-           incdir_f2py+'/fortranobject.c',
-           include_directories: inc_np,
-           dependencies : py3_dep,
-           install : true)
+py.extension_module('fibby',
+  ['fib1.f', fibby_source],
+  incdir_f2py / 'fortranobject.c',
+  include_directories: inc_np,
+  dependencies : py_dep,
+  install : true
+)
diff --git a/doc/source/f2py/f2py.getting-started.rst b/doc/source/f2py/f2py.getting-started.rst
index da88b46f5..e96564814 100644
--- a/doc/source/f2py/f2py.getting-started.rst
+++ b/doc/source/f2py/f2py.getting-started.rst
@@ -18,6 +18,7 @@ following steps:
 
 * F2PY reads a signature file and writes a Python C/API module containing
   Fortran/C/Python bindings.
+
 * F2PY compiles all sources and builds an extension module containing
   the wrappers.
 
@@ -26,6 +27,13 @@ following steps:
     Fortran, SGI MIPSpro, Absoft, NAG, Compaq etc. For different build systems,
     see :ref:`f2py-bldsys`.
 
+  * Depending on your operating system, you may need to install the Python
+    development headers (which provide the file ``Python.h``) separately. In
+    Linux Debian-based distributions this package should be called ``python3-dev``,
+    in Fedora-based distributions it is ``python3-devel``. For macOS, depending
+    how Python was installed, your mileage may vary. In Windows, the headers are
+    typically installed already.
+
 Depending on the situation, these steps can be carried out in a single composite
 command or step-by-step; in which case some steps can be omitted or combined
 with others.
diff --git a/doc/source/f2py/windows/index.rst b/doc/source/f2py/windows/index.rst
index c1e6b4128..980346667 100644
--- a/doc/source/f2py/windows/index.rst
+++ b/doc/source/f2py/windows/index.rst
@@ -56,7 +56,7 @@ PGI Compilers (commercial)
    Windows support`_.
 
 Cygwin (FOSS)
-   Can also be used for ``gfortran``. Howeve, the POSIX API compatibility layer provided by
+   Can also be used for ``gfortran``. However, the POSIX API compatibility layer provided by
    Cygwin is meant to compile UNIX software on Windows, instead of building
    native Windows programs. This means cross compilation is required.
 
diff --git a/doc/source/f2py/windows/pgi.rst b/doc/source/f2py/windows/pgi.rst
index 644259abe..28e25f016 100644
--- a/doc/source/f2py/windows/pgi.rst
+++ b/doc/source/f2py/windows/pgi.rst
@@ -22,7 +22,5 @@ as classic Flang requires a custom LLVM and compilation from sources.
 	As of 29-01-2022, `PGI compiler toolchains`_ have been superseded by the Nvidia
   	HPC SDK, with no `native Windows support`_.
 
-However, 
-
 .. _PGI compiler toolchains: https://www.pgroup.com/index.html
 .. _native Windows support: https://developer.nvidia.com/nvidia-hpc-sdk-downloads#collapseFour
diff --git a/doc/source/index.rst b/doc/source/index.rst
index f31e730d7..5e31ec0a4 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -17,7 +17,6 @@ NumPy documentation
 **Version**: |version|
 
 **Download documentation**:
-`PDF Version <https://numpy.org/doc/stable/numpy-user.pdf>`_ |
 `Historical versions of documentation <https://numpy.org/doc/>`_
    
 **Useful links**:
diff --git a/doc/source/reference/array_api.rst b/doc/source/reference/array_api.rst
index 01c8787eb..6982ef310 100644
--- a/doc/source/reference/array_api.rst
+++ b/doc/source/reference/array_api.rst
@@ -152,8 +152,9 @@ Keyword Argument Renames
 
 The following functions have keyword arguments that have been renamed. The
 functionality of the keyword argument is identical unless otherwise stated.
-Each new keyword argument is not already present on the given function in
-``numpy``, so the changes are **compatible**.
+Renamed keyword arguments with the same semantic definition may be considered
+either **compatible** or **breaking**, depending on how the change is
+implemented.
 
 Note, this page does not list function keyword arguments that are in the main
 ``numpy`` namespace but not in the array API. Such keyword arguments are
@@ -189,7 +190,11 @@ functions to include additional keyword arguments from those required.
      - ``correction``
      - ``ddof``
      -
-
+   * - ``reshape``
+     - ``shape``
+     - ``newshape``
+     - The argument may be passed as a positional or keyword argument for both
+       NumPy and the array API.
 
 .. _array_api-type-promotion-differences:
 
@@ -228,6 +233,12 @@ independently of values or shapes.
        promoted.
      - **Breaking**
      - Example: ``a = np.array(1, dtype=np.int8); a += np.array(1, dtype=np.int16)``. The spec explicitly disallows this.
+   * - In-place operators are disallowed when the right-hand side operand
+       cannot broadcast to the shape of the left-hand side operand.
+     - **Strictness**
+     - This so-called "reverse broadcasting" should not be allowed. Example:
+       ``a = np.empty((2, 3, 4)); a += np.empty((3, 4))`` should error. See
+       https://github.com/numpy/numpy/issues/10404.
    * - ``int`` promotion for operators is only specified for integers within
        the bounds of the dtype.
      - **Strictness**
diff --git a/doc/source/reference/arrays.classes.rst b/doc/source/reference/arrays.classes.rst
index 2f40423ee..34da83670 100644
--- a/doc/source/reference/arrays.classes.rst
+++ b/doc/source/reference/arrays.classes.rst
@@ -71,10 +71,11 @@ NumPy provides several hooks that classes can customize:
    The method should return either the result of the operation, or
    :obj:`NotImplemented` if the operation requested is not implemented.
 
-   If one of the input or output arguments has a :func:`__array_ufunc__`
+   If one of the input, output, or ``where`` arguments has a :func:`__array_ufunc__`
    method, it is executed *instead* of the ufunc.  If more than one of the
    arguments implements :func:`__array_ufunc__`, they are tried in the
-   order: subclasses before superclasses, inputs before outputs, otherwise
+   order: subclasses before superclasses, inputs before outputs,
+   outputs before ``where``, otherwise
    left to right. The first routine returning something other than
    :obj:`NotImplemented` determines the result. If all of the
    :func:`__array_ufunc__` operations return :obj:`NotImplemented`, a
@@ -162,15 +163,6 @@ NumPy provides several hooks that classes can customize:
 
    .. versionadded:: 1.16
 
-   .. note::
-
-       - In NumPy 1.17, the protocol is enabled by default, but can be disabled
-         with ``NUMPY_EXPERIMENTAL_ARRAY_FUNCTION=0``.
-       - In NumPy 1.16, you need to set the environment variable
-         ``NUMPY_EXPERIMENTAL_ARRAY_FUNCTION=1`` before importing NumPy to use
-         NumPy function overrides.
-       - Eventually, expect to ``__array_function__`` to always be enabled.
-
    -  ``func`` is an arbitrary callable exposed by NumPy's public API,
       which was called in the form ``func(*args, **kwargs)``.
    -  ``types`` is a collection :py:class:`collections.abc.Collection`
diff --git a/doc/source/reference/arrays.dtypes.rst b/doc/source/reference/arrays.dtypes.rst
index 8606bc8f1..f4218ce86 100644
--- a/doc/source/reference/arrays.dtypes.rst
+++ b/doc/source/reference/arrays.dtypes.rst
@@ -188,10 +188,10 @@ Built-in Python types
     (all others)      :class:`object_`
     ================  ===============
 
-    Note that ``str`` refers to either null terminated bytes or unicode strings
-    depending on the Python version. In code targeting both Python 2 and 3
-    ``np.unicode_`` should be used as a dtype for strings.
-    See :ref:`Note on string types<string-dtype-note>`.
+    Note that ``str`` corresponds to UCS4 encoded unicode strings, while
+    ``string`` is an alias to ``bytes_``. The name ``np.unicode_`` is also
+    available as an alias to ``np.str_``, see :ref:`Note on string
+    types<string-dtype-note>`.
 
     .. admonition:: Example
 
@@ -263,11 +263,11 @@ Array-protocol type strings (see :ref:`arrays.interface`)
 
    .. admonition:: Note on string types
 
-    For backward compatibility with Python 2 the ``S`` and ``a`` typestrings
-    remain zero-terminated bytes and `numpy.string_` continues to alias
-    `numpy.bytes_`. To use actual strings in Python 3 use ``U`` or `numpy.str_`.
-    For signed bytes that do not need zero-termination ``b`` or ``i1`` can be
-    used.
+    For backward compatibility with existing code originally written to support
+    Python 2, ``S`` and ``a`` typestrings are zero-terminated bytes and
+    `numpy.string_` continues to alias `numpy.bytes_`. For unicode strings,
+    use ``U``, `numpy.str_`, or `numpy.unicode_`.  For signed bytes that do not
+    need zero-termination ``b`` or ``i1`` can be used.
 
 String with comma-separated fields
    A short-hand notation for specifying the format of a structured data type is
diff --git a/doc/source/reference/arrays.nditer.rst b/doc/source/reference/arrays.nditer.rst
index 8cabc1a06..fb1c91a07 100644
--- a/doc/source/reference/arrays.nditer.rst
+++ b/doc/source/reference/arrays.nditer.rst
@@ -3,7 +3,7 @@
 .. _arrays.nditer:
 
 *********************
-Iterating Over Arrays
+Iterating over arrays
 *********************
 
 .. note::
@@ -23,7 +23,7 @@ can accelerate the inner loop in Cython. Since the Python exposure of
 iterator API, these ideas will also provide help working with array
 iteration from C or C++.
 
-Single Array Iteration
+Single array iteration
 ======================
 
 The most basic task that can be done with the :class:`nditer` is to
@@ -64,7 +64,7 @@ namely the order they are stored in memory, whereas the elements of
 `a.T.copy(order='C')` get visited in a different order because they
 have been put into a different memory layout.
 
-Controlling Iteration Order
+Controlling iteration order
 ---------------------------
 
 There are times when it is important to visit the elements of an array
@@ -88,7 +88,7 @@ order='C' for C order and order='F' for Fortran order.
 
 .. _nditer-context-manager:
 
-Modifying Array Values
+Modifying array values
 ----------------------
 
 By default, the :class:`nditer` treats the input operand as a read-only
@@ -128,7 +128,7 @@ note that prior to 1.15, :class:`nditer` was not a context manager and
 did not have a `close` method. Instead it relied on the destructor to
 initiate the writeback of the buffer.
 
-Using an External Loop
+Using an external loop
 ----------------------
 
 In all the examples so far, the elements of `a` are provided by the
@@ -161,7 +161,7 @@ elements each.
     ...
     [0 3] [1 4] [2 5]
 
-Tracking an Index or Multi-Index
+Tracking an index or multi-index
 --------------------------------
 
 During iteration, you may want to use the index of the current
@@ -210,7 +210,7 @@ raise an exception.
       File "<stdin>", line 1, in <module>
     ValueError: Iterator flag EXTERNAL_LOOP cannot be used if an index or multi-index is being tracked
 
-Alternative Looping and Element Access
+Alternative looping and element access
 --------------------------------------
 
 To make its properties more readily accessible during iteration,
@@ -246,7 +246,7 @@ produce identical results to the ones in the previous section.
     array([[ 0,  1,  2],
            [-1,  0,  1]])
 
-Buffering the Array Elements
+Buffering the array elements
 ----------------------------
 
 When forcing an iteration order, we observed that the external loop
@@ -274,7 +274,7 @@ is enabled.
     ...
     [0 3 1 4 2 5]
 
-Iterating as a Specific Data Type
+Iterating as a specific data type
 ---------------------------------
 
 There are times when it is necessary to treat an array as a different
@@ -382,7 +382,7 @@ would violate the casting rule.
       File "<stdin>", line 2, in <module>
     TypeError: Iterator requested dtype could not be cast from dtype('float64') to dtype('int64'), the operand 0 dtype, according to the rule 'same_kind'
 
-Broadcasting Array Iteration
+Broadcasting array iteration
 ============================
 
 NumPy has a set of rules for dealing with arrays that have differing
@@ -417,7 +417,7 @@ which includes the input shapes to help diagnose the problem.
     ...
     ValueError: operands could not be broadcast together with shapes (2,) (2,3)
 
-Iterator-Allocated Output Arrays
+Iterator-allocated output arrays
 --------------------------------
 
 A common case in NumPy functions is to have outputs allocated based
diff --git a/doc/source/reference/arrays.scalars.rst b/doc/source/reference/arrays.scalars.rst
index 30d7be9f9..4dba54d6b 100644
--- a/doc/source/reference/arrays.scalars.rst
+++ b/doc/source/reference/arrays.scalars.rst
@@ -293,6 +293,10 @@ elements the data type consists of.)
    :members: __init__
    :exclude-members: __init__
 
+.. autoclass:: numpy.character
+   :members: __init__
+   :exclude-members: __init__
+
 .. autoclass:: numpy.bytes_
    :members: __init__
    :exclude-members: __init__
@@ -333,8 +337,6 @@ are also provided.
 .. note that these are documented with ..attribute because that is what
    autoclass does for aliases under the hood.
 
-.. autoclass:: numpy.bool8
-
 .. attribute:: int8
                int16
                int32
diff --git a/doc/source/reference/c-api/array.rst b/doc/source/reference/c-api/array.rst
index 8772b494c..baebfdb02 100644
--- a/doc/source/reference/c-api/array.rst
+++ b/doc/source/reference/c-api/array.rst
@@ -423,7 +423,7 @@ From other objects
     :c:data:`NPY_ARRAY_FORCECAST` is present in ``flags``,
     this call will generate an error if the data
     type cannot be safely obtained from the object. If you want to use
-    ``NULL`` for the *dtype* and ensure the array is notswapped then
+    ``NULL`` for the *dtype* and ensure the array is not swapped then
     use :c:func:`PyArray_CheckFromAny`. A value of 0 for either of the
     depth parameters causes the parameter to be ignored. Any of the
     following array flags can be added (*e.g.* using \|) to get the
@@ -548,22 +548,6 @@ From other objects
         :c:data:`NPY_ARRAY_F_CONTIGUOUS` \| :c:data:`NPY_ARRAY_WRITEABLE` \|
         :c:data:`NPY_ARRAY_ALIGNED` \| :c:data:`NPY_ARRAY_WRITEBACKIFCOPY`
 
-.. c:function:: int PyArray_GetArrayParamsFromObject( \
-        PyObject* op, PyArray_Descr* requested_dtype, npy_bool writeable, \
-        PyArray_Descr** out_dtype, int* out_ndim, npy_intp* out_dims, \
-        PyArrayObject** out_arr, PyObject* context)
-
-    .. deprecated:: NumPy 1.19
-
-        Unless NumPy is made aware of an issue with this, this function
-        is scheduled for rapid removal without replacement.
-
-    .. versionchanged:: NumPy 1.19
-
-        `context` is never used. Its use results in an error.
-
-    .. versionadded:: 1.6
-
 .. c:function:: PyObject* PyArray_CheckFromAny( \
         PyObject* op, PyArray_Descr* dtype, int min_depth, int max_depth, \
         int requirements, PyObject* context)
@@ -1202,17 +1186,6 @@ Converting data types
     return value is the enumerated typenumber that represents the
     data-type that *op* should have.
 
-.. c:function:: void PyArray_ArrayType( \
-        PyObject* op, PyArray_Descr* mintype, PyArray_Descr* outtype)
-
-    This function is superseded by :c:func:`PyArray_ResultType`.
-
-    This function works similarly to :c:func:`PyArray_ObjectType` (...)
-    except it handles flexible arrays. The *mintype* argument can have
-    an itemsize member and the *outtype* argument will have an
-    itemsize member at least as big but perhaps bigger depending on
-    the object *op*.
-
 .. c:function:: PyArrayObject** PyArray_ConvertToCommonType( \
         PyObject* op, int* n)
 
@@ -1490,7 +1463,7 @@ of the constant names is deprecated in 1.7.
     :c:func:`PyArray_FromAny` and a copy had to be made of some other
     array (and the user asked for this flag to be set in such a
     situation). The base attribute then points to the "misbehaved"
-    array (which is set read_only). :c:func`PyArray_ResolveWritebackIfCopy`
+    array (which is set read_only). :c:func:`PyArray_ResolveWritebackIfCopy`
     will copy its contents back to the "misbehaved"
     array (casting if necessary) and will reset the "misbehaved" array
     to :c:data:`NPY_ARRAY_WRITEABLE`. If the "misbehaved" array was not
@@ -2276,7 +2249,7 @@ Array Functions
     output array must have the correct shape, type, and be
     C-contiguous, or an exception is raised.
 
-.. c:function:: PyObject* PyArray_EinsteinSum( \
+.. c:function:: PyArrayObject* PyArray_EinsteinSum( \
         char* subscripts, npy_intp nop, PyArrayObject** op_in, \
         PyArray_Descr* dtype, NPY_ORDER order, NPY_CASTING casting, \
         PyArrayObject* out)
@@ -2475,9 +2448,9 @@ As of NumPy 1.6.0, these array iterators are superseded by
 the new array iterator, :c:type:`NpyIter`.
 
 An array iterator is a simple way to access the elements of an
-N-dimensional array quickly and efficiently. Section `2
-<#sec-array-iterator>`__ provides more description and examples of
-this useful approach to looping over an array.
+N-dimensional array quickly and efficiently, as seen in :ref:`the
+example <iteration-example>` which provides more description
+of this useful approach to looping over an array from C. 
 
 .. c:function:: PyObject* PyArray_IterNew(PyObject* arr)
 
@@ -3378,7 +3351,7 @@ Memory management
 
 .. c:function:: int PyArray_ResolveWritebackIfCopy(PyArrayObject* obj)
 
-    If ``obj.flags`` has :c:data:`NPY_ARRAY_WRITEBACKIFCOPY`, this function
+    If ``obj->flags`` has :c:data:`NPY_ARRAY_WRITEBACKIFCOPY`, this function
     clears the flags, `DECREF` s
     `obj->base` and makes it writeable, and sets ``obj->base`` to NULL. It then
     copies ``obj->data`` to `obj->base->data`, and returns the error state of
@@ -3608,29 +3581,17 @@ Miscellaneous Macros
 
     Returns the reference count of any Python object.
 
-.. c:function:: void PyArray_DiscardWritebackIfCopy(PyObject* obj)
+.. c:function:: void PyArray_DiscardWritebackIfCopy(PyArrayObject* obj)
 
-    If ``obj.flags`` has :c:data:`NPY_ARRAY_WRITEBACKIFCOPY`, this function
+    If ``obj->flags`` has :c:data:`NPY_ARRAY_WRITEBACKIFCOPY`, this function
     clears the flags, `DECREF` s
     `obj->base` and makes it writeable, and sets ``obj->base`` to NULL. In
-    contrast to :c:func:`PyArray_DiscardWritebackIfCopy` it makes no attempt
-    to copy the data from `obj->base` This undoes
+    contrast to :c:func:`PyArray_ResolveWritebackIfCopy` it makes no attempt
+    to copy the data from `obj->base`. This undoes
     :c:func:`PyArray_SetWritebackIfCopyBase`. Usually this is called after an
     error when you are finished with ``obj``, just before ``Py_DECREF(obj)``.
     It may be called multiple times, or with ``NULL`` input.
 
-.. c:function:: void PyArray_XDECREF_ERR(PyObject* obj)
-
-    Deprecated in 1.14, use :c:func:`PyArray_DiscardWritebackIfCopy`
-    followed by ``Py_XDECREF``
-
-    DECREF's an array object which may have the
-    :c:data:`NPY_ARRAY_WRITEBACKIFCOPY`
-    flag set without causing the contents to be copied back into the
-    original array. Resets the :c:data:`NPY_ARRAY_WRITEABLE` flag on the base
-    object. This is useful for recovering from an error condition when
-    writeback semantics are used, but will lead to wrong results.
-
 
 Enumerated Types
 ~~~~~~~~~~~~~~~~
diff --git a/doc/source/reference/c-api/config.rst b/doc/source/reference/c-api/config.rst
index 87130699b..963a106f8 100644
--- a/doc/source/reference/c-api/config.rst
+++ b/doc/source/reference/c-api/config.rst
@@ -118,14 +118,3 @@ Compiler directives
 .. c:macro:: NPY_LIKELY
 .. c:macro:: NPY_UNLIKELY
 .. c:macro:: NPY_UNUSED
-
-
-Interrupt Handling
-------------------
-
-.. c:macro:: NPY_INTERRUPT_H
-.. c:macro:: NPY_SIGSETJMP
-.. c:macro:: NPY_SIGLONGJMP
-.. c:macro:: NPY_SIGJMP_BUF
-.. c:macro:: NPY_SIGINT_ON
-.. c:macro:: NPY_SIGINT_OFF
diff --git a/doc/source/reference/c-api/coremath.rst b/doc/source/reference/c-api/coremath.rst
index eb0990edb..2293f0ae8 100644
--- a/doc/source/reference/c-api/coremath.rst
+++ b/doc/source/reference/c-api/coremath.rst
@@ -55,30 +55,30 @@ Floating point classification
 
 .. c:macro:: npy_isnan(x)
 
-    This is a macro, and is equivalent to C99 isnan: works for single, double
+    This is an alias for C99 isnan: works for single, double
     and extended precision, and return a non 0 value if x is a NaN.
 
 .. c:macro:: npy_isfinite(x)
 
-    This is a macro, and is equivalent to C99 isfinite: works for single,
+    This is an alias for C99 isfinite: works for single,
     double and extended precision, and return a non 0 value if x is neither a
     NaN nor an infinity.
 
 .. c:macro:: npy_isinf(x)
 
-    This is a macro, and is equivalent to C99 isinf: works for single, double
+    This is an alias for C99 isinf: works for single, double
     and extended precision, and return a non 0 value if x is infinite (positive
     and negative).
 
 .. c:macro:: npy_signbit(x)
 
-    This is a macro, and is equivalent to C99 signbit: works for single, double
+    This is an alias for C99 signbit: works for single, double
     and extended precision, and return a non 0 value if x has the signbit set
     (that is the number is negative).
 
 .. c:macro:: npy_copysign(x, y)
 
-    This is a function equivalent to C99 copysign: return x with the same sign
+    This is an alias for  C99 copysign: return x with the same sign
     as y. Works for any value, including inf and nan. Single and extended
     precisions are available with suffix f and l.
 
@@ -141,7 +141,7 @@ Those can be useful for precise floating point comparison.
 
 .. c:function:: double npy_nextafter(double x, double y)
 
-    This is a function equivalent to C99 nextafter: return next representable
+    This is an alias to C99 nextafter: return next representable
     floating point value from x in the direction of y. Single and extended
     precisions are available with suffix f and l.
 
diff --git a/doc/source/reference/c-api/data_memory.rst b/doc/source/reference/c-api/data_memory.rst
index 2084ab5d0..cab587efc 100644
--- a/doc/source/reference/c-api/data_memory.rst
+++ b/doc/source/reference/c-api/data_memory.rst
@@ -159,3 +159,87 @@ A better technique would be to use a ``PyCapsule`` as a base object:
         return NULL;
     }
     ...
+
+Example of memory tracing with ``np.lib.tracemalloc_domain``
+------------------------------------------------------------
+
+Note that since Python 3.6 (or newer), the builtin ``tracemalloc`` module can be used to
+track allocations inside NumPy. NumPy places its CPU memory allocations into the 
+``np.lib.tracemalloc_domain`` domain.
+For additional information, check: `https://docs.python.org/3/library/tracemalloc.html`.
+
+Here is an example on how to use ``np.lib.tracemalloc_domain``:
+
+.. code-block:: python
+
+    """
+       The goal of this example is to show how to trace memory
+       from an application that has NumPy and non-NumPy sections.
+       We only select the sections using NumPy related calls.
+    """
+    
+    import tracemalloc
+    import numpy as np
+    
+    # Flag to determine if we select NumPy domain
+    use_np_domain = True
+    
+    nx = 300
+    ny = 500
+    
+    # Start to trace memory
+    tracemalloc.start()
+    
+    # Section 1
+    # ---------
+    
+    # NumPy related call
+    a = np.zeros((nx,ny))
+    
+    # non-NumPy related call
+    b = [i**2 for i in range(nx*ny)]
+    
+    snapshot1 = tracemalloc.take_snapshot()
+    # We filter the snapshot to only select NumPy related calls
+    np_domain = np.lib.tracemalloc_domain
+    dom_filter = tracemalloc.DomainFilter(inclusive=use_np_domain,
+                                          domain=np_domain)
+    snapshot1 = snapshot1.filter_traces([dom_filter])
+    top_stats1 = snapshot1.statistics('traceback')
+    
+    print("================ SNAPSHOT 1 =================")
+    for stat in top_stats1:
+        print(f"{stat.count} memory blocks: {stat.size / 1024:.1f} KiB")
+        print(stat.traceback.format()[-1])
+    
+    # Clear traces of memory blocks allocated by Python
+    # before moving to the next section.
+    tracemalloc.clear_traces()
+    
+    # Section 2
+    #----------
+    
+    # We are only using NumPy
+    c = np.sum(a*a)
+    
+    snapshot2 = tracemalloc.take_snapshot()
+    top_stats2 = snapshot2.statistics('traceback')
+
+    print()
+    print("================ SNAPSHOT 2 =================")
+    for stat in top_stats2:
+        print(f"{stat.count} memory blocks: {stat.size / 1024:.1f} KiB")
+        print(stat.traceback.format()[-1])
+    
+    tracemalloc.stop()
+    
+    print()
+    print("============================================")
+    print("\nTracing Status : ", tracemalloc.is_tracing())
+    
+    try:
+        print("\nTrying to Take Snapshot After Tracing is Stopped.")
+        snap = tracemalloc.take_snapshot()
+    except Exception as e:
+        print("Exception : ", e)
+    
diff --git a/doc/source/reference/c-api/deprecations.rst b/doc/source/reference/c-api/deprecations.rst
index 5b1abc6f2..d805421f2 100644
--- a/doc/source/reference/c-api/deprecations.rst
+++ b/doc/source/reference/c-api/deprecations.rst
@@ -58,3 +58,7 @@ On compilers which support a #warning mechanism, NumPy issues a
 compiler warning if you do not define the symbol NPY_NO_DEPRECATED_API.
 This way, the fact that there are deprecations will be flagged for
 third-party developers who may not have read the release notes closely.
+
+Note that defining NPY_NO_DEPRECATED_API is not sufficient to make your
+extension ABI compatible with a given NumPy version. See
+:ref:`for-downstream-package-authors`.
diff --git a/doc/source/reference/c-api/dtype.rst b/doc/source/reference/c-api/dtype.rst
index 642f62749..bdd5a6f55 100644
--- a/doc/source/reference/c-api/dtype.rst
+++ b/doc/source/reference/c-api/dtype.rst
@@ -25,161 +25,161 @@ select the precision desired.
 Enumerated Types
 ----------------
 
-.. c:enumerator:: NPY_TYPES
+.. c:enum:: NPY_TYPES
 
-There is a list of enumerated types defined providing the basic 24
-data types plus some useful generic names. Whenever the code requires
-a type number, one of these enumerated types is requested. The types
-are all called ``NPY_{NAME}``:
+    There is a list of enumerated types defined providing the basic 24
+    data types plus some useful generic names. Whenever the code requires
+    a type number, one of these enumerated types is requested. The types
+    are all called ``NPY_{NAME}``:
 
-.. c:enumerator:: NPY_BOOL
+    .. c:enumerator:: NPY_BOOL
 
-    The enumeration value for the boolean type, stored as one byte.
-    It may only be set to the values 0 and 1.
+        The enumeration value for the boolean type, stored as one byte.
+        It may only be set to the values 0 and 1.
 
-.. c:enumerator:: NPY_BYTE
-.. c:enumerator:: NPY_INT8
+    .. c:enumerator:: NPY_BYTE
+    .. c:enumerator:: NPY_INT8
 
-    The enumeration value for an 8-bit/1-byte signed integer.
+        The enumeration value for an 8-bit/1-byte signed integer.
 
-.. c:enumerator:: NPY_SHORT
-.. c:enumerator:: NPY_INT16
+    .. c:enumerator:: NPY_SHORT
+    .. c:enumerator:: NPY_INT16
 
-    The enumeration value for a 16-bit/2-byte signed integer.
+        The enumeration value for a 16-bit/2-byte signed integer.
 
-.. c:enumerator:: NPY_INT
-.. c:enumerator:: NPY_INT32
+    .. c:enumerator:: NPY_INT
+    .. c:enumerator:: NPY_INT32
 
-    The enumeration value for a 32-bit/4-byte signed integer.
+        The enumeration value for a 32-bit/4-byte signed integer.
 
-.. c:enumerator:: NPY_LONG
+    .. c:enumerator:: NPY_LONG
 
-    Equivalent to either NPY_INT or NPY_LONGLONG, depending on the
-    platform.
+        Equivalent to either NPY_INT or NPY_LONGLONG, depending on the
+        platform.
 
-.. c:enumerator:: NPY_LONGLONG
-.. c:enumerator:: NPY_INT64
+    .. c:enumerator:: NPY_LONGLONG
+    .. c:enumerator:: NPY_INT64
 
-    The enumeration value for a 64-bit/8-byte signed integer.
+        The enumeration value for a 64-bit/8-byte signed integer.
 
-.. c:enumerator:: NPY_UBYTE
-.. c:enumerator:: NPY_UINT8
+    .. c:enumerator:: NPY_UBYTE
+    .. c:enumerator:: NPY_UINT8
 
-    The enumeration value for an 8-bit/1-byte unsigned integer.
+        The enumeration value for an 8-bit/1-byte unsigned integer.
 
-.. c:enumerator:: NPY_USHORT
-.. c:enumerator:: NPY_UINT16
+    .. c:enumerator:: NPY_USHORT
+    .. c:enumerator:: NPY_UINT16
 
-    The enumeration value for a 16-bit/2-byte unsigned integer.
+        The enumeration value for a 16-bit/2-byte unsigned integer.
 
-.. c:enumerator:: NPY_UINT
-.. c:enumerator:: NPY_UINT32
+    .. c:enumerator:: NPY_UINT
+    .. c:enumerator:: NPY_UINT32
 
-    The enumeration value for a 32-bit/4-byte unsigned integer.
+        The enumeration value for a 32-bit/4-byte unsigned integer.
 
-.. c:enumerator:: NPY_ULONG
+    .. c:enumerator:: NPY_ULONG
 
-    Equivalent to either NPY_UINT or NPY_ULONGLONG, depending on the
-    platform.
+        Equivalent to either NPY_UINT or NPY_ULONGLONG, depending on the
+        platform.
 
-.. c:enumerator:: NPY_ULONGLONG
-.. c:enumerator:: NPY_UINT64
+    .. c:enumerator:: NPY_ULONGLONG
+    .. c:enumerator:: NPY_UINT64
 
-    The enumeration value for a 64-bit/8-byte unsigned integer.
+        The enumeration value for a 64-bit/8-byte unsigned integer.
 
-.. c:enumerator:: NPY_HALF
-.. c:enumerator:: NPY_FLOAT16
+    .. c:enumerator:: NPY_HALF
+    .. c:enumerator:: NPY_FLOAT16
 
-    The enumeration value for a 16-bit/2-byte IEEE 754-2008 compatible floating
-    point type.
+        The enumeration value for a 16-bit/2-byte IEEE 754-2008 compatible floating
+        point type.
 
-.. c:enumerator:: NPY_FLOAT
-.. c:enumerator:: NPY_FLOAT32
+    .. c:enumerator:: NPY_FLOAT
+    .. c:enumerator:: NPY_FLOAT32
 
-    The enumeration value for a 32-bit/4-byte IEEE 754 compatible floating
-    point type.
+        The enumeration value for a 32-bit/4-byte IEEE 754 compatible floating
+        point type.
 
-.. c:enumerator:: NPY_DOUBLE
-.. c:enumerator:: NPY_FLOAT64
+    .. c:enumerator:: NPY_DOUBLE
+    .. c:enumerator:: NPY_FLOAT64
 
-    The enumeration value for a 64-bit/8-byte IEEE 754 compatible floating
-    point type.
+        The enumeration value for a 64-bit/8-byte IEEE 754 compatible floating
+        point type.
 
-.. c:enumerator:: NPY_LONGDOUBLE
+    .. c:enumerator:: NPY_LONGDOUBLE
 
-    The enumeration value for a platform-specific floating point type which is
-    at least as large as NPY_DOUBLE, but larger on many platforms.
+        The enumeration value for a platform-specific floating point type which is
+        at least as large as NPY_DOUBLE, but larger on many platforms.
 
-.. c:enumerator:: NPY_CFLOAT
-.. c:enumerator:: NPY_COMPLEX64
+    .. c:enumerator:: NPY_CFLOAT
+    .. c:enumerator:: NPY_COMPLEX64
 
-    The enumeration value for a 64-bit/8-byte complex type made up of
-    two NPY_FLOAT values.
+        The enumeration value for a 64-bit/8-byte complex type made up of
+        two NPY_FLOAT values.
 
-.. c:enumerator:: NPY_CDOUBLE
-.. c:enumerator:: NPY_COMPLEX128
+    .. c:enumerator:: NPY_CDOUBLE
+    .. c:enumerator:: NPY_COMPLEX128
 
-    The enumeration value for a 128-bit/16-byte complex type made up of
-    two NPY_DOUBLE values.
+        The enumeration value for a 128-bit/16-byte complex type made up of
+        two NPY_DOUBLE values.
 
-.. c:enumerator:: NPY_CLONGDOUBLE
+    .. c:enumerator:: NPY_CLONGDOUBLE
 
-    The enumeration value for a platform-specific complex floating point
-    type which is made up of two NPY_LONGDOUBLE values.
+        The enumeration value for a platform-specific complex floating point
+        type which is made up of two NPY_LONGDOUBLE values.
 
-.. c:enumerator:: NPY_DATETIME
+    .. c:enumerator:: NPY_DATETIME
 
-    The enumeration value for a data type which holds dates or datetimes with
-    a precision based on selectable date or time units.
+        The enumeration value for a data type which holds dates or datetimes with
+        a precision based on selectable date or time units.
 
-.. c:enumerator:: NPY_TIMEDELTA
+    .. c:enumerator:: NPY_TIMEDELTA
 
-    The enumeration value for a data type which holds lengths of times in
-    integers of selectable date or time units.
+        The enumeration value for a data type which holds lengths of times in
+        integers of selectable date or time units.
 
-.. c:enumerator:: NPY_STRING
+    .. c:enumerator:: NPY_STRING
 
-    The enumeration value for ASCII strings of a selectable size. The
-    strings have a fixed maximum size within a given array.
+        The enumeration value for ASCII strings of a selectable size. The
+        strings have a fixed maximum size within a given array.
 
-.. c:enumerator:: NPY_UNICODE
+    .. c:enumerator:: NPY_UNICODE
 
-    The enumeration value for UCS4 strings of a selectable size. The
-    strings have a fixed maximum size within a given array.
+        The enumeration value for UCS4 strings of a selectable size. The
+        strings have a fixed maximum size within a given array.
 
-.. c:enumerator:: NPY_OBJECT
+    .. c:enumerator:: NPY_OBJECT
 
-    The enumeration value for references to arbitrary Python objects.
+        The enumeration value for references to arbitrary Python objects.
 
-.. c:enumerator:: NPY_VOID
+    .. c:enumerator:: NPY_VOID
 
-    Primarily used to hold struct dtypes, but can contain arbitrary
-    binary data.
+        Primarily used to hold struct dtypes, but can contain arbitrary
+        binary data.
 
-Some useful aliases of the above types are
+    Some useful aliases of the above types are
 
-.. c:enumerator:: NPY_INTP
+    .. c:enumerator:: NPY_INTP
 
-    The enumeration value for a signed integer type which is the same
-    size as a (void \*) pointer. This is the type used by all
-    arrays of indices.
+        The enumeration value for a signed integer type which is the same
+        size as a (void \*) pointer. This is the type used by all
+        arrays of indices.
 
-.. c:enumerator:: NPY_UINTP
+    .. c:enumerator:: NPY_UINTP
 
-    The enumeration value for an unsigned integer type which is the
-    same size as a (void \*) pointer.
+        The enumeration value for an unsigned integer type which is the
+        same size as a (void \*) pointer.
 
-.. c:enumerator:: NPY_MASK
+    .. c:enumerator:: NPY_MASK
 
-    The enumeration value of the type used for masks, such as with
-    the :c:data:`NPY_ITER_ARRAYMASK` iterator flag. This is equivalent
-    to :c:data:`NPY_UINT8`.
+        The enumeration value of the type used for masks, such as with
+        the :c:data:`NPY_ITER_ARRAYMASK` iterator flag. This is equivalent
+        to :c:data:`NPY_UINT8`.
 
-.. c:enumerator:: NPY_DEFAULT_TYPE
+    .. c:enumerator:: NPY_DEFAULT_TYPE
 
-    The default type to use when no dtype is explicitly specified, for
-    example when calling np.zero(shape). This is equivalent to
-    :c:data:`NPY_DOUBLE`.
+        The default type to use when no dtype is explicitly specified, for
+        example when calling np.zero(shape). This is equivalent to
+        :c:data:`NPY_DOUBLE`.
 
 Other useful related constants are
 
diff --git a/doc/source/reference/c-api/iterator.rst b/doc/source/reference/c-api/iterator.rst
index 07187e7f1..92e3e435b 100644
--- a/doc/source/reference/c-api/iterator.rst
+++ b/doc/source/reference/c-api/iterator.rst
@@ -26,8 +26,10 @@ which may be of interest for those using this C API. In many instances,
 testing out ideas by creating the iterator in Python is a good idea
 before writing the C iteration code.
 
-Simple Iteration Example
-------------------------
+.. _iteration-example:
+
+Iteration Example
+-----------------
 
 The best way to become familiar with the iterator is to look at its
 usage within the NumPy codebase itself. For example, here is a slightly
@@ -115,10 +117,10 @@ number of non-zero elements in an array.
         return nonzero_count;
     }
 
-Simple Multi-Iteration Example
+Multi-Iteration Example
 ------------------------------
 
-Here is a simple copy function using the iterator.  The ``order`` parameter
+Here is a copy function using the iterator.  The ``order`` parameter
 is used to control the memory layout of the allocated result, typically
 :c:data:`NPY_KEEPORDER` is desired.
 
diff --git a/doc/source/reference/c-api/types-and-structures.rst b/doc/source/reference/c-api/types-and-structures.rst
index 34437bd30..cff1b3d38 100644
--- a/doc/source/reference/c-api/types-and-structures.rst
+++ b/doc/source/reference/c-api/types-and-structures.rst
@@ -122,7 +122,7 @@ PyArray_Type and PyArrayObject
        ``ndarraytypes.h`` points to this data member. :c:data:`NPY_MAXDIMS`
        is the largest number of dimensions for any array.
 
-   .. c:member:: npy_intp dimensions
+   .. c:member:: npy_intp *dimensions
 
        An array of integers providing the shape in each dimension as
        long as nd :math:`\geq` 1. The integer is always large enough
@@ -225,7 +225,7 @@ PyArrayDescr_Type and PyArray_Descr
 
    - Never declare a non-pointer instance of the struct
    - Never perform pointer arithmetic
-   - Never use ``sizof(PyArray_Descr)``
+   - Never use ``sizeof(PyArray_Descr)``
 
    It has the following structure:
 
@@ -285,83 +285,16 @@ PyArrayDescr_Type and PyArray_Descr
        array like behavior. Each bit in this member is a flag which are named
        as:
 
-   .. c:member:: int alignment
-
-       Non-NULL if this type is an array (C-contiguous) of some other type
-
-
-..
-  dedented to allow internal linking, pending a refactoring
-
-.. c:macro:: NPY_ITEM_REFCOUNT
-
-    Indicates that items of this data-type must be reference
-    counted (using :c:func:`Py_INCREF` and :c:func:`Py_DECREF` ).
-
-       .. c:macro:: NPY_ITEM_HASOBJECT
-
-           Same as :c:data:`NPY_ITEM_REFCOUNT`.
-
-..
-  dedented to allow internal linking, pending a refactoring
-
-.. c:macro:: NPY_LIST_PICKLE
-
-    Indicates arrays of this data-type must be converted to a list
-    before pickling.
-
-.. c:macro:: NPY_ITEM_IS_POINTER
-
-    Indicates the item is a pointer to some other data-type
-
-.. c:macro:: NPY_NEEDS_INIT
-
-    Indicates memory for this data-type must be initialized (set
-    to 0) on creation.
-
-.. c:macro:: NPY_NEEDS_PYAPI
-
-    Indicates this data-type requires the Python C-API during
-    access (so don't give up the GIL if array access is going to
-    be needed).
-
-.. c:macro:: NPY_USE_GETITEM
-
-    On array access use the ``f->getitem`` function pointer
-    instead of the standard conversion to an array scalar. Must
-    use if you don't define an array scalar to go along with
-    the data-type.
-
-.. c:macro:: NPY_USE_SETITEM
-
-    When creating a 0-d array from an array scalar use
-    ``f->setitem`` instead of the standard copy from an array
-    scalar. Must use if you don't define an array scalar to go
-    along with the data-type.
-
-       .. c:macro:: NPY_FROM_FIELDS
-
-           The bits that are inherited for the parent data-type if these
-           bits are set in any field of the data-type. Currently (
-           :c:data:`NPY_NEEDS_INIT` \| :c:data:`NPY_LIST_PICKLE` \|
-           :c:data:`NPY_ITEM_REFCOUNT` \| :c:data:`NPY_NEEDS_PYAPI` ).
-
-       .. c:macro:: NPY_OBJECT_DTYPE_FLAGS
-
-           Bits set for the object data-type: ( :c:data:`NPY_LIST_PICKLE`
-           \| :c:data:`NPY_USE_GETITEM` \| :c:data:`NPY_ITEM_IS_POINTER` \|
-           :c:data:`NPY_ITEM_REFCOUNT` \| :c:data:`NPY_NEEDS_INIT` \|
-           :c:data:`NPY_NEEDS_PYAPI`).
-
-       .. c:function:: int PyDataType_FLAGCHK(PyArray_Descr *dtype, int flags)
-
-           Return true if all the given flags are set for the data-type
-           object.
-
-       .. c:function:: int PyDataType_REFCHK(PyArray_Descr *dtype)
-
-           Equivalent to :c:func:`PyDataType_FLAGCHK` (*dtype*,
-           :c:data:`NPY_ITEM_REFCOUNT`).
+       * :c:macro:`NPY_ITEM_REFCOUNT`
+       * :c:macro:`NPY_ITEM_HASOBJECT`
+       * :c:macro:`NPY_LIST_PICKLE`
+       * :c:macro:`NPY_ITEM_IS_POINTER`
+       * :c:macro:`NPY_NEEDS_INIT`
+       * :c:macro:`NPY_NEEDS_PYAPI`
+       * :c:macro:`NPY_USE_GETITEM`
+       * :c:macro:`NPY_USE_SETITEM`
+       * :c:macro:`NPY_FROM_FIELDS`
+       * :c:macro:`NPY_OBJECT_DTYPE_FLAGS`
 
    .. c:member:: int type_num
 
@@ -452,6 +385,73 @@ PyArrayDescr_Type and PyArray_Descr
        Currently unused. Reserved for future use in caching
        hash values.
 
+.. c:macro:: NPY_ITEM_REFCOUNT
+
+    Indicates that items of this data-type must be reference
+    counted (using :c:func:`Py_INCREF` and :c:func:`Py_DECREF` ).
+
+.. c:macro:: NPY_ITEM_HASOBJECT
+
+   Same as :c:data:`NPY_ITEM_REFCOUNT`.
+
+.. c:macro:: NPY_LIST_PICKLE
+
+    Indicates arrays of this data-type must be converted to a list
+    before pickling.
+
+.. c:macro:: NPY_ITEM_IS_POINTER
+
+    Indicates the item is a pointer to some other data-type
+
+.. c:macro:: NPY_NEEDS_INIT
+
+    Indicates memory for this data-type must be initialized (set
+    to 0) on creation.
+
+.. c:macro:: NPY_NEEDS_PYAPI
+
+    Indicates this data-type requires the Python C-API during
+    access (so don't give up the GIL if array access is going to
+    be needed).
+
+.. c:macro:: NPY_USE_GETITEM
+
+    On array access use the ``f->getitem`` function pointer
+    instead of the standard conversion to an array scalar. Must
+    use if you don't define an array scalar to go along with
+    the data-type.
+
+.. c:macro:: NPY_USE_SETITEM
+
+    When creating a 0-d array from an array scalar use
+    ``f->setitem`` instead of the standard copy from an array
+    scalar. Must use if you don't define an array scalar to go
+    along with the data-type.
+
+.. c:macro:: NPY_FROM_FIELDS
+
+   The bits that are inherited for the parent data-type if these
+   bits are set in any field of the data-type. Currently (
+   :c:data:`NPY_NEEDS_INIT` \| :c:data:`NPY_LIST_PICKLE` \|
+   :c:data:`NPY_ITEM_REFCOUNT` \| :c:data:`NPY_NEEDS_PYAPI` ).
+
+.. c:macro:: NPY_OBJECT_DTYPE_FLAGS
+
+   Bits set for the object data-type: ( :c:data:`NPY_LIST_PICKLE`
+   \| :c:data:`NPY_USE_GETITEM` \| :c:data:`NPY_ITEM_IS_POINTER` \|
+   :c:data:`NPY_ITEM_REFCOUNT` \| :c:data:`NPY_NEEDS_INIT` \|
+   :c:data:`NPY_NEEDS_PYAPI`).
+
+.. c:function:: int PyDataType_FLAGCHK(PyArray_Descr *dtype, int flags)
+
+   Return true if all the given flags are set for the data-type
+   object.
+
+.. c:function:: int PyDataType_REFCHK(PyArray_Descr *dtype)
+
+   Equivalent to :c:func:`PyDataType_FLAGCHK` (*dtype*,
+   :c:data:`NPY_ITEM_REFCOUNT`).
+
 .. c:type:: PyArray_ArrFuncs
 
     Functions implementing internal features. Not all of these
@@ -973,7 +973,7 @@ PyUFunc_Type and PyUFuncObject
             Some fallback support for this slot exists, but will be removed
             eventually.  A universal function that relied on this will
             have to be ported eventually.
-            See ref:`NEP 41 <NEP41>` and ref:`NEP 43 <NEP43>`
+            See :ref:`NEP 41 <NEP41>` and :ref:`NEP 43 <NEP43>`
 
    .. c:member:: void *reserved2
 
@@ -997,10 +997,14 @@ PyUFunc_Type and PyUFuncObject
 
    .. c:member:: npy_uint32 *core_dim_flags
 
-       For each distinct core dimension, a set of ``UFUNC_CORE_DIM*`` flags
+       For each distinct core dimension, a set of flags (
+       :c:macro:`UFUNC_CORE_DIM_CAN_IGNORE` and
+       :c:macro:`UFUNC_CORE_DIM_SIZE_INFERRED`)
+
+   .. c:member:: PyObject *identity_value
 
-..
-  dedented to allow internal linking, pending a refactoring
+       Identity for reduction, when :c:member:`PyUFuncObject.identity`
+       is equal to :c:data:`PyUFunc_IdentityValue`.
 
 .. c:macro:: UFUNC_CORE_DIM_CAN_IGNORE
 
@@ -1011,11 +1015,6 @@ PyUFunc_Type and PyUFuncObject
     if the dim size will be determined from the operands
     and not from a :ref:`frozen <frozen>` signature
 
-   .. c:member:: PyObject *identity_value
-
-       Identity for reduction, when :c:member:`PyUFuncObject.identity`
-       is equal to :c:data:`PyUFunc_IdentityValue`.
-
 PyArrayIter_Type and PyArrayIterObject
 --------------------------------------
 
@@ -1253,7 +1252,7 @@ ScalarArrayTypes
 ----------------
 
 There is a Python type for each of the different built-in data types
-that can be present in the array Most of these are simple wrappers
+that can be present in the array. Most of these are simple wrappers
 around the corresponding data type in C. The C-names for these types
 are ``Py{TYPE}ArrType_Type`` where ``{TYPE}`` can be
 
@@ -1457,22 +1456,6 @@ memory management. These types are not accessible directly from
 Python, and are not exposed to the C-API. They are included here only
 for completeness and assistance in understanding the code.
 
-
-.. c:type:: PyUFuncLoopObject
-
-   A loose wrapper for a C-structure that contains the information
-   needed for looping. This is useful if you are trying to understand
-   the ufunc looping code. The :c:type:`PyUFuncLoopObject` is the associated
-   C-structure. It is defined in the ``ufuncobject.h`` header.
-
-.. c:type:: PyUFuncReduceObject
-
-   A loose wrapper for the C-structure that contains the information
-   needed for reduce-like methods of ufuncs. This is useful if you are
-   trying to understand the reduce, accumulate, and reduce-at
-   code. The :c:type:`PyUFuncReduceObject` is the associated C-structure. It
-   is defined in the ``ufuncobject.h`` header.
-
 .. c:type:: PyUFunc_Loop1d
 
    A simple linked-list of C-structures containing the information needed
@@ -1483,8 +1466,12 @@ for completeness and assistance in understanding the code.
 
    Advanced indexing is handled with this Python type. It is simply a
    loose wrapper around the C-structure containing the variables
-   needed for advanced array indexing. The associated C-structure,
-   ``PyArrayMapIterObject``, is useful if you are trying to
+   needed for advanced array indexing.
+
+.. c:type:: PyArrayMapIterObject
+
+   The C-structure associated with :c:var:`PyArrayMapIter_Type`.
+   This structure is useful if you are trying to
    understand the advanced-index mapping code. It is defined in the
    ``arrayobject.h`` header. This type is not exposed to Python and
    could be replaced with a C-structure. As a Python type it takes
diff --git a/doc/source/reference/distutils_guide.rst b/doc/source/reference/distutils_guide.rst
index a72a9c4d6..6f927c231 100644
--- a/doc/source/reference/distutils_guide.rst
+++ b/doc/source/reference/distutils_guide.rst
@@ -1,6 +1,6 @@
 .. _distutils-user-guide:
 
-NumPy Distutils - Users Guide
+NumPy distutils - users guide
 =============================
 
 .. warning::
diff --git a/doc/source/reference/distutils_status_migration.rst b/doc/source/reference/distutils_status_migration.rst
index eda4790b5..67695978c 100644
--- a/doc/source/reference/distutils_status_migration.rst
+++ b/doc/source/reference/distutils_status_migration.rst
@@ -83,7 +83,7 @@ present in ``setuptools``:
 - Support for a few other scientific libraries, like FFTW and UMFPACK
 - Better MinGW support
 - Per-compiler build flag customization (e.g. `-O3` and `SSE2` flags are default)
-- a simple user build config system, see [site.cfg.example](https://github.com/numpy/numpy/blob/master/site.cfg.example)
+- a simple user build config system, see `site.cfg.example <https://github.com/numpy/numpy/blob/master/site.cfg.example>`__
 - SIMD intrinsics support
 
 The most widely used feature is nested ``setup.py`` files. This feature will
diff --git a/doc/source/reference/figures/dtype-hierarchy.dia b/doc/source/reference/figures/dtype-hierarchy.dia
index 62e925cfd..d5c01bf27 100644
--- a/doc/source/reference/figures/dtype-hierarchy.dia
+++ b/doc/source/reference/figures/dtype-hierarchy.dia
diff --git a/doc/source/reference/figures/dtype-hierarchy.pdf b/doc/source/reference/figures/dtype-hierarchy.pdf
index 6ce496a3e..b2375e152 100644
--- a/doc/source/reference/figures/dtype-hierarchy.pdf
+++ b/doc/source/reference/figures/dtype-hierarchy.pdf
diff --git a/doc/source/reference/figures/dtype-hierarchy.png b/doc/source/reference/figures/dtype-hierarchy.png
index 6c45758b1..891de096c 100644
--- a/doc/source/reference/figures/dtype-hierarchy.png
+++ b/doc/source/reference/figures/dtype-hierarchy.png
diff --git a/doc/source/reference/global_state.rst b/doc/source/reference/global_state.rst
index 81685ec7d..a898450af 100644
--- a/doc/source/reference/global_state.rst
+++ b/doc/source/reference/global_state.rst
@@ -1,7 +1,7 @@
 .. _global_state:
 
 ************
-Global State
+Global state
 ************
 
 NumPy has a few import-time, compile-time, or runtime options
@@ -11,10 +11,10 @@ purposes and will not be interesting to the vast majority
 of users.
 
 
-Performance-Related Options
+Performance-related options
 ===========================
 
-Number of Threads used for Linear Algebra
+Number of threads used for Linear Algebra
 -----------------------------------------
 
 NumPy itself is normally intentionally limited to a single thread
@@ -49,25 +49,17 @@ is to use madvise on Kernels 4.6 and newer. These kernels presumably
 experience a large speedup with hugepage support.
 This flag is checked at import time.
 
+SIMD feature selection
+----------------------
 
-Interoperability-Related Options
-================================
+Setting ``NPY_DISABLE_CPU_FEATURES`` will exclude simd features at runtime.
+See :ref:`runtime-simd-dispatch`.
 
-The array function protocol which allows array-like objects to
-hook into the NumPy API is currently enabled by default.
-This option exists since NumPy 1.16 and is enabled by default since
-NumPy 1.17. It can be disabled using::
 
-    NUMPY_EXPERIMENTAL_ARRAY_FUNCTION=0
-
-See also :py:meth:`numpy.class.__array_function__` for more information.
-This flag is checked at import time.
-
-
-Debugging-Related Options
+Debugging-related options
 =========================
 
-Relaxed Strides Checking
+Relaxed strides checking
 ------------------------
 
 The *compile-time* environment variable::
@@ -90,3 +82,17 @@ memory allocation policy, the default will be to call ``free``. If
 ``NUMPY_WARN_IF_NO_MEM_POLICY`` is set to ``"1"``, a ``RuntimeWarning`` will
 be emitted. A better alternative is to use a ``PyCapsule`` with a deallocator
 and set the ``ndarray.base``.
+
+
+Testing planned future behavior
+===============================
+
+NumPy has some code paths which are planned to be activated in the future
+but are not yet the default behavior.
+You can try testing some of these which may be shipped with a new "major"
+release (NumPy 2.0) by setting an environment before importing NumPy:
+
+    NPY_NUMPY_2_BEHAVIOR=1
+
+By default this will also activate the :ref:`NEP 50 <NEP50>` related setting
+``NPY_PROMOTION_STATE`` (please see the NEP for details on this).
diff --git a/doc/source/reference/index.rst b/doc/source/reference/index.rst
index 8e4a81436..4756af5fa 100644
--- a/doc/source/reference/index.rst
+++ b/doc/source/reference/index.rst
@@ -3,7 +3,7 @@
 .. _reference:
 
 ###############
-NumPy Reference
+NumPy reference
 ###############
 
 :Release: |version|
diff --git a/doc/source/reference/internals.code-explanations.rst b/doc/source/reference/internals.code-explanations.rst
index d34314610..db3b3b452 100644
--- a/doc/source/reference/internals.code-explanations.rst
+++ b/doc/source/reference/internals.code-explanations.rst
@@ -1,7 +1,7 @@
 :orphan:
 
 *************************
-NumPy C Code Explanations
+NumPy C code explanations
 *************************
 
 .. This document has been moved to ../dev/internals.code-explanations.rst.
diff --git a/doc/source/reference/maskedarray.baseclass.rst b/doc/source/reference/maskedarray.baseclass.rst
index fcd310faa..7121914b9 100644
--- a/doc/source/reference/maskedarray.baseclass.rst
+++ b/doc/source/reference/maskedarray.baseclass.rst
@@ -72,19 +72,19 @@ Attributes and properties of masked arrays
 
 .. seealso:: :ref:`Array Attributes <arrays.ndarray.attributes>`
 
-.. autoattribute:: MaskedArray.data
+.. autoattribute:: numpy::ma.MaskedArray.data
 
-.. autoattribute:: MaskedArray.mask
+.. autoattribute:: numpy::ma.MaskedArray.mask
 
-.. autoattribute:: MaskedArray.recordmask
+.. autoattribute:: numpy::ma.MaskedArray.recordmask
 
-.. autoattribute:: MaskedArray.fill_value
+.. autoattribute:: numpy::ma.MaskedArray.fill_value
 
-.. autoattribute:: MaskedArray.baseclass
+.. autoattribute:: numpy::ma.MaskedArray.baseclass
 
-.. autoattribute:: MaskedArray.sharedmask
+.. autoattribute:: numpy::ma.MaskedArray.sharedmask
 
-.. autoattribute:: MaskedArray.hardmask
+.. autoattribute:: numpy::ma.MaskedArray.hardmask
 
 As :class:`MaskedArray` is a subclass of :class:`~numpy.ndarray`, a masked array also inherits all the attributes and properties of a  :class:`~numpy.ndarray` instance.
 
diff --git a/doc/source/reference/random/bit_generators/index.rst b/doc/source/reference/random/bit_generators/index.rst
index d93f38d0b..e523b4339 100644
--- a/doc/source/reference/random/bit_generators/index.rst
+++ b/doc/source/reference/random/bit_generators/index.rst
@@ -1,5 +1,7 @@
 .. currentmodule:: numpy.random
 
+.. _random-bit-generators:
+
 Bit Generators
 ==============
 
@@ -50,6 +52,8 @@ The included BitGenerators are:
     Philox <philox>
     SFC64 <sfc64>
 
+.. _seeding_and_entropy:
+
 Seeding and Entropy
 ===================
 
@@ -127,6 +131,16 @@ of 12 instances:
 
 .. end_block
 
+If you already have an initial random generator instance, you can shorten
+the above by using the `~BitGenerator.spawn` method:
+
+.. code-block:: python
+
+    from numpy.random import PCG64, SeedSequence
+    # High quality initial entropy
+    entropy = 0x87351080e25cb0fad77a44a3be03b491
+    base_bitgen = PCG64(entropy)
+    generators = base_bitgen.spawn(12)
 
 An alternative way is to use the fact that a `~SeedSequence` can be initialized
 by a tuple of elements. Here we use a base entropy value and an integer
diff --git a/doc/source/reference/random/compatibility.rst b/doc/source/reference/random/compatibility.rst
new file mode 100644
index 000000000..138f03ca3
--- /dev/null
+++ b/doc/source/reference/random/compatibility.rst
@@ -0,0 +1,87 @@
+.. _random-compatibility:
+
+.. currentmodule:: numpy.random
+
+Compatibility Policy
+====================
+
+`numpy.random` has a somewhat stricter compatibility policy than the rest of
+NumPy. Users of pseudorandomness often have use cases for being able to
+reproduce runs in fine detail given the same seed (so-called "stream
+compatibility"), and so we try to balance those needs with the flexibility to
+enhance our algorithms. :ref:`NEP 19 <NEP19>` describes the evolution of this
+policy.
+
+The main kind of compatibility that we enforce is stream-compatibility from run
+to run under certain conditions. If you create a `Generator` with the same
+`BitGenerator`, with the same seed, perform the same sequence of method calls
+with the same arguments, on the same build of ``numpy``, in the same
+environment, on the same machine, you should get the same stream of numbers.
+Note that these conditions are very strict. There are a number of factors
+outside of NumPy's control that limit our ability to guarantee much more than
+this. For example, different CPUs implement floating point arithmetic
+differently, and this can cause differences in certain edge cases that cascade
+to the rest of the stream. `Generator.multivariate_normal`, for another
+example, uses a matrix decomposition from ``numpy.linalg``. Even on the same
+platform, a different build of ``numpy`` may use a different version of this
+matrix decomposition algorithm from the LAPACK that it links to, causing
+`Generator.multivariate_normal` to return completely different (but equally
+valid!) results. We strive to prefer algorithms that are more resistant to
+these effects, but this is always imperfect.
+
+.. note::
+
+   Most of the `Generator` methods allow you to draw multiple values from
+   a distribution as arrays. The requested size of this array is a parameter,
+   for the purposes of the above policy. Calling ``rng.random()`` 5 times is
+   not *guaranteed* to give the same numbers as ``rng.random(5)``. We reserve
+   the ability to decide to use different algorithms for different-sized
+   blocks. In practice, this happens rarely.
+
+Like the rest of NumPy, we generally maintain API source
+compatibility from version to version. If we *must* make an API-breaking
+change, then we will only do so with an appropriate deprecation period and
+warnings, according to :ref:`general NumPy policy <NEP23>`.
+
+Breaking stream-compatibility in order to introduce new features or
+improve performance in `Generator` or `default_rng` will be *allowed* with
+*caution*. Such changes will be considered features, and as such will be no
+faster than the standard release cadence of features (i.e. on ``X.Y`` releases,
+never ``X.Y.Z``).  Slowness will not be considered a bug for this purpose.
+Correctness bug fixes that break stream-compatibility can happen on bugfix
+releases, per usual, but developers should consider if they can wait until the
+next feature release. We encourage developers to strongly weight user’s pain
+from the break in stream-compatibility against the improvements. One example
+of a worthwhile improvement would be to change algorithms for a significant
+increase in performance, for example, moving from the `Box-Muller transform
+<https://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform>`_ method of
+Gaussian variate generation to the faster `Ziggurat algorithm
+<https://en.wikipedia.org/wiki/Ziggurat_algorithm>`_. An example of
+a discouraged improvement would be tweaking the Ziggurat tables just a little
+bit for a small performance improvement.
+
+.. note::
+
+    In particular, `default_rng` is allowed to change the default
+    `BitGenerator` that it uses (again, with *caution* and plenty of advance
+    warning).
+
+In general, `BitGenerator` classes have stronger guarantees of
+version-to-version stream compatibility. This allows them to be a firmer
+building block for downstream users that need it. Their limited API surface
+makes it easier for them to maintain this compatibility from version to version. See
+the docstrings of each `BitGenerator` class for their individual compatibility
+guarantees.
+
+The legacy `RandomState` and the :ref:`associated convenience functions
+<functions-in-numpy-random>` have a stricter version-to-version
+compatibility guarantee. For reasons outlined in :ref:`NEP 19 <NEP19>`, we had made
+stronger promises about their version-to-version stability early in NumPy's
+development. There are still some limited use cases for this kind of
+compatibility (like generating data for tests), so we maintain as much
+compatibility as we can. There will be no more modifications to `RandomState`,
+not even to fix correctness bugs. There are a few gray areas where we can make
+minor fixes to keep `RandomState` working without segfaulting as NumPy's
+internals change, and some docstring fixes. However, the previously-mentioned
+caveats about the variability from machine to machine and build to build still
+apply to `RandomState` just as much as it does to `Generator`.
diff --git a/doc/source/reference/random/extending.rst b/doc/source/reference/random/extending.rst
index 6bb941496..998faf80a 100644
--- a/doc/source/reference/random/extending.rst
+++ b/doc/source/reference/random/extending.rst
@@ -25,7 +25,7 @@ provided by ``ctypes.next_double``.
 Both CTypes and CFFI allow the more complicated distributions to be used
 directly in Numba after compiling the file distributions.c into a ``DLL`` or
 ``so``.  An example showing the use of a more complicated distribution is in
-the `examples` section below.
+the `Examples`_ section below.
 
 .. _random_cython:
 
@@ -113,6 +113,6 @@ Examples
 
 .. toctree::
     Numba <examples/numba>
-    CFFI + Numba <examples/numba_cffi> 
+    CFFI + Numba <examples/numba_cffi>
     Cython <examples/cython/index>
     CFFI <examples/cffi>
diff --git a/doc/source/reference/random/generator.rst b/doc/source/reference/random/generator.rst
index dc71cb1f9..e08395b17 100644
--- a/doc/source/reference/random/generator.rst
+++ b/doc/source/reference/random/generator.rst
@@ -18,12 +18,13 @@ can be changed by passing an instantized BitGenerator to ``Generator``.
     :members: __init__
     :exclude-members: __init__
 
-Accessing the BitGenerator
---------------------------
+Accessing the BitGenerator and Spawning
+---------------------------------------
 .. autosummary::
    :toctree: generated/
 
    ~numpy.random.Generator.bit_generator
+   ~numpy.random.Generator.spawn
 
 Simple random data
 ------------------
diff --git a/doc/source/reference/random/index.rst b/doc/source/reference/random/index.rst
index 83a27d80c..38555b133 100644
--- a/doc/source/reference/random/index.rst
+++ b/doc/source/reference/random/index.rst
@@ -7,210 +7,124 @@
 Random sampling (:mod:`numpy.random`)
 =====================================
 
-Numpy's random number routines produce pseudo random numbers using
-combinations of a `BitGenerator` to create sequences and a `Generator`
-to use those sequences to sample from different statistical distributions:
-
-* BitGenerators: Objects that generate random numbers. These are typically
-  unsigned integer words filled with sequences of either 32 or 64 random bits.
-* Generators: Objects that transform sequences of random bits from a
-  BitGenerator into sequences of numbers that follow a specific probability
-  distribution (such as uniform, Normal or Binomial) within a specified
-  interval.
-
-Since Numpy version 1.17.0 the Generator can be initialized with a
-number of different BitGenerators. It exposes many different probability
-distributions. See `NEP 19 <https://www.numpy.org/neps/
-nep-0019-rng-policy.html>`_ for context on the updated random Numpy number
-routines. The legacy `RandomState` random number routines are still
-available, but limited to a single BitGenerator. See :ref:`new-or-different` 
-for a complete list of improvements and differences from the legacy
-``RandomState``.
-
-For convenience and backward compatibility, a single `RandomState`
-instance's methods are imported into the numpy.random namespace, see
-:ref:`legacy` for the complete list.
-
 .. _random-quick-start:
 
 Quick Start
 -----------
 
-Call `default_rng` to get a new instance of a `Generator`, then call its
-methods to obtain samples from different distributions.  By default,
-`Generator` uses bits provided by `PCG64` which has better statistical
-properties than the legacy `MT19937` used in `RandomState`.
-
-.. code-block:: python
-
-  # Do this (new version)
-  from numpy.random import default_rng
-  rng = default_rng()
-  vals = rng.standard_normal(10)
-  more_vals = rng.standard_normal(10)
-
-  # instead of this (legacy version)
-  from numpy import random
-  vals = random.standard_normal(10)
-  more_vals = random.standard_normal(10)
-
-`Generator` can be used as a replacement for `RandomState`. Both class
-instances hold an internal `BitGenerator` instance to provide the bit
-stream, it is accessible as ``gen.bit_generator``. Some long-overdue API
-cleanup means that legacy and compatibility methods have been removed from
-`Generator`
-
-=================== ============== ============
-`RandomState`       `Generator`    Notes
-------------------- -------------- ------------
-``random_sample``,  ``random``     Compatible with `random.random`
-``rand``
-------------------- -------------- ------------
-``randint``,        ``integers``   Add an ``endpoint`` kwarg
-``random_integers``
-------------------- -------------- ------------
-``tomaxint``        removed        Use ``integers(0, np.iinfo(np.int_).max,``
-                                   ``endpoint=False)``
-------------------- -------------- ------------
-``seed``            removed        Use `SeedSequence.spawn`
-=================== ============== ============
-
-See :ref:`new-or-different` for more information.
-
-Something like the following code can be used to support both ``RandomState``
-and ``Generator``, with the understanding that the interfaces are slightly
-different
-
-.. code-block:: python
-
-    try:
-        rng_integers = rng.integers
-    except AttributeError:
-        rng_integers = rng.randint
-    a = rng_integers(1000)
-
-Seeds can be passed to any of the BitGenerators. The provided value is mixed
-via `SeedSequence` to spread a possible sequence of seeds across a wider
-range of initialization states for the BitGenerator. Here `PCG64` is used and
-is wrapped with a `Generator`.
-
-.. code-block:: python
-
-  from numpy.random import Generator, PCG64
-  rng = Generator(PCG64(12345))
-  rng.standard_normal()
-  
-Here we use `default_rng` to create an instance of `Generator` to generate a 
-random float:
- 
->>> import numpy as np
->>> rng = np.random.default_rng(12345)
->>> print(rng)
-Generator(PCG64)
->>> rfloat = rng.random()
->>> rfloat
-0.22733602246716966
->>> type(rfloat)
-<class 'float'>
- 
-Here we use `default_rng` to create an instance of `Generator` to generate 3 
-random integers between 0 (inclusive) and 10 (exclusive):
-    
->>> import numpy as np
->>> rng = np.random.default_rng(12345)
->>> rints = rng.integers(low=0, high=10, size=3)
->>> rints
-array([6, 2, 7])
->>> type(rints[0])
-<class 'numpy.int64'> 
-
-Introduction
-------------
-The new infrastructure takes a different approach to producing random numbers
-from the `RandomState` object.  Random number generation is separated into
-two components, a bit generator and a random generator.
-
-The `BitGenerator` has a limited set of responsibilities. It manages state
-and provides functions to produce random doubles and random unsigned 32- and
-64-bit values.
-
-The `random generator <Generator>` takes the
-bit generator-provided stream and transforms them into more useful
-distributions, e.g., simulated normal random values. This structure allows
-alternative bit generators to be used with little code duplication.
-
-The `Generator` is the user-facing object that is nearly identical to the
-legacy `RandomState`. It accepts a bit generator instance as an argument.
-The default is currently `PCG64` but this may change in future versions. 
-As a convenience NumPy  provides the `default_rng` function to hide these 
-details:
-  
->>> from numpy.random import default_rng
->>> rng = default_rng(12345)
->>> print(rng)
-Generator(PCG64)
->>> print(rng.random())
-0.22733602246716966
-  
-One can also instantiate `Generator` directly with a `BitGenerator` instance.
-
-To use the default `PCG64` bit generator, one can instantiate it directly and 
-pass it to `Generator`:
-
->>> from numpy.random import Generator, PCG64
->>> rng = Generator(PCG64(12345))
->>> print(rng)
-Generator(PCG64)
-
-Similarly to use the older `MT19937` bit generator (not recommended), one can
-instantiate it directly and pass it to `Generator`:
-
->>> from numpy.random import Generator, MT19937
->>> rng = Generator(MT19937(12345))
->>> print(rng)
-Generator(MT19937)
-
-What's New or Different
-~~~~~~~~~~~~~~~~~~~~~~~
+The :mod:`numpy.random` module implements pseudo-random number generators
+(PRNGs or RNGs, for short) with the ability to draw samples from a variety of
+probability distributions. In general, users will create a `Generator` instance
+with `default_rng` and call the various methods on it to obtain samples from
+different distributions.
+
+::
+
+  >>> import numpy as np
+  >>> rng = np.random.default_rng()
+  # Generate one random float uniformly distributed over the range [0, 1)
+  >>> rng.random()  #doctest: +SKIP
+  0.06369197489564249  # may vary
+  # Generate an array of 10 numbers according to a unit Gaussian distribution.
+  >>> rng.standard_normal(10)  #doctest: +SKIP
+  array([-0.31018314, -1.8922078 , -0.3628523 , -0.63526532,  0.43181166,  # may vary
+          0.51640373,  1.25693945,  0.07779185,  0.84090247, -2.13406828])
+  # Generate an array of 5 integers uniformly over the range [0, 10).
+  >>> rng.integers(low=0, high=10, size=5)  #doctest: +SKIP
+  array([8, 7, 6, 2, 0])  # may vary
+
+Our RNGs are deterministic sequences and can be reproduced by specifying a seed integer to
+derive its initial state. By default, with no seed provided, `default_rng` will create
+seed the RNG from nondeterministic data from the operating system and therefore
+generate different numbers each time. The pseudo-random sequences will be
+independent for all practical purposes, at least those purposes for which our
+pseudo-randomness was good for in the first place.
+
+::
+
+    >>> rng1 = np.random.default_rng()
+    >>> rng1.random()  #doctest: +SKIP
+    0.6596288841243357  # may vary
+    >>> rng2 = np.random.default_rng()
+    >>> rng2.random()  #doctest: +SKIP
+    0.11885628817151628  # may vary
+
 .. warning::
 
-  The Box-Muller method used to produce NumPy's normals is no longer available
-  in `Generator`.  It is not possible to reproduce the exact random
-  values using Generator for the normal distribution or any other
-  distribution that relies on the normal such as the `RandomState.gamma` or
-  `RandomState.standard_t`. If you require bitwise backward compatible
-  streams, use `RandomState`.
-
-* The Generator's normal, exponential and gamma functions use 256-step Ziggurat
-  methods which are 2-10 times faster than NumPy's Box-Muller or inverse CDF
-  implementations.
-* Optional ``dtype`` argument that accepts ``np.float32`` or ``np.float64``
-  to produce either single or double precision uniform random variables for
-  select distributions
-* Optional ``out`` argument that allows existing arrays to be filled for
-  select distributions
-* All BitGenerators can produce doubles, uint64s and uint32s via CTypes
-  (`PCG64.ctypes`) and CFFI (`PCG64.cffi`). This allows the bit generators
-  to be used in numba.
-* The bit generators can be used in downstream projects via
-  :ref:`Cython <random_cython>`.
-* `Generator.integers` is now the canonical way to generate integer
-  random numbers from a discrete uniform distribution. The ``rand`` and
-  ``randn`` methods are only available through the legacy `RandomState`.
-  The ``endpoint`` keyword can be used to specify open or closed intervals.
-  This replaces both ``randint`` and the deprecated ``random_integers``.
-* `Generator.random` is now the canonical way to generate floating-point
-  random numbers, which replaces `RandomState.random_sample`,
-  `RandomState.sample`, and `RandomState.ranf`. This is consistent with
-  Python's `random.random`.
-* All BitGenerators in numpy use `SeedSequence` to convert seeds into
-  initialized states.
-* The addition of an ``axis`` keyword argument to methods such as 
-  `Generator.choice`, `Generator.permutation`,  and `Generator.shuffle` 
-  improves support for sampling from and shuffling multi-dimensional arrays.
-
-See :ref:`new-or-different` for a complete list of improvements and
-differences from the traditional ``Randomstate``.
+  The pseudo-random number generators implemented in this module are designed
+  for statistical modeling and simulation. They are not suitable for security
+  or cryptographic purposes. See the :py:mod:`secrets` module from the
+  standard library for such use cases.
+
+Seeds should be large positive integers. `default_rng` can take positive
+integers of any size. We recommend using very large, unique numbers to ensure
+that your seed is different from anyone else's. This is good practice to ensure
+that your results are statistically independent from theirs unless you are
+intentionally *trying* to reproduce their result. A convenient way to get
+such a seed number is to use :py:func:`secrets.randbits` to get an
+arbitrary 128-bit integer.
+
+::
+
+    >>> import secrets
+    >>> import numpy as np
+    >>> secrets.randbits(128)  #doctest: +SKIP
+    122807528840384100672342137672332424406  # may vary
+    >>> rng1 = np.random.default_rng(122807528840384100672342137672332424406)
+    >>> rng1.random()
+    0.5363922081269535
+    >>> rng2 = np.random.default_rng(122807528840384100672342137672332424406)
+    >>> rng2.random()
+    0.5363922081269535
+
+See the documentation on `default_rng` and `SeedSequence` for more advanced
+options for controlling the seed in specialized scenarios.
+
+`Generator` and its associated infrastructure was introduced in NumPy version
+1.17.0. There is still a lot of code that uses the older `RandomState` and the
+functions in `numpy.random`. While there are no plans to remove them at this
+time, we do recommend transitioning to `Generator` as you can. The algorithms
+are faster, more flexible, and will receive more improvements in the future.
+For the most part, `Generator` can be used as a replacement for `RandomState`.
+See :ref:`legacy` for information on the legacy infrastructure,
+:ref:`new-or-different` for information on transitioning, and :ref:`NEP 19
+<NEP19>` for some of the reasoning for the transition.
+
+Design
+------
+
+Users primarily interact with `Generator` instances. Each `Generator` instance
+owns a `BitGenerator` instance that implements the core RNG algorithm. The
+`BitGenerator` has a limited set of responsibilities. It manages state and
+provides functions to produce random doubles and random unsigned 32- and 64-bit
+values.
+
+The `Generator` takes the bit generator-provided stream and transforms them
+into more useful distributions, e.g., simulated normal random values. This
+structure allows alternative bit generators to be used with little code
+duplication.
+
+NumPy implements several different `BitGenerator` classes implementing
+different RNG algorithms. `default_rng` currently uses `~PCG64` as the
+default `BitGenerator`. It has better statistical properties and performance
+than the `~MT19937` algorithm used in the legacy `RandomState`. See
+:ref:`random-bit-generators` for more details on the supported BitGenerators.
+
+`default_rng` and BitGenerators delegate the conversion of seeds into RNG
+states to `SeedSequence` internally. `SeedSequence` implements a sophisticated
+algorithm that intermediates between the user's input and the internal
+implementation details of each `BitGenerator` algorithm, each of which can
+require different amounts of bits for its state. Importantly, it lets you use
+arbitrary-sized integers and arbitrary sequences of such integers to mix
+together into the RNG state. This is a useful primitive for constructing
+a :ref:`flexible pattern for parallel RNG streams <seedsequence-spawn>`.
+
+For backward compatibility, we still maintain the legacy `RandomState` class.
+It continues to use the `~MT19937` algorithm by default, and old seeds continue
+to reproduce the same results. The convenience :ref:`functions-in-numpy-random`
+are still aliases to the methods on a single global `RandomState` instance. See
+:ref:`legacy` for the complete details. See :ref:`new-or-different` for
+a detailed comparison between `Generator` and `RandomState`.
 
 Parallel Generation
 ~~~~~~~~~~~~~~~~~~~
@@ -235,6 +149,7 @@ Concepts
    Legacy Generator (RandomState) <legacy>
    BitGenerators, SeedSequences <bit_generators/index>
    Upgrading PCG64 with PCG64DXSM <upgrading-pcg64>
+   compatibility
 
 Features
 --------
diff --git a/doc/source/reference/random/legacy.rst b/doc/source/reference/random/legacy.rst
index b1fce49a1..00921c477 100644
--- a/doc/source/reference/random/legacy.rst
+++ b/doc/source/reference/random/legacy.rst
@@ -52,7 +52,7 @@ using the state of the `RandomState`:
     :exclude-members: __init__
 
 Seeding and State
------------------
+=================
 
 .. autosummary::
    :toctree: generated/
@@ -62,7 +62,7 @@ Seeding and State
    ~RandomState.seed
 
 Simple random data
-------------------
+==================
 .. autosummary::
    :toctree: generated/
 
@@ -75,7 +75,7 @@ Simple random data
    ~RandomState.bytes
 
 Permutations
-------------
+============
 .. autosummary::
    :toctree: generated/
 
@@ -83,7 +83,7 @@ Permutations
    ~RandomState.permutation
 
 Distributions
--------------
+==============
 .. autosummary::
    :toctree: generated/
 
@@ -123,8 +123,10 @@ Distributions
    ~RandomState.weibull
    ~RandomState.zipf
 
+.. _functions-in-numpy-random:
+
 Functions in `numpy.random`
----------------------------
+===========================
 Many of the RandomState methods above are exported as functions in
 `numpy.random` This usage is discouraged, as it is implemented via a global
 `RandomState` instance which is not advised on two counts:
@@ -133,8 +135,7 @@ Many of the RandomState methods above are exported as functions in
 
 - It uses a `RandomState` rather than the more modern `Generator`.
 
-For backward compatible legacy reasons, we cannot change this. See
-:ref:`random-quick-start`.
+For backward compatible legacy reasons, we will not change this.
 
 .. autosummary::
    :toctree: generated/
diff --git a/doc/source/reference/random/new-or-different.rst b/doc/source/reference/random/new-or-different.rst
index 8f4a70540..9b5bf38e5 100644
--- a/doc/source/reference/random/new-or-different.rst
+++ b/doc/source/reference/random/new-or-different.rst
@@ -5,17 +5,9 @@
 What's New or Different
 -----------------------
 
-.. warning::
-
-  The Box-Muller method used to produce NumPy's normals is no longer available
-  in `Generator`.  It is not possible to reproduce the exact random
-  values using ``Generator`` for the normal distribution or any other
-  distribution that relies on the normal such as the `Generator.gamma` or
-  `Generator.standard_t`. If you require bitwise backward compatible
-  streams, use `RandomState`, i.e., `RandomState.gamma` or
-  `RandomState.standard_t`.
-
-Quick comparison of legacy :ref:`mtrand <legacy>` to the new `Generator`
+NumPy 1.17.0 introduced `Generator` as an improved replacement for
+the :ref:`legacy <legacy>` `RandomState`. Here is a quick comparison of the two
+implementations.
 
 ================== ==================== =============
 Feature            Older Equivalent     Notes
@@ -44,21 +36,17 @@ Feature            Older Equivalent     Notes
                                         ``high`` interval endpoint
 ================== ==================== =============
 
-And in more detail:
-
-* Simulate from the complex normal distribution
-  (`~.Generator.complex_normal`)
 * The normal, exponential and gamma generators use 256-step Ziggurat
   methods which are 2-10 times faster than NumPy's default implementation in
   `~.Generator.standard_normal`, `~.Generator.standard_exponential` or
-  `~.Generator.standard_gamma`.
-
+  `~.Generator.standard_gamma`. Because of the change in algorithms, it is not
+  possible to reproduce the exact random values using ``Generator`` for these
+  distributions or any distribution method that relies on them.
 
 .. ipython:: python
 
-  from  numpy.random import Generator, PCG64
   import numpy.random
-  rng = Generator(PCG64())
+  rng = np.random.default_rng()
   %timeit -n 1 rng.standard_normal(100000)
   %timeit -n 1 numpy.random.standard_normal(100000)
 
@@ -74,18 +62,25 @@ And in more detail:
 
 
 * `~.Generator.integers` is now the canonical way to generate integer
-  random numbers from a discrete uniform distribution. The ``rand`` and
-  ``randn`` methods are only available through the legacy `~.RandomState`.
-  This replaces both ``randint`` and the deprecated ``random_integers``.
-* The Box-Muller method used to produce NumPy's normals is no longer available.
+  random numbers from a discrete uniform distribution. This replaces both
+  ``randint`` and the deprecated ``random_integers``.
+* The ``rand`` and ``randn`` methods are only available through the legacy
+  `~.RandomState`.
+* `Generator.random` is now the canonical way to generate floating-point
+  random numbers, which replaces `RandomState.random_sample`,
+  `sample`, and `ranf`, all of which were aliases. This is consistent with
+  Python's `random.random`.
 * All bit generators can produce doubles, uint64s and
   uint32s via CTypes (`~PCG64.ctypes`) and CFFI (`~PCG64.cffi`).
   This allows these bit generators to be used in numba.
 * The bit generators can be used in downstream projects via
   Cython.
+* All bit generators use `SeedSequence` to :ref:`convert seed integers to
+  initialized states <seeding_and_entropy>`.
 * Optional ``dtype`` argument that accepts ``np.float32`` or ``np.float64``
   to produce either single or double precision uniform random variables for
-  select distributions
+  select distributions. `~.Generator.integers` accepts a ``dtype`` argument
+  with any signed or unsigned integer dtype.
 
   * Uniforms (`~.Generator.random` and `~.Generator.integers`)
   * Normals (`~.Generator.standard_normal`)
@@ -94,9 +89,10 @@ And in more detail:
 
 .. ipython:: python
 
-  rng = Generator(PCG64(0))
-  rng.random(3, dtype='d')
-  rng.random(3, dtype='f')
+  rng = np.random.default_rng()
+  rng.random(3, dtype=np.float64)
+  rng.random(3, dtype=np.float32)
+  rng.integers(0, 256, size=3, dtype=np.uint8)
 
 * Optional ``out`` argument that allows existing arrays to be filled for
   select distributions
@@ -111,6 +107,7 @@ And in more detail:
 
 .. ipython:: python
 
+  rng = np.random.default_rng()
   existing = np.zeros(4)
   rng.random(out=existing[:2])
   print(existing)
@@ -121,9 +118,12 @@ And in more detail:
 
 .. ipython:: python
 
-  rng = Generator(PCG64(123456789))
+  rng = np.random.default_rng()
   a = np.arange(12).reshape((3, 4))
   a
   rng.choice(a, axis=1, size=5)
   rng.shuffle(a, axis=1)        # Shuffle in-place
   a
+
+* Added a method to sample from the complex normal distribution
+  (`~.Generator.complex_normal`)
diff --git a/doc/source/reference/random/parallel.rst b/doc/source/reference/random/parallel.rst
index b625d34b7..b4934a0ca 100644
--- a/doc/source/reference/random/parallel.rst
+++ b/doc/source/reference/random/parallel.rst
@@ -12,6 +12,11 @@ or distributed).
 `~SeedSequence` spawning
 ------------------------
 
+NumPy allows you to spawn new (with very high probability) independent
+`~BitGenerator` and `~Generator` instances via their ``spawn()`` method.
+This spawning is implemented by the `~SeedSequence` used for initializing
+the bit generators random stream.
+
 `~SeedSequence` `implements an algorithm`_ to process a user-provided seed,
 typically as an integer of some size, and to convert it into an initial state for
 a `~BitGenerator`. It uses hashing techniques to ensure that low-quality seeds
@@ -53,15 +58,25 @@ wrap this together into an API that is easy to use and difficult to misuse.
 
 .. end_block
 
-Child `~SeedSequence` objects can also spawn to make grandchildren, and so on.
-Each `~SeedSequence` has its position in the tree of spawned `~SeedSequence`
-objects mixed in with the user-provided seed to generate independent (with very
-high probability) streams.
+For convenience the direct use of `~SeedSequence` is not necessary.
+The above ``streams`` can be spawned directly from a parent generator
+via `~Generator.spawn`:
+
+.. code-block:: python
+
+  parent_rng = default_rng(12345)
+  streams = parent_rng.spawn(10)
+
+.. end_block
+
+Child objects can also spawn to make grandchildren, and so on.
+Each child has a `~SeedSequence` with its position in the tree of spawned
+child objects mixed in with the user-provided seed to generate independent
+(with very high probability) streams.
 
 .. code-block:: python
 
-  grandchildren = child_seeds[0].spawn(4)
-  grand_streams = [default_rng(s) for s in grandchildren]
+  grandchildren = streams[0].spawn(4)
 
 .. end_block
 
diff --git a/doc/source/reference/routines.ctypeslib.rst b/doc/source/reference/routines.ctypeslib.rst
index c6127ca64..fe5ffb5a8 100644
--- a/doc/source/reference/routines.ctypeslib.rst
+++ b/doc/source/reference/routines.ctypeslib.rst
@@ -1,7 +1,7 @@
 .. module:: numpy.ctypeslib
 
 ***********************************************************
-C-Types Foreign Function Interface (:mod:`numpy.ctypeslib`)
+C-Types foreign function interface (:mod:`numpy.ctypeslib`)
 ***********************************************************
 
 .. currentmodule:: numpy.ctypeslib
diff --git a/doc/source/reference/routines.datetime.rst b/doc/source/reference/routines.datetime.rst
index 966ed5a47..72513882b 100644
--- a/doc/source/reference/routines.datetime.rst
+++ b/doc/source/reference/routines.datetime.rst
@@ -1,6 +1,6 @@
 .. _routines.datetime:
 
-Datetime Support Functions
+Datetime support functions
 **************************
 
 .. currentmodule:: numpy
@@ -12,7 +12,7 @@ Datetime Support Functions
    datetime_data
 
 
-Business Day Functions
+Business day functions
 ======================
 
 .. currentmodule:: numpy
diff --git a/doc/source/reference/routines.dtype.rst b/doc/source/reference/routines.dtype.rst
index e9189ca07..6cb036031 100644
--- a/doc/source/reference/routines.dtype.rst
+++ b/doc/source/reference/routines.dtype.rst
@@ -30,7 +30,6 @@ Data type information
 
    finfo
    iinfo
-   MachAr
 
 Data type testing
 -----------------
diff --git a/doc/source/reference/routines.dual.rst b/doc/source/reference/routines.dual.rst
deleted file mode 100644
index 18c7791d0..000000000
--- a/doc/source/reference/routines.dual.rst
+++ /dev/null
@@ -1,47 +0,0 @@
-Optionally SciPy-accelerated routines (:mod:`numpy.dual`)
-=========================================================
-
-.. automodule:: numpy.dual
-
-Linear algebra
---------------
-
-.. currentmodule:: numpy.linalg
-
-.. autosummary::
-
-   cholesky
-   det
-   eig
-   eigh
-   eigvals
-   eigvalsh
-   inv
-   lstsq
-   norm
-   pinv
-   solve
-   svd
-
-FFT
----
-
-.. currentmodule:: numpy.fft
-
-.. autosummary::
-
-   fft
-   fft2
-   fftn
-   ifft
-   ifft2
-   ifftn
-
-Other
------
-
-.. currentmodule:: numpy
-
-.. autosummary::
-
-   i0
diff --git a/doc/source/reference/routines.io.rst b/doc/source/reference/routines.io.rst
index 2542b336f..1ec2ccb5e 100644
--- a/doc/source/reference/routines.io.rst
+++ b/doc/source/reference/routines.io.rst
@@ -83,7 +83,7 @@ Data sources
 
    DataSource
 
-Binary Format Description
+Binary format description
 -------------------------
 .. autosummary::
    :toctree: generated/
diff --git a/doc/source/reference/routines.ma.rst b/doc/source/reference/routines.ma.rst
index d503cc243..fd22a74aa 100644
--- a/doc/source/reference/routines.ma.rst
+++ b/doc/source/reference/routines.ma.rst
@@ -31,6 +31,7 @@ From existing data
    ma.fromfunction
 
    ma.MaskedArray.copy
+   ma.diagflat
 
 
 Ones and zeros
@@ -72,6 +73,9 @@ Inspecting the array
    ma.isMaskedArray
    ma.isMA
    ma.isarray
+   ma.isin
+   ma.in1d
+   ma.unique
 
 
    ma.MaskedArray.all
@@ -394,6 +398,17 @@ Clipping and rounding
    ma.MaskedArray.clip
    ma.MaskedArray.round
 
+Set operations
+~~~~~~~~~~~~~~
+.. autosummary::
+   :toctree: generated/
+
+
+   ma.intersect1d
+   ma.setdiff1d
+   ma.setxor1d
+   ma.union1d
+
 
 Miscellanea
 ~~~~~~~~~~~
diff --git a/doc/source/reference/routines.math.rst b/doc/source/reference/routines.math.rst
index a454841b3..4cc8719fb 100644
--- a/doc/source/reference/routines.math.rst
+++ b/doc/source/reference/routines.math.rst
@@ -39,8 +39,8 @@ Rounding
 .. autosummary::
    :toctree: generated/
 
+   round
    around
-   round_
    rint
    fix
    floor
@@ -149,13 +149,15 @@ Extrema Finding
    :toctree: generated/
 
    maximum
-   fmax
+   max
    amax
+   fmax
    nanmax
    
    minimum
-   fmin
+   min
    amin
+   fmin
    nanmin
    
 
diff --git a/doc/source/reference/routines.other.rst b/doc/source/reference/routines.other.rst
index e980406eb..769b3d910 100644
--- a/doc/source/reference/routines.other.rst
+++ b/doc/source/reference/routines.other.rst
@@ -58,9 +58,6 @@ Matlab-like Functions
    who
    disp
 
-Exceptions
-----------
-.. autosummary::
-   :toctree: generated/
+.. automodule:: numpy.exceptions
 
-   AxisError
+.. automodule:: numpy.dtypes
diff --git a/doc/source/reference/routines.rst b/doc/source/reference/routines.rst
index 593d017cc..72ed80972 100644
--- a/doc/source/reference/routines.rst
+++ b/doc/source/reference/routines.rst
@@ -24,7 +24,6 @@ indentation.
    routines.ctypeslib
    routines.datetime
    routines.dtype
-   routines.dual
    routines.emath
    routines.err
    routines.fft
@@ -44,4 +43,5 @@ indentation.
    routines.sort
    routines.statistics
    routines.testing
+   routines.testing.overrides
    routines.window
diff --git a/doc/source/reference/routines.testing.overrides.rst b/doc/source/reference/routines.testing.overrides.rst
new file mode 100644
index 000000000..262852633
--- /dev/null
+++ b/doc/source/reference/routines.testing.overrides.rst
@@ -0,0 +1,18 @@
+.. module:: numpy.testing.overrides
+
+Support for testing overrides (:mod:`numpy.testing.overrides`)
+==============================================================
+
+.. currentmodule:: numpy.testing.overrides
+
+Support for testing custom array container implementations.
+
+Utility Functions
+-----------------
+.. autosummary::
+   :toctree: generated/
+
+   allows_array_function_override
+   allows_array_ufunc_override
+   get_overridable_numpy_ufuncs
+   get_overridable_numpy_array_functions
diff --git a/doc/source/reference/routines.testing.rst b/doc/source/reference/routines.testing.rst
index 16d53bb4e..82e43dfdb 100644
--- a/doc/source/reference/routines.testing.rst
+++ b/doc/source/reference/routines.testing.rst
@@ -51,11 +51,6 @@ Decorators
 .. autosummary::
    :toctree: generated/
 
-   dec.deprecated
-   dec.knownfailureif
-   dec.setastest
-   dec.skipif
-   dec.slow
    decorate_methods
 
 Test Running
@@ -63,10 +58,8 @@ Test Running
 .. autosummary::
    :toctree: generated/
 
-   Tester
    clear_and_catch_warnings
    measure
-   run_module_suite
    rundocs
    suppress_warnings
 
diff --git a/doc/source/reference/simd/build-options.rst b/doc/source/reference/simd/build-options.rst
index 0994f15aa..d1e2e6b8e 100644
--- a/doc/source/reference/simd/build-options.rst
+++ b/doc/source/reference/simd/build-options.rst
@@ -333,7 +333,7 @@ and here is how it looks on x86_64/gcc:
 .. literalinclude:: log_example.txt
    :language: bash
 
-As you see, there is a separate report for each of ``build_ext`` and ``build_clib``
+There is a separate report for each of ``build_ext`` and ``build_clib``
 that includes several sections, and each section has several values, representing the following:
 
 **Platform**:
@@ -371,6 +371,17 @@ that includes several sections, and each section has several values, representin
   - The lines that come after the above property and end with a ':' on a separate line,
     represent the paths of c/c++ sources that define the generated optimizations.
 
-Runtime Trace
--------------
-To be completed.
+.. _runtime-simd-dispatch:
+
+Runtime dispatch
+----------------
+Importing NumPy triggers a scan of the available CPU features from the set
+of dispatchable features. This can be further restricted by setting the
+environment variable ``NPY_DISABLE_CPU_FEATURES`` to a comma-, tab-, or
+space-separated list of features to disable. This will raise an error if
+parsing fails or if the feature was not enabled. For instance, on ``x86_64``
+this will disable ``AVX2`` and ``FMA3``::
+
+    NPY_DISABLE_CPU_FEATURES="AVX2,FMA3"
+
+If the feature is not available, a warning will be emitted.
diff --git a/doc/source/reference/simd/gen_features.py b/doc/source/reference/simd/gen_features.py
index 9a38ef5c9..b141e23d0 100644
--- a/doc/source/reference/simd/gen_features.py
+++ b/doc/source/reference/simd/gen_features.py
@@ -168,7 +168,7 @@ if __name__ == '__main__':
     gen_path = path.join(
         path.dirname(path.realpath(__file__)), "generated_tables"
     )
-    with open(path.join(gen_path, 'cpu_features.inc'), 'wt') as fd:
+    with open(path.join(gen_path, 'cpu_features.inc'), 'w') as fd:
         fd.write(f'.. generated via {__file__}\n\n')
         for arch in (
             ("x86", "PPC64", "PPC64LE", "ARMHF", "AARCH64", "S390X")
@@ -177,7 +177,7 @@ if __name__ == '__main__':
             table = Features(arch, 'gcc').table()
             fd.write(wrapper_section(title, table))
 
-    with open(path.join(gen_path, 'compilers-diff.inc'), 'wt') as fd:
+    with open(path.join(gen_path, 'compilers-diff.inc'), 'w') as fd:
         fd.write(f'.. generated via {__file__}\n\n')
         for arch, cc_names in (
             ("x86", ("clang", "ICC", "MSVC")),
diff --git a/doc/source/reference/simd/generated_tables/compilers-diff.inc b/doc/source/reference/simd/generated_tables/compilers-diff.inc
index 4b9009a68..d5a87da3c 100644
--- a/doc/source/reference/simd/generated_tables/compilers-diff.inc
+++ b/doc/source/reference/simd/generated_tables/compilers-diff.inc
@@ -1,33 +1,35 @@
-.. generated via /home/seiko/work/repos/numpy/doc/source/reference/simd/./gen_features.py
+.. generated via /numpy/numpy/./doc/source/reference/simd/gen_features.py
 
 On x86::Intel Compiler
 ~~~~~~~~~~~~~~~~~~~~~~
 .. table::
     :align: left
 
-    ================ ==========================================================================================================================================
-    Name             Implies                                                                                                                                   
-    ================ ==========================================================================================================================================
-    FMA3             SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C :enabled:`AVX2`                                                                           
-    AVX2             SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C :enabled:`FMA3`                                                                           
-    AVX512F          SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C FMA3 AVX2 :enabled:`AVX512CD`                                                             
-    :disabled:`XOP`  :disabled:`SSE` :disabled:`SSE2` :disabled:`SSE3` :disabled:`SSSE3` :disabled:`SSE41` :disabled:`POPCNT` :disabled:`SSE42` :disabled:`AVX`
-    :disabled:`FMA4` :disabled:`SSE` :disabled:`SSE2` :disabled:`SSE3` :disabled:`SSSE3` :disabled:`SSE41` :disabled:`POPCNT` :disabled:`SSE42` :disabled:`AVX`
-    ================ ==========================================================================================================================================
+    ====================== ================================================================================================================================================================================================================================================================================================================================== ======================
+    Name                   Implies                                                                                                                                                                                                                                                                                                                            Gathers               
+    ====================== ================================================================================================================================================================================================================================================================================================================================== ======================
+    FMA3                   SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C :enabled:`AVX2`                                                                                                                                                                                                                                                                                          
+    AVX2                   SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C :enabled:`FMA3`                                                                                                                                                                                                                                                                                          
+    AVX512F                SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C FMA3 AVX2 :enabled:`AVX512CD`                                                                                                                                                                                                                                                                            
+    :disabled:`XOP`        :disabled:`SSE` :disabled:`SSE2` :disabled:`SSE3` :disabled:`SSSE3` :disabled:`SSE41` :disabled:`POPCNT` :disabled:`SSE42` :disabled:`AVX`                                                                                                                                                                                                               
+    :disabled:`FMA4`       :disabled:`SSE` :disabled:`SSE2` :disabled:`SSE3` :disabled:`SSSE3` :disabled:`SSE41` :disabled:`POPCNT` :disabled:`SSE42` :disabled:`AVX`                                                                                                                                                                                                               
+    :disabled:`AVX512_SPR` :disabled:`SSE` :disabled:`SSE2` :disabled:`SSE3` :disabled:`SSSE3` :disabled:`SSE41` :disabled:`POPCNT` :disabled:`SSE42` :disabled:`AVX` :disabled:`F16C` :disabled:`FMA3` :disabled:`AVX2` :disabled:`AVX512F` :disabled:`AVX512CD` :disabled:`AVX512_SKX` :disabled:`AVX512_CLX` :disabled:`AVX512_CNL` :disabled:`AVX512_ICL` :disabled:`AVX512FP16`
+    ====================== ================================================================================================================================================================================================================================================================================================================================== ======================
 
 On x86::Microsoft Visual C/C++
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. table::
     :align: left
 
-    ====================== ============================================================================================================================================================================================================================================================= =============================================================================
-    Name                   Implies                                                                                                                                                                                                                                                       Gathers                                                                      
-    ====================== ============================================================================================================================================================================================================================================================= =============================================================================
-    FMA3                   SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C :enabled:`AVX2`                                                                                                                                                                                                                                                                            
-    AVX2                   SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C :enabled:`FMA3`                                                                                                                                                                                                                                                                            
-    AVX512F                SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C FMA3 AVX2 :enabled:`AVX512CD` :enabled:`AVX512_SKX`                                                                                                                                                                                                                                        
-    AVX512CD               SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C FMA3 AVX2 AVX512F :enabled:`AVX512_SKX`                                                                                                                                                                                                                                                    
-    :disabled:`AVX512_KNL` :disabled:`SSE` :disabled:`SSE2` :disabled:`SSE3` :disabled:`SSSE3` :disabled:`SSE41` :disabled:`POPCNT` :disabled:`SSE42` :disabled:`AVX` :disabled:`F16C` :disabled:`FMA3` :disabled:`AVX2` :disabled:`AVX512F` :disabled:`AVX512CD`                        :disabled:`AVX512ER` :disabled:`AVX512PF`                                    
-    :disabled:`AVX512_KNM` :disabled:`SSE` :disabled:`SSE2` :disabled:`SSE3` :disabled:`SSSE3` :disabled:`SSE41` :disabled:`POPCNT` :disabled:`SSE42` :disabled:`AVX` :disabled:`F16C` :disabled:`FMA3` :disabled:`AVX2` :disabled:`AVX512F` :disabled:`AVX512CD` :disabled:`AVX512_KNL` :disabled:`AVX5124FMAPS` :disabled:`AVX5124VNNIW` :disabled:`AVX512VPOPCNTDQ`
-    ====================== ============================================================================================================================================================================================================================================================= =============================================================================
+    ====================== ================================================================================================================================================================================================================================================================================================================================== =============================================================================
+    Name                   Implies                                                                                                                                                                                                                                                                                                                            Gathers                                                                      
+    ====================== ================================================================================================================================================================================================================================================================================================================================== =============================================================================
+    FMA3                   SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C :enabled:`AVX2`                                                                                                                                                                                                                                                                                                                                                 
+    AVX2                   SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C :enabled:`FMA3`                                                                                                                                                                                                                                                                                                                                                 
+    AVX512F                SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C FMA3 AVX2 :enabled:`AVX512CD` :enabled:`AVX512_SKX`                                                                                                                                                                                                                                                                                                             
+    AVX512CD               SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C FMA3 AVX2 AVX512F :enabled:`AVX512_SKX`                                                                                                                                                                                                                                                                                                                         
+    :disabled:`AVX512_KNL` :disabled:`SSE` :disabled:`SSE2` :disabled:`SSE3` :disabled:`SSSE3` :disabled:`SSE41` :disabled:`POPCNT` :disabled:`SSE42` :disabled:`AVX` :disabled:`F16C` :disabled:`FMA3` :disabled:`AVX2` :disabled:`AVX512F` :disabled:`AVX512CD`                                                                                             :disabled:`AVX512ER` :disabled:`AVX512PF`                                    
+    :disabled:`AVX512_KNM` :disabled:`SSE` :disabled:`SSE2` :disabled:`SSE3` :disabled:`SSSE3` :disabled:`SSE41` :disabled:`POPCNT` :disabled:`SSE42` :disabled:`AVX` :disabled:`F16C` :disabled:`FMA3` :disabled:`AVX2` :disabled:`AVX512F` :disabled:`AVX512CD` :disabled:`AVX512_KNL`                                                                      :disabled:`AVX5124FMAPS` :disabled:`AVX5124VNNIW` :disabled:`AVX512VPOPCNTDQ`
+    :disabled:`AVX512_SPR` :disabled:`SSE` :disabled:`SSE2` :disabled:`SSE3` :disabled:`SSSE3` :disabled:`SSE41` :disabled:`POPCNT` :disabled:`SSE42` :disabled:`AVX` :disabled:`F16C` :disabled:`FMA3` :disabled:`AVX2` :disabled:`AVX512F` :disabled:`AVX512CD` :disabled:`AVX512_SKX` :disabled:`AVX512_CLX` :disabled:`AVX512_CNL` :disabled:`AVX512_ICL` :disabled:`AVX512FP16`                                                       
+    ====================== ================================================================================================================================================================================================================================================================================================================================== =============================================================================
 
diff --git a/doc/source/reference/simd/generated_tables/cpu_features.inc b/doc/source/reference/simd/generated_tables/cpu_features.inc
index 7782172d2..603370e21 100644
--- a/doc/source/reference/simd/generated_tables/cpu_features.inc
+++ b/doc/source/reference/simd/generated_tables/cpu_features.inc
@@ -1,35 +1,36 @@
-.. generated via /home/seiko/work/repos/review/numpy/doc/source/reference/simd/gen_features.py
+.. generated via /numpy/numpy/./doc/source/reference/simd/gen_features.py
 
 On x86
 ~~~~~~
 .. table::
     :align: left
 
-    ============== =========================================================================================================================================================================== =====================================================
-    Name           Implies                                                                                                                                                                     Gathers                                              
-    ============== =========================================================================================================================================================================== =====================================================
-    ``SSE``        ``SSE2``                                                                                                                                                                                                                         
-    ``SSE2``       ``SSE``                                                                                                                                                                                                                          
-    ``SSE3``       ``SSE`` ``SSE2``                                                                                                                                                                                                                 
-    ``SSSE3``      ``SSE`` ``SSE2`` ``SSE3``                                                                                                                                                                                                        
-    ``SSE41``      ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3``                                                                                                                                                                                              
-    ``POPCNT``     ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41``                                                                                                                                                                                    
-    ``SSE42``      ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT``                                                                                                                                                                         
-    ``AVX``        ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42``                                                                                                                                                               
-    ``XOP``        ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX``                                                                                                                                                       
-    ``FMA4``       ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX``                                                                                                                                                       
-    ``F16C``       ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX``                                                                                                                                                       
-    ``FMA3``       ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C``                                                                                                                                              
-    ``AVX2``       ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C``                                                                                                                                              
-    ``AVX512F``    ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2``                                                                                                                            
-    ``AVX512CD``   ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F``                                                                                                                
-    ``AVX512_KNL`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD``                                              ``AVX512ER`` ``AVX512PF``                            
-    ``AVX512_KNM`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_KNL``                               ``AVX5124FMAPS`` ``AVX5124VNNIW`` ``AVX512VPOPCNTDQ``
-    ``AVX512_SKX`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD``                                              ``AVX512VL`` ``AVX512BW`` ``AVX512DQ``               
-    ``AVX512_CLX`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_SKX``                               ``AVX512VNNI``                                       
-    ``AVX512_CNL`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_SKX``                               ``AVX512IFMA`` ``AVX512VBMI``                        
-    ``AVX512_ICL`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_SKX`` ``AVX512_CLX`` ``AVX512_CNL`` ``AVX512VBMI2`` ``AVX512BITALG`` ``AVX512VPOPCNTDQ`` 
-    ============== =========================================================================================================================================================================== =====================================================
+    ============== ========================================================================================================================================================================================== =====================================================
+    Name           Implies                                                                                                                                                                                    Gathers                                              
+    ============== ========================================================================================================================================================================================== =====================================================
+    ``SSE``        ``SSE2``                                                                                                                                                                                                                                        
+    ``SSE2``       ``SSE``                                                                                                                                                                                                                                         
+    ``SSE3``       ``SSE`` ``SSE2``                                                                                                                                                                                                                                
+    ``SSSE3``      ``SSE`` ``SSE2`` ``SSE3``                                                                                                                                                                                                                       
+    ``SSE41``      ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3``                                                                                                                                                                                                             
+    ``POPCNT``     ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41``                                                                                                                                                                                                   
+    ``SSE42``      ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT``                                                                                                                                                                                        
+    ``AVX``        ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42``                                                                                                                                                                              
+    ``XOP``        ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX``                                                                                                                                                                      
+    ``FMA4``       ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX``                                                                                                                                                                      
+    ``F16C``       ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX``                                                                                                                                                                      
+    ``FMA3``       ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C``                                                                                                                                                             
+    ``AVX2``       ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C``                                                                                                                                                             
+    ``AVX512F``    ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2``                                                                                                                                           
+    ``AVX512CD``   ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F``                                                                                                                               
+    ``AVX512_KNL`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD``                                                             ``AVX512ER`` ``AVX512PF``                            
+    ``AVX512_KNM`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_KNL``                                              ``AVX5124FMAPS`` ``AVX5124VNNIW`` ``AVX512VPOPCNTDQ``
+    ``AVX512_SKX`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD``                                                             ``AVX512VL`` ``AVX512BW`` ``AVX512DQ``               
+    ``AVX512_CLX`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_SKX``                                              ``AVX512VNNI``                                       
+    ``AVX512_CNL`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_SKX``                                              ``AVX512IFMA`` ``AVX512VBMI``                        
+    ``AVX512_ICL`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_SKX`` ``AVX512_CLX`` ``AVX512_CNL``                ``AVX512VBMI2`` ``AVX512BITALG`` ``AVX512VPOPCNTDQ`` 
+    ``AVX512_SPR`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_SKX`` ``AVX512_CLX`` ``AVX512_CNL`` ``AVX512_ICL`` ``AVX512FP16``                                       
+    ============== ========================================================================================================================================================================================== =====================================================
 
 On IBM/POWER big-endian
 ~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/doc/source/reference/simd/how-it-works.rst b/doc/source/reference/simd/how-it-works.rst
index 19b3dba45..dac80dd02 100644
--- a/doc/source/reference/simd/how-it-works.rst
+++ b/doc/source/reference/simd/how-it-works.rst
@@ -1,6 +1,6 @@
-**********************************
+*********************************
 How does the CPU dispatcher work?
-**********************************
+*********************************
 
 NumPy dispatcher is based on multi-source compiling, which means taking
 a certain source and compiling it multiple times with different compiler
diff --git a/doc/source/release.rst b/doc/source/release.rst
index 5742f7e57..b8a877924 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -5,7 +5,12 @@ Release notes
 .. toctree::
     :maxdepth: 3
 
+    1.25.0 <release/1.25.0-notes>
+    1.24.3 <release/1.24.3-notes>
+    1.24.2 <release/1.24.2-notes>
+    1.24.1 <release/1.24.1-notes>
     1.24.0 <release/1.24.0-notes>
+    1.23.5 <release/1.23.5-notes>
     1.23.4 <release/1.23.4-notes>
     1.23.3 <release/1.23.3-notes>
     1.23.2 <release/1.23.2-notes>
diff --git a/doc/source/release/1.15.0-notes.rst b/doc/source/release/1.15.0-notes.rst
index 2d9d068e5..e84386f0f 100644
--- a/doc/source/release/1.15.0-notes.rst
+++ b/doc/source/release/1.15.0-notes.rst
@@ -92,7 +92,7 @@ Deprecations
   ``np.sum(np.from_iter(generator))`` or the built-in Python ``sum`` instead.
 
 * Users of the C-API should call ``PyArrayResolveWriteBackIfCopy`` or
-  ``PyArray_DiscardWritbackIfCopy`` on any array with the ``WRITEBACKIFCOPY``
+  ``PyArray_DiscardWritebackIfCopy`` on any array with the ``WRITEBACKIFCOPY``
   flag set, before deallocating the array. A deprecation warning will be
   emitted if those calls are not used when needed.
 
diff --git a/doc/source/release/1.23.0-notes.rst b/doc/source/release/1.23.0-notes.rst
index c5c23620a..2301192cc 100644
--- a/doc/source/release/1.23.0-notes.rst
+++ b/doc/source/release/1.23.0-notes.rst
@@ -83,8 +83,8 @@ Expired deprecations
 * The ``UPDATEIFCOPY`` array flag has been removed together with the enum
   ``NPY_ARRAY_UPDATEIFCOPY``. The associated (and deprecated)
   ``PyArray_XDECREF_ERR`` was also removed. These were all deprecated in 1.14. They
-  are replaced by ``WRITEBACKIFCOPY``, that requires calling
-  ``PyArray_ResoveWritebackIfCopy`` before the array is deallocated.
+  are replaced by ``NPY_ARRAY_WRITEBACKIFCOPY``, that requires calling
+  ``PyArray_ResolveWritebackIfCopy`` before the array is deallocated.
 
   (`gh-20589 <https://github.com/numpy/numpy/pull/20589>`__)
 
diff --git a/doc/source/release/1.23.5-notes.rst b/doc/source/release/1.23.5-notes.rst
new file mode 100644
index 000000000..8e14794e4
--- /dev/null
+++ b/doc/source/release/1.23.5-notes.rst
@@ -0,0 +1,39 @@
+.. currentmodule:: numpy
+
+==========================
+NumPy 1.23.5 Release Notes
+==========================
+NumPy 1.23.5 is a maintenance release that fixes bugs discovered after the
+1.23.4 release and keeps the build infrastructure current.
+The Python versions supported for this release are 3.8-3.11.
+
+Contributors
+============
+
+A total of 7 people contributed to this release.  People with a "+" by their
+names contributed a patch for the first time.
+
+* @DWesl
+* Aayush Agrawal +
+* Adam Knapp +
+* Charles Harris
+* Navpreet Singh +
+* Sebastian Berg
+* Tania Allard
+
+Pull requests merged
+====================
+
+A total of 10 pull requests were merged for this release.
+
+* `#22489 <https://github.com/numpy/numpy/pull/22489>`__: TST, MAINT: Replace most setup with setup_method (also teardown)
+* `#22490 <https://github.com/numpy/numpy/pull/22490>`__: MAINT, CI: Switch to cygwin/cygwin-install-action@v2
+* `#22494 <https://github.com/numpy/numpy/pull/22494>`__: TST: Make test_partial_iteration_cleanup robust but require leak...
+* `#22592 <https://github.com/numpy/numpy/pull/22592>`__: MAINT: Ensure graceful handling of large header sizes
+* `#22593 <https://github.com/numpy/numpy/pull/22593>`__: TYP: Spelling alignment for array flag literal
+* `#22594 <https://github.com/numpy/numpy/pull/22594>`__: BUG: Fix bounds checking for ``random.logseries``
+* `#22595 <https://github.com/numpy/numpy/pull/22595>`__: DEV: Update GH actions and Dockerfile for Gitpod
+* `#22596 <https://github.com/numpy/numpy/pull/22596>`__: CI: Only fetch in actions/checkout
+* `#22597 <https://github.com/numpy/numpy/pull/22597>`__: BUG: Decrement ref count in gentype_reduce if allocated memory...
+* `#22625 <https://github.com/numpy/numpy/pull/22625>`__: BUG: Histogramdd breaks on big arrays in Windows
+
diff --git a/doc/source/release/1.24.0-notes.rst b/doc/source/release/1.24.0-notes.rst
index 68134bce2..1c9e719b3 100644
--- a/doc/source/release/1.24.0-notes.rst
+++ b/doc/source/release/1.24.0-notes.rst
@@ -1,45 +1,515 @@
 .. currentmodule:: numpy
 
-==========================
-NumPy 1.24.0 Release Notes
-==========================
+========================
+NumPy 1.24 Release Notes
+========================
+The NumPy 1.24.0 release continues the ongoing work to improve the handling and
+promotion of dtypes, increase the execution speed, and clarify the
+documentation.  There are also a large number of new and expired deprecations
+due to changes in promotion and cleanups. This might be called a deprecation
+release. Highlights are
 
+* Many new deprecations, check them out.
+* Many expired deprecations,
+* New F2PY features and fixes.
+* New "dtype" and "casting" keywords for stacking functions.
 
-Highlights
-==========
+See below for the details,
 
-
-New functions
-=============
+This release supports Python versions 3.8-3.11.
 
 
 Deprecations
 ============
 
+Deprecate fastCopyAndTranspose and PyArray_CopyAndTranspose
+-----------------------------------------------------------
+The ``numpy.fastCopyAndTranspose`` function has been deprecated. Use the
+corresponding copy and transpose methods directly::
+
+    arr.T.copy()
+
+The underlying C function ``PyArray_CopyAndTranspose`` has also been deprecated
+from the NumPy C-API.
+
+(`gh-22313 <https://github.com/numpy/numpy/pull/22313>`__)
+
+Conversion of out-of-bound Python integers
+------------------------------------------
+Attempting a conversion from a Python integer to a NumPy value will now always
+check whether the result can be represented by NumPy.  This means the following
+examples will fail in the future and give a ``DeprecationWarning`` now::
+
+    np.uint8(-1)
+    np.array([3000], dtype=np.int8)
+
+Many of these did succeed before.  Such code was mainly useful for unsigned
+integers with negative values such as ``np.uint8(-1)`` giving
+``np.iinfo(np.uint8).max``.
+
+Note that conversion between NumPy integers is unaffected, so that
+``np.array(-1).astype(np.uint8)`` continues to work and use C integer overflow
+logic.  For negative values, it will also work to view the array:
+``np.array(-1, dtype=np.int8).view(np.uint8)``.
+In some cases, using ``np.iinfo(np.uint8).max`` or ``val % 2**8`` may also
+work well.
 
-Future Changes
-==============
+In rare cases input data may mix both negative values and very large unsigned
+values (i.e. ``-1`` and ``2**63``).  There it is unfortunately necessary
+to use ``%`` on the Python value or use signed or unsigned conversion
+depending on whether negative values are expected.
+
+(`gh-22385 <https://github.com/numpy/numpy/pull/22385>`__)
+
+Deprecate ``msort``
+-------------------
+The ``numpy.msort`` function is deprecated. Use ``np.sort(a, axis=0)`` instead.
+
+(`gh-22456 <https://github.com/numpy/numpy/pull/22456>`__)
+
+``np.str0`` and similar are now deprecated
+------------------------------------------
+The scalar type aliases ending in a 0 bit size: ``np.object0``, ``np.str0``,
+``np.bytes0``, ``np.void0``, ``np.int0``, ``np.uint0`` as well as ``np.bool8``
+are now deprecated and will eventually be removed.
+
+(`gh-22607 <https://github.com/numpy/numpy/pull/22607>`__)
 
 
 Expired deprecations
 ====================
 
+* The ``normed`` keyword argument has been removed from
+  `np.histogram`, `np.histogram2d`, and `np.histogramdd`.
+  Use ``density`` instead.  If ``normed`` was passed by
+  position, ``density`` is now used.
+
+  (`gh-21645 <https://github.com/numpy/numpy/pull/21645>`__)
+
+* Ragged array creation will now always raise a ``ValueError`` unless
+  ``dtype=object`` is passed.  This includes very deeply nested sequences.
+
+  (`gh-22004 <https://github.com/numpy/numpy/pull/22004>`__)
+
+* Support for Visual Studio 2015 and earlier has been removed.
+
+* Support for the Windows Interix POSIX interop layer has been removed.
+
+  (`gh-22139 <https://github.com/numpy/numpy/pull/22139>`__)
+
+* Support for Cygwin < 3.3 has been removed.
+
+  (`gh-22159 <https://github.com/numpy/numpy/pull/22159>`__)
+
+* The mini() method of ``np.ma.MaskedArray`` has been removed. Use either
+  ``np.ma.MaskedArray.min()`` or ``np.ma.minimum.reduce()``.
+
+* The single-argument form of ``np.ma.minimum`` and ``np.ma.maximum`` has been
+  removed. Use ``np.ma.minimum.reduce()`` or ``np.ma.maximum.reduce()``
+  instead.
+
+  (`gh-22228 <https://github.com/numpy/numpy/pull/22228>`__)
+
+* Passing dtype instances other than the canonical (mainly native byte-order)
+  ones to ``dtype=`` or ``signature=`` in ufuncs will now raise a
+  ``TypeError``.  We recommend passing the strings ``"int8"`` or scalar types
+  ``np.int8`` since the byte-order, datetime/timedelta unit, etc. are never
+  enforced.  (Initially deprecated in NumPy 1.21.)
+
+  (`gh-22540 <https://github.com/numpy/numpy/pull/22540>`__)
+
+* The ``dtype=`` argument to comparison ufuncs is now applied correctly.  That
+  means that only ``bool`` and ``object`` are valid values and ``dtype=object``
+  is enforced.
+
+  (`gh-22541 <https://github.com/numpy/numpy/pull/22541>`__)
+
+* The deprecation for the aliases ``np.object``, ``np.bool``, ``np.float``,
+  ``np.complex``, ``np.str``, and ``np.int`` is expired (introduces NumPy
+  1.20).  Some of these will now give a FutureWarning in addition to raising an
+  error since they will be mapped to the NumPy scalars in the future.
+
+  (`gh-22607 <https://github.com/numpy/numpy/pull/22607>`__)
+
 
 Compatibility notes
 ===================
 
+``array.fill(scalar)`` may behave slightly different
+----------------------------------------------------
+``numpy.ndarray.fill`` may in some cases behave slightly different now due to
+the fact that the logic is aligned with item assignment::
+
+    arr = np.array([1])  # with any dtype/value
+    arr.fill(scalar)
+    # is now identical to:
+    arr[0] = scalar
+
+Previously casting may have produced slightly different answers when using
+values that could not be represented in the target ``dtype`` or when the target
+had ``object`` dtype.
+
+(`gh-20924 <https://github.com/numpy/numpy/pull/20924>`__)
+
+Subarray to object cast now copies
+----------------------------------
+Casting a dtype that includes a subarray to an object will now ensure a copy of
+the subarray.  Previously an unsafe view was returned::
+
+    arr = np.ones(3, dtype=[("f", "i", 3)])
+    subarray_fields = arr.astype(object)[0]
+    subarray = subarray_fields[0]  # "f" field
+
+    np.may_share_memory(subarray, arr)
+
+Is now always false.  While previously it was true for the specific cast.
+
+(`gh-21925 <https://github.com/numpy/numpy/pull/21925>`__)
+
+Returned arrays respect uniqueness of dtype kwarg objects
+---------------------------------------------------------
+When the ``dtype`` keyword argument is used with :py:func:`np.array()` or
+:py:func:`asarray()`, the dtype of the returned array now always exactly
+matches the dtype provided by the caller.
+
+In some cases this change means that a *view* rather than the input array is
+returned.  The following is an example for this on 64bit Linux where ``long``
+and ``longlong`` are the same precision but different ``dtypes``::
+
+    >>> arr = np.array([1, 2, 3], dtype="long")
+    >>> new_dtype = np.dtype("longlong")
+    >>> new = np.asarray(arr, dtype=new_dtype)
+    >>> new.dtype is new_dtype
+    True
+    >>> new is arr
+    False
+
+Before the change, the ``dtype`` did not match because ``new is arr`` was
+``True``.
+
+(`gh-21995 <https://github.com/numpy/numpy/pull/21995>`__)
+
+DLPack export raises ``BufferError``
+------------------------------------
+When an array buffer cannot be exported via DLPack a ``BufferError`` is now
+always raised where previously ``TypeError`` or ``RuntimeError`` was raised.
+This allows falling back to the buffer protocol or ``__array_interface__`` when
+DLPack was tried first.
 
-C API changes
-=============
+(`gh-22542 <https://github.com/numpy/numpy/pull/22542>`__)
+
+NumPy builds are no longer tested on GCC-6
+------------------------------------------
+Ubuntu 18.04 is deprecated for GitHub actions and GCC-6 is not available on
+Ubuntu 20.04, so builds using that compiler are no longer tested. We still test
+builds using GCC-7 and GCC-8.
+
+(`gh-22598 <https://github.com/numpy/numpy/pull/22598>`__)
 
 
 New Features
 ============
 
+New attribute ``symbol`` added to polynomial classes
+----------------------------------------------------
+The polynomial classes in the ``numpy.polynomial`` package have a new
+``symbol`` attribute which is used to represent the indeterminate of the
+polynomial.  This can be used to change the value of the variable when
+printing::
+
+    >>> P_y = np.polynomial.Polynomial([1, 0, -1], symbol="y")
+    >>> print(P_y)
+    1.0 + 0.0·y¹ - 1.0·y²
+
+Note that the polynomial classes only support 1D polynomials, so operations
+that involve polynomials with different symbols are disallowed when the result
+would be multivariate::
+
+    >>> P = np.polynomial.Polynomial([1, -1])  # default symbol is "x"
+    >>> P_z = np.polynomial.Polynomial([1, 1], symbol="z")
+    >>> P * P_z
+    Traceback (most recent call last)
+       ...
+    ValueError: Polynomial symbols differ
+
+The symbol can be any valid Python identifier. The default is ``symbol=x``,
+consistent with existing behavior.
+
+(`gh-16154 <https://github.com/numpy/numpy/pull/16154>`__)
+
+F2PY support for Fortran ``character`` strings
+----------------------------------------------
+F2PY now supports wrapping Fortran functions with:
+
+* character (e.g. ``character x``)
+* character array (e.g. ``character, dimension(n) :: x``)
+* character string (e.g. ``character(len=10) x``)
+* and character string array (e.g. ``character(len=10), dimension(n, m) :: x``)
+
+arguments, including passing Python unicode strings as Fortran character string
+arguments.
+
+(`gh-19388 <https://github.com/numpy/numpy/pull/19388>`__)
+
+New function ``np.show_runtime``
+--------------------------------
+A new function ``numpy.show_runtime`` has been added to display the runtime
+information of the machine in addition to ``numpy.show_config`` which displays
+the build-related information.
+
+(`gh-21468 <https://github.com/numpy/numpy/pull/21468>`__)
+
+``strict`` option for ``testing.assert_array_equal``
+----------------------------------------------------
+The ``strict`` option is now available for ``testing.assert_array_equal``.
+Setting ``strict=True`` will disable the broadcasting behaviour for scalars and
+ensure that input arrays have the same data type.
+
+(`gh-21595 <https://github.com/numpy/numpy/pull/21595>`__)
+
+New parameter ``equal_nan`` added to ``np.unique``
+--------------------------------------------------
+``np.unique`` was changed in 1.21 to treat all ``NaN`` values as equal and
+return a single ``NaN``. Setting ``equal_nan=False`` will restore pre-1.21
+behavior to treat ``NaNs`` as unique. Defaults to ``True``.
+
+(`gh-21623 <https://github.com/numpy/numpy/pull/21623>`__)
+
+``casting`` and ``dtype`` keyword arguments for ``numpy.stack``
+---------------------------------------------------------------
+The ``casting`` and ``dtype`` keyword arguments are now available for
+``numpy.stack``.  To use them, write ``np.stack(..., dtype=None,
+casting='same_kind')``.
+
+``casting`` and ``dtype`` keyword arguments for ``numpy.vstack``
+----------------------------------------------------------------
+The ``casting`` and ``dtype`` keyword arguments are now available for
+``numpy.vstack``.  To use them, write ``np.vstack(..., dtype=None,
+casting='same_kind')``.
+
+``casting`` and ``dtype`` keyword arguments for ``numpy.hstack``
+----------------------------------------------------------------
+The ``casting`` and ``dtype`` keyword arguments are now available for
+``numpy.hstack``.  To use them, write ``np.hstack(..., dtype=None,
+casting='same_kind')``.
+
+(`gh-21627 <https://github.com/numpy/numpy/pull/21627>`__)
+
+The bit generator underlying the singleton RandomState can be changed
+---------------------------------------------------------------------
+The singleton ``RandomState`` instance exposed in the ``numpy.random`` module
+is initialized at startup with the ``MT19937`` bit generator. The new function
+``set_bit_generator`` allows the default bit generator to be replaced with a
+user-provided bit generator. This function has been introduced to provide a
+method allowing seamless integration of a high-quality, modern bit generator in
+new code with existing code that makes use of the singleton-provided random
+variate generating functions. The companion function ``get_bit_generator``
+returns the current bit generator being used by the singleton ``RandomState``.
+This is provided to simplify restoring the original source of randomness if
+required.
+
+The preferred method to generate reproducible random numbers is to use a modern
+bit generator in an instance of ``Generator``. The function ``default_rng``
+simplifies instantiation::
+
+   >>> rg = np.random.default_rng(3728973198)
+   >>> rg.random()
+
+The same bit generator can then be shared with the singleton instance so that
+calling functions in the ``random`` module will use the same bit generator::
+
+   >>> orig_bit_gen = np.random.get_bit_generator()
+   >>> np.random.set_bit_generator(rg.bit_generator)
+   >>> np.random.normal()
+
+The swap is permanent (until reversed) and so any call to functions in the
+``random`` module will use the new bit generator. The original can be restored
+if required for code to run correctly::
+
+   >>> np.random.set_bit_generator(orig_bit_gen)
+
+(`gh-21976 <https://github.com/numpy/numpy/pull/21976>`__)
+
+``np.void`` now has a ``dtype`` argument
+----------------------------------------
+NumPy now allows constructing structured void scalars directly by
+passing the ``dtype`` argument to ``np.void``.
+
+(`gh-22316 <https://github.com/numpy/numpy/pull/22316>`__)
+
 
 Improvements
 ============
 
+F2PY Improvements
+-----------------
+* The generated extension modules don't use the deprecated NumPy-C API anymore
+* Improved ``f2py`` generated exception messages
+* Numerous bug and ``flake8`` warning fixes
+* various CPP macros that one can use within C-expressions of signature files
+  are prefixed with ``f2py_``. For example, one should use ``f2py_len(x)``
+  instead of ``len(x)``
+* A new construct ``character(f2py_len=...)`` is introduced to support
+  returning assumed length character strings (e.g. ``character(len=*)``) from
+  wrapper functions
+
+A hook to support rewriting ``f2py`` internal data structures after reading all
+its input files is introduced. This is required, for instance, for BC of SciPy
+support where character arguments are treated as character strings arguments in
+``C`` expressions.
+
+(`gh-19388 <https://github.com/numpy/numpy/pull/19388>`__)
+
+IBM zSystems Vector Extension Facility (SIMD)
+---------------------------------------------
+Added support for SIMD extensions of zSystem (z13, z14, z15), through the
+universal intrinsics interface. This support leads to performance improvements
+for all SIMD kernels implemented using the universal intrinsics, including the
+following operations: rint, floor, trunc, ceil, sqrt, absolute, square,
+reciprocal, tanh, sin, cos, equal, not_equal, greater, greater_equal, less,
+less_equal, maximum, minimum, fmax, fmin, argmax, argmin, add, subtract,
+multiply, divide.
+
+(`gh-20913 <https://github.com/numpy/numpy/pull/20913>`__)
+
+NumPy now gives floating point errors in casts
+----------------------------------------------
+In most cases, NumPy previously did not give floating point warnings or errors
+when these happened during casts.  For examples, casts like::
+
+    np.array([2e300]).astype(np.float32)  # overflow for float32
+    np.array([np.inf]).astype(np.int64)
+
+Should now generally give floating point warnings.  These warnings should warn
+that floating point overflow occurred.  For errors when converting floating
+point values to integers users should expect invalid value warnings.
+
+Users can modify the behavior of these warnings using ``np.errstate``.
+
+Note that for float to int casts, the exact warnings that are given may
+be platform dependent.  For example::
+
+    arr = np.full(100, fill_value=1000, dtype=np.float64)
+    arr.astype(np.int8)
+
+May give a result equivalent to (the intermediate cast means no warning is
+given)::
+
+    arr.astype(np.int64).astype(np.int8)
+
+May return an undefined result, with a warning set::
+
+    RuntimeWarning: invalid value encountered in cast
+
+The precise behavior is subject to the C99 standard and its implementation in
+both software and hardware.
+
+(`gh-21437 <https://github.com/numpy/numpy/pull/21437>`__)
+
+F2PY supports the value attribute
+---------------------------------
+The Fortran standard requires that variables declared with the ``value``
+attribute must be passed by value instead of reference. F2PY now supports this
+use pattern correctly. So ``integer, intent(in), value :: x`` in Fortran codes
+will have correct wrappers generated.
+
+(`gh-21807 <https://github.com/numpy/numpy/pull/21807>`__)
+
+Added pickle support for third-party BitGenerators
+--------------------------------------------------
+The pickle format for bit generators was extended to allow each bit generator
+to supply its own constructor when during pickling. Previous  versions of NumPy
+only supported unpickling ``Generator`` instances created with one of the core
+set of bit generators supplied with NumPy. Attempting to unpickle a
+``Generator`` that used a third-party bit generators would fail since the
+constructor used during the unpickling was only aware of the bit generators
+included in NumPy.
+
+(`gh-22014 <https://github.com/numpy/numpy/pull/22014>`__)
+
+arange() now explicitly fails with dtype=str
+---------------------------------------------
+Previously, the ``np.arange(n, dtype=str)`` function worked for ``n=1`` and
+``n=2``, but would raise a non-specific exception message for other values of
+``n``. Now, it raises a `TypeError` informing that ``arange`` does not support
+string dtypes::
+
+    >>> np.arange(2, dtype=str)
+    Traceback (most recent call last)
+       ...
+    TypeError: arange() not supported for inputs with DType <class 'numpy.dtype[str_]'>.
+
+(`gh-22055 <https://github.com/numpy/numpy/pull/22055>`__)
+
+``numpy.typing`` protocols are now runtime checkable
+----------------------------------------------------
+The protocols used in ``numpy.typing.ArrayLike`` and ``numpy.typing.DTypeLike``
+are now properly marked as runtime checkable, making them easier to use for
+runtime type checkers.
+
+(`gh-22357 <https://github.com/numpy/numpy/pull/22357>`__)
+
+
+Performance improvements and changes
+====================================
+
+Faster version of ``np.isin`` and ``np.in1d`` for integer arrays
+----------------------------------------------------------------
+``np.in1d`` (used by ``np.isin``) can now switch to a faster algorithm (up to
+>10x faster) when it is passed two integer arrays.  This is often automatically
+used, but you can use ``kind="sort"`` or ``kind="table"`` to force the old or
+new method, respectively.
+
+(`gh-12065 <https://github.com/numpy/numpy/pull/12065>`__)
+
+Faster comparison operators
+----------------------------
+The comparison functions (``numpy.equal``, ``numpy.not_equal``, ``numpy.less``,
+``numpy.less_equal``, ``numpy.greater`` and ``numpy.greater_equal``) are now
+much faster as they are now vectorized with universal intrinsics. For a CPU
+with SIMD extension AVX512BW, the performance gain is up to 2.57x, 1.65x and
+19.15x for integer, float and boolean data types, respectively (with N=50000).
+
+(`gh-21483 <https://github.com/numpy/numpy/pull/21483>`__)
+
 
 Changes
 =======
+
+Better reporting of integer division overflow
+---------------------------------------------
+Integer division overflow of scalars and arrays used to provide a
+``RuntimeWarning`` and the return value was undefined leading to crashes at
+rare occasions::
+
+    >>> np.array([np.iinfo(np.int32).min]*10, dtype=np.int32) // np.int32(-1)
+    <stdin>:1: RuntimeWarning: divide by zero encountered in floor_divide
+    array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)
+
+Integer division overflow now returns the input dtype's minimum value and raise
+the following ``RuntimeWarning``::
+
+    >>> np.array([np.iinfo(np.int32).min]*10, dtype=np.int32) // np.int32(-1)
+    <stdin>:1: RuntimeWarning: overflow encountered in floor_divide
+    array([-2147483648, -2147483648, -2147483648, -2147483648, -2147483648,
+           -2147483648, -2147483648, -2147483648, -2147483648, -2147483648],
+          dtype=int32)
+
+(`gh-21506 <https://github.com/numpy/numpy/pull/21506>`__)
+
+``masked_invalid`` now modifies the mask in-place
+-------------------------------------------------
+When used with ``copy=False``, ``numpy.ma.masked_invalid`` now modifies the
+input masked array in-place.  This makes it behave identically to
+``masked_where`` and better matches the documentation.
+
+(`gh-22046 <https://github.com/numpy/numpy/pull/22046>`__)
+
+``nditer``/``NpyIter`` allows all allocating all operands
+---------------------------------------------------------
+The NumPy iterator available through ``np.nditer`` in Python and as ``NpyIter``
+in C now supports allocating all arrays.  The iterator shape defaults to ``()``
+in this case.  The operands dtype must be provided, since a "common dtype"
+cannot be inferred from the other inputs.
+
+(`gh-22457 <https://github.com/numpy/numpy/pull/22457>`__)
diff --git a/doc/source/release/1.24.1-notes.rst b/doc/source/release/1.24.1-notes.rst
new file mode 100644
index 000000000..c346f6d20
--- /dev/null
+++ b/doc/source/release/1.24.1-notes.rst
@@ -0,0 +1,50 @@
+.. currentmodule:: numpy
+
+==========================
+NumPy 1.24.1 Release Notes
+==========================
+NumPy 1.24.1 is a maintenance release that fixes bugs and regressions discovered after the
+1.24.0 release. The Python versions supported by this release are 3.8-3.11.
+
+Contributors
+============
+
+A total of 12 people contributed to this release.  People with a "+" by their
+names contributed a patch for the first time.
+
+* Andrew Nelson
+* Ben Greiner +
+* Charles Harris
+* Clément Robert
+* Matteo Raso
+* Matti Picus
+* Melissa Weber Mendonça
+* Miles Cranmer
+* Ralf Gommers
+* Rohit Goswami
+* Sayed Adel
+* Sebastian Berg
+
+Pull requests merged
+====================
+
+A total of 18 pull requests were merged for this release.
+
+* `#22820 <https://github.com/numpy/numpy/pull/22820>`__: BLD: add workaround in setup.py for newer setuptools
+* `#22830 <https://github.com/numpy/numpy/pull/22830>`__: BLD: CIRRUS_TAG redux
+* `#22831 <https://github.com/numpy/numpy/pull/22831>`__: DOC: fix a couple typos in 1.23 notes
+* `#22832 <https://github.com/numpy/numpy/pull/22832>`__: BUG: Fix refcounting errors found using pytest-leaks
+* `#22834 <https://github.com/numpy/numpy/pull/22834>`__: BUG, SIMD: Fix invalid value encountered in several ufuncs
+* `#22837 <https://github.com/numpy/numpy/pull/22837>`__: TST: ignore more np.distutils.log imports
+* `#22839 <https://github.com/numpy/numpy/pull/22839>`__: BUG: Do not use getdata() in np.ma.masked_invalid
+* `#22847 <https://github.com/numpy/numpy/pull/22847>`__: BUG: Ensure correct behavior for rows ending in delimiter in...
+* `#22848 <https://github.com/numpy/numpy/pull/22848>`__: BUG, SIMD: Fix the bitmask of the boolean comparison
+* `#22857 <https://github.com/numpy/numpy/pull/22857>`__: BLD: Help raspian arm + clang 13 about __builtin_mul_overflow
+* `#22858 <https://github.com/numpy/numpy/pull/22858>`__: API: Ensure a full mask is returned for masked_invalid
+* `#22866 <https://github.com/numpy/numpy/pull/22866>`__: BUG: Polynomials now copy properly (#22669)
+* `#22867 <https://github.com/numpy/numpy/pull/22867>`__: BUG, SIMD: Fix memory overlap in ufunc comparison loops
+* `#22868 <https://github.com/numpy/numpy/pull/22868>`__: BUG: Fortify string casts against floating point warnings
+* `#22875 <https://github.com/numpy/numpy/pull/22875>`__: TST: Ignore nan-warnings in randomized out tests
+* `#22883 <https://github.com/numpy/numpy/pull/22883>`__: MAINT: restore npymath implementations needed for freebsd
+* `#22884 <https://github.com/numpy/numpy/pull/22884>`__: BUG: Fix integer overflow in in1d for mixed integer dtypes #22877
+* `#22887 <https://github.com/numpy/numpy/pull/22887>`__: BUG: Use whole file for encoding checks with ``charset_normalizer``.
diff --git a/doc/source/release/1.24.2-notes.rst b/doc/source/release/1.24.2-notes.rst
new file mode 100644
index 000000000..9e9412244
--- /dev/null
+++ b/doc/source/release/1.24.2-notes.rst
@@ -0,0 +1,51 @@
+.. currentmodule:: numpy
+
+==========================
+NumPy 1.24.2 Release Notes
+==========================
+NumPy 1.24.2 is a maintenance release that fixes bugs and regressions discovered after the
+1.24.1 release. The Python versions supported by this release are 3.8-3.11.
+
+Contributors
+============
+
+A total of 14 people contributed to this release.  People with a "+" by their
+names contributed a patch for the first time.
+
+* Bas van Beek
+* Charles Harris
+* Khem Raj +
+* Mark Harfouche
+* Matti Picus
+* Panagiotis Zestanakis +
+* Peter Hawkins
+* Pradipta Ghosh
+* Ross Barnowski
+* Sayed Adel
+* Sebastian Berg
+* Syam Gadde +
+* dmbelov +
+* pkubaj +
+
+Pull requests merged
+====================
+
+A total of 17 pull requests were merged for this release.
+
+* `#22965 <https://github.com/numpy/numpy/pull/22965>`__: MAINT: Update python 3.11-dev to 3.11.
+* `#22966 <https://github.com/numpy/numpy/pull/22966>`__: DOC: Remove dangling deprecation warning
+* `#22967 <https://github.com/numpy/numpy/pull/22967>`__: ENH: Detect CPU features on FreeBSD/powerpc64*
+* `#22968 <https://github.com/numpy/numpy/pull/22968>`__: BUG: np.loadtxt cannot load text file with quoted fields separated...
+* `#22969 <https://github.com/numpy/numpy/pull/22969>`__: TST: Add fixture to avoid issue with randomizing test order.
+* `#22970 <https://github.com/numpy/numpy/pull/22970>`__: BUG: Fix fill violating read-only flag. (#22959)
+* `#22971 <https://github.com/numpy/numpy/pull/22971>`__: MAINT: Add additional information to missing scalar AttributeError
+* `#22972 <https://github.com/numpy/numpy/pull/22972>`__: MAINT: Move export for scipy arm64 helper into main module
+* `#22976 <https://github.com/numpy/numpy/pull/22976>`__: BUG, SIMD: Fix spurious invalid exception for sin/cos on arm64/clang
+* `#22989 <https://github.com/numpy/numpy/pull/22989>`__: BUG: Ensure correct loop order in sin, cos, and arctan2
+* `#23030 <https://github.com/numpy/numpy/pull/23030>`__: DOC: Add version added information for the strict parameter in...
+* `#23031 <https://github.com/numpy/numpy/pull/23031>`__: BUG: use ``_Alignof`` rather than ``offsetof()`` on most compilers
+* `#23147 <https://github.com/numpy/numpy/pull/23147>`__: BUG: Fix for npyv__trunc_s32_f32 (VXE)
+* `#23148 <https://github.com/numpy/numpy/pull/23148>`__: BUG: Fix integer / float scalar promotion
+* `#23149 <https://github.com/numpy/numpy/pull/23149>`__: BUG: Add missing <type_traits> header.
+* `#23150 <https://github.com/numpy/numpy/pull/23150>`__: TYP, MAINT: Add a missing explicit ``Any`` parameter to the ``npt.ArrayLike``...
+* `#23161 <https://github.com/numpy/numpy/pull/23161>`__: BLD: remove redundant definition of npy_nextafter [wheel build]
diff --git a/doc/source/release/1.24.3-notes.rst b/doc/source/release/1.24.3-notes.rst
new file mode 100644
index 000000000..1aaf79cb6
--- /dev/null
+++ b/doc/source/release/1.24.3-notes.rst
@@ -0,0 +1,49 @@
+.. currentmodule:: numpy
+
+==========================
+NumPy 1.24.3 Release Notes
+==========================
+NumPy 1.24.3 is a maintenance release that fixes bugs and regressions discovered after the
+1.24.2 release. The Python versions supported by this release are 3.8-3.11.
+
+Contributors
+============
+
+A total of 12 people contributed to this release.  People with a "+" by their
+names contributed a patch for the first time.
+
+* Aleksei Nikiforov +
+* Alexander Heger
+* Bas van Beek
+* Bob Eldering
+* Brock Mendel
+* Charles Harris
+* Kyle Sunden
+* Peter Hawkins
+* Rohit Goswami
+* Sebastian Berg
+* Warren Weckesser
+* dependabot[bot]
+
+Pull requests merged
+====================
+
+A total of 17 pull requests were merged for this release.
+
+* `#23206 <https://github.com/numpy/numpy/pull/23206>`__: BUG: fix for f2py string scalars (#23194)
+* `#23207 <https://github.com/numpy/numpy/pull/23207>`__: BUG: datetime64/timedelta64 comparisons return NotImplemented
+* `#23208 <https://github.com/numpy/numpy/pull/23208>`__: MAINT: Pin matplotlib to version 3.6.3 for refguide checks
+* `#23221 <https://github.com/numpy/numpy/pull/23221>`__: DOC: Fix matplotlib error in documentation
+* `#23226 <https://github.com/numpy/numpy/pull/23226>`__: CI: Ensure submodules are initialized in gitpod.
+* `#23341 <https://github.com/numpy/numpy/pull/23341>`__: TYP: Replace duplicate reduce in ufunc type signature with reduceat.
+* `#23342 <https://github.com/numpy/numpy/pull/23342>`__: TYP: Remove duplicate CLIP/WRAP/RAISE in ``__init__.pyi``.
+* `#23343 <https://github.com/numpy/numpy/pull/23343>`__: TYP: Mark ``d`` argument to fftfreq and rfftfreq as optional...
+* `#23344 <https://github.com/numpy/numpy/pull/23344>`__: TYP: Add type annotations for comparison operators to MaskedArray.
+* `#23345 <https://github.com/numpy/numpy/pull/23345>`__: TYP: Remove some stray type-check-only imports of ``msort``
+* `#23370 <https://github.com/numpy/numpy/pull/23370>`__: BUG: Ensure like is only stripped for ``like=`` dispatched functions
+* `#23543 <https://github.com/numpy/numpy/pull/23543>`__: BUG: fix loading and storing big arrays on s390x
+* `#23544 <https://github.com/numpy/numpy/pull/23544>`__: MAINT: Bump larsoner/circleci-artifacts-redirector-action
+* `#23634 <https://github.com/numpy/numpy/pull/23634>`__: BUG: Ignore invalid and overflow warnings in masked setitem
+* `#23635 <https://github.com/numpy/numpy/pull/23635>`__: BUG: Fix masked array raveling when ``order="A"`` or ``order="K"``
+* `#23636 <https://github.com/numpy/numpy/pull/23636>`__: MAINT: Update conftest for newer hypothesis versions
+* `#23637 <https://github.com/numpy/numpy/pull/23637>`__: BUG: Fix bug in parsing F77 style string arrays.
diff --git a/doc/source/release/1.25.0-notes.rst b/doc/source/release/1.25.0-notes.rst
new file mode 100644
index 000000000..9b29b79df
--- /dev/null
+++ b/doc/source/release/1.25.0-notes.rst
@@ -0,0 +1,45 @@
+.. currentmodule:: numpy
+
+==========================
+NumPy 1.25.0 Release Notes
+==========================
+
+
+Highlights
+==========
+
+
+New functions
+=============
+
+
+Deprecations
+============
+
+
+Future Changes
+==============
+
+
+Expired deprecations
+====================
+
+
+Compatibility notes
+===================
+
+
+C API changes
+=============
+
+
+New Features
+============
+
+
+Improvements
+============
+
+
+Changes
+=======
diff --git a/doc/source/release/1.4.0-notes.rst b/doc/source/release/1.4.0-notes.rst
index 9480a054e..df7b72da6 100644
--- a/doc/source/release/1.4.0-notes.rst
+++ b/doc/source/release/1.4.0-notes.rst
@@ -110,7 +110,7 @@ Testing
     #. deprecated decorator: this decorator may be used to avoid cluttering
        testing output while testing DeprecationWarning is effectively raised by
        the decorated test.
-    #. assert_array_almost_equal_nulps: new method to compare two arrays of
+    #. assert_array_almost_equal_nulp: new method to compare two arrays of
        floating point values. With this function, two values are considered
        close if there are not many representable floating point values in
        between, thus being more robust than assert_array_almost_equal when the
diff --git a/doc/source/user/absolute_beginners.rst b/doc/source/user/absolute_beginners.rst
index dae2c4f06..dfcdc669b 100644
--- a/doc/source/user/absolute_beginners.rst
+++ b/doc/source/user/absolute_beginners.rst
@@ -64,8 +64,8 @@ To access NumPy and its functions import it in your Python code like this::
   import numpy as np
 
 We shorten the imported name to ``np`` for better readability of code using
-NumPy. This is a widely adopted convention that you should follow so that
-anyone working with your code can easily understand it.
+NumPy. This is a widely adopted convention that makes your code more readable
+for everyone working on it. We recommend to always use import numpy as ``np``.
 
 Reading the example code
 ------------------------
diff --git a/doc/source/user/basics.creation.rst b/doc/source/user/basics.creation.rst
index a97d69d8b..3ee501889 100644
--- a/doc/source/user/basics.creation.rst
+++ b/doc/source/user/basics.creation.rst
@@ -75,8 +75,8 @@ the computation, here ``uint32`` and ``int32`` can both be represented in
 as ``int64``. 
 
 The default NumPy behavior is to create arrays in either 32 or 64-bit signed
-integers (platform dependent and matches C int size) or double precision
-floating point numbers, int32/int64 and float, respectively. If you expect your
+integers (platform dependent and matches C ``long`` size) or double precision
+floating point numbers. If you expect your
 integer arrays to be a specific type, then you need to specify the dtype while
 you create the array.
 
@@ -352,7 +352,7 @@ and :func:`numpy.genfromtxt`. These functions have more involved use cases in
  2, 4
  3, 9
 
-Importing ``simple.csv`` is accomplished using :func:`loadtxt`::
+Importing ``simple.csv`` is accomplished using :func:`numpy.loadtxt`::
 
  >>> np.loadtxt('simple.csv', delimiter = ',', skiprows = 1) # doctest: +SKIP
  array([[0., 0.],
diff --git a/doc/source/user/basics.dispatch.rst b/doc/source/user/basics.dispatch.rst
index 0d0ddfcdb..a493ef769 100644
--- a/doc/source/user/basics.dispatch.rst
+++ b/doc/source/user/basics.dispatch.rst
@@ -271,6 +271,36 @@ array([[1., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 1.]])
 
+
+The implementation of ``DiagonalArray`` in this example only handles the
+``np.sum`` and ``np.mean`` functions for brevity. Many other functions in the
+Numpy API are also available to wrap and a full-fledged custom array container
+can explicitly support all functions that Numpy makes available to wrap.
+
+Numpy provides some utilities to aid testing of custom array containers that
+implement the ``__array_ufunc__`` and ``__array_function__`` protocols in the
+``numpy.testing.overrides`` namespace.
+
+To check if a Numpy function can be overridden via ``__array_ufunc__``, you can
+use :func:`~numpy.testing.overrides.allows_array_ufunc_override`:
+
+>>> from np.testing.overrides import allows_array_ufunc_override
+>>> allows_array_ufunc_override(np.add)
+True
+
+Similarly, you can check if a function can be overridden via
+``__array_function__`` using
+:func:`~numpy.testing.overrides.allows_array_function_override`.
+
+Lists of every overridable function in the Numpy API are also available via
+:func:`~numpy.testing.overrides.get_overridable_numpy_array_functions` for
+functions that support the ``__array_function__`` protocol and
+:func:`~numpy.testing.overrides.get_overridable_numpy_ufuncs` for functions that
+support the ``__array_ufunc__`` protocol. Both functions return sets of
+functions that are present in the Numpy public API. User-defined ufuncs or
+ufuncs defined in other libraries that depend on Numpy are not present in
+these sets.
+
 Refer to the `dask source code <https://github.com/dask/dask>`_ and
 `cupy source code <https://github.com/cupy/cupy>`_  for more fully-worked
 examples of custom array containers.
diff --git a/doc/source/user/basics.indexing.rst b/doc/source/user/basics.indexing.rst
index 088d280a2..17fc9c0cc 100644
--- a/doc/source/user/basics.indexing.rst
+++ b/doc/source/user/basics.indexing.rst
@@ -294,10 +294,6 @@ basic slicing that returns a :term:`view`).
    the former will trigger advanced indexing. Be sure to understand
    why this occurs.
 
-   Also recognize that ``x[[1, 2, 3]]`` will trigger advanced indexing,
-   whereas due to the deprecated Numeric compatibility mentioned above,
-   ``x[[1, 2, slice(None)]]`` will trigger basic slicing.
-
 Integer array indexing
 ~~~~~~~~~~~~~~~~~~~~~~
 
@@ -485,12 +481,13 @@ tuple (of length :attr:`obj.ndim <ndarray.ndim>`) of integer index
 arrays showing the :py:data:`True` elements of *obj*. However, it is
 faster when ``obj.shape == x.shape``.
 
-If ``obj.ndim == x.ndim``, ``x[obj]`` returns a 1-dimensional array
-filled with the elements of *x* corresponding to the :py:data:`True`
-values of *obj*.  The search order will be :term:`row-major`,
-C-style. If *obj* has :py:data:`True` values at entries that are outside
-of the bounds of *x*, then an index error will be raised. If *obj* is
-smaller than *x* it is identical to filling it with :py:data:`False`.
+If ``obj.ndim == x.ndim``, ``x[obj]``
+returns a 1-dimensional array filled with the elements of *x*
+corresponding to the :py:data:`True` values of *obj*. The search order
+will be :term:`row-major`, C-style. An index error will be raised if
+the shape of *obj* does not match the corresponding dimensions of *x*,
+regardless of whether those values are :py:data:`True` or
+:py:data:`False`.
 
 A common use case for this is filtering for desired element values.
 For example, one may wish to select all entries from an array which
diff --git a/doc/source/user/basics.rec.rst b/doc/source/user/basics.rec.rst
index b56b4d177..640cfaa8b 100644
--- a/doc/source/user/basics.rec.rst
+++ b/doc/source/user/basics.rec.rst
@@ -166,6 +166,11 @@ attribute of the dtype object::
  >>> d.names
  ('x', 'y')
 
+The dtype of each individual field can be looked up by name::
+
+ >>> d['x']
+ dtype('int64')
+
 The field names may be modified by assigning to the ``names`` attribute using a
 sequence of strings of the same length.
 
diff --git a/doc/source/user/building.rst b/doc/source/user/building.rst
index 81f6d33a7..442bda4b3 100644
--- a/doc/source/user/building.rst
+++ b/doc/source/user/building.rst
@@ -3,31 +3,14 @@
 Building from source
 ====================
 
-There are two options for building NumPy- building with Gitpod or locally from
-source. Your choice depends on your operating system and familiarity with the
-command line.
-
-Gitpod
-------
-
-Gitpod is an open-source platform that automatically creates
-the correct development environment right in your browser, reducing the need to
-install local development environments and deal with incompatible dependencies.
-
-If you are a Windows user, unfamiliar with using the command line or building
-NumPy for the first time, it is often faster to build with Gitpod. Here are the
-in-depth instructions for building NumPy with `building NumPy with Gitpod`_.
-
-.. _building NumPy with Gitpod: https://numpy.org/devdocs/dev/development_gitpod.html
-
-Building locally
-----------------
-
-Building locally on your machine gives you
-more granular control. If you are a MacOS or Linux user familiar with using the
+Building locally on your machine gives you complete control over build options.
+If you are a MacOS or Linux user familiar with using the
 command line, you can continue with building NumPy locally by following the
 instructions below.
 
+.. note:: If you want to build NumPy for development purposes, please refer to 
+   :ref:`development-environment` for additional information.
+
 ..
   This page is referenced from numpy/numpy/__init__.py. Please keep its
   location in sync with the link there.
@@ -37,7 +20,7 @@ Prerequisites
 
 Building NumPy requires the following software installed:
 
-1) Python 3.8.x or newer
+1) Python 3.9.x or newer
 
    Please note that the Python development headers also need to be installed,
    e.g., on Debian/Ubuntu one needs to install both `python3` and
@@ -79,6 +62,10 @@ Building NumPy requires the following software installed:
 
    For building NumPy, you'll need a recent version of Cython.
 
+5) The NumPy source code
+
+   Clone the repository following the instructions in :doc:`/dev/index`.
+
 Basic Installation
 ------------------
 
diff --git a/doc/source/user/how-to-index.rst b/doc/source/user/how-to-index.rst
index 41061d5f4..97c451260 100644
--- a/doc/source/user/how-to-index.rst
+++ b/doc/source/user/how-to-index.rst
@@ -1,6 +1,6 @@
 .. currentmodule:: numpy
 
-.. _how-to-index.rst:
+.. _how-to-index:
 
 *****************************************
 How to index :class:`ndarrays <.ndarray>`
@@ -309,6 +309,30 @@ result as dimensions with size one::
     array([[[2, 2, 2, 2, 2]],
     <BLANKLINE>
            [[2, 2, 2, 2, 2]]])
+	   
+To get the indices of each maximum or minimum value for each
+(N-1)-dimensional array in an N-dimensional array, use :meth:`reshape`
+to reshape the array to a 2D array, apply :meth:`argmax` or :meth:`argmin`
+along ``axis=1`` and use :meth:`unravel_index` to recover the index of the
+values per slice::
+
+    >>> x = np.arange(2*2*3).reshape(2, 2, 3) % 7  # 3D example array
+    >>> x
+    array([[[0, 1, 2],
+            [3, 4, 5]],
+    <BLANKLINE>
+           [[6, 0, 1],
+            [2, 3, 4]]])
+    >>> x_2d = np.reshape(x, (x.shape[0], -1))
+    >>> indices_2d = np.argmax(x_2d, axis=1)
+    >>> indices_2d
+    array([5, 0])
+    >>> np.unravel_index(indices_2d, x.shape[1:])
+    (array([1, 0]), array([2, 0]))
+    
+The first array returned contains the indices along axis 1 in the original
+array, the second array contains the indices along axis 2. The highest
+value in ``x[0]`` is therefore ``x[0, 1, 2]``.
 
 Index the same ndarray multiple times efficiently
 =================================================
@@ -348,4 +372,4 @@ Exercises `6`_, `8`_, `10`_, `15`_, `16`_, `19`_, `20`_, `45`_, `59`_,
 .. _87: https://github.com/rougier/numpy-100/blob/master/100_Numpy_exercises_with_solutions.md#87-consider-a-16x16-array-how-to-get-the-block-sum-block-size-is-4x4-
 .. _90: https://github.com/rougier/numpy-100/blob/master/100_Numpy_exercises_with_solutions.md#90-given-an-arbitrary-number-of-vectors-build-the-cartesian-product-every-combinations-of-every-item-
 .. _93: https://github.com/rougier/numpy-100/blob/master/100_Numpy_exercises_with_solutions.md#93-consider-two-arrays-a-and-b-of-shape-83-and-22-how-to-find-rows-of-a-that-contain-elements-of-each-row-of-b-regardless-of-the-order-of-the-elements-in-b-
-.. _94: https://github.com/rougier/numpy-100/blob/master/100_Numpy_exercises_with_solutions.md#94-considering-a-10x3-matrix-extract-rows-with-unequal-values-eg-223-
-\ No newline at end of file
+.. _94: https://github.com/rougier/numpy-100/blob/master/100_Numpy_exercises_with_solutions.md#94-considering-a-10x3-matrix-extract-rows-with-unequal-values-eg-223-
diff --git a/doc/source/user/how-to-partition.rst b/doc/source/user/how-to-partition.rst
new file mode 100644
index 000000000..224519364
--- /dev/null
+++ b/doc/source/user/how-to-partition.rst
@@ -0,0 +1,268 @@
+.. _how-to-partition:
+
+=================================================
+How to create arrays with regularly-spaced values
+=================================================
+
+There are a few NumPy functions that are similar in application, but which
+provide slightly different results, which may cause confusion if one is not sure
+when and how to use them. The following guide aims to list these functions and
+describe their recommended usage.
+
+The functions mentioned here are
+
+* `numpy.linspace`
+* `numpy.arange`
+* `numpy.geomspace`
+* `numpy.logspace`
+* `numpy.meshgrid`
+* `numpy.mgrid`
+* `numpy.ogrid`
+
+1D domains (intervals)
+======================
+
+``linspace`` vs. ``arange``
+---------------------------
+
+Both `numpy.linspace` and `numpy.arange` provide ways to partition an interval
+(a 1D domain) into equal-length subintervals. These partitions will vary
+depending on the chosen starting and ending points, and the **step** (the length 
+of the subintervals).
+
+* **Use** `numpy.arange` **if you want integer steps.**
+
+  `numpy.arange` relies on step size to determine how many elements are in the
+  returned array, which excludes the endpoint. This is determined through the
+  ``step`` argument to ``arange``.
+
+  Example::
+
+    >>> np.arange(0, 10, 2)  # np.arange(start, stop, step)
+    array([0, 2, 4, 6, 8])
+
+  The arguments ``start`` and ``stop`` should be integer or real, but not
+  complex numbers. `numpy.arange` is similar to the Python built-in
+  :py:class:`range`.
+
+  Floating-point inaccuracies can make ``arange`` results with floating-point
+  numbers confusing. In this case, you should use `numpy.linspace` instead.
+
+* **Use** `numpy.linspace` **if you want the endpoint to be included in the
+  result, or if you are using a non-integer step size.**
+
+  `numpy.linspace` *can* include the endpoint and determines step size from the
+  `num` argument, which specifies the number of elements in the returned
+  array.
+  
+  The inclusion of the endpoint is determined by an optional boolean
+  argument ``endpoint``, which defaults to ``True``. Note that selecting
+  ``endpoint=False`` will change the step size computation, and the subsequent
+  output for the function.
+
+  Example::
+
+    >>> np.linspace(0.1, 0.2, num=5)  # np.linspace(start, stop, num)
+    array([0.1  , 0.125, 0.15 , 0.175, 0.2  ])
+    >>> np.linspace(0.1, 0.2, num=5, endpoint=False)
+    array([0.1, 0.12, 0.14, 0.16, 0.18])
+
+  `numpy.linspace` can also be used with complex arguments::
+
+    >>> np.linspace(1+1.j, 4, 5, dtype=np.complex64)
+    array([1.  +1.j  , 1.75+0.75j, 2.5 +0.5j , 3.25+0.25j, 4.  +0.j  ],
+          dtype=complex64)
+
+Other examples
+--------------
+
+1. Unexpected results may happen if floating point values are used as ``step``
+   in ``numpy.arange``. To avoid this, make sure all floating point conversion
+   happens after the computation of results. For example, replace
+
+   ::
+
+     >>> list(np.arange(0.1,0.4,0.1).round(1))
+     [0.1, 0.2, 0.3, 0.4]  # endpoint should not be included!
+
+   with
+
+   ::
+
+     >>> list(np.arange(1, 4, 1) / 10.0)
+     [0.1, 0.2, 0.3]  # expected result
+
+2. Note that
+
+   ::
+
+     >>> np.arange(0, 1.12, 0.04)
+     array([0.  , 0.04, 0.08, 0.12, 0.16, 0.2 , 0.24, 0.28, 0.32, 0.36, 0.4 ,
+            0.44, 0.48, 0.52, 0.56, 0.6 , 0.64, 0.68, 0.72, 0.76, 0.8 , 0.84,
+            0.88, 0.92, 0.96, 1.  , 1.04, 1.08, 1.12])
+
+   and
+
+   ::
+
+     >>> np.arange(0, 1.08, 0.04)
+     array([0.  , 0.04, 0.08, 0.12, 0.16, 0.2 , 0.24, 0.28, 0.32, 0.36, 0.4 ,
+            0.44, 0.48, 0.52, 0.56, 0.6 , 0.64, 0.68, 0.72, 0.76, 0.8 , 0.84,
+            0.88, 0.92, 0.96, 1.  , 1.04])
+
+   These differ because of numeric noise. When using floating point values, it
+   is possible that ``0 + 0.04 * 28 < 1.12``, and so ``1.12`` is in the
+   interval. In fact, this is exactly the case::
+
+     >>> 1.12/0.04
+     28.000000000000004
+
+   But ``0 + 0.04 * 27 >= 1.08`` so that 1.08 is excluded::
+
+     >>> 1.08/0.04
+     27.0
+
+   Alternatively, you could use ``np.arange(0, 28)*0.04`` which would always
+   give you precise control of the end point since it is integral::
+
+    >>> np.arange(0, 28)*0.04
+    array([0.  , 0.04, 0.08, 0.12, 0.16, 0.2 , 0.24, 0.28, 0.32, 0.36, 0.4 ,
+           0.44, 0.48, 0.52, 0.56, 0.6 , 0.64, 0.68, 0.72, 0.76, 0.8 , 0.84,
+           0.88, 0.92, 0.96, 1.  , 1.04, 1.08])
+
+
+``geomspace`` and ``logspace``
+------------------------------
+
+``numpy.geomspace`` is similar to ``numpy.linspace``, but with numbers spaced
+evenly on a log scale (a geometric progression). The endpoint is included in the
+result.
+
+Example::
+
+  >>> np.geomspace(2, 3, num=5)
+  array([2.        , 2.21336384, 2.44948974, 2.71080601, 3.        ])
+
+``numpy.logspace`` is similar to ``numpy.geomspace``, but with the start and end
+points specified as logarithms (with base 10 as default)::
+
+  >>> np.logspace(2, 3, num=5)
+  array([ 100.        ,  177.827941  ,  316.22776602,  562.34132519, 1000.        ])
+
+In linear space, the sequence starts at ``base ** start`` (``base`` to the power
+of ``start``) and ends with ``base ** stop``::
+
+  >>> np.logspace(2, 3, num=5, base=2)
+  array([4.        , 4.75682846, 5.65685425, 6.72717132, 8.        ])
+
+nD domains
+==========
+
+nD domains can be partitioned into *grids*. This can be done using one of the
+following functions.
+
+``meshgrid``
+------------
+
+The purpose of ``numpy.meshgrid`` is to create a rectangular grid out of a set
+of one-dimensional coordinate arrays.
+
+Given arrays
+
+ ::
+   
+   >>> x = np.array([0, 1, 2, 3])
+   >>> y = np.array([0, 1, 2, 3, 4, 5])
+
+``meshgrid`` will create two coordinate arrays, which can be used to generate
+the coordinate pairs determining this grid.
+
+ ::
+
+   >>> xx, yy = np.meshgrid(x, y)
+   >>> xx
+   array([[0, 1, 2, 3],
+          [0, 1, 2, 3],
+          [0, 1, 2, 3],
+          [0, 1, 2, 3],
+          [0, 1, 2, 3],
+          [0, 1, 2, 3]])
+   >>> yy
+   array([[0, 0, 0, 0],
+          [1, 1, 1, 1],
+          [2, 2, 2, 2],
+          [3, 3, 3, 3],
+          [4, 4, 4, 4],
+          [5, 5, 5, 5]])
+
+   >>> import matplotlib.pyplot as plt
+   >>> plt.plot(xx, yy, marker='.', color='k', linestyle='none')
+
+.. plot:: user/plots/meshgrid_plot.py
+  :align: center
+  :include-source: 0
+
+``mgrid``
+---------
+
+``numpy.mgrid`` can be used as a shortcut for creating meshgrids. It is not a
+function, but when indexed, returns a multidimensional meshgrid.
+
+::
+
+  >>> xx, yy = np.meshgrid(np.array([0, 1, 2, 3]), np.array([0, 1, 2, 3, 4, 5]))
+  >>> xx.T, yy.T
+  (array([[0, 0, 0, 0, 0, 0],
+          [1, 1, 1, 1, 1, 1],
+          [2, 2, 2, 2, 2, 2],
+          [3, 3, 3, 3, 3, 3]]),
+   array([[0, 1, 2, 3, 4, 5],
+          [0, 1, 2, 3, 4, 5],
+          [0, 1, 2, 3, 4, 5],
+          [0, 1, 2, 3, 4, 5]]))
+
+  >>> np.mgrid[0:4, 0:6]
+  array([[[0, 0, 0, 0, 0, 0],
+          [1, 1, 1, 1, 1, 1],
+          [2, 2, 2, 2, 2, 2],
+          [3, 3, 3, 3, 3, 3]],
+  <BLANKLINE>
+         [[0, 1, 2, 3, 4, 5],
+          [0, 1, 2, 3, 4, 5],
+          [0, 1, 2, 3, 4, 5],
+          [0, 1, 2, 3, 4, 5]]])
+
+
+``ogrid``
+---------
+
+Similar to ``numpy.mgrid``, ``numpy.ogrid`` returns an *open* multidimensional
+meshgrid. This means that when it is indexed, only one dimension of each
+returned array is greater than 1. This avoids repeating the data and thus saves
+memory, which is often desirable.
+
+These sparse coordinate grids are intended to be use with :ref:`broadcasting`.
+When all coordinates are used in an expression, broadcasting still leads to a
+fully-dimensonal result array.
+
+::
+
+   >>> np.ogrid[0:4, 0:6]
+   [array([[0],
+           [1],
+           [2],
+           [3]]), array([[0, 1, 2, 3, 4, 5]])]
+
+All three methods described here can be used to evaluate function values on a
+grid.
+
+::
+
+   >>> g = np.ogrid[0:4, 0:6]
+   >>> zg = np.sqrt(g[0]**2 + g[1]**2)
+   >>> g[0].shape, g[1].shape, zg.shape
+   ((4, 1), (1, 6), (4, 6))
+   >>> m = np.mgrid[0:4, 0:6]
+   >>> zm = np.sqrt(m[0]**2 + m[1]**2)
+   >>> np.array_equal(zm, zg)
+   True
diff --git a/doc/source/user/how-to-verify-bug.rst b/doc/source/user/how-to-verify-bug.rst
index 4fc58c707..6e76f453a 100644
--- a/doc/source/user/how-to-verify-bug.rst
+++ b/doc/source/user/how-to-verify-bug.rst
@@ -76,7 +76,7 @@ The report references NumPy version 1.18.4, so that is the version you need to
 install in this case.
 
 Since this bug is tied to a release and not a specific commit, a pre-built wheel
-installed in your virtual environment via `pip` will suffice::
+installed in your virtual environment via ``pip`` will suffice::
 
     pip install numpy==1.18.4
 
diff --git a/doc/source/user/howtos_index.rst b/doc/source/user/howtos_index.rst
index 0582d82f2..ca30f7e91 100644
--- a/doc/source/user/howtos_index.rst
+++ b/doc/source/user/howtos_index.rst
@@ -1,7 +1,7 @@
 .. _howtos:
 
 ################
-NumPy How Tos
+NumPy how-tos
 ################
 
 These documents are intended as recipes to common tasks using NumPy. For
@@ -15,3 +15,4 @@ the package, see the :ref:`API reference <reference>`.
    how-to-io
    how-to-index
    how-to-verify-bug
+   how-to-partition
diff --git a/doc/source/user/plots/meshgrid_plot.py b/doc/source/user/plots/meshgrid_plot.py
new file mode 100644
index 000000000..91032145a
--- /dev/null
+++ b/doc/source/user/plots/meshgrid_plot.py
@@ -0,0 +1,7 @@
+import numpy as np
+import matplotlib.pyplot as plt
+
+x = np.array([0, 1, 2, 3])
+y = np.array([0, 1, 2, 3, 4, 5])
+xx, yy = np.meshgrid(x, y)
+plt.plot(xx, yy, marker='o', color='k', linestyle='none')
diff --git a/doc/source/user/quickstart.rst b/doc/source/user/quickstart.rst
index d138242d7..783d5a447 100644
--- a/doc/source/user/quickstart.rst
+++ b/doc/source/user/quickstart.rst
@@ -1482,7 +1482,7 @@ Further reading
 
 -  The `Python tutorial <https://docs.python.org/tutorial/>`__
 -  :ref:`reference`
--  `SciPy Tutorial <https://docs.scipy.org/doc/scipy/reference/tutorial/index.html>`__
+-  `SciPy Tutorial <https://docs.scipy.org/doc/scipy/tutorial/index.html>`__
 -  `SciPy Lecture Notes <https://scipy-lectures.org>`__
 -  A `matlab, R, IDL, NumPy/SciPy dictionary <http://mathesaurus.sf.net/>`__
 -  :doc:`tutorial-svd <numpy-tutorials:content/tutorial-svd>`
diff --git a/doc/source/user/theory.broadcasting.rst b/doc/source/user/theory.broadcasting.rst
index a4973e4e6..f277d4afd 100644
--- a/doc/source/user/theory.broadcasting.rst
+++ b/doc/source/user/theory.broadcasting.rst
@@ -1,7 +1,7 @@
 :orphan:
 
 ===========================
-Array Broadcasting in Numpy
+Array broadcasting in Numpy
 ===========================
 
 .. 
diff --git a/doc/source/user/troubleshooting-importerror.rst b/doc/source/user/troubleshooting-importerror.rst
index 2227d7b07..552748a6a 100644
--- a/doc/source/user/troubleshooting-importerror.rst
+++ b/doc/source/user/troubleshooting-importerror.rst
@@ -6,9 +6,9 @@
    to this page.
 
 
-***************************
-Troubleshooting ImportError
-***************************
+***************
+Troubleshooting
+***************
 
 .. note::
 
@@ -183,3 +183,65 @@ that usually works is to upgrade the NumPy version::
 
     pip install numpy --upgrade
 
+Segfaults or crashes
+====================
+
+NumPy tries to use advanced CPU features (SIMD) to speed up operations. If you
+are getting an "illegal instruction" error or a segfault, one cause could be
+that the environment claims it can support one or more of these features but
+actually cannot. This can happen inside a docker image or a VM (qemu, VMWare,
+...)
+
+You can use the output of ``np.show_runtime()`` to show which SIMD features are
+detected. For instance::
+
+    >>> np.show_runtime()
+    WARNING: `threadpoolctl` not found in system! Install it by `pip install \
+    threadpoolctl`. Once installed, try `np.show_runtime` again for more detailed
+    build information
+    [{'simd_extensions': {'baseline': ['SSE', 'SSE2', 'SSE3'],
+                          'found': ['SSSE3',
+                                    'SSE41',
+                                    'POPCNT',
+                                    'SSE42',
+                                    'AVX',
+                                    'F16C',
+                                    'FMA3',
+                                    'AVX2'],
+                          'not_found': ['AVX512F',
+                                        'AVX512CD',
+                                        'AVX512_KNL',
+                                        'AVX512_KNM',
+                                        'AVX512_SKX',
+                                        'AVX512_CLX',
+                                        'AVX512_CNL',
+                                        'AVX512_ICL']}}]
+
+In this case, it shows AVX2 and FMA3 under the ``found`` section, so you can
+try disabling them by setting ``NPY_DISABLE_CPU_FEATURES="AVX2,FMA3"`` in your
+environment before running python (for cmd.exe on windows)::
+
+    >SET NPY_DISABLE_CPU_FEATURES="AVX2,FMA3"
+    >python <myprogram.py>
+
+By installing threadpoolctl ``np.show_runtime()`` will show additional information::
+
+    ...
+    {'architecture': 'Zen',
+      'filepath': '/tmp/venv3/lib/python3.9/site-packages/numpy.libs/libopenblas64_p-r0-15028c96.3.21.so',
+      'internal_api': 'openblas',
+      'num_threads': 24,
+      'prefix': 'libopenblas',
+      'threading_layer': 'pthreads',
+      'user_api': 'blas',
+      'version': '0.3.21'}]
+
+If you use the wheel from PyPI, it contains code from the OpenBLAS project to
+speed up matrix operations. This code too can try to use SIMD instructions. It
+has a different mechanism for choosing which to use, based on a CPU
+architecture, You can override this architecture by setting
+``OPENBLAS_CORETYPE``: a minimal value for ``x86_64`` is
+``OPENBLAS_CORETYPE=Haswell``.  This too needs to be set before running your
+python (this time for posix)::
+
+    $ OPENBLAS_CORETYPE=Haswell python <myprogram.py>
diff --git a/doc_requirements.txt b/doc_requirements.txt
index 4c7396efe..b8a82aff7 100644
--- a/doc_requirements.txt
+++ b/doc_requirements.txt
@@ -1,13 +1,13 @@
 # doxygen required, use apt-get or dnf
 sphinx>=4.5.0
 numpydoc==1.4
-pydata-sphinx-theme==0.9.0
+pydata-sphinx-theme==0.13.3
 sphinx-design
 ipython!=8.1.0
 scipy
 matplotlib
 pandas
-breathe
+breathe>4.33.0
 
 # needed to build release notes
 towncrier
diff --git a/environment.yml b/environment.yml
index b99fa1256..a6a309c8d 100644
--- a/environment.yml
+++ b/environment.yml
@@ -13,6 +13,11 @@ dependencies:
   - openblas
   - nomkl
   - setuptools=59.2.0
+  - meson >= 0.64.0
+  - ninja
+  - pkg-config
+  - meson-python
+  - pip   # so you can use pip to install spin
   # For testing
   - pytest
   - pytest-cov
@@ -29,7 +34,7 @@ dependencies:
   - scipy
   - pandas
   - matplotlib
-  - pydata-sphinx-theme=0.9.0
+  - pydata-sphinx-theme=0.13.3
   - doxygen
   # NOTE: breathe 4.33.0 collides with sphinx.ext.graphviz
   - breathe>4.33.0
diff --git a/generate_version.py b/generate_version.py
new file mode 100644
index 000000000..b3c8a3978
--- /dev/null
+++ b/generate_version.py
@@ -0,0 +1,40 @@
+# Note: This file has to live next to versioneer.py or it will not work
+import argparse
+import os
+
+import versioneer
+
+
+def write_version_info(path):
+    vinfo = versioneer.get_versions()
+    full_version = vinfo['version']
+    git_revision = vinfo['full-revisionid']
+
+    if os.environ.get("MESON_DIST_ROOT"):
+        path = os.path.join(os.environ.get("MESON_DIST_ROOT"), path)
+
+    with open(path, "w") as f:
+        f.write("def get_versions():\n")
+        f.write("    return {\n")
+        f.write(f"        'full-revisionid': '{git_revision}',\n")
+        f.write(f"        'version': '{full_version}'\n")
+        f.write("}")
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-o", "--outfile", type=str, help="Path to write version info to"
+    )
+    args = parser.parse_args()
+
+    if not args.outfile.endswith(".py"):
+        raise ValueError(
+            f"Output file must be a Python file. "
+            f"Got: {args.outfile} as filename instead"
+        )
+
+    write_version_info(args.outfile)
+
+
+main()
diff --git a/linter_requirements.txt b/linter_requirements.txt
index 6ed26c5c0..2e0298bae 100644
--- a/linter_requirements.txt
+++ b/linter_requirements.txt
@@ -1,2 +1,2 @@
 pycodestyle==2.8.0
-GitPython==3.1.13
-\ No newline at end of file
+GitPython>=3.1.30
diff --git a/meson.build b/meson.build
new file mode 100644
index 000000000..47e71efc0
--- /dev/null
+++ b/meson.build
@@ -0,0 +1,65 @@
+project(
+  'NumPy',
+  'c', 'cpp', 'cython',
+  # Note that the git commit hash cannot be added dynamically here
+  # It is dynamically added upon import by versioneer
+  # See `numpy/__init__.py`
+  version: '1.24.0.dev0',
+  license: 'BSD-3',
+  meson_version: '>= 0.64.0',
+  default_options: [
+    'buildtype=debugoptimized',
+    'b_ndebug=if-release',
+    'c_std=c99',
+    'cpp_std=c++17',
+    'blas=openblas',
+    'lapack=openblas',
+    'pkgconfig.relocatable=true',
+  ],
+)
+
+fs = import('fs')
+
+cc = meson.get_compiler('c')
+cpp = meson.get_compiler('cpp')
+
+# Check compiler is recent enough (see the SciPy Toolchain Roadmap for details)
+if cc.get_id() == 'gcc'
+  if not cc.version().version_compare('>=8.4')
+    error('NumPy requires GCC >= 8.4')
+  endif
+elif cc.get_id() == 'msvc'
+  if not cc.version().version_compare('>=19.20')
+    error('NumPy requires at least vc142 (default with Visual Studio 2019) ' + \
+          'when building with MSVC')
+  endif
+endif
+
+# https://mesonbuild.com/Python-module.html
+py_mod = import('python')
+py = py_mod.find_installation(pure: false)
+py_dep = py.dependency()
+
+if not cc.has_header('Python.h', dependencies: py_dep)
+  error('Cannot compile `Python.h`. Perhaps you need to install python-dev|python-devel')
+endif
+
+# Generate version number. Note that this will not (yet) update the version
+# number seen by pip or reflected in wheel filenames. See
+# https://github.com/mesonbuild/meson-python/issues/159 for that.
+versioneer = files('generate_version.py')
+if fs.exists('_version_meson.py')
+    py.install_sources('_version_meson.py', subdir: 'numpy')
+else
+    custom_target('write_version_file',
+        output: '_version_meson.py',
+        command: [py, versioneer, '-o', '@OUTPUT@'],
+        build_by_default: true,
+        build_always_stale: true,
+        install: true,
+        install_dir: py.get_install_dir() / 'numpy'
+    )
+    meson.add_dist_script(py, versioneer, '-o', '_version_meson.py')
+endif
+
+subdir('numpy')
diff --git a/meson_options.txt b/meson_options.txt
new file mode 100644
index 000000000..f53432def
--- /dev/null
+++ b/meson_options.txt
@@ -0,0 +1,13 @@
+option('blas', type: 'string', value: 'openblas',
+        description: 'option for BLAS library switching')
+option('lapack', type: 'string', value: 'openblas',
+        description: 'option for LAPACK library switching')
+option('disable-svml', type: 'boolean', value: 'false',
+        description: 'Disable building against SVML')
+option('disable-threading', type: 'boolean', value: 'false',
+        description: 'Disable threading support (see `NPY_ALLOW_THREADS` docs)')
+# TODO: flip value to 'false' once we have `npy_cpu_dispatch_config.h` & co.
+option('disable-simd-optimizations', type: 'boolean', value: 'true',
+        description: 'Disable SIMD features beyond the baseline ones')
+option('relaxed-strides-debug', type: 'boolean', value: 'false',
+        description: 'Enable relaxed strides debug mode (see `NPY_RELAXED_STRIDES_DEBUG` docs)')
diff --git a/numpy/__config__.py.in b/numpy/__config__.py.in
new file mode 100644
index 000000000..6c6c21cb8
--- /dev/null
+++ b/numpy/__config__.py.in
@@ -0,0 +1,156 @@
+# This file is generated by numpy's build process
+# It contains system_info results at the time of building this package.
+from enum import Enum
+from numpy.core._multiarray_umath import (
+    __cpu_features__,
+    __cpu_baseline__,
+    __cpu_dispatch__,
+)
+
+__all__ = ["show"]
+_built_with_meson = True
+
+
+class DisplayModes(Enum):
+    stdout = "stdout"
+    dicts = "dicts"
+
+
+def _cleanup(d):
+    """
+    Removes empty values in a `dict` recursively
+    This ensures we remove values that Meson could not provide to CONFIG
+    """
+    if isinstance(d, dict):
+        return {k: _cleanup(v) for k, v in d.items() if v and _cleanup(v)}
+    else:
+        return d
+
+
+CONFIG = _cleanup(
+    {
+        "Compilers": {
+            "c": {
+                "name": "@C_COMP@",
+                "linker": "@C_COMP_LINKER_ID@",
+                "version": "@C_COMP_VERSION@",
+                "commands": "@C_COMP_CMD_ARRAY@",
+            },
+            "cython": {
+                "name": "@CYTHON_COMP@",
+                "linker": "@CYTHON_COMP_LINKER_ID@",
+                "version": "@CYTHON_COMP_VERSION@",
+                "commands": "@CYTHON_COMP_CMD_ARRAY@",
+            },
+            "c++": {
+                "name": "@CPP_COMP@",
+                "linker": "@CPP_COMP_LINKER_ID@",
+                "version": "@CPP_COMP_VERSION@",
+                "commands": "@CPP_COMP_CMD_ARRAY@",
+            },
+        },
+        "Machine Information": {
+            "host": {
+                "cpu": "@HOST_CPU@",
+                "family": "@HOST_CPU_FAMILY@",
+                "endian": "@HOST_CPU_ENDIAN@",
+                "system": "@HOST_CPU_SYSTEM@",
+            },
+            "build": {
+                "cpu": "@BUILD_CPU@",
+                "family": "@BUILD_CPU_FAMILY@",
+                "endian": "@BUILD_CPU_ENDIAN@",
+                "system": "@BUILD_CPU_SYSTEM@",
+            },
+            "cross-compiled": bool("@CROSS_COMPILED@".lower().replace("false", "")),
+        },
+        "Build Dependencies": {
+            "blas": {
+                "name": "@BLAS_NAME@",
+                "found": bool("@BLAS_FOUND@".lower().replace("false", "")),
+                "version": "@BLAS_VERSION@",
+                "detection method": "@BLAS_TYPE_NAME@",
+                "include directory": r"@BLAS_INCLUDEDIR@",
+                "lib directory": r"@BLAS_LIBDIR@",
+                "openblas configuration": "@BLAS_OPENBLAS_CONFIG@",
+                "pc file directory": r"@BLAS_PCFILEDIR@",
+            },
+            "lapack": {
+                "name": "@LAPACK_NAME@",
+                "found": bool("@LAPACK_FOUND@".lower().replace("false", "")),
+                "version": "@LAPACK_VERSION@",
+                "detection method": "@LAPACK_TYPE_NAME@",
+                "include directory": r"@LAPACK_INCLUDEDIR@",
+                "lib directory": r"@LAPACK_LIBDIR@",
+                "openblas configuration": "@LAPACK_OPENBLAS_CONFIG@",
+                "pc file directory": r"@LAPACK_PCFILEDIR@",
+            },
+        },
+        "Python Information": {
+            "path": r"@PYTHON_PATH@",
+            "version": "@PYTHON_VERSION@",
+        },
+        "SIMD Extensions": {
+            "baseline": __cpu_baseline__,
+            "found": [
+                feature for feature in __cpu_dispatch__ if __cpu_features__[feature]
+            ],
+            "not found": [
+                feature for feature in __cpu_dispatch__ if not __cpu_features__[feature]
+            ],
+        },
+    }
+)
+
+
+def _check_pyyaml():
+    import yaml
+
+    return yaml
+
+
+def show(mode=DisplayModes.stdout.value):
+    """
+    Show libraries and system information on which NumPy was built
+    and is being used
+
+    Parameters
+    ----------
+    mode : {`'stdout'`, `'dicts'`}, optional.
+        Indicates how to display the config information.
+        `'stdout'` prints to console, `'dicts'` returns a dictionary
+        of the configuration.
+
+    Returns
+    -------
+    out : {`dict`, `None`}
+        If mode is `'dicts'`, a dict is returned, else None
+
+    See Also
+    --------
+    get_include : Returns the directory containing NumPy C
+                  header files.
+
+    Notes
+    -----
+    1. The `'stdout'` mode will give more readable
+       output if ``pyyaml`` is installed
+
+    """
+    if mode == DisplayModes.stdout.value:
+        try:  # Non-standard library, check import
+            yaml = _check_pyyaml()
+
+            print(yaml.dump(CONFIG))
+        except ModuleNotFoundError:
+            import warnings
+            import json
+
+            warnings.warn("Install `pyyaml` for better output", stacklevel=1)
+            print(json.dumps(CONFIG, indent=2))
+    elif mode == DisplayModes.dicts.value:
+        return CONFIG
+    else:
+        raise AttributeError(
+            f"Invalid `mode`, use one of: {', '.join([e.value for e in DisplayModes])}"
+        )
diff --git a/numpy/__init__.cython-30.pxd b/numpy/__init__.cython-30.pxd
index 5fd6086e0..0dd2fff2b 100644
--- a/numpy/__init__.cython-30.pxd
+++ b/numpy/__init__.cython-30.pxd
@@ -401,7 +401,7 @@ cdef extern from "numpy/arrayobject.h":
     int PyArray_TYPE(ndarray arr) nogil
 
     object PyArray_GETITEM(ndarray arr, void *itemptr)
-    int PyArray_SETITEM(ndarray arr, void *itemptr, object obj)
+    int PyArray_SETITEM(ndarray arr, void *itemptr, object obj) except -1
 
     bint PyTypeNum_ISBOOL(int) nogil
     bint PyTypeNum_ISUNSIGNED(int) nogil
@@ -452,7 +452,7 @@ cdef extern from "numpy/arrayobject.h":
 
     bint PyArray_SAFEALIGNEDCOPY(ndarray) nogil
     bint PyArray_ISNBO(char) nogil              # works on ndarray.byteorder
-    bint PyArray_IsNativeByteOrder(char) nogil  # works on ndarray.byteorder
+    bint PyArray_IsNativeByteOrder(char) nogil # works on ndarray.byteorder
     bint PyArray_ISNOTSWAPPED(ndarray) nogil
     bint PyArray_ISBYTESWAPPED(ndarray) nogil
 
@@ -515,7 +515,6 @@ cdef extern from "numpy/arrayobject.h":
     void* PyArray_GETPTR3(ndarray m, npy_intp i, npy_intp j, npy_intp k) nogil
     void* PyArray_GETPTR4(ndarray m, npy_intp i, npy_intp j, npy_intp k, npy_intp l) nogil
 
-    void PyArray_XDECREF_ERR(ndarray)
     # Cannot be supported due to out arg
     # void PyArray_DESCR_REPLACE(descr)
 
@@ -549,21 +548,21 @@ cdef extern from "numpy/arrayobject.h":
     # Functions taking dtype and returning object/ndarray are disabled
     # for now as they steal dtype references. I'm conservative and disable
     # more than is probably needed until it can be checked further.
-    int PyArray_SetNumericOps        (object)
+    int PyArray_SetNumericOps (object) except -1
     object PyArray_GetNumericOps ()
-    int PyArray_INCREF (ndarray)
-    int PyArray_XDECREF (ndarray)
+    int PyArray_INCREF (ndarray) except *  # uses PyArray_Item_INCREF...
+    int PyArray_XDECREF (ndarray) except *  # uses PyArray_Item_DECREF...
     void PyArray_SetStringFunction (object, int)
     dtype PyArray_DescrFromType (int)
     object PyArray_TypeObjectFromType (int)
     char * PyArray_Zero (ndarray)
     char * PyArray_One (ndarray)
     #object PyArray_CastToType (ndarray, dtype, int)
-    int PyArray_CastTo (ndarray, ndarray)
-    int PyArray_CastAnyTo (ndarray, ndarray)
-    int PyArray_CanCastSafely (int, int)
-    npy_bool PyArray_CanCastTo (dtype, dtype)
-    int PyArray_ObjectType (object, int)
+    int PyArray_CastTo (ndarray, ndarray) except -1
+    int PyArray_CastAnyTo (ndarray, ndarray) except -1
+    int PyArray_CanCastSafely (int, int)  # writes errors
+    npy_bool PyArray_CanCastTo (dtype, dtype)  # writes errors
+    int PyArray_ObjectType (object, int) except 0
     dtype PyArray_DescrFromObject (object, dtype)
     #ndarray* PyArray_ConvertToCommonType (object, int *)
     dtype PyArray_DescrFromScalar (object)
@@ -587,34 +586,34 @@ cdef extern from "numpy/arrayobject.h":
     #object PyArray_FromIter (object, dtype, npy_intp)
     object PyArray_Return (ndarray)
     #object PyArray_GetField (ndarray, dtype, int)
-    #int PyArray_SetField (ndarray, dtype, int, object)
+    #int PyArray_SetField (ndarray, dtype, int, object) except -1
     object PyArray_Byteswap (ndarray, npy_bool)
     object PyArray_Resize (ndarray, PyArray_Dims *, int, NPY_ORDER)
-    int PyArray_MoveInto (ndarray, ndarray)
-    int PyArray_CopyInto (ndarray, ndarray)
-    int PyArray_CopyAnyInto (ndarray, ndarray)
-    int PyArray_CopyObject (ndarray, object)
+    int PyArray_MoveInto (ndarray, ndarray) except -1
+    int PyArray_CopyInto (ndarray, ndarray) except -1
+    int PyArray_CopyAnyInto (ndarray, ndarray) except -1
+    int PyArray_CopyObject (ndarray, object) except -1
     object PyArray_NewCopy (ndarray, NPY_ORDER)
     object PyArray_ToList (ndarray)
     object PyArray_ToString (ndarray, NPY_ORDER)
-    int PyArray_ToFile (ndarray, stdio.FILE *, char *, char *)
-    int PyArray_Dump (object, object, int)
+    int PyArray_ToFile (ndarray, stdio.FILE *, char *, char *) except -1
+    int PyArray_Dump (object, object, int) except -1
     object PyArray_Dumps (object, int)
-    int PyArray_ValidType (int)
+    int PyArray_ValidType (int)  # Cannot error
     void PyArray_UpdateFlags (ndarray, int)
     object PyArray_New (type, int, npy_intp *, int, npy_intp *, void *, int, int, object)
     #object PyArray_NewFromDescr (type, dtype, int, npy_intp *, npy_intp *, void *, int, object)
     #dtype PyArray_DescrNew (dtype)
     dtype PyArray_DescrNewFromType (int)
-    double PyArray_GetPriority (object, double)
+    double PyArray_GetPriority (object, double)  # clears errors as of 1.25
     object PyArray_IterNew (object)
     object PyArray_MultiIterNew (int, ...)
 
-    int PyArray_PyIntAsInt (object)
+    int PyArray_PyIntAsInt (object) except? -1
     npy_intp PyArray_PyIntAsIntp (object)
-    int PyArray_Broadcast (broadcast)
-    void PyArray_FillObjectArray (ndarray, object)
-    int PyArray_FillWithScalar (ndarray, object)
+    int PyArray_Broadcast (broadcast) except -1
+    void PyArray_FillObjectArray (ndarray, object) except *
+    int PyArray_FillWithScalar (ndarray, object) except -1
     npy_bool PyArray_CheckStrides (int, int, npy_intp, npy_intp, npy_intp *, npy_intp *)
     dtype PyArray_DescrNewByteorder (dtype, char)
     object PyArray_IterAllButAxis (object, int *)
@@ -628,10 +627,10 @@ cdef extern from "numpy/arrayobject.h":
     object PyArray_NewFlagsObject (object)
     npy_bool PyArray_CanCastScalar (type, type)
     #int PyArray_CompareUCS4 (npy_ucs4 *, npy_ucs4 *, register size_t)
-    int PyArray_RemoveSmallest (broadcast)
+    int PyArray_RemoveSmallest (broadcast) except -1
     int PyArray_ElementStrides (object)
-    void PyArray_Item_INCREF (char *, dtype)
-    void PyArray_Item_XDECREF (char *, dtype)
+    void PyArray_Item_INCREF (char *, dtype) except *
+    void PyArray_Item_XDECREF (char *, dtype) except *
     object PyArray_FieldNames (object)
     object PyArray_Transpose (ndarray, PyArray_Dims *)
     object PyArray_TakeFrom (ndarray, object, int, ndarray, NPY_CLIPMODE)
@@ -639,7 +638,7 @@ cdef extern from "numpy/arrayobject.h":
     object PyArray_PutMask (ndarray, object, object)
     object PyArray_Repeat (ndarray, object, int)
     object PyArray_Choose (ndarray, object, ndarray, NPY_CLIPMODE)
-    int PyArray_Sort (ndarray, int, NPY_SORTKIND)
+    int PyArray_Sort (ndarray, int, NPY_SORTKIND) except -1
     object PyArray_ArgSort (ndarray, int, NPY_SORTKIND)
     object PyArray_SearchSorted (ndarray, object, NPY_SEARCHSIDE, PyObject *)
     object PyArray_ArgMax (ndarray, int, ndarray)
@@ -677,49 +676,49 @@ cdef extern from "numpy/arrayobject.h":
     #int PyArray_As2D (object*, char ***, int *, int *, int)
     int PyArray_Free (object, void *)
     #int PyArray_Converter (object, object*)
-    int PyArray_IntpFromSequence (object, npy_intp *, int)
+    int PyArray_IntpFromSequence (object, npy_intp *, int) except -1
     object PyArray_Concatenate (object, int)
     object PyArray_InnerProduct (object, object)
     object PyArray_MatrixProduct (object, object)
     object PyArray_CopyAndTranspose (object)
     object PyArray_Correlate (object, object, int)
     int PyArray_TypestrConvert (int, int)
-    #int PyArray_DescrConverter (object, dtype*)
-    #int PyArray_DescrConverter2 (object, dtype*)
-    int PyArray_IntpConverter (object, PyArray_Dims *)
-    #int PyArray_BufferConverter (object, chunk)
-    int PyArray_AxisConverter (object, int *)
-    int PyArray_BoolConverter (object, npy_bool *)
-    int PyArray_ByteorderConverter (object, char *)
-    int PyArray_OrderConverter (object, NPY_ORDER *)
-    unsigned char PyArray_EquivTypes (dtype, dtype)
+    #int PyArray_DescrConverter (object, dtype*) except 0
+    #int PyArray_DescrConverter2 (object, dtype*) except 0
+    int PyArray_IntpConverter (object, PyArray_Dims *) except 0
+    #int PyArray_BufferConverter (object, chunk) except 0
+    int PyArray_AxisConverter (object, int *) except 0
+    int PyArray_BoolConverter (object, npy_bool *) except 0
+    int PyArray_ByteorderConverter (object, char *) except 0
+    int PyArray_OrderConverter (object, NPY_ORDER *) except 0
+    unsigned char PyArray_EquivTypes (dtype, dtype)  # clears errors
     #object PyArray_Zeros (int, npy_intp *, dtype, int)
     #object PyArray_Empty (int, npy_intp *, dtype, int)
     object PyArray_Where (object, object, object)
     object PyArray_Arange (double, double, double, int)
     #object PyArray_ArangeObj (object, object, object, dtype)
-    int PyArray_SortkindConverter (object, NPY_SORTKIND *)
+    int PyArray_SortkindConverter (object, NPY_SORTKIND *) except 0
     object PyArray_LexSort (object, int)
     object PyArray_Round (ndarray, int, ndarray)
     unsigned char PyArray_EquivTypenums (int, int)
-    int PyArray_RegisterDataType (dtype)
-    int PyArray_RegisterCastFunc (dtype, int, PyArray_VectorUnaryFunc *)
-    int PyArray_RegisterCanCast (dtype, int, NPY_SCALARKIND)
+    int PyArray_RegisterDataType (dtype) except -1
+    int PyArray_RegisterCastFunc (dtype, int, PyArray_VectorUnaryFunc *) except -1
+    int PyArray_RegisterCanCast (dtype, int, NPY_SCALARKIND) except -1
     #void PyArray_InitArrFuncs (PyArray_ArrFuncs *)
     object PyArray_IntTupleFromIntp (int, npy_intp *)
     int PyArray_TypeNumFromName (char *)
-    int PyArray_ClipmodeConverter (object, NPY_CLIPMODE *)
-    #int PyArray_OutputConverter (object, ndarray*)
+    int PyArray_ClipmodeConverter (object, NPY_CLIPMODE *) except 0
+    #int PyArray_OutputConverter (object, ndarray*) except 0
     object PyArray_BroadcastToShape (object, npy_intp *, int)
     void _PyArray_SigintHandler (int)
     void* _PyArray_GetSigintBuf ()
-    #int PyArray_DescrAlignConverter (object, dtype*)
-    #int PyArray_DescrAlignConverter2 (object, dtype*)
-    int PyArray_SearchsideConverter (object, void *)
+    #int PyArray_DescrAlignConverter (object, dtype*) except 0
+    #int PyArray_DescrAlignConverter2 (object, dtype*) except 0
+    int PyArray_SearchsideConverter (object, void *) except 0
     object PyArray_CheckAxis (ndarray, int *, int)
     npy_intp PyArray_OverflowMultiplyList (npy_intp *, int)
     int PyArray_CompareString (char *, char *, size_t)
-    int PyArray_SetBaseObject(ndarray, base)  # NOTE: steals a reference to base! Use "set_array_base()" instead.
+    int PyArray_SetBaseObject(ndarray, base) except -1 # NOTE: steals a reference to base! Use "set_array_base()" instead.
 
 
 # Typedefs that matches the runtime dtype objects in
@@ -753,11 +752,9 @@ ctypedef double complex complex128_t
 # The int types are mapped a bit surprising --
 # numpy.int corresponds to 'l' and numpy.long to 'q'
 ctypedef npy_long       int_t
-ctypedef npy_longlong   long_t
 ctypedef npy_longlong   longlong_t
 
 ctypedef npy_ulong      uint_t
-ctypedef npy_ulonglong  ulong_t
 ctypedef npy_ulonglong  ulonglong_t
 
 ctypedef npy_intp       intp_t
@@ -908,7 +905,7 @@ cdef extern from "numpy/ufuncobject.h":
     object PyUFunc_FromFuncAndData(PyUFuncGenericFunction *,
           void **, char *, int, int, int, int, char *, char *, int)
     int PyUFunc_RegisterLoopForType(ufunc, int,
-                                    PyUFuncGenericFunction, int *, void *)
+                                    PyUFuncGenericFunction, int *, void *) except -1
     void PyUFunc_f_f_As_d_d \
          (char **, npy_intp *, npy_intp *, void *)
     void PyUFunc_d_d \
@@ -958,7 +955,7 @@ cdef extern from "numpy/ufuncobject.h":
     void PyUFunc_clearfperr()
     int PyUFunc_getfperr()
     int PyUFunc_handlefperr \
-        (int, PyObject *, int, int *)
+        (int, PyObject *, int, int *) except -1
     int PyUFunc_ReplaceLoopBySignature \
         (ufunc, PyUFuncGenericFunction, int *, PyUFuncGenericFunction *)
     object PyUFunc_FromFuncAndDataAndSignature \
diff --git a/numpy/__init__.pxd b/numpy/__init__.pxd
index 03db9a0c1..47d9294c1 100644
--- a/numpy/__init__.pxd
+++ b/numpy/__init__.pxd
@@ -359,7 +359,7 @@ cdef extern from "numpy/arrayobject.h":
     int PyArray_TYPE(ndarray arr) nogil
 
     object PyArray_GETITEM(ndarray arr, void *itemptr)
-    int PyArray_SETITEM(ndarray arr, void *itemptr, object obj)
+    int PyArray_SETITEM(ndarray arr, void *itemptr, object obj) except -1
 
     bint PyTypeNum_ISBOOL(int) nogil
     bint PyTypeNum_ISUNSIGNED(int) nogil
@@ -473,7 +473,6 @@ cdef extern from "numpy/arrayobject.h":
     void* PyArray_GETPTR3(ndarray m, npy_intp i, npy_intp j, npy_intp k) nogil
     void* PyArray_GETPTR4(ndarray m, npy_intp i, npy_intp j, npy_intp k, npy_intp l) nogil
 
-    void PyArray_XDECREF_ERR(ndarray)
     # Cannot be supported due to out arg
     # void PyArray_DESCR_REPLACE(descr)
 
@@ -507,21 +506,21 @@ cdef extern from "numpy/arrayobject.h":
     # Functions taking dtype and returning object/ndarray are disabled
     # for now as they steal dtype references. I'm conservative and disable
     # more than is probably needed until it can be checked further.
-    int PyArray_SetNumericOps        (object)
+    int PyArray_SetNumericOps (object) except -1
     object PyArray_GetNumericOps ()
-    int PyArray_INCREF (ndarray)
-    int PyArray_XDECREF (ndarray)
+    int PyArray_INCREF (ndarray) except *  # uses PyArray_Item_INCREF...
+    int PyArray_XDECREF (ndarray) except *  # uses PyArray_Item_DECREF...
     void PyArray_SetStringFunction (object, int)
     dtype PyArray_DescrFromType (int)
     object PyArray_TypeObjectFromType (int)
     char * PyArray_Zero (ndarray)
     char * PyArray_One (ndarray)
     #object PyArray_CastToType (ndarray, dtype, int)
-    int PyArray_CastTo (ndarray, ndarray)
-    int PyArray_CastAnyTo (ndarray, ndarray)
-    int PyArray_CanCastSafely (int, int)
-    npy_bool PyArray_CanCastTo (dtype, dtype)
-    int PyArray_ObjectType (object, int)
+    int PyArray_CastTo (ndarray, ndarray) except -1
+    int PyArray_CastAnyTo (ndarray, ndarray) except -1
+    int PyArray_CanCastSafely (int, int)  # writes errors
+    npy_bool PyArray_CanCastTo (dtype, dtype)  # writes errors
+    int PyArray_ObjectType (object, int) except 0
     dtype PyArray_DescrFromObject (object, dtype)
     #ndarray* PyArray_ConvertToCommonType (object, int *)
     dtype PyArray_DescrFromScalar (object)
@@ -545,34 +544,34 @@ cdef extern from "numpy/arrayobject.h":
     #object PyArray_FromIter (object, dtype, npy_intp)
     object PyArray_Return (ndarray)
     #object PyArray_GetField (ndarray, dtype, int)
-    #int PyArray_SetField (ndarray, dtype, int, object)
+    #int PyArray_SetField (ndarray, dtype, int, object) except -1
     object PyArray_Byteswap (ndarray, npy_bool)
     object PyArray_Resize (ndarray, PyArray_Dims *, int, NPY_ORDER)
-    int PyArray_MoveInto (ndarray, ndarray)
-    int PyArray_CopyInto (ndarray, ndarray)
-    int PyArray_CopyAnyInto (ndarray, ndarray)
-    int PyArray_CopyObject (ndarray, object)
+    int PyArray_MoveInto (ndarray, ndarray) except -1
+    int PyArray_CopyInto (ndarray, ndarray) except -1
+    int PyArray_CopyAnyInto (ndarray, ndarray) except -1
+    int PyArray_CopyObject (ndarray, object) except -1
     object PyArray_NewCopy (ndarray, NPY_ORDER)
     object PyArray_ToList (ndarray)
     object PyArray_ToString (ndarray, NPY_ORDER)
-    int PyArray_ToFile (ndarray, stdio.FILE *, char *, char *)
-    int PyArray_Dump (object, object, int)
+    int PyArray_ToFile (ndarray, stdio.FILE *, char *, char *) except -1
+    int PyArray_Dump (object, object, int) except -1
     object PyArray_Dumps (object, int)
-    int PyArray_ValidType (int)
+    int PyArray_ValidType (int)  # Cannot error
     void PyArray_UpdateFlags (ndarray, int)
     object PyArray_New (type, int, npy_intp *, int, npy_intp *, void *, int, int, object)
     #object PyArray_NewFromDescr (type, dtype, int, npy_intp *, npy_intp *, void *, int, object)
     #dtype PyArray_DescrNew (dtype)
     dtype PyArray_DescrNewFromType (int)
-    double PyArray_GetPriority (object, double)
+    double PyArray_GetPriority (object, double)  # clears errors as of 1.25
     object PyArray_IterNew (object)
     object PyArray_MultiIterNew (int, ...)
 
-    int PyArray_PyIntAsInt (object)
+    int PyArray_PyIntAsInt (object) except? -1
     npy_intp PyArray_PyIntAsIntp (object)
-    int PyArray_Broadcast (broadcast)
-    void PyArray_FillObjectArray (ndarray, object)
-    int PyArray_FillWithScalar (ndarray, object)
+    int PyArray_Broadcast (broadcast) except -1
+    void PyArray_FillObjectArray (ndarray, object) except *
+    int PyArray_FillWithScalar (ndarray, object) except -1
     npy_bool PyArray_CheckStrides (int, int, npy_intp, npy_intp, npy_intp *, npy_intp *)
     dtype PyArray_DescrNewByteorder (dtype, char)
     object PyArray_IterAllButAxis (object, int *)
@@ -586,10 +585,10 @@ cdef extern from "numpy/arrayobject.h":
     object PyArray_NewFlagsObject (object)
     npy_bool PyArray_CanCastScalar (type, type)
     #int PyArray_CompareUCS4 (npy_ucs4 *, npy_ucs4 *, register size_t)
-    int PyArray_RemoveSmallest (broadcast)
+    int PyArray_RemoveSmallest (broadcast) except -1
     int PyArray_ElementStrides (object)
-    void PyArray_Item_INCREF (char *, dtype)
-    void PyArray_Item_XDECREF (char *, dtype)
+    void PyArray_Item_INCREF (char *, dtype) except *
+    void PyArray_Item_XDECREF (char *, dtype) except *
     object PyArray_FieldNames (object)
     object PyArray_Transpose (ndarray, PyArray_Dims *)
     object PyArray_TakeFrom (ndarray, object, int, ndarray, NPY_CLIPMODE)
@@ -597,7 +596,7 @@ cdef extern from "numpy/arrayobject.h":
     object PyArray_PutMask (ndarray, object, object)
     object PyArray_Repeat (ndarray, object, int)
     object PyArray_Choose (ndarray, object, ndarray, NPY_CLIPMODE)
-    int PyArray_Sort (ndarray, int, NPY_SORTKIND)
+    int PyArray_Sort (ndarray, int, NPY_SORTKIND) except -1
     object PyArray_ArgSort (ndarray, int, NPY_SORTKIND)
     object PyArray_SearchSorted (ndarray, object, NPY_SEARCHSIDE, PyObject *)
     object PyArray_ArgMax (ndarray, int, ndarray)
@@ -635,49 +634,49 @@ cdef extern from "numpy/arrayobject.h":
     #int PyArray_As2D (object*, char ***, int *, int *, int)
     int PyArray_Free (object, void *)
     #int PyArray_Converter (object, object*)
-    int PyArray_IntpFromSequence (object, npy_intp *, int)
+    int PyArray_IntpFromSequence (object, npy_intp *, int) except -1
     object PyArray_Concatenate (object, int)
     object PyArray_InnerProduct (object, object)
     object PyArray_MatrixProduct (object, object)
     object PyArray_CopyAndTranspose (object)
     object PyArray_Correlate (object, object, int)
     int PyArray_TypestrConvert (int, int)
-    #int PyArray_DescrConverter (object, dtype*)
-    #int PyArray_DescrConverter2 (object, dtype*)
-    int PyArray_IntpConverter (object, PyArray_Dims *)
-    #int PyArray_BufferConverter (object, chunk)
-    int PyArray_AxisConverter (object, int *)
-    int PyArray_BoolConverter (object, npy_bool *)
-    int PyArray_ByteorderConverter (object, char *)
-    int PyArray_OrderConverter (object, NPY_ORDER *)
-    unsigned char PyArray_EquivTypes (dtype, dtype)
+    #int PyArray_DescrConverter (object, dtype*) except 0
+    #int PyArray_DescrConverter2 (object, dtype*) except 0
+    int PyArray_IntpConverter (object, PyArray_Dims *) except 0
+    #int PyArray_BufferConverter (object, chunk) except 0
+    int PyArray_AxisConverter (object, int *) except 0
+    int PyArray_BoolConverter (object, npy_bool *) except 0
+    int PyArray_ByteorderConverter (object, char *) except 0
+    int PyArray_OrderConverter (object, NPY_ORDER *) except 0
+    unsigned char PyArray_EquivTypes (dtype, dtype)  # clears errors
     #object PyArray_Zeros (int, npy_intp *, dtype, int)
     #object PyArray_Empty (int, npy_intp *, dtype, int)
     object PyArray_Where (object, object, object)
     object PyArray_Arange (double, double, double, int)
     #object PyArray_ArangeObj (object, object, object, dtype)
-    int PyArray_SortkindConverter (object, NPY_SORTKIND *)
+    int PyArray_SortkindConverter (object, NPY_SORTKIND *) except 0
     object PyArray_LexSort (object, int)
     object PyArray_Round (ndarray, int, ndarray)
     unsigned char PyArray_EquivTypenums (int, int)
-    int PyArray_RegisterDataType (dtype)
-    int PyArray_RegisterCastFunc (dtype, int, PyArray_VectorUnaryFunc *)
-    int PyArray_RegisterCanCast (dtype, int, NPY_SCALARKIND)
+    int PyArray_RegisterDataType (dtype) except -1
+    int PyArray_RegisterCastFunc (dtype, int, PyArray_VectorUnaryFunc *) except -1
+    int PyArray_RegisterCanCast (dtype, int, NPY_SCALARKIND) except -1
     #void PyArray_InitArrFuncs (PyArray_ArrFuncs *)
     object PyArray_IntTupleFromIntp (int, npy_intp *)
     int PyArray_TypeNumFromName (char *)
-    int PyArray_ClipmodeConverter (object, NPY_CLIPMODE *)
-    #int PyArray_OutputConverter (object, ndarray*)
+    int PyArray_ClipmodeConverter (object, NPY_CLIPMODE *) except 0
+    #int PyArray_OutputConverter (object, ndarray*) except 0
     object PyArray_BroadcastToShape (object, npy_intp *, int)
     void _PyArray_SigintHandler (int)
     void* _PyArray_GetSigintBuf ()
-    #int PyArray_DescrAlignConverter (object, dtype*)
-    #int PyArray_DescrAlignConverter2 (object, dtype*)
-    int PyArray_SearchsideConverter (object, void *)
+    #int PyArray_DescrAlignConverter (object, dtype*) except 0
+    #int PyArray_DescrAlignConverter2 (object, dtype*) except 0
+    int PyArray_SearchsideConverter (object, void *) except 0
     object PyArray_CheckAxis (ndarray, int *, int)
     npy_intp PyArray_OverflowMultiplyList (npy_intp *, int)
     int PyArray_CompareString (char *, char *, size_t)
-    int PyArray_SetBaseObject(ndarray, base)  # NOTE: steals a reference to base! Use "set_array_base()" instead.
+    int PyArray_SetBaseObject(ndarray, base) except -1 # NOTE: steals a reference to base! Use "set_array_base()" instead.
 
 
 # Typedefs that matches the runtime dtype objects in
@@ -711,11 +710,9 @@ ctypedef double complex complex128_t
 # The int types are mapped a bit surprising --
 # numpy.int corresponds to 'l' and numpy.long to 'q'
 ctypedef npy_long       int_t
-ctypedef npy_longlong   long_t
 ctypedef npy_longlong   longlong_t
 
 ctypedef npy_ulong      uint_t
-ctypedef npy_ulonglong  ulong_t
 ctypedef npy_ulonglong  ulonglong_t
 
 ctypedef npy_intp       intp_t
@@ -866,7 +863,7 @@ cdef extern from "numpy/ufuncobject.h":
     object PyUFunc_FromFuncAndData(PyUFuncGenericFunction *,
           void **, char *, int, int, int, int, char *, char *, int)
     int PyUFunc_RegisterLoopForType(ufunc, int,
-                                    PyUFuncGenericFunction, int *, void *)
+                                    PyUFuncGenericFunction, int *, void *) except -1
     void PyUFunc_f_f_As_d_d \
          (char **, npy_intp *, npy_intp *, void *)
     void PyUFunc_d_d \
@@ -916,7 +913,7 @@ cdef extern from "numpy/ufuncobject.h":
     void PyUFunc_clearfperr()
     int PyUFunc_getfperr()
     int PyUFunc_handlefperr \
-        (int, PyObject *, int, int *)
+        (int, PyObject *, int, int *) except -1
     int PyUFunc_ReplaceLoopBySignature \
         (ufunc, PyUFuncGenericFunction, int *, PyUFuncGenericFunction *)
     object PyUFunc_FromFuncAndDataAndSignature \
diff --git a/numpy/__init__.py b/numpy/__init__.py
index 22c90677e..ae8631387 100644
--- a/numpy/__init__.py
+++ b/numpy/__init__.py
@@ -74,10 +74,6 @@ test
     Run numpy unittests
 show_config
     Show numpy build configuration
-dual
-    Overwrite certain functions with high-performance SciPy tools.
-    Note: `numpy.dual` is deprecated.  Use the functions from NumPy or Scipy
-    directly instead of importing them from `numpy.dual`.
 matlib
     Make everything matrices.
 __version__
@@ -106,10 +102,11 @@ Exceptions to this rule are documented.
 import sys
 import warnings
 
-from ._globals import (
-    ModuleDeprecationWarning, VisibleDeprecationWarning,
-    _NoValue, _CopyMode
-)
+from ._globals import _NoValue, _CopyMode
+# These exceptions were moved in 1.25 and are hidden from __dir__()
+from .exceptions import (
+    ComplexWarning, ModuleDeprecationWarning, VisibleDeprecationWarning,
+    TooHardError, AxisError)
 
 # We first need to detect if we're being called as part of the numpy setup
 # procedure itself in a reliable manner.
@@ -121,6 +118,9 @@ except NameError:
 if __NUMPY_SETUP__:
     sys.stderr.write('Running from numpy source directory.\n')
 else:
+    # Allow distributors to run custom init code before importing numpy.core
+    from . import _distributor_init
+
     try:
         from numpy.__config__ import show as show_config
     except ImportError as e:
@@ -129,18 +129,18 @@ else:
         your python interpreter from there."""
         raise ImportError(msg) from e
 
-    __all__ = ['ModuleDeprecationWarning',
-               'VisibleDeprecationWarning']
+    __all__ = [
+        'exceptions', 'ModuleDeprecationWarning', 'VisibleDeprecationWarning',
+        'ComplexWarning', 'TooHardError', 'AxisError']
 
     # mapping of {name: (value, deprecation_msg)}
     __deprecated_attrs__ = {}
 
-    # Allow distributors to run custom init code
-    from . import _distributor_init
-
     from . import core
     from .core import *
     from . import compat
+    from . import exceptions
+    from . import dtypes
     from . import lib
     # NOTE: to be revisited following future namespace cleanup.
     # See gh-14454 and gh-15672 for discussion.
@@ -159,11 +159,13 @@ else:
     import builtins as _builtins
 
     _msg = (
-        "`np.{n}` is a deprecated alias for the builtin `{n}`. "
-        "To silence this warning, use `{n}` by itself. Doing this will not "
-        "modify any behavior and is safe. {extended_msg}\n"
-        "Deprecated in NumPy 1.20; for more details and guidance: "
-        "https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations")
+        "module 'numpy' has no attribute '{n}'.\n"
+        "`np.{n}` was a deprecated alias for the builtin `{n}`. "
+        "To avoid this error in existing code, use `{n}` by itself. "
+        "Doing this will not modify any behavior and is safe. {extended_msg}\n"
+        "The aliases was originally deprecated in NumPy 1.20; for more "
+        "details and guidance see the original release note at:\n"
+        "    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations")
 
     _specific_msg = (
         "If you specifically wanted the numpy scalar type, use `np.{}` here.")
@@ -182,45 +184,48 @@ else:
         ("str", _specific_msg.format("str_")),
         ("int", _int_extended_msg.format("int"))]
 
-    __deprecated_attrs__.update({
-        n: (getattr(_builtins, n), _msg.format(n=n, extended_msg=extended_msg))
-        for n, extended_msg in _type_info
-    })
-
-    # Numpy 1.20.0, 2020-10-19
-    __deprecated_attrs__["typeDict"] = (
-        core.numerictypes.typeDict,
-        "`np.typeDict` is a deprecated alias for `np.sctypeDict`."
-    )
-
-    # NumPy 1.22, 2021-10-20
-    __deprecated_attrs__["MachAr"] = (
-        core._machar.MachAr,
-        "`np.MachAr` is deprecated (NumPy 1.22)."
-    )
+    __former_attrs__ = {
+         n: _msg.format(n=n, extended_msg=extended_msg)
+         for n, extended_msg in _type_info
+     }
 
+    # Future warning introduced in NumPy 1.24.0, 2022-11-17
     _msg = (
-        "`np.{n}` is a deprecated alias for `np.compat.{n}`. "
-        "To silence this warning, use `np.compat.{n}` by itself, or "
-        "the builtin `{n2}` for which `np.compat.{n}` is itself an "
-        "alias. Doing this will not modify any behaviour and is safe. "
-        "{extended_msg}\n"
-        "Deprecated in NumPy 1.20; for more details and guidance: "
-        "https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations")
-
-    __deprecated_attrs__["long"] = (
-        getattr(compat, "long"),
-        _msg.format(n="long", n2="int",
-                    extended_msg=_int_extended_msg.format("long")))
-
-    __deprecated_attrs__["unicode"] = (
-        getattr(compat, "unicode"),
-        _msg.format(n="unicode", n2="str",
-                    extended_msg=_specific_msg.format("str_")))
-
-    del _msg, _specific_msg, _int_extended_msg, _type_info, _builtins
-
-    from .core import round, abs, max, min
+        "`np.{n}` is a deprecated alias for `{an}`.  (Deprecated NumPy 1.24)")
+
+    # Some of these are awkward (since `np.str` may be preferable in the long
+    # term), but overall the names ending in 0 seem undesirable
+    _type_info = [
+        ("bool8", bool_, "np.bool_"),
+        ("int0", intp, "np.intp"),
+        ("uint0", uintp, "np.uintp"),
+        ("str0", str_, "np.str_"),
+        ("bytes0", bytes_, "np.bytes_"),
+        ("void0", void, "np.void"),
+        ("object0", object_,
+            "`np.object0` is a deprecated alias for `np.object_`. "
+            "`object` can be used instead.  (Deprecated NumPy 1.24)")]
+
+    # Some of these could be defined right away, but most were aliases to
+    # the Python objects and only removed in NumPy 1.24.  Defining them should
+    # probably wait for NumPy 1.26 or 2.0.
+    # When defined, these should possibly not be added to `__all__` to avoid
+    # import with `from numpy import *`.
+    __future_scalars__ = {"bool", "long", "ulong", "str", "bytes", "object"}
+
+    __deprecated_attrs__.update({
+        n: (alias, _msg.format(n=n, an=an)) for n, alias, an in _type_info})
+
+    import math
+
+    __deprecated_attrs__['math'] = (math,
+        "`np.math` is a deprecated alias for the standard library `math` "
+        "module (Deprecated Numpy 1.25). Replace usages of `np.math` with "
+        "`math`")
+
+    del math, _msg, _type_info
+
+    from .core import abs
     # now that numpy modules are imported, can initialize limits
     core.getlimits._register_known_types()
 
@@ -275,6 +280,7 @@ else:
         # Warn for expired attributes, and return a dummy function
         # that always raises an exception.
         import warnings
+        import math
         try:
             msg = __expired_functions__[attr]
         except KeyError:
@@ -296,26 +302,33 @@ else:
             warnings.warn(msg, DeprecationWarning, stacklevel=2)
             return val
 
-        # Importing Tester requires importing all of UnitTest which is not a
-        # cheap import Since it is mainly used in test suits, we lazy import it
-        # here to save on the order of 10 ms of import time for most users
-        #
-        # The previous way Tester was imported also had a side effect of adding
-        # the full `numpy.testing` namespace
+        if attr in __future_scalars__:
+            # And future warnings for those that will change, but also give
+            # the AttributeError
+            warnings.warn(
+                f"In the future `np.{attr}` will be defined as the "
+                "corresponding NumPy scalar.", FutureWarning, stacklevel=2)
+
+        if attr in __former_attrs__:
+            raise AttributeError(__former_attrs__[attr])
+
         if attr == 'testing':
             import numpy.testing as testing
             return testing
         elif attr == 'Tester':
-            from .testing import Tester
-            return Tester
+            "Removed in NumPy 1.25.0"
+            raise RuntimeError("Tester was removed in NumPy 1.25.")
 
         raise AttributeError("module {!r} has no attribute "
                              "{!r}".format(__name__, attr))
 
     def __dir__():
-        public_symbols = globals().keys() | {'Tester', 'testing'}
+        public_symbols = globals().keys() | {'testing'}
         public_symbols -= {
             "core", "matrixlib",
+            # These were moved in 1.25 and may be deprecated eventually:
+            "ModuleDeprecationWarning", "VisibleDeprecationWarning",
+            "ComplexWarning", "TooHardError", "AxisError"
         }
         return list(public_symbols)
 
@@ -407,13 +420,17 @@ else:
 
     # Note that this will currently only make a difference on Linux
     core.multiarray._set_madvise_hugepage(use_hugepage)
+    del use_hugepage
 
     # Give a warning if NumPy is reloaded or imported on a sub-interpreter
     # We do this from python, since the C-module may not be reloaded and
     # it is tidier organized.
     core.multiarray._multiarray_umath._reload_guard()
 
-    core._set_promotion_state(os.environ.get("NPY_PROMOTION_STATE", "legacy"))
+    # default to "weak" promotion for "NumPy 2".
+    core._set_promotion_state(
+        os.environ.get("NPY_PROMOTION_STATE",
+                       "weak" if _using_numpy2_behavior() else "legacy"))
 
     # Tell PyInstaller where to find hook-numpy.py
     def _pyinstaller_hooks_dir():
diff --git a/numpy/__init__.pyi b/numpy/__init__.pyi
index f23050b17..ef1f9046a 100644
--- a/numpy/__init__.pyi
+++ b/numpy/__init__.pyi
@@ -1,19 +1,15 @@
 import builtins
 import os
-import sys
 import mmap
 import ctypes as ct
 import array as _array
 import datetime as dt
 import enum
 from abc import abstractmethod
-from types import TracebackType, MappingProxyType
+from types import TracebackType, MappingProxyType, GenericAlias
 from contextlib import ContextDecorator
 from contextlib import contextmanager
 
-if sys.version_info >= (3, 9):
-    from types import GenericAlias
-
 from numpy._pytesttester import PytestTester
 from numpy.core._internal import _ctypes
 
@@ -42,6 +38,7 @@ from numpy._typing import (
     # DTypes
     DTypeLike,
     _DTypeLike,
+    _DTypeLikeVoid,
     _SupportsDType,
     _VoidDTypeLike,
 
@@ -203,6 +200,7 @@ from typing import (
 # Ensures that the stubs are picked up
 from numpy import (
     ctypeslib as ctypeslib,
+    exceptions as exceptions,
     fft as fft,
     lib as lib,
     linalg as linalg,
@@ -211,6 +209,8 @@ from numpy import (
     random as random,
     testing as testing,
     version as version,
+    exceptions as exceptions,
+    dtypes as dtypes,
 )
 
 from numpy.core import defchararray, records
@@ -252,6 +252,8 @@ from numpy.core.fromnumeric import (
     any as any,
     cumsum as cumsum,
     ptp as ptp,
+    max as max,
+    min as min,
     amax as amax,
     amin as amin,
     prod as prod,
@@ -259,6 +261,7 @@ from numpy.core.fromnumeric import (
     ndim as ndim,
     size as size,
     around as around,
+    round as round,
     mean as mean,
     std as std,
     var as var,
@@ -410,6 +413,15 @@ from numpy.core.shape_base import (
     vstack as vstack,
 )
 
+from numpy.exceptions import (
+    ComplexWarning as ComplexWarning,
+    ModuleDeprecationWarning as ModuleDeprecationWarning,
+    VisibleDeprecationWarning as VisibleDeprecationWarning,
+    TooHardError as TooHardError,
+    DTypePromotionError as DTypePromotionError,
+    AxisError as AxisError,
+)
+
 from numpy.lib import (
     emath as emath,
 )
@@ -456,7 +468,6 @@ from numpy.lib.function_base import (
     digitize as digitize,
     cov as cov,
     corrcoef as corrcoef,
-    msort as msort,
     median as median,
     sinc as sinc,
     hamming as hamming,
@@ -620,6 +631,7 @@ from numpy.lib.utils import (
     lookfor as lookfor,
     byte_bounds as byte_bounds,
     safe_eval as safe_eval,
+    show_runtime as show_runtime,
 )
 
 from numpy.matrixlib import (
@@ -663,19 +675,9 @@ test: PytestTester
 #
 # Placeholders for classes
 
-# Some of these are aliases; others are wrappers with an identical signature
-round = around
-round_ = around
-max = amax
-min = amin
-product = prod
-cumproduct = cumprod
-sometrue = any
-alltrue = all
-
 def show_config() -> None: ...
 
-_NdArraySubClass = TypeVar("_NdArraySubClass", bound=ndarray)
+_NdArraySubClass = TypeVar("_NdArraySubClass", bound=ndarray[Any, Any])
 _DTypeScalar_co = TypeVar("_DTypeScalar_co", covariant=True, bound=generic)
 _ByteOrder = L["S", "<", ">", "=", "|", "L", "B", "N", "I"]
 
@@ -801,7 +803,7 @@ class dtype(Generic[_DTypeScalar_co]):
     @overload
     def __new__(cls, dtype: _VoidCodes, align: bool = ..., copy: bool = ...) -> dtype[void]: ...
     @overload
-    def __new__(cls, dtype: _ObjectCodes | type[ct.py_object], align: bool = ..., copy: bool = ...) -> dtype[object_]: ...
+    def __new__(cls, dtype: _ObjectCodes | type[ct.py_object[Any]], align: bool = ..., copy: bool = ...) -> dtype[object_]: ...
 
     # dtype of a dtype is the same dtype
     @overload
@@ -843,8 +845,7 @@ class dtype(Generic[_DTypeScalar_co]):
         copy: bool = ...,
     ) -> dtype[object_]: ...
 
-    if sys.version_info >= (3, 9):
-        def __class_getitem__(self, item: Any) -> GenericAlias: ...
+    def __class_getitem__(self, item: Any) -> GenericAlias: ...
 
     @overload
     def __getitem__(self: dtype[void], key: list[builtins.str]) -> dtype[void]: ...
@@ -926,13 +927,13 @@ class dtype(Generic[_DTypeScalar_co]):
 
 _ArrayLikeInt = Union[
     int,
-    integer,
-    Sequence[Union[int, integer]],
+    integer[Any],
+    Sequence[Union[int, integer[Any]]],
     Sequence[Sequence[Any]],  # TODO: wait for support for recursive types
-    ndarray
+    ndarray[Any, Any]
 ]
 
-_FlatIterSelf = TypeVar("_FlatIterSelf", bound=flatiter)
+_FlatIterSelf = TypeVar("_FlatIterSelf", bound=flatiter[Any])
 
 @final
 class flatiter(Generic[_NdArraySubClass]):
@@ -950,7 +951,7 @@ class flatiter(Generic[_NdArraySubClass]):
     @overload
     def __getitem__(
         self: flatiter[ndarray[Any, dtype[_ScalarType]]],
-        key: int | integer | tuple[int | integer],
+        key: int | integer[Any] | tuple[int | integer[Any]],
     ) -> _ScalarType: ...
     @overload
     def __getitem__(
@@ -1145,7 +1146,7 @@ class _ArrayOrScalarCommon:
         axis: None | SupportsIndex = ...,
         kind: None | _SortKind = ...,
         order: None | str | Sequence[str] = ...,
-    ) -> ndarray: ...
+    ) -> ndarray[Any, Any]: ...
 
     @overload
     def choose(
@@ -1153,7 +1154,7 @@ class _ArrayOrScalarCommon:
         choices: ArrayLike,
         out: None = ...,
         mode: _ModeKind = ...,
-    ) -> ndarray: ...
+    ) -> ndarray[Any, Any]: ...
     @overload
     def choose(
         self,
@@ -1169,7 +1170,7 @@ class _ArrayOrScalarCommon:
         max: None | ArrayLike = ...,
         out: None = ...,
         **kwargs: Any,
-    ) -> ndarray: ...
+    ) -> ndarray[Any, Any]: ...
     @overload
     def clip(
         self,
@@ -1177,7 +1178,7 @@ class _ArrayOrScalarCommon:
         max: ArrayLike = ...,
         out: None = ...,
         **kwargs: Any,
-    ) -> ndarray: ...
+    ) -> ndarray[Any, Any]: ...
     @overload
     def clip(
         self,
@@ -1201,7 +1202,7 @@ class _ArrayOrScalarCommon:
         a: ArrayLike,
         axis: None | SupportsIndex = ...,
         out: None = ...,
-    ) -> ndarray: ...
+    ) -> ndarray[Any, Any]: ...
     @overload
     def compress(
         self,
@@ -1220,7 +1221,7 @@ class _ArrayOrScalarCommon:
         axis: None | SupportsIndex = ...,
         dtype: DTypeLike = ...,
         out: None = ...,
-    ) -> ndarray: ...
+    ) -> ndarray[Any, Any]: ...
     @overload
     def cumprod(
         self,
@@ -1235,7 +1236,7 @@ class _ArrayOrScalarCommon:
         axis: None | SupportsIndex = ...,
         dtype: DTypeLike = ...,
         out: None = ...,
-    ) -> ndarray: ...
+    ) -> ndarray[Any, Any]: ...
     @overload
     def cumsum(
         self,
@@ -1480,7 +1481,7 @@ class _SupportsImag(Protocol[_T_co]):
 class ndarray(_ArrayOrScalarCommon, Generic[_ShapeType, _DType_co]):
     __hash__: ClassVar[None]
     @property
-    def base(self) -> None | ndarray: ...
+    def base(self) -> None | ndarray[Any, Any]: ...
     @property
     def ndim(self) -> int: ...
     @property
@@ -1507,8 +1508,7 @@ class ndarray(_ArrayOrScalarCommon, Generic[_ShapeType, _DType_co]):
         order: _OrderKACF = ...,
     ) -> _ArraySelf: ...
 
-    if sys.version_info >= (3, 9):
-        def __class_getitem__(self, item: Any) -> GenericAlias: ...
+    def __class_getitem__(self, item: Any) -> GenericAlias: ...
 
     @overload
     def __array__(self, dtype: None = ..., /) -> ndarray[Any, _DType_co]: ...
@@ -1533,7 +1533,7 @@ class ndarray(_ArrayOrScalarCommon, Generic[_ShapeType, _DType_co]):
 
     # NOTE: In practice any object is accepted by `obj`, but as `__array_finalize__`
     # is a pseudo-abstract method the type has been narrowed down in order to
-    # grant subclasses a bit more flexiblity
+    # grant subclasses a bit more flexibility
     def __array_finalize__(self, obj: None | NDArray[Any], /) -> None: ...
 
     def __array_wrap__(
@@ -1648,7 +1648,7 @@ class ndarray(_ArrayOrScalarCommon, Generic[_ShapeType, _DType_co]):
     # 1D + 1D returns a scalar;
     # all other with at least 1 non-0D array return an ndarray.
     @overload
-    def dot(self, b: _ScalarLike_co, out: None = ...) -> ndarray: ...
+    def dot(self, b: _ScalarLike_co, out: None = ...) -> ndarray[Any, Any]: ...
     @overload
     def dot(self, b: ArrayLike, out: None = ...) -> Any: ...  # type: ignore[misc]
     @overload
@@ -1925,7 +1925,6 @@ class ndarray(_ArrayOrScalarCommon, Generic[_ShapeType, _DType_co]):
     def __neg__(self: NDArray[object_]) -> Any: ...
 
     # Binary ops
-    # NOTE: `ndarray` does not implement `__imatmul__`
     @overload
     def __matmul__(self: NDArray[bool_], other: _ArrayLikeBool_co) -> NDArray[bool_]: ...  # type: ignore[misc]
     @overload
@@ -2512,6 +2511,19 @@ class ndarray(_ArrayOrScalarCommon, Generic[_ShapeType, _DType_co]):
     @overload
     def __ior__(self: NDArray[object_], other: Any) -> NDArray[object_]: ...
 
+    @overload
+    def __imatmul__(self: NDArray[bool_], other: _ArrayLikeBool_co) -> NDArray[bool_]: ...
+    @overload
+    def __imatmul__(self: NDArray[unsignedinteger[_NBit1]], other: _ArrayLikeUInt_co) -> NDArray[unsignedinteger[_NBit1]]: ...
+    @overload
+    def __imatmul__(self: NDArray[signedinteger[_NBit1]], other: _ArrayLikeInt_co) -> NDArray[signedinteger[_NBit1]]: ...
+    @overload
+    def __imatmul__(self: NDArray[floating[_NBit1]], other: _ArrayLikeFloat_co) -> NDArray[floating[_NBit1]]: ...
+    @overload
+    def __imatmul__(self: NDArray[complexfloating[_NBit1, _NBit1]], other: _ArrayLikeComplex_co) -> NDArray[complexfloating[_NBit1, _NBit1]]: ...
+    @overload
+    def __imatmul__(self: NDArray[object_], other: Any) -> NDArray[object_]: ...
+
     def __dlpack__(self: NDArray[number[Any]], *, stream: None = ...) -> _PyCapsule: ...
     def __dlpack_device__(self) -> tuple[int, L[0]]: ...
 
@@ -2671,8 +2683,7 @@ class number(generic, Generic[_NBit1]):  # type: ignore
     def real(self: _ArraySelf) -> _ArraySelf: ...
     @property
     def imag(self: _ArraySelf) -> _ArraySelf: ...
-    if sys.version_info >= (3, 9):
-        def __class_getitem__(self, item: Any) -> GenericAlias: ...
+    def __class_getitem__(self, item: Any) -> GenericAlias: ...
     def __int__(self) -> int: ...
     def __float__(self) -> float: ...
     def __complex__(self) -> complex: ...
@@ -2743,8 +2754,6 @@ class bool_(generic):
     __gt__: _ComparisonOp[_NumberLike_co, _ArrayLikeNumber_co]
     __ge__: _ComparisonOp[_NumberLike_co, _ArrayLikeNumber_co]
 
-bool8 = bool_
-
 class object_(generic):
     def __init__(self, value: object = ..., /) -> None: ...
     @property
@@ -2757,8 +2766,6 @@ class object_(generic):
     def __float__(self) -> float: ...
     def __complex__(self) -> complex: ...
 
-object0 = object_
-
 # The `datetime64` constructors requires an object with the three attributes below,
 # and thus supports datetime duck typing
 class _DatetimeScalar(Protocol):
@@ -2830,20 +2837,20 @@ class integer(number[_NBit1]):  # type: ignore
     def __index__(self) -> int: ...
     __truediv__: _IntTrueDiv[_NBit1]
     __rtruediv__: _IntTrueDiv[_NBit1]
-    def __mod__(self, value: _IntLike_co) -> integer: ...
-    def __rmod__(self, value: _IntLike_co) -> integer: ...
+    def __mod__(self, value: _IntLike_co) -> integer[Any]: ...
+    def __rmod__(self, value: _IntLike_co) -> integer[Any]: ...
     def __invert__(self: _IntType) -> _IntType: ...
     # Ensure that objects annotated as `integer` support bit-wise operations
-    def __lshift__(self, other: _IntLike_co) -> integer: ...
-    def __rlshift__(self, other: _IntLike_co) -> integer: ...
-    def __rshift__(self, other: _IntLike_co) -> integer: ...
-    def __rrshift__(self, other: _IntLike_co) -> integer: ...
-    def __and__(self, other: _IntLike_co) -> integer: ...
-    def __rand__(self, other: _IntLike_co) -> integer: ...
-    def __or__(self, other: _IntLike_co) -> integer: ...
-    def __ror__(self, other: _IntLike_co) -> integer: ...
-    def __xor__(self, other: _IntLike_co) -> integer: ...
-    def __rxor__(self, other: _IntLike_co) -> integer: ...
+    def __lshift__(self, other: _IntLike_co) -> integer[Any]: ...
+    def __rlshift__(self, other: _IntLike_co) -> integer[Any]: ...
+    def __rshift__(self, other: _IntLike_co) -> integer[Any]: ...
+    def __rrshift__(self, other: _IntLike_co) -> integer[Any]: ...
+    def __and__(self, other: _IntLike_co) -> integer[Any]: ...
+    def __rand__(self, other: _IntLike_co) -> integer[Any]: ...
+    def __or__(self, other: _IntLike_co) -> integer[Any]: ...
+    def __ror__(self, other: _IntLike_co) -> integer[Any]: ...
+    def __xor__(self, other: _IntLike_co) -> integer[Any]: ...
+    def __rxor__(self, other: _IntLike_co) -> integer[Any]: ...
 
 class signedinteger(integer[_NBit1]):
     def __init__(self, value: _IntValue = ..., /) -> None: ...
@@ -2881,7 +2888,6 @@ byte = signedinteger[_NBitByte]
 short = signedinteger[_NBitShort]
 intc = signedinteger[_NBitIntC]
 intp = signedinteger[_NBitIntP]
-int0 = signedinteger[_NBitIntP]
 int_ = signedinteger[_NBitInt]
 longlong = signedinteger[_NBitLongLong]
 
@@ -2963,15 +2969,14 @@ ubyte = unsignedinteger[_NBitByte]
 ushort = unsignedinteger[_NBitShort]
 uintc = unsignedinteger[_NBitIntC]
 uintp = unsignedinteger[_NBitIntP]
-uint0 = unsignedinteger[_NBitIntP]
 uint = unsignedinteger[_NBitInt]
 ulonglong = unsignedinteger[_NBitLongLong]
 
 class inexact(number[_NBit1]):  # type: ignore
     def __getnewargs__(self: inexact[_64Bit]) -> tuple[float, ...]: ...
 
-_IntType = TypeVar("_IntType", bound=integer)
-_FloatType = TypeVar('_FloatType', bound=floating)
+_IntType = TypeVar("_IntType", bound=integer[Any])
+_FloatType = TypeVar('_FloatType', bound=floating[Any])
 
 class floating(inexact[_NBit1]):
     def __init__(self, value: _FloatValue = ..., /) -> None: ...
@@ -2985,9 +2990,8 @@ class floating(inexact[_NBit1]):
     @classmethod
     def fromhex(cls: type[float64], string: str, /) -> float64: ...
     def as_integer_ratio(self) -> tuple[int, int]: ...
-    if sys.version_info >= (3, 9):
-        def __ceil__(self: float64) -> int: ...
-        def __floor__(self: float64) -> int: ...
+    def __ceil__(self: float64) -> int: ...
+    def __floor__(self: float64) -> int: ...
     def __trunc__(self: float64) -> int: ...
     def __getnewargs__(self: float64) -> tuple[float]: ...
     def __getformat__(self: float64, typestr: L["double", "float"], /) -> str: ...
@@ -3070,7 +3074,10 @@ class flexible(generic): ...  # type: ignore
 # depending on whether or not it's used as an opaque bytes sequence
 # or a structure
 class void(flexible):
-    def __init__(self, value: _IntLike_co | bytes, /) -> None: ...
+    @overload
+    def __init__(self, value: _IntLike_co | bytes, /, dtype : None = ...) -> None: ...
+    @overload
+    def __init__(self, value: Any, /, dtype: _DTypeLikeVoid) -> None: ...
     @property
     def real(self: _ArraySelf) -> _ArraySelf: ...
     @property
@@ -3088,8 +3095,6 @@ class void(flexible):
         value: ArrayLike,
     ) -> None: ...
 
-void0 = void
-
 class character(flexible):  # type: ignore
     def __int__(self) -> int: ...
     def __float__(self) -> float: ...
@@ -3110,7 +3115,6 @@ class bytes_(character, bytes):
     def tolist(self) -> bytes: ...
 
 string_ = bytes_
-bytes0 = bytes_
 
 class str_(character, str):
     @overload
@@ -3125,7 +3129,6 @@ class str_(character, str):
     def tolist(self) -> str: ...
 
 unicode_ = str_
-str0 = str_
 
 #
 # Constants
@@ -3146,10 +3149,6 @@ infty: Final[float]
 nan: Final[float]
 pi: Final[float]
 
-CLIP: L[0]
-WRAP: L[1]
-RAISE: L[2]
-
 ERR_IGNORE: L[0]
 ERR_WARN: L[1]
 ERR_RAISE: L[2]
@@ -3218,7 +3217,7 @@ class ufunc:
     # can't type them very precisely.
     reduce: Any
     accumulate: Any
-    reduce: Any
+    reduceat: Any
     outer: Any
     # Similarly at won't be defined for ufuncs that return multiple
     # outputs, so we can't type it very precisely.
@@ -3324,22 +3323,8 @@ class _CopyMode(enum.Enum):
     NEVER: L[2]
 
 # Warnings
-class ModuleDeprecationWarning(DeprecationWarning): ...
-class VisibleDeprecationWarning(UserWarning): ...
-class ComplexWarning(RuntimeWarning): ...
 class RankWarning(UserWarning): ...
 
-# Errors
-class TooHardError(RuntimeError): ...
-
-class AxisError(ValueError, IndexError):
-    axis: None | int
-    ndim: None | int
-    @overload
-    def __init__(self, axis: str, ndim: None = ..., msg_prefix: None = ...) -> None: ...
-    @overload
-    def __init__(self, axis: int, ndim: int, msg_prefix: None | str = ...) -> None: ...
-
 _CallType = TypeVar("_CallType", bound=_ErrFunc | _SupportsWrite[str])
 
 class errstate(Generic[_CallType], ContextDecorator):
diff --git a/numpy/_build_utils/__init__.py b/numpy/_build_utils/__init__.py
new file mode 100644
index 000000000..ac4908957
--- /dev/null
+++ b/numpy/_build_utils/__init__.py
@@ -0,0 +1,22 @@
+# Don't use the deprecated NumPy C API. Define this to a fixed version
+# instead of NPY_API_VERSION in order not to break compilation for
+# released SciPy versions when NumPy introduces a new deprecation. Use
+# in setup.py::
+#
+#   config.add_extension('_name', sources=['source_fname'], **numpy_nodepr_api)
+#
+numpy_nodepr_api = dict(
+    define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_9_API_VERSION")]
+)
+
+
+def import_file(folder, module_name):
+    """Import a file directly, avoiding importing scipy"""
+    import importlib
+    import pathlib
+
+    fname = pathlib.Path(folder) / f'{module_name}.py'
+    spec = importlib.util.spec_from_file_location(module_name, str(fname))
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module
diff --git a/numpy/_build_utils/gcc_build_bitness.py b/numpy/_build_utils/gcc_build_bitness.py
new file mode 100644
index 000000000..fcad237e9
--- /dev/null
+++ b/numpy/_build_utils/gcc_build_bitness.py
@@ -0,0 +1,21 @@
+#!python
+""" Detect bitness (32 or 64) of Mingw-w64 gcc build target on Windows.
+"""
+
+import re
+from subprocess import run, PIPE
+
+
+def main():
+    res = run(['gcc', '-v'], check=True, text=True, capture_output=True)
+    target = re.search(r'^Target: (.*)$', res.stderr, flags=re.M).groups()[0]
+    if target.startswith('i686'):
+        print('32')
+    elif target.startswith('x86_64'):
+        print('64')
+    else:
+        raise RuntimeError('Could not detect Mingw-w64 bitness')
+
+
+if __name__ == "__main__":
+    main()
diff --git a/numpy/_build_utils/process_src_template.py b/numpy/_build_utils/process_src_template.py
new file mode 100644
index 000000000..4a0915e25
--- /dev/null
+++ b/numpy/_build_utils/process_src_template.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python3
+import sys
+import os
+import argparse
+import importlib.util
+
+
+def get_processor():
+    # Convoluted because we can't import from numpy.distutils
+    # (numpy is not yet built)
+    conv_template_path = os.path.join(
+        os.path.dirname(__file__),
+        '..', 'distutils', 'conv_template.py'
+    )
+    spec = importlib.util.spec_from_file_location(
+        'conv_template', conv_template_path
+    )
+    mod = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(mod)
+    return mod.process_file
+
+
+def process_and_write_file(fromfile, outfile):
+    """Process tempita templated file and write out the result.
+
+    The template file is expected to end in `.src`
+    (e.g., `.c.src` or `.h.src`).
+    Processing `npy_somefile.c.src` generates `npy_somefile.c`.
+
+    """
+    process_file = get_processor()
+    content = process_file(fromfile)
+    with open(outfile, 'w') as f:
+        f.write(content)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "infile",
+        type=str,
+        help="Path to the input file"
+    )
+    parser.add_argument(
+        "-o",
+        "--outfile",
+        type=str,
+        help="Path to the output file"
+    )
+    parser.add_argument(
+        "-i",
+        "--ignore",
+        type=str,
+        help="An ignored input - may be useful to add a "
+        "dependency between custom targets",
+    )
+    args = parser.parse_args()
+
+    if not args.infile.endswith('.src'):
+        raise ValueError(f"Unexpected extension: {args.infile}")
+
+    outfile_abs = os.path.join(os.getcwd(), args.outfile)
+    process_and_write_file(args.infile, outfile_abs)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/numpy/_build_utils/setup.py b/numpy/_build_utils/setup.py
new file mode 100644
index 000000000..73a8f2fff
--- /dev/null
+++ b/numpy/_build_utils/setup.py
@@ -0,0 +1,12 @@
+def configuration(parent_package='', top_path=None):
+    from numpy.distutils.misc_util import Configuration
+
+    config = Configuration('_build_utils', parent_package, top_path)
+    config.add_data_dir('tests')
+    return config
+
+
+if __name__ == '__main__':
+    from numpy.distutils.core import setup
+
+    setup(**configuration(top_path='').todict())
diff --git a/numpy/_build_utils/tempita.py b/numpy/_build_utils/tempita.py
new file mode 100755
index 000000000..3bcc789c5
--- /dev/null
+++ b/numpy/_build_utils/tempita.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python3
+import sys
+import os
+import argparse
+
+from Cython import Tempita as tempita
+
+# XXX: If this import ever fails (does it really?), vendor either
+# cython.tempita or numpy/npy_tempita.
+
+
+def process_tempita(fromfile, outfile=None):
+    """Process tempita templated file and write out the result.
+
+    The template file is expected to end in `.c.in` or `.pyx.in`:
+    E.g. processing `template.c.in` generates `template.c`.
+
+    """
+    if outfile is None:
+        # We're dealing with a distutils build here, write in-place
+        outfile = os.path.splitext(fromfile)[0]
+
+    from_filename = tempita.Template.from_filename
+    template = from_filename(fromfile, encoding=sys.getdefaultencoding())
+
+    content = template.substitute()
+
+    with open(outfile, 'w') as f:
+        f.write(content)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "infile",
+        type=str,
+        help="Path to the input file"
+    )
+    parser.add_argument(
+        "-o",
+        "--outfile",
+        type=str,
+        help="Path to the output file"
+    )
+    parser.add_argument(
+        "-i",
+        "--ignore",
+        type=str,
+        help="An ignored input - may be useful to add a "
+        "dependency between custom targets",
+    )
+    args = parser.parse_args()
+
+    if not args.infile.endswith('.in'):
+        raise ValueError(f"Unexpected extension: {args.infile}")
+
+    outfile_abs = os.path.join(os.getcwd(), args.outfile)
+    process_tempita(args.infile, outfile_abs)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/numpy/_globals.py b/numpy/_globals.py
index 1e4f26cd8..416a20f5e 100644
--- a/numpy/_globals.py
+++ b/numpy/_globals.py
@@ -17,10 +17,9 @@ motivated this module.
 """
 import enum
 
-__ALL__ = [
-    'ModuleDeprecationWarning', 'VisibleDeprecationWarning',
-    '_NoValue', '_CopyMode'
-    ]
+from ._utils import set_module as _set_module
+
+__all__ = ['_NoValue', '_CopyMode']
 
 
 # Disallow reloading this module so as to preserve the identities of the
@@ -30,33 +29,6 @@ if '_is_loaded' in globals():
 _is_loaded = True
 
 
-class ModuleDeprecationWarning(DeprecationWarning):
-    """Module deprecation warning.
-
-    The nose tester turns ordinary Deprecation warnings into test failures.
-    That makes it hard to deprecate whole modules, because they get
-    imported by default. So this is a special Deprecation warning that the
-    nose tester will let pass without making tests fail.
-
-    """
-
-
-ModuleDeprecationWarning.__module__ = 'numpy'
-
-
-class VisibleDeprecationWarning(UserWarning):
-    """Visible deprecation warning.
-
-    By default, python will not show deprecation warnings, so this class
-    can be used when a very visible warning is helpful, for example because
-    the usage is most likely a user bug.
-
-    """
-
-
-VisibleDeprecationWarning.__module__ = 'numpy'
-
-
 class _NoValueType:
     """Special keyword value.
 
@@ -90,6 +62,7 @@ class _NoValueType:
 _NoValue = _NoValueType()
 
 
+@_set_module("numpy")
 class _CopyMode(enum.Enum):
     """
     An enumeration for the copy modes supported
@@ -120,6 +93,3 @@ class _CopyMode(enum.Enum):
             return False
 
         raise ValueError(f"{self} is neither True nor False.")
-
-
-_CopyMode.__module__ = 'numpy'
diff --git a/numpy/_pyinstaller/hook-numpy.py b/numpy/_pyinstaller/hook-numpy.py
index a08b7c963..6f24318ad 100644
--- a/numpy/_pyinstaller/hook-numpy.py
+++ b/numpy/_pyinstaller/hook-numpy.py
@@ -20,18 +20,15 @@ if is_pure_conda:
     from PyInstaller.utils.hooks import conda_support
     datas = conda_support.collect_dynamic_libs("numpy", dependencies=True)
 
-# Submodules PyInstaller cannot detect (probably because they are only imported
-# by extension modules, which PyInstaller cannot read).
-hiddenimports = ['numpy.core._dtype_ctypes']
-if is_conda:
-    hiddenimports.append("six")
+# Submodules PyInstaller cannot detect.  `_dtype_ctypes` is only imported
+# from C and `_multiarray_tests` is used in tests (which are not packed).
+hiddenimports = ['numpy.core._dtype_ctypes', 'numpy.core._multiarray_tests']
 
 # Remove testing and building code and packages that are referenced throughout
 # NumPy but are not really dependencies.
 excludedimports = [
     "scipy",
     "pytest",
-    "nose",
     "f2py",
     "setuptools",
     "numpy.f2py",
diff --git a/numpy/_typing/__init__.py b/numpy/_typing/__init__.py
index b800c54b6..29922d958 100644
--- a/numpy/_typing/__init__.py
+++ b/numpy/_typing/__init__.py
@@ -2,8 +2,8 @@
 
 from __future__ import annotations
 
-from numpy import ufunc
-from numpy.core.overrides import set_module
+from .. import ufunc
+from .._utils import set_module
 from typing import TYPE_CHECKING, final
 
 
@@ -180,6 +180,7 @@ from ._dtype_like import (
     _DTypeLikeComplex_co as _DTypeLikeComplex_co,
 )
 from ._array_like import (
+    NDArray as NDArray,
     ArrayLike as ArrayLike,
     _ArrayLike as _ArrayLike,
     _FiniteNestedSequence as _FiniteNestedSequence,
@@ -201,11 +202,6 @@ from ._array_like import (
     _ArrayLikeUnknown as _ArrayLikeUnknown,
     _UnknownType as _UnknownType,
 )
-from ._generic_alias import (
-    NDArray as NDArray,
-    _DType as _DType,
-    _GenericAlias as _GenericAlias,
-)
 
 if TYPE_CHECKING:
     from ._ufunc import (
diff --git a/numpy/_typing/_add_docstring.py b/numpy/_typing/_add_docstring.py
index 10d77f516..f84d19271 100644
--- a/numpy/_typing/_add_docstring.py
+++ b/numpy/_typing/_add_docstring.py
@@ -3,7 +3,7 @@
 import re
 import textwrap
 
-from ._generic_alias import NDArray
+from ._array_like import NDArray
 
 _docstrings_list = []
 
diff --git a/numpy/_typing/_array_like.py b/numpy/_typing/_array_like.py
index 67d67ce19..cba6fffaf 100644
--- a/numpy/_typing/_array_like.py
+++ b/numpy/_typing/_array_like.py
@@ -1,9 +1,7 @@
 from __future__ import annotations
 
-# NOTE: Import `Sequence` from `typing` as we it is needed for a type-alias,
-# not an annotation
-from collections.abc import Collection, Callable
-from typing import Any, Sequence, Protocol, Union, TypeVar, runtime_checkable
+from collections.abc import Collection, Callable, Sequence
+from typing import Any, Protocol, Union, TypeVar, runtime_checkable
 from numpy import (
     ndarray,
     dtype,
@@ -25,8 +23,11 @@ from ._nested_sequence import _NestedSequence
 
 _T = TypeVar("_T")
 _ScalarType = TypeVar("_ScalarType", bound=generic)
-_DType = TypeVar("_DType", bound="dtype[Any]")
-_DType_co = TypeVar("_DType_co", covariant=True, bound="dtype[Any]")
+_ScalarType_co = TypeVar("_ScalarType_co", bound=generic, covariant=True)
+_DType = TypeVar("_DType", bound=dtype[Any])
+_DType_co = TypeVar("_DType_co", covariant=True, bound=dtype[Any])
+
+NDArray = ndarray[Any, dtype[_ScalarType_co]]
 
 # The `_SupportsArray` protocol only cares about the default dtype
 # (i.e. `dtype=None` or no `dtype` parameter at all) of the to-be returned
@@ -61,8 +62,8 @@ _FiniteNestedSequence = Union[
 
 # A subset of `npt.ArrayLike` that can be parametrized w.r.t. `np.generic`
 _ArrayLike = Union[
-    _SupportsArray["dtype[_ScalarType]"],
-    _NestedSequence[_SupportsArray["dtype[_ScalarType]"]],
+    _SupportsArray[dtype[_ScalarType]],
+    _NestedSequence[_SupportsArray[dtype[_ScalarType]]],
 ]
 
 # A union representing array-like objects; consists of two typevars:
@@ -83,64 +84,69 @@ _DualArrayLike = Union[
 #
 # https://github.com/python/typing/issues/593
 ArrayLike = _DualArrayLike[
-    dtype,
+    dtype[Any],
     Union[bool, int, float, complex, str, bytes],
 ]
 
 # `ArrayLike<X>_co`: array-like objects that can be coerced into `X`
 # given the casting rules `same_kind`
 _ArrayLikeBool_co = _DualArrayLike[
-    "dtype[bool_]",
+    dtype[bool_],
     bool,
 ]
 _ArrayLikeUInt_co = _DualArrayLike[
-    "dtype[Union[bool_, unsignedinteger[Any]]]",
+    dtype[Union[bool_, unsignedinteger[Any]]],
     bool,
 ]
 _ArrayLikeInt_co = _DualArrayLike[
-    "dtype[Union[bool_, integer[Any]]]",
+    dtype[Union[bool_, integer[Any]]],
     Union[bool, int],
 ]
 _ArrayLikeFloat_co = _DualArrayLike[
-    "dtype[Union[bool_, integer[Any], floating[Any]]]",
+    dtype[Union[bool_, integer[Any], floating[Any]]],
     Union[bool, int, float],
 ]
 _ArrayLikeComplex_co = _DualArrayLike[
-    "dtype[Union[bool_, integer[Any], floating[Any], complexfloating[Any, Any]]]",
+    dtype[Union[
+        bool_,
+        integer[Any],
+        floating[Any],
+        complexfloating[Any, Any],
+    ]],
     Union[bool, int, float, complex],
 ]
 _ArrayLikeNumber_co = _DualArrayLike[
-    "dtype[Union[bool_, number[Any]]]",
+    dtype[Union[bool_, number[Any]]],
     Union[bool, int, float, complex],
 ]
 _ArrayLikeTD64_co = _DualArrayLike[
-    "dtype[Union[bool_, integer[Any], timedelta64]]",
+    dtype[Union[bool_, integer[Any], timedelta64]],
     Union[bool, int],
 ]
 _ArrayLikeDT64_co = Union[
-    _SupportsArray["dtype[datetime64]"],
-    _NestedSequence[_SupportsArray["dtype[datetime64]"]],
+    _SupportsArray[dtype[datetime64]],
+    _NestedSequence[_SupportsArray[dtype[datetime64]]],
 ]
 _ArrayLikeObject_co = Union[
-    _SupportsArray["dtype[object_]"],
-    _NestedSequence[_SupportsArray["dtype[object_]"]],
+    _SupportsArray[dtype[object_]],
+    _NestedSequence[_SupportsArray[dtype[object_]]],
 ]
 
 _ArrayLikeVoid_co = Union[
-    _SupportsArray["dtype[void]"],
-    _NestedSequence[_SupportsArray["dtype[void]"]],
+    _SupportsArray[dtype[void]],
+    _NestedSequence[_SupportsArray[dtype[void]]],
 ]
 _ArrayLikeStr_co = _DualArrayLike[
-    "dtype[str_]",
+    dtype[str_],
     str,
 ]
 _ArrayLikeBytes_co = _DualArrayLike[
-    "dtype[bytes_]",
+    dtype[bytes_],
     bytes,
 ]
 
 _ArrayLikeInt = _DualArrayLike[
-    "dtype[integer[Any]]",
+    dtype[integer[Any]],
     int,
 ]
 
@@ -153,6 +159,6 @@ class _UnknownType:
 
 
 _ArrayLikeUnknown = _DualArrayLike[
-    "dtype[_UnknownType]",
+    dtype[_UnknownType],
     _UnknownType,
 ]
diff --git a/numpy/_typing/_callable.pyi b/numpy/_typing/_callable.pyi
index 5dfe9e7c2..ee818e905 100644
--- a/numpy/_typing/_callable.pyi
+++ b/numpy/_typing/_callable.pyi
@@ -43,7 +43,7 @@ from ._scalars import (
     _NumberLike_co,
 )
 from . import NBitBase
-from ._generic_alias import NDArray
+from ._array_like import NDArray
 from ._nested_sequence import _NestedSequence
 
 _T1 = TypeVar("_T1")
diff --git a/numpy/_typing/_dtype_like.py b/numpy/_typing/_dtype_like.py
index e92e17dd2..207a99c56 100644
--- a/numpy/_typing/_dtype_like.py
+++ b/numpy/_typing/_dtype_like.py
@@ -1,10 +1,8 @@
+from collections.abc import Sequence
 from typing import (
     Any,
-    List,
     Sequence,
-    Tuple,
     Union,
-    Type,
     TypeVar,
     Protocol,
     TypedDict,
@@ -14,7 +12,6 @@ from typing import (
 import numpy as np
 
 from ._shape import _ShapeLike
-from ._generic_alias import _DType as DType
 
 from ._char_codes import (
     _BoolCodes,
@@ -59,7 +56,7 @@ from ._char_codes import (
 )
 
 _SCT = TypeVar("_SCT", bound=np.generic)
-_DType_co = TypeVar("_DType_co", covariant=True, bound=DType[Any])
+_DType_co = TypeVar("_DType_co", covariant=True, bound=np.dtype[Any])
 
 _DTypeLikeNested = Any  # TODO: wait for support for recursive types
 
@@ -89,41 +86,41 @@ class _SupportsDType(Protocol[_DType_co]):
 
 # A subset of `npt.DTypeLike` that can be parametrized w.r.t. `np.generic`
 _DTypeLike = Union[
-    "np.dtype[_SCT]",
-    Type[_SCT],
-    _SupportsDType["np.dtype[_SCT]"],
+    np.dtype[_SCT],
+    type[_SCT],
+    _SupportsDType[np.dtype[_SCT]],
 ]
 
 
 # Would create a dtype[np.void]
 _VoidDTypeLike = Union[
     # (flexible_dtype, itemsize)
-    Tuple[_DTypeLikeNested, int],
+    tuple[_DTypeLikeNested, int],
     # (fixed_dtype, shape)
-    Tuple[_DTypeLikeNested, _ShapeLike],
+    tuple[_DTypeLikeNested, _ShapeLike],
     # [(field_name, field_dtype, field_shape), ...]
     #
     # The type here is quite broad because NumPy accepts quite a wide
     # range of inputs inside the list; see the tests for some
     # examples.
-    List[Any],
+    list[Any],
     # {'names': ..., 'formats': ..., 'offsets': ..., 'titles': ...,
     #  'itemsize': ...}
     _DTypeDict,
     # (base_dtype, new_dtype)
-    Tuple[_DTypeLikeNested, _DTypeLikeNested],
+    tuple[_DTypeLikeNested, _DTypeLikeNested],
 ]
 
 # Anything that can be coerced into numpy.dtype.
 # Reference: https://docs.scipy.org/doc/numpy/reference/arrays.dtypes.html
 DTypeLike = Union[
-    DType[Any],
+    np.dtype[Any],
     # default data type (float64)
     None,
     # array-scalar types and generic types
-    Type[Any],  # NOTE: We're stuck with `Type[Any]` due to object dtypes
+    type[Any],  # NOTE: We're stuck with `type[Any]` due to object dtypes
     # anything with a dtype attribute
-    _SupportsDType[DType[Any]],
+    _SupportsDType[np.dtype[Any]],
     # character codes, type strings or comma-separated fields, e.g., 'float64'
     str,
     _VoidDTypeLike,
@@ -139,16 +136,16 @@ DTypeLike = Union[
 # Aliases for commonly used dtype-like objects.
 # Note that the precision of `np.number` subclasses is ignored herein.
 _DTypeLikeBool = Union[
-    Type[bool],
-    Type[np.bool_],
-    DType[np.bool_],
-    _SupportsDType[DType[np.bool_]],
+    type[bool],
+    type[np.bool_],
+    np.dtype[np.bool_],
+    _SupportsDType[np.dtype[np.bool_]],
     _BoolCodes,
 ]
 _DTypeLikeUInt = Union[
-    Type[np.unsignedinteger],
-    DType[np.unsignedinteger],
-    _SupportsDType[DType[np.unsignedinteger]],
+    type[np.unsignedinteger],
+    np.dtype[np.unsignedinteger],
+    _SupportsDType[np.dtype[np.unsignedinteger]],
     _UInt8Codes,
     _UInt16Codes,
     _UInt32Codes,
@@ -161,10 +158,10 @@ _DTypeLikeUInt = Union[
     _ULongLongCodes,
 ]
 _DTypeLikeInt = Union[
-    Type[int],
-    Type[np.signedinteger],
-    DType[np.signedinteger],
-    _SupportsDType[DType[np.signedinteger]],
+    type[int],
+    type[np.signedinteger],
+    np.dtype[np.signedinteger],
+    _SupportsDType[np.dtype[np.signedinteger]],
     _Int8Codes,
     _Int16Codes,
     _Int32Codes,
@@ -177,10 +174,10 @@ _DTypeLikeInt = Union[
     _LongLongCodes,
 ]
 _DTypeLikeFloat = Union[
-    Type[float],
-    Type[np.floating],
-    DType[np.floating],
-    _SupportsDType[DType[np.floating]],
+    type[float],
+    type[np.floating],
+    np.dtype[np.floating],
+    _SupportsDType[np.dtype[np.floating]],
     _Float16Codes,
     _Float32Codes,
     _Float64Codes,
@@ -190,10 +187,10 @@ _DTypeLikeFloat = Union[
     _LongDoubleCodes,
 ]
 _DTypeLikeComplex = Union[
-    Type[complex],
-    Type[np.complexfloating],
-    DType[np.complexfloating],
-    _SupportsDType[DType[np.complexfloating]],
+    type[complex],
+    type[np.complexfloating],
+    np.dtype[np.complexfloating],
+    _SupportsDType[np.dtype[np.complexfloating]],
     _Complex64Codes,
     _Complex128Codes,
     _CSingleCodes,
@@ -201,42 +198,42 @@ _DTypeLikeComplex = Union[
     _CLongDoubleCodes,
 ]
 _DTypeLikeDT64 = Union[
-    Type[np.timedelta64],
-    DType[np.timedelta64],
-    _SupportsDType[DType[np.timedelta64]],
+    type[np.timedelta64],
+    np.dtype[np.timedelta64],
+    _SupportsDType[np.dtype[np.timedelta64]],
     _TD64Codes,
 ]
 _DTypeLikeTD64 = Union[
-    Type[np.datetime64],
-    DType[np.datetime64],
-    _SupportsDType[DType[np.datetime64]],
+    type[np.datetime64],
+    np.dtype[np.datetime64],
+    _SupportsDType[np.dtype[np.datetime64]],
     _DT64Codes,
 ]
 _DTypeLikeStr = Union[
-    Type[str],
-    Type[np.str_],
-    DType[np.str_],
-    _SupportsDType[DType[np.str_]],
+    type[str],
+    type[np.str_],
+    np.dtype[np.str_],
+    _SupportsDType[np.dtype[np.str_]],
     _StrCodes,
 ]
 _DTypeLikeBytes = Union[
-    Type[bytes],
-    Type[np.bytes_],
-    DType[np.bytes_],
-    _SupportsDType[DType[np.bytes_]],
+    type[bytes],
+    type[np.bytes_],
+    np.dtype[np.bytes_],
+    _SupportsDType[np.dtype[np.bytes_]],
     _BytesCodes,
 ]
 _DTypeLikeVoid = Union[
-    Type[np.void],
-    DType[np.void],
-    _SupportsDType[DType[np.void]],
+    type[np.void],
+    np.dtype[np.void],
+    _SupportsDType[np.dtype[np.void]],
     _VoidCodes,
     _VoidDTypeLike,
 ]
 _DTypeLikeObject = Union[
     type,
-    DType[np.object_],
-    _SupportsDType[DType[np.object_]],
+    np.dtype[np.object_],
+    _SupportsDType[np.dtype[np.object_]],
     _ObjectCodes,
 ]
 
diff --git a/numpy/_typing/_extended_precision.py b/numpy/_typing/_extended_precision.py
index edc1778ce..7246b47d0 100644
--- a/numpy/_typing/_extended_precision.py
+++ b/numpy/_typing/_extended_precision.py
@@ -5,8 +5,6 @@ The subclasses are defined here (instead of ``__init__.pyi``) such
 that they can be imported conditionally via the numpy's mypy plugin.
 """
 
-from typing import TYPE_CHECKING
-
 import numpy as np
 from . import (
     _80Bit,
@@ -15,29 +13,15 @@ from . import (
     _256Bit,
 )
 
-if TYPE_CHECKING:
-    uint128 = np.unsignedinteger[_128Bit]
-    uint256 = np.unsignedinteger[_256Bit]
-    int128 = np.signedinteger[_128Bit]
-    int256 = np.signedinteger[_256Bit]
-    float80 = np.floating[_80Bit]
-    float96 = np.floating[_96Bit]
-    float128 = np.floating[_128Bit]
-    float256 = np.floating[_256Bit]
-    complex160 = np.complexfloating[_80Bit, _80Bit]
-    complex192 = np.complexfloating[_96Bit, _96Bit]
-    complex256 = np.complexfloating[_128Bit, _128Bit]
-    complex512 = np.complexfloating[_256Bit, _256Bit]
-else:
-    uint128 = Any
-    uint256 = Any
-    int128 = Any
-    int256 = Any
-    float80 = Any
-    float96 = Any
-    float128 = Any
-    float256 = Any
-    complex160 = Any
-    complex192 = Any
-    complex256 = Any
-    complex512 = Any
+uint128 = np.unsignedinteger[_128Bit]
+uint256 = np.unsignedinteger[_256Bit]
+int128 = np.signedinteger[_128Bit]
+int256 = np.signedinteger[_256Bit]
+float80 = np.floating[_80Bit]
+float96 = np.floating[_96Bit]
+float128 = np.floating[_128Bit]
+float256 = np.floating[_256Bit]
+complex160 = np.complexfloating[_80Bit, _80Bit]
+complex192 = np.complexfloating[_96Bit, _96Bit]
+complex256 = np.complexfloating[_128Bit, _128Bit]
+complex512 = np.complexfloating[_256Bit, _256Bit]
diff --git a/numpy/_typing/_generic_alias.py b/numpy/_typing/_generic_alias.py
deleted file mode 100644
index 01cd224ad..000000000
--- a/numpy/_typing/_generic_alias.py
+++ /dev/null
@@ -1,245 +0,0 @@
-from __future__ import annotations
-
-import sys
-import types
-from collections.abc import Generator, Iterable, Iterator
-from typing import (
-    Any,
-    ClassVar,
-    NoReturn,
-    TypeVar,
-    TYPE_CHECKING,
-)
-
-import numpy as np
-
-__all__ = ["_GenericAlias", "NDArray"]
-
-_T = TypeVar("_T", bound="_GenericAlias")
-
-
-def _to_str(obj: object) -> str:
-    """Helper function for `_GenericAlias.__repr__`."""
-    if obj is Ellipsis:
-        return '...'
-    elif isinstance(obj, type) and not isinstance(obj, _GENERIC_ALIAS_TYPE):
-        if obj.__module__ == 'builtins':
-            return obj.__qualname__
-        else:
-            return f'{obj.__module__}.{obj.__qualname__}'
-    else:
-        return repr(obj)
-
-
-def _parse_parameters(args: Iterable[Any]) -> Generator[TypeVar, None, None]:
-    """Search for all typevars and typevar-containing objects in `args`.
-
-    Helper function for `_GenericAlias.__init__`.
-
-    """
-    for i in args:
-        if hasattr(i, "__parameters__"):
-            yield from i.__parameters__
-        elif isinstance(i, TypeVar):
-            yield i
-
-
-def _reconstruct_alias(alias: _T, parameters: Iterator[TypeVar]) -> _T:
-    """Recursively replace all typevars with those from `parameters`.
-
-    Helper function for `_GenericAlias.__getitem__`.
-
-    """
-    args = []
-    for i in alias.__args__:
-        if isinstance(i, TypeVar):
-            value: Any = next(parameters)
-        elif isinstance(i, _GenericAlias):
-            value = _reconstruct_alias(i, parameters)
-        elif hasattr(i, "__parameters__"):
-            prm_tup = tuple(next(parameters) for _ in i.__parameters__)
-            value = i[prm_tup]
-        else:
-            value = i
-        args.append(value)
-
-    cls = type(alias)
-    return cls(alias.__origin__, tuple(args), alias.__unpacked__)
-
-
-class _GenericAlias:
-    """A python-based backport of the `types.GenericAlias` class.
-
-    E.g. for ``t = list[int]``, ``t.__origin__`` is ``list`` and
-    ``t.__args__`` is ``(int,)``.
-
-    See Also
-    --------
-    :pep:`585`
-        The PEP responsible for introducing `types.GenericAlias`.
-
-    """
-
-    __slots__ = (
-        "__weakref__",
-        "_origin",
-        "_args",
-        "_parameters",
-        "_hash",
-        "_starred",
-    )
-
-    @property
-    def __origin__(self) -> type:
-        return super().__getattribute__("_origin")
-
-    @property
-    def __args__(self) -> tuple[object, ...]:
-        return super().__getattribute__("_args")
-
-    @property
-    def __parameters__(self) -> tuple[TypeVar, ...]:
-        """Type variables in the ``GenericAlias``."""
-        return super().__getattribute__("_parameters")
-
-    @property
-    def __unpacked__(self) -> bool:
-        return super().__getattribute__("_starred")
-
-    @property
-    def __typing_unpacked_tuple_args__(self) -> tuple[object, ...] | None:
-        # NOTE: This should return `__args__` if `__origin__` is a tuple,
-        # which should never be the case with how `_GenericAlias` is used
-        # within numpy
-        return None
-
-    def __init__(
-        self,
-        origin: type,
-        args: object | tuple[object, ...],
-        starred: bool = False,
-    ) -> None:
-        self._origin = origin
-        self._args = args if isinstance(args, tuple) else (args,)
-        self._parameters = tuple(_parse_parameters(self.__args__))
-        self._starred = starred
-
-    @property
-    def __call__(self) -> type[Any]:
-        return self.__origin__
-
-    def __reduce__(self: _T) -> tuple[
-        type[_T],
-        tuple[type[Any], tuple[object, ...], bool],
-    ]:
-        cls = type(self)
-        return cls, (self.__origin__, self.__args__, self.__unpacked__)
-
-    def __mro_entries__(self, bases: Iterable[object]) -> tuple[type[Any]]:
-        return (self.__origin__,)
-
-    def __dir__(self) -> list[str]:
-        """Implement ``dir(self)``."""
-        cls = type(self)
-        dir_origin = set(dir(self.__origin__))
-        return sorted(cls._ATTR_EXCEPTIONS | dir_origin)
-
-    def __hash__(self) -> int:
-        """Return ``hash(self)``."""
-        # Attempt to use the cached hash
-        try:
-            return super().__getattribute__("_hash")
-        except AttributeError:
-            self._hash: int = (
-                hash(self.__origin__) ^
-                hash(self.__args__) ^
-                hash(self.__unpacked__)
-            )
-            return super().__getattribute__("_hash")
-
-    def __instancecheck__(self, obj: object) -> NoReturn:
-        """Check if an `obj` is an instance."""
-        raise TypeError("isinstance() argument 2 cannot be a "
-                        "parameterized generic")
-
-    def __subclasscheck__(self, cls: type) -> NoReturn:
-        """Check if a `cls` is a subclass."""
-        raise TypeError("issubclass() argument 2 cannot be a "
-                        "parameterized generic")
-
-    def __repr__(self) -> str:
-        """Return ``repr(self)``."""
-        args = ", ".join(_to_str(i) for i in self.__args__)
-        origin = _to_str(self.__origin__)
-        prefix = "*" if self.__unpacked__ else ""
-        return f"{prefix}{origin}[{args}]"
-
-    def __getitem__(self: _T, key: object | tuple[object, ...]) -> _T:
-        """Return ``self[key]``."""
-        key_tup = key if isinstance(key, tuple) else (key,)
-
-        if len(self.__parameters__) == 0:
-            raise TypeError(f"There are no type variables left in {self}")
-        elif len(key_tup) > len(self.__parameters__):
-            raise TypeError(f"Too many arguments for {self}")
-        elif len(key_tup) < len(self.__parameters__):
-            raise TypeError(f"Too few arguments for {self}")
-
-        key_iter = iter(key_tup)
-        return _reconstruct_alias(self, key_iter)
-
-    def __eq__(self, value: object) -> bool:
-        """Return ``self == value``."""
-        if not isinstance(value, _GENERIC_ALIAS_TYPE):
-            return NotImplemented
-        return (
-            self.__origin__ == value.__origin__ and
-            self.__args__ == value.__args__ and
-            self.__unpacked__ == getattr(
-                value, "__unpacked__", self.__unpacked__
-            )
-        )
-
-    def __iter__(self: _T) -> Generator[_T, None, None]:
-        """Return ``iter(self)``."""
-        cls = type(self)
-        yield cls(self.__origin__, self.__args__, True)
-
-    _ATTR_EXCEPTIONS: ClassVar[frozenset[str]] = frozenset({
-        "__origin__",
-        "__args__",
-        "__parameters__",
-        "__mro_entries__",
-        "__reduce__",
-        "__reduce_ex__",
-        "__copy__",
-        "__deepcopy__",
-        "__unpacked__",
-        "__typing_unpacked_tuple_args__",
-        "__class__",
-    })
-
-    def __getattribute__(self, name: str) -> Any:
-        """Return ``getattr(self, name)``."""
-        # Pull the attribute from `__origin__` unless its
-        # name is in `_ATTR_EXCEPTIONS`
-        cls = type(self)
-        if name in cls._ATTR_EXCEPTIONS:
-            return super().__getattribute__(name)
-        return getattr(self.__origin__, name)
-
-
-# See `_GenericAlias.__eq__`
-if sys.version_info >= (3, 9):
-    _GENERIC_ALIAS_TYPE = (_GenericAlias, types.GenericAlias)
-else:
-    _GENERIC_ALIAS_TYPE = (_GenericAlias,)
-
-ScalarType = TypeVar("ScalarType", bound=np.generic, covariant=True)
-
-if TYPE_CHECKING or sys.version_info >= (3, 9):
-    _DType = np.dtype[ScalarType]
-    NDArray = np.ndarray[Any, np.dtype[ScalarType]]
-else:
-    _DType = _GenericAlias(np.dtype, (ScalarType,))
-    NDArray = _GenericAlias(np.ndarray, (Any, _DType))
diff --git a/numpy/_typing/_nested_sequence.py b/numpy/_typing/_nested_sequence.py
index 789bf3844..4b6cafc51 100644
--- a/numpy/_typing/_nested_sequence.py
+++ b/numpy/_typing/_nested_sequence.py
@@ -2,9 +2,9 @@
 
 from __future__ import annotations
 
+from collections.abc import Iterator
 from typing import (
     Any,
-    Iterator,
     overload,
     TypeVar,
     Protocol,
diff --git a/numpy/_typing/_scalars.py b/numpy/_typing/_scalars.py
index 516b996dc..e46ff04a0 100644
--- a/numpy/_typing/_scalars.py
+++ b/numpy/_typing/_scalars.py
@@ -1,4 +1,4 @@
-from typing import Union, Tuple, Any
+from typing import Union, Any
 
 import numpy as np
 
@@ -10,13 +10,13 @@ _CharLike_co = Union[str, bytes]
 # The 6 `<X>Like_co` type-aliases below represent all scalars that can be
 # coerced into `<X>` (with the casting rule `same_kind`)
 _BoolLike_co = Union[bool, np.bool_]
-_UIntLike_co = Union[_BoolLike_co, np.unsignedinteger]
-_IntLike_co = Union[_BoolLike_co, int, np.integer]
-_FloatLike_co = Union[_IntLike_co, float, np.floating]
-_ComplexLike_co = Union[_FloatLike_co, complex, np.complexfloating]
+_UIntLike_co = Union[_BoolLike_co, np.unsignedinteger[Any]]
+_IntLike_co = Union[_BoolLike_co, int, np.integer[Any]]
+_FloatLike_co = Union[_IntLike_co, float, np.floating[Any]]
+_ComplexLike_co = Union[_FloatLike_co, complex, np.complexfloating[Any, Any]]
 _TD64Like_co = Union[_IntLike_co, np.timedelta64]
 
-_NumberLike_co = Union[int, float, complex, np.number, np.bool_]
+_NumberLike_co = Union[int, float, complex, np.number[Any], np.bool_]
 _ScalarLike_co = Union[
     int,
     float,
@@ -27,4 +27,4 @@ _ScalarLike_co = Union[
 ]
 
 # `_VoidLike_co` is technically not a scalar, but it's close enough
-_VoidLike_co = Union[Tuple[Any, ...], np.void]
+_VoidLike_co = Union[tuple[Any, ...], np.void]
diff --git a/numpy/_typing/_shape.py b/numpy/_typing/_shape.py
index c28859b19..4f1204e47 100644
--- a/numpy/_typing/_shape.py
+++ b/numpy/_typing/_shape.py
@@ -1,6 +1,7 @@
-from typing import Sequence, Tuple, Union, SupportsIndex
+from collections.abc import Sequence
+from typing import Union, SupportsIndex
 
-_Shape = Tuple[int, ...]
+_Shape = tuple[int, ...]
 
 # Anything that can be coerced to a shape tuple
 _ShapeLike = Union[SupportsIndex, Sequence[SupportsIndex]]
diff --git a/numpy/_utils/__init__.py b/numpy/_utils/__init__.py
new file mode 100644
index 000000000..60703f145
--- /dev/null
+++ b/numpy/_utils/__init__.py
@@ -0,0 +1,27 @@
+"""
+This is a module for defining private helpers which do not depend on the
+rest of NumPy.
+
+Everything in here must be self-contained so that it can be
+imported anywhere else without creating circular imports.
+If a utility requires the import of NumPy, it probably belongs
+in ``numpy.core``.
+"""
+
+
+def set_module(module):
+    """Private decorator for overriding __module__ on a function or class.
+
+    Example usage::
+
+        @set_module('numpy')
+        def example():
+            pass
+
+        assert example.__module__ == 'numpy'
+    """
+    def decorator(func):
+        if module is not None:
+            func.__module__ = module
+        return func
+    return decorator
diff --git a/numpy/compat/_inspect.py b/numpy/_utils/_inspect.py
index 9a874a71d..9a874a71d 100644
--- a/numpy/compat/_inspect.py
+++ b/numpy/_utils/_inspect.py
diff --git a/numpy/compat/_pep440.py b/numpy/_utils/_pep440.py
index 73d0afb5e..73d0afb5e 100644
--- a/numpy/compat/_pep440.py
+++ b/numpy/_utils/_pep440.py
diff --git a/numpy/array_api/__init__.py b/numpy/array_api/__init__.py
index 5e58ee0a8..e154b9952 100644
--- a/numpy/array_api/__init__.py
+++ b/numpy/array_api/__init__.py
@@ -333,6 +333,10 @@ __all__ += [
     "trunc",
 ]
 
+from ._indexing_functions import take
+
+__all__ += ["take"]
+
 # linalg is an extension in the array API spec, which is a sub-namespace. Only
 # a subset of functions in it are imported into the top-level namespace.
 from . import linalg
diff --git a/numpy/array_api/_array_object.py b/numpy/array_api/_array_object.py
index c4746fad9..a949b5977 100644
--- a/numpy/array_api/_array_object.py
+++ b/numpy/array_api/_array_object.py
@@ -56,7 +56,7 @@ class Array:
     functions, such as asarray().
 
     """
-    _array: np.ndarray
+    _array: np.ndarray[Any, Any]
 
     # Use a custom constructor instead of __init__, as manually initializing
     # this class is not supported API.
@@ -850,23 +850,13 @@ class Array:
         """
         Performs the operation __imatmul__.
         """
-        # Note: NumPy does not implement __imatmul__.
-
         # matmul is not defined for scalars, but without this, we may get
         # the wrong error message from asarray.
         other = self._check_allowed_dtypes(other, "numeric", "__imatmul__")
         if other is NotImplemented:
             return other
-
-        # __imatmul__ can only be allowed when it would not change the shape
-        # of self.
-        other_shape = other.shape
-        if self.shape == () or other_shape == ():
-            raise ValueError("@= requires at least one dimension")
-        if len(other_shape) == 1 or other_shape[-1] != other_shape[-2]:
-            raise ValueError("@= cannot change the shape of the input array")
-        self._array[:] = self._array.__matmul__(other._array)
-        return self
+        res = self._array.__imatmul__(other._array)
+        return self.__class__._new(res)
 
     def __rmatmul__(self: Array, other: Array, /) -> Array:
         """
diff --git a/numpy/array_api/_indexing_functions.py b/numpy/array_api/_indexing_functions.py
new file mode 100644
index 000000000..ba56bcd6f
--- /dev/null
+++ b/numpy/array_api/_indexing_functions.py
@@ -0,0 +1,18 @@
+from __future__ import annotations
+
+from ._array_object import Array
+from ._dtypes import _integer_dtypes
+
+import numpy as np
+
+def take(x: Array, indices: Array, /, *, axis: int) -> Array:
+    """
+    Array API compatible wrapper for :py:func:`np.take <numpy.take>`.
+
+    See its docstring for more information.
+    """    
+    if indices.dtype not in _integer_dtypes:
+        raise TypeError("Only integer dtypes are allowed in indexing")
+    if indices.ndim != 1: 
+        raise ValueError("Only 1-dim indices array is supported")
+    return Array._new(np.take(x._array, indices._array, axis=axis))
diff --git a/numpy/array_api/_manipulation_functions.py b/numpy/array_api/_manipulation_functions.py
index 4f2114ff5..7991f46a2 100644
--- a/numpy/array_api/_manipulation_functions.py
+++ b/numpy/array_api/_manipulation_functions.py
@@ -52,6 +52,7 @@ def permute_dims(x: Array, /, axes: Tuple[int, ...]) -> Array:
     return Array._new(np.transpose(x._array, axes))
 
 
+# Note: the optional argument is called 'shape', not 'newshape'
 def reshape(x: Array, /, shape: Tuple[int, ...]) -> Array:
     """
     Array API compatible wrapper for :py:func:`np.reshape <numpy.reshape>`.
diff --git a/numpy/array_api/_typing.py b/numpy/array_api/_typing.py
index dfa87b358..3f9b7186a 100644
--- a/numpy/array_api/_typing.py
+++ b/numpy/array_api/_typing.py
@@ -17,14 +17,12 @@ __all__ = [
     "PyCapsule",
 ]
 
-import sys
 from typing import (
     Any,
     Literal,
     Sequence,
     Type,
     Union,
-    TYPE_CHECKING,
     TypeVar,
     Protocol,
 )
@@ -51,21 +49,20 @@ class NestedSequence(Protocol[_T_co]):
     def __len__(self, /) -> int: ...
 
 Device = Literal["cpu"]
-if TYPE_CHECKING or sys.version_info >= (3, 9):
-    Dtype = dtype[Union[
-        int8,
-        int16,
-        int32,
-        int64,
-        uint8,
-        uint16,
-        uint32,
-        uint64,
-        float32,
-        float64,
-    ]]
-else:
-    Dtype = dtype
+
+Dtype = dtype[Union[
+    int8,
+    int16,
+    int32,
+    int64,
+    uint8,
+    uint16,
+    uint32,
+    uint64,
+    float32,
+    float64,
+]]
+
 
 SupportsBufferProtocol = Any
 PyCapsule = Any
diff --git a/numpy/array_api/tests/test_indexing_functions.py b/numpy/array_api/tests/test_indexing_functions.py
new file mode 100644
index 000000000..9e05c6386
--- /dev/null
+++ b/numpy/array_api/tests/test_indexing_functions.py
@@ -0,0 +1,24 @@
+import pytest
+
+from numpy import array_api as xp
+
+
+@pytest.mark.parametrize(
+    "x, indices, axis, expected",
+    [
+        ([2, 3], [1, 1, 0], 0,  [3, 3, 2]),
+        ([2, 3], [1, 1, 0], -1, [3, 3, 2]),
+        ([[2, 3]], [1], -1, [[3]]),
+        ([[2, 3]], [0, 0], 0, [[2, 3], [2, 3]]),
+    ],
+)
+def test_take_function(x, indices, axis, expected):
+    """
+    Indices respect relative order of a descending stable-sort
+
+    See https://github.com/numpy/numpy/issues/20778
+    """
+    x = xp.asarray(x)
+    indices = xp.asarray(indices)
+    out = xp.take(x, indices, axis=axis)
+    assert xp.all(out == xp.asarray(expected))
diff --git a/numpy/compat/__init__.py b/numpy/compat/__init__.py
index afee621b8..504f8b003 100644
--- a/numpy/compat/__init__.py
+++ b/numpy/compat/__init__.py
@@ -8,9 +8,10 @@ extensions, which may be included for the following reasons:
   * we may only need a small subset of the copied library/module
 
 """
-from . import _inspect
+
+from .._utils import _inspect
+from .._utils._inspect import getargspec, formatargspec
 from . import py3k
-from ._inspect import getargspec, formatargspec
 from .py3k import *
 
 __all__ = []
diff --git a/numpy/compat/py3k.py b/numpy/compat/py3k.py
index 3d10bb988..d02c9f8fe 100644
--- a/numpy/compat/py3k.py
+++ b/numpy/compat/py3k.py
@@ -47,7 +47,15 @@ def asstr(s):
     return str(s)
 
 def isfileobj(f):
-    return isinstance(f, (io.FileIO, io.BufferedReader, io.BufferedWriter))
+    if not isinstance(f, (io.FileIO, io.BufferedReader, io.BufferedWriter)):
+        return False
+    try:
+        # BufferedReader/Writer may raise OSError when
+        # fetching `fileno()` (e.g. when wrapping BytesIO).
+        f.fileno()
+        return True
+    except OSError:
+        return False
 
 def open_latin1(filename, mode='r'):
     return open(filename, mode=mode, encoding='iso-8859-1')
diff --git a/numpy/compat/tests/test_compat.py b/numpy/compat/tests/test_compat.py
index 2b8acbaa0..d4391565e 100644
--- a/numpy/compat/tests/test_compat.py
+++ b/numpy/compat/tests/test_compat.py
@@ -1,4 +1,5 @@
 from os.path import join
+from io import BufferedReader, BytesIO
 
 from numpy.compat import isfileobj
 from numpy.testing import assert_
@@ -17,3 +18,5 @@ def test_isfileobj():
 
         with open(filename, 'rb') as f:
             assert_(isfileobj(f))
+
+        assert_(isfileobj(BufferedReader(BytesIO())) is False)
diff --git a/numpy/conftest.py b/numpy/conftest.py
index 8aa6587ee..f1a3eda98 100644
--- a/numpy/conftest.py
+++ b/numpy/conftest.py
@@ -30,7 +30,7 @@ hypothesis.settings.register_profile(
 hypothesis.settings.register_profile(
     name="np.test() profile",
     deadline=None, print_blob=True, database=None, derandomize=True,
-    suppress_health_check=hypothesis.HealthCheck.all(),
+    suppress_health_check=list(hypothesis.HealthCheck),
 )
 # Note that the default profile is chosen based on the presence 
 # of pytest.ini, but can be overridden by passing the 
@@ -40,6 +40,8 @@ hypothesis.settings.load_profile(
     "numpy-profile" if os.path.isfile(_pytest_ini) else "np.test() profile"
 )
 
+# The experimentalAPI is used in _umath_tests
+os.environ["NUMPY_EXPERIMENTAL_DTYPE_API"] = "1"
 
 def pytest_configure(config):
     config.addinivalue_line("markers",
diff --git a/numpy/core/__init__.py b/numpy/core/__init__.py
index 748705e33..08e717363 100644
--- a/numpy/core/__init__.py
+++ b/numpy/core/__init__.py
@@ -84,7 +84,6 @@ from .defchararray import chararray
 from . import function_base
 from .function_base import *
 from . import _machar
-from ._machar import *
 from . import getlimits
 from .getlimits import *
 from . import shape_base
@@ -93,7 +92,6 @@ from . import einsumfunc
 from .einsumfunc import *
 del nt
 
-from .fromnumeric import amax as max, amin as min, round_ as round
 from .numeric import absolute as abs
 
 # do this after everything else, to minimize the chance of this misleadingly
@@ -144,22 +142,25 @@ def _DType_reconstruct(scalar_type):
 
 
 def _DType_reduce(DType):
-    # To pickle a DType without having to add top-level names, pickle the
-    # scalar type for now (and assume that reconstruction will be possible).
-    if DType is dtype:
-        return "dtype"  # must pickle `np.dtype` as a singleton.
-    scalar_type = DType.type  # pickle the scalar type for reconstruction
+    # As types/classes, most DTypes can simply be pickled by their name:
+    if not DType._legacy or DType.__module__ == "numpy.dtypes":
+        return DType.__name__
+
+    # However, user defined legacy dtypes (like rational) do not end up in
+    # `numpy.dtypes` as module and do not have a public class at all.
+    # For these, we pickle them by reconstructing them from the scalar type:
+    scalar_type = DType.type
     return _DType_reconstruct, (scalar_type,)
 
 
 def __getattr__(name):
-    # Deprecated 2021-10-20, NumPy 1.22
-    if name == "machar":
+    # Deprecated 2022-11-22, NumPy 1.25.
+    if name == "MachAr":
         warnings.warn(
-            "The `np.core.machar` module is deprecated (NumPy 1.22)",
+            "The `np.core.MachAr` is considered private API (NumPy 1.24)",
             DeprecationWarning, stacklevel=2,
         )
-        return _machar
+        return _machar.MachAr
     raise AttributeError(f"Module {__name__!r} has no attribute {name!r}")
 
 
diff --git a/numpy/core/_add_newdocs.py b/numpy/core/_add_newdocs.py
index 4ff838b77..bd7c4f519 100644
--- a/numpy/core/_add_newdocs.py
+++ b/numpy/core/_add_newdocs.py
@@ -12,6 +12,7 @@ NOTE: Many of the methods of ndarray have corresponding functions.
 from numpy.core.function_base import add_newdoc
 from numpy.core.overrides import array_function_like_doc
 
+
 ###############################################################################
 #
 # flatiter
@@ -798,18 +799,18 @@ add_newdoc('numpy.core.multiarray', 'array',
     ----------
     object : array_like
         An array, any object exposing the array interface, an object whose
-        __array__ method returns an array, or any (nested) sequence.
+        ``__array__`` method returns an array, or any (nested) sequence.
         If object is a scalar, a 0-dimensional array containing object is
         returned.
     dtype : data-type, optional
-        The desired data-type for the array.  If not given, then the type will
-        be determined as the minimum type required to hold the objects in the
-        sequence.
+        The desired data-type for the array. If not given, NumPy will try to use
+        a default ``dtype`` that can represent the values (by applying promotion
+        rules when necessary.)
     copy : bool, optional
         If true (default), then the object is copied.  Otherwise, a copy will
-        only be made if __array__ returns a copy, if obj is a nested sequence,
-        or if a copy is needed to satisfy any of the other requirements
-        (`dtype`, `order`, etc.).
+        only be made if ``__array__`` returns a copy, if obj is a nested
+        sequence, or if a copy is needed to satisfy any of the other
+        requirements (``dtype``, ``order``, etc.).
     order : {'K', 'A', 'C', 'F'}, optional
         Specify the memory layout of the array. If object is not an array, the
         newly created array will be in C order (row major) unless 'F' is
@@ -858,7 +859,7 @@ add_newdoc('numpy.core.multiarray', 'array',
 
     Notes
     -----
-    When order is 'A' and `object` is an array in neither 'C' nor 'F' order,
+    When order is 'A' and ``object`` is an array in neither 'C' nor 'F' order,
     and a copy is forced by a change in dtype, then the order of the result is
     not necessarily 'C' as expected. This is likely a bug.
 
@@ -1780,6 +1781,7 @@ add_newdoc('numpy.core.multiarray', 'arange',
     numpy.linspace : Evenly spaced numbers with careful handling of endpoints.
     numpy.ogrid: Arrays of evenly spaced numbers in N-dimensions.
     numpy.mgrid: Grid-shaped arrays of evenly spaced numbers in N-dimensions.
+    :ref:`how-to-partition`
 
     Examples
     --------
@@ -2702,6 +2704,12 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('nbytes',
     Does not include memory consumed by non-element attributes of the
     array object.
 
+    See Also
+    --------
+    sys.getsizeof
+        Memory consumed by the object itself without parents in case view.
+        This does include memory consumed by non-element attributes.
+
     Examples
     --------
     >>> x = np.zeros((3,5,2), dtype=np.complex128)
@@ -2927,7 +2935,7 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('T',
 
 
 add_newdoc('numpy.core.multiarray', 'ndarray', ('__array__',
-    """ a.__array__([dtype], /) -> reference if type unchanged, copy otherwise.
+    """ a.__array__([dtype], /)
 
     Returns either a new reference to self if dtype is not given or a new array
     of provided data type if dtype is different from the current dtype of the
@@ -2990,10 +2998,6 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('__class_getitem__',
     >>> np.ndarray[Any, np.dtype[Any]]
     numpy.ndarray[typing.Any, numpy.dtype[typing.Any]]
 
-    Notes
-    -----
-    This method is only available for python 3.9 and later.
-
     See Also
     --------
     :pep:`585` : Type hinting generics in standard collections.
@@ -3004,7 +3008,7 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('__class_getitem__',
 
 
 add_newdoc('numpy.core.multiarray', 'ndarray', ('__deepcopy__',
-    """a.__deepcopy__(memo, /) -> Deep copy of array.
+    """a.__deepcopy__(memo, /)
 
     Used if :func:`copy.deepcopy` is called on an array.
 
@@ -5693,6 +5697,165 @@ add_newdoc('numpy.core', 'ufunc', ('at',
 
     """))
 
+add_newdoc('numpy.core', 'ufunc', ('resolve_dtypes',
+    """
+    resolve_dtypes(dtypes, *, signature=None, casting=None, reduction=False)
+
+    Find the dtypes NumPy will use for the operation.  Both input and
+    output dtypes are returned and may differ from those provided.
+
+    .. note::
+
+        This function always applies NEP 50 rules since it is not provided
+        any actual values.  The Python types ``int``, ``float``, and
+        ``complex`` thus behave weak and should be passed for "untyped"
+        Python input.
+
+    Parameters
+    ----------
+    dtypes : tuple of dtypes, None, or literal int, float, complex
+        The input dtypes for each operand.  Output operands can be
+        None, indicating that the dtype must be found.
+    signature : tuple of DTypes or None, optional
+        If given, enforces exact DType (classes) of the specific operand.
+        The ufunc ``dtype`` argument is equivalent to passing a tuple with
+        only output dtypes set.
+    casting : {'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional
+        The casting mode when casting is necessary.  This is identical to
+        the ufunc call casting modes.
+    reduction : boolean
+        If given, the resolution assumes a reduce operation is happening
+        which slightly changes the promotion and type resolution rules.
+        `dtypes` is usually something like ``(None, np.dtype("i2"), None)``
+        for reductions (first input is also the output).
+
+        .. note::
+
+            The default casting mode is "same_kind", however, as of
+            NumPy 1.24, NumPy uses "unsafe" for reductions.
+
+    Returns
+    -------
+    dtypes : tuple of dtypes
+        The dtypes which NumPy would use for the calculation.  Note that
+        dtypes may not match the passed in ones (casting is necessary).
+
+    See Also
+    --------
+    numpy.ufunc._resolve_dtypes_and_context :
+        Similar function to this, but returns additional information which
+        give access to the core C functionality of NumPy.
+
+    Examples
+    --------
+    This API requires passing dtypes, define them for convenience:
+
+    >>> int32 = np.dtype("int32")
+    >>> float32 = np.dtype("float32")
+
+    The typical ufunc call does not pass an output dtype.  `np.add` has two
+    inputs and one output, so leave the output as ``None`` (not provided):
+
+    >>> np.add.resolve_dtypes((int32, float32, None))
+    (dtype('float64'), dtype('float64'), dtype('float64'))
+
+    The loop found uses "float64" for all operands (including the output), the
+    first input would be cast.
+
+    ``resolve_dtypes`` supports "weak" handling for Python scalars by passing
+    ``int``, ``float``, or ``complex``:
+
+    >>> np.add.resolve_dtypes((float32, float, None))
+    (dtype('float32'), dtype('float32'), dtype('float32'))
+
+    Where the Python ``float`` behaves samilar to a Python value ``0.0``
+    in a ufunc call.  (See :ref:`NEP 50 <NEP50>` for details.)
+
+    """))
+
+add_newdoc('numpy.core', 'ufunc', ('_resolve_dtypes_and_context',
+    """
+    _resolve_dtypes_and_context(dtypes, *, signature=None, casting=None, reduction=False)
+
+    See `numpy.ufunc.resolve_dtypes` for parameter information.  This
+    function is considered *unstable*.  You may use it, but the returned
+    information is NumPy version specific and expected to change.
+    Large API/ABI changes are not expected, but a new NumPy version is
+    expected to require updating code using this functionality.
+
+    This function is designed to be used in conjunction with
+    `numpy.ufunc._get_strided_loop`.  The calls are split to mirror the C API
+    and allow future improvements.
+
+    Returns
+    -------
+    dtypes : tuple of dtypes
+    call_info :
+        PyCapsule with all necessary information to get access to low level
+        C calls.  See `numpy.ufunc._get_strided_loop` for more information.
+
+    """))
+
+add_newdoc('numpy.core', 'ufunc', ('_get_strided_loop',
+    """
+    _get_strided_loop(call_info, /, *, fixed_strides=None)
+
+    This function fills in the ``call_info`` capsule to include all
+    information necessary to call the low-level strided loop from NumPy.
+
+    See notes for more information.
+
+    Parameters
+    ----------
+    call_info : PyCapsule
+        The PyCapsule returned by `numpy.ufunc._resolve_dtypes_and_context`.
+    fixed_strides : tuple of int or None, optional
+        A tuple with fixed byte strides of all input arrays.  NumPy may use
+        this information to find specialized loops, so any call must follow
+        the given stride.  Use ``None`` to indicate that the stride is not
+        known (or not fixed) for all calls.
+
+    Notes
+    -----
+    Together with `numpy.ufunc._resolve_dtypes_and_context` this function
+    gives low-level access to the NumPy ufunc loops.
+    The first function does general preparation and returns the required
+    information. It returns this as a C capsule with the version specific
+    name ``numpy_1.24_ufunc_call_info``.
+    The NumPy 1.24 ufunc call info capsule has the following layout::
+
+        typedef struct {
+            PyArrayMethod_StridedLoop *strided_loop;
+            PyArrayMethod_Context *context;
+            NpyAuxData *auxdata;
+
+            /* Flag information (expected to change) */
+            npy_bool requires_pyapi;  /* GIL is required by loop */
+
+            /* Loop doesn't set FPE flags; if not set check FPE flags */
+            npy_bool no_floatingpoint_errors;
+        } ufunc_call_info;
+
+    Note that the first call only fills in the ``context``.  The call to
+    ``_get_strided_loop`` fills in all other data.
+    Please see the ``numpy/experimental_dtype_api.h`` header for exact
+    call information; the main thing to note is that the new-style loops
+    return 0 on success, -1 on failure.  They are passed context as new
+    first input and ``auxdata`` as (replaced) last.
+
+    Only the ``strided_loop``signature is considered guaranteed stable
+    for NumPy bug-fix releases.  All other API is tied to the experimental
+    API versioning.
+
+    The reason for the split call is that cast information is required to
+    decide what the fixed-strides will be.
+
+    NumPy ties the lifetime of the ``auxdata`` information to the capsule.
+
+    """))
+
+
+
 ##############################################################################
 #
 # Documentation for dtype attributes and methods
@@ -6333,10 +6496,6 @@ add_newdoc('numpy.core.multiarray', 'dtype', ('__class_getitem__',
     >>> np.dtype[np.int64]
     numpy.dtype[numpy.int64]
 
-    Notes
-    -----
-    This method is only available for python 3.9 and later.
-
     See Also
     --------
     :pep:`585` : Type hinting generics in standard collections.
@@ -6846,10 +7005,6 @@ add_newdoc('numpy.core.numerictypes', 'number', ('__class_getitem__',
     >>> np.signedinteger[Any]
     numpy.signedinteger[typing.Any]
 
-    Notes
-    -----
-    This method is only available for python 3.9 and later.
-
     See Also
     --------
     :pep:`585` : Type hinting generics in standard collections.
diff --git a/numpy/core/_add_newdocs_scalars.py b/numpy/core/_add_newdocs_scalars.py
index 491052fae..f9a6ad963 100644
--- a/numpy/core/_add_newdocs_scalars.py
+++ b/numpy/core/_add_newdocs_scalars.py
@@ -93,7 +93,7 @@ def add_newdoc_for_scalar_type(obj, fixed_aliases, doc):
     add_newdoc('numpy.core.numerictypes', obj, docstring)
 
 
-add_newdoc_for_scalar_type('bool_', ['bool8'],
+add_newdoc_for_scalar_type('bool_', [],
     """
     Boolean type (True or False), stored as a byte.
 
@@ -204,7 +204,11 @@ add_newdoc_for_scalar_type('str_', ['unicode_'],
     r"""
     A unicode string.
 
-    When used in arrays, this type strips trailing null codepoints.
+    This type strips trailing null codepoints.
+
+    >>> s = np.str_("abc\x00")
+    >>> s
+    'abc'
 
     Unlike the builtin `str`, this supports the :ref:`python:bufferobjects`, exposing its
     contents as UCS4:
@@ -225,16 +229,52 @@ add_newdoc_for_scalar_type('bytes_', ['string_'],
 
 add_newdoc_for_scalar_type('void', [],
     r"""
-    Either an opaque sequence of bytes, or a structure.
+    np.void(length_or_data, /, dtype=None)
+
+    Create a new structured or unstructured void scalar.
+
+    Parameters
+    ----------
+    length_or_data : int, array-like, bytes-like, object
+       One of multiple meanings (see notes).  The length or
+       bytes data of an unstructured void.  Or alternatively,
+       the data to be stored in the new scalar when `dtype`
+       is provided.
+       This can be an array-like, in which case an array may
+       be returned.
+    dtype : dtype, optional
+        If provided the dtype of the new scalar.  This dtype must
+        be "void" dtype (i.e. a structured or unstructured void,
+        see also :ref:`defining-structured-types`).
+
+       ..versionadded:: 1.24
+
+    Notes
+    -----
+    For historical reasons and because void scalars can represent both
+    arbitrary byte data and structured dtypes, the void constructor
+    has three calling conventions:
+
+    1. ``np.void(5)`` creates a ``dtype="V5"`` scalar filled with five
+       ``\0`` bytes.  The 5 can be a Python or NumPy integer.
+    2. ``np.void(b"bytes-like")`` creates a void scalar from the byte string.
+       The dtype itemsize will match the byte string length, here ``"V10"``.
+    3. When a ``dtype=`` is passed the call is roughly the same as an
+       array creation.  However, a void scalar rather than array is returned.
+
+    Please see the examples which show all three different conventions.
 
+    Examples
+    --------
+    >>> np.void(5)
+    void(b'\x00\x00\x00\x00\x00')
     >>> np.void(b'abcd')
     void(b'\x61\x62\x63\x64')
+    >>> np.void((5, 3.2, "eggs"), dtype="i,d,S5")
+    (5, 3.2, b'eggs')  # looks like a tuple, but is `np.void`
+    >>> np.void(3, dtype=[('x', np.int8), ('y', np.int8)])
+    (3, 3)  # looks like a tuple, but is `np.void`
 
-    Structured `void` scalars can only be constructed via extraction from :ref:`structured_arrays`:
-
-    >>> arr = np.array((1, 2), dtype=[('x', np.int8), ('y', np.int8)])
-    >>> arr[()]
-    (1, 2)  # looks like a tuple, but is `np.void`
     """)
 
 add_newdoc_for_scalar_type('datetime64', [],
diff --git a/numpy/core/_asarray.py b/numpy/core/_asarray.py
index cbaab8c3f..a9abc5a88 100644
--- a/numpy/core/_asarray.py
+++ b/numpy/core/_asarray.py
@@ -24,10 +24,6 @@ POSSIBLE_FLAGS = {
 }
 
 
-def _require_dispatcher(a, dtype=None, requirements=None, *, like=None):
-    return (like,)
-
-
 @set_array_function_like_doc
 @set_module('numpy')
 def require(a, dtype=None, requirements=None, *, like=None):
@@ -100,10 +96,10 @@ def require(a, dtype=None, requirements=None, *, like=None):
     """
     if like is not None:
         return _require_with_like(
+            like,
             a,
             dtype=dtype,
             requirements=requirements,
-            like=like,
         )
 
     if not requirements:
@@ -135,6 +131,4 @@ def require(a, dtype=None, requirements=None, *, like=None):
     return arr
 
 
-_require_with_like = array_function_dispatch(
-    _require_dispatcher
-)(require)
+_require_with_like = array_function_dispatch()(require)
diff --git a/numpy/core/_asarray.pyi b/numpy/core/_asarray.pyi
index 473bc037c..69d1528d4 100644
--- a/numpy/core/_asarray.pyi
+++ b/numpy/core/_asarray.pyi
@@ -1,10 +1,10 @@
 from collections.abc import Iterable
-from typing import TypeVar, Union, overload, Literal
+from typing import Any, TypeVar, Union, overload, Literal
 
 from numpy import ndarray
 from numpy._typing import DTypeLike, _SupportsArrayFunc
 
-_ArrayType = TypeVar("_ArrayType", bound=ndarray)
+_ArrayType = TypeVar("_ArrayType", bound=ndarray[Any, Any])
 
 _Requirements = Literal[
     "C", "C_CONTIGUOUS", "CONTIGUOUS",
@@ -31,7 +31,7 @@ def require(
     requirements: _E | Iterable[_RequirementsWithE] = ...,
     *,
     like: _SupportsArrayFunc = ...
-) -> ndarray: ...
+) -> ndarray[Any, Any]: ...
 @overload
 def require(
     a: object,
@@ -39,4 +39,4 @@ def require(
     requirements: None | _Requirements | Iterable[_Requirements] = ...,
     *,
     like: _SupportsArrayFunc = ...
-) -> ndarray: ...
+) -> ndarray[Any, Any]: ...
diff --git a/numpy/core/_dtype.py b/numpy/core/_dtype.py
index 3db80c17e..ff50f5199 100644
--- a/numpy/core/_dtype.py
+++ b/numpy/core/_dtype.py
@@ -114,13 +114,13 @@ def _scalar_str(dtype, short):
         # platforms, so it should never include the itemsize here.
         return "'O'"
 
-    elif dtype.type == np.string_:
+    elif dtype.type == np.bytes_:
         if _isunsized(dtype):
             return "'S'"
         else:
             return "'S%d'" % dtype.itemsize
 
-    elif dtype.type == np.unicode_:
+    elif dtype.type == np.str_:
         if _isunsized(dtype):
             return "'%sU'" % byteorder
         else:
@@ -334,6 +334,8 @@ def _name_includes_bit_suffix(dtype):
     elif dtype.type == np.bool_:
         # implied
         return False
+    elif dtype.type is None:
+        return True
     elif np.issubdtype(dtype, np.flexible) and _isunsized(dtype):
         # unspecified
         return False
@@ -348,7 +350,9 @@ def _name_get(dtype):
         # user dtypes don't promise to do anything special
         return dtype.type.__name__
 
-    if issubclass(dtype.type, np.void):
+    if dtype.kind == '\x00':
+        name = type(dtype).__name__
+    elif issubclass(dtype.type, np.void):
         # historically, void subclasses preserve their name, eg `record64`
         name = dtype.type.__name__
     else:
diff --git a/numpy/core/_exceptions.py b/numpy/core/_exceptions.py
index be3aa8bee..87d4213a6 100644
--- a/numpy/core/_exceptions.py
+++ b/numpy/core/_exceptions.py
@@ -5,7 +5,7 @@ in python where it's easier.
 By putting the formatting in `__str__`, we also avoid paying the cost for
 users who silence the exceptions.
 """
-from numpy.core.overrides import set_module
+from .._utils import set_module
 
 def _unpack_tuple(tup):
     if len(tup) == 1:
@@ -36,36 +36,35 @@ class UFuncTypeError(TypeError):
 
 
 @_display_as_base
-class _UFuncBinaryResolutionError(UFuncTypeError):
-    """ Thrown when a binary resolution fails """
+class _UFuncNoLoopError(UFuncTypeError):
+    """ Thrown when a ufunc loop cannot be found """
     def __init__(self, ufunc, dtypes):
         super().__init__(ufunc)
         self.dtypes = tuple(dtypes)
-        assert len(self.dtypes) == 2
 
     def __str__(self):
         return (
-            "ufunc {!r} cannot use operands with types {!r} and {!r}"
+            "ufunc {!r} did not contain a loop with signature matching types "
+            "{!r} -> {!r}"
         ).format(
-            self.ufunc.__name__, *self.dtypes
+            self.ufunc.__name__,
+            _unpack_tuple(self.dtypes[:self.ufunc.nin]),
+            _unpack_tuple(self.dtypes[self.ufunc.nin:])
         )
 
 
 @_display_as_base
-class _UFuncNoLoopError(UFuncTypeError):
-    """ Thrown when a ufunc loop cannot be found """
+class _UFuncBinaryResolutionError(_UFuncNoLoopError):
+    """ Thrown when a binary resolution fails """
     def __init__(self, ufunc, dtypes):
-        super().__init__(ufunc)
-        self.dtypes = tuple(dtypes)
+        super().__init__(ufunc, dtypes)
+        assert len(self.dtypes) == 2
 
     def __str__(self):
         return (
-            "ufunc {!r} did not contain a loop with signature matching types "
-            "{!r} -> {!r}"
+            "ufunc {!r} cannot use operands with types {!r} and {!r}"
         ).format(
-            self.ufunc.__name__,
-            _unpack_tuple(self.dtypes[:self.ufunc.nin]),
-            _unpack_tuple(self.dtypes[self.ufunc.nin:])
+            self.ufunc.__name__, *self.dtypes
         )
 
 
@@ -114,113 +113,6 @@ class _UFuncOutputCastingError(_UFuncCastingError):
         )
 
 
-# Exception used in shares_memory()
-@set_module('numpy')
-class TooHardError(RuntimeError):
-    """max_work was exceeded.
-
-    This is raised whenever the maximum number of candidate solutions 
-    to consider specified by the ``max_work`` parameter is exceeded.
-    Assigning a finite number to max_work may have caused the operation 
-    to fail.
-
-    """
-    
-    pass
-
-
-@set_module('numpy')
-class AxisError(ValueError, IndexError):
-    """Axis supplied was invalid.
-
-    This is raised whenever an ``axis`` parameter is specified that is larger
-    than the number of array dimensions.
-    For compatibility with code written against older numpy versions, which
-    raised a mixture of `ValueError` and `IndexError` for this situation, this
-    exception subclasses both to ensure that ``except ValueError`` and
-    ``except IndexError`` statements continue to catch `AxisError`.
-
-    .. versionadded:: 1.13
-
-    Parameters
-    ----------
-    axis : int or str
-        The out of bounds axis or a custom exception message.
-        If an axis is provided, then `ndim` should be specified as well.
-    ndim : int, optional
-        The number of array dimensions.
-    msg_prefix : str, optional
-        A prefix for the exception message.
-
-    Attributes
-    ----------
-    axis : int, optional
-        The out of bounds axis or ``None`` if a custom exception
-        message was provided. This should be the axis as passed by
-        the user, before any normalization to resolve negative indices.
-
-        .. versionadded:: 1.22
-    ndim : int, optional
-        The number of array dimensions or ``None`` if a custom exception
-        message was provided.
-
-        .. versionadded:: 1.22
-
-
-    Examples
-    --------
-    >>> array_1d = np.arange(10)
-    >>> np.cumsum(array_1d, axis=1)
-    Traceback (most recent call last):
-      ...
-    numpy.AxisError: axis 1 is out of bounds for array of dimension 1
-
-    Negative axes are preserved:
-
-    >>> np.cumsum(array_1d, axis=-2)
-    Traceback (most recent call last):
-      ...
-    numpy.AxisError: axis -2 is out of bounds for array of dimension 1
-
-    The class constructor generally takes the axis and arrays'
-    dimensionality as arguments:
-
-    >>> print(np.AxisError(2, 1, msg_prefix='error'))
-    error: axis 2 is out of bounds for array of dimension 1
-
-    Alternatively, a custom exception message can be passed:
-
-    >>> print(np.AxisError('Custom error message'))
-    Custom error message
-
-    """
-
-    __slots__ = ("axis", "ndim", "_msg")
-
-    def __init__(self, axis, ndim=None, msg_prefix=None):
-        if ndim is msg_prefix is None:
-            # single-argument form: directly set the error message
-            self._msg = axis
-            self.axis = None
-            self.ndim = None
-        else:
-            self._msg = msg_prefix
-            self.axis = axis
-            self.ndim = ndim
-
-    def __str__(self):
-        axis = self.axis
-        ndim = self.ndim
-
-        if axis is ndim is None:
-            return self._msg
-        else:
-            msg = f"axis {axis} is out of bounds for array of dimension {ndim}"
-            if self._msg is not None:
-                msg = f"{self._msg}: {msg}"
-            return msg
-
-
 @_display_as_base
 class _ArrayMemoryError(MemoryError):
     """ Thrown when an array cannot be allocated"""
diff --git a/numpy/core/_internal.py b/numpy/core/_internal.py
index 85076f3e1..c78385880 100644
--- a/numpy/core/_internal.py
+++ b/numpy/core/_internal.py
@@ -9,6 +9,7 @@ import re
 import sys
 import warnings
 
+from ..exceptions import DTypePromotionError
 from .multiarray import dtype, array, ndarray, promote_types
 try:
     import ctypes
@@ -454,7 +455,8 @@ def _promote_fields(dt1, dt2):
     """
     # Both must be structured and have the same names in the same order
     if (dt1.names is None or dt2.names is None) or dt1.names != dt2.names:
-        raise TypeError("invalid type promotion")
+        raise DTypePromotionError(
+                f"field names `{dt1.names}` and `{dt2.names}` mismatch.")
 
     # if both are identical, we can (maybe!) just return the same dtype.
     identical = dt1 is dt2
@@ -467,7 +469,8 @@ def _promote_fields(dt1, dt2):
 
         # Check that the titles match (if given):
         if field1[2:] != field2[2:]:
-            raise TypeError("invalid type promotion")
+            raise DTypePromotionError(
+                    f"field titles of field '{name}' mismatch")
         if len(field1) == 2:
             new_fields.append((name, new_descr))
         else:
diff --git a/numpy/core/_machar.py b/numpy/core/_machar.py
index 3cc7db278..59d71014f 100644
--- a/numpy/core/_machar.py
+++ b/numpy/core/_machar.py
@@ -7,14 +7,13 @@ Author: Pearu Peterson, September 2003
 """
 __all__ = ['MachAr']
 
-from numpy.core.fromnumeric import any
-from numpy.core._ufunc_config import errstate
-from numpy.core.overrides import set_module
+from .fromnumeric import any
+from ._ufunc_config import errstate
+from .._utils import set_module
 
 # Need to speed this up...especially for longfloat
 
 # Deprecated 2021-10-20, NumPy 1.22
-@set_module('numpy')
 class MachAr:
     """
     Diagnosing machine parameters.
diff --git a/numpy/core/_methods.py b/numpy/core/_methods.py
index 040f02a9d..0fc070b34 100644
--- a/numpy/core/_methods.py
+++ b/numpy/core/_methods.py
@@ -87,79 +87,16 @@ def _count_reduce_items(arr, axis, keepdims=False, where=True):
                         keepdims)
     return items
 
-# Numpy 1.17.0, 2019-02-24
-# Various clip behavior deprecations, marked with _clip_dep as a prefix.
-
-def _clip_dep_is_scalar_nan(a):
-    # guarded to protect circular imports
-    from numpy.core.fromnumeric import ndim
-    if ndim(a) != 0:
-        return False
-    try:
-        return um.isnan(a)
-    except TypeError:
-        return False
-
-def _clip_dep_is_byte_swapped(a):
-    if isinstance(a, mu.ndarray):
-        return not a.dtype.isnative
-    return False
-
-def _clip_dep_invoke_with_casting(ufunc, *args, out=None, casting=None, **kwargs):
-    # normal path
-    if casting is not None:
-        return ufunc(*args, out=out, casting=casting, **kwargs)
-
-    # try to deal with broken casting rules
-    try:
-        return ufunc(*args, out=out, **kwargs)
-    except _exceptions._UFuncOutputCastingError as e:
-        # Numpy 1.17.0, 2019-02-24
-        warnings.warn(
-            "Converting the output of clip from {!r} to {!r} is deprecated. "
-            "Pass `casting=\"unsafe\"` explicitly to silence this warning, or "
-            "correct the type of the variables.".format(e.from_, e.to),
-            DeprecationWarning,
-            stacklevel=2
-        )
-        return ufunc(*args, out=out, casting="unsafe", **kwargs)
-
-def _clip(a, min=None, max=None, out=None, *, casting=None, **kwargs):
+def _clip(a, min=None, max=None, out=None, **kwargs):
     if min is None and max is None:
         raise ValueError("One of max or min must be given")
 
-    # Numpy 1.17.0, 2019-02-24
-    # This deprecation probably incurs a substantial slowdown for small arrays,
-    # it will be good to get rid of it.
-    if not _clip_dep_is_byte_swapped(a) and not _clip_dep_is_byte_swapped(out):
-        using_deprecated_nan = False
-        if _clip_dep_is_scalar_nan(min):
-            min = -float('inf')
-            using_deprecated_nan = True
-        if _clip_dep_is_scalar_nan(max):
-            max = float('inf')
-            using_deprecated_nan = True
-        if using_deprecated_nan:
-            warnings.warn(
-                "Passing `np.nan` to mean no clipping in np.clip has always "
-                "been unreliable, and is now deprecated. "
-                "In future, this will always return nan, like it already does "
-                "when min or max are arrays that contain nan. "
-                "To skip a bound, pass either None or an np.inf of an "
-                "appropriate sign.",
-                DeprecationWarning,
-                stacklevel=2
-            )
-
     if min is None:
-        return _clip_dep_invoke_with_casting(
-            um.minimum, a, max, out=out, casting=casting, **kwargs)
+        return um.minimum(a, max, out=out, **kwargs)
     elif max is None:
-        return _clip_dep_invoke_with_casting(
-            um.maximum, a, min, out=out, casting=casting, **kwargs)
+        return um.maximum(a, min, out=out, **kwargs)
     else:
-        return _clip_dep_invoke_with_casting(
-            um.clip, a, min, max, out=out, casting=casting, **kwargs)
+        return um.clip(a, min, max, out=out, **kwargs)
 
 def _mean(a, axis=None, dtype=None, out=None, keepdims=False, *, where=True):
     arr = asanyarray(a)
diff --git a/numpy/core/_type_aliases.py b/numpy/core/_type_aliases.py
index 9b1dcb8cf..38f1a099e 100644
--- a/numpy/core/_type_aliases.py
+++ b/numpy/core/_type_aliases.py
@@ -107,14 +107,16 @@ def _add_aliases():
         if name in ('longdouble', 'clongdouble') and myname in allTypes:
             continue
 
-        allTypes[myname] = info.type
-
-        # add mapping for both the bit name and the numarray name
-        sctypeDict[myname] = info.type
+        # Add to the main namespace if desired:
+        if bit != 0 and base != "bool":
+            allTypes[myname] = info.type
 
         # add forward, reverse, and string mapping to numarray
         sctypeDict[char] = info.type
 
+        # add mapping for both the bit name
+        sctypeDict[myname] = info.type
+
 
 _add_aliases()
 
@@ -148,8 +150,6 @@ void = allTypes['void']
 #
 def _set_up_aliases():
     type_pairs = [('complex_', 'cdouble'),
-                  ('int0', 'intp'),
-                  ('uint0', 'uintp'),
                   ('single', 'float'),
                   ('csingle', 'cfloat'),
                   ('singlecomplex', 'cfloat'),
@@ -233,7 +233,8 @@ _set_array_types()
 
 # Add additional strings to the sctypeDict
 _toadd = ['int', 'float', 'complex', 'bool', 'object',
-          'str', 'bytes', ('a', 'bytes_')]
+          'str', 'bytes', ('a', 'bytes_'),
+          ('int0', 'intp'), ('uint0', 'uintp')]
 
 for name in _toadd:
     if isinstance(name, tuple):
diff --git a/numpy/core/_type_aliases.pyi b/numpy/core/_type_aliases.pyi
index bbead0cb5..c0b6f1a80 100644
--- a/numpy/core/_type_aliases.pyi
+++ b/numpy/core/_type_aliases.pyi
@@ -1,12 +1,12 @@
-from typing import TypedDict
+from typing import Any, TypedDict
 
 from numpy import generic, signedinteger, unsignedinteger, floating, complexfloating
 
 class _SCTypes(TypedDict):
-    int: list[type[signedinteger]]
-    uint: list[type[unsignedinteger]]
-    float: list[type[floating]]
-    complex: list[type[complexfloating]]
+    int: list[type[signedinteger[Any]]]
+    uint: list[type[unsignedinteger[Any]]]
+    float: list[type[floating[Any]]]
+    complex: list[type[complexfloating[Any, Any]]]
     others: list[type]
 
 sctypeDict: dict[int | str, type[generic]]
diff --git a/numpy/core/_ufunc_config.py b/numpy/core/_ufunc_config.py
index 5aac7ab09..df8213095 100644
--- a/numpy/core/_ufunc_config.py
+++ b/numpy/core/_ufunc_config.py
@@ -7,7 +7,7 @@ import collections.abc
 import contextlib
 import contextvars
 
-from .overrides import set_module
+from .._utils import set_module
 from .umath import (
     UFUNC_BUFSIZE_DEFAULT,
     ERR_IGNORE, ERR_WARN, ERR_RAISE, ERR_CALL, ERR_PRINT, ERR_LOG, ERR_DEFAULT,
diff --git a/numpy/core/arrayprint.py b/numpy/core/arrayprint.py
index d7e9bf795..62cd52707 100644
--- a/numpy/core/arrayprint.py
+++ b/numpy/core/arrayprint.py
@@ -169,7 +169,7 @@ def set_printoptions(precision=None, threshold=None, edgeitems=None,
         - 'longfloat' : 128-bit floats
         - 'complexfloat'
         - 'longcomplexfloat' : composed of two 128-bit floats
-        - 'numpystr' : types `numpy.string_` and `numpy.unicode_`
+        - 'numpystr' : types `numpy.bytes_` and `numpy.str_`
         - 'object' : `np.object_` arrays
 
         Other keys that can be used to set a group of types at once are:
@@ -475,7 +475,7 @@ def _get_format_function(data, **options):
             return formatdict['longcomplexfloat']()
         else:
             return formatdict['complexfloat']()
-    elif issubclass(dtypeobj, (_nt.unicode_, _nt.string_)):
+    elif issubclass(dtypeobj, (_nt.str_, _nt.bytes_)):
         return formatdict['numpystr']()
     elif issubclass(dtypeobj, _nt.datetime64):
         return formatdict['datetime']()
@@ -616,7 +616,7 @@ def array2string(a, max_line_width=None, precision=None,
         - 'complexfloat'
         - 'longcomplexfloat' : composed of two 128-bit floats
         - 'void' : type `numpy.void`
-        - 'numpystr' : types `numpy.string_` and `numpy.unicode_`
+        - 'numpystr' : types `numpy.bytes_` and `numpy.str_`
 
         Other keys that can be used to set a group of types at once are:
 
@@ -724,7 +724,7 @@ def array2string(a, max_line_width=None, precision=None,
         # Deprecation 11-9-2017  v1.14
         warnings.warn("'style' argument is deprecated and no longer functional"
                       " except in 1.13 'legacy' mode",
-                      DeprecationWarning, stacklevel=3)
+                      DeprecationWarning, stacklevel=2)
 
     if options['legacy'] > 113:
         options['linewidth'] -= len(suffix)
@@ -1096,7 +1096,7 @@ def format_float_scientific(x, precision=None, unique=True, trim='k',
         identify the value may be printed and rounded unbiased.
 
         -- versionadded:: 1.21.0
-        
+
     Returns
     -------
     rep : string
@@ -1181,7 +1181,7 @@ def format_float_positional(x, precision=None, unique=True,
         Minimum number of digits to print. Only has an effect if `unique=True`
         in which case additional digits past those necessary to uniquely
         identify the value may be printed, rounding the last additional digit.
-        
+
         -- versionadded:: 1.21.0
 
     Returns
@@ -1339,13 +1339,29 @@ class TimedeltaFormat(_TimelikeFormat):
 
 
 class SubArrayFormat:
-    def __init__(self, format_function):
+    def __init__(self, format_function, **options):
         self.format_function = format_function
+        self.threshold = options['threshold']
+        self.edge_items = options['edgeitems']
+
+    def __call__(self, a):
+        self.summary_insert = "..." if a.size > self.threshold else ""
+        return self.format_array(a)
+
+    def format_array(self, a):
+        if np.ndim(a) == 0:
+            return self.format_function(a)
+
+        if self.summary_insert and a.shape[0] > 2*self.edge_items:
+            formatted = (
+                [self.format_array(a_) for a_ in a[:self.edge_items]]
+                + [self.summary_insert]
+                + [self.format_array(a_) for a_ in a[-self.edge_items:]]
+            )
+        else:
+            formatted = [self.format_array(a_) for a_ in a]
 
-    def __call__(self, arr):
-        if arr.ndim <= 1:
-            return "[" + ", ".join(self.format_function(a) for a in arr) + "]"
-        return "[" + ", ".join(self.__call__(a) for a in arr) + "]"
+        return "[" + ", ".join(formatted) + "]"
 
 
 class StructuredVoidFormat:
@@ -1369,7 +1385,7 @@ class StructuredVoidFormat:
         for field_name in data.dtype.names:
             format_function = _get_format_function(data[field_name], **options)
             if data.dtype[field_name].shape != ():
-                format_function = SubArrayFormat(format_function)
+                format_function = SubArrayFormat(format_function, **options)
             format_functions.append(format_function)
         return cls(format_functions)
 
@@ -1394,10 +1410,6 @@ def _void_scalar_repr(x):
 
 
 _typelessdata = [int_, float_, complex_, bool_]
-if issubclass(intc, int):
-    _typelessdata.append(intc)
-if issubclass(longlong, int):
-    _typelessdata.append(longlong)
 
 
 def dtype_is_implied(dtype):
@@ -1433,6 +1445,10 @@ def dtype_is_implied(dtype):
     if dtype.names is not None:
         return False
 
+    # should care about endianness *unless size is 1* (e.g., int8, bool)
+    if not dtype.isnative:
+        return False
+
     return dtype.type in _typelessdata
 
 
@@ -1457,10 +1473,14 @@ def dtype_short_repr(dtype):
         return "'%s'" % str(dtype)
 
     typename = dtype.name
+    if not dtype.isnative:
+        # deal with cases like dtype('<u2') that are identical to an
+        # established dtype (in this case uint16)
+        # except that they have a different endianness.
+        return "'%s'" % str(dtype)
     # quote typenames which can't be represented as python variable names
     if typename and not (typename[0].isalpha() and typename.isalnum()):
         typename = repr(typename)
-
     return typename
 
 
diff --git a/numpy/core/check_longdouble.c b/numpy/core/check_longdouble.c
new file mode 100644
index 000000000..756ef3fc6
--- /dev/null
+++ b/numpy/core/check_longdouble.c
@@ -0,0 +1,15 @@
+/* "before" is 16 bytes to ensure there's no padding between it and "x".
+ *    We're not expecting any "long double" bigger than 16 bytes or with
+ *       alignment requirements stricter than 16 bytes.  */
+typedef long double test_type;
+
+struct {
+        char         before[16];
+        test_type    x;
+        char         after[8];
+} foo = {
+        { '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
+          '\001', '\043', '\105', '\147', '\211', '\253', '\315', '\357' },
+        -123456789.0,
+        { '\376', '\334', '\272', '\230', '\166', '\124', '\062', '\020' }
+};
diff --git a/numpy/core/code_generators/cversions.txt b/numpy/core/code_generators/cversions.txt
index dddcc148c..e52193d7a 100644
--- a/numpy/core/code_generators/cversions.txt
+++ b/numpy/core/code_generators/cversions.txt
@@ -68,3 +68,6 @@
 # NonNull attributes removed from numpy_api.py
 # Version 16 (NumPy 1.24) No change.
 0x00000010 = 04a7bf1e65350926a0e528798da263c0
+
+# Version 17 (NumPy 1.25) No actual change.
+0x00000011 = ca1aebdad799358149567d9d93cbca09
diff --git a/numpy/core/code_generators/genapi.py b/numpy/core/code_generators/genapi.py
index 68ae30d5b..2cdaba52d 100644
--- a/numpy/core/code_generators/genapi.py
+++ b/numpy/core/code_generators/genapi.py
@@ -6,17 +6,35 @@ See ``find_function`` for how functions should be formatted, and
 specified.
 
 """
-from numpy.distutils.conv_template import process_file as process_c_file
-
 import hashlib
 import io
 import os
 import re
 import sys
+import importlib.util
 import textwrap
 
 from os.path import join
 
+
+def get_processor():
+    # Convoluted because we can't import from numpy.distutils
+    # (numpy is not yet built)
+    conv_template_path = os.path.join(
+        os.path.dirname(__file__),
+        '..', '..', 'distutils', 'conv_template.py'
+    )
+    spec = importlib.util.spec_from_file_location(
+        'conv_template', conv_template_path
+    )
+    mod = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(mod)
+    return mod.process_file
+
+
+process_c_file = get_processor()
+
+
 __docformat__ = 'restructuredtext'
 
 # The files under src/ that are scanned for API functions
@@ -81,6 +99,27 @@ def _repl(str):
     return str.replace('Bool', 'npy_bool')
 
 
+class MinVersion:
+    def __init__(self, version):
+        """ Version should be the normal NumPy version, e.g. "1.25" """
+        major, minor = version.split(".")
+        self.version = f"NPY_{major}_{minor}_API_VERSION"
+
+    def __str__(self):
+        # Used by version hashing:
+        return self.version
+
+    def add_guard(self, name, normal_define):
+        """Wrap a definition behind a version guard"""
+        wrap = textwrap.dedent(f"""
+            #if NPY_FEATURE_VERSION >= {self.version}
+            {{define}}
+            #endif""")
+
+        # we only insert `define` later to avoid confusing dedent:
+        return wrap.format(define=normal_define)
+
+
 class StealRef:
     def __init__(self, arg):
         self.arg = arg # counting from 1
@@ -113,21 +152,6 @@ class Function:
             doccomment = ''
         return '%s%s %s(%s)' % (doccomment, self.return_type, self.name, argstr)
 
-    def to_ReST(self):
-        lines = ['::', '', '  ' + self.return_type]
-        argstr = ',\000'.join([self._format_arg(*a) for a in self.args])
-        name = '  %s' % (self.name,)
-        s = textwrap.wrap('(%s)' % (argstr,), width=72,
-                          initial_indent=name,
-                          subsequent_indent=' ' * (len(name)+1),
-                          break_long_words=False)
-        for l in s:
-            lines.append(l.replace('\000', ' ').rstrip())
-        lines.append('')
-        if self.doc:
-            lines.append(textwrap.dedent(self.doc))
-        return '\n'.join(lines)
-
     def api_hash(self):
         m = hashlib.md5()
         m.update(remove_whitespace(self.return_type))
@@ -389,7 +413,20 @@ class FunctionApi:
     def __init__(self, name, index, annotations, return_type, args, api_name):
         self.name = name
         self.index = index
-        self.annotations = annotations
+
+        self.min_version = None
+        self.annotations = []
+        for annotation in annotations:
+            # String checks, because manual import breaks isinstance
+            if type(annotation).__name__ == "StealRef":
+                self.annotations.append(annotation)
+            elif type(annotation).__name__ == "MinVersion":
+                if self.min_version is not None:
+                    raise ValueError("Two minimum versions specified!")
+                self.min_version = annotation
+            else:
+                raise ValueError(f"unknown annotation {annotation}")
+
         self.return_type = return_type
         self.args = args
         self.api_name = api_name
@@ -401,13 +438,14 @@ class FunctionApi:
         return argstr
 
     def define_from_array_api_string(self):
-        define = """\
-#define %s \\\n        (*(%s (*)(%s)) \\
-         %s[%d])""" % (self.name,
-                                self.return_type,
-                                self._argtypes_string(),
-                                self.api_name,
-                                self.index)
+        arguments = self._argtypes_string()
+        define = textwrap.dedent(f"""\
+            #define {self.name} \\
+                    (*({self.return_type} (*)({arguments})) \\
+                {self.api_name}[{self.index}])""")
+
+        if self.min_version is not None:
+            define = self.min_version.add_guard(self.name, define)
         return define
 
     def array_api_define(self):
@@ -498,7 +536,7 @@ def get_versions_hash():
     d = []
 
     file = os.path.join(os.path.dirname(__file__), 'cversions.txt')
-    with open(file, 'r') as fid:
+    with open(file) as fid:
         for line in fid:
             m = VERRE.match(line)
             if m:
diff --git a/numpy/core/code_generators/generate_numpy_api.py b/numpy/core/code_generators/generate_numpy_api.py
index a57a36a78..bfcb0d0e5 100644
--- a/numpy/core/code_generators/generate_numpy_api.py
+++ b/numpy/core/code_generators/generate_numpy_api.py
@@ -1,6 +1,8 @@
+#!/usr/bin/env python3
 import os
-import genapi
+import argparse
 
+import genapi
 from genapi import \
         TypeApi, GlobalVarApi, FunctionApi, BoolValuesApi
 
@@ -139,19 +141,12 @@ void *PyArray_API[] = {
 };
 """
 
-c_api_header = """
-===========
-NumPy C-API
-===========
-"""
-
 def generate_api(output_dir, force=False):
     basename = 'multiarray_api'
 
     h_file = os.path.join(output_dir, '__%s.h' % basename)
     c_file = os.path.join(output_dir, '__%s.c' % basename)
-    d_file = os.path.join(output_dir, '%s.txt' % basename)
-    targets = (h_file, c_file, d_file)
+    targets = (h_file, c_file)
 
     sources = numpy_api.multiarray_api
 
@@ -165,7 +160,6 @@ def generate_api(output_dir, force=False):
 def do_generate_api(targets, sources):
     header_file = targets[0]
     c_file = targets[1]
-    doc_file = targets[2]
 
     global_vars = sources[0]
     scalar_bool_values = sources[1]
@@ -234,11 +228,30 @@ def do_generate_api(targets, sources):
     s = c_template % ',\n'.join(init_list)
     genapi.write_file(c_file, s)
 
-    # write to documentation
-    s = c_api_header
-    for func in numpyapi_list:
-        s += func.to_ReST()
-        s += '\n\n'
-    genapi.write_file(doc_file, s)
-
     return targets
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-o",
+        "--outdir",
+        type=str,
+        help="Path to the output directory"
+    )
+    parser.add_argument(
+        "-i",
+        "--ignore",
+        type=str,
+        help="An ignored input - may be useful to add a "
+             "dependency between custom targets"
+    )
+    args = parser.parse_args()
+
+    outdir_abs = os.path.join(os.getcwd(), args.outdir)
+
+    generate_api(outdir_abs)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/numpy/core/code_generators/generate_ufunc_api.py b/numpy/core/code_generators/generate_ufunc_api.py
index 04c023675..e03299a52 100644
--- a/numpy/core/code_generators/generate_ufunc_api.py
+++ b/numpy/core/code_generators/generate_ufunc_api.py
@@ -1,9 +1,9 @@
 import os
-import genapi
-
-import numpy_api
+import argparse
 
+import genapi
 from genapi import TypeApi, FunctionApi
+import numpy_api
 
 h_template = r"""
 #ifdef _UMATHMODULE
@@ -30,7 +30,7 @@ static void **PyUFunc_API=NULL;
 
 %s
 
-static NPY_INLINE int
+static inline int
 _import_umath(void)
 {
   PyObject *numpy = PyImport_ImportModule("numpy.core._multiarray_umath");
@@ -122,8 +122,7 @@ def generate_api(output_dir, force=False):
 
     h_file = os.path.join(output_dir, '__%s.h' % basename)
     c_file = os.path.join(output_dir, '__%s.c' % basename)
-    d_file = os.path.join(output_dir, '%s.txt' % basename)
-    targets = (h_file, c_file, d_file)
+    targets = (h_file, c_file)
 
     sources = ['ufunc_api_order.txt']
 
@@ -137,7 +136,6 @@ def generate_api(output_dir, force=False):
 def do_generate_api(targets, sources):
     header_file = targets[0]
     c_file = targets[1]
-    doc_file = targets[2]
 
     ufunc_api_index = genapi.merge_api_dicts((
             numpy_api.ufunc_funcs_api,
@@ -179,15 +177,23 @@ def do_generate_api(targets, sources):
     s = c_template % ',\n'.join(init_list)
     genapi.write_file(c_file, s)
 
-    # Write to documentation
-    s = '''
-=================
-NumPy Ufunc C-API
-=================
-'''
-    for func in ufunc_api_list:
-        s += func.to_ReST()
-        s += '\n\n'
-    genapi.write_file(doc_file, s)
-
     return targets
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-o",
+        "--outdir",
+        type=str,
+        help="Path to the output directory"
+    )
+    args = parser.parse_args()
+
+    outdir_abs = os.path.join(os.getcwd(), args.outdir)
+
+    generate_api(outdir_abs)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index 39d4497b5..a170f83be 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -1,9 +1,16 @@
+"""
+Generate the code to build all the internal ufuncs. At the base is the defdict:
+a dictionary ofUfunc classes. This is fed to make_code to generate
+__umath_generated.c
+"""
 import os
 import re
 import struct
 import sys
 import textwrap
+import argparse
 
+# identity objects
 Zero = "PyLong_FromLong(0)"
 One = "PyLong_FromLong(1)"
 True_ = "(Py_INCREF(Py_True), Py_True)"
@@ -54,9 +61,6 @@ class TypeDescription:
     cfunc_alias : str or none, optional
         Appended to inner loop C function name, e.g., FLOAT_{cfunc_alias}. See make_arrays.
         NOTE: it doesn't support 'astype'
-    simd : list
-        Available SIMD ufunc loops, dispatched at runtime in specified order
-        Currently only supported for simples types (see make_arrays)
     dispatch : str or None, optional
         Dispatch-able source name without its extension '.dispatch.c' that
         contains the definition of ufunc, dispatched at runtime depending on the
@@ -64,7 +68,7 @@ class TypeDescription:
         NOTE: it doesn't support 'astype'
     """
     def __init__(self, type, f=None, in_=None, out=None, astype=None, cfunc_alias=None,
-                 simd=None, dispatch=None):
+                 dispatch=None):
         self.type = type
         self.func_data = f
         if astype is None:
@@ -77,7 +81,6 @@ class TypeDescription:
             out = out.replace('P', type)
         self.out = out
         self.cfunc_alias = cfunc_alias
-        self.simd = simd
         self.dispatch = dispatch
 
     def finish_signature(self, nin, nout):
@@ -89,7 +92,47 @@ class TypeDescription:
         assert len(self.out) == nout
         self.astype = self.astype_dict.get(self.type, None)
 
-_fdata_map = dict(
+
+def _check_order(types1, types2):
+    dtype_order = allP + "O"
+    for t1, t2 in zip(types1, types2):
+        # We have no opinion on object or time ordering for now:
+        if t1 in "OP" or t2 in "OP":
+            return True
+        if t1 in "mM" or t2 in "mM":
+            return True
+
+        t1i = dtype_order.index(t1)
+        t2i = dtype_order.index(t2)
+        if t1i < t2i:
+            return
+        if t2i > t1i:
+            break
+
+    if types1 == "QQ?" and types2 == "qQ?":
+        # Explicitly allow this mixed case, rather than figure out what order
+        # is nicer or how to encode it.
+        return
+
+    raise TypeError(
+            f"Input dtypes are unsorted or duplicate: {types1} and {types2}")
+
+
+def check_td_order(tds):
+    # A quick check for whether the signatures make sense, it happened too
+    # often that SIMD additions added loops that do not even make some sense.
+    # TODO: This should likely be a test and it would be nice if it rejected
+    #       duplicate entries as well (but we have many as of writing this).
+    signatures = [t.in_+t.out for t in tds]
+
+    for prev_i, sign in enumerate(signatures[1:]):
+        if sign in signatures[:prev_i+1]:
+            continue  # allow duplicates...
+
+        _check_order(signatures[prev_i], sign)
+
+
+_floatformat_map = dict(
     e='npy_%sf',
     f='npy_%sf',
     d='npy_%s',
@@ -100,11 +143,14 @@ _fdata_map = dict(
 )
 
 def build_func_data(types, f):
-    func_data = [_fdata_map.get(t, '%s') % (f,) for t in types]
+    func_data = [_floatformat_map.get(t, '%s') % (f,) for t in types]
     return func_data
 
 def TD(types, f=None, astype=None, in_=None, out=None, cfunc_alias=None,
-       simd=None, dispatch=None):
+       dispatch=None):
+    """
+    Generate a TypeDescription instance for each item in types
+    """
     if f is not None:
         if isinstance(f, str):
             func_data = build_func_data(types, f)
@@ -128,12 +174,6 @@ def TD(types, f=None, astype=None, in_=None, out=None, cfunc_alias=None,
         raise ValueError("Number of types and outputs do not match")
     tds = []
     for t, fd, i, o in zip(types, func_data, in_, out):
-        # [(simd-name, list of types)]
-        if simd is not None:
-            simdt = [k for k, v in simd if t in v]
-        else:
-            simdt = []
-
         # [(dispatch file name without extension '.dispatch.c*', list of types)]
         if dispatch:
             dispt = ([k for k, v in dispatch if t in v]+[None])[0]
@@ -141,7 +181,7 @@ def TD(types, f=None, astype=None, in_=None, out=None, cfunc_alias=None,
             dispt = None
         tds.append(TypeDescription(
             t, f=fd, in_=i, out=o, astype=astype, cfunc_alias=cfunc_alias,
-            simd=simdt, dispatch=dispt
+            dispatch=dispt
         ))
     return tds
 
@@ -152,12 +192,15 @@ class Ufunc:
     ----------
     nin : number of input arguments
     nout : number of output arguments
-    identity : identity element for a two-argument function
+    identity : identity element for a two-argument function (like Zero)
     docstring : docstring for the ufunc
-    type_descriptions : list of TypeDescription objects
+    typereso: type resolver function of type PyUFunc_TypeResolutionFunc
+    type_descriptions : TypeDescription objects
+    signature: a generalized ufunc signature (like for matmul)
+    indexed: add indexed loops (ufunc.at) for these type characters
     """
     def __init__(self, nin, nout, identity, docstring, typereso,
-                 *type_descriptions, signature=None):
+                 *type_descriptions, signature=None, indexed=''):
         self.nin = nin
         self.nout = nout
         if identity is None:
@@ -167,11 +210,15 @@ class Ufunc:
         self.typereso = typereso
         self.type_descriptions = []
         self.signature = signature
+        self.indexed = indexed
         for td in type_descriptions:
             self.type_descriptions.extend(td)
         for td in self.type_descriptions:
             td.finish_signature(self.nin, self.nout)
 
+        check_td_order(self.type_descriptions)
+
+
 # String-handling utilities to avoid locale-dependence.
 
 import string
@@ -245,7 +292,8 @@ chartoname = {
     'P': 'OBJECT',
 }
 
-noobj = '?bBhHiIlLqQefdgFDGmM'
+no_obj_bool = 'bBhHiIlLqQefdgFDGmM'
+noobj = '?' + no_obj_bool
 all = '?bBhHiIlLqQefdgFDGOmM'
 
 O = 'O'
@@ -279,6 +327,7 @@ nocmplxO = nocmplx+O
 nocmplxP = nocmplx+P
 notimes_or_obj = bints + inexact
 nodatetime_or_obj = bints + inexact
+no_bool_times_obj = ints + inexact
 
 # Find which code corresponds to int64.
 int64 = ''
@@ -298,35 +347,50 @@ defdict = {
     Ufunc(2, 1, Zero,
           docstrings.get('numpy.core.umath.add'),
           'PyUFunc_AdditionTypeResolver',
-          TD(notimes_or_obj, simd=[('avx2', ints)], dispatch=[('loops_arithm_fp', 'fdFD')]),
+          TD('?', cfunc_alias='logical_or', dispatch=[('loops_logical', '?')]),
+          TD(no_bool_times_obj, dispatch=[
+              ('loops_arithm_fp', 'fdFD'),
+              ('loops_autovec', ints),
+          ]),
           [TypeDescription('M', FullTypeDescr, 'Mm', 'M'),
            TypeDescription('m', FullTypeDescr, 'mm', 'm'),
            TypeDescription('M', FullTypeDescr, 'mM', 'M'),
           ],
           TD(O, f='PyNumber_Add'),
+          indexed=intfltcmplx
           ),
 'subtract':
     Ufunc(2, 1, None, # Zero is only a unit to the right, not the left
           docstrings.get('numpy.core.umath.subtract'),
           'PyUFunc_SubtractionTypeResolver',
-          TD(ints + inexact, simd=[('avx2', ints)], dispatch=[('loops_arithm_fp', 'fdFD')]),
+          TD(no_bool_times_obj, dispatch=[
+              ('loops_arithm_fp', 'fdFD'),
+              ('loops_autovec', ints),
+          ]),
           [TypeDescription('M', FullTypeDescr, 'Mm', 'M'),
            TypeDescription('m', FullTypeDescr, 'mm', 'm'),
            TypeDescription('M', FullTypeDescr, 'MM', 'm'),
           ],
           TD(O, f='PyNumber_Subtract'),
+          indexed=intfltcmplx
           ),
 'multiply':
     Ufunc(2, 1, One,
           docstrings.get('numpy.core.umath.multiply'),
           'PyUFunc_MultiplicationTypeResolver',
-          TD(notimes_or_obj, simd=[('avx2', ints)], dispatch=[('loops_arithm_fp', 'fdFD')]),
+          TD('?', cfunc_alias='logical_and',
+                  dispatch=[('loops_logical', '?')]),
+          TD(no_bool_times_obj, dispatch=[
+              ('loops_arithm_fp', 'fdFD'),
+              ('loops_autovec', ints),
+          ]),
           [TypeDescription('m', FullTypeDescr, 'mq', 'm'),
            TypeDescription('m', FullTypeDescr, 'qm', 'm'),
            TypeDescription('m', FullTypeDescr, 'md', 'm'),
            TypeDescription('m', FullTypeDescr, 'dm', 'm'),
           ],
           TD(O, f='PyNumber_Multiply'),
+          indexed=intfltcmplx
           ),
 #'true_divide' : aliased to divide in umathmodule.c:initumath
 'floor_divide':
@@ -341,6 +405,7 @@ defdict = {
            TypeDescription('m', FullTypeDescr, 'mm', 'q'),
           ],
           TD(O, f='PyNumber_FloorDivide'),
+          indexed=flts + ints
           ),
 'divide':
     Ufunc(2, 1, None, # One is only a unit to the right, not the left
@@ -352,12 +417,16 @@ defdict = {
            TypeDescription('m', FullTypeDescr, 'mm', 'd', cfunc_alias='divide'),
           ],
           TD(O, f='PyNumber_TrueDivide'),
+          indexed=flts
           ),
 'conjugate':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.conjugate'),
           None,
-          TD(ints+flts+cmplx, simd=[('avx2', ints), ('avx512f', cmplxvec)]),
+          TD(ints+flts+cmplx, dispatch=[
+              ('loops_arithm_fp', 'FD'),
+              ('loops_autovec', ints),
+          ]),
           TD(P, f='conjugate'),
           ),
 'fmod':
@@ -372,14 +441,21 @@ defdict = {
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.square'),
           None,
-          TD(ints+inexact, simd=[('avx2', ints), ('avx512f', 'FD')], dispatch=[('loops_unary_fp', 'fd')]),
+          TD(ints+inexact, dispatch=[
+              ('loops_unary_fp', 'fd'),
+              ('loops_arithm_fp', 'FD'),
+              ('loops_autovec', ints),
+          ]),
           TD(O, f='Py_square'),
           ),
 'reciprocal':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.reciprocal'),
           None,
-          TD(ints+inexact, simd=[('avx2', ints)], dispatch=[('loops_unary_fp', 'fd')]),
+          TD(ints+inexact, dispatch=[
+              ('loops_unary_fp', 'fd'),
+              ('loops_autovec', ints),
+          ]),
           TD(O, f='Py_reciprocal'),
           ),
 # This is no longer used as numpy.ones_like, however it is
@@ -411,8 +487,13 @@ defdict = {
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.absolute'),
           'PyUFunc_AbsoluteTypeResolver',
-          TD(bints+flts+timedeltaonly, dispatch=[('loops_unary_fp', 'fd')]),
-          TD(cmplx, simd=[('avx512f', cmplxvec)], out=('f', 'd', 'g')),
+          TD(bints+flts+timedeltaonly, dispatch=[
+              ('loops_unary_fp', 'fd'),
+              ('loops_logical', '?'),
+              ('loops_autovec', ints + 'e'),
+          ]),
+          TD(cmplx, dispatch=[('loops_unary_complex', 'FD')],
+             out=('f', 'd', 'g')),
           TD(O, f='PyNumber_Absolute'),
           ),
 '_arg':
@@ -425,7 +506,7 @@ defdict = {
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.negative'),
           'PyUFunc_NegativeTypeResolver',
-          TD(ints+flts+timedeltaonly, simd=[('avx2', ints)]),
+          TD(ints+flts+timedeltaonly, dispatch=[('loops_unary', ints+'fdg')]),
           TD(cmplx, f='neg'),
           TD(O, f='PyNumber_Negative'),
           ),
@@ -441,82 +522,113 @@ defdict = {
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.sign'),
           'PyUFunc_SimpleUniformOperationTypeResolver',
-          TD(nobool_or_datetime),
+          TD(nobool_or_datetime, dispatch=[('loops_autovec', ints)]),
           ),
 'greater':
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.greater'),
           'PyUFunc_SimpleBinaryComparisonTypeResolver',
-          TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]),
-          [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
+          TD(bints, out='?'),
+          [TypeDescription('q', FullTypeDescr, 'qQ', '?'),
+           TypeDescription('q', FullTypeDescr, 'Qq', '?')],
+          TD(inexact+times, out='?', dispatch=[('loops_comparison', bints+'fd')]),
           TD('O', out='?'),
+          [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
           ),
 'greater_equal':
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.greater_equal'),
           'PyUFunc_SimpleBinaryComparisonTypeResolver',
-          TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]),
-          [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
+          TD(bints, out='?'),
+          [TypeDescription('q', FullTypeDescr, 'qQ', '?'),
+           TypeDescription('q', FullTypeDescr, 'Qq', '?')],
+          TD(inexact+times, out='?', dispatch=[('loops_comparison', bints+'fd')]),
           TD('O', out='?'),
+          [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
           ),
 'less':
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.less'),
           'PyUFunc_SimpleBinaryComparisonTypeResolver',
-          TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]),
-          [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
+          TD(bints, out='?'),
+          [TypeDescription('q', FullTypeDescr, 'qQ', '?'),
+           TypeDescription('q', FullTypeDescr, 'Qq', '?')],
+          TD(inexact+times, out='?', dispatch=[('loops_comparison', bints+'fd')]),
           TD('O', out='?'),
+          [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
           ),
 'less_equal':
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.less_equal'),
           'PyUFunc_SimpleBinaryComparisonTypeResolver',
-          TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]),
-          [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
+          TD(bints, out='?'),
+          [TypeDescription('q', FullTypeDescr, 'qQ', '?'),
+           TypeDescription('q', FullTypeDescr, 'Qq', '?')],
+          TD(inexact+times, out='?', dispatch=[('loops_comparison', bints+'fd')]),
           TD('O', out='?'),
+          [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
           ),
 'equal':
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.equal'),
           'PyUFunc_SimpleBinaryComparisonTypeResolver',
-          TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]),
-          [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
+          TD(bints, out='?'),
+          [TypeDescription('q', FullTypeDescr, 'qQ', '?'),
+           TypeDescription('q', FullTypeDescr, 'Qq', '?')],
+          TD(inexact+times, out='?', dispatch=[('loops_comparison', bints+'fd')]),
           TD('O', out='?'),
+          [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
           ),
 'not_equal':
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.not_equal'),
           'PyUFunc_SimpleBinaryComparisonTypeResolver',
-          TD(all, out='?', dispatch=[('loops_comparison', bints+'fd')]),
-          [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
+          TD(bints, out='?'),
+          [TypeDescription('q', FullTypeDescr, 'qQ', '?'),
+           TypeDescription('q', FullTypeDescr, 'Qq', '?')],
+          TD(inexact+times, out='?', dispatch=[('loops_comparison', bints+'fd')]),
           TD('O', out='?'),
+          [TypeDescription('O', FullTypeDescr, 'OO', 'O')],
           ),
 'logical_and':
     Ufunc(2, 1, True_,
           docstrings.get('numpy.core.umath.logical_and'),
           'PyUFunc_SimpleBinaryComparisonTypeResolver',
-          TD(nodatetime_or_obj, out='?', simd=[('avx2', ints)]),
+          TD(nodatetime_or_obj, out='?', dispatch=[
+              ('loops_logical', '?'),
+              ('loops_autovec', ints),
+          ]),
           TD(O, f='npy_ObjectLogicalAnd'),
           ),
 'logical_not':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.logical_not'),
           None,
-          TD(nodatetime_or_obj, out='?', simd=[('avx2', ints)]),
+          TD(nodatetime_or_obj, out='?', dispatch=[
+              ('loops_logical', '?'),
+              ('loops_autovec', ints),
+          ]),
           TD(O, f='npy_ObjectLogicalNot'),
           ),
 'logical_or':
     Ufunc(2, 1, False_,
           docstrings.get('numpy.core.umath.logical_or'),
           'PyUFunc_SimpleBinaryComparisonTypeResolver',
-          TD(nodatetime_or_obj, out='?', simd=[('avx2', ints)]),
+          TD(nodatetime_or_obj, out='?', dispatch=[
+              ('loops_logical', '?'),
+              ('loops_autovec', ints),
+          ]),
           TD(O, f='npy_ObjectLogicalOr'),
           ),
 'logical_xor':
     Ufunc(2, 1, False_,
           docstrings.get('numpy.core.umath.logical_xor'),
           'PyUFunc_SimpleBinaryComparisonTypeResolver',
-          TD(nodatetime_or_obj, out='?'),
+          TD('?', out='?', cfunc_alias='not_equal',
+                           dispatch=[('loops_comparison', '?')]),
+          TD(no_bool_times_obj, out='?', dispatch=[
+              ('loops_autovec', ints),
+          ]),
           # TODO: using obj.logical_xor() seems pretty much useless:
           TD(P, f='logical_xor'),
           ),
@@ -524,15 +636,20 @@ defdict = {
     Ufunc(2, 1, ReorderableNone,
           docstrings.get('numpy.core.umath.maximum'),
           'PyUFunc_SimpleUniformOperationTypeResolver',
-          TD(noobj, dispatch=[('loops_minmax', ints+'fdg')]),
-          TD(O, f='npy_ObjectMax')
+          TD('?', cfunc_alias='logical_or', dispatch=[('loops_logical', '?')]),
+          TD(no_obj_bool, dispatch=[('loops_minmax', ints+'fdg')]),
+          TD(O, f='npy_ObjectMax'),
+          indexed=flts + ints,
           ),
 'minimum':
     Ufunc(2, 1, ReorderableNone,
           docstrings.get('numpy.core.umath.minimum'),
           'PyUFunc_SimpleUniformOperationTypeResolver',
-          TD(noobj, dispatch=[('loops_minmax', ints+'fdg')]),
-          TD(O, f='npy_ObjectMin')
+          TD('?', cfunc_alias='logical_and',
+                  dispatch=[('loops_logical', '?')]),
+          TD(no_obj_bool, dispatch=[('loops_minmax', ints+'fdg')]),
+          TD(O, f='npy_ObjectMin'),
+          indexed=flts + ints,
           ),
 'clip':
     Ufunc(3, 1, ReorderableNone,
@@ -545,15 +662,20 @@ defdict = {
     Ufunc(2, 1, ReorderableNone,
           docstrings.get('numpy.core.umath.fmax'),
           'PyUFunc_SimpleUniformOperationTypeResolver',
-          TD(noobj, dispatch=[('loops_minmax', 'fdg')]),
-          TD(O, f='npy_ObjectMax')
+          TD('?', cfunc_alias='logical_or', dispatch=[('loops_logical', '?')]),
+          TD(no_obj_bool, dispatch=[('loops_minmax', 'fdg')]),
+          TD(O, f='npy_ObjectMax'),
+          indexed=flts + ints,
           ),
 'fmin':
     Ufunc(2, 1, ReorderableNone,
           docstrings.get('numpy.core.umath.fmin'),
           'PyUFunc_SimpleUniformOperationTypeResolver',
-          TD(noobj, dispatch=[('loops_minmax', 'fdg')]),
-          TD(O, f='npy_ObjectMin')
+          TD('?', cfunc_alias='logical_and',
+                  dispatch=[('loops_logical', '?')]),
+          TD(no_obj_bool, dispatch=[('loops_minmax', 'fdg')]),
+          TD(O, f='npy_ObjectMin'),
+          indexed=flts + ints,
           ),
 'logaddexp':
     Ufunc(2, 1, MinusInfinity,
@@ -571,42 +693,49 @@ defdict = {
     Ufunc(2, 1, AllOnes,
           docstrings.get('numpy.core.umath.bitwise_and'),
           None,
-          TD(bints, simd=[('avx2', ints)]),
+          TD('?', cfunc_alias='logical_and',
+                  dispatch=[('loops_logical', '?')]),
+          TD(ints, dispatch=[('loops_autovec', ints)]),
           TD(O, f='PyNumber_And'),
           ),
 'bitwise_or':
     Ufunc(2, 1, Zero,
           docstrings.get('numpy.core.umath.bitwise_or'),
           None,
-          TD(bints, simd=[('avx2', ints)]),
+          TD('?', cfunc_alias='logical_or', dispatch=[('loops_logical', '?')]),
+          TD(ints, dispatch=[('loops_autovec', ints)]),
           TD(O, f='PyNumber_Or'),
           ),
 'bitwise_xor':
     Ufunc(2, 1, Zero,
           docstrings.get('numpy.core.umath.bitwise_xor'),
           None,
-          TD(bints, simd=[('avx2', ints)]),
+          TD('?', cfunc_alias='not_equal',
+                  dispatch=[('loops_comparison', '?')]),
+          TD(ints, dispatch=[('loops_autovec', ints)]),
           TD(O, f='PyNumber_Xor'),
           ),
 'invert':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.invert'),
           None,
-          TD(bints, simd=[('avx2', ints)]),
+          TD('?', cfunc_alias='logical_not',
+                  dispatch=[('loops_logical', '?')]),
+          TD(ints, dispatch=[('loops_autovec', ints)]),
           TD(O, f='PyNumber_Invert'),
           ),
 'left_shift':
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.left_shift'),
           None,
-          TD(ints, simd=[('avx2', ints)]),
+          TD(ints, dispatch=[('loops_autovec', ints)]),
           TD(O, f='PyNumber_Lshift'),
           ),
 'right_shift':
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.right_shift'),
           None,
-          TD(ints, simd=[('avx2', ints)]),
+          TD(ints, dispatch=[('loops_autovec', ints)]),
           TD(O, f='PyNumber_Rshift'),
           ),
 'heaviside':
@@ -691,18 +820,20 @@ defdict = {
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.cos'),
           None,
+          TD('e', dispatch=[('loops_umath_fp', 'e')]),
           TD('f', dispatch=[('loops_trigonometric', 'f')]),
-          TD('ed', dispatch=[('loops_umath_fp', 'ed')]),
-          TD('fdg' + cmplx, f='cos'),
+          TD('d', dispatch=[('loops_trigonometric', 'd')]),
+          TD('g' + cmplx, f='cos'),
           TD(P, f='cos'),
           ),
 'sin':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.sin'),
           None,
+          TD('e', dispatch=[('loops_umath_fp', 'e')]),
           TD('f', dispatch=[('loops_trigonometric', 'f')]),
-          TD('ed', dispatch=[('loops_umath_fp', 'ed')]),
-          TD('fdg' + cmplx, f='sin'),
+          TD('d', dispatch=[('loops_trigonometric', 'd')]),
+          TD('g' + cmplx, f='sin'),
           TD(P, f='sin'),
           ),
 'tan':
@@ -860,8 +991,9 @@ defdict = {
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.arctan2'),
           None,
+          TD('e', f='atan2', astype={'e': 'f'}),
           TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
-          TD(flts, f='atan2', astype={'e': 'f'}),
+          TD('g', f='atan2', astype={'e': 'f'}),
           TD(P, f='arctan2'),
           ),
 'remainder':
@@ -893,7 +1025,10 @@ defdict = {
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.isnan'),
           'PyUFunc_IsFiniteTypeResolver',
-          TD(noobj, simd=[('avx512_skx', 'fd')], out='?'),
+          TD(noobj, out='?', dispatch=[
+              ('loops_unary_fp_le', inexactvec),
+              ('loops_autovec', bints),
+          ]),
           ),
 'isnat':
     Ufunc(1, 1, None,
@@ -905,19 +1040,25 @@ defdict = {
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.isinf'),
           'PyUFunc_IsFiniteTypeResolver',
-          TD(noobj, simd=[('avx512_skx', 'fd')], out='?'),
+          TD(noobj, out='?', dispatch=[
+              ('loops_unary_fp_le', inexactvec),
+              ('loops_autovec', bints + 'mM'),
+          ]),
           ),
 'isfinite':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.isfinite'),
           'PyUFunc_IsFiniteTypeResolver',
-          TD(noobj, simd=[('avx512_skx', 'fd')], out='?'),
+          TD(noobj, out='?', dispatch=[
+              ('loops_unary_fp_le', inexactvec),
+              ('loops_autovec', bints),
+          ]),
           ),
 'signbit':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.signbit'),
           None,
-          TD(flts, simd=[('avx512_skx', 'fd')], out='?'),
+          TD(flts, out='?', dispatch=[('loops_unary_fp_le', inexactvec)]),
           ),
 'copysign':
     Ufunc(2, 1, None,
@@ -1054,7 +1195,10 @@ def make_arrays(funcdict):
             if t.func_data is FullTypeDescr:
                 tname = english_upper(chartoname[t.type])
                 datalist.append('(void *)NULL')
-                cfunc_fname = f"{tname}_{t.in_}_{t.out}_{cfunc_alias}"
+                if t.out == "?":
+                    cfunc_fname = f"{tname}_{t.in_}_bool_{cfunc_alias}"
+                else:
+                    cfunc_fname = f"{tname}_{t.in_}_{t.out}_{cfunc_alias}"
             elif isinstance(t.func_data, FuncNameSuffix):
                 datalist.append('(void *)NULL')
                 tname = english_upper(chartoname[t.type])
@@ -1063,18 +1207,6 @@ def make_arrays(funcdict):
                 datalist.append('(void *)NULL')
                 tname = english_upper(chartoname[t.type])
                 cfunc_fname = f"{tname}_{cfunc_alias}"
-                if t.simd is not None:
-                    for vt in t.simd:
-                        code2list.append(textwrap.dedent("""\
-                        #ifdef HAVE_ATTRIBUTE_TARGET_{ISA}
-                        if (NPY_CPU_HAVE({ISA})) {{
-                            {fname}_functions[{idx}] = {cname}_{isa};
-                        }}
-                        #endif
-                        """).format(
-                            ISA=vt.upper(), isa=vt,
-                            fname=name, cname=cfunc_fname, idx=k
-                        ))
             else:
                 try:
                     thedict = arity_lookup[uf.nin, uf.nout]
@@ -1186,6 +1318,40 @@ def make_ufuncs(funcdict):
         if uf.typereso is not None:
             mlist.append(
                 r"((PyUFuncObject *)f)->type_resolver = &%s;" % uf.typereso)
+        for c in uf.indexed:
+            # Handle indexed loops by getting the underlying ArrayMethodObject
+            # from the list in f._loops and setting its field appropriately
+            fmt = textwrap.dedent("""
+            {{
+                PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum({typenum});
+                PyObject *info = get_info_no_cast((PyUFuncObject *)f,
+                                                   dtype, {count});
+                if (info == NULL) {{
+                    return -1;
+                }}
+                if (info == Py_None) {{
+                    PyErr_SetString(PyExc_RuntimeError,
+                        "cannot add indexed loop to ufunc "
+                        "{name} with {typenum}");
+                    return -1;
+                }}
+                if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {{
+                    PyErr_SetString(PyExc_RuntimeError,
+                        "Not a PyArrayMethodObject in ufunc "
+                        "{name} with {typenum}");
+                }}
+                ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                                 {funcname};
+                /* info is borrowed, no need to decref*/
+            }}
+            """)
+            mlist.append(fmt.format(
+                typenum=f"NPY_{english_upper(chartoname[c])}",
+                count=uf.nin+uf.nout,
+                name=name,
+                funcname = f"{english_upper(chartoname[c])}_{name}_indexed",
+            ))
+
         mlist.append(r"""PyDict_SetItemString(dictionary, "%s", f);""" % name)
         mlist.append(r"""Py_DECREF(f);""")
         code3list.append('\n'.join(mlist))
@@ -1208,8 +1374,14 @@ def make_code(funcdict, filename):
     #include "loops.h"
     #include "matmul.h"
     #include "clip.h"
+    #include "dtypemeta.h"
     #include "_umath_doc_generated.h"
+
     %s
+    /* Returns a borrowed ref of the second value in the matching info tuple */
+    PyObject *
+    get_info_no_cast(PyUFuncObject *ufunc, PyArray_DTypeMeta *op_dtype,
+                     int ndtypes);
 
     static int
     InitOperators(PyObject *dictionary) {
@@ -1224,8 +1396,29 @@ def make_code(funcdict, filename):
     return code
 
 
-if __name__ == "__main__":
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-o",
+        "--outfile",
+        type=str,
+        help="Path to the output directory"
+    )
+    args = parser.parse_args()
+
+    # used to insert the name of this file into the generated file
     filename = __file__
     code = make_code(defdict, filename)
-    with open('__umath_generated.c', 'w') as fid:
-        fid.write(code)
+
+    if not args.outfile:
+        # This is the distutils-based build
+        outfile = '__umath_generated.c'
+    else:
+        outfile = os.path.join(os.getcwd(), args.outfile)
+
+    with open(outfile, 'w') as f:
+        f.write(code)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/numpy/core/code_generators/generate_umath_doc.py b/numpy/core/code_generators/generate_umath_doc.py
index 9888730fd..fc0c2a138 100644
--- a/numpy/core/code_generators/generate_umath_doc.py
+++ b/numpy/core/code_generators/generate_umath_doc.py
@@ -1,6 +1,7 @@
 import sys
 import os
 import textwrap
+import argparse
 
 sys.path.insert(0, os.path.dirname(__file__))
 import ufunc_docstrings as docstrings
@@ -28,3 +29,21 @@ def write_code(target):
             cdef_str = normalize_doc(string)
             fid.write(f"#define {cdef_name} \"{cdef_str}\"\n")
         fid.write("#endif //NUMPY_CORE_INCLUDE__UMATH_DOC_GENERATED_H\n")
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-o",
+        "--outfile",
+        type=str,
+        help="Path to the output directory"
+    )
+    args = parser.parse_args()
+
+    outfile = os.path.join(os.getcwd(), args.outfile)
+    write_code(outfile)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/numpy/core/code_generators/numpy_api.py b/numpy/core/code_generators/numpy_api.py
index fa28f92b7..bb3b15806 100644
--- a/numpy/core/code_generators/numpy_api.py
+++ b/numpy/core/code_generators/numpy_api.py
@@ -13,7 +13,23 @@ When adding a function, make sure to use the next integer not used as an index
 exception, so it should hopefully not get unnoticed).
 
 """
-from code_generators.genapi import StealRef
+
+import os
+import importlib.util
+
+
+def get_annotations():
+    # Convoluted because we can't import from numpy.distutils
+    # (numpy is not yet built)
+    genapi_py = os.path.join(os.path.dirname(__file__), 'genapi.py')
+    spec = importlib.util.spec_from_file_location('conv_template', genapi_py)
+    mod = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(mod)
+    return mod.StealRef, mod.MinVersion
+
+
+StealRef, MinVersion = get_annotations()
+#from code_generators.genapi import StealRef
 
 # index, type
 multiarray_global_vars = {
@@ -351,8 +367,8 @@ multiarray_funcs_api = {
     'PyArray_ResolveWritebackIfCopy':       (302,),
     'PyArray_SetWritebackIfCopyBase':       (303,),
     # End 1.14 API
-    'PyDataMem_SetHandler':                 (304,),
-    'PyDataMem_GetHandler':                 (305,),
+    'PyDataMem_SetHandler':                 (304, MinVersion("1.22")),
+    'PyDataMem_GetHandler':                 (305, MinVersion("1.22")),
     # End 1.22 API
 }
 
@@ -406,7 +422,7 @@ ufunc_funcs_api = {
     # End 1.7 API
     'PyUFunc_RegisterLoopForDescr':             (41,),
     # End 1.8 API
-    'PyUFunc_FromFuncAndDataAndSignatureAndIdentity': (42,),
+    'PyUFunc_FromFuncAndDataAndSignatureAndIdentity': (42, MinVersion("1.16")),
     # End 1.16 API
 }
 
diff --git a/numpy/core/code_generators/ufunc_docstrings.py b/numpy/core/code_generators/ufunc_docstrings.py
index efc43b01c..1695b4d27 100644
--- a/numpy/core/code_generators/ufunc_docstrings.py
+++ b/numpy/core/code_generators/ufunc_docstrings.py
@@ -667,7 +667,7 @@ add_newdoc('numpy.core.umath', 'bitwise_or',
     --------
     The number 13 has the binary representation ``00001101``. Likewise,
     16 is represented by ``00010000``.  The bit-wise OR of 13 and 16 is
-    then ``000111011``, or 29:
+    then ``00011101``, or 29:
 
     >>> np.bitwise_or(13, 16)
     29
@@ -2021,7 +2021,7 @@ add_newdoc('numpy.core.umath', 'log',
     has a branch cut `[-inf, 0]` and is continuous from above on it. `log`
     handles the floating-point negative zero as an infinitesimal negative
     number, conforming to the C99 standard.
-    
+
     In the cases where the input has a negative real part and a very small
     negative complex part (approaching 0), the result is so close to `-pi`
     that it evaluates to exactly `-pi`.
@@ -2457,7 +2457,7 @@ add_newdoc('numpy.core.umath', 'maximum',
     """
     Element-wise maximum of array elements.
 
-    Compare two arrays and returns a new array containing the element-wise
+    Compare two arrays and return a new array containing the element-wise
     maxima. If one of the elements being compared is a NaN, then that
     element is returned. If both elements are NaNs then the first is
     returned. The latter distinction is important for complex NaNs, which
@@ -2516,7 +2516,7 @@ add_newdoc('numpy.core.umath', 'minimum',
     """
     Element-wise minimum of array elements.
 
-    Compare two arrays and returns a new array containing the element-wise
+    Compare two arrays and return a new array containing the element-wise
     minima. If one of the elements being compared is a NaN, then that
     element is returned. If both elements are NaNs then the first is
     returned. The latter distinction is important for complex NaNs, which
@@ -2575,7 +2575,7 @@ add_newdoc('numpy.core.umath', 'fmax',
     """
     Element-wise maximum of array elements.
 
-    Compare two arrays and returns a new array containing the element-wise
+    Compare two arrays and return a new array containing the element-wise
     maxima. If one of the elements being compared is a NaN, then the
     non-nan element is returned. If both elements are NaNs then the first
     is returned.  The latter distinction is important for complex NaNs,
@@ -2633,7 +2633,7 @@ add_newdoc('numpy.core.umath', 'fmin',
     """
     Element-wise minimum of array elements.
 
-    Compare two arrays and returns a new array containing the element-wise
+    Compare two arrays and return a new array containing the element-wise
     minima. If one of the elements being compared is a NaN, then the
     non-nan element is returned. If both elements are NaNs then the first
     is returned.  The latter distinction is important for complex NaNs,
@@ -3833,7 +3833,7 @@ add_newdoc('numpy.core.umath', 'sqrt',
     --------
     emath.sqrt
         A version which returns complex numbers when given negative reals.
-        Note: 0.0 and -0.0 are handled differently for complex inputs.
+        Note that 0.0 and -0.0 are handled differently for complex inputs.
 
     Notes
     -----
diff --git a/numpy/core/code_generators/verify_c_api_version.py b/numpy/core/code_generators/verify_c_api_version.py
new file mode 100644
index 000000000..282e5e7d3
--- /dev/null
+++ b/numpy/core/code_generators/verify_c_api_version.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env python3
+import os
+import sys
+import argparse
+
+
+class MismatchCAPIError(ValueError):
+    pass
+
+
+def get_api_versions(apiversion):
+    """
+    Return current C API checksum and the recorded checksum.
+
+    Return current C API checksum and the recorded checksum for the given
+    version of the C API version.
+
+    """
+    # Compute the hash of the current API as defined in the .txt files in
+    # code_generators
+    sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)))
+    try:
+        m = __import__('genapi')
+        numpy_api = __import__('numpy_api')
+        curapi_hash = m.fullapi_hash(numpy_api.full_api)
+        apis_hash = m.get_versions_hash()
+    finally:
+        del sys.path[0]
+
+    return curapi_hash, apis_hash[apiversion]
+
+
+def check_api_version(apiversion):
+    """Emits a MismatchCAPIWarning if the C API version needs updating."""
+    curapi_hash, api_hash = get_api_versions(apiversion)
+
+    # If different hash, it means that the api .txt files in
+    # codegen_dir have been updated without the API version being
+    # updated. Any modification in those .txt files should be reflected
+    # in the api and eventually abi versions.
+    # To compute the checksum of the current API, use numpy/core/cversions.py
+    if not curapi_hash == api_hash:
+        msg = ("API mismatch detected, the C API version "
+               "numbers have to be updated. Current C api version is "
+               f"{apiversion}, with checksum {curapi_hash}, but recorded "
+               f"checksum in core/codegen_dir/cversions.txt is {api_hash}. If "
+               "functions were added in the C API, you have to update "
+               f"C_API_VERSION in {__file__}."
+               )
+        raise MismatchCAPIError(msg)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--api-version",
+        type=str,
+        help="C API version to verify (as a hex string)"
+    )
+    args = parser.parse_args()
+
+    check_api_version(int(args.api_version, base=16))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/numpy/core/config.h.in b/numpy/core/config.h.in
new file mode 100644
index 000000000..e3b559753
--- /dev/null
+++ b/numpy/core/config.h.in
@@ -0,0 +1,116 @@
+#mesondefine SIZEOF_PY_INTPTR_T
+#mesondefine SIZEOF_OFF_T
+#mesondefine SIZEOF_PY_LONG_LONG
+
+#mesondefine HAVE_BACKTRACE
+#mesondefine HAVE_MADVISE
+#mesondefine HAVE_FTELLO
+#mesondefine HAVE_FSEEKO
+#mesondefine HAVE_FALLOCATE
+#mesondefine HAVE_STRTOLD_L
+#mesondefine HAVE__THREAD
+#mesondefine HAVE___DECLSPEC_THREAD_
+
+/* Optional headers */
+#mesondefine HAVE_XLOCALE_H
+#mesondefine HAVE_DLFCN_H
+#mesondefine HAVE_EXECINFO_H
+#mesondefine HAVE_LIBUNWIND_H
+#mesondefine HAVE_SYS_MMAN_H
+#mesondefine HAVE_XMMINTRIN_H
+#mesondefine HAVE_EMMINTRIN_H
+#mesondefine HAVE_IMMINTRIN_H
+
+/* Optional intrinsics */
+#mesondefine HAVE___BUILTIN_ISNAN
+#mesondefine HAVE___BUILTIN_ISINF
+#mesondefine HAVE___BUILTIN_ISFINITE
+#mesondefine HAVE___BUILTIN_BSWAP32
+#mesondefine HAVE___BUILTIN_BSWAP64
+#mesondefine HAVE___BUILTIN_EXPECT
+#mesondefine HAVE___BUILTIN_MUL_OVERFLOW
+#mesondefine HAVE___BUILTIN_PREFETCH
+
+#mesondefine HAVE_ATTRIBUTE_OPTIMIZE_UNROLL_LOOPS
+#mesondefine HAVE_ATTRIBUTE_OPTIMIZE_OPT_3
+#mesondefine HAVE_ATTRIBUTE_OPTIMIZE_OPT_2
+#mesondefine HAVE_ATTRIBUTE_NONNULL
+
+/* C99 complex support and complex.h are not universal */
+#mesondefine HAVE_COMPLEX_H
+#mesondefine HAVE_CABS
+#mesondefine HAVE_CACOS
+#mesondefine HAVE_CACOSH
+#mesondefine HAVE_CARG
+#mesondefine HAVE_CASIN
+#mesondefine HAVE_CASINH
+#mesondefine HAVE_CATAN
+#mesondefine HAVE_CATANH
+#mesondefine HAVE_CEXP
+#mesondefine HAVE_CLOG
+#mesondefine HAVE_CPOW
+#mesondefine HAVE_CSQRT
+#mesondefine HAVE_CABSF
+#mesondefine HAVE_CACOSF
+#mesondefine HAVE_CACOSHF
+#mesondefine HAVE_CARGF
+#mesondefine HAVE_CASINF
+#mesondefine HAVE_CASINHF
+#mesondefine HAVE_CATANF
+#mesondefine HAVE_CATANHF
+#mesondefine HAVE_CEXPF
+#mesondefine HAVE_CLOGF
+#mesondefine HAVE_CPOWF
+#mesondefine HAVE_CSQRTF
+#mesondefine HAVE_CABSL
+#mesondefine HAVE_CACOSL
+#mesondefine HAVE_CACOSHL
+#mesondefine HAVE_CARGL
+#mesondefine HAVE_CASINL
+#mesondefine HAVE_CASINHL
+#mesondefine HAVE_CATANL
+#mesondefine HAVE_CATANHL
+#mesondefine HAVE_CEXPL
+#mesondefine HAVE_CLOGL
+#mesondefine HAVE_CPOWL
+#mesondefine HAVE_CSQRTL
+/* FreeBSD */
+#mesondefine HAVE_CSINF
+#mesondefine HAVE_CSINHF
+#mesondefine HAVE_CCOSF
+#mesondefine HAVE_CCOSHF
+#mesondefine HAVE_CTANF
+#mesondefine HAVE_CTANHF
+#mesondefine HAVE_CSIN
+#mesondefine HAVE_CSINH
+#mesondefine HAVE_CCOS
+#mesondefine HAVE_CCOSH
+#mesondefine HAVE_CTAN
+#mesondefine HAVE_CTANH
+#mesondefine HAVE_CSINL
+#mesondefine HAVE_CSINHL
+#mesondefine HAVE_CCOSL
+#mesondefine HAVE_CCOSHL
+#mesondefine HAVE_CTANL
+#mesondefine HAVE_CTANHL
+
+#mesondefine NPY_CAN_LINK_SVML
+#mesondefine NPY_RELAXED_STRIDES_DEBUG
+
+#mesondefine HAVE_LDOUBLE_INTEL_EXTENDED_16_BYTES_LE
+#mesondefine HAVE_LDOUBLE_INTEL_EXTENDED_12_BYTES_LE
+#mesondefine HAVE_LDOUBLE_MOTOROLA_EXTENDED_12_BYTES_BE
+#mesondefine HAVE_LDOUBLE_IEEE_DOUBLE_LE
+#mesondefine HAVE_LDOUBLE_IEEE_DOUBLE_BE
+#mesondefine HAVE_LDOUBLE_IEEE_QUAD_LE
+#mesondefine HAVE_LDOUBLE_IEEE_QUAD_BE
+#mesondefine HAVE_LDOUBLE_IBM_DOUBLE_DOUBLE_LE
+#mesondefine HAVE_LDOUBLE_IBM_DOUBLE_DOUBLE_BE
+
+#ifndef __cplusplus
+/* #undef inline */
+#endif
+
+#ifndef NUMPY_CORE_SRC_COMMON_NPY_CONFIG_H_
+#error config.h should never be included directly, include npy_config.h instead
+#endif
diff --git a/numpy/core/defchararray.py b/numpy/core/defchararray.py
index 6750e497a..11c5a30bf 100644
--- a/numpy/core/defchararray.py
+++ b/numpy/core/defchararray.py
@@ -6,7 +6,7 @@ operations and methods.
    The `chararray` class exists for backwards compatibility with
    Numarray, it is not recommended for new development. Starting from numpy
    1.4, if one needs arrays of strings, it is recommended to use arrays of
-   `dtype` `object_`, `string_` or `unicode_`, and use the free functions
+   `dtype` `object_`, `bytes_` or `str_`, and use the free functions
    in the `numpy.char` module for fast vectorized string operations.
 
 Some methods will only be available if the corresponding string method is
@@ -16,12 +16,13 @@ The preferred alias for `defchararray` is `numpy.char`.
 
 """
 import functools
+
+from .._utils import set_module
 from .numerictypes import (
-    string_, unicode_, integer, int_, object_, bool_, character)
+    bytes_, str_, integer, int_, object_, bool_, character)
 from .numeric import ndarray, compare_chararrays
 from .numeric import array as narray
 from numpy.core.multiarray import _vec_string
-from numpy.core.overrides import set_module
 from numpy.core import overrides
 from numpy.compat import asbytes
 import numpy
@@ -45,26 +46,29 @@ array_function_dispatch = functools.partial(
     overrides.array_function_dispatch, module='numpy.char')
 
 
-def _use_unicode(*args):
-    """
-    Helper function for determining the output type of some string
-    operations.
+def _is_unicode(arr):
+    """Returns True if arr is a string or a string array with a dtype that
+    represents a unicode string, otherwise returns False.
 
-    For an operation on two ndarrays, if at least one is unicode, the
-    result should be unicode.
     """
-    for x in args:
-        if (isinstance(x, str) or
-                issubclass(numpy.asarray(x).dtype.type, unicode_)):
-            return unicode_
-    return string_
+    if (isinstance(arr, str) or
+            issubclass(numpy.asarray(arr).dtype.type, str)):
+        return True
+    return False
+
 
-def _to_string_or_unicode_array(result):
+def _to_bytes_or_str_array(result, output_dtype_like=None):
     """
-    Helper function to cast a result back into a string or unicode array
-    if an object array must be used as an intermediary.
+    Helper function to cast a result back into an array
+    with the appropriate dtype if an object array must be used
+    as an intermediary.
     """
-    return numpy.asarray(result.tolist())
+    ret = numpy.asarray(result.tolist())
+    dtype = getattr(output_dtype_like, 'dtype', None)
+    if dtype is not None:
+        return ret.astype(type(dtype)(_get_num_chars(ret)), copy=False)
+    return ret
+
 
 def _clean_args(*args):
     """
@@ -88,7 +92,7 @@ def _get_num_chars(a):
     a string or unicode array.  This is to abstract out the fact that
     for a unicode array this is itemsize / 4.
     """
-    if issubclass(a.dtype.type, unicode_):
+    if issubclass(a.dtype.type, str_):
         return a.itemsize // 4
     return a.itemsize
 
@@ -274,7 +278,7 @@ def str_len(a):
 
     See Also
     --------
-    builtins.len
+    len
 
     Examples
     --------
@@ -311,16 +315,26 @@ def add(x1, x2):
     Returns
     -------
     add : ndarray
-        Output array of `string_` or `unicode_`, depending on input types
+        Output array of `bytes_` or `str_`, depending on input types
         of the same shape as `x1` and `x2`.
 
     """
     arr1 = numpy.asarray(x1)
     arr2 = numpy.asarray(x2)
     out_size = _get_num_chars(arr1) + _get_num_chars(arr2)
-    dtype = _use_unicode(arr1, arr2)
-    return _vec_string(arr1, (dtype, out_size), '__add__', (arr2,))
 
+    if type(arr1.dtype) != type(arr2.dtype):
+        # Enforce this for now.  The solution to it will be implement add
+        # as a ufunc.  It never worked right on Python 3: bytes + unicode gave
+        # nonsense unicode + bytes errored, and unicode + object used the
+        # object dtype itemsize as num chars (worked on short strings).
+        # bytes + void worked but promoting void->bytes is dubious also.
+        raise TypeError(
+            "np.char.add() requires both arrays of the same dtype kind, but "
+            f"got dtypes: '{arr1.dtype}' and '{arr2.dtype}' (the few cases "
+            "where this used to work often lead to incorrect results).")
+
+    return _vec_string(arr1, type(arr1.dtype)(out_size), '__add__', (arr2,))
 
 def _multiply_dispatcher(a, i):
     return (a,)
@@ -370,7 +384,7 @@ def multiply(a, i):
         raise ValueError("Can only multiply by integers")
     out_size = _get_num_chars(a_arr) * max(int(i_arr.max()), 0)
     return _vec_string(
-        a_arr, (a_arr.dtype.type, out_size), '__mul__', (i_arr,))
+        a_arr, type(a_arr.dtype)(out_size), '__mul__', (i_arr,))
 
 
 def _mod_dispatcher(a, values):
@@ -401,8 +415,8 @@ def mod(a, values):
     str.__mod__
 
     """
-    return _to_string_or_unicode_array(
-        _vec_string(a, object_, '__mod__', (values,)))
+    return _to_bytes_or_str_array(
+        _vec_string(a, object_, '__mod__', (values,)), a)
 
 
 @array_function_dispatch(_unary_op_dispatcher)
@@ -495,10 +509,10 @@ def center(a, width, fillchar=' '):
     a_arr = numpy.asarray(a)
     width_arr = numpy.asarray(width)
     size = int(numpy.max(width_arr.flat))
-    if numpy.issubdtype(a_arr.dtype, numpy.string_):
+    if numpy.issubdtype(a_arr.dtype, numpy.bytes_):
         fillchar = asbytes(fillchar)
     return _vec_string(
-        a_arr, (a_arr.dtype.type, size), 'center', (width_arr, fillchar))
+        a_arr, type(a_arr.dtype)(size), 'center', (width_arr, fillchar))
 
 
 def _count_dispatcher(a, sub, start=None, end=None):
@@ -597,7 +611,7 @@ def decode(a, encoding=None, errors=None):
     array(['aAaAaA', '  aA  ', 'abBABba'], dtype='<U7')
 
     """
-    return _to_string_or_unicode_array(
+    return _to_bytes_or_str_array(
         _vec_string(a, object_, 'decode', _clean_args(encoding, errors)))
 
 
@@ -633,7 +647,7 @@ def encode(a, encoding=None, errors=None):
     The type of the result will depend on the encoding specified.
 
     """
-    return _to_string_or_unicode_array(
+    return _to_bytes_or_str_array(
         _vec_string(a, object_, 'encode', _clean_args(encoding, errors)))
 
 
@@ -721,8 +735,8 @@ def expandtabs(a, tabsize=8):
     str.expandtabs
 
     """
-    return _to_string_or_unicode_array(
-        _vec_string(a, object_, 'expandtabs', (tabsize,)))
+    return _to_bytes_or_str_array(
+        _vec_string(a, object_, 'expandtabs', (tabsize,)), a)
 
 
 @array_function_dispatch(_count_dispatcher)
@@ -1041,8 +1055,8 @@ def join(sep, seq):
     array(['g-h-c', 'o.s.d'], dtype='<U5')
 
     """
-    return _to_string_or_unicode_array(
-        _vec_string(sep, object_, 'join', (seq,)))
+    return _to_bytes_or_str_array(
+        _vec_string(sep, object_, 'join', (seq,)), seq)
 
 
 
@@ -1080,10 +1094,10 @@ def ljust(a, width, fillchar=' '):
     a_arr = numpy.asarray(a)
     width_arr = numpy.asarray(width)
     size = int(numpy.max(width_arr.flat))
-    if numpy.issubdtype(a_arr.dtype, numpy.string_):
+    if numpy.issubdtype(a_arr.dtype, numpy.bytes_):
         fillchar = asbytes(fillchar)
     return _vec_string(
-        a_arr, (a_arr.dtype.type, size), 'ljust', (width_arr, fillchar))
+        a_arr, type(a_arr.dtype)(size), 'ljust', (width_arr, fillchar))
 
 
 @array_function_dispatch(_unary_op_dispatcher)
@@ -1216,8 +1230,8 @@ def partition(a, sep):
     str.partition
 
     """
-    return _to_string_or_unicode_array(
-        _vec_string(a, object_, 'partition', (sep,)))
+    return _to_bytes_or_str_array(
+        _vec_string(a, object_, 'partition', (sep,)), a)
 
 
 def _replace_dispatcher(a, old, new, count=None):
@@ -1261,9 +1275,8 @@ def replace(a, old, new, count=None):
     >>> np.char.replace(a, 'is', 'was')
     array(['The dwash was fresh', 'Thwas was it'], dtype='<U19')
     """
-    return _to_string_or_unicode_array(
-        _vec_string(
-            a, object_, 'replace', [old, new] + _clean_args(count)))
+    return _to_bytes_or_str_array(
+        _vec_string(a, object_, 'replace', [old, new] + _clean_args(count)), a)
 
 
 @array_function_dispatch(_count_dispatcher)
@@ -1359,10 +1372,10 @@ def rjust(a, width, fillchar=' '):
     a_arr = numpy.asarray(a)
     width_arr = numpy.asarray(width)
     size = int(numpy.max(width_arr.flat))
-    if numpy.issubdtype(a_arr.dtype, numpy.string_):
+    if numpy.issubdtype(a_arr.dtype, numpy.bytes_):
         fillchar = asbytes(fillchar)
     return _vec_string(
-        a_arr, (a_arr.dtype.type, size), 'rjust', (width_arr, fillchar))
+        a_arr, type(a_arr.dtype)(size), 'rjust', (width_arr, fillchar))
 
 
 @array_function_dispatch(_partition_dispatcher)
@@ -1397,8 +1410,8 @@ def rpartition(a, sep):
     str.rpartition
 
     """
-    return _to_string_or_unicode_array(
-        _vec_string(a, object_, 'rpartition', (sep,)))
+    return _to_bytes_or_str_array(
+        _vec_string(a, object_, 'rpartition', (sep,)), a)
 
 
 def _split_dispatcher(a, sep=None, maxsplit=None):
@@ -1753,7 +1766,7 @@ def translate(a, table, deletechars=None):
 
     """
     a_arr = numpy.asarray(a)
-    if issubclass(a_arr.dtype.type, unicode_):
+    if issubclass(a_arr.dtype.type, str_):
         return _vec_string(
             a_arr, a_arr.dtype, 'translate', (table,))
     else:
@@ -1828,7 +1841,7 @@ def zfill(a, width):
     width_arr = numpy.asarray(width)
     size = int(numpy.max(width_arr.flat))
     return _vec_string(
-        a_arr, (a_arr.dtype.type, size), 'zfill', (width_arr,))
+        a_arr, type(a_arr.dtype)(size), 'zfill', (width_arr,))
 
 
 @array_function_dispatch(_unary_op_dispatcher)
@@ -1837,7 +1850,7 @@ def isnumeric(a):
     For each element, return True if there are only numeric
     characters in the element.
 
-    Calls `unicode.isnumeric` element-wise.
+    Calls `str.isnumeric` element-wise.
 
     Numeric characters include digit characters, and all characters
     that have the Unicode numeric value property, e.g. ``U+2155,
@@ -1855,7 +1868,7 @@ def isnumeric(a):
 
     See Also
     --------
-    unicode.isnumeric
+    str.isnumeric
 
     Examples
     --------
@@ -1863,7 +1876,7 @@ def isnumeric(a):
     array([ True, False, False, False, False])
 
     """
-    if _use_unicode(a) != unicode_:
+    if not _is_unicode(a):
         raise TypeError("isnumeric is only available for Unicode strings and arrays")
     return _vec_string(a, bool_, 'isnumeric')
 
@@ -1874,7 +1887,7 @@ def isdecimal(a):
     For each element, return True if there are only decimal
     characters in the element.
 
-    Calls `unicode.isdecimal` element-wise.
+    Calls `str.isdecimal` element-wise.
 
     Decimal characters include digit characters, and all characters
     that can be used to form decimal-radix numbers,
@@ -1892,7 +1905,7 @@ def isdecimal(a):
 
     See Also
     --------
-    unicode.isdecimal
+    str.isdecimal
 
     Examples
     --------
@@ -1900,8 +1913,9 @@ def isdecimal(a):
     array([ True, False, False, False])
 
     """ 
-    if _use_unicode(a) != unicode_:
-        raise TypeError("isnumeric is only available for Unicode strings and arrays")
+    if not _is_unicode(a):
+        raise TypeError(
+            "isdecimal is only available for Unicode strings and arrays")
     return _vec_string(a, bool_, 'isdecimal')
 
 
@@ -1917,7 +1931,7 @@ class chararray(ndarray):
        The `chararray` class exists for backwards compatibility with
        Numarray, it is not recommended for new development. Starting from numpy
        1.4, if one needs arrays of strings, it is recommended to use arrays of
-       `dtype` `object_`, `string_` or `unicode_`, and use the free functions
+       `dtype` `object_`, `bytes_` or `str_`, and use the free functions
        in the `numpy.char` module for fast vectorized string operations.
 
     Versus a regular NumPy array of type `str` or `unicode`, this
@@ -2051,9 +2065,9 @@ class chararray(ndarray):
         global _globalvar
 
         if unicode:
-            dtype = unicode_
+            dtype = str_
         else:
-            dtype = string_
+            dtype = bytes_
 
         # force itemsize to be a Python int, since using NumPy integer
         # types results in itemsize.itemsize being used as the size of
@@ -2177,7 +2191,7 @@ class chararray(ndarray):
     def __radd__(self, other):
         """
         Return (other + self), that is string concatenation,
-        element-wise for a pair of array_likes of `string_` or `unicode_`.
+        element-wise for a pair of array_likes of `bytes_` or `str_`.
 
         See Also
         --------
@@ -2210,8 +2224,8 @@ class chararray(ndarray):
     def __mod__(self, i):
         """
         Return (self % i), that is pre-Python 2.6 string formatting
-        (interpolation), element-wise for a pair of array_likes of `string_`
-        or `unicode_`.
+        (interpolation), element-wise for a pair of array_likes of `bytes_`
+        or `str_`.
 
         See Also
         --------
@@ -2721,7 +2735,7 @@ def array(obj, itemsize=None, copy=True, unicode=None, order=None):
     .. note::
        This class is provided for numarray backward-compatibility.
        New code (not concerned with numarray compatibility) should use
-       arrays of type `string_` or `unicode_` and use the free functions
+       arrays of type `bytes_` or `str_` and use the free functions
        in :mod:`numpy.char <numpy.core.defchararray>` for fast
        vectorized string operations instead.
 
@@ -2804,26 +2818,26 @@ def array(obj, itemsize=None, copy=True, unicode=None, order=None):
             # itemsize is in 8-bit chars, so for Unicode, we need
             # to divide by the size of a single Unicode character,
             # which for NumPy is always 4
-            if issubclass(obj.dtype.type, unicode_):
+            if issubclass(obj.dtype.type, str_):
                 itemsize //= 4
 
         if unicode is None:
-            if issubclass(obj.dtype.type, unicode_):
+            if issubclass(obj.dtype.type, str_):
                 unicode = True
             else:
                 unicode = False
 
         if unicode:
-            dtype = unicode_
+            dtype = str_
         else:
-            dtype = string_
+            dtype = bytes_
 
         if order is not None:
             obj = numpy.asarray(obj, order=order)
         if (copy or
                 (itemsize != obj.itemsize) or
-                (not unicode and isinstance(obj, unicode_)) or
-                (unicode and isinstance(obj, string_))):
+                (not unicode and isinstance(obj, str_)) or
+                (unicode and isinstance(obj, bytes_))):
             obj = obj.astype((dtype, int(itemsize)))
         return obj
 
@@ -2836,9 +2850,9 @@ def array(obj, itemsize=None, copy=True, unicode=None, order=None):
             # Fall through to the default case
 
     if unicode:
-        dtype = unicode_
+        dtype = str_
     else:
-        dtype = string_
+        dtype = bytes_
 
     if itemsize is None:
         val = narray(obj, dtype=dtype, order=order, subok=True)
diff --git a/numpy/core/einsumfunc.py b/numpy/core/einsumfunc.py
index d6c5885b8..01966f0fe 100644
--- a/numpy/core/einsumfunc.py
+++ b/numpy/core/einsumfunc.py
@@ -728,7 +728,7 @@ def einsum_path(*operands, optimize='greedy', einsum_call=False):
         * if False no optimization is taken
         * if True defaults to the 'greedy' algorithm
         * 'optimal' An algorithm that combinatorially explores all possible
-          ways of contracting the listed tensors and choosest the least costly
+          ways of contracting the listed tensors and chooses the least costly
           path. Scales exponentially with the number of terms in the
           contraction.
         * 'greedy' An algorithm that chooses the best pair contraction
diff --git a/numpy/core/einsumfunc.pyi b/numpy/core/einsumfunc.pyi
index c811a5783..ad483bb90 100644
--- a/numpy/core/einsumfunc.pyi
+++ b/numpy/core/einsumfunc.pyi
@@ -5,10 +5,6 @@ from numpy import (
     ndarray,
     dtype,
     bool_,
-    unsignedinteger,
-    signedinteger,
-    floating,
-    complexfloating,
     number,
     _OrderKACF,
 )
@@ -18,12 +14,14 @@ from numpy._typing import (
     _ArrayLikeInt_co,
     _ArrayLikeFloat_co,
     _ArrayLikeComplex_co,
+    _ArrayLikeObject_co,
     _DTypeLikeBool,
     _DTypeLikeUInt,
     _DTypeLikeInt,
     _DTypeLikeFloat,
     _DTypeLikeComplex,
     _DTypeLikeComplex_co,
+    _DTypeLikeObject,
 )
 
 _ArrayType = TypeVar(
@@ -132,6 +130,51 @@ def einsum(
     optimize: _OptimizeKind = ...,
 ) -> _ArrayType: ...
 
+@overload
+def einsum(
+    subscripts: str | _ArrayLikeInt_co,
+    /,
+    *operands: _ArrayLikeObject_co,
+    out: None = ...,
+    dtype: None | _DTypeLikeObject = ...,
+    order: _OrderKACF = ...,
+    casting: _CastingSafe = ...,
+    optimize: _OptimizeKind = ...,
+) -> Any: ...
+@overload
+def einsum(
+    subscripts: str | _ArrayLikeInt_co,
+    /,
+    *operands: Any,
+    casting: _CastingUnsafe,
+    dtype: None | _DTypeLikeObject = ...,
+    out: None = ...,
+    order: _OrderKACF = ...,
+    optimize: _OptimizeKind = ...,
+) -> Any: ...
+@overload
+def einsum(
+    subscripts: str | _ArrayLikeInt_co,
+    /,
+    *operands: _ArrayLikeObject_co,
+    out: _ArrayType,
+    dtype: None | _DTypeLikeObject = ...,
+    order: _OrderKACF = ...,
+    casting: _CastingSafe = ...,
+    optimize: _OptimizeKind = ...,
+) -> _ArrayType: ...
+@overload
+def einsum(
+    subscripts: str | _ArrayLikeInt_co,
+    /,
+    *operands: Any,
+    out: _ArrayType,
+    casting: _CastingUnsafe,
+    dtype: None | _DTypeLikeObject = ...,
+    order: _OrderKACF = ...,
+    optimize: _OptimizeKind = ...,
+) -> _ArrayType: ...
+
 # NOTE: `einsum_call` is a hidden kwarg unavailable for public use.
 # It is therefore excluded from the signatures below.
 # NOTE: In practice the list consists of a `str` (first element)
@@ -139,6 +182,6 @@ def einsum(
 def einsum_path(
     subscripts: str | _ArrayLikeInt_co,
     /,
-    *operands: _ArrayLikeComplex_co,
+    *operands: _ArrayLikeComplex_co | _DTypeLikeObject,
     optimize: _OptimizeKind = ...,
 ) -> tuple[list[Any], str]: ...
diff --git a/numpy/core/feature_detection_cmath.h b/numpy/core/feature_detection_cmath.h
index ce9a3d4c9..1dd71c643 100644
--- a/numpy/core/feature_detection_cmath.h
+++ b/numpy/core/feature_detection_cmath.h
@@ -10,6 +10,18 @@
 #define cldouble complex long double
 #endif
 
+// musl libc defines the following macros which breaks the declarations.
+// See https://git.musl-libc.org/cgit/musl/tree/include/complex.h#n108
+#undef crealf
+#undef cimagf
+#undef conjf
+#undef creal
+#undef cimag
+#undef conj
+#undef creall
+#undef cimagl
+#undef cconjl
+
 cfloat csinf(cfloat);
 cfloat ccosf(cfloat);
 cfloat ctanf(cfloat);
diff --git a/numpy/core/fromnumeric.py b/numpy/core/fromnumeric.py
index ca9d55cb7..69cabb33e 100644
--- a/numpy/core/fromnumeric.py
+++ b/numpy/core/fromnumeric.py
@@ -6,6 +6,7 @@ import types
 import warnings
 
 import numpy as np
+from .._utils import set_module
 from . import multiarray as mu
 from . import overrides
 from . import umath as um
@@ -20,8 +21,9 @@ __all__ = [
     'all', 'alltrue', 'amax', 'amin', 'any', 'argmax',
     'argmin', 'argpartition', 'argsort', 'around', 'choose', 'clip',
     'compress', 'cumprod', 'cumproduct', 'cumsum', 'diagonal', 'mean',
+    'max', 'min',
     'ndim', 'nonzero', 'partition', 'prod', 'product', 'ptp', 'put',
-    'ravel', 'repeat', 'reshape', 'resize', 'round_',
+    'ravel', 'repeat', 'reshape', 'resize', 'round', 'round_',
     'searchsorted', 'shape', 'size', 'sometrue', 'sort', 'squeeze',
     'std', 'sum', 'swapaxes', 'take', 'trace', 'transpose', 'var',
 ]
@@ -236,24 +238,9 @@ def reshape(a, newshape, order='C'):
 
     Notes
     -----
-    It is not always possible to change the shape of an array without
-    copying the data. If you want an error to be raised when the data is copied,
-    you should assign the new shape to the shape attribute of the array::
-
-     >>> a = np.zeros((10, 2))
-
-     # A transpose makes the array non-contiguous
-     >>> b = a.T
-
-     # Taking a view makes it possible to modify the shape without modifying
-     # the initial object.
-     >>> c = b.view()
-     >>> c.shape = (20)
-     Traceback (most recent call last):
-        ...
-     AttributeError: Incompatible shape for in-place modification. Use
-     `.reshape()` to make a copy with the desired shape.
-
+    It is not always possible to change the shape of an array without copying
+    the data.
+    
     The `order` keyword gives the index ordering both for *fetching* the values
     from `a`, and then *placing* the values into the output array.
     For example, let's say you have an array:
@@ -436,7 +423,7 @@ def _repeat_dispatcher(a, repeats, axis=None):
 @array_function_dispatch(_repeat_dispatcher)
 def repeat(a, repeats, axis=None):
     """
-    Repeat elements of an array.
+    Repeat each element of an array after themselves
 
     Parameters
     ----------
@@ -1814,9 +1801,10 @@ def ravel(a, order='C'):
     Returns
     -------
     y : array_like
-        y is an array of the same subtype as `a`, with shape ``(a.size,)``.
-        Note that matrices are special cased for backward compatibility, if `a`
-        is a matrix, then y is a 1-D ndarray.
+        y is a contiguous 1-D array of the same subtype as `a`,
+        with shape ``(a.size,)``.
+        Note that matrices are special cased for backward compatibility,
+        if `a` is a matrix, then y is a 1-D ndarray.
 
     See Also
     --------
@@ -1835,7 +1823,8 @@ def ravel(a, order='C'):
     column-major, Fortran-style index ordering.
 
     When a view is desired in as many cases as possible, ``arr.reshape(-1)``
-    may be preferable.
+    may be preferable. However, ``ravel`` supports ``K`` in the optional
+    ``order`` argument while ``reshape`` does not.
 
     Examples
     --------
@@ -2313,7 +2302,7 @@ def sum(a, axis=None, dtype=None, out=None, keepdims=np._NoValue,
         warnings.warn(
             "Calling np.sum(generator) is deprecated, and in the future will give a different result. "
             "Use np.sum(np.fromiter(generator)) or the python sum builtin instead.",
-            DeprecationWarning, stacklevel=3)
+            DeprecationWarning, stacklevel=2)
 
         res = _sum_(a)
         if out is not None:
@@ -2695,13 +2684,14 @@ def ptp(a, axis=None, out=None, keepdims=np._NoValue):
     return _methods._ptp(a, axis=axis, out=out, **kwargs)
 
 
-def _amax_dispatcher(a, axis=None, out=None, keepdims=None, initial=None,
-                     where=None):
+def _max_dispatcher(a, axis=None, out=None, keepdims=None, initial=None,
+                    where=None):
     return (a, out)
 
 
-@array_function_dispatch(_amax_dispatcher)
-def amax(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
+@array_function_dispatch(_max_dispatcher)
+@set_module('numpy')
+def max(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
          where=np._NoValue):
     """
     Return the maximum of an array or maximum along an axis.
@@ -2729,7 +2719,7 @@ def amax(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
         the result will broadcast correctly against the input array.
 
         If the default value is passed, then `keepdims` will not be
-        passed through to the `amax` method of sub-classes of
+        passed through to the ``max`` method of sub-classes of
         `ndarray`, however any non-default value will be.  If the
         sub-class' method does not implement `keepdims` any
         exceptions will be raised.
@@ -2748,7 +2738,7 @@ def amax(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
 
     Returns
     -------
-    amax : ndarray or scalar
+    max : ndarray or scalar
         Maximum of `a`. If `axis` is None, the result is a scalar value.
         If `axis` is an int, the result is an array of dimension
         ``a.ndim - 1``. If `axis` is a tuple, the result is an array of 
@@ -2775,9 +2765,9 @@ def amax(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
     corresponding max value will be NaN as well. To ignore NaN values
     (MATLAB behavior), please use nanmax.
 
-    Don't use `amax` for element-wise comparison of 2 arrays; when
+    Don't use `~numpy.max` for element-wise comparison of 2 arrays; when
     ``a.shape[0]`` is 2, ``maximum(a[0], a[1])`` is faster than
-    ``amax(a, axis=0)``.
+    ``max(a, axis=0)``.
 
     Examples
     --------
@@ -2785,19 +2775,19 @@ def amax(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
     >>> a
     array([[0, 1],
            [2, 3]])
-    >>> np.amax(a)           # Maximum of the flattened array
+    >>> np.max(a)           # Maximum of the flattened array
     3
-    >>> np.amax(a, axis=0)   # Maxima along the first axis
+    >>> np.max(a, axis=0)   # Maxima along the first axis
     array([2, 3])
-    >>> np.amax(a, axis=1)   # Maxima along the second axis
+    >>> np.max(a, axis=1)   # Maxima along the second axis
     array([1, 3])
-    >>> np.amax(a, where=[False, True], initial=-1, axis=0)
+    >>> np.max(a, where=[False, True], initial=-1, axis=0)
     array([-1,  3])
     >>> b = np.arange(5, dtype=float)
     >>> b[2] = np.NaN
-    >>> np.amax(b)
+    >>> np.max(b)
     nan
-    >>> np.amax(b, where=~np.isnan(b), initial=-1)
+    >>> np.max(b, where=~np.isnan(b), initial=-1)
     4.0
     >>> np.nanmax(b)
     4.0
@@ -2805,14 +2795,14 @@ def amax(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
     You can use an initial value to compute the maximum of an empty slice, or
     to initialize it to a different value:
 
-    >>> np.amax([[-50], [10]], axis=-1, initial=0)
+    >>> np.max([[-50], [10]], axis=-1, initial=0)
     array([ 0, 10])
 
     Notice that the initial value is used as one of the elements for which the
     maximum is determined, unlike for the default argument Python's max
     function, which is only used for empty iterables.
 
-    >>> np.amax([5], initial=6)
+    >>> np.max([5], initial=6)
     6
     >>> max([5], default=6)
     5
@@ -2821,14 +2811,31 @@ def amax(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
                           keepdims=keepdims, initial=initial, where=where)
 
 
-def _amin_dispatcher(a, axis=None, out=None, keepdims=None, initial=None,
-                     where=None):
+@array_function_dispatch(_max_dispatcher)
+def amax(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
+         where=np._NoValue):
+    """
+    Return the maximum of an array or maximum along an axis.
+
+    `amax` is an alias of `~numpy.max`.
+
+    See Also
+    --------
+    max : alias of this function
+    ndarray.max : equivalent method
+    """
+    return _wrapreduction(a, np.maximum, 'max', axis, None, out,
+                          keepdims=keepdims, initial=initial, where=where)
+
+
+def _min_dispatcher(a, axis=None, out=None, keepdims=None, initial=None,
+                    where=None):
     return (a, out)
 
 
-@array_function_dispatch(_amin_dispatcher)
-def amin(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
-         where=np._NoValue):
+@array_function_dispatch(_min_dispatcher)
+def min(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
+        where=np._NoValue):
     """
     Return the minimum of an array or minimum along an axis.
 
@@ -2855,7 +2862,7 @@ def amin(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
         the result will broadcast correctly against the input array.
 
         If the default value is passed, then `keepdims` will not be
-        passed through to the `amin` method of sub-classes of
+        passed through to the ``min`` method of sub-classes of
         `ndarray`, however any non-default value will be.  If the
         sub-class' method does not implement `keepdims` any
         exceptions will be raised.
@@ -2874,7 +2881,7 @@ def amin(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
 
     Returns
     -------
-    amin : ndarray or scalar
+    min : ndarray or scalar
         Minimum of `a`. If `axis` is None, the result is a scalar value.
         If `axis` is an int, the result is an array of dimension
         ``a.ndim - 1``.  If `axis` is a tuple, the result is an array of 
@@ -2901,9 +2908,9 @@ def amin(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
     corresponding min value will be NaN as well. To ignore NaN values
     (MATLAB behavior), please use nanmin.
 
-    Don't use `amin` for element-wise comparison of 2 arrays; when
+    Don't use `~numpy.min` for element-wise comparison of 2 arrays; when
     ``a.shape[0]`` is 2, ``minimum(a[0], a[1])`` is faster than
-    ``amin(a, axis=0)``.
+    ``min(a, axis=0)``.
 
     Examples
     --------
@@ -2911,25 +2918,25 @@ def amin(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
     >>> a
     array([[0, 1],
            [2, 3]])
-    >>> np.amin(a)           # Minimum of the flattened array
+    >>> np.min(a)           # Minimum of the flattened array
     0
-    >>> np.amin(a, axis=0)   # Minima along the first axis
+    >>> np.min(a, axis=0)   # Minima along the first axis
     array([0, 1])
-    >>> np.amin(a, axis=1)   # Minima along the second axis
+    >>> np.min(a, axis=1)   # Minima along the second axis
     array([0, 2])
-    >>> np.amin(a, where=[False, True], initial=10, axis=0)
+    >>> np.min(a, where=[False, True], initial=10, axis=0)
     array([10,  1])
 
     >>> b = np.arange(5, dtype=float)
     >>> b[2] = np.NaN
-    >>> np.amin(b)
+    >>> np.min(b)
     nan
-    >>> np.amin(b, where=~np.isnan(b), initial=10)
+    >>> np.min(b, where=~np.isnan(b), initial=10)
     0.0
     >>> np.nanmin(b)
     0.0
 
-    >>> np.amin([[-50], [10]], axis=-1, initial=0)
+    >>> np.min([[-50], [10]], axis=-1, initial=0)
     array([-50,   0])
 
     Notice that the initial value is used as one of the elements for which the
@@ -2938,7 +2945,7 @@ def amin(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
 
     Notice that this isn't the same as Python's ``default`` argument.
 
-    >>> np.amin([6], initial=5)
+    >>> np.min([6], initial=5)
     5
     >>> min([6], default=5)
     6
@@ -2947,6 +2954,23 @@ def amin(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
                           keepdims=keepdims, initial=initial, where=where)
 
 
+@array_function_dispatch(_min_dispatcher)
+def amin(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
+         where=np._NoValue):
+    """
+    Return the minimum of an array or minimum along an axis.
+
+    `amin` is an alias of `~numpy.min`.
+
+    See Also
+    --------
+    min : alias of this function
+    ndarray.min : equivalent method
+    """
+    return _wrapreduction(a, np.minimum, 'min', axis, None, out,
+                          keepdims=keepdims, initial=initial, where=where)
+
+
 def _prod_dispatcher(a, axis=None, dtype=None, out=None, keepdims=None,
                      initial=None, where=None):
     return (a, out)
@@ -3238,12 +3262,12 @@ def size(a, axis=None):
             return asarray(a).shape[axis]
 
 
-def _around_dispatcher(a, decimals=None, out=None):
+def _round_dispatcher(a, decimals=None, out=None):
     return (a, out)
 
 
-@array_function_dispatch(_around_dispatcher)
-def around(a, decimals=0, out=None):
+@array_function_dispatch(_round_dispatcher)
+def round(a, decimals=0, out=None):
     """
     Evenly round to the given number of decimals.
 
@@ -3274,18 +3298,17 @@ def around(a, decimals=0, out=None):
     See Also
     --------
     ndarray.round : equivalent method
+    around : an alias for this function
     ceil, fix, floor, rint, trunc
 
 
     Notes
     -----
-    `~numpy.round` is often used as an alias for `~numpy.around`.
-    
     For values exactly halfway between rounded decimal values, NumPy
     rounds to the nearest even value. Thus 1.5 and 2.5 round to 2.0,
     -0.5 and 0.5 round to 0.0, etc.
 
-    ``np.around`` uses a fast but sometimes inexact algorithm to round
+    ``np.round`` uses a fast but sometimes inexact algorithm to round
     floating-point datatypes. For positive `decimals` it is equivalent to
     ``np.true_divide(np.rint(a * 10**decimals), 10**decimals)``, which has
     error due to the inexact representation of decimal fractions in the IEEE
@@ -3322,21 +3345,38 @@ def around(a, decimals=0, out=None):
 
     Examples
     --------
-    >>> np.around([0.37, 1.64])
+    >>> np.round([0.37, 1.64])
     array([0., 2.])
-    >>> np.around([0.37, 1.64], decimals=1)
+    >>> np.round([0.37, 1.64], decimals=1)
     array([0.4, 1.6])
-    >>> np.around([.5, 1.5, 2.5, 3.5, 4.5]) # rounds to nearest even value
+    >>> np.round([.5, 1.5, 2.5, 3.5, 4.5]) # rounds to nearest even value
     array([0., 2., 2., 4., 4.])
-    >>> np.around([1,2,3,11], decimals=1) # ndarray of ints is returned
+    >>> np.round([1,2,3,11], decimals=1) # ndarray of ints is returned
     array([ 1,  2,  3, 11])
-    >>> np.around([1,2,3,11], decimals=-1)
+    >>> np.round([1,2,3,11], decimals=-1)
     array([ 0,  0,  0, 10])
 
     """
     return _wrapfunc(a, 'round', decimals=decimals, out=out)
 
 
+@array_function_dispatch(_round_dispatcher)
+def around(a, decimals=0, out=None):
+    """
+    Round an array to the given number of decimals.
+
+    `around` is an alias of `~numpy.round`.
+
+    See Also
+    --------
+    ndarray.round : equivalent method
+    round : alias for this function
+    ceil, fix, floor, rint, trunc
+
+    """
+    return _wrapfunc(a, 'round', decimals=decimals, out=out)
+
+
 def _mean_dispatcher(a, axis=None, dtype=None, out=None, keepdims=None, *,
                      where=None):
     return (a, where, out)
@@ -3748,14 +3788,31 @@ def var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=np._NoValue, *,
                          **kwargs)
 
 
-# Aliases of other functions. These have their own definitions only so that
-# they can have unique docstrings.
+# Aliases of other functions. Provided unique docstrings 
+# are for reference purposes only. Wherever possible,
+# avoid using them.
+
+
+def _round__dispatcher(a, decimals=None, out=None):
+    # 2023-02-28, 1.25.0
+    warnings.warn("`round_` is deprecated as of NumPy 1.25.0, and will be "
+                  "removed in NumPy 2.0. Please use `round` instead.",
+                  DeprecationWarning, stacklevel=3)
+    return (a, out)
+
 
-@array_function_dispatch(_around_dispatcher)
+@array_function_dispatch(_round__dispatcher)
 def round_(a, decimals=0, out=None):
     """
     Round an array to the given number of decimals.
 
+    `~numpy.round_` is a disrecommended backwards-compatibility
+    alias of `~numpy.around` and `~numpy.round`.
+
+    .. deprecated:: 1.25.0
+        ``round_`` is deprecated as of NumPy 1.25.0, and will be
+        removed in NumPy 2.0. Please use `round` instead.
+
     See Also
     --------
     around : equivalent function; see for details.
@@ -3763,11 +3820,24 @@ def round_(a, decimals=0, out=None):
     return around(a, decimals=decimals, out=out)
 
 
-@array_function_dispatch(_prod_dispatcher, verify=False)
+def _product_dispatcher(a, axis=None, dtype=None, out=None, keepdims=None,
+                        initial=None, where=None):
+    # 2023-03-02, 1.25.0
+    warnings.warn("`product` is deprecated as of NumPy 1.25.0, and will be "
+                  "removed in NumPy 2.0. Please use `prod` instead.",
+                  DeprecationWarning, stacklevel=3)
+    return (a, out)
+
+
+@array_function_dispatch(_product_dispatcher, verify=False)
 def product(*args, **kwargs):
     """
     Return the product of array elements over a given axis.
 
+    .. deprecated:: 1.25.0
+        ``product`` is deprecated as of NumPy 1.25.0, and will be
+        removed in NumPy 2.0. Please use `prod` instead.
+
     See Also
     --------
     prod : equivalent function; see for details.
@@ -3775,11 +3845,23 @@ def product(*args, **kwargs):
     return prod(*args, **kwargs)
 
 
-@array_function_dispatch(_cumprod_dispatcher, verify=False)
+def _cumproduct_dispatcher(a, axis=None, dtype=None, out=None):
+    # 2023-03-02, 1.25.0
+    warnings.warn("`cumproduct` is deprecated as of NumPy 1.25.0, and will be "
+                  "removed in NumPy 2.0. Please use `cumprod` instead.",
+                  DeprecationWarning, stacklevel=3)
+    return (a, out)
+
+
+@array_function_dispatch(_cumproduct_dispatcher, verify=False)
 def cumproduct(*args, **kwargs):
     """
     Return the cumulative product over the given axis.
 
+    .. deprecated:: 1.25.0
+        ``cumproduct`` is deprecated as of NumPy 1.25.0, and will be
+        removed in NumPy 2.0. Please use `cumprod` instead.
+
     See Also
     --------
     cumprod : equivalent function; see for details.
@@ -3787,13 +3869,26 @@ def cumproduct(*args, **kwargs):
     return cumprod(*args, **kwargs)
 
 
-@array_function_dispatch(_any_dispatcher, verify=False)
+def _sometrue_dispatcher(a, axis=None, out=None, keepdims=None, *,
+                         where=np._NoValue):
+    # 2023-03-02, 1.25.0
+    warnings.warn("`sometrue` is deprecated as of NumPy 1.25.0, and will be "
+                  "removed in NumPy 2.0. Please use `any` instead.",
+                  DeprecationWarning, stacklevel=3)
+    return (a, where, out)
+
+
+@array_function_dispatch(_sometrue_dispatcher, verify=False)
 def sometrue(*args, **kwargs):
     """
     Check whether some values are true.
 
     Refer to `any` for full documentation.
 
+    .. deprecated:: 1.25.0
+        ``sometrue`` is deprecated as of NumPy 1.25.0, and will be
+        removed in NumPy 2.0. Please use `any` instead.
+
     See Also
     --------
     any : equivalent function; see for details.
@@ -3801,11 +3896,23 @@ def sometrue(*args, **kwargs):
     return any(*args, **kwargs)
 
 
-@array_function_dispatch(_all_dispatcher, verify=False)
+def _alltrue_dispatcher(a, axis=None, out=None, keepdims=None, *, where=None):
+    # 2023-03-02, 1.25.0
+    warnings.warn("`alltrue` is deprecated as of NumPy 1.25.0, and will be "
+                  "removed in NumPy 2.0. Please use `all` instead.",
+                  DeprecationWarning, stacklevel=3)
+    return (a, where, out)
+
+
+@array_function_dispatch(_alltrue_dispatcher, verify=False)
 def alltrue(*args, **kwargs):
     """
     Check if all elements of input array are true.
 
+    .. deprecated:: 1.25.0
+        ``alltrue`` is deprecated as of NumPy 1.25.0, and will be
+        removed in NumPy 2.0. Please use `all` instead.
+
     See Also
     --------
     numpy.all : Equivalent function; see for details.
diff --git a/numpy/core/fromnumeric.pyi b/numpy/core/fromnumeric.pyi
index 17b17819d..43d178557 100644
--- a/numpy/core/fromnumeric.pyi
+++ b/numpy/core/fromnumeric.pyi
@@ -1047,3 +1047,7 @@ def var(
     *,
     where: _ArrayLikeBool_co = ...,
 ) -> _ArrayType: ...
+
+max = amax
+min = amin
+round = around
diff --git a/numpy/core/function_base.py b/numpy/core/function_base.py
index b5323fb17..00e4e6b0e 100644
--- a/numpy/core/function_base.py
+++ b/numpy/core/function_base.py
@@ -3,6 +3,7 @@ import warnings
 import operator
 import types
 
+import numpy as np
 from . import numeric as _nx
 from .numeric import result_type, NaN, asanyarray, ndim
 from numpy.core.multiarray import add_docstring
@@ -91,6 +92,7 @@ def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None,
                 scale (a geometric progression).
     logspace : Similar to `geomspace`, but with the end points specified as
                logarithms.
+    :ref:`how-to-partition`
 
     Examples
     --------
@@ -166,7 +168,7 @@ def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None,
     y += start
 
     if endpoint and num > 1:
-        y[-1] = stop
+        y[-1, ...] = stop
 
     if axis != 0:
         y = _nx.moveaxis(y, 0, axis)
@@ -182,7 +184,7 @@ def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None,
 
 def _logspace_dispatcher(start, stop, num=None, endpoint=None, base=None,
                          dtype=None, axis=None):
-    return (start, stop)
+    return (start, stop, base)
 
 
 @array_function_dispatch(_logspace_dispatcher)
@@ -198,6 +200,9 @@ def logspace(start, stop, num=50, endpoint=True, base=10.0, dtype=None,
     .. versionchanged:: 1.16.0
         Non-scalar `start` and `stop` are now supported.
 
+    .. versionchanged:: 1.25.0
+        Non-scalar 'base` is now supported
+
     Parameters
     ----------
     start : array_like
@@ -222,9 +227,10 @@ def logspace(start, stop, num=50, endpoint=True, base=10.0, dtype=None,
         an integer; `float` is chosen even if the arguments would produce an
         array of integers.
     axis : int, optional
-        The axis in the result to store the samples.  Relevant only if start
-        or stop are array-like.  By default (0), the samples will be along a
-        new axis inserted at the beginning. Use -1 to get an axis at the end.
+        The axis in the result to store the samples.  Relevant only if start,
+        stop, or base are array-like.  By default (0), the samples will be
+        along a new axis inserted at the beginning. Use -1 to get an axis at
+        the end.
 
         .. versionadded:: 1.16.0
 
@@ -242,10 +248,11 @@ def logspace(start, stop, num=50, endpoint=True, base=10.0, dtype=None,
     linspace : Similar to logspace, but with the samples uniformly distributed
                in linear space, instead of log space.
     geomspace : Similar to logspace, but with endpoints specified directly.
+    :ref:`how-to-partition`
 
     Notes
     -----
-    Logspace is equivalent to the code
+    If base is a scalar, logspace is equivalent to the code
 
     >>> y = np.linspace(start, stop, num=num, endpoint=endpoint)
     ... # doctest: +SKIP
@@ -260,6 +267,9 @@ def logspace(start, stop, num=50, endpoint=True, base=10.0, dtype=None,
     array([100.        ,  177.827941  ,  316.22776602,  562.34132519])
     >>> np.logspace(2.0, 3.0, num=4, base=2.0)
     array([4.        ,  5.0396842 ,  6.34960421,  8.        ])
+    >>> np.logspace(2.0, 3.0, num=4, base=[2.0, 3.0], axis=-1)
+    array([[ 4.        ,  5.0396842 ,  6.34960421,  8.        ],
+           [ 9.        , 12.98024613, 18.72075441, 27.        ]])
 
     Graphical illustration:
 
@@ -277,7 +287,13 @@ def logspace(start, stop, num=50, endpoint=True, base=10.0, dtype=None,
     >>> plt.show()
 
     """
+    ndmax = np.broadcast(start, stop, base).ndim
+    start, stop, base = (
+        np.array(a, copy=False, subok=True, ndmin=ndmax)
+        for a in (start, stop, base)
+    )
     y = linspace(start, stop, num=num, endpoint=endpoint, axis=axis)
+    base = np.expand_dims(base, axis=axis)
     if dtype is None:
         return _nx.power(base, y)
     return _nx.power(base, y).astype(dtype, copy=False)
@@ -338,6 +354,7 @@ def geomspace(start, stop, num=50, endpoint=True, dtype=None, axis=0):
                progression.
     arange : Similar to linspace, with the step size specified instead of the
              number of samples.
+    :ref:`how-to-partition`
 
     Notes
     -----
diff --git a/numpy/core/getlimits.py b/numpy/core/getlimits.py
index 2c0f462cc..da9e1d7f3 100644
--- a/numpy/core/getlimits.py
+++ b/numpy/core/getlimits.py
@@ -5,8 +5,8 @@ __all__ = ['finfo', 'iinfo']
 
 import warnings
 
+from .._utils import set_module
 from ._machar import MachAr
-from .overrides import set_module
 from . import numeric
 from . import numerictypes as ntypes
 from .numeric import array, inf, NaN
@@ -147,8 +147,12 @@ _MACHAR_PARAMS = {
 
 # Key to identify the floating point type.  Key is result of
 # ftype('-0.1').newbyteorder('<').tobytes()
+#
+# 20230201 - use (ftype(-1.0) / ftype(10.0)).newbyteorder('<').tobytes()
+#            instead because stold may have deficiencies on some platforms.
 # See:
 # https://perl5.git.perl.org/perl.git/blob/3118d7d684b56cbeb702af874f4326683c45f045:/Configure
+
 _KNOWN_TYPES = {}
 def _register_type(machar, bytepat):
     _KNOWN_TYPES[bytepat] = machar
@@ -240,8 +244,6 @@ def _register_known_types():
     # IEEE 754 128-bit binary float
     _register_type(float128_ma,
         b'\x9a\x99\x99\x99\x99\x99\x99\x99\x99\x99\x99\x99\x99\x99\xfb\xbf')
-    _register_type(float128_ma,
-        b'\x9a\x99\x99\x99\x99\x99\x99\x99\x99\x99\x99\x99\x99\x99\xfb\xbf')
     _float_ma[128] = float128_ma
 
     # Known parameters for float80 (Intel 80-bit extended precision)
@@ -329,7 +331,9 @@ def _get_machar(ftype):
     if params is None:
         raise ValueError(repr(ftype))
     # Detect known / suspected types
-    key = ftype('-0.1').newbyteorder('<').tobytes()
+    # ftype(-1.0) / ftype(10.0) is better than ftype('-0.1') because stold
+    # may be deficient
+    key = (ftype(-1.0) / ftype(10.)).newbyteorder('<').tobytes()
     ma_like = None
     if ftype == ntypes.longdouble:
         # Could be 80 bit == 10 byte extended precision, where last bytes can
@@ -338,7 +342,14 @@ def _get_machar(ftype):
         # random garbage.
         ma_like = _KNOWN_TYPES.get(key[:10])
     if ma_like is None:
+        # see if the full key is known.
         ma_like = _KNOWN_TYPES.get(key)
+    if ma_like is None and len(key) == 16:
+        # machine limits could be f80 masquerading as np.float128,
+        # find all keys with length 16 and make new dict, but make the keys
+        # only 10 bytes long, the last bytes can be random garbage
+        _kt = {k[:10]: v for k, v in _KNOWN_TYPES.items() if len(k) == 16}
+        ma_like = _kt.get(key[:10])
     if ma_like is not None:
         return ma_like
     # Fall back to parameter discovery
@@ -352,6 +363,9 @@ def _get_machar(ftype):
 
 def _discovered_machar(ftype):
     """ Create MachAr instance with found information on float types
+
+    TODO: MachAr should be retired completely ideally.  We currently only
+          ever use it system with broken longdouble (valgrind, WSL).
     """
     params = _MACHAR_PARAMS[ftype]
     return MachAr(lambda v: array([v], ftype),
@@ -387,11 +401,6 @@ class finfo:
     iexp : int
         The number of bits in the exponent portion of the floating point
         representation.
-    machar : MachAr
-        The object which calculated these parameters and holds more
-        detailed information.
-
-        .. deprecated:: 1.22
     machep : int
         The exponent that yields `eps`.
     max : floating point number of the appropriate type
@@ -432,7 +441,6 @@ class finfo:
 
     See Also
     --------
-    MachAr : The implementation of the tests that produce this information.
     iinfo : The equivalent for integer data types.
     spacing : The distance between a value and the nearest adjacent number
     nextafter : The next floating point value after x1 towards x2
@@ -474,13 +482,26 @@ class finfo:
     _finfo_cache = {}
 
     def __new__(cls, dtype):
+        obj = cls._finfo_cache.get(dtype)  # most common path
+        if obj is not None:
+            return obj
+
+        if dtype is None:
+            # Deprecated in NumPy 1.25, 2023-01-16
+            warnings.warn(
+                "finfo() dtype cannot be None. This behavior will "
+                "raise an error in the future. (Deprecated in NumPy 1.25)",
+                DeprecationWarning,
+                stacklevel=2
+            )
+
         try:
             dtype = numeric.dtype(dtype)
         except TypeError:
             # In case a float instance was given
             dtype = numeric.dtype(type(dtype))
 
-        obj = cls._finfo_cache.get(dtype, None)
+        obj = cls._finfo_cache.get(dtype)
         if obj is not None:
             return obj
         dtypes = [dtype]
@@ -490,17 +511,24 @@ class finfo:
             dtype = newdtype
         if not issubclass(dtype, numeric.inexact):
             raise ValueError("data type %r not inexact" % (dtype))
-        obj = cls._finfo_cache.get(dtype, None)
+        obj = cls._finfo_cache.get(dtype)
         if obj is not None:
             return obj
         if not issubclass(dtype, numeric.floating):
             newdtype = _convert_to_float[dtype]
             if newdtype is not dtype:
+                # dtype changed, for example from complex128 to float64
                 dtypes.append(newdtype)
                 dtype = newdtype
-        obj = cls._finfo_cache.get(dtype, None)
-        if obj is not None:
-            return obj
+
+                obj = cls._finfo_cache.get(dtype, None)
+                if obj is not None:
+                    # the original dtype was not in the cache, but the new
+                    # dtype is in the cache. we add the original dtypes to
+                    # the cache and return the result
+                    for dt in dtypes:
+                        cls._finfo_cache[dt] = obj
+                    return obj
         obj = object.__new__(cls)._init(dtype)
         for dt in dtypes:
             cls._finfo_cache[dt] = obj
@@ -595,20 +623,6 @@ class finfo:
         """
         return self.smallest_normal
 
-    @property
-    def machar(self):
-        """The object which calculated these parameters and holds more
-        detailed information.
-
-        .. deprecated:: 1.22
-        """
-        # Deprecated 2021-10-27, NumPy 1.22
-        warnings.warn(
-            "`finfo.machar` is deprecated (NumPy 1.22)",
-            DeprecationWarning, stacklevel=2,
-        )
-        return self._machar
-
 
 @set_module('numpy')
 class iinfo:
diff --git a/numpy/core/include/meson.build b/numpy/core/include/meson.build
new file mode 100644
index 000000000..be07a07b8
--- /dev/null
+++ b/numpy/core/include/meson.build
@@ -0,0 +1,52 @@
+installed_headers = [
+  'numpy/_neighborhood_iterator_imp.h',
+  'numpy/arrayobject.h',
+  'numpy/arrayscalars.h',
+  'numpy/_dtype_api.h',
+  'numpy/experimental_dtype_api.h',
+  'numpy/halffloat.h',
+  'numpy/ndarrayobject.h',
+  'numpy/ndarraytypes.h',
+  'numpy/noprefix.h',
+  'numpy/npy_1_7_deprecated_api.h',
+  'numpy/npy_3kcompat.h',
+  'numpy/npy_common.h',
+  'numpy/npy_cpu.h',
+  'numpy/npy_endian.h',
+  'numpy/npy_interrupt.h',
+  'numpy/npy_math.h',
+  'numpy/npy_no_deprecated_api.h',
+  'numpy/npy_os.h',
+  'numpy/numpyconfig.h',
+  'numpy/old_defines.h',
+  'numpy/ufuncobject.h',
+  'numpy/utils.h',
+]
+
+# Note that oldnumeric.h is not installed on purpose. After investigation,
+# there are only two active repos left which use it, and those got a heads up
+# about removal of this header in numpy 1.25.0:
+#   https://github.com/enthought/enable/issues/1005
+#   https://github.com/fhs/pyhdf/issues/63
+
+py.install_sources(
+  installed_headers,
+  subdir: 'numpy/core/include/numpy'
+)
+
+py.install_sources(
+  [
+    'numpy/random/bitgen.h',
+    'numpy/random/distributions.h',
+  ],
+  subdir: 'numpy/core/include/numpy/random'
+)
+
+
+py.install_sources(
+  [
+    'numpy/libdivide/libdivide.h',
+    'numpy/libdivide/LICENSE.txt',
+  ],
+  subdir: 'numpy/core/include/numpy/random'
+)
diff --git a/numpy/core/include/numpy/_dtype_api.h b/numpy/core/include/numpy/_dtype_api.h
new file mode 100644
index 000000000..c397699c7
--- /dev/null
+++ b/numpy/core/include/numpy/_dtype_api.h
@@ -0,0 +1,408 @@
+/*
+ * DType related API shared by the (experimental) public API And internal API.
+ */
+
+#ifndef NUMPY_CORE_INCLUDE_NUMPY___DTYPE_API_H_
+#define NUMPY_CORE_INCLUDE_NUMPY___DTYPE_API_H_
+
+#define __EXPERIMENTAL_DTYPE_API_VERSION 10
+
+struct PyArrayMethodObject_tag;
+
+/*
+ * Largely opaque struct for DType classes (i.e. metaclass instances).
+ * The internal definition is currently in `ndarraytypes.h` (export is a bit
+ * more complex because `PyArray_Descr` is a DTypeMeta internally but not
+ * externally).
+ */
+#if !(defined(NPY_INTERNAL_BUILD) && NPY_INTERNAL_BUILD)
+
+    typedef struct PyArray_DTypeMeta_tag {
+        PyHeapTypeObject super;
+
+        /*
+        * Most DTypes will have a singleton default instance, for the
+        * parametric legacy DTypes (bytes, string, void, datetime) this
+        * may be a pointer to the *prototype* instance?
+        */
+        PyArray_Descr *singleton;
+        /* Copy of the legacy DTypes type number, usually invalid. */
+        int type_num;
+
+        /* The type object of the scalar instances (may be NULL?) */
+        PyTypeObject *scalar_type;
+        /*
+        * DType flags to signal legacy, parametric, or
+        * abstract.  But plenty of space for additional information/flags.
+        */
+        npy_uint64 flags;
+
+        /*
+        * Use indirection in order to allow a fixed size for this struct.
+        * A stable ABI size makes creating a static DType less painful
+        * while also ensuring flexibility for all opaque API (with one
+        * indirection due the pointer lookup).
+        */
+        void *dt_slots;
+        /* Allow growing (at the moment also beyond this) */
+        void *reserved[3];
+    } PyArray_DTypeMeta;
+
+#endif  /* not internal build */
+
+/*
+ * ******************************************************
+ *         ArrayMethod API (Casting and UFuncs)
+ * ******************************************************
+ */
+/*
+ * NOTE: Expected changes:
+ *       * probably split runtime and general flags into two
+ *       * should possibly not use an enum for typedef for more stable ABI?
+ */
+typedef enum {
+    /* Flag for whether the GIL is required */
+    NPY_METH_REQUIRES_PYAPI = 1 << 0,
+    /*
+     * Some functions cannot set floating point error flags, this flag
+     * gives us the option (not requirement) to skip floating point error
+     * setup/check. No function should set error flags and ignore them
+     * since it would interfere with chaining operations (e.g. casting).
+     */
+    NPY_METH_NO_FLOATINGPOINT_ERRORS = 1 << 1,
+    /* Whether the method supports unaligned access (not runtime) */
+    NPY_METH_SUPPORTS_UNALIGNED = 1 << 2,
+    /*
+     * Used for reductions to allow reordering the operation.  At this point
+     * assume that if set, it also applies to normal operations though!
+     */
+    NPY_METH_IS_REORDERABLE = 1 << 3,
+    /*
+     * Private flag for now for *logic* functions.  The logical functions
+     * `logical_or` and `logical_and` can always cast the inputs to booleans
+     * "safely" (because that is how the cast to bool is defined).
+     * @seberg: I am not sure this is the best way to handle this, so its
+     * private for now (also it is very limited anyway).
+     * There is one "exception". NA aware dtypes cannot cast to bool
+     * (hopefully), so the `??->?` loop should error even with this flag.
+     * But a second NA fallback loop will be necessary.
+     */
+    _NPY_METH_FORCE_CAST_INPUTS = 1 << 17,
+
+    /* All flags which can change at runtime */
+    NPY_METH_RUNTIME_FLAGS = (
+            NPY_METH_REQUIRES_PYAPI |
+            NPY_METH_NO_FLOATINGPOINT_ERRORS),
+} NPY_ARRAYMETHOD_FLAGS;
+
+
+typedef struct PyArrayMethod_Context_tag {
+    /* The caller, which is typically the original ufunc.  May be NULL */
+    PyObject *caller;
+    /* The method "self".  Publically currentl an opaque object. */
+    struct PyArrayMethodObject_tag *method;
+
+    /* Operand descriptors, filled in by resolve_descriptors */
+    PyArray_Descr **descriptors;
+    /* Structure may grow (this is harmless for DType authors) */
+} PyArrayMethod_Context;
+
+
+/*
+ * The main object for creating a new ArrayMethod. We use the typical `slots`
+ * mechanism used by the Python limited API (see below for the slot defs).
+ */
+typedef struct {
+    const char *name;
+    int nin, nout;
+    NPY_CASTING casting;
+    NPY_ARRAYMETHOD_FLAGS flags;
+    PyArray_DTypeMeta **dtypes;
+    PyType_Slot *slots;
+} PyArrayMethod_Spec;
+
+
+/*
+ * ArrayMethod slots
+ * -----------------
+ *
+ * SLOTS IDs For the ArrayMethod creation, once fully public, IDs are fixed
+ * but can be deprecated and arbitrarily extended.
+ */
+#define NPY_METH_resolve_descriptors 1
+/* We may want to adapt the `get_loop` signature a bit: */
+#define _NPY_METH_get_loop 2
+#define NPY_METH_get_reduction_initial 3
+/* specific loops for constructions/default get_loop: */
+#define NPY_METH_strided_loop 4
+#define NPY_METH_contiguous_loop 5
+#define NPY_METH_unaligned_strided_loop 6
+#define NPY_METH_unaligned_contiguous_loop 7
+#define NPY_METH_contiguous_indexed_loop 8
+
+/*
+ * The resolve descriptors function, must be able to handle NULL values for
+ * all output (but not input) `given_descrs` and fill `loop_descrs`.
+ * Return -1 on error or 0 if the operation is not possible without an error
+ * set.  (This may still be in flux.)
+ * Otherwise must return the "casting safety", for normal functions, this is
+ * almost always "safe" (or even "equivalent"?).
+ *
+ * `resolve_descriptors` is optional if all output DTypes are non-parametric.
+ */
+typedef NPY_CASTING (resolve_descriptors_function)(
+        /* "method" is currently opaque (necessary e.g. to wrap Python) */
+        struct PyArrayMethodObject_tag *method,
+        /* DTypes the method was created for */
+        PyArray_DTypeMeta **dtypes,
+        /* Input descriptors (instances).  Outputs may be NULL. */
+        PyArray_Descr **given_descrs,
+        /* Exact loop descriptors to use, must not hold references on error */
+        PyArray_Descr **loop_descrs,
+        npy_intp *view_offset);
+
+
+typedef int (PyArrayMethod_StridedLoop)(PyArrayMethod_Context *context,
+        char *const *data, const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *transferdata);
+
+
+typedef int (get_loop_function)(
+        PyArrayMethod_Context *context,
+        int aligned, int move_references,
+        const npy_intp *strides,
+        PyArrayMethod_StridedLoop **out_loop,
+        NpyAuxData **out_transferdata,
+        NPY_ARRAYMETHOD_FLAGS *flags);
+
+/**
+ * Query an ArrayMethod for the initial value for use in reduction.
+ *
+ * @param context The arraymethod context, mainly to access the descriptors.
+ * @param reduction_is_empty Whether the reduction is empty. When it is, the
+ *     value returned may differ.  In this case it is a "default" value that
+ *     may differ from the "identity" value normally used.  For example:
+ *     - `0.0` is the default for `sum([])`.  But `-0.0` is the correct
+ *       identity otherwise as it preserves the sign for `sum([-0.0])`.
+ *     - We use no identity for object, but return the default of `0` and `1`
+ *       for the empty `sum([], dtype=object)` and `prod([], dtype=object)`.
+ *       This allows `np.sum(np.array(["a", "b"], dtype=object))` to work.
+ *     - `-inf` or `INT_MIN` for `max` is an identity, but at least `INT_MIN`
+ *       not a good *default* when there are no items.
+ * @param initial Pointer to initial data to be filled (if possible)
+ *
+ * @returns -1, 0, or 1 indicating error, no initial value, and initial being
+ *     successfully filled.  Errors must not be given where 0 is correct, NumPy
+ *     may call this even when not strictly necessary.
+ */
+typedef int (get_reduction_initial_function)(
+        PyArrayMethod_Context *context, npy_bool reduction_is_empty,
+        char *initial);
+
+/*
+ * The following functions are only used by the wrapping array method defined
+ * in umath/wrapping_array_method.c
+ */
+
+/*
+ * The function to convert the given descriptors (passed in to
+ * `resolve_descriptors`) and translates them for the wrapped loop.
+ * The new descriptors MUST be viewable with the old ones, `NULL` must be
+ * supported (for outputs) and should normally be forwarded.
+ *
+ * The function must clean up on error.
+ *
+ * NOTE: We currently assume that this translation gives "viewable" results.
+ *       I.e. there is no additional casting related to the wrapping process.
+ *       In principle that could be supported, but not sure it is useful.
+ *       This currently also means that e.g. alignment must apply identically
+ *       to the new dtypes.
+ *
+ * TODO: Due to the fact that `resolve_descriptors` is also used for `can_cast`
+ *       there is no way to "pass out" the result of this function.  This means
+ *       it will be called twice for every ufunc call.
+ *       (I am considering including `auxdata` as an "optional" parameter to
+ *       `resolve_descriptors`, so that it can be filled there if not NULL.)
+ */
+typedef int translate_given_descrs_func(int nin, int nout,
+        PyArray_DTypeMeta *wrapped_dtypes[],
+        PyArray_Descr *given_descrs[], PyArray_Descr *new_descrs[]);
+
+/**
+ * The function to convert the actual loop descriptors (as returned by the
+ * original `resolve_descriptors` function) to the ones the output array
+ * should use.
+ * This function must return "viewable" types, it must not mutate them in any
+ * form that would break the inner-loop logic.  Does not need to support NULL.
+ *
+ * The function must clean up on error.
+ *
+ * @param nargs Number of arguments
+ * @param new_dtypes The DTypes of the output (usually probably not needed)
+ * @param given_descrs Original given_descrs to the resolver, necessary to
+ *        fetch any information related to the new dtypes from the original.
+ * @param original_descrs The `loop_descrs` returned by the wrapped loop.
+ * @param loop_descrs The output descriptors, compatible to `original_descrs`.
+ *
+ * @returns 0 on success, -1 on failure.
+ */
+typedef int translate_loop_descrs_func(int nin, int nout,
+        PyArray_DTypeMeta *new_dtypes[], PyArray_Descr *given_descrs[],
+        PyArray_Descr *original_descrs[], PyArray_Descr *loop_descrs[]);
+
+
+/*
+ * A traverse loop working on a single array. This is similar to the general
+ * strided-loop function. This is designed for loops that need to visit every
+ * element of a single array.
+ *
+ * Currently this is used for array clearing, via the NPY_DT_get_clear_loop
+ * API hook, and zero-filling, via the NPY_DT_get_fill_zero_loop API hook.
+ * These are most useful for handling arrays storing embedded references to
+ * python objects or heap-allocated data.
+ *
+ * The `void *traverse_context` is passed in because we may need to pass in
+ * Intepreter state or similar in the future, but we don't want to pass in
+ * a full context (with pointers to dtypes, method, caller which all make
+ * no sense for a traverse function).
+ *
+ * We assume for now that this context can be just passed through in the
+ * the future (for structured dtypes).
+ *
+ */
+typedef int (traverse_loop_function)(
+        void *traverse_context, PyArray_Descr *descr, char *data,
+        npy_intp size, npy_intp stride, NpyAuxData *auxdata);
+
+
+/*
+ * Simplified get_loop function specific to dtype traversal
+ *
+ * It should set the flags needed for the traversal loop and set out_loop to the
+ * loop function, which must be a valid traverse_loop_function
+ * pointer. Currently this is used for zero-filling and clearing arrays storing
+ * embedded references.
+ *
+ */
+typedef int (get_traverse_loop_function)(
+        void *traverse_context, PyArray_Descr *descr,
+        int aligned, npy_intp fixed_stride,
+        traverse_loop_function **out_loop, NpyAuxData **out_auxdata,
+        NPY_ARRAYMETHOD_FLAGS *flags);
+
+
+/*
+ * ****************************
+ *          DTYPE API
+ * ****************************
+ */
+
+#define NPY_DT_ABSTRACT 1 << 1
+#define NPY_DT_PARAMETRIC 1 << 2
+#define NPY_DT_NUMERIC 1 << 3
+
+/*
+ * These correspond to slots in the NPY_DType_Slots struct and must
+ * be in the same order as the members of that struct. If new slots
+ * get added or old slots get removed NPY_NUM_DTYPE_SLOTS must also
+ * be updated
+ */
+
+#define NPY_DT_discover_descr_from_pyobject 1
+// this slot is considered private because its API hasn't beed decided
+#define _NPY_DT_is_known_scalar_type 2
+#define NPY_DT_default_descr 3
+#define NPY_DT_common_dtype 4
+#define NPY_DT_common_instance 5
+#define NPY_DT_ensure_canonical 6
+#define NPY_DT_setitem 7
+#define NPY_DT_getitem 8
+#define NPY_DT_get_clear_loop 9
+#define NPY_DT_get_fill_zero_loop 10
+
+// These PyArray_ArrFunc slots will be deprecated and replaced eventually
+// getitem and setitem can be defined as a performance optimization;
+// by default the user dtypes call `legacy_getitem_using_DType` and
+// `legacy_setitem_using_DType`, respectively. This functionality is
+// only supported for basic NumPy DTypes.
+
+
+// used to separate dtype slots from arrfuncs slots
+// intended only for internal use but defined here for clarity
+#define _NPY_DT_ARRFUNCS_OFFSET (1 << 10)
+
+// Cast is disabled
+// #define NPY_DT_PyArray_ArrFuncs_cast 0 + _NPY_DT_ARRFUNCS_OFFSET
+
+#define NPY_DT_PyArray_ArrFuncs_getitem 1 + _NPY_DT_ARRFUNCS_OFFSET
+#define NPY_DT_PyArray_ArrFuncs_setitem 2 + _NPY_DT_ARRFUNCS_OFFSET
+
+#define NPY_DT_PyArray_ArrFuncs_copyswapn 3 + _NPY_DT_ARRFUNCS_OFFSET
+#define NPY_DT_PyArray_ArrFuncs_copyswap 4 + _NPY_DT_ARRFUNCS_OFFSET
+#define NPY_DT_PyArray_ArrFuncs_compare 5 + _NPY_DT_ARRFUNCS_OFFSET
+#define NPY_DT_PyArray_ArrFuncs_argmax 6 + _NPY_DT_ARRFUNCS_OFFSET
+#define NPY_DT_PyArray_ArrFuncs_dotfunc 7 + _NPY_DT_ARRFUNCS_OFFSET
+#define NPY_DT_PyArray_ArrFuncs_scanfunc 8 + _NPY_DT_ARRFUNCS_OFFSET
+#define NPY_DT_PyArray_ArrFuncs_fromstr 9 + _NPY_DT_ARRFUNCS_OFFSET
+#define NPY_DT_PyArray_ArrFuncs_nonzero 10 + _NPY_DT_ARRFUNCS_OFFSET
+#define NPY_DT_PyArray_ArrFuncs_fill 11 + _NPY_DT_ARRFUNCS_OFFSET
+#define NPY_DT_PyArray_ArrFuncs_fillwithscalar 12 + _NPY_DT_ARRFUNCS_OFFSET
+#define NPY_DT_PyArray_ArrFuncs_sort 13 + _NPY_DT_ARRFUNCS_OFFSET
+#define NPY_DT_PyArray_ArrFuncs_argsort 14 + _NPY_DT_ARRFUNCS_OFFSET
+
+// Casting related slots are disabled. See
+// https://github.com/numpy/numpy/pull/23173#discussion_r1101098163
+// #define NPY_DT_PyArray_ArrFuncs_castdict 15 + _NPY_DT_ARRFUNCS_OFFSET
+// #define NPY_DT_PyArray_ArrFuncs_scalarkind 16 + _NPY_DT_ARRFUNCS_OFFSET
+// #define NPY_DT_PyArray_ArrFuncs_cancastscalarkindto 17 + _NPY_DT_ARRFUNCS_OFFSET
+// #define NPY_DT_PyArray_ArrFuncs_cancastto 18 + _NPY_DT_ARRFUNCS_OFFSET
+
+// These are deprecated in NumPy 1.19, so are disabled here.
+// #define NPY_DT_PyArray_ArrFuncs_fastclip 19 + _NPY_DT_ARRFUNCS_OFFSET
+// #define NPY_DT_PyArray_ArrFuncs_fastputmask 20 + _NPY_DT_ARRFUNCS_OFFSET
+// #define NPY_DT_PyArray_ArrFuncs_fasttake 21 + _NPY_DT_ARRFUNCS_OFFSET
+#define NPY_DT_PyArray_ArrFuncs_argmin 22 + _NPY_DT_ARRFUNCS_OFFSET
+
+// TODO: These slots probably still need some thought, and/or a way to "grow"?
+typedef struct {
+    PyTypeObject *typeobj;    /* type of python scalar or NULL */
+    int flags;                /* flags, including parametric and abstract */
+    /* NULL terminated cast definitions. Use NULL for the newly created DType */
+    PyArrayMethod_Spec **casts;
+    PyType_Slot *slots;
+    /* Baseclass or NULL (will always subclass `np.dtype`) */
+    PyTypeObject *baseclass;
+} PyArrayDTypeMeta_Spec;
+
+
+typedef PyArray_Descr *(discover_descr_from_pyobject_function)(
+        PyArray_DTypeMeta *cls, PyObject *obj);
+
+/*
+ * Before making this public, we should decide whether it should pass
+ * the type, or allow looking at the object. A possible use-case:
+ * `np.array(np.array([0]), dtype=np.ndarray)`
+ * Could consider arrays that are not `dtype=ndarray` "scalars".
+ */
+typedef int (is_known_scalar_type_function)(
+        PyArray_DTypeMeta *cls, PyTypeObject *obj);
+
+typedef PyArray_Descr *(default_descr_function)(PyArray_DTypeMeta *cls);
+typedef PyArray_DTypeMeta *(common_dtype_function)(
+        PyArray_DTypeMeta *dtype1, PyArray_DTypeMeta *dtype2);
+typedef PyArray_Descr *(common_instance_function)(
+        PyArray_Descr *dtype1, PyArray_Descr *dtype2);
+typedef PyArray_Descr *(ensure_canonical_function)(PyArray_Descr *dtype);
+
+/*
+ * TODO: These two functions are currently only used for experimental DType
+ *       API support.  Their relation should be "reversed": NumPy should
+ *       always use them internally.
+ *       There are open points about "casting safety" though, e.g. setting
+ *       elements is currently always unsafe.
+ */
+typedef int(setitemfunction)(PyArray_Descr *, PyObject *, char *);
+typedef PyObject *(getitemfunction)(PyArray_Descr *, char *);
+
+
+#endif  /* NUMPY_CORE_INCLUDE_NUMPY___DTYPE_API_H_ */
diff --git a/numpy/core/include/numpy/_neighborhood_iterator_imp.h b/numpy/core/include/numpy/_neighborhood_iterator_imp.h
index 07e2363d0..b365cb508 100644
--- a/numpy/core/include/numpy/_neighborhood_iterator_imp.h
+++ b/numpy/core/include/numpy/_neighborhood_iterator_imp.h
@@ -4,7 +4,7 @@
 /*
  * Private API (here for inline)
  */
-static NPY_INLINE int
+static inline int
 _PyArrayNeighborhoodIter_IncrCoord(PyArrayNeighborhoodIterObject* iter);
 
 /*
@@ -34,7 +34,7 @@ _PyArrayNeighborhoodIter_IncrCoord(PyArrayNeighborhoodIterObject* iter);
         iter->coordinates[c] = iter->bounds[c][0]; \
     }
 
-static NPY_INLINE int
+static inline int
 _PyArrayNeighborhoodIter_IncrCoord(PyArrayNeighborhoodIterObject* iter)
 {
     npy_intp i, wb;
@@ -49,7 +49,7 @@ _PyArrayNeighborhoodIter_IncrCoord(PyArrayNeighborhoodIterObject* iter)
 /*
  * Version optimized for 2d arrays, manual loop unrolling
  */
-static NPY_INLINE int
+static inline int
 _PyArrayNeighborhoodIter_IncrCoord2D(PyArrayNeighborhoodIterObject* iter)
 {
     npy_intp wb;
@@ -64,7 +64,7 @@ _PyArrayNeighborhoodIter_IncrCoord2D(PyArrayNeighborhoodIterObject* iter)
 /*
  * Advance to the next neighbour
  */
-static NPY_INLINE int
+static inline int
 PyArrayNeighborhoodIter_Next(PyArrayNeighborhoodIterObject* iter)
 {
     _PyArrayNeighborhoodIter_IncrCoord (iter);
@@ -76,7 +76,7 @@ PyArrayNeighborhoodIter_Next(PyArrayNeighborhoodIterObject* iter)
 /*
  * Reset functions
  */
-static NPY_INLINE int
+static inline int
 PyArrayNeighborhoodIter_Reset(PyArrayNeighborhoodIterObject* iter)
 {
     npy_intp i;
diff --git a/numpy/core/include/numpy/_numpyconfig.h.in b/numpy/core/include/numpy/_numpyconfig.h.in
new file mode 100644
index 000000000..a21820026
--- /dev/null
+++ b/numpy/core/include/numpy/_numpyconfig.h.in
@@ -0,0 +1,32 @@
+#mesondefine NPY_HAVE_ENDIAN_H
+
+#mesondefine NPY_SIZEOF_SHORT
+#mesondefine NPY_SIZEOF_INT
+#mesondefine NPY_SIZEOF_LONG
+#mesondefine NPY_SIZEOF_FLOAT
+#mesondefine NPY_SIZEOF_COMPLEX_FLOAT
+#mesondefine NPY_SIZEOF_DOUBLE
+#mesondefine NPY_SIZEOF_COMPLEX_DOUBLE
+#mesondefine NPY_SIZEOF_LONGDOUBLE
+#mesondefine NPY_SIZEOF_COMPLEX_LONGDOUBLE
+#mesondefine NPY_SIZEOF_PY_INTPTR_T
+#mesondefine NPY_SIZEOF_OFF_T
+#mesondefine NPY_SIZEOF_PY_LONG_LONG
+#mesondefine NPY_SIZEOF_LONGLONG
+
+#mesondefine NPY_USE_C99_COMPLEX
+#mesondefine NPY_HAVE_COMPLEX_DOUBLE
+#mesondefine NPY_HAVE_COMPLEX_FLOAT
+#mesondefine NPY_HAVE_COMPLEX_LONG_DOUBLE
+#mesondefine NPY_USE_C99_FORMATS
+
+#mesondefine NPY_NO_SIGNAL
+#mesondefine NPY_NO_SMP
+
+#mesondefine NPY_VISIBILITY_HIDDEN
+#mesondefine NPY_ABI_VERSION
+#mesondefine NPY_API_VERSION
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS 1
+#endif
diff --git a/numpy/core/include/numpy/arrayscalars.h b/numpy/core/include/numpy/arrayscalars.h
index a20a68016..258bf95b6 100644
--- a/numpy/core/include/numpy/arrayscalars.h
+++ b/numpy/core/include/numpy/arrayscalars.h
@@ -139,7 +139,9 @@ typedef struct {
         /* note that the PyObject_HEAD macro lives right here */
         PyUnicodeObject base;
         Py_UCS4 *obval;
+    #if NPY_FEATURE_VERSION >= NPY_1_20_API_VERSION
         char *buffer_fmt;
+    #endif
 } PyUnicodeScalarObject;
 
 
@@ -149,7 +151,9 @@ typedef struct {
         PyArray_Descr *descr;
         int flags;
         PyObject *base;
+    #if NPY_FEATURE_VERSION >= NPY_1_20_API_VERSION
         void *_buffer_info;  /* private buffer info, tagged to allow warning */
+    #endif
 } PyVoidScalarObject;
 
 /* Macros
diff --git a/numpy/core/include/numpy/experimental_dtype_api.h b/numpy/core/include/numpy/experimental_dtype_api.h
index 1fbd41981..120d33324 100644
--- a/numpy/core/include/numpy/experimental_dtype_api.h
+++ b/numpy/core/include/numpy/experimental_dtype_api.h
@@ -3,6 +3,9 @@
  * NEPs 41 to 43.  For background, please check these NEPs.  Otherwise,
  * this header also serves as documentation for the time being.
  *
+ * The header includes `_dtype_api.h` which holds most definition while this
+ * header mainly wraps functions for public consumption.
+ *
  * Please do not hesitate to contact @seberg with questions.  This is
  * developed together with https://github.com/seberg/experimental_user_dtypes
  * and those interested in experimenting are encouraged to contribute there.
@@ -114,7 +117,13 @@
 
 #include <Python.h>
 #include "ndarraytypes.h"
+#include "_dtype_api.h"
 
+/*
+ * The contents of PyArrayMethodObject are currently opaque (is there a way
+ * good way to make them be `PyObject *`?)
+ */
+typedef struct PyArrayMethodObject_tag PyArrayMethodObject;
 
 /*
  * There must be a better way?! -- Oh well, this is experimental
@@ -154,66 +163,6 @@
 #endif
 
 
-/*
- * DTypeMeta struct, the content may be made fully opaque (except the size).
- * We may also move everything into a single `void *dt_slots`.
- */
-typedef struct {
-    PyHeapTypeObject super;
-    PyArray_Descr *singleton;
-    int type_num;
-    PyTypeObject *scalar_type;
-    npy_uint64 flags;
-    void *dt_slots;
-    void *reserved[3];
-} PyArray_DTypeMeta;
-
-
-/*
- * ******************************************************
- *         ArrayMethod API (Casting and UFuncs)
- * ******************************************************
- */
-/*
- * NOTE: Expected changes:
- *       * invert logic of floating point error flag
- *       * probably split runtime and general flags into two
- *       * should possibly not use an enum for typedef for more stable ABI?
- */
-typedef enum {
-    /* Flag for whether the GIL is required */
-    NPY_METH_REQUIRES_PYAPI = 1 << 1,
-    /*
-     * Some functions cannot set floating point error flags, this flag
-     * gives us the option (not requirement) to skip floating point error
-     * setup/check. No function should set error flags and ignore them
-     * since it would interfere with chaining operations (e.g. casting).
-     */
-    NPY_METH_NO_FLOATINGPOINT_ERRORS = 1 << 2,
-    /* Whether the method supports unaligned access (not runtime) */
-    NPY_METH_SUPPORTS_UNALIGNED = 1 << 3,
-
-    /* All flags which can change at runtime */
-    NPY_METH_RUNTIME_FLAGS = (
-            NPY_METH_REQUIRES_PYAPI |
-            NPY_METH_NO_FLOATINGPOINT_ERRORS),
-} NPY_ARRAYMETHOD_FLAGS;
-
-
-/*
- * The main object for creating a new ArrayMethod. We use the typical `slots`
- * mechanism used by the Python limited API (see below for the slot defs).
- */
-typedef struct {
-    const char *name;
-    int nin, nout;
-    NPY_CASTING casting;
-    NPY_ARRAYMETHOD_FLAGS flags;
-    PyArray_DTypeMeta **dtypes;
-    PyType_Slot *slots;
-} PyArrayMethod_Spec;
-
-
 typedef int _ufunc_addloop_fromspec_func(
         PyObject *ufunc, PyArrayMethod_Spec *spec);
 /*
@@ -267,92 +216,6 @@ typedef int _ufunc_addpromoter_func(
 #define PyUFunc_AddPromoter \
     (*(_ufunc_addpromoter_func *)(__experimental_dtype_api_table[1]))
 
-
-/*
- * The resolve descriptors function, must be able to handle NULL values for
- * all output (but not input) `given_descrs` and fill `loop_descrs`.
- * Return -1 on error or 0 if the operation is not possible without an error
- * set.  (This may still be in flux.)
- * Otherwise must return the "casting safety", for normal functions, this is
- * almost always "safe" (or even "equivalent"?).
- *
- * `resolve_descriptors` is optional if all output DTypes are non-parametric.
- */
-#define NPY_METH_resolve_descriptors 1
-typedef NPY_CASTING (resolve_descriptors_function)(
-        /* "method" is currently opaque (necessary e.g. to wrap Python) */
-        PyObject *method,
-        /* DTypes the method was created for */
-        PyObject **dtypes,
-        /* Input descriptors (instances).  Outputs may be NULL. */
-        PyArray_Descr **given_descrs,
-        /* Exact loop descriptors to use, must not hold references on error */
-        PyArray_Descr **loop_descrs,
-        npy_intp *view_offset);
-
-/* NOT public yet: Signature needs adapting as external API. */
-#define _NPY_METH_get_loop 2
-
-/*
- * Current public API to define fast inner-loops.  You must provide a
- * strided loop.  If this is a cast between two "versions" of the same dtype
- * you must also provide an unaligned strided loop.
- * Other loops are useful to optimize the very common contiguous case.
- *
- * NOTE: As of now, NumPy will NOT use unaligned loops in ufuncs!
- */
-#define NPY_METH_strided_loop 3
-#define NPY_METH_contiguous_loop 4
-#define NPY_METH_unaligned_strided_loop 5
-#define NPY_METH_unaligned_contiguous_loop 6
-
-
-typedef struct {
-    PyObject *caller;  /* E.g. the original ufunc, may be NULL */
-    PyObject *method;  /* The method "self". Currently an opaque object */
-
-    /* Operand descriptors, filled in by resolve_descriptors */
-    PyArray_Descr **descriptors;
-    /* Structure may grow (this is harmless for DType authors) */
-} PyArrayMethod_Context;
-
-typedef int (PyArrayMethod_StridedLoop)(PyArrayMethod_Context *context,
-        char *const *data, const npy_intp *dimensions, const npy_intp *strides,
-        NpyAuxData *transferdata);
-
-
-
-/*
- * ****************************
- *          DTYPE API
- * ****************************
- */
-
-#define NPY_DT_ABSTRACT 1 << 1
-#define NPY_DT_PARAMETRIC 1 << 2
-
-#define NPY_DT_discover_descr_from_pyobject 1
-#define _NPY_DT_is_known_scalar_type 2
-#define NPY_DT_default_descr 3
-#define NPY_DT_common_dtype 4
-#define NPY_DT_common_instance 5
-#define NPY_DT_ensure_canonical 6
-#define NPY_DT_setitem 7
-#define NPY_DT_getitem 8
-
-
-// TODO: These slots probably still need some thought, and/or a way to "grow"?
-typedef struct{
-    PyTypeObject *typeobj;    /* type of python scalar or NULL */
-    int flags;                /* flags, including parametric and abstract */
-    /* NULL terminated cast definitions. Use NULL for the newly created DType */
-    PyArrayMethod_Spec **casts;
-    PyType_Slot *slots;
-    /* Baseclass or NULL (will always subclass `np.dtype`) */
-    PyTypeObject *baseclass;
-} PyArrayDTypeMeta_Spec;
-
-
 #define PyArrayDTypeMeta_Type \
     (*(PyTypeObject *)__experimental_dtype_api_table[2])
 typedef int __dtypemeta_fromspec(
@@ -392,7 +255,7 @@ typedef PyArray_Descr *__get_default_descr(
 #define _PyArray_GetDefaultDescr \
     ((__get_default_descr *)(__experimental_dtype_api_table[6]))
 
-static NPY_INLINE PyArray_Descr *
+static inline PyArray_Descr *
 PyArray_GetDefaultDescr(PyArray_DTypeMeta *DType)
 {
     if (DType->singleton != NULL) {
@@ -458,16 +321,14 @@ PyArray_GetDefaultDescr(PyArray_DTypeMeta *DType)
  */
 #if !defined(NO_IMPORT) && !defined(NO_IMPORT_ARRAY)
 
-#define __EXPERIMENTAL_DTYPE_VERSION 5
-
 static int
 import_experimental_dtype_api(int version)
 {
-    if (version != __EXPERIMENTAL_DTYPE_VERSION) {
+    if (version != __EXPERIMENTAL_DTYPE_API_VERSION) {
         PyErr_Format(PyExc_RuntimeError,
                 "DType API version %d did not match header version %d. Please "
                 "update the import statement and check for API changes.",
-                version, __EXPERIMENTAL_DTYPE_VERSION);
+                version, __EXPERIMENTAL_DTYPE_API_VERSION);
         return -1;
     }
     if (__experimental_dtype_api_table != __uninitialized_table) {
diff --git a/numpy/core/include/numpy/ndarrayobject.h b/numpy/core/include/numpy/ndarrayobject.h
index aaaefd7de..36cfdd6f6 100644
--- a/numpy/core/include/numpy/ndarrayobject.h
+++ b/numpy/core/include/numpy/ndarrayobject.h
@@ -152,7 +152,7 @@ extern "C" {
                                             (k)*PyArray_STRIDES(obj)[2] + \
                                             (l)*PyArray_STRIDES(obj)[3]))
 
-static NPY_INLINE void
+static inline void
 PyArray_DiscardWritebackIfCopy(PyArrayObject *arr)
 {
     PyArrayObject_fields *fa = (PyArrayObject_fields *)arr;
@@ -214,7 +214,7 @@ PyArray_DiscardWritebackIfCopy(PyArrayObject *arr)
    dict).
 */
 
-static NPY_INLINE int
+static inline int
 NPY_TITLE_KEY_check(PyObject *key, PyObject *value)
 {
     PyObject *title;
diff --git a/numpy/core/include/numpy/ndarraytypes.h b/numpy/core/include/numpy/ndarraytypes.h
index e894b08f6..742ba5261 100644
--- a/numpy/core/include/numpy/ndarraytypes.h
+++ b/numpy/core/include/numpy/ndarraytypes.h
@@ -44,23 +44,6 @@
 #define NPY_FAIL 0
 #define NPY_SUCCEED 1
 
-/*
- * Binary compatibility version number.  This number is increased
- * whenever the C-API is changed such that binary compatibility is
- * broken, i.e. whenever a recompile of extension modules is needed.
- */
-#define NPY_VERSION NPY_ABI_VERSION
-
-/*
- * Minor API version.  This number is increased whenever a change is
- * made to the C-API -- whether it breaks binary compatibility or not.
- * Some changes, such as adding a function pointer to the end of the
- * function table, can be made without breaking binary compatibility.
- * In this case, only the NPY_FEATURE_VERSION (*not* NPY_VERSION)
- * would be increased.  Whenever binary compatibility is broken, both
- * NPY_VERSION and NPY_FEATURE_VERSION should be increased.
- */
-#define NPY_FEATURE_VERSION NPY_API_VERSION
 
 enum NPY_TYPES {    NPY_BOOL=0,
                     NPY_BYTE, NPY_UBYTE,
@@ -729,11 +712,15 @@ typedef struct tagPyArrayObject_fields {
     int flags;
     /* For weak references */
     PyObject *weakreflist;
+#if NPY_FEATURE_VERSION >= NPY_1_20_API_VERSION
     void *_buffer_info;  /* private buffer info, tagged to allow warning */
+#endif
     /*
      * For malloc/calloc/realloc/free per object
      */
+#if NPY_FEATURE_VERSION >= NPY_1_22_API_VERSION
     PyObject *mem_handler;
+#endif
 } PyArrayObject_fields;
 
 /*
@@ -1463,12 +1450,12 @@ typedef struct {
  */
 
 /* General: those work for any mode */
-static NPY_INLINE int
+static inline int
 PyArrayNeighborhoodIter_Reset(PyArrayNeighborhoodIterObject* iter);
-static NPY_INLINE int
+static inline int
 PyArrayNeighborhoodIter_Next(PyArrayNeighborhoodIterObject* iter);
 #if 0
-static NPY_INLINE int
+static inline int
 PyArrayNeighborhoodIter_Next2D(PyArrayNeighborhoodIterObject* iter);
 #endif
 
@@ -1514,85 +1501,85 @@ PyArrayNeighborhoodIter_Next2D(PyArrayNeighborhoodIterObject* iter);
  * ABI compatibility.
  */
 
-static NPY_INLINE int
+static inline int
 PyArray_NDIM(const PyArrayObject *arr)
 {
     return ((PyArrayObject_fields *)arr)->nd;
 }
 
-static NPY_INLINE void *
+static inline void *
 PyArray_DATA(PyArrayObject *arr)
 {
     return ((PyArrayObject_fields *)arr)->data;
 }
 
-static NPY_INLINE char *
+static inline char *
 PyArray_BYTES(PyArrayObject *arr)
 {
     return ((PyArrayObject_fields *)arr)->data;
 }
 
-static NPY_INLINE npy_intp *
+static inline npy_intp *
 PyArray_DIMS(PyArrayObject *arr)
 {
     return ((PyArrayObject_fields *)arr)->dimensions;
 }
 
-static NPY_INLINE npy_intp *
+static inline npy_intp *
 PyArray_STRIDES(PyArrayObject *arr)
 {
     return ((PyArrayObject_fields *)arr)->strides;
 }
 
-static NPY_INLINE npy_intp
+static inline npy_intp
 PyArray_DIM(const PyArrayObject *arr, int idim)
 {
     return ((PyArrayObject_fields *)arr)->dimensions[idim];
 }
 
-static NPY_INLINE npy_intp
+static inline npy_intp
 PyArray_STRIDE(const PyArrayObject *arr, int istride)
 {
     return ((PyArrayObject_fields *)arr)->strides[istride];
 }
 
-static NPY_INLINE NPY_RETURNS_BORROWED_REF PyObject *
+static inline NPY_RETURNS_BORROWED_REF PyObject *
 PyArray_BASE(PyArrayObject *arr)
 {
     return ((PyArrayObject_fields *)arr)->base;
 }
 
-static NPY_INLINE NPY_RETURNS_BORROWED_REF PyArray_Descr *
+static inline NPY_RETURNS_BORROWED_REF PyArray_Descr *
 PyArray_DESCR(PyArrayObject *arr)
 {
     return ((PyArrayObject_fields *)arr)->descr;
 }
 
-static NPY_INLINE int
+static inline int
 PyArray_FLAGS(const PyArrayObject *arr)
 {
     return ((PyArrayObject_fields *)arr)->flags;
 }
 
-static NPY_INLINE npy_intp
+static inline npy_intp
 PyArray_ITEMSIZE(const PyArrayObject *arr)
 {
     return ((PyArrayObject_fields *)arr)->descr->elsize;
 }
 
-static NPY_INLINE int
+static inline int
 PyArray_TYPE(const PyArrayObject *arr)
 {
     return ((PyArrayObject_fields *)arr)->descr->type_num;
 }
 
-static NPY_INLINE int
+static inline int
 PyArray_CHKFLAGS(const PyArrayObject *arr, int flags)
 {
     return (PyArray_FLAGS(arr) & flags) == flags;
 }
 
-static NPY_INLINE PyObject *
+static inline PyObject *
 PyArray_GETITEM(const PyArrayObject *arr, const char *itemptr)
 {
     return ((PyArrayObject_fields *)arr)->descr->f->getitem(
@@ -1604,7 +1591,7 @@ PyArray_GETITEM(const PyArrayObject *arr, const char *itemptr)
  * and of a type understood by the arrays dtype.
  * Use `PyArray_Pack` if the value may be of a different dtype.
  */
-static NPY_INLINE int
+static inline int
 PyArray_SETITEM(PyArrayObject *arr, char *itemptr, PyObject *v)
 {
     return ((PyArrayObject_fields *)arr)->descr->f->setitem(v, itemptr, arr);
@@ -1639,13 +1626,13 @@ PyArray_SETITEM(PyArrayObject *arr, char *itemptr, PyObject *v)
                                      (PyArrayObject *)(obj))
 #endif
 
-static NPY_INLINE PyArray_Descr *
+static inline PyArray_Descr *
 PyArray_DTYPE(PyArrayObject *arr)
 {
     return ((PyArrayObject_fields *)arr)->descr;
 }
 
-static NPY_INLINE npy_intp *
+static inline npy_intp *
 PyArray_SHAPE(PyArrayObject *arr)
 {
     return ((PyArrayObject_fields *)arr)->dimensions;
@@ -1655,7 +1642,7 @@ PyArray_SHAPE(PyArrayObject *arr)
  * Enables the specified array flags. Does no checking,
  * assumes you know what you're doing.
  */
-static NPY_INLINE void
+static inline void
 PyArray_ENABLEFLAGS(PyArrayObject *arr, int flags)
 {
     ((PyArrayObject_fields *)arr)->flags |= flags;
@@ -1665,17 +1652,19 @@ PyArray_ENABLEFLAGS(PyArrayObject *arr, int flags)
  * Clears the specified array flags. Does no checking,
  * assumes you know what you're doing.
  */
-static NPY_INLINE void
+static inline void
 PyArray_CLEARFLAGS(PyArrayObject *arr, int flags)
 {
     ((PyArrayObject_fields *)arr)->flags &= ~flags;
 }
 
-static NPY_INLINE NPY_RETURNS_BORROWED_REF PyObject *
-PyArray_HANDLER(PyArrayObject *arr)
-{
-    return ((PyArrayObject_fields *)arr)->mem_handler;
-}
+#if NPY_FEATURE_VERSION >= NPY_1_22_API_VERSION
+    static inline NPY_RETURNS_BORROWED_REF PyObject *
+    PyArray_HANDLER(PyArrayObject *arr)
+    {
+        return ((PyArrayObject_fields *)arr)->mem_handler;
+    }
+#endif
 
 #define PyTypeNum_ISBOOL(type) ((type) == NPY_BOOL)
 
diff --git a/numpy/core/include/numpy/npy_3kcompat.h b/numpy/core/include/numpy/npy_3kcompat.h
index 11cc47765..62fde943a 100644
--- a/numpy/core/include/numpy/npy_3kcompat.h
+++ b/numpy/core/include/numpy/npy_3kcompat.h
@@ -36,7 +36,7 @@ extern "C" {
  * included here because it is missing from the PyPy API. It completes the PyLong_As*
  * group of functions and can be useful in replacing PyInt_Check.
  */
-static NPY_INLINE int
+static inline int
 Npy__PyLong_AsInt(PyObject *obj)
 {
     int overflow;
@@ -56,7 +56,7 @@ Npy__PyLong_AsInt(PyObject *obj)
 
 #if defined(NPY_PY3K)
 /* Return True only if the long fits in a C long */
-static NPY_INLINE int PyInt_Check(PyObject *op) {
+static inline int PyInt_Check(PyObject *op) {
     int overflow = 0;
     if (!PyLong_Check(op)) {
         return 0;
@@ -99,32 +99,6 @@ static NPY_INLINE int PyInt_Check(PyObject *op) {
 
 #define Npy_EnterRecursiveCall(x) Py_EnterRecursiveCall(x)
 
-/* Py_SETREF was added in 3.5.2, and only if Py_LIMITED_API is absent */
-#if PY_VERSION_HEX < 0x03050200
-    #define Py_SETREF(op, op2)                      \
-        do {                                        \
-            PyObject *_py_tmp = (PyObject *)(op);   \
-            (op) = (op2);                           \
-            Py_DECREF(_py_tmp);                     \
-        } while (0)
-#endif
-
-/* introduced in https://github.com/python/cpython/commit/a24107b04c1277e3c1105f98aff5bfa3a98b33a0 */
-#if PY_VERSION_HEX < 0x030800A3
-    static NPY_INLINE PyObject *
-    _PyDict_GetItemStringWithError(PyObject *v, const char *key)
-    {
-        PyObject *kv, *rv;
-        kv = PyUnicode_FromString(key);
-        if (kv == NULL) {
-            return NULL;
-        }
-        rv = PyDict_GetItemWithError(v, kv);
-        Py_DECREF(kv);
-        return rv;
-    }
-#endif
-
 /*
  * PyString -> PyBytes
  */
@@ -193,15 +167,35 @@ static NPY_INLINE int PyInt_Check(PyObject *op) {
 
 #endif /* NPY_PY3K */
 
+/*
+ * Macros to protect CRT calls against instant termination when passed an
+ * invalid parameter (https://bugs.python.org/issue23524).
+ */
+#if defined _MSC_VER && _MSC_VER >= 1900
+
+#include <stdlib.h>
+
+extern _invalid_parameter_handler _Py_silent_invalid_parameter_handler;
+#define NPY_BEGIN_SUPPRESS_IPH { _invalid_parameter_handler _Py_old_handler = \
+    _set_thread_local_invalid_parameter_handler(_Py_silent_invalid_parameter_handler);
+#define NPY_END_SUPPRESS_IPH _set_thread_local_invalid_parameter_handler(_Py_old_handler); }
+
+#else
+
+#define NPY_BEGIN_SUPPRESS_IPH
+#define NPY_END_SUPPRESS_IPH
+
+#endif /* _MSC_VER >= 1900 */
+
 
-static NPY_INLINE void
+static inline void
 PyUnicode_ConcatAndDel(PyObject **left, PyObject *right)
 {
     Py_SETREF(*left, PyUnicode_Concat(*left, right));
     Py_DECREF(right);
 }
 
-static NPY_INLINE void
+static inline void
 PyUnicode_Concat2(PyObject **left, PyObject *right)
 {
     Py_SETREF(*left, PyUnicode_Concat(*left, right));
@@ -214,7 +208,7 @@ PyUnicode_Concat2(PyObject **left, PyObject *right)
 /*
  * Get a FILE* handle to the file represented by the Python object
  */
-static NPY_INLINE FILE*
+static inline FILE*
 npy_PyFile_Dup2(PyObject *file, char *mode, npy_off_t *orig_pos)
 {
     int fd, fd2, unbuf;
@@ -268,13 +262,17 @@ npy_PyFile_Dup2(PyObject *file, char *mode, npy_off_t *orig_pos)
 
     /* Convert to FILE* handle */
 #ifdef _WIN32
+    NPY_BEGIN_SUPPRESS_IPH
     handle = _fdopen(fd2, mode);
+    NPY_END_SUPPRESS_IPH
 #else
     handle = fdopen(fd2, mode);
 #endif
     if (handle == NULL) {
         PyErr_SetString(PyExc_IOError,
-                        "Getting a FILE* from a Python file object failed");
+                        "Getting a FILE* from a Python file object via "
+                        "_fdopen failed. If you built NumPy, you probably "
+                        "linked with the wrong debug/release runtime");
         return NULL;
     }
 
@@ -330,7 +328,7 @@ npy_PyFile_Dup2(PyObject *file, char *mode, npy_off_t *orig_pos)
 /*
  * Close the dup-ed file handle, and seek the Python one to the current position
  */
-static NPY_INLINE int
+static inline int
 npy_PyFile_DupClose2(PyObject *file, FILE* handle, npy_off_t orig_pos)
 {
     int fd, unbuf;
@@ -397,7 +395,7 @@ npy_PyFile_DupClose2(PyObject *file, FILE* handle, npy_off_t orig_pos)
     return 0;
 }
 
-static NPY_INLINE int
+static inline int
 npy_PyFile_Check(PyObject *file)
 {
     int fd;
@@ -415,7 +413,7 @@ npy_PyFile_Check(PyObject *file)
     return 1;
 }
 
-static NPY_INLINE PyObject*
+static inline PyObject*
 npy_PyFile_OpenFile(PyObject *filename, const char *mode)
 {
     PyObject *open;
@@ -426,7 +424,7 @@ npy_PyFile_OpenFile(PyObject *filename, const char *mode)
     return PyObject_CallFunction(open, "Os", filename, mode);
 }
 
-static NPY_INLINE int
+static inline int
 npy_PyFile_CloseFile(PyObject *file)
 {
     PyObject *ret;
@@ -442,7 +440,7 @@ npy_PyFile_CloseFile(PyObject *file)
 
 /* This is a copy of _PyErr_ChainExceptions
  */
-static NPY_INLINE void
+static inline void
 npy_PyErr_ChainExceptions(PyObject *exc, PyObject *val, PyObject *tb)
 {
     if (exc == NULL)
@@ -474,7 +472,7 @@ npy_PyErr_ChainExceptions(PyObject *exc, PyObject *val, PyObject *tb)
  *  - a minimal implementation for python 2
  *  - __cause__ used instead of __context__
  */
-static NPY_INLINE void
+static inline void
 npy_PyErr_ChainExceptionsCause(PyObject *exc, PyObject *val, PyObject *tb)
 {
     if (exc == NULL)
@@ -505,7 +503,7 @@ npy_PyErr_ChainExceptionsCause(PyObject *exc, PyObject *val, PyObject *tb)
  * PyObject_Cmp
  */
 #if defined(NPY_PY3K)
-static NPY_INLINE int
+static inline int
 PyObject_Cmp(PyObject *i1, PyObject *i2, int *cmp)
 {
     int v;
@@ -545,7 +543,7 @@ PyObject_Cmp(PyObject *i1, PyObject *i2, int *cmp)
  * The main job here is to get rid of the improved error handling
  * of PyCapsules. It's a shame...
  */
-static NPY_INLINE PyObject *
+static inline PyObject *
 NpyCapsule_FromVoidPtr(void *ptr, void (*dtor)(PyObject *))
 {
     PyObject *ret = PyCapsule_New(ptr, NULL, dtor);
@@ -555,7 +553,7 @@ NpyCapsule_FromVoidPtr(void *ptr, void (*dtor)(PyObject *))
     return ret;
 }
 
-static NPY_INLINE PyObject *
+static inline PyObject *
 NpyCapsule_FromVoidPtrAndDesc(void *ptr, void* context, void (*dtor)(PyObject *))
 {
     PyObject *ret = NpyCapsule_FromVoidPtr(ptr, dtor);
@@ -567,7 +565,7 @@ NpyCapsule_FromVoidPtrAndDesc(void *ptr, void* context, void (*dtor)(PyObject *)
     return ret;
 }
 
-static NPY_INLINE void *
+static inline void *
 NpyCapsule_AsVoidPtr(PyObject *obj)
 {
     void *ret = PyCapsule_GetPointer(obj, NULL);
@@ -577,13 +575,13 @@ NpyCapsule_AsVoidPtr(PyObject *obj)
     return ret;
 }
 
-static NPY_INLINE void *
+static inline void *
 NpyCapsule_GetDesc(PyObject *obj)
 {
     return PyCapsule_GetContext(obj);
 }
 
-static NPY_INLINE int
+static inline int
 NpyCapsule_Check(PyObject *ptr)
 {
     return PyCapsule_CheckExact(ptr);
diff --git a/numpy/core/include/numpy/npy_common.h b/numpy/core/include/numpy/npy_common.h
index 2bcc45e4f..728cedbf1 100644
--- a/numpy/core/include/numpy/npy_common.h
+++ b/numpy/core/include/numpy/npy_common.h
@@ -40,39 +40,6 @@
 #define NPY_GCC_OPT_3
 #endif
 
-/* compile target attributes */
-#if defined HAVE_ATTRIBUTE_TARGET_AVX && defined HAVE_LINK_AVX
-#define NPY_GCC_TARGET_AVX __attribute__((target("avx")))
-#else
-#define NPY_GCC_TARGET_AVX
-#endif
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS
-#define HAVE_ATTRIBUTE_TARGET_FMA
-#define NPY_GCC_TARGET_FMA __attribute__((target("avx2,fma")))
-#endif
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX2 && defined HAVE_LINK_AVX2
-#define NPY_GCC_TARGET_AVX2 __attribute__((target("avx2")))
-#else
-#define NPY_GCC_TARGET_AVX2
-#endif
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512F && defined HAVE_LINK_AVX512F
-#define NPY_GCC_TARGET_AVX512F __attribute__((target("avx512f")))
-#elif defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS
-#define NPY_GCC_TARGET_AVX512F __attribute__((target("avx512f")))
-#else
-#define NPY_GCC_TARGET_AVX512F
-#endif
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX && defined HAVE_LINK_AVX512_SKX
-#define NPY_GCC_TARGET_AVX512_SKX __attribute__((target("avx512f,avx512dq,avx512vl,avx512bw,avx512cd")))
-#elif defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS
-#define NPY_GCC_TARGET_AVX512_SKX __attribute__((target("avx512f,avx512dq,avx512vl,avx512bw,avx512cd")))
-#else
-#define NPY_GCC_TARGET_AVX512_SKX
-#endif
 /*
  * mark an argument (starting from 1) that must not be NULL and is not checked
  * DO NOT USE IF FUNCTION CHECKS FOR NULL!! the compiler will remove the check
@@ -83,21 +50,6 @@
 #define NPY_GCC_NONNULL(n)
 #endif
 
-#if defined HAVE_XMMINTRIN_H && defined HAVE__MM_LOAD_PS
-#define NPY_HAVE_SSE_INTRINSICS
-#endif
-
-#if defined HAVE_EMMINTRIN_H && defined HAVE__MM_LOAD_PD
-#define NPY_HAVE_SSE2_INTRINSICS
-#endif
-
-#if defined HAVE_IMMINTRIN_H && defined HAVE_LINK_AVX2
-#define NPY_HAVE_AVX2_INTRINSICS
-#endif
-
-#if defined HAVE_IMMINTRIN_H && defined HAVE_LINK_AVX512F
-#define NPY_HAVE_AVX512F_INTRINSICS
-#endif
 /*
  * give a hint to the compiler which branch is more likely or unlikely
  * to occur, e.g. rare error cases:
@@ -120,7 +72,7 @@
 /* unlike _mm_prefetch also works on non-x86 */
 #define NPY_PREFETCH(x, rw, loc) __builtin_prefetch((x), (rw), (loc))
 #else
-#ifdef HAVE__MM_PREFETCH
+#ifdef NPY_HAVE_SSE
 /* _MM_HINT_ET[01] (rw = 1) unsupported, only available in gcc >= 4.9 */
 #define NPY_PREFETCH(x, rw, loc) _mm_prefetch((x), loc == 0 ? _MM_HINT_NTA : \
                                              (loc == 1 ? _MM_HINT_T2 : \
@@ -131,6 +83,7 @@
 #endif
 #endif
 
+/* `NPY_INLINE` kept for backwards compatibility; use `inline` instead */
 #if defined(_MSC_VER) && !defined(__clang__)
     #define NPY_INLINE __inline
 /* clang included here to handle clang-cl on Windows */
@@ -147,11 +100,19 @@
 #ifdef _MSC_VER
     #define NPY_FINLINE static __forceinline
 #elif defined(__GNUC__)
-    #define NPY_FINLINE static NPY_INLINE __attribute__((always_inline))
+    #define NPY_FINLINE static inline __attribute__((always_inline))
 #else
     #define NPY_FINLINE static
 #endif
 
+#if defined(_MSC_VER)
+    #define NPY_NOINLINE static __declspec(noinline)
+#elif defined(__GNUC__) || defined(__clang__)
+    #define NPY_NOINLINE static __attribute__((noinline))
+#else
+    #define NPY_NOINLINE static
+#endif
+
 #ifdef HAVE___THREAD
     #define NPY_TLS __thread
 #else
diff --git a/numpy/core/include/numpy/npy_cpu.h b/numpy/core/include/numpy/npy_cpu.h
index 78d229e7d..a19f8e6bb 100644
--- a/numpy/core/include/numpy/npy_cpu.h
+++ b/numpy/core/include/numpy/npy_cpu.h
@@ -77,7 +77,7 @@
     #elif defined(__ARMEL__) || defined(__AARCH64EL__) || defined(_M_ARM64)
         #if defined(__ARM_32BIT_STATE)
             #define NPY_CPU_ARMEL_AARCH32
-        #elif defined(__ARM_64BIT_STATE) || defined(_M_ARM64)
+        #elif defined(__ARM_64BIT_STATE) || defined(_M_ARM64) || defined(__AARCH64EL__)
             #define NPY_CPU_ARMEL_AARCH64
         #else
             #define NPY_CPU_ARMEL
diff --git a/numpy/core/include/numpy/npy_math.h b/numpy/core/include/numpy/npy_math.h
index 27e1ddad0..2fcd41eb0 100644
--- a/numpy/core/include/numpy/npy_math.h
+++ b/numpy/core/include/numpy/npy_math.h
@@ -8,7 +8,7 @@
 /* By adding static inline specifiers to npy_math function definitions when
    appropriate, compiler is given the opportunity to optimize */
 #if NPY_INLINE_MATH
-#define NPY_INPLACE NPY_INLINE static
+#define NPY_INPLACE static inline
 #else
 #define NPY_INPLACE
 #endif
@@ -24,25 +24,25 @@ extern "C" {
  *
  * XXX: I should test whether INFINITY and NAN are available on the platform
  */
-NPY_INLINE static float __npy_inff(void)
+static inline float __npy_inff(void)
 {
     const union { npy_uint32 __i; float __f;} __bint = {0x7f800000UL};
     return __bint.__f;
 }
 
-NPY_INLINE static float __npy_nanf(void)
+static inline float __npy_nanf(void)
 {
     const union { npy_uint32 __i; float __f;} __bint = {0x7fc00000UL};
     return __bint.__f;
 }
 
-NPY_INLINE static float __npy_pzerof(void)
+static inline float __npy_pzerof(void)
 {
     const union { npy_uint32 __i; float __f;} __bint = {0x00000000UL};
     return __bint.__f;
 }
 
-NPY_INLINE static float __npy_nzerof(void)
+static inline float __npy_nzerof(void)
 {
     const union { npy_uint32 __i; float __f;} __bint = {0x80000000UL};
     return __bint.__f;
@@ -198,12 +198,12 @@ NPY_INPLACE double npy_atan2(double x, double y);
 #define npy_sqrt sqrt
 #define npy_pow pow
 #define npy_modf modf
+#define npy_nextafter nextafter
 
-double npy_nextafter(double x, double y);
 double npy_spacing(double x);
 
 /*
- * IEEE 754 fpu handling. Those are guaranteed to be macros
+ * IEEE 754 fpu handling
  */
 
 /* use builtins to avoid function calls in tight loops
@@ -211,11 +211,7 @@ double npy_spacing(double x);
 #ifdef HAVE___BUILTIN_ISNAN
     #define npy_isnan(x) __builtin_isnan(x)
 #else
-    #ifndef NPY_HAVE_DECL_ISNAN
-        #define npy_isnan(x) ((x) != (x))
-    #else
-        #define npy_isnan(x) isnan(x)
-    #endif
+    #define npy_isnan(x) isnan(x)
 #endif
 
 
@@ -223,39 +219,17 @@ double npy_spacing(double x);
 #ifdef HAVE___BUILTIN_ISFINITE
     #define npy_isfinite(x) __builtin_isfinite(x)
 #else
-    #ifndef NPY_HAVE_DECL_ISFINITE
-        #ifdef _MSC_VER
-            #define npy_isfinite(x) _finite((x))
-        #else
-            #define npy_isfinite(x) !npy_isnan((x) + (-x))
-        #endif
-    #else
-        #define npy_isfinite(x) isfinite((x))
-    #endif
+    #define npy_isfinite(x) isfinite((x))
 #endif
 
 /* only available if npy_config.h is available (= numpys own build) */
 #ifdef HAVE___BUILTIN_ISINF
     #define npy_isinf(x) __builtin_isinf(x)
 #else
-    #ifndef NPY_HAVE_DECL_ISINF
-        #define npy_isinf(x) (!npy_isfinite(x) && !npy_isnan(x))
-    #else
-        #define npy_isinf(x) isinf((x))
-    #endif
+    #define npy_isinf(x) isinf((x))
 #endif
 
-#ifndef NPY_HAVE_DECL_SIGNBIT
-    int _npy_signbit_f(float x);
-    int _npy_signbit_d(double x);
-    int _npy_signbit_ld(long double x);
-    #define npy_signbit(x) \
-        (sizeof (x) == sizeof (long double) ? _npy_signbit_ld (x) \
-         : sizeof (x) == sizeof (double) ? _npy_signbit_d (x) \
-         : _npy_signbit_f (x))
-#else
-    #define npy_signbit(x) signbit((x))
-#endif
+#define npy_signbit(x) signbit((x))
 
 /*
  * float C99 math funcs that need fixups or are blocklist-able
@@ -298,8 +272,8 @@ NPY_INPLACE float npy_modff(float x, float* y);
 #define npy_frexpf frexpf
 #define npy_ldexpf ldexpf
 #define npy_copysignf copysignf
+#define npy_nextafterf nextafterf
 
-float npy_nextafterf(float x, float y);
 float npy_spacingf(float x);
 
 /*
@@ -342,8 +316,8 @@ NPY_INPLACE npy_longdouble npy_modfl(npy_longdouble x, npy_longdouble* y);
 #define npy_frexpl frexpl
 #define npy_ldexpl ldexpl
 #define npy_copysignl copysignl
+#define npy_nextafterl nextafterl
 
-npy_longdouble npy_nextafterl(npy_longdouble x, npy_longdouble y);
 npy_longdouble npy_spacingl(npy_longdouble x);
 
 /*
@@ -399,17 +373,17 @@ NPY_INPLACE npy_longdouble npy_heavisidel(npy_longdouble x, npy_longdouble h0);
                                              \
     return z1.z;
 
-static NPY_INLINE npy_cdouble npy_cpack(double x, double y)
+static inline npy_cdouble npy_cpack(double x, double y)
 {
     __NPY_CPACK_IMP(x, y, double, npy_cdouble);
 }
 
-static NPY_INLINE npy_cfloat npy_cpackf(float x, float y)
+static inline npy_cfloat npy_cpackf(float x, float y)
 {
     __NPY_CPACK_IMP(x, y, float, npy_cfloat);
 }
 
-static NPY_INLINE npy_clongdouble npy_cpackl(npy_longdouble x, npy_longdouble y)
+static inline npy_clongdouble npy_cpackl(npy_longdouble x, npy_longdouble y)
 {
     __NPY_CPACK_IMP(x, y, npy_longdouble, npy_clongdouble);
 }
@@ -431,32 +405,32 @@ static NPY_INLINE npy_clongdouble npy_cpackl(npy_longdouble x, npy_longdouble y)
                                                     \
     return __z_repr.a[index];
 
-static NPY_INLINE double npy_creal(npy_cdouble z)
+static inline double npy_creal(npy_cdouble z)
 {
     __NPY_CEXTRACT_IMP(z, 0, double, npy_cdouble);
 }
 
-static NPY_INLINE double npy_cimag(npy_cdouble z)
+static inline double npy_cimag(npy_cdouble z)
 {
     __NPY_CEXTRACT_IMP(z, 1, double, npy_cdouble);
 }
 
-static NPY_INLINE float npy_crealf(npy_cfloat z)
+static inline float npy_crealf(npy_cfloat z)
 {
     __NPY_CEXTRACT_IMP(z, 0, float, npy_cfloat);
 }
 
-static NPY_INLINE float npy_cimagf(npy_cfloat z)
+static inline float npy_cimagf(npy_cfloat z)
 {
     __NPY_CEXTRACT_IMP(z, 1, float, npy_cfloat);
 }
 
-static NPY_INLINE npy_longdouble npy_creall(npy_clongdouble z)
+static inline npy_longdouble npy_creall(npy_clongdouble z)
 {
     __NPY_CEXTRACT_IMP(z, 0, npy_longdouble, npy_clongdouble);
 }
 
-static NPY_INLINE npy_longdouble npy_cimagl(npy_clongdouble z)
+static inline npy_longdouble npy_cimagl(npy_clongdouble z)
 {
     __NPY_CEXTRACT_IMP(z, 1, npy_longdouble, npy_clongdouble);
 }
diff --git a/numpy/core/include/numpy/numpyconfig.h b/numpy/core/include/numpy/numpyconfig.h
index 164fcbdbe..1c25aa5fc 100644
--- a/numpy/core/include/numpy/numpyconfig.h
+++ b/numpy/core/include/numpy/numpyconfig.h
@@ -6,7 +6,13 @@
 /*
  * On Mac OS X, because there is only one configuration stage for all the archs
  * in universal builds, any macro which depends on the arch needs to be
- * hardcoded
+ * hardcoded.
+ *
+ * Note that distutils/pip will attempt a universal2 build when Python itself
+ * is built as universal2, hence this hardcoding is needed even if we do not
+ * support universal2 wheels anymore (see gh-22796).
+ * This code block can be removed after we have dropped the setup.py based
+ * build completely.
  */
 #ifdef __APPLE__
     #undef NPY_SIZEOF_LONG
@@ -48,30 +54,85 @@
     #endif
 #endif
 
+
 /**
- * To help with the NPY_NO_DEPRECATED_API macro, we include API version
- * numbers for specific versions of NumPy. To exclude all API that was
- * deprecated as of 1.7, add the following before #including any NumPy
- * headers:
+ * To help with both NPY_TARGET_VERSION and the NPY_NO_DEPRECATED_API macro,
+ * we include API version numbers for specific versions of NumPy.
+ * To exclude all API that was deprecated as of 1.7, add the following before
+ * #including any NumPy headers:
  *   #define NPY_NO_DEPRECATED_API  NPY_1_7_API_VERSION
+ * The same is true for NPY_TARGET_VERSION, although NumPy will default to
+ * a backwards compatible build anyway.
  */
 #define NPY_1_7_API_VERSION 0x00000007
 #define NPY_1_8_API_VERSION 0x00000008
-#define NPY_1_9_API_VERSION 0x00000008
-#define NPY_1_10_API_VERSION 0x00000008
-#define NPY_1_11_API_VERSION 0x00000008
-#define NPY_1_12_API_VERSION 0x00000008
-#define NPY_1_13_API_VERSION 0x00000008
-#define NPY_1_14_API_VERSION 0x00000008
-#define NPY_1_15_API_VERSION 0x00000008
-#define NPY_1_16_API_VERSION 0x00000008
-#define NPY_1_17_API_VERSION 0x00000008
-#define NPY_1_18_API_VERSION 0x00000008
-#define NPY_1_19_API_VERSION 0x00000008
+#define NPY_1_9_API_VERSION 0x00000009
+#define NPY_1_10_API_VERSION 0x0000000a
+#define NPY_1_11_API_VERSION 0x0000000a
+#define NPY_1_12_API_VERSION 0x0000000a
+#define NPY_1_13_API_VERSION 0x0000000b
+#define NPY_1_14_API_VERSION 0x0000000c
+#define NPY_1_15_API_VERSION 0x0000000c
+#define NPY_1_16_API_VERSION 0x0000000d
+#define NPY_1_17_API_VERSION 0x0000000d
+#define NPY_1_18_API_VERSION 0x0000000d
+#define NPY_1_19_API_VERSION 0x0000000d
 #define NPY_1_20_API_VERSION 0x0000000e
 #define NPY_1_21_API_VERSION 0x0000000e
 #define NPY_1_22_API_VERSION 0x0000000f
 #define NPY_1_23_API_VERSION 0x00000010
 #define NPY_1_24_API_VERSION 0x00000010
+#define NPY_1_25_API_VERSION 0x00000011
+
+
+/*
+ * Binary compatibility version number.  This number is increased
+ * whenever the C-API is changed such that binary compatibility is
+ * broken, i.e. whenever a recompile of extension modules is needed.
+ */
+#define NPY_VERSION NPY_ABI_VERSION
+
+/*
+ * Minor API version we are compiling to be compatible with.  The version
+ * Number is always increased when the API changes via: `NPY_API_VERSION`
+ * (and should maybe just track the NumPy version).
+ *
+ * If we have an internal build, we always target the current version of
+ * course.
+ *
+ * For downstream users, we default to an older version to provide them with
+ * maximum compatibility by default.  Downstream can choose to extend that
+ * default, or narrow it down if they wish to use newer API.  If you adjust
+ * this, consider the Python version support (example for 1.25.x):
+ *
+ * NumPy 1.25.x supports Python:                     3.9  3.10  3.11  (3.12)
+ * NumPy 1.19.x supports Python:      3.6  3.7  3.8  3.9
+ * NumPy 1.17.x supports Python: 3.5  3.6  3.7  3.8
+ * NumPy 1.15.x supports Python: ...  3.6  3.7
+ *
+ * Users of the stable ABI may wish to target the last Python that is not
+ * end of life.  This would be 3.8 at NumPy 1.25 release time.
+ * 1.17 as default was the choice of oldest-support-numpy at the time and
+ * has in practice no limit (comapared to 1.19).  Even earlier becomes legacy.
+ */
+#if defined(NPY_INTERNAL_BUILD) && NPY_INTERNAL_BUILD
+    /* NumPy internal build, always use current version. */
+    #define NPY_FEATURE_VERSION NPY_API_VERSION
+#elif defined(NPY_TARGET_VERSION) && NPY_TARGET_VERSION
+    /* user provided a target version, use it */
+    #define NPY_FEATURE_VERSION NPY_TARGET_VERSION
+#else
+    /* Use the default (increase when dropping Python 3.9 support) */
+    #define NPY_FEATURE_VERSION NPY_1_19_API_VERSION
+#endif
+
+/* Sanity check the (requested) feature version */
+#if NPY_FEATURE_VERSION > NPY_API_VERSION
+    #error "NPY_TARGET_VERSION higher than NumPy headers!"
+#elif NPY_FEATURE_VERSION < NPY_1_15_API_VERSION
+    /* No support for irrelevant old targets, no need for error, but warn. */
+    #warning "Requested NumPy target lower than supported NumPy 1.15."
+#endif
+
 
 #endif  /* NUMPY_CORE_INCLUDE_NUMPY_NPY_NUMPYCONFIG_H_ */
diff --git a/numpy/core/include/numpy/random/distributions.h b/numpy/core/include/numpy/random/distributions.h
index 78bd06ff5..e7fa4bd00 100644
--- a/numpy/core/include/numpy/random/distributions.h
+++ b/numpy/core/include/numpy/random/distributions.h
@@ -198,7 +198,7 @@ RAND_INT_TYPE random_binomial_inversion(bitgen_t *bitgen_state,
                                         double p,
                                         binomial_t *binomial);
 double random_loggam(double x);
-static NPY_INLINE double next_double(bitgen_t *bitgen_state) {
+static inline double next_double(bitgen_t *bitgen_state) {
     return bitgen_state->next_double(bitgen_state->state);
 }
 
diff --git a/numpy/core/include/numpy/ufuncobject.h b/numpy/core/include/numpy/ufuncobject.h
index bb0633100..9e00f2e56 100644
--- a/numpy/core/include/numpy/ufuncobject.h
+++ b/numpy/core/include/numpy/ufuncobject.h
@@ -197,7 +197,7 @@ typedef struct _tagPyUFuncObject {
         npy_uint32 iter_flags;
 
         /* New in NPY_API_VERSION 0x0000000D and above */
-
+    #if NPY_FEATURE_VERSION >= NPY_1_16_API_VERSION
         /*
          * for each core_num_dim_ix distinct dimension names,
          * the possible "frozen" size (-1 if not frozen).
@@ -211,13 +211,15 @@ typedef struct _tagPyUFuncObject {
 
         /* Identity for reduction, when identity == PyUFunc_IdentityValue */
         PyObject *identity_value;
+    #endif  /* NPY_FEATURE_VERSION >= NPY_1_16_API_VERSION */
 
         /* New in NPY_API_VERSION 0x0000000F and above */
-
+    #if NPY_FEATURE_VERSION >= NPY_1_22_API_VERSION
         /* New private fields related to dispatching */
         void *_dispatch_cache;
         /* A PyListObject of `(tuple of DTypes, ArrayMethod/Promoter)` */
         PyObject *_loops;
+    #endif
 } PyUFuncObject;
 
 #include "arrayobject.h"
diff --git a/numpy/core/memmap.py b/numpy/core/memmap.py
index b0d9cb3af..79c695455 100644
--- a/numpy/core/memmap.py
+++ b/numpy/core/memmap.py
@@ -1,9 +1,9 @@
 from contextlib import nullcontext
 
 import numpy as np
+from .._utils import set_module
 from .numeric import uint8, ndarray, dtype
 from numpy.compat import os_fspath, is_pathlib_path
-from numpy.core.overrides import set_module
 
 __all__ = ['memmap']
 
@@ -59,6 +59,7 @@ class memmap(ndarray):
         | 'r+' | Open existing file for reading and writing.                 |
         +------+-------------------------------------------------------------+
         | 'w+' | Create or overwrite existing file for reading and writing.  |
+        |      | If ``mode == 'w+'`` then `shape` must also be specified.    |
         +------+-------------------------------------------------------------+
         | 'c'  | Copy-on-write: assignments affect data in memory, but       |
         |      | changes are not saved to disk.  The file on disk is         |
@@ -220,7 +221,7 @@ class memmap(ndarray):
                 ) from None
 
         if mode == 'w+' and shape is None:
-            raise ValueError("shape must be given")
+            raise ValueError("shape must be given if mode == 'w+'")
 
         if hasattr(filename, 'read'):
             f_ctx = nullcontext(filename)
diff --git a/numpy/core/meson.build b/numpy/core/meson.build
new file mode 100644
index 000000000..f375e9647
--- /dev/null
+++ b/numpy/core/meson.build
@@ -0,0 +1,942 @@
+# This file should contain what setup.py + setup_common.py do (WIP)
+#
+# Potential issues to address or keep track of:
+#   - sincos detection incorrect on NetBSD: https://github.com/mesonbuild/meson/issues/10641
+
+# Versioning support
+#-------------------
+#
+# How to change C_API_VERSION ?
+#   - increase C_API_VERSION value
+#   - record the hash for the new C API with the cversions.py script
+#   and add the hash to cversions.txt
+# The hash values are used to remind developers when the C API number was not
+# updated - generates a MismatchCAPIWarning warning which is turned into an
+# exception for released version.
+
+# Binary compatibility version number. This number is increased whenever the
+# C-API is changed such that binary compatibility is broken, i.e. whenever a
+# recompile of extension modules is needed.
+C_ABI_VERSION = '0x01000009'
+
+# Minor API version.  This number is increased whenever a change is made to the
+# C-API -- whether it breaks binary compatibility or not.  Some changes, such
+# as adding a function pointer to the end of the function table, can be made
+# without breaking binary compatibility.  In this case, only the C_API_VERSION
+# (*not* C_ABI_VERSION) would be increased.  Whenever binary compatibility is
+# broken, both C_API_VERSION and C_ABI_VERSION should be increased.
+#
+# The version needs to be kept in sync with that in cversions.txt.
+#
+# 0x00000008 - 1.7.x
+# 0x00000009 - 1.8.x
+# 0x00000009 - 1.9.x
+# 0x0000000a - 1.10.x
+# 0x0000000a - 1.11.x
+# 0x0000000a - 1.12.x
+# 0x0000000b - 1.13.x
+# 0x0000000c - 1.14.x
+# 0x0000000c - 1.15.x
+# 0x0000000d - 1.16.x
+# 0x0000000d - 1.19.x
+# 0x0000000e - 1.20.x
+# 0x0000000e - 1.21.x
+# 0x0000000f - 1.22.x
+# 0x00000010 - 1.23.x
+# 0x00000010 - 1.24.x
+# 0x00000011 - 1.25.x
+C_API_VERSION = '0x00000011'
+
+# Check whether we have a mismatch between the set C API VERSION and the
+# actual C API VERSION. Will raise a MismatchCAPIError if so.
+run_command('code_generators/verify_c_api_version.py', '--api-version', C_API_VERSION, check: true)
+
+
+# Generate config.h and _numpyconfig.h
+# ------------------------------------
+#
+# There are two generated headers:
+#   - config.h: a private header, which is not installed and used only at
+#               build time, mostly to determine whether or not optional
+#               headers, compiler attributes, etc. are available
+#   - _numpyconfig.h: a header with public `NPY_` symbols which get made
+#                     available via numpyconfig.h
+#
+# Note that `HAVE_*` symbols indicate the presence or absence of a checked
+# property of the build environment. When available, these symbols get defined
+# to `1`; when not available they must not be defined at all. This is
+# important, because code in NumPy typically does not check the value but only
+# whether the symbol is defined. So `#define HAVE_SOMETHING 0` is wrong.
+
+cdata = configuration_data()
+
+cdata.set('NPY_ABI_VERSION', C_ABI_VERSION)
+cdata.set('NPY_API_VERSION', C_API_VERSION)
+
+use_svml = (
+  host_machine.system() == 'linux' and
+  host_machine.cpu_family() == 'x86_64' and
+  not get_option('disable-svml')
+)
+if use_svml
+  cdata.set10('NPY_CAN_LINK_SVML', true)
+  if not fs.exists('src/umath/svml')
+    error('Missing the `SVML` git submodule! Run `git submodule update --init` to fix this.')
+  endif
+endif
+if not fs.exists('src/npysort/x86-simd-sort/README.md')
+  error('Missing the `x86-simd-sort` git submodule! Run `git submodule update --init` to fix this.')
+endif
+
+# Check sizes of types. Note, some of these landed in config.h before, but were
+# unused. So clean that up and only define the NPY_SIZEOF flavors rather than
+# the SIZEOF ones
+types_to_check = [
+  ['NPY_SIZEOF_SHORT', 'short'],
+  ['NPY_SIZEOF_INT', 'int'],
+  ['NPY_SIZEOF_LONG', 'long'],
+  ['NPY_SIZEOF_LONGLONG', 'long long'],
+  ['NPY_SIZEOF_FLOAT', 'float'],
+  ['NPY_SIZEOF_DOUBLE', 'double'],
+  ['NPY_SIZEOF_LONGDOUBLE', 'long double'],
+]
+foreach symbol_type: types_to_check
+  cdata.set(symbol_type[0], cc.sizeof(symbol_type[1]))
+endforeach
+cdata.set('NPY_SIZEOF_OFF_T', cc.sizeof('off_t', prefix: '#include <sys/types.h>'))
+cdata.set('NPY_SIZEOF_PY_INTPTR_T',
+  cc.sizeof('Py_intptr_t', dependencies: py_dep, prefix: '#include <Python.h>'))
+cdata.set('NPY_SIZEOF_PY_LONG_LONG',
+  cc.sizeof('PY_LONG_LONG', dependencies: py_dep, prefix: '#include <Python.h>'))
+
+# Check for complex support
+if cc.has_header('complex.h')
+  cdata.set10('HAVE_COMPLEX_H', true)
+  cdata.set10('NPY_USE_C99_COMPLEX', true)
+  if cc.get_argument_syntax() == 'msvc'
+    complex_types_to_check = [
+      ['NPY_HAVE_COMPLEX_FLOAT', 'NPY_SIZEOF_COMPLEX_FLOAT', '_Fcomplex', 'float'],
+      ['NPY_HAVE_COMPLEX_DOUBLE', 'NPY_SIZEOF_COMPLEX_DOUBLE', '_Dcomplex', 'double'],
+      ['NPY_HAVE_COMPLEX_LONG_DOUBLE', 'NPY_SIZEOF_COMPLEX_LONGDOUBLE', '_Lcomplex', 'long double'],
+    ]
+  else
+    complex_types_to_check = [
+      ['NPY_HAVE_COMPLEX_FLOAT', 'NPY_SIZEOF_COMPLEX_FLOAT', 'complex float', 'float'],
+      ['NPY_HAVE_COMPLEX_DOUBLE', 'NPY_SIZEOF_COMPLEX_DOUBLE', 'complex double', 'double'],
+      ['NPY_HAVE_COMPLEX_LONG_DOUBLE', 'NPY_SIZEOF_COMPLEX_LONGDOUBLE', 'complex long double', 'long double'],
+    ]
+  endif
+  foreach symbol_type: complex_types_to_check
+    if cc.has_type(symbol_type[2], prefix: '#include <complex.h>')
+      cdata.set10(symbol_type[0], true)
+      # Determine size of NumPy's own, struct-based, complex types. Binary
+      # compatibility with C99 complex types is checked at build time in `npy_common.h`.
+      ftype = symbol_type[3]
+      cdata.set(symbol_type[1], cc.sizeof(f'struct {@ftype@ __x; @ftype@ __y;}'))
+    endif
+  endforeach
+endif
+
+# Mandatory functions: if not found, fail the build
+# Some of these can still be blocklisted if the C99 implementation
+# is buggy, see numpy/core/src/common/npy_config.h
+mandatory_funcs = [
+  'sin', 'cos', 'tan', 'sinh', 'cosh', 'tanh', 'fabs',
+  'floor', 'ceil', 'sqrt', 'log10', 'log', 'exp', 'asin',
+  'acos', 'atan', 'fmod', 'modf', 'frexp', 'ldexp',
+  'expm1', 'log1p', 'acosh', 'asinh', 'atanh',
+  'rint', 'trunc', 'exp2',
+  'copysign', 'nextafter', 'strtoll', 'strtoull', 'cbrt',
+  'log2', 'pow', 'hypot', 'atan2',
+  'csin', 'csinh', 'ccos', 'ccosh', 'ctan', 'ctanh',
+  'creal', 'cimag', 'conj'
+]
+foreach func: mandatory_funcs
+  if not cc.has_function(func)
+    error('Function `{func}` not found')
+  endif
+endforeach
+
+c99_complex_funcs = [
+  'cabs', 'cacos', 'cacosh', 'carg', 'casin', 'casinh', 'catan',
+  'catanh', 'cexp', 'clog', 'cpow', 'csqrt',
+  # The long double variants (like csinl)  should be mandatory on C11,
+  # but are missing in FreeBSD. Issue gh-22850
+  'csin', 'csinh', 'ccos', 'ccosh', 'ctan', 'ctanh',
+
+]
+foreach func: c99_complex_funcs
+  func_single = func + 'f'
+  func_longdouble = func + 'l'
+  if cc.has_function(func)
+    cdata.set10('HAVE_' + func.to_upper(), true)
+  endif
+  if cc.has_function(func_single)
+    cdata.set10('HAVE_' + func_single.to_upper(), true)
+  endif
+  if cc.has_function(func_longdouble)
+    cdata.set10('HAVE_' + func_longdouble.to_upper(), true)
+  endif
+endforeach
+
+# We require C99 so these should always be found at build time. But for
+# libnpymath as a C99 compat layer, these may still be relevant.
+c99_macros = ['isfinite', 'isinf', 'isnan', 'signbit']
+foreach macro: c99_macros
+  if cc.has_function(macro)
+    cdata.set10('NPY_HAVE_DECL_' + macro.to_upper(), true)
+    if not cc.has_header_symbol('Python.h', macro, dependencies: py_dep)
+      # Add in config.h as well, except if it will clash with Python's config.h content
+      cdata.set10('HAVE_DECL_' + macro.to_upper(), true)
+    endif
+  endif
+endforeach
+
+# variable attributes tested via "int %s a" % attribute
+optional_variable_attributes = [
+  ['__thread', 'HAVE__THREAD'],
+  ['__declspec(thread)', 'HAVE___DECLSPEC_THREAD_']
+]
+foreach optional_attr: optional_variable_attributes
+  attr = optional_attr[0]
+  code = f'''
+    #pragma GCC diagnostic error "-Wattributes"
+    #pragma clang diagnostic error "-Wattributes"
+
+    int @attr@ foo;
+  '''
+  code += '''
+    int
+    main()
+    {
+      return 0;
+    }
+  '''
+  if cc.compiles(code)
+    cdata.set10(optional_attr[1], true)
+  endif
+endforeach
+
+inc_curdir = include_directories('.')
+optional_file_funcs = ['fallocate', 'ftello', 'fseeko']
+foreach filefunc_maybe: optional_file_funcs
+  config_value = 'HAVE_' + filefunc_maybe.to_upper()
+  # Some functions may already have HAVE_* defined by `Python.h`. Python puts
+  # its config.h in the public Python.h namespace, so we have a possible clash
+  # for the common functions we test. Hence we skip those checks.
+  if (filefunc_maybe == 'fallocate' or
+      not cc.has_header_symbol('Python.h', config_value, dependencies: py_dep)
+    )
+    if cc.has_function(filefunc_maybe,
+                       include_directories: inc_curdir,
+                       prefix: '#include "feature_detection_stdio.h"'
+      )
+      cdata.set10(config_value, true)
+    endif
+  endif
+endforeach
+
+# Optional locale function
+have_strtold_l = cc.has_function('strtold_l', include_directories: inc_curdir,
+  prefix:'''
+  #include <stdlib.h>
+  #include <xlocale.h>
+  #include "feature_detection_locale.h"
+''')
+if not have_strtold_l
+  # Retry with locale.h, seems to vary across Linux distros
+  have_strtold_l = cc.has_function('strtold_l', include_directories: inc_curdir,
+    prefix:'''
+    #include <stdlib.h>
+    #include <locale.h>
+    #include "feature_detection_locale.h"
+  ''')
+endif
+if have_strtold_l
+  cdata.set10('HAVE_STRTOLD_L', true)
+else
+  # FIXME: this is wrong! the HAVE_ define should not exist, or it'll be
+  # interpreted as the function being available (true/false does nothing, see
+  # note on HAVE_ defines higher up). This is necessary though in order to make
+  # the Linux CI job pass. So either the check is wrong somehow, or this
+  # function is not available in CI. For the latter there is a fallback path,
+  # but that is broken because we don't have the exact long double
+  # representation checks.
+  if cc.get_argument_syntax() != 'msvc'
+    cdata.set10('HAVE_STRTOLD_L', false)
+  endif
+endif
+
+# Other optional functions
+optional_misc_funcs = [
+  'backtrace',
+  'madvise',
+]
+foreach func: optional_misc_funcs
+  if cc.has_function(func,
+      include_directories: inc_curdir,
+      prefix: '#include "feature_detection_misc.h"'
+    )
+    cdata.set10('HAVE_' + func.to_upper(), true)
+  endif
+endforeach
+
+# SSE headers only enabled automatically on amd64/x32 builds
+optional_headers = [
+  'xmmintrin.h',  # SSE
+  'emmintrin.h',  # SSE2
+  'immintrin.h',  # AVX
+  'features.h',   # for glibc version linux
+  'xlocale.h',    # see GH#8367
+  'dlfcn.h',      # dladdr
+  'execinfo.h',   # backtrace
+  'libunwind.h',  # backtrace for LLVM/Clang using libunwind
+  'sys/mman.h',   # madvise
+]
+foreach header: optional_headers
+  if cc.has_header(header)
+    cdata.set10('HAVE_' + header.to_upper().replace('.', '_').replace('/', '_'), true)
+  endif
+endforeach
+
+# Optional compiler attributes
+# TODO: this doesn't work with cc.has_function_attribute, see
+#       https://github.com/mesonbuild/meson/issues/10732
+optional_function_attributes = [
+  ['optimize("unroll-loops")', 'OPTIMIZE_UNROLL_LOOPS'],
+  ['optimize("O3")', 'OPTIMIZE_OPT_3'],
+  ['optimize("O2")', 'OPTIMIZE_OPT_2'],
+  ['optimize("nonnull (1)")', 'NONNULL'],
+]
+#foreach attr: optional_function_attributes
+#  if cc.has_function_attribute(attr[0])
+#    cdata.set10('HAVE_ATTRIBUTE_' + attr[1], true)
+#  endif
+#endforeach
+
+# Optional GCC compiler builtins and their call arguments.
+# If given, a required header and definition name (HAVE_ prepended)
+# Call arguments are required as the compiler will do strict signature checking
+optional_intrinsics = [
+  ['__builtin_isnan', '5.', [], []],
+  ['__builtin_isinf', '5.', [], []],
+  ['__builtin_isfinite', '5.', [], []],
+  ['__builtin_bswap32', '5u', [], []],
+  ['__builtin_bswap64', '5u', [], []],
+  ['__builtin_expect', '5, 0', [], []],
+  # Test `long long` for arm+clang 13 (gh-22811, but we use all versions):
+  ['__builtin_mul_overflow', '(long long)5, 5, (int*)5', [], []],
+  ['__builtin_prefetch', '(float*)0, 0, 3', [], []],
+]
+foreach intrin: optional_intrinsics
+  func = intrin[0]
+  func_args = intrin[1]
+  header = intrin[2]
+  define = intrin[3]
+  code = ''
+  if header.length() == 1
+    header_name = header[0]
+    code += f'#include <@header_name@>'
+  endif
+  code += f'''
+    #ifdef _MSC_VER
+    #pragma function(@func@)
+    #endif
+    int main(void) {
+      @func@(@func_args@);
+      return 0;
+    };
+    '''
+  if define.length() == 1
+    define_name = define[0]
+  else
+    define_name = 'HAVE_' + func.to_upper()
+  endif
+  if cc.links(code)
+    cdata.set10(define_name, true)
+  endif
+endforeach
+
+# long double representation detection (see setup_common.py)
+# TODO: this is still incomplete, and different from how it's done in the
+# numpy.distutils based build, see https://github.com/mesonbuild/meson/issues/11068
+longdouble_size = cc.sizeof('long double')
+if longdouble_size == 8
+  if host_machine.endian() == 'little'
+    longdouble_format = 'IEEE_DOUBLE_LE'
+  else
+    longdouble_format = 'IEEE_DOUBLE_BE'
+  endif
+elif longdouble_size == 12
+  error('This should not be possible, 12 bits of "content" should still result in sizeof() being 16. Please report this error!'
+  )
+elif longdouble_size == 16
+  if host_machine.endian() == 'little'
+    # FIXME: this varies, there's multiple formats here! Not yet implemented.
+    #        TBD how we deal with the mess of old long double formats.
+    longdouble_format = 'INTEL_EXTENDED_16_BYTES_LE'
+  else
+    error('No idea what this is ....')
+  endif
+else
+  error('Unknown long double size: ' + londouble_size)
+endif
+cdata.set10('HAVE_LDOUBLE_' + longdouble_format, true)
+
+if cc.has_header('endian.h')
+  cdata.set10('NPY_HAVE_ENDIAN_H', true)
+endif
+if cc.has_header('sys/endian.h')
+  cdata.set10('NPY_HAVE_SYS_ENDIAN_H', true)
+endif
+if is_windows
+  cdata.set10('NPY_NO_SIGNAL', true)
+endif
+# Command-line switch; distutils build checked for `NPY_NOSMP` env var instead
+# TODO: document this (search for NPY_NOSMP in C API docs)
+cdata.set10('NPY_NO_SMP', get_option('disable-threading'))
+
+# Use bogus stride debug aid to flush out bugs where users use strides of
+# dimensions with length 1 to index a full contiguous array.
+cdata.set10('NPY_RELAXED_STRIDES_DEBUG', get_option('relaxed-strides-debug'))
+
+# Check whether we can use inttypes (C99) formats
+if cc.has_header_symbol('inttypes.h', 'PRIdPTR')
+  cdata.set10('NPY_USE_C99_FORMATS', true)
+endif
+
+visibility_hidden = ''
+if cc.has_function_attribute('visibility:hidden')
+  visibility_hidden = '__attribute__((visibility("hidden")))'
+endif
+cdata.set('NPY_VISIBILITY_HIDDEN', visibility_hidden)
+
+
+config_h = configure_file(
+  input: 'config.h.in',
+  output: 'config.h',
+  configuration: cdata,
+  install: false
+)
+
+_numpyconfig_h = configure_file(
+  input: 'include/numpy/_numpyconfig.h.in',
+  output: '_numpyconfig.h',
+  configuration: cdata,
+  install: true,
+  install_dir: np_dir / 'core/include/numpy'
+)
+
+# Build npymath static library
+# ----------------------------
+
+staticlib_cflags = []
+staticlib_cppflags = []
+if cc.get_id() == 'msvc'
+  # Disable voltbl section for vc142 to allow link using mingw-w64; see:
+  # https://github.com/matthew-brett/dll_investigation/issues/1#issuecomment-1100468171
+  # Needs to be added to static libraries that are shipped for reuse (i.e.,
+  # libnpymath and libnpyrandom)
+  if cc.has_argument('-d2VolatileMetadata-')
+     staticlib_cflags +=  '-d2VolatileMetadata-'
+  endif
+endif
+# TODO: change to "feature" option in meson_options.txt? See
+# https://mesonbuild.com/Build-options.html#build-options
+if get_option('disable-simd-optimizations')
+  staticlib_cflags += '-DNPY_DISABLE_OPTIMIZATION'
+  staticlib_cppflags += '-DNPY_DISABLE_OPTIMIZATION'
+endif
+
+npy_math_internal_h = custom_target(
+  output: 'npy_math_internal.h',
+  input: 'src/npymath/npy_math_internal.h.src',
+  command: [src_file_cli, '@INPUT@', '-o', '@OUTPUT@'],
+)
+
+npymath_sources = [
+  src_file.process('src/npymath/ieee754.c.src'),
+  src_file.process('src/npymath/npy_math_complex.c.src'),
+  npy_math_internal_h,
+  'src/npymath/halffloat.cpp',
+  'src/npymath/npy_math.c',
+]
+npymath_lib = static_library('npymath',
+  npymath_sources,
+  c_args: staticlib_cflags,
+  cpp_args: staticlib_cppflags,
+  include_directories: ['include', 'src/npymath', 'src/common'],
+  dependencies: py_dep,
+  install: true,
+  install_dir: np_dir / 'core/lib',
+)
+
+dir_separator = '/'
+if build_machine.system() == 'windows'
+  dir_separator = '\\'
+endif
+configure_file(
+  input: 'npymath.ini.in',
+  output: 'npymath.ini',
+  configuration: configuration_data({
+    'pkgname' : 'numpy.core',
+    'sep' : dir_separator,
+  }),
+  install: true,
+  install_dir: np_dir / 'core/lib/npy-pkg-config'
+)
+configure_file(
+  input: 'mlib.ini.in',
+  output: 'mlib.ini',
+  configuration: configuration_data({
+    'posix_mathlib' : mlib_linkflag,
+    'msvc_mathlib' : 'm.lib',
+  }),
+  install: true,
+  install_dir: np_dir / 'core/lib/npy-pkg-config'
+)
+
+if false
+  # This doesn't quite work (yet), it assumes we'll install headers under
+  # include/, and trying to add the correct path with `extra_cflags` runs into
+  # not being able to get a path relative to the prefix.
+  # Note that installing numpy headers under include/ would be a good idea, but
+  # that needs work (and may run into trouble with wheels perhaps, not quite
+  # clear if they allow this).
+  pkg = import('pkgconfig')
+  pkg.generate(npymath_lib,
+    name: 'npymath',
+    description: 'Portable, core math library implementing C99 standard',
+    url: 'https://github.com/numpy/numpy',
+    requires: 'numpy',
+    install_dir: np_dir / 'core/lib/npy-pkg-config',
+    extra_cflags: '-I${includedir}' + np_dir / 'core' / 'include',
+  )
+endif
+
+# Generate NumPy C API sources
+# ----------------------------
+
+# This is a single C file. It needs to be built before _multiarray_umath starts
+# building, but we can't add it to the sources of that extension (this C file
+# doesn't compile, it's only included in another r file. Hence use the slightly
+# hacky --ignore argument to the next custom_target().
+src_umath_api_c = custom_target('__umath_generated',
+  output : '__umath_generated.c',
+  input : 'code_generators/generate_umath.py',
+  command: [py, '@INPUT@', '-o', '@OUTPUT@'],
+)
+
+src_umath_doc_h = custom_target('_umath_doc_generated',
+  output : '_umath_doc_generated.h',
+  input : 'code_generators/generate_umath_doc.py',
+  command: [py, '@INPUT@', '-o', '@OUTPUT@'],
+)
+
+src_numpy_api = custom_target('__multiarray_api',
+  output : ['__multiarray_api.c', '__multiarray_api.h'],
+  input : 'code_generators/generate_numpy_api.py',
+  command: [py, '@INPUT@', '-o', '@OUTDIR@', '--ignore', src_umath_api_c],
+  install: true,  # NOTE: setup.py build installs all, but just need .h?
+  install_dir: np_dir / 'core/include/numpy'
+)
+
+src_ufunc_api = custom_target('__ufunc_api',
+  output : ['__ufunc_api.c', '__ufunc_api.h'],
+  input : 'code_generators/generate_ufunc_api.py',
+  command: [py, '@INPUT@', '-o', '@OUTDIR@'],
+  install: true,  # NOTE: setup.py build installs all, but just need .h?
+  install_dir: np_dir / 'core/include/numpy'
+)
+
+
+# Set common build flags for C and C++ code
+# -----------------------------------------
+
+# TODO: change to "feature" option in meson_options.txt? See
+# https://mesonbuild.com/Build-options.html#build-options
+disable_simd_optimizations = []
+if get_option('disable-simd-optimizations')
+  disable_simd_optimizations = '-DNPY_DISABLE_OPTIMIZATION'
+endif
+
+# Common build flags
+c_args_common = [
+  '-DNPY_INTERNAL_BUILD',
+  '-DHAVE_NPY_CONFIG_H',
+  disable_simd_optimizations,
+  cflags_large_file_support,
+]
+
+# Same as NPY_CXX_FLAGS (TODO: extend for what ccompiler_opt adds)
+cpp_args_common = c_args_common + [
+  '-D__STDC_VERSION__=0',  # for compatibility with C headers
+]
+if cc.get_argument_syntax() != 'msvc'
+  cpp_args_common += [
+    '-fno-exceptions',  # no exception support
+    '-fno-rtti',  # no runtime type information
+  ]
+endif
+
+# Other submodules depend on generated headers and include directories from
+# core, wrap those up into a reusable dependency. Also useful for some test
+# modules in this build file.
+np_core_dep = declare_dependency(
+  sources: [
+    _numpyconfig_h,
+    npy_math_internal_h,
+    src_numpy_api[1],  # __multiarray_api.h
+    src_ufunc_api[1],  # __ufunc_api.h
+  ],
+  include_directories: [
+    '.',
+    'include',
+    'src/common',
+  ],
+  compile_args: disable_simd_optimizations
+)
+
+
+# Build multiarray_tests module
+# -----------------------------
+py.extension_module('_multiarray_tests',
+  [
+    src_file.process('src/multiarray/_multiarray_tests.c.src'),
+    'src/common/mem_overlap.c',
+    'src/common/npy_argparse.c',
+    'src/common/npy_hashtable.c',
+    src_file.process('src/common/templ_common.h.src')
+  ],
+  c_args: c_args_common,
+  include_directories: ['src/multiarray', 'src/npymath'],
+  dependencies: np_core_dep,
+  link_with: npymath_lib,
+  gnu_symbol_visibility: 'default',
+  install: true,
+  subdir: 'numpy/core',
+)
+
+test_modules_src = [
+  ['_umath_tests', [
+      src_file.process('src/umath/_umath_tests.c.src'),
+      'src/umath/_umath_tests.dispatch.c',
+      'src/common/npy_cpu_features.c',
+    ]],
+  ['_rational_tests', 'src/umath/_rational_tests.c'],
+  ['_struct_ufunc_tests', 'src/umath/_struct_ufunc_tests.c'],
+  ['_operand_flag_tests', 'src/umath/_operand_flag_tests.c'],
+]
+foreach gen: test_modules_src
+  py.extension_module(gen[0],
+    gen[1],
+    c_args: c_args_common,
+    include_directories: ['src/multiarray', 'src/npymath'],
+    dependencies: np_core_dep,
+    install: true,
+    subdir: 'numpy/core',
+  )
+endforeach
+
+# Build _multiarray_umath module
+# ------------------------------
+src_multiarray_umath_common = [
+  'src/common/array_assign.c',
+  'src/common/mem_overlap.c',
+  'src/common/npy_argparse.c',
+  'src/common/npy_hashtable.c',
+  'src/common/npy_longdouble.c',
+  'src/common/ucsnarrow.c',
+  'src/common/ufunc_override.c',
+  'src/common/numpyos.c',
+  'src/common/npy_cpu_features.c',
+  src_file.process('src/common/templ_common.h.src')
+]
+if have_blas
+  src_multiarray_umath_common += [
+    'src/common/cblasfuncs.c',
+    'src/common/python_xerbla.c',
+  ]
+endif
+
+src_multiarray = [
+  'src/multiarray/abstractdtypes.c',
+  'src/multiarray/alloc.c',
+  src_file.process('src/multiarray/argfunc.dispatch.c.src'),
+  'src/multiarray/arrayobject.c',
+  src_file.process('src/multiarray/arraytypes.h.src'),
+  'src/multiarray/array_coercion.c',
+  'src/multiarray/array_method.c',
+  'src/multiarray/array_assign_scalar.c',
+  'src/multiarray/array_assign_array.c',
+  'src/multiarray/arrayfunction_override.c',
+  src_file.process('src/multiarray/arraytypes.c.src'),
+  'src/multiarray/buffer.c',
+  'src/multiarray/calculation.c',
+  'src/multiarray/compiled_base.c',
+  'src/multiarray/common.c',
+  'src/multiarray/common_dtype.c',
+  'src/multiarray/convert.c',
+  'src/multiarray/convert_datatype.c',
+  'src/multiarray/conversion_utils.c',
+  'src/multiarray/ctors.c',
+  'src/multiarray/datetime.c',
+  'src/multiarray/datetime_strings.c',
+  'src/multiarray/datetime_busday.c',
+  'src/multiarray/datetime_busdaycal.c',
+  'src/multiarray/descriptor.c',
+  'src/multiarray/dlpack.c',
+  'src/multiarray/dtypemeta.c',
+  'src/multiarray/dragon4.c',
+  'src/multiarray/dtype_transfer.c',
+  'src/multiarray/dtype_traversal.c',
+  src_file.process('src/multiarray/einsum.c.src'),
+  src_file.process('src/multiarray/einsum_sumprod.c.src'),
+  'src/multiarray/experimental_public_dtype_api.c',
+  'src/multiarray/flagsobject.c',
+  'src/multiarray/getset.c',
+  'src/multiarray/hashdescr.c',
+  'src/multiarray/item_selection.c',
+  'src/multiarray/iterators.c',
+  'src/multiarray/legacy_dtype_implementation.c',
+  src_file.process('src/multiarray/lowlevel_strided_loops.c.src'),
+  'src/multiarray/mapping.c',
+  'src/multiarray/methods.c',
+  'src/multiarray/multiarraymodule.c',
+  'src/multiarray/nditer_api.c',
+  'src/multiarray/nditer_constr.c',
+  'src/multiarray/nditer_pywrap.c',
+  src_file.process('src/multiarray/nditer_templ.c.src'),
+  'src/multiarray/number.c',
+  'src/multiarray/refcount.c',
+  src_file.process('src/multiarray/scalartypes.c.src'),
+  'src/multiarray/sequence.c',
+  'src/multiarray/shape.c',
+  'src/multiarray/scalarapi.c',
+  'src/multiarray/strfuncs.c',
+  'src/multiarray/temp_elide.c',
+  'src/multiarray/typeinfo.c',
+  'src/multiarray/usertypes.c',
+  'src/multiarray/vdot.c',
+  src_file.process('src/common/npy_sort.h.src'),
+  'src/npysort/simd_qsort.dispatch.cpp',
+  'src/npysort/simd_qsort_16bit.dispatch.cpp',
+  'src/npysort/quicksort.cpp',
+  'src/npysort/mergesort.cpp',
+  'src/npysort/timsort.cpp',
+  'src/npysort/heapsort.cpp',
+  'src/npysort/radixsort.cpp',
+  'src/common/npy_partition.h',
+  'src/npysort/selection.cpp',
+  'src/common/npy_binsearch.h',
+  'src/npysort/binsearch.cpp',
+  'src/multiarray/textreading/conversions.c',
+  'src/multiarray/textreading/field_types.c',
+  'src/multiarray/textreading/growth.c',
+  'src/multiarray/textreading/readtext.c',
+  'src/multiarray/textreading/rows.c',
+  'src/multiarray/textreading/stream_pyobject.c',
+  'src/multiarray/textreading/str_to_int.c',
+  'src/multiarray/textreading/tokenize.cpp',
+  # Remove this `arm64_exports.c` file once scipy macos arm64 build correctly
+  # links to the arm64 npymath library, see gh-22673
+  'src/npymath/arm64_exports.c',
+]
+
+src_umath = [
+  src_file.process('src/umath/funcs.inc.src'),
+  src_file.process('src/umath/loops.h.src'),
+  src_file.process('src/umath/loops_utils.h.src'),
+  src_file.process('src/umath/loops.c.src'),
+  src_file.process('src/umath/loops_arithm_fp.dispatch.c.src'),
+  src_file.process('src/umath/loops_arithmetic.dispatch.c.src'),
+  src_file.process('src/umath/loops_comparison.dispatch.c.src'),
+  src_file.process('src/umath/loops_exponent_log.dispatch.c.src'),
+  src_file.process('src/umath/loops_hyperbolic.dispatch.c.src'),
+  src_file.process('src/umath/loops_logical.dispatch.c.src'),
+  src_file.process('src/umath/loops_minmax.dispatch.c.src'),
+  src_file.process('src/umath/loops_modulo.dispatch.c.src'),
+  src_file.process('src/umath/loops_trigonometric.dispatch.c.src'),
+  src_file.process('src/umath/loops_umath_fp.dispatch.c.src'),
+  src_file.process('src/umath/loops_unary.dispatch.c.src'),
+  src_file.process('src/umath/loops_unary_fp.dispatch.c.src'),
+  src_file.process('src/umath/loops_unary_fp_le.dispatch.c.src'),
+  src_file.process('src/umath/loops_unary_complex.dispatch.c.src'),
+  src_file.process('src/umath/loops_autovec.dispatch.c.src'),
+  src_file.process('src/umath/matmul.c.src'),
+  src_file.process('src/umath/matmul.h.src'),
+  'src/umath/ufunc_type_resolution.c',
+  'src/umath/clip.cpp',
+  'src/umath/clip.h',
+  'src/umath/dispatching.c',
+  'src/umath/extobj.c',
+  'src/umath/legacy_array_method.c',
+  'src/umath/override.c',
+  'src/umath/reduction.c',
+  src_file.process('src/umath/scalarmath.c.src'),
+  'src/umath/ufunc_object.c',
+  'src/umath/umathmodule.c',
+  'src/umath/string_ufuncs.cpp',
+  'src/umath/wrapping_array_method.c',
+  # For testing. Eventually, should use public API and be separate:
+  'src/umath/_scaled_float_dtype.c',
+]
+
+# SVML object files. If functionality is migrated to universal intrinsics and
+# the object files are no longer needed, comment out the relevant object files
+# here. Note that this migration is desirable; we then get the performance
+# benefits for all platforms rather than only for AVX512 on 64-bit Linux, and
+# may be able to avoid the accuracy regressions in SVML.
+svml_objects = []
+if use_svml
+  svml_objects += [
+    'src/umath/svml/linux/avx512/svml_z0_acos_d_la.s',
+    'src/umath/svml/linux/avx512/svml_z0_acos_s_la.s',
+    'src/umath/svml/linux/avx512/svml_z0_acosh_d_la.s',
+    'src/umath/svml/linux/avx512/svml_z0_acosh_s_la.s',
+    'src/umath/svml/linux/avx512/svml_z0_asin_d_la.s',
+    'src/umath/svml/linux/avx512/svml_z0_asin_s_la.s',
+    'src/umath/svml/linux/avx512/svml_z0_asinh_d_la.s',
+    'src/umath/svml/linux/avx512/svml_z0_asinh_s_la.s',
+    'src/umath/svml/linux/avx512/svml_z0_atan2_d_la.s',
+    'src/umath/svml/linux/avx512/svml_z0_atan2_s_la.s',
+    'src/umath/svml/linux/avx512/svml_z0_atan_d_la.s',
+    'src/umath/svml/linux/avx512/svml_z0_atan_s_la.s',
+    'src/umath/svml/linux/avx512/svml_z0_atanh_d_la.s',
+    'src/umath/svml/linux/avx512/svml_z0_atanh_s_la.s',
+    'src/umath/svml/linux/avx512/svml_z0_cbrt_d_la.s',
+    'src/umath/svml/linux/avx512/svml_z0_cbrt_s_la.s',
+    'src/umath/svml/linux/avx512/svml_z0_cos_d_la.s',
+    'src/umath/svml/linux/avx512/svml_z0_cos_s_la.s',
+    'src/umath/svml/linux/avx512/svml_z0_cosh_d_la.s',
+    'src/umath/svml/linux/avx512/svml_z0_cosh_s_la.s',
+    'src/umath/svml/linux/avx512/svml_z0_exp2_d_la.s',
+    'src/umath/svml/linux/avx512/svml_z0_exp2_s_la.s',
+    'src/umath/svml/linux/avx512/svml_z0_exp_d_la.s',
+    'src/umath/svml/linux/avx512/svml_z0_exp_s_la.s',
+    'src/umath/svml/linux/avx512/svml_z0_expm1_d_la.s',
+    'src/umath/svml/linux/avx512/svml_z0_expm1_s_la.s',
+    'src/umath/svml/linux/avx512/svml_z0_log10_d_la.s',
+    'src/umath/svml/linux/avx512/svml_z0_log10_s_la.s',
+    'src/umath/svml/linux/avx512/svml_z0_log1p_d_la.s',
+    'src/umath/svml/linux/avx512/svml_z0_log1p_s_la.s',
+    'src/umath/svml/linux/avx512/svml_z0_log2_d_la.s',
+    'src/umath/svml/linux/avx512/svml_z0_log2_s_la.s',
+    'src/umath/svml/linux/avx512/svml_z0_log_d_la.s',
+    'src/umath/svml/linux/avx512/svml_z0_log_s_la.s',
+    'src/umath/svml/linux/avx512/svml_z0_pow_d_la.s',
+    'src/umath/svml/linux/avx512/svml_z0_pow_s_la.s',
+    'src/umath/svml/linux/avx512/svml_z0_sin_d_la.s',
+    'src/umath/svml/linux/avx512/svml_z0_sin_s_la.s',
+    'src/umath/svml/linux/avx512/svml_z0_sinh_d_la.s',
+    'src/umath/svml/linux/avx512/svml_z0_sinh_s_la.s',
+    'src/umath/svml/linux/avx512/svml_z0_tan_d_la.s',
+    'src/umath/svml/linux/avx512/svml_z0_tan_s_la.s',
+    # 'src/umath/svml/linux/avx512/svml_z0_tanh_d_la.s',
+    'src/umath/svml/linux/avx512/svml_z0_tanh_s_la.s',
+  ]
+endif
+
+py.extension_module('_multiarray_umath',
+  [
+    config_h,
+    _numpyconfig_h,
+    src_multiarray,
+    src_multiarray_umath_common,
+    src_umath,
+    src_ufunc_api[1],  # __ufunc_api.h
+    src_numpy_api[1],  # __multiarray_api.h
+    src_umath_doc_h,
+    npy_math_internal_h,
+  ],
+  objects: svml_objects,
+  c_args: c_args_common,
+  cpp_args: cpp_args_common,
+  include_directories: [
+    'include',
+    'src/common',
+    'src/multiarray',
+    'src/npymath',
+    'src/umath',
+  ],
+  dependencies: blas,
+  link_with: npymath_lib,
+  install: true,
+  subdir: 'numpy/core',
+)
+
+# Build SIMD module
+# -----------------
+
+py.extension_module('_simd',
+  [
+    'src/common/npy_cpu_features.c',
+    'src/_simd/_simd.c',
+    src_file.process('src/_simd/_simd_inc.h.src'),
+    src_file.process('src/_simd/_simd_data.inc.src'),
+    src_file.process('src/_simd/_simd.dispatch.c.src'),
+  ],
+  c_args: c_args_common,
+  include_directories: ['src/_simd', 'src/npymath'],
+  dependencies: np_core_dep,
+  link_with: npymath_lib,
+  install: true,
+  subdir: 'numpy/core',
+)
+
+python_sources = [
+  '__init__.py',
+  '__init__.pyi',
+  '_add_newdocs.py',
+  '_add_newdocs_scalars.py',
+  '_asarray.py',
+  '_asarray.pyi',
+  '_dtype.py',
+  '_dtype_ctypes.py',
+  '_exceptions.py',
+  '_internal.py',
+  '_internal.pyi',
+  '_machar.py',
+  '_methods.py',
+  '_string_helpers.py',
+  '_type_aliases.py',
+  '_type_aliases.pyi',
+  '_ufunc_config.py',
+  '_ufunc_config.pyi',
+  'arrayprint.py',
+  'arrayprint.pyi',
+  'cversions.py',
+  'defchararray.py',
+  'defchararray.pyi',
+  'einsumfunc.py',
+  'einsumfunc.pyi',
+  'fromnumeric.py',
+  'fromnumeric.pyi',
+  'function_base.py',
+  'function_base.pyi',
+  'getlimits.py',
+  'getlimits.pyi',
+  'memmap.py',
+  'memmap.pyi',
+  'multiarray.py',
+  'multiarray.pyi',
+  'numeric.py',
+  'numeric.pyi',
+  'numerictypes.py',
+  'numerictypes.pyi',
+  'overrides.py',
+  'records.py',
+  'records.pyi',
+  'shape_base.py',
+  'shape_base.pyi',
+  'umath.py',
+  'umath_tests.py',
+]
+
+py.install_sources(
+  python_sources,
+  subdir: 'numpy/core'
+)
+
+subdir('include')
+install_subdir('tests', install_dir: np_dir / 'core')
diff --git a/numpy/core/multiarray.py b/numpy/core/multiarray.py
index 31b779783..d11283345 100644
--- a/numpy/core/multiarray.py
+++ b/numpy/core/multiarray.py
@@ -14,10 +14,10 @@ from ._multiarray_umath import *  # noqa: F403
 # do not change them. issue gh-15518
 # _get_ndarray_c_version is semi-public, on purpose not added to __all__
 from ._multiarray_umath import (
-    fastCopyAndTranspose, _flagdict, from_dlpack, _insert, _reconstruct,
+    fastCopyAndTranspose, _flagdict, from_dlpack, _place, _reconstruct,
     _vec_string, _ARRAY_API, _monotonicity, _get_ndarray_c_version,
     _get_madvise_hugepage, _set_madvise_hugepage,
-    _get_promotion_state, _set_promotion_state,
+    _get_promotion_state, _set_promotion_state, _using_numpy2_behavior
     )
 
 __all__ = [
@@ -25,7 +25,7 @@ __all__ = [
     'ITEM_HASOBJECT', 'ITEM_IS_POINTER', 'LIST_PICKLE', 'MAXDIMS',
     'MAY_SHARE_BOUNDS', 'MAY_SHARE_EXACT', 'NEEDS_INIT', 'NEEDS_PYAPI',
     'RAISE', 'USE_GETITEM', 'USE_SETITEM', 'WRAP',
-    '_flagdict', 'from_dlpack', '_insert', '_reconstruct', '_vec_string',
+    '_flagdict', 'from_dlpack', '_place', '_reconstruct', '_vec_string',
     '_monotonicity', 'add_docstring', 'arange', 'array', 'asarray',
     'asanyarray', 'ascontiguousarray', 'asfortranarray', 'bincount',
     'broadcast', 'busday_count', 'busday_offset', 'busdaycalendar', 'can_cast',
@@ -42,7 +42,7 @@ __all__ = [
     'set_legacy_print_mode', 'set_numeric_ops', 'set_string_function',
     'set_typeDict', 'shares_memory', 'tracemalloc_domain', 'typeinfo',
     'unpackbits', 'unravel_index', 'vdot', 'where', 'zeros',
-    '_get_promotion_state', '_set_promotion_state']
+    '_get_promotion_state', '_set_promotion_state', '_using_numpy2_behavior']
 
 # For backward compatibility, make sure pickle imports these functions from here
 _reconstruct.__module__ = 'numpy.core.multiarray'
@@ -72,6 +72,7 @@ seterrobj.__module__ = 'numpy'
 zeros.__module__ = 'numpy'
 _get_promotion_state.__module__ = 'numpy'
 _set_promotion_state.__module__ = 'numpy'
+_using_numpy2_behavior.__module__ = 'numpy'
 
 
 # We can't verify dispatcher signatures because NumPy's C functions don't
@@ -714,7 +715,7 @@ def result_type(*arrays_and_dtypes):
     the data types are combined with :func:`promote_types`
     to produce the return value.
 
-    Otherwise, `min_scalar_type` is called on each array, and
+    Otherwise, `min_scalar_type` is called on each scalar, and
     the resulting data types are all combined with :func:`promote_types`
     to produce the return value.
 
@@ -1122,13 +1123,13 @@ def copyto(dst, src, casting=None, where=None):
     >>> A
     array([[4, 5, 6],
            [7, 8, 9]])
-       
+
     """
     return (dst, src, where)
 
 
 @array_function_from_c_func_and_dispatcher(_multiarray_umath.putmask)
-def putmask(a, mask, values):
+def putmask(a, /, mask, values):
     """
     putmask(a, mask, values)
 
@@ -1345,7 +1346,7 @@ def shares_memory(a, b, max_work=None):
 
     Raises
     ------
-    numpy.TooHardError
+    numpy.exceptions.TooHardError
         Exceeded max_work.
 
     Returns
@@ -1379,7 +1380,7 @@ def shares_memory(a, b, max_work=None):
     >>> np.shares_memory(x1, x2, max_work=1000)
     Traceback (most recent call last):
     ...
-    numpy.TooHardError: Exceeded max_work
+    numpy.exceptions.TooHardError: Exceeded max_work
 
     Running ``np.shares_memory(x1, x2)`` without `max_work` set takes
     around 1 minute for this case. It is possible to find problems
diff --git a/numpy/core/multiarray.pyi b/numpy/core/multiarray.pyi
index 0701085b7..dc05f8126 100644
--- a/numpy/core/multiarray.pyi
+++ b/numpy/core/multiarray.pyi
@@ -428,6 +428,7 @@ def copyto(
 
 def putmask(
     a: NDArray[Any],
+    /,
     mask: _ArrayLikeBool_co,
     values: ArrayLike,
 ) -> None: ...
diff --git a/numpy/core/numeric.py b/numpy/core/numeric.py
index 5b92972d1..91ac3f860 100644
--- a/numpy/core/numeric.py
+++ b/numpy/core/numeric.py
@@ -4,6 +4,7 @@ import operator
 import sys
 import warnings
 import numbers
+import builtins
 
 import numpy as np
 from . import multiarray
@@ -17,7 +18,8 @@ from .multiarray import (
     fromstring, inner, lexsort, matmul, may_share_memory,
     min_scalar_type, ndarray, nditer, nested_iters, promote_types,
     putmask, result_type, set_numeric_ops, shares_memory, vdot, where,
-    zeros, normalize_axis_index, _get_promotion_state, _set_promotion_state)
+    zeros, normalize_axis_index, _get_promotion_state, _set_promotion_state,
+    _using_numpy2_behavior)
 
 from . import overrides
 from . import umath
@@ -26,7 +28,7 @@ from .overrides import set_array_function_like_doc, set_module
 from .umath import (multiply, invert, sin, PINF, NAN)
 from . import numerictypes
 from .numerictypes import longlong, intc, int_, float_, complex_, bool_
-from ._exceptions import TooHardError, AxisError
+from ..exceptions import ComplexWarning, TooHardError, AxisError
 from ._ufunc_config import errstate, _no_nep50_warning
 
 bitwise_not = invert
@@ -52,22 +54,10 @@ __all__ = [
     'identity', 'allclose', 'compare_chararrays', 'putmask',
     'flatnonzero', 'Inf', 'inf', 'infty', 'Infinity', 'nan', 'NaN',
     'False_', 'True_', 'bitwise_not', 'CLIP', 'RAISE', 'WRAP', 'MAXDIMS',
-    'BUFSIZE', 'ALLOW_THREADS', 'ComplexWarning', 'full', 'full_like',
+    'BUFSIZE', 'ALLOW_THREADS', 'full', 'full_like',
     'matmul', 'shares_memory', 'may_share_memory', 'MAY_SHARE_BOUNDS',
-    'MAY_SHARE_EXACT', 'TooHardError', 'AxisError',
-    '_get_promotion_state', '_set_promotion_state']
-
-
-@set_module('numpy')
-class ComplexWarning(RuntimeWarning):
-    """
-    The warning raised when casting a complex dtype to a real dtype.
-
-    As implemented, casting a complex number to a real discards its imaginary
-    part, but this behavior may not be what the user actually wants.
-
-    """
-    pass
+    'MAY_SHARE_EXACT', '_get_promotion_state', '_set_promotion_state',
+    '_using_numpy2_behavior']
 
 
 def _zeros_like_dispatcher(a, dtype=None, order=None, subok=None, shape=None):
@@ -143,10 +133,6 @@ def zeros_like(a, dtype=None, order='K', subok=True, shape=None):
     return res
 
 
-def _ones_dispatcher(shape, dtype=None, order=None, *, like=None):
-    return(like,)
-
-
 @set_array_function_like_doc
 @set_module('numpy')
 def ones(shape, dtype=None, order='C', *, like=None):
@@ -200,16 +186,14 @@ def ones(shape, dtype=None, order='C', *, like=None):
 
     """
     if like is not None:
-        return _ones_with_like(shape, dtype=dtype, order=order, like=like)
+        return _ones_with_like(like, shape, dtype=dtype, order=order)
 
     a = empty(shape, dtype, order)
     multiarray.copyto(a, 1, casting='unsafe')
     return a
 
 
-_ones_with_like = array_function_dispatch(
-    _ones_dispatcher
-)(ones)
+_ones_with_like = array_function_dispatch()(ones)
 
 
 def _ones_like_dispatcher(a, dtype=None, order=None, subok=None, shape=None):
@@ -336,7 +320,8 @@ def full(shape, fill_value, dtype=None, order='C', *, like=None):
 
     """
     if like is not None:
-        return _full_with_like(shape, fill_value, dtype=dtype, order=order, like=like)
+        return _full_with_like(
+                like, shape, fill_value, dtype=dtype, order=order)
 
     if dtype is None:
         fill_value = asarray(fill_value)
@@ -346,9 +331,7 @@ def full(shape, fill_value, dtype=None, order='C', *, like=None):
     return a
 
 
-_full_with_like = array_function_dispatch(
-    _full_dispatcher
-)(full)
+_full_with_like = array_function_dispatch()(full)
 
 
 def _full_like_dispatcher(a, fill_value, dtype=None, order=None, subok=None, shape=None):
@@ -707,7 +690,7 @@ def correlate(a, v, mode='valid'):
     --------
     convolve : Discrete, linear convolution of two one-dimensional sequences.
     multiarray.correlate : Old, no conjugate, version of correlate.
-    scipy.signal.correlate : uses FFT which has superior performance on large arrays. 
+    scipy.signal.correlate : uses FFT which has superior performance on large arrays.
 
     Notes
     -----
@@ -721,7 +704,7 @@ def correlate(a, v, mode='valid'):
     `numpy.correlate` may perform slowly in large arrays (i.e. n = 1e5) because it does
     not use the FFT to compute the convolution; in that case, `scipy.signal.correlate` might
     be preferable.
-    
+
 
     Examples
     --------
@@ -738,7 +721,7 @@ def correlate(a, v, mode='valid'):
     array([ 0.5-0.5j,  1.0+0.j ,  1.5-1.5j,  3.0-1.j ,  0.0+0.j ])
 
     Note that you get the time reversed, complex conjugated result
-    (:math:`\overline{c_{-k}}`) when the two input sequences a and v change 
+    (:math:`\overline{c_{-k}}`) when the two input sequences a and v change
     places:
 
     >>> np.correlate([0, 1, 0.5j], [1+1j, 2, 3-1j], 'full')
@@ -860,14 +843,13 @@ def outer(a, b, out=None):
     """
     Compute the outer product of two vectors.
 
-    Given two vectors, ``a = [a0, a1, ..., aM]`` and
-    ``b = [b0, b1, ..., bN]``,
+    Given two vectors `a` and `b` of length ``M`` and ``N``, repsectively,
     the outer product [1]_ is::
 
-      [[a0*b0  a0*b1 ... a0*bN ]
-       [a1*b0    .
+      [[a_0*b_0  a_0*b_1 ... a_0*b_{N-1} ]
+       [a_1*b_0    .
        [ ...          .
-       [aM*b0            aM*bN ]]
+       [a_{M-1}*b_0            a_{M-1}*b_{N-1} ]]
 
     Parameters
     ----------
@@ -899,9 +881,9 @@ def outer(a, b, out=None):
 
     References
     ----------
-    .. [1] : G. H. Golub and C. F. Van Loan, *Matrix Computations*, 3rd
-             ed., Baltimore, MD, Johns Hopkins University Press, 1996,
-             pg. 8.
+    .. [1] G. H. Golub and C. F. Van Loan, *Matrix Computations*, 3rd
+           ed., Baltimore, MD, Johns Hopkins University Press, 1996,
+           pg. 8.
 
     Examples
     --------
@@ -1300,7 +1282,7 @@ def rollaxis(a, axis, start=0):
            +-------------------+----------------------+
            | ``arr.ndim + 1``  | raise ``AxisError``  |
            +-------------------+----------------------+
-           
+
         .. |vdots|   unicode:: U+22EE .. Vertical Ellipsis
 
     Returns
@@ -1791,10 +1773,6 @@ def indices(dimensions, dtype=int, sparse=False):
     return res
 
 
-def _fromfunction_dispatcher(function, shape, *, dtype=None, like=None, **kwargs):
-    return (like,)
-
-
 @set_array_function_like_doc
 @set_module('numpy')
 def fromfunction(function, shape, *, dtype=float, like=None, **kwargs):
@@ -1843,11 +1821,11 @@ def fromfunction(function, shape, *, dtype=float, like=None, **kwargs):
     >>> np.fromfunction(lambda i, j: i, (2, 2), dtype=float)
     array([[0., 0.],
            [1., 1.]])
-           
-    >>> np.fromfunction(lambda i, j: j, (2, 2), dtype=float)    
+
+    >>> np.fromfunction(lambda i, j: j, (2, 2), dtype=float)
     array([[0., 1.],
            [0., 1.]])
-           
+
     >>> np.fromfunction(lambda i, j: i == j, (3, 3), dtype=int)
     array([[ True, False, False],
            [False,  True, False],
@@ -1860,15 +1838,14 @@ def fromfunction(function, shape, *, dtype=float, like=None, **kwargs):
 
     """
     if like is not None:
-        return _fromfunction_with_like(function, shape, dtype=dtype, like=like, **kwargs)
+        return _fromfunction_with_like(
+                like, function, shape, dtype=dtype, **kwargs)
 
     args = indices(shape, dtype=dtype)
     return function(*args, **kwargs)
 
 
-_fromfunction_with_like = array_function_dispatch(
-    _fromfunction_dispatcher
-)(fromfunction)
+_fromfunction_with_like = array_function_dispatch()(fromfunction)
 
 
 def _frombuffer(buf, dtype, shape, order):
@@ -2046,7 +2023,7 @@ def binary_repr(num, width=None):
         binary = bin(num)[2:]
         binwidth = len(binary)
         outwidth = (binwidth if width is None
-                    else max(binwidth, width))
+                    else builtins.max(binwidth, width))
         warn_if_insufficient(width, binwidth)
         return binary.zfill(outwidth)
 
@@ -2066,7 +2043,7 @@ def binary_repr(num, width=None):
             binary = bin(twocomp)[2:]
             binwidth = len(binary)
 
-            outwidth = max(binwidth, width)
+            outwidth = builtins.max(binwidth, width)
             warn_if_insufficient(width, binwidth)
             return '1' * (outwidth - binwidth) + binary
 
@@ -2143,10 +2120,6 @@ def _maketup(descr, val):
         return tuple(res)
 
 
-def _identity_dispatcher(n, dtype=None, *, like=None):
-    return (like,)
-
-
 @set_array_function_like_doc
 @set_module('numpy')
 def identity(n, dtype=None, *, like=None):
@@ -2181,15 +2154,13 @@ def identity(n, dtype=None, *, like=None):
 
     """
     if like is not None:
-        return _identity_with_like(n, dtype=dtype, like=like)
+        return _identity_with_like(like, n, dtype=dtype)
 
     from numpy import eye
     return eye(n, dtype=dtype, like=like)
 
 
-_identity_with_like = array_function_dispatch(
-    _identity_dispatcher
-)(identity)
+_identity_with_like = array_function_dispatch()(identity)
 
 
 def _allclose_dispatcher(a, b, rtol=None, atol=None, equal_nan=None):
diff --git a/numpy/core/numerictypes.py b/numpy/core/numerictypes.py
index 43bc2e77b..aea41bc2e 100644
--- a/numpy/core/numerictypes.py
+++ b/numpy/core/numerictypes.py
@@ -47,14 +47,14 @@ Exported symbols include:
      |   |   |     byte
      |   |   |     short
      |   |   |     intc
-     |   |   |     intp            int0
+     |   |   |     intp
      |   |   |     int_
      |   |   |     longlong
      |   |   \\-> unsignedinteger  (uintxx)     (kind=u)
      |   |         ubyte
      |   |         ushort
      |   |         uintc
-     |   |         uintp           uint0
+     |   |         uintp
      |   |         uint_
      |   |         ulonglong
      |   +-> inexact
@@ -82,11 +82,11 @@ Exported symbols include:
 import numbers
 import warnings
 
-from numpy.core.multiarray import (
+from .multiarray import (
         ndarray, array, dtype, datetime_data, datetime_as_string,
         busday_offset, busday_count, is_busday, busdaycalendar
         )
-from numpy.core.overrides import set_module
+from .._utils import set_module
 
 # we add more at the bottom
 __all__ = ['sctypeDict', 'sctypes',
diff --git a/numpy/core/overrides.py b/numpy/core/overrides.py
index 450464f89..6403e65b0 100644
--- a/numpy/core/overrides.py
+++ b/numpy/core/overrides.py
@@ -3,13 +3,13 @@ import collections
 import functools
 import os
 
+from .._utils import set_module
+from .._utils._inspect import getargspec
 from numpy.core._multiarray_umath import (
-    add_docstring, implement_array_function, _get_implementing_args)
-from numpy.compat._inspect import getargspec
+    add_docstring,  _get_implementing_args, _ArrayFunctionDispatcher)
 
 
-ARRAY_FUNCTION_ENABLED = bool(
-    int(os.environ.get('NUMPY_EXPERIMENTAL_ARRAY_FUNCTION', 1)))
+ARRAY_FUNCTIONS = set()
 
 array_function_like_doc = (
     """like : array_like, optional
@@ -30,40 +30,35 @@ def set_array_function_like_doc(public_api):
 
 
 add_docstring(
-    implement_array_function,
+    _ArrayFunctionDispatcher,
     """
-    Implement a function with checks for __array_function__ overrides.
+    Class to wrap functions with checks for __array_function__ overrides.
 
     All arguments are required, and can only be passed by position.
 
     Parameters
     ----------
+    dispatcher : function or None
+        The dispatcher function that returns a single sequence-like object
+        of all arguments relevant.  It must have the same signature (except
+        the default values) as the actual implementation.
+        If ``None``, this is a ``like=`` dispatcher and the
+        ``_ArrayFunctionDispatcher`` must be called with ``like`` as the
+        first (additional and positional) argument.
     implementation : function
-        Function that implements the operation on NumPy array without
-        overrides when called like ``implementation(*args, **kwargs)``.
-    public_api : function
-        Function exposed by NumPy's public API originally called like
-        ``public_api(*args, **kwargs)`` on which arguments are now being
-        checked.
-    relevant_args : iterable
-        Iterable of arguments to check for __array_function__ methods.
-    args : tuple
-        Arbitrary positional arguments originally passed into ``public_api``.
-    kwargs : dict
-        Arbitrary keyword arguments originally passed into ``public_api``.
+        Function that implements the operation on NumPy arrays without
+        overrides.  Arguments passed calling the ``_ArrayFunctionDispatcher``
+        will be forwarded to this (and the ``dispatcher``) as if using
+        ``*args, **kwargs``.
 
-    Returns
-    -------
-    Result from calling ``implementation()`` or an ``__array_function__``
-    method, as appropriate.
-
-    Raises
-    ------
-    TypeError : if no implementation is found.
+    Attributes
+    ----------
+    _implementation : function
+        The original implementation passed in.
     """)
 
 
-# exposed for testing purposes; used internally by implement_array_function
+# exposed for testing purposes; used internally by _ArrayFunctionDispatcher
 add_docstring(
     _get_implementing_args,
     """
@@ -107,25 +102,7 @@ def verify_matching_signatures(implementation, dispatcher):
                                'default argument values')
 
 
-def set_module(module):
-    """Decorator for overriding __module__ on a function or class.
-
-    Example usage::
-
-        @set_module('numpy')
-        def example():
-            pass
-
-        assert example.__module__ == 'numpy'
-    """
-    def decorator(func):
-        if module is not None:
-            func.__module__ = module
-        return func
-    return decorator
-
-
-def array_function_dispatch(dispatcher, module=None, verify=True,
+def array_function_dispatch(dispatcher=None, module=None, verify=True,
                             docs_from_dispatcher=False):
     """Decorator for adding dispatch with the __array_function__ protocol.
 
@@ -133,10 +110,14 @@ def array_function_dispatch(dispatcher, module=None, verify=True,
 
     Parameters
     ----------
-    dispatcher : callable
+    dispatcher : callable or None
         Function that when called like ``dispatcher(*args, **kwargs)`` with
         arguments from the NumPy function call returns an iterable of
         array-like arguments to check for ``__array_function__``.
+
+        If `None`, the first argument is used as the single `like=` argument
+        and not passed on.  A function implementing `like=` must call its
+        dispatcher with `like` as the first non-keyword argument.
     module : str, optional
         __module__ attribute to set on new function, e.g., ``module='numpy'``.
         By default, module is copied from the decorated function.
@@ -156,57 +137,33 @@ def array_function_dispatch(dispatcher, module=None, verify=True,
     Returns
     -------
     Function suitable for decorating the implementation of a NumPy function.
-    """
-
-    if not ARRAY_FUNCTION_ENABLED:
-        def decorator(implementation):
-            if docs_from_dispatcher:
-                add_docstring(implementation, dispatcher.__doc__)
-            if module is not None:
-                implementation.__module__ = module
-            return implementation
-        return decorator
 
+    """
     def decorator(implementation):
         if verify:
-            verify_matching_signatures(implementation, dispatcher)
+            if dispatcher is not None:
+                verify_matching_signatures(implementation, dispatcher)
+            else:
+                # Using __code__ directly similar to verify_matching_signature
+                co = implementation.__code__
+                last_arg = co.co_argcount + co.co_kwonlyargcount - 1
+                last_arg = co.co_varnames[last_arg]
+                if last_arg != "like" or co.co_kwonlyargcount == 0:
+                    raise RuntimeError(
+                        "__array_function__ expects `like=` to be the last "
+                        "argument and a keyword-only argument. "
+                        f"{implementation} does not seem to comply.")
 
         if docs_from_dispatcher:
             add_docstring(implementation, dispatcher.__doc__)
 
-        @functools.wraps(implementation)
-        def public_api(*args, **kwargs):
-            try:
-                relevant_args = dispatcher(*args, **kwargs)
-            except TypeError as exc:
-                # Try to clean up a signature related TypeError.  Such an
-                # error will be something like:
-                #     dispatcher.__name__() got an unexpected keyword argument
-                #
-                # So replace the dispatcher name in this case.  In principle
-                # TypeErrors may be raised from _within_ the dispatcher, so
-                # we check that the traceback contains a string that starts
-                # with the name.  (In principle we could also check the
-                # traceback length, as it would be deeper.)
-                msg = exc.args[0]
-                disp_name = dispatcher.__name__
-                if not isinstance(msg, str) or not msg.startswith(disp_name):
-                    raise
-
-                # Replace with the correct name and re-raise:
-                new_msg = msg.replace(disp_name, public_api.__name__)
-                raise TypeError(new_msg) from None
-
-            return implement_array_function(
-                implementation, public_api, relevant_args, args, kwargs)
-
-        public_api.__code__ = public_api.__code__.replace(
-                co_name=implementation.__name__,
-                co_filename='<__array_function__ internals>')
+        public_api = _ArrayFunctionDispatcher(dispatcher, implementation)
+        public_api = functools.wraps(implementation)(public_api)
+
         if module is not None:
             public_api.__module__ = module
 
-        public_api._implementation = implementation
+        ARRAY_FUNCTIONS.add(public_api)
 
         return public_api
 
diff --git a/numpy/core/records.py b/numpy/core/records.py
index c014bc97c..0fb49e8f7 100644
--- a/numpy/core/records.py
+++ b/numpy/core/records.py
@@ -37,10 +37,10 @@ import warnings
 from collections import Counter
 from contextlib import nullcontext
 
+from .._utils import set_module
 from . import numeric as sb
 from . import numerictypes as nt
 from numpy.compat import os_fspath
-from numpy.core.overrides import set_module
 from .arrayprint import _get_legacy_print_mode
 
 # All of the functions allow formats to be a dtype
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 10b8c093e..8ef012f70 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -20,7 +20,7 @@ from setup_common import *  # noqa: F403
 NPY_RELAXED_STRIDES_CHECKING = (os.environ.get('NPY_RELAXED_STRIDES_CHECKING', "1") != "0")
 if not NPY_RELAXED_STRIDES_CHECKING:
     raise SystemError(
-        "Support for NPY_RELAXED_STRIDES_CHECKING=0 has been remove as of "
+        "Support for NPY_RELAXED_STRIDES_CHECKING=0 has been removed as of "
         "NumPy 1.23.  This error will eventually be removed entirely.")
 
 # Put NPY_RELAXED_STRIDES_DEBUG=1 in the environment if you want numpy to use a
@@ -45,6 +45,8 @@ NPY_DISABLE_SVML = (os.environ.get('NPY_DISABLE_SVML', "0") == "1")
 # in time is only in build. -- Charles Harris, 2013-03-30
 
 class CallOnceOnly:
+    # NOTE: we don't need any of this in the Meson build,
+    # it takes care of caching
     def __init__(self):
         self._check_types = None
         self._check_ieee_macros = None
@@ -58,14 +60,6 @@ class CallOnceOnly:
             out = copy.deepcopy(pickle.loads(self._check_types))
         return out
 
-    def check_ieee_macros(self, *a, **kw):
-        if self._check_ieee_macros is None:
-            out = check_ieee_macros(*a, **kw)
-            self._check_ieee_macros = pickle.dumps(out)
-        else:
-            out = copy.deepcopy(pickle.loads(self._check_ieee_macros))
-        return out
-
     def check_complex(self, *a, **kw):
         if self._check_complex is None:
             out = check_complex(*a, **kw)
@@ -85,11 +79,13 @@ def can_link_svml():
             and "linux" in platform
             and sys.maxsize > 2**31)
 
-def check_svml_submodule(svmlpath):
-    if not os.path.exists(svmlpath + "/README.md"):
-        raise RuntimeError("Missing `SVML` submodule! Run `git submodule "
-                           "update --init` to fix this.")
-    return True
+def check_git_submodules():
+    out = os.popen("git submodule status")
+    modules = out.readlines()
+    for submodule in modules:
+        if (submodule.strip()[0] == "-"):
+            raise RuntimeError("git submodules are not initialized."
+                    "Please run `git submodule update --init` to fix this.")
 
 def pythonlib_dir():
     """return path where libpython* is."""
@@ -177,16 +173,6 @@ def check_math_capabilities(config, ext, moredefs, mathlibs):
         else:
             return 1
 
-    # GH-14787: Work around GCC<8.4 bug when compiling with AVX512
-    # support on Windows-based platforms
-    def check_gh14787(fn):
-        if fn == 'attribute_target_avx512f':
-            if (sys.platform in ('win32', 'cygwin') and
-                    config.check_compiler_gcc() and
-                    not config.check_gcc_version_at_least(8, 4)):
-                ext.extra_compile_args.extend(
-                        ['-ffixed-xmm%s' % n for n in range(16, 32)])
-
     #use_msvc = config.check_decl("_MSC_VER")
     if not check_funcs_once(MANDATORY_FUNCS, add_to_moredefs=False):
         raise SystemError("One of the required function to build numpy is not"
@@ -237,20 +223,8 @@ def check_math_capabilities(config, ext, moredefs, mathlibs):
     for dec, fn in OPTIONAL_FUNCTION_ATTRIBUTES:
         if config.check_gcc_function_attribute(dec, fn):
             moredefs.append((fname2def(fn), 1))
-            check_gh14787(fn)
 
     platform = sysconfig.get_platform()
-    if ("x86_64" in platform):
-        for dec, fn in OPTIONAL_FUNCTION_ATTRIBUTES_AVX:
-            if config.check_gcc_function_attribute(dec, fn):
-                moredefs.append((fname2def(fn), 1))
-                check_gh14787(fn)
-        for dec, fn, code, header in (
-        OPTIONAL_FUNCTION_ATTRIBUTES_WITH_INTRINSICS_AVX):
-            if config.check_gcc_function_attribute_with_intrinsics(
-                    dec, fn, code, header):
-                moredefs.append((fname2def(fn), 1))
-
     for fn in OPTIONAL_VARIABLE_ATTRIBUTES:
         if config.check_gcc_variable_attribute(fn):
             m = fn.replace("(", "_").replace(")", "_")
@@ -289,43 +263,6 @@ def check_complex(config, mathlibs):
 
     return priv, pub
 
-def check_ieee_macros(config):
-    priv = []
-    pub = []
-
-    macros = []
-
-    def _add_decl(f):
-        priv.append(fname2def("decl_%s" % f))
-        pub.append('NPY_%s' % fname2def("decl_%s" % f))
-
-    # XXX: hack to circumvent cpp pollution from python: python put its
-    # config.h in the public namespace, so we have a clash for the common
-    # functions we test. We remove every function tested by python's
-    # autoconf, hoping their own test are correct
-    _macros = ["isnan", "isinf", "signbit", "isfinite"]
-    for f in _macros:
-        py_symbol = fname2def("decl_%s" % f)
-        already_declared = config.check_decl(py_symbol,
-                headers=["Python.h", "math.h"])
-        if already_declared:
-            if config.check_macro_true(py_symbol,
-                    headers=["Python.h", "math.h"]):
-                pub.append('NPY_%s' % fname2def("decl_%s" % f))
-        else:
-            macros.append(f)
-    # Normally, isnan and isinf are macro (C99), but some platforms only have
-    # func, or both func and macro version. Check for macro only, and define
-    # replacement ones if not found.
-    # Note: including Python.h is necessary because it modifies some math.h
-    # definitions
-    for f in macros:
-        st = config.check_decl(f, headers=["Python.h", "math.h"])
-        if st:
-            _add_decl(f)
-
-    return priv, pub
-
 def check_types(config_cmd, ext, build_dir):
     private_defines = []
     public_defines = []
@@ -427,6 +364,8 @@ def check_types(config_cmd, ext, build_dir):
 
     return private_defines, public_defines
 
+# NOTE: this isn't needed in the Meson build,
+#       and we won't support a MATHLIB env var
 def check_mathlib(config_cmd):
     # Testing the C math library
     mathlibs = []
@@ -466,7 +405,6 @@ def configuration(parent_package='',top_path=None):
                                            exec_mod_from_location)
     from numpy.distutils.system_info import (get_info, blas_opt_info,
                                              lapack_opt_info)
-    from numpy.distutils.ccompiler_opt import NPY_CXX_FLAGS
     from numpy.version import release as is_released
 
     config = Configuration('core', parent_package, top_path)
@@ -477,6 +415,8 @@ def configuration(parent_package='',top_path=None):
     # actual C API VERSION. Will raise a MismatchCAPIError if so.
     check_api_version(C_API_VERSION, codegen_dir)
 
+    check_git_submodules()
+
     generate_umath_py = join(codegen_dir, 'generate_umath.py')
     n = dot_join(config.name, 'generate_umath')
     generate_umath = exec_mod_from_location('_'.join(n.split('.')),
@@ -504,12 +444,11 @@ def configuration(parent_package='',top_path=None):
             moredefs.append(('MATHLIB', ','.join(mathlibs)))
 
             check_math_capabilities(config_cmd, ext, moredefs, mathlibs)
-            moredefs.extend(cocache.check_ieee_macros(config_cmd)[0])
             moredefs.extend(cocache.check_complex(config_cmd, mathlibs)[0])
 
             # Signal check
             if is_npy_no_signal():
-                moredefs.append('__NPY_PRIVATE_NO_SIGNAL')
+                moredefs.append('NPY_NO_SIGNAL')
 
             # Windows checks
             if sys.platform == 'win32' or os.name == 'nt':
@@ -623,7 +562,6 @@ def configuration(parent_package='',top_path=None):
                 moredefs.append(('NPY_NO_SMP', 0))
 
             mathlibs = check_mathlib(config_cmd)
-            moredefs.extend(cocache.check_ieee_macros(config_cmd)[1])
             moredefs.extend(cocache.check_complex(config_cmd, mathlibs)[1])
 
             if NPY_RELAXED_STRIDES_DEBUG:
@@ -671,11 +609,11 @@ def configuration(parent_package='',top_path=None):
             try:
                 m = __import__(module_name)
                 log.info('executing %s', script)
-                h_file, c_file, doc_file = m.generate_api(os.path.join(build_dir, header_dir))
+                h_file, c_file = m.generate_api(os.path.join(build_dir, header_dir))
             finally:
                 del sys.path[0]
             config.add_data_files((header_dir, h_file),
-                                  (header_dir, doc_file))
+                                 )
             return (h_file,)
         return generate_api
 
@@ -704,8 +642,7 @@ def configuration(parent_package='',top_path=None):
 
     config.numpy_include_dirs.extend(config.paths('include'))
 
-    deps = [join('src', 'npymath', '_signbit.c'),
-            join('include', 'numpy', '*object.h'),
+    deps = [join('include', 'numpy', '*object.h'),
             join(codegen_dir, 'genapi.py'),
             ]
 
@@ -720,44 +657,6 @@ def configuration(parent_package='',top_path=None):
         # but we cannot use add_installed_pkg_config here either, so we only
         # update the substitution dictionary during npymath build
         config_cmd = config.get_config_cmd()
-        # Check that the toolchain works, to fail early if it doesn't
-        # (avoid late errors with MATHLIB which are confusing if the
-        # compiler does not work).
-        for lang, test_code, note in (
-            ('c', 'int main(void) { return 0;}', ''),
-            ('c++', (
-                    'int main(void)'
-                    '{ auto x = 0.0; return static_cast<int>(x); }'
-                ), (
-                    'note: A compiler with support for C++11 language '
-                    'features is required.'
-                )
-             ),
-        ):
-            is_cpp = lang == 'c++'
-            if is_cpp:
-                # this a workaround to get rid of invalid c++ flags
-                # without doing big changes to config.
-                # c tested first, compiler should be here
-                bk_c = config_cmd.compiler
-                config_cmd.compiler = bk_c.cxx_compiler()
-
-                # Check that Linux compiler actually support the default flags
-                if hasattr(config_cmd.compiler, 'compiler'):
-                    config_cmd.compiler.compiler.extend(NPY_CXX_FLAGS)
-                    config_cmd.compiler.compiler_so.extend(NPY_CXX_FLAGS)
-
-            st = config_cmd.try_link(test_code, lang=lang)
-            if not st:
-                # rerun the failing command in verbose mode
-                config_cmd.compiler.verbose = True
-                config_cmd.try_link(test_code, lang=lang)
-                raise RuntimeError(
-                    f"Broken toolchain: cannot link a simple {lang.upper()} "
-                    f"program. {note}"
-                )
-            if is_cpp:
-                config_cmd.compiler = bk_c
         mlibs = check_mathlib(config_cmd)
 
         posix_mlib = ' '.join(['-l%s' % l for l in mlibs])
@@ -770,7 +669,7 @@ def configuration(parent_package='',top_path=None):
                        # join('src', 'npymath', 'ieee754.cpp'),
                        join('src', 'npymath', 'ieee754.c.src'),
                        join('src', 'npymath', 'npy_math_complex.c.src'),
-                       join('src', 'npymath', 'halffloat.c')
+                       join('src', 'npymath', 'halffloat.cpp'),
                        ]
 
     config.add_installed_library('npymath',
@@ -828,7 +727,8 @@ def configuration(parent_package='',top_path=None):
             join('src', 'common', 'numpyos.h'),
             join('src', 'common', 'npy_cpu_dispatch.h'),
             join('src', 'common', 'simd', 'simd.h'),
-            ]
+            join('src', 'common', 'common.hpp'),
+        ]
 
     common_src = [
             join('src', 'common', 'array_assign.c'),
@@ -882,6 +782,7 @@ def configuration(parent_package='',top_path=None):
             join('src', 'multiarray', 'descriptor.h'),
             join('src', 'multiarray', 'dtypemeta.h'),
             join('src', 'multiarray', 'dtype_transfer.h'),
+            join('src', 'multiarray', 'dtype_traversal.h'),
             join('src', 'multiarray', 'dragon4.h'),
             join('src', 'multiarray', 'einsum_debug.h'),
             join('src', 'multiarray', 'einsum_sumprod.h'),
@@ -955,6 +856,7 @@ def configuration(parent_package='',top_path=None):
             join('src', 'multiarray', 'dtypemeta.c'),
             join('src', 'multiarray', 'dragon4.c'),
             join('src', 'multiarray', 'dtype_transfer.c'),
+            join('src', 'multiarray', 'dtype_traversal.c'),
             join('src', 'multiarray', 'einsum.c.src'),
             join('src', 'multiarray', 'einsum_sumprod.c.src'),
             join('src', 'multiarray', 'experimental_public_dtype_api.c'),
@@ -984,7 +886,6 @@ def configuration(parent_package='',top_path=None):
             join('src', 'multiarray', 'usertypes.c'),
             join('src', 'multiarray', 'vdot.c'),
             join('src', 'common', 'npy_sort.h.src'),
-            join('src', 'npysort', 'x86-qsort.dispatch.cpp'),
             join('src', 'npysort', 'quicksort.cpp'),
             join('src', 'npysort', 'mergesort.cpp'),
             join('src', 'npysort', 'timsort.cpp'),
@@ -1002,6 +903,12 @@ def configuration(parent_package='',top_path=None):
             join('src', 'multiarray', 'textreading', 'stream_pyobject.c'),
             join('src', 'multiarray', 'textreading', 'str_to_int.c'),
             join('src', 'multiarray', 'textreading', 'tokenize.cpp'),
+            # Remove this once scipy macos arm64 build correctly
+            # links to the arm64 npymath library,
+            # see gh-22673
+            join('src', 'npymath', 'arm64_exports.c'),
+            join('src', 'npysort', 'simd_qsort.dispatch.cpp'),
+            join('src', 'npysort', 'simd_qsort_16bit.dispatch.cpp'),
             ]
 
     #######################################################################
@@ -1039,13 +946,15 @@ def configuration(parent_package='',top_path=None):
             join('src', 'umath', 'umathmodule.c'),
             join('src', 'umath', 'reduction.c'),
             join('src', 'umath', 'funcs.inc.src'),
-            join('src', 'umath', 'simd.inc.src'),
             join('src', 'umath', 'loops.h.src'),
             join('src', 'umath', 'loops_utils.h.src'),
             join('src', 'umath', 'loops.c.src'),
+            join('src', 'umath', 'loops_unary.dispatch.c.src'),
             join('src', 'umath', 'loops_unary_fp.dispatch.c.src'),
+            join('src', 'umath', 'loops_unary_fp_le.dispatch.c.src'),
             join('src', 'umath', 'loops_arithm_fp.dispatch.c.src'),
             join('src', 'umath', 'loops_arithmetic.dispatch.c.src'),
+            join('src', 'umath', 'loops_logical.dispatch.c.src'),
             join('src', 'umath', 'loops_minmax.dispatch.c.src'),
             join('src', 'umath', 'loops_trigonometric.dispatch.c.src'),
             join('src', 'umath', 'loops_umath_fp.dispatch.c.src'),
@@ -1053,6 +962,8 @@ def configuration(parent_package='',top_path=None):
             join('src', 'umath', 'loops_hyperbolic.dispatch.c.src'),
             join('src', 'umath', 'loops_modulo.dispatch.c.src'),
             join('src', 'umath', 'loops_comparison.dispatch.c.src'),
+            join('src', 'umath', 'loops_unary_complex.dispatch.c.src'),
+            join('src', 'umath', 'loops_autovec.dispatch.c.src'),
             join('src', 'umath', 'matmul.h.src'),
             join('src', 'umath', 'matmul.c.src'),
             join('src', 'umath', 'clip.h'),
@@ -1077,7 +988,6 @@ def configuration(parent_package='',top_path=None):
             join('src', 'multiarray', 'common.h'),
             join('src', 'multiarray', 'number.h'),
             join('src', 'common', 'templ_common.h.src'),
-            join('src', 'umath', 'simd.inc.src'),
             join('src', 'umath', 'override.h'),
             join(codegen_dir, 'generate_ufunc_api.py'),
             join(codegen_dir, 'ufunc_docstrings.py'),
@@ -1092,7 +1002,7 @@ def configuration(parent_package='',top_path=None):
     # after all maintainable code.
     svml_filter = (
     )
-    if can_link_svml() and check_svml_submodule(svml_path):
+    if can_link_svml():
         svml_objs = glob.glob(svml_path + '/**/*.s', recursive=True)
         svml_objs = [o for o in svml_objs if not o.endswith(svml_filter)]
 
@@ -1101,9 +1011,6 @@ def configuration(parent_package='',top_path=None):
         svml_objs.sort()
 
     config.add_extension('_multiarray_umath',
-                         # Forcing C language even though we have C++ sources.
-                         # It forces the C linker and don't link C++ runtime.
-                         language = 'c',
                          sources=multiarray_src + umath_src +
                                  common_src +
                                  [generate_config_h,
@@ -1119,8 +1026,7 @@ def configuration(parent_package='',top_path=None):
                                 common_deps,
                          libraries=['npymath'],
                          extra_objects=svml_objs,
-                         extra_info=extra_info,
-                         extra_cxx_compile_args=NPY_CXX_FLAGS)
+                         extra_info=extra_info)
 
     #######################################################################
     #                        umath_tests module                           #
@@ -1158,23 +1064,26 @@ def configuration(parent_package='',top_path=None):
     #                        SIMD module                                  #
     #######################################################################
 
-    config.add_extension('_simd', sources=[
-        join('src', 'common', 'npy_cpu_features.c'),
-        join('src', '_simd', '_simd.c'),
-        join('src', '_simd', '_simd_inc.h.src'),
-        join('src', '_simd', '_simd_data.inc.src'),
-        join('src', '_simd', '_simd.dispatch.c.src'),
-    ], depends=[
-        join('src', 'common', 'npy_cpu_dispatch.h'),
-        join('src', 'common', 'simd', 'simd.h'),
-        join('src', '_simd', '_simd.h'),
-        join('src', '_simd', '_simd_inc.h.src'),
-        join('src', '_simd', '_simd_data.inc.src'),
-        join('src', '_simd', '_simd_arg.inc'),
-        join('src', '_simd', '_simd_convert.inc'),
-        join('src', '_simd', '_simd_easyintrin.inc'),
-        join('src', '_simd', '_simd_vector.inc'),
-    ])
+    config.add_extension('_simd',
+        sources=[
+            join('src', 'common', 'npy_cpu_features.c'),
+            join('src', '_simd', '_simd.c'),
+            join('src', '_simd', '_simd_inc.h.src'),
+            join('src', '_simd', '_simd_data.inc.src'),
+            join('src', '_simd', '_simd.dispatch.c.src'),
+        ], depends=[
+            join('src', 'common', 'npy_cpu_dispatch.h'),
+            join('src', 'common', 'simd', 'simd.h'),
+            join('src', '_simd', '_simd.h'),
+            join('src', '_simd', '_simd_inc.h.src'),
+            join('src', '_simd', '_simd_data.inc.src'),
+            join('src', '_simd', '_simd_arg.inc'),
+            join('src', '_simd', '_simd_convert.inc'),
+            join('src', '_simd', '_simd_easyintrin.inc'),
+            join('src', '_simd', '_simd_vector.inc'),
+        ],
+        libraries=['npymath']
+    )
 
     config.add_subpackage('tests')
     config.add_data_dir('tests/data')
diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py
index 085c0baf5..b5bc0dec1 100644
--- a/numpy/core/setup_common.py
+++ b/numpy/core/setup_common.py
@@ -48,7 +48,13 @@ C_ABI_VERSION = 0x01000009
 # 0x0000000f - 1.22.x
 # 0x00000010 - 1.23.x
 # 0x00000010 - 1.24.x
-C_API_VERSION = 0x00000010
+# 0x00000011 - 1.25.x
+C_API_VERSION = 0x00000011
+
+# By default, when compiling downstream libraries against NumPy,```
+# pick an older feature version.  For example, for 1.25.x we default to the
+# 1.19 API and support going back all the way to 1.15.x (if so desired).
+# This is set up in `numpyconfig.h`.
 
 class MismatchCAPIError(ValueError):
     pass
@@ -91,6 +97,10 @@ def check_api_version(apiversion, codegen_dir):
                f"checksum in core/codegen_dir/cversions.txt is {api_hash}. If "
                "functions were added in the C API, you have to update "
                f"C_API_VERSION in {__file__}."
+               "\n"
+               "Please make sure that new additions are guarded with "
+               "MinVersion() to make them unavailable when wishing support "
+               "for older NumPy versions."
                )
         raise MismatchCAPIError(msg)
 
@@ -129,10 +139,9 @@ MANDATORY_FUNCS = [
     "floor", "ceil", "sqrt", "log10", "log", "exp", "asin",
     "acos", "atan", "fmod", 'modf', 'frexp', 'ldexp',
     "expm1", "log1p", "acosh", "asinh", "atanh",
-    "rint", "trunc", "exp2", 
+    "rint", "trunc", "exp2",
     "copysign", "nextafter", "strtoll", "strtoull", "cbrt",
     "log2", "pow", "hypot", "atan2",
-    "csin", "csinh", "ccos", "ccosh", "ctan", "ctanh",
     "creal", "cimag", "conj"
 ]
 
@@ -154,6 +163,9 @@ C99_COMPLEX_TYPES = [
 C99_COMPLEX_FUNCS = [
     "cabs", "cacos", "cacosh", "carg", "casin", "casinh", "catan",
     "catanh", "cexp", "clog", "cpow", "csqrt",
+    # The long double variants (like csinl)  should be mandatory on C11,
+    # but are missing in FreeBSD. Issue gh-22850
+    "csin", "csinh", "ccos", "ccosh", "ctan", "ctanh",
     ]
 
 OPTIONAL_HEADERS = [
@@ -178,26 +190,10 @@ OPTIONAL_INTRINSICS = [("__builtin_isnan", '5.'),
                        ("__builtin_bswap32", '5u'),
                        ("__builtin_bswap64", '5u'),
                        ("__builtin_expect", '5, 0'),
-                       ("__builtin_mul_overflow", '5, 5, (int*)5'),
-                       # MMX only needed for icc, but some clangs don't have it
-                       ("_m_from_int64", '0', "emmintrin.h"),
-                       ("_mm_load_ps", '(float*)0', "xmmintrin.h"),  # SSE
-                       ("_mm_prefetch", '(float*)0, _MM_HINT_NTA',
-                        "xmmintrin.h"),  # SSE
-                       ("_mm_load_pd", '(double*)0', "emmintrin.h"),  # SSE2
+                       # Test `long long` for arm+clang 13 (gh-22811,
+                       # but we use all versions of __builtin_mul_overflow):
+                       ("__builtin_mul_overflow", '(long long)5, 5, (int*)5'),
                        ("__builtin_prefetch", "(float*)0, 0, 3"),
-                       # check that the linker can handle avx
-                       ("__asm__ volatile", '"vpand %xmm1, %xmm2, %xmm3"',
-                        "stdio.h", "LINK_AVX"),
-                       ("__asm__ volatile", '"vpand %ymm1, %ymm2, %ymm3"',
-                        "stdio.h", "LINK_AVX2"),
-                       ("__asm__ volatile", '"vpaddd %zmm1, %zmm2, %zmm3"',
-                        "stdio.h", "LINK_AVX512F"),
-                       ("__asm__ volatile", '"vfpclasspd $0x40, %zmm15, %k6\\n"\
-                                             "vmovdqu8 %xmm0, %xmm1\\n"\
-                                             "vpbroadcastmb2q %k0, %xmm0\\n"',
-                        "stdio.h", "LINK_AVX512_SKX"),
-                       ("__asm__ volatile", '"xgetbv"', "stdio.h", "XGETBV"),
                        ]
 
 # function attributes
@@ -212,44 +208,6 @@ OPTIONAL_FUNCTION_ATTRIBUTES = [('__attribute__((optimize("unroll-loops")))',
                                 ('__attribute__((nonnull (1)))',
                                  'attribute_nonnull'),
                                 ]
-
-OPTIONAL_FUNCTION_ATTRIBUTES_AVX = [('__attribute__((target ("avx")))',
-    'attribute_target_avx'),
-    ('__attribute__((target ("avx2")))',
-    'attribute_target_avx2'),
-    ('__attribute__((target ("avx512f")))',
-    'attribute_target_avx512f'),
-    ('__attribute__((target ("avx512f,avx512dq,avx512bw,avx512vl,avx512cd")))',
-    'attribute_target_avx512_skx'),
-    ]
-
-# function attributes with intrinsics
-# To ensure your compiler can compile avx intrinsics with just the attributes
-# gcc 4.8.4 support attributes but not with intrisics
-# tested via "#include<%s> int %s %s(void *){code; return 0;};" % (header, attribute, name, code)
-# function name will be converted to HAVE_<upper-case-name> preprocessor macro
-# The _mm512_castps_si512 instruction is specific check for AVX-512F support
-# in gcc-4.9 which is missing a subset of intrinsics. See
-# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61878
-OPTIONAL_FUNCTION_ATTRIBUTES_WITH_INTRINSICS_AVX = [
-    ('__attribute__((target("avx2,fma")))',
-    'attribute_target_avx2_with_intrinsics',
-    '__m256 temp = _mm256_set1_ps(1.0); temp = \
-    _mm256_fmadd_ps(temp, temp, temp)',
-    'immintrin.h'),
-    ('__attribute__((target("avx512f")))',
-    'attribute_target_avx512f_with_intrinsics',
-    '__m512i temp = _mm512_castps_si512(_mm512_set1_ps(1.0))',
-    'immintrin.h'),
-    ('__attribute__((target ("avx512f,avx512dq,avx512bw,avx512vl,avx512cd")))',
-    'attribute_target_avx512_skx_with_intrinsics',
-    '__mmask8 temp = _mm512_fpclass_pd_mask(_mm512_set1_pd(1.0), 0x01);\
-    __m512i unused_temp = \
-        _mm512_castps_si512(_mm512_set1_ps(1.0));\
-    _mm_mask_storeu_epi8(NULL, 0xFF, _mm_broadcastmb_epi64(temp))',
-    'immintrin.h'),
-    ]
-
 def fname2def(name):
     return "HAVE_%s" % name.upper()
 
diff --git a/numpy/core/shape_base.py b/numpy/core/shape_base.py
index c5e0ad475..250fffd42 100644
--- a/numpy/core/shape_base.py
+++ b/numpy/core/shape_base.py
@@ -204,19 +204,15 @@ def atleast_3d(*arys):
         return res
 
 
-def _arrays_for_stack_dispatcher(arrays, stacklevel=4):
-    if not hasattr(arrays, '__getitem__') and hasattr(arrays, '__iter__'):
-        warnings.warn('arrays to stack must be passed as a "sequence" type '
-                      'such as list or tuple. Support for non-sequence '
-                      'iterables such as generators is deprecated as of '
-                      'NumPy 1.16 and will raise an error in the future.',
-                      FutureWarning, stacklevel=stacklevel)
-        return ()
-    return arrays
+def _arrays_for_stack_dispatcher(arrays):
+    if not hasattr(arrays, "__getitem__"):
+        raise TypeError('arrays to stack must be passed as a "sequence" type '
+                        'such as list or tuple.')
+
+    return tuple(arrays)
 
 
-def _vhstack_dispatcher(tup, *, 
-                        dtype=None, casting=None):
+def _vhstack_dispatcher(tup, *, dtype=None, casting=None):
     return _arrays_for_stack_dispatcher(tup)
 
 
@@ -234,6 +230,8 @@ def vstack(tup, *, dtype=None, casting="same_kind"):
     and r/g/b channels (third axis). The functions `concatenate`, `stack` and
     `block` provide more general stacking and concatenation operations.
 
+    ``np.row_stack`` is an alias for `vstack`. They are the same function.
+
     Parameters
     ----------
     tup : sequence of ndarrays
@@ -285,9 +283,6 @@ def vstack(tup, *, dtype=None, casting="same_kind"):
            [6]])
 
     """
-    if not overrides.ARRAY_FUNCTION_ENABLED:
-        # raise warning if necessary
-        _arrays_for_stack_dispatcher(tup, stacklevel=2)
     arrs = atleast_2d(*tup)
     if not isinstance(arrs, list):
         arrs = [arrs]
@@ -354,10 +349,6 @@ def hstack(tup, *, dtype=None, casting="same_kind"):
            [3, 6]])
 
     """
-    if not overrides.ARRAY_FUNCTION_ENABLED:
-        # raise warning if necessary
-        _arrays_for_stack_dispatcher(tup, stacklevel=2)
-
     arrs = atleast_1d(*tup)
     if not isinstance(arrs, list):
         arrs = [arrs]
@@ -370,7 +361,7 @@ def hstack(tup, *, dtype=None, casting="same_kind"):
 
 def _stack_dispatcher(arrays, axis=None, out=None, *,
                       dtype=None, casting=None):
-    arrays = _arrays_for_stack_dispatcher(arrays, stacklevel=6)
+    arrays = _arrays_for_stack_dispatcher(arrays)
     if out is not None:
         # optimize for the typical case where only arrays is provided
         arrays = list(arrays)
@@ -449,10 +440,6 @@ def stack(arrays, axis=0, out=None, *, dtype=None, casting="same_kind"):
            [3, 6]])
 
     """
-    if not overrides.ARRAY_FUNCTION_ENABLED:
-        # raise warning if necessary
-        _arrays_for_stack_dispatcher(arrays, stacklevel=2)
-
     arrays = [asanyarray(arr) for arr in arrays]
     if not arrays:
         raise ValueError('need at least one array to stack')
diff --git a/numpy/core/shape_base.pyi b/numpy/core/shape_base.pyi
index 82541b55b..10116f1ee 100644
--- a/numpy/core/shape_base.pyi
+++ b/numpy/core/shape_base.pyi
@@ -2,7 +2,13 @@ from collections.abc import Sequence
 from typing import TypeVar, overload, Any, SupportsIndex
 
 from numpy import generic, _CastingKind
-from numpy._typing import ArrayLike, NDArray, _ArrayLike, DTypeLike
+from numpy._typing import (
+    NDArray,
+    ArrayLike,
+    DTypeLike,
+    _ArrayLike,
+    _DTypeLike,
+)
 
 _SCT = TypeVar("_SCT", bound=generic)
 _ArrayType = TypeVar("_ArrayType", bound=NDArray[Any])
@@ -34,29 +40,43 @@ def atleast_3d(*arys: ArrayLike) -> list[NDArray[Any]]: ...
 def vstack(
     tup: Sequence[_ArrayLike[_SCT]],
     *,
-    dtype: DTypeLike  = ..., 
+    dtype: None = ...,
     casting: _CastingKind = ...
 ) -> NDArray[_SCT]: ...
 @overload
 def vstack(
     tup: Sequence[ArrayLike],
     *,
-    dtype: DTypeLike  = ..., 
+    dtype: _DTypeLike[_SCT],
+    casting: _CastingKind = ...
+) -> NDArray[_SCT]: ...
+@overload
+def vstack(
+    tup: Sequence[ArrayLike],
+    *,
+    dtype: DTypeLike = ...,
     casting: _CastingKind = ...
 ) -> NDArray[Any]: ...
 
 @overload
 def hstack(
-    tup: Sequence[_ArrayLike[_SCT]], 
+    tup: Sequence[_ArrayLike[_SCT]],
+    *,
+    dtype: None = ...,
+    casting: _CastingKind = ...
+) -> NDArray[_SCT]: ...
+@overload
+def hstack(
+    tup: Sequence[ArrayLike],
     *,
-    dtype: DTypeLike  = ..., 
+    dtype: _DTypeLike[_SCT],
     casting: _CastingKind = ...
 ) -> NDArray[_SCT]: ...
 @overload
 def hstack(
-    tup: Sequence[ArrayLike], 
+    tup: Sequence[ArrayLike],
     *,
-    dtype: DTypeLike  = ..., 
+    dtype: DTypeLike = ...,
     casting: _CastingKind = ...
 ) -> NDArray[Any]: ...
 
@@ -64,9 +84,18 @@ def hstack(
 def stack(
     arrays: Sequence[_ArrayLike[_SCT]],
     axis: SupportsIndex = ...,
-    out: None  = ...,
+    out: None = ...,
     *,
-    dtype: DTypeLike = ...,
+    dtype: None = ...,
+    casting: _CastingKind = ...
+) -> NDArray[_SCT]: ...
+@overload
+def stack(
+    arrays: Sequence[ArrayLike],
+    axis: SupportsIndex = ...,
+    out: None = ...,
+    *,
+    dtype: _DTypeLike[_SCT],
     casting: _CastingKind = ...
 ) -> NDArray[_SCT]: ...
 @overload
@@ -75,7 +104,7 @@ def stack(
     axis: SupportsIndex = ...,
     out: None = ...,
     *,
-    dtype: DTypeLike  = ...,
+    dtype: DTypeLike = ...,
     casting: _CastingKind = ...
 ) -> NDArray[Any]: ...
 @overload
@@ -84,7 +113,7 @@ def stack(
     axis: SupportsIndex = ...,
     out: _ArrayType = ...,
     *,
-    dtype: DTypeLike  = ...,
+    dtype: DTypeLike = ...,
     casting: _CastingKind = ...
 ) -> _ArrayType: ...
 
diff --git a/numpy/core/src/_simd/_simd.c b/numpy/core/src/_simd/_simd.c
index b1fdd4478..52b66e765 100644
--- a/numpy/core/src/_simd/_simd.c
+++ b/numpy/core/src/_simd/_simd.c
@@ -1,11 +1,33 @@
 #include "_simd.h"
 
+#include "numpy/npy_math.h"
+
+static PyObject *
+get_floatstatus(PyObject* NPY_UNUSED(self), PyObject *NPY_UNUSED(args))
+{
+    return PyLong_FromLong(npy_get_floatstatus());
+}
+
+static PyObject *
+clear_floatstatus(PyObject* NPY_UNUSED(self), PyObject *NPY_UNUSED(args))
+{
+    npy_clear_floatstatus();
+    Py_RETURN_NONE;
+}
+
+static PyMethodDef _simd_methods[] = {
+    {"get_floatstatus", get_floatstatus, METH_NOARGS, NULL},
+    {"clear_floatstatus", clear_floatstatus, METH_NOARGS, NULL},
+    {NULL, NULL, 0, NULL}
+};
+
 PyMODINIT_FUNC PyInit__simd(void)
 {
     static struct PyModuleDef defs = {
         .m_base = PyModuleDef_HEAD_INIT,
         .m_name = "numpy.core._simd",
-        .m_size = -1
+        .m_size = -1,
+        .m_methods = _simd_methods
     };
     if (npy_cpu_init() < 0) {
         return NULL;
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src
index b6af8e6a9..f532c9e02 100644
--- a/numpy/core/src/_simd/_simd.dispatch.c.src
+++ b/numpy/core/src/_simd/_simd.dispatch.c.src
@@ -42,23 +42,26 @@
  */
 SIMD_IMPL_INTRIN_1(@intrin@_@sfx@, v@sfx@, q@sfx@)
 /**end repeat1**/
+SIMD_IMPL_INTRIN_1(load_@sfx@x2, v@sfx@x2, q@sfx@)
+
 /**begin repeat1
- * # intrin = store, storea, stores, storel, storeh#
+ * # intrin = store, storea, stores, storel, storeh, store#
+ * # x = ,,,,, x2#
  */
 // special definition due to the nature of @intrin@
 static PyObject *
-simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
+simd__intrin_@intrin@_@sfx@@x@(PyObject* NPY_UNUSED(self), PyObject *args)
 {
     simd_arg seq_arg = {.dtype = simd_data_q@sfx@};
-    simd_arg vec_arg = {.dtype = simd_data_v@sfx@};
+    simd_arg vec_arg = {.dtype = simd_data_v@sfx@@x@};
     if (!PyArg_ParseTuple(
-        args, "O&O&:@intrin@_@sfx@",
+        args, "O&O&:@intrin@_@sfx@@x@",
         simd_arg_converter, &seq_arg,
         simd_arg_converter, &vec_arg
     )) {
         return NULL;
     }
-    npyv_@intrin@_@sfx@(seq_arg.data.q@sfx@, vec_arg.data.v@sfx@);
+    npyv_@intrin@_@sfx@@x@(seq_arg.data.q@sfx@, vec_arg.data.v@sfx@@x@);
     // write-back
     if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.q@sfx@, simd_data_q@sfx@)) {
         simd_arg_free(&seq_arg);
@@ -76,23 +79,35 @@ simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
 // Partial Load
 SIMD_IMPL_INTRIN_3(load_till_@sfx@, v@sfx@, q@sfx@, u32, @sfx@)
 SIMD_IMPL_INTRIN_2(load_tillz_@sfx@, v@sfx@, q@sfx@, u32)
+#if @size@ == 32
+    SIMD_IMPL_INTRIN_4(load2_till_@sfx@, v@sfx@, q@sfx@, u32, @sfx@, @sfx@)
+    SIMD_IMPL_INTRIN_2(load2_tillz_@sfx@, v@sfx@, q@sfx@, u32)
+#else
+    SIMD_IMPL_INTRIN_4(load2_till_@sfx@, v@sfx@, q@sfx@, u32, @sfx@, @sfx@)
+    SIMD_IMPL_INTRIN_2(load2_tillz_@sfx@, v@sfx@, q@sfx@, u32)
+#endif
 
 // Partial Store
+/**begin repeat1
+ * #intrin = store_till, store2_till, store2_till#
+ * #chksize= 0,          32,           64#
+ */
+#if !@chksize@ || @chksize@ == @size@
 static PyObject *
-simd__intrin_store_till_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
+simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
 {
     simd_arg seq_arg = {.dtype = simd_data_q@sfx@};
     simd_arg nlane_arg = {.dtype = simd_data_u32};
     simd_arg vec_arg = {.dtype = simd_data_v@sfx@};
     if (!PyArg_ParseTuple(
-        args, "O&O&O&:store_till_@sfx@",
+        args, "O&O&O&:@intrin@_@sfx@",
         simd_arg_converter, &seq_arg,
         simd_arg_converter, &nlane_arg,
         simd_arg_converter, &vec_arg
     )) {
         return NULL;
     }
-    npyv_store_till_@sfx@(
+    npyv_@intrin@_@sfx@(
         seq_arg.data.q@sfx@, nlane_arg.data.u32, vec_arg.data.v@sfx@
     );
     // write-back
@@ -103,14 +118,22 @@ simd__intrin_store_till_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
     simd_arg_free(&seq_arg);
     Py_RETURN_NONE;
 }
+#endif // chksize
 
+/**end repeat1**/
 // Non-contiguous Load
 /**begin repeat1
- * #intrin = loadn, loadn_till, loadn_tillz#
- * #till   = 0,     1,          1#
- * #fill   = 0,     1,          0#
- * #format = ,    O&O&,         O&#
- */
+ * #intrin = loadn,       loadn2,       loadn2,
+ *           loadn_till,  loadn2_till,  loadn2_till,
+ *           loadn_tillz, loadn2_tillz, loadn2_tillz#
+ * #scale  = 1,2,2,       1,2,2,         1,2,2#
+ * #till   = 0*3,         1*3,           1*3#
+ * #fill   = 0*3,         1*3,           0*3#
+ # #fill2  = 0*3,         0,1,1,         0*3#
+ * #format = ,,,          O&O&, O&O&O&*2,O&*3#
+ * #chksize= 0,32,64,     0,32,64,       0,32,64#
+ */
+#if !@chksize@ || @chksize@ == @size@
 static PyObject *
 simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
 {
@@ -122,6 +145,9 @@ simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
 #if @fill@
     simd_arg fill_arg = {.dtype = simd_data_@sfx@};
 #endif
+#if @fill2@
+    simd_arg fill2_arg = {.dtype = simd_data_@sfx@};
+#endif
     if (!PyArg_ParseTuple(
         args, "@format@O&O&:@intrin@_@sfx@",
         simd_arg_converter, &seq_arg,
@@ -132,6 +158,9 @@ simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
 #if @fill@
         ,simd_arg_converter, &fill_arg
 #endif
+#if @fill2@
+        ,simd_arg_converter, &fill2_arg
+#endif
     )) {
         return NULL;
     }
@@ -140,7 +169,7 @@ simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
     Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
     Py_ssize_t min_seq_len = stride * npyv_nlanes_@sfx@;
     if (stride < 0) {
-        seq_ptr += cur_seq_len -1;
+        seq_ptr += cur_seq_len - 1 * @scale@;
         min_seq_len = -min_seq_len;
     }
     if (cur_seq_len < min_seq_len) {
@@ -159,6 +188,9 @@ simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
     #if @fill@
         , fill_arg.data.@sfx@
     #endif
+    #if @fill2@
+        , fill2_arg.data.@sfx@
+    #endif
     );
     simd_arg ret = {
         .dtype = simd_data_v@sfx@, .data = {.v@sfx@=rvec}
@@ -169,14 +201,19 @@ err:
     simd_arg_free(&seq_arg);
     return NULL;
 }
+#endif // chksize
 /**end repeat1**/
 
 // Non-contiguous Store
 /**begin repeat1
- * #intrin = storen, storen_till#
- * #till   = 0,      1#
- * #format = ,       O&#
+ * #intrin = storen,      storen2,      storen2,
+             storen_till, storen2_till, storen2_till#
+ * #scale  = 1,2,2,       1,2,2#
+ * #till   = 0*3,         1*3#
+ * #format = ,,,          O&*3#
+ * #chksize= 0,32,64,     0,32,64#
  */
+#if !@chksize@ || @chksize@ == @size@
 static PyObject *
 simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
 {
@@ -202,7 +239,7 @@ simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
     Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
     Py_ssize_t min_seq_len = stride * npyv_nlanes_@sfx@;
     if (stride < 0) {
-        seq_ptr += cur_seq_len -1;
+        seq_ptr += cur_seq_len - 1*@scale@;
         min_seq_len = -min_seq_len;
     }
     // overflow guard
@@ -231,6 +268,7 @@ err:
     simd_arg_free(&seq_arg);
     return NULL;
 }
+#endif // chksize
 /**end repeat1**/
 #endif // @ncont_sup@
 
@@ -300,7 +338,7 @@ SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@)
 /**end repeat1**/
 
 /**begin repeat1
- * # intrin = combine, zip#
+ * # intrin = combine, zip, unzip#
  */
 SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@x2, v@sfx@, v@sfx@)
 /**end repeat1**/
@@ -309,6 +347,60 @@ SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@x2, v@sfx@, v@sfx@)
 SIMD_IMPL_INTRIN_1(rev64_@sfx@, v@sfx@, v@sfx@)
 #endif
 
+// special implementation to convert runtime constants to immediate values
+#if @size@ == 32
+// one call for element index then gather them within one vector
+// instead of unroll the 255 possible cases.
+NPY_FINLINE npyv_@sfx@
+npyv_permi128_@sfx@_(npyv_@sfx@ a, unsigned e0, unsigned e1, unsigned e2, unsigned e3)
+{
+   /**begin repeat1
+    * # en = e0, e1, e2, e3#
+    */
+    npyv_@sfx@ v@en@;
+    npyv_lanetype_@sfx@ d@en@[npyv_nlanes_@sfx@];
+    if (0) {}
+   /**begin repeat2
+    * # imm = 1, 2, 3#
+    */
+    else if (@en@ == @imm@) {
+        v@en@ = npyv_permi128_@sfx@(a, @imm@, @imm@, @imm@, @imm@);
+    }
+    /**end repeat2**/
+    else {
+        v@en@ = npyv_permi128_@sfx@(a, 0, 0, 0, 0);
+    }
+    npyv_store_@sfx@(d@en@, v@en@);
+    /**end repeat1**/
+    if (e0 == e1 && e0 == e2 && e0 == e3) {
+        return ve0;
+    }
+    for (int i = 0; i < npyv_nlanes_@sfx@; i += 4) {
+        de0[i+1] = de1[i+1];
+        de0[i+2] = de2[i+2];
+        de0[i+3] = de3[i+3];
+    }
+    return npyv_load_@sfx@(de0);
+}
+SIMD_IMPL_INTRIN_5(permi128_@sfx@_, v@sfx@, v@sfx@, u8, u8, u8, u8)
+#elif @size@ == 64
+NPY_FINLINE npyv_@sfx@
+npyv_permi128_@sfx@_(npyv_@sfx@ a, unsigned e0, unsigned e1)
+{
+    if (e0 == 1 && e1 == 0) {
+        return npyv_permi128_@sfx@(a, 1, 0);
+    }
+    else if (e0 == 0 && e1 == 1) {
+        return npyv_permi128_@sfx@(a, 0, 1);
+    }
+    else if (e0 == 1 && e1 == 1) {
+        return npyv_permi128_@sfx@(a, 1, 1);
+    }
+    return npyv_permi128_@sfx@(a, 0, 0);
+}
+SIMD_IMPL_INTRIN_3(permi128_@sfx@_, v@sfx@, v@sfx@, u8, u8)
+#endif
+
 /***************************
  * Operators
  ***************************/
@@ -387,7 +479,7 @@ SIMD_IMPL_INTRIN_2(divc_@sfx@, v@sfx@, v@sfx@, v@sfx@x3)
 
 #if @fused_sup@
 /**begin repeat1
- * #intrin = muladd, mulsub, nmuladd, nmulsub#
+ * #intrin = muladd, mulsub, nmuladd, nmulsub, muladdsub#
  */
 SIMD_IMPL_INTRIN_3(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@, v@sfx@)
 /**end repeat1**/
@@ -438,6 +530,11 @@ SIMD_IMPL_INTRIN_1(reduce_@intrin@_@sfx@, @sfx@, v@sfx@)
  SIMD_IMPL_INTRIN_4(@intrin@_@sfx@, v@sfx@, v@bsfx@, v@sfx@, v@sfx@, v@sfx@)
 /**end repeat1**/
 
+#if @fp_only@
+SIMD_IMPL_INTRIN_4(ifdiv_@sfx@, v@sfx@, v@bsfx@, v@sfx@, v@sfx@, v@sfx@)
+SIMD_IMPL_INTRIN_3(ifdivz_@sfx@, v@sfx@, v@bsfx@, v@sfx@, v@sfx@)
+#endif
+
 #endif // simd_sup
 /**end repeat**/
 /*************************************************************************
@@ -541,6 +638,12 @@ static PyMethodDef simd__intrinsics_methods[] = {
 SIMD_INTRIN_DEF(@intrin@_@sfx@)
 /**end repeat1**/
 
+/**begin repeat1
+ * # intrin = load, store#
+ */
+SIMD_INTRIN_DEF(@intrin@_@sfx@x2)
+/**end repeat1**/
+
 /****************************************
  * Non-contiguous/Partial Memory access
  ****************************************/
@@ -551,6 +654,21 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@)
  */
 SIMD_INTRIN_DEF(@intrin@_@sfx@)
 /**end repeat1**/
+#if @size@ == 32
+    /**begin repeat1
+     * #intrin = load2_till, load2_tillz, loadn2, loadn2_till, loadn2_tillz,
+     *           store2_till, storen2, storen2_till#
+     */
+    SIMD_INTRIN_DEF(@intrin@_@sfx@)
+    /**end repeat1**/
+#else
+    /**begin repeat1
+     * #intrin = load2_till, load2_tillz, loadn2, loadn2_till, loadn2_tillz,
+     *           store2_till, storen2, storen2_till#
+     */
+    SIMD_INTRIN_DEF(@intrin@_@sfx@)
+    /**end repeat1**/
+#endif
 #endif // ncont_sup
 
 /****************************
@@ -584,7 +702,7 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@)
  * Reorder
  ***************************/
 /**begin repeat1
- * # intrin = combinel, combineh, combine, zip#
+ * # intrin = combinel, combineh, combine, zip, unzip#
  */
 SIMD_INTRIN_DEF(@intrin@_@sfx@)
 /**end repeat1**/
@@ -593,6 +711,10 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@)
 SIMD_INTRIN_DEF(rev64_@sfx@)
 #endif
 
+#if @size@ > 16
+{ "permi128_@sfx@", simd__intrin_permi128_@sfx@_, METH_VARARGS, NULL },
+#endif
+
 /***************************
  * Operators
  ***************************/
@@ -658,7 +780,7 @@ SIMD_INTRIN_DEF(divc_@sfx@)
 
 #if @fused_sup@
 /**begin repeat1
- * #intrin = muladd, mulsub, nmuladd, nmulsub#
+ * #intrin = muladd, mulsub, nmuladd, nmulsub, muladdsub#
  */
 SIMD_INTRIN_DEF(@intrin@_@sfx@)
 /**end repeat1**/
@@ -708,6 +830,14 @@ SIMD_INTRIN_DEF(reduce_@intrin@_@sfx@)
  SIMD_INTRIN_DEF(@intrin@_@sfx@)
 /**end repeat1**/
 
+#if @fp_only@
+/**begin repeat1
+ * #intrin = ifdiv, ifdivz#
+ */
+SIMD_INTRIN_DEF(@intrin@_@sfx@)
+/**end repeat1**/
+#endif
+
 #endif // simd_sup
 /**end repeat**/
 /*************************************************************************
@@ -789,12 +919,12 @@ NPY_CPU_DISPATCH_CURFX(simd_create_module)(void)
 {
     static struct PyModuleDef defs = {
         .m_base = PyModuleDef_HEAD_INIT,
-        .m_size = -1,
     #ifdef NPY__CPU_TARGET_CURRENT
         .m_name = "numpy.core._simd." NPY_TOSTRING(NPY__CPU_TARGET_CURRENT),
     #else
         .m_name = "numpy.core._simd.baseline",
     #endif
+        .m_size = -1,
     #if NPY_SIMD
         .m_methods = simd__intrinsics_methods
     #else
diff --git a/numpy/core/src/_simd/_simd_easyintrin.inc b/numpy/core/src/_simd/_simd_easyintrin.inc
index f2e0da26e..e300e5484 100644
--- a/numpy/core/src/_simd/_simd_easyintrin.inc
+++ b/numpy/core/src/_simd/_simd_easyintrin.inc
@@ -197,6 +197,39 @@
         return simd_arg_to_obj(&ret);                     \
     }
 
+#define SIMD_IMPL_INTRIN_5(NAME, RET, IN0, IN1, IN2, IN3, IN4) \
+    static PyObject *simd__intrin_##NAME                  \
+    (PyObject* NPY_UNUSED(self), PyObject *args)          \
+    {                                                     \
+        simd_arg arg1 = {.dtype = simd_data_##IN0};       \
+        simd_arg arg2 = {.dtype = simd_data_##IN1};       \
+        simd_arg arg3 = {.dtype = simd_data_##IN2};       \
+        simd_arg arg4 = {.dtype = simd_data_##IN3};       \
+        simd_arg arg5 = {.dtype = simd_data_##IN4};       \
+        if (!PyArg_ParseTuple(                            \
+            args, "O&O&O&O&O&:"NPY_TOSTRING(NAME),        \
+            simd_arg_converter, &arg1,                    \
+            simd_arg_converter, &arg2,                    \
+            simd_arg_converter, &arg3,                    \
+            simd_arg_converter, &arg4,                    \
+            simd_arg_converter, &arg5                     \
+        )) return NULL;                                   \
+        simd_data data = {.RET = npyv_##NAME(             \
+            arg1.data.IN0, arg2.data.IN1,                 \
+            arg3.data.IN2, arg4.data.IN3,                 \
+            arg5.data.IN4                                 \
+        )};                                               \
+        simd_arg_free(&arg1);                             \
+        simd_arg_free(&arg2);                             \
+        simd_arg_free(&arg3);                             \
+        simd_arg_free(&arg4);                             \
+        simd_arg_free(&arg5);                             \
+        simd_arg ret = {                                  \
+            .data = data, .dtype = simd_data_##RET        \
+        };                                                \
+        return simd_arg_to_obj(&ret);                     \
+    }
+
 /**
  * Helper macros for repeating and expand a certain macro.
  * Mainly used for converting a scalar to an immediate constant.
diff --git a/numpy/core/src/common/common.hpp b/numpy/core/src/common/common.hpp
new file mode 100644
index 000000000..44ba449d8
--- /dev/null
+++ b/numpy/core/src/common/common.hpp
@@ -0,0 +1,14 @@
+#ifndef NUMPY_CORE_SRC_COMMON_COMMON_HPP
+#define NUMPY_CORE_SRC_COMMON_COMMON_HPP
+/*
+ * The following C++ headers are safe to be used standalone, however,
+ * they are gathered to make it easy for us and for the future need to support PCH.
+ */
+#include "npdef.hpp"
+#include "utils.hpp"
+#include "npstd.hpp"
+#include "half.hpp"
+#include "meta.hpp"
+#include "float_status.hpp"
+
+#endif // NUMPY_CORE_SRC_COMMON_COMMON_HPP
diff --git a/numpy/core/src/common/dlpack/dlpack.h b/numpy/core/src/common/dlpack/dlpack.h
index 29209aee1..f0cbf6136 100644
--- a/numpy/core/src/common/dlpack/dlpack.h
+++ b/numpy/core/src/common/dlpack/dlpack.h
@@ -1,5 +1,5 @@
 // Taken from:
-// https://github.com/dmlc/dlpack/blob/9b6176fdecb55e9bf39b16f08b96913ed3f275b4/include/dlpack/dlpack.h
+// https://github.com/dmlc/dlpack/blob/ca4d00ad3e2e0f410eeab3264d21b8a39397f362/include/dlpack/dlpack.h
 /*!
  *  Copyright (c) 2017 by Contributors
  * \file dlpack.h
@@ -8,14 +8,20 @@
 #ifndef DLPACK_DLPACK_H_
 #define DLPACK_DLPACK_H_
 
+/**
+ * \brief Compatibility with C++
+ */
 #ifdef __cplusplus
 #define DLPACK_EXTERN_C extern "C"
 #else
 #define DLPACK_EXTERN_C
 #endif
 
-/*! \brief The current version of dlpack */
-#define DLPACK_VERSION 050
+/*! \brief The current major version of dlpack */
+#define DLPACK_MAJOR_VERSION 1
+
+/*! \brief The current minor version of dlpack */
+#define DLPACK_MINOR_VERSION 0
 
 /*! \brief DLPACK_DLL prefix for windows */
 #ifdef _WIN32
@@ -34,10 +40,41 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
+
+/*!
+ * \brief The DLPack version.
+ *
+ * A change in major version indicates that we have changed the
+ * data layout of the ABI - DLManagedTensorVersioned.
+ *
+ * A change in minor version indicates that we have added new
+ * code, such as a new device type, but the ABI is kept the same.
+ *
+ * If an obtained DLPack tensor has a major version that disagrees
+ * with the version number specified in this header file
+ * (i.e. major != DLPACK_MAJOR_VERSION), the consumer must call the deleter
+ * (and it is safe to do so). It is not safe to access any other fields
+ * as the memory layout will have changed.
+ *
+ * In the case of a minor version mismatch, the tensor can be safely used as
+ * long as the consumer knows how to interpret all fields. Minor version
+ * updates indicate the addition of enumeration values.
+ */
+typedef struct {
+  /*! \brief DLPack major version. */
+  uint32_t major;
+  /*! \brief DLPack minor version. */
+  uint32_t minor;
+} DLPackVersion;
+
 /*!
  * \brief The device type in DLDevice.
  */
+#ifdef __cplusplus
+typedef enum : int32_t {
+#else
 typedef enum {
+#endif
   /*! \brief CPU device */
   kDLCPU = 1,
   /*! \brief CUDA GPU device */
@@ -70,6 +107,17 @@ typedef enum {
    * \brief CUDA managed/unified memory allocated by cudaMallocManaged
    */
   kDLCUDAManaged = 13,
+  /*!
+   * \brief Unified shared memory allocated on a oneAPI non-partititioned
+   * device. Call to oneAPI runtime is required to determine the device
+   * type, the USM allocation type and the sycl context it is bound to.
+   *
+   */
+  kDLOneAPI = 14,
+  /*! \brief GPU support for next generation WebGPU standard. */
+  kDLWebGPU = 15,
+  /*! \brief Qualcomm Hexagon DSP */
+  kDLHexagon = 16,
 } DLDeviceType;
 
 /*!
@@ -82,7 +130,7 @@ typedef struct {
    * \brief The device index.
    * For vanilla CPU memory, pinned memory, or managed memory, this is set to 0.
    */
-  int device_id;
+  int32_t device_id;
 } DLDevice;
 
 /*!
@@ -107,16 +155,21 @@ typedef enum {
    * (C/C++/Python layout: compact struct per complex number)
    */
   kDLComplex = 5U,
+  /*! \brief boolean */
+  kDLBool = 6U,
 } DLDataTypeCode;
 
 /*!
- * \brief The data type the tensor can hold.
+ * \brief The data type the tensor can hold. The data type is assumed to follow the
+ * native endian-ness. An explicit error message should be raised when attempting to
+ * export an array with non-native endianness
  *
  *  Examples
- *   - float: type_code = 2, bits = 32, lanes=1
- *   - float4(vectorized 4 float): type_code = 2, bits = 32, lanes=4
- *   - int8: type_code = 0, bits = 8, lanes=1
+ *   - float: type_code = 2, bits = 32, lanes = 1
+ *   - float4(vectorized 4 float): type_code = 2, bits = 32, lanes = 4
+ *   - int8: type_code = 0, bits = 8, lanes = 1
  *   - std::complex<float>: type_code = 5, bits = 64, lanes = 1
+ *   - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library convention, the underlying storage size of bool is 8 bits)
  */
 typedef struct {
   /*!
@@ -138,9 +191,16 @@ typedef struct {
  */
 typedef struct {
   /*!
-   * \brief The opaque data pointer points to the allocated data. This will be
-   * CUDA device pointer or cl_mem handle in OpenCL. This pointer is always
-   * aligned to 256 bytes as in CUDA.
+   * \brief The data pointer points to the allocated data. This will be CUDA
+   * device pointer or cl_mem handle in OpenCL. It may be opaque on some device
+   * types. This pointer is always aligned to 256 bytes as in CUDA. The
+   * `byte_offset` field should be used to point to the beginning of the data.
+   *
+   * Note that as of Nov 2021, multiply libraries (CuPy, PyTorch, TensorFlow,
+   * TVM, perhaps others) do not adhere to this 256 byte alignment requirement
+   * on CPU/CUDA/ROCm, and always use `byte_offset=0`.  This must be fixed
+   * (after which this note will be updated); at the moment it is recommended
+   * to not rely on the data pointer being correctly aligned.
    *
    * For given DLTensor, the size of memory required to store the contents of
    * data is calculated as follows:
@@ -160,7 +220,7 @@ typedef struct {
   /*! \brief The device of the tensor */
   DLDevice device;
   /*! \brief Number of dimensions */
-  int ndim;
+  int32_t ndim;
   /*! \brief The data type of the pointer*/
   DLDataType dtype;
   /*! \brief The shape of the tensor */
@@ -180,6 +240,13 @@ typedef struct {
  *  not meant to transfer the tensor. When the borrowing framework doesn't need
  *  the tensor, it should call the deleter to notify the host that the resource
  *  is no longer needed.
+ *
+ * \note This data structure is used as Legacy DLManagedTensor
+ *       in DLPack exchange and is deprecated after DLPack v0.8
+ *       Use DLManagedTensorVersioned instead.
+ *       This data structure may get renamed or deleted in future versions.
+ *
+ * \sa DLManagedTensorVersioned
  */
 typedef struct DLManagedTensor {
   /*! \brief DLTensor which is being memory managed */
@@ -188,13 +255,65 @@ typedef struct DLManagedTensor {
    *   which DLManagedTensor is used in the framework. It can also be NULL.
    */
   void * manager_ctx;
-  /*! \brief Destructor signature void (*)(void*) - this should be called
-   *   to destruct manager_ctx which holds the DLManagedTensor. It can be NULL
-   *   if there is no way for the caller to provide a reasonable destructor.
-   *   The destructors deletes the argument self as well.
+  /*!
+   * \brief Destructor - this should be called
+   * to destruct the manager_ctx  which backs the DLManagedTensor. It can be
+   * NULL if there is no way for the caller to provide a reasonable destructor.
+   * The destructors deletes the argument self as well.
    */
   void (*deleter)(struct DLManagedTensor * self);
 } DLManagedTensor;
+
+// bit masks used in in the DLManagedTensorVersioned
+
+/*! \brief bit mask to indicate that the tensor is read only. */
+#define DLPACK_FLAG_BITMASK_READ_ONLY (1UL << 0UL)
+
+/*!
+ * \brief A versioned and managed C Tensor object, manage memory of DLTensor.
+ *
+ * This data structure is intended to facilitate the borrowing of DLTensor by
+ * another framework. It is not meant to transfer the tensor. When the borrowing
+ * framework doesn't need the tensor, it should call the deleter to notify the
+ * host that the resource is no longer needed.
+ *
+ * \note This is the current standard DLPack exchange data structure.
+ */
+struct DLManagedTensorVersioned {
+  /*!
+   * \brief The API and ABI version of the current managed Tensor
+   */
+  DLPackVersion version;
+  /*!
+   * \brief the context of the original host framework.
+   *
+   * Stores DLManagedTensorVersioned is used in the
+   * framework. It can also be NULL.
+   */
+  void *manager_ctx;
+  /*!
+   * \brief Destructor.
+   *
+   * This should be called to destruct manager_ctx which holds the DLManagedTensorVersioned.
+   * It can be NULL if there is no way for the caller to provide a reasonable
+   * destructor. The destructors deletes the argument self as well.
+   */
+  void (*deleter)(struct DLManagedTensorVersioned *self);
+  /*!
+   * \brief Additional bitmask flags information about the tensor.
+   *
+   * By default the flags should be set to 0.
+   *
+   * \note Future ABI changes should keep everything until this field
+   *       stable, to ensure that deleter can be correctly called.
+   *
+   * \sa DLPACK_FLAG_BITMASK_READ_ONLY
+   */
+  uint64_t flags;
+  /*! \brief DLTensor which is being memory managed */
+  DLTensor dl_tensor;
+};
+
 #ifdef __cplusplus
 }  // DLPACK_EXTERN_C
 #endif
diff --git a/numpy/core/src/common/float_status.hpp b/numpy/core/src/common/float_status.hpp
new file mode 100644
index 000000000..8e4d5e06a
--- /dev/null
+++ b/numpy/core/src/common/float_status.hpp
@@ -0,0 +1,134 @@
+#ifndef NUMPY_CORE_SRC_COMMON_FLOAT_STATUS_HPP
+#define NUMPY_CORE_SRC_COMMON_FLOAT_STATUS_HPP
+
+#include "npstd.hpp"
+
+#include <fenv.h>
+
+namespace np {
+
+/// @addtogroup cpp_core_utility
+/// @{
+/**
+ * Class wraps floating-point environment operations,
+ * provides lazy access to its functionality.
+ */
+class FloatStatus {
+ public:
+/*
+ * According to the C99 standard FE_DIVBYZERO, etc. may not be provided when
+ * unsupported.  In such cases NumPy will not report these correctly, but we
+ * should still allow compiling (whether tests pass or not).
+ * By defining them as 0 locally, we make them no-ops.  Unlike these defines,
+ * for example `musl` still defines all of the functions (as no-ops):
+ *     https://git.musl-libc.org/cgit/musl/tree/src/fenv/fenv.c
+ * and does similar replacement in its tests:
+ * http://nsz.repo.hu/git/?p=libc-test;a=blob;f=src/common/mtest.h;h=706c1ba23ea8989b17a2f72ed1a919e187c06b6a;hb=HEAD#l30
+ */
+#ifdef FE_DIVBYZERO
+    static constexpr int kDivideByZero = FE_DIVBYZERO;
+#else
+    static constexpr int kDivideByZero = 0;
+#endif
+#ifdef FE_INVALID
+    static constexpr int kInvalid = FE_INVALID;
+#else
+    static constexpr int kInvalid = 0;
+#endif
+#ifdef FE_INEXACT
+    static constexpr int kInexact = FE_INEXACT;
+#else
+    static constexpr int kInexact = 0;
+#endif
+#ifdef FE_OVERFLOW
+    static constexpr int kOverflow = FE_OVERFLOW;
+#else
+    static constexpr int kOverflow = 0;
+#endif
+#ifdef FE_UNDERFLOW
+    static constexpr int kUnderflow = FE_UNDERFLOW;
+#else
+    static constexpr int kUnderflow = 0;
+#endif
+    static constexpr int kAllExcept = (kDivideByZero | kInvalid | kInexact |
+                                       kOverflow | kUnderflow);
+
+    FloatStatus(bool clear_on_dst=true)
+        : clear_on_dst_(clear_on_dst)
+    {
+        if constexpr (kAllExcept != 0) {
+            fpstatus_ = fetestexcept(kAllExcept);
+        }
+        else {
+            fpstatus_ = 0;
+        }
+    }
+    ~FloatStatus()
+    {
+        if constexpr (kAllExcept != 0) {
+            if (fpstatus_ != 0 && clear_on_dst_) {
+                feclearexcept(kAllExcept);
+            }
+        }
+    }
+    constexpr bool IsDivideByZero() const
+    {
+        return (fpstatus_ & kDivideByZero) != 0;
+    }
+    constexpr bool IsInexact() const
+    {
+        return (fpstatus_ & kInexact) != 0;
+    }
+    constexpr bool IsInvalid() const
+    {
+        return (fpstatus_ & kInvalid) != 0;
+    }
+    constexpr bool IsOverFlow() const
+    {
+        return (fpstatus_ & kOverflow) != 0;
+    }
+    constexpr bool IsUnderFlow() const
+    {
+        return (fpstatus_ & kUnderflow) != 0;
+    }
+    static void RaiseDivideByZero()
+    {
+        if constexpr (kDivideByZero != 0) {
+            feraiseexcept(kDivideByZero);
+        }
+    }
+    static void RaiseInexact()
+    {
+        if constexpr (kInexact != 0) {
+            feraiseexcept(kInexact);
+        }
+    }
+    static void RaiseInvalid()
+    {
+        if constexpr (kInvalid != 0) {
+            feraiseexcept(kInvalid);
+        }
+    }
+    static void RaiseOverflow()
+    {
+        if constexpr (kOverflow != 0) {
+            feraiseexcept(kOverflow);
+        }
+    }
+    static void RaiseUnderflow()
+    {
+        if constexpr (kUnderflow != 0) {
+            feraiseexcept(kUnderflow);
+        }
+    }
+
+  private:
+    bool clear_on_dst_;
+    int fpstatus_;
+};
+
+/// @} cpp_core_utility
+} // namespace np
+
+#endif // NUMPY_CORE_SRC_COMMON_FLOAT_STATUS_HPP
+
diff --git a/numpy/core/src/common/get_attr_string.h b/numpy/core/src/common/get_attr_string.h
index 90eca5ee6..36d39189f 100644
--- a/numpy/core/src/common/get_attr_string.h
+++ b/numpy/core/src/common/get_attr_string.h
@@ -4,7 +4,7 @@
 #include <Python.h>
 #include "ufunc_object.h"
 
-static NPY_INLINE npy_bool
+static inline npy_bool
 _is_basic_python_type(PyTypeObject *tp)
 {
     return (
@@ -46,7 +46,7 @@ _is_basic_python_type(PyTypeObject *tp)
  *
  * In future, could be made more like _Py_LookupSpecial
  */
-static NPY_INLINE PyObject *
+static inline PyObject *
 PyArray_LookupSpecial(PyObject *obj, PyObject *name_unicode)
 {
     PyTypeObject *tp = Py_TYPE(obj);
@@ -73,7 +73,7 @@ PyArray_LookupSpecial(PyObject *obj, PyObject *name_unicode)
  *
  * Kept for backwards compatibility. In future, we should deprecate this.
  */
-static NPY_INLINE PyObject *
+static inline PyObject *
 PyArray_LookupSpecial_OnInstance(PyObject *obj, PyObject *name_unicode)
 {
     PyTypeObject *tp = Py_TYPE(obj);
diff --git a/numpy/core/src/common/half.hpp b/numpy/core/src/common/half.hpp
new file mode 100644
index 000000000..4d16e3bcc
--- /dev/null
+++ b/numpy/core/src/common/half.hpp
@@ -0,0 +1,264 @@
+#ifndef NUMPY_CORE_SRC_COMMON_HALF_HPP
+#define NUMPY_CORE_SRC_COMMON_HALF_HPP
+
+#include "npstd.hpp"
+
+#include "npy_cpu_dispatch.h" // NPY_HAVE_CPU_FEATURES
+#include "half_private.hpp"
+
+// TODO(@seiko2plus):
+// - covers half-precision operations that being supported by numpy/halffloat.h
+// - add support for arithmetic operations
+// - enables __fp16 causes massive FP exceptions on aarch64,
+//   needs a deep investigation
+
+namespace np {
+
+/// @addtogroup cpp_core_types
+/// @{
+
+/// Provides a type that implements 16-bit floating point (half-precision).
+/// This type is ensured to be 16-bit size.
+#if 1 // ndef __ARM_FP16_FORMAT_IEEE
+class Half final {
+  public:
+    /// Whether `Half` has a full native HW support.
+    static constexpr bool kNative = false;
+    /// Whether `Half` has a native HW support for single/double conversion.
+    template<typename T>
+    static constexpr bool kNativeConversion = (
+        (
+            std::is_same_v<T, float> &&
+        #if defined(NPY_HAVE_FP16) || defined(NPY_HAVE_VSX3)
+            true
+        #else
+            false
+        #endif
+        ) || (
+            std::is_same_v<T, double> &&
+        #if defined(NPY_HAVE_AVX512FP16) || defined(NPY_HAVE_VSX3)
+            true
+        #else
+            false
+        #endif
+        )
+    );
+
+    /// Default constructor. initialize nothing.
+    Half() = default;
+
+    /// Constract from float
+    /// If there are no hardware optimization available, rounding will always
+    /// be set to ties to even.
+    explicit Half(float f)
+    {
+    #if defined(NPY_HAVE_FP16)
+        __m128 mf = _mm_load_ss(&f);
+        bits_ = static_cast<uint16_t>(_mm_cvtsi128_si32(_mm_cvtps_ph(mf, _MM_FROUND_TO_NEAREST_INT)));
+    #elif defined(NPY_HAVE_VSX3) && defined(NPY_HAVE_VSX_ASM)
+        __vector float vf32 = vec_splats(f);
+        __vector unsigned short vf16;
+        __asm__ __volatile__ ("xvcvsphp %x0,%x1" : "=wa" (vf16) : "wa" (vf32));
+        bits_ = vec_extract(vf16, 0);
+    #else
+        bits_ = half_private::FromFloatBits(BitCast<uint32_t>(f));
+    #endif
+    }
+
+    /// Construct from double.
+    /// If there are no hardware optimization available, rounding will always
+    /// be set to ties to even.
+    explicit Half(double f)
+    {
+    #if defined(NPY_HAVE_AVX512FP16)
+        __m128d md = _mm_load_sd(&f);
+        bits_ = static_cast<uint16_t>(_mm_cvtsi128_si32(_mm_castph_si128(_mm_cvtpd_ph(md))));
+    #elif defined(NPY_HAVE_VSX3) && defined(NPY_HAVE_VSX_ASM)
+        __vector double vf64 = vec_splats(f);
+        __vector unsigned short vf16;
+        __asm__ __volatile__ ("xvcvdphp %x0,%x1" : "=wa" (vf16) : "wa" (vf64));
+        bits_ = vec_extract(vf16, 0);
+    #else
+        bits_ = half_private::FromDoubleBits(BitCast<uint64_t>(f));
+    #endif
+    }
+
+    /// Cast to float
+    explicit operator float() const
+    {
+    #if defined(NPY_HAVE_FP16)
+        float ret;
+        _mm_store_ss(&ret, _mm_cvtph_ps(_mm_cvtsi32_si128(bits_)));
+        return ret;
+    #elif defined(NPY_HAVE_VSX3) && defined(vec_extract_fp_from_shorth)
+        return vec_extract(vec_extract_fp_from_shorth(vec_splats(bits_)), 0);
+    #elif defined(NPY_HAVE_VSX3) && defined(NPY_HAVE_VSX_ASM)
+        __vector float vf32;
+        __asm__ __volatile__("xvcvhpsp %x0,%x1"
+                             : "=wa"(vf32)
+                             : "wa"(vec_splats(bits_.u)));
+        return vec_extract(vf32, 0);
+    #else
+        return BitCast<float>(half_private::ToFloatBits(bits_));
+    #endif
+    }
+
+    /// Cast to double
+    explicit operator double() const
+    {
+    #if defined(NPY_HAVE_AVX512FP16)
+        double ret;
+        _mm_store_sd(&ret, _mm_cvtph_pd(_mm_castsi128_ph(_mm_cvtsi32_si128(bits_))));
+        return ret;
+    #elif defined(NPY_HAVE_VSX3) && defined(NPY_HAVE_VSX_ASM)
+        __vector float vf64;
+        __asm__ __volatile__("xvcvhpdp %x0,%x1"
+                             : "=wa"(vf32)
+                             : "wa"(vec_splats(bits_)));
+        return vec_extract(vf64, 0);
+    #else
+        return BitCast<double>(half_private::ToDoubleBits(bits_));
+    #endif
+    }
+
+    /// Returns a new Half constracted from the IEEE 754 binary16.
+    static constexpr Half FromBits(uint16_t bits)
+    {
+        Half h{};
+        h.bits_ = bits;
+        return h;
+    }
+    /// Returns the IEEE 754 binary16 representation.
+    constexpr uint16_t Bits() const
+    {
+        return bits_;
+    }
+
+    /// @name Comparison operators (orderd)
+    /// @{
+    constexpr bool operator==(Half r) const
+    {
+        return !(IsNaN() || r.IsNaN()) && Equal(r);
+    }
+    constexpr bool operator<(Half r) const
+    {
+        return !(IsNaN() || r.IsNaN()) && Less(r);
+    }
+    constexpr bool operator<=(Half r) const
+    {
+        return !(IsNaN() || r.IsNaN()) && LessEqual(r);
+    }
+    constexpr bool operator>(Half r) const
+    {
+        return r < *this;
+    }
+    constexpr bool operator>=(Half r) const
+    {
+        return r <= *this;
+    }
+    /// @}
+
+    /// @name Comparison operators (unorderd)
+    /// @{
+    constexpr bool operator!=(Half r) const
+    {
+        return !(*this == r);
+    }
+    /// @} Comparison operators
+
+    /// @name Comparison with no guarantee of NaN behavior
+    /// @{
+    constexpr bool Less(Half r) const
+    {
+        uint_fast16_t a = static_cast<uint_fast16_t>(bits_),
+                      b = static_cast<uint_fast16_t>(r.bits_);
+        bool sign_a = (a & 0x8000u) == 0x8000u;
+        bool sign_b = (b & 0x8000u) == 0x8000u;
+        // if both `a` and `b` have same sign
+        //   Test if `a` > `b` when `a` has the sign
+        //        or `a` < `b` when is not.
+        //   And make sure they are not equal to each other
+        //       in case of both are equal to +-0
+        // else
+        //   Test if  `a` has the sign.
+        //        and `a` != -0.0 and `b` != 0.0
+        return (sign_a == sign_b) ? (sign_a ^ (a < b)) && (a != b)
+                                  : sign_a && ((a | b) != 0x8000u);
+    }
+    constexpr bool LessEqual(Half r) const
+    {
+        uint_fast16_t a = static_cast<uint_fast16_t>(bits_),
+                      b = static_cast<uint_fast16_t>(r.bits_);
+        bool sign_a = (a & 0x8000u) == 0x8000u;
+        bool sign_b = (b & 0x8000u) == 0x8000u;
+        // if both `a` and `b` have same sign
+        //   Test if `a` > `b` when `a` has the sign
+        //        or `a` < `b` when is not.
+        //        or a == b (needed even if we used <= above instead
+        //                   since testing +-0 still required)
+        // else
+        //   Test if `a` has the sign
+        //        or `a` and `b` equal to +-0.0
+        return (sign_a == sign_b) ? (sign_a ^ (a < b)) || (a == b)
+                                  : sign_a || ((a | b) == 0x8000u);
+    }
+    constexpr bool Equal(Half r) const
+    {
+        // fast16 cast is not worth it, since unpack op should involved.
+        uint16_t a = bits_, b = r.bits_;
+        return a == b || ((a | b) == 0x8000u);
+    }
+    /// @} Comparison
+
+    /// @name Properties
+    // @{
+    constexpr bool IsNaN() const
+    {
+        return ((bits_ & 0x7c00u) == 0x7c00u) &&
+               ((bits_ & 0x03ffu) != 0);
+    }
+    /// @} Properties
+
+  private:
+    uint16_t bits_;
+};
+#else // __ARM_FP16_FORMAT_IEEE
+class Half final {
+  public:
+    static constexpr bool kNative = true;
+    template<typename T>
+    static constexpr bool kNativeConversion = (
+        std::is_same_v<T, float> || std::is_same_v<T, double>
+    );
+    Half() = default;
+    constexpr Half(__fp16 h) : half_(h)
+    {}
+    constexpr operator __fp16() const
+    { return half_; }
+    static Half FromBits(uint16_t bits)
+    {
+        Half h;
+        h.half_ = BitCast<__fp16>(bits);
+        return h;
+    }
+    uint16_t Bits() const
+    { return BitCast<uint16_t>(half_); }
+    constexpr bool Less(Half r) const
+    { return half_ < r.half_; }
+    constexpr bool LessEqual(Half r) const
+    { return half_ <= r.half_; }
+    constexpr bool Equal(Half r) const
+    { return half_ == r.half_; }
+    constexpr bool IsNaN() const
+    { return half_ != half_; }
+
+  private:
+    __fp16 half_;
+};
+#endif // __ARM_FP16_FORMAT_IEEE
+
+/// @} cpp_core_types
+
+} // namespace np
+
+#endif // NUMPY_CORE_SRC_COMMON_HALF_HPP
diff --git a/numpy/core/src/common/half_private.hpp b/numpy/core/src/common/half_private.hpp
new file mode 100644
index 000000000..7a64eb397
--- /dev/null
+++ b/numpy/core/src/common/half_private.hpp
@@ -0,0 +1,330 @@
+#ifndef NUMPY_CORE_SRC_COMMON_HALF_PRIVATE_HPP
+#define NUMPY_CORE_SRC_COMMON_HALF_PRIVATE_HPP
+
+#include "npstd.hpp"
+#include "float_status.hpp"
+
+/*
+ * The following functions that emulating float/double/half conversions
+ * are copied from npymath without any changes to its functionalty.
+ */
+namespace np { namespace half_private {
+
+template<bool gen_overflow=true, bool gen_underflow=true, bool round_even=true>
+inline uint16_t FromFloatBits(uint32_t f)
+{
+    uint32_t f_exp, f_sig;
+    uint16_t h_sgn, h_exp, h_sig;
+
+    h_sgn = (uint16_t) ((f&0x80000000u) >> 16);
+    f_exp = (f&0x7f800000u);
+
+    /* Exponent overflow/NaN converts to signed inf/NaN */
+    if (f_exp >= 0x47800000u) {
+        if (f_exp == 0x7f800000u) {
+            /* Inf or NaN */
+            f_sig = (f&0x007fffffu);
+            if (f_sig != 0) {
+                /* NaN - propagate the flag in the significand... */
+                uint16_t ret = (uint16_t) (0x7c00u + (f_sig >> 13));
+                /* ...but make sure it stays a NaN */
+                if (ret == 0x7c00u) {
+                    ret++;
+                }
+                return h_sgn + ret;
+            } else {
+                /* signed inf */
+                return (uint16_t) (h_sgn + 0x7c00u);
+            }
+        } else {
+            if constexpr (gen_overflow) {
+                /* overflow to signed inf */
+                FloatStatus::RaiseOverflow();
+            }
+            return (uint16_t) (h_sgn + 0x7c00u);
+        }
+    }
+
+    /* Exponent underflow converts to a subnormal half or signed zero */
+    if (f_exp <= 0x38000000u) {
+        /*
+         * Signed zeros, subnormal floats, and floats with small
+         * exponents all convert to signed zero half-floats.
+         */
+        if (f_exp < 0x33000000u) {
+            if constexpr (gen_underflow) {
+                /* If f != 0, it underflowed to 0 */
+                if ((f&0x7fffffff) != 0) {
+                    FloatStatus::RaiseUnderflow();
+                }
+            }
+            return h_sgn;
+        }
+        /* Make the subnormal significand */
+        f_exp >>= 23;
+        f_sig = (0x00800000u + (f&0x007fffffu));
+        if constexpr (gen_underflow) {
+            /* If it's not exactly represented, it underflowed */
+            if ((f_sig&(((uint32_t)1 << (126 - f_exp)) - 1)) != 0) {
+                FloatStatus::RaiseUnderflow();
+            }
+        }
+        /*
+         * Usually the significand is shifted by 13. For subnormals an
+         * additional shift needs to occur. This shift is one for the largest
+         * exponent giving a subnormal `f_exp = 0x38000000 >> 23 = 112`, which
+         * offsets the new first bit. At most the shift can be 1+10 bits.
+         */
+        f_sig >>= (113 - f_exp);
+        /* Handle rounding by adding 1 to the bit beyond half precision */
+        if constexpr (round_even) {
+            /*
+             * If the last bit in the half significand is 0 (already even), and
+             * the remaining bit pattern is 1000...0, then we do not add one
+             * to the bit after the half significand. However, the (113 - f_exp)
+             * shift can lose up to 11 bits, so the || checks them in the original.
+             * In all other cases, we can just add one.
+             */
+            if (((f_sig&0x00003fffu) != 0x00001000u) || (f&0x000007ffu)) {
+                f_sig += 0x00001000u;
+            }
+        }
+        else {
+            f_sig += 0x00001000u;
+        }
+        h_sig = (uint16_t) (f_sig >> 13);
+        /*
+         * If the rounding causes a bit to spill into h_exp, it will
+         * increment h_exp from zero to one and h_sig will be zero.
+         * This is the correct result.
+         */
+        return (uint16_t) (h_sgn + h_sig);
+    }
+
+    /* Regular case with no overflow or underflow */
+    h_exp = (uint16_t) ((f_exp - 0x38000000u) >> 13);
+    /* Handle rounding by adding 1 to the bit beyond half precision */
+    f_sig = (f&0x007fffffu);
+    if constexpr (round_even) {
+        /*
+         * If the last bit in the half significand is 0 (already even), and
+         * the remaining bit pattern is 1000...0, then we do not add one
+         * to the bit after the half significand.  In all other cases, we do.
+         */
+        if ((f_sig&0x00003fffu) != 0x00001000u) {
+            f_sig += 0x00001000u;
+        }
+    }
+    else {
+        f_sig += 0x00001000u;
+    }
+    h_sig = (uint16_t) (f_sig >> 13);
+    /*
+     * If the rounding causes a bit to spill into h_exp, it will
+     * increment h_exp by one and h_sig will be zero.  This is the
+     * correct result.  h_exp may increment to 15, at greatest, in
+     * which case the result overflows to a signed inf.
+     */
+    if constexpr (gen_overflow) {
+        h_sig += h_exp;
+        if (h_sig == 0x7c00u) {
+            FloatStatus::RaiseOverflow();
+        }
+        return h_sgn + h_sig;
+    }
+    else {
+        return h_sgn + h_exp + h_sig;
+    }
+}
+
+template<bool gen_overflow=true, bool gen_underflow=true, bool round_even=true>
+inline uint16_t FromDoubleBits(uint64_t d)
+{
+    uint64_t d_exp, d_sig;
+    uint16_t h_sgn, h_exp, h_sig;
+
+    h_sgn = (d&0x8000000000000000ULL) >> 48;
+    d_exp = (d&0x7ff0000000000000ULL);
+
+    /* Exponent overflow/NaN converts to signed inf/NaN */
+    if (d_exp >= 0x40f0000000000000ULL) {
+        if (d_exp == 0x7ff0000000000000ULL) {
+            /* Inf or NaN */
+            d_sig = (d&0x000fffffffffffffULL);
+            if (d_sig != 0) {
+                /* NaN - propagate the flag in the significand... */
+                uint16_t ret = (uint16_t) (0x7c00u + (d_sig >> 42));
+                /* ...but make sure it stays a NaN */
+                if (ret == 0x7c00u) {
+                    ret++;
+                }
+                return h_sgn + ret;
+            } else {
+                /* signed inf */
+                return h_sgn + 0x7c00u;
+            }
+        } else {
+            /* overflow to signed inf */
+            if constexpr (gen_overflow) {
+                FloatStatus::RaiseOverflow();
+            }
+            return h_sgn + 0x7c00u;
+        }
+    }
+
+    /* Exponent underflow converts to subnormal half or signed zero */
+    if (d_exp <= 0x3f00000000000000ULL) {
+        /*
+         * Signed zeros, subnormal floats, and floats with small
+         * exponents all convert to signed zero half-floats.
+         */
+        if (d_exp < 0x3e60000000000000ULL) {
+            if constexpr (gen_underflow) {
+                /* If d != 0, it underflowed to 0 */
+                if ((d&0x7fffffffffffffffULL) != 0) {
+                    FloatStatus::RaiseUnderflow();
+                }
+            }
+            return h_sgn;
+        }
+        /* Make the subnormal significand */
+        d_exp >>= 52;
+        d_sig = (0x0010000000000000ULL + (d&0x000fffffffffffffULL));
+        if constexpr (gen_underflow) {
+            /* If it's not exactly represented, it underflowed */
+            if ((d_sig&(((uint64_t)1 << (1051 - d_exp)) - 1)) != 0) {
+                FloatStatus::RaiseUnderflow();
+            }
+        }
+        /*
+         * Unlike floats, doubles have enough room to shift left to align
+         * the subnormal significand leading to no loss of the last bits.
+         * The smallest possible exponent giving a subnormal is:
+         * `d_exp = 0x3e60000000000000 >> 52 = 998`. All larger subnormals are
+         * shifted with respect to it. This adds a shift of 10+1 bits the final
+         * right shift when comparing it to the one in the normal branch.
+         */
+        assert(d_exp - 998 >= 0);
+        d_sig <<= (d_exp - 998);
+        /* Handle rounding by adding 1 to the bit beyond half precision */
+        if constexpr (round_even) {
+            /*
+             * If the last bit in the half significand is 0 (already even), and
+             * the remaining bit pattern is 1000...0, then we do not add one
+             * to the bit after the half significand.  In all other cases, we do.
+             */
+            if ((d_sig&0x003fffffffffffffULL) != 0x0010000000000000ULL) {
+                d_sig += 0x0010000000000000ULL;
+            }
+        }
+        else {
+            d_sig += 0x0010000000000000ULL;
+        }
+        h_sig = (uint16_t) (d_sig >> 53);
+        /*
+         * If the rounding causes a bit to spill into h_exp, it will
+         * increment h_exp from zero to one and h_sig will be zero.
+         * This is the correct result.
+         */
+        return h_sgn + h_sig;
+    }
+
+    /* Regular case with no overflow or underflow */
+    h_exp = (uint16_t) ((d_exp - 0x3f00000000000000ULL) >> 42);
+    /* Handle rounding by adding 1 to the bit beyond half precision */
+    d_sig = (d&0x000fffffffffffffULL);
+    if constexpr (round_even) {
+        /*
+         * If the last bit in the half significand is 0 (already even), and
+         * the remaining bit pattern is 1000...0, then we do not add one
+         * to the bit after the half significand.  In all other cases, we do.
+         */
+        if ((d_sig&0x000007ffffffffffULL) != 0x0000020000000000ULL) {
+            d_sig += 0x0000020000000000ULL;
+        }
+    }
+    else {
+        d_sig += 0x0000020000000000ULL;
+    }
+    h_sig = (uint16_t) (d_sig >> 42);
+
+    /*
+     * If the rounding causes a bit to spill into h_exp, it will
+     * increment h_exp by one and h_sig will be zero.  This is the
+     * correct result.  h_exp may increment to 15, at greatest, in
+     * which case the result overflows to a signed inf.
+     */
+    if constexpr (gen_overflow) {
+        h_sig += h_exp;
+        if (h_sig == 0x7c00u) {
+            FloatStatus::RaiseOverflow();
+        }
+        return h_sgn + h_sig;
+    }
+    else {
+        return h_sgn + h_exp + h_sig;
+    }
+}
+
+constexpr uint32_t ToFloatBits(uint16_t h)
+{
+    uint16_t h_exp = (h&0x7c00u);
+    uint32_t f_sgn = ((uint32_t)h&0x8000u) << 16;
+    switch (h_exp) {
+        case 0x0000u: { // 0 or subnormal
+            uint16_t h_sig = (h&0x03ffu);
+            // Signed zero
+            if (h_sig == 0) {
+                return f_sgn;
+            }
+            // Subnormal
+            h_sig <<= 1;
+            while ((h_sig&0x0400u) == 0) {
+                h_sig <<= 1;
+                h_exp++;
+            }
+            uint32_t f_exp = ((uint32_t)(127 - 15 - h_exp)) << 23;
+            uint32_t f_sig = ((uint32_t)(h_sig&0x03ffu)) << 13;
+            return f_sgn + f_exp + f_sig;
+        }
+        case 0x7c00u: // inf or NaN
+            // All-ones exponent and a copy of the significand
+            return f_sgn + 0x7f800000u + (((uint32_t)(h&0x03ffu)) << 13);
+        default: // normalized
+            // Just need to adjust the exponent and shift
+            return f_sgn + (((uint32_t)(h&0x7fffu) + 0x1c000u) << 13);
+    }
+}
+
+constexpr uint64_t ToDoubleBits(uint16_t h)
+{
+    uint16_t h_exp = (h&0x7c00u);
+    uint64_t d_sgn = ((uint64_t)h&0x8000u) << 48;
+    switch (h_exp) {
+        case 0x0000u: { // 0 or subnormal
+            uint16_t h_sig = (h&0x03ffu);
+            // Signed zero
+            if (h_sig == 0) {
+                return d_sgn;
+            }
+            // Subnormal
+            h_sig <<= 1;
+            while ((h_sig&0x0400u) == 0) {
+                h_sig <<= 1;
+                h_exp++;
+            }
+            uint64_t d_exp = ((uint64_t)(1023 - 15 - h_exp)) << 52;
+            uint64_t d_sig = ((uint64_t)(h_sig&0x03ffu)) << 42;
+            return d_sgn + d_exp + d_sig;
+        }
+        case 0x7c00u: // inf or NaN
+            // All-ones exponent and a copy of the significand
+            return d_sgn + 0x7ff0000000000000ULL + (((uint64_t)(h&0x03ffu)) << 42);
+        default: // normalized
+            // Just need to adjust the exponent and shift
+            return d_sgn + (((uint64_t)(h&0x7fffu) + 0xfc000u) << 42);
+    }
+}
+
+}} // namespace np::half_private
+#endif // NUMPY_CORE_SRC_COMMON_HALF_PRIVATE_HPP
diff --git a/numpy/core/src/common/lowlevel_strided_loops.h b/numpy/core/src/common/lowlevel_strided_loops.h
index 924a34db5..4ad110663 100644
--- a/numpy/core/src/common/lowlevel_strided_loops.h
+++ b/numpy/core/src/common/lowlevel_strided_loops.h
@@ -425,7 +425,7 @@ PyArray_PrepareThreeRawArrayIter(int ndim, npy_intp const *shape,
  * blocking. See the 'npy_blocked_end' function documentation below for an
  * example of how this function is used.
  */
-static NPY_INLINE npy_intp
+static inline npy_intp
 npy_aligned_block_offset(const void * addr, const npy_uintp esize,
                          const npy_uintp alignment, const npy_uintp nvals)
 {
@@ -458,7 +458,7 @@ npy_aligned_block_offset(const void * addr, const npy_uintp esize,
  * for(; i < n; i++)
  *   <scalar-op>
  */
-static NPY_INLINE npy_intp
+static inline npy_intp
 npy_blocked_end(const npy_uintp peel, const npy_uintp esize,
                 const npy_uintp vsz, const npy_uintp nvals)
 {
@@ -472,7 +472,7 @@ npy_blocked_end(const npy_uintp peel, const npy_uintp esize,
 
 
 /* byte swapping functions */
-static NPY_INLINE npy_uint16
+static inline npy_uint16
 npy_bswap2(npy_uint16 x)
 {
     return ((x & 0xffu) << 8) | (x >> 8);
@@ -482,7 +482,7 @@ npy_bswap2(npy_uint16 x)
  * treat as int16 and byteswap unaligned memory,
  * some cpus don't support unaligned access
  */
-static NPY_INLINE void
+static inline void
 npy_bswap2_unaligned(char * x)
 {
     char a = x[0];
@@ -490,7 +490,7 @@ npy_bswap2_unaligned(char * x)
     x[1] = a;
 }
 
-static NPY_INLINE npy_uint32
+static inline npy_uint32
 npy_bswap4(npy_uint32 x)
 {
 #ifdef HAVE___BUILTIN_BSWAP32
@@ -501,7 +501,7 @@ npy_bswap4(npy_uint32 x)
 #endif
 }
 
-static NPY_INLINE void
+static inline void
 npy_bswap4_unaligned(char * x)
 {
     char a = x[0];
@@ -512,7 +512,7 @@ npy_bswap4_unaligned(char * x)
     x[2] = a;
 }
 
-static NPY_INLINE npy_uint64
+static inline npy_uint64
 npy_bswap8(npy_uint64 x)
 {
 #ifdef HAVE___BUILTIN_BSWAP64
@@ -529,7 +529,7 @@ npy_bswap8(npy_uint64 x)
 #endif
 }
 
-static NPY_INLINE void
+static inline void
 npy_bswap8_unaligned(char * x)
 {
     char a = x[0]; x[0] = x[7]; x[7] = a;
@@ -685,7 +685,7 @@ npy_bswap8_unaligned(char * x)
         size == 1 ? 0 : ((PyArray_NDIM(arr) == 1) ? \
                              PyArray_STRIDE(arr, 0) : PyArray_ITEMSIZE(arr)))
 
-static NPY_INLINE int
+static inline int
 PyArray_EQUIVALENTLY_ITERABLE_OVERLAP_OK(PyArrayObject *arr1, PyArrayObject *arr2,
                                          int arr1_read, int arr2_read)
 {
diff --git a/numpy/core/src/common/meta.hpp b/numpy/core/src/common/meta.hpp
new file mode 100644
index 000000000..27ea1857e
--- /dev/null
+++ b/numpy/core/src/common/meta.hpp
@@ -0,0 +1,54 @@
+#ifndef NUMPY_CORE_SRC_COMMON_META_HPP
+#define NUMPY_CORE_SRC_COMMON_META_HPP
+
+#include "npstd.hpp"
+
+namespace np { namespace meta {
+/// @addtogroup cpp_core_meta
+/// @{
+
+namespace details {
+template<int size, bool unsig>
+struct IntBySize;
+
+template<bool unsig>
+struct IntBySize<sizeof(uint8_t), unsig> {
+    using Type = typename std::conditional<
+        unsig, uint8_t, int8_t>::type;
+};
+template<bool unsig>
+struct IntBySize<sizeof(uint16_t), unsig> {
+    using Type = typename std::conditional<
+        unsig, uint16_t, int16_t>::type;
+};
+template<bool unsig>
+struct IntBySize<sizeof(uint32_t), unsig> {
+    using Type = typename std::conditional<
+        unsig, uint32_t, int32_t>::type;
+};
+template<bool unsig>
+struct IntBySize<sizeof(uint64_t), unsig> {
+    using Type = typename std::conditional<
+        unsig, uint64_t, int64_t>::type;
+};
+} // namespace details
+
+/// Provides safe conversion of any integer type synonyms
+/// to a fixed-width integer type.
+template<typename T>
+struct FixedWidth {
+    using TF_ = typename details::IntBySize<
+        sizeof(T), std::is_unsigned<T>::value
+    >::Type;
+
+    using Type = typename std::conditional<
+        std::is_integral<T>::value, TF_, T
+    >::type;
+};
+
+/// @} cpp_core_meta
+
+}} // namespace np::meta
+
+#endif // NUMPY_CORE_SRC_COMMON_META_HPP
+
diff --git a/numpy/core/src/common/npdef.hpp b/numpy/core/src/common/npdef.hpp
new file mode 100644
index 000000000..56a0df52e
--- /dev/null
+++ b/numpy/core/src/common/npdef.hpp
@@ -0,0 +1,28 @@
+#ifndef NUMPY_CORE_SRC_COMMON_NPDEF_HPP
+#define NUMPY_CORE_SRC_COMMON_NPDEF_HPP
+
+#if !defined(__cplusplus) || __cplusplus < 201703L
+    #error "NumPy requires a compiler with at least C++17 enabled"
+#endif
+
+/// @addtogroup cpp_core_defs
+/// @{
+
+/// Whether compiler supports C++20
+#if __cplusplus > 202002L
+    #define NP_HAS_CPP20 1
+#else
+    #define NP_HAS_CPP20 0
+#endif
+
+/// Wraps `__has_builtin`
+#if defined(__has_builtin)
+    #define NP_HAS_BUILTIN(INTRIN) __has_builtin(INTRIN)
+#else
+    #define NP_HAS_BUILTIN(INTRIN) 0
+#endif
+
+/// @} cpp_core_defs
+
+#endif // NUMPY_CORE_SRC_COMMON_NPDEF_HPP
+
diff --git a/numpy/core/src/common/npstd.hpp b/numpy/core/src/common/npstd.hpp
new file mode 100644
index 000000000..92e3d5cc5
--- /dev/null
+++ b/numpy/core/src/common/npstd.hpp
@@ -0,0 +1,57 @@
+#ifndef NUMPY_CORE_SRC_COMMON_NPSTD_HPP
+#define NUMPY_CORE_SRC_COMMON_NPSTD_HPP
+
+#include <cstddef>
+#include <cstring>
+#include <cctype>
+#include <cstdint>
+
+#include <string>
+#include <algorithm>
+#include <utility>
+#include <cstdlib>
+#include <cmath>
+#include <complex>
+#include <type_traits>
+
+#include <numpy/npy_common.h>
+
+#include "npy_config.h"
+
+namespace np {
+/// @addtogroup cpp_core_types
+/// @{
+using std::uint8_t;
+using std::int8_t;
+using std::uint16_t;
+using std::int16_t;
+using std::uint32_t;
+using std::int32_t;
+using std::uint64_t;
+using std::int64_t;
+using std::uintptr_t;
+using std::intptr_t;
+using std::complex;
+using std::uint_fast16_t;
+using std::uint_fast32_t;
+
+/** Guard for long double.
+ *
+ * The C implementation defines long double as double
+ * on MinGW to provide compatibility with MSVC to unify
+ * one behavior under Windows OS, which makes npy_longdouble
+ * not fit to be used with template specialization or overloading.
+ *
+ * This type will be set to `void` when `npy_longdouble` is not defined
+ * as `long double`.
+ */
+using LongDouble = typename std::conditional<
+    !std::is_same<npy_longdouble, long double>::value,
+     void, npy_longdouble
+>::type;
+/// @} cpp_core_types
+
+} // namespace np
+
+#endif // NUMPY_CORE_SRC_COMMON_NPSTD_HPP
+
diff --git a/numpy/core/src/common/npy_cblas.h b/numpy/core/src/common/npy_cblas.h
index 30fec1a65..751854b6e 100644
--- a/numpy/core/src/common/npy_cblas.h
+++ b/numpy/core/src/common/npy_cblas.h
@@ -66,7 +66,7 @@ enum CBLAS_SIDE {CblasLeft=141, CblasRight=142};
  * Convert NumPy stride to BLAS stride. Returns 0 if conversion cannot be done
  * (BLAS won't handle negative or zero strides the way we want).
  */
-static NPY_INLINE CBLAS_INT
+static inline CBLAS_INT
 blas_stride(npy_intp stride, unsigned itemsize)
 {
     /*
diff --git a/numpy/core/src/common/npy_config.h b/numpy/core/src/common/npy_config.h
index d6886c5ea..715b17777 100644
--- a/numpy/core/src/common/npy_config.h
+++ b/numpy/core/src/common/npy_config.h
@@ -2,6 +2,7 @@
 #define NUMPY_CORE_SRC_COMMON_NPY_CONFIG_H_
 
 #include "config.h"
+#include "npy_cpu_dispatch.h" // brings NPY_HAVE_[CPU features]
 #include "numpy/numpyconfig.h"
 #include "numpy/utils.h"
 #include "numpy/npy_os.h"
diff --git a/numpy/core/src/common/npy_cpu_features.c b/numpy/core/src/common/npy_cpu_features.c
index 773f4af86..64a85f6fb 100644
--- a/numpy/core/src/common/npy_cpu_features.c
+++ b/numpy/core/src/common/npy_cpu_features.c
@@ -1,6 +1,6 @@
 #include "npy_cpu_features.h"
 #include "npy_cpu_dispatch.h" // To guarantee the CPU baseline definitions are in scope.
-#include "numpy/npy_common.h" // for NPY_INLINE
+#include "numpy/npy_common.h"
 #include "numpy/npy_cpu.h" // To guarantee the CPU definitions are in scope.
 
 /******************** Private Definitions *********************/
@@ -14,13 +14,15 @@ static unsigned char npy__cpu_have[NPY_CPU_FEATURE_MAX];
 static void
 npy__cpu_init_features(void);
 /*
- * Disable CPU dispatched features at runtime if environment variable
- * 'NPY_DISABLE_CPU_FEATURES' is defined.
+ * Enable or disable CPU dispatched features at runtime if the environment variable
+ * `NPY_ENABLE_CPU_FEATURES`  or  `NPY_DISABLE_CPU_FEATURES`
+ * depends on the value of boolean parameter `disable`(toggle).
+ *
  * Multiple features can be present, and separated by space, comma, or tab.
- * Raises an error if parsing fails or if the feature was not enabled
+ * Raises an error if parsing fails or if the feature was not enabled or disabled
 */
 static int
-npy__cpu_try_disable_env(void);
+npy__cpu_check_env(int disable, const char *env);
 
 /* Ensure the build's CPU baseline features are supported at runtime */
 static int
@@ -43,9 +45,22 @@ npy_cpu_init(void)
     if (npy__cpu_validate_baseline() < 0) {
         return -1;
     }
-    if (npy__cpu_try_disable_env() < 0) {
+    char *enable_env = getenv("NPY_ENABLE_CPU_FEATURES");
+    char *disable_env = getenv("NPY_DISABLE_CPU_FEATURES");
+    int is_enable = enable_env && enable_env[0];
+    int is_disable = disable_env && disable_env[0];
+    if (is_enable & is_disable) {
+        PyErr_Format(PyExc_ImportError,
+            "Both NPY_DISABLE_CPU_FEATURES and NPY_ENABLE_CPU_FEATURES "
+            "environment variables cannot be set simultaneously."
+        );
         return -1;
     }
+    if (is_enable | is_disable) {
+        if (npy__cpu_check_env(is_disable, is_disable ? disable_env : enable_env) < 0) {
+            return -1;
+        }
+    }
     return 0;
 }
 
@@ -81,12 +96,14 @@ static struct {
                 {NPY_CPU_FEATURE_AVX512VBMI, "AVX512VBMI"},
                 {NPY_CPU_FEATURE_AVX512VBMI2, "AVX512VBMI2"},
                 {NPY_CPU_FEATURE_AVX512BITALG, "AVX512BITALG"},
+                {NPY_CPU_FEATURE_AVX512FP16 , "AVX512FP16"},
                 {NPY_CPU_FEATURE_AVX512_KNL, "AVX512_KNL"},
                 {NPY_CPU_FEATURE_AVX512_KNM, "AVX512_KNM"},
                 {NPY_CPU_FEATURE_AVX512_SKX, "AVX512_SKX"},
                 {NPY_CPU_FEATURE_AVX512_CLX, "AVX512_CLX"},
                 {NPY_CPU_FEATURE_AVX512_CNL, "AVX512_CNL"},
                 {NPY_CPU_FEATURE_AVX512_ICL, "AVX512_ICL"},
+                {NPY_CPU_FEATURE_AVX512_SPR, "AVX512_SPR"},
                 {NPY_CPU_FEATURE_VSX, "VSX"},
                 {NPY_CPU_FEATURE_VSX2, "VSX2"},
                 {NPY_CPU_FEATURE_VSX3, "VSX3"},
@@ -166,7 +183,7 @@ npy_cpu_dispatch_list(void)
  * features that had been configured via --cpu-baseline
  * otherwise it returns 0
 */
-static NPY_INLINE int
+static inline int
 npy__cpu_baseline_fid(const char *feature)
 {
 #if !defined(NPY_DISABLE_OPTIMIZATION) && NPY_WITH_CPU_BASELINE_N > 0
@@ -179,7 +196,7 @@ npy__cpu_baseline_fid(const char *feature)
  * features that had been configured via --cpu-dispatch
  * otherwise it returns 0
 */
-static NPY_INLINE int
+static inline int
 npy__cpu_dispatch_fid(const char *feature)
 {
 #if !defined(NPY_DISABLE_OPTIMIZATION) && NPY_WITH_CPU_DISPATCH_N > 0
@@ -208,7 +225,8 @@ npy__cpu_validate_baseline(void)
         *(fptr-1) = '\0'; // trim the last space
         PyErr_Format(PyExc_RuntimeError,
             "NumPy was built with baseline optimizations: \n"
-            "(" NPY_WITH_CPU_BASELINE ") but your machine doesn't support:\n(%s).",
+            "(" NPY_WITH_CPU_BASELINE ") but your machine "
+            "doesn't support:\n(%s).",
             baseline_failure
         );
         return -1;
@@ -218,27 +236,31 @@ npy__cpu_validate_baseline(void)
 }
 
 static int
-npy__cpu_try_disable_env(void)
-{
-    char *disenv = getenv("NPY_DISABLE_CPU_FEATURES");
-    if (disenv == NULL || disenv[0] == 0) {
-        return 0;
-    }
-    #define NPY__CPU_ENV_ERR_HEAD \
-        "During parsing environment variable 'NPY_DISABLE_CPU_FEATURES':\n"
+npy__cpu_check_env(int disable, const char *env) {
+
+    static const char *names[] = {
+        "enable", "disable",
+        "NPY_ENABLE_CPU_FEATURES", "NPY_DISABLE_CPU_FEATURES",
+        "During parsing environment variable: 'NPY_ENABLE_CPU_FEATURES':\n",
+        "During parsing environment variable: 'NPY_DISABLE_CPU_FEATURES':\n"
+    };
+    disable = disable ? 1 : 0;
+    const char *act_name = names[disable];
+    const char *env_name = names[disable + 2];
+    const char *err_head = names[disable + 4];
 
 #if !defined(NPY_DISABLE_OPTIMIZATION) && NPY_WITH_CPU_DISPATCH_N > 0
     #define NPY__MAX_VAR_LEN 1024 // More than enough for this era
-    size_t var_len = strlen(disenv) + 1;
+    size_t var_len = strlen(env) + 1;
     if (var_len > NPY__MAX_VAR_LEN) {
         PyErr_Format(PyExc_RuntimeError,
-            "Length of environment variable 'NPY_DISABLE_CPU_FEATURES' is %d, only %d accepted",
-            var_len, NPY__MAX_VAR_LEN - 1
+            "Length of environment variable '%s' is %d, only %d accepted",
+            env_name, var_len, NPY__MAX_VAR_LEN
         );
         return -1;
     }
-    char disable_features[NPY__MAX_VAR_LEN];
-    memcpy(disable_features, disenv, var_len);
+    char features[NPY__MAX_VAR_LEN];
+    memcpy(features, env, var_len);
 
     char nexist[NPY__MAX_VAR_LEN];
     char *nexist_cur = &nexist[0];
@@ -248,17 +270,19 @@ npy__cpu_try_disable_env(void)
 
     //comma and space including (htab, vtab, CR, LF, FF)
     const char *delim = ", \t\v\r\n\f";
-    char *feature = strtok(disable_features, delim);
+    char *feature = strtok(features, delim);
     while (feature) {
-        if (npy__cpu_baseline_fid(feature) > 0) {
-            PyErr_Format(PyExc_RuntimeError,
-                NPY__CPU_ENV_ERR_HEAD
-                "You cannot disable CPU feature '%s', since it is part of "
-                "the baseline optimizations:\n"
-                "(" NPY_WITH_CPU_BASELINE ").",
-                feature
-            );
-            return -1;
+        if (npy__cpu_baseline_fid(feature) > 0){
+            if (disable) {
+                PyErr_Format(PyExc_RuntimeError,
+                    "%s"
+                    "You cannot disable CPU feature '%s', since it is part of "
+                    "the baseline optimizations:\n"
+                    "(" NPY_WITH_CPU_BASELINE ").",
+                    err_head, feature
+                );
+                return -1;
+            } goto next;
         }
         // check if the feature is part of dispatched features
         int feature_id = npy__cpu_dispatch_fid(feature);
@@ -275,47 +299,58 @@ npy__cpu_try_disable_env(void)
             notsupp_cur[flen] = ' '; notsupp_cur += flen + 1;
             goto next;
         }
-        // Finally we can disable it
-        npy__cpu_have[feature_id] = 0;
+        // Finally we can disable or mark for enabling
+        npy__cpu_have[feature_id] = disable ? 0:2;
     next:
         feature = strtok(NULL, delim);
     }
+    if (!disable){
+        // Disables any unmarked dispatched feature.
+        #define NPY__CPU_DISABLE_DISPATCH_CB(FEATURE, DUMMY) \
+            if(npy__cpu_have[NPY_CAT(NPY_CPU_FEATURE_, FEATURE)] != 0)\
+            {npy__cpu_have[NPY_CAT(NPY_CPU_FEATURE_, FEATURE)]--;}\
+
+        NPY_WITH_CPU_DISPATCH_CALL(NPY__CPU_DISABLE_DISPATCH_CB, DUMMY) // extra arg for msvc
+    }
 
     *nexist_cur = '\0';
     if (nexist[0] != '\0') {
         *(nexist_cur-1) = '\0'; // trim the last space
-        if (PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
-                NPY__CPU_ENV_ERR_HEAD
-                "You cannot disable CPU features (%s), since "
-                "they are not part of the dispatched optimizations\n"
-                "(" NPY_WITH_CPU_DISPATCH ").",
-                nexist
+        if (PyErr_WarnFormat(PyExc_ImportWarning, 1,
+            "%sYou cannot %s CPU features (%s), since "
+            "they are not part of the dispatched optimizations\n"
+            "(" NPY_WITH_CPU_DISPATCH ").",
+            err_head, act_name, nexist
         ) < 0) {
             return -1;
         }
+        return 0;
     }
 
+    #define NOTSUPP_BODY \
+                "%s" \
+                "You cannot %s CPU features (%s), since " \
+                "they are not supported by your machine.", \
+                err_head, act_name, notsupp
+
     *notsupp_cur = '\0';
     if (notsupp[0] != '\0') {
         *(notsupp_cur-1) = '\0'; // trim the last space
-        if (PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
-                NPY__CPU_ENV_ERR_HEAD
-                "You cannot disable CPU features (%s), since "
-                "they are not supported by your machine.",
-                notsupp
-        ) < 0) {
+        if (!disable){
+            PyErr_Format(PyExc_RuntimeError, NOTSUPP_BODY);
             return -1;
         }
     }
 #else
-    if (PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
-            NPY__CPU_ENV_ERR_HEAD
-            "You cannot use environment variable 'NPY_DISABLE_CPU_FEATURES', since "
+    if (PyErr_WarnFormat(PyExc_ImportWarning, 1,
+            "%s"
+            "You cannot use environment variable '%s', since "
         #ifdef NPY_DISABLE_OPTIMIZATION
-            "the NumPy library was compiled with optimization disabled."
+            "the NumPy library was compiled with optimization disabled.",
         #else
-            "the NumPy library was compiled without any dispatched optimizations."
+            "the NumPy library was compiled without any dispatched optimizations.",
         #endif
+        err_head, env_name, act_name
     ) < 0) {
         return -1;
     }
@@ -506,6 +541,11 @@ npy__cpu_init_features(void)
                                                          npy__cpu_have[NPY_CPU_FEATURE_AVX512VBMI2] &&
                                                          npy__cpu_have[NPY_CPU_FEATURE_AVX512BITALG] &&
                                                          npy__cpu_have[NPY_CPU_FEATURE_AVX512VPOPCNTDQ];
+        // Sapphire Rapids
+        npy__cpu_have[NPY_CPU_FEATURE_AVX512FP16]     = (reg[3] & (1 << 23))  != 0;
+        npy__cpu_have[NPY_CPU_FEATURE_AVX512_SPR]      = npy__cpu_have[NPY_CPU_FEATURE_AVX512_ICL] &&
+                                                         npy__cpu_have[NPY_CPU_FEATURE_AVX512FP16];
+
     }
 }
 
@@ -513,7 +553,10 @@ npy__cpu_init_features(void)
 
 #elif defined(NPY_CPU_PPC64) || defined(NPY_CPU_PPC64LE)
 
-#ifdef __linux__
+#if defined(__linux__) || defined(__FreeBSD__)
+    #ifdef __FreeBSD__
+        #include <machine/cpu.h> // defines PPC_FEATURE_HAS_VSX
+    #endif
     #include <sys/auxv.h>
     #ifndef AT_HWCAP2
         #define AT_HWCAP2 26
@@ -533,12 +576,21 @@ static void
 npy__cpu_init_features(void)
 {
     memset(npy__cpu_have, 0, sizeof(npy__cpu_have[0]) * NPY_CPU_FEATURE_MAX);
+#if defined(__linux__) || defined(__FreeBSD__)
 #ifdef __linux__
     unsigned int hwcap = getauxval(AT_HWCAP);
     if ((hwcap & PPC_FEATURE_HAS_VSX) == 0)
         return;
 
     hwcap = getauxval(AT_HWCAP2);
+#else
+    unsigned long hwcap;
+    elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
+    if ((hwcap & PPC_FEATURE_HAS_VSX) == 0)
+        return;
+
+    elf_aux_info(AT_HWCAP2, &hwcap, sizeof(hwcap));
+#endif // __linux__
     if (hwcap & PPC_FEATURE2_ARCH_3_1)
     {
         npy__cpu_have[NPY_CPU_FEATURE_VSX]  =
@@ -551,7 +603,7 @@ npy__cpu_init_features(void)
     npy__cpu_have[NPY_CPU_FEATURE_VSX2] = (hwcap & PPC_FEATURE2_ARCH_2_07) != 0;
     npy__cpu_have[NPY_CPU_FEATURE_VSX3] = (hwcap & PPC_FEATURE2_ARCH_3_00) != 0;
     npy__cpu_have[NPY_CPU_FEATURE_VSX4] = (hwcap & PPC_FEATURE2_ARCH_3_1) != 0;
-// TODO: AIX, FreeBSD
+// TODO: AIX, OpenBSD
 #else
     npy__cpu_have[NPY_CPU_FEATURE_VSX]  = 1;
     #if defined(NPY_CPU_PPC64LE) || defined(NPY_HAVE_VSX2)
@@ -606,7 +658,7 @@ npy__cpu_init_features(void)
 
 #elif defined(__arm__) || defined(__aarch64__)
 
-static NPY_INLINE void
+static inline void
 npy__cpu_init_features_arm8(void)
 {
     npy__cpu_have[NPY_CPU_FEATURE_NEON]       =
diff --git a/numpy/core/src/common/npy_cpu_features.h b/numpy/core/src/common/npy_cpu_features.h
index 96c543e70..9180670c4 100644
--- a/numpy/core/src/common/npy_cpu_features.h
+++ b/numpy/core/src/common/npy_cpu_features.h
@@ -43,6 +43,7 @@ enum npy_cpu_features
     NPY_CPU_FEATURE_AVX512VNNI        = 42,
     NPY_CPU_FEATURE_AVX512VBMI2       = 43,
     NPY_CPU_FEATURE_AVX512BITALG      = 44,
+    NPY_CPU_FEATURE_AVX512FP16        = 45,
 
     // X86 CPU Groups
     // Knights Landing (F,CD,ER,PF)
@@ -57,6 +58,8 @@ enum npy_cpu_features
     NPY_CPU_FEATURE_AVX512_CNL        = 105,
     // Ice Lake        (F,CD,BW,DQ,VL,IFMA,VBMI,VNNI,VBMI2,BITALG,VPOPCNTDQ)
     NPY_CPU_FEATURE_AVX512_ICL        = 106,
+    // Sapphire Rapids (Ice Lake, AVX512FP16)
+    NPY_CPU_FEATURE_AVX512_SPR        = 107,
 
     // IBM/POWER VSX
     // POWER7
@@ -103,13 +106,25 @@ enum npy_cpu_features
  *  - detects runtime CPU features
  *  - check that baseline CPU features are present
  *  - uses 'NPY_DISABLE_CPU_FEATURES' to disable dispatchable features
+ *  - uses 'NPY_ENABLE_CPU_FEATURES' to enable dispatchable features
  *
  * It will set a RuntimeError when 
  *  - CPU baseline features from the build are not supported at runtime
  *  - 'NPY_DISABLE_CPU_FEATURES' tries to disable a baseline feature
- * and will warn if 'NPY_DISABLE_CPU_FEATURES' tries to disable a feature that
- * is not disabled (the machine or build does not support it, or the project was
- * not built with any feature optimization support)
+ *  - 'NPY_DISABLE_CPU_FEATURES' and 'NPY_ENABLE_CPU_FEATURES' are
+ *    simultaneously set
+ *  - 'NPY_ENABLE_CPU_FEATURES' tries to enable a feature that is not supported
+ *    by the machine or build
+ *  - 'NPY_ENABLE_CPU_FEATURES' tries to enable a feature when the project was
+ *    not built with any feature optimization support
+ *  
+ * It will set an ImportWarning when:
+ *  - 'NPY_DISABLE_CPU_FEATURES' tries to disable a feature that is not supported
+ *    by the machine or build
+ *  - 'NPY_DISABLE_CPU_FEATURES' or 'NPY_ENABLE_CPU_FEATURES' tries to
+ *    disable/enable a feature when the project was not built with any feature
+ *    optimization support
+ * 
  * return 0 on success otherwise return -1
  */
 NPY_VISIBILITY_HIDDEN int
diff --git a/numpy/core/src/common/npy_ctypes.h b/numpy/core/src/common/npy_ctypes.h
index 05761cad3..4b5634574 100644
--- a/numpy/core/src/common/npy_ctypes.h
+++ b/numpy/core/src/common/npy_ctypes.h
@@ -14,7 +14,7 @@
  * This entire function is just a wrapper around the Python function of the
  * same name.
  */
-NPY_INLINE static int
+static inline int
 npy_ctypes_check(PyTypeObject *obj)
 {
     static PyObject *py_func = NULL;
diff --git a/numpy/core/src/common/npy_extint128.h b/numpy/core/src/common/npy_extint128.h
index d563c2ac8..776d71c77 100644
--- a/numpy/core/src/common/npy_extint128.h
+++ b/numpy/core/src/common/npy_extint128.h
@@ -9,7 +9,7 @@ typedef struct {
 
 
 /* Integer addition with overflow checking */
-static NPY_INLINE npy_int64
+static inline npy_int64
 safe_add(npy_int64 a, npy_int64 b, char *overflow_flag)
 {
     if (a > 0 && b > NPY_MAX_INT64 - a) {
@@ -23,7 +23,7 @@ safe_add(npy_int64 a, npy_int64 b, char *overflow_flag)
 
 
 /* Integer subtraction with overflow checking */
-static NPY_INLINE npy_int64
+static inline npy_int64
 safe_sub(npy_int64 a, npy_int64 b, char *overflow_flag)
 {
     if (a >= 0 && b < a - NPY_MAX_INT64) {
@@ -37,7 +37,7 @@ safe_sub(npy_int64 a, npy_int64 b, char *overflow_flag)
 
 
 /* Integer multiplication with overflow checking */
-static NPY_INLINE npy_int64
+static inline npy_int64
 safe_mul(npy_int64 a, npy_int64 b, char *overflow_flag)
 {
     if (a > 0) {
@@ -58,7 +58,7 @@ safe_mul(npy_int64 a, npy_int64 b, char *overflow_flag)
 
 
 /* Long integer init */
-static NPY_INLINE npy_extint128_t
+static inline npy_extint128_t
 to_128(npy_int64 x)
 {
     npy_extint128_t result;
@@ -74,7 +74,7 @@ to_128(npy_int64 x)
 }
 
 
-static NPY_INLINE npy_int64
+static inline npy_int64
 to_64(npy_extint128_t x, char *overflow)
 {
     if (x.hi != 0 ||
@@ -87,7 +87,7 @@ to_64(npy_extint128_t x, char *overflow)
 
 
 /* Long integer multiply */
-static NPY_INLINE npy_extint128_t
+static inline npy_extint128_t
 mul_64_64(npy_int64 a, npy_int64 b)
 {
     npy_extint128_t x, y, z;
@@ -127,7 +127,7 @@ mul_64_64(npy_int64 a, npy_int64 b)
 
 
 /* Long integer add */
-static NPY_INLINE npy_extint128_t
+static inline npy_extint128_t
 add_128(npy_extint128_t x, npy_extint128_t y, char *overflow)
 {
     npy_extint128_t z;
@@ -170,7 +170,7 @@ add_128(npy_extint128_t x, npy_extint128_t y, char *overflow)
 
 
 /* Long integer negation */
-static NPY_INLINE npy_extint128_t
+static inline npy_extint128_t
 neg_128(npy_extint128_t x)
 {
     npy_extint128_t z = x;
@@ -179,14 +179,14 @@ neg_128(npy_extint128_t x)
 }
 
 
-static NPY_INLINE npy_extint128_t
+static inline npy_extint128_t
 sub_128(npy_extint128_t x, npy_extint128_t y, char *overflow)
 {
     return add_128(x, neg_128(y), overflow);
 }
 
 
-static NPY_INLINE npy_extint128_t
+static inline npy_extint128_t
 shl_128(npy_extint128_t v)
 {
     npy_extint128_t z;
@@ -198,7 +198,7 @@ shl_128(npy_extint128_t v)
 }
 
 
-static NPY_INLINE npy_extint128_t
+static inline npy_extint128_t
 shr_128(npy_extint128_t v)
 {
     npy_extint128_t z;
@@ -209,7 +209,7 @@ shr_128(npy_extint128_t v)
     return z;
 }
 
-static NPY_INLINE int
+static inline int
 gt_128(npy_extint128_t a, npy_extint128_t b)
 {
     if (a.sign > 0 && b.sign > 0) {
@@ -228,7 +228,7 @@ gt_128(npy_extint128_t a, npy_extint128_t b)
 
 
 /* Long integer divide */
-static NPY_INLINE npy_extint128_t
+static inline npy_extint128_t
 divmod_128_64(npy_extint128_t x, npy_int64 b, npy_int64 *mod)
 {
     npy_extint128_t remainder, pointer, result, divisor;
@@ -284,7 +284,7 @@ divmod_128_64(npy_extint128_t x, npy_int64 b, npy_int64 *mod)
 
 
 /* Divide and round down (positive divisor; no overflows) */
-static NPY_INLINE npy_extint128_t
+static inline npy_extint128_t
 floordiv_128_64(npy_extint128_t a, npy_int64 b)
 {
     npy_extint128_t result;
@@ -300,7 +300,7 @@ floordiv_128_64(npy_extint128_t a, npy_int64 b)
 
 
 /* Divide and round up (positive divisor; no overflows) */
-static NPY_INLINE npy_extint128_t
+static inline npy_extint128_t
 ceildiv_128_64(npy_extint128_t a, npy_int64 b)
 {
     npy_extint128_t result;
diff --git a/numpy/core/src/common/npy_hashtable.c b/numpy/core/src/common/npy_hashtable.c
index af82c0f85..14f6cca1b 100644
--- a/numpy/core/src/common/npy_hashtable.c
+++ b/numpy/core/src/common/npy_hashtable.c
@@ -35,7 +35,7 @@
  *
  * Users cannot control pointers, so we do not have to worry about DoS attacks?
  */
-static NPY_INLINE Py_hash_t
+static inline Py_hash_t
 identity_list_hash(PyObject *const *v, int len)
 {
     Py_uhash_t acc = _NpyHASH_XXPRIME_5;
@@ -58,7 +58,7 @@ identity_list_hash(PyObject *const *v, int len)
 #undef _NpyHASH_XXROTATE
 
 
-static NPY_INLINE PyObject **
+static inline PyObject **
 find_item(PyArrayIdentityHash const *tb, PyObject *const *key)
 {
     Py_hash_t hash = identity_list_hash(key, tb->key_len);
diff --git a/numpy/core/src/common/npy_import.h b/numpy/core/src/common/npy_import.h
index f36b6924a..58b4ba0bc 100644
--- a/numpy/core/src/common/npy_import.h
+++ b/numpy/core/src/common/npy_import.h
@@ -16,7 +16,7 @@
  * @param attr module attribute to cache.
  * @param cache Storage location for imported function.
  */
-NPY_INLINE static void
+static inline void
 npy_cache_import(const char *module, const char *attr, PyObject **cache)
 {
     if (NPY_UNLIKELY(*cache == NULL)) {
diff --git a/numpy/core/src/common/npy_pycompat.h b/numpy/core/src/common/npy_pycompat.h
index 6641cd591..ce6c34fa1 100644
--- a/numpy/core/src/common/npy_pycompat.h
+++ b/numpy/core/src/common/npy_pycompat.h
@@ -11,7 +11,7 @@
 #if PY_VERSION_HEX > 0x030a00a6
 #define Npy_HashDouble _Py_HashDouble
 #else
-static NPY_INLINE Py_hash_t
+static inline Py_hash_t
 Npy_HashDouble(PyObject *NPY_UNUSED(identity), double val)
 {
     return _Py_HashDouble(val);
diff --git a/numpy/core/src/common/npy_sort.h.src b/numpy/core/src/common/npy_sort.h.src
index a3f556f56..d6e435722 100644
--- a/numpy/core/src/common/npy_sort.h.src
+++ b/numpy/core/src/common/npy_sort.h.src
@@ -9,7 +9,7 @@
 #define NPY_ENOMEM 1
 #define NPY_ECOMP 2
 
-static NPY_INLINE int npy_get_msb(npy_uintp unum)
+static inline int npy_get_msb(npy_uintp unum)
 {
     int depth_limit = 0;
     while (unum >>= 1)  {
diff --git a/numpy/core/src/common/numpyos.c b/numpy/core/src/common/numpyos.c
index 5ee95191d..2fec06e1c 100644
--- a/numpy/core/src/common/numpyos.c
+++ b/numpy/core/src/common/numpyos.c
@@ -779,3 +779,25 @@ NumPyOS_strtoull(const char *str, char **endptr, int base)
     return strtoull(str, endptr, base);
 }
 
+#ifdef _MSC_VER
+
+#include <stdlib.h>
+
+#if _MSC_VER >= 1900
+/* npy3k_compat.h uses this function in the _Py_BEGIN/END_SUPPRESS_IPH
+ * macros. It does not need to be defined when building using MSVC
+ * earlier than 14.0 (_MSC_VER == 1900).
+ */
+
+static void __cdecl _silent_invalid_parameter_handler(
+    wchar_t const* expression,
+    wchar_t const* function,
+    wchar_t const* file,
+    unsigned int line,
+    uintptr_t pReserved) { }
+
+_invalid_parameter_handler _Py_silent_invalid_parameter_handler = _silent_invalid_parameter_handler;
+
+#endif
+
+#endif
diff --git a/numpy/core/src/common/simd/avx2/arithmetic.h b/numpy/core/src/common/simd/avx2/arithmetic.h
index ad9688338..58d842a6d 100644
--- a/numpy/core/src/common/simd/avx2/arithmetic.h
+++ b/numpy/core/src/common/simd/avx2/arithmetic.h
@@ -247,6 +247,10 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
     // negate multiply and subtract, -(a*b) - c
     #define npyv_nmulsub_f32 _mm256_fnmsub_ps
     #define npyv_nmulsub_f64 _mm256_fnmsub_pd
+    // multiply, add for odd elements and subtract even elements.
+    // (a * b) -+ c
+    #define npyv_muladdsub_f32 _mm256_fmaddsub_ps
+    #define npyv_muladdsub_f64 _mm256_fmaddsub_pd
 #else
     // multiply and add, a*b + c
     NPY_FINLINE npyv_f32 npyv_muladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
@@ -274,6 +278,13 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
         npyv_f64 neg_a = npyv_xor_f64(a, npyv_setall_f64(-0.0));
         return npyv_sub_f64(npyv_mul_f64(neg_a, b), c);
     }
+    // multiply, add for odd elements and subtract even elements.
+    // (a * b) -+ c
+    NPY_FINLINE npyv_f32 npyv_muladdsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+    { return _mm256_addsub_ps(npyv_mul_f32(a, b), c); }
+    NPY_FINLINE npyv_f64 npyv_muladdsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+    { return _mm256_addsub_pd(npyv_mul_f64(a, b), c); }
+
 #endif // !NPY_HAVE_FMA3
 
 /***************************
diff --git a/numpy/core/src/common/simd/avx2/avx2.h b/numpy/core/src/common/simd/avx2/avx2.h
index 8cb74df2b..d64f3c6d6 100644
--- a/numpy/core/src/common/simd/avx2/avx2.h
+++ b/numpy/core/src/common/simd/avx2/avx2.h
@@ -11,6 +11,7 @@
     #define NPY_SIMD_FMA3 0 // fast emulated
 #endif
 #define NPY_SIMD_BIGENDIAN 0
+#define NPY_SIMD_CMPSIGNAL 0
 // Enough limit to allow us to use _mm256_i32gather_*
 #define NPY_SIMD_MAXLOAD_STRIDE32 (0x7fffffff / 8)
 
diff --git a/numpy/core/src/common/simd/avx2/memory.h b/numpy/core/src/common/simd/avx2/memory.h
index 410c35dc8..81144a36b 100644
--- a/numpy/core/src/common/simd/avx2/memory.h
+++ b/numpy/core/src/common/simd/avx2/memory.h
@@ -84,17 +84,6 @@ NPY_FINLINE npyv_s32 npyv_loadn_s32(const npy_int32 *ptr, npy_intp stride)
 NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride)
 { return _mm256_castsi256_ps(npyv_loadn_u32((const npy_uint32*)ptr, stride)); }
 //// 64
-#if 0 // slower
-NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
-{
-    const __m256i idx = npyv_set_s64(0, 1*stride, 2*stride, 3*stride);
-    return _mm256_i64gather_epi64((const void*)ptr, idx, 8);
-}
-NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
-{ return npyv_loadn_u64((const npy_uint64*)ptr, stride); }
-NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride)
-{ return _mm256_castsi256_pd(npyv_loadn_u64((const npy_uint64*)ptr, stride)); }
-#endif
 NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride)
 {
     __m128d a0 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)ptr));
@@ -107,6 +96,32 @@ NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
 { return _mm256_castpd_si256(npyv_loadn_f64((const double*)ptr, stride)); }
 NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
 { return _mm256_castpd_si256(npyv_loadn_f64((const double*)ptr, stride)); }
+
+//// 64-bit load over 32-bit stride
+NPY_FINLINE npyv_f32 npyv_loadn2_f32(const float *ptr, npy_intp stride)
+{
+    __m128d a0 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)ptr));
+    __m128d a2 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)(ptr + stride*2)));
+    __m128d a01 = _mm_loadh_pd(a0, (const double*)(ptr + stride));
+    __m128d a23 = _mm_loadh_pd(a2, (const double*)(ptr + stride*3));
+    return _mm256_castpd_ps(_mm256_insertf128_pd(_mm256_castpd128_pd256(a01), a23, 1));
+}
+NPY_FINLINE npyv_u32 npyv_loadn2_u32(const npy_uint32 *ptr, npy_intp stride)
+{ return _mm256_castps_si256(npyv_loadn2_f32((const float*)ptr, stride)); }
+NPY_FINLINE npyv_s32 npyv_loadn2_s32(const npy_int32 *ptr, npy_intp stride)
+{ return _mm256_castps_si256(npyv_loadn2_f32((const float*)ptr, stride)); }
+
+//// 128-bit load over 64-bit stride
+NPY_FINLINE npyv_f64 npyv_loadn2_f64(const double *ptr, npy_intp stride)
+{
+    __m256d a = npyv_loadl_f64(ptr);
+    return _mm256_insertf128_pd(a, _mm_loadu_pd(ptr + stride), 1);
+}
+NPY_FINLINE npyv_u64 npyv_loadn2_u64(const npy_uint64 *ptr, npy_intp stride)
+{ return _mm256_castpd_si256(npyv_loadn2_f64((const double*)ptr, stride)); }
+NPY_FINLINE npyv_s64 npyv_loadn2_s64(const npy_int64 *ptr, npy_intp stride)
+{ return _mm256_castpd_si256(npyv_loadn2_f64((const double*)ptr, stride)); }
+
 /***************************
  * Non-contiguous Store
  ***************************/
@@ -143,6 +158,32 @@ NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
 NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
 { npyv_storen_f64((double*)ptr, stride, _mm256_castsi256_pd(a)); }
 
+//// 64-bit store over 32-bit stride
+NPY_FINLINE void npyv_storen2_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a)
+{
+    __m128d a0 = _mm256_castpd256_pd128(_mm256_castsi256_pd(a));
+    __m128d a1 = _mm256_extractf128_pd(_mm256_castsi256_pd(a), 1);
+    _mm_storel_pd((double*)ptr, a0);
+    _mm_storeh_pd((double*)(ptr + stride), a0);
+    _mm_storel_pd((double*)(ptr + stride*2), a1);
+    _mm_storeh_pd((double*)(ptr + stride*3), a1);
+}
+NPY_FINLINE void npyv_storen2_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a)
+{ npyv_storen2_u32((npy_uint32*)ptr, stride, a); }
+NPY_FINLINE void npyv_storen2_f32(float *ptr, npy_intp stride, npyv_f32 a)
+{ npyv_storen2_u32((npy_uint32*)ptr, stride, _mm256_castps_si256(a)); }
+
+//// 128-bit store over 64-bit stride
+NPY_FINLINE void npyv_storen2_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
+{
+    npyv_storel_u64(ptr, a);
+    npyv_storeh_u64(ptr + stride, a);
+}
+NPY_FINLINE void npyv_storen2_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
+{ npyv_storen2_u64((npy_uint64*)ptr, stride, a); }
+NPY_FINLINE void npyv_storen2_f64(double *ptr, npy_intp stride, npyv_f64 a)
+{ npyv_storen2_u64((npy_uint64*)ptr, stride, _mm256_castpd_si256(a)); }
+
 /*********************************
  * Partial Load
  *********************************/
@@ -174,7 +215,7 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n
     const __m256i steps = npyv_set_s64(0, 1, 2, 3);
     __m256i vnlane  = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
     __m256i mask    = _mm256_cmpgt_epi64(vnlane, steps);
-    __m256i payload = _mm256_maskload_epi64((const void*)ptr, mask);
+    __m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask);
     return _mm256_blendv_epi8(vfill, payload, mask);
 }
 // fill zero to rest lanes
@@ -184,7 +225,45 @@ NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
     const __m256i steps = npyv_set_s64(0, 1, 2, 3);
     __m256i vnlane  = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
     __m256i mask    = _mm256_cmpgt_epi64(vnlane, steps);
-    return _mm256_maskload_epi64((const void*)ptr, mask);
+    return _mm256_maskload_epi64((const long long*)ptr, mask);
+}
+
+//// 64-bit nlane
+NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane,
+                                          npy_int32 fill_lo, npy_int32 fill_hi)
+{
+    assert(nlane > 0);
+    const __m256i vfill = npyv_set_s32(
+        fill_lo, fill_hi, fill_lo, fill_hi,
+        fill_lo, fill_hi, fill_lo, fill_hi
+    );
+    const __m256i steps = npyv_set_s64(0, 1, 2, 3);
+    __m256i vnlane  = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
+    __m256i mask    = _mm256_cmpgt_epi64(vnlane, steps);
+    __m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask);
+    return _mm256_blendv_epi8(vfill, payload, mask);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
+{ return npyv_load_tillz_s64((const npy_int64*)ptr, nlane); }
+
+/// 128-bit nlane
+NPY_FINLINE npyv_u64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
+{
+    assert(nlane > 0);
+    npy_int64 m = -((npy_int64)(nlane > 1));
+    __m256i mask = npyv_set_s64(-1, -1, m, m);
+    return _mm256_maskload_epi64((const long long*)ptr, mask);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_u64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane,
+                                           npy_int64 fill_lo, npy_int64 fill_hi)
+{
+    const __m256i vfill = npyv_set_s64(0, 0, fill_lo, fill_hi);
+    npy_int64 m = -((npy_int64)(nlane > 1));
+    __m256i mask = npyv_set_s64(-1, -1, m, m);
+    __m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask);
+    return _mm256_blendv_epi8(vfill, payload, mask);
 }
 /*********************************
  * Non-contiguous partial load
@@ -216,12 +295,53 @@ npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_
     const __m256i steps = npyv_set_s64(0, 1, 2, 3);
     __m256i vnlane  = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
     __m256i mask    = _mm256_cmpgt_epi64(vnlane, steps);
-    return _mm256_mask_i64gather_epi64(vfill, (const void*)ptr, idx, mask, 8);
+    return _mm256_mask_i64gather_epi64(vfill, (const long long*)ptr, idx, mask, 8);
 }
 // fill zero to rest lanes
 NPY_FINLINE npyv_s64
 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
 { return npyv_loadn_till_s64(ptr, stride, nlane, 0); }
+
+//// 64-bit load over 32-bit stride
+NPY_FINLINE npyv_s64 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane,
+                                                 npy_int32 fill_lo, npy_int32 fill_hi)
+{
+    assert(nlane > 0);
+    const __m256i vfill = npyv_set_s32(
+        fill_lo, fill_hi, fill_lo, fill_hi,
+        fill_lo, fill_hi, fill_lo, fill_hi
+    );
+    const __m256i idx   = npyv_set_s64(0, 1*stride, 2*stride, 3*stride);
+    const __m256i steps = npyv_set_s64(0, 1, 2, 3);
+    __m256i vnlane  = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
+    __m256i mask    = _mm256_cmpgt_epi64(vnlane, steps);
+    return _mm256_mask_i64gather_epi64(vfill, (const long long*)ptr, idx, mask, 4);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
+{ return npyv_loadn2_till_s32(ptr, stride, nlane, 0, 0); }
+
+//// 128-bit load over 64-bit stride
+NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane,
+                                                  npy_int64 fill_lo, npy_int64 fill_hi)
+{
+    assert(nlane > 0);
+    __m256i a = npyv_loadl_s64(ptr);
+#if defined(_MSC_VER) && defined(_M_IX86)
+    __m128i fill =_mm_setr_epi32(
+        (int)fill_lo, (int)(fill_lo >> 32),
+        (int)fill_hi, (int)(fill_hi >> 32)
+    );
+#else
+    __m128i fill = _mm_set_epi64x(fill_hi, fill_lo);
+#endif
+    __m128i b = nlane > 1 ? _mm_loadu_si128((const __m128i*)(ptr + stride)) : fill;
+    return _mm256_inserti128_si256(a, b, 1);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
+{ return npyv_loadn2_till_s64(ptr, stride, nlane, 0, 0); }
+
 /*********************************
  * Partial store
  *********************************/
@@ -241,7 +361,21 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a
     const __m256i steps = npyv_set_s64(0, 1, 2, 3);
     __m256i vnlane = npyv_setall_s64(nlane > 8 ? 8 : (int)nlane);
     __m256i mask   = _mm256_cmpgt_epi64(vnlane, steps);
-    _mm256_maskstore_epi64((void*)ptr, mask, a);
+    _mm256_maskstore_epi64((long long*)ptr, mask, a);
+}
+
+//// 64-bit nlane
+NPY_FINLINE void npyv_store2_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a)
+{ npyv_store_till_s64((npy_int64*)ptr, nlane, a); }
+
+//// 128-bit nlane
+NPY_FINLINE void npyv_store2_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
+{
+    assert(nlane > 0);
+    npyv_storel_s64(ptr, a);
+    if (nlane > 1) {
+        npyv_storeh_s64(ptr + 2, a);
+    }
 }
 /*********************************
  * Non-contiguous partial store
@@ -252,23 +386,52 @@ NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp
     assert(nlane > 0);
     __m128i a0 = _mm256_castsi256_si128(a);
     __m128i a1 = _mm256_extracti128_si256(a, 1);
+
+    ptr[stride*0] = _mm_extract_epi32(a0, 0);
     switch(nlane) {
-    default:
-        ptr[stride*7] = _mm_extract_epi32(a1, 3);
-    case 7:
-        ptr[stride*6] = _mm_extract_epi32(a1, 2);
-    case 6:
-        ptr[stride*5] = _mm_extract_epi32(a1, 1);
+    case 1:
+        return;
+    case 2:
+        ptr[stride*1] = _mm_extract_epi32(a0, 1);
+        return;
+    case 3:
+        ptr[stride*1] = _mm_extract_epi32(a0, 1);
+        ptr[stride*2] = _mm_extract_epi32(a0, 2);
+        return;
+    case 4:
+        ptr[stride*1] = _mm_extract_epi32(a0, 1);
+        ptr[stride*2] = _mm_extract_epi32(a0, 2);
+        ptr[stride*3] = _mm_extract_epi32(a0, 3);
+        return;
     case 5:
+        ptr[stride*1] = _mm_extract_epi32(a0, 1);
+        ptr[stride*2] = _mm_extract_epi32(a0, 2);
+        ptr[stride*3] = _mm_extract_epi32(a0, 3);
         ptr[stride*4] = _mm_extract_epi32(a1, 0);
-    case 4:
+        return;
+    case 6:
+        ptr[stride*1] = _mm_extract_epi32(a0, 1);
+        ptr[stride*2] = _mm_extract_epi32(a0, 2);
         ptr[stride*3] = _mm_extract_epi32(a0, 3);
-    case 3:
+        ptr[stride*4] = _mm_extract_epi32(a1, 0);
+        ptr[stride*5] = _mm_extract_epi32(a1, 1);
+        return;
+    case 7:
+        ptr[stride*1] = _mm_extract_epi32(a0, 1);
         ptr[stride*2] = _mm_extract_epi32(a0, 2);
-    case 2:
+        ptr[stride*3] = _mm_extract_epi32(a0, 3);
+        ptr[stride*4] = _mm_extract_epi32(a1, 0);
+        ptr[stride*5] = _mm_extract_epi32(a1, 1);
+        ptr[stride*6] = _mm_extract_epi32(a1, 2);
+        return;
+    default:
         ptr[stride*1] = _mm_extract_epi32(a0, 1);
-    case 1:
-        ptr[stride*0] = _mm_extract_epi32(a0, 0);
+        ptr[stride*2] = _mm_extract_epi32(a0, 2);
+        ptr[stride*3] = _mm_extract_epi32(a0, 3);
+        ptr[stride*4] = _mm_extract_epi32(a1, 0);
+        ptr[stride*5] = _mm_extract_epi32(a1, 1);
+        ptr[stride*6] = _mm_extract_epi32(a1, 2);
+        ptr[stride*7] = _mm_extract_epi32(a1, 3);
     }
 }
 //// 64
@@ -277,19 +440,60 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
     assert(nlane > 0);
     __m128d a0 = _mm256_castpd256_pd128(_mm256_castsi256_pd(a));
     __m128d a1 = _mm256_extractf128_pd(_mm256_castsi256_pd(a), 1);
+
     double *dptr = (double*)ptr;
+    _mm_storel_pd(dptr, a0);
     switch(nlane) {
-    default:
-        _mm_storeh_pd(dptr + stride * 3, a1);
+    case 1:
+        return;
+    case 2:
+        _mm_storeh_pd(dptr + stride * 1, a0);
+        return;
     case 3:
+        _mm_storeh_pd(dptr + stride * 1, a0);
         _mm_storel_pd(dptr + stride * 2, a1);
-    case 2:
+        return;
+    default:
         _mm_storeh_pd(dptr + stride * 1, a0);
+        _mm_storel_pd(dptr + stride * 2, a1);
+        _mm_storeh_pd(dptr + stride * 3, a1);
+    }
+}
+
+//// 64-bit store over 32-bit stride
+NPY_FINLINE void npyv_storen2_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
+{
+    assert(nlane > 0);
+    __m128d a0 = _mm256_castpd256_pd128(_mm256_castsi256_pd(a));
+    __m128d a1 = _mm256_extractf128_pd(_mm256_castsi256_pd(a), 1);
+
+    _mm_storel_pd((double*)ptr, a0);
+    switch(nlane) {
     case 1:
-        _mm_storel_pd(dptr + stride * 0, a0);
+        return;
+    case 2:
+        _mm_storeh_pd((double*)(ptr + stride * 1), a0);
+        return;
+    case 3:
+        _mm_storeh_pd((double*)(ptr + stride * 1), a0);
+        _mm_storel_pd((double*)(ptr + stride * 2), a1);
+        return;
+    default:
+        _mm_storeh_pd((double*)(ptr + stride * 1), a0);
+        _mm_storel_pd((double*)(ptr + stride * 2), a1);
+        _mm_storeh_pd((double*)(ptr + stride * 3), a1);
     }
 }
 
+//// 128-bit store over 64-bit stride
+NPY_FINLINE void npyv_storen2_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a)
+{
+    assert(nlane > 0);
+    npyv_storel_s64(ptr, a);
+    if (nlane > 1) {
+        npyv_storeh_s64(ptr + stride, a);
+    }
+}
 /*****************************************************************************
  * Implement partial load/store for u32/f32/u64/f64... via reinterpret cast
  *****************************************************************************/
@@ -300,7 +504,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
         union {                                                                             \
             npyv_lanetype_##F_SFX from_##F_SFX;                                             \
             npyv_lanetype_##T_SFX to_##T_SFX;                                               \
-        } pun = {.from_##F_SFX = fill};                                                     \
+        } pun;                                                                              \
+        pun.from_##F_SFX = fill;                                                            \
         return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX(                   \
             (const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX                       \
         ));                                                                                 \
@@ -312,7 +517,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
         union {                                                                             \
             npyv_lanetype_##F_SFX from_##F_SFX;                                             \
             npyv_lanetype_##T_SFX to_##T_SFX;                                               \
-        } pun = {.from_##F_SFX = fill};                                                     \
+        } pun;                                                                              \
+        pun.from_##F_SFX = fill;                                                            \
         return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX(                  \
             (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX               \
         ));                                                                                 \
@@ -353,6 +559,110 @@ NPYV_IMPL_AVX2_REST_PARTIAL_TYPES(f32, s32)
 NPYV_IMPL_AVX2_REST_PARTIAL_TYPES(u64, s64)
 NPYV_IMPL_AVX2_REST_PARTIAL_TYPES(f64, s64)
 
+// 128-bit/64-bit stride (load/store pair)
+#define NPYV_IMPL_AVX2_REST_PARTIAL_TYPES_PAIR(F_SFX, T_SFX)                                \
+    NPY_FINLINE npyv_##F_SFX npyv_load2_till_##F_SFX                                        \
+    (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane,                                     \
+     npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi)                          \
+    {                                                                                       \
+        union pun {                                                                         \
+            npyv_lanetype_##F_SFX from_##F_SFX;                                             \
+            npyv_lanetype_##T_SFX to_##T_SFX;                                               \
+        };                                                                                  \
+        union pun pun_lo;                                                                   \
+        union pun pun_hi;                                                                   \
+        pun_lo.from_##F_SFX = fill_lo;                                                      \
+        pun_hi.from_##F_SFX = fill_hi;                                                      \
+        return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_till_##T_SFX(                  \
+            (const npyv_lanetype_##T_SFX *)ptr, nlane, pun_lo.to_##T_SFX, pun_hi.to_##T_SFX \
+        ));                                                                                 \
+    }                                                                                       \
+    NPY_FINLINE npyv_##F_SFX npyv_loadn2_till_##F_SFX                                       \
+    (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane,                    \
+     npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi)                          \
+    {                                                                                       \
+        union pun {                                                                         \
+            npyv_lanetype_##F_SFX from_##F_SFX;                                             \
+            npyv_lanetype_##T_SFX to_##T_SFX;                                               \
+        };                                                                                  \
+        union pun pun_lo;                                                                   \
+        union pun pun_hi;                                                                   \
+        pun_lo.from_##F_SFX = fill_lo;                                                      \
+        pun_hi.from_##F_SFX = fill_hi;                                                      \
+        return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_till_##T_SFX(                 \
+            (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun_lo.to_##T_SFX,           \
+            pun_hi.to_##T_SFX                                                               \
+        ));                                                                                 \
+    }                                                                                       \
+    NPY_FINLINE npyv_##F_SFX npyv_load2_tillz_##F_SFX                                       \
+    (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane)                                     \
+    {                                                                                       \
+        return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_tillz_##T_SFX(                 \
+            (const npyv_lanetype_##T_SFX *)ptr, nlane                                       \
+        ));                                                                                 \
+    }                                                                                       \
+    NPY_FINLINE npyv_##F_SFX npyv_loadn2_tillz_##F_SFX                                      \
+    (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane)                    \
+    {                                                                                       \
+        return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_tillz_##T_SFX(                \
+            (const npyv_lanetype_##T_SFX *)ptr, stride, nlane                               \
+        ));                                                                                 \
+    }                                                                                       \
+    NPY_FINLINE void npyv_store2_till_##F_SFX                                               \
+    (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a)                           \
+    {                                                                                       \
+        npyv_store2_till_##T_SFX(                                                           \
+            (npyv_lanetype_##T_SFX *)ptr, nlane,                                            \
+            npyv_reinterpret_##T_SFX##_##F_SFX(a)                                           \
+        );                                                                                  \
+    }                                                                                       \
+    NPY_FINLINE void npyv_storen2_till_##F_SFX                                              \
+    (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a)          \
+    {                                                                                       \
+        npyv_storen2_till_##T_SFX(                                                          \
+            (npyv_lanetype_##T_SFX *)ptr, stride, nlane,                                    \
+            npyv_reinterpret_##T_SFX##_##F_SFX(a)                                           \
+        );                                                                                  \
+    }
+
+NPYV_IMPL_AVX2_REST_PARTIAL_TYPES_PAIR(u32, s32)
+NPYV_IMPL_AVX2_REST_PARTIAL_TYPES_PAIR(f32, s32)
+NPYV_IMPL_AVX2_REST_PARTIAL_TYPES_PAIR(u64, s64)
+NPYV_IMPL_AVX2_REST_PARTIAL_TYPES_PAIR(f64, s64)
+
+/************************************************************
+ *  de-interlave load / interleave contiguous store
+ ************************************************************/
+// two channels
+#define NPYV_IMPL_AVX2_MEM_INTERLEAVE(SFX, ZSFX)                             \
+    NPY_FINLINE npyv_##ZSFX##x2 npyv_zip_##ZSFX(npyv_##ZSFX, npyv_##ZSFX);   \
+    NPY_FINLINE npyv_##ZSFX##x2 npyv_unzip_##ZSFX(npyv_##ZSFX, npyv_##ZSFX); \
+    NPY_FINLINE npyv_##SFX##x2 npyv_load_##SFX##x2(                          \
+        const npyv_lanetype_##SFX *ptr                                       \
+    ) {                                                                      \
+        return npyv_unzip_##ZSFX(                                            \
+            npyv_load_##SFX(ptr), npyv_load_##SFX(ptr+npyv_nlanes_##SFX)     \
+        );                                                                   \
+    }                                                                        \
+    NPY_FINLINE void npyv_store_##SFX##x2(                                   \
+        npyv_lanetype_##SFX *ptr, npyv_##SFX##x2 v                           \
+    ) {                                                                      \
+        npyv_##SFX##x2 zip = npyv_zip_##ZSFX(v.val[0], v.val[1]);            \
+        npyv_store_##SFX(ptr, zip.val[0]);                                   \
+        npyv_store_##SFX(ptr + npyv_nlanes_##SFX, zip.val[1]);               \
+    }
+
+NPYV_IMPL_AVX2_MEM_INTERLEAVE(u8, u8)
+NPYV_IMPL_AVX2_MEM_INTERLEAVE(s8, u8)
+NPYV_IMPL_AVX2_MEM_INTERLEAVE(u16, u16)
+NPYV_IMPL_AVX2_MEM_INTERLEAVE(s16, u16)
+NPYV_IMPL_AVX2_MEM_INTERLEAVE(u32, u32)
+NPYV_IMPL_AVX2_MEM_INTERLEAVE(s32, u32)
+NPYV_IMPL_AVX2_MEM_INTERLEAVE(u64, u64)
+NPYV_IMPL_AVX2_MEM_INTERLEAVE(s64, u64)
+NPYV_IMPL_AVX2_MEM_INTERLEAVE(f32, f32)
+NPYV_IMPL_AVX2_MEM_INTERLEAVE(f64, f64)
+
 /*********************************
  * Lookup tables
  *********************************/
diff --git a/numpy/core/src/common/simd/avx2/operators.h b/numpy/core/src/common/simd/avx2/operators.h
index c10267b21..7b9b6a344 100644
--- a/numpy/core/src/common/simd/avx2/operators.h
+++ b/numpy/core/src/common/simd/avx2/operators.h
@@ -205,7 +205,7 @@ NPY_FINLINE __m256i npyv_cmpge_u32(__m256i a, __m256i b)
 #define npyv_cmple_u64(A, B) npyv_cmpge_u64(B, A)
 #define npyv_cmple_s64(A, B) npyv_cmpge_s64(B, A)
 
-// precision comparison
+// precision comparison (ordered)
 #define npyv_cmpeq_f32(A, B)  _mm256_castps_si256(_mm256_cmp_ps(A, B, _CMP_EQ_OQ))
 #define npyv_cmpeq_f64(A, B)  _mm256_castpd_si256(_mm256_cmp_pd(A, B, _CMP_EQ_OQ))
 #define npyv_cmpneq_f32(A, B) _mm256_castps_si256(_mm256_cmp_ps(A, B, _CMP_NEQ_UQ))
diff --git a/numpy/core/src/common/simd/avx2/reorder.h b/numpy/core/src/common/simd/avx2/reorder.h
index 4d6ec8f75..9ebe0e7f4 100644
--- a/numpy/core/src/common/simd/avx2/reorder.h
+++ b/numpy/core/src/common/simd/avx2/reorder.h
@@ -94,6 +94,75 @@ NPY_FINLINE npyv_f64x2 npyv_zip_f64(__m256d a, __m256d b)
     return npyv_combine_f64(ab0, ab1);
 }
 
+// deinterleave two vectors
+NPY_FINLINE npyv_u8x2 npyv_unzip_u8(npyv_u8 ab0, npyv_u8 ab1)
+{
+    const __m256i idx = _mm256_setr_epi8(
+        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+    );
+    __m256i ab_03 = _mm256_shuffle_epi8(ab0, idx);
+    __m256i ab_12 = _mm256_shuffle_epi8(ab1, idx);
+    npyv_u8x2 ab_lh = npyv_combine_u8(ab_03, ab_12);
+    npyv_u8x2 r;
+    r.val[0] = _mm256_unpacklo_epi64(ab_lh.val[0], ab_lh.val[1]);
+    r.val[1] = _mm256_unpackhi_epi64(ab_lh.val[0], ab_lh.val[1]);
+    return r;
+}
+#define npyv_unzip_s8 npyv_unzip_u8
+
+NPY_FINLINE npyv_u16x2 npyv_unzip_u16(npyv_u16 ab0, npyv_u16 ab1)
+{
+    const __m256i idx = _mm256_setr_epi8(
+        0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15,
+        0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15
+    );
+    __m256i ab_03 = _mm256_shuffle_epi8(ab0, idx);
+    __m256i ab_12 = _mm256_shuffle_epi8(ab1, idx);
+    npyv_u16x2 ab_lh = npyv_combine_u16(ab_03, ab_12);
+    npyv_u16x2 r;
+    r.val[0] = _mm256_unpacklo_epi64(ab_lh.val[0], ab_lh.val[1]);
+    r.val[1] = _mm256_unpackhi_epi64(ab_lh.val[0], ab_lh.val[1]);
+    return r;
+}
+#define npyv_unzip_s16 npyv_unzip_u16
+
+NPY_FINLINE npyv_u32x2 npyv_unzip_u32(npyv_u32 ab0, npyv_u32 ab1)
+{
+    const __m256i idx = npyv_set_u32(0, 2, 4, 6, 1, 3, 5, 7);
+    __m256i abl = _mm256_permutevar8x32_epi32(ab0, idx);
+    __m256i abh = _mm256_permutevar8x32_epi32(ab1, idx);
+    return npyv_combine_u32(abl, abh);
+}
+#define npyv_unzip_s32 npyv_unzip_u32
+
+NPY_FINLINE npyv_u64x2 npyv_unzip_u64(npyv_u64 ab0, npyv_u64 ab1)
+{
+    npyv_u64x2 ab_lh = npyv_combine_u64(ab0, ab1);
+    npyv_u64x2 r;
+    r.val[0] = _mm256_unpacklo_epi64(ab_lh.val[0], ab_lh.val[1]);
+    r.val[1] = _mm256_unpackhi_epi64(ab_lh.val[0], ab_lh.val[1]);
+    return r;
+}
+#define npyv_unzip_s64 npyv_unzip_u64
+
+NPY_FINLINE npyv_f32x2 npyv_unzip_f32(npyv_f32 ab0, npyv_f32 ab1)
+{
+    const __m256i idx = npyv_set_u32(0, 2, 4, 6, 1, 3, 5, 7);
+    __m256 abl = _mm256_permutevar8x32_ps(ab0, idx);
+    __m256 abh = _mm256_permutevar8x32_ps(ab1, idx);
+    return npyv_combine_f32(abl, abh);
+}
+
+NPY_FINLINE npyv_f64x2 npyv_unzip_f64(npyv_f64 ab0, npyv_f64 ab1)
+{
+    npyv_f64x2 ab_lh = npyv_combine_f64(ab0, ab1);
+    npyv_f64x2 r;
+    r.val[0] = _mm256_unpacklo_pd(ab_lh.val[0], ab_lh.val[1]);
+    r.val[1] = _mm256_unpackhi_pd(ab_lh.val[0], ab_lh.val[1]);
+    return r;
+}
+
 // Reverse elements of each 64-bit lane
 NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a)
 {
@@ -126,4 +195,22 @@ NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a)
     return _mm256_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1));
 }
 
+// Permuting the elements of each 128-bit lane by immediate index for
+// each element.
+#define npyv_permi128_u32(A, E0, E1, E2, E3) \
+    _mm256_shuffle_epi32(A, _MM_SHUFFLE(E3, E2, E1, E0))
+
+#define npyv_permi128_s32 npyv_permi128_u32
+
+#define npyv_permi128_u64(A, E0, E1) \
+    _mm256_shuffle_epi32(A, _MM_SHUFFLE(((E1)<<1)+1, ((E1)<<1), ((E0)<<1)+1, ((E0)<<1)))
+
+#define npyv_permi128_s64 npyv_permi128_u64
+
+#define npyv_permi128_f32(A, E0, E1, E2, E3) \
+    _mm256_permute_ps(A, _MM_SHUFFLE(E3, E2, E1, E0))
+
+#define npyv_permi128_f64(A, E0, E1) \
+    _mm256_permute_pd(A, ((E1)<<3) | ((E0)<<2) | ((E1)<<1) | (E0))
+
 #endif // _NPY_SIMD_AVX2_REORDER_H
diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h b/numpy/core/src/common/simd/avx512/arithmetic.h
index 1290dc0ad..a63da87d0 100644
--- a/numpy/core/src/common/simd/avx512/arithmetic.h
+++ b/numpy/core/src/common/simd/avx512/arithmetic.h
@@ -349,6 +349,10 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
 // negate multiply and subtract, -(a*b) - c
 #define npyv_nmulsub_f32 _mm512_fnmsub_ps
 #define npyv_nmulsub_f64 _mm512_fnmsub_pd
+// multiply, add for odd elements and subtract even elements.
+// (a * b) -+ c
+#define npyv_muladdsub_f32 _mm512_fmaddsub_ps
+#define npyv_muladdsub_f64 _mm512_fmaddsub_pd
 
 /***************************
  * Summation: Calculates the sum of all vector elements.
diff --git a/numpy/core/src/common/simd/avx512/avx512.h b/numpy/core/src/common/simd/avx512/avx512.h
index 0946e6443..aa6abe256 100644
--- a/numpy/core/src/common/simd/avx512/avx512.h
+++ b/numpy/core/src/common/simd/avx512/avx512.h
@@ -7,6 +7,7 @@
 #define NPY_SIMD_F64 1
 #define NPY_SIMD_FMA3 1 // native support
 #define NPY_SIMD_BIGENDIAN 0
+#define NPY_SIMD_CMPSIGNAL 0
 // Enough limit to allow us to use _mm512_i32gather_* and _mm512_i32scatter_*
 #define NPY_SIMD_MAXLOAD_STRIDE32  (0x7fffffff / 16)
 #define NPY_SIMD_MAXSTORE_STRIDE32 (0x7fffffff / 16)
diff --git a/numpy/core/src/common/simd/avx512/maskop.h b/numpy/core/src/common/simd/avx512/maskop.h
index d1c188390..88fa4a68c 100644
--- a/numpy/core/src/common/simd/avx512/maskop.h
+++ b/numpy/core/src/common/simd/avx512/maskop.h
@@ -51,4 +51,17 @@ NPYV_IMPL_AVX512_MASK_ADDSUB(s64, b64, epi64)
 NPYV_IMPL_AVX512_MASK_ADDSUB(f32, b32, ps)
 NPYV_IMPL_AVX512_MASK_ADDSUB(f64, b64, pd)
 
+// division, m ? a / b : c
+NPY_FINLINE npyv_f32 npyv_ifdiv_f32(npyv_b32 m, npyv_f32 a, npyv_f32 b, npyv_f32 c)
+{ return _mm512_mask_div_ps(c, m, a, b); }
+// conditional division, m ? a / b : 0
+NPY_FINLINE npyv_f32 npyv_ifdivz_f32(npyv_b32 m, npyv_f32 a, npyv_f32 b)
+{ return _mm512_maskz_div_ps(m, a, b); }
+// division, m ? a / b : c
+NPY_FINLINE npyv_f64 npyv_ifdiv_f64(npyv_b32 m, npyv_f64 a, npyv_f64 b, npyv_f64 c)
+{ return _mm512_mask_div_pd(c, m, a, b); }
+// conditional division, m ? a / b : 0
+NPY_FINLINE npyv_f64 npyv_ifdivz_f64(npyv_b32 m, npyv_f64 a, npyv_f64 b)
+{ return _mm512_maskz_div_pd(m, a, b); }
+
 #endif // _NPY_SIMD_AVX512_MASKOP_H
diff --git a/numpy/core/src/common/simd/avx512/memory.h b/numpy/core/src/common/simd/avx512/memory.h
index 03fcb4630..fdf96a92c 100644
--- a/numpy/core/src/common/simd/avx512/memory.h
+++ b/numpy/core/src/common/simd/avx512/memory.h
@@ -120,6 +120,52 @@ NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
 { return npyv_loadn_u64((const npy_uint64*)ptr, stride); }
 NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride)
 { return _mm512_castsi512_pd(npyv_loadn_u64((const npy_uint64*)ptr, stride)); }
+
+//// 64-bit load over 32-bit stride
+NPY_FINLINE npyv_u32 npyv_loadn2_u32(const npy_uint32 *ptr, npy_intp stride)
+{
+    __m128d a = _mm_loadh_pd(
+        _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)ptr)),
+        (const double*)(ptr + stride)
+    );
+    __m128d b = _mm_loadh_pd(
+        _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)(ptr + stride*2))),
+        (const double*)(ptr + stride*3)
+    );
+    __m128d c = _mm_loadh_pd(
+        _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)(ptr + stride*4))),
+        (const double*)(ptr + stride*5)
+    );
+    __m128d d = _mm_loadh_pd(
+        _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)(ptr + stride*6))),
+        (const double*)(ptr + stride*7)
+    );
+    return _mm512_castpd_si512(npyv512_combine_pd256(
+        _mm256_insertf128_pd(_mm256_castpd128_pd256(a), b, 1),
+        _mm256_insertf128_pd(_mm256_castpd128_pd256(c), d, 1)
+    ));
+}
+NPY_FINLINE npyv_s32 npyv_loadn2_s32(const npy_int32 *ptr, npy_intp stride)
+{ return npyv_loadn2_u32((const npy_uint32*)ptr, stride); }
+NPY_FINLINE npyv_f32 npyv_loadn2_f32(const float *ptr, npy_intp stride)
+{ return _mm512_castsi512_ps(npyv_loadn2_u32((const npy_uint32*)ptr, stride)); }
+
+//// 128-bit load over 64-bit stride
+NPY_FINLINE npyv_f64 npyv_loadn2_f64(const double *ptr, npy_intp stride)
+{
+    __m128d a = _mm_loadu_pd(ptr);
+    __m128d b = _mm_loadu_pd(ptr + stride);
+    __m128d c = _mm_loadu_pd(ptr + stride * 2);
+    __m128d d = _mm_loadu_pd(ptr + stride * 3);
+    return npyv512_combine_pd256(
+        _mm256_insertf128_pd(_mm256_castpd128_pd256(a), b, 1),
+        _mm256_insertf128_pd(_mm256_castpd128_pd256(c), d, 1)
+    );
+}
+NPY_FINLINE npyv_u64 npyv_loadn2_u64(const npy_uint64 *ptr, npy_intp stride)
+{ return npyv_reinterpret_u64_f64(npyv_loadn2_f64((const double*)ptr, stride)); }
+NPY_FINLINE npyv_s64 npyv_loadn2_s64(const npy_int64 *ptr, npy_intp stride)
+{ return npyv_loadn2_u64((const npy_uint64*)ptr, stride); }
 /***************************
  * Non-contiguous Store
  ***************************/
@@ -151,6 +197,48 @@ NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
 NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a)
 { npyv_storen_u64((npy_uint64*)ptr, stride, _mm512_castpd_si512(a)); }
 
+//// 64-bit store over 32-bit stride
+NPY_FINLINE void npyv_storen2_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a)
+{
+    __m256d lo = _mm512_castpd512_pd256(_mm512_castsi512_pd(a));
+    __m256d hi = _mm512_extractf64x4_pd(_mm512_castsi512_pd(a), 1);
+    __m128d e0 = _mm256_castpd256_pd128(lo);
+    __m128d e1 = _mm256_extractf128_pd(lo, 1);
+    __m128d e2 = _mm256_castpd256_pd128(hi);
+    __m128d e3 = _mm256_extractf128_pd(hi, 1);
+    _mm_storel_pd((double*)(ptr + stride * 0), e0);
+    _mm_storeh_pd((double*)(ptr + stride * 1), e0);
+    _mm_storel_pd((double*)(ptr + stride * 2), e1);
+    _mm_storeh_pd((double*)(ptr + stride * 3), e1);
+    _mm_storel_pd((double*)(ptr + stride * 4), e2);
+    _mm_storeh_pd((double*)(ptr + stride * 5), e2);
+    _mm_storel_pd((double*)(ptr + stride * 6), e3);
+    _mm_storeh_pd((double*)(ptr + stride * 7), e3);
+}
+NPY_FINLINE void npyv_storen2_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a)
+{ npyv_storen2_u32((npy_uint32*)ptr, stride, a); }
+NPY_FINLINE void npyv_storen2_f32(float *ptr, npy_intp stride, npyv_f32 a)
+{ npyv_storen2_u32((npy_uint32*)ptr, stride, _mm512_castps_si512(a)); }
+
+//// 128-bit store over 64-bit stride
+NPY_FINLINE void npyv_storen2_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
+{
+    __m256i lo = npyv512_lower_si256(a);
+    __m256i hi = npyv512_higher_si256(a);
+    __m128i e0 = _mm256_castsi256_si128(lo);
+    __m128i e1 = _mm256_extracti128_si256(lo, 1);
+    __m128i e2 = _mm256_castsi256_si128(hi);
+    __m128i e3 = _mm256_extracti128_si256(hi, 1);
+    _mm_storeu_si128((__m128i*)(ptr + stride * 0), e0);
+    _mm_storeu_si128((__m128i*)(ptr + stride * 1), e1);
+    _mm_storeu_si128((__m128i*)(ptr + stride * 2), e2);
+    _mm_storeu_si128((__m128i*)(ptr + stride * 3), e3);
+}
+NPY_FINLINE void npyv_storen2_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
+{ npyv_storen2_u64((npy_uint64*)ptr, stride, a); }
+NPY_FINLINE void npyv_storen2_f64(double *ptr, npy_intp stride, npyv_f64 a)
+{ npyv_storen2_u64((npy_uint64*)ptr, stride, _mm512_castpd_si512(a)); }
+
 /*********************************
  * Partial Load
  *********************************/
@@ -159,14 +247,14 @@ NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, n
 {
     assert(nlane > 0);
     const __m512i vfill = _mm512_set1_epi32(fill);
-    const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
+    const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
     return _mm512_mask_loadu_epi32(vfill, mask, (const __m512i*)ptr);
 }
 // fill zero to rest lanes
 NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
 {
     assert(nlane > 0);
-    const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
+    const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
     return _mm512_maskz_loadu_epi32(mask, (const __m512i*)ptr);
 }
 //// 64
@@ -174,14 +262,44 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n
 {
     assert(nlane > 0);
     const __m512i vfill = npyv_setall_s64(fill);
-    const __mmask8 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
+    const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
     return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr);
 }
 // fill zero to rest lanes
 NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
 {
     assert(nlane > 0);
-    const __mmask8 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
+    const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
+    return _mm512_maskz_loadu_epi64(mask, (const __m512i*)ptr);
+}
+
+//// 64-bit nlane
+NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane,
+                                          npy_int32 fill_lo, npy_int32 fill_hi)
+{
+    assert(nlane > 0);
+    const __m512i vfill = _mm512_set4_epi32(fill_hi, fill_lo, fill_hi, fill_lo);
+    const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
+    return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
+{ return npyv_load_tillz_s64((const npy_int64*)ptr, nlane); }
+
+//// 128-bit nlane
+NPY_FINLINE npyv_u64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane,
+                                           npy_int64 fill_lo, npy_int64 fill_hi)
+{
+    assert(nlane > 0);
+    const __m512i vfill = _mm512_set4_epi64(fill_hi, fill_lo, fill_hi, fill_lo);
+    const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1;
+    return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
+{
+    assert(nlane > 0);
+    const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1;
     return _mm512_maskz_loadu_epi64(mask, (const __m512i*)ptr);
 }
 /*********************************
@@ -198,7 +316,7 @@ npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_
     );
     const __m512i idx = _mm512_mullo_epi32(steps, _mm512_set1_epi32((int)stride));
     const __m512i vfill = _mm512_set1_epi32(fill);
-    const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
+    const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
     return _mm512_mask_i32gather_epi32(vfill, mask, idx, (const __m512i*)ptr, 4);
 }
 // fill zero to rest lanes
@@ -215,13 +333,48 @@ npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_
         4*stride, 5*stride, 6*stride, 7*stride
     );
     const __m512i vfill = npyv_setall_s64(fill);
-    const __mmask8 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
+    const __mmask8 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
     return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 8);
 }
 // fill zero to rest lanes
 NPY_FINLINE npyv_s64
 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
 { return npyv_loadn_till_s64(ptr, stride, nlane, 0); }
+
+//// 64-bit load over 32-bit stride
+NPY_FINLINE npyv_s64 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane,
+                                                 npy_int32 fill_lo, npy_int32 fill_hi)
+{
+    assert(nlane > 0);
+    const __m512i idx = npyv_set_s64(
+        0*stride, 1*stride, 2*stride, 3*stride,
+        4*stride, 5*stride, 6*stride, 7*stride
+    );
+    const __m512i vfill = _mm512_set4_epi32(fill_hi, fill_lo, fill_hi, fill_lo);
+    const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
+    return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 4);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
+{ return npyv_loadn2_till_s32(ptr, stride, nlane, 0, 0); }
+
+//// 128-bit load over 64-bit stride
+NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane,
+                                                  npy_int64 fill_lo, npy_int64 fill_hi)
+{
+    assert(nlane > 0);
+    const __m512i idx = npyv_set_s64(
+       0,        1,          stride,   stride+1,
+       stride*2, stride*2+1, stride*3, stride*3+1
+    );
+    const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1;
+    const __m512i vfill = _mm512_set4_epi64(fill_hi, fill_lo, fill_hi, fill_lo);
+    return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 8);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
+{ return npyv_loadn2_till_s64(ptr, stride, nlane, 0, 0); }
+
 /*********************************
  * Partial store
  *********************************/
@@ -229,14 +382,30 @@ npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
 NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a)
 {
     assert(nlane > 0);
-    const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
+    const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
     _mm512_mask_storeu_epi32((__m512i*)ptr, mask, a);
 }
 //// 64
 NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
 {
     assert(nlane > 0);
-    const __mmask8 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
+    const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
+    _mm512_mask_storeu_epi64((__m512i*)ptr, mask, a);
+}
+
+//// 64-bit nlane
+NPY_FINLINE void npyv_store2_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a)
+{
+    assert(nlane > 0);
+    const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
+    _mm512_mask_storeu_epi64((__m512i*)ptr, mask, a);
+}
+
+//// 128-bit nlane
+NPY_FINLINE void npyv_store2_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
+{
+    assert(nlane > 0);
+    const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1;
     _mm512_mask_storeu_epi64((__m512i*)ptr, mask, a);
 }
 /*********************************
@@ -251,7 +420,7 @@ NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp
         0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
     );
     const __m512i idx = _mm512_mullo_epi32(steps, _mm512_set1_epi32((int)stride));
-    const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
+    const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
     _mm512_mask_i32scatter_epi32((__m512i*)ptr, mask, idx, a, 4);
 }
 //// 64
@@ -262,7 +431,31 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
         0*stride, 1*stride, 2*stride, 3*stride,
         4*stride, 5*stride, 6*stride, 7*stride
     );
-    const __mmask8 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
+    const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
+    _mm512_mask_i64scatter_epi64((__m512i*)ptr, mask, idx, a, 8);
+}
+
+//// 64-bit store over 32-bit stride
+NPY_FINLINE void npyv_storen2_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
+{
+    assert(nlane > 0);
+    const __m512i idx = npyv_set_s64(
+        0*stride, 1*stride, 2*stride, 3*stride,
+        4*stride, 5*stride, 6*stride, 7*stride
+    );
+    const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
+    _mm512_mask_i64scatter_epi64((__m512i*)ptr, mask, idx, a, 4);
+}
+
+//// 128-bit store over 64-bit stride
+NPY_FINLINE void npyv_storen2_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a)
+{
+    assert(nlane > 0);
+    const __m512i idx = npyv_set_s64(
+        0,        1,            stride,   stride+1,
+        2*stride, 2*stride+1, 3*stride, 3*stride+1
+    );
+    const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1;
     _mm512_mask_i64scatter_epi64((__m512i*)ptr, mask, idx, a, 8);
 }
 
@@ -331,6 +524,110 @@ NPYV_IMPL_AVX512_REST_PARTIAL_TYPES(f32, s32)
 NPYV_IMPL_AVX512_REST_PARTIAL_TYPES(u64, s64)
 NPYV_IMPL_AVX512_REST_PARTIAL_TYPES(f64, s64)
 
+// 128-bit/64-bit stride (pair load/store)
+#define NPYV_IMPL_AVX512_REST_PARTIAL_TYPES_PAIR(F_SFX, T_SFX)                              \
+    NPY_FINLINE npyv_##F_SFX npyv_load2_till_##F_SFX                                        \
+    (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane,                                     \
+     npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi)                          \
+    {                                                                                       \
+        union pun {                                                                         \
+            npyv_lanetype_##F_SFX from_##F_SFX;                                             \
+            npyv_lanetype_##T_SFX to_##T_SFX;                                               \
+        };                                                                                  \
+        union pun pun_lo;                                                                   \
+        union pun pun_hi;                                                                   \
+        pun_lo.from_##F_SFX = fill_lo;                                                      \
+        pun_hi.from_##F_SFX = fill_hi;                                                      \
+        return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_till_##T_SFX(                  \
+            (const npyv_lanetype_##T_SFX *)ptr, nlane, pun_lo.to_##T_SFX, pun_hi.to_##T_SFX \
+        ));                                                                                 \
+    }                                                                                       \
+    NPY_FINLINE npyv_##F_SFX npyv_loadn2_till_##F_SFX                                       \
+    (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane,                    \
+     npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi)                          \
+    {                                                                                       \
+        union pun {                                                                         \
+            npyv_lanetype_##F_SFX from_##F_SFX;                                             \
+            npyv_lanetype_##T_SFX to_##T_SFX;                                               \
+        };                                                                                  \
+        union pun pun_lo;                                                                   \
+        union pun pun_hi;                                                                   \
+        pun_lo.from_##F_SFX = fill_lo;                                                      \
+        pun_hi.from_##F_SFX = fill_hi;                                                      \
+        return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_till_##T_SFX(                 \
+            (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun_lo.to_##T_SFX,           \
+            pun_hi.to_##T_SFX                                                               \
+        ));                                                                                 \
+    }                                                                                       \
+    NPY_FINLINE npyv_##F_SFX npyv_load2_tillz_##F_SFX                                       \
+    (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane)                                     \
+    {                                                                                       \
+        return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_tillz_##T_SFX(                 \
+            (const npyv_lanetype_##T_SFX *)ptr, nlane                                       \
+        ));                                                                                 \
+    }                                                                                       \
+    NPY_FINLINE npyv_##F_SFX npyv_loadn2_tillz_##F_SFX                                      \
+    (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane)                    \
+    {                                                                                       \
+        return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_tillz_##T_SFX(                \
+            (const npyv_lanetype_##T_SFX *)ptr, stride, nlane                               \
+        ));                                                                                 \
+    }                                                                                       \
+    NPY_FINLINE void npyv_store2_till_##F_SFX                                               \
+    (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a)                           \
+    {                                                                                       \
+        npyv_store2_till_##T_SFX(                                                           \
+            (npyv_lanetype_##T_SFX *)ptr, nlane,                                            \
+            npyv_reinterpret_##T_SFX##_##F_SFX(a)                                           \
+        );                                                                                  \
+    }                                                                                       \
+    NPY_FINLINE void npyv_storen2_till_##F_SFX                                              \
+    (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a)          \
+    {                                                                                       \
+        npyv_storen2_till_##T_SFX(                                                          \
+            (npyv_lanetype_##T_SFX *)ptr, stride, nlane,                                    \
+            npyv_reinterpret_##T_SFX##_##F_SFX(a)                                           \
+        );                                                                                  \
+    }
+
+NPYV_IMPL_AVX512_REST_PARTIAL_TYPES_PAIR(u32, s32)
+NPYV_IMPL_AVX512_REST_PARTIAL_TYPES_PAIR(f32, s32)
+NPYV_IMPL_AVX512_REST_PARTIAL_TYPES_PAIR(u64, s64)
+NPYV_IMPL_AVX512_REST_PARTIAL_TYPES_PAIR(f64, s64)
+
+/************************************************************
+ *  de-interlave load / interleave contiguous store
+ ************************************************************/
+// two channels
+#define NPYV_IMPL_AVX512_MEM_INTERLEAVE(SFX, ZSFX)                           \
+    NPY_FINLINE npyv_##ZSFX##x2 npyv_zip_##ZSFX(npyv_##ZSFX, npyv_##ZSFX);   \
+    NPY_FINLINE npyv_##ZSFX##x2 npyv_unzip_##ZSFX(npyv_##ZSFX, npyv_##ZSFX); \
+    NPY_FINLINE npyv_##SFX##x2 npyv_load_##SFX##x2(                          \
+        const npyv_lanetype_##SFX *ptr                                       \
+    ) {                                                                      \
+        return npyv_unzip_##ZSFX(                                            \
+            npyv_load_##SFX(ptr), npyv_load_##SFX(ptr+npyv_nlanes_##SFX)     \
+        );                                                                   \
+    }                                                                        \
+    NPY_FINLINE void npyv_store_##SFX##x2(                                   \
+        npyv_lanetype_##SFX *ptr, npyv_##SFX##x2 v                           \
+    ) {                                                                      \
+        npyv_##SFX##x2 zip = npyv_zip_##ZSFX(v.val[0], v.val[1]);            \
+        npyv_store_##SFX(ptr, zip.val[0]);                                   \
+        npyv_store_##SFX(ptr + npyv_nlanes_##SFX, zip.val[1]);               \
+    }
+
+NPYV_IMPL_AVX512_MEM_INTERLEAVE(u8, u8)
+NPYV_IMPL_AVX512_MEM_INTERLEAVE(s8, u8)
+NPYV_IMPL_AVX512_MEM_INTERLEAVE(u16, u16)
+NPYV_IMPL_AVX512_MEM_INTERLEAVE(s16, u16)
+NPYV_IMPL_AVX512_MEM_INTERLEAVE(u32, u32)
+NPYV_IMPL_AVX512_MEM_INTERLEAVE(s32, u32)
+NPYV_IMPL_AVX512_MEM_INTERLEAVE(u64, u64)
+NPYV_IMPL_AVX512_MEM_INTERLEAVE(s64, u64)
+NPYV_IMPL_AVX512_MEM_INTERLEAVE(f32, f32)
+NPYV_IMPL_AVX512_MEM_INTERLEAVE(f64, f64)
+
 /**************************************************
  * Lookup table
  *************************************************/
diff --git a/numpy/core/src/common/simd/avx512/reorder.h b/numpy/core/src/common/simd/avx512/reorder.h
index c0b2477f3..27e66b5e7 100644
--- a/numpy/core/src/common/simd/avx512/reorder.h
+++ b/numpy/core/src/common/simd/avx512/reorder.h
@@ -167,6 +167,140 @@ NPY_FINLINE npyv_f64x2 npyv_zip_f64(__m512d a, __m512d b)
     return r;
 }
 
+// deinterleave two vectors
+NPY_FINLINE npyv_u8x2 npyv_unzip_u8(npyv_u8 ab0, npyv_u8 ab1)
+{
+    npyv_u8x2 r;
+#ifdef NPY_HAVE_AVX512VBMI
+    const __m512i idx_a = npyv_set_u8(
+        0,  2,  4,   6,   8,   10,  12,  14,  16,  18,  20,  22,  24,  26,  28,  30,
+        32, 34, 36,  38,  40,  42,  44,  46,  48,  50,  52,  54,  56,  58,  60,  62,
+        64, 66, 68,  70,  72,  74,  76,  78,  80,  82,  84,  86,  88,  90,  92,  94,
+        96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126
+    );
+    const __m512i idx_b = npyv_set_u8(
+        1,  3,  5,   7,   9,   11,  13,  15,  17,  19,  21,  23,  25,  27,  29,  31,
+        33, 35, 37,  39,  41,  43,  45,  47,  49,  51,  53,  55,  57,  59,  61,  63,
+        65, 67, 69,  71,  73,  75,  77,  79,  81,  83,  85,  87,  89,  91,  93,  95,
+        97, 99, 101, 103, 105, 107, 109, 111, 113, 115, 117, 119, 121, 123, 125, 127
+    );
+    r.val[0] = _mm512_permutex2var_epi8(ab0, idx_a, ab1);
+    r.val[1] = _mm512_permutex2var_epi8(ab0, idx_b, ab1);
+#else
+    #ifdef NPY_HAVE_AVX512BW
+        const __m512i idx = npyv_set_u8(
+            0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+            0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+            0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+            0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+        );
+        __m512i abl = _mm512_shuffle_epi8(ab0, idx);
+        __m512i abh = _mm512_shuffle_epi8(ab1, idx);
+    #else
+        const __m256i idx = _mm256_setr_epi8(
+            0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+            0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+        );
+        __m256i abl_lo = _mm256_shuffle_epi8(npyv512_lower_si256(ab0),  idx);
+        __m256i abl_hi = _mm256_shuffle_epi8(npyv512_higher_si256(ab0), idx);
+        __m256i abh_lo = _mm256_shuffle_epi8(npyv512_lower_si256(ab1),  idx);
+        __m256i abh_hi = _mm256_shuffle_epi8(npyv512_higher_si256(ab1), idx);
+        __m512i abl = npyv512_combine_si256(abl_lo, abl_hi);
+        __m512i abh = npyv512_combine_si256(abh_lo, abh_hi);
+    #endif
+    const __m512i idx_a = npyv_set_u64(0, 2, 4, 6, 8, 10, 12, 14);
+    const __m512i idx_b = npyv_set_u64(1, 3, 5, 7, 9, 11, 13, 15);
+    r.val[0] = _mm512_permutex2var_epi64(abl, idx_a, abh);
+    r.val[1] = _mm512_permutex2var_epi64(abl, idx_b, abh);
+#endif
+    return r;
+}
+#define npyv_unzip_s8 npyv_unzip_u8
+
+NPY_FINLINE npyv_u16x2 npyv_unzip_u16(npyv_u16 ab0, npyv_u16 ab1)
+{
+    npyv_u16x2 r;
+#ifdef NPY_HAVE_AVX512BW
+    const __m512i idx_a = npyv_set_u16(
+        0,  2,  4,  6,  8,  10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
+        32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
+    );
+    const __m512i idx_b = npyv_set_u16(
+        1,  3,  5,  7,  9,  11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
+        33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63
+    );
+    r.val[0] = _mm512_permutex2var_epi16(ab0, idx_a, ab1);
+    r.val[1] = _mm512_permutex2var_epi16(ab0, idx_b, ab1);
+#else
+    const __m256i idx = _mm256_setr_epi8(
+        0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15,
+        0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15
+    );
+    __m256i abl_lo = _mm256_shuffle_epi8(npyv512_lower_si256(ab0),  idx);
+    __m256i abl_hi = _mm256_shuffle_epi8(npyv512_higher_si256(ab0), idx);
+    __m256i abh_lo = _mm256_shuffle_epi8(npyv512_lower_si256(ab1),  idx);
+    __m256i abh_hi = _mm256_shuffle_epi8(npyv512_higher_si256(ab1), idx);
+    __m512i abl = npyv512_combine_si256(abl_lo, abl_hi);
+    __m512i abh = npyv512_combine_si256(abh_lo, abh_hi);
+
+    const __m512i idx_a = npyv_set_u64(0, 2, 4, 6, 8, 10, 12, 14);
+    const __m512i idx_b = npyv_set_u64(1, 3, 5, 7, 9, 11, 13, 15);
+    r.val[0] = _mm512_permutex2var_epi64(abl, idx_a, abh);
+    r.val[1] = _mm512_permutex2var_epi64(abl, idx_b, abh);
+#endif
+    return r;
+}
+#define npyv_unzip_s16 npyv_unzip_u16
+
+NPY_FINLINE npyv_u32x2 npyv_unzip_u32(npyv_u32 ab0, npyv_u32 ab1)
+{
+    const __m512i idx_a = npyv_set_u32(
+        0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+    );
+    const __m512i idx_b = npyv_set_u32(
+        1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+    );
+    npyv_u32x2 r;
+    r.val[0] = _mm512_permutex2var_epi32(ab0, idx_a, ab1);
+    r.val[1] = _mm512_permutex2var_epi32(ab0, idx_b, ab1);
+    return r;
+}
+#define npyv_unzip_s32 npyv_unzip_u32
+
+NPY_FINLINE npyv_u64x2 npyv_unzip_u64(npyv_u64 ab0, npyv_u64 ab1)
+{
+    const __m512i idx_a = npyv_set_u64(0, 2, 4, 6, 8, 10, 12, 14);
+    const __m512i idx_b = npyv_set_u64(1, 3, 5, 7, 9, 11, 13, 15);
+    npyv_u64x2 r;
+    r.val[0] = _mm512_permutex2var_epi64(ab0, idx_a, ab1);
+    r.val[1] = _mm512_permutex2var_epi64(ab0, idx_b, ab1);
+    return r;
+}
+#define npyv_unzip_s64 npyv_unzip_u64
+
+NPY_FINLINE npyv_f32x2 npyv_unzip_f32(npyv_f32 ab0, npyv_f32 ab1)
+{
+    const __m512i idx_a = npyv_set_u32(
+        0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+    );
+    const __m512i idx_b = npyv_set_u32(
+        1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+    );
+    npyv_f32x2 r;
+    r.val[0] = _mm512_permutex2var_ps(ab0, idx_a, ab1);
+    r.val[1] = _mm512_permutex2var_ps(ab0, idx_b, ab1);
+    return r;
+}
+NPY_FINLINE npyv_f64x2 npyv_unzip_f64(npyv_f64 ab0, npyv_f64 ab1)
+{
+    const __m512i idx_a = npyv_set_u64(0, 2, 4, 6, 8, 10, 12, 14);
+    const __m512i idx_b = npyv_set_u64(1, 3, 5, 7, 9, 11, 13, 15);
+    npyv_f64x2 r;
+    r.val[0] = _mm512_permutex2var_pd(ab0, idx_a, ab1);
+    r.val[1] = _mm512_permutex2var_pd(ab0, idx_b, ab1);
+    return r;
+}
+
 // Reverse elements of each 64-bit lane
 NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a)
 {
@@ -223,4 +357,22 @@ NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a)
     return _mm512_shuffle_ps(a, a, (_MM_PERM_ENUM)_MM_SHUFFLE(2, 3, 0, 1));
 }
 
+// Permuting the elements of each 128-bit lane by immediate index for
+// each element.
+#define npyv_permi128_u32(A, E0, E1, E2, E3) \
+    _mm512_shuffle_epi32(A, _MM_SHUFFLE(E3, E2, E1, E0))
+
+#define npyv_permi128_s32 npyv_permi128_u32
+
+#define npyv_permi128_u64(A, E0, E1) \
+    _mm512_shuffle_epi32(A, _MM_SHUFFLE(((E1)<<1)+1, ((E1)<<1), ((E0)<<1)+1, ((E0)<<1)))
+
+#define npyv_permi128_s64 npyv_permi128_u64
+
+#define npyv_permi128_f32(A, E0, E1, E2, E3) \
+    _mm512_permute_ps(A, _MM_SHUFFLE(E3, E2, E1, E0))
+
+#define npyv_permi128_f64(A, E0, E1) \
+    _mm512_permute_pd(A, (((E1)<<7) | ((E0)<<6) | ((E1)<<5) | ((E0)<<4) | ((E1)<<3) | ((E0)<<2) | ((E1)<<1) | (E0)))
+
 #endif // _NPY_SIMD_AVX512_REORDER_H
diff --git a/numpy/core/src/common/simd/emulate_maskop.h b/numpy/core/src/common/simd/emulate_maskop.h
index 2a808a153..6627e5010 100644
--- a/numpy/core/src/common/simd/emulate_maskop.h
+++ b/numpy/core/src/common/simd/emulate_maskop.h
@@ -42,5 +42,39 @@ NPYV_IMPL_EMULATE_MASK_ADDSUB(s64, b64)
 #if NPY_SIMD_F64
     NPYV_IMPL_EMULATE_MASK_ADDSUB(f64, b64)
 #endif
+#if NPY_SIMD_F32
+    // conditional division, m ? a / b : c
+    NPY_FINLINE npyv_f32
+    npyv_ifdiv_f32(npyv_b32 m, npyv_f32 a, npyv_f32 b, npyv_f32 c)
+    {
+        const npyv_f32 one = npyv_setall_f32(1.0f);
+        npyv_f32 div = npyv_div_f32(a, npyv_select_f32(m, b, one));
+        return npyv_select_f32(m, div, c);
+    }
+    // conditional division, m ? a / b : 0
+    NPY_FINLINE npyv_f32
+    npyv_ifdivz_f32(npyv_b32 m, npyv_f32 a, npyv_f32 b)
+    {
+        const npyv_f32 zero = npyv_zero_f32();
+        return npyv_ifdiv_f32(m, a, b, zero);
+    }
+#endif
+#if NPY_SIMD_F64
+    // conditional division, m ? a / b : c
+    NPY_FINLINE npyv_f64
+    npyv_ifdiv_f64(npyv_b64 m, npyv_f64 a, npyv_f64 b, npyv_f64 c)
+    {
+        const npyv_f64 one = npyv_setall_f64(1.0);
+        npyv_f64 div = npyv_div_f64(a, npyv_select_f64(m, b, one));
+        return npyv_select_f64(m, div, c);
+    }
+    // conditional division, m ? a / b : 0
+    NPY_FINLINE npyv_f64
+    npyv_ifdivz_f64(npyv_b64 m, npyv_f64 a, npyv_f64 b)
+    {
+        const npyv_f64 zero = npyv_zero_f64();
+        return npyv_ifdiv_f64(m, a, b, zero);
+    }
+#endif
 
 #endif // _NPY_SIMD_EMULATE_MASKOP_H
diff --git a/numpy/core/src/common/simd/neon/arithmetic.h b/numpy/core/src/common/simd/neon/arithmetic.h
index 00994806d..683620111 100644
--- a/numpy/core/src/common/simd/neon/arithmetic.h
+++ b/numpy/core/src/common/simd/neon/arithmetic.h
@@ -265,6 +265,14 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
     NPY_FINLINE npyv_f32 npyv_nmulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
     { return vmlsq_f32(vnegq_f32(c), a, b); }
 #endif
+// multiply, add for odd elements and subtract even elements.
+// (a * b) -+ c
+NPY_FINLINE npyv_f32 npyv_muladdsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+{
+    const npyv_f32 msign = npyv_set_f32(-0.0f, 0.0f, -0.0f, 0.0f);
+    return npyv_muladd_f32(a, b, npyv_xor_f32(msign, c));
+}
+
 /***************************
  * FUSED F64
  ***************************/
@@ -277,6 +285,11 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
     { return vfmsq_f64(c, a, b); }
     NPY_FINLINE npyv_f64 npyv_nmulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
     { return vfmsq_f64(vnegq_f64(c), a, b); }
+    NPY_FINLINE npyv_f64 npyv_muladdsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+    {
+        const npyv_f64 msign = npyv_set_f64(-0.0, 0.0);
+        return npyv_muladd_f64(a, b, npyv_xor_f64(msign, c));
+    }
 #endif // NPY_SIMD_F64
 
 /***************************
diff --git a/numpy/core/src/common/simd/neon/math.h b/numpy/core/src/common/simd/neon/math.h
index c0a771b5d..58d14809f 100644
--- a/numpy/core/src/common/simd/neon/math.h
+++ b/numpy/core/src/common/simd/neon/math.h
@@ -278,20 +278,25 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a)
     return vrndnq_f32(a);
 #else
     // ARMv7 NEON only supports fp to int truncate conversion.
-    // a magic trick of adding 1.5 * 2**23 is used for rounding
+    // a magic trick of adding 1.5 * 2^23 is used for rounding
     // to nearest even and then subtract this magic number to get
     // the integer.
-    const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f));
-    const npyv_f32 magic = vdupq_n_f32(12582912.0f); // 1.5 * 2**23
-    npyv_f32 round = vsubq_f32(vaddq_f32(a, magic), magic);
-    npyv_b32 overflow = vcleq_f32(vabsq_f32(a), vreinterpretq_f32_u32(vdupq_n_u32(0x4b000000)));
-    round = vbslq_f32(overflow, round, a);
-    // signed zero
-    round = vreinterpretq_f32_s32(vorrq_s32(
-         vreinterpretq_s32_f32(round),
-         vandq_s32(vreinterpretq_s32_f32(a), szero)
-    ));
-    return round;
+    //
+    const npyv_u32 szero = vreinterpretq_u32_f32(vdupq_n_f32(-0.0f));
+    const npyv_u32 sign_mask = vandq_u32(vreinterpretq_u32_f32(a), szero);
+    const npyv_f32 two_power_23 = vdupq_n_f32(8388608.0); // 2^23
+    const npyv_f32 two_power_23h = vdupq_n_f32(12582912.0f); // 1.5 * 2^23
+    npyv_u32 nnan_mask = vceqq_f32(a, a);
+    // eliminate nans to avoid invalid fp errors
+    npyv_f32 abs_x = vabsq_f32(vreinterpretq_f32_u32(vandq_u32(nnan_mask, vreinterpretq_u32_f32(a))));
+    // round by add magic number 1.5 * 2^23
+    npyv_f32 round = vsubq_f32(vaddq_f32(two_power_23h, abs_x), two_power_23h);
+    // copysign
+    round = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(round), sign_mask ));
+    // a if |a| >= 2^23 or a == NaN
+    npyv_u32 mask = vcleq_f32(abs_x, two_power_23);
+             mask = vandq_u32(mask, nnan_mask);
+    return vbslq_f32(mask, round, a);
 #endif
 }
 #if NPY_SIMD_F64
@@ -302,33 +307,30 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a)
 #ifdef NPY_HAVE_ASIMD
     #define npyv_ceil_f32 vrndpq_f32
 #else
-   NPY_FINLINE npyv_f32 npyv_ceil_f32(npyv_f32 a)
-   {
-        const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f));
+    NPY_FINLINE npyv_f32 npyv_ceil_f32(npyv_f32 a)
+    {
         const npyv_u32 one = vreinterpretq_u32_f32(vdupq_n_f32(1.0f));
-        const npyv_s32 max_int = vdupq_n_s32(0x7fffffff);
-        /**
-         * On armv7, vcvtq.f32 handles special cases as follows:
-         *  NaN return 0
-         * +inf or +outrange return 0x80000000(-0.0f)
-         * -inf or -outrange return 0x7fffffff(nan)
-         */
-        npyv_s32 roundi = vcvtq_s32_f32(a);
-        npyv_f32 round = vcvtq_f32_s32(roundi);
+        const npyv_u32 szero = vreinterpretq_u32_f32(vdupq_n_f32(-0.0f));
+        const npyv_u32 sign_mask = vandq_u32(vreinterpretq_u32_f32(a), szero);
+        const npyv_f32 two_power_23 = vdupq_n_f32(8388608.0); // 2^23
+        const npyv_f32 two_power_23h = vdupq_n_f32(12582912.0f); // 1.5 * 2^23
+        npyv_u32 nnan_mask = vceqq_f32(a, a);
+        npyv_f32 x = vreinterpretq_f32_u32(vandq_u32(nnan_mask, vreinterpretq_u32_f32(a)));
+        // eliminate nans to avoid invalid fp errors
+        npyv_f32 abs_x = vabsq_f32(x);
+        // round by add magic number 1.5 * 2^23
+        npyv_f32 round = vsubq_f32(vaddq_f32(two_power_23h, abs_x), two_power_23h);
+        // copysign
+        round = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(round), sign_mask));
         npyv_f32 ceil = vaddq_f32(round, vreinterpretq_f32_u32(
-            vandq_u32(vcltq_f32(round, a), one))
-        );
-        // respect signed zero, e.g. -0.5 -> -0.0
-        npyv_f32 rzero = vreinterpretq_f32_s32(vorrq_s32(
-            vreinterpretq_s32_f32(ceil),
-            vandq_s32(vreinterpretq_s32_f32(a), szero)
-        ));
-        // if nan or overflow return a
-        npyv_u32 nnan = npyv_notnan_f32(a);
-        npyv_u32 overflow = vorrq_u32(
-            vceqq_s32(roundi, szero), vceqq_s32(roundi, max_int)
+            vandq_u32(vcltq_f32(round, x), one))
         );
-        return vbslq_f32(vbicq_u32(nnan, overflow), rzero, a);
+        // respects signed zero
+        ceil = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(ceil), sign_mask));
+        // a if |a| >= 2^23 or a == NaN
+        npyv_u32 mask = vcleq_f32(abs_x, two_power_23);
+                 mask = vandq_u32(mask, nnan_mask);
+        return vbslq_f32(mask, ceil, a);
    }
 #endif
 #if NPY_SIMD_F64
@@ -339,29 +341,37 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a)
 #ifdef NPY_HAVE_ASIMD
     #define npyv_trunc_f32 vrndq_f32
 #else
-   NPY_FINLINE npyv_f32 npyv_trunc_f32(npyv_f32 a)
-   {
-        const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f));
+    NPY_FINLINE npyv_f32 npyv_trunc_f32(npyv_f32 a)
+    {
         const npyv_s32 max_int = vdupq_n_s32(0x7fffffff);
+        const npyv_u32 exp_mask = vdupq_n_u32(0xff000000);
+        const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f));
+        const npyv_u32 sign_mask = vandq_u32(
+            vreinterpretq_u32_f32(a), vreinterpretq_u32_s32(szero));
+
+        npyv_u32 nfinite_mask = vshlq_n_u32(vreinterpretq_u32_f32(a), 1);
+                 nfinite_mask = vandq_u32(nfinite_mask, exp_mask);
+                 nfinite_mask = vceqq_u32(nfinite_mask, exp_mask);
+        // eliminate nans/inf to avoid invalid fp errors
+        npyv_f32 x = vreinterpretq_f32_u32(
+            veorq_u32(nfinite_mask, vreinterpretq_u32_f32(a)));
         /**
          * On armv7, vcvtq.f32 handles special cases as follows:
          *  NaN return 0
          * +inf or +outrange return 0x80000000(-0.0f)
          * -inf or -outrange return 0x7fffffff(nan)
          */
-        npyv_s32 roundi = vcvtq_s32_f32(a);
-        npyv_f32 round = vcvtq_f32_s32(roundi);
+        npyv_s32 trunci = vcvtq_s32_f32(x);
+        npyv_f32 trunc = vcvtq_f32_s32(trunci);
         // respect signed zero, e.g. -0.5 -> -0.0
-        npyv_f32 rzero = vreinterpretq_f32_s32(vorrq_s32(
-            vreinterpretq_s32_f32(round),
-            vandq_s32(vreinterpretq_s32_f32(a), szero)
-        ));
-        // if nan or overflow return a
-        npyv_u32 nnan = npyv_notnan_f32(a);
-        npyv_u32 overflow = vorrq_u32(
-            vceqq_s32(roundi, szero), vceqq_s32(roundi, max_int)
+        trunc = vreinterpretq_f32_u32(
+            vorrq_u32(vreinterpretq_u32_f32(trunc), sign_mask));
+        // if overflow return a
+        npyv_u32 overflow_mask = vorrq_u32(
+            vceqq_s32(trunci, szero), vceqq_s32(trunci, max_int)
         );
-        return vbslq_f32(vbicq_u32(nnan, overflow), rzero, a);
+        // a if a overflow or nonfinite
+        return vbslq_f32(vorrq_u32(nfinite_mask, overflow_mask), a, trunc);
    }
 #endif
 #if NPY_SIMD_F64
@@ -372,28 +382,31 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a)
 #ifdef NPY_HAVE_ASIMD
     #define npyv_floor_f32 vrndmq_f32
 #else
-   NPY_FINLINE npyv_f32 npyv_floor_f32(npyv_f32 a)
-   {
-        const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f));
+    NPY_FINLINE npyv_f32 npyv_floor_f32(npyv_f32 a)
+    {
         const npyv_u32 one = vreinterpretq_u32_f32(vdupq_n_f32(1.0f));
-        const npyv_s32 max_int = vdupq_n_s32(0x7fffffff);
+        const npyv_u32 szero = vreinterpretq_u32_f32(vdupq_n_f32(-0.0f));
+        const npyv_u32 sign_mask = vandq_u32(vreinterpretq_u32_f32(a), szero);
+        const npyv_f32 two_power_23 = vdupq_n_f32(8388608.0); // 2^23
+        const npyv_f32 two_power_23h = vdupq_n_f32(12582912.0f); // 1.5 * 2^23
 
-        npyv_s32 roundi = vcvtq_s32_f32(a);
-        npyv_f32 round = vcvtq_f32_s32(roundi);
+        npyv_u32 nnan_mask = vceqq_f32(a, a);
+        npyv_f32 x = vreinterpretq_f32_u32(vandq_u32(nnan_mask, vreinterpretq_u32_f32(a)));
+        // eliminate nans to avoid invalid fp errors
+        npyv_f32 abs_x = vabsq_f32(x);
+        // round by add magic number 1.5 * 2^23
+        npyv_f32 round = vsubq_f32(vaddq_f32(two_power_23h, abs_x), two_power_23h);
+        // copysign
+        round = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(round), sign_mask));
         npyv_f32 floor = vsubq_f32(round, vreinterpretq_f32_u32(
-            vandq_u32(vcgtq_f32(round, a), one)
-        ));
-        // respect signed zero
-        npyv_f32 rzero = vreinterpretq_f32_s32(vorrq_s32(
-            vreinterpretq_s32_f32(floor),
-            vandq_s32(vreinterpretq_s32_f32(a), szero)
+            vandq_u32(vcgtq_f32(round, x), one)
         ));
-        npyv_u32 nnan = npyv_notnan_f32(a);
-        npyv_u32 overflow = vorrq_u32(
-            vceqq_s32(roundi, szero), vceqq_s32(roundi, max_int)
-        );
-
-       return vbslq_f32(vbicq_u32(nnan, overflow), rzero, a);
+        // respects signed zero
+        floor = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(floor), sign_mask));
+        // a if |a| >= 2^23 or a == NaN
+        npyv_u32 mask = vcleq_f32(abs_x, two_power_23);
+                 mask = vandq_u32(mask, nnan_mask);
+        return vbslq_f32(mask, floor, a);
    }
 #endif // NPY_HAVE_ASIMD
 #if NPY_SIMD_F64
diff --git a/numpy/core/src/common/simd/neon/memory.h b/numpy/core/src/common/simd/neon/memory.h
index 7060ea628..6163440c3 100644
--- a/numpy/core/src/common/simd/neon/memory.h
+++ b/numpy/core/src/common/simd/neon/memory.h
@@ -102,6 +102,29 @@ NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride)
     );
 }
 #endif
+
+//// 64-bit load over 32-bit stride
+NPY_FINLINE npyv_u32 npyv_loadn2_u32(const npy_uint32 *ptr, npy_intp stride)
+{
+    return vcombine_u32(
+        vld1_u32((const uint32_t*)ptr), vld1_u32((const uint32_t*)ptr + stride)
+    );
+}
+NPY_FINLINE npyv_s32 npyv_loadn2_s32(const npy_int32 *ptr, npy_intp stride)
+{ return npyv_reinterpret_s32_u32(npyv_loadn2_u32((const npy_uint32*)ptr, stride)); }
+NPY_FINLINE npyv_f32 npyv_loadn2_f32(const float *ptr, npy_intp stride)
+{ return npyv_reinterpret_f32_u32(npyv_loadn2_u32((const npy_uint32*)ptr, stride)); }
+
+//// 128-bit load over 64-bit stride
+NPY_FINLINE npyv_u64 npyv_loadn2_u64(const npy_uint64 *ptr, npy_intp stride)
+{ (void)stride; return npyv_load_u64(ptr); }
+NPY_FINLINE npyv_s64 npyv_loadn2_s64(const npy_int64 *ptr, npy_intp stride)
+{ (void)stride; return npyv_load_s64(ptr); }
+#if NPY_SIMD_F64
+NPY_FINLINE npyv_f64 npyv_loadn2_f64(const double *ptr, npy_intp stride)
+{ (void)stride; return npyv_load_f64(ptr); }
+#endif
+
 /***************************
  * Non-contiguous Store
  ***************************/
@@ -131,6 +154,32 @@ NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a)
 { npyv_storen_s64((npy_int64*)ptr, stride, npyv_reinterpret_s64_f64(a)); }
 #endif
 
+//// 64-bit store over 32-bit stride
+NPY_FINLINE void npyv_storen2_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a)
+{
+#if NPY_SIMD_F64
+    vst1q_lane_u64((uint64_t*)ptr, npyv_reinterpret_u64_u32(a), 0);
+    vst1q_lane_u64((uint64_t*)(ptr + stride), npyv_reinterpret_u64_u32(a), 1);
+#else
+    // armhf strict to alignment
+    vst1_u32((uint32_t*)ptr, vget_low_u32(a));
+    vst1_u32((uint32_t*)ptr + stride, vget_high_u32(a));
+#endif
+}
+NPY_FINLINE void npyv_storen2_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a)
+{ npyv_storen2_u32((npy_uint32*)ptr, stride, npyv_reinterpret_u32_s32(a)); }
+NPY_FINLINE void npyv_storen2_f32(float *ptr, npy_intp stride, npyv_f32 a)
+{ npyv_storen2_u32((npy_uint32*)ptr, stride, npyv_reinterpret_u32_f32(a)); }
+
+//// 128-bit store over 64-bit stride
+NPY_FINLINE void npyv_storen2_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
+{ (void)stride; npyv_store_u64(ptr, a); }
+NPY_FINLINE void npyv_storen2_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
+{ (void)stride; npyv_store_s64(ptr, a); }
+#if NPY_SIMD_F64
+NPY_FINLINE void npyv_storen2_f64(double *ptr, npy_intp stride, npyv_f64 a)
+{ (void)stride; npyv_store_f64(ptr, a); }
+#endif
 /*********************************
  * Partial Load
  *********************************/
@@ -168,6 +217,29 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n
 NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
 { return npyv_load_till_s64(ptr, nlane, 0); }
 
+//// 64-bit nlane
+NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane,
+                                          npy_int32 fill_lo, npy_int32 fill_hi)
+{
+    assert(nlane > 0);
+    if (nlane == 1) {
+        const int32_t NPY_DECL_ALIGNED(16) fill[2] = {fill_lo, fill_hi};
+        return vcombine_s32(vld1_s32((const int32_t*)ptr), vld1_s32(fill));
+    }
+    return npyv_load_s32(ptr);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
+{ return vreinterpretq_s32_s64(npyv_load_tillz_s64((const npy_int64*)ptr, nlane)); }
+
+//// 128-bit nlane
+NPY_FINLINE npyv_s64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane,
+                                           npy_int64 fill_lo, npy_int64 fill_hi)
+{ (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); }
+
+NPY_FINLINE npyv_s64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
+{ (void)nlane; return npyv_load_s64(ptr); }
+
 /*********************************
  * Non-contiguous partial load
  *********************************/
@@ -206,6 +278,34 @@ npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_
 NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
 { return npyv_loadn_till_s64(ptr, stride, nlane, 0); }
 
+//// 64-bit load over 32-bit stride
+NPY_FINLINE npyv_s32 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane,
+                                                 npy_int32 fill_lo, npy_int32 fill_hi)
+{
+    assert(nlane > 0);
+    if (nlane == 1) {
+        const int32_t NPY_DECL_ALIGNED(16) fill[2] = {fill_lo, fill_hi};
+        return vcombine_s32(vld1_s32((const int32_t*)ptr), vld1_s32(fill));
+    }
+    return npyv_loadn2_s32(ptr, stride);
+}
+NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
+{
+    assert(nlane > 0);
+    if (nlane == 1) {
+        return vcombine_s32(vld1_s32((const int32_t*)ptr), vdup_n_s32(0));
+    }
+    return npyv_loadn2_s32(ptr, stride);
+}
+
+//// 128-bit load over 64-bit stride
+NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane,
+                                          npy_int64 fill_lo, npy_int64 fill_hi)
+{ assert(nlane > 0); (void)stride; (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); }
+
+NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
+{ assert(nlane > 0); (void)stride; (void)nlane; return npyv_load_s64(ptr); }
+
 /*********************************
  * Partial store
  *********************************/
@@ -238,6 +338,30 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a
     }
     npyv_store_s64(ptr, a);
 }
+
+//// 64-bit nlane
+NPY_FINLINE void npyv_store2_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a)
+{
+    assert(nlane > 0);
+    if (nlane == 1) {
+        // armhf strict to alignment, may cause bus error
+    #if NPY_SIMD_F64
+        vst1q_lane_s64((int64_t*)ptr, npyv_reinterpret_s64_s32(a), 0);
+    #else
+        npyv_storel_s32(ptr, a);
+    #endif
+        return;
+    }
+    npyv_store_s32(ptr, a);
+}
+
+//// 128-bit nlane
+NPY_FINLINE void npyv_store2_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
+{
+    assert(nlane > 0); (void)nlane;
+    npyv_store_s64(ptr, a);
+}
+
 /*********************************
  * Non-contiguous partial store
  *********************************/
@@ -245,16 +369,21 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a
 NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
 {
     assert(nlane > 0);
+    vst1q_lane_s32((int32_t*)ptr, a, 0);
     switch(nlane) {
-    default:
-        vst1q_lane_s32((int32_t*)ptr + stride*3, a, 3);
+    case 1:
+        return;
+    case 2:
+        vst1q_lane_s32((int32_t*)ptr + stride, a, 1);
+        return;
     case 3:
+        vst1q_lane_s32((int32_t*)ptr + stride, a, 1);
         vst1q_lane_s32((int32_t*)ptr + stride*2, a, 2);
-    case 2:
+        return;
+    default:
         vst1q_lane_s32((int32_t*)ptr + stride, a, 1);
-    case 1:
-        vst1q_lane_s32((int32_t*)ptr, a, 0);
-        break;
+        vst1q_lane_s32((int32_t*)ptr + stride*2, a, 2);
+        vst1q_lane_s32((int32_t*)ptr + stride*3, a, 3);
     }
 }
 //// 64
@@ -268,6 +397,27 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
     npyv_storen_s64(ptr, stride, a);
 }
 
+//// 64-bit store over 32-bit stride
+NPY_FINLINE void npyv_storen2_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
+{
+    assert(nlane > 0);
+#if NPY_SIMD_F64
+    vst1q_lane_s64((int64_t*)ptr, npyv_reinterpret_s64_s32(a), 0);
+    if (nlane > 1) {
+        vst1q_lane_s64((int64_t*)(ptr + stride), npyv_reinterpret_s64_s32(a), 1);
+    }
+#else
+    npyv_storel_s32(ptr, a);
+    if (nlane > 1) {
+        npyv_storeh_s32(ptr + stride, a);
+    }
+#endif
+}
+
+//// 128-bit store over 64-bit stride
+NPY_FINLINE void npyv_storen2_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a)
+{ assert(nlane > 0); (void)stride; (void)nlane; npyv_store_s64(ptr, a); }
+
 /*****************************************************************
  * Implement partial load/store for u32/f32/u64/f64... via casting
  *****************************************************************/
@@ -278,7 +428,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
         union {                                                                             \
             npyv_lanetype_##F_SFX from_##F_SFX;                                             \
             npyv_lanetype_##T_SFX to_##T_SFX;                                               \
-        } pun = {.from_##F_SFX = fill};                                                     \
+        } pun;                                                                              \
+        pun.from_##F_SFX = fill;                                                            \
         return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX(                   \
             (const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX                       \
         ));                                                                                 \
@@ -290,7 +441,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
         union {                                                                             \
             npyv_lanetype_##F_SFX from_##F_SFX;                                             \
             npyv_lanetype_##T_SFX to_##T_SFX;                                               \
-        } pun = {.from_##F_SFX = fill};                                                     \
+        } pun;                                                                              \
+        pun.from_##F_SFX = fill;                                                            \
         return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX(                  \
             (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX               \
         ));                                                                                 \
@@ -332,6 +484,131 @@ NPYV_IMPL_NEON_REST_PARTIAL_TYPES(u64, s64)
 #if NPY_SIMD_F64
 NPYV_IMPL_NEON_REST_PARTIAL_TYPES(f64, s64)
 #endif
+
+// 128-bit/64-bit stride
+#define NPYV_IMPL_NEON_REST_PARTIAL_TYPES_PAIR(F_SFX, T_SFX)                                \
+    NPY_FINLINE npyv_##F_SFX npyv_load2_till_##F_SFX                                        \
+    (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane,                                     \
+     npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi)                          \
+    {                                                                                       \
+        union pun {                                                                         \
+            npyv_lanetype_##F_SFX from_##F_SFX;                                             \
+            npyv_lanetype_##T_SFX to_##T_SFX;                                               \
+        };                                                                                  \
+        union pun pun_lo;                                                                   \
+        union pun pun_hi;                                                                   \
+        pun_lo.from_##F_SFX = fill_lo;                                                      \
+        pun_hi.from_##F_SFX = fill_hi;                                                      \
+        return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_till_##T_SFX(                  \
+            (const npyv_lanetype_##T_SFX *)ptr, nlane, pun_lo.to_##T_SFX, pun_hi.to_##T_SFX \
+        ));                                                                                 \
+    }                                                                                       \
+    NPY_FINLINE npyv_##F_SFX npyv_loadn2_till_##F_SFX                                       \
+    (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane,                    \
+     npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi)                          \
+    {                                                                                       \
+        union pun {                                                                         \
+            npyv_lanetype_##F_SFX from_##F_SFX;                                             \
+            npyv_lanetype_##T_SFX to_##T_SFX;                                               \
+        };                                                                                  \
+        union pun pun_lo;                                                                   \
+        union pun pun_hi;                                                                   \
+        pun_lo.from_##F_SFX = fill_lo;                                                      \
+        pun_hi.from_##F_SFX = fill_hi;                                                      \
+        return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_till_##T_SFX(                 \
+            (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun_lo.to_##T_SFX,           \
+            pun_hi.to_##T_SFX                                                               \
+        ));                                                                                 \
+    }                                                                                       \
+    NPY_FINLINE npyv_##F_SFX npyv_load2_tillz_##F_SFX                                       \
+    (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane)                                     \
+    {                                                                                       \
+        return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_tillz_##T_SFX(                 \
+            (const npyv_lanetype_##T_SFX *)ptr, nlane                                       \
+        ));                                                                                 \
+    }                                                                                       \
+    NPY_FINLINE npyv_##F_SFX npyv_loadn2_tillz_##F_SFX                                      \
+    (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane)                    \
+    {                                                                                       \
+        return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_tillz_##T_SFX(                \
+            (const npyv_lanetype_##T_SFX *)ptr, stride, nlane                               \
+        ));                                                                                 \
+    }                                                                                       \
+    NPY_FINLINE void npyv_store2_till_##F_SFX                                               \
+    (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a)                           \
+    {                                                                                       \
+        npyv_store2_till_##T_SFX(                                                           \
+            (npyv_lanetype_##T_SFX *)ptr, nlane,                                            \
+            npyv_reinterpret_##T_SFX##_##F_SFX(a)                                           \
+        );                                                                                  \
+    }                                                                                       \
+    NPY_FINLINE void npyv_storen2_till_##F_SFX                                              \
+    (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a)          \
+    {                                                                                       \
+        npyv_storen2_till_##T_SFX(                                                          \
+            (npyv_lanetype_##T_SFX *)ptr, stride, nlane,                                    \
+            npyv_reinterpret_##T_SFX##_##F_SFX(a)                                           \
+        );                                                                                  \
+    }
+
+NPYV_IMPL_NEON_REST_PARTIAL_TYPES_PAIR(u32, s32)
+NPYV_IMPL_NEON_REST_PARTIAL_TYPES_PAIR(f32, s32)
+NPYV_IMPL_NEON_REST_PARTIAL_TYPES_PAIR(u64, s64)
+#if NPY_SIMD_F64
+NPYV_IMPL_NEON_REST_PARTIAL_TYPES_PAIR(f64, s64)
+#endif
+
+/************************************************************
+ *  de-interlave load / interleave contiguous store
+ ************************************************************/
+// two channels
+#define NPYV_IMPL_NEON_MEM_INTERLEAVE(SFX, T_PTR)                        \
+    NPY_FINLINE npyv_##SFX##x2 npyv_load_##SFX##x2(                      \
+        const npyv_lanetype_##SFX *ptr                                   \
+    ) {                                                                  \
+        return vld2q_##SFX((const T_PTR*)ptr);                           \
+    }                                                                    \
+    NPY_FINLINE void npyv_store_##SFX##x2(                               \
+        npyv_lanetype_##SFX *ptr, npyv_##SFX##x2 v                       \
+    ) {                                                                  \
+        vst2q_##SFX((T_PTR*)ptr, v);                                     \
+    }
+
+NPYV_IMPL_NEON_MEM_INTERLEAVE(u8,  uint8_t)
+NPYV_IMPL_NEON_MEM_INTERLEAVE(s8,  int8_t)
+NPYV_IMPL_NEON_MEM_INTERLEAVE(u16, uint16_t)
+NPYV_IMPL_NEON_MEM_INTERLEAVE(s16, int16_t)
+NPYV_IMPL_NEON_MEM_INTERLEAVE(u32, uint32_t)
+NPYV_IMPL_NEON_MEM_INTERLEAVE(s32, int32_t)
+NPYV_IMPL_NEON_MEM_INTERLEAVE(f32, float)
+
+#if NPY_SIMD_F64
+    NPYV_IMPL_NEON_MEM_INTERLEAVE(f64, double)
+    NPYV_IMPL_NEON_MEM_INTERLEAVE(u64, uint64_t)
+    NPYV_IMPL_NEON_MEM_INTERLEAVE(s64, int64_t)
+#else
+    #define NPYV_IMPL_NEON_MEM_INTERLEAVE_64(SFX)                               \
+        NPY_FINLINE npyv_##SFX##x2 npyv_load_##SFX##x2(                         \
+            const npyv_lanetype_##SFX *ptr)                                     \
+        {                                                                       \
+            npyv_##SFX a = npyv_load_##SFX(ptr);                                \
+            npyv_##SFX b = npyv_load_##SFX(ptr + 2);                            \
+            npyv_##SFX##x2 r;                                                   \
+            r.val[0] = vcombine_##SFX(vget_low_##SFX(a),  vget_low_##SFX(b));   \
+            r.val[1] = vcombine_##SFX(vget_high_##SFX(a), vget_high_##SFX(b));  \
+            return r;                                                           \
+        }                                                                       \
+        NPY_FINLINE void npyv_store_##SFX##x2(                                  \
+            npyv_lanetype_##SFX *ptr, npyv_##SFX##x2 v)                         \
+        {                                                                       \
+            npyv_store_##SFX(ptr, vcombine_##SFX(                               \
+                vget_low_##SFX(v.val[0]),  vget_low_##SFX(v.val[1])));          \
+            npyv_store_##SFX(ptr + 2, vcombine_##SFX(                           \
+                vget_high_##SFX(v.val[0]),  vget_high_##SFX(v.val[1])));        \
+        }
+        NPYV_IMPL_NEON_MEM_INTERLEAVE_64(u64)
+        NPYV_IMPL_NEON_MEM_INTERLEAVE_64(s64)
+#endif
 /*********************************
  * Lookup table
  *********************************/
diff --git a/numpy/core/src/common/simd/neon/neon.h b/numpy/core/src/common/simd/neon/neon.h
index b08071527..49c35c415 100644
--- a/numpy/core/src/common/simd/neon/neon.h
+++ b/numpy/core/src/common/simd/neon/neon.h
@@ -16,6 +16,7 @@
     #define NPY_SIMD_FMA3 0  // HW emulated
 #endif
 #define NPY_SIMD_BIGENDIAN 0
+#define NPY_SIMD_CMPSIGNAL 1
 
 typedef uint8x16_t  npyv_u8;
 typedef int8x16_t   npyv_s8;
diff --git a/numpy/core/src/common/simd/neon/operators.h b/numpy/core/src/common/simd/neon/operators.h
index 249621bd6..e18ea94b8 100644
--- a/numpy/core/src/common/simd/neon/operators.h
+++ b/numpy/core/src/common/simd/neon/operators.h
@@ -240,10 +240,35 @@
 
 // check special cases
 NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a)
-{ return vceqq_f32(a, a); }
+{
+#if defined(__clang__)
+/**
+ * To avoid signaling qNaN, workaround for clang symmetric inputs bug
+ * check https://github.com/numpy/numpy/issues/22933,
+ * for more clarification.
+ */
+    npyv_b32 ret;
+    #if NPY_SIMD_F64
+        __asm("fcmeq %0.4s, %1.4s, %1.4s" : "=w" (ret) : "w" (a));
+    #else
+        __asm("vceq.f32 %q0, %q1, %q1" : "=w" (ret) : "w" (a));
+    #endif
+    return ret;
+#else
+    return vceqq_f32(a, a);
+#endif
+}
 #if NPY_SIMD_F64
     NPY_FINLINE npyv_b64 npyv_notnan_f64(npyv_f64 a)
-    { return vceqq_f64(a, a); }
+    {
+    #if defined(__clang__)
+        npyv_b64 ret;
+        __asm("fcmeq %0.2d, %1.2d, %1.2d" : "=w" (ret) : "w" (a));
+        return ret;
+    #else
+        return vceqq_f64(a, a);
+    #endif
+    }
 #endif
 
 // Test cross all vector lanes
diff --git a/numpy/core/src/common/simd/neon/reorder.h b/numpy/core/src/common/simd/neon/reorder.h
index 50b06ed11..8bf68f5be 100644
--- a/numpy/core/src/common/simd/neon/reorder.h
+++ b/numpy/core/src/common/simd/neon/reorder.h
@@ -76,36 +76,45 @@ NPYV_IMPL_NEON_COMBINE(npyv_f32, f32)
 NPYV_IMPL_NEON_COMBINE(npyv_f64, f64)
 #endif
 
-// interleave two vectors
-#define NPYV_IMPL_NEON_ZIP(T_VEC, SFX)                       \
-    NPY_FINLINE T_VEC##x2 npyv_zip_##SFX(T_VEC a, T_VEC b)   \
-    {                                                        \
-        T_VEC##x2 r;                                         \
-        r.val[0] = vzip1q_##SFX(a, b);                       \
-        r.val[1] = vzip2q_##SFX(a, b);                       \
-        return r;                                            \
-    }
-
+// interleave & deinterleave two vectors
 #ifdef __aarch64__
-    NPYV_IMPL_NEON_ZIP(npyv_u8,  u8)
-    NPYV_IMPL_NEON_ZIP(npyv_s8,  s8)
-    NPYV_IMPL_NEON_ZIP(npyv_u16, u16)
-    NPYV_IMPL_NEON_ZIP(npyv_s16, s16)
-    NPYV_IMPL_NEON_ZIP(npyv_u32, u32)
-    NPYV_IMPL_NEON_ZIP(npyv_s32, s32)
-    NPYV_IMPL_NEON_ZIP(npyv_f32, f32)
-    NPYV_IMPL_NEON_ZIP(npyv_f64, f64)
+    #define NPYV_IMPL_NEON_ZIP(T_VEC, SFX)                       \
+        NPY_FINLINE T_VEC##x2 npyv_zip_##SFX(T_VEC a, T_VEC b)   \
+        {                                                        \
+            T_VEC##x2 r;                                         \
+            r.val[0] = vzip1q_##SFX(a, b);                       \
+            r.val[1] = vzip2q_##SFX(a, b);                       \
+            return r;                                            \
+        }                                                        \
+        NPY_FINLINE T_VEC##x2 npyv_unzip_##SFX(T_VEC a, T_VEC b) \
+        {                                                        \
+            T_VEC##x2 r;                                         \
+            r.val[0] = vuzp1q_##SFX(a, b);                       \
+            r.val[1] = vuzp2q_##SFX(a, b);                       \
+            return r;                                            \
+        }
 #else
-    #define npyv_zip_u8  vzipq_u8
-    #define npyv_zip_s8  vzipq_s8
-    #define npyv_zip_u16 vzipq_u16
-    #define npyv_zip_s16 vzipq_s16
-    #define npyv_zip_u32 vzipq_u32
-    #define npyv_zip_s32 vzipq_s32
-    #define npyv_zip_f32 vzipq_f32
+    #define NPYV_IMPL_NEON_ZIP(T_VEC, SFX)                       \
+        NPY_FINLINE T_VEC##x2 npyv_zip_##SFX(T_VEC a, T_VEC b)   \
+        { return vzipq_##SFX(a, b); }                            \
+        NPY_FINLINE T_VEC##x2 npyv_unzip_##SFX(T_VEC a, T_VEC b) \
+        { return vuzpq_##SFX(a, b); }
 #endif
+
+NPYV_IMPL_NEON_ZIP(npyv_u8,  u8)
+NPYV_IMPL_NEON_ZIP(npyv_s8,  s8)
+NPYV_IMPL_NEON_ZIP(npyv_u16, u16)
+NPYV_IMPL_NEON_ZIP(npyv_s16, s16)
+NPYV_IMPL_NEON_ZIP(npyv_u32, u32)
+NPYV_IMPL_NEON_ZIP(npyv_s32, s32)
+NPYV_IMPL_NEON_ZIP(npyv_f32, f32)
+
 #define npyv_zip_u64 npyv_combine_u64
 #define npyv_zip_s64 npyv_combine_s64
+#define npyv_zip_f64 npyv_combine_f64
+#define npyv_unzip_u64 npyv_combine_u64
+#define npyv_unzip_s64 npyv_combine_s64
+#define npyv_unzip_f64 npyv_combine_f64
 
 // Reverse elements of each 64-bit lane
 #define npyv_rev64_u8  vrev64q_u8
@@ -116,4 +125,65 @@ NPYV_IMPL_NEON_COMBINE(npyv_f64, f64)
 #define npyv_rev64_s32 vrev64q_s32
 #define npyv_rev64_f32 vrev64q_f32
 
+// Permuting the elements of each 128-bit lane by immediate index for
+// each element.
+#ifdef __clang__
+    #define npyv_permi128_u32(A, E0, E1, E2, E3) \
+        __builtin_shufflevector(A, A, E0, E1, E2, E3)
+#elif defined(__GNUC__)
+    #define npyv_permi128_u32(A, E0, E1, E2, E3) \
+        __builtin_shuffle(A, npyv_set_u32(E0, E1, E2, E3))
+#else
+    #define npyv_permi128_u32(A, E0, E1, E2, E3)          \
+        npyv_set_u32(                                     \
+            vgetq_lane_u32(A, E0), vgetq_lane_u32(A, E1), \
+            vgetq_lane_u32(A, E2), vgetq_lane_u32(A, E3)  \
+        )
+    #define npyv_permi128_s32(A, E0, E1, E2, E3)          \
+        npyv_set_s32(                                     \
+            vgetq_lane_s32(A, E0), vgetq_lane_s32(A, E1), \
+            vgetq_lane_s32(A, E2), vgetq_lane_s32(A, E3)  \
+        )
+    #define npyv_permi128_f32(A, E0, E1, E2, E3)          \
+        npyv_set_f32(                                     \
+            vgetq_lane_f32(A, E0), vgetq_lane_f32(A, E1), \
+            vgetq_lane_f32(A, E2), vgetq_lane_f32(A, E3)  \
+        )
+#endif
+
+#if defined(__clang__) || defined(__GNUC__)
+    #define npyv_permi128_s32 npyv_permi128_u32
+    #define npyv_permi128_f32 npyv_permi128_u32
+#endif
+
+#ifdef __clang__
+    #define npyv_permi128_u64(A, E0, E1) \
+        __builtin_shufflevector(A, A, E0, E1)
+#elif defined(__GNUC__)
+    #define npyv_permi128_u64(A, E0, E1) \
+        __builtin_shuffle(A, npyv_set_u64(E0, E1))
+#else
+    #define npyv_permi128_u64(A, E0, E1)                  \
+        npyv_set_u64(                                     \
+            vgetq_lane_u64(A, E0), vgetq_lane_u64(A, E1)  \
+        )
+    #define npyv_permi128_s64(A, E0, E1)                  \
+        npyv_set_s64(                                     \
+            vgetq_lane_s64(A, E0), vgetq_lane_s64(A, E1)  \
+        )
+    #define npyv_permi128_f64(A, E0, E1)                  \
+        npyv_set_f64(                                     \
+            vgetq_lane_f64(A, E0), vgetq_lane_f64(A, E1)  \
+        )
+#endif
+
+#if defined(__clang__) || defined(__GNUC__)
+    #define npyv_permi128_s64 npyv_permi128_u64
+    #define npyv_permi128_f64 npyv_permi128_u64
+#endif
+
+#if !NPY_SIMD_F64
+    #undef npyv_permi128_f64
+#endif
+
 #endif // _NPY_SIMD_NEON_REORDER_H
diff --git a/numpy/core/src/common/simd/simd.h b/numpy/core/src/common/simd/simd.h
index 92a77ad80..8c9b14251 100644
--- a/numpy/core/src/common/simd/simd.h
+++ b/numpy/core/src/common/simd/simd.h
@@ -82,6 +82,9 @@ typedef double     npyv_lanetype_f64;
     #define NPY_SIMD_FMA3 0
     /// 1 if the enabled SIMD extension is running on big-endian mode otherwise 0.
     #define NPY_SIMD_BIGENDIAN 0
+    /// 1 if the supported comparison intrinsics(lt, le, gt, ge)
+    /// raises FP invalid exception for quite NaNs.
+    #define NPY_SIMD_CMPSIGNAL 0
 #endif
 
 // enable emulated mask operations for all SIMD extension except for AVX512
diff --git a/numpy/core/src/common/simd/sse/arithmetic.h b/numpy/core/src/common/simd/sse/arithmetic.h
index bced35108..72a87eac1 100644
--- a/numpy/core/src/common/simd/sse/arithmetic.h
+++ b/numpy/core/src/common/simd/sse/arithmetic.h
@@ -282,6 +282,10 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
     // negate multiply and subtract, -(a*b) - c
     #define npyv_nmulsub_f32 _mm_fnmsub_ps
     #define npyv_nmulsub_f64 _mm_fnmsub_pd
+    // multiply, add for odd elements and subtract even elements.
+    // (a * b) -+ c
+    #define npyv_muladdsub_f32 _mm_fmaddsub_ps
+    #define npyv_muladdsub_f64 _mm_fmaddsub_pd
 #elif defined(NPY_HAVE_FMA4)
     // multiply and add, a*b + c
     #define npyv_muladd_f32 _mm_macc_ps
@@ -292,6 +296,10 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
     // negate multiply and add, -(a*b) + c
     #define npyv_nmuladd_f32 _mm_nmacc_ps
     #define npyv_nmuladd_f64 _mm_nmacc_pd
+    // multiply, add for odd elements and subtract even elements.
+    // (a * b) -+ c
+    #define npyv_muladdsub_f32 _mm_maddsub_ps
+    #define npyv_muladdsub_f64 _mm_maddsub_pd
 #else
     // multiply and add, a*b + c
     NPY_FINLINE npyv_f32 npyv_muladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
@@ -308,6 +316,28 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
     { return npyv_sub_f32(c, npyv_mul_f32(a, b)); }
     NPY_FINLINE npyv_f64 npyv_nmuladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
     { return npyv_sub_f64(c, npyv_mul_f64(a, b)); }
+    // multiply, add for odd elements and subtract even elements.
+    // (a * b) -+ c
+    NPY_FINLINE npyv_f32 npyv_muladdsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+    {
+        npyv_f32 m = npyv_mul_f32(a, b);
+    #if NPY_HAVE_SSE3
+        return _mm_addsub_ps(m, c);
+    #else
+        const npyv_f32 msign = npyv_set_f32(-0.0f, 0.0f, -0.0f, 0.0f);
+        return npyv_add_f32(m, npyv_xor_f32(msign, c));
+    #endif
+    }
+    NPY_FINLINE npyv_f64 npyv_muladdsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+    {
+        npyv_f64 m = npyv_mul_f64(a, b);
+    #if NPY_HAVE_SSE3
+        return _mm_addsub_pd(m, c);
+    #else
+        const npyv_f64 msign = npyv_set_f64(-0.0, 0.0);
+        return npyv_add_f64(m, npyv_xor_f64(msign, c));
+    #endif
+    }
 #endif // NPY_HAVE_FMA3
 #ifndef NPY_HAVE_FMA3 // for FMA4 and NON-FMA3
     // negate multiply and subtract, -(a*b) - c
diff --git a/numpy/core/src/common/simd/sse/math.h b/numpy/core/src/common/simd/sse/math.h
index b7f8e6ebb..b51c935af 100644
--- a/numpy/core/src/common/simd/sse/math.h
+++ b/numpy/core/src/common/simd/sse/math.h
@@ -269,13 +269,23 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a)
 #ifdef NPY_HAVE_SSE41
     return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT);
 #else
-    const npyv_f32 szero = _mm_set1_ps(-0.0f);
-    __m128i roundi = _mm_cvtps_epi32(a);
-    __m128i overflow = _mm_cmpeq_epi32(roundi, _mm_castps_si128(szero));
-    __m128 r = _mm_cvtepi32_ps(roundi);
-    // respect sign of zero
-    r = _mm_or_ps(r, _mm_and_ps(a, szero));
-    return npyv_select_f32(overflow, a, r);
+    const __m128 szero = _mm_set1_ps(-0.0f);
+    const __m128i exp_mask = _mm_set1_epi32(0xff000000);
+
+    __m128i nfinite_mask = _mm_slli_epi32(_mm_castps_si128(a), 1);
+            nfinite_mask = _mm_and_si128(nfinite_mask, exp_mask);
+            nfinite_mask = _mm_cmpeq_epi32(nfinite_mask, exp_mask);
+
+    // eliminate nans/inf to avoid invalid fp errors
+    __m128 x = _mm_xor_ps(a, _mm_castsi128_ps(nfinite_mask));
+    __m128i roundi = _mm_cvtps_epi32(x);
+    __m128 round = _mm_cvtepi32_ps(roundi);
+    // respect signed zero
+    round = _mm_or_ps(round, _mm_and_ps(a, szero));
+    // if overflow return a
+    __m128i overflow_mask = _mm_cmpeq_epi32(roundi, _mm_castps_si128(szero));
+    // a if a overflow or nonfinite
+    return npyv_select_f32(_mm_or_si128(nfinite_mask, overflow_mask), a, round);
 #endif
 }
 
@@ -285,16 +295,22 @@ NPY_FINLINE npyv_f64 npyv_rint_f64(npyv_f64 a)
 #ifdef NPY_HAVE_SSE41
     return _mm_round_pd(a, _MM_FROUND_TO_NEAREST_INT);
 #else
-    const npyv_f64 szero = _mm_set1_pd(-0.0);
-    const npyv_f64 two_power_52 = _mm_set1_pd(0x10000000000000);
-    npyv_f64 sign_two52 = _mm_or_pd(two_power_52, _mm_and_pd(a, szero));
+    const __m128d szero = _mm_set1_pd(-0.0);
+    const __m128d two_power_52 = _mm_set1_pd(0x10000000000000);
+    __m128d nan_mask = _mm_cmpunord_pd(a, a);
+    // eliminate nans to avoid invalid fp errors within cmpge
+    __m128d abs_x = npyv_abs_f64(_mm_xor_pd(nan_mask, a));
     // round by add magic number 2^52
-    npyv_f64 round = _mm_sub_pd(_mm_add_pd(a, sign_two52), sign_two52);
-    // respect signed zero, e.g. -0.5 -> -0.0
-    return _mm_or_pd(round, _mm_and_pd(a, szero));
+    // assuming that MXCSR register is set to rounding
+    __m128d round = _mm_sub_pd(_mm_add_pd(two_power_52, abs_x), two_power_52);
+    // copysign
+    round = _mm_or_pd(round, _mm_and_pd(a, szero));
+    // a if |a| >= 2^52 or a == NaN
+    __m128d mask = _mm_cmpge_pd(abs_x, two_power_52);
+            mask = _mm_or_pd(mask, nan_mask);
+    return npyv_select_f64(_mm_castpd_si128(mask), a, round);
 #endif
 }
-
 // ceil
 #ifdef NPY_HAVE_SSE41
     #define npyv_ceil_f32 _mm_ceil_ps
@@ -302,27 +318,48 @@ NPY_FINLINE npyv_f64 npyv_rint_f64(npyv_f64 a)
 #else
     NPY_FINLINE npyv_f32 npyv_ceil_f32(npyv_f32 a)
     {
-        const npyv_f32 szero = _mm_set1_ps(-0.0f);
-        const npyv_f32 one = _mm_set1_ps(1.0f);
-        npyv_s32 roundi = _mm_cvttps_epi32(a);
-        npyv_f32 round = _mm_cvtepi32_ps(roundi);
-        npyv_f32 ceil = _mm_add_ps(round, _mm_and_ps(_mm_cmplt_ps(round, a), one));
-        // respect signed zero, e.g. -0.5 -> -0.0
-        npyv_f32 rzero = _mm_or_ps(ceil, _mm_and_ps(a, szero));
+        const __m128 one = _mm_set1_ps(1.0f);
+        const __m128 szero = _mm_set1_ps(-0.0f);
+        const __m128i exp_mask = _mm_set1_epi32(0xff000000);
+
+        __m128i nfinite_mask = _mm_slli_epi32(_mm_castps_si128(a), 1);
+                nfinite_mask = _mm_and_si128(nfinite_mask, exp_mask);
+                nfinite_mask = _mm_cmpeq_epi32(nfinite_mask, exp_mask);
+
+        // eliminate nans/inf to avoid invalid fp errors
+        __m128 x = _mm_xor_ps(a, _mm_castsi128_ps(nfinite_mask));
+        __m128i roundi = _mm_cvtps_epi32(x);
+        __m128 round = _mm_cvtepi32_ps(roundi);
+        __m128 ceil = _mm_add_ps(round, _mm_and_ps(_mm_cmplt_ps(round, x), one));
+        // respect signed zero
+        ceil = _mm_or_ps(ceil, _mm_and_ps(a, szero));
         // if overflow return a
-        return npyv_select_f32(_mm_cmpeq_epi32(roundi, _mm_castps_si128(szero)), a, rzero);
+        __m128i overflow_mask = _mm_cmpeq_epi32(roundi, _mm_castps_si128(szero));
+        // a if a overflow or nonfinite
+        return npyv_select_f32(_mm_or_si128(nfinite_mask, overflow_mask), a, ceil);
     }
     NPY_FINLINE npyv_f64 npyv_ceil_f64(npyv_f64 a)
     {
-        const npyv_f64 szero = _mm_set1_pd(-0.0);
-        const npyv_f64 one = _mm_set1_pd(1.0);
-        const npyv_f64 two_power_52 = _mm_set1_pd(0x10000000000000);
-        npyv_f64 sign_two52 = _mm_or_pd(two_power_52, _mm_and_pd(a, szero));
+        const __m128d one = _mm_set1_pd(1.0);
+        const __m128d szero = _mm_set1_pd(-0.0);
+        const __m128d two_power_52 = _mm_set1_pd(0x10000000000000);
+        __m128d nan_mask = _mm_cmpunord_pd(a, a);
+        // eliminate nans to avoid invalid fp errors within cmpge
+        __m128d x = _mm_xor_pd(nan_mask, a);
+        __m128d abs_x = npyv_abs_f64(x);
+        __m128d sign_x = _mm_and_pd(x, szero);
         // round by add magic number 2^52
-        npyv_f64 round = _mm_sub_pd(_mm_add_pd(a, sign_two52), sign_two52);
-        npyv_f64 ceil = _mm_add_pd(round, _mm_and_pd(_mm_cmplt_pd(round, a), one));
-        // respect signed zero, e.g. -0.5 -> -0.0
-        return _mm_or_pd(ceil, _mm_and_pd(a, szero));
+        // assuming that MXCSR register is set to rounding
+        __m128d round = _mm_sub_pd(_mm_add_pd(two_power_52, abs_x), two_power_52);
+        // copysign
+        round = _mm_or_pd(round, sign_x);
+        __m128d ceil = _mm_add_pd(round, _mm_and_pd(_mm_cmplt_pd(round, x), one));
+        // respects sign of 0.0
+        ceil = _mm_or_pd(ceil, sign_x);
+        // a if |a| >= 2^52 or a == NaN
+        __m128d mask = _mm_cmpge_pd(abs_x, two_power_52);
+                mask = _mm_or_pd(mask, nan_mask);
+        return npyv_select_f64(_mm_castpd_si128(mask), a, ceil);
     }
 #endif
 
@@ -333,24 +370,43 @@ NPY_FINLINE npyv_f64 npyv_rint_f64(npyv_f64 a)
 #else
     NPY_FINLINE npyv_f32 npyv_trunc_f32(npyv_f32 a)
     {
-        const npyv_f32 szero = _mm_set1_ps(-0.0f);
-        npyv_s32 roundi = _mm_cvttps_epi32(a);
-        npyv_f32 trunc = _mm_cvtepi32_ps(roundi);
+        const __m128 szero = _mm_set1_ps(-0.0f);
+        const __m128i exp_mask = _mm_set1_epi32(0xff000000);
+
+        __m128i nfinite_mask = _mm_slli_epi32(_mm_castps_si128(a), 1);
+                nfinite_mask = _mm_and_si128(nfinite_mask, exp_mask);
+                nfinite_mask = _mm_cmpeq_epi32(nfinite_mask, exp_mask);
+
+        // eliminate nans/inf to avoid invalid fp errors
+        __m128 x = _mm_xor_ps(a, _mm_castsi128_ps(nfinite_mask));
+        __m128i trunci = _mm_cvttps_epi32(x);
+        __m128 trunc = _mm_cvtepi32_ps(trunci);
         // respect signed zero, e.g. -0.5 -> -0.0
-        npyv_f32 rzero = _mm_or_ps(trunc, _mm_and_ps(a, szero));
+        trunc = _mm_or_ps(trunc, _mm_and_ps(a, szero));
         // if overflow return a
-        return npyv_select_f32(_mm_cmpeq_epi32(roundi, _mm_castps_si128(szero)), a, rzero);
+        __m128i overflow_mask = _mm_cmpeq_epi32(trunci, _mm_castps_si128(szero));
+        // a if a overflow or nonfinite
+        return npyv_select_f32(_mm_or_si128(nfinite_mask, overflow_mask), a, trunc);
     }
     NPY_FINLINE npyv_f64 npyv_trunc_f64(npyv_f64 a)
     {
-        const npyv_f64 szero = _mm_set1_pd(-0.0);
-        const npyv_f64 one = _mm_set1_pd(1.0);
-        const npyv_f64 two_power_52 = _mm_set1_pd(0x10000000000000);
-        npyv_f64 abs_a = npyv_abs_f64(a);
+        const __m128d one = _mm_set1_pd(1.0);
+        const __m128d szero = _mm_set1_pd(-0.0);
+        const __m128d two_power_52 = _mm_set1_pd(0x10000000000000);
+        __m128d nan_mask = _mm_cmpunord_pd(a, a);
+        // eliminate nans to avoid invalid fp errors within cmpge
+        __m128d abs_x = npyv_abs_f64(_mm_xor_pd(nan_mask, a));
         // round by add magic number 2^52
-        npyv_f64 abs_round = _mm_sub_pd(_mm_add_pd(abs_a, two_power_52), two_power_52);
-        npyv_f64 subtrahend = _mm_and_pd(_mm_cmpgt_pd(abs_round, abs_a), one);
-        return _mm_or_pd(_mm_sub_pd(abs_round, subtrahend), _mm_and_pd(a, szero));
+        // assuming that MXCSR register is set to rounding
+        __m128d abs_round = _mm_sub_pd(_mm_add_pd(two_power_52, abs_x), two_power_52);
+        __m128d subtrahend = _mm_and_pd(_mm_cmpgt_pd(abs_round, abs_x), one);
+        __m128d trunc = _mm_sub_pd(abs_round, subtrahend);
+        // copysign
+        trunc = _mm_or_pd(trunc, _mm_and_pd(a, szero));
+        // a if |a| >= 2^52 or a == NaN
+        __m128d mask = _mm_cmpge_pd(abs_x, two_power_52);
+               mask = _mm_or_pd(mask, nan_mask);
+        return npyv_select_f64(_mm_castpd_si128(mask), a, trunc);
     }
 #endif
 
@@ -361,15 +417,46 @@ NPY_FINLINE npyv_f64 npyv_rint_f64(npyv_f64 a)
 #else
     NPY_FINLINE npyv_f32 npyv_floor_f32(npyv_f32 a)
     {
-        const npyv_f32 one = _mm_set1_ps(1.0f);
-        npyv_f32 round = npyv_rint_f32(a);
-        return _mm_sub_ps(round, _mm_and_ps(_mm_cmpgt_ps(round, a), one));
+        const __m128 one = _mm_set1_ps(1.0f);
+        const __m128 szero = _mm_set1_ps(-0.0f);
+        const __m128i exp_mask = _mm_set1_epi32(0xff000000);
+
+        __m128i nfinite_mask = _mm_slli_epi32(_mm_castps_si128(a), 1);
+                nfinite_mask = _mm_and_si128(nfinite_mask, exp_mask);
+                nfinite_mask = _mm_cmpeq_epi32(nfinite_mask, exp_mask);
+
+        // eliminate nans/inf to avoid invalid fp errors
+        __m128 x = _mm_xor_ps(a, _mm_castsi128_ps(nfinite_mask));
+        __m128i roundi = _mm_cvtps_epi32(x);
+        __m128 round = _mm_cvtepi32_ps(roundi);
+        __m128 floor = _mm_sub_ps(round, _mm_and_ps(_mm_cmpgt_ps(round, x), one));
+        // respect signed zero
+        floor = _mm_or_ps(floor, _mm_and_ps(a, szero));
+        // if overflow return a
+        __m128i overflow_mask = _mm_cmpeq_epi32(roundi, _mm_castps_si128(szero));
+        // a if a overflow or nonfinite
+        return npyv_select_f32(_mm_or_si128(nfinite_mask, overflow_mask), a, floor);
     }
     NPY_FINLINE npyv_f64 npyv_floor_f64(npyv_f64 a)
     {
-        const npyv_f64 one = _mm_set1_pd(1.0);
-        npyv_f64 round = npyv_rint_f64(a);
-        return _mm_sub_pd(round, _mm_and_pd(_mm_cmpgt_pd(round, a), one));
+        const __m128d one = _mm_set1_pd(1.0f);
+        const __m128d szero = _mm_set1_pd(-0.0f);
+        const __m128d two_power_52 = _mm_set1_pd(0x10000000000000);
+        __m128d nan_mask = _mm_cmpunord_pd(a, a);
+        // eliminate nans to avoid invalid fp errors within cmpge
+        __m128d x = _mm_xor_pd(nan_mask, a);
+        __m128d abs_x = npyv_abs_f64(x);
+        __m128d sign_x = _mm_and_pd(x, szero);
+        // round by add magic number 2^52
+        // assuming that MXCSR register is set to rounding
+        __m128d round = _mm_sub_pd(_mm_add_pd(two_power_52, abs_x), two_power_52);
+        // copysign
+        round = _mm_or_pd(round, sign_x);
+        __m128d floor = _mm_sub_pd(round, _mm_and_pd(_mm_cmpgt_pd(round, x), one));
+        // a if |a| >= 2^52 or a == NaN
+        __m128d mask = _mm_cmpge_pd(abs_x, two_power_52);
+               mask = _mm_or_pd(mask, nan_mask);
+        return npyv_select_f64(_mm_castpd_si128(mask), a, floor);
     }
 #endif // NPY_HAVE_SSE41
 
diff --git a/numpy/core/src/common/simd/sse/memory.h b/numpy/core/src/common/simd/sse/memory.h
index 3ff64848d..4c8e86a6f 100644
--- a/numpy/core/src/common/simd/sse/memory.h
+++ b/numpy/core/src/common/simd/sse/memory.h
@@ -103,6 +103,28 @@ NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
 { return _mm_castpd_si128(npyv_loadn_f64((const double*)ptr, stride)); }
 NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
 { return _mm_castpd_si128(npyv_loadn_f64((const double*)ptr, stride)); }
+
+//// 64-bit load over 32-bit stride
+NPY_FINLINE npyv_f32 npyv_loadn2_f32(const float *ptr, npy_intp stride)
+{
+    __m128d r = _mm_loadh_pd(
+        npyv_loadl_f64((const double*)ptr), (const double*)(ptr + stride)
+    );
+    return _mm_castpd_ps(r);
+}
+NPY_FINLINE npyv_u32 npyv_loadn2_u32(const npy_uint32 *ptr, npy_intp stride)
+{ return _mm_castps_si128(npyv_loadn2_f32((const float*)ptr, stride)); }
+NPY_FINLINE npyv_s32 npyv_loadn2_s32(const npy_int32 *ptr, npy_intp stride)
+{ return _mm_castps_si128(npyv_loadn2_f32((const float*)ptr, stride)); }
+
+//// 128-bit load over 64-bit stride
+NPY_FINLINE npyv_f64 npyv_loadn2_f64(const double *ptr, npy_intp stride)
+{ (void)stride; return npyv_load_f64(ptr); }
+NPY_FINLINE npyv_u64 npyv_loadn2_u64(const npy_uint64 *ptr, npy_intp stride)
+{ (void)stride; return npyv_load_u64(ptr); }
+NPY_FINLINE npyv_s64 npyv_loadn2_s64(const npy_int64 *ptr, npy_intp stride)
+{ (void)stride; return npyv_load_s64(ptr); }
+
 /***************************
  * Non-contiguous Store
  ***************************/
@@ -135,6 +157,24 @@ NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
 NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
 { npyv_storen_f64((double*)ptr, stride, _mm_castsi128_pd(a)); }
 
+//// 64-bit store over 32-bit stride
+NPY_FINLINE void npyv_storen2_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a)
+{
+    _mm_storel_pd((double*)ptr, _mm_castsi128_pd(a));
+    _mm_storeh_pd((double*)(ptr + stride), _mm_castsi128_pd(a));
+}
+NPY_FINLINE void npyv_storen2_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a)
+{ npyv_storen2_u32((npy_uint32*)ptr, stride, a); }
+NPY_FINLINE void npyv_storen2_f32(float *ptr, npy_intp stride, npyv_f32 a)
+{ npyv_storen2_u32((npy_uint32*)ptr, stride, _mm_castps_si128(a)); }
+
+//// 128-bit store over 64-bit stride
+NPY_FINLINE void npyv_storen2_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
+{ (void)stride; npyv_store_u64(ptr, a); }
+NPY_FINLINE void npyv_storen2_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
+{ (void)stride; npyv_store_s64(ptr, a); }
+NPY_FINLINE void npyv_storen2_f64(double *ptr, npy_intp stride, npyv_f64 a)
+{ (void)stride; npyv_store_f64(ptr, a); }
 /*********************************
  * Partial Load
  *********************************/
@@ -204,13 +244,14 @@ NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
         return _mm_cvtsi32_si128(*ptr);
     case 2:
         return _mm_loadl_epi64((const __m128i*)ptr);
-    case 3:;
-        npyv_s32 a = _mm_loadl_epi64((const __m128i*)ptr);
-    #ifdef NPY_HAVE_SSE41
-        return _mm_insert_epi32(a, ptr[2], 2);
-    #else
-        return _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[2]));
-    #endif
+    case 3: {
+            npyv_s32 a = _mm_loadl_epi64((const __m128i*)ptr);
+        #ifdef NPY_HAVE_SSE41
+            return _mm_insert_epi32(a, ptr[2], 2);
+        #else
+            return _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[2]));
+        #endif
+        }
     default:
         return npyv_load_s32(ptr);
     }
@@ -246,6 +287,32 @@ NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
     }
     return npyv_load_s64(ptr);
 }
+
+//// 64-bit nlane
+NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane,
+                                          npy_int32 fill_lo, npy_int32 fill_hi)
+{
+    assert(nlane > 0);
+    if (nlane == 1) {
+        const __m128i vfill = npyv_set_s32(fill_lo, fill_hi, fill_lo, fill_hi);
+        return _mm_castpd_si128(
+            _mm_loadl_pd(_mm_castsi128_pd(vfill), (double*)ptr)
+        );
+    }
+    return npyv_load_s32(ptr);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
+{ return npyv_load_tillz_s64((const npy_int64*)ptr, nlane); }
+
+//// 128-bit nlane
+NPY_FINLINE npyv_s64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane,
+                                           npy_int64 fill_lo, npy_int64 fill_hi)
+{ (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); }
+
+NPY_FINLINE npyv_s64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
+{ (void)nlane; return npyv_load_s64(ptr); }
+
 /*********************************
  * Non-contiguous partial load
  *********************************/
@@ -305,23 +372,27 @@ npyv_loadn_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
     case 1:
         return _mm_cvtsi32_si128(ptr[0]);
     case 2:;
-        npyv_s32 a = _mm_cvtsi32_si128(ptr[0]);
-#ifdef NPY_HAVE_SSE41
-        return _mm_insert_epi32(a, ptr[stride], 1);
-#else
-        return _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride]));
-#endif // NPY_HAVE_SSE41
-    case 3:;
-        a = _mm_cvtsi32_si128(ptr[0]);
-#ifdef NPY_HAVE_SSE41
-        a = _mm_insert_epi32(a, ptr[stride], 1);
-        a = _mm_insert_epi32(a, ptr[stride*2], 2);
-        return a;
-#else
-        a = _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride]));
-        a = _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[stride*2]));
-        return a;
-#endif // NPY_HAVE_SSE41
+        {
+            npyv_s32 a = _mm_cvtsi32_si128(ptr[0]);
+    #ifdef NPY_HAVE_SSE41
+            return _mm_insert_epi32(a, ptr[stride], 1);
+    #else
+            return _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride]));
+    #endif // NPY_HAVE_SSE41
+        }
+    case 3:
+        {
+            npyv_s32 a = _mm_cvtsi32_si128(ptr[0]);
+    #ifdef NPY_HAVE_SSE41
+            a = _mm_insert_epi32(a, ptr[stride], 1);
+            a = _mm_insert_epi32(a, ptr[stride*2], 2);
+            return a;
+    #else
+            a = _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride]));
+            a = _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[stride*2]));
+            return a;
+    #endif // NPY_HAVE_SSE41
+        }
     default:
         return npyv_loadn_s32(ptr, stride);
     }
@@ -358,6 +429,37 @@ NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride,
     }
     return npyv_loadn_s64(ptr, stride);
 }
+
+//// 64-bit load over 32-bit stride
+NPY_FINLINE npyv_s32 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane,
+                                                 npy_int32 fill_lo, npy_int32 fill_hi)
+{
+    assert(nlane > 0);
+    if (nlane == 1) {
+        const __m128i vfill = npyv_set_s32(0, 0, fill_lo, fill_hi);
+        return _mm_castpd_si128(
+            _mm_loadl_pd(_mm_castsi128_pd(vfill), (double*)ptr)
+        );
+    }
+    return npyv_loadn2_s32(ptr, stride);
+}
+NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
+{
+    assert(nlane > 0);
+    if (nlane == 1) {
+        return _mm_loadl_epi64((const __m128i*)ptr);
+    }
+    return npyv_loadn2_s32(ptr, stride);
+}
+
+//// 128-bit load over 64-bit stride
+NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane,
+                                                  npy_int64 fill_lo, npy_int64 fill_hi)
+{ assert(nlane > 0); (void)stride; (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); }
+
+NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
+{ assert(nlane > 0); (void)stride; (void)nlane; return npyv_load_s64(ptr); }
+
 /*********************************
  * Partial store
  *********************************/
@@ -394,6 +496,17 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a
     }
     npyv_store_s64(ptr, a);
 }
+//// 64-bit nlane
+NPY_FINLINE void npyv_store2_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a)
+{ npyv_store_till_s64((npy_int64*)ptr, nlane, a); }
+
+//// 128-bit nlane
+NPY_FINLINE void npyv_store2_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
+{
+    assert(nlane > 0); (void)nlane;
+    npyv_store_s64(ptr, a);
+}
+
 /*********************************
  * Non-contiguous partial store
  *********************************/
@@ -401,25 +514,35 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a
 NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
 {
     assert(nlane > 0);
+    ptr[stride*0] = _mm_cvtsi128_si32(a);
     switch(nlane) {
+    case 1:
+        return;
 #ifdef NPY_HAVE_SSE41
-    default:
-        ptr[stride*3] = _mm_extract_epi32(a, 3);
+    case 2:
+        ptr[stride*1] = _mm_extract_epi32(a, 1);
+        return;
     case 3:
+        ptr[stride*1] = _mm_extract_epi32(a, 1);
         ptr[stride*2] = _mm_extract_epi32(a, 2);
-    case 2:
+        return;
+    default:
         ptr[stride*1] = _mm_extract_epi32(a, 1);
+        ptr[stride*2] = _mm_extract_epi32(a, 2);
+        ptr[stride*3] = _mm_extract_epi32(a, 3);
 #else
-    default:
-        ptr[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 3)));
+    case 2:
+        ptr[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 1)));
+        return;
     case 3:
+        ptr[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 1)));
         ptr[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 2)));
-    case 2:
+        return;
+    default:
         ptr[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 1)));
+        ptr[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 2)));
+        ptr[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 3)));
 #endif
-    case 1:
-        ptr[stride*0] = _mm_cvtsi128_si32(a);
-        break;
     }
 }
 //// 64
@@ -432,6 +555,21 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
     }
     npyv_storen_s64(ptr, stride, a);
 }
+
+//// 64-bit store over 32-bit stride
+NPY_FINLINE void npyv_storen2_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
+{
+    assert(nlane > 0);
+    npyv_storel_s32(ptr, a);
+    if (nlane > 1) {
+        npyv_storeh_s32(ptr + stride, a);
+    }
+}
+
+//// 128-bit store over 64-bit stride
+NPY_FINLINE void npyv_storen2_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a)
+{ assert(nlane > 0); (void)stride; (void)nlane; npyv_store_s64(ptr, a); }
+
 /*****************************************************************
  * Implement partial load/store for u32/f32/u64/f64... via casting
  *****************************************************************/
@@ -442,7 +580,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
         union {                                                                             \
             npyv_lanetype_##F_SFX from_##F_SFX;                                             \
             npyv_lanetype_##T_SFX to_##T_SFX;                                               \
-        } pun = {.from_##F_SFX = fill};                                                     \
+        } pun;                                                                              \
+        pun.from_##F_SFX = fill;                                                            \
         return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX(                   \
             (const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX                       \
         ));                                                                                 \
@@ -454,7 +593,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
         union {                                                                             \
             npyv_lanetype_##F_SFX from_##F_SFX;                                             \
             npyv_lanetype_##T_SFX to_##T_SFX;                                               \
-        } pun = {.from_##F_SFX = fill};                                                     \
+        } pun;                                                                              \
+        pun.from_##F_SFX = fill;                                                            \
         return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX(                  \
             (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX               \
         ));                                                                                 \
@@ -495,6 +635,110 @@ NPYV_IMPL_SSE_REST_PARTIAL_TYPES(f32, s32)
 NPYV_IMPL_SSE_REST_PARTIAL_TYPES(u64, s64)
 NPYV_IMPL_SSE_REST_PARTIAL_TYPES(f64, s64)
 
+// 128-bit/64-bit stride
+#define NPYV_IMPL_SSE_REST_PARTIAL_TYPES_PAIR(F_SFX, T_SFX)                                 \
+    NPY_FINLINE npyv_##F_SFX npyv_load2_till_##F_SFX                                        \
+    (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane,                                     \
+     npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi)                          \
+    {                                                                                       \
+        union pun {                                                                         \
+            npyv_lanetype_##F_SFX from_##F_SFX;                                             \
+            npyv_lanetype_##T_SFX to_##T_SFX;                                               \
+        };                                                                                  \
+        union pun pun_lo;                                                                   \
+        union pun pun_hi;                                                                   \
+        pun_lo.from_##F_SFX = fill_lo;                                                      \
+        pun_hi.from_##F_SFX = fill_hi;                                                      \
+        return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_till_##T_SFX(                  \
+            (const npyv_lanetype_##T_SFX *)ptr, nlane, pun_lo.to_##T_SFX, pun_hi.to_##T_SFX \
+        ));                                                                                 \
+    }                                                                                       \
+    NPY_FINLINE npyv_##F_SFX npyv_loadn2_till_##F_SFX                                       \
+    (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane,                    \
+     npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi)                          \
+    {                                                                                       \
+        union pun {                                                                         \
+            npyv_lanetype_##F_SFX from_##F_SFX;                                             \
+            npyv_lanetype_##T_SFX to_##T_SFX;                                               \
+        };                                                                                  \
+        union pun pun_lo;                                                                   \
+        union pun pun_hi;                                                                   \
+        pun_lo.from_##F_SFX = fill_lo;                                                      \
+        pun_hi.from_##F_SFX = fill_hi;                                                      \
+        return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_till_##T_SFX(                 \
+            (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun_lo.to_##T_SFX,           \
+            pun_hi.to_##T_SFX                                                               \
+        ));                                                                                 \
+    }                                                                                       \
+    NPY_FINLINE npyv_##F_SFX npyv_load2_tillz_##F_SFX                                       \
+    (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane)                                     \
+    {                                                                                       \
+        return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_tillz_##T_SFX(                 \
+            (const npyv_lanetype_##T_SFX *)ptr, nlane                                       \
+        ));                                                                                 \
+    }                                                                                       \
+    NPY_FINLINE npyv_##F_SFX npyv_loadn2_tillz_##F_SFX                                      \
+    (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane)                    \
+    {                                                                                       \
+        return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_tillz_##T_SFX(                \
+            (const npyv_lanetype_##T_SFX *)ptr, stride, nlane                               \
+        ));                                                                                 \
+    }                                                                                       \
+    NPY_FINLINE void npyv_store2_till_##F_SFX                                               \
+    (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a)                           \
+    {                                                                                       \
+        npyv_store2_till_##T_SFX(                                                           \
+            (npyv_lanetype_##T_SFX *)ptr, nlane,                                            \
+            npyv_reinterpret_##T_SFX##_##F_SFX(a)                                           \
+        );                                                                                  \
+    }                                                                                       \
+    NPY_FINLINE void npyv_storen2_till_##F_SFX                                              \
+    (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a)          \
+    {                                                                                       \
+        npyv_storen2_till_##T_SFX(                                                          \
+            (npyv_lanetype_##T_SFX *)ptr, stride, nlane,                                    \
+            npyv_reinterpret_##T_SFX##_##F_SFX(a)                                           \
+        );                                                                                  \
+    }
+
+NPYV_IMPL_SSE_REST_PARTIAL_TYPES_PAIR(u32, s32)
+NPYV_IMPL_SSE_REST_PARTIAL_TYPES_PAIR(f32, s32)
+NPYV_IMPL_SSE_REST_PARTIAL_TYPES_PAIR(u64, s64)
+NPYV_IMPL_SSE_REST_PARTIAL_TYPES_PAIR(f64, s64)
+
+/************************************************************
+ *  de-interlave load / interleave contiguous store
+ ************************************************************/
+// two channels
+#define NPYV_IMPL_SSE_MEM_INTERLEAVE(SFX, ZSFX)                              \
+    NPY_FINLINE npyv_##ZSFX##x2 npyv_zip_##ZSFX(npyv_##ZSFX, npyv_##ZSFX);   \
+    NPY_FINLINE npyv_##ZSFX##x2 npyv_unzip_##ZSFX(npyv_##ZSFX, npyv_##ZSFX); \
+    NPY_FINLINE npyv_##SFX##x2 npyv_load_##SFX##x2(                          \
+        const npyv_lanetype_##SFX *ptr                                       \
+    ) {                                                                      \
+        return npyv_unzip_##ZSFX(                                            \
+            npyv_load_##SFX(ptr), npyv_load_##SFX(ptr+npyv_nlanes_##SFX)     \
+        );                                                                   \
+    }                                                                        \
+    NPY_FINLINE void npyv_store_##SFX##x2(                                   \
+        npyv_lanetype_##SFX *ptr, npyv_##SFX##x2 v                           \
+    ) {                                                                      \
+        npyv_##SFX##x2 zip = npyv_zip_##ZSFX(v.val[0], v.val[1]);            \
+        npyv_store_##SFX(ptr, zip.val[0]);                                   \
+        npyv_store_##SFX(ptr + npyv_nlanes_##SFX, zip.val[1]);               \
+    }
+
+NPYV_IMPL_SSE_MEM_INTERLEAVE(u8, u8)
+NPYV_IMPL_SSE_MEM_INTERLEAVE(s8, u8)
+NPYV_IMPL_SSE_MEM_INTERLEAVE(u16, u16)
+NPYV_IMPL_SSE_MEM_INTERLEAVE(s16, u16)
+NPYV_IMPL_SSE_MEM_INTERLEAVE(u32, u32)
+NPYV_IMPL_SSE_MEM_INTERLEAVE(s32, u32)
+NPYV_IMPL_SSE_MEM_INTERLEAVE(u64, u64)
+NPYV_IMPL_SSE_MEM_INTERLEAVE(s64, u64)
+NPYV_IMPL_SSE_MEM_INTERLEAVE(f32, f32)
+NPYV_IMPL_SSE_MEM_INTERLEAVE(f64, f64)
+
 /*********************************
  * Lookup table
  *********************************/
diff --git a/numpy/core/src/common/simd/sse/reorder.h b/numpy/core/src/common/simd/sse/reorder.h
index d96ab9c56..9a57f6489 100644
--- a/numpy/core/src/common/simd/sse/reorder.h
+++ b/numpy/core/src/common/simd/sse/reorder.h
@@ -81,6 +81,75 @@ NPYV_IMPL_SSE_ZIP(npyv_s64, s64, epi64)
 NPYV_IMPL_SSE_ZIP(npyv_f32, f32, ps)
 NPYV_IMPL_SSE_ZIP(npyv_f64, f64, pd)
 
+// deinterleave two vectors
+NPY_FINLINE npyv_u8x2 npyv_unzip_u8(npyv_u8 ab0, npyv_u8 ab1)
+{
+#ifdef NPY_HAVE_SSSE3
+    const __m128i idx = _mm_setr_epi8(
+        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+    );
+    __m128i abl = _mm_shuffle_epi8(ab0, idx);
+    __m128i abh = _mm_shuffle_epi8(ab1, idx);
+    return npyv_combine_u8(abl, abh);
+#else
+    __m128i ab_083b = _mm_unpacklo_epi8(ab0, ab1);
+    __m128i ab_4c6e = _mm_unpackhi_epi8(ab0, ab1);
+    __m128i ab_048c = _mm_unpacklo_epi8(ab_083b, ab_4c6e);
+    __m128i ab_36be = _mm_unpackhi_epi8(ab_083b, ab_4c6e);
+    __m128i ab_0346 = _mm_unpacklo_epi8(ab_048c, ab_36be);
+    __m128i ab_8bc8 = _mm_unpackhi_epi8(ab_048c, ab_36be);
+    npyv_u8x2 r;
+    r.val[0] = _mm_unpacklo_epi8(ab_0346, ab_8bc8);
+    r.val[1] = _mm_unpackhi_epi8(ab_0346, ab_8bc8);
+    return r;
+#endif
+}
+#define npyv_unzip_s8 npyv_unzip_u8
+
+NPY_FINLINE npyv_u16x2 npyv_unzip_u16(npyv_u16 ab0, npyv_u16 ab1)
+{
+#ifdef NPY_HAVE_SSSE3
+    const __m128i idx = _mm_setr_epi8(
+        0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15
+    );
+    __m128i abl = _mm_shuffle_epi8(ab0, idx);
+    __m128i abh = _mm_shuffle_epi8(ab1, idx);
+    return npyv_combine_u16(abl, abh);
+#else
+    __m128i ab_0415 = _mm_unpacklo_epi16(ab0, ab1);
+    __m128i ab_263f = _mm_unpackhi_epi16(ab0, ab1);
+    __m128i ab_0246 = _mm_unpacklo_epi16(ab_0415, ab_263f);
+    __m128i ab_135f = _mm_unpackhi_epi16(ab_0415, ab_263f);
+    npyv_u16x2 r;
+    r.val[0] = _mm_unpacklo_epi16(ab_0246, ab_135f);
+    r.val[1] = _mm_unpackhi_epi16(ab_0246, ab_135f);
+    return r;
+#endif
+}
+#define npyv_unzip_s16 npyv_unzip_u16
+
+NPY_FINLINE npyv_u32x2 npyv_unzip_u32(npyv_u32 ab0, npyv_u32 ab1)
+{
+    __m128i abl = _mm_shuffle_epi32(ab0, _MM_SHUFFLE(3, 1, 2, 0));
+    __m128i abh = _mm_shuffle_epi32(ab1, _MM_SHUFFLE(3, 1, 2, 0));
+    return npyv_combine_u32(abl, abh);
+}
+#define npyv_unzip_s32 npyv_unzip_u32
+
+NPY_FINLINE npyv_u64x2 npyv_unzip_u64(npyv_u64 ab0, npyv_u64 ab1)
+{ return npyv_combine_u64(ab0, ab1); }
+#define npyv_unzip_s64 npyv_unzip_u64
+
+NPY_FINLINE npyv_f32x2 npyv_unzip_f32(npyv_f32 ab0, npyv_f32 ab1)
+{
+    npyv_f32x2 r;
+    r.val[0] = _mm_shuffle_ps(ab0, ab1, _MM_SHUFFLE(2, 0, 2, 0));
+    r.val[1] = _mm_shuffle_ps(ab0, ab1, _MM_SHUFFLE(3, 1, 3, 1));
+    return r;
+}
+NPY_FINLINE npyv_f64x2 npyv_unzip_f64(npyv_f64 ab0, npyv_f64 ab1)
+{ return npyv_combine_f64(ab0, ab1); }
+
 // Reverse elements of each 64-bit lane
 NPY_FINLINE npyv_u16 npyv_rev64_u16(npyv_u16 a)
 {
@@ -122,4 +191,22 @@ NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a)
     return _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1));
 }
 
+// Permuting the elements of each 128-bit lane by immediate index for
+// each element.
+#define npyv_permi128_u32(A, E0, E1, E2, E3) \
+    _mm_shuffle_epi32(A, _MM_SHUFFLE(E3, E2, E1, E0))
+
+#define npyv_permi128_s32 npyv_permi128_u32
+
+#define npyv_permi128_u64(A, E0, E1) \
+    _mm_shuffle_epi32(A, _MM_SHUFFLE(((E1)<<1)+1, ((E1)<<1), ((E0)<<1)+1, ((E0)<<1)))
+
+#define npyv_permi128_s64 npyv_permi128_u64
+
+#define npyv_permi128_f32(A, E0, E1, E2, E3) \
+    _mm_shuffle_ps(A, A, _MM_SHUFFLE(E3, E2, E1, E0))
+
+#define npyv_permi128_f64(A, E0, E1) \
+    _mm_shuffle_pd(A, A, _MM_SHUFFLE2(E1, E0))
+
 #endif // _NPY_SIMD_SSE_REORDER_H
diff --git a/numpy/core/src/common/simd/sse/sse.h b/numpy/core/src/common/simd/sse/sse.h
index c21bbfda7..0c6b8cdba 100644
--- a/numpy/core/src/common/simd/sse/sse.h
+++ b/numpy/core/src/common/simd/sse/sse.h
@@ -12,6 +12,7 @@
     #define NPY_SIMD_FMA3 0  // fast emulated
 #endif
 #define NPY_SIMD_BIGENDIAN 0
+#define NPY_SIMD_CMPSIGNAL 1
 
 typedef __m128i npyv_u8;
 typedef __m128i npyv_s8;
diff --git a/numpy/core/src/common/simd/vec/arithmetic.h b/numpy/core/src/common/simd/vec/arithmetic.h
index a2e9d07eb..85f4d6b26 100644
--- a/numpy/core/src/common/simd/vec/arithmetic.h
+++ b/numpy/core/src/common/simd/vec/arithmetic.h
@@ -322,6 +322,20 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
     NPY_FINLINE npyv_f64 npyv_nmulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
     { return vec_neg(vec_madd(a, b, c)); }
 #endif
+// multiply, add for odd elements and subtract even elements.
+// (a * b) -+ c
+#if NPY_SIMD_F32
+NPY_FINLINE npyv_f32 npyv_muladdsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+{
+    const npyv_f32 msign = npyv_set_f32(-0.0f, 0.0f, -0.0f, 0.0f);
+    return npyv_muladd_f32(a, b, npyv_xor_f32(msign, c));
+}
+#endif
+NPY_FINLINE npyv_f64 npyv_muladdsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+{
+    const npyv_f64 msign = npyv_set_f64(-0.0, 0.0);
+    return npyv_muladd_f64(a, b, npyv_xor_f64(msign, c));
+}
 /***************************
  * Summation
  ***************************/
@@ -352,6 +366,7 @@ NPY_FINLINE float npyv_sum_f32(npyv_f32 a)
 {
     npyv_f32 sum = vec_add(a, npyv_combineh_f32(a, a));
     return vec_extract(sum, 0) + vec_extract(sum, 1);
+    (void)sum;
 }
 #endif
 
@@ -372,6 +387,7 @@ NPY_FINLINE npy_uint16 npyv_sumup_u8(npyv_u8 a)
     npyv_u32 four = vec_sum4s(a, zero);
     npyv_s32 one  = vec_sums((npyv_s32)four, (npyv_s32)zero);
     return (npy_uint16)vec_extract(one, 3);
+    (void)one;
 #endif
 }
 
@@ -386,6 +402,7 @@ NPY_FINLINE npy_uint32 npyv_sumup_u16(npyv_u16 a)
     npyv_u32   four  = vec_add(eight.val[0], eight.val[1]);
     npyv_s32   one   = vec_sums((npyv_s32)four, zero);
     return (npy_uint32)vec_extract(one, 3);
+    (void)one;
 #endif
 }
 
diff --git a/numpy/core/src/common/simd/vec/conversion.h b/numpy/core/src/common/simd/vec/conversion.h
index f0d625c55..922109f7b 100644
--- a/numpy/core/src/common/simd/vec/conversion.h
+++ b/numpy/core/src/common/simd/vec/conversion.h
@@ -96,6 +96,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
     #else
         return vec_extract(r, 4);
     #endif
+        // to suppress ambiguous warning: variable `r` but not used [-Wunused-but-set-variable]
+	(void)r;
     }
     NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a)
     {
@@ -106,6 +108,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
     #else
         return vec_extract(r, 8);
     #endif
+	// to suppress ambiguous warning: variable `r` but not used [-Wunused-but-set-variable]
+        (void)r;
     }
     NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
     {
@@ -120,6 +124,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
     #else
         return vec_extract(r, 8);
     #endif
+	// to suppress ambiguous warning: variable `r` but not used [-Wunused-but-set-variable]
+        (void)r;
     }
     NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
     {
@@ -134,6 +140,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
     #else
         return vec_extract(r, 8);
     #endif
+	// to suppress ambiguous warning: variable `r` but not used [-Wunused-but-set-variable]
+        (void)r;
     }
 #else
     NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
@@ -170,7 +178,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
     #ifdef NPY_HAVE_VXE2
         return vec_signed(a);
     #elif defined(NPY_HAVE_VXE)
-        return vec_packs(vec_signed(npyv_doublee(a)), vec_signed(npyv_doublee(vec_mergel(a, a))));
+        return vec_packs(vec_signed(npyv_doublee(vec_mergeh(a,a))),
+            vec_signed(npyv_doublee(vec_mergel(a, a))));
     // VSX
     #elif defined(__IBMC__)
         return vec_cts(a, 0);
diff --git a/numpy/core/src/common/simd/vec/math.h b/numpy/core/src/common/simd/vec/math.h
index 95b16fdf7..85690f76c 100644
--- a/numpy/core/src/common/simd/vec/math.h
+++ b/numpy/core/src/common/simd/vec/math.h
@@ -186,6 +186,7 @@ NPY_IMPL_VEC_REDUCE_MINMAX(max, int32, s32)
     {                                                                   \
         npyv_##SFX r = vec_##INTRIN(a, vec_sld(a, a, 8));               \
         return (npy_##STYPE)vec_extract(r, 0);                          \
+    	(void)r;					 	                                \
     }
 NPY_IMPL_VEC_REDUCE_MINMAX(min, uint64, u64)
 NPY_IMPL_VEC_REDUCE_MINMAX(max, uint64, u64)
@@ -225,6 +226,7 @@ NPY_IMPL_VEC_REDUCE_MINMAX(max, int64, s64)
     {                                                             \
         npyv_f64 r = vec_##INTRIN(a, vec_sld(a, a, 8));           \
         return vec_extract(r, 0);                                 \
+        (void)r;                                                  \
     }                                                             \
     NPY_FINLINE double npyv_reduce_##INTRIN##n_f64(npyv_f64 a)    \
     {                                                             \
diff --git a/numpy/core/src/common/simd/vec/memory.h b/numpy/core/src/common/simd/vec/memory.h
index e8f588ef2..1ad39cead 100644
--- a/numpy/core/src/common/simd/vec/memory.h
+++ b/numpy/core/src/common/simd/vec/memory.h
@@ -46,14 +46,8 @@
 #endif
 
 // avoid aliasing rules
-#ifdef __cplusplus
-    template<typename T_PTR>
-    NPY_FINLINE npy_uint64 *npyv__ptr2u64(const T_PTR *ptr)
-    { npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr64; }
-#else
-    NPY_FINLINE npy_uint64 *npyv__ptr2u64(const void *ptr)
-    { npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr64; }
-#endif // __cplusplus
+NPY_FINLINE npy_uint64 *npyv__ptr2u64(const void *ptr)
+{ npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr64; }
 
 // load lower part
 NPY_FINLINE npyv_u64 npyv__loadl(const void *ptr)
@@ -136,6 +130,24 @@ NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
 { return npyv_set_s64(ptr[0], ptr[stride]); }
 NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride)
 { return npyv_set_f64(ptr[0], ptr[stride]); }
+
+//// 64-bit load over 32-bit stride
+NPY_FINLINE npyv_u32 npyv_loadn2_u32(const npy_uint32 *ptr, npy_intp stride)
+{ return (npyv_u32)npyv_set_u64(*(npy_uint64*)ptr, *(npy_uint64*)(ptr + stride)); }
+NPY_FINLINE npyv_s32 npyv_loadn2_s32(const npy_int32 *ptr, npy_intp stride)
+{ return (npyv_s32)npyv_set_u64(*(npy_uint64*)ptr, *(npy_uint64*)(ptr + stride)); }
+#if NPY_SIMD_F32
+NPY_FINLINE npyv_f32 npyv_loadn2_f32(const float *ptr, npy_intp stride)
+{ return (npyv_f32)npyv_set_u64(*(npy_uint64*)ptr, *(npy_uint64*)(ptr + stride)); }
+#endif
+//// 128-bit load over 64-bit stride
+NPY_FINLINE npyv_u64 npyv_loadn2_u64(const npy_uint64 *ptr, npy_intp stride)
+{ (void)stride; return npyv_load_u64(ptr); }
+NPY_FINLINE npyv_s64 npyv_loadn2_s64(const npy_int64 *ptr, npy_intp stride)
+{ (void)stride; return npyv_load_s64(ptr); }
+NPY_FINLINE npyv_f64 npyv_loadn2_f64(const double *ptr, npy_intp stride)
+{ (void)stride; return npyv_load_f64(ptr); }
+
 /***************************
  * Non-contiguous Store
  ***************************/
@@ -164,6 +176,26 @@ NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
 NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a)
 { npyv_storen_u64((npy_uint64*)ptr, stride, (npyv_u64)a); }
 
+//// 64-bit store over 32-bit stride
+NPY_FINLINE void npyv_storen2_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a)
+{
+    *(npy_uint64*)ptr = vec_extract((npyv_u64)a, 0);
+    *(npy_uint64*)(ptr + stride) = vec_extract((npyv_u64)a, 1);
+}
+NPY_FINLINE void npyv_storen2_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a)
+{ npyv_storen2_u32((npy_uint32*)ptr, stride, (npyv_u32)a); }
+#if NPY_SIMD_F32
+NPY_FINLINE void npyv_storen2_f32(float *ptr, npy_intp stride, npyv_f32 a)
+{ npyv_storen2_u32((npy_uint32*)ptr, stride, (npyv_u32)a); }
+#endif
+//// 128-bit store over 64-bit stride
+NPY_FINLINE void npyv_storen2_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
+{ (void)stride; npyv_store_u64(ptr, a); }
+NPY_FINLINE void npyv_storen2_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
+{ (void)stride; npyv_store_s64(ptr, a); }
+NPY_FINLINE void npyv_storen2_f64(double *ptr, npy_intp stride, npyv_f64 a)
+{ (void)stride; npyv_store_f64(ptr, a); }
+
 /*********************************
  * Partial Load
  *********************************/
@@ -173,9 +205,9 @@ NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, n
     assert(nlane > 0);
     npyv_s32 vfill = npyv_setall_s32(fill);
 #ifdef NPY_HAVE_VX
-    const unsigned blane = (unsigned short)nlane;
+    const unsigned blane = (nlane > 4) ? 4 : nlane;
     const npyv_u32 steps = npyv_set_u32(0, 1, 2, 3);
-    const npyv_u32 vlane = npyv_setall_u32((unsigned)blane);
+    const npyv_u32 vlane = npyv_setall_u32(blane);
     const npyv_b32 mask  = vec_cmpgt(vlane, steps);
     npyv_s32 a = vec_load_len(ptr, blane*4-1);
     return vec_sel(vfill, a, mask);
@@ -201,8 +233,8 @@ NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, n
 NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
 {
 #ifdef NPY_HAVE_VX
-    unsigned blane = ((unsigned short)nlane)*4 - 1;
-    return vec_load_len(ptr, blane);
+    unsigned blane = (nlane > 4) ? 4 : nlane;
+    return vec_load_len(ptr, blane*4-1);
 #else
     return npyv_load_till_s32(ptr, nlane, 0);
 #endif
@@ -220,12 +252,33 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n
 NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
 {
 #ifdef NPY_HAVE_VX
-    unsigned blane = (unsigned short)nlane;
+    unsigned blane = (nlane > 2) ? 2 : nlane;
     return vec_load_len((const signed long long*)ptr, blane*8-1);
 #else
     return npyv_load_till_s64(ptr, nlane, 0);
 #endif
 }
+//// 64-bit nlane
+NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane,
+                                          npy_int32 fill_lo, npy_int32 fill_hi)
+{
+    assert(nlane > 0);
+    if (nlane == 1) {
+        return npyv_set_s32(ptr[0], ptr[1], fill_lo, fill_hi);
+    }
+    return npyv_load_s32(ptr);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
+{ return (npyv_s32)npyv_load_tillz_s64((const npy_int64*)ptr, nlane); }
+
+//// 128-bit nlane
+NPY_FINLINE npyv_s64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane,
+                                           npy_int64 fill_lo, npy_int64 fill_hi)
+{ (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); }
+
+NPY_FINLINE npyv_s64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
+{ (void)nlane; return npyv_load_s64(ptr); }
 /*********************************
  * Non-contiguous partial load
  *********************************/
@@ -265,6 +318,34 @@ npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_
 // fill zero to rest lanes
 NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
 { return npyv_loadn_till_s64(ptr, stride, nlane, 0); }
+
+//// 64-bit load over 32-bit stride
+NPY_FINLINE npyv_s32 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane,
+                                                 npy_int32 fill_lo, npy_int32 fill_hi)
+{
+    assert(nlane > 0);
+    if (nlane == 1) {
+        return npyv_set_s32(ptr[0], ptr[1], fill_lo, fill_hi);
+    }
+    return npyv_loadn2_s32(ptr, stride);
+}
+NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
+{
+    assert(nlane > 0);
+    if (nlane == 1) {
+        return (npyv_s32)npyv_set_s64(*(npy_int64*)ptr, 0);
+    }
+    return npyv_loadn2_s32(ptr, stride);
+}
+
+//// 128-bit load over 64-bit stride
+NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane,
+                                                  npy_int64 fill_lo, npy_int64 fill_hi)
+{ assert(nlane > 0); (void)stride; (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); }
+
+NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
+{ assert(nlane > 0); (void)stride; (void)nlane; return npyv_load_s64(ptr); }
+
 /*********************************
  * Partial store
  *********************************/
@@ -273,7 +354,7 @@ NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a
 {
     assert(nlane > 0);
 #ifdef NPY_HAVE_VX
-    unsigned blane = (unsigned short)nlane;
+    unsigned blane = (nlane > 4) ? 4 : nlane;
     vec_store_len(a, ptr, blane*4-1);
 #else
     switch(nlane) {
@@ -297,7 +378,7 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a
 {
     assert(nlane > 0);
 #ifdef NPY_HAVE_VX
-    unsigned blane = (unsigned short)nlane;
+    unsigned blane = (nlane > 2) ? 2 : nlane;
     vec_store_len(a, (signed long long*)ptr, blane*8-1);
 #else
     if (nlane == 1) {
@@ -307,6 +388,18 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a
     npyv_store_s64(ptr, a);
 #endif
 }
+
+//// 64-bit nlane
+NPY_FINLINE void npyv_store2_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a)
+{ npyv_store_till_s64((npy_int64*)ptr, nlane, (npyv_s64)a); }
+
+//// 128-bit nlane
+NPY_FINLINE void npyv_store2_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
+{
+    assert(nlane > 0); (void)nlane;
+    npyv_store_s64(ptr, a);
+}
+
 /*********************************
  * Non-contiguous partial store
  *********************************/
@@ -314,16 +407,21 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a
 NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
 {
     assert(nlane > 0);
+    ptr[stride*0] = vec_extract(a, 0);
     switch(nlane) {
-    default:
-        ptr[stride*3] = vec_extract(a, 3);
-    case 3:
-        ptr[stride*2] = vec_extract(a, 2);
+    case 1:
+        return;
     case 2:
         ptr[stride*1] = vec_extract(a, 1);
-    case 1:
-        ptr[stride*0] = vec_extract(a, 0);
-        break;
+        return;
+    case 3:
+        ptr[stride*1] = vec_extract(a, 1);
+        ptr[stride*2] = vec_extract(a, 2);
+        return;
+    default:
+         ptr[stride*1] = vec_extract(a, 1);
+         ptr[stride*2] = vec_extract(a, 2);
+         ptr[stride*3] = vec_extract(a, 3);
     }
 }
 //// 64
@@ -336,6 +434,21 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
     }
     npyv_storen_s64(ptr, stride, a);
 }
+
+//// 64-bit store over 32-bit stride
+NPY_FINLINE void npyv_storen2_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
+{
+    assert(nlane > 0);
+    npyv_storel_s32(ptr, a);
+    if (nlane > 1) {
+        npyv_storeh_s32(ptr + stride, a);
+    }
+}
+
+//// 128-bit store over 64-bit stride
+NPY_FINLINE void npyv_storen2_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a)
+{ assert(nlane > 0); (void)stride; (void)nlane; npyv_store_s64(ptr, a); }
+
 /*****************************************************************
  * Implement partial load/store for u32/f32/u64/f64... via casting
  *****************************************************************/
@@ -346,7 +459,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
         union {                                                                             \
             npyv_lanetype_##F_SFX from_##F_SFX;                                             \
             npyv_lanetype_##T_SFX to_##T_SFX;                                               \
-        } pun = {.from_##F_SFX = fill};                                                     \
+        } pun;                                                                              \
+        pun.from_##F_SFX = fill;                                                            \
         return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX(                   \
             (const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX                       \
         ));                                                                                 \
@@ -358,7 +472,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
         union {                                                                             \
             npyv_lanetype_##F_SFX from_##F_SFX;                                             \
             npyv_lanetype_##T_SFX to_##T_SFX;                                               \
-        } pun = {.from_##F_SFX = fill};                                                     \
+        } pun;                                                                              \
+        pun.from_##F_SFX = fill;                                                            \
         return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX(                  \
             (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX               \
         ));                                                                                 \
@@ -401,6 +516,114 @@ NPYV_IMPL_VEC_REST_PARTIAL_TYPES(f32, s32)
 NPYV_IMPL_VEC_REST_PARTIAL_TYPES(u64, s64)
 NPYV_IMPL_VEC_REST_PARTIAL_TYPES(f64, s64)
 
+// 128-bit/64-bit stride
+#define NPYV_IMPL_VEC_REST_PARTIAL_TYPES_PAIR(F_SFX, T_SFX)                                 \
+    NPY_FINLINE npyv_##F_SFX npyv_load2_till_##F_SFX                                        \
+    (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane,                                     \
+     npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi)                          \
+    {                                                                                       \
+        union pun {                                                                         \
+            npyv_lanetype_##F_SFX from_##F_SFX;                                             \
+            npyv_lanetype_##T_SFX to_##T_SFX;                                               \
+        };                                                                                  \
+        union pun pun_lo;                                                                   \
+        union pun pun_hi;                                                                   \
+        pun_lo.from_##F_SFX = fill_lo;                                                      \
+        pun_hi.from_##F_SFX = fill_hi;                                                      \
+        return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_till_##T_SFX(                  \
+            (const npyv_lanetype_##T_SFX *)ptr, nlane, pun_lo.to_##T_SFX, pun_hi.to_##T_SFX \
+        ));                                                                                 \
+    }                                                                                       \
+    NPY_FINLINE npyv_##F_SFX npyv_loadn2_till_##F_SFX                                       \
+    (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane,                    \
+     npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi)                          \
+    {                                                                                       \
+        union pun {                                                                         \
+            npyv_lanetype_##F_SFX from_##F_SFX;                                             \
+            npyv_lanetype_##T_SFX to_##T_SFX;                                               \
+        };                                                                                  \
+        union pun pun_lo;                                                                   \
+        union pun pun_hi;                                                                   \
+        pun_lo.from_##F_SFX = fill_lo;                                                      \
+        pun_hi.from_##F_SFX = fill_hi;                                                      \
+        return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_till_##T_SFX(                 \
+            (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun_lo.to_##T_SFX,           \
+            pun_hi.to_##T_SFX                                                               \
+        ));                                                                                 \
+    }                                                                                       \
+    NPY_FINLINE npyv_##F_SFX npyv_load2_tillz_##F_SFX                                       \
+    (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane)                                     \
+    {                                                                                       \
+        return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_tillz_##T_SFX(                 \
+            (const npyv_lanetype_##T_SFX *)ptr, nlane                                       \
+        ));                                                                                 \
+    }                                                                                       \
+    NPY_FINLINE npyv_##F_SFX npyv_loadn2_tillz_##F_SFX                                      \
+    (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane)                    \
+    {                                                                                       \
+        return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_tillz_##T_SFX(                \
+            (const npyv_lanetype_##T_SFX *)ptr, stride, nlane                               \
+        ));                                                                                 \
+    }                                                                                       \
+    NPY_FINLINE void npyv_store2_till_##F_SFX                                               \
+    (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a)                           \
+    {                                                                                       \
+        npyv_store2_till_##T_SFX(                                                           \
+            (npyv_lanetype_##T_SFX *)ptr, nlane,                                            \
+            npyv_reinterpret_##T_SFX##_##F_SFX(a)                                           \
+        );                                                                                  \
+    }                                                                                       \
+    NPY_FINLINE void npyv_storen2_till_##F_SFX                                              \
+    (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a)          \
+    {                                                                                       \
+        npyv_storen2_till_##T_SFX(                                                          \
+            (npyv_lanetype_##T_SFX *)ptr, stride, nlane,                                    \
+            npyv_reinterpret_##T_SFX##_##F_SFX(a)                                           \
+        );                                                                                  \
+    }
+
+NPYV_IMPL_VEC_REST_PARTIAL_TYPES_PAIR(u32, s32)
+#if NPY_SIMD_F32
+NPYV_IMPL_VEC_REST_PARTIAL_TYPES_PAIR(f32, s32)
+#endif
+NPYV_IMPL_VEC_REST_PARTIAL_TYPES_PAIR(u64, s64)
+NPYV_IMPL_VEC_REST_PARTIAL_TYPES_PAIR(f64, s64)
+
+/************************************************************
+ *  de-interlave load / interleave contiguous store
+ ************************************************************/
+// two channels
+#define NPYV_IMPL_VEC_MEM_INTERLEAVE(SFX)                                \
+    NPY_FINLINE npyv_##SFX##x2 npyv_zip_##SFX(npyv_##SFX, npyv_##SFX);   \
+    NPY_FINLINE npyv_##SFX##x2 npyv_unzip_##SFX(npyv_##SFX, npyv_##SFX); \
+    NPY_FINLINE npyv_##SFX##x2 npyv_load_##SFX##x2(                      \
+        const npyv_lanetype_##SFX *ptr                                   \
+    ) {                                                                  \
+        return npyv_unzip_##SFX(                                         \
+            npyv_load_##SFX(ptr), npyv_load_##SFX(ptr+npyv_nlanes_##SFX) \
+        );                                                               \
+    }                                                                    \
+    NPY_FINLINE void npyv_store_##SFX##x2(                               \
+        npyv_lanetype_##SFX *ptr, npyv_##SFX##x2 v                       \
+    ) {                                                                  \
+        npyv_##SFX##x2 zip = npyv_zip_##SFX(v.val[0], v.val[1]);         \
+        npyv_store_##SFX(ptr, zip.val[0]);                               \
+        npyv_store_##SFX(ptr + npyv_nlanes_##SFX, zip.val[1]);           \
+    }
+
+NPYV_IMPL_VEC_MEM_INTERLEAVE(u8)
+NPYV_IMPL_VEC_MEM_INTERLEAVE(s8)
+NPYV_IMPL_VEC_MEM_INTERLEAVE(u16)
+NPYV_IMPL_VEC_MEM_INTERLEAVE(s16)
+NPYV_IMPL_VEC_MEM_INTERLEAVE(u32)
+NPYV_IMPL_VEC_MEM_INTERLEAVE(s32)
+NPYV_IMPL_VEC_MEM_INTERLEAVE(u64)
+NPYV_IMPL_VEC_MEM_INTERLEAVE(s64)
+#if NPY_SIMD_F32
+NPYV_IMPL_VEC_MEM_INTERLEAVE(f32)
+#endif
+NPYV_IMPL_VEC_MEM_INTERLEAVE(f64)
+
 /*********************************
  * Lookup table
  *********************************/
diff --git a/numpy/core/src/common/simd/vec/misc.h b/numpy/core/src/common/simd/vec/misc.h
index 7ea0f21f6..79c194d90 100644
--- a/numpy/core/src/common/simd/vec/misc.h
+++ b/numpy/core/src/common/simd/vec/misc.h
@@ -26,28 +26,28 @@
 #define NPYV_IMPL_VEC_SPLTW(T_VEC, V) ((T_VEC){V, V, V, V})
 #define NPYV_IMPL_VEC_SPLTD(T_VEC, V) ((T_VEC){V, V})
 
-#define npyv_setall_u8(VAL)  NPYV_IMPL_VEC_SPLTB(npyv_u8,  (unsigned char)VAL)
-#define npyv_setall_s8(VAL)  NPYV_IMPL_VEC_SPLTB(npyv_s8,  (signed char)VAL)
-#define npyv_setall_u16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_u16, (unsigned short)VAL)
-#define npyv_setall_s16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_s16, (short)VAL)
-#define npyv_setall_u32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_u32, (unsigned int)VAL)
-#define npyv_setall_s32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_s32, (int)VAL)
+#define npyv_setall_u8(VAL)  NPYV_IMPL_VEC_SPLTB(npyv_u8,  (unsigned char)(VAL))
+#define npyv_setall_s8(VAL)  NPYV_IMPL_VEC_SPLTB(npyv_s8,  (signed char)(VAL))
+#define npyv_setall_u16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_u16, (unsigned short)(VAL))
+#define npyv_setall_s16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_s16, (short)(VAL))
+#define npyv_setall_u32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_u32, (unsigned int)(VAL))
+#define npyv_setall_s32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_s32, (int)(VAL))
 #if NPY_SIMD_F32
-    #define npyv_setall_f32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_f32, VAL)
+    #define npyv_setall_f32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_f32, (VAL))
 #endif
-#define npyv_setall_u64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_u64, (npy_uint64)VAL)
-#define npyv_setall_s64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_s64, (npy_int64)VAL)
+#define npyv_setall_u64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_u64, (npy_uint64)(VAL))
+#define npyv_setall_s64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_s64, (npy_int64)(VAL))
 #define npyv_setall_f64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_f64, VAL)
 
 // vector with specific values set to each lane and
 // set a specific value to all remained lanes
-#define npyv_setf_u8(FILL, ...)  ((npyv_u8){NPYV__SET_FILL_16(char, FILL, __VA_ARGS__)})
-#define npyv_setf_s8(FILL, ...)  ((npyv_s8){NPYV__SET_FILL_16(char, FILL, __VA_ARGS__)})
-#define npyv_setf_u16(FILL, ...) ((npyv_u16){NPYV__SET_FILL_8(short, FILL, __VA_ARGS__)})
+#define npyv_setf_u8(FILL, ...)  ((npyv_u8){NPYV__SET_FILL_16(unsigned char, FILL, __VA_ARGS__)})
+#define npyv_setf_s8(FILL, ...)  ((npyv_s8){NPYV__SET_FILL_16(signed char, FILL, __VA_ARGS__)})
+#define npyv_setf_u16(FILL, ...) ((npyv_u16){NPYV__SET_FILL_8(unsigned short, FILL, __VA_ARGS__)})
 #define npyv_setf_s16(FILL, ...) ((npyv_s16){NPYV__SET_FILL_8(short, FILL, __VA_ARGS__)})
-#define npyv_setf_u32(FILL, ...) ((npyv_u32){NPYV__SET_FILL_4(int, FILL, __VA_ARGS__)})
+#define npyv_setf_u32(FILL, ...) ((npyv_u32){NPYV__SET_FILL_4(unsigned int, FILL, __VA_ARGS__)})
 #define npyv_setf_s32(FILL, ...) ((npyv_s32){NPYV__SET_FILL_4(int, FILL, __VA_ARGS__)})
-#define npyv_setf_u64(FILL, ...) ((npyv_u64){NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__)})
+#define npyv_setf_u64(FILL, ...) ((npyv_u64){NPYV__SET_FILL_2(npy_uint64, FILL, __VA_ARGS__)})
 #define npyv_setf_s64(FILL, ...) ((npyv_s64){NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__)})
 #if NPY_SIMD_F32
     #define npyv_setf_f32(FILL, ...) ((npyv_f32){NPYV__SET_FILL_4(float, FILL, __VA_ARGS__)})
diff --git a/numpy/core/src/common/simd/vec/reorder.h b/numpy/core/src/common/simd/vec/reorder.h
index b60b9287d..3910980a2 100644
--- a/numpy/core/src/common/simd/vec/reorder.h
+++ b/numpy/core/src/common/simd/vec/reorder.h
@@ -68,6 +68,85 @@ NPYV_IMPL_VEC_COMBINE_ZIP(npyv_s64, s64)
 #endif
 NPYV_IMPL_VEC_COMBINE_ZIP(npyv_f64, f64)
 
+// deinterleave two vectors
+NPY_FINLINE npyv_u8x2 npyv_unzip_u8(npyv_u8 ab0, npyv_u8 ab1)
+{
+    const npyv_u8 idx_even = npyv_set_u8(
+        0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+    );
+    const npyv_u8 idx_odd = npyv_set_u8(
+        1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+    );
+    npyv_u8x2 r;
+    r.val[0] = vec_perm(ab0, ab1, idx_even);
+    r.val[1] = vec_perm(ab0, ab1, idx_odd);
+    return r;
+}
+NPY_FINLINE npyv_s8x2 npyv_unzip_s8(npyv_s8 ab0, npyv_s8 ab1)
+{
+    npyv_u8x2 ru = npyv_unzip_u8((npyv_u8)ab0, (npyv_u8)ab1);
+    npyv_s8x2 r;
+    r.val[0] = (npyv_s8)ru.val[0];
+    r.val[1] = (npyv_s8)ru.val[1];
+    return r;
+}
+NPY_FINLINE npyv_u16x2 npyv_unzip_u16(npyv_u16 ab0, npyv_u16 ab1)
+{
+    const npyv_u8 idx_even = npyv_set_u8(
+        0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
+    );
+    const npyv_u8 idx_odd = npyv_set_u8(
+        2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
+    );
+    npyv_u16x2 r;
+    r.val[0] = vec_perm(ab0, ab1, idx_even);
+    r.val[1] = vec_perm(ab0, ab1, idx_odd);
+    return r;
+}
+NPY_FINLINE npyv_s16x2 npyv_unzip_s16(npyv_s16 ab0, npyv_s16 ab1)
+{
+    npyv_u16x2 ru = npyv_unzip_u16((npyv_u16)ab0, (npyv_u16)ab1);
+    npyv_s16x2 r;
+    r.val[0] = (npyv_s16)ru.val[0];
+    r.val[1] = (npyv_s16)ru.val[1];
+    return r;
+}
+NPY_FINLINE npyv_u32x2 npyv_unzip_u32(npyv_u32 ab0, npyv_u32 ab1)
+{
+    npyv_u32 m0 = vec_mergeh(ab0, ab1);
+    npyv_u32 m1 = vec_mergel(ab0, ab1);
+    npyv_u32 r0 = vec_mergeh(m0, m1);
+    npyv_u32 r1 = vec_mergel(m0, m1);
+    npyv_u32x2 r;
+    r.val[0] = r0;
+    r.val[1] = r1;
+    return r;
+}
+NPY_FINLINE npyv_s32x2 npyv_unzip_s32(npyv_s32 ab0, npyv_s32 ab1)
+{
+    npyv_u32x2 ru = npyv_unzip_u32((npyv_u32)ab0, (npyv_u32)ab1);
+    npyv_s32x2 r;
+    r.val[0] = (npyv_s32)ru.val[0];
+    r.val[1] = (npyv_s32)ru.val[1];
+    return r;
+}
+#if NPY_SIMD_F32
+    NPY_FINLINE npyv_f32x2 npyv_unzip_f32(npyv_f32 ab0, npyv_f32 ab1)
+    {
+        npyv_u32x2 ru = npyv_unzip_u32((npyv_u32)ab0, (npyv_u32)ab1);
+        npyv_f32x2 r;
+        r.val[0] = (npyv_f32)ru.val[0];
+        r.val[1] = (npyv_f32)ru.val[1];
+        return r;
+    }
+#endif
+NPY_FINLINE npyv_u64x2 npyv_unzip_u64(npyv_u64 ab0, npyv_u64 ab1)
+{ return npyv_combine_u64(ab0, ab1); }
+NPY_FINLINE npyv_s64x2 npyv_unzip_s64(npyv_s64 ab0, npyv_s64 ab1)
+{ return npyv_combine_s64(ab0, ab1); }
+NPY_FINLINE npyv_f64x2 npyv_unzip_f64(npyv_f64 ab0, npyv_f64 ab1)
+{ return npyv_combine_f64(ab0, ab1); }
+
 // Reverse elements of each 64-bit lane
 NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a)
 {
@@ -111,4 +190,24 @@ NPY_FINLINE npyv_s32 npyv_rev64_s32(npyv_s32 a)
     { return (npyv_f32)npyv_rev64_u32((npyv_u32)a); }
 #endif
 
+// Permuting the elements of each 128-bit lane by immediate index for
+// each element.
+#define npyv_permi128_u32(A, E0, E1, E2, E3)      \
+    vec_perm(A, A, npyv_set_u8(                   \
+        (E0<<2), (E0<<2)+1, (E0<<2)+2, (E0<<2)+3, \
+        (E1<<2), (E1<<2)+1, (E1<<2)+2, (E1<<2)+3, \
+        (E2<<2), (E2<<2)+1, (E2<<2)+2, (E2<<2)+3, \
+        (E3<<2), (E3<<2)+1, (E3<<2)+2, (E3<<2)+3  \
+    ))
+#define npyv_permi128_s32 npyv_permi128_u32
+#define npyv_permi128_f32 npyv_permi128_u32
+
+#if defined(__IBMC__) || defined(vec_permi)
+    #define npyv_permi128_u64(A, E0, E1) vec_permi(A, A, ((E0)<<1) | (E1))
+#else
+    #define npyv_permi128_u64(A, E0, E1) vec_xxpermdi(A, A, ((E0)<<1) | (E1))
+#endif
+#define npyv_permi128_s64 npyv_permi128_u64
+#define npyv_permi128_f64 npyv_permi128_u64
+
 #endif // _NPY_SIMD_VEC_REORDER_H
diff --git a/numpy/core/src/common/simd/vec/vec.h b/numpy/core/src/common/simd/vec/vec.h
index abcd33ce1..1d4508669 100644
--- a/numpy/core/src/common/simd/vec/vec.h
+++ b/numpy/core/src/common/simd/vec/vec.h
@@ -39,8 +39,10 @@
 
 #ifdef NPY_HAVE_VX
     #define NPY_SIMD_BIGENDIAN 1
+    #define NPY_SIMD_CMPSIGNAL 0
 #else
     #define NPY_SIMD_BIGENDIAN 0
+    #define NPY_SIMD_CMPSIGNAL 1
 #endif
 
 typedef __vector unsigned char      npyv_u8;
diff --git a/numpy/core/src/common/templ_common.h.src b/numpy/core/src/common/templ_common.h.src
index 1eea8eb37..d0d1917fe 100644
--- a/numpy/core/src/common/templ_common.h.src
+++ b/numpy/core/src/common/templ_common.h.src
@@ -4,6 +4,7 @@
 /* utility functions that profit from templates */
 
 #include "numpy/npy_common.h"
+#include <assert.h>
 
 /**begin repeat
  *  #name = int, uint, long, ulong,
@@ -24,7 +25,7 @@
  * that is not checked. Could use absolute values and adjust the sign if that
  * functionality was desired.
  */
-static NPY_INLINE int
+static inline int
 npy_mul_with_overflow_@name@(@type@ * r, @type@ a, @type@ b)
 {
 #ifdef HAVE___BUILTIN_MUL_OVERFLOW
@@ -51,14 +52,14 @@ npy_mul_with_overflow_@name@(@type@ * r, @type@ a, @type@ b)
 }
 /**end repeat**/
 
-static NPY_INLINE int
+static inline int
 npy_mul_sizes_with_overflow (npy_intp * r, npy_intp a, npy_intp b)
 {
 #ifdef HAVE___BUILTIN_MUL_OVERFLOW
     return __builtin_mul_overflow(a, b, r);
 #else
-
-    assert(a >= 0 && b >= 0, "this function only supports non-negative numbers");
+    /* this function only supports non-negative numbers */
+    assert(a >= 0 && b >= 0);
     const npy_intp half_sz = ((npy_intp)1 << ((sizeof(a) * 8 - 1 ) / 2));
 
     *r = a * b;
diff --git a/numpy/core/src/common/ucsnarrow.h b/numpy/core/src/common/ucsnarrow.h
index 6fe157199..4b17a2809 100644
--- a/numpy/core/src/common/ucsnarrow.h
+++ b/numpy/core/src/common/ucsnarrow.h
@@ -2,6 +2,6 @@
 #define NUMPY_CORE_SRC_COMMON_NPY_UCSNARROW_H_
 
 NPY_NO_EXPORT PyUnicodeObject *
-PyUnicode_FromUCS4(char *src, Py_ssize_t size, int swap, int align);
+PyUnicode_FromUCS4(char const *src, Py_ssize_t size, int swap, int align);
 
 #endif  /* NUMPY_CORE_SRC_COMMON_NPY_UCSNARROW_H_ */
diff --git a/numpy/core/src/common/utils.hpp b/numpy/core/src/common/utils.hpp
new file mode 100644
index 000000000..f847cab44
--- /dev/null
+++ b/numpy/core/src/common/utils.hpp
@@ -0,0 +1,51 @@
+#ifndef NUMPY_CORE_SRC_COMMON_UTILS_HPP
+#define NUMPY_CORE_SRC_COMMON_UTILS_HPP
+
+#include "npdef.hpp"
+
+#if NP_HAS_CPP20
+    #include <bit>
+#endif
+
+#include <type_traits>
+#include <string.h>
+
+namespace np {
+
+/** Create a value of type `To` from the bits of `from`.
+ *
+ * similar to `std::bit_cast` but compatible with C++17,
+ * should perform similar to `*reinterpret_cast<To*>(&from)`
+ * or through punning without expecting any undefined behaviors.
+ */
+template<typename To, typename From>
+#if NP_HAS_BUILTIN(__builtin_bit_cast) || NP_HAS_CPP20
+[[nodiscard]] constexpr
+#else
+inline
+#endif
+To BitCast(const From &from) noexcept
+{
+    static_assert(
+        sizeof(To) == sizeof(From),
+        "both data types must have the same size");
+
+    static_assert(
+        std::is_trivially_copyable_v<To> &&
+        std::is_trivially_copyable_v<From>,
+        "both data types must be trivially copyable");
+
+#if NP_HAS_CPP20
+    return std::bit_cast<To>(from);
+#elif NP_HAS_BUILTIN(__builtin_bit_cast)
+    return __builtin_bit_cast(To, from);
+#else
+    To to;
+    memcpy(&to, &from, sizeof(from));
+    return to;
+#endif
+}
+
+} // namespace np
+#endif // NUMPY_CORE_SRC_COMMON_UTILS_HPP
+
diff --git a/numpy/core/src/multiarray/_multiarray_tests.c.src b/numpy/core/src/multiarray/_multiarray_tests.c.src
index b22b2c14d..1fd28e721 100644
--- a/numpy/core/src/multiarray/_multiarray_tests.c.src
+++ b/numpy/core/src/multiarray/_multiarray_tests.c.src
@@ -177,8 +177,14 @@ test_neighborhood_iterator(PyObject* NPY_UNUSED(self), PyObject* args)
         return NULL;
     }
 
-    typenum = PyArray_ObjectType(x, 0);
+    typenum = PyArray_ObjectType(x, NPY_NOTYPE);
+    if (typenum == NPY_NOTYPE) {
+        return NULL;
+    }
     typenum = PyArray_ObjectType(fill, typenum);
+    if (typenum == NPY_NOTYPE) {
+        return NULL;
+    }
 
     ax = (PyArrayObject*)PyArray_FromObject(x, typenum, 1, 10);
     if (ax == NULL) {
@@ -343,7 +349,10 @@ test_neighborhood_iterator_oob(PyObject* NPY_UNUSED(self), PyObject* args)
         return NULL;
     }
 
-    typenum = PyArray_ObjectType(x, 0);
+    typenum = PyArray_ObjectType(x, NPY_NOTYPE);
+    if (typenum == NPY_NOTYPE) {
+        return NULL;
+    }
 
     ax = (PyArrayObject*)PyArray_FromObject(x, typenum, 1, 10);
     if (ax == NULL) {
diff --git a/numpy/core/src/multiarray/abstractdtypes.c b/numpy/core/src/multiarray/abstractdtypes.c
index 3e89d045e..da44fd01f 100644
--- a/numpy/core/src/multiarray/abstractdtypes.c
+++ b/numpy/core/src/multiarray/abstractdtypes.c
@@ -13,7 +13,7 @@
 #include "common.h"
 
 
-static NPY_INLINE PyArray_Descr *
+static inline PyArray_Descr *
 int_default_descriptor(PyArray_DTypeMeta* NPY_UNUSED(cls))
 {
     return PyArray_DescrFromType(NPY_LONG);
@@ -51,7 +51,7 @@ discover_descriptor_from_pyint(
 }
 
 
-static NPY_INLINE PyArray_Descr *
+static inline PyArray_Descr *
 float_default_descriptor(PyArray_DTypeMeta* NPY_UNUSED(cls))
 {
     return PyArray_DescrFromType(NPY_DOUBLE);
@@ -66,7 +66,7 @@ discover_descriptor_from_pyfloat(
     return PyArray_DescrFromType(NPY_DOUBLE);
 }
 
-static NPY_INLINE PyArray_Descr *
+static inline PyArray_Descr *
 complex_default_descriptor(PyArray_DTypeMeta* NPY_UNUSED(cls))
 {
     return PyArray_DescrFromType(NPY_CDOUBLE);
@@ -312,54 +312,54 @@ complex_common_dtype(PyArray_DTypeMeta *cls, PyArray_DTypeMeta *other)
  *       They will have to be renamed and exposed in that capacity.
  */
 NPY_DType_Slots pyintabstractdtype_slots = {
-    .default_descr = int_default_descriptor,
     .discover_descr_from_pyobject = discover_descriptor_from_pyint,
+    .default_descr = int_default_descriptor,
     .common_dtype = int_common_dtype,
 };
 
 NPY_NO_EXPORT PyArray_DTypeMeta PyArray_PyIntAbstractDType = {{{
         PyVarObject_HEAD_INIT(&PyArrayDTypeMeta_Type, 0)
+        .tp_name = "numpy._IntegerAbstractDType",
         .tp_basicsize = sizeof(PyArray_Descr),
         .tp_flags = Py_TPFLAGS_DEFAULT,
-        .tp_name = "numpy._IntegerAbstractDType",
     },},
-    .flags = NPY_DT_ABSTRACT,
     .type_num = -1,
+    .flags = NPY_DT_ABSTRACT,
     .dt_slots = &pyintabstractdtype_slots,
 };
 
 
 NPY_DType_Slots pyfloatabstractdtype_slots = {
-    .default_descr = float_default_descriptor,
     .discover_descr_from_pyobject = discover_descriptor_from_pyfloat,
+    .default_descr = float_default_descriptor,
     .common_dtype = float_common_dtype,
 };
 
 NPY_NO_EXPORT PyArray_DTypeMeta PyArray_PyFloatAbstractDType = {{{
         PyVarObject_HEAD_INIT(&PyArrayDTypeMeta_Type, 0)
+        .tp_name = "numpy._FloatAbstractDType",
         .tp_basicsize = sizeof(PyArray_Descr),
        .tp_flags = Py_TPFLAGS_DEFAULT,
-        .tp_name = "numpy._FloatAbstractDType",
     },},
-    .flags = NPY_DT_ABSTRACT,
     .type_num = -1,
+    .flags = NPY_DT_ABSTRACT,
     .dt_slots = &pyfloatabstractdtype_slots,
 };
 
 
 NPY_DType_Slots pycomplexabstractdtype_slots = {
-    .default_descr = complex_default_descriptor,
     .discover_descr_from_pyobject = discover_descriptor_from_pycomplex,
+    .default_descr = complex_default_descriptor,
     .common_dtype = complex_common_dtype,
 };
 
 NPY_NO_EXPORT PyArray_DTypeMeta PyArray_PyComplexAbstractDType = {{{
         PyVarObject_HEAD_INIT(&PyArrayDTypeMeta_Type, 0)
+        .tp_name = "numpy._ComplexAbstractDType",
         .tp_basicsize = sizeof(PyArray_Descr),
          .tp_flags = Py_TPFLAGS_DEFAULT,
-        .tp_name = "numpy._ComplexAbstractDType",
     },},
-    .flags = NPY_DT_ABSTRACT,
     .type_num = -1,
+    .flags = NPY_DT_ABSTRACT,
     .dt_slots = &pycomplexabstractdtype_slots,
 };
diff --git a/numpy/core/src/multiarray/abstractdtypes.h b/numpy/core/src/multiarray/abstractdtypes.h
index 6901ec213..a3f6ceb05 100644
--- a/numpy/core/src/multiarray/abstractdtypes.h
+++ b/numpy/core/src/multiarray/abstractdtypes.h
@@ -30,7 +30,7 @@ initialize_and_map_pytypes_to_dtypes(void);
  *        replaced with the abstract DType.
  * @return 0 if the `obj` was not a python scalar, and 1 if it was.
  */
-static NPY_INLINE int
+static inline int
 npy_mark_tmp_array_if_pyscalar(
         PyObject *obj, PyArrayObject *arr, PyArray_DTypeMeta **dtype)
 {
@@ -58,7 +58,8 @@ npy_mark_tmp_array_if_pyscalar(
         }
         return 1;
     }
-    else if (PyComplex_Check(obj) && PyArray_TYPE(arr) == NPY_CDOUBLE) {
+    else if (PyComplex_Check(obj) && !PyArray_IsScalar(obj, CDouble)
+             && PyArray_TYPE(arr) == NPY_CDOUBLE) {
         ((PyArrayObject_fields *)arr)->flags |= NPY_ARRAY_WAS_PYTHON_COMPLEX;
         if (dtype != NULL) {
             Py_INCREF(&PyArray_PyComplexAbstractDType);
diff --git a/numpy/core/src/multiarray/alloc.c b/numpy/core/src/multiarray/alloc.c
index 8b765aa95..d3b8612f7 100644
--- a/numpy/core/src/multiarray/alloc.c
+++ b/numpy/core/src/multiarray/alloc.c
@@ -88,7 +88,7 @@ _set_madvise_hugepage(PyObject *NPY_UNUSED(self), PyObject *enabled_obj)
  * base function for data cache with 1 byte buckets and dimension cache with
  * sizeof(npy_intp) byte buckets
  */
-static NPY_INLINE void *
+static inline void *
 _npy_alloc_cache(npy_uintp nelem, npy_uintp esz, npy_uint msz,
                  cache_bucket * cache, void * (*alloc)(size_t))
 {
@@ -126,7 +126,7 @@ _npy_alloc_cache(npy_uintp nelem, npy_uintp esz, npy_uint msz,
  * return pointer p to cache, nelem is number of elements of the cache bucket
  * size (1 or sizeof(npy_intp)) of the block pointed too
  */
-static NPY_INLINE void
+static inline void
 _npy_free_cache(void * p, npy_uintp nelem, npy_uint msz,
                 cache_bucket * cache, void (*dealloc)(void *))
 {
@@ -206,7 +206,7 @@ npy_free_cache_dim(void * p, npy_uintp sz)
 }
 
 /* Similar to array_dealloc in arrayobject.c */
-static NPY_INLINE void
+static inline void
 WARN_NO_RETURN(PyObject* warning, const char * msg) {
     if (PyErr_WarnEx(warning, msg, 1) < 0) {
         PyObject * s;
@@ -364,7 +364,7 @@ PyDataMem_RENEW(void *ptr, size_t size)
 // The default data mem allocator malloc routine does not make use of a ctx.
 // It should be called only through PyDataMem_UserNEW
 // since itself does not handle eventhook and tracemalloc logic.
-static NPY_INLINE void *
+static inline void *
 default_malloc(void *NPY_UNUSED(ctx), size_t size)
 {
     return _npy_alloc_cache(size, 1, NBUCKETS, datacache, &malloc);
@@ -373,7 +373,7 @@ default_malloc(void *NPY_UNUSED(ctx), size_t size)
 // The default data mem allocator calloc routine does not make use of a ctx.
 // It should be called only through PyDataMem_UserNEW_ZEROED
 // since itself does not handle eventhook and tracemalloc logic.
-static NPY_INLINE void *
+static inline void *
 default_calloc(void *NPY_UNUSED(ctx), size_t nelem, size_t elsize)
 {
     void * p;
@@ -395,7 +395,7 @@ default_calloc(void *NPY_UNUSED(ctx), size_t nelem, size_t elsize)
 // The default data mem allocator realloc routine does not make use of a ctx.
 // It should be called only through PyDataMem_UserRENEW
 // since itself does not handle eventhook and tracemalloc logic.
-static NPY_INLINE void *
+static inline void *
 default_realloc(void *NPY_UNUSED(ctx), void *ptr, size_t new_size)
 {
     return realloc(ptr, new_size);
@@ -404,7 +404,7 @@ default_realloc(void *NPY_UNUSED(ctx), void *ptr, size_t new_size)
 // The default data mem allocator free routine does not make use of a ctx.
 // It should be called only through PyDataMem_UserFREE
 // since itself does not handle eventhook and tracemalloc logic.
-static NPY_INLINE void
+static inline void
 default_free(void *NPY_UNUSED(ctx), void *ptr, size_t size)
 {
     _npy_free_cache(ptr, size, NBUCKETS, datacache, &free);
diff --git a/numpy/core/src/multiarray/alloc.h b/numpy/core/src/multiarray/alloc.h
index 0f5a7524c..186eb5487 100644
--- a/numpy/core/src/multiarray/alloc.h
+++ b/numpy/core/src/multiarray/alloc.h
@@ -31,13 +31,13 @@ npy_alloc_cache_dim(npy_uintp sz);
 NPY_NO_EXPORT void
 npy_free_cache_dim(void * p, npy_uintp sd);
 
-static NPY_INLINE void
+static inline void
 npy_free_cache_dim_obj(PyArray_Dims dims)
 {
     npy_free_cache_dim(dims.ptr, dims.len);
 }
 
-static NPY_INLINE void
+static inline void
 npy_free_cache_dim_array(PyArrayObject * arr)
 {
     npy_free_cache_dim(PyArray_DIMS(arr), PyArray_NDIM(arr));
diff --git a/numpy/core/src/multiarray/argfunc.dispatch.c.src b/numpy/core/src/multiarray/argfunc.dispatch.c.src
index 1d7753275..d79be1df5 100644
--- a/numpy/core/src/multiarray/argfunc.dispatch.c.src
+++ b/numpy/core/src/multiarray/argfunc.dispatch.c.src
@@ -194,7 +194,7 @@ simd_@func@_@sfx@(npyv_lanetype_@sfx@ *ip, npy_intp len)
         npyv_@bsfx@ nnan_ab = npyv_and_@bsfx@(nnan_a, nnan_b);
         npyv_@bsfx@ nnan_cd = npyv_and_@bsfx@(nnan_c, nnan_d);
         npy_uint64 nnan = npyv_tobits_@bsfx@(npyv_and_@bsfx@(nnan_ab, nnan_cd));
-        if (nnan != ((1LL << vstep) - 1)) {
+        if ((unsigned long long int)nnan != ((1ULL << vstep) - 1)) {
             npy_uint64 nnan_4[4];
             nnan_4[0] = npyv_tobits_@bsfx@(nnan_a);
             nnan_4[1] = npyv_tobits_@bsfx@(nnan_b);
@@ -219,7 +219,7 @@ simd_@func@_@sfx@(npyv_lanetype_@sfx@ *ip, npy_intp len)
     #if @is_fp@
         npyv_@bsfx@ nnan_a = npyv_notnan_@sfx@(a);
         npy_uint64 nnan = npyv_tobits_@bsfx@(nnan_a);
-        if (nnan != ((1LL << vstep) - 1)) {
+        if ((unsigned long long int)nnan != ((1ULL << vstep) - 1)) {
             for (int vi = 0; vi < vstep; ++vi) {
                 if (!((nnan >> vi) & 1)) {
                     return i + vi;
diff --git a/numpy/core/src/multiarray/array_assign_array.c b/numpy/core/src/multiarray/array_assign_array.c
index 9d5bf6875..7aee7d6e9 100644
--- a/numpy/core/src/multiarray/array_assign_array.c
+++ b/numpy/core/src/multiarray/array_assign_array.c
@@ -130,7 +130,7 @@ raw_array_assign_array(int ndim, npy_intp const *shape,
     }
 
     if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
-        npy_clear_floatstatus_barrier(src_data);
+        npy_clear_floatstatus_barrier((char*)&src_data);
     }
     if (!(flags & NPY_METH_REQUIRES_PYAPI)) {
         NPY_BEGIN_THREADS;
@@ -153,7 +153,7 @@ raw_array_assign_array(int ndim, npy_intp const *shape,
     NPY_cast_info_xfree(&cast_info);
 
     if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
-        int fpes = npy_get_floatstatus_barrier(src_data);
+        int fpes = npy_get_floatstatus_barrier((char*)&src_data);
         if (fpes && PyUFunc_GiveFloatingpointErrors("cast", fpes) < 0) {
             return -1;
         }
diff --git a/numpy/core/src/multiarray/array_coercion.c b/numpy/core/src/multiarray/array_coercion.c
index 117ce1805..ba9118052 100644
--- a/numpy/core/src/multiarray/array_coercion.c
+++ b/numpy/core/src/multiarray/array_coercion.c
@@ -16,6 +16,7 @@
 #include "common_dtype.h"
 #include "dtypemeta.h"
 
+#include "npy_argparse.h"
 #include "abstractdtypes.h"
 #include "array_coercion.h"
 #include "ctors.h"
@@ -209,7 +210,7 @@ _PyArray_MapPyTypeToDType(
  * @param pytype Python Type to look up
  * @return DType, None if it is a known non-scalar, or NULL if an unknown object.
  */
-static NPY_INLINE PyArray_DTypeMeta *
+static inline PyArray_DTypeMeta *
 npy_discover_dtype_from_pytype(PyTypeObject *pytype)
 {
     PyObject *DType;
@@ -262,7 +263,7 @@ PyArray_DiscoverDTypeFromScalarType(PyTypeObject *pytype)
  *        it can/wants to handle the (possible) scalar value.
  * @return New reference to either a DType class, Py_None, or NULL on error.
  */
-static NPY_INLINE PyArray_DTypeMeta *
+static inline PyArray_DTypeMeta *
 discover_dtype_from_pyobject(
         PyObject *obj, enum _dtype_discovery_flags *flags,
         PyArray_DTypeMeta *fixed_DType)
@@ -346,7 +347,7 @@ discover_dtype_from_pyobject(
  * @param obj The Python scalar object. At the time of calling this function
  *        it must be known that `obj` should represent a scalar.
  */
-static NPY_INLINE PyArray_Descr *
+static inline PyArray_Descr *
 find_scalar_descriptor(
         PyArray_DTypeMeta *fixed_DType, PyArray_DTypeMeta *DType,
         PyObject *obj)
@@ -552,8 +553,8 @@ PyArray_Pack(PyArray_Descr *descr, char *item, PyObject *value)
 
 static int
 update_shape(int curr_ndim, int *max_ndim,
-             npy_intp out_shape[NPY_MAXDIMS], int new_ndim,
-             const npy_intp new_shape[NPY_MAXDIMS], npy_bool sequence,
+             npy_intp out_shape[], int new_ndim,
+             const npy_intp new_shape[], npy_bool sequence,
              enum _dtype_discovery_flags *flags)
 {
     int success = 0;  /* unsuccessful if array is ragged */
@@ -611,7 +612,7 @@ static coercion_cache_obj *_coercion_cache_cache[COERCION_CACHE_CACHE_SIZE];
 /*
  * Steals a reference to the object.
  */
-static NPY_INLINE int
+static inline int
 npy_new_coercion_cache(
         PyObject *converted_obj, PyObject *arr_or_sequence, npy_bool sequence,
         coercion_cache_obj ***next_ptr, int ndim)
@@ -681,7 +682,7 @@ npy_free_coercion_cache(coercion_cache_obj *next) {
  * @param flags dtype discover flags to signal failed promotion.
  * @return -1 on error, 0 on success.
  */
-static NPY_INLINE int
+static inline int
 handle_promotion(PyArray_Descr **out_descr, PyArray_Descr *descr,
         PyArray_DTypeMeta *fixed_DType, enum _dtype_discovery_flags *flags)
 {
@@ -726,7 +727,7 @@ handle_promotion(PyArray_Descr **out_descr, PyArray_Descr *descr,
  *
  * @return 0 on success -1 on error
  */
-static NPY_INLINE int
+static inline int
 handle_scalar(
         PyObject *obj, int curr_dims, int *max_dims,
         PyArray_Descr **out_descr, npy_intp *out_shape,
@@ -873,42 +874,62 @@ find_descriptor_from_array(
 
 /**
  * Given a dtype or DType object, find the correct descriptor to cast the
- * array to.
+ * array to.  In some places, this function is used with dtype=NULL which
+ * means that legacy behavior is used: The dtype instances "S0", "U0", and
+ * "V0" are converted to mean the DType classes instead.
+ * When dtype != NULL, this path is ignored, and the function does nothing
+ * unless descr == NULL. If both descr and dtype are null, it returns the
+ * descriptor for the array.
  *
  * This function is identical to normal casting using only the dtype, however,
  * it supports inspecting the elements when the array has object dtype
  * (and the given datatype describes a parametric DType class).
  *
  * @param arr
- * @param dtype A dtype instance or class.
+ * @param dtype NULL or a dtype class
+ * @param descr A dtype instance, if the dtype is NULL the dtype class is
+ *              found and e.g. "S0" is converted to denote only String.
  * @return A concrete dtype instance or NULL
  */
 NPY_NO_EXPORT PyArray_Descr *
-PyArray_AdaptDescriptorToArray(PyArrayObject *arr, PyObject *dtype)
+PyArray_AdaptDescriptorToArray(
+        PyArrayObject *arr, PyArray_DTypeMeta *dtype, PyArray_Descr *descr)
 {
     /* If the requested dtype is flexible, adapt it */
-    PyArray_Descr *new_dtype;
-    PyArray_DTypeMeta *new_DType;
+    PyArray_Descr *new_descr;
     int res;
 
-    res = PyArray_ExtractDTypeAndDescriptor((PyObject *)dtype,
-            &new_dtype, &new_DType);
-    if (res < 0) {
-        return NULL;
+    if (dtype != NULL && descr != NULL) {
+        /* descr was given and no special logic, return (call not necessary) */
+        Py_INCREF(descr);
+        return descr;
     }
-    if (new_dtype == NULL) {
-        res = find_descriptor_from_array(arr, new_DType, &new_dtype);
+    if (dtype == NULL) {
+        res = PyArray_ExtractDTypeAndDescriptor(descr, &new_descr, &dtype);
         if (res < 0) {
-            Py_DECREF(new_DType);
             return NULL;
         }
-        if (new_dtype == NULL) {
-            /* This is an object array but contained no elements, use default */
-            new_dtype = NPY_DT_CALL_default_descr(new_DType);
+        if (new_descr != NULL) {
+            Py_DECREF(dtype);
+            return new_descr;
         }
     }
-    Py_DECREF(new_DType);
-    return new_dtype;
+    else {
+        assert(descr == NULL);  /* gueranteed above */
+        Py_INCREF(dtype);
+    }
+
+    res = find_descriptor_from_array(arr, dtype, &new_descr);
+    if (res < 0) {
+        Py_DECREF(dtype);
+        return NULL;
+    }
+    if (new_descr == NULL) {
+        /* This is an object array but contained no elements, use default */
+        new_descr = NPY_DT_CALL_default_descr(dtype);
+    }
+    Py_XDECREF(dtype);
+    return new_descr;
 }
 
 
@@ -1004,53 +1025,6 @@ PyArray_DiscoverDTypeAndShape_Recursive(
             Py_DECREF(arr);
             arr = NULL;
         }
-        else if (curr_dims > 0 && curr_dims != max_dims) {
-            /*
-             * Deprecated 2020-12-09, NumPy 1.20
-             *
-             * See https://github.com/numpy/numpy/issues/17965
-             * Shapely had objects which are not sequences but did export
-             * the array-interface (and so are arguably array-like).
-             * Previously numpy would not use array-like information during
-             * shape discovery, so that it ended up acting as if this was
-             * an (unknown) scalar but with the specified dtype.
-             * Thus we ignore "scalars" here, as the value stored in the
-             * array should be acceptable.
-             */
-            if (PyArray_NDIM(arr) > 0 && NPY_UNLIKELY(!PySequence_Check(obj))) {
-                if (PyErr_WarnFormat(PyExc_FutureWarning, 1,
-                        "The input object of type '%s' is an array-like "
-                        "implementing one of the corresponding protocols "
-                        "(`__array__`, `__array_interface__` or "
-                        "`__array_struct__`); but not a sequence (or 0-D). "
-                        "In the future, this object will be coerced as if it "
-                        "was first converted using `np.array(obj)`. "
-                        "To retain the old behaviour, you have to either "
-                        "modify the type '%s', or assign to an empty array "
-                        "created with `np.empty(correct_shape, dtype=object)`.",
-                        Py_TYPE(obj)->tp_name, Py_TYPE(obj)->tp_name) < 0) {
-                    Py_DECREF(arr);
-                    return -1;
-                }
-                /*
-                 * Strangely enough, even though we threw away the result here,
-                 * we did use it during descriptor discovery, so promote it:
-                 */
-                if (update_shape(curr_dims, &max_dims, out_shape,
-                        0, NULL, NPY_FALSE, flags) < 0) {
-                    *flags |= FOUND_RAGGED_ARRAY;
-                    Py_DECREF(arr);
-                    return max_dims;
-                }
-                if (!(*flags & DESCRIPTOR_WAS_SET) && handle_promotion(
-                        out_descr, PyArray_DESCR(arr), fixed_DType, flags) < 0) {
-                    Py_DECREF(arr);
-                    return -1;
-                }
-                Py_DECREF(arr);
-                return max_dims;
-            }
-        }
     }
     if (arr != NULL) {
         /*
@@ -1260,7 +1234,9 @@ PyArray_DiscoverDTypeAndShape(
     }
 
     if (requested_descr != NULL) {
-        assert(fixed_DType == NPY_DTYPE(requested_descr));
+        if (fixed_DType != NULL) {
+            assert(fixed_DType == NPY_DTYPE(requested_descr));
+        }
         /* The output descriptor must be the input. */
         Py_INCREF(requested_descr);
         *out_descr = requested_descr;
@@ -1380,105 +1356,25 @@ PyArray_DiscoverDTypeAndShape(
 }
 
 
-
-/**
- * Check the descriptor is a legacy "flexible" DType instance, this is
- * an instance which is (normally) not attached to an array, such as a string
- * of length 0 or a datetime with no unit.
- * These should be largely deprecated, and represent only the DType class
- * for most `dtype` parameters.
- *
- * TODO: This function should eventually receive a deprecation warning and
- *       be removed.
- *
- * @param descr
- * @return 1 if this is not a concrete dtype instance 0 otherwise
- */
-static int
-descr_is_legacy_parametric_instance(PyArray_Descr *descr)
-{
-    if (PyDataType_ISUNSIZED(descr)) {
-        return 1;
-    }
-    /* Flexible descr with generic time unit (which can be adapted) */
-    if (PyDataType_ISDATETIME(descr)) {
-        PyArray_DatetimeMetaData *meta;
-        meta = get_datetime_metadata_from_dtype(descr);
-        if (meta->base == NPY_FR_GENERIC) {
-            return 1;
-        }
-    }
-    return 0;
-}
-
-
-/**
- * Given either a DType instance or class, (or legacy flexible instance),
- * ands sets output dtype instance and DType class. Both results may be
- * NULL, but if `out_descr` is set `out_DType` will always be the
- * corresponding class.
- *
- * @param dtype
- * @param out_descr
- * @param out_DType
- * @return 0 on success -1 on failure
- */
-NPY_NO_EXPORT int
-PyArray_ExtractDTypeAndDescriptor(PyObject *dtype,
-        PyArray_Descr **out_descr, PyArray_DTypeMeta **out_DType)
-{
-    *out_DType = NULL;
-    *out_descr = NULL;
-
-    if (dtype != NULL) {
-        if (PyObject_TypeCheck(dtype, (PyTypeObject *)&PyArrayDTypeMeta_Type)) {
-            assert(dtype != (PyObject * )&PyArrayDescr_Type);  /* not np.dtype */
-            *out_DType = (PyArray_DTypeMeta *)dtype;
-            Py_INCREF(*out_DType);
-        }
-        else if (PyObject_TypeCheck((PyObject *)Py_TYPE(dtype),
-                    (PyTypeObject *)&PyArrayDTypeMeta_Type)) {
-            *out_DType = NPY_DTYPE(dtype);
-            Py_INCREF(*out_DType);
-            if (!descr_is_legacy_parametric_instance((PyArray_Descr *)dtype)) {
-                *out_descr = (PyArray_Descr *)dtype;
-                Py_INCREF(*out_descr);
-            }
-        }
-        else {
-            PyErr_SetString(PyExc_TypeError,
-                    "dtype parameter must be a DType instance or class.");
-            return -1;
-        }
-    }
-    return 0;
-}
-
-
 /*
  * Python API function to expose the dtype+shape discovery functionality
  * directly.
  */
 NPY_NO_EXPORT PyObject *
 _discover_array_parameters(PyObject *NPY_UNUSED(self),
-                           PyObject *args, PyObject *kwargs)
+        PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames)
 {
-    static char *kwlist[] = {"obj", "dtype", NULL};
-
     PyObject *obj;
-    PyObject *dtype = NULL;
-    PyArray_Descr *fixed_descriptor = NULL;
-    PyArray_DTypeMeta *fixed_DType = NULL;
+    npy_dtype_info dt_info = {NULL, NULL};
     npy_intp shape[NPY_MAXDIMS];
 
-    if (!PyArg_ParseTupleAndKeywords(
-            args, kwargs, "O|O:_discover_array_parameters", kwlist,
-            &obj, &dtype)) {
-        return NULL;
-    }
-
-    if (PyArray_ExtractDTypeAndDescriptor(dtype,
-            &fixed_descriptor, &fixed_DType) < 0) {
+    NPY_PREPARE_ARGPARSER;
+    if (npy_parse_arguments(
+            "_discover_array_parameters", args, len_args, kwnames,
+            "", NULL, &obj,
+            "|dtype", &PyArray_DTypeOrDescrConverterOptional, &dt_info,
+            NULL, NULL, NULL) < 0) {
+        /* fixed is last to parse, so never necessary to clean up */
         return NULL;
     }
 
@@ -1487,9 +1383,9 @@ _discover_array_parameters(PyObject *NPY_UNUSED(self),
     int ndim = PyArray_DiscoverDTypeAndShape(
             obj, NPY_MAXDIMS, shape,
             &coercion_cache,
-            fixed_DType, fixed_descriptor, (PyArray_Descr **)&out_dtype, 0);
-    Py_XDECREF(fixed_DType);
-    Py_XDECREF(fixed_descriptor);
+            dt_info.dtype, dt_info.descr, (PyArray_Descr **)&out_dtype, 0);
+    Py_XDECREF(dt_info.dtype);
+    Py_XDECREF(dt_info.descr);
     if (ndim < 0) {
         return NULL;
     }
diff --git a/numpy/core/src/multiarray/array_coercion.h b/numpy/core/src/multiarray/array_coercion.h
index 63d543cf7..0757c1cb8 100644
--- a/numpy/core/src/multiarray/array_coercion.h
+++ b/numpy/core/src/multiarray/array_coercion.h
@@ -26,7 +26,8 @@ NPY_NO_EXPORT int
 PyArray_Pack(PyArray_Descr *descr, char *item, PyObject *value);
 
 NPY_NO_EXPORT PyArray_Descr *
-PyArray_AdaptDescriptorToArray(PyArrayObject *arr, PyObject *dtype);
+PyArray_AdaptDescriptorToArray(
+        PyArrayObject *arr, PyArray_DTypeMeta *dtype, PyArray_Descr *descr);
 
 NPY_NO_EXPORT int
 PyArray_DiscoverDTypeAndShape(
@@ -36,14 +37,9 @@ PyArray_DiscoverDTypeAndShape(
         PyArray_DTypeMeta *fixed_DType, PyArray_Descr *requested_descr,
         PyArray_Descr **out_descr, int never_copy);
 
-NPY_NO_EXPORT int
-PyArray_ExtractDTypeAndDescriptor(PyObject *dtype,
-        PyArray_Descr **out_descr, PyArray_DTypeMeta **out_DType);
-
 NPY_NO_EXPORT PyObject *
 _discover_array_parameters(PyObject *NPY_UNUSED(self),
-                           PyObject *args, PyObject *kwargs);
-
+        PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames);
 
 /* Would make sense to inline the freeing functions everywhere */
 /* Frees the coercion cache object recursively. */
diff --git a/numpy/core/src/multiarray/array_method.c b/numpy/core/src/multiarray/array_method.c
index 3450273b1..87515bb03 100644
--- a/numpy/core/src/multiarray/array_method.c
+++ b/numpy/core/src/multiarray/array_method.c
@@ -27,15 +27,18 @@
  *    weak reference to the input DTypes.
  */
 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _UMATHMODULE
 #define _MULTIARRAYMODULE
 
 #include <npy_pycompat.h>
 #include "arrayobject.h"
+#include "array_coercion.h"
 #include "array_method.h"
 #include "dtypemeta.h"
 #include "common_dtype.h"
 #include "convert_datatype.h"
 #include "common.h"
+#include "numpy/ufuncobject.h"
 
 
 /*
@@ -99,7 +102,7 @@ default_resolve_descriptors(
 }
 
 
-NPY_INLINE static int
+static inline int
 is_contiguous(
         npy_intp const *strides, PyArray_Descr *const *descriptors, int nargs)
 {
@@ -248,6 +251,7 @@ fill_arraymethod_from_slots(
     /* Set the defaults */
     meth->get_strided_loop = &npy_default_get_strided_loop;
     meth->resolve_descriptors = &default_resolve_descriptors;
+    meth->get_reduction_initial = NULL;  /* no initial/identity by default */
 
     /* Fill in the slots passed by the user */
     /*
@@ -260,7 +264,7 @@ fill_arraymethod_from_slots(
             case NPY_METH_resolve_descriptors:
                 meth->resolve_descriptors = slot->pfunc;
                 continue;
-            case NPY_METH_get_loop:
+            case _NPY_METH_get_loop:
                 /*
                  * NOTE: get_loop is considered "unstable" in the public API,
                  *       I do not like the signature, and the `move_references`
@@ -284,6 +288,12 @@ fill_arraymethod_from_slots(
             case NPY_METH_unaligned_contiguous_loop:
                 meth->unaligned_contiguous_loop = slot->pfunc;
                 continue;
+            case NPY_METH_get_reduction_initial:
+                meth->get_reduction_initial = slot->pfunc;
+                continue;
+            case NPY_METH_contiguous_indexed_loop:
+                meth->contiguous_indexed_loop = slot->pfunc;
+                continue;
             default:
                 break;
         }
@@ -334,27 +344,39 @@ fill_arraymethod_from_slots(
     }
 
     /* Check whether the provided loops make sense. */
+    if (meth->flags & NPY_METH_SUPPORTS_UNALIGNED) {
+        if (meth->unaligned_strided_loop == NULL) {
+            PyErr_Format(PyExc_TypeError,
+                    "Must provide unaligned strided inner loop when using "
+                    "NPY_METH_SUPPORTS_UNALIGNED flag (in method: %s)",
+                    spec->name);
+            return -1;
+        }
+    }
+    else {
+        if (meth->unaligned_strided_loop != NULL) {
+            PyErr_Format(PyExc_TypeError,
+                    "Must not provide unaligned strided inner loop when not "
+                    "using NPY_METH_SUPPORTS_UNALIGNED flag (in method: %s)",
+                    spec->name);
+            return -1;
+        }
+    }
+    /* Fill in the blanks: */
+    if (meth->unaligned_contiguous_loop == NULL) {
+        meth->unaligned_contiguous_loop = meth->unaligned_strided_loop;
+    }
     if (meth->strided_loop == NULL) {
-        PyErr_Format(PyExc_TypeError,
-                "Must provide a strided inner loop function. (method: %s)",
-                spec->name);
-        return -1;
+        meth->strided_loop = meth->unaligned_strided_loop;
     }
     if (meth->contiguous_loop == NULL) {
         meth->contiguous_loop = meth->strided_loop;
     }
-    if (meth->unaligned_contiguous_loop != NULL &&
-            meth->unaligned_strided_loop == NULL) {
-        PyErr_Format(PyExc_TypeError,
-                "Must provide unaligned strided inner loop when providing "
-                "a contiguous version. (method: %s)", spec->name);
-        return -1;
-    }
-    if ((meth->unaligned_strided_loop == NULL) !=
-            !(meth->flags & NPY_METH_SUPPORTS_UNALIGNED)) {
+
+    if (meth->strided_loop == NULL) {
         PyErr_Format(PyExc_TypeError,
-                "Must provide unaligned strided inner loop when providing "
-                "a contiguous version. (method: %s)", spec->name);
+                "Must provide a strided inner loop function. (method: %s)",
+                spec->name);
         return -1;
     }
 
@@ -482,8 +504,8 @@ NPY_NO_EXPORT PyTypeObject PyArrayMethod_Type = {
     PyVarObject_HEAD_INIT(NULL, 0)
     .tp_name = "numpy._ArrayMethod",
     .tp_basicsize = sizeof(PyArrayMethodObject),
-    .tp_flags = Py_TPFLAGS_DEFAULT,
     .tp_dealloc = arraymethod_dealloc,
+    .tp_flags = Py_TPFLAGS_DEFAULT,
 };
 
 
@@ -883,7 +905,7 @@ generic_masked_strided_loop(PyArrayMethod_Context *context,
 /*
  * Fetches a strided-loop function that supports a boolean mask as additional
  * (last) operand to the strided-loop.  It is otherwise largely identical to
- * the `get_loop` method which it wraps.
+ * the `get_strided_loop` method which it wraps.
  * This is the core implementation for the ufunc `where=...` keyword argument.
  *
  * NOTE: This function does not support `move_references` or inner dimensions.
@@ -951,9 +973,9 @@ NPY_NO_EXPORT PyTypeObject PyBoundArrayMethod_Type = {
     PyVarObject_HEAD_INIT(NULL, 0)
     .tp_name = "numpy._BoundArrayMethod",
     .tp_basicsize = sizeof(PyBoundArrayMethodObject),
-    .tp_flags = Py_TPFLAGS_DEFAULT,
-    .tp_repr = (reprfunc)boundarraymethod_repr,
     .tp_dealloc = boundarraymethod_dealloc,
+    .tp_repr = (reprfunc)boundarraymethod_repr,
+    .tp_flags = Py_TPFLAGS_DEFAULT,
     .tp_methods = boundarraymethod_methods,
     .tp_getset = boundarraymethods_getters,
 };
diff --git a/numpy/core/src/multiarray/array_method.h b/numpy/core/src/multiarray/array_method.h
index c9ec8903d..c82a968cd 100644
--- a/numpy/core/src/multiarray/array_method.h
+++ b/numpy/core/src/multiarray/array_method.h
@@ -11,40 +11,7 @@
 extern "C" {
 #endif
 
-typedef enum {
-    /* Flag for whether the GIL is required */
-    NPY_METH_REQUIRES_PYAPI = 1 << 1,
-    /*
-     * Some functions cannot set floating point error flags, this flag
-     * gives us the option (not requirement) to skip floating point error
-     * setup/check. No function should set error flags and ignore them
-     * since it would interfere with chaining operations (e.g. casting).
-     */
-    /*
-     * TODO: Change this into a positive flag?  That would make "combing"
-     *       multiple methods easier. OTOH, if we add more flags, the default
-     *       would be 0 just like it is here.
-     */
-    NPY_METH_NO_FLOATINGPOINT_ERRORS = 1 << 2,
-    /* Whether the method supports unaligned access (not runtime) */
-    NPY_METH_SUPPORTS_UNALIGNED = 1 << 3,
-    /*
-     * Private flag for now for *logic* functions.  The logical functions
-     * `logical_or` and `logical_and` can always cast the inputs to booleans
-     * "safely" (because that is how the cast to bool is defined).
-     * @seberg: I am not sure this is the best way to handle this, so its
-     * private for now (also it is very limited anyway).
-     * There is one "exception". NA aware dtypes cannot cast to bool
-     * (hopefully), so the `??->?` loop should error even with this flag.
-     * But a second NA fallback loop will be necessary.
-     */
-    _NPY_METH_FORCE_CAST_INPUTS = 1 << 17,
-
-    /* All flags which can change at runtime */
-    NPY_METH_RUNTIME_FLAGS = (
-            NPY_METH_REQUIRES_PYAPI |
-            NPY_METH_NO_FLOATINGPOINT_ERRORS),
-} NPY_ARRAYMETHOD_FLAGS;
+#include "numpy/_dtype_api.h"
 
 
 /*
@@ -60,119 +27,6 @@ typedef enum {
             ((flags1 | flags2) & ~PyArrayMethod_MINIMAL_FLAGS)  \
             | (flags1 & flags2)))
 
-
-struct PyArrayMethodObject_tag;
-
-/*
- * This struct is specific to an individual (possibly repeated) call of
- * the ArrayMethods strided operator, and as such is passed into the various
- * methods of the ArrayMethod object (the resolve_descriptors function,
- * the get_loop function and the individual lowlevel strided operator calls).
- * It thus has to be persistent for one end-user call, and then be discarded.
- *
- * TODO: Before making this public, we should review which information should
- *       be stored on the Context/BoundArrayMethod vs. the ArrayMethod.
- */
-typedef struct {
-    PyObject *caller;  /* E.g. the original ufunc, may be NULL */
-    struct PyArrayMethodObject_tag *method;
-
-    /* Operand descriptors, filled in by resolve_descriptors */
-    PyArray_Descr **descriptors;
-} PyArrayMethod_Context;
-
-
-typedef int (PyArrayMethod_StridedLoop)(PyArrayMethod_Context *context,
-        char *const *data, const npy_intp *dimensions, const npy_intp *strides,
-        NpyAuxData *transferdata);
-
-
-typedef NPY_CASTING (resolve_descriptors_function)(
-        struct PyArrayMethodObject_tag *method,
-        PyArray_DTypeMeta **dtypes,
-        PyArray_Descr **given_descrs,
-        PyArray_Descr **loop_descrs,
-        npy_intp *view_offset);
-
-
-typedef int (get_loop_function)(
-        PyArrayMethod_Context *context,
-        int aligned, int move_references,
-        const npy_intp *strides,
-        PyArrayMethod_StridedLoop **out_loop,
-        NpyAuxData **out_transferdata,
-        NPY_ARRAYMETHOD_FLAGS *flags);
-
-
-/*
- * The following functions are only used be the wrapping array method defined
- * in umath/wrapping_array_method.c
- */
-
-/*
- * The function to convert the given descriptors (passed in to
- * `resolve_descriptors`) and translates them for the wrapped loop.
- * The new descriptors MUST be viewable with the old ones, `NULL` must be
- * supported (for outputs) and should normally be forwarded.
- *
- * The function must clean up on error.
- *
- * NOTE: We currently assume that this translation gives "viewable" results.
- *       I.e. there is no additional casting related to the wrapping process.
- *       In principle that could be supported, but not sure it is useful.
- *       This currently also means that e.g. alignment must apply identically
- *       to the new dtypes.
- *
- * TODO: Due to the fact that `resolve_descriptors` is also used for `can_cast`
- *       there is no way to "pass out" the result of this function.  This means
- *       it will be called twice for every ufunc call.
- *       (I am considering including `auxdata` as an "optional" parameter to
- *       `resolve_descriptors`, so that it can be filled there if not NULL.)
- */
-typedef int translate_given_descrs_func(int nin, int nout,
-        PyArray_DTypeMeta *wrapped_dtypes[],
-        PyArray_Descr *given_descrs[], PyArray_Descr *new_descrs[]);
-
-/**
- * The function to convert the actual loop descriptors (as returned by the
- * original `resolve_descriptors` function) to the ones the output array
- * should use.
- * This function must return "viewable" types, it must not mutate them in any
- * form that would break the inner-loop logic.  Does not need to support NULL.
- *
- * The function must clean up on error.
- *
- * @param nargs Number of arguments
- * @param new_dtypes The DTypes of the output (usually probably not needed)
- * @param given_descrs Original given_descrs to the resolver, necessary to
- *        fetch any information related to the new dtypes from the original.
- * @param original_descrs The `loop_descrs` returned by the wrapped loop.
- * @param loop_descrs The output descriptors, compatible to `original_descrs`.
- *
- * @returns 0 on success, -1 on failure.
- */
-typedef int translate_loop_descrs_func(int nin, int nout,
-        PyArray_DTypeMeta *new_dtypes[], PyArray_Descr *given_descrs[],
-        PyArray_Descr *original_descrs[], PyArray_Descr *loop_descrs[]);
-
-
-/*
- * This struct will be public and necessary for creating a new ArrayMethod
- * object (casting and ufuncs).
- * We could version the struct, although since we allow passing arbitrary
- * data using the slots, and have flags, that may be enough?
- * (See also PyBoundArrayMethodObject.)
- */
-typedef struct {
-    const char *name;
-    int nin, nout;
-    NPY_CASTING casting;
-    NPY_ARRAYMETHOD_FLAGS flags;
-    PyArray_DTypeMeta **dtypes;
-    PyType_Slot *slots;
-} PyArrayMethod_Spec;
-
-
 /*
  * Structure of the ArrayMethod. This structure should probably not be made
  * public. If necessary, we can make certain operations on it public
@@ -193,16 +47,20 @@ typedef struct PyArrayMethodObject_tag {
     NPY_ARRAYMETHOD_FLAGS flags;
     resolve_descriptors_function *resolve_descriptors;
     get_loop_function *get_strided_loop;
+    get_reduction_initial_function  *get_reduction_initial;
     /* Typical loop functions (contiguous ones are used in current casts) */
     PyArrayMethod_StridedLoop *strided_loop;
     PyArrayMethod_StridedLoop *contiguous_loop;
     PyArrayMethod_StridedLoop *unaligned_strided_loop;
     PyArrayMethod_StridedLoop *unaligned_contiguous_loop;
+    PyArrayMethod_StridedLoop *contiguous_indexed_loop;
     /* Chunk only used for wrapping array method defined in umath */
     struct PyArrayMethodObject_tag *wrapped_meth;
     PyArray_DTypeMeta **wrapped_dtypes;
     translate_given_descrs_func *translate_given_descrs;
     translate_loop_descrs_func *translate_loop_descrs;
+    /* Chunk reserved for use by the legacy fallback arraymethod */
+    char legacy_initial[sizeof(npy_clongdouble)];  /* initial value storage */
 } PyArrayMethodObject;
 
 
@@ -226,18 +84,6 @@ typedef struct {
 extern NPY_NO_EXPORT PyTypeObject PyArrayMethod_Type;
 extern NPY_NO_EXPORT PyTypeObject PyBoundArrayMethod_Type;
 
-/*
- * SLOTS IDs For the ArrayMethod creation, one public, the IDs are fixed.
- * TODO: Before making it public, consider adding a large constant to private
- *       slots.
- */
-#define NPY_METH_resolve_descriptors 1
-#define NPY_METH_get_loop 2
-#define NPY_METH_strided_loop 3
-#define NPY_METH_contiguous_loop 4
-#define NPY_METH_unaligned_strided_loop 5
-#define NPY_METH_unaligned_contiguous_loop 6
-
 
 /*
  * Used internally (initially) for real to complex loops only
diff --git a/numpy/core/src/multiarray/arrayfunction_override.c b/numpy/core/src/multiarray/arrayfunction_override.c
index 2bb3fbe28..04768504e 100644
--- a/numpy/core/src/multiarray/arrayfunction_override.c
+++ b/numpy/core/src/multiarray/arrayfunction_override.c
@@ -1,11 +1,15 @@
 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
 #define _MULTIARRAYMODULE
 
+#include <Python.h>
+#include "structmember.h"
+
 #include "npy_pycompat.h"
 #include "get_attr_string.h"
 #include "npy_import.h"
 #include "multiarraymodule.h"
 
+#include "arrayfunction_override.h"
 
 /* Return the ndarray.__array_function__ method. */
 static PyObject *
@@ -200,183 +204,67 @@ call_array_function(PyObject* argument, PyObject* method,
 }
 
 
-/**
- * Internal handler for the array-function dispatching. The helper returns
- * either the result, or NotImplemented (as a borrowed reference).
- *
- * @param public_api The public API symbol used for dispatching
- * @param relevant_args Arguments which may implement __array_function__
- * @param args Original arguments
- * @param kwargs Original keyword arguments
- *
- * @returns The result of the dispatched version, or a borrowed reference
- *          to NotImplemented to indicate the default implementation should
- *          be used.
+
+/*
+ * Helper to convert from vectorcall convention, since the protocol requires
+ * args and kwargs to be passed as tuple and dict explicitly.
+ * We always pass a dict, so always returns it.
  */
-static PyObject *
-array_implement_array_function_internal(
-    PyObject *public_api, PyObject *relevant_args,
-    PyObject *args, PyObject *kwargs)
+static int
+get_args_and_kwargs(
+        PyObject *const *fast_args, Py_ssize_t len_args, PyObject *kwnames,
+        PyObject **out_args, PyObject **out_kwargs)
 {
-    PyObject *implementing_args[NPY_MAXARGS];
-    PyObject *array_function_methods[NPY_MAXARGS];
-    PyObject *types = NULL;
-
-    PyObject *result = NULL;
-
-    static PyObject *errmsg_formatter = NULL;
-
-    relevant_args = PySequence_Fast(
-        relevant_args,
-        "dispatcher for __array_function__ did not return an iterable");
-    if (relevant_args == NULL) {
-        return NULL;
-    }
-
-    /* Collect __array_function__ implementations */
-    int num_implementing_args = get_implementing_args_and_methods(
-        relevant_args, implementing_args, array_function_methods);
-    if (num_implementing_args == -1) {
-        goto cleanup;
-    }
+    len_args = PyVectorcall_NARGS(len_args);
+    PyObject *args = PyTuple_New(len_args);
+    PyObject *kwargs = NULL;
 
-    /*
-     * Handle the typical case of no overrides. This is merely an optimization
-     * if some arguments are ndarray objects, but is also necessary if no
-     * arguments implement __array_function__ at all (e.g., if they are all
-     * built-in types).
-     */
-    int any_overrides = 0;
-    for (int j = 0; j < num_implementing_args; j++) {
-        if (!is_default_array_function(array_function_methods[j])) {
-            any_overrides = 1;
-            break;
-        }
+    if (args == NULL) {
+        return -1;
     }
-    if (!any_overrides) {
-        /*
-         * When the default implementation should be called, return
-         * `Py_NotImplemented` to indicate this.
-         */
-        result = Py_NotImplemented;
-        goto cleanup;
-    }
-
-    /*
-     * Create a Python object for types.
-     * We use a tuple, because it's the fastest Python collection to create
-     * and has the bonus of being immutable.
-     */
-    types = PyTuple_New(num_implementing_args);
-    if (types == NULL) {
-        goto cleanup;
+    for (Py_ssize_t i = 0; i < len_args; i++) {
+        Py_INCREF(fast_args[i]);
+        PyTuple_SET_ITEM(args, i, fast_args[i]);
     }
-    for (int j = 0; j < num_implementing_args; j++) {
-        PyObject *arg_type = (PyObject *)Py_TYPE(implementing_args[j]);
-        Py_INCREF(arg_type);
-        PyTuple_SET_ITEM(types, j, arg_type);
+    kwargs = PyDict_New();
+    if (kwargs == NULL) {
+        Py_DECREF(args);
+        return -1;
     }
-
-    /* Call __array_function__ methods */
-    for (int j = 0; j < num_implementing_args; j++) {
-        PyObject *argument = implementing_args[j];
-        PyObject *method = array_function_methods[j];
-
-        /*
-         * We use `public_api` instead of `implementation` here so
-         * __array_function__ implementations can do equality/identity
-         * comparisons.
-         */
-        result = call_array_function(
-            argument, method, public_api, types, args, kwargs);
-
-        if (result == Py_NotImplemented) {
-            /* Try the next one */
-            Py_DECREF(result);
-            result = NULL;
-        }
-        else {
-            /* Either a good result, or an exception was raised. */
-            goto cleanup;
+    if (kwnames != NULL) {
+        Py_ssize_t nkwargs = PyTuple_GET_SIZE(kwnames);
+        for (Py_ssize_t i = 0; i < nkwargs; i++) {
+            PyObject *key = PyTuple_GET_ITEM(kwnames, i);
+            PyObject *value = fast_args[i+len_args];
+            if (PyDict_SetItem(kwargs, key, value) < 0) {
+                Py_DECREF(args);
+                Py_DECREF(kwargs);
+                return -1;
+            }
         }
     }
+    *out_args = args;
+    *out_kwargs = kwargs;
+    return 0;
+}
 
+
+static void
+set_no_matching_types_error(PyObject *public_api, PyObject *types)
+{
+    static PyObject *errmsg_formatter = NULL;
     /* No acceptable override found, raise TypeError. */
     npy_cache_import("numpy.core._internal",
                      "array_function_errmsg_formatter",
                      &errmsg_formatter);
     if (errmsg_formatter != NULL) {
         PyObject *errmsg = PyObject_CallFunctionObjArgs(
-            errmsg_formatter, public_api, types, NULL);
+                errmsg_formatter, public_api, types, NULL);
         if (errmsg != NULL) {
             PyErr_SetObject(PyExc_TypeError, errmsg);
             Py_DECREF(errmsg);
         }
     }
-
-cleanup:
-    for (int j = 0; j < num_implementing_args; j++) {
-        Py_DECREF(implementing_args[j]);
-        Py_DECREF(array_function_methods[j]);
-    }
-    Py_XDECREF(types);
-    Py_DECREF(relevant_args);
-    return result;
-}
-
-
-/*
- * Implements the __array_function__ protocol for a Python function, as described in
- * in NEP-18. See numpy.core.overrides for a full docstring.
- */
-NPY_NO_EXPORT PyObject *
-array_implement_array_function(
-    PyObject *NPY_UNUSED(dummy), PyObject *positional_args)
-{
-    PyObject *res, *implementation, *public_api, *relevant_args, *args, *kwargs;
-
-    if (!PyArg_UnpackTuple(
-            positional_args, "implement_array_function", 5, 5,
-            &implementation, &public_api, &relevant_args, &args, &kwargs)) {
-        return NULL;
-    }
-
-    /*
-     * Remove `like=` kwarg, which is NumPy-exclusive and thus not present
-     * in downstream libraries. If `like=` is specified but doesn't
-     * implement `__array_function__`, raise a `TypeError`.
-     */
-    if (kwargs != NULL && PyDict_Contains(kwargs, npy_ma_str_like)) {
-        PyObject *like_arg = PyDict_GetItem(kwargs, npy_ma_str_like);
-        if (like_arg != NULL) {
-            PyObject *tmp_has_override = get_array_function(like_arg);
-            if (tmp_has_override == NULL) {
-                return PyErr_Format(PyExc_TypeError,
-                        "The `like` argument must be an array-like that "
-                        "implements the `__array_function__` protocol.");
-            }
-            Py_DECREF(tmp_has_override);
-            PyDict_DelItem(kwargs, npy_ma_str_like);
-
-            /*
-             * If `like=` kwarg was removed, `implementation` points to the NumPy
-             * public API, as `public_api` is in that case the wrapper dispatcher
-             * function. For example, in the `np.full` case, `implementation` is
-             * `np.full`, whereas `public_api` is `_full_with_like`. This is done
-             * to ensure `__array_function__` implementations can do
-             * equality/identity comparisons when `like=` is present.
-             */
-            public_api = implementation;
-        }
-    }
-
-    res = array_implement_array_function_internal(
-        public_api, relevant_args, args, kwargs);
-
-    if (res == Py_NotImplemented) {
-        return PyObject_Call(implementation, args, kwargs);
-    }
-    return res;
 }
 
 /*
@@ -392,64 +280,52 @@ array_implement_c_array_function_creation(
     PyObject *args, PyObject *kwargs,
     PyObject *const *fast_args, Py_ssize_t len_args, PyObject *kwnames)
 {
-    PyObject *relevant_args = NULL;
+    PyObject *dispatch_types = NULL;
     PyObject *numpy_module = NULL;
     PyObject *public_api = NULL;
     PyObject *result = NULL;
 
     /* If `like` doesn't implement `__array_function__`, raise a `TypeError` */
-    PyObject *tmp_has_override = get_array_function(like);
-    if (tmp_has_override == NULL) {
+    PyObject *method = get_array_function(like);
+    if (method == NULL) {
         return PyErr_Format(PyExc_TypeError,
                 "The `like` argument must be an array-like that "
                 "implements the `__array_function__` protocol.");
     }
-    Py_DECREF(tmp_has_override);
-
-    if (fast_args != NULL) {
+    if (is_default_array_function(method)) {
         /*
-         * Convert from vectorcall convention, since the protocol requires
-         * the normal convention.  We have to do this late to ensure the
-         * normal path where NotImplemented is returned is fast.
+         * Return a borrowed reference of Py_NotImplemented to defer back to
+         * the original function.
          */
+        Py_DECREF(method);
+        return Py_NotImplemented;
+    }
+
+    /* We needs args and kwargs for __array_function__ (when not using it). */
+    if (fast_args != NULL) {
         assert(args == NULL);
         assert(kwargs == NULL);
-        args = PyTuple_New(len_args);
-        if (args == NULL) {
-            return NULL;
-        }
-        for (Py_ssize_t i = 0; i < len_args; i++) {
-            Py_INCREF(fast_args[i]);
-            PyTuple_SET_ITEM(args, i, fast_args[i]);
-        }
-        if (kwnames != NULL) {
-            kwargs = PyDict_New();
-            if (kwargs == NULL) {
-                Py_DECREF(args);
-                return NULL;
-            }
-            Py_ssize_t nkwargs = PyTuple_GET_SIZE(kwnames);
-            for (Py_ssize_t i = 0; i < nkwargs; i++) {
-                PyObject *key = PyTuple_GET_ITEM(kwnames, i);
-                PyObject *value = fast_args[i+len_args];
-                if (PyDict_SetItem(kwargs, key, value) < 0) {
-                    Py_DECREF(args);
-                    Py_DECREF(kwargs);
-                    return NULL;
-                }
-            }
+        if (get_args_and_kwargs(
+                fast_args, len_args, kwnames, &args, &kwargs) < 0) {
+            goto finish;
         }
     }
+    else {
+        Py_INCREF(args);
+        Py_INCREF(kwargs);
+    }
 
-    relevant_args = PyTuple_Pack(1, like);
-    if (relevant_args == NULL) {
+    dispatch_types = PyTuple_Pack(1, Py_TYPE(like));
+    if (dispatch_types == NULL) {
         goto finish;
     }
+
     /* The like argument must be present in the keyword arguments, remove it */
     if (PyDict_DelItem(kwargs, npy_ma_str_like) < 0) {
         goto finish;
     }
 
+    /* Fetch the actual symbol (the long way right now) */
     numpy_module = PyImport_Import(npy_ma_str_numpy);
     if (numpy_module == NULL) {
         goto finish;
@@ -466,16 +342,21 @@ array_implement_c_array_function_creation(
         goto finish;
     }
 
-    result = array_implement_array_function_internal(
-            public_api, relevant_args, args, kwargs);
+    result = call_array_function(like, method,
+            public_api, dispatch_types, args, kwargs);
 
-  finish:
-    if (kwnames != NULL) {
-        /* args and kwargs were converted from vectorcall convention */
-        Py_XDECREF(args);
-        Py_XDECREF(kwargs);
+    if (result == Py_NotImplemented) {
+        /* This shouldn't really happen as there is only one type, but... */
+        Py_DECREF(result);
+        result = NULL;
+        set_no_matching_types_error(public_api, dispatch_types);
     }
-    Py_XDECREF(relevant_args);
+
+  finish:
+    Py_DECREF(method);
+    Py_XDECREF(args);
+    Py_XDECREF(kwargs);
+    Py_XDECREF(dispatch_types);
     Py_XDECREF(public_api);
     return result;
 }
@@ -530,3 +411,290 @@ cleanup:
     Py_DECREF(relevant_args);
     return result;
 }
+
+
+typedef struct {
+    PyObject_HEAD
+    vectorcallfunc vectorcall;
+    PyObject *dict;
+    PyObject *relevant_arg_func;
+    PyObject *default_impl;
+} PyArray_ArrayFunctionDispatcherObject;
+
+
+static void
+dispatcher_dealloc(PyArray_ArrayFunctionDispatcherObject *self)
+{
+    Py_CLEAR(self->relevant_arg_func);
+    Py_CLEAR(self->default_impl);
+    Py_CLEAR(self->dict);
+    PyObject_FREE(self);
+}
+
+
+static PyObject *
+dispatcher_vectorcall(PyArray_ArrayFunctionDispatcherObject *self,
+        PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames)
+{
+    PyObject *result = NULL;
+    PyObject *types = NULL;
+    PyObject *relevant_args = NULL;
+
+    PyObject *public_api;
+
+    /* __array_function__ passes args, kwargs.  These may be filled: */
+    PyObject *packed_args = NULL;
+    PyObject *packed_kwargs = NULL;
+
+    PyObject *implementing_args[NPY_MAXARGS];
+    PyObject *array_function_methods[NPY_MAXARGS];
+
+    int num_implementing_args;
+
+    if (self->relevant_arg_func != NULL) {
+        public_api = (PyObject *)self;
+
+        /* Typical path, need to call the relevant_arg_func and unpack them */
+        relevant_args = PyObject_Vectorcall(
+                self->relevant_arg_func, args, len_args, kwnames);
+        if (relevant_args == NULL) {
+            return NULL;
+        }
+        Py_SETREF(relevant_args, PySequence_Fast(relevant_args,
+                "dispatcher for __array_function__ did not return an iterable"));
+        if (relevant_args == NULL) {
+            return NULL;
+        }
+
+        num_implementing_args = get_implementing_args_and_methods(
+                relevant_args, implementing_args, array_function_methods);
+        if (num_implementing_args < 0) {
+            Py_DECREF(relevant_args);
+            return NULL;
+        }
+    }
+    else {
+        /* For like= dispatching from Python, the public_symbol is the impl */
+        public_api = self->default_impl;
+
+        /*
+         * We are dealing with `like=` from Python.  For simplicity, the
+         * Python code passes it on as the first argument.
+         */
+        if (PyVectorcall_NARGS(len_args) == 0) {
+            PyErr_Format(PyExc_TypeError,
+                    "`like` argument dispatching, but first argument is not "
+                    "positional in call to %S.", self->default_impl);
+            return NULL;
+        }
+
+        array_function_methods[0] = get_array_function(args[0]);
+        if (array_function_methods[0] == NULL) {
+            return PyErr_Format(PyExc_TypeError,
+                    "The `like` argument must be an array-like that "
+                    "implements the `__array_function__` protocol.");
+        }
+        num_implementing_args = 1;
+        implementing_args[0] = args[0];
+        Py_INCREF(implementing_args[0]);
+
+        /* do not pass the like argument */
+        len_args = PyVectorcall_NARGS(len_args) - 1;
+        len_args |= PY_VECTORCALL_ARGUMENTS_OFFSET;
+        args++;
+    }
+
+    /*
+     * Handle the typical case of no overrides. This is merely an optimization
+     * if some arguments are ndarray objects, but is also necessary if no
+     * arguments implement __array_function__ at all (e.g., if they are all
+     * built-in types).
+     */
+    int any_overrides = 0;
+    for (int j = 0; j < num_implementing_args; j++) {
+        if (!is_default_array_function(array_function_methods[j])) {
+            any_overrides = 1;
+            break;
+        }
+    }
+    if (!any_overrides) {
+        /* Directly call the actual implementation. */
+        result = PyObject_Vectorcall(self->default_impl, args, len_args, kwnames);
+        goto cleanup;
+    }
+
+    /* Find args and kwargs as tuple and dict, as we pass them out: */
+    if (get_args_and_kwargs(
+            args, len_args, kwnames, &packed_args, &packed_kwargs) < 0) {
+        goto cleanup;
+    }
+
+    /*
+     * Create a Python object for types.
+     * We use a tuple, because it's the fastest Python collection to create
+     * and has the bonus of being immutable.
+     */
+    types = PyTuple_New(num_implementing_args);
+    if (types == NULL) {
+        goto cleanup;
+    }
+    for (int j = 0; j < num_implementing_args; j++) {
+        PyObject *arg_type = (PyObject *)Py_TYPE(implementing_args[j]);
+        Py_INCREF(arg_type);
+        PyTuple_SET_ITEM(types, j, arg_type);
+    }
+
+    /* Call __array_function__ methods */
+    for (int j = 0; j < num_implementing_args; j++) {
+        PyObject *argument = implementing_args[j];
+        PyObject *method = array_function_methods[j];
+
+        result = call_array_function(
+                argument, method, public_api, types,
+                packed_args, packed_kwargs);
+
+        if (result == Py_NotImplemented) {
+            /* Try the next one */
+            Py_DECREF(result);
+            result = NULL;
+        }
+        else {
+            /* Either a good result, or an exception was raised. */
+            goto cleanup;
+        }
+    }
+
+    set_no_matching_types_error(public_api, types);
+
+cleanup:
+    for (int j = 0; j < num_implementing_args; j++) {
+        Py_DECREF(implementing_args[j]);
+        Py_DECREF(array_function_methods[j]);
+    }
+    Py_XDECREF(packed_args);
+    Py_XDECREF(packed_kwargs);
+    Py_XDECREF(types);
+    Py_XDECREF(relevant_args);
+    return result;
+}
+
+
+static PyObject *
+dispatcher_new(PyTypeObject *NPY_UNUSED(cls), PyObject *args, PyObject *kwargs)
+{
+    PyArray_ArrayFunctionDispatcherObject *self;
+
+    self = PyObject_New(
+            PyArray_ArrayFunctionDispatcherObject,
+            &PyArrayFunctionDispatcher_Type);
+    if (self == NULL) {
+        return PyErr_NoMemory();
+    }
+
+    char *kwlist[] = {"", "", NULL};
+    if (!PyArg_ParseTupleAndKeywords(
+            args, kwargs, "OO:_ArrayFunctionDispatcher", kwlist,
+            &self->relevant_arg_func, &self->default_impl)) {
+        Py_DECREF(self);
+        return NULL;
+    }
+
+    self->vectorcall = (vectorcallfunc)dispatcher_vectorcall;
+    if (self->relevant_arg_func == Py_None) {
+        /* NULL in the relevant arg function means we use `like=` */
+        Py_CLEAR(self->relevant_arg_func);
+    }
+    else {
+        Py_INCREF(self->relevant_arg_func);
+    }
+    Py_INCREF(self->default_impl);
+
+    /* Need to be like a Python function that has arbitrary attributes */
+    self->dict = PyDict_New();
+    if (self->dict == NULL) {
+        Py_DECREF(self);
+        return NULL;
+    }
+    return (PyObject *)self;
+}
+
+
+static PyObject *
+dispatcher_str(PyArray_ArrayFunctionDispatcherObject *self)
+{
+    return PyObject_Str(self->default_impl);
+}
+
+
+static PyObject *
+dispatcher_repr(PyObject *self)
+{
+    PyObject *name = PyObject_GetAttrString(self, "__name__");
+    if (name == NULL) {
+        return NULL;
+    }
+    /* Print like a normal function */
+    return PyUnicode_FromFormat("<function %S at %p>", name, self);
+}
+
+
+static PyObject *
+func_dispatcher___get__(PyObject *self, PyObject *obj, PyObject *cls)
+{
+    if (obj == NULL) {
+        /* Act like a static method, no need to bind */
+        Py_INCREF(self);
+        return self;
+    }
+    return PyMethod_New(self, obj);
+}
+
+
+static PyObject *
+dispatcher_get_implementation(
+        PyArray_ArrayFunctionDispatcherObject *self, void *NPY_UNUSED(closure))
+{
+    Py_INCREF(self->default_impl);
+    return self->default_impl;
+}
+
+
+static PyObject *
+dispatcher_reduce(PyObject *self, PyObject *NPY_UNUSED(args))
+{
+    return PyObject_GetAttrString(self, "__qualname__");
+}
+
+
+static struct PyMethodDef func_dispatcher_methods[] = {
+    {"__reduce__",
+        (PyCFunction)dispatcher_reduce, METH_NOARGS, NULL},
+    {NULL, NULL, 0, NULL}
+};
+
+
+static struct PyGetSetDef func_dispatcher_getset[] = {
+    {"__dict__", &PyObject_GenericGetDict, 0, NULL, 0},
+    {"_implementation", (getter)&dispatcher_get_implementation, 0, NULL, 0},
+    {0, 0, 0, 0, 0}
+};
+
+
+NPY_NO_EXPORT PyTypeObject PyArrayFunctionDispatcher_Type = {
+     PyVarObject_HEAD_INIT(NULL, 0)
+     .tp_name = "numpy._ArrayFunctionDispatcher",
+     .tp_basicsize = sizeof(PyArray_ArrayFunctionDispatcherObject),
+     /* We have a dict, so in theory could traverse, but in practice... */
+     .tp_dictoffset = offsetof(PyArray_ArrayFunctionDispatcherObject, dict),
+     .tp_dealloc = (destructor)dispatcher_dealloc,
+     .tp_new = (newfunc)dispatcher_new,
+     .tp_str = (reprfunc)dispatcher_str,
+     .tp_repr = (reprfunc)dispatcher_repr,
+     .tp_flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_VECTORCALL
+                  | Py_TPFLAGS_METHOD_DESCRIPTOR),
+     .tp_methods = func_dispatcher_methods,
+     .tp_getset = func_dispatcher_getset,
+     .tp_descr_get = func_dispatcher___get__,
+     .tp_call = &PyVectorcall_Call,
+     .tp_vectorcall_offset = offsetof(PyArray_ArrayFunctionDispatcherObject, vectorcall),
+};
diff --git a/numpy/core/src/multiarray/arrayfunction_override.h b/numpy/core/src/multiarray/arrayfunction_override.h
index 09f7ee548..3b8b88bac 100644
--- a/numpy/core/src/multiarray/arrayfunction_override.h
+++ b/numpy/core/src/multiarray/arrayfunction_override.h
@@ -1,9 +1,7 @@
 #ifndef NUMPY_CORE_SRC_MULTIARRAY_ARRAYFUNCTION_OVERRIDE_H_
 #define NUMPY_CORE_SRC_MULTIARRAY_ARRAYFUNCTION_OVERRIDE_H_
 
-NPY_NO_EXPORT PyObject *
-array_implement_array_function(
-    PyObject *NPY_UNUSED(dummy), PyObject *positional_args);
+extern NPY_NO_EXPORT PyTypeObject PyArrayFunctionDispatcher_Type;
 
 NPY_NO_EXPORT PyObject *
 array__get_implementing_args(
diff --git a/numpy/core/src/multiarray/arrayobject.c b/numpy/core/src/multiarray/arrayobject.c
index b1302738d..46c4d90c9 100644
--- a/numpy/core/src/multiarray/arrayobject.c
+++ b/numpy/core/src/multiarray/arrayobject.c
@@ -56,6 +56,7 @@ maintainer email:  oliphant.travis@ieee.org
 #include "alloc.h"
 #include "mem_overlap.h"
 #include "numpyos.h"
+#include "refcount.h"
 #include "strfuncs.h"
 
 #include "binop_override.h"
@@ -394,7 +395,7 @@ PyArray_ResolveWritebackIfCopy(PyArrayObject * self)
    to stderr and clear the error
 */
 
-static NPY_INLINE void
+static inline void
 WARN_IN_DEALLOC(PyObject* warning, const char * msg) {
     if (PyErr_WarnEx(warning, msg, 1) < 0) {
         PyObject * s;
@@ -452,9 +453,11 @@ array_dealloc(PyArrayObject *self)
     }
 
     if ((fa->flags & NPY_ARRAY_OWNDATA) && fa->data) {
-        /* Free internal references if an Object array */
-        if (PyDataType_FLAGCHK(fa->descr, NPY_ITEM_REFCOUNT)) {
-            PyArray_XDECREF(self);
+        /* Free any internal references */
+        if (PyDataType_REFCHK(fa->descr)) {
+            if (PyArray_ClearArray(self) < 0) {
+                PyErr_WriteUnraisable(NULL);
+            }
         }
         if (fa->mem_handler == NULL) {
             char *env = getenv("NUMPY_WARN_IF_NO_MEM_POLICY");
@@ -875,108 +878,6 @@ DEPRECATE_silence_error(const char *msg) {
     return 0;
 }
 
-/*
- * Comparisons can fail, but we do not always want to pass on the exception
- * (see comment in array_richcompare below), but rather return NotImplemented.
- * Here, an exception should be set on entrance.
- * Returns either NotImplemented with the exception cleared, or NULL
- * with the exception set.
- * Raises deprecation warnings for cases where behaviour is meant to change
- * (2015-05-14, 1.10)
- */
-
-NPY_NO_EXPORT PyObject *
-_failed_comparison_workaround(PyArrayObject *self, PyObject *other, int cmp_op)
-{
-    PyObject *exc, *val, *tb;
-    PyArrayObject *array_other;
-    int other_is_flexible, ndim_other;
-    int self_is_flexible = PyTypeNum_ISFLEXIBLE(PyArray_DESCR(self)->type_num);
-
-    PyErr_Fetch(&exc, &val, &tb);
-    /*
-     * Determine whether other has a flexible dtype; here, inconvertible
-     * is counted as inflexible.  (This repeats work done in the ufunc,
-     * but OK to waste some time in an unlikely path.)
-     */
-    array_other = (PyArrayObject *)PyArray_FROM_O(other);
-    if (array_other) {
-        other_is_flexible = PyTypeNum_ISFLEXIBLE(
-            PyArray_DESCR(array_other)->type_num);
-        ndim_other = PyArray_NDIM(array_other);
-        Py_DECREF(array_other);
-    }
-    else {
-        PyErr_Clear(); /* we restore the original error if needed */
-        other_is_flexible = 0;
-        ndim_other = 0;
-    }
-    if (cmp_op == Py_EQ || cmp_op == Py_NE) {
-        /*
-         * note: for == and !=, a structured dtype self cannot get here,
-         * but a string can. Other can be string or structured.
-         */
-        if (other_is_flexible || self_is_flexible) {
-            /*
-             * For scalars, returning NotImplemented is correct.
-             * For arrays, we emit a future deprecation warning.
-             * When this warning is removed, a correctly shaped
-             * array of bool should be returned.
-             */
-            if (ndim_other != 0 || PyArray_NDIM(self) != 0) {
-                /* 2015-05-14, 1.10 */
-                if (DEPRECATE_FUTUREWARNING(
-                        "elementwise comparison failed; returning scalar "
-                        "instead, but in the future will perform "
-                        "elementwise comparison") < 0) {
-                    goto fail;
-                }
-            }
-        }
-        else {
-            /*
-             * If neither self nor other had a flexible dtype, the error cannot
-             * have been caused by a lack of implementation in the ufunc.
-             *
-             * 2015-05-14, 1.10
-             */
-            if (DEPRECATE(
-                    "elementwise comparison failed; "
-                    "this will raise an error in the future.") < 0) {
-                goto fail;
-            }
-        }
-        Py_XDECREF(exc);
-        Py_XDECREF(val);
-        Py_XDECREF(tb);
-        Py_INCREF(Py_NotImplemented);
-        return Py_NotImplemented;
-    }
-    else if (other_is_flexible || self_is_flexible) {
-        /*
-         * For LE, LT, GT, GE and a flexible self or other, we return
-         * NotImplemented, which is the correct answer since the ufuncs do
-         * not in fact implement loops for those.  This will get us the
-         * desired TypeError.
-         */
-        Py_XDECREF(exc);
-        Py_XDECREF(val);
-        Py_XDECREF(tb);
-        Py_INCREF(Py_NotImplemented);
-        return Py_NotImplemented;
-    }
-    else {
-        /* LE, LT, GT, or GE with non-flexible other; just pass on error */
-        goto fail;
-    }
-
-fail:
-    /*
-     * Reraise the original exception, possibly chaining with a new one.
-     */
-    npy_PyErr_ChainExceptionsCause(exc, val, tb);
-    return NULL;
-}
 
 NPY_NO_EXPORT PyObject *
 array_richcompare(PyArrayObject *self, PyObject *other, int cmp_op)
@@ -1074,33 +975,99 @@ array_richcompare(PyArrayObject *self, PyObject *other, int cmp_op)
         Py_INCREF(Py_NotImplemented);
         return Py_NotImplemented;
     }
-    if (result == NULL) {
+
+    /*
+     * At this point `self` can take control of the operation by converting
+     * `other` to an array (it would have a chance to take control).
+     * If we are not in `==` and `!=`, this is an error and we hope that
+     * the existing error makes sense and derives from `TypeError` (which
+     * python would raise for `NotImplemented`) when it should.
+     *
+     * However, if the issue is no matching loop for the given dtypes and
+     * we are inside == and !=, then returning an array of True or False
+     * makes sense (following Python behavior for `==` and `!=`).
+     * Effectively: Both *dtypes* told us that they cannot be compared.
+     *
+     * In theory, the error could be raised from within an object loop, the
+     * solution to that could be pushing this into the ufunc (where we can
+     * distinguish the two easily).  In practice, it seems like it should not
+     * but a huge problem:  The ufunc loop will itself call `==` which should
+     * probably never raise a UFuncNoLoopError.
+     *
+     * TODO: If/once we correctly push structured comparisons into the ufunc
+     *       we could consider pushing this path into the ufunc itself as a
+     *       fallback loop (which ignores the input arrays).
+     *       This would have the advantage that subclasses implementing
+     *       `__array_ufunc__` do not explicitly need `__eq__` and `__ne__`.
+     */
+    if (result == NULL
+            && (cmp_op == Py_EQ || cmp_op == Py_NE)
+            && PyErr_ExceptionMatches(npy_UFuncNoLoopError)) {
+        PyErr_Clear();
+
+        PyArrayObject *array_other = (PyArrayObject *)PyArray_FROM_O(other);
+        if (PyArray_TYPE(array_other) == NPY_VOID) {
+            /*
+            * Void arrays are currently not handled by ufuncs, so if the other
+            * is a void array, we defer to it (will raise a TypeError).
+            */
+            Py_DECREF(array_other);
+            Py_RETURN_NOTIMPLEMENTED;
+        }
+
+        if (PyArray_NDIM(self) == 0 && PyArray_NDIM(array_other) == 0) {
+            /*
+             * (seberg) not sure that this is best, but we preserve Python
+             * bool result for "scalar" inputs for now by returning
+             * `NotImplemented`.
+             */
+            Py_DECREF(array_other);
+            Py_RETURN_NOTIMPLEMENTED;
+        }
+
+        /* Hack warning: using NpyIter to allocate broadcasted result. */
+        PyArrayObject *ops[3] = {self, array_other, NULL};
+        npy_uint32 flags = NPY_ITER_ZEROSIZE_OK | NPY_ITER_REFS_OK;
+        npy_uint32 op_flags[3] = {
+            NPY_ITER_READONLY, NPY_ITER_READONLY,
+            NPY_ITER_ALLOCATE | NPY_ITER_WRITEONLY};
+
+        PyArray_Descr *bool_descr = PyArray_DescrFromType(NPY_BOOL);
+        PyArray_Descr *op_descrs[3] = {
+            PyArray_DESCR(self), PyArray_DESCR(array_other), bool_descr};
+
+        NpyIter *iter = NpyIter_MultiNew(
+                    3, ops, flags, NPY_KEEPORDER, NPY_NO_CASTING,
+                    op_flags, op_descrs);
+
+        Py_CLEAR(bool_descr);
+        Py_CLEAR(array_other);
+        if (iter == NULL) {
+            return NULL;
+        }
+        PyArrayObject *res = NpyIter_GetOperandArray(iter)[2];
+        Py_INCREF(res);
+        if (NpyIter_Deallocate(iter) != NPY_SUCCEED) {
+            Py_DECREF(res);
+            return NULL;
+        }
+
         /*
-         * 2015-05-14, 1.10; updated 2018-06-18, 1.16.
-         *
-         * Comparisons can raise errors when element-wise comparison is not
-         * possible. Some of these, though, should not be passed on.
-         * In particular, the ufuncs do not have loops for flexible dtype,
-         * so those should be treated separately.  Furthermore, for EQ and NE,
-         * we should never fail.
-         *
-         * Our ideal behaviour would be:
-         *
-         * 1. For EQ and NE:
-         *   - If self and other are scalars, return NotImplemented,
-         *     so that python can assign True of False as appropriate.
-         *   - If either is an array, return an array of False or True.
-         *
-         * 2. For LT, LE, GE, GT:
-         *   - If self or other was flexible, return NotImplemented
-         *     (as is in fact the case), so python can raise a TypeError.
-         *   - If other is not convertible to an array, pass on the error
-         *     (MHvK, 2018-06-18: not sure about this, but it's what we have).
-         *
-         * However, for backwards compatibility, we cannot yet return arrays,
-         * so we raise warnings instead.
+         * The array is guaranteed to be newly allocated and thus contiguous,
+         * so simply fill it with 0 or 1.
          */
-        result = _failed_comparison_workaround(self, other, cmp_op);
+        memset(PyArray_BYTES(res), cmp_op == Py_EQ ? 0 : 1, PyArray_NBYTES(res));
+
+        /* Ensure basic subclass support by wrapping: */
+        if (!PyArray_CheckExact(self)) {
+            /*
+             * If other is also a subclass (with higher priority) we would
+             * already have deferred.  So use `self` for wrapping.  If users
+             * need more, they need to override `==` and `!=`.
+             */
+            Py_SETREF(res, PyArray_SubclassWrap(self, res));
+        }
+        return (PyObject *)res;
     }
     return result;
 }
@@ -1256,7 +1223,7 @@ array_new(PyTypeObject *subtype, PyObject *args, PyObject *kwds)
                                      (int)dims.len,
                                      dims.ptr,
                                      strides.ptr, NULL, is_f_order, NULL, NULL,
-                                     0, 1);
+                                     _NPY_ARRAY_ALLOW_EMPTY_STRING);
         if (ret == NULL) {
             descr = NULL;
             goto fail;
@@ -1293,7 +1260,7 @@ array_new(PyTypeObject *subtype, PyObject *args, PyObject *kwds)
                 subtype, descr,
                 dims.len, dims.ptr, strides.ptr, offset + (char *)buffer.ptr,
                 buffer.flags, NULL, buffer.base,
-                0, 1);
+                _NPY_ARRAY_ALLOW_EMPTY_STRING);
         if (ret == NULL) {
             descr = NULL;
             goto fail;
diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src
index b7c339aa2..b84625c77 100644
--- a/numpy/core/src/multiarray/arraytypes.c.src
+++ b/numpy/core/src/multiarray/arraytypes.c.src
@@ -55,7 +55,7 @@
  * This is especially important for nonzero and copyswap, which may run with
  * the GIL released.
  */
-static NPY_INLINE PyArrayObject_fields
+static inline PyArrayObject_fields
 get_dummy_stack_array(PyArrayObject *orig)
 {
     PyArrayObject_fields new_fields;
@@ -68,7 +68,7 @@ get_dummy_stack_array(PyArrayObject *orig)
 
 
 /* check for sequences, but ignore the types numpy considers scalars */
-static NPY_INLINE npy_bool
+static inline npy_bool
 PySequence_NoString_Check(PyObject *op) {
     return
         PySequence_Check(op) &&
@@ -224,8 +224,6 @@ MyPyLong_AsUnsigned@Type@(PyObject *obj)
  **                         GETITEM AND SETITEM                             **
  *****************************************************************************
  */
-
-#define _ALIGN(type) offsetof(struct {char c; type v;}, v)
 /*
  * Disable harmless compiler warning "4116: unnamed type definition in
  * parentheses" which is caused by the _ALIGN macro.
@@ -293,7 +291,7 @@ static int
                     "Python integers to integer arrays.  The conversion "
                     "of %.100R to %S will fail in the future.\n"
                     "For the old behavior, usually:\n"
-                    "    np.array(value).astype(dtype)`\n"
+                    "    np.array(value).astype(dtype)\n"
                     "will give the desired result (the cast overflows).",
                     obj, descr) < 0) {
                 Py_DECREF(descr);
@@ -525,7 +523,7 @@ NPY_NO_EXPORT int
 
 /**end repeat**/
 
-static NPY_INLINE npy_longdouble
+static inline npy_longdouble
 string_to_long_double(PyObject*op)
 {
     char *s;
@@ -2152,7 +2150,7 @@ static int
  */
 
 
-static NPY_INLINE void
+static inline void
 _basic_copyn(void *dst, npy_intp dstride, void *src, npy_intp sstride,
              npy_intp n, int elsize) {
     if (src == NULL) {
@@ -2167,7 +2165,7 @@ _basic_copyn(void *dst, npy_intp dstride, void *src, npy_intp sstride,
     }
 }
 
-static NPY_INLINE void
+static inline void
 _basic_copy(void *dst, void *src, int elsize) {
     if (src == NULL) {
         return;
@@ -3803,17 +3801,23 @@ BOOL_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op, npy_intp n,
     *((npy_bool *)op) = tmp;
 }
 
+/*
+ * `dot` does not make sense for times, for DATETIME it never worked.
+ *  For timedelta it does/did , but should probably also just be removed.
+ */
+#define DATETIME_dot NULL
+
 /**begin repeat
  *
  * #name = BYTE, UBYTE, SHORT, USHORT, INT, UINT,
  *         LONG, ULONG, LONGLONG, ULONGLONG,
- *         LONGDOUBLE, DATETIME, TIMEDELTA#
+ *         LONGDOUBLE, TIMEDELTA#
  * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
  *         npy_long, npy_ulong, npy_longlong, npy_ulonglong,
- *         npy_longdouble, npy_datetime, npy_timedelta#
+ *         npy_longdouble, npy_timedelta#
  * #out = npy_long, npy_ulong, npy_long, npy_ulong, npy_long, npy_ulong,
  *        npy_long, npy_ulong, npy_longlong, npy_ulonglong,
- *        npy_longdouble, npy_datetime, npy_timedelta#
+ *        npy_longdouble, npy_timedelta#
  */
 static void
 @name@_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op, npy_intp n,
@@ -3916,7 +3920,18 @@ OBJECT_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op, npy_intp
  */
 
 
-#define BOOL_fill NULL
+/* Boolean fill never works, but define it so that it works up to length 2 */
+static int
+BOOL_fill(PyObject **buffer, npy_intp length, void *NPY_UNUSED(ignored))
+{
+    NPY_ALLOW_C_API_DEF;
+    NPY_ALLOW_C_API;
+    PyErr_SetString(PyExc_TypeError,
+            "arange() is only supported for booleans when the result has at "
+            "most length 2.");
+    NPY_DISABLE_C_API;
+    return -1;
+}
 
 /* this requires buffer to be filled with objects or NULL */
 static int
@@ -4631,12 +4646,30 @@ set_typeinfo(PyObject *dict)
      * should be defined on the class and inherited to the scalar.
      * (NPY_HALF is the largest builtin one.)
      */
-    for (i = 0; i <= NPY_HALF; i++) {
-        if (dtypemeta_wrap_legacy_descriptor(_builtin_descrs[i]) < 0) {
-            return -1;
-        }
+    /**begin repeat
+     *
+     * #NAME = BOOL,
+     *         BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+     *         LONG, ULONG, LONGLONG, ULONGLONG,
+     *         HALF, FLOAT, DOUBLE, LONGDOUBLE,
+     *         CFLOAT, CDOUBLE, CLONGDOUBLE,
+     *         OBJECT, STRING, UNICODE, VOID,
+     *         DATETIME, TIMEDELTA#
+     */
+    if (dtypemeta_wrap_legacy_descriptor(
+            _builtin_descrs[NPY_@NAME@],
+            "numpy.dtypes." NPY_@NAME@_Name "DType",
+#ifdef NPY_@NAME@_alias
+            "numpy.dtypes." NPY_@NAME@_Alias "DType"
+#else
+            NULL
+#endif
+            ) < 0) {
+        return -1;
     }
 
+    /**end repeat**/
+
     /*
      * Add cast functions for the new types
      */
diff --git a/numpy/core/src/multiarray/arraytypes.h.src b/numpy/core/src/multiarray/arraytypes.h.src
index aad464ccf..90a075dad 100644
--- a/numpy/core/src/multiarray/arraytypes.h.src
+++ b/numpy/core/src/multiarray/arraytypes.h.src
@@ -66,4 +66,102 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int @TYPE@_@func@,
 NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int BOOL_argmax,
     (npy_bool *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
 
+
+/*
+ * Define DType and scalar type names and aliases as used in Python.
+ */
+
+/**begin repeat
+ * #NAME = BOOL,
+ *         HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ *         CFLOAT, CDOUBLE, CLONGDOUBLE,
+ *         STRING, UNICODE, VOID, OBJECT,
+ *         DATETIME, TIMEDELTA#
+ * #name = bool_,
+ *         float16, float32, float64, longdouble,
+ *         complex64, complex128, clongdouble,
+ *         bytes_, str_, void, object_,
+ *         datetime64, timedelta64#
+ * #Name = Bool,
+ *         Float16, Float32, Float64, LongDouble,
+ *         Complex64, Complex128, CLongDouble,
+ *         Bytes, Str, Void, Object,
+ *         DateTime64, TimeDelta64#
+ */
+#define NPY_@NAME@_name "@name@"
+#define NPY_@NAME@_Name "@Name@"
+
+/**end repeat**/
+
+
+/*
+ * Give integers different names when they are the same size (gh-9799).
+ * `intX` always refers to the first int of that size in the sequence
+ * `['LONG', 'LONGLONG', 'INT', 'SHORT', 'BYTE']`.
+ * Unfortunately, since the bitsize names are not strictly fixed, we add
+ * the C name for all integer types (as aliases).
+ *
+ * Right now, we do not define the C aliases for floats (which are always
+ * the same).
+ */
+
+#if NPY_SIZEOF_BYTE == NPY_SIZEOF_SHORT
+    #define BYTE_not_size_named
+#endif
+#if NPY_SIZEOF_SHORT == NPY_SIZEOF_INT
+    #define SHORT_not_size_named
+#endif
+#if NPY_SIZEOF_INT == NPY_SIZEOF_LONG
+    #define INT_not_size_named
+#endif
+#if NPY_SIZEOF_LONGLONG == NPY_SIZEOF_LONG
+    #define LONGLONG_not_size_named
+#endif
+
+
+/**begin repeat
+ * #NAME = BYTE, SHORT, INT, LONG, LONGLONG,
+ *         UBYTE, USHORT, UINT, ULONG, ULONGLONG#
+ * #CNAME = (BYTE, SHORT, INT, LONG, LONGLONG)*2#
+ * #cname = byte, short, intc, int_, longlong,
+ *          ubyte, ushort, uintc, uint, ulonglong#
+ * #CName = Byte, Short, Int, Long, LongLong,
+ *          UByte, UShort, UInt, ULong, ULongLong#
+ * #bitname = int*5, uint*5#
+ * #BitName = Int*5, UInt*5#
+ */
+
+#ifdef @CNAME@_not_size_named
+    #define NPY_@NAME@_name "@cname@"
+    #define NPY_@NAME@_Name "@CName@"
+#else
+    /* The C-name is considered just an alias for these: */
+    #define NPY_@NAME@_alias "@cname@"
+    #define NPY_@NAME@_Alias "@CName@"
+
+    /* The bitsof macro includes math, so cannot be stringified */
+    #if NPY_BITSOF_@CNAME@ == 8
+        #define NPY_@NAME@_name "@bitname@8"
+        #define NPY_@NAME@_Name "@BitName@8"
+    #elif NPY_BITSOF_@CNAME@ == 16
+        #define NPY_@NAME@_name "@bitname@16"
+        #define NPY_@NAME@_Name "@BitName@16"
+    #elif NPY_BITSOF_@CNAME@ == 32
+        #define NPY_@NAME@_name "@bitname@32"
+        #define NPY_@NAME@_Name "@BitName@32"
+    #elif NPY_BITSOF_@CNAME@ == 64
+        #define NPY_@NAME@_name "@bitname@64"
+        #define NPY_@NAME@_Name "@BitName@64"
+    #else
+        #error "need to fix integer bit-length name code"
+    #endif
+#endif
+
+/**end repeat**/
+
+#undef BYTE_not_size_named
+#undef SHORT_not_size_named
+#undef INT_not_size_named
+#undef LONGLONG_not_size_named
+
 #endif  /* NUMPY_CORE_SRC_MULTIARRAY_ARRAYTYPES_H_ */
diff --git a/numpy/core/src/multiarray/buffer.c b/numpy/core/src/multiarray/buffer.c
index 0307d41a8..c9d881e16 100644
--- a/numpy/core/src/multiarray/buffer.c
+++ b/numpy/core/src/multiarray/buffer.c
@@ -17,6 +17,7 @@
 #include "numpyos.h"
 #include "arrayobject.h"
 #include "scalartypes.h"
+#include "dtypemeta.h"
 
 /*************************************************************************
  ****************   Implement Buffer Protocol ****************************
@@ -136,7 +137,7 @@ fail:
  * AND, the descr element size is a multiple of the alignment,
  * AND, the array data is positioned to alignment granularity.
  */
-static NPY_INLINE int
+static inline int
 _is_natively_aligned_at(PyArray_Descr *descr,
                         PyArrayObject *arr, Py_ssize_t offset)
 {
@@ -417,9 +418,16 @@ _buffer_format_string(PyArray_Descr *descr, _tmp_string_t *str,
             break;
         }
         default:
-            PyErr_Format(PyExc_ValueError,
-                         "cannot include dtype '%c' in a buffer",
-                         descr->type);
+            if (NPY_DT_is_legacy(NPY_DTYPE(descr))) {
+                PyErr_Format(PyExc_ValueError,
+                             "cannot include dtype '%c' in a buffer",
+                             descr->type);
+            }
+            else {
+                PyErr_Format(PyExc_ValueError,
+                             "cannot include dtype '%s' in a buffer",
+                             ((PyTypeObject*)NPY_DTYPE(descr))->tp_name);
+            }
             return -1;
         }
     }
@@ -591,7 +599,7 @@ _buffer_info_cmp(_buffer_info_t *a, _buffer_info_t *b)
  * a useful error message instead of crashing hard if a C-subclass uses
  * the same field.
  */
-static NPY_INLINE void *
+static inline void *
 buffer_info_tag(void *buffer_info)
 {
     if (buffer_info == NULL) {
@@ -603,7 +611,7 @@ buffer_info_tag(void *buffer_info)
 }
 
 
-static NPY_INLINE int
+static inline int
 _buffer_info_untag(
         void *tagged_buffer_info, _buffer_info_t **buffer_info, PyObject *obj)
 {
diff --git a/numpy/core/src/multiarray/calculation.c b/numpy/core/src/multiarray/calculation.c
index a985a2308..62a994c64 100644
--- a/numpy/core/src/multiarray/calculation.c
+++ b/numpy/core/src/multiarray/calculation.c
@@ -7,6 +7,7 @@
 
 #include "numpy/arrayobject.h"
 #include "lowlevel_strided_loops.h"
+#include "dtypemeta.h"
 
 #include "npy_config.h"
 
@@ -50,7 +51,7 @@ _PyArray_ArgMinMaxCommon(PyArrayObject *op,
     int axis_copy = axis;
     npy_intp _shape_buf[NPY_MAXDIMS];
     npy_intp *out_shape;
-    // Keep the number of dimensions and shape of 
+    // Keep the number of dimensions and shape of
     // original array. Helps when `keepdims` is True.
     npy_intp* original_op_shape = PyArray_DIMS(op);
     int out_ndim = PyArray_NDIM(op);
@@ -87,9 +88,13 @@ _PyArray_ArgMinMaxCommon(PyArrayObject *op,
         op = ap;
     }
 
-    /* Will get native-byte order contiguous copy. */
-    ap = (PyArrayObject *)PyArray_ContiguousFromAny((PyObject *)op,
-                                  PyArray_DESCR(op)->type_num, 1, 0);
+    // Will get native-byte order contiguous copy.
+    PyArray_Descr *descr = NPY_DT_CALL_ensure_canonical(PyArray_DESCR(op));
+    if (descr == NULL) {
+        return NULL;
+    }
+    ap = (PyArrayObject *)PyArray_FromArray(op, descr, NPY_ARRAY_DEFAULT);
+
     Py_DECREF(op);
     if (ap == NULL) {
         return NULL;
@@ -106,9 +111,9 @@ _PyArray_ArgMinMaxCommon(PyArrayObject *op,
             for (int i = 0; i < out_ndim; i++) {
                 out_shape[i] = 1;
             }
-        } 
+        }
         else {
-            /* 
+            /*
              * While `ap` may be transposed, we can ignore this for `out` because the
              * transpose only reorders the size 1 `axis` (not changing memory layout).
              */
diff --git a/numpy/core/src/multiarray/common.c b/numpy/core/src/multiarray/common.c
index da8d23a26..573d0d606 100644
--- a/numpy/core/src/multiarray/common.c
+++ b/numpy/core/src/multiarray/common.c
@@ -128,24 +128,6 @@ PyArray_DTypeFromObject(PyObject *obj, int maxdims, PyArray_Descr **out_dtype)
 }
 
 
-NPY_NO_EXPORT int
-_zerofill(PyArrayObject *ret)
-{
-    if (PyDataType_REFCHK(PyArray_DESCR(ret))) {
-        PyObject *zero = PyLong_FromLong(0);
-        PyArray_FillObjectArray(ret, zero);
-        Py_DECREF(zero);
-        if (PyErr_Occurred()) {
-            return -1;
-        }
-    }
-    else {
-        npy_intp n = PyArray_NBYTES(ret);
-        memset(PyArray_DATA(ret), 0, n);
-    }
-    return 0;
-}
-
 NPY_NO_EXPORT npy_bool
 _IsWriteable(PyArrayObject *ap)
 {
@@ -459,3 +441,32 @@ new_array_for_sum(PyArrayObject *ap1, PyArrayObject *ap2, PyArrayObject* out,
     }
 }
 
+NPY_NO_EXPORT int
+check_is_convertible_to_scalar(PyArrayObject *v)
+{
+    if (PyArray_NDIM(v) == 0) {
+        return 0;
+    }
+
+    /* Remove this if-else block when the deprecation expires */
+    if (PyArray_SIZE(v) == 1) {
+        /* Numpy 1.25.0, 2023-01-02 */
+        if (DEPRECATE(
+                "Conversion of an array with ndim > 0 to a scalar "
+                "is deprecated, and will error in future. "
+                "Ensure you extract a single element from your array "
+                "before performing this operation. "
+                "(Deprecated NumPy 1.25.)") < 0) {
+            return -1;
+        }
+        return 0;
+    } else {
+        PyErr_SetString(PyExc_TypeError,
+            "only length-1 arrays can be converted to Python scalars");
+        return -1;
+    }
+
+    PyErr_SetString(PyExc_TypeError,
+            "only 0-dimensional arrays can be converted to Python scalars");
+    return -1;
+}
diff --git a/numpy/core/src/multiarray/common.h b/numpy/core/src/multiarray/common.h
index 3de8c842d..cb9fadc4e 100644
--- a/numpy/core/src/multiarray/common.h
+++ b/numpy/core/src/multiarray/common.h
@@ -55,9 +55,6 @@ PyArray_DTypeFromObject(PyObject *obj, int maxdims,
 NPY_NO_EXPORT PyArray_Descr *
 _array_find_python_scalar_type(PyObject *op);
 
-NPY_NO_EXPORT int
-_zerofill(PyArrayObject *ret);
-
 NPY_NO_EXPORT npy_bool
 _IsWriteable(PyArrayObject *ap);
 
@@ -103,7 +100,7 @@ _may_have_objects(PyArray_Descr *dtype);
  * If _save is not NULL it is assumed the GIL is not taken and it
  * is acquired in the case of an error
  */
-static NPY_INLINE int
+static inline int
 check_and_adjust_index(npy_intp *index, npy_intp max_item, int axis,
                        PyThreadState * _save)
 {
@@ -137,7 +134,7 @@ check_and_adjust_index(npy_intp *index, npy_intp max_item, int axis,
  *
  * msg_prefix: borrowed reference, a string to prepend to the message
  */
-static NPY_INLINE int
+static inline int
 check_and_adjust_axis_msg(int *axis, int ndim, PyObject *msg_prefix)
 {
     /* Check that index is valid, taking into account negative indices */
@@ -149,7 +146,7 @@ check_and_adjust_axis_msg(int *axis, int ndim, PyObject *msg_prefix)
         static PyObject *AxisError_cls = NULL;
         PyObject *exc;
 
-        npy_cache_import("numpy.core._exceptions", "AxisError", &AxisError_cls);
+        npy_cache_import("numpy.exceptions", "AxisError", &AxisError_cls);
         if (AxisError_cls == NULL) {
             return -1;
         }
@@ -171,14 +168,26 @@ check_and_adjust_axis_msg(int *axis, int ndim, PyObject *msg_prefix)
     }
     return 0;
 }
-static NPY_INLINE int
+static inline int
 check_and_adjust_axis(int *axis, int ndim)
 {
     return check_and_adjust_axis_msg(axis, ndim, Py_None);
 }
 
 /* used for some alignment checks */
-#define _ALIGN(type) offsetof(struct {char c; type v;}, v)
+/* 
+ * GCC releases before GCC 4.9 had a bug in _Alignof.  See GCC bug 52023
+ * <https://gcc.gnu.org/bugzilla/show_bug.cgi?id=52023>.
+ * clang versions < 8.0.0 have the same bug.
+ */
+#if (!defined __STDC_VERSION__ || __STDC_VERSION__ < 201112 \
+     || (defined __GNUC__ && __GNUC__ < 4 + (__GNUC_MINOR__ < 9) \
+  && !defined __clang__) \
+     || (defined __clang__ && __clang_major__ < 8))
+# define _ALIGN(type) offsetof(struct {char c; type v;}, v)
+#else
+# define _ALIGN(type) _Alignof(type)
+#endif
 #define _UINT_ALIGN(type) npy_uint_alignment(sizeof(type))
 /*
  * Disable harmless compiler warning "4116: unnamed type definition in
@@ -191,7 +200,7 @@ check_and_adjust_axis(int *axis, int ndim)
 /*
  * return true if pointer is aligned to 'alignment'
  */
-static NPY_INLINE int
+static inline int
 npy_is_aligned(const void * p, const npy_uintp alignment)
 {
     /*
@@ -205,7 +214,7 @@ npy_is_aligned(const void * p, const npy_uintp alignment)
 }
 
 /* Get equivalent "uint" alignment given an itemsize, for use in copy code */
-static NPY_INLINE npy_uintp
+static inline npy_uintp
 npy_uint_alignment(int itemsize)
 {
     npy_uintp alignment = 0; /* return value of 0 means unaligned */
@@ -252,7 +261,7 @@ npy_uint_alignment(int itemsize)
      */
     __attribute__((no_sanitize("alignment")))
 #endif
-static NPY_INLINE char *
+static inline char *
 npy_memchr(char * haystack, char needle,
            npy_intp stride, npy_intp size, npy_intp * psubloopsize, int invert)
 {
@@ -302,7 +311,7 @@ npy_memchr(char * haystack, char needle,
  * flag means that NULL entries are replaced with None, which is occasionally
  * useful.
  */
-static NPY_INLINE PyObject *
+static inline PyObject *
 PyArray_TupleFromItems(int n, PyObject *const *items, int make_null_none)
 {
     PyObject *tuple = PyTuple_New(n);
@@ -323,6 +332,13 @@ PyArray_TupleFromItems(int n, PyObject *const *items, int make_null_none)
     return tuple;
 }
 
+/*
+ * Returns 0 if the array has rank 0, -1 otherwise. Prints a deprecation
+ * warning for arrays of _size_ 1.
+ */
+NPY_NO_EXPORT int
+check_is_convertible_to_scalar(PyArrayObject *v);
+
 
 #include "ucsnarrow.h"
 
diff --git a/numpy/core/src/multiarray/common_dtype.c b/numpy/core/src/multiarray/common_dtype.c
index 3561a905a..38a130221 100644
--- a/numpy/core/src/multiarray/common_dtype.c
+++ b/numpy/core/src/multiarray/common_dtype.c
@@ -8,6 +8,7 @@
 #include "numpy/arrayobject.h"
 
 #include "common_dtype.h"
+#include "convert_datatype.h"
 #include "dtypemeta.h"
 #include "abstractdtypes.h"
 
@@ -61,7 +62,7 @@ PyArray_CommonDType(PyArray_DTypeMeta *dtype1, PyArray_DTypeMeta *dtype2)
     }
     if (common_dtype == (PyArray_DTypeMeta *)Py_NotImplemented) {
         Py_DECREF(Py_NotImplemented);
-        PyErr_Format(PyExc_TypeError,
+        PyErr_Format(npy_DTypePromotionError,
                 "The DTypes %S and %S do not have a common DType. "
                 "For example they cannot be stored in a single array unless "
                 "the dtype is `object`.", dtype1, dtype2);
@@ -288,7 +289,7 @@ PyArray_PromoteDTypeSequence(
                 Py_INCREF(dtypes_in[l]);
                 PyTuple_SET_ITEM(dtypes_in_tuple, l, (PyObject *)dtypes_in[l]);
             }
-            PyErr_Format(PyExc_TypeError,
+            PyErr_Format(npy_DTypePromotionError,
                     "The DType %S could not be promoted by %S. This means that "
                     "no common DType exists for the given inputs. "
                     "For example they cannot be stored in a single array unless "
diff --git a/numpy/core/src/multiarray/compiled_base.c b/numpy/core/src/multiarray/compiled_base.c
index 82bed207b..22f2547ad 100644
--- a/numpy/core/src/multiarray/compiled_base.c
+++ b/numpy/core/src/multiarray/compiled_base.c
@@ -8,6 +8,7 @@
 #include "numpy/arrayobject.h"
 #include "numpy/npy_3kcompat.h"
 #include "numpy/npy_math.h"
+#include "npy_argparse.h"
 #include "npy_config.h"
 #include "templ_common.h" /* for npy_mul_sizes_with_overflow */
 #include "lowlevel_strided_loops.h" /* for npy_bswap8 */
@@ -109,7 +110,8 @@ minmax(const npy_intp *data, npy_intp data_len, npy_intp *mn, npy_intp *mx)
  * output array.
  */
 NPY_NO_EXPORT PyObject *
-arr_bincount(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwds)
+arr_bincount(PyObject *NPY_UNUSED(self), PyObject *const *args,
+                            Py_ssize_t len_args, PyObject *kwnames)
 {
     PyObject *list = NULL, *weight = Py_None, *mlength = NULL;
     PyArrayObject *lst = NULL, *ans = NULL, *wts = NULL;
@@ -117,11 +119,14 @@ arr_bincount(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwds)
     npy_intp minlength = 0;
     npy_intp i;
     double *weights , *dans;
-    static char *kwlist[] = {"list", "weights", "minlength", NULL};
 
-    if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|OO:bincount",
-                kwlist, &list, &weight, &mlength)) {
-            goto fail;
+    NPY_PREPARE_ARGPARSER;
+    if (npy_parse_arguments("bincount", args, len_args, kwnames,
+                "list", NULL, &list,
+                "|weights", NULL, &weight,
+                "|minlength", NULL, &mlength,
+                NULL, NULL, NULL) < 0) {
+        return NULL;
     }
 
     lst = (PyArrayObject *)PyArray_ContiguousFromAny(list, NPY_INTP, 1, 1);
@@ -265,7 +270,7 @@ arr__monotonicity(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwds)
  * indicated by the mask
  */
 NPY_NO_EXPORT PyObject *
-arr_insert(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwdict)
+arr_place(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwdict)
 {
     char *src, *dest;
     npy_bool *mask_data;
@@ -378,7 +383,7 @@ arr_insert(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwdict)
 #ifdef __INTEL_COMPILER
 #pragma intel optimization_level 0
 #endif
-static NPY_INLINE npy_intp
+static inline npy_intp
 _linear_search(const npy_double key, const npy_double *arr, const npy_intp len, const npy_intp i0)
 {
     npy_intp i;
@@ -489,7 +494,8 @@ binary_search_with_guess(const npy_double key, const npy_double *arr,
 #undef LIKELY_IN_CACHE_SIZE
 
 NPY_NO_EXPORT PyObject *
-arr_interp(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwdict)
+arr_interp(PyObject *NPY_UNUSED(self), PyObject *const *args, Py_ssize_t len_args,
+                             PyObject *kwnames)
 {
 
     PyObject *fp, *xp, *x;
@@ -500,12 +506,16 @@ arr_interp(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwdict)
     const npy_double *dy, *dx, *dz;
     npy_double *dres, *slopes = NULL;
 
-    static char *kwlist[] = {"x", "xp", "fp", "left", "right", NULL};
-
     NPY_BEGIN_THREADS_DEF;
 
-    if (!PyArg_ParseTupleAndKeywords(args, kwdict, "OOO|OO:interp", kwlist,
-                                     &x, &xp, &fp, &left, &right)) {
+    NPY_PREPARE_ARGPARSER;
+    if (npy_parse_arguments("interp", args, len_args, kwnames,
+                "x", NULL, &x,
+                "xp", NULL, &xp,
+                "fp", NULL, &fp,
+                "|left", NULL, &left,
+                "|right", NULL, &right,
+                NULL, NULL, NULL) < 0) {
         return NULL;
     }
 
@@ -654,7 +664,8 @@ fail:
 
 /* As for arr_interp but for complex fp values */
 NPY_NO_EXPORT PyObject *
-arr_interp_complex(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwdict)
+arr_interp_complex(PyObject *NPY_UNUSED(self), PyObject *const *args, Py_ssize_t len_args,
+                             PyObject *kwnames)
 {
 
     PyObject *fp, *xp, *x;
@@ -667,12 +678,16 @@ arr_interp_complex(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwdict)
     npy_cdouble lval, rval;
     npy_cdouble *dres, *slopes = NULL;
 
-    static char *kwlist[] = {"x", "xp", "fp", "left", "right", NULL};
-
     NPY_BEGIN_THREADS_DEF;
 
-    if (!PyArg_ParseTupleAndKeywords(args, kwdict, "OOO|OO:interp_complex",
-                                     kwlist, &x, &xp, &fp, &left, &right)) {
+    NPY_PREPARE_ARGPARSER;
+    if (npy_parse_arguments("interp_complex", args, len_args, kwnames,
+                "x", NULL, &x,
+                "xp", NULL, &xp,
+                "fp", NULL, &fp,
+                "|left", NULL, &left,
+                "|right", NULL, &right,
+                NULL, NULL, NULL) < 0) {
         return NULL;
     }
 
@@ -1389,7 +1404,7 @@ fail:
 
 /* Can only be called if doc is currently NULL */
 NPY_NO_EXPORT PyObject *
-arr_add_docstring(PyObject *NPY_UNUSED(dummy), PyObject *args)
+arr_add_docstring(PyObject *NPY_UNUSED(dummy), PyObject *const *args, Py_ssize_t len_args)
 {
     PyObject *obj;
     PyObject *str;
@@ -1401,7 +1416,16 @@ arr_add_docstring(PyObject *NPY_UNUSED(dummy), PyObject *args)
         Py_RETURN_NONE;
     }
 
-    if (!PyArg_ParseTuple(args, "OO!:add_docstring", &obj, &PyUnicode_Type, &str)) {
+    NPY_PREPARE_ARGPARSER;
+    if (npy_parse_arguments("add_docstring", args, len_args, NULL,
+            "", NULL, &obj,
+            "", NULL, &str,
+            NULL, NULL, NULL) < 0) {
+        return NULL;
+    }
+    if (!PyUnicode_Check(str)) {
+        PyErr_SetString(PyExc_TypeError,
+                "argument docstring of add_docstring should be a str");
         return NULL;
     }
 
@@ -1491,7 +1515,7 @@ arr_add_docstring(PyObject *NPY_UNUSED(dummy), PyObject *args)
  * byte array. Truth values are determined as usual: 0 is false, everything
  * else is true.
  */
-static NPY_GCC_OPT_3 NPY_INLINE void
+static NPY_GCC_OPT_3 inline void
 pack_inner(const char *inptr,
            npy_intp element_size,   /* in bytes */
            npy_intp n_in,
diff --git a/numpy/core/src/multiarray/compiled_base.h b/numpy/core/src/multiarray/compiled_base.h
index d3bc08cb2..e0e73ac79 100644
--- a/numpy/core/src/multiarray/compiled_base.h
+++ b/numpy/core/src/multiarray/compiled_base.h
@@ -4,21 +4,21 @@
 #include "numpy/ndarraytypes.h"
 
 NPY_NO_EXPORT PyObject *
-arr_insert(PyObject *, PyObject *, PyObject *);
+arr_place(PyObject *, PyObject *, PyObject *);
 NPY_NO_EXPORT PyObject *
-arr_bincount(PyObject *, PyObject *, PyObject *);
+arr_bincount(PyObject *, PyObject *const *, Py_ssize_t, PyObject *);
 NPY_NO_EXPORT PyObject *
 arr__monotonicity(PyObject *, PyObject *, PyObject *kwds);
 NPY_NO_EXPORT PyObject *
-arr_interp(PyObject *, PyObject *, PyObject *);
+arr_interp(PyObject *, PyObject *const *, Py_ssize_t, PyObject *, PyObject *);
 NPY_NO_EXPORT PyObject *
-arr_interp_complex(PyObject *, PyObject *, PyObject *);
+arr_interp_complex(PyObject *, PyObject *const *, Py_ssize_t, PyObject *, PyObject *);
 NPY_NO_EXPORT PyObject *
 arr_ravel_multi_index(PyObject *, PyObject *, PyObject *);
 NPY_NO_EXPORT PyObject *
 arr_unravel_index(PyObject *, PyObject *, PyObject *);
 NPY_NO_EXPORT PyObject *
-arr_add_docstring(PyObject *, PyObject *);
+arr_add_docstring(PyObject *, PyObject *const *, Py_ssize_t);
 NPY_NO_EXPORT PyObject *
 io_pack(PyObject *, PyObject *, PyObject *);
 NPY_NO_EXPORT PyObject *
diff --git a/numpy/core/src/multiarray/conversion_utils.c b/numpy/core/src/multiarray/conversion_utils.c
index 96afb6c00..84bf08206 100644
--- a/numpy/core/src/multiarray/conversion_utils.c
+++ b/numpy/core/src/multiarray/conversion_utils.c
@@ -84,7 +84,7 @@ PyArray_OutputConverter(PyObject *object, PyArrayObject **address)
  * to `PyArray_PyIntAsIntp`.  Exists mainly to retain old behaviour of
  * `PyArray_IntpConverter` and `PyArray_IntpFromSequence`
  */
-static NPY_INLINE npy_intp
+static inline npy_intp
 dimension_from_scalar(PyObject *ob)
 {
     npy_intp value = PyArray_PyIntAsIntp(ob);
diff --git a/numpy/core/src/multiarray/conversion_utils.h b/numpy/core/src/multiarray/conversion_utils.h
index 4d0fbb894..62f54749e 100644
--- a/numpy/core/src/multiarray/conversion_utils.h
+++ b/numpy/core/src/multiarray/conversion_utils.h
@@ -52,7 +52,7 @@ NPY_NO_EXPORT int
 PyArray_TypestrConvert(int itemsize, int gentype);
 
 
-static NPY_INLINE PyObject *
+static inline PyObject *
 PyArray_PyIntFromIntp(npy_intp const value)
 {
 #if NPY_SIZEOF_INTP <= NPY_SIZEOF_LONG
diff --git a/numpy/core/src/multiarray/convert.c b/numpy/core/src/multiarray/convert.c
index 2aed0bbb4..7ef80cf28 100644
--- a/numpy/core/src/multiarray/convert.c
+++ b/numpy/core/src/multiarray/convert.c
@@ -21,6 +21,7 @@
 
 #include "convert.h"
 #include "array_coercion.h"
+#include "refcount.h"
 
 int
 fallocate(int fd, int mode, off_t offset, off_t len);
@@ -157,7 +158,24 @@ PyArray_ToFile(PyArrayObject *self, FILE *fp, char *sep, char *format)
             NPY_BEGIN_ALLOW_THREADS;
 
 #if defined (_MSC_VER) && defined(_WIN64)
-            /* Workaround Win64 fwrite() bug. Ticket #1660 */
+            /* Workaround Win64 fwrite() bug. Issue gh-2556
+             * If you touch this code, please run this test which is so slow
+             * it was removed from the test suite
+             *
+             * fourgbplus = 2**32 + 2**16
+             * testbytes = np.arange(8, dtype=np.int8)
+             * n = len(testbytes)
+             * flike = tempfile.NamedTemporaryFile()
+             * f = flike.file
+             * np.tile(testbytes, fourgbplus // testbytes.nbytes).tofile(f)
+             * flike.seek(0)
+             * a = np.fromfile(f, dtype=np.int8)
+             * flike.close()
+             * assert_(len(a) == fourgbplus)
+             * # check only start and end for speed:
+             * assert_((a[:n] == testbytes).all())
+             * assert_((a[-n:] == testbytes).all())
+             */
             {
                 npy_intp maxsize = 2147483648 / PyArray_DESCR(self)->elsize;
                 npy_intp chunksize;
@@ -305,7 +323,7 @@ PyArray_ToString(PyArrayObject *self, NPY_ORDER order)
     PyArrayIterObject *it;
 
     if (order == NPY_ANYORDER)
-        order = PyArray_ISFORTRAN(self);
+        order = PyArray_ISFORTRAN(self) ? NPY_FORTRANORDER : NPY_CORDER;
 
     /*        if (PyArray_TYPE(self) == NPY_OBJECT) {
               PyErr_SetString(PyExc_ValueError, "a string for the data" \
@@ -359,6 +377,11 @@ PyArray_ToString(PyArrayObject *self, NPY_ORDER order)
 NPY_NO_EXPORT int
 PyArray_FillWithScalar(PyArrayObject *arr, PyObject *obj)
 {
+
+    if (PyArray_FailUnlessWriteable(arr, "assignment destination") < 0) {
+        return -1;
+    }
+
     /*
      * If we knew that the output array has at least one element, we would
      * not actually need a helping buffer, we always null it, just in case.
@@ -393,12 +416,17 @@ PyArray_FillWithScalar(PyArrayObject *arr, PyObject *obj)
             PyArray_BYTES(arr), PyArray_STRIDES(arr),
             descr, value);
 
+    if (PyDataType_REFCHK(descr)) {
+        PyArray_ClearBuffer(descr, value, 0, 1, 1);
+    }
     PyMem_FREE(value_buffer_heap);
     return retcode;
 }
 
 /*
- * Fills an array with zeros.
+ * Internal function to fill an array with zeros.
+ * Used in einsum and dot, which ensures the dtype is, in some sense, numerical
+ * and not a str or struct
  *
  * dst: The destination array.
  * wheremask: If non-NULL, a boolean mask specifying where to set the values.
@@ -409,21 +437,26 @@ NPY_NO_EXPORT int
 PyArray_AssignZero(PyArrayObject *dst,
                    PyArrayObject *wheremask)
 {
-    npy_bool value;
-    PyArray_Descr *bool_dtype;
-    int retcode;
-
-    /* Create a raw bool scalar with the value False */
-    bool_dtype = PyArray_DescrFromType(NPY_BOOL);
-    if (bool_dtype == NULL) {
-        return -1;
+    int retcode = 0;
+    if (PyArray_ISOBJECT(dst)) {
+        PyObject * pZero = PyLong_FromLong(0);
+        retcode = PyArray_AssignRawScalar(dst, PyArray_DESCR(dst),
+                                     (char *)&pZero, wheremask, NPY_SAFE_CASTING);
+        Py_DECREF(pZero);
     }
-    value = 0;
+    else {
+        /* Create a raw bool scalar with the value False */
+        PyArray_Descr *bool_dtype = PyArray_DescrFromType(NPY_BOOL);
+        if (bool_dtype == NULL) {
+            return -1;
+        }
+        npy_bool value = 0;
 
-    retcode = PyArray_AssignRawScalar(dst, bool_dtype, (char *)&value,
-                                      wheremask, NPY_SAFE_CASTING);
+        retcode = PyArray_AssignRawScalar(dst, bool_dtype, (char *)&value,
+                                          wheremask, NPY_SAFE_CASTING);
 
-    Py_DECREF(bool_dtype);
+        Py_DECREF(bool_dtype);
+    }
     return retcode;
 }
 
@@ -483,7 +516,7 @@ PyArray_View(PyArrayObject *self, PyArray_Descr *type, PyTypeObject *pytype)
             PyArray_NDIM(self), PyArray_DIMS(self), PyArray_STRIDES(self),
             PyArray_DATA(self),
             flags, (PyObject *)self, (PyObject *)self,
-            0, 1);
+            _NPY_ARRAY_ENSURE_DTYPE_IDENTITY);
     if (ret == NULL) {
         Py_XDECREF(type);
         return NULL;
diff --git a/numpy/core/src/multiarray/convert_datatype.c b/numpy/core/src/multiarray/convert_datatype.c
index c578a1b44..de1cd075d 100644
--- a/numpy/core/src/multiarray/convert_datatype.c
+++ b/numpy/core/src/multiarray/convert_datatype.c
@@ -18,6 +18,7 @@
 #include "can_cast_table.h"
 #include "common.h"
 #include "ctors.h"
+#include "descriptor.h"
 #include "dtypemeta.h"
 #include "common_dtype.h"
 #include "scalartypes.h"
@@ -31,6 +32,7 @@
 #include "array_method.h"
 #include "usertypes.h"
 #include "dtype_transfer.h"
+#include "dtype_traversal.h"
 #include "arrayobject.h"
 
 
@@ -50,6 +52,9 @@ NPY_NO_EXPORT npy_intp REQUIRED_STR_LEN[] = {0, 3, 5, 10, 10, 20, 20, 20, 20};
  */
 NPY_NO_EXPORT int npy_promotion_state = NPY_USE_LEGACY_PROMOTION;
 NPY_NO_EXPORT PyObject *NO_NEP50_WARNING_CTX = NULL;
+NPY_NO_EXPORT PyObject *npy_DTypePromotionError = NULL;
+NPY_NO_EXPORT PyObject *npy_UFuncNoLoopError = NULL;
+
 
 static PyObject *
 PyArray_GetGenericToVoidCastingImpl(void);
@@ -149,7 +154,7 @@ PyArray_GetCastingImpl(PyArray_DTypeMeta *from, PyArray_DTypeMeta *to)
 {
     PyObject *res;
     if (from == to) {
-        res = NPY_DT_SLOTS(from)->within_dtype_castingimpl;
+        res = (PyObject *)NPY_DT_SLOTS(from)->within_dtype_castingimpl;
     }
     else {
         res = PyDict_GetItemWithError(NPY_DT_SLOTS(from)->castingimpls, (PyObject *)to);
@@ -326,7 +331,7 @@ PyArray_CastToType(PyArrayObject *arr, PyArray_Descr *dtype, int is_f_order)
         return NULL;
     }
 
-    Py_SETREF(dtype, PyArray_AdaptDescriptorToArray(arr, (PyObject *)dtype));
+    Py_SETREF(dtype, PyArray_AdaptDescriptorToArray(arr, NULL, dtype));
     if (dtype == NULL) {
         return NULL;
     }
@@ -384,18 +389,14 @@ PyArray_GetCastFunc(PyArray_Descr *descr, int type_num)
             !PyTypeNum_ISCOMPLEX(type_num) &&
             PyTypeNum_ISNUMBER(type_num) &&
             !PyTypeNum_ISBOOL(type_num)) {
-        PyObject *cls = NULL, *obj = NULL;
-        int ret;
-        obj = PyImport_ImportModule("numpy.core");
-
-        if (obj) {
-            cls = PyObject_GetAttrString(obj, "ComplexWarning");
-            Py_DECREF(obj);
+        static PyObject *cls = NULL;
+        npy_cache_import("numpy.exceptions", "ComplexWarning", &cls);
+        if (cls == NULL) {
+            return NULL;
         }
-        ret = PyErr_WarnEx(cls,
+        int ret = PyErr_WarnEx(cls,
                 "Casting complex values to real discards "
                 "the imaginary part", 1);
-        Py_XDECREF(cls);
         if (ret < 0) {
             return NULL;
         }
@@ -1149,7 +1150,7 @@ PyArray_CastDescrToDType(PyArray_Descr *descr, PyArray_DTypeMeta *given_DType)
  */
 NPY_NO_EXPORT PyArray_Descr *
 PyArray_FindConcatenationDescriptor(
-        npy_intp n, PyArrayObject **arrays, PyObject *requested_dtype)
+        npy_intp n, PyArrayObject **arrays, PyArray_Descr *requested_dtype)
 {
     if (requested_dtype == NULL) {
         return PyArray_ResultType(n, arrays, 0, NULL);
@@ -2415,8 +2416,7 @@ PyArray_AddCastingImplementation(PyBoundArrayMethodObject *meth)
             return -1;
         }
         Py_INCREF(meth->method);
-        NPY_DT_SLOTS(meth->dtypes[0])->within_dtype_castingimpl = (
-                (PyObject *)meth->method);
+        NPY_DT_SLOTS(meth->dtypes[0])->within_dtype_castingimpl = meth->method;
 
         return 0;
     }
@@ -2613,7 +2613,7 @@ complex_to_noncomplex_get_loop(
 {
     static PyObject *cls = NULL;
     int ret;
-    npy_cache_import("numpy.core", "ComplexWarning", &cls);
+    npy_cache_import("numpy.exceptions", "ComplexWarning", &cls);
     if (cls == NULL) {
         return -1;
     }
@@ -2639,8 +2639,8 @@ add_numeric_cast(PyArray_DTypeMeta *from, PyArray_DTypeMeta *to)
             .nin = 1,
             .nout = 1,
             .flags = NPY_METH_SUPPORTS_UNALIGNED,
-            .slots = slots,
             .dtypes = dtypes,
+            .slots = slots,
     };
 
     npy_intp from_itemsize = from->singleton->elsize;
@@ -2669,7 +2669,7 @@ add_numeric_cast(PyArray_DTypeMeta *from, PyArray_DTypeMeta *to)
          * consider moving this warning into the inner-loop at some point
          * for simplicity (this requires ensuring it is only emitted once).
          */
-        slots[5].slot = NPY_METH_get_loop;
+        slots[5].slot = _NPY_METH_get_loop;
         slots[5].pfunc = &complex_to_noncomplex_get_loop;
         slots[6].slot = 0;
         slots[6].pfunc = NULL;
@@ -2690,7 +2690,7 @@ add_numeric_cast(PyArray_DTypeMeta *from, PyArray_DTypeMeta *to)
         /* When there is no casting (equivalent C-types) use byteswap loops */
         slots[0].slot = NPY_METH_resolve_descriptors;
         slots[0].pfunc = &legacy_same_dtype_resolve_descriptors;
-        slots[1].slot = NPY_METH_get_loop;
+        slots[1].slot = _NPY_METH_get_loop;
         slots[1].pfunc = &get_byteswap_loop;
         slots[2].slot = 0;
         slots[2].pfunc = NULL;
@@ -2874,14 +2874,14 @@ add_other_to_and_from_string_cast(
      */
     PyArray_DTypeMeta *dtypes[2] = {other, string};
     PyType_Slot slots[] = {
-            {NPY_METH_get_loop, &legacy_cast_get_strided_loop},
+            {_NPY_METH_get_loop, &legacy_cast_get_strided_loop},
             {NPY_METH_resolve_descriptors, &cast_to_string_resolve_descriptors},
             {0, NULL}};
     PyArrayMethod_Spec spec = {
         .name = "legacy_cast_to_string",
         .nin = 1,
         .nout = 1,
-        .flags = NPY_METH_REQUIRES_PYAPI,
+        .flags = NPY_METH_REQUIRES_PYAPI | NPY_METH_NO_FLOATINGPOINT_ERRORS,
         .dtypes = dtypes,
         .slots = slots,
     };
@@ -3013,14 +3013,14 @@ PyArray_InitializeStringCasts(void)
     /* string<->string and unicode<->unicode have their own specialized casts */
     PyArray_DTypeMeta *dtypes[2];
     PyType_Slot slots[] = {
-            {NPY_METH_get_loop, &string_to_string_get_loop},
+            {_NPY_METH_get_loop, &string_to_string_get_loop},
             {NPY_METH_resolve_descriptors, &string_to_string_resolve_descriptors},
             {0, NULL}};
     PyArrayMethod_Spec spec = {
             .name = "string_to_string_cast",
-            .casting = NPY_UNSAFE_CASTING,
             .nin = 1,
             .nout = 1,
+            .casting = NPY_UNSAFE_CASTING,
             .flags = (NPY_METH_REQUIRES_PYAPI |
                       NPY_METH_NO_FLOATINGPOINT_ERRORS |
                       NPY_METH_SUPPORTS_UNALIGNED),
@@ -3712,14 +3712,14 @@ PyArray_InitializeVoidToVoidCast(void)
     PyArray_DTypeMeta *Void = PyArray_DTypeFromTypeNum(NPY_VOID);
     PyArray_DTypeMeta *dtypes[2] = {Void, Void};
     PyType_Slot slots[] = {
-            {NPY_METH_get_loop, &void_to_void_get_loop},
+            {_NPY_METH_get_loop, &void_to_void_get_loop},
             {NPY_METH_resolve_descriptors, &void_to_void_resolve_descriptors},
             {0, NULL}};
     PyArrayMethod_Spec spec = {
             .name = "void_to_void_cast",
-            .casting = -1,  /* may not cast at all */
             .nin = 1,
             .nout = 1,
+            .casting = -1,  /* may not cast at all */
             .flags = NPY_METH_REQUIRES_PYAPI | NPY_METH_SUPPORTS_UNALIGNED,
             .dtypes = dtypes,
             .slots = slots,
@@ -3895,13 +3895,13 @@ PyArray_InitializeObjectToObjectCast(void)
     PyArray_DTypeMeta *Object = PyArray_DTypeFromTypeNum(NPY_OBJECT);
     PyArray_DTypeMeta *dtypes[2] = {Object, Object};
     PyType_Slot slots[] = {
-            {NPY_METH_get_loop, &object_to_object_get_loop},
+            {_NPY_METH_get_loop, &object_to_object_get_loop},
             {0, NULL}};
     PyArrayMethod_Spec spec = {
             .name = "object_to_object_cast",
-            .casting = NPY_NO_CASTING,
             .nin = 1,
             .nout = 1,
+            .casting = NPY_NO_CASTING,
             .flags = NPY_METH_REQUIRES_PYAPI | NPY_METH_SUPPORTS_UNALIGNED,
             .dtypes = dtypes,
             .slots = slots,
diff --git a/numpy/core/src/multiarray/convert_datatype.h b/numpy/core/src/multiarray/convert_datatype.h
index b6bc7d8a7..a68c669ee 100644
--- a/numpy/core/src/multiarray/convert_datatype.h
+++ b/numpy/core/src/multiarray/convert_datatype.h
@@ -14,6 +14,8 @@ extern NPY_NO_EXPORT npy_intp REQUIRED_STR_LEN[];
 #define NPY_USE_WEAK_PROMOTION_AND_WARN 2
 extern NPY_NO_EXPORT int npy_promotion_state;
 extern NPY_NO_EXPORT PyObject *NO_NEP50_WARNING_CTX;
+extern NPY_NO_EXPORT PyObject *npy_DTypePromotionError;
+extern NPY_NO_EXPORT PyObject *npy_UFuncNoLoopError;
 
 NPY_NO_EXPORT int
 npy_give_promotion_warnings(void);
@@ -80,7 +82,7 @@ PyArray_CastDescrToDType(PyArray_Descr *descr, PyArray_DTypeMeta *given_DType);
 
 NPY_NO_EXPORT PyArray_Descr *
 PyArray_FindConcatenationDescriptor(
-        npy_intp n, PyArrayObject **arrays, PyObject *requested_dtype);
+        npy_intp n, PyArrayObject **arrays, PyArray_Descr *requested_dtype);
 
 NPY_NO_EXPORT int
 PyArray_AddCastingImplementation(PyBoundArrayMethodObject *meth);
diff --git a/numpy/core/src/multiarray/ctors.c b/numpy/core/src/multiarray/ctors.c
index 05575cad7..3c5b047c3 100644
--- a/numpy/core/src/multiarray/ctors.c
+++ b/numpy/core/src/multiarray/ctors.c
@@ -20,6 +20,8 @@
 #include "common.h"
 #include "ctors.h"
 #include "convert_datatype.h"
+#include "descriptor.h"
+#include "dtypemeta.h"
 #include "shape.h"
 #include "npy_buffer.h"
 #include "lowlevel_strided_loops.h"
@@ -626,8 +628,7 @@ NPY_NO_EXPORT PyObject *
 PyArray_NewFromDescr_int(
         PyTypeObject *subtype, PyArray_Descr *descr, int nd,
         npy_intp const *dims, npy_intp const *strides, void *data,
-        int flags, PyObject *obj, PyObject *base, int zeroed,
-        int allow_emptystring)
+        int flags, PyObject *obj, PyObject *base, _NPY_CREATION_FLAGS cflags)
 {
     PyArrayObject_fields *fa;
     npy_intp nbytes;
@@ -642,44 +643,54 @@ PyArray_NewFromDescr_int(
         return NULL;
     }
 
-    if (descr->subarray) {
-        PyObject *ret;
-        npy_intp newdims[2*NPY_MAXDIMS];
-        npy_intp *newstrides = NULL;
-        memcpy(newdims, dims, nd*sizeof(npy_intp));
-        if (strides) {
-            newstrides = newdims + NPY_MAXDIMS;
-            memcpy(newstrides, strides, nd*sizeof(npy_intp));
-        }
-        nd =_update_descr_and_dimensions(&descr, newdims,
-                                         newstrides, nd);
-        ret = PyArray_NewFromDescr_int(
-                subtype, descr,
-                nd, newdims, newstrides, data,
-                flags, obj, base,
-                zeroed, allow_emptystring);
-        return ret;
-    }
-
-    /* Check datatype element size */
     nbytes = descr->elsize;
-    if (PyDataType_ISUNSIZED(descr)) {
-        if (!PyDataType_ISFLEXIBLE(descr)) {
-            PyErr_SetString(PyExc_TypeError, "Empty data-type");
-            Py_DECREF(descr);
-            return NULL;
-        }
-        else if (PyDataType_ISSTRING(descr) && !allow_emptystring &&
-                 data == NULL) {
-            PyArray_DESCR_REPLACE(descr);
-            if (descr == NULL) {
+    /*
+     * Unless explicitly asked not to, we do replace dtypes in some cases.
+     * This mainly means that we never create arrays with a subarray dtype
+     * (unless for internal use when requested).  And neither do we create
+     * S0/U0 arrays in most cases (unless data == NULL so this is probably
+     * a view where growing the dtype would be presumable wrong).
+     */
+    if (!(cflags & _NPY_ARRAY_ENSURE_DTYPE_IDENTITY)) {
+        if (descr->subarray) {
+            PyObject *ret;
+            npy_intp newdims[2*NPY_MAXDIMS];
+            npy_intp *newstrides = NULL;
+            memcpy(newdims, dims, nd*sizeof(npy_intp));
+            if (strides) {
+                newstrides = newdims + NPY_MAXDIMS;
+                memcpy(newstrides, strides, nd*sizeof(npy_intp));
+            }
+            nd =_update_descr_and_dimensions(&descr, newdims,
+                                            newstrides, nd);
+            ret = PyArray_NewFromDescr_int(
+                    subtype, descr,
+                    nd, newdims, newstrides, data,
+                    flags, obj, base, cflags);
+            return ret;
+        }
+
+        /* Check datatype element size */
+        if (PyDataType_ISUNSIZED(descr)) {
+            if (!PyDataType_ISFLEXIBLE(descr) &&
+                NPY_DT_is_legacy(NPY_DTYPE(descr))) {
+                PyErr_SetString(PyExc_TypeError, "Empty data-type");
+                Py_DECREF(descr);
                 return NULL;
             }
-            if (descr->type_num == NPY_STRING) {
-                nbytes = descr->elsize = 1;
-            }
-            else {
-                nbytes = descr->elsize = sizeof(npy_ucs4);
+            else if (PyDataType_ISSTRING(descr)
+                        && !(cflags & _NPY_ARRAY_ALLOW_EMPTY_STRING)
+                        && data == NULL) {
+                PyArray_DESCR_REPLACE(descr);
+                if (descr == NULL) {
+                    return NULL;
+                }
+                if (descr->type_num == NPY_STRING) {
+                    nbytes = descr->elsize = 1;
+                }
+                else {
+                    nbytes = descr->elsize = sizeof(npy_ucs4);
+                }
             }
         }
     }
@@ -712,6 +723,11 @@ PyArray_NewFromDescr_int(
     fa->base = (PyObject *)NULL;
     fa->weakreflist = (PyObject *)NULL;
 
+    /* needed for zero-filling logic below, defined and initialized up here
+       so cleanup logic can go in the fail block */
+    NPY_traverse_info fill_zero_info;
+    NPY_traverse_info_init(&fill_zero_info);
+
     if (nd > 0) {
         fa->dimensions = npy_alloc_cache_dim(2 * nd);
         if (fa->dimensions == NULL) {
@@ -781,6 +797,32 @@ PyArray_NewFromDescr_int(
 
 
     if (data == NULL) {
+        /* float errors do not matter and we do not release GIL */
+        NPY_ARRAYMETHOD_FLAGS zero_flags;
+        get_traverse_loop_function *get_fill_zero_loop =
+            NPY_DT_SLOTS(NPY_DTYPE(descr))->get_fill_zero_loop;
+        if (get_fill_zero_loop != NULL) {
+            if (get_fill_zero_loop(
+                    NULL, descr, 1, descr->elsize, &(fill_zero_info.func),
+                    &(fill_zero_info.auxdata), &zero_flags) < 0) {
+                goto fail;
+            }
+        }
+
+        /*
+         * We always want a zero-filled array allocated with calloc if
+         * NPY_NEEDS_INIT is set on the dtype, for safety.  We also want a
+         * zero-filled array if zeroed is set and the zero-filling loop isn't
+         * defined, for better performance.
+         *
+         * If the zero-filling loop is defined and zeroed is set, allocate
+         * with malloc and let the zero-filling loop fill the array buffer
+         * with valid zero values for the dtype.
+         */
+        int use_calloc = (
+                PyDataType_FLAGCHK(descr, NPY_NEEDS_INIT) ||
+                ((cflags & _NPY_ARRAY_ZEROED) && (fill_zero_info.func == NULL)));
+
         /* Store the handler in case the default is modified */
         fa->mem_handler = PyDataMem_GetHandler();
         if (fa->mem_handler == NULL) {
@@ -798,11 +840,8 @@ PyArray_NewFromDescr_int(
                 fa->strides[i] = 0;
             }
         }
-        /*
-         * It is bad to have uninitialized OBJECT pointers
-         * which could also be sub-fields of a VOID array
-         */
-        if (zeroed || PyDataType_FLAGCHK(descr, NPY_NEEDS_INIT)) {
+
+        if (use_calloc) {
             data = PyDataMem_UserNEW_ZEROED(nbytes, 1, fa->mem_handler);
         }
         else {
@@ -813,6 +852,19 @@ PyArray_NewFromDescr_int(
             goto fail;
         }
 
+        /*
+         * If the array needs special dtype-specific zero-filling logic, do that
+         */
+        if (NPY_UNLIKELY((cflags & _NPY_ARRAY_ZEROED)
+                         && (fill_zero_info.func != NULL))) {
+            npy_intp size = PyArray_MultiplyList(fa->dimensions, fa->nd);
+            if (fill_zero_info.func(
+                    NULL, descr, data, size, descr->elsize,
+                    fill_zero_info.auxdata) < 0) {
+                goto fail;
+            }
+        }
+
         fa->flags |= NPY_ARRAY_OWNDATA;
     }
     else {
@@ -907,9 +959,11 @@ PyArray_NewFromDescr_int(
             }
         }
     }
+    NPY_traverse_info_xfree(&fill_zero_info);
     return (PyObject *)fa;
 
  fail:
+    NPY_traverse_info_xfree(&fill_zero_info);
     Py_XDECREF(fa->mem_handler);
     Py_DECREF(fa);
     return NULL;
@@ -957,7 +1011,7 @@ PyArray_NewFromDescrAndBase(
 {
     return PyArray_NewFromDescr_int(subtype, descr, nd,
                                     dims, strides, data,
-                                    flags, obj, base, 0, 0);
+                                    flags, obj, base, 0);
 }
 
 /*
@@ -1440,167 +1494,44 @@ PyArray_GetArrayParamsFromObject(PyObject *NPY_UNUSED(op),
 }
 
 
-/*
- * This function is a legacy implementation to retain subarray dtype
- * behaviour in array coercion. The behaviour here makes sense if tuples
- * of matching dimensionality are being coerced. Due to the difficulty
- * that the result is ill-defined for lists of array-likes, this is deprecated.
- *
- * WARNING: Do not use this function, it exists purely to support a deprecated
- *          code path.
+/*NUMPY_API
+ * Does not check for NPY_ARRAY_ENSURECOPY and NPY_ARRAY_NOTSWAPPED in flags
+ * Steals a reference to newtype --- which can be NULL
  */
-static int
-setArrayFromSequence(PyArrayObject *a, PyObject *s,
-                        int dim, PyArrayObject * dst)
+NPY_NO_EXPORT PyObject *
+PyArray_FromAny(PyObject *op, PyArray_Descr *newtype, int min_depth,
+                int max_depth, int flags, PyObject *context)
 {
-    Py_ssize_t i, slen;
-    int res = -1;
-
-    /* first recursion, view equal destination */
-    if (dst == NULL)
-        dst = a;
-
-    /*
-     * This code is to ensure that the sequence access below will
-     * return a lower-dimensional sequence.
-     */
-
-    /* INCREF on entry DECREF on exit */
-    Py_INCREF(s);
-
-    PyObject *seq = NULL;
+    npy_dtype_info dt_info = {NULL, NULL};
 
-    if (PyArray_Check(s)) {
-        if (!(PyArray_CheckExact(s))) {
-            /*
-             * make sure a base-class array is used so that the dimensionality
-             * reduction assumption is correct.
-             */
-            /* This will DECREF(s) if replaced */
-            s = PyArray_EnsureArray(s);
-            if (s == NULL) {
-                goto fail;
-            }
-        }
-
-        /* dst points to correct array subsection */
-        if (PyArray_CopyInto(dst, (PyArrayObject *)s) < 0) {
-            goto fail;
-        }
-
-        Py_DECREF(s);
-        return 0;
-    }
-
-    if (dim > PyArray_NDIM(a)) {
-        PyErr_Format(PyExc_ValueError,
-                 "setArrayFromSequence: sequence/array dimensions mismatch.");
-        goto fail;
-    }
+    int res = PyArray_ExtractDTypeAndDescriptor(
+        newtype, &dt_info.descr, &dt_info.dtype);
 
-    /* Try __array__ before using s as a sequence */
-    PyObject *tmp = _array_from_array_like(s, NULL, 0, NULL, 0);
-    if (tmp == NULL) {
-        goto fail;
-    }
-    else if (tmp == Py_NotImplemented) {
-        Py_DECREF(tmp);
-    }
-    else {
-        int r = PyArray_CopyInto(dst, (PyArrayObject *)tmp);
-        Py_DECREF(tmp);
-        if (r < 0) {
-            goto fail;
-        }
-        Py_DECREF(s);
-        return 0;
-    }
-
-    seq = PySequence_Fast(s, "Could not convert object to sequence");
-    if (seq == NULL) {
-        goto fail;
-    }
-    slen = PySequence_Fast_GET_SIZE(seq);
-
-    /*
-     * Either the dimensions match, or the sequence has length 1 and can
-     * be broadcast to the destination.
-     */
-    if (slen != PyArray_DIMS(a)[dim] && slen != 1) {
-        PyErr_Format(PyExc_ValueError,
-                 "cannot copy sequence with size %zd to array axis "
-                 "with dimension %" NPY_INTP_FMT, slen, PyArray_DIMS(a)[dim]);
-        goto fail;
-    }
-
-    /* Broadcast the one element from the sequence to all the outputs */
-    if (slen == 1) {
-        PyObject *o = PySequence_Fast_GET_ITEM(seq, 0);
-        npy_intp alen = PyArray_DIM(a, dim);
-
-        for (i = 0; i < alen; i++) {
-            if ((PyArray_NDIM(a) - dim) > 1) {
-                PyArrayObject * tmp =
-                    (PyArrayObject *)array_item_asarray(dst, i);
-                if (tmp == NULL) {
-                    goto fail;
-                }
-
-                res = setArrayFromSequence(a, o, dim+1, tmp);
-                Py_DECREF(tmp);
-            }
-            else {
-                char * b = (PyArray_BYTES(dst) + i * PyArray_STRIDES(dst)[0]);
-                res = PyArray_SETITEM(dst, b, o);
-            }
-            if (res < 0) {
-                goto fail;
-            }
-        }
-    }
-    /* Copy element by element */
-    else {
-        for (i = 0; i < slen; i++) {
-            PyObject * o = PySequence_Fast_GET_ITEM(seq, i);
-            if ((PyArray_NDIM(a) - dim) > 1) {
-                PyArrayObject * tmp =
-                    (PyArrayObject *)array_item_asarray(dst, i);
-                if (tmp == NULL) {
-                    goto fail;
-                }
+    Py_XDECREF(newtype);
 
-                res = setArrayFromSequence(a, o, dim+1, tmp);
-                Py_DECREF(tmp);
-            }
-            else {
-                char * b = (PyArray_BYTES(dst) + i * PyArray_STRIDES(dst)[0]);
-                res = PyArray_SETITEM(dst, b, o);
-            }
-            if (res < 0) {
-                goto fail;
-            }
-        }
+    if (res < 0) {
+        Py_XDECREF(dt_info.descr);
+        Py_XDECREF(dt_info.dtype);
+        return NULL;
     }
 
-    Py_DECREF(seq);
-    Py_DECREF(s);
-    return 0;
+    PyObject* ret =  PyArray_FromAny_int(op, dt_info.descr, dt_info.dtype,
+                                         min_depth, max_depth, flags, context);
 
- fail:
-    Py_XDECREF(seq);
-    Py_DECREF(s);
-    return res;
+    Py_XDECREF(dt_info.descr);
+    Py_XDECREF(dt_info.dtype);
+    return ret;
 }
 
-
-
-/*NUMPY_API
- * Does not check for NPY_ARRAY_ENSURECOPY and NPY_ARRAY_NOTSWAPPED in flags
- * Steals a reference to newtype --- which can be NULL
+/*
+ * Internal version of PyArray_FromAny that accepts a dtypemeta. Borrows
+ * references to the descriptor and dtype.
  */
+
 NPY_NO_EXPORT PyObject *
-PyArray_FromAny(PyObject *op, PyArray_Descr *newtype, int min_depth,
-                int max_depth, int flags, PyObject *context)
+PyArray_FromAny_int(PyObject *op, PyArray_Descr *in_descr,
+                    PyArray_DTypeMeta *in_DType, int min_depth, int max_depth,
+                    int flags, PyObject *context)
 {
     /*
      * This is the main code to make a NumPy array from a Python
@@ -1617,89 +1548,14 @@ PyArray_FromAny(PyObject *op, PyArray_Descr *newtype, int min_depth,
         return NULL;
     }
 
-    PyArray_Descr *fixed_descriptor;
-    PyArray_DTypeMeta *fixed_DType;
-    if (PyArray_ExtractDTypeAndDescriptor((PyObject *)newtype,
-            &fixed_descriptor, &fixed_DType) < 0) {
-        Py_XDECREF(newtype);
-        return NULL;
-    }
-    Py_XDECREF(newtype);
-
     ndim = PyArray_DiscoverDTypeAndShape(op,
-            NPY_MAXDIMS, dims, &cache, fixed_DType, fixed_descriptor, &dtype,
+            NPY_MAXDIMS, dims, &cache, in_DType, in_descr, &dtype,
             flags & NPY_ARRAY_ENSURENOCOPY);
 
-    Py_XDECREF(fixed_descriptor);
-    Py_XDECREF(fixed_DType);
     if (ndim < 0) {
         return NULL;
     }
 
-    if (NPY_UNLIKELY(fixed_descriptor != NULL && PyDataType_HASSUBARRAY(dtype))) {
-        /*
-         * When a subarray dtype was passed in, its dimensions are appended
-         * to the array dimension (causing a dimension mismatch).
-         * There is a problem with that, because if we coerce from non-arrays
-         * we do this correctly by element (as defined by tuples), but for
-         * arrays we first append the dimensions and then assign to the base
-         * dtype and then assign which causes the problem.
-         *
-         * Thus, we check if there is an array included, in that case we
-         * give a FutureWarning.
-         * When the warning is removed, PyArray_Pack will have to ensure
-         * that it does not append the dimensions when creating the subarrays
-         * to assign `arr[0] = obj[0]`.
-         */
-        int includes_array = 0;
-        if (cache != NULL) {
-            /* This is not ideal, but it is a pretty special case */
-            coercion_cache_obj *next = cache;
-            while (next != NULL) {
-                if (!next->sequence) {
-                    includes_array = 1;
-                    break;
-                }
-                next = next->next;
-            }
-        }
-        if (includes_array) {
-            npy_free_coercion_cache(cache);
-
-            ret = (PyArrayObject *) PyArray_NewFromDescr(
-                    &PyArray_Type, dtype, ndim, dims, NULL, NULL,
-                    flags & NPY_ARRAY_F_CONTIGUOUS, NULL);
-            if (ret == NULL) {
-                return NULL;
-            }
-            assert(PyArray_NDIM(ret) != ndim);
-
-            /* NumPy 1.20, 2020-10-01 */
-            if (DEPRECATE_FUTUREWARNING(
-                    "creating an array with a subarray dtype will behave "
-                    "differently when the `np.array()` (or `asarray`, etc.) "
-                    "call includes an array or array object.\n"
-                    "If you are converting a single array or a list of arrays,"
-                    "you can opt-in to the future behaviour using:\n"
-                    "    np.array(arr, dtype=np.dtype(['f', dtype]))['f']\n"
-                    "    np.array([arr1, arr2], dtype=np.dtype(['f', dtype]))['f']\n"
-                    "\n"
-                    "By including a new field and indexing it after the "
-                    "conversion.\n"
-                    "This may lead to a different result or to current failures "
-                    "succeeding.  (FutureWarning since NumPy 1.20)") < 0) {
-                Py_DECREF(ret);
-                return NULL;
-            }
-
-            if (setArrayFromSequence(ret, op, 0, NULL) < 0) {
-                Py_DECREF(ret);
-                return NULL;
-            }
-            return (PyObject *)ret;
-        }
-    }
-
     if (dtype == NULL) {
         dtype = PyArray_DescrFromType(NPY_DEFAULT_TYPE);
     }
@@ -1734,7 +1590,7 @@ PyArray_FromAny(PyObject *op, PyArray_Descr *newtype, int min_depth,
     }
     else if (cache == NULL && PyArray_IsScalar(op, Void) &&
             !(((PyVoidScalarObject *)op)->flags & NPY_ARRAY_OWNDATA) &&
-            newtype == NULL) {
+             ((in_descr == NULL) && (in_DType == NULL))) {
         /*
          * Special case, we return a *view* into void scalars, mainly to
          * allow things similar to the "reversed" assignment:
@@ -1765,8 +1621,9 @@ PyArray_FromAny(PyObject *op, PyArray_Descr *newtype, int min_depth,
         return NULL;
     }
 
-    if (cache == NULL && newtype != NULL &&
-            PyDataType_ISSIGNED(newtype) && PyArray_IsScalar(op, Generic)) {
+    if (cache == NULL && in_descr != NULL &&
+            PyDataType_ISSIGNED(dtype) &&
+            PyArray_IsScalar(op, Generic)) {
         assert(ndim == 0);
         /*
          * This is an (possible) inconsistency where:
@@ -1895,7 +1752,6 @@ PyArray_FromAny(PyObject *op, PyArray_Descr *newtype, int min_depth,
  * NPY_ARRAY_FORCECAST will cause a cast to occur regardless of whether or not
  * it is safe.
  *
- * context is passed through to PyArray_GetArrayParamsFromObject
  */
 
 /*NUMPY_API
@@ -1905,24 +1761,57 @@ NPY_NO_EXPORT PyObject *
 PyArray_CheckFromAny(PyObject *op, PyArray_Descr *descr, int min_depth,
                      int max_depth, int requires, PyObject *context)
 {
+    npy_dtype_info dt_info = {NULL, NULL};
+
+    int res = PyArray_ExtractDTypeAndDescriptor(
+        descr, &dt_info.descr, &dt_info.dtype);
+
+    Py_XDECREF(descr);
+
+    if (res < 0) {
+        Py_XDECREF(dt_info.descr);
+        Py_XDECREF(dt_info.dtype);
+        return NULL;
+    }
+
+    PyObject* ret =  PyArray_CheckFromAny_int(
+        op, dt_info.descr, dt_info.dtype, min_depth, max_depth, requires,
+        context);
+
+    Py_XDECREF(dt_info.descr);
+    Py_XDECREF(dt_info.dtype);
+    return ret;
+}
+
+/*
+ * Internal version of PyArray_CheckFromAny that accepts a dtypemeta. Borrows
+ * references to the descriptor and dtype.
+ */
+
+NPY_NO_EXPORT PyObject *
+PyArray_CheckFromAny_int(PyObject *op, PyArray_Descr *in_descr,
+                         PyArray_DTypeMeta *in_DType, int min_depth,
+                         int max_depth, int requires, PyObject *context)
+{
     PyObject *obj;
     if (requires & NPY_ARRAY_NOTSWAPPED) {
-        if (!descr && PyArray_Check(op) &&
+        if (!in_descr && PyArray_Check(op) &&
                 PyArray_ISBYTESWAPPED((PyArrayObject* )op)) {
-            descr = PyArray_DescrNew(PyArray_DESCR((PyArrayObject *)op));
-            if (descr == NULL) {
+            in_descr = PyArray_DescrNew(PyArray_DESCR((PyArrayObject *)op));
+            if (in_descr == NULL) {
                 return NULL;
             }
         }
-        else if (descr && !PyArray_ISNBO(descr->byteorder)) {
-            PyArray_DESCR_REPLACE(descr);
+        else if (in_descr && !PyArray_ISNBO(in_descr->byteorder)) {
+            PyArray_DESCR_REPLACE(in_descr);
         }
-        if (descr && descr->byteorder != NPY_IGNORE) {
-            descr->byteorder = NPY_NATIVE;
+        if (in_descr && in_descr->byteorder != NPY_IGNORE) {
+            in_descr->byteorder = NPY_NATIVE;
         }
     }
 
-    obj = PyArray_FromAny(op, descr, min_depth, max_depth, requires, context);
+    obj = PyArray_FromAny_int(op, in_descr, in_DType, min_depth,
+                              max_depth, requires, context);
     if (obj == NULL) {
         return NULL;
     }
@@ -2030,13 +1919,26 @@ PyArray_FromArray(PyArrayObject *arr, PyArray_Descr *newtype, int flags)
         if ((flags & NPY_ARRAY_ENSUREARRAY)) {
             subok = 0;
         }
+        Py_INCREF(newtype);
         ret = (PyArrayObject *)PyArray_NewLikeArray(arr, order,
                                                     newtype, subok);
         if (ret == NULL) {
+            Py_DECREF(newtype);
             return NULL;
         }
 
-        if (PyArray_CopyInto(ret, arr) < 0) {
+        int actual_ndim = PyArray_NDIM(ret);
+        PyArray_Descr *actual_dtype = PyArray_DESCR(ret);
+        ((PyArrayObject_fields *)ret)->nd = PyArray_NDIM(arr);
+        ((PyArrayObject_fields *)ret)->descr = newtype;
+
+        int success = PyArray_CopyInto(ret, arr);
+
+        Py_DECREF(newtype);
+        ((PyArrayObject_fields *)ret)->nd = actual_ndim;
+        ((PyArrayObject_fields *)ret)->descr = actual_dtype;
+
+        if (success < 0) {
             Py_DECREF(ret);
             return NULL;
         }
@@ -2831,7 +2733,7 @@ PyArray_CopyAsFlat(PyArrayObject *dst, PyArrayObject *src, NPY_ORDER order)
     }
 
     if (res == 0 && !(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
-        int fpes = npy_get_floatstatus_barrier((char *)src_iter);
+        int fpes = npy_get_floatstatus_barrier((char *)&src_iter);
         if (fpes && PyUFunc_GiveFloatingpointErrors("cast", fpes) < 0) {
             return -1;
         }
@@ -2955,23 +2857,13 @@ PyArray_Zeros(int nd, npy_intp const *dims, PyArray_Descr *type, int is_f_order)
             &PyArray_Type, type,
             nd, dims, NULL, NULL,
             is_f_order, NULL, NULL,
-            1, 0);
+            _NPY_ARRAY_ZEROED);
 
     if (ret == NULL) {
         return NULL;
     }
 
-    /* handle objects */
-    if (PyDataType_REFCHK(PyArray_DESCR(ret))) {
-        if (_zerofill(ret) < 0) {
-            Py_DECREF(ret);
-            return NULL;
-        }
-    }
-
-
     return (PyObject *)ret;
-
 }
 
 /*NUMPY_API
@@ -3242,11 +3134,11 @@ _calc_length(PyObject *start, PyObject *stop, PyObject *step, PyObject **next, i
 NPY_NO_EXPORT PyObject *
 PyArray_ArangeObj(PyObject *start, PyObject *stop, PyObject *step, PyArray_Descr *dtype)
 {
-    PyArrayObject *range;
+    PyArrayObject *range = NULL;
     PyArray_ArrFuncs *funcs;
-    PyObject *next, *err;
-    npy_intp length;
+    PyObject *next = NULL;
     PyArray_Descr *native = NULL;
+    npy_intp length;
     int swap;
     NPY_BEGIN_THREADS_DEF;
 
@@ -3259,94 +3151,100 @@ PyArray_ArangeObj(PyObject *start, PyObject *stop, PyObject *step, PyArray_Descr
         return (PyObject *)datetime_arange(start, stop, step, dtype);
     }
 
-    if (!dtype) {
-        PyArray_Descr *deftype;
-        PyArray_Descr *newtype;
+    /* We need to replace many of these, so hold on for easier cleanup */
+    Py_XINCREF(start);
+    Py_XINCREF(stop);
+    Py_XINCREF(step);
+    Py_XINCREF(dtype);
 
+    if (!dtype) {
         /* intentionally made to be at least NPY_LONG */
-        deftype = PyArray_DescrFromType(NPY_LONG);
-        newtype = PyArray_DescrFromObject(start, deftype);
-        Py_DECREF(deftype);
-        if (newtype == NULL) {
-            return NULL;
+        dtype = PyArray_DescrFromType(NPY_LONG);
+        Py_SETREF(dtype, PyArray_DescrFromObject(start, dtype));
+        if (dtype == NULL) {
+            goto fail;
         }
-        deftype = newtype;
         if (stop && stop != Py_None) {
-            newtype = PyArray_DescrFromObject(stop, deftype);
-            Py_DECREF(deftype);
-            if (newtype == NULL) {
-                return NULL;
+            Py_SETREF(dtype, PyArray_DescrFromObject(stop, dtype));
+            if (dtype == NULL) {
+                goto fail;
             }
-            deftype = newtype;
         }
         if (step && step != Py_None) {
-            newtype = PyArray_DescrFromObject(step, deftype);
-            Py_DECREF(deftype);
-            if (newtype == NULL) {
-                return NULL;
+            Py_SETREF(dtype, PyArray_DescrFromObject(step, dtype));
+            if (dtype == NULL) {
+                goto fail;
             }
-            deftype = newtype;
         }
-        dtype = deftype;
+    }
+
+    /*
+     * If dtype is not in native byte-order then get native-byte
+     * order version.  And then swap on the way out.
+     */
+    if (!PyArray_ISNBO(dtype->byteorder)) {
+        native = PyArray_DescrNewByteorder(dtype, NPY_NATBYTE);
+        if (native == NULL) {
+            goto fail;
+        }
+        swap = 1;
     }
     else {
         Py_INCREF(dtype);
+        native = dtype;
+        swap = 0;
     }
-    if (!step || step == Py_None) {
-        step = PyLong_FromLong(1);
+
+    funcs = native->f;
+    if (!funcs->fill) {
+        /* This effectively forbids subarray types as well... */
+        PyErr_Format(PyExc_TypeError,
+                "arange() not supported for inputs with DType %S.",
+                Py_TYPE(dtype));
+        goto fail;
     }
-    else {
-        Py_XINCREF(step);
+
+    if (!step || step == Py_None) {
+        Py_XSETREF(step, PyLong_FromLong(1));
+        if (step == NULL) {
+            goto fail;
+        }
     }
     if (!stop || stop == Py_None) {
-        stop = start;
+        Py_XSETREF(stop, start);
         start = PyLong_FromLong(0);
+        if (start == NULL) {
+            goto fail;
+        }
     }
-    else {
-        Py_INCREF(start);
-    }
+
     /* calculate the length and next = start + step*/
     length = _calc_length(start, stop, step, &next,
                           PyTypeNum_ISCOMPLEX(dtype->type_num));
-    err = PyErr_Occurred();
+    PyObject *err = PyErr_Occurred();
     if (err) {
-        Py_DECREF(dtype);
-        if (err && PyErr_GivenExceptionMatches(err, PyExc_OverflowError)) {
+        if (PyErr_GivenExceptionMatches(err, PyExc_OverflowError)) {
             PyErr_SetString(PyExc_ValueError, "Maximum allowed size exceeded");
         }
         goto fail;
     }
     if (length <= 0) {
         length = 0;
-        range = (PyArrayObject *)PyArray_SimpleNewFromDescr(1, &length, dtype);
-        Py_DECREF(step);
-        Py_DECREF(start);
-        return (PyObject *)range;
-    }
-
-    /*
-     * If dtype is not in native byte-order then get native-byte
-     * order version.  And then swap on the way out.
-     */
-    if (!PyArray_ISNBO(dtype->byteorder)) {
-        native = PyArray_DescrNewByteorder(dtype, NPY_NATBYTE);
-        swap = 1;
-    }
-    else {
-        native = dtype;
-        swap = 0;
     }
 
+    Py_INCREF(native);
     range = (PyArrayObject *)PyArray_SimpleNewFromDescr(1, &length, native);
     if (range == NULL) {
         goto fail;
     }
 
+    if (length == 0) {
+        goto finish;
+    }
     /*
      * place start in the buffer and the next value in the second position
      * if length > 2, then call the inner loop, otherwise stop
      */
-    funcs = PyArray_DESCR(range)->f;
     if (funcs->setitem(start, PyArray_DATA(range), range) < 0) {
         goto fail;
     }
@@ -3360,11 +3258,7 @@ PyArray_ArangeObj(PyObject *start, PyObject *stop, PyObject *step, PyArray_Descr
     if (length == 2) {
         goto finish;
     }
-    if (!funcs->fill) {
-        PyErr_SetString(PyExc_ValueError, "no fill-function for data-type.");
-        Py_DECREF(range);
-        goto fail;
-    }
+
     NPY_BEGIN_THREADS_DESCR(PyArray_DESCR(range));
     funcs->fill(PyArray_DATA(range), length, range);
     NPY_END_THREADS;
@@ -3376,20 +3270,31 @@ PyArray_ArangeObj(PyObject *start, PyObject *stop, PyObject *step, PyArray_Descr
     if (swap) {
         PyObject *new;
         new = PyArray_Byteswap(range, 1);
+        if (new == NULL) {
+            goto fail;
+        }
         Py_DECREF(new);
+        /* Replace dtype after swapping in-place above: */
         Py_DECREF(PyArray_DESCR(range));
-        /* steals the reference */
+        Py_INCREF(dtype);
         ((PyArrayObject_fields *)range)->descr = dtype;
     }
+    Py_DECREF(dtype);
+    Py_DECREF(native);
     Py_DECREF(start);
+    Py_DECREF(stop);
     Py_DECREF(step);
-    Py_DECREF(next);
+    Py_XDECREF(next);
     return (PyObject *)range;
 
  fail:
-    Py_DECREF(start);
-    Py_DECREF(step);
+    Py_XDECREF(dtype);
+    Py_XDECREF(native);
+    Py_XDECREF(start);
+    Py_XDECREF(stop);
+    Py_XDECREF(step);
     Py_XDECREF(next);
+    Py_XDECREF(range);
     return NULL;
 }
 
@@ -3606,7 +3511,7 @@ PyArray_FromFile(FILE *fp, PyArray_Descr *dtype, npy_intp num, char *sep)
                 &PyArray_Type, dtype,
                 1, &num, NULL, NULL,
                 0, NULL, NULL,
-                0, 1);
+                _NPY_ARRAY_ALLOW_EMPTY_STRING);
     }
     if ((sep == NULL) || (strlen(sep) == 0)) {
         ret = array_fromfile_binary(fp, dtype, num, &nread);
diff --git a/numpy/core/src/multiarray/ctors.h b/numpy/core/src/multiarray/ctors.h
index 98160b1cc..a876ab2c0 100644
--- a/numpy/core/src/multiarray/ctors.h
+++ b/numpy/core/src/multiarray/ctors.h
@@ -13,12 +13,29 @@ PyArray_NewFromDescrAndBase(
         npy_intp const *dims, npy_intp const *strides, void *data,
         int flags, PyObject *obj, PyObject *base);
 
+
+/* Private options for NewFromDescriptor */
+typedef enum {
+    /*
+     * Indicate the the array should be zeroed, we may use calloc to do so
+     * (otherwise much like ).
+     */
+    _NPY_ARRAY_ZEROED = 1 << 0,
+    /* Whether to allow empty strings (implied by ensure dtype identity) */
+    _NPY_ARRAY_ALLOW_EMPTY_STRING = 1 << 1,
+    /*
+     * If we take a view into an existing array and use its dtype, then that
+     * dtype must be preserved (for example for subarray and S0, but also
+     * possibly for future dtypes that store more metadata).
+     */
+    _NPY_ARRAY_ENSURE_DTYPE_IDENTITY = 1 << 2,
+} _NPY_CREATION_FLAGS;
+
 NPY_NO_EXPORT PyObject *
 PyArray_NewFromDescr_int(
         PyTypeObject *subtype, PyArray_Descr *descr, int nd,
         npy_intp const *dims, npy_intp const *strides, void *data,
-        int flags, PyObject *obj, PyObject *base, int zeroed,
-        int allow_emptystring);
+        int flags, PyObject *obj, PyObject *base, _NPY_CREATION_FLAGS cflags);
 
 NPY_NO_EXPORT PyObject *
 PyArray_NewLikeArrayWithShape(
@@ -36,10 +53,20 @@ _array_from_array_like(PyObject *op,
         int never_copy);
 
 NPY_NO_EXPORT PyObject *
+PyArray_FromAny_int(PyObject *op, PyArray_Descr *in_descr,
+                    PyArray_DTypeMeta *in_DType, int min_depth, int max_depth,
+                    int flags, PyObject *context);
+
+NPY_NO_EXPORT PyObject *
 PyArray_FromAny(PyObject *op, PyArray_Descr *newtype, int min_depth,
                 int max_depth, int flags, PyObject *context);
 
 NPY_NO_EXPORT PyObject *
+PyArray_CheckFromAny_int(PyObject *op, PyArray_Descr *in_descr,
+                         PyArray_DTypeMeta *in_DType, int min_depth,
+                         int max_depth, int requires, PyObject *context);
+
+NPY_NO_EXPORT PyObject *
 PyArray_CheckFromAny(PyObject *op, PyArray_Descr *descr, int min_depth,
                      int max_depth, int requires, PyObject *context);
 
diff --git a/numpy/core/src/multiarray/datetime.c b/numpy/core/src/multiarray/datetime.c
index 1ea4c9dbb..820f40fd8 100644
--- a/numpy/core/src/multiarray/datetime.c
+++ b/numpy/core/src/multiarray/datetime.c
@@ -1622,6 +1622,12 @@ compute_datetime_metadata_greatest_common_divisor(
 
     return 0;
 
+    /*
+     * We do not use `DTypePromotionError` below.  The reason this is that a
+     * `DTypePromotionError` indicates that `arr_dt1 != arr_dt2` for
+     * all values, but this is wrong for "0".  This could be changed but
+     * for now we consider them errors that occur _while_ promoting.
+     */
 incompatible_units: {
         PyObject *umeta1 = metastr_to_unicode(meta1, 0);
         if (umeta1 == NULL) {
@@ -4089,12 +4095,12 @@ PyArray_InitializeDatetimeCasts()
         .nout = 1,
         .casting = NPY_UNSAFE_CASTING,
         .flags = NPY_METH_SUPPORTS_UNALIGNED,
-        .slots = slots,
         .dtypes = dtypes,
+        .slots = slots,
     };
     slots[0].slot = NPY_METH_resolve_descriptors;
     slots[0].pfunc = &time_to_time_resolve_descriptors;
-    slots[1].slot = NPY_METH_get_loop;
+    slots[1].slot = _NPY_METH_get_loop;
     slots[1].pfunc = &time_to_time_get_loop;
     slots[2].slot = 0;
     slots[2].pfunc = NULL;
@@ -4124,7 +4130,7 @@ PyArray_InitializeDatetimeCasts()
 
     slots[0].slot = NPY_METH_resolve_descriptors;
     slots[0].pfunc = &datetime_to_timedelta_resolve_descriptors;
-    slots[1].slot = NPY_METH_get_loop;
+    slots[1].slot = _NPY_METH_get_loop;
     slots[1].pfunc = &legacy_cast_get_strided_loop;
     slots[2].slot = 0;
     slots[2].pfunc = NULL;
@@ -4197,7 +4203,7 @@ PyArray_InitializeDatetimeCasts()
     slots[0].slot = NPY_METH_resolve_descriptors;
     slots[0].pfunc = &time_to_string_resolve_descriptors;
     /* Strided loop differs for the two */
-    slots[1].slot = NPY_METH_get_loop;
+    slots[1].slot = _NPY_METH_get_loop;
     slots[2].slot = 0;
     slots[2].pfunc = NULL;
 
@@ -4246,7 +4252,7 @@ PyArray_InitializeDatetimeCasts()
     /* The default type resolution should work fine. */
     slots[0].slot = NPY_METH_resolve_descriptors;
     slots[0].pfunc = &string_to_datetime_cast_resolve_descriptors;
-    slots[1].slot = NPY_METH_get_loop;
+    slots[1].slot = _NPY_METH_get_loop;
     slots[1].pfunc = &string_to_datetime_cast_get_loop;
     slots[2].slot = 0;
     slots[2].pfunc = NULL;
diff --git a/numpy/core/src/multiarray/datetime_busday.c b/numpy/core/src/multiarray/datetime_busday.c
index d3e9e1451..93ed0972e 100644
--- a/numpy/core/src/multiarray/datetime_busday.c
+++ b/numpy/core/src/multiarray/datetime_busday.c
@@ -365,6 +365,7 @@ apply_business_day_count(npy_datetime date_begin, npy_datetime date_end,
                     npy_datetime *holidays_begin, npy_datetime *holidays_end)
 {
     npy_int64 count, whole_weeks;
+
     int day_of_week = 0;
     int swapped = 0;
 
@@ -386,6 +387,10 @@ apply_business_day_count(npy_datetime date_begin, npy_datetime date_end,
         date_begin = date_end;
         date_end = tmp;
         swapped = 1;
+        // we swapped date_begin and date_end, so we need to correct for the
+        // original date_end that should not be included. gh-23197
+        date_begin++;
+        date_end++;
     }
 
     /* Remove any earlier holidays */
diff --git a/numpy/core/src/multiarray/descriptor.c b/numpy/core/src/multiarray/descriptor.c
index 2ab70ea14..68d398d64 100644
--- a/numpy/core/src/multiarray/descriptor.c
+++ b/numpy/core/src/multiarray/descriptor.c
@@ -74,9 +74,6 @@ _try_convert_from_ctypes_type(PyTypeObject *type)
     return (PyArray_Descr *)res;
 }
 
-static PyArray_Descr *
-_convert_from_any(PyObject *obj, int align);
-
 /*
  * This function creates a dtype object when the object has a "dtype" attribute,
  * and it can be converted to a dtype object.
@@ -1393,6 +1390,141 @@ PyArray_DescrConverter2(PyObject *obj, PyArray_Descr **at)
     }
 }
 
+
+/**
+ * Check the descriptor is a legacy "flexible" DType instance, this is
+ * an instance which is (normally) not attached to an array, such as a string
+ * of length 0 or a datetime with no unit.
+ * These should be largely deprecated, and represent only the DType class
+ * for most `dtype` parameters.
+ *
+ * TODO: This function should eventually receive a deprecation warning and
+ *       be removed.
+ *
+ * @param descr
+ * @return 1 if this is not a concrete dtype instance 0 otherwise
+ */
+static int
+descr_is_legacy_parametric_instance(PyArray_Descr *descr,
+                                    PyArray_DTypeMeta *DType)
+{
+    if (!NPY_DT_is_legacy(DType)) {
+        return 0;
+    }
+
+    if (PyDataType_ISUNSIZED(descr)) {
+        return 1;
+    }
+    /* Flexible descr with generic time unit (which can be adapted) */
+    if (PyDataType_ISDATETIME(descr)) {
+        PyArray_DatetimeMetaData *meta;
+        meta = get_datetime_metadata_from_dtype(descr);
+        if (meta->base == NPY_FR_GENERIC) {
+            return 1;
+        }
+    }
+    return 0;
+}
+
+
+/**
+ * Given a descriptor (dtype instance), handles conversion of legacy flexible
+ * "unsized" descriptors to their DType.  It returns the DType and descriptor
+ * both results can be NULL (if the input is).  But it always sets the DType
+ * when a descriptor is set.
+ *
+ * @param dtype
+ * @param out_descr
+ * @param out_DType
+ * @return 0 on success -1 on failure
+ */
+NPY_NO_EXPORT int
+PyArray_ExtractDTypeAndDescriptor(PyArray_Descr *dtype,
+        PyArray_Descr **out_descr, PyArray_DTypeMeta **out_DType)
+{
+    *out_DType = NULL;
+    *out_descr = NULL;
+
+    if (dtype != NULL) {
+        *out_DType = NPY_DTYPE(dtype);
+        Py_INCREF(*out_DType);
+        if (!descr_is_legacy_parametric_instance((PyArray_Descr *)dtype,
+                                                    *out_DType)) {
+            *out_descr = (PyArray_Descr *)dtype;
+            Py_INCREF(*out_descr);
+        }
+    }
+    return 0;
+}
+
+
+/**
+ * Converter function filling in an npy_dtype_info struct on success.
+ *
+ * @param obj representing a dtype instance (descriptor) or DType class.
+ * @param[out] npy_dtype_info filled with the DType class and dtype/descriptor
+ *         instance.  The class is always set while the instance may be NULL.
+ *         On error, both will be NULL.
+ * @return 0 on failure and 1 on success (as a converter)
+ */
+NPY_NO_EXPORT int
+PyArray_DTypeOrDescrConverterRequired(PyObject *obj, npy_dtype_info *dt_info)
+{
+    /*
+     * Allow dtype classes pass, this could also be generalized to at least
+     * some scalar types (right now most of these give instances or)
+     */
+    dt_info->dtype = NULL;
+    dt_info->descr = NULL;
+
+    if (PyObject_TypeCheck(obj, &PyArrayDTypeMeta_Type)) {
+        Py_INCREF(obj);
+        dt_info->dtype = (PyArray_DTypeMeta *)obj;
+        dt_info->descr = NULL;
+        return NPY_SUCCEED;
+    }
+    PyArray_Descr *descr;
+    if (PyArray_DescrConverter(obj, &descr) != NPY_SUCCEED) {
+        return NPY_FAIL;
+    }
+    /*
+     * The above converts e.g. "S" or "S0" to the prototype instance, we make
+     * it behave the same as the DType.  This is not fully correct, "S0" should
+     * be considered an instance with actual 0 length.
+     * TODO: It would be nice to fix that eventually.
+     */
+    int res = PyArray_ExtractDTypeAndDescriptor(
+                descr, &dt_info->descr, &dt_info->dtype);
+    Py_DECREF(descr);
+    if (res < 0) {
+        return NPY_FAIL;
+    }
+    return NPY_SUCCEED;
+}
+
+
+/**
+ * Converter function filling in an npy_dtype_info struct on success.  It
+ * accepts `None` and does nothing in that case (user must initialize to
+ * NULL anyway).
+ *
+ * @param obj None or obj representing a dtype instance (descr) or DType class.
+ * @param[out] npy_dtype_info filled with the DType class and dtype/descriptor
+ *         instance.  If `obj` is None, is not modified.  Otherwise the class
+ *         is always set while the instance may be NULL.
+ *         On error, both will be NULL.
+ * @return 0 on failure and 1 on success (as a converter)
+ */
+NPY_NO_EXPORT int
+PyArray_DTypeOrDescrConverterOptional(PyObject *obj, npy_dtype_info *dt_info)
+{
+    if (obj == Py_None) {
+        /* caller must have initialized for the optional version */
+        return NPY_SUCCEED;
+    }
+    return PyArray_DTypeOrDescrConverterRequired(obj, dt_info);
+}
+
 /**
  * Get a dtype instance from a python type
  */
@@ -2315,7 +2447,6 @@ arraydescr_new(PyTypeObject *subtype,
                 PyErr_NoMemory();
                 return NULL;
             }
-            PyObject_Init((PyObject *)descr, subtype);
             descr->f = &NPY_DT_SLOTS(DType)->f;
             Py_XINCREF(DType->scalar_type);
             descr->typeobj = DType->scalar_type;
@@ -3141,25 +3272,15 @@ arraydescr_newbyteorder(PyArray_Descr *self, PyObject *args)
 static PyObject *
 arraydescr_class_getitem(PyObject *cls, PyObject *args)
 {
-    PyObject *generic_alias;
+    const Py_ssize_t args_len = PyTuple_Check(args) ? PyTuple_Size(args) : 1;
 
-#ifdef Py_GENERICALIASOBJECT_H
-    Py_ssize_t args_len;
-
-    args_len = PyTuple_Check(args) ? PyTuple_Size(args) : 1;
     if (args_len != 1) {
         return PyErr_Format(PyExc_TypeError,
                             "Too %s arguments for %s",
                             args_len > 1 ? "many" : "few",
                             ((PyTypeObject *)cls)->tp_name);
     }
-    generic_alias = Py_GenericAlias(cls, args);
-#else
-    PyErr_SetString(PyExc_TypeError,
-                    "Type subscription requires python >= 3.9");
-    generic_alias = NULL;
-#endif
-    return generic_alias;
+    return Py_GenericAlias(cls, args);
 }
 
 static PyMethodDef arraydescr_methods[] = {
@@ -3593,8 +3714,8 @@ NPY_NO_EXPORT PyArray_DTypeMeta PyArrayDescr_TypeFull = {
         .tp_getset = arraydescr_getsets,
         .tp_new = arraydescr_new,
     },},
-    .type_num = -1,
-    .flags = NPY_DT_ABSTRACT,
     .singleton = NULL,
+    .type_num = -1,
     .scalar_type = NULL,
+    .flags = NPY_DT_ABSTRACT,
 };
diff --git a/numpy/core/src/multiarray/descriptor.h b/numpy/core/src/multiarray/descriptor.h
index 7e6f212f2..b14edc3aa 100644
--- a/numpy/core/src/multiarray/descriptor.h
+++ b/numpy/core/src/multiarray/descriptor.h
@@ -1,6 +1,29 @@
 #ifndef NUMPY_CORE_SRC_MULTIARRAY_DESCRIPTOR_H_
 #define NUMPY_CORE_SRC_MULTIARRAY_DESCRIPTOR_H_
 
+
+/*
+ * In some API calls we wish to allow users to pass a DType class or a
+ * dtype instances with different meanings.
+ * This struct is mainly used for the argument parsing in
+ * `PyArray_DTypeOrDescrConverter`.
+ */
+typedef struct {
+    PyArray_DTypeMeta *dtype;
+    PyArray_Descr *descr;
+} npy_dtype_info;
+
+
+NPY_NO_EXPORT int
+PyArray_DTypeOrDescrConverterOptional(PyObject *, npy_dtype_info *dt_info);
+
+NPY_NO_EXPORT int
+PyArray_DTypeOrDescrConverterRequired(PyObject *, npy_dtype_info *dt_info);
+
+NPY_NO_EXPORT int
+PyArray_ExtractDTypeAndDescriptor(PyArray_Descr *dtype,
+        PyArray_Descr **out_descr, PyArray_DTypeMeta **out_DType);
+
 NPY_NO_EXPORT PyObject *arraydescr_protocol_typestr_get(
         PyArray_Descr *, void *);
 NPY_NO_EXPORT PyObject *arraydescr_protocol_descr_get(
diff --git a/numpy/core/src/multiarray/dlpack.c b/numpy/core/src/multiarray/dlpack.c
index 5a46d5a85..d26701df8 100644
--- a/numpy/core/src/multiarray/dlpack.c
+++ b/numpy/core/src/multiarray/dlpack.c
@@ -3,13 +3,11 @@
 
 #define PY_SSIZE_T_CLEAN
 #include <Python.h>
-#include <dlpack/dlpack.h>
 
+#include "dlpack/dlpack.h"
 #include "numpy/arrayobject.h"
-#include "common/npy_argparse.h"
-
-#include "common/dlpack/dlpack.h"
-#include "common/npy_dlpack.h"
+#include "npy_argparse.h"
+#include "npy_dlpack.h"
 
 static void
 array_dlpack_deleter(DLManagedTensor *self)
@@ -133,14 +131,15 @@ array_dlpack(PyArrayObject *self,
     }
 
     if (stream != Py_None) {
-        PyErr_SetString(PyExc_RuntimeError, "NumPy only supports "
-                "stream=None.");
+        PyErr_SetString(PyExc_RuntimeError,
+                "NumPy only supports stream=None.");
         return NULL;
     }
 
     if ( !(PyArray_FLAGS(self) & NPY_ARRAY_WRITEABLE)) {
-        PyErr_SetString(PyExc_TypeError, "NumPy currently only supports "
-                "dlpack for writeable arrays");
+        PyErr_SetString(PyExc_BufferError,
+            "Cannot export readonly array since signalling readonly "
+            "is unsupported by DLPack.");
         return NULL;
     }
 
@@ -152,7 +151,7 @@ array_dlpack(PyArrayObject *self,
     if (!PyArray_IS_C_CONTIGUOUS(self) && PyArray_SIZE(self) != 1) {
         for (int i = 0; i < ndim; ++i) {
             if (shape[i] != 1 && strides[i] % itemsize != 0) {
-                PyErr_SetString(PyExc_RuntimeError,
+                PyErr_SetString(PyExc_BufferError,
                         "DLPack only supports strides which are a multiple of "
                         "itemsize.");
                 return NULL;
@@ -164,15 +163,18 @@ array_dlpack(PyArrayObject *self,
     PyArray_Descr *dtype = PyArray_DESCR(self);
 
     if (PyDataType_ISBYTESWAPPED(dtype)) {
-        PyErr_SetString(PyExc_TypeError, "DLPack only supports native "
-                    "byte swapping.");
+        PyErr_SetString(PyExc_BufferError,
+                "DLPack only supports native byte order.");
             return NULL;
     }
 
     managed_dtype.bits = 8 * itemsize;
     managed_dtype.lanes = 1;
 
-    if (PyDataType_ISSIGNED(dtype)) {
+    if (PyDataType_ISBOOL(dtype)) {
+        managed_dtype.code = kDLBool;
+    }
+    else if (PyDataType_ISSIGNED(dtype)) {
         managed_dtype.code = kDLInt;
     }
     else if (PyDataType_ISUNSIGNED(dtype)) {
@@ -182,8 +184,9 @@ array_dlpack(PyArrayObject *self,
         // We can't be sure that the dtype is
         // IEEE or padded.
         if (itemsize > 8) {
-            PyErr_SetString(PyExc_TypeError, "DLPack only supports IEEE "
-                    "floating point types without padding.");
+            PyErr_SetString(PyExc_BufferError,
+                    "DLPack only supports IEEE floating point types "
+                    "without padding (longdouble typically is not IEEE).");
             return NULL;
         }
         managed_dtype.code = kDLFloat;
@@ -192,16 +195,17 @@ array_dlpack(PyArrayObject *self,
         // We can't be sure that the dtype is
         // IEEE or padded.
         if (itemsize > 16) {
-            PyErr_SetString(PyExc_TypeError, "DLPack only supports IEEE "
-                    "complex point types without padding.");
+            PyErr_SetString(PyExc_BufferError,
+                    "DLPack only supports IEEE floating point types "
+                    "without padding (longdouble typically is not IEEE).");
             return NULL;
         }
         managed_dtype.code = kDLComplex;
     }
     else {
-        PyErr_SetString(PyExc_TypeError,
-                        "DLPack only supports signed/unsigned integers, float "
-                        "and complex dtypes.");
+        PyErr_SetString(PyExc_BufferError,
+                "DLPack only supports signed/unsigned integers, float "
+                "and complex dtypes.");
         return NULL;
     }
 
@@ -330,6 +334,11 @@ from_dlpack(PyObject *NPY_UNUSED(self), PyObject *obj) {
     const uint8_t bits = managed->dl_tensor.dtype.bits;
     const npy_intp itemsize = bits / 8;
     switch (managed->dl_tensor.dtype.code) {
+    case kDLBool:
+        if (bits == 8) {
+            typenum = NPY_BOOL;
+        }
+        break;
     case kDLInt:
         switch (bits)
         {
diff --git a/numpy/core/src/multiarray/dtype_transfer.c b/numpy/core/src/multiarray/dtype_transfer.c
index c588494e7..f79662040 100644
--- a/numpy/core/src/multiarray/dtype_transfer.c
+++ b/numpy/core/src/multiarray/dtype_transfer.c
@@ -32,6 +32,7 @@
 
 #include "shape.h"
 #include "dtype_transfer.h"
+#include "dtype_traversal.h"
 #include "alloc.h"
 #include "dtypemeta.h"
 #include "array_method.h"
@@ -73,18 +74,6 @@ _safe_print(PyObject *obj)
 }
 #endif
 
-/*
- * Returns a transfer function which DECREFs any references in src_type.
- *
- * Returns NPY_SUCCEED or NPY_FAIL.
- */
-static int
-get_decref_transfer_function(int aligned,
-                            npy_intp src_stride,
-                            PyArray_Descr *src_dtype,
-                            NPY_cast_info *cast_info,
-                            int *out_needs_api);
-
 
 /*************************** COPY REFERENCES *******************************/
 
@@ -159,7 +148,7 @@ typedef struct {
     NpyAuxData base;
     PyArray_GetItemFunc *getitem;
     PyArrayObject_fields arr_fields;
-    NPY_cast_info decref_src;
+    NPY_traverse_info decref_src;
 } _any_to_object_auxdata;
 
 
@@ -169,7 +158,7 @@ _any_to_object_auxdata_free(NpyAuxData *auxdata)
     _any_to_object_auxdata *data = (_any_to_object_auxdata *)auxdata;
 
     Py_DECREF(data->arr_fields.descr);
-    NPY_cast_info_xfree(&data->decref_src);
+    NPY_traverse_info_xfree(&data->decref_src);
     PyMem_Free(data);
 }
 
@@ -187,7 +176,7 @@ _any_to_object_auxdata_clone(NpyAuxData *auxdata)
     Py_INCREF(res->arr_fields.descr);
 
     if (data->decref_src.func != NULL) {
-        if (NPY_cast_info_copy(&res->decref_src, &data->decref_src) < 0) {
+        if (NPY_traverse_info_copy(&res->decref_src, &data->decref_src) < 0) {
             NPY_AUXDATA_FREE((NpyAuxData *)res);
             return NULL;
         }
@@ -228,8 +217,8 @@ _strided_to_strided_any_to_object(
     }
     if (data->decref_src.func != NULL) {
         /* If necessary, clear the input buffer (`move_references`) */
-        if (data->decref_src.func(&data->decref_src.context,
-                &orig_src, &N, &src_stride, data->decref_src.auxdata) < 0) {
+        if (data->decref_src.func(NULL, data->decref_src.descr,
+                orig_src, N, src_stride, data->decref_src.auxdata) < 0) {
             return -1;
         }
     }
@@ -265,18 +254,18 @@ any_to_object_get_loop(
     data->arr_fields.nd = 0;
 
     data->getitem = context->descriptors[0]->f->getitem;
-    NPY_cast_info_init(&data->decref_src);
+    NPY_traverse_info_init(&data->decref_src);
 
     if (move_references && PyDataType_REFCHK(context->descriptors[0])) {
-        int needs_api;
-        if (get_decref_transfer_function(
+        NPY_ARRAYMETHOD_FLAGS clear_flags;
+        if (PyArray_GetClearFunction(
                 aligned, strides[0], context->descriptors[0],
-                &data->decref_src,
-                &needs_api) == NPY_FAIL)  {
+                &data->decref_src, &clear_flags) < 0)  {
             NPY_AUXDATA_FREE(*out_transferdata);
             *out_transferdata = NULL;
             return -1;
         }
+        *flags = PyArrayMethod_COMBINED_FLAGS(*flags, clear_flags);
     }
     return 0;
 }
@@ -578,7 +567,7 @@ wrap_copy_swap_function(
             &PyArray_Type, dtype,
             1, &shape, NULL, NULL,
             0, NULL, NULL,
-            0, 1);
+            _NPY_ARRAY_ENSURE_DTYPE_IDENTITY);
     if (data->arr == NULL) {
         PyMem_Free(data);
         return NPY_FAIL;
@@ -1327,7 +1316,7 @@ get_legacy_dtype_cast_function(
             &PyArray_Type, tmp_dtype,
             1, &shape, NULL, NULL,
             0, NULL, NULL,
-            0, 1);
+            _NPY_ARRAY_ENSURE_DTYPE_IDENTITY);
     if (data->aip == NULL) {
         PyMem_Free(data);
         return NPY_FAIL;
@@ -1354,7 +1343,7 @@ get_legacy_dtype_cast_function(
             &PyArray_Type, tmp_dtype,
             1, &shape, NULL, NULL,
             0, NULL, NULL,
-            0, 1);
+            _NPY_ARRAY_ENSURE_DTYPE_IDENTITY);
     if (data->aop == NULL) {
         Py_DECREF(data->aip);
         PyMem_Free(data);
@@ -1393,7 +1382,7 @@ typedef struct {
     npy_intp N;
     NPY_cast_info wrapped;
     /* If finish->func is non-NULL the source needs a decref */
-    NPY_cast_info decref_src;
+    NPY_traverse_info decref_src;
 } _one_to_n_data;
 
 /* transfer data free function */
@@ -1401,7 +1390,7 @@ static void _one_to_n_data_free(NpyAuxData *data)
 {
     _one_to_n_data *d = (_one_to_n_data *)data;
     NPY_cast_info_xfree(&d->wrapped);
-    NPY_cast_info_xfree(&d->decref_src);
+    NPY_traverse_info_xfree(&d->decref_src);
     PyMem_Free(data);
 }
 
@@ -1420,7 +1409,7 @@ static NpyAuxData *_one_to_n_data_clone(NpyAuxData *data)
     newdata->base.clone = &_one_to_n_data_clone;
     newdata->N = d->N;
     /* Initialize in case of error, or if it is unused */
-    NPY_cast_info_init(&newdata->decref_src);
+    NPY_traverse_info_init(&newdata->decref_src);
 
     if (NPY_cast_info_copy(&newdata->wrapped, &d->wrapped) < 0) {
         _one_to_n_data_free((NpyAuxData *)newdata);
@@ -1430,7 +1419,7 @@ static NpyAuxData *_one_to_n_data_clone(NpyAuxData *data)
         return (NpyAuxData *)newdata;
     }
 
-    if (NPY_cast_info_copy(&newdata->decref_src, &d->decref_src) < 0) {
+    if (NPY_traverse_info_copy(&newdata->decref_src, &d->decref_src) < 0) {
         _one_to_n_data_free((NpyAuxData *)newdata);
         return NULL;
     }
@@ -1490,8 +1479,8 @@ _strided_to_strided_one_to_n_with_finish(
             return -1;
         }
 
-        if (d->decref_src.func(&d->decref_src.context,
-                &src, &one_item, &zero_stride, d->decref_src.auxdata) < 0) {
+        if (d->decref_src.func(NULL, d->decref_src.descr,
+                src, one_item, zero_stride, d->decref_src.auxdata) < 0) {
             return -1;
         }
 
@@ -1522,7 +1511,7 @@ get_one_to_n_transfer_function(int aligned,
     data->base.free = &_one_to_n_data_free;
     data->base.clone = &_one_to_n_data_clone;
     data->N = N;
-    NPY_cast_info_init(&data->decref_src);  /* In case of error */
+    NPY_traverse_info_init(&data->decref_src);  /* In case of error */
 
     /*
      * move_references is set to 0, handled in the wrapping transfer fn,
@@ -1542,15 +1531,14 @@ get_one_to_n_transfer_function(int aligned,
 
     /* If the src object will need a DECREF, set src_dtype */
     if (move_references && PyDataType_REFCHK(src_dtype)) {
-        *out_flags |= NPY_METH_REQUIRES_PYAPI;
-        if (get_decref_transfer_function(aligned,
-                            src_stride,
-                            src_dtype,
-                            &data->decref_src,
-                            NULL) != NPY_SUCCEED) {
+        NPY_ARRAYMETHOD_FLAGS clear_flags;
+        if (PyArray_GetClearFunction(
+                aligned, src_stride, src_dtype,
+                &data->decref_src, &clear_flags) < 0) {
             NPY_AUXDATA_FREE((NpyAuxData *)data);
             return NPY_FAIL;
         }
+        *out_flags = PyArrayMethod_COMBINED_FLAGS(*out_flags, clear_flags);
     }
 
     if (data->decref_src.func == NULL) {
@@ -1741,8 +1729,8 @@ typedef struct {
 typedef struct {
     NpyAuxData base;
     NPY_cast_info wrapped;
-    NPY_cast_info decref_src;
-    NPY_cast_info decref_dst;  /* The use-case should probably be deprecated */
+    NPY_traverse_info decref_src;
+    NPY_traverse_info decref_dst;  /* The use-case should probably be deprecated */
     npy_intp src_N, dst_N;
     /* This gets a run-length encoded representation of the transfer */
     npy_intp run_count;
@@ -1755,8 +1743,8 @@ static void _subarray_broadcast_data_free(NpyAuxData *data)
 {
     _subarray_broadcast_data *d = (_subarray_broadcast_data *)data;
     NPY_cast_info_xfree(&d->wrapped);
-    NPY_cast_info_xfree(&d->decref_src);
-    NPY_cast_info_xfree(&d->decref_dst);
+    NPY_traverse_info_xfree(&d->decref_src);
+    NPY_traverse_info_xfree(&d->decref_dst);
     PyMem_Free(data);
 }
 
@@ -1780,21 +1768,21 @@ static NpyAuxData *_subarray_broadcast_data_clone(NpyAuxData *data)
     newdata->run_count = d->run_count;
     memcpy(newdata->offsetruns, d->offsetruns, offsetruns_size);
 
-    NPY_cast_info_init(&newdata->decref_src);
-    NPY_cast_info_init(&newdata->decref_dst);
+    NPY_traverse_info_init(&newdata->decref_src);
+    NPY_traverse_info_init(&newdata->decref_dst);
 
     if (NPY_cast_info_copy(&newdata->wrapped, &d->wrapped) < 0) {
         _subarray_broadcast_data_free((NpyAuxData *)newdata);
         return NULL;
     }
     if (d->decref_src.func != NULL) {
-        if (NPY_cast_info_copy(&newdata->decref_src, &d->decref_src) < 0) {
+        if (NPY_traverse_info_copy(&newdata->decref_src, &d->decref_src) < 0) {
             _subarray_broadcast_data_free((NpyAuxData *) newdata);
             return NULL;
         }
     }
     if (d->decref_dst.func != NULL) {
-        if (NPY_cast_info_copy(&newdata->decref_dst, &d->decref_dst) < 0) {
+        if (NPY_traverse_info_copy(&newdata->decref_dst, &d->decref_dst) < 0) {
             _subarray_broadcast_data_free((NpyAuxData *) newdata);
             return NULL;
         }
@@ -1883,8 +1871,8 @@ _strided_to_strided_subarray_broadcast_withrefs(
             }
             else {
                 if (d->decref_dst.func != NULL) {
-                    if (d->decref_dst.func(&d->decref_dst.context,
-                            &dst_ptr, &count, &dst_subitemsize,
+                    if (d->decref_dst.func(NULL, d->decref_dst.descr,
+                            dst_ptr, count, dst_subitemsize,
                             d->decref_dst.auxdata) < 0) {
                         return -1;
                     }
@@ -1895,8 +1883,8 @@ _strided_to_strided_subarray_broadcast_withrefs(
         }
 
         if (d->decref_src.func != NULL) {
-            if (d->decref_src.func(&d->decref_src.context,
-                    &src, &d->src_N, &src_subitemsize,
+            if (d->decref_src.func(NULL, d->decref_src.descr,
+                    src, d->src_N, src_subitemsize,
                     d->decref_src.auxdata) < 0) {
                 return -1;
             }
@@ -1939,8 +1927,8 @@ get_subarray_broadcast_transfer_function(int aligned,
     data->src_N = src_size;
     data->dst_N = dst_size;
 
-    NPY_cast_info_init(&data->decref_src);
-    NPY_cast_info_init(&data->decref_dst);
+    NPY_traverse_info_init(&data->decref_src);
+    NPY_traverse_info_init(&data->decref_dst);
 
     /*
      * move_references is set to 0, handled in the wrapping transfer fn,
@@ -1959,12 +1947,9 @@ get_subarray_broadcast_transfer_function(int aligned,
 
     /* If the src object will need a DECREF */
     if (move_references && PyDataType_REFCHK(src_dtype)) {
-        if (PyArray_GetDTypeTransferFunction(aligned,
-                        src_dtype->elsize, 0,
-                        src_dtype, NULL,
-                        1,
-                        &data->decref_src,
-                        out_flags) != NPY_SUCCEED) {
+        if (PyArray_GetClearFunction(aligned,
+                        src_dtype->elsize, src_dtype,
+                        &data->decref_src, out_flags) < 0) {
             NPY_AUXDATA_FREE((NpyAuxData *)data);
             return NPY_FAIL;
         }
@@ -1972,12 +1957,9 @@ get_subarray_broadcast_transfer_function(int aligned,
 
     /* If the dst object needs a DECREF to set it to NULL */
     if (PyDataType_REFCHK(dst_dtype)) {
-        if (PyArray_GetDTypeTransferFunction(aligned,
-                        dst_dtype->elsize, 0,
-                        dst_dtype, NULL,
-                        1,
-                        &data->decref_dst,
-                        out_flags) != NPY_SUCCEED) {
+        if (PyArray_GetClearFunction(aligned,
+                        dst_dtype->elsize, dst_dtype,
+                        &data->decref_dst, out_flags) < 0) {
             NPY_AUXDATA_FREE((NpyAuxData *)data);
             return NPY_FAIL;
         }
@@ -2182,6 +2164,7 @@ typedef struct {
 typedef struct {
     NpyAuxData base;
     npy_intp field_count;
+    NPY_traverse_info decref_src;
     _single_field_transfer fields[];
 } _field_transfer_data;
 
@@ -2190,6 +2173,7 @@ typedef struct {
 static void _field_transfer_data_free(NpyAuxData *data)
 {
     _field_transfer_data *d = (_field_transfer_data *)data;
+    NPY_traverse_info_xfree(&d->decref_src);
 
     for (npy_intp i = 0; i < d->field_count; ++i) {
         NPY_cast_info_xfree(&d->fields[i].info);
@@ -2213,6 +2197,10 @@ static NpyAuxData *_field_transfer_data_clone(NpyAuxData *data)
     }
     newdata->base = d->base;
     newdata->field_count = 0;
+    if (NPY_traverse_info_copy(&newdata->decref_src, &d->decref_src) < 0) {
+        PyMem_Free(newdata);
+        return NULL;
+    }
 
     /* Copy all the fields transfer data */
     for (npy_intp i = 0; i < field_count; ++i) {
@@ -2254,6 +2242,11 @@ _strided_to_strided_field_transfer(
                     return -1;
                 }
             }
+            if (d->decref_src.func != NULL && d->decref_src.func(
+                    NULL, d->decref_src.descr, src, blocksize, src_stride,
+                    d->decref_src.auxdata) < 0) {
+                return -1;
+            }
             N -= NPY_LOWLEVEL_BUFFER_BLOCKSIZE;
             src += NPY_LOWLEVEL_BUFFER_BLOCKSIZE*src_stride;
             dst += NPY_LOWLEVEL_BUFFER_BLOCKSIZE*dst_stride;
@@ -2267,6 +2260,11 @@ _strided_to_strided_field_transfer(
                     return -1;
                 }
             }
+            if (d->decref_src.func != NULL && d->decref_src.func(
+                    NULL, d->decref_src.descr, src, blocksize, src_stride,
+                    d->decref_src.auxdata) < 0) {
+                return -1;
+            }
             return 0;
         }
     }
@@ -2313,6 +2311,7 @@ get_fields_transfer_function(int NPY_UNUSED(aligned),
         data->base.free = &_field_transfer_data_free;
         data->base.clone = &_field_transfer_data_clone;
         data->field_count = 0;
+        NPY_traverse_info_init(&data->decref_src);
 
         *out_flags = PyArrayMethod_MINIMAL_FLAGS;
         for (i = 0; i < field_count; ++i) {
@@ -2340,23 +2339,17 @@ get_fields_transfer_function(int NPY_UNUSED(aligned),
         }
 
         /*
-         * If references should be decrefd in src, add another transfer
-         * function to do that. Since a decref function only uses a single
-         * input, the second one (normally output) just does not matter here.
+         * If references should be decrefd in src, add a clear function.
          */
         if (move_references && PyDataType_REFCHK(src_dtype)) {
-            *out_flags |= NPY_METH_REQUIRES_PYAPI;
-            if (get_decref_transfer_function(0,
-                                    src_stride,
-                                    src_dtype,
-                                    &data->fields[field_count].info,
-                                    NULL) != NPY_SUCCEED) {
+            NPY_ARRAYMETHOD_FLAGS clear_flags;
+            if (PyArray_GetClearFunction(
+                    0, src_stride, src_dtype, &data->decref_src,
+                    &clear_flags) < 0) {
                 NPY_AUXDATA_FREE((NpyAuxData *)data);
                 return NPY_FAIL;
             }
-            data->fields[field_count].src_offset = 0;
-            data->fields[field_count].dst_offset = 0;
-            data->field_count = field_count;
+            *out_flags = PyArrayMethod_COMBINED_FLAGS(*out_flags, clear_flags);
         }
 
         *out_stransfer = &_strided_to_strided_field_transfer;
@@ -2384,6 +2377,7 @@ get_fields_transfer_function(int NPY_UNUSED(aligned),
         }
         data->base.free = &_field_transfer_data_free;
         data->base.clone = &_field_transfer_data_clone;
+        NPY_traverse_info_init(&data->decref_src);
 
         key = PyTuple_GET_ITEM(src_dtype->names, 0);
         tup = PyDict_GetItem(src_dtype->fields, key);
@@ -2432,6 +2426,7 @@ get_fields_transfer_function(int NPY_UNUSED(aligned),
     data->base.free = &_field_transfer_data_free;
     data->base.clone = &_field_transfer_data_clone;
     data->field_count = 0;
+    NPY_traverse_info_init(&data->decref_src);
 
     *out_flags = PyArrayMethod_MINIMAL_FLAGS;
     /* set up the transfer function for each field */
@@ -2473,69 +2468,6 @@ get_fields_transfer_function(int NPY_UNUSED(aligned),
     return NPY_SUCCEED;
 }
 
-static int
-get_decref_fields_transfer_function(int NPY_UNUSED(aligned),
-                            npy_intp src_stride,
-                            PyArray_Descr *src_dtype,
-                            PyArrayMethod_StridedLoop **out_stransfer,
-                            NpyAuxData **out_transferdata,
-                            int *out_needs_api)
-{
-    PyObject *names, *key, *tup, *title;
-    PyArray_Descr *src_fld_dtype;
-    npy_int i, structsize;
-    Py_ssize_t field_count;
-    int src_offset;
-
-    names = src_dtype->names;
-    field_count = PyTuple_GET_SIZE(src_dtype->names);
-
-    /* Over-allocating here: less fields may be used */
-    structsize = sizeof(_field_transfer_data) +
-                    field_count * sizeof(_single_field_transfer);
-    /* Allocate the data and populate it */
-    _field_transfer_data *data = PyMem_Malloc(structsize);
-    if (data == NULL) {
-        PyErr_NoMemory();
-        return NPY_FAIL;
-    }
-    data->base.free = &_field_transfer_data_free;
-    data->base.clone = &_field_transfer_data_clone;
-    data->field_count = 0;
-
-    _single_field_transfer *field = data->fields;
-    for (i = 0; i < field_count; ++i) {
-        key = PyTuple_GET_ITEM(names, i);
-        tup = PyDict_GetItem(src_dtype->fields, key);
-        if (!PyArg_ParseTuple(tup, "Oi|O", &src_fld_dtype,
-                                                &src_offset, &title)) {
-            NPY_AUXDATA_FREE((NpyAuxData *)data);
-            return NPY_FAIL;
-        }
-        if (PyDataType_REFCHK(src_fld_dtype)) {
-            if (out_needs_api) {
-                *out_needs_api = 1;
-            }
-            if (get_decref_transfer_function(0,
-                                    src_stride,
-                                    src_fld_dtype,
-                                    &field->info,
-                                    out_needs_api) != NPY_SUCCEED) {
-                NPY_AUXDATA_FREE((NpyAuxData *)data);
-                return NPY_FAIL;
-            }
-            field->src_offset = src_offset;
-            data->field_count++;
-            field++;
-        }
-    }
-
-    *out_stransfer = &_strided_to_strided_field_transfer;
-    *out_transferdata = (NpyAuxData *)data;
-
-    return NPY_SUCCEED;
-}
-
 
 /************************* MASKED TRANSFER WRAPPER *************************/
 
@@ -2544,7 +2476,7 @@ typedef struct {
     /* The transfer function being wrapped (could likely be stored directly) */
     NPY_cast_info wrapped;
     /* The src decref function if necessary */
-    NPY_cast_info decref_src;
+    NPY_traverse_info decref_src;
 } _masked_wrapper_transfer_data;
 
 /* transfer data free function */
@@ -2553,7 +2485,7 @@ _masked_wrapper_transfer_data_free(NpyAuxData *data)
 {
     _masked_wrapper_transfer_data *d = (_masked_wrapper_transfer_data *)data;
     NPY_cast_info_xfree(&d->wrapped);
-    NPY_cast_info_xfree(&d->decref_src);
+    NPY_traverse_info_xfree(&d->decref_src);
     PyMem_Free(data);
 }
 
@@ -2576,7 +2508,7 @@ _masked_wrapper_transfer_data_clone(NpyAuxData *data)
         return NULL;
     }
     if (d->decref_src.func != NULL) {
-        if (NPY_cast_info_copy(&newdata->decref_src, &d->decref_src) < 0) {
+        if (NPY_traverse_info_copy(&newdata->decref_src, &d->decref_src) < 0) {
             NPY_AUXDATA_FREE((NpyAuxData *)newdata);
             return NULL;
         }
@@ -2586,7 +2518,7 @@ _masked_wrapper_transfer_data_clone(NpyAuxData *data)
 }
 
 static int
-_strided_masked_wrapper_decref_transfer_function(
+_strided_masked_wrapper_clear_function(
         PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
         const npy_intp *dimensions, const npy_intp *strides,
         npy_bool *mask, npy_intp mask_stride,
@@ -2603,8 +2535,8 @@ _strided_masked_wrapper_decref_transfer_function(
         /* Skip masked values, still calling decref for move_references */
         mask = (npy_bool*)npy_memchr((char *)mask, 0, mask_stride, N,
                                      &subloopsize, 1);
-        if (d->decref_src.func(&d->decref_src.context,
-                &src, &subloopsize, &src_stride, d->decref_src.auxdata) < 0) {
+        if (d->decref_src.func(NULL, d->decref_src.descr,
+                src, subloopsize, src_stride, d->decref_src.auxdata) < 0) {
             return -1;
         }
         dst += subloopsize * dst_stride;
@@ -2670,139 +2602,18 @@ _strided_masked_wrapper_transfer_function(
 }
 
 
-/*************************** CLEAR SRC *******************************/
 
+/* A no-op function (currently only used for cleanup purposes really) */
 static int
-_dec_src_ref_nop(
+_cast_no_op(
         PyArrayMethod_Context *NPY_UNUSED(context),
         char *const *NPY_UNUSED(args), const npy_intp *NPY_UNUSED(dimensions),
         const npy_intp *NPY_UNUSED(strides), NpyAuxData *NPY_UNUSED(auxdata))
 {
-    /* NOP */
+    /* Do nothing */
     return 0;
 }
 
-static int
-_strided_to_null_dec_src_ref_reference(
-        PyArrayMethod_Context *NPY_UNUSED(context),
-        char *const *args, const npy_intp *dimensions,
-        const npy_intp *strides, NpyAuxData *NPY_UNUSED(auxdata))
-{
-    char *src = args[0];
-    npy_intp N = dimensions[0];
-    npy_intp stride = strides[0];
-
-    PyObject *src_ref = NULL;
-    while (N > 0) {
-        /* Release the reference in src and set it to NULL */
-        NPY_DT_DBG_REFTRACE("dec src ref (null dst)", src_ref);
-        memcpy(&src_ref, src, sizeof(src_ref));
-        Py_XDECREF(src_ref);
-        memset(src, 0, sizeof(PyObject *));
-
-        src += stride;
-        --N;
-    }
-    return 0;
-}
-
-
-/*
- * Get a function to decref.  Currently, this uses a cast info slot, which
- * means that the second (destination) descriptor is always set to NULL
- * and generally does not have to be passed.
- * Since we do not currently have an `ArrayMethod` representing this, the
- * method is also set to NULL.
- *
- * TODO: this function should probably be moved onto the DType eventually,
- *       which would allow for user DTypes to include dynamic allocated
- *       memory or Python objects.
- */
-static int
-get_decref_transfer_function(int aligned,
-                            npy_intp src_stride,
-                            PyArray_Descr *src_dtype,
-                            NPY_cast_info *cast_info,
-                            int *out_needs_api)
-{
-    NPY_cast_info_init(cast_info);
-
-    /* If there are no references, it's a nop */
-    if (!PyDataType_REFCHK(src_dtype)) {
-        cast_info->func = &_dec_src_ref_nop;
-        cast_info->auxdata = NULL;
-        goto finalize;
-    }
-    /* If it's a single reference, it's one decref */
-    else if (src_dtype->type_num == NPY_OBJECT) {
-        if (out_needs_api) {
-            *out_needs_api = 1;
-        }
-
-        cast_info->func = &_strided_to_null_dec_src_ref_reference;
-        cast_info->auxdata = NULL;
-        goto finalize;
-    }
-    /* If there are subarrays, need to wrap it */
-    else if (PyDataType_HASSUBARRAY(src_dtype)) {
-        PyArray_Dims src_shape = {NULL, -1};
-        npy_intp src_size;
-
-        if (out_needs_api) {
-            *out_needs_api = 1;
-        }
-
-        if (!(PyArray_IntpConverter(src_dtype->subarray->shape,
-                                            &src_shape))) {
-            PyErr_SetString(PyExc_ValueError,
-                    "invalid subarray shape");
-            return NPY_FAIL;
-        }
-        src_size = PyArray_MultiplyList(src_shape.ptr, src_shape.len);
-        npy_free_cache_dim_obj(src_shape);
-
-        NPY_ARRAYMETHOD_FLAGS ignored_flags;
-        if (get_n_to_n_transfer_function(aligned,
-                src_stride, 0,
-                src_dtype->subarray->base, NULL, 1, src_size,
-                &cast_info->func, &cast_info->auxdata,
-                &ignored_flags) != NPY_SUCCEED) {
-            return NPY_FAIL;
-        }
-
-        goto finalize;
-    }
-    /* If there are fields, need to do each field */
-    else if (PyDataType_HASFIELDS(src_dtype)) {
-        if (out_needs_api) {
-            *out_needs_api = 1;
-        }
-
-        if (get_decref_fields_transfer_function(aligned,
-                            src_stride, src_dtype,
-                            &cast_info->func, &cast_info->auxdata,
-                            out_needs_api) < 0) {
-            return NPY_FAIL;
-        }
-        goto finalize;
-    }
-    else {
-        PyErr_Format(PyExc_RuntimeError,
-                "Internal error, tried to fetch decref function for the "
-                "unsupported DType '%S'.", src_dtype);
-        return NPY_FAIL;
-    }
-
-  finalize:
-    /* Make sure all important fields are either set or cleared */
-    Py_INCREF(src_dtype);
-    cast_info->descriptors[0] = src_dtype;
-    cast_info->descriptors[1] = NULL;
-    cast_info->context.method = NULL;
-    cast_info->context.caller = NULL;
-    return NPY_SUCCEED;
-}
-
 
 /*
  * ********************* Generalized Multistep Cast ************************
@@ -2951,9 +2762,11 @@ _strided_to_strided_multistep_cast(
 
         if (castdata->from.func != NULL) {
             npy_intp out_stride = castdata->from.descriptors[1]->elsize;
+            char *const data[2] = {src, castdata->from_buffer};
+            npy_intp strides[2] = {src_stride, out_stride};
             if (castdata->from.func(&castdata->from.context,
-                    (char *[2]){src, castdata->from_buffer}, &block_size,
-                    (npy_intp [2]){src_stride, out_stride},
+                    data, &block_size,
+                    strides,
                     castdata->from.auxdata) != 0) {
                 /* TODO: Internal buffer may require cleanup on error. */
                 return -1;
@@ -2975,18 +2788,22 @@ _strided_to_strided_multistep_cast(
             main_dst_stride = dst_stride;
         }
 
+        char *const data[2] = {main_src, main_dst};
+        npy_intp strides[2] = {main_src_stride, main_dst_stride};
         if (castdata->main.func(&castdata->main.context,
-                (char *[2]){main_src, main_dst}, &block_size,
-                (npy_intp [2]){main_src_stride, main_dst_stride},
+                data, &block_size,
+                strides,
                 castdata->main.auxdata) != 0) {
             /* TODO: Internal buffer may require cleanup on error. */
             return -1;
         }
 
         if (castdata->to.func != NULL) {
+            char *const data[2] = {main_dst, dst};
+            npy_intp strides[2] = {main_dst_stride, dst_stride};
             if (castdata->to.func(&castdata->to.context,
-                    (char *[2]){main_dst, dst}, &block_size,
-                    (npy_intp [2]){main_dst_stride, dst_stride},
+                    data, &block_size,
+                    strides,
                     castdata->to.auxdata) != 0) {
                 return -1;
             }
@@ -3004,7 +2821,7 @@ _strided_to_strided_multistep_cast(
  * Initialize most of a cast-info structure, this step does not fetch the
  * transferfunction and transferdata.
  */
-static NPY_INLINE int
+static inline int
 init_cast_info(
         NPY_cast_info *cast_info, NPY_CASTING *casting, npy_intp *view_offset,
         PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype, int main_step)
@@ -3077,14 +2894,14 @@ _clear_cast_info_after_get_loop_failure(NPY_cast_info *cast_info)
     /* As public API we could choose to clear auxdata != NULL */
     assert(cast_info->auxdata == NULL);
     /* Set func to be non-null so that `NPY_cats_info_xfree` does not skip */
-    cast_info->func = &_dec_src_ref_nop;
+    cast_info->func = &_cast_no_op;
     NPY_cast_info_xfree(cast_info);
 }
 
 
 /*
  * Helper for PyArray_GetDTypeTransferFunction, which fetches a single
- * transfer function from the each casting implementation (ArrayMethod).
+ * transfer function from the each casting implementation (ArrayMethod)
  * May set the transfer function to NULL when the cast can be achieved using
  * a view.
  * TODO: Expand the view functionality for general offsets, not just 0:
@@ -3114,6 +2931,8 @@ define_cast_for_descrs(
         int move_references,
         NPY_cast_info *cast_info, NPY_ARRAYMETHOD_FLAGS *out_flags)
 {
+    assert(dst_dtype != NULL);  /* Was previously used for decref */
+
     /* Storage for all cast info in case multi-step casting is necessary */
     _multistep_castdata castdata;
     /* Initialize funcs to NULL to simplify cleanup on error. */
@@ -3158,7 +2977,7 @@ define_cast_for_descrs(
         /* Prepare the actual cast (if necessary): */
         if (from_view_offset == 0 && !must_wrap) {
             /* This step is not necessary and can be skipped */
-            castdata.from.func = &_dec_src_ref_nop;  /* avoid NULL */
+            castdata.from.func = &_cast_no_op;  /* avoid NULL */
             NPY_cast_info_xfree(&castdata.from);
         }
         else {
@@ -3197,7 +3016,7 @@ define_cast_for_descrs(
         /* Prepare the actual cast (if necessary): */
         if (to_view_offset == 0 && !must_wrap) {
             /* This step is not necessary and can be skipped. */
-            castdata.to.func = &_dec_src_ref_nop;  /* avoid NULL */
+            castdata.to.func = &_cast_no_op;  /* avoid NULL */
             NPY_cast_info_xfree(&castdata.to);
         }
         else {
@@ -3273,33 +3092,6 @@ PyArray_GetDTypeTransferFunction(int aligned,
                             NPY_cast_info *cast_info,
                             NPY_ARRAYMETHOD_FLAGS *out_flags)
 {
-    assert(src_dtype != NULL);
-
-    /*
-     * If one of the dtypes is NULL, we give back either a src decref
-     * function or a dst setzero function
-     *
-     * TODO: Eventually, we may wish to support user dtype with references
-     *       (including and beyond bare `PyObject *` this may require extending
-     *       the ArrayMethod API and those paths should likely be split out
-     *       from this function.)
-     */
-    if (dst_dtype == NULL) {
-        assert(move_references);
-        int needs_api = 0;
-        int res = get_decref_transfer_function(aligned,
-                                src_dtype->elsize,
-                                src_dtype,
-                                cast_info,
-                                &needs_api);
-        /* decref'ing never creates floating point errors, so just ignore it */
-        *out_flags = PyArrayMethod_MINIMAL_FLAGS;
-        if (needs_api) {
-            *out_flags |= NPY_METH_REQUIRES_PYAPI;
-        }
-        return res;
-    }
-
     if (define_cast_for_descrs(aligned,
             src_stride, dst_stride,
             src_dtype, dst_dtype, move_references,
@@ -3557,20 +3349,19 @@ PyArray_GetMaskedDTypeTransferFunction(int aligned,
 
     /* If the src object will need a DECREF, get a function to handle that */
     if (move_references && PyDataType_REFCHK(src_dtype)) {
-        *out_flags |= NPY_METH_REQUIRES_PYAPI;
-        if (get_decref_transfer_function(aligned,
-                            src_stride,
-                            src_dtype,
-                            &data->decref_src,
-                            NULL) != NPY_SUCCEED) {
+        NPY_ARRAYMETHOD_FLAGS clear_flags;
+        if (PyArray_GetClearFunction(
+                aligned, src_stride, src_dtype,
+                &data->decref_src, &clear_flags) < 0) {
             NPY_AUXDATA_FREE((NpyAuxData *)data);
             return NPY_FAIL;
         }
+        *out_flags = PyArrayMethod_COMBINED_FLAGS(*out_flags, clear_flags);
         cast_info->func = (PyArrayMethod_StridedLoop *)
-                &_strided_masked_wrapper_decref_transfer_function;
+                &_strided_masked_wrapper_clear_function;
     }
     else {
-        NPY_cast_info_init(&data->decref_src);
+        NPY_traverse_info_init(&data->decref_src);
         cast_info->func = (PyArrayMethod_StridedLoop *)
                 &_strided_masked_wrapper_transfer_function;
     }
diff --git a/numpy/core/src/multiarray/dtype_transfer.h b/numpy/core/src/multiarray/dtype_transfer.h
index 9ae332e38..04df5cb64 100644
--- a/numpy/core/src/multiarray/dtype_transfer.h
+++ b/numpy/core/src/multiarray/dtype_transfer.h
@@ -31,7 +31,7 @@ typedef struct {
  * copied.
  * If set up otherwise, func must be NULL'ed to indicate no-cleanup necessary.
  */
-static NPY_INLINE void
+static inline void
 NPY_cast_info_init(NPY_cast_info *cast_info)
 {
     cast_info->func = NULL;  /* mark as uninitialized. */
@@ -52,7 +52,7 @@ NPY_cast_info_init(NPY_cast_info *cast_info)
  * First checks whether `cast_info.func == NULL`, and assume it is
  * uninitialized in that case.
  */
-static NPY_INLINE void
+static inline void
 NPY_cast_info_xfree(NPY_cast_info *cast_info)
 {
     if (cast_info->func == NULL) {
@@ -71,7 +71,7 @@ NPY_cast_info_xfree(NPY_cast_info *cast_info)
  * Move the data from `original` to `cast_info`. Original is cleared
  * (its func set to NULL).
  */
-static NPY_INLINE void
+static inline void
 NPY_cast_info_move(NPY_cast_info *cast_info, NPY_cast_info *original)
 {
     *cast_info = *original;
@@ -87,7 +87,7 @@ NPY_cast_info_move(NPY_cast_info *cast_info, NPY_cast_info *original)
  * NOTE: It is acceptable to call this with the same struct if the struct
  *       has been filled by a valid memcpy from an initialized one.
  */
-static NPY_INLINE int
+static inline int
 NPY_cast_info_copy(NPY_cast_info *cast_info, NPY_cast_info *original)
 {
     cast_info->context.descriptors = cast_info->descriptors;
@@ -146,7 +146,6 @@ object_to_any_get_loop(
         NpyAuxData **out_transferdata,
         NPY_ARRAYMETHOD_FLAGS *flags);
 
-
 NPY_NO_EXPORT int
 wrap_aligned_transferfunction(
         int aligned, int must_wrap,
diff --git a/numpy/core/src/multiarray/dtype_traversal.c b/numpy/core/src/multiarray/dtype_traversal.c
new file mode 100644
index 000000000..f76119f94
--- /dev/null
+++ b/numpy/core/src/multiarray/dtype_traversal.c
@@ -0,0 +1,651 @@
+/*
+ * This file is simlar to the low-level loops for data type transfer
+ * in `dtype_transfer.c` but for those which only require visiting
+ * a single array (and mutating it in-place).
+ *
+ * As of writing, it is only used for CLEARing, which means mainly
+ * Python object DECREF/dealloc followed by NULL'ing the data
+ * (to support double clearing and ensure data is again in a usable state).
+ * However, memory initialization and traverse follows similar
+ * protocols (although traversal needs additional arguments).
+ */
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+#define _UMATHMODULE
+
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+#include <structmember.h>
+
+#include "numpy/ndarraytypes.h"
+#include "numpy/arrayobject.h"
+
+#include "alloc.h"
+#include "array_method.h"
+#include "dtypemeta.h"
+#include "dtype_traversal.h"
+
+
+/* Buffer size with the same use case as the one in dtype_transfer.c */
+#define NPY_LOWLEVEL_BUFFER_BLOCKSIZE  128
+
+
+typedef int get_traverse_func_function(
+        void *traverse_context, PyArray_Descr *dtype, int aligned,
+        npy_intp stride, NPY_traverse_info *clear_info,
+        NPY_ARRAYMETHOD_FLAGS *flags);
+
+/*
+ * Generic Clear function helpers:
+ */
+
+static int
+get_clear_function(
+        void *traverse_context, PyArray_Descr *dtype, int aligned,
+        npy_intp stride, NPY_traverse_info *clear_info,
+        NPY_ARRAYMETHOD_FLAGS *flags)
+{
+    NPY_traverse_info_init(clear_info);
+    /* not that cleanup code bothers to check e.g. for floating point flags */
+    *flags = PyArrayMethod_MINIMAL_FLAGS;
+
+    get_traverse_loop_function *get_clear = NPY_DT_SLOTS(NPY_DTYPE(dtype))->get_clear_loop;
+    if (get_clear == NULL) {
+        PyErr_Format(PyExc_RuntimeError,
+                "Internal error, `get_clear_loop` not set for the DType '%S'",
+                dtype);
+        return -1;
+    }
+
+    if (get_clear(traverse_context, dtype, aligned, stride,
+                  &clear_info->func, &clear_info->auxdata, flags) <  0) {
+        /* callee should clean up, but make sure outside debug mode */
+        assert(clear_info->func == NULL);
+        clear_info->func = NULL;
+        return -1;
+    }
+    Py_INCREF(dtype);
+    clear_info->descr = dtype;
+
+    return 0;
+}
+
+/*
+ * Helper to set up a strided loop used for clearing.  Clearing means
+ * deallocating any references (e.g. via Py_DECREF) and resetting the data
+ * back into a usable/initialized state (e.g. by NULLing any references).
+ *
+ * The function will error when called on a dtype which does not have
+ * references (and thus the get_clear_loop slot NULL).
+ * Note that old-style user-dtypes use the "void" version.
+ *
+ * NOTE: This function may have a use for a `traverse_context` at some point
+ *       but right now, it is always NULL and only exists to allow adding it
+ *       in the future without changing the strided-loop signature.
+ */
+NPY_NO_EXPORT int
+PyArray_GetClearFunction(
+        int aligned, npy_intp stride, PyArray_Descr *dtype,
+        NPY_traverse_info *clear_info, NPY_ARRAYMETHOD_FLAGS *flags)
+{
+    return get_clear_function(NULL, dtype, aligned, stride, clear_info, flags);
+}
+
+
+/*
+ * Generic zerofill/fill function helper:
+ */
+
+static int
+get_zerofill_function(
+        void *traverse_context, PyArray_Descr *dtype, int aligned,
+        npy_intp stride, NPY_traverse_info *zerofill_info,
+        NPY_ARRAYMETHOD_FLAGS *flags)
+{
+    NPY_traverse_info_init(zerofill_info);
+    /* not that filling code bothers to check e.g. for floating point flags */
+    *flags = PyArrayMethod_MINIMAL_FLAGS;
+
+    get_traverse_loop_function *get_zerofill = NPY_DT_SLOTS(NPY_DTYPE(dtype))->get_fill_zero_loop;
+    if (get_zerofill == NULL) {
+        /* Allowed to be NULL (and accept it here) */
+        return 0;
+    }
+
+    if (get_zerofill(traverse_context, dtype, aligned, stride,
+                     &zerofill_info->func, &zerofill_info->auxdata, flags) <  0) {
+        /* callee should clean up, but make sure outside debug mode */
+        assert(zerofill_info->func == NULL);
+        zerofill_info->func = NULL;
+        return -1;
+    }
+    if (zerofill_info->func == NULL) {
+        /* Zerofill also may return func=NULL without an error. */
+        return 0;
+    }
+
+    Py_INCREF(dtype);
+    zerofill_info->descr = dtype;
+
+    return 0;
+}
+
+
+/****************** Python Object clear ***********************/
+
+static int
+clear_object_strided_loop(
+        void *NPY_UNUSED(traverse_context), PyArray_Descr *NPY_UNUSED(descr),
+        char *data, npy_intp size, npy_intp stride,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    PyObject *aligned_copy = NULL;
+    while (size > 0) {
+        /* Release the reference in src and set it to NULL */
+        memcpy(&aligned_copy, data, sizeof(PyObject *));
+        Py_XDECREF(aligned_copy);
+        memset(data, 0, sizeof(PyObject *));
+
+        data += stride;
+        --size;
+    }
+    return 0;
+}
+
+
+NPY_NO_EXPORT int
+npy_get_clear_object_strided_loop(
+        void *NPY_UNUSED(traverse_context), PyArray_Descr *NPY_UNUSED(descr),
+        int NPY_UNUSED(aligned), npy_intp NPY_UNUSED(fixed_stride),
+        traverse_loop_function **out_loop, NpyAuxData **out_auxdata,
+        NPY_ARRAYMETHOD_FLAGS *flags)
+{
+    *flags = NPY_METH_REQUIRES_PYAPI|NPY_METH_NO_FLOATINGPOINT_ERRORS;
+    *out_loop = &clear_object_strided_loop;
+    return 0;
+}
+
+
+/**************** Python Object zero fill *********************/
+
+static int
+fill_zero_object_strided_loop(
+        void *NPY_UNUSED(traverse_context), PyArray_Descr *NPY_UNUSED(descr),
+        char *data, npy_intp size, npy_intp stride,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    PyObject *zero = PyLong_FromLong(0);
+    while (size--) {
+        Py_INCREF(zero);
+        // assumes `data` doesn't have a pre-existing object inside it
+        memcpy(data, &zero, sizeof(zero));
+        data += stride;
+    }
+    Py_DECREF(zero);
+    return 0;
+}
+
+NPY_NO_EXPORT int
+npy_object_get_fill_zero_loop(void *NPY_UNUSED(traverse_context),
+                              PyArray_Descr *NPY_UNUSED(descr),
+                              int NPY_UNUSED(aligned),
+                              npy_intp NPY_UNUSED(fixed_stride),
+                              traverse_loop_function **out_loop,
+                              NpyAuxData **NPY_UNUSED(out_auxdata),
+                              NPY_ARRAYMETHOD_FLAGS *flags)
+{
+    *flags = NPY_METH_REQUIRES_PYAPI | NPY_METH_NO_FLOATINGPOINT_ERRORS;
+    *out_loop = &fill_zero_object_strided_loop;
+    return 0;
+}
+
+/**************** Structured DType generic funcationality ***************/
+
+/*
+ * Note that legacy user dtypes also make use of this.  Someone managed to
+ * hack objects into them by adding a field that contains objects and this
+ * remains (somewhat) valid.
+ * (Unlike our voids, those fields must be hardcoded probably, but...)
+ *
+ * The below functionality mirrors the casting functionality relatively
+ * closely.
+ */
+
+typedef struct {
+    npy_intp src_offset;
+    NPY_traverse_info info;
+} single_field_traverse_data;
+
+typedef struct {
+    NpyAuxData base;
+    npy_intp field_count;
+    single_field_traverse_data fields[];
+} fields_traverse_data;
+
+
+/* traverse data free function */
+static void
+fields_traverse_data_free(NpyAuxData *data)
+{
+    fields_traverse_data *d = (fields_traverse_data *)data;
+
+    for (npy_intp i = 0; i < d->field_count; ++i) {
+        NPY_traverse_info_xfree(&d->fields[i].info);
+    }
+    PyMem_Free(d);
+}
+
+
+/* traverse data copy function (untested due to no direct use currently) */
+static NpyAuxData *
+fields_traverse_data_clone(NpyAuxData *data)
+{
+    fields_traverse_data *d = (fields_traverse_data *)data;
+
+    npy_intp field_count = d->field_count;
+    npy_intp structsize = sizeof(fields_traverse_data) +
+                    field_count * sizeof(single_field_traverse_data);
+
+    /* Allocate the data and populate it */
+    fields_traverse_data *newdata = PyMem_Malloc(structsize);
+    if (newdata == NULL) {
+        return NULL;
+    }
+    newdata->base = d->base;
+    newdata->field_count = 0;
+
+    /* Copy all the fields transfer data */
+    single_field_traverse_data *in_field = d->fields;
+    single_field_traverse_data *new_field = newdata->fields;
+
+    for (; newdata->field_count < field_count;
+                newdata->field_count++, in_field++, new_field++) {
+        new_field->src_offset = in_field->src_offset;
+
+        if (NPY_traverse_info_copy(&new_field->info, &in_field->info) < 0) {
+            fields_traverse_data_free((NpyAuxData *)newdata);
+            return NULL;
+        }
+    }
+
+    return (NpyAuxData *)newdata;
+}
+
+
+static int
+traverse_fields_function(
+        void *traverse_context, PyArray_Descr *NPY_UNUSED(descr),
+        char *data, npy_intp N, npy_intp stride,
+        NpyAuxData *auxdata)
+{
+    fields_traverse_data *d = (fields_traverse_data *)auxdata;
+    npy_intp i, field_count = d->field_count;
+
+    /* Do the traversing a block at a time for better memory caching */
+    const npy_intp blocksize = NPY_LOWLEVEL_BUFFER_BLOCKSIZE;
+
+    for (;;) {
+        if (N > blocksize) {
+            for (i = 0; i < field_count; ++i) {
+                single_field_traverse_data field = d->fields[i];
+                if (field.info.func(traverse_context,
+                        field.info.descr, data + field.src_offset,
+                        blocksize, stride, field.info.auxdata) < 0) {
+                    return -1;
+                }
+            }
+            N -= blocksize;
+            data += blocksize * stride;
+        }
+        else {
+            for (i = 0; i < field_count; ++i) {
+                single_field_traverse_data field = d->fields[i];
+                if (field.info.func(traverse_context,
+                        field.info.descr, data + field.src_offset,
+                        N, stride, field.info.auxdata) < 0) {
+                    return -1;
+                }
+            }
+            return 0;
+        }
+    }
+}
+
+
+static int
+get_fields_traverse_function(
+        void *traverse_context, PyArray_Descr *dtype, int NPY_UNUSED(aligned),
+        npy_intp stride, traverse_loop_function **out_func,
+        NpyAuxData **out_auxdata, NPY_ARRAYMETHOD_FLAGS *flags,
+        get_traverse_func_function *get_traverse_func)
+{
+    PyObject *names, *key, *tup, *title;
+    PyArray_Descr *fld_dtype;
+    npy_int i, structsize;
+    Py_ssize_t field_count;
+
+    names = dtype->names;
+    field_count = PyTuple_GET_SIZE(dtype->names);
+
+    /* Over-allocating here: less fields may be used */
+    structsize = (sizeof(fields_traverse_data) +
+                    field_count * sizeof(single_field_traverse_data));
+    /* Allocate the data and populate it */
+    fields_traverse_data *data = PyMem_Malloc(structsize);
+    if (data == NULL) {
+        PyErr_NoMemory();
+        return -1;
+    }
+    data->base.free = &fields_traverse_data_free;
+    data->base.clone = &fields_traverse_data_clone;
+    data->field_count = 0;
+
+    single_field_traverse_data *field = data->fields;
+    for (i = 0; i < field_count; ++i) {
+        int offset;
+
+        key = PyTuple_GET_ITEM(names, i);
+        tup = PyDict_GetItem(dtype->fields, key);
+        if (!PyArg_ParseTuple(tup, "Oi|O", &fld_dtype, &offset, &title)) {
+            NPY_AUXDATA_FREE((NpyAuxData *)data);
+            return -1;
+        }
+        if (get_traverse_func == &get_clear_function
+                && !PyDataType_REFCHK(fld_dtype)) {
+            /* No need to do clearing (could change to use NULL return) */
+            continue;
+        }
+        NPY_ARRAYMETHOD_FLAGS clear_flags;
+        if (get_traverse_func(
+                traverse_context, fld_dtype, 0,
+                stride, &field->info, &clear_flags) < 0) {
+            NPY_AUXDATA_FREE((NpyAuxData *)data);
+            return -1;
+        }
+        if (field->info.func == NULL) {
+            /* zerofill allows NULL func as "default" memset to zero */
+            continue;
+        }
+        *flags = PyArrayMethod_COMBINED_FLAGS(*flags, clear_flags);
+        field->src_offset = offset;
+        data->field_count++;
+        field++;
+    }
+
+    *out_func = &traverse_fields_function;
+    *out_auxdata = (NpyAuxData *)data;
+
+    return 0;
+}
+
+
+typedef struct {
+    NpyAuxData base;
+    npy_intp count;
+    NPY_traverse_info info;
+} subarray_traverse_data;
+
+
+/* traverse data free function */
+static void
+subarray_traverse_data_free(NpyAuxData *data)
+{
+    subarray_traverse_data *d = (subarray_traverse_data *)data;
+
+    NPY_traverse_info_xfree(&d->info);
+    PyMem_Free(d);
+}
+
+
+/*
+ * We seem to be neither using nor exposing this right now, so leave it NULL.
+ * (The implementation below should be functional.)
+ */
+#define subarray_traverse_data_clone NULL
+
+#ifndef subarray_traverse_data_clone
+/* traverse data copy function */
+static NpyAuxData *
+subarray_traverse_data_clone(NpyAuxData *data)
+{
+    subarray_traverse_data *d = (subarray_traverse_data *)data;
+
+    /* Allocate the data and populate it */
+    subarray_traverse_data *newdata = PyMem_Malloc(sizeof(subarray_traverse_data));
+    if (newdata == NULL) {
+        return NULL;
+    }
+    newdata->base = d->base;
+    newdata->count = d->count;
+
+    if (NPY_traverse_info_copy(&newdata->info, &d->info) < 0) {
+        PyMem_Free(newdata);
+        return NULL;
+    }
+
+    return (NpyAuxData *)newdata;
+}
+#endif
+
+
+static int
+traverse_subarray_func(
+        void *traverse_context, PyArray_Descr *NPY_UNUSED(descr),
+        char *data, npy_intp N, npy_intp stride,
+        NpyAuxData *auxdata)
+{
+    subarray_traverse_data *subarr_data = (subarray_traverse_data *)auxdata;
+
+    traverse_loop_function *func = subarr_data->info.func;
+    PyArray_Descr *sub_descr = subarr_data->info.descr;
+    npy_intp sub_N = subarr_data->count;
+    NpyAuxData *sub_auxdata = subarr_data->info.auxdata;
+    npy_intp sub_stride = sub_descr->elsize;
+
+    while (N--) {
+        if (func(traverse_context, sub_descr, data,
+                 sub_N, sub_stride, sub_auxdata) < 0) {
+            return -1;
+        }
+        data += stride;
+    }
+    return 0;
+}
+
+
+static int
+get_subarray_traverse_func(
+        void *traverse_context, PyArray_Descr *dtype, int aligned,
+        npy_intp size, npy_intp stride, traverse_loop_function **out_func,
+        NpyAuxData **out_auxdata, NPY_ARRAYMETHOD_FLAGS *flags,
+        get_traverse_func_function *get_traverse_func)
+{
+    subarray_traverse_data *auxdata = PyMem_Malloc(sizeof(subarray_traverse_data));
+    if (auxdata == NULL) {
+        PyErr_NoMemory();
+        return -1;
+    }
+
+    auxdata->count = size;
+    auxdata->base.free = &subarray_traverse_data_free;
+    auxdata->base.clone = subarray_traverse_data_clone;
+
+    if (get_traverse_func(
+            traverse_context, dtype, aligned,
+            dtype->elsize, &auxdata->info, flags) < 0) {
+        PyMem_Free(auxdata);
+        return -1;
+    }
+    if (auxdata->info.func == NULL) {
+        /* zerofill allows func to be NULL, in which we need not do anything */
+        PyMem_Free(auxdata);
+        *out_func = NULL;
+        *out_auxdata = NULL;
+        return 0;
+    }
+    *out_func = &traverse_subarray_func;
+    *out_auxdata = (NpyAuxData *)auxdata;
+
+    return 0;
+}
+
+
+static int
+clear_no_op(
+        void *NPY_UNUSED(traverse_context), PyArray_Descr *NPY_UNUSED(descr),
+        char *NPY_UNUSED(data), npy_intp NPY_UNUSED(size),
+        npy_intp NPY_UNUSED(stride), NpyAuxData *NPY_UNUSED(auxdata))
+{
+    return 0;
+}
+
+NPY_NO_EXPORT int
+npy_get_clear_void_and_legacy_user_dtype_loop(
+        void *traverse_context, PyArray_Descr *dtype, int aligned,
+        npy_intp stride, traverse_loop_function **out_func,
+        NpyAuxData **out_auxdata, NPY_ARRAYMETHOD_FLAGS *flags)
+{
+    /*
+     * If there are no references, it's a nop.  This path should not be hit
+     * but structured dtypes are tricky when a dtype which included references
+     * was sliced to not include any.
+     */
+    if (!PyDataType_REFCHK(dtype)) {
+        *out_func = &clear_no_op;
+        return 0;
+    }
+
+    if (PyDataType_HASSUBARRAY(dtype)) {
+        PyArray_Dims shape = {NULL, -1};
+        npy_intp size;
+
+        if (!(PyArray_IntpConverter(dtype->subarray->shape, &shape))) {
+            PyErr_SetString(PyExc_ValueError,
+                    "invalid subarray shape");
+            return -1;
+        }
+        size = PyArray_MultiplyList(shape.ptr, shape.len);
+        npy_free_cache_dim_obj(shape);
+
+        if (get_subarray_traverse_func(
+                traverse_context, dtype->subarray->base, aligned, size, stride,
+                out_func, out_auxdata, flags, &get_clear_function) < 0) {
+            return -1;
+        }
+
+        return 0;
+    }
+    /* If there are fields, need to do each field */
+    else if (PyDataType_HASFIELDS(dtype)) {
+        if (get_fields_traverse_function(
+                traverse_context, dtype, aligned, stride,
+                out_func, out_auxdata, flags, &get_clear_function) < 0) {
+            return -1;
+        }
+        return 0;
+    }
+    else if (dtype->type_num == NPY_VOID) {
+        /* 
+         * Void dtypes can have "ghosts" of objects marking the dtype because
+         * holes (or the raw bytes if fields are gone) may include objects.
+         * Paths that need those flags should probably be considered incorrect.
+         * But as long as this can happen (a V8 that indicates references)
+         * we need to make it a no-op here.
+         */
+        *out_func = &clear_no_op;
+        return 0;
+    }
+
+    PyErr_Format(PyExc_RuntimeError,
+            "Internal error, tried to fetch clear function for the "
+            "user dtype '%S' without fields or subarray (legacy support).",
+            dtype);
+    return -1;
+}
+
+/**************** Structured DType zero fill ***************/
+
+
+static int
+zerofill_fields_function(
+        void *traverse_context, PyArray_Descr *descr,
+        char *data, npy_intp N, npy_intp stride,
+        NpyAuxData *auxdata)
+{
+    npy_intp itemsize = descr->elsize;
+
+    /*
+     * TODO: We could optimize this by chunking, but since we currently memset
+     *       each element always, just loop manually.
+     */
+    while (N--) {
+        memset(data, 0, itemsize);
+        if (traverse_fields_function(
+                traverse_context, descr, data, 1, stride, auxdata) < 0) {
+            return -1;
+        }
+        data +=stride;
+    }
+    return 0;
+}
+
+/*
+ * Similar to other (e.g. clear) traversal loop getter, but unlike it, we
+ * do need to take care of zeroing out everything (in principle not gaps).
+ * So we add a memset before calling the actual traverse function for the
+ * structured path.
+ */
+NPY_NO_EXPORT int
+npy_get_zerofill_void_and_legacy_user_dtype_loop(
+        void *traverse_context, PyArray_Descr *dtype, int aligned,
+        npy_intp stride, traverse_loop_function **out_func,
+        NpyAuxData **out_auxdata, NPY_ARRAYMETHOD_FLAGS *flags)
+{
+    if (PyDataType_HASSUBARRAY(dtype)) {
+        PyArray_Dims shape = {NULL, -1};
+        npy_intp size;
+
+        if (!(PyArray_IntpConverter(dtype->subarray->shape, &shape))) {
+            PyErr_SetString(PyExc_ValueError,
+                    "invalid subarray shape");
+            return -1;
+        }
+        size = PyArray_MultiplyList(shape.ptr, shape.len);
+        npy_free_cache_dim_obj(shape);
+
+        if (get_subarray_traverse_func(
+                traverse_context, dtype->subarray->base, aligned, size, stride,
+                out_func, out_auxdata, flags, &get_zerofill_function) < 0) {
+            return -1;
+        }
+
+        return 0;
+    }
+    /* If there are fields, need to do each field */
+    else if (PyDataType_HASFIELDS(dtype)) {
+        if (get_fields_traverse_function(
+                traverse_context, dtype, aligned, stride,
+                out_func, out_auxdata, flags, &get_zerofill_function) < 0) {
+            return -1;
+        }
+        if (((fields_traverse_data *)*out_auxdata)->field_count == 0) {
+            /* If there are no fields, just return NULL for zerofill */
+            NPY_AUXDATA_FREE(*out_auxdata);
+            *out_auxdata = NULL;
+            *out_func = NULL;
+            return 0;
+        }
+        /* 
+         * Traversal skips fields that have no custom zeroing, so we need
+         * to take care of it.
+         */
+        *out_func = &zerofill_fields_function;
+        return 0;
+    }
+
+    /* Otherwise, assume there is nothing to do (user dtypes reach here) */
+    *out_auxdata = NULL;
+    *out_func = NULL;
+    return 0;
+}
diff --git a/numpy/core/src/multiarray/dtype_traversal.h b/numpy/core/src/multiarray/dtype_traversal.h
new file mode 100644
index 000000000..fc12c0f7b
--- /dev/null
+++ b/numpy/core/src/multiarray/dtype_traversal.h
@@ -0,0 +1,98 @@
+#ifndef NUMPY_CORE_SRC_MULTIARRAY_DTYPE_TRAVERSAL_H_
+#define NUMPY_CORE_SRC_MULTIARRAY_DTYPE_TRAVERSAL_H_
+
+#include "array_method.h"
+
+/* NumPy DType clear (object DECREF + NULLing) implementations */
+
+NPY_NO_EXPORT int
+npy_get_clear_object_strided_loop(
+        void *traverse_context, PyArray_Descr *descr, int aligned,
+        npy_intp fixed_stride,
+        traverse_loop_function **out_loop, NpyAuxData **out_traversedata,
+        NPY_ARRAYMETHOD_FLAGS *flags);
+
+NPY_NO_EXPORT int
+npy_get_clear_void_and_legacy_user_dtype_loop(
+        void *traverse_context, PyArray_Descr *descr, int aligned,
+        npy_intp fixed_stride,
+        traverse_loop_function **out_loop, NpyAuxData **out_traversedata,
+        NPY_ARRAYMETHOD_FLAGS *flags);
+
+/* NumPy DType zero-filling implementations */
+
+NPY_NO_EXPORT int
+npy_object_get_fill_zero_loop(
+        void *NPY_UNUSED(traverse_context), PyArray_Descr *NPY_UNUSED(descr),
+        int NPY_UNUSED(aligned), npy_intp NPY_UNUSED(fixed_stride),
+        traverse_loop_function **out_loop, NpyAuxData **NPY_UNUSED(out_auxdata),
+        NPY_ARRAYMETHOD_FLAGS *flags);
+
+NPY_NO_EXPORT int
+npy_get_zerofill_void_and_legacy_user_dtype_loop(
+        void *traverse_context, PyArray_Descr *dtype, int aligned,
+        npy_intp stride, traverse_loop_function **out_func,
+        NpyAuxData **out_auxdata, NPY_ARRAYMETHOD_FLAGS *flags);
+
+
+/* Helper to deal with calling or nesting simple strided loops */
+
+typedef struct {
+    traverse_loop_function *func;
+    NpyAuxData *auxdata;
+    PyArray_Descr *descr;
+} NPY_traverse_info;
+
+
+static inline void
+NPY_traverse_info_init(NPY_traverse_info *cast_info)
+{
+    cast_info->func = NULL;  /* mark as uninitialized. */
+    cast_info->auxdata = NULL;  /* allow leaving auxdata untouched */
+    cast_info->descr = NULL;  /* mark as uninitialized. */
+}
+
+
+static inline void
+NPY_traverse_info_xfree(NPY_traverse_info *traverse_info)
+{
+    if (traverse_info->func == NULL) {
+        return;
+    }
+    traverse_info->func = NULL;
+    NPY_AUXDATA_FREE(traverse_info->auxdata);
+    Py_XDECREF(traverse_info->descr);
+}
+
+
+static inline int
+NPY_traverse_info_copy(
+        NPY_traverse_info *traverse_info, NPY_traverse_info *original)
+{
+    traverse_info->func = NULL;
+    if (original->func == NULL) {
+        /* Allow copying also of unused clear info */
+        return 0;
+    }
+    traverse_info->auxdata = NULL;
+    if (original->auxdata != NULL) {
+        traverse_info->auxdata = NPY_AUXDATA_CLONE(original->auxdata);
+        if (traverse_info->auxdata == NULL) {
+            return -1;
+        }
+    }
+    Py_INCREF(original->descr);
+    traverse_info->descr = original->descr;
+    traverse_info->func = original->func;
+
+    return 0;
+}
+
+
+NPY_NO_EXPORT int
+PyArray_GetClearFunction(
+        int aligned, npy_intp stride, PyArray_Descr *dtype,
+        NPY_traverse_info *clear_info, NPY_ARRAYMETHOD_FLAGS *flags);
+
+
+#endif  /* NUMPY_CORE_SRC_MULTIARRAY_DTYPE_TRAVERSAL_H_ */
diff --git a/numpy/core/src/multiarray/dtypemeta.c b/numpy/core/src/multiarray/dtypemeta.c
index cc99a3eca..9c1f384d8 100644
--- a/numpy/core/src/multiarray/dtypemeta.c
+++ b/numpy/core/src/multiarray/dtypemeta.c
@@ -9,7 +9,9 @@
 #include <numpy/ndarraytypes.h>
 #include <numpy/arrayscalars.h>
 #include "npy_pycompat.h"
+#include "npy_import.h"
 
+#include "arraytypes.h"
 #include "common.h"
 #include "dtypemeta.h"
 #include "descriptor.h"
@@ -18,6 +20,10 @@
 #include "scalartypes.h"
 #include "convert_datatype.h"
 #include "usertypes.h"
+#include "conversion_utils.h"
+#include "templ_common.h"
+#include "refcount.h"
+#include "dtype_traversal.h"
 
 #include <assert.h>
 
@@ -73,7 +79,7 @@ dtypemeta_init(PyTypeObject *NPY_UNUSED(type),
  * @param self
  * @return nonzero if the object is garbage collected
  */
-static NPY_INLINE int
+static inline int
 dtypemeta_is_gc(PyObject *dtype_class)
 {
     return PyType_Type.tp_is_gc(dtype_class);
@@ -122,6 +128,50 @@ legacy_dtype_default_new(PyArray_DTypeMeta *self,
     return (PyObject *)self->singleton;
 }
 
+static PyObject *
+string_unicode_new(PyArray_DTypeMeta *self, PyObject *args, PyObject *kwargs)
+{
+    npy_intp size;
+
+    static char *kwlist[] = {"", NULL};
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O&", kwlist,
+                                     PyArray_IntpFromPyIntConverter, &size)) {
+        return NULL;
+    }
+
+    if (size < 0) {
+        PyErr_Format(PyExc_ValueError,
+                     "Strings cannot have a negative size but a size of "
+                     "%"NPY_INTP_FMT" was given", size);
+        return NULL;
+    }
+
+    PyArray_Descr *res = PyArray_DescrNewFromType(self->type_num);
+
+    if (res == NULL) {
+        return NULL;
+    }
+
+    if (self->type_num == NPY_UNICODE) {
+        // unicode strings are 4 bytes per character
+        if (npy_mul_sizes_with_overflow(&size, size, 4)) {
+            PyErr_SetString(
+                PyExc_TypeError,
+                "Strings too large to store inside array.");
+            return NULL;
+        }
+    }
+
+    if (size > NPY_MAX_INT) {
+        PyErr_SetString(PyExc_TypeError,
+                        "Strings too large to store inside array.");
+        return NULL;
+    }
+
+    res->elsize = (int)size;
+    return (PyObject *)res;
+}
 
 static PyArray_Descr *
 nonparametric_discover_descr_from_pyobject(
@@ -151,7 +201,7 @@ string_discover_descr_from_pyobject(
         }
         if (itemsize > NPY_MAX_INT) {
             PyErr_SetString(PyExc_TypeError,
-                    "string to large to store inside array.");
+                    "string too large to store inside array.");
         }
         PyArray_Descr *res = PyArray_DescrNewFromType(cls->type_num);
         if (res == NULL) {
@@ -404,7 +454,7 @@ void_common_instance(PyArray_Descr *descr1, PyArray_Descr *descr2)
     if (descr1->subarray == NULL && descr1->names == NULL &&
             descr2->subarray == NULL && descr2->names == NULL) {
         if (descr1->elsize != descr2->elsize) {
-            PyErr_SetString(PyExc_TypeError,
+            PyErr_SetString(npy_DTypePromotionError,
                     "Invalid type promotion with void datatypes of different "
                     "lengths. Use the `np.bytes_` datatype instead to pad the "
                     "shorter value with trailing zero bytes.");
@@ -443,7 +493,7 @@ void_common_instance(PyArray_Descr *descr1, PyArray_Descr *descr2)
             return NULL;
         }
         if (!cmp) {
-            PyErr_SetString(PyExc_TypeError,
+            PyErr_SetString(npy_DTypePromotionError,
                     "invalid type promotion with subarray datatypes "
                     "(shape mismatch).");
             return NULL;
@@ -473,11 +523,12 @@ void_common_instance(PyArray_Descr *descr1, PyArray_Descr *descr2)
         return new_descr;
     }
 
-    PyErr_SetString(PyExc_TypeError,
+    PyErr_SetString(npy_DTypePromotionError,
             "invalid type promotion with structured datatype(s).");
     return NULL;
 }
 
+
 NPY_NO_EXPORT int
 python_builtins_are_known_scalar_types(
         PyArray_DTypeMeta *NPY_UNUSED(cls), PyTypeObject *pytype)
@@ -617,6 +668,12 @@ string_unicode_common_dtype(PyArray_DTypeMeta *cls, PyArray_DTypeMeta *other)
 static PyArray_DTypeMeta *
 datetime_common_dtype(PyArray_DTypeMeta *cls, PyArray_DTypeMeta *other)
 {
+    /*
+     * Timedelta/datetime shouldn't actually promote at all.  That they
+     * currently do means that we need additional hacks in the comparison
+     * type resolver.  For comparisons we have to make sure we reject it
+     * nicely in order to return an array of True/False values.
+     */
     if (cls->type_num == NPY_DATETIME && other->type_num == NPY_TIMEDELTA) {
         /*
          * TODO: We actually currently do allow promotion here. This is
@@ -671,12 +728,17 @@ object_common_dtype(
  * be a HeapType and its instances should be exact PyArray_Descr structs.
  *
  * @param descr The descriptor that should be wrapped.
- * @param name The name for the DType, if NULL the type character is used.
+ * @param name The name for the DType.
+ * @param alias A second name which is also set to the new class for builtins
+ *              (i.e. `np.types.LongDType` for `np.types.Int64DType`).
+ *              Some may have more aliases, as `intp` is not its own thing,
+ *              as of writing this, these are not added here.
  *
  * @returns 0 on success, -1 on failure.
  */
 NPY_NO_EXPORT int
-dtypemeta_wrap_legacy_descriptor(PyArray_Descr *descr)
+dtypemeta_wrap_legacy_descriptor(PyArray_Descr *descr,
+        const char *name, const char *alias)
 {
     int has_type_set = Py_TYPE(descr) == &PyArrayDescr_Type;
 
@@ -703,47 +765,14 @@ dtypemeta_wrap_legacy_descriptor(PyArray_Descr *descr)
         return -1;
     }
 
-    /*
-     * Note: we have no intention of freeing the memory again since this
-     * behaves identically to static type definition (see comment above).
-     * This is seems cleaner for the legacy API, in the new API both static
-     * and heap types are possible (some difficulty arises from the fact that
-     * these are instances of DTypeMeta and not type).
-     * In particular our own DTypes can be true static declarations.
-     * However, this function remains necessary for legacy user dtypes.
-     */
-
-    const char *scalar_name = descr->typeobj->tp_name;
-    /*
-     * We have to take only the name, and ignore the module to get
-     * a reasonable __name__, since static types are limited in this regard
-     * (this is not ideal, but not a big issue in practice).
-     * This is what Python does to print __name__ for static types.
-     */
-    const char *dot = strrchr(scalar_name, '.');
-    if (dot) {
-        scalar_name = dot + 1;
-    }
-    Py_ssize_t name_length = strlen(scalar_name) + 14;
-
-    char *tp_name = PyMem_Malloc(name_length);
-    if (tp_name == NULL) {
-        PyErr_NoMemory();
-        return -1;
-    }
-
-    snprintf(tp_name, name_length, "numpy.dtype[%s]", scalar_name);
-
     NPY_DType_Slots *dt_slots = PyMem_Malloc(sizeof(NPY_DType_Slots));
     if (dt_slots == NULL) {
-        PyMem_Free(tp_name);
         return -1;
     }
     memset(dt_slots, '\0', sizeof(NPY_DType_Slots));
 
     PyArray_DTypeMeta *dtype_class = PyMem_Malloc(sizeof(PyArray_DTypeMeta));
     if (dtype_class == NULL) {
-        PyMem_Free(tp_name);
         PyMem_Free(dt_slots);
         return -1;
     }
@@ -765,13 +794,19 @@ dtypemeta_wrap_legacy_descriptor(PyArray_Descr *descr)
             .tp_flags = Py_TPFLAGS_DEFAULT,
             .tp_base = &PyArrayDescr_Type,
             .tp_new = (newfunc)legacy_dtype_default_new,
+            .tp_doc = (
+                "DType class corresponding to the scalar type and dtype of "
+                "the same name.\n\n"
+                "Please see `numpy.dtype` for the typical way to create\n"
+                "dtype instances and :ref:`arrays.dtypes` for additional\n"
+                "information."),
         },},
         .flags = NPY_DT_LEGACY,
         /* Further fields are not common between DTypes */
     };
     memcpy(dtype_class, &prototype, sizeof(PyArray_DTypeMeta));
     /* Fix name of the Type*/
-    ((PyTypeObject *)dtype_class)->tp_name = tp_name;
+    ((PyTypeObject *)dtype_class)->tp_name = name;
     dtype_class->dt_slots = dt_slots;
 
     /* Let python finish the initialization (probably unnecessary) */
@@ -803,6 +838,7 @@ dtypemeta_wrap_legacy_descriptor(PyArray_Descr *descr)
     dt_slots->common_dtype = default_builtin_common_dtype;
     dt_slots->common_instance = NULL;
     dt_slots->ensure_canonical = ensure_native_byteorder;
+    dt_slots->get_fill_zero_loop = NULL;
 
     if (PyTypeNum_ISSIGNED(dtype_class->type_num)) {
         /* Convert our scalars (raise on too large unsigned and NaN, etc.) */
@@ -814,6 +850,8 @@ dtypemeta_wrap_legacy_descriptor(PyArray_Descr *descr)
     }
     else if (descr->type_num == NPY_OBJECT) {
         dt_slots->common_dtype = object_common_dtype;
+        dt_slots->get_fill_zero_loop = npy_object_get_fill_zero_loop;
+        dt_slots->get_clear_loop = npy_get_clear_object_strided_loop;
     }
     else if (PyTypeNum_ISDATETIME(descr->type_num)) {
         /* Datetimes are flexible, but were not considered previously */
@@ -835,6 +873,10 @@ dtypemeta_wrap_legacy_descriptor(PyArray_Descr *descr)
                     void_discover_descr_from_pyobject);
             dt_slots->common_instance = void_common_instance;
             dt_slots->ensure_canonical = void_ensure_canonical;
+            dt_slots->get_fill_zero_loop = 
+                    npy_get_zerofill_void_and_legacy_user_dtype_loop;
+            dt_slots->get_clear_loop =
+                    npy_get_clear_void_and_legacy_user_dtype_loop;
         }
         else {
             dt_slots->default_descr = string_and_unicode_default_descr;
@@ -843,9 +885,14 @@ dtypemeta_wrap_legacy_descriptor(PyArray_Descr *descr)
                     string_discover_descr_from_pyobject);
             dt_slots->common_dtype = string_unicode_common_dtype;
             dt_slots->common_instance = string_unicode_common_instance;
+            ((PyTypeObject*)dtype_class)->tp_new = (newfunc)string_unicode_new;
         }
     }
 
+    if (PyTypeNum_ISNUMBER(descr->type_num)) {
+        dtype_class->flags |= NPY_DT_NUMERIC;
+    }
+
     if (_PyArray_MapPyTypeToDType(dtype_class, descr->typeobj,
             PyTypeNum_ISUSERDEF(dtype_class->type_num)) < 0) {
         Py_DECREF(dtype_class);
@@ -855,6 +902,21 @@ dtypemeta_wrap_legacy_descriptor(PyArray_Descr *descr)
     /* Finally, replace the current class of the descr */
     Py_SET_TYPE(descr, (PyTypeObject *)dtype_class);
 
+    /* And it to the types submodule if it is a builtin dtype */
+    if (!PyTypeNum_ISUSERDEF(descr->type_num)) {
+        static PyObject *add_dtype_helper = NULL;
+        npy_cache_import("numpy.dtypes", "_add_dtype_helper", &add_dtype_helper);
+        if (add_dtype_helper == NULL) {
+            return -1;
+        }
+
+        if (PyObject_CallFunction(
+                add_dtype_helper,
+                "Os", (PyObject *)dtype_class, alias) == NULL) {
+            return -1;
+        }
+    }
+
     return 0;
 }
 
@@ -865,22 +927,35 @@ dtypemeta_get_abstract(PyArray_DTypeMeta *self) {
 }
 
 static PyObject *
+dtypemeta_get_legacy(PyArray_DTypeMeta *self) {
+    return PyBool_FromLong(NPY_DT_is_legacy(self));
+}
+
+static PyObject *
 dtypemeta_get_parametric(PyArray_DTypeMeta *self) {
     return PyBool_FromLong(NPY_DT_is_parametric(self));
 }
 
+static PyObject *
+dtypemeta_get_is_numeric(PyArray_DTypeMeta *self) {
+    return PyBool_FromLong(NPY_DT_is_numeric(self));
+}
+
 /*
  * Simple exposed information, defined for each DType (class).
  */
 static PyGetSetDef dtypemeta_getset[] = {
         {"_abstract", (getter)dtypemeta_get_abstract, NULL, NULL, NULL},
+        {"_legacy", (getter)dtypemeta_get_legacy, NULL, NULL, NULL},
         {"_parametric", (getter)dtypemeta_get_parametric, NULL, NULL, NULL},
+        {"_is_numeric", (getter)dtypemeta_get_is_numeric, NULL, NULL, NULL},
         {NULL, NULL, NULL, NULL, NULL}
 };
 
 static PyMemberDef dtypemeta_members[] = {
     {"type",
-        T_OBJECT, offsetof(PyArray_DTypeMeta, scalar_type), READONLY, NULL},
+        T_OBJECT, offsetof(PyArray_DTypeMeta, scalar_type), READONLY,
+        "scalar type corresponding to the DType."},
     {NULL, 0, 0, 0, NULL},
 };
 
@@ -893,12 +968,12 @@ NPY_NO_EXPORT PyTypeObject PyArrayDTypeMeta_Type = {
     /* Types are garbage collected (see dtypemeta_is_gc documentation) */
     .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,
     .tp_doc = "Preliminary NumPy API: The Type of NumPy DTypes (metaclass)",
-    .tp_getset = dtypemeta_getset,
+    .tp_traverse = (traverseproc)dtypemeta_traverse,
     .tp_members = dtypemeta_members,
+    .tp_getset = dtypemeta_getset,
     .tp_base = NULL,  /* set to PyType_Type at import time */
-    .tp_alloc = dtypemeta_alloc,
     .tp_init = (initproc)dtypemeta_init,
+    .tp_alloc = dtypemeta_alloc,
     .tp_new = dtypemeta_new,
     .tp_is_gc = dtypemeta_is_gc,
-    .tp_traverse = (traverseproc)dtypemeta_traverse,
 };
diff --git a/numpy/core/src/multiarray/dtypemeta.h b/numpy/core/src/multiarray/dtypemeta.h
index 15318f040..32212228c 100644
--- a/numpy/core/src/multiarray/dtypemeta.h
+++ b/numpy/core/src/multiarray/dtypemeta.h
@@ -1,44 +1,18 @@
 #ifndef NUMPY_CORE_SRC_MULTIARRAY_DTYPEMETA_H_
 #define NUMPY_CORE_SRC_MULTIARRAY_DTYPEMETA_H_
 
+#include "array_method.h"
+#include "dtype_traversal.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-/* DType flags, currently private, since we may just expose functions */
-#define NPY_DT_LEGACY 1 << 0
-#define NPY_DT_ABSTRACT 1 << 1
-#define NPY_DT_PARAMETRIC 1 << 2
-
-
-typedef PyArray_Descr *(discover_descr_from_pyobject_function)(
-        PyArray_DTypeMeta *cls, PyObject *obj);
-
-/*
- * Before making this public, we should decide whether it should pass
- * the type, or allow looking at the object. A possible use-case:
- * `np.array(np.array([0]), dtype=np.ndarray)`
- * Could consider arrays that are not `dtype=ndarray` "scalars".
- */
-typedef int (is_known_scalar_type_function)(
-        PyArray_DTypeMeta *cls, PyTypeObject *obj);
-
-typedef PyArray_Descr *(default_descr_function)(PyArray_DTypeMeta *cls);
-typedef PyArray_DTypeMeta *(common_dtype_function)(
-        PyArray_DTypeMeta *dtype1, PyArray_DTypeMeta *dtype2);
-typedef PyArray_Descr *(common_instance_function)(
-        PyArray_Descr *dtype1, PyArray_Descr *dtype2);
-typedef PyArray_Descr *(ensure_canonical_function)(PyArray_Descr *dtype);
+#include "numpy/_dtype_api.h"
 
-/*
- * TODO: These two functions are currently only used for experimental DType
- *       API support.  Their relation should be "reversed": NumPy should
- *       always use them internally.
- *       There are open points about "casting safety" though, e.g. setting
- *       elements is currently always unsafe.
- */
-typedef int(setitemfunction)(PyArray_Descr *, PyObject *, char *);
-typedef PyObject *(getitemfunction)(PyArray_Descr *, char *);
+/* DType flags, currently private, since we may just expose functions 
+   Other publicly visible flags are in _dtype_api.h                   */
+#define NPY_DT_LEGACY 1 << 0
 
 
 typedef struct {
@@ -55,10 +29,40 @@ typedef struct {
     setitemfunction *setitem;
     getitemfunction *getitem;
     /*
+     * Either NULL or fetches a clearing function.  Clearing means deallocating
+     * any referenced data and setting it to a safe state.  For Python objects
+     * this means using `Py_CLEAR` which is equivalent to `Py_DECREF` and
+     * setting the `PyObject *` to NULL.
+     * After the clear, the data must be fillable via cast/copy and calling
+     * clear a second time must be safe.
+     * If the DType class does not implement `get_clear_loop` setting
+     * NPY_ITEM_REFCOUNT on its dtype instances is invalid.  Note that it is
+     * acceptable for  NPY_ITEM_REFCOUNT to inidicate references that are not
+     * Python objects.
+     */
+    get_traverse_loop_function *get_clear_loop;
+    /*
+       Either NULL or a function that sets a function pointer to a traversal
+       loop that fills an array with zero values appropriate for the dtype. If
+       get_fill_zero_loop is undefined or the function pointer set by it is
+       NULL, the array buffer is allocated with calloc. If this function is
+       defined and it sets a non-NULL function pointer, the array buffer is
+       allocated with malloc and the zero-filling loop function pointer is
+       called to fill the buffer. For the best performance, avoid using this
+       function if a zero-filled array buffer allocated with calloc makes sense
+       for the dtype.
+
+       Note that this is currently used only for zero-filling a newly allocated
+       array. While it can be used to zero-fill an already-filled buffer, that
+       will not work correctly for arrays holding references. If you need to do
+       that, clear the array first.
+    */
+    get_traverse_loop_function *get_fill_zero_loop;
+    /*
      * The casting implementation (ArrayMethod) to convert between two
      * instances of this DType, stored explicitly for fast access:
      */
-    PyObject *within_dtype_castingimpl;
+    PyArrayMethodObject *within_dtype_castingimpl;
     /*
      * Dictionary of ArrayMethods representing most possible casts
      * (structured and object are exceptions).
@@ -74,6 +78,13 @@ typedef struct {
     PyArray_ArrFuncs f;
 } NPY_DType_Slots;
 
+// This must be updated if new slots before within_dtype_castingimpl
+// are added
+#define NPY_NUM_DTYPE_SLOTS 10
+#define NPY_NUM_DTYPE_PYARRAY_ARRFUNCS_SLOTS 22
+#define NPY_DT_MAX_ARRFUNCS_SLOT \
+  NPY_NUM_DTYPE_PYARRAY_ARRFUNCS_SLOTS + _NPY_DT_ARRFUNCS_OFFSET
+
 
 #define NPY_DTYPE(descr) ((PyArray_DTypeMeta *)Py_TYPE(descr))
 #define NPY_DT_SLOTS(dtype) ((NPY_DType_Slots *)(dtype)->dt_slots)
@@ -81,6 +92,8 @@ typedef struct {
 #define NPY_DT_is_legacy(dtype) (((dtype)->flags & NPY_DT_LEGACY) != 0)
 #define NPY_DT_is_abstract(dtype) (((dtype)->flags & NPY_DT_ABSTRACT) != 0)
 #define NPY_DT_is_parametric(dtype) (((dtype)->flags & NPY_DT_PARAMETRIC) != 0)
+#define NPY_DT_is_numeric(dtype) (((dtype)->flags & NPY_DT_NUMERIC) != 0)
+#define NPY_DT_is_user_defined(dtype) (((dtype)->type_num == -1))
 
 /*
  * Macros for convenient classmethod calls, since these require
@@ -111,7 +124,7 @@ typedef struct {
  * (Error checking is not required for DescrFromType, assuming that the
  * type is valid.)
  */
-static NPY_INLINE PyArray_DTypeMeta *
+static inline PyArray_DTypeMeta *
 PyArray_DTypeFromTypeNum(int typenum)
 {
     PyArray_Descr *descr = PyArray_DescrFromType(typenum);
@@ -127,7 +140,8 @@ python_builtins_are_known_scalar_types(
         PyArray_DTypeMeta *cls, PyTypeObject *pytype);
 
 NPY_NO_EXPORT int
-dtypemeta_wrap_legacy_descriptor(PyArray_Descr *dtypem);
+dtypemeta_wrap_legacy_descriptor(
+        PyArray_Descr *dtypem, const char *name, const char *alias);
 
 #ifdef __cplusplus
 }
diff --git a/numpy/core/src/multiarray/einsum.c.src b/numpy/core/src/multiarray/einsum.c.src
index cd1a58982..0b3b3fd8c 100644
--- a/numpy/core/src/multiarray/einsum.c.src
+++ b/numpy/core/src/multiarray/einsum.c.src
@@ -17,6 +17,7 @@
 #include <numpy/npy_common.h>
 #include <numpy/arrayobject.h>
 #include <npy_pycompat.h>
+#include <array_assign.h>   //PyArray_AssignRawScalar
 
 #include <ctype.h>
 
@@ -321,8 +322,7 @@ get_single_op_view(PyArrayObject *op, char *labels,
                 Py_TYPE(op), PyArray_DESCR(op),
                 ndim_output, new_dims, new_strides, PyArray_DATA(op),
                 PyArray_ISWRITEABLE(op) ? NPY_ARRAY_WRITEABLE : 0,
-                (PyObject *)op, (PyObject *)op,
-                0, 0);
+                (PyObject *)op, (PyObject *)op, 0);
 
         if (*ret == NULL) {
             return -1;
@@ -532,10 +532,17 @@ unbuffered_loop_nop1_ndim2(NpyIter *iter)
      * Since the iterator wasn't tracking coordinates, the
      * loop provided by the iterator is in Fortran-order.
      */
-    NPY_BEGIN_THREADS_THRESHOLDED(shape[1] * shape[0]);
+    int needs_api = NpyIter_IterationNeedsAPI(iter);
+    if (!needs_api) {
+        NPY_BEGIN_THREADS_THRESHOLDED(shape[1] * shape[0]);
+    }
     for (coord = shape[1]; coord > 0; --coord) {
         sop(1, ptrs[0], strides[0], shape[0]);
 
+        if (needs_api && PyErr_Occurred()){
+            return -1;
+        }
+
         ptr = ptrs[1][0] + strides[1][0];
         ptrs[0][0] = ptrs[1][0] = ptr;
         ptr = ptrs[1][1] + strides[1][1];
@@ -586,11 +593,18 @@ unbuffered_loop_nop1_ndim3(NpyIter *iter)
      * Since the iterator wasn't tracking coordinates, the
      * loop provided by the iterator is in Fortran-order.
      */
-    NPY_BEGIN_THREADS_THRESHOLDED(shape[2] * shape[1] * shape[0]);
+    int needs_api = NpyIter_IterationNeedsAPI(iter);
+    if (!needs_api) {
+        NPY_BEGIN_THREADS_THRESHOLDED(shape[2] * shape[1] * shape[0]);
+    }
     for (coords[1] = shape[2]; coords[1] > 0; --coords[1]) {
         for (coords[0] = shape[1]; coords[0] > 0; --coords[0]) {
             sop(1, ptrs[0], strides[0], shape[0]);
 
+            if (needs_api && PyErr_Occurred()){
+                return -1;
+            }
+
             ptr = ptrs[1][0] + strides[1][0];
             ptrs[0][0] = ptrs[1][0] = ptr;
             ptr = ptrs[1][1] + strides[1][1];
@@ -643,10 +657,17 @@ unbuffered_loop_nop2_ndim2(NpyIter *iter)
      * Since the iterator wasn't tracking coordinates, the
      * loop provided by the iterator is in Fortran-order.
      */
-    NPY_BEGIN_THREADS_THRESHOLDED(shape[1] * shape[0]);
+    int needs_api = NpyIter_IterationNeedsAPI(iter);
+    if (!needs_api) {
+        NPY_BEGIN_THREADS_THRESHOLDED(shape[1] * shape[0]);
+    }
     for (coord = shape[1]; coord > 0; --coord) {
         sop(2, ptrs[0], strides[0], shape[0]);
 
+        if(needs_api && PyErr_Occurred()){
+            return -1;
+        }
+
         ptr = ptrs[1][0] + strides[1][0];
         ptrs[0][0] = ptrs[1][0] = ptr;
         ptr = ptrs[1][1] + strides[1][1];
@@ -699,11 +720,18 @@ unbuffered_loop_nop2_ndim3(NpyIter *iter)
      * Since the iterator wasn't tracking coordinates, the
      * loop provided by the iterator is in Fortran-order.
      */
-    NPY_BEGIN_THREADS_THRESHOLDED(shape[2] * shape[1] * shape[0]);
+    int needs_api = NpyIter_IterationNeedsAPI(iter);
+    if (!needs_api) {
+        NPY_BEGIN_THREADS_THRESHOLDED(shape[2] * shape[1] * shape[0]);
+    }
     for (coords[1] = shape[2]; coords[1] > 0; --coords[1]) {
         for (coords[0] = shape[1]; coords[0] > 0; --coords[0]) {
             sop(2, ptrs[0], strides[0], shape[0]);
 
+            if(needs_api && PyErr_Occurred()){
+                return -1;
+            }
+
             ptr = ptrs[1][0] + strides[1][0];
             ptrs[0][0] = ptrs[1][0] = ptr;
             ptr = ptrs[1][1] + strides[1][1];
@@ -1025,7 +1053,7 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop,
         goto fail;
     }
 
-    /* Initialize the output to all zeros */
+    /* Initialize the output to all zeros or None*/
     ret = NpyIter_GetOperandArray(iter)[nop];
     if (PyArray_AssignZero(ret, NULL) < 0) {
         goto fail;
diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src
index e7b2f2c2c..5f2ccf2dc 100644
--- a/numpy/core/src/multiarray/einsum_sumprod.c.src
+++ b/numpy/core/src/multiarray/einsum_sumprod.c.src
@@ -1026,6 +1026,57 @@ bool_sum_of_products_outstride0_@noplabel@(int nop, char **dataptr,
 
 /**end repeat**/
 
+/**begin repeat
+ * #fn_name = 
+ *  object_sum_of_products_any,
+ *  object_sum_of_products_one,
+ *  object_sum_of_products_two,
+ *  object_sum_of_products_three,
+ *  object_sum_of_products_contig_any,
+ *  object_sum_of_products_contig_one,
+ *  object_sum_of_products_contig_two,
+ *  object_sum_of_products_contig_three,
+ *  object_sum_of_products_outstride0_any,
+ *  object_sum_of_products_outstride0_one,
+ *  object_sum_of_products_outstride0_two,
+ *  object_sum_of_products_outstride0_three#
+ */
+static void
+@fn_name@(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    while(count--){
+        PyObject *prod = *(PyObject **)dataptr[0];
+        if (!prod) {
+            prod = Py_None;  // convention is to treat nulls as None
+        }
+        Py_INCREF(prod);
+        for (int i = 1; i < nop; ++i){
+            PyObject *curr = *(PyObject **)dataptr[i];
+            if (!curr) {
+                curr = Py_None;  // convention is to treat nulls as None
+            }
+            Py_SETREF(prod, PyNumber_Multiply(prod, curr));
+            if (!prod) {
+                return;
+            }
+        }
+
+        PyObject *sum = PyNumber_Add(*(PyObject **)dataptr[nop], prod);
+        Py_DECREF(prod);
+        if (!sum) {
+            return;
+        }
+        
+        Py_XDECREF(*(PyObject **)dataptr[nop]);
+        *(PyObject **)dataptr[nop] = sum;
+        for (int i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+    }
+}
+/**end repeat**/
+
 /* These tables need to match up with the type enum */
 static sum_of_products_fn
 _contig_outstride0_unary_specialization_table[NPY_NTYPES] = {
@@ -1116,7 +1167,7 @@ static sum_of_products_fn _outstride0_specialized_table[NPY_NTYPES][4] = {
  *        1, 1,
  *        1, 1, 1,
  *        1, 1, 1,
- *        0, 0, 0, 0,
+ *        1, 0, 0, 0,
  *        0, 0, 1#
  */
 #if @use@
@@ -1152,7 +1203,7 @@ static sum_of_products_fn _allcontig_specialized_table[NPY_NTYPES][4] = {
  *        1, 1,
  *        1, 1, 1,
  *        1, 1, 1,
- *        0, 0, 0, 0,
+ *        1, 0, 0, 0,
  *        0, 0, 1#
  */
 #if @use@
@@ -1188,7 +1239,7 @@ static sum_of_products_fn _unspecialized_table[NPY_NTYPES][4] = {
  *        1, 1,
  *        1, 1, 1,
  *        1, 1, 1,
- *        0, 0, 0, 0,
+ *        1, 0, 0, 0,
  *        0, 0, 1#
  */
 #if @use@
diff --git a/numpy/core/src/multiarray/experimental_public_dtype_api.c b/numpy/core/src/multiarray/experimental_public_dtype_api.c
index 36350c832..c0da0a7b7 100644
--- a/numpy/core/src/multiarray/experimental_public_dtype_api.c
+++ b/numpy/core/src/multiarray/experimental_public_dtype_api.c
@@ -16,19 +16,6 @@
 #include "common_dtype.h"
 
 
-#define EXPERIMENTAL_DTYPE_API_VERSION 5
-
-
-typedef struct{
-    PyTypeObject *typeobj;    /* type of python scalar or NULL */
-    int flags;                /* flags, including parametric and abstract */
-    /* NULL terminated cast definitions. Use NULL for the newly created DType */
-    PyArrayMethod_Spec **casts;
-    PyType_Slot *slots;
-} PyArrayDTypeMeta_Spec;
-
-
-
 static PyArray_DTypeMeta *
 dtype_does_not_promote(
         PyArray_DTypeMeta *NPY_UNUSED(self), PyArray_DTypeMeta *NPY_UNUSED(other))
@@ -111,15 +98,11 @@ legacy_getitem_using_DType(void *data, void *arr)
  * from the user in the future and more could get defaults for compatibility.
  */
 PyArray_ArrFuncs default_funcs = {
+        .getitem = &legacy_getitem_using_DType,
         .setitem = &legacy_setitem_using_DType,
-        .getitem = &legacy_getitem_using_DType
 };
 
 
-/* other slots are in order, so keep only last around: */
-#define NUM_DTYPE_SLOTS 8
-
-
 int
 PyArrayInitDTypeMeta_FromSpec(
         PyArray_DTypeMeta *DType, PyArrayDTypeMeta_Spec *spec)
@@ -155,10 +138,12 @@ PyArrayInitDTypeMeta_FromSpec(
     }
 
     /* Check and handle flags: */
-    if (spec->flags & ~(NPY_DT_PARAMETRIC|NPY_DT_ABSTRACT)) {
+    int allowed_flags = NPY_DT_PARAMETRIC | NPY_DT_ABSTRACT | NPY_DT_NUMERIC;
+    if (spec->flags & ~(allowed_flags)) {
         PyErr_SetString(PyExc_RuntimeError,
-                "invalid DType flags specified, only parametric and abstract "
-                "are valid flags for user DTypes.");
+                "invalid DType flags specified, only NPY_DT_PARAMETRIC, "
+                "NPY_DT_ABSTRACT, and NPY_DT_NUMERIC are valid flags for "
+                "user DTypes.");
         return -1;
     }
 
@@ -178,6 +163,8 @@ PyArrayInitDTypeMeta_FromSpec(
     NPY_DT_SLOTS(DType)->common_instance = NULL;
     NPY_DT_SLOTS(DType)->setitem = NULL;
     NPY_DT_SLOTS(DType)->getitem = NULL;
+    NPY_DT_SLOTS(DType)->get_clear_loop = NULL;
+    NPY_DT_SLOTS(DType)->f = default_funcs;
 
     PyType_Slot *spec_slot = spec->slots;
     while (1) {
@@ -187,20 +174,100 @@ PyArrayInitDTypeMeta_FromSpec(
         if (slot == 0) {
             break;
         }
-        if (slot > NUM_DTYPE_SLOTS || slot < 0) {
+        if ((slot < 0) ||
+            ((slot > NPY_NUM_DTYPE_SLOTS) &&
+             (slot <= _NPY_DT_ARRFUNCS_OFFSET)) ||
+            (slot > NPY_DT_MAX_ARRFUNCS_SLOT)) {
             PyErr_Format(PyExc_RuntimeError,
                     "Invalid slot with value %d passed in.", slot);
             return -1;
         }
         /*
-         * It is up to the user to get this right, and slots are sorted
-         * exactly like they are stored right now:
+         * It is up to the user to get this right, the slots in the public API
+         * are sorted exactly like they are stored in the NPY_DT_Slots struct
+         * right now:
          */
-        void **current = (void **)(&(
-                NPY_DT_SLOTS(DType)->discover_descr_from_pyobject));
-        current += slot - 1;
-        *current = pfunc;
+        if (slot <= NPY_NUM_DTYPE_SLOTS) {
+            // slot > NPY_NUM_DTYPE_SLOTS are PyArray_ArrFuncs
+            void **current = (void **)(&(
+                    NPY_DT_SLOTS(DType)->discover_descr_from_pyobject));
+            current += slot - 1;
+            *current = pfunc;
+        }
+        else {
+            int f_slot = slot - _NPY_DT_ARRFUNCS_OFFSET;
+            if (1 <= f_slot && f_slot <= NPY_NUM_DTYPE_PYARRAY_ARRFUNCS_SLOTS) {
+                switch (f_slot) {
+                    case 1:
+                        NPY_DT_SLOTS(DType)->f.getitem = pfunc;
+                        break;
+                    case 2:
+                        NPY_DT_SLOTS(DType)->f.setitem = pfunc;
+                        break;
+                    case 3:
+                        NPY_DT_SLOTS(DType)->f.copyswapn = pfunc;
+                        break;
+                    case 4:
+                        NPY_DT_SLOTS(DType)->f.copyswap = pfunc;
+                        break;
+                    case 5:
+                        NPY_DT_SLOTS(DType)->f.compare = pfunc;
+                        break;
+                    case 6:
+                        NPY_DT_SLOTS(DType)->f.argmax = pfunc;
+                        break;
+                    case 7:
+                        NPY_DT_SLOTS(DType)->f.dotfunc = pfunc;
+                        break;
+                    case 8:
+                        NPY_DT_SLOTS(DType)->f.scanfunc = pfunc;
+                        break;
+                    case 9:
+                        NPY_DT_SLOTS(DType)->f.fromstr = pfunc;
+                        break;
+                    case 10:
+                        NPY_DT_SLOTS(DType)->f.nonzero = pfunc;
+                        break;
+                    case 11:
+                        NPY_DT_SLOTS(DType)->f.fill = pfunc;
+                        break;
+                    case 12:
+                        NPY_DT_SLOTS(DType)->f.fillwithscalar = pfunc;
+                        break;
+                    case 13:
+                        *NPY_DT_SLOTS(DType)->f.sort = pfunc;
+                        break;
+                    case 14:
+                        *NPY_DT_SLOTS(DType)->f.argsort = pfunc;
+                        break;
+                    case 15:
+                    case 16:
+                    case 17:
+                    case 18:
+                    case 19:
+                    case 20:
+                    case 21:
+                        PyErr_Format(
+                            PyExc_RuntimeError,
+                            "PyArray_ArrFunc casting slot with value %d is disabled.",
+                            f_slot
+                        );
+                        return -1;
+                    case 22:
+                        NPY_DT_SLOTS(DType)->f.argmin = pfunc;
+                        break;
+                }
+            } else {
+                    PyErr_Format(
+                        PyExc_RuntimeError,
+                        "Invalid PyArray_ArrFunc slot with value %d passed in.",
+                        f_slot
+                    );
+                    return -1;
+            }
+        }
     }
+
     if (NPY_DT_SLOTS(DType)->setitem == NULL
             || NPY_DT_SLOTS(DType)->getitem == NULL) {
         PyErr_SetString(PyExc_RuntimeError,
@@ -209,6 +276,12 @@ PyArrayInitDTypeMeta_FromSpec(
         return -1;
     }
 
+    if (NPY_DT_SLOTS(DType)->ensure_canonical == NULL) {
+        PyErr_SetString(PyExc_RuntimeError,
+                        "A DType must provide an ensure_canonical implementation.");
+        return -1;
+    }
+
     /*
      * Now that the spec is read we can check that all required functions were
      * defined by the user.
@@ -223,7 +296,7 @@ PyArrayInitDTypeMeta_FromSpec(
             return -1;
         }
     }
-    NPY_DT_SLOTS(DType)->f = default_funcs;
+
     /* invalid type num. Ideally, we get away with it! */
     DType->type_num = -1;
 
@@ -252,6 +325,14 @@ PyArrayInitDTypeMeta_FromSpec(
     /*
      * And now, register all the casts that are currently defined!
      */
+    if (spec->casts == NULL) {
+        PyErr_SetString(
+            PyExc_RuntimeError,
+            "DType must at least provide a function to cast (or just copy) "
+            "between its own instances!");
+        return -1;
+    }
+
     PyArrayMethod_Spec **next_meth_spec = spec->casts;
     while (1) {
         PyArrayMethod_Spec *meth_spec = *next_meth_spec;
@@ -429,12 +510,12 @@ _get_experimental_dtype_api(PyObject *NPY_UNUSED(mod), PyObject *arg)
     if (error_converting(version)) {
         return NULL;
     }
-    if (version != EXPERIMENTAL_DTYPE_API_VERSION) {
+    if (version != __EXPERIMENTAL_DTYPE_API_VERSION) {
         PyErr_Format(PyExc_RuntimeError,
                 "Experimental DType API version %d requested, but NumPy "
                 "is exporting version %d.  Recompile your DType and/or upgrade "
                 "NumPy to match.",
-                version, EXPERIMENTAL_DTYPE_API_VERSION);
+                version, __EXPERIMENTAL_DTYPE_API_VERSION);
         return NULL;
     }
 
diff --git a/numpy/core/src/multiarray/getset.c b/numpy/core/src/multiarray/getset.c
index ab35548ed..7022eb231 100644
--- a/numpy/core/src/multiarray/getset.c
+++ b/numpy/core/src/multiarray/getset.c
@@ -797,20 +797,17 @@ array_imag_get(PyArrayObject *self, void *NPY_UNUSED(ignored))
     }
     else {
         Py_INCREF(PyArray_DESCR(self));
-        ret = (PyArrayObject *)PyArray_NewFromDescr(Py_TYPE(self),
-                                                    PyArray_DESCR(self),
-                                                    PyArray_NDIM(self),
-                                                    PyArray_DIMS(self),
-                                                    NULL, NULL,
-                                                    PyArray_ISFORTRAN(self),
-                                                    (PyObject *)self);
+        ret = (PyArrayObject *)PyArray_NewFromDescr_int(
+                Py_TYPE(self),
+                PyArray_DESCR(self),
+                PyArray_NDIM(self),
+                PyArray_DIMS(self),
+                NULL, NULL,
+                PyArray_ISFORTRAN(self),
+                (PyObject *)self, NULL, _NPY_ARRAY_ZEROED);
         if (ret == NULL) {
             return NULL;
         }
-        if (_zerofill(ret) < 0) {
-            Py_DECREF(ret);
-            return NULL;
-        }
         PyArray_CLEARFLAGS(ret, NPY_ARRAY_WRITEABLE);
     }
     return (PyObject *) ret;
diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c
index 9fad153a3..676a2a6b4 100644
--- a/numpy/core/src/multiarray/item_selection.c
+++ b/numpy/core/src/multiarray/item_selection.c
@@ -17,10 +17,12 @@
 
 #include "multiarraymodule.h"
 #include "common.h"
+#include "dtype_transfer.h"
 #include "arrayobject.h"
 #include "ctors.h"
 #include "lowlevel_strided_loops.h"
 #include "array_assign.h"
+#include "refcount.h"
 
 #include "npy_sort.h"
 #include "npy_partition.h"
@@ -30,7 +32,7 @@
 #include "array_coercion.h"
 #include "simd/simd.h"
 
-static NPY_GCC_OPT_3 NPY_INLINE int
+static NPY_GCC_OPT_3 inline int
 npy_fasttake_impl(
         char *dest, char *src, const npy_intp *indices,
         npy_intp n, npy_intp m, npy_intp max_item,
@@ -39,7 +41,26 @@ npy_fasttake_impl(
         PyArray_Descr *dtype, int axis)
 {
     NPY_BEGIN_THREADS_DEF;
-    NPY_BEGIN_THREADS_DESCR(dtype);
+
+    NPY_cast_info cast_info;
+    NPY_ARRAYMETHOD_FLAGS flags;
+    NPY_cast_info_init(&cast_info);
+
+    if (!needs_refcounting) {
+        /* if "refcounting" is not needed memcpy is safe for a simple copy  */
+        NPY_BEGIN_THREADS;
+    }
+    else {
+        if (PyArray_GetDTypeTransferFunction(
+                1, itemsize, itemsize, dtype, dtype, 0,
+                &cast_info, &flags) < 0) {
+            return -1;
+        }
+        if (!(flags & NPY_METH_REQUIRES_PYAPI)) {
+            NPY_BEGIN_THREADS;
+        }
+    }
+
     switch (clipmode) {
         case NPY_RAISE:
             for (npy_intp i = 0; i < n; i++) {
@@ -47,22 +68,23 @@ npy_fasttake_impl(
                     npy_intp tmp = indices[j];
                     if (check_and_adjust_index(&tmp, max_item, axis,
                                                _save) < 0) {
-                        return -1;
+                        goto fail;
                     }
                     char *tmp_src = src + tmp * chunk;
                     if (needs_refcounting) {
-                        for (npy_intp k = 0; k < nelem; k++) {
-                            PyArray_Item_INCREF(tmp_src, dtype);
-                            PyArray_Item_XDECREF(dest, dtype);
-                            memmove(dest, tmp_src, itemsize);
-                            dest += itemsize;
-                            tmp_src += itemsize;
+                        char *data[2] = {tmp_src, dest};
+                        npy_intp strides[2] = {itemsize, itemsize};
+                        if (cast_info.func(
+                                &cast_info.context, data, &nelem, strides,
+                                cast_info.auxdata) < 0) {
+                            NPY_END_THREADS;
+                            goto fail;
                         }
                     }
                     else {
-                        memmove(dest, tmp_src, chunk);
-                        dest += chunk;
+                        memcpy(dest, tmp_src, chunk);
                     }
+                    dest += chunk;
                 }
                 src += chunk*max_item;
             }
@@ -83,18 +105,19 @@ npy_fasttake_impl(
                     }
                     char *tmp_src = src + tmp * chunk;
                     if (needs_refcounting) {
-                        for (npy_intp k = 0; k < nelem; k++) {
-                            PyArray_Item_INCREF(tmp_src, dtype);
-                            PyArray_Item_XDECREF(dest, dtype);
-                            memmove(dest, tmp_src, itemsize);
-                            dest += itemsize;
-                            tmp_src += itemsize;
+                        char *data[2] = {tmp_src, dest};
+                        npy_intp strides[2] = {itemsize, itemsize};
+                        if (cast_info.func(
+                                &cast_info.context, data, &nelem, strides,
+                                cast_info.auxdata) < 0) {
+                            NPY_END_THREADS;
+                            goto fail;
                         }
                     }
                     else {
-                        memmove(dest, tmp_src, chunk);
-                        dest += chunk;
+                        memcpy(dest, tmp_src, chunk);
                     }
+                    dest += chunk;
                 }
                 src += chunk*max_item;
             }
@@ -111,18 +134,19 @@ npy_fasttake_impl(
                     }
                     char *tmp_src = src + tmp * chunk;
                     if (needs_refcounting) {
-                        for (npy_intp k = 0; k < nelem; k++) {
-                            PyArray_Item_INCREF(tmp_src, dtype);
-                            PyArray_Item_XDECREF(dest, dtype);
-                            memmove(dest, tmp_src, itemsize);
-                            dest += itemsize;
-                            tmp_src += itemsize;
+                        char *data[2] = {tmp_src, dest};
+                        npy_intp strides[2] = {itemsize, itemsize};
+                        if (cast_info.func(
+                                &cast_info.context, data, &nelem, strides,
+                                cast_info.auxdata) < 0) {
+                            NPY_END_THREADS;
+                            goto fail;
                         }
                     }
                     else {
-                        memmove(dest, tmp_src, chunk);
-                        dest += chunk;
+                        memcpy(dest, tmp_src, chunk);
                     }
+                    dest += chunk;
                 }
                 src += chunk*max_item;
             }
@@ -130,7 +154,13 @@ npy_fasttake_impl(
     }
 
     NPY_END_THREADS;
+    NPY_cast_info_xfree(&cast_info);
     return 0;
+
+  fail:
+    /* NPY_END_THREADS already ensured. */
+    NPY_cast_info_xfree(&cast_info);
+    return -1;
 }
 
 
@@ -322,11 +352,17 @@ PyArray_PutTo(PyArrayObject *self, PyObject* values0, PyObject *indices0,
               NPY_CLIPMODE clipmode)
 {
     PyArrayObject  *indices, *values;
-    npy_intp i, chunk, ni, max_item, nv, tmp;
+    npy_intp i, itemsize, ni, max_item, nv, tmp;
     char *src, *dest;
     int copied = 0;
     int overlap = 0;
 
+    NPY_BEGIN_THREADS_DEF;
+    NPY_cast_info cast_info;
+    NPY_ARRAYMETHOD_FLAGS flags;
+
+    NPY_cast_info_init(&cast_info);
+
     indices = NULL;
     values = NULL;
     if (!PyArray_Check(self)) {
@@ -372,25 +408,51 @@ PyArray_PutTo(PyArrayObject *self, PyObject* values0, PyObject *indices0,
     }
     max_item = PyArray_SIZE(self);
     dest = PyArray_DATA(self);
-    chunk = PyArray_DESCR(self)->elsize;
+    itemsize = PyArray_DESCR(self)->elsize;
+
+    int has_references = PyDataType_REFCHK(PyArray_DESCR(self));
+
+    if (!has_references) {
+        /* if has_references is not needed memcpy is safe for a simple copy  */
+        NPY_BEGIN_THREADS_THRESHOLDED(ni);
+    }
+    else {
+        PyArray_Descr *dtype = PyArray_DESCR(self);
+        if (PyArray_GetDTypeTransferFunction(
+                PyArray_ISALIGNED(self), itemsize, itemsize, dtype, dtype, 0,
+                &cast_info, &flags) < 0) {
+            goto fail;
+        }
+        if (!(flags & NPY_METH_REQUIRES_PYAPI)) {
+            NPY_BEGIN_THREADS_THRESHOLDED(ni);
+        }
+    }
+
+
+    if (has_references) {
+        const npy_intp one = 1;
+        const npy_intp strides[2] = {itemsize, itemsize};
 
-    if (PyDataType_REFCHK(PyArray_DESCR(self))) {
         switch(clipmode) {
         case NPY_RAISE:
             for (i = 0; i < ni; i++) {
-                src = PyArray_BYTES(values) + chunk*(i % nv);
+                src = PyArray_BYTES(values) + itemsize*(i % nv);
                 tmp = ((npy_intp *)(PyArray_DATA(indices)))[i];
-                if (check_and_adjust_index(&tmp, max_item, 0, NULL) < 0) {
+                if (check_and_adjust_index(&tmp, max_item, 0, _save) < 0) {
+                    goto fail;
+                }
+                char *data[2] = {src, dest + tmp*itemsize};
+                if (cast_info.func(
+                        &cast_info.context, data, &one, strides,
+                        cast_info.auxdata) < 0) {
+                    NPY_END_THREADS;
                     goto fail;
                 }
-                PyArray_Item_INCREF(src, PyArray_DESCR(self));
-                PyArray_Item_XDECREF(dest+tmp*chunk, PyArray_DESCR(self));
-                memmove(dest + tmp*chunk, src, chunk);
             }
             break;
         case NPY_WRAP:
             for (i = 0; i < ni; i++) {
-                src = PyArray_BYTES(values) + chunk * (i % nv);
+                src = PyArray_BYTES(values) + itemsize * (i % nv);
                 tmp = ((npy_intp *)(PyArray_DATA(indices)))[i];
                 if (tmp < 0) {
                     while (tmp < 0) {
@@ -402,14 +464,18 @@ PyArray_PutTo(PyArrayObject *self, PyObject* values0, PyObject *indices0,
                         tmp -= max_item;
                     }
                 }
-                PyArray_Item_INCREF(src, PyArray_DESCR(self));
-                PyArray_Item_XDECREF(dest+tmp*chunk, PyArray_DESCR(self));
-                memmove(dest + tmp * chunk, src, chunk);
+                char *data[2] = {src, dest + tmp*itemsize};
+                if (cast_info.func(
+                        &cast_info.context, data, &one, strides,
+                        cast_info.auxdata) < 0) {
+                    NPY_END_THREADS;
+                    goto fail;
+                }
             }
             break;
         case NPY_CLIP:
             for (i = 0; i < ni; i++) {
-                src = PyArray_BYTES(values) + chunk * (i % nv);
+                src = PyArray_BYTES(values) + itemsize * (i % nv);
                 tmp = ((npy_intp *)(PyArray_DATA(indices)))[i];
                 if (tmp < 0) {
                     tmp = 0;
@@ -417,30 +483,32 @@ PyArray_PutTo(PyArrayObject *self, PyObject* values0, PyObject *indices0,
                 else if (tmp >= max_item) {
                     tmp = max_item - 1;
                 }
-                PyArray_Item_INCREF(src, PyArray_DESCR(self));
-                PyArray_Item_XDECREF(dest+tmp*chunk, PyArray_DESCR(self));
-                memmove(dest + tmp * chunk, src, chunk);
+                char *data[2] = {src, dest + tmp*itemsize};
+                if (cast_info.func(
+                        &cast_info.context, data, &one, strides,
+                        cast_info.auxdata) < 0) {
+                    NPY_END_THREADS;
+                    goto fail;
+                }
             }
             break;
         }
     }
     else {
-        NPY_BEGIN_THREADS_DEF;
-        NPY_BEGIN_THREADS_THRESHOLDED(ni);
         switch(clipmode) {
         case NPY_RAISE:
             for (i = 0; i < ni; i++) {
-                src = PyArray_BYTES(values) + chunk * (i % nv);
+                src = PyArray_BYTES(values) + itemsize * (i % nv);
                 tmp = ((npy_intp *)(PyArray_DATA(indices)))[i];
                 if (check_and_adjust_index(&tmp, max_item, 0, _save) < 0) {
                     goto fail;
                 }
-                memmove(dest + tmp * chunk, src, chunk);
+                memmove(dest + tmp * itemsize, src, itemsize);
             }
             break;
         case NPY_WRAP:
             for (i = 0; i < ni; i++) {
-                src = PyArray_BYTES(values) + chunk * (i % nv);
+                src = PyArray_BYTES(values) + itemsize * (i % nv);
                 tmp = ((npy_intp *)(PyArray_DATA(indices)))[i];
                 if (tmp < 0) {
                     while (tmp < 0) {
@@ -452,12 +520,12 @@ PyArray_PutTo(PyArrayObject *self, PyObject* values0, PyObject *indices0,
                         tmp -= max_item;
                     }
                 }
-                memmove(dest + tmp * chunk, src, chunk);
+                memmove(dest + tmp * itemsize, src, itemsize);
             }
             break;
         case NPY_CLIP:
             for (i = 0; i < ni; i++) {
-                src = PyArray_BYTES(values) + chunk * (i % nv);
+                src = PyArray_BYTES(values) + itemsize * (i % nv);
                 tmp = ((npy_intp *)(PyArray_DATA(indices)))[i];
                 if (tmp < 0) {
                     tmp = 0;
@@ -465,14 +533,16 @@ PyArray_PutTo(PyArrayObject *self, PyObject* values0, PyObject *indices0,
                 else if (tmp >= max_item) {
                     tmp = max_item - 1;
                 }
-                memmove(dest + tmp * chunk, src, chunk);
+                memmove(dest + tmp * itemsize, src, itemsize);
             }
             break;
         }
-        NPY_END_THREADS;
     }
+    NPY_END_THREADS;
 
  finish:
+    NPY_cast_info_xfree(&cast_info);
+
     Py_XDECREF(values);
     Py_XDECREF(indices);
     if (copied) {
@@ -482,6 +552,8 @@ PyArray_PutTo(PyArrayObject *self, PyObject* values0, PyObject *indices0,
     Py_RETURN_NONE;
 
  fail:
+    NPY_cast_info_xfree(&cast_info);
+
     Py_XDECREF(indices);
     Py_XDECREF(values);
     if (copied) {
@@ -492,7 +564,7 @@ PyArray_PutTo(PyArrayObject *self, PyObject* values0, PyObject *indices0,
 }
 
 
-static NPY_GCC_OPT_3 NPY_INLINE void
+static NPY_GCC_OPT_3 inline void
 npy_fastputmask_impl(
         char *dest, char *src, const npy_bool *mask_data,
         npy_intp ni, npy_intp nv, npy_intp chunk)
@@ -562,11 +634,12 @@ PyArray_PutMask(PyArrayObject *self, PyObject* values0, PyObject* mask0)
 {
     PyArrayObject *mask, *values;
     PyArray_Descr *dtype;
-    npy_intp chunk, ni, nv;
+    npy_intp itemsize, ni, nv;
     char *src, *dest;
     npy_bool *mask_data;
     int copied = 0;
     int overlap = 0;
+    NPY_BEGIN_THREADS_DEF;
 
     mask = NULL;
     values = NULL;
@@ -627,31 +700,49 @@ PyArray_PutMask(PyArrayObject *self, PyObject* values0, PyObject* mask0)
         self = obj;
     }
 
-    chunk = PyArray_DESCR(self)->elsize;
+    itemsize = PyArray_DESCR(self)->elsize;
     dest = PyArray_DATA(self);
 
     if (PyDataType_REFCHK(PyArray_DESCR(self))) {
+        NPY_cast_info cast_info;
+        NPY_ARRAYMETHOD_FLAGS flags;
+        const npy_intp one = 1;
+        const npy_intp strides[2] = {itemsize, itemsize};
+
+        NPY_cast_info_init(&cast_info);
+        if (PyArray_GetDTypeTransferFunction(
+                PyArray_ISALIGNED(self), itemsize, itemsize, dtype, dtype, 0,
+                &cast_info, &flags) < 0) {
+            goto fail;
+        }
+        if (!(flags & NPY_METH_REQUIRES_PYAPI)) {
+            NPY_BEGIN_THREADS;
+        }
+
         for (npy_intp i = 0, j = 0; i < ni; i++, j++) {
             if (j >= nv) {
                 j = 0;
             }
             if (mask_data[i]) {
-                char *src_ptr = src + j*chunk;
-                char *dest_ptr = dest + i*chunk;
-
-                PyArray_Item_INCREF(src_ptr, PyArray_DESCR(self));
-                PyArray_Item_XDECREF(dest_ptr, PyArray_DESCR(self));
-                memmove(dest_ptr, src_ptr, chunk);
+                char *data[2] = {src + j*itemsize, dest + i*itemsize};
+                if (cast_info.func(
+                        &cast_info.context, data, &one, strides,
+                        cast_info.auxdata) < 0) {
+                    NPY_END_THREADS;
+                    NPY_cast_info_xfree(&cast_info);
+                    goto fail;
+                }
             }
         }
+        NPY_cast_info_xfree(&cast_info);
     }
     else {
-        NPY_BEGIN_THREADS_DEF;
-        NPY_BEGIN_THREADS_DESCR(PyArray_DESCR(self));
-        npy_fastputmask(dest, src, mask_data, ni, nv, chunk);
-        NPY_END_THREADS;
+        NPY_BEGIN_THREADS;
+        npy_fastputmask(dest, src, mask_data, ni, nv, itemsize);
     }
 
+    NPY_END_THREADS;
+
     Py_XDECREF(values);
     Py_XDECREF(mask);
     if (copied) {
@@ -670,6 +761,81 @@ PyArray_PutMask(PyArrayObject *self, PyObject* values0, PyObject* mask0)
     return NULL;
 }
 
+static NPY_GCC_OPT_3 inline int
+npy_fastrepeat_impl(
+    npy_intp n_outer, npy_intp n, npy_intp nel, npy_intp chunk,
+    npy_bool broadcast, npy_intp* counts, char* new_data, char* old_data,
+    npy_intp elsize, NPY_cast_info cast_info, int needs_refcounting)
+{
+    npy_intp i, j, k;
+    for (i = 0; i < n_outer; i++) {
+        for (j = 0; j < n; j++) {
+            npy_intp tmp = broadcast ? counts[0] : counts[j];
+            for (k = 0; k < tmp; k++) {
+                if (!needs_refcounting) {
+                    memcpy(new_data, old_data, chunk);
+                }
+                else {
+                    char *data[2] = {old_data, new_data};
+                    npy_intp strides[2] = {elsize, elsize};
+                    if (cast_info.func(&cast_info.context, data, &nel,
+                                       strides, cast_info.auxdata) < 0) {
+                        return -1;
+                    }
+                }
+                new_data += chunk;
+            }
+            old_data += chunk;
+        }
+    }
+    return 0;
+}
+
+static NPY_GCC_OPT_3 int
+npy_fastrepeat(
+    npy_intp n_outer, npy_intp n, npy_intp nel, npy_intp chunk,
+    npy_bool broadcast, npy_intp* counts, char* new_data, char* old_data,
+    npy_intp elsize, NPY_cast_info cast_info, int needs_refcounting)
+{
+    if (!needs_refcounting) {
+        if (chunk == 1) {
+            return npy_fastrepeat_impl(
+                n_outer, n, nel, chunk, broadcast, counts, new_data, old_data,
+                elsize, cast_info, needs_refcounting);
+        }
+        if (chunk == 2) {
+            return npy_fastrepeat_impl(
+                n_outer, n, nel, chunk, broadcast, counts, new_data, old_data,
+                elsize, cast_info, needs_refcounting);
+        }
+        if (chunk == 4) {
+            return npy_fastrepeat_impl(
+                n_outer, n, nel, chunk, broadcast, counts, new_data, old_data,
+                elsize, cast_info, needs_refcounting);
+        }
+        if (chunk == 8) {
+            return npy_fastrepeat_impl(
+                n_outer, n, nel, chunk, broadcast, counts, new_data, old_data,
+                elsize, cast_info, needs_refcounting);
+        }
+        if (chunk == 16) {
+            return npy_fastrepeat_impl(
+                n_outer, n, nel, chunk, broadcast, counts, new_data, old_data,
+                elsize, cast_info, needs_refcounting);
+        }
+        if (chunk == 32) {
+            return npy_fastrepeat_impl(
+                n_outer, n, nel, chunk, broadcast, counts, new_data, old_data,
+                elsize, cast_info, needs_refcounting);
+        }
+    }
+
+    return npy_fastrepeat_impl(
+        n_outer, n, nel, chunk, broadcast, counts, new_data, old_data, elsize,
+        cast_info, needs_refcounting);    
+}
+
+
 /*NUMPY_API
  * Repeat the array.
  */
@@ -677,13 +843,16 @@ NPY_NO_EXPORT PyObject *
 PyArray_Repeat(PyArrayObject *aop, PyObject *op, int axis)
 {
     npy_intp *counts;
-    npy_intp n, n_outer, i, j, k, chunk;
+    npy_intp i, j, n, n_outer, chunk, elsize, nel;
     npy_intp total = 0;
     npy_bool broadcast = NPY_FALSE;
     PyArrayObject *repeats = NULL;
     PyObject *ap = NULL;
     PyArrayObject *ret = NULL;
     char *new_data, *old_data;
+    NPY_cast_info cast_info;
+    NPY_ARRAYMETHOD_FLAGS flags;
+    int needs_refcounting;
 
     repeats = (PyArrayObject *)PyArray_ContiguousFromAny(op, NPY_INTP, 0, 1);
     if (repeats == NULL) {
@@ -707,6 +876,8 @@ PyArray_Repeat(PyArrayObject *aop, PyObject *op, int axis)
 
     aop = (PyArrayObject *)ap;
     n = PyArray_DIM(aop, axis);
+    NPY_cast_info_init(&cast_info);
+    needs_refcounting = PyDataType_REFCHK(PyArray_DESCR(aop));
 
     if (!broadcast && PyArray_SIZE(repeats) != n) {
         PyErr_Format(PyExc_ValueError,
@@ -744,35 +915,41 @@ PyArray_Repeat(PyArrayObject *aop, PyObject *op, int axis)
     new_data = PyArray_DATA(ret);
     old_data = PyArray_DATA(aop);
 
-    chunk = PyArray_DESCR(aop)->elsize;
+    nel = 1;
+    elsize = PyArray_DESCR(aop)->elsize;
     for(i = axis + 1; i < PyArray_NDIM(aop); i++) {
-        chunk *= PyArray_DIMS(aop)[i];
+        nel *= PyArray_DIMS(aop)[i];
     }
+    chunk = nel*elsize;
 
     n_outer = 1;
     for (i = 0; i < axis; i++) {
         n_outer *= PyArray_DIMS(aop)[i];
     }
-    for (i = 0; i < n_outer; i++) {
-        for (j = 0; j < n; j++) {
-            npy_intp tmp = broadcast ? counts[0] : counts[j];
-            for (k = 0; k < tmp; k++) {
-                memcpy(new_data, old_data, chunk);
-                new_data += chunk;
-            }
-            old_data += chunk;
+
+    if (needs_refcounting) {
+        if (PyArray_GetDTypeTransferFunction(
+                1, elsize, elsize, PyArray_DESCR(aop), PyArray_DESCR(aop), 0,
+                &cast_info, &flags) < 0) {
+            goto fail;
         }
     }
 
+    if (npy_fastrepeat(n_outer, n, nel, chunk, broadcast, counts, new_data,
+                       old_data, elsize, cast_info, needs_refcounting) < 0) {
+        goto fail;
+    }
+
     Py_DECREF(repeats);
-    PyArray_INCREF(ret);
     Py_XDECREF(aop);
+    NPY_cast_info_xfree(&cast_info);
     return (PyObject *)ret;
 
  fail:
     Py_DECREF(repeats);
     Py_XDECREF(aop);
     Py_XDECREF(ret);
+    NPY_cast_info_xfree(&cast_info);
     return NULL;
 }
 
@@ -946,7 +1123,7 @@ _new_sortlike(PyArrayObject *op, int axis, PyArray_SortFunc *sort,
     npy_intp astride = PyArray_STRIDE(op, axis);
     int swap = PyArray_ISBYTESWAPPED(op);
     int needcopy = !IsAligned(op) || swap || astride != elsize;
-    int hasrefs = PyDataType_REFCHK(PyArray_DESCR(op));
+    int needs_api = PyDataType_FLAGCHK(PyArray_DESCR(op), NPY_NEEDS_PYAPI);
 
     PyArray_CopySwapNFunc *copyswapn = PyArray_DESCR(op)->f->copyswapn;
     char *buffer = NULL;
@@ -980,6 +1157,9 @@ _new_sortlike(PyArrayObject *op, int axis, PyArray_SortFunc *sort,
             ret = -1;
             goto fail;
         }
+        if (PyDataType_FLAGCHK(PyArray_DESCR(op), NPY_NEEDS_INIT)) {
+            memset(buffer, 0, N * elsize);
+        }
     }
 
     NPY_BEGIN_THREADS_DESCR(PyArray_DESCR(op));
@@ -988,25 +1168,7 @@ _new_sortlike(PyArrayObject *op, int axis, PyArray_SortFunc *sort,
         char *bufptr = it->dataptr;
 
         if (needcopy) {
-            if (hasrefs) {
-                /*
-                 * For dtype's with objects, copyswapn Py_XINCREF's src
-                 * and Py_XDECREF's dst. This would crash if called on
-                 * an uninitialized buffer, or leak a reference to each
-                 * object if initialized.
-                 *
-                 * So, first do the copy with no refcounting...
-                 */
-                _unaligned_strided_byte_copy(buffer, elsize,
-                                             it->dataptr, astride, N, elsize);
-                /* ...then swap in-place if needed */
-                if (swap) {
-                    copyswapn(buffer, elsize, NULL, 0, N, swap, op);
-                }
-            }
-            else {
-                copyswapn(buffer, elsize, it->dataptr, astride, N, swap, op);
-            }
+            copyswapn(buffer, elsize, it->dataptr, astride, N, swap, op);
             bufptr = buffer;
         }
         /*
@@ -1019,7 +1181,7 @@ _new_sortlike(PyArrayObject *op, int axis, PyArray_SortFunc *sort,
 
         if (part == NULL) {
             ret = sort(bufptr, N, op);
-            if (hasrefs && PyErr_Occurred()) {
+            if (needs_api && PyErr_Occurred()) {
                 ret = -1;
             }
             if (ret < 0) {
@@ -1032,7 +1194,7 @@ _new_sortlike(PyArrayObject *op, int axis, PyArray_SortFunc *sort,
             npy_intp i;
             for (i = 0; i < nkth; ++i) {
                 ret = part(bufptr, N, kth[i], pivots, &npiv, op);
-                if (hasrefs && PyErr_Occurred()) {
+                if (needs_api && PyErr_Occurred()) {
                     ret = -1;
                 }
                 if (ret < 0) {
@@ -1042,16 +1204,7 @@ _new_sortlike(PyArrayObject *op, int axis, PyArray_SortFunc *sort,
         }
 
         if (needcopy) {
-            if (hasrefs) {
-                if (swap) {
-                    copyswapn(buffer, elsize, NULL, 0, N, swap, op);
-                }
-                _unaligned_strided_byte_copy(it->dataptr, astride,
-                                             buffer, elsize, N, elsize);
-            }
-            else {
-                copyswapn(it->dataptr, astride, buffer, elsize, N, swap, op);
-            }
+            copyswapn(it->dataptr, astride, buffer, elsize, N, swap, op);
         }
 
         PyArray_ITER_NEXT(it);
@@ -1060,7 +1213,10 @@ _new_sortlike(PyArrayObject *op, int axis, PyArray_SortFunc *sort,
 fail:
     NPY_END_THREADS_DESCR(PyArray_DESCR(op));
     /* cleanup internal buffer */
-    PyDataMem_UserFREE(buffer, N * elsize, mem_handler);
+    if (needcopy) {
+        PyArray_ClearBuffer(PyArray_DESCR(op), buffer, elsize, N, 1);
+        PyDataMem_UserFREE(buffer, N * elsize, mem_handler);
+    }
     if (ret < 0 && !PyErr_Occurred()) {
         /* Out of memory during sorting or buffer creation */
         PyErr_NoMemory();
@@ -1081,7 +1237,7 @@ _new_argsortlike(PyArrayObject *op, int axis, PyArray_ArgSortFunc *argsort,
     npy_intp astride = PyArray_STRIDE(op, axis);
     int swap = PyArray_ISBYTESWAPPED(op);
     int needcopy = !IsAligned(op) || swap || astride != elsize;
-    int hasrefs = PyDataType_REFCHK(PyArray_DESCR(op));
+    int needs_api = PyDataType_FLAGCHK(PyArray_DESCR(op), NPY_NEEDS_PYAPI);
     int needidxbuffer;
 
     PyArray_CopySwapNFunc *copyswapn = PyArray_DESCR(op)->f->copyswapn;
@@ -1134,6 +1290,9 @@ _new_argsortlike(PyArrayObject *op, int axis, PyArray_ArgSortFunc *argsort,
             ret = -1;
             goto fail;
         }
+        if (PyDataType_FLAGCHK(PyArray_DESCR(op), NPY_NEEDS_INIT)) {
+            memset(valbuffer, 0, N * elsize);
+        }
     }
 
     if (needidxbuffer) {
@@ -1153,26 +1312,7 @@ _new_argsortlike(PyArrayObject *op, int axis, PyArray_ArgSortFunc *argsort,
         npy_intp *iptr, i;
 
         if (needcopy) {
-            if (hasrefs) {
-                /*
-                 * For dtype's with objects, copyswapn Py_XINCREF's src
-                 * and Py_XDECREF's dst. This would crash if called on
-                 * an uninitialized valbuffer, or leak a reference to
-                 * each object item if initialized.
-                 *
-                 * So, first do the copy with no refcounting...
-                 */
-                 _unaligned_strided_byte_copy(valbuffer, elsize,
-                                              it->dataptr, astride, N, elsize);
-                /* ...then swap in-place if needed */
-                if (swap) {
-                    copyswapn(valbuffer, elsize, NULL, 0, N, swap, op);
-                }
-            }
-            else {
-                copyswapn(valbuffer, elsize,
-                          it->dataptr, astride, N, swap, op);
-            }
+            copyswapn(valbuffer, elsize, it->dataptr, astride, N, swap, op);
             valptr = valbuffer;
         }
 
@@ -1188,7 +1328,7 @@ _new_argsortlike(PyArrayObject *op, int axis, PyArray_ArgSortFunc *argsort,
         if (argpart == NULL) {
             ret = argsort(valptr, idxptr, N, op);
             /* Object comparisons may raise an exception in Python 3 */
-            if (hasrefs && PyErr_Occurred()) {
+            if (needs_api && PyErr_Occurred()) {
                 ret = -1;
             }
             if (ret < 0) {
@@ -1202,7 +1342,7 @@ _new_argsortlike(PyArrayObject *op, int axis, PyArray_ArgSortFunc *argsort,
             for (i = 0; i < nkth; ++i) {
                 ret = argpart(valptr, idxptr, N, kth[i], pivots, &npiv, op);
                 /* Object comparisons may raise an exception in Python 3 */
-                if (hasrefs && PyErr_Occurred()) {
+                if (needs_api && PyErr_Occurred()) {
                     ret = -1;
                 }
                 if (ret < 0) {
@@ -1228,7 +1368,10 @@ _new_argsortlike(PyArrayObject *op, int axis, PyArray_ArgSortFunc *argsort,
 fail:
     NPY_END_THREADS_DESCR(PyArray_DESCR(op));
     /* cleanup internal buffers */
-    PyDataMem_UserFREE(valbuffer, N * elsize, mem_handler);
+    if (needcopy) {
+        PyArray_ClearBuffer(PyArray_DESCR(op), valbuffer, elsize, N, 1);
+        PyDataMem_UserFREE(valbuffer, N * elsize, mem_handler);
+    }
     PyDataMem_UserFREE(idxbuffer, N * sizeof(npy_intp), mem_handler);
     if (ret < 0) {
         if (!PyErr_Occurred()) {
@@ -2113,7 +2256,7 @@ PyArray_Compress(PyArrayObject *self, PyObject *condition, int axis,
  * even though it uses 64 bit types its faster than the bytewise sum on 32 bit
  * but a 32 bit type version would make it even faster on these platforms
  */
-static NPY_INLINE npy_intp
+static inline npy_intp
 count_nonzero_bytes_384(const npy_uint64 * w)
 {
     const npy_uint64 w1 = w[0];
@@ -2209,7 +2352,7 @@ count_zero_bytes_u16(const npy_uint8 **d, const npy_uint8 *end, npy_uint16 max_c
  *                          !!
  *                     (2*16-1)*16
 */
-static NPY_INLINE NPY_GCC_OPT_3 npy_intp
+static inline NPY_GCC_OPT_3 npy_intp
 count_nonzero_u8(const char *data, npy_intp bstride, npy_uintp len)
 {
     npy_intp count = 0;
@@ -2245,7 +2388,7 @@ count_nonzero_u8(const char *data, npy_intp bstride, npy_uintp len)
     return count;
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 npy_intp
+static inline NPY_GCC_OPT_3 npy_intp
 count_nonzero_u16(const char *data, npy_intp bstride, npy_uintp len)
 {
     npy_intp count = 0;
@@ -2277,7 +2420,7 @@ count_nonzero_u16(const char *data, npy_intp bstride, npy_uintp len)
     return count;
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 npy_intp
+static inline NPY_GCC_OPT_3 npy_intp
 count_nonzero_u32(const char *data, npy_intp bstride, npy_uintp len)
 {
     npy_intp count = 0;
@@ -2307,7 +2450,7 @@ count_nonzero_u32(const char *data, npy_intp bstride, npy_uintp len)
     return count;
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 npy_intp
+static inline NPY_GCC_OPT_3 npy_intp
 count_nonzero_u64(const char *data, npy_intp bstride, npy_uintp len)
 {
     npy_intp count = 0;
@@ -2642,7 +2785,7 @@ PyArray_Nonzero(PyArrayObject *self)
                 }
             }
             /*
-             * Fallback to a branchless strategy to avoid branch misprediction 
+             * Fallback to a branchless strategy to avoid branch misprediction
              * stalls that are very expensive on most modern processors.
              */
             else {
diff --git a/numpy/core/src/multiarray/iterators.c b/numpy/core/src/multiarray/iterators.c
index 95aa11d2d..4e6418bf9 100644
--- a/numpy/core/src/multiarray/iterators.c
+++ b/numpy/core/src/multiarray/iterators.c
@@ -1241,7 +1241,7 @@ PyArray_Broadcast(PyArrayMultiIterObject *mit)
     return 0;
 }
 
-static NPY_INLINE PyObject*
+static inline PyObject*
 multiiter_wrong_number_of_args(void)
 {
     return PyErr_Format(PyExc_ValueError,
@@ -1615,7 +1615,7 @@ get_ptr_constant(PyArrayIterObject* _iter, const npy_intp *coordinates)
  * value  2  3  3  2  1 1 2 3 3 2 1 1
  *
  * _npy_pos_index_mirror(4, 3) will return 1, because x[4] = x[1]*/
-NPY_INLINE static npy_intp
+static inline npy_intp
 __npy_pos_remainder(npy_intp i, npy_intp n)
 {
     npy_intp k, l, j;
@@ -1661,7 +1661,7 @@ get_ptr_mirror(PyArrayIterObject* _iter, const npy_intp *coordinates)
 #undef _INF_SET_PTR_MIRROR
 
 /* compute l such as i = k * n + l, 0 <= l < |k| */
-NPY_INLINE static npy_intp
+static inline npy_intp
 __npy_euclidean_division(npy_intp i, npy_intp n)
 {
     npy_intp l;
diff --git a/numpy/core/src/multiarray/mapping.c b/numpy/core/src/multiarray/mapping.c
index 98c2d7eda..b7fcf2ee2 100644
--- a/numpy/core/src/multiarray/mapping.c
+++ b/numpy/core/src/multiarray/mapping.c
@@ -160,7 +160,7 @@ PyArray_MapIterSwapAxes(PyArrayMapIterObject *mit, PyArrayObject **ret, int getm
     *ret = (PyArrayObject *)new;
 }
 
-static NPY_INLINE void
+static inline void
 multi_DECREF(PyObject **objects, npy_intp n)
 {
     npy_intp i;
@@ -176,7 +176,7 @@ multi_DECREF(PyObject **objects, npy_intp n)
  * Useful if a tuple is being iterated over multiple times, or for a code path
  * that doesn't always want the overhead of allocating a tuple.
  */
-static NPY_INLINE npy_intp
+static inline npy_intp
 unpack_tuple(PyTupleObject *index, PyObject **result, npy_intp result_n)
 {
     npy_intp n, i;
@@ -194,7 +194,7 @@ unpack_tuple(PyTupleObject *index, PyObject **result, npy_intp result_n)
 }
 
 /* Unpack a single scalar index, taking a new reference to match unpack_tuple */
-static NPY_INLINE npy_intp
+static inline npy_intp
 unpack_scalar(PyObject *index, PyObject **result, npy_intp NPY_UNUSED(result_n))
 {
     Py_INCREF(index);
@@ -889,13 +889,13 @@ get_view_from_index(PyArrayObject *self, PyArrayObject **view,
 
     /* Create the new view and set the base array */
     Py_INCREF(PyArray_DESCR(self));
-    *view = (PyArrayObject *)PyArray_NewFromDescrAndBase(
+    *view = (PyArrayObject *)PyArray_NewFromDescr_int(
             ensure_array ? &PyArray_Type : Py_TYPE(self),
             PyArray_DESCR(self),
             new_dim, new_shape, new_strides, data_ptr,
             PyArray_FLAGS(self),
             ensure_array ? NULL : (PyObject *)self,
-            (PyObject *)self);
+            (PyObject *)self, _NPY_ARRAY_ENSURE_DTYPE_IDENTITY);
     if (*view == NULL) {
         return -1;
     }
@@ -1361,7 +1361,8 @@ _get_field_view(PyArrayObject *arr, PyObject *ind, PyArrayObject **view)
                 PyArray_BYTES(arr) + offset,
                 PyArray_FLAGS(arr),
                 (PyObject *)arr, (PyObject *)arr,
-                0, 1);
+                /* We do not preserve the dtype for a subarray one, only str */
+                _NPY_ARRAY_ALLOW_EMPTY_STRING);
         if (*view == NULL) {
             return 0;
         }
@@ -1415,7 +1416,8 @@ _get_field_view(PyArrayObject *arr, PyObject *ind, PyArrayObject **view)
                 PyArray_DATA(arr),
                 PyArray_FLAGS(arr),
                 (PyObject *)arr, (PyObject *)arr,
-                0, 1);
+                /* We do not preserve the dtype for a subarray one, only str */
+                _NPY_ARRAY_ALLOW_EMPTY_STRING);
 
         if (*view == NULL) {
             return 0;
diff --git a/numpy/core/src/multiarray/methods.c b/numpy/core/src/multiarray/methods.c
index 6ec3798eb..bc3efc295 100644
--- a/numpy/core/src/multiarray/methods.c
+++ b/numpy/core/src/multiarray/methods.c
@@ -21,12 +21,14 @@
 #include "ctors.h"
 #include "calculation.h"
 #include "convert_datatype.h"
+#include "descriptor.h"
 #include "item_selection.h"
 #include "conversion_utils.h"
 #include "shape.h"
 #include "strfuncs.h"
 #include "array_assign.h"
 #include "npy_dlpack.h"
+#include "multiarraymodule.h"
 
 #include "methods.h"
 #include "alloc.h"
@@ -435,7 +437,7 @@ PyArray_GetField(PyArrayObject *self, PyArray_Descr *typed, int offset)
             PyArray_BYTES(self) + offset,
             PyArray_FLAGS(self) & ~NPY_ARRAY_F_CONTIGUOUS,
             (PyObject *)self, (PyObject *)self,
-            0, 1);
+            _NPY_ARRAY_ALLOW_EMPTY_STRING);
     return ret;
 }
 
@@ -851,29 +853,35 @@ static PyObject *
 array_astype(PyArrayObject *self,
         PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames)
 {
-    PyArray_Descr *dtype = NULL;
     /*
      * TODO: UNSAFE default for compatibility, I think
      *       switching to SAME_KIND by default would be good.
      */
+    npy_dtype_info dt_info;
     NPY_CASTING casting = NPY_UNSAFE_CASTING;
     NPY_ORDER order = NPY_KEEPORDER;
     _PyArray_CopyMode forcecopy = 1;
     int subok = 1;
+
     NPY_PREPARE_ARGPARSER;
     if (npy_parse_arguments("astype", args, len_args, kwnames,
-            "dtype", &PyArray_DescrConverter, &dtype,
+            "dtype", &PyArray_DTypeOrDescrConverterRequired, &dt_info,
             "|order", &PyArray_OrderConverter, &order,
             "|casting", &PyArray_CastingConverter, &casting,
             "|subok", &PyArray_PythonPyIntFromInt, &subok,
             "|copy", &PyArray_CopyConverter, &forcecopy,
             NULL, NULL, NULL) < 0) {
-        Py_XDECREF(dtype);
+        Py_XDECREF(dt_info.descr);
+        Py_XDECREF(dt_info.dtype);
         return NULL;
     }
 
     /* If it is not a concrete dtype instance find the best one for the array */
-    Py_SETREF(dtype, PyArray_AdaptDescriptorToArray(self, (PyObject *)dtype));
+    PyArray_Descr *dtype;
+
+    dtype = PyArray_AdaptDescriptorToArray(self, dt_info.dtype, dt_info.descr);
+    Py_XDECREF(dt_info.descr);
+    Py_DECREF(dt_info.dtype);
     if (dtype == NULL) {
         return NULL;
     }
@@ -916,29 +924,28 @@ array_astype(PyArrayObject *self,
 
     PyArrayObject *ret;
 
-    /* This steals the reference to dtype, so no DECREF of dtype */
+    /* This steals the reference to dtype */
+    Py_INCREF(dtype);
     ret = (PyArrayObject *)PyArray_NewLikeArray(
                                 self, order, dtype, subok);
     if (ret == NULL) {
+        Py_DECREF(dtype);
         return NULL;
     }
-    /* NumPy 1.20, 2020-10-01 */
-    if ((PyArray_NDIM(self) != PyArray_NDIM(ret)) &&
-            DEPRECATE_FUTUREWARNING(
-                "casting an array to a subarray dtype "
-                "will not use broadcasting in the future, but cast each "
-                "element to the new dtype and then append the dtype's shape "
-                "to the new array. You can opt-in to the new behaviour, by "
-                "additional field to the cast: "
-                "`arr.astype(np.dtype([('f', dtype)]))['f']`.\n"
-                "This may lead to a different result or to current failures "
-                "succeeding.  "
-                "(FutureWarning since NumPy 1.20)") < 0) {
-        Py_DECREF(ret);
-        return NULL;
-    }
 
-    if (PyArray_CopyInto(ret, self) < 0) {
+    /* Decrease the number of dimensions removing subarray ones again */
+    int out_ndim = PyArray_NDIM(ret);
+    PyArray_Descr *out_descr = PyArray_DESCR(ret);
+    ((PyArrayObject_fields *)ret)->nd = PyArray_NDIM(self);
+    ((PyArrayObject_fields *)ret)->descr = dtype;
+
+    int success = PyArray_CopyInto(ret, self);
+
+    Py_DECREF(dtype);
+    ((PyArrayObject_fields *)ret)->nd = out_ndim;
+    ((PyArrayObject_fields *)ret)->descr = out_descr;
+
+    if (success < 0) {
         Py_DECREF(ret);
         return NULL;
     }
@@ -1095,7 +1102,7 @@ any_array_ufunc_overrides(PyObject *args, PyObject *kwds)
     int nin, nout;
     PyObject *out_kwd_obj;
     PyObject *fast;
-    PyObject **in_objs, **out_objs;
+    PyObject **in_objs, **out_objs, *where_obj;
 
     /* check inputs */
     nin = PyTuple_Size(args);
@@ -1126,6 +1133,17 @@ any_array_ufunc_overrides(PyObject *args, PyObject *kwds)
         }
     }
     Py_DECREF(out_kwd_obj);
+    /* check where if it exists */
+    where_obj = PyDict_GetItemWithError(kwds, npy_ma_str_where);
+    if (where_obj == NULL) {
+        if (PyErr_Occurred()) {
+            return -1;
+        }
+    } else {
+        if (PyUFunc_HasOverride(where_obj)){
+            return 1;
+        }
+    }
     return 0;
 }
 
@@ -1657,7 +1675,7 @@ array_deepcopy(PyArrayObject *self, PyObject *args)
 {
     PyArrayObject *copied_array;
     PyObject *visit;
-    NpyIter *iter;
+    NpyIter *iter = NULL;
     NpyIter_IterNextFunc *iternext;
     char *data;
     char **dataptr;
@@ -1673,64 +1691,73 @@ array_deepcopy(PyArrayObject *self, PyObject *args)
     if (copied_array == NULL) {
         return NULL;
     }
-    if (PyDataType_REFCHK(PyArray_DESCR(self))) {
-        copy = PyImport_ImportModule("copy");
-        if (copy == NULL) {
-            Py_DECREF(copied_array);
-            return NULL;
-        }
-        deepcopy = PyObject_GetAttrString(copy, "deepcopy");
-        Py_DECREF(copy);
-        if (deepcopy == NULL) {
-            Py_DECREF(copied_array);
-            return NULL;
-        }
-        iter = NpyIter_New(copied_array,
-                           NPY_ITER_READWRITE |
-                           NPY_ITER_EXTERNAL_LOOP |
-                           NPY_ITER_REFS_OK |
-                           NPY_ITER_ZEROSIZE_OK,
-                           NPY_KEEPORDER, NPY_NO_CASTING,
-                           NULL);
-        if (iter == NULL) {
-            Py_DECREF(deepcopy);
-            Py_DECREF(copied_array);
-            return NULL;
-        }
-        if (NpyIter_GetIterSize(iter) != 0) {
-            iternext = NpyIter_GetIterNext(iter, NULL);
-            if (iternext == NULL) {
-                NpyIter_Deallocate(iter);
-                Py_DECREF(deepcopy);
-                Py_DECREF(copied_array);
-                return NULL;
-            }
 
-            dataptr = NpyIter_GetDataPtrArray(iter);
-            strideptr = NpyIter_GetInnerStrideArray(iter);
-            innersizeptr = NpyIter_GetInnerLoopSizePtr(iter);
-
-            do {
-                data = *dataptr;
-                stride = *strideptr;
-                count = *innersizeptr;
-                while (count--) {
-                    deepcopy_res = _deepcopy_call(data, data, PyArray_DESCR(copied_array),
-                                                  deepcopy, visit);
-                    if (deepcopy_res == -1) {
-                        return NULL;
-                    }
+    if (!PyDataType_REFCHK(PyArray_DESCR(self))) {
+        return (PyObject *)copied_array;
+    }
 
-                    data += stride;
+    /* If the array contains objects, need to deepcopy them as well */
+    copy = PyImport_ImportModule("copy");
+    if (copy == NULL) {
+        Py_DECREF(copied_array);
+        return NULL;
+    }
+    deepcopy = PyObject_GetAttrString(copy, "deepcopy");
+    Py_DECREF(copy);
+    if (deepcopy == NULL) {
+        goto error;
+    }
+    iter = NpyIter_New(copied_array,
+                        NPY_ITER_READWRITE |
+                        NPY_ITER_EXTERNAL_LOOP |
+                        NPY_ITER_REFS_OK |
+                        NPY_ITER_ZEROSIZE_OK,
+                        NPY_KEEPORDER, NPY_NO_CASTING,
+                        NULL);
+    if (iter == NULL) {
+        goto error;
+    }
+    if (NpyIter_GetIterSize(iter) != 0) {
+        iternext = NpyIter_GetIterNext(iter, NULL);
+        if (iternext == NULL) {
+            goto error;
+        }
+
+        dataptr = NpyIter_GetDataPtrArray(iter);
+        strideptr = NpyIter_GetInnerStrideArray(iter);
+        innersizeptr = NpyIter_GetInnerLoopSizePtr(iter);
+
+        do {
+            data = *dataptr;
+            stride = *strideptr;
+            count = *innersizeptr;
+            while (count--) {
+                deepcopy_res = _deepcopy_call(data, data, PyArray_DESCR(copied_array),
+                                                deepcopy, visit);
+                if (deepcopy_res == -1) {
+                    goto error;
                 }
-            } while (iternext(iter));
-        }
-        NpyIter_Deallocate(iter);
-        Py_DECREF(deepcopy);
+
+                data += stride;
+            }
+        } while (iternext(iter));
+    }
+
+    Py_DECREF(deepcopy);
+    if (!NpyIter_Deallocate(iter)) {
+        Py_DECREF(copied_array);
+        return NULL;
     }
-    return (PyObject*) copied_array;
+    return (PyObject *)copied_array;
+
+  error:
+    Py_DECREF(deepcopy);
+    Py_DECREF(copied_array);
+    NpyIter_Deallocate(iter);
+    return NULL;
 }
 
+
 /* Convert Array to flat list (using getitem) */
 static PyObject *
 _getlist_pkl(PyArrayObject *self)
@@ -2776,9 +2803,7 @@ array_complex(PyArrayObject *self, PyObject *NPY_UNUSED(args))
     PyArray_Descr *dtype;
     PyObject *c;
 
-    if (PyArray_SIZE(self) != 1) {
-        PyErr_SetString(PyExc_TypeError,
-                "only length-1 arrays can be converted to Python scalars");
+    if (check_is_convertible_to_scalar(self) < 0) {
         return NULL;
     }
 
@@ -2823,25 +2848,15 @@ array_complex(PyArrayObject *self, PyObject *NPY_UNUSED(args))
 static PyObject *
 array_class_getitem(PyObject *cls, PyObject *args)
 {
-    PyObject *generic_alias;
-
-#ifdef Py_GENERICALIASOBJECT_H
-    Py_ssize_t args_len;
+    const Py_ssize_t args_len = PyTuple_Check(args) ? PyTuple_Size(args) : 1;
 
-    args_len = PyTuple_Check(args) ? PyTuple_Size(args) : 1;
     if ((args_len > 2) || (args_len == 0)) {
         return PyErr_Format(PyExc_TypeError,
                             "Too %s arguments for %s",
                             args_len > 2 ? "many" : "few",
                             ((PyTypeObject *)cls)->tp_name);
     }
-    generic_alias = Py_GenericAlias(cls, args);
-#else
-    PyErr_SetString(PyExc_TypeError,
-                    "Type subscription requires python >= 3.9");
-    generic_alias = NULL;
-#endif
-    return generic_alias;
+    return Py_GenericAlias(cls, args);
 }
 
 NPY_NO_EXPORT PyMethodDef array_methods[] = {
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index dda8831c5..d7d19493b 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -66,6 +66,7 @@ NPY_NO_EXPORT int NPY_NUMUSERTYPES = 0;
 #include "compiled_base.h"
 #include "mem_overlap.h"
 #include "typeinfo.h"
+#include "convert.h" /* for PyArray_AssignZero */
 
 #include "get_attr_string.h"
 #include "experimental_public_dtype_api.h"  /* _get_experimental_dtype_api */
@@ -98,6 +99,12 @@ _umath_strings_richcompare(
  * future.
  */
 int npy_legacy_print_mode = INT_MAX;
+/*
+ * Global variable to check whether NumPy 2.0 behavior is opted in.
+ * This flag is considered a runtime constant.
+ */
+int npy_numpy2_behavior = NPY_FALSE;
+
 
 static PyObject *
 set_legacy_print_mode(PyObject *NPY_UNUSED(self), PyObject *args)
@@ -467,7 +474,7 @@ PyArray_ConcatenateArrays(int narrays, PyArrayObject **arrays, int axis,
         /* Get the priority subtype for the array */
         PyTypeObject *subtype = PyArray_GetSubType(narrays, arrays);
         PyArray_Descr *descr = PyArray_FindConcatenationDescriptor(
-                narrays, arrays,  (PyObject *)dtype);
+                narrays, arrays, dtype);
         if (descr == NULL) {
             return NULL;
         }
@@ -488,7 +495,7 @@ PyArray_ConcatenateArrays(int narrays, PyArrayObject **arrays, int axis,
         /* Allocate the array for the result. This steals the 'dtype' reference. */
         ret = (PyArrayObject *)PyArray_NewFromDescr_int(
                 subtype, descr, ndim, shape, strides, NULL, 0, NULL,
-                NULL, 0, 1);
+                NULL, _NPY_ARRAY_ALLOW_EMPTY_STRING);
         if (ret == NULL) {
             return NULL;
         }
@@ -583,7 +590,7 @@ PyArray_ConcatenateFlattenedArrays(int narrays, PyArrayObject **arrays,
         PyTypeObject *subtype = PyArray_GetSubType(narrays, arrays);
 
         PyArray_Descr *descr = PyArray_FindConcatenationDescriptor(
-                narrays, arrays, (PyObject *)dtype);
+                narrays, arrays, dtype);
         if (descr == NULL) {
             return NULL;
         }
@@ -593,7 +600,7 @@ PyArray_ConcatenateFlattenedArrays(int narrays, PyArrayObject **arrays,
         /* Allocate the array for the result. This steals the 'dtype' reference. */
         ret = (PyArrayObject *)PyArray_NewFromDescr_int(
                 subtype, descr,  1, &shape, &stride, NULL, 0, NULL,
-                NULL, 0, 1);
+                NULL, _NPY_ARRAY_ALLOW_EMPTY_STRING);
         if (ret == NULL) {
             return NULL;
         }
@@ -899,11 +906,15 @@ PyArray_InnerProduct(PyObject *op1, PyObject *op2)
     int i;
     PyObject* ret = NULL;
 
-    typenum = PyArray_ObjectType(op1, 0);
-    if (typenum == NPY_NOTYPE && PyErr_Occurred()) {
+    typenum = PyArray_ObjectType(op1, NPY_NOTYPE);
+    if (typenum == NPY_NOTYPE) {
         return NULL;
     }
     typenum = PyArray_ObjectType(op2, typenum);
+    if (typenum == NPY_NOTYPE) {
+        return NULL;
+    }
+
     typec = PyArray_DescrFromType(typenum);
     if (typec == NULL) {
         if (!PyErr_Occurred()) {
@@ -991,11 +1002,15 @@ PyArray_MatrixProduct2(PyObject *op1, PyObject *op2, PyArrayObject* out)
     PyArray_Descr *typec = NULL;
     NPY_BEGIN_THREADS_DEF;
 
-    typenum = PyArray_ObjectType(op1, 0);
-    if (typenum == NPY_NOTYPE && PyErr_Occurred()) {
+    typenum = PyArray_ObjectType(op1, NPY_NOTYPE);
+    if (typenum == NPY_NOTYPE) {
         return NULL;
     }
     typenum = PyArray_ObjectType(op2, typenum);
+    if (typenum == NPY_NOTYPE) {
+        return NULL;
+    }
+
     typec = PyArray_DescrFromType(typenum);
     if (typec == NULL) {
         if (!PyErr_Occurred()) {
@@ -1028,12 +1043,11 @@ PyArray_MatrixProduct2(PyObject *op1, PyObject *op2, PyArrayObject* out)
 #endif
 
     if (PyArray_NDIM(ap1) == 0 || PyArray_NDIM(ap2) == 0) {
-        result = (PyArray_NDIM(ap1) == 0 ? ap1 : ap2);
-        result = (PyArrayObject *)Py_TYPE(result)->tp_as_number->nb_multiply(
-                                        (PyObject *)ap1, (PyObject *)ap2);
+        PyObject *mul_res = PyObject_CallFunctionObjArgs(
+                n_ops.multiply, ap1, ap2, out, NULL);
         Py_DECREF(ap1);
         Py_DECREF(ap2);
-        return (PyObject *)result;
+        return mul_res;
     }
     l = PyArray_DIMS(ap1)[PyArray_NDIM(ap1) - 1];
     if (PyArray_NDIM(ap2) > 1) {
@@ -1071,7 +1085,9 @@ PyArray_MatrixProduct2(PyObject *op1, PyObject *op2, PyArrayObject* out)
     }
     /* Ensure that multiarray.dot(<Nx0>,<0xM>) -> zeros((N,M)) */
     if (PyArray_SIZE(ap1) == 0 && PyArray_SIZE(ap2) == 0) {
-        memset(PyArray_DATA(out_buf), 0, PyArray_NBYTES(out_buf));
+        if (PyArray_AssignZero(out_buf, NULL) < 0) {
+            goto fail;
+        }
     }
 
     dot = PyArray_DESCR(out_buf)->f->dotfunc;
@@ -1373,8 +1389,14 @@ PyArray_Correlate2(PyObject *op1, PyObject *op2, int mode)
     int inverted;
     int st;
 
-    typenum = PyArray_ObjectType(op1, 0);
+    typenum = PyArray_ObjectType(op1, NPY_NOTYPE);
+    if (typenum == NPY_NOTYPE) {
+        return NULL;
+    }
     typenum = PyArray_ObjectType(op2, typenum);
+    if (typenum == NPY_NOTYPE) {
+        return NULL;
+    }
 
     typec = PyArray_DescrFromType(typenum);
     Py_INCREF(typec);
@@ -1440,8 +1462,14 @@ PyArray_Correlate(PyObject *op1, PyObject *op2, int mode)
     int unused;
     PyArray_Descr *typec;
 
-    typenum = PyArray_ObjectType(op1, 0);
+    typenum = PyArray_ObjectType(op1, NPY_NOTYPE);
+    if (typenum == NPY_NOTYPE) {
+        return NULL;
+    }
     typenum = PyArray_ObjectType(op2, typenum);
+    if (typenum == NPY_NOTYPE) {
+        return NULL;
+    }
 
     typec = PyArray_DescrFromType(typenum);
     Py_INCREF(typec);
@@ -1472,19 +1500,26 @@ fail:
     return NULL;
 }
 
-
 static PyObject *
-array_putmask(PyObject *NPY_UNUSED(module), PyObject *args, PyObject *kwds)
+array_putmask(PyObject *NPY_UNUSED(module), PyObject *const *args,
+                Py_ssize_t len_args, PyObject *kwnames )
 {
     PyObject *mask, *values;
     PyObject *array;
 
-    static char *kwlist[] = {"arr", "mask", "values", NULL};
-
-    if (!PyArg_ParseTupleAndKeywords(args, kwds, "O!OO:putmask", kwlist,
-                &PyArray_Type, &array, &mask, &values)) {
+    NPY_PREPARE_ARGPARSER;
+    if (npy_parse_arguments("putmask", args, len_args, kwnames,
+            "", NULL, &array,
+            "mask", NULL, &mask,
+            "values", NULL, &values,
+            NULL, NULL, NULL) < 0) {
         return NULL;
     }
+    if (!PyArray_Check(array)) {
+        PyErr_SetString(PyExc_TypeError,
+                "argument a of putmask must be a numpy array");
+    }
+
     return PyArray_PutMask((PyArrayObject *)array, values, mask);
 }
 
@@ -1602,25 +1637,35 @@ _prepend_ones(PyArrayObject *arr, int nd, int ndmin, NPY_ORDER order)
                  ((order) == NPY_CORDER && PyArray_IS_C_CONTIGUOUS(op)) || \
                  ((order) == NPY_FORTRANORDER && PyArray_IS_F_CONTIGUOUS(op)))
 
-static NPY_INLINE PyObject *
+
+static inline PyObject *
 _array_fromobject_generic(
-        PyObject *op, PyArray_Descr *type, _PyArray_CopyMode copy, NPY_ORDER order,
-        npy_bool subok, int ndmin)
+        PyObject *op, PyArray_Descr *in_descr, PyArray_DTypeMeta *in_DType,
+        _PyArray_CopyMode copy, NPY_ORDER order, npy_bool subok, int ndmin)
 {
     PyArrayObject *oparr = NULL, *ret = NULL;
     PyArray_Descr *oldtype = NULL;
     int nd, flags = 0;
 
+    /* Hold on to `in_descr` as `dtype`, since we may also set it below. */
+    Py_XINCREF(in_descr);
+    PyArray_Descr *dtype = in_descr;
+
     if (ndmin > NPY_MAXDIMS) {
         PyErr_Format(PyExc_ValueError,
                 "ndmin bigger than allowable number of dimensions "
                 "NPY_MAXDIMS (=%d)", NPY_MAXDIMS);
-        return NULL;
+        goto finish;
     }
     /* fast exit if simple call */
     if (PyArray_CheckExact(op) || (subok && PyArray_Check(op))) {
         oparr = (PyArrayObject *)op;
-        if (type == NULL) {
+
+        if (dtype == NULL && in_DType == NULL) {
+            /*
+             * User did not ask for a specific dtype instance or class. So
+             * we can return either self or a copy.
+             */
             if (copy != NPY_COPY_ALWAYS && STRIDING_OK(oparr, order)) {
                 ret = oparr;
                 Py_INCREF(ret);
@@ -1630,17 +1675,30 @@ _array_fromobject_generic(
                 if (copy == NPY_COPY_NEVER) {
                     PyErr_SetString(PyExc_ValueError,
                             "Unable to avoid copy while creating a new array.");
-                    return NULL;
+                    goto finish;
                 }
                 ret = (PyArrayObject *)PyArray_NewCopy(oparr, order);
                 goto finish;
             }
         }
+        else if (dtype == NULL) {
+            /*
+             * If the user passed a DType class but not a dtype instance,
+             * we must use `PyArray_AdaptDescriptorToArray` to find the
+             * correct dtype instance.
+             * Even if the fast-path doesn't work we will use this.
+             */
+            dtype = PyArray_AdaptDescriptorToArray(oparr, in_DType, NULL);
+            if (dtype == NULL) {
+                goto finish;
+            }
+        }
+
         /* One more chance for faster exit if user specified the dtype. */
         oldtype = PyArray_DESCR(oparr);
-        if (PyArray_EquivTypes(oldtype, type)) {
+        if (PyArray_EquivTypes(oldtype, dtype)) {
             if (copy != NPY_COPY_ALWAYS && STRIDING_OK(oparr, order)) {
-                if (oldtype == type) {
+                if (oldtype == dtype) {
                     Py_INCREF(op);
                     ret = oparr;
                 }
@@ -1648,10 +1706,10 @@ _array_fromobject_generic(
                     /* Create a new PyArrayObject from the caller's
                      * PyArray_Descr. Use the reference `op` as the base
                      * object. */
-                    Py_INCREF(type);
+                    Py_INCREF(dtype);
                     ret = (PyArrayObject *)PyArray_NewFromDescrAndBase(
                             Py_TYPE(op),
-                            type,
+                            dtype,
                             PyArray_NDIM(oparr),
                             PyArray_DIMS(oparr),
                             PyArray_STRIDES(oparr),
@@ -1667,10 +1725,10 @@ _array_fromobject_generic(
                 if (copy == NPY_COPY_NEVER) {
                     PyErr_SetString(PyExc_ValueError,
                             "Unable to avoid copy while creating a new array.");
-                    return NULL;
+                    goto finish;
                 }
                 ret = (PyArrayObject *)PyArray_NewCopy(oparr, order);
-                if (oldtype == type || ret == NULL) {
+                if (oldtype == dtype || ret == NULL) {
                     goto finish;
                 }
                 Py_INCREF(oldtype);
@@ -1701,11 +1759,13 @@ _array_fromobject_generic(
     }
 
     flags |= NPY_ARRAY_FORCECAST;
-    Py_XINCREF(type);
-    ret = (PyArrayObject *)PyArray_CheckFromAny(op, type,
-                                                0, 0, flags, NULL);
+
+    ret = (PyArrayObject *)PyArray_CheckFromAny_int(
+            op, dtype, in_DType, 0, 0, flags, NULL);
 
 finish:
+    Py_XDECREF(dtype);
+
     if (ret == NULL) {
         return NULL;
     }
@@ -1732,7 +1792,7 @@ array_array(PyObject *NPY_UNUSED(ignored),
     npy_bool subok = NPY_FALSE;
     _PyArray_CopyMode copy = NPY_COPY_ALWAYS;
     int ndmin = 0;
-    PyArray_Descr *type = NULL;
+    npy_dtype_info dt_info = {NULL, NULL};
     NPY_ORDER order = NPY_KEEPORDER;
     PyObject *like = Py_None;
     NPY_PREPARE_ARGPARSER;
@@ -1740,21 +1800,23 @@ array_array(PyObject *NPY_UNUSED(ignored),
     if (len_args != 1 || (kwnames != NULL)) {
         if (npy_parse_arguments("array", args, len_args, kwnames,
                 "object", NULL, &op,
-                "|dtype", &PyArray_DescrConverter2, &type,
+                "|dtype", &PyArray_DTypeOrDescrConverterOptional, &dt_info,
                 "$copy", &PyArray_CopyConverter, &copy,
                 "$order", &PyArray_OrderConverter, &order,
                 "$subok", &PyArray_BoolConverter, &subok,
                 "$ndmin", &PyArray_PythonPyIntFromInt, &ndmin,
                 "$like", NULL, &like,
                 NULL, NULL, NULL) < 0) {
-            Py_XDECREF(type);
+            Py_XDECREF(dt_info.descr);
+            Py_XDECREF(dt_info.dtype);
             return NULL;
         }
         if (like != Py_None) {
             PyObject *deferred = array_implement_c_array_function_creation(
                     "array", like, NULL, NULL, args, len_args, kwnames);
             if (deferred != Py_NotImplemented) {
-                Py_XDECREF(type);
+                Py_XDECREF(dt_info.descr);
+                Py_XDECREF(dt_info.dtype);
                 return deferred;
             }
         }
@@ -1765,8 +1827,9 @@ array_array(PyObject *NPY_UNUSED(ignored),
     }
 
     PyObject *res = _array_fromobject_generic(
-            op, type, copy, order, subok, ndmin);
-    Py_XDECREF(type);
+            op, dt_info.descr, dt_info.dtype, copy, order, subok, ndmin);
+    Py_XDECREF(dt_info.descr);
+    Py_XDECREF(dt_info.dtype);
     return res;
 }
 
@@ -1775,7 +1838,7 @@ array_asarray(PyObject *NPY_UNUSED(ignored),
         PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames)
 {
     PyObject *op;
-    PyArray_Descr *type = NULL;
+    npy_dtype_info dt_info = {NULL, NULL};
     NPY_ORDER order = NPY_KEEPORDER;
     PyObject *like = Py_None;
     NPY_PREPARE_ARGPARSER;
@@ -1783,18 +1846,20 @@ array_asarray(PyObject *NPY_UNUSED(ignored),
     if (len_args != 1 || (kwnames != NULL)) {
         if (npy_parse_arguments("asarray", args, len_args, kwnames,
                 "a", NULL, &op,
-                "|dtype", &PyArray_DescrConverter2, &type,
+                "|dtype", &PyArray_DTypeOrDescrConverterOptional, &dt_info,
                 "|order", &PyArray_OrderConverter, &order,
                 "$like", NULL, &like,
                 NULL, NULL, NULL) < 0) {
-            Py_XDECREF(type);
+            Py_XDECREF(dt_info.descr);
+            Py_XDECREF(dt_info.dtype);
             return NULL;
         }
         if (like != Py_None) {
             PyObject *deferred = array_implement_c_array_function_creation(
                     "asarray", like, NULL, NULL, args, len_args, kwnames);
             if (deferred != Py_NotImplemented) {
-                Py_XDECREF(type);
+                Py_XDECREF(dt_info.descr);
+                Py_XDECREF(dt_info.dtype);
                 return deferred;
             }
         }
@@ -1804,8 +1869,9 @@ array_asarray(PyObject *NPY_UNUSED(ignored),
     }
 
     PyObject *res = _array_fromobject_generic(
-            op, type, NPY_FALSE, order, NPY_FALSE, 0);
-    Py_XDECREF(type);
+            op, dt_info.descr, dt_info.dtype, NPY_FALSE, order, NPY_FALSE, 0);
+    Py_XDECREF(dt_info.descr);
+    Py_XDECREF(dt_info.dtype);
     return res;
 }
 
@@ -1814,7 +1880,7 @@ array_asanyarray(PyObject *NPY_UNUSED(ignored),
         PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames)
 {
     PyObject *op;
-    PyArray_Descr *type = NULL;
+    npy_dtype_info dt_info = {NULL, NULL};
     NPY_ORDER order = NPY_KEEPORDER;
     PyObject *like = Py_None;
     NPY_PREPARE_ARGPARSER;
@@ -1822,18 +1888,20 @@ array_asanyarray(PyObject *NPY_UNUSED(ignored),
     if (len_args != 1 || (kwnames != NULL)) {
         if (npy_parse_arguments("asanyarray", args, len_args, kwnames,
                 "a", NULL, &op,
-                "|dtype", &PyArray_DescrConverter2, &type,
+                "|dtype", &PyArray_DTypeOrDescrConverterOptional, &dt_info,
                 "|order", &PyArray_OrderConverter, &order,
                 "$like", NULL, &like,
                 NULL, NULL, NULL) < 0) {
-            Py_XDECREF(type);
+            Py_XDECREF(dt_info.descr);
+            Py_XDECREF(dt_info.dtype);
             return NULL;
         }
         if (like != Py_None) {
             PyObject *deferred = array_implement_c_array_function_creation(
                     "asanyarray", like, NULL, NULL, args, len_args, kwnames);
             if (deferred != Py_NotImplemented) {
-                Py_XDECREF(type);
+                Py_XDECREF(dt_info.descr);
+                Py_XDECREF(dt_info.dtype);
                 return deferred;
             }
         }
@@ -1843,8 +1911,9 @@ array_asanyarray(PyObject *NPY_UNUSED(ignored),
     }
 
     PyObject *res = _array_fromobject_generic(
-            op, type, NPY_FALSE, order, NPY_TRUE, 0);
-    Py_XDECREF(type);
+            op, dt_info.descr, dt_info.dtype, NPY_FALSE, order, NPY_TRUE, 0);
+    Py_XDECREF(dt_info.descr);
+    Py_XDECREF(dt_info.dtype);
     return res;
 }
 
@@ -1854,24 +1923,26 @@ array_ascontiguousarray(PyObject *NPY_UNUSED(ignored),
         PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames)
 {
     PyObject *op;
-    PyArray_Descr *type = NULL;
+    npy_dtype_info dt_info = {NULL, NULL};
     PyObject *like = Py_None;
     NPY_PREPARE_ARGPARSER;
 
     if (len_args != 1 || (kwnames != NULL)) {
         if (npy_parse_arguments("ascontiguousarray", args, len_args, kwnames,
                 "a", NULL, &op,
-                "|dtype", &PyArray_DescrConverter2, &type,
+                "|dtype", &PyArray_DTypeOrDescrConverterOptional, &dt_info,
                 "$like", NULL, &like,
                 NULL, NULL, NULL) < 0) {
-            Py_XDECREF(type);
+            Py_XDECREF(dt_info.descr);
+            Py_XDECREF(dt_info.dtype);
             return NULL;
         }
         if (like != Py_None) {
             PyObject *deferred = array_implement_c_array_function_creation(
                     "ascontiguousarray", like, NULL, NULL, args, len_args, kwnames);
             if (deferred != Py_NotImplemented) {
-                Py_XDECREF(type);
+                Py_XDECREF(dt_info.descr);
+                Py_XDECREF(dt_info.dtype);
                 return deferred;
             }
         }
@@ -1881,8 +1952,10 @@ array_ascontiguousarray(PyObject *NPY_UNUSED(ignored),
     }
 
     PyObject *res = _array_fromobject_generic(
-            op, type, NPY_FALSE, NPY_CORDER, NPY_FALSE, 1);
-    Py_XDECREF(type);
+            op, dt_info.descr, dt_info.dtype, NPY_FALSE, NPY_CORDER, NPY_FALSE,
+            1);
+    Py_XDECREF(dt_info.descr);
+    Py_XDECREF(dt_info.dtype);
     return res;
 }
 
@@ -1892,24 +1965,26 @@ array_asfortranarray(PyObject *NPY_UNUSED(ignored),
         PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames)
 {
     PyObject *op;
-    PyArray_Descr *type = NULL;
+    npy_dtype_info dt_info = {NULL, NULL};
     PyObject *like = Py_None;
     NPY_PREPARE_ARGPARSER;
 
     if (len_args != 1 || (kwnames != NULL)) {
         if (npy_parse_arguments("asfortranarray", args, len_args, kwnames,
                 "a", NULL, &op,
-                "|dtype", &PyArray_DescrConverter2, &type,
+                "|dtype", &PyArray_DTypeOrDescrConverterOptional, &dt_info,
                 "$like", NULL, &like,
                 NULL, NULL, NULL) < 0) {
-            Py_XDECREF(type);
+            Py_XDECREF(dt_info.descr);
+            Py_XDECREF(dt_info.dtype);
             return NULL;
         }
         if (like != Py_None) {
             PyObject *deferred = array_implement_c_array_function_creation(
                     "asfortranarray", like, NULL, NULL, args, len_args, kwnames);
             if (deferred != Py_NotImplemented) {
-                Py_XDECREF(type);
+                Py_XDECREF(dt_info.descr);
+                Py_XDECREF(dt_info.dtype);
                 return deferred;
             }
         }
@@ -1919,8 +1994,10 @@ array_asfortranarray(PyObject *NPY_UNUSED(ignored),
     }
 
     PyObject *res = _array_fromobject_generic(
-            op, type, NPY_FALSE, NPY_FORTRANORDER, NPY_FALSE, 1);
-    Py_XDECREF(type);
+            op, dt_info.descr, dt_info.dtype, NPY_FALSE, NPY_FORTRANORDER,
+            NPY_FALSE, 1);
+    Py_XDECREF(dt_info.descr);
+    Py_XDECREF(dt_info.dtype);
     return res;
 }
 
@@ -2026,10 +2103,9 @@ fail:
 }
 
 static PyObject *
-array_empty_like(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
+array_empty_like(PyObject *NPY_UNUSED(ignored),
+        PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames)
 {
-
-    static char *kwlist[] = {"prototype", "dtype", "order", "subok", "shape", NULL};
     PyArrayObject *prototype = NULL;
     PyArray_Descr *dtype = NULL;
     NPY_ORDER order = NPY_KEEPORDER;
@@ -2038,12 +2114,15 @@ array_empty_like(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
     /* -1 is a special value meaning "not specified" */
     PyArray_Dims shape = {NULL, -1};
 
-    if (!PyArg_ParseTupleAndKeywords(args, kwds, "O&|O&O&iO&:empty_like", kwlist,
-                &PyArray_Converter, &prototype,
-                &PyArray_DescrConverter2, &dtype,
-                &PyArray_OrderConverter, &order,
-                &subok,
-                &PyArray_OptionalIntpConverter, &shape)) {
+    NPY_PREPARE_ARGPARSER;
+
+    if (npy_parse_arguments("empty_like", args, len_args, kwnames,
+            "prototype", &PyArray_Converter, &prototype,
+            "|dtype", &PyArray_DescrConverter2, &dtype,
+            "|order", &PyArray_OrderConverter, &order,
+            "|subok", &PyArray_PythonPyIntFromInt, &subok,
+            "|shape", &PyArray_OptionalIntpConverter, &shape,
+            NULL, NULL, NULL) < 0) {
         goto fail;
     }
     /* steals the reference to dtype if it's not NULL */
@@ -2229,12 +2308,15 @@ fail:
 }
 
 static PyObject *
-array_count_nonzero(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwds)
+array_count_nonzero(PyObject *NPY_UNUSED(self), PyObject *const *args, Py_ssize_t len_args)
 {
     PyArrayObject *array;
     npy_intp count;
 
-    if (!PyArg_ParseTuple(args, "O&:count_nonzero", PyArray_Converter, &array)) {
+    NPY_PREPARE_ARGPARSER;
+    if (npy_parse_arguments("count_nonzero", args, len_args, NULL,
+            "", PyArray_Converter, &array,
+            NULL, NULL, NULL) < 0) {
         return NULL;
     }
 
@@ -2440,7 +2522,8 @@ array_frombuffer(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *keywds
 }
 
 static PyObject *
-array_concatenate(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds)
+array_concatenate(PyObject *NPY_UNUSED(dummy),
+        PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames)
 {
     PyObject *a0;
     PyObject *out = NULL;
@@ -2449,10 +2532,15 @@ array_concatenate(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds)
     PyObject *casting_obj = NULL;
     PyObject *res;
     int axis = 0;
-    static char *kwlist[] = {"seq", "axis", "out", "dtype", "casting", NULL};
-    if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|O&O$O&O:concatenate", kwlist,
-                &a0, PyArray_AxisConverter, &axis, &out,
-                PyArray_DescrConverter2, &dtype, &casting_obj)) {
+
+    NPY_PREPARE_ARGPARSER;
+    if (npy_parse_arguments("concatenate", args, len_args, kwnames,
+            "seq", NULL, &a0,
+            "|axis", &PyArray_AxisConverter, &axis,
+            "|out", NULL, &out,
+            "$dtype", &PyArray_DescrConverter2, &dtype,
+            "$casting", NULL, &casting_obj,
+            NULL, NULL, NULL) < 0) {
         return NULL;
     }
     int casting_not_passed = 0;
@@ -2484,25 +2572,34 @@ array_concatenate(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds)
 }
 
 static PyObject *
-array_innerproduct(PyObject *NPY_UNUSED(dummy), PyObject *args)
+array_innerproduct(PyObject *NPY_UNUSED(dummy), PyObject *const *args, Py_ssize_t len_args)
 {
     PyObject *b0, *a0;
 
-    if (!PyArg_ParseTuple(args, "OO:innerproduct", &a0, &b0)) {
-        return NULL;
+    NPY_PREPARE_ARGPARSER;
+    if (npy_parse_arguments("innerproduct", args, len_args, NULL,
+            "", NULL, &a0,
+            "", NULL, &b0,
+            NULL, NULL, NULL) < 0) {
+    return NULL;
     }
+
     return PyArray_Return((PyArrayObject *)PyArray_InnerProduct(a0, b0));
 }
 
 static PyObject *
-array_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwds)
+array_matrixproduct(PyObject *NPY_UNUSED(dummy),
+        PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames)
 {
     PyObject *v, *a, *o = NULL;
     PyArrayObject *ret;
-    static char* kwlist[] = {"a", "b", "out", NULL};
 
-    if (!PyArg_ParseTupleAndKeywords(args, kwds, "OO|O:matrixproduct",
-                                     kwlist, &a, &v, &o)) {
+    NPY_PREPARE_ARGPARSER;
+    if (npy_parse_arguments("dot", args, len_args, kwnames,
+            "a", NULL, &a,
+            "b", NULL, &v,
+            "|out", NULL, &o,
+            NULL, NULL, NULL) < 0) {
         return NULL;
     }
     if (o != NULL) {
@@ -2520,7 +2617,7 @@ array_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwds)
 
 
 static PyObject *
-array_vdot(PyObject *NPY_UNUSED(dummy), PyObject *args)
+array_vdot(PyObject *NPY_UNUSED(dummy), PyObject *const *args, Py_ssize_t len_args)
 {
     int typenum;
     char *ip1, *ip2, *op;
@@ -2533,7 +2630,11 @@ array_vdot(PyObject *NPY_UNUSED(dummy), PyObject *args)
     PyArray_DotFunc *vdot;
     NPY_BEGIN_THREADS_DEF;
 
-    if (!PyArg_ParseTuple(args, "OO:vdot", &op1, &op2)) {
+    NPY_PREPARE_ARGPARSER;
+    if (npy_parse_arguments("vdot", args, len_args, NULL,
+            "", NULL, &op1,
+            "", NULL, &op2,
+            NULL, NULL, NULL) < 0) {
         return NULL;
     }
 
@@ -2541,8 +2642,14 @@ array_vdot(PyObject *NPY_UNUSED(dummy), PyObject *args)
      * Conjugating dot product using the BLAS for vectors.
      * Flattens both op1 and op2 before dotting.
      */
-    typenum = PyArray_ObjectType(op1, 0);
+    typenum = PyArray_ObjectType(op1, NPY_NOTYPE);
+    if (typenum == NPY_NOTYPE) {
+        return NULL;
+    }
     typenum = PyArray_ObjectType(op2, typenum);
+    if (typenum == NPY_NOTYPE) {
+        return NULL;
+    }
 
     type = PyArray_DescrFromType(typenum);
     Py_INCREF(type);
@@ -3104,13 +3211,9 @@ PyArray_GetNDArrayCFeatureVersion(void)
 }
 
 static PyObject *
-array__get_ndarray_c_version(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds)
+array__get_ndarray_c_version(
+        PyObject *NPY_UNUSED(dummy), PyObject *NPY_UNUSED(arg))
 {
-    static char *kwlist[] = {NULL};
-
-    if (!PyArg_ParseTupleAndKeywords(args, kwds, "", kwlist )) {
-        return NULL;
-    }
     return PyLong_FromLong( (long) PyArray_GetNDArrayCVersion() );
 }
 
@@ -3405,32 +3508,42 @@ fail:
 #undef INNER_WHERE_LOOP
 
 static PyObject *
-array_where(PyObject *NPY_UNUSED(ignored), PyObject *args)
+array_where(PyObject *NPY_UNUSED(ignored), PyObject *const *args, Py_ssize_t len_args)
 {
     PyObject *obj = NULL, *x = NULL, *y = NULL;
 
-    if (!PyArg_ParseTuple(args, "O|OO:where", &obj, &x, &y)) {
+    NPY_PREPARE_ARGPARSER;
+    if (npy_parse_arguments("where", args, len_args, NULL,
+            "", NULL, &obj,
+            "|x", NULL, &x,
+            "|y", NULL, &y,
+            NULL, NULL, NULL) < 0) {
         return NULL;
     }
+
     return PyArray_Where(obj, x, y);
 }
 
 static PyObject *
-array_lexsort(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
+array_lexsort(PyObject *NPY_UNUSED(ignored), PyObject *const *args, Py_ssize_t len_args,
+                             PyObject *kwnames)
 {
     int axis = -1;
     PyObject *obj;
-    static char *kwlist[] = {"keys", "axis", NULL};
 
-    if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|i:lexsort", kwlist, &obj, &axis)) {
+    NPY_PREPARE_ARGPARSER;
+    if (npy_parse_arguments("lexsort", args, len_args, kwnames,
+            "keys", NULL, &obj,
+            "|axis", PyArray_PythonPyIntFromInt, &axis,
+            NULL, NULL, NULL) < 0) {
         return NULL;
     }
     return PyArray_Return((PyArrayObject *)PyArray_LexSort(obj, axis));
 }
 
 static PyObject *
-array_can_cast_safely(PyObject *NPY_UNUSED(self), PyObject *args,
-        PyObject *kwds)
+array_can_cast_safely(PyObject *NPY_UNUSED(self),
+        PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames)
 {
     PyObject *from_obj = NULL;
     PyArray_Descr *d1 = NULL;
@@ -3438,12 +3551,13 @@ array_can_cast_safely(PyObject *NPY_UNUSED(self), PyObject *args,
     int ret;
     PyObject *retobj = NULL;
     NPY_CASTING casting = NPY_SAFE_CASTING;
-    static char *kwlist[] = {"from_", "to", "casting", NULL};
 
-    if(!PyArg_ParseTupleAndKeywords(args, kwds, "OO&|O&:can_cast", kwlist,
-                &from_obj,
-                PyArray_DescrConverter2, &d2,
-                PyArray_CastingConverter, &casting)) {
+    NPY_PREPARE_ARGPARSER;
+    if (npy_parse_arguments("can_cast", args, len_args, kwnames,
+            "from_", NULL, &from_obj,
+            "to", &PyArray_DescrConverter2, &d2,
+            "|casting", &PyArray_CastingConverter, &casting,
+            NULL, NULL, NULL) < 0) {
         goto finish;
     }
     if (d2 == NULL) {
@@ -3486,13 +3600,17 @@ array_can_cast_safely(PyObject *NPY_UNUSED(self), PyObject *args,
 }
 
 static PyObject *
-array_promote_types(PyObject *NPY_UNUSED(dummy), PyObject *args)
+array_promote_types(PyObject *NPY_UNUSED(dummy), PyObject *const *args, Py_ssize_t len_args)
 {
     PyArray_Descr *d1 = NULL;
     PyArray_Descr *d2 = NULL;
     PyObject *ret = NULL;
-    if (!PyArg_ParseTuple(args, "O&O&:promote_types",
-                PyArray_DescrConverter2, &d1, PyArray_DescrConverter2, &d2)) {
+
+    NPY_PREPARE_ARGPARSER;
+    if (npy_parse_arguments("promote_types", args, len_args, NULL,
+            "", PyArray_DescrConverter2, &d1,
+            "", PyArray_DescrConverter2, &d2,
+            NULL, NULL, NULL) < 0) {
         goto finish;
     }
 
@@ -3532,14 +3650,13 @@ array_min_scalar_type(PyObject *NPY_UNUSED(dummy), PyObject *args)
 }
 
 static PyObject *
-array_result_type(PyObject *NPY_UNUSED(dummy), PyObject *args)
+array_result_type(PyObject *NPY_UNUSED(dummy), PyObject *const *args, Py_ssize_t len)
 {
-    npy_intp i, len, narr = 0, ndtypes = 0;
+    npy_intp i, narr = 0, ndtypes = 0;
     PyArrayObject **arr = NULL;
     PyArray_Descr **dtypes = NULL;
     PyObject *ret = NULL;
 
-    len = PyTuple_GET_SIZE(args);
     if (len == 0) {
         PyErr_SetString(PyExc_ValueError,
                         "at least one array or dtype is required");
@@ -3553,7 +3670,7 @@ array_result_type(PyObject *NPY_UNUSED(dummy), PyObject *args)
     dtypes = (PyArray_Descr**)&arr[len];
 
     for (i = 0; i < len; ++i) {
-        PyObject *obj = PyTuple_GET_ITEM(args, i);
+        PyObject *obj = args[i];
         if (PyArray_Check(obj)) {
             Py_INCREF(obj);
             arr[narr] = (PyArrayObject *)obj;
@@ -3759,6 +3876,34 @@ format_longfloat(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds)
                               TrimMode_LeaveOneZero, -1, -1);
 }
 
+/*
+ * returns 1 if array is a user-defined string dtype, sets an error and
+ * returns 0 otherwise
+ */
+static int _is_user_defined_string_array(PyArrayObject* array)
+{
+    if (NPY_DT_is_user_defined(PyArray_DESCR(array))) {
+        PyTypeObject* scalar_type = NPY_DTYPE(PyArray_DESCR(array))->scalar_type;
+        if (PyType_IsSubtype(scalar_type, &PyBytes_Type) ||
+            PyType_IsSubtype(scalar_type, &PyUnicode_Type)) {
+            return 1;
+        }
+        else {
+            PyErr_SetString(
+                PyExc_TypeError,
+                "string comparisons are only allowed for dtypes with a "
+                "scalar type that is a subtype of str or bytes.");
+            return 0;
+        }
+    }
+    else {
+        PyErr_SetString(
+            PyExc_TypeError,
+            "string operation on non-string array");
+        return 0;
+    }
+}
+
 
 /*
  * The only purpose of this function is that it allows the "rstrip".
@@ -3835,6 +3980,9 @@ compare_chararrays(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds)
     else {
         PyErr_SetString(PyExc_TypeError,
                 "comparison of non-string arrays");
+        Py_DECREF(newarr);
+        Py_DECREF(newoth);
+        return NULL;
     }
     Py_DECREF(newarr);
     Py_DECREF(newoth);
@@ -4035,10 +4183,15 @@ _vec_string(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *NPY_UNUSED(kw
         method = PyObject_GetAttr((PyObject *)&PyUnicode_Type, method_name);
     }
     else {
-        PyErr_SetString(PyExc_TypeError,
-                "string operation on non-string array");
-        Py_DECREF(type);
-        goto err;
+        if (_is_user_defined_string_array(char_array)) {
+            PyTypeObject* scalar_type =
+                NPY_DTYPE(PyArray_DESCR(char_array))->scalar_type;
+            method = PyObject_GetAttr((PyObject*)scalar_type, method_name);
+        }
+        else {
+            Py_DECREF(type);
+            goto err;
+        }
     }
     if (method == NULL) {
         Py_DECREF(type);
@@ -4074,7 +4227,7 @@ _vec_string(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *NPY_UNUSED(kw
     return 0;
 }
 
-#ifndef __NPY_PRIVATE_NO_SIGNAL
+#ifndef NPY_NO_SIGNAL
 
 static NPY_TLS int sigint_buf_init = 0;
 static NPY_TLS NPY_SIGJMP_BUF _NPY_SIGINT_BUF;
@@ -4136,7 +4289,6 @@ array_shares_memory_impl(PyObject *args, PyObject *kwds, Py_ssize_t default_max_
     static char *kwlist[] = {"self", "other", "max_work", NULL};
 
     mem_overlap_t result;
-    static PyObject *too_hard_cls = NULL;
     Py_ssize_t max_work;
     NPY_BEGIN_THREADS_DEF;
 
@@ -4216,8 +4368,8 @@ array_shares_memory_impl(PyObject *args, PyObject *kwds, Py_ssize_t default_max_
     }
     else if (result == MEM_OVERLAP_TOO_HARD) {
         if (raise_exceptions) {
-            npy_cache_import("numpy.core._exceptions", "TooHardError",
-                             &too_hard_cls);
+            static PyObject *too_hard_cls = NULL;
+            npy_cache_import("numpy.exceptions", "TooHardError", &too_hard_cls);
             if (too_hard_cls) {
                 PyErr_SetString(too_hard_cls, "Exceeded max_work");
             }
@@ -4314,13 +4466,22 @@ _reload_guard(PyObject *NPY_UNUSED(self), PyObject *NPY_UNUSED(args)) {
     Py_RETURN_NONE;
 }
 
+
+static PyObject *
+_using_numpy2_behavior(
+        PyObject *NPY_UNUSED(self), PyObject *NPY_UNUSED(args))
+{
+    return PyBool_FromLong(npy_numpy2_behavior);
+}
+
+
 static struct PyMethodDef array_module_methods[] = {
     {"_get_implementing_args",
         (PyCFunction)array__get_implementing_args,
         METH_VARARGS, NULL},
     {"_get_ndarray_c_version",
         (PyCFunction)array__get_ndarray_c_version,
-        METH_VARARGS|METH_KEYWORDS, NULL},
+        METH_NOARGS, NULL},
     {"_reconstruct",
         (PyCFunction)array__reconstruct,
         METH_VARARGS, NULL},
@@ -4365,25 +4526,25 @@ static struct PyMethodDef array_module_methods[] = {
         METH_FASTCALL | METH_KEYWORDS, NULL},
     {"count_nonzero",
         (PyCFunction)array_count_nonzero,
-        METH_VARARGS|METH_KEYWORDS, NULL},
+        METH_FASTCALL, NULL},
     {"empty",
         (PyCFunction)array_empty,
         METH_FASTCALL | METH_KEYWORDS, NULL},
     {"empty_like",
         (PyCFunction)array_empty_like,
-        METH_VARARGS|METH_KEYWORDS, NULL},
+        METH_FASTCALL|METH_KEYWORDS, NULL},
     {"scalar",
         (PyCFunction)array_scalar,
         METH_VARARGS|METH_KEYWORDS, NULL},
     {"where",
         (PyCFunction)array_where,
-        METH_VARARGS, NULL},
+        METH_FASTCALL, NULL},
     {"lexsort",
         (PyCFunction)array_lexsort,
-        METH_VARARGS | METH_KEYWORDS, NULL},
+        METH_FASTCALL | METH_KEYWORDS, NULL},
     {"putmask",
         (PyCFunction)array_putmask,
-        METH_VARARGS | METH_KEYWORDS, NULL},
+        METH_FASTCALL | METH_KEYWORDS, NULL},
     {"fromstring",
         (PyCFunction)array_fromstring,
         METH_VARARGS|METH_KEYWORDS, NULL},
@@ -4392,16 +4553,16 @@ static struct PyMethodDef array_module_methods[] = {
         METH_VARARGS|METH_KEYWORDS, NULL},
     {"concatenate",
         (PyCFunction)array_concatenate,
-        METH_VARARGS|METH_KEYWORDS, NULL},
+        METH_FASTCALL|METH_KEYWORDS, NULL},
     {"inner",
         (PyCFunction)array_innerproduct,
-        METH_VARARGS, NULL},
+        METH_FASTCALL, NULL},
     {"dot",
         (PyCFunction)array_matrixproduct,
-        METH_VARARGS | METH_KEYWORDS, NULL},
+        METH_FASTCALL | METH_KEYWORDS, NULL},
     {"vdot",
         (PyCFunction)array_vdot,
-        METH_VARARGS | METH_KEYWORDS, NULL},
+        METH_FASTCALL, NULL},
     {"c_einsum",
         (PyCFunction)array_einsum,
         METH_VARARGS|METH_KEYWORDS, NULL},
@@ -4422,16 +4583,16 @@ static struct PyMethodDef array_module_methods[] = {
         METH_VARARGS | METH_KEYWORDS, NULL},
     {"can_cast",
         (PyCFunction)array_can_cast_safely,
-        METH_VARARGS | METH_KEYWORDS, NULL},
+        METH_FASTCALL | METH_KEYWORDS, NULL},
     {"promote_types",
         (PyCFunction)array_promote_types,
-        METH_VARARGS, NULL},
+        METH_FASTCALL, NULL},
     {"min_scalar_type",
         (PyCFunction)array_min_scalar_type,
         METH_VARARGS, NULL},
     {"result_type",
         (PyCFunction)array_result_type,
-        METH_VARARGS, NULL},
+        METH_FASTCALL, NULL},
     {"shares_memory",
         (PyCFunction)array_shares_memory,
         METH_VARARGS | METH_KEYWORDS, NULL},
@@ -4470,27 +4631,24 @@ static struct PyMethodDef array_module_methods[] = {
     {"_vec_string",
         (PyCFunction)_vec_string,
         METH_VARARGS | METH_KEYWORDS, NULL},
-    {"_insert", (PyCFunction)arr_insert,
+    {"_place", (PyCFunction)arr_place,
         METH_VARARGS | METH_KEYWORDS,
         "Insert vals sequentially into equivalent 1-d positions "
         "indicated by mask."},
     {"bincount", (PyCFunction)arr_bincount,
-        METH_VARARGS | METH_KEYWORDS, NULL},
+        METH_FASTCALL | METH_KEYWORDS, NULL},
     {"_monotonicity", (PyCFunction)arr__monotonicity,
         METH_VARARGS | METH_KEYWORDS, NULL},
-    {"implement_array_function",
-        (PyCFunction)array_implement_array_function,
-        METH_VARARGS, NULL},
     {"interp", (PyCFunction)arr_interp,
-        METH_VARARGS | METH_KEYWORDS, NULL},
+        METH_FASTCALL | METH_KEYWORDS, NULL},
     {"interp_complex", (PyCFunction)arr_interp_complex,
-        METH_VARARGS | METH_KEYWORDS, NULL},
+        METH_FASTCALL | METH_KEYWORDS, NULL},
     {"ravel_multi_index", (PyCFunction)arr_ravel_multi_index,
         METH_VARARGS | METH_KEYWORDS, NULL},
     {"unravel_index", (PyCFunction)arr_unravel_index,
         METH_VARARGS | METH_KEYWORDS, NULL},
     {"add_docstring", (PyCFunction)arr_add_docstring,
-        METH_VARARGS, NULL},
+        METH_FASTCALL, NULL},
     {"packbits", (PyCFunction)io_pack,
         METH_VARARGS | METH_KEYWORDS, NULL},
     {"unpackbits", (PyCFunction)io_unpack,
@@ -4500,7 +4658,7 @@ static struct PyMethodDef array_module_methods[] = {
     {"set_legacy_print_mode", (PyCFunction)set_legacy_print_mode,
         METH_VARARGS, NULL},
     {"_discover_array_parameters", (PyCFunction)_discover_array_parameters,
-        METH_VARARGS | METH_KEYWORDS, NULL},
+        METH_FASTCALL | METH_KEYWORDS, NULL},
     {"_get_castingimpl",  (PyCFunction)_get_castingimpl,
         METH_VARARGS | METH_KEYWORDS, NULL},
     {"_get_experimental_dtype_api", (PyCFunction)_get_experimental_dtype_api,
@@ -4513,10 +4671,10 @@ static struct PyMethodDef array_module_methods[] = {
         METH_VARARGS | METH_KEYWORDS, NULL},
     {"seterrobj",
         (PyCFunction) ufunc_seterr,
-        METH_VARARGS, NULL},
+        METH_O, NULL},
     {"geterrobj",
         (PyCFunction) ufunc_geterr,
-        METH_VARARGS, NULL},
+        METH_NOARGS, NULL},
     {"get_handler_name",
         (PyCFunction) get_handler_name,
         METH_VARARGS, NULL},
@@ -4542,6 +4700,8 @@ static struct PyMethodDef array_module_methods[] = {
     {"_reload_guard", (PyCFunction)_reload_guard,
         METH_NOARGS,
         "Give a warning on reload and big warning in sub-interpreters."},
+    {"_using_numpy2_behavior", (PyCFunction)_using_numpy2_behavior,
+        METH_NOARGS, NULL},
     {"from_dlpack", (PyCFunction)from_dlpack,
         METH_O, NULL},
     {NULL, NULL, 0, NULL}                /* sentinel */
@@ -4727,6 +4887,7 @@ NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_axis1 = NULL;
 NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_axis2 = NULL;
 NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_like = NULL;
 NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_numpy = NULL;
+NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_where = NULL;
 
 static int
 intern_strings(void)
@@ -4783,9 +4944,51 @@ intern_strings(void)
     if (npy_ma_str_numpy == NULL) {
         return -1;
     }
+    npy_ma_str_where = PyUnicode_InternFromString("where");
+    if (npy_ma_str_where == NULL) {
+        return -1;
+    }
     return 0;
 }
 
+
+/*
+ * Initializes global constants.  At some points these need to be cleaned
+ * up, and sometimes we also import them where they are needed.  But for
+ * some things, adding an `npy_cache_import` everywhere seems inconvenient.
+ *
+ * These globals should not need the C-layer at all and will be imported
+ * before anything on the C-side is initialized.
+ */
+static int
+initialize_static_globals(void)
+{
+    assert(npy_DTypePromotionError == NULL);
+    npy_cache_import(
+            "numpy.exceptions", "DTypePromotionError",
+            &npy_DTypePromotionError);
+    if (npy_DTypePromotionError == NULL) {
+        return -1;
+    }
+
+    assert(npy_UFuncNoLoopError == NULL);
+    npy_cache_import(
+            "numpy.core._exceptions", "_UFuncNoLoopError",
+            &npy_UFuncNoLoopError);
+    if (npy_UFuncNoLoopError == NULL) {
+        return -1;
+    }
+
+    /* Initialize from certain environment variabels: */
+    char *env = getenv("NPY_NUMPY_2_BEHAVIOR");
+    if (env != NULL && strcmp(env, "0") != 0) {
+        npy_numpy2_behavior = NPY_TRUE;
+    }
+
+    return 0;
+}
+
+
 static struct PyModuleDef moduledef = {
         PyModuleDef_HEAD_INIT,
         "_multiarray_umath",
@@ -4836,6 +5039,14 @@ PyMODINIT_FUNC PyInit__multiarray_umath(void) {
         goto err;
     }
 
+    if (intern_strings() < 0) {
+        goto err;
+    }
+
+    if (initialize_static_globals() < 0) {
+        goto err;
+    }
+
     if (PyType_Ready(&PyUFunc_Type) < 0) {
         goto err;
     }
@@ -5008,13 +5219,15 @@ PyMODINIT_FUNC PyInit__multiarray_umath(void) {
         goto err;
     }
 
-    if (intern_strings() < 0) {
+    if (set_typeinfo(d) != 0) {
         goto err;
     }
-
-    if (set_typeinfo(d) != 0) {
+    if (PyType_Ready(&PyArrayFunctionDispatcher_Type) < 0) {
         goto err;
     }
+    PyDict_SetItemString(
+            d, "_ArrayFunctionDispatcher",
+            (PyObject *)&PyArrayFunctionDispatcher_Type);
     if (PyType_Ready(&PyArrayMethod_Type) < 0) {
         goto err;
     }
diff --git a/numpy/core/src/multiarray/multiarraymodule.h b/numpy/core/src/multiarray/multiarraymodule.h
index 809736cd2..9ba2a1831 100644
--- a/numpy/core/src/multiarray/multiarraymodule.h
+++ b/numpy/core/src/multiarray/multiarraymodule.h
@@ -1,6 +1,8 @@
 #ifndef NUMPY_CORE_SRC_MULTIARRAY_MULTIARRAYMODULE_H_
 #define NUMPY_CORE_SRC_MULTIARRAY_MULTIARRAYMODULE_H_
 
+NPY_VISIBILITY_HIDDEN extern int npy_numpy2_behavior;
+
 NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_current_allocator;
 NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_array;
 NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_array_function;
@@ -14,5 +16,6 @@ NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_axis1;
 NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_axis2;
 NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_like;
 NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_numpy;
+NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_where;
 
 #endif  /* NUMPY_CORE_SRC_MULTIARRAY_MULTIARRAYMODULE_H_ */
diff --git a/numpy/core/src/multiarray/nditer_api.c b/numpy/core/src/multiarray/nditer_api.c
index 37e8acd84..28b7bf6e6 100644
--- a/numpy/core/src/multiarray/nditer_api.c
+++ b/numpy/core/src/multiarray/nditer_api.c
@@ -17,6 +17,7 @@
 #include "nditer_impl.h"
 #include "templ_common.h"
 #include "ctors.h"
+#include "refcount.h"
 
 /* Internal helper functions private to this file */
 static npy_intp
@@ -1945,8 +1946,7 @@ npyiter_copy_from_buffers(NpyIter *iter)
          * only copy back when this flag is on.
          */
         if ((transferinfo[iop].write.func != NULL) &&
-               (op_itflags[iop]&(NPY_OP_ITFLAG_WRITE|NPY_OP_ITFLAG_USINGBUFFER))
-                        == (NPY_OP_ITFLAG_WRITE|NPY_OP_ITFLAG_USINGBUFFER)) {
+               (op_itflags[iop]&NPY_OP_ITFLAG_USINGBUFFER)) {
             npy_intp op_transfersize;
 
             npy_intp src_stride, *dst_strides, *dst_coords, *dst_shape;
@@ -2059,27 +2059,18 @@ npyiter_copy_from_buffers(NpyIter *iter)
          * The flag USINGBUFFER is set when the buffer was used, so
          * only decrement refs when this flag is on.
          */
-        else if (transferinfo[iop].write.func != NULL &&
-                       (op_itflags[iop]&NPY_OP_ITFLAG_USINGBUFFER) != 0) {
-            NPY_IT_DBG_PRINT1("Iterator: Freeing refs and zeroing buffer "
-                                "of operand %d\n", (int)iop);
-            /* Decrement refs */
+        else if (transferinfo[iop].clear.func != NULL &&
+                    (op_itflags[iop]&NPY_OP_ITFLAG_USINGBUFFER)) {
+            NPY_IT_DBG_PRINT1(
+                    "Iterator: clearing refs of operand %d\n", (int)iop);
             npy_intp buf_stride = dtypes[iop]->elsize;
-            if (transferinfo[iop].write.func(
-                    &transferinfo[iop].write.context,
-                    &buffer, &transfersize, &buf_stride,
-                    transferinfo[iop].write.auxdata) < 0) {
+            if (transferinfo[iop].clear.func(
+                    NULL, transferinfo[iop].clear.descr, buffer, transfersize,
+                    buf_stride, transferinfo[iop].clear.auxdata) < 0) {
                 /* Since this should only decrement, it should never error */
                 assert(0);
                 return -1;
             }
-            /*
-             * Zero out the memory for safety.  For instance,
-             * if during iteration some Python code copied an
-             * array pointing into the buffer, it will get None
-             * values for its references after this.
-             */
-            memset(buffer, 0, dtypes[iop]->elsize*transfersize);
         }
     }
 
@@ -2626,54 +2617,36 @@ npyiter_clear_buffers(NpyIter *iter)
         return;
     }
 
-    if (!(NIT_ITFLAGS(iter) & NPY_ITFLAG_NEEDSAPI)) {
-        /* Buffers do not require clearing, but should not be copied back */
-        NBF_SIZE(bufferdata) = 0;
-        return;
-    }
-
     /*
-     * The iterator may be using a dtype with references, which always
-     * requires the API. In that case, further cleanup may be necessary.
-     *
-     * TODO: At this time, we assume that a dtype having references
-     *       implies the need to hold the GIL at all times. In theory
-     *       we could broaden this definition for a new
-     *       `PyArray_Item_XDECREF` API and the assumption may become
-     *       incorrect.
+     * The iterator may be using a dtype with references (as of writing this
+     * means Python objects, but does not need to stay that way).
+     * In that case, further cleanup may be necessary and we clear buffers
+     * explicitly.
      */
     PyObject *type, *value, *traceback;
     PyErr_Fetch(&type,  &value, &traceback);
 
     /* Cleanup any buffers with references */
     char **buffers = NBF_BUFFERS(bufferdata);
+
+    NpyIter_TransferInfo *transferinfo = NBF_TRANSFERINFO(bufferdata);
     PyArray_Descr **dtypes = NIT_DTYPES(iter);
     npyiter_opitflags *op_itflags = NIT_OPITFLAGS(iter);
     for (int iop = 0; iop < nop; ++iop, ++buffers) {
-        /*
-         * We may want to find a better way to do this, on the other hand,
-         * this cleanup seems rare and fairly special.  A dtype using
-         * references (right now only us) must always keep the buffer in
-         * a well defined state (either NULL or owning the reference).
-         * Only we implement cleanup
-         */
-        if (!PyDataType_REFCHK(dtypes[iop]) ||
-                !(op_itflags[iop]&NPY_OP_ITFLAG_USINGBUFFER)) {
+        if (transferinfo[iop].clear.func == NULL ||
+                 !(op_itflags[iop]&NPY_OP_ITFLAG_USINGBUFFER)) {
             continue;
         }
         if (*buffers == 0) {
             continue;
         }
         int itemsize = dtypes[iop]->elsize;
-        for (npy_intp i = 0; i < NBF_SIZE(bufferdata); i++) {
-            /*
-             * See above comment, if this API is expanded the GIL assumption
-             * could become incorrect.
-             */
-            PyArray_Item_XDECREF(*buffers + (itemsize * i), dtypes[iop]);
+        if (transferinfo[iop].clear.func(NULL,
+                dtypes[iop], *buffers, NBF_SIZE(bufferdata), itemsize,
+                transferinfo[iop].clear.auxdata) < 0) {
+            /* This should never fail; if it does write it out */
+            PyErr_WriteUnraisable(NULL);
         }
-        /* Clear out the buffer just to be sure */
-        memset(*buffers, 0, NBF_SIZE(bufferdata) * itemsize);
     }
     /* Signal that the buffers are empty */
     NBF_SIZE(bufferdata) = 0;
diff --git a/numpy/core/src/multiarray/nditer_constr.c b/numpy/core/src/multiarray/nditer_constr.c
index 93d2bc961..dfa84c51c 100644
--- a/numpy/core/src/multiarray/nditer_constr.c
+++ b/numpy/core/src/multiarray/nditer_constr.c
@@ -19,6 +19,8 @@
 #include "array_coercion.h"
 #include "templ_common.h"
 #include "array_assign.h"
+#include "dtype_traversal.h"
+
 
 /* Internal helper functions private to this file */
 static int
@@ -58,7 +60,7 @@ npyiter_fill_axisdata(NpyIter *iter, npy_uint32 flags, npyiter_opitflags *op_itf
                     char **op_dataptr,
                     const npy_uint32 *op_flags, int **op_axes,
                     npy_intp const *itershape);
-static NPY_INLINE int
+static inline int
 npyiter_get_op_axis(int axis, npy_bool *reduction_axis);
 static void
 npyiter_replace_axisdata(
@@ -630,6 +632,18 @@ NpyIter_Copy(NpyIter *iter)
                     }
                 }
             }
+
+            if (transferinfo[iop].clear.func != NULL) {
+                if (out_of_memory) {
+                    transferinfo[iop].clear.func = NULL;  /* No cleanup */
+                }
+                else {
+                    if (NPY_traverse_info_copy(&transferinfo[iop].clear,
+                                               &transferinfo[iop].clear) < 0) {
+                        out_of_memory = 1;
+                    }
+                }
+            }
         }
 
         /* Initialize the buffers to the current iterindex */
@@ -706,6 +720,7 @@ NpyIter_Deallocate(NpyIter *iter)
         for (iop = 0; iop < nop; ++iop, ++transferinfo) {
             NPY_cast_info_xfree(&transferinfo->read);
             NPY_cast_info_xfree(&transferinfo->write);
+            NPY_traverse_info_xfree(&transferinfo->clear);
         }
     }
 
@@ -1106,7 +1121,8 @@ npyiter_prepare_one_operand(PyArrayObject **op,
                                              NPY_ITEM_IS_POINTER))) != 0)) {
                 PyErr_SetString(PyExc_TypeError,
                         "Iterator operand or requested dtype holds "
-                        "references, but the REFS_OK flag was not enabled");
+                        "references, but the NPY_ITER_REFS_OK flag was not "
+                        "enabled");
                 return 0;
             }
         }
@@ -1118,7 +1134,7 @@ npyiter_prepare_one_operand(PyArrayObject **op,
         if (op_request_dtype != NULL) {
             /* We just have a borrowed reference to op_request_dtype */
             Py_SETREF(*op_dtype, PyArray_AdaptDescriptorToArray(
-                                            *op, (PyObject *)op_request_dtype));
+                                            *op, NULL, op_request_dtype));
             if (*op_dtype == NULL) {
                 return 0;
             }
@@ -1133,7 +1149,7 @@ npyiter_prepare_one_operand(PyArrayObject **op,
                           PyArray_DescrNewByteorder(*op_dtype, NPY_NATIVE));
                 if (*op_dtype == NULL) {
                     return 0;
-                }                
+                }
                 NPY_IT_DBG_PRINT("Iterator: Setting NPY_OP_ITFLAG_CAST "
                                     "because of NPY_ITER_NBO\n");
                 /* Indicate that byte order or alignment needs fixing */
@@ -1445,7 +1461,7 @@ npyiter_check_reduce_ok_and_set_flags(
  * @param reduction_axis Output 1 if a reduction axis, otherwise 0.
  * @returns The normalized axis (without reduce axis flag).
  */
-static NPY_INLINE int
+static inline int
 npyiter_get_op_axis(int axis, npy_bool *reduction_axis) {
     npy_bool forced_broadcast = axis >= NPY_ITER_REDUCTION_AXIS(-1);
 
@@ -2249,7 +2265,7 @@ npyiter_reverse_axis_ordering(NpyIter *iter)
     NIT_ITFLAGS(iter) &= ~NPY_ITFLAG_IDENTPERM;
 }
 
-static NPY_INLINE npy_intp
+static inline npy_intp
 intp_abs(npy_intp x)
 {
     return (x < 0) ? -x : x;
@@ -3186,31 +3202,33 @@ npyiter_allocate_transfer_functions(NpyIter *iter)
                     cflags = PyArrayMethod_COMBINED_FLAGS(cflags, nc_flags);
                 }
             }
-            /* If no write back but there are references make a decref fn */
-            else if (PyDataType_REFCHK(op_dtype[iop])) {
+            else {
+                transferinfo[iop].write.func = NULL;
+            }
+
+            /* Get the decref function, used for no-writeback and on error */
+            if (PyDataType_REFCHK(op_dtype[iop])) {
                 /*
                  * By passing NULL to dst_type and setting move_references
                  * to 1, we get back a function that just decrements the
                  * src references.
                  */
-                if (PyArray_GetDTypeTransferFunction(
+                if (PyArray_GetClearFunction(
                         (flags & NPY_OP_ITFLAG_ALIGNED) != 0,
-                        op_dtype[iop]->elsize, 0,
-                        op_dtype[iop], NULL,
-                        1,
-                        &transferinfo[iop].write,
-                        &nc_flags) != NPY_SUCCEED) {
+                        op_dtype[iop]->elsize, op_dtype[iop],
+                        &transferinfo[iop].clear, &nc_flags) < 0) {
                     goto fail;
                 }
                 cflags = PyArrayMethod_COMBINED_FLAGS(cflags, nc_flags);
             }
             else {
-                transferinfo[iop].write.func = NULL;
+                transferinfo[iop].clear.func = NULL;
             }
         }
         else {
             transferinfo[iop].read.func = NULL;
             transferinfo[iop].write.func = NULL;
+            transferinfo[iop].clear.func = NULL;
         }
     }
 
diff --git a/numpy/core/src/multiarray/nditer_impl.h b/numpy/core/src/multiarray/nditer_impl.h
index 459675ea8..a33e4f90f 100644
--- a/numpy/core/src/multiarray/nditer_impl.h
+++ b/numpy/core/src/multiarray/nditer_impl.h
@@ -23,6 +23,8 @@
 
 #include "lowlevel_strided_loops.h"
 #include "dtype_transfer.h"
+#include "dtype_traversal.h"
+
 
 /********** ITERATOR CONSTRUCTION TIMING **************/
 #define NPY_IT_CONSTRUCTION_TIMING 0
@@ -152,7 +154,7 @@ struct NpyIter_InternalOnly {
     /* iterindex is only used if RANGED or BUFFERED is set */
     npy_intp iterindex;
     /* The rest is variable */
-    char iter_flexdata;
+    char iter_flexdata[];
 };
 
 typedef struct NpyIter_AxisData_tag NpyIter_AxisData;
@@ -221,27 +223,28 @@ typedef npy_int16 npyiter_opitflags;
 #define NIT_ITERINDEX(iter) \
         (iter->iterindex)
 #define NIT_PERM(iter)  ((npy_int8 *)( \
-        &(iter)->iter_flexdata + NIT_PERM_OFFSET()))
+        iter->iter_flexdata + NIT_PERM_OFFSET()))
 #define NIT_DTYPES(iter) ((PyArray_Descr **)( \
-        &(iter)->iter_flexdata + NIT_DTYPES_OFFSET(itflags, ndim, nop)))
+        iter->iter_flexdata + NIT_DTYPES_OFFSET(itflags, ndim, nop)))
 #define NIT_RESETDATAPTR(iter) ((char **)( \
-        &(iter)->iter_flexdata + NIT_RESETDATAPTR_OFFSET(itflags, ndim, nop)))
+        iter->iter_flexdata + NIT_RESETDATAPTR_OFFSET(itflags, ndim, nop)))
 #define NIT_BASEOFFSETS(iter) ((npy_intp *)( \
-        &(iter)->iter_flexdata + NIT_BASEOFFSETS_OFFSET(itflags, ndim, nop)))
+        iter->iter_flexdata + NIT_BASEOFFSETS_OFFSET(itflags, ndim, nop)))
 #define NIT_OPERANDS(iter) ((PyArrayObject **)( \
-        &(iter)->iter_flexdata + NIT_OPERANDS_OFFSET(itflags, ndim, nop)))
+        iter->iter_flexdata + NIT_OPERANDS_OFFSET(itflags, ndim, nop)))
 #define NIT_OPITFLAGS(iter) ((npyiter_opitflags *)( \
-        &(iter)->iter_flexdata + NIT_OPITFLAGS_OFFSET(itflags, ndim, nop)))
+        iter->iter_flexdata + NIT_OPITFLAGS_OFFSET(itflags, ndim, nop)))
 #define NIT_BUFFERDATA(iter) ((NpyIter_BufferData *)( \
-        &(iter)->iter_flexdata + NIT_BUFFERDATA_OFFSET(itflags, ndim, nop)))
+        iter->iter_flexdata + NIT_BUFFERDATA_OFFSET(itflags, ndim, nop)))
 #define NIT_AXISDATA(iter) ((NpyIter_AxisData *)( \
-        &(iter)->iter_flexdata + NIT_AXISDATA_OFFSET(itflags, ndim, nop)))
+        iter->iter_flexdata + NIT_AXISDATA_OFFSET(itflags, ndim, nop)))
 
 /* Internal-only BUFFERDATA MEMBER ACCESS */
 
 struct NpyIter_TransferInfo_tag {
     NPY_cast_info read;
     NPY_cast_info write;
+    NPY_traverse_info clear;
     /* Probably unnecessary, but make sure what follows is intp aligned: */
     npy_intp _unused_ensure_alignment[];
 };
@@ -329,7 +332,7 @@ struct NpyIter_AxisData_tag {
  * @return The unpermuted axis. Without `op_axes` this is correct, with
  *         `op_axes` this indexes into `op_axes` (unpermuted iterator axis)
  */
-static NPY_INLINE int
+static inline int
 npyiter_undo_iter_axis_perm(
         int axis, int ndim, const npy_int8 *perm, npy_bool *axis_flipped)
 {
diff --git a/numpy/core/src/multiarray/number.c b/numpy/core/src/multiarray/number.c
index 292ef55a6..35e4af79c 100644
--- a/numpy/core/src/multiarray/number.c
+++ b/numpy/core/src/multiarray/number.c
@@ -53,6 +53,8 @@ static PyObject *
 array_inplace_remainder(PyArrayObject *m1, PyObject *m2);
 static PyObject *
 array_inplace_power(PyArrayObject *a1, PyObject *o2, PyObject *NPY_UNUSED(modulo));
+static PyObject *
+array_inplace_matrix_multiply(PyArrayObject *m1, PyObject *m2);
 
 /*
  * Dictionary can contain any of the numeric operations, by name.
@@ -339,7 +341,6 @@ array_divmod(PyObject *m1, PyObject *m2)
     return PyArray_GenericBinaryFunction(m1, m2, n_ops.divmod);
 }
 
-/* Need this to be version dependent on account of the slot check */
 static PyObject *
 array_matrix_multiply(PyObject *m1, PyObject *m2)
 {
@@ -348,13 +349,70 @@ array_matrix_multiply(PyObject *m1, PyObject *m2)
 }
 
 static PyObject *
-array_inplace_matrix_multiply(
-        PyArrayObject *NPY_UNUSED(m1), PyObject *NPY_UNUSED(m2))
+array_inplace_matrix_multiply(PyArrayObject *self, PyObject *other)
 {
-    PyErr_SetString(PyExc_TypeError,
-                    "In-place matrix multiplication is not (yet) supported. "
-                    "Use 'a = a @ b' instead of 'a @= b'.");
-    return NULL;
+    static PyObject *AxisError_cls = NULL;
+    npy_cache_import("numpy.exceptions", "AxisError", &AxisError_cls);
+    if (AxisError_cls == NULL) {
+        return NULL;
+    }
+
+    INPLACE_GIVE_UP_IF_NEEDED(self, other,
+            nb_inplace_matrix_multiply, array_inplace_matrix_multiply);
+
+    /*
+     * Unlike `matmul(a, b, out=a)` we ensure that the result is not broadcast
+     * if the result without `out` would have less dimensions than `a`.
+     * Since the signature of matmul is '(n?,k),(k,m?)->(n?,m?)' this is the
+     * case exactly when the second operand has both core dimensions.
+     *
+     * The error here will be confusing, but for now, we enforce this by
+     * passing the correct `axes=`.
+     */
+    static PyObject *axes_1d_obj_kwargs = NULL;
+    static PyObject *axes_2d_obj_kwargs = NULL;
+    if (NPY_UNLIKELY(axes_1d_obj_kwargs == NULL)) {
+        axes_1d_obj_kwargs = Py_BuildValue(
+                "{s, [(i), (i, i), (i)]}", "axes", -1, -2, -1, -1);
+        if (axes_1d_obj_kwargs == NULL) {
+            return NULL;
+        }
+    }
+    if (NPY_UNLIKELY(axes_2d_obj_kwargs == NULL)) {
+        axes_2d_obj_kwargs = Py_BuildValue(
+                "{s, [(i, i), (i, i), (i, i)]}", "axes", -2, -1, -2, -1, -2, -1);
+        if (axes_2d_obj_kwargs == NULL) {
+            return NULL;
+        }
+    }
+
+    PyObject *args = PyTuple_Pack(3, self, other, self);
+    if (args == NULL) {
+        return NULL;
+    }
+    PyObject *kwargs;
+    if (PyArray_NDIM(self) == 1) {
+        kwargs = axes_1d_obj_kwargs;
+    }
+    else {
+        kwargs = axes_2d_obj_kwargs;
+    }
+    PyObject *res = PyObject_Call(n_ops.matmul, args, kwargs);
+    Py_DECREF(args);
+
+    if (res == NULL) {
+        /*
+         * AxisError should indicate that the axes argument didn't work out
+         * which should mean the second operand not being 2 dimensional.
+         */
+        if (PyErr_ExceptionMatches(AxisError_cls)) {
+            PyErr_SetString(PyExc_ValueError,
+                "inplace matrix multiplication requires the first operand to "
+                "have at least one and the second at least two dimensions.");
+        }
+    }
+
+    return res;
 }
 
 /*
@@ -539,47 +597,10 @@ array_power(PyObject *a1, PyObject *o2, PyObject *modulo)
 static PyObject *
 array_positive(PyArrayObject *m1)
 {
-    /*
-     * For backwards compatibility, where + just implied a copy,
-     * we cannot just call n_ops.positive.  Instead, we do the following
-     * 1. Try n_ops.positive
-     * 2. If we get an exception, check whether __array_ufunc__ is
-     *    overridden; if so, we live in the future and we allow the
-     *    TypeError to be passed on.
-     * 3. If not, give a deprecation warning and return a copy.
-     */
-    PyObject *value;
     if (can_elide_temp_unary(m1)) {
-        value = PyArray_GenericInplaceUnaryFunction(m1, n_ops.positive);
-    }
-    else {
-        value = PyArray_GenericUnaryFunction(m1, n_ops.positive);
-    }
-    if (value == NULL) {
-        /*
-         * We first fetch the error, as it needs to be clear to check
-         * for the override.  When the deprecation is removed,
-         * this whole stanza can be deleted.
-         */
-        PyObject *exc, *val, *tb;
-        PyErr_Fetch(&exc, &val, &tb);
-        if (PyUFunc_HasOverride((PyObject *)m1)) {
-            PyErr_Restore(exc, val, tb);
-            return NULL;
-        }
-        Py_XDECREF(exc);
-        Py_XDECREF(val);
-        Py_XDECREF(tb);
-
-        /* 2018-06-28, 1.16.0 */
-        if (DEPRECATE("Applying '+' to a non-numerical array is "
-                      "ill-defined. Returning a copy, but in the future "
-                      "this will error.") < 0) {
-            return NULL;
-        }
-        value = PyArray_Return((PyArrayObject *)PyArray_Copy(m1));
+        return PyArray_GenericInplaceUnaryFunction(m1, n_ops.positive);
     }
-    return value;
+    return PyArray_GenericUnaryFunction(m1, n_ops.positive);
 }
 
 static PyObject *
@@ -848,13 +869,11 @@ array_scalar_forward(PyArrayObject *v,
                      PyObject *(*builtin_func)(PyObject *),
                      const char *where)
 {
-    PyObject *scalar;
-    if (PyArray_SIZE(v) != 1) {
-        PyErr_SetString(PyExc_TypeError, "only size-1 arrays can be"\
-                        " converted to Python scalars");
+    if (check_is_convertible_to_scalar(v) < 0) {
         return NULL;
     }
 
+    PyObject *scalar;
     scalar = PyArray_GETITEM(v, PyArray_DATA(v));
     if (scalar == NULL) {
         return NULL;
@@ -925,7 +944,6 @@ NPY_NO_EXPORT PyNumberMethods array_as_number = {
 
     .nb_int = (unaryfunc)array_int,
     .nb_float = (unaryfunc)array_float,
-    .nb_index = (unaryfunc)array_index,
 
     .nb_inplace_add = (binaryfunc)array_inplace_add,
     .nb_inplace_subtract = (binaryfunc)array_inplace_subtract,
@@ -943,6 +961,8 @@ NPY_NO_EXPORT PyNumberMethods array_as_number = {
     .nb_inplace_floor_divide = (binaryfunc)array_inplace_floor_divide,
     .nb_inplace_true_divide = (binaryfunc)array_inplace_true_divide,
 
+    .nb_index = (unaryfunc)array_index,
+
     .nb_matrix_multiply = array_matrix_multiply,
     .nb_inplace_matrix_multiply = (binaryfunc)array_inplace_matrix_multiply,
 };
diff --git a/numpy/core/src/multiarray/refcount.c b/numpy/core/src/multiarray/refcount.c
index a1c310700..876bb53e1 100644
--- a/numpy/core/src/multiarray/refcount.c
+++ b/numpy/core/src/multiarray/refcount.c
@@ -2,6 +2,10 @@
  * This module corresponds to the `Special functions for NPY_OBJECT`
  * section in the numpy reference for C-API.
  */
+#include "array_method.h"
+#include "dtype_traversal.h"
+#include "lowlevel_strided_loops.h"
+#include "pyerrors.h"
 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
 #define _MULTIARRAYMODULE
 
@@ -12,13 +16,101 @@
 #include "numpy/arrayobject.h"
 #include "numpy/arrayscalars.h"
 #include "iterators.h"
+#include "dtypemeta.h"
+#include "refcount.h"
 
 #include "npy_config.h"
 
 #include "npy_pycompat.h"
 
-static void
-_fillobject(char *optr, PyObject *obj, PyArray_Descr *dtype);
+/*
+ * Helper function to clear a strided memory (normally or always contiguous)
+ * from all Python (or other) references.  The function does nothing if the
+ * array dtype does not indicate holding references.
+ *
+ * It is safe to call this function more than once, failing here is usually
+ * critical (during cleanup) and should be set up to minimize the risk or
+ * avoid it fully.
+ */
+NPY_NO_EXPORT int
+PyArray_ClearBuffer(
+        PyArray_Descr *descr, char *data,
+        npy_intp stride, npy_intp size, int aligned)
+{
+    if (!PyDataType_REFCHK(descr)) {
+        return 0;
+    }
+
+    NPY_traverse_info clear_info;
+    /* Flags unused: float errors do not matter and we do not release GIL */
+    NPY_ARRAYMETHOD_FLAGS flags_unused;
+    if (PyArray_GetClearFunction(
+            aligned, stride, descr, &clear_info, &flags_unused) < 0) {
+        return -1;
+    }
+
+    int res = clear_info.func(
+            NULL, clear_info.descr, data, size, stride, clear_info.auxdata);
+    NPY_traverse_info_xfree(&clear_info);
+    return res;
+}
+
+
+/*
+ * Helper function to clear whole array.  It seems plausible that we should
+ * be able to get away with assuming the the array is contiguous.
+ *
+ * Must only be called on arrays which own their data (and asserts this).
+ */
+ NPY_NO_EXPORT int
+ PyArray_ClearArray(PyArrayObject *arr)
+ {
+    assert(PyArray_FLAGS(arr) & NPY_ARRAY_OWNDATA);
+
+    PyArray_Descr *descr = PyArray_DESCR(arr);
+    if (!PyDataType_REFCHK(descr)) {
+        return 0;
+    }
+    /*
+     * The contiguous path should cover practically all important cases since
+     * it is difficult to create a non-contiguous array which owns its memory
+     * and only arrays which own their memory should clear it.
+     */
+    int aligned = PyArray_ISALIGNED(arr);
+    if (PyArray_ISCONTIGUOUS(arr)) {
+        return PyArray_ClearBuffer(
+                descr, PyArray_BYTES(arr), descr->elsize,
+                PyArray_SIZE(arr), aligned);
+    }
+    int idim, ndim;
+    npy_intp shape_it[NPY_MAXDIMS], strides_it[NPY_MAXDIMS];
+    npy_intp coord[NPY_MAXDIMS];
+    char *data_it;
+    if (PyArray_PrepareOneRawArrayIter(
+                    PyArray_NDIM(arr), PyArray_DIMS(arr),
+                    PyArray_BYTES(arr), PyArray_STRIDES(arr),
+                    &ndim, shape_it, &data_it, strides_it) < 0) {
+        return -1;
+    }
+    npy_intp inner_stride = strides_it[0];
+    npy_intp inner_shape = shape_it[0];
+    NPY_traverse_info clear_info;
+    /* Flags unused: float errors do not matter and we do not release GIL */
+    NPY_ARRAYMETHOD_FLAGS flags_unused;
+    if (PyArray_GetClearFunction(
+            aligned, inner_stride, descr, &clear_info, &flags_unused) < 0) {
+        return -1;
+    }
+    NPY_RAW_ITER_START(idim, ndim, coord, shape_it) {
+        /* Process the innermost dimension */
+        if (clear_info.func(NULL, clear_info.descr,
+                data_it, inner_shape, inner_stride, clear_info.auxdata) < 0) {
+            return -1;
+        }
+    } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord,
+                            shape_it, data_it, strides_it);
+    return 0;
+}
 
 
 /*NUMPY_API
@@ -204,6 +296,10 @@ PyArray_INCREF(PyArrayObject *mp)
 /*NUMPY_API
   Decrement all internal references for object arrays.
   (or arrays with object fields)
+
+  The use of this function is strongly discouraged, within NumPy
+  use PyArray_Clear, which DECREF's and sets everything to NULL and can
+  work with any dtype.
 */
 NPY_NO_EXPORT int
 PyArray_XDECREF(PyArrayObject *mp)
@@ -254,15 +350,28 @@ PyArray_XDECREF(PyArrayObject *mp)
     return 0;
 }
 
+
+static void
+_fillobject(char *optr, PyObject *obj, PyArray_Descr *dtype);
+
+
 /*NUMPY_API
  * Assumes contiguous
  */
 NPY_NO_EXPORT void
 PyArray_FillObjectArray(PyArrayObject *arr, PyObject *obj)
 {
+    PyArray_Descr* descr = PyArray_DESCR(arr);
+
+    // non-legacy dtypes are responsible for initializing
+    // their own internal references
+    if (!NPY_DT_is_legacy(NPY_DTYPE(descr))) {
+        return;
+    }
+
     npy_intp i,n;
     n = PyArray_SIZE(arr);
-    if (PyArray_DESCR(arr)->type_num == NPY_OBJECT) {
+    if (descr->type_num == NPY_OBJECT) {
         PyObject **optr;
         optr = (PyObject **)(PyArray_DATA(arr));
         n = PyArray_SIZE(arr);
@@ -282,8 +391,8 @@ PyArray_FillObjectArray(PyArrayObject *arr, PyObject *obj)
         char *optr;
         optr = PyArray_DATA(arr);
         for (i = 0; i < n; i++) {
-            _fillobject(optr, obj, PyArray_DESCR(arr));
-            optr += PyArray_DESCR(arr)->elsize;
+            _fillobject(optr, obj, descr);
+            optr += descr->elsize;
         }
     }
 }
diff --git a/numpy/core/src/multiarray/refcount.h b/numpy/core/src/multiarray/refcount.h
index 959eef5ba..16d34e292 100644
--- a/numpy/core/src/multiarray/refcount.h
+++ b/numpy/core/src/multiarray/refcount.h
@@ -1,6 +1,14 @@
 #ifndef NUMPY_CORE_SRC_MULTIARRAY_REFCOUNT_H_
 #define NUMPY_CORE_SRC_MULTIARRAY_REFCOUNT_H_
 
+NPY_NO_EXPORT int
+PyArray_ClearBuffer(
+        PyArray_Descr *descr, char *data,
+        npy_intp stride, npy_intp size, int aligned);
+
+NPY_NO_EXPORT int
+PyArray_ClearArray(PyArrayObject *arr);
+
 NPY_NO_EXPORT void
 PyArray_Item_INCREF(char *data, PyArray_Descr *descr);
 
diff --git a/numpy/core/src/multiarray/scalartypes.c.src b/numpy/core/src/multiarray/scalartypes.c.src
index e1f236001..ff30737b5 100644
--- a/numpy/core/src/multiarray/scalartypes.c.src
+++ b/numpy/core/src/multiarray/scalartypes.c.src
@@ -15,6 +15,7 @@
 
 #include "npy_pycompat.h"
 
+#include "arraytypes.h"
 #include "npy_config.h"
 #include "mapping.h"
 #include "ctors.h"
@@ -278,7 +279,43 @@ static PyObject *
 genint_type_str(PyObject *self)
 {
     PyObject  *item, *item_str;
-    item = gentype_generic_method(self, NULL, NULL, "item");
+    PyArray_Descr *descr = PyArray_DescrFromTypeObject((PyObject *)Py_TYPE(self));
+    void *val = scalar_value(self, descr);
+    switch (descr->type_num) {
+        case NPY_BYTE:
+            item = PyLong_FromLong(*(int8_t *)val);
+            break;
+        case NPY_UBYTE:
+            item = PyLong_FromUnsignedLong(*(uint8_t *)val);
+            break;
+        case NPY_SHORT:
+            item = PyLong_FromLong(*(int16_t *)val);
+            break;
+        case NPY_USHORT:
+            item = PyLong_FromUnsignedLong(*(uint16_t *)val);
+            break;
+        case NPY_INT:
+            item = PyLong_FromLong(*(int32_t *)val);
+            break;
+        case NPY_UINT:
+            item = PyLong_FromUnsignedLong(*(uint32_t *)val);
+            break;
+        case NPY_LONG:
+            item = PyLong_FromLong(*(int64_t *)val);
+            break;
+        case NPY_ULONG:
+            item = PyLong_FromUnsignedLong(*(uint64_t *)val);
+            break;
+        case NPY_LONGLONG:
+            item = PyLong_FromLongLong(*(long long *)val);
+            break;
+        case NPY_ULONGLONG:
+            item = PyLong_FromUnsignedLongLong(*(unsigned long long *)val);
+            break;
+        default:
+            item = gentype_generic_method(self, NULL, NULL, "item");
+            break;
+    }
     if (item == NULL) {
         return NULL;
     }
@@ -447,7 +484,7 @@ unicodetype_@form@(PyObject *self)
  *
  * Largely copied from _Py_strhex_impl in CPython implementation
  */
-static NPY_INLINE PyObject *
+static inline PyObject *
 _void_to_hex(const char* argbuf, const Py_ssize_t arglen,
              const char *schars, const char *bprefix, const char *echars)
 {
@@ -896,15 +933,21 @@ static PyObject *
 @name@type_@kind@_either(npy_@name@ val, TrimMode trim_pos, TrimMode trim_sci,
                          npy_bool sign)
 {
-    npy_@name@ absval;
 
     if (npy_legacy_print_mode <= 113) {
         return legacy_@name@_format@kind@(val);
     }
 
-    absval = val < 0 ? -val : val;
+    int use_positional;
+    if (npy_isnan(val) || val == 0) {
+         use_positional = 1;
+    }
+    else {
+        npy_@name@ absval = val < 0 ? -val : val;
+        use_positional = absval < 1.e16L && absval >= 1.e-4L;
+    }
 
-    if (absval == 0 || (absval < 1.e16L && absval >= 1.e-4L) ) {
+    if (use_positional) {
         return format_@name@(val, 0, -1, sign, trim_pos, -1, -1, -1);
     }
     return format_@name@(val, 1, -1, sign, trim_sci, -1, -1, -1);
@@ -1072,6 +1115,8 @@ gentype_richcompare(PyObject *self, PyObject *other, int cmp_op)
         }
     }
 
+   RICHCMP_GIVE_UP_IF_NEEDED(self, other);
+
     arr = PyArray_FromScalar(self, NULL);
     if (arr == NULL) {
         return NULL;
@@ -1753,11 +1798,13 @@ gentype_reduce(PyObject *self, PyObject *NPY_UNUSED(args))
 
     mod = PyImport_ImportModule("numpy.core._multiarray_umath");
     if (mod == NULL) {
+        Py_DECREF(ret);
         return NULL;
     }
     obj = PyObject_GetAttrString(mod, "scalar");
     Py_DECREF(mod);
     if (obj == NULL) {
+        Py_DECREF(ret);
         return NULL;
     }
     PyTuple_SET_ITEM(ret, 0, obj);
@@ -1766,6 +1813,7 @@ gentype_reduce(PyObject *self, PyObject *NPY_UNUSED(args))
         PyObject *val = PyArrayScalar_VAL(self, Object);
         PyObject *tup = Py_BuildValue("NO", obj, val);
         if (tup == NULL) {
+            Py_DECREF(ret);
             return NULL;
         }
         PyTuple_SET_ITEM(ret, 1, tup);
@@ -1774,11 +1822,13 @@ gentype_reduce(PyObject *self, PyObject *NPY_UNUSED(args))
         /* a structured dtype with an object in a field */
         PyArrayObject *arr = (PyArrayObject *)PyArray_FromScalar(self, NULL);
         if (arr == NULL) {
+            Py_DECREF(ret);
             return NULL;
         }
         /* Use the whole array which handles sturctured void correctly */
         PyObject *tup = Py_BuildValue("NN", obj, arr);
         if (tup == NULL) {
+            Py_DECREF(ret);
             return NULL;
         }
         PyTuple_SET_ITEM(ret, 1, tup);
@@ -1839,10 +1889,7 @@ gentype_setflags(PyObject *NPY_UNUSED(self), PyObject *NPY_UNUSED(args),
 static PyObject *
 numbertype_class_getitem_abc(PyObject *cls, PyObject *args)
 {
-    PyObject *generic_alias;
-
-#ifdef Py_GENERICALIASOBJECT_H
-    Py_ssize_t args_len;
+    const Py_ssize_t args_len = PyTuple_Check(args) ? PyTuple_Size(args) : 1;
     int args_len_expected;
 
     /* complexfloating should take 2 parameters, all others take 1 */
@@ -1854,20 +1901,13 @@ numbertype_class_getitem_abc(PyObject *cls, PyObject *args)
         args_len_expected = 1;
     }
 
-    args_len = PyTuple_Check(args) ? PyTuple_Size(args) : 1;
     if ((args_len > args_len_expected) || (args_len == 0)) {
         return PyErr_Format(PyExc_TypeError,
                             "Too %s arguments for %s",
                             args_len > args_len_expected ? "many" : "few",
                             ((PyTypeObject *)cls)->tp_name);
     }
-    generic_alias = Py_GenericAlias(cls, args);
-#else
-    PyErr_SetString(PyExc_TypeError,
-                    "Type subscription requires python >= 3.9");
-    generic_alias = NULL;
-#endif
-    return generic_alias;
+    return Py_GenericAlias(cls, args);
 }
 
 /*
@@ -1878,14 +1918,9 @@ numbertype_class_getitem_abc(PyObject *cls, PyObject *args)
 static PyObject *
 numbertype_class_getitem(PyObject *cls, PyObject *args)
 {
-#ifdef Py_GENERICALIASOBJECT_H
     PyErr_Format(PyExc_TypeError,
                  "There are no type variables left in %s",
                  ((PyTypeObject *)cls)->tp_name);
-#else
-    PyErr_SetString(PyExc_TypeError,
-                    "Type subscription requires python >= 3.9");
-#endif
     return NULL;
 }
 
@@ -2273,7 +2308,7 @@ static PyGetSetDef inttype_getsets[] = {
 };
 
 static PyMethodDef numbertype_methods[] = {
-    /* for typing; requires python >= 3.9 */
+    /* for typing */
     {"__class_getitem__",
         (PyCFunction)numbertype_class_getitem_abc,
         METH_CLASS | METH_O, NULL},
@@ -2287,7 +2322,7 @@ static PyMethodDef @name@type_methods[] = {
     {"__complex__",
         (PyCFunction)@name@_complex,
         METH_VARARGS | METH_KEYWORDS, NULL},
-    /* for typing; requires python >= 3.9 */
+    /* for typing */
     {"__class_getitem__",
         (PyCFunction)numbertype_class_getitem,
         METH_CLASS | METH_O, NULL},
@@ -2328,7 +2363,7 @@ static PyMethodDef @name@type_methods[] = {
     {"is_integer",
         (PyCFunction)@name@_is_integer,
         METH_NOARGS, NULL},
-    /* for typing; requires python >= 3.9 */
+    /* for typing */
     {"__class_getitem__",
         (PyCFunction)numbertype_class_getitem,
         METH_CLASS | METH_O, NULL},
@@ -2340,7 +2375,7 @@ static PyMethodDef @name@type_methods[] = {
  * #name = timedelta, cdouble#
  */
 static PyMethodDef @name@type_methods[] = {
-    /* for typing; requires python >= 3.9 */
+    /* for typing */
     {"__class_getitem__",
         (PyCFunction)numbertype_class_getitem,
         METH_CLASS | METH_O, NULL},
@@ -2353,7 +2388,7 @@ static PyMethodDef @name@type_methods[] = {
  *         long, ulong, longlong, ulonglong#
  */
 static PyMethodDef @name@type_methods[] = {
-    /* for typing; requires python >= 3.9 */
+    /* for typing */
     {"__class_getitem__",
         (PyCFunction)numbertype_class_getitem,
         METH_CLASS | METH_O, NULL},
@@ -3170,28 +3205,33 @@ static PyObject *
 void_arrtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
 {
     PyObject *obj, *arr;
-    PyObject *new = NULL;
+    PyArray_Descr *descr = NULL;
 
-    static char *kwnames[] = {"", NULL};  /* positional-only */
-    if (!PyArg_ParseTupleAndKeywords(args, kwds, "O:void", kwnames, &obj)) {
+    static char *kwnames[] = {"", "dtype", NULL};
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|O&:void", kwnames,
+            &obj, &PyArray_DescrConverter2, &descr)) {
         return NULL;
     }
     /*
      * For a VOID scalar first see if obj is an integer or long
      * and create new memory of that size (filled with 0) for the scalar
      */
-    if (PyLong_Check(obj) ||
+    if (descr == NULL && (
+            PyLong_Check(obj) ||
             PyArray_IsScalar(obj, Integer) ||
             (PyArray_Check(obj) &&
                      PyArray_NDIM((PyArrayObject *)obj)==0 &&
-                     PyArray_ISINTEGER((PyArrayObject *)obj))) {
-        new = Py_TYPE(obj)->tp_as_number->nb_int(obj);
-    }
-    if (new && PyLong_Check(new)) {
+                     PyArray_ISINTEGER((PyArrayObject *)obj)))) {
+
+        PyObject *length = Py_TYPE(obj)->tp_as_number->nb_int(obj);
+        if (length == NULL) {
+            return NULL;
+        }
+
         PyObject *ret;
         char *destptr;
-        npy_ulonglong memu = PyLong_AsUnsignedLongLong(new);
-        Py_DECREF(new);
+        npy_ulonglong memu = PyLong_AsUnsignedLongLong(length);
+        Py_DECREF(length);
         if (PyErr_Occurred() || (memu > NPY_MAX_INT)) {
             PyErr_Clear();
             PyErr_Format(PyExc_OverflowError,
@@ -3226,7 +3266,24 @@ void_arrtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
         return ret;
     }
 
-    arr = PyArray_FROM_OTF(obj, NPY_VOID, NPY_ARRAY_FORCECAST);
+    if (descr == NULL) {
+        /* Use the "size-less" void dtype to discover the size. */
+        descr = PyArray_DescrNewFromType(NPY_VOID);
+        if (descr == NULL) {
+            return NULL;
+        }
+    }
+    else if (descr->type_num != NPY_VOID || PyDataType_HASSUBARRAY(descr)) {
+        /* we reject subarrays, since subarray scalars do not exist. */
+        PyErr_Format(PyExc_TypeError,
+                "void: descr must be a `void` dtype that is not "
+                "a subarray dtype (structured or unstructured). "
+                "Got '%.100R'.", descr);
+        Py_DECREF(descr);
+        return NULL;
+    }
+
+    arr = PyArray_FromAny(obj, descr, 0, 0, NPY_ARRAY_FORCECAST, NULL);
     return PyArray_Return((PyArrayObject *)arr);
 }
 
@@ -3292,7 +3349,7 @@ long_arrtype_hash(PyObject *obj)
  * #Char = ,U#
  * #Word = ,Unsigned#
  */
-static NPY_INLINE npy_hash_t
+static inline npy_hash_t
 @char@longlong_arrtype_hash(PyObject *obj)
 {
     PyObject * l = PyLong_From@Word@LongLong(
@@ -3561,9 +3618,8 @@ object_arrtype_call(PyObjectScalarObject *obj, PyObject *args, PyObject *kwds)
 
 NPY_NO_EXPORT PyTypeObject PyObjectArrType_Type = {
     PyVarObject_HEAD_INIT(NULL, 0)
-    .tp_name = "numpy.object_",
+    .tp_name = "numpy." NPY_OBJECT_name,
     .tp_basicsize = sizeof(PyObjectScalarObject),
-    .tp_alloc = object_arrtype_alloc,
     .tp_dealloc = (destructor)object_arrtype_dealloc,
     .tp_as_sequence = &object_arrtype_as_sequence,
     .tp_as_mapping = &object_arrtype_as_mapping,
@@ -3571,6 +3627,7 @@ NPY_NO_EXPORT PyTypeObject PyObjectArrType_Type = {
     .tp_getattro = (getattrofunc)object_arrtype_getattro,
     .tp_setattro = (setattrofunc)object_arrtype_setattro,
     .tp_as_buffer = &object_arrtype_as_buffer,
+    .tp_alloc = object_arrtype_alloc,
 };
 
 static PyObject *
@@ -3597,60 +3654,29 @@ gen_arrtype_subscript(PyObject *self, PyObject *key)
 }
 
 
-#define NAME_bool "bool"
-#define NAME_void "void"
-#define NAME_string "bytes"
-#define NAME_unicode "str"
-
 /**begin repeat
- * #name = bool, string, unicode, void#
- * #NAME = Bool, String, Unicode, Void#
- * #ex = _,_,_,#
+ * #Name = Bool,
+ *         Byte, Short, Int, Long, LongLong,
+ *         UByte, UShort, UInt, ULong, ULongLong,
+ *         Half, Float, Double, LongDouble,
+ *         CFloat, CDouble, CLongDouble,
+ *         String, Unicode, Void,
+ *         Datetime, Timedelta#
+ * #NAME = BOOL,
+ *         BYTE, SHORT, INT, LONG, LONGLONG,
+ *         UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+ *         HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ *         CFLOAT, CDOUBLE, CLONGDOUBLE,
+ *         STRING, UNICODE, VOID,
+ *         DATETIME, TIMEDELTA#
  */
-NPY_NO_EXPORT PyTypeObject Py@NAME@ArrType_Type = {
-    PyVarObject_HEAD_INIT(NULL, 0)
-    .tp_name = "numpy." NAME_@name@ "@ex@",
-    .tp_basicsize = sizeof(Py@NAME@ScalarObject),
-};
-/**end repeat**/
-
-#undef NAME_bool
-#undef NAME_void
-#undef NAME_string
-#undef NAME_unicode
 
-/**begin repeat
- * #NAME = Byte, Short, Int, Long, LongLong, UByte, UShort, UInt, ULong,
- *         ULongLong, Half, Float, Double, LongDouble, Datetime, Timedelta#
- * #name = int*5, uint*5, float*4, datetime, timedelta#
- * #CNAME = (CHAR, SHORT, INT, LONG, LONGLONG)*2, HALF, FLOAT, DOUBLE,
- *          LONGDOUBLE, DATETIME, TIMEDELTA#
- */
-#if NPY_BITSOF_@CNAME@ == 8
-#define _THIS_SIZE "8"
-#elif NPY_BITSOF_@CNAME@ == 16
-#define _THIS_SIZE "16"
-#elif NPY_BITSOF_@CNAME@ == 32
-#define _THIS_SIZE "32"
-#elif NPY_BITSOF_@CNAME@ == 64
-#define _THIS_SIZE "64"
-#elif NPY_BITSOF_@CNAME@ == 80
-#define _THIS_SIZE "80"
-#elif NPY_BITSOF_@CNAME@ == 96
-#define _THIS_SIZE "96"
-#elif NPY_BITSOF_@CNAME@ == 128
-#define _THIS_SIZE "128"
-#elif NPY_BITSOF_@CNAME@ == 256
-#define _THIS_SIZE "256"
-#endif
-NPY_NO_EXPORT PyTypeObject Py@NAME@ArrType_Type = {
+NPY_NO_EXPORT PyTypeObject Py@Name@ArrType_Type = {
     PyVarObject_HEAD_INIT(NULL, 0)
-    .tp_name = "numpy.@name@" _THIS_SIZE,
-    .tp_basicsize = sizeof(Py@NAME@ScalarObject),
+    .tp_name = "numpy." NPY_@NAME@_name,
+    .tp_basicsize = sizeof(Py@Name@ScalarObject),
 };
 
-
-#undef _THIS_SIZE
 /**end repeat**/
 
 
@@ -3659,37 +3685,6 @@ static PyMappingMethods gentype_as_mapping = {
 };
 
 
-/**begin repeat
- * #NAME = CFloat, CDouble, CLongDouble#
- * #name = complex*3#
- * #CNAME = FLOAT, DOUBLE, LONGDOUBLE#
- */
-#if NPY_BITSOF_@CNAME@ == 16
-#define _THIS_SIZE "32"
-#elif NPY_BITSOF_@CNAME@ == 32
-#define _THIS_SIZE "64"
-#elif NPY_BITSOF_@CNAME@ == 64
-#define _THIS_SIZE "128"
-#elif NPY_BITSOF_@CNAME@ == 80
-#define _THIS_SIZE "160"
-#elif NPY_BITSOF_@CNAME@ == 96
-#define _THIS_SIZE "192"
-#elif NPY_BITSOF_@CNAME@ == 128
-#define _THIS_SIZE "256"
-#elif NPY_BITSOF_@CNAME@ == 256
-#define _THIS_SIZE "512"
-#endif
-
-NPY_NO_EXPORT PyTypeObject Py@NAME@ArrType_Type = {
-    PyVarObject_HEAD_INIT(0, 0)
-    .tp_name = "numpy.@name@" _THIS_SIZE,
-    .tp_basicsize = sizeof(Py@NAME@ScalarObject),
-    .tp_flags = Py_TPFLAGS_DEFAULT,
-};
-#undef _THIS_SIZE
-
-/**end repeat**/
-
 /*
  * This table maps the built-in type numbers to their scalar
  * type numbers.  Note that signed integers are mapped to INTNEG_SCALAR,
@@ -4078,36 +4073,6 @@ initialize_numeric_types(void)
 
     PyArrayIter_Type.tp_iter = PyObject_SelfIter;
     PyArrayMapIter_Type.tp_iter = PyObject_SelfIter;
-
-    /*
-     * Give types different names when they are the same size (gh-9799).
-     * `np.intX` always refers to the first int of that size in the sequence
-     * `['LONG', 'LONGLONG', 'INT', 'SHORT', 'BYTE']`.
-     */
-#if (NPY_SIZEOF_BYTE == NPY_SIZEOF_SHORT)
-    PyByteArrType_Type.tp_name = "numpy.byte";
-    PyUByteArrType_Type.tp_name = "numpy.ubyte";
-#endif
-#if (NPY_SIZEOF_SHORT == NPY_SIZEOF_INT)
-    PyShortArrType_Type.tp_name = "numpy.short";
-    PyUShortArrType_Type.tp_name = "numpy.ushort";
-#endif
-#if (NPY_SIZEOF_INT == NPY_SIZEOF_LONG)
-    PyIntArrType_Type.tp_name = "numpy.intc";
-    PyUIntArrType_Type.tp_name = "numpy.uintc";
-#endif
-#if (NPY_SIZEOF_LONGLONG == NPY_SIZEOF_LONG)
-    PyLongLongArrType_Type.tp_name = "numpy.longlong";
-    PyULongLongArrType_Type.tp_name = "numpy.ulonglong";
-#endif
-
-    /*
-    Do the same for longdouble
-    */
-#if (NPY_SIZEOF_LONGDOUBLE == NPY_SIZEOF_DOUBLE)
-    PyLongDoubleArrType_Type.tp_name = "numpy.longdouble";
-    PyCLongDoubleArrType_Type.tp_name = "numpy.clongdouble";
-#endif
 }
 
 typedef struct {
diff --git a/numpy/core/src/multiarray/shape.c b/numpy/core/src/multiarray/shape.c
index 9d6afb155..15b166423 100644
--- a/numpy/core/src/multiarray/shape.c
+++ b/numpy/core/src/multiarray/shape.c
@@ -211,7 +211,7 @@ PyArray_Newshape(PyArrayObject *self, PyArray_Dims *newdims,
     int flags;
 
     if (order == NPY_ANYORDER) {
-        order = PyArray_ISFORTRAN(self);
+        order = PyArray_ISFORTRAN(self) ? NPY_FORTRANORDER : NPY_CORDER;
     }
     else if (order == NPY_KEEPORDER) {
         PyErr_SetString(PyExc_ValueError,
@@ -285,7 +285,7 @@ PyArray_Newshape(PyArrayObject *self, PyArray_Dims *newdims,
             Py_TYPE(self), PyArray_DESCR(self),
             ndim, dimensions, strides, PyArray_DATA(self),
             flags, (PyObject *)self, (PyObject *)self,
-            0, 1);
+            _NPY_ARRAY_ENSURE_DTYPE_IDENTITY);
     Py_DECREF(self);
     return (PyObject *)ret;
 }
@@ -787,7 +787,7 @@ PyArray_CreateSortedStridePerm(int ndim, npy_intp const *strides,
                                     &_npy_stride_sort_item_comparator);
 }
 
-static NPY_INLINE npy_intp
+static inline npy_intp
 s_intp_abs(npy_intp x)
 {
     return (x < 0) ? -x : x;
diff --git a/numpy/core/src/multiarray/textreading/conversions.c b/numpy/core/src/multiarray/textreading/conversions.c
index 7fa96dc5e..67a4803b9 100644
--- a/numpy/core/src/multiarray/textreading/conversions.c
+++ b/numpy/core/src/multiarray/textreading/conversions.c
@@ -19,7 +19,7 @@
  * Coercion to boolean is done via integer right now.
  */
 NPY_NO_EXPORT int
-to_bool(PyArray_Descr *NPY_UNUSED(descr),
+npy_to_bool(PyArray_Descr *NPY_UNUSED(descr),
         const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
         parser_config *NPY_UNUSED(pconfig))
 {
@@ -47,7 +47,7 @@ to_bool(PyArray_Descr *NPY_UNUSED(descr),
  *        (used by the complex parser).
  * @param result Output stored as double value.
  */
-static NPY_INLINE int
+static inline int
 double_from_ucs4(
         const Py_UCS4 *str, const Py_UCS4 *end,
         bool strip_whitespace, double *result, const Py_UCS4 **p_end)
@@ -110,7 +110,7 @@ double_from_ucs4(
 
 
 NPY_NO_EXPORT int
-to_float(PyArray_Descr *descr,
+npy_to_float(PyArray_Descr *descr,
         const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
         parser_config *NPY_UNUSED(pconfig))
 {
@@ -133,7 +133,7 @@ to_float(PyArray_Descr *descr,
 
 
 NPY_NO_EXPORT int
-to_double(PyArray_Descr *descr,
+npy_to_double(PyArray_Descr *descr,
         const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
         parser_config *NPY_UNUSED(pconfig))
 {
@@ -228,7 +228,7 @@ to_complex_int(
 
 
 NPY_NO_EXPORT int
-to_cfloat(PyArray_Descr *descr,
+npy_to_cfloat(PyArray_Descr *descr,
         const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
         parser_config *pconfig)
 {
@@ -253,7 +253,7 @@ to_cfloat(PyArray_Descr *descr,
 
 
 NPY_NO_EXPORT int
-to_cdouble(PyArray_Descr *descr,
+npy_to_cdouble(PyArray_Descr *descr,
         const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
         parser_config *pconfig)
 {
@@ -280,7 +280,7 @@ to_cdouble(PyArray_Descr *descr,
  * String and unicode conversion functions.
  */
 NPY_NO_EXPORT int
-to_string(PyArray_Descr *descr,
+npy_to_string(PyArray_Descr *descr,
         const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
         parser_config *NPY_UNUSED(unused))
 {
@@ -309,7 +309,7 @@ to_string(PyArray_Descr *descr,
 
 
 NPY_NO_EXPORT int
-to_unicode(PyArray_Descr *descr,
+npy_to_unicode(PyArray_Descr *descr,
         const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
         parser_config *NPY_UNUSED(unused))
 {
@@ -362,7 +362,7 @@ call_converter_function(
 
 
 NPY_NO_EXPORT int
-to_generic_with_converter(PyArray_Descr *descr,
+npy_to_generic_with_converter(PyArray_Descr *descr,
         const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
         parser_config *config, PyObject *func)
 {
@@ -387,9 +387,9 @@ to_generic_with_converter(PyArray_Descr *descr,
 
 
 NPY_NO_EXPORT int
-to_generic(PyArray_Descr *descr,
+npy_to_generic(PyArray_Descr *descr,
         const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
         parser_config *config)
 {
-    return to_generic_with_converter(descr, str, end, dataptr, config, NULL);
+    return npy_to_generic_with_converter(descr, str, end, dataptr, config, NULL);
 }
diff --git a/numpy/core/src/multiarray/textreading/conversions.h b/numpy/core/src/multiarray/textreading/conversions.h
index 222eea4e7..09f251041 100644
--- a/numpy/core/src/multiarray/textreading/conversions.h
+++ b/numpy/core/src/multiarray/textreading/conversions.h
@@ -10,47 +10,47 @@
 #include "textreading/parser_config.h"
 
 NPY_NO_EXPORT int
-to_bool(PyArray_Descr *descr,
+npy_to_bool(PyArray_Descr *descr,
         const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
         parser_config *pconfig);
 
 NPY_NO_EXPORT int
-to_float(PyArray_Descr *descr,
+npy_to_float(PyArray_Descr *descr,
         const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
         parser_config *pconfig);
 
 NPY_NO_EXPORT int
-to_double(PyArray_Descr *descr,
+npy_to_double(PyArray_Descr *descr,
         const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
         parser_config *pconfig);
 
 NPY_NO_EXPORT int
-to_cfloat(PyArray_Descr *descr,
+npy_to_cfloat(PyArray_Descr *descr,
         const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
         parser_config *pconfig);
 
 NPY_NO_EXPORT int
-to_cdouble(PyArray_Descr *descr,
+npy_to_cdouble(PyArray_Descr *descr,
         const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
         parser_config *pconfig);
 
 NPY_NO_EXPORT int
-to_string(PyArray_Descr *descr,
+npy_to_string(PyArray_Descr *descr,
         const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
         parser_config *unused);
 
 NPY_NO_EXPORT int
-to_unicode(PyArray_Descr *descr,
+npy_to_unicode(PyArray_Descr *descr,
         const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
         parser_config *unused);
 
 NPY_NO_EXPORT int
-to_generic_with_converter(PyArray_Descr *descr,
+npy_to_generic_with_converter(PyArray_Descr *descr,
         const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
         parser_config *unused, PyObject *func);
 
 NPY_NO_EXPORT int
-to_generic(PyArray_Descr *descr,
+npy_to_generic(PyArray_Descr *descr,
         const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
         parser_config *pconfig);
 
diff --git a/numpy/core/src/multiarray/textreading/field_types.c b/numpy/core/src/multiarray/textreading/field_types.c
index 0722efd57..c3202bd2a 100644
--- a/numpy/core/src/multiarray/textreading/field_types.c
+++ b/numpy/core/src/multiarray/textreading/field_types.c
@@ -35,18 +35,18 @@ static set_from_ucs4_function *
 get_from_ucs4_function(PyArray_Descr *descr)
 {
     if (descr->type_num == NPY_BOOL) {
-        return &to_bool;
+        return &npy_to_bool;
     }
     else if (PyDataType_ISSIGNED(descr)) {
         switch (descr->elsize) {
             case 1:
-                return &to_int8;
+                return &npy_to_int8;
             case 2:
-                return &to_int16;
+                return &npy_to_int16;
             case 4:
-                return &to_int32;
+                return &npy_to_int32;
             case 8:
-                return &to_int64;
+                return &npy_to_int64;
             default:
                 assert(0);
         }
@@ -54,36 +54,36 @@ get_from_ucs4_function(PyArray_Descr *descr)
     else if (PyDataType_ISUNSIGNED(descr)) {
         switch (descr->elsize) {
             case 1:
-                return &to_uint8;
+                return &npy_to_uint8;
             case 2:
-                return &to_uint16;
+                return &npy_to_uint16;
             case 4:
-                return &to_uint32;
+                return &npy_to_uint32;
             case 8:
-                return &to_uint64;
+                return &npy_to_uint64;
             default:
                 assert(0);
         }
     }
     else if (descr->type_num == NPY_FLOAT) {
-        return &to_float;
+        return &npy_to_float;
     }
     else if (descr->type_num == NPY_DOUBLE) {
-        return &to_double;
+        return &npy_to_double;
     }
     else if (descr->type_num == NPY_CFLOAT) {
-        return &to_cfloat;
+        return &npy_to_cfloat;
     }
     else if (descr->type_num == NPY_CDOUBLE) {
-        return &to_cdouble;
+        return &npy_to_cdouble;
     }
     else if (descr->type_num == NPY_STRING) {
-        return &to_string;
+        return &npy_to_string;
     }
     else if (descr->type_num == NPY_UNICODE) {
-        return &to_unicode;
+        return &npy_to_unicode;
     }
-    return &to_generic;
+    return &npy_to_generic;
 }
 
 
diff --git a/numpy/core/src/multiarray/textreading/field_types.h b/numpy/core/src/multiarray/textreading/field_types.h
index d335a3665..078b740cf 100644
--- a/numpy/core/src/multiarray/textreading/field_types.h
+++ b/numpy/core/src/multiarray/textreading/field_types.h
@@ -2,6 +2,9 @@
 #ifndef NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_FIELD_TYPES_H_
 #define NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_FIELD_TYPES_H_
 
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+
 #include <stdint.h>
 #include <stdbool.h>
 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
diff --git a/numpy/core/src/multiarray/textreading/readtext.c b/numpy/core/src/multiarray/textreading/readtext.c
index a5db1cb77..e8defcc4d 100644
--- a/numpy/core/src/multiarray/textreading/readtext.c
+++ b/numpy/core/src/multiarray/textreading/readtext.c
@@ -1,9 +1,9 @@
-#include <stdio.h>
-#include <stdbool.h>
-
 #define PY_SSIZE_T_CLEAN
 #include <Python.h>
 
+#include <stdio.h>
+#include <stdbool.h>
+
 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
 #define _MULTIARRAYMODULE
 #include "numpy/arrayobject.h"
@@ -194,11 +194,11 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod),
 
     parser_config pc = {
         .delimiter = ',',
-        .comment = '#',
         .quote = '"',
-        .imaginary_unit = 'j',
-        .delimiter_is_whitespace = false,
+        .comment = '#',
         .ignore_leading_whitespace = false,
+        .delimiter_is_whitespace = false,
+        .imaginary_unit = 'j',
         .python_byte_converters = false,
         .c_byte_converters = false,
         .gave_int_via_float_warning = false,
diff --git a/numpy/core/src/multiarray/textreading/rows.c b/numpy/core/src/multiarray/textreading/rows.c
index ce68b8bee..b8a6943dc 100644
--- a/numpy/core/src/multiarray/textreading/rows.c
+++ b/numpy/core/src/multiarray/textreading/rows.c
@@ -181,7 +181,7 @@ read_rows(stream *s,
 
     int ts_result = 0;
     tokenizer_state ts;
-    if (tokenizer_init(&ts, pconfig) < 0) {
+    if (npy_tokenizer_init(&ts, pconfig) < 0) {
         goto error;
     }
 
@@ -198,7 +198,7 @@ read_rows(stream *s,
 
     for (Py_ssize_t i = 0; i < skiplines; i++) {
         ts.state = TOKENIZE_GOTO_LINE_END;
-        ts_result = tokenize(s, &ts, pconfig);
+        ts_result = npy_tokenize(s, &ts, pconfig);
         if (ts_result < 0) {
             goto error;
         }
@@ -210,7 +210,7 @@ read_rows(stream *s,
 
     Py_ssize_t row_count = 0;  /* number of rows actually processed */
     while ((max_rows < 0 || row_count < max_rows) && ts_result == 0) {
-        ts_result = tokenize(s, &ts, pconfig);
+        ts_result = npy_tokenize(s, &ts, pconfig);
         if (ts_result < 0) {
             goto error;
         }
@@ -318,10 +318,19 @@ read_rows(stream *s,
         }
 
         if (!usecols && (actual_num_fields != current_num_fields)) {
-            PyErr_Format(PyExc_ValueError,
+            if (homogeneous) {
+                PyErr_Format(PyExc_ValueError,
                     "the number of columns changed from %zd to %zd at row %zd; "
                     "use `usecols` to select a subset and avoid this error",
                     actual_num_fields, current_num_fields, row_count+1);
+            }
+            else {
+                PyErr_Format(PyExc_ValueError,
+                    "the dtype passed requires %zd columns but %zd were found "
+                    "at row %zd; "
+                    "use `usecols` to select a subset and avoid this error",
+                    actual_num_fields, current_num_fields, row_count+1);
+            }
             goto error;
         }
 
@@ -402,7 +411,7 @@ read_rows(stream *s,
                         str, end, item_ptr, pconfig);
             }
             else {
-                parser_res = to_generic_with_converter(field_types[f].descr,
+                parser_res = npy_to_generic_with_converter(field_types[f].descr,
                         str, end, item_ptr, pconfig, conv_funcs[i]);
             }
 
@@ -431,7 +440,7 @@ read_rows(stream *s,
         data_ptr += row_size;
     }
 
-    tokenizer_clear(&ts);
+    npy_tokenizer_clear(&ts);
     if (conv_funcs != NULL) {
         for (Py_ssize_t i = 0; i < actual_num_fields; i++) {
             Py_XDECREF(conv_funcs[i]);
@@ -485,7 +494,7 @@ read_rows(stream *s,
         }
         PyMem_FREE(conv_funcs);
     }
-    tokenizer_clear(&ts);
+    npy_tokenizer_clear(&ts);
     Py_XDECREF(data_array);
     return NULL;
 }
diff --git a/numpy/core/src/multiarray/textreading/str_to_int.c b/numpy/core/src/multiarray/textreading/str_to_int.c
index 0dd6c0b0e..40b7c67a9 100644
--- a/numpy/core/src/multiarray/textreading/str_to_int.c
+++ b/numpy/core/src/multiarray/textreading/str_to_int.c
@@ -23,7 +23,7 @@ const char *deprecation_msg = (
 
 #define DECLARE_TO_INT(intw, INT_MIN, INT_MAX, byteswap_unaligned)          \
     NPY_NO_EXPORT int                                                       \
-    to_##intw(PyArray_Descr *descr,                                         \
+    npy_to_##intw(PyArray_Descr *descr,                                     \
             const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,          \
             parser_config *pconfig)                                         \
     {                                                                       \
@@ -36,7 +36,7 @@ const char *deprecation_msg = (
             double fval;                                                    \
             PyArray_Descr *d_descr = PyArray_DescrFromType(NPY_DOUBLE);     \
             Py_DECREF(d_descr);  /* borrowed */                             \
-            if (to_double(d_descr, str, end, (char *)&fval, pconfig) < 0) { \
+            if (npy_to_double(d_descr, str, end, (char *)&fval, pconfig) < 0) { \
                 return -1;                                                  \
             }                                                               \
             if (!pconfig->gave_int_via_float_warning) {                     \
@@ -61,7 +61,7 @@ const char *deprecation_msg = (
 
 #define DECLARE_TO_UINT(uintw, UINT_MAX, byteswap_unaligned)                \
     NPY_NO_EXPORT int                                                       \
-    to_##uintw(PyArray_Descr *descr,                                        \
+    npy_to_##uintw(PyArray_Descr *descr,                                    \
             const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,          \
             parser_config *pconfig)                                         \
     {                                                                       \
@@ -74,7 +74,7 @@ const char *deprecation_msg = (
             double fval;                                                    \
             PyArray_Descr *d_descr = PyArray_DescrFromType(NPY_DOUBLE);     \
             Py_DECREF(d_descr);  /* borrowed */                             \
-            if (to_double(d_descr, str, end, (char *)&fval, pconfig) < 0) { \
+            if (npy_to_double(d_descr, str, end, (char *)&fval, pconfig) < 0) { \
                 return -1;                                                  \
             }                                                               \
             if (!pconfig->gave_int_via_float_warning) {                     \
diff --git a/numpy/core/src/multiarray/textreading/str_to_int.h b/numpy/core/src/multiarray/textreading/str_to_int.h
index a0a89a0ef..46b8f6679 100644
--- a/numpy/core/src/multiarray/textreading/str_to_int.h
+++ b/numpy/core/src/multiarray/textreading/str_to_int.h
@@ -157,7 +157,7 @@ str_to_uint64(
 
 #define DECLARE_TO_INT_PROTOTYPE(intw)                                  \
     NPY_NO_EXPORT int                                                   \
-    to_##intw(PyArray_Descr *descr,                                     \
+    npy_to_##intw(PyArray_Descr *descr,                                     \
             const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,      \
             parser_config *pconfig);
 
diff --git a/numpy/core/src/multiarray/textreading/stream_pyobject.c b/numpy/core/src/multiarray/textreading/stream_pyobject.c
index 6f84ff01d..3832ee84a 100644
--- a/numpy/core/src/multiarray/textreading/stream_pyobject.c
+++ b/numpy/core/src/multiarray/textreading/stream_pyobject.c
@@ -4,11 +4,12 @@
  * single line of a file.
  */
 
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+
 #include <stdio.h>
 #include <stdlib.h>
 
-#define PY_SSIZE_T_CLEAN
-#include <Python.h>
 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
 #define _MULTIARRAYMODULE
 #include "numpy/arrayobject.h"
@@ -41,7 +42,7 @@ typedef struct {
  *
  * NOTE: Steals a reference to `str` (although usually returns it unmodified).
  */
-static NPY_INLINE PyObject *
+static inline PyObject *
 process_stringlike(PyObject *str, const char *encoding)
 {
     if (PyBytes_Check(str)) {
@@ -63,7 +64,7 @@ process_stringlike(PyObject *str, const char *encoding)
 }
 
 
-static NPY_INLINE void
+static inline void
 buffer_info_from_unicode(PyObject *str, char **start, char **end, int *kind)
 {
     Py_ssize_t length = PyUnicode_GET_LENGTH(str);
diff --git a/numpy/core/src/multiarray/textreading/tokenize.cpp b/numpy/core/src/multiarray/textreading/tokenize.cpp
index d0d9cf844..e0ddc393d 100644
--- a/numpy/core/src/multiarray/textreading/tokenize.cpp
+++ b/numpy/core/src/multiarray/textreading/tokenize.cpp
@@ -45,7 +45,8 @@ copy_to_field_buffer(tokenizer_state *ts,
         const UCS *chunk_start, const UCS *chunk_end)
 {
     npy_intp chunk_length = chunk_end - chunk_start;
-    npy_intp size = chunk_length + ts->field_buffer_pos + 2;
+    /* Space for length +1 termination, +2 additional padding for add_field */
+    npy_intp size = chunk_length + ts->field_buffer_pos + 3;
 
     if (NPY_UNLIKELY(ts->field_buffer_length < size)) {
         npy_intp alloc_size = grow_size_and_multiply(&size, 32, sizeof(Py_UCS4));
@@ -104,6 +105,7 @@ add_field(tokenizer_state *ts)
     ts->num_fields += 1;
     /* Ensure this (currently empty) word is NUL terminated. */
     ts->field_buffer[ts->field_buffer_pos] = '\0';
+    assert(ts->field_buffer_length > ts->field_buffer_pos);
     return 0;
 }
 
@@ -221,7 +223,8 @@ tokenizer_core(tokenizer_state *ts, parser_config *const config)
             }
             else {
                 /* continue parsing as if unquoted */
-                ts->state = TOKENIZE_UNQUOTED;
+                /* Set to TOKENIZE_UNQUOTED or TOKENIZE_UNQUOTED_WHITESPACE */
+                ts->state = ts->unquoted_state;
             }
             break;
 
@@ -284,10 +287,10 @@ tokenizer_core(tokenizer_state *ts, parser_config *const config)
  * unicode flavors UCS1, UCS2, and UCS4 as a worthwhile optimization.
  */
 NPY_NO_EXPORT int
-tokenize(stream *s, tokenizer_state *ts, parser_config *const config)
+npy_tokenize(stream *s, tokenizer_state *ts, parser_config *const config)
 {
     assert(ts->fields_size >= 2);
-    assert(ts->field_buffer_length >= 2*sizeof(Py_UCS4));
+    assert(ts->field_buffer_length >= 2*(Py_ssize_t)sizeof(Py_UCS4));
 
     int finished_reading_file = 0;
 
@@ -402,7 +405,7 @@ tokenize(stream *s, tokenizer_state *ts, parser_config *const config)
 
 
 NPY_NO_EXPORT void
-tokenizer_clear(tokenizer_state *ts)
+npy_tokenizer_clear(tokenizer_state *ts)
 {
     PyMem_FREE(ts->field_buffer);
     ts->field_buffer = nullptr;
@@ -420,7 +423,7 @@ tokenizer_clear(tokenizer_state *ts)
  * tokenizing.
  */
 NPY_NO_EXPORT int
-tokenizer_init(tokenizer_state *ts, parser_config *config)
+npy_tokenizer_init(tokenizer_state *ts, parser_config *config)
 {
     /* State and buf_state could be moved into tokenize if we go by row */
     ts->buf_state = BUFFER_MAY_CONTAIN_NEWLINE;
@@ -446,6 +449,8 @@ tokenizer_init(tokenizer_state *ts, parser_config *config)
 
     ts->fields = (field_info *)PyMem_Malloc(4 * sizeof(*ts->fields));
     if (ts->fields == nullptr) {
+        PyMem_Free(ts->field_buffer);
+        ts->field_buffer = nullptr;
         PyErr_NoMemory();
         return -1;
     }
diff --git a/numpy/core/src/multiarray/textreading/tokenize.h b/numpy/core/src/multiarray/textreading/tokenize.h
index a78c6d936..53e97760f 100644
--- a/numpy/core/src/multiarray/textreading/tokenize.h
+++ b/numpy/core/src/multiarray/textreading/tokenize.h
@@ -46,8 +46,9 @@ typedef struct {
     char *pos;
     char *end;
     /*
-     * Space to copy words into.  The buffer must always be at least two NUL
-     * entries longer (8 bytes) than the actual word (including initially).
+     * Space to copy words into.  Due to `add_field` not growing the buffer
+     * but writing a \0 termination, the buffer must always be two larger
+     * (add_field can be called twice if a row ends in a delimiter: "123,").
      * The first byte beyond the current word is always NUL'ed on write, the
      * second byte is there to allow easy appending of an additional empty
      * word at the end (this word is also NUL terminated).
@@ -70,14 +71,14 @@ typedef struct {
 
 
 NPY_NO_EXPORT void
-tokenizer_clear(tokenizer_state *ts);
+npy_tokenizer_clear(tokenizer_state *ts);
 
 
 NPY_NO_EXPORT int
-tokenizer_init(tokenizer_state *ts, parser_config *config);
+npy_tokenizer_init(tokenizer_state *ts, parser_config *config);
 
 NPY_NO_EXPORT int
-tokenize(stream *s, tokenizer_state *ts, parser_config *const config);
+npy_tokenize(stream *s, tokenizer_state *ts, parser_config *const config);
 
 #ifdef __cplusplus
 }
diff --git a/numpy/core/src/multiarray/usertypes.c b/numpy/core/src/multiarray/usertypes.c
index a338d712d..5e36c914b 100644
--- a/numpy/core/src/multiarray/usertypes.c
+++ b/numpy/core/src/multiarray/usertypes.c
@@ -41,6 +41,7 @@ maintainer email:  oliphant.travis@ieee.org
 #include "scalartypes.h"
 #include "array_method.h"
 #include "convert_datatype.h"
+#include "dtype_traversal.h"
 #include "legacy_dtype_implementation.h"
 
 
@@ -223,6 +224,8 @@ PyArray_RegisterDataType(PyArray_Descr *descr)
         PyErr_SetString(PyExc_ValueError, "missing typeobject");
         return -1;
     }
+
+    int use_void_clearimpl = 0;
     if (descr->flags & (NPY_ITEM_IS_POINTER | NPY_ITEM_REFCOUNT)) {
         /*
          * User dtype can't actually do reference counting, however, there
@@ -231,6 +234,8 @@ PyArray_RegisterDataType(PyArray_Descr *descr)
          * so we have to support this. But such a structure must be constant
          * (i.e. fixed at registration time, this is the case for `xpress`).
          */
+        use_void_clearimpl = 1;
+
         if (descr->names == NULL || descr->fields == NULL ||
             !PyDict_CheckExact(descr->fields)) {
             PyErr_Format(PyExc_ValueError,
@@ -256,14 +261,53 @@ PyArray_RegisterDataType(PyArray_Descr *descr)
         return -1;
     }
 
+    /*
+     * Legacy user DTypes classes cannot have a name, since the user never
+     * defined one.  So we create a name for them here. These DTypes are
+     * effectively static types.
+     *
+     * Note: we have no intention of freeing the memory again since this
+     * behaves identically to static type definition.
+     */
+
+    const char *scalar_name = descr->typeobj->tp_name;
+    /*
+     * We have to take only the name, and ignore the module to get
+     * a reasonable __name__, since static types are limited in this regard
+     * (this is not ideal, but not a big issue in practice).
+     * This is what Python does to print __name__ for static types.
+     */
+    const char *dot = strrchr(scalar_name, '.');
+    if (dot) {
+        scalar_name = dot + 1;
+    }
+    Py_ssize_t name_length = strlen(scalar_name) + 14;
+
+    char *name = PyMem_Malloc(name_length);
+    if (name == NULL) {
+        PyErr_NoMemory();
+        return -1;
+    }
+
+    snprintf(name, name_length, "numpy.dtype[%s]", scalar_name);
+
     userdescrs[NPY_NUMUSERTYPES++] = descr;
 
     descr->type_num = typenum;
-    if (dtypemeta_wrap_legacy_descriptor(descr) < 0) {
+    if (dtypemeta_wrap_legacy_descriptor(descr, name, NULL) < 0) {
         descr->type_num = -1;
         NPY_NUMUSERTYPES--;
+        PyMem_Free(name);  /* free the name only on failure */
         return -1;
     }
+    if (use_void_clearimpl) {
+        /* See comment where use_void_clearimpl is set... */
+        NPY_DT_SLOTS(NPY_DTYPE(descr))->get_clear_loop = (
+                &npy_get_clear_void_and_legacy_user_dtype_loop);
+        /* Also use the void zerofill since there may be objects */
+        NPY_DT_SLOTS(NPY_DTYPE(descr))->get_clear_loop = (
+                &npy_get_zerofill_void_and_legacy_user_dtype_loop);
+    }
 
     return typenum;
 }
@@ -597,7 +641,7 @@ PyArray_AddLegacyWrapping_CastingImpl(
     if (from == to) {
         spec.flags = NPY_METH_REQUIRES_PYAPI | NPY_METH_SUPPORTS_UNALIGNED;
         PyType_Slot slots[] = {
-            {NPY_METH_get_loop, &legacy_cast_get_strided_loop},
+            {_NPY_METH_get_loop, &legacy_cast_get_strided_loop},
             {NPY_METH_resolve_descriptors, &legacy_same_dtype_resolve_descriptors},
             {0, NULL}};
         spec.slots = slots;
@@ -606,7 +650,7 @@ PyArray_AddLegacyWrapping_CastingImpl(
     else {
         spec.flags = NPY_METH_REQUIRES_PYAPI;
         PyType_Slot slots[] = {
-            {NPY_METH_get_loop, &legacy_cast_get_strided_loop},
+            {_NPY_METH_get_loop, &legacy_cast_get_strided_loop},
             {NPY_METH_resolve_descriptors, &simple_cast_resolve_descriptors},
             {0, NULL}};
         spec.slots = slots;
diff --git a/numpy/core/src/npymath/_signbit.c b/numpy/core/src/npymath/_signbit.c
deleted file mode 100644
index 12af55387..000000000
--- a/numpy/core/src/npymath/_signbit.c
+++ /dev/null
@@ -1,32 +0,0 @@
-/* Adapted from cephes */
-
-int
-_npy_signbit_d(double x)
-{
-    union
-    {
-        double d;
-        short s[4];
-        int i[2];
-    } u;
-
-    u.d = x;
-
-#if NPY_SIZEOF_INT == 4
-
-#ifdef WORDS_BIGENDIAN /* defined in pyconfig.h */
-    return u.i[0] < 0;
-#else
-    return u.i[1] < 0;
-#endif
-
-#else  /* NPY_SIZEOF_INT != 4 */
-
-#ifdef WORDS_BIGENDIAN
-    return u.s[0] < 0;
-#else
-    return u.s[3] < 0;
-#endif
-
-#endif  /* NPY_SIZEOF_INT */
-}
diff --git a/numpy/core/src/npymath/arm64_exports.c b/numpy/core/src/npymath/arm64_exports.c
new file mode 100644
index 000000000..dfa8054d7
--- /dev/null
+++ b/numpy/core/src/npymath/arm64_exports.c
@@ -0,0 +1,30 @@
+#if defined(__arm64__) && defined(__APPLE__)
+
+#include <math.h>
+/*
+ * Export these for scipy, since the SciPy build for macos arm64
+ * downloads the macos x86_64 NumPy, and does not error when the
+ * linker fails to use npymathlib.a. Importing numpy will expose
+ * these external functions
+ * See https://github.com/numpy/numpy/issues/22673#issuecomment-1327520055
+ *
+ * This file is actually compiled as part of the main module.
+ */
+
+double npy_asinh(double x) {
+    return asinh(x);
+}
+
+double npy_copysign(double y, double x) {
+    return copysign(y, x);
+}
+
+double npy_log1p(double x) {
+    return log1p(x);
+}
+
+double npy_nextafter(double x, double y) {
+    return nextafter(x, y);
+}
+
+#endif
diff --git a/numpy/core/src/npymath/halffloat.c b/numpy/core/src/npymath/halffloat.c
deleted file mode 100644
index 51948c736..000000000
--- a/numpy/core/src/npymath/halffloat.c
+++ /dev/null
@@ -1,555 +0,0 @@
-#define NPY_NO_DEPRECATED_API NPY_API_VERSION
-
-#include "numpy/halffloat.h"
-
-/*
- * This chooses between 'ties to even' and 'ties away from zero'.
- */
-#define NPY_HALF_ROUND_TIES_TO_EVEN 1
-/*
- * If these are 1, the conversions try to trigger underflow,
- * overflow, and invalid exceptions in the FP system when needed.
- */
-#define NPY_HALF_GENERATE_OVERFLOW 1
-#define NPY_HALF_GENERATE_UNDERFLOW 1
-#define NPY_HALF_GENERATE_INVALID 1
-
-/*
- ********************************************************************
- *                   HALF-PRECISION ROUTINES                        *
- ********************************************************************
- */
-
-float npy_half_to_float(npy_half h)
-{
-    union { float ret; npy_uint32 retbits; } conv;
-    conv.retbits = npy_halfbits_to_floatbits(h);
-    return conv.ret;
-}
-
-double npy_half_to_double(npy_half h)
-{
-    union { double ret; npy_uint64 retbits; } conv;
-    conv.retbits = npy_halfbits_to_doublebits(h);
-    return conv.ret;
-}
-
-npy_half npy_float_to_half(float f)
-{
-    union { float f; npy_uint32 fbits; } conv;
-    conv.f = f;
-    return npy_floatbits_to_halfbits(conv.fbits);
-}
-
-npy_half npy_double_to_half(double d)
-{
-    union { double d; npy_uint64 dbits; } conv;
-    conv.d = d;
-    return npy_doublebits_to_halfbits(conv.dbits);
-}
-
-int npy_half_iszero(npy_half h)
-{
-    return (h&0x7fff) == 0;
-}
-
-int npy_half_isnan(npy_half h)
-{
-    return ((h&0x7c00u) == 0x7c00u) && ((h&0x03ffu) != 0x0000u);
-}
-
-int npy_half_isinf(npy_half h)
-{
-    return ((h&0x7fffu) == 0x7c00u);
-}
-
-int npy_half_isfinite(npy_half h)
-{
-    return ((h&0x7c00u) != 0x7c00u);
-}
-
-int npy_half_signbit(npy_half h)
-{
-    return (h&0x8000u) != 0;
-}
-
-npy_half npy_half_spacing(npy_half h)
-{
-    npy_half ret;
-    npy_uint16 h_exp = h&0x7c00u;
-    npy_uint16 h_sig = h&0x03ffu;
-    if (h_exp == 0x7c00u) {
-#if NPY_HALF_GENERATE_INVALID
-        npy_set_floatstatus_invalid();
-#endif
-        ret = NPY_HALF_NAN;
-    } else if (h == 0x7bffu) {
-#if NPY_HALF_GENERATE_OVERFLOW
-        npy_set_floatstatus_overflow();
-#endif
-        ret = NPY_HALF_PINF;
-    } else if ((h&0x8000u) && h_sig == 0) { /* Negative boundary case */
-        if (h_exp > 0x2c00u) { /* If result is normalized */
-            ret = h_exp - 0x2c00u;
-        } else if(h_exp > 0x0400u) { /* The result is a subnormal, but not the smallest */
-            ret = 1 << ((h_exp >> 10) - 2);
-        } else {
-            ret = 0x0001u; /* Smallest subnormal half */
-        }
-    } else if (h_exp > 0x2800u) { /* If result is still normalized */
-        ret = h_exp - 0x2800u;
-    } else if (h_exp > 0x0400u) { /* The result is a subnormal, but not the smallest */
-        ret = 1 << ((h_exp >> 10) - 1);
-    } else {
-        ret = 0x0001u;
-    }
-
-    return ret;
-}
-
-npy_half npy_half_copysign(npy_half x, npy_half y)
-{
-    return (x&0x7fffu) | (y&0x8000u);
-}
-
-npy_half npy_half_nextafter(npy_half x, npy_half y)
-{
-    npy_half ret;
-
-    if (npy_half_isnan(x) || npy_half_isnan(y)) {
-        ret = NPY_HALF_NAN;
-    } else if (npy_half_eq_nonan(x, y)) {
-        ret = x;
-    } else if (npy_half_iszero(x)) {
-        ret = (y&0x8000u) + 1; /* Smallest subnormal half */
-    } else if (!(x&0x8000u)) { /* x > 0 */
-        if ((npy_int16)x > (npy_int16)y) { /* x > y */
-            ret = x-1;
-        } else {
-            ret = x+1;
-        }
-    } else {
-        if (!(y&0x8000u) || (x&0x7fffu) > (y&0x7fffu)) { /* x < y */
-            ret = x-1;
-        } else {
-            ret = x+1;
-        }
-    }
-#if NPY_HALF_GENERATE_OVERFLOW
-    if (npy_half_isinf(ret) && npy_half_isfinite(x)) {
-        npy_set_floatstatus_overflow();
-    }
-#endif
-
-    return ret;
-}
-
-int npy_half_eq_nonan(npy_half h1, npy_half h2)
-{
-    return (h1 == h2 || ((h1 | h2) & 0x7fff) == 0);
-}
-
-int npy_half_eq(npy_half h1, npy_half h2)
-{
-    /*
-     * The equality cases are as follows:
-     *   - If either value is NaN, never equal.
-     *   - If the values are equal, equal.
-     *   - If the values are both signed zeros, equal.
-     */
-    return (!npy_half_isnan(h1) && !npy_half_isnan(h2)) &&
-           (h1 == h2 || ((h1 | h2) & 0x7fff) == 0);
-}
-
-int npy_half_ne(npy_half h1, npy_half h2)
-{
-    return !npy_half_eq(h1, h2);
-}
-
-int npy_half_lt_nonan(npy_half h1, npy_half h2)
-{
-    if (h1&0x8000u) {
-        if (h2&0x8000u) {
-            return (h1&0x7fffu) > (h2&0x7fffu);
-        } else {
-            /* Signed zeros are equal, have to check for it */
-            return (h1 != 0x8000u) || (h2 != 0x0000u);
-        }
-    } else {
-        if (h2&0x8000u) {
-            return 0;
-        } else {
-            return (h1&0x7fffu) < (h2&0x7fffu);
-        }
-    }
-}
-
-int npy_half_lt(npy_half h1, npy_half h2)
-{
-    return (!npy_half_isnan(h1) && !npy_half_isnan(h2)) && npy_half_lt_nonan(h1, h2);
-}
-
-int npy_half_gt(npy_half h1, npy_half h2)
-{
-    return npy_half_lt(h2, h1);
-}
-
-int npy_half_le_nonan(npy_half h1, npy_half h2)
-{
-    if (h1&0x8000u) {
-        if (h2&0x8000u) {
-            return (h1&0x7fffu) >= (h2&0x7fffu);
-        } else {
-            return 1;
-        }
-    } else {
-        if (h2&0x8000u) {
-            /* Signed zeros are equal, have to check for it */
-            return (h1 == 0x0000u) && (h2 == 0x8000u);
-        } else {
-            return (h1&0x7fffu) <= (h2&0x7fffu);
-        }
-    }
-}
-
-int npy_half_le(npy_half h1, npy_half h2)
-{
-    return (!npy_half_isnan(h1) && !npy_half_isnan(h2)) && npy_half_le_nonan(h1, h2);
-}
-
-int npy_half_ge(npy_half h1, npy_half h2)
-{
-    return npy_half_le(h2, h1);
-}
-
-npy_half npy_half_divmod(npy_half h1, npy_half h2, npy_half *modulus)
-{
-    float fh1 = npy_half_to_float(h1);
-    float fh2 = npy_half_to_float(h2);
-    float div, mod;
-
-    div = npy_divmodf(fh1, fh2, &mod);
-    *modulus = npy_float_to_half(mod);
-    return npy_float_to_half(div);
-}
-
-
-
-/*
- ********************************************************************
- *                     BIT-LEVEL CONVERSIONS                        *
- ********************************************************************
- */
-
-npy_uint16 npy_floatbits_to_halfbits(npy_uint32 f)
-{
-    npy_uint32 f_exp, f_sig;
-    npy_uint16 h_sgn, h_exp, h_sig;
-
-    h_sgn = (npy_uint16) ((f&0x80000000u) >> 16);
-    f_exp = (f&0x7f800000u);
-
-    /* Exponent overflow/NaN converts to signed inf/NaN */
-    if (f_exp >= 0x47800000u) {
-        if (f_exp == 0x7f800000u) {
-            /* Inf or NaN */
-            f_sig = (f&0x007fffffu);
-            if (f_sig != 0) {
-                /* NaN - propagate the flag in the significand... */
-                npy_uint16 ret = (npy_uint16) (0x7c00u + (f_sig >> 13));
-                /* ...but make sure it stays a NaN */
-                if (ret == 0x7c00u) {
-                    ret++;
-                }
-                return h_sgn + ret;
-            } else {
-                /* signed inf */
-                return (npy_uint16) (h_sgn + 0x7c00u);
-            }
-        } else {
-            /* overflow to signed inf */
-#if NPY_HALF_GENERATE_OVERFLOW
-            npy_set_floatstatus_overflow();
-#endif
-            return (npy_uint16) (h_sgn + 0x7c00u);
-        }
-    }
-
-    /* Exponent underflow converts to a subnormal half or signed zero */
-    if (f_exp <= 0x38000000u) {
-        /*
-         * Signed zeros, subnormal floats, and floats with small
-         * exponents all convert to signed zero half-floats.
-         */
-        if (f_exp < 0x33000000u) {
-#if NPY_HALF_GENERATE_UNDERFLOW
-            /* If f != 0, it underflowed to 0 */
-            if ((f&0x7fffffff) != 0) {
-                npy_set_floatstatus_underflow();
-            }
-#endif
-            return h_sgn;
-        }
-        /* Make the subnormal significand */
-        f_exp >>= 23;
-        f_sig = (0x00800000u + (f&0x007fffffu));
-#if NPY_HALF_GENERATE_UNDERFLOW
-        /* If it's not exactly represented, it underflowed */
-        if ((f_sig&(((npy_uint32)1 << (126 - f_exp)) - 1)) != 0) {
-            npy_set_floatstatus_underflow();
-        }
-#endif
-        /*
-         * Usually the significand is shifted by 13. For subnormals an
-         * additional shift needs to occur. This shift is one for the largest
-         * exponent giving a subnormal `f_exp = 0x38000000 >> 23 = 112`, which
-         * offsets the new first bit. At most the shift can be 1+10 bits.
-         */
-        f_sig >>= (113 - f_exp);
-        /* Handle rounding by adding 1 to the bit beyond half precision */
-#if NPY_HALF_ROUND_TIES_TO_EVEN
-        /*
-         * If the last bit in the half significand is 0 (already even), and
-         * the remaining bit pattern is 1000...0, then we do not add one
-         * to the bit after the half significand. However, the (113 - f_exp)
-         * shift can lose up to 11 bits, so the || checks them in the original.
-         * In all other cases, we can just add one.
-         */
-        if (((f_sig&0x00003fffu) != 0x00001000u) || (f&0x000007ffu)) {
-            f_sig += 0x00001000u;
-        }
-#else
-        f_sig += 0x00001000u;
-#endif
-        h_sig = (npy_uint16) (f_sig >> 13);
-        /*
-         * If the rounding causes a bit to spill into h_exp, it will
-         * increment h_exp from zero to one and h_sig will be zero.
-         * This is the correct result.
-         */
-        return (npy_uint16) (h_sgn + h_sig);
-    }
-
-    /* Regular case with no overflow or underflow */
-    h_exp = (npy_uint16) ((f_exp - 0x38000000u) >> 13);
-    /* Handle rounding by adding 1 to the bit beyond half precision */
-    f_sig = (f&0x007fffffu);
-#if NPY_HALF_ROUND_TIES_TO_EVEN
-    /*
-     * If the last bit in the half significand is 0 (already even), and
-     * the remaining bit pattern is 1000...0, then we do not add one
-     * to the bit after the half significand.  In all other cases, we do.
-     */
-    if ((f_sig&0x00003fffu) != 0x00001000u) {
-        f_sig += 0x00001000u;
-    }
-#else
-    f_sig += 0x00001000u;
-#endif
-    h_sig = (npy_uint16) (f_sig >> 13);
-    /*
-     * If the rounding causes a bit to spill into h_exp, it will
-     * increment h_exp by one and h_sig will be zero.  This is the
-     * correct result.  h_exp may increment to 15, at greatest, in
-     * which case the result overflows to a signed inf.
-     */
-#if NPY_HALF_GENERATE_OVERFLOW
-    h_sig += h_exp;
-    if (h_sig == 0x7c00u) {
-        npy_set_floatstatus_overflow();
-    }
-    return h_sgn + h_sig;
-#else
-    return h_sgn + h_exp + h_sig;
-#endif
-}
-
-npy_uint16 npy_doublebits_to_halfbits(npy_uint64 d)
-{
-    npy_uint64 d_exp, d_sig;
-    npy_uint16 h_sgn, h_exp, h_sig;
-
-    h_sgn = (d&0x8000000000000000ULL) >> 48;
-    d_exp = (d&0x7ff0000000000000ULL);
-
-    /* Exponent overflow/NaN converts to signed inf/NaN */
-    if (d_exp >= 0x40f0000000000000ULL) {
-        if (d_exp == 0x7ff0000000000000ULL) {
-            /* Inf or NaN */
-            d_sig = (d&0x000fffffffffffffULL);
-            if (d_sig != 0) {
-                /* NaN - propagate the flag in the significand... */
-                npy_uint16 ret = (npy_uint16) (0x7c00u + (d_sig >> 42));
-                /* ...but make sure it stays a NaN */
-                if (ret == 0x7c00u) {
-                    ret++;
-                }
-                return h_sgn + ret;
-            } else {
-                /* signed inf */
-                return h_sgn + 0x7c00u;
-            }
-        } else {
-            /* overflow to signed inf */
-#if NPY_HALF_GENERATE_OVERFLOW
-            npy_set_floatstatus_overflow();
-#endif
-            return h_sgn + 0x7c00u;
-        }
-    }
-
-    /* Exponent underflow converts to subnormal half or signed zero */
-    if (d_exp <= 0x3f00000000000000ULL) {
-        /*
-         * Signed zeros, subnormal floats, and floats with small
-         * exponents all convert to signed zero half-floats.
-         */
-        if (d_exp < 0x3e60000000000000ULL) {
-#if NPY_HALF_GENERATE_UNDERFLOW
-            /* If d != 0, it underflowed to 0 */
-            if ((d&0x7fffffffffffffffULL) != 0) {
-                npy_set_floatstatus_underflow();
-            }
-#endif
-            return h_sgn;
-        }
-        /* Make the subnormal significand */
-        d_exp >>= 52;
-        d_sig = (0x0010000000000000ULL + (d&0x000fffffffffffffULL));
-#if NPY_HALF_GENERATE_UNDERFLOW
-        /* If it's not exactly represented, it underflowed */
-        if ((d_sig&(((npy_uint64)1 << (1051 - d_exp)) - 1)) != 0) {
-            npy_set_floatstatus_underflow();
-        }
-#endif
-        /*
-         * Unlike floats, doubles have enough room to shift left to align
-         * the subnormal significand leading to no loss of the last bits.
-         * The smallest possible exponent giving a subnormal is:
-         * `d_exp = 0x3e60000000000000 >> 52 = 998`. All larger subnormals are
-         * shifted with respect to it. This adds a shift of 10+1 bits the final
-         * right shift when comparing it to the one in the normal branch.
-         */
-        assert(d_exp - 998 >= 0);
-        d_sig <<= (d_exp - 998);
-        /* Handle rounding by adding 1 to the bit beyond half precision */
-#if NPY_HALF_ROUND_TIES_TO_EVEN
-        /*
-         * If the last bit in the half significand is 0 (already even), and
-         * the remaining bit pattern is 1000...0, then we do not add one
-         * to the bit after the half significand.  In all other cases, we do.
-         */
-        if ((d_sig&0x003fffffffffffffULL) != 0x0010000000000000ULL) {
-            d_sig += 0x0010000000000000ULL;
-        }
-#else
-        d_sig += 0x0010000000000000ULL;
-#endif
-        h_sig = (npy_uint16) (d_sig >> 53);
-        /*
-         * If the rounding causes a bit to spill into h_exp, it will
-         * increment h_exp from zero to one and h_sig will be zero.
-         * This is the correct result.
-         */
-        return h_sgn + h_sig;
-    }
-
-    /* Regular case with no overflow or underflow */
-    h_exp = (npy_uint16) ((d_exp - 0x3f00000000000000ULL) >> 42);
-    /* Handle rounding by adding 1 to the bit beyond half precision */
-    d_sig = (d&0x000fffffffffffffULL);
-#if NPY_HALF_ROUND_TIES_TO_EVEN
-    /*
-     * If the last bit in the half significand is 0 (already even), and
-     * the remaining bit pattern is 1000...0, then we do not add one
-     * to the bit after the half significand.  In all other cases, we do.
-     */
-    if ((d_sig&0x000007ffffffffffULL) != 0x0000020000000000ULL) {
-        d_sig += 0x0000020000000000ULL;
-    }
-#else
-    d_sig += 0x0000020000000000ULL;
-#endif
-    h_sig = (npy_uint16) (d_sig >> 42);
-
-    /*
-     * If the rounding causes a bit to spill into h_exp, it will
-     * increment h_exp by one and h_sig will be zero.  This is the
-     * correct result.  h_exp may increment to 15, at greatest, in
-     * which case the result overflows to a signed inf.
-     */
-#if NPY_HALF_GENERATE_OVERFLOW
-    h_sig += h_exp;
-    if (h_sig == 0x7c00u) {
-        npy_set_floatstatus_overflow();
-    }
-    return h_sgn + h_sig;
-#else
-    return h_sgn + h_exp + h_sig;
-#endif
-}
-
-npy_uint32 npy_halfbits_to_floatbits(npy_uint16 h)
-{
-    npy_uint16 h_exp, h_sig;
-    npy_uint32 f_sgn, f_exp, f_sig;
-
-    h_exp = (h&0x7c00u);
-    f_sgn = ((npy_uint32)h&0x8000u) << 16;
-    switch (h_exp) {
-        case 0x0000u: /* 0 or subnormal */
-            h_sig = (h&0x03ffu);
-            /* Signed zero */
-            if (h_sig == 0) {
-                return f_sgn;
-            }
-            /* Subnormal */
-            h_sig <<= 1;
-            while ((h_sig&0x0400u) == 0) {
-                h_sig <<= 1;
-                h_exp++;
-            }
-            f_exp = ((npy_uint32)(127 - 15 - h_exp)) << 23;
-            f_sig = ((npy_uint32)(h_sig&0x03ffu)) << 13;
-            return f_sgn + f_exp + f_sig;
-        case 0x7c00u: /* inf or NaN */
-            /* All-ones exponent and a copy of the significand */
-            return f_sgn + 0x7f800000u + (((npy_uint32)(h&0x03ffu)) << 13);
-        default: /* normalized */
-            /* Just need to adjust the exponent and shift */
-            return f_sgn + (((npy_uint32)(h&0x7fffu) + 0x1c000u) << 13);
-    }
-}
-
-npy_uint64 npy_halfbits_to_doublebits(npy_uint16 h)
-{
-    npy_uint16 h_exp, h_sig;
-    npy_uint64 d_sgn, d_exp, d_sig;
-
-    h_exp = (h&0x7c00u);
-    d_sgn = ((npy_uint64)h&0x8000u) << 48;
-    switch (h_exp) {
-        case 0x0000u: /* 0 or subnormal */
-            h_sig = (h&0x03ffu);
-            /* Signed zero */
-            if (h_sig == 0) {
-                return d_sgn;
-            }
-            /* Subnormal */
-            h_sig <<= 1;
-            while ((h_sig&0x0400u) == 0) {
-                h_sig <<= 1;
-                h_exp++;
-            }
-            d_exp = ((npy_uint64)(1023 - 15 - h_exp)) << 52;
-            d_sig = ((npy_uint64)(h_sig&0x03ffu)) << 42;
-            return d_sgn + d_exp + d_sig;
-        case 0x7c00u: /* inf or NaN */
-            /* All-ones exponent and a copy of the significand */
-            return d_sgn + 0x7ff0000000000000ULL +
-                                (((npy_uint64)(h&0x03ffu)) << 42);
-        default: /* normalized */
-            /* Just need to adjust the exponent and shift */
-            return d_sgn + (((npy_uint64)(h&0x7fffu) + 0xfc000u) << 42);
-    }
-}
diff --git a/numpy/core/src/npymath/halffloat.cpp b/numpy/core/src/npymath/halffloat.cpp
new file mode 100644
index 000000000..aa582c1b9
--- /dev/null
+++ b/numpy/core/src/npymath/halffloat.cpp
@@ -0,0 +1,238 @@
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+/*
+ * If these are 1, the conversions try to trigger underflow,
+ * overflow, and invalid exceptions in the FP system when needed.
+ */
+#define NPY_HALF_GENERATE_OVERFLOW 1
+#define NPY_HALF_GENERATE_INVALID 1
+
+#include "numpy/halffloat.h"
+
+#include "common.hpp"
+/*
+ ********************************************************************
+ *                   HALF-PRECISION ROUTINES                        *
+ ********************************************************************
+ */
+using namespace np;
+
+float npy_half_to_float(npy_half h)
+{
+    return static_cast<float>(Half::FromBits(h));
+}
+
+double npy_half_to_double(npy_half h)
+{
+    return static_cast<double>(Half::FromBits(h));
+}
+
+npy_half npy_float_to_half(float f)
+{
+    return Half(f).Bits();
+}
+
+npy_half npy_double_to_half(double d)
+{
+    return Half(d).Bits();
+}
+
+int npy_half_iszero(npy_half h)
+{
+    return (h&0x7fff) == 0;
+}
+
+int npy_half_isnan(npy_half h)
+{
+    return Half::FromBits(h).IsNaN();
+}
+
+int npy_half_isinf(npy_half h)
+{
+    return ((h&0x7fffu) == 0x7c00u);
+}
+
+int npy_half_isfinite(npy_half h)
+{
+    return ((h&0x7c00u) != 0x7c00u);
+}
+
+int npy_half_signbit(npy_half h)
+{
+    return (h&0x8000u) != 0;
+}
+
+npy_half npy_half_spacing(npy_half h)
+{
+    npy_half ret;
+    npy_uint16 h_exp = h&0x7c00u;
+    npy_uint16 h_sig = h&0x03ffu;
+    if (h_exp == 0x7c00u) {
+#if NPY_HALF_GENERATE_INVALID
+        npy_set_floatstatus_invalid();
+#endif
+        ret = NPY_HALF_NAN;
+    } else if (h == 0x7bffu) {
+#if NPY_HALF_GENERATE_OVERFLOW
+        npy_set_floatstatus_overflow();
+#endif
+        ret = NPY_HALF_PINF;
+    } else if ((h&0x8000u) && h_sig == 0) { /* Negative boundary case */
+        if (h_exp > 0x2c00u) { /* If result is normalized */
+            ret = h_exp - 0x2c00u;
+        } else if(h_exp > 0x0400u) { /* The result is a subnormal, but not the smallest */
+            ret = 1 << ((h_exp >> 10) - 2);
+        } else {
+            ret = 0x0001u; /* Smallest subnormal half */
+        }
+    } else if (h_exp > 0x2800u) { /* If result is still normalized */
+        ret = h_exp - 0x2800u;
+    } else if (h_exp > 0x0400u) { /* The result is a subnormal, but not the smallest */
+        ret = 1 << ((h_exp >> 10) - 1);
+    } else {
+        ret = 0x0001u;
+    }
+
+    return ret;
+}
+
+npy_half npy_half_copysign(npy_half x, npy_half y)
+{
+    return (x&0x7fffu) | (y&0x8000u);
+}
+
+npy_half npy_half_nextafter(npy_half x, npy_half y)
+{
+    npy_half ret;
+
+    if (npy_half_isnan(x) || npy_half_isnan(y)) {
+        ret = NPY_HALF_NAN;
+    } else if (npy_half_eq_nonan(x, y)) {
+        ret = x;
+    } else if (npy_half_iszero(x)) {
+        ret = (y&0x8000u) + 1; /* Smallest subnormal half */
+    } else if (!(x&0x8000u)) { /* x > 0 */
+        if ((npy_int16)x > (npy_int16)y) { /* x > y */
+            ret = x-1;
+        } else {
+            ret = x+1;
+        }
+    } else {
+        if (!(y&0x8000u) || (x&0x7fffu) > (y&0x7fffu)) { /* x < y */
+            ret = x-1;
+        } else {
+            ret = x+1;
+        }
+    }
+#if NPY_HALF_GENERATE_OVERFLOW
+    if (npy_half_isinf(ret) && npy_half_isfinite(x)) {
+        npy_set_floatstatus_overflow();
+    }
+#endif
+
+    return ret;
+}
+
+int npy_half_eq_nonan(npy_half h1, npy_half h2)
+{
+    return Half::FromBits(h1).Equal(Half::FromBits(h2));
+}
+
+int npy_half_eq(npy_half h1, npy_half h2)
+{
+    return Half::FromBits(h1) == Half::FromBits(h2);
+}
+
+int npy_half_ne(npy_half h1, npy_half h2)
+{
+    return Half::FromBits(h1) != Half::FromBits(h2);
+}
+
+int npy_half_lt_nonan(npy_half h1, npy_half h2)
+{
+    return Half::FromBits(h1).Less(Half::FromBits(h2));
+}
+
+int npy_half_lt(npy_half h1, npy_half h2)
+{
+    return Half::FromBits(h1) < Half::FromBits(h2);
+}
+
+int npy_half_gt(npy_half h1, npy_half h2)
+{
+    return npy_half_lt(h2, h1);
+}
+
+int npy_half_le_nonan(npy_half h1, npy_half h2)
+{
+    return Half::FromBits(h1).LessEqual(Half::FromBits(h2));
+}
+
+int npy_half_le(npy_half h1, npy_half h2)
+{
+    return Half::FromBits(h1) <= Half::FromBits(h2);
+}
+
+int npy_half_ge(npy_half h1, npy_half h2)
+{
+    return npy_half_le(h2, h1);
+}
+
+npy_half npy_half_divmod(npy_half h1, npy_half h2, npy_half *modulus)
+{
+    float fh1 = npy_half_to_float(h1);
+    float fh2 = npy_half_to_float(h2);
+    float div, mod;
+
+    div = npy_divmodf(fh1, fh2, &mod);
+    *modulus = npy_float_to_half(mod);
+    return npy_float_to_half(div);
+}
+
+
+/*
+ ********************************************************************
+ *                     BIT-LEVEL CONVERSIONS                        *
+ ********************************************************************
+ */
+
+npy_uint16 npy_floatbits_to_halfbits(npy_uint32 f)
+{
+    if constexpr (Half::kNativeConversion<float>) {
+        return BitCast<uint16_t>(Half(BitCast<float>(f)));
+    }
+    else {
+        return half_private::FromFloatBits(f);
+    }
+}
+
+npy_uint16 npy_doublebits_to_halfbits(npy_uint64 d)
+{
+    if constexpr (Half::kNativeConversion<double>) {
+        return BitCast<uint16_t>(Half(BitCast<double>(d)));
+    }
+    else {
+        return half_private::FromDoubleBits(d);
+    }
+}
+
+npy_uint32 npy_halfbits_to_floatbits(npy_uint16 h)
+{
+    if constexpr (Half::kNativeConversion<float>) {
+        return BitCast<uint32_t>(static_cast<float>(Half::FromBits(h)));
+    }
+    else {
+        return half_private::ToFloatBits(h);
+    }
+}
+
+npy_uint64 npy_halfbits_to_doublebits(npy_uint16 h)
+{
+    if constexpr (Half::kNativeConversion<double>) {
+        return BitCast<uint64_t>(static_cast<double>(Half::FromBits(h)));
+    }
+    else {
+        return half_private::ToDoubleBits(h);
+    }
+}
+
diff --git a/numpy/core/src/npymath/ieee754.c.src b/numpy/core/src/npymath/ieee754.c.src
index 5d8cb06a4..8fccc9a69 100644
--- a/numpy/core/src/npymath/ieee754.c.src
+++ b/numpy/core/src/npymath/ieee754.c.src
@@ -15,20 +15,6 @@
 #define LDBL_TRUE_MIN __LDBL_DENORM_MIN__
 #endif
 
-#if !defined(HAVE_DECL_SIGNBIT)
-#include "_signbit.c"
-
-int _npy_signbit_f(float x)
-{
-    return _npy_signbit_d((double) x);
-}
-
-int _npy_signbit_ld(long double x)
-{
-    return _npy_signbit_d((double) x);
-}
-#endif
-
 /*
  * FIXME: There is a lot of redundancy between _next* and npy_nextafter*.
  * refactor this at some point
@@ -326,25 +312,6 @@ static npy_longdouble _nextl(npy_longdouble x, int p)
 }
 /**end repeat**/
 
-/*
- * Decorate all the math functions which are available on the current platform
- */
-
-float npy_nextafterf(float x, float y)
-{
-    return nextafterf(x, y);
-}
-
-double npy_nextafter(double x, double y)
-{
-    return nextafter(x, y);
-}
-
-npy_longdouble npy_nextafterl(npy_longdouble x, npy_longdouble y)
-{
-    return nextafterl(x, y);
-}
-
 int npy_clear_floatstatus() {
     char x=0;
     return npy_clear_floatstatus_barrier(&x);
diff --git a/numpy/core/src/npymath/ieee754.cpp b/numpy/core/src/npymath/ieee754.cpp
index ebc1dbeba..1c59bf300 100644
--- a/numpy/core/src/npymath/ieee754.cpp
+++ b/numpy/core/src/npymath/ieee754.cpp
@@ -17,21 +17,6 @@
 #define LDBL_TRUE_MIN __LDBL_DENORM_MIN__
 #endif
 
-#if !defined(HAVE_DECL_SIGNBIT)
-#include "_signbit.c"
-
-int
-_npy_signbit_f(float x)
-{
-    return _npy_signbit_d((double)x);
-}
-
-int
-_npy_signbit_ld(long double x)
-{
-    return _npy_signbit_d((double)x);
-}
-#endif
 
 /*
  * FIXME: There is a lot of redundancy between _next* and npy_nextafter*.
@@ -388,28 +373,6 @@ npy_spacingl(npy_longdouble x)
 }
 }
 
-/*
- * Decorate all the math functions which are available on the current platform
- */
-
-extern "C" float
-npy_nextafterf(float x, float y)
-{
-    return nextafterf(x, y);
-}
-
-extern "C" double
-npy_nextafter(double x, double y)
-{
-    return nextafter(x, y);
-}
-
-extern "C" npy_longdouble
-npy_nextafterl(npy_longdouble x, npy_longdouble y)
-{
-    return nextafterl(x, y);
-}
-
 extern "C" int
 npy_clear_floatstatus()
 {
diff --git a/numpy/core/src/npymath/npy_math_complex.c.src b/numpy/core/src/npymath/npy_math_complex.c.src
index 696495dab..eb9d810b7 100644
--- a/numpy/core/src/npymath/npy_math_complex.c.src
+++ b/numpy/core/src/npymath/npy_math_complex.c.src
@@ -72,7 +72,7 @@ static const @ctype@ c_1@c@ = {1.0@C@, 0.0};
  * These are necessary because we do not count on using a
  * C99 compiler.
  *=========================================================*/
-static NPY_INLINE
+static inline
 @ctype@
 cmul@c@(@ctype@ a, @ctype@ b)
 {
@@ -84,7 +84,7 @@ cmul@c@(@ctype@ a, @ctype@ b)
     return npy_cpack@c@(ar*br - ai*bi, ar*bi + ai*br);
 }
 
-static NPY_INLINE
+static inline
 @ctype@
 cdiv@c@(@ctype@ a, @ctype@ b)
 {
@@ -443,29 +443,43 @@ npy_cpow@c@ (@ctype@ a, @ctype@ b)
     @type@ bi = npy_cimag@c@(b);
     @ctype@ r;
 
+    /*
+     * Checking if in a^b, if b is zero.
+     * If a is not zero then by definition of logarithm a^0 is 1.
+     * If a is also zero then 0^0 is best defined as 1.
+     */
     if (br == 0. && bi == 0.) {
         return npy_cpack@c@(1., 0.);
     }
-    if (ar == 0. && ai == 0.) {
-        if (br > 0 && bi == 0) {
-            return npy_cpack@c@(0., 0.);
-        }
-        else {
-            volatile @type@ tmp = NPY_INFINITY@C@;
-            /*
-             * NB: there are four complex zeros; c0 = (+-0, +-0), so that
-             * unlike for reals, c0**p, with `p` negative is in general
-             * ill-defined.
-             *
-             *     c0**z with z complex is also ill-defined.
-             */
-            r = npy_cpack@c@(NPY_NAN@C@, NPY_NAN@C@);
-
-            /* Raise invalid */
-            tmp -= NPY_INFINITY@C@;
-            ar = tmp;
-            return r;
-        }
+    /* case 0^b
+     * If a is a complex zero (ai=ar=0), then the result depends 
+     * upon values of br and bi. The result is either:
+     * 0 (in magnitude), undefined or 1.
+     * The later case is for br=bi=0 and independent of ar and ai
+     * but is handled above).
+     */
+    else if (ar == 0. && ai == 0.) {
+        /* 
+         * If the real part of b is positive (br>0) then this is
+         * the zero complex with positive sign on both the
+         * real and imaginary part.
+         */
+         if (br > 0) {
+             return npy_cpack@c@(0., 0.);
+         }
+        /* else we are in the case where the
+         * real part of b is negative (br<0).
+         * Here we should return a complex nan
+         * and raise FloatingPointError: invalid value...
+         */
+         
+         /* Raise invalid value by calling inf - inf*/
+          volatile @type@ tmp = NPY_INFINITY@C@;
+          tmp -= NPY_INFINITY@C@;
+          ar = tmp;
+          
+          r = npy_cpack@c@(NPY_NAN@C@, NPY_NAN@C@);
+          return r;
     }
     if (bi == 0 && (n=(npy_intp)br) == br) {
         if (n == 1) {
@@ -521,6 +535,443 @@ npy_cpow@c@ (@ctype@ a, @ctype@ b)
 #endif
 }
 
+
+#ifndef HAVE_CCOS@C@
+@ctype@
+npy_ccos@c@(@ctype@ z)
+{
+    /* ccos(z) = ccosh(I * z) */
+    return npy_ccosh@c@(npy_cpack@c@(-npy_cimag@c@(z), npy_creal@c@(z)));
+}
+#endif
+
+#ifndef HAVE_CSIN@C@
+@ctype@
+npy_csin@c@(@ctype@ z)
+{
+    /* csin(z) = -I * csinh(I * z) */
+    z = npy_csinh@c@(npy_cpack@c@(-npy_cimag@c@(z), npy_creal@c@(z)));
+    return npy_cpack@c@(npy_cimag@c@(z), -npy_creal@c@(z));
+}
+#endif
+
+#ifndef HAVE_CTAN@C@
+@ctype@
+npy_ctan@c@(@ctype@ z)
+{
+    /* ctan(z) = -I * ctanh(I * z) */
+    z = npy_ctanh@c@(npy_cpack@c@(-npy_cimag@c@(z), npy_creal@c@(z)));
+    return (npy_cpack@c@(npy_cimag@c@(z), -npy_creal@c@(z)));
+}
+#endif
+
+#ifndef HAVE_CCOSH@C@
+/*
+ * Taken from the msun library in FreeBSD, rev 226599.
+ *
+ * Hyperbolic cosine of a complex argument z = x + i y.
+ *
+ * cosh(z) = cosh(x+iy)
+ *         = cosh(x) cos(y) + i sinh(x) sin(y).
+ *
+ * Exceptional values are noted in the comments within the source code.
+ * These values and the return value were taken from n1124.pdf.
+ *
+ * CCOSH_BIG is chosen such that
+ * spacing(0.5 * exp(CCOSH_BIG)) > 0.5*exp(-CCOSH_BIG)
+ * although the exact value assigned to CCOSH_BIG is not so important
+ */
+@ctype@
+npy_ccosh@c@(@ctype@ z)
+{
+#if @precision@ == 1
+    const npy_float CCOSH_BIG = 9.0f;
+    const npy_float CCOSH_HUGE = 1.70141183e+38f;
+#endif
+#if @precision@ == 2
+    const npy_double CCOSH_BIG = 22.0;
+    const npy_double CCOSH_HUGE = 8.9884656743115795e+307;
+#endif
+#if @precision@ >= 3
+#if NPY_SIZEOF_LONGDOUBLE == NPY_SIZEOF_DOUBLE
+    const npy_longdouble CCOSH_BIG = 22.0L;
+    const npy_longdouble CCOSH_HUGE = 8.9884656743115795e+307L;
+#else
+    const npy_longdouble CCOSH_BIG = 24.0L;
+    const npy_longdouble CCOSH_HUGE = 5.94865747678615882543e+4931L;
+#endif
+#endif
+
+    @type@  x, y, h, absx;
+    npy_int xfinite, yfinite;
+
+    x = npy_creal@c@(z);
+    y = npy_cimag@c@(z);
+
+    xfinite = npy_isfinite(x);
+    yfinite = npy_isfinite(y);
+
+    /* Handle the nearly-non-exceptional cases where x and y are finite. */
+    if (xfinite && yfinite) {
+        if (y == 0) {
+            return npy_cpack@c@(npy_cosh@c@(x), x * y);
+        }
+        absx = npy_fabs@c@(x);
+        if (absx < CCOSH_BIG) {
+            /* small x: normal case */
+            return npy_cpack@c@(npy_cosh@c@(x) * npy_cos@c@(y),
+                                npy_sinh@c@(x) * npy_sin@c@(y));
+        }
+
+        /* |x| >= 22, so cosh(x) ~= exp(|x|) */
+        if (absx < SCALED_CEXP_LOWER@C@) {
+            /* x < 710: exp(|x|) won't overflow */
+            h = npy_exp@c@(absx) * 0.5@c@;
+            return npy_cpack@c@(h * npy_cos@c@(y),
+                                npy_copysign@c@(h, x) * npy_sin@c@(y));
+        }
+        else if (absx < SCALED_CEXP_UPPER@C@) {
+            /* x < 1455: scale to avoid overflow */
+            z = _npy_scaled_cexp@c@(absx, y, -1);
+            return npy_cpack@c@(npy_creal@c@(z),
+                                npy_cimag@c@(z) * npy_copysign@c@(1, x));
+        }
+        else {
+            /* x >= 1455: the result always overflows */
+            h = CCOSH_HUGE * x;
+            return npy_cpack@c@(h * h * npy_cos@c@(y), h * npy_sin@c@(y));
+        }
+    }
+
+    /*
+     * cosh(+-0 +- I Inf) = dNaN + I sign(d(+-0, dNaN))0.
+     * The sign of 0 in the result is unspecified.  Choice = normally
+     * the same as dNaN.  Raise the invalid floating-point exception.
+     *
+     * cosh(+-0 +- I NaN) = d(NaN) + I sign(d(+-0, NaN))0.
+     * The sign of 0 in the result is unspecified.  Choice = normally
+     * the same as d(NaN).
+     */
+    if (x == 0 && !yfinite) {
+        return npy_cpack@c@(y - y, npy_copysign@c@(0, x * (y - y)));
+    }
+
+    /*
+     * cosh(+-Inf +- I 0) = +Inf + I (+-)(+-)0.
+     *
+     * cosh(NaN +- I 0)   = d(NaN) + I sign(d(NaN, +-0))0.
+     * The sign of 0 in the result is unspecified.
+     */
+    if (y == 0 && !xfinite) {
+        return npy_cpack@c@(x * x, npy_copysign@c@(0, x) * y);
+    }
+
+    /*
+     * cosh(x +- I Inf) = dNaN + I dNaN.
+     * Raise the invalid floating-point exception for finite nonzero x.
+     *
+     * cosh(x + I NaN) = d(NaN) + I d(NaN).
+     * Optionally raises the invalid floating-point exception for finite
+     * nonzero x.  Choice = don't raise (except for signaling NaNs).
+     */
+    if (xfinite && !yfinite) {
+        return npy_cpack@c@(y - y, x * (y - y));
+    }
+
+    /*
+     * cosh(+-Inf + I NaN)  = +Inf + I d(NaN).
+     *
+     * cosh(+-Inf +- I Inf) = +Inf + I dNaN.
+     * The sign of Inf in the result is unspecified.  Choice = always +.
+     * Raise the invalid floating-point exception.
+     *
+     * cosh(+-Inf + I y)   = +Inf cos(y) +- I Inf sin(y)
+     */
+    if (npy_isinf(x)) {
+        if (!yfinite) {
+            return npy_cpack@c@(x * x, x * (y - y));
+        }
+        return npy_cpack@c@((x * x) * npy_cos@c@(y), x * npy_sin@c@(y));
+    }
+
+    /*
+     * cosh(NaN + I NaN)  = d(NaN) + I d(NaN).
+     *
+     * cosh(NaN +- I Inf) = d(NaN) + I d(NaN).
+     * Optionally raises the invalid floating-point exception.
+     * Choice = raise.
+     *
+     * cosh(NaN + I y)    = d(NaN) + I d(NaN).
+     * Optionally raises the invalid floating-point exception for finite
+     * nonzero y.  Choice = don't raise (except for signaling NaNs).
+     */
+    return npy_cpack@c@((x * x) * (y - y), (x + x) * (y - y));
+}
+#undef CCOSH_BIG
+#undef CCOSH_HUGE
+#endif
+
+#ifndef HAVE_CSINH@C@
+/*
+ * Taken from the msun library in FreeBSD, rev 226599.
+ *
+ * Hyperbolic sine of a complex argument z = x + i y.
+ *
+ * sinh(z) = sinh(x+iy)
+ *         = sinh(x) cos(y) + i cosh(x) sin(y).
+ *
+ * Exceptional values are noted in the comments within the source code.
+ * These values and the return value were taken from n1124.pdf.
+ */
+@ctype@
+npy_csinh@c@(@ctype@ z)
+{
+#if @precision@ == 1
+    const npy_float CSINH_BIG = 9.0f;
+    const npy_float CSINH_HUGE = 1.70141183e+38f;
+#endif
+#if @precision@ == 2
+    const npy_double CSINH_BIG = 22.0;
+    const npy_double CSINH_HUGE = 8.9884656743115795e+307;
+#endif
+#if @precision@ >= 3
+#if NPY_SIZEOF_LONGDOUBLE == NPY_SIZEOF_DOUBLE
+    const npy_longdouble CSINH_BIG = 22.0L;
+    const npy_longdouble CSINH_HUGE = 8.9884656743115795e+307L;
+#else
+    const npy_longdouble CSINH_BIG = 24.0L;
+    const npy_longdouble CSINH_HUGE = 5.94865747678615882543e+4931L;
+#endif
+#endif
+
+    @type@ x, y, h, absx;
+    npy_int xfinite, yfinite;
+
+    x = npy_creal@c@(z);
+    y = npy_cimag@c@(z);
+
+    xfinite = npy_isfinite(x);
+    yfinite = npy_isfinite(y);
+
+    /* Handle the nearly-non-exceptional cases where x and y are finite. */
+    if (xfinite && yfinite) {
+        if (y == 0) {
+            return npy_cpack@c@(npy_sinh@c@(x), y);
+        }
+        absx = npy_fabs@c@(x);
+        if (absx < CSINH_BIG) {
+            /* small x: normal case */
+            return npy_cpack@c@(npy_sinh@c@(x) * npy_cos@c@(y),
+                                npy_cosh@c@(x) * npy_sin@c@(y));
+        }
+
+        /* |x| >= 22, so cosh(x) ~= exp(|x|) */
+        if (absx < SCALED_CEXP_LOWER@C@) {
+            /* x < 710: exp(|x|) won't overflow */
+            h = npy_exp@c@(npy_fabs@c@(x)) * 0.5@c@;
+            return npy_cpack@c@(npy_copysign@c@(h, x) * npy_cos@c@(y),
+                                h * npy_sin@c@(y));
+        }
+        else if (x < SCALED_CEXP_UPPER@C@) {
+            /* x < 1455: scale to avoid overflow */
+            z = _npy_scaled_cexp@c@(absx, y, -1);
+            return npy_cpack@c@(npy_creal@c@(z) * npy_copysign@c@(1, x),
+                                npy_cimag@c@(z));
+        }
+        else {
+            /* x >= 1455: the result always overflows */
+            h = CSINH_HUGE * x;
+            return npy_cpack@c@(h * npy_cos@c@(y), h * h * npy_sin@c@(y));
+        }
+    }
+
+    /*
+     * sinh(+-0 +- I Inf) = sign(d(+-0, dNaN))0 + I dNaN.
+     * The sign of 0 in the result is unspecified.  Choice = normally
+     * the same as dNaN.  Raise the invalid floating-point exception.
+     *
+     * sinh(+-0 +- I NaN) = sign(d(+-0, NaN))0 + I d(NaN).
+     * The sign of 0 in the result is unspecified.  Choice = normally
+     * the same as d(NaN).
+     */
+    if (x == 0 && !yfinite) {
+        return npy_cpack@c@(npy_copysign@c@(0, x * (y - y)), y - y);
+    }
+
+    /*
+     * sinh(+-Inf +- I 0) = +-Inf + I +-0.
+     *
+     * sinh(NaN +- I 0)   = d(NaN) + I +-0.
+     */
+    if (y == 0 && !xfinite) {
+        if (npy_isnan(x)) {
+            return z;
+        }
+        return npy_cpack@c@(x, npy_copysign@c@(0, y));
+    }
+
+    /*
+     * sinh(x +- I Inf) = dNaN + I dNaN.
+     * Raise the invalid floating-point exception for finite nonzero x.
+     *
+     * sinh(x + I NaN) = d(NaN) + I d(NaN).
+     * Optionally raises the invalid floating-point exception for finite
+     * nonzero x.  Choice = don't raise (except for signaling NaNs).
+     */
+    if (xfinite && !yfinite) {
+        return npy_cpack@c@(y - y, x * (y - y));
+    }
+
+    /*
+     * sinh(+-Inf + I NaN)  = +-Inf + I d(NaN).
+     * The sign of Inf in the result is unspecified.  Choice = normally
+     * the same as d(NaN).
+     *
+     * sinh(+-Inf +- I Inf) = +Inf + I dNaN.
+     * The sign of Inf in the result is unspecified.  Choice = always +.
+     * Raise the invalid floating-point exception.
+     *
+     * sinh(+-Inf + I y)   = +-Inf cos(y) + I Inf sin(y)
+     */
+    if (!xfinite && !npy_isnan(x)) {
+        if (!yfinite) {
+            return npy_cpack@c@(x * x, x * (y - y));
+        }
+        return npy_cpack@c@(x * npy_cos@c@(y),
+                            NPY_INFINITY@C@ * npy_sin@c@(y));
+    }
+
+    /*
+     * sinh(NaN + I NaN)  = d(NaN) + I d(NaN).
+     *
+     * sinh(NaN +- I Inf) = d(NaN) + I d(NaN).
+     * Optionally raises the invalid floating-point exception.
+     * Choice = raise.
+     *
+     * sinh(NaN + I y)    = d(NaN) + I d(NaN).
+     * Optionally raises the invalid floating-point exception for finite
+     * nonzero y.  Choice = don't raise (except for signaling NaNs).
+     */
+    return npy_cpack@c@((x * x) * (y - y), (x + x) * (y - y));
+}
+#undef CSINH_BIG
+#undef CSINH_HUGE
+#endif
+
+#ifndef HAVE_CTANH@C@
+/*
+ * Taken from the msun library in FreeBSD, rev 226600.
+ *
+ * Hyperbolic tangent of a complex argument z = x + i y.
+ *
+ * The algorithm is from:
+ *
+ *   W. Kahan.  Branch Cuts for Complex Elementary Functions or Much
+ *   Ado About Nothing's Sign Bit.  In The State of the Art in
+ *   Numerical Analysis, pp. 165 ff.  Iserles and Powell, eds., 1987.
+ *
+ * Method:
+ *
+ *   Let t    = tan(x)
+ *       beta = 1/cos^2(y)
+ *       s    = sinh(x)
+ *       rho  = cosh(x)
+ *
+ *   We have:
+ *
+ *   tanh(z) = sinh(z) / cosh(z)
+ *
+ *             sinh(x) cos(y) + i cosh(x) sin(y)
+ *           = ---------------------------------
+ *             cosh(x) cos(y) + i sinh(x) sin(y)
+ *
+ *             cosh(x) sinh(x) / cos^2(y) + i tan(y)
+ *           = -------------------------------------
+ *                    1 + sinh^2(x) / cos^2(y)
+ *
+ *             beta rho s + i t
+ *           = ----------------
+ *               1 + beta s^2
+ *
+ * Modifications:
+ *
+ *   I omitted the original algorithm's handling of overflow in tan(x) after
+ *   verifying with nearpi.c that this can't happen in IEEE single or double
+ *   precision.  I also handle large x differently.
+ */
+
+#define TANH_HUGE 22.0
+#define TANHF_HUGE 11.0F
+#define TANHL_HUGE 42.0L
+
+@ctype@
+npy_ctanh@c@(@ctype@ z)
+{
+    @type@ x, y;
+    @type@ t, beta, s, rho, denom;
+
+    x = npy_creal@c@(z);
+    y = npy_cimag@c@(z);
+
+    /*
+     * ctanh(NaN + i 0) = NaN + i 0
+     *
+     * ctanh(NaN + i y) = NaN + i NaN        for y != 0
+     *
+     * The imaginary part has the sign of x*sin(2*y), but there's no
+     * special effort to get this right.
+     *
+     * ctanh(+-Inf +- i Inf) = +-1 +- 0
+     *
+     * ctanh(+-Inf + i y) = +-1 + 0 sin(2y)        for y finite
+     *
+     * The imaginary part of the sign is unspecified.  This special
+     * case is only needed to avoid a spurious invalid exception when
+     * y is infinite.
+     */
+        if (!npy_isfinite(x)) {
+            if (npy_isnan(x)) {
+                return npy_cpack@c@(x, (y == 0 ? y : x * y));
+            }
+            return npy_cpack@c@(npy_copysign@c@(1,x),
+                                npy_copysign@c@(0,
+                                npy_isinf(y) ?
+                                    y : npy_sin@c@(y) * npy_cos@c@(y)));
+        }
+
+    /*
+     * ctanh(x + i NAN) = NaN + i NaN
+     * ctanh(x +- i Inf) = NaN + i NaN
+     */
+    if (!npy_isfinite(y)) {
+        return (npy_cpack@c@(y - y, y - y));
+    }
+
+    /*
+     * ctanh(+-huge + i +-y) ~= +-1 +- i 2sin(2y)/exp(2x), using the
+     * approximation sinh^2(huge) ~= exp(2*huge) / 4.
+     * We use a modified formula to avoid spurious overflow.
+     */
+    if (npy_fabs@c@(x) >= TANH@C@_HUGE) {
+        @type@ exp_mx = npy_exp@c@(-npy_fabs@c@(x));
+        return npy_cpack@c@(npy_copysign@c@(1, x),
+                            4 * npy_sin@c@(y) * npy_cos@c@(y) *
+                                exp_mx * exp_mx);
+    }
+
+    /* Kahan's algorithm */
+    t = npy_tan@c@(y);
+    beta = 1 + t * t;    /* = 1 / cos^2(y) */
+    s = npy_sinh@c@(x);
+    rho = npy_sqrt@c@(1 + s * s);    /* = cosh(x) */
+    denom = 1 + beta * s * s;
+    return (npy_cpack@c@((beta * rho * s) / denom, t / denom));
+}
+#undef TANH_HUGE
+#undef TANHF_HUGE
+#undef TANHL_HUGE
+#endif
+
 #if !defined (HAVE_CACOS@C@) || !defined (HAVE_CASINH@C@)
 /*
  * Complex inverse trig functions taken from the msum library in FreeBSD
@@ -573,7 +1024,7 @@ npy_cpow@c@ (@ctype@ a, @ctype@ b)
  * Function f(a, b, hypot_a_b) = (hypot(a, b) - b) / 2.
  * Pass hypot(a, b) as the third argument.
  */
-static NPY_INLINE @type@
+static inline @type@
 _f@c@(@type@ a, @type@ b, @type@ hypot_a_b)
 {
     if (b < 0) {
@@ -595,7 +1046,7 @@ _f@c@(@type@ a, @type@ b, @type@ hypot_a_b)
  * If returning sqrt_A2my2 has potential to result in an underflow, it is
  * rescaled, and new_y is similarly rescaled.
  */
-static NPY_INLINE void
+static inline void
 _do_hard_work@c@(@type@ x, @type@ y, @type@ *rx,
     npy_int *B_is_usable, @type@ *B, @type@ *sqrt_A2my2, @type@ *new_y)
 {
@@ -739,7 +1190,7 @@ _do_hard_work@c@(@type@ x, @type@ y, @type@ *rx,
 /*
  * Optimized version of clog() for |z| finite and larger than ~RECIP_EPSILON.
  */
-static NPY_INLINE void
+static inline void
 _clog_for_large_values@c@(@type@ x, @type@ y,
     @type@ *rr, @type@ *ri)
 {
@@ -1040,7 +1491,7 @@ npy_casinh@c@(@ctype@ z)
  * Assumes y is non-negative.
  * Assumes fabs(x) >= DBL_EPSILON.
  */
-static NPY_INLINE @type@
+static inline @type@
 _sum_squares@c@(@type@ x, @type@ y)
 {
 #if @precision@ == 1
@@ -1077,7 +1528,7 @@ const npy_longdouble SQRT_MIN = 1.8336038675548471656e-2466l;
 #if @precision@ == 1
 #define BIAS (FLT_MAX_EXP - 1)
 #define CUTOFF (FLT_MANT_DIG / 2 + 1)
-static NPY_INLINE npy_float
+static inline npy_float
 _real_part_reciprocalf(npy_float x, npy_float y)
 {
     npy_float scale;
@@ -1110,7 +1561,7 @@ _real_part_reciprocalf(npy_float x, npy_float y)
 #define BIAS (DBL_MAX_EXP - 1)
 /*  more guard digits are useful iff there is extra precision. */
 #define CUTOFF (DBL_MANT_DIG / 2 + 1)  /* just half or 1 guard digit */
-static NPY_INLINE npy_double
+static inline npy_double
 _real_part_reciprocal(npy_double x, npy_double y)
 {
     npy_double scale;
@@ -1152,7 +1603,7 @@ _real_part_reciprocal(npy_double x, npy_double y)
 
 #define BIAS (LDBL_MAX_EXP - 1)
 #define CUTOFF (LDBL_MANT_DIG / 2 + 1)
-static NPY_INLINE npy_longdouble
+static inline npy_longdouble
 _real_part_reciprocall(npy_longdouble x,
     npy_longdouble y)
 {
@@ -1185,7 +1636,7 @@ _real_part_reciprocall(npy_longdouble x,
 
 #else
 
-static NPY_INLINE npy_longdouble
+static inline npy_longdouble
 _real_part_reciprocall(npy_longdouble x,
     npy_longdouble y)
 {
@@ -1313,8 +1764,10 @@ npy_@kind@@c@(@ctype@ z)
 /**end repeat1**/
 
 /**begin repeat1
- * #kind = cexp,clog,csqrt,cacos,casin,catan,cacosh,casinh,catanh#
- * #KIND = CEXP,CLOG,CSQRT,CACOS,CASIN,CATAN,CACOSH,CASINH,CATANH#
+ * #kind = cexp,clog,csqrt,ccos,csin,ctan,ccosh,csinh,ctanh,
+ *         cacos,casin,catan,cacosh,casinh,catanh#
+ * #KIND = CEXP,CLOG,CSQRT,CCOS,CSIN,CTAN,CCOSH,CSINH,CTANH,
+ *         CACOS,CASIN,CATAN,CACOSH,CASINH,CATANH#
  */
 #ifdef HAVE_@KIND@@C@
 @ctype@
@@ -1329,20 +1782,5 @@ npy_@kind@@c@(@ctype@ z)
 #endif
 /**end repeat1**/
 
-/* Mandatory complex functions */
-
-/**begin repeat1
- * #kind = csin,csinh,ccos,ccosh,ctan,ctanh#
- */
-@ctype@
-npy_@kind@@c@(@ctype@ z)
-{
-    __@ctype@_to_c99_cast z1;
-    __@ctype@_to_c99_cast ret;
-    z1.npy_z = z;
-    ret.c99_z = @kind@@c@(z1.c99_z);
-    return ret.npy_z;
-}
-/**end repeat1**/
 
 /**end repeat**/
diff --git a/numpy/core/src/npymath/npy_math_internal.h.src b/numpy/core/src/npymath/npy_math_internal.h.src
index 6e84a905e..c7df5e255 100644
--- a/numpy/core/src/npymath/npy_math_internal.h.src
+++ b/numpy/core/src/npymath/npy_math_internal.h.src
@@ -567,7 +567,7 @@ npy_divmod@c@(@type@ a, @type@ b, @type@ *modulus)
 
     /* adjust fmod result to conform to Python convention of remainder */
     if (mod) {
-        if (isless(b, 0) != isless(mod, 0)) {
+        if (isless(b, (@type@)0) != isless(mod, (@type@)0)) {
             mod += b;
             div -= 1.0@c@;
         }
diff --git a/numpy/core/src/npymath/npy_math_private.h b/numpy/core/src/npymath/npy_math_private.h
index a474b3de3..20c94f98a 100644
--- a/numpy/core/src/npymath/npy_math_private.h
+++ b/numpy/core/src/npymath/npy_math_private.h
@@ -21,6 +21,7 @@
 #include <Python.h>
 #ifdef __cplusplus
 #include <cmath>
+#include <complex>
 using std::isgreater;
 using std::isless;
 #else
@@ -494,8 +495,9 @@ do {                                                            \
  * Microsoft C defines _MSC_VER
  * Intel compiler does not use MSVC complex types, but defines _MSC_VER by
  * default.
+ * since c++17 msvc is no longer support them.
  */
-#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+#if !defined(__cplusplus) && defined(_MSC_VER) && !defined(__INTEL_COMPILER)
 typedef union {
         npy_cdouble npy_z;
         _Dcomplex c99_z;
diff --git a/numpy/core/src/npysort/heapsort.cpp b/numpy/core/src/npysort/heapsort.cpp
index de39367c2..77a4bda74 100644
--- a/numpy/core/src/npysort/heapsort.cpp
+++ b/numpy/core/src/npysort/heapsort.cpp
@@ -21,7 +21,7 @@
  *
  * The merge sort is *stable*, meaning that equal components
  * are unmoved from their entry versions, so it can be used to
- * implement lexigraphic sorting on multiple keys.
+ * implement lexicographic sorting on multiple keys.
  *
  * The heap sort is included for completeness.
  */
@@ -55,6 +55,9 @@ npy_heapsort(void *start, npy_intp num, void *varr)
     PyArrayObject *arr = (PyArrayObject *)varr;
     npy_intp elsize = PyArray_ITEMSIZE(arr);
     PyArray_CompareFunc *cmp = PyArray_DESCR(arr)->f->compare;
+    if (elsize == 0) {
+        return 0;  /* no need for sorting elements of no size */
+    }
     char *tmp = (char *)malloc(elsize);
     char *a = (char *)start - elsize;
     npy_intp i, j, l;
diff --git a/numpy/core/src/npysort/mergesort.cpp b/numpy/core/src/npysort/mergesort.cpp
index f892dd185..f53feb320 100644
--- a/numpy/core/src/npysort/mergesort.cpp
+++ b/numpy/core/src/npysort/mergesort.cpp
@@ -21,7 +21,7 @@
  *
  * The merge sort is *stable*, meaning that equal components
  * are unmoved from their entry versions, so it can be used to
- * implement lexigraphic sorting on multiple keys.
+ * implement lexicographic sorting on multiple keys.
  *
  * The heap sort is included for completeness.
  */
@@ -171,6 +171,7 @@ amergesort_(type *v, npy_intp *tosort, npy_intp num)
 }
 
 /*
+ 
  *****************************************************************************
  **                             STRING SORTS                                **
  *****************************************************************************
diff --git a/numpy/core/src/npysort/npysort_common.h b/numpy/core/src/npysort/npysort_common.h
index ab9f456b6..851eb89da 100644
--- a/numpy/core/src/npysort/npysort_common.h
+++ b/numpy/core/src/npysort/npysort_common.h
@@ -44,112 +44,112 @@ extern "C" {
  *****************************************************************************
  */
 
-NPY_INLINE static int
+static inline int
 BOOL_LT(npy_bool a, npy_bool b)
 {
     return a < b;
 }
 
 
-NPY_INLINE static int
+static inline int
 BYTE_LT(npy_byte a, npy_byte b)
 {
     return a < b;
 }
 
 
-NPY_INLINE static int
+static inline int
 UBYTE_LT(npy_ubyte a, npy_ubyte b)
 {
     return a < b;
 }
 
 
-NPY_INLINE static int
+static inline int
 SHORT_LT(npy_short a, npy_short b)
 {
     return a < b;
 }
 
 
-NPY_INLINE static int
+static inline int
 USHORT_LT(npy_ushort a, npy_ushort b)
 {
     return a < b;
 }
 
 
-NPY_INLINE static int
+static inline int
 INT_LT(npy_int a, npy_int b)
 {
     return a < b;
 }
 
 
-NPY_INLINE static int
+static inline int
 UINT_LT(npy_uint a, npy_uint b)
 {
     return a < b;
 }
 
 
-NPY_INLINE static int
+static inline int
 LONG_LT(npy_long a, npy_long b)
 {
     return a < b;
 }
 
 
-NPY_INLINE static int
+static inline int
 ULONG_LT(npy_ulong a, npy_ulong b)
 {
     return a < b;
 }
 
 
-NPY_INLINE static int
+static inline int
 LONGLONG_LT(npy_longlong a, npy_longlong b)
 {
     return a < b;
 }
 
 
-NPY_INLINE static int
+static inline int
 ULONGLONG_LT(npy_ulonglong a, npy_ulonglong b)
 {
     return a < b;
 }
 
 
-NPY_INLINE static int
+static inline int
 FLOAT_LT(npy_float a, npy_float b)
 {
     return a < b || (b != b && a == a);
 }
 
 
-NPY_INLINE static int
+static inline int
 DOUBLE_LT(npy_double a, npy_double b)
 {
     return a < b || (b != b && a == a);
 }
 
 
-NPY_INLINE static int
+static inline int
 LONGDOUBLE_LT(npy_longdouble a, npy_longdouble b)
 {
     return a < b || (b != b && a == a);
 }
 
 
-NPY_INLINE static int
+static inline int
 _npy_half_isnan(npy_half h)
 {
     return ((h&0x7c00u) == 0x7c00u) && ((h&0x03ffu) != 0x0000u);
 }
 
 
-NPY_INLINE static int
+static inline int
 _npy_half_lt_nonan(npy_half h1, npy_half h2)
 {
     if (h1&0x8000u) {
@@ -172,7 +172,7 @@ _npy_half_lt_nonan(npy_half h1, npy_half h2)
 }
 
 
-NPY_INLINE static int
+static inline int
 HALF_LT(npy_half a, npy_half b)
 {
     int ret;
@@ -192,7 +192,7 @@ HALF_LT(npy_half a, npy_half b)
  * of an if statement. It's a SUN compiler thing, so assign the return value
  * to a variable instead.
  */
-NPY_INLINE static int
+static inline int
 CFLOAT_LT(npy_cfloat a, npy_cfloat b)
 {
     int ret;
@@ -214,7 +214,7 @@ CFLOAT_LT(npy_cfloat a, npy_cfloat b)
 }
 
 
-NPY_INLINE static int
+static inline int
 CDOUBLE_LT(npy_cdouble a, npy_cdouble b)
 {
     int ret;
@@ -236,7 +236,7 @@ CDOUBLE_LT(npy_cdouble a, npy_cdouble b)
 }
 
 
-NPY_INLINE static int
+static inline int
 CLONGDOUBLE_LT(npy_clongdouble a, npy_clongdouble b)
 {
     int ret;
@@ -258,14 +258,14 @@ CLONGDOUBLE_LT(npy_clongdouble a, npy_clongdouble b)
 }
 
 
-NPY_INLINE static void
+static inline void
 STRING_COPY(char *s1, char const*s2, size_t len)
 {
     memcpy(s1, s2, len);
 }
 
 
-NPY_INLINE static void
+static inline void
 STRING_SWAP(char *s1, char *s2, size_t len)
 {
     while(len--) {
@@ -276,7 +276,7 @@ STRING_SWAP(char *s1, char *s2, size_t len)
 }
 
 
-NPY_INLINE static int
+static inline int
 STRING_LT(const char *s1, const char *s2, size_t len)
 {
     const unsigned char *c1 = (const unsigned char *)s1;
@@ -294,7 +294,7 @@ STRING_LT(const char *s1, const char *s2, size_t len)
 }
 
 
-NPY_INLINE static void
+static inline void
 UNICODE_COPY(npy_ucs4 *s1, npy_ucs4 const *s2, size_t len)
 {
     while(len--) {
@@ -303,7 +303,7 @@ UNICODE_COPY(npy_ucs4 *s1, npy_ucs4 const *s2, size_t len)
 }
 
 
-NPY_INLINE static void
+static inline void
 UNICODE_SWAP(npy_ucs4 *s1, npy_ucs4 *s2, size_t len)
 {
     while(len--) {
@@ -314,7 +314,7 @@ UNICODE_SWAP(npy_ucs4 *s1, npy_ucs4 *s2, size_t len)
 }
 
 
-NPY_INLINE static int
+static inline int
 UNICODE_LT(const npy_ucs4 *s1, const npy_ucs4 *s2, size_t len)
 {
     size_t i;
@@ -330,7 +330,7 @@ UNICODE_LT(const npy_ucs4 *s1, const npy_ucs4 *s2, size_t len)
 }
 
 
-NPY_INLINE static int
+static inline int
 DATETIME_LT(npy_datetime a, npy_datetime b)
 {
     if (a == NPY_DATETIME_NAT) {
@@ -345,7 +345,7 @@ DATETIME_LT(npy_datetime a, npy_datetime b)
 }
 
 
-NPY_INLINE static int
+static inline int
 TIMEDELTA_LT(npy_timedelta a, npy_timedelta b)
 {
     if (a == NPY_DATETIME_NAT) {
@@ -360,14 +360,14 @@ TIMEDELTA_LT(npy_timedelta a, npy_timedelta b)
 }
 
 
-NPY_INLINE static void
+static inline void
 GENERIC_COPY(char *a, char *b, size_t len)
 {
     memcpy(a, b, len);
 }
 
 
-NPY_INLINE static void
+static inline void
 GENERIC_SWAP(char *a, char *b, size_t len)
 {
     while(len--) {
diff --git a/numpy/core/src/npysort/npysort_heapsort.h b/numpy/core/src/npysort/npysort_heapsort.h
index 762d89b7b..16750b817 100644
--- a/numpy/core/src/npysort/npysort_heapsort.h
+++ b/numpy/core/src/npysort/npysort_heapsort.h
@@ -16,7 +16,7 @@
  */
 
 template <typename Tag, typename type>
-NPY_INLINE NPY_NO_EXPORT
+inline NPY_NO_EXPORT
 int heapsort_(type *start, npy_intp n)
 {
     type tmp, *a;
@@ -67,7 +67,7 @@ int heapsort_(type *start, npy_intp n)
 }
 
 template <typename Tag, typename type>
-NPY_INLINE NPY_NO_EXPORT
+inline NPY_NO_EXPORT
 int aheapsort_(type *vv, npy_intp *tosort, npy_intp n)
 {
     type *v = vv;
@@ -123,11 +123,15 @@ int aheapsort_(type *vv, npy_intp *tosort, npy_intp n)
  */
 
 template <typename Tag, typename type>
-NPY_INLINE NPY_NO_EXPORT
+inline NPY_NO_EXPORT
 int string_heapsort_(type *start, npy_intp n, void *varr)
 {
     PyArrayObject *arr = (PyArrayObject *)varr;
     size_t len = PyArray_ITEMSIZE(arr) / sizeof(type);
+    if (len == 0) {
+        return 0;  /* no need for sorting if strings are empty */
+    }
+
     type *tmp = (type *)malloc(PyArray_ITEMSIZE(arr));
     type *a = (type *)start - len;
     npy_intp i, j, l;
@@ -177,7 +181,7 @@ int string_heapsort_(type *start, npy_intp n, void *varr)
 }
 
 template <typename Tag, typename type>
-NPY_INLINE NPY_NO_EXPORT
+inline NPY_NO_EXPORT
 int string_aheapsort_(type *vv, npy_intp *tosort, npy_intp n, void *varr)
 {
     type *v = vv;
diff --git a/numpy/core/src/npysort/quicksort.cpp b/numpy/core/src/npysort/quicksort.cpp
index 3e351dd84..7497ebaa3 100644
--- a/numpy/core/src/npysort/quicksort.cpp
+++ b/numpy/core/src/npysort/quicksort.cpp
@@ -54,15 +54,11 @@
 #include "npysort_common.h"
 #include "npysort_heapsort.h"
 #include "numpy_tag.h"
+#include "simd_qsort.hpp"
 
-#include "x86-qsort.h"
 #include <cstdlib>
 #include <utility>
 
-#ifndef NPY_DISABLE_OPTIMIZATION
-#include "x86-qsort.dispatch.h"
-#endif
-
 #define NOT_USED NPY_UNUSED(unused)
 /*
  * pushing largest partition has upper bound of log2(n) space
@@ -73,70 +69,50 @@
 #define SMALL_MERGESORT 20
 #define SMALL_STRING 16
 
+// Temporarily disable AVX512 sorting on WIN32 and CYGWIN until we can figure
+// out why it has test failures
+#if defined(_MSC_VER) || defined(__CYGWIN__)
+template<typename T>
+inline bool quicksort_dispatch(T*, npy_intp)
+{
+    return false;
+}
+#else
+template<typename T>
+inline bool quicksort_dispatch(T *start, npy_intp num)
+{
+    using TF = typename np::meta::FixedWidth<T>::Type;
+    void (*dispfunc)(TF*, intptr_t) = nullptr;
+    if (sizeof(T) == sizeof(uint16_t)) {
+        #ifndef NPY_DISABLE_OPTIMIZATION
+            #include "simd_qsort_16bit.dispatch.h"
+        #endif
+        NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSort, <TF>);
+    }
+    else if (sizeof(T) == sizeof(uint32_t) || sizeof(T) == sizeof(uint64_t)) {
+        #ifndef NPY_DISABLE_OPTIMIZATION
+            #include "simd_qsort.dispatch.h"
+        #endif
+        NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSort, <TF>);
+    }
+    if (dispfunc) {
+        (*dispfunc)(reinterpret_cast<TF*>(start), static_cast<intptr_t>(num));
+        return true;
+    }
+    return false;
+}
+#endif // _MSC_VER || CYGWIN
+
 /*
  *****************************************************************************
  **                            NUMERIC SORTS                                **
  *****************************************************************************
  */
 
-namespace {
-
-template <typename Tag>
-struct x86_dispatch {
-    static bool quicksort(typename Tag::type *, npy_intp) { return false; }
-};
-
-template <>
-struct x86_dispatch<npy::int_tag> {
-    static bool quicksort(npy_int *start, npy_intp num)
-    {
-        void (*dispfunc)(void *, npy_intp) = nullptr;
-        NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_int);
-        if (dispfunc) {
-            (*dispfunc)(start, num);
-            return true;
-        }
-        return false;
-    }
-};
-
-template <>
-struct x86_dispatch<npy::uint_tag> {
-    static bool quicksort(npy_uint *start, npy_intp num)
-    {
-        void (*dispfunc)(void *, npy_intp) = nullptr;
-        NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_uint);
-        if (dispfunc) {
-            (*dispfunc)(start, num);
-            return true;
-        }
-        return false;
-    }
-};
-
-template <>
-struct x86_dispatch<npy::float_tag> {
-    static bool quicksort(npy_float *start, npy_intp num)
-    {
-        void (*dispfunc)(void *, npy_intp) = nullptr;
-        NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_float);
-        if (dispfunc) {
-            (*dispfunc)(start, num);
-            return true;
-        }
-        return false;
-    }
-};
-
-}  // namespace
-
 template <typename Tag, typename type>
 static int
 quicksort_(type *start, npy_intp num)
 {
-    if (x86_dispatch<Tag>::quicksort(start, num))
-        return 0;
-
     type vp;
     type *pl = start;
     type *pr = pl + num - 1;
@@ -729,56 +705,89 @@ quicksort_ubyte(void *start, npy_intp n, void *NPY_UNUSED(varr))
 NPY_NO_EXPORT int
 quicksort_short(void *start, npy_intp n, void *NPY_UNUSED(varr))
 {
+    if (quicksort_dispatch((npy_short *)start, n)) {
+        return 0;
+    }
     return quicksort_<npy::short_tag>((npy_short *)start, n);
 }
 NPY_NO_EXPORT int
 quicksort_ushort(void *start, npy_intp n, void *NPY_UNUSED(varr))
 {
+    if (quicksort_dispatch((npy_ushort *)start, n)) {
+        return 0;
+    }
     return quicksort_<npy::ushort_tag>((npy_ushort *)start, n);
 }
 NPY_NO_EXPORT int
 quicksort_int(void *start, npy_intp n, void *NPY_UNUSED(varr))
 {
+    if (quicksort_dispatch((npy_int *)start, n)) {
+        return 0;
+    }
     return quicksort_<npy::int_tag>((npy_int *)start, n);
 }
 NPY_NO_EXPORT int
 quicksort_uint(void *start, npy_intp n, void *NPY_UNUSED(varr))
 {
+    if (quicksort_dispatch((npy_uint *)start, n)) {
+        return 0;
+    }
     return quicksort_<npy::uint_tag>((npy_uint *)start, n);
 }
 NPY_NO_EXPORT int
 quicksort_long(void *start, npy_intp n, void *NPY_UNUSED(varr))
 {
+    if (quicksort_dispatch((npy_long *)start, n)) {
+        return 0;
+    }
     return quicksort_<npy::long_tag>((npy_long *)start, n);
 }
 NPY_NO_EXPORT int
 quicksort_ulong(void *start, npy_intp n, void *NPY_UNUSED(varr))
 {
+    if (quicksort_dispatch((npy_ulong *)start, n)) {
+        return 0;
+    }
     return quicksort_<npy::ulong_tag>((npy_ulong *)start, n);
 }
 NPY_NO_EXPORT int
 quicksort_longlong(void *start, npy_intp n, void *NPY_UNUSED(varr))
 {
+    if (quicksort_dispatch((npy_longlong *)start, n)) {
+        return 0;
+    }
     return quicksort_<npy::longlong_tag>((npy_longlong *)start, n);
 }
 NPY_NO_EXPORT int
 quicksort_ulonglong(void *start, npy_intp n, void *NPY_UNUSED(varr))
 {
+    if (quicksort_dispatch((npy_ulonglong *)start, n)) {
+        return 0;
+    }
     return quicksort_<npy::ulonglong_tag>((npy_ulonglong *)start, n);
 }
 NPY_NO_EXPORT int
 quicksort_half(void *start, npy_intp n, void *NPY_UNUSED(varr))
 {
+    if (quicksort_dispatch((np::Half *)start, n)) {
+        return 0;
+    }
     return quicksort_<npy::half_tag>((npy_half *)start, n);
 }
 NPY_NO_EXPORT int
 quicksort_float(void *start, npy_intp n, void *NPY_UNUSED(varr))
 {
+    if (quicksort_dispatch((npy_float *)start, n)) {
+        return 0;
+    }
     return quicksort_<npy::float_tag>((npy_float *)start, n);
 }
 NPY_NO_EXPORT int
 quicksort_double(void *start, npy_intp n, void *NPY_UNUSED(varr))
 {
+    if (quicksort_dispatch((npy_double *)start, n)) {
+        return 0;
+    }
     return quicksort_<npy::double_tag>((npy_double *)start, n);
 }
 NPY_NO_EXPORT int
diff --git a/numpy/core/src/npysort/selection.cpp b/numpy/core/src/npysort/selection.cpp
index ebe188b71..450858df9 100644
--- a/numpy/core/src/npysort/selection.cpp
+++ b/numpy/core/src/npysort/selection.cpp
@@ -423,8 +423,8 @@ static constexpr std::array<arg_map, sizeof...(Tags)>
 make_partition_map(npy::taglist<Tags...>)
 {
     return std::array<arg_map, sizeof...(Tags)>{
-            arg_map{Tags::type_value, &introselect_noarg<Tags>,
-                    &introselect_arg<Tags>}...};
+            arg_map{Tags::type_value, {&introselect_noarg<Tags>},
+                {&introselect_arg<Tags>}}...};
 }
 
 struct partition_t {
diff --git a/numpy/core/src/npysort/simd_qsort.dispatch.cpp b/numpy/core/src/npysort/simd_qsort.dispatch.cpp
new file mode 100644
index 000000000..101bb3dcc
--- /dev/null
+++ b/numpy/core/src/npysort/simd_qsort.dispatch.cpp
@@ -0,0 +1,44 @@
+/*@targets
+ * $maxopt $keep_baseline avx512_skx
+ */
+// policy $keep_baseline is used to avoid skip building avx512_skx
+// when its part of baseline features (--cpu-baseline), since
+// 'baseline' option isn't specified within targets.
+
+#include "simd_qsort.hpp"
+
+#if defined(NPY_HAVE_AVX512_SKX) && !defined(_MSC_VER)
+    #include "x86-simd-sort/src/avx512-32bit-qsort.hpp"
+    #include "x86-simd-sort/src/avx512-64bit-qsort.hpp"
+#endif
+
+namespace np { namespace qsort_simd {
+
+#if defined(NPY_HAVE_AVX512_SKX) && !defined(_MSC_VER)
+template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int32_t *arr, intptr_t size)
+{
+    avx512_qsort(arr, size);
+}
+template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint32_t *arr, intptr_t size)
+{
+    avx512_qsort(arr, size);
+}
+template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int64_t *arr, intptr_t size)
+{
+    avx512_qsort(arr, size);
+}
+template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint64_t *arr, intptr_t size)
+{
+    avx512_qsort(arr, size);
+}
+template<> void NPY_CPU_DISPATCH_CURFX(QSort)(float *arr, intptr_t size)
+{
+    avx512_qsort(arr, size);
+}
+template<> void NPY_CPU_DISPATCH_CURFX(QSort)(double *arr, intptr_t size)
+{
+    avx512_qsort(arr, size);
+}
+#endif  // NPY_HAVE_AVX512_SKX
+
+}} // namespace np::simd
diff --git a/numpy/core/src/npysort/simd_qsort.hpp b/numpy/core/src/npysort/simd_qsort.hpp
new file mode 100644
index 000000000..7cdee774d
--- /dev/null
+++ b/numpy/core/src/npysort/simd_qsort.hpp
@@ -0,0 +1,19 @@
+#ifndef NUMPY_SRC_COMMON_NPYSORT_SIMD_QSORT_HPP
+#define NUMPY_SRC_COMMON_NPYSORT_SIMD_QSORT_HPP
+
+#include "common.hpp"
+
+namespace np { namespace qsort_simd {
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "simd_qsort.dispatch.h"
+#endif
+NPY_CPU_DISPATCH_DECLARE(template <typename T> void QSort, (T *arr, intptr_t size))
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "simd_qsort_16bit.dispatch.h"
+#endif
+NPY_CPU_DISPATCH_DECLARE(template <typename T> void QSort, (T *arr, intptr_t size))
+
+} } // np::qsort_simd
+#endif // NUMPY_SRC_COMMON_NPYSORT_SIMD_QSORT_HPP
diff --git a/numpy/core/src/npysort/simd_qsort_16bit.dispatch.cpp b/numpy/core/src/npysort/simd_qsort_16bit.dispatch.cpp
new file mode 100644
index 000000000..3f5099758
--- /dev/null
+++ b/numpy/core/src/npysort/simd_qsort_16bit.dispatch.cpp
@@ -0,0 +1,39 @@
+/*@targets
+ * $maxopt $keep_baseline avx512_icl avx512_spr
+ */
+// policy $keep_baseline is used to avoid skip building avx512_skx
+// when its part of baseline features (--cpu-baseline), since
+// 'baseline' option isn't specified within targets.
+
+#include "simd_qsort.hpp"
+
+#if defined(NPY_HAVE_AVX512_SPR) && !defined(_MSC_VER)
+    #include "x86-simd-sort/src/avx512fp16-16bit-qsort.hpp"
+#elif defined(NPY_HAVE_AVX512_ICL) && !defined(_MSC_VER)
+    #include "x86-simd-sort/src/avx512-16bit-qsort.hpp"
+#endif
+
+namespace np { namespace qsort_simd {
+
+#if !defined(_MSC_VER)
+#if defined(NPY_HAVE_AVX512_ICL) || defined(NPY_HAVE_AVX512_SPR)
+template<> void NPY_CPU_DISPATCH_CURFX(QSort)(Half *arr, intptr_t size)
+{
+#if defined(NPY_HAVE_AVX512_SPR)
+    avx512_qsort(reinterpret_cast<_Float16*>(arr), size);
+#else
+    avx512_qsort_fp16(reinterpret_cast<uint16_t*>(arr), size);
+#endif
+}
+template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint16_t *arr, intptr_t size)
+{
+    avx512_qsort(arr, size);
+}
+template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int16_t *arr, intptr_t size)
+{
+    avx512_qsort(arr, size);
+}
+#endif // NPY_HAVE_AVX512_ICL || SPR
+#endif // _MSC_VER
+
+}} // namespace np::qsort_simd
diff --git a/numpy/core/src/npysort/timsort.cpp b/numpy/core/src/npysort/timsort.cpp
index 27294af0c..9438fb293 100644
--- a/numpy/core/src/npysort/timsort.cpp
+++ b/numpy/core/src/npysort/timsort.cpp
@@ -21,7 +21,7 @@
  *
  * The merge sort is *stable*, meaning that equal components
  * are unmoved from their entry versions, so it can be used to
- * implement lexigraphic sorting on multiple keys.
+ * implement lexicographic sorting on multiple keys.
  *
  * The heap sort is included for completeness.
  */
diff --git a/numpy/core/src/npysort/x86-qsort.dispatch.cpp b/numpy/core/src/npysort/x86-qsort.dispatch.cpp
deleted file mode 100644
index 8e88cc667..000000000
--- a/numpy/core/src/npysort/x86-qsort.dispatch.cpp
+++ /dev/null
@@ -1,835 +0,0 @@
-/*@targets
- * $maxopt $keep_baseline avx512_skx
- */
-// policy $keep_baseline is used to avoid skip building avx512_skx
-// when its part of baseline features (--cpu-baseline), since
-// 'baseline' option isn't specified within targets.
-
-#include "x86-qsort.h"
-#define NPY_NO_DEPRECATED_API NPY_API_VERSION
-
-#ifdef NPY_HAVE_AVX512_SKX
-#include "numpy/npy_math.h"
-
-#include "npy_sort.h"
-#include "numpy_tag.h"
-
-#include "simd/simd.h"
-#include <immintrin.h>
-
-template <typename Tag, typename type>
-NPY_NO_EXPORT int
-heapsort_(type *start, npy_intp n);
-
-/*
- * Quicksort using AVX-512 for int, uint32 and float. The ideas and code are
- * based on these two research papers:
- * (1) Fast and Robust Vectorized In-Place Sorting of Primitive Types
- *     https://drops.dagstuhl.de/opus/volltexte/2021/13775/
- * (2) A Novel Hybrid Quicksort Algorithm Vectorized using AVX-512 on Intel
- * Skylake https://arxiv.org/pdf/1704.08579.pdf
- *
- * High level idea: Vectorize the quicksort partitioning using AVX-512
- * compressstore instructions. The algorithm to pick the pivot is to use median
- * of 72 elements picked at random. If the array size is < 128, then use
- * Bitonic sorting network. Good resource for bitonic sorting network:
- * http://mitp-content-server.mit.edu:18180/books/content/sectbyfn?collid=books_pres_0&fn=Chapter%2027.pdf&id=8030
- *
- * Refer to https://github.com/numpy/numpy/pull/20133#issuecomment-958110340
- * for potential problems when converting this code to universal intrinsics
- * framework.
- */
-
-/*
- * Constants used in sorting 16 elements in a ZMM registers. Based on Bitonic
- * sorting network (see
- * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg)
- */
-#define NETWORK1 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
-#define NETWORK2 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
-#define NETWORK3 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7
-#define NETWORK4 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2
-#define NETWORK5 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
-#define NETWORK6 11, 10, 9, 8, 15, 14, 13, 12, 3, 2, 1, 0, 7, 6, 5, 4
-#define NETWORK7 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
-#define ZMM_MAX_FLOAT _mm512_set1_ps(NPY_INFINITYF)
-#define ZMM_MAX_UINT _mm512_set1_epi32(NPY_MAX_UINT32)
-#define ZMM_MAX_INT _mm512_set1_epi32(NPY_MAX_INT32)
-#define SHUFFLE_MASK(a, b, c, d) (a << 6) | (b << 4) | (c << 2) | d
-#define SHUFFLE_ps(ZMM, MASK) _mm512_shuffle_ps(zmm, zmm, MASK)
-#define SHUFFLE_epi32(ZMM, MASK) _mm512_shuffle_epi32(zmm, MASK)
-
-#define MAX(x, y) (((x) > (y)) ? (x) : (y))
-#define MIN(x, y) (((x) < (y)) ? (x) : (y))
-
-/*
- * Vectorized random number generator xoroshiro128+. Broken into 2 parts:
- * (1) vnext generates 2 64-bit random integers
- * (2) rnd_epu32 converts this to 4 32-bit random integers and bounds it to
- *     the length of the array
- */
-#define VROTL(x, k) /* rotate each uint64_t value in vector */ \
-    _mm256_or_si256(_mm256_slli_epi64((x), (k)),               \
-                    _mm256_srli_epi64((x), 64 - (k)))
-
-static inline __m256i
-vnext(__m256i *s0, __m256i *s1)
-{
-    *s1 = _mm256_xor_si256(*s0, *s1); /* modify vectors s1 and s0 */
-    *s0 = _mm256_xor_si256(_mm256_xor_si256(VROTL(*s0, 24), *s1),
-                           _mm256_slli_epi64(*s1, 16));
-    *s1 = VROTL(*s1, 37);
-    return _mm256_add_epi64(*s0, *s1); /* return random vector */
-}
-
-/* transform random numbers to the range between 0 and bound - 1 */
-static inline __m256i
-rnd_epu32(__m256i rnd_vec, __m256i bound)
-{
-    __m256i even = _mm256_srli_epi64(_mm256_mul_epu32(rnd_vec, bound), 32);
-    __m256i odd = _mm256_mul_epu32(_mm256_srli_epi64(rnd_vec, 32), bound);
-    return _mm256_blend_epi32(odd, even, 0b01010101);
-}
-
-template <typename type>
-struct vector;
-
-template <>
-struct vector<npy_int> {
-    using tag = npy::int_tag;
-    using type_t = npy_int;
-    using zmm_t = __m512i;
-    using ymm_t = __m256i;
-
-    static type_t type_max() { return NPY_MAX_INT32; }
-    static type_t type_min() { return NPY_MIN_INT32; }
-    static zmm_t zmm_max() { return _mm512_set1_epi32(type_max()); }
-
-    static __mmask16 ge(zmm_t x, zmm_t y)
-    {
-        return _mm512_cmp_epi32_mask(x, y, _MM_CMPINT_NLT);
-    }
-    template <int scale>
-    static ymm_t i64gather(__m512i index, void const *base)
-    {
-        return _mm512_i64gather_epi32(index, base, scale);
-    }
-    static zmm_t loadu(void const *mem) { return _mm512_loadu_si512(mem); }
-    static zmm_t max(zmm_t x, zmm_t y) { return _mm512_max_epi32(x, y); }
-    static void mask_compressstoreu(void *mem, __mmask16 mask, zmm_t x)
-    {
-        return _mm512_mask_compressstoreu_epi32(mem, mask, x);
-    }
-    static zmm_t mask_loadu(zmm_t x, __mmask16 mask, void const *mem)
-    {
-        return _mm512_mask_loadu_epi32(x, mask, mem);
-    }
-    static zmm_t mask_mov(zmm_t x, __mmask16 mask, zmm_t y)
-    {
-        return _mm512_mask_mov_epi32(x, mask, y);
-    }
-    static void mask_storeu(void *mem, __mmask16 mask, zmm_t x)
-    {
-        return _mm512_mask_storeu_epi32(mem, mask, x);
-    }
-    static zmm_t min(zmm_t x, zmm_t y) { return _mm512_min_epi32(x, y); }
-    static zmm_t permutexvar(__m512i idx, zmm_t zmm)
-    {
-        return _mm512_permutexvar_epi32(idx, zmm);
-    }
-    static type_t reducemax(zmm_t v) { return npyv_reduce_max_s32(v); }
-    static type_t reducemin(zmm_t v) { return npyv_reduce_min_s32(v); }
-    static zmm_t set1(type_t v) { return _mm512_set1_epi32(v); }
-    template<__mmask16 mask>
-    static zmm_t shuffle(zmm_t zmm)
-    {
-        return _mm512_shuffle_epi32(zmm, (_MM_PERM_ENUM)mask);
-    }
-    static void storeu(void *mem, zmm_t x)
-    {
-        return _mm512_storeu_si512(mem, x);
-    }
-
-    static ymm_t max(ymm_t x, ymm_t y) { return _mm256_max_epi32(x, y); }
-    static ymm_t min(ymm_t x, ymm_t y) { return _mm256_min_epi32(x, y); }
-};
-template <>
-struct vector<npy_uint> {
-    using tag = npy::uint_tag;
-    using type_t = npy_uint;
-    using zmm_t = __m512i;
-    using ymm_t = __m256i;
-
-    static type_t type_max() { return NPY_MAX_UINT32; }
-    static type_t type_min() { return 0; }
-    static zmm_t zmm_max() { return _mm512_set1_epi32(type_max()); }
-
-    template<int scale>
-    static ymm_t i64gather(__m512i index, void const *base)
-    {
-        return _mm512_i64gather_epi32(index, base, scale);
-    }
-    static __mmask16 ge(zmm_t x, zmm_t y)
-    {
-        return _mm512_cmp_epu32_mask(x, y, _MM_CMPINT_NLT);
-    }
-    static zmm_t loadu(void const *mem) { return _mm512_loadu_si512(mem); }
-    static zmm_t max(zmm_t x, zmm_t y) { return _mm512_max_epu32(x, y); }
-    static void mask_compressstoreu(void *mem, __mmask16 mask, zmm_t x)
-    {
-        return _mm512_mask_compressstoreu_epi32(mem, mask, x);
-    }
-    static zmm_t mask_loadu(zmm_t x, __mmask16 mask, void const *mem)
-    {
-        return _mm512_mask_loadu_epi32(x, mask, mem);
-    }
-    static zmm_t mask_mov(zmm_t x, __mmask16 mask, zmm_t y)
-    {
-        return _mm512_mask_mov_epi32(x, mask, y);
-    }
-    static void mask_storeu(void *mem, __mmask16 mask, zmm_t x)
-    {
-        return _mm512_mask_storeu_epi32(mem, mask, x);
-    }
-    static zmm_t min(zmm_t x, zmm_t y) { return _mm512_min_epu32(x, y); }
-    static zmm_t permutexvar(__m512i idx, zmm_t zmm)
-    {
-        return _mm512_permutexvar_epi32(idx, zmm);
-    }
-    static type_t reducemax(zmm_t v) { return npyv_reduce_max_u32(v); }
-    static type_t reducemin(zmm_t v) { return npyv_reduce_min_u32(v); }
-    static zmm_t set1(type_t v) { return _mm512_set1_epi32(v); }
-    template<__mmask16 mask>
-    static zmm_t shuffle(zmm_t zmm)
-    {
-        return _mm512_shuffle_epi32(zmm, (_MM_PERM_ENUM)mask);
-    }
-    static void storeu(void *mem, zmm_t x)
-    {
-        return _mm512_storeu_si512(mem, x);
-    }
-
-    static ymm_t max(ymm_t x, ymm_t y) { return _mm256_max_epu32(x, y); }
-    static ymm_t min(ymm_t x, ymm_t y) { return _mm256_min_epu32(x, y); }
-};
-template <>
-struct vector<npy_float> {
-    using tag = npy::float_tag;
-    using type_t = npy_float;
-    using zmm_t = __m512;
-    using ymm_t = __m256;
-
-    static type_t type_max() { return NPY_INFINITYF; }
-    static type_t type_min() { return -NPY_INFINITYF; }
-    static zmm_t zmm_max() { return _mm512_set1_ps(type_max()); }
-
-    static __mmask16 ge(zmm_t x, zmm_t y)
-    {
-        return _mm512_cmp_ps_mask(x, y, _CMP_GE_OQ);
-    }
-    template<int scale>
-    static ymm_t i64gather(__m512i index, void const *base)
-    {
-        return _mm512_i64gather_ps(index, base, scale);
-    }
-    static zmm_t loadu(void const *mem) { return _mm512_loadu_ps(mem); }
-    static zmm_t max(zmm_t x, zmm_t y) { return _mm512_max_ps(x, y); }
-    static void mask_compressstoreu(void *mem, __mmask16 mask, zmm_t x)
-    {
-        return _mm512_mask_compressstoreu_ps(mem, mask, x);
-    }
-    static zmm_t mask_loadu(zmm_t x, __mmask16 mask, void const *mem)
-    {
-        return _mm512_mask_loadu_ps(x, mask, mem);
-    }
-    static zmm_t mask_mov(zmm_t x, __mmask16 mask, zmm_t y)
-    {
-        return _mm512_mask_mov_ps(x, mask, y);
-    }
-    static void mask_storeu(void *mem, __mmask16 mask, zmm_t x)
-    {
-        return _mm512_mask_storeu_ps(mem, mask, x);
-    }
-    static zmm_t min(zmm_t x, zmm_t y) { return _mm512_min_ps(x, y); }
-    static zmm_t permutexvar(__m512i idx, zmm_t zmm)
-    {
-        return _mm512_permutexvar_ps(idx, zmm);
-    }
-    static type_t reducemax(zmm_t v) { return npyv_reduce_max_f32(v); }
-    static type_t reducemin(zmm_t v) { return npyv_reduce_min_f32(v); }
-    static zmm_t set1(type_t v) { return _mm512_set1_ps(v); }
-    template<__mmask16 mask>
-    static zmm_t shuffle(zmm_t zmm)
-    {
-        return _mm512_shuffle_ps(zmm, zmm, (_MM_PERM_ENUM)mask);
-    }
-    static void storeu(void *mem, zmm_t x) { return _mm512_storeu_ps(mem, x); }
-
-    static ymm_t max(ymm_t x, ymm_t y) { return _mm256_max_ps(x, y); }
-    static ymm_t min(ymm_t x, ymm_t y) { return _mm256_min_ps(x, y); }
-};
-
-/*
- * COEX == Compare and Exchange two registers by swapping min and max values
- */
-template <typename vtype, typename mm_t>
-void
-COEX(mm_t &a, mm_t &b)
-{
-    mm_t temp = a;
-    a = vtype::min(a, b);
-    b = vtype::max(temp, b);
-}
-
-template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-static inline zmm_t
-cmp_merge(zmm_t in1, zmm_t in2, __mmask16 mask)
-{
-    zmm_t min = vtype::min(in2, in1);
-    zmm_t max = vtype::max(in2, in1);
-    return vtype::mask_mov(min, mask, max);  // 0 -> min, 1 -> max
-}
-
-/*
- * Assumes zmm is random and performs a full sorting network defined in
- * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg
- */
-template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-static inline zmm_t
-sort_zmm(zmm_t zmm)
-{
-    zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(zmm),
-                           0xAAAA);
-    zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(0, 1, 2, 3)>(zmm),
-                           0xCCCC);
-    zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(zmm),
-                           0xAAAA);
-    zmm = cmp_merge<vtype>(
-            zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK3), zmm), 0xF0F0);
-    zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(1, 0, 3, 2)>(zmm),
-                           0xCCCC);
-    zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(zmm),
-                           0xAAAA);
-    zmm = cmp_merge<vtype>(
-            zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm), 0xFF00);
-    zmm = cmp_merge<vtype>(
-            zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK6), zmm), 0xF0F0);
-    zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(1, 0, 3, 2)>(zmm),
-                           0xCCCC);
-    zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(zmm),
-                           0xAAAA);
-    return zmm;
-}
-
-// Assumes zmm is bitonic and performs a recursive half cleaner
-template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-static inline zmm_t
-bitonic_merge_zmm(zmm_t zmm)
-{
-    // 1) half_cleaner[16]: compare 1-9, 2-10, 3-11 etc ..
-    zmm = cmp_merge<vtype>(
-            zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK7), zmm), 0xFF00);
-    // 2) half_cleaner[8]: compare 1-5, 2-6, 3-7 etc ..
-    zmm = cmp_merge<vtype>(
-            zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK6), zmm), 0xF0F0);
-    // 3) half_cleaner[4]
-    zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(1, 0, 3, 2)>(zmm),
-                           0xCCCC);
-    // 3) half_cleaner[1]
-    zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(zmm),
-                           0xAAAA);
-    return zmm;
-}
-
-// Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner
-template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-static inline void
-bitonic_merge_two_zmm(zmm_t *zmm1, zmm_t *zmm2)
-{
-    // 1) First step of a merging network: coex of zmm1 and zmm2 reversed
-    *zmm2 = vtype::permutexvar(_mm512_set_epi32(NETWORK5), *zmm2);
-    zmm_t zmm3 = vtype::min(*zmm1, *zmm2);
-    zmm_t zmm4 = vtype::max(*zmm1, *zmm2);
-    // 2) Recursive half cleaner for each
-    *zmm1 = bitonic_merge_zmm<vtype>(zmm3);
-    *zmm2 = bitonic_merge_zmm<vtype>(zmm4);
-}
-
-// Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive
-// half cleaner
-template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-static inline void
-bitonic_merge_four_zmm(zmm_t *zmm)
-{
-    zmm_t zmm2r = vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm[2]);
-    zmm_t zmm3r = vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm[3]);
-    zmm_t zmm_t1 = vtype::min(zmm[0], zmm3r);
-    zmm_t zmm_t2 = vtype::min(zmm[1], zmm2r);
-    zmm_t zmm_t3 = vtype::permutexvar(_mm512_set_epi32(NETWORK5),
-                                      vtype::max(zmm[1], zmm2r));
-    zmm_t zmm_t4 = vtype::permutexvar(_mm512_set_epi32(NETWORK5),
-                                      vtype::max(zmm[0], zmm3r));
-    zmm_t zmm0 = vtype::min(zmm_t1, zmm_t2);
-    zmm_t zmm1 = vtype::max(zmm_t1, zmm_t2);
-    zmm_t zmm2 = vtype::min(zmm_t3, zmm_t4);
-    zmm_t zmm3 = vtype::max(zmm_t3, zmm_t4);
-    zmm[0] = bitonic_merge_zmm<vtype>(zmm0);
-    zmm[1] = bitonic_merge_zmm<vtype>(zmm1);
-    zmm[2] = bitonic_merge_zmm<vtype>(zmm2);
-    zmm[3] = bitonic_merge_zmm<vtype>(zmm3);
-}
-
-template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-static inline void
-bitonic_merge_eight_zmm(zmm_t *zmm)
-{
-    zmm_t zmm4r = vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm[4]);
-    zmm_t zmm5r = vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm[5]);
-    zmm_t zmm6r = vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm[6]);
-    zmm_t zmm7r = vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm[7]);
-    zmm_t zmm_t1 = vtype::min(zmm[0], zmm7r);
-    zmm_t zmm_t2 = vtype::min(zmm[1], zmm6r);
-    zmm_t zmm_t3 = vtype::min(zmm[2], zmm5r);
-    zmm_t zmm_t4 = vtype::min(zmm[3], zmm4r);
-    zmm_t zmm_t5 = vtype::permutexvar(_mm512_set_epi32(NETWORK5),
-                                      vtype::max(zmm[3], zmm4r));
-    zmm_t zmm_t6 = vtype::permutexvar(_mm512_set_epi32(NETWORK5),
-                                      vtype::max(zmm[2], zmm5r));
-    zmm_t zmm_t7 = vtype::permutexvar(_mm512_set_epi32(NETWORK5),
-                                      vtype::max(zmm[1], zmm6r));
-    zmm_t zmm_t8 = vtype::permutexvar(_mm512_set_epi32(NETWORK5),
-                                      vtype::max(zmm[0], zmm7r));
-    COEX<vtype>(zmm_t1, zmm_t3);
-    COEX<vtype>(zmm_t2, zmm_t4);
-    COEX<vtype>(zmm_t5, zmm_t7);
-    COEX<vtype>(zmm_t6, zmm_t8);
-    COEX<vtype>(zmm_t1, zmm_t2);
-    COEX<vtype>(zmm_t3, zmm_t4);
-    COEX<vtype>(zmm_t5, zmm_t6);
-    COEX<vtype>(zmm_t7, zmm_t8);
-    zmm[0] = bitonic_merge_zmm<vtype>(zmm_t1);
-    zmm[1] = bitonic_merge_zmm<vtype>(zmm_t2);
-    zmm[2] = bitonic_merge_zmm<vtype>(zmm_t3);
-    zmm[3] = bitonic_merge_zmm<vtype>(zmm_t4);
-    zmm[4] = bitonic_merge_zmm<vtype>(zmm_t5);
-    zmm[5] = bitonic_merge_zmm<vtype>(zmm_t6);
-    zmm[6] = bitonic_merge_zmm<vtype>(zmm_t7);
-    zmm[7] = bitonic_merge_zmm<vtype>(zmm_t8);
-}
-
-template <typename vtype, typename type_t>
-static inline void
-sort_16(type_t *arr, npy_int N)
-{
-    __mmask16 load_mask = (0x0001 << N) - 0x0001;
-    typename vtype::zmm_t zmm =
-            vtype::mask_loadu(vtype::zmm_max(), load_mask, arr);
-    vtype::mask_storeu(arr, load_mask, sort_zmm<vtype>(zmm));
-}
-
-template <typename vtype, typename type_t>
-static inline void
-sort_32(type_t *arr, npy_int N)
-{
-    if (N <= 16) {
-        sort_16<vtype>(arr, N);
-        return;
-    }
-    using zmm_t = typename vtype::zmm_t;
-    zmm_t zmm1 = vtype::loadu(arr);
-    __mmask16 load_mask = (0x0001 << (N - 16)) - 0x0001;
-    zmm_t zmm2 = vtype::mask_loadu(vtype::zmm_max(), load_mask, arr + 16);
-    zmm1 = sort_zmm<vtype>(zmm1);
-    zmm2 = sort_zmm<vtype>(zmm2);
-    bitonic_merge_two_zmm<vtype>(&zmm1, &zmm2);
-    vtype::storeu(arr, zmm1);
-    vtype::mask_storeu(arr + 16, load_mask, zmm2);
-}
-
-template <typename vtype, typename type_t>
-static inline void
-sort_64(type_t *arr, npy_int N)
-{
-    if (N <= 32) {
-        sort_32<vtype>(arr, N);
-        return;
-    }
-    using zmm_t = typename vtype::zmm_t;
-    zmm_t zmm[4];
-    zmm[0] = vtype::loadu(arr);
-    zmm[1] = vtype::loadu(arr + 16);
-    __mmask16 load_mask1 = 0xFFFF, load_mask2 = 0xFFFF;
-    if (N < 48) {
-        load_mask1 = (0x0001 << (N - 32)) - 0x0001;
-        load_mask2 = 0x0000;
-    }
-    else if (N < 64) {
-        load_mask2 = (0x0001 << (N - 48)) - 0x0001;
-    }
-    zmm[2] = vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 32);
-    zmm[3] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 48);
-    zmm[0] = sort_zmm<vtype>(zmm[0]);
-    zmm[1] = sort_zmm<vtype>(zmm[1]);
-    zmm[2] = sort_zmm<vtype>(zmm[2]);
-    zmm[3] = sort_zmm<vtype>(zmm[3]);
-    bitonic_merge_two_zmm<vtype>(&zmm[0], &zmm[1]);
-    bitonic_merge_two_zmm<vtype>(&zmm[2], &zmm[3]);
-    bitonic_merge_four_zmm<vtype>(zmm);
-    vtype::storeu(arr, zmm[0]);
-    vtype::storeu(arr + 16, zmm[1]);
-    vtype::mask_storeu(arr + 32, load_mask1, zmm[2]);
-    vtype::mask_storeu(arr + 48, load_mask2, zmm[3]);
-}
-
-template <typename vtype, typename type_t>
-static inline void
-sort_128(type_t *arr, npy_int N)
-{
-    if (N <= 64) {
-        sort_64<vtype>(arr, N);
-        return;
-    }
-    using zmm_t = typename vtype::zmm_t;
-    zmm_t zmm[8];
-    zmm[0] = vtype::loadu(arr);
-    zmm[1] = vtype::loadu(arr + 16);
-    zmm[2] = vtype::loadu(arr + 32);
-    zmm[3] = vtype::loadu(arr + 48);
-    zmm[0] = sort_zmm<vtype>(zmm[0]);
-    zmm[1] = sort_zmm<vtype>(zmm[1]);
-    zmm[2] = sort_zmm<vtype>(zmm[2]);
-    zmm[3] = sort_zmm<vtype>(zmm[3]);
-    __mmask16 load_mask1 = 0xFFFF, load_mask2 = 0xFFFF;
-    __mmask16 load_mask3 = 0xFFFF, load_mask4 = 0xFFFF;
-    if (N < 80) {
-        load_mask1 = (0x0001 << (N - 64)) - 0x0001;
-        load_mask2 = 0x0000;
-        load_mask3 = 0x0000;
-        load_mask4 = 0x0000;
-    }
-    else if (N < 96) {
-        load_mask2 = (0x0001 << (N - 80)) - 0x0001;
-        load_mask3 = 0x0000;
-        load_mask4 = 0x0000;
-    }
-    else if (N < 112) {
-        load_mask3 = (0x0001 << (N - 96)) - 0x0001;
-        load_mask4 = 0x0000;
-    }
-    else {
-        load_mask4 = (0x0001 << (N - 112)) - 0x0001;
-    }
-    zmm[4] = vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 64);
-    zmm[5] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 80);
-    zmm[6] = vtype::mask_loadu(vtype::zmm_max(), load_mask3, arr + 96);
-    zmm[7] = vtype::mask_loadu(vtype::zmm_max(), load_mask4, arr + 112);
-    zmm[4] = sort_zmm<vtype>(zmm[4]);
-    zmm[5] = sort_zmm<vtype>(zmm[5]);
-    zmm[6] = sort_zmm<vtype>(zmm[6]);
-    zmm[7] = sort_zmm<vtype>(zmm[7]);
-    bitonic_merge_two_zmm<vtype>(&zmm[0], &zmm[1]);
-    bitonic_merge_two_zmm<vtype>(&zmm[2], &zmm[3]);
-    bitonic_merge_two_zmm<vtype>(&zmm[4], &zmm[5]);
-    bitonic_merge_two_zmm<vtype>(&zmm[6], &zmm[7]);
-    bitonic_merge_four_zmm<vtype>(zmm);
-    bitonic_merge_four_zmm<vtype>(zmm + 4);
-    bitonic_merge_eight_zmm<vtype>(zmm);
-    vtype::storeu(arr, zmm[0]);
-    vtype::storeu(arr + 16, zmm[1]);
-    vtype::storeu(arr + 32, zmm[2]);
-    vtype::storeu(arr + 48, zmm[3]);
-    vtype::mask_storeu(arr + 64, load_mask1, zmm[4]);
-    vtype::mask_storeu(arr + 80, load_mask2, zmm[5]);
-    vtype::mask_storeu(arr + 96, load_mask3, zmm[6]);
-    vtype::mask_storeu(arr + 112, load_mask4, zmm[7]);
-}
-
-template <typename type_t>
-static inline void
-swap(type_t *arr, npy_intp ii, npy_intp jj)
-{
-    type_t temp = arr[ii];
-    arr[ii] = arr[jj];
-    arr[jj] = temp;
-}
-
-// Median of 3 strategy
-// template<typename type_t>
-// static inline
-// npy_intp get_pivot_index(type_t *arr, const npy_intp left, const npy_intp
-// right) {
-//    return (rand() % (right + 1 - left)) + left;
-//    //npy_intp middle = ((right-left)/2) + left;
-//    //type_t a = arr[left], b = arr[middle], c = arr[right];
-//    //if ((b >= a && b <= c) || (b <= a && b >= c))
-//    //    return middle;
-//    //if ((a >= b && a <= c) || (a <= b && a >= c))
-//    //    return left;
-//    //else
-//    //    return right;
-//}
-
-/*
- * Picking the pivot: Median of 72 array elements chosen at random.
- */
-
-template <typename vtype, typename type_t>
-static inline type_t
-get_pivot(type_t *arr, const npy_intp left, const npy_intp right)
-{
-    /* seeds for vectorized random number generator */
-    __m256i s0 = _mm256_setr_epi64x(8265987198341093849, 3762817312854612374,
-                                    1324281658759788278, 6214952190349879213);
-    __m256i s1 = _mm256_setr_epi64x(2874178529384792648, 1257248936691237653,
-                                    7874578921548791257, 1998265912745817298);
-    s0 = _mm256_add_epi64(s0, _mm256_set1_epi64x(left));
-    s1 = _mm256_sub_epi64(s1, _mm256_set1_epi64x(right));
-
-    npy_intp arrsize = right - left + 1;
-    __m256i bound =
-            _mm256_set1_epi32(arrsize > INT32_MAX ? INT32_MAX : arrsize);
-    __m512i left_vec = _mm512_set1_epi64(left);
-    __m512i right_vec = _mm512_set1_epi64(right);
-    using ymm_t = typename vtype::ymm_t;
-    ymm_t v[9];
-    /* fill 9 vectors with random numbers */
-    for (npy_int i = 0; i < 9; ++i) {
-        __m256i rand_64 = vnext(&s0, &s1); /* vector with 4 random uint64_t */
-        __m512i rand_32 = _mm512_cvtepi32_epi64(rnd_epu32(
-                rand_64, bound)); /* random numbers between 0 and bound - 1 */
-        __m512i indices;
-        if (i < 5)
-            indices =
-                    _mm512_add_epi64(left_vec, rand_32); /* indices for arr */
-        else
-            indices =
-                    _mm512_sub_epi64(right_vec, rand_32); /* indices for arr */
-
-        v[i] = vtype::template i64gather<sizeof(type_t)>(indices, arr);
-    }
-
-    /* median network for 9 elements */
-    COEX<vtype>(v[0], v[1]);
-    COEX<vtype>(v[2], v[3]);
-    COEX<vtype>(v[4], v[5]);
-    COEX<vtype>(v[6], v[7]);
-    COEX<vtype>(v[0], v[2]);
-    COEX<vtype>(v[1], v[3]);
-    COEX<vtype>(v[4], v[6]);
-    COEX<vtype>(v[5], v[7]);
-    COEX<vtype>(v[0], v[4]);
-    COEX<vtype>(v[1], v[2]);
-    COEX<vtype>(v[5], v[6]);
-    COEX<vtype>(v[3], v[7]);
-    COEX<vtype>(v[1], v[5]);
-    COEX<vtype>(v[2], v[6]);
-    COEX<vtype>(v[3], v[5]);
-    COEX<vtype>(v[2], v[4]);
-    COEX<vtype>(v[3], v[4]);
-    COEX<vtype>(v[3], v[8]);
-    COEX<vtype>(v[4], v[8]);
-
-    // technically v[4] needs to be sorted before we pick the correct median,
-    // picking the 4th element works just as well for performance
-    type_t *temp = (type_t *)&v[4];
-
-    return temp[4];
-}
-
-/*
- * Partition one ZMM register based on the pivot and returns the index of the
- * last element that is less than equal to the pivot.
- */
-template <typename vtype, typename type_t, typename zmm_t>
-static inline npy_int
-partition_vec(type_t *arr, npy_intp left, npy_intp right, const zmm_t curr_vec,
-              const zmm_t pivot_vec, zmm_t *smallest_vec, zmm_t *biggest_vec)
-{
-    /* which elements are larger than the pivot */
-    __mmask16 gt_mask = vtype::ge(curr_vec, pivot_vec);
-    npy_int amount_gt_pivot = _mm_popcnt_u32((npy_int)gt_mask);
-    vtype::mask_compressstoreu(arr + left, _mm512_knot(gt_mask), curr_vec);
-    vtype::mask_compressstoreu(arr + right - amount_gt_pivot, gt_mask,
-                               curr_vec);
-    *smallest_vec = vtype::min(curr_vec, *smallest_vec);
-    *biggest_vec = vtype::max(curr_vec, *biggest_vec);
-    return amount_gt_pivot;
-}
-
-/*
- * Partition an array based on the pivot and returns the index of the
- * last element that is less than equal to the pivot.
- */
-template <typename vtype, typename type_t>
-static inline npy_intp
-partition_avx512(type_t *arr, npy_intp left, npy_intp right, type_t pivot,
-                 type_t *smallest, type_t *biggest)
-{
-    /* make array length divisible by 16 , shortening the array */
-    for (npy_int i = (right - left) % 16; i > 0; --i) {
-        *smallest = MIN(*smallest, arr[left]);
-        *biggest = MAX(*biggest, arr[left]);
-        if (arr[left] > pivot) {
-            swap(arr, left, --right);
-        }
-        else {
-            ++left;
-        }
-    }
-
-    if (left == right)
-        return left; /* less than 16 elements in the array */
-
-    using zmm_t = typename vtype::zmm_t;
-    zmm_t pivot_vec = vtype::set1(pivot);
-    zmm_t min_vec = vtype::set1(*smallest);
-    zmm_t max_vec = vtype::set1(*biggest);
-
-    if (right - left == 16) {
-        zmm_t vec = vtype::loadu(arr + left);
-        npy_int amount_gt_pivot = partition_vec<vtype>(
-                arr, left, left + 16, vec, pivot_vec, &min_vec, &max_vec);
-        *smallest = vtype::reducemin(min_vec);
-        *biggest = vtype::reducemax(max_vec);
-        return left + (16 - amount_gt_pivot);
-    }
-
-    // first and last 16 values are partitioned at the end
-    zmm_t vec_left = vtype::loadu(arr + left);
-    zmm_t vec_right = vtype::loadu(arr + (right - 16));
-    // store points of the vectors
-    npy_intp r_store = right - 16;
-    npy_intp l_store = left;
-    // indices for loading the elements
-    left += 16;
-    right -= 16;
-    while (right - left != 0) {
-        zmm_t curr_vec;
-        /*
-         * if fewer elements are stored on the right side of the array,
-         * then next elements are loaded from the right side,
-         * otherwise from the left side
-         */
-        if ((r_store + 16) - right < left - l_store) {
-            right -= 16;
-            curr_vec = vtype::loadu(arr + right);
-        }
-        else {
-            curr_vec = vtype::loadu(arr + left);
-            left += 16;
-        }
-        // partition the current vector and save it on both sides of the array
-        npy_int amount_gt_pivot =
-                partition_vec<vtype>(arr, l_store, r_store + 16, curr_vec,
-                                     pivot_vec, &min_vec, &max_vec);
-        ;
-        r_store -= amount_gt_pivot;
-        l_store += (16 - amount_gt_pivot);
-    }
-
-    /* partition and save vec_left and vec_right */
-    npy_int amount_gt_pivot =
-            partition_vec<vtype>(arr, l_store, r_store + 16, vec_left,
-                                 pivot_vec, &min_vec, &max_vec);
-    l_store += (16 - amount_gt_pivot);
-    amount_gt_pivot =
-            partition_vec<vtype>(arr, l_store, l_store + 16, vec_right,
-                                 pivot_vec, &min_vec, &max_vec);
-    l_store += (16 - amount_gt_pivot);
-    *smallest = vtype::reducemin(min_vec);
-    *biggest = vtype::reducemax(max_vec);
-    return l_store;
-}
-
-template <typename vtype, typename type_t>
-static inline void
-qsort_(type_t *arr, npy_intp left, npy_intp right, npy_int max_iters)
-{
-    /*
-     * Resort to heapsort if quicksort isn't making any progress
-     */
-    if (max_iters <= 0) {
-        heapsort_<typename vtype::tag>(arr + left, right + 1 - left);
-        return;
-    }
-    /*
-     * Base case: use bitonic networks to sort arrays <= 128
-     */
-    if (right + 1 - left <= 128) {
-        sort_128<vtype>(arr + left, (npy_int)(right + 1 - left));
-        return;
-    }
-
-    type_t pivot = get_pivot<vtype>(arr, left, right);
-    type_t smallest = vtype::type_max();
-    type_t biggest = vtype::type_min();
-    npy_intp pivot_index = partition_avx512<vtype>(arr, left, right + 1, pivot,
-                                                   &smallest, &biggest);
-    if (pivot != smallest)
-        qsort_<vtype>(arr, left, pivot_index - 1, max_iters - 1);
-    if (pivot != biggest)
-        qsort_<vtype>(arr, pivot_index, right, max_iters - 1);
-}
-
-static inline npy_intp
-replace_nan_with_inf(npy_float *arr, npy_intp arrsize)
-{
-    npy_intp nan_count = 0;
-    __mmask16 loadmask = 0xFFFF;
-    while (arrsize > 0) {
-        if (arrsize < 16) {
-            loadmask = (0x0001 << arrsize) - 0x0001;
-        }
-        __m512 in_zmm = _mm512_maskz_loadu_ps(loadmask, arr);
-        __mmask16 nanmask = _mm512_cmp_ps_mask(in_zmm, in_zmm, _CMP_NEQ_UQ);
-        nan_count += _mm_popcnt_u32((npy_int)nanmask);
-        _mm512_mask_storeu_ps(arr, nanmask, ZMM_MAX_FLOAT);
-        arr += 16;
-        arrsize -= 16;
-    }
-    return nan_count;
-}
-
-static inline void
-replace_inf_with_nan(npy_float *arr, npy_intp arrsize, npy_intp nan_count)
-{
-    for (npy_intp ii = arrsize - 1; nan_count > 0; --ii) {
-        arr[ii] = NPY_NANF;
-        nan_count -= 1;
-    }
-}
-
-/***************************************
- * C > C++ dispatch
- ***************************************/
-
-NPY_NO_EXPORT void
-NPY_CPU_DISPATCH_CURFX(x86_quicksort_int)(void *arr, npy_intp arrsize)
-{
-    if (arrsize > 1) {
-        qsort_<vector<npy_int>, npy_int>((npy_int *)arr, 0, arrsize - 1,
-                                         2 * (npy_int)log2(arrsize));
-    }
-}
-
-NPY_NO_EXPORT void
-NPY_CPU_DISPATCH_CURFX(x86_quicksort_uint)(void *arr, npy_intp arrsize)
-{
-    if (arrsize > 1) {
-        qsort_<vector<npy_uint>, npy_uint>((npy_uint *)arr, 0, arrsize - 1,
-                                           2 * (npy_int)log2(arrsize));
-    }
-}
-
-NPY_NO_EXPORT void
-NPY_CPU_DISPATCH_CURFX(x86_quicksort_float)(void *arr, npy_intp arrsize)
-{
-    if (arrsize > 1) {
-        npy_intp nan_count = replace_nan_with_inf((npy_float *)arr, arrsize);
-        qsort_<vector<npy_float>, npy_float>((npy_float *)arr, 0, arrsize - 1,
-                                             2 * (npy_int)log2(arrsize));
-        replace_inf_with_nan((npy_float *)arr, arrsize, nan_count);
-    }
-}
-
-#endif  // NPY_HAVE_AVX512_SKX
diff --git a/numpy/core/src/npysort/x86-qsort.h b/numpy/core/src/npysort/x86-qsort.h
deleted file mode 100644
index 6340e2bc7..000000000
--- a/numpy/core/src/npysort/x86-qsort.h
+++ /dev/null
@@ -1,28 +0,0 @@
-#include "numpy/npy_common.h"
-
-#include "npy_cpu_dispatch.h"
-
-#ifndef NPY_NO_EXPORT
-#define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN
-#endif
-
-#ifndef NPY_DISABLE_OPTIMIZATION
-#include "x86-qsort.dispatch.h"
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_int,
-                         (void *start, npy_intp num))
-
-NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_uint,
-                         (void *start, npy_intp num))
-
-NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_float,
-                         (void *start, npy_intp num))
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/numpy/core/src/npysort/x86-simd-sort b/numpy/core/src/npysort/x86-simd-sort
new file mode 160000
+Subproject 1735e86cda95a469357a19ab8984ad8530372e7
diff --git a/numpy/core/src/umath/_rational_tests.c b/numpy/core/src/umath/_rational_tests.c
index bf50a2226..391c5f4e1 100644
--- a/numpy/core/src/umath/_rational_tests.c
+++ b/numpy/core/src/umath/_rational_tests.c
@@ -49,7 +49,7 @@ set_zero_divide(void) {
 
 /* Integer arithmetic utilities */
 
-static NPY_INLINE npy_int32
+static inline npy_int32
 safe_neg(npy_int32 x) {
     if (x==(npy_int32)1<<31) {
         set_overflow();
@@ -57,7 +57,7 @@ safe_neg(npy_int32 x) {
     return -x;
 }
 
-static NPY_INLINE npy_int32
+static inline npy_int32
 safe_abs32(npy_int32 x) {
     npy_int32 nx;
     if (x>=0) {
@@ -70,7 +70,7 @@ safe_abs32(npy_int32 x) {
     return nx;
 }
 
-static NPY_INLINE npy_int64
+static inline npy_int64
 safe_abs64(npy_int64 x) {
     npy_int64 nx;
     if (x>=0) {
@@ -83,7 +83,7 @@ safe_abs64(npy_int64 x) {
     return nx;
 }
 
-static NPY_INLINE npy_int64
+static inline npy_int64
 gcd(npy_int64 x, npy_int64 y) {
     x = safe_abs64(x);
     y = safe_abs64(y);
@@ -102,7 +102,7 @@ gcd(npy_int64 x, npy_int64 y) {
     return x;
 }
 
-static NPY_INLINE npy_int64
+static inline npy_int64
 lcm(npy_int64 x, npy_int64 y) {
     npy_int64 lcm;
     if (!x || !y) {
@@ -128,7 +128,7 @@ typedef struct {
     npy_int32 dmm;
 } rational;
 
-static NPY_INLINE rational
+static inline rational
 make_rational_int(npy_int64 n) {
     rational r = {(npy_int32)n,0};
     if (r.n != n) {
@@ -164,7 +164,7 @@ make_rational_slow(npy_int64 n_, npy_int64 d_) {
     return r;
 }
 
-static NPY_INLINE npy_int32
+static inline npy_int32
 d(rational r) {
     return r.dmm+1;
 }
@@ -184,7 +184,7 @@ make_rational_fast(npy_int64 n_, npy_int64 d_) {
     return r;
 }
 
-static NPY_INLINE rational
+static inline rational
 rational_negative(rational r) {
     rational x;
     x.n = safe_neg(r.n);
@@ -192,7 +192,7 @@ rational_negative(rational r) {
     return x;
 }
 
-static NPY_INLINE rational
+static inline rational
 rational_add(rational x, rational y) {
     /*
      * Note that the numerator computation can never overflow int128_t,
@@ -202,25 +202,25 @@ rational_add(rational x, rational y) {
         (npy_int64)d(x)*d(y));
 }
 
-static NPY_INLINE rational
+static inline rational
 rational_subtract(rational x, rational y) {
     /* We're safe from overflow as with + */
     return make_rational_fast((npy_int64)x.n*d(y)-(npy_int64)d(x)*y.n,
         (npy_int64)d(x)*d(y));
 }
 
-static NPY_INLINE rational
+static inline rational
 rational_multiply(rational x, rational y) {
     /* We're safe from overflow as with + */
     return make_rational_fast((npy_int64)x.n*y.n,(npy_int64)d(x)*d(y));
 }
 
-static NPY_INLINE rational
+static inline rational
 rational_divide(rational x, rational y) {
     return make_rational_slow((npy_int64)x.n*d(y),(npy_int64)d(x)*y.n);
 }
 
-static NPY_INLINE npy_int64
+static inline npy_int64
 rational_floor(rational x) {
     /* Always round down */
     if (x.n>=0) {
@@ -233,18 +233,18 @@ rational_floor(rational x) {
     return -((-(npy_int64)x.n+d(x)-1)/d(x));
 }
 
-static NPY_INLINE npy_int64
+static inline npy_int64
 rational_ceil(rational x) {
     return -rational_floor(rational_negative(x));
 }
 
-static NPY_INLINE rational
+static inline rational
 rational_remainder(rational x, rational y) {
     return rational_subtract(x, rational_multiply(y,make_rational_int(
                     rational_floor(rational_divide(x,y)))));
 }
 
-static NPY_INLINE rational
+static inline rational
 rational_abs(rational x) {
     rational y;
     y.n = safe_abs32(x.n);
@@ -252,7 +252,7 @@ rational_abs(rational x) {
     return y;
 }
 
-static NPY_INLINE npy_int64
+static inline npy_int64
 rational_rint(rational x) {
     /*
      * Round towards nearest integer, moving exact half integers towards
@@ -262,12 +262,12 @@ rational_rint(rational x) {
     return (2*(npy_int64)x.n+(x.n<0?-d_:d_))/(2*(npy_int64)d_);
 }
 
-static NPY_INLINE int
+static inline int
 rational_sign(rational x) {
     return x.n<0?-1:x.n==0?0:1;
 }
 
-static NPY_INLINE rational
+static inline rational
 rational_inverse(rational x) {
     rational y = {0};
     if (!x.n) {
@@ -286,7 +286,7 @@ rational_inverse(rational x) {
     return y;
 }
 
-static NPY_INLINE int
+static inline int
 rational_eq(rational x, rational y) {
     /*
      * Since we enforce d > 0, and store fractions in reduced form,
@@ -295,42 +295,42 @@ rational_eq(rational x, rational y) {
     return x.n==y.n && x.dmm==y.dmm;
 }
 
-static NPY_INLINE int
+static inline int
 rational_ne(rational x, rational y) {
     return !rational_eq(x,y);
 }
 
-static NPY_INLINE int
+static inline int
 rational_lt(rational x, rational y) {
     return (npy_int64)x.n*d(y) < (npy_int64)y.n*d(x);
 }
 
-static NPY_INLINE int
+static inline int
 rational_gt(rational x, rational y) {
     return rational_lt(y,x);
 }
 
-static NPY_INLINE int
+static inline int
 rational_le(rational x, rational y) {
     return !rational_lt(y,x);
 }
 
-static NPY_INLINE int
+static inline int
 rational_ge(rational x, rational y) {
     return !rational_lt(x,y);
 }
 
-static NPY_INLINE npy_int32
+static inline npy_int32
 rational_int(rational x) {
     return x.n/d(x);
 }
 
-static NPY_INLINE double
+static inline double
 rational_double(rational x) {
     return (double)x.n/d(x);
 }
 
-static NPY_INLINE int
+static inline int
 rational_nonzero(rational x) {
     return x.n!=0;
 }
@@ -367,7 +367,7 @@ typedef struct {
 
 static PyTypeObject PyRational_Type;
 
-static NPY_INLINE int
+static inline int
 PyRational_Check(PyObject* object) {
     return PyObject_IsInstance(object,(PyObject*)&PyRational_Type);
 }
@@ -753,7 +753,7 @@ npyrational_setitem(PyObject* item, void* data, void* arr) {
     return 0;
 }
 
-static NPY_INLINE void
+static inline void
 byteswap(npy_int32* x) {
     char* p = (char*)x;
     size_t i;
@@ -996,7 +996,7 @@ UNARY_UFUNC(reciprocal,rational,rational_inverse(x))
 UNARY_UFUNC(numerator,npy_int64,x.n)
 UNARY_UFUNC(denominator,npy_int64,d(x))
 
-static NPY_INLINE void
+static inline void
 rational_matrix_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps)
 {
     /* pointers to data for input and output arrays */
diff --git a/numpy/core/src/umath/_scaled_float_dtype.c b/numpy/core/src/umath/_scaled_float_dtype.c
index a214b32aa..c26ace9f1 100644
--- a/numpy/core/src/umath/_scaled_float_dtype.c
+++ b/numpy/core/src/umath/_scaled_float_dtype.c
@@ -26,6 +26,14 @@
 #include "dispatching.h"
 
 
+/* TODO: from wrapping_array_method.c, use proper public header eventually  */
+NPY_NO_EXPORT int
+PyUFunc_AddWrappingLoop(PyObject *ufunc_obj,
+        PyArray_DTypeMeta *new_dtypes[], PyArray_DTypeMeta *wrapped_dtypes[],
+        translate_given_descrs_func *translate_given_descrs,
+        translate_loop_descrs_func *translate_loop_descrs);
+
+
 typedef struct {
     PyArray_Descr base;
     double scaling;
@@ -125,9 +133,9 @@ sfloat_setitem(PyObject *obj, char *data, PyArrayObject *arr)
 
 /* Special DType methods and the descr->f slot storage */
 NPY_DType_Slots sfloat_slots = {
-    .default_descr = &sfloat_default_descr,
     .discover_descr_from_pyobject = &sfloat_discover_from_pyobject,
     .is_known_scalar_type = &sfloat_is_known_scalar_type,
+    .default_descr = &sfloat_default_descr,
     .common_dtype = &sfloat_common_dtype,
     .common_instance = &sfloat_common_instance,
     .f = {
@@ -136,14 +144,13 @@ NPY_DType_Slots sfloat_slots = {
     }
 };
 
-
 static PyArray_SFloatDescr SFloatSingleton = {{
-        .elsize = sizeof(double),
-        .alignment = _ALIGN(double),
+        .byteorder = '|',  /* do not bother with byte-swapping... */
         .flags = NPY_USE_GETITEM|NPY_USE_SETITEM,
         .type_num = -1,
+        .elsize = sizeof(double),
+        .alignment = _ALIGN(double),
         .f = &sfloat_slots.f,
-        .byteorder = '|',  /* do not bother with byte-swapping... */
     },
     .scaling = 1,
 };
@@ -233,15 +240,15 @@ sfloat_repr(PyArray_SFloatDescr *self)
 static PyArray_DTypeMeta PyArray_SFloatDType = {{{
         PyVarObject_HEAD_INIT(NULL, 0)
         .tp_name = "numpy._ScaledFloatTestDType",
-        .tp_methods = sfloat_methods,
-        .tp_new = sfloat_new,
+        .tp_basicsize = sizeof(PyArray_SFloatDescr),
         .tp_repr = (reprfunc)sfloat_repr,
         .tp_str = (reprfunc)sfloat_repr,
-        .tp_basicsize = sizeof(PyArray_SFloatDescr),
+        .tp_methods = sfloat_methods,
+        .tp_new = sfloat_new,
     }},
     .type_num = -1,
     .scalar_type = NULL,
-    .flags = NPY_DT_PARAMETRIC,
+    .flags = NPY_DT_PARAMETRIC | NPY_DT_NUMERIC,
     .dt_slots = &sfloat_slots,
 };
 
@@ -440,7 +447,7 @@ sfloat_to_bool_resolve_descriptors(
 
 
 static int
-init_casts(void)
+sfloat_init_casts(void)
 {
     PyArray_DTypeMeta *dtypes[2] = {&PyArray_SFloatDType, &PyArray_SFloatDType};
     PyType_Slot slots[4] = {{0, NULL}};
@@ -448,11 +455,11 @@ init_casts(void)
         .name = "sfloat_to_sfloat_cast",
         .nin = 1,
         .nout = 1,
+        /* minimal guaranteed casting */
+        .casting = NPY_SAME_KIND_CASTING,
         .flags = NPY_METH_SUPPORTS_UNALIGNED,
         .dtypes = dtypes,
         .slots = slots,
-        /* minimal guaranteed casting */
-        .casting = NPY_SAME_KIND_CASTING,
     };
 
     slots[0].slot = NPY_METH_resolve_descriptors;
@@ -646,13 +653,55 @@ add_sfloats_resolve_descriptors(
 }
 
 
+/*
+ * We define the hypot loop using the "PyUFunc_AddWrappingLoop" API.
+ * We use this very narrowly for mapping to the double hypot loop currently.
+ */
 static int
-add_loop(const char *ufunc_name,
-        PyArray_DTypeMeta *dtypes[3], PyObject *meth_or_promoter)
+translate_given_descrs_to_double(
+        int nin, int nout, PyArray_DTypeMeta *wrapped_dtypes[],
+        PyArray_Descr *given_descrs[], PyArray_Descr *new_descrs[])
+{
+    assert(nin == 2 && nout == 1);
+    for (int i = 0; i < 3; i++) {
+        if (given_descrs[i] == NULL) {
+            new_descrs[i] = NULL;
+        }
+        else {
+            new_descrs[i] = PyArray_DescrFromType(NPY_DOUBLE);
+        }
+    }
+    return 0;
+}
+
+
+static int
+translate_loop_descrs(
+        int nin, int nout, PyArray_DTypeMeta *new_dtypes[],
+        PyArray_Descr *given_descrs[],
+        PyArray_Descr *NPY_UNUSED(original_descrs[]),
+        PyArray_Descr *loop_descrs[])
+{
+    assert(nin == 2 && nout == 1);
+    loop_descrs[0] = sfloat_common_instance(
+            given_descrs[0], given_descrs[1]);
+    if (loop_descrs[0] == 0) {
+        return -1;
+    }
+    Py_INCREF(loop_descrs[0]);
+    loop_descrs[1] = loop_descrs[0];
+    Py_INCREF(loop_descrs[0]);
+    loop_descrs[2] = loop_descrs[0];
+    return 0;
+}
+
+
+static PyObject *
+sfloat_get_ufunc(const char *ufunc_name)
 {
     PyObject *mod = PyImport_ImportModule("numpy");
     if (mod == NULL) {
-        return -1;
+        return NULL;
     }
     PyObject *ufunc = PyObject_GetAttrString(mod, ufunc_name);
     Py_DECREF(mod);
@@ -660,6 +709,18 @@ add_loop(const char *ufunc_name,
         Py_DECREF(ufunc);
         PyErr_Format(PyExc_TypeError,
                 "numpy.%s was not a ufunc!", ufunc_name);
+        return NULL;
+    }
+    return ufunc;
+}
+
+
+static int
+sfloat_add_loop(const char *ufunc_name,
+        PyArray_DTypeMeta *dtypes[3], PyObject *meth_or_promoter)
+{
+    PyObject *ufunc = sfloat_get_ufunc(ufunc_name);
+    if (ufunc == NULL) {
         return -1;
     }
     PyObject *dtype_tup = PyArray_TupleFromItems(3, (PyObject **)dtypes, 1);
@@ -680,6 +741,24 @@ add_loop(const char *ufunc_name,
 }
 
 
+static int
+sfloat_add_wrapping_loop(const char *ufunc_name, PyArray_DTypeMeta *dtypes[3])
+{
+    PyObject *ufunc = sfloat_get_ufunc(ufunc_name);
+    if (ufunc == NULL) {
+        return -1;
+    }
+    PyArray_DTypeMeta *double_dt = PyArray_DTypeFromTypeNum(NPY_DOUBLE);
+    PyArray_DTypeMeta *wrapped_dtypes[3] = {double_dt, double_dt, double_dt};
+    int res = PyUFunc_AddWrappingLoop(
+        ufunc, dtypes, wrapped_dtypes, &translate_given_descrs_to_double,
+        &translate_loop_descrs);
+    Py_DECREF(ufunc);
+    Py_DECREF(double_dt);
+
+    return res;
+}
+
 
 /*
  * We add some very basic promoters to allow multiplying normal and scaled
@@ -707,7 +786,7 @@ promote_to_sfloat(PyUFuncObject *NPY_UNUSED(ufunc),
  * get less so with the introduction of public API).
  */
 static int
-init_ufuncs(void) {
+sfloat_init_ufuncs(void) {
     PyArray_DTypeMeta *dtypes[3] = {
             &PyArray_SFloatDType, &PyArray_SFloatDType, &PyArray_SFloatDType};
     PyType_Slot slots[3] = {{0, NULL}};
@@ -728,7 +807,7 @@ init_ufuncs(void) {
     if (bmeth == NULL) {
         return -1;
     }
-    int res = add_loop("multiply",
+    int res = sfloat_add_loop("multiply",
             bmeth->dtypes, (PyObject *)bmeth->method);
     Py_DECREF(bmeth);
     if (res < 0) {
@@ -746,13 +825,18 @@ init_ufuncs(void) {
     if (bmeth == NULL) {
         return -1;
     }
-    res = add_loop("add",
+    res = sfloat_add_loop("add",
             bmeth->dtypes, (PyObject *)bmeth->method);
     Py_DECREF(bmeth);
     if (res < 0) {
         return -1;
     }
 
+    /* N.B.: Wrapping isn't actually correct if scaling can be negative */
+    if (sfloat_add_wrapping_loop("hypot", dtypes) < 0) {
+        return -1;
+    }
+
     /*
      * Add a promoter for both directions of multiply with double.
      */
@@ -767,14 +851,14 @@ init_ufuncs(void) {
     if (promoter == NULL) {
         return -1;
     }
-    res = add_loop("multiply", promoter_dtypes, promoter);
+    res = sfloat_add_loop("multiply", promoter_dtypes, promoter);
     if (res < 0) {
         Py_DECREF(promoter);
         return -1;
     }
     promoter_dtypes[0] = double_DType;
     promoter_dtypes[1] = &PyArray_SFloatDType;
-    res = add_loop("multiply", promoter_dtypes, promoter);
+    res = sfloat_add_loop("multiply", promoter_dtypes, promoter);
     Py_DECREF(promoter);
     if (res < 0) {
         return -1;
@@ -815,11 +899,11 @@ get_sfloat_dtype(PyObject *NPY_UNUSED(mod), PyObject *NPY_UNUSED(args))
         return NULL;
     }
 
-    if (init_casts() < 0) {
+    if (sfloat_init_casts() < 0) {
         return NULL;
     }
 
-    if (init_ufuncs() < 0) {
+    if (sfloat_init_ufuncs() < 0) {
         return NULL;
     }
 
diff --git a/numpy/core/src/umath/_umath_tests.c.src b/numpy/core/src/umath/_umath_tests.c.src
index 1bf459ce6..b427991e5 100644
--- a/numpy/core/src/umath/_umath_tests.c.src
+++ b/numpy/core/src/umath/_umath_tests.c.src
@@ -9,6 +9,9 @@
 #include <Python.h>
 
 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#if defined(NPY_INTERNAL_BUILD)
+#undef NPY_INTERNAL_BUILD
+#endif
 #include "numpy/arrayobject.h"
 #include "numpy/ufuncobject.h"
 #include "numpy/npy_math.h"
@@ -19,6 +22,9 @@
 #include "npy_cpu_features.h"
 #include "npy_cpu_dispatch.h"
 #include "numpy/npy_cpu.h"
+#include "npy_import.h"
+#include "numpy/experimental_dtype_api.h"
+
 
 /*
  *****************************************************************************
@@ -300,7 +306,7 @@ static void
                     ptr_this += stride_d;
                     ptr_that += stride_d;
                 }
-                *(@typ@ *)data_out = npy_@sqrt_func@(out);
+                *(@typ@ *)data_out = @sqrt_func@(out);
                 data_that += stride_n;
                 data_out += stride_p;
             }
@@ -343,6 +349,50 @@ static void
 
 /**end repeat**/
 
+static int
+INT32_negative(PyArrayMethod_Context *NPY_UNUSED(context),
+               char **args, npy_intp const *dimensions,
+               npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    npy_intp di = dimensions[0];
+    npy_intp i;
+    npy_intp is=steps[0], os=steps[1];
+    char *ip=args[0], *op=args[1];
+    for (i = 0; i < di; i++, ip += is, op += os) {
+        if (i == 3) {
+            *(int32_t *)op = - 100;
+        } else {
+            *(int32_t *)op = - *(int32_t *)ip;
+        }
+    }
+    return 0;
+}
+
+
+static int
+INT32_negative_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                           char * const*args, npy_intp const *dimensions,
+                           npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    char *ip1 = args[0];
+    char *indx = args[1];
+    npy_intp is1 = steps[0], isindex = steps[1];
+    npy_intp n = dimensions[0];
+    npy_intp i;
+    int32_t *indexed;
+    for(i = 0; i < n; i++, indx += isindex) {
+        indexed = (int32_t *)(ip1 + is1 * *(npy_intp *)indx);
+        if (i == 3) {
+            *indexed = -200;
+        } else {
+            *indexed = - *indexed;
+        }
+    }
+    return 0;
+}
+
+
+
 /*  The following lines were generated using a slightly modified
     version of code_generators/generate_umath.py and adding these
     lines to defdict:
@@ -671,6 +721,43 @@ err:
     return NULL;
 }
 
+static int
+add_INT32_negative_indexed(PyObject *module, PyObject *dict) {
+    if (import_experimental_dtype_api(__EXPERIMENTAL_DTYPE_API_VERSION) < 0) {
+        return -1;
+    }
+
+    PyObject * negative = PyUFunc_FromFuncAndData(NULL, NULL, NULL, 0, 1, 1,
+                                    PyUFunc_Zero, "indexed_negative", NULL, 0);
+    if (negative == NULL) {
+        return -1;
+    }
+    PyArray_DTypeMeta *dtypes[] = {&PyArray_Int32DType, &PyArray_Int32DType};
+
+    PyType_Slot slots[] = {
+        {NPY_METH_contiguous_indexed_loop, INT32_negative_indexed},
+        {NPY_METH_strided_loop, INT32_negative},
+        {0, NULL}
+    };
+
+    PyArrayMethod_Spec spec = {
+        .name = "negative_indexed_loop",
+        .nin = 1,
+        .nout = 1,
+        .dtypes = dtypes,
+        .slots = slots,
+        .flags = NPY_METH_NO_FLOATINGPOINT_ERRORS
+    };
+
+    if (PyUFunc_AddLoopFromSpec(negative, &spec) < 0) {
+        Py_DECREF(negative);
+        return -1;
+    }
+    PyDict_SetItemString(dict, "indexed_negative", negative);
+    Py_DECREF(negative);
+    return 0;
+}
+
 static PyMethodDef UMath_TestsMethods[] = {
     {"test_signature",  UMath_Tests_test_signature, METH_VARARGS,
      "Test signature parsing of ufunc. \n"
@@ -733,5 +820,13 @@ PyMODINIT_FUNC PyInit__umath_tests(void) {
                         "cannot load _umath_tests module.");
         return NULL;
     }
+
+    if (add_INT32_negative_indexed(m, d) < 0) {
+        Py_DECREF(m);
+        PyErr_Print();
+        PyErr_SetString(PyExc_RuntimeError,
+                        "cannot load _umath_tests module.");
+        return NULL;
+    }
     return m;
 }
diff --git a/numpy/core/src/umath/dispatching.c b/numpy/core/src/umath/dispatching.c
index 79de6c3c8..fee77b2d6 100644
--- a/numpy/core/src/umath/dispatching.c
+++ b/numpy/core/src/umath/dispatching.c
@@ -43,6 +43,7 @@
 #include <convert_datatype.h>
 
 #include "numpy/ndarraytypes.h"
+#include "numpy/npy_3kcompat.h"
 #include "common.h"
 
 #include "dispatching.h"
@@ -58,7 +59,7 @@
 
 
 /* forward declaration */
-static NPY_INLINE PyObject *
+static inline PyObject *
 promote_and_get_info_and_ufuncimpl(PyUFuncObject *ufunc,
         PyArrayObject *const ops[],
         PyArray_DTypeMeta *signature[],
@@ -667,12 +668,9 @@ legacy_promote_using_legacy_type_resolver(PyUFuncObject *ufunc,
         Py_DECREF(out_descrs[i]);
     }
     /*
-     * The PyUFunc_SimpleBinaryComparisonTypeResolver has a deprecation
-     * warning (ignoring `dtype=`) and cannot be cached.
-     * All datetime ones *should* have a warning, but currently don't,
-     * but ignore all signature passing also.  So they can also
-     * not be cached, and they mutate the signature which of course is wrong,
-     * but not doing it would confuse the code later.
+     * datetime legacy resolvers ignore the signature, which should be
+     * warn/raise (when used).  In such cases, the signature is (incorrectly)
+     * mutated, and caching is not possible.
      */
     for (int i = 0; i < nargs; i++) {
         if (signature[i] != NULL && signature[i] != operation_DTypes[i]) {
@@ -728,7 +726,7 @@ add_and_return_legacy_wrapping_ufunc_loop(PyUFuncObject *ufunc,
  * If value-based promotion is necessary, this is handled ahead of time by
  * `promote_and_get_ufuncimpl`.
  */
-static NPY_INLINE PyObject *
+static inline PyObject *
 promote_and_get_info_and_ufuncimpl(PyUFuncObject *ufunc,
         PyArrayObject *const ops[],
         PyArray_DTypeMeta *signature[],
@@ -916,8 +914,8 @@ promote_and_get_ufuncimpl(PyUFuncObject *ufunc,
     int nin = ufunc->nin, nargs = ufunc->nargs;
 
     /*
-     * Get the actual DTypes we operate with by mixing the operand array
-     * ones with the passed signature.
+     * Get the actual DTypes we operate with by setting op_dtypes[i] from
+     * signature[i].
      */
     for (int i = 0; i < nargs; i++) {
         if (signature[i] != NULL) {
@@ -950,7 +948,7 @@ promote_and_get_ufuncimpl(PyUFuncObject *ufunc,
         int cacheable = 1;  /* unused, as we modify the original `op_dtypes` */
         if (legacy_promote_using_legacy_type_resolver(ufunc,
                 ops, signature, op_dtypes, &cacheable, NPY_FALSE) < 0) {
-            return NULL;
+            goto handle_error;
         }
     }
 
@@ -962,10 +960,7 @@ promote_and_get_ufuncimpl(PyUFuncObject *ufunc,
     npy_promotion_state = old_promotion_state;
 
     if (info == NULL) {
-        if (!PyErr_Occurred()) {
-            raise_no_loop_found_error(ufunc, (PyObject **)op_dtypes);
-        }
-        return NULL;
+        goto handle_error;
     }
 
     PyArrayMethodObject *method = (PyArrayMethodObject *)PyTuple_GET_ITEM(info, 1);
@@ -987,7 +982,7 @@ promote_and_get_ufuncimpl(PyUFuncObject *ufunc,
         /* Reset the promotion state: */
         npy_promotion_state = NPY_USE_WEAK_PROMOTION_AND_WARN;
         if (res < 0) {
-            return NULL;
+            goto handle_error;
         }
     }
 
@@ -1021,12 +1016,29 @@ promote_and_get_ufuncimpl(PyUFuncObject *ufunc,
              * If signature is forced the cache may contain an incompatible
              * loop found via promotion (signature not enforced).  Reject it.
              */
-            raise_no_loop_found_error(ufunc, (PyObject **)op_dtypes);
-            return NULL;
+            goto handle_error;
         }
     }
 
     return method;
+
+  handle_error:
+    /* We only set the "no loop found error here" */
+    if (!PyErr_Occurred()) {
+        raise_no_loop_found_error(ufunc, (PyObject **)op_dtypes);
+    }
+    /*
+     * Otherwise an error occurred, but if the error was DTypePromotionError
+     * then we chain it, because DTypePromotionError effectively means that there
+     * is no loop available.  (We failed finding a loop by using promotion.)
+     */
+    else if (PyErr_ExceptionMatches(npy_DTypePromotionError)) {
+        PyObject *err_type = NULL, *err_value = NULL, *err_traceback = NULL;
+        PyErr_Fetch(&err_type, &err_value, &err_traceback);
+        raise_no_loop_found_error(ufunc, (PyObject **)op_dtypes);
+        npy_PyErr_ChainExceptionsCause(err_type, err_value, err_traceback);
+    }
+    return NULL;
 }
 
 
@@ -1042,13 +1054,6 @@ default_ufunc_promoter(PyUFuncObject *ufunc,
         PyArray_DTypeMeta *op_dtypes[], PyArray_DTypeMeta *signature[],
         PyArray_DTypeMeta *new_op_dtypes[])
 {
-    if (ufunc->type_resolver == &PyUFunc_SimpleBinaryComparisonTypeResolver
-            && signature[0] == NULL && signature[1] == NULL
-            && signature[2] != NULL && signature[2]->type_num != NPY_BOOL) {
-        /* bail out, this is _only_ to give future/deprecation warning! */
-        return -1;
-    }
-
     /* If nin < 2 promotion is a no-op, so it should not be registered */
     assert(ufunc->nin > 1);
     if (op_dtypes[0] == NULL) {
@@ -1235,3 +1240,40 @@ install_logical_ufunc_promoter(PyObject *ufunc)
 
     return PyUFunc_AddLoop((PyUFuncObject *)ufunc, info, 0);
 }
+
+/*
+ * Return the PyArrayMethodObject or PyCapsule that matches a registered
+ * tuple of identical dtypes. Return a borrowed ref of the first match.
+ */
+NPY_NO_EXPORT PyObject *
+get_info_no_cast(PyUFuncObject *ufunc, PyArray_DTypeMeta *op_dtype,
+                 int ndtypes)
+{
+    PyObject *t_dtypes = PyTuple_New(ndtypes);
+    if (t_dtypes == NULL) {
+        return NULL;
+    }
+    for (int i=0; i < ndtypes; i++) {
+        PyTuple_SetItem(t_dtypes, i, (PyObject *)op_dtype);
+    }
+    PyObject *loops = ufunc->_loops;
+    Py_ssize_t length = PyList_Size(loops);
+    for (Py_ssize_t i = 0; i < length; i++) {
+        PyObject *item = PyList_GetItem(loops, i);
+        PyObject *cur_DType_tuple = PyTuple_GetItem(item, 0);
+        int cmp = PyObject_RichCompareBool(cur_DType_tuple,
+                                           t_dtypes, Py_EQ);
+        if (cmp < 0) {
+            Py_DECREF(t_dtypes);
+            return NULL;
+        }
+        if (cmp == 0) {
+            continue;
+        }
+        /* Got the match */
+        Py_DECREF(t_dtypes);
+        return PyTuple_GetItem(item, 1);
+    }
+    Py_DECREF(t_dtypes);
+    Py_RETURN_NONE;
+}
diff --git a/numpy/core/src/umath/fast_loop_macros.h b/numpy/core/src/umath/fast_loop_macros.h
index cbd1f04aa..b8c1926b2 100644
--- a/numpy/core/src/umath/fast_loop_macros.h
+++ b/numpy/core/src/umath/fast_loop_macros.h
@@ -12,6 +12,19 @@
 
 #include <assert.h>
 
+#include "simd/simd.h"
+
+/*
+ * largest simd vector size in bytes numpy supports
+ * it is currently a extremely large value as it is only used for memory
+ * overlap checks
+ */
+#if NPY_SIMD > 0
+    // Enough for compiler unroll
+    #define AUTOVEC_OVERLAP_SIZE NPY_SIMD_WIDTH*4
+#else
+    #define AUTOVEC_OVERLAP_SIZE 1024
+#endif
 /*
  * MAX_STEP_SIZE is used to determine if we need to use SIMD version of the ufunc.
  * Very large step size can be as slow as processing it using scalar. The
@@ -27,7 +40,7 @@
  */
 #define MAX_STEP_SIZE 2097152
 
-static NPY_INLINE npy_uintp
+static inline npy_uintp
 abs_ptrdiff(char *a, char *b)
 {
     return (a > b) ? (a - b) : (b - a);
@@ -219,11 +232,11 @@ abs_ptrdiff(char *a, char *b)
         /* condition allows compiler to optimize the generic macro */ \
         if (IS_BINARY_CONT(tin, tout)) { \
             if (abs_ptrdiff(args[2], args[0]) == 0 && \
-                    abs_ptrdiff(args[2], args[1]) >= NPY_MAX_SIMD_SIZE) { \
+                    abs_ptrdiff(args[2], args[1]) >= AUTOVEC_OVERLAP_SIZE) { \
                 BASE_BINARY_LOOP_INP(tin, tout, op) \
             } \
             else if (abs_ptrdiff(args[2], args[1]) == 0 && \
-                         abs_ptrdiff(args[2], args[0]) >= NPY_MAX_SIMD_SIZE) { \
+                         abs_ptrdiff(args[2], args[0]) >= AUTOVEC_OVERLAP_SIZE) { \
                 BASE_BINARY_LOOP_INP(tin, tout, op) \
             } \
             else { \
@@ -378,19 +391,6 @@ abs_ptrdiff(char *a, char *b)
 
 #undef abs_ptrdiff
 
-#define IS_BLOCKABLE_BINARY_BOOL(esize, vsize) \
-    (steps[0] == (esize) && steps[0] == steps[1] && steps[2] == (1) && \
-     npy_is_aligned(args[1], (esize)) && \
-     npy_is_aligned(args[0], (esize)))
-
-#define IS_BLOCKABLE_BINARY_SCALAR1_BOOL(esize, vsize) \
-    (steps[0] == 0 && steps[1] == (esize) && steps[2] == (1) && \
-     npy_is_aligned(args[1], (esize)))
-
-#define IS_BLOCKABLE_BINARY_SCALAR2_BOOL(esize, vsize) \
-    (steps[0] == (esize) && steps[1] == 0 && steps[2] == (1) && \
-     npy_is_aligned(args[0], (esize)))
-
 /* align var to alignment */
 #define LOOP_BLOCK_ALIGN_VAR(var, type, alignment)\
     npy_intp i, peel = npy_aligned_block_offset(var, sizeof(type),\
diff --git a/numpy/core/src/umath/legacy_array_method.c b/numpy/core/src/umath/legacy_array_method.c
index c3d421d9b..965a0eb83 100644
--- a/numpy/core/src/umath/legacy_array_method.c
+++ b/numpy/core/src/umath/legacy_array_method.c
@@ -13,10 +13,13 @@
 
 #include "convert_datatype.h"
 #include "array_method.h"
+#include "array_coercion.h"
 #include "dtype_transfer.h"
 #include "legacy_array_method.h"
 #include "dtypemeta.h"
 
+#include "ufunc_object.h"
+
 
 typedef struct {
     NpyAuxData base;
@@ -91,7 +94,6 @@ generic_wrapped_legacy_loop(PyArrayMethod_Context *NPY_UNUSED(context),
     return 0;
 }
 
-
 /*
  * Signal that the old type-resolution function must be used to resolve
  * the descriptors (mainly/only used for datetimes due to the unit).
@@ -234,6 +236,97 @@ get_wrapped_legacy_ufunc_loop(PyArrayMethod_Context *context,
 }
 
 
+
+/*
+ * We can shave off a bit of time by just caching the initial and this is
+ * trivial for all internal numeric types.  (Wrapped ufuncs never use
+ * byte-swapping.)
+ */
+static int
+copy_cached_initial(
+        PyArrayMethod_Context *context, npy_bool NPY_UNUSED(reduction_is_empty),
+        char *initial)
+{
+    memcpy(initial, context->method->legacy_initial,
+           context->descriptors[0]->elsize);
+    return 1;
+}
+
+
+/*
+ * The default `get_reduction_initial` attempts to look up the identity
+ * from the calling ufunc.  This might fail, so we only call it when necessary.
+ *
+ * For internal number dtypes, we can easily cache it, so do so after the
+ * first call by overriding the function with `copy_cache_initial`.
+ * This path is not publicly available.  That could be added, and for a
+ * custom initial getter it should be static/compile time data anyway.
+ */
+static int
+get_initial_from_ufunc(
+        PyArrayMethod_Context *context, npy_bool reduction_is_empty,
+        char *initial)
+{
+    if (context->caller == NULL
+            || !PyObject_TypeCheck(context->caller, &PyUFunc_Type)) {
+        /* Impossible in NumPy 1.24;  guard in case it becomes possible. */
+        PyErr_SetString(PyExc_ValueError,
+                "getting initial failed because it can only done for legacy "
+                "ufunc loops when the ufunc is provided.");
+        return -1;
+    }
+    npy_bool reorderable;
+    PyObject *identity_obj = PyUFunc_GetDefaultIdentity(
+            (PyUFuncObject *)context->caller, &reorderable);
+    if (identity_obj == NULL) {
+        return -1;
+    }
+    if (identity_obj == Py_None) {
+        /* UFunc has no idenity (should not happen) */
+        Py_DECREF(identity_obj);
+        return 0;
+    }
+    if (PyTypeNum_ISUNSIGNED(context->descriptors[1]->type_num)
+            && PyLong_CheckExact(identity_obj)) {
+        /*
+         * This is a bit of a hack until we have truly loop specific
+         * identities.  Python -1 cannot be cast to unsigned so convert
+         * it to a NumPy scalar, but we use -1 for bitwise functions to
+         * signal all 1s.
+         * (A builtin identity would not overflow here, although we may
+         * unnecessary convert 0 and 1.)
+         */
+        Py_SETREF(identity_obj, PyObject_CallFunctionObjArgs(
+                     (PyObject *)&PyLongArrType_Type, identity_obj, NULL));
+        if (identity_obj == NULL) {
+            return -1;
+        }
+    }
+    else if (context->descriptors[0]->type_num == NPY_OBJECT
+            && !reduction_is_empty) {
+        /* Allows `sum([object()])` to work, but use 0 when empty. */
+        Py_DECREF(identity_obj);
+        return 0;
+    }
+
+    int res = PyArray_Pack(context->descriptors[0], initial, identity_obj);
+    Py_DECREF(identity_obj);
+    if (res < 0) {
+        return -1;
+    }
+
+    if (PyTypeNum_ISNUMBER(context->descriptors[0]->type_num)) {
+        /* For numbers we can cache to avoid going via Python ints */
+        memcpy(context->method->legacy_initial, initial,
+               context->descriptors[0]->elsize);
+        context->method->get_reduction_initial = &copy_cached_initial;
+    }
+
+    /* Reduction can use the initial value */
+    return 1;
+}
+
+
 /*
  * Get the unbound ArrayMethod which wraps the instances of the ufunc.
  * Note that this function stores the result on the ufunc and then only
@@ -273,6 +366,27 @@ PyArray_NewLegacyWrappingArrayMethod(PyUFuncObject *ufunc,
         flags = _NPY_METH_FORCE_CAST_INPUTS;
     }
 
+    get_reduction_initial_function *get_reduction_intial = NULL;
+    if (ufunc->nin == 2 && ufunc->nout == 1) {
+        npy_bool reorderable = NPY_FALSE;
+        PyObject *identity_obj = PyUFunc_GetDefaultIdentity(
+                ufunc, &reorderable);
+        if (identity_obj == NULL) {
+            return NULL;
+        }
+        /*
+         * TODO: For object, "reorderable" is needed(?), because otherwise
+         *       we disable multi-axis reductions `arr.sum(0, 1)`. But for
+         *       `arr = array([["a", "b"], ["c", "d"]], dtype="object")`
+         *       it isn't actually reorderable (order changes result).
+         */
+        if (reorderable) {
+            flags |= NPY_METH_IS_REORDERABLE;
+        }
+        if (identity_obj != Py_None) {
+            get_reduction_intial = &get_initial_from_ufunc;
+        }
+    }
     for (int i = 0; i < ufunc->nin+ufunc->nout; i++) {
         if (signature[i]->singleton->flags & (
                 NPY_ITEM_REFCOUNT | NPY_ITEM_IS_POINTER | NPY_NEEDS_PYAPI)) {
@@ -283,9 +397,10 @@ PyArray_NewLegacyWrappingArrayMethod(PyUFuncObject *ufunc,
         }
     }
 
-    PyType_Slot slots[3] = {
-        {NPY_METH_get_loop, &get_wrapped_legacy_ufunc_loop},
+    PyType_Slot slots[4] = {
+        {_NPY_METH_get_loop, &get_wrapped_legacy_ufunc_loop},
         {NPY_METH_resolve_descriptors, &simple_legacy_resolve_descriptors},
+        {NPY_METH_get_reduction_initial, get_reduction_intial},
         {0, NULL},
     };
     if (any_output_flexible) {
@@ -297,10 +412,10 @@ PyArray_NewLegacyWrappingArrayMethod(PyUFuncObject *ufunc,
         .name = method_name,
         .nin = ufunc->nin,
         .nout = ufunc->nout,
-        .dtypes = signature,
+        .casting = NPY_NO_CASTING,
         .flags = flags,
+        .dtypes = signature,
         .slots = slots,
-        .casting = NPY_NO_CASTING,
     };
 
     PyBoundArrayMethodObject *bound_res = PyArrayMethod_FromSpec_int(&spec, 1);
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index fe5aa9374..97a74b425 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -13,6 +13,7 @@
 #include "numpy/npy_math.h"
 #include "numpy/halffloat.h"
 #include "lowlevel_strided_loops.h"
+#include "loops_utils.h"
 
 #include "npy_pycompat.h"
 
@@ -31,27 +32,9 @@
  */
 #define PW_BLOCKSIZE    128
 
-
-/*
- * largest simd vector size in bytes numpy supports
- * it is currently a extremely large value as it is only used for memory
- * overlap checks
- */
-#ifndef NPY_MAX_SIMD_SIZE
-#define NPY_MAX_SIMD_SIZE 1024
-#endif
-
 /** Provides the various *_LOOP macros */
 #include "fast_loop_macros.h"
 
-/*
- * include vectorized functions and dispatchers
- * this file is safe to include also for generic builds
- * platform specific instructions are either masked via the proprocessor or
- * runtime detected
- */
-#include "simd.inc"
-
 /******************************************************************************
  **                          GENERIC FLOAT LOOPS                             **
  *****************************************************************************/
@@ -416,98 +399,6 @@ PyUFunc_On_Om(char **args, npy_intp const *dimensions, npy_intp const *steps, vo
  *****************************************************************************
  */
 
-/**begin repeat
- * #kind = logical_and, logical_or#
- * #OP =  &&, ||#
- * #SC =  ==, !=#
- * #and = 1, 0#
- **/
-
-NPY_NO_EXPORT void
-BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    if(IS_BINARY_REDUCE) {
-#ifdef NPY_HAVE_SSE2_INTRINSICS
-        /*
-         * stick with our variant for more reliable performance, only known
-         * platform which outperforms it by ~20% is an i7 with glibc 2.17
-         */
-        if (run_reduce_simd_@kind@_BOOL(args, dimensions, steps)) {
-            return;
-        }
-#else
-        /* for now only use libc on 32-bit/non-x86 */
-        if (steps[1] == 1) {
-            npy_bool * op = (npy_bool *)args[0];
-#if @and@
-            /* np.all(), search for a zero (false) */
-            if (*op) {
-                *op = memchr(args[1], 0, dimensions[0]) == NULL;
-            }
-#else
-            /*
-             * np.any(), search for a non-zero (true) via comparing against
-             * zero blocks, memcmp is faster than memchr on SSE4 machines
-             * with glibc >= 2.12 and memchr can only check for equal 1
-             */
-            static const npy_bool zero[4096]; /* zero by C standard */
-            npy_uintp i, n = dimensions[0];
-
-            for (i = 0; !*op && i < n - (n % sizeof(zero)); i += sizeof(zero)) {
-                *op = memcmp(&args[1][i], zero, sizeof(zero)) != 0;
-            }
-            if (!*op && n - i > 0) {
-                *op = memcmp(&args[1][i], zero, n - i) != 0;
-            }
-#endif
-            return;
-        }
-#endif
-        else {
-            BINARY_REDUCE_LOOP(npy_bool) {
-                const npy_bool in2 = *(npy_bool *)ip2;
-                io1 = io1 @OP@ in2;
-                if (io1 @SC@ 0) {
-                    break;
-                }
-            }
-            *((npy_bool *)iop1) = io1;
-        }
-    }
-    else {
-        if (run_binary_simd_@kind@_BOOL(args, dimensions, steps)) {
-            return;
-        }
-        else {
-            BINARY_LOOP {
-                const npy_bool in1 = *(npy_bool *)ip1;
-                const npy_bool in2 = *(npy_bool *)ip2;
-                *((npy_bool *)op1) = in1 @OP@ in2;
-            }
-        }
-    }
-}
-/**end repeat**/
-
-/**begin repeat
- * #kind = absolute, logical_not#
- * #OP =  !=, ==#
- **/
-NPY_NO_EXPORT void
-BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    if (run_unary_simd_@kind@_BOOL(args, dimensions, steps)) {
-        return;
-    }
-    else {
-        UNARY_LOOP {
-            npy_bool in1 = *(npy_bool *)ip1;
-            *((npy_bool *)op1) = in1 @OP@ 0;
-        }
-    }
-}
-/**end repeat**/
-
 NPY_NO_EXPORT void
 BOOL__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
 {
@@ -516,24 +407,6 @@ BOOL__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps,
     }
 }
 
-
-/**begin repeat
- * #kind = isnan, isinf, isfinite#
- * #func = npy_isnan, npy_isinf, npy_isfinite#
- * #val = NPY_FALSE, NPY_FALSE, NPY_TRUE#
- **/
-NPY_NO_EXPORT void
-BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    /*
-     * The (void)in; suppresses an unused variable warning raised by gcc and allows
-     * us to re-use this macro even though we do not depend on in
-     */
-    UNARY_LOOP_FAST(npy_bool, npy_bool, (void)in; *out = @val@);
-}
-
-/**end repeat**/
-
 /*
  *****************************************************************************
  **                           INTEGER LOOPS
@@ -552,8 +425,11 @@ BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void
  */
 
 #define @TYPE@_floor_divide @TYPE@_divide
+#define @TYPE@_floor_divide_indexed @TYPE@_divide_indexed
 #define @TYPE@_fmax @TYPE@_maximum
+#define @TYPE@_fmax_indexed @TYPE@_maximum_indexed
 #define @TYPE@_fmin @TYPE@_minimum
+#define @TYPE@_fmin_indexed @TYPE@_minimum_indexed
 
 NPY_NO_EXPORT void
 @TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
@@ -562,163 +438,30 @@ NPY_NO_EXPORT void
         *((@type@ *)op1) = 1;
     }
 }
-
-NPY_NO_EXPORT void
-@TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    UNARY_LOOP_FAST(@type@, @type@, *out = +in);
-}
-
 /**begin repeat1
- * #isa = , _avx2#
- * #CHK = 1, defined(HAVE_ATTRIBUTE_TARGET_AVX2)#
- * #ATTR = , NPY_GCC_TARGET_AVX2#
- */
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_square@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
-    UNARY_LOOP_FAST(@type@, @type@, *out = in * in);
-}
-#endif
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_reciprocal@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
-    UNARY_LOOP_FAST(@type@, @type@, *out = 1.0 / in);
-}
-#endif
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_conjugate@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    UNARY_LOOP_FAST(@type@, @type@, *out = in);
-}
-#endif
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_negative@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    UNARY_LOOP_FAST(@type@, @type@, *out = -in);
-}
-#endif
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_logical_not@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    UNARY_LOOP_FAST(@type@, npy_bool, *out = !in);
-}
-#endif
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_invert@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    UNARY_LOOP_FAST(@type@, @type@, *out = ~in);
-}
-#endif
-
-/**begin repeat2
  * Arithmetic
  * #kind = add, subtract, multiply, bitwise_and, bitwise_or, bitwise_xor#
  * #OP = +, -, *, &, |, ^#
  */
 
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    if (IS_BINARY_REDUCE) {
-        BINARY_REDUCE_LOOP_FAST(@type@, io1 @OP@= in2);
-    }
-    else {
-        BINARY_LOOP_FAST(@type@, @type@, *out = in1 @OP@ in2);
-    }
-}
-#endif
-
-/**end repeat2**/
-
-/*
- * Arithmetic bit shift operations.
- *
- * Intel hardware masks bit shift values, so large shifts wrap around
- * and can produce surprising results. The special handling ensures that
- * behavior is independent of compiler or hardware.
- * TODO: We could implement consistent behavior for negative shifts,
- *       which is undefined in C.
- */
-
-#define INT_left_shift_needs_clear_floatstatus
-#define UINT_left_shift_needs_clear_floatstatus
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_left_shift@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps,
-                  void *NPY_UNUSED(func))
-{
-    BINARY_LOOP_FAST(@type@, @type@, *out = npy_lshift@c@(in1, in2));
-
-#ifdef @TYPE@_left_shift_needs_clear_floatstatus
-    // For some reason, our macOS CI sets an "invalid" flag here, but only
-    // for some types.
-    npy_clear_floatstatus_barrier((char*)dimensions);
-#endif
-}
-#endif
-
-#undef INT_left_shift_needs_clear_floatstatus
-#undef UINT_left_shift_needs_clear_floatstatus
-
-#if @CHK@
-NPY_NO_EXPORT
-#ifndef NPY_DO_NOT_OPTIMIZE_@TYPE@_right_shift
-NPY_GCC_OPT_3
-#endif
-void
-@TYPE@_right_shift@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps,
-                   void *NPY_UNUSED(func))
-{
-    BINARY_LOOP_FAST(@type@, @type@, *out = npy_rshift@c@(in1, in2));
-}
-#endif
-
-/**begin repeat2
- * #kind = logical_and, logical_or#
- * #OP = &&, ||#
- */
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    /*
-     * gcc vectorization of this is not good (PR60575) but manual integer
-     * vectorization is too tedious to be worthwhile
-     */
-    BINARY_LOOP_FAST(@type@, npy_bool, *out = in1 @OP@ in2);
-}
-#endif
-
-/**end repeat2**/
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_logical_xor@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
 {
-    BINARY_LOOP {
-        const int t1 = !!*(@type@ *)ip1;
-        const int t2 = !!*(@type@ *)ip2;
-        *((npy_bool *)op1) = (t1 != t2);
+    char *ip1 = args[0];
+    char *indx = args[1];
+    char *value = args[2];
+    npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+    npy_intp n = dimensions[0];
+    npy_intp i;
+    @type@ *indexed;
+    for(i = 0; i < n; i++, indx += isindex, value += isb) {
+        indexed = (@type@ *)(ip1 + is1 * *(npy_intp *)indx);
+        *indexed = *indexed @OP@ *(@type@ *)value;
     }
+    return 0;
 }
-#endif
-
 /**end repeat1**/
 
 NPY_NO_EXPORT void
@@ -760,23 +503,6 @@ NPY_NO_EXPORT void
         *((@type@ *) op1) = out;
     }
 }
-
-/**begin repeat1
- * #kind = isnan, isinf, isfinite#
- * #func = npy_isnan, npy_isinf, npy_isfinite#
- * #val = NPY_FALSE, NPY_FALSE, NPY_TRUE#
- **/
-NPY_NO_EXPORT void
-@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    /*
-     * The (void)in; suppresses an unused variable warning raised by gcc and allows
-     * us to re-use this macro even though we do not depend on in
-     */
-    UNARY_LOOP_FAST(@type@, npy_bool, (void)in; *out = @val@);
-}
-/**end repeat1**/
-
 /**end repeat**/
 
 /**begin repeat
@@ -784,19 +510,6 @@ NPY_NO_EXPORT void
  * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong#
  * #c    = ,,,l,ll#
  */
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    UNARY_LOOP_FAST(@type@, @type@, *out = (in >= 0) ? in : -in);
-}
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : (in < 0 ? -1 : 0));
-}
-
 /**begin repeat1
  * #kind = gcd, lcm#
  **/
@@ -810,7 +523,6 @@ NPY_NO_EXPORT void
     }
 }
 /**end repeat1**/
-
 /**end repeat**/
 
 /**begin repeat
@@ -818,19 +530,6 @@ NPY_NO_EXPORT void
  * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong#
  * #c    = u,u,u,ul,ull#
  */
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    UNARY_LOOP_FAST(@type@, @type@, *out = in);
-}
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : 0);
-}
-
 /**begin repeat1
  * #kind = gcd, lcm#
  **/
@@ -844,9 +543,50 @@ NPY_NO_EXPORT void
     }
 }
 /**end repeat1**/
+/**end repeat**/
+
 
+/*
+ * NOTE: It may be nice to vectorize these, OTOH, these are still faster
+ *       than the cast we used to do.
+ */
+
+/**begin repeat
+ * #kind = equal, not_equal, less, less_equal, greater, greater_equal#
+ * #OP = ==, !=, <, <=, >, >=#
+ */
+NPY_NO_EXPORT void
+LONGLONG_Qq_bool_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_ulonglong in1 = *(npy_ulonglong *)ip1;
+        const npy_longlong in2 = *(npy_longlong *)ip2;
+        if (in2 < 0) {
+            *(npy_bool *)op1 = 0 @OP@ in2;
+        }
+        else {
+            *(npy_bool *)op1 = in1 @OP@ (npy_ulonglong)in2;
+        }
+    }
+}
+
+NPY_NO_EXPORT void
+LONGLONG_qQ_bool_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_longlong in1 = *(npy_longlong *)ip1;
+        const npy_ulonglong in2 = *(npy_ulonglong *)ip2;
+        if (in1 < 0) {
+            *(npy_bool *)op1 = in1 @OP@ 0;
+        }
+        else {
+            *(npy_bool *)op1 = (npy_ulonglong)in1 @OP@ in2;
+        }
+    }
+}
 /**end repeat**/
 
+
 /*
  *****************************************************************************
  **                           DATETIME LOOPS                                **
@@ -923,12 +663,6 @@ NPY_NO_EXPORT void
 }
 
 NPY_NO_EXPORT void
-@TYPE@_isinf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    UNARY_LOOP_FAST(npy_bool, npy_bool, (void)in; *out = NPY_FALSE);
-}
-
-NPY_NO_EXPORT void
 @TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
 {
     OUTPUT_LOOP {
@@ -987,6 +721,7 @@ NPY_NO_EXPORT void
         }
     }
 }
+
 /**end repeat1**/
 
 /**begin repeat1
@@ -1406,6 +1141,8 @@ TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const *
  *  #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
  *  #c = f, , l#
  *  #C = F, , L#
+ *  #fd = 1, 1, 0#
+ *  #VCHK = 1, 1, 0#
  */
 /**begin repeat1
  * #kind = logical_and, logical_or#
@@ -1442,32 +1179,22 @@ NPY_NO_EXPORT void
     }
 }
 
+#if !@fd@
 /**begin repeat1
  * #kind = isnan, isinf, isfinite, signbit#
  * #func = npy_isnan, npy_isinf, npy_isfinite, npy_signbit#
  **/
-
-/**begin repeat2
- * #ISA  = , _avx512_skx#
- * #isa  = simd, avx512_skx#
- * #CHK  = 1, defined(HAVE_ATTRIBUTE_TARGET_AVX512_SKX)#
- **/
-
-#if @CHK@
 NPY_NO_EXPORT void
-@TYPE@_@kind@@ISA@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
-    if (!run_@kind@_@isa@_@TYPE@(args, dimensions, steps)) {
-        UNARY_LOOP {
-            const @type@ in1 = *(@type@ *)ip1;
-            *((npy_bool *)op1) = @func@(in1) != 0;
-        }
+    UNARY_LOOP {
+        const @type@ in1 = *(@type@ *)ip1;
+        *((npy_bool *)op1) = @func@(in1) != 0;
     }
     npy_clear_floatstatus_barrier((char*)dimensions);
 }
-#endif
-/**end repeat2**/
 /**end repeat1**/
+#endif
 
 NPY_NO_EXPORT void
 @TYPE@_spacing(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
@@ -1508,6 +1235,25 @@ NPY_NO_EXPORT void
     }
 }
 
+NPY_NO_EXPORT int
+@TYPE@_floor_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+         char **args, npy_intp const *dimensions, npy_intp const *steps,
+         void *NPY_UNUSED(func))
+{
+    char *ip1 = args[0];
+    char *indx = args[1];
+    char *value = args[2];
+    npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+    npy_intp n = dimensions[0];
+    npy_intp i;
+    @type@ *indexed;
+    for(i = 0; i < n; i++, indx += isindex, value += isb) {
+        indexed = (@type@ *)(ip1 + is1 * *(npy_intp *)indx);
+        *indexed = npy_floor_divide@c@(*indexed, *(@type@ *)value);
+    }
+    return 0;
+}
+
 NPY_NO_EXPORT void
 @TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
@@ -1546,17 +1292,6 @@ NPY_NO_EXPORT void
 }
 
 NPY_NO_EXPORT void
-@TYPE@_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    if (!run_unary_simd_negative_@TYPE@(args, dimensions, steps)) {
-        UNARY_LOOP {
-            const @type@ in1 = *(@type@ *)ip1;
-            *((@type@ *)op1) = -in1;
-        }
-    }
-}
-
-NPY_NO_EXPORT void
 @TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
     UNARY_LOOP {
@@ -1653,6 +1388,26 @@ LONGDOUBLE_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps
         }
     }
 }
+
+NPY_NO_EXPORT int
+LONGDOUBLE_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+         char **args, npy_intp const *dimensions, npy_intp const *steps,
+         void *NPY_UNUSED(func))
+{
+    char *ip1 = args[0];
+    char *indx = args[1];
+    char *value = args[2];
+    npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+    npy_intp n = dimensions[0];
+    npy_intp i;
+    npy_longdouble *indexed;
+    for(i = 0; i < n; i++, indx += isindex, value += isb) {
+        indexed = (npy_longdouble *)(ip1 + is1 * *(npy_intp *)indx);
+        *indexed = *indexed @OP@ *(npy_longdouble *)value;
+    }
+    return 0;
+}
+
 /**end repeat**/
 
 /**begin repeat
@@ -1758,6 +1513,26 @@ HALF_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void
         }
     }
 }
+
+NPY_NO_EXPORT int
+HALF_@kind@_indexed(void *NPY_UNUSED(context),
+         char **args, npy_intp const *dimensions, npy_intp const *steps,
+         void *NPY_UNUSED(func))
+{
+    char *ip1 = args[0];
+    char *indx = args[1];
+    char *value = args[2];
+    npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+    npy_intp n = dimensions[0];
+    npy_intp i;
+    npy_half *indexed;
+    for(i = 0; i < n; i++, indx += isindex, value += isb) {
+        indexed = (npy_half *)(ip1 + is1 * *(npy_intp *)indx);
+        const float v = npy_half_to_float(*(npy_half *)value);
+        *indexed = npy_float_to_half(npy_half_to_float(*indexed) @OP@ v);
+    }
+    return 0;
+}
 /**end repeat**/
 
 #define _HALF_LOGICAL_AND(a,b) (!npy_half_iszero(a) && !npy_half_iszero(b))
@@ -1859,6 +1634,27 @@ HALF_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void
     }
     /* npy_half_isnan will never set floatstatus_invalid, so do not clear */
 }
+
+NPY_NO_EXPORT int
+HALF_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+         char **args, npy_intp const *dimensions, npy_intp const *steps,
+         void *NPY_UNUSED(func))
+{
+    char *ip1 = args[0];
+    char *indx = args[1];
+    char *value = args[2];
+    npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+    npy_intp n = dimensions[0];
+    npy_intp i;
+    npy_half *indexed;
+    for(i = 0; i < n; i++, indx += isindex, value += isb) {
+        indexed = (npy_half *)(ip1 + is1 * *(npy_intp *)indx);
+        npy_half v = *(npy_half *)value;
+        *indexed = (@OP@(*indexed, v) || npy_half_isnan(*indexed)) ? *indexed : v;
+    }
+    return 0;
+}
+
 /**end repeat**/
 
 /**begin repeat
@@ -1874,8 +1670,29 @@ HALF_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void
         const npy_half in2 = *(npy_half *)ip2;
         *((npy_half *)op1) = (@OP@(in1, in2) || npy_half_isnan(in2)) ? in1 : in2;
     }
-    /* npy_half_isnan will never set floatstatus_invalid, so do not clear */
+    /* no need to clear floatstatus_invalid */
+}
+
+NPY_NO_EXPORT int
+HALF_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+         char **args, npy_intp const *dimensions, npy_intp const *steps,
+         void *NPY_UNUSED(func))
+{
+    char *ip1 = args[0];
+    char *indx = args[1];
+    char *value = args[2];
+    npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+    npy_intp n = dimensions[0];
+    npy_intp i;
+    npy_half *indexed;
+    for (i = 0; i < n; i++, indx += isindex, value += isb) {
+        indexed = (npy_half *)(ip1 + is1 * *(npy_intp *)indx);
+        npy_half v = *(npy_half *)value;
+        *indexed = (@OP@(*indexed, v) || npy_half_isnan(v)) ? *indexed: v;
+    }
+    return 0;
 }
+
 /**end repeat**/
 
 NPY_NO_EXPORT void
@@ -1894,6 +1711,27 @@ HALF_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps
     }
 }
 
+NPY_NO_EXPORT int
+HALF_floor_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+         char **args, npy_intp const *dimensions, npy_intp const *steps,
+         void *NPY_UNUSED(func))
+{
+    char *ip1 = args[0];
+    char *indx = args[1];
+    char *value = args[2];
+    npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+    npy_intp n = dimensions[0];
+    npy_intp i;
+    npy_half *indexed;
+    for(i = 0; i < n; i++, indx += isindex, value += isb) {
+        indexed = (npy_half *)(ip1 + is1 * *(npy_intp *)indx);
+        float v = npy_half_to_float(*(npy_half *)value);
+        float div = npy_floor_dividef(npy_half_to_float(*indexed), v);
+        *indexed = npy_float_to_half(div);
+    }
+    return 0;
+}
+
 NPY_NO_EXPORT void
 HALF_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
@@ -1953,12 +1791,6 @@ HALF_conjugate(char **args, npy_intp const *dimensions, npy_intp const *steps, v
     }
 }
 
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-HALF_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    UNARY_LOOP_FAST(npy_half, npy_half, *out = in&0x7fffu);
-}
-
 NPY_NO_EXPORT void
 HALF_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
@@ -2110,6 +1942,26 @@ NPY_NO_EXPORT void
         }
     }
 }
+
+NPY_NO_EXPORT int @TYPE@_@kind@_indexed
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    char *ip1 = args[0];
+    char *indx = args[1];
+    char *value = args[2];
+    npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+    npy_intp n = dimensions[0];
+    npy_intp i;
+    @ftype@ *indexed;
+    for(i = 0; i < n; i++, indx += isindex, value += isb) {
+        indexed = (@ftype@ *)(ip1 + is1 * *(npy_intp *)indx);
+        const @ftype@ b_r = ((@ftype@ *)value)[0];
+        const @ftype@ b_i = ((@ftype@ *)value)[1];
+        indexed[0] @OP@= b_r;
+        indexed[1] @OP@= b_i;
+    }
+    return 0;
+}
 /**end repeat1**/
 
 NPY_NO_EXPORT void
@@ -2124,6 +1976,28 @@ NPY_NO_EXPORT void
         ((@ftype@ *)op1)[1] = in1r*in2i + in1i*in2r;
     }
 }
+
+NPY_NO_EXPORT int @TYPE@_multiply_indexed
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    char *ip1 = args[0];
+    char *indx = args[1];
+    char *value = args[2];
+    npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+    npy_intp n = dimensions[0];
+    npy_intp i;
+    @ftype@ *indexed;
+    for(i = 0; i < n; i++, indx += isindex, value += isb) {
+        indexed = (@ftype@ *)(ip1 + is1 * *(npy_intp *)indx);
+        const @ftype@ a_r = indexed[0];
+        const @ftype@ a_i = indexed[1];
+        const @ftype@ b_r = ((@ftype@ *)value)[0];
+        const @ftype@ b_i = ((@ftype@ *)value)[1];
+        indexed[0] = a_r*b_r - a_i*b_i;
+        indexed[1] = a_r*b_i + a_i*b_r;
+    }
+    return 0;
+}
 #endif // !SIMD
 
 NPY_NO_EXPORT void
@@ -2235,6 +2109,8 @@ NPY_NO_EXPORT void
 }
 /**end repeat1**/
 
+#if !@SIMD@
+// CFLOAT & CDOUBLE defined by 'loops_arithm_fp.dispatch.c.src'
 NPY_NO_EXPORT void
 @TYPE@_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
 {
@@ -2245,6 +2121,7 @@ NPY_NO_EXPORT void
         ((@ftype@ *)op1)[1] = in1r*in1i + in1i*in1r;
     }
 }
+#endif
 
 NPY_NO_EXPORT void
 @TYPE@_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
@@ -2275,6 +2152,8 @@ NPY_NO_EXPORT void
     }
 }
 
+#if !@SIMD@
+// CFLOAT & CDOUBLE defined by 'loops_arithm_fp.dispatch.c.src'
 NPY_NO_EXPORT void
 @TYPE@_conjugate(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) {
     UNARY_LOOP {
@@ -2294,20 +2173,6 @@ NPY_NO_EXPORT void
         *((@ftype@ *)op1) = npy_hypot@c@(in1r, in1i);
     }
 }
-
-#if @SIMD@ && defined(HAVE_ATTRIBUTE_TARGET_AVX512F)
-/**begin repeat1
- * arithmetic
- * #kind = conjugate, square, absolute#
- */
-NPY_NO_EXPORT void
-@TYPE@_@kind@_avx512f(char **args, const npy_intp *dimensions, const npy_intp *steps, void *func)
-{
-    if (!run_unary_avx512f_@kind@_@TYPE@(args, dimensions, steps)) {
-        @TYPE@_@kind@(args, dimensions, steps, func);
-    }
-}
-/**end repeat1**/
 #endif
 
 NPY_NO_EXPORT void
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 424e204c1..cce73aff8 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -10,24 +10,32 @@
     #define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN
 #endif
 
-#define BOOL_invert BOOL_logical_not
-#define BOOL_add BOOL_logical_or
-#define BOOL_bitwise_and BOOL_logical_and
-#define BOOL_bitwise_or BOOL_logical_or
-#define BOOL_logical_xor BOOL_not_equal
-#define BOOL_bitwise_xor BOOL_logical_xor
-#define BOOL_multiply BOOL_logical_and
-#define BOOL_maximum BOOL_logical_or
-#define BOOL_minimum BOOL_logical_and
-#define BOOL_fmax BOOL_maximum
-#define BOOL_fmin BOOL_minimum
-
 /*
  *****************************************************************************
  **                             BOOLEAN LOOPS                               **
  *****************************************************************************
  */
 
+/*
+ * Following functions are defined by umath generator
+ * to enable runtime dispatching without the need
+ * to redefine them within dsipatch-able sources.
+ */
+// #define BOOL_invert BOOL_logical_not
+// #define BOOL_add BOOL_logical_or
+// #define BOOL_bitwise_and BOOL_logical_and
+// #define BOOL_bitwise_or BOOL_logical_or
+// #define BOOL_logical_xor BOOL_not_equal
+// #define BOOL_bitwise_xor BOOL_logical_xor
+// #define BOOL_multiply BOOL_logical_and
+// #define BOOL_maximum BOOL_logical_or
+// #define BOOL_minimum BOOL_logical_and
+// #define BOOL_fmax BOOL_maximum
+// #define BOOL_fmin BOOL_minimum
+
+typedef struct PyArrayMethod_Context_tag PyArrayMethod_Context;
+typedef struct NpyAuxData_tag NpyAuxData;
+
 #ifndef NPY_DISABLE_OPTIMIZATION
     #include "loops_comparison.dispatch.h"
 #endif
@@ -39,11 +47,15 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_@kind@,
     (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
 /**end repeat**/
 
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_logical.dispatch.h"
+#endif
+
 /**begin repeat
- * #kind = logical_and, logical_or, absolute, logical_not#
- **/
-NPY_NO_EXPORT void
-BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+ * #kind = logical_and, logical_or, logical_not, absolute#
+ */
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_@kind@,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
 /**end repeat**/
 
 NPY_NO_EXPORT void
@@ -56,6 +68,17 @@ NPY_NO_EXPORT void
 BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
 /**end repeat**/
 
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_autovec.dispatch.h"
+#endif
+/**begin repeat
+ * #kind = isnan, isinf, isfinite#
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_@kind@,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+/**end repeat**/
+
 /*
  *****************************************************************************
  **                           INTEGER LOOPS
@@ -72,6 +95,11 @@ BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void
  */
 NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_divide,
     (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+NPY_NO_EXPORT int
+@TYPE@_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+/**end repeat3**/
 /**end repeat**/
 
 #ifndef NPY_DISABLE_OPTIMIZATION
@@ -106,19 +134,40 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
 /**end repeat1**/
 /**end repeat**/
 
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_autovec.dispatch.h"
+#endif
 /**begin repeat
- * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG#
+ * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+           BYTE,  SHORT,  INT,  LONG,  LONGLONG#
  */
+/**begin repeat1
+ * #kind = invert, logical_not, conjugate, reciprocal, square, add,
+ *         subtract, multiply, bitwise_and, bitwise_or, bitwise_xor,
+ *         left_shift, right_shift, logical_and, logical_or,
+ *         logical_xor, isnan, isinf, isfinite,
+ *         absolute, sign#
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+/**end repeat1**/
+/**end repeat**/
 
+/**begin repeat
+ * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG#
+ */
 /**begin repeat1
  * both signed and unsigned integer types
  * #s = , u#
  * #S = , U#
  */
-
 #define @S@@TYPE@_floor_divide @S@@TYPE@_divide
+#define @S@@TYPE@_floor_divide_indexed @S@@TYPE@_divide_indexed
 #define @S@@TYPE@_fmax @S@@TYPE@_maximum
 #define @S@@TYPE@_fmin @S@@TYPE@_minimum
+#define @S@@TYPE@_fmax_indexed @S@@TYPE@_maximum_indexed
+#define @S@@TYPE@_fmin_indexed @S@@TYPE@_minimum_indexed
 
 NPY_NO_EXPORT void
 @S@@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
@@ -127,85 +176,69 @@ NPY_NO_EXPORT void
 @S@@TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
 
 /**begin repeat2
- * #isa = , _avx2#
- */
-
-NPY_NO_EXPORT void
-@S@@TYPE@_square@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
-
-NPY_NO_EXPORT void
-@S@@TYPE@_reciprocal@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
-
-NPY_NO_EXPORT void
-@S@@TYPE@_conjugate@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-NPY_NO_EXPORT void
-@S@@TYPE@_negative@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-NPY_NO_EXPORT void
-@S@@TYPE@_logical_not@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-NPY_NO_EXPORT void
-@S@@TYPE@_invert@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-/**begin repeat3
  * Arithmetic
  * #kind = add, subtract, multiply, bitwise_and, bitwise_or, bitwise_xor,
  *          left_shift, right_shift#
  * #OP = +, -,*, &, |, ^, <<, >>#
  */
-NPY_NO_EXPORT void
-@S@@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-/**end repeat3**/
-
-/**begin repeat3
- * #kind = logical_and, logical_or#
- * #OP = &&, ||#
- */
-NPY_NO_EXPORT void
-@S@@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-/**end repeat3**/
+NPY_NO_EXPORT int
+@S@@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
 
-NPY_NO_EXPORT void
-@S@@TYPE@_logical_xor@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
 /**end repeat2**/
 
 /**begin repeat2
  * #kind = maximum, minimum#
- * #OP =  >, <#
  **/
 NPY_NO_EXPORT void
 @S@@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-/**end repeat2**/
 
-NPY_NO_EXPORT void
-@S@@TYPE@_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+NPY_NO_EXPORT int
+@S@@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
 
-NPY_NO_EXPORT void
-@S@@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/**end repeat2**/
 
 NPY_NO_EXPORT void
-@S@@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+@S@@TYPE@_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
 
 NPY_NO_EXPORT void
 @S@@TYPE@_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
 
 NPY_NO_EXPORT void
 @S@@TYPE@_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-/**begin repeat2
- * #kind = isnan, isinf, isfinite#
- **/
-NPY_NO_EXPORT void
-@S@@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
 /**end repeat2**/
 
 /**end repeat1**/
+/**end repeat**/
+
+/**begin repeat
+ * #kind = equal, not_equal, less, less_equal, greater, greater_equal#
+ * #OP = ==, !=, <, <=, >, >=#
+ */
+NPY_NO_EXPORT void
+LONGLONG_Qq_bool_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+NPY_NO_EXPORT void
+LONGLONG_qQ_bool_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
 
 /**end repeat**/
 
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_unary.dispatch.h"
+#endif
+/**begin repeat
+ * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+ *         BYTE,  SHORT,  INT,  LONG,  LONGLONG#
+ */
+/**begin repeat1
+ * #kind = negative#
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+/**end repeat1**/
+/**end repeat**/
+
+
 /*
  *****************************************************************************
  **                             FLOAT LOOPS                                 **
@@ -226,6 +259,34 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
 /**end repeat**/
 
 #ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_unary_fp_le.dispatch.h"
+#endif
+/**begin repeat
+ *  #TYPE = FLOAT, DOUBLE#
+ */
+/**begin repeat1
+ * #kind = isnan, isinf, isfinite, signbit#
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+/**end repeat1**/
+/**end repeat**/
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_unary.dispatch.h"
+#endif
+/**begin repeat
+ *  #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
+ */
+/**begin repeat1
+ * #kind = negative#
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+/**end repeat1**/
+/**end repeat**/
+
+#ifndef NPY_DISABLE_OPTIMIZATION
     #include "loops_arithm_fp.dispatch.h"
 #endif
 /**begin repeat
@@ -234,10 +295,13 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
 /**begin repeat1
  * Arithmetic
  * # kind = add, subtract, multiply, divide#
- * # OP = +, -, *, /#
  */
 NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
     (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+NPY_NO_EXPORT int
+@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
 /**end repeat1**/
 /**end repeat**/
 
@@ -283,15 +347,6 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_@func@,
 /**end repeat**/
 
 /**begin repeat
- * #func = sin, cos#
- */
-
-NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_@func@,
-    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
-
-/**end repeat**/
-
-/**begin repeat
  *  #TYPE = FLOAT, DOUBLE#
  */
 /**begin repeat1
@@ -304,27 +359,20 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@func@,
 /**end repeat1**/
 /**end repeat**/
 
-/**begin repeat
- *  #TYPE = FLOAT, DOUBLE#
- */
-/**begin repeat1
- * #func = maximum, minimum#
- */
-NPY_NO_EXPORT void
-@TYPE@_@func@_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-/**end repeat1**/
-/**end repeat**/
-
 #ifndef NPY_DISABLE_OPTIMIZATION
     #include "loops_trigonometric.dispatch.h"
 #endif
+
 /**begin repeat
+ *  #TYPE = FLOAT, DOUBLE#
+ */
+/**begin repeat1
  *  #func = sin, cos#
  */
-NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_@func@, (
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@func@, (
     char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
 ))
+/**end repeat1**/
 /**end repeat**/
 
 #ifndef NPY_DISABLE_OPTIMIZATION
@@ -362,6 +410,8 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, (
  *  #TYPE = HALF, FLOAT, DOUBLE, LONGDOUBLE#
  *  #c = f, f, , l#
  *  #C = F, F, , L#
+ *  #half = 1, 0, 0, 0#
+ *  #fd = 0, 1, 1, 0#
  */
 
 /**begin repeat1
@@ -371,6 +421,11 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, (
  */
 NPY_NO_EXPORT void
 @TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+/**end repeat1**/
 /**end repeat1**/
 
 /**begin repeat1
@@ -390,35 +445,44 @@ NPY_NO_EXPORT void
 /**begin repeat1
  * #kind = isnan, isinf, isfinite, signbit, copysign, nextafter, spacing#
  * #func = npy_isnan, npy_isinf, npy_isfinite, npy_signbit, npy_copysign, nextafter, spacing#
+ * #dispatched = 1, 1, 1, 1, 0, 0, 0#
  **/
 
-/**begin repeat2
- * #ISA  = , _avx512_skx#
- **/
+#if !@fd@ || !@dispatched@
 NPY_NO_EXPORT void
-@TYPE@_@kind@@ISA@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
 /**end repeat2**/
 /**end repeat1**/
 
 /**begin repeat1
  * #kind = maximum, minimum#
- * #OP =  >=, <=#
  **/
 NPY_NO_EXPORT void
 @TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
 /**end repeat1**/
 
 /**begin repeat1
  * #kind = fmax, fmin#
- * #OP =  >=, <=#
  **/
 NPY_NO_EXPORT void
 @TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
 /**end repeat1**/
 
 NPY_NO_EXPORT void
 @TYPE@_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
 
+NPY_NO_EXPORT int
+@TYPE@_floor_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
 NPY_NO_EXPORT void
 @TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
 
@@ -440,8 +504,10 @@ NPY_NO_EXPORT void
 NPY_NO_EXPORT void
 @TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
 
+#if @half@
 NPY_NO_EXPORT void
 @TYPE@_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
 
 NPY_NO_EXPORT void
 @TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
@@ -473,6 +539,19 @@ NPY_NO_EXPORT void
 /**end repeat1**/
 /**end repeat**/
 
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_autovec.dispatch.h"
+#endif
+/**begin repeat
+ * #TYPE = HALF#
+ */
+/**begin repeat1
+ * #kind = absolute#
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+/**end repeat1**/
+/**end repeat**/
 /*
  *****************************************************************************
  **                           COMPLEX LOOPS                                 **
@@ -485,7 +564,21 @@ NPY_NO_EXPORT void
  * #TYPE = CFLOAT, CDOUBLE#
  */
 /**begin repeat1
- * #kind = add, subtract, multiply#
+ * #kind = add, subtract, multiply, conjugate, square#
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+/**end repeat1**/
+/**end repeat**/
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_unary_complex.dispatch.h"
+#endif
+/**begin repeat
+ * #TYPE = CFLOAT, CDOUBLE#
+ */
+/**begin repeat1
+ * #kind = absolute#
  */
 NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
@@ -512,6 +605,9 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
  */
 NPY_NO_EXPORT void
 C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+C@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
 /**end repeat1**/
 
 NPY_NO_EXPORT void
@@ -554,19 +650,14 @@ C@TYPE@_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *step
 NPY_NO_EXPORT void
 C@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
 
-/**begin repeat1
- * #isa = , _avx512f#
- */
-
 NPY_NO_EXPORT void
-C@TYPE@_conjugate@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func));
+C@TYPE@_conjugate(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func));
 
 NPY_NO_EXPORT void
-C@TYPE@_absolute@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func));
+C@TYPE@_absolute(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func));
 
 NPY_NO_EXPORT void
-C@TYPE@_square@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data));
-/**end repeat1**/
+C@TYPE@_square(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data));
 
 NPY_NO_EXPORT void
 C@TYPE@__arg(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
@@ -627,9 +718,6 @@ NPY_NO_EXPORT void
 NPY_NO_EXPORT void
 @TYPE@_isfinite(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
 
-NPY_NO_EXPORT void
-@TYPE@_isinf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
 #define @TYPE@_isnan @TYPE@_isnat
 
 NPY_NO_EXPORT void
@@ -705,6 +793,21 @@ TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const *
 #define TIMEDELTA_md_m_floor_divide TIMEDELTA_md_m_divide
 /* #define TIMEDELTA_mm_d_floor_divide TIMEDELTA_mm_d_divide */
 
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_autovec.dispatch.h"
+#endif
+/**begin repeat
+ * #TYPE = TIMEDELTA, DATETIME#
+ */
+/**begin repeat1
+ * #kind = isinf#
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+/**end repeat1**/
+/**end repeat**/
+
 /*
  *****************************************************************************
  **                            OBJECT LOOPS                                 **
diff --git a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src
index bf8142880..3ab5a968d 100644
--- a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src
+++ b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src
@@ -1,6 +1,8 @@
 /*@targets
  ** $maxopt baseline
- ** sse2 avx2 avx512f
+ ** sse2 (avx2 fma3)
+ ** neon asimd
+ ** vsx2 vsx3
  ** vx vxe
  **/
 #define _UMATHMODULE
@@ -14,707 +16,392 @@
 // Provides the various *_LOOP macros
 #include "fast_loop_macros.h"
 
-// TODO: replace raw SIMD with NPYV
+/**
+ * TODO:
+ *  - Improve the implementation of SIMD complex absolute,
+ *    current one kinda slow and it can be optimized by
+ *    at least avoiding the division and keep sqrt.
+ *  - Vectorize reductions
+ *  - Add support for ASIMD/VCMLA through universal intrinics.
+ */
+
 //###############################################################################
 //## Real Single/Double precision
 //###############################################################################
 /********************************************************************************
- ** Defining the SIMD kernels
+ ** Defining ufunc inner functions
  ********************************************************************************/
-#ifdef NPY_HAVE_SSE2
+
+/*
+ * clang has a bug that's present at -O1 or greater.  When partially loading a
+ * vector register for a divide operation, the remaining elements are set
+ * to 1 to avoid divide-by-zero.  The partial load is paired with a partial
+ * store after the divide operation.  clang notices that the entire register
+ * is not needed for the store and optimizes out the fill of 1 to the remaining
+ * elements.  This causes either a divide-by-zero or 0/0 with invalid exception
+ * that we were trying to avoid by filling.
+ *
+ * Using a dummy variable marked 'volatile' convinces clang not to ignore
+ * the explicit fill of remaining elements.  If `-ftrapping-math` is
+ * supported, then it'll also avoid the bug.  `-ftrapping-math` is supported
+ * on Apple clang v12+ for x86_64.  It is not currently supported for arm64.
+ * `-ftrapping-math` is set by default of Numpy builds in
+ * numpy/distutils/ccompiler.py.
+ *
+ * Note: Apple clang and clang upstream have different versions that overlap
+ */
+#if defined(__clang__)
+    #if defined(__apple_build_version__)
+    // Apple Clang
+        #if __apple_build_version__ < 12000000
+        // Apple Clang before v12
+        #define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 1
+        #elif defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64)
+        // Apple Clang after v12, targeting i386 or x86_64
+        #define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 0
+        #else
+        // Apple Clang after v12, not targeting i386 or x86_64
+        #define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 1
+        #endif
+    #else
+    // Clang, not Apple Clang
+        #if __clang_major__ < 10
+        // Clang before v10
+        #define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 1
+        #elif defined(_MSC_VER)
+        // clang-cl has the same bug
+        #define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 1
+        #elif defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64)
+        // Clang v10+, targeting i386 or x86_64
+        #define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 0
+        #else
+        // Clang v10+, not targeting i386 or x86_64
+        #define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 1
+        #endif
+    #endif
+#else
+// Not a Clang compiler
+#define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 0
+#endif
+
 /**begin repeat
+ * Float types
  *  #type = npy_float, npy_double#
  *  #TYPE = FLOAT, DOUBLE#
- *  #scalarf = npy_sqrtf, npy_sqrt#
+ *  #sfx  = f32, f64#
  *  #c = f, #
- *  #vtype = __m128, __m128d#
- *  #vtype256 = __m256, __m256d#
- *  #vtype512 = __m512, __m512d#
- *  #vpre = _mm, _mm#
- *  #vpre256 = _mm256, _mm256#
- *  #vpre512 = _mm512, _mm512#
- *  #vsuf = ps, pd#
- *  #vsufs = ss, sd#
- *  #nan = NPY_NANF, NPY_NAN#
- *  #double = 0, 1#
- *  #cast = _mm_castps_si128, _mm_castpd_si128#
+ *  #C = F, #
+ *  #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64#
  */
 /**begin repeat1
-* Arithmetic
-* # kind = add, subtract, multiply, divide#
-* # OP = +, -, *, /#
-* # VOP = add, sub, mul, div#
-*/
-static void
-sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
+ * Arithmetic
+ * # kind = add, subtract, multiply, divide#
+ * # intrin = add, sub, mul, div#
+ * # OP = +, -, *, /#
+ * # PW = 1, 0, 0, 0#
+ * # is_div = 0*3, 1#
+ * # is_mul = 0*2, 1, 0#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
-#ifdef NPY_HAVE_AVX512F
-    const npy_intp vector_size_bytes = 64;
-    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
-        op[i] = ip1[i] @OP@ ip2[i];
-    /* lots of specializations, to squeeze out max performance */
-    if (npy_is_aligned(&ip1[i], vector_size_bytes) && npy_is_aligned(&ip2[i], vector_size_bytes)) {
-        if (ip1 == ip2) {
-            LOOP_BLOCKED(@type@, vector_size_bytes) {
-                @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
-                @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, a);
-                @vpre512@_store_@vsuf@(&op[i], c);
+    npy_intp len = dimensions[0];
+    char *src0 = args[0], *src1 = args[1], *dst = args[2];
+    npy_intp ssrc0 = steps[0], ssrc1 = steps[1], sdst = steps[2];
+    // reduce
+    if (ssrc0 == 0 && ssrc0 == sdst && src0 == dst) {
+    #if @PW@
+        *((@type@*)src0) @OP@= @TYPE@_pairwise_sum(src1, len, ssrc1);
+    #else
+        @type@ acc = *((@type@*)src0);
+        if (ssrc1 == sizeof(@type@)) {
+            for (; len > 0; --len, src1 += sizeof(@type@)) {
+                acc @OP@= *(@type@ *)src1;
             }
-        }
-        else {
-            LOOP_BLOCKED(@type@, vector_size_bytes) {
-                @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
-                @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]);
-                @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
-                @vpre512@_store_@vsuf@(&op[i], c);
+        } else {
+            for (; len > 0; --len, src1 += ssrc1) {
+                acc @OP@= *(@type@ *)src1;
             }
         }
+        *((@type@*)src0) = acc;
+    #endif
+        return;
     }
-    else if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
-        LOOP_BLOCKED(@type@, vector_size_bytes) {
-            @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
-            @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]);
-            @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
-            @vpre512@_store_@vsuf@(&op[i], c);
-        }
-    }
-    else if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
-        LOOP_BLOCKED(@type@, vector_size_bytes) {
-            @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
-            @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]);
-            @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
-            @vpre512@_store_@vsuf@(&op[i], c);
-        }
-    }
-    else {
-        if (ip1 == ip2) {
-            LOOP_BLOCKED(@type@, vector_size_bytes) {
-                @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
-                @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, a);
-                @vpre512@_store_@vsuf@(&op[i], c);
+#if @VECTOR@
+    if (len > npyv_nlanes_@sfx@*2 &&
+        !is_mem_overlap(src0, ssrc0, dst, sdst, len) &&
+        !is_mem_overlap(src1, ssrc1, dst, sdst, len)
+    ) {
+        const int vstep = npyv_nlanes_u8;
+        const int wstep = vstep * 2;
+        const int hstep = npyv_nlanes_@sfx@;
+        const int lstep = hstep * 2;
+        // lots of specializations, to squeeze out max performance
+        if (ssrc0 == sizeof(@type@) && ssrc0 == ssrc1 && ssrc0 == sdst) {
+            for (; len >= lstep; len -= lstep, src0 += wstep, src1 += wstep, dst += wstep) {
+                npyv_@sfx@ a0 = npyv_load_@sfx@((const @type@*)src0);
+                npyv_@sfx@ a1 = npyv_load_@sfx@((const @type@*)(src0 + vstep));
+                npyv_@sfx@ b0 = npyv_load_@sfx@((const @type@*)src1);
+                npyv_@sfx@ b1 = npyv_load_@sfx@((const @type@*)(src1 + vstep));
+                npyv_@sfx@ r0 = npyv_@intrin@_@sfx@(a0, b0);
+                npyv_@sfx@ r1 = npyv_@intrin@_@sfx@(a1, b1);
+                npyv_store_@sfx@((@type@*)dst, r0);
+                npyv_store_@sfx@((@type@*)(dst + vstep), r1);
             }
-        }
-        else {
-            LOOP_BLOCKED(@type@, vector_size_bytes) {
-                @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
-                @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]);
-                @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
-                @vpre512@_store_@vsuf@(&op[i], c);
+        #if @is_div@ && WORKAROUND_CLANG_PARTIAL_LOAD_BUG
+            const int vstop = hstep - 1;
+        #else
+            const int vstop = 0;
+        #endif // #if @is_div@ && WORKAROUND_CLANG_PARTIAL_LOAD_BUG
+            for (; len > vstop; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) {
+            #if @is_div@
+                npyv_@sfx@ a = npyv_load_till_@sfx@((const @type@*)src0, len, 1.0@c@);
+                npyv_@sfx@ b = npyv_load_till_@sfx@((const @type@*)src1, len, 1.0@c@);
+            #else
+                npyv_@sfx@ a = npyv_load_tillz_@sfx@((const @type@*)src0, len);
+                npyv_@sfx@ b = npyv_load_tillz_@sfx@((const @type@*)src1, len);
+            #endif
+                npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b);
+                npyv_store_till_@sfx@((@type@*)dst, len, r);
             }
-        }
-    }
-#elif defined NPY_HAVE_AVX2
-    const npy_intp vector_size_bytes = 32;
-    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
-        op[i] = ip1[i] @OP@ ip2[i];
-    /* lots of specializations, to squeeze out max performance */
-    if (npy_is_aligned(&ip1[i], vector_size_bytes) &&
-            npy_is_aligned(&ip2[i], vector_size_bytes)) {
-        if (ip1 == ip2) {
-            LOOP_BLOCKED(@type@, vector_size_bytes) {
-                @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
-                @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, a);
-                @vpre256@_store_@vsuf@(&op[i], c);
+        #if @is_div@ && WORKAROUND_CLANG_PARTIAL_LOAD_BUG
+            // last partial iteration for divide and working around clang partial load bug
+            if(len > 0){
+                npyv_@sfx@ a = npyv_load_till_@sfx@((const @type@*)src0, len, 1.0@c@);
+                volatile npyv_@sfx@ b = npyv_load_till_@sfx@((const @type@*)src1, len, 1.0@c@);
+                npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b);
+                npyv_store_till_@sfx@((@type@*)dst, len, r);
             }
-        }
-        else {
-            LOOP_BLOCKED(@type@, vector_size_bytes) {
-                @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
-                @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]);
-                @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
-                @vpre256@_store_@vsuf@(&op[i], c);
+        #endif // #if @is_div@ && WORKAROUND_CLANG_PARTIAL_LOAD_BUG
+        }
+        else if (ssrc0 == 0 && ssrc1 == sizeof(@type@) && sdst == ssrc1) {
+            npyv_@sfx@ a = npyv_setall_@sfx@(*((@type@*)src0));
+            for (; len >= lstep; len -= lstep, src1 += wstep, dst += wstep) {
+                npyv_@sfx@ b0 = npyv_load_@sfx@((const @type@*)src1);
+                npyv_@sfx@ b1 = npyv_load_@sfx@((const @type@*)(src1 + vstep));
+                npyv_@sfx@ r0 = npyv_@intrin@_@sfx@(a, b0);
+                npyv_@sfx@ r1 = npyv_@intrin@_@sfx@(a, b1);
+                npyv_store_@sfx@((@type@*)dst, r0);
+                npyv_store_@sfx@((@type@*)(dst + vstep), r1);
             }
-        }
-    }
-    else if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
-        LOOP_BLOCKED(@type@, vector_size_bytes) {
-            @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
-            @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]);
-            @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
-            @vpre256@_store_@vsuf@(&op[i], c);
-        }
-    }
-    else if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
-        LOOP_BLOCKED(@type@, vector_size_bytes) {
-            @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
-            @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]);
-            @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
-            @vpre256@_store_@vsuf@(&op[i], c);
-        }
-    }
-    else {
-        if (ip1 == ip2) {
-            LOOP_BLOCKED(@type@, vector_size_bytes) {
-                @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
-                @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, a);
-                @vpre256@_store_@vsuf@(&op[i], c);
-            }
-        }
-        else {
-            LOOP_BLOCKED(@type@, vector_size_bytes) {
-                @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
-                @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]);
-                @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
-                @vpre256@_store_@vsuf@(&op[i], c);
+        #if (@is_div@ || @is_mul@) && WORKAROUND_CLANG_PARTIAL_LOAD_BUG
+            const int vstop = hstep - 1;
+        #else
+            const int vstop = 0;
+        #endif // #if (@is_div@ || @is_mul@) && WORKAROUND_CLANG_PARTIAL_LOAD_BUG
+            for (; len > vstop; len -= hstep, src1 += vstep, dst += vstep) {
+            #if @is_div@ || @is_mul@
+                npyv_@sfx@ b = npyv_load_till_@sfx@((const @type@*)src1, len, 1.0@c@);
+            #else
+                npyv_@sfx@ b = npyv_load_tillz_@sfx@((const @type@*)src1, len);
+            #endif
+                npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b);
+                npyv_store_till_@sfx@((@type@*)dst, len, r);
             }
-        }
-    }
-#else
-    const npy_intp vector_size_bytes = 16;
-    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
-        op[i] = ip1[i] @OP@ ip2[i];
-    /* lots of specializations, to squeeze out max performance */
-    if (npy_is_aligned(&ip1[i], vector_size_bytes) &&
-            npy_is_aligned(&ip2[i], vector_size_bytes)) {
-        if (ip1 == ip2) {
-            LOOP_BLOCKED(@type@, vector_size_bytes) {
-                @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
-                @vtype@ c = @vpre@_@VOP@_@vsuf@(a, a);
-                @vpre@_store_@vsuf@(&op[i], c);
-            }
-        }
-        else {
-            LOOP_BLOCKED(@type@, vector_size_bytes) {
-                @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
-                @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]);
-                @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
-                @vpre@_store_@vsuf@(&op[i], c);
+        #if (@is_div@ || @is_mul@) && WORKAROUND_CLANG_PARTIAL_LOAD_BUG
+            // last partial iteration for multiply / divide and working around clang partial load bug
+            if(len > 0){
+                volatile npyv_@sfx@ b = npyv_load_till_@sfx@((const @type@*)src1, len, 1.0@c@);
+                npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b);
+                npyv_store_till_@sfx@((@type@*)dst, len, r);
             }
-        }
-    }
-    else if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
-        LOOP_BLOCKED(@type@, vector_size_bytes) {
-            @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
-            @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]);
-            @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
-            @vpre@_store_@vsuf@(&op[i], c);
-        }
-    }
-    else if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
-        LOOP_BLOCKED(@type@, vector_size_bytes) {
-            @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
-            @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]);
-            @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
-            @vpre@_store_@vsuf@(&op[i], c);
-        }
-    }
-    else {
-        if (ip1 == ip2) {
-            LOOP_BLOCKED(@type@, vector_size_bytes) {
-                @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
-                @vtype@ c = @vpre@_@VOP@_@vsuf@(a, a);
-                @vpre@_store_@vsuf@(&op[i], c);
+        #endif // #if (@is_div@ || @is_mul@) && WORKAROUND_CLANG_PARTIAL_LOAD_BUG
+        }
+        else if (ssrc1 == 0 && ssrc0 == sizeof(@type@) && sdst == ssrc0) {
+            npyv_@sfx@ b = npyv_setall_@sfx@(*((@type@*)src1));
+            for (; len >= lstep; len -= lstep, src0 += wstep, dst += wstep) {
+                npyv_@sfx@ a0 = npyv_load_@sfx@((const @type@*)src0);
+                npyv_@sfx@ a1 = npyv_load_@sfx@((const @type@*)(src0 + vstep));
+                npyv_@sfx@ r0 = npyv_@intrin@_@sfx@(a0, b);
+                npyv_@sfx@ r1 = npyv_@intrin@_@sfx@(a1, b);
+                npyv_store_@sfx@((@type@*)dst, r0);
+                npyv_store_@sfx@((@type@*)(dst + vstep), r1);
             }
-        }
-        else {
-            LOOP_BLOCKED(@type@, vector_size_bytes) {
-                @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
-                @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]);
-                @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
-                @vpre@_store_@vsuf@(&op[i], c);
+            for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) {
+            #if @is_div@ || @is_mul@
+                npyv_@sfx@ a = npyv_load_till_@sfx@((const @type@*)src0, len, 1.0@c@);
+            #else
+                npyv_@sfx@ a = npyv_load_tillz_@sfx@((const @type@*)src0, len);
+            #endif
+                npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b);
+                npyv_store_till_@sfx@((@type@*)dst, len, r);
             }
+        } else {
+            goto loop_scalar;
         }
+        npyv_cleanup();
+        return;
     }
+loop_scalar:
 #endif
-    LOOP_BLOCKED_END {
-        op[i] = ip1[i] @OP@ ip2[i];
-    }
-}
-
-static void
-sse2_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
-{
-#ifdef NPY_HAVE_AVX512F
-    const npy_intp vector_size_bytes = 64;
-    const @vtype512@ a = @vpre512@_set1_@vsuf@(ip1[0]);
-    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
-        op[i] = ip1[0] @OP@ ip2[i];
-    if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
-        LOOP_BLOCKED(@type@, vector_size_bytes) {
-            @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]);
-            @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
-            @vpre512@_store_@vsuf@(&op[i], c);
-        }
-    }
-    else {
-        LOOP_BLOCKED(@type@, vector_size_bytes) {
-            @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]);
-            @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
-            @vpre512@_store_@vsuf@(&op[i], c);
-        }
-    }
-
-
-#elif defined NPY_HAVE_AVX2
-    const npy_intp vector_size_bytes = 32;
-    const @vtype256@ a = @vpre256@_set1_@vsuf@(ip1[0]);
-    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
-        op[i] = ip1[0] @OP@ ip2[i];
-    if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
-        LOOP_BLOCKED(@type@, vector_size_bytes) {
-            @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]);
-            @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
-            @vpre256@_store_@vsuf@(&op[i], c);
-        }
-    }
-    else {
-        LOOP_BLOCKED(@type@, vector_size_bytes) {
-            @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]);
-            @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
-            @vpre256@_store_@vsuf@(&op[i], c);
-        }
-    }
-#else
-    const npy_intp vector_size_bytes = 16;
-    const @vtype@ a = @vpre@_set1_@vsuf@(ip1[0]);
-    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
-        op[i] = ip1[0] @OP@ ip2[i];
-    if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
-        LOOP_BLOCKED(@type@, vector_size_bytes) {
-            @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]);
-            @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
-            @vpre@_store_@vsuf@(&op[i], c);
-        }
-    }
-    else {
-        LOOP_BLOCKED(@type@, vector_size_bytes) {
-            @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]);
-            @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
-            @vpre@_store_@vsuf@(&op[i], c);
-        }
-    }
-#endif
-    LOOP_BLOCKED_END {
-        op[i] = ip1[0] @OP@ ip2[i];
+    for (; len > 0; --len, src0 += ssrc0, src1 += ssrc1, dst += sdst) {
+        const @type@ a = *((@type@*)src0);
+        const @type@ b = *((@type@*)src1);
+        *((@type@*)dst) = a @OP@ b;
     }
 }
 
-static void
-sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
 {
-#ifdef NPY_HAVE_AVX512F
-    const npy_intp vector_size_bytes = 64;
-    const @vtype512@ b = @vpre512@_set1_@vsuf@(ip2[0]);
-    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
-        op[i] = ip1[i] @OP@ ip2[0];
-    if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
-        LOOP_BLOCKED(@type@, vector_size_bytes) {
-            @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
-            @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
-            @vpre512@_store_@vsuf@(&op[i], c);
-        }
-    }
-    else {
-        LOOP_BLOCKED(@type@, vector_size_bytes) {
-            @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
-            @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
-            @vpre512@_store_@vsuf@(&op[i], c);
-        }
-    }
-
-#elif defined NPY_HAVE_AVX2
-    const npy_intp vector_size_bytes = 32;
-    const @vtype256@ b = @vpre256@_set1_@vsuf@(ip2[0]);
-    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
-        op[i] = ip1[i] @OP@ ip2[0];
-    if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
-        LOOP_BLOCKED(@type@, vector_size_bytes) {
-            @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
-            @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
-            @vpre256@_store_@vsuf@(&op[i], c);
-        }
-    }
-    else {
-        LOOP_BLOCKED(@type@, vector_size_bytes) {
-            @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
-            @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
-            @vpre256@_store_@vsuf@(&op[i], c);
-        }
-    }
-#else
-    const npy_intp vector_size_bytes = 16;
-    const @vtype@ b = @vpre@_set1_@vsuf@(ip2[0]);
-    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
-        op[i] = ip1[i] @OP@ ip2[0];
-    if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
-        LOOP_BLOCKED(@type@, vector_size_bytes) {
-            @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
-            @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
-            @vpre@_store_@vsuf@(&op[i], c);
-        }
-    }
-    else {
-        LOOP_BLOCKED(@type@, vector_size_bytes) {
-            @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
-            @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
-            @vpre@_store_@vsuf@(&op[i], c);
-        }
-    }
-#endif
-    LOOP_BLOCKED_END {
-        op[i] = ip1[i] @OP@ ip2[0];
+    char *ip1 = args[0];
+    char *indx = args[1];
+    char *value = args[2];
+    npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+    npy_intp n = dimensions[0];
+    npy_intp i;
+    @type@ *indexed;
+    for(i = 0; i < n; i++, indx += isindex, value += isb) {
+        indexed = (@type@ *)(ip1 + is1 * *(npy_intp *)indx);
+        *indexed = *indexed @OP@ *(@type@ *)value;
     }
+    return 0;
 }
 
 /**end repeat1**/
 /**end repeat**/
 
-#else // NPY_HAVE_SSE2
+#undef WORKAROUND_CLANG_PARTIAL_LOAD_BUG
 
-/**begin repeat
- *  #type = npy_float, npy_double#
- *  #TYPE = FLOAT, DOUBLE#
- *  #sfx = f32, f64#
- *  #CHK = _F32, _F64#
- */
-#if NPY_SIMD@CHK@
-/**begin repeat1
-* Arithmetic
-* # kind = add, subtract, multiply, divide#
-* # OP = +, -, *, /#
-* # VOP = add, sub, mul, div#
-*/
-
-static void
-simd_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
-{
-    LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) {
-        op[i] = ip1[i] @OP@ ip2[i];
-    }
-    /* lots of specializations, to squeeze out max performance */
-    if (ip1 == ip2) {
-        LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
-            npyv_@sfx@ a = npyv_load_@sfx@(&ip1[i]);
-            npyv_@sfx@ c = npyv_@VOP@_@sfx@(a, a);
-            npyv_store_@sfx@(&op[i], c);
-        }
-    }
-    else {
-        LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
-            npyv_@sfx@ a = npyv_load_@sfx@(&ip1[i]);
-            npyv_@sfx@ b = npyv_load_@sfx@(&ip2[i]);
-            npyv_@sfx@ c = npyv_@VOP@_@sfx@(a, b);
-            npyv_store_@sfx@(&op[i], c);
-        }
-    }
-    LOOP_BLOCKED_END {
-        op[i] = ip1[i] @OP@ ip2[i];
-    }
-}
+//###############################################################################
+//## Complex Single/Double precision
+//###############################################################################
 
-static void
-simd_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
-{
-    const npyv_@sfx@ v1 = npyv_setall_@sfx@(ip1[0]);
-    LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) {
-        op[i] = ip1[0] @OP@ ip2[i];
-    }
-    LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
-        npyv_@sfx@ v2 = npyv_load_@sfx@(&ip2[i]);
-        npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2);
-        npyv_store_@sfx@(&op[i], v3);
-    }
-    LOOP_BLOCKED_END {
-        op[i] = ip1[0] @OP@ ip2[i];
-    }
-}
+/********************************************************************************
+ ** op intrinics
+ ********************************************************************************/
 
-static void
-simd_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
+#if NPY_SIMD_F32
+NPY_FINLINE npyv_f32x2 simd_set2_f32(const float *a)
 {
-    const npyv_@sfx@ v2 = npyv_setall_@sfx@(ip2[0]);
-    LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) {
-        op[i] = ip1[i] @OP@ ip2[0];
-    }
-    LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
-        npyv_@sfx@ v1 = npyv_load_@sfx@(&ip1[i]);
-        npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2);
-        npyv_store_@sfx@(&op[i], v3);
-    }
-    LOOP_BLOCKED_END {
-        op[i] = ip1[i] @OP@ ip2[0];
-    }
+    npyv_f32 fill = npyv_reinterpret_f32_u64(npyv_setall_u64(*(npy_uint64*)a));
+    npyv_f32x2 r;
+    r.val[0] = fill;
+    r.val[1] = fill;
+    return r;
 }
-/**end repeat1**/
-#endif /* NPY_SIMD@CHK@ */
-/**end repeat**/
-#endif // NPY_HAVE_SSE2
 
-/**begin repeat
- * Float types
- *  #type = npy_float, npy_double, npy_longdouble#
- *  #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
- *  #vector = 1, 1, 0#
- *  #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64, 0 #
- */
-/**begin repeat1
- * Arithmetic
- * # kind = add, subtract, multiply, divide#
- */
-static NPY_INLINE int
-run_binary_simd_@kind@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
+NPY_FINLINE npyv_f32
+simd_cconjugate_f32(npyv_f32 x)
 {
-#if @vector@ && defined NPY_HAVE_SSE2
-    @type@ * ip1 = (@type@ *)args[0];
-    @type@ * ip2 = (@type@ *)args[1];
-    @type@ * op = (@type@ *)args[2];
-    npy_intp n = dimensions[0];
-#if defined NPY_HAVE_AVX512F
-    const npy_uintp vector_size_bytes = 64;
-#elif defined NPY_HAVE_AVX2
-    const npy_uintp vector_size_bytes = 32;
+#if NPY_SIMD_BIGENDIAN
+    const npyv_f32 mask = npyv_reinterpret_f32_u64(npyv_setall_u64(0x80000000));
 #else
-    const npy_uintp vector_size_bytes = 32;
-#endif
-    /* argument one scalar */
-    if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), vector_size_bytes)) {
-        sse2_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n);
-        return 1;
-    }
-    /* argument two scalar */
-    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), vector_size_bytes)) {
-        sse2_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n);
-        return 1;
-    }
-    else if (IS_BLOCKABLE_BINARY(sizeof(@type@), vector_size_bytes)) {
-        sse2_binary_@kind@_@TYPE@(op, ip1, ip2, n);
-        return 1;
-    }
-#elif @VECTOR@
-    @type@ * ip1 = (@type@ *)args[0];
-    @type@ * ip2 = (@type@ *)args[1];
-    @type@ * op = (@type@ *)args[2];
-    npy_intp n = dimensions[0];
-    /* argument one scalar */
-    if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), NPY_SIMD_WIDTH)) {
-        simd_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n);
-        return 1;
-    }
-    /* argument two scalar */
-    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH)) {
-        simd_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n);
-        return 1;
-    }
-    else if (IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH)) {
-        simd_binary_@kind@_@TYPE@(op, ip1, ip2, n);
-        return 1;
-    }
+    const npyv_f32 mask = npyv_reinterpret_f32_u64(npyv_setall_u64(0x8000000000000000ULL));
 #endif
-    return 0;
+    return npyv_xor_f32(x, mask);
 }
-/**end repeat1**/
-/**end repeat**/
 
-/********************************************************************************
- ** Defining ufunc inner functions
- ********************************************************************************/
-/**begin repeat
- * Float types
- *  #type = npy_float, npy_double#
- *  #TYPE = FLOAT, DOUBLE#
- *  #c = f, #
- *  #C = F, #
- */
-/**begin repeat1
- * Arithmetic
- * # kind = add, subtract, multiply, divide#
- * # OP = +, -, *, /#
- * # PW = 1, 0, 0, 0#
- */
-NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
-(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+NPY_FINLINE npyv_f32
+simd_cmul_f32(npyv_f32 a, npyv_f32 b)
 {
-    if (IS_BINARY_REDUCE) {
-#if @PW@
-        @type@ * iop1 = (@type@ *)args[0];
-        npy_intp n = dimensions[0];
-
-        *iop1 @OP@= @TYPE@_pairwise_sum(args[1], n, steps[1]);
-#else
-        BINARY_REDUCE_LOOP(@type@) {
-            io1 @OP@= *(@type@ *)ip2;
-        }
-        *((@type@ *)iop1) = io1;
-#endif
-    }
-    else if (!run_binary_simd_@kind@_@TYPE@(args, dimensions, steps)) {
-        BINARY_LOOP {
-            const @type@ in1 = *(@type@ *)ip1;
-            const @type@ in2 = *(@type@ *)ip2;
-            *((@type@ *)op1) = in1 @OP@ in2;
-        }
-    }
+    npyv_f32 b_rev = npyv_permi128_f32(b, 1, 0, 3, 2);
+    npyv_f32 a_re = npyv_permi128_f32(a, 0, 0, 2, 2);
+    npyv_f32 a_im = npyv_permi128_f32(a, 1, 1, 3, 3);
+    // a_im * b_im, a_im * b_re
+    npyv_f32 ab_iiir = npyv_mul_f32(a_im, b_rev);
+    return npyv_muladdsub_f32(a_re, b, ab_iiir);
 }
-/**end repeat1**/
-/**end repeat**/
 
-//###############################################################################
-//## Complex Single/Double precision
-//###############################################################################
-/********************************************************************************
- ** Defining the SIMD kernels
- ********************************************************************************/
-#if !defined(_MSC_VER) && defined(NPY_HAVE_AVX512F)
-    /**
-     * For somehow MSVC commit aggressive optimization lead
-     * to raises 'RuntimeWarning: invalid value encountered in multiply'
-     *
-     * the issue mainly caused by '_mm512_maskz_loadu_ps', we need to
-     * investigate about it while moving to NPYV.
-     */
-    #define AVX512F_NOMSVC
+NPY_FINLINE npyv_f32
+simd_csquare_f32(npyv_f32 x)
+{ return simd_cmul_f32(x, x); }
 #endif
 
-#ifdef AVX512F_NOMSVC
-NPY_FINLINE __mmask16
-avx512_get_full_load_mask_ps(void)
-{
-    return 0xFFFF;
-}
-
-NPY_FINLINE __mmask8
-avx512_get_full_load_mask_pd(void)
-{
-    return 0xFF;
-}
-NPY_FINLINE __m512
-avx512_masked_load_ps(__mmask16 mask, npy_float* addr)
-{
-    return _mm512_maskz_loadu_ps(mask, (__m512 *)addr);
-}
+#if NPY_SIMD_F64
 
-NPY_FINLINE __m512d
-avx512_masked_load_pd(__mmask8 mask, npy_double* addr)
+NPY_FINLINE npyv_f64x2 simd_set2_f64(const double *a)
 {
-    return _mm512_maskz_loadu_pd(mask, (__m512d *)addr);
+    npyv_f64 r = npyv_setall_f64(a[0]);
+    npyv_f64 i = npyv_setall_f64(a[1]);
+    return npyv_zip_f64(r, i);
 }
 
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
-avx512_get_partial_load_mask_ps(const npy_int num_elem, const npy_int total_elem)
+NPY_FINLINE npyv_f64
+simd_cconjugate_f64(npyv_f64 x)
 {
-    return (0x0001 << num_elem) - 0x0001;
+    const npyv_f64 mask = npyv_reinterpret_f64_u64(npyv_set_u64(
+       0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL,
+       0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL,
+       0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL,
+       0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL,
+       0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL,
+       0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL,
+       0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL,
+       0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL
+    ));
+    return npyv_xor_f64(x, mask);
 }
 
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
-avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem)
-{
-    return (0x01 << num_elem) - 0x01;
-}
-/**begin repeat
- *  #vsub  = ps, pd#
- *  #type= npy_float, npy_double#
- *  #epi_vsub  = epi32, epi64#
- *  #vtype = __m512, __m512d#
- *  #mask = __mmask16, __mmask8#
- *  #and_const = 0x7fffffff, 0x7fffffffffffffffLL#
- *  #neg_mask = 0x80000000, 0x8000000000000000#
- *  #perm_ = 0xb1, 0x55#
- *  #cmpx_img_mask = 0xAAAA, 0xAA#
- *  #cmpx_re_mask = 0x5555, 0x55#
- *  #INF = NPY_INFINITYF, NPY_INFINITY#
- *  #NAN = NPY_NANF, NPY_NAN#
- */
-NPY_FINLINE @vtype@
-avx512_hadd_@vsub@(const @vtype@ x)
+NPY_FINLINE npyv_f64
+simd_cmul_f64(npyv_f64 a, npyv_f64 b)
 {
-    return _mm512_add_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@));
+    npyv_f64 b_rev = npyv_permi128_f64(b, 1, 0);
+    npyv_f64 a_re = npyv_permi128_f64(a, 0, 0);
+    npyv_f64 a_im = npyv_permi128_f64(a, 1, 1);
+    // a_im * b_im, a_im * b_re
+    npyv_f64 ab_iiir = npyv_mul_f64(a_im, b_rev);
+    return npyv_muladdsub_f64(a_re, b, ab_iiir);
 }
 
-NPY_FINLINE @vtype@
-avx512_hsub_@vsub@(const @vtype@ x)
-{
-    return _mm512_sub_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@));
-}
-NPY_FINLINE @vtype@
-avx512_cmul_@vsub@(@vtype@ x1, @vtype@ x2)
-{
-    // x1 = r1, i1
-    // x2 = r2, i2
-    @vtype@ x3  = _mm512_permute_@vsub@(x2, @perm_@);   // i2, r2
-    @vtype@ x12 = _mm512_mul_@vsub@(x1, x2);            // r1*r2, i1*i2
-    @vtype@ x13 = _mm512_mul_@vsub@(x1, x3);            // r1*i2, r2*i1
-    @vtype@ outreal = avx512_hsub_@vsub@(x12);          // r1*r2 - i1*i2, r1*r2 - i1*i2
-    @vtype@ outimg  = avx512_hadd_@vsub@(x13);          // r1*i2 + i1*r2, r1*i2 + i1*r2
-    return _mm512_mask_blend_@vsub@(@cmpx_img_mask@, outreal, outimg);
-}
-/**end repeat**/
+NPY_FINLINE npyv_f64
+simd_csquare_f64(npyv_f64 x)
+{ return simd_cmul_f64(x, x); }
 #endif
 
 /**begin repeat
- * #TYPE = CFLOAT, CDOUBLE#
  * #type = npy_float, npy_double#
- * #num_lanes = 16, 8#
- * #vsuffix = ps, pd#
- * #epi_vsub  = epi32, epi64#
- * #mask = __mmask16, __mmask8#
- * #vtype = __m512, __m512d#
- * #scale = 4, 8#
- * #vindextype = __m512i, __m256i#
- * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256#
- * #storemask = 0xFF, 0xF#
- * #IS_FLOAT = 1, 0#
- */
-/**begin repeat1
- *  #func = add, subtract, multiply#
- *  #vectorf = _mm512_add, _mm512_sub, avx512_cmul#
- */
-#if defined AVX512F_NOMSVC
-static NPY_INLINE void
-AVX512F_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps)
-{
-    const npy_intp array_size = dimensions[0];
-    npy_intp num_remaining_elements = 2*array_size;
-    @type@* ip1 = (@type@*) args[0];
-    @type@* ip2 = (@type@*) args[1];
-    @type@* op  = (@type@*) args[2];
-
-    @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
-
-    while (num_remaining_elements > 0) {
-        if (num_remaining_elements < @num_lanes@) {
-            load_mask = avx512_get_partial_load_mask_@vsuffix@(
-                                    num_remaining_elements, @num_lanes@);
-        }
-        @vtype@ x1, x2;
-        x1 = avx512_masked_load_@vsuffix@(load_mask, ip1);
-        x2 = avx512_masked_load_@vsuffix@(load_mask, ip2);
-
-        @vtype@ out = @vectorf@_@vsuffix@(x1, x2);
-
-        _mm512_mask_storeu_@vsuffix@(op, load_mask, out);
-
-        ip1 += @num_lanes@;
-        ip2 += @num_lanes@;
-        op += @num_lanes@;
-        num_remaining_elements -= @num_lanes@;
-    }
-}
-#endif // AVX512F_NOMSVC
-/**end repeat1**/
-/**end repeat**/
-
-/**begin repeat
- * #TYPE = CFLOAT, CDOUBLE#
- * #type= npy_float, npy_double#
- * #esize = 8, 16#
- */
-/**begin repeat1
- *  #func = add, subtract, multiply#
+ * #sfx = f32, f64#
+ * #bsfx = b32, b64#
+ * #usfx = b32, u64#
+ * #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64#
+ * #is_double = 0, 1#
+ * #c = f, #
+ * #INF = NPY_INFINITYF, NPY_INFINITY#
+ * #NAN = NPY_NANF, NPY_NAN#
  */
-static NPY_INLINE int
-run_binary_avx512f_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps)
+#if @VECTOR@
+NPY_FINLINE npyv_@sfx@
+simd_cabsolute_@sfx@(npyv_@sfx@ re, npyv_@sfx@ im)
 {
-#if defined AVX512F_NOMSVC
-    if (IS_BINARY_STRIDE_ONE(@esize@, 64)) {
-        AVX512F_@func@_@TYPE@(args, dimensions, steps);
-        return 1;
-    }
-    else
-        return 0;
-#endif
-    return 0;
+    const npyv_@sfx@ inf = npyv_setall_@sfx@(@INF@);
+    const npyv_@sfx@ nan = npyv_setall_@sfx@(@NAN@);
+
+    re = npyv_abs_@sfx@(re);
+    im = npyv_abs_@sfx@(im);
+    /*
+     * If real or imag = INF, then convert it to inf + j*inf
+     * Handles: inf + j*nan, nan + j*inf
+     */
+    npyv_@bsfx@ re_infmask = npyv_cmpeq_@sfx@(re, inf);
+    npyv_@bsfx@ im_infmask = npyv_cmpeq_@sfx@(im, inf);
+    im = npyv_select_@sfx@(re_infmask, inf, im);
+    re = npyv_select_@sfx@(im_infmask, inf, re);
+    /*
+     * If real or imag = NAN, then convert it to nan + j*nan
+     * Handles: x + j*nan, nan + j*x
+     */
+    npyv_@bsfx@ re_nnanmask = npyv_notnan_@sfx@(re);
+    npyv_@bsfx@ im_nnanmask = npyv_notnan_@sfx@(im);
+    im = npyv_select_@sfx@(re_nnanmask, im, nan);
+    re = npyv_select_@sfx@(im_nnanmask, re, nan);
+
+    npyv_@sfx@ larger  = npyv_max_@sfx@(re, im);
+    npyv_@sfx@ smaller = npyv_min_@sfx@(im, re);
+    /*
+     * Calculate div_mask to prevent 0./0. and inf/inf operations in div
+     */
+    npyv_@bsfx@ zeromask = npyv_cmpeq_@sfx@(larger, npyv_zero_@sfx@());
+    npyv_@bsfx@ infmask = npyv_cmpeq_@sfx@(smaller, inf);
+    npyv_@bsfx@ div_mask = npyv_not_@bsfx@(npyv_or_@bsfx@(zeromask, infmask));
+
+    npyv_@sfx@ ratio = npyv_ifdivz_@sfx@(div_mask, smaller, larger);
+    npyv_@sfx@ hypot = npyv_sqrt_@sfx@(
+        npyv_muladd_@sfx@(ratio, ratio, npyv_setall_@sfx@(1.0@c@)
+    ));
+    return npyv_mul_@sfx@(hypot, larger);
 }
-/**end repeat1**/
+#endif // VECTOR
 /**end repeat**/
 
 /********************************************************************************
@@ -724,55 +411,345 @@ run_binary_avx512f_@func@_@TYPE@(char **args, const npy_intp *dimensions, const
  * complex types
  * #TYPE = CFLOAT, CDOUBLE#
  * #ftype = npy_float, npy_double#
+ * #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64#
+ * #sfx = f32, f64#
  * #c = f, #
  * #C = F, #
  */
 /**begin repeat1
  * arithmetic
- * #kind = add, subtract#
- * #OP = +, -#
- * #PW = 1, 0#
+ * #kind = add, subtract, multiply#
+ * #vectorf = npyv_add, npyv_sub, simd_cmul#
+ * #OP = +, -, *#
+ * #PW = 1, 0, 0#
+ * #is_mul = 0*2, 1#
  */
 NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
 (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
-    // Parenthesis around @PW@ tells clang dead code is intentional
-    if (IS_BINARY_REDUCE && (@PW@)) {
-        npy_intp n = dimensions[0];
-        @ftype@ * or = ((@ftype@ *)args[0]);
-        @ftype@ * oi = ((@ftype@ *)args[0]) + 1;
+    npy_intp len = dimensions[0];
+    char *b_src0 = args[0], *b_src1 = args[1], *b_dst = args[2];
+    npy_intp b_ssrc0 = steps[0], b_ssrc1 = steps[1], b_sdst = steps[2];
+#if @PW@
+    // reduce
+    if (b_ssrc0 == 0 && b_ssrc0 == b_sdst && b_src0 == b_dst &&
+        b_ssrc1 % (sizeof(@ftype@)*2) == 0
+    ) {
+        @ftype@ *rl_im = (@ftype@ *)b_src0;
         @ftype@ rr, ri;
-
-        @TYPE@_pairwise_sum(&rr, &ri, args[1], n * 2, steps[1] / 2);
-        *or @OP@= rr;
-        *oi @OP@= ri;
+        @TYPE@_pairwise_sum(&rr, &ri, b_src1, len * 2, b_ssrc1 / 2);
+        rl_im[0] @OP@= rr;
+        rl_im[1] @OP@= ri;
         return;
     }
-    if (!run_binary_avx512f_@kind@_@TYPE@(args, dimensions, steps)) {
-        BINARY_LOOP {
-            const @ftype@ in1r = ((@ftype@ *)ip1)[0];
-            const @ftype@ in1i = ((@ftype@ *)ip1)[1];
-            const @ftype@ in2r = ((@ftype@ *)ip2)[0];
-            const @ftype@ in2i = ((@ftype@ *)ip2)[1];
-            ((@ftype@ *)op1)[0] = in1r @OP@ in2r;
-            ((@ftype@ *)op1)[1] = in1i @OP@ in2i;
+#endif
+#if @VECTOR@
+    if (is_mem_overlap(b_src0, b_ssrc0, b_dst, b_sdst, len) ||
+        is_mem_overlap(b_src1, b_ssrc1, b_dst, b_sdst, len) ||
+        b_sdst  % sizeof(@ftype@) != 0 || b_sdst == 0 ||
+        b_ssrc0 % sizeof(@ftype@) != 0 ||
+        b_ssrc1 % sizeof(@ftype@) != 0
+    ) {
+        goto loop_scalar;
+    }
+    const @ftype@ *src0 = (@ftype@*)b_src0;
+    const @ftype@ *src1 = (@ftype@*)b_src1;
+          @ftype@ *dst  = (@ftype@*)b_dst;
+
+    const npy_intp ssrc0 = b_ssrc0 / sizeof(@ftype@);
+    const npy_intp ssrc1 = b_ssrc1 / sizeof(@ftype@);
+    const npy_intp sdst  = b_sdst / sizeof(@ftype@);
+
+    const int vstep = npyv_nlanes_@sfx@;
+    const int wstep = vstep * 2;
+    const int hstep = vstep / 2;
+
+    const int loadable0 = npyv_loadable_stride_s64(ssrc0);
+    const int loadable1 = npyv_loadable_stride_s64(ssrc1);
+    const int storable = npyv_storable_stride_s64(sdst);
+
+    // lots**lots of specializations, to squeeze out max performance
+    // contig
+    if (ssrc0 == 2 && ssrc0 == ssrc1 && ssrc0 == sdst) {
+        for (; len >= vstep; len -= vstep, src0 += wstep, src1 += wstep, dst += wstep) {
+            npyv_@sfx@ a0 = npyv_load_@sfx@(src0);
+            npyv_@sfx@ a1 = npyv_load_@sfx@(src0 + vstep);
+            npyv_@sfx@ b0 = npyv_load_@sfx@(src1);
+            npyv_@sfx@ b1 = npyv_load_@sfx@(src1 + vstep);
+            npyv_@sfx@ r0 = @vectorf@_@sfx@(a0, b0);
+            npyv_@sfx@ r1 = @vectorf@_@sfx@(a1, b1);
+            npyv_store_@sfx@(dst, r0);
+            npyv_store_@sfx@(dst + vstep, r1);
+        }
+        for (; len > 0; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) {
+            npyv_@sfx@ a = npyv_load2_tillz_@sfx@(src0, len);
+            npyv_@sfx@ b = npyv_load2_tillz_@sfx@(src1, len);
+            npyv_@sfx@ r = @vectorf@_@sfx@(a, b);
+            npyv_store2_till_@sfx@(dst, len, r);
+        }
+    }
+    // scalar 0
+    else if (ssrc0 == 0) {
+        npyv_@sfx@x2 a = simd_set2_@sfx@(src0);
+        // contig
+        if (ssrc1 == 2 && sdst == ssrc1) {
+            for (; len >= vstep; len -= vstep, src1 += wstep, dst += wstep) {
+                npyv_@sfx@ b0 = npyv_load_@sfx@(src1);
+                npyv_@sfx@ b1 = npyv_load_@sfx@(src1 + vstep);
+                npyv_@sfx@ r0 = @vectorf@_@sfx@(a.val[0], b0);
+                npyv_@sfx@ r1 = @vectorf@_@sfx@(a.val[1], b1);
+                npyv_store_@sfx@(dst, r0);
+                npyv_store_@sfx@(dst + vstep, r1);
+            }
+            for (; len > 0; len -= hstep, src1 += vstep, dst += vstep) {
+            #if @is_mul@
+                npyv_@sfx@ b = npyv_load2_till_@sfx@(src1, len, 1.0@c@, 1.0@c@);
+            #else
+                npyv_@sfx@ b = npyv_load2_tillz_@sfx@(src1, len);
+            #endif
+                npyv_@sfx@ r = @vectorf@_@sfx@(a.val[0], b);
+                npyv_store2_till_@sfx@(dst, len, r);
+            }
+        }
+        // non-contig
+        else if (loadable1 && storable) {
+            for (; len >= vstep; len -= vstep, src1 += ssrc1*vstep, dst += sdst*vstep) {
+                npyv_@sfx@ b0 = npyv_loadn2_@sfx@(src1, ssrc1);
+                npyv_@sfx@ b1 = npyv_loadn2_@sfx@(src1 + ssrc1*hstep, ssrc1);
+                npyv_@sfx@ r0 = @vectorf@_@sfx@(a.val[0], b0);
+                npyv_@sfx@ r1 = @vectorf@_@sfx@(a.val[1], b1);
+                npyv_storen2_@sfx@(dst, sdst, r0);
+                npyv_storen2_@sfx@(dst + sdst*hstep, sdst, r1);
+            }
+            for (; len > 0; len -= hstep, src1 += ssrc1*hstep, dst += sdst*hstep) {
+            #if @is_mul@
+                npyv_@sfx@ b = npyv_loadn2_till_@sfx@(src1, ssrc1, len, 1.0@c@, 1.0@c@);
+            #else
+                npyv_@sfx@ b = npyv_loadn2_tillz_@sfx@(src1, ssrc1, len);
+            #endif
+                npyv_@sfx@ r = @vectorf@_@sfx@(a.val[0], b);
+                npyv_storen2_till_@sfx@(dst, sdst, len, r);
+            }
+        }
+        else {
+            goto loop_scalar;
+        }
+    }
+    // scalar 1
+    else if (ssrc1 == 0) {
+        npyv_@sfx@x2 b = simd_set2_@sfx@(src1);
+        if (ssrc0 == 2 && sdst == ssrc0) {
+            for (; len >= vstep; len -= vstep, src0 += wstep, dst += wstep) {
+                npyv_@sfx@ a0 = npyv_load_@sfx@(src0);
+                npyv_@sfx@ a1 = npyv_load_@sfx@(src0 + vstep);
+                npyv_@sfx@ r0 = @vectorf@_@sfx@(a0, b.val[0]);
+                npyv_@sfx@ r1 = @vectorf@_@sfx@(a1, b.val[1]);
+                npyv_store_@sfx@(dst, r0);
+                npyv_store_@sfx@(dst + vstep, r1);
+            }
+            for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) {
+            #if @is_mul@
+                npyv_@sfx@ a = npyv_load2_till_@sfx@(src0, len, 1.0@c@, 1.0@c@);
+            #else
+                npyv_@sfx@ a = npyv_load2_tillz_@sfx@(src0, len);
+            #endif
+                npyv_@sfx@ r = @vectorf@_@sfx@(a, b.val[0]);
+                npyv_store2_till_@sfx@(dst, len, r);
+            }
+        }
+        // non-contig
+        else if (loadable0 && storable) {
+            for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep, dst += sdst*vstep) {
+                npyv_@sfx@ a0 = npyv_loadn2_@sfx@(src0, ssrc0);
+                npyv_@sfx@ a1 = npyv_loadn2_@sfx@(src0 + ssrc0*hstep, ssrc0);
+                npyv_@sfx@ r0 = @vectorf@_@sfx@(a0, b.val[0]);
+                npyv_@sfx@ r1 = @vectorf@_@sfx@(a1, b.val[1]);
+                npyv_storen2_@sfx@(dst, sdst, r0);
+                npyv_storen2_@sfx@(dst + sdst*hstep, sdst, r1);
+            }
+            for (; len > 0; len -= hstep, src0 += ssrc0*hstep, dst += sdst*hstep) {
+            #if @is_mul@
+                npyv_@sfx@ a = npyv_loadn2_till_@sfx@(src0, ssrc0, len, 1.0@c@, 1.0@c@);
+            #else
+                npyv_@sfx@ a = npyv_loadn2_tillz_@sfx@(src0, ssrc0, len);
+            #endif
+                npyv_@sfx@ r = @vectorf@_@sfx@(a, b.val[0]);
+                npyv_storen2_till_@sfx@(dst, sdst, len, r);
+            }
         }
+        else {
+            goto loop_scalar;
+        }
+    }
+    #if @is_mul@
+    // non-contig
+    else if (loadable0 && loadable1 && storable) {
+        for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep,
+                            src1 += ssrc1*vstep, dst += sdst*vstep
+        ) {
+            npyv_@sfx@ a0 = npyv_loadn2_@sfx@(src0, ssrc0);
+            npyv_@sfx@ a1 = npyv_loadn2_@sfx@(src0 + ssrc0*hstep, ssrc0);
+            npyv_@sfx@ b0 = npyv_loadn2_@sfx@(src1, ssrc1);
+            npyv_@sfx@ b1 = npyv_loadn2_@sfx@(src1 + ssrc1*hstep, ssrc1);
+            npyv_@sfx@ r0 = @vectorf@_@sfx@(a0, b0);
+            npyv_@sfx@ r1 = @vectorf@_@sfx@(a1, b1);
+            npyv_storen2_@sfx@(dst, sdst, r0);
+            npyv_storen2_@sfx@(dst + sdst*hstep, sdst, r1);
+        }
+        for (; len > 0; len -= hstep, src0 += ssrc0*hstep,
+                       src1 += ssrc1*hstep, dst += sdst*hstep
+        ) {
+        #if @is_mul@
+            npyv_@sfx@ a = npyv_loadn2_till_@sfx@(src0, ssrc0, len, 1.0@c@, 1.0@c@);
+            npyv_@sfx@ b = npyv_loadn2_till_@sfx@(src1, ssrc1, len, 1.0@c@, 1.0@c@);
+        #else
+            npyv_@sfx@ a = npyv_loadn2_tillz_@sfx@(src0, ssrc0, len);
+            npyv_@sfx@ b = npyv_loadn2_tillz_@sfx@(src1, ssrc1, len);
+        #endif
+            npyv_@sfx@ r = @vectorf@_@sfx@(a, b);
+            npyv_storen2_till_@sfx@(dst, sdst, len, r);
+        }
+    }
+    #endif
+    else {
+        goto loop_scalar;
     }
+    npyv_cleanup();
+    return;
+loop_scalar:
+#endif
+    for (; len > 0; --len, b_src0 += b_ssrc0, b_src1 += b_ssrc1, b_dst += b_sdst) {
+        const @ftype@ a_r = ((@ftype@ *)b_src0)[0];
+        const @ftype@ a_i = ((@ftype@ *)b_src0)[1];
+        const @ftype@ b_r = ((@ftype@ *)b_src1)[0];
+        const @ftype@ b_i = ((@ftype@ *)b_src1)[1];
+    #if @is_mul@
+        ((@ftype@ *)b_dst)[0] = a_r*b_r - a_i*b_i;
+        ((@ftype@ *)b_dst)[1] = a_r*b_i + a_i*b_r;
+    #else
+        ((@ftype@ *)b_dst)[0] = a_r @OP@ b_r;
+        ((@ftype@ *)b_dst)[1] = a_i @OP@ b_i;
+    #endif
+    }
+}
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    char *ip1 = args[0];
+    char *indx = args[1];
+    char *value = args[2];
+    npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+    npy_intp n = dimensions[0];
+    npy_intp i;
+    @ftype@ *indexed;
+    for(i = 0; i < n; i++, indx += isindex, value += isb) {
+        indexed = (@ftype@ *)(ip1 + is1 * *(npy_intp *)indx);
+        const @ftype@ b_r = ((@ftype@ *)value)[0];
+        const @ftype@ b_i = ((@ftype@ *)value)[1];
+    #if @is_mul@
+        const @ftype@ a_r = indexed[0];
+        const @ftype@ a_i = indexed[1];
+        indexed[0] = a_r*b_r - a_i*b_i;
+        indexed[1] = a_r*b_i + a_i*b_r;
+    #else
+        indexed[0] @OP@= b_r;
+        indexed[1] @OP@= b_i;
+    #endif
+    }
+    return 0;
 }
 /**end repeat1**/
 
-NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_multiply)
+/**begin repeat1
+ *  #kind = conjugate, square#
+ *  #is_square = 0, 1#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
 (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
-    if (!run_binary_avx512f_multiply_@TYPE@(args, dimensions, steps)) {
-        BINARY_LOOP {
-            const @ftype@ in1r = ((@ftype@ *)ip1)[0];
-            const @ftype@ in1i = ((@ftype@ *)ip1)[1];
-            const @ftype@ in2r = ((@ftype@ *)ip2)[0];
-            const @ftype@ in2i = ((@ftype@ *)ip2)[1];
-            ((@ftype@ *)op1)[0] = in1r*in2r - in1i*in2i;
-            ((@ftype@ *)op1)[1] = in1r*in2i + in1i*in2r;
+    npy_intp len = dimensions[0];
+    char *b_src = args[0], *b_dst = args[1];
+    npy_intp b_ssrc = steps[0], b_sdst = steps[1];
+#if @VECTOR@
+    if (is_mem_overlap(b_src, b_ssrc, b_dst, b_sdst, len) ||
+        b_sdst % sizeof(@ftype@) != 0 ||
+        b_ssrc % sizeof(@ftype@) != 0
+    ) {
+        goto loop_scalar;
+    }
+    const @ftype@ *src  = (@ftype@*)b_src;
+          @ftype@ *dst  = (@ftype@*)b_dst;
+    const npy_intp ssrc = b_ssrc / sizeof(@ftype@);
+    const npy_intp sdst = b_sdst / sizeof(@ftype@);
+
+    const int vstep = npyv_nlanes_@sfx@;
+    const int wstep = vstep * 2;
+    const int hstep = vstep / 2;
+
+    if (ssrc == 2 && ssrc == sdst) {
+        for (; len >= vstep; len -= vstep, src += wstep, dst += wstep) {
+            npyv_@sfx@ a0 = npyv_load_@sfx@(src);
+            npyv_@sfx@ a1 = npyv_load_@sfx@(src + vstep);
+            npyv_@sfx@ r0 = simd_c@kind@_@sfx@(a0);
+            npyv_@sfx@ r1 = simd_c@kind@_@sfx@(a1);
+            npyv_store_@sfx@(dst, r0);
+            npyv_store_@sfx@(dst + vstep, r1);
+        }
+        for (; len > 0; len -= hstep, src += vstep, dst += vstep) {
+            npyv_@sfx@ a = npyv_load2_tillz_@sfx@(src, len);
+            npyv_@sfx@ r = simd_c@kind@_@sfx@(a);
+            npyv_store2_till_@sfx@(dst, len, r);
+        }
+    }
+    else if (ssrc == 2 && npyv_storable_stride_s64(sdst)) {
+        for (; len >= vstep; len -= vstep, src += wstep, dst += sdst*vstep) {
+            npyv_@sfx@ a0 = npyv_load_@sfx@(src);
+            npyv_@sfx@ a1 = npyv_load_@sfx@(src + vstep);
+            npyv_@sfx@ r0 = simd_c@kind@_@sfx@(a0);
+            npyv_@sfx@ r1 = simd_c@kind@_@sfx@(a1);
+            npyv_storen2_@sfx@(dst, sdst, r0);
+            npyv_storen2_@sfx@(dst + sdst*hstep, sdst, r1);
+        }
+        for (; len > 0; len -= hstep, src += vstep, dst += sdst*hstep) {
+            npyv_@sfx@ a = npyv_load2_tillz_@sfx@(src, len);
+            npyv_@sfx@ r = simd_c@kind@_@sfx@(a);
+            npyv_storen2_till_@sfx@(dst, sdst, len, r);
+        }
+    }
+    else if (sdst == 2 && npyv_loadable_stride_s64(ssrc)) {
+        for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += wstep) {
+            npyv_@sfx@ a0 = npyv_loadn2_@sfx@(src, ssrc);
+            npyv_@sfx@ a1 = npyv_loadn2_@sfx@(src + ssrc*hstep, ssrc);
+            npyv_@sfx@ r0 = simd_c@kind@_@sfx@(a0);
+            npyv_@sfx@ r1 = simd_c@kind@_@sfx@(a1);
+            npyv_store_@sfx@(dst, r0);
+            npyv_store_@sfx@(dst + vstep, r1);
+        }
+        for (; len > 0; len -= hstep, src += ssrc*hstep, dst += vstep) {
+            npyv_@sfx@ a = npyv_loadn2_tillz_@sfx@((@ftype@*)src, ssrc, len);
+            npyv_@sfx@ r = simd_c@kind@_@sfx@(a);
+            npyv_store2_till_@sfx@(dst, len, r);
         }
     }
+    else {
+        goto loop_scalar;
+    }
+    npyv_cleanup();
+    return;
+loop_scalar:
+#endif
+    for (; len > 0; --len, b_src += b_ssrc, b_dst += b_sdst) {
+        const @ftype@ rl = ((@ftype@ *)b_src)[0];
+        const @ftype@ im = ((@ftype@ *)b_src)[1];
+    #if @is_square@
+        ((@ftype@ *)b_dst)[0] = rl*rl - im*im;
+        ((@ftype@ *)b_dst)[1] = rl*im + im*rl;
+    #else
+        ((@ftype@ *)b_dst)[0] = rl;
+        ((@ftype@ *)b_dst)[1] = -im;
+    #endif
+    }
 }
+/**end repeat1**/
 /**end repeat**/
diff --git a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
index 5b5f13ad1..b6f126298 100644
--- a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
+++ b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
@@ -42,7 +42,7 @@
  * #sfx    = s8, s16, s32, s64#
  * #len    = 8,  16,  32,  64#
  */
-static NPY_INLINE void
+static inline void
 simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
 {
     npyv_lanetype_@sfx@ *src   = (npyv_lanetype_@sfx@ *) args[0];
@@ -108,7 +108,7 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
  * #sfx    = u8, u16, u32, u64#
  * #len    = 8,  16,  32,  64#
  */
-static NPY_INLINE void
+static inline void
 simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
 {
     npyv_lanetype_@sfx@ *src   = (npyv_lanetype_@sfx@ *) args[0];
@@ -207,7 +207,7 @@ vsx4_div_@t@16(npyv_@t@16 a, npyv_@t@16 b)
  * #sfx = u8, u16, u32, u64#
  * #len = 8,  16,  32,  64#
  */
-static NPY_INLINE void
+static inline void
 vsx4_simd_divide_contig_@sfx@(char **args, npy_intp len)
 {
     npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0];
@@ -246,7 +246,7 @@ vsx4_simd_divide_contig_@sfx@(char **args, npy_intp len)
  * #sfx = s8, s16, s32, s64#
  * #len = 8,  16,  32,  64#
  */
-static NPY_INLINE void
+static inline void
 vsx4_simd_divide_contig_@sfx@(char **args, npy_intp len)
 {
     npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0];
@@ -395,6 +395,24 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide)
         }
     }
 }
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_divide_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    char *ip1 = args[0];
+    char *indx = args[1];
+    char *value = args[2];
+    npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+    npy_intp n = dimensions[0];
+    npy_intp i;
+    @type@ *indexed;
+    for(i = 0; i < n; i++, indx += isindex, value += isb) {
+        indexed = (@type@ *)(ip1 + is1 * *(npy_intp *)indx);
+        *indexed = floor_div_@TYPE@(*indexed, *(@type@ *)value);
+    }
+    return 0;
+}
+
 /**end repeat**/
 
 /**begin repeat
@@ -463,4 +481,28 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide)
         }
     }
 }
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_divide_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    char *ip1 = args[0];
+    char *indx = args[1];
+    char *value = args[2];
+    npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+    npy_intp n = dimensions[0];
+    npy_intp i;
+    @type@ *indexed;
+    for(i = 0; i < n; i++, indx += isindex, value += isb) {
+        indexed = (@type@ *)(ip1 + is1 * *(npy_intp *)indx);
+        @type@ in2 = *(@type@ *)value;
+        if (NPY_UNLIKELY(in2 == 0)) {
+            npy_set_floatstatus_divbyzero();
+            *indexed = 0;
+        } else {
+            *indexed = *indexed / in2;
+        }
+    }
+    return 0;
+}
+
 /**end repeat**/
diff --git a/numpy/core/src/umath/loops_autovec.dispatch.c.src b/numpy/core/src/umath/loops_autovec.dispatch.c.src
new file mode 100644
index 000000000..bdbfa0f86
--- /dev/null
+++ b/numpy/core/src/umath/loops_autovec.dispatch.c.src
@@ -0,0 +1,287 @@
+/*@targets
+ ** $maxopt $autovec baseline
+ ** sse2 avx2
+ ** neon
+ ** vsx2
+ ** vx
+ **/
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
+
+/*
+ *****************************************************************************
+ **                           INTEGER LOOPS
+ *****************************************************************************
+ */
+/*
+ * Arithmetic bit shift operations.
+ *
+ * Intel hardware masks bit shift values, so large shifts wrap around
+ * and can produce surprising results. The special handling ensures that
+ * behavior is independent of compiler or hardware.
+ * TODO: We could implement consistent behavior for negative shifts,
+ *       which is undefined in C.
+ */
+#define INT_left_shift_needs_clear_floatstatus
+#define UINT_left_shift_needs_clear_floatstatus
+
+/**begin repeat
+ * #TYPE = BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ *         LONG, ULONG, LONGLONG, ULONGLONG#
+ * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ *         npy_long, npy_ulong, npy_longlong, npy_ulonglong#
+ * #ftype = npy_float, npy_float, npy_float, npy_float, npy_double, npy_double,
+ *          npy_double, npy_double, npy_double, npy_double#
+ * #SIGNED = 1, 0, 1, 0, 1, 0, 1, 0, 1, 0#
+ * #c = hh,uhh,h,uh,,u,l,ul,ll,ull#
+ */
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_positive)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP_FAST(@type@, @type@, *out = +in);
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_square)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    UNARY_LOOP_FAST(@type@, @type@, *out = in * in);
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_reciprocal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    UNARY_LOOP_FAST(@type@, @type@, *out = 1.0 / in);
+}
+
+/**begin repeat1
+ * Arithmetic
+ * #kind = add, subtract, multiply#
+ * #OP = +, -, *#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    if (IS_BINARY_REDUCE) {
+        BINARY_REDUCE_LOOP_FAST(@type@, io1 @OP@= in2);
+    }
+    else {
+        BINARY_LOOP_FAST(@type@, @type@, *out = in1 @OP@ in2);
+    }
+}
+/**end repeat1**/
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_left_shift)
+(char **args, npy_intp const *dimensions, npy_intp const *steps,
+                  void *NPY_UNUSED(func))
+{
+    BINARY_LOOP_FAST(@type@, @type@, *out = npy_lshift@c@(in1, in2));
+#ifdef @TYPE@_left_shift_needs_clear_floatstatus
+    // For some reason, our macOS CI sets an "invalid" flag here, but only
+    // for some types.
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_right_shift)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#ifndef NPY_DO_NOT_OPTIMIZE_@TYPE@_right_shift
+    BINARY_LOOP_FAST(@type@, @type@, *out = npy_rshift@c@(in1, in2));
+#else
+    BINARY_LOOP {
+        @type@ in1 = *(@type@ *)ip1;
+        @type@ in2 = *(@type@ *)ip2;
+        *(@type@ *)op1 = npy_rshift@c@(in1, in2);
+    }
+#endif
+}
+/**end repeat**/
+
+/*
+ *****************************************************************************
+ **                         UNSIGNED INTEGER LOOPS
+ *****************************************************************************
+ */
+/**begin repeat
+ * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG#
+ * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong#
+ * #c    = u,u,u,ul,ull#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_absolute)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP_FAST(@type@, @type@, *out = in);
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_sign)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : 0);
+}
+
+/**begin repeat1
+ * Arithmetic
+ * #kind = bitwise_and, bitwise_or, bitwise_xor#
+ * #OP = &, |, ^#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    if (IS_BINARY_REDUCE) {
+        BINARY_REDUCE_LOOP_FAST(@type@, io1 @OP@= in2);
+    }
+    else {
+        BINARY_LOOP_FAST(@type@, @type@, *out = in1 @OP@ in2);
+    }
+}
+/**end repeat1**/
+
+/**begin repeat1
+ * #kind = logical_and, logical_or#
+ * #OP = &&, ||#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /*
+     * gcc vectorization of this is not good (PR60575) but manual integer
+     * vectorization is too tedious to be worthwhile
+     */
+    BINARY_LOOP_FAST(@type@, npy_bool, *out = in1 @OP@ in2);
+}
+/**end repeat1**/
+
+NPY_FINLINE npy_bool @TYPE@_logical_xor_(@type@ in1, @type@ in2)
+{ return (!!in1) != (!!in2); }
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_logical_xor)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP_FAST(@type@, npy_bool, *out = @TYPE@_logical_xor_(in1, in2));
+}
+
+/**begin repeat1
+ * #kind = isnan, isinf, isfinite#
+ * #func = npy_isnan, npy_isinf, npy_isfinite#
+ * #val = NPY_FALSE, NPY_FALSE, NPY_TRUE#
+ **/
+NPY_NO_EXPORT void  NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /*
+     * The (void)in; suppresses an unused variable warning raised by gcc and allows
+     * us to re-use this macro even though we do not depend on in
+     */
+    UNARY_LOOP_FAST(@type@, npy_bool, (void)in; *out = @val@);
+}
+/**end repeat1**/
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_conjugate)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP_FAST(@type@, @type@, *out = in);
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_logical_not)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP_FAST(@type@, npy_bool, *out = !in);
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_invert)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP_FAST(@type@, @type@, *out = ~in);
+}
+/**end repeat**/
+
+/*
+ *****************************************************************************
+ **                         SIGNED! INTEGER LOOPS
+ *****************************************************************************
+ */
+
+/**begin repeat
+ * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG#
+ * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong#
+ * #c    = ,,,l,ll#
+ */
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_absolute)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP_FAST(@type@, @type@, *out = (in >= 0) ? in : -in);
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_sign)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : (in < 0 ? -1 : 0));
+}
+
+/**begin repeat1
+ * #kind = conjugate, invert, isnan, isinf, isfinite,
+ *         logical_and, logical_or, logical_xor, logical_not,
+ *         bitwise_and, bitwise_or, bitwise_xor#
+ **/
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(U@TYPE@_@kind@)(args, dimensions, steps, func);
+}
+/**end repeat1**/
+/**end repeat**/
+
+/*
+ *****************************************************************************
+ **                             BOOLEAN LOOPS                               **
+ *****************************************************************************
+ */
+/**begin repeat
+ * #kind = isnan, isinf, isfinite#
+ * #func = npy_isnan, npy_isinf, npy_isfinite#
+ * #val = NPY_FALSE, NPY_FALSE, NPY_TRUE#
+ **/
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(UBYTE_@kind@)(args, dimensions, steps, func);
+}
+/**end repeat**/
+
+/*
+ *****************************************************************************
+ **                          HALF-FLOAT LOOPS                               **
+ *****************************************************************************
+ */
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(HALF_absolute)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP_FAST(npy_half, npy_half, *out = in&0x7fffu);
+}
+
+/*
+ *****************************************************************************
+ **                           DATETIME LOOPS                                **
+ *****************************************************************************
+ */
+
+/**begin repeat
+ * #type = npy_datetime, npy_timedelta#
+ * #TYPE = DATETIME, TIMEDELTA#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_isinf)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(ULONGLONG_isinf)(args, dimensions, steps, func);
+}
+/**end repeat**/
diff --git a/numpy/core/src/umath/loops_comparison.dispatch.c.src b/numpy/core/src/umath/loops_comparison.dispatch.c.src
index 2f75593a5..751080871 100644
--- a/numpy/core/src/umath/loops_comparison.dispatch.c.src
+++ b/numpy/core/src/umath/loops_comparison.dispatch.c.src
@@ -234,7 +234,7 @@ static void simd_binary_@kind@_b8(char **args, npy_intp len)
         npyv_b8 a = npyv_cmpeq_u8(npyv_load_u8(src1), vzero);
         npyv_b8 b = npyv_cmpeq_u8(npyv_load_u8(src2), vzero);
         npyv_b8 c = npyv_@VOP@_b8(a, b);
-        npyv_store_u8(dst, npyv_andc_u8(npyv_cvt_u8_b8(c), truemask));
+        npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask));
     }
 
     for (; len > 0; --len, ++src1, ++src2, ++dst) {
@@ -258,7 +258,7 @@ static void simd_binary_scalar1_@kind@_b8(char **args, npy_intp len)
     for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
         npyv_b8 b = npyv_cmpeq_u8(npyv_load_u8(src), vzero);
         npyv_b8 c = npyv_@VOP@_b8(a, b);
-        npyv_store_u8(dst, npyv_andc_u8(npyv_cvt_u8_b8(c), truemask));
+        npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask));
     }
 
     for (; len > 0; --len, ++src, ++dst) {
@@ -281,7 +281,7 @@ static void simd_binary_scalar2_@kind@_b8(char **args, npy_intp len)
     for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
         npyv_b8 a = npyv_cmpeq_u8(npyv_load_u8(src), vzero);
         npyv_b8 c = npyv_@VOP@_b8(a, b);
-        npyv_store_u8(dst, npyv_andc_u8(npyv_cvt_u8_b8(c), truemask));
+        npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask));
     }
 
     for (; len > 0; --len, ++src, ++dst) {
@@ -308,23 +308,27 @@ static void simd_binary_scalar2_@kind@_b8(char **args, npy_intp len)
  * #OP = ==, !=, <, <=#
  */
 #if !((@eq@ || @neq@) && @signed@)
-static NPY_INLINE void
+static inline void
 run_binary_simd_@kind@_@sfx@(char **args, npy_intp const *dimensions, npy_intp const *steps)
 {
 #if @VECTOR@
-    /* argument one scalar */
-    if (IS_BLOCKABLE_BINARY_SCALAR1_BOOL(sizeof(@type@), NPY_SIMD_WIDTH)) {
-        simd_binary_scalar1_@kind@_@sfx@(args, dimensions[0]);
-        return;
-    }
-    /* argument two scalar */
-    else if (IS_BLOCKABLE_BINARY_SCALAR2_BOOL(sizeof(@type@), NPY_SIMD_WIDTH)) {
-        simd_binary_scalar2_@kind@_@sfx@(args, dimensions[0]);
-        return;
-    }
-    else if (IS_BLOCKABLE_BINARY_BOOL(sizeof(@type@), NPY_SIMD_WIDTH)) {
-        simd_binary_@kind@_@sfx@(args, dimensions[0]);
-        return;
+    if (!is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) &&
+        !is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0])
+    ) {
+        /* argument one scalar */
+        if (IS_BINARY_CONT_S1(@type@, npy_bool)) {
+            simd_binary_scalar1_@kind@_@sfx@(args, dimensions[0]);
+            return;
+        }
+        /* argument two scalar */
+        else if (IS_BINARY_CONT_S2(@type@, npy_bool)) {
+            simd_binary_scalar2_@kind@_@sfx@(args, dimensions[0]);
+            return;
+        }
+        else if (IS_BINARY_CONT(@type@, npy_bool)) {
+            simd_binary_@kind@_@sfx@(args, dimensions[0]);
+            return;
+        }
     }
 #endif
 
diff --git a/numpy/core/src/umath/loops_exponent_log.dispatch.c.src b/numpy/core/src/umath/loops_exponent_log.dispatch.c.src
index 8f123a48b..1fac3c150 100644
--- a/numpy/core/src/umath/loops_exponent_log.dispatch.c.src
+++ b/numpy/core/src/umath/loops_exponent_log.dispatch.c.src
@@ -239,7 +239,7 @@ fma_scalef_ps(__m256 poly, __m256 quadrant)
 
 #ifdef SIMD_AVX512F
 
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
+NPY_FINLINE __mmask16
 avx512_get_full_load_mask_ps(void)
 {
     return 0xFFFF;
@@ -1146,7 +1146,7 @@ AVX512F_log_DOUBLE(npy_double * op,
  * #vtype2_scatter = _mm512_mask_i32scatter_epi32, _mm256_mask_i32scatter_epi32#
  * #setzero = _mm512_setzero_epi32, _mm256_setzero_si256#
  */
-static NPY_INLINE void
+static inline void
 AVX512_SKX_ldexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
 {
     const npy_intp stride_ip1 = steps[0]/(npy_intp)sizeof(@type@);
@@ -1215,7 +1215,7 @@ AVX512_SKX_ldexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const
     }
 }
 
-static NPY_INLINE void
+static inline void
 AVX512_SKX_frexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
 {
     const npy_intp stride_ip1 = steps[0]/(npy_intp)sizeof(@type@);
diff --git a/numpy/core/src/umath/loops_logical.dispatch.c.src b/numpy/core/src/umath/loops_logical.dispatch.c.src
new file mode 100644
index 000000000..c07525be4
--- /dev/null
+++ b/numpy/core/src/umath/loops_logical.dispatch.c.src
@@ -0,0 +1,377 @@
+/*@targets
+ ** $maxopt baseline
+ ** neon asimd
+ ** sse2 avx2 avx512_skx
+ ** vsx2
+ ** vx
+ **/
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+#include "lowlevel_strided_loops.h"
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
+
+/*******************************************************************************
+ ** Defining the SIMD kernels
+ ******************************************************************************/
+
+#if NPY_SIMD
+/*
+ * convert any bit set to boolean true so vectorized and normal operations are
+ * consistent, should not be required if bool is used correctly everywhere but
+ * you never know
+ */
+NPY_FINLINE npyv_u8 byte_to_true(npyv_u8 v)
+{
+    const npyv_u8 zero = npyv_zero_u8();
+    const npyv_u8 truemask = npyv_setall_u8(1 == 1);
+    // cmpeq(v, 0) turns 0x00 -> 0xff and non-zero -> 0x00
+    npyv_u8 tmp = npyv_cvt_u8_b8(npyv_cmpeq_u8(v, zero));
+    // tmp is filled with 0xff/0x00, negate and mask to boolean true
+    return npyv_andc_u8(truemask, tmp);
+}
+/*
+ * convert mask vector (0xff/0x00) to boolean true.  similar to byte_to_true(),
+ * but we've already got a mask and can skip negation.
+ */
+NPY_FINLINE npyv_u8 mask_to_true(npyv_b8 v)
+{
+    const npyv_u8 truemask = npyv_setall_u8(1 == 1);
+    return npyv_and_u8(truemask, npyv_cvt_u8_b8(v));
+}
+/*
+ * For logical_and, we have to be careful to handle non-bool inputs where
+ * bits of each operand might not overlap. Example: a = 0x01, b = 0x80
+ * Both evaluate to boolean true, however, a & b is false.  Return value
+ * should be consistent with byte_to_true().
+ */
+NPY_FINLINE npyv_u8 simd_logical_and_u8(npyv_u8 a, npyv_u8 b)
+{
+    const npyv_u8 zero = npyv_zero_u8();
+    const npyv_u8 truemask = npyv_setall_u8(1 == 1);
+    npyv_b8 ma = npyv_cmpeq_u8(a, zero);
+    npyv_b8 mb = npyv_cmpeq_u8(b, zero);
+    npyv_u8 r = npyv_cvt_u8_b8(npyv_or_b8(ma, mb));
+    return npyv_andc_u8(truemask, r);
+}
+/*
+ * We don't really need the following, but it simplifies the templating code
+ * below since it is paired with simd_logical_and_u8() above.
+ */
+NPY_FINLINE npyv_u8 simd_logical_or_u8(npyv_u8 a, npyv_u8 b)
+{
+    npyv_u8 r = npyv_or_u8(a, b);
+    return byte_to_true(r);
+}
+
+
+/**begin repeat
+ * #kind = logical_and, logical_or#
+ * #and  = 1, 0#
+ * #scalar_op = &&, ||#
+ * #intrin = and, or#
+ * #reduce = min, max#
+ * #scalar_cmp = ==, !=#
+ * #anyall = all, any#
+ */
+static void
+simd_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2, npy_intp len)
+{
+    #define UNROLL 16
+
+    const int vstep = npyv_nlanes_u8;
+    const int wstep = vstep * UNROLL;
+
+    // Unrolled vectors loop
+    for (; len >= wstep; len -= wstep, ip1 += wstep, ip2 += wstep, op += wstep) {
+        /**begin repeat1
+         * #unroll = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+         */
+        #if UNROLL > @unroll@
+        npyv_u8 a@unroll@ = npyv_load_u8(ip1 + vstep * @unroll@);
+        npyv_u8 b@unroll@ = npyv_load_u8(ip2 + vstep * @unroll@);
+        npyv_u8 r@unroll@ = simd_logical_@intrin@_u8(a@unroll@, b@unroll@);
+        npyv_store_u8(op + vstep * @unroll@, r@unroll@);
+        #endif
+        /**end repeat1**/
+    }
+    #undef UNROLL
+
+    // Single vectors loop
+    for (; len >= vstep; len -= vstep, ip1 += vstep, ip2 += vstep, op += vstep) {
+        npyv_u8 a = npyv_load_u8(ip1);
+        npyv_u8 b = npyv_load_u8(ip2);
+        npyv_u8 r = simd_logical_@intrin@_u8(a, b);
+        npyv_store_u8(op, r);
+    }
+
+    // Scalar loop to finish off
+    for (; len > 0; len--, ip1++, ip2++, op++) {
+        *op = *ip1 @scalar_op@ *ip2;
+    }
+}
+
+static void
+simd_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, npy_intp len)
+{
+    #define UNROLL 8
+
+    const int vstep = npyv_nlanes_u8;
+    const int wstep = vstep * UNROLL;
+
+    // Unrolled vectors loop
+    for (; len >= wstep; len -= wstep, ip += wstep) {
+    #if defined(NPY_HAVE_SSE2)
+        NPY_PREFETCH(ip + wstep, 0, 3);
+    #endif
+        npyv_u8 v0 = npyv_load_u8(ip + vstep * 0);
+        npyv_u8 v1 = npyv_load_u8(ip + vstep * 1);
+        npyv_u8 v2 = npyv_load_u8(ip + vstep * 2);
+        npyv_u8 v3 = npyv_load_u8(ip + vstep * 3);
+        npyv_u8 v4 = npyv_load_u8(ip + vstep * 4);
+        npyv_u8 v5 = npyv_load_u8(ip + vstep * 5);
+        npyv_u8 v6 = npyv_load_u8(ip + vstep * 6);
+        npyv_u8 v7 = npyv_load_u8(ip + vstep * 7);
+
+        npyv_u8 m01 = npyv_@reduce@_u8(v0, v1);
+        npyv_u8 m23 = npyv_@reduce@_u8(v2, v3);
+        npyv_u8 m45 = npyv_@reduce@_u8(v4, v5);
+        npyv_u8 m67 = npyv_@reduce@_u8(v6, v7);
+
+        npyv_u8 m0123 = npyv_@reduce@_u8(m01, m23);
+        npyv_u8 m4567 = npyv_@reduce@_u8(m45, m67);
+
+        npyv_u8 mv = npyv_@reduce@_u8(m0123, m4567);
+
+        if(npyv_@anyall@_u8(mv) @scalar_cmp@ 0){
+            *op = !@and@;
+            return;
+        }
+    }
+
+    // Single vectors loop
+    for (; len >= vstep; len -= vstep, ip += vstep) {
+        npyv_u8 v0 = npyv_load_u8(ip);
+        if(npyv_@anyall@_u8(v0) @scalar_cmp@ 0){
+            *op = !@and@;
+            return;
+        }
+    }
+
+    // Scalar loop to finish off
+    for (; len > 0; --len, ++ip) {
+        *op = *op @scalar_op@ *ip;
+        if (*op @scalar_cmp@ 0) {
+            return;
+        }
+    }
+#undef UNROLL
+}
+/**end repeat**/ 
+
+/**begin repeat
+ * #kind = logical_not, absolute#
+ * #op = ==, !=#
+ * #not = 1, 0#
+ */
+static void
+simd_@kind@_BOOL(npy_bool * op, npy_bool * ip, npy_intp len)
+{
+    #define UNROLL 16
+
+    const int vstep = npyv_nlanes_u8;
+    const int wstep = vstep * UNROLL;
+
+    #if @not@
+    const npyv_u8 zero = npyv_zero_u8();
+    #endif
+
+    // Unrolled vectors loop
+    for (; len >= wstep; len -= wstep, ip += wstep, op += wstep) {
+        /**begin repeat1
+         * #unroll = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+         */
+        #if UNROLL > @unroll@
+        npyv_u8 v@unroll@ = npyv_load_u8(ip + vstep * @unroll@);
+#if @not@
+        npyv_u8 r@unroll@ = mask_to_true(npyv_cmpeq_u8(v@unroll@, zero));
+#else
+        npyv_u8 r@unroll@ = byte_to_true(v@unroll@);
+#endif
+        npyv_store_u8(op + vstep * @unroll@, r@unroll@);
+        #endif
+        /**end repeat1**/
+    }
+    #undef UNROLL
+
+    // Single vectors loop
+    for (; len >= vstep; len -= vstep, ip += vstep, op += vstep) {
+        npyv_u8 v = npyv_load_u8(ip);
+#if @not@
+        npyv_u8 r = mask_to_true(npyv_cmpeq_u8(v, zero));
+#else
+        npyv_u8 r = byte_to_true(v);
+#endif
+        npyv_store_u8(op, r);
+    }
+
+    // Scalar loop to finish off
+    for (; len > 0; --len, ++ip, ++op) {
+        *op = (*ip @op@ 0);
+    }
+}
+/**end repeat**/
+
+#endif // NPY_SIMD
+
+/*******************************************************************************
+ ** Defining ufunc inner functions
+ ******************************************************************************/
+
+/**begin repeat
+ * # kind = logical_or, logical_and#
+ */
+static NPY_INLINE int
+run_binary_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    if (sizeof(npy_bool) == 1 &&
+            IS_BLOCKABLE_BINARY(sizeof(npy_bool), NPY_SIMD_WIDTH)) {
+        simd_binary_@kind@_BOOL((npy_bool*)args[2], (npy_bool*)args[0],
+                               (npy_bool*)args[1], dimensions[0]);
+        return 1;
+    }
+#endif
+    return 0;
+}
+
+
+static NPY_INLINE int
+run_reduce_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    if (sizeof(npy_bool) == 1 &&
+            IS_BLOCKABLE_REDUCE(sizeof(npy_bool), NPY_SIMD_WIDTH)) {
+        simd_reduce_@kind@_BOOL((npy_bool*)args[0], (npy_bool*)args[1],
+                                dimensions[0]);
+        return 1;
+    }
+#endif
+    return 0;
+}
+/**end repeat**/
+
+/**begin repeat
+ * #kind = logical_not, absolute#
+ */
+static NPY_INLINE int
+run_unary_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    if (sizeof(npy_bool) == 1 &&
+            IS_BLOCKABLE_UNARY(sizeof(npy_bool), NPY_SIMD_WIDTH)) {
+        simd_@kind@_BOOL((npy_bool*)args[1], (npy_bool*)args[0], dimensions[0]);
+        return 1;
+    }
+#endif
+    return 0;
+}
+/**end repeat**/
+
+
+/**begin repeat
+ * #kind = logical_and, logical_or#
+ * #OP =  &&, ||#
+ * #SC =  ==, !=#
+ * #and = 1, 0#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    if(IS_BINARY_REDUCE) {
+#if NPY_SIMD
+        /*
+         * stick with our variant for more reliable performance, only known
+         * platform which outperforms it by ~20% is an i7 with glibc 2.17
+         */
+        if (run_reduce_simd_@kind@_BOOL(args, dimensions, steps)) {
+            return;
+        }
+#else
+        /* for now only use libc on 32-bit/non-x86 */
+        if (steps[1] == 1) {
+            npy_bool * op = (npy_bool *)args[0];
+#if @and@
+            /* np.all(), search for a zero (false) */
+            if (*op) {
+                *op = memchr(args[1], 0, dimensions[0]) == NULL;
+            }
+#else
+            /*
+             * np.any(), search for a non-zero (true) via comparing against
+             * zero blocks, memcmp is faster than memchr on SSE4 machines
+             * with glibc >= 2.12 and memchr can only check for equal 1
+             */
+            static const npy_bool zero[4096]; /* zero by C standard */
+            npy_uintp i, n = dimensions[0];
+
+            for (i = 0; !*op && i < n - (n % sizeof(zero)); i += sizeof(zero)) {
+                *op = memcmp(&args[1][i], zero, sizeof(zero)) != 0;
+            }
+            if (!*op && n - i > 0) {
+                *op = memcmp(&args[1][i], zero, n - i) != 0;
+            }
+#endif
+            return;
+        }
+#endif
+        else {
+            BINARY_REDUCE_LOOP(npy_bool) {
+                const npy_bool in2 = *(npy_bool *)ip2;
+                io1 = io1 @OP@ in2;
+                if (io1 @SC@ 0) {
+                    break;
+                }
+            }
+            *((npy_bool *)iop1) = io1;
+        }
+    }
+    else {
+        if (run_binary_simd_@kind@_BOOL(args, dimensions, steps)) {
+            return;
+        }
+        else {
+            BINARY_LOOP {
+                const npy_bool in1 = *(npy_bool *)ip1;
+                const npy_bool in2 = *(npy_bool *)ip2;
+                *((npy_bool *)op1) = in1 @OP@ in2;
+            }
+        }
+    }
+}
+/**end repeat**/
+
+/**begin repeat
+ * #kind = logical_not, absolute#
+ * #OP = ==, !=#
+ **/
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    if (run_unary_simd_@kind@_BOOL(args, dimensions, steps)) {
+        return;
+    }
+    else {
+        UNARY_LOOP {
+            npy_bool in1 = *(npy_bool *)ip1;
+            *((npy_bool *)op1) = in1 @OP@ 0;
+        }
+    }
+}
+/**end repeat**/
+
diff --git a/numpy/core/src/umath/loops_minmax.dispatch.c.src b/numpy/core/src/umath/loops_minmax.dispatch.c.src
index 237c8e933..9d8667d38 100644
--- a/numpy/core/src/umath/loops_minmax.dispatch.c.src
+++ b/numpy/core/src/umath/loops_minmax.dispatch.c.src
@@ -451,6 +451,24 @@ clear_fp:
 #endif
 }
 
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    char *ip1 = args[0];
+    char *indx = args[1];
+    char *value = args[2];
+    npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+    npy_intp n = dimensions[0];
+    npy_intp i;
+    @type@ *indexed;
+    for(i = 0; i < n; i++, indx += isindex, value += isb) {
+        indexed = (@type@ *)(ip1 + is1 * *(npy_intp *)indx);
+        *indexed = SCALAR_OP(*indexed, *(@type@ *)value);
+    }
+    return 0;
+}
+
 #undef SCALAR_OP
 
 #endif // !fp_only || (is_fp && fp_only)
diff --git a/numpy/core/src/umath/loops_modulo.dispatch.c.src b/numpy/core/src/umath/loops_modulo.dispatch.c.src
index 53b7da289..25edffb1e 100644
--- a/numpy/core/src/umath/loops_modulo.dispatch.c.src
+++ b/numpy/core/src/umath/loops_modulo.dispatch.c.src
@@ -171,7 +171,7 @@ vsx4_divisor_@sfx@(const npyv_@sfx@ vscalar)
  * #func = fmod, remainder, divmod#
  * #id = 0, 1, 2#
  */
-static NPY_INLINE void
+static inline void
 vsx4_simd_@func@_contig_@sfx@(char **args, npy_intp len)
 {
     npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0];
@@ -239,7 +239,7 @@ vsx4_simd_@func@_contig_@sfx@(char **args, npy_intp len)
     npyv_cleanup();
 }
 
-static NPY_INLINE void
+static inline void
 vsx4_simd_@func@_by_scalar_contig_@sfx@(char **args, npy_intp len)
 {
     npyv_lanetype_@sfx@ *src1  = (npyv_lanetype_@sfx@ *) args[0];
@@ -292,7 +292,7 @@ vsx4_simd_@func@_by_scalar_contig_@sfx@(char **args, npy_intp len)
  * #func = fmod, remainder, divmod#
  * #id = 0, 1, 2#
  */
-static NPY_INLINE void
+static inline void
 vsx4_simd_@func@_contig_@sfx@(char **args, npy_intp len)
 {
     npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0];
@@ -410,7 +410,7 @@ vsx4_simd_@func@_contig_@sfx@(char **args, npy_intp len)
     npyv_cleanup();
 }
 
-static NPY_INLINE void
+static inline void
 vsx4_simd_@func@_by_scalar_contig_@sfx@(char **args, npy_intp len)
 {
     npyv_lanetype_@sfx@ *src1  = (npyv_lanetype_@sfx@ *) args[0];
diff --git a/numpy/core/src/umath/loops_trigonometric.dispatch.c.src b/numpy/core/src/umath/loops_trigonometric.dispatch.c.src
index 78685e807..43eb58ffe 100644
--- a/numpy/core/src/umath/loops_trigonometric.dispatch.c.src
+++ b/numpy/core/src/umath/loops_trigonometric.dispatch.c.src
@@ -9,12 +9,18 @@
 #include "simd/simd.h"
 #include "loops_utils.h"
 #include "loops.h"
+#include "fast_loop_macros.h"
 /*
  * TODO:
  * - use vectorized version of Payne-Hanek style reduction for large elements or
  *   when there's no native FUSED support instead of fallback to libc
  */
-#if NPY_SIMD_F32 && NPY_SIMD_FMA3 // native support
+#if NPY_SIMD_FMA3  // native support
+/**begin repeat
+ *  #check = F64, F32#
+ *  #sfx  = f64, f32#
+ */
+#if NPY_SIMD_@check@
 /*
  * Vectorized Cody-Waite range reduction technique
  * Performs the reduction step x* = x - y*C in three steps:
@@ -23,14 +29,189 @@
  * 3) x* = x - y*c3
  * c1, c2 are exact floating points, c3 = C - c1 - c2 simulates higher precision
  */
-NPY_FINLINE npyv_f32
-simd_range_reduction_f32(npyv_f32 x, npyv_f32 y, npyv_f32 c1, npyv_f32 c2, npyv_f32 c3)
+NPY_FINLINE npyv_@sfx@
+simd_range_reduction_@sfx@(npyv_@sfx@ x, npyv_@sfx@ y, npyv_@sfx@ c1, npyv_@sfx@ c2, npyv_@sfx@ c3)
 {
-    npyv_f32 reduced_x = npyv_muladd_f32(y, c1, x);
-    reduced_x = npyv_muladd_f32(y, c2, reduced_x);
-    reduced_x = npyv_muladd_f32(y, c3, reduced_x);
+    npyv_@sfx@ reduced_x = npyv_muladd_@sfx@(y, c1, x);
+    reduced_x = npyv_muladd_@sfx@(y, c2, reduced_x);
+    reduced_x = npyv_muladd_@sfx@(y, c3, reduced_x);
     return reduced_x;
 }
+#endif
+/**end repeat**/
+
+#if NPY_SIMD_F64
+/**begin repeat
+ *  #op = cos, sin#
+ */
+#if defined(NPY_OS_WIN32) || defined(NPY_OS_CYGWIN)
+NPY_FINLINE npyv_f64
+#else
+NPY_NOINLINE npyv_f64
+#endif
+simd_@op@_scalar_f64(npyv_f64 out, npy_uint64 cmp_bits)
+{
+    // MSVC doesn't compile with direct vector access, so we copy it here
+    // as we have no npyv_get_lane/npyv_set_lane intrinsics
+    npy_double NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) out_copy[npyv_nlanes_f64];
+    npyv_storea_f64(out_copy, out);
+
+    for (unsigned i = 0; i < npyv_nlanes_f64; ++i) {
+        if (cmp_bits & (1 << i)) {
+            out_copy[i] = npy_@op@(out_copy[i]);
+        }
+    }
+
+    return npyv_loada_f64(out_copy);
+}
+/**end repeat**/
+
+/*
+ * Approximate sine algorithm for x \in [-pi/2, pi/2]
+ * worst-case error is 3.5 ulp.
+ * abs error: 0x1.be222a58p-53 in [-pi/2, pi/2].
+ */
+NPY_FINLINE npyv_f64
+simd_approx_sine_poly_f64(npyv_f64 r)
+{
+    const npyv_f64 poly1 = npyv_setall_f64(-0x1.9f4a9c8b21dc9p-41);
+    const npyv_f64 poly2 = npyv_setall_f64(0x1.60e88a10163f2p-33);
+    const npyv_f64 poly3 = npyv_setall_f64(-0x1.ae6361b7254e7p-26);
+    const npyv_f64 poly4 = npyv_setall_f64(0x1.71de382e8d62bp-19);
+    const npyv_f64 poly5 = npyv_setall_f64(-0x1.a01a019aeb4ffp-13);
+    const npyv_f64 poly6 = npyv_setall_f64(0x1.111111110b25ep-7);
+    const npyv_f64 poly7 = npyv_setall_f64(-0x1.55555555554c3p-3);
+
+    npyv_f64 r2 = npyv_mul_f64(r, r);
+    npyv_f64 y = npyv_muladd_f64(poly1, r2, poly2);
+    y = npyv_muladd_f64(y, r2, poly3);
+    y = npyv_muladd_f64(y, r2, poly4);
+    y = npyv_muladd_f64(y, r2, poly5);
+    y = npyv_muladd_f64(y, r2, poly6);
+    y = npyv_muladd_f64(y, r2, poly7);
+    y = npyv_muladd_f64(npyv_mul_f64(y, r2), r, r);
+
+    return y;
+}
+
+/* r = |x| - n*pi  (range reduction into -pi/2 .. pi/2).  */
+NPY_FINLINE npyv_f64
+simd_range_reduction_pi2(npyv_f64 r, npyv_f64 n) {
+    const npyv_f64 pi1 = npyv_setall_f64(-0x1.921fb54442d18p+1);
+    const npyv_f64 pi2 = npyv_setall_f64(-0x1.1a62633145c06p-53);
+    const npyv_f64 pi3 = npyv_setall_f64(-0x1.c1cd129024e09p-106);
+
+    return simd_range_reduction_f64(r, n, pi1, pi2, pi3);
+}
+
+NPY_FINLINE npyv_b64 simd_sin_range_check_f64(npyv_u64 ir) {
+    const npyv_u64 tiny_bound = npyv_setall_u64(0x202); /* top12 (asuint64 (0x1p-509)).  */
+    const npyv_u64 simd_thresh = npyv_setall_u64(0x214); /* top12 (asuint64 (RangeVal)) - SIMD_TINY_BOUND.  */
+
+    return npyv_cmpge_u64(npyv_sub_u64(npyv_shri_u64(ir, 52), tiny_bound), simd_thresh);
+}
+
+NPY_FINLINE npyv_b64 simd_cos_range_check_f64(npyv_u64 ir) {
+    const npyv_f64 range_val = npyv_setall_f64(0x1p23);
+
+    return npyv_cmpge_u64(ir, npyv_reinterpret_u64_f64(range_val));
+}
+
+NPY_FINLINE npyv_f64
+simd_cos_poly_f64(npyv_f64 r, npyv_u64 ir, npyv_u64 sign)
+{
+    const npyv_f64 inv_pi = npyv_setall_f64(0x1.45f306dc9c883p-2);
+    const npyv_f64 half_pi = npyv_setall_f64(0x1.921fb54442d18p+0);
+    const npyv_f64 shift = npyv_setall_f64(0x1.8p52);
+
+    /* n = rint((|x|+pi/2)/pi) - 0.5.  */
+    npyv_f64 n = npyv_muladd_f64(inv_pi, npyv_add_f64(r, half_pi), shift);
+    npyv_u64 odd = npyv_shli_u64(npyv_reinterpret_u64_f64(n), 63);
+    n = npyv_sub_f64(n, shift);
+    n = npyv_sub_f64(n, npyv_setall_f64(0.5));
+
+    /* r = |x| - n*pi  (range reduction into -pi/2 .. pi/2).  */
+    r = simd_range_reduction_pi2(r, n);
+
+    /* sin(r) poly approx.  */
+    npyv_f64 y = simd_approx_sine_poly_f64(r);
+
+    /* sign.  */
+    return npyv_reinterpret_f64_u64(npyv_xor_u64(npyv_reinterpret_u64_f64(y), odd));
+}
+
+NPY_FINLINE npyv_f64
+simd_sin_poly_f64(npyv_f64 r, npyv_u64 ir, npyv_u64 sign)
+{
+    const npyv_f64 inv_pi = npyv_setall_f64(0x1.45f306dc9c883p-2);
+    const npyv_f64 shift = npyv_setall_f64(0x1.8p52);
+
+    /* n = rint(|x|/pi).  */
+    npyv_f64 n = npyv_muladd_f64(inv_pi, r, shift);
+    npyv_u64 odd = npyv_shli_u64(npyv_reinterpret_u64_f64(n), 63);
+    n = npyv_sub_f64(n, shift);
+
+    /* r = |x| - n*pi  (range reduction into -pi/2 .. pi/2).  */
+    r = simd_range_reduction_pi2(r, n);
+
+    /* sin(r) poly approx.  */
+    npyv_f64 y = simd_approx_sine_poly_f64(r);
+
+    /* sign.  */
+    return npyv_reinterpret_f64_u64(npyv_xor_u64(npyv_xor_u64(npyv_reinterpret_u64_f64(y), sign), odd));
+}
+
+/**begin repeat
+ *  #op = cos, sin#
+ */
+NPY_FINLINE void
+simd_@op@_f64(const double *src, npy_intp ssrc, double *dst, npy_intp sdst, npy_intp len)
+{
+    const npyv_u64 abs_mask = npyv_setall_u64(0x7fffffffffffffff);
+    const int vstep = npyv_nlanes_f64;
+
+    npyv_f64 out = npyv_zero_f64();
+    npyv_f64 x_in;
+
+    for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+        if (ssrc == 1) {
+            x_in = npyv_load_tillz_f64(src, len);
+        } else {
+            x_in = npyv_loadn_tillz_f64(src, ssrc, len);
+        }
+
+        npyv_u64 ir = npyv_and_u64(npyv_reinterpret_u64_f64(x_in), abs_mask);
+        npyv_f64 r = npyv_reinterpret_f64_u64(ir);
+        npyv_u64 sign = npyv_and_u64(npyv_reinterpret_u64_f64(x_in), npyv_not_u64(abs_mask));
+
+        npyv_b64 cmp = simd_@op@_range_check_f64(ir);
+        /* If fenv exceptions are to be triggered correctly, set any special lanes
+        to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
+        scalar loop later.  */
+        r = npyv_select_f64(cmp, npyv_setall_f64(1.0), r);
+
+        // Some in range, at least one calculation is useful
+        if (!npyv_all_b64(cmp)) {
+            out = simd_@op@_poly_f64(r, ir, sign);
+        }
+
+        if (npyv_any_b64(cmp)) {
+            out = npyv_select_f64(cmp, x_in, out);
+            out = simd_@op@_scalar_f64(out, npyv_tobits_b64(cmp));
+        }
+
+        if (sdst == 1) {
+            npyv_store_till_f64(dst, len, out);
+        } else {
+            npyv_storen_till_f64(dst, sdst, len, out);
+        }
+    }
+    npyv_cleanup();
+}
+/**end repeat**/
+#endif // NPY_SIMD_F64
+
+#if NPY_SIMD_F32
 /*
  * Approximate cosine algorithm for x \in [-PI/4, PI/4]
  * Maximum ULP across all 32-bit floats = 0.875
@@ -124,6 +305,11 @@ simd_sincos_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst,
         } else {
             x_in = npyv_loadn_tillz_f32(src, ssrc, len);
         }
+        npyv_b32 nnan_mask = npyv_notnan_f32(x_in);
+    #if NPY_SIMD_CMPSIGNAL
+        // Eliminate NaN to avoid FP invalid exception
+        x_in = npyv_and_f32(x_in, npyv_reinterpret_f32_u32(npyv_cvt_u32_b32(nnan_mask)));
+    #endif
         npyv_b32 simd_mask = npyv_cmple_f32(npyv_abs_f32(x_in), max_cody);
         npy_uint64 simd_maski = npyv_tobits_b32(simd_mask);
         /*
@@ -132,7 +318,6 @@ simd_sincos_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst,
          * these numbers
          */
         if (simd_maski != 0) {
-            npyv_b32 nnan_mask = npyv_notnan_f32(x_in);
             npyv_f32 x = npyv_select_f32(npyv_and_b32(nnan_mask, simd_mask), x_in, zerosf);
 
             npyv_f32 quadrant = npyv_mul_f32(x, two_over_pi);
@@ -194,24 +379,58 @@ simd_sincos_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst,
     }
     npyv_cleanup();
 }
-#endif // NPY_SIMD_FMA3
+#endif // NPY_SIMD_FP32
+#endif // NYP_SIMD_FMA3
 
 /**begin repeat
  *  #func = cos, sin#
- *  #enum = SIMD_COMPUTE_COS, SIMD_COMPUTE_SIN#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_@func@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD_F64 && NPY_SIMD_FMA3
+    const double *src = (double*)args[0];
+          double *dst = (double*)args[1];
+    const int lsize = sizeof(src[0]);
+    const npy_intp ssrc = steps[0] / lsize;
+    const npy_intp sdst = steps[1] / lsize;
+    npy_intp len = dimensions[0];
+    assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));
+
+    if (is_mem_overlap(src, steps[0], dst, steps[1], len) ||
+        !npyv_loadable_stride_f64(ssrc) || !npyv_storable_stride_f64(sdst)
+    ) {
+        for (; len > 0; --len, src += ssrc, dst += sdst) {
+            simd_@func@_f64(src, 1, dst, 1, 1);
+        }
+    } else {
+        simd_@func@_f64(src, ssrc, dst, sdst, len);
+    }
+#else
+    UNARY_LOOP {
+        const npy_double in1 = *(npy_double *)ip1;
+        *(npy_double *)op1 = npy_@func@(in1);
+    }
+#endif
+}
+/**end repeat**/
+
+/**begin repeat
+ *  #func = sin, cos#
+ *  #enum = SIMD_COMPUTE_SIN, SIMD_COMPUTE_COS#
  */
 NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_@func@)
 (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
 {
-    const float *src = (float*)args[0];
-          float *dst = (float*)args[1];
+#if NPY_SIMD_F32 && NPY_SIMD_FMA3
+    const npy_float *src = (npy_float*)args[0];
+          npy_float *dst = (npy_float*)args[1];
 
     const int lsize = sizeof(src[0]);
     const npy_intp ssrc = steps[0] / lsize;
     const npy_intp sdst = steps[1] / lsize;
     npy_intp len = dimensions[0];
     assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));
-#if NPY_SIMD_F32 && NPY_SIMD_FMA3
     if (is_mem_overlap(src, steps[0], dst, steps[1], len) ||
         !npyv_loadable_stride_f32(ssrc) || !npyv_storable_stride_f32(sdst)
     ) {
@@ -222,9 +441,9 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_@func@)
         simd_sincos_f32(src, ssrc, dst, sdst, len, @enum@);
     }
 #else
-    for (; len > 0; --len, src += ssrc, dst += sdst) {
-        const float src0 = *src;
-        *dst = npy_@func@f(src0);
+    UNARY_LOOP {
+        const npy_float in1 = *(npy_float *)ip1;
+        *(npy_float *)op1 = npy_@func@f(in1);
     }
 #endif
 }
diff --git a/numpy/core/src/umath/loops_umath_fp.dispatch.c.src b/numpy/core/src/umath/loops_umath_fp.dispatch.c.src
index 46ce51824..89999e879 100644
--- a/numpy/core/src/umath/loops_umath_fp.dispatch.c.src
+++ b/numpy/core/src/umath/loops_umath_fp.dispatch.c.src
@@ -12,10 +12,12 @@
 /**begin repeat
  * #sfx = f32, f64#
  * #func_suffix = f16, 8#
+ * #len = 32, 64#
  */
 /**begin repeat1
  * #func = exp2, log2, log10, expm1, log1p, cbrt, tan, asin, acos, atan, sinh, cosh, asinh, acosh, atanh#
  * #default_val = 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0#
+ * #fxnan = 0,    0,   0,     64,    0,     0,    0,   0,    0,    0,    0,    0,    0,     0,     0#
  */
 static void
 simd_@func@_@sfx@(const npyv_lanetype_@sfx@ *src, npy_intp ssrc,
@@ -37,7 +39,15 @@ simd_@func@_@sfx@(const npyv_lanetype_@sfx@ *src, npy_intp ssrc,
                 x = npyv_loadn_tillz_@sfx@(src, ssrc, len);
             }
         #endif
+    #if @fxnan@ == @len@
+        // Workaround, invalid value encountered when x is set to nan
+        npyv_b@len@ nnan_mask = npyv_notnan_@sfx@(x);
+        npyv_@sfx@ x_exnan = npyv_select_@sfx@(nnan_mask, x, npyv_setall_@sfx@(@default_val@));
+        npyv_@sfx@ out = __svml_@func@@func_suffix@(x_exnan);
+        out = npyv_select_@sfx@(nnan_mask, out, x);
+    #else
         npyv_@sfx@ out = __svml_@func@@func_suffix@(x);
+    #endif
         if (sdst == 1) {
             npyv_store_till_@sfx@(dst, len, out);
         } else {
@@ -50,32 +60,6 @@ simd_@func@_@sfx@(const npyv_lanetype_@sfx@ *src, npy_intp ssrc,
 /**end repeat**/
 
 /**begin repeat
- * #func = sin, cos#
- */
-static void
-simd_@func@_f64(const double *src, npy_intp ssrc,
-                      double *dst, npy_intp sdst, npy_intp len)
-{
-    const int vstep = npyv_nlanes_f64;
-    for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
-        npyv_f64 x;
-        if (ssrc == 1) {
-            x = npyv_load_tillz_f64(src, len);
-        } else {
-            x = npyv_loadn_tillz_f64(src, ssrc, len);
-        }
-        npyv_f64 out = __svml_@func@8(x);
-        if (sdst == 1) {
-            npyv_store_till_f64(dst, len, out);
-        } else {
-            npyv_storen_till_f64(dst, sdst, len, out);
-        }
-    }
-    npyv_cleanup();
-}
-/**end repeat**/
-
-/**begin repeat
  * #sfx = f32, f64#
  * #func_suffix = f16, 8#
  */
@@ -267,31 +251,3 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@func@)
 }
 /**end repeat1**/
 /**end repeat**/
-
-/**begin repeat
- *  #func = sin, cos#
- */
-NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_@func@)
-(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
-#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
-    const double *src = (double*)args[0];
-          double *dst = (double*)args[1];
-    const int lsize = sizeof(src[0]);
-    const npy_intp ssrc = steps[0] / lsize;
-    const npy_intp sdst = steps[1] / lsize;
-    const npy_intp len = dimensions[0];
-    assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));
-    if (!is_mem_overlap(src, steps[0], dst, steps[1], len) &&
-        npyv_loadable_stride_f64(ssrc) &&
-        npyv_storable_stride_f64(sdst)) {
-        simd_@func@_f64(src, ssrc, dst, sdst, len);
-        return;
-    }
-#endif
-    UNARY_LOOP {
-        const npy_double in1 = *(npy_double *)ip1;
-        *(npy_double *)op1 = npy_@func@(in1);
-    }
-}
-/**end repeat**/
diff --git a/numpy/core/src/umath/loops_unary.dispatch.c.src b/numpy/core/src/umath/loops_unary.dispatch.c.src
new file mode 100644
index 000000000..1e2a81d20
--- /dev/null
+++ b/numpy/core/src/umath/loops_unary.dispatch.c.src
@@ -0,0 +1,364 @@
+/*@targets
+ ** $maxopt baseline
+ ** neon asimd
+ ** sse2 avx2 avx512_skx
+ ** vsx2
+ ** vx vxe
+ **/
+
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "numpy/npy_math.h"
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+#include "lowlevel_strided_loops.h"
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
+
+/*******************************************************************************
+ ** Scalar ops
+ ******************************************************************************/
+#define scalar_negative(X) (-X)
+
+/*******************************************************************************
+ ** extra SIMD intrinsics
+ ******************************************************************************/
+
+#if NPY_SIMD
+
+/**begin repeat
+ * #sfx  = s8, u8, s16, u16, s32, u32, s64, u64#
+ * #ssfx =  8,  8,  16,  16,  32,  32,  64,  64#
+ */
+static NPY_INLINE npyv_@sfx@
+npyv_negative_@sfx@(npyv_@sfx@ v)
+{
+#if defined(NPY_HAVE_NEON) && (defined(__aarch64__) || @ssfx@ < 64)
+    return npyv_reinterpret_@sfx@_s@ssfx@(vnegq_s@ssfx@(npyv_reinterpret_s@ssfx@_@sfx@(v)));
+#else
+    // (x ^ -1) + 1
+    const npyv_@sfx@ m1 = npyv_setall_@sfx@((npyv_lanetype_@sfx@)-1);
+    return npyv_sub_@sfx@(npyv_xor_@sfx@(v, m1), m1);
+#endif
+}
+/**end repeat**/
+
+/**begin repeat
+ * #sfx  = f32, f64#
+ * #VCHK = NPY_SIMD_F32, NPY_SIMD_F64#
+ * #fd = f, #
+ */
+#if @VCHK@
+static NPY_INLINE npyv_@sfx@
+npyv_negative_@sfx@(npyv_@sfx@ v)
+{
+#if defined(NPY_HAVE_NEON)
+    return vnegq_@sfx@(v);
+#else
+    // (v ^ signmask)
+    const npyv_@sfx@ signmask = npyv_setall_@sfx@(-0.@fd@);
+    return npyv_xor_@sfx@(v, signmask);
+#endif
+}
+#endif // @VCHK@
+/**end repeat**/
+
+#endif // NPY_SIMD
+
+/********************************************************************************
+ ** Defining the SIMD kernels
+ ********************************************************************************/
+/**begin repeat
+ * #sfx = s8, u8, s16, u16, s32, u32, s64, u64, f32, f64#
+ * #simd_chk = NPY_SIMD*8, NPY_SIMD_F32, NPY_SIMD_F64#
+ * #is_fp = 0*8, 1*2#
+ * #supports_ncontig = 0*4,1*6#
+ */
+/**begin repeat1
+ * #kind   = negative#
+ * #intrin = negative#
+ * #unroll = 4#
+ */
+#if @simd_chk@
+#if @unroll@ < 1
+#error "Unroll must be at least 1"
+#elif NPY_SIMD != 128 && @unroll@ > 2
+// Avoid memory bandwidth bottleneck for larger SIMD
+#define UNROLL 2
+#else
+#define UNROLL @unroll@
+#endif
+// contiguous inputs and output.
+static NPY_INLINE void
+simd_unary_cc_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip,
+                             npyv_lanetype_@sfx@ *op,
+                             npy_intp len)
+{
+    const int vstep = npyv_nlanes_@sfx@;
+    const int wstep = vstep * UNROLL;
+
+    // unrolled vector loop
+    for (; len >= wstep; len -= wstep, ip += wstep, op += wstep) {
+    /**begin repeat2
+     * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+     */
+    #if UNROLL > @U@
+        npyv_@sfx@ v_@U@ = npyv_load_@sfx@(ip + @U@ * vstep);
+        npyv_@sfx@ r_@U@ = npyv_@intrin@_@sfx@(v_@U@);
+        npyv_store_@sfx@(op + @U@ * vstep, r_@U@);
+    #endif
+    /**end repeat2**/
+    }
+    // single vector loop
+    for (; len >= vstep; len -= vstep, ip += vstep, op +=vstep) {
+        npyv_@sfx@ v = npyv_load_@sfx@(ip);
+        npyv_@sfx@ r = npyv_@intrin@_@sfx@(v);
+        npyv_store_@sfx@(op, r);
+    }
+    // scalar finish up any remaining iterations
+    for (; len > 0; --len, ++ip, ++op) {
+        *op = scalar_@intrin@(*ip);
+    }
+}
+
+#if @supports_ncontig@
+// contiguous input, non-contiguous output
+static NPY_INLINE void
+simd_unary_cn_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip,
+                             npyv_lanetype_@sfx@ *op, npy_intp ostride,
+                             npy_intp len)
+{
+    const int vstep = npyv_nlanes_@sfx@;
+    const int wstep = vstep * UNROLL;
+
+    // unrolled vector loop
+    for (; len >= wstep; len -= wstep, ip += wstep, op += ostride*wstep) {
+    /**begin repeat2
+     * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+     */
+    #if UNROLL > @U@
+        npyv_@sfx@ v_@U@ = npyv_load_@sfx@(ip + @U@ * vstep);
+        npyv_@sfx@ r_@U@ = npyv_@intrin@_@sfx@(v_@U@);
+        npyv_storen_@sfx@(op + @U@ * vstep * ostride, ostride, r_@U@);
+    #endif
+    /**end repeat2**/
+    }
+    // single vector loop
+    for (; len >= vstep; len -= vstep, ip += vstep, op += ostride*vstep) {
+        npyv_@sfx@ v = npyv_load_@sfx@(ip);
+        npyv_@sfx@ r = npyv_@intrin@_@sfx@(v);
+        npyv_storen_@sfx@(op, ostride, r);
+    }
+    // scalar finish up any remaining iterations
+    for (; len > 0; --len, ++ip, op += ostride) {
+        *op = scalar_@intrin@(*ip);
+    }
+}
+// non-contiguous input, contiguous output
+static NPY_INLINE void
+simd_unary_nc_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip, npy_intp istride,
+                             npyv_lanetype_@sfx@ *op,
+                             npy_intp len)
+{
+    const int vstep = npyv_nlanes_@sfx@;
+    const int wstep = vstep * UNROLL;
+
+    // unrolled vector loop
+    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += wstep) {
+    /**begin repeat2
+     * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+     */
+    #if UNROLL > @U@
+        npyv_@sfx@ v_@U@ = npyv_loadn_@sfx@(ip + @U@ * vstep * istride, istride);
+        npyv_@sfx@ r_@U@ = npyv_@intrin@_@sfx@(v_@U@);
+        npyv_store_@sfx@(op + @U@ * vstep, r_@U@);
+    #endif
+    /**end repeat2**/
+    }
+    // single vector loop
+    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += vstep) {
+        npyv_@sfx@ v = npyv_loadn_@sfx@(ip, istride);
+        npyv_@sfx@ r = npyv_@intrin@_@sfx@(v);
+        npyv_store_@sfx@(op, r);
+    }
+    // scalar finish up any remaining iterations
+    for (; len > 0; --len, ip += istride, ++op) {
+        *op = scalar_@intrin@(*ip);
+    }
+}
+// non-contiguous input and output
+// limit unroll to 2x
+#if UNROLL > 2
+#undef UNROLL
+#define UNROLL 2
+#endif
+static NPY_INLINE void
+simd_unary_nn_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip, npy_intp istride,
+                             npyv_lanetype_@sfx@ *op, npy_intp ostride,
+                             npy_intp len)
+{
+    const int vstep = npyv_nlanes_@sfx@;
+    const int wstep = vstep * UNROLL;
+
+    // unrolled vector loop
+    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
+    /**begin repeat2
+     * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+     */
+    #if UNROLL > @U@
+        npyv_@sfx@ v_@U@ = npyv_loadn_@sfx@(ip + @U@ * vstep * istride, istride);
+        npyv_@sfx@ r_@U@ = npyv_@intrin@_@sfx@(v_@U@);
+        npyv_storen_@sfx@(op + @U@ * vstep * ostride, ostride, r_@U@);
+    #endif
+    /**end repeat2**/
+    }
+    // single vector loop
+    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) {
+        npyv_@sfx@ v = npyv_loadn_@sfx@(ip, istride);
+        npyv_@sfx@ r = npyv_@intrin@_@sfx@(v);
+        npyv_storen_@sfx@(op, ostride, r);
+    }
+    // scalar finish up any remaining iterations
+    for (; len > 0; --len, ip += istride, op += ostride) {
+        *op = scalar_@intrin@(*ip);
+    }
+}
+#endif // @supports_ncontig@
+#undef UNROLL
+#endif // @simd_chk@
+/*end repeat1**/
+/**end repeat**/
+
+/********************************************************************************
+ ** Defining ufunc inner functions
+ ********************************************************************************/
+/**begin repeat
+ * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+ *         BYTE,  SHORT,  INT,  LONG,  LONGLONG,
+ *         FLOAT, DOUBLE, LONGDOUBLE#
+ *
+ * #BTYPE = BYTE, SHORT, INT,  LONG, LONGLONG,
+ *          BYTE, SHORT, INT, LONG, LONGLONG,
+ *          FLOAT, DOUBLE, LONGDOUBLE#
+ * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
+ *         npy_byte, npy_short, npy_int, npy_long, npy_longlong,
+ *         npy_float, npy_double, npy_longdouble#
+ *
+ * #is_fp = 0*10, 1*3#
+ * #is_unsigned = 1*5, 0*5, 0*3#
+ * #supports_ncontig = 0*2, 1*3, 0*2, 1*3, 1*3#
+ */
+#undef TO_SIMD_SFX
+#if 0
+/**begin repeat1
+ * #len = 8, 16, 32, 64#
+ */
+#elif NPY_SIMD && NPY_BITSOF_@BTYPE@ == @len@
+    #if @is_fp@
+        #define TO_SIMD_SFX(X) X##_f@len@
+        #if NPY_BITSOF_@BTYPE@ == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_@BTYPE@ == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif @is_unsigned@
+        #define TO_SIMD_SFX(X) X##_u@len@
+    #else
+        #define TO_SIMD_SFX(X) X##_s@len@
+    #endif
+/**end repeat1**/
+#endif
+
+/**begin repeat1
+ * #kind = negative#
+ * #intrin = negative#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    char *ip = args[0], *op = args[1];
+    npy_intp istep = steps[0], ostep = steps[1],
+             len = dimensions[0];
+#ifdef TO_SIMD_SFX
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (!is_mem_overlap(ip, istep, op, ostep, len)) {
+        if (IS_UNARY_CONT(@type@, @type@)) {
+            // no overlap and operands are contiguous
+            TO_SIMD_SFX(simd_unary_cc_@intrin@)(
+                (STYPE*)ip, (STYPE*)op, len
+            );
+            goto clear;
+        }
+    #if @supports_ncontig@
+        const npy_intp istride = istep / sizeof(STYPE);
+        const npy_intp ostride = ostep / sizeof(STYPE);
+        if (TO_SIMD_SFX(npyv_loadable_stride)(istride) &&
+            TO_SIMD_SFX(npyv_storable_stride)(ostride))
+        {
+            if (istride == 1 && ostride != 1) {
+                // contiguous input, non-contiguous output
+                TO_SIMD_SFX(simd_unary_cn_@intrin@)(
+                    (STYPE*)ip, (STYPE*)op, ostride, len
+                );
+                goto clear;
+            }
+            else if (istride != 1 && ostride == 1) {
+                // non-contiguous input, contiguous output
+                TO_SIMD_SFX(simd_unary_nc_@intrin@)(
+                    (STYPE*)ip, istride, (STYPE*)op, len
+                );
+                goto clear;
+            }
+        // SSE2 does better with unrolled scalar for heavy non-contiguous
+        #if !defined(NPY_HAVE_SSE2)
+            else if (istride != 1 && ostride != 1) {
+                // non-contiguous input and output
+                TO_SIMD_SFX(simd_unary_nn_@intrin@)(
+                    (STYPE*)ip, istride, (STYPE*)op, ostride, len
+                );
+                goto clear;
+            }
+        #endif
+        }
+    #endif // @supports_ncontig@
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /*
+     * scalar unrolls
+     * 8x unroll performed best on
+     *  - Apple M1 Native / arm64
+     *  - Apple M1 Rosetta / SSE42
+     *  - iMacPro / AVX512
+     */
+    #define UNROLL 8
+    for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) {
+    /**begin repeat2
+     * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+     */
+    #if UNROLL > @U@
+        const @type@ in_@U@ = *((const @type@ *)(ip + @U@ * istep));
+        *((@type@ *)(op + @U@ * ostep)) = scalar_@intrin@(in_@U@);
+    #endif
+    /**end repeat2**/
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    for (; len > 0; --len, ip += istep, op += ostep) {
+        *((@type@ *)op) = scalar_@intrin@(*(const @type@ *)ip);
+    }
+#ifdef TO_SIMD_SFX
+clear:
+    npyv_cleanup();
+#endif
+#if @is_fp@
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+/**end repeat**/
+
+#undef NEGATIVE_CONTIG_ONLY
diff --git a/numpy/core/src/umath/loops_unary_complex.dispatch.c.src b/numpy/core/src/umath/loops_unary_complex.dispatch.c.src
new file mode 100644
index 000000000..052ad464c
--- /dev/null
+++ b/numpy/core/src/umath/loops_unary_complex.dispatch.c.src
@@ -0,0 +1,139 @@
+/*@targets
+ ** $maxopt baseline
+ ** sse2 (avx2 fma3) avx512f
+ ** neon asimd
+ ** vsx2 vsx3
+ ** vx vxe
+ **/
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+#include "lowlevel_strided_loops.h"
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
+
+/**begin repeat
+ * #type = npy_float, npy_double#
+ * #sfx = f32, f64#
+ * #bsfx = b32, b64#
+ * #usfx = b32, u64#
+ * #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64#
+ * #is_double = 0, 1#
+ * #c = f, #
+ * #INF = NPY_INFINITYF, NPY_INFINITY#
+ * #NAN = NPY_NANF, NPY_NAN#
+ */
+#if @VECTOR@
+NPY_FINLINE npyv_@sfx@
+simd_cabsolute_@sfx@(npyv_@sfx@ re, npyv_@sfx@ im)
+{
+    const npyv_@sfx@ inf = npyv_setall_@sfx@(@INF@);
+    const npyv_@sfx@ nan = npyv_setall_@sfx@(@NAN@);
+
+    re = npyv_abs_@sfx@(re);
+    im = npyv_abs_@sfx@(im);
+    /*
+     * If real or imag = INF, then convert it to inf + j*inf
+     * Handles: inf + j*nan, nan + j*inf
+     */
+    npyv_@bsfx@ re_infmask = npyv_cmpeq_@sfx@(re, inf);
+    npyv_@bsfx@ im_infmask = npyv_cmpeq_@sfx@(im, inf);
+    im = npyv_select_@sfx@(re_infmask, inf, im);
+    re = npyv_select_@sfx@(im_infmask, inf, re);
+    /*
+     * If real or imag = NAN, then convert it to nan + j*nan
+     * Handles: x + j*nan, nan + j*x
+     */
+    npyv_@bsfx@ re_nnanmask = npyv_notnan_@sfx@(re);
+    npyv_@bsfx@ im_nnanmask = npyv_notnan_@sfx@(im);
+    im = npyv_select_@sfx@(re_nnanmask, im, nan);
+    re = npyv_select_@sfx@(im_nnanmask, re, nan);
+
+    npyv_@sfx@ larger  = npyv_max_@sfx@(re, im);
+    npyv_@sfx@ smaller = npyv_min_@sfx@(im, re);
+    /*
+     * Calculate div_mask to prevent 0./0. and inf/inf operations in div
+     */
+    npyv_@bsfx@ zeromask = npyv_cmpeq_@sfx@(larger, npyv_zero_@sfx@());
+    npyv_@bsfx@ infmask = npyv_cmpeq_@sfx@(smaller, inf);
+    npyv_@bsfx@ div_mask = npyv_not_@bsfx@(npyv_or_@bsfx@(zeromask, infmask));
+
+    npyv_@sfx@ ratio = npyv_ifdivz_@sfx@(div_mask, smaller, larger);
+    npyv_@sfx@ hypot = npyv_sqrt_@sfx@(
+        npyv_muladd_@sfx@(ratio, ratio, npyv_setall_@sfx@(1.0@c@)
+    ));
+    return npyv_mul_@sfx@(hypot, larger);
+}
+#endif // VECTOR
+/**end repeat**/
+
+/********************************************************************************
+ ** Defining ufunc inner functions
+ ********************************************************************************/
+/**begin repeat
+ * complex types
+ * #TYPE = CFLOAT, CDOUBLE#
+ * #ftype = npy_float, npy_double#
+ * #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64#
+ * #sfx = f32, f64#
+ * #c = f, #
+ * #C = F, #
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_absolute)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#if @VECTOR@
+    npy_intp len = dimensions[0];
+    npy_intp ssrc = steps[0] / sizeof(@ftype@);
+    npy_intp sdst = steps[1] / sizeof(@ftype@);
+
+    if (!is_mem_overlap(args[0], steps[0], args[1], steps[1], len) &&
+        npyv_loadable_stride_@sfx@(ssrc) && npyv_storable_stride_@sfx@(sdst)
+        && steps[0] % sizeof(@ftype@) == 0
+        && steps[1] % sizeof(@ftype@) == 0
+    ) {
+        const @ftype@ *src = (@ftype@*)args[0];
+              @ftype@ *dst = (@ftype@*)args[1];
+
+        const int vstep = npyv_nlanes_@sfx@;
+        const int wstep = vstep * 2;
+        const int hstep = vstep / 2;
+
+        if (ssrc == 2 && sdst == 1) {
+            for (; len >= vstep; len -= vstep, src += wstep, dst += vstep) {
+                npyv_@sfx@x2 ab = npyv_load_@sfx@x2(src);
+                npyv_@sfx@ r = simd_cabsolute_@sfx@(ab.val[0], ab.val[1]);
+                npyv_store_@sfx@(dst, r);
+            }
+        }
+        else {
+            for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+                npyv_@sfx@ re_im0 = npyv_loadn2_@sfx@(src, ssrc);
+                npyv_@sfx@ re_im1 = npyv_loadn2_@sfx@(src + ssrc*hstep, ssrc);
+                npyv_@sfx@x2 ab = npyv_unzip_@sfx@(re_im0, re_im1);
+                npyv_@sfx@ r = simd_cabsolute_@sfx@(ab.val[0], ab.val[1]);
+                npyv_storen_@sfx@(dst, sdst, r);
+            }
+        }
+        for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+            npyv_@sfx@ rl = npyv_loadn_tillz_@sfx@(src, ssrc, len);
+            npyv_@sfx@ im = npyv_loadn_tillz_@sfx@(src + 1, ssrc, len);
+            npyv_@sfx@ r = simd_cabsolute_@sfx@(rl, im);
+            npyv_storen_till_@sfx@(dst, sdst, len, r);
+        }
+        npyv_cleanup();
+        npy_clear_floatstatus_barrier((char*)&len);
+        return;
+    }
+#endif
+    UNARY_LOOP {
+        const @ftype@ re = ((@ftype@ *)ip1)[0];
+        const @ftype@ im = ((@ftype@ *)ip1)[1];
+        *((@ftype@ *)op1) = npy_hypot@c@(re, im);
+    }
+}
+/**end repeat**/
diff --git a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
index 0ac39a9b1..c4e7b8929 100644
--- a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
+++ b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
@@ -301,7 +301,7 @@ no_unroll:
 #endif // @VCHK@
     for (; len > 0; --len, src += src_step, dst += dst_step) {
     #if @VCHK@
-        // to guarantee the same precsion and fp/domain errors for both scalars and vectors
+        // to guarantee the same precision and fp/domain errors for both scalars and vectors
         simd_@TYPE@_@kind@_CONTIG_CONTIG(src, 0, dst, 0, 1);
     #else
         const npyv_lanetype_@sfx@ src0 = *(npyv_lanetype_@sfx@*)src;
diff --git a/numpy/core/src/umath/loops_unary_fp_le.dispatch.c.src b/numpy/core/src/umath/loops_unary_fp_le.dispatch.c.src
new file mode 100644
index 000000000..ba133dc1e
--- /dev/null
+++ b/numpy/core/src/umath/loops_unary_fp_le.dispatch.c.src
@@ -0,0 +1,565 @@
+/*@targets
+ ** $maxopt baseline
+ ** sse2 sse41
+ ** vsx2
+ ** neon asimd
+ **/
+
+/**
+ * Force use SSE only on x86, even if AVX2 or AVX512F are enabled
+ * through the baseline, since scatter(AVX512F) and gather very costly
+ * to handle non-contiguous memory access comparing with SSE for
+ * such small operations that this file covers.
+ */
+#define NPY_SIMD_FORCE_128
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#include <float.h>
+#include "numpy/npy_math.h"
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+#include "lowlevel_strided_loops.h"
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
+
+/**
+ * This code should really be merged into loops_unary_fp.dispatch.c.src
+ * However there is an issue with enabling the code here for VX and VXE
+ * as the shifts don't behave as expected.
+ * See the code below that references NPY__CPU_TARGET_VX and
+ * NPY_BIG_ENDIAN. Suspect that this is a big endian vector issue.
+ *
+ * Splitting the files out allows us to keep loops_unary_fp.dispatch.c.src
+ * building for VX and VXE so we don't regress performance while adding this
+ * code for other platforms.
+ */
+// TODO(@seiko2plus): add support for big-endian
+#if NPY_SIMD_BIGENDIAN
+    #undef NPY_SIMD
+    #undef NPY_SIMD_F32
+    #undef NPY_SIMD_F64
+    #define NPY_SIMD 0
+    #define NPY_SIMD_F32 0
+    #define NPY_SIMD_F64 0
+#endif
+
+/*******************************************************************************
+ ** extra SIMD intrinsics
+ ******************************************************************************/
+
+#if NPY_SIMD
+
+/**
+ * We define intrinsics for isnan, isinf, isfinite, and signbit below.  There's
+ * a few flavors of each.  We'll use f32 as an example although f64 versions
+ * are also defined.
+ * 
+ * npyv_u32 npyv_KIND_f32(npyv_f32 v)
+ *   These are mainly used for the single vector loops.  As such, result should
+ *   be bool true / false, ready to write back.
+ * 
+ * npyv_b32 _npyv_KIND_f32(npyv_f32 v)
+ *   These are used by the geneal intrinsics above as well as the multi-vector
+ *   packing intrinsics.  The multi-vector packing intrinsics are the ones
+ *   utilized in the unrolled vector loops.  Results should be vector masks
+ *   of 0x00/0xff.
+ * 
+ * npyv_u8 npyv_pack_KIND_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3)
+ *   These are the multi-vector packing intrinsics utilized by unrolled vector
+ *   loops.  They perform the operation on all input vectors and pack the
+ *   results to a single npyv_u8.  Assuming NPY_SIMD == 128, that means we
+ *   can pack results from 4x npyv_f32 or 8x npyv_64 in a single npyv_u8.
+ *   Result should be bool true / false, ready to write back.
+ */
+
+#if NPY_SIMD_F32
+NPY_FINLINE npyv_u32
+npyv_isnan_f32(npyv_f32 v)
+{
+    const npyv_u8 truemask = npyv_reinterpret_u8_u32(npyv_setall_u32(1==1));
+    npyv_u8 notnan = npyv_reinterpret_u8_u32(npyv_cvt_u32_b32(npyv_notnan_f32(v)));
+    return npyv_reinterpret_u32_u8(npyv_andc_u8(truemask, notnan));
+}
+NPY_FINLINE npyv_u8
+npyv_pack_isnan_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3)
+{
+    const npyv_u8 truemask = npyv_setall_u8(1==1);
+    npyv_b32 b0 = npyv_notnan_f32(v0);
+    npyv_b32 b1 = npyv_notnan_f32(v1);
+    npyv_b32 b2 = npyv_notnan_f32(v2);
+    npyv_b32 b3 = npyv_notnan_f32(v3);
+    npyv_b8 notnan = npyv_pack_b8_b32(b0, b1, b2, b3);
+    return npyv_andc_u8(truemask, npyv_cvt_u8_b8(notnan));
+}
+#endif
+#if NPY_SIMD_F64
+NPY_FINLINE npyv_u64
+npyv_isnan_f64(npyv_f64 v)
+{
+    const npyv_u8 truemask = npyv_reinterpret_u8_u64(npyv_setall_u64(1==1));
+    npyv_u8 notnan = npyv_reinterpret_u8_u64(npyv_cvt_u64_b64(npyv_notnan_f64(v)));
+    return npyv_reinterpret_u64_u8(npyv_andc_u8(truemask, notnan));
+}
+NPY_FINLINE npyv_u8
+npyv_pack_isnan_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3,
+                    npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7)
+{
+    const npyv_u8 truemask = npyv_setall_u8(1==1);
+    npyv_b64 b0 = npyv_notnan_f64(v0);
+    npyv_b64 b1 = npyv_notnan_f64(v1);
+    npyv_b64 b2 = npyv_notnan_f64(v2);
+    npyv_b64 b3 = npyv_notnan_f64(v3);
+    npyv_b64 b4 = npyv_notnan_f64(v4);
+    npyv_b64 b5 = npyv_notnan_f64(v5);
+    npyv_b64 b6 = npyv_notnan_f64(v6);
+    npyv_b64 b7 = npyv_notnan_f64(v7);
+    npyv_b8 notnan = npyv_pack_b8_b64(b0, b1, b2, b3, b4, b5, b6, b7);
+    return npyv_andc_u8(truemask, npyv_cvt_u8_b8(notnan));
+}
+#endif
+
+#if NPY_SIMD_F32
+NPY_FINLINE npyv_b32
+_npyv_isinf_f32(npyv_f32 v)
+{
+#if defined(NPY_HAVE_NEON)
+    // abs(v) > FLT_MAX
+    const npyv_f32 fltmax = npyv_setall_f32(FLT_MAX);
+    return vcagtq_f32(v, fltmax);
+#else
+    // cast out the sign and check if all exponent bits are set.
+    const npyv_u32 exp_mask = npyv_setall_u32(0xff000000);
+    npyv_u32 bits = npyv_shli_u32(npyv_reinterpret_u32_f32(v), 1);
+    return npyv_cmpeq_u32(bits, exp_mask);
+#endif
+}
+NPY_FINLINE npyv_u32
+npyv_isinf_f32(npyv_f32 v)
+{
+    const npyv_u32 truemask = npyv_setall_u32(1==1);
+    return npyv_and_u32(truemask, npyv_cvt_u32_b32(_npyv_isinf_f32(v)));
+}
+NPY_FINLINE npyv_u8
+npyv_pack_isinf_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3)
+{
+    const npyv_u8 truemask = npyv_setall_u8(1==1);
+    npyv_b32 b0 = _npyv_isinf_f32(v0);
+    npyv_b32 b1 = _npyv_isinf_f32(v1);
+    npyv_b32 b2 = _npyv_isinf_f32(v2);
+    npyv_b32 b3 = _npyv_isinf_f32(v3);
+    npyv_b8 isinf = npyv_pack_b8_b32(b0, b1, b2, b3);
+    return npyv_and_u8(truemask, npyv_cvt_u8_b8(isinf));
+}
+#endif // NPY_SIMD_F32
+#if NPY_SIMD_F64
+NPY_FINLINE npyv_b64
+_npyv_isinf_f64(npyv_f64 v)
+{
+#if defined(NPY_HAVE_NEON)
+    // abs(v) > DBL_MAX
+    const npyv_f64 fltmax = npyv_setall_f64(DBL_MAX);
+    return vcagtq_f64(v, fltmax);
+#else
+    // cast out the sign and check if all exponent bits are set.
+    const npyv_u64 exp_mask = npyv_setall_u64(0xffe0000000000000);
+    npyv_u64 bits = npyv_shli_u64(npyv_reinterpret_u64_f64(v), 1);
+    return npyv_cmpeq_u64(bits, exp_mask);
+#endif
+}
+NPY_FINLINE npyv_u64
+npyv_isinf_f64(npyv_f64 v)
+{
+    const npyv_u64 truemask = npyv_setall_u64(1==1);
+    return npyv_and_u64(truemask, npyv_cvt_u64_b64(_npyv_isinf_f64(v)));
+}
+NPY_FINLINE npyv_u8
+npyv_pack_isinf_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3,
+                    npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7)
+{
+    const npyv_u8 truemask = npyv_setall_u8(1==1);
+    npyv_b64 b0 = _npyv_isinf_f64(v0);
+    npyv_b64 b1 = _npyv_isinf_f64(v1);
+    npyv_b64 b2 = _npyv_isinf_f64(v2);
+    npyv_b64 b3 = _npyv_isinf_f64(v3);
+    npyv_b64 b4 = _npyv_isinf_f64(v4);
+    npyv_b64 b5 = _npyv_isinf_f64(v5);
+    npyv_b64 b6 = _npyv_isinf_f64(v6);
+    npyv_b64 b7 = _npyv_isinf_f64(v7);
+    npyv_b8 isinf = npyv_pack_b8_b64(b0, b1, b2, b3, b4, b5, b6, b7);
+    return npyv_and_u8(truemask, npyv_cvt_u8_b8(isinf));
+}
+#endif // NPY_SIMD_F64
+
+
+#if NPY_SIMD_F32
+NPY_FINLINE npyv_b32
+npyv_notfinite_f32(npyv_f32 v)
+{
+    // cast out the sign and check if all exponent bits are set
+    // no matter the mentissa is.
+    const npyv_u32 exp_mask = npyv_setall_u32(0x7f800000);
+    npyv_u32 bits = npyv_reinterpret_u32_f32(v);
+    bits = npyv_and_u32(bits, exp_mask);
+    return npyv_cmpeq_u32(bits, exp_mask);
+}
+NPY_FINLINE npyv_u32
+npyv_isfinite_f32(npyv_f32 v)
+{
+    const npyv_u8 truemask = npyv_reinterpret_u8_u32(npyv_setall_u32(1==1));
+    npyv_u8 notfinite = npyv_reinterpret_u8_u32(npyv_cvt_u32_b32(npyv_notfinite_f32(v)));
+    return npyv_reinterpret_u32_u8(npyv_andc_u8(truemask, notfinite));
+}
+NPY_FINLINE npyv_u8
+npyv_pack_isfinite_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3)
+{
+#if defined(NPY_HAVE_NEON) && defined(__aarch64__)
+    // F32 exponent is 8-bits, which means we can pack multiple into
+    // a single vector.  We shift out sign bit so that we're left
+    // with only exponent in high byte.  If not all bits are set,
+    // then we've got a finite number.
+    uint8x16x4_t tbl;
+    tbl.val[0] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v0), 1));
+    tbl.val[1] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v1), 1));
+    tbl.val[2] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v2), 1));
+    tbl.val[3] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v3), 1));
+
+    const npyv_u8 permute = {3,7,11,15,  19,23,27,31,  35,39,43,47,  51,55,59,63};
+    npyv_u8 r = vqtbl4q_u8(tbl, permute);
+
+    const npyv_u8 expmask = npyv_setall_u8(0xff);
+    r = npyv_cmpneq_u8(r, expmask);
+    r = vshrq_n_u8(r, 7);
+    return r;
+#else
+    const npyv_u8 truemask = npyv_setall_u8(1==1);
+    npyv_b32 b0 = npyv_notfinite_f32(v0);
+    npyv_b32 b1 = npyv_notfinite_f32(v1);
+    npyv_b32 b2 = npyv_notfinite_f32(v2);
+    npyv_b32 b3 = npyv_notfinite_f32(v3);
+    npyv_b8 notfinite = npyv_pack_b8_b32(b0, b1, b2, b3);
+    return npyv_andc_u8(truemask, npyv_cvt_u8_b8(notfinite));
+#endif
+}
+#endif // NPY_SIMD_F32
+#if NPY_SIMD_F64
+NPY_FINLINE npyv_b64
+npyv_notfinite_f64(npyv_f64 v)
+{
+    // cast out the sign and check if all exponent bits are set
+    // no matter the mantissa is.
+    const npyv_u64 exp_mask = npyv_setall_u64(0x7ff0000000000000);
+    npyv_u64 bits = npyv_reinterpret_u64_f64(v);
+    bits = npyv_and_u64(bits, exp_mask);
+    return npyv_cmpeq_u64(bits, exp_mask);
+}
+NPY_FINLINE npyv_u64
+npyv_isfinite_f64(npyv_f64 v)
+{
+    const npyv_u8 truemask = npyv_reinterpret_u8_u64(npyv_setall_u64(1==1));
+    npyv_u8 notfinite = npyv_reinterpret_u8_u64(npyv_cvt_u64_b64(npyv_notfinite_f64(v)));
+    return npyv_reinterpret_u64_u8(npyv_andc_u8(truemask, notfinite));
+}
+NPY_FINLINE npyv_u8
+npyv_pack_isfinite_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3,
+                       npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7)
+{
+#if defined(NPY_HAVE_NEON) && defined(__aarch64__)
+    // F64 exponent is 11-bits, which means we can pack multiple into
+    // a single vector.  We'll need to use u16 to fit all exponent
+    // bits.  If not all bits are set, then we've got a finite number.
+    uint8x16x4_t t0123, t4567;
+    t0123.val[0] = npyv_reinterpret_u8_f64(v0);
+    t0123.val[1] = npyv_reinterpret_u8_f64(v1);
+    t0123.val[2] = npyv_reinterpret_u8_f64(v2);
+    t0123.val[3] = npyv_reinterpret_u8_f64(v3);
+    t4567.val[0] = npyv_reinterpret_u8_f64(v4);
+    t4567.val[1] = npyv_reinterpret_u8_f64(v5);
+    t4567.val[2] = npyv_reinterpret_u8_f64(v6);
+    t4567.val[3] = npyv_reinterpret_u8_f64(v7);
+
+    const npyv_u8 permute = {6,7,14,15,  22,23,30,31,  38,39,46,47,  54,55,62,63};
+    npyv_u16 r0 = npyv_reinterpret_u16_u8(vqtbl4q_u8(t0123, permute));
+    npyv_u16 r1 = npyv_reinterpret_u16_u8(vqtbl4q_u8(t4567, permute));
+
+    const npyv_u16 expmask = npyv_setall_u16(0x7ff0);
+    r0 = npyv_and_u16(r0, expmask);
+    r0 = npyv_cmpneq_u16(r0, expmask);
+    r0 = npyv_shri_u16(r0, 15);
+    r1 = npyv_and_u16(r1, expmask);
+    r1 = npyv_cmpneq_u16(r1, expmask);
+    r1 = npyv_shri_u16(r1, 15);
+
+    npyv_u8 r = npyv_pack_b8_b16(r0, r1);
+    return r;
+#else
+    const npyv_u8 truemask = npyv_setall_u8(1==1);
+    npyv_b64 b0 = npyv_notfinite_f64(v0);
+    npyv_b64 b1 = npyv_notfinite_f64(v1);
+    npyv_b64 b2 = npyv_notfinite_f64(v2);
+    npyv_b64 b3 = npyv_notfinite_f64(v3);
+    npyv_b64 b4 = npyv_notfinite_f64(v4);
+    npyv_b64 b5 = npyv_notfinite_f64(v5);
+    npyv_b64 b6 = npyv_notfinite_f64(v6);
+    npyv_b64 b7 = npyv_notfinite_f64(v7);
+    npyv_b8 notfinite = npyv_pack_b8_b64(b0, b1, b2, b3, b4, b5, b6, b7);
+    return npyv_andc_u8(truemask, npyv_cvt_u8_b8(notfinite));
+#endif
+}
+#endif // NPY_SIMD_F64
+
+#if NPY_SIMD_F32
+NPY_FINLINE npyv_u32
+npyv_signbit_f32(npyv_f32 v)
+{
+    return npyv_shri_u32(npyv_reinterpret_u32_f32(v), (sizeof(npyv_lanetype_f32)*8)-1);
+}
+NPY_FINLINE npyv_u8
+npyv_pack_signbit_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3)
+{
+#if defined(NPY_HAVE_NEON) && defined(__aarch64__)
+    // We only need high byte for signbit, which means we can pack
+    // multiple inputs into a single vector.
+    uint8x16x4_t tbl;
+    tbl.val[0] = npyv_reinterpret_u8_f32(v0);
+    tbl.val[1] = npyv_reinterpret_u8_f32(v1);
+    tbl.val[2] = npyv_reinterpret_u8_f32(v2);
+    tbl.val[3] = npyv_reinterpret_u8_f32(v3);
+
+    const npyv_u8 permute = {3,7,11,15,  19,23,27,31,  35,39,43,47,  51,55,59,63};
+    npyv_u8 r = vqtbl4q_u8(tbl, permute);
+            r = vshrq_n_u8(r, 7);
+    return r;
+#else
+    npyv_b32 b0 = npyv_cvt_b32_u32(npyv_signbit_f32(v0));
+    npyv_b32 b1 = npyv_cvt_b32_u32(npyv_signbit_f32(v1));
+    npyv_b32 b2 = npyv_cvt_b32_u32(npyv_signbit_f32(v2));
+    npyv_b32 b3 = npyv_cvt_b32_u32(npyv_signbit_f32(v3));
+    npyv_b8 signbit = npyv_pack_b8_b32(b0, b1, b2, b3);
+    return npyv_cvt_u8_b8(signbit);
+#endif
+}
+#endif // NPY_SIMD_F32
+#if NPY_SIMD_F64
+NPY_FINLINE npyv_u64
+npyv_signbit_f64(npyv_f64 v)
+{
+    return npyv_shri_u64(npyv_reinterpret_u64_f64(v), (sizeof(npyv_lanetype_f64)*8)-1);
+}
+NPY_FINLINE npyv_u8
+npyv_pack_signbit_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3,
+                      npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7)
+{
+#if defined(NPY_HAVE_NEON) && defined(__aarch64__)
+    // We only need high byte for signbit, which means we can pack
+    // multiple inputs into a single vector.
+
+    // vuzp2 faster than vtbl for f64
+    npyv_u32 v01 = vuzp2q_u32(npyv_reinterpret_u32_f64(v0), npyv_reinterpret_u32_f64(v1));
+    npyv_u32 v23 = vuzp2q_u32(npyv_reinterpret_u32_f64(v2), npyv_reinterpret_u32_f64(v3));
+    npyv_u32 v45 = vuzp2q_u32(npyv_reinterpret_u32_f64(v4), npyv_reinterpret_u32_f64(v5));
+    npyv_u32 v67 = vuzp2q_u32(npyv_reinterpret_u32_f64(v6), npyv_reinterpret_u32_f64(v7));
+
+    npyv_u16 v0123 = vuzp2q_u16(npyv_reinterpret_u16_u32(v01), npyv_reinterpret_u16_u32(v23));
+    npyv_u16 v4567 = vuzp2q_u16(npyv_reinterpret_u16_u32(v45), npyv_reinterpret_u16_u32(v67));
+
+    npyv_u8 r = vuzp2q_u8(npyv_reinterpret_u8_u16(v0123), npyv_reinterpret_u8_u16(v4567));
+            r = vshrq_n_u8(r, 7);
+    return r;
+#else
+    npyv_b64 b0 = npyv_cvt_b64_u64(npyv_signbit_f64(v0));
+    npyv_b64 b1 = npyv_cvt_b64_u64(npyv_signbit_f64(v1));
+    npyv_b64 b2 = npyv_cvt_b64_u64(npyv_signbit_f64(v2));
+    npyv_b64 b3 = npyv_cvt_b64_u64(npyv_signbit_f64(v3));
+    npyv_b64 b4 = npyv_cvt_b64_u64(npyv_signbit_f64(v4));
+    npyv_b64 b5 = npyv_cvt_b64_u64(npyv_signbit_f64(v5));
+    npyv_b64 b6 = npyv_cvt_b64_u64(npyv_signbit_f64(v6));
+    npyv_b64 b7 = npyv_cvt_b64_u64(npyv_signbit_f64(v7));
+    npyv_b8 signbit = npyv_pack_b8_b64(b0, b1, b2, b3, b4, b5, b6, b7);
+    return npyv_cvt_u8_b8(signbit);
+#endif
+}
+#endif // NPY_SIMD_F64
+
+#endif // NPY_SIMD
+
+/********************************************************************************
+ ** Defining the SIMD kernels
+ ********************************************************************************/
+/** Notes:
+ * - avoid the use of libmath to unify fp/domain errors
+ *   for both scalars and vectors among all compilers/architectures.
+ * - use intrinsic npyv_load_till_* instead of npyv_load_tillz_
+ *   to fill the remind lanes with 1.0 to avoid divide by zero fp
+ *   exception in reciprocal.
+ */
+#define CONTIG  0
+#define NCONTIG 1
+
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE#
+ * #sfx  = f32, f64#
+ * #VCHK = NPY_SIMD_F32, NPY_SIMD_F64#
+ * #ssfx = 32, 64#
+ */
+#if @VCHK@
+/**begin repeat1
+ * #kind = isnan, isinf, isfinite, signbit#
+ */
+/**begin repeat2
+ * #STYPE  = CONTIG, NCONTIG, CONTIG,  NCONTIG#
+ * #DTYPE  = CONTIG, CONTIG,  NCONTIG, NCONTIG#
+ */
+static void simd_unary_@kind@_@TYPE@_@STYPE@_@DTYPE@
+(const void *src, npy_intp istride, void *dst, npy_intp ostride, npy_intp len)
+{
+    const npyv_lanetype_@sfx@ *ip = src;
+    npy_bool *op = dst;
+
+    // How many vectors can be packed into a u8 / bool vector?
+    #define PACK_FACTOR (NPY_SIMD_WIDTH / npyv_nlanes_@sfx@)
+    assert(PACK_FACTOR == 4 || PACK_FACTOR == 8);
+
+    const int vstep = npyv_nlanes_@sfx@;
+    const int wstep = vstep * PACK_FACTOR;
+
+    // unrolled iterations
+    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
+        // Load vectors
+        #if @STYPE@ == CONTIG
+            // contiguous input
+            npyv_@sfx@ v0 = npyv_load_@sfx@(ip + vstep * 0);
+            npyv_@sfx@ v1 = npyv_load_@sfx@(ip + vstep * 1);
+            npyv_@sfx@ v2 = npyv_load_@sfx@(ip + vstep * 2);
+            npyv_@sfx@ v3 = npyv_load_@sfx@(ip + vstep * 3);
+            #if PACK_FACTOR == 8
+            npyv_@sfx@ v4 = npyv_load_@sfx@(ip + vstep * 4);
+            npyv_@sfx@ v5 = npyv_load_@sfx@(ip + vstep * 5);
+            npyv_@sfx@ v6 = npyv_load_@sfx@(ip + vstep * 6);
+            npyv_@sfx@ v7 = npyv_load_@sfx@(ip + vstep * 7);
+            #endif
+        #else
+            // non-contiguous input
+            npyv_@sfx@ v0 = npyv_loadn_@sfx@(ip + istride * vstep * 0, istride);
+            npyv_@sfx@ v1 = npyv_loadn_@sfx@(ip + istride * vstep * 1, istride);
+            npyv_@sfx@ v2 = npyv_loadn_@sfx@(ip + istride * vstep * 2, istride);
+            npyv_@sfx@ v3 = npyv_loadn_@sfx@(ip + istride * vstep * 3, istride);
+            #if PACK_FACTOR == 8
+            npyv_@sfx@ v4 = npyv_loadn_@sfx@(ip + istride * vstep * 4, istride);
+            npyv_@sfx@ v5 = npyv_loadn_@sfx@(ip + istride * vstep * 5, istride);
+            npyv_@sfx@ v6 = npyv_loadn_@sfx@(ip + istride * vstep * 6, istride);
+            npyv_@sfx@ v7 = npyv_loadn_@sfx@(ip + istride * vstep * 7, istride);
+            #endif
+        #endif
+
+        #if PACK_FACTOR == 4
+        npyv_u8 r = npyv_pack_@kind@_@sfx@(v0, v1, v2, v3);
+        #elif PACK_FACTOR == 8
+        npyv_u8 r = npyv_pack_@kind@_@sfx@(v0, v1, v2, v3, v4, v5, v6, v7);
+        #endif
+
+        #if @DTYPE@ == CONTIG
+            npyv_store_u8(op, r);
+        #else // @DTYPE@ == CONTIG
+            // Results are packed, so we can just loop over them
+            npy_uint8 lane[npyv_nlanes_u8];
+            npyv_store_u8(lane, r);
+            for (int ln=0; (ln * sizeof(npyv_lanetype_@sfx@)) < npyv_nlanes_u8; ++ln){
+                op[ln * ostride] = lane[ln * sizeof(npyv_lanetype_@sfx@)];
+            }
+        #endif // @DTYPE@ == CONTIG
+    }
+
+    // vector-sized iterations
+    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) {
+    #if @STYPE@ == CONTIG
+        npyv_@sfx@ v = npyv_load_@sfx@(ip);
+    #else
+        npyv_@sfx@ v = npyv_loadn_@sfx@(ip, istride);
+    #endif
+
+        npyv_u@ssfx@ r = npyv_@kind@_@sfx@(v);
+
+        npy_uint8 lane[npyv_nlanes_u8];
+        npyv_store_u8(lane, npyv_reinterpret_u8_u@ssfx@(r));
+
+        op[0 * ostride] = lane[0 * sizeof(npyv_lanetype_@sfx@)];
+        op[1 * ostride] = lane[1 * sizeof(npyv_lanetype_@sfx@)];
+        #if npyv_nlanes_@sfx@ == 4
+        op[2 * ostride] = lane[2 * sizeof(npyv_lanetype_@sfx@)];
+        op[3 * ostride] = lane[3 * sizeof(npyv_lanetype_@sfx@)];
+        #endif
+    }
+
+    #undef PACK_FACTOR
+
+    // Scalar loop to finish off
+    for (; len > 0; --len, ip += istride, op += ostride) {
+        *op = (npy_@kind@(*ip) != 0);
+    }
+
+    npyv_cleanup();
+}
+/**end repeat2**/
+/**end repeat1**/
+
+#endif // @VCHK@
+/**end repeat**/
+
+/********************************************************************************
+ ** Defining ufunc inner functions
+ ********************************************************************************/
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE#
+ * #sfx  = f32, f64#
+ * #VCHK = NPY_SIMD_F32, NPY_SIMD_F64#
+ */
+
+/**begin repeat1
+ * #kind = isnan, isinf, isfinite, signbit#
+ **/
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#if @VCHK@
+    const char *ip = args[0];
+    char *op = args[1];
+    const npy_intp istep = steps[0];
+    const npy_intp ostep = steps[1];
+    npy_intp len = dimensions[0];
+    const int ilsize = sizeof(npyv_lanetype_@sfx@);
+    const int olsize = sizeof(npy_bool);
+    const npy_intp istride = istep / ilsize;
+    const npy_intp ostride = ostep / olsize;
+    assert(len <= 1 || ostep % olsize == 0);
+
+    if ((istep % ilsize == 0) &&
+        !is_mem_overlap(ip, istep, op, ostep, len) &&
+        npyv_loadable_stride_@sfx@(istride) &&
+        npyv_storable_stride_@sfx@(ostride))
+    {
+        if (istride == 1 && ostride == 1) {
+            simd_unary_@kind@_@TYPE@_CONTIG_CONTIG(ip, 1, op, 1, len);
+        }
+        else if (ostride == 1) {
+            simd_unary_@kind@_@TYPE@_NCONTIG_CONTIG(ip, istride, op, 1, len);
+        }
+        else if (istride == 1) {
+            simd_unary_@kind@_@TYPE@_CONTIG_NCONTIG(ip, 1, op, ostride, len);
+        } else {
+            simd_unary_@kind@_@TYPE@_NCONTIG_NCONTIG(ip, istride, op, ostride, len);
+        }
+    } else
+#endif // @VCHK@
+    {
+    UNARY_LOOP {
+        const npyv_lanetype_@sfx@ in = *(npyv_lanetype_@sfx@ *)ip1;
+        *((npy_bool *)op1) = (npy_@kind@(in) != 0);
+    }
+    }
+
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+/**end repeat1**/
+/**end repeat**/
diff --git a/numpy/core/src/umath/loops_utils.h.src b/numpy/core/src/umath/loops_utils.h.src
index df92bc315..5640a1f0b 100644
--- a/numpy/core/src/umath/loops_utils.h.src
+++ b/numpy/core/src/umath/loops_utils.h.src
@@ -74,7 +74,7 @@ is_mem_overlap(const void *src, npy_intp src_step, const void *dst, npy_intp dst
  * The recursion depth is O(lg n) as well.
  * when updating also update similar complex floats summation
  */
-static NPY_INLINE @type@
+static inline @type@
 @TYPE@_pairwise_sum(char *a, npy_intp n, npy_intp stride)
 {
     if (n < 8) {
@@ -152,7 +152,7 @@ static NPY_INLINE @type@
  * #SIMD = 1, 1, 0#
  */
 /* similar to pairwise sum of real floats */
-static NPY_INLINE void
+static inline void
 @TYPE@_pairwise_sum(@ftype@ *rr, @ftype@ * ri, char * a, npy_intp n,
                     npy_intp stride)
 {
diff --git a/numpy/core/src/umath/matmul.c.src b/numpy/core/src/umath/matmul.c.src
index 4dd0c4759..127bb5e27 100644
--- a/numpy/core/src/umath/matmul.c.src
+++ b/numpy/core/src/umath/matmul.c.src
@@ -45,7 +45,7 @@
  * 3. The slower (first) axis stride, in unit steps, must be larger than
  *    the faster axis dimension
  */
-static NPY_INLINE npy_bool
+static inline npy_bool
 is_blasable2d(npy_intp byte_stride1, npy_intp byte_stride2,
               npy_intp d1, npy_intp d2,  npy_intp itemsize)
 {
diff --git a/numpy/core/src/umath/override.c b/numpy/core/src/umath/override.c
index d247c2639..167164163 100644
--- a/numpy/core/src/umath/override.c
+++ b/numpy/core/src/umath/override.c
@@ -23,18 +23,19 @@
  * Returns -1 on failure.
  */
 static int
-get_array_ufunc_overrides(PyObject *in_args, PyObject *out_args,
+get_array_ufunc_overrides(PyObject *in_args, PyObject *out_args, PyObject *wheremask_obj,
                           PyObject **with_override, PyObject **methods)
 {
     int i;
     int num_override_args = 0;
-    int narg, nout;
+    int narg, nout, nwhere;
 
     narg = (int)PyTuple_GET_SIZE(in_args);
     /* It is valid for out_args to be NULL: */
     nout = (out_args != NULL) ? (int)PyTuple_GET_SIZE(out_args) : 0;
+    nwhere = (wheremask_obj != NULL) ? 1: 0;
 
-    for (i = 0; i < narg + nout; ++i) {
+    for (i = 0; i < narg + nout + nwhere; ++i) {
         PyObject *obj;
         int j;
         int new_class = 1;
@@ -42,9 +43,12 @@ get_array_ufunc_overrides(PyObject *in_args, PyObject *out_args,
         if (i < narg) {
             obj = PyTuple_GET_ITEM(in_args, i);
         }
-        else {
+        else if (i < narg + nout){
             obj = PyTuple_GET_ITEM(out_args, i - narg);
         }
+        else {
+            obj = wheremask_obj;
+        }
         /*
          * Have we seen this class before?  If so, ignore.
          */
@@ -208,7 +212,7 @@ copy_positional_args_to_kwargs(const char **keywords,
  */
 NPY_NO_EXPORT int
 PyUFunc_CheckOverride(PyUFuncObject *ufunc, char *method,
-        PyObject *in_args, PyObject *out_args,
+        PyObject *in_args, PyObject *out_args, PyObject *wheremask_obj,
         PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames,
         PyObject **result)
 {
@@ -227,7 +231,7 @@ PyUFunc_CheckOverride(PyUFuncObject *ufunc, char *method,
      * Check inputs for overrides
      */
     num_override_args = get_array_ufunc_overrides(
-           in_args, out_args, with_override, array_ufunc_methods);
+           in_args, out_args, wheremask_obj, with_override, array_ufunc_methods);
     if (num_override_args == -1) {
         goto fail;
     }
diff --git a/numpy/core/src/umath/override.h b/numpy/core/src/umath/override.h
index 4e9a323ca..20621bb19 100644
--- a/numpy/core/src/umath/override.h
+++ b/numpy/core/src/umath/override.h
@@ -6,7 +6,7 @@
 
 NPY_NO_EXPORT int
 PyUFunc_CheckOverride(PyUFuncObject *ufunc, char *method,
-        PyObject *in_args, PyObject *out_args,
+        PyObject *in_args, PyObject *out_args, PyObject *wheremask_obj,
         PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames,
         PyObject **result);
 
diff --git a/numpy/core/src/umath/reduction.c b/numpy/core/src/umath/reduction.c
index 817f99a04..9416e9a29 100644
--- a/numpy/core/src/umath/reduction.c
+++ b/numpy/core/src/umath/reduction.c
@@ -17,6 +17,9 @@
 #include "numpy/arrayobject.h"
 
 #include "npy_pycompat.h"
+#include "array_assign.h"
+#include "array_coercion.h"
+#include "array_method.h"
 #include "ctors.h"
 
 #include "numpy/ufuncobject.h"
@@ -148,24 +151,15 @@ PyArray_CopyInitialReduceValues(
  * context     : The ArrayMethod context (with ufunc, method, and descriptors).
  * operand     : The array to be reduced.
  * out         : NULL, or the array into which to place the result.
- * wheremask   : NOT YET SUPPORTED, but this parameter is placed here
- *               so that support can be added in the future without breaking
- *               API compatibility. Pass in NULL.
+ * wheremask   : Reduction mask of valid values used for `where=`.
  * axis_flags  : Flags indicating the reduction axes of 'operand'.
- * reorderable : If True, the reduction being done is reorderable, which
- *               means specifying multiple axes of reduction at once is ok,
- *               and the reduction code may calculate the reduction in an
- *               arbitrary order. The calculation may be reordered because
- *               of cache behavior or multithreading requirements.
  * keepdims    : If true, leaves the reduction dimensions in the result
  *               with size one.
  * subok       : If true, the result uses the subclass of operand, otherwise
  *               it is always a base class ndarray.
- * identity    : If Py_None, PyArray_CopyInitialReduceValues is used, otherwise
- *               this value is used to initialize the result to
- *               the reduction's unit.
+ * initial     : Initial value, if NULL the default is fetched from the
+ *               ArrayMethod (typically as the default from the ufunc).
  * loop        : `reduce_loop` from `ufunc_object.c`.  TODO: Refactor
- * data        : Data which is passed to the inner loop.
  * buffersize  : Buffer size for the iterator. For the default, pass in 0.
  * funcname    : The name of the reduction function, for error messages.
  * errormask   : forwarded from _get_bufsize_errmask
@@ -182,9 +176,9 @@ PyArray_CopyInitialReduceValues(
 NPY_NO_EXPORT PyArrayObject *
 PyUFunc_ReduceWrapper(PyArrayMethod_Context *context,
         PyArrayObject *operand, PyArrayObject *out, PyArrayObject *wheremask,
-        npy_bool *axis_flags, int reorderable, int keepdims,
-        PyObject *identity, PyArray_ReduceLoopFunc *loop,
-        void *data, npy_intp buffersize, const char *funcname, int errormask)
+        npy_bool *axis_flags, int keepdims,
+        PyObject *initial, PyArray_ReduceLoopFunc *loop,
+        npy_intp buffersize, const char *funcname, int errormask)
 {
     assert(loop != NULL);
     PyArrayObject *result = NULL;
@@ -198,38 +192,35 @@ PyUFunc_ReduceWrapper(PyArrayMethod_Context *context,
     /* Loop auxdata (must be freed on error) */
     NpyAuxData *auxdata = NULL;
 
-    /* More than one axis means multiple orders are possible */
-    if (!reorderable && count_axes(PyArray_NDIM(operand), axis_flags) > 1) {
-        PyErr_Format(PyExc_ValueError,
-                     "reduction operation '%s' is not reorderable, "
-                     "so at most one axis may be specified",
-                     funcname);
-        return NULL;
-    }
-    /* Can only use where with an initial ( from identity or argument) */
-    if (wheremask != NULL && identity == Py_None) {
-        PyErr_Format(PyExc_ValueError,
-                     "reduction operation '%s' does not have an identity, "
-                     "so to use a where mask one has to specify 'initial'",
-                     funcname);
-        return NULL;
-    }
-
-
     /* Set up the iterator */
     op[0] = out;
     op[1] = operand;
     op_dtypes[0] = context->descriptors[0];
     op_dtypes[1] = context->descriptors[1];
 
+    /* Buffer to use when we need an initial value */
+    char *initial_buf = NULL;
+
+    /* More than one axis means multiple orders are possible */
+    if (!(context->method->flags & NPY_METH_IS_REORDERABLE)
+            && count_axes(PyArray_NDIM(operand), axis_flags) > 1) {
+        PyErr_Format(PyExc_ValueError,
+                "reduction operation '%s' is not reorderable, "
+                "so at most one axis may be specified",
+                funcname);
+        goto fail;
+    }
+
     it_flags = NPY_ITER_BUFFERED |
             NPY_ITER_EXTERNAL_LOOP |
             NPY_ITER_GROWINNER |
-            NPY_ITER_DONT_NEGATE_STRIDES |
             NPY_ITER_ZEROSIZE_OK |
             NPY_ITER_REFS_OK |
             NPY_ITER_DELAY_BUFALLOC |
             NPY_ITER_COPY_IF_OVERLAP;
+    if (!(context->method->flags & NPY_METH_IS_REORDERABLE)) {
+        it_flags |= NPY_ITER_DONT_NEGATE_STRIDES;
+    }
     op_flags[0] = NPY_ITER_READWRITE |
                   NPY_ITER_ALIGNED |
                   NPY_ITER_ALLOCATE |
@@ -297,8 +288,52 @@ PyUFunc_ReduceWrapper(PyArrayMethod_Context *context,
         goto fail;
     }
 
+    npy_bool empty_iteration = NpyIter_GetIterSize(iter) == 0;
     result = NpyIter_GetOperandArray(iter)[0];
 
+    /*
+     * Get the initial value (if it exists).  If the iteration is empty
+     * then we assume the reduction is also empty.  The reason is that when
+     * the outer iteration is empty we just won't use the initial value
+     * in any case.  (`np.sum(np.zeros((0, 3)), axis=0)` is a length 3
+     * reduction but has an empty result.)
+     */
+    if ((initial == NULL && context->method->get_reduction_initial == NULL)
+            || initial == Py_None) {
+        /* There is no initial value, or initial value was explicitly unset */
+    }
+    else {
+        /* Not all functions will need initialization, but init always: */
+        initial_buf = PyMem_Calloc(1, op_dtypes[0]->elsize);
+        if (initial_buf == NULL) {
+            PyErr_NoMemory();
+            goto fail;
+        }
+        if (initial != NULL) {
+            /* must use user provided initial value */
+            if (PyArray_Pack(op_dtypes[0], initial_buf, initial) < 0) {
+                goto fail;
+            }
+        }
+        else {
+            /*
+             * Fetch initial from ArrayMethod, we pretend the reduction is
+             * empty when the iteration is.  This may be wrong, but when it is,
+             * we will not need the identity as the result is also empty.
+             */
+            int has_initial = context->method->get_reduction_initial(
+                    context, empty_iteration, initial_buf);
+            if (has_initial < 0) {
+                goto fail;
+            }
+            if (!has_initial) {
+                /* We have no initial value available, free buffer to indicate */
+                PyMem_FREE(initial_buf);
+                initial_buf = NULL;
+            }
+        }
+    }
+
     PyArrayMethod_StridedLoop *strided_loop;
     NPY_ARRAYMETHOD_FLAGS flags = 0;
 
@@ -313,12 +348,27 @@ PyUFunc_ReduceWrapper(PyArrayMethod_Context *context,
      * Initialize the result to the reduction unit if possible,
      * otherwise copy the initial values and get a view to the rest.
      */
-    if (identity != Py_None) {
-        if (PyArray_FillWithScalar(result, identity) < 0) {
+    if (initial_buf != NULL) {
+        /* Loop provided an identity or default value, assign to result. */
+        int ret = raw_array_assign_scalar(
+                PyArray_NDIM(result), PyArray_DIMS(result),
+                PyArray_DESCR(result),
+                PyArray_BYTES(result), PyArray_STRIDES(result),
+                op_dtypes[0], initial_buf);
+        if (ret < 0) {
             goto fail;
         }
     }
     else {
+        /* Can only use where with an initial (from identity or argument) */
+        if (wheremask != NULL) {
+            PyErr_Format(PyExc_ValueError,
+                    "reduction operation '%s' does not have an identity, "
+                    "so to use a where mask one has to specify 'initial'",
+                    funcname);
+            return NULL;
+        }
+
         /*
          * For 1-D skip_first_count could be optimized to 0, but no-identity
          * reductions are not super common.
@@ -354,7 +404,7 @@ PyUFunc_ReduceWrapper(PyArrayMethod_Context *context,
         }
     }
 
-    if (NpyIter_GetIterSize(iter) != 0) {
+    if (!empty_iteration) {
         NpyIter_IterNextFunc *iternext;
         char **dataptr;
         npy_intp *strideptr;
@@ -387,6 +437,10 @@ PyUFunc_ReduceWrapper(PyArrayMethod_Context *context,
     }
     Py_INCREF(result);
 
+    if (initial_buf != NULL && PyDataType_REFCHK(PyArray_DESCR(result))) {
+        PyArray_Item_XDECREF(initial_buf, PyArray_DESCR(result));
+    }
+    PyMem_FREE(initial_buf);
     NPY_AUXDATA_FREE(auxdata);
     if (!NpyIter_Deallocate(iter)) {
         Py_DECREF(result);
@@ -395,6 +449,10 @@ PyUFunc_ReduceWrapper(PyArrayMethod_Context *context,
     return result;
 
 fail:
+    if (initial_buf != NULL && PyDataType_REFCHK(op_dtypes[0])) {
+        PyArray_Item_XDECREF(initial_buf, op_dtypes[0]);
+    }
+    PyMem_FREE(initial_buf);
     NPY_AUXDATA_FREE(auxdata);
     if (iter != NULL) {
         NpyIter_Deallocate(iter);
diff --git a/numpy/core/src/umath/reduction.h b/numpy/core/src/umath/reduction.h
index 2170e27a7..d2cbe4849 100644
--- a/numpy/core/src/umath/reduction.h
+++ b/numpy/core/src/umath/reduction.h
@@ -64,8 +64,8 @@ typedef int (PyArray_ReduceLoopFunc)(PyArrayMethod_Context *context,
 NPY_NO_EXPORT PyArrayObject *
 PyUFunc_ReduceWrapper(PyArrayMethod_Context *context,
         PyArrayObject *operand, PyArrayObject *out, PyArrayObject *wheremask,
-        npy_bool *axis_flags, int reorderable, int keepdims,
-        PyObject *identity, PyArray_ReduceLoopFunc *loop,
-        void *data, npy_intp buffersize, const char *funcname, int errormask);
+        npy_bool *axis_flags, int keepdims,
+        PyObject *initial, PyArray_ReduceLoopFunc *loop,
+        npy_intp buffersize, const char *funcname, int errormask);
 
 #endif
diff --git a/numpy/core/src/umath/scalarmath.c.src b/numpy/core/src/umath/scalarmath.c.src
index 7c63ac0f1..a159fdc12 100644
--- a/numpy/core/src/umath/scalarmath.c.src
+++ b/numpy/core/src/umath/scalarmath.c.src
@@ -57,7 +57,7 @@
  *  #name = byte, short, int, long, longlong#
  *  #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong#
  */
-static NPY_INLINE int
+static inline int
 @name@_ctype_add(@type@ a, @type@ b, @type@ *out) {
     *out = a + b;
     if ((*out^a) >= 0 || (*out^b) >= 0) {
@@ -66,7 +66,7 @@ static NPY_INLINE int
     return NPY_FPE_OVERFLOW;
 }
 
-static NPY_INLINE int
+static inline int
 @name@_ctype_subtract(@type@ a, @type@ b, @type@ *out) {
     *out = a - b;
     if ((*out^a) >= 0 || (*out^~b) >= 0) {
@@ -80,7 +80,7 @@ static NPY_INLINE int
  *  #name = ubyte, ushort, uint, ulong, ulonglong#
  *  #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong#
  */
-static NPY_INLINE int
+static inline int
 @name@_ctype_add(@type@ a, @type@ b, @type@ *out) {
     *out = a + b;
     if (*out >= a && *out >= b) {
@@ -89,7 +89,7 @@ static NPY_INLINE int
     return NPY_FPE_OVERFLOW;
 }
 
-static NPY_INLINE int
+static inline int
 @name@_ctype_subtract(@type@ a, @type@ b, @type@ *out) {
     *out = a - b;
     if (a >= b) {
@@ -118,7 +118,7 @@ static NPY_INLINE int
  * #neg = (1,0)*4#
  */
 #if NPY_SIZEOF_@SIZE@ > NPY_SIZEOF_@SIZENAME@
-static NPY_INLINE int
+static inline int
 @name@_ctype_multiply(@type@ a, @type@ b, @type@ *out) {
     @big@ temp;
     temp = ((@big@) a) * ((@big@) b);
@@ -144,7 +144,7 @@ static NPY_INLINE int
  * #SIZE = INT*2, LONG*2, LONGLONG*2#
  */
 #if NPY_SIZEOF_LONGLONG == NPY_SIZEOF_@SIZE@
-static NPY_INLINE int
+static inline int
 @name@_ctype_multiply(@type@ a, @type@ b, @type@ *out) {
     if (npy_mul_with_overflow_@name@(out, a, b)) {
         return NPY_FPE_OVERFLOW;
@@ -171,7 +171,7 @@ static NPY_INLINE int
     #define DIVIDEBYZERO_CHECK (b == 0)
 #endif
 
-static NPY_INLINE int
+static inline int
 @name@_ctype_divide(@type@ a, @type@ b, @type@ *out) {
     if (b == 0) {
         *out = 0;
@@ -200,7 +200,7 @@ static NPY_INLINE int
 
 #define @name@_ctype_floor_divide @name@_ctype_divide
 
-static NPY_INLINE int
+static inline int
 @name@_ctype_remainder(@type@ a, @type@ b, @type@ *out) {
     if (DIVIDEBYZERO_CHECK) {
         *out = 0;
@@ -232,7 +232,7 @@ static NPY_INLINE int
  *         ulong, longlong, ulonglong#
  */
 
-static NPY_INLINE int
+static inline int
 @name@_ctype_true_divide(npy_@name@ a, npy_@name@ b, npy_double *out)
 {
     *out = (npy_double)a / (npy_double)b;
@@ -251,7 +251,7 @@ static NPY_INLINE int
  * #upc = BYTE, UBYTE, SHORT, USHORT, INT, UINT,
  *        LONG, ULONG, LONGLONG, ULONGLONG#
  */
-static NPY_INLINE int
+static inline int
 @name@_ctype_power(@type@ a, @type@ b, @type@ *out) {
     @type@ tmp;
 
@@ -292,7 +292,7 @@ static NPY_INLINE int
  * #op = &, ^, |#
  */
 
-static NPY_INLINE int
+static inline int
 @name@_ctype_@oper@(@type@ arg1, @type@ arg2, @type@ *out)
 {
     *out = arg1 @op@ arg2;
@@ -301,14 +301,14 @@ static NPY_INLINE int
 
 /**end repeat1**/
 
-static NPY_INLINE int
+static inline int
 @name@_ctype_lshift(@type@ arg1, @type@ arg2, @type@ *out)
 {
     *out = npy_lshift@suffix@(arg1, arg2);
     return 0;
 }
 
-static NPY_INLINE int
+static inline int
 @name@_ctype_rshift(@type@ arg1, @type@ arg2, @type@ *out)
 {
     *out = npy_rshift@suffix@(arg1, arg2);
@@ -328,7 +328,7 @@ static NPY_INLINE int
  * #oper = add, subtract, multiply, divide#
  */
 
-static NPY_INLINE int
+static inline int
 @name@_ctype_@oper@(@type@ a, @type@ b, @type@ *out)
 {
     *out = a @OP@ b;
@@ -340,21 +340,21 @@ static NPY_INLINE int
 #define @name@_ctype_true_divide @name@_ctype_divide
 
 
-static NPY_INLINE int
+static inline int
 @name@_ctype_floor_divide(@type@ a, @type@ b, @type@ *out) {
     *out = npy_floor_divide@c@(a, b);
     return 0;
 }
 
 
-static NPY_INLINE int
+static inline int
 @name@_ctype_remainder(@type@ a, @type@ b, @type@ *out) {
     *out = npy_remainder@c@(a, b);
     return 0;
 }
 
 
-static NPY_INLINE int
+static inline int
 @name@_ctype_divmod(@type@ a, @type@ b, @type@ *out1, @type@ *out2) {
     *out1 = npy_divmod@c@(a, b, out2);
     return 0;
@@ -368,7 +368,7 @@ static NPY_INLINE int
  * #oper = add, subtract, multiply, divide#
  */
 
-static NPY_INLINE int
+static inline int
 half_ctype_@oper@(npy_half a, npy_half b, npy_half *out)
 {
     float res = npy_half_to_float(a) @OP@ npy_half_to_float(b);
@@ -380,7 +380,7 @@ half_ctype_@oper@(npy_half a, npy_half b, npy_half *out)
 #define half_ctype_true_divide half_ctype_divide
 
 
-static NPY_INLINE int
+static inline int
 half_ctype_floor_divide(npy_half a, npy_half b, npy_half *out)
 {
     npy_half mod;
@@ -396,7 +396,7 @@ half_ctype_floor_divide(npy_half a, npy_half b, npy_half *out)
 }
 
 
-static NPY_INLINE int
+static inline int
 half_ctype_remainder(npy_half a, npy_half b, npy_half *out)
 {
     npy_half_divmod(a, b, out);
@@ -404,7 +404,7 @@ half_ctype_remainder(npy_half a, npy_half b, npy_half *out)
 }
 
 
-static NPY_INLINE int
+static inline int
 half_ctype_divmod(npy_half a, npy_half b, npy_half *out1, npy_half *out2)
 {
     *out1 = npy_half_divmod(a, b, out2);
@@ -419,7 +419,7 @@ half_ctype_divmod(npy_half a, npy_half b, npy_half *out1, npy_half *out2)
  * #rtype = npy_float, npy_double, npy_longdouble#
  * #c = f,,l#
  */
-static NPY_INLINE int
+static inline int
 @name@_ctype_add(@type@ a, @type@ b, @type@ *out)
 {
     out->real = a.real + b.real;
@@ -427,7 +427,7 @@ static NPY_INLINE int
     return 0;
 }
 
-static NPY_INLINE int
+static inline int
 @name@_ctype_subtract(@type@ a, @type@ b, @type@ *out)
 {
     out->real = a.real - b.real;
@@ -440,7 +440,7 @@ static NPY_INLINE int
  * TODO: Mark as  to work around FPEs not being issues on clang 12.
  *       This should be removed when possible.
  */
-static NPY_INLINE int
+static inline int
 @name@_ctype_multiply( @type@ a, @type@ b, @type@ *out)
 {
     out->real = a.real * b.real - a.imag * b.imag;
@@ -449,11 +449,11 @@ static NPY_INLINE int
 }
 
 /* Use the ufunc loop directly to avoid duplicating the complicated logic */
-static NPY_INLINE int
+static inline int
 @name@_ctype_divide(@type@ a, @type@ b, @type@ *out)
 {
     char *args[3] = {(char *)&a, (char *)&b, (char *)out};
-    npy_intp steps[3];
+    npy_intp steps[3] = {0, 0, 0};
     npy_intp size = 1;
     @TYPE@_divide(args, &size, steps, NULL);
     return 0;
@@ -470,7 +470,7 @@ static NPY_INLINE int
  *         longlong, ulonglong#
  */
 
-static NPY_INLINE int
+static inline int
 @name@_ctype_divmod(npy_@name@ a, npy_@name@ b, npy_@name@ *out, npy_@name@ *out2)
 {
     int res = @name@_ctype_floor_divide(a, b, out);
@@ -487,7 +487,7 @@ static NPY_INLINE int
  * #c = f,,l#
  */
 
-static NPY_INLINE int
+static inline int
 @name@_ctype_power(@type@ a, @type@ b, @type@ *out)
 {
     *out = npy_pow@c@(a, b);
@@ -495,7 +495,7 @@ static NPY_INLINE int
 }
 
 /**end repeat**/
-static NPY_INLINE int
+static inline int
 half_ctype_power(npy_half a, npy_half b, npy_half *out)
 {
     const npy_float af = npy_half_to_float(a);
@@ -518,7 +518,7 @@ half_ctype_power(npy_half a, npy_half b, npy_half *out)
  * #uns = (0,1)*5,0*3#
  * #int = 1*10,0*3#
  */
-static NPY_INLINE int
+static inline int
 @name@_ctype_negative(@type@ a, @type@ *out)
 {
 #if @uns@
@@ -541,7 +541,7 @@ static NPY_INLINE int
 }
 /**end repeat**/
 
-static NPY_INLINE int
+static inline int
 half_ctype_negative(npy_half a, npy_half *out)
 {
     *out = a^0x8000u;
@@ -553,7 +553,7 @@ half_ctype_negative(npy_half a, npy_half *out)
  * #name = cfloat, cdouble, clongdouble#
  * #type = npy_cfloat, npy_cdouble, npy_clongdouble#
  */
-static NPY_INLINE int
+static inline int
 @name@_ctype_negative(@type@ a, @type@ *out)
 {
     out->real = -a.real;
@@ -570,7 +570,7 @@ static NPY_INLINE int
  *         npy_long, npy_ulong, npy_longlong, npy_ulonglong,
  *         npy_half, npy_float, npy_double, npy_longdouble#
  */
-static NPY_INLINE int
+static inline int
 @name@_ctype_positive(@type@ a, @type@ *out)
 {
     *out = a;
@@ -583,7 +583,7 @@ static NPY_INLINE int
  * #type = npy_cfloat, npy_cdouble, npy_clongdouble#
  * #c = f,,l#
  */
-static NPY_INLINE int
+static inline int
 @name@_ctype_positive(@type@ a, @type@ *out)
 {
     out->real = a.real;
@@ -591,7 +591,7 @@ static NPY_INLINE int
     return 0;
 }
 
-static NPY_INLINE int
+static inline int
 @name@_ctype_power(@type@ a, @type@ b, @type@ *out)
 {
     *out = npy_cpow@c@(a, b);
@@ -614,7 +614,7 @@ static NPY_INLINE int
  * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong#
  * #NAME = BYTE, SHORT, INT, LONG, LONGLONG#
  */
-static NPY_INLINE int
+static inline int
 @name@_ctype_absolute(@type@ a, @type@ *out)
 {
     if (a == NPY_MIN_@NAME@) {
@@ -631,7 +631,7 @@ static NPY_INLINE int
  * #type = npy_float, npy_double, npy_longdouble#
  * #c = f,,l#
  */
-static NPY_INLINE int
+static inline int
 @name@_ctype_absolute(@type@ a, @type@ *out)
 {
     *out = npy_fabs@c@(a);
@@ -639,7 +639,7 @@ static NPY_INLINE int
 }
 /**end repeat**/
 
-static NPY_INLINE int
+static inline int
 half_ctype_absolute(npy_half a, npy_half *out)
 {
     *out = a&0x7fffu;
@@ -652,7 +652,7 @@ half_ctype_absolute(npy_half a, npy_half *out)
  * #rtype = npy_float, npy_double, npy_longdouble#
  * #c = f,,l#
  */
-static NPY_INLINE int
+static inline int
 @name@_ctype_absolute(@type@ a, @rtype@ *out)
 {
     *out = npy_cabs@c@(a);
@@ -665,7 +665,7 @@ static NPY_INLINE int
  *         ulong, longlong, ulonglong#
  */
 
-static NPY_INLINE int
+static inline int
 @name@_ctype_invert(npy_@name@ a, npy_@name@ *out)
 {
     *out = ~a;
@@ -806,7 +806,7 @@ typedef enum {
      */
     CONVERT_PYSCALAR,
     /*
-     * Other object is an unkown scalar or array-like, we (typically) use
+     * Other object is an unknown scalar or array-like, we (typically) use
      * the generic path, which normally ends up in the ufunc machinery.
      */
     OTHER_IS_UNKNOWN_OBJECT,
@@ -929,7 +929,7 @@ typedef enum {
  * @result The result value indicating what we did with `value` or what type
  *         of object it is (see `conversion_result`).
  */
-static NPY_INLINE conversion_result
+static inline conversion_result
 convert_to_@name@(PyObject *value, @type@ *result, npy_bool *may_need_deferring)
 {
     PyArray_Descr *descr;
@@ -1004,7 +1004,7 @@ convert_to_@name@(PyObject *value, @type@ *result, npy_bool *may_need_deferring)
         if (overflow) {
             /* handle as if "unsafe" */
             if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
-                return PROMOTION_REQUIRED;
+                return OTHER_IS_UNKNOWN_OBJECT;
             }
             return CONVERT_PYSCALAR;
         }
@@ -1179,6 +1179,11 @@ convert_to_@name@(PyObject *value, @type@ *result, npy_bool *may_need_deferring)
  *         (Half, Float, Double, LongDouble,
  *             CFloat, CDouble, CLongDouble)*4,
  *         (Half, Float, Double, LongDouble)*3#
+ * #NAME = (BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ *              LONG, ULONG, LONGLONG, ULONGLONG)*12,
+ *          (HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ *              CFLOAT, CDOUBLE, CLONGDOUBLE)*4,
+ *          (HALF, FLOAT, DOUBLE, LONGDOUBLE)*3#
  * #type = (npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
  *             npy_long, npy_ulong, npy_longlong, npy_ulonglong)*12,
  *         (npy_half, npy_float, npy_double, npy_longdouble,
@@ -1202,24 +1207,12 @@ convert_to_@name@(PyObject *value, @type@ *result, npy_bool *may_need_deferring)
  *         (npy_half, npy_float, npy_double, npy_longdouble,
  *             npy_cfloat, npy_cdouble, npy_clongdouble)*4,
  *         (npy_half, npy_float, npy_double, npy_longdouble)*3#
- * #oname = (byte, ubyte, short, ushort, int, uint,
- *              long, ulong, longlong, ulonglong)*11,
- *          double*10,
- *          (half, float, double, longdouble,
- *              cfloat, cdouble, clongdouble)*4,
- *          (half, float, double, longdouble)*3#
  * #OName = (Byte, UByte, Short, UShort, Int, UInt,
  *              Long, ULong, LongLong, ULongLong)*11,
  *          Double*10,
  *          (Half, Float, Double, LongDouble,
  *              CFloat, CDouble, CLongDouble)*4,
  *          (Half, Float, Double, LongDouble)*3#
- * #ONAME = (BYTE, UBYTE, SHORT, USHORT, INT, UINT,
- *              LONG, ULONG, LONGLONG, ULONGLONG)*11,
- *          DOUBLE*10,
- *          (HALF, FLOAT, DOUBLE, LONGDOUBLE,
- *              CFLOAT, CDOUBLE, CLONGDOUBLE)*4,
- *          (HALF, FLOAT, DOUBLE, LONGDOUBLE)*3#
  */
 #define IS_@name@
 /* drop the "true_" from "true_divide" for floating point warnings: */
@@ -1234,7 +1227,7 @@ static PyObject *
 @name@_@oper@(PyObject *a, PyObject *b)
 {
     PyObject *ret;
-    @otype@ arg1, arg2, other_val;
+    @type@ arg1, arg2, other_val;
 
     /*
      * Check if this operation may be considered forward.  Note `is_forward`
@@ -1263,7 +1256,7 @@ static PyObject *
     PyObject *other = is_forward ? b : a;
 
     npy_bool may_need_deferring;
-    conversion_result res = convert_to_@oname@(
+    conversion_result res = convert_to_@name@(
             other, &other_val, &may_need_deferring);
     if (res == CONVERSION_ERROR) {
         return NULL;  /* an error occurred (should never happen) */
@@ -1305,7 +1298,7 @@ static PyObject *
              */
             return PyGenericArrType_Type.tp_as_number->nb_@oper@(a,b);
         case CONVERT_PYSCALAR:
-            if (@ONAME@_setitem(other, (char *)&other_val, NULL) < 0) {
+            if (@NAME@_setitem(other, (char *)&other_val, NULL) < 0) {
                 return NULL;
             }
             break;
@@ -1345,7 +1338,7 @@ static PyObject *
 #if @twoout@
     int retstatus = @name@_ctype_@oper@(arg1, arg2, &out, &out2);
 #else
-    int retstatus = @oname@_ctype_@oper@(arg1, arg2, &out);
+    int retstatus = @name@_ctype_@oper@(arg1, arg2, &out);
 #endif
 
 #if @fperr@
@@ -1549,7 +1542,7 @@ static PyObject *
  *
  */
 
-/* 
+/*
  * Complex numbers do not support remainder so we manually make sure that the
  * operation is not defined.  This is/was especially important for longdoubles
  * due to their tendency to recurse for some operations, see gh-18548.
@@ -1711,7 +1704,7 @@ static int
 emit_complexwarning(void)
 {
     static PyObject *cls = NULL;
-    npy_cache_import("numpy.core", "ComplexWarning", &cls);
+    npy_cache_import("numpy.exceptions", "ComplexWarning", &cls);
     if (cls == NULL) {
         return -1;
     }
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
deleted file mode 100644
index d6c9a7e65..000000000
--- a/numpy/core/src/umath/simd.inc.src
+++ /dev/null
@@ -1,1215 +0,0 @@
-
-
-/*
- * This file is for the definitions of simd vectorized operations.
- *
- * Currently contains sse2 functions that are built on amd64, x32 or
- * non-generic builds (CFLAGS=-march=...)
- * In future it may contain other instruction sets like AVX or NEON detected
- * at runtime in which case it needs to be included indirectly via a file
- * compiled with special options (or use gcc target attributes) so the binary
- * stays portable.
- */
-
-
-#ifndef __NPY_SIMD_INC
-#define __NPY_SIMD_INC
-
-#include "lowlevel_strided_loops.h"
-#include "numpy/npy_common.h"
-#include "numpy/npy_math.h"
-#include "npy_simd_data.h"
-#ifdef NPY_HAVE_SSE2_INTRINSICS
-#include <emmintrin.h>
-#if !defined(_MSC_VER) || _MSC_VER >= 1600
-#include <immintrin.h>
-#else
-#undef __AVX2__
-#undef __AVX512F__
-#endif
-#endif
-#include "loops_utils.h" // nomemoverlap
-#include <assert.h>
-#include <stdlib.h>
-#include <float.h>
-#include <string.h> /* for memcpy */
-
-#define VECTOR_SIZE_BYTES 16
-
-/*
- * Dispatcher functions
- * decide whether the operation can be vectorized and run it
- * if it was run returns true and false if nothing was done
- */
-
-/*
- *****************************************************************************
- **                           CMPLX DISPATCHERS
- *****************************************************************************
- */
-
-/**begin repeat
- * #TYPE = CFLOAT, CDOUBLE#
- * #type= npy_float, npy_double#
- * #esize = 8, 16#
- */
-
-/**begin repeat1
- *  #func = square, absolute, conjugate#
- *  #outsize = 1, 2, 1#
- *  #max_stride = 2, 8, 8#
- */
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
-static NPY_INLINE NPY_GCC_TARGET_AVX512F void
-AVX512F_@func@_@TYPE@(@type@*, @type@*, const npy_intp n, const npy_intp stride);
-#endif
-
-static NPY_INLINE int
-run_unary_avx512f_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps)
-{
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
-    if ((IS_OUTPUT_BLOCKABLE_UNARY(@esize@, (npy_uint)(@esize@/@outsize@), 64)) && (labs(steps[0]) < 2*@max_stride@*@esize@)) {
-        AVX512F_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0], steps[0]);
-        return 1;
-    }
-    else
-        return 0;
-#endif
-    return 0;
-}
-
-/**end repeat1**/
-/**end repeat**/
-
-/*
- *****************************************************************************
- **                           FLOAT DISPATCHERS
- *****************************************************************************
- */
-
-/**begin repeat
- * #type = npy_float, npy_double, npy_longdouble#
- * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
- * #EXISTS = 1, 1, 0#
- */
-
-/**begin repeat1
- * #func = isnan, isfinite, isinf, signbit#
- */
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
-static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void
-AVX512_SKX_@func@_@TYPE@(npy_bool*, @type@*, const npy_intp n, const npy_intp stride);
-#endif
-
-static NPY_INLINE int
-run_@func@_avx512_skx_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
-    if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(@type@), sizeof(npy_bool), 64)) {
-        AVX512_SKX_@func@_@TYPE@((npy_bool*)args[1], (@type@*)args[0], dimensions[0], steps[0]);
-        return 1;
-    }
-    else {
-        return 0;
-    }
-#endif
-    return 0;
-}
-
-
-/**end repeat1**/
-/**end repeat**/
-
-/**begin repeat
- * Float types
- *  #type = npy_float, npy_double, npy_longdouble#
- *  #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
- *  #vector = 1, 1, 0#
- *  #VECTOR = NPY_SIMD, NPY_SIMD_F64, 0 #
- */
-
-/**begin repeat1
- * #func = negative#
- * #check = IS_BLOCKABLE_UNARY#
- * #name = unary#
- */
-
-#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
-
-/* prototypes */
-static void
-sse2_@func@_@TYPE@(@type@ *, @type@ *, const npy_intp n);
-
-#endif
-
-static NPY_INLINE int
-run_@name@_simd_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
-    if (@check@(sizeof(@type@), VECTOR_SIZE_BYTES)) {
-        sse2_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0]);
-        return 1;
-    }
-#endif
-    return 0;
-}
-
-/**end repeat1**/
-
-/**begin repeat1
- * #kind = isnan, isfinite, isinf, signbit#
- */
-
-#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
-
-static void
-sse2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n);
-
-#endif
-
-static NPY_INLINE int
-run_@kind@_simd_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
-    if (steps[0] == sizeof(@type@) && steps[1] == 1 &&
-        npy_is_aligned(args[0], sizeof(@type@))) {
-        sse2_@kind@_@TYPE@((npy_bool*)args[1], (@type@*)args[0], dimensions[0]);
-        return 1;
-    }
-#endif
-    return 0;
-}
-
-/**end repeat1**/
-
-/**end repeat**/
-
-/*
- *****************************************************************************
- **                           BOOL DISPATCHERS
- *****************************************************************************
- */
-
-/**begin repeat
- * # kind = logical_or, logical_and#
- */
-
-#if defined NPY_HAVE_SSE2_INTRINSICS
-static void
-sse2_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2,
-                        npy_intp n);
-
-static void
-sse2_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, npy_intp n);
-#endif
-
-static NPY_INLINE int
-run_binary_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-#if defined NPY_HAVE_SSE2_INTRINSICS
-    if (sizeof(npy_bool) == 1 &&
-            IS_BLOCKABLE_BINARY(sizeof(npy_bool), VECTOR_SIZE_BYTES)) {
-        sse2_binary_@kind@_BOOL((npy_bool*)args[2], (npy_bool*)args[0],
-                               (npy_bool*)args[1], dimensions[0]);
-        return 1;
-    }
-#endif
-    return 0;
-}
-
-
-static NPY_INLINE int
-run_reduce_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-#if defined NPY_HAVE_SSE2_INTRINSICS
-    if (sizeof(npy_bool) == 1 &&
-            IS_BLOCKABLE_REDUCE(sizeof(npy_bool), VECTOR_SIZE_BYTES)) {
-        sse2_reduce_@kind@_BOOL((npy_bool*)args[0], (npy_bool*)args[1],
-                                dimensions[0]);
-        return 1;
-    }
-#endif
-    return 0;
-}
-
-/**end repeat**/
-
-/**begin repeat
- * # kind = absolute, logical_not#
- */
-
-#if defined NPY_HAVE_SSE2_INTRINSICS
-static void
-sse2_@kind@_BOOL(npy_bool *, npy_bool *, const npy_intp n);
-#endif
-
-static NPY_INLINE int
-run_unary_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-#if defined NPY_HAVE_SSE2_INTRINSICS
-    if (sizeof(npy_bool) == 1 &&
-            IS_BLOCKABLE_UNARY(sizeof(npy_bool), VECTOR_SIZE_BYTES)) {
-        sse2_@kind@_BOOL((npy_bool*)args[1], (npy_bool*)args[0], dimensions[0]);
-        return 1;
-    }
-#endif
-    return 0;
-}
-
-/**end repeat**/
-
-#ifdef NPY_HAVE_SSE2_INTRINSICS
-
-/*
- * Vectorized operations
- */
-/*
- *****************************************************************************
- **                           FLOAT LOOPS
- *****************************************************************************
- */
-
-/**begin repeat
-* horizontal reductions on a vector
-* # VOP = min, max#
-*/
-
-NPY_FINLINE npy_float sse2_horizontal_@VOP@___m128(__m128 v)
-{
-    npy_float r;
-    __m128 tmp = _mm_movehl_ps(v, v);                   /* c     d     ... */
-    __m128 m = _mm_@VOP@_ps(v, tmp);                    /* m(ac) m(bd) ... */
-    tmp = _mm_shuffle_ps(m, m, _MM_SHUFFLE(1, 1, 1, 1));/* m(bd) m(bd) ... */
-    _mm_store_ss(&r, _mm_@VOP@_ps(tmp, m));             /* m(acbd) ... */
-    return r;
-}
-
-NPY_FINLINE npy_double sse2_horizontal_@VOP@___m128d(__m128d v)
-{
-    npy_double r;
-    __m128d tmp = _mm_unpackhi_pd(v, v);    /* b     b */
-    _mm_store_sd(&r, _mm_@VOP@_pd(tmp, v)); /* m(ab) m(bb) */
-    return r;
-}
-/**end repeat**/
-
-/**begin repeat
- *  #type = npy_float, npy_double#
- *  #TYPE = FLOAT, DOUBLE#
- *  #scalarf = npy_sqrtf, npy_sqrt#
- *  #c = f, #
- *  #vtype = __m128, __m128d#
- *  #vtype256 = __m256, __m256d#
- *  #vtype512 = __m512, __m512d#
- *  #vpre = _mm, _mm#
- *  #vpre256 = _mm256, _mm256#
- *  #vpre512 = _mm512, _mm512#
- *  #vsuf = ps, pd#
- *  #vsufs = ss, sd#
- *  #nan = NPY_NANF, NPY_NAN#
- *  #double = 0, 1#
- *  #cast = _mm_castps_si128, _mm_castpd_si128#
- */
-/*
- * compress 4 vectors to 4/8 bytes in op with filled with 0 or 1
- * the last vector is passed as a pointer as MSVC 2010 is unable to ignore the
- * calling convention leading to C2719 on 32 bit, see #4795
- */
-NPY_FINLINE void
-sse2_compress4_to_byte_@TYPE@(@vtype@ r1, @vtype@ r2, @vtype@ r3, @vtype@ * r4,
-                              npy_bool * op)
-{
-    const __m128i mask = @vpre@_set1_epi8(0x1);
-    __m128i ir1 = @vpre@_packs_epi32(@cast@(r1), @cast@(r2));
-    __m128i ir2 = @vpre@_packs_epi32(@cast@(r3), @cast@(*r4));
-    __m128i rr = @vpre@_packs_epi16(ir1, ir2);
-#if @double@
-    rr = @vpre@_packs_epi16(rr, rr);
-    rr = @vpre@_and_si128(rr, mask);
-    @vpre@_storel_epi64((__m128i*)op, rr);
-#else
-    rr = @vpre@_and_si128(rr, mask);
-    @vpre@_storeu_si128((__m128i*)op, rr);
-#endif
-}
-
-static void
-sse2_signbit_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n)
-{
-    LOOP_BLOCK_ALIGN_VAR(ip1, @type@, VECTOR_SIZE_BYTES) {
-        op[i] = npy_signbit(ip1[i]) != 0;
-    }
-    LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
-        @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
-        int r = @vpre@_movemask_@vsuf@(a);
-        if (sizeof(@type@) == 8) {
-            op[i] = r & 1;
-            op[i + 1] = (r >> 1);
-        }
-        else {
-            op[i] = r & 1;
-            op[i + 1] = (r >> 1) & 1;
-            op[i + 2] = (r >> 2) & 1;
-            op[i + 3] = (r >> 3);
-        }
-    }
-    LOOP_BLOCKED_END {
-        op[i] = npy_signbit(ip1[i]) != 0;
-    }
-}
-
-/**begin repeat1
- * #kind = isnan, isfinite, isinf#
- * #var = 0, 1, 2#
- */
-
-static void
-sse2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n)
-{
-#if @var@ != 0 /* isinf/isfinite */
-    /* signbit mask 0x7FFFFFFF after andnot */
-    const @vtype@ mask = @vpre@_set1_@vsuf@(-0.@c@);
-    const @vtype@ ones = @vpre@_cmpeq_@vsuf@(@vpre@_setzero_@vsuf@(),
-                                             @vpre@_setzero_@vsuf@());
-#if @double@
-    const @vtype@ fltmax = @vpre@_set1_@vsuf@(DBL_MAX);
-#else
-    const @vtype@ fltmax = @vpre@_set1_@vsuf@(FLT_MAX);
-#endif
-#endif
-    LOOP_BLOCK_ALIGN_VAR(ip1, @type@, VECTOR_SIZE_BYTES) {
-        op[i] = npy_@kind@(ip1[i]) != 0;
-    }
-    LOOP_BLOCKED(@type@, 4 * VECTOR_SIZE_BYTES) {
-        @vtype@ a = @vpre@_load_@vsuf@(&ip1[i + 0 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
-        @vtype@ b = @vpre@_load_@vsuf@(&ip1[i + 1 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
-        @vtype@ c = @vpre@_load_@vsuf@(&ip1[i + 2 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
-        @vtype@ d = @vpre@_load_@vsuf@(&ip1[i + 3 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
-        @vtype@ r1, r2, r3, r4;
-#if @var@ != 0 /* isinf/isfinite */
-        /* fabs via masking of sign bit */
-        r1 = @vpre@_andnot_@vsuf@(mask, a);
-        r2 = @vpre@_andnot_@vsuf@(mask, b);
-        r3 = @vpre@_andnot_@vsuf@(mask, c);
-        r4 = @vpre@_andnot_@vsuf@(mask, d);
-#if @var@ == 1 /* isfinite */
-        /* negative compare against max float, nan is always true */
-        r1 = @vpre@_cmpnle_@vsuf@(r1, fltmax);
-        r2 = @vpre@_cmpnle_@vsuf@(r2, fltmax);
-        r3 = @vpre@_cmpnle_@vsuf@(r3, fltmax);
-        r4 = @vpre@_cmpnle_@vsuf@(r4, fltmax);
-#else /* isinf */
-        r1 = @vpre@_cmpnlt_@vsuf@(fltmax, r1);
-        r2 = @vpre@_cmpnlt_@vsuf@(fltmax, r2);
-        r3 = @vpre@_cmpnlt_@vsuf@(fltmax, r3);
-        r4 = @vpre@_cmpnlt_@vsuf@(fltmax, r4);
-#endif
-        /* flip results to what we want (andnot as there is no sse not) */
-        r1 = @vpre@_andnot_@vsuf@(r1, ones);
-        r2 = @vpre@_andnot_@vsuf@(r2, ones);
-        r3 = @vpre@_andnot_@vsuf@(r3, ones);
-        r4 = @vpre@_andnot_@vsuf@(r4, ones);
-#endif
-#if @var@ == 0 /* isnan */
-        r1 = @vpre@_cmpneq_@vsuf@(a, a);
-        r2 = @vpre@_cmpneq_@vsuf@(b, b);
-        r3 = @vpre@_cmpneq_@vsuf@(c, c);
-        r4 = @vpre@_cmpneq_@vsuf@(d, d);
-#endif
-        sse2_compress4_to_byte_@TYPE@(r1, r2, r3, &r4, &op[i]);
-    }
-    LOOP_BLOCKED_END {
-        op[i] = npy_@kind@(ip1[i]) != 0;
-    }
-}
-
-/**end repeat1**/
-
-static void
-sse2_negative_@TYPE@(@type@ * op, @type@ * ip, const npy_intp n)
-{
-    /*
-     * get 0x7FFFFFFF mask (everything but signbit set)
-     * float & ~mask will remove the sign, float ^ mask flips the sign
-     * this is equivalent to how the compiler implements fabs on amd64
-     */
-    const @vtype@ mask = @vpre@_set1_@vsuf@(-0.@c@);
-
-    /* align output to VECTOR_SIZE_BYTES bytes */
-    LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES) {
-        op[i] = -ip[i];
-    }
-    assert((npy_uintp)n < (VECTOR_SIZE_BYTES / sizeof(@type@)) ||
-           npy_is_aligned(&op[i], VECTOR_SIZE_BYTES));
-    if (npy_is_aligned(&ip[i], VECTOR_SIZE_BYTES)) {
-        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
-            @vtype@ a = @vpre@_load_@vsuf@(&ip[i]);
-            @vpre@_store_@vsuf@(&op[i], @vpre@_xor_@vsuf@(mask, a));
-        }
-    }
-    else {
-        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
-            @vtype@ a = @vpre@_loadu_@vsuf@(&ip[i]);
-            @vpre@_store_@vsuf@(&op[i], @vpre@_xor_@vsuf@(mask, a));
-        }
-    }
-    LOOP_BLOCKED_END {
-        op[i] = -ip[i];
-    }
-}
-/**end repeat1**/
-
-/**end repeat**/
-
-/* bunch of helper functions used in ISA_exp/log_FLOAT*/
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_get_full_load_mask_ps(void)
-{
-    return _mm256_set1_ps(-1.0);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
-fma_get_full_load_mask_pd(void)
-{
-    return _mm256_castpd_si256(_mm256_set1_pd(-1.0));
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_get_partial_load_mask_ps(const npy_int num_elem, const npy_int num_lanes)
-{
-    float maskint[16] = {-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,
-                            1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
-    float* addr = maskint + num_lanes - num_elem;
-    return _mm256_loadu_ps(addr);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
-fma_get_partial_load_mask_pd(const npy_int num_elem, const npy_int num_lanes)
-{
-    npy_int maskint[16] = {-1,-1,-1,-1,-1,-1,-1,-1,1,1,1,1,1,1,1,1};
-    npy_int* addr = maskint + 2*num_lanes - 2*num_elem;
-    return _mm256_loadu_si256((__m256i*) addr);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_masked_gather_ps(__m256 src,
-                     npy_float* addr,
-                     __m256i vindex,
-                     __m256 mask)
-{
-    return _mm256_mask_i32gather_ps(src, addr, vindex, mask, 4);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
-fma_masked_gather_pd(__m256d src,
-                     npy_double* addr,
-                     __m128i vindex,
-                     __m256d mask)
-{
-    return _mm256_mask_i32gather_pd(src, addr, vindex, mask, 8);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_masked_load_ps(__m256 mask, npy_float* addr)
-{
-    return _mm256_maskload_ps(addr, _mm256_cvtps_epi32(mask));
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
-fma_masked_load_pd(__m256i mask, npy_double* addr)
-{
-    return _mm256_maskload_pd(addr, mask);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_set_masked_lanes_ps(__m256 x, __m256 val, __m256 mask)
-{
-    return _mm256_blendv_ps(x, val, mask);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
-fma_set_masked_lanes_pd(__m256d x, __m256d val, __m256d mask)
-{
-    return _mm256_blendv_pd(x, val, mask);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_blend(__m256 x, __m256 y, __m256 ymask)
-{
-    return _mm256_blendv_ps(x, y, ymask);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_invert_mask_ps(__m256 ymask)
-{
-    return _mm256_andnot_ps(ymask, _mm256_set1_ps(-1.0));
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
-fma_invert_mask_pd(__m256i ymask)
-{
-    return _mm256_andnot_si256(ymask, _mm256_set1_epi32(0xFFFFFFFF));
-}
-
-/**begin repeat
- *  #vsub = ps, pd#
- *  #vtype = __m256, __m256d#
- */
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
-fma_abs_@vsub@(@vtype@ x)
-{
-    return _mm256_andnot_@vsub@(_mm256_set1_@vsub@(-0.0), x);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
-fma_reciprocal_@vsub@(@vtype@ x)
-{
-    return _mm256_div_@vsub@(_mm256_set1_@vsub@(1.0f), x);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
-fma_rint_@vsub@(@vtype@ x)
-{
-    return _mm256_round_@vsub@(x, _MM_FROUND_TO_NEAREST_INT);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
-fma_floor_@vsub@(@vtype@ x)
-{
-    return _mm256_round_@vsub@(x, _MM_FROUND_TO_NEG_INF);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
-fma_trunc_@vsub@(@vtype@ x)
-{
-    return _mm256_round_@vsub@(x, _MM_FROUND_TO_ZERO);
-}
-/**end repeat**/
-#endif
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
-avx512_get_full_load_mask_ps(void)
-{
-    return 0xFFFF;
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
-avx512_get_full_load_mask_pd(void)
-{
-    return 0xFF;
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
-avx512_get_partial_load_mask_ps(const npy_int num_elem, const npy_int total_elem)
-{
-    return (0x0001 << num_elem) - 0x0001;
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
-avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem)
-{
-    return (0x01 << num_elem) - 0x01;
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
-avx512_masked_gather_ps(__m512 src,
-                        npy_float* addr,
-                        __m512i vindex,
-                        __mmask16 kmask)
-{
-    return _mm512_mask_i32gather_ps(src, kmask, vindex, addr, 4);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
-avx512_masked_gather_pd(__m512d src,
-                        npy_double* addr,
-                        __m256i vindex,
-                        __mmask8 kmask)
-{
-    return _mm512_mask_i32gather_pd(src, kmask, vindex, addr, 8);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
-avx512_masked_load_ps(__mmask16 mask, npy_float* addr)
-{
-    return _mm512_maskz_loadu_ps(mask, (__m512 *)addr);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
-avx512_masked_load_pd(__mmask8 mask, npy_double* addr)
-{
-    return _mm512_maskz_loadu_pd(mask, (__m512d *)addr);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
-avx512_set_masked_lanes_ps(__m512 x, __m512 val, __mmask16 mask)
-{
-    return _mm512_mask_blend_ps(mask, x, val);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
-avx512_set_masked_lanes_pd(__m512d x, __m512d val, __mmask8 mask)
-{
-    return _mm512_mask_blend_pd(mask, x, val);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
-avx512_blend(__m512 x, __m512 y, __mmask16 ymask)
-{
-    return _mm512_mask_mov_ps(x, ymask, y);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
-avx512_invert_mask_ps(__mmask16 ymask)
-{
-    return _mm512_knot(ymask);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
-avx512_invert_mask_pd(__mmask8 ymask)
-{
-    return _mm512_knot(ymask);
-}
-
-/**begin repeat
- *  #vsub  = ps, pd#
- *  #type= npy_float, npy_double#
- *  #epi_vsub  = epi32, epi64#
- *  #vtype = __m512, __m512d#
- *  #mask = __mmask16, __mmask8#
- *  #and_const = 0x7fffffff, 0x7fffffffffffffffLL#
- *  #neg_mask = 0x80000000, 0x8000000000000000#
- *  #perm_ = 0xb1, 0x55#
- *  #cmpx_img_mask = 0xAAAA, 0xAA#
- *  #cmpx_re_mask = 0x5555, 0x55#
- *  #INF = NPY_INFINITYF, NPY_INFINITY#
- *  #NAN = NPY_NANF, NPY_NAN#
- */
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_abs_@vsub@(@vtype@ x)
-{
-    return (@vtype@) _mm512_and_@epi_vsub@((__m512i) x,
-				    _mm512_set1_@epi_vsub@ (@and_const@));
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_reciprocal_@vsub@(@vtype@ x)
-{
-    return _mm512_div_@vsub@(_mm512_set1_@vsub@(1.0f), x);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_rint_@vsub@(@vtype@ x)
-{
-    return _mm512_roundscale_@vsub@(x, 0x08);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_floor_@vsub@(@vtype@ x)
-{
-    return _mm512_roundscale_@vsub@(x, 0x09);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_trunc_@vsub@(@vtype@ x)
-{
-    return _mm512_roundscale_@vsub@(x, 0x0B);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_hadd_@vsub@(const @vtype@ x)
-{
-    return _mm512_add_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@));
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_hsub_@vsub@(const @vtype@ x)
-{
-    return _mm512_sub_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@));
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_cabsolute_@vsub@(const @vtype@ x1,
-                        const @vtype@ x2,
-                        const __m512i re_indices,
-                        const __m512i im_indices)
-{
-    @vtype@ inf = _mm512_set1_@vsub@(@INF@);
-    @vtype@ nan = _mm512_set1_@vsub@(@NAN@);
-    @vtype@ x1_abs = avx512_abs_@vsub@(x1);
-    @vtype@ x2_abs = avx512_abs_@vsub@(x2);
-    @vtype@ re = _mm512_permutex2var_@vsub@(x1_abs, re_indices, x2_abs);
-    @vtype@ im = _mm512_permutex2var_@vsub@(x1_abs, im_indices , x2_abs);
-    /*
-     * If real or imag = INF, then convert it to inf + j*inf
-     * Handles: inf + j*nan, nan + j*inf
-     */
-    @mask@ re_infmask = _mm512_cmp_@vsub@_mask(re, inf, _CMP_EQ_OQ);
-    @mask@ im_infmask = _mm512_cmp_@vsub@_mask(im, inf, _CMP_EQ_OQ);
-    im = _mm512_mask_mov_@vsub@(im, re_infmask, inf);
-    re = _mm512_mask_mov_@vsub@(re, im_infmask, inf);
-
-    /*
-     * If real or imag = NAN, then convert it to nan + j*nan
-     * Handles: x + j*nan, nan + j*x
-     */
-    @mask@ re_nanmask = _mm512_cmp_@vsub@_mask(re, re, _CMP_NEQ_UQ);
-    @mask@ im_nanmask = _mm512_cmp_@vsub@_mask(im, im, _CMP_NEQ_UQ);
-    im = _mm512_mask_mov_@vsub@(im, re_nanmask, nan);
-    re = _mm512_mask_mov_@vsub@(re, im_nanmask, nan);
-
-    @vtype@ larger  = _mm512_max_@vsub@(re, im);
-    @vtype@ smaller = _mm512_min_@vsub@(im, re);
-
-    /*
-     * Calculate div_mask to prevent 0./0. and inf/inf operations in div
-     */
-    @mask@ zeromask = _mm512_cmp_@vsub@_mask(larger, _mm512_setzero_@vsub@(), _CMP_EQ_OQ);
-    @mask@ infmask = _mm512_cmp_@vsub@_mask(smaller, inf, _CMP_EQ_OQ);
-    @mask@ div_mask = _mm512_knot(_mm512_kor(zeromask, infmask));
-    @vtype@ ratio = _mm512_maskz_div_@vsub@(div_mask, smaller, larger);
-    @vtype@ hypot = _mm512_sqrt_@vsub@(_mm512_fmadd_@vsub@(
-                                        ratio, ratio, _mm512_set1_@vsub@(1.0f)));
-    return _mm512_mul_@vsub@(hypot, larger);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_conjugate_@vsub@(const @vtype@ x)
-{
-    /*
-     * __mm512_mask_xor_ps/pd requires AVX512DQ. We cast it to __m512i and
-     * use the xor_epi32/64 uinstruction instead. Cast is a zero latency instruction
-     */
-    __m512i cast_x = _mm512_cast@vsub@_si512(x);
-    __m512i res = _mm512_mask_xor_@epi_vsub@(cast_x, @cmpx_img_mask@,
-                                        cast_x, _mm512_set1_@epi_vsub@(@neg_mask@));
-    return _mm512_castsi512_@vsub@(res);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_cmul_@vsub@(@vtype@ x1, @vtype@ x2)
-{
-    // x1 = r1, i1
-    // x2 = r2, i2
-    @vtype@ x3  = _mm512_permute_@vsub@(x2, @perm_@);   // i2, r2
-    @vtype@ x12 = _mm512_mul_@vsub@(x1, x2);            // r1*r2, i1*i2
-    @vtype@ x13 = _mm512_mul_@vsub@(x1, x3);            // r1*i2, r2*i1
-    @vtype@ outreal = avx512_hsub_@vsub@(x12);          // r1*r2 - i1*i2, r1*r2 - i1*i2
-    @vtype@ outimg  = avx512_hadd_@vsub@(x13);          // r1*i2 + i1*r2, r1*i2 + i1*r2
-    return _mm512_mask_blend_@vsub@(@cmpx_img_mask@, outreal, outimg);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_csquare_@vsub@(@vtype@ x)
-{
-    return avx512_cmul_@vsub@(x, x);
-}
-
-/**end repeat**/
-#endif
-
-/**begin repeat
- * #ISA = FMA, AVX512F#
- * #isa = fma, avx512#
- * #vtype = __m256, __m512#
- * #vsize = 256, 512#
- * #or = or_ps, kor#
- * #vsub = , _mask#
- * #mask = __m256, __mmask16#
- * #fmadd = _mm256_fmadd_ps, _mm512_fmadd_ps#
- * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
- **/
-
-#if defined @CHK@
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
-@isa@_sqrt_ps(@vtype@ x)
-{
-    return _mm@vsize@_sqrt_ps(x);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
-@isa@_sqrt_pd(@vtype@d x)
-{
-    return _mm@vsize@_sqrt_pd(x);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
-@isa@_square_ps(@vtype@ x)
-{
-    return _mm@vsize@_mul_ps(x,x);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
-@isa@_square_pd(@vtype@d x)
-{
-    return _mm@vsize@_mul_pd(x,x);
-}
-
-#endif
-/**end repeat**/
-
-/**begin repeat
- * #type = npy_float, npy_double#
- * #TYPE = FLOAT, DOUBLE#
- * #num_lanes = 16, 8#
- * #vsuffix = ps, pd#
- * #mask = __mmask16, __mmask8#
- * #vtype = __m512, __m512d#
- * #scale = 4, 8#
- * #vindextype = __m512i, __m256i#
- * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256#
- * #episize = epi32, epi64#
- */
-
-/**begin repeat1
- * #func = isnan, isfinite, isinf, signbit#
- * #IMM8 = 0x81, 0x99, 0x18, 0x04#
- * #is_finite = 0, 1, 0, 0#
- * #is_signbit = 0, 0, 0, 1#
- */
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
-static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void
-AVX512_SKX_@func@_@TYPE@(npy_bool* op, @type@* ip, const npy_intp array_size, const npy_intp steps)
-{
-    const npy_intp stride_ip = steps/(npy_intp)sizeof(@type@);
-    npy_intp num_remaining_elements = array_size;
-
-    @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
-#if @is_signbit@
-    @vtype@ signbit = _mm512_set1_@vsuffix@(-0.0);
-#endif
-
-    /*
-     * Note: while generally indices are npy_intp, we ensure that our maximum
-     * index will fit in an int32 as a precondition for this function via
-     * IS_OUTPUT_BLOCKABLE_UNARY
-     */
-
-    npy_int32 index_ip[@num_lanes@];
-    for (npy_int32 ii = 0; ii < @num_lanes@; ii++) {
-        index_ip[ii] = ii*stride_ip;
-    }
-    @vindextype@ vindex_ip = @vindexload@((@vindextype@*)&index_ip[0]);
-    @vtype@ zeros_f = _mm512_setzero_@vsuffix@();
-    __m512i ones = _mm512_set1_@episize@(1);
-
-    while (num_remaining_elements > 0) {
-        if (num_remaining_elements < @num_lanes@) {
-            load_mask = avx512_get_partial_load_mask_@vsuffix@(
-                                    num_remaining_elements, @num_lanes@);
-        }
-        @vtype@ x1;
-        if (stride_ip == 1) {
-            x1 = avx512_masked_load_@vsuffix@(load_mask, ip);
-        }
-        else {
-            x1 = avx512_masked_gather_@vsuffix@(zeros_f, ip, vindex_ip, load_mask);
-        }
-#if @is_signbit@
-        x1 = _mm512_and_@vsuffix@(x1,signbit);
-#endif
-
-        @mask@ fpclassmask = _mm512_fpclass_@vsuffix@_mask(x1, @IMM8@);
-#if @is_finite@
-        fpclassmask = _mm512_knot(fpclassmask);
-#endif
-
-        __m128i out =_mm512_maskz_cvts@episize@_epi8(fpclassmask, ones);
-        _mm_mask_storeu_epi8(op, load_mask, out);
-
-        ip += @num_lanes@*stride_ip;
-        op += @num_lanes@;
-        num_remaining_elements -= @num_lanes@;
-    }
-}
-#endif
-/**end repeat1**/
-/**end repeat**/
-
-/**begin repeat
- * #TYPE = CFLOAT, CDOUBLE#
- * #type = npy_float, npy_double#
- * #num_lanes = 16, 8#
- * #vsuffix = ps, pd#
- * #epi_vsub  = epi32, epi64#
- * #mask = __mmask16, __mmask8#
- * #vtype = __m512, __m512d#
- * #scale = 4, 8#
- * #vindextype = __m512i, __m256i#
- * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256#
- * #storemask = 0xFF, 0xF#
- * #IS_FLOAT = 1, 0#
- */
-
-/**begin repeat1
- *  #func = square, conjugate#
- *  #vectorf = avx512_csquare, avx512_conjugate#
- */
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
-static NPY_GCC_OPT_3 NPY_INLINE NPY_GCC_TARGET_AVX512F void
-AVX512F_@func@_@TYPE@(@type@ * op,
-                      @type@ * ip,
-                      const npy_intp array_size,
-                      const npy_intp steps)
-{
-    npy_intp num_remaining_elements = 2*array_size;
-    const npy_intp stride_ip1 = steps/(npy_intp)sizeof(@type@)/2;
-
-     /*
-      * Note: while generally indices are npy_intp, we ensure that our maximum index
-      * will fit in an int32 as a precondition for this function via max_stride
-      */
-    npy_int32 index_ip1[16];
-    for (npy_int32 ii = 0; ii < @num_lanes@; ii=ii+2) {
-        index_ip1[ii] = ii*stride_ip1;
-        index_ip1[ii+1] = ii*stride_ip1 + 1;
-    }
-    @vindextype@ vindex = @vindexload@((@vindextype@*)index_ip1);
-    @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
-    @vtype@ zeros = _mm512_setzero_@vsuffix@();
-
-    while (num_remaining_elements > 0) {
-        if (num_remaining_elements < @num_lanes@) {
-            load_mask = avx512_get_partial_load_mask_@vsuffix@(
-                                    num_remaining_elements, @num_lanes@);
-        }
-        @vtype@ x1;
-        if (stride_ip1 == 1) {
-            x1 = avx512_masked_load_@vsuffix@(load_mask, ip);
-        }
-        else {
-            x1  = avx512_masked_gather_@vsuffix@(zeros, ip, vindex, load_mask);
-        }
-
-        @vtype@ out = @vectorf@_@vsuffix@(x1);
-
-        _mm512_mask_storeu_@vsuffix@(op, load_mask, out);
-        op += @num_lanes@;
-        ip += @num_lanes@*stride_ip1;
-        num_remaining_elements -= @num_lanes@;
-    }
-}
-#endif
-/**end repeat1**/
-
-#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
-static NPY_GCC_OPT_3 NPY_INLINE NPY_GCC_TARGET_AVX512F void
-AVX512F_absolute_@TYPE@(@type@ * op,
-                        @type@ * ip,
-                        const npy_intp array_size,
-                        const npy_intp steps)
-{
-    npy_intp num_remaining_elements = 2*array_size;
-    const npy_intp stride_ip1 = steps/(npy_intp)sizeof(@type@)/2;
-
-    /*
-     * Note: while generally indices are npy_intp, we ensure that our maximum index
-     * will fit in an int32 as a precondition for this function via max_stride
-     */
-    npy_int32 index_ip[32];
-    for (npy_int32 ii = 0; ii < 2*@num_lanes@; ii=ii+2) {
-        index_ip[ii] = ii*stride_ip1;
-        index_ip[ii+1] = ii*stride_ip1 + 1;
-    }
-    @vindextype@ vindex1 = @vindexload@((@vindextype@*)index_ip);
-    @vindextype@ vindex2 = @vindexload@((@vindextype@*)(index_ip+@num_lanes@));
-
-    @mask@ load_mask1 = avx512_get_full_load_mask_@vsuffix@();
-    @mask@ load_mask2 = avx512_get_full_load_mask_@vsuffix@();
-    @mask@ store_mask = avx512_get_full_load_mask_@vsuffix@();
-    @vtype@ zeros = _mm512_setzero_@vsuffix@();
-
-#if @IS_FLOAT@
-    __m512i re_index = _mm512_set_epi32(30,28,26,24,22,20,18,16,14,12,10,8,6,4,2,0);
-    __m512i im_index  = _mm512_set_epi32(31,29,27,25,23,21,19,17,15,13,11,9,7,5,3,1);
-#else
-    __m512i re_index = _mm512_set_epi64(14,12,10,8,6,4,2,0);
-    __m512i im_index  = _mm512_set_epi64(15,13,11,9,7,5,3,1);
-#endif
-
-    while (num_remaining_elements > 0) {
-        if (num_remaining_elements < @num_lanes@) {
-            load_mask1 = avx512_get_partial_load_mask_@vsuffix@(
-                                    num_remaining_elements, @num_lanes@);
-            load_mask2 = 0x0000;
-            store_mask = avx512_get_partial_load_mask_@vsuffix@(
-                                    num_remaining_elements/2, @num_lanes@);
-        } else if (num_remaining_elements < 2*@num_lanes@) {
-            load_mask1 = avx512_get_full_load_mask_@vsuffix@();
-            load_mask2 = avx512_get_partial_load_mask_@vsuffix@(
-                                    num_remaining_elements - @num_lanes@, @num_lanes@);
-            store_mask = avx512_get_partial_load_mask_@vsuffix@(
-                                    num_remaining_elements/2, @num_lanes@);
-        }
-        @vtype@ x1, x2;
-        if (stride_ip1 == 1) {
-            x1 = avx512_masked_load_@vsuffix@(load_mask1, ip);
-            x2 = avx512_masked_load_@vsuffix@(load_mask2, ip+@num_lanes@);
-        }
-        else {
-            x1  = avx512_masked_gather_@vsuffix@(zeros, ip, vindex1, load_mask1);
-            x2  = avx512_masked_gather_@vsuffix@(zeros, ip, vindex2, load_mask2);
-        }
-
-        @vtype@ out = avx512_cabsolute_@vsuffix@(x1, x2, re_index, im_index);
-
-        _mm512_mask_storeu_@vsuffix@(op, store_mask, out);
-        op += @num_lanes@;
-        ip += 2*@num_lanes@*stride_ip1;
-        num_remaining_elements -= 2*@num_lanes@;
-    }
-    npy_clear_floatstatus_barrier((char*)&num_remaining_elements);
-}
-
-#endif
-/**end repeat**/
-
-/*
- *****************************************************************************
- **                           BOOL LOOPS
- *****************************************************************************
- */
-
-/**begin repeat
- * # kind = logical_or, logical_and#
- * # and = 0, 1#
- * # op = ||, &&#
- * # sc = !=, ==#
- * # vpre = _mm*2#
- * # vsuf = si128*2#
- * # vtype = __m128i*2#
- * # type = npy_bool*2#
- * # vload = _mm_load_si128*2#
- * # vloadu = _mm_loadu_si128*2#
- * # vstore = _mm_store_si128*2#
- */
-
-/*
- * convert any bit set to boolean true so vectorized and normal operations are
- * consistent, should not be required if bool is used correctly everywhere but
- * you never know
- */
-#if !@and@
-NPY_FINLINE @vtype@ byte_to_true(@vtype@ v)
-{
-    const @vtype@ zero = @vpre@_setzero_@vsuf@();
-    const @vtype@ truemask = @vpre@_set1_epi8(1 == 1);
-    /* get 0xFF for zeros */
-    @vtype@ tmp = @vpre@_cmpeq_epi8(v, zero);
-    /* filled with 0xFF/0x00, negate and mask to boolean true */
-    return @vpre@_andnot_@vsuf@(tmp, truemask);
-}
-#endif
-
-static void
-sse2_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2, npy_intp n)
-{
-    LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
-        op[i] = ip1[i] @op@ ip2[i];
-    LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
-        @vtype@ a = @vloadu@((@vtype@*)&ip1[i]);
-        @vtype@ b = @vloadu@((@vtype@*)&ip2[i]);
-#if @and@
-        const @vtype@ zero = @vpre@_setzero_@vsuf@();
-        /* get 0xFF for non zeros*/
-        @vtype@ tmp = @vpre@_cmpeq_epi8(a, zero);
-        /* andnot -> 0x00 for zeros xFF for non zeros, & with ip2 */
-        tmp = @vpre@_andnot_@vsuf@(tmp, b);
-#else
-        @vtype@ tmp = @vpre@_or_@vsuf@(a, b);
-#endif
-
-        @vstore@((@vtype@*)&op[i], byte_to_true(tmp));
-    }
-    LOOP_BLOCKED_END {
-        op[i] = (ip1[i] @op@ ip2[i]);
-    }
-}
-
-
-static void
-sse2_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, const npy_intp n)
-{
-    const @vtype@ zero = @vpre@_setzero_@vsuf@();
-    LOOP_BLOCK_ALIGN_VAR(ip, npy_bool, VECTOR_SIZE_BYTES) {
-        *op = *op @op@ ip[i];
-        if (*op @sc@ 0) {
-            return;
-        }
-    }
-    /* unrolled once to replace a slow movmsk with a fast pmaxb */
-    LOOP_BLOCKED(npy_bool, 2 * VECTOR_SIZE_BYTES) {
-        @vtype@ v = @vload@((@vtype@*)&ip[i]);
-        @vtype@ v2 = @vload@((@vtype@*)&ip[i + VECTOR_SIZE_BYTES]);
-        v = @vpre@_cmpeq_epi8(v, zero);
-        v2 = @vpre@_cmpeq_epi8(v2, zero);
-#if @and@
-        if ((@vpre@_movemask_epi8(@vpre@_max_epu8(v, v2)) != 0)) {
-            *op = 0;
-#else
-        if ((@vpre@_movemask_epi8(@vpre@_min_epu8(v, v2)) != 0xFFFF)) {
-            *op = 1;
-#endif
-            return;
-        }
-    }
-    LOOP_BLOCKED_END {
-        *op = *op @op@ ip[i];
-        if (*op @sc@ 0) {
-            return;
-        }
-    }
-}
-
-/**end repeat**/
-
-/**begin repeat
- * # kind = absolute, logical_not#
- * # op = !=, ==#
- * # not = 0, 1#
- * # vpre = _mm*2#
- * # vsuf = si128*2#
- * # vtype = __m128i*2#
- * # type = npy_bool*2#
- * # vloadu = _mm_loadu_si128*2#
- * # vstore = _mm_store_si128*2#
- */
-
-static void
-sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n)
-{
-    LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
-        op[i] = (ip[i] @op@ 0);
-    LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
-        @vtype@ a = @vloadu@((@vtype@*)&ip[i]);
-#if @not@
-        const @vtype@ zero = @vpre@_setzero_@vsuf@();
-        const @vtype@ truemask = @vpre@_set1_epi8(1 == 1);
-        /* equivalent to byte_to_true but can skip the negation */
-        a = @vpre@_cmpeq_epi8(a, zero);
-        a = @vpre@_and_@vsuf@(a, truemask);
-#else
-        /* abs is kind of pointless but maybe its used for byte_to_true */
-        a = byte_to_true(a);
-#endif
-        @vstore@((@vtype@*)&op[i], a);
-    }
-    LOOP_BLOCKED_END {
-        op[i] = (ip[i] @op@ 0);
-    }
-}
-
-/**end repeat**/
-
-#undef VECTOR_SIZE_BYTES
-#endif  /* NPY_HAVE_SSE2_INTRINSICS */
-#endif
-
diff --git a/numpy/core/src/umath/string_ufuncs.cpp b/numpy/core/src/umath/string_ufuncs.cpp
index 5a35c318b..5d82be6db 100644
--- a/numpy/core/src/umath/string_ufuncs.cpp
+++ b/numpy/core/src/umath/string_ufuncs.cpp
@@ -16,7 +16,7 @@
 
 
 template <typename character>
-static NPY_INLINE int
+static inline int
 character_cmp(character a, character b)
 {
     if (a == b) {
@@ -37,7 +37,7 @@ character_cmp(character a, character b)
  * is always padded with zeros).
  */
 template <bool rstrip, typename character>
-static NPY_INLINE int
+static inline int
 string_cmp(int len1, const character *str1, int len2, const character *str2)
 {
     if (rstrip) {
diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c
index 693a6d6c9..39e64decb 100644
--- a/numpy/core/src/umath/ufunc_object.c
+++ b/numpy/core/src/umath/ufunc_object.c
@@ -52,6 +52,7 @@
 
 #include "arrayobject.h"
 #include "common.h"
+#include "ctors.h"
 #include "dtypemeta.h"
 #include "numpyos.h"
 #include "dispatching.h"
@@ -1391,7 +1392,7 @@ try_trivial_single_output_loop(PyArrayMethod_Context *context,
  * or pass in the full cast information.  But this can special case
  * the logical functions and prints a better error message.
  */
-static NPY_INLINE int
+static inline int
 validate_casting(PyArrayMethodObject *method, PyUFuncObject *ufunc,
         PyArrayObject *ops[], PyArray_Descr *descriptors[],
         NPY_CASTING casting)
@@ -1781,6 +1782,8 @@ _check_keepdims_support(PyUFuncObject *ufunc) {
 static int
 _parse_axes_arg(PyUFuncObject *ufunc, int op_core_num_dims[], PyObject *axes,
                 PyArrayObject **op, int broadcast_ndim, int **remap_axis) {
+    static PyObject *AxisError_cls = NULL;
+
     int nin = ufunc->nin;
     int nop = ufunc->nargs;
     int iop, list_size;
@@ -1825,16 +1828,17 @@ _parse_axes_arg(PyUFuncObject *ufunc, int op_core_num_dims[], PyObject *axes,
         op_axes_tuple = PyList_GET_ITEM(axes, iop);
         if (PyTuple_Check(op_axes_tuple)) {
             if (PyTuple_Size(op_axes_tuple) != op_ncore) {
-                if (op_ncore == 1) {
-                    PyErr_Format(PyExc_ValueError,
-                                 "axes item %d should be a tuple with a "
-                                 "single element, or an integer", iop);
-                }
-                else {
-                    PyErr_Format(PyExc_ValueError,
-                                 "axes item %d should be a tuple with %d "
-                                 "elements", iop, op_ncore);
+                /* must have been a tuple with too many entries. */
+                npy_cache_import(
+                        "numpy.exceptions", "AxisError", &AxisError_cls);
+                if (AxisError_cls == NULL) {
+                    return -1;
                 }
+                PyErr_Format(AxisError_cls,
+                        "%s: operand %d has %d core dimensions, "
+                        "but %zd dimensions are specified by axes tuple.",
+                        ufunc_get_name_cstr(ufunc), iop, op_ncore,
+                        PyTuple_Size(op_axes_tuple));
                 return -1;
             }
             Py_INCREF(op_axes_tuple);
@@ -1846,8 +1850,22 @@ _parse_axes_arg(PyUFuncObject *ufunc, int op_core_num_dims[], PyObject *axes,
             }
         }
         else {
-            PyErr_Format(PyExc_TypeError, "axes item %d should be a tuple",
-                         iop);
+            /* If input is not an integer tell user that a tuple is needed */
+            if (error_converting(PyArray_PyIntAsInt(op_axes_tuple))) {
+                PyErr_Format(PyExc_TypeError,
+                        "%s: axes item %d should be a tuple.",
+                        ufunc_get_name_cstr(ufunc), iop);
+                return -1;
+            }
+            /* If it is a single integer, inform user that more are needed */
+            npy_cache_import("numpy.exceptions", "AxisError", &AxisError_cls);
+            if (AxisError_cls == NULL) {
+                return -1;
+            }
+            PyErr_Format(AxisError_cls,
+                    "%s: operand %d has %d core dimensions, "
+                    "but the axes item is a single integer.",
+                    ufunc_get_name_cstr(ufunc), iop, op_ncore);
             return -1;
         }
         /*
@@ -2067,12 +2085,16 @@ _get_coredim_sizes(PyUFuncObject *ufunc, PyArrayObject **op,
 }
 
 /*
- * Returns a new reference
+ * Returns a new reference to the ufunc identity.  Note that this identity
+ * is only a default identity value stored on the ufunc, since the invidiual
+ * ufunc loop (ArrayMethod) is queried for the actual identity.
+ *
  * TODO: store a reference in the ufunc object itself, rather than
  *       constructing one each time
  */
-static PyObject *
-_get_identity(PyUFuncObject *ufunc, npy_bool *reorderable) {
+NPY_NO_EXPORT PyObject *
+PyUFunc_GetDefaultIdentity(PyUFuncObject *ufunc, npy_bool *reorderable)
+{
     switch(ufunc->identity) {
     case PyUFunc_One:
         *reorderable = 1;
@@ -2739,8 +2761,41 @@ reducelike_promote_and_resolve(PyUFuncObject *ufunc,
         PyArrayObject *arr, PyArrayObject *out,
         PyArray_DTypeMeta *signature[3],
         npy_bool enforce_uniform_args, PyArray_Descr *out_descrs[3],
-        char *method)
+        NPY_CASTING casting, char *method)
 {
+     /*
+      * If no dtype is specified and out is not specified, we override the
+      * integer and bool dtype used for add and multiply.
+      *
+      * TODO: The following should be handled by a promoter!
+      */
+    if (signature[0] == NULL && out == NULL) {
+        /*
+         * For integer types --- make sure at least a long
+         * is used for add and multiply reduction to avoid overflow
+         */
+        int typenum = PyArray_TYPE(arr);
+        if ((PyTypeNum_ISBOOL(typenum) || PyTypeNum_ISINTEGER(typenum))
+                && ((strcmp(ufunc->name, "add") == 0)
+                    || (strcmp(ufunc->name, "multiply") == 0))) {
+            if (PyTypeNum_ISBOOL(typenum)) {
+                typenum = NPY_LONG;
+            }
+            else if ((size_t)PyArray_DESCR(arr)->elsize < sizeof(long)) {
+                if (PyTypeNum_ISUNSIGNED(typenum)) {
+                    typenum = NPY_ULONG;
+                }
+                else {
+                    typenum = NPY_LONG;
+                }
+            }
+            signature[0] = PyArray_DTypeFromTypeNum(typenum);
+        }
+    }
+    assert(signature[2] == NULL);  /* we always fill it here */
+    Py_XINCREF(signature[0]);
+    signature[2] = signature[0];
+
     /*
      * Note that the `ops` is not really correct.  But legacy resolution
      * cannot quite handle the correct ops (e.g. a NULL first item if `out`
@@ -2802,7 +2857,7 @@ reducelike_promote_and_resolve(PyUFuncObject *ufunc,
      * (although this should possibly happen through a deprecation)
      */
     if (resolve_descriptors(3, ufunc, ufuncimpl,
-            ops, out_descrs, signature, NPY_UNSAFE_CASTING) < 0) {
+            ops, out_descrs, signature, casting) < 0) {
         return NULL;
     }
 
@@ -2825,8 +2880,7 @@ reducelike_promote_and_resolve(PyUFuncObject *ufunc,
         goto fail;
     }
     /* TODO: This really should _not_ be unsafe casting (same above)! */
-    if (validate_casting(ufuncimpl,
-            ufunc, ops, out_descrs, NPY_UNSAFE_CASTING) < 0) {
+    if (validate_casting(ufuncimpl, ufunc, ops, out_descrs, casting) < 0) {
         goto fail;
     }
 
@@ -2834,7 +2888,7 @@ reducelike_promote_and_resolve(PyUFuncObject *ufunc,
 
   fail:
     for (int i = 0; i < 3; ++i) {
-        Py_DECREF(out_descrs[i]);
+        Py_CLEAR(out_descrs[i]);
     }
     return NULL;
 }
@@ -2956,10 +3010,8 @@ PyUFunc_Reduce(PyUFuncObject *ufunc,
         PyObject *initial, PyArrayObject *wheremask)
 {
     int iaxes, ndim;
-    npy_bool reorderable;
     npy_bool axis_flags[NPY_MAXDIMS];
-    PyArrayObject *result = NULL;
-    PyObject *identity;
+
     const char *ufunc_name = ufunc_get_name_cstr(ufunc);
     /* These parameters come from a TLS global */
     int buffersize = 0, errormask = 0;
@@ -2984,72 +3036,26 @@ PyUFunc_Reduce(PyUFuncObject *ufunc,
         return NULL;
     }
 
-    /*
-     * Promote and fetch ufuncimpl (currently needed to fix up the identity).
-     */
     PyArray_Descr *descrs[3];
     PyArrayMethodObject *ufuncimpl = reducelike_promote_and_resolve(ufunc,
-            arr, out, signature, NPY_FALSE, descrs, "reduce");
+            arr, out, signature, NPY_FALSE, descrs, NPY_UNSAFE_CASTING, "reduce");
     if (ufuncimpl == NULL) {
         return NULL;
     }
 
-    /* Get the identity */
-    /* TODO: Both of these should be provided by the ArrayMethod! */
-    identity = _get_identity(ufunc, &reorderable);
-    if (identity == NULL) {
-        goto finish;
-    }
-
-    /* Get the initial value */
-    if (initial == NULL) {
-        initial = identity;
-
-        /*
-        * The identity for a dynamic dtype like
-        * object arrays can't be used in general
-        */
-        if (initial != Py_None && PyArray_ISOBJECT(arr) && PyArray_SIZE(arr) != 0) {
-            Py_DECREF(initial);
-            initial = Py_None;
-            Py_INCREF(initial);
-        }
-        else if (PyTypeNum_ISUNSIGNED(descrs[2]->type_num)
-                    && PyLong_CheckExact(initial)) {
-            /*
-             * This is a bit of a hack until we have truly loop specific
-             * identities.  Python -1 cannot be cast to unsigned so convert
-             * it to a NumPy scalar, but we use -1 for bitwise functions to
-             * signal all 1s.
-             * (A builtin identity would not overflow here, although we may
-             * unnecessary convert 0 and 1.)
-             */
-            Py_SETREF(initial, PyObject_CallFunctionObjArgs(
-                        (PyObject *)&PyLongArrType_Type, initial, NULL));
-            if (initial == NULL) {
-                goto finish;
-            }
-        }
-    } else {
-        Py_DECREF(identity);
-        Py_INCREF(initial);  /* match the reference count in the if above */
-    }
-
     PyArrayMethod_Context context = {
         .caller = (PyObject *)ufunc,
         .method = ufuncimpl,
         .descriptors = descrs,
     };
 
-    result = PyUFunc_ReduceWrapper(&context,
-            arr, out, wheremask, axis_flags, reorderable, keepdims,
-            initial, reduce_loop, ufunc, buffersize, ufunc_name, errormask);
+    PyArrayObject *result = PyUFunc_ReduceWrapper(&context,
+            arr, out, wheremask, axis_flags, keepdims,
+            initial, reduce_loop, buffersize, ufunc_name, errormask);
 
-  finish:
     for (int i = 0; i < 3; i++) {
         Py_DECREF(descrs[i]);
     }
-    Py_XDECREF(initial);
     return result;
 }
 
@@ -3094,7 +3100,8 @@ PyUFunc_Accumulate(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out,
 
     PyArray_Descr *descrs[3];
     PyArrayMethodObject *ufuncimpl = reducelike_promote_and_resolve(ufunc,
-            arr, out, signature, NPY_TRUE, descrs, "accumulate");
+            arr, out, signature, NPY_TRUE, descrs, NPY_UNSAFE_CASTING,
+            "accumulate");
     if (ufuncimpl == NULL) {
         return NULL;
     }
@@ -3511,7 +3518,8 @@ PyUFunc_Reduceat(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *ind,
 
     PyArray_Descr *descrs[3];
     PyArrayMethodObject *ufuncimpl = reducelike_promote_and_resolve(ufunc,
-            arr, out, signature, NPY_TRUE, descrs, "reduceat");
+            arr, out, signature, NPY_TRUE, descrs, NPY_UNSAFE_CASTING,
+            "reduceat");
     if (ufuncimpl == NULL) {
         return NULL;
     }
@@ -4063,7 +4071,7 @@ PyUFunc_GenericReduction(PyUFuncObject *ufunc,
     /* We now have all the information required to check for Overrides */
     PyObject *override = NULL;
     int errval = PyUFunc_CheckOverride(ufunc, _reduce_type[operation],
-            full_args.in, full_args.out, args, len_args, kwnames, &override);
+            full_args.in, full_args.out, wheremask_obj, args, len_args, kwnames, &override);
     if (errval) {
         return NULL;
     }
@@ -4169,38 +4177,6 @@ PyUFunc_GenericReduction(PyUFuncObject *ufunc,
         }
     }
 
-     /*
-      * If no dtype is specified and out is not specified, we override the
-      * integer and bool dtype used for add and multiply.
-      *
-      * TODO: The following should be handled by a promoter!
-      */
-    if (signature[0] == NULL && out == NULL) {
-        /*
-         * For integer types --- make sure at least a long
-         * is used for add and multiply reduction to avoid overflow
-         */
-        int typenum = PyArray_TYPE(mp);
-        if ((PyTypeNum_ISBOOL(typenum) || PyTypeNum_ISINTEGER(typenum))
-                && ((strcmp(ufunc->name, "add") == 0)
-                    || (strcmp(ufunc->name, "multiply") == 0))) {
-            if (PyTypeNum_ISBOOL(typenum)) {
-                typenum = NPY_LONG;
-            }
-            else if ((size_t)PyArray_DESCR(mp)->elsize < sizeof(long)) {
-                if (PyTypeNum_ISUNSIGNED(typenum)) {
-                    typenum = NPY_ULONG;
-                }
-                else {
-                    typenum = NPY_LONG;
-                }
-            }
-            signature[0] = PyArray_DTypeFromTypeNum(typenum);
-        }
-    }
-    Py_XINCREF(signature[0]);
-    signature[2] = signature[0];
-
     switch(operation) {
     case UFUNC_REDUCE:
         ret = PyUFunc_Reduce(ufunc,
@@ -4361,23 +4337,16 @@ _get_dtype(PyObject *dtype_obj) {
         }
         else if (NPY_UNLIKELY(out->singleton != descr)) {
             /* This does not warn about `metadata`, but units is important. */
-            if (!PyArray_EquivTypes(out->singleton, descr)) {
-                /* Deprecated NumPy 1.21.2 (was an accidental error in 1.21) */
-                if (DEPRECATE(
+            if (out->singleton == NULL
+                    || !PyArray_EquivTypes(out->singleton, descr)) {
+                PyErr_SetString(PyExc_TypeError,
                         "The `dtype` and `signature` arguments to "
                         "ufuncs only select the general DType and not details "
-                        "such as the byte order or time unit (with rare "
-                        "exceptions see release notes).  To avoid this warning "
-                        "please use the scalar types `np.float64`, or string "
-                        "notation.\n"
-                        "In rare cases where the time unit was preserved, "
-                        "either cast the inputs or provide an output array. "
-                        "In the future NumPy may transition to allow providing "
-                        "`dtype=` to denote the outputs `dtype` as well. "
-                        "(Deprecated NumPy 1.21)") < 0) {
-                    Py_DECREF(descr);
-                    return NULL;
-                }
+                        "such as the byte order or time unit. "
+                        "You can avoid this error by using the scalar types "
+                        "`np.float64` or the dtype string notation.");
+                Py_DECREF(descr);
+                return NULL;
             }
         }
         Py_INCREF(out);
@@ -4874,7 +4843,7 @@ ufunc_generic_fastcall(PyUFuncObject *ufunc,
     /* We now have all the information required to check for Overrides */
     PyObject *override = NULL;
     errval = PyUFunc_CheckOverride(ufunc, method,
-            full_args.in, full_args.out,
+            full_args.in, full_args.out, where_obj,
             args, len_args, kwnames, &override);
     if (errval) {
         goto fail;
@@ -5066,14 +5035,11 @@ ufunc_generic_vectorcall(PyObject *ufunc,
 
 
 NPY_NO_EXPORT PyObject *
-ufunc_geterr(PyObject *NPY_UNUSED(dummy), PyObject *args)
+ufunc_geterr(PyObject *NPY_UNUSED(dummy), PyObject *NPY_UNUSED(arg))
 {
     PyObject *thedict;
     PyObject *res;
 
-    if (!PyArg_ParseTuple(args, "")) {
-        return NULL;
-    }
     thedict = PyThreadState_GetDict();
     if (thedict == NULL) {
         thedict = PyEval_GetBuiltins();
@@ -5099,16 +5065,13 @@ ufunc_geterr(PyObject *NPY_UNUSED(dummy), PyObject *args)
 
 
 NPY_NO_EXPORT PyObject *
-ufunc_seterr(PyObject *NPY_UNUSED(dummy), PyObject *args)
+ufunc_seterr(PyObject *NPY_UNUSED(dummy), PyObject *arg)
 {
     PyObject *thedict;
     int res;
-    PyObject *val;
+    PyObject *val = arg;
     static char *msg = "Error object must be a list of length 3";
 
-    if (!PyArg_ParseTuple(args, "O:seterrobj", &val)) {
-        return NULL;
-    }
     if (!PyList_CheckExact(val) || PyList_GET_SIZE(val) != 3) {
         PyErr_SetString(PyExc_ValueError, msg);
         return NULL;
@@ -5379,7 +5342,7 @@ cmp_arg_types(int *arg1, int *arg2, int n)
  * This frees the linked-list structure when the CObject
  * is destroyed (removed from the internal dictionary)
 */
-static NPY_INLINE void
+static inline void
 _free_loop1d_list(PyUFunc_Loop1d *data)
 {
     int i;
@@ -5920,10 +5883,11 @@ ufunc_reduceat(PyUFuncObject *ufunc,
 }
 
 /* Helper for ufunc_at, below */
-static NPY_INLINE PyArrayObject *
+static inline PyArrayObject *
 new_array_op(PyArrayObject *op_array, char *data)
 {
     npy_intp dims[1] = {1};
+    Py_INCREF(PyArray_DESCR(op_array));  /* NewFromDescr steals a reference */
     PyObject *r = PyArray_NewFromDescr(&PyArray_Type, PyArray_DESCR(op_array),
                                        1, dims, NULL, data,
                                        NPY_ARRAY_WRITEABLE, NULL);
@@ -5931,206 +5895,171 @@ new_array_op(PyArrayObject *op_array, char *data)
 }
 
 /*
- * Call ufunc only on selected array items and store result in first operand.
- * For add ufunc, method call is equivalent to op1[idx] += op2 with no
- * buffering of the first operand.
- * Arguments:
- * op1 - First operand to ufunc
- * idx - Indices that are applied to first operand. Equivalent to op1[idx].
- * op2 - Second operand to ufunc (if needed). Must be able to broadcast
- *       over first operand.
+ * Use an indexed loop to do the work
+ * Returns 0 if successful
  */
-static PyObject *
-ufunc_at(PyUFuncObject *ufunc, PyObject *args)
+static int
+trivial_at_loop(PyArrayMethodObject *ufuncimpl, NPY_ARRAYMETHOD_FLAGS flags,
+                    PyArrayMapIterObject *iter,
+                    PyArrayObject *op1_array, PyArrayObject *op2_array,
+                    PyArrayMethod_Context *context)
 {
-    PyObject *op1 = NULL;
-    PyObject *idx = NULL;
-    PyObject *op2 = NULL;
-    PyArrayObject *op1_array = NULL;
-    PyArrayObject *op2_array = NULL;
-    PyArrayMapIterObject *iter = NULL;
-    PyArrayIterObject *iter2 = NULL;
-    PyArrayObject *operands[3] = {NULL, NULL, NULL};
-    PyArrayObject *array_operands[3] = {NULL, NULL, NULL};
-
-    PyArray_DTypeMeta *signature[3] = {NULL, NULL, NULL};
-    PyArray_DTypeMeta *operand_DTypes[3] = {NULL, NULL, NULL};
-    PyArray_Descr *operation_descrs[3] = {NULL, NULL, NULL};
-
-    int nop;
-
-    /* override vars */
-    int errval;
-    PyObject *override = NULL;
-
-    NpyIter *iter_buffer;
-    NpyIter_IterNextFunc *iternext;
-    npy_uint32 op_flags[NPY_MAXARGS];
-    int buffersize;
-    int errormask = 0;
-    char * err_msg = NULL;
-
-    PyArrayMethod_StridedLoop *strided_loop;
-    NpyAuxData *auxdata = NULL;
+    int buffersize=0, errormask = 0;
+    int res;
+    char *args[3];
+    npy_intp steps[3];
+    args[0] = (char *) iter->baseoffset;
+    steps[0] = iter->fancy_strides[0];
+    if (ufuncimpl->nin == 1) {
+        args[2] = NULL;
+        steps[2] = 0;
+    } else {
+        args[2] = (char *)PyArray_DATA(op2_array);
+        if (PyArray_NDIM(op2_array) == 0
+            || PyArray_DIM(op2_array, 0) <= 1) {
+            steps[2] = 0;
+        } else {
+            steps[2] = PyArray_STRIDE(op2_array, 0);
+        }
+    }
 
-    NPY_BEGIN_THREADS_DEF;
+    npy_intp *inner_size = NpyIter_GetInnerLoopSizePtr(iter->outer);
 
-    if (ufunc->core_enabled) {
-        PyErr_Format(PyExc_TypeError,
-            "%s.at does not support ufunc with non-trivial signature: %s has signature %s.",
-            ufunc->name, ufunc->name, ufunc->core_signature);
-        return NULL;
+    if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
+        npy_clear_floatstatus_barrier((char *)context);
     }
 
-    if (ufunc->nin > 2) {
-        PyErr_SetString(PyExc_ValueError,
-            "Only unary and binary ufuncs supported at this time");
-        return NULL;
-    }
+    do {
+        args[1] = (char *) iter->outer_ptrs[0];
+        steps[1] = iter->outer_strides[0];
 
-    if (ufunc->nout != 1) {
-        PyErr_SetString(PyExc_ValueError,
-            "Only single output ufuncs supported at this time");
-        return NULL;
-    }
+        res = ufuncimpl->contiguous_indexed_loop(
+                context, args, inner_size, steps, NULL);
+        if (args[2] != NULL) {
+            args[2] += (*inner_size) * steps[2];
+        }
+    } while (res == 0 && iter->outer_next(iter->outer));
 
-    if (!PyArg_ParseTuple(args, "OO|O:at", &op1, &idx, &op2)) {
-        return NULL;
+    if (res == 0 && !(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
+        const char * ufunc_name =
+                        ufunc_get_name_cstr((PyUFuncObject *)context->caller);
+        if (_get_bufsize_errmask(NULL, ufunc_name,
+                                 &buffersize, &errormask) < 0) {
+            return -1;
+        }
+        res = _check_ufunc_fperr(errormask, NULL, ufunc_name);
     }
+    return res;
+}
 
-    if (ufunc->nin == 2 && op2 == NULL) {
-        PyErr_SetString(PyExc_ValueError,
-                        "second operand needed for ufunc");
-        return NULL;
-    }
-    errval = PyUFunc_CheckOverride(ufunc, "at",
-            args, NULL, NULL, 0, NULL, &override);
+static int
+ufunc_at__fast_iter(PyUFuncObject *ufunc, NPY_ARRAYMETHOD_FLAGS flags,
+                    PyArrayMapIterObject *iter, PyArrayIterObject *iter2,
+                    PyArrayObject *op1_array, PyArrayObject *op2_array,
+                    PyArrayMethod_StridedLoop *strided_loop,
+                    PyArrayMethod_Context *context,
+                    NpyAuxData *auxdata
+                    )
+{
+    int buffersize;
+    int errormask = 0;
+    int res = 0;
+    NPY_BEGIN_THREADS_DEF;
 
-    if (errval) {
-        return NULL;
-    }
-    else if (override) {
-        return override;
+    if (_get_bufsize_errmask(NULL, ufunc->name, &buffersize, &errormask) < 0) {
+        return -1;
     }
-
-    if (!PyArray_Check(op1)) {
-        PyErr_SetString(PyExc_TypeError,
-                        "first operand must be array");
-        return NULL;
+    int needs_api = (flags & NPY_METH_REQUIRES_PYAPI) != 0;
+    if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
+        /* Start with the floating-point exception flags cleared */
+        npy_clear_floatstatus_barrier((char*)&iter);
     }
 
-    op1_array = (PyArrayObject *)op1;
-
-    /* Create second operand from number array if needed. */
-    if (op2 != NULL) {
-        op2_array = (PyArrayObject *)PyArray_FromAny(op2, NULL,
-                                0, 0, 0, NULL);
-        if (op2_array == NULL) {
-            goto fail;
-        }
+    if (!needs_api) {
+        NPY_BEGIN_THREADS;
     }
 
-    /* Create map iterator */
-    iter = (PyArrayMapIterObject *)PyArray_MapIterArrayCopyIfOverlap(
-        op1_array, idx, 1, op2_array);
-    if (iter == NULL) {
-        goto fail;
-    }
-    op1_array = iter->array;  /* May be updateifcopied on overlap */
+    npy_intp strides[3] = {0, 0, 0};
+    /*
+     * Iterate over first and second operands and call ufunc
+     * for each pair of inputs
+     */
+    for (npy_intp i = iter->size; i > 0; i--)
+    {
+        char *dataptr[3];
+        /* one element at a time, no stride required but read by innerloop */
+        npy_intp count = 1;
 
-    if (op2 != NULL) {
         /*
-         * May need to swap axes so that second operand is
-         * iterated over correctly
+         * Set up data pointers for either one or two input operands.
+         * The output data pointer points to the first operand data.
          */
-        if ((iter->subspace != NULL) && (iter->consec)) {
-            PyArray_MapIterSwapAxes(iter, &op2_array, 0);
-            if (op2_array == NULL) {
-                goto fail;
-            }
+        dataptr[0] = iter->dataptr;
+        if (iter2 != NULL) {
+            dataptr[1] = PyArray_ITER_DATA(iter2);
+            dataptr[2] = iter->dataptr;
+        }
+        else {
+            dataptr[1] = iter->dataptr;
+            dataptr[2] = NULL;
         }
 
-        /*
-         * Create array iter object for second operand that
-         * "matches" the map iter object for the first operand.
-         * Then we can just iterate over the first and second
-         * operands at the same time and not have to worry about
-         * picking the correct elements from each operand to apply
-         * the ufunc to.
-         */
-        if ((iter2 = (PyArrayIterObject *)\
-             PyArray_BroadcastToShape((PyObject *)op2_array,
-                                        iter->dimensions, iter->nd))==NULL) {
-            goto fail;
+        res = strided_loop(context, dataptr, &count, strides, auxdata);
+        if (res != 0) {
+            break;
         }
-    }
 
-    /*
-     * Create dtypes array for either one or two input operands.
-     * Compare to the logic in `convert_ufunc_arguments`.
-     * TODO: It may be good to review some of this behaviour, since the
-     *       operand array is special (it is written to) similar to reductions.
-     *       Using unsafe-casting as done here, is likely not desirable.
-     */
-    operands[0] = op1_array;
-    operand_DTypes[0] = NPY_DTYPE(PyArray_DESCR(op1_array));
-    Py_INCREF(operand_DTypes[0]);
-    int force_legacy_promotion = 0;
-    int allow_legacy_promotion = NPY_DT_is_legacy(operand_DTypes[0]);
+        PyArray_MapIterNext(iter);
+        if (iter2 != NULL) {
+            PyArray_ITER_NEXT(iter2);
+        }
+    }
 
-    if (op2_array != NULL) {
-        operands[1] = op2_array;
-        operand_DTypes[1] = NPY_DTYPE(PyArray_DESCR(op2_array));
-        Py_INCREF(operand_DTypes[1]);
-        allow_legacy_promotion &= NPY_DT_is_legacy(operand_DTypes[1]);
-        operands[2] = operands[0];
-        operand_DTypes[2] = operand_DTypes[0];
-        Py_INCREF(operand_DTypes[2]);
+    NPY_END_THREADS;
 
-        nop = 3;
-        if (allow_legacy_promotion && ((PyArray_NDIM(op1_array) == 0)
-                                       != (PyArray_NDIM(op2_array) == 0))) {
-                /* both are legacy and only one is 0-D: force legacy */
-                force_legacy_promotion = should_use_min_scalar(2, operands, 0, NULL);
-            }
-    }
-    else {
-        operands[1] = operands[0];
-        operand_DTypes[1] = operand_DTypes[0];
-        Py_INCREF(operand_DTypes[1]);
-        operands[2] = NULL;
-        nop = 2;
+    if (res == 0 && !(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
+        /* NOTE: We could check float errors even when `res < 0` */
+        res = _check_ufunc_fperr(errormask, NULL, "at");
     }
+    return res;
+}
 
-    PyArrayMethodObject *ufuncimpl = promote_and_get_ufuncimpl(ufunc,
-            operands, signature, operand_DTypes,
-            force_legacy_promotion, allow_legacy_promotion,
-            NPY_FALSE, NPY_FALSE);
-    if (ufuncimpl == NULL) {
-        goto fail;
-    }
+static int
+ufunc_at__slow_iter(PyUFuncObject *ufunc, NPY_ARRAYMETHOD_FLAGS flags,
+                    PyArrayMapIterObject *iter, PyArrayIterObject *iter2,
+                    PyArrayObject *op1_array, PyArrayObject *op2_array,
+                    PyArray_Descr *operation_descrs[3],
+                    PyArrayMethod_StridedLoop *strided_loop,
+                    PyArrayMethod_Context *context,
+                    NpyAuxData *auxdata
+                    )
+{
+    NpyIter *iter_buffer = NULL;
+    PyArrayObject *array_operands[3] = {NULL, NULL, NULL};
+    int buffersize;
+    int errormask = 0;
+    int res = 0;
+    int nop = 0;
+    NpyIter_IterNextFunc *iternext;
+    char * err_msg = NULL;
+    NPY_BEGIN_THREADS_DEF;
 
-    /* Find the correct descriptors for the operation */
-    if (resolve_descriptors(nop, ufunc, ufuncimpl,
-            operands, operation_descrs, signature, NPY_UNSAFE_CASTING) < 0) {
-        goto fail;
+    if (_get_bufsize_errmask(NULL, ufunc->name, &buffersize, &errormask) < 0) {
+        return -1;
     }
-
-    Py_INCREF(PyArray_DESCR(op1_array));
     array_operands[0] = new_array_op(op1_array, iter->dataptr);
     if (iter2 != NULL) {
-        Py_INCREF(PyArray_DESCR(op2_array));
         array_operands[1] = new_array_op(op2_array, PyArray_ITER_DATA(iter2));
-        Py_INCREF(PyArray_DESCR(op1_array));
         array_operands[2] = new_array_op(op1_array, iter->dataptr);
+        nop = 3;
     }
     else {
-        Py_INCREF(PyArray_DESCR(op1_array));
         array_operands[1] = new_array_op(op1_array, iter->dataptr);
         array_operands[2] = NULL;
+        nop = 2;
     }
-
     /* Set up the flags */
+    npy_uint32 op_flags[3];
     op_flags[0] = NPY_ITER_READONLY|
                   NPY_ITER_ALIGNED;
 
@@ -6150,11 +6079,6 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args)
                       NPY_ITER_NO_BROADCAST|
                       NPY_ITER_NO_SUBTYPE;
     }
-
-    if (_get_bufsize_errmask(NULL, ufunc->name, &buffersize, &errormask) < 0) {
-        goto fail;
-    }
-
     /*
      * Create NpyIter object to "iterate" over single element of each input
      * operand. This is an easy way to reuse the NpyIter logic for dealing
@@ -6177,33 +6101,23 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args)
                         -1, NULL, NULL, buffersize);
 
     if (iter_buffer == NULL) {
-        goto fail;
+        /* will fail only on memory allocation errors */
+        for (int i = 0; i < 3; i++) {
+            Py_XDECREF(array_operands[i]);
+        }
+        return -1;
     }
 
     iternext = NpyIter_GetIterNext(iter_buffer, NULL);
     if (iternext == NULL) {
+        /* can not really happen, iter_buffer creation is tightly controlled */
         NpyIter_Deallocate(iter_buffer);
-        goto fail;
-    }
-
-    PyArrayMethod_Context context = {
-            .caller = (PyObject *)ufunc,
-            .method = ufuncimpl,
-            .descriptors = operation_descrs,
-    };
-
-    NPY_ARRAYMETHOD_FLAGS flags;
-    /* Use contiguous strides; if there is such a loop it may be faster */
-    npy_intp strides[3] = {
-            operation_descrs[0]->elsize, operation_descrs[1]->elsize, 0};
-    if (nop == 3) {
-        strides[2] = operation_descrs[2]->elsize;
+        for (int i = 0; i < 3; i++) {
+            Py_XDECREF(array_operands[i]);
+        }
+        return -1;
     }
 
-    if (ufuncimpl->get_strided_loop(&context, 1, 0, strides,
-            &strided_loop, &auxdata, &flags) < 0) {
-        goto fail;
-    }
     int needs_api = (flags & NPY_METH_REQUIRES_PYAPI) != 0;
     needs_api |= NpyIter_IterationNeedsAPI(iter_buffer);
     if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
@@ -6211,6 +6125,7 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args)
         npy_clear_floatstatus_barrier((char*)&iter);
     }
 
+    npy_intp strides[3] = {0, 0, 0};
     if (!needs_api) {
         NPY_BEGIN_THREADS;
     }
@@ -6219,7 +6134,6 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args)
      * Iterate over first and second operands and call ufunc
      * for each pair of inputs
      */
-    int res = 0;
     for (npy_intp i = iter->size; i > 0; i--)
     {
         char *dataptr[3];
@@ -6250,7 +6164,7 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args)
 
         buffer_dataptr = NpyIter_GetDataPtrArray(iter_buffer);
 
-        res = strided_loop(&context, buffer_dataptr, &count, strides, auxdata);
+        res = strided_loop(context, buffer_dataptr, &count, strides, auxdata);
         if (res != 0) {
             break;
         }
@@ -6276,18 +6190,284 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args)
         /* NOTE: We could check float errors even when `res < 0` */
         res = _check_ufunc_fperr(errormask, NULL, "at");
     }
+    NpyIter_Deallocate(iter_buffer);
+    for (int i = 0; i < 3; i++) {
+        Py_XDECREF(array_operands[i]);
+    }
+    return res;
+}
 
+/*
+ * Call ufunc only on selected array items and store result in first operand.
+ * For add ufunc, method call is equivalent to op1[idx] += op2 with no
+ * buffering of the first operand.
+ * Arguments:
+ * op1 - First operand to ufunc
+ * idx - Indices that are applied to first operand. Equivalent to op1[idx].
+ * op2 - Second operand to ufunc (if needed). Must be able to broadcast
+ *       over first operand.
+ */
+static PyObject *
+ufunc_at(PyUFuncObject *ufunc, PyObject *args)
+{
+    PyObject *op1 = NULL;
+    PyObject *idx = NULL;
+    PyObject *op2 = NULL;
+    PyArrayObject *op1_array = NULL;
+    PyArrayObject *op2_array = NULL;
+    PyArrayMapIterObject *iter = NULL;
+    PyArrayIterObject *iter2 = NULL;
+    PyArray_Descr *operation_descrs[3] = {NULL, NULL, NULL};
+
+    int nop;
+
+    /* override vars */
+    int errval;
+    PyObject *override = NULL;
+    int res = -1;   /* start with fail condition so "goto fail" will error */
+
+    PyArrayMethod_StridedLoop *strided_loop;
+    NpyAuxData *auxdata = NULL;
+
+    if (ufunc->core_enabled) {
+        PyErr_Format(PyExc_TypeError,
+            "%s.at does not support ufunc with non-trivial signature: %s has signature %s.",
+            ufunc->name, ufunc->name, ufunc->core_signature);
+        return NULL;
+    }
+
+    if (ufunc->nin > 2) {
+        PyErr_SetString(PyExc_ValueError,
+            "Only unary and binary ufuncs supported at this time");
+        return NULL;
+    }
+
+    if (ufunc->nout != 1) {
+        PyErr_SetString(PyExc_ValueError,
+            "Only single output ufuncs supported at this time");
+        return NULL;
+    }
+
+    if (!PyArg_ParseTuple(args, "OO|O:at", &op1, &idx, &op2)) {
+        return NULL;
+    }
+
+    if (ufunc->nin == 2 && op2 == NULL) {
+        PyErr_SetString(PyExc_ValueError,
+                        "second operand needed for ufunc");
+        return NULL;
+    }
+
+    if (ufunc->nin == 1 && op2 != NULL) {
+        PyErr_SetString(PyExc_ValueError,
+                        "second operand provided when ufunc is unary");
+        return NULL;
+    }
+    errval = PyUFunc_CheckOverride(ufunc, "at",
+            args, NULL, NULL, NULL, 0, NULL, &override);
+
+    if (errval) {
+        return NULL;
+    }
+    else if (override) {
+        return override;
+    }
+
+    if (!PyArray_Check(op1)) {
+        PyErr_SetString(PyExc_TypeError,
+                        "first operand must be array");
+        return NULL;
+    }
+
+    op1_array = (PyArrayObject *)op1;
+
+    /* Create second operand from number array if needed. */
+    if (op2 == NULL) {
+        nop = 2;
+    }
+    else {
+        nop = 3;
+        op2_array = (PyArrayObject *)PyArray_FromAny(op2, NULL,
+                                0, 0, 0, NULL);
+        if (op2_array == NULL) {
+            goto fail;
+        }
+    }
+
+    PyArrayMethodObject *ufuncimpl = NULL;
+    {
+        /* Do all the dtype handling and find the correct ufuncimpl */
+
+        PyArrayObject *tmp_operands[3] = {NULL, NULL, NULL};
+        PyArray_DTypeMeta *signature[3] = {NULL, NULL, NULL};
+        PyArray_DTypeMeta *operand_DTypes[3] = {NULL, NULL, NULL};
+        /*
+         * Create dtypes array for either one or two input operands.
+         * Compare to the logic in `convert_ufunc_arguments`.
+         * TODO: It may be good to review some of this behaviour, since the
+         *       operand array is special (it is written to) similar to reductions.
+         *       Using unsafe-casting as done here, is likely not desirable.
+         */
+        tmp_operands[0] = op1_array;
+        operand_DTypes[0] = NPY_DTYPE(PyArray_DESCR(op1_array));
+        Py_INCREF(operand_DTypes[0]);
+        int force_legacy_promotion = 0;
+        int allow_legacy_promotion = NPY_DT_is_legacy(operand_DTypes[0]);
+
+        if (op2_array != NULL) {
+            tmp_operands[1] = op2_array;
+            operand_DTypes[1] = NPY_DTYPE(PyArray_DESCR(op2_array));
+            Py_INCREF(operand_DTypes[1]);
+            allow_legacy_promotion &= NPY_DT_is_legacy(operand_DTypes[1]);
+            tmp_operands[2] = tmp_operands[0];
+            operand_DTypes[2] = operand_DTypes[0];
+            Py_INCREF(operand_DTypes[2]);
+
+            if (allow_legacy_promotion && ((PyArray_NDIM(op1_array) == 0)
+                                           != (PyArray_NDIM(op2_array) == 0))) {
+                    /* both are legacy and only one is 0-D: force legacy */
+                    force_legacy_promotion = should_use_min_scalar(2, tmp_operands, 0, NULL);
+                }
+        }
+        else {
+            tmp_operands[1] = tmp_operands[0];
+            operand_DTypes[1] = operand_DTypes[0];
+            Py_INCREF(operand_DTypes[1]);
+            tmp_operands[2] = NULL;
+        }
+
+        ufuncimpl = promote_and_get_ufuncimpl(ufunc, tmp_operands, signature,
+                        operand_DTypes, force_legacy_promotion,
+                        allow_legacy_promotion, NPY_FALSE, NPY_FALSE);
+        if (ufuncimpl == NULL) {
+            for (int i = 0; i < 3; i++) {
+                Py_XDECREF(signature[i]);
+                Py_XDECREF(operand_DTypes[i]);
+            }
+            goto fail;
+        }
+
+        /* Find the correct operation_descrs for the operation */
+        int resolve_result = resolve_descriptors(nop, ufunc, ufuncimpl,
+                tmp_operands, operation_descrs, signature, NPY_UNSAFE_CASTING);
+        for (int i = 0; i < 3; i++) {
+            Py_XDECREF(signature[i]);
+            Py_XDECREF(operand_DTypes[i]);
+        }
+        if (resolve_result < 0) {
+            goto fail;
+        }
+    }
+
+    iter = (PyArrayMapIterObject *)PyArray_MapIterArrayCopyIfOverlap(
+        op1_array, idx, 1, op2_array);
+    if (iter == NULL) {
+        goto fail;
+    }
+    op1_array = iter->array;  /* May be updateifcopied on overlap */
+
+    if (op2_array != NULL) {
+        /*
+         * May need to swap axes so that second operand is
+         * iterated over correctly
+         */
+        if ((iter->subspace != NULL) && (iter->consec)) {
+            PyArray_MapIterSwapAxes(iter, &op2_array, 0);
+            if (op2_array == NULL) {
+                /* only on memory allocation failure */
+                goto fail;
+            }
+        }
+
+        /*
+         * Create array iter object for second operand that
+         * "matches" the map iter object for the first operand.
+         * Then we can just iterate over the first and second
+         * operands at the same time and not have to worry about
+         * picking the correct elements from each operand to apply
+         * the ufunc to.
+         */
+        if ((iter2 = (PyArrayIterObject *)\
+             PyArray_BroadcastToShape((PyObject *)op2_array,
+                                        iter->dimensions, iter->nd))==NULL) {
+            goto fail;
+        }
+    }
+
+    PyArrayMethod_Context context = {
+            .caller = (PyObject *)ufunc,
+            .method = ufuncimpl,
+            .descriptors = operation_descrs,
+    };
+
+    /* Use contiguous strides; if there is such a loop it may be faster */
+    npy_intp strides[3] = {
+            operation_descrs[0]->elsize, operation_descrs[1]->elsize, 0};
+    if (nop == 3) {
+        strides[2] = operation_descrs[2]->elsize;
+    }
+
+    NPY_ARRAYMETHOD_FLAGS flags;
+    if (ufuncimpl->get_strided_loop(&context, 1, 0, strides,
+            &strided_loop, &auxdata, &flags) < 0) {
+        goto fail;
+    }
+    int fast_path = 1;
+    /* check no casting, alignment */
+    if (PyArray_DESCR(op1_array) != operation_descrs[0]) {
+        fast_path = 0;
+    }
+    if (PyArray_DESCR(op1_array) != operation_descrs[nop - 1]) {
+        /* output casting */
+        fast_path = 0;
+    }
+    if (!PyArray_ISALIGNED(op1_array)) {
+        fast_path = 0;
+    }
+    if (nop >2) {
+        if  (PyArray_DESCR(op2_array) != operation_descrs[1]) {
+            fast_path = 0;
+        }
+        if (!PyArray_ISALIGNED(op2_array)) {
+            fast_path = 0;
+        }
+    }
+    if (fast_path == 1) {
+        /*
+         * Try to use trivial loop (1d, no casting, aligned) if
+         * - the matching info has a indexed loop
+         * - idx must be exactly one integer index array
+         * - all operands are 1d
+         * A future enhancement could loosen the restriction on 1d operands
+         * by adding an iteration loop inside trivial_at_loop
+         */
+        if ((ufuncimpl->contiguous_indexed_loop != NULL) &&
+                (PyArray_NDIM(op1_array) == 1)  &&
+                (op2_array == NULL || PyArray_NDIM(op2_array) <= 1) &&
+                (iter->subspace_iter == NULL) && (iter->numiter == 1)) {
+            res = trivial_at_loop(ufuncimpl, flags, iter, op1_array,
+                        op2_array, &context);
+
+        }
+        else {
+            /* Couldn't use the fastest path, use the faster path */
+            res = ufunc_at__fast_iter(ufunc, flags, iter, iter2, op1_array,
+                        op2_array, strided_loop, &context, auxdata);
+        }
+    } else {
+        res = ufunc_at__slow_iter(ufunc, flags, iter, iter2, op1_array,
+                        op2_array, operation_descrs, strided_loop, &context,
+                        auxdata);
+    }
+
+fail:
     NPY_AUXDATA_FREE(auxdata);
-    NpyIter_Deallocate(iter_buffer);
 
     Py_XDECREF(op2_array);
     Py_XDECREF(iter);
     Py_XDECREF(iter2);
     for (int i = 0; i < nop; i++) {
-        Py_DECREF(signature[i]);
-        Py_XDECREF(operand_DTypes[i]);
         Py_XDECREF(operation_descrs[i]);
-        Py_XDECREF(array_operands[i]);
     }
 
     /*
@@ -6296,29 +6476,406 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args)
      * (e.g. `power` released the GIL but manually set an Exception).
      */
     if (res != 0 || PyErr_Occurred()) {
+        /* iter_buffer has already been deallocated, don't use NpyIter_Dealloc */
+        if (PyArray_FLAGS(op1_array) & NPY_ARRAY_WRITEBACKIFCOPY) {
+            PyArray_DiscardWritebackIfCopy(op1_array);
+        }
         return NULL;
     }
     else {
         Py_RETURN_NONE;
     }
+}
 
-fail:
-    /* iter_buffer has already been deallocated, don't use NpyIter_Dealloc */
-    if (op1_array != (PyArrayObject*)op1) {
-        PyArray_DiscardWritebackIfCopy(op1_array);
+
+typedef struct {
+    PyArrayMethod_StridedLoop *strided_loop;
+    PyArrayMethod_Context *context;
+    NpyAuxData *auxdata;
+    /* Should move to flags, but lets keep it bools for now: */
+    npy_bool requires_pyapi;
+    npy_bool no_floatingpoint_errors;
+    PyArrayMethod_Context _full_context;
+    PyArray_Descr *_descrs[];
+} ufunc_call_info;
+
+
+void
+free_ufunc_call_info(PyObject *self)
+{
+    ufunc_call_info *call_info = PyCapsule_GetPointer(
+            self, "numpy_1.24_ufunc_call_info");
+
+    PyArrayMethod_Context *context = call_info->context;
+
+    int nargs = context->method->nin + context->method->nout;
+    for (int i = 0; i < nargs; i++) {
+        Py_DECREF(context->descriptors[i]);
     }
-    Py_XDECREF(op2_array);
-    Py_XDECREF(iter);
-    Py_XDECREF(iter2);
-    for (int i = 0; i < 3; i++) {
+    Py_DECREF(context->caller);
+    Py_DECREF(context->method);
+    NPY_AUXDATA_FREE(call_info->auxdata);
+
+    PyObject_Free(call_info);
+}
+
+
+/*
+ * Python entry-point to ufunc promotion and dtype/descr resolution.
+ *
+ * This function does most of the work required to execute ufunc without
+ * actually executing it.
+ * This can be very useful for downstream libraries that reimplement NumPy
+ * functionality, such as Numba or Dask.
+ */
+static PyObject *
+py_resolve_dtypes_generic(PyUFuncObject *ufunc, npy_bool return_context,
+        PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames)
+{
+    NPY_PREPARE_ARGPARSER;
+
+    PyObject *descrs_tuple;
+    PyObject *signature_obj = NULL;
+    NPY_CASTING casting = NPY_DEFAULT_ASSIGN_CASTING;
+    npy_bool reduction = NPY_FALSE;
+
+    if (npy_parse_arguments("resolve_dtypes", args, len_args, kwnames,
+            "", NULL, &descrs_tuple,
+            "$signature", NULL, &signature_obj,
+            "$casting", &PyArray_CastingConverter, &casting,
+            "$reduction", &PyArray_BoolConverter, &reduction,
+            NULL, NULL, NULL) < 0) {
+        return NULL;
+    }
+
+    if (reduction && (ufunc->nin != 2 || ufunc->nout != 1)) {
+        PyErr_SetString(PyExc_ValueError,
+                "ufunc is not compatible with reduction operations.");
+        return NULL;
+    }
+
+    /*
+     * Legacy type resolvers expect NumPy arrays as input.  Until NEP 50 is
+     * adopted, it is most convenient to ensure that we have an "array" object
+     * before calling the type promotion.  Eventually, this hack may be moved
+     * into the legacy type resolution code itself (probably after NumPy stops
+     * using legacy type resolution itself for the most part).
+     *
+     * We make the pretty safe assumptions here that:
+     * - Nobody will actually do anything with the array objects besides
+     *   checking the descriptor or calling CanCast.
+     * - No type resolver will cause weird paths that mess with our promotion
+     *   state (or mind us messing with it).
+     */
+    PyObject *result = NULL;
+    PyObject *result_dtype_tuple = NULL;
+
+    PyArrayObject *dummy_arrays[NPY_MAXARGS] = {NULL};
+    PyArray_DTypeMeta *DTypes[NPY_MAXARGS] = {NULL};
+    PyArray_DTypeMeta *signature[NPY_MAXARGS] = {NULL};
+    PyArray_Descr *operation_descrs[NPY_MAXARGS] = {NULL};
+
+    /* This entry-point to promotion lives in the NEP 50 future: */
+    int original_promotion_state = npy_promotion_state;
+    npy_promotion_state = NPY_USE_WEAK_PROMOTION;
+
+    npy_bool promoting_pyscalars = NPY_FALSE;
+    npy_bool allow_legacy_promotion = NPY_TRUE;
+
+    if (_get_fixed_signature(ufunc, NULL, signature_obj, signature) < 0) {
+        goto finish;
+    }
+
+    if (!PyTuple_CheckExact(descrs_tuple)
+            || PyTuple_Size(descrs_tuple) != ufunc->nargs)  {
+        PyErr_SetString(PyExc_TypeError,
+                "resolve_dtypes: The dtypes must be a tuple of "
+                "`ufunc.nargs` length.");
+        goto finish;
+    }
+    for (int i=0; i < ufunc->nargs; i++) {
+        /*
+         * We create dummy arrays for now.  It should be OK to make this
+         * truly "dummy" (not even proper objects), but that is a hack better
+         * left for the legacy_type_resolution wrapper when NEP 50 is done.
+         */
+        PyObject *descr_obj = PyTuple_GET_ITEM(descrs_tuple, i);
+        PyArray_Descr *descr;
+
+        if (PyArray_DescrCheck(descr_obj)) {
+            descr = (PyArray_Descr *)descr_obj;
+            Py_INCREF(descr);
+            dummy_arrays[i] = (PyArrayObject *)PyArray_NewFromDescr_int(
+                    &PyArray_Type, descr, 0, NULL, NULL, NULL,
+                    0, NULL, NULL, _NPY_ARRAY_ENSURE_DTYPE_IDENTITY);
+            if (dummy_arrays[i] == NULL) {
+                goto finish;
+            }
+            DTypes[i] = NPY_DTYPE(descr);
+            Py_INCREF(DTypes[i]);
+            if (!NPY_DT_is_legacy(DTypes[i])) {
+                allow_legacy_promotion = NPY_FALSE;
+            }
+        }
+         /* Explicitly allow int, float, and complex for the "weak" types. */
+        else if (descr_obj == (PyObject *)&PyLong_Type) {
+            descr = PyArray_DescrFromType(NPY_LONG);
+            dummy_arrays[i] = (PyArrayObject *)PyArray_Empty(0, NULL, descr, 0);
+            if (dummy_arrays[i] == NULL) {
+                goto finish;
+            }
+            PyArray_ENABLEFLAGS(dummy_arrays[i], NPY_ARRAY_WAS_PYTHON_INT);
+            Py_INCREF(&PyArray_PyIntAbstractDType);
+            DTypes[i] = &PyArray_PyIntAbstractDType;
+            promoting_pyscalars = NPY_TRUE;
+        }
+        else if (descr_obj == (PyObject *)&PyFloat_Type) {
+            descr = PyArray_DescrFromType(NPY_DOUBLE);
+            dummy_arrays[i] = (PyArrayObject *)PyArray_Empty(0, NULL, descr, 0);
+            if (dummy_arrays[i] == NULL) {
+                goto finish;
+            }
+            PyArray_ENABLEFLAGS(dummy_arrays[i], NPY_ARRAY_WAS_PYTHON_FLOAT);
+            Py_INCREF(&PyArray_PyFloatAbstractDType);
+            DTypes[i] = &PyArray_PyFloatAbstractDType;
+            promoting_pyscalars = NPY_TRUE;
+        }
+        else if (descr_obj == (PyObject *)&PyComplex_Type) {
+            descr = PyArray_DescrFromType(NPY_CDOUBLE);
+            dummy_arrays[i] = (PyArrayObject *)PyArray_Empty(0, NULL, descr, 0);
+            if (dummy_arrays[i] == NULL) {
+                goto finish;
+            }
+            PyArray_ENABLEFLAGS(dummy_arrays[i], NPY_ARRAY_WAS_PYTHON_COMPLEX);
+            Py_INCREF(&PyArray_PyComplexAbstractDType);
+            DTypes[i] = &PyArray_PyComplexAbstractDType;
+            promoting_pyscalars = NPY_TRUE;
+        }
+        else if (descr_obj == Py_None) {
+            if (i < ufunc->nin && !(reduction && i == 0)) {
+                PyErr_SetString(PyExc_TypeError,
+                        "All input dtypes must be provided "
+                        "(except the first one in reductions)");
+                goto finish;
+            }
+        }
+        else {
+            PyErr_SetString(PyExc_TypeError,
+                    "Provided dtype must be a valid NumPy dtype, "
+                    "int, float, complex, or None.");
+            goto finish;
+        }
+    }
+
+    PyArrayMethodObject *ufuncimpl;
+    if (!reduction) {
+        ufuncimpl = promote_and_get_ufuncimpl(ufunc,
+                dummy_arrays, signature, DTypes, NPY_FALSE,
+                allow_legacy_promotion, promoting_pyscalars, NPY_FALSE);
+        if (ufuncimpl == NULL) {
+            goto finish;
+        }
+
+        /* Find the correct descriptors for the operation */
+        if (resolve_descriptors(ufunc->nargs, ufunc, ufuncimpl,
+                dummy_arrays, operation_descrs, signature, casting) < 0) {
+            goto finish;
+        }
+
+        if (validate_casting(
+                ufuncimpl, ufunc, dummy_arrays, operation_descrs, casting) < 0) {
+            goto finish;
+        }
+    }
+    else {  /* reduction */
+        if (signature[2] != NULL) {
+            PyErr_SetString(PyExc_ValueError,
+                    "Reduction signature must end with None, instead pass "
+                    "the first DType in the signature.");
+            goto finish;
+        }
+
+        if (dummy_arrays[2] != NULL) {
+            PyErr_SetString(PyExc_TypeError,
+                    "Output dtype must not be passed for reductions, "
+                    "pass the first input instead.");
+            goto finish;
+        }
+
+        ufuncimpl = reducelike_promote_and_resolve(ufunc,
+                dummy_arrays[1], dummy_arrays[0], signature, NPY_FALSE,
+                operation_descrs, casting, "resolve_dtypes");
+
+        if (ufuncimpl == NULL) {
+            goto finish;
+        }
+    }
+
+    result = PyArray_TupleFromItems(
+            ufunc->nargs, (PyObject **)operation_descrs, 0);
+
+    if (result == NULL || !return_context) {
+        goto finish;
+    }
+    /* Result will be (dtype_tuple, call_info), so move it and clear result */
+    result_dtype_tuple = result;
+    result = NULL;
+
+    /* We may have to return the context: */
+    ufunc_call_info *call_info;
+    call_info = PyObject_Malloc(sizeof(ufunc_call_info)
+                              + ufunc->nargs * sizeof(PyArray_Descr *));
+    if (call_info == NULL) {
+        PyErr_NoMemory();
+        goto finish;
+    }
+    call_info->strided_loop = NULL;
+    call_info->auxdata = NULL;
+    call_info->context = &call_info->_full_context;
+
+    /*
+     * We create a capsule with NumPy 1.24 in the name to signal that it is
+     * prone to change in version updates (it doesn't have to).
+     * This capsule is documented in the `ufunc._resolve_dtypes_and_context`
+     * docstring.
+     */
+    PyObject *capsule = PyCapsule_New(
+            call_info, "numpy_1.24_ufunc_call_info", &free_ufunc_call_info);
+    if (capsule == NULL) {
+        PyObject_Free(call_info);
+        goto finish;
+    }
+
+    PyArrayMethod_Context *context = call_info->context;
+
+    Py_INCREF(ufunc);
+    context->caller = (PyObject *)ufunc;
+    Py_INCREF(ufuncimpl);
+    context->method = ufuncimpl;
+    context->descriptors = call_info->_descrs;
+    for (int i=0; i < ufunc->nargs; i++) {
+        Py_INCREF(operation_descrs[i]);
+        context->descriptors[i] = operation_descrs[i];
+    }
+
+    result = PyTuple_Pack(2, result_dtype_tuple, capsule);
+    /* cleanup and return */
+    Py_DECREF(capsule);
+
+  finish:
+    npy_promotion_state = original_promotion_state;
+
+    Py_XDECREF(result_dtype_tuple);
+    for (int i = 0; i < ufunc->nargs; i++) {
         Py_XDECREF(signature[i]);
-        Py_XDECREF(operand_DTypes[i]);
+        Py_XDECREF(dummy_arrays[i]);
         Py_XDECREF(operation_descrs[i]);
-        Py_XDECREF(array_operands[i]);
+        Py_XDECREF(DTypes[i]);
     }
-    NPY_AUXDATA_FREE(auxdata);
 
-    return NULL;
+    return result;
+}
+
+
+static PyObject *
+py_resolve_dtypes(PyUFuncObject *ufunc,
+        PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames)
+{
+    return py_resolve_dtypes_generic(ufunc, NPY_FALSE, args, len_args, kwnames);
+}
+
+
+static PyObject *
+py_resolve_dtypes_and_context(PyUFuncObject *ufunc,
+        PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames)
+{
+    return py_resolve_dtypes_generic(ufunc, NPY_TRUE, args, len_args, kwnames);
+}
+
+
+static PyObject *
+py_get_strided_loop(PyUFuncObject *ufunc,
+        PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames)
+{
+    NPY_PREPARE_ARGPARSER;
+
+    PyObject *call_info_obj;
+    PyObject *fixed_strides_obj = Py_None;
+    npy_intp fixed_strides[NPY_MAXARGS];
+
+    if (npy_parse_arguments("_get_strided_loop", args, len_args, kwnames,
+            "", NULL, &call_info_obj,
+            "$fixed_strides", NULL, &fixed_strides_obj,
+            NULL, NULL, NULL) < 0) {
+        return NULL;
+    }
+
+    ufunc_call_info *call_info = PyCapsule_GetPointer(
+                call_info_obj, "numpy_1.24_ufunc_call_info");
+    if (call_info == NULL) {
+        /* Cannot have a context with NULL inside... */
+        assert(PyErr_Occurred());
+        return NULL;
+    }
+    if (call_info->strided_loop != NULL) {
+        PyErr_SetString(PyExc_TypeError,
+                "ufunc call_info has already been filled/used!");
+        return NULL;
+    }
+
+    if (call_info->context->caller != (PyObject *)ufunc) {
+        PyErr_SetString(PyExc_TypeError,
+                "calling get_strided_loop with incompatible context");
+        return NULL;
+    }
+
+    /*
+     * Strict conversion of fixed_strides, None, or tuple of int or None.
+     */
+    if (fixed_strides_obj == Py_None) {
+        for (int i = 0; i < ufunc->nargs; i++) {
+            fixed_strides[i] = NPY_MAX_INTP;
+        }
+    }
+    else if (PyTuple_CheckExact(fixed_strides_obj)
+            && PyTuple_Size(fixed_strides_obj) == ufunc->nargs) {
+        for (int i = 0; i < ufunc->nargs; i++) {
+            PyObject *stride = PyTuple_GET_ITEM(fixed_strides_obj, i);
+            if (PyLong_CheckExact(stride)) {
+                fixed_strides[i] = PyLong_AsSsize_t(stride);
+                if (error_converting(fixed_strides[i])) {
+                    return NULL;
+                }
+            }
+            else if (stride == Py_None) {
+                fixed_strides[i] = NPY_MAX_INTP;
+            }
+            else {
+                PyErr_SetString(PyExc_TypeError,
+                    "_get_strided_loop(): fixed_strides tuple must contain "
+                    "Python ints or None");
+                return NULL;
+            }
+        }
+    }
+    else {
+        PyErr_SetString(PyExc_TypeError,
+            "_get_strided_loop(): fixed_strides must be a tuple or None");
+        return NULL;
+    }
+
+    NPY_ARRAYMETHOD_FLAGS flags;
+    if (call_info->context->method->get_strided_loop(call_info->context,
+            1, 0, fixed_strides, &call_info->strided_loop, &call_info->auxdata,
+            &flags) < 0) {
+        return NULL;
+    }
+
+    call_info->requires_pyapi = flags & NPY_METH_REQUIRES_PYAPI;
+    call_info->no_floatingpoint_errors = (
+            flags & NPY_METH_NO_FLOATINGPOINT_ERRORS);
+
+    Py_RETURN_NONE;
 }
 
 
@@ -6338,6 +6895,21 @@ static struct PyMethodDef ufunc_methods[] = {
     {"at",
         (PyCFunction)ufunc_at,
         METH_VARARGS, NULL},
+    /* Lower level methods: */
+    {"resolve_dtypes",
+        (PyCFunction)py_resolve_dtypes,
+        METH_FASTCALL | METH_KEYWORDS, NULL},
+    /*
+     * The following two functions are public API, but underscored since they
+     * are C-user specific and allow direct access to the core of ufunc loops.
+     * (See their documentation for API stability.)
+     */
+    {"_resolve_dtypes_and_context",
+        (PyCFunction)py_resolve_dtypes_and_context,
+        METH_FASTCALL | METH_KEYWORDS, NULL},
+    {"_get_strided_loop",
+        (PyCFunction)py_get_strided_loop,
+        METH_FASTCALL | METH_KEYWORDS, NULL},
     {NULL, NULL, 0, NULL}           /* sentinel */
 };
 
@@ -6459,7 +7031,7 @@ static PyObject *
 ufunc_get_identity(PyUFuncObject *ufunc, void *NPY_UNUSED(ignored))
 {
     npy_bool reorderable;
-    return _get_identity(ufunc, &reorderable);
+    return PyUFunc_GetDefaultIdentity(ufunc, &reorderable);
 }
 
 static PyObject *
@@ -6518,6 +7090,7 @@ NPY_NO_EXPORT PyTypeObject PyUFunc_Type = {
     .tp_name = "numpy.ufunc",
     .tp_basicsize = sizeof(PyUFuncObject),
     .tp_dealloc = (destructor)ufunc_dealloc,
+    .tp_vectorcall_offset = offsetof(PyUFuncObject, vectorcall),
     .tp_repr = (reprfunc)ufunc_repr,
     .tp_call = &PyVectorcall_Call,
     .tp_str = (reprfunc)ufunc_repr,
@@ -6527,7 +7100,6 @@ NPY_NO_EXPORT PyTypeObject PyUFunc_Type = {
     .tp_traverse = (traverseproc)ufunc_traverse,
     .tp_methods = ufunc_methods,
     .tp_getset = ufunc_getset,
-    .tp_vectorcall_offset = offsetof(PyUFuncObject, vectorcall),
 };
 
 /* End of code for ufunc objects */
diff --git a/numpy/core/src/umath/ufunc_object.h b/numpy/core/src/umath/ufunc_object.h
index 32af6c58e..1b80bb2df 100644
--- a/numpy/core/src/umath/ufunc_object.h
+++ b/numpy/core/src/umath/ufunc_object.h
@@ -4,14 +4,17 @@
 #include <numpy/ufuncobject.h>
 
 NPY_NO_EXPORT PyObject *
-ufunc_geterr(PyObject *NPY_UNUSED(dummy), PyObject *args);
+ufunc_geterr(PyObject *NPY_UNUSED(dummy), PyObject *NPY_UNUSED(arg));
 
 NPY_NO_EXPORT PyObject *
-ufunc_seterr(PyObject *NPY_UNUSED(dummy), PyObject *args);
+ufunc_seterr(PyObject *NPY_UNUSED(dummy), PyObject *arg);
 
 NPY_NO_EXPORT const char*
 ufunc_get_name_cstr(PyUFuncObject *ufunc);
 
+NPY_NO_EXPORT PyObject *
+PyUFunc_GetDefaultIdentity(PyUFuncObject *ufunc, npy_bool *reorderable);
+
 /* strings from umathmodule.c that are interned on umath import */
 NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_array_ufunc;
 NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_array_prepare;
diff --git a/numpy/core/src/umath/ufunc_type_resolution.c b/numpy/core/src/umath/ufunc_type_resolution.c
index 94338e031..decd26580 100644
--- a/numpy/core/src/umath/ufunc_type_resolution.c
+++ b/numpy/core/src/umath/ufunc_type_resolution.c
@@ -358,20 +358,51 @@ PyUFunc_SimpleBinaryComparisonTypeResolver(PyUFuncObject *ufunc,
     }
 
     if (type_tup == NULL) {
+        if (PyArray_ISDATETIME(operands[0])
+                && PyArray_ISDATETIME(operands[1])
+                && type_num1 != type_num2) {
+            /*
+             * Reject mixed datetime and timedelta explicitly, this was always
+             * implicitly rejected because casting fails (except with
+             * `casting="unsafe"` admittedly).
+             * This is required to ensure that `==` and `!=` can correctly
+             * detect that they should return a result array of False/True.
+             */
+            return raise_binary_type_reso_error(ufunc, operands);
+        }
         /*
-         * DEPRECATED NumPy 1.20, 2020-12.
-         * This check is required to avoid the FutureWarning that
-         * ResultType will give for number->string promotions.
+         * This check is required to avoid a potential FutureWarning that
+         * ResultType would give for number->string promotions.
          * (We never supported flexible dtypes here.)
          */
-        if (!PyArray_ISFLEXIBLE(operands[0]) &&
+        else if (!PyArray_ISFLEXIBLE(operands[0]) &&
                 !PyArray_ISFLEXIBLE(operands[1])) {
             out_dtypes[0] = PyArray_ResultType(2, operands, 0, NULL);
             if (out_dtypes[0] == NULL) {
                 return -1;
             }
-            out_dtypes[1] = out_dtypes[0];
-            Py_INCREF(out_dtypes[1]);
+            if (PyArray_ISINTEGER(operands[0])
+                    && PyArray_ISINTEGER(operands[1])
+                    && !PyDataType_ISINTEGER(out_dtypes[0])) {
+                /*
+                 * NumPy promotion allows uint+int to go to float, avoid it
+                 * (input must have been a mix of signed and unsigned)
+                 */
+                if (PyArray_ISSIGNED(operands[0])) {
+                    Py_SETREF(out_dtypes[0], PyArray_DescrFromType(NPY_LONGLONG));
+                    out_dtypes[1] = PyArray_DescrFromType(NPY_ULONGLONG);
+                    Py_INCREF(out_dtypes[1]);
+                }
+                else {
+                    Py_SETREF(out_dtypes[0], PyArray_DescrFromType(NPY_ULONGLONG));
+                    out_dtypes[1] = PyArray_DescrFromType(NPY_LONGLONG);
+                    Py_INCREF(out_dtypes[1]);
+                }
+            }
+            else {
+                out_dtypes[1] = out_dtypes[0];
+                Py_INCREF(out_dtypes[1]);
+            }
         }
         else {
             /* Not doing anything will lead to a loop no found error. */
@@ -382,64 +413,13 @@ PyUFunc_SimpleBinaryComparisonTypeResolver(PyUFuncObject *ufunc,
         }
     }
     else {
-        PyArray_Descr *descr;
-        /*
-         * DEPRECATED 2021-03, NumPy 1.20
-         *
-         * If the type tuple was originally a single element (probably),
-         * issue a deprecation warning, but otherwise accept it.  Since the
-         * result dtype is always boolean, this is not actually valid unless it
-         * is `object` (but if there is an object input we already deferred).
-         *
-         * TODO: Once this deprecation is gone, the special case for
-         *       `PyUFunc_SimpleBinaryComparisonTypeResolver` in dispatching.c
-         *       can be removed.
-         */
-        if (PyTuple_Check(type_tup) && PyTuple_GET_SIZE(type_tup) == 3 &&
-                PyTuple_GET_ITEM(type_tup, 0) == Py_None &&
-                PyTuple_GET_ITEM(type_tup, 1) == Py_None &&
-                PyArray_DescrCheck(PyTuple_GET_ITEM(type_tup, 2))) {
-            descr = (PyArray_Descr *)PyTuple_GET_ITEM(type_tup, 2);
-            if (descr->type_num == NPY_OBJECT) {
-                if (DEPRECATE_FUTUREWARNING(
-                        "using `dtype=object` (or equivalent signature) will "
-                        "return object arrays in the future also when the "
-                        "inputs do not already have `object` dtype.") < 0) {
-                    return -1;
-                }
-            }
-            else if (descr->type_num != NPY_BOOL) {
-                if (DEPRECATE(
-                        "using `dtype=` in comparisons is only useful for "
-                        "`dtype=object` (and will do nothing for bool). "
-                        "This operation will fail in the future.") < 0) {
-                    return -1;
-                }
-            }
-        }
-        else {
-            /* Usually a failure, but let the default version handle it */
-            return PyUFunc_DefaultTypeResolver(ufunc, casting,
-                    operands, type_tup, out_dtypes);
-        }
-
-        out_dtypes[0] = NPY_DT_CALL_ensure_canonical(descr);
-        if (out_dtypes[0] == NULL) {
-            return -1;
-        }
-        out_dtypes[1] = out_dtypes[0];
-        Py_INCREF(out_dtypes[1]);
+        /* Usually a failure, but let the default version handle it */
+        return PyUFunc_DefaultTypeResolver(ufunc, casting,
+                operands, type_tup, out_dtypes);
     }
 
-    /* Output type is always boolean */
+    /* Output type is always boolean (cannot fail for builtins) */
     out_dtypes[2] = PyArray_DescrFromType(NPY_BOOL);
-    if (out_dtypes[2] == NULL) {
-        for (i = 0; i < 2; ++i) {
-            Py_DECREF(out_dtypes[i]);
-            out_dtypes[i] = NULL;
-        }
-        return -1;
-    }
 
     /* Check against the casting rules */
     if (PyUFunc_ValidateCasting(ufunc, casting, operands, out_dtypes) < 0) {
@@ -694,7 +674,7 @@ PyUFunc_IsNaTTypeResolver(PyUFuncObject *ufunc,
 {
     if (!PyTypeNum_ISDATETIME(PyArray_DESCR(operands[0])->type_num)) {
         PyErr_SetString(PyExc_TypeError,
-                "ufunc 'isnat' is only defined for datetime and timedelta.");
+                "ufunc 'isnat' is only defined for np.datetime64 and np.timedelta64.");
         return -1;
     }
 
diff --git a/numpy/core/src/umath/wrapping_array_method.c b/numpy/core/src/umath/wrapping_array_method.c
index 9f8f036e8..64fea7aeb 100644
--- a/numpy/core/src/umath/wrapping_array_method.c
+++ b/numpy/core/src/umath/wrapping_array_method.c
@@ -177,6 +177,39 @@ wrapping_method_get_loop(
 }
 
 
+/*
+ * Wraps the original identity function, needs to translate the descriptors
+ * back to the original ones and provide an "original" context (identically to
+ * `get_loop`).
+ * We assume again that translating the descriptors is quick.
+ */
+static int
+wrapping_method_get_identity_function(
+        PyArrayMethod_Context *context, npy_bool reduction_is_empty,
+        char *item)
+{
+    /* Copy the context, and replace descriptors: */
+    PyArrayMethod_Context orig_context = *context;
+    PyArray_Descr *orig_descrs[NPY_MAXARGS];
+    orig_context.descriptors = orig_descrs;
+    orig_context.method = context->method->wrapped_meth;
+
+    int nin = context->method->nin, nout = context->method->nout;
+    PyArray_DTypeMeta **dtypes = context->method->wrapped_dtypes;
+
+    if (context->method->translate_given_descrs(
+            nin, nout, dtypes, context->descriptors, orig_descrs) < 0) {
+        return -1;
+    }
+    int res = context->method->wrapped_meth->get_reduction_initial(
+            &orig_context, reduction_is_empty, item);
+    for (int i = 0; i < nin + nout; i++) {
+        Py_DECREF(orig_descrs);
+    }
+    return res;
+}
+
+
 /**
  * Allows creating of a fairly lightweight wrapper around an existing ufunc
  * loop.  The idea is mainly for units, as this is currently slightly limited
@@ -235,14 +268,17 @@ PyUFunc_AddWrappingLoop(PyObject *ufunc_obj,
         break;
     }
     if (wrapped_meth == NULL) {
-        PyErr_SetString(PyExc_TypeError,
-                "Did not find the to-be-wrapped loop in the ufunc.");
+        PyErr_Format(PyExc_TypeError,
+                "Did not find the to-be-wrapped loop in the ufunc with given "
+                "DTypes. Received wrapping types: %S", wrapped_dt_tuple);
         goto finish;
     }
 
     PyType_Slot slots[] = {
         {NPY_METH_resolve_descriptors, &wrapping_method_resolve_descriptors},
-        {NPY_METH_get_loop, &wrapping_method_get_loop},
+        {_NPY_METH_get_loop, &wrapping_method_get_loop},
+        {NPY_METH_get_reduction_initial,
+            &wrapping_method_get_identity_function},
         {0, NULL}
     };
 
diff --git a/numpy/core/tests/_locales.py b/numpy/core/tests/_locales.py
index ce7b81f00..b1dc55a9b 100644
--- a/numpy/core/tests/_locales.py
+++ b/numpy/core/tests/_locales.py
@@ -57,12 +57,12 @@ class CommaDecimalPointLocale:
     """
     (cur_locale, tst_locale) = find_comma_decimal_point_locale()
 
-    def setup(self):
+    def setup_method(self):
         if self.tst_locale is None:
             pytest.skip("No French locale available")
         locale.setlocale(locale.LC_NUMERIC, locale=self.tst_locale)
 
-    def teardown(self):
+    def teardown_method(self):
         locale.setlocale(locale.LC_NUMERIC, locale=self.cur_locale)
 
     def __enter__(self):
diff --git a/numpy/core/tests/test_api.py b/numpy/core/tests/test_api.py
index 5006e77f2..b9f2f8ae9 100644
--- a/numpy/core/tests/test_api.py
+++ b/numpy/core/tests/test_api.py
@@ -102,6 +102,16 @@ def test_array_array():
     assert_raises(ValueError, np.array, [nested], dtype=np.float64)
 
     # Try with lists...
+    # float32
+    assert_equal(np.array([None] * 10, dtype=np.float32),
+                 np.full((10,), np.nan, dtype=np.float32))
+    assert_equal(np.array([[None]] * 10, dtype=np.float32),
+                 np.full((10, 1), np.nan, dtype=np.float32))
+    assert_equal(np.array([[None] * 10], dtype=np.float32),
+                 np.full((1, 10), np.nan, dtype=np.float32))
+    assert_equal(np.array([[None] * 10] * 10, dtype=np.float32),
+                 np.full((10, 10), np.nan, dtype=np.float32))
+    # float64
     assert_equal(np.array([None] * 10, dtype=np.float64),
                  np.full((10,), np.nan, dtype=np.float64))
     assert_equal(np.array([[None]] * 10, dtype=np.float64),
@@ -310,7 +320,7 @@ def test_array_astype_warning(t):
 
 @pytest.mark.parametrize(["dtype", "out_dtype"],
         [(np.bytes_, np.bool_),
-         (np.unicode_, np.bool_),
+         (np.str_, np.bool_),
          (np.dtype("S10,S9"), np.dtype("?,?"))])
 def test_string_to_boolean_cast(dtype, out_dtype):
     """
@@ -324,7 +334,7 @@ def test_string_to_boolean_cast(dtype, out_dtype):
 
 @pytest.mark.parametrize(["dtype", "out_dtype"],
         [(np.bytes_, np.bool_),
-         (np.unicode_, np.bool_),
+         (np.str_, np.bool_),
          (np.dtype("S10,S9"), np.dtype("?,?"))])
 def test_string_to_boolean_cast_errors(dtype, out_dtype):
     """
diff --git a/numpy/core/tests/test_argparse.py b/numpy/core/tests/test_argparse.py
index 63a01dee4..fae227027 100644
--- a/numpy/core/tests/test_argparse.py
+++ b/numpy/core/tests/test_argparse.py
@@ -53,8 +53,8 @@ def test_multiple_values():
 def test_string_fallbacks():
     # We can (currently?) use numpy strings to test the "slow" fallbacks
     # that should normally not be taken due to string interning.
-    arg2 = np.unicode_("arg2")
-    missing_arg = np.unicode_("missing_arg")
+    arg2 = np.str_("arg2")
+    missing_arg = np.str_("missing_arg")
     func(1, **{arg2: 3})
     with pytest.raises(TypeError,
             match="got an unexpected keyword argument 'missing_arg'"):
diff --git a/numpy/core/tests/test_array_coercion.py b/numpy/core/tests/test_array_coercion.py
index fade57292..d5373f642 100644
--- a/numpy/core/tests/test_array_coercion.py
+++ b/numpy/core/tests/test_array_coercion.py
@@ -1,6 +1,6 @@
 """
 Tests for array coercion, mainly through testing `np.array` results directly.
-Note that other such tests exist e.g. in `test_api.py` and many corner-cases
+Note that other such tests exist, e.g., in `test_api.py` and many corner-cases
 are tested (sometimes indirectly) elsewhere.
 """
 
@@ -20,8 +20,8 @@ from numpy.testing import (
 def arraylikes():
     """
     Generator for functions converting an array into various array-likes.
-    If full is True (default) includes array-likes not capable of handling
-    all dtypes
+    If full is True (default) it includes array-likes not capable of handling
+    all dtypes.
     """
     # base array:
     def ndarray(a):
@@ -39,9 +39,9 @@ def arraylikes():
     yield subclass
 
     class _SequenceLike():
-        # We are giving a warning that array-like's were also expected to be
-        # sequence-like in `np.array([array_like])`, this can be removed
-        # when the deprecation exired (started NumPy 1.20)
+        # Older NumPy versions, sometimes cared whether a protocol array was
+        # also _SequenceLike.  This shouldn't matter, but keep it for now
+        # for __array__ and not the others.
         def __len__(self):
             raise TypeError
 
@@ -62,7 +62,7 @@ def arraylikes():
     yield param(memoryview, id="memoryview")
 
     # Array-interface
-    class ArrayInterface(_SequenceLike):
+    class ArrayInterface:
         def __init__(self, a):
             self.a = a  # need to hold on to keep interface valid
             self.__array_interface__ = a.__array_interface__
@@ -70,7 +70,7 @@ def arraylikes():
     yield param(ArrayInterface, id="__array_interface__")
 
     # Array-Struct
-    class ArrayStruct(_SequenceLike):
+    class ArrayStruct:
         def __init__(self, a):
             self.a = a  # need to hold on to keep struct valid
             self.__array_struct__ = a.__array_struct__
@@ -130,7 +130,7 @@ def scalar_instances(times=True, extended_precision=True, user_dtype=True):
 
     # Strings and unstructured void:
     yield param(np.bytes_(b"1234"), id="bytes")
-    yield param(np.unicode_("2345"), id="unicode")
+    yield param(np.str_("2345"), id="unicode")
     yield param(np.void(b"4321"), id="unstructured_void")
 
 
@@ -161,8 +161,12 @@ class TestStringDiscovery:
         # A nested array is also discovered correctly
         arr = np.array(obj, dtype="O")
         assert np.array(arr, dtype="S").dtype == expected
+        # Also if we use the dtype class
+        assert np.array(arr, dtype=type(expected)).dtype == expected
         # Check that .astype() behaves identical
         assert arr.astype("S").dtype == expected
+        # The DType class is accepted by `.astype()`
+        assert arr.astype(type(np.dtype("S"))).dtype == expected
 
     @pytest.mark.parametrize("obj",
             [object(), 1.2, 10**43, None, "string"],
@@ -257,7 +261,7 @@ class TestScalarDiscovery:
     @pytest.mark.parametrize("scalar", scalar_instances())
     def test_scalar_coercion(self, scalar):
         # This tests various scalar coercion paths, mainly for the numerical
-        # types.  It includes some paths not directly related to `np.array`
+        # types. It includes some paths not directly related to `np.array`.
         if isinstance(scalar, np.inexact):
             # Ensure we have a full-precision number if available
             scalar = type(scalar)((scalar * 2)**0.5)
@@ -292,7 +296,7 @@ class TestScalarDiscovery:
            * `np.array(scalar, dtype=dtype)`
            * `np.empty((), dtype=dtype)[()] = scalar`
            * `np.array(scalar).astype(dtype)`
-        should behave the same.  The only exceptions are paramteric dtypes
+        should behave the same.  The only exceptions are parametric dtypes
         (mainly datetime/timedelta without unit) and void without fields.
         """
         dtype = cast_to.dtype  # use to parametrize only the target dtype
@@ -384,7 +388,7 @@ class TestScalarDiscovery:
         """
         dtype = np.dtype(dtype)
 
-        # This is a special case using casting logic.  It warns for the NaN
+        # This is a special case using casting logic. It warns for the NaN
         # but allows the cast (giving undefined behaviour).
         with np.errstate(invalid="ignore"):
             coerced = np.array(scalar, dtype=dtype)
@@ -650,6 +654,14 @@ class TestArrayLikes:
         res = np.array([obj], dtype=object)
         assert res[0] is obj
 
+    @pytest.mark.parametrize("arraylike", arraylikes())
+    @pytest.mark.parametrize("arr", [np.array(0.), np.arange(4)])
+    def test_object_assignment_special_case(self, arraylike, arr):
+        obj = arraylike(arr)
+        empty = np.arange(1, dtype=object)
+        empty[:] = [obj]
+        assert empty[0] is obj
+
     def test_0d_generic_special_case(self):
         class ArraySubclass(np.ndarray):
             def __float__(self):
@@ -833,3 +845,28 @@ class TestSpecialAttributeLookupFailure:
             np.array(self.WeirdArrayLike())
         with pytest.raises(RuntimeError):
             np.array(self.WeirdArrayInterface())
+
+
+def test_subarray_from_array_construction():
+    # Arrays are more complex, since they "broadcast" on success:
+    arr = np.array([1, 2])
+
+    res = arr.astype("(2)i,")
+    assert_array_equal(res, [[1, 1], [2, 2]])
+
+    res = np.array(arr, dtype="(2)i,")
+
+    assert_array_equal(res, [[1, 1], [2, 2]])
+
+    res = np.array([[(1,), (2,)], arr], dtype="(2)i,")
+    assert_array_equal(res, [[[1, 1], [2, 2]], [[1, 1], [2, 2]]])
+
+    # Also try a multi-dimensional example:
+    arr = np.arange(5 * 2).reshape(5, 2)
+    expected = np.broadcast_to(arr[:, :, np.newaxis, np.newaxis], (5, 2, 2, 2))
+
+    res = arr.astype("(2,2)f")
+    assert_array_equal(res, expected)
+
+    res = np.array(arr, dtype="(2,2)f")
+    assert_array_equal(res, expected)
diff --git a/numpy/core/tests/test_arraymethod.py b/numpy/core/tests/test_arraymethod.py
index 6b75d1921..4fd4d5558 100644
--- a/numpy/core/tests/test_arraymethod.py
+++ b/numpy/core/tests/test_arraymethod.py
@@ -64,7 +64,6 @@ class TestSimpleStridedCall:
             self.method._simple_strided_call(*args)
 
 
-@pytest.mark.skipif(sys.version_info < (3, 9), reason="Requires python 3.9")
 @pytest.mark.parametrize(
     "cls", [np.ndarray, np.recarray, np.chararray, np.matrix, np.memmap]
 )
@@ -84,10 +83,3 @@ class TestClassGetItem:
             match = f"Too {'few' if arg_len == 0 else 'many'} arguments"
             with pytest.raises(TypeError, match=match):
                 cls[arg_tup]
-
-
-@pytest.mark.skipif(sys.version_info >= (3, 9), reason="Requires python 3.8")
-def test_class_getitem_38() -> None:
-    match = "Type subscription requires python >= 3.9"
-    with pytest.raises(TypeError, match=match):
-        np.ndarray[Any, Any]
diff --git a/numpy/core/tests/test_arrayprint.py b/numpy/core/tests/test_arrayprint.py
index f1883f703..6796b4077 100644
--- a/numpy/core/tests/test_arrayprint.py
+++ b/numpy/core/tests/test_arrayprint.py
@@ -9,6 +9,7 @@ from numpy.testing import (
     assert_, assert_equal, assert_raises, assert_warns, HAS_REFCOUNT,
     assert_raises_regex,
     )
+from numpy.core.arrayprint import _typelessdata
 import textwrap
 
 class TestArrayRepr:
@@ -257,8 +258,7 @@ class TestArray2String:
         assert_(np.array2string(s, formatter={'numpystr':lambda s: s*2}) ==
                 '[abcabc defdef]')
 
-
-    def test_structure_format(self):
+    def test_structure_format_mixed(self):
         dt = np.dtype([('name', np.str_, 16), ('grades', np.float64, (2,))])
         x = np.array([('Sarah', (8.0, 7.0)), ('John', (6.0, 7.0))], dtype=dt)
         assert_equal(np.array2string(x),
@@ -300,6 +300,7 @@ class TestArray2String:
              ( 'NaT',) ( 'NaT',) ( 'NaT',)]""")
         )
 
+    def test_structure_format_int(self):
         # See #8160
         struct_int = np.array([([1, -1],), ([123, 1],)], dtype=[('B', 'i4', 2)])
         assert_equal(np.array2string(struct_int),
@@ -309,6 +310,7 @@ class TestArray2String:
         assert_equal(np.array2string(struct_2dint),
                 "[([[ 0,  1], [ 2,  3]],) ([[12,  0], [ 0,  0]],)]")
 
+    def test_structure_format_float(self):
         # See #8172
         array_scalar = np.array(
                 (1., 2.1234567890123456789, 3.), dtype=('f8,f8,f8'))
@@ -352,6 +354,33 @@ class TestArray2String:
                 '       [ 501,  502,  503, ...,  999, 1000, 1001]])'
         assert_equal(repr(A), reprA)
 
+    def test_summarize_structure(self):
+        A = (np.arange(2002, dtype="<i8").reshape(2, 1001)
+             .view([('i', "<i8", (1001,))]))
+        strA = ("[[([   0,    1,    2, ...,  998,  999, 1000],)]\n"
+                " [([1001, 1002, 1003, ..., 1999, 2000, 2001],)]]")
+        assert_equal(str(A), strA)
+
+        reprA = ("array([[([   0,    1,    2, ...,  998,  999, 1000],)],\n"
+                 "       [([1001, 1002, 1003, ..., 1999, 2000, 2001],)]],\n"
+                 "      dtype=[('i', '<i8', (1001,))])")
+        assert_equal(repr(A), reprA)
+
+        B = np.ones(2002, dtype=">i8").view([('i', ">i8", (2, 1001))])
+        strB = "[([[1, 1, 1, ..., 1, 1, 1], [1, 1, 1, ..., 1, 1, 1]],)]"
+        assert_equal(str(B), strB)
+
+        reprB = (
+            "array([([[1, 1, 1, ..., 1, 1, 1], [1, 1, 1, ..., 1, 1, 1]],)],\n"
+            "      dtype=[('i', '>i8', (2, 1001))])"
+        )
+        assert_equal(repr(B), reprB)
+
+        C = (np.arange(22, dtype="<i8").reshape(2, 11)
+             .view([('i1', "<i8"), ('i10', "<i8", (10,))]))
+        strC = "[[( 0, [ 1, ..., 10])]\n [(11, [12, ..., 21])]]"
+        assert_equal(np.array2string(C, threshold=1, edgeitems=1), strC)
+
     def test_linewidth(self):
         a = np.full(6, 1)
 
@@ -796,6 +825,47 @@ class TestPrintOptions:
             array(['1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1'],
                   dtype='{}')""".format(styp)))
 
+    @pytest.mark.parametrize(
+        ['native'],
+        [
+            ('bool',),
+            ('uint8',),
+            ('uint16',),
+            ('uint32',),
+            ('uint64',),
+            ('int8',),
+            ('int16',),
+            ('int32',),
+            ('int64',),
+            ('float16',),
+            ('float32',),
+            ('float64',),
+            ('U1',),     # 4-byte width string
+        ],
+    )
+    def test_dtype_endianness_repr(self, native):
+        '''
+        there was an issue where
+        repr(array([0], dtype='<u2')) and repr(array([0], dtype='>u2'))
+        both returned the same thing:
+        array([0], dtype=uint16)
+        even though their dtypes have different endianness.
+        '''
+        native_dtype = np.dtype(native)
+        non_native_dtype = native_dtype.newbyteorder()
+        non_native_repr = repr(np.array([1], non_native_dtype))
+        native_repr = repr(np.array([1], native_dtype))
+        # preserve the sensible default of only showing dtype if nonstandard
+        assert ('dtype' in native_repr) ^ (native_dtype in _typelessdata),\
+                ("an array's repr should show dtype if and only if the type "
+                 'of the array is NOT one of the standard types '
+                 '(e.g., int32, bool, float64).')
+        if non_native_dtype.itemsize > 1:
+            # if the type is >1 byte, the non-native endian version
+            # must show endianness.
+            assert non_native_repr != native_repr
+            assert f"dtype='{non_native_dtype.byteorder}" in non_native_repr
+
     def test_linewidth_repr(self):
         a = np.full(7, fill_value=2)
         np.set_printoptions(linewidth=17)
@@ -921,6 +991,16 @@ class TestPrintOptions:
                     [[ 0.]]]])""")
         )
 
+    def test_edgeitems_structured(self):
+        np.set_printoptions(edgeitems=1, threshold=1)
+        A = np.arange(5*2*3, dtype="<i8").view([('i', "<i8", (5, 2, 3))])
+        reprA = (
+            "array([([[[ 0, ...,  2], [ 3, ...,  5]], ..., "
+            "[[24, ..., 26], [27, ..., 29]]],)],\n"
+            "      dtype=[('i', '<i8', (5, 2, 3))])"
+        )
+        assert_equal(repr(A), reprA)
+
     def test_bad_args(self):
         assert_raises(ValueError, np.set_printoptions, threshold=float('nan'))
         assert_raises(TypeError, np.set_printoptions, threshold='1')
diff --git a/numpy/core/tests/test_casting_floatingpoint_errors.py b/numpy/core/tests/test_casting_floatingpoint_errors.py
index 4fafc4ed8..d83180171 100644
--- a/numpy/core/tests/test_casting_floatingpoint_errors.py
+++ b/numpy/core/tests/test_casting_floatingpoint_errors.py
@@ -1,6 +1,6 @@
 import pytest
 from pytest import param
-
+from numpy.testing import IS_WASM
 import numpy as np
 
 
@@ -136,6 +136,7 @@ def check_operations(dtype, value):
 
     yield flat_assignment
 
+@pytest.mark.skipif(IS_WASM, reason="no wasm fp exception support")
 @pytest.mark.parametrize(["value", "dtype"], values_and_dtypes())
 @pytest.mark.filterwarnings("ignore::numpy.ComplexWarning")
 def test_floatingpoint_errors_casting(dtype, value):
diff --git a/numpy/core/tests/test_cpu_features.py b/numpy/core/tests/test_cpu_features.py
index 1a76897e2..2fad4dfd9 100644
--- a/numpy/core/tests/test_cpu_features.py
+++ b/numpy/core/tests/test_cpu_features.py
@@ -1,5 +1,14 @@
 import sys, platform, re, pytest
-from numpy.core._multiarray_umath import __cpu_features__
+from numpy.core._multiarray_umath import (
+    __cpu_features__,
+    __cpu_baseline__,
+    __cpu_dispatch__,
+)
+import numpy as np
+import subprocess
+import pathlib
+import os
+import re
 
 def assert_features_equal(actual, desired, fname):
     __tracebackhide__ = True  # Hide traceback for py.test
@@ -8,7 +17,7 @@ def assert_features_equal(actual, desired, fname):
         return
     detected = str(__cpu_features__).replace("'", "")
     try:
-        with open("/proc/cpuinfo", "r") as fd:
+        with open("/proc/cpuinfo") as fd:
             cpuinfo = fd.read(2048)
     except Exception as err:
         cpuinfo = str(err)
@@ -48,6 +57,10 @@ def assert_features_equal(actual, desired, fname):
         "%s"
     ) % (fname, actual, desired, error_report))
 
+def _text_to_list(txt):
+    out = txt.strip("][\n").replace("'", "").split(', ')
+    return None if out[0] == "" else out
+
 class AbstractTest:
     features = []
     features_groups = {}
@@ -92,7 +105,6 @@ class AbstractTest:
         return values
 
     def load_flags_auxv(self):
-        import subprocess
         auxv = subprocess.check_output(['/bin/true'], env=dict(LD_SHOW_AUXV="1"))
         for at in auxv.split(b'\n'):
             if not at.startswith(b"AT_HWCAP"):
@@ -103,6 +115,208 @@ class AbstractTest:
                     hwcap_value[1].upper().decode().split()
                 )
 
+@pytest.mark.skipif(
+    sys.platform == 'emscripten',
+    reason= (
+        "The subprocess module is not available on WASM platforms and"
+        " therefore this test class cannot be properly executed."
+    ),
+)
+class TestEnvPrivation:
+    cwd = pathlib.Path(__file__).parent.resolve()
+    env = os.environ.copy()
+    _enable = os.environ.pop('NPY_ENABLE_CPU_FEATURES', None)
+    _disable = os.environ.pop('NPY_DISABLE_CPU_FEATURES', None)
+    SUBPROCESS_ARGS = dict(cwd=cwd, capture_output=True, text=True, check=True)
+    unavailable_feats = [
+        feat for feat in __cpu_dispatch__ if not __cpu_features__[feat]
+    ]
+    UNAVAILABLE_FEAT = (
+        None if len(unavailable_feats) == 0
+        else unavailable_feats[0]
+    )
+    BASELINE_FEAT = None if len(__cpu_baseline__) == 0 else __cpu_baseline__[0]
+    SCRIPT = """
+def main():
+    from numpy.core._multiarray_umath import __cpu_features__, __cpu_dispatch__
+
+    detected = [feat for feat in __cpu_dispatch__ if __cpu_features__[feat]]
+    print(detected)
+
+if __name__ == "__main__":
+    main()
+    """
+
+    @pytest.fixture(autouse=True)
+    def setup_class(self, tmp_path_factory):
+        file = tmp_path_factory.mktemp("runtime_test_script")
+        file /= "_runtime_detect.py"
+        file.write_text(self.SCRIPT)
+        self.file = file
+        return
+
+    def _run(self):
+        return subprocess.run(
+            [sys.executable, self.file],
+            env=self.env,
+            **self.SUBPROCESS_ARGS,
+            )
+
+    # Helper function mimicing pytest.raises for subprocess call
+    def _expect_error(
+        self,
+        msg,
+        err_type,
+        no_error_msg="Failed to generate error"
+    ):
+        try:
+            self._run()
+        except subprocess.CalledProcessError as e:
+            assertion_message = f"Expected: {msg}\nGot: {e.stderr}"
+            assert re.search(msg, e.stderr), assertion_message
+
+            assertion_message = (
+                f"Expected error of type: {err_type}; see full "
+                f"error:\n{e.stderr}"
+            )
+            assert re.search(err_type, e.stderr), assertion_message
+        else:
+            assert False, no_error_msg
+
+    def setup_method(self):
+        """Ensure that the environment is reset"""
+        self.env = os.environ.copy()
+        return
+
+    def test_runtime_feature_selection(self):
+        """
+        Ensure that when selecting `NPY_ENABLE_CPU_FEATURES`, only the
+        features exactly specified are dispatched.
+        """
+
+        # Capture runtime-enabled features
+        out = self._run()
+        non_baseline_features = _text_to_list(out.stdout)
+
+        if non_baseline_features is None:
+            pytest.skip(
+                "No dispatchable features outside of baseline detected."
+            )
+        feature = non_baseline_features[0]
+
+        # Capture runtime-enabled features when `NPY_ENABLE_CPU_FEATURES` is
+        # specified
+        self.env['NPY_ENABLE_CPU_FEATURES'] = feature
+        out = self._run()
+        enabled_features = _text_to_list(out.stdout)
+
+        # Ensure that only one feature is enabled, and it is exactly the one
+        # specified by `NPY_ENABLE_CPU_FEATURES`
+        assert set(enabled_features) == {feature}
+
+        if len(non_baseline_features) < 2:
+            pytest.skip("Only one non-baseline feature detected.")
+        # Capture runtime-enabled features when `NPY_ENABLE_CPU_FEATURES` is
+        # specified
+        self.env['NPY_ENABLE_CPU_FEATURES'] = ",".join(non_baseline_features)
+        out = self._run()
+        enabled_features = _text_to_list(out.stdout)
+
+        # Ensure that both features are enabled, and they are exactly the ones
+        # specified by `NPY_ENABLE_CPU_FEATURES`
+        assert set(enabled_features) == set(non_baseline_features)
+        return
+
+    @pytest.mark.parametrize("enabled, disabled",
+    [
+        ("feature", "feature"),
+        ("feature", "same"),
+    ])
+    def test_both_enable_disable_set(self, enabled, disabled):
+        """
+        Ensure that when both environment variables are set then an
+        ImportError is thrown
+        """
+        self.env['NPY_ENABLE_CPU_FEATURES'] = enabled
+        self.env['NPY_DISABLE_CPU_FEATURES'] = disabled
+        msg = "Both NPY_DISABLE_CPU_FEATURES and NPY_ENABLE_CPU_FEATURES"
+        err_type = "ImportError"
+        self._expect_error(msg, err_type)
+
+    @pytest.mark.skipif(
+        not __cpu_dispatch__,
+        reason=(
+            "NPY_*_CPU_FEATURES only parsed if "
+            "`__cpu_dispatch__` is non-empty"
+        )
+    )
+    @pytest.mark.parametrize("action", ["ENABLE", "DISABLE"])
+    def test_variable_too_long(self, action):
+        """
+        Test that an error is thrown if the environment variables are too long
+        to be processed. Current limit is 1024, but this may change later.
+        """
+        MAX_VAR_LENGTH = 1024
+        # Actual length is MAX_VAR_LENGTH + 1 due to null-termination
+        self.env[f'NPY_{action}_CPU_FEATURES'] = "t" * MAX_VAR_LENGTH
+        msg = (
+            f"Length of environment variable 'NPY_{action}_CPU_FEATURES' is "
+            f"{MAX_VAR_LENGTH + 1}, only {MAX_VAR_LENGTH} accepted"
+        )
+        err_type = "RuntimeError"
+        self._expect_error(msg, err_type)
+
+    @pytest.mark.skipif(
+        not __cpu_dispatch__,
+        reason=(
+            "NPY_*_CPU_FEATURES only parsed if "
+            "`__cpu_dispatch__` is non-empty"
+        )
+    )
+    def test_impossible_feature_disable(self):
+        """
+        Test that a RuntimeError is thrown if an impossible feature-disabling
+        request is made. This includes disabling a baseline feature.
+        """
+
+        if self.BASELINE_FEAT is None:
+            pytest.skip("There are no unavailable features to test with")
+        bad_feature = self.BASELINE_FEAT
+        self.env['NPY_DISABLE_CPU_FEATURES'] = bad_feature
+        msg = (
+            f"You cannot disable CPU feature '{bad_feature}', since it is "
+            "part of the baseline optimizations"
+        )
+        err_type = "RuntimeError"
+        self._expect_error(msg, err_type)
+
+    def test_impossible_feature_enable(self):
+        """
+        Test that a RuntimeError is thrown if an impossible feature-enabling
+        request is made. This includes enabling a feature not supported by the
+        machine, or disabling a baseline optimization.
+        """
+
+        if self.UNAVAILABLE_FEAT is None:
+            pytest.skip("There are no unavailable features to test with")
+        bad_feature = self.UNAVAILABLE_FEAT
+        self.env['NPY_ENABLE_CPU_FEATURES'] = bad_feature
+        msg = (
+            f"You cannot enable CPU features \\({bad_feature}\\), since "
+            "they are not supported by your machine."
+        )
+        err_type = "RuntimeError"
+        self._expect_error(msg, err_type)
+
+        # Ensure that only the bad feature gets reported
+        feats = f"{bad_feature}, {self.BASELINE_FEAT}"
+        self.env['NPY_ENABLE_CPU_FEATURES'] = feats
+        msg = (
+            f"You cannot enable CPU features \\({bad_feature}\\), since they "
+            "are not supported by your machine."
+        )
+        self._expect_error(msg, err_type)
+
 is_linux = sys.platform.startswith('linux')
 is_cygwin = sys.platform.startswith('cygwin')
 machine  = platform.machine()
@@ -116,7 +330,7 @@ class Test_X86_Features(AbstractTest):
         "AVX", "F16C", "XOP", "FMA4", "FMA3", "AVX2", "AVX512F", "AVX512CD",
         "AVX512ER", "AVX512PF", "AVX5124FMAPS", "AVX5124VNNIW", "AVX512VPOPCNTDQ",
         "AVX512VL", "AVX512BW", "AVX512DQ", "AVX512VNNI", "AVX512IFMA",
-        "AVX512VBMI", "AVX512VBMI2", "AVX512BITALG",
+        "AVX512VBMI", "AVX512VBMI2", "AVX512BITALG", "AVX512FP16",
     ]
     features_groups = dict(
         AVX512_KNL = ["AVX512F", "AVX512CD", "AVX512ER", "AVX512PF"],
@@ -128,6 +342,10 @@ class Test_X86_Features(AbstractTest):
                       "AVX512VBMI"],
         AVX512_ICL = ["AVX512F", "AVX512CD", "AVX512BW", "AVX512DQ", "AVX512VL", "AVX512IFMA",
                       "AVX512VBMI", "AVX512VNNI", "AVX512VBMI2", "AVX512BITALG", "AVX512VPOPCNTDQ"],
+        AVX512_SPR = ["AVX512F", "AVX512CD", "AVX512BW", "AVX512DQ",
+                      "AVX512VL", "AVX512IFMA", "AVX512VBMI", "AVX512VNNI",
+                      "AVX512VBMI2", "AVX512BITALG", "AVX512VPOPCNTDQ",
+                      "AVX512FP16"],
     )
     features_map = dict(
         SSE3="PNI", SSE41="SSE4_1", SSE42="SSE4_2", FMA3="FMA",
diff --git a/numpy/core/tests/test_custom_dtypes.py b/numpy/core/tests/test_custom_dtypes.py
index 6bcc45d6b..da6a4bd50 100644
--- a/numpy/core/tests/test_custom_dtypes.py
+++ b/numpy/core/tests/test_custom_dtypes.py
@@ -45,6 +45,9 @@ class TestSFloat:
         # Check the repr, mainly to cover the code paths:
         assert repr(SF(scaling=1.)) == "_ScaledFloatTestDType(scaling=1.0)"
 
+    def test_dtype_name(self):
+        assert SF(1.).name == "_ScaledFloatTestDType64"
+
     @pytest.mark.parametrize("scaling", [1., -1., 2.])
     def test_sfloat_from_float(self, scaling):
         a = np.array([1., 2., 3.]).astype(dtype=SF(scaling))
@@ -199,3 +202,52 @@ class TestSFloat:
         # The output casting does not match the bool, bool -> bool loop:
         with pytest.raises(TypeError):
             ufunc(a, a, out=np.empty(a.shape, dtype=int), casting="equiv")
+
+    def test_wrapped_and_wrapped_reductions(self):
+        a = self._get_array(2.)
+        float_equiv = a.astype(float)
+
+        expected = np.hypot(float_equiv, float_equiv)
+        res = np.hypot(a, a)
+        assert res.dtype == a.dtype
+        res_float = res.view(np.float64) * 2
+        assert_array_equal(res_float, expected)
+
+        # Also check reduction (keepdims, due to incorrect getitem)
+        res = np.hypot.reduce(a, keepdims=True)
+        assert res.dtype == a.dtype
+        expected = np.hypot.reduce(float_equiv, keepdims=True)
+        assert res.view(np.float64) * 2 == expected
+
+    def test_astype_class(self):
+        # Very simple test that we accept `.astype()` also on the class.
+        # ScaledFloat always returns the default descriptor, but it does
+        # check the relevant code paths.
+        arr = np.array([1., 2., 3.], dtype=object)
+
+        res = arr.astype(SF)  # passing the class class
+        expected = arr.astype(SF(1.))  # above will have discovered 1. scaling
+        assert_array_equal(res.view(np.float64), expected.view(np.float64))
+
+    def test_creation_class(self):
+        arr1 = np.array([1., 2., 3.], dtype=SF)
+        assert arr1.dtype == SF(1.)
+        arr2 = np.array([1., 2., 3.], dtype=SF(1.))
+        assert_array_equal(arr1.view(np.float64), arr2.view(np.float64))
+
+
+def test_type_pickle():
+    # can't actually unpickle, but we can pickle (if in namespace)
+    import pickle
+
+    np._ScaledFloatTestDType = SF
+
+    s = pickle.dumps(SF)
+    res = pickle.loads(s)
+    assert res is SF
+
+    del np._ScaledFloatTestDType
+
+
+def test_is_numeric():
+    assert SF._is_numeric
diff --git a/numpy/core/tests/test_cython.py b/numpy/core/tests/test_cython.py
index a31d9460e..e916adceb 100644
--- a/numpy/core/tests/test_cython.py
+++ b/numpy/core/tests/test_cython.py
@@ -5,6 +5,7 @@ import sys
 import pytest
 
 import numpy as np
+from numpy.testing import IS_WASM
 
 # This import is copied from random.tests.test_extending
 try:
@@ -13,7 +14,7 @@ try:
 except ImportError:
     cython = None
 else:
-    from numpy.compat import _pep440
+    from numpy._utils import _pep440
 
     # Cython 0.29.30 is required for Python 3.11 and there are
     # other fixes in the 0.29 series that are needed even for earlier
@@ -30,6 +31,8 @@ pytestmark = pytest.mark.skipif(cython is None, reason="requires cython")
 @pytest.fixture
 def install_temp(request, tmp_path):
     # Based in part on test_cython from random.tests.test_extending
+    if IS_WASM:
+        pytest.skip("No subprocess")
 
     here = os.path.dirname(__file__)
     ext_dir = os.path.join(here, "examples", "cython")
diff --git a/numpy/core/tests/test_datetime.py b/numpy/core/tests/test_datetime.py
index baae77a35..547ebf9d6 100644
--- a/numpy/core/tests/test_datetime.py
+++ b/numpy/core/tests/test_datetime.py
@@ -4,6 +4,7 @@ import numpy as np
 import datetime
 import pytest
 from numpy.testing import (
+    IS_WASM,
     assert_, assert_equal, assert_raises, assert_warns, suppress_warnings,
     assert_raises_regex, assert_array_equal,
     )
@@ -718,8 +719,8 @@ class TestDateTime:
         assert_equal(uni_a, uni_b)
 
         # Datetime to long string - gh-9712
-        assert_equal(str_a, dt_a.astype((np.string_, 128)))
-        str_b = np.empty(str_a.shape, dtype=(np.string_, 128))
+        assert_equal(str_a, dt_a.astype((np.bytes_, 128)))
+        str_b = np.empty(str_a.shape, dtype=(np.bytes_, 128))
         str_b[...] = dt_a
         assert_equal(str_a, str_b)
 
@@ -1002,12 +1003,11 @@ class TestDateTime:
                      casting='unsafe'))
 
         # Shouldn't be able to compare datetime and timedelta
-        # TODO: Changing to 'same_kind' or 'safe' casting in the ufuncs by
-        #       default is needed to properly catch this kind of thing...
         a = np.array('2012-12-21', dtype='M8[D]')
         b = np.array(3, dtype='m8[D]')
-        #assert_raises(TypeError, np.less, a, b)
-        assert_raises(TypeError, np.less, a, b, casting='same_kind')
+        assert_raises(TypeError, np.less, a, b)
+        # not even if "unsafe"
+        assert_raises(TypeError, np.less, a, b, casting='unsafe')
 
     def test_datetime_like(self):
         a = np.array([3], dtype='m8[4D]')
@@ -1294,6 +1294,7 @@ class TestDateTime:
     def test_timedelta_floor_divide(self, op1, op2, exp):
         assert_equal(op1 // op2, exp)
 
+    @pytest.mark.skipif(IS_WASM, reason="fp errors don't work in wasm")
     @pytest.mark.parametrize("op1, op2", [
         # div by 0
         (np.timedelta64(10, 'us'),
@@ -1368,6 +1369,7 @@ class TestDateTime:
         expected = (op1 // op2, op1 % op2)
         assert_equal(divmod(op1, op2), expected)
 
+    @pytest.mark.skipif(IS_WASM, reason="does not work in wasm")
     @pytest.mark.parametrize("op1, op2", [
         # reuse cases from floordiv
         # div by 0
@@ -1993,6 +1995,7 @@ class TestDateTime:
         with assert_raises_regex(TypeError, "common metadata divisor"):
             val1 % val2
 
+    @pytest.mark.skipif(IS_WASM, reason="fp errors don't work in wasm")
     def test_timedelta_modulus_div_by_zero(self):
         with assert_warns(RuntimeWarning):
             actual = np.timedelta64(10, 's') % np.timedelta64(0, 's')
@@ -2327,16 +2330,23 @@ class TestDateTime:
         assert_equal(np.busday_count('2011-01-01', dates, busdaycal=bdd),
                      np.arange(366))
         # Returns negative value when reversed
+        # -1 since the '2011-01-01' is not a busday
         assert_equal(np.busday_count(dates, '2011-01-01', busdaycal=bdd),
-                     -np.arange(366))
+                     -np.arange(366) - 1)
 
+        # 2011-12-31 is a saturday
         dates = np.busday_offset('2011-12-31', -np.arange(366),
                         roll='forward', busdaycal=bdd)
+        # only the first generated date is in the future of 2011-12-31
+        expected = np.arange(366)
+        expected[0] = -1
         assert_equal(np.busday_count(dates, '2011-12-31', busdaycal=bdd),
-                     np.arange(366))
+                     expected)
         # Returns negative value when reversed
+        expected = -np.arange(366)+1
+        expected[0] = 0
         assert_equal(np.busday_count('2011-12-31', dates, busdaycal=bdd),
-                     -np.arange(366))
+                     expected)
 
         # Can't supply both a weekmask/holidays and busdaycal
         assert_raises(ValueError, np.busday_offset, '2012-01-03', '2012-02-03',
@@ -2349,6 +2359,17 @@ class TestDateTime:
         # Returns negative value when reversed
         assert_equal(np.busday_count('2011-04', '2011-03', weekmask='Mon'), -4)
 
+        sunday = np.datetime64('2023-03-05')
+        monday = sunday + 1
+        friday = sunday + 5
+        saturday = sunday + 6
+        assert_equal(np.busday_count(sunday, monday), 0)
+        assert_equal(np.busday_count(monday, sunday), -1)
+
+        assert_equal(np.busday_count(friday, saturday), 1)
+        assert_equal(np.busday_count(saturday, friday), 0)
+
+
     def test_datetime_is_busday(self):
         holidays = ['2011-01-01', '2011-10-10', '2011-11-11', '2011-11-24',
                     '2011-12-25', '2011-05-30', '2011-02-21', '2011-01-17',
@@ -2526,3 +2547,23 @@ class TestDateTimeData:
 
         dt = np.datetime64('2000', '5μs')
         assert np.datetime_data(dt.dtype) == ('us', 5)
+
+
+def test_comparisons_return_not_implemented():
+    # GH#17017
+
+    class custom:
+        __array_priority__ = 10000
+
+    obj = custom()
+
+    dt = np.datetime64('2000', 'ns')
+    td = dt - dt
+
+    for item in [dt, td]:
+        assert item.__eq__(obj) is NotImplemented
+        assert item.__ne__(obj) is NotImplemented
+        assert item.__le__(obj) is NotImplemented
+        assert item.__lt__(obj) is NotImplemented
+        assert item.__ge__(obj) is NotImplemented
+        assert item.__gt__(obj) is NotImplemented
diff --git a/numpy/core/tests/test_defchararray.py b/numpy/core/tests/test_defchararray.py
index 22296604e..39699f457 100644
--- a/numpy/core/tests/test_defchararray.py
+++ b/numpy/core/tests/test_defchararray.py
@@ -1,3 +1,5 @@
+import pytest
+
 import numpy as np
 from numpy.core.multiarray import _vec_string
 from numpy.testing import (
@@ -29,7 +31,7 @@ class TestBasic:
     def test_from_string_array(self):
         A = np.array([[b'abc', b'foo'],
                       [b'long   ', b'0123456789']])
-        assert_equal(A.dtype.type, np.string_)
+        assert_equal(A.dtype.type, np.bytes_)
         B = np.char.array(A)
         assert_array_equal(B, A)
         assert_equal(B.dtype, A.dtype)
@@ -46,7 +48,7 @@ class TestBasic:
     def test_from_unicode_array(self):
         A = np.array([['abc', 'Sigma \u03a3'],
                       ['long   ', '0123456789']])
-        assert_equal(A.dtype.type, np.unicode_)
+        assert_equal(A.dtype.type, np.str_)
         B = np.char.array(A)
         assert_array_equal(B, A)
         assert_equal(B.dtype, A.dtype)
@@ -64,40 +66,40 @@ class TestBasic:
     def test_unicode_upconvert(self):
         A = np.char.array(['abc'])
         B = np.char.array(['\u03a3'])
-        assert_(issubclass((A + B).dtype.type, np.unicode_))
+        assert_(issubclass((A + B).dtype.type, np.str_))
 
     def test_from_string(self):
         A = np.char.array(b'abc')
         assert_equal(len(A), 1)
         assert_equal(len(A[0]), 3)
-        assert_(issubclass(A.dtype.type, np.string_))
+        assert_(issubclass(A.dtype.type, np.bytes_))
 
     def test_from_unicode(self):
         A = np.char.array('\u03a3')
         assert_equal(len(A), 1)
         assert_equal(len(A[0]), 1)
         assert_equal(A.itemsize, 4)
-        assert_(issubclass(A.dtype.type, np.unicode_))
+        assert_(issubclass(A.dtype.type, np.str_))
 
 class TestVecString:
     def test_non_existent_method(self):
 
         def fail():
-            _vec_string('a', np.string_, 'bogus')
+            _vec_string('a', np.bytes_, 'bogus')
 
         assert_raises(AttributeError, fail)
 
     def test_non_string_array(self):
 
         def fail():
-            _vec_string(1, np.string_, 'strip')
+            _vec_string(1, np.bytes_, 'strip')
 
         assert_raises(TypeError, fail)
 
     def test_invalid_args_tuple(self):
 
         def fail():
-            _vec_string(['a'], np.string_, 'strip', 1)
+            _vec_string(['a'], np.bytes_, 'strip', 1)
 
         assert_raises(TypeError, fail)
 
@@ -111,7 +113,7 @@ class TestVecString:
     def test_invalid_function_args(self):
 
         def fail():
-            _vec_string(['a'], np.string_, 'strip', (1,))
+            _vec_string(['a'], np.bytes_, 'strip', (1,))
 
         assert_raises(TypeError, fail)
 
@@ -190,7 +192,7 @@ class TestComparisonsMixed1(TestComparisons):
     def setup_method(self):
         TestComparisons.setup_method(self)
         self.B = np.array([['efg', '123  '],
-                           ['051', 'tuv']], np.unicode_).view(np.chararray)
+                           ['051', 'tuv']], np.str_).view(np.chararray)
 
 class TestComparisonsMixed2(TestComparisons):
     """Ticket #1276"""
@@ -198,7 +200,7 @@ class TestComparisonsMixed2(TestComparisons):
     def setup_method(self):
         TestComparisons.setup_method(self)
         self.A = np.array([['abc', '123'],
-                           ['789', 'xyz']], np.unicode_).view(np.chararray)
+                           ['789', 'xyz']], np.str_).view(np.chararray)
 
 class TestInformation:
     def setup_method(self):
@@ -320,17 +322,17 @@ class TestMethods:
         tgt = [[b' abc ', b''],
                [b'12345', b'Mixedcase'],
                [b'123 \t 345 \0 ', b'Upper']]
-        assert_(issubclass(self.A.capitalize().dtype.type, np.string_))
+        assert_(issubclass(self.A.capitalize().dtype.type, np.bytes_))
         assert_array_equal(self.A.capitalize(), tgt)
 
         tgt = [[' \u03c3 ', ''],
                ['12345', 'Mixedcase'],
                ['123 \t 345 \0 ', 'Upper']]
-        assert_(issubclass(self.B.capitalize().dtype.type, np.unicode_))
+        assert_(issubclass(self.B.capitalize().dtype.type, np.str_))
         assert_array_equal(self.B.capitalize(), tgt)
 
     def test_center(self):
-        assert_(issubclass(self.A.center(10).dtype.type, np.string_))
+        assert_(issubclass(self.A.center(10).dtype.type, np.bytes_))
         C = self.A.center([10, 20])
         assert_array_equal(np.char.str_len(C), [[10, 20], [10, 20], [12, 20]])
 
@@ -341,7 +343,7 @@ class TestMethods:
         C = np.char.center(b'FOO', [[10, 20], [15, 8]])
         tgt = [[b'   FOO    ', b'        FOO         '],
                [b'      FOO      ', b'  FOO   ']]
-        assert_(issubclass(C.dtype.type, np.string_))
+        assert_(issubclass(C.dtype.type, np.bytes_))
         assert_array_equal(C, tgt)
 
     def test_decode(self):
@@ -362,14 +364,14 @@ class TestMethods:
         A0 = self.A.decode('ascii')
 
         A = np.char.join([',', '#'], A0)
-        assert_(issubclass(A.dtype.type, np.unicode_))
+        assert_(issubclass(A.dtype.type, np.str_))
         tgt = np.array([[' ,a,b,c, ', ''],
                         ['1,2,3,4,5', 'M#i#x#e#d#C#a#s#e'],
                         ['1,2,3, ,\t, ,3,4,5, ,\x00, ', 'U#P#P#E#R']])
         assert_array_equal(np.char.join([',', '#'], A0), tgt)
 
     def test_ljust(self):
-        assert_(issubclass(self.A.ljust(10).dtype.type, np.string_))
+        assert_(issubclass(self.A.ljust(10).dtype.type, np.bytes_))
 
         C = self.A.ljust([10, 20])
         assert_array_equal(np.char.str_len(C), [[10, 20], [10, 20], [12, 20]])
@@ -382,27 +384,27 @@ class TestMethods:
         C = np.char.ljust(b'FOO', [[10, 20], [15, 8]])
         tgt = [[b'FOO       ', b'FOO                 '],
                [b'FOO            ', b'FOO     ']]
-        assert_(issubclass(C.dtype.type, np.string_))
+        assert_(issubclass(C.dtype.type, np.bytes_))
         assert_array_equal(C, tgt)
 
     def test_lower(self):
         tgt = [[b' abc ', b''],
                [b'12345', b'mixedcase'],
                [b'123 \t 345 \0 ', b'upper']]
-        assert_(issubclass(self.A.lower().dtype.type, np.string_))
+        assert_(issubclass(self.A.lower().dtype.type, np.bytes_))
         assert_array_equal(self.A.lower(), tgt)
 
         tgt = [[' \u03c3 ', ''],
                ['12345', 'mixedcase'],
                ['123 \t 345 \0 ', 'upper']]
-        assert_(issubclass(self.B.lower().dtype.type, np.unicode_))
+        assert_(issubclass(self.B.lower().dtype.type, np.str_))
         assert_array_equal(self.B.lower(), tgt)
 
     def test_lstrip(self):
         tgt = [[b'abc ', b''],
                [b'12345', b'MixedCase'],
                [b'123 \t 345 \0 ', b'UPPER']]
-        assert_(issubclass(self.A.lstrip().dtype.type, np.string_))
+        assert_(issubclass(self.A.lstrip().dtype.type, np.bytes_))
         assert_array_equal(self.A.lstrip(), tgt)
 
         tgt = [[b' abc', b''],
@@ -413,7 +415,7 @@ class TestMethods:
         tgt = [['\u03a3 ', ''],
                ['12345', 'MixedCase'],
                ['123 \t 345 \0 ', 'UPPER']]
-        assert_(issubclass(self.B.lstrip().dtype.type, np.unicode_))
+        assert_(issubclass(self.B.lstrip().dtype.type, np.str_))
         assert_array_equal(self.B.lstrip(), tgt)
 
     def test_partition(self):
@@ -421,7 +423,7 @@ class TestMethods:
         tgt = [[(b' abc ', b'', b''), (b'', b'', b'')],
                [(b'12', b'3', b'45'), (b'', b'M', b'ixedCase')],
                [(b'12', b'3', b' \t 345 \0 '), (b'UPPER', b'', b'')]]
-        assert_(issubclass(P.dtype.type, np.string_))
+        assert_(issubclass(P.dtype.type, np.bytes_))
         assert_array_equal(P, tgt)
 
     def test_replace(self):
@@ -430,11 +432,11 @@ class TestMethods:
         tgt = [[b' abc ', b''],
                [b'12##########45', b'MixedC@se'],
                [b'12########## \t ##########45 \x00', b'UPPER']]
-        assert_(issubclass(R.dtype.type, np.string_))
+        assert_(issubclass(R.dtype.type, np.bytes_))
         assert_array_equal(R, tgt)
 
     def test_rjust(self):
-        assert_(issubclass(self.A.rjust(10).dtype.type, np.string_))
+        assert_(issubclass(self.A.rjust(10).dtype.type, np.bytes_))
 
         C = self.A.rjust([10, 20])
         assert_array_equal(np.char.str_len(C), [[10, 20], [10, 20], [12, 20]])
@@ -447,7 +449,7 @@ class TestMethods:
         C = np.char.rjust(b'FOO', [[10, 20], [15, 8]])
         tgt = [[b'       FOO', b'                 FOO'],
                [b'            FOO', b'     FOO']]
-        assert_(issubclass(C.dtype.type, np.string_))
+        assert_(issubclass(C.dtype.type, np.bytes_))
         assert_array_equal(C, tgt)
 
     def test_rpartition(self):
@@ -455,7 +457,7 @@ class TestMethods:
         tgt = [[(b'', b'', b' abc '), (b'', b'', b'')],
                [(b'12', b'3', b'45'), (b'', b'M', b'ixedCase')],
                [(b'123 \t ', b'3', b'45 \0 '), (b'', b'', b'UPPER')]]
-        assert_(issubclass(P.dtype.type, np.string_))
+        assert_(issubclass(P.dtype.type, np.bytes_))
         assert_array_equal(P, tgt)
 
     def test_rsplit(self):
@@ -467,7 +469,7 @@ class TestMethods:
         assert_equal(A.tolist(), tgt)
 
     def test_rstrip(self):
-        assert_(issubclass(self.A.rstrip().dtype.type, np.string_))
+        assert_(issubclass(self.A.rstrip().dtype.type, np.bytes_))
 
         tgt = [[b' abc', b''],
                [b'12345', b'MixedCase'],
@@ -483,14 +485,14 @@ class TestMethods:
         tgt = [[' \u03a3', ''],
                ['12345', 'MixedCase'],
                ['123 \t 345', 'UPPER']]
-        assert_(issubclass(self.B.rstrip().dtype.type, np.unicode_))
+        assert_(issubclass(self.B.rstrip().dtype.type, np.str_))
         assert_array_equal(self.B.rstrip(), tgt)
 
     def test_strip(self):
         tgt = [[b'abc', b''],
                [b'12345', b'MixedCase'],
                [b'123 \t 345', b'UPPER']]
-        assert_(issubclass(self.A.strip().dtype.type, np.string_))
+        assert_(issubclass(self.A.strip().dtype.type, np.bytes_))
         assert_array_equal(self.A.strip(), tgt)
 
         tgt = [[b' abc ', b''],
@@ -501,7 +503,7 @@ class TestMethods:
         tgt = [['\u03a3', ''],
                ['12345', 'MixedCase'],
                ['123 \t 345', 'UPPER']]
-        assert_(issubclass(self.B.strip().dtype.type, np.unicode_))
+        assert_(issubclass(self.B.strip().dtype.type, np.str_))
         assert_array_equal(self.B.strip(), tgt)
 
     def test_split(self):
@@ -523,39 +525,39 @@ class TestMethods:
         tgt = [[b' ABC ', b''],
                [b'12345', b'mIXEDcASE'],
                [b'123 \t 345 \0 ', b'upper']]
-        assert_(issubclass(self.A.swapcase().dtype.type, np.string_))
+        assert_(issubclass(self.A.swapcase().dtype.type, np.bytes_))
         assert_array_equal(self.A.swapcase(), tgt)
 
         tgt = [[' \u03c3 ', ''],
                ['12345', 'mIXEDcASE'],
                ['123 \t 345 \0 ', 'upper']]
-        assert_(issubclass(self.B.swapcase().dtype.type, np.unicode_))
+        assert_(issubclass(self.B.swapcase().dtype.type, np.str_))
         assert_array_equal(self.B.swapcase(), tgt)
 
     def test_title(self):
         tgt = [[b' Abc ', b''],
                [b'12345', b'Mixedcase'],
                [b'123 \t 345 \0 ', b'Upper']]
-        assert_(issubclass(self.A.title().dtype.type, np.string_))
+        assert_(issubclass(self.A.title().dtype.type, np.bytes_))
         assert_array_equal(self.A.title(), tgt)
 
         tgt = [[' \u03a3 ', ''],
                ['12345', 'Mixedcase'],
                ['123 \t 345 \0 ', 'Upper']]
-        assert_(issubclass(self.B.title().dtype.type, np.unicode_))
+        assert_(issubclass(self.B.title().dtype.type, np.str_))
         assert_array_equal(self.B.title(), tgt)
 
     def test_upper(self):
         tgt = [[b' ABC ', b''],
                [b'12345', b'MIXEDCASE'],
                [b'123 \t 345 \0 ', b'UPPER']]
-        assert_(issubclass(self.A.upper().dtype.type, np.string_))
+        assert_(issubclass(self.A.upper().dtype.type, np.bytes_))
         assert_array_equal(self.A.upper(), tgt)
 
         tgt = [[' \u03a3 ', ''],
                ['12345', 'MIXEDCASE'],
                ['123 \t 345 \0 ', 'UPPER']]
-        assert_(issubclass(self.B.upper().dtype.type, np.unicode_))
+        assert_(issubclass(self.B.upper().dtype.type, np.str_))
         assert_array_equal(self.B.upper(), tgt)
 
     def test_isnumeric(self):
@@ -670,3 +672,15 @@ def test_empty_indexing():
     # empty chararray instead of a chararray with a single empty string in it.
     s = np.chararray((4,))
     assert_(s[[]].size == 0)
+
+
+@pytest.mark.parametrize(["dt1", "dt2"],
+        [("S", "U"), ("U", "S"), ("S", "O"), ("U", "O"),
+         ("S", "d"), ("S", "V")])
+def test_add_types(dt1, dt2):
+    arr1 = np.array([1234234], dtype=dt1)
+    # If the following fails, e.g. use a number and test "V" explicitly
+    arr2 = np.array([b"423"], dtype=dt2)
+    with pytest.raises(TypeError,
+            match=f".*same dtype kind.*{arr1.dtype}.*{arr2.dtype}"):
+        np.char.add(arr1, arr2)
diff --git a/numpy/core/tests/test_deprecations.py b/numpy/core/tests/test_deprecations.py
index e09ec27b9..3ada39e90 100644
--- a/numpy/core/tests/test_deprecations.py
+++ b/numpy/core/tests/test_deprecations.py
@@ -138,80 +138,6 @@ class _VisibleDeprecationTestCase(_DeprecationTestCase):
     warning_cls = np.VisibleDeprecationWarning
 
 
-class TestComparisonDeprecations(_DeprecationTestCase):
-    """This tests the deprecation, for non-element-wise comparison logic.
-    This used to mean that when an error occurred during element-wise comparison
-    (i.e. broadcasting) NotImplemented was returned, but also in the comparison
-    itself, False was given instead of the error.
-
-    Also test FutureWarning for the None comparison.
-    """
-
-    message = "elementwise.* comparison failed; .*"
-
-    def test_normal_types(self):
-        for op in (operator.eq, operator.ne):
-            # Broadcasting errors:
-            self.assert_deprecated(op, args=(np.zeros(3), []))
-            a = np.zeros(3, dtype='i,i')
-            # (warning is issued a couple of times here)
-            self.assert_deprecated(op, args=(a, a[:-1]), num=None)
-
-            # ragged array comparison returns True/False
-            a = np.array([1, np.array([1,2,3])], dtype=object)
-            b = np.array([1, np.array([1,2,3])], dtype=object)
-            self.assert_deprecated(op, args=(a, b), num=None)
-
-    def test_string(self):
-        # For two string arrays, strings always raised the broadcasting error:
-        a = np.array(['a', 'b'])
-        b = np.array(['a', 'b', 'c'])
-        assert_warns(FutureWarning, lambda x, y: x == y, a, b)
-
-        # The empty list is not cast to string, and this used to pass due
-        # to dtype mismatch; now (2018-06-21) it correctly leads to a
-        # FutureWarning.
-        assert_warns(FutureWarning, lambda: a == [])
-
-    def test_void_dtype_equality_failures(self):
-        class NotArray:
-            def __array__(self):
-                raise TypeError
-
-            # Needed so Python 3 does not raise DeprecationWarning twice.
-            def __ne__(self, other):
-                return NotImplemented
-
-        self.assert_deprecated(lambda: np.arange(2) == NotArray())
-        self.assert_deprecated(lambda: np.arange(2) != NotArray())
-
-    def test_array_richcompare_legacy_weirdness(self):
-        # It doesn't really work to use assert_deprecated here, b/c part of
-        # the point of assert_deprecated is to check that when warnings are
-        # set to "error" mode then the error is propagated -- which is good!
-        # But here we are testing a bunch of code that is deprecated *because*
-        # it has the habit of swallowing up errors and converting them into
-        # different warnings. So assert_warns will have to be sufficient.
-        assert_warns(FutureWarning, lambda: np.arange(2) == "a")
-        assert_warns(FutureWarning, lambda: np.arange(2) != "a")
-        # No warning for scalar comparisons
-        with warnings.catch_warnings():
-            warnings.filterwarnings("error")
-            assert_(not (np.array(0) == "a"))
-            assert_(np.array(0) != "a")
-            assert_(not (np.int16(0) == "a"))
-            assert_(np.int16(0) != "a")
-
-        for arg1 in [np.asarray(0), np.int16(0)]:
-            struct = np.zeros(2, dtype="i4,i4")
-            for arg2 in [struct, "a"]:
-                for f in [operator.lt, operator.le, operator.gt, operator.ge]:
-                    with warnings.catch_warnings() as l:
-                        warnings.filterwarnings("always")
-                        assert_raises(TypeError, f, arg1, arg2)
-                        assert_(not l)
-
-
 class TestDatetime64Timezone(_DeprecationTestCase):
     """Parsing of datetime64 with timezones deprecated in 1.11.0, because
     datetime64 is now timezone naive rather than UTC only.
@@ -384,12 +310,6 @@ class TestGeneratorSum(_DeprecationTestCase):
         self.assert_deprecated(np.sum, args=((i for i in range(5)),))
 
 
-class TestPositiveOnNonNumerical(_DeprecationTestCase):
-    # 2018-06-28, 1.16.0
-    def test_positive_on_non_number(self):
-        self.assert_deprecated(operator.pos, args=(np.array('foo'),))
-
-
 class TestFromstring(_DeprecationTestCase):
     # 2017-10-19, 1.14
     def test_fromstring(self):
@@ -583,25 +503,6 @@ class TestNonExactMatchDeprecation(_DeprecationTestCase):
         self.assert_deprecated(lambda: np.searchsorted(arr[0], 4, side='Random'))
 
 
-class TestDeprecatedGlobals(_DeprecationTestCase):
-    # 2020-06-06
-    def test_type_aliases(self):
-        # from builtins
-        self.assert_deprecated(lambda: np.bool(True))
-        self.assert_deprecated(lambda: np.int(1))
-        self.assert_deprecated(lambda: np.float(1))
-        self.assert_deprecated(lambda: np.complex(1))
-        self.assert_deprecated(lambda: np.object())
-        self.assert_deprecated(lambda: np.str('abc'))
-
-        # from np.compat
-        self.assert_deprecated(lambda: np.long(1))
-        self.assert_deprecated(lambda: np.unicode('abc'))
-
-        # from np.core.numerictypes
-        self.assert_deprecated(lambda: np.typeDict)
-
-
 class TestMatrixInOuter(_DeprecationTestCase):
     # 2020-05-13 NumPy 1.20.0
     message = (r"add.outer\(\) was passed a numpy matrix as "
@@ -637,134 +538,6 @@ class FlatteningConcatenateUnsafeCast(_DeprecationTestCase):
                            casting="same_kind")
 
 
-class TestDeprecateSubarrayDTypeDuringArrayCoercion(_DeprecationTestCase):
-    warning_cls = FutureWarning
-    message = "(creating|casting) an array (with|to) a subarray dtype"
-
-    def test_deprecated_array(self):
-        # Arrays are more complex, since they "broadcast" on success:
-        arr = np.array([1, 2])
-
-        self.assert_deprecated(lambda: arr.astype("(2)i,"))
-        with pytest.warns(FutureWarning):
-            res = arr.astype("(2)i,")
-
-        assert_array_equal(res, [[1, 2], [1, 2]])
-
-        self.assert_deprecated(lambda: np.array(arr, dtype="(2)i,"))
-        with pytest.warns(FutureWarning):
-            res = np.array(arr, dtype="(2)i,")
-
-        assert_array_equal(res, [[1, 2], [1, 2]])
-
-        with pytest.warns(FutureWarning):
-            res = np.array([[(1,), (2,)], arr], dtype="(2)i,")
-
-        assert_array_equal(res, [[[1, 1], [2, 2]], [[1, 2], [1, 2]]])
-
-    def test_deprecated_and_error(self):
-        # These error paths do not give a warning, but will succeed in the
-        # future.
-        arr = np.arange(5 * 2).reshape(5, 2)
-        def check():
-            with pytest.raises(ValueError):
-                arr.astype("(2,2)f")
-
-        self.assert_deprecated(check)
-
-        def check():
-            with pytest.raises(ValueError):
-                np.array(arr, dtype="(2,2)f")
-
-        self.assert_deprecated(check)
-
-
-class TestFutureWarningArrayLikeNotIterable(_DeprecationTestCase):
-    # Deprecated 2020-12-09, NumPy 1.20
-    warning_cls = FutureWarning
-    message = "The input object of type.*but not a sequence"
-
-    @pytest.mark.parametrize("protocol",
-            ["__array__", "__array_interface__", "__array_struct__"])
-    def test_deprecated(self, protocol):
-        """Test that these objects give a warning since they are not 0-D,
-        not coerced at the top level `np.array(obj)`, but nested, and do
-        *not* define the sequence protocol.
-
-        NOTE: Tests for the versions including __len__ and __getitem__ exist
-              in `test_array_coercion.py` and they can be modified or amended
-              when this deprecation expired.
-        """
-        blueprint = np.arange(10)
-        MyArr = type("MyArr", (), {protocol: getattr(blueprint, protocol)})
-        self.assert_deprecated(lambda: np.array([MyArr()], dtype=object))
-
-    @pytest.mark.parametrize("protocol",
-             ["__array__", "__array_interface__", "__array_struct__"])
-    def test_0d_not_deprecated(self, protocol):
-        # 0-D always worked (albeit it would use __float__ or similar for the
-        # conversion, which may not happen anymore)
-        blueprint = np.array(1.)
-        MyArr = type("MyArr", (), {protocol: getattr(blueprint, protocol)})
-        myarr = MyArr()
-
-        self.assert_not_deprecated(lambda: np.array([myarr], dtype=object))
-        res = np.array([myarr], dtype=object)
-        expected = np.empty(1, dtype=object)
-        expected[0] = myarr
-        assert_array_equal(res, expected)
-
-    @pytest.mark.parametrize("protocol",
-             ["__array__", "__array_interface__", "__array_struct__"])
-    def test_unnested_not_deprecated(self, protocol):
-        blueprint = np.arange(10)
-        MyArr = type("MyArr", (), {protocol: getattr(blueprint, protocol)})
-        myarr = MyArr()
-
-        self.assert_not_deprecated(lambda: np.array(myarr))
-        res = np.array(myarr)
-        assert_array_equal(res, blueprint)
-
-    @pytest.mark.parametrize("protocol",
-             ["__array__", "__array_interface__", "__array_struct__"])
-    def test_strange_dtype_handling(self, protocol):
-        """The old code would actually use the dtype from the array, but
-        then end up not using the array (for dimension discovery)
-        """
-        blueprint = np.arange(10).astype("f4")
-        MyArr = type("MyArr", (), {protocol: getattr(blueprint, protocol),
-                                   "__float__": lambda _: 0.5})
-        myarr = MyArr()
-
-        # Make sure we warn (and capture the FutureWarning)
-        with pytest.warns(FutureWarning, match=self.message):
-            res = np.array([[myarr]])
-
-        assert res.shape == (1, 1)
-        assert res.dtype == "f4"
-        assert res[0, 0] == 0.5
-
-    @pytest.mark.parametrize("protocol",
-             ["__array__", "__array_interface__", "__array_struct__"])
-    def test_assignment_not_deprecated(self, protocol):
-        # If the result is dtype=object we do not unpack a nested array or
-        # array-like, if it is nested at exactly the right depth.
-        # NOTE: We actually do still call __array__, etc. but ignore the result
-        #       in the end. For `dtype=object` we could optimize that away.
-        blueprint = np.arange(10).astype("f4")
-        MyArr = type("MyArr", (), {protocol: getattr(blueprint, protocol),
-                                   "__float__": lambda _: 0.5})
-        myarr = MyArr()
-
-        res = np.empty(3, dtype=object)
-        def set():
-            res[:] = [myarr, myarr, myarr]
-        self.assert_not_deprecated(set)
-        assert res[0] is myarr
-        assert res[1] is myarr
-        assert res[2] is myarr
-
-
 class TestDeprecatedUnpickleObjectScalar(_DeprecationTestCase):
     # Deprecated 2020-11-24, NumPy 1.20
     """
@@ -778,218 +551,6 @@ class TestDeprecatedUnpickleObjectScalar(_DeprecationTestCase):
         ctor = np.core.multiarray.scalar
         self.assert_deprecated(lambda: ctor(np.dtype("O"), 1))
 
-try:
-    with warnings.catch_warnings():
-        warnings.simplefilter("always")
-        import nose  # noqa: F401
-except ImportError:
-    HAVE_NOSE = False
-else:
-    HAVE_NOSE = True
-
-
-@pytest.mark.skipif(not HAVE_NOSE, reason="Needs nose")
-class TestNoseDecoratorsDeprecated(_DeprecationTestCase):
-    class DidntSkipException(Exception):
-        pass
-
-    def test_slow(self):
-        def _test_slow():
-            @np.testing.dec.slow
-            def slow_func(x, y, z):
-                pass
-
-            assert_(slow_func.slow)
-        self.assert_deprecated(_test_slow)
-
-    def test_setastest(self):
-        def _test_setastest():
-            @np.testing.dec.setastest()
-            def f_default(a):
-                pass
-
-            @np.testing.dec.setastest(True)
-            def f_istest(a):
-                pass
-
-            @np.testing.dec.setastest(False)
-            def f_isnottest(a):
-                pass
-
-            assert_(f_default.__test__)
-            assert_(f_istest.__test__)
-            assert_(not f_isnottest.__test__)
-        self.assert_deprecated(_test_setastest, num=3)
-
-    def test_skip_functions_hardcoded(self):
-        def _test_skip_functions_hardcoded():
-            @np.testing.dec.skipif(True)
-            def f1(x):
-                raise self.DidntSkipException
-
-            try:
-                f1('a')
-            except self.DidntSkipException:
-                raise Exception('Failed to skip')
-            except SkipTest().__class__:
-                pass
-
-            @np.testing.dec.skipif(False)
-            def f2(x):
-                raise self.DidntSkipException
-
-            try:
-                f2('a')
-            except self.DidntSkipException:
-                pass
-            except SkipTest().__class__:
-                raise Exception('Skipped when not expected to')
-        self.assert_deprecated(_test_skip_functions_hardcoded, num=2)
-
-    def test_skip_functions_callable(self):
-        def _test_skip_functions_callable():
-            def skip_tester():
-                return skip_flag == 'skip me!'
-
-            @np.testing.dec.skipif(skip_tester)
-            def f1(x):
-                raise self.DidntSkipException
-
-            try:
-                skip_flag = 'skip me!'
-                f1('a')
-            except self.DidntSkipException:
-                raise Exception('Failed to skip')
-            except SkipTest().__class__:
-                pass
-
-            @np.testing.dec.skipif(skip_tester)
-            def f2(x):
-                raise self.DidntSkipException
-
-            try:
-                skip_flag = 'five is right out!'
-                f2('a')
-            except self.DidntSkipException:
-                pass
-            except SkipTest().__class__:
-                raise Exception('Skipped when not expected to')
-        self.assert_deprecated(_test_skip_functions_callable, num=2)
-
-    def test_skip_generators_hardcoded(self):
-        def _test_skip_generators_hardcoded():
-            @np.testing.dec.knownfailureif(True, "This test is known to fail")
-            def g1(x):
-                yield from range(x)
-
-            try:
-                for j in g1(10):
-                    pass
-            except KnownFailureException().__class__:
-                pass
-            else:
-                raise Exception('Failed to mark as known failure')
-
-            @np.testing.dec.knownfailureif(False, "This test is NOT known to fail")
-            def g2(x):
-                yield from range(x)
-                raise self.DidntSkipException('FAIL')
-
-            try:
-                for j in g2(10):
-                    pass
-            except KnownFailureException().__class__:
-                raise Exception('Marked incorrectly as known failure')
-            except self.DidntSkipException:
-                pass
-        self.assert_deprecated(_test_skip_generators_hardcoded, num=2)
-
-    def test_skip_generators_callable(self):
-        def _test_skip_generators_callable():
-            def skip_tester():
-                return skip_flag == 'skip me!'
-
-            @np.testing.dec.knownfailureif(skip_tester, "This test is known to fail")
-            def g1(x):
-                yield from range(x)
-
-            try:
-                skip_flag = 'skip me!'
-                for j in g1(10):
-                    pass
-            except KnownFailureException().__class__:
-                pass
-            else:
-                raise Exception('Failed to mark as known failure')
-
-            @np.testing.dec.knownfailureif(skip_tester, "This test is NOT known to fail")
-            def g2(x):
-                yield from range(x)
-                raise self.DidntSkipException('FAIL')
-
-            try:
-                skip_flag = 'do not skip'
-                for j in g2(10):
-                    pass
-            except KnownFailureException().__class__:
-                raise Exception('Marked incorrectly as known failure')
-            except self.DidntSkipException:
-                pass
-        self.assert_deprecated(_test_skip_generators_callable, num=2)
-
-    def test_deprecated(self):
-        def _test_deprecated():
-            @np.testing.dec.deprecated(True)
-            def non_deprecated_func():
-                pass
-
-            @np.testing.dec.deprecated()
-            def deprecated_func():
-                import warnings
-                warnings.warn("TEST: deprecated func", DeprecationWarning, stacklevel=1)
-
-            @np.testing.dec.deprecated()
-            def deprecated_func2():
-                import warnings
-                warnings.warn("AHHHH", stacklevel=1)
-                raise ValueError
-
-            @np.testing.dec.deprecated()
-            def deprecated_func3():
-                import warnings
-                warnings.warn("AHHHH", stacklevel=1)
-
-            # marked as deprecated, but does not raise DeprecationWarning
-            assert_raises(AssertionError, non_deprecated_func)
-            # should be silent
-            deprecated_func()
-            with warnings.catch_warnings(record=True):
-                warnings.simplefilter("always")  # do not propagate unrelated warnings
-                # fails if deprecated decorator just disables test. See #1453.
-                assert_raises(ValueError, deprecated_func2)
-                # warning is not a DeprecationWarning
-                assert_raises(AssertionError, deprecated_func3)
-        self.assert_deprecated(_test_deprecated, num=4)
-
-    def test_parametrize(self):
-        def _test_parametrize():
-            # dec.parametrize assumes that it is being run by nose. Because
-            # we are running under pytest, we need to explicitly check the
-            # results.
-            @np.testing.dec.parametrize('base, power, expected',
-                    [(1, 1, 1),
-                    (2, 1, 2),
-                    (2, 2, 4)])
-            def check_parametrize(base, power, expected):
-                assert_(base**power == expected)
-
-            count = 0
-            for test in check_parametrize():
-                test[0](*test[1:])
-                count += 1
-            assert_(count == 3)
-        self.assert_deprecated(_test_parametrize)
-
 
 class TestSingleElementSignature(_DeprecationTestCase):
     # Deprecated 2021-04-01, NumPy 1.21
@@ -1000,31 +561,6 @@ class TestSingleElementSignature(_DeprecationTestCase):
         self.assert_deprecated(lambda: np.add(1, 2, sig=(np.dtype("l"),)))
 
 
-class TestComparisonBadDType(_DeprecationTestCase):
-    # Deprecated 2021-04-01, NumPy 1.21
-    message = r"using `dtype=` in comparisons is only useful for"
-
-    def test_deprecated(self):
-        self.assert_deprecated(lambda: np.equal(1, 1, dtype=np.int64))
-        # Not an error only for the transition
-        self.assert_deprecated(lambda: np.equal(1, 1, sig=(None, None, "l")))
-
-    def test_not_deprecated(self):
-        np.equal(True, False, dtype=bool)
-        np.equal(3, 5, dtype=bool, casting="unsafe")
-        np.equal([None], [4], dtype=object)
-
-class TestComparisonBadObjectDType(_DeprecationTestCase):
-    # Deprecated 2021-04-01, NumPy 1.21  (different branch of the above one)
-    message = r"using `dtype=object` \(or equivalent signature\) will"
-    warning_cls = FutureWarning
-
-    def test_deprecated(self):
-        self.assert_deprecated(lambda: np.equal(1, 1, dtype=object))
-        self.assert_deprecated(
-                lambda: np.equal(1, 1, sig=(None, None, object)))
-
-
 class TestCtypesGetter(_DeprecationTestCase):
     # Deprecated 2021-05-18, Numpy 1.21.0
     warning_cls = DeprecationWarning
@@ -1044,39 +580,6 @@ class TestCtypesGetter(_DeprecationTestCase):
         self.assert_not_deprecated(lambda: getattr(self.ctypes, name))
 
 
-class TestUFuncForcedDTypeWarning(_DeprecationTestCase):
-    message = "The `dtype` and `signature` arguments to ufuncs only select the"
-
-    def test_not_deprecated(self):
-        import pickle
-        # does not warn (test relies on bad pickling behaviour, simply remove
-        # it if the `assert int64 is not int64_2` should start failing.
-        int64 = np.dtype("int64")
-        int64_2 = pickle.loads(pickle.dumps(int64))
-        assert int64 is not int64_2
-        self.assert_not_deprecated(lambda: np.add(3, 4, dtype=int64_2))
-
-    def test_deprecation(self):
-        int64 = np.dtype("int64")
-        self.assert_deprecated(lambda: np.add(3, 5, dtype=int64.newbyteorder()))
-        self.assert_deprecated(lambda: np.add(3, 5, dtype="m8[ns]"))
-
-    def test_behaviour(self):
-        int64 = np.dtype("int64")
-        arr = np.arange(10, dtype="m8[s]")
-
-        with pytest.warns(DeprecationWarning, match=self.message):
-            np.add(3, 5, dtype=int64.newbyteorder())
-        with pytest.warns(DeprecationWarning, match=self.message):
-            np.add(3, 5, dtype="m8[ns]")  # previously used the "ns"
-        with pytest.warns(DeprecationWarning, match=self.message):
-            np.add(arr, arr, dtype="m8[ns]")  # never preserved the "ns"
-        with pytest.warns(DeprecationWarning, match=self.message):
-            np.maximum(arr, arr, dtype="m8[ns]")  # previously used the "ns"
-        with pytest.warns(DeprecationWarning, match=self.message):
-            np.maximum.reduce(arr, dtype="m8[ns]")  # never preserved the "ns"
-
-
 PARTITION_DICT = {
     "partition method": np.arange(10).partition,
     "argpartition method": np.arange(10).argpartition,
@@ -1101,18 +604,11 @@ class TestPartitionBoolIndex(_DeprecationTestCase):
 
 
 class TestMachAr(_DeprecationTestCase):
-    # Deprecated 2021-10-19, NumPy 1.22
+    # Deprecated 2022-11-22, NumPy 1.25
     warning_cls = DeprecationWarning
 
-    def test_deprecated(self):
-        self.assert_deprecated(lambda: np.MachAr)
-
     def test_deprecated_module(self):
-        self.assert_deprecated(lambda: getattr(np.core, "machar"))
-
-    def test_deprecated_attr(self):
-        finfo = np.finfo(float)
-        self.assert_deprecated(lambda: getattr(finfo, "machar"))
+        self.assert_deprecated(lambda: getattr(np.core, "MachAr"))
 
 
 class TestQuantileInterpolationDeprecation(_DeprecationTestCase):
@@ -1198,6 +694,18 @@ class TestLoadtxtParseIntsViaFloat(_DeprecationTestCase):
                 assert isinstance(e.__cause__, DeprecationWarning)
 
 
+class TestScalarConversion(_DeprecationTestCase):
+    # 2023-01-02, 1.25.0
+    def test_float_conversion(self):
+        self.assert_deprecated(float, args=(np.array([3.14]),))
+
+    def test_behaviour(self):
+        b = np.array([[3.14]])
+        c = np.zeros(5)
+        with pytest.warns(DeprecationWarning):
+            c[0] = b
+
+
 class TestPyIntConversion(_DeprecationTestCase):
     message = r".*stop allowing conversion of out-of-bound.*"
 
@@ -1230,3 +738,80 @@ class TestPyIntConversion(_DeprecationTestCase):
                         lambda: creation_func(info.max + 1, dtype))
             except OverflowError:
                 pass  # OverflowErrors always happened also before and are OK.
+
+
+class TestDeprecatedGlobals(_DeprecationTestCase):
+    # Deprecated 2022-11-17, NumPy 1.24
+    def test_type_aliases(self):
+        # from builtins
+        self.assert_deprecated(lambda: np.bool8)
+        self.assert_deprecated(lambda: np.int0)
+        self.assert_deprecated(lambda: np.uint0)
+        self.assert_deprecated(lambda: np.bytes0)
+        self.assert_deprecated(lambda: np.str0)
+        self.assert_deprecated(lambda: np.object0)
+
+
+@pytest.mark.parametrize("name",
+        ["bool", "long", "ulong", "str", "bytes", "object"])
+def test_future_scalar_attributes(name):
+    # FutureWarning added 2022-11-17, NumPy 1.24,
+    assert name not in dir(np)  # we may want to not add them
+    with pytest.warns(FutureWarning,
+            match=f"In the future .*{name}"):
+        assert not hasattr(np, name)
+
+    # Unfortunately, they are currently still valid via `np.dtype()`
+    np.dtype(name)
+    name in np.sctypeDict
+
+
+# Ignore the above future attribute warning for this test.
+@pytest.mark.filterwarnings("ignore:In the future:FutureWarning")
+class TestRemovedGlobals:
+    # Removed 2023-01-12, NumPy 1.24.0
+    # Not a deprecation, but the large error was added to aid those who missed
+    # the previous deprecation, and should be removed similarly to one
+    # (or faster).
+    @pytest.mark.parametrize("name",
+            ["object", "bool", "float", "complex", "str", "int"])
+    def test_attributeerror_includes_info(self, name):
+        msg = f".*\n`np.{name}` was a deprecated alias for the builtin"
+        with pytest.raises(AttributeError, match=msg):
+            getattr(np, name)
+
+
+class TestDeprecatedFinfo(_DeprecationTestCase):
+    # Deprecated in NumPy 1.25, 2023-01-16
+    def test_deprecated_none(self):
+        self.assert_deprecated(np.finfo, args=(None,))
+
+class TestFromnumeric(_DeprecationTestCase):
+    # 2023-02-28, 1.25.0
+    def test_round_(self):
+        self.assert_deprecated(lambda: np.round_(np.array([1.5, 2.5, 3.5])))
+
+    # 2023-03-02, 1.25.0
+    def test_cumproduct(self):
+        self.assert_deprecated(lambda: np.cumproduct(np.array([1, 2, 3])))
+
+    # 2023-03-02, 1.25.0
+    def test_product(self):
+        self.assert_deprecated(lambda: np.product(np.array([1, 2, 3])))
+
+    # 2023-03-02, 1.25.0
+    def test_sometrue(self):
+        self.assert_deprecated(lambda: np.sometrue(np.array([True, False])))
+
+    # 2023-03-02, 1.25.0
+    def test_alltrue(self):
+        self.assert_deprecated(lambda: np.alltrue(np.array([True, False])))
+
+
+class TestMathAlias(_DeprecationTestCase):
+    # Deprecated in Numpy 1.25, 2023-04-06
+    def test_deprecated_np_math(self):
+        self.assert_deprecated(lambda: np.math)
+
+    def test_deprecated_np_lib_math(self):
+        self.assert_deprecated(lambda: np.lib.math)
diff --git a/numpy/core/tests/test_dlpack.py b/numpy/core/tests/test_dlpack.py
index 717210b54..49249bc6a 100644
--- a/numpy/core/tests/test_dlpack.py
+++ b/numpy/core/tests/test_dlpack.py
@@ -26,7 +26,7 @@ class TestDLPack:
         y = np.zeros((5,), dtype=dt)
         z = y['int']
 
-        with pytest.raises(RuntimeError):
+        with pytest.raises(BufferError):
             np.from_dlpack(z)
 
     @pytest.mark.skipif(IS_PYPY, reason="PyPy can't get refcounts.")
@@ -38,13 +38,14 @@ class TestDLPack:
         assert sys.getrefcount(x) == 2
 
     @pytest.mark.parametrize("dtype", [
+        np.bool_,
         np.int8, np.int16, np.int32, np.int64,
         np.uint8, np.uint16, np.uint32, np.uint64,
         np.float16, np.float32, np.float64,
         np.complex64, np.complex128
     ])
     def test_dtype_passthrough(self, dtype):
-        x = np.arange(5, dtype=dtype)
+        x = np.arange(5).astype(dtype)
         y = np.from_dlpack(x)
 
         assert y.dtype == x.dtype
@@ -53,14 +54,14 @@ class TestDLPack:
     def test_invalid_dtype(self):
         x = np.asarray(np.datetime64('2021-05-27'))
 
-        with pytest.raises(TypeError):
+        with pytest.raises(BufferError):
             np.from_dlpack(x)
 
     def test_invalid_byte_swapping(self):
         dt = np.dtype('=i8').newbyteorder()
         x = np.arange(5, dtype=dt)
 
-        with pytest.raises(TypeError):
+        with pytest.raises(BufferError):
             np.from_dlpack(x)
 
     def test_non_contiguous(self):
@@ -100,7 +101,7 @@ class TestDLPack:
         x = np.arange(5)
         _ = x.__dlpack__()
         raise RuntimeError
-    
+
     def test_dlpack_destructor_exception(self):
         with pytest.raises(RuntimeError):
             self.dlpack_deleter_exception()
@@ -108,7 +109,7 @@ class TestDLPack:
     def test_readonly(self):
         x = np.arange(5)
         x.flags.writeable = False
-        with pytest.raises(TypeError):
+        with pytest.raises(BufferError):
             x.__dlpack__()
 
     def test_ndim0(self):
diff --git a/numpy/core/tests/test_dtype.py b/numpy/core/tests/test_dtype.py
index 91f314d05..57831f46f 100644
--- a/numpy/core/tests/test_dtype.py
+++ b/numpy/core/tests/test_dtype.py
@@ -7,6 +7,7 @@ import types
 from typing import Any
 
 import numpy as np
+import numpy.dtypes
 from numpy.core._rational_tests import rational
 from numpy.core._multiarray_tests import create_custom_field_dtype
 from numpy.testing import (
@@ -126,6 +127,15 @@ class TestBuiltin:
         with assert_raises(TypeError):
             np.dtype(dtype)
 
+    def test_remaining_dtypes_with_bad_bytesize(self):
+        # The np.<name> aliases were deprecated, these probably should be too 
+        assert np.dtype("int0") is np.dtype("intp")
+        assert np.dtype("uint0") is np.dtype("uintp")
+        assert np.dtype("bool8") is np.dtype("bool")
+        assert np.dtype("bytes0") is np.dtype("bytes")
+        assert np.dtype("str0") is np.dtype("str")
+        assert np.dtype("object0") is np.dtype("object")
+
     @pytest.mark.parametrize(
         'value',
         ['m8', 'M8', 'datetime64', 'timedelta64',
@@ -186,6 +196,34 @@ class TestBuiltin:
         # This is an safe cast (not equiv) due to the different names:
         assert np.can_cast(x, y, casting="safe")
 
+    @pytest.mark.parametrize(
+        ["type_char", "char_size", "scalar_type"],
+        [["U", 4, np.str_],
+         ["S", 1, np.bytes_]])
+    def test_create_string_dtypes_directly(
+            self, type_char, char_size, scalar_type):
+        dtype_class = type(np.dtype(type_char))
+
+        dtype = dtype_class(8)
+        assert dtype.type is scalar_type
+        assert dtype.itemsize == 8*char_size
+
+    def test_create_invalid_string_errors(self):
+        one_too_big = np.iinfo(np.intc).max + 1
+        with pytest.raises(TypeError):
+            type(np.dtype("U"))(one_too_big // 4)
+
+        with pytest.raises(TypeError):
+            # Code coverage for very large numbers:
+            type(np.dtype("U"))(np.iinfo(np.intp).max // 4 + 1)
+
+        if one_too_big < sys.maxsize:
+            with pytest.raises(TypeError):
+                type(np.dtype("S"))(one_too_big)
+
+        with pytest.raises(ValueError):
+            type(np.dtype("U"))(-1)
+
 
 class TestRecord:
     def test_equivalent_record(self):
@@ -514,6 +552,14 @@ class TestRecord:
         assert_equal(np.zeros((1, 2), dtype=[]) == a,
                      np.ones((1, 2), dtype=bool))
 
+    def test_nonstructured_with_object(self):
+        # See gh-23277, the dtype here thinks it contain objects, if the
+        # assert about that fails, the test becomes meaningless (which is OK)
+        arr = np.recarray((0,), dtype="O") 
+        assert arr.dtype.names is None  # no fields
+        assert arr.dtype.hasobject  # but claims to contain objects
+        del arr  # the deletion failed previously.
+
 
 class TestSubarray:
     def test_single_subarray(self):
@@ -1518,8 +1564,21 @@ class TestDTypeClasses:
         dtype = np.dtype(dtype)
         assert isinstance(dtype, np.dtype)
         assert type(dtype) is not np.dtype
-        assert type(dtype).__name__ == f"dtype[{dtype.type.__name__}]"
-        assert type(dtype).__module__ == "numpy"
+        if dtype.type.__name__ != "rational":
+            dt_name = type(dtype).__name__.lower().removesuffix("dtype")
+            if dt_name == "uint" or dt_name == "int":
+                # The scalar names has a `c` attached because "int" is Python
+                # int and that is long...
+                dt_name += "c"
+            sc_name = dtype.type.__name__
+            assert dt_name == sc_name.strip("_")
+            assert type(dtype).__module__ == "numpy.dtypes"
+
+            assert getattr(numpy.dtypes, type(dtype).__name__) is type(dtype)
+        else:
+            assert type(dtype).__name__ == "dtype[rational]"
+            assert type(dtype).__module__ == "numpy"
+
         assert not type(dtype)._abstract
 
         # the flexible dtypes and datetime/timedelta have additional parameters
@@ -1542,6 +1601,32 @@ class TestDTypeClasses:
         assert type(np.dtype).__module__ == "numpy"
         assert np.dtype._abstract
 
+    def test_is_numeric(self):
+        all_codes = set(np.typecodes['All'])
+        numeric_codes = set(np.typecodes['AllInteger'] +
+                            np.typecodes['AllFloat'] + '?')
+        non_numeric_codes = all_codes - numeric_codes
+
+        for code in numeric_codes:
+            assert type(np.dtype(code))._is_numeric
+
+        for code in non_numeric_codes:
+            assert not type(np.dtype(code))._is_numeric
+
+    @pytest.mark.parametrize("int_", ["UInt", "Int"])
+    @pytest.mark.parametrize("size", [8, 16, 32, 64])
+    def test_integer_alias_names(self, int_, size):
+        DType = getattr(numpy.dtypes, f"{int_}{size}DType")
+        sctype = getattr(numpy, f"{int_.lower()}{size}")
+        assert DType.type is sctype
+        assert DType.__name__.lower().removesuffix("dtype") == sctype.__name__
+
+    @pytest.mark.parametrize("name",
+            ["Half", "Float", "Double", "CFloat", "CDouble"])
+    def test_float_alias_names(self, name):
+        with pytest.raises(AttributeError):
+            getattr(numpy.dtypes, name + "DType") is numpy.dtypes.Float16DType
+
 
 class TestFromCTypes:
 
@@ -1776,7 +1861,6 @@ class TestUserDType:
             create_custom_field_dtype(blueprint, mytype, 2)
 
 
-@pytest.mark.skipif(sys.version_info < (3, 9), reason="Requires python 3.9")
 class TestClassGetItem:
     def test_dtype(self) -> None:
         alias = np.dtype[Any]
@@ -1809,10 +1893,3 @@ def test_result_type_integers_and_unitless_timedelta64():
     td = np.timedelta64(4)
     result = np.result_type(0, td)
     assert_dtype_equal(result, td.dtype)
-
-
-@pytest.mark.skipif(sys.version_info >= (3, 9), reason="Requires python 3.8")
-def test_class_getitem_38() -> None:
-    match = "Type subscription requires python >= 3.9"
-    with pytest.raises(TypeError, match=match):
-        np.dtype[Any]
diff --git a/numpy/core/tests/test_einsum.py b/numpy/core/tests/test_einsum.py
index 7c0e8d97c..3a06d119f 100644
--- a/numpy/core/tests/test_einsum.py
+++ b/numpy/core/tests/test_einsum.py
@@ -100,6 +100,69 @@ class TestEinsum:
             assert_raises(ValueError, np.einsum, "i->i", np.arange(6).reshape(-1, 1),
                           optimize=do_opt, order='d')
 
+    def test_einsum_object_errors(self):
+        # Exceptions created by object arithmetic should
+        # successfully propogate
+
+        class CustomException(Exception):
+            pass
+
+        class DestructoBox:
+
+            def __init__(self, value, destruct):
+                self._val = value
+                self._destruct = destruct
+
+            def __add__(self, other):
+                tmp = self._val + other._val
+                if tmp >= self._destruct:
+                    raise CustomException
+                else:
+                    self._val = tmp
+                    return self
+
+            def __radd__(self, other):
+                if other == 0:
+                    return self
+                else:
+                    return self.__add__(other)
+
+            def __mul__(self, other):
+                tmp = self._val * other._val
+                if tmp >= self._destruct:
+                    raise CustomException
+                else:
+                    self._val = tmp
+                    return self
+
+            def __rmul__(self, other):
+                if other == 0:
+                    return self
+                else:
+                    return self.__mul__(other)
+
+        a = np.array([DestructoBox(i, 5) for i in range(1, 10)],
+                     dtype='object').reshape(3, 3)
+
+        # raised from unbuffered_loop_nop1_ndim2
+        assert_raises(CustomException, np.einsum, "ij->i", a)
+
+        # raised from unbuffered_loop_nop1_ndim3
+        b = np.array([DestructoBox(i, 100) for i in range(0, 27)],
+                     dtype='object').reshape(3, 3, 3)
+        assert_raises(CustomException, np.einsum, "i...k->...", b)
+
+        # raised from unbuffered_loop_nop2_ndim2
+        b = np.array([DestructoBox(i, 55) for i in range(1, 4)],
+                     dtype='object')
+        assert_raises(CustomException, np.einsum, "ij, j", a, b)
+
+        # raised from unbuffered_loop_nop2_ndim3
+        assert_raises(CustomException, np.einsum, "ij, jh", a, a)
+
+        # raised from PyArray_EinsteinSum
+        assert_raises(CustomException, np.einsum, "ij->", a)
+
     def test_einsum_views(self):
         # pass-through
         for do_opt in [True, False]:
@@ -247,47 +310,50 @@ class TestEinsum:
         # sum(a, axis=-1)
         for n in range(1, 17):
             a = np.arange(n, dtype=dtype)
-            assert_equal(np.einsum("i->", a, optimize=do_opt),
-                         np.sum(a, axis=-1).astype(dtype))
-            assert_equal(np.einsum(a, [0], [], optimize=do_opt),
-                         np.sum(a, axis=-1).astype(dtype))
+            b = np.sum(a, axis=-1)
+            if hasattr(b, 'astype'):
+                b = b.astype(dtype)
+            assert_equal(np.einsum("i->", a, optimize=do_opt), b)
+            assert_equal(np.einsum(a, [0], [], optimize=do_opt), b)
 
         for n in range(1, 17):
             a = np.arange(2*3*n, dtype=dtype).reshape(2, 3, n)
-            assert_equal(np.einsum("...i->...", a, optimize=do_opt),
-                         np.sum(a, axis=-1).astype(dtype))
-            assert_equal(np.einsum(a, [Ellipsis, 0], [Ellipsis], optimize=do_opt),
-                         np.sum(a, axis=-1).astype(dtype))
+            b = np.sum(a, axis=-1)
+            if hasattr(b, 'astype'):
+                b = b.astype(dtype)
+            assert_equal(np.einsum("...i->...", a, optimize=do_opt), b)
+            assert_equal(np.einsum(a, [Ellipsis, 0], [Ellipsis], optimize=do_opt), b)
 
         # sum(a, axis=0)
         for n in range(1, 17):
             a = np.arange(2*n, dtype=dtype).reshape(2, n)
-            assert_equal(np.einsum("i...->...", a, optimize=do_opt),
-                         np.sum(a, axis=0).astype(dtype))
-            assert_equal(np.einsum(a, [0, Ellipsis], [Ellipsis], optimize=do_opt),
-                         np.sum(a, axis=0).astype(dtype))
+            b = np.sum(a, axis=0)
+            if hasattr(b, 'astype'):
+                b = b.astype(dtype)
+            assert_equal(np.einsum("i...->...", a, optimize=do_opt), b)
+            assert_equal(np.einsum(a, [0, Ellipsis], [Ellipsis], optimize=do_opt), b)
 
         for n in range(1, 17):
             a = np.arange(2*3*n, dtype=dtype).reshape(2, 3, n)
-            assert_equal(np.einsum("i...->...", a, optimize=do_opt),
-                         np.sum(a, axis=0).astype(dtype))
-            assert_equal(np.einsum(a, [0, Ellipsis], [Ellipsis], optimize=do_opt),
-                         np.sum(a, axis=0).astype(dtype))
+            b = np.sum(a, axis=0)
+            if hasattr(b, 'astype'):
+                b = b.astype(dtype)
+            assert_equal(np.einsum("i...->...", a, optimize=do_opt), b)
+            assert_equal(np.einsum(a, [0, Ellipsis], [Ellipsis], optimize=do_opt), b)
 
         # trace(a)
         for n in range(1, 17):
             a = np.arange(n*n, dtype=dtype).reshape(n, n)
-            assert_equal(np.einsum("ii", a, optimize=do_opt),
-                         np.trace(a).astype(dtype))
-            assert_equal(np.einsum(a, [0, 0], optimize=do_opt),
-                         np.trace(a).astype(dtype))
+            b = np.trace(a)
+            if hasattr(b, 'astype'):
+                b = b.astype(dtype)
+            assert_equal(np.einsum("ii", a, optimize=do_opt), b)
+            assert_equal(np.einsum(a, [0, 0], optimize=do_opt), b)
 
             # gh-15961: should accept numpy int64 type in subscript list
             np_array = np.asarray([0, 0])
-            assert_equal(np.einsum(a, np_array, optimize=do_opt),
-                         np.trace(a).astype(dtype))
-            assert_equal(np.einsum(a, list(np_array), optimize=do_opt),
-                         np.trace(a).astype(dtype))
+            assert_equal(np.einsum(a, np_array, optimize=do_opt), b)
+            assert_equal(np.einsum(a, list(np_array), optimize=do_opt), b)
 
         # multiply(a, b)
         assert_equal(np.einsum("..., ...", 3, 4), 12)  # scalar case
@@ -489,11 +555,15 @@ class TestEinsum:
 
         b = np.einsum("i->", a, dtype=dtype, casting='unsafe')
         assert_equal(b, np.sum(a))
-        assert_equal(b.dtype, np.dtype(dtype))
+        if hasattr(b, "dtype"):
+            # Can be a python object when dtype is object
+            assert_equal(b.dtype, np.dtype(dtype))
 
         b = np.einsum(a, [0], [], dtype=dtype, casting='unsafe')
         assert_equal(b, np.sum(a))
-        assert_equal(b.dtype, np.dtype(dtype))
+        if hasattr(b, "dtype"):
+            # Can be a python object when dtype is object
+            assert_equal(b.dtype, np.dtype(dtype))
 
         # A case which was failing (ticket #1885)
         p = np.arange(2) + 1
@@ -587,6 +657,10 @@ class TestEinsum:
     def test_einsum_sums_clongdouble(self):
         self.check_einsum_sums(np.clongdouble)
 
+    def test_einsum_sums_object(self):
+        self.check_einsum_sums('object')
+        self.check_einsum_sums('object', True)
+
     def test_einsum_misc(self):
         # This call used to crash because of a bug in
         # PyArray_AssignZero
@@ -625,6 +699,21 @@ class TestEinsum:
         # see issue gh-15776 and issue gh-15256
         assert_equal(np.einsum('i,j', [1], [2], out=None), [[2]])
 
+    def test_object_loop(self):
+
+        class Mult:
+            def __mul__(self, other):
+                return 42
+
+        objMult = np.array([Mult()])
+        objNULL = np.ndarray(buffer = b'\0' * np.intp(0).itemsize, shape=1, dtype=object)
+
+        with pytest.raises(TypeError):
+            np.einsum("i,j", [1], objNULL)
+        with pytest.raises(TypeError):
+            np.einsum("i,j", objNULL, [1])
+        assert np.einsum("i,j", objMult, objMult) == 42
+
     def test_subscript_range(self):
         # Issue #7741, make sure that all letters of Latin alphabet (both uppercase & lowercase) can be used
         # when creating a subscript from arrays
@@ -755,7 +844,7 @@ class TestEinsum:
         # Test originally added to cover broken float16 path: gh-20305
         # Likely most are covered elsewhere, at least partially.
         dtype = np.dtype(dtype)
-        # Simple test, designed to excersize most specialized code paths,
+        # Simple test, designed to exercise most specialized code paths,
         # note the +0.5 for floats.  This makes sure we use a float value
         # where the results must be exact.
         arr = (np.arange(7) + 0.5).astype(dtype)
diff --git a/numpy/core/tests/test_errstate.py b/numpy/core/tests/test_errstate.py
index 184a37300..3a5647f6f 100644
--- a/numpy/core/tests/test_errstate.py
+++ b/numpy/core/tests/test_errstate.py
@@ -2,7 +2,7 @@ import pytest
 import sysconfig
 
 import numpy as np
-from numpy.testing import assert_, assert_raises
+from numpy.testing import assert_, assert_raises, IS_WASM
 
 # The floating point emulation on ARM EABI systems lacking a hardware FPU is
 # known to be buggy. This is an attempt to identify these hosts. It may not
@@ -12,6 +12,7 @@ hosttype = sysconfig.get_config_var('HOST_GNU_TYPE')
 arm_softfloat = False if hosttype is None else hosttype.endswith('gnueabi')
 
 class TestErrstate:
+    @pytest.mark.skipif(IS_WASM, reason="fp errors don't work in wasm")
     @pytest.mark.skipif(arm_softfloat,
                         reason='platform/cpu issue with FPU (gh-413,-15562)')
     def test_invalid(self):
@@ -24,6 +25,7 @@ class TestErrstate:
             with assert_raises(FloatingPointError):
                 np.sqrt(a)
 
+    @pytest.mark.skipif(IS_WASM, reason="fp errors don't work in wasm")
     @pytest.mark.skipif(arm_softfloat,
                         reason='platform/cpu issue with FPU (gh-15562)')
     def test_divide(self):
diff --git a/numpy/core/tests/test_function_base.py b/numpy/core/tests/test_function_base.py
index dad7a5883..79f1ecfc9 100644
--- a/numpy/core/tests/test_function_base.py
+++ b/numpy/core/tests/test_function_base.py
@@ -1,3 +1,4 @@
+import pytest
 from numpy import (
     logspace, linspace, geomspace, dtype, array, sctypes, arange, isnan,
     ndarray, sqrt, nextafter, stack, errstate
@@ -65,6 +66,33 @@ class TestLogspace:
         t5 = logspace(start, stop, 6, axis=-1)
         assert_equal(t5, t2.T)
 
+    @pytest.mark.parametrize("axis", [0, 1, -1])
+    def test_base_array(self, axis: int):
+        start = 1
+        stop = 2
+        num = 6
+        base = array([1, 2])
+        t1 = logspace(start, stop, num=num, base=base, axis=axis)
+        t2 = stack(
+            [logspace(start, stop, num=num, base=_base) for _base in base],
+            axis=(axis + 1) % t1.ndim,
+        )
+        assert_equal(t1, t2)
+
+    @pytest.mark.parametrize("axis", [0, 1, -1])
+    def test_stop_base_array(self, axis: int):
+        start = 1
+        stop = array([2, 3])
+        num = 6
+        base = array([1, 2])
+        t1 = logspace(start, stop, num=num, base=base, axis=axis)
+        t2 = stack(
+            [logspace(start, _stop, num=num, base=_base)
+             for _stop, _base in zip(stop, base)],
+            axis=(axis + 1) % t1.ndim,
+        )
+        assert_equal(t1, t2)
+
     def test_dtype(self):
         y = logspace(0, 6, dtype='float32')
         assert_equal(y.dtype, dtype('float32'))
@@ -407,3 +435,12 @@ class TestLinspace:
         y = linspace(-1, 3, num=8, dtype=int)
         t = array([-1, -1, 0, 0, 1, 1, 2, 3], dtype=int)
         assert_array_equal(y, t)
+
+    def test_any_step_zero_and_not_mult_inplace(self):
+        # any_step_zero is True, _mult_inplace is False
+        start = array([0.0, 1.0])
+        stop = array([2.0, 1.0])
+        y = linspace(start, stop, 3)
+        assert_array_equal(y, array([[0.0, 1.0], [1.0, 1.0], [2.0, 1.0]]))
+    
+
diff --git a/numpy/core/tests/test_getlimits.py b/numpy/core/tests/test_getlimits.py
index b8aaba386..63217c38c 100644
--- a/numpy/core/tests/test_getlimits.py
+++ b/numpy/core/tests/test_getlimits.py
@@ -3,6 +3,7 @@
 """
 import warnings
 import numpy as np
+import pytest
 from numpy.core import finfo, iinfo
 from numpy import half, single, double, longdouble
 from numpy.testing import assert_equal, assert_, assert_raises
@@ -40,20 +41,38 @@ class TestLongdouble:
         ftype2 = finfo(longdouble)
         assert_equal(id(ftype), id(ftype2))
 
+def assert_finfo_equal(f1, f2):
+    # assert two finfo instances have the same attributes
+    for attr in ('bits', 'eps', 'epsneg', 'iexp', 'machep',
+                 'max', 'maxexp', 'min', 'minexp', 'negep', 'nexp',
+                 'nmant', 'precision', 'resolution', 'tiny',
+                 'smallest_normal', 'smallest_subnormal'):
+        assert_equal(getattr(f1, attr), getattr(f2, attr),
+                     f'finfo instances {f1} and {f2} differ on {attr}')
+
+def assert_iinfo_equal(i1, i2):
+    # assert two iinfo instances have the same attributes
+    for attr in ('bits', 'min', 'max'):
+        assert_equal(getattr(i1, attr), getattr(i2, attr),
+                     f'iinfo instances {i1} and {i2} differ on {attr}')
+
 class TestFinfo:
     def test_basic(self):
         dts = list(zip(['f2', 'f4', 'f8', 'c8', 'c16'],
                        [np.float16, np.float32, np.float64, np.complex64,
                         np.complex128]))
         for dt1, dt2 in dts:
-            for attr in ('bits', 'eps', 'epsneg', 'iexp', 'machep',
-                         'max', 'maxexp', 'min', 'minexp', 'negep', 'nexp',
-                         'nmant', 'precision', 'resolution', 'tiny',
-                         'smallest_normal', 'smallest_subnormal'):
-                assert_equal(getattr(finfo(dt1), attr),
-                             getattr(finfo(dt2), attr), attr)
+            assert_finfo_equal(finfo(dt1), finfo(dt2))
+
         assert_raises(ValueError, finfo, 'i4')
 
+    def test_regression_gh23108(self):
+        # np.float32(1.0) and np.float64(1.0) have the same hash and are
+        # equal under the == operator
+        f1 = np.finfo(np.float32(1.0))
+        f2 = np.finfo(np.float64(1.0))
+        assert f1 != f2
+
 class TestIinfo:
     def test_basic(self):
         dts = list(zip(['i1', 'i2', 'i4', 'i8',
@@ -61,9 +80,8 @@ class TestIinfo:
                   [np.int8, np.int16, np.int32, np.int64,
                    np.uint8, np.uint16, np.uint32, np.uint64]))
         for dt1, dt2 in dts:
-            for attr in ('bits', 'min', 'max'):
-                assert_equal(getattr(iinfo(dt1), attr),
-                             getattr(iinfo(dt2), attr), attr)
+            assert_iinfo_equal(iinfo(dt1), iinfo(dt2))
+
         assert_raises(ValueError, iinfo, 'f4')
 
     def test_unsigned_max(self):
@@ -85,8 +103,28 @@ class TestRepr:
 
 
 def test_instances():
-    iinfo(10)
-    finfo(3.0)
+    # Test the finfo and iinfo results on numeric instances agree with
+    # the results on the corresponding types
+
+    for c in [int, np.int16, np.int32, np.int64]:
+        class_iinfo = iinfo(c)
+        instance_iinfo = iinfo(c(12))
+
+        assert_iinfo_equal(class_iinfo, instance_iinfo)
+
+    for c in [float, np.float16, np.float32, np.float64]:
+        class_finfo = finfo(c)
+        instance_finfo = finfo(c(1.2))
+        assert_finfo_equal(class_finfo, instance_finfo)
+
+    with pytest.raises(ValueError):
+        iinfo(10.)
+
+    with pytest.raises(ValueError):
+        iinfo('hi')
+
+    with pytest.raises(ValueError):
+        finfo(np.int64(1))
 
 
 def assert_ma_equal(discovered, ma_like):
diff --git a/numpy/core/tests/test_half.py b/numpy/core/tests/test_half.py
index 2205cc80c..ca849ad52 100644
--- a/numpy/core/tests/test_half.py
+++ b/numpy/core/tests/test_half.py
@@ -3,7 +3,7 @@ import pytest
 
 import numpy as np
 from numpy import uint16, float16, float32, float64
-from numpy.testing import assert_, assert_equal, _OLD_PROMOTION
+from numpy.testing import assert_, assert_equal, _OLD_PROMOTION, IS_WASM
 
 
 def assert_raises_fpe(strmatch, callable, *args, **kwargs):
@@ -483,6 +483,8 @@ class TestHalf:
 
     @pytest.mark.skipif(platform.machine() == "armv5tel",
                         reason="See gh-413.")
+    @pytest.mark.skipif(IS_WASM,
+                        reason="fp exceptions don't work in wasm.")
     def test_half_fpe(self):
         with np.errstate(all='raise'):
             sx16 = np.array((1e-4,), dtype=float16)
diff --git a/numpy/core/tests/test_indexing.py b/numpy/core/tests/test_indexing.py
index c8e52679f..042936702 100644
--- a/numpy/core/tests/test_indexing.py
+++ b/numpy/core/tests/test_indexing.py
@@ -10,7 +10,7 @@ from numpy.core._multiarray_tests import array_indexing
 from itertools import product
 from numpy.testing import (
     assert_, assert_equal, assert_raises, assert_raises_regex,
-    assert_array_equal, assert_warns, HAS_REFCOUNT,
+    assert_array_equal, assert_warns, HAS_REFCOUNT, IS_WASM
     )
 
 
@@ -563,6 +563,7 @@ class TestIndexing:
         with pytest.raises(IndexError):
             arr[(index,) * num] = 1.
 
+    @pytest.mark.skipif(IS_WASM, reason="no threading")
     def test_structured_advanced_indexing(self):
         # Test that copyswap(n) used by integer array indexing is threadsafe
         # for structured datatypes, see gh-15387. This test can behave randomly.
@@ -1061,7 +1062,7 @@ class TestMultiIndexingAutomated:
                         if np.any(_indx >= _size) or np.any(_indx < -_size):
                                 raise IndexError
                 if len(indx[1:]) == len(orig_slice):
-                    if np.product(orig_slice) == 0:
+                    if np.prod(orig_slice) == 0:
                         # Work around for a crash or IndexError with 'wrap'
                         # in some 0-sized cases.
                         try:
diff --git a/numpy/core/tests/test_item_selection.py b/numpy/core/tests/test_item_selection.py
index 3c35245a3..5660ef583 100644
--- a/numpy/core/tests/test_item_selection.py
+++ b/numpy/core/tests/test_item_selection.py
@@ -1,5 +1,7 @@
 import sys
 
+import pytest
+
 import numpy as np
 from numpy.testing import (
     assert_, assert_raises, assert_array_equal, HAS_REFCOUNT
@@ -84,3 +86,80 @@ class TestTake:
 
         b = np.array([0, 1, 2, 3, 4, 5])
         assert_array_equal(a, b)
+
+
+class TestPutMask:
+    @pytest.mark.parametrize("dtype", list(np.typecodes["All"]) + ["i,O"])
+    def test_simple(self, dtype):
+        if dtype.lower() == "m":
+            dtype += "8[ns]"
+
+        # putmask is weird and doesn't care about value length (even shorter)
+        vals = np.arange(1001).astype(dtype=dtype)
+
+        mask = np.random.randint(2, size=1000).astype(bool)
+        # Use vals.dtype in case of flexible dtype (i.e. string)
+        arr = np.zeros(1000, dtype=vals.dtype)
+        zeros = arr.copy()
+
+        np.putmask(arr, mask, vals)
+        assert_array_equal(arr[mask], vals[:len(mask)][mask])
+        assert_array_equal(arr[~mask], zeros[~mask])
+
+    @pytest.mark.parametrize("dtype", list(np.typecodes["All"])[1:] + ["i,O"])
+    @pytest.mark.parametrize("mode", ["raise", "wrap", "clip"])
+    def test_empty(self, dtype, mode):
+        arr = np.zeros(1000, dtype=dtype)
+        arr_copy = arr.copy()
+        mask = np.random.randint(2, size=1000).astype(bool)
+
+        # Allowing empty values like this is weird...
+        np.put(arr, mask, [])
+        assert_array_equal(arr, arr_copy)
+
+
+class TestPut:
+    @pytest.mark.parametrize("dtype", list(np.typecodes["All"])[1:] + ["i,O"])
+    @pytest.mark.parametrize("mode", ["raise", "wrap", "clip"])
+    def test_simple(self, dtype, mode):
+        if dtype.lower() == "m":
+            dtype += "8[ns]"
+
+        # put is weird and doesn't care about value length (even shorter)
+        vals = np.arange(1001).astype(dtype=dtype)
+
+        # Use vals.dtype in case of flexible dtype (i.e. string)
+        arr = np.zeros(1000, dtype=vals.dtype)
+        zeros = arr.copy()
+
+        if mode == "clip":
+            # Special because 0 and -1 value are "reserved" for clip test
+            indx = np.random.permutation(len(arr) - 2)[:-500] + 1
+
+            indx[-1] = 0
+            indx[-2] = len(arr) - 1
+            indx_put = indx.copy()
+            indx_put[-1] = -1389
+            indx_put[-2] = 1321
+        else:
+            # Avoid duplicates (for simplicity) and fill half only
+            indx = np.random.permutation(len(arr) - 3)[:-500]
+            indx_put = indx
+            if mode == "wrap":
+                indx_put = indx_put + len(arr)
+
+        np.put(arr, indx_put, vals, mode=mode)
+        assert_array_equal(arr[indx], vals[:len(indx)])
+        untouched = np.ones(len(arr), dtype=bool)
+        untouched[indx] = False
+        assert_array_equal(arr[untouched], zeros[:untouched.sum()])
+
+    @pytest.mark.parametrize("dtype", list(np.typecodes["All"])[1:] + ["i,O"])
+    @pytest.mark.parametrize("mode", ["raise", "wrap", "clip"])
+    def test_empty(self, dtype, mode):
+        arr = np.zeros(1000, dtype=dtype)
+        arr_copy = arr.copy()
+
+        # Allowing empty values like this is weird...
+        np.put(arr, [1, 2, 3], [])
+        assert_array_equal(arr, arr_copy)
diff --git a/numpy/core/tests/test_limited_api.py b/numpy/core/tests/test_limited_api.py
index 3f9bf346c..725de19bd 100644
--- a/numpy/core/tests/test_limited_api.py
+++ b/numpy/core/tests/test_limited_api.py
@@ -5,7 +5,10 @@ import sys
 import sysconfig
 import pytest
 
+from numpy.testing import IS_WASM
 
+
+@pytest.mark.skipif(IS_WASM, reason="Can't start subprocess")
 @pytest.mark.xfail(
     sysconfig.get_config_var("Py_DEBUG"),
     reason=(
diff --git a/numpy/core/tests/test_longdouble.py b/numpy/core/tests/test_longdouble.py
index 1a54e62d8..45721950c 100644
--- a/numpy/core/tests/test_longdouble.py
+++ b/numpy/core/tests/test_longdouble.py
@@ -1,10 +1,11 @@
 import warnings
+import platform
 import pytest
 
 import numpy as np
 from numpy.testing import (
     assert_, assert_equal, assert_raises, assert_warns, assert_array_equal,
-    temppath,
+    temppath, IS_MUSL
     )
 from numpy.core.tests._locales import CommaDecimalPointLocale
 
@@ -30,6 +31,10 @@ def test_scalar_extraction():
 # 0.1 not exactly representable in base 2 floating point.
 repr_precision = len(repr(np.longdouble(0.1)))
 # +2 from macro block starting around line 842 in scalartypes.c.src.
+
+
+@pytest.mark.skipif(IS_MUSL,
+                    reason="test flaky on musllinux")
 @pytest.mark.skipif(LD_INFO.precision + 2 >= repr_precision,
                     reason="repr precision not enough to show eps")
 def test_repr_roundtrip():
@@ -142,7 +147,7 @@ class TestFileBased:
 
     def test_fromfile_bogus(self):
         with temppath() as path:
-            with open(path, 'wt') as f:
+            with open(path, 'w') as f:
                 f.write("1. 2. 3. flop 4.\n")
 
             with assert_warns(DeprecationWarning):
@@ -153,7 +158,7 @@ class TestFileBased:
         for ctype in ["complex", "cdouble", "cfloat"]:
             # Check spacing between separator and only real component specified
             with temppath() as path:
-                with open(path, 'wt') as f:
+                with open(path, 'w') as f:
                     f.write("1, 2 ,  3  ,4\n")
 
                 res = np.fromfile(path, dtype=ctype, sep=",")
@@ -161,7 +166,7 @@ class TestFileBased:
 
             # Real component not specified
             with temppath() as path:
-                with open(path, 'wt') as f:
+                with open(path, 'w') as f:
                     f.write("1j, -2j,  3j, 4e1j\n")
 
                 res = np.fromfile(path, dtype=ctype, sep=",")
@@ -169,7 +174,7 @@ class TestFileBased:
 
             # Both components specified
             with temppath() as path:
-                with open(path, 'wt') as f:
+                with open(path, 'w') as f:
                     f.write("1+1j,2-2j, -3+3j,  -4e1+4j\n")
 
                 res = np.fromfile(path, dtype=ctype, sep=",")
@@ -177,7 +182,7 @@ class TestFileBased:
 
             # Spaces at wrong places
             with temppath() as path:
-                with open(path, 'wt') as f:
+                with open(path, 'w') as f:
                     f.write("1+2 j,3\n")
 
                 with assert_warns(DeprecationWarning):
@@ -186,7 +191,7 @@ class TestFileBased:
 
             # Spaces at wrong places
             with temppath() as path:
-                with open(path, 'wt') as f:
+                with open(path, 'w') as f:
                     f.write("1+ 2j,3\n")
 
                 with assert_warns(DeprecationWarning):
@@ -195,7 +200,7 @@ class TestFileBased:
 
             # Spaces at wrong places
             with temppath() as path:
-                with open(path, 'wt') as f:
+                with open(path, 'w') as f:
                     f.write("1 +2j,3\n")
 
                 with assert_warns(DeprecationWarning):
@@ -204,7 +209,7 @@ class TestFileBased:
 
             # Spaces at wrong places
             with temppath() as path:
-                with open(path, 'wt') as f:
+                with open(path, 'w') as f:
                     f.write("1+j\n")
 
                 with assert_warns(DeprecationWarning):
@@ -213,7 +218,7 @@ class TestFileBased:
 
             # Spaces at wrong places
             with temppath() as path:
-                with open(path, 'wt') as f:
+                with open(path, 'w') as f:
                     f.write("1+\n")
 
                 with assert_warns(DeprecationWarning):
@@ -222,7 +227,7 @@ class TestFileBased:
 
             # Spaces at wrong places
             with temppath() as path:
-                with open(path, 'wt') as f:
+                with open(path, 'w') as f:
                     f.write("1j+1\n")
 
                 with assert_warns(DeprecationWarning):
@@ -235,7 +240,7 @@ class TestFileBased:
                         reason="Need strtold_l")
     def test_fromfile(self):
         with temppath() as path:
-            with open(path, 'wt') as f:
+            with open(path, 'w') as f:
                 f.write(self.out)
             res = np.fromfile(path, dtype=np.longdouble, sep="\n")
         assert_equal(res, self.tgt)
@@ -244,7 +249,7 @@ class TestFileBased:
                         reason="Need strtold_l")
     def test_genfromtxt(self):
         with temppath() as path:
-            with open(path, 'wt') as f:
+            with open(path, 'w') as f:
                 f.write(self.out)
             res = np.genfromtxt(path, dtype=np.longdouble)
         assert_equal(res, self.tgt)
@@ -253,7 +258,7 @@ class TestFileBased:
                         reason="Need strtold_l")
     def test_loadtxt(self):
         with temppath() as path:
-            with open(path, 'wt') as f:
+            with open(path, 'w') as f:
                 f.write(self.out)
             res = np.loadtxt(path, dtype=np.longdouble)
         assert_equal(res, self.tgt)
@@ -368,3 +373,23 @@ def test_longdouble_from_int(int_val):
     True, False])
 def test_longdouble_from_bool(bool_val):
     assert np.longdouble(bool_val) == np.longdouble(int(bool_val))
+
+
+@pytest.mark.skipif(
+    not (IS_MUSL and platform.machine() == "x86_64"),
+    reason="only need to run on musllinux_x86_64"
+)
+def test_musllinux_x86_64_signature():
+    # this test may fail if you're emulating musllinux_x86_64 on a different
+    # architecture, but should pass natively.
+    known_sigs = [b'\xcd\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xfb\xbf']
+    sig = (np.longdouble(-1.0) / np.longdouble(10.0)
+           ).newbyteorder('<').tobytes()[:10]
+    assert sig in known_sigs
+
+
+def test_eps_positive():
+    # np.finfo('g').eps should be positive on all platforms. If this isn't true
+    # then something may have gone wrong with the MachArLike, e.g. if
+    # np.core.getlimits._discovered_machar didn't work properly
+    assert np.finfo(np.longdouble).eps > 0.
diff --git a/numpy/core/tests/test_mem_overlap.py b/numpy/core/tests/test_mem_overlap.py
index d66decfda..1fd4c4d41 100644
--- a/numpy/core/tests/test_mem_overlap.py
+++ b/numpy/core/tests/test_mem_overlap.py
@@ -55,7 +55,7 @@ def _indices(ndims):
 def _check_assignment(srcidx, dstidx):
     """Check assignment arr[dstidx] = arr[srcidx] works."""
 
-    arr = np.arange(np.product(shape)).reshape(shape)
+    arr = np.arange(np.prod(shape)).reshape(shape)
 
     cpy = arr.copy()
 
diff --git a/numpy/core/tests/test_mem_policy.py b/numpy/core/tests/test_mem_policy.py
index 3dae36d5a..479f702ea 100644
--- a/numpy/core/tests/test_mem_policy.py
+++ b/numpy/core/tests/test_mem_policy.py
@@ -5,7 +5,7 @@ import pytest
 import numpy as np
 import threading
 import warnings
-from numpy.testing import extbuild, assert_warns
+from numpy.testing import extbuild, assert_warns, IS_WASM
 import sys
 
 
@@ -18,6 +18,8 @@ def get_module(tmp_path):
     """
     if sys.platform.startswith('cygwin'):
         pytest.skip('link fails on cygwin')
+    if IS_WASM:
+        pytest.skip("Can't build module inside Wasm")
     functions = [
         ("get_default_policy", "METH_NOARGS", """
              Py_INCREF(PyDataMem_DefaultHandler);
@@ -87,6 +89,7 @@ def get_module(tmp_path):
          """),
     ]
     prologue = '''
+        #define NPY_TARGET_VERSION NPY_1_22_API_VERSION
         #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
         #include <numpy/arrayobject.h>
         /*
diff --git a/numpy/core/tests/test_memmap.py b/numpy/core/tests/test_memmap.py
index 914f86f14..ad074b312 100644
--- a/numpy/core/tests/test_memmap.py
+++ b/numpy/core/tests/test_memmap.py
@@ -6,7 +6,7 @@ from pathlib import Path
 from tempfile import NamedTemporaryFile, TemporaryFile
 
 from numpy import (
-    memmap, sum, average, product, ndarray, isscalar, add, subtract, multiply)
+    memmap, sum, average, prod, ndarray, isscalar, add, subtract, multiply)
 
 from numpy import arange, allclose, asarray
 from numpy.testing import (
@@ -153,7 +153,7 @@ class TestMemmap:
 
         with suppress_warnings() as sup:
             sup.filter(FutureWarning, "np.average currently does not preserve")
-            for unary_op in [sum, average, product]:
+            for unary_op in [sum, average, prod]:
                 result = unary_op(fp)
                 assert_(isscalar(result))
                 assert_(result.__class__ is self.data[0, 0].__class__)
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index 027384fba..196c2dc13 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import collections.abc
 import tempfile
 import sys
@@ -9,6 +11,7 @@ import functools
 import ctypes
 import os
 import gc
+import re
 import weakref
 import pytest
 from contextlib import contextmanager
@@ -27,11 +30,12 @@ from numpy.testing import (
     assert_, assert_raises, assert_warns, assert_equal, assert_almost_equal,
     assert_array_equal, assert_raises_regex, assert_array_almost_equal,
     assert_allclose, IS_PYPY, IS_PYSTON, HAS_REFCOUNT, assert_array_less,
-    runstring, temppath, suppress_warnings, break_cycles,
+    runstring, temppath, suppress_warnings, break_cycles, _SUPPORTS_SVE,
     )
 from numpy.testing._private.utils import requires_memory, _no_tracing
 from numpy.core.tests._locales import CommaDecimalPointLocale
 from numpy.lib.recfunctions import repack_fields
+from numpy.core.multiarray import _get_ndarray_c_version
 
 # Need to test an object that does not fully implement math interface
 from datetime import timedelta, datetime
@@ -402,6 +406,13 @@ class TestAttributes:
         assert_array_equal(x['a'], [3.5, 3.5])
         assert_array_equal(x['b'], [-2, -2])
 
+    def test_fill_readonly(self):
+        # gh-22922
+        a = np.zeros(11)
+        a.setflags(write=False)
+        with pytest.raises(ValueError, match=".*read-only"):
+            a.fill(0)
+
 
 class TestArrayConstruction:
     def test_array(self):
@@ -1185,6 +1196,17 @@ class TestCreation:
         expected = expected * (arr.nbytes // len(expected))
         assert arr.tobytes() == expected
 
+    @pytest.mark.parametrize("func", [
+        np.array, np.asarray, np.asanyarray, np.ascontiguousarray,
+        np.asfortranarray])
+    def test_creation_from_dtypemeta(self, func):
+        dtype = np.dtype('i')
+        arr1 = func([1, 2, 3], dtype=dtype)
+        arr2 = func([1, 2, 3], dtype=type(dtype))
+        assert_array_equal(arr1, arr2)
+        assert arr2.dtype == dtype
+
+
 class TestStructured:
     def test_subarray_field_access(self):
         a = np.zeros((3, 5), dtype=[('a', ('i4', (2, 2)))])
@@ -1257,9 +1279,9 @@ class TestStructured:
         # The main importance is that it does not return True:
         with pytest.raises(TypeError):
             x == y
- 
+
     def test_empty_structured_array_comparison(self):
-        # Check that comparison works on empty arrays with nontrivially 
+        # Check that comparison works on empty arrays with nontrivially
         # shaped fields
         a = np.zeros(0, [('a', '<f8', (1, 1))])
         assert_equal(a, a)
@@ -1270,6 +1292,13 @@ class TestStructured:
         a = np.zeros((1, 0, 1), [('a', '<f8', (1, 1))])
         assert_equal(a, a)
 
+    @pytest.mark.parametrize("op", [operator.eq, operator.ne])
+    def test_structured_array_comparison_bad_broadcasts(self, op):
+        a = np.zeros(3, dtype='i,i')
+        b = np.array([], dtype="i,i")
+        with pytest.raises(ValueError):
+            op(a, b)
+
     def test_structured_comparisons_with_promotion(self):
         # Check that structured arrays can be compared so long as their
         # dtypes promote fine:
@@ -1290,7 +1319,10 @@ class TestStructured:
         assert_equal(a == b, [False, True])
         assert_equal(a != b, [True, False])
 
-    def test_void_comparison_failures(self):
+    @pytest.mark.parametrize("op", [
+            operator.eq, lambda x, y: operator.eq(y, x),
+            operator.ne, lambda x, y: operator.ne(y, x)])
+    def test_void_comparison_failures(self, op):
         # In principle, one could decide to return an array of False for some
         # if comparisons are impossible.  But right now we return TypeError
         # when "void" dtype are involved.
@@ -1298,18 +1330,18 @@ class TestStructured:
         y = np.zeros(3)
         # Cannot compare non-structured to structured:
         with pytest.raises(TypeError):
-            x == y
+            op(x, y)
 
         # Added title prevents promotion, but casts are OK:
         y = np.zeros(3, dtype=[(('title', 'a'), 'i1')])
         assert np.can_cast(y.dtype, x.dtype)
         with pytest.raises(TypeError):
-            x == y
+            op(x, y)
 
         x = np.zeros(3, dtype="V7")
         y = np.zeros(3, dtype="V8")
         with pytest.raises(TypeError):
-            x == y
+            op(x, y)
 
     def test_casting(self):
         # Check that casting a structured array to change its byte order
@@ -1687,7 +1719,7 @@ class TestBool:
 
     @pytest.mark.xfail(reason="See gh-9847")
     def test_cast_from_unicode(self):
-        self._test_cast_from_flexible(np.unicode_)
+        self._test_cast_from_flexible(np.str_)
 
     @pytest.mark.xfail(reason="See gh-9847")
     def test_cast_from_bytes(self):
@@ -1902,8 +1934,9 @@ class TestMethods:
                 assert_array_equal(a2.prod(axis=-1),
                                    np.array([24, 1890, 600], ctype))
 
-    def test_repeat(self):
-        m = np.array([1, 2, 3, 4, 5, 6])
+    @pytest.mark.parametrize('dtype', [None, object])
+    def test_repeat(self, dtype):
+        m = np.array([1, 2, 3, 4, 5, 6], dtype=dtype)
         m_rect = m.reshape((2, 3))
 
         A = m.repeat([1, 3, 2, 1, 1, 2])
@@ -2069,7 +2102,7 @@ class TestMethods:
                 msg = 'byte-swapped complex sort, dtype={0}'.format(dt)
                 assert_equal(c, arr, msg)
 
-    @pytest.mark.parametrize('dtype', [np.bytes_, np.unicode_])
+    @pytest.mark.parametrize('dtype', [np.bytes_, np.str_])
     def test_sort_string(self, dtype):
         # np.array will perform the encoding to bytes for us in the bytes test
         a = np.array(['aaaaaaaa' + chr(i) for i in range(101)], dtype=dtype)
@@ -2097,19 +2130,26 @@ class TestMethods:
             c.sort(kind=kind)
             assert_equal(c, a, msg)
 
-    def test_sort_structured(self):
+    @pytest.mark.parametrize("dt", [
+            np.dtype([('f', float), ('i', int)]),
+            np.dtype([('f', float), ('i', object)])])
+    @pytest.mark.parametrize("step", [1, 2])
+    def test_sort_structured(self, dt, step):
         # test record array sorts.
-        dt = np.dtype([('f', float), ('i', int)])
-        a = np.array([(i, i) for i in range(101)], dtype=dt)
+        a = np.array([(i, i) for i in range(101*step)], dtype=dt)
         b = a[::-1]
         for kind in ['q', 'h', 'm']:
             msg = "kind=%s" % kind
-            c = a.copy()
+            c = a.copy()[::step]
+            indx = c.argsort(kind=kind)
             c.sort(kind=kind)
-            assert_equal(c, a, msg)
-            c = b.copy()
+            assert_equal(c, a[::step], msg)
+            assert_equal(a[::step][indx], a[::step], msg)
+            c = b.copy()[::step]
+            indx = c.argsort(kind=kind)
             c.sort(kind=kind)
-            assert_equal(c, a, msg)
+            assert_equal(c, a[step-1::step], msg)
+            assert_equal(b[::step][indx], a[step-1::step], msg)
 
     @pytest.mark.parametrize('dtype', ['datetime64[D]', 'timedelta64[D]'])
     def test_sort_time(self, dtype):
@@ -2232,7 +2272,7 @@ class TestMethods:
         assert_c(a.copy('C'))
         assert_fortran(a.copy('F'))
         assert_c(a.copy('A'))
-    
+
     @pytest.mark.parametrize("dtype", ['O', np.int32, 'i,O'])
     def test__deepcopy__(self, dtype):
         # Force the entry of NULLs into array
@@ -2343,7 +2383,7 @@ class TestMethods:
 
         # test unicode argsorts.
         s = 'aaaaaaaa'
-        a = np.array([s + chr(i) for i in range(101)], dtype=np.unicode_)
+        a = np.array([s + chr(i) for i in range(101)], dtype=np.str_)
         b = a[::-1]
         r = np.arange(101)
         rr = r[::-1]
@@ -2426,7 +2466,7 @@ class TestMethods:
         a = np.array(['aaaaaaaaa' for i in range(100)])
         assert_equal(a.argsort(kind='m'), r)
         # unicode
-        a = np.array(['aaaaaaaaa' for i in range(100)], dtype=np.unicode_)
+        a = np.array(['aaaaaaaaa' for i in range(100)], dtype=np.str_)
         assert_equal(a.argsort(kind='m'), r)
 
     def test_sort_unicode_kind(self):
@@ -2441,7 +2481,7 @@ class TestMethods:
         np.array([0, 1, np.nan]),
     ])
     def test_searchsorted_floats(self, a):
-        # test for floats arrays containing nans. Explicitly test 
+        # test for floats arrays containing nans. Explicitly test
         # half, single, and double precision floats to verify that
         # the NaN-handling is correct.
         msg = "Test real (%s) searchsorted with nans, side='l'" % a.dtype
@@ -2457,7 +2497,7 @@ class TestMethods:
         assert_equal(y, 2)
 
     def test_searchsorted_complex(self):
-        # test for complex arrays containing nans. 
+        # test for complex arrays containing nans.
         # The search sorted routines use the compare functions for the
         # array type, so this checks if that is consistent with the sort
         # order.
@@ -2479,7 +2519,7 @@ class TestMethods:
         a = np.array([0, 128], dtype='>i4')
         b = a.searchsorted(np.array(128, dtype='>i4'))
         assert_equal(b, 1, msg)
-        
+
     def test_searchsorted_n_elements(self):
         # Check 0 elements
         a = np.ones(0)
@@ -2570,7 +2610,7 @@ class TestMethods:
                       'P:\\20x_dapi_cy3\\20x_dapi_cy3_20100197_1',
                       'P:\\20x_dapi_cy3\\20x_dapi_cy3_20100198_1',
                       'P:\\20x_dapi_cy3\\20x_dapi_cy3_20100199_1'],
-                     dtype=np.unicode_)
+                     dtype=np.str_)
         ind = np.arange(len(a))
         assert_equal([a.searchsorted(v, 'left') for v in a], ind)
         assert_equal([a.searchsorted(v, 'right') for v in a], ind + 1)
@@ -3605,9 +3645,13 @@ class TestMethods:
             msg = 'dtype: {0}'.format(dt)
             ap = complex(a)
             assert_equal(ap, a, msg)
-            bp = complex(b)
+
+            with assert_warns(DeprecationWarning):
+                bp = complex(b)
             assert_equal(bp, b, msg)
-            cp = complex(c)
+
+            with assert_warns(DeprecationWarning):
+                cp = complex(c)
             assert_equal(cp, c, msg)
 
     def test__complex__should_not_work(self):
@@ -3630,7 +3674,8 @@ class TestMethods:
         assert_raises(TypeError, complex, d)
 
         e = np.array(['1+1j'], 'U')
-        assert_raises(TypeError, complex, e)
+        with assert_warns(DeprecationWarning):
+            assert_raises(TypeError, complex, e)
 
 class TestCequenceMethods:
     def test_array_contains(self):
@@ -3692,7 +3737,7 @@ class TestBinop:
             'and':      (np.bitwise_and, True, int),
             'xor':      (np.bitwise_xor, True, int),
             'or':       (np.bitwise_or, True, int),
-            'matmul':   (np.matmul, False, float),
+            'matmul':   (np.matmul, True, float),
             # 'ge':       (np.less_equal, False),
             # 'gt':       (np.less, False),
             # 'le':       (np.greater_equal, False),
@@ -4710,23 +4755,23 @@ class TestArgmax:
 
         a = np.array([1, 2**7 - 1, -2**7], dtype=np.int8)
         assert_equal(np.argmax(a), 1)
-        a.repeat(129)
-        assert_equal(np.argmax(a), 1)
+        a = a.repeat(129)
+        assert_equal(np.argmax(a), 129)
 
         a = np.array([1, 2**15 - 1, -2**15], dtype=np.int16)
         assert_equal(np.argmax(a), 1)
-        a.repeat(129)
-        assert_equal(np.argmax(a), 1)
+        a = a.repeat(129)
+        assert_equal(np.argmax(a), 129)
 
         a = np.array([1, 2**31 - 1, -2**31], dtype=np.int32)
         assert_equal(np.argmax(a), 1)
-        a.repeat(129)
-        assert_equal(np.argmax(a), 1)
+        a = a.repeat(129)
+        assert_equal(np.argmax(a), 129)
 
         a = np.array([1, 2**63 - 1, -2**63], dtype=np.int64)
         assert_equal(np.argmax(a), 1)
-        a.repeat(129)
-        assert_equal(np.argmax(a), 1)
+        a = a.repeat(129)
+        assert_equal(np.argmax(a), 129)
 
 class TestArgmin:
     usg_data = [
@@ -4852,23 +4897,23 @@ class TestArgmin:
 
         a = np.array([1, -2**7, -2**7 + 1, 2**7 - 1], dtype=np.int8)
         assert_equal(np.argmin(a), 1)
-        a.repeat(129)
-        assert_equal(np.argmin(a), 1)
+        a = a.repeat(129)
+        assert_equal(np.argmin(a), 129)
 
         a = np.array([1, -2**15, -2**15 + 1, 2**15 - 1], dtype=np.int16)
         assert_equal(np.argmin(a), 1)
-        a.repeat(129)
-        assert_equal(np.argmin(a), 1)
+        a = a.repeat(129)
+        assert_equal(np.argmin(a), 129)
 
         a = np.array([1, -2**31, -2**31 + 1, 2**31 - 1], dtype=np.int32)
         assert_equal(np.argmin(a), 1)
-        a.repeat(129)
-        assert_equal(np.argmin(a), 1)
+        a = a.repeat(129)
+        assert_equal(np.argmin(a), 129)
 
         a = np.array([1, -2**63, -2**63 + 1, 2**63 - 1], dtype=np.int64)
         assert_equal(np.argmin(a), 1)
-        a.repeat(129)
-        assert_equal(np.argmin(a), 1)
+        a = a.repeat(129)
+        assert_equal(np.argmin(a), 129)
 
 class TestMinMax:
 
@@ -5066,6 +5111,22 @@ class TestPutmask:
         with pytest.raises(ValueError):
             np.putmask(a, a >= 2, 3)
 
+    def test_kwargs(self):
+        x = np.array([0, 0])
+        np.putmask(x, [0, 1], [-1, -2])
+        assert_array_equal(x, [0, -2])
+
+        x = np.array([0, 0])
+        np.putmask(x, mask=[0, 1], values=[-1, -2])
+        assert_array_equal(x, [0, -2])
+
+        x = np.array([0, 0])
+        np.putmask(x, values=[-1, -2],  mask=[0, 1])
+        assert_array_equal(x, [0, -2])
+
+        with pytest.raises(TypeError):
+            np.putmask(a=x, values=[-1, -2],  mask=[0, 1])
+
 
 class TestTake:
     def tst_basic(self, x):
@@ -5517,33 +5578,6 @@ class TestIO:
             tmp_filename,
             dtype='<f4')
 
-    @pytest.mark.slow  # takes > 1 minute on mechanical hard drive
-    def test_big_binary(self):
-        """Test workarounds for 32-bit limit for MSVC fwrite, fseek, and ftell
-
-        These normally would hang doing something like this.
-        See : https://github.com/numpy/numpy/issues/2256
-        """
-        if sys.platform != 'win32' or '[GCC ' in sys.version:
-            return
-        try:
-            # before workarounds, only up to 2**32-1 worked
-            fourgbplus = 2**32 + 2**16
-            testbytes = np.arange(8, dtype=np.int8)
-            n = len(testbytes)
-            flike = tempfile.NamedTemporaryFile()
-            f = flike.file
-            np.tile(testbytes, fourgbplus // testbytes.nbytes).tofile(f)
-            flike.seek(0)
-            a = np.fromfile(f, dtype=np.int8)
-            flike.close()
-            assert_(len(a) == fourgbplus)
-            # check only start and end for speed:
-            assert_((a[:n] == testbytes).all())
-            assert_((a[-n:] == testbytes).all())
-        except (MemoryError, ValueError):
-            pass
-
     def test_string(self, tmp_filename):
         self._check_from(b'1,2,3,4', [1., 2., 3., 4.], tmp_filename, sep=',')
 
@@ -6196,7 +6230,7 @@ class TestStats:
     def test_mean_axis_error(self):
         # Ensure that AxisError is raised instead of IndexError when axis is
         # out of bounds, see gh-15817.
-        with assert_raises(np.core._exceptions.AxisError):
+        with assert_raises(np.exceptions.AxisError):
             np.arange(10).mean(axis=2)
 
     def test_mean_where(self):
@@ -6280,7 +6314,7 @@ class TestStats:
     def test_var_axis_error(self):
         # Ensure that AxisError is raised instead of IndexError when axis is
         # out of bounds, see gh-15817.
-        with assert_raises(np.core._exceptions.AxisError):
+        with assert_raises(np.exceptions.AxisError):
             np.arange(10).var(axis=2)
 
     def test_var_where(self):
@@ -6647,6 +6681,22 @@ class TestDot:
         r = np.empty((1024, 32), dtype=int)
         assert_raises(ValueError, dot, f, v, r)
 
+    def test_dot_out_result(self):
+        x = np.ones((), dtype=np.float16)
+        y = np.ones((5,), dtype=np.float16)
+        z = np.zeros((5,), dtype=np.float16)
+        res = x.dot(y, out=z)
+        assert np.array_equal(res, y)
+        assert np.array_equal(z, y)
+
+    def test_dot_out_aliasing(self):
+        x = np.ones((), dtype=np.float16)
+        y = np.ones((5,), dtype=np.float16)
+        z = np.zeros((5,), dtype=np.float16)
+        res = x.dot(y, out=z)
+        z[0] = 2
+        assert np.array_equal(res, z)
+
     def test_dot_array_order(self):
         a = np.array([[1, 2], [3, 4]], order='C')
         b = np.array([[1, 2], [3, 4]], order='F')
@@ -6731,6 +6781,18 @@ class TestDot:
         res = np.dot(data, data)
         assert res == 2**30+100
 
+    def test_dtype_discovery_fails(self):
+        # See gh-14247, error checking was missing for failed dtype discovery
+        class BadObject(object):
+            def __array__(self):
+                raise TypeError("just this tiny mint leaf")
+
+        with pytest.raises(TypeError):
+            np.dot(BadObject(), BadObject())
+
+        with pytest.raises(TypeError):
+            np.dot(3.0, BadObject())
+
 
 class MatmulCommon:
     """Common tests for '@' operator and numpy.matmul.
@@ -7142,16 +7204,69 @@ class TestMatmulOperator(MatmulCommon):
         assert_raises(TypeError, self.matmul, np.void(b'abc'), np.void(b'abc'))
         assert_raises(TypeError, self.matmul, np.arange(10), np.void(b'abc'))
 
-def test_matmul_inplace():
-    # It would be nice to support in-place matmul eventually, but for now
-    # we don't have a working implementation, so better just to error out
-    # and nudge people to writing "a = a @ b".
-    a = np.eye(3)
-    b = np.eye(3)
-    assert_raises(TypeError, a.__imatmul__, b)
-    import operator
-    assert_raises(TypeError, operator.imatmul, a, b)
-    assert_raises(TypeError, exec, "a @= b", globals(), locals())
+
+class TestMatmulInplace:
+    DTYPES = {}
+    for i in MatmulCommon.types:
+        for j in MatmulCommon.types:
+            if np.can_cast(j, i):
+                DTYPES[f"{i}-{j}"] = (np.dtype(i), np.dtype(j))
+
+    @pytest.mark.parametrize("dtype1,dtype2", DTYPES.values(), ids=DTYPES)
+    def test_basic(self, dtype1: np.dtype, dtype2: np.dtype) -> None:
+        a = np.arange(10).reshape(5, 2).astype(dtype1)
+        a_id = id(a)
+        b = np.ones((2, 2), dtype=dtype2)
+
+        ref = a @ b
+        a @= b
+
+        assert id(a) == a_id
+        assert a.dtype == dtype1
+        assert a.shape == (5, 2)
+        if dtype1.kind in "fc":
+            np.testing.assert_allclose(a, ref)
+        else:
+            np.testing.assert_array_equal(a, ref)
+
+    SHAPES = {
+        "2d_large": ((10**5, 10), (10, 10)),
+        "3d_large": ((10**4, 10, 10), (1, 10, 10)),
+        "1d": ((3,), (3,)),
+        "2d_1d": ((3, 3), (3,)),
+        "1d_2d": ((3,), (3, 3)),
+        "2d_broadcast": ((3, 3), (3, 1)),
+        "2d_broadcast_reverse": ((1, 3), (3, 3)),
+        "3d_broadcast1": ((3, 3, 3), (1, 3, 1)),
+        "3d_broadcast2": ((3, 3, 3), (1, 3, 3)),
+        "3d_broadcast3": ((3, 3, 3), (3, 3, 1)),
+        "3d_broadcast_reverse1": ((1, 3, 3), (3, 3, 3)),
+        "3d_broadcast_reverse2": ((3, 1, 3), (3, 3, 3)),
+        "3d_broadcast_reverse3": ((1, 1, 3), (3, 3, 3)),
+    }
+
+    @pytest.mark.parametrize("a_shape,b_shape", SHAPES.values(), ids=SHAPES)
+    def test_shapes(self, a_shape: tuple[int, ...], b_shape: tuple[int, ...]):
+        a_size = np.prod(a_shape)
+        a = np.arange(a_size).reshape(a_shape).astype(np.float64)
+        a_id = id(a)
+
+        b_size = np.prod(b_shape)
+        b = np.arange(b_size).reshape(b_shape)
+
+        ref = a @ b
+        if ref.shape != a_shape:
+            with pytest.raises(ValueError):
+                a @= b
+            return
+        else:
+            a @= b
+
+        assert id(a) == a_id
+        assert a.dtype.type == np.float64
+        assert a.shape == a_shape
+        np.testing.assert_allclose(a, ref)
+
 
 def test_matmul_axes():
     a = np.arange(3*4*5).reshape(3, 4, 5)
@@ -8647,14 +8762,16 @@ class TestConversion:
         int_funcs = (int, lambda x: x.__int__())
         for int_func in int_funcs:
             assert_equal(int_func(np.array(0)), 0)
-            assert_equal(int_func(np.array([1])), 1)
-            assert_equal(int_func(np.array([[42]])), 42)
+            with assert_warns(DeprecationWarning):
+                assert_equal(int_func(np.array([1])), 1)
+            with assert_warns(DeprecationWarning):
+                assert_equal(int_func(np.array([[42]])), 42)
             assert_raises(TypeError, int_func, np.array([1, 2]))
 
             # gh-9972
             assert_equal(4, int_func(np.array('4')))
             assert_equal(5, int_func(np.bytes_(b'5')))
-            assert_equal(6, int_func(np.unicode_('6')))
+            assert_equal(6, int_func(np.str_('6')))
 
             # The delegation of int() to __trunc__ was deprecated in
             # Python 3.11.
@@ -8663,7 +8780,8 @@ class TestConversion:
                     def __trunc__(self):
                         return 3
                 assert_equal(3, int_func(np.array(HasTrunc())))
-                assert_equal(3, int_func(np.array([HasTrunc()])))
+                with assert_warns(DeprecationWarning):
+                    assert_equal(3, int_func(np.array([HasTrunc()])))
             else:
                 pass
 
@@ -8672,8 +8790,9 @@ class TestConversion:
                     raise NotImplementedError
             assert_raises(NotImplementedError,
                 int_func, np.array(NotConvertible()))
-            assert_raises(NotImplementedError,
-                int_func, np.array([NotConvertible()]))
+            with assert_warns(DeprecationWarning):
+                assert_raises(NotImplementedError,
+                    int_func, np.array([NotConvertible()]))
 
 
 class TestWhere:
@@ -8840,6 +8959,11 @@ class TestWhere:
             result = array.nonzero()
             assert_array_equal(benchmark, result)
 
+    def test_kwargs(self):
+        a = np.zeros(1)
+        with assert_raises(TypeError):
+            np.where(a, x=a, y=a)
+
 
 if not IS_PYPY:
     # sys.getsizeof() is not valid on PyPy
@@ -9021,33 +9145,33 @@ class TestUnicodeEncoding:
     def test_assign_scalar(self):
         # gh-3258
         l = np.array(['aa', 'bb'])
-        l[:] = np.unicode_('cc')
+        l[:] = np.str_('cc')
         assert_equal(l, ['cc', 'cc'])
 
     def test_fill_scalar(self):
         # gh-7227
         l = np.array(['aa', 'bb'])
-        l.fill(np.unicode_('cc'))
+        l.fill(np.str_('cc'))
         assert_equal(l, ['cc', 'cc'])
 
 
 class TestUnicodeArrayNonzero:
 
     def test_empty_ustring_array_is_falsey(self):
-        assert_(not np.array([''], dtype=np.unicode_))
+        assert_(not np.array([''], dtype=np.str_))
 
     def test_whitespace_ustring_array_is_falsey(self):
-        a = np.array(['eggs'], dtype=np.unicode_)
+        a = np.array(['eggs'], dtype=np.str_)
         a[0] = '  \0\0'
         assert_(not a)
 
     def test_all_null_ustring_array_is_falsey(self):
-        a = np.array(['eggs'], dtype=np.unicode_)
+        a = np.array(['eggs'], dtype=np.str_)
         a[0] = '\0\0\0\0'
         assert_(not a)
 
     def test_null_inside_ustring_array_is_truthy(self):
-        a = np.array(['eggs'], dtype=np.unicode_)
+        a = np.array(['eggs'], dtype=np.str_)
         a[0] = ' \0 \0'
         assert_(a)
 
@@ -9312,6 +9436,61 @@ class TestArange:
         assert len(keyword_start_stop) == 6
         assert_array_equal(keyword_stop, keyword_zerotostop)
 
+    def test_arange_booleans(self):
+        # Arange makes some sense for booleans and works up to length 2.
+        # But it is weird since `arange(2, 4, dtype=bool)` works.
+        # Arguably, much or all of this could be deprecated/removed.
+        res = np.arange(False, dtype=bool)
+        assert_array_equal(res, np.array([], dtype="bool"))
+
+        res = np.arange(True, dtype="bool")
+        assert_array_equal(res, [False])
+
+        res = np.arange(2, dtype="bool")
+        assert_array_equal(res, [False, True])
+
+        # This case is especially weird, but drops out without special case:
+        res = np.arange(6, 8, dtype="bool")
+        assert_array_equal(res, [True, True])
+
+        with pytest.raises(TypeError):
+            np.arange(3, dtype="bool")
+
+    @pytest.mark.parametrize("dtype", ["S3", "U", "5i"])
+    def test_rejects_bad_dtypes(self, dtype):
+        dtype = np.dtype(dtype)
+        DType_name = re.escape(str(type(dtype)))
+        with pytest.raises(TypeError,
+                match=rf"arange\(\) not supported for inputs .* {DType_name}"):
+            np.arange(2, dtype=dtype)
+
+    def test_rejects_strings(self):
+        # Explicitly test error for strings which may call "b" - "a":
+        DType_name = re.escape(str(type(np.array("a").dtype)))
+        with pytest.raises(TypeError,
+                match=rf"arange\(\) not supported for inputs .* {DType_name}"):
+            np.arange("a", "b")
+
+    def test_byteswapped(self):
+        res_be = np.arange(1, 1000, dtype=">i4")
+        res_le = np.arange(1, 1000, dtype="<i4")
+        assert res_be.dtype == ">i4"
+        assert res_le.dtype == "<i4"
+        assert_array_equal(res_le, res_be)
+
+    @pytest.mark.parametrize("which", [0, 1, 2])
+    def test_error_paths_and_promotion(self, which):
+        args = [0, 1, 2]  # start, stop, and step
+        args[which] = np.float64(2.)  # should ensure float64 output
+
+        assert np.arange(*args).dtype == np.float64
+
+        # Cover stranger error path, test only to achieve code coverage!
+        args[which] = [None, []]
+        with pytest.raises(ValueError):
+            # Fails discovering start dtype
+            np.arange(*args)
+
 
 class TestArrayFinalize:
     """ Tests __array_finalize__ """
@@ -9425,6 +9604,105 @@ def test_equal_override():
         assert_equal(array != my_always_equal, 'ne')
 
 
+@pytest.mark.parametrize("op", [operator.eq, operator.ne])
+@pytest.mark.parametrize(["dt1", "dt2"], [
+        ([("f", "i")], [("f", "i")]),  # structured comparison (successful)
+        ("M8", "d"),  # impossible comparison: result is all True or False
+        ("d", "d"),  # valid comparison
+        ])
+def test_equal_subclass_no_override(op, dt1, dt2):
+    # Test how the three different possible code-paths deal with subclasses
+
+    class MyArr(np.ndarray):
+        called_wrap = 0
+
+        def __array_wrap__(self, new):
+            type(self).called_wrap += 1
+            return super().__array_wrap__(new)
+
+    numpy_arr = np.zeros(5, dtype=dt1)
+    my_arr = np.zeros(5, dtype=dt2).view(MyArr)
+
+    assert type(op(numpy_arr, my_arr)) is MyArr
+    assert type(op(my_arr, numpy_arr)) is MyArr
+    # We expect 2 calls (more if there were more fields):
+    assert MyArr.called_wrap == 2
+
+
+@pytest.mark.parametrize(["dt1", "dt2"], [
+        ("M8[ns]", "d"),
+        ("M8[s]", "l"),
+        ("m8[ns]", "d"),
+        # Missing: ("m8[ns]", "l") as timedelta currently promotes ints
+        ("M8[s]", "m8[s]"),
+        ("S5", "U5"),
+        # Structured/void dtypes have explicit paths not tested here.
+])
+def test_no_loop_gives_all_true_or_false(dt1, dt2):
+    # Make sure they broadcast to test result shape, use random values, since
+    # the actual value should be ignored
+    arr1 = np.random.randint(5, size=100).astype(dt1)
+    arr2 = np.random.randint(5, size=99)[:, np.newaxis].astype(dt2)
+
+    res = arr1 == arr2
+    assert res.shape == (99, 100)
+    assert res.dtype == bool
+    assert not res.any()
+
+    res = arr1 != arr2
+    assert res.shape == (99, 100)
+    assert res.dtype == bool
+    assert res.all()
+
+    # incompatible shapes raise though
+    arr2 = np.random.randint(5, size=99).astype(dt2)
+    with pytest.raises(ValueError):
+        arr1 == arr2
+
+    with pytest.raises(ValueError):
+        arr1 != arr2
+
+    # Basic test with another operation:
+    with pytest.raises(np.core._exceptions._UFuncNoLoopError):
+        arr1 > arr2
+
+
+@pytest.mark.parametrize("op", [
+        operator.eq, operator.ne, operator.le, operator.lt, operator.ge,
+        operator.gt])
+def test_comparisons_forwards_error(op):
+    class NotArray:
+        def __array__(self):
+            raise TypeError("run you fools")
+
+    with pytest.raises(TypeError, match="run you fools"):
+        op(np.arange(2), NotArray())
+
+    with pytest.raises(TypeError, match="run you fools"):
+        op(NotArray(), np.arange(2))
+
+
+def test_richcompare_scalar_boolean_singleton_return():
+    # These are currently guaranteed to be the boolean singletons, but maybe
+    # returning NumPy booleans would also be OK:
+    assert (np.array(0) == "a") is False
+    assert (np.array(0) != "a") is True
+    assert (np.int16(0) == "a") is False
+    assert (np.int16(0) != "a") is True
+
+
+@pytest.mark.parametrize("op", [
+        operator.eq, operator.ne, operator.le, operator.lt, operator.ge,
+        operator.gt])
+def test_ragged_comparison_fails(op):
+    # This needs to convert the internal array to True/False, which fails:
+    a = np.array([1, np.array([1, 2, 3])], dtype=object)
+    b = np.array([1, np.array([1, 2, 3])], dtype=object)
+
+    with pytest.raises(ValueError, match="The truth value.*ambiguous"):
+        op(a, b)
+
+
 @pytest.mark.parametrize(
     ["fun", "npfun"],
     [
@@ -9651,40 +9929,50 @@ class TestViewDtype:
         assert_array_equal(x.view('<i2'), expected)
 
 
+@pytest.mark.xfail(_SUPPORTS_SVE, reason="gh-22982")
 # Test various array sizes that hit different code paths in quicksort-avx512
-@pytest.mark.parametrize("N", [8, 16, 24, 32, 48, 64, 96, 128, 151, 191,
-                               256, 383, 512, 1023, 2047])
-def test_sort_float(N):
+@pytest.mark.parametrize("N", np.arange(1, 512))
+@pytest.mark.parametrize("dtype", ['e', 'f', 'd'])
+def test_sort_float(N, dtype):
     # Regular data with nan sprinkled
     np.random.seed(42)
-    arr = -0.5 + np.random.sample(N).astype('f')
+    arr = -0.5 + np.random.sample(N).astype(dtype)
     arr[np.random.choice(arr.shape[0], 3)] = np.nan
     assert_equal(np.sort(arr, kind='quick'), np.sort(arr, kind='heap'))
 
     # (2) with +INF
-    infarr = np.inf*np.ones(N, dtype='f')
+    infarr = np.inf*np.ones(N, dtype=dtype)
     infarr[np.random.choice(infarr.shape[0], 5)] = -1.0
     assert_equal(np.sort(infarr, kind='quick'), np.sort(infarr, kind='heap'))
 
     # (3) with -INF
-    neginfarr = -np.inf*np.ones(N, dtype='f')
+    neginfarr = -np.inf*np.ones(N, dtype=dtype)
     neginfarr[np.random.choice(neginfarr.shape[0], 5)] = 1.0
     assert_equal(np.sort(neginfarr, kind='quick'),
                  np.sort(neginfarr, kind='heap'))
 
     # (4) with +/-INF
-    infarr = np.inf*np.ones(N, dtype='f')
+    infarr = np.inf*np.ones(N, dtype=dtype)
     infarr[np.random.choice(infarr.shape[0], (int)(N/2))] = -np.inf
     assert_equal(np.sort(infarr, kind='quick'), np.sort(infarr, kind='heap'))
 
-
-def test_sort_int():
-    # Random data with NPY_MAX_INT32 and NPY_MIN_INT32 sprinkled
-    rng = np.random.default_rng(42)
-    N = 2047
-    minv = np.iinfo(np.int32).min
-    maxv = np.iinfo(np.int32).max
-    arr = rng.integers(low=minv, high=maxv, size=N).astype('int32')
+def test_sort_float16():
+    arr = np.arange(65536, dtype=np.int16)
+    temp = np.frombuffer(arr.tobytes(), dtype=np.float16)
+    data = np.copy(temp)
+    np.random.shuffle(data)
+    data_backup = data
+    assert_equal(np.sort(data, kind='quick'),
+            np.sort(data_backup, kind='heap'))
+
+
+@pytest.mark.parametrize("N", np.arange(1, 512))
+@pytest.mark.parametrize("dtype", ['h', 'H', 'i', 'I', 'l', 'L'])
+def test_sort_int(N, dtype):
+    # Random data with MAX and MIN sprinkled
+    minv = np.iinfo(dtype).min
+    maxv = np.iinfo(dtype).max
+    arr = np.random.randint(low=minv, high=maxv-1, size=N, dtype=dtype)
     arr[np.random.choice(arr.shape[0], 10)] = minv
     arr[np.random.choice(arr.shape[0], 10)] = maxv
     assert_equal(np.sort(arr, kind='quick'), np.sort(arr, kind='heap'))
@@ -9698,3 +9986,6 @@ def test_sort_uint():
     arr = rng.integers(low=0, high=maxv, size=N).astype('uint32')
     arr[np.random.choice(arr.shape[0], 10)] = maxv
     assert_equal(np.sort(arr, kind='quick'), np.sort(arr, kind='heap'))
+
+def test_private_get_ndarray_c_version():
+    assert isinstance(_get_ndarray_c_version(), int)
diff --git a/numpy/core/tests/test_nditer.py b/numpy/core/tests/test_nditer.py
index cdbe87a45..9f639c4c1 100644
--- a/numpy/core/tests/test_nditer.py
+++ b/numpy/core/tests/test_nditer.py
@@ -9,7 +9,7 @@ import numpy.core._multiarray_tests as _multiarray_tests
 from numpy import array, arange, nditer, all
 from numpy.testing import (
     assert_, assert_equal, assert_array_equal, assert_raises,
-    HAS_REFCOUNT, suppress_warnings, break_cycles
+    IS_WASM, HAS_REFCOUNT, suppress_warnings, break_cycles
     )
 
 
@@ -1457,6 +1457,38 @@ def test_iter_copy_casts_structured():
         assert_array_equal(res2["b"][field], expected)
 
 
+def test_iter_copy_casts_structured2():
+    # Similar to the above, this is a fairly arcane test to cover internals
+    in_dtype = np.dtype([("a", np.dtype("O,O")),
+                         ("b", np.dtype("(5)O,(3)O,(1,)O,(1,)i,(1,)O"))])
+    out_dtype = np.dtype([("a", np.dtype("O")),
+                          ("b", np.dtype("O,(3)i,(4)O,(4)O,(4)i"))])
+
+    arr = np.ones(1, dtype=in_dtype)
+    it = np.nditer((arr,), ["buffered", "external_loop", "refs_ok"],
+                   op_dtypes=[out_dtype], casting="unsafe")
+    it_copy = it.copy()
+
+    res1 = next(it)
+    del it
+    res2 = next(it_copy)
+    del it_copy
+
+    # Array of two structured scalars:
+    for res in res1, res2:
+        # Cast to tuple by getitem, which may be weird and changable?:
+        assert type(res["a"][0]) == tuple
+        assert res["a"][0] == (1, 1)
+
+    for res in res1, res2:
+        assert_array_equal(res["b"]["f0"][0], np.ones(5, dtype=object))
+        assert_array_equal(res["b"]["f1"], np.ones((1, 3), dtype="i"))
+        assert res["b"]["f2"].shape == (1, 4)
+        assert_array_equal(res["b"]["f2"][0], np.ones(4, dtype=object))
+        assert_array_equal(res["b"]["f3"][0], np.ones(4, dtype=object))
+        assert_array_equal(res["b"]["f3"][0], np.ones(4, dtype="i"))
+
+
 def test_iter_allocate_output_simple():
     # Check that the iterator will properly allocate outputs
 
@@ -2025,6 +2057,7 @@ def test_buffered_cast_error_paths():
             buf = next(it)
             buf[...] = "a"  # cannot be converted to int.
 
+@pytest.mark.skipif(IS_WASM, reason="Cannot start subprocess")
 @pytest.mark.skipif(not HAS_REFCOUNT, reason="PyPy seems to not hit this.")
 def test_buffered_cast_error_paths_unraisable():
     # The following gives an unraisable error. Pytest sometimes captures that
@@ -2256,7 +2289,7 @@ def test_iter_buffering_string():
     assert_equal(i[0], b'abc')
     assert_equal(i[0].dtype, np.dtype('S6'))
 
-    a = np.array(['abc', 'a', 'abcd'], dtype=np.unicode_)
+    a = np.array(['abc', 'a', 'abcd'], dtype=np.str_)
     assert_equal(a.dtype, np.dtype('U4'))
     assert_raises(TypeError, nditer, a, ['buffered'], ['readonly'],
                     op_dtypes='U2')
diff --git a/numpy/core/tests/test_nep50_promotions.py b/numpy/core/tests/test_nep50_promotions.py
index 0c77d3f72..9e84c78c1 100644
--- a/numpy/core/tests/test_nep50_promotions.py
+++ b/numpy/core/tests/test_nep50_promotions.py
@@ -9,6 +9,7 @@ import operator
 import numpy as np
 
 import pytest
+from numpy.testing import IS_WASM
 
 
 @pytest.fixture(scope="module", autouse=True)
@@ -19,6 +20,7 @@ def _weak_promotion_enabled():
     np._set_promotion_state(state)
 
 
+@pytest.mark.skipif(IS_WASM, reason="wasm doesn't have support for fp errors")
 def test_nep50_examples():
     with pytest.warns(UserWarning, match="result dtype changed"):
         res = np.uint8(1) + 2
@@ -129,7 +131,7 @@ def test_nep50_weak_integers_with_inexact(dtype):
 
 @pytest.mark.parametrize("op", [operator.add, operator.pow, operator.eq])
 def test_weak_promotion_scalar_path(op):
-    # Some additional paths excercising the weak scalars.
+    # Some additional paths exercising the weak scalars.
     np._set_promotion_state("weak")
 
     # Integer path:
diff --git a/numpy/core/tests/test_numeric.py b/numpy/core/tests/test_numeric.py
index 9a7e95eaf..832a47c92 100644
--- a/numpy/core/tests/test_numeric.py
+++ b/numpy/core/tests/test_numeric.py
@@ -12,7 +12,7 @@ from numpy.random import rand, randint, randn
 from numpy.testing import (
     assert_, assert_equal, assert_raises, assert_raises_regex,
     assert_array_equal, assert_almost_equal, assert_array_almost_equal,
-    assert_warns, assert_array_max_ulp, HAS_REFCOUNT
+    assert_warns, assert_array_max_ulp, HAS_REFCOUNT, IS_WASM
     )
 from numpy.core._rational_tests import rational
 
@@ -114,7 +114,9 @@ class TestNonarrayArgs:
 
     def test_cumproduct(self):
         A = [[1, 2, 3], [4, 5, 6]]
-        assert_(np.all(np.cumproduct(A) == np.array([1, 2, 6, 24, 120, 720])))
+        with assert_warns(DeprecationWarning):
+            expected = np.array([1, 2, 6, 24, 120, 720])
+            assert_(np.all(np.cumproduct(A) == expected))
 
     def test_diagonal(self):
         a = [[0, 1, 2, 3],
@@ -556,6 +558,7 @@ class TestSeterr:
             np.seterr(**old)
             assert_(np.geterr() == old)
 
+    @pytest.mark.skipif(IS_WASM, reason="no wasm fp exception support")
     @pytest.mark.skipif(platform.machine() == "armv5tel", reason="See gh-413.")
     def test_divide_err(self):
         with np.errstate(divide='raise'):
@@ -565,6 +568,7 @@ class TestSeterr:
             np.seterr(divide='ignore')
             np.array([1.]) / np.array([0.])
 
+    @pytest.mark.skipif(IS_WASM, reason="no wasm fp exception support")
     def test_errobj(self):
         olderrobj = np.geterrobj()
         self.called = 0
@@ -638,6 +642,7 @@ class TestFloatExceptions:
         self.assert_raises_fpe(fpeerr, flop, sc1[()], sc2[()])
 
     # Test for all real and complex float types
+    @pytest.mark.skipif(IS_WASM, reason="no wasm fp exception support")
     @pytest.mark.parametrize("typecode", np.typecodes["AllFloat"])
     def test_floating_exceptions(self, typecode):
         # Test basic arithmetic function errors
@@ -697,6 +702,7 @@ class TestFloatExceptions:
             self.assert_raises_fpe(invalid,
                                    lambda a, b: a*b, ftype(0), ftype(np.inf))
 
+    @pytest.mark.skipif(IS_WASM, reason="no wasm fp exception support")
     def test_warnings(self):
         # test warning code path
         with warnings.catch_warnings(record=True) as w:
@@ -1189,8 +1195,8 @@ class TestFromiter:
         expected = np.array(list(self.makegen()))
         a = np.fromiter(self.makegen(), int)
         a20 = np.fromiter(self.makegen(), int, 20)
-        assert_(np.alltrue(a == expected, axis=0))
-        assert_(np.alltrue(a20 == expected[:20], axis=0))
+        assert_(np.all(a == expected, axis=0))
+        assert_(np.all(a20 == expected[:20], axis=0))
 
     def load_data(self, n, eindex):
         # Utility method for the issue 2592 tests.
@@ -1584,6 +1590,7 @@ class TestNonzero:
         a = np.array([[ThrowsAfter(15)]]*10)
         assert_raises(ValueError, np.nonzero, a)
 
+    @pytest.mark.skipif(IS_WASM, reason="wasm doesn't have threads")
     def test_structured_threadsafety(self):
         # Nonzero (and some other functions) should be threadsafe for
         # structured datatypes, see gh-15387. This test can behave randomly.
@@ -1817,20 +1824,11 @@ class TestClip:
         self.nr = 5
         self.nc = 3
 
-    def fastclip(self, a, m, M, out=None, casting=None):
-        if out is None:
-            if casting is None:
-                return a.clip(m, M)
-            else:
-                return a.clip(m, M, casting=casting)
-        else:
-            if casting is None:
-                return a.clip(m, M, out)
-            else:
-                return a.clip(m, M, out, casting=casting)
+    def fastclip(self, a, m, M, out=None, **kwargs):
+        return a.clip(m, M, out=out, **kwargs)
 
     def clip(self, a, m, M, out=None):
-        # use slow-clip
+        # use a.choose to verify fastclip result
         selector = np.less(a, m) + 2*np.greater(a, M)
         return selector.choose((a, m, M), out=out)
 
@@ -1986,14 +1984,13 @@ class TestClip:
         ac = np.zeros(a.shape, dtype=np.int32)
         act = ac.copy()
         if casting is None:
-            with assert_warns(DeprecationWarning):
-                # NumPy 1.17.0, 2018-02-24 - casting is unsafe
+            with pytest.raises(TypeError):
                 self.fastclip(a, m, M, ac, casting=casting)
         else:
             # explicitly passing "unsafe" will silence warning
             self.fastclip(a, m, M, ac, casting=casting)
-        self.clip(a, m, M, act)
-        assert_array_strict_equal(ac, act)
+            self.clip(a, m, M, act)
+            assert_array_strict_equal(ac, act)
 
     def test_simple_int64_out(self):
         # Test native int32 input with int32 scalar min/max and int64 out.
@@ -2013,9 +2010,7 @@ class TestClip:
         M = np.float64(1)
         ac = np.zeros(a.shape, dtype=np.int32)
         act = ac.copy()
-        with assert_warns(DeprecationWarning):
-            # NumPy 1.17.0, 2018-02-24 - casting is unsafe
-            self.fastclip(a, m, M, ac)
+        self.fastclip(a, m, M, out=ac, casting="unsafe")
         self.clip(a, m, M, act)
         assert_array_strict_equal(ac, act)
 
@@ -2026,9 +2021,7 @@ class TestClip:
         M = 2.0
         ac = np.zeros(a.shape, dtype=np.int32)
         act = ac.copy()
-        with assert_warns(DeprecationWarning):
-            # NumPy 1.17.0, 2018-02-24 - casting is unsafe
-            self.fastclip(a, m, M, ac)
+        self.fastclip(a, m, M, out=ac, casting="unsafe")
         self.clip(a, m, M, act)
         assert_array_strict_equal(ac, act)
 
@@ -2204,9 +2197,7 @@ class TestClip:
         M = np.float64(2)
         ac = np.zeros(a.shape, dtype=np.int32)
         act = ac.copy()
-        with assert_warns(DeprecationWarning):
-            # NumPy 1.17.0, 2018-02-24 - casting is unsafe
-            self.fastclip(a, m, M, ac)
+        self.fastclip(a, m, M, out=ac, casting="unsafe")
         self.clip(a, m, M, act)
         assert_array_strict_equal(ac, act)
 
@@ -2228,9 +2219,7 @@ class TestClip:
         M = np.float64(1)
         ac = np.zeros(a.shape, dtype=np.int32)
         act = ac.copy()
-        with assert_warns(DeprecationWarning):
-            # NumPy 1.17.0, 2018-02-24 - casting is unsafe
-            self.fastclip(a, m, M, ac)
+        self.fastclip(a, m, M, out=ac, casting="unsafe")
         self.clip(a, m, M, act)
         assert_array_strict_equal(ac, act)
 
@@ -2241,9 +2230,7 @@ class TestClip:
         M = 2.0
         ac = np.zeros(a.shape, dtype=np.int32)
         act = ac.copy()
-        with assert_warns(DeprecationWarning):
-            # NumPy 1.17.0, 2018-02-24 - casting is unsafe
-            self.fastclip(a, m, M, ac)
+        self.fastclip(a, m, M, out=ac, casting="unsafe")
         self.clip(a, m, M, act)
         assert_array_strict_equal(ac, act)
 
@@ -2296,16 +2283,11 @@ class TestClip:
 
     def test_clip_nan(self):
         d = np.arange(7.)
-        with assert_warns(DeprecationWarning):
-            assert_equal(d.clip(min=np.nan), d)
-        with assert_warns(DeprecationWarning):
-            assert_equal(d.clip(max=np.nan), d)
-        with assert_warns(DeprecationWarning):
-            assert_equal(d.clip(min=np.nan, max=np.nan), d)
-        with assert_warns(DeprecationWarning):
-            assert_equal(d.clip(min=-2, max=np.nan), d)
-        with assert_warns(DeprecationWarning):
-            assert_equal(d.clip(min=np.nan, max=10), d)
+        assert_equal(d.clip(min=np.nan), np.nan)
+        assert_equal(d.clip(max=np.nan), np.nan)
+        assert_equal(d.clip(min=np.nan, max=np.nan), np.nan)
+        assert_equal(d.clip(min=-2, max=np.nan), np.nan)
+        assert_equal(d.clip(min=np.nan, max=10), np.nan)
 
     def test_object_clip(self):
         a = np.arange(10, dtype=object)
@@ -2357,16 +2339,12 @@ class TestClip:
         actual = np.clip(arr, amin, amax)
         assert_equal(actual, exp)
 
-    @pytest.mark.xfail(reason="no scalar nan propagation yet",
-                       raises=AssertionError,
-                       strict=True)
     @pytest.mark.parametrize("arr, amin, amax", [
         # problematic scalar nan case from hypothesis
         (np.zeros(10, dtype=np.int64),
          np.array(np.nan),
          np.zeros(10, dtype=np.int32)),
     ])
-    @pytest.mark.filterwarnings("ignore::DeprecationWarning")
     def test_clip_scalar_nan_propagation(self, arr, amin, amax):
         # enforcement of scalar nan propagation for comparisons
         # called through clip()
diff --git a/numpy/core/tests/test_numerictypes.py b/numpy/core/tests/test_numerictypes.py
index 6aedbf569..bab5bf246 100644
--- a/numpy/core/tests/test_numerictypes.py
+++ b/numpy/core/tests/test_numerictypes.py
@@ -448,8 +448,9 @@ class TestSctypeDict:
         # alias for np.int_, but np.long is not supported for historical
         # reasons (gh-21063)
         assert_(np.sctypeDict['ulong'] is np.uint)
-        assert_(not hasattr(np, 'ulong'))
-
+        with pytest.warns(FutureWarning):
+            # We will probably allow this in the future:
+            assert not hasattr(np, 'ulong')
 
 class TestBitName:
     def test_abstract(self):
@@ -477,7 +478,8 @@ class TestMaximumSctype:
     def test_complex(self, t):
         assert_equal(np.maximum_sctype(t), np.sctypes['complex'][-1])
 
-    @pytest.mark.parametrize('t', [np.bool_, np.object_, np.unicode_, np.bytes_, np.void])
+    @pytest.mark.parametrize('t', [np.bool_, np.object_, np.str_, np.bytes_,
+                                   np.void])
     def test_other(self, t):
         assert_equal(np.maximum_sctype(t), t)
 
@@ -489,7 +491,7 @@ class Test_sctype2char:
     def test_scalar_type(self):
         assert_equal(np.sctype2char(np.double), 'd')
         assert_equal(np.sctype2char(np.int_), 'l')
-        assert_equal(np.sctype2char(np.unicode_), 'U')
+        assert_equal(np.sctype2char(np.str_), 'U')
         assert_equal(np.sctype2char(np.bytes_), 'S')
 
     def test_other_type(self):
diff --git a/numpy/core/tests/test_overrides.py b/numpy/core/tests/test_overrides.py
index 63432ffa7..25f551f6f 100644
--- a/numpy/core/tests/test_overrides.py
+++ b/numpy/core/tests/test_overrides.py
@@ -10,16 +10,11 @@ from numpy.testing import (
     assert_, assert_equal, assert_raises, assert_raises_regex)
 from numpy.core.overrides import (
     _get_implementing_args, array_function_dispatch,
-    verify_matching_signatures, ARRAY_FUNCTION_ENABLED)
+    verify_matching_signatures)
 from numpy.compat import pickle
 import pytest
 
 
-requires_array_function = pytest.mark.skipif(
-    not ARRAY_FUNCTION_ENABLED,
-    reason="__array_function__ dispatch not enabled.")
-
-
 def _return_not_implemented(self, *args, **kwargs):
     return NotImplemented
 
@@ -150,7 +145,6 @@ class TestGetImplementingArgs:
 
 class TestNDArrayArrayFunction:
 
-    @requires_array_function
     def test_method(self):
 
         class Other:
@@ -209,7 +203,6 @@ class TestNDArrayArrayFunction:
                                      args=(array,), kwargs={})
 
 
-@requires_array_function
 class TestArrayFunctionDispatch:
 
     def test_pickle(self):
@@ -248,8 +241,20 @@ class TestArrayFunctionDispatch:
         with assert_raises_regex(TypeError, 'no implementation found'):
             dispatched_one_arg(array)
 
+    def test_where_dispatch(self):
+
+        class DuckArray:
+            def __array_function__(self, ufunc, method, *inputs, **kwargs):
+                return "overridden"
+
+        array = np.array(1)
+        duck_array = DuckArray()
+
+        result = np.std(array, where=duck_array)
+
+        assert_equal(result, "overridden")
+
 
-@requires_array_function
 class TestVerifyMatchingSignatures:
 
     def test_verify_matching_signatures(self):
@@ -302,7 +307,6 @@ def _new_duck_type_and_implements():
     return (MyArray, implements)
 
 
-@requires_array_function
 class TestArrayFunctionImplementation:
 
     def test_one_arg(self):
@@ -394,6 +398,56 @@ class TestArrayFunctionImplementation:
         except TypeError as exc:
             assert exc is error  # unmodified exception
 
+    def test_properties(self):
+        # Check that str and repr are sensible
+        func = dispatched_two_arg
+        assert str(func) == str(func._implementation)
+        repr_no_id = repr(func).split("at ")[0]
+        repr_no_id_impl = repr(func._implementation).split("at ")[0]
+        assert repr_no_id == repr_no_id_impl
+
+    @pytest.mark.parametrize("func", [
+            lambda x, y: 0,  # no like argument
+            lambda like=None: 0,  # not keyword only
+            lambda *, like=None, a=3: 0,  # not last (not that it matters)
+        ])
+    def test_bad_like_sig(self, func):
+        # We sanity check the signature, and these should fail.
+        with pytest.raises(RuntimeError):
+            array_function_dispatch()(func)
+
+    def test_bad_like_passing(self):
+        # Cover internal sanity check for passing like as first positional arg
+        def func(*, like=None):
+            pass
+
+        func_with_like = array_function_dispatch()(func)
+        with pytest.raises(TypeError):
+            func_with_like()
+        with pytest.raises(TypeError):
+            func_with_like(like=234)
+
+    def test_too_many_args(self):
+        # Mainly a unit-test to increase coverage
+        objs = []
+        for i in range(40):
+            class MyArr:
+                def __array_function__(self, *args, **kwargs):
+                    return NotImplemented
+
+            objs.append(MyArr())
+
+        def _dispatch(*args):
+            return args
+
+        @array_function_dispatch(_dispatch)
+        def func(*args):
+            pass
+
+        with pytest.raises(TypeError, match="maximum number"):
+            func(*objs)
+
+
 
 class TestNDArrayMethods:
 
@@ -422,7 +476,6 @@ class TestNumPyFunctions:
         signature = inspect.signature(np.sum)
         assert_('axis' in signature.parameters)
 
-    @requires_array_function
     def test_override_sum(self):
         MyArray, implements = _new_duck_type_and_implements()
 
@@ -432,7 +485,6 @@ class TestNumPyFunctions:
 
         assert_equal(np.sum(MyArray()), 'yes')
 
-    @requires_array_function
     def test_sum_on_mock_array(self):
 
         # We need a proxy for mocks because __array_function__ is only looked
@@ -453,7 +505,6 @@ class TestNumPyFunctions:
             np.sum, (ArrayProxy,), (proxy,), {})
         proxy.value.__array__.assert_not_called()
 
-    @requires_array_function
     def test_sum_forwarding_implementation(self):
 
         class MyArray(np.ndarray):
@@ -505,7 +556,6 @@ class TestArrayLike:
     def func_args(*args, **kwargs):
         return args, kwargs
 
-    @requires_array_function
     def test_array_like_not_implemented(self):
         self.add_method('array', self.MyArray)
 
@@ -538,7 +588,6 @@ class TestArrayLike:
 
     @pytest.mark.parametrize('function, args, kwargs', _array_tests)
     @pytest.mark.parametrize('numpy_ref', [True, False])
-    @requires_array_function
     def test_array_like(self, function, args, kwargs, numpy_ref):
         self.add_method('array', self.MyArray)
         self.add_method(function, self.MyArray)
@@ -571,7 +620,6 @@ class TestArrayLike:
 
     @pytest.mark.parametrize('function, args, kwargs', _array_tests)
     @pytest.mark.parametrize('ref', [1, [1], "MyNoArrayFunctionArray"])
-    @requires_array_function
     def test_no_array_function_like(self, function, args, kwargs, ref):
         self.add_method('array', self.MyNoArrayFunctionArray)
         self.add_method(function, self.MyNoArrayFunctionArray)
@@ -613,7 +661,6 @@ class TestArrayLike:
                 assert type(array_like) is self.MyArray
                 assert array_like.function is self.MyArray.fromfile
 
-    @requires_array_function
     def test_exception_handling(self):
         self.add_method('array', self.MyArray, enable_value_error=True)
 
@@ -640,3 +687,56 @@ class TestArrayLike:
             array_like.fill(1)
             expected.fill(1)
         assert_equal(array_like, expected)
+
+
+def test_function_like():
+    # We provide a `__get__` implementation, make sure it works
+    assert type(np.mean) is np.core._multiarray_umath._ArrayFunctionDispatcher 
+
+    class MyClass:
+        def __array__(self):
+            # valid argument to mean:
+            return np.arange(3)
+
+        func1 = staticmethod(np.mean)
+        func2 = np.mean
+        func3 = classmethod(np.mean)
+
+    m = MyClass()
+    assert m.func1([10]) == 10
+    assert m.func2() == 1  # mean of the arange
+    with pytest.raises(TypeError, match="unsupported operand type"):
+        # Tries to operate on the class
+        m.func3()
+
+    # Manual binding also works (the above may shortcut):
+    bound = np.mean.__get__(m, MyClass)
+    assert bound() == 1
+
+    bound = np.mean.__get__(None, MyClass)  # unbound actually
+    assert bound([10]) == 10
+
+    bound = np.mean.__get__(MyClass)  # classmethod
+    with pytest.raises(TypeError, match="unsupported operand type"):
+        bound()
+
+
+def test_scipy_trapz_support_shim():
+    # SciPy 1.10 and earlier "clone" trapz in this way, so we have a
+    # support shim in place: https://github.com/scipy/scipy/issues/17811
+    # That should be removed eventually.  This test copies what SciPy does.
+    # Hopefully removable 1 year after SciPy 1.11; shim added to NumPy 1.25.
+    import types
+    import functools
+
+    def _copy_func(f):
+        # Based on http://stackoverflow.com/a/6528148/190597 (Glenn Maynard)
+        g = types.FunctionType(f.__code__, f.__globals__, name=f.__name__,
+                            argdefs=f.__defaults__, closure=f.__closure__)
+        g = functools.update_wrapper(g, f)
+        g.__kwdefaults__ = f.__kwdefaults__
+        return g
+
+    trapezoid = _copy_func(np.trapz)
+
+    assert np.trapz([1, 2]) == trapezoid([1, 2])
diff --git a/numpy/core/tests/test_print.py b/numpy/core/tests/test_print.py
index 89a8b48bf..162686ee0 100644
--- a/numpy/core/tests/test_print.py
+++ b/numpy/core/tests/test_print.py
@@ -3,7 +3,7 @@ import sys
 import pytest
 
 import numpy as np
-from numpy.testing import assert_, assert_equal
+from numpy.testing import assert_, assert_equal, IS_MUSL
 from numpy.core.tests._locales import CommaDecimalPointLocale
 
 
@@ -196,5 +196,7 @@ class TestCommaDecimalPointLocale(CommaDecimalPointLocale):
     def test_locale_double(self):
         assert_equal(str(np.double(1.2)), str(float(1.2)))
 
+    @pytest.mark.skipif(IS_MUSL,
+                        reason="test flaky on musllinux")
     def test_locale_longdouble(self):
         assert_equal(str(np.longdouble('1.2')), str(float(1.2)))
diff --git a/numpy/core/tests/test_regression.py b/numpy/core/tests/test_regression.py
index c3b9dab69..841144790 100644
--- a/numpy/core/tests/test_regression.py
+++ b/numpy/core/tests/test_regression.py
@@ -12,7 +12,7 @@ from numpy.testing import (
         assert_, assert_equal, IS_PYPY, assert_almost_equal,
         assert_array_equal, assert_array_almost_equal, assert_raises,
         assert_raises_regex, assert_warns, suppress_warnings,
-        _assert_valid_refcount, HAS_REFCOUNT, IS_PYSTON
+        _assert_valid_refcount, HAS_REFCOUNT, IS_PYSTON, IS_WASM
         )
 from numpy.testing._private.utils import _no_tracing, requires_memory
 from numpy.compat import asbytes, asunicode, pickle
@@ -128,10 +128,7 @@ class TestRegression:
         assert_(a[1] == 'auto')
         assert_(a[0] != 'auto')
         b = np.linspace(0, 10, 11)
-        # This should return true for now, but will eventually raise an error:
-        with suppress_warnings() as sup:
-            sup.filter(FutureWarning)
-            assert_(b != 'auto')
+        assert_array_equal(b != 'auto', np.ones(11, dtype=bool))
         assert_(b[0] != 'auto')
 
     def test_unicode_swapping(self):
@@ -295,7 +292,7 @@ class TestRegression:
 
     def test_unicode_string_comparison(self):
         # Ticket #190
-        a = np.array('hello', np.unicode_)
+        a = np.array('hello', np.str_)
         b = np.array('world')
         a == b
 
@@ -326,6 +323,7 @@ class TestRegression:
         assert_raises(ValueError, bfa)
         assert_raises(ValueError, bfb)
 
+    @pytest.mark.xfail(IS_WASM, reason="not sure why")
     @pytest.mark.parametrize("index",
             [np.ones(10, dtype=bool), np.arange(10)],
             ids=["boolean-arr-index", "integer-arr-index"])
@@ -457,7 +455,7 @@ class TestRegression:
 
         test_data = [
             # (original, py2_pickle)
-            (np.unicode_('\u6f2c'),
+            (np.str_('\u6f2c'),
              b"cnumpy.core.multiarray\nscalar\np0\n(cnumpy\ndtype\np1\n"
              b"(S'U1'\np2\nI0\nI1\ntp3\nRp4\n(I3\nS'<'\np5\nNNNI4\nI4\n"
              b"I0\ntp6\nbS',o\\x00\\x00'\np7\ntp8\nRp9\n."),
@@ -518,22 +516,15 @@ class TestRegression:
     def test_method_args(self):
         # Make sure methods and functions have same default axis
         # keyword and arguments
-        funcs1 = ['argmax', 'argmin', 'sum', ('product', 'prod'),
-                 ('sometrue', 'any'),
-                 ('alltrue', 'all'), 'cumsum', ('cumproduct', 'cumprod'),
-                 'ptp', 'cumprod', 'prod', 'std', 'var', 'mean',
-                 'round', 'min', 'max', 'argsort', 'sort']
+        funcs1 = ['argmax', 'argmin', 'sum', 'any', 'all', 'cumsum',
+                  'ptp', 'cumprod', 'prod', 'std', 'var', 'mean',
+                  'round', 'min', 'max', 'argsort', 'sort']
         funcs2 = ['compress', 'take', 'repeat']
 
         for func in funcs1:
             arr = np.random.rand(8, 7)
             arr2 = arr.copy()
-            if isinstance(func, tuple):
-                func_meth = func[1]
-                func = func[0]
-            else:
-                func_meth = func
-            res1 = getattr(arr, func_meth)()
+            res1 = getattr(arr, func)()
             res2 = getattr(np, func)(arr2)
             if res1 is None:
                 res1 = arr
@@ -1338,8 +1329,8 @@ class TestRegression:
         # Ticket #1058
         a = np.fromiter(list(range(10)), dtype='b')
         b = np.fromiter(list(range(10)), dtype='B')
-        assert_(np.alltrue(a == np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])))
-        assert_(np.alltrue(b == np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])))
+        assert_(np.all(a == np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])))
+        assert_(np.all(b == np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])))
 
     def test_array_from_sequence_scalar_array(self):
         # Ticket #1078: segfaults when creating an array with a sequence of
@@ -1517,8 +1508,8 @@ class TestRegression:
     def test_fromiter_comparison(self):
         a = np.fromiter(list(range(10)), dtype='b')
         b = np.fromiter(list(range(10)), dtype='B')
-        assert_(np.alltrue(a == np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])))
-        assert_(np.alltrue(b == np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])))
+        assert_(np.all(a == np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])))
+        assert_(np.all(b == np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])))
 
     def test_fromstring_crash(self):
         # Ticket #1345: the following should not cause a crash
@@ -1537,9 +1528,12 @@ class TestRegression:
             for y in dtypes:
                 c = a.astype(y)
                 try:
-                    np.dot(b, c)
+                    d = np.dot(b, c)
                 except TypeError:
                     failures.append((x, y))
+                else:
+                    if d != 0:
+                        failures.append((x, y))
         if failures:
             raise AssertionError("Failures: %r" % failures)
 
@@ -1689,7 +1683,7 @@ class TestRegression:
         # number 2, and the exception hung around until something checked
         # PyErr_Occurred() and returned an error.
         assert_equal(np.dtype('S10').itemsize, 10)
-        np.array([['abc', 2], ['long   ', '0123456789']], dtype=np.string_)
+        np.array([['abc', 2], ['long   ', '0123456789']], dtype=np.bytes_)
         assert_equal(np.dtype('S10').itemsize, 10)
 
     def test_any_float(self):
@@ -1958,7 +1952,7 @@ class TestRegression:
         # Python2 output for pickle.dumps(...)
         datas = [
             # (original, python2_pickle, koi8r_validity)
-            (np.unicode_('\u6bd2'),
+            (np.str_('\u6bd2'),
              (b"cnumpy.core.multiarray\nscalar\np0\n(cnumpy\ndtype\np1\n"
               b"(S'U1'\np2\nI0\nI1\ntp3\nRp4\n(I3\nS'<'\np5\nNNNI4\nI4\nI0\n"
               b"tp6\nbS'\\xd2k\\x00\\x00'\np7\ntp8\nRp9\n."),
@@ -2082,7 +2076,7 @@ class TestRegression:
         # Ticket #1578, the mismatch only showed up when running
         # python-debug for python versions >= 2.7, and then as
         # a core dump and error message.
-        a = np.array(['abc'], dtype=np.unicode_)[0]
+        a = np.array(['abc'], dtype=np.str_)[0]
         del a
 
     def test_refcount_error_in_clip(self):
@@ -2229,7 +2223,7 @@ class TestRegression:
     def test_pickle_empty_string(self):
         # gh-3926
         for proto in range(2, pickle.HIGHEST_PROTOCOL + 1):
-            test_string = np.string_('')
+            test_string = np.bytes_('')
             assert_equal(pickle.loads(
                 pickle.dumps(test_string, protocol=proto)), test_string)
 
@@ -2350,7 +2344,7 @@ class TestRegression:
         values = {
             np.void: b"a",
             np.bytes_: b"a",
-            np.unicode_: "a",
+            np.str_: "a",
             np.datetime64: "2017-08-25",
         }
         for sctype in scalar_types:
@@ -2557,3 +2551,14 @@ class TestRegression:
                 f"Unexpected types order of ufunc in {operation}"
                 f"for {order}. Possible fix: Use signed before unsigned"
                 "in generate_umath.py")
+
+    def test_nonbool_logical(self):
+        # gh-22845
+        # create two arrays with bit patterns that do not overlap.
+        # needs to be large enough to test both SIMD and scalar paths
+        size = 100
+        a = np.frombuffer(b'\x01' * size, dtype=np.bool_)
+        b = np.frombuffer(b'\x80' * size, dtype=np.bool_)
+        expected = np.ones(size, dtype=np.bool_)
+        assert_array_equal(np.logical_and(a, b), expected)
+
diff --git a/numpy/core/tests/test_scalar_ctors.py b/numpy/core/tests/test_scalar_ctors.py
index 17aca3fb8..da976d64f 100644
--- a/numpy/core/tests/test_scalar_ctors.py
+++ b/numpy/core/tests/test_scalar_ctors.py
@@ -114,3 +114,73 @@ class TestArrayFromScalar:
     @pytest.mark.parametrize('t2', cfloat_types + [None])
     def test_complex(self, t1, t2):
         return self._do_test(t1, t2)
+
+
+@pytest.mark.parametrize("length",
+        [5, np.int8(5), np.array(5, dtype=np.uint16)])
+def test_void_via_length(length):
+    res = np.void(length)
+    assert type(res) is np.void
+    assert res.item() == b"\0" * 5
+    assert res.dtype == "V5"
+
+@pytest.mark.parametrize("bytes_",
+        [b"spam", np.array(567.)])
+def test_void_from_byteslike(bytes_):
+    res = np.void(bytes_)
+    expected = bytes(bytes_)
+    assert type(res) is np.void
+    assert res.item() == expected
+
+    # Passing dtype can extend it (this is how filling works)
+    res = np.void(bytes_, dtype="V100")
+    assert type(res) is np.void
+    assert res.item()[:len(expected)] == expected
+    assert res.item()[len(expected):] == b"\0" * (res.nbytes - len(expected))
+    # As well as shorten:
+    res = np.void(bytes_, dtype="V4")
+    assert type(res) is np.void
+    assert res.item() == expected[:4]
+
+def test_void_arraylike_trumps_byteslike():
+    # The memoryview is converted as an array-like of shape (18,)
+    # rather than a single bytes-like of that length.
+    m = memoryview(b"just one mintleaf?")
+    res = np.void(m)
+    assert type(res) is np.ndarray
+    assert res.dtype == "V1"
+    assert res.shape == (18,)
+
+def test_void_dtype_arg():
+    # Basic test for the dtype argument (positional and keyword)
+    res = np.void((1, 2), dtype="i,i")
+    assert res.item() == (1, 2)
+    res = np.void((2, 3), "i,i")
+    assert res.item() == (2, 3)
+
+@pytest.mark.parametrize("data",
+        [5, np.int8(5), np.array(5, dtype=np.uint16)])
+def test_void_from_integer_with_dtype(data):
+    # The "length" meaning is ignored, rather data is used:
+    res = np.void(data, dtype="i,i")
+    assert type(res) is np.void
+    assert res.dtype == "i,i"
+    assert res["f0"] == 5 and res["f1"] == 5
+
+def test_void_from_structure():
+    dtype = np.dtype([('s', [('f', 'f8'), ('u', 'U1')]), ('i', 'i2')])
+    data = np.array(((1., 'a'), 2), dtype=dtype)
+    res = np.void(data[()], dtype=dtype)
+    assert type(res) is np.void
+    assert res.dtype == dtype
+    assert res == data[()]
+
+def test_void_bad_dtype():
+    with pytest.raises(TypeError,
+            match="void: descr must be a `void.*int64"):
+        np.void(4, dtype="i8")
+
+    # Subarray dtype (with shape `(4,)` is rejected):
+    with pytest.raises(TypeError,
+            match=r"void: descr must be a `void.*\(4,\)"):
+        np.void(4, dtype="4i")
diff --git a/numpy/core/tests/test_scalar_methods.py b/numpy/core/tests/test_scalar_methods.py
index a53e47b19..18a7bc828 100644
--- a/numpy/core/tests/test_scalar_methods.py
+++ b/numpy/core/tests/test_scalar_methods.py
@@ -1,7 +1,6 @@
 """
 Test the scalar constructors, which also do type-coercion
 """
-import sys
 import fractions
 import platform
 import types
@@ -10,7 +9,7 @@ from typing import Any, Type
 import pytest
 import numpy as np
 
-from numpy.testing import assert_equal, assert_raises
+from numpy.testing import assert_equal, assert_raises, IS_MUSL
 
 
 class TestAsIntegerRatio:
@@ -99,6 +98,8 @@ class TestAsIntegerRatio:
             try:
                 nf = np.longdouble(n)
                 df = np.longdouble(d)
+                if not np.isfinite(df):
+                    raise OverflowError
             except (OverflowError, RuntimeWarning):
                 # the values may not fit in any float type
                 pytest.skip("longdouble too small on this platform")
@@ -132,7 +133,6 @@ class TestIsInteger:
             assert not value.is_integer()
 
 
-@pytest.mark.skipif(sys.version_info < (3, 9), reason="Requires python 3.9")
 class TestClassGetItem:
     @pytest.mark.parametrize("cls", [
         np.number,
@@ -186,14 +186,6 @@ class TestClassGetItem:
         assert np.number[Any]
 
 
-@pytest.mark.skipif(sys.version_info >= (3, 9), reason="Requires python 3.8")
-@pytest.mark.parametrize("cls", [np.number, np.complexfloating, np.int64])
-def test_class_getitem_38(cls: Type[np.number]) -> None:
-    match = "Type subscription requires python >= 3.9"
-    with pytest.raises(TypeError, match=match):
-        cls[Any]
-
-
 class TestBitCount:
     # derived in part from the cpython test "test_bit_count"
 
diff --git a/numpy/core/tests/test_scalarbuffer.py b/numpy/core/tests/test_scalarbuffer.py
index 0e6ab1015..31b0494cf 100644
--- a/numpy/core/tests/test_scalarbuffer.py
+++ b/numpy/core/tests/test_scalarbuffer.py
@@ -68,11 +68,11 @@ class TestScalarPEP3118:
             get_buffer_info(x, ["WRITABLE"])
 
     def test_void_scalar_structured_data(self):
-        dt = np.dtype([('name', np.unicode_, 16), ('grades', np.float64, (2,))])
+        dt = np.dtype([('name', np.str_, 16), ('grades', np.float64, (2,))])
         x = np.array(('ndarray_scalar', (1.2, 3.0)), dtype=dt)[()]
         assert_(isinstance(x, np.void))
         mv_x = memoryview(x)
-        expected_size = 16 * np.dtype((np.unicode_, 1)).itemsize
+        expected_size = 16 * np.dtype((np.str_, 1)).itemsize
         expected_size += 2 * np.dtype(np.float64).itemsize
         assert_equal(mv_x.itemsize, expected_size)
         assert_equal(mv_x.ndim, 0)
diff --git a/numpy/core/tests/test_scalarinherit.py b/numpy/core/tests/test_scalarinherit.py
index f697c3c81..f9c574d57 100644
--- a/numpy/core/tests/test_scalarinherit.py
+++ b/numpy/core/tests/test_scalarinherit.py
@@ -58,8 +58,8 @@ class TestInherit:
 class TestCharacter:
     def test_char_radd(self):
         # GH issue 9620, reached gentype_add and raise TypeError
-        np_s = np.string_('abc')
-        np_u = np.unicode_('abc')
+        np_s = np.bytes_('abc')
+        np_u = np.str_('abc')
         s = b'def'
         u = 'def'
         assert_(np_s.__radd__(np_s) is NotImplemented)
@@ -90,8 +90,8 @@ class TestCharacter:
         assert ret == b"defabc"
 
     def test_char_repeat(self):
-        np_s = np.string_('abc')
-        np_u = np.unicode_('abc')
+        np_s = np.bytes_('abc')
+        np_u = np.str_('abc')
         res_s = b'abc' * 5
         res_u = 'abc' * 5
         assert_(np_s * 5 == res_s)
diff --git a/numpy/core/tests/test_scalarmath.py b/numpy/core/tests/test_scalarmath.py
index e3dc52c48..c737099c1 100644
--- a/numpy/core/tests/test_scalarmath.py
+++ b/numpy/core/tests/test_scalarmath.py
@@ -4,7 +4,7 @@ import warnings
 import itertools
 import operator
 import platform
-from numpy.compat import _pep440
+from numpy._utils import _pep440
 import pytest
 from hypothesis import given, settings
 from hypothesis.strategies import sampled_from
@@ -14,7 +14,7 @@ import numpy as np
 from numpy.testing import (
     assert_, assert_equal, assert_raises, assert_almost_equal,
     assert_array_equal, IS_PYPY, suppress_warnings, _gen_alignment_data,
-    assert_warns,
+    assert_warns, _SUPPORTS_SVE,
     )
 
 types = [np.bool_, np.byte, np.ubyte, np.short, np.ushort, np.intc, np.uintc,
@@ -75,17 +75,7 @@ class TestTypes:
             np.add(1, 1)
 
 
-@pytest.mark.slow
-@settings(max_examples=10000, deadline=2000)
-@given(sampled_from(reasonable_operators_for_scalars),
-       hynp.arrays(dtype=hynp.scalar_dtypes(), shape=()),
-       hynp.arrays(dtype=hynp.scalar_dtypes(), shape=()))
-def test_array_scalar_ufunc_equivalence(op, arr1, arr2):
-    """
-    This is a thorough test attempting to cover important promotion paths
-    and ensuring that arrays and scalars stay as aligned as possible.
-    However, if it creates troubles, it should maybe just be removed.
-    """
+def check_ufunc_scalar_equivalence(op, arr1, arr2):
     scalar1 = arr1[()]
     scalar2 = arr2[()]
     assert isinstance(scalar1, np.generic)
@@ -95,6 +85,11 @@ def test_array_scalar_ufunc_equivalence(op, arr1, arr2):
         comp_ops = {operator.ge, operator.gt, operator.le, operator.lt}
         if op in comp_ops and (np.isnan(scalar1) or np.isnan(scalar2)):
             pytest.xfail("complex comp ufuncs use sort-order, scalars do not.")
+    if op == operator.pow and arr2.item() in [-1, 0, 0.5, 1, 2]:
+        # array**scalar special case can have different result dtype
+        # (Other powers may have issues also, but are not hit here.)
+        # TODO: It would be nice to resolve this issue.
+        pytest.skip("array**2 can have incorrect/weird result dtype")
 
     # ignore fpe's since they may just mismatch for integers anyway.
     with warnings.catch_warnings(), np.errstate(all="ignore"):
@@ -107,10 +102,51 @@ def test_array_scalar_ufunc_equivalence(op, arr1, arr2):
                 op(scalar1, scalar2)
         else:
             scalar_res = op(scalar1, scalar2)
-            assert_array_equal(scalar_res, res)
+            assert_array_equal(scalar_res, res, strict=True)
+
+
+@pytest.mark.slow
+@settings(max_examples=10000, deadline=2000)
+@given(sampled_from(reasonable_operators_for_scalars),
+       hynp.arrays(dtype=hynp.scalar_dtypes(), shape=()),
+       hynp.arrays(dtype=hynp.scalar_dtypes(), shape=()))
+def test_array_scalar_ufunc_equivalence(op, arr1, arr2):
+    """
+    This is a thorough test attempting to cover important promotion paths
+    and ensuring that arrays and scalars stay as aligned as possible.
+    However, if it creates troubles, it should maybe just be removed.
+    """
+    check_ufunc_scalar_equivalence(op, arr1, arr2)
+
+
+@pytest.mark.slow
+@given(sampled_from(reasonable_operators_for_scalars),
+       hynp.scalar_dtypes(), hynp.scalar_dtypes())
+def test_array_scalar_ufunc_dtypes(op, dt1, dt2):
+    # Same as above, but don't worry about sampling weird values so that we
+    # do not have to sample as much
+    arr1 = np.array(2, dtype=dt1)
+    arr2 = np.array(3, dtype=dt2)  # some power do weird things.
+
+    check_ufunc_scalar_equivalence(op, arr1, arr2)
+
+
+@pytest.mark.parametrize("fscalar", [np.float16, np.float32])
+def test_int_float_promotion_truediv(fscalar):
+    # Promotion for mixed int and float32/float16 must not go to float64
+    i = np.int8(1)
+    f = fscalar(1)
+    expected = np.result_type(i, f)
+    assert (i / f).dtype == expected
+    assert (f / i).dtype == expected
+    # But normal int / int true division goes to float64:
+    assert (i / i).dtype == np.dtype("float64")
+    # For int16, result has to be ast least float32 (takes ufunc path):
+    assert (np.int16(1) / f).dtype == np.dtype("float32")
 
 
 class TestBaseMath:
+    @pytest.mark.xfail(_SUPPORTS_SVE, reason="gh-22982")
     def test_blocked(self):
         # test alignments offsets for simd instructions
         # alignments for vz + 2 * (vs - 1) + 1
@@ -860,27 +896,29 @@ def test_operator_scalars(op, type1, type2):
 
 
 @pytest.mark.parametrize("op", reasonable_operators_for_scalars)
-def test_longdouble_inf_loop(op):
+@pytest.mark.parametrize("val", [None, 2**64])
+def test_longdouble_inf_loop(op, val):
+    # Note: The 2**64 value will pass once NEP 50 is adopted.
     try:
-        op(np.longdouble(3), None)
+        op(np.longdouble(3), val)
     except TypeError:
         pass
     try:
-        op(None, np.longdouble(3))
+        op(val, np.longdouble(3))
     except TypeError:
         pass
 
 
 @pytest.mark.parametrize("op", reasonable_operators_for_scalars)
-def test_clongdouble_inf_loop(op):
-    if op in {operator.mod} and False:
-        pytest.xfail("The modulo operator is known to be broken")
+@pytest.mark.parametrize("val", [None, 2**64])
+def test_clongdouble_inf_loop(op, val):
+    # Note: The 2**64 value will pass once NEP 50 is adopted.
     try:
-        op(np.clongdouble(3), None)
+        op(np.clongdouble(3), val)
     except TypeError:
         pass
     try:
-        op(None, np.longdouble(3))
+        op(val, np.longdouble(3))
     except TypeError:
         pass
 
diff --git a/numpy/core/tests/test_scalarprint.py b/numpy/core/tests/test_scalarprint.py
index 4deb5a0a4..98d1f4aa1 100644
--- a/numpy/core/tests/test_scalarprint.py
+++ b/numpy/core/tests/test_scalarprint.py
@@ -8,7 +8,7 @@ import sys
 
 from tempfile import TemporaryFile
 import numpy as np
-from numpy.testing import assert_, assert_equal, assert_raises
+from numpy.testing import assert_, assert_equal, assert_raises, IS_MUSL
 
 class TestRealScalars:
     def test_str(self):
@@ -260,10 +260,10 @@ class TestRealScalars:
         assert_equal(fpos64('324', unique=False, precision=5,
                                    fractional=False), "324.00")
 
-
     def test_dragon4_interface(self):
         tps = [np.float16, np.float32, np.float64]
-        if hasattr(np, 'float128'):
+        # test is flaky for musllinux on np.float128
+        if hasattr(np, 'float128') and not IS_MUSL:
             tps.append(np.float128)
 
         fpos = np.format_float_positional
diff --git a/numpy/core/tests/test_shape_base.py b/numpy/core/tests/test_shape_base.py
index 570d006f5..0428b95a9 100644
--- a/numpy/core/tests/test_shape_base.py
+++ b/numpy/core/tests/test_shape_base.py
@@ -152,9 +152,9 @@ class TestHstack:
         assert_array_equal(res, desired)
 
     def test_generator(self):
-        with assert_warns(FutureWarning):
+        with pytest.raises(TypeError, match="arrays to stack must be"):
             hstack((np.arange(3) for _ in range(2)))
-        with assert_warns(FutureWarning):
+        with pytest.raises(TypeError, match="arrays to stack must be"):
             hstack(map(lambda x: x, np.ones((3, 2))))
 
     def test_casting_and_dtype(self):
@@ -207,7 +207,7 @@ class TestVstack:
         assert_array_equal(res, desired)
 
     def test_generator(self):
-        with assert_warns(FutureWarning):
+        with pytest.raises(TypeError, match="arrays to stack must be"):
             vstack((np.arange(3) for _ in range(2)))
 
     def test_casting_and_dtype(self):
@@ -472,10 +472,11 @@ def test_stack():
                         stack, [np.zeros((3, 3)), np.zeros(3)], axis=1)
     assert_raises_regex(ValueError, 'must have the same shape',
                         stack, [np.arange(2), np.arange(3)])
-    # generator is deprecated
-    with assert_warns(FutureWarning):
-        result = stack((x for x in range(3)))
-    assert_array_equal(result, np.array([0, 1, 2]))
+
+    # do not accept generators
+    with pytest.raises(TypeError, match="arrays to stack must be"):
+        stack((x for x in range(3)))
+
     #casting and dtype test
     a = np.array([1, 2, 3])
     b = np.array([2.5, 3.5, 4.5])
diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py
index 4aeaf0da3..92b567446 100644
--- a/numpy/core/tests/test_simd.py
+++ b/numpy/core/tests/test_simd.py
@@ -2,9 +2,24 @@
 # may be involved in their functionality.
 import pytest, math, re
 import itertools
-from numpy.core._simd import targets
+import operator
+from numpy.core._simd import targets, clear_floatstatus, get_floatstatus
 from numpy.core._multiarray_umath import __cpu_baseline__
 
+def check_floatstatus(divbyzero=False, overflow=False,
+                      underflow=False, invalid=False,
+                      all=False):
+    #define NPY_FPE_DIVIDEBYZERO  1
+    #define NPY_FPE_OVERFLOW      2
+    #define NPY_FPE_UNDERFLOW     4
+    #define NPY_FPE_INVALID       8
+    err = get_floatstatus()
+    ret = (all or divbyzero) and (err & 1) != 0
+    ret |= (all or overflow) and (err & 2) != 0
+    ret |= (all or underflow) and (err & 4) != 0
+    ret |= (all or invalid) and (err & 8) != 0
+    return ret
+
 class _Test_Utility:
     # submodule of the desired SIMD extension, e.g. targets["AVX512F"]
     npyv = None
@@ -20,6 +35,9 @@ class _Test_Utility:
         """
         return getattr(self.npyv, attr + "_" + self.sfx)
 
+    def _x2(self, intrin_name):
+        return getattr(self.npyv, f"{intrin_name}_{self.sfx}x2")
+
     def _data(self, start=None, count=None, reverse=False):
         """
         Create list of consecutive numbers according to number of vector's lanes.
@@ -365,6 +383,11 @@ class _SIMD_FP(_Test_Utility):
         nfms = self.nmulsub(vdata_a, vdata_b, vdata_c)
         data_nfms = self.mul(data_fma, self.setall(-1))
         assert nfms == data_nfms
+        # multiply, add for odd elements and subtract even elements.
+        # (a * b) -+ c
+        fmas = list(self.muladdsub(vdata_a, vdata_b, vdata_c))
+        assert fmas[0::2] == list(data_fms)[0::2]
+        assert fmas[1::2] == list(data_fma)[1::2]
 
     def test_abs(self):
         pinf, ninf, nan = self._pinfinity(), self._ninfinity(), self._nan()
@@ -437,6 +460,16 @@ class _SIMD_FP(_Test_Utility):
                 _round = intrin(data)
                 assert _round == data_round
 
+        # test large numbers
+        for i in (
+            1.1529215045988576e+18, 4.6116860183954304e+18,
+            5.902958103546122e+20, 2.3611832414184488e+21
+        ):
+            x = self.setall(i)
+            y = intrin(x)
+            data_round = [func(n) for n in x]
+            assert y == data_round
+
         # signed zero
         if intrin_name == "floor":
             data_szero = (-0.0,)
@@ -543,7 +576,16 @@ class _SIMD_FP(_Test_Utility):
         nnan = self.notnan(self.setall(self._nan()))
         assert nnan == [0]*self.nlanes
 
-    import operator
+    @pytest.mark.parametrize("intrin_name", [
+        "rint", "trunc", "ceil", "floor"
+    ])
+    def test_unary_invalid_fpexception(self, intrin_name):
+        intrin = getattr(self, intrin_name)
+        for d in [float("nan"), float("inf"), -float("inf")]:
+            v = self.setall(d)
+            clear_floatstatus()
+            intrin(v)
+            assert check_floatstatus(invalid=True) == False
 
     @pytest.mark.parametrize('py_comp,np_comp', [
         (operator.lt, "cmplt"),
@@ -561,7 +603,8 @@ class _SIMD_FP(_Test_Utility):
             return [lane == mask_true for lane in vector]
 
         intrin = getattr(self, np_comp)
-        cmp_cases = ((0, nan), (nan, 0), (nan, nan), (pinf, nan), (ninf, nan))
+        cmp_cases = ((0, nan), (nan, 0), (nan, nan), (pinf, nan),
+                     (ninf, nan), (-0.0, +0.0))
         for case_operand1, case_operand2 in cmp_cases:
             data_a = [case_operand1]*self.nlanes
             data_b = [case_operand2]*self.nlanes
@@ -643,25 +686,34 @@ class _SIMD_ALL(_Test_Utility):
         assert store_h[:self.nlanes//2] == data[self.nlanes//2:]
         assert store_h != vdata  # detect overflow
 
-    def test_memory_partial_load(self):
-        if self.sfx in ("u8", "s8", "u16", "s16"):
+    @pytest.mark.parametrize("intrin, elsizes, scale, fill", [
+        ("self.load_tillz, self.load_till", (32, 64), 1, [0xffff]),
+        ("self.load2_tillz, self.load2_till", (32, 64), 2, [0xffff, 0x7fff]),
+    ])
+    def test_memory_partial_load(self, intrin, elsizes, scale, fill):
+        if self._scalar_size() not in elsizes:
             return
-
+        npyv_load_tillz, npyv_load_till = eval(intrin)
         data = self._data()
         lanes = list(range(1, self.nlanes + 1))
         lanes += [self.nlanes**2, self.nlanes**4] # test out of range
         for n in lanes:
-            load_till  = self.load_till(data, n, 15)
-            data_till  = data[:n] + [15] * (self.nlanes-n)
+            load_till = npyv_load_till(data, n, *fill)
+            load_tillz = npyv_load_tillz(data, n)
+            n *= scale
+            data_till = data[:n] + fill * ((self.nlanes-n) // scale)
             assert load_till == data_till
-            load_tillz = self.load_tillz(data, n)
             data_tillz = data[:n] + [0] * (self.nlanes-n)
             assert load_tillz == data_tillz
 
-    def test_memory_partial_store(self):
-        if self.sfx in ("u8", "s8", "u16", "s16"):
+    @pytest.mark.parametrize("intrin, elsizes, scale", [
+        ("self.store_till", (32, 64), 1),
+        ("self.store2_till", (32, 64), 2),
+    ])
+    def test_memory_partial_store(self, intrin, elsizes, scale):
+        if self._scalar_size() not in elsizes:
             return
-
+        npyv_store_till = eval(intrin)
         data = self._data()
         data_rev = self._data(reverse=True)
         vdata = self.load(data)
@@ -669,105 +721,159 @@ class _SIMD_ALL(_Test_Utility):
         lanes += [self.nlanes**2, self.nlanes**4]
         for n in lanes:
             data_till = data_rev.copy()
-            data_till[:n] = data[:n]
+            data_till[:n*scale] = data[:n*scale]
             store_till = self._data(reverse=True)
-            self.store_till(store_till, n, vdata)
+            npyv_store_till(store_till, n, vdata)
             assert store_till == data_till
 
-    def test_memory_noncont_load(self):
-        if self.sfx in ("u8", "s8", "u16", "s16"):
+    @pytest.mark.parametrize("intrin, elsizes, scale", [
+        ("self.loadn", (32, 64), 1),
+        ("self.loadn2", (32, 64), 2),
+    ])
+    def test_memory_noncont_load(self, intrin, elsizes, scale):
+        if self._scalar_size() not in elsizes:
             return
-
-        for stride in range(1, 64):
-            data = self._data(count=stride*self.nlanes)
-            data_stride = data[::stride]
-            loadn = self.loadn(data, stride)
-            assert loadn == data_stride
-
-        for stride in range(-64, 0):
-            data = self._data(stride, -stride*self.nlanes)
-            data_stride = self.load(data[::stride]) # cast unsigned
-            loadn = self.loadn(data, stride)
+        npyv_loadn = eval(intrin)
+        for stride in range(-64, 64):
+            if stride < 0:
+                data = self._data(stride, -stride*self.nlanes)
+                data_stride = list(itertools.chain(
+                    *zip(*[data[-i::stride] for i in range(scale, 0, -1)])
+                ))
+            elif stride == 0:
+                data = self._data()
+                data_stride = data[0:scale] * (self.nlanes//scale)
+            else:
+                data = self._data(count=stride*self.nlanes)
+                data_stride = list(itertools.chain(
+                    *zip(*[data[i::stride] for i in range(scale)]))
+                )
+            data_stride = self.load(data_stride)  # cast unsigned
+            loadn = npyv_loadn(data, stride)
             assert loadn == data_stride
 
-    def test_memory_noncont_partial_load(self):
-        if self.sfx in ("u8", "s8", "u16", "s16"):
+    @pytest.mark.parametrize("intrin, elsizes, scale, fill", [
+        ("self.loadn_tillz, self.loadn_till", (32, 64), 1, [0xffff]),
+        ("self.loadn2_tillz, self.loadn2_till", (32, 64), 2, [0xffff, 0x7fff]),
+    ])
+    def test_memory_noncont_partial_load(self, intrin, elsizes, scale, fill):
+        if self._scalar_size() not in elsizes:
             return
-
+        npyv_loadn_tillz, npyv_loadn_till = eval(intrin)
         lanes = list(range(1, self.nlanes + 1))
         lanes += [self.nlanes**2, self.nlanes**4]
-        for stride in range(1, 64):
-            data = self._data(count=stride*self.nlanes)
-            data_stride = data[::stride]
-            for n in lanes:
-                data_stride_till = data_stride[:n] + [15] * (self.nlanes-n)
-                loadn_till = self.loadn_till(data, stride, n, 15)
-                assert loadn_till == data_stride_till
-                data_stride_tillz = data_stride[:n] + [0] * (self.nlanes-n)
-                loadn_tillz = self.loadn_tillz(data, stride, n)
-                assert loadn_tillz == data_stride_tillz
-
-        for stride in range(-64, 0):
-            data = self._data(stride, -stride*self.nlanes)
-            data_stride = list(self.load(data[::stride])) # cast unsigned
+        for stride in range(-64, 64):
+            if stride < 0:
+                data = self._data(stride, -stride*self.nlanes)
+                data_stride = list(itertools.chain(
+                    *zip(*[data[-i::stride] for i in range(scale, 0, -1)])
+                ))
+            elif stride == 0:
+                data = self._data()
+                data_stride = data[0:scale] * (self.nlanes//scale)
+            else:
+                data = self._data(count=stride*self.nlanes)
+                data_stride = list(itertools.chain(
+                    *zip(*[data[i::stride] for i in range(scale)])
+                ))
+            data_stride = list(self.load(data_stride))  # cast unsigned
             for n in lanes:
-                data_stride_till = data_stride[:n] + [15] * (self.nlanes-n)
-                loadn_till = self.loadn_till(data, stride, n, 15)
+                nscale = n * scale
+                llanes = self.nlanes - nscale
+                data_stride_till = (
+                    data_stride[:nscale] + fill * (llanes//scale)
+                )
+                loadn_till = npyv_loadn_till(data, stride, n, *fill)
                 assert loadn_till == data_stride_till
-                data_stride_tillz = data_stride[:n] + [0] * (self.nlanes-n)
-                loadn_tillz = self.loadn_tillz(data, stride, n)
+                data_stride_tillz = data_stride[:nscale] + [0] * llanes
+                loadn_tillz = npyv_loadn_tillz(data, stride, n)
                 assert loadn_tillz == data_stride_tillz
 
-    def test_memory_noncont_store(self):
-        if self.sfx in ("u8", "s8", "u16", "s16"):
+    @pytest.mark.parametrize("intrin, elsizes, scale", [
+        ("self.storen", (32, 64), 1),
+        ("self.storen2", (32, 64), 2),
+    ])
+    def test_memory_noncont_store(self, intrin, elsizes, scale):
+        if self._scalar_size() not in elsizes:
             return
-
-        vdata = self.load(self._data())
+        npyv_storen = eval(intrin)
+        data = self._data()
+        vdata = self.load(data)
+        hlanes = self.nlanes // scale
         for stride in range(1, 64):
-            data = [15] * stride * self.nlanes
-            data[::stride] = vdata
-            storen = [15] * stride * self.nlanes
-            storen += [127]*64
-            self.storen(storen, stride, vdata)
-            assert storen[:-64] == data
-            assert storen[-64:] == [127]*64 # detect overflow
+            data_storen = [0xff] * stride * self.nlanes
+            for s in range(0, hlanes*stride, stride):
+                i = (s//stride)*scale
+                data_storen[s:s+scale] = data[i:i+scale]
+            storen = [0xff] * stride * self.nlanes
+            storen += [0x7f]*64
+            npyv_storen(storen, stride, vdata)
+            assert storen[:-64] == data_storen
+            assert storen[-64:] == [0x7f]*64  # detect overflow
 
         for stride in range(-64, 0):
-            data = [15] * -stride * self.nlanes
-            data[::stride] = vdata
-            storen = [127]*64
-            storen += [15] * -stride * self.nlanes
-            self.storen(storen, stride, vdata)
-            assert storen[64:] == data
-            assert storen[:64] == [127]*64 # detect overflow
-
-    def test_memory_noncont_partial_store(self):
-        if self.sfx in ("u8", "s8", "u16", "s16"):
+            data_storen = [0xff] * -stride * self.nlanes
+            for s in range(0, hlanes*stride, stride):
+                i = (s//stride)*scale
+                data_storen[s-scale:s or None] = data[i:i+scale]
+            storen = [0x7f]*64
+            storen += [0xff] * -stride * self.nlanes
+            npyv_storen(storen, stride, vdata)
+            assert storen[64:] == data_storen
+            assert storen[:64] == [0x7f]*64  # detect overflow
+        # stride 0
+        data_storen = [0x7f] * self.nlanes
+        storen = data_storen.copy()
+        data_storen[0:scale] = data[-scale:]
+        npyv_storen(storen, 0, vdata)
+        assert storen == data_storen
+
+    @pytest.mark.parametrize("intrin, elsizes, scale", [
+        ("self.storen_till", (32, 64), 1),
+        ("self.storen2_till", (32, 64), 2),
+    ])
+    def test_memory_noncont_partial_store(self, intrin, elsizes, scale):
+        if self._scalar_size() not in elsizes:
             return
-
+        npyv_storen_till = eval(intrin)
         data = self._data()
         vdata = self.load(data)
         lanes = list(range(1, self.nlanes + 1))
         lanes += [self.nlanes**2, self.nlanes**4]
+        hlanes = self.nlanes // scale
         for stride in range(1, 64):
             for n in lanes:
-                data_till = [15] * stride * self.nlanes
-                data_till[::stride] = data[:n] + [15] * (self.nlanes-n)
-                storen_till = [15] * stride * self.nlanes
-                storen_till += [127]*64
-                self.storen_till(storen_till, stride, n, vdata)
+                data_till = [0xff] * stride * self.nlanes
+                tdata = data[:n*scale] + [0xff] * (self.nlanes-n*scale)
+                for s in range(0, hlanes*stride, stride)[:n]:
+                    i = (s//stride)*scale
+                    data_till[s:s+scale] = tdata[i:i+scale]
+                storen_till = [0xff] * stride * self.nlanes
+                storen_till += [0x7f]*64
+                npyv_storen_till(storen_till, stride, n, vdata)
                 assert storen_till[:-64] == data_till
-                assert storen_till[-64:] == [127]*64 # detect overflow
+                assert storen_till[-64:] == [0x7f]*64  # detect overflow
 
         for stride in range(-64, 0):
             for n in lanes:
-                data_till = [15] * -stride * self.nlanes
-                data_till[::stride] = data[:n] + [15] * (self.nlanes-n)
-                storen_till = [127]*64
-                storen_till += [15] * -stride * self.nlanes
-                self.storen_till(storen_till, stride, n, vdata)
+                data_till = [0xff] * -stride * self.nlanes
+                tdata = data[:n*scale] + [0xff] * (self.nlanes-n*scale)
+                for s in range(0, hlanes*stride, stride)[:n]:
+                    i = (s//stride)*scale
+                    data_till[s-scale:s or None] = tdata[i:i+scale]
+                storen_till = [0x7f]*64
+                storen_till += [0xff] * -stride * self.nlanes
+                npyv_storen_till(storen_till, stride, n, vdata)
                 assert storen_till[64:] == data_till
-                assert storen_till[:64] == [127]*64 # detect overflow
+                assert storen_till[:64] == [0x7f]*64  # detect overflow
+
+        # stride 0
+        for n in lanes:
+            data_till = [0x7f] * self.nlanes
+            storen_till = data_till.copy()
+            data_till[0:scale] = data[:n*scale][-scale:]
+            npyv_storen_till(storen_till, 0, n, vdata)
+            assert storen_till == data_till
 
     @pytest.mark.parametrize("intrin, table_size, elsize", [
         ("self.lut32", 32, 32),
@@ -851,13 +957,27 @@ class _SIMD_ALL(_Test_Utility):
         combineh = self.combineh(vdata_a, vdata_b)
         assert combineh == data_a_hi + data_b_hi
         # combine x2
-        combine  = self.combine(vdata_a, vdata_b)
+        combine = self.combine(vdata_a, vdata_b)
         assert combine == (data_a_lo + data_b_lo, data_a_hi + data_b_hi)
+
         # zip(interleave)
-        data_zipl = [v for p in zip(data_a_lo, data_b_lo) for v in p]
-        data_ziph = [v for p in zip(data_a_hi, data_b_hi) for v in p]
-        vzip  = self.zip(vdata_a, vdata_b)
+        data_zipl = self.load([
+            v for p in zip(data_a_lo, data_b_lo) for v in p
+        ])
+        data_ziph = self.load([
+            v for p in zip(data_a_hi, data_b_hi) for v in p
+        ])
+        vzip = self.zip(vdata_a, vdata_b)
         assert vzip == (data_zipl, data_ziph)
+        vzip = [0]*self.nlanes*2
+        self._x2("store")(vzip, (vdata_a, vdata_b))
+        assert vzip == list(data_zipl) + list(data_ziph)
+
+        # unzip(deinterleave)
+        unzip = self.unzip(data_zipl, data_ziph)
+        assert unzip == (data_a, data_b)
+        unzip = self._x2("load")(list(data_zipl) + list(data_ziph))
+        assert unzip == (data_a, data_b)
 
     def test_reorder_rev64(self):
         # Reverse elements of each 64-bit lane
@@ -871,41 +991,51 @@ class _SIMD_ALL(_Test_Utility):
         rev64 = self.rev64(self.load(range(self.nlanes)))
         assert rev64 == data_rev64
 
-    def test_operators_comparison(self):
+    def test_reorder_permi128(self):
+        """
+        Test permuting elements for each 128-bit lane.
+        npyv_permi128_##sfx
+        """
+        ssize = self._scalar_size()
+        if ssize < 32:
+            return
+        data = self.load(self._data())
+        permn = 128//ssize
+        permd = permn-1
+        nlane128 = self.nlanes//permn
+        shfl = [0, 1] if ssize == 64 else [0, 2, 4, 6]
+        for i in range(permn):
+            indices = [(i >> shf) & permd for shf in shfl]
+            vperm = self.permi128(data, *indices)
+            data_vperm = [
+                data[j + (e & -permn)]
+                for e, j in enumerate(indices*nlane128)
+            ]
+            assert vperm == data_vperm
+
+    @pytest.mark.parametrize('func, intrin', [
+        (operator.lt, "cmplt"),
+        (operator.le, "cmple"),
+        (operator.gt, "cmpgt"),
+        (operator.ge, "cmpge"),
+        (operator.eq, "cmpeq")
+    ])
+    def test_operators_comparison(self, func, intrin):
         if self._is_fp():
             data_a = self._data()
         else:
             data_a = self._data(self._int_max() - self.nlanes)
         data_b = self._data(self._int_min(), reverse=True)
         vdata_a, vdata_b = self.load(data_a), self.load(data_b)
+        intrin = getattr(self, intrin)
 
         mask_true = self._true_mask()
         def to_bool(vector):
             return [lane == mask_true for lane in vector]
-        # equal
-        data_eq = [a == b for a, b in zip(data_a, data_b)]
-        cmpeq = to_bool(self.cmpeq(vdata_a, vdata_b))
-        assert cmpeq == data_eq
-        # not equal
-        data_neq = [a != b for a, b in zip(data_a, data_b)]
-        cmpneq = to_bool(self.cmpneq(vdata_a, vdata_b))
-        assert cmpneq == data_neq
-        # greater than
-        data_gt = [a > b for a, b in zip(data_a, data_b)]
-        cmpgt = to_bool(self.cmpgt(vdata_a, vdata_b))
-        assert cmpgt == data_gt
-        # greater than and equal
-        data_ge = [a >= b for a, b in zip(data_a, data_b)]
-        cmpge = to_bool(self.cmpge(vdata_a, vdata_b))
-        assert cmpge == data_ge
-        # less than
-        data_lt  = [a < b for a, b in zip(data_a, data_b)]
-        cmplt = to_bool(self.cmplt(vdata_a, vdata_b))
-        assert cmplt == data_lt
-        # less than and equal
-        data_le  = [a <= b for a, b in zip(data_a, data_b)]
-        cmple = to_bool(self.cmple(vdata_a, vdata_b))
-        assert cmple == data_le
+
+        data_cmp = [func(a, b) for a, b in zip(data_a, data_b)]
+        cmp = to_bool(intrin(vdata_a, vdata_b))
+        assert cmp == data_cmp
 
     def test_operators_logical(self):
         if self._is_fp():
@@ -1145,6 +1275,18 @@ class _SIMD_ALL(_Test_Utility):
         ifadd = self.ifadd(false_mask, vdata_a, vdata_b, vdata_b)
         assert ifadd == vdata_b
 
+        if not self._is_fp():
+            return
+        data_div = self.div(vdata_b, vdata_a)
+        ifdiv = self.ifdiv(true_mask, vdata_b, vdata_a, vdata_b)
+        assert ifdiv == data_div
+        ifdivz = self.ifdivz(true_mask, vdata_b, vdata_a)
+        assert ifdivz == data_div
+        ifdiv = self.ifdiv(false_mask, vdata_a, vdata_b, vdata_b)
+        assert ifdiv == vdata_b
+        ifdivz = self.ifdivz(false_mask, vdata_a, vdata_b)
+        assert ifdivz == self.zero()
+
 bool_sfx = ("b8", "b16", "b32", "b64")
 int_sfx = ("u8", "s8", "u16", "s16", "u32", "s32", "u64", "s64")
 fp_sfx  = ("f32", "f64")
diff --git a/numpy/core/tests/test_strings.py b/numpy/core/tests/test_strings.py
index 2b87ed654..42f775e85 100644
--- a/numpy/core/tests/test_strings.py
+++ b/numpy/core/tests/test_strings.py
@@ -83,3 +83,17 @@ def test_string_comparisons_empty(op, ufunc, sym, dtypes):
     assert_array_equal(op(arr, arr2), expected)
     assert_array_equal(ufunc(arr, arr2), expected)
     assert_array_equal(np.compare_chararrays(arr, arr2, sym, False), expected)
+
+
+@pytest.mark.parametrize("str_dt", ["S", "U"])
+@pytest.mark.parametrize("float_dt", np.typecodes["AllFloat"])
+def test_float_to_string_cast(str_dt, float_dt):
+    float_dt = np.dtype(float_dt)
+    fi = np.finfo(float_dt)
+    arr = np.array([np.nan, np.inf, -np.inf, fi.max, fi.min], dtype=float_dt)
+    expected = ["nan", "inf", "-inf", repr(fi.max), repr(fi.min)]
+    if float_dt.kind == 'c':
+        expected = [f"({r}+0j)" for r in expected]
+
+    res = arr.astype(str_dt)
+    assert_array_equal(res, np.array(expected, dtype=str_dt))
diff --git a/numpy/core/tests/test_ufunc.py b/numpy/core/tests/test_ufunc.py
index 55767a3ef..f716e2104 100644
--- a/numpy/core/tests/test_ufunc.py
+++ b/numpy/core/tests/test_ufunc.py
@@ -1,8 +1,10 @@
 import warnings
 import itertools
 import sys
+import ctypes as ct
 
 import pytest
+from pytest import param
 
 import numpy as np
 import numpy.core._umath_tests as umt
@@ -12,7 +14,7 @@ import numpy.core._rational_tests as _rational_tests
 from numpy.testing import (
     assert_, assert_equal, assert_raises, assert_array_equal,
     assert_almost_equal, assert_array_almost_equal, assert_no_warnings,
-    assert_allclose, HAS_REFCOUNT, suppress_warnings
+    assert_allclose, HAS_REFCOUNT, suppress_warnings, IS_WASM
     )
 from numpy.testing._private.utils import requires_memory
 from numpy.compat import pickle
@@ -477,6 +479,34 @@ class TestUfunc:
         float_dtype = type(np.dtype(np.float64))
         np.add(3, 4, signature=(float_dtype, float_dtype, None))
 
+    @pytest.mark.parametrize("get_kwarg", [
+            lambda dt: dict(dtype=x),
+            lambda dt: dict(signature=(x, None, None))])
+    def test_signature_dtype_instances_allowed(self, get_kwarg):
+        # We allow certain dtype instances when there is a clear singleton
+        # and the given one is equivalent; mainly for backcompat.
+        int64 = np.dtype("int64")
+        int64_2 = pickle.loads(pickle.dumps(int64))
+        # Relies on pickling behavior, if assert fails just remove test...
+        assert int64 is not int64_2
+
+        assert np.add(1, 2, **get_kwarg(int64_2)).dtype == int64
+        td = np.timedelta(2, "s")
+        assert np.add(td, td, **get_kwarg("m8")).dtype == "m8[s]"
+
+    @pytest.mark.parametrize("get_kwarg", [
+            param(lambda x: dict(dtype=x), id="dtype"),
+            param(lambda x: dict(signature=(x, None, None)), id="signature")])
+    def test_signature_dtype_instances_allowed(self, get_kwarg):
+        msg = "The `dtype` and `signature` arguments to ufuncs"
+
+        with pytest.raises(TypeError, match=msg):
+            np.add(3, 5, **get_kwarg(np.dtype("int64").newbyteorder()))
+        with pytest.raises(TypeError, match=msg):
+            np.add(3, 5, **get_kwarg(np.dtype("m8[ns]")))
+        with pytest.raises(TypeError, match=msg):
+            np.add(3, 5, **get_kwarg("m8[ns]"))
+
     @pytest.mark.parametrize("casting", ["unsafe", "same_kind", "safe"])
     def test_partial_signature_mismatch(self, casting):
         # If the second argument matches already, no need to specify it:
@@ -671,6 +701,7 @@ class TestUfunc:
         a = np.ones(500, dtype=np.float64)
         assert_almost_equal((a / 10.).sum() - a.size / 10., 0, 13)
 
+    @pytest.mark.skipif(IS_WASM, reason="fp errors don't work in wasm")
     def test_sum(self):
         for dt in (int, np.float16, np.float32, np.float64, np.longdouble):
             for v in (0, 1, 2, 7, 8, 9, 15, 16, 19, 127,
@@ -941,10 +972,10 @@ class TestUfunc:
         assert_raises(TypeError, inner1d, a, b, axes=[None, 1])
         # cannot pass an index unless there is only one dimension
         # (output is wrong in this case)
-        assert_raises(TypeError, inner1d, a, b, axes=[-1, -1, -1])
+        assert_raises(np.AxisError, inner1d, a, b, axes=[-1, -1, -1])
         # or pass in generally the wrong number of axes
-        assert_raises(ValueError, inner1d, a, b, axes=[-1, -1, (-1,)])
-        assert_raises(ValueError, inner1d, a, b, axes=[-1, (-2, -1), ()])
+        assert_raises(np.AxisError, inner1d, a, b, axes=[-1, -1, (-1,)])
+        assert_raises(np.AxisError, inner1d, a, b, axes=[-1, (-2, -1), ()])
         # axes need to have same length.
         assert_raises(ValueError, inner1d, a, b, axes=[0, 1])
 
@@ -984,14 +1015,16 @@ class TestUfunc:
         # list needs to have right length
         assert_raises(ValueError, mm, a, b, axes=[])
         assert_raises(ValueError, mm, a, b, axes=[(-2, -1)])
-        # list should contain tuples for multiple axes
-        assert_raises(TypeError, mm, a, b, axes=[-1, -1, -1])
-        assert_raises(TypeError, mm, a, b, axes=[(-2, -1), (-2, -1), -1])
+        # list should not contain None, or lists
+        assert_raises(TypeError, mm, a, b, axes=[None, None, None])
         assert_raises(TypeError,
                       mm, a, b, axes=[[-2, -1], [-2, -1], [-2, -1]])
         assert_raises(TypeError,
                       mm, a, b, axes=[(-2, -1), (-2, -1), [-2, -1]])
         assert_raises(TypeError, mm, a, b, axes=[(-2, -1), (-2, -1), None])
+        # single integers are AxisErrors if more are required
+        assert_raises(np.AxisError, mm, a, b, axes=[-1, -1, -1])
+        assert_raises(np.AxisError, mm, a, b, axes=[(-2, -1), (-2, -1), -1])
         # tuples should not have duplicated values
         assert_raises(ValueError, mm, a, b, axes=[(-2, -1), (-2, -1), (-2, -2)])
         # arrays should have enough axes.
@@ -1617,6 +1650,19 @@ class TestUfunc:
         a = a[1:, 1:, 1:]
         self.check_identityless_reduction(a)
 
+    def test_reduce_identity_depends_on_loop(self):
+        """
+        The type of the result should always depend on the selected loop, not
+        necessarily the output (only relevant for object arrays).
+        """
+        # For an object loop, the default value 0 with type int is used:
+        assert type(np.add.reduce([], dtype=object)) is int
+        out = np.array(None, dtype=object)
+        # When the loop is float64 but `out` is object this does not happen,
+        # the result is float64 cast to object (which gives Python `float`).
+        np.add.reduce([], out=out, dtype=np.float64)
+        assert type(out[()]) is float
+
     def test_initial_reduction(self):
         # np.minimum.reduce is an identityless reduction
 
@@ -1637,6 +1683,9 @@ class TestUfunc:
         # Check initial=None raises ValueError for both types of ufunc reductions
         assert_raises(ValueError, np.minimum.reduce, [], initial=None)
         assert_raises(ValueError, np.add.reduce, [], initial=None)
+        # Also in the somewhat special object case:
+        with pytest.raises(ValueError):
+            np.add.reduce([], initial=None, dtype=object)
 
         # Check that np._NoValue gives default behavior.
         assert_equal(np.add.reduce([], initial=np._NoValue), 0)
@@ -1646,6 +1695,23 @@ class TestUfunc:
         res = np.add.reduce(a, initial=5)
         assert_equal(res, 15)
 
+    def test_empty_reduction_and_idenity(self):
+        arr = np.zeros((0, 5))
+        # OK, since the reduction itself is *not* empty, the result is
+        assert np.true_divide.reduce(arr, axis=1).shape == (0,)
+        # Not OK, the reduction itself is empty and we have no idenity
+        with pytest.raises(ValueError):
+            np.true_divide.reduce(arr, axis=0)
+
+        # Test that an empty reduction fails also if the result is empty
+        arr = np.zeros((0, 0, 5))
+        with pytest.raises(ValueError):
+            np.true_divide.reduce(arr, axis=1)
+
+        # Division reduction makes sense with `initial=1` (empty or not):
+        res = np.true_divide.reduce(arr, axis=1, initial=1)
+        assert_array_equal(res, np.ones((0, 5)))
+
     @pytest.mark.parametrize('axis', (0, 1, None))
     @pytest.mark.parametrize('where', (np.array([False, True, True]),
                                        np.array([[True], [False], [True]]),
@@ -1890,17 +1956,133 @@ class TestUfunc:
         assert_(MyThing.rmul_count == 1, MyThing.rmul_count)
         assert_(MyThing.getitem_count <= 2, MyThing.getitem_count)
 
-    def test_inplace_fancy_indexing(self):
+    @pytest.mark.parametrize("a", (
+                             np.arange(10, dtype=int),
+                             np.arange(10, dtype=_rational_tests.rational),
+                             ))
+    def test_ufunc_at_basic(self, a):
 
-        a = np.arange(10)
-        np.add.at(a, [2, 5, 2], 1)
-        assert_equal(a, [0, 1, 4, 3, 4, 6, 6, 7, 8, 9])
+        aa = a.copy()
+        np.add.at(aa, [2, 5, 2], 1)
+        assert_equal(aa, [0, 1, 4, 3, 4, 6, 6, 7, 8, 9])
 
-        a = np.arange(10)
+        with pytest.raises(ValueError):
+            # missing second operand
+            np.add.at(aa, [2, 5, 3])
+
+        aa = a.copy()
+        np.negative.at(aa, [2, 5, 3])
+        assert_equal(aa, [0, 1, -2, -3, 4, -5, 6, 7, 8, 9])
+
+        aa = a.copy()
         b = np.array([100, 100, 100])
-        np.add.at(a, [2, 5, 2], b)
-        assert_equal(a, [0, 1, 202, 3, 4, 105, 6, 7, 8, 9])
+        np.add.at(aa, [2, 5, 2], b)
+        assert_equal(aa, [0, 1, 202, 3, 4, 105, 6, 7, 8, 9])
 
+        with pytest.raises(ValueError):
+            # extraneous second operand
+            np.negative.at(a, [2, 5, 3], [1, 2, 3])
+
+        with pytest.raises(ValueError):
+            # second operand cannot be converted to an array
+            np.add.at(a, [2, 5, 3], [[1, 2], 1])
+
+    # ufuncs with indexed loops for performance in ufunc.at
+    indexed_ufuncs = [np.add, np.subtract, np.multiply, np.floor_divide,
+                      np.maximum, np.minimum, np.fmax, np.fmin]
+
+    @pytest.mark.parametrize(
+                "typecode", np.typecodes['AllInteger'] + np.typecodes['Float'])
+    @pytest.mark.parametrize("ufunc", indexed_ufuncs)
+    def test_ufunc_at_inner_loops(self, typecode, ufunc):
+        if ufunc is np.divide and typecode in np.typecodes['AllInteger']:
+            # Avoid divide-by-zero and inf for integer divide
+            a = np.ones(100, dtype=typecode)
+            indx = np.random.randint(100, size=30, dtype=np.intp)
+            vals = np.arange(1, 31, dtype=typecode)
+        else:
+            a = np.ones(1000, dtype=typecode)
+            indx = np.random.randint(1000, size=3000, dtype=np.intp)
+            vals = np.arange(3000, dtype=typecode)
+        atag = a.copy()
+        # Do the calculation twice and compare the answers
+        with warnings.catch_warnings(record=True) as w_at:
+            warnings.simplefilter('always')
+            ufunc.at(a, indx, vals)
+        with warnings.catch_warnings(record=True) as w_loop:
+            warnings.simplefilter('always')
+            for i, v in zip(indx, vals):
+                # Make sure all the work happens inside the ufunc
+                # in order to duplicate error/warning handling
+                ufunc(atag[i], v, out=atag[i:i+1], casting="unsafe")
+        assert_equal(atag, a)
+        # If w_loop warned, make sure w_at warned as well
+        if len(w_loop) > 0:
+            #
+            assert len(w_at) > 0
+            assert w_at[0].category == w_loop[0].category
+            assert str(w_at[0].message)[:10] == str(w_loop[0].message)[:10]
+
+    @pytest.mark.parametrize("typecode", np.typecodes['Complex'])
+    @pytest.mark.parametrize("ufunc", [np.add, np.subtract, np.multiply])
+    def test_ufunc_at_inner_loops_complex(self, typecode, ufunc):
+        a = np.ones(10, dtype=typecode)
+        indx = np.concatenate([np.ones(6, dtype=np.intp),
+                               np.full(18, 4, dtype=np.intp)])
+        value = a.dtype.type(1j)
+        ufunc.at(a, indx, value)
+        expected = np.ones_like(a)
+        if ufunc is np.multiply:
+            expected[1] = expected[4] = -1
+        else:
+            expected[1] += 6 * (value if ufunc is np.add else -value)
+            expected[4] += 18 * (value if ufunc is np.add else -value)
+
+        assert_array_equal(a, expected)
+
+    def test_ufunc_at_ellipsis(self):
+        # Make sure the indexed loop check does not choke on iters
+        # with subspaces
+        arr = np.zeros(5)
+        np.add.at(arr, slice(None), np.ones(5))
+        assert_array_equal(arr, np.ones(5))
+
+    def test_ufunc_at_negative(self):
+        arr = np.ones(5, dtype=np.int32)
+        indx = np.arange(5)
+        umt.indexed_negative.at(arr, indx)
+        # If it is [-1, -1, -1, -100, 0] then the regular strided loop was used
+        assert np.all(arr == [-1, -1, -1, -200, -1])
+
+    def test_ufunc_at_large(self):
+        # issue gh-23457
+        indices = np.zeros(8195, dtype=np.int16)
+        b = np.zeros(8195, dtype=float)
+        b[0] = 10
+        b[1] = 5
+        b[8192:] = 100
+        a = np.zeros(1, dtype=float)
+        np.add.at(a, indices, b)
+        assert a[0] == b.sum()
+
+    def test_cast_index_fastpath(self):
+        arr = np.zeros(10)
+        values = np.ones(100000)
+        # index must be cast, which may be buffered in chunks:
+        index = np.zeros(len(values), dtype=np.uint8)
+        np.add.at(arr, index, values)
+        assert arr[0] == len(values)
+
+    @pytest.mark.parametrize("value", [
+        np.ones(1), np.ones(()), np.float64(1.), 1.])
+    def test_ufunc_at_scalar_value_fastpath(self, value):
+        arr = np.zeros(1000)
+        # index must be cast, which may be buffered in chunks:
+        index = np.repeat(np.arange(1000), 2)
+        np.add.at(arr, index, value)
+        assert_array_equal(arr, np.full_like(arr, 2 * value))
+
+    def test_ufunc_at_multiD(self):
         a = np.arange(9).reshape(3, 3)
         b = np.array([[100, 100, 100], [200, 200, 200], [300, 300, 300]])
         np.add.at(a, (slice(None), [1, 2, 1]), b)
@@ -1980,11 +2162,7 @@ class TestUfunc:
               [121, 222, 323],
               [124, 225, 326]]])
 
-        a = np.arange(10)
-        np.negative.at(a, [2, 5, 2])
-        assert_equal(a, [0, 1, 2, 3, 4, -5, 6, 7, 8, 9])
-
-        # Test 0-dim array
+    def test_ufunc_at_0D(self):
         a = np.array(0)
         np.add.at(a, (), 1)
         assert_equal(a, 1)
@@ -1992,11 +2170,13 @@ class TestUfunc:
         assert_raises(IndexError, np.add.at, a, 0, 1)
         assert_raises(IndexError, np.add.at, a, [], 1)
 
+    def test_ufunc_at_dtypes(self):
         # Test mixed dtypes
         a = np.arange(10)
         np.power.at(a, [1, 2, 3, 2], 3.5)
         assert_equal(a, np.array([0, 1, 4414, 46, 4, 5, 6, 7, 8, 9]))
 
+    def test_ufunc_at_boolean(self):
         # Test boolean indexing and boolean ufuncs
         a = np.arange(10)
         index = a % 2 == 0
@@ -2008,6 +2188,7 @@ class TestUfunc:
         np.invert.at(a, [2, 5, 2])
         assert_equal(a, [0, 1, 2, 3, 4, 5 ^ 0xffffffff, 6, 7, 8, 9])
 
+    def test_ufunc_at_advanced(self):
         # Test empty subspace
         orig = np.arange(4)
         a = orig[:, None][:, 0:0]
@@ -2031,7 +2212,7 @@ class TestUfunc:
         # Test maximum
         a = np.array([1, 2, 3])
         np.maximum.at(a, [0], 0)
-        assert_equal(np.array([1, 2, 3]), a)
+        assert_equal(a, np.array([1, 2, 3]))
 
     def test_at_not_none_signature(self):
         # Test ufuncs with non-trivial signature raise a TypeError
@@ -2042,6 +2223,23 @@ class TestUfunc:
         a = np.array([[[1, 2], [3, 4]]])
         assert_raises(TypeError, np.linalg._umath_linalg.det.at, a, [0])
 
+    def test_at_no_loop_for_op(self):
+        # str dtype does not have a ufunc loop for np.add
+        arr = np.ones(10, dtype=str)
+        with pytest.raises(np.core._exceptions._UFuncNoLoopError):
+            np.add.at(arr, [0, 1], [0, 1])
+
+    def test_at_output_casting(self):
+        arr = np.array([-1])
+        np.equal.at(arr, [0], [0])
+        assert arr[0] == 0
+
+    def test_at_broadcast_failure(self):
+        arr = np.arange(5)
+        with pytest.raises(ValueError):
+            np.add.at(arr, [0, 1], [1, 2, 3])
+
+
     def test_reduce_arguments(self):
         f = np.add.reduce
         d = np.ones((5,2), dtype=int)
@@ -2509,6 +2707,7 @@ def test_ufunc_input_casterrors(bad_offset):
         np.add(arr, arr, dtype=np.intp, casting="unsafe")
 
 
+@pytest.mark.skipif(IS_WASM, reason="fp errors don't work in wasm")
 @pytest.mark.parametrize("bad_offset", [0, int(np.BUFSIZE * 1.5)])
 def test_ufunc_input_floatingpoint_error(bad_offset):
     value = 123
@@ -2546,7 +2745,9 @@ def test_reduce_casterrors(offset):
     with pytest.raises(ValueError, match="invalid literal"):
         # This is an unsafe cast, but we currently always allow that.
         # Note that the double loop is picked, but the cast fails.
-        np.add.reduce(arr, dtype=np.intp, out=out)
+        # `initial=None` disables the use of an identity here to test failures
+        # while copying the first values path (not used when identity exists).
+        np.add.reduce(arr, dtype=np.intp, out=out, initial=None)
     assert count == sys.getrefcount(value)
     # If an error occurred during casting, the operation is done at most until
     # the error occurs (the result of which would be `value * offset`) and -1
@@ -2555,6 +2756,16 @@ def test_reduce_casterrors(offset):
     assert out[()] < value * offset
 
 
+def test_object_reduce_cleanup_on_failure():
+    # Test cleanup, including of the initial value (manually provided or not)
+    with pytest.raises(TypeError):
+        np.add.reduce([1, 2, None], initial=4)
+
+    with pytest.raises(TypeError):
+        np.add.reduce([1, 2, None])
+
+
+@pytest.mark.skipif(IS_WASM, reason="fp errors don't work in wasm")
 @pytest.mark.parametrize("method",
         [np.add.accumulate, np.add.reduce,
          pytest.param(lambda x: np.add.reduceat(x, [0]), id="reduceat"),
@@ -2624,3 +2835,142 @@ def test_addition_reduce_negative_zero(dtype, use_initial):
             # `sum([])` should probably be 0.0 and not -0.0 like `sum([-0.0])`
             assert not np.signbit(res.real)
             assert not np.signbit(res.imag)
+
+class TestLowlevelAPIAccess:
+    def test_resolve_dtypes_basic(self):
+        # Basic test for dtype resolution:
+        i4 = np.dtype("i4")
+        f4 = np.dtype("f4")
+        f8 = np.dtype("f8")
+
+        r = np.add.resolve_dtypes((i4, f4, None))
+        assert r == (f8, f8, f8)
+
+        # Signature uses the same logic to parse as ufunc (less strict)
+        # the following is "same-kind" casting so works:
+        r = np.add.resolve_dtypes((
+                i4, i4, None), signature=(None, None, "f4"))
+        assert r == (f4, f4, f4)
+
+        # Check NEP 50 "weak" promotion also:
+        r = np.add.resolve_dtypes((f4, int, None))
+        assert r == (f4, f4, f4)
+
+        with pytest.raises(TypeError):
+            np.add.resolve_dtypes((i4, f4, None), casting="no")
+
+    def test_weird_dtypes(self):
+        S0 = np.dtype("S0")
+        # S0 is often converted by NumPy to S1, but not here:
+        r = np.equal.resolve_dtypes((S0, S0, None))
+        assert r == (S0, S0, np.dtype(bool))
+
+        # Subarray dtypes are weird and may not work fully, we preserve them
+        # leading to a TypeError (currently no equal loop for void/structured)
+        dts = np.dtype("10i")
+        with pytest.raises(TypeError):
+            np.equal.resolve_dtypes((dts, dts, None))
+
+    def test_resolve_dtypes_reduction(self):
+        i4 = np.dtype("i4")
+        with pytest.raises(NotImplementedError):
+            np.add.resolve_dtypes((i4, i4, i4), reduction=True)
+
+    @pytest.mark.parametrize("dtypes", [
+            (np.dtype("i"), np.dtype("i")),
+            (None, np.dtype("i"), np.dtype("f")),
+            (np.dtype("i"), None, np.dtype("f")),
+            ("i4", "i4", None)])
+    def test_resolve_dtypes_errors(self, dtypes):
+        with pytest.raises(TypeError):
+            np.add.resolve_dtypes(dtypes)
+
+    def test_resolve_dtypes_reduction(self):
+        i2 = np.dtype("i2")
+        long_ = np.dtype("long")
+        # Check special addition resolution:
+        res = np.add.resolve_dtypes((None, i2, None), reduction=True)
+        assert res == (long_, long_, long_)
+
+    def test_resolve_dtypes_reduction_errors(self):
+        i2 = np.dtype("i2")
+
+        with pytest.raises(TypeError):
+            np.add.resolve_dtypes((None, i2, i2))
+
+        with pytest.raises(TypeError):
+            np.add.signature((None, None, "i4"))
+
+    @pytest.mark.skipif(not hasattr(ct, "pythonapi"),
+            reason="`ctypes.pythonapi` required for capsule unpacking.")
+    def test_loop_access(self):
+        # This is a basic test for the full strided loop access
+        data_t = ct.ARRAY(ct.c_char_p, 2)
+        dim_t = ct.ARRAY(ct.c_ssize_t, 1)
+        strides_t = ct.ARRAY(ct.c_ssize_t, 2)
+        strided_loop_t = ct.CFUNCTYPE(
+                ct.c_int, ct.c_void_p, data_t, dim_t, strides_t, ct.c_void_p)
+
+        class call_info_t(ct.Structure):
+            _fields_ = [
+                ("strided_loop", strided_loop_t),
+                ("context", ct.c_void_p),
+                ("auxdata", ct.c_void_p),
+                ("requires_pyapi", ct.c_byte),
+                ("no_floatingpoint_errors", ct.c_byte),
+            ]
+
+        i4 = np.dtype("i4")
+        dt, call_info_obj = np.negative._resolve_dtypes_and_context((i4, i4))
+        assert dt == (i4, i4)  # can be used without casting
+
+        # Fill in the rest of the information:
+        np.negative._get_strided_loop(call_info_obj)
+
+        ct.pythonapi.PyCapsule_GetPointer.restype = ct.c_void_p
+        call_info = ct.pythonapi.PyCapsule_GetPointer(
+                ct.py_object(call_info_obj),
+                ct.c_char_p(b"numpy_1.24_ufunc_call_info"))
+
+        call_info = ct.cast(call_info, ct.POINTER(call_info_t)).contents
+
+        arr = np.arange(10, dtype=i4)
+        call_info.strided_loop(
+                call_info.context,
+                data_t(arr.ctypes.data, arr.ctypes.data),
+                arr.ctypes.shape,  # is a C-array with 10 here
+                strides_t(arr.ctypes.strides[0], arr.ctypes.strides[0]),
+                call_info.auxdata)
+
+        # We just directly called the negative inner-loop in-place:
+        assert_array_equal(arr, -np.arange(10, dtype=i4))
+
+    @pytest.mark.parametrize("strides", [1, (1, 2, 3), (1, "2")])
+    def test__get_strided_loop_errors_bad_strides(self, strides):
+        i4 = np.dtype("i4")
+        dt, call_info = np.negative._resolve_dtypes_and_context((i4, i4))
+
+        with pytest.raises(TypeError, match="fixed_strides.*tuple.*or None"):
+            np.negative._get_strided_loop(call_info, fixed_strides=strides)
+
+    def test__get_strided_loop_errors_bad_call_info(self):
+        i4 = np.dtype("i4")
+        dt, call_info = np.negative._resolve_dtypes_and_context((i4, i4))
+
+        with pytest.raises(ValueError, match="PyCapsule"):
+            np.negative._get_strided_loop("not the capsule!")
+
+        with pytest.raises(TypeError, match=".*incompatible context"):
+            np.add._get_strided_loop(call_info)
+
+        np.negative._get_strided_loop(call_info)
+        with pytest.raises(TypeError):
+            # cannot call it a second time:
+            np.negative._get_strided_loop(call_info)
+
+    def test_long_arrays(self):
+        t = np.zeros((1029, 917), dtype=np.single)
+        t[0][0] = 1
+        t[28][414] = 1
+        tc = np.cos(t)
+        assert_equal(tc[0][0], tc[28][414])
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index e2e95ec69..9e3fe387b 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -17,10 +17,26 @@ from numpy.testing import (
     assert_, assert_equal, assert_raises, assert_raises_regex,
     assert_array_equal, assert_almost_equal, assert_array_almost_equal,
     assert_array_max_ulp, assert_allclose, assert_no_warnings, suppress_warnings,
-    _gen_alignment_data, assert_array_almost_equal_nulp
+    _gen_alignment_data, assert_array_almost_equal_nulp, IS_WASM, IS_MUSL
     )
 from numpy.testing._private.utils import _glibc_older_than
 
+UFUNCS = [obj for obj in np.core.umath.__dict__.values()
+         if isinstance(obj, np.ufunc)]
+
+UFUNCS_UNARY = [
+    uf for uf in UFUNCS if uf.nin == 1
+]
+UFUNCS_UNARY_FP = [
+    uf for uf in UFUNCS_UNARY if 'f->f' in uf.types
+]
+
+UFUNCS_BINARY = [
+    uf for uf in UFUNCS if uf.nin == 2
+]
+UFUNCS_BINARY_ACC = [
+    uf for uf in UFUNCS_BINARY if hasattr(uf, "accumulate") and uf.nout == 1
+]
 
 def interesting_binop_operands(val1, val2, dtype):
     """
@@ -275,16 +291,16 @@ class TestComparisons:
         b_lst = b.tolist()
 
         # (Binary) Comparison (x1=array, x2=array)
-        comp_b = np_comp(a, b)
-        comp_b_list = [py_comp(x, y) for x, y in zip(a_lst, b_lst)]
+        comp_b = np_comp(a, b).view(np.uint8)
+        comp_b_list = [int(py_comp(x, y)) for x, y in zip(a_lst, b_lst)]
 
         # (Scalar1) Comparison (x1=scalar, x2=array)
-        comp_s1 = np_comp(np_scalar, b)
-        comp_s1_list = [py_comp(scalar, x) for x in b_lst]
+        comp_s1 = np_comp(np_scalar, b).view(np.uint8)
+        comp_s1_list = [int(py_comp(scalar, x)) for x in b_lst]
 
         # (Scalar2) Comparison (x1=array, x2=scalar)
-        comp_s2 = np_comp(a, np_scalar)
-        comp_s2_list = [py_comp(x, scalar) for x in a_lst]
+        comp_s2 = np_comp(a, np_scalar).view(np.uint8)
+        comp_s2_list = [int(py_comp(x, scalar)) for x in a_lst]
 
         # Sequence: Binary, Scalar1 and Scalar2
         assert_(comp_b.tolist() == comp_b_list,
@@ -338,6 +354,79 @@ class TestComparisons:
         assert_equal(np.equal.reduce(a, dtype=bool), True)
         assert_raises(TypeError, np.equal.reduce, a)
 
+    def test_object_dtype(self):
+        assert np.equal(1, [1], dtype=object).dtype == object
+        assert np.equal(1, [1], signature=(None, None, "O")).dtype == object
+
+    def test_object_nonbool_dtype_error(self):
+        # bool output dtype is fine of course:
+        assert np.equal(1, [1], dtype=bool).dtype == bool
+
+        # but the following are examples do not have a loop:
+        with pytest.raises(TypeError, match="No loop matching"):
+            np.equal(1, 1, dtype=np.int64)
+
+        with pytest.raises(TypeError, match="No loop matching"):
+            np.equal(1, 1, sig=(None, None, "l"))
+
+    @pytest.mark.parametrize("dtypes", ["qQ", "Qq"])
+    @pytest.mark.parametrize('py_comp, np_comp', [
+        (operator.lt, np.less),
+        (operator.le, np.less_equal),
+        (operator.gt, np.greater),
+        (operator.ge, np.greater_equal),
+        (operator.eq, np.equal),
+        (operator.ne, np.not_equal)
+    ])
+    @pytest.mark.parametrize("vals", [(2**60, 2**60+1), (2**60+1, 2**60)])
+    def test_large_integer_direct_comparison(
+            self, dtypes, py_comp, np_comp, vals):
+        # Note that float(2**60) + 1 == float(2**60).
+        a1 = np.array([2**60], dtype=dtypes[0])
+        a2 = np.array([2**60 + 1], dtype=dtypes[1])
+        expected = py_comp(2**60, 2**60+1)
+
+        assert py_comp(a1, a2) == expected
+        assert np_comp(a1, a2) == expected
+        # Also check the scalars:
+        s1 = a1[0]
+        s2 = a2[0]
+        assert isinstance(s1, np.integer)
+        assert isinstance(s2, np.integer)
+        # The Python operator here is mainly interesting:
+        assert py_comp(s1, s2) == expected
+        assert np_comp(s1, s2) == expected
+
+    @pytest.mark.parametrize("dtype", np.typecodes['UnsignedInteger'])
+    @pytest.mark.parametrize('py_comp_func, np_comp_func', [
+        (operator.lt, np.less),
+        (operator.le, np.less_equal),
+        (operator.gt, np.greater),
+        (operator.ge, np.greater_equal),
+        (operator.eq, np.equal),
+        (operator.ne, np.not_equal)
+    ])
+    @pytest.mark.parametrize("flip", [True, False])
+    def test_unsigned_signed_direct_comparison(
+            self, dtype, py_comp_func, np_comp_func, flip):
+        if flip:
+            py_comp = lambda x, y: py_comp_func(y, x)
+            np_comp = lambda x, y: np_comp_func(y, x)
+        else:
+            py_comp = py_comp_func
+            np_comp = np_comp_func
+
+        arr = np.array([np.iinfo(dtype).max], dtype=dtype)
+        expected = py_comp(int(arr[0]), -1)
+
+        assert py_comp(arr, -1) == expected
+        assert np_comp(arr, -1) == expected
+        scalar = arr[0]
+        assert isinstance(scalar, np.integer)
+        # The Python operator here is mainly interesting:
+        assert py_comp(scalar, -1) == expected
+        assert np_comp(scalar, -1) == expected
+
 
 class TestAdd:
     def test_reduce_alignment(self):
@@ -362,6 +451,7 @@ class TestDivision:
         assert_equal(x // 100, [0, 0, 0, 1, -1, -1, -1, -1, -2])
         assert_equal(x % 100, [5, 10, 90, 0, 95, 90, 10, 0, 80])
 
+    @pytest.mark.skipif(IS_WASM, reason="fp errors don't work in wasm")
     @pytest.mark.parametrize("dtype,ex_val", itertools.product(
         np.sctypes['int'] + np.sctypes['uint'], (
             (
@@ -392,7 +482,7 @@ class TestDivision:
     def test_division_int_boundary(self, dtype, ex_val):
         fo = np.iinfo(dtype)
         neg = -1 if fo.min < 0 else 1
-        # Large enough to test SIMD loops and remaind elements
+        # Large enough to test SIMD loops and remainder elements
         lsize = 512 + 7
         a, b, divisors = eval(ex_val)
         a_lst, b_lst = a.tolist(), b.tolist()
@@ -447,6 +537,7 @@ class TestDivision:
 
             np.array([], dtype=dtype) // 0
 
+    @pytest.mark.skipif(IS_WASM, reason="fp errors don't work in wasm")
     @pytest.mark.parametrize("dtype,ex_val", itertools.product(
         np.sctypes['int'] + np.sctypes['uint'], (
             "np.array([fo.max, 1, 2, 1, 1, 2, 3], dtype=dtype)",
@@ -508,6 +599,8 @@ class TestDivision:
             quotient_array = np.array([quotient]*5)
             assert all(dividend_array // divisor == quotient_array), msg
         else:
+            if IS_WASM:
+                pytest.skip("fp errors don't work in wasm")
             with np.errstate(divide='raise', invalid='raise'):
                 with pytest.raises(FloatingPointError):
                     dividend // divisor
@@ -554,6 +647,9 @@ class TestDivision:
         assert_equal(np.signbit(x//1), 0)
         assert_equal(np.signbit((-x)//1), 1)
 
+    @pytest.mark.skipif(hasattr(np.__config__, "blas_ssl2_info"),
+            reason="gh-22982")
+    @pytest.mark.skipif(IS_WASM, reason="fp errors don't work in wasm")
     @pytest.mark.parametrize('dtype', np.typecodes['Float'])
     def test_floor_division_errors(self, dtype):
         fnan = np.array(np.nan, dtype=dtype)
@@ -668,6 +764,7 @@ class TestRemainder:
                     else:
                         assert_(b > rem >= 0, msg)
 
+    @pytest.mark.skipif(IS_WASM, reason="fp errors don't work in wasm")
     @pytest.mark.xfail(sys.platform.startswith("darwin"),
             reason="MacOS seems to not give the correct 'invalid' warning for "
                    "`fmod`.  Hopefully, others always do.")
@@ -694,6 +791,9 @@ class TestRemainder:
             # inf / 0 does not set any flags, only the modulo creates a NaN
             np.divmod(finf, fzero)
 
+    @pytest.mark.skipif(hasattr(np.__config__, "blas_ssl2_info"),
+            reason="gh-22982")
+    @pytest.mark.skipif(IS_WASM, reason="fp errors don't work in wasm")
     @pytest.mark.xfail(sys.platform.startswith("darwin"),
            reason="MacOS seems to not give the correct 'invalid' warning for "
                   "`fmod`.  Hopefully, others always do.")
@@ -715,6 +815,7 @@ class TestRemainder:
             fn(fone, fnan)
             fn(fnan, fone)
 
+    @pytest.mark.skipif(IS_WASM, reason="fp errors don't work in wasm")
     def test_float_remainder_overflow(self):
         a = np.finfo(np.float64).tiny
         with np.errstate(over='ignore', invalid='ignore'):
@@ -836,6 +937,7 @@ class TestDivisionIntegerOverflowsAndDivideByZero:
             helper_lambdas['min-zero'], helper_lambdas['neg_min-zero'])
     }
 
+    @pytest.mark.skipif(IS_WASM, reason="fp errors don't work in wasm")
     @pytest.mark.parametrize("dtype", np.typecodes["Integer"])
     def test_signed_division_overflow(self, dtype):
         to_check = interesting_binop_operands(np.iinfo(dtype).min, -1, dtype)
@@ -862,6 +964,7 @@ class TestDivisionIntegerOverflowsAndDivideByZero:
             assert extractor(res1) == np.iinfo(op1.dtype).min
             assert extractor(res2) == 0
 
+    @pytest.mark.skipif(IS_WASM, reason="fp errors don't work in wasm")
     @pytest.mark.parametrize("dtype", np.typecodes["AllInteger"])
     def test_divide_by_zero(self, dtype):
         # Note that the return value cannot be well defined here, but NumPy
@@ -881,6 +984,7 @@ class TestDivisionIntegerOverflowsAndDivideByZero:
             assert extractor(res1) == 0
             assert extractor(res2) == 0
 
+    @pytest.mark.skipif(IS_WASM, reason="fp errors don't work in wasm")
     @pytest.mark.parametrize("dividend_dtype",
             np.sctypes['int'])
     @pytest.mark.parametrize("divisor_dtype",
@@ -1043,6 +1147,33 @@ class TestPower:
                 assert_complex_equal(np.power(zero, -p), cnan)
             assert_complex_equal(np.power(zero, -1+0.2j), cnan)
 
+    @pytest.mark.skipif(IS_WASM, reason="fp errors don't work in wasm")
+    def test_zero_power_nonzero(self):
+        # Testing 0^{Non-zero} issue 18378
+        zero = np.array([0.0+0.0j])
+        cnan = np.array([complex(np.nan, np.nan)])
+
+        def assert_complex_equal(x, y):
+            assert_array_equal(x.real, y.real)
+            assert_array_equal(x.imag, y.imag)
+
+        #Complex powers with positive real part will not generate a warning
+        assert_complex_equal(np.power(zero, 1+4j), zero)
+        assert_complex_equal(np.power(zero, 2-3j), zero)
+        #Testing zero values when real part is greater than zero
+        assert_complex_equal(np.power(zero, 1+1j), zero)
+        assert_complex_equal(np.power(zero, 1+0j), zero)
+        assert_complex_equal(np.power(zero, 1-1j), zero)
+        #Complex powers will negative real part or 0 (provided imaginary
+        # part is not zero) will generate a NAN and hence a RUNTIME warning
+        with pytest.warns(expected_warning=RuntimeWarning) as r:
+            assert_complex_equal(np.power(zero, -1+1j), cnan)
+            assert_complex_equal(np.power(zero, -2-3j), cnan)
+            assert_complex_equal(np.power(zero, -7+0j), cnan)
+            assert_complex_equal(np.power(zero, 0+1j), cnan)
+            assert_complex_equal(np.power(zero, 0-1j), cnan)
+        assert len(r) == 5
+
     def test_fast_power(self):
         x = np.array([1, 2, 3], np.int16)
         res = x**2.0
@@ -1095,7 +1226,7 @@ class TestPower:
             assert_raises(ValueError, np.power, a, minusone)
             assert_raises(ValueError, np.power, one, b)
             assert_raises(ValueError, np.power, one, minusone)
-    
+
     def test_float_to_inf_power(self):
         for dt in [np.float32, np.float64]:
             a = np.array([1, 1, 2, 2, -2, -2, np.inf, -np.inf], dt)
@@ -1132,6 +1263,7 @@ class TestLog2:
         v = np.log2(2.**i)
         assert_equal(v, float(i), err_msg='at exponent %d' % i)
 
+    @pytest.mark.skipif(IS_WASM, reason="fp errors don't work in wasm")
     def test_log2_special(self):
         assert_equal(np.log2(1.), 0.)
         assert_equal(np.log2(np.inf), np.inf)
@@ -1222,9 +1354,20 @@ class TestLog:
 
         # test log() of max for dtype does not raise
         for dt in ['f', 'd', 'g']:
-            with np.errstate(all='raise'):
-                x = np.finfo(dt).max
-                np.log(x)
+            try:
+                with np.errstate(all='raise'):
+                    x = np.finfo(dt).max
+                    np.log(x)
+            except FloatingPointError as exc:
+                if dt == 'g' and IS_MUSL:
+                    # FloatingPointError is known to occur on longdouble
+                    # for musllinux_x86_64 x is very large
+                    pytest.skip(
+                        "Overflow has occurred for"
+                        " np.log(np.finfo(np.longdouble).max)"
+                    )
+                else:
+                    raise exc
 
     def test_log_strides(self):
         np.random.seed(42)
@@ -1290,6 +1433,7 @@ class TestSpecialFloats:
             assert_raises(FloatingPointError, np.exp, np.float64(-1000.))
             assert_raises(FloatingPointError, np.exp, np.float64(-1E19))
 
+    @pytest.mark.skipif(IS_WASM, reason="fp errors don't work in wasm")
     def test_log_values(self):
         with np.errstate(all='ignore'):
             x = [np.nan, np.nan, np.inf, np.nan, -np.inf, np.nan]
@@ -1339,23 +1483,51 @@ class TestSpecialFloats:
             a = np.array(1e9, dtype='float32')
             np.log(a)
 
-    def test_sincos_values(self):
+    @pytest.mark.skipif(IS_WASM, reason="fp errors don't work in wasm")
+    @pytest.mark.parametrize('dtype', ['e', 'f', 'd', 'g'])
+    def test_sincos_values(self, dtype):
         with np.errstate(all='ignore'):
             x = [np.nan, np.nan, np.nan, np.nan]
             y = [np.nan, -np.nan, np.inf, -np.inf]
-            for dt in ['e', 'f', 'd', 'g']:
-                xf = np.array(x, dtype=dt)
-                yf = np.array(y, dtype=dt)
-                assert_equal(np.sin(yf), xf)
-                assert_equal(np.cos(yf), xf)
-        
+            xf = np.array(x, dtype=dtype)
+            yf = np.array(y, dtype=dtype)
+            assert_equal(np.sin(yf), xf)
+            assert_equal(np.cos(yf), xf)
 
+    @pytest.mark.skipif(IS_WASM, reason="fp errors don't work in wasm")
+    @pytest.mark.xfail(
+        sys.platform.startswith("darwin"),
+        reason="underflow is triggered for scalar 'sin'"
+    )
+    def test_sincos_underflow(self):
+        with np.errstate(under='raise'):
+            underflow_trigger = np.array(
+                float.fromhex("0x1.f37f47a03f82ap-511"),
+                dtype=np.float64
+            )
+            np.sin(underflow_trigger)
+            np.cos(underflow_trigger)
+
+    @pytest.mark.skipif(IS_WASM, reason="fp errors don't work in wasm")
+    @pytest.mark.parametrize('callable', [np.sin, np.cos])
+    @pytest.mark.parametrize('dtype', ['e', 'f', 'd'])
+    @pytest.mark.parametrize('value', [np.inf, -np.inf])
+    def test_sincos_errors(self, callable, dtype, value):
         with np.errstate(invalid='raise'):
-            for callable in [np.sin, np.cos]:
-                for value in [np.inf, -np.inf]:
-                    for dt in ['e', 'f', 'd']:
-                        assert_raises(FloatingPointError, callable,
-                                np.array([value], dtype=dt))
+            assert_raises(FloatingPointError, callable,
+                np.array([value], dtype=dtype))
+
+    @pytest.mark.parametrize('callable', [np.sin, np.cos])
+    @pytest.mark.parametrize('dtype', ['f', 'd'])
+    @pytest.mark.parametrize('stride', [-1, 1, 2, 4, 5])
+    def test_sincos_overlaps(self, callable, dtype, stride):
+        N = 100
+        M = N // abs(stride)
+        rng = np.random.default_rng(42)
+        x = rng.standard_normal(N, dtype)
+        y = callable(x[::stride])
+        callable(x[::stride], out=x[:M])
+        assert_equal(x[:M], y)
 
     @pytest.mark.parametrize('dt', ['e', 'f', 'd', 'g'])
     def test_sqrt_values(self, dt):
@@ -1379,6 +1551,7 @@ class TestSpecialFloats:
             yf = np.array(y, dtype=dt)
             assert_equal(np.abs(yf), xf)
 
+    @pytest.mark.skipif(IS_WASM, reason="fp errors don't work in wasm")
     def test_square_values(self):
         x = [np.nan,  np.nan, np.inf, np.inf]
         y = [np.nan, -np.nan, np.inf, -np.inf]
@@ -1396,6 +1569,7 @@ class TestSpecialFloats:
             assert_raises(FloatingPointError, np.square,
                           np.array(1E200, dtype='d'))
 
+    @pytest.mark.skipif(IS_WASM, reason="fp errors don't work in wasm")
     def test_reciprocal_values(self):
         with np.errstate(all='ignore'):
             x = [np.nan,  np.nan, 0.0, -0.0, np.inf, -np.inf]
@@ -1410,6 +1584,7 @@ class TestSpecialFloats:
                 assert_raises(FloatingPointError, np.reciprocal,
                               np.array(-0.0, dtype=dt))
 
+    @pytest.mark.skipif(IS_WASM, reason="fp errors don't work in wasm")
     def test_tan(self):
         with np.errstate(all='ignore'):
             in_ = [np.nan, -np.nan, 0.0, -0.0, np.inf, -np.inf]
@@ -1426,6 +1601,7 @@ class TestSpecialFloats:
                 assert_raises(FloatingPointError, np.tan,
                               np.array(-np.inf, dtype=dt))
 
+    @pytest.mark.skipif(IS_WASM, reason="fp errors don't work in wasm")
     def test_arcsincos(self):
         with np.errstate(all='ignore'):
             in_ = [np.nan, -np.nan, np.inf, -np.inf]
@@ -1452,6 +1628,7 @@ class TestSpecialFloats:
                 out_arr = np.array(out, dtype=dt)
                 assert_equal(np.arctan(in_arr), out_arr)
 
+    @pytest.mark.skipif(IS_WASM, reason="fp errors don't work in wasm")
     def test_sinh(self):
         in_ = [np.nan, -np.nan, np.inf, -np.inf]
         out = [np.nan, np.nan, np.inf, -np.inf]
@@ -1468,6 +1645,7 @@ class TestSpecialFloats:
             assert_raises(FloatingPointError, np.sinh,
                           np.array(1200.0, dtype='d'))
 
+    @pytest.mark.skipif(IS_WASM, reason="fp errors don't work in wasm")
     def test_cosh(self):
         in_ = [np.nan, -np.nan, np.inf, -np.inf]
         out = [np.nan, np.nan, np.inf, np.inf]
@@ -1500,6 +1678,7 @@ class TestSpecialFloats:
             out_arr = np.array(out, dtype=dt)
             assert_equal(np.arcsinh(in_arr), out_arr)
 
+    @pytest.mark.skipif(IS_WASM, reason="fp errors don't work in wasm")
     def test_arccosh(self):
         with np.errstate(all='ignore'):
             in_ = [np.nan, -np.nan, np.inf, -np.inf, 1.0, 0.0]
@@ -1515,6 +1694,7 @@ class TestSpecialFloats:
                     assert_raises(FloatingPointError, np.arccosh,
                                   np.array(value, dtype=dt))
 
+    @pytest.mark.skipif(IS_WASM, reason="fp errors don't work in wasm")
     def test_arctanh(self):
         with np.errstate(all='ignore'):
             in_ = [np.nan, -np.nan, np.inf, -np.inf, 1.0, -1.0, 2.0]
@@ -1550,6 +1730,7 @@ class TestSpecialFloats:
                     assert_raises(FloatingPointError, np.exp2,
                                   np.array(value, dtype=dt))
 
+    @pytest.mark.skipif(IS_WASM, reason="fp errors don't work in wasm")
     def test_expm1(self):
         with np.errstate(all='ignore'):
             in_ = [np.nan, -np.nan, np.inf, -np.inf]
@@ -1566,18 +1747,78 @@ class TestSpecialFloats:
                                   np.array(value, dtype=dt))
 
     # test to ensure no spurious FP exceptions are raised due to SIMD
-    def test_spurious_fpexception(self):
-        for dt in ['e', 'f', 'd']:
-            arr = np.array([1.0, 2.0], dtype=dt)
-            with assert_no_warnings():
-                np.log(arr)
-                np.log2(arr)
-                np.log10(arr)
-                np.arccosh(arr)
-
+    INF_INVALID_ERR = [
+        np.cos, np.sin, np.tan, np.arccos, np.arcsin, np.spacing, np.arctanh
+    ]
+    NEG_INVALID_ERR = [
+        np.log, np.log2, np.log10, np.log1p, np.sqrt, np.arccosh,
+        np.arctanh
+    ]
+    ONE_INVALID_ERR = [
+        np.arctanh,
+    ]
+    LTONE_INVALID_ERR = [
+        np.arccosh,
+    ]
+    BYZERO_ERR = [
+        np.log, np.log2, np.log10, np.reciprocal, np.arccosh
+    ]
+
+    @pytest.mark.parametrize("ufunc", UFUNCS_UNARY_FP)
+    @pytest.mark.parametrize("dtype", ('e', 'f', 'd'))
+    @pytest.mark.parametrize("data, escape", (
+        ([0.03], LTONE_INVALID_ERR),
+        ([0.03]*32, LTONE_INVALID_ERR),
+        # neg
+        ([-1.0], NEG_INVALID_ERR),
+        ([-1.0]*32, NEG_INVALID_ERR),
+        # flat
+        ([1.0], ONE_INVALID_ERR),
+        ([1.0]*32, ONE_INVALID_ERR),
+        # zero
+        ([0.0], BYZERO_ERR),
+        ([0.0]*32, BYZERO_ERR),
+        ([-0.0], BYZERO_ERR),
+        ([-0.0]*32, BYZERO_ERR),
+        # nan
+        ([0.5, 0.5, 0.5, np.nan], LTONE_INVALID_ERR),
+        ([0.5, 0.5, 0.5, np.nan]*32, LTONE_INVALID_ERR),
+        ([np.nan, 1.0, 1.0, 1.0], ONE_INVALID_ERR),
+        ([np.nan, 1.0, 1.0, 1.0]*32, ONE_INVALID_ERR),
+        ([np.nan], []),
+        ([np.nan]*32, []),
+        # inf
+        ([0.5, 0.5, 0.5, np.inf], INF_INVALID_ERR + LTONE_INVALID_ERR),
+        ([0.5, 0.5, 0.5, np.inf]*32, INF_INVALID_ERR + LTONE_INVALID_ERR),
+        ([np.inf, 1.0, 1.0, 1.0], INF_INVALID_ERR),
+        ([np.inf, 1.0, 1.0, 1.0]*32, INF_INVALID_ERR),
+        ([np.inf], INF_INVALID_ERR),
+        ([np.inf]*32, INF_INVALID_ERR),
+        # ninf
+        ([0.5, 0.5, 0.5, -np.inf],
+         NEG_INVALID_ERR + INF_INVALID_ERR + LTONE_INVALID_ERR),
+        ([0.5, 0.5, 0.5, -np.inf]*32,
+         NEG_INVALID_ERR + INF_INVALID_ERR + LTONE_INVALID_ERR),
+        ([-np.inf, 1.0, 1.0, 1.0], NEG_INVALID_ERR + INF_INVALID_ERR),
+        ([-np.inf, 1.0, 1.0, 1.0]*32, NEG_INVALID_ERR + INF_INVALID_ERR),
+        ([-np.inf], NEG_INVALID_ERR + INF_INVALID_ERR),
+        ([-np.inf]*32, NEG_INVALID_ERR + INF_INVALID_ERR),
+    ))
+    def test_unary_spurious_fpexception(self, ufunc, dtype, data, escape):
+        if escape and ufunc in escape:
+            return
+        # FIXME: NAN raises FP invalid exception:
+        #  - ceil/float16 on MSVC:32-bit
+        #  - spacing/float16 on almost all platforms
+        if ufunc in (np.spacing, np.ceil) and dtype == 'e':
+            return
+        array = np.array(data, dtype=dtype)
+        with assert_no_warnings():
+            ufunc(array)
 
 class TestFPClass:
-    @pytest.mark.parametrize("stride", [-4,-2,-1,1,2,4])
+    @pytest.mark.parametrize("stride", [-5, -4, -3, -2, -1, 1,
+                                2, 4, 5, 6, 7, 8, 9, 10])
     def test_fpclass(self, stride):
         arr_f64 = np.array([np.nan, -np.nan, np.inf, -np.inf, -1.0, 1.0, -0.0, 0.0, 2.2251e-308, -2.2251e-308], dtype='d')
         arr_f32 = np.array([np.nan, -np.nan, np.inf, -np.inf, -1.0, 1.0, -0.0, 0.0, 1.4013e-045, -1.4013e-045], dtype='f')
@@ -1594,6 +1835,52 @@ class TestFPClass:
         assert_equal(np.isfinite(arr_f32[::stride]), finite[::stride])
         assert_equal(np.isfinite(arr_f64[::stride]), finite[::stride])
 
+    @pytest.mark.parametrize("dtype", ['d', 'f'])
+    def test_fp_noncontiguous(self, dtype):
+        data = np.array([np.nan, -np.nan, np.inf, -np.inf, -1.0,
+                            1.0, -0.0, 0.0, 2.2251e-308,
+                            -2.2251e-308], dtype=dtype)
+        nan = np.array([True, True, False, False, False, False,
+                            False, False, False, False])
+        inf = np.array([False, False, True, True, False, False,
+                            False, False, False, False])
+        sign = np.array([False, True, False, True, True, False,
+                            True, False, False, True])
+        finite = np.array([False, False, False, False, True, True,
+                            True, True, True, True])
+        out = np.ndarray(data.shape, dtype='bool')
+        ncontig_in = data[1::3]
+        ncontig_out = out[1::3]
+        contig_in = np.array(ncontig_in)
+        assert_equal(ncontig_in.flags.c_contiguous, False)
+        assert_equal(ncontig_out.flags.c_contiguous, False)
+        assert_equal(contig_in.flags.c_contiguous, True)
+        # ncontig in, ncontig out
+        assert_equal(np.isnan(ncontig_in, out=ncontig_out), nan[1::3])
+        assert_equal(np.isinf(ncontig_in, out=ncontig_out), inf[1::3])
+        assert_equal(np.signbit(ncontig_in, out=ncontig_out), sign[1::3])
+        assert_equal(np.isfinite(ncontig_in, out=ncontig_out), finite[1::3])
+        # contig in, ncontig out
+        assert_equal(np.isnan(contig_in, out=ncontig_out), nan[1::3])
+        assert_equal(np.isinf(contig_in, out=ncontig_out), inf[1::3])
+        assert_equal(np.signbit(contig_in, out=ncontig_out), sign[1::3])
+        assert_equal(np.isfinite(contig_in, out=ncontig_out), finite[1::3])
+        # ncontig in, contig out
+        assert_equal(np.isnan(ncontig_in), nan[1::3])
+        assert_equal(np.isinf(ncontig_in), inf[1::3])
+        assert_equal(np.signbit(ncontig_in), sign[1::3])
+        assert_equal(np.isfinite(ncontig_in), finite[1::3])
+        # contig in, contig out, nd stride
+        data_split = np.array(np.array_split(data, 2))
+        nan_split = np.array(np.array_split(nan, 2))
+        inf_split = np.array(np.array_split(inf, 2))
+        sign_split = np.array(np.array_split(sign, 2))
+        finite_split = np.array(np.array_split(finite, 2))
+        assert_equal(np.isnan(data_split), nan_split)
+        assert_equal(np.isinf(data_split), inf_split)
+        assert_equal(np.signbit(data_split), sign_split)
+        assert_equal(np.isfinite(data_split), finite_split)
+
 class TestLDExp:
     @pytest.mark.parametrize("stride", [-4,-2,-1,1,2,4])
     @pytest.mark.parametrize("dtype", ['f', 'd'])
@@ -1607,6 +1894,7 @@ class TestLDExp:
 class TestFRExp:
     @pytest.mark.parametrize("stride", [-4,-2,-1,1,2,4])
     @pytest.mark.parametrize("dtype", ['f', 'd'])
+    @pytest.mark.xfail(IS_MUSL, reason="gh23048")
     @pytest.mark.skipif(not sys.platform.startswith('linux'),
                         reason="np.frexp gives different answers for NAN/INF on windows and linux")
     def test_frexp(self, dtype, stride):
@@ -2570,6 +2858,35 @@ class TestAbsoluteNegative:
         np.abs(d, out=d)
         np.abs(np.ones_like(d), out=d)
 
+    @pytest.mark.parametrize("dtype", ['d', 'f', 'int32', 'int64'])
+    @pytest.mark.parametrize("big", [True, False])
+    def test_noncontiguous(self, dtype, big):
+        data = np.array([-1.0, 1.0, -0.0, 0.0, 2.2251e-308, -2.5, 2.5, -6,
+                            6, -2.2251e-308, -8, 10], dtype=dtype)
+        expect = np.array([1.0, -1.0, 0.0, -0.0, -2.2251e-308, 2.5, -2.5, 6,
+                            -6, 2.2251e-308, 8, -10], dtype=dtype)
+        if big:
+            data = np.repeat(data, 10)
+            expect = np.repeat(expect, 10)
+        out = np.ndarray(data.shape, dtype=dtype)
+        ncontig_in = data[1::2]
+        ncontig_out = out[1::2]
+        contig_in = np.array(ncontig_in)
+        # contig in, contig out
+        assert_array_equal(np.negative(contig_in), expect[1::2])
+        # contig in, ncontig out
+        assert_array_equal(np.negative(contig_in, out=ncontig_out),
+                                expect[1::2])
+        # ncontig in, contig out
+        assert_array_equal(np.negative(ncontig_in), expect[1::2])
+        # ncontig in, ncontig out
+        assert_array_equal(np.negative(ncontig_in, out=ncontig_out),
+                                expect[1::2])
+        # contig in, contig out, nd stride
+        data_split = np.array(np.array_split(data, 2))
+        expect_split = np.array(np.array_split(expect, 2))
+        assert_equal(np.negative(data_split), expect_split)
+
 
 class TestPositive:
     def test_valid(self):
@@ -3265,6 +3582,79 @@ class TestSpecialMethods:
         assert_raises(ValueError, np.modf, a, out=('one', 'two', 'three'))
         assert_raises(ValueError, np.modf, a, out=('one',))
 
+    def test_ufunc_override_where(self):
+
+        class OverriddenArrayOld(np.ndarray):
+
+            def _unwrap(self, objs):
+                cls = type(self)
+                result = []
+                for obj in objs:
+                    if isinstance(obj, cls):
+                        obj = np.array(obj)
+                    elif type(obj) != np.ndarray:
+                        return NotImplemented
+                    result.append(obj)
+                return result
+
+            def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
+
+                inputs = self._unwrap(inputs)
+                if inputs is NotImplemented:
+                    return NotImplemented
+
+                kwargs = kwargs.copy()
+                if "out" in kwargs:
+                    kwargs["out"] = self._unwrap(kwargs["out"])
+                    if kwargs["out"] is NotImplemented:
+                        return NotImplemented
+
+                r = super().__array_ufunc__(ufunc, method, *inputs, **kwargs)
+                if r is not NotImplemented:
+                    r = r.view(type(self))
+
+                return r
+
+        class OverriddenArrayNew(OverriddenArrayOld):
+            def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
+
+                kwargs = kwargs.copy()
+                if "where" in kwargs:
+                    kwargs["where"] = self._unwrap((kwargs["where"], ))
+                    if kwargs["where"] is NotImplemented:
+                        return NotImplemented
+                    else:
+                        kwargs["where"] = kwargs["where"][0]
+
+                r = super().__array_ufunc__(ufunc, method, *inputs, **kwargs)
+                if r is not NotImplemented:
+                    r = r.view(type(self))
+
+                return r
+
+        ufunc = np.negative
+
+        array = np.array([1, 2, 3])
+        where = np.array([True, False, True])
+        expected = ufunc(array, where=where)
+
+        with pytest.raises(TypeError):
+            ufunc(array, where=where.view(OverriddenArrayOld))
+
+        result_1 = ufunc(
+            array,
+            where=where.view(OverriddenArrayNew)
+        )
+        assert isinstance(result_1, OverriddenArrayNew)
+        assert np.all(np.array(result_1) == expected, where=where)
+
+        result_2 = ufunc(
+            array.view(OverriddenArrayNew),
+            where=where.view(OverriddenArrayNew)
+        )
+        assert isinstance(result_2, OverriddenArrayNew)
+        assert np.all(np.array(result_2) == expected, where=where)
+
     def test_ufunc_override_exception(self):
 
         class A:
@@ -3686,6 +4076,8 @@ class TestComplexFunctions:
             assert_almost_equal(fz.real, fr, err_msg='real part %s' % f)
             assert_almost_equal(fz.imag, 0., err_msg='imag part %s' % f)
 
+    @pytest.mark.xfail(IS_MUSL, reason="gh23049")
+    @pytest.mark.xfail(IS_WASM, reason="doesn't work")
     def test_precisions_consistent(self):
         z = 1 + 1j
         for f in self.funcs:
@@ -3695,6 +4087,8 @@ class TestComplexFunctions:
             assert_almost_equal(fcf, fcd, decimal=6, err_msg='fch-fcd %s' % f)
             assert_almost_equal(fcl, fcd, decimal=15, err_msg='fch-fcl %s' % f)
 
+    @pytest.mark.xfail(IS_MUSL, reason="gh23049")
+    @pytest.mark.xfail(IS_WASM, reason="doesn't work")
     def test_branch_cuts(self):
         # check branch cuts and continuity on them
         _check_branch_cut(np.log,   -0.5, 1j, 1, -1, True)
@@ -3720,6 +4114,8 @@ class TestComplexFunctions:
         _check_branch_cut(np.arccosh, [0-2j, 2j, 2], [1,  1,  1j], 1, 1)
         _check_branch_cut(np.arctanh, [0-2j, 2j, 0], [1,  1,  1j], 1, 1)
 
+    @pytest.mark.xfail(IS_MUSL, reason="gh23049")
+    @pytest.mark.xfail(IS_WASM, reason="doesn't work")
     def test_branch_cuts_complex64(self):
         # check branch cuts and continuity on them
         _check_branch_cut(np.log,   -0.5, 1j, 1, -1, True, np.complex64)
@@ -3764,6 +4160,8 @@ class TestComplexFunctions:
                 b = cfunc(p)
                 assert_(abs(a - b) < atol, "%s %s: %s; cmath: %s" % (fname, p, a, b))
 
+    @pytest.mark.xfail(IS_MUSL, reason="gh23049")
+    @pytest.mark.xfail(IS_WASM, reason="doesn't work")
     @pytest.mark.parametrize('dtype', [np.complex64, np.complex_, np.longcomplex])
     def test_loss_of_precision(self, dtype):
         """Check loss of precision in complex arc* functions"""
@@ -3854,6 +4252,14 @@ class TestComplexFunctions:
             check(func, pts, 1j)
             check(func, pts, 1+1j)
 
+    @np.errstate(all="ignore")
+    def test_promotion_corner_cases(self):
+        for func in self.funcs:
+            assert func(np.float16(1)).dtype == np.float16
+            # Integer to low precision float promotion is a dubious choice:
+            assert func(np.uint8(1)).dtype == np.float16
+            assert func(np.int16(1)).dtype == np.float32
+
 
 class TestAttributes:
     def test_attributes(self):
@@ -4027,7 +4433,7 @@ def _test_spacing(t):
     nan = t(np.nan)
     inf = t(np.inf)
     with np.errstate(invalid='ignore'):
-        assert_(np.spacing(one) == eps)
+        assert_equal(np.spacing(one), eps)
         assert_(np.isnan(np.spacing(nan)))
         assert_(np.isnan(np.spacing(inf)))
         assert_(np.isnan(np.spacing(-inf)))
@@ -4163,6 +4569,7 @@ def test_rint_big_int():
     # Rint should not change the value
     assert_equal(val, np.rint(val))
 
+
 @pytest.mark.parametrize('ftype', [np.float32, np.float64])
 def test_memoverlap_accumulate(ftype):
     # Reproduces bug https://github.com/numpy/numpy/issues/15597
@@ -4172,6 +4579,38 @@ def test_memoverlap_accumulate(ftype):
     assert_equal(np.maximum.accumulate(arr), out_max)
     assert_equal(np.minimum.accumulate(arr), out_min)
 
+@pytest.mark.parametrize("ufunc, dtype", [
+    (ufunc, t[0])
+    for ufunc in UFUNCS_BINARY_ACC
+    for t in ufunc.types
+    if t[-1] == '?' and t[0] not in 'DFGMmO'
+])
+def test_memoverlap_accumulate_cmp(ufunc, dtype):
+    if ufunc.signature:
+        pytest.skip('For generic signatures only')
+    for size in (2, 8, 32, 64, 128, 256):
+        arr = np.array([0, 1, 1]*size, dtype=dtype)
+        acc = ufunc.accumulate(arr, dtype='?')
+        acc_u8 = acc.view(np.uint8)
+        exp = np.array(list(itertools.accumulate(arr, ufunc)), dtype=np.uint8)
+        assert_equal(exp, acc_u8)
+
+@pytest.mark.parametrize("ufunc, dtype", [
+    (ufunc, t[0])
+    for ufunc in UFUNCS_BINARY_ACC
+    for t in ufunc.types
+    if t[0] == t[1] and t[0] == t[-1] and t[0] not in 'DFGMmO?'
+])
+def test_memoverlap_accumulate_symmetric(ufunc, dtype):
+    if ufunc.signature:
+        pytest.skip('For generic signatures only')
+    with np.errstate(all='ignore'):
+        for size in (2, 8, 32, 64, 128, 256):
+            arr = np.array([0, 1, 2]*size).astype(dtype)
+            acc = ufunc.accumulate(arr, dtype=dtype)
+            exp = np.array(list(itertools.accumulate(arr, ufunc)), dtype=dtype)
+            assert_equal(exp, acc)
+
 def test_signaling_nan_exceptions():
     with assert_no_warnings():
         a = np.ndarray(shape=(), dtype='float32', buffer=b'\x00\xe0\xbf\xff')
diff --git a/numpy/core/tests/test_unicode.py b/numpy/core/tests/test_unicode.py
index 2d7c2818e..e5454bd48 100644
--- a/numpy/core/tests/test_unicode.py
+++ b/numpy/core/tests/test_unicode.py
@@ -36,10 +36,10 @@ def test_string_cast():
     uni_arr1 = str_arr.astype('>U')
     uni_arr2 = str_arr.astype('<U')
 
-    with pytest.warns(FutureWarning):
-        assert str_arr != uni_arr1
-    with pytest.warns(FutureWarning):
-        assert str_arr != uni_arr2
+    assert_array_equal(str_arr != uni_arr1, np.ones(2, dtype=bool))
+    assert_array_equal(uni_arr1 != str_arr, np.ones(2, dtype=bool))
+    assert_array_equal(str_arr == uni_arr1, np.zeros(2, dtype=bool))
+    assert_array_equal(uni_arr1 == str_arr, np.zeros(2, dtype=bool))
 
     assert_array_equal(uni_arr1, uni_arr2)
 
diff --git a/numpy/ctypeslib.pyi b/numpy/ctypeslib.pyi
index 0313cd82a..3edf98e14 100644
--- a/numpy/ctypeslib.pyi
+++ b/numpy/ctypeslib.pyi
@@ -91,10 +91,10 @@ class _ndptr(ctypes.c_void_p, Generic[_DTypeOptional]):
 
     @overload
     @classmethod
-    def from_param(cls: type[_ndptr[None]], obj: ndarray[Any, Any]) -> _ctypes: ...
+    def from_param(cls: type[_ndptr[None]], obj: ndarray[Any, Any]) -> _ctypes[Any]: ...
     @overload
     @classmethod
-    def from_param(cls: type[_ndptr[_DType]], obj: ndarray[Any, _DType]) -> _ctypes: ...
+    def from_param(cls: type[_ndptr[_DType]], obj: ndarray[Any, _DType]) -> _ctypes[Any]: ...
 
 class _concrete_ndptr(_ndptr[_DType]):
     _dtype_: ClassVar[_DType]
diff --git a/numpy/distutils/ccompiler.py b/numpy/distutils/ccompiler.py
index f0487cb64..40f495fc7 100644
--- a/numpy/distutils/ccompiler.py
+++ b/numpy/distutils/ccompiler.py
@@ -1,10 +1,12 @@
 import os
 import re
 import sys
+import platform
 import shlex
 import time
 import subprocess
 from copy import copy
+from pathlib import Path
 from distutils import ccompiler
 from distutils.ccompiler import (
     compiler_class, gen_lib_options, get_default_compiler, new_compiler,
@@ -57,7 +59,7 @@ def _needs_build(obj, cc_args, extra_postargs, pp_opts):
     # the last line contains the compiler commandline arguments as some
     # projects may compile an extension multiple times with different
     # arguments
-    with open(dep_file, "r") as f:
+    with open(dep_file) as f:
         lines = f.readlines()
 
     cmdline =_commandline_dep_string(cc_args, extra_postargs, pp_opts)
@@ -279,7 +281,8 @@ def CCompiler_compile(self, sources, output_dir=None, macros=None,
 
     if not sources:
         return []
-    from numpy.distutils.fcompiler import (FCompiler, is_f_file,
+    from numpy.distutils.fcompiler import (FCompiler,
+                                           FORTRAN_COMMON_FIXED_EXTENSIONS,
                                            has_f90_header)
     if isinstance(self, FCompiler):
         display = []
@@ -338,7 +341,8 @@ def CCompiler_compile(self, sources, output_dir=None, macros=None,
                 if self.compiler_type=='absoft':
                     obj = cyg2win32(obj)
                     src = cyg2win32(src)
-                if is_f_file(src) and not has_f90_header(src):
+                if Path(src).suffix.lower() in FORTRAN_COMMON_FIXED_EXTENSIONS \
+                   and not has_f90_header(src):
                     f77_objects.append((obj, (src, ext)))
                 else:
                     other_objects.append((obj, (src, ext)))
@@ -391,8 +395,14 @@ def CCompiler_customize_cmd(self, cmd, ignore=()):
     log.info('customize %s using %s' % (self.__class__.__name__,
                                         cmd.__class__.__name__))
 
-    if hasattr(self, 'compiler') and 'clang' in self.compiler[0]:
+    if (
+        hasattr(self, 'compiler') and
+        'clang' in self.compiler[0] and
+        not (platform.machine() == 'arm64' and sys.platform == 'darwin')
+    ):
         # clang defaults to a non-strict floating error point model.
+        # However, '-ftrapping-math' is not currently supported (2023-04-08)
+        # for macosx_arm64.
         # Since NumPy and most Python libs give warnings for these, override:
         self.compiler.append('-ftrapping-math')
         self.compiler_so.append('-ftrapping-math')
@@ -717,6 +727,8 @@ compiler_class['pathcc'] = ('pathccompiler', 'PathScaleCCompiler',
                             "PathScale Compiler for SiCortex-based applications")
 compiler_class['arm'] = ('armccompiler', 'ArmCCompiler',
                             "Arm C Compiler")
+compiler_class['fujitsu'] = ('fujitsuccompiler', 'FujitsuCCompiler',
+                            "Fujitsu C Compiler")
 
 ccompiler._default_compilers += (('linux.*', 'intel'),
                                  ('linux.*', 'intele'),
diff --git a/numpy/distutils/ccompiler_opt.py b/numpy/distutils/ccompiler_opt.py
index 5f18efc7d..6ba4cd816 100644
--- a/numpy/distutils/ccompiler_opt.py
+++ b/numpy/distutils/ccompiler_opt.py
@@ -16,15 +16,6 @@ import re
 import subprocess
 import textwrap
 
-# These flags are used to compile any C++ source within Numpy.
-# They are chosen to have very few runtime dependencies.
-NPY_CXX_FLAGS = [
-    '-std=c++11',  # Minimal standard version
-    '-D__STDC_VERSION__=0',  # for compatibility with C headers
-    '-fno-exceptions',  # no exception support
-    '-fno-rtti']  # no runtime type information
-
-
 class _Config:
     """An abstract class holds all configurable attributes of `CCompilerOpt`,
     these class attributes can be used to change the default behavior
@@ -229,6 +220,11 @@ class _Config:
             native = None,
             opt = '/O2',
             werror = '/WX',
+        ),
+        fcc = dict(
+            native = '-mcpu=a64fx',
+            opt = None,
+            werror = None,
         )
     )
     conf_min_features = dict(
@@ -295,6 +291,10 @@ class _Config:
             group="AVX512VBMI2 AVX512BITALG AVX512VPOPCNTDQ",
             detect="AVX512_ICL", implies_detect=False
         ),
+        AVX512_SPR = dict(
+            interest=46, implies="AVX512_ICL", group="AVX512FP16",
+            detect="AVX512_SPR", implies_detect=False
+        ),
         # IBM/Power
         ## Power7/ISA 2.06
         VSX = dict(interest=1, headers="altivec.h", extra_checks="VSX_ASM"),
@@ -338,7 +338,7 @@ class _Config:
             return {}
 
         on_x86 = self.cc_on_x86 or self.cc_on_x64
-        is_unix = self.cc_is_gcc or self.cc_is_clang
+        is_unix = self.cc_is_gcc or self.cc_is_clang or self.cc_is_fcc
 
         if on_x86 and is_unix: return dict(
             SSE    = dict(flags="-msse"),
@@ -365,7 +365,8 @@ class _Config:
             AVX512_CNL = dict(flags="-mavx512ifma -mavx512vbmi"),
             AVX512_ICL = dict(
                 flags="-mavx512vbmi2 -mavx512bitalg -mavx512vpopcntdq"
-            )
+            ),
+            AVX512_SPR = dict(flags="-mavx512fp16"),
         )
         if on_x86 and self.cc_is_icc: return dict(
             SSE    = dict(flags="-msse"),
@@ -397,6 +398,7 @@ class _Config:
             AVX512_CLX = dict(flags="-xCASCADELAKE"),
             AVX512_CNL = dict(flags="-xCANNONLAKE"),
             AVX512_ICL = dict(flags="-xICELAKE-CLIENT"),
+            AVX512_SPR = dict(disable="Not supported yet")
         )
         if on_x86 and self.cc_is_iccw: return dict(
             SSE    = dict(flags="/arch:SSE"),
@@ -429,7 +431,8 @@ class _Config:
             AVX512_SKX = dict(flags="/Qx:SKYLAKE-AVX512"),
             AVX512_CLX = dict(flags="/Qx:CASCADELAKE"),
             AVX512_CNL = dict(flags="/Qx:CANNONLAKE"),
-            AVX512_ICL = dict(flags="/Qx:ICELAKE-CLIENT")
+            AVX512_ICL = dict(flags="/Qx:ICELAKE-CLIENT"),
+            AVX512_SPR = dict(disable="Not supported yet")
         )
         if on_x86 and self.cc_is_msvc: return dict(
             SSE = dict(flags="/arch:SSE") if self.cc_on_x86 else {},
@@ -467,7 +470,10 @@ class _Config:
             AVX512_SKX = dict(flags="/arch:AVX512"),
             AVX512_CLX = {},
             AVX512_CNL = {},
-            AVX512_ICL = {}
+            AVX512_ICL = {},
+            AVX512_SPR= dict(
+                disable="MSVC compiler doesn't support it"
+            )
         )
 
         on_power = self.cc_on_ppc64le or self.cc_on_ppc64
@@ -959,8 +965,12 @@ class _CCompiler:
         detect_arch = (
             ("cc_on_x64",      ".*(x|x86_|amd)64.*", ""),
             ("cc_on_x86",      ".*(win32|x86|i386|i686).*", ""),
-            ("cc_on_ppc64le",  ".*(powerpc|ppc)64(el|le).*", ""),
-            ("cc_on_ppc64",    ".*(powerpc|ppc)64.*", ""),
+            ("cc_on_ppc64le",  ".*(powerpc|ppc)64(el|le).*|.*powerpc.*",
+                                          "defined(__powerpc64__) && "
+                                          "defined(__LITTLE_ENDIAN__)"),
+            ("cc_on_ppc64",    ".*(powerpc|ppc).*|.*powerpc.*",
+                                          "defined(__powerpc64__) && "
+                                          "defined(__BIG_ENDIAN__)"),
             ("cc_on_aarch64",  ".*(aarch64|arm64).*", ""),
             ("cc_on_armhf",    ".*arm.*", "defined(__ARM_ARCH_7__) || "
                                           "defined(__ARM_ARCH_7A__)"),
@@ -975,12 +985,14 @@ class _CCompiler:
             ("cc_is_iccw",     ".*(intelw|intelemw|iccw).*", ""),
             ("cc_is_icc",      ".*(intel|icc).*", ""),  # intel unix like
             ("cc_is_msvc",     ".*msvc.*", ""),
+            ("cc_is_fcc",     ".*fcc.*", ""),
             # undefined compiler will be treat it as gcc
             ("cc_is_nocc",     "", ""),
         )
         detect_args = (
            ("cc_has_debug",  ".*(O0|Od|ggdb|coverage|debug:full).*", ""),
-           ("cc_has_native", ".*(-march=native|-xHost|/QxHost).*", ""),
+           ("cc_has_native",
+                ".*(-march=native|-xHost|/QxHost|-mcpu=a64fx).*", ""),
            # in case if the class run with -DNPY_DISABLE_OPTIMIZATION
            ("cc_noopt", ".*DISABLE_OPT.*", ""),
         )
@@ -1041,7 +1053,7 @@ class _CCompiler:
                 break
 
         self.cc_name = "unknown"
-        for name in ("gcc", "clang", "iccw", "icc", "msvc"):
+        for name in ("gcc", "clang", "iccw", "icc", "msvc", "fcc"):
             if getattr(self, "cc_is_" + name):
                 self.cc_name = name
                 break
@@ -1163,7 +1175,7 @@ class _CCompiler:
                 continue
             lower_flags = flags[:-(i+1)]
             upper_flags = flags[-i:]
-            filterd = list(filter(
+            filtered = list(filter(
                 self._cc_normalize_unix_frgx.search, lower_flags
             ))
             # gather subflags
@@ -1175,7 +1187,7 @@ class _CCompiler:
                         subflags = xsubflags + subflags
                 cur_flag = arch + '+' + '+'.join(subflags)
 
-            flags = filterd + [cur_flag]
+            flags = filtered + [cur_flag]
             if i > 0:
                 flags += upper_flags
             break
diff --git a/numpy/distutils/checks/cpu_avx512_spr.c b/numpy/distutils/checks/cpu_avx512_spr.c
new file mode 100644
index 000000000..9710d0b2f
--- /dev/null
+++ b/numpy/distutils/checks/cpu_avx512_spr.c
@@ -0,0 +1,26 @@
+#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
+    /*
+     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
+     * whether or not the build options for those features are specified.
+     * Therefore, we must test #definitions of CPU features when option native/host
+     * is enabled via `--cpu-baseline` or through env var `CFLAGS` otherwise
+     * the test will be broken and leads to enable all possible features.
+     */
+    #if !defined(__AVX512FP16__)
+        #error "HOST/ARCH doesn't support Sapphire Rapids AVX512FP16 features"
+    #endif
+#endif
+
+#include <immintrin.h>
+
+int main(int argc, char **argv)
+{
+/* clang has a bug regarding our spr coode, see gh-23730. */
+#if __clang__
+#error
+#endif
+    __m512h a = _mm512_loadu_ph((void*)argv[argc-1]);
+    __m512h temp = _mm512_fmadd_ph(a, a, a);
+    _mm512_storeu_ph((void*)(argv[argc-1]), temp);
+    return 0;
+}
diff --git a/numpy/distutils/command/build_clib.py b/numpy/distutils/command/build_clib.py
index 45201f98f..11999dae2 100644
--- a/numpy/distutils/command/build_clib.py
+++ b/numpy/distutils/command/build_clib.py
@@ -307,6 +307,7 @@ class build_clib(old_build_clib):
             # problem, msvc uses its own convention :(
             c_sources += cxx_sources
             cxx_sources = []
+            extra_cflags += extra_cxxflags
 
         # filtering C dispatch-table sources when optimization is not disabled,
         # otherwise treated as normal sources.
diff --git a/numpy/distutils/command/build_ext.py b/numpy/distutils/command/build_ext.py
index 8b568c159..68b13c0dd 100644
--- a/numpy/distutils/command/build_ext.py
+++ b/numpy/distutils/command/build_ext.py
@@ -259,7 +259,7 @@ class build_ext (old_build_ext):
                     log.warn('resetting extension %r language from %r to %r.' %
                              (ext.name, l, ext_language))
 
-                ext.language = ext_language
+            ext.language = ext_language
 
             # global language
             all_languages.update(ext_languages)
@@ -407,6 +407,7 @@ class build_ext (old_build_ext):
             if cxx_sources:
                 # Needed to compile kiva.agg._agg extension.
                 extra_args.append('/Zm1000')
+                extra_cflags += extra_cxxflags
             # this hack works around the msvc compiler attributes
             # problem, msvc uses its own convention :(
             c_sources += cxx_sources
@@ -586,6 +587,13 @@ class build_ext (old_build_ext):
             # not using fcompiler linker
             self._libs_with_msvc_and_fortran(
                 fcompiler, libraries, library_dirs)
+            if ext.runtime_library_dirs:
+                # gcc adds RPATH to the link. On windows, copy the dll into
+                # self.extra_dll_dir instead.
+                for d in ext.runtime_library_dirs:
+                    for f in glob(d + '/*.dll'):
+                        copy_file(f, self.extra_dll_dir)
+                ext.runtime_library_dirs = []
 
         elif ext.language in ['f77', 'f90'] and fcompiler is not None:
             linker = fcompiler.link_shared_object
@@ -634,12 +642,12 @@ class build_ext (old_build_ext):
                 if os.path.isfile(fake_lib):
                     # Replace fake static library
                     libraries.remove(lib)
-                    with open(fake_lib, 'r') as f:
+                    with open(fake_lib) as f:
                         unlinkable_fobjects.extend(f.read().splitlines())
 
                     # Expand C objects
                     c_lib = os.path.join(libdir, lib + '.cobjects')
-                    with open(c_lib, 'r') as f:
+                    with open(c_lib) as f:
                         objects.extend(f.read().splitlines())
 
         # Wrap unlinkable objects to a linkable one
diff --git a/numpy/distutils/command/build_src.py b/numpy/distutils/command/build_src.py
index 5581011f6..bf3d03c70 100644
--- a/numpy/distutils/command/build_src.py
+++ b/numpy/distutils/command/build_src.py
@@ -725,7 +725,7 @@ _has_c_header = re.compile(r'-\*-\s*c\s*-\*-', re.I).search
 _has_cpp_header = re.compile(r'-\*-\s*c\+\+\s*-\*-', re.I).search
 
 def get_swig_target(source):
-    with open(source, 'r') as f:
+    with open(source) as f:
         result = None
         line = f.readline()
         if _has_cpp_header(line):
@@ -735,7 +735,7 @@ def get_swig_target(source):
     return result
 
 def get_swig_modulename(source):
-    with open(source, 'r') as f:
+    with open(source) as f:
         name = None
         for line in f:
             m = _swig_module_name_match(line)
diff --git a/numpy/distutils/command/install.py b/numpy/distutils/command/install.py
index 2eff2d145..efa9b4740 100644
--- a/numpy/distutils/command/install.py
+++ b/numpy/distutils/command/install.py
@@ -62,7 +62,7 @@ class install(old_install):
             # bdist_rpm fails when INSTALLED_FILES contains
             # paths with spaces. Such paths must be enclosed
             # with double-quotes.
-            with open(self.record, 'r') as f:
+            with open(self.record) as f:
                 lines = []
                 need_rewrite = False
                 for l in f:
diff --git a/numpy/distutils/fcompiler/__init__.py b/numpy/distutils/fcompiler/__init__.py
index ecba3e5d5..5160e2abf 100644
--- a/numpy/distutils/fcompiler/__init__.py
+++ b/numpy/distutils/fcompiler/__init__.py
@@ -19,6 +19,7 @@ __all__ = ['FCompiler', 'new_fcompiler', 'show_fcompilers',
 import os
 import sys
 import re
+from pathlib import Path
 
 from distutils.sysconfig import get_python_lib
 from distutils.fancy_getopt import FancyGetopt
@@ -37,6 +38,10 @@ from .environment import EnvironmentConfig
 
 __metaclass__ = type
 
+
+FORTRAN_COMMON_FIXED_EXTENSIONS = ['.for', '.ftn', '.f77', '.f']
+
+
 class CompilerNotFound(Exception):
     pass
 
@@ -571,7 +576,8 @@ class FCompiler(CCompiler):
     def _compile(self, obj, src, ext, cc_args, extra_postargs, pp_opts):
         """Compile 'src' to product 'obj'."""
         src_flags = {}
-        if is_f_file(src) and not has_f90_header(src):
+        if Path(src).suffix.lower() in FORTRAN_COMMON_FIXED_EXTENSIONS \
+           and not has_f90_header(src):
             flavor = ':f77'
             compiler = self.compiler_f77
             src_flags = get_f77flags(src)
@@ -970,7 +976,6 @@ def dummy_fortran_file():
     return name[:-2]
 
 
-is_f_file = re.compile(r'.*\.(for|ftn|f77|f)\Z', re.I).match
 _has_f_header = re.compile(r'-\*-\s*fortran\s*-\*-', re.I).search
 _has_f90_header = re.compile(r'-\*-\s*f90\s*-\*-', re.I).search
 _has_fix_header = re.compile(r'-\*-\s*fix\s*-\*-', re.I).search
diff --git a/numpy/distutils/fcompiler/absoft.py b/numpy/distutils/fcompiler/absoft.py
index efe3a4cb5..68f516b92 100644
--- a/numpy/distutils/fcompiler/absoft.py
+++ b/numpy/distutils/fcompiler/absoft.py
@@ -1,6 +1,6 @@
 
-# http://www.absoft.com/literature/osxuserguide.pdf
-# http://www.absoft.com/documentation.html
+# Absoft Corporation ceased operations on 12/31/2022.
+# Thus, all links to <http://www.absoft.com> are invalid.
 
 # Notes:
 # - when using -g77 then use -DUNDERSCORE_G77 to compile f2py
diff --git a/numpy/distutils/fcompiler/ibm.py b/numpy/distutils/fcompiler/ibm.py
index eff24401a..29927518c 100644
--- a/numpy/distutils/fcompiler/ibm.py
+++ b/numpy/distutils/fcompiler/ibm.py
@@ -76,7 +76,7 @@ class IBMFCompiler(FCompiler):
                 xlf_cfg = '/etc/opt/ibmcmp/xlf/%s/xlf.cfg' % version
             fo, new_cfg = make_temp_file(suffix='_xlf.cfg')
             log.info('Creating '+new_cfg)
-            with open(xlf_cfg, 'r') as fi:
+            with open(xlf_cfg) as fi:
                 crt1_match = re.compile(r'\s*crt\s*=\s*(?P<path>.*)/crt1.o').match
                 for line in fi:
                     m = crt1_match(line)
diff --git a/numpy/distutils/fujitsuccompiler.py b/numpy/distutils/fujitsuccompiler.py
new file mode 100644
index 000000000..c25900b34
--- /dev/null
+++ b/numpy/distutils/fujitsuccompiler.py
@@ -0,0 +1,28 @@
+from distutils.unixccompiler import UnixCCompiler
+
+class FujitsuCCompiler(UnixCCompiler):
+
+    """
+    Fujitsu compiler.
+    """
+
+    compiler_type = 'fujitsu'
+    cc_exe = 'fcc'
+    cxx_exe = 'FCC'
+
+    def __init__(self, verbose=0, dry_run=0, force=0):
+        UnixCCompiler.__init__(self, verbose, dry_run, force)
+        cc_compiler = self.cc_exe
+        cxx_compiler = self.cxx_exe
+        self.set_executables(
+            compiler=cc_compiler +
+            ' -O3 -Nclang -fPIC',
+            compiler_so=cc_compiler +
+            ' -O3 -Nclang -fPIC',
+            compiler_cxx=cxx_compiler +
+            ' -O3 -Nclang -fPIC',
+            linker_exe=cc_compiler +
+            ' -lfj90i -lfj90f -lfjsrcinfo -lelf -shared',
+            linker_so=cc_compiler +
+            ' -lfj90i -lfj90f -lfjsrcinfo -lelf -shared'
+            )
diff --git a/numpy/distutils/mingw32ccompiler.py b/numpy/distutils/mingw32ccompiler.py
index 5d1c27a65..4763f41ad 100644
--- a/numpy/distutils/mingw32ccompiler.py
+++ b/numpy/distutils/mingw32ccompiler.py
@@ -8,7 +8,6 @@ Support code for building Python extensions on Windows.
 
 """
 import os
-import platform
 import sys
 import subprocess
 import re
@@ -203,11 +202,11 @@ def find_python_dll():
 
     # search in the file system for possible candidates
     major_version, minor_version = tuple(sys.version_info[:2])
-    implementation = platform.python_implementation()
-    if implementation == 'CPython':
+    implementation = sys.implementation.name
+    if implementation == 'cpython':
         dllname = f'python{major_version}{minor_version}.dll'
-    elif implementation == 'PyPy':
-        dllname = f'libpypy{major_version}-c.dll'
+    elif implementation == 'pypy':
+        dllname = f'libpypy{major_version}.{minor_version}-c.dll'
     else:
         dllname = f'Unknown platform {implementation}' 
     print("Looking for %s" % dllname)
diff --git a/numpy/distutils/misc_util.py b/numpy/distutils/misc_util.py
index 79ba08515..e226b4744 100644
--- a/numpy/distutils/misc_util.py
+++ b/numpy/distutils/misc_util.py
@@ -475,7 +475,7 @@ def _get_f90_modules(source):
     if not f90_ext_match(source):
         return []
     modules = []
-    with open(source, 'r') as f:
+    with open(source) as f:
         for line in f:
             m = f90_module_name_match(line)
             if m:
@@ -1932,7 +1932,7 @@ class Configuration:
                 revision0 = f.read().strip()
 
             branch_map = {}
-            with open(branch_cache_fn, 'r') as f:
+            with open(branch_cache_fn) as f:
                 for line in f:
                     branch1, revision1  = line.split()[:2]
                     if revision1==revision0:
diff --git a/numpy/distutils/system_info.py b/numpy/distutils/system_info.py
index d5a1687da..3dca7fb5a 100644
--- a/numpy/distutils/system_info.py
+++ b/numpy/distutils/system_info.py
@@ -62,6 +62,7 @@ Currently, the following classes are available, along with their section names:
     blas_ilp64_plain_opt_info:ALL      # usage recommended (general ILP64 BLAS, no symbol suffix)
     blas_info:blas
     blas_mkl_info:mkl
+    blas_ssl2_info:ssl2
     blas_opt_info:ALL                  # usage recommended
     blas_src_info:blas_src
     blis_info:blis
@@ -93,9 +94,11 @@ Currently, the following classes are available, along with their section names:
     lapack_ilp64_plain_opt_info:ALL    # usage recommended (general ILP64 LAPACK, no symbol suffix)
     lapack_info:lapack
     lapack_mkl_info:mkl
+    lapack_ssl2_info:ssl2
     lapack_opt_info:ALL                # usage recommended
     lapack_src_info:lapack_src
     mkl_info:mkl
+    ssl2_info:ssl2
     numarray_info:numarray
     numerix_info:numerix
     numpy_info:numpy
@@ -519,6 +522,7 @@ def get_info(name, notfound_action=0):
           'lapack_atlas_3_10_threads': lapack_atlas_3_10_threads_info,  # ditto
           'flame': flame_info,          # use lapack_opt instead
           'mkl': mkl_info,
+          'ssl2': ssl2_info,
           # openblas which may or may not have embedded lapack
           'openblas': openblas_info,          # use blas_opt instead
           # openblas with embedded lapack
@@ -527,6 +531,8 @@ def get_info(name, notfound_action=0):
           'blis': blis_info,                  # use blas_opt instead
           'lapack_mkl': lapack_mkl_info,      # use lapack_opt instead
           'blas_mkl': blas_mkl_info,          # use blas_opt instead
+          'lapack_ssl2': lapack_ssl2_info,      
+          'blas_ssl2': blas_ssl2_info,          
           'accelerate': accelerate_info,      # use blas_opt instead
           'openblas64_': openblas64__info,
           'openblas64__lapack': openblas64__lapack_info,
@@ -1260,7 +1266,7 @@ class mkl_info(system_info):
         paths = os.environ.get('LD_LIBRARY_PATH', '').split(os.pathsep)
         ld_so_conf = '/etc/ld.so.conf'
         if os.path.isfile(ld_so_conf):
-            with open(ld_so_conf, 'r') as f:
+            with open(ld_so_conf) as f:
                 for d in f:
                     d = d.strip()
                     if d:
@@ -1325,6 +1331,63 @@ class blas_mkl_info(mkl_info):
     pass
 
 
+class ssl2_info(system_info):
+    section = 'ssl2'
+    dir_env_var = 'SSL2_DIR'
+    # Multi-threaded version. Python itself must be built by Fujitsu compiler.
+    _lib_ssl2 = ['fjlapackexsve']
+    # Single-threaded version
+    #_lib_ssl2 = ['fjlapacksve']
+
+    def get_tcsds_rootdir(self):
+        tcsdsroot = os.environ.get('TCSDS_PATH', None)
+        if tcsdsroot is not None:
+            return tcsdsroot
+        return None
+
+    def __init__(self):
+        tcsdsroot = self.get_tcsds_rootdir()
+        if tcsdsroot is None:
+            system_info.__init__(self)
+        else:
+            system_info.__init__(
+                self,
+                default_lib_dirs=[os.path.join(tcsdsroot, 'lib64')],
+                default_include_dirs=[os.path.join(tcsdsroot,
+                    'clang-comp/include')])
+
+    def calc_info(self):
+        tcsdsroot = self.get_tcsds_rootdir()
+
+        lib_dirs = self.get_lib_dirs()
+        if lib_dirs is None:
+            lib_dirs = os.path.join(tcsdsroot, 'lib64')
+
+        incl_dirs = self.get_include_dirs()
+        if incl_dirs is None:
+            incl_dirs = os.path.join(tcsdsroot, 'clang-comp/include')
+
+        ssl2_libs = self.get_libs('ssl2_libs', self._lib_ssl2)
+
+        info = self.check_libs2(lib_dirs, ssl2_libs)
+        if info is None:
+            return
+        dict_append(info,
+                    define_macros=[('HAVE_CBLAS', None),
+                                   ('HAVE_SSL2', 1)],
+                    include_dirs=incl_dirs,)
+        self.set_info(**info)
+
+
+class lapack_ssl2_info(ssl2_info):
+    pass
+
+
+class blas_ssl2_info(ssl2_info):
+    pass
+
+
+
 class armpl_info(system_info):
     section = 'armpl'
     dir_env_var = 'ARMPL_DIR'
@@ -1787,7 +1850,7 @@ class lapack_opt_info(system_info):
     notfounderror = LapackNotFoundError
 
     # List of all known LAPACK libraries, in the default order
-    lapack_order = ['armpl', 'mkl', 'openblas', 'flame',
+    lapack_order = ['armpl', 'mkl', 'ssl2', 'openblas', 'flame',
                     'accelerate', 'atlas', 'lapack']
     order_env_var_name = 'NPY_LAPACK_ORDER'
     
@@ -1805,6 +1868,13 @@ class lapack_opt_info(system_info):
             return True
         return False
 
+    def _calc_info_ssl2(self):
+        info = get_info('lapack_ssl2')
+        if info:
+            self.set_info(**info)
+            return True
+        return False
+
     def _calc_info_openblas(self):
         info = get_info('openblas_lapack')
         if info:
@@ -1971,7 +2041,7 @@ class blas_opt_info(system_info):
     notfounderror = BlasNotFoundError
     # List of all known BLAS libraries, in the default order
 
-    blas_order = ['armpl', 'mkl', 'blis', 'openblas',
+    blas_order = ['armpl', 'mkl', 'ssl2', 'blis', 'openblas',
                   'accelerate', 'atlas', 'blas']
     order_env_var_name = 'NPY_BLAS_ORDER'
     
@@ -1989,6 +2059,13 @@ class blas_opt_info(system_info):
             return True
         return False
 
+    def _calc_info_ssl2(self):
+        info = get_info('blas_ssl2')
+        if info:
+            self.set_info(**info)
+            return True
+        return False
+
     def _calc_info_blis(self):
         info = get_info('blis')
         if info:
@@ -2190,7 +2267,7 @@ class blas_info(system_info):
             }""")
         src = os.path.join(tmpdir, 'source.c')
         try:
-            with open(src, 'wt') as f:
+            with open(src, 'w') as f:
                 f.write(s)
 
             try:
@@ -2345,7 +2422,7 @@ class openblas_info(blas_info):
         except Exception:
             extra_args = []
         try:
-            with open(src, 'wt') as f:
+            with open(src, 'w') as f:
                 f.write(s)
             obj = c.compile([src], output_dir=tmpdir)
             try:
@@ -2456,7 +2533,7 @@ class flame_info(system_info):
         # Add the additional "extra" arguments
         extra_args = info.get('extra_link_args', [])
         try:
-            with open(src, 'wt') as f:
+            with open(src, 'w') as f:
                 f.write(s)
             obj = c.compile([src], output_dir=tmpdir)
             try:
diff --git a/numpy/distutils/tests/test_build_ext.py b/numpy/distutils/tests/test_build_ext.py
index c007159f5..372100fc0 100644
--- a/numpy/distutils/tests/test_build_ext.py
+++ b/numpy/distutils/tests/test_build_ext.py
@@ -5,7 +5,9 @@ import subprocess
 import sys
 from textwrap import indent, dedent
 import pytest
+from numpy.testing import IS_WASM
 
+@pytest.mark.skipif(IS_WASM, reason="cannot start subprocess in wasm")
 @pytest.mark.slow
 def test_multi_fortran_libs_link(tmp_path):
     '''
diff --git a/numpy/distutils/tests/test_ccompiler_opt.py b/numpy/distutils/tests/test_ccompiler_opt.py
index 657ebdb68..a1b780336 100644
--- a/numpy/distutils/tests/test_ccompiler_opt.py
+++ b/numpy/distutils/tests/test_ccompiler_opt.py
@@ -31,7 +31,7 @@ arch_compilers = dict(
     ppc64 = ("gcc", "clang"),
     ppc64le = ("gcc", "clang"),
     armhf = ("gcc", "clang"),
-    aarch64 = ("gcc", "clang"),
+    aarch64 = ("gcc", "clang", "fcc"),
     s390x = ("gcc", "clang"),
     noarch = ("gcc",)
 )
@@ -422,8 +422,8 @@ class _Test_CCompilerOpt:
         # when option "native" is activated through the args
         try:
             self.expect("native",
-                trap_flags=".*(-march=native|-xHost|/QxHost).*",
-                x86=".*", ppc64=".*", armhf=".*", s390x=".*"
+                trap_flags=".*(-march=native|-xHost|/QxHost|-mcpu=a64fx).*",
+                x86=".*", ppc64=".*", armhf=".*", s390x=".*", aarch64=".*",
             )
             if self.march() != "unknown":
                 raise AssertionError(
diff --git a/numpy/distutils/tests/test_exec_command.py b/numpy/distutils/tests/test_exec_command.py
index 157bd8427..d1a20056a 100644
--- a/numpy/distutils/tests/test_exec_command.py
+++ b/numpy/distutils/tests/test_exec_command.py
@@ -1,10 +1,12 @@
 import os
+import pytest
 import sys
 from tempfile import TemporaryFile
 
 from numpy.distutils import exec_command
 from numpy.distutils.exec_command import get_pythonexe
-from numpy.testing import tempdir, assert_, assert_warns
+from numpy.testing import tempdir, assert_, assert_warns, IS_WASM
+
 
 # In python 3 stdout, stderr are text (unicode compliant) devices, so to
 # emulate them import StringIO from the io module.
@@ -93,6 +95,7 @@ def test_exec_command_stderr():
                         exec_command.exec_command("cd '.'")
 
 
+@pytest.mark.skipif(IS_WASM, reason="Cannot start subprocess")
 class TestExecCommand:
     def setup_method(self):
         self.pyexe = get_pythonexe()
diff --git a/numpy/distutils/tests/test_shell_utils.py b/numpy/distutils/tests/test_shell_utils.py
index 32bd283e5..696d38ddd 100644
--- a/numpy/distutils/tests/test_shell_utils.py
+++ b/numpy/distutils/tests/test_shell_utils.py
@@ -4,6 +4,7 @@ import json
 import sys
 
 from numpy.distutils import _shell_utils
+from numpy.testing import IS_WASM
 
 argv_cases = [
     [r'exe'],
@@ -49,6 +50,7 @@ def runner(Parser):
         raise NotImplementedError
 
 
+@pytest.mark.skipif(IS_WASM, reason="Cannot start subprocess")
 @pytest.mark.parametrize('argv', argv_cases)
 def test_join_matches_subprocess(Parser, runner, argv):
     """
@@ -64,6 +66,7 @@ def test_join_matches_subprocess(Parser, runner, argv):
     assert json.loads(json_out) == argv
 
 
+@pytest.mark.skipif(IS_WASM, reason="Cannot start subprocess")
 @pytest.mark.parametrize('argv', argv_cases)
 def test_roundtrip(Parser, argv):
     """
diff --git a/numpy/distutils/tests/test_system_info.py b/numpy/distutils/tests/test_system_info.py
index eb7235e04..66304a5e5 100644
--- a/numpy/distutils/tests/test_system_info.py
+++ b/numpy/distutils/tests/test_system_info.py
@@ -271,7 +271,7 @@ class TestSystemInfoReading:
 
             # But if we copy the values to a '[mkl]' section the value
             # is correct
-            with open(cfg, 'r') as fid:
+            with open(cfg) as fid:
                 mkl = fid.read().replace('[ALL]', '[mkl]', 1)
             with open(cfg, 'w') as fid:
                 fid.write(mkl)
@@ -279,7 +279,7 @@ class TestSystemInfoReading:
             assert info.get_lib_dirs() == lib_dirs
 
             # Also, the values will be taken from a section named '[DEFAULT]'
-            with open(cfg, 'r') as fid:
+            with open(cfg) as fid:
                 dflt = fid.read().replace('[mkl]', '[DEFAULT]', 1)
             with open(cfg, 'w') as fid:
                 fid.write(dflt)
diff --git a/numpy/dtypes.py b/numpy/dtypes.py
new file mode 100644
index 000000000..068a6a1a0
--- /dev/null
+++ b/numpy/dtypes.py
@@ -0,0 +1,77 @@
+"""
+DType classes and utility (:mod:`numpy.dtypes`)
+===============================================
+
+This module is home to specific dtypes related functionality and their classes.
+For more general information about dtypes, also see `numpy.dtype` and
+:ref:`arrays.dtypes`.
+
+Similar to the builtin ``types`` module, this submodule defines types (classes)
+that are not widely used directly.
+
+.. versionadded:: NumPy 1.25
+
+    The dtypes module is new in NumPy 1.25.  Previously DType classes were
+    only accessible indirectly.
+
+
+DType classes
+-------------
+
+The following are the classes of the corresponding NumPy dtype instances and
+NumPy scalar types.  The classes can be used in ``isinstance`` checks and can
+also be instantiated or used directly.  Direct use of these classes is not
+typical, since their scalar counterparts (e.g. ``np.float64``) or strings
+like ``"float64"`` can be used.
+
+.. list-table::
+    :header-rows: 1
+
+    * - Group
+      - DType class
+
+    * - Boolean
+      - ``BoolDType``
+
+    * - Bit-sized integers
+      - ``Int8DType``, ``UInt8DType``, ``Int16DType``, ``UInt16DType``,
+        ``Int32DType``, ``UInt32DType``, ``Int64DType``, ``UInt64DType``
+
+    * - C-named integers (may be aliases)
+      - ``ByteDType``, ``UByteDType``, ``ShortDType``, ``UShortDType``,
+        ``IntDType``, ``UIntDType``, ``LongDType``, ``ULongDType``,
+        ``LongLongDType``, ``ULongLongDType``
+
+    * - Floating point
+      - ``Float16DType``, ``Float32DType``, ``Float64DType``,
+        ``LongDoubleDType``
+
+    * - Complex
+      - ``Complex64DType``, ``Complex128DType``, ``CLongDoubleDType``
+
+    * - Strings
+      - ``BytesDType``, ``BytesDType``
+
+    * - Times
+      - ``DateTime64DType``, ``TimeDelta64DType``
+
+    * - Others
+      - ``ObjectDType``, ``VoidDType``
+
+"""
+
+__all__ = []
+
+
+def _add_dtype_helper(DType, alias):
+    # Function to add DTypes a bit more conveniently without channeling them
+    # through `numpy.core._multiarray_umath` namespace or similar.
+    from numpy import dtypes
+
+    setattr(dtypes, DType.__name__, DType)
+    __all__.append(DType.__name__)
+
+    if alias:
+        alias = alias.removeprefix("numpy.dtypes.")
+        setattr(dtypes, alias, DType)
+        __all__.append(alias)
diff --git a/numpy/dtypes.pyi b/numpy/dtypes.pyi
new file mode 100644
index 000000000..2f7e846f2
--- /dev/null
+++ b/numpy/dtypes.pyi
@@ -0,0 +1,43 @@
+import numpy as np
+
+
+__all__: list[str]
+
+# Boolean:
+BoolDType = np.dtype[np.bool_]
+# Sized integers:
+Int8DType = np.dtype[np.int8]
+UInt8DType = np.dtype[np.uint8]
+Int16DType = np.dtype[np.int16]
+UInt16DType = np.dtype[np.uint16]
+Int32DType = np.dtype[np.int32]
+UInt32DType = np.dtype[np.uint32]
+Int64DType = np.dtype[np.int64]
+UInt64DType = np.dtype[np.uint64]
+# Standard C-named version/alias:
+ByteDType = np.dtype[np.byte]
+UByteDType = np.dtype[np.ubyte]
+ShortDType = np.dtype[np.short]
+UShortDType = np.dtype[np.ushort]
+IntDType = np.dtype[np.intc]
+UIntDType = np.dtype[np.uintc]
+LongDType = np.dtype[np.int_]  # Unfortunately, the correct scalar
+ULongDType = np.dtype[np.uint]  # Unfortunately, the correct scalar
+LongLongDType = np.dtype[np.longlong]
+ULongLongDType = np.dtype[np.ulonglong]
+# Floats
+Float16DType = np.dtype[np.float16]
+Float32DType = np.dtype[np.float32]
+Float64DType = np.dtype[np.float64]
+LongDoubleDType = np.dtype[np.longdouble]
+# Complex:
+Complex64DType = np.dtype[np.complex64]
+Complex128DType = np.dtype[np.complex128]
+CLongDoubleDType = np.dtype[np.clongdouble]
+# Others:
+ObjectDType = np.dtype[np.object_]
+BytesDType = np.dtype[np.bytes_]
+StrDType = np.dtype[np.str_]
+VoidDType = np.dtype[np.void]
+DateTime64DType = np.dtype[np.datetime64]
+TimeDelta64DType = np.dtype[np.timedelta64]
diff --git a/numpy/dual.py b/numpy/dual.py
deleted file mode 100644
index eb7e61aac..000000000
--- a/numpy/dual.py
+++ /dev/null
@@ -1,83 +0,0 @@
-"""
-.. deprecated:: 1.20
-
-*This module is deprecated.  Instead of importing functions from*
-``numpy.dual``, *the functions should be imported directly from NumPy
-or SciPy*.
-
-Aliases for functions which may be accelerated by SciPy.
-
-SciPy_ can be built to use accelerated or otherwise improved libraries
-for FFTs, linear algebra, and special functions. This module allows
-developers to transparently support these accelerated functions when
-SciPy is available but still support users who have only installed
-NumPy.
-
-.. _SciPy : https://www.scipy.org
-
-"""
-import warnings
-
-
-warnings.warn('The module numpy.dual is deprecated.  Instead of using dual, '
-              'use the functions directly from numpy or scipy.',
-              category=DeprecationWarning,
-              stacklevel=2)
-
-# This module should be used for functions both in numpy and scipy if
-#  you want to use the numpy version if available but the scipy version
-#  otherwise.
-#  Usage  --- from numpy.dual import fft, inv
-
-__all__ = ['fft', 'ifft', 'fftn', 'ifftn', 'fft2', 'ifft2',
-           'norm', 'inv', 'svd', 'solve', 'det', 'eig', 'eigvals',
-           'eigh', 'eigvalsh', 'lstsq', 'pinv', 'cholesky', 'i0']
-
-import numpy.linalg as linpkg
-import numpy.fft as fftpkg
-from numpy.lib import i0
-import sys
-
-
-fft = fftpkg.fft
-ifft = fftpkg.ifft
-fftn = fftpkg.fftn
-ifftn = fftpkg.ifftn
-fft2 = fftpkg.fft2
-ifft2 = fftpkg.ifft2
-
-norm = linpkg.norm
-inv = linpkg.inv
-svd = linpkg.svd
-solve = linpkg.solve
-det = linpkg.det
-eig = linpkg.eig
-eigvals = linpkg.eigvals
-eigh = linpkg.eigh
-eigvalsh = linpkg.eigvalsh
-lstsq = linpkg.lstsq
-pinv = linpkg.pinv
-cholesky = linpkg.cholesky
-
-_restore_dict = {}
-
-def register_func(name, func):
-    if name not in __all__:
-        raise ValueError("{} not a dual function.".format(name))
-    f = sys._getframe(0).f_globals
-    _restore_dict[name] = f[name]
-    f[name] = func
-
-def restore_func(name):
-    if name not in __all__:
-        raise ValueError("{} not a dual function.".format(name))
-    try:
-        val = _restore_dict[name]
-    except KeyError:
-        return
-    else:
-        sys._getframe(0).f_globals[name] = val
-
-def restore_all():
-    for name in _restore_dict.keys():
-        restore_func(name)
diff --git a/numpy/exceptions.py b/numpy/exceptions.py
new file mode 100644
index 000000000..2f8438101
--- /dev/null
+++ b/numpy/exceptions.py
@@ -0,0 +1,231 @@
+"""
+Exceptions and Warnings (:mod:`numpy.exceptions`)
+=================================================
+
+General exceptions used by NumPy.  Note that some exceptions may be module
+specific, such as linear algebra errors.
+
+.. versionadded:: NumPy 1.25
+
+    The exceptions module is new in NumPy 1.25.  Older exceptions remain
+    available through the main NumPy namespace for compatibility.
+
+.. currentmodule:: numpy.exceptions
+
+Warnings
+--------
+.. autosummary::
+   :toctree: generated/
+
+   ComplexWarning             Given when converting complex to real.
+   VisibleDeprecationWarning  Same as a DeprecationWarning, but more visible.
+
+Exceptions
+----------
+.. autosummary::
+   :toctree: generated/
+
+    AxisError          Given when an axis was invalid.
+    DTypePromotionError   Given when no common dtype could be found.
+    TooHardError       Error specific to `numpy.shares_memory`.
+
+"""
+
+
+__all__ = [
+    "ComplexWarning", "VisibleDeprecationWarning", "ModuleDeprecationWarning",
+    "TooHardError", "AxisError", "DTypePromotionError"]
+
+
+# Disallow reloading this module so as to preserve the identities of the
+# classes defined here.
+if '_is_loaded' in globals():
+    raise RuntimeError('Reloading numpy._globals is not allowed')
+_is_loaded = True
+
+
+class ComplexWarning(RuntimeWarning):
+    """
+    The warning raised when casting a complex dtype to a real dtype.
+
+    As implemented, casting a complex number to a real discards its imaginary
+    part, but this behavior may not be what the user actually wants.
+
+    """
+    pass
+
+
+class ModuleDeprecationWarning(DeprecationWarning):
+    """Module deprecation warning.
+
+    .. warning::
+
+        This warning should not be used, since nose testing is not relevant
+        anymore.
+
+    The nose tester turns ordinary Deprecation warnings into test failures.
+    That makes it hard to deprecate whole modules, because they get
+    imported by default. So this is a special Deprecation warning that the
+    nose tester will let pass without making tests fail.
+
+    """
+
+
+class VisibleDeprecationWarning(UserWarning):
+    """Visible deprecation warning.
+
+    By default, python will not show deprecation warnings, so this class
+    can be used when a very visible warning is helpful, for example because
+    the usage is most likely a user bug.
+
+    """
+
+
+# Exception used in shares_memory()
+class TooHardError(RuntimeError):
+    """max_work was exceeded.
+
+    This is raised whenever the maximum number of candidate solutions
+    to consider specified by the ``max_work`` parameter is exceeded.
+    Assigning a finite number to max_work may have caused the operation
+    to fail.
+
+    """
+
+    pass
+
+
+class AxisError(ValueError, IndexError):
+    """Axis supplied was invalid.
+
+    This is raised whenever an ``axis`` parameter is specified that is larger
+    than the number of array dimensions.
+    For compatibility with code written against older numpy versions, which
+    raised a mixture of `ValueError` and `IndexError` for this situation, this
+    exception subclasses both to ensure that ``except ValueError`` and
+    ``except IndexError`` statements continue to catch `AxisError`.
+
+    .. versionadded:: 1.13
+
+    Parameters
+    ----------
+    axis : int or str
+        The out of bounds axis or a custom exception message.
+        If an axis is provided, then `ndim` should be specified as well.
+    ndim : int, optional
+        The number of array dimensions.
+    msg_prefix : str, optional
+        A prefix for the exception message.
+
+    Attributes
+    ----------
+    axis : int, optional
+        The out of bounds axis or ``None`` if a custom exception
+        message was provided. This should be the axis as passed by
+        the user, before any normalization to resolve negative indices.
+
+        .. versionadded:: 1.22
+    ndim : int, optional
+        The number of array dimensions or ``None`` if a custom exception
+        message was provided.
+
+        .. versionadded:: 1.22
+
+
+    Examples
+    --------
+    >>> array_1d = np.arange(10)
+    >>> np.cumsum(array_1d, axis=1)
+    Traceback (most recent call last):
+      ...
+    numpy.exceptions.AxisError: axis 1 is out of bounds for array of dimension 1
+
+    Negative axes are preserved:
+
+    >>> np.cumsum(array_1d, axis=-2)
+    Traceback (most recent call last):
+      ...
+    numpy.exceptions.AxisError: axis -2 is out of bounds for array of dimension 1
+
+    The class constructor generally takes the axis and arrays'
+    dimensionality as arguments:
+
+    >>> print(np.AxisError(2, 1, msg_prefix='error'))
+    error: axis 2 is out of bounds for array of dimension 1
+
+    Alternatively, a custom exception message can be passed:
+
+    >>> print(np.AxisError('Custom error message'))
+    Custom error message
+
+    """
+
+    __slots__ = ("axis", "ndim", "_msg")
+
+    def __init__(self, axis, ndim=None, msg_prefix=None):
+        if ndim is msg_prefix is None:
+            # single-argument form: directly set the error message
+            self._msg = axis
+            self.axis = None
+            self.ndim = None
+        else:
+            self._msg = msg_prefix
+            self.axis = axis
+            self.ndim = ndim
+
+    def __str__(self):
+        axis = self.axis
+        ndim = self.ndim
+
+        if axis is ndim is None:
+            return self._msg
+        else:
+            msg = f"axis {axis} is out of bounds for array of dimension {ndim}"
+            if self._msg is not None:
+                msg = f"{self._msg}: {msg}"
+            return msg
+
+
+class DTypePromotionError(TypeError):
+    """Multiple DTypes could not be converted to a common one.
+
+    This exception derives from ``TypeError`` and is raised whenever dtypes
+    cannot be converted to a single common one.  This can be because they
+    are of a different category/class or incompatible instances of the same
+    one (see Examples).
+
+    Notes
+    -----
+    Many functions will use promotion to find the correct result and
+    implementation.  For these functions the error will typically be chained
+    with a more specific error indicating that no implementation was found
+    for the input dtypes.
+
+    Typically promotion should be considered "invalid" between the dtypes of
+    two arrays when `arr1 == arr2` can safely return all ``False`` because the
+    dtypes are fundamentally different.
+
+    Examples
+    --------
+    Datetimes and complex numbers are incompatible classes and cannot be
+    promoted:
+
+    >>> np.result_type(np.dtype("M8[s]"), np.complex128)
+    DTypePromotionError: The DType <class 'numpy.dtype[datetime64]'> could not
+    be promoted by <class 'numpy.dtype[complex128]'>. This means that no common
+    DType exists for the given inputs. For example they cannot be stored in a
+    single array unless the dtype is `object`. The full list of DTypes is:
+    (<class 'numpy.dtype[datetime64]'>, <class 'numpy.dtype[complex128]'>)
+
+    For example for structured dtypes, the structure can mismatch and the
+    same ``DTypePromotionError`` is given when two structured dtypes with
+    a mismatch in their number of fields is given:
+
+    >>> dtype1 = np.dtype([("field1", np.float64), ("field2", np.int64)])
+    >>> dtype2 = np.dtype([("field1", np.float64)])
+    >>> np.promote_types(dtype1, dtype2)
+    DTypePromotionError: field names `('field1', 'field2')` and `('field1',)`
+    mismatch.
+
+    """
+    pass
diff --git a/numpy/exceptions.pyi b/numpy/exceptions.pyi
new file mode 100644
index 000000000..c76a0946b
--- /dev/null
+++ b/numpy/exceptions.pyi
@@ -0,0 +1,18 @@
+from typing import overload
+
+__all__: list[str]
+
+class ComplexWarning(RuntimeWarning): ...
+class ModuleDeprecationWarning(DeprecationWarning): ...
+class VisibleDeprecationWarning(UserWarning): ...
+class TooHardError(RuntimeError): ...
+class DTypePromotionError(TypeError): ...
+
+class AxisError(ValueError, IndexError):
+    axis: None | int
+    ndim: None | int
+    @overload
+    def __init__(self, axis: str, ndim: None = ..., msg_prefix: None = ...) -> None: ...
+    @overload
+    def __init__(self, axis: int, ndim: int, msg_prefix: None | str = ...) -> None: ...
+    def __str__(self) -> str: ...
diff --git a/numpy/f2py/capi_maps.py b/numpy/f2py/capi_maps.py
index f07066a09..f0a7221b7 100644
--- a/numpy/f2py/capi_maps.py
+++ b/numpy/f2py/capi_maps.py
@@ -196,7 +196,7 @@ def load_f2cmap_file(f2cmap_file):
     # they use PARAMETERS in type specifications.
     try:
         outmess('Reading f2cmap from {!r} ...\n'.format(f2cmap_file))
-        with open(f2cmap_file, 'r') as f:
+        with open(f2cmap_file) as f:
             d = eval(f.read().lower(), {}, {})
         for k, d1 in d.items():
             for k1 in d1.keys():
@@ -342,9 +342,9 @@ def getstrlength(var):
 def getarrdims(a, var, verbose=0):
     ret = {}
     if isstring(var) and not isarray(var):
-        ret['dims'] = getstrlength(var)
-        ret['size'] = ret['dims']
-        ret['rank'] = '1'
+        ret['size'] = getstrlength(var)
+        ret['rank'] = '0'
+        ret['dims'] = ''
     elif isscalar(var):
         ret['size'] = '1'
         ret['rank'] = '0'
diff --git a/numpy/f2py/cfuncs.py b/numpy/f2py/cfuncs.py
index 741692562..2d27b6524 100644
--- a/numpy/f2py/cfuncs.py
+++ b/numpy/f2py/cfuncs.py
@@ -800,16 +800,23 @@ character_from_pyobj(character* v, PyObject *obj, const char *errmess) {
         }
     }
     {
+        /* TODO: This error (and most other) error handling needs cleaning. */
         char mess[F2PY_MESSAGE_BUFFER_SIZE];
         strcpy(mess, errmess);
         PyObject* err = PyErr_Occurred();
         if (err == NULL) {
             err = PyExc_TypeError;
+            Py_INCREF(err);
+        }
+        else {
+            Py_INCREF(err);
+            PyErr_Clear();
         }
         sprintf(mess + strlen(mess),
                 " -- expected str|bytes|sequence-of-str-or-bytes, got ");
         f2py_describe(obj, mess + strlen(mess));
         PyErr_SetString(err, mess);
+        Py_DECREF(err);
     }
     return 0;
 }
@@ -1227,8 +1234,8 @@ static int try_pyarr_from_character(PyObject* obj, character* v) {
             strcpy(mess, "try_pyarr_from_character failed"
                          " -- expected bytes array-scalar|array, got ");
             f2py_describe(obj, mess + strlen(mess));
+            PyErr_SetString(err, mess);
         }
-        PyErr_SetString(err, mess);
     }
     return 0;
 }
diff --git a/numpy/f2py/crackfortran.py b/numpy/f2py/crackfortran.py
index 27e257c48..4871d2628 100755
--- a/numpy/f2py/crackfortran.py
+++ b/numpy/f2py/crackfortran.py
@@ -147,10 +147,11 @@ import os
 import copy
 import platform
 import codecs
+from pathlib import Path
 try:
-    import chardet
+    import charset_normalizer
 except ImportError:
-    chardet = None
+    charset_normalizer = None
 
 from . import __version__
 
@@ -289,69 +290,69 @@ def undo_rmbadname(names):
     return [undo_rmbadname1(_m) for _m in names]
 
 
-def getextension(name):
-    i = name.rfind('.')
-    if i == -1:
-        return ''
-    if '\\' in name[i:]:
-        return ''
-    if '/' in name[i:]:
-        return ''
-    return name[i + 1:]
-
-is_f_file = re.compile(r'.*\.(for|ftn|f77|f)\Z', re.I).match
 _has_f_header = re.compile(r'-\*-\s*fortran\s*-\*-', re.I).search
 _has_f90_header = re.compile(r'-\*-\s*f90\s*-\*-', re.I).search
 _has_fix_header = re.compile(r'-\*-\s*fix\s*-\*-', re.I).search
 _free_f90_start = re.compile(r'[^c*]\s*[^\s\d\t]', re.I).match
 
+# Extensions
+COMMON_FREE_EXTENSIONS = ['.f90', '.f95', '.f03', '.f08']
+COMMON_FIXED_EXTENSIONS = ['.for', '.ftn', '.f77', '.f']
+
 
 def openhook(filename, mode):
     """Ensures that filename is opened with correct encoding parameter.
 
-    This function uses chardet package, when available, for
-    determining the encoding of the file to be opened. When chardet is
-    not available, the function detects only UTF encodings, otherwise,
-    ASCII encoding is used as fallback.
+    This function uses charset_normalizer package, when available, for
+    determining the encoding of the file to be opened. When charset_normalizer
+    is not available, the function detects only UTF encodings, otherwise, ASCII
+    encoding is used as fallback.
     """
-    bytes = min(32, os.path.getsize(filename))
-    with open(filename, 'rb') as f:
-        raw = f.read(bytes)
-    if raw.startswith(codecs.BOM_UTF8):
-        encoding = 'UTF-8-SIG'
-    elif raw.startswith((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)):
-        encoding = 'UTF-32'
-    elif raw.startswith((codecs.BOM_LE, codecs.BOM_BE)):
-        encoding = 'UTF-16'
+    # Reads in the entire file. Robust detection of encoding.
+    # Correctly handles comments or late stage unicode characters
+    # gh-22871
+    if charset_normalizer is not None:
+        encoding = charset_normalizer.from_path(filename).best().encoding
     else:
-        if chardet is not None:
-            encoding = chardet.detect(raw)['encoding']
-        else:
-            # hint: install chardet to ensure correct encoding handling
-            encoding = 'ascii'
+        # hint: install charset_normalizer for correct encoding handling
+        # No need to read the whole file for trying with startswith
+        nbytes = min(32, os.path.getsize(filename))
+        with open(filename, 'rb') as fhandle:
+            raw = fhandle.read(nbytes)
+            if raw.startswith(codecs.BOM_UTF8):
+                encoding = 'UTF-8-SIG'
+            elif raw.startswith((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)):
+                encoding = 'UTF-32'
+            elif raw.startswith((codecs.BOM_LE, codecs.BOM_BE)):
+                encoding = 'UTF-16'
+            else:
+                # Fallback, without charset_normalizer
+                encoding = 'ascii'
     return open(filename, mode, encoding=encoding)
 
 
-def is_free_format(file):
+def is_free_format(fname):
     """Check if file is in free format Fortran."""
     # f90 allows both fixed and free format, assuming fixed unless
     # signs of free format are detected.
-    result = 0
-    with openhook(file, 'r') as f:
-        line = f.readline()
+    result = False
+    if Path(fname).suffix.lower() in COMMON_FREE_EXTENSIONS:
+        result = True
+    with openhook(fname, 'r') as fhandle:
+        line = fhandle.readline()
         n = 15  # the number of non-comment lines to scan for hints
         if _has_f_header(line):
             n = 0
         elif _has_f90_header(line):
             n = 0
-            result = 1
+            result = True
         while n > 0 and line:
             if line[0] != '!' and line.strip():
                 n -= 1
                 if (line[0] != '\t' and _free_f90_start(line[:5])) or line[-2:-1] == '&':
-                    result = 1
+                    result = True
                     break
-            line = f.readline()
+            line = fhandle.readline()
     return result
 
 
@@ -394,7 +395,7 @@ def readfortrancode(ffile, dowithline=show, istop=1):
         except UnicodeDecodeError as msg:
             raise Exception(
                 f'readfortrancode: reading {fin.filename()}#{fin.lineno()}'
-                f' failed with\n{msg}.\nIt is likely that installing chardet'
+                f' failed with\n{msg}.\nIt is likely that installing charset_normalizer'
                 ' package will help f2py determine the input file encoding'
                 ' correctly.')
         if not l:
@@ -407,7 +408,7 @@ def readfortrancode(ffile, dowithline=show, istop=1):
             strictf77 = 0
             sourcecodeform = 'fix'
             ext = os.path.splitext(currentfilename)[1]
-            if is_f_file(currentfilename) and \
+            if Path(currentfilename).suffix.lower() in COMMON_FIXED_EXTENSIONS and \
                     not (_has_f90_header(l) or _has_fix_header(l)):
                 strictf77 = 1
             elif is_free_format(currentfilename) and not _has_fix_header(l):
@@ -612,15 +613,15 @@ beginpattern90 = re.compile(
 groupends = (r'end|endprogram|endblockdata|endmodule|endpythonmodule|'
              r'endinterface|endsubroutine|endfunction')
 endpattern = re.compile(
-    beforethisafter % ('', groupends, groupends, r'.*'), re.I), 'end'
+    beforethisafter % ('', groupends, groupends, '.*'), re.I), 'end'
 endifs = r'end\s*(if|do|where|select|while|forall|associate|block|' + \
          r'critical|enum|team)'
 endifpattern = re.compile(
-    beforethisafter % (r'[\w]*?', endifs, endifs, r'[\w\s]*'), re.I), 'endif'
+    beforethisafter % (r'[\w]*?', endifs, endifs, '.*'), re.I), 'endif'
 #
 moduleprocedures = r'module\s*procedure'
 moduleprocedurepattern = re.compile(
-    beforethisafter % ('', moduleprocedures, moduleprocedures, r'.*'), re.I), \
+    beforethisafter % ('', moduleprocedures, moduleprocedures, '.*'), re.I), \
     'moduleprocedure'
 implicitpattern = re.compile(
     beforethisafter % ('', 'implicit', 'implicit', '.*'), re.I), 'implicit'
@@ -934,7 +935,7 @@ typedefpattern = re.compile(
     r'(?:,(?P<attributes>[\w(),]+))?(::)?(?P<name>\b[a-z$_][\w$]*\b)'
     r'(?:\((?P<params>[\w,]*)\))?\Z', re.I)
 nameargspattern = re.compile(
-    r'\s*(?P<name>\b[\w$]+\b)\s*(@\(@\s*(?P<args>[\w\s,]*)\s*@\)@|)\s*((result(\s*@\(@\s*(?P<result>\b[\w$]+\b)\s*@\)@|))|(bind\s*@\(@\s*(?P<bind>.*)\s*@\)@))*\s*\Z', re.I)
+    r'\s*(?P<name>\b[\w$]+\b)\s*(@\(@\s*(?P<args>[\w\s,]*)\s*@\)@|)\s*((result(\s*@\(@\s*(?P<result>\b[\w$]+\b)\s*@\)@|))|(bind\s*@\(@\s*(?P<bind>(?:(?!@\)@).)*)\s*@\)@))*\s*\Z', re.I)
 operatorpattern = re.compile(
     r'\s*(?P<scheme>(operator|assignment))'
     r'@\(@\s*(?P<name>[^)]+)\s*@\)@\s*\Z', re.I)
@@ -1739,6 +1740,28 @@ def updatevars(typespec, selector, attrspec, entitydecl):
                         d1[k] = unmarkouterparen(d1[k])
                     else:
                         del d1[k]
+
+                if 'len' in d1:
+                    if typespec in ['complex', 'integer', 'logical', 'real']:
+                        if ('kindselector' not in edecl) or (not edecl['kindselector']):
+                            edecl['kindselector'] = {}
+                        edecl['kindselector']['*'] = d1['len']
+                        del d1['len']
+                    elif typespec == 'character':
+                        if ('charselector' not in edecl) or (not edecl['charselector']):
+                            edecl['charselector'] = {}
+                        if 'len' in edecl['charselector']:
+                            del edecl['charselector']['len']
+                        edecl['charselector']['*'] = d1['len']
+                        del d1['len']
+
+                if 'init' in d1:
+                    if '=' in edecl and (not edecl['='] == d1['init']):
+                        outmess('updatevars: attempt to change the init expression of "%s" ("%s") to "%s". Ignoring.\n' % (
+                            ename, edecl['='], d1['init']))
+                    else:
+                        edecl['='] = d1['init']
+
                 if 'len' in d1 and 'array' in d1:
                     if d1['len'] == '':
                         d1['len'] = d1['array']
@@ -1748,6 +1771,7 @@ def updatevars(typespec, selector, attrspec, entitydecl):
                         del d1['len']
                         errmess('updatevars: "%s %s" is mapped to "%s %s(%s)"\n' % (
                             typespec, e, typespec, ename, d1['array']))
+
                 if 'array' in d1:
                     dm = 'dimension(%s)' % d1['array']
                     if 'attrspec' not in edecl or (not edecl['attrspec']):
@@ -1761,23 +1785,6 @@ def updatevars(typespec, selector, attrspec, entitydecl):
                                         % (ename, dm1, dm))
                                 break
 
-                if 'len' in d1:
-                    if typespec in ['complex', 'integer', 'logical', 'real']:
-                        if ('kindselector' not in edecl) or (not edecl['kindselector']):
-                            edecl['kindselector'] = {}
-                        edecl['kindselector']['*'] = d1['len']
-                    elif typespec == 'character':
-                        if ('charselector' not in edecl) or (not edecl['charselector']):
-                            edecl['charselector'] = {}
-                        if 'len' in edecl['charselector']:
-                            del edecl['charselector']['len']
-                        edecl['charselector']['*'] = d1['len']
-                if 'init' in d1:
-                    if '=' in edecl and (not edecl['='] == d1['init']):
-                        outmess('updatevars: attempt to change the init expression of "%s" ("%s") to "%s". Ignoring.\n' % (
-                            ename, edecl['='], d1['init']))
-                    else:
-                        edecl['='] = d1['init']
             else:
                 outmess('updatevars: could not crack entity declaration "%s". Ignoring.\n' % (
                     ename + m.group('after')))
@@ -2386,19 +2393,19 @@ def _selected_int_kind_func(r):
 
 def _selected_real_kind_func(p, r=0, radix=0):
     # XXX: This should be processor dependent
-    # This is only good for 0 <= p <= 20
+    # This is only verified for 0 <= p <= 20, possibly good for p <= 33 and above
     if p < 7:
         return 4
     if p < 16:
         return 8
     machine = platform.machine().lower()
-    if machine.startswith(('aarch64', 'power', 'ppc', 'riscv', 's390x', 'sparc')):
-        if p <= 20:
+    if machine.startswith(('aarch64', 'arm64', 'power', 'ppc', 'riscv', 's390x', 'sparc')):
+        if p <= 33:
             return 16
     else:
         if p < 19:
             return 10
-        elif p <= 20:
+        elif p <= 33:
             return 16
     return -1
 
@@ -2849,6 +2856,11 @@ def analyzevars(block):
                         kindselect, charselect, typename = cracktypespec(
                             typespec, selector)
                         vars[n]['typespec'] = typespec
+                        try:
+                            if block['result']:
+                                vars[block['result']]['typespec'] = typespec
+                        except Exception:
+                            pass
                         if kindselect:
                             if 'kind' in kindselect:
                                 try:
diff --git a/numpy/f2py/diagnose.py b/numpy/f2py/diagnose.py
index 21ee399f0..86d7004ab 100644
--- a/numpy/f2py/diagnose.py
+++ b/numpy/f2py/diagnose.py
@@ -30,15 +30,15 @@ def run():
     try:
         import numpy
         has_newnumpy = 1
-    except ImportError:
-        print('Failed to import new numpy:', sys.exc_info()[1])
+    except ImportError as e:
+        print('Failed to import new numpy:', e)
         has_newnumpy = 0
 
     try:
         from numpy.f2py import f2py2e
         has_f2py2e = 1
-    except ImportError:
-        print('Failed to import f2py2e:', sys.exc_info()[1])
+    except ImportError as e:
+        print('Failed to import f2py2e:', e)
         has_f2py2e = 0
 
     try:
@@ -48,8 +48,8 @@ def run():
         try:
             import numpy_distutils
             has_numpy_distutils = 1
-        except ImportError:
-            print('Failed to import numpy_distutils:', sys.exc_info()[1])
+        except ImportError as e:
+            print('Failed to import numpy_distutils:', e)
             has_numpy_distutils = 0
 
     if has_newnumpy:
diff --git a/numpy/f2py/func2subr.py b/numpy/f2py/func2subr.py
index 2a05f065b..cc3cdc5b4 100644
--- a/numpy/f2py/func2subr.py
+++ b/numpy/f2py/func2subr.py
@@ -119,6 +119,12 @@ def createfuncwrapper(rout, signature=0):
 
     sargs = ', '.join(args)
     if f90mode:
+        # gh-23598 fix warning
+        # Essentially, this gets called again with modules where the name of the
+        # function is added to the arguments, which is not required, and removed
+        sargs = sargs.replace(f"{name}, ", '')
+        args = [arg for arg in args if arg != name]
+        rout['args'] = args
         add('subroutine f2pywrap_%s_%s (%s)' %
             (rout['modulename'], name, sargs))
         if not signature:
diff --git a/numpy/f2py/symbolic.py b/numpy/f2py/symbolic.py
index c2ab0f140..b1b9f5b6a 100644
--- a/numpy/f2py/symbolic.py
+++ b/numpy/f2py/symbolic.py
@@ -801,7 +801,7 @@ def normalize(obj):
             else:
                 _pairs_add(d, t, c)
         if len(d) == 0:
-            # TODO: deterimine correct kind
+            # TODO: determine correct kind
             return as_number(0)
         elif len(d) == 1:
             (t, c), = d.items()
@@ -836,7 +836,7 @@ def normalize(obj):
             else:
                 _pairs_add(d, b, e)
         if len(d) == 0 or coeff == 0:
-            # TODO: deterimine correct kind
+            # TODO: determine correct kind
             assert isinstance(coeff, number_types)
             return as_number(coeff)
         elif len(d) == 1:
diff --git a/numpy/f2py/tests/src/crackfortran/gh23533.f b/numpy/f2py/tests/src/crackfortran/gh23533.f
new file mode 100644
index 000000000..db522afa7
--- /dev/null
+++ b/numpy/f2py/tests/src/crackfortran/gh23533.f
@@ -0,0 +1,5 @@
+      SUBROUTINE EXAMPLE( )
+        IF( .TRUE. ) THEN
+            CALL DO_SOMETHING()
+        END IF ! ** .TRUE. **
+      END
diff --git a/numpy/f2py/tests/src/crackfortran/gh23598.f90 b/numpy/f2py/tests/src/crackfortran/gh23598.f90
new file mode 100644
index 000000000..e0dffb5ef
--- /dev/null
+++ b/numpy/f2py/tests/src/crackfortran/gh23598.f90
@@ -0,0 +1,4 @@
+integer function intproduct(a, b) result(res)
+  integer, intent(in) :: a, b
+  res = a*b
+end function
diff --git a/numpy/f2py/tests/src/crackfortran/gh23598Warn.f90 b/numpy/f2py/tests/src/crackfortran/gh23598Warn.f90
new file mode 100644
index 000000000..3b44efc5e
--- /dev/null
+++ b/numpy/f2py/tests/src/crackfortran/gh23598Warn.f90
@@ -0,0 +1,11 @@
+module test_bug
+    implicit none
+    private
+    public :: intproduct
+
+contains
+    integer function intproduct(a, b) result(res)
+    integer, intent(in) :: a, b
+    res = a*b
+    end function
+end module
diff --git a/numpy/f2py/tests/src/crackfortran/unicode_comment.f90 b/numpy/f2py/tests/src/crackfortran/unicode_comment.f90
new file mode 100644
index 000000000..13515ce98
--- /dev/null
+++ b/numpy/f2py/tests/src/crackfortran/unicode_comment.f90
@@ -0,0 +1,4 @@
+subroutine foo(x)
+  real(8), intent(in) :: x
+  ! Écrit à l'écran la valeur de x
+end subroutine
diff --git a/numpy/f2py/tests/src/string/scalar_string.f90 b/numpy/f2py/tests/src/string/scalar_string.f90
new file mode 100644
index 000000000..f8f076172
--- /dev/null
+++ b/numpy/f2py/tests/src/string/scalar_string.f90
@@ -0,0 +1,9 @@
+MODULE string_test
+
+  character(len=8) :: string
+  character string77 * 8
+
+  character(len=12), dimension(5,7) :: strarr
+  character strarr77(5,7) * 12
+
+END MODULE string_test
diff --git a/numpy/f2py/tests/test_abstract_interface.py b/numpy/f2py/tests/test_abstract_interface.py
index 29e4b0647..42902913e 100644
--- a/numpy/f2py/tests/test_abstract_interface.py
+++ b/numpy/f2py/tests/test_abstract_interface.py
@@ -1,9 +1,12 @@
 from pathlib import Path
+import pytest
 import textwrap
 from . import util
 from numpy.f2py import crackfortran
+from numpy.testing import IS_WASM
 
 
+@pytest.mark.skipif(IS_WASM, reason="Cannot start subprocess")
 class TestAbstractInterface(util.F2PyTest):
     sources = [util.getpath("tests", "src", "abstract_interface", "foo.f90")]
 
diff --git a/numpy/f2py/tests/test_character.py b/numpy/f2py/tests/test_character.py
index b54b4d981..0bb0f4290 100644
--- a/numpy/f2py/tests/test_character.py
+++ b/numpy/f2py/tests/test_character.py
@@ -457,9 +457,10 @@ class TestMiscCharacter(util.F2PyTest):
          character(len=*), intent(in) :: x(:)
          !f2py intent(out) x
          integer :: i
-         do i=1, size(x)
-           print*, "x(",i,")=", x(i)
-         end do
+         ! Uncomment for debug printing:
+         !do i=1, size(x)
+         !   print*, "x(",i,")=", x(i)
+         !end do
        end subroutine {fprefix}_gh4519
 
        pure function {fprefix}_gh3425(x) result (y)
@@ -568,3 +569,25 @@ class TestMiscCharacter(util.F2PyTest):
         assert_equal(len(a), 2)
 
         assert_raises(Exception, lambda: f(b'c'))
+
+
+class TestStringScalarArr(util.F2PyTest):
+    sources = [util.getpath("tests", "src", "string", "scalar_string.f90")]
+
+    @pytest.mark.slow
+    def test_char(self):
+        for out in (self.module.string_test.string,
+                    self.module.string_test.string77):
+            expected = ()
+            assert out.shape == expected
+            expected = '|S8'
+            assert out.dtype == expected
+
+    @pytest.mark.slow
+    def test_char_arr(self):
+        for out in (self.module.string_test.strarr,
+                    self.module.string_test.strarr77):
+            expected = (5,7)
+            assert out.shape == expected
+            expected = '|S12'
+            assert out.dtype == expected
diff --git a/numpy/f2py/tests/test_crackfortran.py b/numpy/f2py/tests/test_crackfortran.py
index dcf8760db..49bfc13af 100644
--- a/numpy/f2py/tests/test_crackfortran.py
+++ b/numpy/f2py/tests/test_crackfortran.py
@@ -1,7 +1,10 @@
+import importlib
 import codecs
+import time
+import unicodedata
 import pytest
 import numpy as np
-from numpy.f2py.crackfortran import markinnerspaces
+from numpy.f2py.crackfortran import markinnerspaces, nameargspattern
 from . import util
 from numpy.f2py import crackfortran
 import textwrap
@@ -132,6 +135,7 @@ class TestMarkinnerspaces:
         assert markinnerspaces("a 'b c' 'd e'") == "a 'b@_@c' 'd@_@e'"
         assert markinnerspaces(r'a "b c" "d e"') == r'a "b@_@c" "d@_@e"'
 
+
 class TestDimSpec(util.F2PyTest):
     """This test suite tests various expressions that are used as dimension
     specifications.
@@ -241,6 +245,7 @@ class TestModuleDeclaration:
         assert len(mod) == 1
         assert mod[0]["vars"]["abar"]["="] == "bar('abar')"
 
+
 class TestEval(util.F2PyTest):
     def test_eval_scalar(self):
         eval_scalar = crackfortran._eval_scalar
@@ -257,13 +262,76 @@ class TestFortranReader(util.F2PyTest):
     def test_input_encoding(self, tmp_path, encoding):
         # gh-635
         f_path = tmp_path / f"input_with_{encoding}_encoding.f90"
-        # explicit BOM is required for UTF8
-        bom = {'utf-8': codecs.BOM_UTF8}.get(encoding, b'')
         with f_path.open('w', encoding=encoding) as ff:
-            ff.write(bom.decode(encoding) +
-                     """
+            ff.write("""
                      subroutine foo()
                      end subroutine foo
                      """)
         mod = crackfortran.crackfortran([str(f_path)])
         assert mod[0]['name'] == 'foo'
+
+
+class TestUnicodeComment(util.F2PyTest):
+    sources = [util.getpath("tests", "src", "crackfortran", "unicode_comment.f90")]
+
+    @pytest.mark.skipif(
+        (importlib.util.find_spec("charset_normalizer") is None),
+        reason="test requires charset_normalizer which is not installed",
+    )
+    def test_encoding_comment(self):
+        self.module.foo(3)
+
+
+class TestNameArgsPatternBacktracking:
+    @pytest.mark.parametrize(
+        ['adversary'],
+        [
+            ('@)@bind@(@',),
+            ('@)@bind                         @(@',),
+            ('@)@bind foo bar baz@(@',)
+        ]
+    )
+    def test_nameargspattern_backtracking(self, adversary):
+        '''address ReDOS vulnerability:
+        https://github.com/numpy/numpy/issues/23338'''
+        trials_per_batch = 12
+        batches_per_regex = 4
+        start_reps, end_reps = 15, 25
+        for ii in range(start_reps, end_reps):
+            repeated_adversary = adversary * ii
+            # test times in small batches.
+            # this gives us more chances to catch a bad regex
+            # while still catching it before too long if it is bad
+            for _ in range(batches_per_regex):
+                times = []
+                for _ in range(trials_per_batch):
+                    t0 = time.perf_counter()
+                    mtch = nameargspattern.search(repeated_adversary)
+                    times.append(time.perf_counter() - t0)
+                # our pattern should be much faster than 0.2s per search
+                # it's unlikely that a bad regex will pass even on fast CPUs
+                assert np.median(times) < 0.2
+            assert not mtch
+            # if the adversary is capped with @)@, it becomes acceptable
+            # according to the old version of the regex.
+            # that should still be true.
+            good_version_of_adversary = repeated_adversary + '@)@'
+            assert nameargspattern.search(good_version_of_adversary)
+
+
+class TestFunctionReturn(util.F2PyTest):
+    sources = [util.getpath("tests", "src", "crackfortran", "gh23598.f90")]
+
+    def test_function_rettype(self):
+        # gh-23598
+        assert self.module.intproduct(3, 4) == 12
+
+
+class TestFortranGroupCounters(util.F2PyTest):
+    def test_end_if_comment(self):
+        # gh-23533
+        fpath = util.getpath("tests", "src", "crackfortran", "gh23533.f")
+        try:
+            crackfortran.crackfortran([str(fpath)])
+        except Exception as exc:
+            assert False, f"'crackfortran.crackfortran' raised an exception {exc}"
diff --git a/numpy/f2py/tests/test_f2py2e.py b/numpy/f2py/tests/test_f2py2e.py
index 2c10f046f..5f7b56a68 100644
--- a/numpy/f2py/tests/test_f2py2e.py
+++ b/numpy/f2py/tests/test_f2py2e.py
@@ -63,6 +63,15 @@ def hello_world_f90(tmpdir_factory):
 
 
 @pytest.fixture(scope="session")
+def gh23598_warn(tmpdir_factory):
+    """F90 file for testing warnings in gh23598"""
+    fdat = util.getpath("tests", "src", "crackfortran", "gh23598Warn.f90").read_text()
+    fn = tmpdir_factory.getbasetemp() / "gh23598Warn.f90"
+    fn.write_text(fdat, encoding="ascii")
+    return fn
+
+
+@pytest.fixture(scope="session")
 def hello_world_f77(tmpdir_factory):
     """Generates a single f77 file for testing"""
     fdat = util.getpath("tests", "src", "cli", "hi77.f").read_text()
@@ -91,6 +100,19 @@ def f2cmap_f90(tmpdir_factory):
     return fn
 
 
+def test_gh23598_warn(capfd, gh23598_warn, monkeypatch):
+    foutl = get_io_paths(gh23598_warn, mname="test")
+    ipath = foutl.f90inp
+    monkeypatch.setattr(
+        sys, "argv",
+        f'f2py {ipath} -m test'.split())
+
+    with util.switchdir(ipath.parent):
+        f2pycli()  # Generate files
+        wrapper = foutl.wrap90.read_text()
+        assert "intproductf2pywrap, intpr" not in wrapper
+
+
 def test_gen_pyf(capfd, hello_world_f90, monkeypatch):
     """Ensures that a signature file is generated via the CLI
     CLI :: -h
diff --git a/numpy/f2py/tests/test_kind.py b/numpy/f2py/tests/test_kind.py
index f0cb61fb6..69b85aaad 100644
--- a/numpy/f2py/tests/test_kind.py
+++ b/numpy/f2py/tests/test_kind.py
@@ -1,5 +1,6 @@
 import os
 import pytest
+import platform
 
 from numpy.f2py.crackfortran import (
     _selected_int_kind_func as selected_int_kind,
@@ -11,8 +12,8 @@ from . import util
 class TestKind(util.F2PyTest):
     sources = [util.getpath("tests", "src", "kind", "foo.f90")]
 
-    def test_all(self):
-        selectedrealkind = self.module.selectedrealkind
+    def test_int(self):
+        """Test `int` kind_func for integers up to 10**40."""
         selectedintkind = self.module.selectedintkind
 
         for i in range(40):
@@ -20,7 +21,27 @@ class TestKind(util.F2PyTest):
                 i
             ), f"selectedintkind({i}): expected {selected_int_kind(i)!r} but got {selectedintkind(i)!r}"
 
-        for i in range(20):
+    def test_real(self):
+        """
+        Test (processor-dependent) `real` kind_func for real numbers
+        of up to 31 digits precision (extended/quadruple).
+        """
+        selectedrealkind = self.module.selectedrealkind
+
+        for i in range(32):
+            assert selectedrealkind(i) == selected_real_kind(
+                i
+            ), f"selectedrealkind({i}): expected {selected_real_kind(i)!r} but got {selectedrealkind(i)!r}"
+
+    @pytest.mark.xfail(platform.machine().lower().startswith("ppc"),
+                       reason="Some PowerPC may not support full IEEE 754 precision")
+    def test_quad_precision(self):
+        """
+        Test kind_func for quadruple precision [`real(16)`] of 32+ digits .
+        """
+        selectedrealkind = self.module.selectedrealkind
+
+        for i in range(32, 40):
             assert selectedrealkind(i) == selected_real_kind(
                 i
             ), f"selectedrealkind({i}): expected {selected_real_kind(i)!r} but got {selectedrealkind(i)!r}"
diff --git a/numpy/f2py/tests/util.py b/numpy/f2py/tests/util.py
index ad8c7a37e..26fa7e49d 100644
--- a/numpy/f2py/tests/util.py
+++ b/numpy/f2py/tests/util.py
@@ -6,6 +6,7 @@ Utility functions for
 - determining paths to tests
 
 """
+import glob
 import os
 import sys
 import subprocess
@@ -20,7 +21,7 @@ import numpy
 
 from pathlib import Path
 from numpy.compat import asbytes, asstr
-from numpy.testing import temppath
+from numpy.testing import temppath, IS_WASM
 from importlib import import_module
 
 #
@@ -30,6 +31,10 @@ from importlib import import_module
 _module_dir = None
 _module_num = 5403
 
+if sys.platform == "cygwin":
+    NUMPY_INSTALL_ROOT = Path(__file__).parent.parent.parent
+    _module_list = list(NUMPY_INSTALL_ROOT.glob("**/*.dll"))
+
 
 def _cleanup():
     global _module_dir
@@ -147,6 +152,21 @@ def build_module(source_files, options=[], skip=[], only=[], module_name=None):
         for fn in dst_sources:
             os.unlink(fn)
 
+    # Rebase (Cygwin-only)
+    if sys.platform == "cygwin":
+        # If someone starts deleting modules after import, this will
+        # need to change to record how big each module is, rather than
+        # relying on rebase being able to find that from the files.
+        _module_list.extend(
+            glob.glob(os.path.join(d, "{:s}*".format(module_name)))
+        )
+        subprocess.check_call(
+            ["/usr/bin/rebase", "--database", "--oblivious", "--verbose"]
+            + _module_list
+        )
+
+
+
     # Import
     return import_module(module_name)
 
@@ -187,6 +207,9 @@ def _get_compiler_status():
         return _compiler_status
 
     _compiler_status = (False, False, False)
+    if IS_WASM:
+        # Can't run compiler from inside WASM.
+        return _compiler_status
 
     # XXX: this is really ugly. But I don't know how to invoke Distutils
     #      in a safer way...
diff --git a/numpy/fft/_pocketfft.c b/numpy/fft/_pocketfft.c
index 1eb2eba18..37649386f 100644
--- a/numpy/fft/_pocketfft.c
+++ b/numpy/fft/_pocketfft.c
@@ -22,10 +22,8 @@
 #include <string.h>
 #include <stdlib.h>
 
-#define restrict NPY_RESTRICT
-
 #define RALLOC(type,num) \
-  ((type *)malloc((num)*sizeof(type)))
+  (assert(num != 0), ((type *)malloc((num)*sizeof(type))))
 #define DEALLOC(ptr) \
   do { free(ptr); (ptr)=NULL; } while(0)
 
@@ -1058,8 +1056,10 @@ static cfftp_plan make_cfftp_plan (size_t length)
   if (length==1) return plan;
   if (cfftp_factorize(plan)!=0) { DEALLOC(plan); return NULL; }
   size_t tws=cfftp_twsize(plan);
-  plan->mem=RALLOC(cmplx,tws);
-  if (!plan->mem) { DEALLOC(plan); return NULL; }
+  if (tws != 0) {
+    plan->mem=RALLOC(cmplx,tws);
+    if (!plan->mem) { DEALLOC(plan); return NULL; }
+  }
   if (cfftp_comp_twiddle(plan)!=0)
     { DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
   return plan;
@@ -1822,7 +1822,6 @@ static size_t rfftp_twsize(rfftp_plan plan)
     l1*=ip;
     }
   return twsize;
-  return 0;
   }
 
 WARN_UNUSED_RESULT NOINLINE static int rfftp_comp_twiddle (rfftp_plan plan)
@@ -1878,8 +1877,10 @@ NOINLINE static rfftp_plan make_rfftp_plan (size_t length)
   if (length==1) return plan;
   if (rfftp_factorize(plan)!=0) { DEALLOC(plan); return NULL; }
   size_t tws=rfftp_twsize(plan);
-  plan->mem=RALLOC(double,tws);
-  if (!plan->mem) { DEALLOC(plan); return NULL; }
+  if (tws != 0) {
+    plan->mem=RALLOC(double,tws);
+    if (!plan->mem) { DEALLOC(plan); return NULL; }
+  }
   if (rfftp_comp_twiddle(plan)!=0)
     { DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
   return plan;
@@ -2231,6 +2232,8 @@ execute_real_forward(PyObject *a1, double fct)
 {
     rfft_plan plan=NULL;
     int fail = 0;
+    npy_intp tdim[NPY_MAXDIMS];
+
     PyArrayObject *data = (PyArrayObject *)PyArray_FromAny(a1,
             PyArray_DescrFromType(NPY_DOUBLE), 1, 0,
             NPY_ARRAY_DEFAULT | NPY_ARRAY_ENSUREARRAY | NPY_ARRAY_FORCECAST,
@@ -2240,15 +2243,11 @@ execute_real_forward(PyObject *a1, double fct)
     int ndim = PyArray_NDIM(data);
     const npy_intp *odim = PyArray_DIMS(data);
     int npts = odim[ndim - 1];
-    npy_intp *tdim=(npy_intp *)malloc(ndim*sizeof(npy_intp));
-    if (!tdim)
-      { Py_XDECREF(data); return NULL; }
     for (int d=0; d<ndim-1; ++d)
       tdim[d] = odim[d];
     tdim[ndim-1] = npts/2 + 1;
     PyArrayObject *ret = (PyArrayObject *)PyArray_Empty(ndim,
             tdim, PyArray_DescrFromType(NPY_CDOUBLE), 0);
-    free(tdim);
     if (!ret) fail=1;
     if (!fail) {
       int rstep = PyArray_DIM(ret, PyArray_NDIM(ret) - 1)*2;
diff --git a/numpy/fft/helper.pyi b/numpy/fft/helper.pyi
index b49fc88f7..9b6525190 100644
--- a/numpy/fft/helper.pyi
+++ b/numpy/fft/helper.pyi
@@ -27,21 +27,21 @@ def ifftshift(x: ArrayLike, axes: None | _ShapeLike = ...) -> NDArray[Any]: ...
 @overload
 def fftfreq(
     n: int | integer[Any],
-    d: _ArrayLikeFloat_co,
+    d: _ArrayLikeFloat_co = ...,
 ) -> NDArray[floating[Any]]: ...
 @overload
 def fftfreq(
     n: int | integer[Any],
-    d: _ArrayLikeComplex_co,
+    d: _ArrayLikeComplex_co = ...,
 ) -> NDArray[complexfloating[Any, Any]]: ...
 
 @overload
 def rfftfreq(
     n: int | integer[Any],
-    d: _ArrayLikeFloat_co,
+    d: _ArrayLikeFloat_co = ...,
 ) -> NDArray[floating[Any]]: ...
 @overload
 def rfftfreq(
     n: int | integer[Any],
-    d: _ArrayLikeComplex_co,
+    d: _ArrayLikeComplex_co = ...,
 ) -> NDArray[complexfloating[Any, Any]]: ...
diff --git a/numpy/fft/meson.build b/numpy/fft/meson.build
new file mode 100644
index 000000000..fedbf6349
--- /dev/null
+++ b/numpy/fft/meson.build
@@ -0,0 +1,33 @@
+largefile_define = []
+if host_machine.system() == 'aix' or host_machine.system() == 'AIX'
+  largefile_define += '-D_LARGE_FILES'
+endif
+
+py.extension_module('_pocketfft_internal',
+  '_pocketfft.c',
+  c_args: largefile_define,
+  dependencies: np_core_dep,
+  install: true,
+  subdir: 'numpy/fft',
+)
+
+py.install_sources(
+  [
+    '__init__.py',
+    '__init__.pyi',
+    '_pocketfft.py',
+    '_pocketfft.pyi',
+    'helper.py',
+    'helper.pyi',
+  ],
+  subdir: 'numpy/fft'
+)
+
+py.install_sources(
+  [
+    'tests/__init__.py',
+    'tests/test_helper.py',
+    'tests/test_pocketfft.py',
+  ],
+  subdir: 'numpy/fft/tests'
+)
diff --git a/numpy/fft/tests/test_pocketfft.py b/numpy/fft/tests/test_pocketfft.py
index 392644237..122a9fac9 100644
--- a/numpy/fft/tests/test_pocketfft.py
+++ b/numpy/fft/tests/test_pocketfft.py
@@ -2,7 +2,7 @@ import numpy as np
 import pytest
 from numpy.random import random
 from numpy.testing import (
-        assert_array_equal, assert_raises, assert_allclose
+        assert_array_equal, assert_raises, assert_allclose, IS_WASM
         )
 import threading
 import queue
@@ -268,6 +268,7 @@ def test_fft_with_order(dtype, order, fft):
         raise ValueError()
 
 
+@pytest.mark.skipif(IS_WASM, reason="Cannot start thread")
 class TestFFTThreadSafe:
     threads = 16
     input_shape = (800, 200)
diff --git a/numpy/lib/__init__.py b/numpy/lib/__init__.py
index 58166d4b1..d3cc9fee4 100644
--- a/numpy/lib/__init__.py
+++ b/numpy/lib/__init__.py
@@ -11,7 +11,6 @@ Most contains basic functions that are used by several submodules and are
 useful to have in the main name-space.
 
 """
-import math
 
 from numpy.version import version as __version__
 
@@ -58,7 +57,7 @@ from .arraypad import *
 from ._version import *
 from numpy.core._multiarray_umath import tracemalloc_domain
 
-__all__ = ['emath', 'math', 'tracemalloc_domain', 'Arrayterator']
+__all__ = ['emath', 'tracemalloc_domain', 'Arrayterator']
 __all__ += type_check.__all__
 __all__ += index_tricks.__all__
 __all__ += function_base.__all__
@@ -77,3 +76,19 @@ __all__ += histograms.__all__
 from numpy._pytesttester import PytestTester
 test = PytestTester(__name__)
 del PytestTester
+
+def __getattr__(attr):
+    # Warn for reprecated attributes
+    import math
+    import warnings
+
+    if attr == 'math':
+        warnings.warn(
+            "`np.lib.math` is a deprecated alias for the standard library "
+            "`math` module (Deprecated Numpy 1.25). Replace usages of "
+            "`numpy.lib.math` with `math`", DeprecationWarning, stacklevel=2)
+        return math
+    else:
+        raise AttributeError("module {!r} has no attribute "
+                             "{!r}".format(__name__, attr))
+        
diff --git a/numpy/lib/__init__.pyi b/numpy/lib/__init__.pyi
index 0e3da5b41..d3553bbcc 100644
--- a/numpy/lib/__init__.pyi
+++ b/numpy/lib/__init__.pyi
@@ -64,7 +64,6 @@ from numpy.lib.function_base import (
     digitize as digitize,
     cov as cov,
     corrcoef as corrcoef,
-    msort as msort,
     median as median,
     sinc as sinc,
     hamming as hamming,
@@ -231,6 +230,7 @@ from numpy.lib.utils import (
     lookfor as lookfor,
     byte_bounds as byte_bounds,
     safe_eval as safe_eval,
+    show_runtime as show_runtime,
 )
 
 from numpy.core.multiarray import (
diff --git a/numpy/lib/_datasource.py b/numpy/lib/_datasource.py
index b7778234e..613733fa5 100644
--- a/numpy/lib/_datasource.py
+++ b/numpy/lib/_datasource.py
@@ -37,7 +37,7 @@ Example::
 import os
 import io
 
-from numpy.core.overrides import set_module
+from .._utils import set_module
 
 
 _open = open
diff --git a/numpy/lib/_iotools.py b/numpy/lib/_iotools.py
index 4a5ac1285..534d1b3ee 100644
--- a/numpy/lib/_iotools.py
+++ b/numpy/lib/_iotools.py
@@ -513,8 +513,8 @@ class StringConverter:
                     (nx.complexfloating, complex, nx.nan + 0j),
                     # Last, try with the string types (must be last, because
                     # `_mapper[-1]` is used as default in some cases)
-                    (nx.unicode_, asunicode, '???'),
-                    (nx.string_, asbytes, '???'),
+                    (nx.str_, asunicode, '???'),
+                    (nx.bytes_, asbytes, '???'),
                     ])
 
     @classmethod
diff --git a/numpy/lib/arraypad.py b/numpy/lib/arraypad.py
index 8830b8147..b06a645d8 100644
--- a/numpy/lib/arraypad.py
+++ b/numpy/lib/arraypad.py
@@ -378,7 +378,7 @@ def _set_reflect_both(padded, axis, width_pair, method, include_edge=False):
     return left_pad, right_pad
 
 
-def _set_wrap_both(padded, axis, width_pair):
+def _set_wrap_both(padded, axis, width_pair, original_period):
     """
     Pad `axis` of `arr` with wrapped values.
 
@@ -391,6 +391,8 @@ def _set_wrap_both(padded, axis, width_pair):
     width_pair : (int, int)
         Pair of widths that mark the pad area on both sides in the given
         dimension.
+    original_period : int
+        Original length of data on `axis` of `arr`.
 
     Returns
     -------
@@ -400,6 +402,9 @@ def _set_wrap_both(padded, axis, width_pair):
     """
     left_pad, right_pad = width_pair
     period = padded.shape[axis] - right_pad - left_pad
+    # Avoid wrapping with only a subset of the original area by ensuring period
+    # can only be a multiple of the original area's length.
+    period = period // original_period * original_period
 
     # If the current dimension of `arr` doesn't contain enough valid values
     # (not part of the undefined pad area) we need to pad multiple times.
@@ -410,14 +415,12 @@ def _set_wrap_both(padded, axis, width_pair):
 
     if left_pad > 0:
         # Pad with wrapped values on left side
-        # First slice chunk from right side of the non-pad area.
+        # First slice chunk from left side of the non-pad area.
         # Use min(period, left_pad) to ensure that chunk is not larger than
-        # pad area
-        right_slice = _slice_at_axis(
-            slice(-right_pad - min(period, left_pad),
-                  -right_pad if right_pad != 0 else None),
-            axis
-        )
+        # pad area.
+        slice_end = left_pad + period
+        slice_start = slice_end - min(period, left_pad)
+        right_slice = _slice_at_axis(slice(slice_start, slice_end), axis)
         right_chunk = padded[right_slice]
 
         if left_pad > period:
@@ -431,11 +434,12 @@ def _set_wrap_both(padded, axis, width_pair):
 
     if right_pad > 0:
         # Pad with wrapped values on right side
-        # First slice chunk from left side of the non-pad area.
+        # First slice chunk from right side of the non-pad area.
         # Use min(period, right_pad) to ensure that chunk is not larger than
-        # pad area
-        left_slice = _slice_at_axis(
-            slice(left_pad, left_pad + min(period, right_pad),), axis)
+        # pad area.
+        slice_start = -right_pad - period
+        slice_end = slice_start + min(period, right_pad)
+        left_slice = _slice_at_axis(slice(slice_start, slice_end), axis)
         left_chunk = padded[left_slice]
 
         if right_pad > period:
@@ -537,11 +541,12 @@ def pad(array, pad_width, mode='constant', **kwargs):
         The array to pad.
     pad_width : {sequence, array_like, int}
         Number of values padded to the edges of each axis.
-        ((before_1, after_1), ... (before_N, after_N)) unique pad widths
+        ``((before_1, after_1), ... (before_N, after_N))`` unique pad widths
         for each axis.
-        ((before, after),) yields same before and after pad for each axis.
-        (pad,) or int is a shortcut for before = after = pad width for all
-        axes.
+        ``(before, after)`` or ``((before, after),)`` yields same before
+        and after pad for each axis.
+        ``(pad,)`` or ``int`` is a shortcut for before = after = pad width
+        for all axes.
     mode : str or function, optional
         One of the following string values or a user supplied function.
 
@@ -586,14 +591,14 @@ def pad(array, pad_width, mode='constant', **kwargs):
         Used in 'maximum', 'mean', 'median', and 'minimum'.  Number of
         values at edge of each axis used to calculate the statistic value.
 
-        ((before_1, after_1), ... (before_N, after_N)) unique statistic
+        ``((before_1, after_1), ... (before_N, after_N))`` unique statistic
         lengths for each axis.
 
-        ((before, after),) yields same before and after statistic lengths
-        for each axis.
+        ``(before, after)`` or ``((before, after),)`` yields same before
+        and after statistic lengths for each axis.
 
-        (stat_length,) or int is a shortcut for before = after = statistic
-        length for all axes.
+        ``(stat_length,)`` or ``int`` is a shortcut for
+        ``before = after = statistic`` length for all axes.
 
         Default is ``None``, to use the entire axis.
     constant_values : sequence or scalar, optional
@@ -603,11 +608,11 @@ def pad(array, pad_width, mode='constant', **kwargs):
         ``((before_1, after_1), ... (before_N, after_N))`` unique pad constants
         for each axis.
 
-        ``((before, after),)`` yields same before and after constants for each
-        axis.
+        ``(before, after)`` or ``((before, after),)`` yields same before
+        and after constants for each axis.
 
-        ``(constant,)`` or ``constant`` is a shortcut for ``before = after = constant`` for
-        all axes.
+        ``(constant,)`` or ``constant`` is a shortcut for
+        ``before = after = constant`` for all axes.
 
         Default is 0.
     end_values : sequence or scalar, optional
@@ -617,11 +622,11 @@ def pad(array, pad_width, mode='constant', **kwargs):
         ``((before_1, after_1), ... (before_N, after_N))`` unique end values
         for each axis.
 
-        ``((before, after),)`` yields same before and after end values for each
-        axis.
+        ``(before, after)`` or ``((before, after),)`` yields same before
+        and after end values for each axis.
 
-        ``(constant,)`` or ``constant`` is a shortcut for ``before = after = constant`` for
-        all axes.
+        ``(constant,)`` or ``constant`` is a shortcut for
+        ``before = after = constant`` for all axes.
 
         Default is 0.
     reflect_type : {'even', 'odd'}, optional
@@ -866,11 +871,12 @@ def pad(array, pad_width, mode='constant', **kwargs):
     elif mode == "wrap":
         for axis, (left_index, right_index) in zip(axes, pad_width):
             roi = _view_roi(padded, original_area_slice, axis)
+            original_period = padded.shape[axis] - right_index - left_index
             while left_index > 0 or right_index > 0:
                 # Iteratively pad until dimension is filled with wrapped
                 # values. This is necessary if the pad area is larger than
                 # the length of the original values in the current dimension.
                 left_index, right_index = _set_wrap_both(
-                    roi, axis, (left_index, right_index))
+                    roi, axis, (left_index, right_index), original_period)
 
     return padded
diff --git a/numpy/lib/arraysetops.py b/numpy/lib/arraysetops.py
index cf5f47a82..300bbda26 100644
--- a/numpy/lib/arraysetops.py
+++ b/numpy/lib/arraysetops.py
@@ -649,8 +649,24 @@ def in1d(ar1, ar2, assume_unique=False, invert=False, *, kind=None):
         ar2_range = int(ar2_max) - int(ar2_min)
 
         # Constraints on whether we can actually use the table method:
-        range_safe_from_overflow = ar2_range < np.iinfo(ar2.dtype).max
+        #  1. Assert memory usage is not too large
         below_memory_constraint = ar2_range <= 6 * (ar1.size + ar2.size)
+        #  2. Check overflows for (ar2 - ar2_min); dtype=ar2.dtype
+        range_safe_from_overflow = ar2_range <= np.iinfo(ar2.dtype).max
+        #  3. Check overflows for (ar1 - ar2_min); dtype=ar1.dtype
+        if ar1.size > 0:
+            ar1_min = np.min(ar1)
+            ar1_max = np.max(ar1)
+
+            # After masking, the range of ar1 is guaranteed to be
+            # within the range of ar2:
+            ar1_upper = min(int(ar1_max), int(ar2_max))
+            ar1_lower = max(int(ar1_min), int(ar2_min))
+
+            range_safe_from_overflow &= all((
+                ar1_upper - int(ar2_min) <= np.iinfo(ar1.dtype).max,
+                ar1_lower - int(ar2_min) >= np.iinfo(ar1.dtype).min
+            ))
 
         # Optimal performance is for approximately
         # log10(size) > (log10(range) - 2.27) / 0.927.
@@ -687,7 +703,7 @@ def in1d(ar1, ar2, assume_unique=False, invert=False, *, kind=None):
         elif kind == 'table':  # not range_safe_from_overflow
             raise RuntimeError(
                 "You have specified kind='table', "
-                "but the range of values in `ar2` exceeds the "
+                "but the range of values in `ar2` or `ar1` exceed the "
                 "maximum integer of the datatype. "
                 "Please set `kind` to None or 'sort'."
             )
diff --git a/numpy/lib/format.py b/numpy/lib/format.py
index 54fd0b0bc..ef50fb19d 100644
--- a/numpy/lib/format.py
+++ b/numpy/lib/format.py
@@ -437,15 +437,15 @@ def _write_array_header(fp, d, version=None):
         header.append("'%s': %s, " % (key, repr(value)))
     header.append("}")
     header = "".join(header)
-    
+
     # Add some spare space so that the array header can be modified in-place
     # when changing the array size, e.g. when growing it by appending data at
-    # the end. 
+    # the end.
     shape = d['shape']
     header += " " * ((GROWTH_AXIS_MAX_DIGITS - len(repr(
         shape[-1 if d['fortran_order'] else 0]
     ))) if len(shape) > 0 else 0)
-    
+
     if version is None:
         header = _wrap_header_guess_version(header)
     else:
@@ -505,7 +505,7 @@ def read_array_header_1_0(fp, max_header_size=_MAX_HEADER_SIZE):
     max_header_size : int, optional
         Maximum allowed size of the header.  Large headers may not be safe
         to load securely and thus require explicitly passing a larger value.
-        See :py:meth:`ast.literal_eval()` for details.
+        See :py:func:`ast.literal_eval()` for details.
 
     Raises
     ------
@@ -532,7 +532,7 @@ def read_array_header_2_0(fp, max_header_size=_MAX_HEADER_SIZE):
     max_header_size : int, optional
         Maximum allowed size of the header.  Large headers may not be safe
         to load securely and thus require explicitly passing a larger value.
-        See :py:meth:`ast.literal_eval()` for details.
+        See :py:func:`ast.literal_eval()` for details.
 
     Returns
     -------
@@ -623,13 +623,27 @@ def _read_array_header(fp, version, max_header_size=_MAX_HEADER_SIZE):
     #   "descr" : dtype.descr
     # Versions (2, 0) and (1, 0) could have been created by a Python 2
     # implementation before header filtering was implemented.
-    if version <= (2, 0):
-        header = _filter_header(header)
+    #
+    # For performance reasons, we try without _filter_header first though
     try:
         d = safe_eval(header)
     except SyntaxError as e:
-        msg = "Cannot parse header: {!r}"
-        raise ValueError(msg.format(header)) from e
+        if version <= (2, 0):
+            header = _filter_header(header)
+            try:
+                d = safe_eval(header)
+            except SyntaxError as e2:
+                msg = "Cannot parse header: {!r}"
+                raise ValueError(msg.format(header)) from e2
+            else:
+                warnings.warn(
+                    "Reading `.npy` or `.npz` file required additional "
+                    "header parsing as it was created on Python 2. Save the "
+                    "file again to speed up loading and avoid this warning.",
+                    UserWarning, stacklevel=4)
+        else:
+            msg = "Cannot parse header: {!r}"
+            raise ValueError(msg.format(header)) from e
     if not isinstance(d, dict):
         msg = "Header is not a dictionary: {!r}"
         raise ValueError(msg.format(d))
@@ -750,7 +764,7 @@ def read_array(fp, allow_pickle=False, pickle_kwargs=None, *,
     max_header_size : int, optional
         Maximum allowed size of the header.  Large headers may not be safe
         to load securely and thus require explicitly passing a larger value.
-        See :py:meth:`ast.literal_eval()` for details.
+        See :py:func:`ast.literal_eval()` for details.
         This option is ignored when `allow_pickle` is passed.  In that case
         the file is by definition trusted and the limit is unnecessary.
 
@@ -869,7 +883,7 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None,
     max_header_size : int, optional
         Maximum allowed size of the header.  Large headers may not be safe
         to load securely and thus require explicitly passing a larger value.
-        See :py:meth:`ast.literal_eval()` for details.
+        See :py:func:`ast.literal_eval()` for details.
 
     Returns
     -------
diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py
index 6065dd0d3..02e141920 100644
--- a/numpy/lib/function_base.py
+++ b/numpy/lib/function_base.py
@@ -4,6 +4,7 @@ import re
 import sys
 import warnings
 
+from .._utils import set_module
 import numpy as np
 import numpy.core.numeric as _nx
 from numpy.core import transpose
@@ -19,12 +20,11 @@ from numpy.core.fromnumeric import (
     ravel, nonzero, partition, mean, any, sum
     )
 from numpy.core.numerictypes import typecodes
-from numpy.core.overrides import set_module
 from numpy.core import overrides
 from numpy.core.function_base import add_newdoc
 from numpy.lib.twodim_base import diag
 from numpy.core.multiarray import (
-    _insert, add_docstring, bincount, normalize_axis_index, _monotonicity,
+    _place, add_docstring, bincount, normalize_axis_index, _monotonicity,
     interp as compiled_interp, interp_complex as compiled_interp_complex
     )
 from numpy.core.umath import _add_newdoc_ufunc as add_newdoc_ufunc
@@ -161,6 +161,8 @@ def rot90(m, k=1, axes=(0, 1)):
     Rotate an array by 90 degrees in the plane specified by axes.
 
     Rotation direction is from the first towards the second axis.
+    This means for a 2D array with the default `k` and `axes`, the
+    rotation will be counterclockwise.
 
     Parameters
     ----------
@@ -1309,6 +1311,8 @@ def gradient(f, *varargs, axis=None, edge_order=1):
 
     if len_axes == 1:
         return outvals[0]
+    elif np._using_numpy2_behavior():
+        return tuple(outvals)
     else:
         return outvals
 
@@ -1947,11 +1951,7 @@ def place(arr, mask, vals):
            [44, 55, 44]])
 
     """
-    if not isinstance(arr, np.ndarray):
-        raise TypeError("argument 1 must be numpy.ndarray, "
-                        "not {name}".format(name=type(arr).__name__))
-
-    return _insert(arr, mask, vals)
+    return _place(arr, mask, vals)
 
 
 def disp(mesg, device=None, linefeed=True):
@@ -2117,10 +2117,10 @@ def _create_arrays(broadcast_shape, dim_sizes, list_of_core_dims, dtypes,
 @set_module('numpy')
 class vectorize:
     """
-    vectorize(pyfunc, otypes=None, doc=None, excluded=None, cache=False,
-              signature=None)
+    vectorize(pyfunc=np._NoValue, otypes=None, doc=None, excluded=None,
+    cache=False, signature=None)
 
-    Generalized function class.
+    Returns an object that acts like pyfunc, but takes arrays as input.
 
     Define a vectorized function which takes a nested sequence of objects or
     numpy arrays as inputs and returns a single numpy array or a tuple of numpy
@@ -2134,8 +2134,9 @@ class vectorize:
 
     Parameters
     ----------
-    pyfunc : callable
+    pyfunc : callable, optional
         A python function or method.
+        Can be omitted to produce a decorator with keyword arguments.
     otypes : str or list of dtypes, optional
         The output data type. It must be specified as either a string of
         typecode characters or a list of data type specifiers. There should
@@ -2167,8 +2168,9 @@ class vectorize:
 
     Returns
     -------
-    vectorized : callable
-        Vectorized function.
+    out : callable
+        A vectorized function if ``pyfunc`` was provided,
+        a decorator otherwise.
 
     See Also
     --------
@@ -2265,18 +2267,44 @@ class vectorize:
            [0., 0., 1., 2., 1., 0.],
            [0., 0., 0., 1., 2., 1.]])
 
+    Decorator syntax is supported.  The decorator can be called as
+    a function to provide keyword arguments.
+    >>>@np.vectorize
+    ...def identity(x):
+    ...    return x
+    ...
+    >>>identity([0, 1, 2])
+    array([0, 1, 2])
+    >>>@np.vectorize(otypes=[float])
+    ...def as_float(x):
+    ...    return x
+    ...
+    >>>as_float([0, 1, 2])
+    array([0., 1., 2.])
     """
-    def __init__(self, pyfunc, otypes=None, doc=None, excluded=None,
-                 cache=False, signature=None):
+    def __init__(self, pyfunc=np._NoValue, otypes=None, doc=None,
+                 excluded=None, cache=False, signature=None):
+
+        if (pyfunc != np._NoValue) and (not callable(pyfunc)):
+            #Splitting the error message to keep
+            #the length below 79 characters.
+            part1 = "When used as a decorator, "
+            part2 = "only accepts keyword arguments."
+            raise TypeError(part1 + part2)
+
         self.pyfunc = pyfunc
         self.cache = cache
         self.signature = signature
-        self._ufunc = {}    # Caching to improve default performance
+        if pyfunc != np._NoValue and hasattr(pyfunc, '__name__'):
+            self.__name__ = pyfunc.__name__
 
-        if doc is None:
+        self._ufunc = {}    # Caching to improve default performance
+        self._doc = None
+        self.__doc__ = doc
+        if doc is None and hasattr(pyfunc, '__doc__'):
             self.__doc__ = pyfunc.__doc__
         else:
-            self.__doc__ = doc
+            self._doc = doc
 
         if isinstance(otypes, str):
             for char in otypes:
@@ -2298,7 +2326,15 @@ class vectorize:
         else:
             self._in_and_out_core_dims = None
 
-    def __call__(self, *args, **kwargs):
+    def _init_stage_2(self, pyfunc, *args, **kwargs):
+        self.__name__ = pyfunc.__name__
+        self.pyfunc = pyfunc
+        if self._doc is None:
+            self.__doc__ = pyfunc.__doc__
+        else:
+            self.__doc__ = self._doc
+
+    def _call_as_normal(self, *args, **kwargs):
         """
         Return arrays with the results of `pyfunc` broadcast (vectorized) over
         `args` and `kwargs` not in `excluded`.
@@ -2328,6 +2364,13 @@ class vectorize:
 
         return self._vectorize_call(func=func, args=vargs)
 
+    def __call__(self, *args, **kwargs):
+        if self.pyfunc is np._NoValue:
+            self._init_stage_2(*args, **kwargs)
+            return self
+
+        return self._call_as_normal(*args, **kwargs)
+
     def _get_ufunc_and_otypes(self, func, args):
         """Return (ufunc, otypes)."""
         # frompyfunc will fail if args is empty
@@ -2693,7 +2736,7 @@ def cov(m, y=None, rowvar=True, bias=False, ddof=None, fweights=None,
 
     if fact <= 0:
         warnings.warn("Degrees of freedom <= 0 for slice",
-                      RuntimeWarning, stacklevel=3)
+                      RuntimeWarning, stacklevel=2)
         fact = 0.0
 
     X -= avg[:, None]
@@ -2842,7 +2885,7 @@ def corrcoef(x, y=None, rowvar=True, bias=np._NoValue, ddof=np._NoValue, *,
     if bias is not np._NoValue or ddof is not np._NoValue:
         # 2015-03-15, 1.10
         warnings.warn('bias and ddof have no effect and are deprecated',
-                      DeprecationWarning, stacklevel=3)
+                      DeprecationWarning, stacklevel=2)
     c = cov(x, y, rowvar, dtype=dtype)
     try:
         d = diag(c)
@@ -2956,10 +2999,15 @@ def blackman(M):
     >>> plt.show()
 
     """
+    # Ensures at least float64 via 0.0.  M should be an integer, but conversion
+    # to double is safe for a range.
+    values = np.array([0.0, M])
+    M = values[1]
+
     if M < 1:
-        return array([], dtype=np.result_type(M, 0.0))
+        return array([], dtype=values.dtype)
     if M == 1:
-        return ones(1, dtype=np.result_type(M, 0.0))
+        return ones(1, dtype=values.dtype)
     n = arange(1-M, M, 2)
     return 0.42 + 0.5*cos(pi*n/(M-1)) + 0.08*cos(2.0*pi*n/(M-1))
 
@@ -3064,10 +3112,15 @@ def bartlett(M):
     >>> plt.show()
 
     """
+    # Ensures at least float64 via 0.0.  M should be an integer, but conversion
+    # to double is safe for a range.
+    values = np.array([0.0, M])
+    M = values[1]
+
     if M < 1:
-        return array([], dtype=np.result_type(M, 0.0))
+        return array([], dtype=values.dtype)
     if M == 1:
-        return ones(1, dtype=np.result_type(M, 0.0))
+        return ones(1, dtype=values.dtype)
     n = arange(1-M, M, 2)
     return where(less_equal(n, 0), 1 + n/(M-1), 1 - n/(M-1))
 
@@ -3168,10 +3221,15 @@ def hanning(M):
     >>> plt.show()
 
     """
+    # Ensures at least float64 via 0.0.  M should be an integer, but conversion
+    # to double is safe for a range.
+    values = np.array([0.0, M])
+    M = values[1]
+
     if M < 1:
-        return array([], dtype=np.result_type(M, 0.0))
+        return array([], dtype=values.dtype)
     if M == 1:
-        return ones(1, dtype=np.result_type(M, 0.0))
+        return ones(1, dtype=values.dtype)
     n = arange(1-M, M, 2)
     return 0.5 + 0.5*cos(pi*n/(M-1))
 
@@ -3268,10 +3326,15 @@ def hamming(M):
     >>> plt.show()
 
     """
+    # Ensures at least float64 via 0.0.  M should be an integer, but conversion
+    # to double is safe for a range.
+    values = np.array([0.0, M])
+    M = values[1]
+
     if M < 1:
-        return array([], dtype=np.result_type(M, 0.0))
+        return array([], dtype=values.dtype)
     if M == 1:
-        return ones(1, dtype=np.result_type(M, 0.0))
+        return ones(1, dtype=values.dtype)
     n = arange(1-M, M, 2)
     return 0.54 + 0.46*cos(pi*n/(M-1))
 
@@ -3547,11 +3610,19 @@ def kaiser(M, beta):
     >>> plt.show()
 
     """
+    # Ensures at least float64 via 0.0.  M should be an integer, but conversion
+    # to double is safe for a range.  (Simplified result_type with 0.0
+    # strongly typed.  result-type is not/less order sensitive, but that mainly
+    # matters for integers anyway.)
+    values = np.array([0.0, M, beta])
+    M = values[1]
+    beta = values[2]
+
     if M == 1:
-        return np.ones(1, dtype=np.result_type(M, 0.0))
+        return np.ones(1, dtype=values.dtype)
     n = arange(0, M)
     alpha = (M-1)/2.0
-    return i0(beta * sqrt(1-((n-alpha)/alpha)**2.0))/i0(float(beta))
+    return i0(beta * sqrt(1-((n-alpha)/alpha)**2.0))/i0(beta)
 
 
 def _sinc_dispatcher(x):
@@ -3682,14 +3753,14 @@ def msort(a):
     warnings.warn(
         "msort is deprecated, use np.sort(a, axis=0) instead",
         DeprecationWarning,
-        stacklevel=3,
+        stacklevel=2,
     )
     b = array(a, subok=True, copy=True)
     b.sort(0)
     return b
 
 
-def _ureduce(a, func, **kwargs):
+def _ureduce(a, func, keepdims=False, **kwargs):
     """
     Internal Function.
     Call `func` with `a` as first argument swapping the axes to use extended
@@ -3717,13 +3788,20 @@ def _ureduce(a, func, **kwargs):
     """
     a = np.asanyarray(a)
     axis = kwargs.get('axis', None)
+    out = kwargs.get('out', None)
+
+    if keepdims is np._NoValue:
+        keepdims = False
+
+    nd = a.ndim
     if axis is not None:
-        keepdim = list(a.shape)
-        nd = a.ndim
         axis = _nx.normalize_axis_tuple(axis, nd)
 
-        for ax in axis:
-            keepdim[ax] = 1
+        if keepdims:
+            if out is not None:
+                index_out = tuple(
+                    0 if i in axis else slice(None) for i in range(nd))
+                kwargs['out'] = out[(Ellipsis, ) + index_out]
 
         if len(axis) == 1:
             kwargs['axis'] = axis[0]
@@ -3736,12 +3814,27 @@ def _ureduce(a, func, **kwargs):
             # merge reduced axis
             a = a.reshape(a.shape[:nkeep] + (-1,))
             kwargs['axis'] = -1
-        keepdim = tuple(keepdim)
     else:
-        keepdim = (1,) * a.ndim
+        if keepdims:
+            if out is not None:
+                index_out = (0, ) * nd
+                kwargs['out'] = out[(Ellipsis, ) + index_out]
 
     r = func(a, **kwargs)
-    return r, keepdim
+
+    if out is not None:
+        return out
+
+    if keepdims:
+        if axis is None:
+            index_r = (np.newaxis, ) * nd
+        else:
+            index_r = tuple(
+                np.newaxis if i in axis else slice(None)
+                for i in range(nd))
+        r = r[(Ellipsis, ) + index_r]
+
+    return r
 
 
 def _median_dispatcher(
@@ -3831,12 +3924,8 @@ def median(a, axis=None, out=None, overwrite_input=False, keepdims=False):
     >>> assert not np.all(a==b)
 
     """
-    r, k = _ureduce(a, func=_median, axis=axis, out=out,
+    return _ureduce(a, func=_median, keepdims=keepdims, axis=axis, out=out,
                     overwrite_input=overwrite_input)
-    if keepdims:
-        return r.reshape(k)
-    else:
-        return r
 
 
 def _median(a, axis=None, out=None, overwrite_input=False):
@@ -3916,11 +4005,11 @@ def percentile(a,
 
     Parameters
     ----------
-    a : array_like
+    a : array_like of real numbers
         Input array or object that can be converted to an array.
     q : array_like of float
-        Percentile or sequence of percentiles to compute, which must be between
-        0 and 100 inclusive.
+        Percentage or sequence of percentages for the percentiles to compute.
+        Values must be between 0 and 100 inclusive.
     axis : {int, tuple of int, None}, optional
         Axis or axes along which the percentiles are computed. The
         default is to compute the percentile(s) along a flattened
@@ -4017,7 +4106,7 @@ def percentile(a,
     since Python uses 0-based indexing, the code subtracts another 1 from the
     index internally.
 
-    The following formula determines the virtual index ``i + g``, the location 
+    The following formula determines the virtual index ``i + g``, the location
     of the percentile in the sorted sample:
 
     .. math::
@@ -4167,7 +4256,8 @@ def percentile(a,
             xlabel='Percentile',
             ylabel='Estimated percentile value',
             yticks=a)
-        ax.legend()
+        ax.legend(bbox_to_anchor=(1.03, 1))
+        plt.tight_layout()
         plt.show()
 
     References
@@ -4180,6 +4270,11 @@ def percentile(a,
     if interpolation is not None:
         method = _check_interpolation_as_method(
             method, interpolation, "percentile")
+
+    a = np.asanyarray(a)
+    if a.dtype.kind == "c":
+        raise TypeError("a must be an array of real numbers")
+
     q = np.true_divide(q, 100)
     q = asanyarray(q)  # undo any decay that the ufunc performed (see gh-13105)
     if not _quantile_is_valid(q):
@@ -4210,11 +4305,11 @@ def quantile(a,
 
     Parameters
     ----------
-    a : array_like
+    a : array_like of real numbers
         Input array or object that can be converted to an array.
     q : array_like of float
-        Quantile or sequence of quantiles to compute, which must be between
-        0 and 1 inclusive.
+        Probability or sequence of probabilities for the quantiles to compute.
+        Values must be between 0 and 1 inclusive.
     axis : {int, tuple of int, None}, optional
         Axis or axes along which the quantiles are computed. The default is
         to compute the quantile(s) along a flattened version of the array.
@@ -4268,8 +4363,8 @@ def quantile(a,
     Returns
     -------
     quantile : scalar or ndarray
-        If `q` is a single quantile and `axis=None`, then the result
-        is a scalar. If multiple quantiles are given, first axis of
+        If `q` is a single probability and `axis=None`, then the result
+        is a scalar. If multiple probabilies levels are given, first axis of
         the result corresponds to the quantiles. The other axes are
         the axes that remain after the reduction of `a`. If the input
         contains integers or floats smaller than ``float64``, the output
@@ -4306,7 +4401,7 @@ def quantile(a,
     since Python uses 0-based indexing, the code subtracts another 1 from the
     index internally.
 
-    The following formula determines the virtual index ``i + g``, the location 
+    The following formula determines the virtual index ``i + g``, the location
     of the quantile in the sorted sample:
 
     .. math::
@@ -4437,6 +4532,10 @@ def quantile(a,
         method = _check_interpolation_as_method(
             method, interpolation, "quantile")
 
+    a = np.asanyarray(a)
+    if a.dtype.kind == "c":
+        raise TypeError("a must be an array of real numbers")
+
     q = np.asanyarray(q)
     if not _quantile_is_valid(q):
         raise ValueError("Quantiles must be in the range [0, 1]")
@@ -4452,17 +4551,14 @@ def _quantile_unchecked(a,
                         method="linear",
                         keepdims=False):
     """Assumes that q is in [0, 1], and is an ndarray"""
-    r, k = _ureduce(a,
+    return _ureduce(a,
                     func=_quantile_ureduce_func,
                     q=q,
+                    keepdims=keepdims,
                     axis=axis,
                     out=out,
                     overwrite_input=overwrite_input,
                     method=method)
-    if keepdims:
-        return r.reshape(q.shape + k)
-    else:
-        return r
 
 
 def _quantile_is_valid(q):
@@ -4812,26 +4908,42 @@ def trapz(y, x=None, dx=1.0, axis=-1):
 
     Examples
     --------
-    >>> np.trapz([1,2,3])
+    Use the trapezoidal rule on evenly spaced points:
+
+    >>> np.trapz([1, 2, 3])
     4.0
-    >>> np.trapz([1,2,3], x=[4,6,8])
+
+    The spacing between sample points can be selected by either the
+    ``x`` or ``dx`` arguments:
+
+    >>> np.trapz([1, 2, 3], x=[4, 6, 8])
     8.0
-    >>> np.trapz([1,2,3], dx=2)
+    >>> np.trapz([1, 2, 3], dx=2)
     8.0
 
-    Using a decreasing `x` corresponds to integrating in reverse:
+    Using a decreasing ``x`` corresponds to integrating in reverse:
 
-    >>> np.trapz([1,2,3], x=[8,6,4])
+    >>> np.trapz([1, 2, 3], x=[8, 6, 4])
     -8.0
 
-    More generally `x` is used to integrate along a parametric curve.
-    This finds the area of a circle, noting we repeat the sample which closes
+    More generally ``x`` is used to integrate along a parametric curve. We can
+    estimate the integral :math:`\int_0^1 x^2 = 1/3` using:
+
+    >>> x = np.linspace(0, 1, num=50)
+    >>> y = x**2
+    >>> np.trapz(y, x)
+    0.33340274885464394
+
+    Or estimate the area of a circle, noting we repeat the sample which closes
     the curve:
 
     >>> theta = np.linspace(0, 2 * np.pi, num=1000, endpoint=True)
     >>> np.trapz(np.cos(theta), x=np.sin(theta))
     3.141571941375841
 
+    ``np.trapz`` can be applied along a specified axis to do multiple
+    computations in one call:
+
     >>> a = np.arange(6).reshape(2, 3)
     >>> a
     array([[0, 1, 2],
@@ -4869,6 +4981,24 @@ def trapz(y, x=None, dx=1.0, axis=-1):
     return ret
 
 
+# __array_function__ has no __code__ or other attributes normal Python funcs we
+# wrap everything into a C callable. SciPy however, tries to "clone" `trapz`
+# into a new Python function which requires `__code__` and a few other
+# attributes. So we create a dummy clone and copy over its attributes allowing
+# SciPy <= 1.10 to work: https://github.com/scipy/scipy/issues/17811
+assert not hasattr(trapz, "__code__")
+
+def _fake_trapz(y, x=None, dx=1.0, axis=-1):
+    return trapz(y, x=x, dx=dx, axis=axis)
+
+
+trapz.__code__ = _fake_trapz.__code__
+trapz.__globals__ = _fake_trapz.__globals__
+trapz.__defaults__ = _fake_trapz.__defaults__
+trapz.__closure__ = _fake_trapz.__closure__
+trapz.__kwdefaults__ = _fake_trapz.__kwdefaults__
+
+
 def _meshgrid_dispatcher(*xi, copy=None, sparse=None, indexing=None):
     return xi
 
@@ -4877,7 +5007,7 @@ def _meshgrid_dispatcher(*xi, copy=None, sparse=None, indexing=None):
 @array_function_dispatch(_meshgrid_dispatcher)
 def meshgrid(*xi, copy=True, sparse=False, indexing='xy'):
     """
-    Return coordinate matrices from coordinate vectors.
+    Return a list of coordinate matrices from coordinate vectors.
 
     Make N-D coordinate arrays for vectorized evaluations of
     N-D scalar/vector fields over N-D grids, given
@@ -4918,7 +5048,7 @@ def meshgrid(*xi, copy=True, sparse=False, indexing='xy'):
 
     Returns
     -------
-    X1, X2,..., XN : ndarray
+    X1, X2,..., XN : list of ndarrays
         For vectors `x1`, `x2`,..., `xn` with lengths ``Ni=len(xi)``,
         returns ``(N1, N2, N3,..., Nn)`` shaped arrays if indexing='ij'
         or ``(N2, N1, N3,..., Nn)`` shaped arrays if indexing='xy'
@@ -4953,6 +5083,7 @@ def meshgrid(*xi, copy=True, sparse=False, indexing='xy'):
     mgrid : Construct a multi-dimensional "meshgrid" using indexing notation.
     ogrid : Construct an open multi-dimensional "meshgrid" using indexing
             notation.
+    how-to-index
 
     Examples
     --------
@@ -4966,16 +5097,25 @@ def meshgrid(*xi, copy=True, sparse=False, indexing='xy'):
     >>> yv
     array([[0.,  0.,  0.],
            [1.,  1.,  1.]])
-    >>> xv, yv = np.meshgrid(x, y, sparse=True)  # make sparse output arrays
+
+    The result of `meshgrid` is a coordinate grid:
+
+    >>> import matplotlib.pyplot as plt
+    >>> plt.plot(xv, yv, marker='o', color='k', linestyle='none')
+    >>> plt.show()
+
+    You can create sparse output arrays to save memory and computation time.
+
+    >>> xv, yv = np.meshgrid(x, y, sparse=True)
     >>> xv
     array([[0. ,  0.5,  1. ]])
     >>> yv
     array([[0.],
            [1.]])
 
-    `meshgrid` is very useful to evaluate functions on a grid.  If the
-    function depends on all coordinates, you can use the parameter
-    ``sparse=True`` to save memory and computation time.
+    `meshgrid` is very useful to evaluate functions on a grid. If the
+    function depends on all coordinates, both dense and sparse outputs can be
+    used.
 
     >>> x = np.linspace(-5, 5, 101)
     >>> y = np.linspace(-5, 5, 101)
@@ -4992,7 +5132,6 @@ def meshgrid(*xi, copy=True, sparse=False, indexing='xy'):
     >>> np.array_equal(zz, zs)
     True
 
-    >>> import matplotlib.pyplot as plt
     >>> h = plt.contourf(x, y, zs)
     >>> plt.axis('scaled')
     >>> plt.colorbar()
@@ -5346,7 +5485,7 @@ def insert(arr, obj, values, axis=None):
             warnings.warn(
                 "in the future insert will treat boolean arrays and "
                 "array-likes as a boolean index instead of casting it to "
-                "integer", FutureWarning, stacklevel=3)
+                "integer", FutureWarning, stacklevel=2)
             indices = indices.astype(intp)
             # Code after warning period:
             #if obj.ndim != 1:
diff --git a/numpy/lib/function_base.pyi b/numpy/lib/function_base.pyi
index c14a54c60..687e4ab17 100644
--- a/numpy/lib/function_base.pyi
+++ b/numpy/lib/function_base.pyi
@@ -441,12 +441,8 @@ def sinc(x: _ArrayLikeFloat_co) -> NDArray[floating[Any]]: ...
 @overload
 def sinc(x: _ArrayLikeComplex_co) -> NDArray[complexfloating[Any, Any]]: ...
 
-@overload
-def msort(a: _ArrayType) -> _ArrayType: ...
-@overload
-def msort(a: _ArrayLike[_SCT]) -> NDArray[_SCT]: ...
-@overload
-def msort(a: ArrayLike) -> NDArray[Any]: ...
+# NOTE: Deprecated
+# def msort(a: ArrayLike) -> NDArray[Any]: ...
 
 @overload
 def median(
diff --git a/numpy/lib/histograms.py b/numpy/lib/histograms.py
index 704f69dce..35745e6dd 100644
--- a/numpy/lib/histograms.py
+++ b/numpy/lib/histograms.py
@@ -970,7 +970,7 @@ def histogramdd(sample, bins=10, range=None, density=None, weights=None):
         sample = np.atleast_2d(sample).T
         N, D = sample.shape
 
-    nbin = np.empty(D, int)
+    nbin = np.empty(D, np.intp)
     edges = D*[None]
     dedges = D*[None]
     if weights is not None:
@@ -981,7 +981,7 @@ def histogramdd(sample, bins=10, range=None, density=None, weights=None):
         if M != D:
             raise ValueError(
                 'The dimension of bins must be equal to the dimension of the '
-                ' sample x.')
+                'sample x.')
     except TypeError:
         # bins is an integer
         bins = D*[bins]
diff --git a/numpy/lib/index_tricks.py b/numpy/lib/index_tricks.py
index 3daacaff6..1da73dee5 100644
--- a/numpy/lib/index_tricks.py
+++ b/numpy/lib/index_tricks.py
@@ -3,16 +3,15 @@ import sys
 import math
 import warnings
 
+import numpy as np
+from .._utils import set_module
 import numpy.core.numeric as _nx
-from numpy.core.numeric import (
-    asarray, ScalarType, array, alltrue, cumprod, arange, ndim
-)
+from numpy.core.numeric import ScalarType, array
 from numpy.core.numerictypes import issubdtype
 
 import numpy.matrixlib as matrixlib
 from .function_base import diff
 from numpy.core.multiarray import ravel_multi_index, unravel_index
-from numpy.core.overrides import set_module
 from numpy.core import overrides, linspace
 from numpy.lib.stride_tricks import as_strided
 
@@ -94,7 +93,7 @@ def ix_(*args):
     nd = len(args)
     for k, new in enumerate(args):
         if not isinstance(new, _nx.ndarray):
-            new = asarray(new)
+            new = np.asarray(new)
             if new.size == 0:
                 # Explicitly type empty arrays to avoid float default
                 new = new.astype(_nx.intp)
@@ -232,7 +231,9 @@ class MGridClass(nd_grid):
     --------
     lib.index_tricks.nd_grid : class of `ogrid` and `mgrid` objects
     ogrid : like mgrid but returns open (not fleshed out) mesh grids
+    meshgrid: return coordinate matrices from coordinate vectors
     r_ : array concatenator
+    :ref:`how-to-partition`
 
     Examples
     --------
@@ -283,7 +284,9 @@ class OGridClass(nd_grid):
     --------
     np.lib.index_tricks.nd_grid : class of `ogrid` and `mgrid` objects
     mgrid : like `ogrid` but returns dense (or fleshed out) mesh grids
+    meshgrid: return coordinate matrices from coordinate vectors
     r_ : array concatenator
+    :ref:`how-to-partition`
 
     Examples
     --------
@@ -389,7 +392,7 @@ class AxisConcatenator:
                 scalar = True
                 newobj = item
             else:
-                item_ndim = ndim(item)
+                item_ndim = np.ndim(item)
                 newobj = array(item, copy=False, subok=True, ndmin=ndmin)
                 if trans1d != -1 and item_ndim < ndmin:
                     k2 = ndmin - item_ndim
@@ -594,7 +597,7 @@ class ndenumerate:
     """
 
     def __init__(self, arr):
-        self.iter = asarray(arr).flat
+        self.iter = np.asarray(arr).flat
 
     def __next__(self):
         """
@@ -907,9 +910,9 @@ def fill_diagonal(a, val, wrap=False):
     else:
         # For more than d=2, the strided formula is only valid for arrays with
         # all dimensions equal, so we check first.
-        if not alltrue(diff(a.shape) == 0):
+        if not np.all(diff(a.shape) == 0):
             raise ValueError("All dimensions of input must be of equal length")
-        step = 1 + (cumprod(a.shape[:-1])).sum()
+        step = 1 + (np.cumprod(a.shape[:-1])).sum()
 
     # Write the value out into the diagonal.
     a.flat[:end:step] = val
@@ -980,7 +983,7 @@ def diag_indices(n, ndim=2):
             [0, 1]]])
 
     """
-    idx = arange(n)
+    idx = np.arange(n)
     return (idx,) * ndim
 
 
@@ -1007,13 +1010,39 @@ def diag_indices_from(arr):
     -----
     .. versionadded:: 1.4.0
 
+    Examples
+    --------
+    
+    Create a 4 by 4 array.
+
+    >>> a = np.arange(16).reshape(4, 4)
+    >>> a
+    array([[ 0,  1,  2,  3],
+           [ 4,  5,  6,  7],
+           [ 8,  9, 10, 11],
+           [12, 13, 14, 15]])
+    
+    Get the indices of the diagonal elements.
+
+    >>> di = np.diag_indices_from(a)
+    >>> di
+    (array([0, 1, 2, 3]), array([0, 1, 2, 3]))
+
+    >>> a[di]
+    array([ 0,  5, 10, 15])
+
+    This is simply syntactic sugar for diag_indices.
+
+    >>> np.diag_indices(a.shape[0])
+    (array([0, 1, 2, 3]), array([0, 1, 2, 3]))
+
     """
 
     if not arr.ndim >= 2:
         raise ValueError("input array must be at least 2-d")
     # For more than d=2, the strided formula is only valid for arrays with
     # all dimensions equal, so we check first.
-    if not alltrue(diff(arr.shape) == 0):
+    if not np.all(diff(arr.shape) == 0):
         raise ValueError("All dimensions of input must be of equal length")
 
     return diag_indices(arr.shape[0], arr.ndim)
diff --git a/numpy/lib/index_tricks.pyi b/numpy/lib/index_tricks.pyi
index c9251abd1..29a6b9e2b 100644
--- a/numpy/lib/index_tricks.pyi
+++ b/numpy/lib/index_tricks.pyi
@@ -119,7 +119,7 @@ class AxisConcatenator:
     @staticmethod
     def makemat(
         data: ArrayLike, dtype: DTypeLike = ..., copy: bool = ...
-    ) -> _Matrix: ...
+    ) -> _Matrix[Any, Any]: ...
 
     # TODO: Sort out this `__getitem__` method
     def __getitem__(self, key: Any) -> Any: ...
diff --git a/numpy/lib/mixins.py b/numpy/lib/mixins.py
index c81239f6b..117cc7851 100644
--- a/numpy/lib/mixins.py
+++ b/numpy/lib/mixins.py
@@ -133,6 +133,7 @@ class NDArrayOperatorsMixin:
 
     .. versionadded:: 1.13
     """
+    __slots__ = ()
     # Like np.ndarray, this mixin class implements "Option 1" from the ufunc
     # overrides NEP.
 
diff --git a/numpy/lib/nanfunctions.py b/numpy/lib/nanfunctions.py
index 3814c0727..b3b570860 100644
--- a/numpy/lib/nanfunctions.py
+++ b/numpy/lib/nanfunctions.py
@@ -169,7 +169,7 @@ def _remove_nan_1d(arr1d, overwrite_input=False):
     s = np.nonzero(c)[0]
     if s.size == arr1d.size:
         warnings.warn("All-NaN slice encountered", RuntimeWarning,
-                      stacklevel=5)
+                      stacklevel=6)
         return arr1d[:0], True
     elif s.size == 0:
         return arr1d, overwrite_input
@@ -343,7 +343,7 @@ def nanmin(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
         res = np.fmin.reduce(a, axis=axis, out=out, **kwargs)
         if np.isnan(res).any():
             warnings.warn("All-NaN slice encountered", RuntimeWarning,
-                          stacklevel=3)
+                          stacklevel=2)
     else:
         # Slow, but safe for subclasses of ndarray
         a, mask = _replace_nan(a, +np.inf)
@@ -357,7 +357,7 @@ def nanmin(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
         if np.any(mask):
             res = _copyto(res, np.nan, mask)
             warnings.warn("All-NaN axis encountered", RuntimeWarning,
-                          stacklevel=3)
+                          stacklevel=2)
     return res
 
 
@@ -476,7 +476,7 @@ def nanmax(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
         res = np.fmax.reduce(a, axis=axis, out=out, **kwargs)
         if np.isnan(res).any():
             warnings.warn("All-NaN slice encountered", RuntimeWarning,
-                          stacklevel=3)
+                          stacklevel=2)
     else:
         # Slow, but safe for subclasses of ndarray
         a, mask = _replace_nan(a, -np.inf)
@@ -490,7 +490,7 @@ def nanmax(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
         if np.any(mask):
             res = _copyto(res, np.nan, mask)
             warnings.warn("All-NaN axis encountered", RuntimeWarning,
-                          stacklevel=3)
+                          stacklevel=2)
     return res
 
 
@@ -1049,7 +1049,7 @@ def nanmean(a, axis=None, dtype=None, out=None, keepdims=np._NoValue,
 
     isbad = (cnt == 0)
     if isbad.any():
-        warnings.warn("Mean of empty slice", RuntimeWarning, stacklevel=3)
+        warnings.warn("Mean of empty slice", RuntimeWarning, stacklevel=2)
         # NaN is the only possible bad value, so no further
         # action is needed to handle bad results.
     return avg
@@ -1109,7 +1109,7 @@ def _nanmedian_small(a, axis=None, out=None, overwrite_input=False):
     m = np.ma.median(a, axis=axis, overwrite_input=overwrite_input)
     for i in range(np.count_nonzero(m.mask.ravel())):
         warnings.warn("All-NaN slice encountered", RuntimeWarning,
-                      stacklevel=4)
+                      stacklevel=5)
 
     fill_value = np.timedelta64("NaT") if m.dtype.kind == "m" else np.nan
     if out is not None:
@@ -1214,12 +1214,9 @@ def nanmedian(a, axis=None, out=None, overwrite_input=False, keepdims=np._NoValu
     if a.size == 0:
         return np.nanmean(a, axis, out=out, keepdims=keepdims)
 
-    r, k = function_base._ureduce(a, func=_nanmedian, axis=axis, out=out,
+    return function_base._ureduce(a, func=_nanmedian, keepdims=keepdims,
+                                  axis=axis, out=out,
                                   overwrite_input=overwrite_input)
-    if keepdims and keepdims is not np._NoValue:
-        return r.reshape(k)
-    else:
-        return r
 
 
 def _nanpercentile_dispatcher(
@@ -1376,6 +1373,9 @@ def nanpercentile(
             method, interpolation, "nanpercentile")
 
     a = np.asanyarray(a)
+    if a.dtype.kind == "c":
+        raise TypeError("a must be an array of real numbers")
+
     q = np.true_divide(q, 100.0)
     # undo any decay that the ufunc performed (see gh-13105)
     q = np.asanyarray(q)
@@ -1415,8 +1415,8 @@ def nanquantile(
         Input array or object that can be converted to an array, containing
         nan values to be ignored
     q : array_like of float
-        Quantile or sequence of quantiles to compute, which must be between
-        0 and 1 inclusive.
+        Probability or sequence of probabilities for the quantiles to compute.
+        Values must be between 0 and 1 inclusive.
     axis : {int, tuple of int, None}, optional
         Axis or axes along which the quantiles are computed. The
         default is to compute the quantile(s) along a flattened
@@ -1476,8 +1476,8 @@ def nanquantile(
     Returns
     -------
     quantile : scalar or ndarray
-        If `q` is a single percentile and `axis=None`, then the result
-        is a scalar. If multiple quantiles are given, first axis of
+        If `q` is a single probability and `axis=None`, then the result
+        is a scalar. If multiple probability levels are given, first axis of
         the result corresponds to the quantiles. The other axes are
         the axes that remain after the reduction of `a`. If the input
         contains integers or floats smaller than ``float64``, the output
@@ -1530,11 +1530,15 @@ def nanquantile(
        The American Statistician, 50(4), pp. 361-365, 1996
 
     """
+
     if interpolation is not None:
         method = function_base._check_interpolation_as_method(
             method, interpolation, "nanquantile")
 
     a = np.asanyarray(a)
+    if a.dtype.kind == "c":
+        raise TypeError("a must be an array of real numbers")
+
     q = np.asanyarray(q)
     if not function_base._quantile_is_valid(q):
         raise ValueError("Quantiles must be in the range [0, 1]")
@@ -1556,17 +1560,14 @@ def _nanquantile_unchecked(
     # so deal them upfront
     if a.size == 0:
         return np.nanmean(a, axis, out=out, keepdims=keepdims)
-    r, k = function_base._ureduce(a,
+    return function_base._ureduce(a,
                                   func=_nanquantile_ureduce_func,
                                   q=q,
+                                  keepdims=keepdims,
                                   axis=axis,
                                   out=out,
                                   overwrite_input=overwrite_input,
                                   method=method)
-    if keepdims and keepdims is not np._NoValue:
-        return r.reshape(q.shape + k)
-    else:
-        return r
 
 
 def _nanquantile_ureduce_func(a, q, axis=None, out=None, overwrite_input=False,
@@ -1762,7 +1763,7 @@ def nanvar(a, axis=None, dtype=None, out=None, ddof=0, keepdims=np._NoValue,
     isbad = (dof <= 0)
     if np.any(isbad):
         warnings.warn("Degrees of freedom <= 0 for slice.", RuntimeWarning,
-                      stacklevel=3)
+                      stacklevel=2)
         # NaN, inf, or negative numbers are all possible bad
         # values, so explicitly replace them with NaN.
         var = _copyto(var, np.nan, isbad)
diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py
index 4a27c7898..339b1dc62 100644
--- a/numpy/lib/npyio.py
+++ b/numpy/lib/npyio.py
@@ -142,7 +142,7 @@ class NpzFile(Mapping):
     max_header_size : int, optional
         Maximum allowed size of the header.  Large headers may not be safe
         to load securely and thus require explicitly passing a larger value.
-        See :py:meth:`ast.literal_eval()` for details.
+        See :py:func:`ast.literal_eval()` for details.
         This option is ignored when `allow_pickle` is passed.  In that case
         the file is by definition trusted and the limit is unnecessary.
 
@@ -167,6 +167,8 @@ class NpzFile(Mapping):
     >>> npz = np.load(outfile)
     >>> isinstance(npz, np.lib.npyio.NpzFile)
     True
+    >>> npz
+    NpzFile 'object' with keys x, y
     >>> sorted(npz.files)
     ['x', 'y']
     >>> npz['x']  # getitem access
@@ -178,6 +180,7 @@ class NpzFile(Mapping):
     # Make __exit__ safe if zipfile_factory raises an exception
     zip = None
     fid = None
+    _MAX_REPR_ARRAY_COUNT = 5
 
     def __init__(self, fid, own_fid=False, allow_pickle=False,
                  pickle_kwargs=None, *,
@@ -257,7 +260,23 @@ class NpzFile(Mapping):
             else:
                 return self.zip.read(key)
         else:
-            raise KeyError("%s is not a file in the archive" % key)
+            raise KeyError(f"{key} is not a file in the archive")
+
+    def __contains__(self, key):
+        return (key in self._files or key in self.files)
+
+    def __repr__(self):
+        # Get filename or default to `object`
+        if isinstance(self.fid, str):
+            filename = self.fid
+        else:
+            filename = getattr(self.fid, "name", "object")
+
+        # Get the name of arrays
+        array_names = ', '.join(self.files[:self._MAX_REPR_ARRAY_COUNT])
+        if len(self.files) > self._MAX_REPR_ARRAY_COUNT:
+            array_names += "..."
+        return f"NpzFile {filename!r} with keys: {array_names}"
 
 
 @set_module('numpy')
@@ -309,7 +328,7 @@ def load(file, mmap_mode=None, allow_pickle=False, fix_imports=True,
     max_header_size : int, optional
         Maximum allowed size of the header.  Large headers may not be safe
         to load securely and thus require explicitly passing a larger value.
-        See :py:meth:`ast.literal_eval()` for details.
+        See :py:func:`ast.literal_eval()` for details.
         This option is ignored when `allow_pickle` is passed.  In that case
         the file is by definition trusted and the limit is unnecessary.
 
@@ -327,6 +346,9 @@ def load(file, mmap_mode=None, allow_pickle=False, fix_imports=True,
         If ``allow_pickle=True``, but the file cannot be loaded as a pickle.
     ValueError
         The file contains an object array, but ``allow_pickle=False`` given.
+    EOFError
+        When calling ``np.load`` multiple times on the same file handle,
+        if all data has already been read
 
     See Also
     --------
@@ -410,6 +432,8 @@ def load(file, mmap_mode=None, allow_pickle=False, fix_imports=True,
         _ZIP_SUFFIX = b'PK\x05\x06' # empty zip files start with this
         N = len(format.MAGIC_PREFIX)
         magic = fid.read(N)
+        if not magic:
+            raise EOFError("No data left in file")
         # If the file size is less than N, we need to make sure not
         # to seek past the beginning of the file
         fid.seek(-min(N, len(magic)), 1)  # back-up
@@ -760,13 +784,6 @@ def _ensure_ndmin_ndarray(a, *, ndmin: int):
 _loadtxt_chunksize = 50000
 
 
-def _loadtxt_dispatcher(
-        fname, dtype=None, comments=None, delimiter=None,
-        converters=None, skiprows=None, usecols=None, unpack=None,
-        ndmin=None, encoding=None, max_rows=None, *, like=None):
-    return (like,)
-
-
 def _check_nonneg_int(value, name="argument"):
     try:
         operator.index(value)
@@ -1161,10 +1178,10 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
         while such lines are counted in `skiprows`.
 
         .. versionadded:: 1.16.0
-        
+
         .. versionchanged:: 1.23.0
-            Lines containing no data, including comment lines (e.g., lines 
-            starting with '#' or as specified via `comments`) are not counted 
+            Lines containing no data, including comment lines (e.g., lines
+            starting with '#' or as specified via `comments`) are not counted
             towards `max_rows`.
     quotechar : unicode character or None, optional
         The character used to denote the start and end of a quoted item.
@@ -1303,6 +1320,14 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
     array([('alpha, #42', 10.), ('beta, #64',  2.)],
           dtype=[('label', '<U12'), ('value', '<f8')])
 
+    Quoted fields can be separated by multiple whitespace characters:
+
+    >>> s = StringIO('"alpha, #42"       10.0\n"beta, #64" 2.0\n')
+    >>> dtype = np.dtype([("label", "U12"), ("value", float)])
+    >>> np.loadtxt(s, dtype=dtype, delimiter=None, quotechar='"')
+    array([('alpha, #42', 10.), ('beta, #64',  2.)],
+          dtype=[('label', '<U12'), ('value', '<f8')])
+
     Two consecutive quote characters within a quoted field are treated as a
     single escaped character:
 
@@ -1323,10 +1348,10 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
 
     if like is not None:
         return _loadtxt_with_like(
-            fname, dtype=dtype, comments=comments, delimiter=delimiter,
+            like, fname, dtype=dtype, comments=comments, delimiter=delimiter,
             converters=converters, skiprows=skiprows, usecols=usecols,
             unpack=unpack, ndmin=ndmin, encoding=encoding,
-            max_rows=max_rows, like=like
+            max_rows=max_rows
         )
 
     if isinstance(delimiter, bytes):
@@ -1353,9 +1378,7 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
     return arr
 
 
-_loadtxt_with_like = array_function_dispatch(
-    _loadtxt_dispatcher
-)(loadtxt)
+_loadtxt_with_like = array_function_dispatch()(loadtxt)
 
 
 def _savetxt_dispatcher(fname, X, fmt=None, delimiter=None, newline=None,
@@ -1716,17 +1739,6 @@ def fromregex(file, regexp, dtype, encoding=None):
 #####--------------------------------------------------------------------------
 
 
-def _genfromtxt_dispatcher(fname, dtype=None, comments=None, delimiter=None,
-                           skip_header=None, skip_footer=None, converters=None,
-                           missing_values=None, filling_values=None, usecols=None,
-                           names=None, excludelist=None, deletechars=None,
-                           replace_space=None, autostrip=None, case_sensitive=None,
-                           defaultfmt=None, unpack=None, usemask=None, loose=None,
-                           invalid_raise=None, max_rows=None, encoding=None,
-                           *, ndmin=None, like=None):
-    return (like,)
-
-
 @set_array_function_like_doc
 @set_module('numpy')
 def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
@@ -1924,7 +1936,7 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
 
     if like is not None:
         return _genfromtxt_with_like(
-            fname, dtype=dtype, comments=comments, delimiter=delimiter,
+            like, fname, dtype=dtype, comments=comments, delimiter=delimiter,
             skip_header=skip_header, skip_footer=skip_footer,
             converters=converters, missing_values=missing_values,
             filling_values=filling_values, usecols=usecols, names=names,
@@ -1934,7 +1946,6 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
             unpack=unpack, usemask=usemask, loose=loose,
             invalid_raise=invalid_raise, max_rows=max_rows, encoding=encoding,
             ndmin=ndmin,
-            like=like
         )
 
     _ensure_ndmin_ndarray_check_param(ndmin)
@@ -2327,7 +2338,7 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
         column_types = [conv.type for conv in converters]
         # Find the columns with strings...
         strcolidx = [i for (i, v) in enumerate(column_types)
-                     if v == np.unicode_]
+                     if v == np.str_]
 
         if byte_converters and strcolidx:
             # convert strings back to bytes for backward compatibility
@@ -2463,9 +2474,7 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
     return output
 
 
-_genfromtxt_with_like = array_function_dispatch(
-    _genfromtxt_dispatcher
-)(genfromtxt)
+_genfromtxt_with_like = array_function_dispatch()(genfromtxt)
 
 
 def recfromtxt(fname, **kwargs):
diff --git a/numpy/lib/npyio.pyi b/numpy/lib/npyio.pyi
index 8007b2dc7..cc81e82b7 100644
--- a/numpy/lib/npyio.pyi
+++ b/numpy/lib/npyio.pyi
@@ -72,6 +72,7 @@ class NpzFile(Mapping[str, NDArray[Any]]):
     files: list[str]
     allow_pickle: bool
     pickle_kwargs: None | Mapping[str, Any]
+    _MAX_REPR_ARRAY_COUNT: int
     # Represent `f` as a mutable property so we can access the type of `self`
     @property
     def f(self: _T) -> BagObj[_T]: ...
@@ -97,6 +98,8 @@ class NpzFile(Mapping[str, NDArray[Any]]):
     def __iter__(self) -> Iterator[str]: ...
     def __len__(self) -> int: ...
     def __getitem__(self, key: str) -> NDArray[Any]: ...
+    def __contains__(self, key: str) -> bool: ...
+    def __repr__(self) -> str: ...
 
 # NOTE: Returns a `NpzFile` if file is a zip file;
 # returns an `ndarray`/`memmap` otherwise
diff --git a/numpy/lib/polynomial.py b/numpy/lib/polynomial.py
index 6aa708861..3b8db2a95 100644
--- a/numpy/lib/polynomial.py
+++ b/numpy/lib/polynomial.py
@@ -9,12 +9,13 @@ __all__ = ['poly', 'roots', 'polyint', 'polyder', 'polyadd',
 import functools
 import re
 import warnings
+
+from .._utils import set_module
 import numpy.core.numeric as NX
 
 from numpy.core import (isscalar, abs, finfo, atleast_1d, hstack, dot, array,
                         ones)
 from numpy.core import overrides
-from numpy.core.overrides import set_module
 from numpy.lib.twodim_base import diag, vander
 from numpy.lib.function_base import trim_zeros
 from numpy.lib.type_check import iscomplex, real, imag, mintypecode
@@ -103,7 +104,7 @@ def poly(seq_of_zeros):
 
     References
     ----------
-    .. [1] M. Sullivan and M. Sullivan, III, "Algebra and Trignometry,
+    .. [1] M. Sullivan and M. Sullivan, III, "Algebra and Trigonometry,
        Enhanced With Graphing Utilities," Prentice-Hall, pg. 318, 1996.
 
     .. [2] G. Strang, "Linear Algebra and Its Applications, 2nd Edition,"
@@ -671,7 +672,7 @@ def polyfit(x, y, deg, rcond=None, full=False, w=None, cov=False):
     # warn on rank reduction, which indicates an ill conditioned matrix
     if rank != order and not full:
         msg = "Polyfit may be poorly conditioned"
-        warnings.warn(msg, RankWarning, stacklevel=4)
+        warnings.warn(msg, RankWarning, stacklevel=2)
 
     if full:
         return c, resids, rank, s, rcond
diff --git a/numpy/lib/shape_base.py b/numpy/lib/shape_base.py
index ab91423d9..5d8a41bfe 100644
--- a/numpy/lib/shape_base.py
+++ b/numpy/lib/shape_base.py
@@ -1,9 +1,7 @@
 import functools
 
 import numpy.core.numeric as _nx
-from numpy.core.numeric import (
-    asarray, zeros, outer, concatenate, array, asanyarray
-    )
+from numpy.core.numeric import asarray, zeros, array, asanyarray
 from numpy.core.fromnumeric import reshape, transpose
 from numpy.core.multiarray import normalize_axis_index
 from numpy.core import overrides
@@ -124,19 +122,21 @@ def take_along_axis(arr, indices, axis):
     >>> np.sort(a, axis=1)
     array([[10, 20, 30],
            [40, 50, 60]])
-    >>> ai = np.argsort(a, axis=1); ai
+    >>> ai = np.argsort(a, axis=1)
+    >>> ai
     array([[0, 2, 1],
            [1, 2, 0]])
     >>> np.take_along_axis(a, ai, axis=1)
     array([[10, 20, 30],
            [40, 50, 60]])
 
-    The same works for max and min, if you expand the dimensions:
+    The same works for max and min, if you maintain the trivial dimension
+    with ``keepdims``:
 
-    >>> np.expand_dims(np.max(a, axis=1), axis=1)
+    >>> np.max(a, axis=1, keepdims=True)
     array([[30],
            [60]])
-    >>> ai = np.expand_dims(np.argmax(a, axis=1), axis=1)
+    >>> ai = np.argmax(a, axis=1, keepdims=True)
     >>> ai
     array([[1],
            [0]])
@@ -147,8 +147,8 @@ def take_along_axis(arr, indices, axis):
     If we want to get the max and min at the same time, we can stack the
     indices first
 
-    >>> ai_min = np.expand_dims(np.argmin(a, axis=1), axis=1)
-    >>> ai_max = np.expand_dims(np.argmax(a, axis=1), axis=1)
+    >>> ai_min = np.argmin(a, axis=1, keepdims=True)
+    >>> ai_max = np.argmax(a, axis=1, keepdims=True)
     >>> ai = np.concatenate([ai_min, ai_max], axis=1)
     >>> ai
     array([[0, 1],
@@ -237,7 +237,7 @@ def put_along_axis(arr, indices, values, axis):
 
     We can replace the maximum values with:
 
-    >>> ai = np.expand_dims(np.argmax(a, axis=1), axis=1)
+    >>> ai = np.argmax(a, axis=1, keepdims=True)
     >>> ai
     array([[1],
            [0]])
@@ -372,7 +372,7 @@ def apply_along_axis(func1d, axis, arr, *args, **kwargs):
     # invoke the function on the first item
     try:
         ind0 = next(inds)
-    except StopIteration as e:
+    except StopIteration:
         raise ValueError(
             'Cannot apply_along_axis when any iteration dimensions are 0'
         ) from None
@@ -643,10 +643,6 @@ def column_stack(tup):
            [3, 4]])
 
     """
-    if not overrides.ARRAY_FUNCTION_ENABLED:
-        # raise warning if necessary
-        _arrays_for_stack_dispatcher(tup, stacklevel=2)
-
     arrays = []
     for v in tup:
         arr = asanyarray(v)
@@ -713,10 +709,6 @@ def dstack(tup):
            [[3, 4]]])
 
     """
-    if not overrides.ARRAY_FUNCTION_ENABLED:
-        # raise warning if necessary
-        _arrays_for_stack_dispatcher(tup, stacklevel=2)
-
     arrs = atleast_3d(*tup)
     if not isinstance(arrs, list):
         arrs = [arrs]
@@ -1041,6 +1033,7 @@ def dsplit(ary, indices_or_sections):
         raise ValueError('dsplit only works on arrays of 3 or more dimensions')
     return split(ary, indices_or_sections, 2)
 
+
 def get_array_prepare(*args):
     """Find the wrapper for the array with the highest priority.
 
@@ -1053,6 +1046,7 @@ def get_array_prepare(*args):
         return wrappers[-1][-1]
     return None
 
+
 def get_array_wrap(*args):
     """Find the wrapper for the array with the highest priority.
 
diff --git a/numpy/lib/tests/test_arraypad.py b/numpy/lib/tests/test_arraypad.py
index a59681573..0bebe3693 100644
--- a/numpy/lib/tests/test_arraypad.py
+++ b/numpy/lib/tests/test_arraypad.py
@@ -1139,6 +1139,23 @@ class TestWrap:
         a = np.arange(5)
         b = np.pad(a, (0, 12), mode="wrap")
         assert_array_equal(np.r_[a, a, a, a][:-3], b)
+    
+    def test_repeated_wrapping_multiple_origin(self):
+        """
+        Assert that 'wrap' pads only with multiples of the original area if
+        the pad width is larger than the original array.
+        """
+        a = np.arange(4).reshape(2, 2)
+        a = np.pad(a, [(1, 3), (3, 1)], mode='wrap')
+        b = np.array(
+            [[3, 2, 3, 2, 3, 2],
+             [1, 0, 1, 0, 1, 0],
+             [3, 2, 3, 2, 3, 2],
+             [1, 0, 1, 0, 1, 0],
+             [3, 2, 3, 2, 3, 2],
+             [1, 0, 1, 0, 1, 0]]
+        )
+        assert_array_equal(a, b)
 
 
 class TestEdge:
diff --git a/numpy/lib/tests/test_arraysetops.py b/numpy/lib/tests/test_arraysetops.py
index bb07e25a9..a180accbe 100644
--- a/numpy/lib/tests/test_arraysetops.py
+++ b/numpy/lib/tests/test_arraysetops.py
@@ -414,13 +414,48 @@ class TestSetOps:
         with pytest.raises(ValueError):
             in1d(a, b, kind="table")
 
+    @pytest.mark.parametrize(
+        "dtype1,dtype2",
+        [
+            (np.int8, np.int16),
+            (np.int16, np.int8),
+            (np.uint8, np.uint16),
+            (np.uint16, np.uint8),
+            (np.uint8, np.int16),
+            (np.int16, np.uint8),
+        ]
+    )
+    @pytest.mark.parametrize("kind", [None, "sort", "table"])
+    def test_in1d_mixed_dtype(self, dtype1, dtype2, kind):
+        """Test that in1d works as expected for mixed dtype input."""
+        is_dtype2_signed = np.issubdtype(dtype2, np.signedinteger)
+        ar1 = np.array([0, 0, 1, 1], dtype=dtype1)
+
+        if is_dtype2_signed:
+            ar2 = np.array([-128, 0, 127], dtype=dtype2)
+        else:
+            ar2 = np.array([127, 0, 255], dtype=dtype2)
+
+        expected = np.array([True, True, False, False])
+
+        expect_failure = kind == "table" and any((
+            dtype1 == np.int8 and dtype2 == np.int16,
+            dtype1 == np.int16 and dtype2 == np.int8
+        ))
+
+        if expect_failure:
+            with pytest.raises(RuntimeError, match="exceed the maximum"):
+                in1d(ar1, ar2, kind=kind)
+        else:
+            assert_array_equal(in1d(ar1, ar2, kind=kind), expected)
+
     @pytest.mark.parametrize("kind", [None, "sort", "table"])
     def test_in1d_mixed_boolean(self, kind):
         """Test that in1d works as expected for bool/int input."""
         for dtype in np.typecodes["AllInteger"]:
             a = np.array([True, False, False], dtype=bool)
-            b = np.array([1, 1, 1, 1], dtype=dtype)
-            expected = np.array([True, False, False], dtype=bool)
+            b = np.array([0, 0, 0, 0], dtype=dtype)
+            expected = np.array([False, True, True], dtype=bool)
             assert_array_equal(in1d(a, b, kind=kind), expected)
 
             a, b = b, a
diff --git a/numpy/lib/tests/test_format.py b/numpy/lib/tests/test_format.py
index 08878a1f9..58d08f1e5 100644
--- a/numpy/lib/tests/test_format.py
+++ b/numpy/lib/tests/test_format.py
@@ -283,7 +283,7 @@ from io import BytesIO
 import numpy as np
 from numpy.testing import (
     assert_, assert_array_equal, assert_raises, assert_raises_regex,
-    assert_warns, IS_PYPY,
+    assert_warns, IS_PYPY, IS_WASM
     )
 from numpy.testing._private.utils import requires_memory
 from numpy.lib import format
@@ -459,6 +459,7 @@ def test_long_str():
     assert_array_equal(long_str_arr, long_str_arr2)
 
 
+@pytest.mark.skipif(IS_WASM, reason="memmap doesn't work correctly")
 @pytest.mark.slow
 def test_memmap_roundtrip(tmpdir):
     for i, arr in enumerate(basic_arrays + record_arrays):
@@ -526,10 +527,12 @@ def test_load_padded_dtype(tmpdir, dt):
     assert_array_equal(arr, arr1)
 
 
+@pytest.mark.xfail(IS_WASM, reason="Emscripten NODEFS has a buggy dup")
 def test_python2_python3_interoperability():
     fname = 'win64python2.npy'
     path = os.path.join(os.path.dirname(__file__), 'data', fname)
-    data = np.load(path)
+    with pytest.warns(UserWarning, match="Reading.*this warning\\."):
+        data = np.load(path)
     assert_array_equal(data, np.ones(2))
 
 def test_pickle_python2_python3():
@@ -675,6 +678,7 @@ def test_version_2_0():
     assert_raises(ValueError, format.write_array, f, d, (1, 0))
 
 
+@pytest.mark.skipif(IS_WASM, reason="memmap doesn't work correctly")
 def test_version_2_0_memmap(tmpdir):
     # requires more than 2 byte for header
     dt = [(("%d" % i) * 100, float) for i in range(500)]
@@ -920,6 +924,7 @@ def test_large_file_support(tmpdir):
     assert_array_equal(r, d)
 
 
+@pytest.mark.skipif(IS_PYPY, reason="flaky on PyPy")
 @pytest.mark.skipif(np.dtype(np.intp).itemsize < 8,
                     reason="test requires 64-bit system")
 @pytest.mark.slow
diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py
index 88d4987e6..b0944ec85 100644
--- a/numpy/lib/tests/test_function_base.py
+++ b/numpy/lib/tests/test_function_base.py
@@ -8,14 +8,14 @@ import pytest
 import hypothesis
 from hypothesis.extra.numpy import arrays
 import hypothesis.strategies as st
-
+from functools import partial
 
 import numpy as np
 from numpy import ma
 from numpy.testing import (
     assert_, assert_equal, assert_array_equal, assert_almost_equal,
     assert_array_almost_equal, assert_raises, assert_allclose, IS_PYPY,
-    assert_warns, assert_raises_regex, suppress_warnings, HAS_REFCOUNT,
+    assert_warns, assert_raises_regex, suppress_warnings, HAS_REFCOUNT, IS_WASM
     )
 import numpy.lib.function_base as nfb
 from numpy.random import rand
@@ -25,6 +25,7 @@ from numpy.lib import (
     i0, insert, interp, kaiser, meshgrid, msort, piecewise, place, rot90,
     select, setxor1d, sinc, trapz, trim_zeros, unwrap, unique, vectorize
     )
+from numpy.core.numeric import normalize_axis_tuple
 
 
 def get_mat(n):
@@ -228,8 +229,8 @@ class TestAny:
     def test_nd(self):
         y1 = [[0, 0, 0], [0, 1, 0], [1, 1, 0]]
         assert_(np.any(y1))
-        assert_array_equal(np.sometrue(y1, axis=0), [1, 1, 0])
-        assert_array_equal(np.sometrue(y1, axis=1), [0, 1, 1])
+        assert_array_equal(np.any(y1, axis=0), [1, 1, 0])
+        assert_array_equal(np.any(y1, axis=1), [0, 1, 1])
 
 
 class TestAll:
@@ -246,8 +247,8 @@ class TestAll:
     def test_nd(self):
         y1 = [[0, 0, 1], [0, 1, 1], [1, 1, 1]]
         assert_(not np.all(y1))
-        assert_array_equal(np.alltrue(y1, axis=0), [0, 0, 1])
-        assert_array_equal(np.alltrue(y1, axis=1), [0, 0, 1])
+        assert_array_equal(np.all(y1, axis=0), [0, 0, 1])
+        assert_array_equal(np.all(y1, axis=1), [0, 0, 1])
 
 
 class TestCopy:
@@ -1216,6 +1217,13 @@ class TestGradient:
         dfdx = gradient(f, x)
         assert_array_equal(dfdx, [0.5, 0.5])
 
+    def test_return_type(self):
+        res = np.gradient(([1, 2], [2, 3]))
+        if np._using_numpy2_behavior():
+            assert type(res) is tuple
+        else:
+            assert type(res) is list
+
 
 class TestAngle:
 
@@ -1779,6 +1787,70 @@ class TestVectorize:
         assert_equal(type(r), subclass)
         assert_equal(r, m * v)
 
+    def test_name(self):
+        #See gh-23021
+        @np.vectorize
+        def f2(a, b):
+            return a + b
+
+        assert f2.__name__ == 'f2'
+
+    def test_decorator(self):
+        @vectorize
+        def addsubtract(a, b):
+            if a > b:
+                return a - b
+            else:
+                return a + b
+
+        r = addsubtract([0, 3, 6, 9], [1, 3, 5, 7])
+        assert_array_equal(r, [1, 6, 1, 2])
+
+    def test_docstring(self):
+        @vectorize
+        def f(x):
+            """Docstring"""
+            return x
+
+        if sys.flags.optimize < 2:
+            assert f.__doc__ == "Docstring"
+
+    def test_partial(self):
+        def foo(x, y):
+            return x + y
+
+        bar = partial(foo, 3)
+        vbar = np.vectorize(bar)
+        assert vbar(1) == 4
+
+    def test_signature_otypes_decorator(self):
+        @vectorize(signature='(n)->(n)', otypes=['float64'])
+        def f(x):
+            return x
+
+        r = f([1, 2, 3])
+        assert_equal(r.dtype, np.dtype('float64'))
+        assert_array_equal(r, [1, 2, 3])
+        assert f.__name__ == 'f'
+
+    def test_bad_input(self):
+        with assert_raises(TypeError):
+            A = np.vectorize(pyfunc = 3)
+
+    def test_no_keywords(self):
+        with assert_raises(TypeError):
+            @np.vectorize("string")
+            def foo():
+                return "bar"
+
+    def test_positional_regression_9477(self):
+        # This supplies the first keyword argument as a positional,
+        # to ensure that they are still properly forwarded after the
+        # enhancement for #9477
+        f = vectorize((lambda x: x), ['float64'])
+        r = f([2])
+        assert_equal(r.dtype, np.dtype('float64'))
+
 
 class TestLeaks:
     class A:
@@ -2972,6 +3044,14 @@ class TestPercentile:
         o = np.ones((1,))
         np.percentile(d, 5, None, o, False, 'linear')
 
+    def test_complex(self):
+        arr_c = np.array([0.5+3.0j, 2.1+0.5j, 1.6+2.3j], dtype='G')
+        assert_raises(TypeError, np.percentile, arr_c, 0.5)
+        arr_c = np.array([0.5+3.0j, 2.1+0.5j, 1.6+2.3j], dtype='D')
+        assert_raises(TypeError, np.percentile, arr_c, 0.5)
+        arr_c = np.array([0.5+3.0j, 2.1+0.5j, 1.6+2.3j], dtype='F')
+        assert_raises(TypeError, np.percentile, arr_c, 0.5)
+
     def test_2D(self):
         x = np.array([[1, 1, 1],
                       [1, 1, 1],
@@ -2980,7 +3060,7 @@ class TestPercentile:
                       [1, 1, 1]])
         assert_array_equal(np.percentile(x, 50, axis=0), [1, 1, 1])
 
-    @pytest.mark.parametrize("dtype", np.typecodes["AllFloat"])
+    @pytest.mark.parametrize("dtype", np.typecodes["Float"])
     def test_linear_nan_1D(self, dtype):
         # METHOD 1 of H&F
         arr = np.asarray([15.0, np.NAN, 35.0, 40.0, 50.0], dtype=dtype)
@@ -2997,9 +3077,6 @@ class TestPercentile:
                            (np.float32, np.float32),
                            (np.float64, np.float64),
                            (np.longdouble, np.longdouble),
-                           (np.complex64, np.complex64),
-                           (np.complex128, np.complex128),
-                           (np.clongdouble, np.clongdouble),
                            (np.dtype("O"), np.float64)]
 
     @pytest.mark.parametrize(["input_dtype", "expected_dtype"], H_F_TYPE_CODES)
@@ -3039,7 +3116,7 @@ class TestPercentile:
             np.testing.assert_equal(np.asarray(actual).dtype,
                                     np.dtype(expected_dtype))
 
-    TYPE_CODES = np.typecodes["AllInteger"] + np.typecodes["AllFloat"] + "O"
+    TYPE_CODES = np.typecodes["AllInteger"] + np.typecodes["Float"] + "O"
 
     @pytest.mark.parametrize("dtype", TYPE_CODES)
     def test_lower_higher(self, dtype):
@@ -3331,6 +3408,32 @@ class TestPercentile:
         assert_equal(np.percentile(d, [1, 7], axis=(0, 3),
                                    keepdims=True).shape, (2, 1, 5, 7, 1))
 
+    @pytest.mark.parametrize('q', [7, [1, 7]])
+    @pytest.mark.parametrize(
+        argnames='axis',
+        argvalues=[
+            None,
+            1,
+            (1,),
+            (0, 1),
+            (-3, -1),
+        ]
+    )
+    def test_keepdims_out(self, q, axis):
+        d = np.ones((3, 5, 7, 11))
+        if axis is None:
+            shape_out = (1,) * d.ndim
+        else:
+            axis_norm = normalize_axis_tuple(axis, d.ndim)
+            shape_out = tuple(
+                1 if i in axis_norm else d.shape[i] for i in range(d.ndim))
+        shape_out = np.shape(q) + shape_out
+
+        out = np.empty(shape_out)
+        result = np.percentile(d, q, axis=axis, keepdims=True, out=out)
+        assert result is out
+        assert_equal(result.shape, shape_out)
+
     def test_out(self):
         o = np.zeros((4,))
         d = np.ones((3, 4))
@@ -3435,9 +3538,20 @@ class TestPercentile:
             np.percentile([1, 2, 3, 4.0], q)
 
 
+quantile_methods = [
+    'inverted_cdf', 'averaged_inverted_cdf', 'closest_observation',
+    'interpolated_inverted_cdf', 'hazen', 'weibull', 'linear',
+    'median_unbiased', 'normal_unbiased', 'nearest', 'lower', 'higher',
+    'midpoint']
+
+
 class TestQuantile:
     # most of this is already tested by TestPercentile
 
+    def V(self, x, y, alpha):
+        # Identification function used in several tests.
+        return (x >= y) - alpha
+
     def test_max_ulp(self):
         x = [0.0, 0.2, 0.4]
         a = np.quantile(x, 0.45)
@@ -3452,7 +3566,6 @@ class TestQuantile:
         assert_equal(np.quantile(x, 1), 3.5)
         assert_equal(np.quantile(x, 0.5), 1.75)
 
-    @pytest.mark.xfail(reason="See gh-19154")
     def test_correct_quantile_value(self):
         a = np.array([True])
         tf_quant = np.quantile(True, False)
@@ -3490,6 +3603,15 @@ class TestQuantile:
         x = np.arange(8)
         assert_equal(np.quantile(x, Fraction(1, 2)), Fraction(7, 2))
 
+    def test_complex(self):
+        #See gh-22652
+        arr_c = np.array([0.5+3.0j, 2.1+0.5j, 1.6+2.3j], dtype='G')
+        assert_raises(TypeError, np.quantile, arr_c, 0.5)
+        arr_c = np.array([0.5+3.0j, 2.1+0.5j, 1.6+2.3j], dtype='D')
+        assert_raises(TypeError, np.quantile, arr_c, 0.5)
+        arr_c = np.array([0.5+3.0j, 2.1+0.5j, 1.6+2.3j], dtype='F')
+        assert_raises(TypeError, np.quantile, arr_c, 0.5)
+
     def test_no_p_overwrite(self):
         # this is worth retesting, because quantile does not make a copy
         p0 = np.array([0, 0.75, 0.25, 0.5, 1.0])
@@ -3508,11 +3630,7 @@ class TestQuantile:
                           method="nearest")
         assert res.dtype == dtype
 
-    @pytest.mark.parametrize("method",
-             ['inverted_cdf', 'averaged_inverted_cdf', 'closest_observation',
-              'interpolated_inverted_cdf', 'hazen', 'weibull', 'linear',
-              'median_unbiased', 'normal_unbiased',
-              'nearest', 'lower', 'higher', 'midpoint'])
+    @pytest.mark.parametrize("method", quantile_methods)
     def test_quantile_monotonic(self, method):
         # GH 14685
         # test that the return value of quantile is monotonic if p0 is ordered
@@ -3543,6 +3661,94 @@ class TestQuantile:
         assert np.isscalar(actual)
         assert_equal(np.quantile(a, 0.5), np.nan)
 
+    @pytest.mark.parametrize("method", quantile_methods)
+    @pytest.mark.parametrize("alpha", [0.2, 0.5, 0.9])
+    def test_quantile_identification_equation(self, method, alpha):
+        # Test that the identification equation holds for the empirical
+        # CDF:
+        #   E[V(x, Y)] = 0  <=>  x is quantile
+        # with Y the random variable for which we have observed values and
+        # V(x, y) the canonical identification function for the quantile (at
+        # level alpha), see
+        # https://doi.org/10.48550/arXiv.0912.0902        
+        rng = np.random.default_rng(4321)
+        # We choose n and alpha such that we cover 3 cases:
+        #  - n * alpha is an integer
+        #  - n * alpha is a float that gets rounded down
+        #  - n * alpha is a float that gest rounded up
+        n = 102  # n * alpha = 20.4, 51. , 91.8
+        y = rng.random(n)
+        x = np.quantile(y, alpha, method=method)
+        if method in ("higher",):
+            # These methods do not fulfill the identification equation.
+            assert np.abs(np.mean(self.V(x, y, alpha))) > 0.1 / n
+        elif int(n * alpha) == n * alpha:
+            # We can expect exact results, up to machine precision.
+            assert_allclose(np.mean(self.V(x, y, alpha)), 0, atol=1e-14)
+        else:
+            # V = (x >= y) - alpha cannot sum to zero exactly but within
+            # "sample precision".
+            assert_allclose(np.mean(self.V(x, y, alpha)), 0,
+                atol=1 / n / np.amin([alpha, 1 - alpha]))
+
+    @pytest.mark.parametrize("method", quantile_methods)
+    @pytest.mark.parametrize("alpha", [0.2, 0.5, 0.9])
+    def test_quantile_add_and_multiply_constant(self, method, alpha):
+        # Test that
+        #  1. quantile(c + x) = c + quantile(x)
+        #  2. quantile(c * x) = c * quantile(x)
+        #  3. quantile(-x) = -quantile(x, 1 - alpha)
+        #     On empirical quantiles, this equation does not hold exactly.
+        # Koenker (2005) "Quantile Regression" Chapter 2.2.3 calls these
+        # properties equivariance.
+        rng = np.random.default_rng(4321)
+        # We choose n and alpha such that we have cases for
+        #  - n * alpha is an integer
+        #  - n * alpha is a float that gets rounded down
+        #  - n * alpha is a float that gest rounded up
+        n = 102  # n * alpha = 20.4, 51. , 91.8
+        y = rng.random(n)
+        q = np.quantile(y, alpha, method=method)
+        c = 13.5
+
+        # 1
+        assert_allclose(np.quantile(c + y, alpha, method=method), c + q)
+        # 2
+        assert_allclose(np.quantile(c * y, alpha, method=method), c * q)
+        # 3
+        q = -np.quantile(-y, 1 - alpha, method=method)
+        if method == "inverted_cdf":
+            if (
+                n * alpha == int(n * alpha)
+                or np.round(n * alpha) == int(n * alpha) + 1
+            ):
+                assert_allclose(q, np.quantile(y, alpha, method="higher"))
+            else:
+                assert_allclose(q, np.quantile(y, alpha, method="lower"))
+        elif method == "closest_observation":
+            if n * alpha == int(n * alpha):
+                assert_allclose(q, np.quantile(y, alpha, method="higher"))
+            elif np.round(n * alpha) == int(n * alpha) + 1:
+                assert_allclose(
+                    q, np.quantile(y, alpha + 1/n, method="higher"))
+            else:
+                assert_allclose(q, np.quantile(y, alpha, method="lower"))
+        elif method == "interpolated_inverted_cdf":
+            assert_allclose(q, np.quantile(y, alpha + 1/n, method=method))
+        elif method == "nearest":
+            if n * alpha == int(n * alpha):
+                assert_allclose(q, np.quantile(y, alpha + 1/n, method=method))
+            else:
+                assert_allclose(q, np.quantile(y, alpha, method=method))
+        elif method == "lower":
+            assert_allclose(q, np.quantile(y, alpha, method="higher"))
+        elif method == "higher":
+            assert_allclose(q, np.quantile(y, alpha, method="lower"))
+        else:
+            # "averaged_inverted_cdf", "hazen", "weibull", "linear",
+            # "median_unbiased", "normal_unbiased", "midpoint"
+            assert_allclose(q, np.quantile(y, alpha, method=method))
+
 
 class TestLerp:
     @hypothesis.given(t0=st.floats(allow_nan=False, allow_infinity=False,
@@ -3754,6 +3960,7 @@ class TestMedian:
         b[2] = np.nan
         assert_equal(np.median(a, (0, 2)), b)
 
+    @pytest.mark.skipif(IS_WASM, reason="fp errors don't work correctly")
     def test_empty(self):
         # mean(empty array) emits two warnings: empty slice and divide by 0
         a = np.array([], dtype=float)
@@ -3842,6 +4049,29 @@ class TestMedian:
         assert_equal(np.median(d, axis=(0, 1, 3), keepdims=True).shape,
                      (1, 1, 7, 1))
 
+    @pytest.mark.parametrize(
+        argnames='axis',
+        argvalues=[
+            None,
+            1,
+            (1, ),
+            (0, 1),
+            (-3, -1),
+        ]
+    )
+    def test_keepdims_out(self, axis):
+        d = np.ones((3, 5, 7, 11))
+        if axis is None:
+            shape_out = (1,) * d.ndim
+        else:
+            axis_norm = normalize_axis_tuple(axis, d.ndim)
+            shape_out = tuple(
+                1 if i in axis_norm else d.shape[i] for i in range(d.ndim))
+        out = np.empty(shape_out)
+        result = np.median(d, axis=axis, keepdims=True, out=out)
+        assert result is out
+        assert_equal(result.shape, shape_out)
+
 
 class TestAdd_newdoc_ufunc:
 
diff --git a/numpy/lib/tests/test_histograms.py b/numpy/lib/tests/test_histograms.py
index 87643169b..87e6e1d41 100644
--- a/numpy/lib/tests/test_histograms.py
+++ b/numpy/lib/tests/test_histograms.py
@@ -6,6 +6,7 @@ from numpy.testing import (
     assert_array_almost_equal, assert_raises, assert_allclose,
     assert_array_max_ulp, assert_raises_regex, suppress_warnings,
     )
+from numpy.testing._private.utils import requires_memory
 import pytest
 
 
@@ -397,6 +398,16 @@ class TestHistogram:
         edges = histogram_bin_edges(arr, bins='auto', range=(0, 1))
         assert_array_equal(edges, e)
 
+    @requires_memory(free_bytes=1e10)
+    @pytest.mark.slow
+    def test_big_arrays(self):
+        sample = np.zeros([100000000, 3])
+        xbins = 400
+        ybins = 400
+        zbins = np.arange(16000)
+        hist = np.histogramdd(sample=sample, bins=(xbins, ybins, zbins))
+        assert_equal(type(hist), type((1, 2)))
+
 
 class TestHistogramOptimBinNums:
     """
diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py
index a5462749f..c1032df8e 100644
--- a/numpy/lib/tests/test_io.py
+++ b/numpy/lib/tests/test_io.py
@@ -25,7 +25,7 @@ from numpy.testing import (
     assert_warns, assert_, assert_raises_regex, assert_raises,
     assert_allclose, assert_array_equal, temppath, tempdir, IS_PYPY,
     HAS_REFCOUNT, suppress_warnings, assert_no_gc_cycles, assert_no_warnings,
-    break_cycles
+    break_cycles, IS_WASM
     )
 from numpy.testing._private.utils import requires_memory
 
@@ -232,6 +232,17 @@ class TestSavezLoad(RoundtripTest):
         assert_equal(a, l['file_a'])
         assert_equal(b, l['file_b'])
 
+
+    def test_tuple_getitem_raises(self):
+        # gh-23748
+        a = np.array([1, 2, 3])
+        f = BytesIO()
+        np.savez(f, a=a)
+        f.seek(0)
+        l = np.load(f)
+        with pytest.raises(KeyError, match="(1, 2)"):
+            l[1, 2]
+
     def test_BagObj(self):
         a = np.array([[1, 2], [3, 4]], float)
         b = np.array([[1 + 2j, 2 + 7j], [3 - 6j, 4 + 12j]], complex)
@@ -243,6 +254,7 @@ class TestSavezLoad(RoundtripTest):
         assert_equal(a, l.f.file_a)
         assert_equal(b, l.f.file_b)
 
+    @pytest.mark.skipif(IS_WASM, reason="Cannot start thread")
     def test_savez_filename_clashes(self):
         # Test that issue #852 is fixed
         # and savez functions in multithreaded environment
@@ -320,6 +332,21 @@ class TestSavezLoad(RoundtripTest):
             data.close()
             assert_(fp.closed)
 
+    @pytest.mark.parametrize("count, expected_repr", [
+        (1, "NpzFile {fname!r} with keys: arr_0"),
+        (5, "NpzFile {fname!r} with keys: arr_0, arr_1, arr_2, arr_3, arr_4"),
+        # _MAX_REPR_ARRAY_COUNT is 5, so files with more than 5 keys are
+        # expected to end in '...'
+        (6, "NpzFile {fname!r} with keys: arr_0, arr_1, arr_2, arr_3, arr_4..."),
+    ])
+    def test_repr_lists_keys(self, count, expected_repr):
+        a = np.array([[1, 2], [3, 4]], float)
+        with temppath(suffix='.npz') as tmp:
+            np.savez(tmp, *[a]*count)
+            l = np.load(tmp)
+            assert repr(l) == expected_repr.format(fname=tmp)
+            l.close()
+
 
 class TestSaveTxt:
     def test_array(self):
@@ -521,7 +548,7 @@ class TestSaveTxt:
 
     def test_unicode(self):
         utf8 = b'\xcf\x96'.decode('UTF-8')
-        a = np.array([utf8], dtype=np.unicode_)
+        a = np.array([utf8], dtype=np.str_)
         with tempdir() as tmpdir:
             # set encoding as on windows it may not be unicode even on py3
             np.savetxt(os.path.join(tmpdir, 'test.csv'), a, fmt=['%s'],
@@ -529,7 +556,7 @@ class TestSaveTxt:
 
     def test_unicode_roundtrip(self):
         utf8 = b'\xcf\x96'.decode('UTF-8')
-        a = np.array([utf8], dtype=np.unicode_)
+        a = np.array([utf8], dtype=np.str_)
         # our gz wrapper support encoding
         suffixes = ['', '.gz']
         if HAS_BZ2:
@@ -541,12 +568,12 @@ class TestSaveTxt:
                 np.savetxt(os.path.join(tmpdir, 'test.csv' + suffix), a,
                            fmt=['%s'], encoding='UTF-16-LE')
                 b = np.loadtxt(os.path.join(tmpdir, 'test.csv' + suffix),
-                               encoding='UTF-16-LE', dtype=np.unicode_)
+                               encoding='UTF-16-LE', dtype=np.str_)
                 assert_array_equal(a, b)
 
     def test_unicode_bytestream(self):
         utf8 = b'\xcf\x96'.decode('UTF-8')
-        a = np.array([utf8], dtype=np.unicode_)
+        a = np.array([utf8], dtype=np.str_)
         s = BytesIO()
         np.savetxt(s, a, fmt=['%s'], encoding='UTF-8')
         s.seek(0)
@@ -554,7 +581,7 @@ class TestSaveTxt:
 
     def test_unicode_stringstream(self):
         utf8 = b'\xcf\x96'.decode('UTF-8')
-        a = np.array([utf8], dtype=np.unicode_)
+        a = np.array([utf8], dtype=np.str_)
         s = StringIO()
         np.savetxt(s, a, fmt=['%s'], encoding='UTF-8')
         s.seek(0)
@@ -596,8 +623,8 @@ class TestSaveTxt:
         # in our process if needed, see gh-16889
         memoryerror_raised = Value(c_bool)
 
-        # Since Python 3.8, the default start method for multiprocessing has 
-        # been changed from 'fork' to 'spawn' on macOS, causing inconsistency 
+        # Since Python 3.8, the default start method for multiprocessing has
+        # been changed from 'fork' to 'spawn' on macOS, causing inconsistency
         # on memory sharing model, lead to failed test for check_large_zip
         ctx = get_context('fork')
         p = ctx.Process(target=check_large_zip, args=(memoryerror_raised,))
@@ -651,12 +678,12 @@ class LoadTxtBase:
         with temppath() as path:
             with open(path, "wb") as f:
                 f.write(nonascii.encode("UTF-16"))
-            x = self.loadfunc(path, encoding="UTF-16", dtype=np.unicode_)
+            x = self.loadfunc(path, encoding="UTF-16", dtype=np.str_)
             assert_array_equal(x, nonascii)
 
     def test_binary_decode(self):
         utf16 = b'\xff\xfeh\x04 \x00i\x04 \x00j\x04'
-        v = self.loadfunc(BytesIO(utf16), dtype=np.unicode_, encoding='UTF-16')
+        v = self.loadfunc(BytesIO(utf16), dtype=np.str_, encoding='UTF-16')
         assert_array_equal(v, np.array(utf16.decode('UTF-16').split()))
 
     def test_converters_decode(self):
@@ -664,7 +691,7 @@ class LoadTxtBase:
         c = TextIO()
         c.write(b'\xcf\x96')
         c.seek(0)
-        x = self.loadfunc(c, dtype=np.unicode_,
+        x = self.loadfunc(c, dtype=np.str_,
                           converters={0: lambda x: x.decode('UTF-8')})
         a = np.array([b'\xcf\x96'.decode('UTF-8')])
         assert_array_equal(x, a)
@@ -675,7 +702,7 @@ class LoadTxtBase:
         with temppath() as path:
             with io.open(path, 'wt', encoding='UTF-8') as f:
                 f.write(utf8)
-            x = self.loadfunc(path, dtype=np.unicode_,
+            x = self.loadfunc(path, dtype=np.str_,
                               converters={0: lambda x: x + 't'},
                               encoding='UTF-8')
             a = np.array([utf8 + 't'])
@@ -1160,7 +1187,7 @@ class TestLoadTxt(LoadTxtBase):
             with open(path, "wb") as f:
                 f.write(butf8)
             with open(path, "rb") as f:
-                x = np.loadtxt(f, encoding="UTF-8", dtype=np.unicode_)
+                x = np.loadtxt(f, encoding="UTF-8", dtype=np.str_)
             assert_array_equal(x, sutf8)
             # test broken latin1 conversion people now rely on
             with open(path, "rb") as f:
@@ -2218,7 +2245,7 @@ M   33  21.99
             ctl = np.array([
                      ["test1", "testNonethe" + utf8.decode("UTF-8"), "test3"],
                      ["test1", "testNonethe" + utf8.decode("UTF-8"), "test3"]],
-                     dtype=np.unicode_)
+                     dtype=np.str_)
             assert_array_equal(test, ctl)
 
             # test a mixed dtype
@@ -2261,7 +2288,7 @@ M   33  21.99
                      ["norm1", "norm2", "norm3"],
                      ["norm1", latin1, "norm3"],
                      ["test1", "testNonethe" + utf8, "test3"]],
-                     dtype=np.unicode_)
+                     dtype=np.str_)
             assert_array_equal(test, ctl)
 
     def test_recfromtxt(self):
@@ -2539,6 +2566,7 @@ class TestPathUsage:
                 break_cycles()
                 break_cycles()
 
+    @pytest.mark.xfail(IS_WASM, reason="memmap doesn't work correctly")
     def test_save_load_memmap_readwrite(self):
         # Test that pathlib.Path instances can be written mem-mapped.
         with temppath(suffix='.npy') as path:
@@ -2735,3 +2763,13 @@ def test_load_refcount():
     with assert_no_gc_cycles():
         x = np.loadtxt(TextIO("0 1 2 3"), dtype=dt)
         assert_equal(x, np.array([((0, 1), (2, 3))], dtype=dt))
+
+def test_load_multiple_arrays_until_eof():
+    f = BytesIO()
+    np.save(f, 1)
+    np.save(f, 2)
+    f.seek(0)
+    assert np.load(f) == 1
+    assert np.load(f) == 2
+    with pytest.raises(EOFError):
+        np.load(f)
diff --git a/numpy/lib/tests/test_loadtxt.py b/numpy/lib/tests/test_loadtxt.py
index 0b8fe3c47..2d805e434 100644
--- a/numpy/lib/tests/test_loadtxt.py
+++ b/numpy/lib/tests/test_loadtxt.py
@@ -244,6 +244,14 @@ def test_converters_negative_indices_with_usecols():
                      usecols=[0, -1], converters={-1: (lambda x: -1)})
     assert_array_equal(res, [[0, -1], [0, -1]])
 
+
+def test_ragged_error():
+    rows = ["1,2,3", "1,2,3", "4,3,2,1"]
+    with pytest.raises(ValueError,
+            match="the number of columns changed from 3 to 4 at row 3"):
+        np.loadtxt(rows, delimiter=",")
+
+
 def test_ragged_usecols():
     # usecols, and negative ones, work even with varying number of columns.
     txt = StringIO("0,0,XXX\n0,XXX,0,XXX\n0,XXX,XXX,0,XXX\n")
@@ -534,12 +542,27 @@ def test_quoted_field(q):
     assert_array_equal(res, expected)
 
 
+@pytest.mark.parametrize("q", ('"', "'", "`"))
+def test_quoted_field_with_whitepace_delimiter(q):
+    txt = StringIO(
+        f"{q}alpha, x{q}     2.5\n{q}beta, y{q} 4.5\n{q}gamma, z{q}   5.0\n"
+    )
+    dtype = np.dtype([('f0', 'U8'), ('f1', np.float64)])
+    expected = np.array(
+        [("alpha, x", 2.5), ("beta, y", 4.5), ("gamma, z", 5.0)], dtype=dtype
+    )
+
+    res = np.loadtxt(txt, dtype=dtype, delimiter=None, quotechar=q)
+    assert_array_equal(res, expected)
+
+
 def test_quote_support_default():
     """Support for quoted fields is disabled by default."""
     txt = StringIO('"lat,long", 45, 30\n')
     dtype = np.dtype([('f0', 'U24'), ('f1', np.float64), ('f2', np.float64)])
 
-    with pytest.raises(ValueError, match="the number of columns changed"):
+    with pytest.raises(ValueError,
+            match="the dtype passed requires 3 columns but 4 were"):
         np.loadtxt(txt, dtype=dtype, delimiter=",")
 
     # Enable quoting support with non-None value for quotechar param
@@ -1011,3 +1034,15 @@ def test_control_characters_as_bytes():
     """Byte control characters (comments, delimiter) are supported."""
     a = np.loadtxt(StringIO("#header\n1,2,3"), comments=b"#", delimiter=b",")
     assert_equal(a, [1, 2, 3])
+
+
+@pytest.mark.filterwarnings('ignore::UserWarning')
+def test_field_growing_cases():
+    # Test empty field appending/growing (each field still takes 1 character)
+    # to see if the final field appending does not create issues.
+    res = np.loadtxt([""], delimiter=",", dtype=bytes)
+    assert len(res) == 0
+
+    for i in range(1, 1024):
+        res = np.loadtxt(["," * i], delimiter=",", dtype=bytes)
+        assert len(res) == i+1
diff --git a/numpy/lib/tests/test_nanfunctions.py b/numpy/lib/tests/test_nanfunctions.py
index 733a077ea..257de381b 100644
--- a/numpy/lib/tests/test_nanfunctions.py
+++ b/numpy/lib/tests/test_nanfunctions.py
@@ -3,6 +3,7 @@ import pytest
 import inspect
 
 import numpy as np
+from numpy.core.numeric import normalize_axis_tuple
 from numpy.lib.nanfunctions import _nan_mask, _replace_nan
 from numpy.testing import (
     assert_, assert_equal, assert_almost_equal, assert_raises,
@@ -403,14 +404,20 @@ class TestNanFunctions_NumberTypes:
     )
     def test_nanfunc_q(self, mat, dtype, nanfunc, func):
         mat = mat.astype(dtype)
-        tgt = func(mat, q=1)
-        out = nanfunc(mat, q=1)
+        if mat.dtype.kind == "c":
+            assert_raises(TypeError, func, mat, q=1)
+            assert_raises(TypeError, nanfunc, mat, q=1)
 
-        assert_almost_equal(out, tgt)
-        if dtype == "O":
-            assert type(out) is type(tgt)
         else:
-            assert out.dtype == tgt.dtype
+            tgt = func(mat, q=1)
+            out = nanfunc(mat, q=1)
+
+            assert_almost_equal(out, tgt)
+
+            if dtype == "O":
+                assert type(out) is type(tgt)
+            else:
+                assert out.dtype == tgt.dtype
 
     @pytest.mark.parametrize(
         "nanfunc,func",
@@ -807,6 +814,34 @@ class TestNanFunctions_Median:
             res = np.nanmedian(d, axis=(0, 1, 3), keepdims=True)
             assert_equal(res.shape, (1, 1, 7, 1))
 
+    @pytest.mark.parametrize(
+        argnames='axis',
+        argvalues=[
+            None,
+            1,
+            (1, ),
+            (0, 1),
+            (-3, -1),
+        ]
+    )
+    @pytest.mark.filterwarnings("ignore:All-NaN slice:RuntimeWarning")
+    def test_keepdims_out(self, axis):
+        d = np.ones((3, 5, 7, 11))
+        # Randomly set some elements to NaN:
+        w = np.random.random((4, 200)) * np.array(d.shape)[:, None]
+        w = w.astype(np.intp)
+        d[tuple(w)] = np.nan
+        if axis is None:
+            shape_out = (1,) * d.ndim
+        else:
+            axis_norm = normalize_axis_tuple(axis, d.ndim)
+            shape_out = tuple(
+                1 if i in axis_norm else d.shape[i] for i in range(d.ndim))
+        out = np.empty(shape_out)
+        result = np.nanmedian(d, axis=axis, keepdims=True, out=out)
+        assert result is out
+        assert_equal(result.shape, shape_out)
+
     def test_out(self):
         mat = np.random.rand(3, 3)
         nan_mat = np.insert(mat, [0, 2], np.nan, axis=1)
@@ -982,6 +1017,37 @@ class TestNanFunctions_Percentile:
             res = np.nanpercentile(d, 90, axis=(0, 1, 3), keepdims=True)
             assert_equal(res.shape, (1, 1, 7, 1))
 
+    @pytest.mark.parametrize('q', [7, [1, 7]])
+    @pytest.mark.parametrize(
+        argnames='axis',
+        argvalues=[
+            None,
+            1,
+            (1,),
+            (0, 1),
+            (-3, -1),
+        ]
+    )
+    @pytest.mark.filterwarnings("ignore:All-NaN slice:RuntimeWarning")
+    def test_keepdims_out(self, q, axis):
+        d = np.ones((3, 5, 7, 11))
+        # Randomly set some elements to NaN:
+        w = np.random.random((4, 200)) * np.array(d.shape)[:, None]
+        w = w.astype(np.intp)
+        d[tuple(w)] = np.nan
+        if axis is None:
+            shape_out = (1,) * d.ndim
+        else:
+            axis_norm = normalize_axis_tuple(axis, d.ndim)
+            shape_out = tuple(
+                1 if i in axis_norm else d.shape[i] for i in range(d.ndim))
+        shape_out = np.shape(q) + shape_out
+
+        out = np.empty(shape_out)
+        result = np.nanpercentile(d, q, axis=axis, keepdims=True, out=out)
+        assert result is out
+        assert_equal(result.shape, shape_out)
+
     def test_out(self):
         mat = np.random.rand(3, 3)
         nan_mat = np.insert(mat, [0, 2], np.nan, axis=1)
@@ -1000,6 +1066,14 @@ class TestNanFunctions_Percentile:
         assert_almost_equal(res, resout)
         assert_almost_equal(res, tgt)
 
+    def test_complex(self):
+        arr_c = np.array([0.5+3.0j, 2.1+0.5j, 1.6+2.3j], dtype='G')
+        assert_raises(TypeError, np.nanpercentile, arr_c, 0.5)
+        arr_c = np.array([0.5+3.0j, 2.1+0.5j, 1.6+2.3j], dtype='D')
+        assert_raises(TypeError, np.nanpercentile, arr_c, 0.5)
+        arr_c = np.array([0.5+3.0j, 2.1+0.5j, 1.6+2.3j], dtype='F')
+        assert_raises(TypeError, np.nanpercentile, arr_c, 0.5)
+
     def test_result_values(self):
         tgt = [np.percentile(d, 28) for d in _rdat]
         res = np.nanpercentile(_ndat, 28, axis=1)
@@ -1010,7 +1084,7 @@ class TestNanFunctions_Percentile:
         assert_almost_equal(res, tgt)
 
     @pytest.mark.parametrize("axis", [None, 0, 1])
-    @pytest.mark.parametrize("dtype", np.typecodes["AllFloat"])
+    @pytest.mark.parametrize("dtype", np.typecodes["Float"])
     @pytest.mark.parametrize("array", [
         np.array(np.nan),
         np.full((3, 3), np.nan),
@@ -1104,6 +1178,14 @@ class TestNanFunctions_Quantile:
         assert_equal(np.nanquantile(x, 1), 3.5)
         assert_equal(np.nanquantile(x, 0.5), 1.75)
 
+    def test_complex(self):
+        arr_c = np.array([0.5+3.0j, 2.1+0.5j, 1.6+2.3j], dtype='G')
+        assert_raises(TypeError, np.nanquantile, arr_c, 0.5)
+        arr_c = np.array([0.5+3.0j, 2.1+0.5j, 1.6+2.3j], dtype='D')
+        assert_raises(TypeError, np.nanquantile, arr_c, 0.5)
+        arr_c = np.array([0.5+3.0j, 2.1+0.5j, 1.6+2.3j], dtype='F')
+        assert_raises(TypeError, np.nanquantile, arr_c, 0.5)
+
     def test_no_p_overwrite(self):
         # this is worth retesting, because quantile does not make a copy
         p0 = np.array([0, 0.75, 0.25, 0.5, 1.0])
@@ -1117,7 +1199,7 @@ class TestNanFunctions_Quantile:
         assert_array_equal(p, p0)
 
     @pytest.mark.parametrize("axis", [None, 0, 1])
-    @pytest.mark.parametrize("dtype", np.typecodes["AllFloat"])
+    @pytest.mark.parametrize("dtype", np.typecodes["Float"])
     @pytest.mark.parametrize("array", [
         np.array(np.nan),
         np.full((3, 3), np.nan),
diff --git a/numpy/lib/tests/test_shape_base.py b/numpy/lib/tests/test_shape_base.py
index 76058cf20..eb6628904 100644
--- a/numpy/lib/tests/test_shape_base.py
+++ b/numpy/lib/tests/test_shape_base.py
@@ -492,7 +492,7 @@ class TestColumnStack:
         assert_equal(actual, expected)
 
     def test_generator(self):
-        with assert_warns(FutureWarning):
+        with pytest.raises(TypeError, match="arrays to stack must be"):
             column_stack((np.arange(3) for _ in range(2)))
 
 
@@ -529,7 +529,7 @@ class TestDstack:
         assert_array_equal(res, desired)
 
     def test_generator(self):
-        with assert_warns(FutureWarning):
+        with pytest.raises(TypeError, match="arrays to stack must be"):
             dstack((np.arange(3) for _ in range(2)))
 
 
diff --git a/numpy/lib/tests/test_twodim_base.py b/numpy/lib/tests/test_twodim_base.py
index 141f508fd..eb008c600 100644
--- a/numpy/lib/tests/test_twodim_base.py
+++ b/numpy/lib/tests/test_twodim_base.py
@@ -4,20 +4,14 @@
 from numpy.testing import (
     assert_equal, assert_array_equal, assert_array_max_ulp,
     assert_array_almost_equal, assert_raises, assert_
-    )
-
+)
 from numpy import (
     arange, add, fliplr, flipud, zeros, ones, eye, array, diag, histogram2d,
     tri, mask_indices, triu_indices, triu_indices_from, tril_indices,
     tril_indices_from, vander,
-    )
-
+)
 import numpy as np
 
-
-from numpy.core.tests.test_overrides import requires_array_function
-
-
 import pytest
 
 
@@ -283,7 +277,6 @@ class TestHistogram2d:
         assert_array_equal(H, answer)
         assert_array_equal(xe, array([0., 0.25, 0.5, 0.75, 1]))
 
-    @requires_array_function
     def test_dispatch(self):
         class ShouldDispatch:
             def __array_function__(self, function, types, args, kwargs):
diff --git a/numpy/lib/tests/test_type_check.py b/numpy/lib/tests/test_type_check.py
index 3f4ca6309..ea0326139 100644
--- a/numpy/lib/tests/test_type_check.py
+++ b/numpy/lib/tests/test_type_check.py
@@ -155,7 +155,7 @@ class TestIscomplex:
     def test_fail(self):
         z = np.array([-1, 0, 1])
         res = iscomplex(z)
-        assert_(not np.sometrue(res, axis=0))
+        assert_(not np.any(res, axis=0))
 
     def test_pass(self):
         z = np.array([-1j, 1, 0])
diff --git a/numpy/lib/tests/test_ufunclike.py b/numpy/lib/tests/test_ufunclike.py
index c280b6969..fac4f41d0 100644
--- a/numpy/lib/tests/test_ufunclike.py
+++ b/numpy/lib/tests/test_ufunclike.py
@@ -80,12 +80,6 @@ class TestUfunclike:
         assert_(isinstance(f0d, MyArray))
         assert_equal(f0d.metadata, 'bar')
 
-    def test_deprecated(self):
-        # NumPy 1.13.0, 2017-04-26
-        assert_warns(DeprecationWarning, ufl.fix, [1, 2], y=nx.empty(2))
-        assert_warns(DeprecationWarning, ufl.isposinf, [1, 2], y=nx.empty(2))
-        assert_warns(DeprecationWarning, ufl.isneginf, [1, 2], y=nx.empty(2))
-
     def test_scalar(self):
         x = np.inf
         actual = np.isposinf(x)
diff --git a/numpy/lib/twodim_base.py b/numpy/lib/twodim_base.py
index 654ee4cf5..6dcb65651 100644
--- a/numpy/lib/twodim_base.py
+++ b/numpy/lib/twodim_base.py
@@ -155,10 +155,6 @@ def flipud(m):
     return m[::-1, ...]
 
 
-def _eye_dispatcher(N, M=None, k=None, dtype=None, order=None, *, like=None):
-    return (like,)
-
-
 @set_array_function_like_doc
 @set_module('numpy')
 def eye(N, M=None, k=0, dtype=float, order='C', *, like=None):
@@ -209,7 +205,7 @@ def eye(N, M=None, k=0, dtype=float, order='C', *, like=None):
 
     """
     if like is not None:
-        return _eye_with_like(N, M=M, k=k, dtype=dtype, order=order, like=like)
+        return _eye_with_like(like, N, M=M, k=k, dtype=dtype, order=order)
     if M is None:
         M = N
     m = zeros((N, M), dtype=dtype, order=order)
@@ -228,9 +224,7 @@ def eye(N, M=None, k=0, dtype=float, order='C', *, like=None):
     return m
 
 
-_eye_with_like = array_function_dispatch(
-    _eye_dispatcher
-)(eye)
+_eye_with_like = array_function_dispatch()(eye)
 
 
 def _diag_dispatcher(v, k=None):
@@ -369,10 +363,6 @@ def diagflat(v, k=0):
     return wrap(res)
 
 
-def _tri_dispatcher(N, M=None, k=None, dtype=None, *, like=None):
-    return (like,)
-
-
 @set_array_function_like_doc
 @set_module('numpy')
 def tri(N, M=None, k=0, dtype=float, *, like=None):
@@ -416,7 +406,7 @@ def tri(N, M=None, k=0, dtype=float, *, like=None):
 
     """
     if like is not None:
-        return _tri_with_like(N, M=M, k=k, dtype=dtype, like=like)
+        return _tri_with_like(like, N, M=M, k=k, dtype=dtype)
 
     if M is None:
         M = N
@@ -430,9 +420,7 @@ def tri(N, M=None, k=0, dtype=float, *, like=None):
     return m
 
 
-_tri_with_like = array_function_dispatch(
-    _tri_dispatcher
-)(tri)
+_tri_with_like = array_function_dispatch()(tri)
 
 
 def _trilu_dispatcher(m, k=None):
@@ -766,7 +754,7 @@ def histogram2d(x, y, bins=10, range=None, density=None, weights=None):
     >>> xcenters = (xedges[:-1] + xedges[1:]) / 2
     >>> ycenters = (yedges[:-1] + yedges[1:]) / 2
     >>> im.set_data(xcenters, ycenters, H)
-    >>> ax.images.append(im)
+    >>> ax.add_image(im)
     >>> plt.show()
 
     It is also possible to construct a 2-D histogram without specifying bin
@@ -995,9 +983,42 @@ def tril_indices_from(arr, k=0):
     k : int, optional
         Diagonal offset (see `tril` for details).
 
+    Examples
+    --------
+
+    Create a 4 by 4 array.
+
+    >>> a = np.arange(16).reshape(4, 4)
+    >>> a
+    array([[ 0,  1,  2,  3],
+           [ 4,  5,  6,  7],
+           [ 8,  9, 10, 11],
+           [12, 13, 14, 15]])
+
+    Pass the array to get the indices of the lower triangular elements.
+
+    >>> trili = np.tril_indices_from(a)
+    >>> trili
+    (array([0, 1, 1, 2, 2, 2, 3, 3, 3, 3]), array([0, 0, 1, 0, 1, 2, 0, 1, 2, 3]))
+
+    >>> a[trili]
+    array([ 0,  4,  5,  8,  9, 10, 12, 13, 14, 15])
+
+    This is syntactic sugar for tril_indices().
+
+    >>> np.tril_indices(a.shape[0])
+    (array([0, 1, 1, 2, 2, 2, 3, 3, 3, 3]), array([0, 0, 1, 0, 1, 2, 0, 1, 2, 3]))
+
+    Use the `k` parameter to return the indices for the lower triangular array
+    up to the k-th diagonal.
+
+    >>> trili1 = np.tril_indices_from(a, k=1)
+    >>> a[trili1]
+    array([ 0,  1,  4,  5,  6,  8,  9, 10, 11, 12, 13, 14, 15])
+
     See Also
     --------
-    tril_indices, tril
+    tril_indices, tril, triu_indices_from
 
     Notes
     -----
@@ -1114,9 +1135,43 @@ def triu_indices_from(arr, k=0):
     triu_indices_from : tuple, shape(2) of ndarray, shape(N)
         Indices for the upper-triangle of `arr`.
 
+    Examples
+    --------
+
+    Create a 4 by 4 array.
+
+    >>> a = np.arange(16).reshape(4, 4)
+    >>> a
+    array([[ 0,  1,  2,  3],
+           [ 4,  5,  6,  7],
+           [ 8,  9, 10, 11],
+           [12, 13, 14, 15]])
+
+    Pass the array to get the indices of the upper triangular elements.
+
+    >>> triui = np.triu_indices_from(a)
+    >>> triui
+    (array([0, 0, 0, 0, 1, 1, 1, 2, 2, 3]), array([0, 1, 2, 3, 1, 2, 3, 2, 3, 3]))
+
+    >>> a[triui]
+    array([ 0,  1,  2,  3,  5,  6,  7, 10, 11, 15])
+
+    This is syntactic sugar for triu_indices().
+
+    >>> np.triu_indices(a.shape[0])
+    (array([0, 0, 0, 0, 1, 1, 1, 2, 2, 3]), array([0, 1, 2, 3, 1, 2, 3, 2, 3, 3]))
+
+    Use the `k` parameter to return the indices for the upper triangular array
+    from the k-th diagonal.
+
+    >>> triuim1 = np.triu_indices_from(a, k=1)
+    >>> a[triuim1]
+    array([ 1,  2,  3,  6,  7, 11])
+
+
     See Also
     --------
-    triu_indices, triu
+    triu_indices, triu, tril_indices_from
 
     Notes
     -----
diff --git a/numpy/lib/type_check.py b/numpy/lib/type_check.py
index 94d525f51..3f84b80e5 100644
--- a/numpy/lib/type_check.py
+++ b/numpy/lib/type_check.py
@@ -2,17 +2,16 @@
 
 """
 import functools
-import warnings
 
 __all__ = ['iscomplexobj', 'isrealobj', 'imag', 'iscomplex',
            'isreal', 'nan_to_num', 'real', 'real_if_close',
            'typename', 'asfarray', 'mintypecode',
            'common_type']
 
+from .._utils import set_module
 import numpy.core.numeric as _nx
 from numpy.core.numeric import asarray, asanyarray, isnan, zeros
-from numpy.core.overrides import set_module
-from numpy.core import overrides
+from numpy.core import overrides, getlimits
 from .ufunclike import isneginf, isposinf
 
 
@@ -541,7 +540,8 @@ def real_if_close(a, tol=100):
         Input array.
     tol : float
         Tolerance in machine epsilons for the complex part of the elements
-        in the array.
+        in the array. If the tolerance is <=1, then the absolute tolerance
+        is used.
 
     Returns
     -------
@@ -572,11 +572,11 @@ def real_if_close(a, tol=100):
 
     """
     a = asanyarray(a)
-    if not issubclass(a.dtype.type, _nx.complexfloating):
+    type_ = a.dtype.type
+    if not issubclass(type_, _nx.complexfloating):
         return a
     if tol > 1:
-        from numpy.core import getlimits
-        f = getlimits.finfo(a.dtype.type)
+        f = getlimits.finfo(type_)
         tol = f.eps * tol
     if _nx.all(_nx.absolute(a.imag) < tol):
         a = a.real
diff --git a/numpy/lib/ufunclike.py b/numpy/lib/ufunclike.py
index a93c4773b..05fe60c5b 100644
--- a/numpy/lib/ufunclike.py
+++ b/numpy/lib/ufunclike.py
@@ -6,72 +6,16 @@ storing results in an output array.
 __all__ = ['fix', 'isneginf', 'isposinf']
 
 import numpy.core.numeric as nx
-from numpy.core.overrides import (
-    array_function_dispatch, ARRAY_FUNCTION_ENABLED,
-)
+from numpy.core.overrides import array_function_dispatch
 import warnings
 import functools
 
 
-def _deprecate_out_named_y(f):
-    """
-    Allow the out argument to be passed as the name `y` (deprecated)
-
-    In future, this decorator should be removed.
-    """
-    @functools.wraps(f)
-    def func(x, out=None, **kwargs):
-        if 'y' in kwargs:
-            if 'out' in kwargs:
-                raise TypeError(
-                    "{} got multiple values for argument 'out'/'y'"
-                    .format(f.__name__)
-                )
-            out = kwargs.pop('y')
-            # NumPy 1.13.0, 2017-04-26
-            warnings.warn(
-                "The name of the out argument to {} has changed from `y` to "
-                "`out`, to match other ufuncs.".format(f.__name__),
-                DeprecationWarning, stacklevel=3)
-        return f(x, out=out, **kwargs)
-
-    return func
-
-
-def _fix_out_named_y(f):
-    """
-    Allow the out argument to be passed as the name `y` (deprecated)
-
-    This decorator should only be used if _deprecate_out_named_y is used on
-    a corresponding dispatcher function.
-    """
-    @functools.wraps(f)
-    def func(x, out=None, **kwargs):
-        if 'y' in kwargs:
-            # we already did error checking in _deprecate_out_named_y
-            out = kwargs.pop('y')
-        return f(x, out=out, **kwargs)
-
-    return func
-
-
-def _fix_and_maybe_deprecate_out_named_y(f):
-    """
-    Use the appropriate decorator, depending upon if dispatching is being used.
-    """
-    if ARRAY_FUNCTION_ENABLED:
-        return _fix_out_named_y(f)
-    else:
-        return _deprecate_out_named_y(f)
-
-
-@_deprecate_out_named_y
 def _dispatcher(x, out=None):
     return (x, out)
 
 
 @array_function_dispatch(_dispatcher, verify=False, module='numpy')
-@_fix_and_maybe_deprecate_out_named_y
 def fix(x, out=None):
     """
     Round to nearest integer towards zero.
@@ -125,7 +69,6 @@ def fix(x, out=None):
 
 
 @array_function_dispatch(_dispatcher, verify=False, module='numpy')
-@_fix_and_maybe_deprecate_out_named_y
 def isposinf(x, out=None):
     """
     Test element-wise for positive infinity, return result as bool array.
@@ -197,7 +140,6 @@ def isposinf(x, out=None):
 
 
 @array_function_dispatch(_dispatcher, verify=False, module='numpy')
-@_fix_and_maybe_deprecate_out_named_y
 def isneginf(x, out=None):
     """
     Test element-wise for negative infinity, return result as bool array.
diff --git a/numpy/lib/utils.py b/numpy/lib/utils.py
index afde8cc60..095c914db 100644
--- a/numpy/lib/utils.py
+++ b/numpy/lib/utils.py
@@ -5,9 +5,10 @@ import types
 import re
 import warnings
 import functools
+import platform
 
+from .._utils import set_module
 from numpy.core.numerictypes import issubclass_, issubsctype, issubdtype
-from numpy.core.overrides import set_module
 from numpy.core import ndarray, ufunc, asarray
 import numpy as np
 
@@ -24,6 +25,8 @@ def show_runtime():
     including available intrinsic support and BLAS/LAPACK library
     in use
 
+    .. versionadded:: 1.24.0
+
     See Also
     --------
     show_config : Show libraries in the system on which NumPy was built.
@@ -31,45 +34,20 @@ def show_runtime():
     Notes
     -----
     1. Information is derived with the help of `threadpoolctl <https://pypi.org/project/threadpoolctl/>`_
-       library.
+       library if available.
     2. SIMD related information is derived from ``__cpu_features__``,
        ``__cpu_baseline__`` and ``__cpu_dispatch__``
 
-    Examples
-    --------
-    >>> import numpy as np
-    >>> np.show_runtime()
-    [{'simd_extensions': {'baseline': ['SSE', 'SSE2', 'SSE3'],
-                          'found': ['SSSE3',
-                                    'SSE41',
-                                    'POPCNT',
-                                    'SSE42',
-                                    'AVX',
-                                    'F16C',
-                                    'FMA3',
-                                    'AVX2'],
-                          'not_found': ['AVX512F',
-                                        'AVX512CD',
-                                        'AVX512_KNL',
-                                        'AVX512_KNM',
-                                        'AVX512_SKX',
-                                        'AVX512_CLX',
-                                        'AVX512_CNL',
-                                        'AVX512_ICL']}},
-     {'architecture': 'Zen',
-      'filepath': '/usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so',
-      'internal_api': 'openblas',
-      'num_threads': 12,
-      'prefix': 'libopenblas',
-      'threading_layer': 'pthreads',
-      'user_api': 'blas',
-      'version': '0.3.20'}]
     """
     from numpy.core._multiarray_umath import (
         __cpu_features__, __cpu_baseline__, __cpu_dispatch__
     )
     from pprint import pprint
-    config_found = []
+    config_found = [{
+        "numpy_version": np.__version__,
+        "python": sys.version,
+        "uname": platform.uname(),
+        }]
     features_found, features_not_found = [], []
     for feature in __cpu_dispatch__:
         if __cpu_features__[feature]:
@@ -550,15 +528,16 @@ def _info(obj, output=None):
 @set_module('numpy')
 def info(object=None, maxwidth=76, output=None, toplevel='numpy'):
     """
-    Get help information for a function, class, or module.
+    Get help information for an array, function, class, or module.
 
     Parameters
     ----------
     object : object or str, optional
-        Input object or name to get information about. If `object` is a
-        numpy object, its docstring is given. If it is a string, available
-        modules are searched for matching objects.  If None, information
-        about `info` itself is returned.
+        Input object or name to get information about. If `object` is
+        an `ndarray` instance, information about the array is printed.
+        If `object` is a numpy object, its docstring is given. If it is
+        a string, available modules are searched for matching objects.
+        If None, information about `info` itself is returned.
     maxwidth : int, optional
         Printing width.
     output : file like object, optional
@@ -597,6 +576,22 @@ def info(object=None, maxwidth=76, output=None, toplevel='numpy'):
          *** Repeat reference found in numpy.fft.fftpack ***
          *** Total of 3 references found. ***
 
+    When the argument is an array, information about the array is printed.
+
+    >>> a = np.array([[1 + 2j, 3, -4], [-5j, 6, 0]], dtype=np.complex64)
+    >>> np.info(a)
+    class:  ndarray
+    shape:  (2, 3)
+    strides:  (24, 8)
+    itemsize:  8
+    aligned:  True
+    contiguous:  True
+    fortran:  False
+    data pointer: 0x562b6e0d2860  # may vary
+    byteorder:  little
+    byteswap:  False
+    type: complex64
+
     """
     global _namedict, _dictlist
     # Local import to speed up numpy's import time.
diff --git a/numpy/lib/utils.pyi b/numpy/lib/utils.pyi
index 407ce1120..52ca92774 100644
--- a/numpy/lib/utils.pyi
+++ b/numpy/lib/utils.pyi
@@ -87,3 +87,5 @@ def lookfor(
 ) -> None: ...
 
 def safe_eval(source: str | AST) -> Any: ...
+
+def show_runtime() -> None: ...
diff --git a/numpy/linalg/lapack_lite/clapack_scrub.py b/numpy/linalg/lapack_lite/clapack_scrub.py
index fffd70910..d4da9070d 100644
--- a/numpy/linalg/lapack_lite/clapack_scrub.py
+++ b/numpy/linalg/lapack_lite/clapack_scrub.py
@@ -297,7 +297,7 @@ def scrubSource(source, nsteps=None, verbose=False):
 if __name__ == '__main__':
     filename = sys.argv[1]
     outfilename = os.path.join(sys.argv[2], os.path.basename(filename))
-    with open(filename, 'r') as fo:
+    with open(filename) as fo:
         source = fo.read()
 
     if len(sys.argv) > 3:
diff --git a/numpy/linalg/lapack_lite/f2c.c b/numpy/linalg/lapack_lite/f2c.c
index 869fce5d3..ddaa8d172 100644
--- a/numpy/linalg/lapack_lite/f2c.c
+++ b/numpy/linalg/lapack_lite/f2c.c
@@ -7,10 +7,14 @@
   it is available, and shipping a static library isn't portable.
 */
 
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+
 #include "f2c.h"
 
 
diff --git a/numpy/linalg/lapack_lite/f2c.h b/numpy/linalg/lapack_lite/f2c.h
index b44aaac44..0db1b9ecb 100644
--- a/numpy/linalg/lapack_lite/f2c.h
+++ b/numpy/linalg/lapack_lite/f2c.h
@@ -7,6 +7,9 @@
 #ifndef F2C_INCLUDE
 #define F2C_INCLUDE
 
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+
 #include <math.h>
 #include "numpy/npy_common.h"
 #include "npy_cblas.h"
diff --git a/numpy/linalg/lapack_lite/f2c_blas.c b/numpy/linalg/lapack_lite/f2c_blas.c
index 65286892f..9a9fd3f9b 100644
--- a/numpy/linalg/lapack_lite/f2c_blas.c
+++ b/numpy/linalg/lapack_lite/f2c_blas.c
@@ -380,7 +380,6 @@ L20:
     static logical nota, notb;
     static complex temp;
     static logical conja, conjb;
-    static integer ncola;
     extern logical lsame_(char *, char *);
     static integer nrowa, nrowb;
     extern /* Subroutine */ int xerbla_(char *, integer *);
@@ -514,7 +513,7 @@ L20:
        Set  NOTA  and  NOTB  as  true if  A  and  B  respectively are not
        conjugated or transposed, set  CONJA and CONJB  as true if  A  and
        B  respectively are to be  transposed but  not conjugated  and set
-       NROWA, NCOLA and  NROWB  as the number of rows and  columns  of  A
+       NROWA and  NROWB  as the number of rows and  columns  of  A
        and the number of rows of  B  respectively.
 */
 
@@ -536,10 +535,8 @@ L20:
     conjb = lsame_(transb, "C");
     if (nota) {
 	nrowa = *m;
-	ncola = *k;
     } else {
 	nrowa = *k;
-	ncola = *m;
     }
     if (notb) {
 	nrowb = *k;
@@ -6850,7 +6847,6 @@ L60:
     static integer i__, j, l, info;
     static logical nota, notb;
     static doublereal temp;
-    static integer ncola;
     extern logical lsame_(char *, char *);
     static integer nrowa, nrowb;
     extern /* Subroutine */ int xerbla_(char *, integer *);
@@ -6982,7 +6978,7 @@ L60:
 
 
        Set  NOTA  and  NOTB  as  true if  A  and  B  respectively are not
-       transposed and set  NROWA, NCOLA and  NROWB  as the number of rows
+       transposed and set  NROWA and  NROWB  as the number of rows
        and  columns of  A  and the  number of  rows  of  B  respectively.
 */
 
@@ -7002,10 +6998,8 @@ L60:
     notb = lsame_(transb, "N");
     if (nota) {
 	nrowa = *m;
-	ncola = *k;
     } else {
 	nrowa = *k;
-	ncola = *m;
     }
     if (notb) {
 	nrowb = *k;
@@ -11454,7 +11448,6 @@ L60:
     static integer i__, j, l, info;
     static logical nota, notb;
     static real temp;
-    static integer ncola;
     extern logical lsame_(char *, char *);
     static integer nrowa, nrowb;
     extern /* Subroutine */ int xerbla_(char *, integer *);
@@ -11586,7 +11579,7 @@ L60:
 
 
        Set  NOTA  and  NOTB  as  true if  A  and  B  respectively are not
-       transposed and set  NROWA, NCOLA and  NROWB  as the number of rows
+       transposed and set  NROWA and  NROWB  as the number of rows
        and  columns of  A  and the  number of  rows  of  B  respectively.
 */
 
@@ -11606,10 +11599,8 @@ L60:
     notb = lsame_(transb, "N");
     if (nota) {
 	nrowa = *m;
-	ncola = *k;
     } else {
 	nrowa = *k;
-	ncola = *m;
     }
     if (notb) {
 	nrowb = *k;
@@ -15667,7 +15658,6 @@ L20:
     static logical nota, notb;
     static doublecomplex temp;
     static logical conja, conjb;
-    static integer ncola;
     extern logical lsame_(char *, char *);
     static integer nrowa, nrowb;
     extern /* Subroutine */ int xerbla_(char *, integer *);
@@ -15801,7 +15791,7 @@ L20:
        Set  NOTA  and  NOTB  as  true if  A  and  B  respectively are not
        conjugated or transposed, set  CONJA and CONJB  as true if  A  and
        B  respectively are to be  transposed but  not conjugated  and set
-       NROWA, NCOLA and  NROWB  as the number of rows and  columns  of  A
+       NROWA and  NROWB  as the number of rows and  columns  of  A
        and the number of rows of  B  respectively.
 */
 
@@ -15823,10 +15813,8 @@ L20:
     conjb = lsame_(transb, "C");
     if (nota) {
 	nrowa = *m;
-	ncola = *k;
     } else {
 	nrowa = *k;
-	ncola = *m;
     }
     if (notb) {
 	nrowb = *k;
diff --git a/numpy/linalg/lapack_lite/f2c_blas.c.patch b/numpy/linalg/lapack_lite/f2c_blas.c.patch
new file mode 100644
index 000000000..59a6a1919
--- /dev/null
+++ b/numpy/linalg/lapack_lite/f2c_blas.c.patch
@@ -0,0 +1,112 @@
+@@ -380,7 +380,6 @@ L20:
+     static logical nota, notb;
+     static complex temp;
+     static logical conja, conjb;
+-    static integer ncola;
+     extern logical lsame_(char *, char *);
+     static integer nrowa, nrowb;
+     extern /* Subroutine */ int xerbla_(char *, integer *);
+@@ -514,7 +513,7 @@ L20:
+        Set  NOTA  and  NOTB  as  true if  A  and  B  respectively are not
+        conjugated or transposed, set  CONJA and CONJB  as true if  A  and
+        B  respectively are to be  transposed but  not conjugated  and set
+-       NROWA, NCOLA and  NROWB  as the number of rows and  columns  of  A
++       NROWA and  NROWB  as the number of rows and  columns  of  A
+        and the number of rows of  B  respectively.
+ */
+ 
+@@ -536,10 +535,8 @@ L20:
+     conjb = lsame_(transb, "C");
+     if (nota) {
+ 	nrowa = *m;
+-	ncola = *k;
+     } else {
+ 	nrowa = *k;
+-	ncola = *m;
+     }
+     if (notb) {
+ 	nrowb = *k;
+@@ -6850,7 +6847,6 @@ L60:
+     static integer i__, j, l, info;
+     static logical nota, notb;
+     static doublereal temp;
+-    static integer ncola;
+     extern logical lsame_(char *, char *);
+     static integer nrowa, nrowb;
+     extern /* Subroutine */ int xerbla_(char *, integer *);
+@@ -6982,7 +6978,7 @@ L60:
+ 
+ 
+        Set  NOTA  and  NOTB  as  true if  A  and  B  respectively are not
+-       transposed and set  NROWA, NCOLA and  NROWB  as the number of rows
++       transposed and set  NROWA and  NROWB  as the number of rows
+        and  columns of  A  and the  number of  rows  of  B  respectively.
+ */
+ 
+@@ -7002,10 +6998,8 @@ L60:
+     notb = lsame_(transb, "N");
+     if (nota) {
+ 	nrowa = *m;
+-	ncola = *k;
+     } else {
+ 	nrowa = *k;
+-	ncola = *m;
+     }
+     if (notb) {
+ 	nrowb = *k;
+@@ -11454,7 +11448,6 @@ L60:
+     static integer i__, j, l, info;
+     static logical nota, notb;
+     static real temp;
+-    static integer ncola;
+     extern logical lsame_(char *, char *);
+     static integer nrowa, nrowb;
+     extern /* Subroutine */ int xerbla_(char *, integer *);
+@@ -11586,7 +11579,7 @@ L60:
+ 
+ 
+        Set  NOTA  and  NOTB  as  true if  A  and  B  respectively are not
+-       transposed and set  NROWA, NCOLA and  NROWB  as the number of rows
++       transposed and set  NROWA and  NROWB  as the number of rows
+        and  columns of  A  and the  number of  rows  of  B  respectively.
+ */
+ 
+@@ -11606,10 +11599,8 @@ L60:
+     notb = lsame_(transb, "N");
+     if (nota) {
+ 	nrowa = *m;
+-	ncola = *k;
+     } else {
+ 	nrowa = *k;
+-	ncola = *m;
+     }
+     if (notb) {
+ 	nrowb = *k;
+@@ -15667,7 +15658,6 @@ L20:
+     static logical nota, notb;
+     static doublecomplex temp;
+     static logical conja, conjb;
+-    static integer ncola;
+     extern logical lsame_(char *, char *);
+     static integer nrowa, nrowb;
+     extern /* Subroutine */ int xerbla_(char *, integer *);
+@@ -15801,7 +15791,7 @@ L20:
+        Set  NOTA  and  NOTB  as  true if  A  and  B  respectively are not
+        conjugated or transposed, set  CONJA and CONJB  as true if  A  and
+        B  respectively are to be  transposed but  not conjugated  and set
+-       NROWA, NCOLA and  NROWB  as the number of rows and  columns  of  A
++       NROWA and  NROWB  as the number of rows and  columns  of  A
+        and the number of rows of  B  respectively.
+ */
+ 
+@@ -15823,10 +15813,8 @@ L20:
+     conjb = lsame_(transb, "C");
+     if (nota) {
+ 	nrowa = *m;
+-	ncola = *k;
+     } else {
+ 	nrowa = *k;
+-	ncola = *m;
+     }
+     if (notb) {
+ 	nrowb = *k;
diff --git a/numpy/linalg/lapack_lite/make_lite.py b/numpy/linalg/lapack_lite/make_lite.py
index ca8d4c62c..20b212792 100755
--- a/numpy/linalg/lapack_lite/make_lite.py
+++ b/numpy/linalg/lapack_lite/make_lite.py
@@ -253,7 +253,7 @@ def dumpRoutineNames(library, output_dir):
 def concatenateRoutines(routines, output_file):
     with open(output_file, 'w') as output_fo:
         for r in routines:
-            with open(r.filename, 'r') as fo:
+            with open(r.filename) as fo:
                 source = fo.read()
             output_fo.write(source)
 
@@ -296,7 +296,7 @@ def create_name_header(output_dir):
         if not fn.endswith('.f'):
             continue
 
-        with open(fn, 'r') as f:
+        with open(fn) as f:
             for line in f:
                 m = routine_re.match(line)
                 if m:
@@ -304,7 +304,7 @@ def create_name_header(output_dir):
 
     # f2c symbols
     f2c_symbols = set()
-    with open('f2c.h', 'r') as f:
+    with open('f2c.h') as f:
         for line in f:
             m = extern_re.match(line)
             if m:
diff --git a/numpy/linalg/linalg.py b/numpy/linalg/linalg.py
index 697193321..78927d1ae 100644
--- a/numpy/linalg/linalg.py
+++ b/numpy/linalg/linalg.py
@@ -18,17 +18,17 @@ import functools
 import operator
 import warnings
 
+from .._utils import set_module
 from numpy.core import (
     array, asarray, zeros, empty, empty_like, intc, single, double,
     csingle, cdouble, inexact, complexfloating, newaxis, all, Inf, dot,
     add, multiply, sqrt, sum, isfinite,
-    finfo, errstate, geterrobj, moveaxis, amin, amax, product, abs,
+    finfo, errstate, geterrobj, moveaxis, amin, amax, prod, abs,
     atleast_2d, intp, asanyarray, object_, matmul,
     swapaxes, divide, count_nonzero, isnan, sign, argsort, sort,
     reciprocal
 )
 from numpy.core.multiarray import normalize_axis_index
-from numpy.core.overrides import set_module
 from numpy.core import overrides
 from numpy.lib.twodim_base import triu, eye
 from numpy.linalg import _umath_linalg
@@ -42,11 +42,11 @@ fortran_int = intc
 
 
 @set_module('numpy.linalg')
-class LinAlgError(Exception):
+class LinAlgError(ValueError):
     """
     Generic Python-exception-derived object raised by linalg functions.
 
-    General purpose exception class, derived from Python's exception.Exception
+    General purpose exception class, derived from Python's ValueError
     class, programmatically raised in linalg functions when a Linear
     Algebra-related condition would prevent further correct execution of the
     function.
@@ -138,24 +138,24 @@ def _commonType(*arrays):
     result_type = single
     is_complex = False
     for a in arrays:
-        if issubclass(a.dtype.type, inexact):
-            if isComplexType(a.dtype.type):
+        type_ = a.dtype.type
+        if issubclass(type_, inexact):
+            if isComplexType(type_):
                 is_complex = True
-            rt = _realType(a.dtype.type, default=None)
-            if rt is None:
+            rt = _realType(type_, default=None)
+            if rt is double:
+                result_type = double
+            elif rt is None:
                 # unsupported inexact scalar
                 raise TypeError("array type %s is unsupported in linalg" %
                         (a.dtype.name,))
         else:
-            rt = double
-        if rt is double:
             result_type = double
     if is_complex:
-        t = cdouble
         result_type = _complex_types_map[result_type]
+        return cdouble, result_type
     else:
-        t = double
-    return t, result_type
+        return double, result_type
 
 
 def _to_native_byte_order(*arrays):
@@ -196,7 +196,7 @@ def _assert_finite(*arrays):
 
 def _is_empty_2d(arr):
     # check size first for efficiency
-    return arr.size == 0 and product(arr.shape[-2:]) == 0
+    return arr.size == 0 and prod(arr.shape[-2:]) == 0
 
 
 def transpose(a):
@@ -899,12 +899,12 @@ def qr(a, mode='reduced'):
             msg = "".join((
                     "The 'full' option is deprecated in favor of 'reduced'.\n",
                     "For backward compatibility let mode default."))
-            warnings.warn(msg, DeprecationWarning, stacklevel=3)
+            warnings.warn(msg, DeprecationWarning, stacklevel=2)
             mode = 'reduced'
         elif mode in ('e', 'economic'):
             # 2013-04-01, 1.8
             msg = "The 'economic' option is deprecated."
-            warnings.warn(msg, DeprecationWarning, stacklevel=3)
+            warnings.warn(msg, DeprecationWarning, stacklevel=2)
             mode = 'economic'
         else:
             raise ValueError(f"Unrecognized mode '{mode}'")
@@ -943,7 +943,7 @@ def qr(a, mode='reduced'):
         return wrap(a)
 
     # mc is the number of columns in the resulting q
-    # matrix. If the mode is complete then it is 
+    # matrix. If the mode is complete then it is
     # same as number of rows, and if the mode is reduced,
     # then it is the minimum of number of rows and columns.
     if mode == 'complete' and m > n:
@@ -2267,7 +2267,7 @@ def lstsq(a, b, rcond="warn"):
                       "To use the future default and silence this warning "
                       "we advise to pass `rcond=None`, to keep using the old, "
                       "explicitly pass `rcond=-1`.",
-                      FutureWarning, stacklevel=3)
+                      FutureWarning, stacklevel=2)
         rcond = -1
     if rcond is None:
         rcond = finfo(t).eps * max(n, m)
diff --git a/numpy/linalg/meson.build b/numpy/linalg/meson.build
new file mode 100644
index 000000000..083692913
--- /dev/null
+++ b/numpy/linalg/meson.build
@@ -0,0 +1,56 @@
+lapack_lite_sources = [
+  'lapack_lite/f2c.c',
+  'lapack_lite/f2c_c_lapack.c',
+  'lapack_lite/f2c_d_lapack.c',
+  'lapack_lite/f2c_s_lapack.c',
+  'lapack_lite/f2c_z_lapack.c',
+  'lapack_lite/f2c_blas.c',
+  'lapack_lite/f2c_config.c',
+  'lapack_lite/f2c_lapack.c',
+  'lapack_lite/python_xerbla.c',
+]
+
+# TODO: ILP64 support
+
+lapack_lite_module_src = ['lapack_litemodule.c']
+if not have_lapack
+  warning('LAPACK was not found, NumPy is using an unoptimized, naive build from sources!')
+  lapack_lite_module_src += lapack_lite_sources
+endif
+
+py.extension_module('lapack_lite',
+  lapack_lite_module_src,
+  dependencies: [np_core_dep, lapack],
+  install: true,
+  subdir: 'numpy/linalg',
+)
+
+_umath_linalg_src = ['umath_linalg.cpp'] + lapack_lite_sources
+
+py.extension_module('_umath_linalg',
+  _umath_linalg_src,
+  dependencies: np_core_dep,
+  link_with: npymath_lib,
+  install: true,
+  subdir: 'numpy/linalg',
+)
+
+py.install_sources(
+  [
+    '__init__.py',
+    '__init__.pyi',
+    'linalg.py',
+    'linalg.pyi',
+  ],
+  subdir: 'numpy/linalg'
+)
+
+py.install_sources(
+  [
+    'tests/__init__.py',
+    'tests/test_deprecations.py',
+    'tests/test_linalg.py',
+    'tests/test_regression.py',
+  ],
+  subdir: 'numpy/linalg/tests'
+)
diff --git a/numpy/linalg/setup.py b/numpy/linalg/setup.py
index 1c4e1295e..6f72635ab 100644
--- a/numpy/linalg/setup.py
+++ b/numpy/linalg/setup.py
@@ -4,7 +4,6 @@ import sysconfig
 
 def configuration(parent_package='', top_path=None):
     from numpy.distutils.misc_util import Configuration
-    from numpy.distutils.ccompiler_opt import NPY_CXX_FLAGS
     from numpy.distutils.system_info import get_info, system_info
     config = Configuration('linalg', parent_package, top_path)
 
@@ -81,7 +80,6 @@ def configuration(parent_package='', top_path=None):
         sources=['umath_linalg.cpp', get_lapack_lite_sources],
         depends=['lapack_lite/f2c.h'],
         extra_info=lapack_info,
-        extra_cxx_compile_args=NPY_CXX_FLAGS,
         libraries=['npymath'],
     )
     config.add_data_files('*.pyi')
diff --git a/numpy/linalg/tests/test_linalg.py b/numpy/linalg/tests/test_linalg.py
index f871a5f8e..b1dbd4c22 100644
--- a/numpy/linalg/tests/test_linalg.py
+++ b/numpy/linalg/tests/test_linalg.py
@@ -19,7 +19,7 @@ from numpy.linalg.linalg import _multi_dot_matrix_chain_order
 from numpy.testing import (
     assert_, assert_equal, assert_raises, assert_array_equal,
     assert_almost_equal, assert_allclose, suppress_warnings,
-    assert_raises_regex, HAS_LAPACK64,
+    assert_raises_regex, HAS_LAPACK64, IS_WASM
     )
 
 
@@ -1063,6 +1063,7 @@ class TestMatrixPower:
         assert_raises(LinAlgError, matrix_power, np.array([[1], [2]], dt), 1)
         assert_raises(LinAlgError, matrix_power, np.ones((4, 3, 2), dt), 1)
 
+    @pytest.mark.skipif(IS_WASM, reason="fp errors don't work in wasm")
     def test_exceptions_not_invertible(self, dt):
         if dt in self.dtnoinv:
             return
@@ -1845,6 +1846,7 @@ def test_byteorder_check():
             assert_array_equal(res, routine(sw_arr))
 
 
+@pytest.mark.skipif(IS_WASM, reason="fp errors don't work in wasm")
 def test_generalized_raise_multiloop():
     # It should raise an error even if the error doesn't occur in the
     # last iteration of the ufunc inner loop
@@ -1908,6 +1910,7 @@ def test_xerbla_override():
             pytest.skip('Numpy xerbla not linked in.')
 
 
+@pytest.mark.skipif(IS_WASM, reason="Cannot start subprocess")
 @pytest.mark.slow
 def test_sdot_bug_8577():
     # Regression test that loading certain other libraries does not
diff --git a/numpy/linalg/tests/test_regression.py b/numpy/linalg/tests/test_regression.py
index 7ed932bc9..af38443a9 100644
--- a/numpy/linalg/tests/test_regression.py
+++ b/numpy/linalg/tests/test_regression.py
@@ -107,10 +107,7 @@ class TestRegression:
         assert_raises(ValueError, linalg.norm, testvector, ord='nuc')
         assert_raises(ValueError, linalg.norm, testvector, ord=np.inf)
         assert_raises(ValueError, linalg.norm, testvector, ord=-np.inf)
-        with warnings.catch_warnings():
-            warnings.simplefilter("error", DeprecationWarning)
-            assert_raises((AttributeError, DeprecationWarning),
-                              linalg.norm, testvector, ord=0)
+        assert_raises(ValueError, linalg.norm, testvector, ord=0)
         assert_raises(ValueError, linalg.norm, testvector, ord=-1)
         assert_raises(ValueError, linalg.norm, testvector, ord=-2)
 
diff --git a/numpy/linalg/umath_linalg.cpp b/numpy/linalg/umath_linalg.cpp
index bbb4bb896..68db2b2f1 100644
--- a/numpy/linalg/umath_linalg.cpp
+++ b/numpy/linalg/umath_linalg.cpp
@@ -22,6 +22,7 @@
 #include <cstdio>
 #include <cassert>
 #include <cmath>
+#include <type_traits>
 #include <utility>
 
 
@@ -1148,7 +1149,7 @@ slogdet(char **args,
                void *NPY_UNUSED(func))
 {
     fortran_int m;
-    npy_uint8 *tmp_buff = NULL;
+    char *tmp_buff = NULL;
     size_t matrix_size;
     size_t pivot_size;
     size_t safe_m;
@@ -1162,10 +1163,11 @@ slogdet(char **args,
      */
     INIT_OUTER_LOOP_3
     m = (fortran_int) dimensions[0];
-    safe_m = m;
+    /* avoid empty malloc (buffers likely unused) and ensure m is `size_t` */
+    safe_m = m != 0 ? m : 1;
     matrix_size = safe_m * safe_m * sizeof(typ);
     pivot_size = safe_m * sizeof(fortran_int);
-    tmp_buff = (npy_uint8 *)malloc(matrix_size + pivot_size);
+    tmp_buff = (char *)malloc(matrix_size + pivot_size);
 
     if (tmp_buff) {
         LINEARIZE_DATA_t lin_data;
@@ -1182,6 +1184,13 @@ slogdet(char **args,
 
         free(tmp_buff);
     }
+    else {
+        /* TODO: Requires use of new ufunc API to indicate error return */
+        NPY_ALLOW_C_API_DEF
+        NPY_ALLOW_C_API;
+        PyErr_NoMemory();
+        NPY_DISABLE_C_API;
+    }
 }
 
 template<typename typ, typename basetyp>
@@ -1192,7 +1201,7 @@ det(char **args,
            void *NPY_UNUSED(func))
 {
     fortran_int m;
-    npy_uint8 *tmp_buff;
+    char *tmp_buff;
     size_t matrix_size;
     size_t pivot_size;
     size_t safe_m;
@@ -1206,10 +1215,11 @@ det(char **args,
      */
     INIT_OUTER_LOOP_2
     m = (fortran_int) dimensions[0];
-    safe_m = m;
+    /* avoid empty malloc (buffers likely unused) and ensure m is `size_t` */
+    safe_m = m != 0 ? m : 1;
     matrix_size = safe_m * safe_m * sizeof(typ);
     pivot_size = safe_m * sizeof(fortran_int);
-    tmp_buff = (npy_uint8 *)malloc(matrix_size + pivot_size);
+    tmp_buff = (char *)malloc(matrix_size + pivot_size);
 
     if (tmp_buff) {
         LINEARIZE_DATA_t lin_data;
@@ -1230,6 +1240,13 @@ det(char **args,
 
         free(tmp_buff);
     }
+    else {
+        /* TODO: Requires use of new ufunc API to indicate error return */
+        NPY_ALLOW_C_API_DEF
+        NPY_ALLOW_C_API;
+        PyErr_NoMemory();
+        NPY_DISABLE_C_API;
+    }
 }
 
 
@@ -3737,16 +3754,16 @@ scalar_trait)
     fortran_int lda = fortran_int_max(1, m);
     fortran_int ldb = fortran_int_max(1, fortran_int_max(m,n));
 
-    mem_buff = (npy_uint8 *)malloc(a_size + b_size + s_size);
-
-    if (!mem_buff)
-        goto error;
+    size_t msize = a_size + b_size + s_size;
+    mem_buff = (npy_uint8 *)malloc(msize != 0 ? msize : 1);
 
+    if (!mem_buff) {
+        goto no_memory;
+    }
     a = mem_buff;
     b = a + a_size;
     s = b + b_size;
 
-
     params->M = m;
     params->N = n;
     params->NRHS = nrhs;
@@ -3766,9 +3783,9 @@ scalar_trait)
         params->RWORK = NULL;
         params->LWORK = -1;
 
-        if (call_gelsd(params) != 0)
+        if (call_gelsd(params) != 0) {
             goto error;
-
+        }
         work_count = (fortran_int)work_size_query;
 
         work_size  = (size_t) work_size_query * sizeof(ftyp);
@@ -3776,9 +3793,9 @@ scalar_trait)
     }
 
     mem_buff2 = (npy_uint8 *)malloc(work_size + iwork_size);
-    if (!mem_buff2)
-        goto error;
-
+    if (!mem_buff2) {
+        goto no_memory;
+    }
     work = mem_buff2;
     iwork = work + work_size;
 
@@ -3788,12 +3805,18 @@ scalar_trait)
     params->LWORK = work_count;
 
     return 1;
+
+ no_memory:
+    NPY_ALLOW_C_API_DEF
+    NPY_ALLOW_C_API;
+    PyErr_NoMemory();
+    NPY_DISABLE_C_API;
+
  error:
     TRACE_TXT("%s failed init\n", __FUNCTION__);
     free(mem_buff);
     free(mem_buff2);
     memset(params, 0, sizeof(*params));
-
     return 0;
 }
 
@@ -3857,16 +3880,17 @@ using frealtyp = basetype_t<ftyp>;
     fortran_int lda = fortran_int_max(1, m);
     fortran_int ldb = fortran_int_max(1, fortran_int_max(m,n));
 
-    mem_buff = (npy_uint8 *)malloc(a_size + b_size + s_size);
+    size_t msize = a_size + b_size + s_size;
+    mem_buff = (npy_uint8 *)malloc(msize != 0 ? msize : 1);
 
-    if (!mem_buff)
-        goto error;
+    if (!mem_buff) {
+        goto no_memory;
+    }
 
     a = mem_buff;
     b = a + a_size;
     s = b + b_size;
 
-
     params->M = m;
     params->N = n;
     params->NRHS = nrhs;
@@ -3887,8 +3911,9 @@ using frealtyp = basetype_t<ftyp>;
         params->RWORK = &rwork_size_query;
         params->LWORK = -1;
 
-        if (call_gelsd(params) != 0)
+        if (call_gelsd(params) != 0) {
             goto error;
+        }
 
         work_count = (fortran_int)work_size_query.r;
 
@@ -3898,8 +3923,9 @@ using frealtyp = basetype_t<ftyp>;
     }
 
     mem_buff2 = (npy_uint8 *)malloc(work_size + rwork_size + iwork_size);
-    if (!mem_buff2)
-        goto error;
+    if (!mem_buff2) {
+        goto no_memory;
+    }
 
     work = mem_buff2;
     rwork = work + work_size;
@@ -3911,6 +3937,13 @@ using frealtyp = basetype_t<ftyp>;
     params->LWORK = work_count;
 
     return 1;
+
+ no_memory:
+    NPY_ALLOW_C_API_DEF
+    NPY_ALLOW_C_API;
+    PyErr_NoMemory();
+    NPY_DISABLE_C_API;
+
  error:
     TRACE_TXT("%s failed init\n", __FUNCTION__);
     free(mem_buff);
diff --git a/numpy/ma/__init__.pyi b/numpy/ma/__init__.pyi
index 7f5cb56a8..ce72383e5 100644
--- a/numpy/ma/__init__.pyi
+++ b/numpy/ma/__init__.pyi
@@ -155,7 +155,6 @@ from numpy.ma.core import (
     resize as resize,
     right_shift as right_shift,
     round as round,
-    round_ as round_,
     set_fill_value as set_fill_value,
     shape as shape,
     sin as sin,
diff --git a/numpy/ma/bench.py b/numpy/ma/bench.py
deleted file mode 100644
index 56865683d..000000000
--- a/numpy/ma/bench.py
+++ /dev/null
@@ -1,130 +0,0 @@
-#!/usr/bin/env python3
-
-import timeit
-import numpy
-
-
-###############################################################################
-#                               Global variables                              #
-###############################################################################
-
-
-# Small arrays
-xs = numpy.random.uniform(-1, 1, 6).reshape(2, 3)
-ys = numpy.random.uniform(-1, 1, 6).reshape(2, 3)
-zs = xs + 1j * ys
-m1 = [[True, False, False], [False, False, True]]
-m2 = [[True, False, True], [False, False, True]]
-nmxs = numpy.ma.array(xs, mask=m1)
-nmys = numpy.ma.array(ys, mask=m2)
-nmzs = numpy.ma.array(zs, mask=m1)
-
-# Big arrays
-xl = numpy.random.uniform(-1, 1, 100*100).reshape(100, 100)
-yl = numpy.random.uniform(-1, 1, 100*100).reshape(100, 100)
-zl = xl + 1j * yl
-maskx = xl > 0.8
-masky = yl < -0.8
-nmxl = numpy.ma.array(xl, mask=maskx)
-nmyl = numpy.ma.array(yl, mask=masky)
-nmzl = numpy.ma.array(zl, mask=maskx)
-
-
-###############################################################################
-#                                 Functions                                   #
-###############################################################################
-
-
-def timer(s, v='', nloop=500, nrep=3):
-    units = ["s", "ms", "µs", "ns"]
-    scaling = [1, 1e3, 1e6, 1e9]
-    print("%s : %-50s : " % (v, s), end=' ')
-    varnames = ["%ss,nm%ss,%sl,nm%sl" % tuple(x*4) for x in 'xyz']
-    setup = 'from __main__ import numpy, ma, %s' % ','.join(varnames)
-    Timer = timeit.Timer(stmt=s, setup=setup)
-    best = min(Timer.repeat(nrep, nloop)) / nloop
-    if best > 0.0:
-        order = min(-int(numpy.floor(numpy.log10(best)) // 3), 3)
-    else:
-        order = 3
-    print("%d loops, best of %d: %.*g %s per loop" % (nloop, nrep,
-                                                      3,
-                                                      best * scaling[order],
-                                                      units[order]))
-
-
-def compare_functions_1v(func, nloop=500,
-                       xs=xs, nmxs=nmxs, xl=xl, nmxl=nmxl):
-    funcname = func.__name__
-    print("-"*50)
-    print(f'{funcname} on small arrays')
-    module, data = "numpy.ma", "nmxs"
-    timer("%(module)s.%(funcname)s(%(data)s)" % locals(), v="%11s" % module, nloop=nloop)
-
-    print("%s on large arrays" % funcname)
-    module, data = "numpy.ma", "nmxl"
-    timer("%(module)s.%(funcname)s(%(data)s)" % locals(), v="%11s" % module, nloop=nloop)
-    return
-
-def compare_methods(methodname, args, vars='x', nloop=500, test=True,
-                    xs=xs, nmxs=nmxs, xl=xl, nmxl=nmxl):
-    print("-"*50)
-    print(f'{methodname} on small arrays')
-    data, ver = f'nm{vars}l', 'numpy.ma'
-    timer("%(data)s.%(methodname)s(%(args)s)" % locals(), v=ver, nloop=nloop)
-
-    print("%s on large arrays" % methodname)
-    data, ver = "nm%sl" % vars, 'numpy.ma'
-    timer("%(data)s.%(methodname)s(%(args)s)" % locals(), v=ver, nloop=nloop)
-    return
-
-def compare_functions_2v(func, nloop=500, test=True,
-                       xs=xs, nmxs=nmxs,
-                       ys=ys, nmys=nmys,
-                       xl=xl, nmxl=nmxl,
-                       yl=yl, nmyl=nmyl):
-    funcname = func.__name__
-    print("-"*50)
-    print(f'{funcname} on small arrays')
-    module, data = "numpy.ma", "nmxs,nmys"
-    timer("%(module)s.%(funcname)s(%(data)s)" % locals(), v="%11s" % module, nloop=nloop)
-
-    print(f'{funcname} on large arrays')
-    module, data = "numpy.ma", "nmxl,nmyl"
-    timer("%(module)s.%(funcname)s(%(data)s)" % locals(), v="%11s" % module, nloop=nloop)
-    return
-
-
-if __name__ == '__main__':
-    compare_functions_1v(numpy.sin)
-    compare_functions_1v(numpy.log)
-    compare_functions_1v(numpy.sqrt)
-
-    compare_functions_2v(numpy.multiply)
-    compare_functions_2v(numpy.divide)
-    compare_functions_2v(numpy.power)
-
-    compare_methods('ravel', '', nloop=1000)
-    compare_methods('conjugate', '', 'z', nloop=1000)
-    compare_methods('transpose', '', nloop=1000)
-    compare_methods('compressed', '', nloop=1000)
-    compare_methods('__getitem__', '0', nloop=1000)
-    compare_methods('__getitem__', '(0,0)', nloop=1000)
-    compare_methods('__getitem__', '[0,-1]', nloop=1000)
-    compare_methods('__setitem__', '0, 17', nloop=1000, test=False)
-    compare_methods('__setitem__', '(0,0), 17', nloop=1000, test=False)
-
-    print("-"*50)
-    print("__setitem__ on small arrays")
-    timer('nmxs.__setitem__((-1,0),numpy.ma.masked)', 'numpy.ma   ', nloop=10000)
-
-    print("-"*50)
-    print("__setitem__ on large arrays")
-    timer('nmxl.__setitem__((-1,0),numpy.ma.masked)', 'numpy.ma   ', nloop=10000)
-
-    print("-"*50)
-    print("where on small arrays")
-    timer('numpy.ma.where(nmxs>2,nmxs,nmys)', 'numpy.ma   ', nloop=1000)
-    print("-"*50)
-    print("where on large arrays")
-    timer('numpy.ma.where(nmxl>2,nmxl,nmyl)', 'numpy.ma   ', nloop=100)
diff --git a/numpy/ma/core.py b/numpy/ma/core.py
index 8c4ad37eb..2fe326885 100644
--- a/numpy/ma/core.py
+++ b/numpy/ma/core.py
@@ -2081,9 +2081,11 @@ def masked_equal(x, value, copy=True):
     """
     Mask an array where equal to a given value.
 
-    This function is a shortcut to ``masked_where``, with
-    `condition` = (x == value).  For floating point arrays,
-    consider using ``masked_values(x, value)``.
+    Return a MaskedArray, masked where the data in array `x` are
+    equal to `value`. The fill_value of the returned MaskedArray
+    is set to `value`.
+
+    For floating point arrays, consider using ``masked_values(x, value)``.
 
     See Also
     --------
@@ -2303,25 +2305,17 @@ def masked_values(x, value, rtol=1e-5, atol=1e-8, copy=True, shrink=True):
 
     Note that `mask` is set to ``nomask`` if possible.
 
-    >>> ma.masked_values(x, 1.5)
+    >>> ma.masked_values(x, 2.1)
     masked_array(data=[1. , 1.1, 2. , 1.1, 3. ],
                  mask=False,
-           fill_value=1.5)
+           fill_value=2.1)
 
-    For integers, the fill value will be different in general to the
-    result of ``masked_equal``.
+    Unlike `masked_equal`, `masked_values` can perform approximate equalities.
 
-    >>> x = np.arange(5)
-    >>> x
-    array([0, 1, 2, 3, 4])
-    >>> ma.masked_values(x, 2)
-    masked_array(data=[0, 1, --, 3, 4],
-                 mask=[False, False,  True, False, False],
-           fill_value=2)
-    >>> ma.masked_equal(x, 2)
-    masked_array(data=[0, 1, --, 3, 4],
+    >>> ma.masked_values(x, 2.1, atol=1e-1)
+    masked_array(data=[1.0, 1.1, --, 1.1, 3.0],
                  mask=[False, False,  True, False, False],
-           fill_value=2)
+           fill_value=2.1)
 
     """
     xnew = filled(x, value)
@@ -2362,8 +2356,13 @@ def masked_invalid(a, copy=True):
            fill_value=1e+20)
 
     """
-
-    return masked_where(~(np.isfinite(getdata(a))), a, copy=copy)
+    a = np.array(a, copy=False, subok=True)
+    res = masked_where(~(np.isfinite(a)), a, copy=copy)
+    # masked_invalid previously never returned nomask as a mask and doing so
+    # threw off matplotlib (gh-22842).  So use shrink=False:
+    if res._mask is nomask:
+        res._mask = make_mask_none(res.shape, res.dtype)
+    return res
 
 ###############################################################################
 #                            Printing options                                 #
@@ -2841,7 +2840,6 @@ class MaskedArray(ndarray):
         # Process mask.
         # Type of the mask
         mdtype = make_mask_descr(_data.dtype)
-
         if mask is nomask:
             # Case 1. : no mask in input.
             # Erase the current mask ?
@@ -2859,7 +2857,7 @@ class MaskedArray(ndarray):
                     mask = np.array(
                         [getmaskarray(np.asanyarray(m, dtype=_data.dtype))
                          for m in data], dtype=mdtype)
-                except ValueError:
+                except (ValueError, TypeError):
                     # If data is nested
                     mask = nomask
                 # Force shrinking of the mask if needed (and possible)
@@ -2872,10 +2870,22 @@ class MaskedArray(ndarray):
                     _data._mask = _data._mask.copy()
                     # Reset the shape of the original mask
                     if getmask(data) is not nomask:
-                        data._mask.shape = data.shape
+                        # gh-21022 encounters an issue here
+                        # because data._mask.shape is not writeable, but
+                        # the op was also pointless in that case, because
+                        # the shapes were the same, so we can at least
+                        # avoid that path
+                        if data._mask.shape != data.shape:
+                            data._mask.shape = data.shape
         else:
             # Case 2. : With a mask in input.
             # If mask is boolean, create an array of True or False
+
+            # if users pass `mask=None` be forgiving here and cast it False
+            # for speed; although the default is `mask=nomask` and can differ.
+            if mask is None:
+                mask = False
+
             if mask is True and mdtype == MaskType:
                 mask = np.ones(_data.shape, dtype=mdtype)
             elif mask is False and mdtype == MaskType:
@@ -2923,6 +2933,7 @@ class MaskedArray(ndarray):
                     else:
                         _data._mask = np.logical_or(mask, _data._mask)
                     _data._sharedmask = False
+
         # Update fill_value.
         if fill_value is None:
             fill_value = getattr(data, '_fill_value', None)
@@ -3329,6 +3340,10 @@ class MaskedArray(ndarray):
                 # Note: Don't try to check for m.any(), that'll take too long
         return dout
 
+    # setitem may put NaNs into integer arrays or occasionally overflow a
+    # float.  But this may happen in masked values, so avoid otherwise
+    # correct warnings (as is typical also in masked calculations).
+    @np.errstate(over='ignore', invalid='ignore')
     def __setitem__(self, indx, value):
         """
         x.__setitem__(i, y) <==> x[i]=y
@@ -4620,6 +4635,7 @@ class MaskedArray(ndarray):
             otherwise.  'K' means to read the elements in the order they occur
             in memory, except for reversing the data when strides are negative.
             By default, 'C' index order is used.
+            (Masked arrays currently use 'A' on the data when 'K' is passed.)
 
         Returns
         -------
@@ -4646,6 +4662,13 @@ class MaskedArray(ndarray):
                fill_value=999999)
 
         """
+        # The order of _data and _mask could be different (it shouldn't be
+        # normally).  Passing order `K` or `A` would be incorrect.
+        # So we ignore the mask memory order.
+        # TODO: We don't actually support K, so use A instead.  We could
+        #       try to guess this correct by sorting strides or deprecate.
+        if order in "kKaA":
+            order = "F" if self._data.flags.fnc else "C"
         r = ndarray.ravel(self._data, order=order).view(type(self))
         r._update_from(self)
         if self._mask is not nomask:
@@ -6295,6 +6318,12 @@ class MaskedArray(ndarray):
         memo[id(self)] = copied
         for (k, v) in self.__dict__.items():
             copied.__dict__[k] = deepcopy(v, memo)
+        # as clearly documented for np.copy(), you need to use
+        # deepcopy() directly for arrays of object type that may
+        # contain compound types--you cannot depend on normal
+        # copy semantics to do the right thing here
+        if self.dtype.hasobject:
+            copied._data[...] = deepcopy(copied._data)
         return copied
 
 
@@ -6900,6 +6929,32 @@ def power(a, b, third=None):
     The *out* argument to `numpy.power` is not supported, `third` has to be
     None.
 
+    Examples
+    --------
+    >>> import numpy.ma as ma
+    >>> x = [11.2, -3.973, 0.801, -1.41]
+    >>> mask = [0, 0, 0, 1]
+    >>> masked_x = ma.masked_array(x, mask)
+    >>> masked_x
+    masked_array(data=[11.2, -3.973, 0.801, --],
+             mask=[False, False, False,  True],
+       fill_value=1e+20)
+    >>> ma.power(masked_x, 2)
+    masked_array(data=[125.43999999999998, 15.784728999999999,
+                   0.6416010000000001, --],
+             mask=[False, False, False,  True],
+       fill_value=1e+20)
+    >>> y = [-0.5, 2, 0, 17]
+    >>> masked_y = ma.masked_array(y, mask)
+    >>> masked_y
+    masked_array(data=[-0.5, 2.0, 0.0, --],
+             mask=[False, False, False,  True],
+       fill_value=1e+20)
+    >>> ma.power(masked_x, masked_y)
+    masked_array(data=[0.29880715233359845, 15.784728999999999, 1.0, --],
+             mask=[False, False, False,  True],
+       fill_value=1e+20)
+
     """
     if third is not None:
         raise MaskError("3-argument power not supported.")
@@ -6965,6 +7020,21 @@ def sort(a, axis=-1, kind=None, order=None, endwith=True, fill_value=None):
     See Also
     --------
     MaskedArray.sort : equivalent method
+
+    Examples
+    --------
+    >>> import numpy.ma as ma
+    >>> x = [11.2, -3.973, 0.801, -1.41]
+    >>> mask = [0, 0, 0, 1]
+    >>> masked_x = ma.masked_array(x, mask)
+    >>> masked_x
+    masked_array(data=[11.2, -3.973, 0.801, --],
+                 mask=[False, False, False,  True],
+           fill_value=1e+20)
+    >>> ma.sort(masked_x)
+    masked_array(data=[-3.973, 0.801, 11.2, --],
+                 mask=[False, False, False,  True],
+           fill_value=1e+20)
     """
     a = np.array(a, copy=True, subok=True)
     if axis is None:
@@ -6990,6 +7060,29 @@ def compressed(x):
     --------
     ma.MaskedArray.compressed : Equivalent method.
 
+    Examples
+    --------
+    
+    Create an array with negative values masked:
+
+    >>> import numpy as np
+    >>> x = np.array([[1, -1, 0], [2, -1, 3], [7, 4, -1]])
+    >>> masked_x = np.ma.masked_array(x, mask=x < 0)
+    >>> masked_x
+    masked_array(
+      data=[[1, --, 0],
+            [2, --, 3],
+            [7, 4, --]],
+      mask=[[False,  True, False],
+            [False,  True, False],
+            [False, False,  True]],
+      fill_value=999999)
+
+    Compress the masked array into a 1-D array of non-masked values:
+
+    >>> np.ma.compressed(masked_x)
+    array([1, 0, 2, 3, 7, 4])
+
     """
     return asanyarray(x).compressed()
 
@@ -7065,6 +7158,38 @@ def diag(v, k=0):
     --------
     numpy.diag : Equivalent function for ndarrays.
 
+    Examples
+    --------
+
+    Create an array with negative values masked:
+
+    >>> import numpy as np
+    >>> x = np.array([[11.2, -3.973, 18], [0.801, -1.41, 12], [7, 33, -12]])
+    >>> masked_x = np.ma.masked_array(x, mask=x < 0)
+    >>> masked_x
+    masked_array(
+      data=[[11.2, --, 18.0],
+            [0.801, --, 12.0],
+            [7.0, 33.0, --]],
+      mask=[[False,  True, False],
+            [False,  True, False],
+            [False, False,  True]],
+      fill_value=1e+20)
+
+    Isolate the main diagonal from the masked array:
+
+    >>> np.ma.diag(masked_x)
+    masked_array(data=[11.2, --, --],
+                 mask=[False,  True,  True],
+           fill_value=1e+20)
+
+    Isolate the first diagonal below the main diagonal:
+
+    >>> np.ma.diag(masked_x, -1)
+    masked_array(data=[0.801, 33.0],
+                 mask=[False, False],
+           fill_value=1e+20)
+
     """
     output = np.diag(v, k).view(MaskedArray)
     if getmask(v) is not nomask:
@@ -7104,6 +7229,21 @@ def right_shift(a, n):
     --------
     numpy.right_shift
 
+    Examples
+    --------
+    >>> import numpy.ma as ma
+    >>> x = [11, 3, 8, 1]
+    >>> mask = [0, 0, 0, 1]
+    >>> masked_x = ma.masked_array(x, mask)
+    >>> masked_x
+    masked_array(data=[11, 3, 8, --],
+                 mask=[False, False, False,  True],
+           fill_value=999999)
+    >>> ma.right_shift(masked_x,1)
+    masked_array(data=[5, 1, 4, --],
+                 mask=[False, False, False,  True],
+           fill_value=999999)
+
     """
     m = getmask(a)
     if m is nomask:
@@ -7315,6 +7455,141 @@ def size(obj, axis=None):
 size.__doc__ = np.size.__doc__
 
 
+def diff(a, /, n=1, axis=-1, prepend=np._NoValue, append=np._NoValue):
+    """
+    Calculate the n-th discrete difference along the given axis.
+    The first difference is given by ``out[i] = a[i+1] - a[i]`` along
+    the given axis, higher differences are calculated by using `diff`
+    recursively.
+    Preserves the input mask.
+
+    Parameters
+    ----------
+    a : array_like
+        Input array
+    n : int, optional
+        The number of times values are differenced. If zero, the input
+        is returned as-is.
+    axis : int, optional
+        The axis along which the difference is taken, default is the
+        last axis.
+    prepend, append : array_like, optional
+        Values to prepend or append to `a` along axis prior to
+        performing the difference.  Scalar values are expanded to
+        arrays with length 1 in the direction of axis and the shape
+        of the input array in along all other axes.  Otherwise the
+        dimension and shape must match `a` except along axis.
+
+    Returns
+    -------
+    diff : MaskedArray
+        The n-th differences. The shape of the output is the same as `a`
+        except along `axis` where the dimension is smaller by `n`. The
+        type of the output is the same as the type of the difference
+        between any two elements of `a`. This is the same as the type of
+        `a` in most cases. A notable exception is `datetime64`, which
+        results in a `timedelta64` output array.
+
+    See Also
+    --------
+    numpy.diff : Equivalent function in the top-level NumPy module.
+
+    Notes
+    -----
+    Type is preserved for boolean arrays, so the result will contain
+    `False` when consecutive elements are the same and `True` when they
+    differ.
+
+    For unsigned integer arrays, the results will also be unsigned. This
+    should not be surprising, as the result is consistent with
+    calculating the difference directly:
+
+    >>> u8_arr = np.array([1, 0], dtype=np.uint8)
+    >>> np.ma.diff(u8_arr)
+    masked_array(data=[255],
+                 mask=False,
+           fill_value=999999,
+                dtype=uint8)
+    >>> u8_arr[1,...] - u8_arr[0,...]
+    255
+
+    If this is not desirable, then the array should be cast to a larger
+    integer type first:
+
+    >>> i16_arr = u8_arr.astype(np.int16)
+    >>> np.ma.diff(i16_arr)
+    masked_array(data=[-1],
+                 mask=False,
+           fill_value=999999,
+                dtype=int16)
+
+    Examples
+    --------
+    >>> a = np.array([1, 2, 3, 4, 7, 0, 2, 3])
+    >>> x = np.ma.masked_where(a < 2, a)
+    >>> np.ma.diff(x)
+    masked_array(data=[--, 1, 1, 3, --, --, 1],
+            mask=[ True, False, False, False,  True,  True, False],
+        fill_value=999999)
+
+    >>> np.ma.diff(x, n=2)
+    masked_array(data=[--, 0, 2, --, --, --],
+                mask=[ True, False, False,  True,  True,  True],
+        fill_value=999999)
+
+    >>> a = np.array([[1, 3, 1, 5, 10], [0, 1, 5, 6, 8]])
+    >>> x = np.ma.masked_equal(a, value=1)
+    >>> np.ma.diff(x)
+    masked_array(
+        data=[[--, --, --, 5],
+                [--, --, 1, 2]],
+        mask=[[ True,  True,  True, False],
+                [ True,  True, False, False]],
+        fill_value=1)
+
+    >>> np.ma.diff(x, axis=0)
+    masked_array(data=[[--, --, --, 1, -2]],
+            mask=[[ True,  True,  True, False, False]],
+        fill_value=1)
+
+    """
+    if n == 0:
+        return a
+    if n < 0:
+        raise ValueError("order must be non-negative but got " + repr(n))
+
+    a = np.ma.asanyarray(a)
+    if a.ndim == 0:
+        raise ValueError(
+            "diff requires input that is at least one dimensional"
+            )
+
+    combined = []
+    if prepend is not np._NoValue:
+        prepend = np.ma.asanyarray(prepend)
+        if prepend.ndim == 0:
+            shape = list(a.shape)
+            shape[axis] = 1
+            prepend = np.broadcast_to(prepend, tuple(shape))
+        combined.append(prepend)
+
+    combined.append(a)
+
+    if append is not np._NoValue:
+        append = np.ma.asanyarray(append)
+        if append.ndim == 0:
+            shape = list(a.shape)
+            shape[axis] = 1
+            append = np.broadcast_to(append, tuple(shape))
+        combined.append(append)
+
+    if len(combined) > 1:
+        a = np.ma.concatenate(combined, axis)
+
+    # GH 22465 np.diff without prepend/append preserves the mask
+    return np.diff(a, n, axis)
+
+
 ##############################################################################
 #                            Extra functions                                 #
 ##############################################################################
@@ -7542,94 +7817,18 @@ def round_(a, decimals=0, out=None):
 round = round_
 
 
-# Needed by dot, so move here from extras.py. It will still be exported
-# from extras.py for compatibility.
-def mask_rowcols(a, axis=None):
+def _mask_propagate(a, axis):
     """
-    Mask rows and/or columns of a 2D array that contain masked values.
-
-    Mask whole rows and/or columns of a 2D array that contain
-    masked values.  The masking behavior is selected using the
-    `axis` parameter.
-
-      - If `axis` is None, rows *and* columns are masked.
-      - If `axis` is 0, only rows are masked.
-      - If `axis` is 1 or -1, only columns are masked.
-
-    Parameters
-    ----------
-    a : array_like, MaskedArray
-        The array to mask.  If not a MaskedArray instance (or if no array
-        elements are masked).  The result is a MaskedArray with `mask` set
-        to `nomask` (False). Must be a 2D array.
-    axis : int, optional
-        Axis along which to perform the operation. If None, applies to a
-        flattened version of the array.
-
-    Returns
-    -------
-    a : MaskedArray
-        A modified version of the input array, masked depending on the value
-        of the `axis` parameter.
-
-    Raises
-    ------
-    NotImplementedError
-        If input array `a` is not 2D.
-
-    See Also
-    --------
-    mask_rows : Mask rows of a 2D array that contain masked values.
-    mask_cols : Mask cols of a 2D array that contain masked values.
-    masked_where : Mask where a condition is met.
-
-    Notes
-    -----
-    The input array's mask is modified by this function.
-
-    Examples
-    --------
-    >>> import numpy.ma as ma
-    >>> a = np.zeros((3, 3), dtype=int)
-    >>> a[1, 1] = 1
-    >>> a
-    array([[0, 0, 0],
-           [0, 1, 0],
-           [0, 0, 0]])
-    >>> a = ma.masked_equal(a, 1)
-    >>> a
-    masked_array(
-      data=[[0, 0, 0],
-            [0, --, 0],
-            [0, 0, 0]],
-      mask=[[False, False, False],
-            [False,  True, False],
-            [False, False, False]],
-      fill_value=1)
-    >>> ma.mask_rowcols(a)
-    masked_array(
-      data=[[0, --, 0],
-            [--, --, --],
-            [0, --, 0]],
-      mask=[[False,  True, False],
-            [ True,  True,  True],
-            [False,  True, False]],
-      fill_value=1)
-
+    Mask whole 1-d vectors of an array that contain masked values.
     """
     a = array(a, subok=False)
-    if a.ndim != 2:
-        raise NotImplementedError("mask_rowcols works for 2D arrays only.")
     m = getmask(a)
-    # Nothing is masked: return a
-    if m is nomask or not m.any():
+    if m is nomask or not m.any() or axis is None:
         return a
-    maskedval = m.nonzero()
     a._mask = a._mask.copy()
-    if not axis:
-        a[np.unique(maskedval[0])] = masked
-    if axis in [None, 1, -1]:
-        a[:, np.unique(maskedval[1])] = masked
+    axes = normalize_axis_tuple(axis, a.ndim)
+    for ax in axes:
+        a._mask |= m.any(axis=ax, keepdims=True)
     return a
 
 
@@ -7646,10 +7845,6 @@ def dot(a, b, strict=False, out=None):
     corresponding method, it is recommended that the optional arguments be
     treated as keyword only.  At some point that may be mandatory.
 
-    .. note::
-      Works only with 2-D arrays at the moment.
-
-
     Parameters
     ----------
     a, b : masked_array_like
@@ -7693,18 +7888,22 @@ def dot(a, b, strict=False, out=None):
       fill_value=999999)
 
     """
-    # !!!: Works only with 2D arrays. There should be a way to get it to run
-    # with higher dimension
-    if strict and (a.ndim == 2) and (b.ndim == 2):
-        a = mask_rowcols(a, 0)
-        b = mask_rowcols(b, 1)
+    if strict is True:
+        if np.ndim(a) == 0 or np.ndim(b) == 0:
+            pass
+        elif b.ndim == 1:
+            a = _mask_propagate(a, a.ndim - 1)
+            b = _mask_propagate(b, b.ndim - 1)
+        else:
+            a = _mask_propagate(a, a.ndim - 1)
+            b = _mask_propagate(b, b.ndim - 2)
     am = ~getmaskarray(a)
     bm = ~getmaskarray(b)
 
     if out is None:
         d = np.dot(filled(a, 0), filled(b, 0))
         m = ~np.dot(am, bm)
-        if d.ndim == 0:
+        if np.ndim(d) == 0:
             d = np.asarray(d)
         r = d.view(get_masked_subclass(a, b))
         r.__setmask__(m)
@@ -8255,12 +8454,6 @@ clip = _convert2ma(
     np_ret='clipped_array : ndarray',
     np_ma_ret='clipped_array : MaskedArray',
 )
-diff = _convert2ma(
-    'diff',
-    params=dict(fill_value=None, hardmask=False),
-    np_ret='diff : ndarray',
-    np_ma_ret='diff : MaskedArray',
-)
 empty = _convert2ma(
     'empty',
     params=dict(fill_value=None, hardmask=False),
diff --git a/numpy/ma/core.pyi b/numpy/ma/core.pyi
index ffdb21983..e94ebce3c 100644
--- a/numpy/ma/core.pyi
+++ b/numpy/ma/core.pyi
@@ -7,7 +7,6 @@ from numpy import (
     amin as amin,
     bool_ as bool_,
     expand_dims as expand_dims,
-    diff as diff,
     clip as clip,
     indices as indices,
     ones_like as ones_like,
@@ -220,6 +219,10 @@ class MaskedArray(ndarray[_ShapeType, _DType_co]):
     def compress(self, condition, axis=..., out=...): ...
     def __eq__(self, other): ...
     def __ne__(self, other): ...
+    def __ge__(self, other): ...
+    def __gt__(self, other): ...
+    def __le__(self, other): ...
+    def __lt__(self, other): ...
     def __add__(self, other): ...
     def __radd__(self, other): ...
     def __sub__(self, other): ...
@@ -276,7 +279,6 @@ class MaskedArray(ndarray[_ShapeType, _DType_co]):
     def sort(self, axis=..., kind=..., order=..., endwith=..., fill_value=...): ...
     def min(self, axis=..., out=..., fill_value=..., keepdims=...): ...
     # NOTE: deprecated
-    # def mini(self, axis=...): ...
     # def tostring(self, fill_value=..., order=...): ...
     def max(self, axis=..., out=..., fill_value=..., keepdims=...): ...
     def ptp(self, axis=..., out=..., fill_value=..., keepdims=...): ...
@@ -430,10 +432,10 @@ def resize(x, new_shape): ...
 def ndim(obj): ...
 def shape(obj): ...
 def size(obj, axis=...): ...
+def diff(a, /, n=..., axis=..., prepend=..., append=...): ...
 def where(condition, x=..., y=...): ...
 def choose(indices, choices, out=..., mode=...): ...
-def round_(a, decimals=..., out=...): ...
-round = round_
+def round(a, decimals=..., out=...): ...
 
 def inner(a, b): ...
 innerproduct = inner
diff --git a/numpy/ma/extras.py b/numpy/ma/extras.py
index 4e7f8e85e..8a6246c36 100644
--- a/numpy/ma/extras.py
+++ b/numpy/ma/extras.py
@@ -27,8 +27,7 @@ from . import core as ma
 from .core import (
     MaskedArray, MAError, add, array, asarray, concatenate, filled, count,
     getmask, getmaskarray, make_mask_descr, masked, masked_array, mask_or,
-    nomask, ones, sort, zeros, getdata, get_masked_subclass, dot,
-    mask_rowcols
+    nomask, ones, sort, zeros, getdata, get_masked_subclass, dot
     )
 
 import numpy as np
@@ -398,7 +397,7 @@ def apply_along_axis(func1d, axis, arr, *args, **kwargs):
         dtypes.append(np.asarray(res).dtype)
         outarr = zeros(outshape, object)
         outarr[tuple(ind)] = res
-        Ntot = np.product(outshape)
+        Ntot = np.prod(outshape)
         k = 1
         while k < Ntot:
             # increment the index
@@ -418,7 +417,7 @@ def apply_along_axis(func1d, axis, arr, *args, **kwargs):
         j = i.copy()
         j[axis] = ([slice(None, None)] * res.ndim)
         j.put(indlist, ind)
-        Ntot = np.product(outshape)
+        Ntot = np.prod(outshape)
         holdshape = outshape
         outshape = list(arr.shape)
         outshape[axis] = res.shape
@@ -732,12 +731,8 @@ def median(a, axis=None, out=None, overwrite_input=False, keepdims=False):
         else:
             return m
 
-    r, k = _ureduce(a, func=_median, axis=axis, out=out,
+    return _ureduce(a, func=_median, keepdims=keepdims, axis=axis, out=out,
                     overwrite_input=overwrite_input)
-    if keepdims:
-        return r.reshape(k)
-    else:
-        return r
 
 
 def _median(a, axis=None, out=None, overwrite_input=False):
@@ -959,6 +954,95 @@ def compress_cols(a):
     return compress_rowcols(a, 1)
 
 
+def mask_rowcols(a, axis=None):
+    """
+    Mask rows and/or columns of a 2D array that contain masked values.
+
+    Mask whole rows and/or columns of a 2D array that contain
+    masked values.  The masking behavior is selected using the
+    `axis` parameter.
+
+      - If `axis` is None, rows *and* columns are masked.
+      - If `axis` is 0, only rows are masked.
+      - If `axis` is 1 or -1, only columns are masked.
+
+    Parameters
+    ----------
+    a : array_like, MaskedArray
+        The array to mask.  If not a MaskedArray instance (or if no array
+        elements are masked), the result is a MaskedArray with `mask` set
+        to `nomask` (False). Must be a 2D array.
+    axis : int, optional
+        Axis along which to perform the operation. If None, applies to a
+        flattened version of the array.
+
+    Returns
+    -------
+    a : MaskedArray
+        A modified version of the input array, masked depending on the value
+        of the `axis` parameter.
+
+    Raises
+    ------
+    NotImplementedError
+        If input array `a` is not 2D.
+
+    See Also
+    --------
+    mask_rows : Mask rows of a 2D array that contain masked values.
+    mask_cols : Mask cols of a 2D array that contain masked values.
+    masked_where : Mask where a condition is met.
+
+    Notes
+    -----
+    The input array's mask is modified by this function.
+
+    Examples
+    --------
+    >>> import numpy.ma as ma
+    >>> a = np.zeros((3, 3), dtype=int)
+    >>> a[1, 1] = 1
+    >>> a
+    array([[0, 0, 0],
+           [0, 1, 0],
+           [0, 0, 0]])
+    >>> a = ma.masked_equal(a, 1)
+    >>> a
+    masked_array(
+      data=[[0, 0, 0],
+            [0, --, 0],
+            [0, 0, 0]],
+      mask=[[False, False, False],
+            [False,  True, False],
+            [False, False, False]],
+      fill_value=1)
+    >>> ma.mask_rowcols(a)
+    masked_array(
+      data=[[0, --, 0],
+            [--, --, --],
+            [0, --, 0]],
+      mask=[[False,  True, False],
+            [ True,  True,  True],
+            [False,  True, False]],
+      fill_value=1)
+
+    """
+    a = array(a, subok=False)
+    if a.ndim != 2:
+        raise NotImplementedError("mask_rowcols works for 2D arrays only.")
+    m = getmask(a)
+    # Nothing is masked: return a
+    if m is nomask or not m.any():
+        return a
+    maskedval = m.nonzero()
+    a._mask = a._mask.copy()
+    if not axis:
+        a[np.unique(maskedval[0])] = masked
+    if axis in [None, 1, -1]:
+        a[:, np.unique(maskedval[1])] = masked
+    return a
+
+
 def mask_rows(a, axis=np._NoValue):
     """
     Mask rows of a 2D array that contain masked values.
diff --git a/numpy/ma/tests/test_core.py b/numpy/ma/tests/test_core.py
index 6196dcfab..6ab1d7e4f 100644
--- a/numpy/ma/tests/test_core.py
+++ b/numpy/ma/tests/test_core.py
@@ -8,6 +8,7 @@ __author__ = "Pierre GF Gerard-Marchant"
 
 import sys
 import warnings
+import copy
 import operator
 import itertools
 import textwrap
@@ -21,8 +22,9 @@ import numpy.ma.core
 import numpy.core.fromnumeric as fromnumeric
 import numpy.core.umath as umath
 from numpy.testing import (
-    assert_raises, assert_warns, suppress_warnings
+    assert_raises, assert_warns, suppress_warnings, IS_WASM
     )
+from numpy.testing._private.utils import requires_memory
 from numpy import ndarray
 from numpy.compat import asbytes
 from numpy.ma.testutils import (
@@ -214,6 +216,8 @@ class TestMaskedArray:
         assert_(np.may_share_memory(x.mask, y.mask))
         y = array([1, 2, 3], mask=x._mask, copy=True)
         assert_(not np.may_share_memory(x.mask, y.mask))
+        x = array([1, 2, 3], mask=None)
+        assert_equal(x._mask, [False, False, False])
 
     def test_masked_singleton_array_creation_warns(self):
         # The first works, but should not (ideally), there may be no way
@@ -373,6 +377,24 @@ class TestMaskedArray:
         assert_equal(s1, s2)
         assert_(x1[1:1].shape == (0,))
 
+    def test_setitem_no_warning(self):
+        # Setitem shouldn't warn, because the assignment might be masked
+        # and warning for a masked assignment is weird (see gh-23000)
+        # (When the value is masked, otherwise a warning would be acceptable
+        # but is not given currently.)
+        x = np.ma.arange(60).reshape((6, 10))
+        index = (slice(1, 5, 2), [7, 5])
+        value = np.ma.masked_all((2, 2))
+        value._data[...] = np.inf  # not a valid integer...
+        x[index] = value
+        # The masked scalar is special cased, but test anyway (it's NaN):
+        x[...] = np.ma.masked
+        # Finally, a large value that cannot be cast to the float32 `x`
+        x = np.ma.arange(3., dtype=np.float32)
+        value = np.ma.array([2e234, 1, 1], mask=[True, False, False])
+        x[...] = value
+        x[[0, 1, 2]] = value
+
     @suppress_copy_mask_on_assignment
     def test_copy(self):
         # Tests of some subtle points of copying and sizing.
@@ -1332,16 +1354,16 @@ class TestMaskedArrayArithmetic:
         assert_equal(np.sum(x, axis=0), sum(x, axis=0))
         assert_equal(np.sum(filled(xm, 0), axis=0), sum(xm, axis=0))
         assert_equal(np.sum(x, 0), sum(x, 0))
-        assert_equal(np.product(x, axis=0), product(x, axis=0))
-        assert_equal(np.product(x, 0), product(x, 0))
-        assert_equal(np.product(filled(xm, 1), axis=0), product(xm, axis=0))
+        assert_equal(np.prod(x, axis=0), product(x, axis=0))
+        assert_equal(np.prod(x, 0), product(x, 0))
+        assert_equal(np.prod(filled(xm, 1), axis=0), product(xm, axis=0))
         s = (3, 4)
         x.shape = y.shape = xm.shape = ym.shape = s
         if len(s) > 1:
             assert_equal(np.concatenate((x, y), 1), concatenate((xm, ym), 1))
             assert_equal(np.add.reduce(x, 1), add.reduce(x, 1))
             assert_equal(np.sum(x, 1), sum(x, 1))
-            assert_equal(np.product(x, 1), product(x, 1))
+            assert_equal(np.prod(x, 1), product(x, 1))
 
     def test_binops_d2D(self):
         # Test binary operations on 2D data
@@ -3407,6 +3429,24 @@ class TestMaskedArrayMethods:
         assert_equal(a.ravel(order='C'), [1, 2, 3, 4])
         assert_equal(a.ravel(order='F'), [1, 3, 2, 4])
 
+    @pytest.mark.parametrize("order", "AKCF")
+    @pytest.mark.parametrize("data_order", "CF")
+    def test_ravel_order(self, order, data_order):
+        # Ravelling must ravel mask and data in the same order always to avoid
+        # misaligning the two in the ravel result.
+        arr = np.ones((5, 10), order=data_order)
+        arr[0, :] = 0
+        mask = np.ones((10, 5), dtype=bool, order=data_order).T
+        mask[0, :] = False
+        x = array(arr, mask=mask)
+        assert x._data.flags.fnc != x._mask.flags.fnc
+        assert (x.filled(0) == 0).all()
+        raveled = x.ravel(order)
+        assert (raveled.filled(0) == 0).all()
+
+        # NOTE: Can be wrong if arr order is neither C nor F and `order="K"` 
+        assert_array_equal(arr.ravel(order), x.ravel(order)._data)
+
     def test_reshape(self):
         # Tests reshape
         x = arange(4)
@@ -4082,6 +4122,7 @@ class TestMaskedArrayMathMethods:
         assert_equal(a.max(-1), [3, 6])
         assert_equal(a.max(1), [3, 6])
 
+    @requires_memory(free_bytes=2 * 10000 * 1000 * 2)
     def test_mean_overflow(self):
         # Test overflow in masked arrays
         # gh-20272
@@ -4089,6 +4130,46 @@ class TestMaskedArrayMathMethods:
                          mask=np.zeros((10000, 10000)))
         assert_equal(a.mean(), 65535.0)
 
+    def test_diff_with_prepend(self):
+        # GH 22465
+        x = np.array([1, 2, 2, 3, 4, 2, 1, 1])
+
+        a = np.ma.masked_equal(x[3:], value=2)
+        a_prep = np.ma.masked_equal(x[:3], value=2)
+        diff1 = np.ma.diff(a, prepend=a_prep, axis=0)
+
+        b = np.ma.masked_equal(x, value=2)
+        diff2 = np.ma.diff(b, axis=0)
+
+        assert_(np.ma.allequal(diff1, diff2))
+
+    def test_diff_with_append(self):
+        # GH 22465
+        x = np.array([1, 2, 2, 3, 4, 2, 1, 1])
+
+        a = np.ma.masked_equal(x[:3], value=2)
+        a_app = np.ma.masked_equal(x[3:], value=2)
+        diff1 = np.ma.diff(a, append=a_app, axis=0)
+
+        b = np.ma.masked_equal(x, value=2)
+        diff2 = np.ma.diff(b, axis=0)
+
+        assert_(np.ma.allequal(diff1, diff2))
+
+    def test_diff_with_dim_0(self):
+        with pytest.raises(
+            ValueError,
+            match="diff requires input that is at least one dimensional"
+            ):
+            np.ma.diff(np.array(1))
+
+    def test_diff_with_n_0(self):
+        a = np.ma.masked_equal([1, 2, 2, 3, 4, 2, 1, 1], value=2)
+        diff = np.ma.diff(a, n=0, axis=0)
+
+        assert_(np.ma.allequal(a, diff))
+
+
 class TestMaskedArrayMathMethodsComplex:
     # Test class for miscellaneous MaskedArrays methods.
     def setup_method(self):
@@ -4365,6 +4446,7 @@ class TestMaskedArrayFunctions:
         assert_equal(test, ctrl)
         assert_equal(test.mask, ctrl.mask)
 
+    @pytest.mark.skipif(IS_WASM, reason="fp errors don't work in wasm")
     def test_where(self):
         # Test the where function
         x = np.array([1., 1., 1., -2., pi/2.0, 4., 5., -10., 10., 1., 2., 3.])
@@ -4504,6 +4586,32 @@ class TestMaskedArrayFunctions:
                            match="not supported for the input types"):
             np.ma.masked_invalid(a)
 
+    def test_masked_invalid_pandas(self):
+        # getdata() used to be bad for pandas series due to its _data
+        # attribute.  This test is a regression test mainly and may be
+        # removed if getdata() is adjusted.
+        class Series():
+            _data = "nonsense"
+
+            def __array__(self):
+                return np.array([5, np.nan, np.inf])
+
+        arr = np.ma.masked_invalid(Series())
+        assert_array_equal(arr._data, np.array(Series()))
+        assert_array_equal(arr._mask, [False, True, True])
+
+    @pytest.mark.parametrize("copy", [True, False])
+    def test_masked_invalid_full_mask(self, copy):
+        # Matplotlib relied on masked_invalid always returning a full mask
+        # (Also astropy projects, but were ok with it gh-22720 and gh-22842)
+        a = np.ma.array([1, 2, 3, 4])
+        assert a._mask is nomask
+        res = np.ma.masked_invalid(a, copy=copy)
+        assert res.mask is not nomask
+        # mask of a should not be mutated
+        assert a.mask is nomask
+        assert np.may_share_memory(a._data, res._data) != copy
+
     def test_choose(self):
         # Test choose
         choices = [[0, 1, 2, 3], [10, 11, 12, 13],
@@ -5499,3 +5607,47 @@ note
 original note"""
 
     assert_equal(np.ma.core.doc_note(method.__doc__, "note"), expected_doc)
+
+
+def test_gh_22556():
+    source = np.ma.array([0, [0, 1, 2]], dtype=object)
+    deepcopy = copy.deepcopy(source)
+    deepcopy[1].append('this should not appear in source')
+    assert len(source[1]) == 3
+
+
+def test_gh_21022():
+    # testing for absence of reported error
+    source = np.ma.masked_array(data=[-1, -1], mask=True, dtype=np.float64)
+    axis = np.array(0)
+    result = np.prod(source, axis=axis, keepdims=False)
+    result = np.ma.masked_array(result,
+                                mask=np.ones(result.shape, dtype=np.bool_))
+    array = np.ma.masked_array(data=-1, mask=True, dtype=np.float64)
+    copy.deepcopy(array)
+    copy.deepcopy(result)
+
+
+def test_deepcopy_2d_obj():
+    source = np.ma.array([[0, "dog"],
+                          [1, 1],
+                          [[1, 2], "cat"]],
+                        mask=[[0, 1],
+                              [0, 0],
+                              [0, 0]],
+                        dtype=object)
+    deepcopy = copy.deepcopy(source)
+    deepcopy[2, 0].extend(['this should not appear in source', 3])
+    assert len(source[2, 0]) == 2
+    assert len(deepcopy[2, 0]) == 4
+    assert_equal(deepcopy._mask, source._mask)
+    deepcopy._mask[0, 0] = 1
+    assert source._mask[0, 0] == 0
+
+
+def test_deepcopy_0d_obj():
+    source = np.ma.array(0, mask=[0], dtype=object)
+    deepcopy = copy.deepcopy(source)
+    deepcopy[...] = 17
+    assert_equal(source, 0)
+    assert_equal(deepcopy, 17)
diff --git a/numpy/ma/tests/test_extras.py b/numpy/ma/tests/test_extras.py
index 3c95e25ea..d09a50fec 100644
--- a/numpy/ma/tests/test_extras.py
+++ b/numpy/ma/tests/test_extras.py
@@ -12,6 +12,7 @@ import itertools
 import pytest
 
 import numpy as np
+from numpy.core.numeric import normalize_axis_tuple
 from numpy.testing import (
     assert_warns, suppress_warnings
     )
@@ -386,8 +387,8 @@ class TestConcatenator:
         # Tests mr_ on 2D arrays.
         a_1 = np.random.rand(5, 5)
         a_2 = np.random.rand(5, 5)
-        m_1 = np.round_(np.random.rand(5, 5), 0)
-        m_2 = np.round_(np.random.rand(5, 5), 0)
+        m_1 = np.round(np.random.rand(5, 5), 0)
+        m_2 = np.round(np.random.rand(5, 5), 0)
         b_1 = masked_array(a_1, mask=m_1)
         b_2 = masked_array(a_2, mask=m_2)
         # append columns
@@ -729,6 +730,47 @@ class TestCompressFunctions:
         assert_equal(c.mask, [[0, 0, 1], [1, 1, 1], [0, 0, 1]])
         c = dot(b, a, strict=False)
         assert_equal(c, np.dot(b.filled(0), a.filled(0)))
+        #
+        a = masked_array(np.arange(8).reshape(2, 2, 2),
+                         mask=[[[1, 0], [0, 0]], [[0, 0], [0, 0]]])
+        b = masked_array(np.arange(8).reshape(2, 2, 2),
+                         mask=[[[0, 0], [0, 0]], [[0, 0], [0, 1]]])
+        c = dot(a, b, strict=True)
+        assert_equal(c.mask,
+                     [[[[1, 1], [1, 1]], [[0, 0], [0, 1]]],
+                      [[[0, 0], [0, 1]], [[0, 0], [0, 1]]]])
+        c = dot(a, b, strict=False)
+        assert_equal(c.mask,
+                     [[[[0, 0], [0, 1]], [[0, 0], [0, 0]]],
+                      [[[0, 0], [0, 0]], [[0, 0], [0, 0]]]])
+        c = dot(b, a, strict=True)
+        assert_equal(c.mask,
+                     [[[[1, 0], [0, 0]], [[1, 0], [0, 0]]],
+                      [[[1, 0], [0, 0]], [[1, 1], [1, 1]]]])
+        c = dot(b, a, strict=False)
+        assert_equal(c.mask,
+                     [[[[0, 0], [0, 0]], [[0, 0], [0, 0]]],
+                      [[[0, 0], [0, 0]], [[1, 0], [0, 0]]]])
+        #
+        a = masked_array(np.arange(8).reshape(2, 2, 2),
+                         mask=[[[1, 0], [0, 0]], [[0, 0], [0, 0]]])
+        b = 5.
+        c = dot(a, b, strict=True)
+        assert_equal(c.mask, [[[1, 0], [0, 0]], [[0, 0], [0, 0]]])
+        c = dot(a, b, strict=False)
+        assert_equal(c.mask, [[[1, 0], [0, 0]], [[0, 0], [0, 0]]])
+        c = dot(b, a, strict=True)
+        assert_equal(c.mask, [[[1, 0], [0, 0]], [[0, 0], [0, 0]]])
+        c = dot(b, a, strict=False)
+        assert_equal(c.mask, [[[1, 0], [0, 0]], [[0, 0], [0, 0]]])
+        #
+        a = masked_array(np.arange(8).reshape(2, 2, 2),
+                         mask=[[[1, 0], [0, 0]], [[0, 0], [0, 0]]])
+        b = masked_array(np.arange(2), mask=[0, 1])
+        c = dot(a, b, strict=True)
+        assert_equal(c.mask, [[1, 1], [1, 1]])
+        c = dot(a, b, strict=False)
+        assert_equal(c.mask, [[1, 0], [0, 0]])
 
     def test_dot_returns_maskedarray(self):
         # See gh-6611
@@ -989,6 +1031,34 @@ class TestMedian:
             assert_(r is out)
             assert_(type(r) is MaskedArray)
 
+    @pytest.mark.parametrize(
+        argnames='axis',
+        argvalues=[
+            None,
+            1,
+            (1, ),
+            (0, 1),
+            (-3, -1),
+        ]
+    )
+    def test_keepdims_out(self, axis):
+        mask = np.zeros((3, 5, 7, 11), dtype=bool)
+        # Randomly set some elements to True:
+        w = np.random.random((4, 200)) * np.array(mask.shape)[:, None]
+        w = w.astype(np.intp)
+        mask[tuple(w)] = np.nan
+        d = masked_array(np.ones(mask.shape), mask=mask)
+        if axis is None:
+            shape_out = (1,) * d.ndim
+        else:
+            axis_norm = normalize_axis_tuple(axis, d.ndim)
+            shape_out = tuple(
+                1 if i in axis_norm else d.shape[i] for i in range(d.ndim))
+        out = masked_array(np.empty(shape_out))
+        result = median(d, axis=axis, keepdims=True, out=out)
+        assert result is out
+        assert_equal(result.shape, shape_out)
+
     def test_single_non_masked_value_on_axis(self):
         data = [[1., 0.],
                 [0., 3.],
diff --git a/numpy/ma/tests/test_old_ma.py b/numpy/ma/tests/test_old_ma.py
index 8465b1153..7b892ad23 100644
--- a/numpy/ma/tests/test_old_ma.py
+++ b/numpy/ma/tests/test_old_ma.py
@@ -194,16 +194,16 @@ class TestMa:
         assert_(eq(np.sum(x, axis=0), sum(x, axis=0)))
         assert_(eq(np.sum(filled(xm, 0), axis=0), sum(xm, axis=0)))
         assert_(eq(np.sum(x, 0), sum(x, 0)))
-        assert_(eq(np.product(x, axis=0), product(x, axis=0)))
-        assert_(eq(np.product(x, 0), product(x, 0)))
-        assert_(eq(np.product(filled(xm, 1), axis=0),
+        assert_(eq(np.prod(x, axis=0), product(x, axis=0)))
+        assert_(eq(np.prod(x, 0), product(x, 0)))
+        assert_(eq(np.prod(filled(xm, 1), axis=0),
                            product(xm, axis=0)))
         if len(s) > 1:
             assert_(eq(np.concatenate((x, y), 1),
                                concatenate((xm, ym), 1)))
             assert_(eq(np.add.reduce(x, 1), add.reduce(x, 1)))
             assert_(eq(np.sum(x, 1), sum(x, 1)))
-            assert_(eq(np.product(x, 1), product(x, 1)))
+            assert_(eq(np.prod(x, 1), product(x, 1)))
 
     def test_testCI(self):
         # Test of conversions and indexing
diff --git a/numpy/ma/tests/test_regression.py b/numpy/ma/tests/test_regression.py
index cb3d0349f..f4f32cc7a 100644
--- a/numpy/ma/tests/test_regression.py
+++ b/numpy/ma/tests/test_regression.py
@@ -89,3 +89,9 @@ class TestRegression:
     def test_masked_array_tobytes_fortran(self):
         ma = np.ma.arange(4).reshape((2,2))
         assert_array_equal(ma.tobytes(order='F'), ma.T.tobytes())
+
+    def test_structured_array(self):
+        # see gh-22041
+        np.ma.array((1, (b"", b"")),
+                    dtype=[("x", np.int_),
+                          ("y", [("i", np.void), ("j", np.void)])])
diff --git a/numpy/ma/tests/test_subclassing.py b/numpy/ma/tests/test_subclassing.py
index 64c66eeb9..e3c885253 100644
--- a/numpy/ma/tests/test_subclassing.py
+++ b/numpy/ma/tests/test_subclassing.py
@@ -154,6 +154,7 @@ class WrappedArray(NDArrayOperatorsMixin):
     ufunc deferrals are commutative.
     See: https://github.com/numpy/numpy/issues/15200)
     """
+    __slots__ = ('_array', 'attrs')
     __array_priority__ = 20
 
     def __init__(self, array, **attrs):
@@ -448,3 +449,12 @@ class TestClassWrapping:
         assert_(isinstance(np.divide(wm, m2), WrappedArray))
         assert_(isinstance(np.divide(m2, wm), WrappedArray))
         assert_equal(np.divide(m2, wm), np.divide(wm, m2))
+
+    def test_mixins_have_slots(self):
+        mixin = NDArrayOperatorsMixin()
+        # Should raise an error
+        assert_raises(AttributeError, mixin.__setattr__, "not_a_real_attr", 1)
+
+        m = np.ma.masked_array([1, 3, 5], mask=[False, True, False])
+        wm = WrappedArray(m)
+        assert_raises(AttributeError, wm.__setattr__, "not_an_attr", 2)
diff --git a/numpy/ma/testutils.py b/numpy/ma/testutils.py
index 2dd479abe..7a633906b 100644
--- a/numpy/ma/testutils.py
+++ b/numpy/ma/testutils.py
@@ -233,7 +233,7 @@ def fail_if_array_equal(x, y, err_msg='', verbose=True):
 
     """
     def compare(x, y):
-        return (not np.alltrue(approx(x, y)))
+        return (not np.all(approx(x, y)))
     assert_array_compare(compare, x, y, err_msg=err_msg, verbose=verbose,
                          header='Arrays are not equal')
 
diff --git a/numpy/matrixlib/defmatrix.py b/numpy/matrixlib/defmatrix.py
index a414ee9bb..d029b13fb 100644
--- a/numpy/matrixlib/defmatrix.py
+++ b/numpy/matrixlib/defmatrix.py
@@ -3,9 +3,10 @@ __all__ = ['matrix', 'bmat', 'mat', 'asmatrix']
 import sys
 import warnings
 import ast
+
+from .._utils import set_module
 import numpy.core.numeric as N
 from numpy.core.numeric import concatenate, isscalar
-from numpy.core.overrides import set_module
 # While not in __all__, matrix_power used to be defined here, so we import
 # it for backward compatibility.
 from numpy.linalg import matrix_power
diff --git a/numpy/meson.build b/numpy/meson.build
new file mode 100644
index 000000000..de36e0d49
--- /dev/null
+++ b/numpy/meson.build
@@ -0,0 +1,214 @@
+# We need -lm for all C code (assuming it uses math functions, which is safe to
+# assume for numpy).
+m_dep = cc.find_library('m', required : false)
+mlib_linkflag = ''
+if m_dep.found()
+  mlib_linkflag = '-lm'
+  add_project_link_arguments(mlib_linkflag, language : 'c')
+endif
+
+# Platform detection
+is_windows = host_machine.system() == 'windows'
+is_mingw = is_windows and cc.get_id() == 'gcc'
+
+if is_windows
+  # For mingw-w64, link statically against the UCRT.
+  gcc_link_args = ['-lucrt', '-static']
+  if is_mingw
+    add_project_link_arguments(gcc_link_args, language: ['c', 'cpp'])
+    # Force gcc to float64 long doubles for compatibility with MSVC
+    # builds, for C only.
+    add_project_arguments('-mlong-double-64', language: 'c')
+    # Make fprintf("%zd") work (see https://github.com/rgommers/scipy/issues/118)
+    add_project_arguments('-D__USE_MINGW_ANSI_STDIO=1', language: ['c', 'cpp'])
+    # Manual add of MS_WIN64 macro when not using MSVC.
+    # https://bugs.python.org/issue28267
+    bitness = run_command('_build_utils/gcc_build_bitness.py').stdout().strip()
+    if bitness == '64'
+      add_project_arguments('-DMS_WIN64', language: ['c', 'cpp'])
+    endif
+  endif
+endif
+
+# Enable UNIX large file support on 32-bit systems (64 bit off_t,
+# lseek -> lseek64, etc.)
+cflags_large_file_support = []
+if host_machine.system() == 'aix'
+  cflags_large_file_support += '-D_LARGE_FILES'
+else
+  cflags_large_file_support += [
+    '-D_FILE_OFFSET_BITS=64',
+    '-D_LARGEFILE_SOURCE=1',
+    '-D_LARGEFILE64_SOURCE=1',
+  ]
+endif
+
+
+# TODO: 64-bit BLAS and LAPACK
+#
+# Note that this works as long as BLAS and LAPACK are detected properly via
+# pkg-config. By default we look for OpenBLAS, other libraries can be configured via
+# `meson configure -Dblas=blas -Dlapack=lapack` (example to build with Netlib
+# BLAS and LAPACK).
+# For MKL and for auto-detecting one of multiple libs, we'll need a custom
+# dependency in Meson (like is done for scalapack) - see
+# https://github.com/mesonbuild/meson/issues/2835
+blas_name = get_option('blas')
+lapack_name = get_option('lapack')
+# pkg-config uses a lower-case name while CMake uses a capitalized name, so try
+# that too to make the fallback detection with CMake work
+if blas_name == 'openblas'
+  blas_name = ['openblas', 'OpenBLAS']
+endif
+if lapack_name == 'openblas'
+  lapack_name = ['openblas', 'OpenBLAS']
+endif
+blas = dependency(blas_name, required: false)
+lapack = dependency(lapack_name, required: false)
+
+dependency_map = {
+  'BLAS': blas,
+  'LAPACK': lapack,
+}
+
+# BLAS and LAPACK are optional dependencies for NumPy. We can only use a BLAS
+# which provides a CBLAS interface.
+# TODO: add ILP64 support
+have_blas = blas.found() # TODO: and blas.has_cblas()
+have_lapack = lapack.found()
+if have_blas
+  # TODO: this is a shortcut - it needs to be checked rather than used
+  # unconditionally, and then added to the extension modules that need the
+  # macro, rather than as a project argument.
+  add_project_arguments('-DHAVE_CBLAS', language: ['c', 'cpp'])
+endif
+
+# Copy the main __init__.py|pxd files to the build dir (needed for Cython)
+__init__py = fs.copyfile('__init__.py')
+__init__pxd = fs.copyfile('__init__.pxd')
+__init__pxd30 = fs.copyfile('__init__.cython-30.pxd')
+_cython_tree = [__init__py, __init__pxd, __init__pxd30]
+
+python_sources = [
+  '__init__.cython-30.pxd',
+  '__init__.pxd',
+  '__init__.py',
+  '__init__.pyi',
+  '_distributor_init.py',
+  '_globals.py',
+  '_pytesttester.py',
+  '_pytesttester.pyi',
+  '_version.py',
+  'conftest.py',
+  'ctypeslib.py',
+  'ctypeslib.pyi',
+  'exceptions.py',
+  'exceptions.pyi',
+  'dtypes.py',
+  'dtypes.pyi',
+  'matlib.py',
+  'py.typed',
+  'version.py'
+]
+
+py.install_sources(
+  python_sources,
+  subdir: 'numpy'
+)
+
+src_file_cli = find_program('_build_utils/process_src_template.py')
+src_file = generator(src_file_cli,
+  arguments : ['@INPUT@', '--outfile', '@OUTPUT@'],
+  output : '@BASENAME@'
+)
+
+tempita_cli = find_program('_build_utils/tempita.py')
+tempita = generator(tempita_cli,
+  arguments : ['@INPUT@', '--outfile', '@OUTPUT@'],
+  output : '@BASENAME@'
+)
+
+pure_subdirs = [
+  '_pyinstaller',
+  '_typing',
+  '_utils',
+  'array_api',
+  'compat',
+  'distutils',
+  'doc',
+  'f2py',
+  'lib',
+  'ma',
+  'matrixlib',
+  'polynomial',
+  'testing',
+  'tests',
+  'typing',
+]
+
+np_dir = py.get_install_dir() / 'numpy'
+
+foreach subdir: pure_subdirs
+  install_subdir(subdir, install_dir: np_dir)
+endforeach
+
+compilers = {
+  'C': cc,
+  'CPP': cpp,
+  'CYTHON': meson.get_compiler('cython')
+}
+
+machines = {
+  'HOST': host_machine,
+  'BUILD': build_machine,
+}
+
+conf_data = configuration_data()
+
+# Set compiler information
+foreach name, compiler : compilers
+  conf_data.set(name + '_COMP', compiler.get_id())
+  conf_data.set(name + '_COMP_LINKER_ID', compiler.get_linker_id())
+  conf_data.set(name + '_COMP_VERSION', compiler.version())
+  conf_data.set(name + '_COMP_CMD_ARRAY', ', '.join(compiler.cmd_array()))
+endforeach
+
+# Machines CPU and system information
+foreach name, machine : machines
+  conf_data.set(name + '_CPU', machine.cpu())
+  conf_data.set(name + '_CPU_FAMILY', machine.cpu_family())
+  conf_data.set(name + '_CPU_ENDIAN', machine.endian())
+  conf_data.set(name + '_CPU_SYSTEM', machine.system())
+endforeach
+
+conf_data.set('CROSS_COMPILED', meson.is_cross_build())
+
+# Python information
+conf_data.set('PYTHON_PATH', py.full_path())
+conf_data.set('PYTHON_VERSION', py.language_version())
+
+# Dependencies information
+foreach name, dep : dependency_map
+  conf_data.set(name + '_NAME', dep.name())
+  conf_data.set(name + '_FOUND', dep.found())
+  if dep.found()
+    conf_data.set(name + '_VERSION', dep.version())
+    conf_data.set(name + '_TYPE_NAME', dep.type_name())
+    conf_data.set(name + '_INCLUDEDIR', dep.get_variable('includedir'))
+    conf_data.set(name + '_LIBDIR', dep.get_variable('libdir'))
+    conf_data.set(name + '_OPENBLAS_CONFIG', dep.get_variable('openblas_config'))
+    conf_data.set(name + '_PCFILEDIR', dep.get_variable('pcfiledir'))
+  endif
+endforeach
+
+configure_file(
+  input: '__config__.py.in',
+  output: '__config__.py',
+  configuration : conf_data,
+  install_dir: np_dir,
+)
+
+subdir('core')
+subdir('fft')
+subdir('linalg')
+subdir('random')
diff --git a/numpy/polynomial/_polybase.py b/numpy/polynomial/_polybase.py
index 9674dee0b..9730574cf 100644
--- a/numpy/polynomial/_polybase.py
+++ b/numpy/polynomial/_polybase.py
@@ -499,7 +499,7 @@ class ABCPolyBase(abc.ABC):
         ret['coef'] = self.coef.copy()
         ret['domain'] = self.domain.copy()
         ret['window'] = self.window.copy()
-        ret['symbol'] = self.symbol.copy()
+        ret['symbol'] = self.symbol
         return ret
 
     def __setstate__(self, dict):
@@ -682,6 +682,28 @@ class ABCPolyBase(abc.ABC):
         degree : int
             Degree of the series, one less than the number of coefficients.
 
+        Examples
+        --------
+
+        Create a polynomial object for ``1 + 7*x + 4*x**2``:
+
+        >>> poly = np.polynomial.Polynomial([1, 7, 4])
+        >>> print(poly)
+        1.0 + 7.0·x + 4.0·x²
+        >>> poly.degree()
+        2
+
+        Note that this method does not check for non-zero coefficients.
+        You must trim the polynomial to remove any trailing zeroes:
+
+        >>> poly = np.polynomial.Polynomial([1, 7, 0])
+        >>> print(poly)
+        1.0 + 7.0·x + 0.0·x²
+        >>> poly.degree()
+        2
+        >>> poly.trim().degree()
+        1
+
         """
         return len(self) - 1
 
@@ -887,7 +909,7 @@ class ABCPolyBase(abc.ABC):
         """Return the roots of the series polynomial.
 
         Compute the roots for the series. Note that the accuracy of the
-        roots decrease the further outside the domain they lie.
+        roots decreases the further outside the `domain` they lie.
 
         Returns
         -------
diff --git a/numpy/polynomial/_polybase.pyi b/numpy/polynomial/_polybase.pyi
index 537221c45..25c740dbe 100644
--- a/numpy/polynomial/_polybase.pyi
+++ b/numpy/polynomial/_polybase.pyi
@@ -9,6 +9,8 @@ class ABCPolyBase(abc.ABC):
     maxpower: ClassVar[int]
     coef: Any
     @property
+    def symbol(self) -> str: ...
+    @property
     @abc.abstractmethod
     def domain(self): ...
     @property
@@ -21,7 +23,7 @@ class ABCPolyBase(abc.ABC):
     def has_samedomain(self, other): ...
     def has_samewindow(self, other): ...
     def has_sametype(self, other): ...
-    def __init__(self, coef, domain=..., window=...): ...
+    def __init__(self, coef, domain=..., window=..., symbol: str = ...) -> None: ...
     def __format__(self, fmt_str): ...
     def __call__(self, arg): ...
     def __iter__(self): ...
diff --git a/numpy/polynomial/chebyshev.py b/numpy/polynomial/chebyshev.py
index c663ffab0..efbe13e0c 100644
--- a/numpy/polynomial/chebyshev.py
+++ b/numpy/polynomial/chebyshev.py
@@ -2012,6 +2012,12 @@ class Chebyshev(ABCPolyBase):
         Window, see `domain` for its use. The default value is [-1, 1].
 
         .. versionadded:: 1.6.0
+    symbol : str, optional
+        Symbol used to represent the independent variable in string
+        representations of the polynomial expression, e.g. for printing.
+        The symbol must be a valid Python identifier. Default value is 'x'.
+
+        .. versionadded:: 1.24
 
     """
     # Virtual Functions
diff --git a/numpy/polynomial/hermite.py b/numpy/polynomial/hermite.py
index e20339121..210df25f5 100644
--- a/numpy/polynomial/hermite.py
+++ b/numpy/polynomial/hermite.py
@@ -1675,6 +1675,12 @@ class Hermite(ABCPolyBase):
         Window, see `domain` for its use. The default value is [-1, 1].
 
         .. versionadded:: 1.6.0
+    symbol : str, optional
+        Symbol used to represent the independent variable in string
+        representations of the polynomial expression, e.g. for printing.
+        The symbol must be a valid Python identifier. Default value is 'x'.
+
+        .. versionadded:: 1.24
 
     """
     # Virtual Functions
diff --git a/numpy/polynomial/hermite_e.py b/numpy/polynomial/hermite_e.py
index 182c562c2..bdf29405b 100644
--- a/numpy/polynomial/hermite_e.py
+++ b/numpy/polynomial/hermite_e.py
@@ -1667,6 +1667,12 @@ class HermiteE(ABCPolyBase):
         Window, see `domain` for its use. The default value is [-1, 1].
 
         .. versionadded:: 1.6.0
+    symbol : str, optional
+        Symbol used to represent the independent variable in string
+        representations of the polynomial expression, e.g. for printing.
+        The symbol must be a valid Python identifier. Default value is 'x'.
+
+        .. versionadded:: 1.24
 
     """
     # Virtual Functions
diff --git a/numpy/polynomial/laguerre.py b/numpy/polynomial/laguerre.py
index 2eacceced..925d4898e 100644
--- a/numpy/polynomial/laguerre.py
+++ b/numpy/polynomial/laguerre.py
@@ -1623,6 +1623,12 @@ class Laguerre(ABCPolyBase):
         Window, see `domain` for its use. The default value is [0, 1].
 
         .. versionadded:: 1.6.0
+    symbol : str, optional
+        Symbol used to represent the independent variable in string
+        representations of the polynomial expression, e.g. for printing.
+        The symbol must be a valid Python identifier. Default value is 'x'.
+
+        .. versionadded:: 1.24
 
     """
     # Virtual Functions
diff --git a/numpy/polynomial/legendre.py b/numpy/polynomial/legendre.py
index 028e2fe7b..8e9c19d94 100644
--- a/numpy/polynomial/legendre.py
+++ b/numpy/polynomial/legendre.py
@@ -1636,6 +1636,12 @@ class Legendre(ABCPolyBase):
         Window, see `domain` for its use. The default value is [-1, 1].
 
         .. versionadded:: 1.6.0
+    symbol : str, optional
+        Symbol used to represent the independent variable in string
+        representations of the polynomial expression, e.g. for printing.
+        The symbol must be a valid Python identifier. Default value is 'x'.
+
+        .. versionadded:: 1.24
 
     """
     # Virtual Functions
diff --git a/numpy/polynomial/polynomial.py b/numpy/polynomial/polynomial.py
index d102f5a30..ceadff0bf 100644
--- a/numpy/polynomial/polynomial.py
+++ b/numpy/polynomial/polynomial.py
@@ -1489,6 +1489,12 @@ class Polynomial(ABCPolyBase):
         Window, see `domain` for its use. The default value is [-1, 1].
 
         .. versionadded:: 1.6.0
+    symbol : str, optional
+        Symbol used to represent the independent variable in string
+        representations of the polynomial expression, e.g. for printing.
+        The symbol must be a valid Python identifier. Default value is 'x'.
+
+        .. versionadded:: 1.24
 
     """
     # Virtual Functions
diff --git a/numpy/polynomial/tests/test_polynomial.py b/numpy/polynomial/tests/test_polynomial.py
index a0a09fcf4..6b3ef2388 100644
--- a/numpy/polynomial/tests/test_polynomial.py
+++ b/numpy/polynomial/tests/test_polynomial.py
@@ -5,6 +5,8 @@ from functools import reduce
 
 import numpy as np
 import numpy.polynomial.polynomial as poly
+import pickle
+from copy import deepcopy
 from numpy.testing import (
     assert_almost_equal, assert_raises, assert_equal, assert_,
     assert_warns, assert_array_equal, assert_raises_regex)
@@ -41,6 +43,15 @@ class TestConstants:
     def test_polyx(self):
         assert_equal(poly.polyx, [0, 1])
 
+    def test_copy(self):
+        x = poly.Polynomial([1, 2, 3])
+        y = deepcopy(x)
+        assert_equal(x, y)
+
+    def test_pickle(self):
+        x = poly.Polynomial([1, 2, 3])
+        y = pickle.loads(pickle.dumps(x))
+        assert_equal(x, y)
 
 class TestArithmetic:
 
diff --git a/numpy/polynomial/tests/test_printing.py b/numpy/polynomial/tests/test_printing.py
index 990a0d179..6f2a5092d 100644
--- a/numpy/polynomial/tests/test_printing.py
+++ b/numpy/polynomial/tests/test_printing.py
@@ -478,6 +478,10 @@ class TestPrintOptions:
     are too small or too large.
     """
 
+    @pytest.fixture(scope='class', autouse=True)
+    def use_ascii(self):
+        poly.set_default_printstyle('ascii')
+
     def test_str(self):
         p = poly.Polynomial([1/2, 1/7, 1/7*10**8, 1/7*10**9])
         assert_equal(str(p), '0.5 + 0.14285714 x + 14285714.28571429 x**2 '
diff --git a/numpy/random/_bounded_integers.pyx.in b/numpy/random/_bounded_integers.pyx.in
index 7eb6aff1e..6743001d6 100644
--- a/numpy/random/_bounded_integers.pyx.in
+++ b/numpy/random/_bounded_integers.pyx.in
@@ -99,8 +99,12 @@ cdef object _rand_{{nptype}}_broadcast(np.ndarray low, np.ndarray high, object s
     is_open = not closed
     low_arr = <np.ndarray>low
     high_arr = <np.ndarray>high
-    if np.any(np.less(low_arr, {{lb}})):
+
+    if np.can_cast(low_arr, np.{{otype}}):
+        pass  # cannot be out-of-bounds
+    elif np.any(np.less(low_arr, np.{{otype}}({{lb}}))):
         raise ValueError('low is out of bounds for {{nptype}}')
+
     if closed:
         high_comp = np.greater_equal
         low_high_comp = np.greater
@@ -108,8 +112,11 @@ cdef object _rand_{{nptype}}_broadcast(np.ndarray low, np.ndarray high, object s
         high_comp = np.greater
         low_high_comp = np.greater_equal
 
-    if np.any(high_comp(high_arr, {{ub}})):
+    if np.can_cast(high_arr, np.{{otype}}):
+        pass  # cannot be out-of-bounds
+    elif np.any(high_comp(high_arr, np.{{nptype_up}}({{ub}}))):
         raise ValueError('high is out of bounds for {{nptype}}')
+
     if np.any(low_high_comp(low_arr, high_arr)):
         raise ValueError(format_bounds_error(closed, low_arr))
 
@@ -165,50 +172,69 @@ cdef object _rand_{{nptype}}_broadcast(object low, object high, object size,
     64 bit integer type.
     """
 
-    cdef np.ndarray low_arr, high_arr, out_arr, highm1_arr
+    cdef np.ndarray low_arr, low_arr_orig, high_arr, high_arr_orig, out_arr
     cdef np.npy_intp i, cnt, n
     cdef np.broadcast it
     cdef object closed_upper
     cdef uint64_t *out_data
     cdef {{nptype}}_t *highm1_data
     cdef {{nptype}}_t low_v, high_v
-    cdef uint64_t rng, last_rng, val, mask, off, out_val
+    cdef uint64_t rng, last_rng, val, mask, off, out_val, is_open
 
-    low_arr = <np.ndarray>low
-    high_arr = <np.ndarray>high
+    low_arr_orig = <np.ndarray>low
+    high_arr_orig = <np.ndarray>high
 
-    if np.any(np.less(low_arr, {{lb}})):
-        raise ValueError('low is out of bounds for {{nptype}}')
-    dt = high_arr.dtype
-    if closed or np.issubdtype(dt, np.integer):
-        # Avoid object dtype path if already an integer
-        high_lower_comp = np.less if closed else np.less_equal
-        if np.any(high_lower_comp(high_arr, {{lb}})):
-            raise ValueError(format_bounds_error(closed, low_arr))
-        high_m1 = high_arr if closed else high_arr - dt.type(1)
-        if np.any(np.greater(high_m1, {{ub}})):
-            raise ValueError('high is out of bounds for {{nptype}}')
-        highm1_arr = <np.ndarray>np.PyArray_FROM_OTF(high_m1, np.{{npctype}}, np.NPY_ALIGNED | np.NPY_FORCECAST)
+    is_open = not closed
+
+    # The following code tries to cast safely, but failing that goes via
+    # Python `int()` because it is very difficult to cast integers in a
+    # truly safe way (i.e. so it raises on out-of-bound).
+    # We correct if the interval is not closed in this step if we go the long
+    # route.  (Not otherwise, since the -1 could overflow in theory.)
+    if np.can_cast(low_arr_orig, np.{{otype}}):
+        low_arr = <np.ndarray>np.PyArray_FROM_OTF(low_arr_orig, np.{{npctype}}, np.NPY_ALIGNED)
+    else:
+        low_arr = <np.ndarray>np.empty_like(low_arr_orig, dtype=np.{{otype}})
+        flat = low_arr_orig.flat
+        low_data = <{{nptype}}_t *>np.PyArray_DATA(low_arr)
+        cnt = np.PyArray_SIZE(low_arr)
+        for i in range(cnt):
+            lower = int(flat[i])
+            if lower < {{lb}} or lower > {{ub}}:
+                raise ValueError('low is out of bounds for {{nptype}}')
+            low_data[i] = lower
+
+    del low_arr_orig
+
+    if np.can_cast(high_arr_orig, np.{{otype}}):
+        high_arr = <np.ndarray>np.PyArray_FROM_OTF(high_arr_orig, np.{{npctype}}, np.NPY_ALIGNED)
     else:
-        # If input is object or a floating type
-        highm1_arr = <np.ndarray>np.empty_like(high_arr, dtype=np.{{otype}})
-        highm1_data = <{{nptype}}_t *>np.PyArray_DATA(highm1_arr)
+        high_arr = np.empty_like(high_arr_orig, dtype=np.{{otype}})
+        flat = high_arr_orig.flat
+        high_data = <{{nptype}}_t *>np.PyArray_DATA(high_arr)
         cnt = np.PyArray_SIZE(high_arr)
-        flat = high_arr.flat
         for i in range(cnt):
-            # Subtract 1 since generator produces values on the closed int [off, off+rng]
-            closed_upper = int(flat[i]) - 1
+            closed_upper = int(flat[i]) - is_open
             if closed_upper > {{ub}}:
                 raise ValueError('high is out of bounds for {{nptype}}')
             if closed_upper < {{lb}}:
                 raise ValueError(format_bounds_error(closed, low_arr))
-            highm1_data[i] = <{{nptype}}_t>closed_upper
+            high_data[i] = closed_upper
 
-    if np.any(np.greater(low_arr, highm1_arr)):
-        raise ValueError(format_bounds_error(closed, low_arr))
+        is_open = 0  # we applied is_open in this path already
 
-    high_arr = highm1_arr
-    low_arr = <np.ndarray>np.PyArray_FROM_OTF(low, np.{{npctype}}, np.NPY_ALIGNED | np.NPY_FORCECAST)
+    del high_arr_orig
+
+    # Since we have the largest supported integer dtypes, they must be within
+    # range at this point; otherwise conversion would have failed.  Check that
+    # it is never true that `high <= low`` if closed and `high < low` if not
+    if not is_open:
+        low_high_comp = np.greater
+    else:
+        low_high_comp = np.greater_equal
+
+    if np.any(low_high_comp(low_arr, high_arr)):
+        raise ValueError(format_bounds_error(closed, low_arr))
 
     if size is not None:
         out_arr = <np.ndarray>np.empty(size, np.{{otype}})
@@ -224,8 +250,8 @@ cdef object _rand_{{nptype}}_broadcast(object low, object high, object size,
         for i in range(n):
             low_v = (<{{nptype}}_t*>np.PyArray_MultiIter_DATA(it, 0))[0]
             high_v = (<{{nptype}}_t*>np.PyArray_MultiIter_DATA(it, 1))[0]
-            # Generator produces values on the closed int [off, off+rng], -1 subtracted above
-            rng = <{{utype}}_t>(high_v - low_v)
+            # Generator produces values on the closed int [off, off+rng]
+            rng = <{{utype}}_t>((high_v - is_open) - low_v)
             off = <{{utype}}_t>(<{{nptype}}_t>low_v)
 
             if rng != last_rng:
diff --git a/numpy/random/_common.pxd b/numpy/random/_common.pxd
index 3eaf39ddf..659da0d2d 100644
--- a/numpy/random/_common.pxd
+++ b/numpy/random/_common.pxd
@@ -39,32 +39,32 @@ cdef extern from "include/aligned_malloc.h":
     cdef void *PyArray_calloc_aligned(size_t n, size_t s)
     cdef void PyArray_free_aligned(void *p)
 
-ctypedef void (*random_double_fill)(bitgen_t *state, np.npy_intp count, double* out) nogil
-ctypedef double (*random_double_0)(void *state) nogil
-ctypedef double (*random_double_1)(void *state, double a) nogil
-ctypedef double (*random_double_2)(void *state, double a, double b) nogil
-ctypedef double (*random_double_3)(void *state, double a, double b, double c) nogil
+ctypedef void (*random_double_fill)(bitgen_t *state, np.npy_intp count, double* out)  noexcept nogil
+ctypedef double (*random_double_0)(void *state)  noexcept nogil
+ctypedef double (*random_double_1)(void *state, double a)  noexcept nogil
+ctypedef double (*random_double_2)(void *state, double a, double b)  noexcept nogil
+ctypedef double (*random_double_3)(void *state, double a, double b, double c)  noexcept nogil
 
-ctypedef void (*random_float_fill)(bitgen_t *state, np.npy_intp count, float* out) nogil
-ctypedef float (*random_float_0)(bitgen_t *state) nogil
-ctypedef float (*random_float_1)(bitgen_t *state, float a) nogil
+ctypedef void (*random_float_fill)(bitgen_t *state, np.npy_intp count, float* out)  noexcept nogil
+ctypedef float (*random_float_0)(bitgen_t *state)  noexcept nogil
+ctypedef float (*random_float_1)(bitgen_t *state, float a)  noexcept nogil
 
-ctypedef int64_t (*random_uint_0)(void *state) nogil
-ctypedef int64_t (*random_uint_d)(void *state, double a) nogil
-ctypedef int64_t (*random_uint_dd)(void *state, double a, double b) nogil
-ctypedef int64_t (*random_uint_di)(void *state, double a, uint64_t b) nogil
-ctypedef int64_t (*random_uint_i)(void *state, int64_t a) nogil
-ctypedef int64_t (*random_uint_iii)(void *state, int64_t a, int64_t b, int64_t c) nogil
+ctypedef int64_t (*random_uint_0)(void *state)  noexcept nogil
+ctypedef int64_t (*random_uint_d)(void *state, double a)  noexcept nogil
+ctypedef int64_t (*random_uint_dd)(void *state, double a, double b)  noexcept nogil
+ctypedef int64_t (*random_uint_di)(void *state, double a, uint64_t b)  noexcept nogil
+ctypedef int64_t (*random_uint_i)(void *state, int64_t a)  noexcept nogil
+ctypedef int64_t (*random_uint_iii)(void *state, int64_t a, int64_t b, int64_t c)  noexcept nogil
 
-ctypedef uint32_t (*random_uint_0_32)(bitgen_t *state) nogil
-ctypedef uint32_t (*random_uint_1_i_32)(bitgen_t *state, uint32_t a) nogil
+ctypedef uint32_t (*random_uint_0_32)(bitgen_t *state)  noexcept nogil
+ctypedef uint32_t (*random_uint_1_i_32)(bitgen_t *state, uint32_t a)  noexcept nogil
 
-ctypedef int32_t (*random_int_2_i_32)(bitgen_t *state, int32_t a, int32_t b) nogil
-ctypedef int64_t (*random_int_2_i)(bitgen_t *state, int64_t a, int64_t b) nogil
+ctypedef int32_t (*random_int_2_i_32)(bitgen_t *state, int32_t a, int32_t b)  noexcept nogil
+ctypedef int64_t (*random_int_2_i)(bitgen_t *state, int64_t a, int64_t b)  noexcept nogil
 
-cdef double kahan_sum(double *darr, np.npy_intp n)
+cdef double kahan_sum(double *darr, np.npy_intp n) noexcept
 
-cdef inline double uint64_to_double(uint64_t rnd) nogil:
+cdef inline double uint64_to_double(uint64_t rnd) noexcept nogil:
     return (rnd >> 11) * (1.0 / 9007199254740992.0)
 
 cdef object double_fill(void *func, bitgen_t *state, object size, object lock, object out)
diff --git a/numpy/random/_common.pyx b/numpy/random/_common.pyx
index 7b6f69303..c5e4e3297 100644
--- a/numpy/random/_common.pyx
+++ b/numpy/random/_common.pyx
@@ -171,7 +171,7 @@ cdef object prepare_ctypes(bitgen_t *bitgen):
                         ctypes.c_void_p(<uintptr_t>bitgen))
     return _ctypes
 
-cdef double kahan_sum(double *darr, np.npy_intp n):
+cdef double kahan_sum(double *darr, np.npy_intp n) noexcept:
     """
     Parameters
     ----------
diff --git a/numpy/random/_examples/cffi/parse.py b/numpy/random/_examples/cffi/parse.py
index daff6bdec..d41c4c2db 100644
--- a/numpy/random/_examples/cffi/parse.py
+++ b/numpy/random/_examples/cffi/parse.py
@@ -36,9 +36,9 @@ def parse_distributions_h(ffi, inc_dir):
                 continue
     
             # skip any inlined function definition
-            # which starts with 'static NPY_INLINE xxx(...) {'
+            # which starts with 'static inline xxx(...) {'
             # and ends with a closing '}'
-            if line.strip().startswith('static NPY_INLINE'):
+            if line.strip().startswith('static inline'):
                 in_skip += line.count('{')
                 continue
             elif in_skip > 0:
@@ -48,7 +48,6 @@ def parse_distributions_h(ffi, inc_dir):
     
             # replace defines with their value or remove them
             line = line.replace('DECLDIR', '')
-            line = line.replace('NPY_INLINE', '')
             line = line.replace('RAND_INT_TYPE', 'int64_t')
             s.append(line)
         ffi.cdef('\n'.join(s))
diff --git a/numpy/random/_generator.pyi b/numpy/random/_generator.pyi
index f0d814fef..e1cdefb15 100644
--- a/numpy/random/_generator.pyi
+++ b/numpy/random/_generator.pyi
@@ -29,6 +29,7 @@ from numpy._typing import (
     _DTypeLikeUInt,
     _Float32Codes,
     _Float64Codes,
+    _FloatLike_co,
     _Int8Codes,
     _Int16Codes,
     _Int32Codes,
@@ -72,6 +73,7 @@ class Generator:
     def __reduce__(self) -> tuple[Callable[[str], Generator], tuple[str], dict[str, Any]]: ...
     @property
     def bit_generator(self) -> BitGenerator: ...
+    def spawn(self, n_children: int) -> list[Generator]: ...
     def bytes(self, length: int) -> bytes: ...
     @overload
     def standard_normal(  # type: ignore[misc]
@@ -187,13 +189,18 @@ class Generator:
         out: None | ndarray[Any, dtype[float64]] = ...,
     ) -> ndarray[Any, dtype[float64]]: ...
     @overload
-    def beta(self, a: float, b: float, size: None = ...) -> float: ...  # type: ignore[misc]
+    def beta(
+        self,
+        a: _FloatLike_co,
+        b: _FloatLike_co,
+        size: None = ...,
+    ) -> float: ...  # type: ignore[misc]
     @overload
     def beta(
         self, a: _ArrayLikeFloat_co, b: _ArrayLikeFloat_co, size: None | _ShapeLike = ...
     ) -> ndarray[Any, dtype[float64]]: ...
     @overload
-    def exponential(self, scale: float = ..., size: None = ...) -> float: ...  # type: ignore[misc]
+    def exponential(self, scale: _FloatLike_co = ..., size: None = ...) -> float: ...  # type: ignore[misc]
     @overload
     def exponential(
         self, scale: _ArrayLikeFloat_co = ..., size: None | _ShapeLike = ...
@@ -370,7 +377,12 @@ class Generator:
         shuffle: bool = ...,
     ) -> ndarray[Any, Any]: ...
     @overload
-    def uniform(self, low: float = ..., high: float = ..., size: None = ...) -> float: ...  # type: ignore[misc]
+    def uniform(
+        self,
+        low: _FloatLike_co = ...,
+        high: _FloatLike_co = ...,
+        size: None = ...,
+    ) -> float: ...  # type: ignore[misc]
     @overload
     def uniform(
         self,
@@ -379,7 +391,12 @@ class Generator:
         size: None | _ShapeLike = ...,
     ) -> ndarray[Any, dtype[float64]]: ...
     @overload
-    def normal(self, loc: float = ..., scale: float = ..., size: None = ...) -> float: ...  # type: ignore[misc]
+    def normal(
+        self,
+        loc: _FloatLike_co = ...,
+        scale: _FloatLike_co = ...,
+        size: None = ...,
+    ) -> float: ...  # type: ignore[misc]
     @overload
     def normal(
         self,
@@ -390,7 +407,7 @@ class Generator:
     @overload
     def standard_gamma(  # type: ignore[misc]
         self,
-        shape: float,
+        shape: _FloatLike_co,
         size: None = ...,
         dtype: _DTypeLikeFloat32 | _DTypeLikeFloat64 = ...,
         out: None = ...,
@@ -425,7 +442,7 @@ class Generator:
         out: None | ndarray[Any, dtype[float64]] = ...,
     ) -> ndarray[Any, dtype[float64]]: ...
     @overload
-    def gamma(self, shape: float, scale: float = ..., size: None = ...) -> float: ...  # type: ignore[misc]
+    def gamma(self, shape: _FloatLike_co, scale: _FloatLike_co = ..., size: None = ...) -> float: ...  # type: ignore[misc]
     @overload
     def gamma(
         self,
@@ -434,13 +451,13 @@ class Generator:
         size: None | _ShapeLike = ...,
     ) -> ndarray[Any, dtype[float64]]: ...
     @overload
-    def f(self, dfnum: float, dfden: float, size: None = ...) -> float: ...  # type: ignore[misc]
+    def f(self, dfnum: _FloatLike_co, dfden: _FloatLike_co, size: None = ...) -> float: ...  # type: ignore[misc]
     @overload
     def f(
         self, dfnum: _ArrayLikeFloat_co, dfden: _ArrayLikeFloat_co, size: None | _ShapeLike = ...
     ) -> ndarray[Any, dtype[float64]]: ...
     @overload
-    def noncentral_f(self, dfnum: float, dfden: float, nonc: float, size: None = ...) -> float: ...  # type: ignore[misc]
+    def noncentral_f(self, dfnum: _FloatLike_co, dfden: _FloatLike_co, nonc: _FloatLike_co, size: None = ...) -> float: ...  # type: ignore[misc]
     @overload
     def noncentral_f(
         self,
@@ -450,19 +467,19 @@ class Generator:
         size: None | _ShapeLike = ...,
     ) -> ndarray[Any, dtype[float64]]: ...
     @overload
-    def chisquare(self, df: float, size: None = ...) -> float: ...  # type: ignore[misc]
+    def chisquare(self, df: _FloatLike_co, size: None = ...) -> float: ...  # type: ignore[misc]
     @overload
     def chisquare(
         self, df: _ArrayLikeFloat_co, size: None | _ShapeLike = ...
     ) -> ndarray[Any, dtype[float64]]: ...
     @overload
-    def noncentral_chisquare(self, df: float, nonc: float, size: None = ...) -> float: ...  # type: ignore[misc]
+    def noncentral_chisquare(self, df: _FloatLike_co, nonc: _FloatLike_co, size: None = ...) -> float: ...  # type: ignore[misc]
     @overload
     def noncentral_chisquare(
         self, df: _ArrayLikeFloat_co, nonc: _ArrayLikeFloat_co, size: None | _ShapeLike = ...
     ) -> ndarray[Any, dtype[float64]]: ...
     @overload
-    def standard_t(self, df: float, size: None = ...) -> float: ...  # type: ignore[misc]
+    def standard_t(self, df: _FloatLike_co, size: None = ...) -> float: ...  # type: ignore[misc]
     @overload
     def standard_t(
         self, df: _ArrayLikeFloat_co, size: None = ...
@@ -472,25 +489,25 @@ class Generator:
         self, df: _ArrayLikeFloat_co, size: _ShapeLike = ...
     ) -> ndarray[Any, dtype[float64]]: ...
     @overload
-    def vonmises(self, mu: float, kappa: float, size: None = ...) -> float: ...  # type: ignore[misc]
+    def vonmises(self, mu: _FloatLike_co, kappa: _FloatLike_co, size: None = ...) -> float: ...  # type: ignore[misc]
     @overload
     def vonmises(
         self, mu: _ArrayLikeFloat_co, kappa: _ArrayLikeFloat_co, size: None | _ShapeLike = ...
     ) -> ndarray[Any, dtype[float64]]: ...
     @overload
-    def pareto(self, a: float, size: None = ...) -> float: ...  # type: ignore[misc]
+    def pareto(self, a: _FloatLike_co, size: None = ...) -> float: ...  # type: ignore[misc]
     @overload
     def pareto(
         self, a: _ArrayLikeFloat_co, size: None | _ShapeLike = ...
     ) -> ndarray[Any, dtype[float64]]: ...
     @overload
-    def weibull(self, a: float, size: None = ...) -> float: ...  # type: ignore[misc]
+    def weibull(self, a: _FloatLike_co, size: None = ...) -> float: ...  # type: ignore[misc]
     @overload
     def weibull(
         self, a: _ArrayLikeFloat_co, size: None | _ShapeLike = ...
     ) -> ndarray[Any, dtype[float64]]: ...
     @overload
-    def power(self, a: float, size: None = ...) -> float: ...  # type: ignore[misc]
+    def power(self, a: _FloatLike_co, size: None = ...) -> float: ...  # type: ignore[misc]
     @overload
     def power(
         self, a: _ArrayLikeFloat_co, size: None | _ShapeLike = ...
@@ -500,7 +517,12 @@ class Generator:
     @overload
     def standard_cauchy(self, size: _ShapeLike = ...) -> ndarray[Any, dtype[float64]]: ...
     @overload
-    def laplace(self, loc: float = ..., scale: float = ..., size: None = ...) -> float: ...  # type: ignore[misc]
+    def laplace(
+        self,
+        loc: _FloatLike_co = ...,
+        scale: _FloatLike_co = ...,
+        size: None = ...,
+    ) -> float: ...  # type: ignore[misc]
     @overload
     def laplace(
         self,
@@ -509,7 +531,12 @@ class Generator:
         size: None | _ShapeLike = ...,
     ) -> ndarray[Any, dtype[float64]]: ...
     @overload
-    def gumbel(self, loc: float = ..., scale: float = ..., size: None = ...) -> float: ...  # type: ignore[misc]
+    def gumbel(
+        self,
+        loc: _FloatLike_co = ...,
+        scale: _FloatLike_co = ...,
+        size: None = ...,
+    ) -> float: ...  # type: ignore[misc]
     @overload
     def gumbel(
         self,
@@ -518,7 +545,12 @@ class Generator:
         size: None | _ShapeLike = ...,
     ) -> ndarray[Any, dtype[float64]]: ...
     @overload
-    def logistic(self, loc: float = ..., scale: float = ..., size: None = ...) -> float: ...  # type: ignore[misc]
+    def logistic(
+        self,
+        loc: _FloatLike_co = ...,
+        scale: _FloatLike_co = ...,
+        size: None = ...,
+    ) -> float: ...  # type: ignore[misc]
     @overload
     def logistic(
         self,
@@ -527,7 +559,12 @@ class Generator:
         size: None | _ShapeLike = ...,
     ) -> ndarray[Any, dtype[float64]]: ...
     @overload
-    def lognormal(self, mean: float = ..., sigma: float = ..., size: None = ...) -> float: ...  # type: ignore[misc]
+    def lognormal(
+        self,
+        mean: _FloatLike_co = ...,
+        sigma: _FloatLike_co = ...,
+        size: None = ...,
+    ) -> float: ...  # type: ignore[misc]
     @overload
     def lognormal(
         self,
@@ -536,19 +573,25 @@ class Generator:
         size: None | _ShapeLike = ...,
     ) -> ndarray[Any, dtype[float64]]: ...
     @overload
-    def rayleigh(self, scale: float = ..., size: None = ...) -> float: ...  # type: ignore[misc]
+    def rayleigh(self, scale: _FloatLike_co = ..., size: None = ...) -> float: ...  # type: ignore[misc]
     @overload
     def rayleigh(
         self, scale: _ArrayLikeFloat_co = ..., size: None | _ShapeLike = ...
     ) -> ndarray[Any, dtype[float64]]: ...
     @overload
-    def wald(self, mean: float, scale: float, size: None = ...) -> float: ...  # type: ignore[misc]
+    def wald(self, mean: _FloatLike_co, scale: _FloatLike_co, size: None = ...) -> float: ...  # type: ignore[misc]
     @overload
     def wald(
         self, mean: _ArrayLikeFloat_co, scale: _ArrayLikeFloat_co, size: None | _ShapeLike = ...
     ) -> ndarray[Any, dtype[float64]]: ...
     @overload
-    def triangular(self, left: float, mode: float, right: float, size: None = ...) -> float: ...  # type: ignore[misc]
+    def triangular(
+        self,
+        left: _FloatLike_co,
+        mode: _FloatLike_co,
+        right: _FloatLike_co,
+        size: None = ...,
+    ) -> float: ...  # type: ignore[misc]
     @overload
     def triangular(
         self,
@@ -558,31 +601,31 @@ class Generator:
         size: None | _ShapeLike = ...,
     ) -> ndarray[Any, dtype[float64]]: ...
     @overload
-    def binomial(self, n: int, p: float, size: None = ...) -> int: ...  # type: ignore[misc]
+    def binomial(self, n: int, p: _FloatLike_co, size: None = ...) -> int: ...  # type: ignore[misc]
     @overload
     def binomial(
         self, n: _ArrayLikeInt_co, p: _ArrayLikeFloat_co, size: None | _ShapeLike = ...
     ) -> ndarray[Any, dtype[int64]]: ...
     @overload
-    def negative_binomial(self, n: float, p: float, size: None = ...) -> int: ...  # type: ignore[misc]
+    def negative_binomial(self, n: _FloatLike_co, p: _FloatLike_co, size: None = ...) -> int: ...  # type: ignore[misc]
     @overload
     def negative_binomial(
         self, n: _ArrayLikeFloat_co, p: _ArrayLikeFloat_co, size: None | _ShapeLike = ...
     ) -> ndarray[Any, dtype[int64]]: ...
     @overload
-    def poisson(self, lam: float = ..., size: None = ...) -> int: ...  # type: ignore[misc]
+    def poisson(self, lam: _FloatLike_co = ..., size: None = ...) -> int: ...  # type: ignore[misc]
     @overload
     def poisson(
         self, lam: _ArrayLikeFloat_co = ..., size: None | _ShapeLike = ...
     ) -> ndarray[Any, dtype[int64]]: ...
     @overload
-    def zipf(self, a: float, size: None = ...) -> int: ...  # type: ignore[misc]
+    def zipf(self, a: _FloatLike_co, size: None = ...) -> int: ...  # type: ignore[misc]
     @overload
     def zipf(
         self, a: _ArrayLikeFloat_co, size: None | _ShapeLike = ...
     ) -> ndarray[Any, dtype[int64]]: ...
     @overload
-    def geometric(self, p: float, size: None = ...) -> int: ...  # type: ignore[misc]
+    def geometric(self, p: _FloatLike_co, size: None = ...) -> int: ...  # type: ignore[misc]
     @overload
     def geometric(
         self, p: _ArrayLikeFloat_co, size: None | _ShapeLike = ...
@@ -598,7 +641,7 @@ class Generator:
         size: None | _ShapeLike = ...,
     ) -> ndarray[Any, dtype[int64]]: ...
     @overload
-    def logseries(self, p: float, size: None = ...) -> int: ...  # type: ignore[misc]
+    def logseries(self, p: _FloatLike_co, size: None = ...) -> int: ...  # type: ignore[misc]
     @overload
     def logseries(
         self, p: _ArrayLikeFloat_co, size: None | _ShapeLike = ...
diff --git a/numpy/random/_generator.pyx b/numpy/random/_generator.pyx
index 266a05387..a30d116c2 100644
--- a/numpy/random/_generator.pyx
+++ b/numpy/random/_generator.pyx
@@ -238,6 +238,64 @@ cdef class Generator:
         """
         return self._bit_generator
 
+    def spawn(self, int n_children):
+        """
+        spawn(n_children)
+
+        Create new independent child generators.
+
+        See :ref:`seedsequence-spawn` for additional notes on spawning
+        children.
+
+        .. versionadded:: 1.25.0
+
+        Parameters
+        ----------
+        n_children : int
+
+        Returns
+        -------
+        child_generators : list of Generators
+
+        Raises
+        ------
+        TypeError
+            When the underlying SeedSequence does not implement spawning.
+
+        See Also
+        --------
+        random.BitGenerator.spawn, random.SeedSequence.spawn :
+            Equivalent method on the bit generator and seed sequence.
+        bit_generator :
+            The bit generator instance used by the generator.
+
+        Examples
+        --------
+        Starting from a seeded default generator:
+
+        >>> # High quality entropy created with: f"0x{secrets.randbits(128):x}"
+        >>> entropy = 0x3034c61a9ae04ff8cb62ab8ec2c4b501
+        >>> rng = np.random.default_rng(entropy)
+
+        Create two new generators for example for parallel executation:
+
+        >>> child_rng1, child_rng2 = rng.spawn(2)
+
+        Drawn numbers from each are independent but derived from the initial
+        seeding entropy:
+
+        >>> rng.uniform(), child_rng1.uniform(), child_rng2.uniform()
+        (0.19029263503854454, 0.9475673279178444, 0.4702687338396767)
+
+        It is safe to spawn additional children from the original ``rng`` or
+        the children:
+
+        >>> more_child_rngs = rng.spawn(20)
+        >>> nested_spawn = child_rng1.spawn(20)
+
+        """
+        return [type(self)(g) for g in self._bit_generator.spawn(n_children)]
+
     def random(self, size=None, dtype=np.float64, out=None):
         """
         random(size=None, dtype=np.float64, out=None)
@@ -380,6 +438,22 @@ cdef class Generator:
         out : ndarray or scalar
             Drawn samples from the parameterized exponential distribution.
 
+        Examples
+        --------
+        A real world example: Assume a company has 10000 customer support 
+        agents and the average time between customer calls is 4 minutes.
+
+        >>> n = 10000
+        >>> time_between_calls = np.random.default_rng().exponential(scale=4, size=n)
+
+        What is the probability that a customer will call in the next 
+        4 to 5 minutes? 
+        
+        >>> x = ((time_between_calls < 5).sum())/n 
+        >>> y = ((time_between_calls < 4).sum())/n
+        >>> x-y
+        0.08 # may vary
+
         References
         ----------
         .. [1] Peyton Z. Peebles Jr., "Probability, Random Variables and
@@ -2560,7 +2634,7 @@ cdef class Generator:
         >>> b = []
         >>> for i in range(1000):
         ...    a = 10. + rng.standard_normal(100)
-        ...    b.append(np.product(a))
+        ...    b.append(np.prod(a))
 
         >>> b = np.array(b) / np.min(b) # scale values to be positive
         >>> count, bins, ignored = plt.hist(b, 100, density=True, align='mid')
@@ -3533,8 +3607,8 @@ cdef class Generator:
         generalization of the one-dimensional normal distribution to higher
         dimensions.  Such a distribution is specified by its mean and
         covariance matrix.  These parameters are analogous to the mean
-        (average or "center") and variance (standard deviation, or "width,"
-        squared) of the one-dimensional normal distribution.
+        (average or "center") and variance (the squared standard deviation,
+        or "width") of the one-dimensional normal distribution.
 
         Parameters
         ----------
@@ -3611,6 +3685,12 @@ cdef class Generator:
         nonnegative-definite). Otherwise, the behavior of this method is
         undefined and backwards compatibility is not guaranteed.
 
+        This function internally uses linear algebra routines, and thus results
+        may not be identical (even up to precision) across architectures, OSes,
+        or even builds. For example, this is likely if ``cov`` has multiple equal
+        singular values and ``method`` is ``'svd'`` (default). In this case,
+        ``method='cholesky'`` may be more robust.
+
         References
         ----------
         .. [1] Papoulis, A., "Probability, Random Variables, and Stochastic
@@ -4247,7 +4327,7 @@ cdef class Generator:
         Raises
         ------
         ValueError
-            If any value in ``alpha`` is less than or equal to zero
+            If any value in ``alpha`` is less than zero
 
         Notes
         -----
@@ -4326,8 +4406,8 @@ cdef class Generator:
         alpha_arr = <np.ndarray>np.PyArray_FROMANY(
             alpha, np.NPY_DOUBLE, 1, 1,
             np.NPY_ARRAY_ALIGNED | np.NPY_ARRAY_C_CONTIGUOUS)
-        if np.any(np.less_equal(alpha_arr, 0)):
-            raise ValueError('alpha <= 0')
+        if np.any(np.less(alpha_arr, 0)):
+            raise ValueError('alpha < 0')
         alpha_data = <double*>np.PyArray_DATA(alpha_arr)
 
         if size is None:
@@ -4745,7 +4825,7 @@ cdef class Generator:
         >>> rng.permutation("abc")
         Traceback (most recent call last):
             ...
-        numpy.AxisError: axis 0 is out of bounds for array of dimension 0
+        numpy.exceptions.AxisError: axis 0 is out of bounds for array of dimension 0
 
         >>> arr = np.arange(9).reshape((3, 3))
         >>> rng.permutation(arr, axis=1)
@@ -4803,6 +4883,8 @@ def default_rng(seed=None):
     -----
     If ``seed`` is not a `BitGenerator` or a `Generator`, a new `BitGenerator`
     is instantiated. This function does not manage a default global instance.
+
+    See :ref:`seeding_and_entropy` for more information about seeding.
     
     Examples
     --------
diff --git a/numpy/random/_mt19937.pyx b/numpy/random/_mt19937.pyx
index 5a8d52e6b..8b991254a 100644
--- a/numpy/random/_mt19937.pyx
+++ b/numpy/random/_mt19937.pyx
@@ -28,16 +28,16 @@ cdef extern from "src/mt19937/mt19937.h":
     enum:
         RK_STATE_LEN
 
-cdef uint64_t mt19937_uint64(void *st) nogil:
+cdef uint64_t mt19937_uint64(void *st) noexcept nogil:
     return mt19937_next64(<mt19937_state *> st)
 
-cdef uint32_t mt19937_uint32(void *st) nogil:
+cdef uint32_t mt19937_uint32(void *st) noexcept nogil:
     return mt19937_next32(<mt19937_state *> st)
 
-cdef double mt19937_double(void *st) nogil:
+cdef double mt19937_double(void *st) noexcept nogil:
     return mt19937_next_double(<mt19937_state *> st)
 
-cdef uint64_t mt19937_raw(void *st) nogil:
+cdef uint64_t mt19937_raw(void *st) noexcept nogil:
     return <uint64_t>mt19937_next32(<mt19937_state *> st)
 
 cdef class MT19937(BitGenerator):
diff --git a/numpy/random/_pcg64.pyx b/numpy/random/_pcg64.pyx
index c0a10a812..dee38c039 100644
--- a/numpy/random/_pcg64.pyx
+++ b/numpy/random/_pcg64.pyx
@@ -30,13 +30,13 @@ cdef extern from "src/pcg64/pcg64.h":
     uint32_t pcg64_cm_next32(pcg64_state *state)  nogil
     void pcg64_cm_advance(pcg64_state *state, uint64_t *step)
 
-cdef uint64_t pcg64_uint64(void* st) nogil:
+cdef uint64_t pcg64_uint64(void* st) noexcept nogil:
     return pcg64_next64(<pcg64_state *>st)
 
-cdef uint32_t pcg64_uint32(void *st) nogil:
+cdef uint32_t pcg64_uint32(void *st) noexcept nogil:
     return pcg64_next32(<pcg64_state *> st)
 
-cdef double pcg64_double(void* st) nogil:
+cdef double pcg64_double(void* st) noexcept nogil:
     return uint64_to_double(pcg64_next64(<pcg64_state *>st))
 
 cdef uint64_t pcg64_cm_uint64(void* st) nogil:
diff --git a/numpy/random/_philox.pyx b/numpy/random/_philox.pyx
index d9a366e86..e0c0504f6 100644
--- a/numpy/random/_philox.pyx
+++ b/numpy/random/_philox.pyx
@@ -42,13 +42,13 @@ cdef extern from 'src/philox/philox.h':
     void philox_advance(uint64_t *step, philox_state *state)
 
 
-cdef uint64_t philox_uint64(void*st) nogil:
+cdef uint64_t philox_uint64(void*st) noexcept nogil:
     return philox_next64(<philox_state *> st)
 
-cdef uint32_t philox_uint32(void *st) nogil:
+cdef uint32_t philox_uint32(void *st) noexcept nogil:
     return philox_next32(<philox_state *> st)
 
-cdef double philox_double(void*st) nogil:
+cdef double philox_double(void*st) noexcept nogil:
     return uint64_to_double(philox_next64(<philox_state *> st))
 
 cdef class Philox(BitGenerator):
diff --git a/numpy/random/_sfc64.pyx b/numpy/random/_sfc64.pyx
index 1daee34f8..419045c1d 100644
--- a/numpy/random/_sfc64.pyx
+++ b/numpy/random/_sfc64.pyx
@@ -21,13 +21,13 @@ cdef extern from "src/sfc64/sfc64.h":
     void sfc64_set_state(sfc64_state *state, uint64_t *state_arr, int has_uint32, uint32_t uinteger)
 
 
-cdef uint64_t sfc64_uint64(void* st) nogil:
+cdef uint64_t sfc64_uint64(void* st) noexcept nogil:
     return sfc64_next64(<sfc64_state *>st)
 
-cdef uint32_t sfc64_uint32(void *st) nogil:
+cdef uint32_t sfc64_uint32(void *st) noexcept nogil:
     return sfc64_next32(<sfc64_state *> st)
 
-cdef double sfc64_double(void* st) nogil:
+cdef double sfc64_double(void* st) noexcept nogil:
     return uint64_to_double(sfc64_next64(<sfc64_state *>st))
 
 
diff --git a/numpy/random/bit_generator.pyi b/numpy/random/bit_generator.pyi
index e6e3b10cd..8b9779cad 100644
--- a/numpy/random/bit_generator.pyi
+++ b/numpy/random/bit_generator.pyi
@@ -96,6 +96,9 @@ class BitGenerator(abc.ABC):
     def state(self) -> Mapping[str, Any]: ...
     @state.setter
     def state(self, value: Mapping[str, Any]) -> None: ...
+    @property
+    def seed_seq(self) -> ISeedSequence: ...
+    def spawn(self, n_children: int) -> list[BitGenerator]: ...
     @overload
     def random_raw(self, size: None = ..., output: Literal[True] = ...) -> int: ...  # type: ignore[misc]
     @overload
diff --git a/numpy/random/bit_generator.pyx b/numpy/random/bit_generator.pyx
index 3da4fabce..83441747a 100644
--- a/numpy/random/bit_generator.pyx
+++ b/numpy/random/bit_generator.pyx
@@ -212,6 +212,9 @@ class ISpawnableSeedSequence(ISeedSequence):
         Spawn a number of child `SeedSequence` s by extending the
         `spawn_key`.
 
+        See :ref:`seedsequence-spawn` for additional notes on spawning
+        children.
+
         Parameters
         ----------
         n_children : int
@@ -260,6 +263,7 @@ cdef class SeedSequence():
     ----------
     entropy : {None, int, sequence[int]}, optional
         The entropy for creating a `SeedSequence`.
+        All integer values must be non-negative.
     spawn_key : {(), sequence[int]}, optional
         An additional source of entropy based on the position of this
         `SeedSequence` in the tree of such objects created with the
@@ -450,6 +454,9 @@ cdef class SeedSequence():
         Spawn a number of child `SeedSequence` s by extending the
         `spawn_key`.
 
+        See :ref:`seedsequence-spawn` for additional notes on spawning
+        children.
+
         Parameters
         ----------
         n_children : int
@@ -457,6 +464,12 @@ cdef class SeedSequence():
         Returns
         -------
         seqs : list of `SeedSequence` s
+
+        See Also
+        --------
+        random.Generator.spawn, random.BitGenerator.spawn :
+            Equivalent method on the generator and bit generator.
+
         """
         cdef uint32_t i
 
@@ -490,6 +503,7 @@ cdef class BitGenerator():
         ``array_like[ints]`` is passed, then it will be passed to
         `~numpy.random.SeedSequence` to derive the initial `BitGenerator` state.
         One may also pass in a `SeedSequence` instance.
+        All integer values must be non-negative.
 
     Attributes
     ----------
@@ -549,6 +563,59 @@ cdef class BitGenerator():
     def state(self, value):
         raise NotImplementedError('Not implemented in base BitGenerator')
 
+    @property
+    def seed_seq(self):
+        """
+        Get the seed sequence used to initialize the bit generator.
+
+        .. versionadded:: 1.25.0
+
+        Returns
+        -------
+        seed_seq : ISeedSequence
+            The SeedSequence object used to initialize the BitGenerator.
+            This is normally a `np.random.SeedSequence` instance.
+
+        """
+        return self._seed_seq
+
+    def spawn(self, int n_children):
+        """
+        spawn(n_children)
+
+        Create new independent child bit generators.
+
+        See :ref:`seedsequence-spawn` for additional notes on spawning
+        children.  Some bit generators also implement ``jumped``
+        as a different approach for creating independent streams.
+
+        .. versionadded:: 1.25.0
+
+        Parameters
+        ----------
+        n_children : int
+
+        Returns
+        -------
+        child_bit_generators : list of BitGenerators
+
+        Raises
+        ------
+        TypeError
+            When the underlying SeedSequence does not implement spawning.
+
+        See Also
+        --------
+        random.Generator.spawn, random.SeedSequence.spawn :
+            Equivalent method on the generator and seed sequence.
+
+        """
+        if not isinstance(self._seed_seq, ISpawnableSeedSequence):
+            raise TypeError(
+                "The underlying SeedSequence does not implement spawning.")
+
+        return [type(self)(seed=s) for s in self._seed_seq.spawn(n_children)]
+
     def random_raw(self, size=None, output=True):
         """
         random_raw(self, size=None)
diff --git a/numpy/random/c_distributions.pxd b/numpy/random/c_distributions.pxd
index 6f905edc1..b978d1350 100644
--- a/numpy/random/c_distributions.pxd
+++ b/numpy/random/c_distributions.pxd
@@ -28,18 +28,24 @@ cdef extern from "numpy/random/distributions.h":
 
     ctypedef s_binomial_t binomial_t
 
+    float random_standard_uniform_f(bitgen_t *bitgen_state) nogil
     double random_standard_uniform(bitgen_t *bitgen_state) nogil
     void random_standard_uniform_fill(bitgen_t* bitgen_state, npy_intp cnt, double *out) nogil
+    void random_standard_uniform_fill_f(bitgen_t *bitgen_state, npy_intp cnt, float *out) nogil
+    
     double random_standard_exponential(bitgen_t *bitgen_state) nogil
-    double random_standard_exponential_f(bitgen_t *bitgen_state) nogil
+    float random_standard_exponential_f(bitgen_t *bitgen_state) nogil
     void random_standard_exponential_fill(bitgen_t *bitgen_state, npy_intp cnt, double *out) nogil
-    void random_standard_exponential_fill_f(bitgen_t *bitgen_state, npy_intp cnt, double *out) nogil
+    void random_standard_exponential_fill_f(bitgen_t *bitgen_state, npy_intp cnt, float *out) nogil
     void random_standard_exponential_inv_fill(bitgen_t *bitgen_state, npy_intp cnt, double *out) nogil
-    void random_standard_exponential_inv_fill_f(bitgen_t *bitgen_state, npy_intp cnt, double *out) nogil
+    void random_standard_exponential_inv_fill_f(bitgen_t *bitgen_state, npy_intp cnt, float *out) nogil
+    
     double random_standard_normal(bitgen_t* bitgen_state) nogil
+    float random_standard_normal_f(bitgen_t *bitgen_state) nogil
     void random_standard_normal_fill(bitgen_t *bitgen_state, npy_intp count, double *out) nogil
     void random_standard_normal_fill_f(bitgen_t *bitgen_state, npy_intp count, float *out) nogil
     double random_standard_gamma(bitgen_t *bitgen_state, double shape) nogil
+    float random_standard_gamma_f(bitgen_t *bitgen_state, float shape) nogil
 
     float random_standard_uniform_f(bitgen_t *bitgen_state) nogil
     void random_standard_uniform_fill_f(bitgen_t* bitgen_state, npy_intp cnt, float *out) nogil
diff --git a/numpy/random/include/aligned_malloc.h b/numpy/random/include/aligned_malloc.h
index 43f68253d..0fff57b6c 100644
--- a/numpy/random/include/aligned_malloc.h
+++ b/numpy/random/include/aligned_malloc.h
@@ -6,7 +6,7 @@
 
 #define NPY_MEMALIGN 16 /* 16 for SSE2, 32 for AVX, 64 for Xeon Phi */
 
-static NPY_INLINE void *PyArray_realloc_aligned(void *p, size_t n)
+static inline void *PyArray_realloc_aligned(void *p, size_t n)
 {
     void *p1, **p2, *base;
     size_t old_offs, offs = NPY_MEMALIGN - 1 + sizeof(void *);
@@ -31,12 +31,12 @@ static NPY_INLINE void *PyArray_realloc_aligned(void *p, size_t n)
     return (void *)p2;
 }
 
-static NPY_INLINE void *PyArray_malloc_aligned(size_t n)
+static inline void *PyArray_malloc_aligned(size_t n)
 {
     return PyArray_realloc_aligned(NULL, n);
 }
 
-static NPY_INLINE void *PyArray_calloc_aligned(size_t n, size_t s)
+static inline void *PyArray_calloc_aligned(size_t n, size_t s)
 {
     void *p;
     if (NPY_UNLIKELY((p = PyArray_realloc_aligned(NULL, n * s)) == NULL))
@@ -45,7 +45,7 @@ static NPY_INLINE void *PyArray_calloc_aligned(size_t n, size_t s)
     return p;
 }
 
-static NPY_INLINE void PyArray_free_aligned(void *p)
+static inline void PyArray_free_aligned(void *p)
 {
     void *base = *(((void **)p) - 1);
     PyMem_Free(base);
diff --git a/numpy/random/meson.build b/numpy/random/meson.build
new file mode 100644
index 000000000..036cd81b9
--- /dev/null
+++ b/numpy/random/meson.build
@@ -0,0 +1,164 @@
+# Build npyrandom library
+# -----------------------
+npyrandom_sources = [
+  'src/distributions/logfactorial.c',
+  'src/distributions/distributions.c',
+  'src/distributions/random_mvhg_count.c',
+  'src/distributions/random_mvhg_marginals.c',
+  'src/distributions/random_hypergeometric.c',
+]
+
+npyrandom_lib = static_library('npyrandom',
+  npyrandom_sources,
+  c_args: staticlib_cflags,
+  # include_directories: '../core/include',
+  dependencies: [py_dep, np_core_dep],
+  install: true,
+  install_dir: np_dir / 'random/lib',
+)
+
+# Build Cython extensions for numpy.random
+# ----------------------------------------
+# pyx -> c transpile output depends on copied __init__.py and pxd files
+_cython_tree_random = [
+  fs.copyfile('__init__.py'),
+  fs.copyfile('__init__.pxd'),
+  fs.copyfile('_common.pxd'),
+  fs.copyfile('bit_generator.pxd'),
+  fs.copyfile('c_distributions.pxd'),
+]
+# Need to use `custom_target` because we need to install this .pxd file
+_cython_tree_random += custom_target('_bounded_integer_pxd',
+  output: '_bounded_integers.pxd',
+  input: '_bounded_integers.pxd.in',
+  command: [tempita_cli, '@INPUT@', '-o', '@OUTPUT@'],
+  install: true,
+  install_dir: np_dir / 'random'
+)
+
+_bounded_integers_pyx = custom_target('_bounded_integer_pyx',
+  output: '_bounded_integers.pyx',
+  input: '_bounded_integers.pyx.in',
+  command: [tempita_cli, '@INPUT@', '-o', '@OUTPUT@'],
+)
+
+c_args_random = [
+  cflags_large_file_support,
+  '-DNPY_NO_DEPRECATED_API=0',  # Cython still uses old NumPy C API
+]
+if host_machine.system() == 'cygwin'
+  c_args_random += ['-Wl,--export-all-symbols']
+endif
+
+# name, sources, extra link libs, extra c_args
+random_pyx_sources = [
+  ['_bounded_integers', _bounded_integers_pyx, [], npymath_lib],
+  ['_common', '_common.pyx', [], []],
+  ['_mt19937', ['_mt19937.pyx', 'src/mt19937/mt19937.c', 'src/mt19937/mt19937-jump.c'], [], []],
+  ['_philox', ['_philox.pyx', 'src/philox/philox.c'], [], []],
+  ['_pcg64', ['_pcg64.pyx', 'src/pcg64/pcg64.c'], ['-U__GNUC_GNU_INLINE__'], []],
+  ['_sfc64', ['_sfc64.pyx', 'src/sfc64/sfc64.c'], [], []],
+  ['bit_generator', 'bit_generator.pyx', [], []],
+  # The `fs.copyfile` usage here is needed because these two .pyx files import
+  # from _bounded_integers,and its pxd file is only present in the build directory
+  ['_generator', fs.copyfile('_generator.pyx'), [], npymath_lib],
+  ['mtrand', [
+      fs.copyfile('mtrand.pyx'),
+      'src/distributions/distributions.c',
+      'src/legacy/legacy-distributions.c'
+    ], ['-DNP_RANDOM_LEGACY=1'], npymath_lib,
+  ],
+]
+foreach gen: random_pyx_sources
+  py.extension_module(gen[0],
+    [gen[1], _cython_tree, _cython_tree_random],
+    c_args: [c_args_random, gen[2]],
+    include_directories: 'src',
+    dependencies: np_core_dep,
+    link_with: [npyrandom_lib, gen[3]],
+    install: true,
+    subdir: 'numpy/random',
+  )
+endforeach
+
+# Install Python sources, stub files, tests, examples and license
+# ---------------------------------------------------------------
+py.install_sources(
+  [
+    '__init__.pxd',
+    '__init__.py',
+    '__init__.pyi',
+    '_common.pxd',
+    '_generator.pyi',
+    '_mt19937.pyi',
+    '_pcg64.pyi',
+    '_pickle.py',
+    '_philox.pyi',
+    '_sfc64.pyi',
+    'bit_generator.pxd',
+    'bit_generator.pyi',
+    'c_distributions.pxd',
+    'LICENSE.md',
+    'mtrand.pyi',
+  ],
+  subdir: 'numpy/random'
+)
+
+py.install_sources(
+  [
+    'tests/__init__.py',
+    'tests/test_direct.py',
+    'tests/test_extending.py',
+    'tests/test_generator_mt19937.py',
+    'tests/test_generator_mt19937_regressions.py',
+    'tests/test_random.py',
+    'tests/test_randomstate.py',
+    'tests/test_randomstate_regression.py',
+    'tests/test_regression.py',
+    'tests/test_seed_sequence.py',
+    'tests/test_smoke.py',
+  ],
+  subdir: 'numpy/random/tests'
+)
+
+py.install_sources(
+  [
+    'tests/data/__init__.py',
+    'tests/data/mt19937-testset-1.csv',
+    'tests/data/mt19937-testset-2.csv',
+    'tests/data/pcg64-testset-1.csv',
+    'tests/data/pcg64-testset-2.csv',
+    'tests/data/pcg64dxsm-testset-1.csv',
+    'tests/data/pcg64dxsm-testset-2.csv',
+    'tests/data/philox-testset-1.csv',
+    'tests/data/philox-testset-2.csv',
+    'tests/data/sfc64-testset-1.csv',
+    'tests/data/sfc64-testset-2.csv',
+  ],
+  subdir: 'numpy/random/tests/data'
+)
+
+py.install_sources(
+  [
+  '_examples/cffi/extending.py',
+  '_examples/cffi/parse.py',
+  ],
+  subdir: 'numpy/random/_examples/cffi'
+)
+
+py.install_sources(
+  [
+  '_examples/cython/extending.pyx',
+  '_examples/cython/extending_distributions.pyx',
+  '_examples/cython/setup.py',
+  ],
+  subdir: 'numpy/random/_examples/cython'
+)
+
+py.install_sources(
+  [
+  '_examples/numba/extending.py',
+  '_examples/numba/extending_distributions.py',
+  ],
+  subdir: 'numpy/random/_examples/numba'
+)
diff --git a/numpy/random/mtrand.pyx b/numpy/random/mtrand.pyx
index edf812a4d..dfa553ee4 100644
--- a/numpy/random/mtrand.pyx
+++ b/numpy/random/mtrand.pyx
@@ -537,6 +537,22 @@ cdef class RandomState:
         out : ndarray or scalar
             Drawn samples from the parameterized exponential distribution.
 
+        Examples
+        --------
+        A real world example: Assume a company has 10000 customer support 
+        agents and the average time between customer calls is 4 minutes.
+
+        >>> n = 10000
+        >>> time_between_calls = np.random.default_rng().exponential(scale=4, size=n)
+
+        What is the probability that a customer will call in the next 
+        4 to 5 minutes? 
+        
+        >>> x = ((time_between_calls < 5).sum())/n 
+        >>> y = ((time_between_calls < 4).sum())/n
+        >>> x-y
+        0.08 # may vary
+
         See Also
         --------
         random.Generator.exponential: which should be used for new code.
@@ -3050,7 +3066,7 @@ cdef class RandomState:
         >>> b = []
         >>> for i in range(1000):
         ...    a = 10. + np.random.standard_normal(100)
-        ...    b.append(np.product(a))
+        ...    b.append(np.prod(a))
 
         >>> b = np.array(b) / np.min(b) # scale values to be positive
         >>> count, bins, ignored = plt.hist(b, 100, density=True, align='mid')
diff --git a/numpy/random/src/distributions/distributions.c b/numpy/random/src/distributions/distributions.c
index bd1e1faa4..cebeb07cf 100644
--- a/numpy/random/src/distributions/distributions.c
+++ b/numpy/random/src/distributions/distributions.c
@@ -9,14 +9,14 @@
 #include <assert.h>
 
 /* Inline generators for internal use */
-static NPY_INLINE uint32_t next_uint32(bitgen_t *bitgen_state) {
+static inline uint32_t next_uint32(bitgen_t *bitgen_state) {
   return bitgen_state->next_uint32(bitgen_state->state);
 }
-static NPY_INLINE uint64_t next_uint64(bitgen_t *bitgen_state) {
+static inline uint64_t next_uint64(bitgen_t *bitgen_state) {
   return bitgen_state->next_uint64(bitgen_state->state);
 }
 
-static NPY_INLINE float next_float(bitgen_t *bitgen_state) {
+static inline float next_float(bitgen_t *bitgen_state) {
   return (next_uint32(bitgen_state) >> 8) * (1.0f / 16777216.0f);
 }
 
@@ -960,7 +960,15 @@ RAND_INT_TYPE random_geometric_search(bitgen_t *bitgen_state, double p) {
 }
 
 int64_t random_geometric_inversion(bitgen_t *bitgen_state, double p) {
-  return (int64_t)ceil(-random_standard_exponential(bitgen_state) / npy_log1p(-p));
+  double z = ceil(-random_standard_exponential(bitgen_state) / npy_log1p(-p));
+  /*
+   * The constant 9.223372036854776e+18 is the smallest double that is
+   * larger than INT64_MAX.
+   */
+  if (z >= 9.223372036854776e+18) {
+    return INT64_MAX;
+  }
+  return (int64_t) z;
 }
 
 int64_t random_geometric(bitgen_t *bitgen_state, double p) {
@@ -1047,7 +1055,7 @@ uint64_t random_interval(bitgen_t *bitgen_state, uint64_t max) {
 }
 
 /* Bounded generators */
-static NPY_INLINE uint64_t gen_mask(uint64_t max) {
+static inline uint64_t gen_mask(uint64_t max) {
   uint64_t mask = max;
   mask |= mask >> 1;
   mask |= mask >> 2;
@@ -1059,8 +1067,8 @@ static NPY_INLINE uint64_t gen_mask(uint64_t max) {
 }
 
 /* Generate 16 bit random numbers using a 32 bit buffer. */
-static NPY_INLINE uint16_t buffered_uint16(bitgen_t *bitgen_state, int *bcnt,
-                                           uint32_t *buf) {
+static inline uint16_t buffered_uint16(bitgen_t *bitgen_state, int *bcnt,
+                                       uint32_t *buf) {
   if (!(bcnt[0])) {
     buf[0] = next_uint32(bitgen_state);
     bcnt[0] = 1;
@@ -1073,8 +1081,8 @@ static NPY_INLINE uint16_t buffered_uint16(bitgen_t *bitgen_state, int *bcnt,
 }
 
 /* Generate 8 bit random numbers using a 32 bit buffer. */
-static NPY_INLINE uint8_t buffered_uint8(bitgen_t *bitgen_state, int *bcnt,
-                                         uint32_t *buf) {
+static inline uint8_t buffered_uint8(bitgen_t *bitgen_state, int *bcnt,
+                                     uint32_t *buf) {
   if (!(bcnt[0])) {
     buf[0] = next_uint32(bitgen_state);
     bcnt[0] = 3;
@@ -1087,8 +1095,8 @@ static NPY_INLINE uint8_t buffered_uint8(bitgen_t *bitgen_state, int *bcnt,
 }
 
 /* Static `masked rejection` function called by random_bounded_uint64(...) */
-static NPY_INLINE uint64_t bounded_masked_uint64(bitgen_t *bitgen_state,
-                                                 uint64_t rng, uint64_t mask) {
+static inline uint64_t bounded_masked_uint64(bitgen_t *bitgen_state,
+                                             uint64_t rng, uint64_t mask) {
   uint64_t val;
 
   while ((val = (next_uint64(bitgen_state) & mask)) > rng)
@@ -1099,7 +1107,7 @@ static NPY_INLINE uint64_t bounded_masked_uint64(bitgen_t *bitgen_state,
 
 /* Static `masked rejection` function called by
  * random_buffered_bounded_uint32(...) */
-static NPY_INLINE uint32_t
+static inline uint32_t
 buffered_bounded_masked_uint32(bitgen_t *bitgen_state, uint32_t rng,
                                uint32_t mask, int *bcnt, uint32_t *buf) {
   /*
@@ -1118,7 +1126,7 @@ buffered_bounded_masked_uint32(bitgen_t *bitgen_state, uint32_t rng,
 
 /* Static `masked rejection` function called by
  * random_buffered_bounded_uint16(...) */
-static NPY_INLINE uint16_t
+static inline uint16_t
 buffered_bounded_masked_uint16(bitgen_t *bitgen_state, uint16_t rng,
                                uint16_t mask, int *bcnt, uint32_t *buf) {
   uint16_t val;
@@ -1131,10 +1139,11 @@ buffered_bounded_masked_uint16(bitgen_t *bitgen_state, uint16_t rng,
 
 /* Static `masked rejection` function called by
  * random_buffered_bounded_uint8(...) */
-static NPY_INLINE uint8_t buffered_bounded_masked_uint8(bitgen_t *bitgen_state,
-                                                        uint8_t rng,
-                                                        uint8_t mask, int *bcnt,
-                                                        uint32_t *buf) {
+static inline uint8_t buffered_bounded_masked_uint8(bitgen_t *bitgen_state,
+                                                    uint8_t rng,
+                                                    uint8_t mask,
+                                                    int *bcnt,
+                                                    uint32_t *buf) {
   uint8_t val;
 
   while ((val = (buffered_uint8(bitgen_state, bcnt, buf) & mask)) > rng)
@@ -1143,10 +1152,10 @@ static NPY_INLINE uint8_t buffered_bounded_masked_uint8(bitgen_t *bitgen_state,
   return val;
 }
 
-static NPY_INLINE npy_bool buffered_bounded_bool(bitgen_t *bitgen_state,
-                                                 npy_bool off, npy_bool rng,
-                                                 npy_bool mask, int *bcnt,
-                                                 uint32_t *buf) {
+static inline npy_bool buffered_bounded_bool(bitgen_t *bitgen_state,
+                                             npy_bool off, npy_bool rng,
+                                             npy_bool mask, int *bcnt,
+                                             uint32_t *buf) {
   if (rng == 0)
     return off;
   if (!(bcnt[0])) {
@@ -1160,8 +1169,8 @@ static NPY_INLINE npy_bool buffered_bounded_bool(bitgen_t *bitgen_state,
 }
 
 /* Static `Lemire rejection` function called by random_bounded_uint64(...) */
-static NPY_INLINE uint64_t bounded_lemire_uint64(bitgen_t *bitgen_state,
-                                                 uint64_t rng) {
+static inline uint64_t bounded_lemire_uint64(bitgen_t *bitgen_state,
+                                             uint64_t rng) {
   /*
    * Uses Lemire's algorithm - https://arxiv.org/abs/1805.10941
    *
@@ -1245,7 +1254,7 @@ static NPY_INLINE uint64_t bounded_lemire_uint64(bitgen_t *bitgen_state,
 
 /* Static `Lemire rejection` function called by
  * random_buffered_bounded_uint32(...) */
-static NPY_INLINE uint32_t buffered_bounded_lemire_uint32(
+static inline uint32_t buffered_bounded_lemire_uint32(
     bitgen_t *bitgen_state, uint32_t rng, int *bcnt, uint32_t *buf) {
   /*
    * Uses Lemire's algorithm - https://arxiv.org/abs/1805.10941
@@ -1285,7 +1294,7 @@ static NPY_INLINE uint32_t buffered_bounded_lemire_uint32(
 
 /* Static `Lemire rejection` function called by
  * random_buffered_bounded_uint16(...) */
-static NPY_INLINE uint16_t buffered_bounded_lemire_uint16(
+static inline uint16_t buffered_bounded_lemire_uint16(
     bitgen_t *bitgen_state, uint16_t rng, int *bcnt, uint32_t *buf) {
   /*
    * Uses Lemire's algorithm - https://arxiv.org/abs/1805.10941
@@ -1321,9 +1330,9 @@ static NPY_INLINE uint16_t buffered_bounded_lemire_uint16(
 
 /* Static `Lemire rejection` function called by
  * random_buffered_bounded_uint8(...) */
-static NPY_INLINE uint8_t buffered_bounded_lemire_uint8(bitgen_t *bitgen_state,
-                                                        uint8_t rng, int *bcnt,
-                                                        uint32_t *buf) {
+static inline uint8_t buffered_bounded_lemire_uint8(bitgen_t *bitgen_state,
+                                                    uint8_t rng, int *bcnt,
+                                                    uint32_t *buf) {
   /*
    * Uses Lemire's algorithm - https://arxiv.org/abs/1805.10941
    *
diff --git a/numpy/random/src/legacy/legacy-distributions.c b/numpy/random/src/legacy/legacy-distributions.c
index 443c1a4bf..b518b8a03 100644
--- a/numpy/random/src/legacy/legacy-distributions.c
+++ b/numpy/random/src/legacy/legacy-distributions.c
@@ -11,7 +11,7 @@
 #include "include/legacy-distributions.h"
 
 
-static NPY_INLINE double legacy_double(aug_bitgen_t *aug_state) {
+static inline double legacy_double(aug_bitgen_t *aug_state) {
   return aug_state->bit_generator->next_double(aug_state->bit_generator->state);
 }
 
@@ -494,4 +494,4 @@ int64_t legacy_logseries(bitgen_t *bitgen_state, double p) {
     }
     return 2;
   }
-}
-\ No newline at end of file
+}
diff --git a/numpy/random/src/mt19937/randomkit.c b/numpy/random/src/mt19937/randomkit.c
index f8ed4b49e..e718c2d06 100644
--- a/numpy/random/src/mt19937/randomkit.c
+++ b/numpy/random/src/mt19937/randomkit.c
@@ -247,7 +247,7 @@ unsigned long rk_random(rk_state *state) {
 /*
  * Returns an unsigned 64 bit random integer.
  */
-NPY_INLINE static npy_uint64 rk_uint64(rk_state *state) {
+static inline npy_uint64 rk_uint64(rk_state *state) {
   npy_uint64 upper = (npy_uint64)rk_random(state) << 32;
   npy_uint64 lower = (npy_uint64)rk_random(state);
   return upper | lower;
@@ -256,7 +256,7 @@ NPY_INLINE static npy_uint64 rk_uint64(rk_state *state) {
 /*
  * Returns an unsigned 32 bit random integer.
  */
-NPY_INLINE static npy_uint32 rk_uint32(rk_state *state) {
+static inline npy_uint32 rk_uint32(rk_state *state) {
   return (npy_uint32)rk_random(state);
 }
 
diff --git a/numpy/random/src/philox/philox.c b/numpy/random/src/philox/philox.c
index 6f2fad5a4..7909383e7 100644
--- a/numpy/random/src/philox/philox.c
+++ b/numpy/random/src/philox/philox.c
@@ -1,8 +1,8 @@
 #include "philox.h"
 
-extern NPY_INLINE uint64_t philox_next64(philox_state *state);
+extern inline uint64_t philox_next64(philox_state *state);
 
-extern NPY_INLINE uint32_t philox_next32(philox_state *state);
+extern inline uint32_t philox_next32(philox_state *state);
 
 extern void philox_jump(philox_state *state) {
   /* Advances state as-if 2^128 draws were made */
diff --git a/numpy/random/src/philox/philox.h b/numpy/random/src/philox/philox.h
index 81e034a47..ba7a67470 100644
--- a/numpy/random/src/philox/philox.h
+++ b/numpy/random/src/philox/philox.h
@@ -18,7 +18,7 @@ typedef struct r123array4x64 philox4x64_ctr_t;
 typedef struct r123array2x64 philox4x64_key_t;
 typedef struct r123array2x64 philox4x64_ukey_t;
 
-static NPY_INLINE struct r123array2x64
+static inline struct r123array2x64
 _philox4x64bumpkey(struct r123array2x64 key) {
   key.v[0] += (0x9E3779B97F4A7C15ULL);
   key.v[1] += (0xBB67AE8584CAA73BULL);
@@ -27,7 +27,7 @@ _philox4x64bumpkey(struct r123array2x64 key) {
 
 /* Prefer uint128 if available: GCC, clang, ICC */
 #ifdef __SIZEOF_INT128__
-static NPY_INLINE uint64_t mulhilo64(uint64_t a, uint64_t b, uint64_t *hip) {
+static inline uint64_t mulhilo64(uint64_t a, uint64_t b, uint64_t *hip) {
   __uint128_t product = ((__uint128_t)a) * ((__uint128_t)b);
   *hip = product >> 64;
   return (uint64_t)product;
@@ -39,13 +39,13 @@ static NPY_INLINE uint64_t mulhilo64(uint64_t a, uint64_t b, uint64_t *hip) {
 #pragma intrinsic(_umul128)
 #elif defined(_WIN64) && defined(_M_ARM64)
 #pragma intrinsic(__umulh)
-static NPY_INLINE uint64_t _umul128(uint64_t a, uint64_t b, uint64_t *high) {
+static inline uint64_t _umul128(uint64_t a, uint64_t b, uint64_t *high) {
   *high = __umulh(a, b);
   return a * b;
 }
 #else
 #pragma intrinsic(__emulu)
-static NPY_INLINE uint64_t _umul128(uint64_t a, uint64_t b, uint64_t *high) {
+static inline uint64_t _umul128(uint64_t a, uint64_t b, uint64_t *high) {
 
   uint64_t a_lo, a_hi, b_lo, b_hi, a_x_b_hi, a_x_b_mid, a_x_b_lo, b_x_a_mid,
       carry_bit;
@@ -68,11 +68,11 @@ static NPY_INLINE uint64_t _umul128(uint64_t a, uint64_t b, uint64_t *high) {
   return a_x_b_lo + ((a_x_b_mid + b_x_a_mid) << 32);
 }
 #endif
-static NPY_INLINE uint64_t mulhilo64(uint64_t a, uint64_t b, uint64_t *hip) {
+static inline uint64_t mulhilo64(uint64_t a, uint64_t b, uint64_t *hip) {
   return _umul128(a, b, hip);
 }
 #else
-static NPY_INLINE uint64_t _umul128(uint64_t a, uint64_t b, uint64_t *high) {
+static inline uint64_t _umul128(uint64_t a, uint64_t b, uint64_t *high) {
 
   uint64_t a_lo, a_hi, b_lo, b_hi, a_x_b_hi, a_x_b_mid, a_x_b_lo, b_x_a_mid,
       carry_bit;
@@ -94,16 +94,16 @@ static NPY_INLINE uint64_t _umul128(uint64_t a, uint64_t b, uint64_t *high) {
 
   return a_x_b_lo + ((a_x_b_mid + b_x_a_mid) << 32);
 }
-static NPY_INLINE uint64_t mulhilo64(uint64_t a, uint64_t b, uint64_t *hip) {
+static inline uint64_t mulhilo64(uint64_t a, uint64_t b, uint64_t *hip) {
   return _umul128(a, b, hip);
 }
 #endif
 #endif
 
-static NPY_INLINE struct r123array4x64 _philox4x64round(struct r123array4x64 ctr,
+static inline struct r123array4x64 _philox4x64round(struct r123array4x64 ctr,
                                                     struct r123array2x64 key);
 
-static NPY_INLINE struct r123array4x64 _philox4x64round(struct r123array4x64 ctr,
+static inline struct r123array4x64 _philox4x64round(struct r123array4x64 ctr,
                                                     struct r123array2x64 key) {
   uint64_t hi0;
   uint64_t hi1;
@@ -114,14 +114,14 @@ static NPY_INLINE struct r123array4x64 _philox4x64round(struct r123array4x64 ctr
   return out;
 }
 
-static NPY_INLINE philox4x64_key_t philox4x64keyinit(philox4x64_ukey_t uk) {
+static inline philox4x64_key_t philox4x64keyinit(philox4x64_ukey_t uk) {
   return uk;
 }
-static NPY_INLINE philox4x64_ctr_t philox4x64_R(unsigned int R,
+static inline philox4x64_ctr_t philox4x64_R(unsigned int R,
                                             philox4x64_ctr_t ctr,
                                             philox4x64_key_t key);
 
-static NPY_INLINE philox4x64_ctr_t philox4x64_R(unsigned int R,
+static inline philox4x64_ctr_t philox4x64_R(unsigned int R,
                                             philox4x64_ctr_t ctr,
                                             philox4x64_key_t key) {
   if (R > 0) {
@@ -199,7 +199,7 @@ typedef struct s_philox_state {
   uint32_t uinteger;
 } philox_state;
 
-static NPY_INLINE uint64_t philox_next(philox_state *state) {
+static inline uint64_t philox_next(philox_state *state) {
   uint64_t out;
   int i;
   philox4x64_ctr_t ct;
@@ -229,11 +229,11 @@ static NPY_INLINE uint64_t philox_next(philox_state *state) {
   return state->buffer[0];
 }
 
-static NPY_INLINE uint64_t philox_next64(philox_state *state) {
+static inline uint64_t philox_next64(philox_state *state) {
   return philox_next(state);
 }
 
-static NPY_INLINE uint32_t philox_next32(philox_state *state) {
+static inline uint32_t philox_next32(philox_state *state) {
   uint64_t next;
 
   if (state->has_uint32) {
diff --git a/numpy/random/src/sfc64/sfc64.h b/numpy/random/src/sfc64/sfc64.h
index 75c4118d3..f6526063e 100644
--- a/numpy/random/src/sfc64/sfc64.h
+++ b/numpy/random/src/sfc64/sfc64.h
@@ -14,7 +14,7 @@ typedef struct s_sfc64_state {
 } sfc64_state;
 
 
-static NPY_INLINE uint64_t rotl(const uint64_t value, unsigned int rot) {
+static inline uint64_t rotl(const uint64_t value, unsigned int rot) {
 #ifdef _WIN32
   return _rotl64(value, rot);
 #else
@@ -22,7 +22,7 @@ static NPY_INLINE uint64_t rotl(const uint64_t value, unsigned int rot) {
 #endif
 }
 
-static NPY_INLINE uint64_t sfc64_next(uint64_t *s) {
+static inline uint64_t sfc64_next(uint64_t *s) {
   const uint64_t tmp = s[0] + s[1] + s[3]++;
 
   s[0] = s[1] ^ (s[1] >> 11);
@@ -33,11 +33,11 @@ static NPY_INLINE uint64_t sfc64_next(uint64_t *s) {
 }
 
 
-static NPY_INLINE uint64_t sfc64_next64(sfc64_state *state) {
+static inline uint64_t sfc64_next64(sfc64_state *state) {
   return sfc64_next(&state->s[0]);
 }
 
-static NPY_INLINE uint32_t sfc64_next32(sfc64_state *state) {
+static inline uint32_t sfc64_next32(sfc64_state *state) {
   uint64_t next;
   if (state->has_uint32) {
     state->has_uint32 = 0;
diff --git a/numpy/random/tests/test_direct.py b/numpy/random/tests/test_direct.py
index 58d966adf..fa2ae866b 100644
--- a/numpy/random/tests/test_direct.py
+++ b/numpy/random/tests/test_direct.py
@@ -148,6 +148,46 @@ def test_seedsequence():
     assert len(dummy.spawn(10)) == 10
 
 
+def test_generator_spawning():
+    """ Test spawning new generators and bit_generators directly.
+    """
+    rng = np.random.default_rng()
+    seq = rng.bit_generator.seed_seq
+    new_ss = seq.spawn(5)
+    expected_keys = [seq.spawn_key + (i,) for i in range(5)]
+    assert [c.spawn_key for c in new_ss] == expected_keys
+
+    new_bgs = rng.bit_generator.spawn(5)
+    expected_keys = [seq.spawn_key + (i,) for i in range(5, 10)]
+    assert [bg.seed_seq.spawn_key for bg in new_bgs] == expected_keys
+
+    new_rngs = rng.spawn(5)
+    expected_keys = [seq.spawn_key + (i,) for i in range(10, 15)]
+    found_keys = [rng.bit_generator.seed_seq.spawn_key for rng in new_rngs]
+    assert found_keys == expected_keys
+
+    # Sanity check that streams are actually different:
+    assert new_rngs[0].uniform() != new_rngs[1].uniform()
+
+
+def test_non_spawnable():
+    from numpy.random.bit_generator import ISeedSequence
+
+    class FakeSeedSequence:
+        def generate_state(self, n_words, dtype=np.uint32):
+            return np.zeros(n_words, dtype=dtype)
+
+    ISeedSequence.register(FakeSeedSequence)
+
+    rng = np.random.default_rng(FakeSeedSequence())
+
+    with pytest.raises(TypeError, match="The underlying SeedSequence"):
+        rng.spawn(5)
+
+    with pytest.raises(TypeError, match="The underlying SeedSequence"):
+        rng.bit_generator.spawn(5)
+
+
 class Base:
     dtype = np.uint64
     data2 = data1 = {}
diff --git a/numpy/random/tests/test_extending.py b/numpy/random/tests/test_extending.py
index 04b13cb8c..67a84d57d 100644
--- a/numpy/random/tests/test_extending.py
+++ b/numpy/random/tests/test_extending.py
@@ -6,6 +6,7 @@ import sys
 import warnings
 import numpy as np
 from numpy.distutils.misc_util import exec_mod_from_location
+from numpy.testing import IS_WASM
 
 try:
     import cffi
@@ -22,7 +23,8 @@ try:
         # numba issue gh-4733
         warnings.filterwarnings('always', '', DeprecationWarning)
         import numba
-except ImportError:
+except (ImportError, SystemError):
+    # Certain numpy/numba versions trigger a SystemError due to a numba bug
     numba = None
 
 try:
@@ -31,7 +33,7 @@ try:
 except ImportError:
     cython = None
 else:
-    from numpy.compat import _pep440
+    from numpy._utils import _pep440
     # Cython 0.29.30 is required for Python 3.11 and there are
     # other fixes in the 0.29 series that are needed even for earlier
     # Python versions.
@@ -41,6 +43,8 @@ else:
         # too old or wrong cython, skip the test
         cython = None
 
+
+@pytest.mark.skipif(IS_WASM, reason="Can't start subprocess")
 @pytest.mark.skipif(cython is None, reason="requires cython")
 @pytest.mark.slow
 def test_cython(tmp_path):
diff --git a/numpy/random/tests/test_generator_mt19937.py b/numpy/random/tests/test_generator_mt19937.py
index 1710df8ca..5c4c2cbf9 100644
--- a/numpy/random/tests/test_generator_mt19937.py
+++ b/numpy/random/tests/test_generator_mt19937.py
@@ -8,7 +8,7 @@ from numpy.linalg import LinAlgError
 from numpy.testing import (
     assert_, assert_raises, assert_equal, assert_allclose,
     assert_warns, assert_no_warnings, assert_array_equal,
-    assert_array_almost_equal, suppress_warnings)
+    assert_array_almost_equal, suppress_warnings, IS_WASM)
 
 from numpy.random import Generator, MT19937, SeedSequence, RandomState
 
@@ -345,6 +345,8 @@ class TestIntegers:
                           endpoint=endpoint, dtype=dt)
             assert_raises(ValueError, self.rfunc, 1, [0],
                           endpoint=endpoint, dtype=dt)
+            assert_raises(ValueError, self.rfunc, [ubnd+1], [ubnd],
+                          endpoint=endpoint, dtype=dt)
 
     def test_bounds_checking_array(self, endpoint):
         for dt in self.itype:
@@ -1391,6 +1393,7 @@ class TestRandomDist:
                              [5, 5, 3, 1, 2, 4]]])
         assert_array_equal(actual, desired)
 
+    @pytest.mark.skipif(IS_WASM, reason="fp errors don't work in wasm")
     @pytest.mark.parametrize("method", ["svd", "eigh", "cholesky"])
     def test_multivariate_normal(self, method):
         random = Generator(MT19937(self.seed))
@@ -2452,6 +2455,7 @@ class TestBroadcast:
         assert actual.shape == (3, 0, 7, 4)
 
 
+@pytest.mark.skipif(IS_WASM, reason="can't start thread")
 class TestThread:
     # make sure each state produces the same sequence even in threads
     def setup_method(self):
diff --git a/numpy/random/tests/test_generator_mt19937_regressions.py b/numpy/random/tests/test_generator_mt19937_regressions.py
index 0227d6502..7c2b6867c 100644
--- a/numpy/random/tests/test_generator_mt19937_regressions.py
+++ b/numpy/random/tests/test_generator_mt19937_regressions.py
@@ -3,32 +3,32 @@ import numpy as np
 import pytest
 from numpy.random import Generator, MT19937
 
-mt19937 = Generator(MT19937())
-
 
 class TestRegression:
 
+    def setup_method(self):
+        self.mt19937 = Generator(MT19937(121263137472525314065))
+
     def test_vonmises_range(self):
         # Make sure generated random variables are in [-pi, pi].
         # Regression test for ticket #986.
         for mu in np.linspace(-7., 7., 5):
-            r = mt19937.vonmises(mu, 1, 50)
+            r = self.mt19937.vonmises(mu, 1, 50)
             assert_(np.all(r > -np.pi) and np.all(r <= np.pi))
 
     def test_hypergeometric_range(self):
         # Test for ticket #921
-        assert_(np.all(mt19937.hypergeometric(3, 18, 11, size=10) < 4))
-        assert_(np.all(mt19937.hypergeometric(18, 3, 11, size=10) > 0))
+        assert_(np.all(self.mt19937.hypergeometric(3, 18, 11, size=10) < 4))
+        assert_(np.all(self.mt19937.hypergeometric(18, 3, 11, size=10) > 0))
 
         # Test for ticket #5623
         args = (2**20 - 2, 2**20 - 2, 2**20 - 2)  # Check for 32-bit systems
-        assert_(mt19937.hypergeometric(*args) > 0)
+        assert_(self.mt19937.hypergeometric(*args) > 0)
 
     def test_logseries_convergence(self):
         # Test for ticket #923
         N = 1000
-        mt19937 = Generator(MT19937(0))
-        rvsn = mt19937.logseries(0.8, size=N)
+        rvsn = self.mt19937.logseries(0.8, size=N)
         # these two frequency counts should be close to theoretical
         # numbers with this large sample
         # theoretical large N result is 0.49706795
@@ -65,41 +65,38 @@ class TestRegression:
         # Test for multivariate_normal issue with 'size' argument.
         # Check that the multivariate_normal size argument can be a
         # numpy integer.
-        mt19937.multivariate_normal([0], [[0]], size=1)
-        mt19937.multivariate_normal([0], [[0]], size=np.int_(1))
-        mt19937.multivariate_normal([0], [[0]], size=np.int64(1))
+        self.mt19937.multivariate_normal([0], [[0]], size=1)
+        self.mt19937.multivariate_normal([0], [[0]], size=np.int_(1))
+        self.mt19937.multivariate_normal([0], [[0]], size=np.int64(1))
 
     def test_beta_small_parameters(self):
         # Test that beta with small a and b parameters does not produce
         # NaNs due to roundoff errors causing 0 / 0, gh-5851
-        mt19937 = Generator(MT19937(1234567890))
-        x = mt19937.beta(0.0001, 0.0001, size=100)
+        x = self.mt19937.beta(0.0001, 0.0001, size=100)
         assert_(not np.any(np.isnan(x)), 'Nans in mt19937.beta')
 
     def test_choice_sum_of_probs_tolerance(self):
         # The sum of probs should be 1.0 with some tolerance.
         # For low precision dtypes the tolerance was too tight.
         # See numpy github issue 6123.
-        mt19937 = Generator(MT19937(1234))
         a = [1, 2, 3]
         counts = [4, 4, 2]
         for dt in np.float16, np.float32, np.float64:
             probs = np.array(counts, dtype=dt) / sum(counts)
-            c = mt19937.choice(a, p=probs)
+            c = self.mt19937.choice(a, p=probs)
             assert_(c in a)
             with pytest.raises(ValueError):
-                mt19937.choice(a, p=probs*0.9)
+                self.mt19937.choice(a, p=probs*0.9)
 
     def test_shuffle_of_array_of_different_length_strings(self):
         # Test that permuting an array of different length strings
         # will not cause a segfault on garbage collection
         # Tests gh-7710
-        mt19937 = Generator(MT19937(1234))
 
         a = np.array(['a', 'a' * 1000])
 
         for _ in range(100):
-            mt19937.shuffle(a)
+            self.mt19937.shuffle(a)
 
         # Force Garbage Collection - should not segfault.
         import gc
@@ -109,17 +106,17 @@ class TestRegression:
         # Test that permuting an array of objects will not cause
         # a segfault on garbage collection.
         # See gh-7719
-        mt19937 = Generator(MT19937(1234))
         a = np.array([np.arange(1), np.arange(4)], dtype=object)
 
         for _ in range(1000):
-            mt19937.shuffle(a)
+            self.mt19937.shuffle(a)
 
         # Force Garbage Collection - should not segfault.
         import gc
         gc.collect()
 
     def test_permutation_subclass(self):
+
         class N(np.ndarray):
             pass
 
@@ -142,9 +139,16 @@ class TestRegression:
         assert_array_equal(m.__array__(), np.arange(5))
 
     def test_gamma_0(self):
-        assert mt19937.standard_gamma(0.0) == 0.0
-        assert_array_equal(mt19937.standard_gamma([0.0]), 0.0)
+        assert self.mt19937.standard_gamma(0.0) == 0.0
+        assert_array_equal(self.mt19937.standard_gamma([0.0]), 0.0)
 
-        actual = mt19937.standard_gamma([0.0], dtype='float')
+        actual = self.mt19937.standard_gamma([0.0], dtype='float')
         expected = np.array([0.], dtype=np.float32)
         assert_array_equal(actual, expected)
+
+    def test_geometric_tiny_prob(self):
+        # Regression test for gh-17007.
+        # When p = 1e-30, the probability that a sample will exceed 2**63-1
+        # is 0.9999999999907766, so we expect the result to be all 2**63-1.
+        assert_array_equal(self.mt19937.geometric(p=1e-30, size=3),
+                           np.iinfo(np.int64).max)
diff --git a/numpy/random/tests/test_random.py b/numpy/random/tests/test_random.py
index 6b4c82bc9..0f4e7925a 100644
--- a/numpy/random/tests/test_random.py
+++ b/numpy/random/tests/test_random.py
@@ -6,7 +6,7 @@ import numpy as np
 from numpy.testing import (
         assert_, assert_raises, assert_equal, assert_warns,
         assert_no_warnings, assert_array_equal, assert_array_almost_equal,
-        suppress_warnings
+        suppress_warnings, IS_WASM
         )
 from numpy import random
 import sys
@@ -1615,6 +1615,7 @@ class TestBroadcast:
         assert_raises(ValueError, logseries, bad_p_two * 3)
 
 
+@pytest.mark.skipif(IS_WASM, reason="can't start thread")
 class TestThread:
     # make sure each state produces the same sequence even in threads
     def setup_method(self):
diff --git a/numpy/random/tests/test_randomstate.py b/numpy/random/tests/test_randomstate.py
index be9a9c339..3a2961098 100644
--- a/numpy/random/tests/test_randomstate.py
+++ b/numpy/random/tests/test_randomstate.py
@@ -8,7 +8,7 @@ import pytest
 from numpy.testing import (
         assert_, assert_raises, assert_equal, assert_warns,
         assert_no_warnings, assert_array_equal, assert_array_almost_equal,
-        suppress_warnings
+        suppress_warnings, IS_WASM
         )
 
 from numpy.random import MT19937, PCG64
@@ -812,6 +812,10 @@ class TestRandomDist:
         alpha = np.array([5.4e-01, -1.0e-16])
         assert_raises(ValueError, random.dirichlet, alpha)
 
+    def test_dirichlet_zero_alpha(self):
+        y = random.default_rng().dirichlet([5, 9, 0, 8])
+        assert_equal(y[2], 0)
+
     def test_dirichlet_alpha_non_contiguous(self):
         a = np.array([51.72840233779265162, -1.0, 39.74494232180943953])
         alpha = a[::2]
@@ -1894,6 +1898,7 @@ class TestBroadcast:
         assert_raises(ValueError, logseries, bad_p_two * 3)
 
 
+@pytest.mark.skipif(IS_WASM, reason="can't start thread")
 class TestThread:
     # make sure each state produces the same sequence even in threads
     def setup_method(self):
diff --git a/numpy/setup.py b/numpy/setup.py
index 28c28d1ac..b6f879bcc 100644
--- a/numpy/setup.py
+++ b/numpy/setup.py
@@ -20,6 +20,7 @@ def configuration(parent_package='',top_path=None):
     config.add_subpackage('testing')
     config.add_subpackage('typing')
     config.add_subpackage('_typing')
+    config.add_subpackage('_utils')
     config.add_data_dir('doc')
     config.add_data_files('py.typed')
     config.add_data_files('*.pyi')
diff --git a/numpy/testing/__init__.py b/numpy/testing/__init__.py
index 087527e43..8a34221e4 100644
--- a/numpy/testing/__init__.py
+++ b/numpy/testing/__init__.py
@@ -10,12 +10,12 @@ from unittest import TestCase
 from . import _private
 from ._private.utils import *
 from ._private.utils import (_assert_valid_refcount, _gen_alignment_data)
-from ._private import extbuild, decorators as dec
-from ._private.nosetester import (
-    run_module_suite, NoseTester as Tester
-    )
+from ._private import extbuild
+from . import overrides
 
-__all__ = _private.utils.__all__ + ['TestCase', 'run_module_suite']
+__all__ = (
+    _private.utils.__all__ + ['TestCase', 'overrides']
+)
 
 from numpy._pytesttester import PytestTester
 test = PytestTester(__name__)
diff --git a/numpy/testing/__init__.pyi b/numpy/testing/__init__.pyi
index a981d6113..d65860ccb 100644
--- a/numpy/testing/__init__.pyi
+++ b/numpy/testing/__init__.pyi
@@ -18,7 +18,6 @@ from numpy.testing._private.utils import (
     jiffies as jiffies,
     memusage as memusage,
     print_assert_equal as print_assert_equal,
-    raises as raises,
     rundocs as rundocs,
     runstring as runstring,
     verbose as verbose,
@@ -49,8 +48,3 @@ from numpy.testing._private.utils import (
 __all__: list[str]
 __path__: list[str]
 test: PytestTester
-
-def run_module_suite(
-    file_to_run: None | str = ...,
-    argv: None | list[str] = ...,
-) -> None: ...
diff --git a/numpy/testing/_private/decorators.py b/numpy/testing/_private/decorators.py
deleted file mode 100644
index cb49d9a73..000000000
--- a/numpy/testing/_private/decorators.py
+++ /dev/null
@@ -1,331 +0,0 @@
-"""
-Decorators for labeling and modifying behavior of test objects.
-
-Decorators that merely return a modified version of the original
-function object are straightforward. Decorators that return a new
-function object need to use
-::
-
-  nose.tools.make_decorator(original_function)(decorator)
-
-in returning the decorator, in order to preserve meta-data such as
-function name, setup and teardown functions and so on - see
-``nose.tools`` for more information.
-
-"""
-import collections.abc
-import warnings
-
-from .utils import SkipTest, assert_warns, HAS_REFCOUNT
-
-__all__ = ['slow', 'setastest', 'skipif', 'knownfailureif', 'deprecated',
-           'parametrize', '_needs_refcount',]
-
-
-def slow(t):
-    """
-    .. deprecated:: 1.21
-        This decorator is retained for compatibility with the nose testing framework, which is being phased out.
-        Please use the nose2 or pytest frameworks instead.
-
-    Label a test as 'slow'.
-
-    The exact definition of a slow test is obviously both subjective and
-    hardware-dependent, but in general any individual test that requires more
-    than a second or two should be labeled as slow (the whole suite consists of
-    thousands of tests, so even a second is significant).
-
-    Parameters
-    ----------
-    t : callable
-        The test to label as slow.
-
-    Returns
-    -------
-    t : callable
-        The decorated test `t`.
-
-    Examples
-    --------
-    The `numpy.testing` module includes ``import decorators as dec``.
-    A test can be decorated as slow like this::
-
-      from numpy.testing import *
-
-      @dec.slow
-      def test_big(self):
-          print('Big, slow test')
-
-    """
-    # Numpy 1.21, 2020-12-20
-    warnings.warn('the np.testing.dec decorators are included for nose support, and are '
-                'deprecated since NumPy v1.21. Use the nose2 or pytest frameworks instead.', DeprecationWarning, stacklevel=2)
-
-    t.slow = True
-    return t
-
-def setastest(tf=True):
-    """
-    .. deprecated:: 1.21
-        This decorator is retained for compatibility with the nose testing framework, which is being phased out.
-        Please use the nose2 or pytest frameworks instead.
-
-    Signals to nose that this function is or is not a test.
-
-    Parameters
-    ----------
-    tf : bool
-        If True, specifies that the decorated callable is a test.
-        If False, specifies that the decorated callable is not a test.
-        Default is True.
-
-    Notes
-    -----
-    This decorator can't use the nose namespace, because it can be
-    called from a non-test module. See also ``istest`` and ``nottest`` in
-    ``nose.tools``.
-
-    Examples
-    --------
-    `setastest` can be used in the following way::
-
-      from numpy.testing import dec
-
-      @dec.setastest(False)
-      def func_with_test_in_name(arg1, arg2):
-          pass
-
-    """
-    # Numpy 1.21, 2020-12-20
-    warnings.warn('the np.testing.dec decorators are included for nose support, and are '
-            'deprecated since NumPy v1.21. Use the nose2 or pytest frameworks instead.', DeprecationWarning, stacklevel=2)
-    def set_test(t):
-        t.__test__ = tf
-        return t
-    return set_test
-
-def skipif(skip_condition, msg=None):
-    """
-    .. deprecated:: 1.21
-        This decorator is retained for compatibility with the nose testing framework, which is being phased out.
-        Please use the nose2 or pytest frameworks instead.
-
-    Make function raise SkipTest exception if a given condition is true.
-
-    If the condition is a callable, it is used at runtime to dynamically
-    make the decision. This is useful for tests that may require costly
-    imports, to delay the cost until the test suite is actually executed.
-
-    Parameters
-    ----------
-    skip_condition : bool or callable
-        Flag to determine whether to skip the decorated test.
-    msg : str, optional
-        Message to give on raising a SkipTest exception. Default is None.
-
-    Returns
-    -------
-    decorator : function
-        Decorator which, when applied to a function, causes SkipTest
-        to be raised when `skip_condition` is True, and the function
-        to be called normally otherwise.
-
-    Notes
-    -----
-    The decorator itself is decorated with the ``nose.tools.make_decorator``
-    function in order to transmit function name, and various other metadata.
-
-    """
-
-    def skip_decorator(f):
-        # Local import to avoid a hard nose dependency and only incur the
-        # import time overhead at actual test-time.
-        import nose
-
-        # Numpy 1.21, 2020-12-20
-        warnings.warn('the np.testing.dec decorators are included for nose support, and are '
-            'deprecated since NumPy v1.21. Use the nose2 or pytest frameworks instead.', DeprecationWarning, stacklevel=2)
-
-        # Allow for both boolean or callable skip conditions.
-        if isinstance(skip_condition, collections.abc.Callable):
-            skip_val = lambda: skip_condition()
-        else:
-            skip_val = lambda: skip_condition
-
-        def get_msg(func,msg=None):
-            """Skip message with information about function being skipped."""
-            if msg is None:
-                out = 'Test skipped due to test condition'
-            else:
-                out = msg
-
-            return f'Skipping test: {func.__name__}: {out}'
-
-        # We need to define *two* skippers because Python doesn't allow both
-        # return with value and yield inside the same function.
-        def skipper_func(*args, **kwargs):
-            """Skipper for normal test functions."""
-            if skip_val():
-                raise SkipTest(get_msg(f, msg))
-            else:
-                return f(*args, **kwargs)
-
-        def skipper_gen(*args, **kwargs):
-            """Skipper for test generators."""
-            if skip_val():
-                raise SkipTest(get_msg(f, msg))
-            else:
-                yield from f(*args, **kwargs)
-
-        # Choose the right skipper to use when building the actual decorator.
-        if nose.util.isgenerator(f):
-            skipper = skipper_gen
-        else:
-            skipper = skipper_func
-
-        return nose.tools.make_decorator(f)(skipper)
-
-    return skip_decorator
-
-
-def knownfailureif(fail_condition, msg=None):
-    """
-    .. deprecated:: 1.21
-        This decorator is retained for compatibility with the nose testing framework, which is being phased out.
-        Please use the nose2 or pytest frameworks instead.
-
-    Make function raise KnownFailureException exception if given condition is true.
-
-    If the condition is a callable, it is used at runtime to dynamically
-    make the decision. This is useful for tests that may require costly
-    imports, to delay the cost until the test suite is actually executed.
-
-    Parameters
-    ----------
-    fail_condition : bool or callable
-        Flag to determine whether to mark the decorated test as a known
-        failure (if True) or not (if False).
-    msg : str, optional
-        Message to give on raising a KnownFailureException exception.
-        Default is None.
-
-    Returns
-    -------
-    decorator : function
-        Decorator, which, when applied to a function, causes
-        KnownFailureException to be raised when `fail_condition` is True,
-        and the function to be called normally otherwise.
-
-    Notes
-    -----
-    The decorator itself is decorated with the ``nose.tools.make_decorator``
-    function in order to transmit function name, and various other metadata.
-
-    """
-    # Numpy 1.21, 2020-12-20
-    warnings.warn('the np.testing.dec decorators are included for nose support, and are '
-            'deprecated since NumPy v1.21. Use the nose2 or pytest frameworks instead.', DeprecationWarning, stacklevel=2)
-
-    if msg is None:
-        msg = 'Test skipped due to known failure'
-
-    # Allow for both boolean or callable known failure conditions.
-    if isinstance(fail_condition, collections.abc.Callable):
-        fail_val = lambda: fail_condition()
-    else:
-        fail_val = lambda: fail_condition
-
-    def knownfail_decorator(f):
-        # Local import to avoid a hard nose dependency and only incur the
-        # import time overhead at actual test-time.
-        import nose
-        from .noseclasses import KnownFailureException
-
-        def knownfailer(*args, **kwargs):
-            if fail_val():
-                raise KnownFailureException(msg)
-            else:
-                return f(*args, **kwargs)
-        return nose.tools.make_decorator(f)(knownfailer)
-
-    return knownfail_decorator
-
-def deprecated(conditional=True):
-    """
-    .. deprecated:: 1.21
-        This decorator is retained for compatibility with the nose testing framework, which is being phased out.
-        Please use the nose2 or pytest frameworks instead.
-
-    Filter deprecation warnings while running the test suite.
-
-    This decorator can be used to filter DeprecationWarning's, to avoid
-    printing them during the test suite run, while checking that the test
-    actually raises a DeprecationWarning.
-
-    Parameters
-    ----------
-    conditional : bool or callable, optional
-        Flag to determine whether to mark test as deprecated or not. If the
-        condition is a callable, it is used at runtime to dynamically make the
-        decision. Default is True.
-
-    Returns
-    -------
-    decorator : function
-        The `deprecated` decorator itself.
-
-    Notes
-    -----
-    .. versionadded:: 1.4.0
-
-    """
-    def deprecate_decorator(f):
-        # Local import to avoid a hard nose dependency and only incur the
-        # import time overhead at actual test-time.
-        import nose
-
-        # Numpy 1.21, 2020-12-20
-        warnings.warn('the np.testing.dec decorators are included for nose support, and are '
-            'deprecated since NumPy v1.21. Use the nose2 or pytest frameworks instead.', DeprecationWarning, stacklevel=2)
-
-        def _deprecated_imp(*args, **kwargs):
-            # Poor man's replacement for the with statement
-            with assert_warns(DeprecationWarning):
-                f(*args, **kwargs)
-
-        if isinstance(conditional, collections.abc.Callable):
-            cond = conditional()
-        else:
-            cond = conditional
-        if cond:
-            return nose.tools.make_decorator(f)(_deprecated_imp)
-        else:
-            return f
-    return deprecate_decorator
-
-
-def parametrize(vars, input):
-    """
-    .. deprecated:: 1.21
-        This decorator is retained for compatibility with the nose testing framework, which is being phased out.
-        Please use the nose2 or pytest frameworks instead.
-
-    Pytest compatibility class. This implements the simplest level of
-    pytest.mark.parametrize for use in nose as an aid in making the transition
-    to pytest. It achieves that by adding a dummy var parameter and ignoring
-    the doc_func parameter of the base class. It does not support variable
-    substitution by name, nor does it support nesting or classes. See the
-    pytest documentation for usage.
-
-    .. versionadded:: 1.14.0
-
-    """
-    from .parameterized import parameterized
-
-    # Numpy 1.21, 2020-12-20
-    warnings.warn('the np.testing.dec decorators are included for nose support, and are '
-            'deprecated since NumPy v1.21. Use the nose2 or pytest frameworks instead.', DeprecationWarning, stacklevel=2)
-
-    return parameterized(input)
-
-_needs_refcount = skipif(not HAS_REFCOUNT, "python has no sys.getrefcount")
diff --git a/numpy/testing/_private/noseclasses.py b/numpy/testing/_private/noseclasses.py
deleted file mode 100644
index 48fa4dc1f..000000000
--- a/numpy/testing/_private/noseclasses.py
+++ /dev/null
@@ -1,364 +0,0 @@
-# These classes implement a doctest runner plugin for nose, a "known failure"
-# error class, and a customized TestProgram for NumPy.
-
-# Because this module imports nose directly, it should not
-# be used except by nosetester.py to avoid a general NumPy
-# dependency on nose.
-import os
-import sys
-import doctest
-import inspect
-
-import numpy
-import nose
-from nose.plugins import doctests as npd
-from nose.plugins.errorclass import ErrorClass, ErrorClassPlugin
-from nose.plugins.base import Plugin
-from nose.util import src
-from .nosetester import get_package_name
-from .utils import KnownFailureException, KnownFailureTest
-
-
-# Some of the classes in this module begin with 'Numpy' to clearly distinguish
-# them from the plethora of very similar names from nose/unittest/doctest
-
-#-----------------------------------------------------------------------------
-# Modified version of the one in the stdlib, that fixes a python bug (doctests
-# not found in extension modules, https://bugs.python.org/issue3158)
-class NumpyDocTestFinder(doctest.DocTestFinder):
-
-    def _from_module(self, module, object):
-        """
-        Return true if the given object is defined in the given
-        module.
-        """
-        if module is None:
-            return True
-        elif inspect.isfunction(object):
-            return module.__dict__ is object.__globals__
-        elif inspect.isbuiltin(object):
-            return module.__name__ == object.__module__
-        elif inspect.isclass(object):
-            return module.__name__ == object.__module__
-        elif inspect.ismethod(object):
-            # This one may be a bug in cython that fails to correctly set the
-            # __module__ attribute of methods, but since the same error is easy
-            # to make by extension code writers, having this safety in place
-            # isn't such a bad idea
-            return module.__name__ == object.__self__.__class__.__module__
-        elif inspect.getmodule(object) is not None:
-            return module is inspect.getmodule(object)
-        elif hasattr(object, '__module__'):
-            return module.__name__ == object.__module__
-        elif isinstance(object, property):
-            return True  # [XX] no way not be sure.
-        else:
-            raise ValueError("object must be a class or function")
-
-    def _find(self, tests, obj, name, module, source_lines, globs, seen):
-        """
-        Find tests for the given object and any contained objects, and
-        add them to `tests`.
-        """
-
-        doctest.DocTestFinder._find(self, tests, obj, name, module,
-                                    source_lines, globs, seen)
-
-        # Below we re-run pieces of the above method with manual modifications,
-        # because the original code is buggy and fails to correctly identify
-        # doctests in extension modules.
-
-        # Local shorthands
-        from inspect import (
-            isroutine, isclass, ismodule, isfunction, ismethod
-            )
-
-        # Look for tests in a module's contained objects.
-        if ismodule(obj) and self._recurse:
-            for valname, val in obj.__dict__.items():
-                valname1 = f'{name}.{valname}'
-                if ( (isroutine(val) or isclass(val))
-                     and self._from_module(module, val)):
-
-                    self._find(tests, val, valname1, module, source_lines,
-                               globs, seen)
-
-        # Look for tests in a class's contained objects.
-        if isclass(obj) and self._recurse:
-            for valname, val in obj.__dict__.items():
-                # Special handling for staticmethod/classmethod.
-                if isinstance(val, staticmethod):
-                    val = getattr(obj, valname)
-                if isinstance(val, classmethod):
-                    val = getattr(obj, valname).__func__
-
-                # Recurse to methods, properties, and nested classes.
-                if ((isfunction(val) or isclass(val) or
-                     ismethod(val) or isinstance(val, property)) and
-                      self._from_module(module, val)):
-                    valname = f'{name}.{valname}'
-                    self._find(tests, val, valname, module, source_lines,
-                               globs, seen)
-
-
-# second-chance checker; if the default comparison doesn't
-# pass, then see if the expected output string contains flags that
-# tell us to ignore the output
-class NumpyOutputChecker(doctest.OutputChecker):
-    def check_output(self, want, got, optionflags):
-        ret = doctest.OutputChecker.check_output(self, want, got,
-                                                 optionflags)
-        if not ret:
-            if "#random" in want:
-                return True
-
-            # it would be useful to normalize endianness so that
-            # bigendian machines don't fail all the tests (and there are
-            # actually some bigendian examples in the doctests). Let's try
-            # making them all little endian
-            got = got.replace("'>", "'<")
-            want = want.replace("'>", "'<")
-
-            # try to normalize out 32 and 64 bit default int sizes
-            for sz in [4, 8]:
-                got = got.replace("'<i%d'" % sz, "int")
-                want = want.replace("'<i%d'" % sz, "int")
-
-            ret = doctest.OutputChecker.check_output(self, want,
-                    got, optionflags)
-
-        return ret
-
-
-# Subclass nose.plugins.doctests.DocTestCase to work around a bug in
-# its constructor that blocks non-default arguments from being passed
-# down into doctest.DocTestCase
-class NumpyDocTestCase(npd.DocTestCase):
-    def __init__(self, test, optionflags=0, setUp=None, tearDown=None,
-                 checker=None, obj=None, result_var='_'):
-        self._result_var = result_var
-        self._nose_obj = obj
-        doctest.DocTestCase.__init__(self, test,
-                                     optionflags=optionflags,
-                                     setUp=setUp, tearDown=tearDown,
-                                     checker=checker)
-
-
-print_state = numpy.get_printoptions()
-
-class NumpyDoctest(npd.Doctest):
-    name = 'numpydoctest'   # call nosetests with --with-numpydoctest
-    score = 1000  # load late, after doctest builtin
-
-    # always use whitespace and ellipsis options for doctests
-    doctest_optflags = doctest.NORMALIZE_WHITESPACE | doctest.ELLIPSIS
-
-    # files that should be ignored for doctests
-    doctest_ignore = ['generate_numpy_api.py',
-                      'setup.py']
-
-    # Custom classes; class variables to allow subclassing
-    doctest_case_class = NumpyDocTestCase
-    out_check_class = NumpyOutputChecker
-    test_finder_class = NumpyDocTestFinder
-
-    # Don't use the standard doctest option handler; hard-code the option values
-    def options(self, parser, env=os.environ):
-        Plugin.options(self, parser, env)
-        # Test doctests in 'test' files / directories. Standard plugin default
-        # is False
-        self.doctest_tests = True
-        # Variable name; if defined, doctest results stored in this variable in
-        # the top-level namespace.  None is the standard default
-        self.doctest_result_var = None
-
-    def configure(self, options, config):
-        # parent method sets enabled flag from command line --with-numpydoctest
-        Plugin.configure(self, options, config)
-        self.finder = self.test_finder_class()
-        self.parser = doctest.DocTestParser()
-        if self.enabled:
-            # Pull standard doctest out of plugin list; there's no reason to run
-            # both.  In practice the Unplugger plugin above would cover us when
-            # run from a standard numpy.test() call; this is just in case
-            # someone wants to run our plugin outside the numpy.test() machinery
-            config.plugins.plugins = [p for p in config.plugins.plugins
-                                      if p.name != 'doctest']
-
-    def set_test_context(self, test):
-        """ Configure `test` object to set test context
-
-        We set the numpy / scipy standard doctest namespace
-
-        Parameters
-        ----------
-        test : test object
-            with ``globs`` dictionary defining namespace
-
-        Returns
-        -------
-        None
-
-        Notes
-        -----
-        `test` object modified in place
-        """
-        # set the namespace for tests
-        pkg_name = get_package_name(os.path.dirname(test.filename))
-
-        # Each doctest should execute in an environment equivalent to
-        # starting Python and executing "import numpy as np", and,
-        # for SciPy packages, an additional import of the local
-        # package (so that scipy.linalg.basic.py's doctests have an
-        # implicit "from scipy import linalg" as well).
-        #
-        # Note: __file__ allows the doctest in NoseTester to run
-        # without producing an error
-        test.globs = {'__builtins__':__builtins__,
-                      '__file__':'__main__',
-                      '__name__':'__main__',
-                      'np':numpy}
-        # add appropriate scipy import for SciPy tests
-        if 'scipy' in pkg_name:
-            p = pkg_name.split('.')
-            p2 = p[-1]
-            test.globs[p2] = __import__(pkg_name, test.globs, {}, [p2])
-
-    # Override test loading to customize test context (with set_test_context
-    # method), set standard docstring options, and install our own test output
-    # checker
-    def loadTestsFromModule(self, module):
-        if not self.matches(module.__name__):
-            npd.log.debug("Doctest doesn't want module %s", module)
-            return
-        try:
-            tests = self.finder.find(module)
-        except AttributeError:
-            # nose allows module.__test__ = False; doctest does not and
-            # throws AttributeError
-            return
-        if not tests:
-            return
-        tests.sort()
-        module_file = src(module.__file__)
-        for test in tests:
-            if not test.examples:
-                continue
-            if not test.filename:
-                test.filename = module_file
-            # Set test namespace; test altered in place
-            self.set_test_context(test)
-            yield self.doctest_case_class(test,
-                                          optionflags=self.doctest_optflags,
-                                          checker=self.out_check_class(),
-                                          result_var=self.doctest_result_var)
-
-    # Add an afterContext method to nose.plugins.doctests.Doctest in order
-    # to restore print options to the original state after each doctest
-    def afterContext(self):
-        numpy.set_printoptions(**print_state)
-
-    # Ignore NumPy-specific build files that shouldn't be searched for tests
-    def wantFile(self, file):
-        bn = os.path.basename(file)
-        if bn in self.doctest_ignore:
-            return False
-        return npd.Doctest.wantFile(self, file)
-
-
-class Unplugger:
-    """ Nose plugin to remove named plugin late in loading
-
-    By default it removes the "doctest" plugin.
-    """
-    name = 'unplugger'
-    enabled = True  # always enabled
-    score = 4000  # load late in order to be after builtins
-
-    def __init__(self, to_unplug='doctest'):
-        self.to_unplug = to_unplug
-
-    def options(self, parser, env):
-        pass
-
-    def configure(self, options, config):
-        # Pull named plugin out of plugins list
-        config.plugins.plugins = [p for p in config.plugins.plugins
-                                  if p.name != self.to_unplug]
-
-
-class KnownFailurePlugin(ErrorClassPlugin):
-    '''Plugin that installs a KNOWNFAIL error class for the
-    KnownFailureClass exception.  When KnownFailure is raised,
-    the exception will be logged in the knownfail attribute of the
-    result, 'K' or 'KNOWNFAIL' (verbose) will be output, and the
-    exception will not be counted as an error or failure.'''
-    enabled = True
-    knownfail = ErrorClass(KnownFailureException,
-                           label='KNOWNFAIL',
-                           isfailure=False)
-
-    def options(self, parser, env=os.environ):
-        env_opt = 'NOSE_WITHOUT_KNOWNFAIL'
-        parser.add_option('--no-knownfail', action='store_true',
-                          dest='noKnownFail', default=env.get(env_opt, False),
-                          help='Disable special handling of KnownFailure '
-                               'exceptions')
-
-    def configure(self, options, conf):
-        if not self.can_configure:
-            return
-        self.conf = conf
-        disable = getattr(options, 'noKnownFail', False)
-        if disable:
-            self.enabled = False
-
-KnownFailure = KnownFailurePlugin   # backwards compat
-
-
-class FPUModeCheckPlugin(Plugin):
-    """
-    Plugin that checks the FPU mode before and after each test,
-    raising failures if the test changed the mode.
-    """
-
-    def prepareTestCase(self, test):
-        from numpy.core._multiarray_tests import get_fpu_mode
-
-        def run(result):
-            old_mode = get_fpu_mode()
-            test.test(result)
-            new_mode = get_fpu_mode()
-
-            if old_mode != new_mode:
-                try:
-                    raise AssertionError(
-                        "FPU mode changed from {0:#x} to {1:#x} during the "
-                        "test".format(old_mode, new_mode))
-                except AssertionError:
-                    result.addFailure(test, sys.exc_info())
-
-        return run
-
-
-# Class allows us to save the results of the tests in runTests - see runTests
-# method docstring for details
-class NumpyTestProgram(nose.core.TestProgram):
-    def runTests(self):
-        """Run Tests. Returns true on success, false on failure, and
-        sets self.success to the same value.
-
-        Because nose currently discards the test result object, but we need
-        to return it to the user, override TestProgram.runTests to retain
-        the result
-        """
-        if self.testRunner is None:
-            self.testRunner = nose.core.TextTestRunner(stream=self.config.stream,
-                                                       verbosity=self.config.verbosity,
-                                                       config=self.config)
-        plug_runner = self.config.plugins.prepareTestRunner(self.testRunner)
-        if plug_runner is not None:
-            self.testRunner = plug_runner
-        self.result = self.testRunner.run(self.test)
-        self.success = self.result.wasSuccessful()
-        return self.success
diff --git a/numpy/testing/_private/nosetester.py b/numpy/testing/_private/nosetester.py
deleted file mode 100644
index bccec8236..000000000
--- a/numpy/testing/_private/nosetester.py
+++ /dev/null
@@ -1,545 +0,0 @@
-"""
-Nose test running.
-
-This module implements ``test()`` and ``bench()`` functions for NumPy modules.
-
-"""
-import os
-import sys
-import warnings
-import numpy as np
-
-from .utils import import_nose, suppress_warnings
-
-
-__all__ = ['get_package_name', 'run_module_suite', 'NoseTester',
-           '_numpy_tester', 'get_package_name', 'import_nose',
-           'suppress_warnings']
-
-
-def get_package_name(filepath):
-    """
-    Given a path where a package is installed, determine its name.
-
-    Parameters
-    ----------
-    filepath : str
-        Path to a file. If the determination fails, "numpy" is returned.
-
-    Examples
-    --------
-    >>> np.testing.nosetester.get_package_name('nonsense')
-    'numpy'
-
-    """
-
-    fullpath = filepath[:]
-    pkg_name = []
-    while 'site-packages' in filepath or 'dist-packages' in filepath:
-        filepath, p2 = os.path.split(filepath)
-        if p2 in ('site-packages', 'dist-packages'):
-            break
-        pkg_name.append(p2)
-
-    # if package name determination failed, just default to numpy/scipy
-    if not pkg_name:
-        if 'scipy' in fullpath:
-            return 'scipy'
-        else:
-            return 'numpy'
-
-    # otherwise, reverse to get correct order and return
-    pkg_name.reverse()
-
-    # don't include the outer egg directory
-    if pkg_name[0].endswith('.egg'):
-        pkg_name.pop(0)
-
-    return '.'.join(pkg_name)
-
-
-def run_module_suite(file_to_run=None, argv=None):
-    """
-    Run a test module.
-
-    Equivalent to calling ``$ nosetests <argv> <file_to_run>`` from
-    the command line
-
-    Parameters
-    ----------
-    file_to_run : str, optional
-        Path to test module, or None.
-        By default, run the module from which this function is called.
-    argv : list of strings
-        Arguments to be passed to the nose test runner. ``argv[0]`` is
-        ignored. All command line arguments accepted by ``nosetests``
-        will work. If it is the default value None, sys.argv is used.
-
-        .. versionadded:: 1.9.0
-
-    Examples
-    --------
-    Adding the following::
-
-        if __name__ == "__main__" :
-            run_module_suite(argv=sys.argv)
-
-    at the end of a test module will run the tests when that module is
-    called in the python interpreter.
-
-    Alternatively, calling::
-
-    >>> run_module_suite(file_to_run="numpy/tests/test_matlib.py")  # doctest: +SKIP
-
-    from an interpreter will run all the test routine in 'test_matlib.py'.
-    """
-    if file_to_run is None:
-        f = sys._getframe(1)
-        file_to_run = f.f_locals.get('__file__', None)
-        if file_to_run is None:
-            raise AssertionError
-
-    if argv is None:
-        argv = sys.argv + [file_to_run]
-    else:
-        argv = argv + [file_to_run]
-
-    nose = import_nose()
-    from .noseclasses import KnownFailurePlugin
-    nose.run(argv=argv, addplugins=[KnownFailurePlugin()])
-
-
-class NoseTester:
-    """
-    Nose test runner.
-
-    This class is made available as numpy.testing.Tester, and a test function
-    is typically added to a package's __init__.py like so::
-
-      from numpy.testing import Tester
-      test = Tester().test
-
-    Calling this test function finds and runs all tests associated with the
-    package and all its sub-packages.
-
-    Attributes
-    ----------
-    package_path : str
-        Full path to the package to test.
-    package_name : str
-        Name of the package to test.
-
-    Parameters
-    ----------
-    package : module, str or None, optional
-        The package to test. If a string, this should be the full path to
-        the package. If None (default), `package` is set to the module from
-        which `NoseTester` is initialized.
-    raise_warnings : None, str or sequence of warnings, optional
-        This specifies which warnings to configure as 'raise' instead
-        of being shown once during the test execution.  Valid strings are:
-
-          - "develop" : equals ``(Warning,)``
-          - "release" : equals ``()``, don't raise on any warnings.
-
-        Default is "release".
-    depth : int, optional
-        If `package` is None, then this can be used to initialize from the
-        module of the caller of (the caller of (...)) the code that
-        initializes `NoseTester`. Default of 0 means the module of the
-        immediate caller; higher values are useful for utility routines that
-        want to initialize `NoseTester` objects on behalf of other code.
-
-    """
-    def __init__(self, package=None, raise_warnings="release", depth=0,
-                 check_fpu_mode=False):
-        # Back-compat: 'None' used to mean either "release" or "develop"
-        # depending on whether this was a release or develop version of
-        # numpy. Those semantics were fine for testing numpy, but not so
-        # helpful for downstream projects like scipy that use
-        # numpy.testing. (They want to set this based on whether *they* are a
-        # release or develop version, not whether numpy is.) So we continue to
-        # accept 'None' for back-compat, but it's now just an alias for the
-        # default "release".
-        if raise_warnings is None:
-            raise_warnings = "release"
-
-        package_name = None
-        if package is None:
-            f = sys._getframe(1 + depth)
-            package_path = f.f_locals.get('__file__', None)
-            if package_path is None:
-                raise AssertionError
-            package_path = os.path.dirname(package_path)
-            package_name = f.f_locals.get('__name__', None)
-        elif isinstance(package, type(os)):
-            package_path = os.path.dirname(package.__file__)
-            package_name = getattr(package, '__name__', None)
-        else:
-            package_path = str(package)
-
-        self.package_path = package_path
-
-        # Find the package name under test; this name is used to limit coverage
-        # reporting (if enabled).
-        if package_name is None:
-            package_name = get_package_name(package_path)
-        self.package_name = package_name
-
-        # Set to "release" in constructor in maintenance branches.
-        self.raise_warnings = raise_warnings
-
-        # Whether to check for FPU mode changes
-        self.check_fpu_mode = check_fpu_mode
-
-    def _test_argv(self, label, verbose, extra_argv):
-        ''' Generate argv for nosetest command
-
-        Parameters
-        ----------
-        label : {'fast', 'full', '', attribute identifier}, optional
-            see ``test`` docstring
-        verbose : int, optional
-            Verbosity value for test outputs, in the range 1-10. Default is 1.
-        extra_argv : list, optional
-            List with any extra arguments to pass to nosetests.
-
-        Returns
-        -------
-        argv : list
-            command line arguments that will be passed to nose
-        '''
-        argv = [__file__, self.package_path, '-s']
-        if label and label != 'full':
-            if not isinstance(label, str):
-                raise TypeError('Selection label should be a string')
-            if label == 'fast':
-                label = 'not slow'
-            argv += ['-A', label]
-        argv += ['--verbosity', str(verbose)]
-
-        # When installing with setuptools, and also in some other cases, the
-        # test_*.py files end up marked +x executable. Nose, by default, does
-        # not run files marked with +x as they might be scripts. However, in
-        # our case nose only looks for test_*.py files under the package
-        # directory, which should be safe.
-        argv += ['--exe']
-
-        if extra_argv:
-            argv += extra_argv
-        return argv
-
-    def _show_system_info(self):
-        nose = import_nose()
-
-        import numpy
-        print(f'NumPy version {numpy.__version__}')
-        relaxed_strides = numpy.ones((10, 1), order="C").flags.f_contiguous
-        print("NumPy relaxed strides checking option:", relaxed_strides)
-        npdir = os.path.dirname(numpy.__file__)
-        print(f'NumPy is installed in {npdir}')
-
-        if 'scipy' in self.package_name:
-            import scipy
-            print(f'SciPy version {scipy.__version__}')
-            spdir = os.path.dirname(scipy.__file__)
-            print(f'SciPy is installed in {spdir}')
-
-        pyversion = sys.version.replace('\n', '')
-        print(f'Python version {pyversion}')
-        print("nose version %d.%d.%d" % nose.__versioninfo__)
-
-    def _get_custom_doctester(self):
-        """ Return instantiated plugin for doctests
-
-        Allows subclassing of this class to override doctester
-
-        A return value of None means use the nose builtin doctest plugin
-        """
-        from .noseclasses import NumpyDoctest
-        return NumpyDoctest()
-
-    def prepare_test_args(self, label='fast', verbose=1, extra_argv=None,
-                          doctests=False, coverage=False, timer=False):
-        """
-        Run tests for module using nose.
-
-        This method does the heavy lifting for the `test` method. It takes all
-        the same arguments, for details see `test`.
-
-        See Also
-        --------
-        test
-
-        """
-        # fail with nice error message if nose is not present
-        import_nose()
-        # compile argv
-        argv = self._test_argv(label, verbose, extra_argv)
-        # our way of doing coverage
-        if coverage:
-            argv += [f'--cover-package={self.package_name}', '--with-coverage',
-                   '--cover-tests', '--cover-erase']
-
-        if timer:
-            if timer is True:
-                argv += ['--with-timer']
-            elif isinstance(timer, int):
-                argv += ['--with-timer', '--timer-top-n', str(timer)]
-
-        # construct list of plugins
-        import nose.plugins.builtin
-        from nose.plugins import EntryPointPluginManager
-        from .noseclasses import (KnownFailurePlugin, Unplugger,
-                                  FPUModeCheckPlugin)
-        plugins = [KnownFailurePlugin()]
-        plugins += [p() for p in nose.plugins.builtin.plugins]
-        if self.check_fpu_mode:
-            plugins += [FPUModeCheckPlugin()]
-            argv += ["--with-fpumodecheckplugin"]
-        try:
-            # External plugins (like nose-timer)
-            entrypoint_manager = EntryPointPluginManager()
-            entrypoint_manager.loadPlugins()
-            plugins += [p for p in entrypoint_manager.plugins]
-        except ImportError:
-            # Relies on pkg_resources, not a hard dependency
-            pass
-
-        # add doctesting if required
-        doctest_argv = '--with-doctest' in argv
-        if doctests == False and doctest_argv:
-            doctests = True
-        plug = self._get_custom_doctester()
-        if plug is None:
-            # use standard doctesting
-            if doctests and not doctest_argv:
-                argv += ['--with-doctest']
-        else:  # custom doctesting
-            if doctest_argv:  # in fact the unplugger would take care of this
-                argv.remove('--with-doctest')
-            plugins += [Unplugger('doctest'), plug]
-            if doctests:
-                argv += ['--with-' + plug.name]
-        return argv, plugins
-
-    def test(self, label='fast', verbose=1, extra_argv=None,
-             doctests=False, coverage=False, raise_warnings=None,
-             timer=False):
-        """
-        Run tests for module using nose.
-
-        Parameters
-        ----------
-        label : {'fast', 'full', '', attribute identifier}, optional
-            Identifies the tests to run. This can be a string to pass to
-            the nosetests executable with the '-A' option, or one of several
-            special values.  Special values are:
-
-            * 'fast' - the default - which corresponds to the ``nosetests -A``
-              option of 'not slow'.
-            * 'full' - fast (as above) and slow tests as in the
-              'no -A' option to nosetests - this is the same as ''.
-            * None or '' - run all tests.
-            * attribute_identifier - string passed directly to nosetests as '-A'.
-
-        verbose : int, optional
-            Verbosity value for test outputs, in the range 1-10. Default is 1.
-        extra_argv : list, optional
-            List with any extra arguments to pass to nosetests.
-        doctests : bool, optional
-            If True, run doctests in module. Default is False.
-        coverage : bool, optional
-            If True, report coverage of NumPy code. Default is False.
-            (This requires the
-            `coverage module <https://pypi.org/project/coverage/>`_).
-        raise_warnings : None, str or sequence of warnings, optional
-            This specifies which warnings to configure as 'raise' instead
-            of being shown once during the test execution. Valid strings are:
-
-            * "develop" : equals ``(Warning,)``
-            * "release" : equals ``()``, do not raise on any warnings.
-        timer : bool or int, optional
-            Timing of individual tests with ``nose-timer`` (which needs to be
-            installed).  If True, time tests and report on all of them.
-            If an integer (say ``N``), report timing results for ``N`` slowest
-            tests.
-
-        Returns
-        -------
-        result : object
-            Returns the result of running the tests as a
-            ``nose.result.TextTestResult`` object.
-
-        Notes
-        -----
-        Each NumPy module exposes `test` in its namespace to run all tests for it.
-        For example, to run all tests for numpy.lib:
-
-        >>> np.lib.test() #doctest: +SKIP
-
-        Examples
-        --------
-        >>> result = np.lib.test() #doctest: +SKIP
-        Running unit tests for numpy.lib
-        ...
-        Ran 976 tests in 3.933s
-
-        OK
-
-        >>> result.errors #doctest: +SKIP
-        []
-        >>> result.knownfail #doctest: +SKIP
-        []
-        """
-
-        # cap verbosity at 3 because nose becomes *very* verbose beyond that
-        verbose = min(verbose, 3)
-
-        from . import utils
-        utils.verbose = verbose
-
-        argv, plugins = self.prepare_test_args(
-                label, verbose, extra_argv, doctests, coverage, timer)
-
-        if doctests:
-            print(f'Running unit tests and doctests for {self.package_name}')
-        else:
-            print(f'Running unit tests for {self.package_name}')
-
-        self._show_system_info()
-
-        # reset doctest state on every run
-        import doctest
-        doctest.master = None
-
-        if raise_warnings is None:
-            raise_warnings = self.raise_warnings
-
-        _warn_opts = dict(develop=(Warning,),
-                          release=())
-        if isinstance(raise_warnings, str):
-            raise_warnings = _warn_opts[raise_warnings]
-
-        with suppress_warnings("location") as sup:
-            # Reset the warning filters to the default state,
-            # so that running the tests is more repeatable.
-            warnings.resetwarnings()
-            # Set all warnings to 'warn', this is because the default 'once'
-            # has the bad property of possibly shadowing later warnings.
-            warnings.filterwarnings('always')
-            # Force the requested warnings to raise
-            for warningtype in raise_warnings:
-                warnings.filterwarnings('error', category=warningtype)
-            # Filter out annoying import messages.
-            sup.filter(message='Not importing directory')
-            sup.filter(message="numpy.dtype size changed")
-            sup.filter(message="numpy.ufunc size changed")
-            sup.filter(category=np.ModuleDeprecationWarning)
-            # Filter out boolean '-' deprecation messages. This allows
-            # older versions of scipy to test without a flood of messages.
-            sup.filter(message=".*boolean negative.*")
-            sup.filter(message=".*boolean subtract.*")
-            # Filter out distutils cpu warnings (could be localized to
-            # distutils tests). ASV has problems with top level import,
-            # so fetch module for suppression here.
-            with warnings.catch_warnings():
-                warnings.simplefilter("always")
-                from ...distutils import cpuinfo
-            sup.filter(category=UserWarning, module=cpuinfo)
-            # Filter out some deprecation warnings inside nose 1.3.7 when run
-            # on python 3.5b2. See
-            #     https://github.com/nose-devs/nose/issues/929
-            # Note: it is hard to filter based on module for sup (lineno could
-            #       be implemented).
-            warnings.filterwarnings("ignore", message=".*getargspec.*",
-                                    category=DeprecationWarning,
-                                    module=r"nose\.")
-
-            from .noseclasses import NumpyTestProgram
-
-            t = NumpyTestProgram(argv=argv, exit=False, plugins=plugins)
-
-        return t.result
-
-    def bench(self, label='fast', verbose=1, extra_argv=None):
-        """
-        Run benchmarks for module using nose.
-
-        Parameters
-        ----------
-        label : {'fast', 'full', '', attribute identifier}, optional
-            Identifies the benchmarks to run. This can be a string to pass to
-            the nosetests executable with the '-A' option, or one of several
-            special values.  Special values are:
-
-            * 'fast' - the default - which corresponds to the ``nosetests -A``
-              option of 'not slow'.
-            * 'full' - fast (as above) and slow benchmarks as in the
-              'no -A' option to nosetests - this is the same as ''.
-            * None or '' - run all tests.
-            * attribute_identifier - string passed directly to nosetests as '-A'.
-
-        verbose : int, optional
-            Verbosity value for benchmark outputs, in the range 1-10. Default is 1.
-        extra_argv : list, optional
-            List with any extra arguments to pass to nosetests.
-
-        Returns
-        -------
-        success : bool
-            Returns True if running the benchmarks works, False if an error
-            occurred.
-
-        Notes
-        -----
-        Benchmarks are like tests, but have names starting with "bench" instead
-        of "test", and can be found under the "benchmarks" sub-directory of the
-        module.
-
-        Each NumPy module exposes `bench` in its namespace to run all benchmarks
-        for it.
-
-        Examples
-        --------
-        >>> success = np.lib.bench() #doctest: +SKIP
-        Running benchmarks for numpy.lib
-        ...
-        using 562341 items:
-        unique:
-        0.11
-        unique1d:
-        0.11
-        ratio: 1.0
-        nUnique: 56230 == 56230
-        ...
-        OK
-
-        >>> success #doctest: +SKIP
-        True
-
-        """
-
-        print(f'Running benchmarks for {self.package_name}')
-        self._show_system_info()
-
-        argv = self._test_argv(label, verbose, extra_argv)
-        argv += ['--match', r'(?:^|[\\b_\\.%s-])[Bb]ench' % os.sep]
-
-        # import nose or make informative error
-        nose = import_nose()
-
-        # get plugin to disable doctests
-        from .noseclasses import Unplugger
-        add_plugins = [Unplugger('doctest')]
-
-        return nose.run(argv=argv, addplugins=add_plugins)
-
-
-def _numpy_tester():
-    if hasattr(np, "__version__") and ".dev0" in np.__version__:
-        mode = "develop"
-    else:
-        mode = "release"
-    return NoseTester(raise_warnings=mode, depth=1,
-                      check_fpu_mode=True)
diff --git a/numpy/testing/_private/parameterized.py b/numpy/testing/_private/parameterized.py
deleted file mode 100644
index 3a29a1811..000000000
--- a/numpy/testing/_private/parameterized.py
+++ /dev/null
@@ -1,432 +0,0 @@
-"""
-tl;dr: all code is licensed under simplified BSD, unless stated otherwise.
-
-Unless stated otherwise in the source files, all code is copyright 2010 David
-Wolever <david@wolever.net>. All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-   1. Redistributions of source code must retain the above copyright notice,
-   this list of conditions and the following disclaimer.
-
-   2. Redistributions in binary form must reproduce the above copyright notice,
-   this list of conditions and the following disclaimer in the documentation
-   and/or other materials provided with the distribution.
-
-THIS SOFTWARE IS PROVIDED BY <COPYRIGHT HOLDER> ``AS IS'' AND ANY EXPRESS OR
-IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
-EVENT SHALL <COPYRIGHT HOLDER> OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
-INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
-OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
-ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-The views and conclusions contained in the software and documentation are those
-of the authors and should not be interpreted as representing official policies,
-either expressed or implied, of David Wolever.
-
-"""
-import re
-import inspect
-import warnings
-from functools import wraps
-from types import MethodType
-from collections import namedtuple
-
-from unittest import TestCase
-
-_param = namedtuple("param", "args kwargs")
-
-class param(_param):
-    """ Represents a single parameter to a test case.
-
-        For example::
-
-            >>> p = param("foo", bar=16)
-            >>> p
-            param("foo", bar=16)
-            >>> p.args
-            ('foo', )
-            >>> p.kwargs
-            {'bar': 16}
-
-        Intended to be used as an argument to ``@parameterized``::
-
-            @parameterized([
-                param("foo", bar=16),
-            ])
-            def test_stuff(foo, bar=16):
-                pass
-        """
-
-    def __new__(cls, *args , **kwargs):
-        return _param.__new__(cls, args, kwargs)
-
-    @classmethod
-    def explicit(cls, args=None, kwargs=None):
-        """ Creates a ``param`` by explicitly specifying ``args`` and
-            ``kwargs``::
-
-                >>> param.explicit([1,2,3])
-                param(*(1, 2, 3))
-                >>> param.explicit(kwargs={"foo": 42})
-                param(*(), **{"foo": "42"})
-            """
-        args = args or ()
-        kwargs = kwargs or {}
-        return cls(*args, **kwargs)
-
-    @classmethod
-    def from_decorator(cls, args):
-        """ Returns an instance of ``param()`` for ``@parameterized`` argument
-            ``args``::
-
-                >>> param.from_decorator((42, ))
-                param(args=(42, ), kwargs={})
-                >>> param.from_decorator("foo")
-                param(args=("foo", ), kwargs={})
-            """
-        if isinstance(args, param):
-            return args
-        elif isinstance(args, (str,)):
-            args = (args, )
-        try:
-            return cls(*args)
-        except TypeError as e:
-            if "after * must be" not in str(e):
-                raise
-            raise TypeError(
-                "Parameters must be tuples, but %r is not (hint: use '(%r, )')"
-                %(args, args),
-            )
-
-    def __repr__(self):
-        return "param(*%r, **%r)" %self
-
-
-def parameterized_argument_value_pairs(func, p):
-    """Return tuples of parameterized arguments and their values.
-
-        This is useful if you are writing your own doc_func
-        function and need to know the values for each parameter name::
-
-            >>> def func(a, foo=None, bar=42, **kwargs): pass
-            >>> p = param(1, foo=7, extra=99)
-            >>> parameterized_argument_value_pairs(func, p)
-            [("a", 1), ("foo", 7), ("bar", 42), ("**kwargs", {"extra": 99})]
-
-        If the function's first argument is named ``self`` then it will be
-        ignored::
-
-            >>> def func(self, a): pass
-            >>> p = param(1)
-            >>> parameterized_argument_value_pairs(func, p)
-            [("a", 1)]
-
-        Additionally, empty ``*args`` or ``**kwargs`` will be ignored::
-
-            >>> def func(foo, *args): pass
-            >>> p = param(1)
-            >>> parameterized_argument_value_pairs(func, p)
-            [("foo", 1)]
-            >>> p = param(1, 16)
-            >>> parameterized_argument_value_pairs(func, p)
-            [("foo", 1), ("*args", (16, ))]
-    """
-    argspec = inspect.getargspec(func)
-    arg_offset = 1 if argspec.args[:1] == ["self"] else 0
-
-    named_args = argspec.args[arg_offset:]
-
-    result = list(zip(named_args, p.args))
-    named_args = argspec.args[len(result) + arg_offset:]
-    varargs = p.args[len(result):]
-
-    result.extend([
-        (name, p.kwargs.get(name, default))
-        for (name, default)
-        in zip(named_args, argspec.defaults or [])
-    ])
-
-    seen_arg_names = {n for (n, _) in result}
-    keywords = dict(sorted([
-        (name, p.kwargs[name])
-        for name in p.kwargs
-        if name not in seen_arg_names
-    ]))
-
-    if varargs:
-        result.append(("*%s" %(argspec.varargs, ), tuple(varargs)))
-
-    if keywords:
-        result.append(("**%s" %(argspec.keywords, ), keywords))
-
-    return result
-
-def short_repr(x, n=64):
-    """ A shortened repr of ``x`` which is guaranteed to be ``unicode``::
-
-            >>> short_repr("foo")
-            u"foo"
-            >>> short_repr("123456789", n=4)
-            u"12...89"
-    """
-
-    x_repr = repr(x)
-    if isinstance(x_repr, bytes):
-        try:
-            x_repr = str(x_repr, "utf-8")
-        except UnicodeDecodeError:
-            x_repr = str(x_repr, "latin1")
-    if len(x_repr) > n:
-        x_repr = x_repr[:n//2] + "..." + x_repr[len(x_repr) - n//2:]
-    return x_repr
-
-def default_doc_func(func, num, p):
-    if func.__doc__ is None:
-        return None
-
-    all_args_with_values = parameterized_argument_value_pairs(func, p)
-
-    # Assumes that the function passed is a bound method.
-    descs = [f'{n}={short_repr(v)}' for n, v in all_args_with_values]
-
-    # The documentation might be a multiline string, so split it
-    # and just work with the first string, ignoring the period
-    # at the end if there is one.
-    first, nl, rest = func.__doc__.lstrip().partition("\n")
-    suffix = ""
-    if first.endswith("."):
-        suffix = "."
-        first = first[:-1]
-    args = "%s[with %s]" %(len(first) and " " or "", ", ".join(descs))
-    return "".join([first.rstrip(), args, suffix, nl, rest])
-
-def default_name_func(func, num, p):
-    base_name = func.__name__
-    name_suffix = "_%s" %(num, )
-    if len(p.args) > 0 and isinstance(p.args[0], (str,)):
-        name_suffix += "_" + parameterized.to_safe_name(p.args[0])
-    return base_name + name_suffix
-
-
-# force nose for numpy purposes.
-_test_runner_override = 'nose'
-_test_runner_guess = False
-_test_runners = set(["unittest", "unittest2", "nose", "nose2", "pytest"])
-_test_runner_aliases = {
-    "_pytest": "pytest",
-}
-
-def set_test_runner(name):
-    global _test_runner_override
-    if name not in _test_runners:
-        raise TypeError(
-            "Invalid test runner: %r (must be one of: %s)"
-            %(name, ", ".join(_test_runners)),
-        )
-    _test_runner_override = name
-
-def detect_runner():
-    """ Guess which test runner we're using by traversing the stack and looking
-        for the first matching module. This *should* be reasonably safe, as
-        it's done during test discovery where the test runner should be the
-        stack frame immediately outside. """
-    if _test_runner_override is not None:
-        return _test_runner_override
-    global _test_runner_guess
-    if _test_runner_guess is False:
-        stack = inspect.stack()
-        for record in reversed(stack):
-            frame = record[0]
-            module = frame.f_globals.get("__name__").partition(".")[0]
-            if module in _test_runner_aliases:
-                module = _test_runner_aliases[module]
-            if module in _test_runners:
-                _test_runner_guess = module
-                break
-        else:
-            _test_runner_guess = None
-    return _test_runner_guess
-
-class parameterized:
-    """ Parameterize a test case::
-
-            class TestInt:
-                @parameterized([
-                    ("A", 10),
-                    ("F", 15),
-                    param("10", 42, base=42)
-                ])
-                def test_int(self, input, expected, base=16):
-                    actual = int(input, base=base)
-                    assert_equal(actual, expected)
-
-            @parameterized([
-                (2, 3, 5)
-                (3, 5, 8),
-            ])
-            def test_add(a, b, expected):
-                assert_equal(a + b, expected)
-        """
-
-    def __init__(self, input, doc_func=None):
-        self.get_input = self.input_as_callable(input)
-        self.doc_func = doc_func or default_doc_func
-
-    def __call__(self, test_func):
-        self.assert_not_in_testcase_subclass()
-
-        @wraps(test_func)
-        def wrapper(test_self=None):
-            test_cls = test_self and type(test_self)
-
-            original_doc = wrapper.__doc__
-            for num, args in enumerate(wrapper.parameterized_input):
-                p = param.from_decorator(args)
-                unbound_func, nose_tuple = self.param_as_nose_tuple(test_self, test_func, num, p)
-                try:
-                    wrapper.__doc__ = nose_tuple[0].__doc__
-                    # Nose uses `getattr(instance, test_func.__name__)` to get
-                    # a method bound to the test instance (as opposed to a
-                    # method bound to the instance of the class created when
-                    # tests were being enumerated). Set a value here to make
-                    # sure nose can get the correct test method.
-                    if test_self is not None:
-                        setattr(test_cls, test_func.__name__, unbound_func)
-                    yield nose_tuple
-                finally:
-                    if test_self is not None:
-                        delattr(test_cls, test_func.__name__)
-                    wrapper.__doc__ = original_doc
-        wrapper.parameterized_input = self.get_input()
-        wrapper.parameterized_func = test_func
-        test_func.__name__ = "_parameterized_original_%s" %(test_func.__name__, )
-        return wrapper
-
-    def param_as_nose_tuple(self, test_self, func, num, p):
-        nose_func = wraps(func)(lambda *args: func(*args[:-1], **args[-1]))
-        nose_func.__doc__ = self.doc_func(func, num, p)
-        # Track the unbound function because we need to setattr the unbound
-        # function onto the class for nose to work (see comments above), and
-        # Python 3 doesn't let us pull the function out of a bound method.
-        unbound_func = nose_func
-        if test_self is not None:
-            nose_func = MethodType(nose_func, test_self)
-        return unbound_func, (nose_func, ) + p.args + (p.kwargs or {}, )
-
-    def assert_not_in_testcase_subclass(self):
-        parent_classes = self._terrible_magic_get_defining_classes()
-        if any(issubclass(cls, TestCase) for cls in parent_classes):
-            raise Exception("Warning: '@parameterized' tests won't work "
-                            "inside subclasses of 'TestCase' - use "
-                            "'@parameterized.expand' instead.")
-
-    def _terrible_magic_get_defining_classes(self):
-        """ Returns the list of parent classes of the class currently being defined.
-            Will likely only work if called from the ``parameterized`` decorator.
-            This function is entirely @brandon_rhodes's fault, as he suggested
-            the implementation: http://stackoverflow.com/a/8793684/71522
-            """
-        stack = inspect.stack()
-        if len(stack) <= 4:
-            return []
-        frame = stack[4]
-        code_context = frame[4] and frame[4][0].strip()
-        if not (code_context and code_context.startswith("class ")):
-            return []
-        _, _, parents = code_context.partition("(")
-        parents, _, _ = parents.partition(")")
-        return eval("[" + parents + "]", frame[0].f_globals, frame[0].f_locals)
-
-    @classmethod
-    def input_as_callable(cls, input):
-        if callable(input):
-            return lambda: cls.check_input_values(input())
-        input_values = cls.check_input_values(input)
-        return lambda: input_values
-
-    @classmethod
-    def check_input_values(cls, input_values):
-        # Explicitly convert non-list inputs to a list so that:
-        # 1. A helpful exception will be raised if they aren't iterable, and
-        # 2. Generators are unwrapped exactly once (otherwise `nosetests
-        #    --processes=n` has issues; see:
-        #    https://github.com/wolever/nose-parameterized/pull/31)
-        if not isinstance(input_values, list):
-            input_values = list(input_values)
-        return [ param.from_decorator(p) for p in input_values ]
-
-    @classmethod
-    def expand(cls, input, name_func=None, doc_func=None, **legacy):
-        """ A "brute force" method of parameterizing test cases. Creates new
-            test cases and injects them into the namespace that the wrapped
-            function is being defined in. Useful for parameterizing tests in
-            subclasses of 'UnitTest', where Nose test generators don't work.
-
-            >>> @parameterized.expand([("foo", 1, 2)])
-            ... def test_add1(name, input, expected):
-            ...     actual = add1(input)
-            ...     assert_equal(actual, expected)
-            ...
-            >>> locals()
-            ... 'test_add1_foo_0': <function ...> ...
-            >>>
-            """
-
-        if "testcase_func_name" in legacy:
-            warnings.warn("testcase_func_name= is deprecated; use name_func=",
-                          DeprecationWarning, stacklevel=2)
-            if not name_func:
-                name_func = legacy["testcase_func_name"]
-
-        if "testcase_func_doc" in legacy:
-            warnings.warn("testcase_func_doc= is deprecated; use doc_func=",
-                          DeprecationWarning, stacklevel=2)
-            if not doc_func:
-                doc_func = legacy["testcase_func_doc"]
-
-        doc_func = doc_func or default_doc_func
-        name_func = name_func or default_name_func
-
-        def parameterized_expand_wrapper(f, instance=None):
-            stack = inspect.stack()
-            frame = stack[1]
-            frame_locals = frame[0].f_locals
-
-            parameters = cls.input_as_callable(input)()
-            for num, p in enumerate(parameters):
-                name = name_func(f, num, p)
-                frame_locals[name] = cls.param_as_standalone_func(p, f, name)
-                frame_locals[name].__doc__ = doc_func(f, num, p)
-
-            f.__test__ = False
-        return parameterized_expand_wrapper
-
-    @classmethod
-    def param_as_standalone_func(cls, p, func, name):
-        @wraps(func)
-        def standalone_func(*a):
-            return func(*(a + p.args), **p.kwargs)
-        standalone_func.__name__ = name
-
-        # place_as is used by py.test to determine what source file should be
-        # used for this test.
-        standalone_func.place_as = func
-
-        # Remove __wrapped__ because py.test will try to look at __wrapped__
-        # to determine which parameters should be used with this test case,
-        # and obviously we don't need it to do any parameterization.
-        try:
-            del standalone_func.__wrapped__
-        except AttributeError:
-            pass
-        return standalone_func
-
-    @classmethod
-    def to_safe_name(cls, s):
-        return str(re.sub("[^a-zA-Z0-9_]+", "_", s))
diff --git a/numpy/testing/_private/utils.py b/numpy/testing/_private/utils.py
index 26b2aac4d..cca7b8063 100644
--- a/numpy/testing/_private/utils.py
+++ b/numpy/testing/_private/utils.py
@@ -16,10 +16,12 @@ from tempfile import mkdtemp, mkstemp
 from unittest.case import SkipTest
 from warnings import WarningMessage
 import pprint
+import sysconfig
 
 import numpy as np
-from numpy.core import(
+from numpy.core import (
      intp, float32, empty, arange, array_repr, ndarray, isnat, array)
+from numpy import isfinite, isnan, isinf
 import numpy.linalg.lapack_lite
 
 from io import StringIO
@@ -29,14 +31,14 @@ __all__ = [
         'assert_array_equal', 'assert_array_less', 'assert_string_equal',
         'assert_array_almost_equal', 'assert_raises', 'build_err_msg',
         'decorate_methods', 'jiffies', 'memusage', 'print_assert_equal',
-        'raises', 'rundocs', 'runstring', 'verbose', 'measure',
+        'rundocs', 'runstring', 'verbose', 'measure',
         'assert_', 'assert_array_almost_equal_nulp', 'assert_raises_regex',
         'assert_array_max_ulp', 'assert_warns', 'assert_no_warnings',
         'assert_allclose', 'IgnoreException', 'clear_and_catch_warnings',
         'SkipTest', 'KnownFailureException', 'temppath', 'tempdir', 'IS_PYPY',
-        'HAS_REFCOUNT', 'suppress_warnings', 'assert_array_compare',
+        'HAS_REFCOUNT', "IS_WASM", 'suppress_warnings', 'assert_array_compare',
         'assert_no_gc_cycles', 'break_cycles', 'HAS_LAPACK64', 'IS_PYSTON',
-        '_OLD_PROMOTION'
+        '_OLD_PROMOTION', 'IS_MUSL', '_SUPPORTS_SVE'
         ]
 
 
@@ -48,6 +50,7 @@ class KnownFailureException(Exception):
 KnownFailureTest = KnownFailureException  # backwards compat
 verbose = 0
 
+IS_WASM = platform.machine() in ["wasm32", "wasm64"]
 IS_PYPY = sys.implementation.name == 'pypy'
 IS_PYSTON = hasattr(sys, "pyston_version_info")
 HAS_REFCOUNT = getattr(sys, 'getrefcount', None) is not None and not IS_PYSTON
@@ -55,27 +58,18 @@ HAS_LAPACK64 = numpy.linalg.lapack_lite._ilp64
 
 _OLD_PROMOTION = lambda: np._get_promotion_state() == 'legacy'
 
-
-def import_nose():
-    """ Import nose only when needed.
-    """
-    nose_is_good = True
-    minimum_nose_version = (1, 0, 0)
-    try:
-        import nose
-    except ImportError:
-        nose_is_good = False
-    else:
-        if nose.__versioninfo__ < minimum_nose_version:
-            nose_is_good = False
-
-    if not nose_is_good:
-        msg = ('Need nose >= %d.%d.%d for tests - see '
-               'https://nose.readthedocs.io' %
-               minimum_nose_version)
-        raise ImportError(msg)
-
-    return nose
+IS_MUSL = False
+try:
+    from packaging.tags import sys_tags
+    _tags = list(sys_tags())
+    if 'musllinux' in _tags[0].platform:
+        IS_MUSL = True
+except ImportError:
+    # fallback to sysconfig (might be flaky)
+    # value could be None.
+    v = sysconfig.get_config_var('HOST_GNU_TYPE') or ''
+    if 'musl' in v:
+        IS_MUSL = True
 
 
 def assert_(val, msg=''):
@@ -98,62 +92,6 @@ def assert_(val, msg=''):
         raise AssertionError(smsg)
 
 
-def gisnan(x):
-    """like isnan, but always raise an error if type not supported instead of
-    returning a TypeError object.
-
-    Notes
-    -----
-    isnan and other ufunc sometimes return a NotImplementedType object instead
-    of raising any exception. This function is a wrapper to make sure an
-    exception is always raised.
-
-    This should be removed once this problem is solved at the Ufunc level."""
-    from numpy.core import isnan
-    st = isnan(x)
-    if isinstance(st, type(NotImplemented)):
-        raise TypeError("isnan not supported for this type")
-    return st
-
-
-def gisfinite(x):
-    """like isfinite, but always raise an error if type not supported instead
-    of returning a TypeError object.
-
-    Notes
-    -----
-    isfinite and other ufunc sometimes return a NotImplementedType object
-    instead of raising any exception. This function is a wrapper to make sure
-    an exception is always raised.
-
-    This should be removed once this problem is solved at the Ufunc level."""
-    from numpy.core import isfinite, errstate
-    with errstate(invalid='ignore'):
-        st = isfinite(x)
-        if isinstance(st, type(NotImplemented)):
-            raise TypeError("isfinite not supported for this type")
-    return st
-
-
-def gisinf(x):
-    """like isinf, but always raise an error if type not supported instead of
-    returning a TypeError object.
-
-    Notes
-    -----
-    isinf and other ufunc sometimes return a NotImplementedType object instead
-    of raising any exception. This function is a wrapper to make sure an
-    exception is always raised.
-
-    This should be removed once this problem is solved at the Ufunc level."""
-    from numpy.core import isinf, errstate
-    with errstate(invalid='ignore'):
-        st = isinf(x)
-        if isinstance(st, type(NotImplemented)):
-            raise TypeError("isinf not supported for this type")
-    return st
-
-
 if os.name == 'nt':
     # Code "stolen" from enthought/debug/memusage.py
     def GetPerformanceAttributes(object, counter, instance=None,
@@ -197,7 +135,7 @@ elif sys.platform[:5] == 'linux':
 
         """
         try:
-            with open(_proc_pid_stat, 'r') as f:
+            with open(_proc_pid_stat) as f:
                 l = f.readline().split(' ')
             return int(l[22])
         except Exception:
@@ -224,7 +162,7 @@ if sys.platform[:5] == 'linux':
         if not _load_time:
             _load_time.append(time.time())
         try:
-            with open(_proc_pid_stat, 'r') as f:
+            with open(_proc_pid_stat) as f:
                 l = f.readline().split(' ')
             return int(l[13])
         except Exception:
@@ -397,8 +335,8 @@ def assert_equal(actual, desired, err_msg='', verbose=True):
 
     # Inf/nan/negative zero handling
     try:
-        isdesnan = gisnan(desired)
-        isactnan = gisnan(actual)
+        isdesnan = isnan(desired)
+        isactnan = isnan(actual)
         if isdesnan and isactnan:
             return  # both nan, so equal
 
@@ -408,7 +346,7 @@ def assert_equal(actual, desired, err_msg='', verbose=True):
         if (array_actual.dtype.char in 'Mm' or
                 array_desired.dtype.char in 'Mm'):
             # version 1.18
-            # until this version, gisnan failed for datetime64 and timedelta64.
+            # until this version, isnan failed for datetime64 and timedelta64.
             # Now it succeeds but comparison to scalar with a different type
             # emits a DeprecationWarning.
             # Avoid that by skipping the next check
@@ -477,7 +415,7 @@ def print_assert_equal(test_string, actual, desired):
 
 
 @np._no_nep50_warning()
-def assert_almost_equal(actual,desired,decimal=7,err_msg='',verbose=True):
+def assert_almost_equal(actual, desired, decimal=7, err_msg='', verbose=True):
     """
     Raises an AssertionError if two items are not equal up to desired
     precision.
@@ -589,9 +527,9 @@ def assert_almost_equal(actual,desired,decimal=7,err_msg='',verbose=True):
         # If one of desired/actual is not finite, handle it specially here:
         # check that both are nan if any is a nan, and test for equality
         # otherwise
-        if not (gisfinite(desired) and gisfinite(actual)):
-            if gisnan(desired) or gisnan(actual):
-                if not (gisnan(desired) and gisnan(actual)):
+        if not (isfinite(desired) and isfinite(actual)):
+            if isnan(desired) or isnan(actual):
+                if not (isnan(desired) and isnan(actual)):
                     raise AssertionError(_build_err_msg())
             else:
                 if not desired == actual:
@@ -604,7 +542,8 @@ def assert_almost_equal(actual,desired,decimal=7,err_msg='',verbose=True):
 
 
 @np._no_nep50_warning()
-def assert_approx_equal(actual,desired,significant=7,err_msg='',verbose=True):
+def assert_approx_equal(actual, desired, significant=7, err_msg='',
+                        verbose=True):
     """
     Raises an AssertionError if two items are not equal up to significant
     digits.
@@ -689,9 +628,9 @@ def assert_approx_equal(actual,desired,significant=7,err_msg='',verbose=True):
         # If one of desired/actual is not finite, handle it specially here:
         # check that both are nan if any is a nan, and test for equality
         # otherwise
-        if not (gisfinite(desired) and gisfinite(actual)):
-            if gisnan(desired) or gisnan(actual):
-                if not (gisnan(desired) and gisnan(actual)):
+        if not (isfinite(desired) and isfinite(actual)):
+            if isnan(desired) or isnan(actual):
+                if not (isnan(desired) and isnan(actual)):
                     raise AssertionError(msg)
             else:
                 if not desired == actual:
@@ -708,7 +647,8 @@ def assert_array_compare(comparison, x, y, err_msg='', verbose=True, header='',
                          precision=6, equal_nan=True, equal_inf=True,
                          *, strict=False):
     __tracebackhide__ = True  # Hide traceback for py.test
-    from numpy.core import array, array2string, isnan, inf, bool_, errstate, all, max, object_
+    from numpy.core import (array2string, isnan, inf, bool_, errstate,
+                            all, max, object_)
 
     x = np.asanyarray(x)
     y = np.asanyarray(y)
@@ -834,10 +774,10 @@ def assert_array_compare(comparison, x, y, err_msg='', verbose=True, header='',
                     max_abs_error = max(error)
                     if getattr(error, 'dtype', object_) == object_:
                         remarks.append('Max absolute difference: '
-                                        + str(max_abs_error))
+                                       + str(max_abs_error))
                     else:
                         remarks.append('Max absolute difference: '
-                                        + array2string(max_abs_error))
+                                       + array2string(max_abs_error))
 
                     # note: this definition of relative error matches that one
                     # used by assert_allclose (found in np.isclose)
@@ -849,10 +789,10 @@ def assert_array_compare(comparison, x, y, err_msg='', verbose=True, header='',
                         max_rel_error = max(error[nonzero] / abs(y[nonzero]))
                     if getattr(error, 'dtype', object_) == object_:
                         remarks.append('Max relative difference: '
-                                        + str(max_rel_error))
+                                       + str(max_rel_error))
                     else:
                         remarks.append('Max relative difference: '
-                                        + array2string(max_rel_error))
+                                       + array2string(max_rel_error))
 
             err_msg += '\n' + '\n'.join(remarks)
             msg = build_err_msg([ox, oy], err_msg,
@@ -898,6 +838,8 @@ def assert_array_equal(x, y, err_msg='', verbose=True, *, strict=False):
         type of the array_like objects does not match. The special
         handling for scalars mentioned in the Notes section is disabled.
 
+        .. versionadded:: 1.24.0
+
     Raises
     ------
     AssertionError
@@ -1063,15 +1005,15 @@ def assert_array_almost_equal(x, y, decimal=6, err_msg='', verbose=True):
 
     """
     __tracebackhide__ = True  # Hide traceback for py.test
-    from numpy.core import number, float_, result_type, array
+    from numpy.core import number, float_, result_type
     from numpy.core.numerictypes import issubdtype
     from numpy.core.fromnumeric import any as npany
 
     def compare(x, y):
         try:
-            if npany(gisinf(x)) or npany( gisinf(y)):
-                xinfid = gisinf(x)
-                yinfid = gisinf(y)
+            if npany(isinf(x)) or npany(isinf(y)):
+                xinfid = isinf(x)
+                yinfid = isinf(y)
                 if not (xinfid == yinfid).all():
                     return False
                 # if one item, x and y is +- inf
@@ -1111,8 +1053,6 @@ def assert_array_less(x, y, err_msg='', verbose=True):
     compared, no assertion is raised if both objects have NaNs in the same
     positions.
 
-
-
     Parameters
     ----------
     x : array_like
@@ -1127,15 +1067,13 @@ def assert_array_less(x, y, err_msg='', verbose=True):
     Raises
     ------
     AssertionError
-      If actual and desired objects are not equal.
+      If x is not strictly smaller than y, element-wise.
 
     See Also
     --------
     assert_array_equal: tests objects for equality
     assert_array_almost_equal: test objects for equality up to precision
 
-
-
     Examples
     --------
     >>> np.testing.assert_array_less([1.0, 1.0, np.nan], [1.1, 2.0, np.nan])
@@ -1302,42 +1240,21 @@ def rundocs(filename=None, raise_on_error=True):
         raise AssertionError("Some doctests failed:\n%s" % "\n".join(msg))
 
 
-def raises(*args):
-    """Decorator to check for raised exceptions.
-
-    The decorated test function must raise one of the passed exceptions to
-    pass.  If you want to test many assertions about exceptions in a single
-    test, you may want to use `assert_raises` instead.
-
-    .. warning::
-       This decorator is nose specific, do not use it if you are using a
-       different test framework.
-
-    Parameters
-    ----------
-    args : exceptions
-        The test passes if any of the passed exceptions is raised.
-
-    Raises
-    ------
-    AssertionError
-
-    Examples
-    --------
-
-    Usage::
+def check_support_sve():
+    """
+    gh-22982
+    """
+    
+    import subprocess
+    cmd = 'lscpu'
+    try:
+        return "sve" in (subprocess.Popen(cmd, stdout=subprocess.PIPE,
+                            shell=True).communicate()[0]).decode('utf-8')
+    except OSError:
+        return False
 
-        @raises(TypeError, ValueError)
-        def test_raises_type_error():
-            raise TypeError("This test passes")
 
-        @raises(Exception)
-        def test_that_fails_by_passing():
-            pass
-
-    """
-    nose = import_nose()
-    return nose.tools.raises(*args)
+_SUPPORTS_SVE = check_support_sve()
 
 #
 # assert_raises and assert_raises_regex are taken from unittest.
@@ -1349,8 +1266,10 @@ class _Dummy(unittest.TestCase):
     def nop(self):
         pass
 
+
 _d = _Dummy('nop')
 
+
 def assert_raises(*args, **kwargs):
     """
     assert_raises(exception_class, callable, *args, **kwargs)
@@ -1377,7 +1296,7 @@ def assert_raises(*args, **kwargs):
 
     """
     __tracebackhide__ = True  # Hide traceback for py.test
-    return _d.assertRaises(*args,**kwargs)
+    return _d.assertRaises(*args, **kwargs)
 
 
 def assert_raises_regex(exception_class, expected_regexp, *args, **kwargs):
@@ -1528,7 +1447,7 @@ def assert_allclose(actual, desired, rtol=1e-7, atol=0, equal_nan=True,
 
     Given two array_like objects, check that their shapes and all elements
     are equal (but see the Notes for the special handling of a scalar). An
-    exception is raised if the shapes mismatch or any values conflict. In 
+    exception is raised if the shapes mismatch or any values conflict. In
     contrast to the standard usage in numpy, NaNs are compared like numbers,
     no assertion is raised if both objects have NaNs in the same positions.
 
@@ -1625,7 +1544,7 @@ def assert_array_almost_equal_nulp(x, y, nulp=1):
     -----
     An assertion is raised if the following condition is not met::
 
-        abs(x - y) <= nulps * spacing(maximum(abs(x), abs(y)))
+        abs(x - y) <= nulp * spacing(maximum(abs(x), abs(y)))
 
     Examples
     --------
@@ -2450,6 +2369,7 @@ def assert_no_gc_cycles(*args, **kwargs):
     with _assert_no_gc_cycles_context(name=func.__name__):
         func(*args, **kwargs)
 
+
 def break_cycles():
     """
     Break reference cycles by calling gc.collect
@@ -2545,7 +2465,7 @@ def _get_mem_available():
 
     if sys.platform.startswith('linux'):
         info = {}
-        with open('/proc/meminfo', 'r') as f:
+        with open('/proc/meminfo') as f:
             for line in f:
                 p = line.split()
                 info[p[0].strip(':').lower()] = int(p[1]) * 1024
@@ -2582,7 +2502,7 @@ def _no_tracing(func):
 def _get_glibc_version():
     try:
         ver = os.confstr('CS_GNU_LIBC_VERSION').rsplit(' ')[1]
-    except Exception as inst:
+    except Exception:
         ver = '0.0'
 
     return ver
@@ -2590,3 +2510,4 @@ def _get_glibc_version():
 
 _glibcver = _get_glibc_version()
 _glibc_older_than = lambda x: (_glibcver != '0.0' and _glibcver < x)
+
diff --git a/numpy/testing/overrides.py b/numpy/testing/overrides.py
new file mode 100644
index 000000000..edc7132c2
--- /dev/null
+++ b/numpy/testing/overrides.py
@@ -0,0 +1,83 @@
+"""Tools for testing implementations of __array_function__ and ufunc overrides
+
+
+"""
+
+from numpy.core.overrides import ARRAY_FUNCTIONS as _array_functions
+from numpy import ufunc as _ufunc
+import numpy.core.umath as _umath
+
+def get_overridable_numpy_ufuncs():
+    """List all numpy ufuncs overridable via `__array_ufunc__`
+
+    Parameters
+    ----------
+    None
+
+    Returns
+    -------
+    set
+        A set containing all overridable ufuncs in the public numpy API.
+    """
+    ufuncs = {obj for obj in _umath.__dict__.values()
+              if isinstance(obj, _ufunc)}
+    return ufuncs
+    
+
+def allows_array_ufunc_override(func):
+    """Determine if a function can be overridden via `__array_ufunc__`
+
+    Parameters
+    ----------
+    func : callable
+        Function that may be overridable via `__array_ufunc__`
+
+    Returns
+    -------
+    bool
+        `True` if `func` is overridable via `__array_ufunc__` and
+        `False` otherwise.
+
+    Notes
+    -----
+    This function is equivalent to ``isinstance(func, np.ufunc)`` and
+    will work correctly for ufuncs defined outside of Numpy.
+
+    """
+    return isinstance(func, np.ufunc)
+
+
+def get_overridable_numpy_array_functions():
+    """List all numpy functions overridable via `__array_function__`
+
+    Parameters
+    ----------
+    None
+
+    Returns
+    -------
+    set
+        A set containing all functions in the public numpy API that are
+        overridable via `__array_function__`.
+
+    """
+    # 'import numpy' doesn't import recfunctions, so make sure it's imported
+    # so ufuncs defined there show up in the ufunc listing
+    from numpy.lib import recfunctions
+    return _array_functions.copy()
+
+def allows_array_function_override(func):
+    """Determine if a Numpy function can be overridden via `__array_function__`
+
+    Parameters
+    ----------
+    func : callable
+        Function that may be overridable via `__array_function__`
+
+    Returns
+    -------
+    bool
+        `True` if `func` is a function in the Numpy API that is
+        overridable via `__array_function__` and `False` otherwise.
+    """
+    return func in _array_functions
diff --git a/numpy/testing/tests/test_doctesting.py b/numpy/testing/tests/test_doctesting.py
deleted file mode 100644
index 92c2156d8..000000000
--- a/numpy/testing/tests/test_doctesting.py
+++ /dev/null
@@ -1,57 +0,0 @@
-""" Doctests for NumPy-specific nose/doctest modifications
-
-"""
-#FIXME: None of these tests is run, because 'check' is not a recognized
-# testing prefix.
-
-# try the #random directive on the output line
-def check_random_directive():
-    '''
-    >>> 2+2
-    <BadExample object at 0x084D05AC>  #random: may vary on your system
-    '''
-
-# check the implicit "import numpy as np"
-def check_implicit_np():
-    '''
-    >>> np.array([1,2,3])
-    array([1, 2, 3])
-    '''
-
-# there's some extraneous whitespace around the correct responses
-def check_whitespace_enabled():
-    '''
-    # whitespace after the 3
-    >>> 1+2
-    3
-
-    # whitespace before the 7
-    >>> 3+4
-     7
-    '''
-
-def check_empty_output():
-    """ Check that no output does not cause an error.
-
-    This is related to nose bug 445; the numpy plugin changed the
-    doctest-result-variable default and therefore hit this bug:
-    http://code.google.com/p/python-nose/issues/detail?id=445
-
-    >>> a = 10
-    """
-
-def check_skip():
-    """ Check skip directive
-
-    The test below should not run
-
-    >>> 1/0 #doctest: +SKIP
-    """
-
-
-if __name__ == '__main__':
-    # Run tests outside numpy test rig
-    import nose
-    from numpy.testing.noseclasses import NumpyDoctest
-    argv = ['', __file__, '--with-numpydoctest']
-    nose.core.TestProgram(argv=argv, addplugins=[NumpyDoctest()])
diff --git a/numpy/testing/tests/test_utils.py b/numpy/testing/tests/test_utils.py
index 052cc3aa1..0aaa508ee 100644
--- a/numpy/testing/tests/test_utils.py
+++ b/numpy/testing/tests/test_utils.py
@@ -8,13 +8,12 @@ import weakref
 import numpy as np
 from numpy.testing import (
     assert_equal, assert_array_equal, assert_almost_equal,
-    assert_array_almost_equal, assert_array_less, build_err_msg, raises,
+    assert_array_almost_equal, assert_array_less, build_err_msg,
     assert_raises, assert_warns, assert_no_warnings, assert_allclose,
     assert_approx_equal, assert_array_almost_equal_nulp, assert_array_max_ulp,
     clear_and_catch_warnings, suppress_warnings, assert_string_equal, assert_,
     tempdir, temppath, assert_no_gc_cycles, HAS_REFCOUNT
     )
-from numpy.core.overrides import ARRAY_FUNCTION_ENABLED
 
 
 class _GenericTest:
@@ -191,8 +190,6 @@ class TestArrayEqual(_GenericTest):
         self._test_not_equal(a, b)
         self._test_not_equal(b, a)
 
-    @pytest.mark.skipif(
-        not ARRAY_FUNCTION_ENABLED, reason='requires __array_function__')
     def test_subclass_that_does_not_implement_npall(self):
         class MyArray(np.ndarray):
             def __array_function__(self, *args, **kwargs):
@@ -793,41 +790,6 @@ class TestArrayAssertLess:
         self._assert_func(-ainf, x)
 
 
-@pytest.mark.skip(reason="The raises decorator depends on Nose")
-class TestRaises:
-
-    def setup_method(self):
-        class MyException(Exception):
-            pass
-
-        self.e = MyException
-
-    def raises_exception(self, e):
-        raise e
-
-    def does_not_raise_exception(self):
-        pass
-
-    def test_correct_catch(self):
-        raises(self.e)(self.raises_exception)(self.e)  # raises?
-
-    def test_wrong_exception(self):
-        try:
-            raises(self.e)(self.raises_exception)(RuntimeError)  # raises?
-        except RuntimeError:
-            return
-        else:
-            raise AssertionError("should have caught RuntimeError")
-
-    def test_catch_no_raise(self):
-        try:
-            raises(self.e)(self.does_not_raise_exception)()  # raises?
-        except AssertionError:
-            return
-        else:
-            raise AssertionError("should have raised an AssertionError")
-
-
 class TestWarns:
 
     def test_warn(self):
diff --git a/numpy/testing/utils.py b/numpy/testing/utils.py
deleted file mode 100644
index 20a883304..000000000
--- a/numpy/testing/utils.py
+++ /dev/null
@@ -1,29 +0,0 @@
-"""
-Back compatibility utils module. It will import the appropriate
-set of tools
-
-"""
-import warnings
-
-# 2018-04-04, numpy 1.15.0 ImportWarning
-# 2019-09-18, numpy 1.18.0 DeprecatonWarning (changed)
-warnings.warn("Importing from numpy.testing.utils is deprecated "
-              "since 1.15.0, import from numpy.testing instead.",
-              DeprecationWarning, stacklevel=2)
-
-from ._private.utils import *
-from ._private.utils import _assert_valid_refcount, _gen_alignment_data
-
-__all__ = [
-        'assert_equal', 'assert_almost_equal', 'assert_approx_equal',
-        'assert_array_equal', 'assert_array_less', 'assert_string_equal',
-        'assert_array_almost_equal', 'assert_raises', 'build_err_msg',
-        'decorate_methods', 'jiffies', 'memusage', 'print_assert_equal',
-        'raises', 'rundocs', 'runstring', 'verbose', 'measure',
-        'assert_', 'assert_array_almost_equal_nulp', 'assert_raises_regex',
-        'assert_array_max_ulp', 'assert_warns', 'assert_no_warnings',
-        'assert_allclose', 'IgnoreException', 'clear_and_catch_warnings',
-        'SkipTest', 'KnownFailureException', 'temppath', 'tempdir', 'IS_PYPY',
-        'HAS_REFCOUNT', 'suppress_warnings', 'assert_array_compare',
-        'assert_no_gc_cycles'
-        ]
diff --git a/numpy/tests/test_numpy_config.py b/numpy/tests/test_numpy_config.py
new file mode 100644
index 000000000..82c1ad70b
--- /dev/null
+++ b/numpy/tests/test_numpy_config.py
@@ -0,0 +1,44 @@
+"""
+Check the numpy config is valid.
+"""
+import numpy as np
+import pytest
+from unittest.mock import Mock, patch
+
+pytestmark = pytest.mark.skipif(
+    not hasattr(np.__config__, "_built_with_meson"),
+    reason="Requires Meson builds",
+)
+
+
+class TestNumPyConfigs:
+    REQUIRED_CONFIG_KEYS = [
+        "Compilers",
+        "Machine Information",
+        "Python Information",
+    ]
+
+    @patch("numpy.__config__._check_pyyaml")
+    def test_pyyaml_not_found(self, mock_yaml_importer):
+        mock_yaml_importer.side_effect = ModuleNotFoundError()
+        with pytest.warns(UserWarning):
+            np.show_config()
+
+    def test_dict_mode(self):
+        config = np.show_config(mode="dicts")
+
+        assert isinstance(config, dict)
+        assert all([key in config for key in self.REQUIRED_CONFIG_KEYS]), (
+            "Required key missing,"
+            " see index of `False` with `REQUIRED_CONFIG_KEYS`"
+        )
+
+    def test_invalid_mode(self):
+        with pytest.raises(AttributeError):
+            np.show_config(mode="foo")
+
+    def test_warn_to_add_tests(self):
+        assert len(np.__config__.DisplayModes) == 2, (
+            "New mode detected,"
+            " please add UT if applicable and increment this count"
+        )
diff --git a/numpy/tests/test_public_api.py b/numpy/tests/test_public_api.py
index 92a34bf91..eaa89aa6f 100644
--- a/numpy/tests/test_public_api.py
+++ b/numpy/tests/test_public_api.py
@@ -9,6 +9,7 @@ import warnings
 import numpy as np
 import numpy
 import pytest
+from numpy.testing import IS_WASM
 
 try:
     import ctypes
@@ -33,7 +34,6 @@ def test_numpy_namespace():
     # None of these objects are publicly documented to be part of the main
     # NumPy namespace (some are useful though, others need to be cleaned up)
     undocumented = {
-        'Tester': 'numpy.testing._private.nosetester.NoseTester',
         '_add_newdoc_ufunc': 'numpy.core._multiarray_umath._add_newdoc_ufunc',
         'add_docstring': 'numpy.core._multiarray_umath.add_docstring',
         'add_newdoc': 'numpy.core.function_base.add_newdoc',
@@ -62,7 +62,8 @@ def test_numpy_namespace():
     assert bad_results == allowlist
 
 
-@pytest.mark.parametrize('name', ['testing', 'Tester'])
+@pytest.mark.skipif(IS_WASM, reason="can't start subprocess")
+@pytest.mark.parametrize('name', ['testing'])
 def test_import_lazy_import(name):
     """Make sure we can actually use the modules we lazy load.
 
@@ -135,6 +136,8 @@ PUBLIC_MODULES = ['numpy.' + s for s in [
     "doc",
     "doc.constants",
     "doc.ufuncs",
+    "dtypes",
+    "exceptions",
     "f2py",
     "fft",
     "lib",
@@ -157,6 +160,7 @@ PUBLIC_MODULES = ['numpy.' + s for s in [
     "polynomial.polynomial",
     "random",
     "testing",
+    "testing.overrides",
     "typing",
     "typing.mypy_plugin",
     "version",
@@ -191,6 +195,7 @@ PRIVATE_BUT_PRESENT_MODULES = ['numpy.' + s for s in [
     "core.umath",
     "core.umath_tests",
     "distutils.armccompiler",
+    "distutils.fujitsuccompiler",
     "distutils.ccompiler",
     'distutils.ccompiler_opt',
     "distutils.command",
@@ -244,7 +249,6 @@ PRIVATE_BUT_PRESENT_MODULES = ['numpy.' + s for s in [
     "distutils.numpy_distribution",
     "distutils.pathccompiler",
     "distutils.unixccompiler",
-    "dual",
     "f2py.auxfuncs",
     "f2py.capi_maps",
     "f2py.cb_rules",
@@ -276,7 +280,6 @@ PRIVATE_BUT_PRESENT_MODULES = ['numpy.' + s for s in [
     "lib.utils",
     "linalg.lapack_lite",
     "linalg.linalg",
-    "ma.bench",
     "ma.core",
     "ma.testutils",
     "ma.timer_comparison",
@@ -286,7 +289,6 @@ PRIVATE_BUT_PRESENT_MODULES = ['numpy.' + s for s in [
     "random.mtrand",
     "random.bit_generator",
     "testing.print_coercion_tables",
-    "testing.utils",
 ]]
 
 
@@ -318,6 +320,7 @@ SKIP_LIST = [
     "numpy.core.code_generators.generate_ufunc_api",
     "numpy.core.code_generators.numpy_api",
     "numpy.core.code_generators.generate_umath_doc",
+    "numpy.core.code_generators.verify_c_api_version",
     "numpy.core.cversions",
     "numpy.core.generate_numpy_api",
     "numpy.distutils.msvc9compiler",
@@ -349,6 +352,8 @@ def test_all_modules_are_expected():
 SKIP_LIST_2 = [
     'numpy.math',
     'numpy.distutils.log.sys',
+    'numpy.distutils.log.logging',
+    'numpy.distutils.log.warnings',
     'numpy.doc.constants.re',
     'numpy.doc.constants.textwrap',
     'numpy.lib.emath',
@@ -356,6 +361,7 @@ SKIP_LIST_2 = [
     'numpy.matlib.char',
     'numpy.matlib.rec',
     'numpy.matlib.emath',
+    'numpy.matlib.exceptions',
     'numpy.matlib.math',
     'numpy.matlib.linalg',
     'numpy.matlib.fft',
@@ -466,6 +472,10 @@ def test_api_importable():
 
 
 @pytest.mark.xfail(
+    hasattr(np.__config__, "_built_with_meson"),
+    reason = "Meson does not yet support entry points via pyproject.toml",
+)
+@pytest.mark.xfail(
     sysconfig.get_config_var("Py_DEBUG") is not None,
     reason=(
         "NumPy possibly built with `USE_DEBUG=True ./tools/travis-test.sh`, "
@@ -500,3 +510,17 @@ def test_array_api_entry_point():
         "does not point to our Array API implementation"
     )
     assert xp is numpy.array_api, msg
+
+
+@pytest.mark.parametrize("name", [
+        'ModuleDeprecationWarning', 'VisibleDeprecationWarning',
+        'ComplexWarning', 'TooHardError', 'AxisError'])
+def test_moved_exceptions(name):
+    # These were moved to the exceptions namespace, but currently still
+    # available
+    assert name in np.__all__
+    assert name not in np.__dir__()
+    # Fetching works, but __module__ is set correctly:
+    assert getattr(np, name).__module__ == "numpy.exceptions"
+    assert name in np.exceptions.__all__
+    getattr(np.exceptions, name)
diff --git a/numpy/tests/test_reloading.py b/numpy/tests/test_reloading.py
index 8d8c8aa34..a1f360089 100644
--- a/numpy/tests/test_reloading.py
+++ b/numpy/tests/test_reloading.py
@@ -1,6 +1,13 @@
-from numpy.testing import assert_raises, assert_warns, assert_, assert_equal
+from numpy.testing import (
+    assert_raises,
+    assert_warns,
+    assert_,
+    assert_equal,
+    IS_WASM,
+)
 from numpy.compat import pickle
 
+import pytest
 import sys
 import subprocess
 import textwrap
@@ -37,6 +44,7 @@ def test_novalue():
                                           protocol=proto)) is np._NoValue)
 
 
+@pytest.mark.skipif(IS_WASM, reason="can't start subprocess")
 def test_full_reimport():
     """At the time of writing this, it is *not* truly supported, but
     apparently enough users rely on it, for it to be an annoying change
diff --git a/numpy/tests/test_scripts.py b/numpy/tests/test_scripts.py
index 5aa8191bb..892c04eef 100644
--- a/numpy/tests/test_scripts.py
+++ b/numpy/tests/test_scripts.py
@@ -9,7 +9,7 @@ from os.path import join as pathjoin, isfile, dirname
 import subprocess
 
 import numpy as np
-from numpy.testing import assert_equal
+from numpy.testing import assert_equal, IS_WASM
 
 is_inplace = isfile(pathjoin(dirname(np.__file__),  '..', 'setup.py'))
 
@@ -41,6 +41,7 @@ def test_f2py(f2py_cmd):
     assert_equal(stdout.strip(), np.__version__.encode('ascii'))
 
 
+@pytest.mark.skipif(IS_WASM, reason="Cannot start subprocess")
 def test_pep338():
     stdout = subprocess.check_output([sys.executable, '-mnumpy.f2py', '-v'])
     assert_equal(stdout.strip(), np.__version__.encode('ascii'))
diff --git a/numpy/typing/tests/data/fail/einsumfunc.pyi b/numpy/typing/tests/data/fail/einsumfunc.pyi
index f0e3f1e95..2d1f37418 100644
--- a/numpy/typing/tests/data/fail/einsumfunc.pyi
+++ b/numpy/typing/tests/data/fail/einsumfunc.pyi
@@ -4,12 +4,9 @@ import numpy as np
 AR_i: np.ndarray[Any, np.dtype[np.int64]]
 AR_f: np.ndarray[Any, np.dtype[np.float64]]
 AR_m: np.ndarray[Any, np.dtype[np.timedelta64]]
-AR_O: np.ndarray[Any, np.dtype[np.object_]]
 AR_U: np.ndarray[Any, np.dtype[np.str_]]
 
 np.einsum("i,i->i", AR_i, AR_m)  # E: incompatible type
-np.einsum("i,i->i", AR_O, AR_O)  # E: incompatible type
 np.einsum("i,i->i", AR_f, AR_f, dtype=np.int32)  # E: incompatible type
-np.einsum("i,i->i", AR_i, AR_i, dtype=np.timedelta64, casting="unsafe")  # E: No overload variant
 np.einsum("i,i->i", AR_i, AR_i, out=AR_U)  # E: Value of type variable "_ArrayType" of "einsum" cannot be
 np.einsum("i,i->i", AR_i, AR_i, out=AR_U, casting="unsafe")  # E: No overload variant
diff --git a/numpy/typing/tests/data/fail/scalars.pyi b/numpy/typing/tests/data/fail/scalars.pyi
index c24f9e479..2a6c2c7ad 100644
--- a/numpy/typing/tests/data/fail/scalars.pyi
+++ b/numpy/typing/tests/data/fail/scalars.pyi
@@ -47,7 +47,8 @@ np.uint16(A())  # E: incompatible type
 np.uint32(A())  # E: incompatible type
 np.uint64(A())  # E: incompatible type
 
-np.void("test")  # E: incompatible type
+np.void("test")  # E: No overload variant
+np.void("test", dtype=None)  # E: No overload variant
 
 np.generic(1)  # E: Cannot instantiate abstract class
 np.number(1)  # E: Cannot instantiate abstract class
@@ -62,7 +63,7 @@ np.uint64(value=0)  # E: Unexpected keyword argument
 np.complex128(value=0.0j)  # E: Unexpected keyword argument
 np.str_(value='bob')  # E: No overload variant
 np.bytes_(value=b'test')  # E: No overload variant
-np.void(value=b'test')  # E: Unexpected keyword argument
+np.void(value=b'test')  # E: No overload variant
 np.bool_(value=True)  # E: Unexpected keyword argument
 np.datetime64(value="2019")  # E: No overload variant
 np.timedelta64(value=0)  # E: Unexpected keyword argument
diff --git a/numpy/typing/tests/data/pass/dtype.py b/numpy/typing/tests/data/pass/dtype.py
index e849cfdd4..6ec44e6b0 100644
--- a/numpy/typing/tests/data/pass/dtype.py
+++ b/numpy/typing/tests/data/pass/dtype.py
@@ -15,7 +15,7 @@ np.dtype({"names": ["a", "b"], "formats": [int, float]})
 np.dtype({"names": ["a"], "formats": [int], "titles": [object]})
 np.dtype({"names": ["a"], "formats": [int], "titles": [object()]})
 
-np.dtype([("name", np.unicode_, 16), ("grades", np.float64, (2,)), ("age", "int32")])
+np.dtype([("name", np.str_, 16), ("grades", np.float64, (2,)), ("age", "int32")])
 
 np.dtype(
     {
diff --git a/numpy/typing/tests/data/pass/ndarray_misc.py b/numpy/typing/tests/data/pass/ndarray_misc.py
index 19a1af9e2..6beacc5d7 100644
--- a/numpy/typing/tests/data/pass/ndarray_misc.py
+++ b/numpy/typing/tests/data/pass/ndarray_misc.py
@@ -150,7 +150,7 @@ A.argpartition([0])
 A.diagonal()
 
 A.dot(1)
-A.dot(1, out=B0)
+A.dot(1, out=B2)
 
 A.nonzero()
 
diff --git a/numpy/typing/tests/data/pass/numerictypes.py b/numpy/typing/tests/data/pass/numerictypes.py
index 8d6a39074..63b6ad0e2 100644
--- a/numpy/typing/tests/data/pass/numerictypes.py
+++ b/numpy/typing/tests/data/pass/numerictypes.py
@@ -8,7 +8,7 @@ np.issctype("S8")
 
 np.obj2sctype(list)
 np.obj2sctype(list, default=None)
-np.obj2sctype(list, default=np.string_)
+np.obj2sctype(list, default=np.bytes_)
 
 np.issubclass_(np.int32, int)
 np.issubclass_(np.float64, float)
@@ -17,7 +17,7 @@ np.issubclass_(np.float64, (int, float))
 np.issubsctype("int64", int)
 np.issubsctype(np.array([1]), np.array([1]))
 
-np.issubdtype("S1", np.string_)
+np.issubdtype("S1", np.bytes_)
 np.issubdtype(np.float64, np.float32)
 
 np.sctype2char("S1")
diff --git a/numpy/typing/tests/data/pass/scalars.py b/numpy/typing/tests/data/pass/scalars.py
index 684d41fad..a5c6f96e9 100644
--- a/numpy/typing/tests/data/pass/scalars.py
+++ b/numpy/typing/tests/data/pass/scalars.py
@@ -113,6 +113,8 @@ np.void(True)
 np.void(np.bool_(True))
 np.void(b"test")
 np.void(np.bytes_("test"))
+np.void(object(), [("a", "O"), ("b", "O")])
+np.void(object(), dtype=[("a", "O"), ("b", "O")])
 
 # Protocols
 i8 = np.int64()
@@ -173,18 +175,12 @@ c16.byteswap()
 c16.transpose()
 
 # Aliases
-np.str0()
-np.bool8()
-np.bytes0()
 np.string_()
-np.object0()
-np.void0(0)
 
 np.byte()
 np.short()
 np.intc()
 np.intp()
-np.int0()
 np.int_()
 np.longlong()
 
@@ -192,7 +188,6 @@ np.ubyte()
 np.ushort()
 np.uintc()
 np.uintp()
-np.uint0()
 np.uint()
 np.ulonglong()
 
diff --git a/numpy/typing/tests/data/pass/simple.py b/numpy/typing/tests/data/pass/simple.py
index 03ca3e83f..801168702 100644
--- a/numpy/typing/tests/data/pass/simple.py
+++ b/numpy/typing/tests/data/pass/simple.py
@@ -33,13 +33,13 @@ np.dtype(two_tuples_dtype)
 three_tuples_dtype = [("R", "u1", 2)]
 np.dtype(three_tuples_dtype)
 
-mixed_tuples_dtype = [("R", "u1"), ("G", np.unicode_, 1)]
+mixed_tuples_dtype = [("R", "u1"), ("G", np.str_, 1)]
 np.dtype(mixed_tuples_dtype)
 
 shape_tuple_dtype = [("R", "u1", (2, 2))]
 np.dtype(shape_tuple_dtype)
 
-shape_like_dtype = [("R", "u1", (2, 2)), ("G", np.unicode_, 1)]
+shape_like_dtype = [("R", "u1", (2, 2)), ("G", np.str_, 1)]
 np.dtype(shape_like_dtype)
 
 object_dtype = [("field1", object)]
diff --git a/numpy/typing/tests/data/reveal/array_constructors.pyi b/numpy/typing/tests/data/reveal/array_constructors.pyi
index 4a865a31c..2ff20e9ae 100644
--- a/numpy/typing/tests/data/reveal/array_constructors.pyi
+++ b/numpy/typing/tests/data/reveal/array_constructors.pyi
@@ -187,12 +187,15 @@ reveal_type(np.atleast_2d(A))  # E: ndarray[Any, dtype[{float64}]]
 reveal_type(np.atleast_3d(A))  # E: ndarray[Any, dtype[{float64}]]
 
 reveal_type(np.vstack([A, A]))  # E: ndarray[Any, Any]
+reveal_type(np.vstack([A, A], dtype=np.float64))  # E: ndarray[Any, dtype[{float64}]]
 reveal_type(np.vstack([A, C]))  # E: ndarray[Any, dtype[Any]]
 reveal_type(np.vstack([C, C]))  # E: ndarray[Any, dtype[Any]]
 
 reveal_type(np.hstack([A, A]))  # E: ndarray[Any, Any]
+reveal_type(np.hstack([A, A], dtype=np.float64))  # E: ndarray[Any, dtype[{float64}]]
 
 reveal_type(np.stack([A, A]))  # E: Any
+reveal_type(np.stack([A, A], dtype=np.float64))  # E: ndarray[Any, dtype[{float64}]]
 reveal_type(np.stack([A, C]))  # E: ndarray[Any, dtype[Any]]
 reveal_type(np.stack([C, C]))  # E: ndarray[Any, dtype[Any]]
 reveal_type(np.stack([A, A], axis=0))  # E: Any
diff --git a/numpy/typing/tests/data/reveal/einsumfunc.pyi b/numpy/typing/tests/data/reveal/einsumfunc.pyi
index d5f930149..5f6415f27 100644
--- a/numpy/typing/tests/data/reveal/einsumfunc.pyi
+++ b/numpy/typing/tests/data/reveal/einsumfunc.pyi
@@ -1,5 +1,6 @@
 from typing import Any
 import numpy as np
+import numpy.typing as npt
 
 AR_LIKE_b: list[bool]
 AR_LIKE_u: list[np.uint32]
@@ -7,10 +8,12 @@ AR_LIKE_i: list[int]
 AR_LIKE_f: list[float]
 AR_LIKE_c: list[complex]
 AR_LIKE_U: list[str]
+AR_o: npt.NDArray[np.object_]
 
-OUT_f: np.ndarray[Any, np.dtype[np.float64]]
+OUT_f: npt.NDArray[np.float64]
 
 reveal_type(np.einsum("i,i->i", AR_LIKE_b, AR_LIKE_b))  # E: Any
+reveal_type(np.einsum("i,i->i", AR_o, AR_o))  # E: Any
 reveal_type(np.einsum("i,i->i", AR_LIKE_u, AR_LIKE_u))  # E: Any
 reveal_type(np.einsum("i,i->i", AR_LIKE_i, AR_LIKE_i))  # E: Any
 reveal_type(np.einsum("i,i->i", AR_LIKE_f, AR_LIKE_f))  # E: Any
diff --git a/numpy/typing/tests/data/reveal/lib_function_base.pyi b/numpy/typing/tests/data/reveal/lib_function_base.pyi
index 259e1964f..a8b9b01ac 100644
--- a/numpy/typing/tests/data/reveal/lib_function_base.pyi
+++ b/numpy/typing/tests/data/reveal/lib_function_base.pyi
@@ -123,10 +123,6 @@ reveal_type(np.sinc(1j))  # E: complexfloating[Any, Any]
 reveal_type(np.sinc(AR_f8))  # E: ndarray[Any, dtype[floating[Any]]]
 reveal_type(np.sinc(AR_c16))  # E: ndarray[Any, dtype[complexfloating[Any, Any]]]
 
-reveal_type(np.msort(CHAR_AR_U))  # E: Any
-reveal_type(np.msort(AR_U))  # E: ndarray[Any, dtype[str_]]
-reveal_type(np.msort(AR_LIKE_f8))  # E: ndarray[Any, dtype[Any]]
-
 reveal_type(np.median(AR_f8, keepdims=False))  # E: floating[Any]
 reveal_type(np.median(AR_c16, overwrite_input=True))  # E: complexfloating[Any, Any]
 reveal_type(np.median(AR_m))  # E: timedelta64
diff --git a/numpy/typing/tests/data/reveal/modules.pyi b/numpy/typing/tests/data/reveal/modules.pyi
index ba830eb0d..4191c564a 100644
--- a/numpy/typing/tests/data/reveal/modules.pyi
+++ b/numpy/typing/tests/data/reveal/modules.pyi
@@ -16,6 +16,8 @@ reveal_type(np.random)  # E: ModuleType
 reveal_type(np.rec)  # E: ModuleType
 reveal_type(np.testing)  # E: ModuleType
 reveal_type(np.version)  # E: ModuleType
+reveal_type(np.exceptions)  # E: ModuleType
+reveal_type(np.dtypes)  # E: ModuleType
 
 reveal_type(np.lib.format)  # E: ModuleType
 reveal_type(np.lib.mixins)  # E: ModuleType
diff --git a/numpy/typing/tests/data/reveal/scalars.pyi b/numpy/typing/tests/data/reveal/scalars.pyi
index 383e40ef0..965aa5ace 100644
--- a/numpy/typing/tests/data/reveal/scalars.pyi
+++ b/numpy/typing/tests/data/reveal/scalars.pyi
@@ -1,4 +1,3 @@
-import sys
 import numpy as np
 
 b: np.bool_
@@ -35,7 +34,6 @@ reveal_type(c8.real)  # E: {float32}
 reveal_type(c16.imag)  # E: {float64}
 
 reveal_type(np.unicode_('foo'))  # E: str_
-reveal_type(np.str0('foo'))  # E: str_
 
 reveal_type(V[0])  # E: Any
 reveal_type(V["field1"])  # E: Any
@@ -44,18 +42,12 @@ V[0] = 5
 
 # Aliases
 reveal_type(np.unicode_())  # E: str_
-reveal_type(np.str0())  # E: str_
-reveal_type(np.bool8())  # E: bool_
-reveal_type(np.bytes0())  # E: bytes_
 reveal_type(np.string_())  # E: bytes_
-reveal_type(np.object0())  # E: object_
-reveal_type(np.void0(0))  # E: void
 
 reveal_type(np.byte())  # E: {byte}
 reveal_type(np.short())  # E: {short}
 reveal_type(np.intc())  # E: {intc}
 reveal_type(np.intp())  # E: {intp}
-reveal_type(np.int0())  # E: {intp}
 reveal_type(np.int_())  # E: {int_}
 reveal_type(np.longlong())  # E: {longlong}
 
@@ -63,7 +55,6 @@ reveal_type(np.ubyte())  # E: {ubyte}
 reveal_type(np.ushort())  # E: {ushort}
 reveal_type(np.uintc())  # E: {uintc}
 reveal_type(np.uintp())  # E: {uintp}
-reveal_type(np.uint0())  # E: {uintp}
 reveal_type(np.uint())  # E: {uint}
 reveal_type(np.ulonglong())  # E: {ulonglong}
 
@@ -159,8 +150,7 @@ reveal_type(round(u8, 3))  # E: {uint64}
 reveal_type(round(f8))  # E: int
 reveal_type(round(f8, 3))  # E: {float64}
 
-if sys.version_info >= (3, 9):
-    reveal_type(f8.__ceil__())  # E: int
-    reveal_type(f8.__floor__())  # E: int
+reveal_type(f8.__ceil__())  # E: int
+reveal_type(f8.__floor__())  # E: int
 
 reveal_type(i8.is_integer())  # E: Literal[True]
diff --git a/numpy/typing/tests/data/reveal/testing.pyi b/numpy/typing/tests/data/reveal/testing.pyi
index 5440af800..5c35731d3 100644
--- a/numpy/typing/tests/data/reveal/testing.pyi
+++ b/numpy/typing/tests/data/reveal/testing.pyi
@@ -113,7 +113,6 @@ reveal_type(np.testing.rundocs())  # E: None
 reveal_type(np.testing.rundocs("test.py"))  # E: None
 reveal_type(np.testing.rundocs(Path("test.py"), raise_on_error=True))  # E: None
 
-@np.testing.raises(RuntimeError, RuntimeWarning)
 def func3(a: int) -> bool: ...
 
 reveal_type(func3)  # E: def (a: builtins.int) -> builtins.bool
@@ -175,4 +174,3 @@ reveal_type(np.testing.assert_no_gc_cycles(func3, 5))  # E: None
 reveal_type(np.testing.break_cycles())  # E: None
 
 reveal_type(np.testing.TestCase())  # E: unittest.case.TestCase
-reveal_type(np.testing.run_module_suite(file_to_run="numpy/tests/test_matlib.py"))  # E: None
diff --git a/numpy/typing/tests/test_generic_alias.py b/numpy/typing/tests/test_generic_alias.py
deleted file mode 100644
index 861babd4b..000000000
--- a/numpy/typing/tests/test_generic_alias.py
+++ /dev/null
@@ -1,188 +0,0 @@
-from __future__ import annotations
-
-import sys
-import copy
-import types
-import pickle
-import weakref
-from typing import TypeVar, Any, Union, Callable
-
-import pytest
-import numpy as np
-from numpy._typing._generic_alias import _GenericAlias
-from typing_extensions import Unpack
-
-ScalarType = TypeVar("ScalarType", bound=np.generic, covariant=True)
-T1 = TypeVar("T1")
-T2 = TypeVar("T2")
-DType = _GenericAlias(np.dtype, (ScalarType,))
-NDArray = _GenericAlias(np.ndarray, (Any, DType))
-
-# NOTE: The `npt._GenericAlias` *class* isn't quite stable on python >=3.11.
-# This is not a problem during runtime (as it's 3.8-exclusive), but we still
-# need it for the >=3.9 in order to verify its semantics match
-# `types.GenericAlias` replacement. xref numpy/numpy#21526
-if sys.version_info >= (3, 9):
-    DType_ref = types.GenericAlias(np.dtype, (ScalarType,))
-    NDArray_ref = types.GenericAlias(np.ndarray, (Any, DType_ref))
-    FuncType = Callable[["_GenericAlias | types.GenericAlias"], Any]
-else:
-    DType_ref = Any
-    NDArray_ref = Any
-    FuncType = Callable[["_GenericAlias"], Any]
-
-GETATTR_NAMES = sorted(set(dir(np.ndarray)) - _GenericAlias._ATTR_EXCEPTIONS)
-
-BUFFER = np.array([1], dtype=np.int64)
-BUFFER.setflags(write=False)
-
-def _get_subclass_mro(base: type) -> tuple[type, ...]:
-    class Subclass(base):  # type: ignore[misc,valid-type]
-        pass
-    return Subclass.__mro__[1:]
-
-
-class TestGenericAlias:
-    """Tests for `numpy._typing._generic_alias._GenericAlias`."""
-
-    @pytest.mark.parametrize("name,func", [
-        ("__init__", lambda n: n),
-        ("__init__", lambda n: _GenericAlias(np.ndarray, Any)),
-        ("__init__", lambda n: _GenericAlias(np.ndarray, (Any,))),
-        ("__init__", lambda n: _GenericAlias(np.ndarray, (Any, Any))),
-        ("__init__", lambda n: _GenericAlias(np.ndarray, T1)),
-        ("__init__", lambda n: _GenericAlias(np.ndarray, (T1,))),
-        ("__init__", lambda n: _GenericAlias(np.ndarray, (T1, T2))),
-        ("__origin__", lambda n: n.__origin__),
-        ("__args__", lambda n: n.__args__),
-        ("__parameters__", lambda n: n.__parameters__),
-        ("__mro_entries__", lambda n: n.__mro_entries__([object])),
-        ("__hash__", lambda n: hash(n)),
-        ("__repr__", lambda n: repr(n)),
-        ("__getitem__", lambda n: n[np.float64]),
-        ("__getitem__", lambda n: n[ScalarType][np.float64]),
-        ("__getitem__", lambda n: n[Union[np.int64, ScalarType]][np.float64]),
-        ("__getitem__", lambda n: n[Union[T1, T2]][np.float32, np.float64]),
-        ("__eq__", lambda n: n == n),
-        ("__ne__", lambda n: n != np.ndarray),
-        ("__call__", lambda n: n((1,), np.int64, BUFFER)),
-        ("__call__", lambda n: n(shape=(1,), dtype=np.int64, buffer=BUFFER)),
-        ("subclassing", lambda n: _get_subclass_mro(n)),
-        ("pickle", lambda n: n == pickle.loads(pickle.dumps(n))),
-    ])
-    def test_pass(self, name: str, func: FuncType) -> None:
-        """Compare `types.GenericAlias` with its numpy-based backport.
-
-        Checker whether ``func`` runs as intended and that both `GenericAlias`
-        and `_GenericAlias` return the same result.
-
-        """
-        value = func(NDArray)
-
-        if sys.version_info >= (3, 9):
-            value_ref = func(NDArray_ref)
-            assert value == value_ref
-
-    @pytest.mark.parametrize("name,func", [
-        ("__copy__", lambda n: n == copy.copy(n)),
-        ("__deepcopy__", lambda n: n == copy.deepcopy(n)),
-    ])
-    def test_copy(self, name: str, func: FuncType) -> None:
-        value = func(NDArray)
-
-        # xref bpo-45167
-        GE_398 = (
-            sys.version_info[:2] == (3, 9) and sys.version_info >= (3, 9, 8)
-        )
-        if GE_398 or sys.version_info >= (3, 10, 1):
-            value_ref = func(NDArray_ref)
-            assert value == value_ref
-
-    def test_dir(self) -> None:
-        value = dir(NDArray)
-        if sys.version_info < (3, 9):
-            return
-
-        # A number attributes only exist in `types.GenericAlias` in >= 3.11
-        if sys.version_info < (3, 11, 0, "beta", 3):
-            value.remove("__typing_unpacked_tuple_args__")
-        if sys.version_info < (3, 11, 0, "beta", 1):
-            value.remove("__unpacked__")
-        assert value == dir(NDArray_ref)
-
-    @pytest.mark.parametrize("name,func,dev_version", [
-        ("__iter__", lambda n: len(list(n)), ("beta", 1)),
-        ("__iter__", lambda n: next(iter(n)), ("beta", 1)),
-        ("__unpacked__", lambda n: n.__unpacked__, ("beta", 1)),
-        ("Unpack", lambda n: Unpack[n], ("beta", 1)),
-
-        # The right operand should now have `__unpacked__ = True`,
-        # and they are thus now longer equivalent
-        ("__ne__", lambda n: n != next(iter(n)), ("beta", 1)),
-
-        # >= beta3
-        ("__typing_unpacked_tuple_args__",
-         lambda n: n.__typing_unpacked_tuple_args__, ("beta", 3)),
-
-        # >= beta4
-        ("__class__", lambda n: n.__class__ == type(n), ("beta", 4)),
-    ])
-    def test_py311_features(
-        self,
-        name: str,
-        func: FuncType,
-        dev_version: tuple[str, int],
-    ) -> None:
-        """Test Python 3.11 features."""
-        value = func(NDArray)
-
-        if sys.version_info >= (3, 11, 0, *dev_version):
-            value_ref = func(NDArray_ref)
-            assert value == value_ref
-
-    def test_weakref(self) -> None:
-        """Test ``__weakref__``."""
-        value = weakref.ref(NDArray)()
-
-        if sys.version_info >= (3, 9, 1):  # xref bpo-42332
-            value_ref = weakref.ref(NDArray_ref)()
-            assert value == value_ref
-
-    @pytest.mark.parametrize("name", GETATTR_NAMES)
-    def test_getattr(self, name: str) -> None:
-        """Test that `getattr` wraps around the underlying type,
-        aka ``__origin__``.
-
-        """
-        value = getattr(NDArray, name)
-        value_ref1 = getattr(np.ndarray, name)
-
-        if sys.version_info >= (3, 9):
-            value_ref2 = getattr(NDArray_ref, name)
-            assert value == value_ref1 == value_ref2
-        else:
-            assert value == value_ref1
-
-    @pytest.mark.parametrize("name,exc_type,func", [
-        ("__getitem__", TypeError, lambda n: n[()]),
-        ("__getitem__", TypeError, lambda n: n[Any, Any]),
-        ("__getitem__", TypeError, lambda n: n[Any][Any]),
-        ("isinstance", TypeError, lambda n: isinstance(np.array(1), n)),
-        ("issublass", TypeError, lambda n: issubclass(np.ndarray, n)),
-        ("setattr", AttributeError, lambda n: setattr(n, "__origin__", int)),
-        ("setattr", AttributeError, lambda n: setattr(n, "test", int)),
-        ("getattr", AttributeError, lambda n: getattr(n, "test")),
-    ])
-    def test_raise(
-        self,
-        name: str,
-        exc_type: type[BaseException],
-        func: FuncType,
-    ) -> None:
-        """Test operations that are supposed to raise."""
-        with pytest.raises(exc_type):
-            func(NDArray)
-
-        if sys.version_info >= (3, 9):
-            with pytest.raises(exc_type):
-                func(NDArray_ref)
diff --git a/numpy/typing/tests/test_runtime.py b/numpy/typing/tests/test_runtime.py
index 44d069006..c32c5db32 100644
--- a/numpy/typing/tests/test_runtime.py
+++ b/numpy/typing/tests/test_runtime.py
@@ -2,7 +2,6 @@
 
 from __future__ import annotations
 
-import sys
 from typing import (
     get_type_hints,
     Union,
@@ -24,10 +23,7 @@ class TypeTup(NamedTuple):
     origin: None | type
 
 
-if sys.version_info >= (3, 9):
-    NDArrayTup = TypeTup(npt.NDArray, npt.NDArray.__args__, np.ndarray)
-else:
-    NDArrayTup = TypeTup(npt.NDArray, (), None)
+NDArrayTup = TypeTup(npt.NDArray, npt.NDArray.__args__, np.ndarray)
 
 TYPES = {
     "ArrayLike": TypeTup(npt.ArrayLike, npt.ArrayLike.__args__, Union),
diff --git a/numpy/typing/tests/test_typing.py b/numpy/typing/tests/test_typing.py
index 5011339b5..bcaaf5250 100644
--- a/numpy/typing/tests/test_typing.py
+++ b/numpy/typing/tests/test_typing.py
@@ -431,7 +431,7 @@ def test_extended_precision() -> None:
     output_mypy = OUTPUT_MYPY
     assert path in output_mypy
 
-    with open(path, "r") as f:
+    with open(path) as f:
         expression_list = f.readlines()
 
     for _msg in output_mypy[path]:
diff --git a/numpy/version.py b/numpy/version.py
index d5657d0d0..d9d2fe1b7 100644
--- a/numpy/version.py
+++ b/numpy/version.py
@@ -4,12 +4,20 @@ from ._version import get_versions
 
 __ALL__ = ['version', '__version__', 'full_version', 'git_revision', 'release']
 
+
+_built_with_meson = False
+try:
+    from ._version_meson import get_versions
+    _built_with_meson = True
+except ImportError:
+    from ._version import get_versions
+
 vinfo: dict[str, str] = get_versions()
 version = vinfo["version"]
 __version__ = vinfo.get("closest-tag", vinfo["version"])
-full_version = vinfo['version']
 git_revision = vinfo['full-revisionid']
 release = 'dev0' not in version and '+' not in version
-short_version = vinfo['version'].split("+")[0]
+full_version = version
+short_version = version.split("+")[0]
 
 del get_versions, vinfo
diff --git a/pavement.py b/pavement.py
index e54712e81..b6ad4358b 100644
--- a/pavement.py
+++ b/pavement.py
@@ -38,7 +38,7 @@ from paver.easy import Bunch, options, task, sh
 #-----------------------------------
 
 # Path to the release notes
-RELEASE_NOTES = 'doc/source/release/1.24.0-notes.rst'
+RELEASE_NOTES = 'doc/source/release/1.25.0-notes.rst'
 
 
 #-------------------------------------------------------
@@ -184,7 +184,7 @@ def write_release_task(options, filename='README'):
     ----------
     options :
         Set by ``task`` decorator.
-    filename : string
+    filename : str
         Filename of the modified notes. The file is written
         in the release directory.
 
diff --git a/pyproject.toml b/pyproject.toml
index 60e7f5826..759b538fb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,11 +1,79 @@
 [build-system]
-# Minimum requirements for the build system to execute.
+# Uncomment this line, the `meson-python` requires line, and the [project] and
+# [project.urls] tables below in order to build with Meson by default
+#build-backend = "mesonpy"
 requires = [
+    # setuptools, wheel and Cython are needed for the setup.py based build
     "setuptools==59.2.0",
-    "wheel==0.37.0",
-    "Cython>=0.29.30,<3.0",
+    # `wheel` is needed for non-isolated builds, given that `meson-python`
+    # doesn't list it as a runtime requirement (at least in 0.11.0) - it's
+    # likely to be removed as a dependency in meson-python 0.12.0.
+    "wheel==0.38.1",
+    "Cython>=0.29.34,<3.0",
+#    "meson-python>=0.10.0",
 ]
 
+#[project]
+#name = "numpy"
+#
+## Using https://peps.python.org/pep-0639/
+## which is still in draft
+#license = {text = "BSD-3-Clause"}
+## Note: needed for Meson, but setuptools errors on it. Uncomment once Meson is default.
+##license-files.paths = [
+##    "LICENSE.txt",
+##    "LICENSES_bundles.txt"
+##]
+#
+#description = "Fundamental package for array computing in Python"
+#authors = [{name = "Travis E. Oliphant et al."}]
+#maintainers = [
+#    {name = "NumPy Developers", email="numpy-discussion@python.org"},
+#]
+#requires-python = ">=3.9"
+#readme = "README.md"
+#classifiers = [
+#    'Development Status :: 5 - Production/Stable',
+#    'Intended Audience :: Science/Research',
+#    'Intended Audience :: Developers',
+#    'License :: OSI Approved :: BSD License',
+#    'Programming Language :: C',
+#    'Programming Language :: Python',
+#    'Programming Language :: Python :: 3',
+#    'Programming Language :: Python :: 3.9',
+#    'Programming Language :: Python :: 3.10',
+#    'Programming Language :: Python :: 3.11',
+#    'Programming Language :: Python :: 3 :: Only',
+#    'Programming Language :: Python :: Implementation :: CPython',
+#    'Topic :: Software Development',
+#    'Topic :: Scientific/Engineering',
+#    'Typing :: Typed',
+#    'Operating System :: Microsoft :: Windows',
+#    'Operating System :: POSIX',
+#    'Operating System :: Unix',
+#    'Operating System :: MacOS',
+#]
+#dynamic = ["version", "scripts"]
+#
+#[project.scripts]
+## Note: this is currently dynamic, see setup.py. Can we get rid of that?
+##       see commit f22a33b71 for rationale for dynamic behavior
+#'f2py = numpy.f2py.f2py2e:main'
+#'f2py3 = numpy.f2py.f2py2e:main'
+#'f2py3.MINOR_VERSION = numpy.f2py.f2py2e:main'
+#
+# When enabling this stanza, make sure to remove the meson-specific xfail from
+# numpy/tests/test_public_api.py
+#[project.entry-points]
+#'array_api': 'numpy = numpy.array_api'
+#'pyinstaller40': 'hook-dirs = numpy:_pyinstaller_hooks_dir'
+#
+#[project.urls]
+#homepage = "https://numpy.org"
+#documentation = "https://numpy.org/doc/"
+#source = "https://github.com/numpy/numpy"
+#download = "https://pypi.org/project/numpy/#files"
+#tracker = "https://github.com/numpy/numpy/issues"
 
 [tool.towncrier]
     # Do no set this since it is hard to import numpy inside the source directory
@@ -76,7 +144,7 @@ requires = [
 
 
 [tool.cibuildwheel]
-skip = "cp36-* cp37-* pp37-* *-manylinux_i686 *_ppc64le *_s390x *-musllinux*"
+skip = "cp36-* cp37-* pp37-* *-manylinux_i686 *_ppc64le *_s390x *-musllinux_aarch64"
 build-verbosity = "3"
 before-build = "bash {project}/tools/wheels/cibw_before_build.sh {project}"
 before-test = "pip install -r {project}/test_requirements.txt"
@@ -85,6 +153,7 @@ test-command = "bash {project}/tools/wheels/cibw_test_command.sh {project}"
 [tool.cibuildwheel.linux]
 manylinux-x86_64-image = "manylinux2014"
 manylinux-aarch64-image = "manylinux2014"
+musllinux-x86_64-image = "musllinux_1_1"
 environment = { CFLAGS="-std=c99 -fno-strict-aliasing", LDFLAGS="-Wl,--strip-debug", OPENBLAS64_="/usr/local", NPY_USE_BLAS_ILP64="1", RUNNER_OS="Linux" }
 
 [tool.cibuildwheel.macos]
@@ -94,9 +163,9 @@ environment = { CFLAGS="-std=c99 -fno-strict-aliasing", LDFLAGS="-Wl,--strip-deb
 # https://github.com/multi-build/multibuild/blame/devel/README.rst#L541-L565
 # for more info
 archs = "x86_64 arm64"
-test-skip = "*_arm64 *_universal2:arm64"
+test-skip = "*_universal2:arm64"
 # MACOS linker doesn't support stripping symbols
-environment = { CFLAGS="-std=c99 -fno-strict-aliasing", OPENBLAS64_="/usr/local", NPY_USE_BLAS_ILP64="1", CC="clang", CXX = "clang++" }
+environment = { CFLAGS="-std=c99 -fno-strict-aliasing", OPENBLAS64_="/usr/local", NPY_USE_BLAS_ILP64="1", CC="clang", CXX = "clang++", RUNNER_OS="macOS" }
 
 [tool.cibuildwheel.windows]
 environment = { OPENBLAS64_="openblas", OPENBLAS="", NPY_USE_BLAS_ILP64="1", CFLAGS="", LDFLAGS="" }
@@ -104,3 +173,10 @@ environment = { OPENBLAS64_="openblas", OPENBLAS="", NPY_USE_BLAS_ILP64="1", CFL
 [[tool.cibuildwheel.overrides]]
 select = "*-win32"
 environment = { OPENBLAS64_="", OPENBLAS="openblas", NPY_USE_BLAS_ILP64="0", CFLAGS="-m32", LDFLAGS="-m32" }
+
+[tool.spin]
+package = 'numpy'
+
+[tool.spin.commands]
+"Build" = ["spin.cmds.meson.build", "spin.cmds.meson.test"]
+"Environments" = ["spin.cmds.meson.shell", "spin.cmds.meson.ipython", "spin.cmds.meson.python"]
diff --git a/release_requirements.txt b/release_requirements.txt
index 43a677e05..13f763651 100644
--- a/release_requirements.txt
+++ b/release_requirements.txt
@@ -7,7 +7,7 @@ beautifulsoup4
 
 # changelog.py
 pygithub
-gitpython
+gitpython>=3.1.30
 
 # uploading wheels
 twine
diff --git a/runtests.py b/runtests.py
index fea29a10d..8b026bec7 100755
--- a/runtests.py
+++ b/runtests.py
@@ -220,7 +220,7 @@ def main(argv):
             # Don't use subprocess, since we don't want to include the
             # current path in PYTHONPATH.
             sys.argv = extra_argv
-            with open(extra_argv[0], 'r') as f:
+            with open(extra_argv[0]) as f:
                 script = f.read()
             sys.modules['__main__'] = types.ModuleType('__main__')
             ns = dict(__name__='__main__',
@@ -540,11 +540,22 @@ def build_project(args):
         print("Build OK")
     else:
         if not args.show_build_log:
-            with open(log_filename, 'r') as f:
+            with open(log_filename) as f:
                 print(f.read())
             print("Build failed!")
         sys.exit(1)
 
+    # Rebase
+    if sys.platform == "cygwin":
+        from pathlib import path
+        testenv_root = Path(config_vars["platbase"])
+        dll_list = testenv_root.glob("**/*.dll")
+        rebase_cmd = ["/usr/bin/rebase", "--database", "--oblivious"]
+        rebase_cmd.extend(dll_list)
+        if subprocess.run(rebase_cmd):
+            print("Rebase failed")
+            sys.exit(1)
+
     return site_dir, site_dir_noarch
 
 def asv_compare_config(bench_path, args, h_commits):
@@ -624,7 +635,7 @@ def asv_substitute_config(in_config, out_config, **custom_vars):
 
     vars_hash = sdbm_hash(custom_vars, os.path.getmtime(in_config))
     try:
-        with open(out_config, "r") as wfd:
+        with open(out_config) as wfd:
             hash_line = wfd.readline().split('hash:')
             if len(hash_line) > 1 and int(hash_line[1]) == vars_hash:
                 return True
diff --git a/setup.py b/setup.py
index 70b36bc7e..c924bb999 100755
--- a/setup.py
+++ b/setup.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 """
-Numpy build options can be modified with a site.cfg file. 
+Numpy build options can be modified with a site.cfg file.
 See site.cfg.example for a template and more information.
 """
 
@@ -12,12 +12,14 @@ import textwrap
 import warnings
 import builtins
 import re
+import tempfile
 
+from distutils.errors import CompileError
 
 # Python supported version checks. Keep right after stdlib imports to ensure we
 # get a sensible error for older Python versions
-if sys.version_info[:2] < (3, 8):
-    raise RuntimeError("Python version >= 3.8 required.")
+if sys.version_info[:2] < (3, 9):
+    raise RuntimeError("Python version >= 3.9 required.")
 
 
 import versioneer
@@ -90,7 +92,6 @@ License :: OSI Approved :: BSD License
 Programming Language :: C
 Programming Language :: Python
 Programming Language :: Python :: 3
-Programming Language :: Python :: 3.8
 Programming Language :: Python :: 3.9
 Programming Language :: Python :: 3.10
 Programming Language :: Python :: 3.11
@@ -158,11 +159,11 @@ class concat_license_files():
 
     def __enter__(self):
         """Concatenate files and remove LICENSES_bundled.txt"""
-        with open(self.f1, 'r') as f1:
+        with open(self.f1) as f1:
             self.bsd_text = f1.read()
 
         with open(self.f1, 'a') as f1:
-            with open(self.f2, 'r') as f2:
+            with open(self.f2) as f2:
                 self.bundled_text = f2.read()
                 f1.write('\n\n')
                 f1.write(self.bundled_text)
@@ -185,48 +186,135 @@ class sdist_checked(cmdclass['sdist']):
 
 def get_build_overrides():
     """
-    Custom build commands to add `-std=c99` to compilation
+    Custom build commands to add std flags if required to compilation
     """
     from numpy.distutils.command.build_clib import build_clib
     from numpy.distutils.command.build_ext import build_ext
-    from numpy.compat import _pep440
-
-    def _needs_gcc_c99_flag(obj):
-        if obj.compiler.compiler_type != 'unix':
-            return False
-
-        cc = obj.compiler.compiler[0]
-        if "gcc" not in cc:
-            return False
+    from numpy._utils import _pep440
+
+    def try_compile(compiler, file, flags = [], verbose=False):
+        bk_ver = getattr(compiler, 'verbose', False)
+        compiler.verbose = verbose
+        try:
+            compiler.compile([file], extra_postargs=flags)
+            return True, ''
+        except CompileError as e:
+            return False, str(e)
+        finally:
+            compiler.verbose = bk_ver
+
+    def flags_is_required(compiler, is_cpp, flags, code):
+        if is_cpp:
+            compiler = compiler.cxx_compiler()
+            suf = '.cpp'
+        else:
+            suf = '.c'
+        with tempfile.TemporaryDirectory() as temp_dir:
+            tmp_file = os.path.join(temp_dir, "test" + suf)
+            with open(tmp_file, "w+") as f:
+                f.write(code)
+            # without specify any flags in case of the required
+            # standard already supported by default, then there's
+            # no need for passing the flags
+            comp = try_compile(compiler, tmp_file)
+            if not comp[0]:
+                comp = try_compile(compiler, tmp_file, flags)
+                if not comp[0]:
+                    # rerun to verbose the error
+                    try_compile(compiler, tmp_file, flags, True)
+                    if is_cpp:
+                        raise RuntimeError(
+                            "Broken toolchain during testing C++ compiler. \n"
+                            "A compiler with support for C++17 language "
+                            "features is required.\n"
+                            f"Triggered the following error: {comp[1]}."
+                        )
+                    else:
+                        raise RuntimeError(
+                            "Broken toolchain during testing C compiler. \n"
+                            "A compiler with support for C99 language "
+                            "features is required.\n"
+                            f"Triggered the following error: {comp[1]}."
+                        )
+                return True
+        return False
 
-        # will print something like '4.2.1\n'
-        out = subprocess.run([cc, '-dumpversion'],
-                             capture_output=True, text=True)
-        # -std=c99 is default from this version on
-        if _pep440.parse(out.stdout) >= _pep440.Version('5.0'):
-            return False
-        return True
+    def std_cxx_flags(cmd):
+        compiler = cmd.compiler
+        flags = getattr(compiler, '__np_cache_cpp_flags', None)
+        if flags is not None:
+            return flags
+        flags = dict(
+            msvc = ['/std:c++17']
+        ).get(compiler.compiler_type, ['-std=c++17'])
+        # These flags are used to compile any C++ source within Numpy.
+        # They are chosen to have very few runtime dependencies.
+        extra_flags = dict(
+            # to update #def __cplusplus with enabled C++ version
+            msvc = ['/Zc:__cplusplus']
+        ).get(compiler.compiler_type, [
+            # The following flag is used to avoid emit any extra code
+            # from STL since extensions are build by C linker and
+            # without C++ runtime dependencies.
+            '-fno-threadsafe-statics',
+            '-D__STDC_VERSION__=0',  # for compatibility with C headers
+            '-fno-exceptions',  # no exception support
+            '-fno-rtti'  # no runtime type information
+        ])
+        if not flags_is_required(compiler, True, flags, textwrap.dedent('''
+            #include <type_traits>
+            template<typename ...T>
+            constexpr bool test_fold = (... && std::is_const_v<T>);
+            int main()
+            {
+                if (test_fold<int, const int>) {
+                    return 0;
+                }
+                else {
+                    return -1;
+                }
+            }
+        ''')):
+            flags.clear()
+        flags += extra_flags
+        setattr(compiler, '__np_cache_cpp_flags', flags)
+        return flags
+
+    def std_c_flags(cmd):
+        compiler = cmd.compiler
+        flags = getattr(compiler, '__np_cache_c_flags', None)
+        if flags is not None:
+            return flags
+        flags = dict(
+            msvc = []
+        ).get(compiler.compiler_type, ['-std=c99'])
+
+        if not flags_is_required(compiler, False, flags, textwrap.dedent('''
+            inline static int test_inline() { return 0; }
+            int main(void)
+            { return test_inline(); }
+        ''')):
+            flags.clear()
+
+        setattr(compiler, '__np_cache_c_flags', flags)
+        return flags
 
     class new_build_clib(build_clib):
         def build_a_library(self, build_info, lib_name, libraries):
-            from numpy.distutils.ccompiler_opt import NPY_CXX_FLAGS
-            if _needs_gcc_c99_flag(self):
-                build_info['extra_cflags'] = ['-std=c99']
-            build_info['extra_cxxflags'] = NPY_CXX_FLAGS
+            build_info['extra_cflags'] = std_c_flags(self)
+            build_info['extra_cxxflags'] = std_cxx_flags(self)
             build_clib.build_a_library(self, build_info, lib_name, libraries)
 
     class new_build_ext(build_ext):
         def build_extension(self, ext):
-            if _needs_gcc_c99_flag(self):
-                if '-std=c99' not in ext.extra_compile_args:
-                    ext.extra_compile_args.append('-std=c99')
+            ext.extra_c_compile_args += std_c_flags(self)
+            ext.extra_cxx_compile_args += std_cxx_flags(self)
             build_ext.build_extension(self, ext)
     return new_build_clib, new_build_ext
 
-
 def generate_cython():
     # Check Cython version
-    from numpy.compat import _pep440
+    from numpy._utils import _pep440
     try:
         # try the cython in the installed python first (somewhat related to
         # scipy/scipy#2397)
@@ -432,7 +520,7 @@ def setup_package():
         test_suite='pytest',
         version=versioneer.get_version(),
         cmdclass=cmdclass,
-        python_requires='>=3.8',
+        python_requires='>=3.9',
         zip_safe=False,
         entry_points={
             'console_scripts': f2py_cmds,
@@ -463,6 +551,9 @@ def setup_package():
     else:
         #from numpy.distutils.core import setup
         from setuptools import setup
+        # workaround for broken --no-build-isolation with newer setuptools,
+        # see gh-21288
+        metadata["packages"] = []
 
     try:
         setup(**metadata)
diff --git a/site.cfg.example b/site.cfg.example
index 4df01a210..5d36922ea 100644
--- a/site.cfg.example
+++ b/site.cfg.example
@@ -225,9 +225,10 @@
 #
 # UMFPACK is not used by NumPy.
 #
-#   https://www.cise.ufl.edu/research/sparse/umfpack/
-#   https://www.cise.ufl.edu/research/sparse/amd/
+#   https://github.com/DrTimothyAldenDavis/SuiteSparse/tree/dev/UMFPACK
+#   https://github.com/DrTimothyAldenDavis/SuiteSparse/tree/dev/AMD
 #   https://scikit-umfpack.github.io/scikit-umfpack/
+#   https://people.engr.tamu.edu/davis/suitesparse.html
 #
 #[amd]
 #libraries = amd
@@ -252,3 +253,13 @@
 #[djbfft]
 #include_dirs = /usr/local/djbfft/include
 #library_dirs = /usr/local/djbfft/lib
+
+# Fujitsu SSL2
+# ----
+#[ssl2]
+#library_dirs = /opt/FJSVstclanga/v1.1.0/lib64
+#include_dirs = /opt/FJSVstclanga/v1.1.0/clang-comp/include
+# Single-threaded version.
+#libraies = fjlapacksve
+# Multi-threaded version. Python itself must be built by Fujitsu compiler.
+#libraies = fjlapackexsve
diff --git a/test_requirements.txt b/test_requirements.txt
index 3e7d3fef7..16f448eb8 100644
--- a/test_requirements.txt
+++ b/test_requirements.txt
@@ -1,5 +1,5 @@
-cython>=0.29.30,<3.0
-wheel==0.37.0
+cython>=0.29.34,<3.0
+wheel==0.38.1
 setuptools==59.2.0
 hypothesis==6.24.1
 pytest==6.2.5
@@ -12,3 +12,5 @@ cffi; python_version < '3.10'
 # NOTE: Keep mypy in sync with environment.yml
 mypy==0.981; platform_python_implementation != "PyPy"
 typing_extensions>=4.2.0
+# for optional f2py encoding detection
+charset-normalizer
diff --git a/tools/allocation_tracking/README.md b/tools/allocation_tracking/README.md
deleted file mode 100644
index 6cc4c2a58..000000000
--- a/tools/allocation_tracking/README.md
+++ /dev/null
@@ -1,7 +0,0 @@
-Note that since Python 3.6 the builtin tracemalloc module can be used to
-track allocations inside numpy.
-Numpy places its CPU memory allocations into the `np.lib.tracemalloc_domain`
-domain.
-See https://docs.python.org/3/library/tracemalloc.html.
-
-The tool that used to be here has been deprecated.
diff --git a/tools/check_installed_files.py b/tools/check_installed_files.py
new file mode 100644
index 000000000..f15ca7969
--- /dev/null
+++ b/tools/check_installed_files.py
@@ -0,0 +1,86 @@
+"""
+Check if all the test and .pyi files are installed after building.
+
+Examples::
+
+    $ python check_installed_files.py install_dirname
+
+        install_dirname:
+            the relative path to the directory where NumPy is installed after
+            building and running `meson install`.
+
+Notes
+=====
+
+The script will stop on encountering the first missing file in the install dir,
+it will not give a full listing. This should be okay, because the script is
+meant for use in CI so it's not like many files will be missing at once.
+
+"""
+
+import os
+import glob
+import sys
+
+
+CUR_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__)))
+ROOT_DIR = os.path.dirname(CUR_DIR)
+NUMPY_DIR = os.path.join(ROOT_DIR, 'numpy')
+
+
+# Files whose installation path will be different from original one
+changed_installed_path = {
+    #'numpy/_build_utils/some_file.py': 'numpy/lib/some_file.py'
+}
+
+
+def main(install_dir):
+    INSTALLED_DIR = os.path.join(ROOT_DIR, install_dir)
+    if not os.path.exists(INSTALLED_DIR):
+        raise ValueError(
+            f"Provided install dir {INSTALLED_DIR} does not exist"
+        )
+
+    numpy_test_files = get_files(NUMPY_DIR, kind='test')
+    installed_test_files = get_files(INSTALLED_DIR, kind='test')
+
+    # Check test files detected in repo are installed
+    for test_file in numpy_test_files.keys():
+        if test_file not in installed_test_files.keys():
+            raise Exception(
+                "%s is not installed" % numpy_test_files[test_file]
+            )
+
+    print("----------- All the test files were installed --------------")
+
+    numpy_pyi_files = get_files(NUMPY_DIR, kind='stub')
+    installed_pyi_files = get_files(INSTALLED_DIR, kind='stub')
+
+    # Check *.pyi files detected in repo are installed
+    for pyi_file in numpy_pyi_files.keys():
+        if pyi_file not in installed_pyi_files.keys():
+            raise Exception("%s is not installed" % numpy_pyi_files[pyi_file])
+
+    print("----------- All the .pyi files were installed --------------")
+
+
+def get_files(dir_to_check, kind='test'):
+    files = dict()
+    patterns = {
+        'test': f'{dir_to_check}/**/test_*.py',
+        'stub': f'{dir_to_check}/**/*.pyi',
+    }
+    for path in glob.glob(patterns[kind], recursive=True):
+        relpath = os.path.relpath(path, dir_to_check)
+        files[relpath] = path
+
+    return files
+
+
+if __name__ == '__main__':
+    if not len(sys.argv) == 2:
+        raise ValueError("Incorrect number of input arguments, need "
+                         "check_installation.py relpath/to/installed/numpy")
+
+    install_dir = sys.argv[1]
+    main(install_dir)
diff --git a/tools/ci/cirrus_macosx_arm64.yml b/tools/ci/cirrus_macosx_arm64.yml
new file mode 100644
index 000000000..2343b807f
--- /dev/null
+++ b/tools/ci/cirrus_macosx_arm64.yml
@@ -0,0 +1,55 @@
+modified_clone: &MODIFIED_CLONE
+  # makes sure that for a PR the CI runs against a merged main
+  clone_script: |
+    if [ -z "$CIRRUS_PR" ]; then
+      # if you're not in a PR then clone against the branch name that was pushed to.
+      git clone --recursive --branch=$CIRRUS_BRANCH https://x-access-token:${CIRRUS_REPO_CLONE_TOKEN}@github.com/${CIRRUS_REPO_FULL_NAME}.git $CIRRUS_WORKING_DIR
+      git reset --hard $CIRRUS_CHANGE_IN_REPO
+    else
+      # it's a PR so clone the main branch then merge the changes from the PR
+      git clone https://x-access-token:${CIRRUS_REPO_CLONE_TOKEN}@github.com/${CIRRUS_REPO_FULL_NAME}.git $CIRRUS_WORKING_DIR
+      git fetch origin pull/$CIRRUS_PR/head:pull/$CIRRUS_PR
+    
+      # CIRRUS_BASE_BRANCH will probably be `main` for the majority of the time
+      # However, if you do a PR against a maintenance branch we will want to
+      # merge the PR into the maintenance branch, not main
+      git checkout $CIRRUS_BASE_BRANCH
+
+      # alpine git package needs default user.name and user.email to be set before a merge
+      git -c user.email="you@example.com" merge --no-commit pull/$CIRRUS_PR
+      git submodule update --init --recursive
+    fi
+
+
+macos_arm64_test_task:
+  macos_instance:
+    image: ghcr.io/cirruslabs/macos-monterey-xcode:14
+
+  <<: *MODIFIED_CLONE
+
+  pip_cache:
+    folder: ~/.cache/pip
+
+  test_script: |
+    brew install python@3.10
+
+    export PATH=/opt/homebrew/opt/python@3.10/libexec/bin:$PATH
+    python --version
+
+    RUNNER_OS="macOS"
+    CFLAGS="-std=c99 -fno-strict-aliasing"
+    SDKROOT=/Applications/Xcode-14.0.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX12.3.sdk
+    
+    # used for installing OpenBLAS/gfortran
+    bash tools/wheels/cibw_before_build.sh $PWD
+
+    pushd ~/
+    python -m venv numpy-dev
+    source numpy-dev/bin/activate
+    popd
+
+    pip install -r build_requirements.txt
+    pip install pytest hypothesis typing_extensions
+    
+    spin build
+    spin test
diff --git a/tools/ci/cirrus_wheels.yml b/tools/ci/cirrus_wheels.yml
new file mode 100644
index 000000000..60512afab
--- /dev/null
+++ b/tools/ci/cirrus_wheels.yml
@@ -0,0 +1,157 @@
+build_and_store_wheels: &BUILD_AND_STORE_WHEELS
+  install_cibuildwheel_script:
+    - python -m pip install cibuildwheel==2.12.1
+  cibuildwheel_script:
+    - cibuildwheel
+  wheels_artifacts:
+    path: "wheelhouse/*"
+
+######################################################################
+# Build linux_aarch64 natively
+######################################################################
+
+linux_aarch64_task:
+  compute_engine_instance:
+    image_project: cirrus-images
+    image: family/docker-builder-arm64
+    architecture: arm64
+    platform: linux
+    cpu: 2
+    memory: 8G
+  matrix:
+    # build in a matrix because building and testing all four wheels in a
+    # single task takes longer than 60 mins (the default time limit for a
+    # cirrus-ci task).
+    - env:
+        CIRRUS_CLONE_SUBMODULES: true
+        CIBW_BUILD: cp39-*
+        EXPECT_CPU_FEATURES: NEON NEON_FP16 NEON_VFPV4 ASIMD ASIMDHP ASIMDDP ASIMDFHM
+    - env:
+        CIRRUS_CLONE_SUBMODULES: true
+        CIBW_BUILD: cp310-*
+    - env:
+        CIRRUS_CLONE_SUBMODULES: true
+        CIBW_BUILD: cp311-*
+
+  build_script: |
+    apt install -y python3-venv python-is-python3 gfortran libatlas-base-dev libgfortran5 eatmydata
+    git fetch origin
+    ./tools/travis-before-install.sh
+    which python
+    echo $CIRRUS_CHANGE_MESSAGE
+  <<: *BUILD_AND_STORE_WHEELS
+
+
+######################################################################
+# Build macosx_arm64 natively
+######################################################################
+
+macosx_arm64_task:
+  macos_instance:
+    image: ghcr.io/cirruslabs/macos-monterey-xcode:14
+  matrix:
+    - env:
+        CIRRUS_CLONE_SUBMODULES: true
+        CIBW_BUILD: cp39-*
+    - env:
+        CIRRUS_CLONE_SUBMODULES: true
+        CIBW_BUILD: cp310-* cp311-*
+  env:
+    PATH: /opt/homebrew/opt/python@3.10/bin:/usr/local/lib:/usr/local/include:$PATH
+    CIBW_ARCHS: arm64
+    # Specifying CIBW_ENVIRONMENT_MACOS overrides pyproject.toml, so include
+    # all the settings from there, otherwise they're lost.
+    # SDKROOT needs to be set for repackaged conda-forge gfortran compilers
+    # supplied by isuruf.
+    # Find out SDKROOT via `xcrun --sdk macosx --show-sdk-path`
+    CIBW_ENVIRONMENT_MACOS: >
+      RUNNER_OS=macOS
+      SDKROOT=/Applications/Xcode-14.0.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX12.3.sdk
+      LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
+      CFLAGS="-std=c99 -fno-strict-aliasing"
+      OPENBLAS64_="/usr/local"
+      NPY_USE_BLAS_ILP64="1"
+
+  build_script:
+    - brew install python@3.10
+    - ln -s python3 /opt/homebrew/opt/python@3.10/bin/python
+    - which python
+    # needed for submodules
+    - git submodule update --init
+    # need to obtain all the tags so setup.py can determine FULLVERSION
+    - git fetch origin
+    - uname -m
+    - python -c "import platform;print(platform.python_version());print(platform.system());print(platform.machine())"
+    - clang --version
+  <<: *BUILD_AND_STORE_WHEELS
+
+
+######################################################################
+# Upload all wheels
+######################################################################
+
+wheels_upload_task:
+  # Artifacts don't seem to be persistent from task to task.
+  # Rather than upload wheels at the end of each cibuildwheel run we do a
+  # final upload here. This is because a run may be on different OS for
+  # which bash, etc, may not be present.
+  depends_on:
+    - linux_aarch64
+    - macosx_arm64
+  compute_engine_instance:
+    image_project: cirrus-images
+    image: family/docker-builder
+    platform: linux
+    cpu: 1
+
+  env:
+    NUMPY_STAGING_UPLOAD_TOKEN: ENCRYPTED[!5a69522ae0c2af9edb2bc1cdfeaca6292fb3666d9ecd82dca0615921834a6ce3b702352835d8bde4ea2a9ed5ef8424ac!]
+    NUMPY_NIGHTLY_UPLOAD_TOKEN: ENCRYPTED[!196422e6c3419a3b1d79815e1026094a215cb0f346fe34ed0f9d3ca1c19339df7398d04556491b1e0420fc1fe3713289!]
+
+  upload_script: |
+    apt-get update
+    apt-get install -y curl wget
+    export IS_SCHEDULE_DISPATCH="false"
+    export IS_PUSH="false"
+
+    # cron job
+    if [[ "$CIRRUS_CRON" == "weekly" ]]; then
+      export IS_SCHEDULE_DISPATCH="true"
+    fi
+
+    # a manual build was started
+    if [[ "$CIRRUS_BUILD_SOURCE" == "api" && "$CIRRUS_COMMIT_MESSAGE" == "API build for null" ]]; then
+      export IS_SCHEDULE_DISPATCH="true"
+    fi
+
+    # only upload wheels to staging if it's a tag beginning with 'v' and you're
+    # on a maintenance branch
+    if [[ "$CIRRUS_TAG" == v* ]] && [[ $CIRRUS_TAG != *"dev0"* ]]; then
+      export IS_PUSH="true"    
+    fi
+
+    if [[ $IS_PUSH == "true" ]] || [[ $IS_SCHEDULE_DISPATCH == "true" ]]; then
+        # install miniconda in the home directory. For some reason HOME isn't set by Cirrus
+        export HOME=$PWD
+    
+        # install miniconda for uploading to anaconda
+        wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
+        bash miniconda.sh -b -p $HOME/miniconda3
+        $HOME/miniconda3/bin/conda init bash
+        source $HOME/miniconda3/bin/activate
+        conda install -y anaconda-client
+    
+        # The name of the zip file is derived from the `wheels_artifact` line.
+        # If you change the artifact line to `myfile_artifact` then it would be
+        # called myfile.zip
+        
+        curl https://api.cirrus-ci.com/v1/artifact/build/$CIRRUS_BUILD_ID/wheels.zip --output wheels.zip
+        unzip wheels.zip
+        
+        source ./tools/wheels/upload_wheels.sh
+        # IS_PUSH takes precedence over IS_SCHEDULE_DISPATCH
+        set_upload_vars
+        
+        # Will be skipped if not a push/tag/scheduled build
+        upload_wheels
+    fi
diff --git a/tools/ci/push_docs_to_repo.py b/tools/ci/push_docs_to_repo.py
index e34522a29..0471e3824 100755
--- a/tools/ci/push_docs_to_repo.py
+++ b/tools/ci/push_docs_to_repo.py
@@ -19,6 +19,8 @@ parser.add_argument('--committer', default='numpy-commit-bot',
                     help='Name of the git committer')
 parser.add_argument('--email', default='numpy-commit-bot@nomail',
                     help='Email of the git committer')
+parser.add_argument('--count', default=1, type=int,
+                    help="minimum number of expected files, defaults to 1")
 
 parser.add_argument(
     '--force', action='store_true',
@@ -31,6 +33,11 @@ if not os.path.exists(args.dir):
     print('Content directory does not exist')
     sys.exit(1)
 
+count = len([name for name in os.listdir(args.dir) if os.path.isfile(os.path.join(args.dir, name))])
+
+if count < args.count:
+    print(f"Expected {args.count} top-directory files to upload, got {count}")
+    sys.exit(1)
 
 def run(cmd, stdout=True):
     pipe = None if stdout else subprocess.DEVNULL
diff --git a/tools/cythonize.py b/tools/cythonize.py
index 311a52c91..d66ddb98c 100755
--- a/tools/cythonize.py
+++ b/tools/cythonize.py
@@ -52,7 +52,7 @@ def process_tempita_pyx(fromfile, tofile):
     import npy_tempita as tempita
 
     assert fromfile.endswith('.pyx.in')
-    with open(fromfile, "r") as f:
+    with open(fromfile) as f:
         tmpl = f.read()
     pyxcontent = tempita.sub(tmpl)
     pyxfile = fromfile[:-len('.pyx.in')] + '.pyx'
@@ -66,7 +66,7 @@ def process_tempita_pyd(fromfile, tofile):
 
     assert fromfile.endswith('.pxd.in')
     assert tofile.endswith('.pxd')
-    with open(fromfile, "r") as f:
+    with open(fromfile) as f:
         tmpl = f.read()
     pyxcontent = tempita.sub(tmpl)
     with open(tofile, "w") as f:
@@ -77,7 +77,7 @@ def process_tempita_pxi(fromfile, tofile):
 
     assert fromfile.endswith('.pxi.in')
     assert tofile.endswith('.pxi')
-    with open(fromfile, "r") as f:
+    with open(fromfile) as f:
         tmpl = f.read()
     pyxcontent = tempita.sub(tmpl)
     with open(tofile, "w") as f:
@@ -88,7 +88,7 @@ def process_tempita_pxd(fromfile, tofile):
 
     assert fromfile.endswith('.pxd.in')
     assert tofile.endswith('.pxd')
-    with open(fromfile, "r") as f:
+    with open(fromfile) as f:
         tmpl = f.read()
     pyxcontent = tempita.sub(tmpl)
     with open(tofile, "w") as f:
@@ -109,7 +109,7 @@ def load_hashes(filename):
     # Return { filename : (sha256 of input, sha256 of output) }
     if os.path.isfile(filename):
         hashes = {}
-        with open(filename, 'r') as f:
+        with open(filename) as f:
             for line in f:
                 filename, inhash, outhash = line.split()
                 hashes[filename] = (inhash, outhash)
diff --git a/tools/gitpod/Dockerfile b/tools/gitpod/Dockerfile
deleted file mode 100644
index dd5561750..000000000
--- a/tools/gitpod/Dockerfile
+++ /dev/null
@@ -1,101 +0,0 @@
-#
-# Dockerfile for NumPy development
-#
-# Usage:
-# -------
-#
-# To make a local build of the container, from the 'Docker-dev' directory:
-# docker build  --rm -f "Dockerfile" -t <build-tag> "."
-#
-# To use the container use the following command. It assumes that you are in
-# the root folder of the NumPy git repository, making it available as
-# /home/numpy in the container. Whatever changes you make to that directory
-# are visible in the host and container.
-# The docker image is retrieved from the NumPy dockerhub repository
-#
-# docker run --rm -it -v $(pwd):/home/numpy numpy/numpy-dev:<image-tag>
-#
-# By default the container will activate the conda environment numpy-dev
-# which contains all the dependencies needed for NumPy development
-#
-# To build NumPy run: python setup.py build_ext --inplace
-#
-# To run the tests use: python runtests.py
-#
-# This image is based on: Ubuntu 20.04 (focal)
-# https://hub.docker.com/_/ubuntu/?tab=tags&name=focal
-# OS/ARCH: linux/amd64
-FROM gitpod/workspace-base:latest
-
-ARG MAMBAFORGE_VERSION="4.11.0-0"
-ARG CONDA_ENV=numpy-dev
-
-
-# ---- Configure environment ----
-ENV CONDA_DIR=/home/gitpod/mambaforge3 \
-    SHELL=/bin/bash
-ENV PATH=${CONDA_DIR}/bin:$PATH \
-    WORKSPACE=/workspace/numpy
-
-
-# -----------------------------------------------------------------------------
-# ---- Creating as root - note: make sure to change to gitpod in the end ----
-USER root
-
-# hadolint ignore=DL3008
-RUN apt-get update && \
-    apt-get install -yq --no-install-recommends \
-    ca-certificates \
-    dirmngr \
-    dvisvgm \
-    gnupg \
-    gpg-agent \
-    texlive-latex-extra \
-    vim && \
-    # this needs to be done after installing dirmngr
-    apt-key adv --keyserver keyserver.ubuntu.com --recv-key 23F3D4EA75716059 && \ 
-    apt-add-repository https://cli.github.com/packages && \ 
-    apt-get install -yq --no-install-recommends \
-    gh && \ 
-    locale-gen en_US.UTF-8 && \
-    apt-get clean && \
-    rm -rf /var/cache/apt/* &&\
-    rm -rf /var/lib/apt/lists/* &&\
-    rm -rf /tmp/*
-
-# Allows this Dockerfile to activate conda environments
-SHELL ["/bin/bash", "--login", "-o", "pipefail", "-c"]
-
-# -----------------------------------------------------------------------------
-# ---- Installing mamba  ----
-RUN wget -q -O mambaforge3.sh \
-    "https://github.com/conda-forge/miniforge/releases/download/$MAMBAFORGE_VERSION/Mambaforge-$MAMBAFORGE_VERSION-Linux-x86_64.sh" && \
-    bash mambaforge3.sh -p ${CONDA_DIR} -b && \
-    rm mambaforge3.sh
-
-# -----------------------------------------------------------------------------
-# ---- Copy needed files ----
-# basic workspace configurations
-COPY ./tools/gitpod/workspace_config /usr/local/bin/workspace_config
-
-RUN chmod a+rx /usr/local/bin/workspace_config && \
-    workspace_config
-
-# Copy conda environment file into the container - this needs to exists inside 
-# the container to create a conda environment from it
-COPY environment.yml /tmp/environment.yml
-
-# -----------------------------------------------------------------------------
-# ---- Create conda environment ----
-# Install NumPy dependencies
-RUN mamba env create -f /tmp/environment.yml && \
-    conda activate ${CONDA_ENV} && \
-    mamba install ccache -y && \
-    # needed for docs rendering later on
-    python -m pip install --no-cache-dir sphinx-autobuild && \
-    conda clean --all -f -y && \
-    rm -rf /tmp/*
-
-# -----------------------------------------------------------------------------
-# Always make sure we are not root
-USER gitpod
-\ No newline at end of file
diff --git a/tools/gitpod/gitpod.Dockerfile b/tools/gitpod/gitpod.Dockerfile
deleted file mode 100644
index 8dac0d597..000000000
--- a/tools/gitpod/gitpod.Dockerfile
+++ /dev/null
@@ -1,49 +0,0 @@
-# Doing a local shallow clone - keeps the container secure
-# and much slimmer than using COPY directly or making a 
-# remote clone
-ARG BASE_CONTAINER="numpy/numpy-dev:latest"
-FROM gitpod/workspace-base:latest as clone
-
-COPY --chown=gitpod . /tmp/numpy_repo
-
-# the clone should be deep enough for versioneer to work
-RUN git clone --shallow-since=2021-05-22 file:////tmp/numpy_repo /tmp/numpy
-
-# -----------------------------------------------------------------------------
-# Using the numpy-dev Docker image as a base
-# This way, we ensure we have all the needed compilers and dependencies
-# while reducing the build time
-FROM ${BASE_CONTAINER} as build
-
-# -----------------------------------------------------------------------------
-USER root
-
-# -----------------------------------------------------------------------------
-# ---- ENV variables ----
-# ---- Directories needed ----
-ENV WORKSPACE=/workspace/numpy/ \
-    CONDA_ENV=numpy-dev
-
-# Allows this Dockerfile to activate conda environments
-SHELL ["/bin/bash", "--login", "-o", "pipefail", "-c"]
-
-# Copy over the shallow clone
-COPY --from=clone --chown=gitpod /tmp/numpy ${WORKSPACE}
-
-# Everything happens in the /workspace/numpy directory
-WORKDIR ${WORKSPACE}
-
-# Build numpy to populate the cache used by ccache
-RUN git config --global --add safe.directory /workspace/numpy
-RUN git submodule update --init --depth=1 -- numpy/core/src/umath/svml
-RUN conda activate ${CONDA_ENV} && \ 
-    python setup.py build_ext --inplace && \
-    ccache -s
-
-# Gitpod will load the repository into /workspace/numpy. We remove the
-# directory from the image to prevent conflicts
-RUN rm -rf ${WORKSPACE}
-
-# -----------------------------------------------------------------------------
-# Always return to non privileged user
-USER gitpod
diff --git a/tools/gitpod/settings.json b/tools/gitpod/settings.json
deleted file mode 100644
index 50296336d..000000000
--- a/tools/gitpod/settings.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-    "restructuredtext.updateOnTextChanged": "true",
-    "restructuredtext.updateDelay": 300,
-    "restructuredtext.linter.disabledLinters": ["doc8","rst-lint", "rstcheck"],
-    "python.defaultInterpreterPath": "/home/gitpod/mambaforge3/envs/numpy-dev/bin/python",
-    "esbonio.sphinx.buildDir": "${workspaceRoot}/doc/build/html",
-    "esbonio.sphinx.confDir": ""
-}
-\ No newline at end of file
diff --git a/tools/gitpod/workspace_config b/tools/gitpod/workspace_config
deleted file mode 100644
index aa859c9be..000000000
--- a/tools/gitpod/workspace_config
+++ /dev/null
@@ -1,58 +0,0 @@
-#!/bin/bash
-# Basic configurations for the workspace
-
-set -e
-
-# gitpod/workspace-base needs at least one file here
-touch /home/gitpod/.bashrc.d/empty
-
-# Add git aliases
-git config --global alias.co checkout
-git config --global alias.ci commit
-git config --global alias.st status
-git config --global alias.br branch
-git config --global alias.hist "log --pretty=format:'%h %ad | %s%d [%an]' --graph --date=short"
-git config --global alias.type 'cat-file -t'
-git config --global alias.dump 'cat-file -p'
-
-# Enable basic vim defaults in ~/.vimrc
-echo "filetype plugin indent on" >>~/.vimrc
-echo "set colorcolumn=80" >>~/.vimrc
-echo "set number" >>~/.vimrc
-echo "syntax enable" >>~/.vimrc
-
-# Vanity custom bash prompt - makes it more legible
-echo "PS1='\[\e]0;\u \w\a\]\[\033[01;36m\]\u\[\033[m\] > \[\033[38;5;141m\]\w\[\033[m\] \\$ '" >>~/.bashrc
-
-# Enable prompt color in the skeleton .bashrc
-# hadolint ignore=SC2016
-sed -i 's/^#force_color_prompt=yes/force_color_prompt=yes/' /etc/skel/.bashrc
-
-# .gitpod.yml is configured to install NumPy from /workspace/numpy
-echo "export PYTHONPATH=${WORKSPACE}" >>~/.bashrc
-
-# make conda activate command available from /bin/bash (login and interactive)
-if [[ ! -f "/etc/profile.d/conda.sh" ]]; then
-    ln -s ${CONDA_DIR}/etc/profile.d/conda.sh /etc/profile.d/conda.sh
-fi
-echo ". ${CONDA_DIR}/etc/profile.d/conda.sh" >>~/.bashrc
-echo "conda activate numpy-dev" >>~/.bashrc
-
-# Enable prompt color in the skeleton .bashrc
-# hadolint ignore=SC2016
-sed -i 's/^#force_color_prompt=yes/force_color_prompt=yes/' /etc/skel/.bashrc
-
-# .gitpod.yml is configured to install numpy from /workspace/numpy
-echo "export PYTHONPATH=/workspace/numpy" >>~/.bashrc
-
-# Set up ccache for compilers for this Dockerfile
-# REF: https://github.com/conda-forge/compilers-feedstock/issues/31
-echo "conda activate numpy-dev" >>~/.startuprc
-echo "export CC=\"ccache \$CC\"" >>~/.startuprc
-echo "export CXX=\"ccache \$CXX\"" >>~/.startuprc
-echo "export F77=\"ccache \$F77\"" >>~/.startuprc
-echo "export F90=\"ccache \$F90\"" >>~/.startuprc
-echo "export GFORTRAN=\"ccache \$GFORTRAN\"" >>~/.startuprc
-echo "export FC=\"ccache \$FC\"" >>~/.startuprc
-echo "source ~/.startuprc" >>~/.profile
-echo "source ~/.startuprc" >>~/.bashrc
diff --git a/tools/list_installed_dll_dependencies_cygwin.sh b/tools/list_installed_dll_dependencies_cygwin.sh
index ee06ae0d0..78436fe53 100644
--- a/tools/list_installed_dll_dependencies_cygwin.sh
+++ b/tools/list_installed_dll_dependencies_cygwin.sh
@@ -14,11 +14,11 @@
 py_ver=${1}
 dll_list=`/bin/dash tools/list_numpy_dlls.sh ${py_ver}`
 echo "Checks for existence, permissions and file type"
-ls -l ${dll_list}
-file ${dll_list}
+/usr/bin/timeout 10m /usr/bin/ls -l ${dll_list}
+/usr/bin/timeout 10m /usr/bin/file ${dll_list}
 echo "Dependency checks"
-ldd ${dll_list} | grep -F -e " => not found" && exit 1
-cygcheck ${dll_list} >cygcheck_dll_list 2>cygcheck_missing_deps
+/usr/bin/timeout 10m /usr/bin/ldd ${dll_list} | grep -F -e " => not found" && exit 1
+/usr/bin/timeout 10m /usr/bin/cygcheck ${dll_list} >cygcheck_dll_list 2>cygcheck_missing_deps
 grep -F -e "cygcheck: track_down: could not find " cygcheck_missing_deps && exit 1
 echo "Import tests"
 mkdir -p dist/
@@ -31,5 +31,5 @@ do
 			 -e "s/^\/+(home|usr).*?site-packages\/+//" \
 			 -e "s/.cpython-3.m?-x86(_64)?-cygwin.dll$//" \
 			 -e "s/\//./g"`
-    python${py_ver} -c "import ${ext_module}"
+    /usr/bin/timeout 2m /usr/bin/python${py_ver} -c "import ${ext_module}"
 done
diff --git a/tools/openblas_support.py b/tools/openblas_support.py
index ce677f9a5..fd8c6a97a 100644
--- a/tools/openblas_support.py
+++ b/tools/openblas_support.py
@@ -13,13 +13,14 @@ from tempfile import mkstemp, gettempdir
 from urllib.request import urlopen, Request
 from urllib.error import HTTPError
 
-OPENBLAS_V = '0.3.20'
-OPENBLAS_LONG = 'v0.3.20'
+OPENBLAS_V = '0.3.23'
+OPENBLAS_LONG = 'v0.3.23'
 BASE_LOC = 'https://anaconda.org/multibuild-wheels-staging/openblas-libs'
 BASEURL = f'{BASE_LOC}/{OPENBLAS_LONG}/download'
 SUPPORTED_PLATFORMS = [
     'linux-aarch64',
     'linux-x86_64',
+    'musllinux-x86_64',
     'linux-i686',
     'linux-ppc64le',
     'linux-s390x',
@@ -54,14 +55,40 @@ def get_ilp64():
 
 
 def get_manylinux(arch):
-    if arch in ('x86_64', 'i686'):
-        default = '2010'
-    else:
-        default = '2014'
-    ret = os.environ.get("MB_ML_VER", default)
+    default = '2014'
+    ml_ver = os.environ.get("MB_ML_VER", default)
     # XXX For PEP 600 this can be a glibc version
-    assert ret in ('1', '2010', '2014', '_2_24'), f'invalid MB_ML_VER {ret}'
-    return ret
+    assert ml_ver in ('2010', '2014', '_2_24'), f'invalid MB_ML_VER {ml_ver}'
+    suffix = f'manylinux{ml_ver}_{arch}.tar.gz'
+    return suffix
+
+
+def get_musllinux(arch):
+    musl_ver = "1_1"
+    suffix = f'musllinux_{musl_ver}_{arch}.tar.gz'
+    return suffix
+
+
+def get_linux(arch):
+    # best way of figuring out whether manylinux or musllinux is to look
+    # at the packaging tags. If packaging isn't installed (it's not by default)
+    # fallback to sysconfig (which may be flakier)
+    try:
+        from packaging.tags import sys_tags
+        tags = list(sys_tags())
+        plat = tags[0].platform
+    except ImportError:
+        # fallback to sysconfig for figuring out if you're using musl
+        plat = 'manylinux'
+        # value could be None
+        v = sysconfig.get_config_var('HOST_GNU_TYPE') or ''
+        if 'musl' in v:
+            plat = 'musllinux'
+
+    if 'manylinux' in plat:
+        return get_manylinux(arch)
+    elif 'musllinux' in plat:
+        return get_musllinux(arch)
 
 
 def download_openblas(target, plat, ilp64):
@@ -73,20 +100,22 @@ def download_openblas(target, plat, ilp64):
                 '(KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.3')}
     suffix = None
     if osname == "linux":
-        ml_ver = get_manylinux(arch)
-        suffix = f'manylinux{ml_ver}_{arch}.tar.gz'
+        suffix = get_linux(arch)
+        typ = 'tar.gz'
+    elif osname == "musllinux":
+        suffix = get_musllinux(arch)
         typ = 'tar.gz'
     elif plat == 'macosx-x86_64':
-        suffix = 'macosx_10_9_x86_64-gf_1becaaa.tar.gz'
+        suffix = 'macosx_10_9_x86_64-gf_c469a42.tar.gz'
         typ = 'tar.gz'
     elif plat == 'macosx-arm64':
-        suffix = 'macosx_11_0_arm64-gf_f26990f.tar.gz'
+        suffix = 'macosx_11_0_arm64-gf_5272328.tar.gz'
         typ = 'tar.gz'
     elif osname == 'win':
         if plat == "win-32":
-            suffix = 'win32-gcc_8_1_0.zip'
+            suffix = 'win32-gcc_8_3_0.zip'
         else:
-            suffix = 'win_amd64-gcc_8_1_0.zip'
+            suffix = 'win_amd64-gcc_10_3_0.zip'
         typ = 'zip'
 
     if not suffix:
@@ -102,11 +131,11 @@ def download_openblas(target, plat, ilp64):
     if response.status != 200:
         print(f'Could not download "{filename}"', file=sys.stderr)
         return None
-    print(f"Downloading {length} from {filename}", file=sys.stderr)
+    # print(f"Downloading {length} from {filename}", file=sys.stderr)
     data = response.read()
     # Verify hash
     key = os.path.basename(filename)
-    print("Saving to file", file=sys.stderr)
+    # print("Saving to file", file=sys.stderr)
     with open(target, 'wb') as fid:
         fid.write(data)
     return typ
@@ -133,28 +162,34 @@ def setup_openblas(plat=get_plat(), ilp64=get_ilp64()):
     if osname == 'win':
         if not typ == 'zip':
             return f'expecting to download zipfile on windows, not {typ}'
-        return unpack_windows_zip(tmp)
+        return unpack_windows_zip(tmp, plat)
     else:
         if not typ == 'tar.gz':
             return 'expecting to download tar.gz, not %s' % str(typ)
         return unpack_targz(tmp)
 
 
-def unpack_windows_zip(fname):
+def unpack_windows_zip(fname, plat):
+    unzip_base = os.path.join(gettempdir(), 'openblas')
+    if not os.path.exists(unzip_base):
+        os.mkdir(unzip_base)
     with zipfile.ZipFile(fname, 'r') as zf:
-        # Get the openblas.a file, but not openblas.dll.a nor openblas.dev.a
-        lib = [x for x in zf.namelist() if OPENBLAS_LONG in x and
-               x.endswith('a') and not x.endswith('dll.a') and
-               not x.endswith('dev.a')]
-        if not lib:
-            return 'could not find libopenblas_%s*.a ' \
-                    'in downloaded zipfile' % OPENBLAS_LONG
-        if get_ilp64() is None:
-            target = os.path.join(gettempdir(), 'openblas.a')
-        else:
-            target = os.path.join(gettempdir(), 'openblas64_.a')
-        with open(target, 'wb') as fid:
-            fid.write(zf.read(lib[0]))
+        zf.extractall(unzip_base)
+    if plat == "win-32":
+        target = os.path.join(unzip_base, "32")
+    else:
+        target = os.path.join(unzip_base, "64")
+    # Copy the lib to openblas.lib. Once we can properly use pkg-config
+    # this will not be needed
+    lib = glob.glob(os.path.join(target, 'lib', '*.lib'))
+    assert len(lib) == 1
+    for f in lib:
+        shutil.copy(f, os.path.join(target, 'lib', 'openblas.lib'))
+        shutil.copy(f, os.path.join(target, 'lib', 'openblas64_.lib'))
+    # Copy the dll from bin to lib so system_info can pick it up
+    dll = glob.glob(os.path.join(target, 'bin', '*.dll'))
+    for f in dll:
+        shutil.copy(f, os.path.join(target, 'lib'))
     return target
 
 
@@ -197,7 +232,7 @@ def make_init(dirname):
     '''
     Create a _distributor_init.py file for OpenBlas
     '''
-    with open(os.path.join(dirname, '_distributor_init.py'), 'wt') as fid:
+    with open(os.path.join(dirname, '_distributor_init.py'), 'w') as fid:
         fid.write(textwrap.dedent("""
             '''
             Helper to preload windows dlls to prevent dll not found errors.
@@ -235,7 +270,8 @@ def make_init(dirname):
 
 def test_setup(plats):
     '''
-    Make sure all the downloadable files exist and can be opened
+    Make sure all the downloadable files needed for wheel building
+    exist and can be opened
     '''
     def items():
         """ yields all combinations of arch, ilp64
@@ -243,19 +279,8 @@ def test_setup(plats):
         for plat in plats:
             yield plat, None
             osname, arch = plat.split("-")
-            if arch not in ('i686', 'arm64', '32'):
+            if arch not in ('i686', '32'):
                 yield plat, '64_'
-            if osname == "linux" and arch in ('i686', 'x86_64'):
-                oldval = os.environ.get('MB_ML_VER', None)
-                os.environ['MB_ML_VER'] = '1'
-                yield plat, None
-                # Once we create x86_64 and i686 manylinux2014 wheels...
-                # os.environ['MB_ML_VER'] = '2014'
-                # yield arch, None, False
-                if oldval:
-                    os.environ['MB_ML_VER'] = oldval
-                else:
-                    os.environ.pop('MB_ML_VER')
 
     errs = []
     for plat, ilp64 in items():
@@ -273,16 +298,12 @@ def test_setup(plats):
                 continue
             if not target:
                 raise RuntimeError(f'Could not setup {plat}')
-            print(target)
-            if osname == 'win':
-                if not target.endswith('.a'):
-                    raise RuntimeError("Not .a extracted!")
-            else:
-                files = glob.glob(os.path.join(target, "lib", "*.a"))
-                if not files:
-                    raise RuntimeError("No lib/*.a unpacked!")
+            print('success with', plat, ilp64)
+            files = glob.glob(os.path.join(target, "lib", "*.a"))
+            if not files:
+                raise RuntimeError("No lib/*.a unpacked!")
         finally:
-            if target is not None:
+            if target:
                 if os.path.isfile(target):
                     os.unlink(target)
                 else:
@@ -291,32 +312,24 @@ def test_setup(plats):
         raise errs[0]
 
 
-def test_version(expected_version, ilp64=get_ilp64()):
+def test_version(expected_version=None):
     """
     Assert that expected OpenBLAS version is
-    actually available via NumPy
+    actually available via NumPy. Requires threadpoolctl
     """
     import numpy
-    import ctypes
+    import threadpoolctl
 
-    dll = ctypes.CDLL(numpy.core._multiarray_umath.__file__)
-    if ilp64 == "64_":
-        get_config = dll.openblas_get_config64_
-    else:
-        get_config = dll.openblas_get_config
-    get_config.restype = ctypes.c_char_p
-    res = get_config()
-    print('OpenBLAS get_config returned', str(res))
+    data = threadpoolctl.threadpool_info()
+    if len(data) != 1:
+        raise ValueError(f"expected single threadpool_info result, got {data}")
     if not expected_version:
         expected_version = OPENBLAS_V
-    check_str = b'OpenBLAS %s' % expected_version.encode()
-    print(check_str)
-    assert check_str in res, f'{expected_version} not found in {res}'
-    if ilp64:
-        assert b"USE64BITINT" in res
-    else:
-        assert b"USE64BITINT" not in res
-
+    if data[0]['version'] != expected_version:
+        raise ValueError(
+            f"expected OpenBLAS version {expected_version}, got {data}"
+        )
+    print("OK")
 
 if __name__ == '__main__':
     import argparse
diff --git a/tools/rebase_installed_dlls_cygwin.sh b/tools/rebase_installed_dlls_cygwin.sh
index f772879d9..58d4f0c0f 100644
--- a/tools/rebase_installed_dlls_cygwin.sh
+++ b/tools/rebase_installed_dlls_cygwin.sh
@@ -2,4 +2,6 @@
 # Rebase the dlls installed by NumPy
 
 py_ver=${1}
-/usr/bin/rebase --database --oblivious `/bin/dash tools/list_numpy_dlls.sh ${py_ver}`
+numpy_dlls="`/bin/dash tools/list_numpy_dlls.sh ${py_ver}`"
+/usr/bin/rebase --verbose --database --oblivious ${numpy_dlls}
+/usr/bin/rebase --verbose --info ${numpy_dlls}
diff --git a/tools/refguide_check.py b/tools/refguide_check.py
index 1d230b385..2c69b9464 100644
--- a/tools/refguide_check.py
+++ b/tools/refguide_check.py
@@ -99,12 +99,6 @@ DOCTEST_SKIPDICT = {
     'numpy.lib.DataSource': None,
     'numpy.lib.Repository': None,
 }
-if sys.version_info < (3, 9):
-    DOCTEST_SKIPDICT.update({
-        "numpy.core.ndarray": {"__class_getitem__"},
-        "numpy.core.dtype": {"__class_getitem__"},
-        "numpy.core.number": {"__class_getitem__"},
-    })
 
 # Skip non-numpy RST files, historical release notes
 # Any single-directory exact match will skip the directory and all subdirs.
diff --git a/tools/swig/pyfragments.swg b/tools/swig/pyfragments.swg
index eac817322..6d3e6ff41 100644
--- a/tools/swig/pyfragments.swg
+++ b/tools/swig/pyfragments.swg
@@ -52,6 +52,7 @@
     }
 %#endif
     if (!PyArray_IsScalar(obj,Integer)) return SWIG_TypeError;
+    if (!val) return SWIG_OK;
     PyArray_Descr * longDescr = PyArray_DescrFromType(NPY_LONG);
     PyArray_CastScalarToCtype(obj, (void*)val, longDescr);
     Py_DECREF(longDescr);
@@ -102,6 +103,7 @@
     }
 %#endif
     if (!PyArray_IsScalar(obj,Integer)) return SWIG_TypeError;
+    if (!val) return SWIG_OK;
     PyArray_Descr * ulongDescr = PyArray_DescrFromType(NPY_ULONG);
     PyArray_CastScalarToCtype(obj, (void*)val, ulongDescr);
     Py_DECREF(ulongDescr);
diff --git a/tools/travis-test.sh b/tools/travis-test.sh
index 96bc33926..67e507eb3 100755
--- a/tools/travis-test.sh
+++ b/tools/travis-test.sh
@@ -33,7 +33,7 @@ werrors="$werrors -Werror=implicit-function-declaration"
 setup_base()
 {
   # use default python flags but remove sign-compare
-  sysflags="$($PYTHON -c "from distutils import sysconfig; \
+  sysflags="$($PYTHON -c "import sysconfig; \
     print (sysconfig.get_config_var('CFLAGS'))")"
   export CFLAGS="$sysflags $werrors -Wlogical-op -Wno-sign-compare"
 
@@ -143,6 +143,7 @@ EOF
   fi
 
   if [ -n "$CHECK_BLAS" ]; then
+    $PYTHON -m pip install threadpoolctl
     $PYTHON ../tools/openblas_support.py --check_version
   fi
 
@@ -180,7 +181,7 @@ EOF
     pushd ../benchmarks
     $PYTHON `which asv` check --python=same
     $PYTHON `which asv` machine --machine travis
-    $PYTHON `which asv` dev 2>&1| tee asv-output.log
+    $PYTHON `which asv` dev -q 2>&1| tee asv-output.log
     if grep -q Traceback asv-output.log; then
       echo "Some benchmarks have errors!"
       exit 1
diff --git a/tools/wheels/check_license.py b/tools/wheels/check_license.py
index 0fe7356c0..8ced317d6 100644
--- a/tools/wheels/check_license.py
+++ b/tools/wheels/check_license.py
@@ -9,7 +9,6 @@ distribution.
 """
 import os
 import sys
-import io
 import re
 import argparse
 
@@ -36,7 +35,7 @@ def main():
 
     # Check license text
     license_txt = os.path.join(os.path.dirname(mod.__file__), "LICENSE.txt")
-    with io.open(license_txt, "r", encoding="utf-8") as f:
+    with open(license_txt, encoding="utf-8") as f:
         text = f.read()
 
     ok = check_text(text)
diff --git a/tools/wheels/cibw_before_build.sh b/tools/wheels/cibw_before_build.sh
index 5cf7d8088..493cceeae 100644
--- a/tools/wheels/cibw_before_build.sh
+++ b/tools/wheels/cibw_before_build.sh
@@ -15,44 +15,37 @@ fi
 # Install Openblas
 if [[ $RUNNER_OS == "Linux" || $RUNNER_OS == "macOS" ]] ; then
     basedir=$(python tools/openblas_support.py)
-    cp -r $basedir/lib/* /usr/local/lib
-    cp $basedir/include/* /usr/local/include
     if [[ $RUNNER_OS == "macOS" && $PLATFORM == "macosx-arm64" ]]; then
+        # /usr/local/lib doesn't exist on cirrus-ci runners
+        sudo mkdir -p /usr/local/lib /usr/local/include /usr/local/lib/cmake/openblas
         sudo mkdir -p /opt/arm64-builds/lib /opt/arm64-builds/include
         sudo chown -R $USER /opt/arm64-builds
         cp -r $basedir/lib/* /opt/arm64-builds/lib
         cp $basedir/include/* /opt/arm64-builds/include
+        sudo cp -r $basedir/lib/* /usr/local/lib
+        sudo cp $basedir/include/* /usr/local/include
+    else
+        cp -r $basedir/lib/* /usr/local/lib
+        cp $basedir/include/* /usr/local/include
     fi
 elif [[ $RUNNER_OS == "Windows" ]]; then
     PYTHONPATH=tools python -c "import openblas_support; openblas_support.make_init('numpy')"
     target=$(python tools/openblas_support.py)
+    ls /tmp
     mkdir -p openblas
-    cp $target openblas
+    # bash on windows does not like cp -r $target/* openblas
+    for f in $(ls $target); do
+        cp -r $target/$f openblas
+    done
+    ls openblas
 fi
 
-# Install GFortran
 if [[ $RUNNER_OS == "macOS" ]]; then
-    # same version of gfortran as the openblas-libs and numpy-wheel builds
-    curl -L https://github.com/MacPython/gfortran-install/raw/master/archives/gfortran-4.9.0-Mavericks.dmg -o gfortran.dmg
-    GFORTRAN_SHA256=$(shasum -a 256 gfortran.dmg)
-    KNOWN_SHA256="d2d5ca5ba8332d63bbe23a07201c4a0a5d7e09ee56f0298a96775f928c3c4b30  gfortran.dmg"
-    if [ "$GFORTRAN_SHA256" != "$KNOWN_SHA256" ]; then
-        echo sha256 mismatch
-        exit 1
-    fi
-
-    hdiutil attach -mountpoint /Volumes/gfortran gfortran.dmg
-    sudo installer -pkg /Volumes/gfortran/gfortran.pkg -target /
-    otool -L /usr/local/gfortran/lib/libgfortran.3.dylib
-
-    # arm64 stuff from gfortran_utils
+    # Install same version of gfortran as the openblas-libs builds
     if [[ $PLATFORM == "macosx-arm64" ]]; then
-        source $PROJECT_DIR/tools/wheels/gfortran_utils.sh
-        install_arm64_cross_gfortran
+        PLAT="arm64"
     fi
-
-    # Manually symlink gfortran-4.9 to plain gfortran for f2py.
-    # No longer needed after Feb 13 2020 as gfortran is already present
-    # and the attempted link errors. Keep this for future reference.
-    # ln -s /usr/local/bin/gfortran-4.9 /usr/local/bin/gfortran
+    source $PROJECT_DIR/tools/wheels/gfortran_utils.sh
+    install_gfortran
+    pip install "delocate==0.10.4"
 fi
diff --git a/tools/wheels/cibw_test_command.sh b/tools/wheels/cibw_test_command.sh
index cbc735fc0..36c275f32 100644
--- a/tools/wheels/cibw_test_command.sh
+++ b/tools/wheels/cibw_test_command.sh
@@ -1,23 +1,35 @@
-# This script is used by .github/workflows/wheels.yml to build wheels with
-# cibuildwheel. It runs the full test suite, checks for lincense inclusion
-# and that the openblas version is correct.
+# This script is used by .github/workflows/wheels.yml to run the full test
+# suite, checks for lincense inclusion and that the openblas version is correct.
 set -xe
 
 PROJECT_DIR="$1"
 
+python -m pip install threadpoolctl
 python -c "import numpy; numpy.show_config()"
 if [[ $RUNNER_OS == "Windows" ]]; then
     # GH 20391
     PY_DIR=$(python -c "import sys; print(sys.prefix)")
     mkdir $PY_DIR/libs
 fi
-
+if [[ $RUNNER_OS == "macOS"  && $RUNNER_ARCH == "X64" ]]; then
+  # Not clear why this is needed but it seems on x86_64 this is not the default
+  # and without it f2py tests fail
+  export DYLD_LIBRARY_PATH="$DYLD_LIBRARY_PATH:/usr/local/lib"
+  # Needed so gfortran (not clang) can find system libraries like libm (-lm)
+  # in f2py tests
+  export LIBRARY_PATH="$LIBRARY_PATH:/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/lib"
+else
+  # For some reason the macos-x86_64 runner does not work with threadpoolctl
+  # Skip this check there
+  python $PROJECT_DIR/tools/openblas_support.py --check_version
+fi
 # Set available memory value to avoid OOM problems on aarch64.
 # See gh-22418.
 export NPY_AVAILABLE_MEM="4 GB"
-python -c "import sys; import numpy; sys.exit(not numpy.test('full'))"
-
-python $PROJECT_DIR/tools/wheels/check_license.py
-if [[ $UNAME == "Linux" || $UNAME == "Darwin" ]] ; then
-    python $PROJECT_DIR/tools/openblas_support.py --check_version
+if [[ $(python -c "import sys; print(sys.implementation.name)") == "pypy" ]]; then
+  # make PyPy more verbose, try to catch a segfault
+  python -c "import sys; import numpy; sys.exit(not numpy.test(label='full', verbose=2))"
+else
+  python -c "import sys; import numpy; sys.exit(not numpy.test(label='full'))"
 fi
+python $PROJECT_DIR/tools/wheels/check_license.py
diff --git a/tools/wheels/gfortran_utils.sh b/tools/wheels/gfortran_utils.sh
index e0c7054a5..9102ba127 100644
--- a/tools/wheels/gfortran_utils.sh
+++ b/tools/wheels/gfortran_utils.sh
@@ -26,7 +26,6 @@
 
 # Bash utilities for use with gfortran
 
-GF_LIB_URL="https://3f23b170c54c2533c070-1c8a9b3114517dc5fe17b7c3f8c63a43.ssl.cf2.rackcdn.com"
 ARCHIVE_SDIR="${ARCHIVE_SDIR:-archives}"
 
 GF_UTIL_DIR=$(dirname "${BASH_SOURCE[0]}")
@@ -52,9 +51,8 @@ function get_distutils_platform {
         echo "manylinux1_$plat"
         return
     fi
-    # macOS 32-bit arch is i386
-    [ "$plat" == "i686" ] && plat="i386"
-    local target=$(echo $MACOSX_DEPLOYMENT_TARGET | tr .- _)
+    # The gfortran downloads build for macos 10.9
+    local target="10_9"
     echo "macosx_${target}_${plat}"
 }
 
@@ -78,15 +76,14 @@ function get_distutils_platform_ex {
         echo "manylinux${mb_ml_ver}_${plat}"
         return
     fi
-    # macOS 32-bit arch is i386
-    [ "$plat" == "i686" ] && plat="i386"
-    local target=$(echo $MACOSX_DEPLOYMENT_TARGET | tr .- _)
+    # The gfortran downloads build for macos 10.9
+    local target="10_9"
     echo "macosx_${target}_${plat}"
 }
 
 function get_macosx_target {
     # Report MACOSX_DEPLOYMENT_TARGET as given by distutils get_platform.
-    python -c "import sysconfig as s; print(s.get_config_vars()['MACOSX_DEPLOYMENT_TARGET'])"
+    python3 -c "import sysconfig as s; print(s.get_config_vars()['MACOSX_DEPLOYMENT_TARGET'])"
 }
 
 function check_gfortran {
@@ -107,9 +104,6 @@ function get_gf_lib_for_suf {
     if [ -n "$suffix" ]; then suffix="-$suffix"; fi
     local fname="$prefix-${plat_tag}${suffix}.tar.gz"
     local out_fname="${ARCHIVE_SDIR}/$fname"
-    if [ ! -e "$out_fname" ]; then
-        curl -L "${GF_LIB_URL}/$fname" > $out_fname || (echo "Fetch of $out_fname failed"; exit 1)
-    fi
     [ -s $out_fname ] || (echo "$out_fname is empty"; exit 24)
     echo "$out_fname"
 }
@@ -117,24 +111,54 @@ function get_gf_lib_for_suf {
 if [ "$(uname)" == "Darwin" ]; then
     mac_target=${MACOSX_DEPLOYMENT_TARGET:-$(get_macosx_target)}
     export MACOSX_DEPLOYMENT_TARGET=$mac_target
-    GFORTRAN_DMG="${GF_UTIL_DIR}/archives/gfortran-4.9.0-Mavericks.dmg"
-    export GFORTRAN_SHA="$(shasum $GFORTRAN_DMG)"
-
-    function install_arm64_cross_gfortran {
-        curl -L -O https://github.com/isuruf/gcc/releases/download/gcc-10-arm-20210228/gfortran-darwin-arm64.tar.gz
-        export GFORTRAN_SHA=f26990f6f08e19b2ec150b9da9d59bd0558261dd
-        if [[ "$(shasum gfortran-darwin-arm64.tar.gz)" != "${GFORTRAN_SHA}  gfortran-darwin-arm64.tar.gz" ]]; then
-            echo "shasum mismatch for gfortran-darwin-arm64"
+    # Keep this for now as some builds might depend on this being
+    # available before install_gfortran is called
+    export GFORTRAN_SHA=c469a420d2d003112749dcdcbe3c684eef42127e
+    # Set SDKROOT env variable if not set
+    export SDKROOT=${SDKROOT:-$(xcrun --show-sdk-path)}
+
+    function download_and_unpack_gfortran {
+	local arch=$1
+	local type=$2
+        curl -L -O https://github.com/isuruf/gcc/releases/download/gcc-11.3.0-2/gfortran-darwin-${arch}-${type}.tar.gz
+	case ${arch}-${type} in
+	    arm64-native)
+	        export GFORTRAN_SHA=0d5c118e5966d0fb9e7ddb49321f63cac1397ce8
+		;;
+	    arm64-cross)
+		export GFORTRAN_SHA=527232845abc5af21f21ceacc46fb19c190fe804
+		;;
+	    x86_64-native)
+		export GFORTRAN_SHA=c469a420d2d003112749dcdcbe3c684eef42127e
+		;;
+	    x86_64-cross)
+		export GFORTRAN_SHA=107604e57db97a0ae3e7ca7f5dd722959752f0b3
+		;;
+	esac
+        if [[ "$(shasum gfortran-darwin-${arch}-${type}.tar.gz)" != "${GFORTRAN_SHA}  gfortran-darwin-${arch}-${type}.tar.gz" ]]; then
+            echo "shasum mismatch for gfortran-darwin-${arch}-${type}"
             exit 1
         fi
         sudo mkdir -p /opt/
-        sudo cp "gfortran-darwin-arm64.tar.gz" /opt/gfortran-darwin-arm64.tar.gz
+        sudo cp "gfortran-darwin-${arch}-${type}.tar.gz" /opt/gfortran-darwin-${arch}-${type}.tar.gz
         pushd /opt
-            sudo tar -xvf gfortran-darwin-arm64.tar.gz
-            sudo rm gfortran-darwin-arm64.tar.gz
+            sudo tar -xvf gfortran-darwin-${arch}-${type}.tar.gz
+            sudo rm gfortran-darwin-${arch}-${type}.tar.gz
         popd
-        export FC_ARM64="$(find /opt/gfortran-darwin-arm64/bin -name "*-gfortran")"
-        local libgfortran="$(find /opt/gfortran-darwin-arm64/lib -name libgfortran.dylib)"
+	if [[ "${type}" == "native" ]]; then
+	    # Link these into /usr/local so that there's no need to add rpath or -L
+	    for f in libgfortran.dylib libgfortran.5.dylib libgcc_s.1.dylib libgcc_s.1.1.dylib libquadmath.dylib libquadmath.0.dylib; do
+                sudo ln -sf /opt/gfortran-darwin-${arch}-${type}/lib/$f /usr/local/lib/$f
+            done
+	    # Add it to PATH
+	    sudo ln -sf /opt/gfortran-darwin-${arch}-${type}/bin/gfortran /usr/local/bin/gfortran
+	fi
+    }
+
+    function install_arm64_cross_gfortran {
+	download_and_unpack_gfortran arm64 cross
+        export FC_ARM64="$(find /opt/gfortran-darwin-arm64-cross/bin -name "*-gfortran")"
+        local libgfortran="$(find /opt/gfortran-darwin-arm64-cross/lib -name libgfortran.dylib)"
         local libdir=$(dirname $libgfortran)
 
         export FC_ARM64_LDFLAGS="-L$libdir -Wl,-rpath,$libdir"
@@ -143,8 +167,7 @@ if [ "$(uname)" == "Darwin" ]; then
         fi
     }
     function install_gfortran {
-        hdiutil attach -mountpoint /Volumes/gfortran $GFORTRAN_DMG
-        sudo installer -pkg /Volumes/gfortran/gfortran.pkg -target /
+        download_and_unpack_gfortran $(uname -m) native
         check_gfortran
         if [[ "${PLAT:-}" == "universal2" || "${PLAT:-}" == "arm64" ]]; then
             install_arm64_cross_gfortran
diff --git a/tools/wheels/upload_wheels.sh b/tools/wheels/upload_wheels.sh
index d65492ee7..bc2066726 100644
--- a/tools/wheels/upload_wheels.sh
+++ b/tools/wheels/upload_wheels.sh
@@ -9,9 +9,6 @@ set_travis_vars() {
     fi
     if [[ "$TRAVIS_EVENT_TYPE" == "cron" ]]; then
       IS_SCHEDULE_DISPATCH="true"
-    elif [[ "$TRAVIS_EVENT_TYPE" == "api" ]]; then
-      # Manual CI run, so upload
-      IS_SCHEDULE_DISPATCH="true"
     else
       IS_SCHEDULE_DISPATCH="false"
     fi
@@ -37,11 +34,9 @@ set_upload_vars() {
 upload_wheels() {
     echo ${PWD}
     if [[ ${ANACONDA_UPLOAD} == true ]]; then
-        if [ -z ${TOKEN} ]; then
+        if [[ -z ${TOKEN} ]]; then
             echo no token set, not uploading
         else
-            python -m pip install \
-            git+https://github.com/Anaconda-Platform/anaconda-client.git@be1e14936a8e947da94d026c990715f0596d7043
             # sdists are located under dist folder when built through setup.py
             if compgen -G "./dist/*.gz"; then
                 echo "Found sdist"
author	Charles Harris <charlesr.harris@gmail.com>	2023-05-13 11:02:49 -0600
committer	GitHub <noreply@github.com>	2023-05-13 11:02:49 -0600
commit	5187067d7ad176ee3614beab2b99a524dd719aa8 (patch)
tree	907997d0c294f550193322aaa73237c1a7bcfaa6
parent	b786189222ac5bf2f4efbb04399261f7f760bc18 (diff)
parent	81caed6e3c34c4bf4b22b4f6167e816ba2a3f73c (diff)
download	numpy-5187067d7ad176ee3614beab2b99a524dd719aa8.tar.gz