17 files changed, 884 insertions, 197 deletions
diff --git a/benchmarks/README.rst b/benchmarks/README.rst
index 2700e95e7..ef841a818 100644
--- a/benchmarks/README.rst
+++ b/benchmarks/README.rst
@@ -22,8 +22,8 @@ By default, `asv` ships with support for anaconda and virtualenv::
     pip install asv
     pip install virtualenv
 
-After contributing new benchmarks, you should test them locally
-before submitting a pull request.
+After contributing new benchmarks, you should test them locally before
+submitting a pull request.
 
 To run all benchmarks, navigate to the root NumPy directory at
 the command line and execute::
@@ -31,11 +31,21 @@ the command line and execute::
     python runtests.py --bench
 
 where ``--bench`` activates the benchmark suite instead of the
-test suite. This builds NumPy and runs  all available benchmarks
+test suite. This builds NumPy and runs all available benchmarks
 defined in ``benchmarks/``. (Note: this could take a while. Each
 benchmark is run multiple times to measure the distribution in
 execution times.)
 
+For **testing** benchmarks locally, it may be better to run these without
+replications::
+
+    cd benchmarks/
+    export REGEXP="bench.*Ufunc"
+    asv run --dry-run --show-stderr --python=same --quick -b $REGEXP
+
+where the regular expression used to match benchmarks is stored in ``$REGEXP``,
+and ``--quick`` is used to avoid repetitions.
+
 To run benchmarks from a particular benchmark module, such as
 ``bench_core.py``, simply append the filename without the extension::
 
@@ -69,6 +79,27 @@ Command-line help is available as usual via ``asv --help`` and
 
 .. _ASV documentation: https://asv.readthedocs.io/
 
+Benchmarking versions
+---------------------
+
+To benchmark or visualize only the releases locally (for instance, to compare results across machines), first collect the commit hash of each release tag and then run ``asv`` on those commits::
+
+    cd benchmarks
+    # Get commits for tags
+    # delete tag_commits.txt before re-runs
+    for gtag in $(git tag --list --sort taggerdate | grep "^v"); do
+    git log $gtag --oneline -n1 --decorate=no | awk '{print $1;}' >> tag_commits.txt
+    done
+    # Use the last 20
+    tail --lines=20 tag_commits.txt > 20_vers.txt
+    asv run HASHFILE:20_vers.txt
+    # Publish and view
+    asv publish
+    asv preview
+
+For details on contributing these, see the `benchmark results repository`_.
+
+.. _benchmark results repository: https://github.com/HaoZeke/asv-numpy
 
 Writing benchmarks
 ------------------
diff --git a/benchmarks/asv.conf.json b/benchmarks/asv.conf.json
index b60135524..267450448 100644
--- a/benchmarks/asv.conf.json
+++ b/benchmarks/asv.conf.json
@@ -43,7 +43,8 @@
     // version.
     "matrix": {
         "Cython": [],
-        "setuptools": ["59.2.0"]
+        "setuptools": ["59.2.0"],
+        "packaging": []
     },
 
     // The directory (relative to the current directory) that benchmarks are
diff --git a/benchmarks/asv_compare.conf.json.tpl b/benchmarks/asv_compare.conf.json.tpl
index 01f4e41de..f0ef0bf49 100644
--- a/benchmarks/asv_compare.conf.json.tpl
+++ b/benchmarks/asv_compare.conf.json.tpl
@@ -47,7 +47,8 @@
     // version.
     "matrix": {
         "Cython": [],
-        "setuptools": ["59.2.0"]
+        "setuptools": ["59.2.0"],
+        "packaging": []
     },
 
     // The directory (relative to the current directory) that benchmarks are
diff --git a/benchmarks/benchmarks/__init__.py b/benchmarks/benchmarks/__init__.py
index 7b9f1d3e6..35fc87eac 100644
--- a/benchmarks/benchmarks/__init__.py
+++ b/benchmarks/benchmarks/__init__.py
@@ -26,7 +26,7 @@ def dirty_lock(lock_name, lock_on_count=1):
     lock_path = os.path.abspath(os.path.join(
         os.path.dirname(__file__), "..", "env", lock_name)
     )
-    # ASV load the 'benchmark_dir' to discovering the available benchmarks
+    # ASV loads the 'benchmark_dir' to discover the available benchmarks
     # the issue here is ASV doesn't capture any strings from stdout or stderr
     # during this stage so we escape it and lock on the second increment
     try:
diff --git a/benchmarks/benchmarks/bench_core.py b/benchmarks/benchmarks/bench_core.py
index 4fcd7ace5..fe1cd37b6 100644
--- a/benchmarks/benchmarks/bench_core.py
+++ b/benchmarks/benchmarks/bench_core.py
@@ -45,6 +45,12 @@ class Core(Benchmark):
     def time_array_l_view(self):
         np.array(self.l_view)
 
+    def time_can_cast(self):
+        np.can_cast(self.l10x10, self.float64_dtype)
+
+    def time_can_cast_same_kind(self):
+        np.can_cast(self.l10x10, self.float64_dtype, casting="same_kind")
+
     def time_vstack_l(self):
         np.vstack(self.l)
 
@@ -66,6 +72,9 @@ class Core(Benchmark):
     def time_empty_100(self):
         np.empty(100)
 
+    def time_empty_like(self):
+        np.empty_like(self.l10x10)
+
     def time_eye_100(self):
         np.eye(100)
 
@@ -206,13 +215,41 @@ class Indices(Benchmark):
     def time_indices(self):
         np.indices((1000, 500))
 
-class VarComplex(Benchmark):
-    params = [10**n for n in range(0, 9)]
-    def setup(self, n):
-        self.arr = np.random.randn(n) + 1j * np.random.randn(n)
 
-    def teardown(self, n):
-        del self.arr
+class StatsMethods(Benchmark):
+    # Not testing, but in array_api (redundant)
+    # 8, 16, 32 bit variants, and 128 complexes
+    params = [['int64', 'uint64', 'float64', 'intp',
+               'complex64', 'bool', 'float', 'int',
+               'complex', 'complex256'],
+              [100**n for n in range(0, 2)]]
+    param_names = ['dtype', 'size']
+
+    def setup(self, dtype, size):
+        try:
+            self.data = np.ones(size, dtype=getattr(np, dtype))
+        except AttributeError:  # builtins throw AttributeError after 1.20
+            self.data = np.ones(size, dtype=dtype)
+        if dtype.startswith('complex'):  # in-place add keeps the requested dtype
+            self.data += np.random.randn(size) + 1j * np.random.randn(size)
+
+    def time_min(self, dtype, size):
+        self.data.min()
+
+    def time_max(self, dtype, size):
+        self.data.max()
+
+    def time_mean(self, dtype, size):
+        self.data.mean()
+
+    def time_std(self, dtype, size):
+        self.data.std()
+
+    def time_prod(self, dtype, size):
+        self.data.prod()
+
+    def time_var(self, dtype, size):
+        self.data.var()
 
-    def time_var(self, n):
-        self.arr.var()
+    def time_sum(self, dtype, size):
+        self.data.sum()
diff --git a/benchmarks/benchmarks/bench_creation.py b/benchmarks/benchmarks/bench_creation.py
new file mode 100644
index 000000000..3a577df7a
--- /dev/null
+++ b/benchmarks/benchmarks/bench_creation.py
@@ -0,0 +1,81 @@
+from .common import Benchmark, TYPES1, get_squares_
+
+import numpy as np
+
+
+class MeshGrid(Benchmark):
+    """ Benchmark meshgrid generation
+    """
+    params = [[16, 32],
+              [2, 3, 4],
+              ['ij', 'xy'], TYPES1]
+    param_names = ['size', 'ndims', 'ind', 'ndtype']
+    timeout = 10
+
+    def setup(self, size, ndims, ind, ndtype):
+        self.grid_dims = [(np.random.ranf(size)).astype(ndtype) for
+                          x in range(ndims)]
+
+    def time_meshgrid(self, size, ndims, ind, ndtype):
+        np.meshgrid(*self.grid_dims, indexing=ind)
+
+
+class Create(Benchmark):
+    """ Benchmark for creation functions
+    """
+    # (64, 64), (128, 128), (256, 256)
+    # , (512, 512), (1024, 1024)
+    params = [[16, 32, 128, 256, 512,
+               (16, 16), (32, 32)],
+              ['C', 'F'],
+              TYPES1]
+    param_names = ['shape', 'order', 'npdtypes']
+    timeout = 10
+
+    def setup(self, shape, order, npdtypes):
+        values = get_squares_()
+        self.xarg = values.get(npdtypes)[0]
+
+    def time_full(self, shape, order, npdtypes):
+        np.full(shape, self.xarg[1], dtype=npdtypes, order=order)
+
+    def time_full_like(self, shape, order, npdtypes):
+        np.full_like(self.xarg, self.xarg[0], order=order)
+
+    def time_ones(self, shape, order, npdtypes):
+        np.ones(shape, dtype=npdtypes, order=order)
+
+    def time_ones_like(self, shape, order, npdtypes):
+        np.ones_like(self.xarg, order=order)
+
+    def time_zeros(self, shape, order, npdtypes):
+        np.zeros(shape, dtype=npdtypes, order=order)
+
+    def time_zeros_like(self, shape, order, npdtypes):
+        np.zeros_like(self.xarg, order=order)
+
+    def time_empty(self, shape, order, npdtypes):
+        np.empty(shape, dtype=npdtypes, order=order)
+
+    def time_empty_like(self, shape, order, npdtypes):
+        np.empty_like(self.xarg, order=order)
+
+
+class UfuncsFromDLP(Benchmark):
+    """ Benchmark for array creation through np.from_dlpack
+    """
+    params = [[16, 32, (16, 16),
+               (32, 32), (64, 64)],
+              TYPES1]
+    param_names = ['shape', 'npdtypes']
+    timeout = 10
+
+    def setup(self, shape, npdtypes):
+        if npdtypes in ['longdouble', 'clongdouble']:
+            raise NotImplementedError(
+                'Only IEEE dtypes are supported')
+        values = get_squares_()
+        self.xarg = values.get(npdtypes)[0]
+
+    def time_from_dlpack(self, shape, npdtypes):
+        np.from_dlpack(self.xarg)
diff --git a/benchmarks/benchmarks/bench_function_base.py b/benchmarks/benchmarks/bench_function_base.py
index 2e44ff76b..cc37bef39 100644
--- a/benchmarks/benchmarks/bench_function_base.py
+++ b/benchmarks/benchmarks/bench_function_base.py
@@ -248,7 +248,7 @@ class Sort(Benchmark):
         # In NumPy 1.17 and newer, 'merge' can be one of several
         # stable sorts, it isn't necessarily merge sort.
         ['quick', 'merge', 'heap'],
-        ['float64', 'int64', 'float32', 'uint32', 'int32', 'int16'],
+        ['float64', 'int64', 'float32', 'uint32', 'int32', 'int16', 'float16'],
         [
             ('random',),
             ('ordered',),
diff --git a/benchmarks/benchmarks/bench_io.py b/benchmarks/benchmarks/bench_io.py
index 357adbb87..e316d07f3 100644
--- a/benchmarks/benchmarks/bench_io.py
+++ b/benchmarks/benchmarks/bench_io.py
@@ -1,7 +1,7 @@
-from .common import Benchmark, get_squares
+from .common import Benchmark, get_squares, get_squares_
 
 import numpy as np
-from io import StringIO
+from io import SEEK_SET, StringIO, BytesIO
 
 
 class Copy(Benchmark):
@@ -67,6 +67,15 @@ class Savez(Benchmark):
         np.savez('tmp.npz', **self.squares)
 
 
+class LoadNpyOverhead(Benchmark):
+    def setup(self):
+        self.buffer = BytesIO()
+        np.save(self.buffer, get_squares_()['float32'])
+
+    def time_loadnpy_overhead(self):
+        self.buffer.seek(0, SEEK_SET)
+        np.load(self.buffer)
+
 class LoadtxtCSVComments(Benchmark):
     # benchmarks for np.loadtxt comment handling
     # when reading in CSV files
diff --git a/benchmarks/benchmarks/bench_itemselection.py b/benchmarks/benchmarks/bench_itemselection.py
index 518258a8f..46a39372c 100644
--- a/benchmarks/benchmarks/bench_itemselection.py
+++ b/benchmarks/benchmarks/bench_itemselection.py
@@ -7,7 +7,7 @@ class Take(Benchmark):
     params = [
         [(1000, 1), (1000, 2), (2, 1000, 1), (1000, 3)],
         ["raise", "wrap", "clip"],
-        TYPES1]
+        TYPES1 + ["O", "i,O"]]
     param_names = ["shape", "mode", "dtype"]
 
     def setup(self, shape, mode, dtype):
@@ -21,7 +21,7 @@ class Take(Benchmark):
 class PutMask(Benchmark):
     params = [
         [True, False],
-        TYPES1]
+        TYPES1 + ["O", "i,O"]]
     param_names = ["values_is_scalar", "dtype"]
 
     def setup(self, values_is_scalar, dtype):
@@ -41,3 +41,21 @@ class PutMask(Benchmark):
     def time_sparse(self, values_is_scalar, dtype):
         np.putmask(self.arr, self.sparse_mask, self.vals)
 
+
+class Put(Benchmark):
+    params = [
+        [True, False],
+        TYPES1 + ["O", "i,O"]]
+    param_names = ["values_is_scalar", "dtype"]
+
+    def setup(self, values_is_scalar, dtype):
+        if values_is_scalar:
+            self.vals = np.array(1., dtype=dtype)
+        else:
+            self.vals = np.ones(1000, dtype=dtype)
+
+        self.arr = np.ones(1000, dtype=dtype)
+        self.indx = np.arange(1000, dtype=np.intp)
+
+    def time_ordered(self, values_is_scalar, dtype):
+        np.put(self.arr, self.indx, self.vals)
diff --git a/benchmarks/benchmarks/bench_lib.py b/benchmarks/benchmarks/bench_lib.py
index b64f8ab17..f792116a6 100644
--- a/benchmarks/benchmarks/bench_lib.py
+++ b/benchmarks/benchmarks/bench_lib.py
@@ -132,11 +132,26 @@ class Unique(Benchmark):
         # produce a randomly shuffled array with the
         # approximate desired percentage np.nan content
         base_array = np.random.uniform(size=array_size)
-        base_array[base_array < percent_nans / 100.] = np.nan
+        n_nan = int(percent_nans * array_size)
+        nan_indices = np.random.choice(np.arange(array_size), size=n_nan)
+        base_array[nan_indices] = np.nan
         self.arr = base_array
 
-    def time_unique(self, array_size, percent_nans):
-        np.unique(self.arr)
+    def time_unique_values(self, array_size, percent_nans):
+        np.unique(self.arr, return_index=False,
+                  return_inverse=False, return_counts=False)
+
+    def time_unique_counts(self, array_size, percent_nans):
+        np.unique(self.arr, return_index=False,
+                  return_inverse=False, return_counts=True)
+
+    def time_unique_inverse(self, array_size, percent_nans):
+        np.unique(self.arr, return_index=False,
+                  return_inverse=True, return_counts=False)
+
+    def time_unique_all(self, array_size, percent_nans):
+        np.unique(self.arr, return_index=True,
+                  return_inverse=True, return_counts=True)
 
 
 class Isin(Benchmark):
diff --git a/benchmarks/benchmarks/bench_linalg.py b/benchmarks/benchmarks/bench_linalg.py
index a94ba1139..b4e39b084 100644
--- a/benchmarks/benchmarks/bench_linalg.py
+++ b/benchmarks/benchmarks/bench_linalg.py
@@ -190,3 +190,27 @@ class Einsum(Benchmark):
     # sum_of_products_contig_outstride0_one：non_contiguous arrays
     def time_einsum_noncon_contig_outstride0(self, dtype):
         np.einsum("i->", self.non_contiguous_dim1, optimize=True)
+
+
+class LinAlgTransposeVdot(Benchmark):
+    # Smaller for speed
+    # , (128, 128), (256, 256), (512, 512),
+    # (1024, 1024)
+    params = [[(16, 16), (32, 32),
+               (64, 64)], TYPES1]
+    param_names = ['shape', 'npdtypes']
+
+    def setup(self, shape, npdtypes):
+        self.xarg = np.random.uniform(-1, 1, np.dot(*shape)).reshape(shape)
+        self.xarg = self.xarg.astype(npdtypes)
+        self.x2arg = np.random.uniform(-1, 1, np.dot(*shape)).reshape(shape)
+        self.x2arg = self.x2arg.astype(npdtypes)
+        if npdtypes.startswith('complex'):
+            self.xarg += self.xarg.T*1j
+            self.x2arg += self.x2arg.T*1j
+
+    def time_transpose(self, shape, npdtypes):
+        np.transpose(self.xarg)
+
+    def time_vdot(self, shape, npdtypes):
+        np.vdot(self.xarg, self.x2arg)
diff --git a/benchmarks/benchmarks/bench_manipulate.py b/benchmarks/benchmarks/bench_manipulate.py
new file mode 100644
index 000000000..0a312479c
--- /dev/null
+++ b/benchmarks/benchmarks/bench_manipulate.py
@@ -0,0 +1,107 @@
+from .common import Benchmark, get_squares_, TYPES1, DLPACK_TYPES
+
+import numpy as np
+from collections import deque
+
+class BroadcastArrays(Benchmark):
+    params = [[(16, 32), (32, 64),
+               (64, 128), (128, 256),
+               (256, 512), (512, 1024)],
+              TYPES1]
+    param_names = ['shape', 'ndtype']
+    timeout = 10
+
+    def setup(self, shape, ndtype):
+        self.xarg = np.random.ranf(shape[0]*shape[1]).reshape(shape)
+        self.xarg = self.xarg.astype(ndtype)
+        if ndtype.startswith('complex'):
+            self.xarg += np.random.ranf(1)*1j
+
+    def time_broadcast_arrays(self, shape, ndtype):
+        np.broadcast_arrays(self.xarg, np.ones(1))
+
+
+class BroadcastArraysTo(Benchmark):
+    params = [[16, 32, 64, 128, 256, 512],
+              TYPES1]
+    param_names = ['size', 'ndtype']
+    timeout = 10
+
+    def setup(self, size, ndtype):
+        self.rng = np.random.default_rng()
+        self.xarg = self.rng.random(size)
+        self.xarg = self.xarg.astype(ndtype)
+        if ndtype.startswith('complex'):
+            self.xarg += self.rng.random(1)*1j
+
+    def time_broadcast_to(self, size, ndtype):
+        np.broadcast_to(self.xarg, (size, size))
+
+
+class ConcatenateStackArrays(Benchmark):
+    # Larger shapes not benchmarked: (64, 128), (128, 256), (256, 512)
+    params = [[(16, 32), (32, 64)],
+              [2, 3, 4, 5],
+              TYPES1]
+    param_names = ['shape', 'narrays', 'ndtype']
+    timeout = 10
+
+    def setup(self, shape, narrays, ndtype):
+        self.xarg = [np.random.ranf(shape[0]*shape[1]).reshape(shape)
+                     for x in range(narrays)]
+        self.xarg = [x.astype(ndtype) for x in self.xarg]
+        if ndtype.startswith('complex'):
+            self.xarg = [x + np.random.ranf(1)*1j for x in self.xarg]
+
+    def time_concatenate_ax0(self, size, narrays, ndtype):
+        np.concatenate(self.xarg, axis=0)
+
+    def time_concatenate_ax1(self, size, narrays, ndtype):
+        np.concatenate(self.xarg, axis=1)
+
+    def time_stack_ax0(self, size, narrays, ndtype):
+        np.stack(self.xarg, axis=0)
+
+    def time_stack_ax1(self, size, narrays, ndtype):
+        np.stack(self.xarg, axis=1)
+
+
+class DimsManipulations(Benchmark):
+    params = [
+        [(2, 1, 4), (2, 1), (5, 2, 3, 1)],
+    ]
+    param_names = ['shape']
+    timeout = 10
+
+    def setup(self, shape):
+        self.xarg = np.ones(shape=shape)
+        self.reshaped = deque(shape)
+        self.reshaped.rotate(1)
+        self.reshaped = tuple(self.reshaped)
+
+    def time_expand_dims(self, shape):
+        np.expand_dims(self.xarg, axis=1)
+
+    def time_expand_dims_neg(self, shape):
+        np.expand_dims(self.xarg, axis=-1)
+
+    def time_squeeze_dims(self, shape):
+        np.squeeze(self.xarg)
+
+    def time_flip_all(self, shape):
+        np.flip(self.xarg, axis=None)
+
+    def time_flip_one(self, shape):
+        np.flip(self.xarg, axis=1)
+
+    def time_flip_neg(self, shape):
+        np.flip(self.xarg, axis=-1)
+
+    def time_moveaxis(self, shape):
+        np.moveaxis(self.xarg, [0, 1], [-1, -2])
+
+    def time_roll(self, shape):
+        np.roll(self.xarg, 3)
+
+    def time_reshape(self, shape):
+        np.reshape(self.xarg, self.reshaped)
diff --git a/benchmarks/benchmarks/bench_reduce.py b/benchmarks/benchmarks/bench_reduce.py
index ca07bd180..040b5ca73 100644
--- a/benchmarks/benchmarks/bench_reduce.py
+++ b/benchmarks/benchmarks/bench_reduce.py
@@ -45,19 +45,40 @@ class AnyAll(Benchmark):
         self.zeros.any()
 
 
-class MinMax(Benchmark):
-    params = [np.int8, np.uint8, np.int16, np.uint16, np.int32, np.uint32,
-              np.int64, np.uint64, np.float32, np.float64, np.intp]
+class StatsReductions(Benchmark):
+    # Not testing, but in array_api (redundant)
+    # 8, 16, 32 bit variants, and 128 complexes
+    params = ['int64', 'uint64', 'float64', 'intp',
+               'complex64', 'bool', 'float', 'int',
+               'complex', 'complex256'],
     param_names = ['dtype']
 
     def setup(self, dtype):
-        self.d = np.ones(20000, dtype=dtype)
+        try:
+            self.data = np.ones(200, dtype=getattr(np, dtype))
+        except AttributeError:  # builtins throw AttributeError after 1.20
+            self.data = np.ones(200, dtype=dtype)
+        if dtype.startswith('complex'):
+            self.data = self.data * self.data.T*1j
 
     def time_min(self, dtype):
-        np.min(self.d)
+        np.min(self.data)
 
     def time_max(self, dtype):
-        np.max(self.d)
+        np.max(self.data)
+
+    def time_mean(self, dtype):
+        np.mean(self.data)
+
+    def time_std(self, dtype):
+        np.std(self.data)
+
+    def time_prod(self, dtype):
+        np.prod(self.data)
+
+    def time_var(self, dtype):
+        np.var(self.data)
+
 
 class FMinMax(Benchmark):
     params = [np.float32, np.float64]
@@ -72,6 +93,7 @@ class FMinMax(Benchmark):
     def time_max(self, dtype):
         np.fmax.reduce(self.d)
 
+
 class ArgMax(Benchmark):
     params = [np.int8, np.uint8, np.int16, np.uint16, np.int32, np.uint32,
               np.int64, np.uint64, np.float32, np.float64, bool]
@@ -83,6 +105,7 @@ class ArgMax(Benchmark):
     def time_argmax(self, dtype):
         np.argmax(self.d)
 
+
 class ArgMin(Benchmark):
     params = [np.int8, np.uint8, np.int16, np.uint16, np.int32, np.uint32,
               np.int64, np.uint64, np.float32, np.float64, bool]
@@ -94,6 +117,7 @@ class ArgMin(Benchmark):
     def time_argmin(self, dtype):
         np.argmin(self.d)
 
+
 class SmallReduction(Benchmark):
     def setup(self):
         self.d = np.ones(100, dtype=np.float32)
diff --git a/benchmarks/benchmarks/bench_scalar.py b/benchmarks/benchmarks/bench_scalar.py
index 650daa89d..638f66df5 100644
--- a/benchmarks/benchmarks/bench_scalar.py
+++ b/benchmarks/benchmarks/bench_scalar.py
@@ -65,3 +65,15 @@ class ScalarMath(Benchmark):
         other + int32
         other + int32
         other + int32
+
+
+class ScalarStr(Benchmark):
+    # Test scalar to str conversion
+    params = [TYPES1]
+    param_names = ["type"]
+
+    def setup(self, typename):
+        self.a = np.array([100] * 100, dtype=typename)
+
+    def time_str_repr(self, typename):
+        res = [str(x) for x in self.a]
diff --git a/benchmarks/benchmarks/bench_ufunc.py b/benchmarks/benchmarks/bench_ufunc.py
index 36d8621e8..f7c77d90c 100644
--- a/benchmarks/benchmarks/bench_ufunc.py
+++ b/benchmarks/benchmarks/bench_ufunc.py
@@ -1,6 +1,9 @@
-from .common import Benchmark, get_squares_
+from .common import Benchmark, get_squares_, TYPES1, DLPACK_TYPES
 
 import numpy as np
+import itertools
+from packaging import version
+import operator
 
 
 ufuncs = ['abs', 'absolute', 'add', 'arccos', 'arccosh', 'arcsin', 'arcsinh',
@@ -13,11 +16,13 @@ ufuncs = ['abs', 'absolute', 'add', 'arccos', 'arccosh', 'arcsin', 'arcsinh',
           'isinf', 'isnan', 'isnat', 'lcm', 'ldexp', 'left_shift', 'less',
           'less_equal', 'log', 'log10', 'log1p', 'log2', 'logaddexp',
           'logaddexp2', 'logical_and', 'logical_not', 'logical_or',
-          'logical_xor', 'matmul', 'maximum', 'minimum', 'mod', 'modf', 'multiply',
-          'negative', 'nextafter', 'not_equal', 'positive', 'power',
-          'rad2deg', 'radians', 'reciprocal', 'remainder', 'right_shift',
-          'rint', 'sign', 'signbit', 'sin', 'sinh', 'spacing', 'sqrt',
-          'square', 'subtract', 'tan', 'tanh', 'true_divide', 'trunc']
+          'logical_xor', 'matmul', 'maximum', 'minimum', 'mod', 'modf',
+          'multiply', 'negative', 'nextafter', 'not_equal', 'positive',
+          'power', 'rad2deg', 'radians', 'reciprocal', 'remainder',
+          'right_shift', 'rint', 'sign', 'signbit', 'sin',
+          'sinh', 'spacing', 'sqrt', 'square', 'subtract', 'tan', 'tanh',
+          'true_divide', 'trunc']
+arrayfuncdisp = ['real', 'round']
 
 
 for name in dir(np):
@@ -25,6 +30,30 @@ for name in dir(np):
         print("Missing ufunc %r" % (name,))
 
 
+class ArrayFunctionDispatcher(Benchmark):
+    params = [arrayfuncdisp]
+    param_names = ['func']
+    timeout = 10
+
+    def setup(self, ufuncname):
+        np.seterr(all='ignore')
+        try:
+            self.afdn = getattr(np, ufuncname)
+        except AttributeError:
+            raise NotImplementedError()
+        self.args = []
+        for _, aarg in get_squares_().items():
+            arg = (aarg,) * 1  # no nin
+            try:
+                self.afdn(*arg)
+            except TypeError:
+                continue
+            self.args.append(arg)
+
+    def time_afdn_types(self, ufuncname):
+        [self.afdn(*arg) for arg in self.args]
+
+
 class Broadcast(Benchmark):
     def setup(self):
         self.d = np.ones((50000, 100), dtype=np.float64)
@@ -34,6 +63,20 @@ class Broadcast(Benchmark):
         self.d - self.e
 
 
+class At(Benchmark):
+    def setup(self):
+        rng = np.random.default_rng(1)
+        self.vals = rng.random(10_000_000, dtype=np.float64)
+        self.idx = rng.integers(1000, size=10_000_000).astype(np.intp)
+        self.res = np.zeros(1000, dtype=self.vals.dtype)
+
+    def time_sum_at(self):
+        np.add.at(self.res, self.idx, self.vals)
+
+    def time_maximum_at(self):
+        np.maximum.at(self.res, self.idx, self.vals)
+
+
 class UFunc(Benchmark):
     params = [ufuncs]
     param_names = ['ufunc']
@@ -42,23 +85,179 @@ class UFunc(Benchmark):
     def setup(self, ufuncname):
         np.seterr(all='ignore')
         try:
-            self.f = getattr(np, ufuncname)
+            self.ufn = getattr(np, ufuncname)
         except AttributeError:
             raise NotImplementedError()
         self.args = []
-        for t, a in get_squares_().items():
-            arg = (a,) * self.f.nin
+        for _, aarg in get_squares_().items():
+            arg = (aarg,) * self.ufn.nin
             try:
-                self.f(*arg)
+                self.ufn(*arg)
             except TypeError:
                 continue
             self.args.append(arg)
 
     def time_ufunc_types(self, ufuncname):
-        [self.f(*arg) for arg in self.args]
+        [self.ufn(*arg) for arg in self.args]
+
+
+class MethodsV0(Benchmark):
+    """ Benchmark for the methods which do not take any arguments
+    """
+    params = [['__abs__', '__neg__', '__pos__'], TYPES1]
+    param_names = ['methods', 'npdtypes']
+    timeout = 10
+
+    def setup(self, methname, npdtypes):
+        values = get_squares_()
+        self.xarg = values.get(npdtypes)[0]
+
+    def time_ndarray_meth(self, methname, npdtypes):
+        getattr(operator, methname)(self.xarg)
+
+
+class NDArrayLRShifts(Benchmark):
+    """ Benchmark for the shift methods
+    """
+    params = [['__lshift__', '__rshift__'],
+              ['intp', 'int8', 'int16',
+                'int32', 'int64', 'uint8',
+                'uint16', 'uint32', 'uint64']]
+    param_names = ['methods', 'npdtypes']
+    timeout = 10
+
+    def setup(self, methname, npdtypes):
+        self.vals = np.ones(1000,
+                            dtype=getattr(np, npdtypes)) * \
+                            np.random.randint(9)
+
+    def time_ndarray_meth(self, methname, npdtypes):
+        getattr(operator, methname)(*[self.vals, 2])
+
+
+class Methods0D(Benchmark):
+    """Zero dimension array methods
+    """
+    params = [['__bool__', '__complex__', '__invert__',
+               '__float__', '__int__'], TYPES1]
+    param_names = ['methods', 'npdtypes']
+    timeout = 10
+
+    def setup(self, methname, npdtypes):
+        self.xarg = np.array(3, dtype=npdtypes)
+        if (npdtypes.startswith('complex') and
+           methname in ['__float__', '__int__']) or \
+           (not npdtypes.startswith('int') and methname == '__invert__'):
+            # Skip: complex -> float/int, and invert on non-integer dtypes
+            raise NotImplementedError
+
+    def time_ndarray__0d__(self, methname, npdtypes):
+        meth = getattr(self.xarg, methname)
+        meth()
+
+
+class MethodsV1(Benchmark):
+    """ Benchmark for the methods which take an argument
+    """
+    params = [['__and__', '__add__', '__eq__', '__floordiv__', '__ge__',
+               '__gt__', '__le__', '__lt__', '__matmul__',
+               '__mod__', '__mul__', '__ne__', '__or__',
+               '__pow__', '__sub__', '__truediv__', '__xor__'],
+              TYPES1]
+    param_names = ['methods', 'npdtypes']
+    timeout = 10
+
+    def setup(self, methname, npdtypes):
+        if (
+            npdtypes.startswith("complex")
+                and methname in ["__floordiv__", "__mod__"]
+        ) or (
+            not npdtypes.startswith("int")
+            and methname in ["__and__", "__or__", "__xor__"]
+        ):
+            raise NotImplementedError  # skip
+        values = get_squares_().get(npdtypes)
+        self.xargs = [values[0], values[1]]
+
+    def time_ndarray_meth(self, methname, npdtypes):
+        getattr(operator, methname)(*self.xargs)
+
+
+class NDArrayGetItem(Benchmark):
+    param_names = ['margs', 'msize']
+    params = [[0, (0, 0), (-1, 0), [0, -1]],
+              ['small', 'big']]
+
+    def setup(self, margs, msize):
+        self.xs = np.random.uniform(-1, 1, 6).reshape(2, 3)
+        self.xl = np.random.uniform(-1, 1, 50*50).reshape(50, 50)
+
+    def time_methods_getitem(self, margs, msize):
+        if msize == 'small':
+            mdat = self.xs
+        elif msize == 'big':
+            mdat = self.xl
+        getattr(mdat, '__getitem__')(margs)
+
+
+class NDArraySetItem(Benchmark):
+    param_names = ['margs', 'msize']
+    params = [[0, (0, 0), (-1, 0), [0, -1]],
+              ['small', 'big']]
+
+    def setup(self, margs, msize):
+        self.xs = np.random.uniform(-1, 1, 6).reshape(2, 3)
+        self.xl = np.random.uniform(-1, 1, 100*100).reshape(100, 100)
+
+    def time_methods_setitem(self, margs, msize):
+        if msize == 'small':
+            mdat = self.xs
+        elif msize == 'big':
+            mdat = self.xl
+        mdat[margs] = 17
+
+
+class DLPMethods(Benchmark):
+    """ Benchmark for DLPACK helpers
+    """
+    params = [['__dlpack__', '__dlpack_device__'], DLPACK_TYPES]
+    param_names = ['methods', 'npdtypes']
+    timeout = 10
+
+    def setup(self, methname, npdtypes):
+        values = get_squares_()
+        if npdtypes == 'bool':
+            if version.parse(np.__version__) > version.parse("1.25"):
+                self.xarg = values.get('int16')[0].astype('bool')
+            else:
+                raise NotImplementedError("Not supported before v1.25")
+        else:
+            self.xarg = values.get('int16')[0]
+
+    def time_ndarray_dlp(self, methname, npdtypes):
+        meth = getattr(self.xarg, methname)
+        meth()
+
+
+class NDArrayAsType(Benchmark):
+    """ Benchmark for type conversion
+    """
+    params = [list(itertools.combinations(TYPES1, 2))]
+    param_names = ['typeconv']
+    timeout = 10
+
+    def setup(self, typeconv):
+        if typeconv[0] == typeconv[1]:
+            raise NotImplementedError(
+                    "Skipping test for converting to the same dtype")
+        self.xarg = get_squares_().get(typeconv[0])
+
+    def time_astype(self, typeconv):
+        self.xarg.astype(typeconv[1])
+
 
 class UFuncSmall(Benchmark):
-    """  Benchmark for a selection of ufuncs on a small arrays and scalars 
+    """  Benchmark for a selection of ufuncs on a small arrays and scalars
 
     Since the arrays and scalars are small, we are benchmarking the overhead 
     of the numpy ufunc functionality
diff --git a/benchmarks/benchmarks/bench_ufunc_strides.py b/benchmarks/benchmarks/bench_ufunc_strides.py
index f80bf90f9..898cc0818 100644
--- a/benchmarks/benchmarks/bench_ufunc_strides.py
+++ b/benchmarks/benchmarks/bench_ufunc_strides.py
@@ -1,156 +1,181 @@
-from .common import Benchmark
+from .common import Benchmark, get_data
 
 import numpy as np
 
-UNARY_UFUNCS = [obj for obj in np.core.umath.__dict__.values() if
-        isinstance(obj, np.ufunc)]
-UNARY_OBJECT_UFUNCS = [uf for uf in UNARY_UFUNCS if "O->O" in uf.types]
-UNARY_OBJECT_UFUNCS.remove(getattr(np, 'invert'))
+UFUNCS = [obj for obj in np.core.umath.__dict__.values() if
+          isinstance(obj, np.ufunc)]  # every ufunc found in np.core.umath
+UFUNCS_UNARY = [uf for uf in UFUNCS if "O->O" in uf.types]
 
-stride = [1, 2, 4]
-stride_out = [1, 2, 4]
-dtype = ['e', 'f', 'd']
-
-class Unary(Benchmark):
-    params = [UNARY_OBJECT_UFUNCS, stride, stride_out, dtype]
-    param_names = ['ufunc', 'stride_in', 'stride_out', 'dtype']
-    timeout = 10
-
-    def setup(self, ufuncname, stride, stride_out, dtype):
-        np.seterr(all='ignore')
-        try:
-            self.f = ufuncname
-        except AttributeError:
-            raise NotImplementedError(f"No ufunc {ufuncname} found") from None
-        N = 100000
-        self.arr_out = np.empty(stride_out*N, dtype)
-        self.arr = np.random.rand(stride*N).astype(dtype)
-        if (ufuncname.__name__ == 'arccosh'):
-            self.arr = 1.0 + self.arr
-
-    def time_ufunc(self, ufuncname, stride, stride_out, dtype):
-        self.f(self.arr[::stride], self.arr_out[::stride_out])
-
-class AVX_UFunc_log(Benchmark):
-    params = [stride, dtype]
-    param_names = ['stride', 'dtype']
-    timeout = 10
-
-    def setup(self, stride, dtype):
-        np.seterr(all='ignore')
-        N = 10000
-        self.arr = np.array(np.random.random_sample(stride*N), dtype=dtype)
-
-    def time_log(self, stride, dtype):
-        np.log(self.arr[::stride])
-
-
-binary_ufuncs = [
-    'maximum', 'minimum', 'fmax', 'fmin'
-]
-binary_dtype = ['f', 'd']
-
-class Binary(Benchmark):
-    param_names = ['ufunc', 'stride_in0', 'stride_in1', 'stride_out', 'dtype']
-    params = [binary_ufuncs, stride, stride, stride_out, binary_dtype]
+class _AbstractBinary(Benchmark):
+    params = []  # supplied by the concrete subclasses
+    param_names = ['ufunc', 'stride_in0', 'stride_in1', 'stride_out', 'dtype']
     timeout = 10
+    arrlen = 10000  # elements each operand exposes after striding
+    data_finite = True  # forwarded to get_data(finite=...)
+    data_denormal = False  # forwarded to get_data(denormal=...)
+    data_zeros = False  # forwarded to get_data(zeros=...)
+
+    def setup(self, ufunc, stride_in0, stride_in1, stride_out, dtype):
+        ufunc_insig = f'{dtype}{dtype}->'
+        if ufunc_insig+dtype not in ufunc.types:  # no same-dtype loop
+            for st_sig in (ufunc_insig, dtype):
+                test = [sig for sig in ufunc.types if sig.startswith(st_sig)]
+                if test:
+                    break
+            if not test:
+                raise NotImplementedError(
+                    f"Ufunc {ufunc} doesn't support "
+                    f"binary input of dtype {dtype}"
+                ) from None
+            tin, tout = test[0].split('->')  # use the first matching loop
+        else:
+            tin = dtype + dtype
+            tout = dtype
+
+        self.ufunc_args = []  # inputs first, then one output per tout char
+        for i, (dt, stride) in enumerate(zip(tin, (stride_in0, stride_in1))):
+            self.ufunc_args += [get_data(
+                self.arrlen*stride, dt, i,
+                zeros=self.data_zeros,
+                finite=self.data_finite,
+                denormal=self.data_denormal,
+            )[::stride]]
+        for dt in tout:
+            self.ufunc_args += [
+                np.empty(stride_out*self.arrlen, dt)[::stride_out]
+            ]
 
-    def setup(self, ufuncname, stride_in0, stride_in1, stride_out, dtype):
         np.seterr(all='ignore')
-        try:
-            self.f = getattr(np, ufuncname)
-        except AttributeError:
-            raise NotImplementedError(f"No ufunc {ufuncname} found") from None
-        N = 100000
-        self.arr1 = np.array(np.random.rand(stride_in0*N), dtype=dtype)
-        self.arr2 = np.array(np.random.rand(stride_in1*N), dtype=dtype)
-        self.arr_out = np.empty(stride_out*N, dtype)
-
-    def time_ufunc(self, ufuncname, stride_in0, stride_in1, stride_out, dtype):
-        self.f(self.arr1[::stride_in0], self.arr2[::stride_in1],
-               self.arr_out[::stride_out])
-
 
-binary_int_ufuncs = ['maximum', 'minimum']
-binary_int_dtype = ['b', 'B', 'h', 'H', 'i', 'I', 'l', 'L', 'q', 'Q']
+    def time_binary(self, ufunc, stride_in0, stride_in1, stride_out,
+             dtype):
+        ufunc(*self.ufunc_args)  # both inputs are arrays
 
-class BinaryInt(Binary):
+    def time_binary_scalar_in0(self, ufunc, stride_in0, stride_in1,
+                        stride_out, dtype):
+        ufunc(self.ufunc_args[0][0], *self.ufunc_args[1:])
 
-    param_names = ['ufunc', 'stride_in0', 'stride_in1', 'stride_out', 'dtype']
-    params = [binary_int_ufuncs, stride, stride, stride_out, binary_int_dtype]
-
-class AVX_ldexp(Benchmark):
-
-    params = [dtype, stride]
-    param_names = ['dtype', 'stride']
-    timeout = 10
+    def time_binary_scalar_in1(self, ufunc, stride_in0, stride_in1,
+                        stride_out, dtype):
+        ufunc(self.ufunc_args[0], self.ufunc_args[1][0], *self.ufunc_args[2:])
 
-    def setup(self, dtype, stride):
-        np.seterr(all='ignore')
-        self.f = getattr(np, 'ldexp')
-        N = 10000
-        self.arr1 = np.array(np.random.rand(stride*N), dtype=dtype)
-        self.arr2 = np.array(np.random.rand(stride*N), dtype='i')
-
-    def time_ufunc(self, dtype, stride):
-        self.f(self.arr1[::stride], self.arr2[::stride])
-
-cmplx_bfuncs = ['add',
-                'subtract',
-                'multiply',
-                'divide']
-cmplxstride = [1, 2, 4]
-cmplxdtype  = ['F', 'D']
-
-class AVX_cmplx_arithmetic(Benchmark):
-    params = [cmplx_bfuncs, cmplxstride, cmplxdtype]
-    param_names = ['bfunc', 'stride', 'dtype']
-    timeout = 10
-
-    def setup(self, bfuncname, stride, dtype):
-        np.seterr(all='ignore')
-        try:
-            self.f = getattr(np, bfuncname)
-        except AttributeError:
-            raise NotImplementedError(f"No bfunc {bfuncname} found") from None
-        N = 10000
-        self.arr1 = np.ones(stride*N, dtype)
-        self.arr2 = np.ones(stride*N, dtype)
-
-    def time_ufunc(self, bfuncname, stride, dtype):
-        self.f(self.arr1[::stride], self.arr2[::stride])
-
-cmplx_ufuncs = ['reciprocal',
-                'absolute',
-                'square',
-                'conjugate']
-
-class AVX_cmplx_funcs(Benchmark):
-    params = [cmplx_ufuncs, cmplxstride, cmplxdtype]
-    param_names = ['bfunc', 'stride', 'dtype']
+class _AbstractUnary(Benchmark):
+    params = []  # supplied by the concrete subclasses
+    param_names = ['ufunc', 'stride_in', 'stride_out', 'dtype']
     timeout = 10
+    arrlen = 10000  # elements the operand exposes after striding
+    data_finite = True  # forwarded to get_data(finite=...)
+    data_denormal = False  # forwarded to get_data(denormal=...)
+    data_zeros = False  # forwarded to get_data(zeros=...)
+
+    def setup(self, ufunc, stride_in, stride_out, dtype):
+        arr_in = get_data(
+            stride_in*self.arrlen, dtype,
+            zeros=self.data_zeros,
+            finite=self.data_finite,
+            denormal=self.data_denormal,
+        )
+        self.ufunc_args = [arr_in[::stride_in]]  # strided view of the input
+
+        ufunc_insig = f'{dtype}->'
+        if ufunc_insig+dtype not in ufunc.types:  # no same-dtype loop
+            test = [sig for sig in ufunc.types if sig.startswith(ufunc_insig)]
+            if not test:
+                raise NotImplementedError(
+                    f"Ufunc {ufunc} doesn't support "
+                    f"unary input of dtype {dtype}"
+                ) from None
+            tout = test[0].split('->')[1]  # outputs of first matching loop
+        else:
+            tout = dtype
+
+        for dt in tout:  # one strided output buffer per output type char
+            self.ufunc_args += [
+                np.empty(stride_out*self.arrlen, dt)[::stride_out]
+            ]
 
-    def setup(self, bfuncname, stride, dtype):
         np.seterr(all='ignore')
-        try:
-            self.f = getattr(np, bfuncname)
-        except AttributeError:
-            raise NotImplementedError(f"No bfunc {bfuncname} found") from None
-        N = 10000
-        self.arr1 = np.ones(stride*N, dtype)
 
-    def time_ufunc(self, bfuncname, stride, dtype):
-        self.f(self.arr1[::stride])
+    def time_unary(self, ufunc, stride_in, stride_out, dtype):
+        ufunc(*self.ufunc_args)  # input view followed by output buffer(s)
+
+class UnaryFP(_AbstractUnary):
+    params = [UFUNCS_UNARY, [1, 2, 4], [1, 2, 4], ['e', 'f', 'd']]
+
+    def setup(self, ufunc, stride_in, stride_out, dtype):
+        _AbstractUnary.setup(self, ufunc, stride_in, stride_out, dtype)
+        if (ufunc.__name__ == 'arccosh'):  # shift the input up by 1.0
+            self.ufunc_args[0] += 1.0  # in place, so the view stays strided
+
+class UnaryFPSpecial(UnaryFP):
+    data_finite = False  # get_data(finite=False): nan/inf blocks included
+    data_denormal = True  # get_data(denormal=True): subnormals included
+    data_zeros = True  # get_data(zeros=True): zeros included
+
+class BinaryFP(_AbstractBinary):
+    params = [  # ufunc, stride_in0, stride_in1, stride_out, dtype
+        [np.maximum, np.minimum, np.fmax, np.fmin, np.ldexp],
+        [1, 2, 4], [1, 2, 4], [1, 2, 4], ['f', 'd']
+    ]
+
+class BinaryFPSpecial(BinaryFP):
+    data_finite = False  # get_data(finite=False): nan/inf blocks included
+    data_denormal = True  # get_data(denormal=True): subnormals included
+    data_zeros = True  # get_data(zeros=True): zeros included
+
+class BinaryComplex(_AbstractBinary):
+    params = [  # ufunc, stride_in0, stride_in1, stride_out, dtype
+        [np.add, np.subtract, np.multiply, np.divide],
+        [1, 2, 4], [1, 2, 4], [1, 2, 4],
+        ['F', 'D']
+    ]
+
+class UnaryComplex(_AbstractUnary):
+    params = [  # ufunc, stride_in, stride_out, dtype
+        [np.reciprocal, np.absolute, np.square, np.conjugate],
+        [1, 2, 4], [1, 2, 4], ['F', 'D']
+    ]
+
+class BinaryInt(_AbstractBinary):
+    arrlen = 100000  # overrides the base-class default of 10000
+    params = [  # ufunc, stride_in0, stride_in1, stride_out, dtype
+        [np.maximum, np.minimum],
+        [1, 2], [1, 2], [1, 2],
+        ['b', 'B', 'h', 'H', 'i', 'I', 'l', 'L', 'q', 'Q']
+    ]
+
+class BinaryIntContig(_AbstractBinary):
+    params = [  # ufunc, stride_in0, stride_in1, stride_out, dtype
+        [getattr(np, uf) for uf in (
+            'add', 'subtract', 'multiply', 'bitwise_and', 'bitwise_or',
+            'bitwise_xor', 'logical_and', 'logical_or', 'logical_xor',
+            'right_shift', 'left_shift',
+        )],
+        [1], [1], [1],  # unit strides only
+        ['b', 'B', 'h', 'H', 'i', 'I', 'l', 'L', 'q', 'Q']
+    ]
+
+class UnaryIntContig(_AbstractUnary):
+    arrlen = 100000  # overrides the base-class default of 10000
+    params = [  # ufunc, stride_in, stride_out, dtype
+        [getattr(np, uf) for uf in (
+            'positive', 'square', 'reciprocal', 'conjugate', 'logical_not',
+            'invert', 'isnan', 'isinf', 'isfinite',
+            'absolute', 'sign'
+        )],
+        [1], [1],  # unit strides only
+        ['b', 'B', 'h', 'H', 'i', 'I', 'l', 'L', 'q', 'Q']
+    ]
 
 class Mandelbrot(Benchmark):
     def f(self,z):
         return np.abs(z) < 4.0
 
     def g(self,z,c):
-        return np.sum(np.multiply(z,z) + c)
+        return np.sum(np.multiply(z, z) + c)
 
     def mandelbrot_numpy(self, c, maxiter):
-        output = np.zeros(c.shape, np.int)
+        output = np.zeros(c.shape, np.int32)
         z = np.empty(c.shape, np.complex64)
         for it in range(maxiter):
             notdone = self.f(z)
diff --git a/benchmarks/benchmarks/common.py b/benchmarks/benchmarks/common.py
index 0c40e85b0..d10fe999d 100644
--- a/benchmarks/benchmarks/common.py
+++ b/benchmarks/benchmarks/common.py
@@ -1,5 +1,8 @@
-import numpy
+import numpy as np
 import random
+import os
+from functools import lru_cache
+from pathlib import Path
 
 # Various pre-crafted datasets/variables for testing
 # !!! Must not be changed -- only appended !!!
@@ -7,7 +10,7 @@ import random
 # sequences
 random.seed(1)
 # but will seed it nevertheless
-numpy.random.seed(1)
+np.random.seed(1)
 
 nx, ny = 1000, 1000
 # reduced squares based on indexes_rand, primarily for testing more
@@ -19,37 +22,37 @@ TYPES1 = [
     'int16', 'float16',
     'int32', 'float32',
     'int64', 'float64',  'complex64',
-    'longfloat', 'complex128',
+    'longdouble', 'complex128',
 ]
-if 'complex256' in numpy.sctypeDict:
-    TYPES1.append('complex256')
+if 'complex256' in np.sctypeDict:
+    TYPES1.append('clongdouble')
 
+DLPACK_TYPES = [  # dtype names used as the DLPMethods 'npdtypes' parameter
+    'int16', 'float16',
+    'int32', 'float32',
+    'int64', 'float64',  'complex64',
+    'complex128', 'bool',
+]
 
-def memoize(func):
-    result = []
-    def wrapper():
-        if not result:
-            result.append(func())
-        return result[0]
-    return wrapper
-
+# Directory in which get_data() stores its generated arrays as .bin files
+CACHE_ROOT = Path(__file__).resolve().parent.parent / 'env' / 'numpy_benchdata'
 
 # values which will be used to construct our sample data matrices
 # replicate 10 times to speed up initial imports of this helper
 # and generate some redundancy
 
-@memoize
+@lru_cache(typed=True)
 def get_values():
-    rnd = numpy.random.RandomState(1)
-    values = numpy.tile(rnd.uniform(0, 100, size=nx*ny//10), 10)
+    rnd = np.random.RandomState(1)
+    values = np.tile(rnd.uniform(0, 100, size=nx*ny//10), 10)
     return values
 
 
-@memoize
+@lru_cache(typed=True)
 def get_squares():
     values = get_values()
-    squares = {t: numpy.array(values,
-                              dtype=getattr(numpy, t)).reshape((nx, ny))
+    squares = {t: np.array(values,
+                              dtype=getattr(np, t)).reshape((nx, ny))
                for t in TYPES1}
 
     # adjust complex ones to have non-degenerated imagery part -- use
@@ -60,42 +63,42 @@ def get_squares():
     return squares
 
 
-@memoize
+@lru_cache(typed=True)
 def get_squares_():
     # smaller squares
     squares_ = {t: s[:nxs, :nys] for t, s in get_squares().items()}
     return squares_
 
 
-@memoize
+@lru_cache(typed=True)
 def get_vectors():
     # vectors
     vectors = {t: s[0] for t, s in get_squares().items()}
     return vectors
 
 
-@memoize
+@lru_cache(typed=True)
 def get_indexes():
     indexes = list(range(nx))
     # so we do not have all items
     indexes.pop(5)
     indexes.pop(95)
 
-    indexes = numpy.array(indexes)
+    indexes = np.array(indexes)
     return indexes
 
 
-@memoize
+@lru_cache(typed=True)
 def get_indexes_rand():
     rnd = random.Random(1)
 
     indexes_rand = get_indexes().tolist()       # copy
     rnd.shuffle(indexes_rand)         # in-place shuffle
-    indexes_rand = numpy.array(indexes_rand)
+    indexes_rand = np.array(indexes_rand)
     return indexes_rand
 
 
-@memoize
+@lru_cache(typed=True)
 def get_indexes_():
     # smaller versions
     indexes = get_indexes()
@@ -103,12 +106,112 @@ def get_indexes_():
     return indexes_
 
 
-@memoize
+@lru_cache(typed=True)
 def get_indexes_rand_():
     indexes_rand = get_indexes_rand()
     indexes_rand_ = indexes_rand[indexes_rand < nxs]
     return indexes_rand_
 
 
+@lru_cache(typed=True)
+def get_data(size, dtype, ip_num=0, zeros=False, finite=True, denormal=False):
+    """
+    Generates a cached random array that covers several scenarios that
+    may affect the benchmark for fairness and to stabilize the benchmark.
+
+    Parameters
+    ----------
+    size: int
+        Array length.
+
+    dtype: dtype or dtype specifier
+
+    ip_num: int
+        Input number, to avoid memory overload
+        and to provide unique data for each operand.
+
+    zeros: bool
+        Spreading zeros along with generated data.
+
+    finite: bool
+        Avoid spreading fp special cases nan/inf.
+
+    denormal: bool
+        Spreading subnormal numbers along with generated data.
+    """
+    dtype = np.dtype(dtype)
+    dname = dtype.name
+    cache_name = f'{dname}_{size}_{ip_num}_{int(zeros)}'
+    if dtype.kind in 'fc':
+        cache_name += f'{int(finite)}{int(denormal)}'
+    cache_name += '.bin'
+    cache_path = CACHE_ROOT / cache_name
+    if cache_path.exists():
+        return np.fromfile(cache_path, dtype)
+
+    array = np.ones(size, dtype)
+    rands = []
+    if dtype.kind == 'i':
+        dinfo = np.iinfo(dtype)
+        scale = 8
+        if zeros:
+            scale += 1
+        lsize = size // scale
+        for low, high in (
+            (-0x80, -1),
+            (1, 0x7f),
+            (-0x8000, -1),
+            (1, 0x7fff),
+            (-0x80000000, -1),
+            (1, 0x7fffffff),
+            (-0x8000000000000000, -1),
+            (1, 0x7fffffffffffffff),
+        ):
+            rands += [np.random.randint(
+                max(low, dinfo.min),
+                min(high, dinfo.max),
+                lsize, dtype
+            )]
+    elif dtype.kind == 'u':
+        dinfo = np.iinfo(dtype)
+        scale = 4
+        if zeros:
+            scale += 1
+        lsize = size // scale
+        for high in (0xff, 0xffff, 0xffffffff, 0xffffffffffffffff):
+            rands += [np.random.randint(1, min(high, dinfo.max), lsize, dtype)]
+    elif dtype.kind in 'fc':
+        scale = 1
+        if zeros:
+            scale += 1
+        if not finite:
+            scale += 2
+        if denormal:
+            scale += 1
+        dinfo = np.finfo(dtype)
+        lsize = size // scale
+        rands = [np.random.rand(lsize).astype(dtype)]
+        if not finite:
+            rands += [
+                np.empty(lsize, dtype=dtype), np.empty(lsize, dtype=dtype)
+            ]
+            rands[1].fill(float('nan'))
+            rands[2].fill(float('inf'))
+        if denormal:
+            rands += [np.empty(lsize, dtype=dtype)]
+            rands[-1].fill(dinfo.smallest_subnormal)
+
+    if rands:
+        if zeros:
+            rands += [np.zeros(lsize, dtype)]
+        stride = len(rands)
+        for start, r in enumerate(rands):
+            array[start:len(r)*stride:stride] = r
+
+    # exist_ok: do not fail if another process created the directory first
+    CACHE_ROOT.mkdir(parents=True, exist_ok=True)
+    array.tofile(cache_path)
+    return array
+
 class Benchmark:
     pass