diff options
Diffstat (limited to 'numpy')
129 files changed, 3270 insertions, 1738 deletions
diff --git a/numpy/__init__.py b/numpy/__init__.py index aae5c95ac..83487dc97 100644 --- a/numpy/__init__.py +++ b/numpy/__init__.py @@ -274,6 +274,7 @@ else: def __getattr__(attr): # Warn for expired attributes, and return a dummy function # that always raises an exception. + import warnings try: msg = __expired_functions__[attr] except KeyError: @@ -312,7 +313,11 @@ else: "{!r}".format(__name__, attr)) def __dir__(): - return list(globals().keys() | {'Tester', 'testing'}) + public_symbols = globals().keys() | {'Tester', 'testing'} + public_symbols -= { + "core", "matrixlib", + } + return list(public_symbols) # Pytest testing from numpy._pytesttester import PytestTester @@ -358,7 +363,6 @@ else: except ValueError: pass - import sys if sys.platform == "darwin": with warnings.catch_warnings(record=True) as w: _mac_os_check() @@ -414,6 +418,12 @@ else: from pathlib import Path return [str(Path(__file__).with_name("_pyinstaller").resolve())] + # Remove symbols imported for internal use + del os + # get the version using versioneer from .version import __version__, git_revision as __git_version__ + +# Remove symbols imported for internal use +del sys, warnings diff --git a/numpy/__init__.pyi b/numpy/__init__.pyi index 2eb4a0634..d6faa9ca3 100644 --- a/numpy/__init__.pyi +++ b/numpy/__init__.pyi @@ -203,7 +203,6 @@ from numpy import ( lib as lib, linalg as linalg, ma as ma, - matrixlib as matrixlib, polynomial as polynomial, random as random, testing as testing, diff --git a/numpy/_pytesttester.py b/numpy/_pytesttester.py index 8decb9dd7..01ddaaf98 100644 --- a/numpy/_pytesttester.py +++ b/numpy/_pytesttester.py @@ -33,7 +33,6 @@ import os __all__ = ['PytestTester'] - def _show_numpy_info(): import numpy as np @@ -44,7 +43,6 @@ def _show_numpy_info(): print("NumPy CPU features: ", (info if info else 'nothing enabled')) - class PytestTester: """ Pytest test runner. 
@@ -167,7 +165,7 @@ class PytestTester: ] if doctests: - raise ValueError("Doctests not supported") + pytest_args += ["--doctest-modules"] if extra_argv: pytest_args += list(extra_argv) diff --git a/numpy/array_api/linalg.py b/numpy/array_api/linalg.py index f422e1c27..a4a2f23e4 100644 --- a/numpy/array_api/linalg.py +++ b/numpy/array_api/linalg.py @@ -1,8 +1,11 @@ from __future__ import annotations from ._dtypes import _floating_dtypes, _numeric_dtypes +from ._manipulation_functions import reshape from ._array_object import Array +from ..core.numeric import normalize_axis_tuple + from typing import TYPE_CHECKING if TYPE_CHECKING: from ._typing import Literal, Optional, Sequence, Tuple, Union @@ -395,18 +398,38 @@ def vector_norm(x: Array, /, *, axis: Optional[Union[int, Tuple[int, ...]]] = No if x.dtype not in _floating_dtypes: raise TypeError('Only floating-point dtypes are allowed in norm') + # np.linalg.norm tries to do a matrix norm whenever axis is a 2-tuple or + # when axis=None and the input is 2-D, so to force a vector norm, we make + # it so the input is 1-D (for axis=None), or reshape so that norm is done + # on a single dimension. a = x._array if axis is None: - a = a.flatten() - axis = 0 + # Note: np.linalg.norm() doesn't handle 0-D arrays + a = a.ravel() + _axis = 0 elif isinstance(axis, tuple): - # Note: The axis argument supports any number of axes, whereas norm() - # only supports a single axis for vector norm. - rest = tuple(i for i in range(a.ndim) if i not in axis) + # Note: The axis argument supports any number of axes, whereas + # np.linalg.norm() only supports a single axis for vector norm. 
+ normalized_axis = normalize_axis_tuple(axis, x.ndim) + rest = tuple(i for i in range(a.ndim) if i not in normalized_axis) newshape = axis + rest - a = np.transpose(a, newshape).reshape((np.prod([a.shape[i] for i in axis]), *[a.shape[i] for i in rest])) - axis = 0 - return Array._new(np.linalg.norm(a, axis=axis, keepdims=keepdims, ord=ord)) + a = np.transpose(a, newshape).reshape( + (np.prod([a.shape[i] for i in axis], dtype=int), *[a.shape[i] for i in rest])) + _axis = 0 + else: + _axis = axis + + res = Array._new(np.linalg.norm(a, axis=_axis, ord=ord)) + + if keepdims: + # We can't reuse np.linalg.norm(keepdims) because of the reshape hacks + # above to avoid matrix norm logic. + shape = list(x.shape) + _axis = normalize_axis_tuple(range(x.ndim) if axis is None else axis, x.ndim) + for i in _axis: + shape[i] = 1 + res = reshape(res, tuple(shape)) + return res __all__ = ['cholesky', 'cross', 'det', 'diagonal', 'eigh', 'eigvalsh', 'inv', 'matmul', 'matrix_norm', 'matrix_power', 'matrix_rank', 'matrix_transpose', 'outer', 'pinv', 'qr', 'slogdet', 'solve', 'svd', 'svdvals', 'tensordot', 'trace', 'vecdot', 'vector_norm'] diff --git a/numpy/core/_add_newdocs.py b/numpy/core/_add_newdocs.py index fb9c30d93..3e8df6d46 100644 --- a/numpy/core/_add_newdocs.py +++ b/numpy/core/_add_newdocs.py @@ -3437,6 +3437,24 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('fill', >>> a array([1., 1.]) + Fill expects a scalar value and always behaves the same as assigning + to a single array element. The following is a rare example where this + distinction is important: + + >>> a = np.array([None, None], dtype=object) + >>> a[0] = np.array(3) + >>> a + array([array(3), None], dtype=object) + >>> a.fill(np.array(3)) + >>> a + array([array(3), array(3)], dtype=object) + + Where other forms of assignments will unpack the array being assigned: + + >>> a[...] 
= np.array(3) + >>> a + array([3, 3], dtype=object) + """)) diff --git a/numpy/core/_asarray.py b/numpy/core/_asarray.py index 89d422e99..cbaab8c3f 100644 --- a/numpy/core/_asarray.py +++ b/numpy/core/_asarray.py @@ -14,6 +14,15 @@ from .multiarray import array, asanyarray __all__ = ["require"] +POSSIBLE_FLAGS = { + 'C': 'C', 'C_CONTIGUOUS': 'C', 'CONTIGUOUS': 'C', + 'F': 'F', 'F_CONTIGUOUS': 'F', 'FORTRAN': 'F', + 'A': 'A', 'ALIGNED': 'A', + 'W': 'W', 'WRITEABLE': 'W', + 'O': 'O', 'OWNDATA': 'O', + 'E': 'E', 'ENSUREARRAY': 'E' +} + def _require_dispatcher(a, dtype=None, requirements=None, *, like=None): return (like,) @@ -36,7 +45,7 @@ def require(a, dtype=None, requirements=None, *, like=None): The required data-type. If None preserve the current dtype. If your application requires the data to be in native byteorder, include a byteorder specification as a part of the dtype specification. - requirements : str or list of str + requirements : str or sequence of str The requirements list can be any of the following * 'F_CONTIGUOUS' ('F') - ensure a Fortran-contiguous array @@ -97,16 +106,10 @@ def require(a, dtype=None, requirements=None, *, like=None): like=like, ) - possible_flags = {'C': 'C', 'C_CONTIGUOUS': 'C', 'CONTIGUOUS': 'C', - 'F': 'F', 'F_CONTIGUOUS': 'F', 'FORTRAN': 'F', - 'A': 'A', 'ALIGNED': 'A', - 'W': 'W', 'WRITEABLE': 'W', - 'O': 'O', 'OWNDATA': 'O', - 'E': 'E', 'ENSUREARRAY': 'E'} if not requirements: return asanyarray(a, dtype=dtype) - else: - requirements = {possible_flags[x.upper()] for x in requirements} + + requirements = {POSSIBLE_FLAGS[x.upper()] for x in requirements} if 'E' in requirements: requirements.remove('E') @@ -128,8 +131,7 @@ def require(a, dtype=None, requirements=None, *, like=None): for prop in requirements: if not arr.flags[prop]: - arr = arr.copy(order) - break + return arr.copy(order) return arr diff --git a/numpy/core/code_generators/generate_numpy_api.py b/numpy/core/code_generators/generate_numpy_api.py index 
37975966f..a966be57d 100644 --- a/numpy/core/code_generators/generate_numpy_api.py +++ b/numpy/core/code_generators/generate_numpy_api.py @@ -89,19 +89,22 @@ _import_array(void) */ st = PyArray_GetEndianness(); if (st == NPY_CPU_UNKNOWN_ENDIAN) { - PyErr_Format(PyExc_RuntimeError, "FATAL: module compiled as unknown endian"); + PyErr_SetString(PyExc_RuntimeError, + "FATAL: module compiled as unknown endian"); return -1; } #if NPY_BYTE_ORDER == NPY_BIG_ENDIAN if (st != NPY_CPU_BIG) { - PyErr_Format(PyExc_RuntimeError, "FATAL: module compiled as "\ - "big endian, but detected different endianness at runtime"); + PyErr_SetString(PyExc_RuntimeError, + "FATAL: module compiled as big endian, but " + "detected different endianness at runtime"); return -1; } #elif NPY_BYTE_ORDER == NPY_LITTLE_ENDIAN if (st != NPY_CPU_LITTLE) { - PyErr_Format(PyExc_RuntimeError, "FATAL: module compiled as "\ - "little endian, but detected different endianness at runtime"); + PyErr_SetString(PyExc_RuntimeError, + "FATAL: module compiled as little endian, but " + "detected different endianness at runtime"); return -1; } #endif diff --git a/numpy/core/code_generators/ufunc_docstrings.py b/numpy/core/code_generators/ufunc_docstrings.py index 24b707a12..7c020fa2e 100644 --- a/numpy/core/code_generators/ufunc_docstrings.py +++ b/numpy/core/code_generators/ufunc_docstrings.py @@ -2011,7 +2011,7 @@ add_newdoc('numpy.core.umath', 'log', ----- Logarithm is a multivalued function: for each `x` there is an infinite number of `z` such that `exp(z) = x`. The convention is to return the - `z` whose imaginary part lies in `[-pi, pi]`. + `z` whose imaginary part lies in `(-pi, pi]`. For real-valued input data types, `log` always returns real output. For each value that cannot be expressed as a real number or infinity, it @@ -2021,6 +2021,10 @@ add_newdoc('numpy.core.umath', 'log', has a branch cut `[-inf, 0]` and is continuous from above on it. 
`log` handles the floating-point negative zero as an infinitesimal negative number, conforming to the C99 standard. + + In the cases where the input has a negative real part and a very small + negative complex part (approaching 0), the result is so close to `-pi` + that it evaluates to exactly `-pi`. References ---------- @@ -2061,7 +2065,7 @@ add_newdoc('numpy.core.umath', 'log10', ----- Logarithm is a multivalued function: for each `x` there is an infinite number of `z` such that `10**z = x`. The convention is to return the - `z` whose imaginary part lies in `[-pi, pi]`. + `z` whose imaginary part lies in `(-pi, pi]`. For real-valued input data types, `log10` always returns real output. For each value that cannot be expressed as a real number or infinity, @@ -2072,6 +2076,10 @@ add_newdoc('numpy.core.umath', 'log10', `log10` handles the floating-point negative zero as an infinitesimal negative number, conforming to the C99 standard. + In the cases where the input has a negative real part and a very small + negative complex part (approaching 0), the result is so close to `-pi` + that it evaluates to exactly `-pi`. + References ---------- .. [1] M. Abramowitz and I.A. Stegun, "Handbook of Mathematical Functions", @@ -2112,7 +2120,7 @@ add_newdoc('numpy.core.umath', 'log2', Logarithm is a multivalued function: for each `x` there is an infinite number of `z` such that `2**z = x`. The convention is to return the `z` - whose imaginary part lies in `[-pi, pi]`. + whose imaginary part lies in `(-pi, pi]`. For real-valued input data types, `log2` always returns real output. For each value that cannot be expressed as a real number or infinity, @@ -2123,6 +2131,10 @@ add_newdoc('numpy.core.umath', 'log2', handles the floating-point negative zero as an infinitesimal negative number, conforming to the C99 standard. 
+ In the cases where the input has a negative real part and a very small + negative complex part (approaching 0), the result is so close to `-pi` + that it evaluates to exactly `-pi`. + Examples -------- >>> x = np.array([0, 1, 2, 2**4]) diff --git a/numpy/core/include/numpy/experimental_dtype_api.h b/numpy/core/include/numpy/experimental_dtype_api.h index 1dd6215e6..23e9a8d21 100644 --- a/numpy/core/include/numpy/experimental_dtype_api.h +++ b/numpy/core/include/numpy/experimental_dtype_api.h @@ -214,7 +214,7 @@ typedef struct { } PyArrayMethod_Spec; -typedef PyObject *_ufunc_addloop_fromspec_func( +typedef int _ufunc_addloop_fromspec_func( PyObject *ufunc, PyArrayMethod_Spec *spec); /* * The main ufunc registration function. This adds a new implementation/loop diff --git a/numpy/core/include/numpy/ndarraytypes.h b/numpy/core/include/numpy/ndarraytypes.h index c295f34bb..97e0f4e2a 100644 --- a/numpy/core/include/numpy/ndarraytypes.h +++ b/numpy/core/include/numpy/ndarraytypes.h @@ -1380,7 +1380,10 @@ typedef struct { int nd_fancy; npy_intp fancy_dims[NPY_MAXDIMS]; - /* Whether the iterator (any of the iterators) requires API */ + /* + * Whether the iterator (any of the iterators) requires API. This is + * unused by NumPy itself; ArrayMethod flags are more precise. + */ int needs_api; /* diff --git a/numpy/core/overrides.py b/numpy/core/overrides.py index cb550152e..663436a4c 100644 --- a/numpy/core/overrides.py +++ b/numpy/core/overrides.py @@ -2,6 +2,7 @@ import collections import functools import os +import sys from numpy.core._multiarray_umath import ( add_docstring, implement_array_function, _get_implementing_args) @@ -176,7 +177,27 @@ def array_function_dispatch(dispatcher, module=None, verify=True, @functools.wraps(implementation) def public_api(*args, **kwargs): - relevant_args = dispatcher(*args, **kwargs) + try: + relevant_args = dispatcher(*args, **kwargs) + except TypeError as exc: + # Try to clean up a signature related TypeError. 
Such an + # error will be something like: + # dispatcher.__name__() got an unexpected keyword argument + # + # So replace the dispatcher name in this case. In principle + # TypeErrors may be raised from _within_ the dispatcher, so + # we check that the traceback contains a string that starts + # with the name. (In principle we could also check the + # traceback length, as it would be deeper.) + msg = exc.args[0] + disp_name = dispatcher.__name__ + if not isinstance(msg, str) or not msg.startswith(disp_name): + raise + + # Replace with the correct name and re-raise: + new_msg = msg.replace(disp_name, public_api.__name__) + raise TypeError(new_msg) from None + return implement_array_function( implementation, public_api, relevant_args, args, kwargs) diff --git a/numpy/core/setup.py b/numpy/core/setup.py index 7d072c15c..543b6ae39 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -1,9 +1,9 @@ import os import sys +import sysconfig import pickle import copy import warnings -import platform import textwrap import glob from os.path import join @@ -79,9 +79,8 @@ def can_link_svml(): """ if NPY_DISABLE_SVML: return False - machine = platform.machine() - system = platform.system() - return "x86_64" in machine and system == "Linux" + platform = sysconfig.get_platform() + return "x86_64" in platform and "linux" in platform def check_svml_submodule(svmlpath): if not os.path.exists(svmlpath + "/README.md"): @@ -1081,6 +1080,7 @@ def configuration(parent_package='',top_path=None): join('src', 'umath', 'scalarmath.c.src'), join('src', 'umath', 'ufunc_type_resolution.c'), join('src', 'umath', 'override.c'), + join('src', 'umath', 'string_ufuncs.cpp'), # For testing. 
Eventually, should use public API and be separate: join('src', 'umath', '_scaled_float_dtype.c'), ] diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src index 0f3e4fc8f..997205957 100644 --- a/numpy/core/src/_simd/_simd.dispatch.c.src +++ b/numpy/core/src/_simd/_simd.dispatch.c.src @@ -18,7 +18,7 @@ * #esfx = u16,s8, u32, s16, u32, s32, u64, s64, f32, f64# * #size = 8, 8, 16, 16, 32, 32, 64, 64, 32, 64# * #expand_sup= 1, 0, 1, 0, 0, 0, 0, 0, 0, 0# - * #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64# + * #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F32, NPY_SIMD_F64# * #fp_only = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1# * #sat_sup = 1, 1, 1, 1, 0, 0, 0, 0, 0, 0# * #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1# @@ -252,7 +252,7 @@ SIMD_IMPL_INTRIN_3(select_@sfx@, v@sfx@, v@bsfx@, v@sfx@, v@sfx@) /**begin repeat1 * #sfx_to = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64# - * #simd_sup2 = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64# + * #simd_sup2 = 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F32, NPY_SIMD_F64# */ #if @simd_sup2@ SIMD_IMPL_INTRIN_1(reinterpret_@sfx_to@_@sfx@, v@sfx_to@, v@sfx@) @@ -442,7 +442,9 @@ SIMD_IMPL_INTRIN_0N(cleanup) * Operators ***************************/ // check special cases -SIMD_IMPL_INTRIN_1(notnan_f32, vb32, vf32) +#if NPY_SIMD_F32 + SIMD_IMPL_INTRIN_1(notnan_f32, vb32, vf32) +#endif #if NPY_SIMD_F64 SIMD_IMPL_INTRIN_1(notnan_f64, vb64, vf64) #endif @@ -450,7 +452,9 @@ SIMD_IMPL_INTRIN_1(notnan_f32, vb32, vf32) * Conversions ***************************/ // round to nearest integer (assume even) -SIMD_IMPL_INTRIN_1(round_s32_f32, vs32, vf32) +#if NPY_SIMD_F32 + SIMD_IMPL_INTRIN_1(round_s32_f32, vs32, vf32) +#endif #if NPY_SIMD_F64 SIMD_IMPL_INTRIN_2(round_s32_f64, vs32, vf64, vf64) #endif @@ -492,10 +496,10 @@ static PyMethodDef simd__intrinsics_methods[] = { /**begin repeat * #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64# * #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64# - * #esfx = u16,s8, u32, s16, 
u32, s32, u64, s64, f32, f64# * #size = 8, 8, 16, 16, 32, 32, 64, 64, 32, 64# + * #esfx = u16, s8, u32,s16, u32, s32, u64, s64, f32, f64# * #expand_sup= 1, 0, 1, 0, 0, 0, 0, 0, 0, 0# - * #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64# + * #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F32, NPY_SIMD_F64# * #fp_only = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1# * #sat_sup = 1, 1, 1, 1, 0, 0, 0, 0, 0, 0# * #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1# @@ -547,7 +551,7 @@ SIMD_INTRIN_DEF(lut16_@sfx@) ***************************/ /**begin repeat1 * #sfx_to = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64# - * #simd_sup2 = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64# + * #simd_sup2 = 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F32, NPY_SIMD_F64# */ #if @simd_sup2@ SIMD_INTRIN_DEF(reinterpret_@sfx_to@_@sfx@) @@ -698,7 +702,9 @@ SIMD_INTRIN_DEF(cleanup) * Operators ***************************/ // check special cases -SIMD_INTRIN_DEF(notnan_f32) +#if NPY_SIMD_F32 + SIMD_INTRIN_DEF(notnan_f32) +#endif #if NPY_SIMD_F64 SIMD_INTRIN_DEF(notnan_f64) #endif @@ -706,7 +712,9 @@ SIMD_INTRIN_DEF(notnan_f32) * Conversions ***************************/ // round to nearest integer (assume even) -SIMD_INTRIN_DEF(round_s32_f32) +#if NPY_SIMD_F32 + SIMD_INTRIN_DEF(round_s32_f32) +#endif #if NPY_SIMD_F64 SIMD_INTRIN_DEF(round_s32_f64) #endif @@ -777,12 +785,18 @@ NPY_CPU_DISPATCH_CURFX(simd_create_module)(void) if (PyModule_AddIntConstant(m, "simd_f64", NPY_SIMD_F64)) { goto err; } + if (PyModule_AddIntConstant(m, "simd_f32", NPY_SIMD_F32)) { + goto err; + } if (PyModule_AddIntConstant(m, "simd_fma3", NPY_SIMD_FMA3)) { goto err; } if (PyModule_AddIntConstant(m, "simd_width", NPY_SIMD_WIDTH)) { goto err; } + if (PyModule_AddIntConstant(m, "simd_bigendian", NPY_SIMD_BIGENDIAN)) { + goto err; + } #if NPY_SIMD if (PySIMDVectorType_Init(m)) { goto err; diff --git a/numpy/core/src/_simd/_simd_convert.inc b/numpy/core/src/_simd/_simd_convert.inc index 46e044479..58eb90d69 100644 --- a/numpy/core/src/_simd/_simd_convert.inc +++ 
b/numpy/core/src/_simd/_simd_convert.inc @@ -20,6 +20,10 @@ simd_scalar_from_number(PyObject *obj, simd_data_type dtype) } } else { data.u64 = PyLong_AsUnsignedLongLongMask(obj); + #if NPY_SIMD_BIGENDIAN + int leftb = (sizeof(npyv_lanetype_u64) - info->lane_size) * 8; + data.u64 <<= leftb; + #endif } return data; } @@ -36,7 +40,9 @@ simd_scalar_to_number(simd_data data, simd_data_type dtype) return PyFloat_FromDouble(data.f64); } int leftb = (sizeof(npyv_lanetype_u64) - info->lane_size) * 8; +#if !NPY_SIMD_BIGENDIAN data.u64 <<= leftb; +#endif if (info->is_signed) { return PyLong_FromLongLong(data.s64 >> leftb); } diff --git a/numpy/core/src/_simd/_simd_inc.h.src b/numpy/core/src/_simd/_simd_inc.h.src index fbdf982c2..887545414 100644 --- a/numpy/core/src/_simd/_simd_inc.h.src +++ b/numpy/core/src/_simd/_simd_inc.h.src @@ -27,22 +27,27 @@ typedef union /**end repeat**/ // vectors /**begin repeat - * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, b8, b16, b32, b64# + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, b8, b16, b32, b64# */ npyv_@sfx@ v@sfx@; /**end repeat**/ // multi-vectors x2 /**begin repeat - * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32# + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64# */ npyv_@sfx@x2 v@sfx@x2; /**end repeat**/ // multi-vectors x3 /**begin repeat - * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32# + * #sfx = u8, u16, u32, u64, s8, s16, s32, s64# */ npyv_@sfx@x3 v@sfx@x3; /**end repeat**/ +#if NPY_SIMD_F32 + npyv_f32 vf32; + npyv_f32x2 vf32x2; + npyv_f32x3 vf32x3; +#endif #if NPY_SIMD_F64 npyv_f64 vf64; npyv_f64x2 vf64x2; diff --git a/numpy/core/src/common/lowlevel_strided_loops.h b/numpy/core/src/common/lowlevel_strided_loops.h index 118ce9cb1..924a34db5 100644 --- a/numpy/core/src/common/lowlevel_strided_loops.h +++ b/numpy/core/src/common/lowlevel_strided_loops.h @@ -196,7 +196,7 @@ PyArray_GetDTypeTransferFunction(int aligned, PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype, int move_references, NPY_cast_info 
*cast_info, - int *out_needs_api); + NPY_ARRAYMETHOD_FLAGS *out_flags); NPY_NO_EXPORT int get_fields_transfer_function(int aligned, @@ -205,7 +205,7 @@ get_fields_transfer_function(int aligned, int move_references, PyArrayMethod_StridedLoop **out_stransfer, NpyAuxData **out_transferdata, - int *out_needs_api); + NPY_ARRAYMETHOD_FLAGS *out_flags); NPY_NO_EXPORT int get_subarray_transfer_function(int aligned, @@ -214,7 +214,7 @@ get_subarray_transfer_function(int aligned, int move_references, PyArrayMethod_StridedLoop **out_stransfer, NpyAuxData **out_transferdata, - int *out_needs_api); + NPY_ARRAYMETHOD_FLAGS *out_flags); /* * This is identical to PyArray_GetDTypeTransferFunction, but returns a @@ -241,7 +241,7 @@ PyArray_GetMaskedDTypeTransferFunction(int aligned, PyArray_Descr *mask_dtype, int move_references, NPY_cast_info *cast_info, - int *out_needs_api); + NPY_ARRAYMETHOD_FLAGS *out_flags); /* * Casts the specified number of elements from 'src' with data type @@ -336,10 +336,14 @@ mapiter_trivial_set(PyArrayObject *self, PyArrayObject *ind, PyArrayObject *result); NPY_NO_EXPORT int -mapiter_get(PyArrayMapIterObject *mit); +mapiter_get( + PyArrayMapIterObject *mit, NPY_cast_info *cast_info, + NPY_ARRAYMETHOD_FLAGS flags, int is_aligned); NPY_NO_EXPORT int -mapiter_set(PyArrayMapIterObject *mit); +mapiter_set( + PyArrayMapIterObject *mit, NPY_cast_info *cast_info, + NPY_ARRAYMETHOD_FLAGS flags, int is_aligned); /* * Prepares shape and strides for a simple raw array iteration. diff --git a/numpy/core/src/common/npy_cpu_dispatch.h b/numpy/core/src/common/npy_cpu_dispatch.h index e814cd425..4d5addec8 100644 --- a/numpy/core/src/common/npy_cpu_dispatch.h +++ b/numpy/core/src/common/npy_cpu_dispatch.h @@ -22,7 +22,7 @@ * which is explicitly disabling the module ccompiler_opt. 
*/ #ifndef NPY_DISABLE_OPTIMIZATION - #if defined(__powerpc64__) && !defined(__cplusplus) && defined(bool) + #if (defined(__s390x__) || defined(__powerpc64__)) && !defined(__cplusplus) && defined(bool) /** * "altivec.h" header contains the definitions(bool, vector, pixel), * usually in c++ we undefine them after including the header. @@ -34,7 +34,7 @@ typedef bool npy__dispatch_bkbool; #endif #include "npy_cpu_dispatch_config.h" - #ifdef NPY_HAVE_VSX + #if defined(NPY_HAVE_VSX) || defined(NPY_HAVE_VX) #undef bool #undef vector #undef pixel diff --git a/numpy/core/src/common/numpyos.h b/numpy/core/src/common/numpyos.h index ce49cbea7..6e526af17 100644 --- a/numpy/core/src/common/numpyos.h +++ b/numpy/core/src/common/numpyos.h @@ -1,6 +1,10 @@ #ifndef NUMPY_CORE_SRC_COMMON_NPY_NUMPYOS_H_ #define NUMPY_CORE_SRC_COMMON_NPY_NUMPYOS_H_ +#ifdef __cplusplus +extern "C" { +#endif + NPY_NO_EXPORT char* NumPyOS_ascii_formatd(char *buffer, size_t buf_size, const char *format, @@ -39,4 +43,8 @@ NumPyOS_strtoll(const char *str, char **endptr, int base); NPY_NO_EXPORT npy_ulonglong NumPyOS_strtoull(const char *str, char **endptr, int base); +#ifdef __cplusplus +} +#endif + #endif /* NUMPY_CORE_SRC_COMMON_NPY_NUMPYOS_H_ */ diff --git a/numpy/core/src/common/simd/avx2/avx2.h b/numpy/core/src/common/simd/avx2/avx2.h index 02ff536fb..8cb74df2b 100644 --- a/numpy/core/src/common/simd/avx2/avx2.h +++ b/numpy/core/src/common/simd/avx2/avx2.h @@ -3,12 +3,14 @@ #endif #define NPY_SIMD 256 #define NPY_SIMD_WIDTH 32 +#define NPY_SIMD_F32 1 #define NPY_SIMD_F64 1 #ifdef NPY_HAVE_FMA3 #define NPY_SIMD_FMA3 1 // native support #else #define NPY_SIMD_FMA3 0 // fast emulated #endif +#define NPY_SIMD_BIGENDIAN 0 // Enough limit to allow us to use _mm256_i32gather_* #define NPY_SIMD_MAXLOAD_STRIDE32 (0x7fffffff / 8) diff --git a/numpy/core/src/common/simd/avx512/avx512.h b/numpy/core/src/common/simd/avx512/avx512.h index f38686834..0946e6443 100644 --- a/numpy/core/src/common/simd/avx512/avx512.h 
+++ b/numpy/core/src/common/simd/avx512/avx512.h @@ -3,8 +3,10 @@ #endif #define NPY_SIMD 512 #define NPY_SIMD_WIDTH 64 +#define NPY_SIMD_F32 1 #define NPY_SIMD_F64 1 #define NPY_SIMD_FMA3 1 // native support +#define NPY_SIMD_BIGENDIAN 0 // Enough limit to allow us to use _mm512_i32gather_* and _mm512_i32scatter_* #define NPY_SIMD_MAXLOAD_STRIDE32 (0x7fffffff / 16) #define NPY_SIMD_MAXSTORE_STRIDE32 (0x7fffffff / 16) diff --git a/numpy/core/src/common/simd/emulate_maskop.h b/numpy/core/src/common/simd/emulate_maskop.h index 41e397c2d..2a808a153 100644 --- a/numpy/core/src/common/simd/emulate_maskop.h +++ b/numpy/core/src/common/simd/emulate_maskop.h @@ -36,7 +36,9 @@ NPYV_IMPL_EMULATE_MASK_ADDSUB(u32, b32) NPYV_IMPL_EMULATE_MASK_ADDSUB(s32, b32) NPYV_IMPL_EMULATE_MASK_ADDSUB(u64, b64) NPYV_IMPL_EMULATE_MASK_ADDSUB(s64, b64) -NPYV_IMPL_EMULATE_MASK_ADDSUB(f32, b32) +#if NPY_SIMD_F32 + NPYV_IMPL_EMULATE_MASK_ADDSUB(f32, b32) +#endif #if NPY_SIMD_F64 NPYV_IMPL_EMULATE_MASK_ADDSUB(f64, b64) #endif diff --git a/numpy/core/src/common/simd/intdiv.h b/numpy/core/src/common/simd/intdiv.h index 8b65b3a76..f5066b59b 100644 --- a/numpy/core/src/common/simd/intdiv.h +++ b/numpy/core/src/common/simd/intdiv.h @@ -89,7 +89,9 @@ NPY_FINLINE unsigned npyv__bitscan_revnz_u32(npy_uint32 a) unsigned long rl; (void)_BitScanReverse(&rl, (unsigned long)a); r = (unsigned)rl; -#elif defined(NPY_HAVE_SSE2) && (defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)) + +#elif defined(NPY_HAVE_SSE2) && (defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)) \ + && (defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64)) __asm__("bsr %1, %0" : "=r" (r) : "r"(a)); #elif defined(__GNUC__) || defined(__clang__) r = 31 - __builtin_clz(a); // performs on arm -> clz, ppc -> cntlzw @@ -206,7 +208,7 @@ NPY_FINLINE npyv_u8x3 npyv_divisor_u8(npy_uint8 d) divisor.val[0] = npyv_setall_u16(m); divisor.val[1] = npyv_set_u8(sh1); divisor.val[2] = npyv_set_u8(sh2); -#elif 
defined(NPY_HAVE_VSX2) +#elif defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX) divisor.val[0] = npyv_setall_u8(m); divisor.val[1] = npyv_setall_u8(sh1); divisor.val[2] = npyv_setall_u8(sh2); @@ -247,7 +249,7 @@ NPY_FINLINE npyv_s8x3 npyv_divisor_s8(npy_int8 d) npyv_s8x3 divisor; divisor.val[0] = npyv_setall_s8(m); divisor.val[2] = npyv_setall_s8(d < 0 ? -1 : 0); - #ifdef NPY_HAVE_VSX2 + #if defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX) divisor.val[1] = npyv_setall_s8(sh); #elif defined(NPY_HAVE_NEON) divisor.val[1] = npyv_setall_s8(-sh); @@ -283,7 +285,7 @@ NPY_FINLINE npyv_u16x3 npyv_divisor_u16(npy_uint16 d) #ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512 divisor.val[1] = npyv_set_u16(sh1); divisor.val[2] = npyv_set_u16(sh2); -#elif defined(NPY_HAVE_VSX2) +#elif defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX) divisor.val[1] = npyv_setall_u16(sh1); divisor.val[2] = npyv_setall_u16(sh2); #elif defined(NPY_HAVE_NEON) @@ -315,7 +317,7 @@ NPY_FINLINE npyv_s16x3 npyv_divisor_s16(npy_int16 d) divisor.val[2] = npyv_setall_s16(d < 0 ? -1 : 0); // sign of divisor #ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512 divisor.val[1] = npyv_set_s16(sh); -#elif defined(NPY_HAVE_VSX2) +#elif defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX) divisor.val[1] = npyv_setall_s16(sh); #elif defined(NPY_HAVE_NEON) divisor.val[1] = npyv_setall_s16(-sh); @@ -350,7 +352,7 @@ NPY_FINLINE npyv_u32x3 npyv_divisor_u32(npy_uint32 d) #ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512 divisor.val[1] = npyv_set_u32(sh1); divisor.val[2] = npyv_set_u32(sh2); -#elif defined(NPY_HAVE_VSX2) +#elif defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX) divisor.val[1] = npyv_setall_u32(sh1); divisor.val[2] = npyv_setall_u32(sh2); #elif defined(NPY_HAVE_NEON) @@ -387,7 +389,7 @@ NPY_FINLINE npyv_s32x3 npyv_divisor_s32(npy_int32 d) divisor.val[2] = npyv_setall_s32(d < 0 ? 
-1 : 0); // sign of divisor #ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512 divisor.val[1] = npyv_set_s32(sh); -#elif defined(NPY_HAVE_VSX2) +#elif defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX) divisor.val[1] = npyv_setall_s32(sh); #elif defined(NPY_HAVE_NEON) divisor.val[1] = npyv_setall_s32(-sh); @@ -400,7 +402,7 @@ NPY_FINLINE npyv_s32x3 npyv_divisor_s32(npy_int32 d) NPY_FINLINE npyv_u64x3 npyv_divisor_u64(npy_uint64 d) { npyv_u64x3 divisor; -#if defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_NEON) +#if defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX) || defined(NPY_HAVE_NEON) divisor.val[0] = npyv_setall_u64(d); #else npy_uint64 l, l2, sh1, sh2, m; @@ -435,7 +437,7 @@ NPY_FINLINE npyv_u64x3 npyv_divisor_u64(npy_uint64 d) NPY_FINLINE npyv_s64x3 npyv_divisor_s64(npy_int64 d) { npyv_s64x3 divisor; -#if defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_NEON) +#if defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX) || defined(NPY_HAVE_NEON) divisor.val[0] = npyv_setall_s64(d); divisor.val[1] = npyv_cvt_s64_b64( npyv_cmpeq_s64(npyv_setall_s64(-1), divisor.val[0]) diff --git a/numpy/core/src/common/simd/neon/math.h b/numpy/core/src/common/simd/neon/math.h index 4607d6f27..8f4680c8f 100644 --- a/numpy/core/src/common/simd/neon/math.h +++ b/numpy/core/src/common/simd/neon/math.h @@ -161,7 +161,7 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a) #else // ARMv7 NEON only supports fp to int truncate conversion. // a magic trick of adding 1.5 * 2**23 is used for rounding - // to nearest even and then substract this magic number to get + // to nearest even and then subtract this magic number to get // the integer. 
const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f)); const npyv_f32 magic = vdupq_n_f32(12582912.0f); // 1.5 * 2**23 diff --git a/numpy/core/src/common/simd/neon/neon.h b/numpy/core/src/common/simd/neon/neon.h index e6f6a7324..b08071527 100644 --- a/numpy/core/src/common/simd/neon/neon.h +++ b/numpy/core/src/common/simd/neon/neon.h @@ -4,7 +4,7 @@ #define NPY_SIMD 128 #define NPY_SIMD_WIDTH 16 - +#define NPY_SIMD_F32 1 #ifdef __aarch64__ #define NPY_SIMD_F64 1 #else @@ -15,6 +15,7 @@ #else #define NPY_SIMD_FMA3 0 // HW emulated #endif +#define NPY_SIMD_BIGENDIAN 0 typedef uint8x16_t npyv_u8; typedef int8x16_t npyv_s8; diff --git a/numpy/core/src/common/simd/simd.h b/numpy/core/src/common/simd/simd.h index 08b2a7d00..b1492500f 100644 --- a/numpy/core/src/common/simd/simd.h +++ b/numpy/core/src/common/simd/simd.h @@ -34,7 +34,7 @@ typedef double npyv_lanetype_f64; * They had bad impact on the generated instructions, * sometimes the compiler deal with them without the respect * of 32-bit mode which lead to crush due to execute 64-bit - * instructions and other times generate bad emulated instructions. + * instructions and other times generate bad emulated instructions. */ #undef _mm512_set1_epi64 #undef _mm256_set1_epi64x @@ -54,9 +54,9 @@ typedef double npyv_lanetype_f64; #include "sse/sse.h" #endif -// TODO: Add support for VSX(2.06) and BE Mode -#if defined(NPY_HAVE_VSX2) && defined(__LITTLE_ENDIAN__) - #include "vsx/vsx.h" +// TODO: Add support for VSX(2.06) and BE Mode for VSX +#if defined(NPY_HAVE_VX) || (defined(NPY_HAVE_VSX2) && defined(__LITTLE_ENDIAN__)) + #include "vec/vec.h" #endif #ifdef NPY_HAVE_NEON @@ -64,10 +64,20 @@ typedef double npyv_lanetype_f64; #endif #ifndef NPY_SIMD + /// SIMD width in bits or 0 if there's no SIMD extension available. #define NPY_SIMD 0 + /// SIMD width in bytes or 0 if there's no SIMD extension available. #define NPY_SIMD_WIDTH 0 + /// 1 if the enabled SIMD extension supports single-precision otherwise 0. 
+ #define NPY_SIMD_F32 0 + /// 1 if the enabled SIMD extension supports double-precision otherwise 0. #define NPY_SIMD_F64 0 + /// 1 if the enabled SIMD extension supports native FMA otherwise 0. + /// note: we still emulate(fast) FMA intrinsics even if they + /// aren't supported but they shouldn't be used if the precision is matters. #define NPY_SIMD_FMA3 0 + /// 1 if the enabled SIMD extension is running on big-endian mode otherwise 0. + #define NPY_SIMD_BIGENDIAN 0 #endif // enable emulated mask operations for all SIMD extension except for AVX512 diff --git a/numpy/core/src/common/simd/sse/sse.h b/numpy/core/src/common/simd/sse/sse.h index 0bb404312..c21bbfda7 100644 --- a/numpy/core/src/common/simd/sse/sse.h +++ b/numpy/core/src/common/simd/sse/sse.h @@ -4,12 +4,15 @@ #define NPY_SIMD 128 #define NPY_SIMD_WIDTH 16 +#define NPY_SIMD_F32 1 #define NPY_SIMD_F64 1 #if defined(NPY_HAVE_FMA3) || defined(NPY_HAVE_FMA4) #define NPY_SIMD_FMA3 1 // native support #else #define NPY_SIMD_FMA3 0 // fast emulated #endif +#define NPY_SIMD_BIGENDIAN 0 + typedef __m128i npyv_u8; typedef __m128i npyv_s8; typedef __m128i npyv_u16; diff --git a/numpy/core/src/common/simd/vsx/arithmetic.h b/numpy/core/src/common/simd/vec/arithmetic.h index 01dbf5480..a2e9d07eb 100644 --- a/numpy/core/src/common/simd/vsx/arithmetic.h +++ b/numpy/core/src/common/simd/vec/arithmetic.h @@ -2,8 +2,8 @@ #error "Not a standalone header" #endif -#ifndef _NPY_SIMD_VSX_ARITHMETIC_H -#define _NPY_SIMD_VSX_ARITHMETIC_H +#ifndef _NPY_SIMD_VEC_ARITHMETIC_H +#define _NPY_SIMD_VEC_ARITHMETIC_H /*************************** * Addition @@ -17,15 +17,32 @@ #define npyv_add_s32 vec_add #define npyv_add_u64 vec_add #define npyv_add_s64 vec_add +#if NPY_SIMD_F32 #define npyv_add_f32 vec_add +#endif #define npyv_add_f64 vec_add // saturated -#define npyv_adds_u8 vec_adds -#define npyv_adds_s8 vec_adds -#define npyv_adds_u16 vec_adds -#define npyv_adds_s16 vec_adds +#ifdef NPY_HAVE_VX + #define NPYV_IMPL_VX_ADDS(SFX, 
PSFX) \ + NPY_FINLINE npyv_##SFX npyv_adds_##SFX(npyv_##SFX a, npyv_##SFX b)\ + { \ + return vec_pack##PSFX( \ + vec_add(vec_unpackh(a), vec_unpackh(b)), \ + vec_add(vec_unpackl(a), vec_unpackl(b)) \ + ); \ + } + NPYV_IMPL_VX_ADDS(u8, su) + NPYV_IMPL_VX_ADDS(s8, s) + NPYV_IMPL_VX_ADDS(u16, su) + NPYV_IMPL_VX_ADDS(s16, s) +#else // VSX + #define npyv_adds_u8 vec_adds + #define npyv_adds_s8 vec_adds + #define npyv_adds_u16 vec_adds + #define npyv_adds_s16 vec_adds +#endif /*************************** * Subtraction ***************************/ @@ -38,21 +55,39 @@ #define npyv_sub_s32 vec_sub #define npyv_sub_u64 vec_sub #define npyv_sub_s64 vec_sub +#if NPY_SIMD_F32 #define npyv_sub_f32 vec_sub +#endif #define npyv_sub_f64 vec_sub // saturated -#define npyv_subs_u8 vec_subs -#define npyv_subs_s8 vec_subs -#define npyv_subs_u16 vec_subs -#define npyv_subs_s16 vec_subs +#ifdef NPY_HAVE_VX + #define NPYV_IMPL_VX_SUBS(SFX, PSFX) \ + NPY_FINLINE npyv_##SFX npyv_subs_##SFX(npyv_##SFX a, npyv_##SFX b)\ + { \ + return vec_pack##PSFX( \ + vec_sub(vec_unpackh(a), vec_unpackh(b)), \ + vec_sub(vec_unpackl(a), vec_unpackl(b)) \ + ); \ + } + + NPYV_IMPL_VX_SUBS(u8, su) + NPYV_IMPL_VX_SUBS(s8, s) + NPYV_IMPL_VX_SUBS(u16, su) + NPYV_IMPL_VX_SUBS(s16, s) +#else // VSX + #define npyv_subs_u8 vec_subs + #define npyv_subs_s8 vec_subs + #define npyv_subs_u16 vec_subs + #define npyv_subs_s16 vec_subs +#endif /*************************** * Multiplication ***************************/ // non-saturated // up to GCC 6 vec_mul only supports precisions and llong -#if defined(__GNUC__) && __GNUC__ < 7 +#if defined(NPY_HAVE_VSX) && defined(__GNUC__) && __GNUC__ < 7 #define NPYV_IMPL_VSX_MUL(T_VEC, SFX, ...) 
\ NPY_FINLINE T_VEC npyv_mul_##SFX(T_VEC a, T_VEC b) \ { \ @@ -91,7 +126,9 @@ #define npyv_mul_u32 vec_mul #define npyv_mul_s32 vec_mul #endif +#if NPY_SIMD_F32 #define npyv_mul_f32 vec_mul +#endif #define npyv_mul_f64 vec_mul /*************************** @@ -101,6 +138,9 @@ // divide each unsigned 8-bit element by a precomputed divisor NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor) { +#ifdef NPY_HAVE_VX + npyv_u8 mulhi = vec_mulh(a, divisor.val[0]); +#else // VSX const npyv_u8 mergeo_perm = { 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31 }; @@ -108,6 +148,7 @@ NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor) npyv_u16 mul_even = vec_mule(a, divisor.val[0]); npyv_u16 mul_odd = vec_mulo(a, divisor.val[0]); npyv_u8 mulhi = (npyv_u8)vec_perm(mul_even, mul_odd, mergeo_perm); +#endif // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 npyv_u8 q = vec_sub(a, mulhi); q = vec_sr(q, divisor.val[1]); @@ -118,6 +159,9 @@ NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor) // divide each signed 8-bit element by a precomputed divisor NPY_FINLINE npyv_s8 npyv_divc_s8(npyv_s8 a, const npyv_s8x3 divisor) { +#ifdef NPY_HAVE_VX + npyv_s8 mulhi = vec_mulh(a, divisor.val[0]); +#else const npyv_u8 mergeo_perm = { 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31 }; @@ -125,16 +169,20 @@ NPY_FINLINE npyv_s8 npyv_divc_s8(npyv_s8 a, const npyv_s8x3 divisor) npyv_s16 mul_even = vec_mule(a, divisor.val[0]); npyv_s16 mul_odd = vec_mulo(a, divisor.val[0]); npyv_s8 mulhi = (npyv_s8)vec_perm(mul_even, mul_odd, mergeo_perm); +#endif // q = ((a + mulhi) >> sh1) - XSIGN(a) // trunc(a/d) = (q ^ dsign) - dsign - npyv_s8 q = vec_sra(vec_add(a, mulhi), (npyv_u8)divisor.val[1]); - q = vec_sub(q, vec_sra(a, npyv_setall_u8(7))); + npyv_s8 q = vec_sra_s8(vec_add(a, mulhi), (npyv_u8)divisor.val[1]); + q = vec_sub(q, vec_sra_s8(a, npyv_setall_u8(7))); q = vec_sub(vec_xor(q, divisor.val[2]), divisor.val[2]); return q; } // divide 
each unsigned 16-bit element by a precomputed divisor NPY_FINLINE npyv_u16 npyv_divc_u16(npyv_u16 a, const npyv_u16x3 divisor) { +#ifdef NPY_HAVE_VX + npyv_u16 mulhi = vec_mulh(a, divisor.val[0]); +#else // VSX const npyv_u8 mergeo_perm = { 2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31 }; @@ -142,6 +190,7 @@ NPY_FINLINE npyv_u16 npyv_divc_u16(npyv_u16 a, const npyv_u16x3 divisor) npyv_u32 mul_even = vec_mule(a, divisor.val[0]); npyv_u32 mul_odd = vec_mulo(a, divisor.val[0]); npyv_u16 mulhi = (npyv_u16)vec_perm(mul_even, mul_odd, mergeo_perm); +#endif // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 npyv_u16 q = vec_sub(a, mulhi); q = vec_sr(q, divisor.val[1]); @@ -152,6 +201,9 @@ NPY_FINLINE npyv_u16 npyv_divc_u16(npyv_u16 a, const npyv_u16x3 divisor) // divide each signed 16-bit element by a precomputed divisor (round towards zero) NPY_FINLINE npyv_s16 npyv_divc_s16(npyv_s16 a, const npyv_s16x3 divisor) { +#ifdef NPY_HAVE_VX + npyv_s16 mulhi = vec_mulh(a, divisor.val[0]); +#else // VSX const npyv_u8 mergeo_perm = { 2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31 }; @@ -159,30 +211,31 @@ NPY_FINLINE npyv_s16 npyv_divc_s16(npyv_s16 a, const npyv_s16x3 divisor) npyv_s32 mul_even = vec_mule(a, divisor.val[0]); npyv_s32 mul_odd = vec_mulo(a, divisor.val[0]); npyv_s16 mulhi = (npyv_s16)vec_perm(mul_even, mul_odd, mergeo_perm); +#endif // q = ((a + mulhi) >> sh1) - XSIGN(a) // trunc(a/d) = (q ^ dsign) - dsign - npyv_s16 q = vec_sra(vec_add(a, mulhi), (npyv_u16)divisor.val[1]); - q = vec_sub(q, vec_sra(a, npyv_setall_u16(15))); + npyv_s16 q = vec_sra_s16(vec_add(a, mulhi), (npyv_u16)divisor.val[1]); + q = vec_sub(q, vec_sra_s16(a, npyv_setall_u16(15))); q = vec_sub(vec_xor(q, divisor.val[2]), divisor.val[2]); return q; } // divide each unsigned 32-bit element by a precomputed divisor NPY_FINLINE npyv_u32 npyv_divc_u32(npyv_u32 a, const npyv_u32x3 divisor) { -#if defined(NPY_HAVE_VSX4) +#if defined(NPY_HAVE_VSX4) || defined(NPY_HAVE_VX) // high 
part of unsigned multiplication npyv_u32 mulhi = vec_mulh(a, divisor.val[0]); -#else -#if defined(__GNUC__) && __GNUC__ < 8 - // Doubleword integer wide multiplication supported by GCC 8+ - npyv_u64 mul_even, mul_odd; - __asm__ ("vmulouw %0,%1,%2" : "=v" (mul_even) : "v" (a), "v" (divisor.val[0])); - __asm__ ("vmuleuw %0,%1,%2" : "=v" (mul_odd) : "v" (a), "v" (divisor.val[0])); -#else - // Doubleword integer wide multiplication supported by GCC 8+ - npyv_u64 mul_even = vec_mule(a, divisor.val[0]); - npyv_u64 mul_odd = vec_mulo(a, divisor.val[0]); -#endif +#else // VSX + #if defined(__GNUC__) && __GNUC__ < 8 + // Doubleword integer wide multiplication supported by GCC 8+ + npyv_u64 mul_even, mul_odd; + __asm__ ("vmulouw %0,%1,%2" : "=v" (mul_even) : "v" (a), "v" (divisor.val[0])); + __asm__ ("vmuleuw %0,%1,%2" : "=v" (mul_odd) : "v" (a), "v" (divisor.val[0])); + #else + // Doubleword integer wide multiplication supported by GCC 8+ + npyv_u64 mul_even = vec_mule(a, divisor.val[0]); + npyv_u64 mul_odd = vec_mulo(a, divisor.val[0]); + #endif // high part of unsigned multiplication npyv_u32 mulhi = vec_mergeo((npyv_u32)mul_even, (npyv_u32)mul_odd); #endif @@ -196,27 +249,27 @@ NPY_FINLINE npyv_u32 npyv_divc_u32(npyv_u32 a, const npyv_u32x3 divisor) // divide each signed 32-bit element by a precomputed divisor (round towards zero) NPY_FINLINE npyv_s32 npyv_divc_s32(npyv_s32 a, const npyv_s32x3 divisor) { -#if defined(NPY_HAVE_VSX4) +#if defined(NPY_HAVE_VSX4) || defined(NPY_HAVE_VX) // high part of signed multiplication npyv_s32 mulhi = vec_mulh(a, divisor.val[0]); #else -#if defined(__GNUC__) && __GNUC__ < 8 - // Doubleword integer wide multiplication supported by GCC8+ - npyv_s64 mul_even, mul_odd; - __asm__ ("vmulosw %0,%1,%2" : "=v" (mul_even) : "v" (a), "v" (divisor.val[0])); - __asm__ ("vmulesw %0,%1,%2" : "=v" (mul_odd) : "v" (a), "v" (divisor.val[0])); -#else - // Doubleword integer wide multiplication supported by GCC8+ - npyv_s64 mul_even = vec_mule(a, 
divisor.val[0]); - npyv_s64 mul_odd = vec_mulo(a, divisor.val[0]); -#endif + #if defined(__GNUC__) && __GNUC__ < 8 + // Doubleword integer wide multiplication supported by GCC8+ + npyv_s64 mul_even, mul_odd; + __asm__ ("vmulosw %0,%1,%2" : "=v" (mul_even) : "v" (a), "v" (divisor.val[0])); + __asm__ ("vmulesw %0,%1,%2" : "=v" (mul_odd) : "v" (a), "v" (divisor.val[0])); + #else + // Doubleword integer wide multiplication supported by GCC8+ + npyv_s64 mul_even = vec_mule(a, divisor.val[0]); + npyv_s64 mul_odd = vec_mulo(a, divisor.val[0]); + #endif // high part of signed multiplication npyv_s32 mulhi = vec_mergeo((npyv_s32)mul_even, (npyv_s32)mul_odd); #endif // q = ((a + mulhi) >> sh1) - XSIGN(a) // trunc(a/d) = (q ^ dsign) - dsign - npyv_s32 q = vec_sra(vec_add(a, mulhi), (npyv_u32)divisor.val[1]); - q = vec_sub(q, vec_sra(a, npyv_setall_u32(31))); + npyv_s32 q = vec_sra_s32(vec_add(a, mulhi), (npyv_u32)divisor.val[1]); + q = vec_sub(q, vec_sra_s32(a, npyv_setall_u32(31))); q = vec_sub(vec_xor(q, divisor.val[2]), divisor.val[2]); return q; } @@ -240,45 +293,67 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor) /*************************** * Division ***************************/ -#define npyv_div_f32 vec_div +#if NPY_SIMD_F32 + #define npyv_div_f32 vec_div +#endif #define npyv_div_f64 vec_div /*************************** * FUSED ***************************/ // multiply and add, a*b + c -#define npyv_muladd_f32 vec_madd #define npyv_muladd_f64 vec_madd // multiply and subtract, a*b - c -#define npyv_mulsub_f32 vec_msub #define npyv_mulsub_f64 vec_msub -// negate multiply and add, -(a*b) + c -#define npyv_nmuladd_f32 vec_nmsub // equivalent to -(a*b - c) -#define npyv_nmuladd_f64 vec_nmsub -// negate multiply and subtract, -(a*b) - c -#define npyv_nmulsub_f32 vec_nmadd // equivalent to -(a*b + c) -#define npyv_nmulsub_f64 vec_nmadd - +#if NPY_SIMD_F32 + #define npyv_muladd_f32 vec_madd + #define npyv_mulsub_f32 vec_msub +#endif +#if 
defined(NPY_HAVE_VXE) || defined(NPY_HAVE_VSX) + // negate multiply and add, -(a*b) + c + #define npyv_nmuladd_f32 vec_nmsub // equivalent to -(a*b - c) + #define npyv_nmuladd_f64 vec_nmsub + // negate multiply and subtract, -(a*b) - c + #define npyv_nmulsub_f64 vec_nmadd + #define npyv_nmulsub_f32 vec_nmadd // equivalent to -(a*b + c) +#else + NPY_FINLINE npyv_f64 npyv_nmuladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) + { return vec_neg(vec_msub(a, b, c)); } + NPY_FINLINE npyv_f64 npyv_nmulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) + { return vec_neg(vec_madd(a, b, c)); } +#endif /*************************** * Summation ***************************/ // reduce sum across vector NPY_FINLINE npy_uint64 npyv_sum_u64(npyv_u64 a) { +#ifdef NPY_HAVE_VX + const npyv_u64 zero = npyv_zero_u64(); + return vec_extract((npyv_u64)vec_sum_u128(a, zero), 1); +#else return vec_extract(vec_add(a, vec_mergel(a, a)), 0); +#endif } NPY_FINLINE npy_uint32 npyv_sum_u32(npyv_u32 a) { +#ifdef NPY_HAVE_VX + const npyv_u32 zero = npyv_zero_u32(); + return vec_extract((npyv_u32)vec_sum_u128(a, zero), 3); +#else const npyv_u32 rs = vec_add(a, vec_sld(a, a, 8)); return vec_extract(vec_add(rs, vec_sld(rs, rs, 4)), 0); +#endif } +#if NPY_SIMD_F32 NPY_FINLINE float npyv_sum_f32(npyv_f32 a) { npyv_f32 sum = vec_add(a, npyv_combineh_f32(a, a)); return vec_extract(sum, 0) + vec_extract(sum, 1); } +#endif NPY_FINLINE double npyv_sum_f64(npyv_f64 a) { @@ -288,19 +363,30 @@ NPY_FINLINE double npyv_sum_f64(npyv_f64 a) // expand the source vector and performs sum reduce NPY_FINLINE npy_uint16 npyv_sumup_u8(npyv_u8 a) { +#ifdef NPY_HAVE_VX + const npyv_u8 zero = npyv_zero_u8(); + npyv_u32 sum4 = vec_sum4(a, zero); + return (npy_uint16)npyv_sum_u32(sum4); +#else const npyv_u32 zero = npyv_zero_u32(); npyv_u32 four = vec_sum4s(a, zero); npyv_s32 one = vec_sums((npyv_s32)four, (npyv_s32)zero); return (npy_uint16)vec_extract(one, 3); +#endif } NPY_FINLINE npy_uint32 npyv_sumup_u16(npyv_u16 a) { +#ifdef 
NPY_HAVE_VX + npyv_u64 sum = vec_sum2(a, npyv_zero_u16()); + return (npy_uint32)npyv_sum_u64(sum); +#else // VSX const npyv_s32 zero = npyv_zero_s32(); npyv_u32x2 eight = npyv_expand_u32_u16(a); npyv_u32 four = vec_add(eight.val[0], eight.val[1]); npyv_s32 one = vec_sums((npyv_s32)four, zero); return (npy_uint32)vec_extract(one, 3); +#endif } -#endif // _NPY_SIMD_VSX_ARITHMETIC_H +#endif // _NPY_SIMD_VEC_ARITHMETIC_H diff --git a/numpy/core/src/common/simd/vec/conversion.h b/numpy/core/src/common/simd/vec/conversion.h new file mode 100644 index 000000000..f0d625c55 --- /dev/null +++ b/numpy/core/src/common/simd/vec/conversion.h @@ -0,0 +1,228 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_VEC_CVT_H +#define _NPY_SIMD_VEC_CVT_H + +// convert boolean vectors to integer vectors +#define npyv_cvt_u8_b8(BL) ((npyv_u8) BL) +#define npyv_cvt_s8_b8(BL) ((npyv_s8) BL) +#define npyv_cvt_u16_b16(BL) ((npyv_u16) BL) +#define npyv_cvt_s16_b16(BL) ((npyv_s16) BL) +#define npyv_cvt_u32_b32(BL) ((npyv_u32) BL) +#define npyv_cvt_s32_b32(BL) ((npyv_s32) BL) +#define npyv_cvt_u64_b64(BL) ((npyv_u64) BL) +#define npyv_cvt_s64_b64(BL) ((npyv_s64) BL) +#if NPY_SIMD_F32 + #define npyv_cvt_f32_b32(BL) ((npyv_f32) BL) +#endif +#define npyv_cvt_f64_b64(BL) ((npyv_f64) BL) + +// convert integer vectors to boolean vectors +#define npyv_cvt_b8_u8(A) ((npyv_b8) A) +#define npyv_cvt_b8_s8(A) ((npyv_b8) A) +#define npyv_cvt_b16_u16(A) ((npyv_b16) A) +#define npyv_cvt_b16_s16(A) ((npyv_b16) A) +#define npyv_cvt_b32_u32(A) ((npyv_b32) A) +#define npyv_cvt_b32_s32(A) ((npyv_b32) A) +#define npyv_cvt_b64_u64(A) ((npyv_b64) A) +#define npyv_cvt_b64_s64(A) ((npyv_b64) A) +#if NPY_SIMD_F32 + #define npyv_cvt_b32_f32(A) ((npyv_b32) A) +#endif +#define npyv_cvt_b64_f64(A) ((npyv_b64) A) + +//expand +NPY_FINLINE npyv_u16x2 npyv_expand_u16_u8(npyv_u8 data) +{ + npyv_u16x2 r; +#ifdef NPY_HAVE_VX + r.val[0] = vec_unpackh(data); + r.val[1] = vec_unpackl(data); +#else + 
npyv_u8 zero = npyv_zero_u8(); + r.val[0] = (npyv_u16)vec_mergeh(data, zero); + r.val[1] = (npyv_u16)vec_mergel(data, zero); +#endif + return r; +} + +NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data) +{ + npyv_u32x2 r; +#ifdef NPY_HAVE_VX + r.val[0] = vec_unpackh(data); + r.val[1] = vec_unpackl(data); +#else + npyv_u16 zero = npyv_zero_u16(); + r.val[0] = (npyv_u32)vec_mergeh(data, zero); + r.val[1] = (npyv_u32)vec_mergel(data, zero); +#endif + return r; +} + +// pack two 16-bit boolean into one 8-bit boolean vector +NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b) { + return vec_pack(a, b); +} + +// pack four 32-bit boolean vectors into one 8-bit boolean vector +NPY_FINLINE npyv_b8 npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d) { + npyv_b16 ab = vec_pack(a, b); + npyv_b16 cd = vec_pack(c, d); + return npyv_pack_b8_b16(ab, cd); +} + +// pack eight 64-bit boolean vectors into one 8-bit boolean vector +NPY_FINLINE npyv_b8 +npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d, + npyv_b64 e, npyv_b64 f, npyv_b64 g, npyv_b64 h) { + npyv_b32 ab = vec_pack(a, b); + npyv_b32 cd = vec_pack(c, d); + npyv_b32 ef = vec_pack(e, f); + npyv_b32 gh = vec_pack(g, h); + return npyv_pack_b8_b32(ab, cd, ef, gh); +} + +// convert boolean vector to integer bitfield +#if defined(NPY_HAVE_VXE) || defined(NPY_HAVE_VSX2) + NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a) + { + const npyv_u8 qperm = npyv_set_u8(120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0); + npyv_u16 r = (npyv_u16)vec_vbpermq((npyv_u8)a, qperm); + #ifdef NPY_HAVE_VXE + return vec_extract(r, 3); + #else + return vec_extract(r, 4); + #endif + } + NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a) + { + const npyv_u8 qperm = npyv_setf_u8(128, 112, 96, 80, 64, 48, 32, 16, 0); + npyv_u8 r = (npyv_u8)vec_vbpermq((npyv_u8)a, qperm); + #ifdef NPY_HAVE_VXE + return vec_extract(r, 6); + #else + return vec_extract(r, 8); + #endif + } + NPY_FINLINE npy_uint64 
npyv_tobits_b32(npyv_b32 a) + { + #ifdef NPY_HAVE_VXE + const npyv_u8 qperm = npyv_setf_u8(128, 128, 128, 128, 128, 96, 64, 32, 0); + #else + const npyv_u8 qperm = npyv_setf_u8(128, 96, 64, 32, 0); + #endif + npyv_u8 r = (npyv_u8)vec_vbpermq((npyv_u8)a, qperm); + #ifdef NPY_HAVE_VXE + return vec_extract(r, 6); + #else + return vec_extract(r, 8); + #endif + } + NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a) + { + #ifdef NPY_HAVE_VXE + const npyv_u8 qperm = npyv_setf_u8(128, 128, 128, 128, 128, 128, 128, 64, 0); + #else + const npyv_u8 qperm = npyv_setf_u8(128, 64, 0); + #endif + npyv_u8 r = (npyv_u8)vec_vbpermq((npyv_u8)a, qperm); + #ifdef NPY_HAVE_VXE + return vec_extract(r, 6); + #else + return vec_extract(r, 8); + #endif + } +#else + NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a) + { + const npyv_u8 scale = npyv_set_u8(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128); + npyv_u8 seq_scale = vec_and((npyv_u8)a, scale); + npyv_u64 sum = vec_sum2(vec_sum4(seq_scale, npyv_zero_u8()), npyv_zero_u32()); + return vec_extract(sum, 0) + ((int)vec_extract(sum, 1) << 8); + } + NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a) + { + const npyv_u16 scale = npyv_set_u16(1, 2, 4, 8, 16, 32, 64, 128); + npyv_u16 seq_scale = vec_and((npyv_u16)a, scale); + npyv_u64 sum = vec_sum2(seq_scale, npyv_zero_u16()); + return vec_extract(vec_sum_u128(sum, npyv_zero_u64()), 15); + } + NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a) + { + const npyv_u32 scale = npyv_set_u32(1, 2, 4, 8); + npyv_u32 seq_scale = vec_and((npyv_u32)a, scale); + return vec_extract(vec_sum_u128(seq_scale, npyv_zero_u32()), 15); + } + NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a) + { + const npyv_u64 scale = npyv_set_u64(1, 2); + npyv_u64 seq_scale = vec_and((npyv_u64)a, scale); + return vec_extract(vec_sum_u128(seq_scale, npyv_zero_u64()), 15); + } +#endif +// truncate compatible with all compilers(internal use for now) +#if NPY_SIMD_F32 + NPY_FINLINE npyv_s32 npyv__trunc_s32_f32(npyv_f32 a) 
+ { + #ifdef NPY_HAVE_VXE2 + return vec_signed(a); + #elif defined(NPY_HAVE_VXE) + return vec_packs(vec_signed(npyv_doublee(a)), vec_signed(npyv_doublee(vec_mergel(a, a)))); + // VSX + #elif defined(__IBMC__) + return vec_cts(a, 0); + #elif defined(__clang__) + /** + * old versions of CLANG doesn't support %x<n> in the inline asm template + * which fixes register number when using any of the register constraints wa, wd, wf. + * therefore, we count on built-in functions. + */ + return __builtin_convertvector(a, npyv_s32); + #else // gcc + npyv_s32 ret; + __asm__ ("xvcvspsxws %x0,%x1" : "=wa" (ret) : "wa" (a)); + return ret; + #endif + } +#endif + +NPY_FINLINE npyv_s32 npyv__trunc_s32_f64(npyv_f64 a, npyv_f64 b) +{ +#ifdef NPY_HAVE_VX + return vec_packs(vec_signed(a), vec_signed(b)); +// VSX +#elif defined(__IBMC__) + const npyv_u8 seq_even = npyv_set_u8(0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27); + // unfortunately, XLC missing asm register vsx fixer + // hopefully, xlc can optimize around big-endian compatibility + npyv_s32 lo_even = vec_cts(a, 0); + npyv_s32 hi_even = vec_cts(b, 0); + return vec_perm(lo_even, hi_even, seq_even); +#else + const npyv_u8 seq_odd = npyv_set_u8(4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31); + #ifdef __clang__ + // __builtin_convertvector doesn't support this conversion on wide range of versions + // fortunately, almost all versions have direct builtin of 'xvcvdpsxws' + npyv_s32 lo_odd = __builtin_vsx_xvcvdpsxws(a); + npyv_s32 hi_odd = __builtin_vsx_xvcvdpsxws(b); + #else // gcc + npyv_s32 lo_odd, hi_odd; + __asm__ ("xvcvdpsxws %x0,%x1" : "=wa" (lo_odd) : "wa" (a)); + __asm__ ("xvcvdpsxws %x0,%x1" : "=wa" (hi_odd) : "wa" (b)); + #endif + return vec_perm(lo_odd, hi_odd, seq_odd); +#endif +} + +// round to nearest integer (assuming even) +#if NPY_SIMD_F32 + NPY_FINLINE npyv_s32 npyv_round_s32_f32(npyv_f32 a) + { return npyv__trunc_s32_f32(vec_rint(a)); } +#endif +NPY_FINLINE npyv_s32 
npyv_round_s32_f64(npyv_f64 a, npyv_f64 b) +{ return npyv__trunc_s32_f64(vec_rint(a), vec_rint(b)); } + +#endif // _NPY_SIMD_VEC_CVT_H diff --git a/numpy/core/src/common/simd/vsx/math.h b/numpy/core/src/common/simd/vec/math.h index 444bc9e54..7714a612d 100644 --- a/numpy/core/src/common/simd/vsx/math.h +++ b/numpy/core/src/common/simd/vec/math.h @@ -2,21 +2,25 @@ #error "Not a standalone header" #endif -#ifndef _NPY_SIMD_VSX_MATH_H -#define _NPY_SIMD_VSX_MATH_H +#ifndef _NPY_SIMD_VEC_MATH_H +#define _NPY_SIMD_VEC_MATH_H /*************************** * Elementary ***************************/ // Square root -#define npyv_sqrt_f32 vec_sqrt +#if NPY_SIMD_F32 + #define npyv_sqrt_f32 vec_sqrt +#endif #define npyv_sqrt_f64 vec_sqrt // Reciprocal -NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a) -{ - const npyv_f32 one = npyv_setall_f32(1.0f); - return vec_div(one, a); -} +#if NPY_SIMD_F32 + NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a) + { + const npyv_f32 one = npyv_setall_f32(1.0f); + return vec_div(one, a); + } +#endif NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a) { const npyv_f64 one = npyv_setall_f64(1.0); @@ -24,23 +28,41 @@ NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a) } // Absolute -#define npyv_abs_f32 vec_abs +#if NPY_SIMD_F32 + #define npyv_abs_f32 vec_abs +#endif #define npyv_abs_f64 vec_abs // Square -NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a) -{ return vec_mul(a, a); } +#if NPY_SIMD_F32 + NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a) + { return vec_mul(a, a); } +#endif NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a) { return vec_mul(a, a); } // Maximum, natively mapping with no guarantees to handle NaN. -#define npyv_max_f32 vec_max +#if NPY_SIMD_F32 + #define npyv_max_f32 vec_max +#endif #define npyv_max_f64 vec_max // Maximum, supports IEEE floating-point arithmetic (IEC 60559), // - If one of the two vectors contains NaN, the equivalent element of the other vector is set // - Only if both corresponded elements are NaN, NaN is set. 
-#define npyv_maxp_f32 vec_max -#define npyv_maxp_f64 vec_max +#if NPY_SIMD_F32 + #define npyv_maxp_f32 vec_max +#endif +#if defined(NPY_HAVE_VXE) || defined(NPY_HAVE_VSX) + #define npyv_maxp_f64 vec_max +#else + // vfmindb & vfmaxdb appears in zarch12 + NPY_FINLINE npyv_f64 npyv_maxp_f64(npyv_f64 a, npyv_f64 b) + { + npyv_b64 nn_a = npyv_notnan_f64(a); + npyv_b64 nn_b = npyv_notnan_f64(b); + return vec_max(vec_sel(b, a, nn_a), vec_sel(a, b, nn_b)); + } +#endif // Maximum, integer operations #define npyv_max_u8 vec_max #define npyv_max_s8 vec_max @@ -52,13 +74,27 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a) #define npyv_max_s64 vec_max // Minimum, natively mapping with no guarantees to handle NaN. -#define npyv_min_f32 vec_min +#if NPY_SIMD_F32 + #define npyv_min_f32 vec_min +#endif #define npyv_min_f64 vec_min // Minimum, supports IEEE floating-point arithmetic (IEC 60559), // - If one of the two vectors contains NaN, the equivalent element of the other vector is set // - Only if both corresponded elements are NaN, NaN is set. 
-#define npyv_minp_f32 vec_min -#define npyv_minp_f64 vec_min +#if NPY_SIMD_F32 + #define npyv_minp_f32 vec_min +#endif +#if defined(NPY_HAVE_VXE) || defined(NPY_HAVE_VSX) + #define npyv_minp_f64 vec_min +#else + // vfmindb & vfmaxdb appears in zarch12 + NPY_FINLINE npyv_f64 npyv_minp_f64(npyv_f64 a, npyv_f64 b) + { + npyv_b64 nn_a = npyv_notnan_f64(a); + npyv_b64 nn_b = npyv_notnan_f64(b); + return vec_min(vec_sel(b, a, nn_a), vec_sel(a, b, nn_b)); + } +#endif // Minimum, integer operations #define npyv_min_u8 vec_min #define npyv_min_s8 vec_min @@ -70,19 +106,18 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a) #define npyv_min_s64 vec_min // round to nearest int even -#define npyv_rint_f32 vec_rint #define npyv_rint_f64 vec_rint - // ceil -#define npyv_ceil_f32 vec_ceil #define npyv_ceil_f64 vec_ceil - // trunc -#define npyv_trunc_f32 vec_trunc #define npyv_trunc_f64 vec_trunc - // floor -#define npyv_floor_f32 vec_floor #define npyv_floor_f64 vec_floor +#if NPY_SIMD_F32 + #define npyv_rint_f32 vec_rint + #define npyv_ceil_f32 vec_ceil + #define npyv_trunc_f32 vec_trunc + #define npyv_floor_f32 vec_floor +#endif -#endif // _NPY_SIMD_VSX_MATH_H +#endif // _NPY_SIMD_VEC_MATH_H diff --git a/numpy/core/src/common/simd/vsx/memory.h b/numpy/core/src/common/simd/vec/memory.h index 3007584ef..e8f588ef2 100644 --- a/numpy/core/src/common/simd/vsx/memory.h +++ b/numpy/core/src/common/simd/vec/memory.h @@ -2,8 +2,8 @@ #error "Not a standalone header" #endif -#ifndef _NPY_SIMD_VSX_MEMORY_H -#define _NPY_SIMD_VSX_MEMORY_H +#ifndef _NPY_SIMD_VEC_MEMORY_H +#define _NPY_SIMD_VEC_MEMORY_H #include "misc.h" @@ -19,19 +19,32 @@ * CLANG fails to load unaligned addresses via vec_xl, vec_xst * so we failback to vec_vsx_ld, vec_vsx_st */ - #if (defined(__GNUC__) && !defined(vec_xl)) || (defined(__clang__) && !defined(__IBMC__)) + #if defined (NPY_HAVE_VSX2) && ( \ + (defined(__GNUC__) && !defined(vec_xl)) || (defined(__clang__) && !defined(__IBMC__)) \ + ) #define npyv__load(T_VEC, 
PTR) vec_vsx_ld(0, PTR) - #else + #else // VX #define npyv__load(T_VEC, PTR) vec_xl(0, PTR) #endif #endif // unaligned store -#if (defined(__GNUC__) && !defined(vec_xl)) || (defined(__clang__) && !defined(__IBMC__)) +#if defined (NPY_HAVE_VSX2) && ( \ + (defined(__GNUC__) && !defined(vec_xl)) || (defined(__clang__) && !defined(__IBMC__)) \ +) #define npyv__store(PTR, VEC) vec_vsx_st(VEC, 0, PTR) -#else +#else // VX #define npyv__store(PTR, VEC) vec_xst(VEC, 0, PTR) #endif +// aligned load/store +#if defined (NPY_HAVE_VSX) + #define npyv__loada(PTR) vec_ld(0, PTR) + #define npyv__storea(PTR, VEC) vec_st(VEC, 0, PTR) +#else // VX + #define npyv__loada(PTR) vec_xl(0, PTR) + #define npyv__storea(PTR, VEC) vec_xst(VEC, 0, PTR) +#endif + // avoid aliasing rules #ifdef __cplusplus template<typename T_PTR> @@ -45,12 +58,16 @@ // load lower part NPY_FINLINE npyv_u64 npyv__loadl(const void *ptr) { +#ifdef NPY_HAVE_VSX #if defined(__clang__) && !defined(__IBMC__) // vec_promote doesn't support doubleword on clang return npyv_setall_u64(*npyv__ptr2u64(ptr)); #else return vec_promote(*npyv__ptr2u64(ptr), 0); #endif +#else // VX + return vec_load_len((const unsigned long long*)ptr, 7); +#endif } // store lower part #define npyv__storel(PTR, VEC) \ @@ -62,11 +79,11 @@ NPY_FINLINE npyv_u64 npyv__loadl(const void *ptr) /**************************** * load/store ****************************/ -#define NPYV_IMPL_VSX_MEM(SFX, DW_CAST) \ +#define NPYV_IMPL_VEC_MEM(SFX, DW_CAST) \ NPY_FINLINE npyv_##SFX npyv_load_##SFX(const npyv_lanetype_##SFX *ptr) \ { return (npyv_##SFX)npyv__load(npyv_##SFX, (const npyv_lanetype_##DW_CAST*)ptr); } \ NPY_FINLINE npyv_##SFX npyv_loada_##SFX(const npyv_lanetype_##SFX *ptr) \ - { return (npyv_##SFX)vec_ld(0, (const npyv_lanetype_u32*)ptr); } \ + { return (npyv_##SFX)npyv__loada((const npyv_lanetype_u32*)ptr); } \ NPY_FINLINE npyv_##SFX npyv_loads_##SFX(const npyv_lanetype_##SFX *ptr) \ { return npyv_loada_##SFX(ptr); } \ NPY_FINLINE npyv_##SFX 
npyv_loadl_##SFX(const npyv_lanetype_##SFX *ptr) \ @@ -74,7 +91,7 @@ NPY_FINLINE npyv_u64 npyv__loadl(const void *ptr) NPY_FINLINE void npyv_store_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec) \ { npyv__store((npyv_lanetype_##DW_CAST*)ptr, (npyv_##DW_CAST)vec); } \ NPY_FINLINE void npyv_storea_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec) \ - { vec_st((npyv_u32)vec, 0, (npyv_lanetype_u32*)ptr); } \ + { npyv__storea((npyv_lanetype_##DW_CAST*)ptr, (npyv_##DW_CAST)vec); } \ NPY_FINLINE void npyv_stores_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec) \ { npyv_storea_##SFX(ptr, vec); } \ NPY_FINLINE void npyv_storel_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec) \ @@ -82,16 +99,18 @@ NPY_FINLINE npyv_u64 npyv__loadl(const void *ptr) NPY_FINLINE void npyv_storeh_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec) \ { npyv__storeh(ptr, vec); } -NPYV_IMPL_VSX_MEM(u8, u8) -NPYV_IMPL_VSX_MEM(s8, s8) -NPYV_IMPL_VSX_MEM(u16, u16) -NPYV_IMPL_VSX_MEM(s16, s16) -NPYV_IMPL_VSX_MEM(u32, u32) -NPYV_IMPL_VSX_MEM(s32, s32) -NPYV_IMPL_VSX_MEM(u64, f64) -NPYV_IMPL_VSX_MEM(s64, f64) -NPYV_IMPL_VSX_MEM(f32, f32) -NPYV_IMPL_VSX_MEM(f64, f64) +NPYV_IMPL_VEC_MEM(u8, u8) +NPYV_IMPL_VEC_MEM(s8, s8) +NPYV_IMPL_VEC_MEM(u16, u16) +NPYV_IMPL_VEC_MEM(s16, s16) +NPYV_IMPL_VEC_MEM(u32, u32) +NPYV_IMPL_VEC_MEM(s32, s32) +NPYV_IMPL_VEC_MEM(u64, f64) +NPYV_IMPL_VEC_MEM(s64, f64) +#if NPY_SIMD_F32 +NPYV_IMPL_VEC_MEM(f32, f32) +#endif +NPYV_IMPL_VEC_MEM(f64, f64) /*************************** * Non-contiguous Load @@ -106,8 +125,10 @@ NPY_FINLINE npyv_u32 npyv_loadn_u32(const npy_uint32 *ptr, npy_intp stride) } NPY_FINLINE npyv_s32 npyv_loadn_s32(const npy_int32 *ptr, npy_intp stride) { return (npyv_s32)npyv_loadn_u32((const npy_uint32*)ptr, stride); } +#if NPY_SIMD_F32 NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride) { return (npyv_f32)npyv_loadn_u32((const npy_uint32*)ptr, stride); } +#endif //// 64 NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride) { 
return npyv_set_u64(ptr[0], ptr[stride]); } @@ -128,8 +149,10 @@ NPY_FINLINE void npyv_storen_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a) } NPY_FINLINE void npyv_storen_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a) { npyv_storen_u32((npy_uint32*)ptr, stride, (npyv_u32)a); } +#if NPY_SIMD_F32 NPY_FINLINE void npyv_storen_f32(float *ptr, npy_intp stride, npyv_f32 a) { npyv_storen_u32((npy_uint32*)ptr, stride, (npyv_u32)a); } +#endif //// 64 NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a) { @@ -149,6 +172,14 @@ NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, n { assert(nlane > 0); npyv_s32 vfill = npyv_setall_s32(fill); +#ifdef NPY_HAVE_VX + const unsigned blane = (unsigned short)nlane; + const npyv_u32 steps = npyv_set_u32(0, 1, 2, 3); + const npyv_u32 vlane = npyv_setall_u32((unsigned)blane); + const npyv_b32 mask = vec_cmpgt(vlane, steps); + npyv_s32 a = vec_load_len(ptr, blane*4-1); + return vec_sel(vfill, a, mask); +#else switch(nlane) { case 1: return vec_insert(ptr[0], vfill, 0); @@ -164,10 +195,18 @@ NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, n default: return npyv_load_s32(ptr); } +#endif } // fill zero to rest lanes NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) -{ return npyv_load_till_s32(ptr, nlane, 0); } +{ +#ifdef NPY_HAVE_VX + unsigned blane = ((unsigned short)nlane)*4 - 1; + return vec_load_len(ptr, blane); +#else + return npyv_load_till_s32(ptr, nlane, 0); +#endif +} //// 64 NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill) { @@ -179,7 +218,14 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n } // fill zero to rest lanes NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) -{ return npyv_load_till_s64(ptr, nlane, 0); } +{ +#ifdef NPY_HAVE_VX + unsigned blane = (unsigned short)nlane; + return 
vec_load_len((const signed long long*)ptr, blane*8-1); +#else + return npyv_load_till_s64(ptr, nlane, 0); +#endif +} /********************************* * Non-contiguous partial load *********************************/ @@ -226,6 +272,10 @@ NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a) { assert(nlane > 0); +#ifdef NPY_HAVE_VX + unsigned blane = (unsigned short)nlane; + vec_store_len(a, ptr, blane*4-1); +#else switch(nlane) { case 1: *ptr = vec_extract(a, 0); @@ -240,16 +290,22 @@ NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a default: npyv_store_s32(ptr, a); } +#endif } //// 64 NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a) { assert(nlane > 0); +#ifdef NPY_HAVE_VX + unsigned blane = (unsigned short)nlane; + vec_store_len(a, (signed long long*)ptr, blane*8-1); +#else if (nlane == 1) { npyv_storel_s64(ptr, a); return; } npyv_store_s64(ptr, a); +#endif } /********************************* * Non-contiguous partial store @@ -283,7 +339,7 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp /***************************************************************** * Implement partial load/store for u32/f32/u64/f64... 
via casting *****************************************************************/ -#define NPYV_IMPL_VSX_REST_PARTIAL_TYPES(F_SFX, T_SFX) \ +#define NPYV_IMPL_VEC_REST_PARTIAL_TYPES(F_SFX, T_SFX) \ NPY_FINLINE npyv_##F_SFX npyv_load_till_##F_SFX \ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_lanetype_##F_SFX fill) \ { \ @@ -338,39 +394,47 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp ); \ } -NPYV_IMPL_VSX_REST_PARTIAL_TYPES(u32, s32) -NPYV_IMPL_VSX_REST_PARTIAL_TYPES(f32, s32) -NPYV_IMPL_VSX_REST_PARTIAL_TYPES(u64, s64) -NPYV_IMPL_VSX_REST_PARTIAL_TYPES(f64, s64) +NPYV_IMPL_VEC_REST_PARTIAL_TYPES(u32, s32) +#if NPY_SIMD_F32 +NPYV_IMPL_VEC_REST_PARTIAL_TYPES(f32, s32) +#endif +NPYV_IMPL_VEC_REST_PARTIAL_TYPES(u64, s64) +NPYV_IMPL_VEC_REST_PARTIAL_TYPES(f64, s64) /********************************* * Lookup table *********************************/ // uses vector as indexes into a table // that contains 32 elements of float32. -NPY_FINLINE npyv_f32 npyv_lut32_f32(const float *table, npyv_u32 idx) +NPY_FINLINE npyv_u32 npyv_lut32_u32(const npy_uint32 *table, npyv_u32 idx) { const unsigned i0 = vec_extract(idx, 0); const unsigned i1 = vec_extract(idx, 1); const unsigned i2 = vec_extract(idx, 2); const unsigned i3 = vec_extract(idx, 3); - npyv_f32 r = vec_promote(table[i0], 0); + npyv_u32 r = vec_promote(table[i0], 0); r = vec_insert(table[i1], r, 1); r = vec_insert(table[i2], r, 2); r = vec_insert(table[i3], r, 3); return r; } -NPY_FINLINE npyv_u32 npyv_lut32_u32(const npy_uint32 *table, npyv_u32 idx) -{ return npyv_reinterpret_u32_f32(npyv_lut32_f32((const float*)table, idx)); } NPY_FINLINE npyv_s32 npyv_lut32_s32(const npy_int32 *table, npyv_u32 idx) -{ return npyv_reinterpret_s32_f32(npyv_lut32_f32((const float*)table, idx)); } - +{ return (npyv_s32)npyv_lut32_u32((const npy_uint32*)table, idx); } +#if NPY_SIMD_F32 + NPY_FINLINE npyv_f32 npyv_lut32_f32(const float *table, npyv_u32 idx) + { return 
(npyv_f32)npyv_lut32_u32((const npy_uint32*)table, idx); } +#endif // uses vector as indexes into a table // that contains 16 elements of float64. NPY_FINLINE npyv_f64 npyv_lut16_f64(const double *table, npyv_u64 idx) { +#ifdef NPY_HAVE_VX + const unsigned i0 = vec_extract((npyv_u32)idx, 1); + const unsigned i1 = vec_extract((npyv_u32)idx, 3); +#else const unsigned i0 = vec_extract((npyv_u32)idx, 0); const unsigned i1 = vec_extract((npyv_u32)idx, 2); +#endif npyv_f64 r = vec_promote(table[i0], 0); r = vec_insert(table[i1], r, 1); return r; @@ -380,4 +444,4 @@ NPY_FINLINE npyv_u64 npyv_lut16_u64(const npy_uint64 *table, npyv_u64 idx) NPY_FINLINE npyv_s64 npyv_lut16_s64(const npy_int64 *table, npyv_u64 idx) { return npyv_reinterpret_s64_f64(npyv_lut16_f64((const double*)table, idx)); } -#endif // _NPY_SIMD_VSX_MEMORY_H +#endif // _NPY_SIMD_VEC_MEMORY_H diff --git a/numpy/core/src/common/simd/vsx/misc.h b/numpy/core/src/common/simd/vec/misc.h index f7a0cdd5c..c4f35cfc0 100644 --- a/numpy/core/src/common/simd/vsx/misc.h +++ b/numpy/core/src/common/simd/vec/misc.h @@ -2,8 +2,8 @@ #error "Not a standalone header" #endif -#ifndef _NPY_SIMD_VSX_MISC_H -#define _NPY_SIMD_VSX_MISC_H +#ifndef _NPY_SIMD_VEC_MISC_H +#define _NPY_SIMD_VEC_MISC_H // vector with zero lanes #define npyv_zero_u8() ((npyv_u8) npyv_setall_s32(0)) @@ -14,26 +14,30 @@ #define npyv_zero_s32() npyv_setall_s32(0) #define npyv_zero_u64() ((npyv_u64) npyv_setall_s32(0)) #define npyv_zero_s64() ((npyv_s64) npyv_setall_s32(0)) -#define npyv_zero_f32() npyv_setall_f32(0.0f) +#if NPY_SIMD_F32 + #define npyv_zero_f32() npyv_setall_f32(0.0f) +#endif #define npyv_zero_f64() npyv_setall_f64(0.0) // vector with a specific value set to all lanes // the safest way to generate vsplti* and vsplt* instructions -#define NPYV_IMPL_VSX_SPLTB(T_VEC, V) ((T_VEC){V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V}) -#define NPYV_IMPL_VSX_SPLTH(T_VEC, V) ((T_VEC){V, V, V, V, V, V, V, V}) -#define NPYV_IMPL_VSX_SPLTW(T_VEC, V) 
((T_VEC){V, V, V, V}) -#define NPYV_IMPL_VSX_SPLTD(T_VEC, V) ((T_VEC){V, V}) - -#define npyv_setall_u8(VAL) NPYV_IMPL_VSX_SPLTB(npyv_u8, (unsigned char)VAL) -#define npyv_setall_s8(VAL) NPYV_IMPL_VSX_SPLTB(npyv_s8, (signed char)VAL) -#define npyv_setall_u16(VAL) NPYV_IMPL_VSX_SPLTH(npyv_u16, (unsigned short)VAL) -#define npyv_setall_s16(VAL) NPYV_IMPL_VSX_SPLTH(npyv_s16, (short)VAL) -#define npyv_setall_u32(VAL) NPYV_IMPL_VSX_SPLTW(npyv_u32, (unsigned int)VAL) -#define npyv_setall_s32(VAL) NPYV_IMPL_VSX_SPLTW(npyv_s32, (int)VAL) -#define npyv_setall_f32(VAL) NPYV_IMPL_VSX_SPLTW(npyv_f32, VAL) -#define npyv_setall_u64(VAL) NPYV_IMPL_VSX_SPLTD(npyv_u64, (npy_uint64)VAL) -#define npyv_setall_s64(VAL) NPYV_IMPL_VSX_SPLTD(npyv_s64, (npy_int64)VAL) -#define npyv_setall_f64(VAL) NPYV_IMPL_VSX_SPLTD(npyv_f64, VAL) +#define NPYV_IMPL_VEC_SPLTB(T_VEC, V) ((T_VEC){V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V}) +#define NPYV_IMPL_VEC_SPLTH(T_VEC, V) ((T_VEC){V, V, V, V, V, V, V, V}) +#define NPYV_IMPL_VEC_SPLTW(T_VEC, V) ((T_VEC){V, V, V, V}) +#define NPYV_IMPL_VEC_SPLTD(T_VEC, V) ((T_VEC){V, V}) + +#define npyv_setall_u8(VAL) NPYV_IMPL_VEC_SPLTB(npyv_u8, (unsigned char)VAL) +#define npyv_setall_s8(VAL) NPYV_IMPL_VEC_SPLTB(npyv_s8, (signed char)VAL) +#define npyv_setall_u16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_u16, (unsigned short)VAL) +#define npyv_setall_s16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_s16, (short)VAL) +#define npyv_setall_u32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_u32, (unsigned int)VAL) +#define npyv_setall_s32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_s32, (int)VAL) +#if NPY_SIMD_F32 + #define npyv_setall_f32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_f32, VAL) +#endif +#define npyv_setall_u64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_u64, (npy_uint64)VAL) +#define npyv_setall_s64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_s64, (npy_int64)VAL) +#define npyv_setall_f64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_f64, VAL) // vector with specific values set to each lane and // set a specific value to all remained lanes @@ -45,7 +49,9 @@ #define 
npyv_setf_s32(FILL, ...) ((npyv_s32){NPYV__SET_FILL_4(int, FILL, __VA_ARGS__)}) #define npyv_setf_u64(FILL, ...) ((npyv_u64){NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__)}) #define npyv_setf_s64(FILL, ...) ((npyv_s64){NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__)}) -#define npyv_setf_f32(FILL, ...) ((npyv_f32){NPYV__SET_FILL_4(float, FILL, __VA_ARGS__)}) +#if NPY_SIMD_F32 + #define npyv_setf_f32(FILL, ...) ((npyv_f32){NPYV__SET_FILL_4(float, FILL, __VA_ARGS__)}) +#endif #define npyv_setf_f64(FILL, ...) ((npyv_f64){NPYV__SET_FILL_2(double, FILL, __VA_ARGS__)}) // vector with specific values set to each lane and @@ -58,7 +64,9 @@ #define npyv_set_s32(...) npyv_setf_s32(0, __VA_ARGS__) #define npyv_set_u64(...) npyv_setf_u64(0, __VA_ARGS__) #define npyv_set_s64(...) npyv_setf_s64(0, __VA_ARGS__) -#define npyv_set_f32(...) npyv_setf_f32(0, __VA_ARGS__) +#if NPY_SIMD_F32 + #define npyv_set_f32(...) npyv_setf_f32(0, __VA_ARGS__) +#endif #define npyv_set_f64(...) npyv_setf_f64(0, __VA_ARGS__) // Per lane select @@ -70,7 +78,9 @@ #define npyv_select_s32 npyv_select_u8 #define npyv_select_u64 npyv_select_u8 #define npyv_select_s64 npyv_select_u8 -#define npyv_select_f32 npyv_select_u8 +#if NPY_SIMD_F32 + #define npyv_select_f32 npyv_select_u8 +#endif #define npyv_select_f64 npyv_select_u8 // Reinterpret @@ -82,7 +92,9 @@ #define npyv_reinterpret_u8_s32 npyv_reinterpret_u8_s8 #define npyv_reinterpret_u8_u64 npyv_reinterpret_u8_s8 #define npyv_reinterpret_u8_s64 npyv_reinterpret_u8_s8 -#define npyv_reinterpret_u8_f32 npyv_reinterpret_u8_s8 +#if NPY_SIMD_F32 + #define npyv_reinterpret_u8_f32 npyv_reinterpret_u8_s8 +#endif #define npyv_reinterpret_u8_f64 npyv_reinterpret_u8_s8 #define npyv_reinterpret_s8_s8(X) X @@ -93,7 +105,9 @@ #define npyv_reinterpret_s8_s32 npyv_reinterpret_s8_u8 #define npyv_reinterpret_s8_u64 npyv_reinterpret_s8_u8 #define npyv_reinterpret_s8_s64 npyv_reinterpret_s8_u8 -#define npyv_reinterpret_s8_f32 npyv_reinterpret_s8_u8 +#if NPY_SIMD_F32 + #define 
npyv_reinterpret_s8_f32 npyv_reinterpret_s8_u8 +#endif #define npyv_reinterpret_s8_f64 npyv_reinterpret_s8_u8 #define npyv_reinterpret_u16_u16(X) X @@ -104,7 +118,9 @@ #define npyv_reinterpret_u16_s32 npyv_reinterpret_u16_u8 #define npyv_reinterpret_u16_u64 npyv_reinterpret_u16_u8 #define npyv_reinterpret_u16_s64 npyv_reinterpret_u16_u8 -#define npyv_reinterpret_u16_f32 npyv_reinterpret_u16_u8 +#if NPY_SIMD_F32 + #define npyv_reinterpret_u16_f32 npyv_reinterpret_u16_u8 +#endif #define npyv_reinterpret_u16_f64 npyv_reinterpret_u16_u8 #define npyv_reinterpret_s16_s16(X) X @@ -115,7 +131,9 @@ #define npyv_reinterpret_s16_s32 npyv_reinterpret_s16_u8 #define npyv_reinterpret_s16_u64 npyv_reinterpret_s16_u8 #define npyv_reinterpret_s16_s64 npyv_reinterpret_s16_u8 -#define npyv_reinterpret_s16_f32 npyv_reinterpret_s16_u8 +#if NPY_SIMD_F32 + #define npyv_reinterpret_s16_f32 npyv_reinterpret_s16_u8 +#endif #define npyv_reinterpret_s16_f64 npyv_reinterpret_s16_u8 #define npyv_reinterpret_u32_u32(X) X @@ -126,7 +144,9 @@ #define npyv_reinterpret_u32_s32 npyv_reinterpret_u32_u8 #define npyv_reinterpret_u32_u64 npyv_reinterpret_u32_u8 #define npyv_reinterpret_u32_s64 npyv_reinterpret_u32_u8 -#define npyv_reinterpret_u32_f32 npyv_reinterpret_u32_u8 +#if NPY_SIMD_F32 + #define npyv_reinterpret_u32_f32 npyv_reinterpret_u32_u8 +#endif #define npyv_reinterpret_u32_f64 npyv_reinterpret_u32_u8 #define npyv_reinterpret_s32_s32(X) X @@ -137,7 +157,9 @@ #define npyv_reinterpret_s32_u32 npyv_reinterpret_s32_u8 #define npyv_reinterpret_s32_u64 npyv_reinterpret_s32_u8 #define npyv_reinterpret_s32_s64 npyv_reinterpret_s32_u8 -#define npyv_reinterpret_s32_f32 npyv_reinterpret_s32_u8 +#if NPY_SIMD_F32 + #define npyv_reinterpret_s32_f32 npyv_reinterpret_s32_u8 +#endif #define npyv_reinterpret_s32_f64 npyv_reinterpret_s32_u8 #define npyv_reinterpret_u64_u64(X) X @@ -148,7 +170,9 @@ #define npyv_reinterpret_u64_u32 npyv_reinterpret_u64_u8 #define npyv_reinterpret_u64_s32 npyv_reinterpret_u64_u8 
#define npyv_reinterpret_u64_s64 npyv_reinterpret_u64_u8 -#define npyv_reinterpret_u64_f32 npyv_reinterpret_u64_u8 +#if NPY_SIMD_F32 + #define npyv_reinterpret_u64_f32 npyv_reinterpret_u64_u8 +#endif #define npyv_reinterpret_u64_f64 npyv_reinterpret_u64_u8 #define npyv_reinterpret_s64_s64(X) X @@ -159,19 +183,23 @@ #define npyv_reinterpret_s64_u32 npyv_reinterpret_s64_u8 #define npyv_reinterpret_s64_s32 npyv_reinterpret_s64_u8 #define npyv_reinterpret_s64_u64 npyv_reinterpret_s64_u8 -#define npyv_reinterpret_s64_f32 npyv_reinterpret_s64_u8 +#if NPY_SIMD_F32 + #define npyv_reinterpret_s64_f32 npyv_reinterpret_s64_u8 +#endif #define npyv_reinterpret_s64_f64 npyv_reinterpret_s64_u8 -#define npyv_reinterpret_f32_f32(X) X -#define npyv_reinterpret_f32_u8(X) ((npyv_f32)X) -#define npyv_reinterpret_f32_s8 npyv_reinterpret_f32_u8 -#define npyv_reinterpret_f32_u16 npyv_reinterpret_f32_u8 -#define npyv_reinterpret_f32_s16 npyv_reinterpret_f32_u8 -#define npyv_reinterpret_f32_u32 npyv_reinterpret_f32_u8 -#define npyv_reinterpret_f32_s32 npyv_reinterpret_f32_u8 -#define npyv_reinterpret_f32_u64 npyv_reinterpret_f32_u8 -#define npyv_reinterpret_f32_s64 npyv_reinterpret_f32_u8 -#define npyv_reinterpret_f32_f64 npyv_reinterpret_f32_u8 +#if NPY_SIMD_F32 + #define npyv_reinterpret_f32_f32(X) X + #define npyv_reinterpret_f32_u8(X) ((npyv_f32)X) + #define npyv_reinterpret_f32_s8 npyv_reinterpret_f32_u8 + #define npyv_reinterpret_f32_u16 npyv_reinterpret_f32_u8 + #define npyv_reinterpret_f32_s16 npyv_reinterpret_f32_u8 + #define npyv_reinterpret_f32_u32 npyv_reinterpret_f32_u8 + #define npyv_reinterpret_f32_s32 npyv_reinterpret_f32_u8 + #define npyv_reinterpret_f32_u64 npyv_reinterpret_f32_u8 + #define npyv_reinterpret_f32_s64 npyv_reinterpret_f32_u8 + #define npyv_reinterpret_f32_f64 npyv_reinterpret_f32_u8 +#endif #define npyv_reinterpret_f64_f64(X) X #define npyv_reinterpret_f64_u8(X) ((npyv_f64)X) @@ -182,9 +210,10 @@ #define npyv_reinterpret_f64_s32 npyv_reinterpret_f64_u8 
#define npyv_reinterpret_f64_u64 npyv_reinterpret_f64_u8 #define npyv_reinterpret_f64_s64 npyv_reinterpret_f64_u8 -#define npyv_reinterpret_f64_f32 npyv_reinterpret_f64_u8 - +#if NPY_SIMD_F32 + #define npyv_reinterpret_f64_f32 npyv_reinterpret_f64_u8 +#endif // Only required by AVX2/AVX512 #define npyv_cleanup() ((void)0) -#endif // _NPY_SIMD_VSX_MISC_H +#endif // _NPY_SIMD_VEC_MISC_H diff --git a/numpy/core/src/common/simd/vsx/operators.h b/numpy/core/src/common/simd/vec/operators.h index b01d85321..8b58676e7 100644 --- a/numpy/core/src/common/simd/vsx/operators.h +++ b/numpy/core/src/common/simd/vec/operators.h @@ -2,8 +2,8 @@ #error "Not a standalone header" #endif -#ifndef _NPY_SIMD_VSX_OPERATORS_H -#define _NPY_SIMD_VSX_OPERATORS_H +#ifndef _NPY_SIMD_VEC_OPERATORS_H +#define _NPY_SIMD_VEC_OPERATORS_H /*************************** * Shifting @@ -11,11 +11,11 @@ // Left #define npyv_shl_u16(A, C) vec_sl(A, npyv_setall_u16(C)) -#define npyv_shl_s16(A, C) vec_sl(A, npyv_setall_u16(C)) +#define npyv_shl_s16(A, C) vec_sl_s16(A, npyv_setall_u16(C)) #define npyv_shl_u32(A, C) vec_sl(A, npyv_setall_u32(C)) -#define npyv_shl_s32(A, C) vec_sl(A, npyv_setall_u32(C)) +#define npyv_shl_s32(A, C) vec_sl_s32(A, npyv_setall_u32(C)) #define npyv_shl_u64(A, C) vec_sl(A, npyv_setall_u64(C)) -#define npyv_shl_s64(A, C) vec_sl(A, npyv_setall_u64(C)) +#define npyv_shl_s64(A, C) vec_sl_s64(A, npyv_setall_u64(C)) // Left by an immediate constant #define npyv_shli_u16 npyv_shl_u16 @@ -27,11 +27,11 @@ // Right #define npyv_shr_u16(A, C) vec_sr(A, npyv_setall_u16(C)) -#define npyv_shr_s16(A, C) vec_sra(A, npyv_setall_u16(C)) +#define npyv_shr_s16(A, C) vec_sra_s16(A, npyv_setall_u16(C)) #define npyv_shr_u32(A, C) vec_sr(A, npyv_setall_u32(C)) -#define npyv_shr_s32(A, C) vec_sra(A, npyv_setall_u32(C)) +#define npyv_shr_s32(A, C) vec_sra_s32(A, npyv_setall_u32(C)) #define npyv_shr_u64(A, C) vec_sr(A, npyv_setall_u64(C)) -#define npyv_shr_s64(A, C) vec_sra(A, npyv_setall_u64(C)) +#define 
npyv_shr_s64(A, C) vec_sra_s64(A, npyv_setall_u64(C)) // Right by an immediate constant #define npyv_shri_u16 npyv_shr_u16 @@ -44,15 +44,15 @@ /*************************** * Logical ***************************/ -#define NPYV_IMPL_VSX_BIN_CAST(INTRIN, SFX, CAST) \ +#define NPYV_IMPL_VEC_BIN_CAST(INTRIN, SFX, CAST) \ NPY_FINLINE npyv_##SFX npyv_##INTRIN##_##SFX(npyv_##SFX a, npyv_##SFX b) \ { return (npyv_##SFX)vec_##INTRIN((CAST)a, (CAST)b); } // Up to GCC 6 logical intrinsics don't support bool long long #if defined(__GNUC__) && __GNUC__ <= 6 - #define NPYV_IMPL_VSX_BIN_B64(INTRIN) NPYV_IMPL_VSX_BIN_CAST(INTRIN, b64, npyv_u64) + #define NPYV_IMPL_VEC_BIN_B64(INTRIN) NPYV_IMPL_VEC_BIN_CAST(INTRIN, b64, npyv_u64) #else - #define NPYV_IMPL_VSX_BIN_B64(INTRIN) NPYV_IMPL_VSX_BIN_CAST(INTRIN, b64, npyv_b64) + #define NPYV_IMPL_VEC_BIN_B64(INTRIN) NPYV_IMPL_VEC_BIN_CAST(INTRIN, b64, npyv_b64) #endif // AND #define npyv_and_u8 vec_and @@ -63,12 +63,14 @@ #define npyv_and_s32 vec_and #define npyv_and_u64 vec_and #define npyv_and_s64 vec_and -#define npyv_and_f32 vec_and +#if NPY_SIMD_F32 + #define npyv_and_f32 vec_and +#endif #define npyv_and_f64 vec_and #define npyv_and_b8 vec_and #define npyv_and_b16 vec_and #define npyv_and_b32 vec_and -NPYV_IMPL_VSX_BIN_B64(and) +NPYV_IMPL_VEC_BIN_B64(and) // OR #define npyv_or_u8 vec_or @@ -79,12 +81,14 @@ NPYV_IMPL_VSX_BIN_B64(and) #define npyv_or_s32 vec_or #define npyv_or_u64 vec_or #define npyv_or_s64 vec_or -#define npyv_or_f32 vec_or +#if NPY_SIMD_F32 + #define npyv_or_f32 vec_or +#endif #define npyv_or_f64 vec_or #define npyv_or_b8 vec_or #define npyv_or_b16 vec_or #define npyv_or_b32 vec_or -NPYV_IMPL_VSX_BIN_B64(or) +NPYV_IMPL_VEC_BIN_B64(or) // XOR #define npyv_xor_u8 vec_xor @@ -95,16 +99,18 @@ NPYV_IMPL_VSX_BIN_B64(or) #define npyv_xor_s32 vec_xor #define npyv_xor_u64 vec_xor #define npyv_xor_s64 vec_xor -#define npyv_xor_f32 vec_xor +#if NPY_SIMD_F32 + #define npyv_xor_f32 vec_xor +#endif #define npyv_xor_f64 vec_xor 
#define npyv_xor_b8 vec_xor #define npyv_xor_b16 vec_xor #define npyv_xor_b32 vec_xor -NPYV_IMPL_VSX_BIN_B64(xor) +NPYV_IMPL_VEC_BIN_B64(xor) // NOT // note: we implement npyv_not_b*(boolean types) for internal use*/ -#define NPYV_IMPL_VSX_NOT_INT(VEC_LEN) \ +#define NPYV_IMPL_VEC_NOT_INT(VEC_LEN) \ NPY_FINLINE npyv_u##VEC_LEN npyv_not_u##VEC_LEN(npyv_u##VEC_LEN a) \ { return vec_nor(a, a); } \ NPY_FINLINE npyv_s##VEC_LEN npyv_not_s##VEC_LEN(npyv_s##VEC_LEN a) \ @@ -112,13 +118,13 @@ NPYV_IMPL_VSX_BIN_B64(xor) NPY_FINLINE npyv_b##VEC_LEN npyv_not_b##VEC_LEN(npyv_b##VEC_LEN a) \ { return vec_nor(a, a); } -NPYV_IMPL_VSX_NOT_INT(8) -NPYV_IMPL_VSX_NOT_INT(16) -NPYV_IMPL_VSX_NOT_INT(32) +NPYV_IMPL_VEC_NOT_INT(8) +NPYV_IMPL_VEC_NOT_INT(16) +NPYV_IMPL_VEC_NOT_INT(32) -// up to gcc5 vec_nor doesn't support bool long long -#if defined(__GNUC__) && __GNUC__ > 5 - NPYV_IMPL_VSX_NOT_INT(64) +// on ppc64, up to gcc5 vec_nor doesn't support bool long long +#if defined(NPY_HAVE_VSX) && defined(__GNUC__) && __GNUC__ > 5 + NPYV_IMPL_VEC_NOT_INT(64) #else NPY_FINLINE npyv_u64 npyv_not_u64(npyv_u64 a) { return vec_nor(a, a); } @@ -128,16 +134,23 @@ NPYV_IMPL_VSX_NOT_INT(32) { return (npyv_b64)vec_nor((npyv_u64)a, (npyv_u64)a); } #endif -NPY_FINLINE npyv_f32 npyv_not_f32(npyv_f32 a) -{ return vec_nor(a, a); } +#if NPY_SIMD_F32 + NPY_FINLINE npyv_f32 npyv_not_f32(npyv_f32 a) + { return vec_nor(a, a); } +#endif NPY_FINLINE npyv_f64 npyv_not_f64(npyv_f64 a) { return vec_nor(a, a); } // ANDC, ORC and XNOR #define npyv_andc_u8 vec_andc #define npyv_andc_b8 vec_andc -#define npyv_orc_b8 vec_orc -#define npyv_xnor_b8 vec_eqv +#if defined(NPY_HAVE_VXE) || defined(NPY_HAVE_VSX) + #define npyv_orc_b8 vec_orc + #define npyv_xnor_b8 vec_eqv +#else + #define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(B), A) + #define npyv_xnor_b8(A, B) npyv_not_b8(npyv_xor_b8(B, A)) +#endif /*************************** * Comparison @@ -152,7 +165,9 @@ NPY_FINLINE npyv_f64 npyv_not_f64(npyv_f64 a) #define 
npyv_cmpeq_s32 vec_cmpeq #define npyv_cmpeq_u64 vec_cmpeq #define npyv_cmpeq_s64 vec_cmpeq -#define npyv_cmpeq_f32 vec_cmpeq +#if NPY_SIMD_F32 + #define npyv_cmpeq_f32 vec_cmpeq +#endif #define npyv_cmpeq_f64 vec_cmpeq // Int Not Equal @@ -177,7 +192,9 @@ NPY_FINLINE npyv_f64 npyv_not_f64(npyv_f64 a) #define npyv_cmpneq_s32(A, B) npyv_not_b32(vec_cmpeq(A, B)) #define npyv_cmpneq_u64(A, B) npyv_not_b64(vec_cmpeq(A, B)) #define npyv_cmpneq_s64(A, B) npyv_not_b64(vec_cmpeq(A, B)) - #define npyv_cmpneq_f32(A, B) npyv_not_b32(vec_cmpeq(A, B)) + #if NPY_SIMD_F32 + #define npyv_cmpneq_f32(A, B) npyv_not_b32(vec_cmpeq(A, B)) + #endif #define npyv_cmpneq_f64(A, B) npyv_not_b64(vec_cmpeq(A, B)) #endif @@ -190,12 +207,14 @@ NPY_FINLINE npyv_f64 npyv_not_f64(npyv_f64 a) #define npyv_cmpgt_s32 vec_cmpgt #define npyv_cmpgt_u64 vec_cmpgt #define npyv_cmpgt_s64 vec_cmpgt -#define npyv_cmpgt_f32 vec_cmpgt +#if NPY_SIMD_F32 + #define npyv_cmpgt_f32 vec_cmpgt +#endif #define npyv_cmpgt_f64 vec_cmpgt // Greater than or equal -// up to gcc5 vec_cmpge only supports single and double precision -#if defined(__GNUC__) && __GNUC__ > 5 +// On ppc64le, up to gcc5 vec_cmpge only supports single and double precision +#if defined(NPY_HAVE_VX) || (defined(__GNUC__) && __GNUC__ > 5) #define npyv_cmpge_u8 vec_cmpge #define npyv_cmpge_s8 vec_cmpge #define npyv_cmpge_u16 vec_cmpge @@ -214,7 +233,9 @@ NPY_FINLINE npyv_f64 npyv_not_f64(npyv_f64 a) #define npyv_cmpge_u64(A, B) npyv_not_b64(vec_cmpgt(B, A)) #define npyv_cmpge_s64(A, B) npyv_not_b64(vec_cmpgt(B, A)) #endif -#define npyv_cmpge_f32 vec_cmpge +#if NPY_SIMD_F32 + #define npyv_cmpge_f32 vec_cmpge +#endif #define npyv_cmpge_f64 vec_cmpge // Less than @@ -226,7 +247,9 @@ NPY_FINLINE npyv_f64 npyv_not_f64(npyv_f64 a) #define npyv_cmplt_s32(A, B) npyv_cmpgt_s32(B, A) #define npyv_cmplt_u64(A, B) npyv_cmpgt_u64(B, A) #define npyv_cmplt_s64(A, B) npyv_cmpgt_s64(B, A) -#define npyv_cmplt_f32(A, B) npyv_cmpgt_f32(B, A) +#if NPY_SIMD_F32 + #define 
npyv_cmplt_f32(A, B) npyv_cmpgt_f32(B, A) +#endif #define npyv_cmplt_f64(A, B) npyv_cmpgt_f64(B, A) // Less than or equal @@ -238,13 +261,17 @@ NPY_FINLINE npyv_f64 npyv_not_f64(npyv_f64 a) #define npyv_cmple_s32(A, B) npyv_cmpge_s32(B, A) #define npyv_cmple_u64(A, B) npyv_cmpge_u64(B, A) #define npyv_cmple_s64(A, B) npyv_cmpge_s64(B, A) -#define npyv_cmple_f32(A, B) npyv_cmpge_f32(B, A) +#if NPY_SIMD_F32 + #define npyv_cmple_f32(A, B) npyv_cmpge_f32(B, A) +#endif #define npyv_cmple_f64(A, B) npyv_cmpge_f64(B, A) // check special cases -NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a) -{ return vec_cmpeq(a, a); } +#if NPY_SIMD_F32 + NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a) + { return vec_cmpeq(a, a); } +#endif NPY_FINLINE npyv_b64 npyv_notnan_f64(npyv_f64 a) { return vec_cmpeq(a, a); } -#endif // _NPY_SIMD_VSX_OPERATORS_H +#endif // _NPY_SIMD_VEC_OPERATORS_H diff --git a/numpy/core/src/common/simd/vsx/reorder.h b/numpy/core/src/common/simd/vec/reorder.h index 6533e5093..b60b9287d 100644 --- a/numpy/core/src/common/simd/vsx/reorder.h +++ b/numpy/core/src/common/simd/vec/reorder.h @@ -2,8 +2,8 @@ #error "Not a standalone header" #endif -#ifndef _NPY_SIMD_VSX_REORDER_H -#define _NPY_SIMD_VSX_REORDER_H +#ifndef _NPY_SIMD_VEC_REORDER_H +#define _NPY_SIMD_VEC_REORDER_H // combine lower part of two vectors #define npyv__combinel(A, B) vec_mergeh((npyv_u64)(A), (npyv_u64)(B)) @@ -15,7 +15,9 @@ #define npyv_combinel_s32(A, B) ((npyv_s32)npyv__combinel(A, B)) #define npyv_combinel_u64 vec_mergeh #define npyv_combinel_s64 vec_mergeh -#define npyv_combinel_f32(A, B) ((npyv_f32)npyv__combinel(A, B)) +#if NPY_SIMD_F32 + #define npyv_combinel_f32(A, B) ((npyv_f32)npyv__combinel(A, B)) +#endif #define npyv_combinel_f64 vec_mergeh // combine higher part of two vectors @@ -28,14 +30,16 @@ #define npyv_combineh_s32(A, B) ((npyv_s32)npyv__combineh(A, B)) #define npyv_combineh_u64 vec_mergel #define npyv_combineh_s64 vec_mergel -#define npyv_combineh_f32(A, B) 
((npyv_f32)npyv__combineh(A, B)) +#if NPY_SIMD_F32 + #define npyv_combineh_f32(A, B) ((npyv_f32)npyv__combineh(A, B)) +#endif #define npyv_combineh_f64 vec_mergel /* * combine: combine two vectors from lower and higher parts of two other vectors * zip: interleave two vectors */ -#define NPYV_IMPL_VSX_COMBINE_ZIP(T_VEC, SFX) \ +#define NPYV_IMPL_VEC_COMBINE_ZIP(T_VEC, SFX) \ NPY_FINLINE T_VEC##x2 npyv_combine_##SFX(T_VEC a, T_VEC b) \ { \ T_VEC##x2 r; \ @@ -51,16 +55,18 @@ return r; \ } -NPYV_IMPL_VSX_COMBINE_ZIP(npyv_u8, u8) -NPYV_IMPL_VSX_COMBINE_ZIP(npyv_s8, s8) -NPYV_IMPL_VSX_COMBINE_ZIP(npyv_u16, u16) -NPYV_IMPL_VSX_COMBINE_ZIP(npyv_s16, s16) -NPYV_IMPL_VSX_COMBINE_ZIP(npyv_u32, u32) -NPYV_IMPL_VSX_COMBINE_ZIP(npyv_s32, s32) -NPYV_IMPL_VSX_COMBINE_ZIP(npyv_u64, u64) -NPYV_IMPL_VSX_COMBINE_ZIP(npyv_s64, s64) -NPYV_IMPL_VSX_COMBINE_ZIP(npyv_f32, f32) -NPYV_IMPL_VSX_COMBINE_ZIP(npyv_f64, f64) +NPYV_IMPL_VEC_COMBINE_ZIP(npyv_u8, u8) +NPYV_IMPL_VEC_COMBINE_ZIP(npyv_s8, s8) +NPYV_IMPL_VEC_COMBINE_ZIP(npyv_u16, u16) +NPYV_IMPL_VEC_COMBINE_ZIP(npyv_s16, s16) +NPYV_IMPL_VEC_COMBINE_ZIP(npyv_u32, u32) +NPYV_IMPL_VEC_COMBINE_ZIP(npyv_s32, s32) +NPYV_IMPL_VEC_COMBINE_ZIP(npyv_u64, u64) +NPYV_IMPL_VEC_COMBINE_ZIP(npyv_s64, s64) +#if NPY_SIMD_F32 + NPYV_IMPL_VEC_COMBINE_ZIP(npyv_f32, f32) +#endif +NPYV_IMPL_VEC_COMBINE_ZIP(npyv_f64, f64) // Reverse elements of each 64-bit lane NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a) @@ -100,7 +106,9 @@ NPY_FINLINE npyv_u32 npyv_rev64_u32(npyv_u32 a) } NPY_FINLINE npyv_s32 npyv_rev64_s32(npyv_s32 a) { return (npyv_s32)npyv_rev64_u32((npyv_u32)a); } -NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a) -{ return (npyv_f32)npyv_rev64_u32((npyv_u32)a); } +#if NPY_SIMD_F32 + NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a) + { return (npyv_f32)npyv_rev64_u32((npyv_u32)a); } +#endif -#endif // _NPY_SIMD_VSX_REORDER_H +#endif // _NPY_SIMD_VEC_REORDER_H diff --git a/numpy/core/src/common/simd/vec/utils.h b/numpy/core/src/common/simd/vec/utils.h new 
file mode 100644 index 000000000..f8b28cfeb --- /dev/null +++ b/numpy/core/src/common/simd/vec/utils.h @@ -0,0 +1,84 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_VEC_UTILS_H +#define _NPY_SIMD_VEC_UTILS_H + +// the following intrinsics may not some|all by zvector API on gcc/clang +#ifdef NPY_HAVE_VX + #ifndef vec_neg + #define vec_neg(a) (-(a)) // Vector Negate + #endif + #ifndef vec_add + #define vec_add(a, b) ((a) + (b)) // Vector Add + #endif + #ifndef vec_sub + #define vec_sub(a, b) ((a) - (b)) // Vector Subtract + #endif + #ifndef vec_mul + #define vec_mul(a, b) ((a) * (b)) // Vector Multiply + #endif + #ifndef vec_div + #define vec_div(a, b) ((a) / (b)) // Vector Divide + #endif + #ifndef vec_neg + #define vec_neg(a) (-(a)) + #endif + #ifndef vec_and + #define vec_and(a, b) ((a) & (b)) // Vector AND + #endif + #ifndef vec_or + #define vec_or(a, b) ((a) | (b)) // Vector OR + #endif + #ifndef vec_xor + #define vec_xor(a, b) ((a) ^ (b)) // Vector XOR + #endif + #ifndef vec_sl + #define vec_sl(a, b) ((a) << (b)) // Vector Shift Left + #endif + #ifndef vec_sra + #define vec_sra(a, b) ((a) >> (b)) // Vector Shift Right + #endif + #ifndef vec_sr + #define vec_sr(a, b) ((a) >> (b)) // Vector Shift Right Algebraic + #endif + #ifndef vec_slo + #define vec_slo(a, b) vec_slb(a, (b) << 64) // Vector Shift Left by Octet + #endif + #ifndef vec_sro + #define vec_sro(a, b) vec_srb(a, (b) << 64) // Vector Shift Right by Octet + #endif + // vec_doublee maps to wrong intrin "vfll". 
+ // see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100871 + #if defined(__GNUC__) && !defined(__clang__) + #define npyv_doublee __builtin_s390_vflls + #else + #define npyv_doublee vec_doublee + #endif + // compatibility with vsx + #ifndef vec_vbpermq + #define vec_vbpermq vec_bperm_u128 + #endif + // zvector requires second operand to signed while vsx api expected to be + // unsigned, the following macros are set to remove this conflict + #define vec_sl_s8(a, b) vec_sl(a, (npyv_s8)(b)) + #define vec_sl_s16(a, b) vec_sl(a, (npyv_s16)(b)) + #define vec_sl_s32(a, b) vec_sl(a, (npyv_s32)(b)) + #define vec_sl_s64(a, b) vec_sl(a, (npyv_s64)(b)) + #define vec_sra_s8(a, b) vec_sra(a, (npyv_s8)(b)) + #define vec_sra_s16(a, b) vec_sra(a, (npyv_s16)(b)) + #define vec_sra_s32(a, b) vec_sra(a, (npyv_s32)(b)) + #define vec_sra_s64(a, b) vec_sra(a, (npyv_s64)(b)) +#else + #define vec_sl_s8 vec_sl + #define vec_sl_s16 vec_sl + #define vec_sl_s32 vec_sl + #define vec_sl_s64 vec_sl + #define vec_sra_s8 vec_sra + #define vec_sra_s16 vec_sra + #define vec_sra_s32 vec_sra + #define vec_sra_s64 vec_sra +#endif + +#endif // _NPY_SIMD_VEC_UTILS_H diff --git a/numpy/core/src/common/simd/vsx/vsx.h b/numpy/core/src/common/simd/vec/vec.h index b4d8172a2..abcd33ce1 100644 --- a/numpy/core/src/common/simd/vsx/vsx.h +++ b/numpy/core/src/common/simd/vec/vec.h @@ -1,7 +1,22 @@ +/** + * branch /vec(altivec-like) provides the SIMD operations for + * both IBM VSX(Power) and VX(ZArch). +*/ #ifndef _NPY_SIMD_H_ #error "Not a standalone header" #endif +#if !defined(NPY_HAVE_VX) && !defined(NPY_HAVE_VSX2) + #error "require minimum support VX(zarch11) or VSX2(Power8/ISA2.07)" +#endif + +#if defined(NPY_HAVE_VSX) && !defined(__LITTLE_ENDIAN__) + #error "VSX support doesn't cover big-endian mode yet, only zarch." +#endif +#if defined(NPY_HAVE_VX) && defined(__LITTLE_ENDIAN__) + #error "VX(zarch) support doesn't cover little-endian mode." 
+#endif + #if defined(__GNUC__) && __GNUC__ <= 7 /** * GCC <= 7 produces ambiguous warning caused by -Werror=maybe-uninitialized, @@ -15,8 +30,19 @@ #define NPY_SIMD 128 #define NPY_SIMD_WIDTH 16 #define NPY_SIMD_F64 1 +#if defined(NPY_HAVE_VXE) || defined(NPY_HAVE_VSX) + #define NPY_SIMD_F32 1 +#else + #define NPY_SIMD_F32 0 +#endif #define NPY_SIMD_FMA3 1 // native support +#ifdef NPY_HAVE_VX + #define NPY_SIMD_BIGENDIAN 1 +#else + #define NPY_SIMD_BIGENDIAN 0 +#endif + typedef __vector unsigned char npyv_u8; typedef __vector signed char npyv_s8; typedef __vector unsigned short npyv_u16; @@ -25,7 +51,9 @@ typedef __vector unsigned int npyv_u32; typedef __vector signed int npyv_s32; typedef __vector unsigned long long npyv_u64; typedef __vector signed long long npyv_s64; +#if NPY_SIMD_F32 typedef __vector float npyv_f32; +#endif typedef __vector double npyv_f64; typedef struct { npyv_u8 val[2]; } npyv_u8x2; @@ -36,7 +64,9 @@ typedef struct { npyv_u32 val[2]; } npyv_u32x2; typedef struct { npyv_s32 val[2]; } npyv_s32x2; typedef struct { npyv_u64 val[2]; } npyv_u64x2; typedef struct { npyv_s64 val[2]; } npyv_s64x2; +#if NPY_SIMD_F32 typedef struct { npyv_f32 val[2]; } npyv_f32x2; +#endif typedef struct { npyv_f64 val[2]; } npyv_f64x2; typedef struct { npyv_u8 val[3]; } npyv_u8x3; @@ -47,7 +77,9 @@ typedef struct { npyv_u32 val[3]; } npyv_u32x3; typedef struct { npyv_s32 val[3]; } npyv_s32x3; typedef struct { npyv_u64 val[3]; } npyv_u64x3; typedef struct { npyv_s64 val[3]; } npyv_s64x3; +#if NPY_SIMD_F32 typedef struct { npyv_f32 val[3]; } npyv_f32x3; +#endif typedef struct { npyv_f64 val[3]; } npyv_f64x3; #define npyv_nlanes_u8 16 @@ -67,6 +99,7 @@ typedef struct { npyv_f64 val[3]; } npyv_f64x3; #define npyv_b32 __vector __bool int #define npyv_b64 __vector __bool long long +#include "utils.h" #include "memory.h" #include "misc.h" #include "reorder.h" diff --git a/numpy/core/src/common/simd/vsx/conversion.h b/numpy/core/src/common/simd/vsx/conversion.h deleted file 
mode 100644 index a599f3950..000000000 --- a/numpy/core/src/common/simd/vsx/conversion.h +++ /dev/null @@ -1,146 +0,0 @@ -#ifndef NPY_SIMD - #error "Not a standalone header" -#endif - -#ifndef _NPY_SIMD_VSX_CVT_H -#define _NPY_SIMD_VSX_CVT_H - -// convert boolean vectors to integer vectors -#define npyv_cvt_u8_b8(BL) ((npyv_u8) BL) -#define npyv_cvt_s8_b8(BL) ((npyv_s8) BL) -#define npyv_cvt_u16_b16(BL) ((npyv_u16) BL) -#define npyv_cvt_s16_b16(BL) ((npyv_s16) BL) -#define npyv_cvt_u32_b32(BL) ((npyv_u32) BL) -#define npyv_cvt_s32_b32(BL) ((npyv_s32) BL) -#define npyv_cvt_u64_b64(BL) ((npyv_u64) BL) -#define npyv_cvt_s64_b64(BL) ((npyv_s64) BL) -#define npyv_cvt_f32_b32(BL) ((npyv_f32) BL) -#define npyv_cvt_f64_b64(BL) ((npyv_f64) BL) - -// convert integer vectors to boolean vectors -#define npyv_cvt_b8_u8(A) ((npyv_b8) A) -#define npyv_cvt_b8_s8(A) ((npyv_b8) A) -#define npyv_cvt_b16_u16(A) ((npyv_b16) A) -#define npyv_cvt_b16_s16(A) ((npyv_b16) A) -#define npyv_cvt_b32_u32(A) ((npyv_b32) A) -#define npyv_cvt_b32_s32(A) ((npyv_b32) A) -#define npyv_cvt_b64_u64(A) ((npyv_b64) A) -#define npyv_cvt_b64_s64(A) ((npyv_b64) A) -#define npyv_cvt_b32_f32(A) ((npyv_b32) A) -#define npyv_cvt_b64_f64(A) ((npyv_b64) A) - -//expand -NPY_FINLINE npyv_u16x2 npyv_expand_u16_u8(npyv_u8 data) -{ - npyv_u16x2 r; - npyv_u8 zero = npyv_zero_u8(); - r.val[0] = (npyv_u16)vec_mergeh(data, zero); - r.val[1] = (npyv_u16)vec_mergel(data, zero); - return r; -} - -NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data) -{ - npyv_u32x2 r; - npyv_u16 zero = npyv_zero_u16(); - r.val[0] = (npyv_u32)vec_mergeh(data, zero); - r.val[1] = (npyv_u32)vec_mergel(data, zero); - return r; -} - -// pack two 16-bit boolean into one 8-bit boolean vector -NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b) { - return vec_pack(a, b); -} - -// pack four 32-bit boolean vectors into one 8-bit boolean vector -NPY_FINLINE npyv_b8 npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d) { - 
npyv_b16 ab = vec_pack(a, b); - npyv_b16 cd = vec_pack(c, d); - return npyv_pack_b8_b16(ab, cd); -} - -// pack eight 64-bit boolean vectors into one 8-bit boolean vector -NPY_FINLINE npyv_b8 -npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d, - npyv_b64 e, npyv_b64 f, npyv_b64 g, npyv_b64 h) { - npyv_b32 ab = vec_pack(a, b); - npyv_b32 cd = vec_pack(c, d); - npyv_b32 ef = vec_pack(e, f); - npyv_b32 gh = vec_pack(g, h); - return npyv_pack_b8_b32(ab, cd, ef, gh); -} - -// convert boolean vector to integer bitfield -NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a) -{ - const npyv_u8 qperm = npyv_set_u8(120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0); - return vec_extract((npyv_u32)vec_vbpermq((npyv_u8)a, qperm), 2); -} -NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a) -{ - const npyv_u8 qperm = npyv_setf_u8(128, 112, 96, 80, 64, 48, 32, 16, 0); - return vec_extract((npyv_u32)vec_vbpermq((npyv_u8)a, qperm), 2); -} -NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a) -{ - const npyv_u8 qperm = npyv_setf_u8(128, 96, 64, 32, 0); - return vec_extract((npyv_u32)vec_vbpermq((npyv_u8)a, qperm), 2); -} -NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a) -{ - npyv_u64 bit = npyv_shri_u64((npyv_u64)a, 63); - return vec_extract(bit, 0) | (int)vec_extract(bit, 1) << 1; -} - -// truncate compatible with all compilers(internal use for now) -NPY_FINLINE npyv_s32 npyv__trunc_s32_f32(npyv_f32 a) -{ -#ifdef __IBMC__ - return vec_cts(a, 0); -#elif defined(__clang__) - /** - * old versions of CLANG doesn't support %x<n> in the inline asm template - * which fixes register number when using any of the register constraints wa, wd, wf. - * therefore, we count on built-in functions. 
- */ - return __builtin_convertvector(a, npyv_s32); -#else // gcc - npyv_s32 ret; - __asm__ ("xvcvspsxws %x0,%x1" : "=wa" (ret) : "wa" (a)); - return ret; -#endif -} -NPY_FINLINE npyv_s32 npyv__trunc_s32_f64(npyv_f64 a, npyv_f64 b) -{ -#ifdef __IBMC__ - const npyv_u8 seq_even = npyv_set_u8(0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27); - // unfortunately, XLC missing asm register vsx fixer - // hopefully, xlc can optimize around big-endian compatibility - npyv_s32 lo_even = vec_cts(a, 0); - npyv_s32 hi_even = vec_cts(b, 0); - return vec_perm(lo_even, hi_even, seq_even); -#else - const npyv_u8 seq_odd = npyv_set_u8(4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31); - #ifdef __clang__ - // __builtin_convertvector doesn't support this conversion on wide range of versions - // fortunately, almost all versions have direct builtin of 'xvcvdpsxws' - npyv_s32 lo_odd = __builtin_vsx_xvcvdpsxws(a); - npyv_s32 hi_odd = __builtin_vsx_xvcvdpsxws(b); - #else // gcc - npyv_s32 lo_odd, hi_odd; - __asm__ ("xvcvdpsxws %x0,%x1" : "=wa" (lo_odd) : "wa" (a)); - __asm__ ("xvcvdpsxws %x0,%x1" : "=wa" (hi_odd) : "wa" (b)); - #endif - return vec_perm(lo_odd, hi_odd, seq_odd); -#endif -} - -// round to nearest integer (assuming even) -NPY_FINLINE npyv_s32 npyv_round_s32_f32(npyv_f32 a) -{ return npyv__trunc_s32_f32(vec_rint(a)); } - -NPY_FINLINE npyv_s32 npyv_round_s32_f64(npyv_f64 a, npyv_f64 b) -{ return npyv__trunc_s32_f64(vec_rint(a), vec_rint(b)); } - -#endif // _NPY_SIMD_VSX_CVT_H diff --git a/numpy/core/src/common/umathmodule.h b/numpy/core/src/common/umathmodule.h index fe44fe403..0c69f8f54 100644 --- a/numpy/core/src/common/umathmodule.h +++ b/numpy/core/src/common/umathmodule.h @@ -7,8 +7,14 @@ NPY_NO_EXPORT PyObject * get_sfloat_dtype(PyObject *NPY_UNUSED(mod), PyObject *NPY_UNUSED(args)); +/* Defined in umath/extobj.c */ +NPY_NO_EXPORT int +PyUFunc_GiveFloatingpointErrors(const char *name, int fpe_errors); + PyObject * add_newdoc_ufunc(PyObject 
*NPY_UNUSED(dummy), PyObject *args); PyObject * ufunc_frompyfunc(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *NPY_UNUSED(kwds)); + + int initumath(PyObject *m); #endif /* NUMPY_CORE_SRC_COMMON_UMATHMODULE_H_ */ diff --git a/numpy/core/src/multiarray/argfunc.dispatch.c.src b/numpy/core/src/multiarray/argfunc.dispatch.c.src index cbfaebdb4..1d7753275 100644 --- a/numpy/core/src/multiarray/argfunc.dispatch.c.src +++ b/numpy/core/src/multiarray/argfunc.dispatch.c.src @@ -4,6 +4,7 @@ ** sse2 sse42 xop avx2 avx512_skx ** vsx2 ** neon asimd + ** vx vxe **/ #define NPY_NO_DEPRECATED_API NPY_API_VERSION @@ -123,7 +124,7 @@ simd_@func@_@sfx@(npyv_lanetype_@sfx@ *ip, npy_intp len) * #bsfx = b32, b32, b64, b64, b32, b64# * #is_fp = 0*4, 1*2# * #is_idx32 = 1*2, 0*2, 1, 0# - * #chk_simd = NPY_SIMD*5, NPY_SIMD_F64# + * #chk_simd = NPY_SIMD*4, NPY_SIMD_F32, NPY_SIMD_F64# */ #if @chk_simd@ /**begin repeat1 @@ -298,6 +299,9 @@ scalar_loop: #if NPY_BITSOF_@BTYPE@ == 64 && !NPY_SIMD_F64 #undef TO_SIMD_SFX #endif + #if NPY_BITSOF_@BTYPE@ == 32 && !NPY_SIMD_F32 + #undef TO_SIMD_SFX + #endif #elif @is_unsigned@ #define TO_SIMD_SFX(X) X##_u@len@ #else diff --git a/numpy/core/src/multiarray/array_assign_array.c b/numpy/core/src/multiarray/array_assign_array.c index 020a7f29a..9d5bf6875 100644 --- a/numpy/core/src/multiarray/array_assign_array.c +++ b/numpy/core/src/multiarray/array_assign_array.c @@ -8,11 +8,13 @@ */ #define NPY_NO_DEPRECATED_API NPY_API_VERSION #define _MULTIARRAYMODULE +#define _UMATHMODULE #define PY_SSIZE_T_CLEAN #include <Python.h> #include "numpy/ndarraytypes.h" +#include "numpy/npy_math.h" #include "npy_config.h" #include "npy_pycompat.h" @@ -25,6 +27,8 @@ #include "array_assign.h" #include "dtype_transfer.h" +#include "umathmodule.h" + /* * Check that array data is both uint-aligned and true-aligned for all array * elements, as required by the copy/casting code in lowlevel_strided_loops.c @@ -83,7 +87,7 @@ raw_array_assign_array(int ndim, npy_intp const 
*shape, npy_intp src_strides_it[NPY_MAXDIMS]; npy_intp coord[NPY_MAXDIMS]; - int aligned, needs_api = 0; + int aligned; NPY_BEGIN_THREADS_DEF; @@ -116,15 +120,19 @@ raw_array_assign_array(int ndim, npy_intp const *shape, /* Get the function to do the casting */ NPY_cast_info cast_info; + NPY_ARRAYMETHOD_FLAGS flags; if (PyArray_GetDTypeTransferFunction(aligned, src_strides_it[0], dst_strides_it[0], src_dtype, dst_dtype, 0, - &cast_info, &needs_api) != NPY_SUCCEED) { + &cast_info, &flags) != NPY_SUCCEED) { return -1; } - if (!needs_api) { + if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { + npy_clear_floatstatus_barrier(src_data); + } + if (!(flags & NPY_METH_REQUIRES_PYAPI)) { NPY_BEGIN_THREADS; } @@ -143,6 +151,14 @@ raw_array_assign_array(int ndim, npy_intp const *shape, NPY_END_THREADS; NPY_cast_info_xfree(&cast_info); + + if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { + int fpes = npy_get_floatstatus_barrier(src_data); + if (fpes && PyUFunc_GiveFloatingpointErrors("cast", fpes) < 0) { + return -1; + } + } + return 0; fail: NPY_END_THREADS; @@ -170,7 +186,7 @@ raw_array_wheremasked_assign_array(int ndim, npy_intp const *shape, npy_intp wheremask_strides_it[NPY_MAXDIMS]; npy_intp coord[NPY_MAXDIMS]; - int aligned, needs_api = 0; + int aligned; NPY_BEGIN_THREADS_DEF; @@ -207,17 +223,21 @@ raw_array_wheremasked_assign_array(int ndim, npy_intp const *shape, /* Get the function to do the casting */ NPY_cast_info cast_info; + NPY_ARRAYMETHOD_FLAGS flags; if (PyArray_GetMaskedDTypeTransferFunction(aligned, src_strides_it[0], dst_strides_it[0], wheremask_strides_it[0], src_dtype, dst_dtype, wheremask_dtype, 0, - &cast_info, &needs_api) != NPY_SUCCEED) { + &cast_info, &flags) != NPY_SUCCEED) { return -1; } - if (!needs_api) { + if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { + npy_clear_floatstatus_barrier(src_data); + } + if (!(flags & NPY_METH_REQUIRES_PYAPI)) { NPY_BEGIN_THREADS; } npy_intp strides[2] = {src_strides_it[0], dst_strides_it[0]}; @@ -232,7 +252,7 
@@ raw_array_wheremasked_assign_array(int ndim, npy_intp const *shape, args, &shape_it[0], strides, (npy_bool *)wheremask_data, wheremask_strides_it[0], cast_info.auxdata) < 0) { - break; + goto fail; } } NPY_RAW_ITER_THREE_NEXT(idim, ndim, coord, shape_it, dst_data, dst_strides_it, @@ -241,7 +261,20 @@ raw_array_wheremasked_assign_array(int ndim, npy_intp const *shape, NPY_END_THREADS; NPY_cast_info_xfree(&cast_info); - return (needs_api && PyErr_Occurred()) ? -1 : 0; + + if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { + int fpes = npy_get_floatstatus_barrier(src_data); + if (fpes && PyUFunc_GiveFloatingpointErrors("cast", fpes) < 0) { + return -1; + } + } + + return 0; + +fail: + NPY_END_THREADS; + NPY_cast_info_xfree(&cast_info); + return -1; } /* diff --git a/numpy/core/src/multiarray/array_assign_scalar.c b/numpy/core/src/multiarray/array_assign_scalar.c index 4ffef7ecc..ba964b86d 100644 --- a/numpy/core/src/multiarray/array_assign_scalar.c +++ b/numpy/core/src/multiarray/array_assign_scalar.c @@ -8,11 +8,13 @@ */ #define NPY_NO_DEPRECATED_API NPY_API_VERSION #define _MULTIARRAYMODULE +#define _UMATHMODULE #define PY_SSIZE_T_CLEAN #include <Python.h> #include <numpy/ndarraytypes.h> +#include "numpy/npy_math.h" #include "npy_config.h" #include "npy_pycompat.h" @@ -25,6 +27,8 @@ #include "array_assign.h" #include "dtype_transfer.h" +#include "umathmodule.h" + /* * Assigns the scalar value to every element of the destination raw array. 
* @@ -39,7 +43,7 @@ raw_array_assign_scalar(int ndim, npy_intp const *shape, npy_intp shape_it[NPY_MAXDIMS], dst_strides_it[NPY_MAXDIMS]; npy_intp coord[NPY_MAXDIMS]; - int aligned, needs_api = 0; + int aligned; NPY_BEGIN_THREADS_DEF; @@ -62,15 +66,19 @@ raw_array_assign_scalar(int ndim, npy_intp const *shape, /* Get the function to do the casting */ NPY_cast_info cast_info; + NPY_ARRAYMETHOD_FLAGS flags; if (PyArray_GetDTypeTransferFunction(aligned, 0, dst_strides_it[0], src_dtype, dst_dtype, 0, - &cast_info, &needs_api) != NPY_SUCCEED) { + &cast_info, &flags) != NPY_SUCCEED) { return -1; } - if (!needs_api) { + if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { + npy_clear_floatstatus_barrier(src_data); + } + if (!(flags & NPY_METH_REQUIRES_PYAPI)) { npy_intp nitems = 1, i; for (i = 0; i < ndim; i++) { nitems *= shape_it[i]; @@ -92,6 +100,14 @@ raw_array_assign_scalar(int ndim, npy_intp const *shape, NPY_END_THREADS; NPY_cast_info_xfree(&cast_info); + + if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { + int fpes = npy_get_floatstatus_barrier(src_data); + if (fpes && PyUFunc_GiveFloatingpointErrors("cast", fpes) < 0) { + return -1; + } + } + return 0; fail: NPY_END_THREADS; @@ -117,7 +133,7 @@ raw_array_wheremasked_assign_scalar(int ndim, npy_intp const *shape, npy_intp wheremask_strides_it[NPY_MAXDIMS]; npy_intp coord[NPY_MAXDIMS]; - int aligned, needs_api = 0; + int aligned; NPY_BEGIN_THREADS_DEF; @@ -142,15 +158,19 @@ raw_array_wheremasked_assign_scalar(int ndim, npy_intp const *shape, /* Get the function to do the casting */ NPY_cast_info cast_info; + NPY_ARRAYMETHOD_FLAGS flags; if (PyArray_GetMaskedDTypeTransferFunction(aligned, 0, dst_strides_it[0], wheremask_strides_it[0], src_dtype, dst_dtype, wheremask_dtype, 0, - &cast_info, &needs_api) != NPY_SUCCEED) { + &cast_info, &flags) != NPY_SUCCEED) { return -1; } - if (!needs_api) { + if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { + npy_clear_floatstatus_barrier(src_data); + } + if (!(flags & 
NPY_METH_REQUIRES_PYAPI)) { npy_intp nitems = 1, i; for (i = 0; i < ndim; i++) { nitems *= shape_it[i]; @@ -170,7 +190,7 @@ raw_array_wheremasked_assign_scalar(int ndim, npy_intp const *shape, args, &shape_it[0], strides, (npy_bool *)wheremask_data, wheremask_strides_it[0], cast_info.auxdata) < 0) { - break; + goto fail; } } NPY_RAW_ITER_TWO_NEXT(idim, ndim, coord, shape_it, dst_data, dst_strides_it, @@ -178,7 +198,20 @@ raw_array_wheremasked_assign_scalar(int ndim, npy_intp const *shape, NPY_END_THREADS; NPY_cast_info_xfree(&cast_info); - return (needs_api && PyErr_Occurred()) ? -1 : 0; + + if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { + int fpes = npy_get_floatstatus_barrier(src_data); + if (fpes && PyUFunc_GiveFloatingpointErrors("cast", fpes) < 0) { + return -1; + } + } + + return 0; + +fail: + NPY_END_THREADS; + NPY_cast_info_xfree(&cast_info); + return -1; } /* diff --git a/numpy/core/src/multiarray/array_coercion.c b/numpy/core/src/multiarray/array_coercion.c index 1559f3485..e703e7382 100644 --- a/numpy/core/src/multiarray/array_coercion.c +++ b/numpy/core/src/multiarray/array_coercion.c @@ -9,6 +9,7 @@ #include "lowlevel_strided_loops.h" #include "numpy/arrayobject.h" +#include "numpy/npy_math.h" #include "descriptor.h" #include "convert_datatype.h" @@ -22,6 +23,7 @@ #include "_datetime.h" #include "npy_import.h" +#include "umathmodule.h" /* * This file defines helpers for some of the ctors.c functions which @@ -378,6 +380,49 @@ find_scalar_descriptor( } +/* + * Helper function for casting a raw value from one descriptor to another. + * This helper uses the normal casting machinery, but e.g. does not care about + * checking cast safety. 
+ */ +static int +cast_raw_scalar_item( + PyArray_Descr *from_descr, char *from_item, + PyArray_Descr *to_descr, char *to_item) +{ + NPY_cast_info cast_info; + NPY_ARRAYMETHOD_FLAGS flags; + if (PyArray_GetDTypeTransferFunction( + 0, 0, 0, from_descr, to_descr, 0, &cast_info, + &flags) == NPY_FAIL) { + return -1; + } + + if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { + npy_clear_floatstatus_barrier(from_item); + } + + char *args[2] = {from_item, to_item}; + const npy_intp strides[2] = {0, 0}; + const npy_intp length = 1; + if (cast_info.func(&cast_info.context, + args, &length, strides, cast_info.auxdata) < 0) { + NPY_cast_info_xfree(&cast_info); + return -1; + } + NPY_cast_info_xfree(&cast_info); + + if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { + int fpes = npy_get_floatstatus_barrier(to_item); + if (fpes && PyUFunc_GiveFloatingpointErrors("cast", fpes) < 0) { + return -1; + } + } + + return 0; +} + + /** * Assign a single element in an array from a python value. * @@ -388,26 +433,35 @@ find_scalar_descriptor( * This function handles the cast, which is for example hit when assigning * a float128 to complex128. * - * At this time, this function does not support arrays (historically we - * mainly supported arrays through `__float__()`, etc.). Such support should - * possibly be added (although when called from `PyArray_AssignFromCache` - * the input cannot be an array). - * Note that this is also problematic for some array-likes, such as - * `astropy.units.Quantity` and `np.ma.masked`. These are used to us calling - * `__float__`/`__int__` for 0-D instances in many cases. - * Eventually, we may want to define this as wrong: They must use DTypes - * instead of (only) subclasses. Until then, here as well as in - * `PyArray_AssignFromCache` (which already does this), we need to special - * case 0-D array-likes to behave like arbitrary (unknown!) Python objects. 
+ * TODO: This function probably needs to be passed an "owner" for the sake of + * future HPy (non CPython) support + * + * NOTE: We do support 0-D exact NumPy arrays correctly via casting here. + * There be dragons, because we must NOT support generic array-likes. + * The problem is that some (e.g. astropy's Quantity and our masked + * arrays) have divergent behaviour for `__array__` as opposed to + * `__float__`. And they rely on that. + * That is arguably bad as it limits the things that work seamlessly + * because `__float__`, etc. cannot even begin to cover all of casting. + * However, we have no choice. We simply CANNOT support array-likes + * here without finding a solution for this first. + * And the only plausible one I see currently is expanding protocols + * in some form, either to indicate that we want a scalar or to indicate + * that we want the unsafe version that `__array__` currently gives + * for both objects. + * + * If we ever figure out how to expand this to other array-likes, care + * may need to be taken. `PyArray_FromAny`/`PyArray_AssignFromCache` + * use this function but know whether the input is an array, array-like, + * or scalar. Relaxing things here should be OK, but looks a bit + * like possible recursion, so it may make sense to make a "scalars only" + * version of this function. * * @param descr * @param item * @param value * @return 0 on success -1 on failure. */ -/* - * TODO: This function should possibly be public API. - */ NPY_NO_EXPORT int PyArray_Pack(PyArray_Descr *descr, char *item, PyObject *value) { @@ -433,6 +487,29 @@ PyArray_Pack(PyArray_Descr *descr, char *item, PyObject *value) if (DType == NULL) { return -1; } + if (DType == (PyArray_DTypeMeta *)Py_None && PyArray_CheckExact(value) + && PyArray_NDIM((PyArrayObject *)value) == 0) { + /* + * WARNING: Do NOT relax the above `PyArray_CheckExact`, unless you + * read the function doc NOTE carefully and understood it. 
+ * + * NOTE: The ndim == 0 check should probably be an error, but + * unfortunately `arr.__float__()` works for 1 element arrays, + * so in some contexts we need to let it be handled like a scalar. + * (If we manage to deprecate the above, we can do that.) + */ + Py_DECREF(DType); + + PyArrayObject *arr = (PyArrayObject *)value; + if (PyArray_DESCR(arr) == descr && !PyDataType_REFCHK(descr)) { + /* light-weight fast-path for when the descrs obviously match */ + memcpy(item, PyArray_BYTES(arr), descr->elsize); + return 0; /* success (it was an array-like) */ + } + return cast_raw_scalar_item( + PyArray_DESCR(arr), PyArray_BYTES(arr), descr, item); + + } if (DType == NPY_DTYPE(descr) || DType == (PyArray_DTypeMeta *)Py_None) { /* We can set the element directly (or at least will try to) */ Py_XDECREF(DType); @@ -461,30 +538,8 @@ PyArray_Pack(PyArray_Descr *descr, char *item, PyObject *value) Py_DECREF(tmp_descr); return -1; } - if (PyDataType_REFCHK(tmp_descr)) { - /* We could probably use move-references above */ - PyArray_Item_INCREF(data, tmp_descr); - } - - int res = 0; - int needs_api = 0; - NPY_cast_info cast_info; - if (PyArray_GetDTypeTransferFunction( - 0, 0, 0, tmp_descr, descr, 0, &cast_info, - &needs_api) == NPY_FAIL) { - res = -1; - goto finish; - } - char *args[2] = {data, item}; - const npy_intp strides[2] = {0, 0}; - const npy_intp length = 1; - if (cast_info.func(&cast_info.context, - args, &length, strides, cast_info.auxdata) < 0) { - res = -1; - } - NPY_cast_info_xfree(&cast_info); + int res = cast_raw_scalar_item(tmp_descr, data, descr, item); - finish: if (PyDataType_REFCHK(tmp_descr)) { /* We could probably use move-references above */ PyArray_Item_XDECREF(data, tmp_descr); diff --git a/numpy/core/src/multiarray/array_method.h b/numpy/core/src/multiarray/array_method.h index 30dd94a80..c9ec8903d 100644 --- a/numpy/core/src/multiarray/array_method.h +++ b/numpy/core/src/multiarray/array_method.h @@ -7,6 +7,9 @@ #include <Python.h> #include 
<numpy/ndarraytypes.h> +#ifdef __cplusplus +extern "C" { +#endif typedef enum { /* Flag for whether the GIL is required */ @@ -17,7 +20,11 @@ typedef enum { * setup/check. No function should set error flags and ignore them * since it would interfere with chaining operations (e.g. casting). */ - /* TODO: Change this into a positive flag */ + /* + * TODO: Change this into a positive flag? That would make "combining" + * multiple methods easier. OTOH, if we add more flags, the default + * would be 0 just like it is here. + */ NPY_METH_NO_FLOATINGPOINT_ERRORS = 1 << 2, /* Whether the method supports unaligned access (not runtime) */ NPY_METH_SUPPORTS_UNALIGNED = 1 << 3, @@ -40,6 +47,20 @@ typedef enum { } NPY_ARRAYMETHOD_FLAGS; +/* + * It would be nice to just | flags, but in general it seems that 0 bits + * probably should indicate "default". + * And that is not necessarily compatible with `|`. + * + * NOTE: If made public, should maybe be a function to easier add flags? + */ +#define PyArrayMethod_MINIMAL_FLAGS NPY_METH_NO_FLOATINGPOINT_ERRORS +#define PyArrayMethod_COMBINED_FLAGS(flags1, flags2) \ + ((NPY_ARRAYMETHOD_FLAGS)( \ + ((flags1 | flags2) & ~PyArrayMethod_MINIMAL_FLAGS) \ + | (flags1 & flags2))) + + struct PyArrayMethodObject_tag; /* @@ -249,6 +270,10 @@ PyArrayMethod_FromSpec(PyArrayMethod_Spec *spec); * need better tests when a public version is exposed. 
*/ NPY_NO_EXPORT PyBoundArrayMethodObject * -PyArrayMethod_FromSpec_int(PyArrayMethod_Spec *spec, int private); +PyArrayMethod_FromSpec_int(PyArrayMethod_Spec *spec, int priv); + +#ifdef __cplusplus +} +#endif #endif /* NUMPY_CORE_SRC_MULTIARRAY_ARRAY_METHOD_H_ */ diff --git a/numpy/core/src/multiarray/arrayobject.c b/numpy/core/src/multiarray/arrayobject.c index a1f0e2d5b..d18fe1b10 100644 --- a/numpy/core/src/multiarray/arrayobject.c +++ b/numpy/core/src/multiarray/arrayobject.c @@ -641,375 +641,11 @@ PyArray_FailUnlessWriteable(PyArrayObject *obj, const char *name) return 0; } -/* This also handles possibly mis-aligned data */ -/* Compare s1 and s2 which are not necessarily NULL-terminated. - s1 is of length len1 - s2 is of length len2 - If they are NULL terminated, then stop comparison. -*/ -static int -_myunincmp(npy_ucs4 const *s1, npy_ucs4 const *s2, int len1, int len2) -{ - npy_ucs4 const *sptr; - npy_ucs4 *s1t = NULL; - npy_ucs4 *s2t = NULL; - int val; - npy_intp size; - int diff; - - /* Replace `s1` and `s2` with aligned copies if needed */ - if ((npy_intp)s1 % sizeof(npy_ucs4) != 0) { - size = len1*sizeof(npy_ucs4); - s1t = malloc(size); - memcpy(s1t, s1, size); - s1 = s1t; - } - if ((npy_intp)s2 % sizeof(npy_ucs4) != 0) { - size = len2*sizeof(npy_ucs4); - s2t = malloc(size); - memcpy(s2t, s2, size); - s2 = s1t; - } - - val = PyArray_CompareUCS4(s1, s2, PyArray_MIN(len1,len2)); - if ((val != 0) || (len1 == len2)) { - goto finish; - } - if (len2 > len1) { - sptr = s2+len1; - val = -1; - diff = len2-len1; - } - else { - sptr = s1+len2; - val = 1; - diff=len1-len2; - } - while (diff--) { - if (*sptr != 0) { - goto finish; - } - sptr++; - } - val = 0; - - finish: - /* Cleanup the aligned copies */ - if (s1t) { - free(s1t); - } - if (s2t) { - free(s2t); - } - return val; -} - - - - -/* - * Compare s1 and s2 which are not necessarily NULL-terminated. - * s1 is of length len1 - * s2 is of length len2 - * If they are NULL terminated, then stop comparison. 
- */ -static int -_mystrncmp(char const *s1, char const *s2, int len1, int len2) -{ - char const *sptr; - int val; - int diff; - - val = memcmp(s1, s2, PyArray_MIN(len1, len2)); - if ((val != 0) || (len1 == len2)) { - return val; - } - if (len2 > len1) { - sptr = s2 + len1; - val = -1; - diff = len2 - len1; - } - else { - sptr = s1 + len2; - val = 1; - diff = len1 - len2; - } - while (diff--) { - if (*sptr != 0) { - return val; - } - sptr++; - } - return 0; /* Only happens if NULLs are everywhere */ -} - -/* Borrowed from Numarray */ - -#define SMALL_STRING 2048 - -static void _rstripw(char *s, int n) -{ - int i; - for (i = n - 1; i >= 1; i--) { /* Never strip to length 0. */ - int c = s[i]; - - if (!c || NumPyOS_ascii_isspace((int)c)) { - s[i] = 0; - } - else { - break; - } - } -} - -static void _unistripw(npy_ucs4 *s, int n) -{ - int i; - for (i = n - 1; i >= 1; i--) { /* Never strip to length 0. */ - npy_ucs4 c = s[i]; - if (!c || NumPyOS_ascii_isspace((int)c)) { - s[i] = 0; - } - else { - break; - } - } -} - - -static char * -_char_copy_n_strip(char const *original, char *temp, int nc) -{ - if (nc > SMALL_STRING) { - temp = malloc(nc); - if (!temp) { - PyErr_NoMemory(); - return NULL; - } - } - memcpy(temp, original, nc); - _rstripw(temp, nc); - return temp; -} - -static void -_char_release(char *ptr, int nc) -{ - if (nc > SMALL_STRING) { - free(ptr); - } -} - -static char * -_uni_copy_n_strip(char const *original, char *temp, int nc) -{ - if (nc*sizeof(npy_ucs4) > SMALL_STRING) { - temp = malloc(nc*sizeof(npy_ucs4)); - if (!temp) { - PyErr_NoMemory(); - return NULL; - } - } - memcpy(temp, original, nc*sizeof(npy_ucs4)); - _unistripw((npy_ucs4 *)temp, nc); - return temp; -} - -static void -_uni_release(char *ptr, int nc) -{ - if (nc*sizeof(npy_ucs4) > SMALL_STRING) { - free(ptr); - } -} - - -/* End borrowed from numarray */ - -#define _rstrip_loop(CMP) { \ - void *aptr, *bptr; \ - char atemp[SMALL_STRING], btemp[SMALL_STRING]; \ - while(size--) { \ - aptr = 
stripfunc(iself->dataptr, atemp, N1); \ - if (!aptr) return -1; \ - bptr = stripfunc(iother->dataptr, btemp, N2); \ - if (!bptr) { \ - relfunc(aptr, N1); \ - return -1; \ - } \ - val = compfunc(aptr, bptr, N1, N2); \ - *dptr = (val CMP 0); \ - PyArray_ITER_NEXT(iself); \ - PyArray_ITER_NEXT(iother); \ - dptr += 1; \ - relfunc(aptr, N1); \ - relfunc(bptr, N2); \ - } \ - } - -#define _reg_loop(CMP) { \ - while(size--) { \ - val = compfunc((void *)iself->dataptr, \ - (void *)iother->dataptr, \ - N1, N2); \ - *dptr = (val CMP 0); \ - PyArray_ITER_NEXT(iself); \ - PyArray_ITER_NEXT(iother); \ - dptr += 1; \ - } \ - } - -static int -_compare_strings(PyArrayObject *result, PyArrayMultiIterObject *multi, - int cmp_op, void *func, int rstrip) -{ - PyArrayIterObject *iself, *iother; - npy_bool *dptr; - npy_intp size; - int val; - int N1, N2; - int (*compfunc)(void *, void *, int, int); - void (*relfunc)(char *, int); - char* (*stripfunc)(char const *, char *, int); - - compfunc = func; - dptr = (npy_bool *)PyArray_DATA(result); - iself = multi->iters[0]; - iother = multi->iters[1]; - size = multi->size; - N1 = PyArray_DESCR(iself->ao)->elsize; - N2 = PyArray_DESCR(iother->ao)->elsize; - if ((void *)compfunc == (void *)_myunincmp) { - N1 >>= 2; - N2 >>= 2; - stripfunc = _uni_copy_n_strip; - relfunc = _uni_release; - } - else { - stripfunc = _char_copy_n_strip; - relfunc = _char_release; - } - switch (cmp_op) { - case Py_EQ: - if (rstrip) { - _rstrip_loop(==); - } else { - _reg_loop(==); - } - break; - case Py_NE: - if (rstrip) { - _rstrip_loop(!=); - } else { - _reg_loop(!=); - } - break; - case Py_LT: - if (rstrip) { - _rstrip_loop(<); - } else { - _reg_loop(<); - } - break; - case Py_LE: - if (rstrip) { - _rstrip_loop(<=); - } else { - _reg_loop(<=); - } - break; - case Py_GT: - if (rstrip) { - _rstrip_loop(>); - } else { - _reg_loop(>); - } - break; - case Py_GE: - if (rstrip) { - _rstrip_loop(>=); - } else { - _reg_loop(>=); - } - break; - default: - 
PyErr_SetString(PyExc_RuntimeError, "bad comparison operator"); - return -1; - } - return 0; -} - -#undef _reg_loop -#undef _rstrip_loop -#undef SMALL_STRING +/* From umath/string_ufuncs.cpp/h */ NPY_NO_EXPORT PyObject * -_strings_richcompare(PyArrayObject *self, PyArrayObject *other, int cmp_op, - int rstrip) -{ - PyArrayObject *result; - PyArrayMultiIterObject *mit; - int val; - - if (PyArray_TYPE(self) != PyArray_TYPE(other)) { - /* - * Comparison between Bytes and Unicode is not defined in Py3K; - * we follow. - */ - Py_INCREF(Py_NotImplemented); - return Py_NotImplemented; - } - if (PyArray_ISNOTSWAPPED(self) != PyArray_ISNOTSWAPPED(other)) { - /* Cast `other` to the same byte order as `self` (both unicode here) */ - PyArray_Descr* unicode = PyArray_DescrNew(PyArray_DESCR(self)); - if (unicode == NULL) { - return NULL; - } - unicode->elsize = PyArray_DESCR(other)->elsize; - PyObject *new = PyArray_FromAny((PyObject *)other, - unicode, 0, 0, 0, NULL); - if (new == NULL) { - return NULL; - } - other = (PyArrayObject *)new; - } - else { - Py_INCREF(other); - } - - /* Broad-cast the arrays to a common shape */ - mit = (PyArrayMultiIterObject *)PyArray_MultiIterNew(2, self, other); - Py_DECREF(other); - if (mit == NULL) { - return NULL; - } - - result = (PyArrayObject *)PyArray_NewFromDescr(&PyArray_Type, - PyArray_DescrFromType(NPY_BOOL), - mit->nd, - mit->dimensions, - NULL, NULL, 0, - NULL); - if (result == NULL) { - goto finish; - } - - if (PyArray_TYPE(self) == NPY_UNICODE) { - val = _compare_strings(result, mit, cmp_op, _myunincmp, rstrip); - } - else { - val = _compare_strings(result, mit, cmp_op, _mystrncmp, rstrip); - } - - if (val < 0) { - Py_DECREF(result); - result = NULL; - } - - finish: - Py_DECREF(mit); - return (PyObject *)result; -} +_umath_strings_richcompare( + PyArrayObject *self, PyArrayObject *other, int cmp_op, int rstrip); /* * VOID-type arrays can only be compared equal and not-equal @@ -1130,7 +766,15 @@ _void_compare(PyArrayObject *self, 
PyArrayObject *other, int cmp_op) memcpy(dimensions, PyArray_DIMS((PyArrayObject *)temp), sizeof(npy_intp)*result_ndim); } - dimensions[result_ndim] = -1; + + /* + * Compute the new dimension size manually, as reshaping + * with -1 does not work on empty arrays. + */ + dimensions[result_ndim] = PyArray_MultiplyList( + PyArray_DIMS((PyArrayObject *)temp) + result_ndim, + PyArray_NDIM((PyArrayObject *)temp) - result_ndim); + temp2 = PyArray_Newshape((PyArrayObject *)temp, &newdims, NPY_ANYORDER); if (temp2 == NULL) { @@ -1207,7 +851,7 @@ _void_compare(PyArrayObject *self, PyArrayObject *other, int cmp_op) return NULL; } /* compare as a string. Assumes self and other have same descr->type */ - return _strings_richcompare(self, other, cmp_op, 0); + return _umath_strings_richcompare(self, other, cmp_op, 0); } } @@ -1341,36 +985,6 @@ array_richcompare(PyArrayObject *self, PyObject *other, int cmp_op) PyObject *obj_self = (PyObject *)self; PyObject *result = NULL; - /* Special case for string arrays (which don't and currently can't have - * ufunc loops defined, so there's no point in trying). - */ - if (PyArray_ISSTRING(self)) { - array_other = (PyArrayObject *)PyArray_FromObject(other, - NPY_NOTYPE, 0, 0); - if (array_other == NULL) { - PyErr_Clear(); - /* Never mind, carry on, see what happens */ - } - else if (!PyArray_ISSTRING(array_other)) { - Py_DECREF(array_other); - /* Never mind, carry on, see what happens */ - } - else { - result = _strings_richcompare(self, array_other, cmp_op, 0); - Py_DECREF(array_other); - return result; - } - /* If we reach this point, it means that we are not comparing - * string-to-string. It's possible that this will still work out, - * e.g. if the other array is an object array, then both will be cast - * to object or something? I don't know how that works actually, but - * it does, b/c this works: - * l = ["a", "b"] - * assert np.array(l, dtype="S1") == np.array(l, dtype="O") - * So we fall through and see what happens. 
- */ - } - switch (cmp_op) { case Py_LT: RICHCMP_GIVE_UP_IF_NEEDED(obj_self, other); diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src index ee4f5f312..a9f8dfdd2 100644 --- a/numpy/core/src/multiarray/arraytypes.c.src +++ b/numpy/core/src/multiarray/arraytypes.c.src @@ -7,6 +7,7 @@ #define NPY_NO_DEPRECATED_API NPY_API_VERSION #define _MULTIARRAYMODULE +#define _UMATHMODULE #define _NPY_NO_DEPRECATIONS /* for NPY_CHAR */ #include "numpy/npy_common.h" @@ -37,6 +38,9 @@ #include "npy_buffer.h" #include "arraytypes.h" + +#include "umathmodule.h" + /* * Define a stack allocated dummy array with only the minimum information set: * 1. The descr, the main field interesting here. @@ -96,10 +100,32 @@ MyPyFloat_AsDouble(PyObject *obj) return ret; } + +static float +MyPyFloat_AsFloat(PyObject *obj) +{ + double d_val = MyPyFloat_AsDouble(obj); + float res = (float)d_val; + if (NPY_UNLIKELY(npy_isinf(res) && !npy_isinf(d_val))) { + if (PyUFunc_GiveFloatingpointErrors("cast", NPY_FPE_OVERFLOW) < 0) { + return -1; + } + } + return res; +} + + static npy_half MyPyFloat_AsHalf(PyObject *obj) { - return npy_double_to_half(MyPyFloat_AsDouble(obj)); + double d_val = MyPyFloat_AsDouble(obj); + npy_half res = npy_double_to_half(d_val); + if (NPY_UNLIKELY(npy_half_isinf(res) && !npy_isinf(d_val))) { + if (PyUFunc_GiveFloatingpointErrors("cast", NPY_FPE_OVERFLOW) < 0) { + return npy_double_to_half(-1.); + } + } + return res; } static PyObject * @@ -200,7 +226,7 @@ MyPyLong_AsUnsigned@Type@ (PyObject *obj) * MyPyFloat_FromHalf, PyFloat_FromDouble*2# * #func2 = PyObject_IsTrue, MyPyLong_AsLong*6, MyPyLong_AsUnsignedLong*2, * MyPyLong_AsLongLong, MyPyLong_AsUnsignedLongLong, - * MyPyFloat_AsHalf, MyPyFloat_AsDouble*2# + * MyPyFloat_AsHalf, MyPyFloat_AsFloat, MyPyFloat_AsDouble# * #type = npy_bool, * npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, * npy_long, npy_uint, npy_ulong, npy_longlong, npy_ulonglong, @@ -363,6 +389,26 @@ static 
int } temp.real = (@ftype@) oop.real; temp.imag = (@ftype@) oop.imag; + +#if NPY_SIZEOF_@NAME@ < NPY_SIZEOF_CDOUBLE /* really just float... */ + /* Overflow could have occured converting double to float */ + if (NPY_UNLIKELY((npy_isinf(temp.real) && !npy_isinf(oop.real)) || + (npy_isinf(temp.imag) && !npy_isinf(oop.imag)))) { + int bufsize, errmask; + PyObject *errobj; + + if (PyUFunc_GetPyValues("assignment", &bufsize, &errmask, + &errobj) < 0) { + return -1; + } + int first = 1; + if (PyUFunc_handlefperr(errmask, errobj, NPY_FPE_OVERFLOW, &first)) { + Py_XDECREF(errobj); + return -1; + } + Py_XDECREF(errobj); + } +#endif } memcpy(ov, &temp, PyArray_DESCR(ap)->elsize); @@ -1151,13 +1197,22 @@ static void @totype@ *op = output; while (n--) { - @fromtype@ f = *ip++; - @totype@ t = (@totype@)f; #if @supports_nat@ && @floatingpoint@ - /* Avoid undefined behaviour for NaN -> NaT */ + /* + * volatile works around clang (and gcc sometimes) not branching + * correctly, leading to floating point errors in the test suite. + */ + volatile @fromtype@ f = *ip++; + @totype@ t; + /* Avoid undefined behaviour and warning for NaN -> NaT */ if (npy_isnan(f)) { t = (@totype@)NPY_DATETIME_NAT; } + else { + t = (@totype@)f; + } +#else + @totype@ t = (@totype@)*ip++; #endif *op++ = t; } @@ -1177,13 +1232,22 @@ static void @totype@ *op = output; while (n--) { - @fromtype@ f = *ip; - @totype@ t = (@totype@)f; #if @supports_nat@ - /* Avoid undefined behaviour for NaN -> NaT */ + /* + * volatile works around clang (and gcc sometimes) not branching + * correctly, leading to floating point errors in the test suite. 
+ */ + volatile @fromtype@ f = *ip; + @totype@ t; + /* Avoid undefined behaviour and warning for NaN -> NaT */ if (npy_isnan(f)) { t = (@totype@)NPY_DATETIME_NAT; } + else { + t = (@totype@)f; + } +#else + @totype@ t = (@totype@)*ip; #endif *op++ = t; ip += 2; diff --git a/numpy/core/src/multiarray/common_dtype.h b/numpy/core/src/multiarray/common_dtype.h index 13d38ddf8..9f25fc14e 100644 --- a/numpy/core/src/multiarray/common_dtype.h +++ b/numpy/core/src/multiarray/common_dtype.h @@ -7,6 +7,10 @@ #include <numpy/ndarraytypes.h> #include "dtypemeta.h" +#ifdef __cplusplus +extern "C" { +#endif + NPY_NO_EXPORT PyArray_DTypeMeta * PyArray_CommonDType(PyArray_DTypeMeta *dtype1, PyArray_DTypeMeta *dtype2); @@ -14,4 +18,8 @@ NPY_NO_EXPORT PyArray_DTypeMeta * PyArray_PromoteDTypeSequence( npy_intp length, PyArray_DTypeMeta **dtypes_in); +#ifdef __cplusplus +} +#endif + #endif /* NUMPY_CORE_SRC_MULTIARRAY_COMMON_DTYPE_H_ */ diff --git a/numpy/core/src/multiarray/convert.c b/numpy/core/src/multiarray/convert.c index 630253e38..2aed0bbb4 100644 --- a/numpy/core/src/multiarray/convert.c +++ b/numpy/core/src/multiarray/convert.c @@ -20,6 +20,7 @@ #include "array_assign.h" #include "convert.h" +#include "array_coercion.h" int fallocate(int fd, int mode, off_t offset, off_t len); @@ -358,151 +359,42 @@ PyArray_ToString(PyArrayObject *self, NPY_ORDER order) NPY_NO_EXPORT int PyArray_FillWithScalar(PyArrayObject *arr, PyObject *obj) { - PyArray_Descr *dtype = NULL; - npy_longlong value_buffer[4]; - char *value = NULL; - int retcode = 0; - /* - * If 'arr' is an object array, copy the object as is unless - * 'obj' is a zero-dimensional array, in which case we copy - * the element in that array instead. + * If we knew that the output array has at least one element, we would + * not actually need a helping buffer, we always null it, just in case. + * + * (The longlong here should help with alignment.) 
*/ - if (PyArray_DESCR(arr)->type_num == NPY_OBJECT && - !(PyArray_Check(obj) && - PyArray_NDIM((PyArrayObject *)obj) == 0)) { - value = (char *)&obj; - - dtype = PyArray_DescrFromType(NPY_OBJECT); - if (dtype == NULL) { - return -1; - } - } - /* NumPy scalar */ - else if (PyArray_IsScalar(obj, Generic)) { - dtype = PyArray_DescrFromScalar(obj); - if (dtype == NULL) { - return -1; - } - value = scalar_value(obj, dtype); - if (value == NULL) { - Py_DECREF(dtype); - return -1; - } - } - /* Python boolean */ - else if (PyBool_Check(obj)) { - value = (char *)value_buffer; - *value = (obj == Py_True); - - dtype = PyArray_DescrFromType(NPY_BOOL); - if (dtype == NULL) { - return -1; - } - } - /* Python integer */ - else if (PyLong_Check(obj)) { - /* Try long long before unsigned long long */ - npy_longlong ll_v = PyLong_AsLongLong(obj); - if (error_converting(ll_v)) { - /* Long long failed, try unsigned long long */ - npy_ulonglong ull_v; - PyErr_Clear(); - ull_v = PyLong_AsUnsignedLongLong(obj); - if (ull_v == (unsigned long long)-1 && PyErr_Occurred()) { - return -1; - } - value = (char *)value_buffer; - *(npy_ulonglong *)value = ull_v; - - dtype = PyArray_DescrFromType(NPY_ULONGLONG); - if (dtype == NULL) { - return -1; - } - } - else { - /* Long long succeeded */ - value = (char *)value_buffer; - *(npy_longlong *)value = ll_v; - - dtype = PyArray_DescrFromType(NPY_LONGLONG); - if (dtype == NULL) { - return -1; - } - } - } - /* Python float */ - else if (PyFloat_Check(obj)) { - npy_double v = PyFloat_AsDouble(obj); - if (error_converting(v)) { - return -1; - } - value = (char *)value_buffer; - *(npy_double *)value = v; - - dtype = PyArray_DescrFromType(NPY_DOUBLE); - if (dtype == NULL) { + npy_longlong value_buffer_stack[4] = {0}; + char *value_buffer_heap = NULL; + char *value = (char *)value_buffer_stack; + PyArray_Descr *descr = PyArray_DESCR(arr); + + if (descr->elsize > sizeof(value_buffer_stack)) { + /* We need a large temporary buffer... 
*/ + value_buffer_heap = PyObject_Calloc(1, descr->elsize); + if (value_buffer_heap == NULL) { + PyErr_NoMemory(); return -1; } + value = value_buffer_heap; } - /* Python complex */ - else if (PyComplex_Check(obj)) { - npy_double re, im; - - re = PyComplex_RealAsDouble(obj); - if (error_converting(re)) { - return -1; - } - im = PyComplex_ImagAsDouble(obj); - if (error_converting(im)) { - return -1; - } - value = (char *)value_buffer; - ((npy_double *)value)[0] = re; - ((npy_double *)value)[1] = im; - - dtype = PyArray_DescrFromType(NPY_CDOUBLE); - if (dtype == NULL) { - return -1; - } - } - - /* Use the value pointer we got if possible */ - if (value != NULL) { - /* TODO: switch to SAME_KIND casting */ - retcode = PyArray_AssignRawScalar(arr, dtype, value, - NULL, NPY_UNSAFE_CASTING); - Py_DECREF(dtype); - return retcode; + if (PyArray_Pack(descr, value, obj) < 0) { + PyMem_FREE(value_buffer_heap); + return -1; } - /* Otherwise convert to an array to do the assignment */ - else { - PyArrayObject *src_arr; - /** - * The dtype of the destination is used when converting - * from the pyobject, so that for example a tuple gets - * recognized as a struct scalar of the required type. 
- */ - Py_INCREF(PyArray_DTYPE(arr)); - src_arr = (PyArrayObject *)PyArray_FromAny(obj, - PyArray_DTYPE(arr), 0, 0, 0, NULL); - if (src_arr == NULL) { - return -1; - } - - if (PyArray_NDIM(src_arr) != 0) { - PyErr_SetString(PyExc_ValueError, - "Input object to FillWithScalar is not a scalar"); - Py_DECREF(src_arr); - return -1; - } - - retcode = PyArray_CopyInto(arr, src_arr); + /* + * There is no cast anymore, the above already coerced using scalar + * coercion rules + */ + int retcode = raw_array_assign_scalar( + PyArray_NDIM(arr), PyArray_DIMS(arr), descr, + PyArray_BYTES(arr), PyArray_STRIDES(arr), + descr, value); - Py_DECREF(src_arr); - return retcode; - } + PyMem_FREE(value_buffer_heap); + return retcode; } /* diff --git a/numpy/core/src/multiarray/convert_datatype.c b/numpy/core/src/multiarray/convert_datatype.c index 8d0a4cd56..bc8a3bf88 100644 --- a/numpy/core/src/multiarray/convert_datatype.c +++ b/numpy/core/src/multiarray/convert_datatype.c @@ -1691,8 +1691,12 @@ PyArray_ResultType( all_DTypes[i_all] = &PyArray_PyComplexAbstractDType; } else { - /* N.B.: Could even be an object dtype here for large ints */ + /* This could even be an object dtype here for large ints */ all_DTypes[i_all] = &PyArray_PyIntAbstractDType; + if (PyArray_TYPE(arrs[i]) != NPY_LONG) { + /* Not a "normal" scalar, so we cannot avoid the legacy path */ + all_pyscalar = 0; + } } Py_INCREF(all_DTypes[i_all]); /* @@ -3042,26 +3046,22 @@ nonstructured_to_structured_get_loop( NPY_ARRAYMETHOD_FLAGS *flags) { if (context->descriptors[1]->names != NULL) { - int needs_api = 0; if (get_fields_transfer_function( aligned, strides[0], strides[1], context->descriptors[0], context->descriptors[1], move_references, out_loop, out_transferdata, - &needs_api) == NPY_FAIL) { + flags) == NPY_FAIL) { return -1; } - *flags = needs_api ? 
NPY_METH_REQUIRES_PYAPI : 0; } else if (context->descriptors[1]->subarray != NULL) { - int needs_api = 0; if (get_subarray_transfer_function( aligned, strides[0], strides[1], context->descriptors[0], context->descriptors[1], move_references, out_loop, out_transferdata, - &needs_api) == NPY_FAIL) { + flags) == NPY_FAIL) { return -1; } - *flags = needs_api ? NPY_METH_REQUIRES_PYAPI : 0; } else { /* @@ -3204,26 +3204,22 @@ structured_to_nonstructured_get_loop( NPY_ARRAYMETHOD_FLAGS *flags) { if (context->descriptors[0]->names != NULL) { - int needs_api = 0; if (get_fields_transfer_function( aligned, strides[0], strides[1], context->descriptors[0], context->descriptors[1], move_references, out_loop, out_transferdata, - &needs_api) == NPY_FAIL) { + flags) == NPY_FAIL) { return -1; } - *flags = needs_api ? NPY_METH_REQUIRES_PYAPI : 0; } else if (context->descriptors[0]->subarray != NULL) { - int needs_api = 0; if (get_subarray_transfer_function( aligned, strides[0], strides[1], context->descriptors[0], context->descriptors[1], move_references, out_loop, out_transferdata, - &needs_api) == NPY_FAIL) { + flags) == NPY_FAIL) { return -1; } - *flags = needs_api ? NPY_METH_REQUIRES_PYAPI : 0; } else { /* @@ -3513,27 +3509,23 @@ void_to_void_get_loop( { if (context->descriptors[0]->names != NULL || context->descriptors[1]->names != NULL) { - int needs_api = 0; if (get_fields_transfer_function( aligned, strides[0], strides[1], context->descriptors[0], context->descriptors[1], move_references, out_loop, out_transferdata, - &needs_api) == NPY_FAIL) { + flags) == NPY_FAIL) { return -1; } - *flags = needs_api ? 
NPY_METH_REQUIRES_PYAPI : 0; } else if (context->descriptors[0]->subarray != NULL || context->descriptors[1]->subarray != NULL) { - int needs_api = 0; if (get_subarray_transfer_function( aligned, strides[0], strides[1], context->descriptors[0], context->descriptors[1], move_references, out_loop, out_transferdata, - &needs_api) == NPY_FAIL) { + flags) == NPY_FAIL) { return -1; } - *flags = needs_api ? NPY_METH_REQUIRES_PYAPI : 0; } else { /* @@ -3546,7 +3538,7 @@ void_to_void_get_loop( out_loop, out_transferdata) == NPY_FAIL) { return -1; } - *flags = 0; + *flags = PyArrayMethod_MINIMAL_FLAGS; } return 0; } diff --git a/numpy/core/src/multiarray/convert_datatype.h b/numpy/core/src/multiarray/convert_datatype.h index d1865d1c2..af6d790cf 100644 --- a/numpy/core/src/multiarray/convert_datatype.h +++ b/numpy/core/src/multiarray/convert_datatype.h @@ -3,6 +3,10 @@ #include "array_method.h" +#ifdef __cplusplus +extern "C" { +#endif + extern NPY_NO_EXPORT npy_intp REQUIRED_STR_LEN[]; NPY_NO_EXPORT PyObject * @@ -34,7 +38,7 @@ dtype_kind_to_ordering(char kind); /* Used by PyArray_CanCastArrayTo and in the legacy ufunc type resolution */ NPY_NO_EXPORT npy_bool can_cast_scalar_to(PyArray_Descr *scal_type, char *scal_data, - PyArray_Descr *to, NPY_CASTING casting); + PyArray_Descr *to, NPY_CASTING casting); NPY_NO_EXPORT int should_use_min_scalar(npy_intp narrs, PyArrayObject **arr, @@ -59,7 +63,7 @@ NPY_NO_EXPORT int PyArray_AddCastingImplementation(PyBoundArrayMethodObject *meth); NPY_NO_EXPORT int -PyArray_AddCastingImplementation_FromSpec(PyArrayMethod_Spec *spec, int private); +PyArray_AddCastingImplementation_FromSpec(PyArrayMethod_Spec *spec, int private_); NPY_NO_EXPORT NPY_CASTING PyArray_MinCastSafety(NPY_CASTING casting1, NPY_CASTING casting2); @@ -99,4 +103,8 @@ simple_cast_resolve_descriptors( NPY_NO_EXPORT int PyArray_InitializeCasts(void); +#ifdef __cplusplus +} +#endif + #endif /* NUMPY_CORE_SRC_MULTIARRAY_CONVERT_DATATYPE_H_ */ diff --git 
a/numpy/core/src/multiarray/ctors.c b/numpy/core/src/multiarray/ctors.c index c780f4b2b..c3d66dd6b 100644 --- a/numpy/core/src/multiarray/ctors.c +++ b/numpy/core/src/multiarray/ctors.c @@ -1,5 +1,6 @@ #define NPY_NO_DEPRECATED_API NPY_API_VERSION #define _MULTIARRAYMODULE +#define _UMATHMODULE #define PY_SSIZE_T_CLEAN #include <Python.h> @@ -33,6 +34,8 @@ #include "get_attr_string.h" #include "array_coercion.h" +#include "umathmodule.h" + /* * Reading from a file or a string. * @@ -465,55 +468,12 @@ PyArray_AssignFromCache_Recursive( PyArrayObject *self, const int ndim, coercion_cache_obj **cache) { /* Consume first cache element by extracting information and freeing it */ - PyObject *original_obj = (*cache)->converted_obj; PyObject *obj = (*cache)->arr_or_sequence; Py_INCREF(obj); npy_bool sequence = (*cache)->sequence; int depth = (*cache)->depth; *cache = npy_unlink_coercion_cache(*cache); - /* - * The maximum depth is special (specifically for objects), but usually - * unrolled in the sequence branch below. - */ - if (NPY_UNLIKELY(depth == ndim)) { - /* - * We have reached the maximum depth. We should simply assign to the - * element in principle. There is one exception. If this is a 0-D - * array being stored into a 0-D array (but we do not reach here then). - */ - if (PyArray_ISOBJECT(self)) { - assert(ndim != 0); /* guaranteed by PyArray_AssignFromCache */ - assert(PyArray_NDIM(self) == 0); - Py_DECREF(obj); - return PyArray_Pack(PyArray_DESCR(self), PyArray_BYTES(self), - original_obj); - } - if (sequence) { - /* - * Sanity check which may be removed, the error is raised already - * in `PyArray_DiscoverDTypeAndShape`. - */ - assert(0); - PyErr_SetString(PyExc_RuntimeError, - "setting an array element with a sequence"); - goto fail; - } - else if (original_obj != obj || !PyArray_CheckExact(obj)) { - /* - * If the leave node is an array-like, but not a numpy array, - * we pretend it is an arbitrary scalar. 
This means that in - * most cases (where the dtype is int or float), we will end - * up using float(array-like), or int(array-like). That does - * not support general casting, but helps Quantity and masked - * arrays, because it allows them to raise an error when - * `__float__()` or `__int__()` is called. - */ - Py_DECREF(obj); - return PyArray_SETITEM(self, PyArray_BYTES(self), original_obj); - } - } - /* The element is either a sequence, or an array */ if (!sequence) { /* Straight forward array assignment */ @@ -535,20 +495,24 @@ PyArray_AssignFromCache_Recursive( for (npy_intp i = 0; i < length; i++) { PyObject *value = PySequence_Fast_GET_ITEM(obj, i); - if (*cache == NULL || (*cache)->converted_obj != value || - (*cache)->depth != depth + 1) { - if (ndim != depth + 1) { - PyErr_SetString(PyExc_RuntimeError, - "Inconsistent object during array creation? " - "Content of sequences changed (now too shallow)."); - goto fail; - } - /* Straight forward assignment of elements */ + if (ndim == depth + 1) { + /* + * Straight forward assignment of elements. Note that it is + * possible for such an element to be a 0-D array or array-like. + * `PyArray_Pack` supports arrays as well as we want: We + * support exact NumPy arrays, but at this point ignore others. + * (Please see the `PyArray_Pack` function comment if this + * rightly confuses you.) + */ char *item; item = (PyArray_BYTES(self) + i * PyArray_STRIDES(self)[0]); if (PyArray_Pack(PyArray_DESCR(self), item, value) < 0) { goto fail; } + /* If this was an array(-like) we still need to unlike int: */ + if (*cache != NULL && (*cache)->converted_obj == value) { + *cache = npy_unlink_coercion_cache(*cache); + } } else { PyArrayObject *view; @@ -2780,18 +2744,22 @@ PyArray_CopyAsFlat(PyArrayObject *dst, PyArrayObject *src, NPY_ORDER order) * contiguous strides, etc. 
*/ NPY_cast_info cast_info; + NPY_ARRAYMETHOD_FLAGS flags; if (PyArray_GetDTypeTransferFunction( IsUintAligned(src) && IsAligned(src) && IsUintAligned(dst) && IsAligned(dst), src_stride, dst_stride, PyArray_DESCR(src), PyArray_DESCR(dst), 0, - &cast_info, &needs_api) != NPY_SUCCEED) { + &cast_info, &flags) != NPY_SUCCEED) { NpyIter_Deallocate(dst_iter); NpyIter_Deallocate(src_iter); return -1; } - + needs_api |= (flags & NPY_METH_REQUIRES_PYAPI) != 0; + if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { + npy_clear_floatstatus_barrier((char *)src_iter); + } if (!needs_api) { NPY_BEGIN_THREADS; } @@ -2843,8 +2811,20 @@ PyArray_CopyAsFlat(PyArrayObject *dst, PyArrayObject *src, NPY_ORDER order) NPY_END_THREADS; NPY_cast_info_xfree(&cast_info); - NpyIter_Deallocate(dst_iter); - NpyIter_Deallocate(src_iter); + if (!NpyIter_Deallocate(dst_iter)) { + res = -1; + } + if (!NpyIter_Deallocate(src_iter)) { + res = -1; + } + + if (res == 0 && !(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { + int fpes = npy_get_floatstatus_barrier((char *)src_iter); + if (fpes && PyUFunc_GiveFloatingpointErrors("cast", fpes) < 0) { + return -1; + } + } + return res; } diff --git a/numpy/core/src/multiarray/dtype_transfer.c b/numpy/core/src/multiarray/dtype_transfer.c index 18de5d132..f8458d2d7 100644 --- a/numpy/core/src/multiarray/dtype_transfer.c +++ b/numpy/core/src/multiarray/dtype_transfer.c @@ -11,12 +11,14 @@ */ #define NPY_NO_DEPRECATED_API NPY_API_VERSION #define _MULTIARRAYMODULE +#define _UMATHMODULE #define PY_SSIZE_T_CLEAN #include <Python.h> #include <structmember.h> #include "numpy/arrayobject.h" +#include "numpy/npy_math.h" #include "lowlevel_strided_loops.h" #include "npy_pycompat.h" @@ -35,6 +37,8 @@ #include "array_method.h" #include "array_coercion.h" +#include "umathmodule.h" + #define NPY_LOWLEVEL_BUFFER_BLOCKSIZE 128 /********** PRINTF DEBUG TRACING **************/ @@ -1506,7 +1510,7 @@ get_one_to_n_transfer_function(int aligned, npy_intp N, PyArrayMethod_StridedLoop 
**out_stransfer, NpyAuxData **out_transferdata, - int *out_needs_api) + NPY_ARRAYMETHOD_FLAGS *out_flags) { _one_to_n_data *data = PyMem_Malloc(sizeof(_one_to_n_data)); if (data == NULL) { @@ -1530,18 +1534,19 @@ get_one_to_n_transfer_function(int aligned, src_dtype, dst_dtype, 0, &data->wrapped, - out_needs_api) != NPY_SUCCEED) { + out_flags) != NPY_SUCCEED) { NPY_AUXDATA_FREE((NpyAuxData *)data); return NPY_FAIL; } /* If the src object will need a DECREF, set src_dtype */ if (move_references && PyDataType_REFCHK(src_dtype)) { + *out_flags |= NPY_METH_REQUIRES_PYAPI; if (get_decref_transfer_function(aligned, src_stride, src_dtype, &data->decref_src, - out_needs_api) != NPY_SUCCEED) { + NULL) != NPY_SUCCEED) { NPY_AUXDATA_FREE((NpyAuxData *)data); return NPY_FAIL; } @@ -1667,7 +1672,7 @@ get_n_to_n_transfer_function(int aligned, npy_intp N, PyArrayMethod_StridedLoop **out_stransfer, NpyAuxData **out_transferdata, - int *out_needs_api) + NPY_ARRAYMETHOD_FLAGS *out_flags) { _n_to_n_data *data = PyMem_Malloc(sizeof(_n_to_n_data)); if (data == NULL) { @@ -1699,7 +1704,7 @@ get_n_to_n_transfer_function(int aligned, src_dtype, dst_dtype, move_references, &data->wrapped, - out_needs_api) != NPY_SUCCEED) { + out_flags) != NPY_SUCCEED) { NPY_AUXDATA_FREE((NpyAuxData *)data); return NPY_FAIL; } @@ -1913,7 +1918,7 @@ get_subarray_broadcast_transfer_function(int aligned, int move_references, PyArrayMethod_StridedLoop **out_stransfer, NpyAuxData **out_transferdata, - int *out_needs_api) + NPY_ARRAYMETHOD_FLAGS *out_flags) { _subarray_broadcast_data *data; npy_intp structsize, loop_index, run, run_size, @@ -1946,7 +1951,7 @@ get_subarray_broadcast_transfer_function(int aligned, src_dtype, dst_dtype, 0, &data->wrapped, - out_needs_api) != NPY_SUCCEED) { + out_flags) != NPY_SUCCEED) { NPY_AUXDATA_FREE((NpyAuxData *)data); return NPY_FAIL; } @@ -1958,7 +1963,7 @@ get_subarray_broadcast_transfer_function(int aligned, src_dtype, NULL, 1, &data->decref_src, - out_needs_api) != 
NPY_SUCCEED) { + out_flags) != NPY_SUCCEED) { NPY_AUXDATA_FREE((NpyAuxData *)data); return NPY_FAIL; } @@ -1971,7 +1976,7 @@ get_subarray_broadcast_transfer_function(int aligned, dst_dtype, NULL, 1, &data->decref_dst, - out_needs_api) != NPY_SUCCEED) { + out_flags) != NPY_SUCCEED) { NPY_AUXDATA_FREE((NpyAuxData *)data); return NPY_FAIL; } @@ -2087,7 +2092,7 @@ get_subarray_transfer_function(int aligned, int move_references, PyArrayMethod_StridedLoop **out_stransfer, NpyAuxData **out_transferdata, - int *out_needs_api) + NPY_ARRAYMETHOD_FLAGS *out_flags) { PyArray_Dims src_shape = {NULL, -1}, dst_shape = {NULL, -1}; npy_intp src_size = 1, dst_size = 1; @@ -2132,7 +2137,7 @@ get_subarray_transfer_function(int aligned, move_references, src_size, out_stransfer, out_transferdata, - out_needs_api); + out_flags); } /* Copy the src value to all the dst values */ else if (src_size == 1) { @@ -2145,7 +2150,7 @@ get_subarray_transfer_function(int aligned, move_references, dst_size, out_stransfer, out_transferdata, - out_needs_api); + out_flags); } /* * Copy the subarray with broadcasting, truncating, and zero-padding @@ -2159,7 +2164,7 @@ get_subarray_transfer_function(int aligned, src_shape, dst_shape, move_references, out_stransfer, out_transferdata, - out_needs_api); + out_flags); npy_free_cache_dim_obj(src_shape); npy_free_cache_dim_obj(dst_shape); @@ -2277,7 +2282,7 @@ get_fields_transfer_function(int NPY_UNUSED(aligned), int move_references, PyArrayMethod_StridedLoop **out_stransfer, NpyAuxData **out_transferdata, - int *out_needs_api) + NPY_ARRAYMETHOD_FLAGS *out_flags) { PyObject *key, *tup, *title; PyArray_Descr *src_fld_dtype, *dst_fld_dtype; @@ -2308,6 +2313,7 @@ get_fields_transfer_function(int NPY_UNUSED(aligned), data->base.clone = &_field_transfer_data_clone; data->field_count = 0; + *out_flags = PyArrayMethod_MINIMAL_FLAGS; for (i = 0; i < field_count; ++i) { key = PyTuple_GET_ITEM(dst_dtype->names, i); tup = PyDict_GetItem(dst_dtype->fields, key); @@ -2316,15 
+2322,17 @@ get_fields_transfer_function(int NPY_UNUSED(aligned), PyMem_Free(data); return NPY_FAIL; } + NPY_ARRAYMETHOD_FLAGS field_flags; if (PyArray_GetDTypeTransferFunction(0, src_stride, dst_stride, src_dtype, dst_fld_dtype, 0, &data->fields[i].info, - out_needs_api) != NPY_SUCCEED) { + &field_flags) != NPY_SUCCEED) { NPY_AUXDATA_FREE((NpyAuxData *)data); return NPY_FAIL; } + *out_flags = PyArrayMethod_COMBINED_FLAGS(*out_flags, field_flags); data->fields[i].src_offset = 0; data->fields[i].dst_offset = dst_offset; data->field_count++; @@ -2336,11 +2344,12 @@ get_fields_transfer_function(int NPY_UNUSED(aligned), * input, the second one (normally output) just does not matter here. */ if (move_references && PyDataType_REFCHK(src_dtype)) { + *out_flags |= NPY_METH_REQUIRES_PYAPI; if (get_decref_transfer_function(0, src_stride, src_dtype, &data->fields[field_count].info, - out_needs_api) != NPY_SUCCEED) { + NULL) != NPY_SUCCEED) { NPY_AUXDATA_FREE((NpyAuxData *)data); return NPY_FAIL; } @@ -2388,7 +2397,7 @@ get_fields_transfer_function(int NPY_UNUSED(aligned), src_fld_dtype, dst_dtype, move_references, &data->fields[0].info, - out_needs_api) != NPY_SUCCEED) { + out_flags) != NPY_SUCCEED) { PyMem_Free(data); return NPY_FAIL; } @@ -2423,6 +2432,7 @@ get_fields_transfer_function(int NPY_UNUSED(aligned), data->base.clone = &_field_transfer_data_clone; data->field_count = 0; + *out_flags = PyArrayMethod_MINIMAL_FLAGS; /* set up the transfer function for each field */ for (i = 0; i < field_count; ++i) { key = PyTuple_GET_ITEM(dst_dtype->names, i); @@ -2440,15 +2450,17 @@ get_fields_transfer_function(int NPY_UNUSED(aligned), return NPY_FAIL; } + NPY_ARRAYMETHOD_FLAGS field_flags; if (PyArray_GetDTypeTransferFunction(0, src_stride, dst_stride, src_fld_dtype, dst_fld_dtype, move_references, &data->fields[i].info, - out_needs_api) != NPY_SUCCEED) { + &field_flags) != NPY_SUCCEED) { NPY_AUXDATA_FREE((NpyAuxData *)data); return NPY_FAIL; } + *out_flags = 
PyArrayMethod_COMBINED_FLAGS(*out_flags, field_flags); data->fields[i].src_offset = src_offset; data->fields[i].dst_offset = dst_offset; data->field_count++; @@ -2748,11 +2760,12 @@ get_decref_transfer_function(int aligned, src_size = PyArray_MultiplyList(src_shape.ptr, src_shape.len); npy_free_cache_dim_obj(src_shape); + NPY_ARRAYMETHOD_FLAGS ignored_flags; if (get_n_to_n_transfer_function(aligned, src_stride, 0, src_dtype->subarray->base, NULL, 1, src_size, &cast_info->func, &cast_info->auxdata, - out_needs_api) != NPY_SUCCEED) { + &ignored_flags) != NPY_SUCCEED) { return NPY_FAIL; } @@ -3098,7 +3111,7 @@ define_cast_for_descrs( npy_intp src_stride, npy_intp dst_stride, PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype, int move_references, - NPY_cast_info *cast_info, int *out_needs_api) + NPY_cast_info *cast_info, NPY_ARRAYMETHOD_FLAGS *out_flags) { /* Storage for all cast info in case multi-step casting is necessary */ _multistep_castdata castdata; @@ -3109,6 +3122,7 @@ define_cast_for_descrs( /* `view_offset` passed to `init_cast_info` but unused for the main cast */ npy_intp view_offset = NPY_MIN_INTP; NPY_CASTING casting = -1; + *out_flags = PyArrayMethod_MINIMAL_FLAGS; if (init_cast_info( cast_info, &casting, &view_offset, src_dtype, dst_dtype, 1) < 0) { @@ -3159,7 +3173,7 @@ define_cast_for_descrs( } assert(castdata.from.func != NULL); - *out_needs_api |= (flags & NPY_METH_REQUIRES_PYAPI) != 0; + *out_flags = PyArrayMethod_COMBINED_FLAGS(*out_flags, flags); /* The main cast now uses a buffered input: */ src_stride = strides[1]; move_references = 1; /* main cast has to clear the buffer */ @@ -3198,7 +3212,7 @@ define_cast_for_descrs( } assert(castdata.to.func != NULL); - *out_needs_api |= (flags & NPY_METH_REQUIRES_PYAPI) != 0; + *out_flags = PyArrayMethod_COMBINED_FLAGS(*out_flags, flags); /* The main cast now uses a buffered input: */ dst_stride = strides[0]; if (castdata.from.func != NULL) { @@ -3219,7 +3233,7 @@ define_cast_for_descrs( goto fail; } - 
*out_needs_api |= (flags & NPY_METH_REQUIRES_PYAPI) != 0; + *out_flags = PyArrayMethod_COMBINED_FLAGS(*out_flags, flags); if (castdata.from.func == NULL && castdata.to.func == NULL) { /* Most of the time, there will be only one step required. */ @@ -3256,7 +3270,7 @@ PyArray_GetDTypeTransferFunction(int aligned, PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype, int move_references, NPY_cast_info *cast_info, - int *out_needs_api) + NPY_ARRAYMETHOD_FLAGS *out_flags) { assert(src_dtype != NULL); @@ -3271,17 +3285,24 @@ PyArray_GetDTypeTransferFunction(int aligned, */ if (dst_dtype == NULL) { assert(move_references); - return get_decref_transfer_function(aligned, + int needs_api = 0; + int res = get_decref_transfer_function(aligned, src_dtype->elsize, src_dtype, cast_info, - out_needs_api); + &needs_api); + /* decref'ing never creates floating point errors, so just ignore it */ + *out_flags = PyArrayMethod_MINIMAL_FLAGS; + if (needs_api) { + *out_flags |= NPY_METH_REQUIRES_PYAPI; + } + return res; } if (define_cast_for_descrs(aligned, src_stride, dst_stride, src_dtype, dst_dtype, move_references, - cast_info, out_needs_api) < 0) { + cast_info, out_flags) < 0) { return NPY_FAIL; } @@ -3353,21 +3374,29 @@ wrap_aligned_transferfunction( * have an explicit implementation instead if we want performance. 
*/ if (must_wrap || src_wrapped_dtype != src_dtype) { + NPY_ARRAYMETHOD_FLAGS flags; if (PyArray_GetDTypeTransferFunction(aligned, src_stride, castdata.main.descriptors[0]->elsize, src_dtype, castdata.main.descriptors[0], 0, - &castdata.from, out_needs_api) != NPY_SUCCEED) { + &castdata.from, &flags) != NPY_SUCCEED) { goto fail; } + if (flags & NPY_METH_REQUIRES_PYAPI) { + *out_needs_api = 1; + } } if (must_wrap || dst_wrapped_dtype != dst_dtype) { + NPY_ARRAYMETHOD_FLAGS flags; if (PyArray_GetDTypeTransferFunction(aligned, castdata.main.descriptors[1]->elsize, dst_stride, castdata.main.descriptors[1], dst_dtype, 1, /* clear buffer if it includes references */ - &castdata.to, out_needs_api) != NPY_SUCCEED) { + &castdata.to, &flags) != NPY_SUCCEED) { goto fail; } + if (flags & NPY_METH_REQUIRES_PYAPI) { + *out_needs_api = 1; + } } *out_transferdata = _multistep_cast_auxdata_clone_int(&castdata, 1); @@ -3492,7 +3521,7 @@ PyArray_GetMaskedDTypeTransferFunction(int aligned, PyArray_Descr *mask_dtype, int move_references, NPY_cast_info *cast_info, - int *out_needs_api) + NPY_ARRAYMETHOD_FLAGS *out_flags) { NPY_cast_info_init(cast_info); @@ -3520,18 +3549,19 @@ PyArray_GetMaskedDTypeTransferFunction(int aligned, src_dtype, dst_dtype, move_references, &data->wrapped, - out_needs_api) != NPY_SUCCEED) { + out_flags) != NPY_SUCCEED) { PyMem_Free(data); return NPY_FAIL; } /* If the src object will need a DECREF, get a function to handle that */ if (move_references && PyDataType_REFCHK(src_dtype)) { + *out_flags |= NPY_METH_REQUIRES_PYAPI; if (get_decref_transfer_function(aligned, src_stride, src_dtype, &data->decref_src, - out_needs_api) != NPY_SUCCEED) { + NULL) != NPY_SUCCEED) { NPY_AUXDATA_FREE((NpyAuxData *)data); return NPY_FAIL; } @@ -3562,7 +3592,7 @@ PyArray_CastRawArrays(npy_intp count, PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype, int move_references) { - int aligned = 1, needs_api = 0; + int aligned; /* Make sure the copy is reasonable */ if (dst_stride == 0 
&& count > 1) { @@ -3586,15 +3616,20 @@ PyArray_CastRawArrays(npy_intp count, /* Get the function to do the casting */ NPY_cast_info cast_info; + NPY_ARRAYMETHOD_FLAGS flags; if (PyArray_GetDTypeTransferFunction(aligned, src_stride, dst_stride, src_dtype, dst_dtype, move_references, &cast_info, - &needs_api) != NPY_SUCCEED) { + &flags) != NPY_SUCCEED) { return NPY_FAIL; } + if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { + npy_clear_floatstatus_barrier((char*)&cast_info); + } + /* Cast */ char *args[2] = {src, dst}; npy_intp strides[2] = {src_stride, dst_stride}; @@ -3603,8 +3638,16 @@ PyArray_CastRawArrays(npy_intp count, /* Cleanup */ NPY_cast_info_xfree(&cast_info); - /* If needs_api was set to 1, it may have raised a Python exception */ - return (needs_api && PyErr_Occurred()) ? NPY_FAIL : NPY_SUCCEED; + if (flags & NPY_METH_REQUIRES_PYAPI && PyErr_Occurred()) { + return NPY_FAIL; + } + if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { + int fpes = npy_get_floatstatus_barrier(*args); + if (fpes && PyUFunc_GiveFloatingpointErrors("cast", fpes) < 0) { + return NPY_FAIL; + } + } + return NPY_SUCCEED; } /* diff --git a/numpy/core/src/multiarray/dtypemeta.c b/numpy/core/src/multiarray/dtypemeta.c index 577478d2a..cc99a3eca 100644 --- a/numpy/core/src/multiarray/dtypemeta.c +++ b/numpy/core/src/multiarray/dtypemeta.c @@ -613,6 +613,7 @@ string_unicode_common_dtype(PyArray_DTypeMeta *cls, PyArray_DTypeMeta *other) return cls; } + static PyArray_DTypeMeta * datetime_common_dtype(PyArray_DTypeMeta *cls, PyArray_DTypeMeta *other) { diff --git a/numpy/core/src/multiarray/dtypemeta.h b/numpy/core/src/multiarray/dtypemeta.h index e7d5505d8..618491c98 100644 --- a/numpy/core/src/multiarray/dtypemeta.h +++ b/numpy/core/src/multiarray/dtypemeta.h @@ -1,6 +1,9 @@ #ifndef NUMPY_CORE_SRC_MULTIARRAY_DTYPEMETA_H_ #define NUMPY_CORE_SRC_MULTIARRAY_DTYPEMETA_H_ +#ifdef __cplusplus +extern "C" { +#endif /* DType flags, currently private, since we may just expose functions */ 
#define NPY_DT_LEGACY 1 << 0 @@ -126,4 +129,8 @@ python_builtins_are_known_scalar_types( NPY_NO_EXPORT int dtypemeta_wrap_legacy_descriptor(PyArray_Descr *dtypem); +#ifdef __cplusplus +} +#endif + #endif /* NUMPY_CORE_SRC_MULTIARRAY_DTYPEMETA_H_ */ diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src index 3114a5896..e7b2f2c2c 100644 --- a/numpy/core/src/multiarray/einsum_sumprod.c.src +++ b/numpy/core/src/multiarray/einsum_sumprod.c.src @@ -68,7 +68,7 @@ * 0*3# * #NPYV_CHK = 0*5, * 0*5, - * 0, NPY_SIMD, NPY_SIMD_F64, 0, + * 0, NPY_SIMD_F32, NPY_SIMD_F64, 0, * 0*3# */ diff --git a/numpy/core/src/multiarray/experimental_public_dtype_api.c b/numpy/core/src/multiarray/experimental_public_dtype_api.c index cf5f152ab..441dbdc1f 100644 --- a/numpy/core/src/multiarray/experimental_public_dtype_api.c +++ b/numpy/core/src/multiarray/experimental_public_dtype_api.c @@ -300,37 +300,13 @@ PyArrayInitDTypeMeta_FromSpec( } -/* Function is defined in umath/dispatching.c (same/one compilation unit) */ +/* Functions defined in umath/dispatching.c (same/one compilation unit) */ NPY_NO_EXPORT int PyUFunc_AddLoop(PyUFuncObject *ufunc, PyObject *info, int ignore_duplicate); -static int -PyUFunc_AddLoopFromSpec(PyObject *ufunc, PyArrayMethod_Spec *spec) -{ - if (!PyObject_TypeCheck(ufunc, &PyUFunc_Type)) { - PyErr_SetString(PyExc_TypeError, - "ufunc object passed is not a ufunc!"); - return -1; - } - PyBoundArrayMethodObject *bmeth = - (PyBoundArrayMethodObject *)PyArrayMethod_FromSpec(spec); - if (bmeth == NULL) { - return -1; - } - int nargs = bmeth->method->nin + bmeth->method->nout; - PyObject *dtypes = PyArray_TupleFromItems( - nargs, (PyObject **)bmeth->dtypes, 1); - if (dtypes == NULL) { - return -1; - } - PyObject *info = PyTuple_Pack(2, dtypes, bmeth->method); - Py_DECREF(bmeth); - Py_DECREF(dtypes); - if (info == NULL) { - return -1; - } - return PyUFunc_AddLoop((PyUFuncObject *)ufunc, info, 0); -} +NPY_NO_EXPORT int 
+PyUFunc_AddLoopFromSpec(PyUFuncObject *ufunc, PyObject *info, int ignore_duplicate); + /* * Function is defined in umath/wrapping_array_method.c diff --git a/numpy/core/src/multiarray/iterators.c b/numpy/core/src/multiarray/iterators.c index f959162fd..95aa11d2d 100644 --- a/numpy/core/src/multiarray/iterators.c +++ b/numpy/core/src/multiarray/iterators.c @@ -827,7 +827,8 @@ iter_ass_subscript(PyArrayIterObject *self, PyObject *ind, PyObject *val) if (PyBool_Check(ind)) { retval = 0; if (PyObject_IsTrue(ind)) { - retval = PyArray_Pack(PyArray_DESCR(self->ao), self->dataptr, val); + retval = PyArray_Pack( + PyArray_DESCR(self->ao), self->dataptr, val); } goto finish; } diff --git a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src index e313d2447..8e3afd3cc 100644 --- a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src +++ b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src @@ -13,6 +13,7 @@ #define NPY_NO_DEPRECATED_API NPY_API_VERSION #define _MULTIARRAYMODULE +#define _UMATHMODULE #include <numpy/arrayobject.h> #include <numpy/npy_cpu.h> #include <numpy/halffloat.h> @@ -22,6 +23,7 @@ #include "array_method.h" #include "usertypes.h" +#include "umathmodule.h" /* * x86 platform works with unaligned access but the compiler is allowed to @@ -1557,14 +1559,16 @@ mapiter_trivial_@name@(PyArrayObject *self, PyArrayObject *ind, * General advanced indexing iteration. 
*/ NPY_NO_EXPORT int -mapiter_@name@(PyArrayMapIterObject *mit) +mapiter_@name@( + PyArrayMapIterObject *mit, NPY_cast_info *cast_info, + NPY_ARRAYMETHOD_FLAGS flags, int is_aligned) { npy_intp *counter, count; - int i, is_aligned; + int i; /* Cached mit info */ int numiter = mit->numiter; - int needs_api = mit->needs_api; + int needs_api = (flags & NPY_METH_REQUIRES_PYAPI) != 0; /* Constant information */ npy_intp fancy_dims[NPY_MAXDIMS]; npy_intp fancy_strides[NPY_MAXDIMS]; @@ -1586,13 +1590,6 @@ mapiter_@name@(PyArrayMapIterObject *mit) fancy_strides[i] = mit->fancy_strides[i]; } - /* - * Alignment information (swapping is never needed, since we buffer), - * could also check extra_op is buffered, but it should rarely matter. - */ - - is_aligned = IsUintAligned(array) && IsUintAligned(mit->extra_op); - if (mit->size == 0) { return 0; } @@ -1600,9 +1597,11 @@ mapiter_@name@(PyArrayMapIterObject *mit) if (mit->subspace_iter == NULL) { /* * Item by item copy situation, the operand is buffered - * so use copyswap. + * so use copyswap. The iterator may not do any transfers, so may + * not have set `needs_api` yet, set it if necessary: */ - PyArray_CopySwapFunc *copyswap = PyArray_DESCR(array)->f->copyswap; + needs_api |= PyDataType_REFCHK(PyArray_DESCR(array)); + PyArray_CopySwapFunc *copyswap = PyArray_DESCR(array)->f->copyswap; /* We have only one iterator handling everything */ counter = NpyIter_GetInnerLoopSizePtr(mit->outer); @@ -1715,28 +1714,9 @@ mapiter_@name@(PyArrayMapIterObject *mit) int is_subiter_trivial = 0; /* has three states */ npy_intp reset_offsets[2] = {0, 0}; - /* Use strided transfer functions for the inner loop */ - npy_intp fixed_strides[2]; - - /* - * Get a dtype transfer function, since there are no - * buffers, this is safe. 
- */ - NpyIter_GetInnerFixedStrideArray(mit->subspace_iter, fixed_strides); - - NPY_cast_info cast_info; - if (PyArray_GetDTypeTransferFunction(is_aligned, -#if @isget@ - fixed_strides[0], fixed_strides[1], - PyArray_DESCR(array), PyArray_DESCR(mit->extra_op), -#else - fixed_strides[1], fixed_strides[0], - PyArray_DESCR(mit->extra_op), PyArray_DESCR(array), -#endif - 0, - &cast_info, - &needs_api) != NPY_SUCCEED) { - return -1; + /* Note: it may make sense to refactor `needs_api` out in this branch */ + if (flags & NPY_METH_REQUIRES_PYAPI) { + needs_api = 1; } counter = NpyIter_GetInnerLoopSizePtr(mit->subspace_iter); @@ -1771,7 +1751,6 @@ mapiter_@name@(PyArrayMapIterObject *mit) #if @isget@ && @one_iter@ if (check_and_adjust_index(&indval, fancy_dims[i], iteraxis, _save) < 0 ) { - NPY_cast_info_xfree(&cast_info); return -1; } #else @@ -1803,7 +1782,6 @@ mapiter_@name@(PyArrayMapIterObject *mit) &errmsg)) { NPY_END_THREADS; PyErr_SetString(PyExc_ValueError, errmsg); - NPY_cast_info_xfree(&cast_info); return -1; } if (is_subiter_trivial != 0) { @@ -1833,7 +1811,6 @@ mapiter_@name@(PyArrayMapIterObject *mit) * not at all... 
*/ if (needs_api && PyErr_Occurred()) { - NPY_cast_info_xfree(&cast_info); return -1; } #endif @@ -1841,21 +1818,19 @@ mapiter_@name@(PyArrayMapIterObject *mit) do { #if @isget@ - if (NPY_UNLIKELY(cast_info.func(&cast_info.context, + if (NPY_UNLIKELY(cast_info->func(&cast_info->context, subspace_ptrs, counter, subspace_strides, - cast_info.auxdata) < 0)) { + cast_info->auxdata) < 0)) { NPY_END_THREADS; - NPY_cast_info_xfree(&cast_info); return -1; } #else /* The operand order is reversed here */ char *args[2] = {subspace_ptrs[1], subspace_ptrs[0]}; npy_intp strides[2] = {subspace_strides[1], subspace_strides[0]}; - if (NPY_UNLIKELY(cast_info.func(&cast_info.context, - args, counter, strides, cast_info.auxdata) < 0)) { + if (NPY_UNLIKELY(cast_info->func(&cast_info->context, + args, counter, strides, cast_info->auxdata) < 0)) { NPY_END_THREADS; - NPY_cast_info_xfree(&cast_info); return -1; } #endif @@ -1866,8 +1841,6 @@ mapiter_@name@(PyArrayMapIterObject *mit) NPY_END_THREADS; } /**end repeat1**/ - - NPY_cast_info_xfree(&cast_info); } return 0; } diff --git a/numpy/core/src/multiarray/mapping.c b/numpy/core/src/multiarray/mapping.c index 1a2ade11b..98c2d7eda 100644 --- a/numpy/core/src/multiarray/mapping.c +++ b/numpy/core/src/multiarray/mapping.c @@ -1,11 +1,14 @@ #define NPY_NO_DEPRECATED_API NPY_API_VERSION #define _MULTIARRAYMODULE +#define _UMATHMODULE #define PY_SSIZE_T_CLEAN #include <Python.h> #include <structmember.h> #include "numpy/arrayobject.h" +#include "numpy/npy_math.h" + #include "arrayobject.h" #include "npy_config.h" @@ -23,6 +26,11 @@ #include "mem_overlap.h" #include "array_assign.h" #include "array_coercion.h" +/* TODO: Only for `NpyIter_GetTransferFlags` until it is public */ +#define NPY_ITERATOR_IMPLEMENTATION_CODE +#include "nditer_impl.h" + +#include "umathmodule.h" #define HAS_INTEGER 1 @@ -914,7 +922,6 @@ array_boolean_subscript(PyArrayObject *self, char *ret_data; PyArray_Descr *dtype; PyArrayObject *ret; - int needs_api = 0; size = 
count_boolean_trues(PyArray_NDIM(bmask), PyArray_DATA(bmask), PyArray_DIMS(bmask), PyArray_STRIDES(bmask)); @@ -962,13 +969,18 @@ array_boolean_subscript(PyArrayObject *self, /* Get a dtype transfer function */ NpyIter_GetInnerFixedStrideArray(iter, fixed_strides); NPY_cast_info cast_info; + /* + * TODO: Ignoring cast flags, since this is only ever a copy. In + * principle that may not be quite right in some future? + */ + NPY_ARRAYMETHOD_FLAGS cast_flags; if (PyArray_GetDTypeTransferFunction( IsUintAligned(self) && IsAligned(self), fixed_strides[0], itemsize, dtype, dtype, 0, &cast_info, - &needs_api) != NPY_SUCCEED) { + &cast_flags) != NPY_SUCCEED) { Py_DECREF(ret); NpyIter_Deallocate(iter); return NULL; @@ -1068,7 +1080,6 @@ array_assign_boolean_subscript(PyArrayObject *self, { npy_intp size, v_stride; char *v_data; - int needs_api = 0; npy_intp bmask_size; if (PyArray_DESCR(bmask)->type_num != NPY_BOOL) { @@ -1164,6 +1175,7 @@ array_assign_boolean_subscript(PyArrayObject *self, /* Get a dtype transfer function */ NpyIter_GetInnerFixedStrideArray(iter, fixed_strides); NPY_cast_info cast_info; + NPY_ARRAYMETHOD_FLAGS cast_flags; if (PyArray_GetDTypeTransferFunction( IsUintAligned(self) && IsAligned(self) && IsUintAligned(v) && IsAligned(v), @@ -1171,14 +1183,17 @@ array_assign_boolean_subscript(PyArrayObject *self, PyArray_DESCR(v), PyArray_DESCR(self), 0, &cast_info, - &needs_api) != NPY_SUCCEED) { + &cast_flags) != NPY_SUCCEED) { NpyIter_Deallocate(iter); return -1; } - if (!needs_api) { + if (!(cast_flags & NPY_METH_REQUIRES_PYAPI)) { NPY_BEGIN_THREADS_NDITER(iter); } + if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { + npy_clear_floatstatus_barrier((char *)self); + } npy_intp strides[2] = {v_stride, self_stride}; @@ -1209,7 +1224,7 @@ array_assign_boolean_subscript(PyArrayObject *self, } } while (iternext(iter)); - if (!needs_api) { + if (!(cast_flags & NPY_METH_REQUIRES_PYAPI)) { NPY_END_THREADS; } @@ -1217,6 +1232,12 @@ 
array_assign_boolean_subscript(PyArrayObject *self, if (!NpyIter_Deallocate(iter)) { res = -1; } + if (res == 0 && !(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { + int fpes = npy_get_floatstatus_barrier((char *)self); + if (fpes && PyUFunc_GiveFloatingpointErrors("cast", fpes) < 0) { + return -1; + } + } } return res; @@ -1414,6 +1435,8 @@ array_subscript(PyArrayObject *self, PyObject *op) int index_type; int index_num; int i, ndim, fancy_ndim; + NPY_cast_info cast_info = {.func = NULL}; + /* * Index info array. We can have twice as many indices as dimensions * (because of None). The + 1 is to not need to check as much. @@ -1579,7 +1602,43 @@ array_subscript(PyArrayObject *self, PyObject *op) goto finish; } - if (mapiter_get(mit) < 0) { + /* + * Alignment information (swapping is never needed, since we buffer), + * could also check extra_op is buffered, but it should rarely matter. + */ + int is_aligned = IsUintAligned(self) && IsUintAligned(mit->extra_op); + /* + * NOTE: Getting never actually casts, so we currently do not bother to do + * the full checks (floating point errors) here (unlike assignment). + */ + int meth_flags = NpyIter_GetTransferFlags(mit->outer); + if (mit->extra_op_iter) { + int extra_op_flags = NpyIter_GetTransferFlags(mit->extra_op_iter); + meth_flags = PyArrayMethod_COMBINED_FLAGS(meth_flags, extra_op_flags); + } + + if (mit->subspace_iter != NULL) { + int extra_op_flags = NpyIter_GetTransferFlags(mit->subspace_iter); + meth_flags = PyArrayMethod_COMBINED_FLAGS(meth_flags, extra_op_flags); + + NPY_ARRAYMETHOD_FLAGS transfer_flags; + npy_intp fixed_strides[2]; + /* + * Get a dtype transfer function, since there are no + * buffers, this is safe. 
+ */ + NpyIter_GetInnerFixedStrideArray(mit->subspace_iter, fixed_strides); + + if (PyArray_GetDTypeTransferFunction(is_aligned, + fixed_strides[0], fixed_strides[1], + PyArray_DESCR(self), PyArray_DESCR(mit->extra_op), + 0, &cast_info, &transfer_flags) != NPY_SUCCEED) { + goto finish; + } + meth_flags = PyArrayMethod_COMBINED_FLAGS(meth_flags, transfer_flags); + } + + if (mapiter_get(mit, &cast_info, meth_flags, is_aligned) < 0) { goto finish; } @@ -1614,6 +1673,7 @@ array_subscript(PyArrayObject *self, PyObject *op) } finish: + NPY_cast_info_xfree(&cast_info); Py_XDECREF(mit); Py_XDECREF(view); /* Clean up indices */ @@ -1699,6 +1759,9 @@ array_assign_subscript(PyArrayObject *self, PyObject *ind, PyObject *op) PyArrayMapIterObject *mit = NULL; + /* When a subspace is used, casting is done manually. */ + NPY_cast_info cast_info = {.func = NULL}; + if (op == NULL) { PyErr_SetString(PyExc_ValueError, "cannot delete array elements"); @@ -1871,7 +1934,6 @@ array_assign_subscript(PyArrayObject *self, PyObject *ind, PyObject *op) index_num == 1 && tmp_arr) { /* The array being indexed has one dimension and it is a fancy index */ PyArrayObject *ind = (PyArrayObject*)indices[0].object; - /* Check if the type is equivalent */ if (PyArray_EquivTypes(PyArray_DESCR(self), PyArray_DESCR(tmp_arr)) && @@ -1935,12 +1997,50 @@ array_assign_subscript(PyArrayObject *self, PyObject *ind, PyObject *op) } } - /* Can now reset the outer iterator (delayed bufalloc) */ - if (NpyIter_Reset(mit->outer, NULL) < 0) { + if (PyArray_MapIterCheckIndices(mit) < 0) { goto fail; } - if (PyArray_MapIterCheckIndices(mit) < 0) { + /* + * Alignment information (swapping is never needed, since we buffer), + * could also check extra_op is buffered, but it should rarely matter. 
+ */ + int is_aligned = IsUintAligned(self) && IsUintAligned(mit->extra_op); + int meth_flags = NpyIter_GetTransferFlags(mit->outer); + + if (mit->extra_op_iter) { + int extra_op_flags = NpyIter_GetTransferFlags(mit->extra_op_iter); + meth_flags = PyArrayMethod_COMBINED_FLAGS(meth_flags, extra_op_flags); + } + + if (mit->subspace_iter != NULL) { + int extra_op_flags = NpyIter_GetTransferFlags(mit->subspace_iter); + meth_flags = PyArrayMethod_COMBINED_FLAGS(meth_flags, extra_op_flags); + + NPY_ARRAYMETHOD_FLAGS transfer_flags; + npy_intp fixed_strides[2]; + + /* + * Get a dtype transfer function, since there are no + * buffers, this is safe. + */ + NpyIter_GetInnerFixedStrideArray(mit->subspace_iter, fixed_strides); + + if (PyArray_GetDTypeTransferFunction(is_aligned, + fixed_strides[1], fixed_strides[0], + PyArray_DESCR(mit->extra_op), PyArray_DESCR(self), + 0, &cast_info, &transfer_flags) != NPY_SUCCEED) { + goto fail; + } + meth_flags = PyArrayMethod_COMBINED_FLAGS(meth_flags, transfer_flags); + } + + if (!(meth_flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { + npy_clear_floatstatus_barrier((char *)mit); + } + + /* Can now reset the outer iterator (delayed bufalloc) */ + if (NpyIter_Reset(mit->outer, NULL) < 0) { goto fail; } @@ -1948,11 +2048,17 @@ array_assign_subscript(PyArrayObject *self, PyObject *ind, PyObject *op) * Could add a casting check, but apparently most assignments do * not care about safe casting. 
*/ - - if (mapiter_set(mit) < 0) { + if (mapiter_set(mit, &cast_info, meth_flags, is_aligned) < 0) { goto fail; } + if (!(meth_flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { + int fpes = npy_get_floatstatus_barrier((char *)mit); + if (fpes && PyUFunc_GiveFloatingpointErrors("cast", fpes) < 0) { + goto fail; + } + } + Py_DECREF(mit); goto success; @@ -1961,6 +2067,8 @@ array_assign_subscript(PyArrayObject *self, PyObject *ind, PyObject *op) Py_XDECREF((PyObject *)view); Py_XDECREF((PyObject *)tmp_arr); Py_XDECREF((PyObject *)mit); + NPY_cast_info_xfree(&cast_info); + for (i=0; i < index_num; i++) { Py_XDECREF(indices[i].object); } @@ -1969,6 +2077,8 @@ array_assign_subscript(PyArrayObject *self, PyObject *ind, PyObject *op) success: Py_XDECREF((PyObject *)view); Py_XDECREF((PyObject *)tmp_arr); + NPY_cast_info_xfree(&cast_info); + for (i=0; i < index_num; i++) { Py_XDECREF(indices[i].object); } @@ -2089,7 +2199,7 @@ _nonzero_indices(PyObject *myBool, PyArrayObject **arrays) /* Reset the map iterator to the beginning */ -NPY_NO_EXPORT void +NPY_NO_EXPORT int PyArray_MapIterReset(PyArrayMapIterObject *mit) { npy_intp indval; @@ -2097,12 +2207,16 @@ PyArray_MapIterReset(PyArrayMapIterObject *mit) int i; if (mit->size == 0) { - return; + return 0; } - NpyIter_Reset(mit->outer, NULL); + if (!NpyIter_Reset(mit->outer, NULL)) { + return -1; + } if (mit->extra_op_iter) { - NpyIter_Reset(mit->extra_op_iter, NULL); + if (!NpyIter_Reset(mit->extra_op_iter, NULL)) { + return -1; + } baseptrs[1] = mit->extra_op_ptrs[0]; } @@ -2119,14 +2233,16 @@ PyArray_MapIterReset(PyArrayMapIterObject *mit) mit->dataptr = baseptrs[0]; if (mit->subspace_iter) { - NpyIter_ResetBasePointers(mit->subspace_iter, baseptrs, NULL); + if (!NpyIter_ResetBasePointers(mit->subspace_iter, baseptrs, NULL)) { + return -1; + } mit->iter_count = *NpyIter_GetInnerLoopSizePtr(mit->subspace_iter); } else { mit->iter_count = *NpyIter_GetInnerLoopSizePtr(mit->outer); } - return; + return 0; } @@ -2592,13 +2708,14 @@ 
PyArray_MapIterNew(npy_index_info *indices , int index_num, int index_type, } /* create new MapIter object */ - mit = (PyArrayMapIterObject *)PyArray_malloc(sizeof(PyArrayMapIterObject)); + mit = (PyArrayMapIterObject *)PyArray_malloc( + sizeof(PyArrayMapIterObject) + sizeof(NPY_cast_info)); if (mit == NULL) { Py_DECREF(intp_descr); return NULL; } /* set all attributes of mapiter to zero */ - memset(mit, 0, sizeof(PyArrayMapIterObject)); + memset(mit, 0, sizeof(PyArrayMapIterObject) + sizeof(NPY_cast_info)); PyObject_Init((PyObject *)mit, &PyArrayMapIter_Type); Py_INCREF(arr); @@ -2874,6 +2991,11 @@ PyArray_MapIterNew(npy_index_info *indices , int index_num, int index_type, /* If external array is iterated, and no subspace is needed */ nops = mit->numiter; + + if (!uses_subspace) { + outer_flags |= NPY_ITER_EXTERNAL_LOOP; + } + if (extra_op_flags && !uses_subspace) { /* * NOTE: This small limitation should practically not matter. @@ -2921,9 +3043,6 @@ PyArray_MapIterNew(npy_index_info *indices , int index_num, int index_type, if (mit->outer == NULL) { goto fail; } - if (!uses_subspace) { - NpyIter_EnableExternalLoop(mit->outer); - } mit->outer_next = NpyIter_GetIterNext(mit->outer, NULL); if (mit->outer_next == NULL) { @@ -3061,7 +3180,7 @@ PyArray_MapIterNew(npy_index_info *indices , int index_num, int index_type, mit->subspace_ptrs = NpyIter_GetDataPtrArray(mit->subspace_iter); mit->subspace_strides = NpyIter_GetInnerStrideArray(mit->subspace_iter); - if (NpyIter_IterationNeedsAPI(mit->outer)) { + if (NpyIter_IterationNeedsAPI(mit->subspace_iter)) { mit->needs_api = 1; /* * NOTE: In this case, need to call PyErr_Occurred() after @@ -3212,9 +3331,12 @@ PyArray_MapIterArrayCopyIfOverlap(PyArrayObject * a, PyObject * index, goto fail; } + if (PyArray_MapIterReset(mit) < 0) { + goto fail; + } + Py_XDECREF(a_copy); Py_XDECREF(subspace); - PyArray_MapIterReset(mit); for (i=0; i < index_num; i++) { Py_XDECREF(indices[i].object); diff --git 
a/numpy/core/src/multiarray/mapping.h b/numpy/core/src/multiarray/mapping.h index e929b8b3f..4e5d06238 100644 --- a/numpy/core/src/multiarray/mapping.h +++ b/numpy/core/src/multiarray/mapping.h @@ -51,7 +51,7 @@ array_assign_item(PyArrayObject *self, Py_ssize_t i, PyObject *v); * Prototypes for Mapping calls --- not part of the C-API * because only useful as part of a getitem call. */ -NPY_NO_EXPORT void +NPY_NO_EXPORT int PyArray_MapIterReset(PyArrayMapIterObject *mit); NPY_NO_EXPORT void diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c index 5209d6914..96d0c893d 100644 --- a/numpy/core/src/multiarray/multiarraymodule.c +++ b/numpy/core/src/multiarray/multiarraymodule.c @@ -85,6 +85,10 @@ NPY_NO_EXPORT int NPY_NUMUSERTYPES = 0; NPY_NO_EXPORT int initscalarmath(PyObject *); NPY_NO_EXPORT int set_matmul_flags(PyObject *d); /* in ufunc_object.c */ +/* From umath/string_ufuncs.cpp/h */ +NPY_NO_EXPORT PyObject * +_umath_strings_richcompare( + PyArrayObject *self, PyArrayObject *other, int cmp_op, int rstrip); /* * global variable to determine if legacy printing is enabled, accessible from @@ -138,12 +142,12 @@ PyArray_GetPriority(PyObject *obj, double default_) } priority = PyFloat_AsDouble(ret); + Py_DECREF(ret); if (error_converting(priority)) { /* TODO[gh-14801]: propagate crashes for bad priority? */ PyErr_Clear(); return default_; } - Py_DECREF(ret); return priority; } @@ -3726,6 +3730,12 @@ format_longfloat(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds) TrimMode_LeaveOneZero, -1, -1); } + +/* + * The only purpose of this function is that it allows the "rstrip". + * From my (@seberg's) perspective, this function should be deprecated + * and I do not think it matters if it is not particularly fast. 
+ */ static PyObject * compare_chararrays(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds) { @@ -3791,7 +3801,7 @@ compare_chararrays(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds) return NULL; } if (PyArray_ISSTRING(newarr) && PyArray_ISSTRING(newoth)) { - res = _strings_richcompare(newarr, newoth, cmp_op, rstrip != 0); + res = _umath_strings_richcompare(newarr, newoth, cmp_op, rstrip != 0); } else { PyErr_SetString(PyExc_TypeError, diff --git a/numpy/core/src/multiarray/nditer_api.c b/numpy/core/src/multiarray/nditer_api.c index 860c8c1f6..b80312e06 100644 --- a/numpy/core/src/multiarray/nditer_api.c +++ b/numpy/core/src/multiarray/nditer_api.c @@ -857,6 +857,13 @@ NpyIter_RequiresBuffering(NpyIter *iter) * Whether the iteration loop, and in particular the iternext() * function, needs API access. If this is true, the GIL must * be retained while iterating. + * + * NOTE: Internally (currently), `NpyIter_GetTransferFlags` will + * additionally provide information on whether floating point errors + * may be given during casts. The flags only require the API use + * necessary for buffering though. So an iterate which does not require + * buffering may indicate `NpyIter_IterationNeedsAPI`, but not include + * the flag in `NpyIter_GetTransferFlags`. */ NPY_NO_EXPORT npy_bool NpyIter_IterationNeedsAPI(NpyIter *iter) @@ -864,6 +871,21 @@ NpyIter_IterationNeedsAPI(NpyIter *iter) return (NIT_ITFLAGS(iter)&NPY_ITFLAG_NEEDSAPI) != 0; } + +/* + * Fetch the ArrayMethod (runtime) flags for all "transfer functions' (i.e. + * copy to buffer/casts). + * + * TODO: This should be public API, but that only makes sense when the + * ArrayMethod API is made public. 
+ */ +NPY_NO_EXPORT int +NpyIter_GetTransferFlags(NpyIter *iter) +{ + return NIT_ITFLAGS(iter) >> NPY_ITFLAG_TRANSFERFLAGS_SHIFT; +} + + /*NUMPY_API * Gets the number of dimensions being iterated */ diff --git a/numpy/core/src/multiarray/nditer_constr.c b/numpy/core/src/multiarray/nditer_constr.c index f82a9624e..a383c63e8 100644 --- a/numpy/core/src/multiarray/nditer_constr.c +++ b/numpy/core/src/multiarray/nditer_constr.c @@ -3141,7 +3141,9 @@ npyiter_allocate_transfer_functions(NpyIter *iter) npy_intp *strides = NAD_STRIDES(axisdata), op_stride; NpyIter_TransferInfo *transferinfo = NBF_TRANSFERINFO(bufferdata); - int needs_api = 0; + /* combined cast flags, the new cast flags for each cast: */ + NPY_ARRAYMETHOD_FLAGS cflags = PyArrayMethod_MINIMAL_FLAGS; + NPY_ARRAYMETHOD_FLAGS nc_flags; for (iop = 0; iop < nop; ++iop) { npyiter_opitflags flags = op_itflags[iop]; @@ -3167,10 +3169,11 @@ npyiter_allocate_transfer_functions(NpyIter *iter) op_dtype[iop], move_references, &transferinfo[iop].read, - &needs_api) != NPY_SUCCEED) { + &nc_flags) != NPY_SUCCEED) { iop -= 1; /* This one cannot be cleaned up yet. 
*/ goto fail; } + cflags = PyArrayMethod_COMBINED_FLAGS(cflags, nc_flags); } else { transferinfo[iop].read.func = NULL; @@ -3199,9 +3202,10 @@ npyiter_allocate_transfer_functions(NpyIter *iter) mask_dtype, move_references, &transferinfo[iop].write, - &needs_api) != NPY_SUCCEED) { + &nc_flags) != NPY_SUCCEED) { goto fail; } + cflags = PyArrayMethod_COMBINED_FLAGS(cflags, nc_flags); } else { if (PyArray_GetDTypeTransferFunction( @@ -3212,9 +3216,10 @@ npyiter_allocate_transfer_functions(NpyIter *iter) PyArray_DESCR(op[iop]), move_references, &transferinfo[iop].write, - &needs_api) != NPY_SUCCEED) { + &nc_flags) != NPY_SUCCEED) { goto fail; } + cflags = PyArrayMethod_COMBINED_FLAGS(cflags, nc_flags); } } /* If no write back but there are references make a decref fn */ @@ -3230,9 +3235,10 @@ npyiter_allocate_transfer_functions(NpyIter *iter) op_dtype[iop], NULL, 1, &transferinfo[iop].write, - &needs_api) != NPY_SUCCEED) { + &nc_flags) != NPY_SUCCEED) { goto fail; } + cflags = PyArrayMethod_COMBINED_FLAGS(cflags, nc_flags); } else { transferinfo[iop].write.func = NULL; @@ -3244,8 +3250,12 @@ npyiter_allocate_transfer_functions(NpyIter *iter) } } - /* If any of the dtype transfer functions needed the API, flag it */ - if (needs_api) { + /* Store the combined transfer flags on the iterator */ + NIT_ITFLAGS(iter) |= cflags << NPY_ITFLAG_TRANSFERFLAGS_SHIFT; + assert(NIT_ITFLAGS(iter) >> NPY_ITFLAG_TRANSFERFLAGS_SHIFT == cflags); + + /* If any of the dtype transfer functions needed the API, flag it. 
*/ + if (cflags & NPY_METH_REQUIRES_PYAPI) { NIT_ITFLAGS(iter) |= NPY_ITFLAG_NEEDSAPI; } diff --git a/numpy/core/src/multiarray/nditer_impl.h b/numpy/core/src/multiarray/nditer_impl.h index 2a82b7e54..459675ea8 100644 --- a/numpy/core/src/multiarray/nditer_impl.h +++ b/numpy/core/src/multiarray/nditer_impl.h @@ -76,33 +76,38 @@ /* Internal iterator flags */ /* The perm is the identity */ -#define NPY_ITFLAG_IDENTPERM 0x0001 +#define NPY_ITFLAG_IDENTPERM (1 << 0) /* The perm has negative entries (indicating flipped axes) */ -#define NPY_ITFLAG_NEGPERM 0x0002 +#define NPY_ITFLAG_NEGPERM (1 << 1) /* The iterator is tracking an index */ -#define NPY_ITFLAG_HASINDEX 0x0004 +#define NPY_ITFLAG_HASINDEX (1 << 2) /* The iterator is tracking a multi-index */ -#define NPY_ITFLAG_HASMULTIINDEX 0x0008 +#define NPY_ITFLAG_HASMULTIINDEX (1 << 3) /* The iteration order was forced on construction */ -#define NPY_ITFLAG_FORCEDORDER 0x0010 +#define NPY_ITFLAG_FORCEDORDER (1 << 4) /* The inner loop is handled outside the iterator */ -#define NPY_ITFLAG_EXLOOP 0x0020 +#define NPY_ITFLAG_EXLOOP (1 << 5) /* The iterator is ranged */ -#define NPY_ITFLAG_RANGE 0x0040 +#define NPY_ITFLAG_RANGE (1 << 6) /* The iterator is buffered */ -#define NPY_ITFLAG_BUFFER 0x0080 +#define NPY_ITFLAG_BUFFER (1 << 7) /* The iterator should grow the buffered inner loop when possible */ -#define NPY_ITFLAG_GROWINNER 0x0100 +#define NPY_ITFLAG_GROWINNER (1 << 8) /* There is just one iteration, can specialize iternext for that */ -#define NPY_ITFLAG_ONEITERATION 0x0200 +#define NPY_ITFLAG_ONEITERATION (1 << 9) /* Delay buffer allocation until first Reset* call */ -#define NPY_ITFLAG_DELAYBUF 0x0400 +#define NPY_ITFLAG_DELAYBUF (1 << 10) /* Iteration needs API access during iternext */ -#define NPY_ITFLAG_NEEDSAPI 0x0800 +#define NPY_ITFLAG_NEEDSAPI (1 << 11) /* Iteration includes one or more operands being reduced */ -#define NPY_ITFLAG_REDUCE 0x1000 +#define NPY_ITFLAG_REDUCE (1 << 12) /* Reduce iteration 
doesn't need to recalculate reduce loops next time */ -#define NPY_ITFLAG_REUSE_REDUCE_LOOPS 0x2000 +#define NPY_ITFLAG_REUSE_REDUCE_LOOPS (1 << 13) +/* + * Offset of (combined) ArrayMethod flags for all transfer functions. + * For now, we use the top 8 bits. + */ +#define NPY_ITFLAG_TRANSFERFLAGS_SHIFT 24 /* Internal iterator per-operand iterator flags */ @@ -356,4 +361,12 @@ npyiter_copy_to_buffers(NpyIter *iter, char **prev_dataptrs); NPY_NO_EXPORT void npyiter_clear_buffers(NpyIter *iter); +/* + * Function to get the ArrayMethod flags of the transfer functions. + * TODO: This function should be public and removed from `nditer_impl.h`, but + * this requires making the ArrayMethod flags public API first. + */ +NPY_NO_EXPORT int +NpyIter_GetTransferFlags(NpyIter *iter); + #endif /* NUMPY_CORE_SRC_MULTIARRAY_NDITER_IMPL_H_ */ diff --git a/numpy/core/src/multiarray/textreading/readtext.c b/numpy/core/src/multiarray/textreading/readtext.c index 9804fd462..a5db1cb77 100644 --- a/numpy/core/src/multiarray/textreading/readtext.c +++ b/numpy/core/src/multiarray/textreading/readtext.c @@ -270,6 +270,10 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod), } /* Calloc just to not worry about overflow */ usecols = PyMem_Calloc(num_usecols, sizeof(Py_ssize_t)); + if (usecols == NULL) { + PyErr_NoMemory(); + return NULL; + } for (Py_ssize_t i = 0; i < num_usecols; i++) { PyObject *tmp = PySequence_GetItem(usecols_obj, i); if (tmp == NULL) { diff --git a/numpy/core/src/multiarray/textreading/rows.c b/numpy/core/src/multiarray/textreading/rows.c index e30ff835e..a72fb79d9 100644 --- a/numpy/core/src/multiarray/textreading/rows.c +++ b/numpy/core/src/multiarray/textreading/rows.c @@ -91,7 +91,7 @@ create_conv_funcs( if (column < -num_fields || column >= num_fields) { PyErr_Format(PyExc_ValueError, "converter specified for column %zd, which is invalid " - "for the number of fields %d.", column, num_fields); + "for the number of fields %zd.", column, num_fields); goto error; } if 
(column < 0) { @@ -319,7 +319,7 @@ read_rows(stream *s, if (!usecols && (actual_num_fields != current_num_fields)) { PyErr_Format(PyExc_ValueError, - "the number of columns changed from %d to %d at row %zu; " + "the number of columns changed from %zd to %zd at row %zd; " "use `usecols` to select a subset and avoid this error", actual_num_fields, current_num_fields, row_count+1); goto error; @@ -382,9 +382,9 @@ read_rows(stream *s, } if (NPY_UNLIKELY((col < 0) || (col >= current_num_fields))) { PyErr_Format(PyExc_ValueError, - "invalid column index %d at row %zu with %d " + "invalid column index %zd at row %zd with %zd " "columns", - usecols[i], current_num_fields, row_count+1); + usecols[i], row_count+1, current_num_fields); goto error; } } @@ -419,7 +419,7 @@ read_rows(stream *s, } PyErr_Format(PyExc_ValueError, "could not convert string %.100R to %S at " - "row %zu, column %d.", + "row %zd, column %zd.", string, field_types[f].descr, row_count, col+1); Py_DECREF(string); npy_PyErr_ChainExceptionsCause(exc, val, tb); @@ -432,7 +432,12 @@ read_rows(stream *s, } tokenizer_clear(&ts); - PyMem_FREE(conv_funcs); + if (conv_funcs != NULL) { + for (Py_ssize_t i = 0; i < actual_num_fields; i++) { + Py_XDECREF(conv_funcs[i]); + } + PyMem_FREE(conv_funcs); + } if (data_array == NULL) { assert(row_count == 0 && result_shape[0] == 0); @@ -474,7 +479,12 @@ read_rows(stream *s, return data_array; error: - PyMem_FREE(conv_funcs); + if (conv_funcs != NULL) { + for (Py_ssize_t i = 0; i < actual_num_fields; i++) { + Py_XDECREF(conv_funcs[i]); + } + PyMem_FREE(conv_funcs); + } tokenizer_clear(&ts); Py_XDECREF(data_array); return NULL; diff --git a/numpy/core/src/npymath/ieee754.c.src b/numpy/core/src/npymath/ieee754.c.src index 4e6ddb712..5d1ea3a69 100644 --- a/numpy/core/src/npymath/ieee754.c.src +++ b/numpy/core/src/npymath/ieee754.c.src @@ -566,228 +566,38 @@ int npy_get_floatstatus() { return npy_get_floatstatus_barrier(&x); } -/* - * Functions to set the floating point status 
word. - */ - -#if (defined(__unix__) || defined(unix)) && !defined(USG) -#include <sys/param.h> -#endif - /* - * Define floating point status functions. We must define - * npy_get_floatstatus_barrier, npy_clear_floatstatus_barrier, - * npy_set_floatstatus_{divbyzero, overflow, underflow, invalid} - * for all supported platforms. + * General C99 code for floating point error handling. These functions mainly + * exists, because `fenv.h` was not standardized in C89 so they gave better + * portability. This should be unnecessary with C99/C++11 and further + * functionality can be used from `fenv.h` directly. */ - - -/* Solaris --------------------------------------------------------*/ -/* --------ignoring SunOS ieee_flags approach, someone else can -** deal with that! */ -#if defined(sun) || defined(__BSD__) || defined(__OpenBSD__) || \ - (defined(__FreeBSD__) && (__FreeBSD_version < 502114)) || \ - defined(__NetBSD__) -#include <ieeefp.h> - -int npy_get_floatstatus_barrier(char * param) -{ - int fpstatus = fpgetsticky(); - /* - * By using a volatile, the compiler cannot reorder this call - */ - if (param != NULL) { - volatile char NPY_UNUSED(c) = *(char*)param; - } - return ((FP_X_DZ & fpstatus) ? NPY_FPE_DIVIDEBYZERO : 0) | - ((FP_X_OFL & fpstatus) ? NPY_FPE_OVERFLOW : 0) | - ((FP_X_UFL & fpstatus) ? NPY_FPE_UNDERFLOW : 0) | - ((FP_X_INV & fpstatus) ? 
NPY_FPE_INVALID : 0); -} - -int npy_clear_floatstatus_barrier(char * param) -{ - int fpstatus = npy_get_floatstatus_barrier(param); - fpsetsticky(0); - - return fpstatus; -} - -void npy_set_floatstatus_divbyzero(void) -{ - fpsetsticky(FP_X_DZ); -} - -void npy_set_floatstatus_overflow(void) -{ - fpsetsticky(FP_X_OFL); -} - -void npy_set_floatstatus_underflow(void) -{ - fpsetsticky(FP_X_UFL); -} - -void npy_set_floatstatus_invalid(void) -{ - fpsetsticky(FP_X_INV); -} - -#elif defined(_AIX) && !defined(__GNUC__) -#include <float.h> -#include <fpxcp.h> - -int npy_get_floatstatus_barrier(char *param) -{ - int fpstatus = fp_read_flag(); - /* - * By using a volatile, the compiler cannot reorder this call - */ - if (param != NULL) { - volatile char NPY_UNUSED(c) = *(char*)param; - } - return ((FP_DIV_BY_ZERO & fpstatus) ? NPY_FPE_DIVIDEBYZERO : 0) | - ((FP_OVERFLOW & fpstatus) ? NPY_FPE_OVERFLOW : 0) | - ((FP_UNDERFLOW & fpstatus) ? NPY_FPE_UNDERFLOW : 0) | - ((FP_INVALID & fpstatus) ? NPY_FPE_INVALID : 0); -} - -int npy_clear_floatstatus_barrier(char * param) -{ - int fpstatus = npy_get_floatstatus_barrier(param); - fp_swap_flag(0); - - return fpstatus; -} - -void npy_set_floatstatus_divbyzero(void) -{ - fp_raise_xcp(FP_DIV_BY_ZERO); -} - -void npy_set_floatstatus_overflow(void) -{ - fp_raise_xcp(FP_OVERFLOW); -} - -void npy_set_floatstatus_underflow(void) -{ - fp_raise_xcp(FP_UNDERFLOW); -} - -void npy_set_floatstatus_invalid(void) -{ - fp_raise_xcp(FP_INVALID); -} - -#elif defined(_MSC_VER) || (defined(__osf__) && defined(__alpha)) || \ - defined (__UCLIBC__) || (defined(__arc__) && defined(__GLIBC__)) +# include <fenv.h> /* - * By using a volatile floating point value, - * the compiler is forced to actually do the requested - * operations because of potential concurrency. - * - * We shouldn't write multiple values to a single - * global here, because that would cause - * a race condition. + * According to the C99 standard FE_DIVBYZERO, etc. 
may not be provided when + * unsupported. In such cases NumPy will not report these correctly, but we + * should still allow compiling (whether tests pass or not). + * By defining them as 0 locally, we make them no-ops. Unlike these defines, + * for example `musl` still defines all of the functions (as no-ops): + * https://git.musl-libc.org/cgit/musl/tree/src/fenv/fenv.c + * and does similar replacement in its tests: + * http://nsz.repo.hu/git/?p=libc-test;a=blob;f=src/common/mtest.h;h=706c1ba23ea8989b17a2f72ed1a919e187c06b6a;hb=HEAD#l30 */ -static volatile double _npy_floatstatus_x, - _npy_floatstatus_zero = 0.0, _npy_floatstatus_big = 1e300, - _npy_floatstatus_small = 1e-300, _npy_floatstatus_inf; - -void npy_set_floatstatus_divbyzero(void) -{ - _npy_floatstatus_x = 1.0 / _npy_floatstatus_zero; -} - -void npy_set_floatstatus_overflow(void) -{ - _npy_floatstatus_x = _npy_floatstatus_big * 1e300; -} - -void npy_set_floatstatus_underflow(void) -{ - _npy_floatstatus_x = _npy_floatstatus_small * 1e-300; -} - -void npy_set_floatstatus_invalid(void) -{ - _npy_floatstatus_inf = NPY_INFINITY; - _npy_floatstatus_x = _npy_floatstatus_inf - NPY_INFINITY; -} - -/* MS Windows -----------------------------------------------------*/ -#if defined(_MSC_VER) - -#include <float.h> - -int npy_get_floatstatus_barrier(char *param) -{ - /* - * By using a volatile, the compiler cannot reorder this call - */ -#if defined(_WIN64) - int fpstatus = _statusfp(); -#else - /* windows enables sse on 32 bit, so check both flags */ - int fpstatus, fpstatus2; - _statusfp2(&fpstatus, &fpstatus2); - fpstatus |= fpstatus2; +#ifndef FE_DIVBYZERO + #define FE_DIVBYZERO 0 #endif - if (param != NULL) { - volatile char NPY_UNUSED(c) = *(char*)param; - } - return ((SW_ZERODIVIDE & fpstatus) ? NPY_FPE_DIVIDEBYZERO : 0) | - ((SW_OVERFLOW & fpstatus) ? NPY_FPE_OVERFLOW : 0) | - ((SW_UNDERFLOW & fpstatus) ? NPY_FPE_UNDERFLOW : 0) | - ((SW_INVALID & fpstatus) ? 
NPY_FPE_INVALID : 0); -} - -int npy_clear_floatstatus_barrier(char *param) -{ - int fpstatus = npy_get_floatstatus_barrier(param); - _clearfp(); - - return fpstatus; -} - -/* OSF/Alpha (Tru64) ---------------------------------------------*/ -#elif defined(__osf__) && defined(__alpha) - -#include <machine/fpu.h> - -int npy_get_floatstatus_barrier(char *param) -{ - unsigned long fpstatus = ieee_get_fp_control(); - /* - * By using a volatile, the compiler cannot reorder this call - */ - if (param != NULL) { - volatile char NPY_UNUSED(c) = *(char*)param; - } - return ((IEEE_STATUS_DZE & fpstatus) ? NPY_FPE_DIVIDEBYZERO : 0) | - ((IEEE_STATUS_OVF & fpstatus) ? NPY_FPE_OVERFLOW : 0) | - ((IEEE_STATUS_UNF & fpstatus) ? NPY_FPE_UNDERFLOW : 0) | - ((IEEE_STATUS_INV & fpstatus) ? NPY_FPE_INVALID : 0); -} - -int npy_clear_floatstatus_barrier(char *param) -{ - int fpstatus = npy_get_floatstatus_barrier(param); - /* clear status bits as well as disable exception mode if on */ - ieee_set_fp_control(0); - - return fpstatus; -} - +#ifndef FE_OVERFLOW + #define FE_OVERFLOW 0 +#endif +#ifndef FE_UNDERFLOW + #define FE_UNDERFLOW 0 +#endif +#ifndef FE_INVALID + #define FE_INVALID 0 #endif -/* End of defined(_MSC_VER) || (defined(__osf__) && defined(__alpha)) */ -#else -/* General GCC code, should work on most platforms */ -# include <fenv.h> int npy_get_floatstatus_barrier(char* param) { @@ -839,4 +649,3 @@ void npy_set_floatstatus_invalid(void) feraiseexcept(FE_INVALID); } -#endif diff --git a/numpy/core/src/npymath/ieee754.cpp b/numpy/core/src/npymath/ieee754.cpp index 2244004c0..27fcf7c6e 100644 --- a/numpy/core/src/npymath/ieee754.cpp +++ b/numpy/core/src/npymath/ieee754.cpp @@ -655,6 +655,30 @@ npy_get_floatstatus() */ #include <fenv.h> +/* + * According to the C99 standard FE_DIVBYZERO, etc. may not be provided when + * unsupported. In such cases NumPy will not report these correctly, but we + * should still allow compiling (whether tests pass or not). 
+ * By defining them as 0 locally, we make them no-ops. Unlike these defines, + * for example `musl` still defines all of the functions (as no-ops): + * https://git.musl-libc.org/cgit/musl/tree/src/fenv/fenv.c + * and does similar replacement in its tests: + * http://nsz.repo.hu/git/?p=libc-test;a=blob;f=src/common/mtest.h;h=706c1ba23ea8989b17a2f72ed1a919e187c06b6a;hb=HEAD#l30 + */ +#ifndef FE_DIVBYZERO + #define FE_DIVBYZERO 0 +#endif +#ifndef FE_OVERFLOW + #define FE_OVERFLOW 0 +#endif +#ifndef FE_UNDERFLOW + #define FE_UNDERFLOW 0 +#endif +#ifndef FE_INVALID + #define FE_INVALID 0 +#endif + + extern "C" int npy_get_floatstatus_barrier(char *param) { diff --git a/numpy/core/src/umath/dispatching.c b/numpy/core/src/umath/dispatching.c index b8f102b3d..620335d88 100644 --- a/numpy/core/src/umath/dispatching.c +++ b/numpy/core/src/umath/dispatching.c @@ -145,6 +145,38 @@ PyUFunc_AddLoop(PyUFuncObject *ufunc, PyObject *info, int ignore_duplicate) } +/* + * Add loop directly to a ufunc from a given ArrayMethod spec. + */ +NPY_NO_EXPORT int +PyUFunc_AddLoopFromSpec(PyObject *ufunc, PyArrayMethod_Spec *spec) +{ + if (!PyObject_TypeCheck(ufunc, &PyUFunc_Type)) { + PyErr_SetString(PyExc_TypeError, + "ufunc object passed is not a ufunc!"); + return -1; + } + PyBoundArrayMethodObject *bmeth = + (PyBoundArrayMethodObject *)PyArrayMethod_FromSpec(spec); + if (bmeth == NULL) { + return -1; + } + int nargs = bmeth->method->nin + bmeth->method->nout; + PyObject *dtypes = PyArray_TupleFromItems( + nargs, (PyObject **)bmeth->dtypes, 1); + if (dtypes == NULL) { + return -1; + } + PyObject *info = PyTuple_Pack(2, dtypes, bmeth->method); + Py_DECREF(bmeth); + Py_DECREF(dtypes); + if (info == NULL) { + return -1; + } + return PyUFunc_AddLoop((PyUFuncObject *)ufunc, info, 0); +} + + /** * Resolves the implementation to use, this uses typical multiple dispatching * methods of finding the best matching implementation or resolver. 
diff --git a/numpy/core/src/umath/dispatching.h b/numpy/core/src/umath/dispatching.h index a7e9e88d0..f2ab0be2e 100644 --- a/numpy/core/src/umath/dispatching.h +++ b/numpy/core/src/umath/dispatching.h @@ -6,6 +6,9 @@ #include <numpy/ufuncobject.h> #include "array_method.h" +#ifdef __cplusplus +extern "C" { +#endif typedef int promoter_function(PyUFuncObject *ufunc, PyArray_DTypeMeta *op_dtypes[], PyArray_DTypeMeta *signature[], @@ -14,6 +17,9 @@ typedef int promoter_function(PyUFuncObject *ufunc, NPY_NO_EXPORT int PyUFunc_AddLoop(PyUFuncObject *ufunc, PyObject *info, int ignore_duplicate); +NPY_NO_EXPORT int +PyUFunc_AddLoopFromSpec(PyObject *ufunc, PyArrayMethod_Spec *spec); + NPY_NO_EXPORT PyArrayMethodObject * promote_and_get_ufuncimpl(PyUFuncObject *ufunc, PyArrayObject *const ops[], @@ -41,5 +47,8 @@ object_only_ufunc_promoter(PyUFuncObject *ufunc, NPY_NO_EXPORT int install_logical_ufunc_promoter(PyObject *ufunc); +#ifdef __cplusplus +} +#endif #endif /*_NPY_DISPATCHING_H */ diff --git a/numpy/core/src/umath/extobj.c b/numpy/core/src/umath/extobj.c index 6b9a27e26..893429107 100644 --- a/numpy/core/src/umath/extobj.c +++ b/numpy/core/src/umath/extobj.c @@ -267,6 +267,33 @@ _extract_pyvals(PyObject *ref, const char *name, int *bufsize, } /* + * Handler which uses the default `np.errstate` given that `fpe_errors` is + * already set. `fpe_errors` is typically the (nonzero) result of + * `npy_get_floatstatus_barrier`. + * + * Returns -1 on failure (an error was raised) and 0 on success. 
+ */ +NPY_NO_EXPORT int +PyUFunc_GiveFloatingpointErrors(const char *name, int fpe_errors) +{ + int bufsize, errmask; + PyObject *errobj; + + if (PyUFunc_GetPyValues((char *)name, &bufsize, &errmask, + &errobj) < 0) { + return -1; + } + int first = 1; + if (PyUFunc_handlefperr(errmask, errobj, fpe_errors, &first)) { + Py_XDECREF(errobj); + return -1; + } + Py_XDECREF(errobj); + return 0; +} + + +/* * check the floating point status * - errmask: mask of status to check * - extobj: ufunc pyvals object diff --git a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src index 51b167844..bf8142880 100644 --- a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src +++ b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src @@ -1,6 +1,7 @@ /*@targets ** $maxopt baseline ** sse2 avx2 avx512f + ** vx vxe **/ #define _UMATHMODULE #define _MULTIARRAYMODULE @@ -364,7 +365,7 @@ sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_i * #type = npy_float, npy_double# * #TYPE = FLOAT, DOUBLE# * #sfx = f32, f64# - * #CHK = , _F64# + * #CHK = _F32, _F64# */ #if NPY_SIMD@CHK@ /**begin repeat1 @@ -444,7 +445,7 @@ simd_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_i * #type = npy_float, npy_double, npy_longdouble# * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# * #vector = 1, 1, 0# - * #VECTOR = NPY_SIMD, NPY_SIMD_F64, 0 # + * #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64, 0 # */ /**begin repeat1 * Arithmetic diff --git a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src index 16a9eac2e..5b5f13ad1 100644 --- a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src +++ b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src @@ -3,6 +3,7 @@ ** sse2 sse41 avx2 avx512f avx512_skx ** vsx2 vsx4 ** neon + ** vx **/ #define _UMATHMODULE #define _MULTIARRAYMODULE @@ -51,13 +52,14 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) const npyv_@sfx@x3 
divisor = npyv_divisor_@sfx@(scalar); if (scalar == -1) { - npyv_b@len@ noverflow = npyv_cvt_b@len@_@sfx@(npyv_setall_@sfx@(-1)); - npyv_@sfx@ vzero = npyv_zero_@sfx@(); + npyv_b@len@ noverflow = npyv_cvt_b@len@_@sfx@(npyv_setall_@sfx@(-1)); + const npyv_@sfx@ vzero = npyv_zero_@sfx@(); + const npyv_@sfx@ vmin = npyv_setall_@sfx@(NPY_MIN_INT@len@); for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { npyv_@sfx@ a = npyv_load_@sfx@(src); npyv_b@len@ gt_min = npyv_cmpgt_@sfx@(a, npyv_setall_@sfx@(NPY_MIN_INT@len@)); noverflow = npyv_and_b@len@(noverflow, gt_min); - npyv_@sfx@ neg = npyv_ifsub_@sfx@(gt_min, vzero, a, vzero); + npyv_@sfx@ neg = npyv_ifsub_@sfx@(gt_min, vzero, a, vmin); npyv_store_@sfx@(dst, neg); } @@ -66,13 +68,13 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) npyv_lanetype_@sfx@ a = *src; if (a == NPY_MIN_INT@len@) { raise_err = 1; - *dst = 0; + *dst = NPY_MIN_INT@len@; } else { *dst = -a; } } if (raise_err) { - npy_set_floatstatus_divbyzero(); + npy_set_floatstatus_overflow(); } } else { for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { @@ -253,7 +255,8 @@ vsx4_simd_divide_contig_@sfx@(char **args, npy_intp len) const npyv_@sfx@ vneg_one = npyv_setall_@sfx@(-1); const npyv_@sfx@ vzero = npyv_zero_@sfx@(); const npyv_@sfx@ vmin = npyv_setall_@sfx@(NPY_MIN_INT@len@); - npyv_b@len@ warn = npyv_cvt_b@len@_@sfx@(npyv_zero_@sfx@()); + npyv_b@len@ warn_zero = npyv_cvt_b@len@_@sfx@(npyv_zero_@sfx@()); + npyv_b@len@ warn_overflow = npyv_cvt_b@len@_@sfx@(npyv_zero_@sfx@()); const int vstep = npyv_nlanes_@sfx@; for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, @@ -267,10 +270,8 @@ vsx4_simd_divide_contig_@sfx@(char **args, npy_intp len) npyv_b@len@ amin = npyv_cmpeq_@sfx@(a, vmin); npyv_b@len@ bneg_one = npyv_cmpeq_@sfx@(b, vneg_one); npyv_b@len@ overflow = npyv_and_@sfx@(bneg_one, amin); - npyv_b@len@ error = npyv_or_@sfx@(bzero, overflow); - // in case of overflow or b = 0, 'cvtozero' forces quo/rem 
to be 0 - npyv_@sfx@ cvtozero = npyv_select_@sfx@(error, vzero, vneg_one); - warn = npyv_or_@sfx@(error, warn); + warn_zero = npyv_or_@sfx@(bzero, warn_zero); + warn_overflow = npyv_or_@sfx@(overflow, warn_overflow); // handle mixed case the way Python does // ((a > 0) == (b > 0) || rem == 0) npyv_b@len@ a_gt_zero = npyv_cmpgt_@sfx@(a, vzero); @@ -280,21 +281,30 @@ vsx4_simd_divide_contig_@sfx@(char **args, npy_intp len) npyv_b@len@ or = npyv_or_@sfx@(ab_eq_cond, rem_zero); npyv_@sfx@ to_sub = npyv_select_@sfx@(or, vzero, vneg_one); quo = npyv_add_@sfx@(quo, to_sub); - npyv_store_@sfx@(dst1, npyv_and_@sfx@(cvtozero, quo)); + // Divide by zero + quo = npyv_select_@sfx@(bzero, vzero, quo); + // Overflow + quo = npyv_select_@sfx@(overflow, vmin, quo); + npyv_store_@sfx@(dst1, quo); } - if (!vec_all_eq(warn, vzero)) { + if (!vec_all_eq(warn_zero, vzero)) { npy_set_floatstatus_divbyzero(); } + if (!vec_all_eq(warn_overflow, vzero)) { + npy_set_floatstatus_overflow(); + } for (; len > 0; --len, ++src1, ++src2, ++dst1) { const npyv_lanetype_@sfx@ a = *src1; const npyv_lanetype_@sfx@ b = *src2; - if (b == 0 || (a == NPY_MIN_INT@len@ && b == -1)) { + if (NPY_UNLIKELY(b == 0)) { npy_set_floatstatus_divbyzero(); *dst1 = 0; - } - else { + } else if (NPY_UNLIKELY((a == NPY_MIN_INT@len@) && (b == -1))) { + npy_set_floatstatus_overflow(); + *dst1 = NPY_MIN_INT@len@; + } else { *dst1 = a / b; if (((a > 0) != (b > 0)) && ((*dst1 * b) != a)) { *dst1 -= 1; @@ -340,8 +350,14 @@ NPY_FINLINE @type@ floor_div_@TYPE@(const @type@ n, const @type@ d) * (i.e. a different approach than npy_set_floatstatus_divbyzero()). 
*/ if (NPY_UNLIKELY(d == 0 || (n == NPY_MIN_@TYPE@ && d == -1))) { - npy_set_floatstatus_divbyzero(); - return 0; + if (d == 0) { + npy_set_floatstatus_divbyzero(); + return 0; + } + else { + npy_set_floatstatus_overflow(); + return NPY_MIN_@TYPE@; + } } @type@ r = n / d; // Negative quotients needs to be rounded down diff --git a/numpy/core/src/umath/loops_comparison.dispatch.c.src b/numpy/core/src/umath/loops_comparison.dispatch.c.src index 01d58fbf9..2f75593a5 100644 --- a/numpy/core/src/umath/loops_comparison.dispatch.c.src +++ b/numpy/core/src/umath/loops_comparison.dispatch.c.src @@ -3,6 +3,7 @@ ** sse2 sse42 avx2 avx512f avx512_skx ** vsx2 vsx3 ** neon + ** vx vxe **/ #define _UMATHMODULE #define _MULTIARRAYMODULE @@ -22,7 +23,7 @@ * #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64# * #len = 8, 8, 16, 16, 32, 32, 64, 64, 32, 64# * #signed = 0, 1, 0, 1, 0, 1, 0, 1, 0, 0# - * #VECTOR = NPY_SIMD*9, NPY_SIMD_F64# + * #VECTOR = NPY_SIMD*8, NPY_SIMD_F32, NPY_SIMD_F64# */ /**begin repeat1 * #kind = equal, not_equal, less, less_equal# @@ -298,7 +299,7 @@ static void simd_binary_scalar2_@kind@_b8(char **args, npy_intp len) * #bool = 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0# * #fp = 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1# * #signed = 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0# - * #VECTOR = NPY_SIMD*10, NPY_SIMD_F64# + * #VECTOR = NPY_SIMD*9, NPY_SIMD_F32, NPY_SIMD_F64# */ /**begin repeat1 * #kind = equal, not_equal, less, less_equal# diff --git a/numpy/core/src/umath/loops_hyperbolic.dispatch.c.src b/numpy/core/src/umath/loops_hyperbolic.dispatch.c.src index 8cccc18f0..ce4962ce3 100644 --- a/numpy/core/src/umath/loops_hyperbolic.dispatch.c.src +++ b/numpy/core/src/umath/loops_hyperbolic.dispatch.c.src @@ -3,6 +3,7 @@ ** (avx2 fma3) AVX512_SKX ** vsx2 vsx4 ** neon_vfpv4 + ** vx vxe **/ #include "numpy/npy_math.h" #include "simd/simd.h" @@ -240,6 +241,8 @@ simd_tanh_f64(const double *src, npy_intp ssrc, double *dst, npy_intp sdst, npy_ } } #endif // NPY_SIMD_F64 + +#if NPY_SIMD_F32 static 
void simd_tanh_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst, npy_intp len) { @@ -335,6 +338,7 @@ simd_tanh_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst, npy_in } } } +#endif // NPY_SIMD_F32 #endif // NPY_SIMD_FMA3 /**begin repeat @@ -342,7 +346,7 @@ simd_tanh_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst, npy_in * #type = float, double# * #sfx = f32, f64# * #ssfx = f, # - * #simd = NPY_SIMD_FMA3, NPY_SIMD_FMA3 && NPY_SIMD_F64# + * #simd = NPY_SIMD_FMA3 && NPY_SIMD_F32, NPY_SIMD_FMA3 && NPY_SIMD_F64# */ /**begin repeat1 * #func = tanh# diff --git a/numpy/core/src/umath/loops_minmax.dispatch.c.src b/numpy/core/src/umath/loops_minmax.dispatch.c.src index ba2288f0b..b4fb205a0 100644 --- a/numpy/core/src/umath/loops_minmax.dispatch.c.src +++ b/numpy/core/src/umath/loops_minmax.dispatch.c.src @@ -3,6 +3,7 @@ ** neon asimd ** sse2 avx2 avx512_skx ** vsx2 + ** vx vxe **/ #define _UMATHMODULE #define _MULTIARRAYMODULE @@ -144,7 +145,7 @@ NPY_FINLINE @type@ scalar_@op@_@c_sfx@(@type@ a, @type@ b) { /**begin repeat * #sfx = f32, f64# * #bsfx = b32, b64# - * #simd_chk = NPY_SIMD, NPY_SIMD_F64# + * #simd_chk = NPY_SIMD_F32, NPY_SIMD_F64# * #scalar_sfx = f, d# */ #if @simd_chk@ @@ -196,7 +197,7 @@ NPY_FINLINE @type@ scalar_@op@_@c_sfx@(@type@ a, @type@ b) { ******************************************************************************/ /**begin repeat * #sfx = s8, u8, s16, u16, s32, u32, s64, u64, f32, f64# - * #simd_chk = NPY_SIMD*9, NPY_SIMD_F64# + * #simd_chk = NPY_SIMD*8, NPY_SIMD_F32, NPY_SIMD_F64# * #is_fp = 0*8, 1, 1# * #scalar_sfx = i*8, f, d# */ @@ -395,6 +396,9 @@ simd_binary_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip1, npy_intp sip1, #elif NPY_SIMD && NPY_BITSOF_@BTYPE@ == @len@ #if @is_fp@ #define TO_SIMD_SFX(X) X##_f@len@ + #if NPY_BITSOF_@BTYPE@ == 32 && !NPY_SIMD_F32 + #undef TO_SIMD_SFX + #endif #if NPY_BITSOF_@BTYPE@ == 64 && !NPY_SIMD_F64 #undef TO_SIMD_SFX #endif diff --git 
a/numpy/core/src/umath/loops_trigonometric.dispatch.c.src b/numpy/core/src/umath/loops_trigonometric.dispatch.c.src index 44c47d14f..78685e807 100644 --- a/numpy/core/src/umath/loops_trigonometric.dispatch.c.src +++ b/numpy/core/src/umath/loops_trigonometric.dispatch.c.src @@ -3,6 +3,7 @@ ** (avx2 fma3) avx512f ** vsx2 vsx3 vsx4 ** neon_vfpv4 + ** vxe vxe2 **/ #include "numpy/npy_math.h" #include "simd/simd.h" @@ -13,7 +14,7 @@ * - use vectorized version of Payne-Hanek style reduction for large elements or * when there's no native FUSED support instead of fallback to libc */ -#if NPY_SIMD_FMA3 // native support +#if NPY_SIMD_F32 && NPY_SIMD_FMA3 // native support /* * Vectorized Cody-Waite range reduction technique * Performs the reduction step x* = x - y*C in three steps: @@ -210,7 +211,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_@func@) const npy_intp sdst = steps[1] / lsize; npy_intp len = dimensions[0]; assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0)); -#if NPY_SIMD_FMA3 +#if NPY_SIMD_F32 && NPY_SIMD_FMA3 if (is_mem_overlap(src, steps[0], dst, steps[1], len) || !npyv_loadable_stride_f32(ssrc) || !npyv_storable_stride_f32(sdst) ) { diff --git a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src index 78e231965..0ac39a9b1 100644 --- a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src +++ b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src @@ -3,6 +3,7 @@ ** sse2 sse41 ** vsx2 ** neon asimd + ** vx vxe **/ /** * Force use SSE only on x86, even if AVX2 or AVX512F are enabled @@ -18,7 +19,7 @@ /********************************************************** ** Scalars **********************************************************/ -#if !NPY_SIMD +#if !NPY_SIMD_F32 NPY_FINLINE float c_recip_f32(float a) { return 1.0f / a; } NPY_FINLINE float c_abs_f32(float a) @@ -29,7 +30,7 @@ NPY_FINLINE float c_abs_f32(float a) } NPY_FINLINE float c_square_f32(float a) { return a * a; } -#endif // !NPY_SIMD 
+#endif // !NPY_SIMD_F32 #if !NPY_SIMD_F64 NPY_FINLINE double c_recip_f64(double a) @@ -147,7 +148,7 @@ NPY_FINLINE double c_square_f64(double a) /**begin repeat * #TYPE = FLOAT, DOUBLE# * #sfx = f32, f64# - * #VCHK = NPY_SIMD, NPY_SIMD_F64# + * #VCHK = NPY_SIMD_F32, NPY_SIMD_F64# */ #if @VCHK@ /**begin repeat1 @@ -259,7 +260,7 @@ static void simd_@TYPE@_@kind@_@STYPE@_@DTYPE@ /**begin repeat * #TYPE = FLOAT, DOUBLE# * #sfx = f32, f64# - * #VCHK = NPY_SIMD, NPY_SIMD_F64# + * #VCHK = NPY_SIMD_F32, NPY_SIMD_F64# */ /**begin repeat1 * #kind = rint, floor, ceil, trunc, sqrt, absolute, square, reciprocal# diff --git a/numpy/core/src/umath/scalarmath.c.src b/numpy/core/src/umath/scalarmath.c.src index 4993546f8..ef608378a 100644 --- a/numpy/core/src/umath/scalarmath.c.src +++ b/numpy/core/src/umath/scalarmath.c.src @@ -499,17 +499,26 @@ half_ctype_power(npy_half a, npy_half b, npy_half *out) * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint, * npy_long, npy_ulong, npy_longlong, npy_ulonglong, * npy_float, npy_double, npy_longdouble# + * #NAME = BYTE, UBYTE, SHORT, USHORT, INT, UINT, + * LONG, ULONG, LONGLONG, ULONGLONG, + * FLOAT, DOUBLE, LONGDOUBLE# * #uns = (0,1)*5,0*3# + * #int = 1*10,0*3# */ static NPY_INLINE int @name@_ctype_negative(@type@ a, @type@ *out) { - *out = -a; #if @uns@ + *out = -a; return NPY_FPE_OVERFLOW; -#else - return 0; +#elif @int@ + if(a == NPY_MIN_@NAME@){ + *out = a; + return NPY_FPE_OVERFLOW; + } #endif + *out = -a; + return 0; } /**end repeat**/ @@ -584,10 +593,15 @@ static NPY_INLINE int /**begin repeat * #name = byte, short, int, long, longlong# * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong# + * #NAME = BYTE, SHORT, INT, LONG, LONGLONG# */ static NPY_INLINE int @name@_ctype_absolute(@type@ a, @type@ *out) { + if (a == NPY_MIN_@NAME@) { + *out = a; + return NPY_FPE_OVERFLOW; + } *out = (a < 0 ? 
-a : a); return 0; } @@ -1564,8 +1578,23 @@ static PyObject * val = PyArrayScalar_VAL(a, @Name@); + int retstatus = @name@_ctype_@oper@(val, &out); - @name@_ctype_@oper@(val, &out); + if (retstatus) { + int bufsize, errmask; + PyObject *errobj; + + if (PyUFunc_GetPyValues("@name@_scalars", &bufsize, &errmask, + &errobj) < 0) { + return NULL; + } + int first = 1; + if (PyUFunc_handlefperr(errmask, errobj, retstatus, &first)) { + Py_XDECREF(errobj); + return NULL; + } + Py_XDECREF(errobj); + } /* * TODO: Complex absolute should check floating point flags. diff --git a/numpy/core/src/umath/string_ufuncs.cpp b/numpy/core/src/umath/string_ufuncs.cpp new file mode 100644 index 000000000..5a35c318b --- /dev/null +++ b/numpy/core/src/umath/string_ufuncs.cpp @@ -0,0 +1,449 @@ +#include <Python.h> + +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#define _UMATHMODULE + +#include "numpy/ndarraytypes.h" + +#include "numpyos.h" +#include "dispatching.h" +#include "dtypemeta.h" +#include "common_dtype.h" +#include "convert_datatype.h" + +#include "string_ufuncs.h" + + +template <typename character> +static NPY_INLINE int +character_cmp(character a, character b) +{ + if (a == b) { + return 0; + } + else if (a < b) { + return -1; + } + else { + return 1; + } +} + + +/* + * Compare two strings of different length. Note that either string may be + * zero padded (trailing zeros are ignored in other words, the shorter word + * is always padded with zeros). + */ +template <bool rstrip, typename character> +static NPY_INLINE int +string_cmp(int len1, const character *str1, int len2, const character *str2) +{ + if (rstrip) { + /* + * Ignore/"trim" trailing whitespace (and 0s). Note that this function + * does not support unicode whitespace (and never has). 
+ */ + while (len1 > 0) { + character c = str1[len1-1]; + if (c != (character)0 && !NumPyOS_ascii_isspace(c)) { + break; + } + len1--; + } + while (len2 > 0) { + character c = str2[len2-1]; + if (c != (character)0 && !NumPyOS_ascii_isspace(c)) { + break; + } + len2--; + } + } + + int n = PyArray_MIN(len1, len2); + + if (sizeof(character) == 1) { + /* + * TODO: `memcmp` makes things 2x faster for longer words that match + * exactly, but at least 2x slower for short or mismatching ones. + */ + int cmp = memcmp(str1, str2, n); + if (cmp != 0) { + return cmp; + } + str1 += n; + str2 += n; + } + else { + for (int i = 0; i < n; i++) { + int cmp = character_cmp(*str1, *str2); + if (cmp != 0) { + return cmp; + } + str1++; + str2++; + } + } + if (len1 > len2) { + for (int i = n; i < len1; i++) { + int cmp = character_cmp(*str1, (character)0); + if (cmp != 0) { + return cmp; + } + str1++; + } + } + else if (len2 > len1) { + for (int i = n; i < len2; i++) { + int cmp = character_cmp((character)0, *str2); + if (cmp != 0) { + return cmp; + } + str2++; + } + } + return 0; +} + + +/* + * Helper for templating, avoids warnings about uncovered switch paths. 
+ */ +enum class COMP { + EQ, NE, LT, LE, GT, GE, +}; + +static char const * +comp_name(COMP comp) { + switch(comp) { + case COMP::EQ: return "equal"; + case COMP::NE: return "not_equal"; + case COMP::LT: return "less"; + case COMP::LE: return "less_equal"; + case COMP::GT: return "greater"; + case COMP::GE: return "greater_equal"; + default: + assert(0); + return nullptr; + } +} + + +template <bool rstrip, COMP comp, typename character> +static int +string_comparison_loop(PyArrayMethod_Context *context, + char *const data[], npy_intp const dimensions[], + npy_intp const strides[], NpyAuxData *NPY_UNUSED(auxdata)) +{ + /* + * Note, fetching `elsize` from the descriptor is OK even without the GIL, + * however it may be that this should be moved into `auxdata` eventually, + * which may also be slightly faster/cleaner (but more involved). + */ + int len1 = context->descriptors[0]->elsize / sizeof(character); + int len2 = context->descriptors[1]->elsize / sizeof(character); + + char *in1 = data[0]; + char *in2 = data[1]; + char *out = data[2]; + + npy_intp N = dimensions[0]; + + while (N--) { + int cmp = string_cmp<rstrip>( + len1, (character *)in1, len2, (character *)in2); + npy_bool res; + switch (comp) { + case COMP::EQ: + res = cmp == 0; + break; + case COMP::NE: + res = cmp != 0; + break; + case COMP::LT: + res = cmp < 0; + break; + case COMP::LE: + res = cmp <= 0; + break; + case COMP::GT: + res = cmp > 0; + break; + case COMP::GE: + res = cmp >= 0; + break; + } + *(npy_bool *)out = res; + + in1 += strides[0]; + in2 += strides[1]; + out += strides[2]; + } + return 0; +} + + +/* + * Machinery to add the string loops to the existing ufuncs. + */ + +/* + * This function replaces the strided loop with the passed in one, + * and registers it with the given ufunc. 
+ */ +static int +add_loop(PyObject *umath, const char *ufunc_name, + PyArrayMethod_Spec *spec, PyArrayMethod_StridedLoop *loop) +{ + PyObject *name = PyUnicode_FromString(ufunc_name); + if (name == nullptr) { + return -1; + } + PyObject *ufunc = PyObject_GetItem(umath, name); + Py_DECREF(name); + if (ufunc == nullptr) { + return -1; + } + spec->slots[0].pfunc = (void *)loop; + + int res = PyUFunc_AddLoopFromSpec(ufunc, spec); + Py_DECREF(ufunc); + return res; +} + + +template<bool rstrip, typename character, COMP...> +struct add_loops; + +template<bool rstrip, typename character> +struct add_loops<rstrip, character> { + int operator()(PyObject*, PyArrayMethod_Spec*) { + return 0; + } +}; + +template<bool rstrip, typename character, COMP comp, COMP... comps> +struct add_loops<rstrip, character, comp, comps...> { + int operator()(PyObject* umath, PyArrayMethod_Spec* spec) { + PyArrayMethod_StridedLoop* loop = string_comparison_loop<rstrip, comp, character>; + + if (add_loop(umath, comp_name(comp), spec, loop) < 0) { + return -1; + } + else { + return add_loops<rstrip, character, comps...>()(umath, spec); + } + } +}; + + +NPY_NO_EXPORT int +init_string_ufuncs(PyObject *umath) +{ + int res = -1; + /* NOTE: This should receive global symbols? */ + PyArray_DTypeMeta *String = PyArray_DTypeFromTypeNum(NPY_STRING); + PyArray_DTypeMeta *Unicode = PyArray_DTypeFromTypeNum(NPY_UNICODE); + PyArray_DTypeMeta *Bool = PyArray_DTypeFromTypeNum(NPY_BOOL); + + /* We start with the string loops: */ + PyArray_DTypeMeta *dtypes[] = {String, String, Bool}; + /* + * We only have one loop right now, the strided one. The default type + * resolver ensures native byte order/canonical representation. 
+ */ + PyType_Slot slots[] = { + {NPY_METH_strided_loop, nullptr}, + {0, nullptr} + }; + + PyArrayMethod_Spec spec = {}; + spec.name = "templated_string_comparison"; + spec.nin = 2; + spec.nout = 1; + spec.dtypes = dtypes; + spec.slots = slots; + spec.flags = NPY_METH_NO_FLOATINGPOINT_ERRORS; + + /* All String loops */ + using string_looper = add_loops<false, npy_byte, COMP::EQ, COMP::NE, COMP::LT, COMP::LE, COMP::GT, COMP::GE>; + if (string_looper()(umath, &spec) < 0) { + goto finish; + } + + /* All Unicode loops */ + using ucs_looper = add_loops<false, npy_ucs4, COMP::EQ, COMP::NE, COMP::LT, COMP::LE, COMP::GT, COMP::GE>; + dtypes[0] = Unicode; + dtypes[1] = Unicode; + if (ucs_looper()(umath, &spec) < 0) { + goto finish; + } + + res = 0; + finish: + Py_DECREF(String); + Py_DECREF(Unicode); + Py_DECREF(Bool); + return res; +} + + +template <bool rstrip, typename character> +static PyArrayMethod_StridedLoop * +get_strided_loop(int comp) +{ + switch (comp) { + case Py_EQ: + return string_comparison_loop<rstrip, COMP::EQ, character>; + case Py_NE: + return string_comparison_loop<rstrip, COMP::NE, character>; + case Py_LT: + return string_comparison_loop<rstrip, COMP::LT, character>; + case Py_LE: + return string_comparison_loop<rstrip, COMP::LE, character>; + case Py_GT: + return string_comparison_loop<rstrip, COMP::GT, character>; + case Py_GE: + return string_comparison_loop<rstrip, COMP::GE, character>; + default: + assert(false); /* caller ensures this */ + } + return nullptr; +} + + +/* + * This function is used for `compare_chararrays` and currently also void + * comparisons (unstructured voids). The first could probably be deprecated + * and removed but is used by `np.char.chararray` the latter should also be + * moved to the ufunc probably (removing the need for manual looping). + * + * The `rstrip` mechanism is presumably for some fortran compat, but the + * question is whether it would not be better to have/use `rstrip` on such + * an array first... 
+ * + * NOTE: This function is also used for unstructured voids, this works because + * `npy_byte` is correct. + */ +NPY_NO_EXPORT PyObject * +_umath_strings_richcompare( + PyArrayObject *self, PyArrayObject *other, int cmp_op, int rstrip) +{ + NpyIter *iter = nullptr; + PyObject *result = nullptr; + + char **dataptr = nullptr; + npy_intp *strides = nullptr; + npy_intp *countptr = nullptr; + npy_intp size = 0; + + PyArrayMethod_Context context = {}; + NpyIter_IterNextFunc *iternext = nullptr; + + npy_uint32 it_flags = ( + NPY_ITER_EXTERNAL_LOOP | NPY_ITER_ZEROSIZE_OK | + NPY_ITER_BUFFERED | NPY_ITER_GROWINNER); + npy_uint32 op_flags[3] = { + NPY_ITER_READONLY | NPY_ITER_ALIGNED, + NPY_ITER_READONLY | NPY_ITER_ALIGNED, + NPY_ITER_WRITEONLY | NPY_ITER_ALLOCATE | NPY_ITER_ALIGNED}; + + PyArrayMethod_StridedLoop *strided_loop = nullptr; + NPY_BEGIN_THREADS_DEF; + + if (PyArray_TYPE(self) != PyArray_TYPE(other)) { + /* + * Comparison between Bytes and Unicode is not defined in Py3K; + * we follow. + * TODO: This makes no sense at all for `compare_chararrays`, kept + * only under the assumption that we are more likely to deprecate + * than fix it to begin with. 
+ */ + Py_INCREF(Py_NotImplemented); + return Py_NotImplemented; + } + + PyArrayObject *ops[3] = {self, other, nullptr}; + PyArray_Descr *descrs[3] = {nullptr, nullptr, PyArray_DescrFromType(NPY_BOOL)}; + /* TODO: ensuring native byte order is not really necessary for == and != */ + descrs[0] = NPY_DT_CALL_ensure_canonical(PyArray_DESCR(self)); + if (descrs[0] == nullptr) { + goto finish; + } + descrs[1] = NPY_DT_CALL_ensure_canonical(PyArray_DESCR(other)); + if (descrs[1] == nullptr) { + goto finish; + } + + /* + * Create the iterator: + */ + iter = NpyIter_AdvancedNew( + 3, ops, it_flags, NPY_KEEPORDER, NPY_SAFE_CASTING, op_flags, descrs, + -1, nullptr, nullptr, 0); + if (iter == nullptr) { + goto finish; + } + + size = NpyIter_GetIterSize(iter); + if (size == 0) { + result = (PyObject *)NpyIter_GetOperandArray(iter)[2]; + Py_INCREF(result); + goto finish; + } + + iternext = NpyIter_GetIterNext(iter, nullptr); + if (iternext == nullptr) { + goto finish; + } + + /* + * Prepare the inner-loop and execute it (we only need descriptors to be + * passed in). 
+ */ + context.descriptors = descrs; + + dataptr = NpyIter_GetDataPtrArray(iter); + strides = NpyIter_GetInnerStrideArray(iter); + countptr = NpyIter_GetInnerLoopSizePtr(iter); + + if (rstrip == 0) { + /* NOTE: Also used for VOID, so can be STRING, UNICODE, or VOID: */ + if (descrs[0]->type_num != NPY_UNICODE) { + strided_loop = get_strided_loop<false, npy_byte>(cmp_op); + } + else { + strided_loop = get_strided_loop<false, npy_ucs4>(cmp_op); + } + } + else { + if (descrs[0]->type_num != NPY_UNICODE) { + strided_loop = get_strided_loop<true, npy_byte>(cmp_op); + } + else { + strided_loop = get_strided_loop<true, npy_ucs4>(cmp_op); + } + } + + NPY_BEGIN_THREADS_THRESHOLDED(size); + + do { + /* We know the loop cannot fail */ + strided_loop(&context, dataptr, countptr, strides, nullptr); + } while (iternext(iter) != 0); + + NPY_END_THREADS; + + result = (PyObject *)NpyIter_GetOperandArray(iter)[2]; + Py_INCREF(result); + + finish: + if (NpyIter_Deallocate(iter) < 0) { + Py_CLEAR(result); + } + Py_XDECREF(descrs[0]); + Py_XDECREF(descrs[1]); + Py_XDECREF(descrs[2]); + return result; +} diff --git a/numpy/core/src/umath/string_ufuncs.h b/numpy/core/src/umath/string_ufuncs.h new file mode 100644 index 000000000..aa1719954 --- /dev/null +++ b/numpy/core/src/umath/string_ufuncs.h @@ -0,0 +1,19 @@ +#ifndef _NPY_CORE_SRC_UMATH_STRING_UFUNCS_H_ +#define _NPY_CORE_SRC_UMATH_STRING_UFUNCS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +NPY_NO_EXPORT int +init_string_ufuncs(PyObject *umath); + +NPY_NO_EXPORT PyObject * +_umath_strings_richcompare( + PyArrayObject *self, PyArrayObject *other, int cmp_op, int rstrip); + +#ifdef __cplusplus +} +#endif + +#endif /* _NPY_CORE_SRC_UMATH_STRING_UFUNCS_H_ */
\ No newline at end of file diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c index fce7d61de..2636396d3 100644 --- a/numpy/core/src/umath/ufunc_object.c +++ b/numpy/core/src/umath/ufunc_object.c @@ -57,6 +57,10 @@ #include "legacy_array_method.h" #include "abstractdtypes.h" +/* TODO: Only for `NpyIter_GetTransferFlags` until it is public */ +#define NPY_ITERATOR_IMPLEMENTATION_CODE +#include "nditer_impl.h" + /********** PRINTF DEBUG TRACING **************/ #define NPY_UF_DBG_TRACING 0 @@ -1544,10 +1548,6 @@ execute_ufunc_loop(PyArrayMethod_Context *context, int masked, if (masked) { baseptrs[nop] = PyArray_BYTES(op_it[nop]); } - if (NpyIter_ResetBasePointers(iter, baseptrs, NULL) != NPY_SUCCEED) { - NpyIter_Deallocate(iter); - return -1; - } /* * Get the inner loop, with the possibility of specialization @@ -1584,17 +1584,25 @@ execute_ufunc_loop(PyArrayMethod_Context *context, int masked, char **dataptr = NpyIter_GetDataPtrArray(iter); npy_intp *strides = NpyIter_GetInnerStrideArray(iter); npy_intp *countptr = NpyIter_GetInnerLoopSizePtr(iter); - int needs_api = NpyIter_IterationNeedsAPI(iter); NPY_BEGIN_THREADS_DEF; + flags = PyArrayMethod_COMBINED_FLAGS(flags, NpyIter_GetTransferFlags(iter)); + if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { npy_clear_floatstatus_barrier((char *)context); } - if (!needs_api && !(flags & NPY_METH_REQUIRES_PYAPI)) { + if (!(flags & NPY_METH_REQUIRES_PYAPI)) { NPY_BEGIN_THREADS_THRESHOLDED(full_size); } + /* The reset may copy the first buffer chunk, which could cause FPEs */ + if (NpyIter_ResetBasePointers(iter, baseptrs, NULL) != NPY_SUCCEED) { + NPY_AUXDATA_FREE(auxdata); + NpyIter_Deallocate(iter); + return -1; + } + NPY_UF_DBG_PRINT("Actual inner loop:\n"); /* Execute the loop */ int res; @@ -2388,7 +2396,8 @@ PyUFunc_GeneralizedFunctionInternal(PyUFuncObject *ufunc, NPY_ITER_MULTI_INDEX | NPY_ITER_REFS_OK | NPY_ITER_ZEROSIZE_OK | - NPY_ITER_COPY_IF_OVERLAP; + NPY_ITER_COPY_IF_OVERLAP 
| + NPY_ITER_DELAY_BUFALLOC; /* Create the iterator */ iter = NpyIter_AdvancedNew(nop, op, iter_flags, diff --git a/numpy/core/src/umath/umathmodule.c b/numpy/core/src/umath/umathmodule.c index 49328d19e..17fedec6f 100644 --- a/numpy/core/src/umath/umathmodule.c +++ b/numpy/core/src/umath/umathmodule.c @@ -23,11 +23,13 @@ #include "numpy/npy_math.h" #include "number.h" #include "dispatching.h" +#include "string_ufuncs.h" /* Automatically generated code to define all ufuncs: */ #include "funcs.inc" #include "__umath_generated.c" + static PyUFuncGenericFunction pyfunc_functions[] = {PyUFunc_On_Om}; static int @@ -347,5 +349,10 @@ int initumath(PyObject *m) if (install_logical_ufunc_promoter(s) < 0) { return -1; } + + if (init_string_ufuncs(d) < 0) { + return -1; + } + return 0; } diff --git a/numpy/core/tests/test_abc.py b/numpy/core/tests/test_abc.py index 30e5748af..8b12d07ac 100644 --- a/numpy/core/tests/test_abc.py +++ b/numpy/core/tests/test_abc.py @@ -20,35 +20,35 @@ class TestABC: def test_floats(self): for t in sctypes['float']: assert_(isinstance(t(), numbers.Real), - "{0} is not instance of Real".format(t.__name__)) + f"{t.__name__} is not instance of Real") assert_(issubclass(t, numbers.Real), - "{0} is not subclass of Real".format(t.__name__)) + f"{t.__name__} is not subclass of Real") assert_(not isinstance(t(), numbers.Rational), - "{0} is instance of Rational".format(t.__name__)) + f"{t.__name__} is instance of Rational") assert_(not issubclass(t, numbers.Rational), - "{0} is subclass of Rational".format(t.__name__)) + f"{t.__name__} is subclass of Rational") def test_complex(self): for t in sctypes['complex']: assert_(isinstance(t(), numbers.Complex), - "{0} is not instance of Complex".format(t.__name__)) + f"{t.__name__} is not instance of Complex") assert_(issubclass(t, numbers.Complex), - "{0} is not subclass of Complex".format(t.__name__)) + f"{t.__name__} is not subclass of Complex") assert_(not isinstance(t(), numbers.Real), - "{0} is instance 
of Real".format(t.__name__)) + f"{t.__name__} is instance of Real") assert_(not issubclass(t, numbers.Real), - "{0} is subclass of Real".format(t.__name__)) + f"{t.__name__} is subclass of Real") def test_int(self): for t in sctypes['int']: assert_(isinstance(t(), numbers.Integral), - "{0} is not instance of Integral".format(t.__name__)) + f"{t.__name__} is not instance of Integral") assert_(issubclass(t, numbers.Integral), - "{0} is not subclass of Integral".format(t.__name__)) + f"{t.__name__} is not subclass of Integral") def test_uint(self): for t in sctypes['uint']: assert_(isinstance(t(), numbers.Integral), - "{0} is not instance of Integral".format(t.__name__)) + f"{t.__name__} is not instance of Integral") assert_(issubclass(t, numbers.Integral), - "{0} is not subclass of Integral".format(t.__name__)) + f"{t.__name__} is not subclass of Integral") diff --git a/numpy/core/tests/test_array_coercion.py b/numpy/core/tests/test_array_coercion.py index e858cd8b6..ed3ef7e67 100644 --- a/numpy/core/tests/test_array_coercion.py +++ b/numpy/core/tests/test_array_coercion.py @@ -373,28 +373,29 @@ class TestScalarDiscovery: assert discovered_dtype.itemsize == dtype.itemsize @pytest.mark.parametrize("dtype", np.typecodes["Integer"]) - def test_scalar_to_int_coerce_does_not_cast(self, dtype): + @pytest.mark.parametrize(["scalar", "error"], + [(np.float64(np.nan), ValueError), + (np.ulonglong(-1), OverflowError)]) + def test_scalar_to_int_coerce_does_not_cast(self, dtype, scalar, error): """ Signed integers are currently different in that they do not cast other NumPy scalar, but instead use scalar.__int__(). The hardcoded exception to this rule is `np.array(scalar, dtype=integer)`. """ dtype = np.dtype(dtype) - invalid_int = np.ulonglong(-1) - float_nan = np.float64(np.nan) - - for scalar in [float_nan, invalid_int]: - # This is a special case using casting logic and thus not failing: + # This is a special case using casting logic. 
It warns for the NaN + # but allows the cast (giving undefined behaviour). + with np.errstate(invalid="ignore"): coerced = np.array(scalar, dtype=dtype) cast = np.array(scalar).astype(dtype) - assert_array_equal(coerced, cast) + assert_array_equal(coerced, cast) - # However these fail: - with pytest.raises((ValueError, OverflowError)): - np.array([scalar], dtype=dtype) - with pytest.raises((ValueError, OverflowError)): - cast[()] = scalar + # However these fail: + with pytest.raises(error): + np.array([scalar], dtype=dtype) + with pytest.raises(error): + cast[()] = scalar class TestTimeScalars: @@ -614,8 +615,8 @@ class TestBadSequences: obj.append([2, 3]) obj.append(mylist([1, 2])) - with pytest.raises(RuntimeError): - np.array(obj) + # Does not crash: + np.array(obj) def test_replace_0d_array(self): # List to coerce, `mylist` will mutate the first element diff --git a/numpy/core/tests/test_casting_floatingpoint_errors.py b/numpy/core/tests/test_casting_floatingpoint_errors.py new file mode 100644 index 000000000..4fafc4ed8 --- /dev/null +++ b/numpy/core/tests/test_casting_floatingpoint_errors.py @@ -0,0 +1,153 @@ +import pytest +from pytest import param + +import numpy as np + + +def values_and_dtypes(): + """ + Generate value+dtype pairs that generate floating point errors during + casts. The invalid casts to integers will generate "invalid" value + warnings, the float casts all generate "overflow". + + (The Python int/float paths don't need to get tested in all the same + situations, but it does not hurt.) 
+ """ + # Casting to float16: + yield param(70000, "float16", id="int-to-f2") + yield param("70000", "float16", id="str-to-f2") + yield param(70000.0, "float16", id="float-to-f2") + yield param(np.longdouble(70000.), "float16", id="longdouble-to-f2") + yield param(np.float64(70000.), "float16", id="double-to-f2") + yield param(np.float32(70000.), "float16", id="float-to-f2") + # Casting to float32: + yield param(10**100, "float32", id="int-to-f4") + yield param(1e100, "float32", id="float-to-f2") + yield param(np.longdouble(1e300), "float32", id="longdouble-to-f2") + yield param(np.float64(1e300), "float32", id="double-to-f2") + # Casting to float64: + # If longdouble is double-double, its max can be rounded down to the double + # max. So we correct the double spacing (a bit weird, admittedly): + max_ld = np.finfo(np.longdouble).max + spacing = np.spacing(np.nextafter(np.finfo("f8").max, 0)) + if max_ld - spacing > np.finfo("f8").max: + yield param(np.finfo(np.longdouble).max, "float64", + id="longdouble-to-f8") + + # Cast to complex64: + yield param(2e300, "complex64", id="float-to-c8") + yield param(2e300+0j, "complex64", id="complex-to-c8") + yield param(2e300j, "complex64", id="complex-to-c8") + yield param(np.longdouble(2e300), "complex64", id="longdouble-to-c8") + + # Invalid float to integer casts: + with np.errstate(over="ignore"): + for to_dt in np.typecodes["AllInteger"]: + for value in [np.inf, np.nan]: + for from_dt in np.typecodes["AllFloat"]: + from_dt = np.dtype(from_dt) + from_val = from_dt.type(value) + + yield param(from_val, to_dt, id=f"{from_val}-to-{to_dt}") + + +def check_operations(dtype, value): + """ + There are many dedicated paths in NumPy which cast and should check for + floating point errors which occurred during those casts.
+ """ + if dtype.kind != 'i': + # These assignments use the stricter setitem logic: + def assignment(): + arr = np.empty(3, dtype=dtype) + arr[0] = value + + yield assignment + + def fill(): + arr = np.empty(3, dtype=dtype) + arr.fill(value) + + yield fill + + def copyto_scalar(): + arr = np.empty(3, dtype=dtype) + np.copyto(arr, value, casting="unsafe") + + yield copyto_scalar + + def copyto(): + arr = np.empty(3, dtype=dtype) + np.copyto(arr, np.array([value, value, value]), casting="unsafe") + + yield copyto + + def copyto_scalar_masked(): + arr = np.empty(3, dtype=dtype) + np.copyto(arr, value, casting="unsafe", + where=[True, False, True]) + + yield copyto_scalar_masked + + def copyto_masked(): + arr = np.empty(3, dtype=dtype) + np.copyto(arr, np.array([value, value, value]), casting="unsafe", + where=[True, False, True]) + + yield copyto_masked + + def direct_cast(): + np.array([value, value, value]).astype(dtype) + + yield direct_cast + + def direct_cast_nd_strided(): + arr = np.full((5, 5, 5), fill_value=value)[:, ::2, :] + arr.astype(dtype) + + yield direct_cast_nd_strided + + def boolean_array_assignment(): + arr = np.empty(3, dtype=dtype) + arr[[True, False, True]] = np.array([value, value]) + + yield boolean_array_assignment + + def integer_array_assignment(): + arr = np.empty(3, dtype=dtype) + values = np.array([value, value]) + + arr[[0, 1]] = values + + yield integer_array_assignment + + def integer_array_assignment_with_subspace(): + arr = np.empty((5, 3), dtype=dtype) + values = np.array([value, value, value]) + + arr[[0, 2]] = values + + yield integer_array_assignment_with_subspace + + def flat_assignment(): + arr = np.empty((3,), dtype=dtype) + values = np.array([value, value, value]) + arr.flat[:] = values + + yield flat_assignment + +@pytest.mark.parametrize(["value", "dtype"], values_and_dtypes()) +@pytest.mark.filterwarnings("ignore::numpy.ComplexWarning") +def test_floatingpoint_errors_casting(dtype, value): + dtype = np.dtype(dtype) + for 
operation in check_operations(dtype, value): + dtype = np.dtype(dtype) + + match = "invalid" if dtype.kind in 'iu' else "overflow" + with pytest.warns(RuntimeWarning, match=match): + operation() + + with np.errstate(all="raise"): + with pytest.raises(FloatingPointError, match=match): + operation() + diff --git a/numpy/core/tests/test_deprecations.py b/numpy/core/tests/test_deprecations.py index 2b7864433..2255cb2a3 100644 --- a/numpy/core/tests/test_deprecations.py +++ b/numpy/core/tests/test_deprecations.py @@ -166,7 +166,7 @@ class TestComparisonDeprecations(_DeprecationTestCase): # For two string arrays, strings always raised the broadcasting error: a = np.array(['a', 'b']) b = np.array(['a', 'b', 'c']) - assert_raises(ValueError, lambda x, y: x == y, a, b) + assert_warns(FutureWarning, lambda x, y: x == y, a, b) # The empty list is not cast to string, and this used to pass due # to dtype mismatch; now (2018-06-21) it correctly leads to a diff --git a/numpy/core/tests/test_dtype.py b/numpy/core/tests/test_dtype.py index 32e2c6842..b37bded73 100644 --- a/numpy/core/tests/test_dtype.py +++ b/numpy/core/tests/test_dtype.py @@ -1346,6 +1346,16 @@ class TestPromotion: match=r".* no common DType exists for the given inputs"): np.result_type(1j, rational(1, 2)) + @pytest.mark.parametrize("val", [2, 2**32, 2**63, 2**64, 2*100]) + def test_python_integer_promotion(self, val): + # If we only pass scalars (mainly python ones!), the result must take + # into account that the integer may be considered int32, int64, uint64, + # or object depending on the input value. So test those paths!
+ expected_dtype = np.result_type(np.array(val).dtype, np.array(0).dtype) + assert np.result_type(val, 0) == expected_dtype + # For completeness sake, also check with a NumPy scalar as second arg: + assert np.result_type(val, np.int8(0)) == expected_dtype + @pytest.mark.parametrize(["other", "expected"], [(1, rational), (1., np.float64)]) def test_float_int_pyscalar_promote_rational(self, other, expected): diff --git a/numpy/core/tests/test_half.py b/numpy/core/tests/test_half.py index 1b6fd21e1..6743dfb51 100644 --- a/numpy/core/tests/test_half.py +++ b/numpy/core/tests/test_half.py @@ -104,9 +104,9 @@ class TestHalf: # Increase the float by a minimal value: if offset == "up": - f16s_float = np.nextafter(f16s_float, float_t(1e50)) + f16s_float = np.nextafter(f16s_float, float_t(np.inf)) elif offset == "down": - f16s_float = np.nextafter(f16s_float, float_t(-1e50)) + f16s_float = np.nextafter(f16s_float, float_t(-np.inf)) # Convert back to float16 and its bit pattern: res_patterns = f16s_float.astype(np.float16).view(np.uint16) @@ -233,12 +233,14 @@ class TestHalf: np.inf] # Check float64->float16 rounding - b = np.array(a, dtype=float16) + with np.errstate(over="ignore"): + b = np.array(a, dtype=float16) assert_equal(b, rounded) # Check float32->float16 rounding a = np.array(a, dtype=float32) - b = np.array(a, dtype=float16) + with np.errstate(over="ignore"): + b = np.array(a, dtype=float16) assert_equal(b, rounded) def test_half_correctness(self): diff --git a/numpy/core/tests/test_indexing.py b/numpy/core/tests/test_indexing.py index efcb92c2e..9ef30eae2 100644 --- a/numpy/core/tests/test_indexing.py +++ b/numpy/core/tests/test_indexing.py @@ -1297,11 +1297,10 @@ class TestBooleanIndexing: def test_boolean_indexing_weirdness(self): # Weird boolean indexing things a = np.ones((2, 3, 4)) - a[False, True, ...].shape == (0, 2, 3, 4) - a[True, [0, 1], True, True, [1], [[2]]] == (1, 2) + assert a[False, True, ...].shape == (0, 2, 3, 4) + assert a[True, [0, 1], True, 
True, [1], [[2]]].shape == (1, 2) assert_raises(IndexError, lambda: a[False, [0, 1], ...]) - def test_boolean_indexing_fast_path(self): # These used to either give the wrong error, or incorrectly give no # error. diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py index f4454130d..84fdf545f 100644 --- a/numpy/core/tests/test_multiarray.py +++ b/numpy/core/tests/test_multiarray.py @@ -68,8 +68,8 @@ def _aligned_zeros(shape, dtype=float, order="C", align=None): # Note: slices producing 0-size arrays do not necessarily change # data pointer --- so we use and allocate size+1 buf = buf[offset:offset+size+1][:-1] + buf.fill(0) data = np.ndarray(shape, dtype, buf, order=order) - data.fill(0) return data @@ -1244,6 +1244,18 @@ class TestStructured: # The main importance is that it does not return True: with pytest.raises(TypeError): x == y + + def test_empty_structured_array_comparison(self): + # Check that comparison works on empty arrays with nontrivially + # shaped fields + a = np.zeros(0, [('a', '<f8', (1, 1))]) + assert_equal(a, a) + a = np.zeros(0, [('a', '<f8', (1,))]) + assert_equal(a, a) + a = np.zeros((0, 0), [('a', '<f8', (1, 1))]) + assert_equal(a, a) + a = np.zeros((1, 0, 1), [('a', '<f8', (1, 1))]) + assert_equal(a, a) def test_structured_comparisons_with_promotion(self): # Check that structured arrays can be compared so long as their diff --git a/numpy/core/tests/test_numeric.py b/numpy/core/tests/test_numeric.py index 0b03c6576..5b15e29b4 100644 --- a/numpy/core/tests/test_numeric.py +++ b/numpy/core/tests/test_numeric.py @@ -2939,7 +2939,9 @@ class TestLikeFuncs: self.check_like_function(np.full_like, 1, True) self.check_like_function(np.full_like, 1000, True) self.check_like_function(np.full_like, 123.456, True) - self.check_like_function(np.full_like, np.inf, True) + # Inf to integer casts cause invalid-value errors: ignore them. 
+ with np.errstate(invalid="ignore"): + self.check_like_function(np.full_like, np.inf, True) @pytest.mark.parametrize('likefunc', [np.empty_like, np.full_like, np.zeros_like, np.ones_like]) diff --git a/numpy/core/tests/test_overrides.py b/numpy/core/tests/test_overrides.py index 36970dbc0..e68406ebd 100644 --- a/numpy/core/tests/test_overrides.py +++ b/numpy/core/tests/test_overrides.py @@ -355,6 +355,45 @@ class TestArrayFunctionImplementation: TypeError, "no implementation found for 'my.func'"): func(MyArray()) + def test_signature_error_message(self): + # The lambda function will be named "<lambda>", but the TypeError + # should show the name as "func" + def _dispatcher(): + return () + + @array_function_dispatch(_dispatcher) + def func(): + pass + + try: + func(bad_arg=3) + except TypeError as e: + expected_exception = e + + try: + func(bad_arg=3) + raise AssertionError("must fail") + except TypeError as exc: + assert exc.args == expected_exception.args + + @pytest.mark.parametrize("value", [234, "this func is not replaced"]) + def test_dispatcher_error(self, value): + # If the dispatcher raises an error, we must not attempt to mutate it + error = TypeError(value) + + def dispatcher(): + raise error + + @array_function_dispatch(dispatcher) + def func(): + return 3 + + try: + func() + raise AssertionError("must fail") + except TypeError as exc: + assert exc is error # unmodified exception + class TestNDArrayMethods: diff --git a/numpy/core/tests/test_regression.py b/numpy/core/tests/test_regression.py index 98e0df9b8..4538c825d 100644 --- a/numpy/core/tests/test_regression.py +++ b/numpy/core/tests/test_regression.py @@ -326,20 +326,20 @@ class TestRegression: assert_raises(ValueError, bfa) assert_raises(ValueError, bfb) - def test_nonarray_assignment(self): + @pytest.mark.parametrize("index", + [np.ones(10, dtype=bool), np.arange(10)], + ids=["boolean-arr-index", "integer-arr-index"]) + def test_nonarray_assignment(self, index): # See also Issue gh-2870, test 
for non-array assignment # and equivalent unsafe casted array assignment a = np.arange(10) - b = np.ones(10, dtype=bool) - r = np.arange(10) - def assign(a, b, c): - a[b] = c + with pytest.raises(ValueError): + a[index] = np.nan - assert_raises(ValueError, assign, a, b, np.nan) - a[b] = np.array(np.nan) # but not this. - assert_raises(ValueError, assign, a, r, np.nan) - a[r] = np.array(np.nan) + with np.errstate(invalid="warn"): + with pytest.warns(RuntimeWarning, match="invalid value"): + a[index] = np.array(np.nan) # Only warns def test_unpickle_dtype_with_object(self): # Implemented in r2840 @@ -1496,7 +1496,7 @@ class TestRegression: min = np.array([np.iinfo(t).min]) min //= -1 - with np.errstate(divide="ignore"): + with np.errstate(over="ignore"): for t in (np.int8, np.int16, np.int32, np.int64, int): test_type(t) diff --git a/numpy/core/tests/test_scalarmath.py b/numpy/core/tests/test_scalarmath.py index b7fe5183e..8b14284ff 100644 --- a/numpy/core/tests/test_scalarmath.py +++ b/numpy/core/tests/test_scalarmath.py @@ -683,8 +683,12 @@ class TestNegative: sup.filter(RuntimeWarning) for dt in types: a = np.ones((), dtype=dt)[()] - assert_equal(operator.neg(a) + a, 0) - + if dt in np.typecodes['UnsignedInteger']: + st = np.dtype(dt).type + max = st(np.iinfo(dt).max) + assert_equal(operator.neg(a), max) + else: + assert_equal(operator.neg(a) + a, 0) class TestSubtract: def test_exceptions(self): @@ -896,9 +900,13 @@ def test_scalar_integer_operation_overflow(dtype, operation): @pytest.mark.parametrize("dtype", np.typecodes["Integer"]) @pytest.mark.parametrize("operation", [ + lambda min, neg_1: -min, lambda min, neg_1: abs(min), - lambda min, neg_1: min * neg_1, - lambda min, neg_1: min // neg_1], ids=["abs", "*", "//"]) + pytest.param(lambda min, neg_1: min * neg_1, + marks=pytest.mark.xfail(reason="broken on some platforms")), + pytest.param(lambda min, neg_1: min // neg_1, + marks=pytest.mark.skip(reason="broken on some platforms"))], + ids=["neg", "abs", "*", 
"//"]) def test_scalar_signed_integer_overflow(dtype, operation): # The minimum signed integer can "overflow" for some additional operations st = np.dtype(dtype).type @@ -910,8 +918,7 @@ def test_scalar_signed_integer_overflow(dtype, operation): @pytest.mark.parametrize("dtype", np.typecodes["UnsignedInteger"]) -@pytest.mark.xfail # TODO: the check is quite simply missing! -def test_scalar_signed_integer_overflow(dtype): +def test_scalar_unsigned_integer_overflow(dtype): val = np.dtype(dtype).type(8) with pytest.warns(RuntimeWarning, match="overflow encountered"): -val diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py index 324948cf2..c4488533a 100644 --- a/numpy/core/tests/test_simd.py +++ b/numpy/core/tests/test_simd.py @@ -85,16 +85,13 @@ class _Test_Utility: return getattr(self.npyv, cvt_intrin.format(sfx[1:], sfx))(vector) def _pinfinity(self): - v = self.npyv.setall_u32(0x7f800000) - return self.npyv.reinterpret_f32_u32(v)[0] + return float("inf") def _ninfinity(self): - v = self.npyv.setall_u32(0xff800000) - return self.npyv.reinterpret_f32_u32(v)[0] + return -float("inf") def _nan(self): - v = self.npyv.setall_u32(0x7fc00000) - return self.npyv.reinterpret_f32_u32(v)[0] + return float("nan") def _cpu_features(self): target = self.target_name @@ -170,8 +167,9 @@ class _SIMD_BOOL(_Test_Utility): for data in (self._data(), self._data(reverse=True)): vdata = self._load_b(data) data_bits = data2bits(data) - tobits = bin(self.tobits(vdata)) - assert tobits == bin(data_bits) + tobits = self.tobits(vdata) + bin_tobits = bin(tobits) + assert bin_tobits == bin(data_bits) def test_pack(self): """ @@ -746,9 +744,11 @@ class _SIMD_ALL(_Test_Utility): # We're testing the sanity of _simd's type-vector, # reinterpret* intrinsics itself are tested via compiler # during the build of _simd module - sfxes = ["u8", "s8", "u16", "s16", "u32", "s32", "u64", "s64", "f32"] + sfxes = ["u8", "s8", "u16", "s16", "u32", "s32", "u64", "s64"] if 
self.npyv.simd_f64: sfxes.append("f64") + if self.npyv.simd_f32: + sfxes.append("f32") for sfx in sfxes: vec_name = getattr(self, "reinterpret_" + sfx)(vdata_a).__name__ assert vec_name == "npyv_" + sfx @@ -1077,8 +1077,13 @@ for target_name, npyv in targets.items(): skip = f"target '{pretty_name}' isn't supported by current machine" elif not npyv.simd: skip = f"target '{pretty_name}' isn't supported by NPYV" - elif not npyv.simd_f64: - skip_sfx["f64"] = f"target '{pretty_name}' doesn't support double-precision" + else: + if not npyv.simd_f32: + skip_sfx["f32"] = f"target '{pretty_name}' "\ + "doesn't support single-precision" + if not npyv.simd_f64: + skip_sfx["f64"] = f"target '{pretty_name}' doesn't"\ + "support double-precision" for sfxes, cls in tests_registry.items(): for sfx in sfxes: diff --git a/numpy/core/tests/test_simd_module.py b/numpy/core/tests/test_simd_module.py index 3d710884a..44dc58dac 100644 --- a/numpy/core/tests/test_simd_module.py +++ b/numpy/core/tests/test_simd_module.py @@ -12,7 +12,9 @@ npyv, npyv2 = (npyvs + [None, None])[:2] unsigned_sfx = ["u8", "u16", "u32", "u64"] signed_sfx = ["s8", "s16", "s32", "s64"] -fp_sfx = ["f32"] +fp_sfx = [] +if npyv and npyv.simd_f32: + fp_sfx.append("f32") if npyv and npyv.simd_f64: fp_sfx.append("f64") diff --git a/numpy/core/tests/test_strings.py b/numpy/core/tests/test_strings.py new file mode 100644 index 000000000..2b87ed654 --- /dev/null +++ b/numpy/core/tests/test_strings.py @@ -0,0 +1,85 @@ +import pytest + +import operator +import numpy as np + +from numpy.testing import assert_array_equal + + +COMPARISONS = [ + (operator.eq, np.equal, "=="), + (operator.ne, np.not_equal, "!="), + (operator.lt, np.less, "<"), + (operator.le, np.less_equal, "<="), + (operator.gt, np.greater, ">"), + (operator.ge, np.greater_equal, ">="), +] + + +@pytest.mark.parametrize(["op", "ufunc", "sym"], COMPARISONS) +def test_mixed_string_comparison_ufuncs_fail(op, ufunc, sym): + arr_string = np.array(["a", "b"], 
dtype="S") + arr_unicode = np.array(["a", "c"], dtype="U") + + with pytest.raises(TypeError, match="did not contain a loop"): + ufunc(arr_string, arr_unicode) + + with pytest.raises(TypeError, match="did not contain a loop"): + ufunc(arr_unicode, arr_string) + +@pytest.mark.parametrize(["op", "ufunc", "sym"], COMPARISONS) +def test_mixed_string_comparisons_ufuncs_with_cast(op, ufunc, sym): + arr_string = np.array(["a", "b"], dtype="S") + arr_unicode = np.array(["a", "c"], dtype="U") + + # While there is no loop, manual casting is acceptable: + res1 = ufunc(arr_string, arr_unicode, signature="UU->?", casting="unsafe") + res2 = ufunc(arr_string, arr_unicode, signature="SS->?", casting="unsafe") + + expected = op(arr_string.astype('U'), arr_unicode) + assert_array_equal(res1, expected) + assert_array_equal(res2, expected) + + +@pytest.mark.parametrize(["op", "ufunc", "sym"], COMPARISONS) +@pytest.mark.parametrize("dtypes", [ + ("S2", "S2"), ("S2", "S10"), + ("<U1", "<U1"), ("<U1", ">U1"), (">U1", ">U1"), + ("<U1", "<U10"), ("<U1", ">U10")]) +@pytest.mark.parametrize("aligned", [True, False]) +def test_string_comparisons(op, ufunc, sym, dtypes, aligned): + # ensure native byte-order for the first view to stay within unicode range + native_dt = np.dtype(dtypes[0]).newbyteorder("=") + arr = np.arange(2**15).view(native_dt).astype(dtypes[0]) + if not aligned: + # Make `arr` unaligned: + new = np.zeros(arr.nbytes + 1, dtype=np.uint8)[1:].view(dtypes[0]) + new[...] 
= arr + arr = new + + arr2 = arr.astype(dtypes[1], copy=True) + np.random.shuffle(arr2) + arr[0] = arr2[0] # make sure one matches + + expected = [op(d1, d2) for d1, d2 in zip(arr.tolist(), arr2.tolist())] + assert_array_equal(op(arr, arr2), expected) + assert_array_equal(ufunc(arr, arr2), expected) + assert_array_equal(np.compare_chararrays(arr, arr2, sym, False), expected) + + expected = [op(d2, d1) for d1, d2 in zip(arr.tolist(), arr2.tolist())] + assert_array_equal(op(arr2, arr), expected) + assert_array_equal(ufunc(arr2, arr), expected) + assert_array_equal(np.compare_chararrays(arr2, arr, sym, False), expected) + + +@pytest.mark.parametrize(["op", "ufunc", "sym"], COMPARISONS) +@pytest.mark.parametrize("dtypes", [ + ("S2", "S2"), ("S2", "S10"), ("<U1", "<U1"), ("<U1", ">U10")]) +def test_string_comparisons_empty(op, ufunc, sym, dtypes): + arr = np.empty((1, 0, 1, 5), dtype=dtypes[0]) + arr2 = np.empty((100, 1, 0, 1), dtype=dtypes[1]) + + expected = np.empty(np.broadcast_shapes(arr.shape, arr2.shape), dtype=bool) + assert_array_equal(op(arr, arr2), expected) + assert_array_equal(ufunc(arr, arr2), expected) + assert_array_equal(np.compare_chararrays(arr, arr2, sym, False), expected) diff --git a/numpy/core/tests/test_ufunc.py b/numpy/core/tests/test_ufunc.py index 852044d32..3466178a3 100644 --- a/numpy/core/tests/test_ufunc.py +++ b/numpy/core/tests/test_ufunc.py @@ -620,8 +620,9 @@ class TestUfunc: atol = max(np.finfo(dtout).tiny, 3e-308) else: atol = 3e-308 - # Some test values result in invalid for float16. - with np.errstate(invalid='ignore'): + # Some test values result in invalid for float16 + # and the cast to it may overflow to inf. 
+ with np.errstate(invalid='ignore', over='ignore'): res = np.true_divide(x, y, dtype=dtout) if not np.isfinite(res) and tcout == 'e': continue @@ -665,20 +666,22 @@ class TestUfunc: for dt in (int, np.float16, np.float32, np.float64, np.longdouble): for v in (0, 1, 2, 7, 8, 9, 15, 16, 19, 127, 128, 1024, 1235): - tgt = dt(v * (v + 1) / 2) - d = np.arange(1, v + 1, dtype=dt) - # warning if sum overflows, which it does in float16 - overflow = not np.isfinite(tgt) - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - assert_almost_equal(np.sum(d), tgt) + warnings.simplefilter("always", RuntimeWarning) + + tgt = dt(v * (v + 1) / 2) + overflow = not np.isfinite(tgt) assert_equal(len(w), 1 * overflow) - assert_almost_equal(np.sum(d[::-1]), tgt) + d = np.arange(1, v + 1, dtype=dt) + + assert_almost_equal(np.sum(d), tgt) assert_equal(len(w), 2 * overflow) + assert_almost_equal(np.sum(d[::-1]), tgt) + assert_equal(len(w), 3 * overflow) + d = np.ones(500, dtype=dt) assert_almost_equal(np.sum(d[::2]), 250.) assert_almost_equal(np.sum(d[1::2]), 250.) @@ -2454,7 +2457,7 @@ def test_ufunc_warn_with_nan(ufunc): @pytest.mark.skipif(not HAS_REFCOUNT, reason="Python lacks refcounts") -def test_ufunc_casterrors(): +def test_ufunc_out_casterrors(): # Tests that casting errors are correctly reported and buffers are # cleared. 
# The following array can be added to itself as an object array, but @@ -2485,6 +2488,28 @@ def test_ufunc_casterrors(): assert out[-1] == 1 +@pytest.mark.parametrize("bad_offset", [0, int(np.BUFSIZE * 1.5)]) +def test_ufunc_input_casterrors(bad_offset): + value = 123 + arr = np.array([value] * bad_offset + + ["string"] + + [value] * int(1.5 * np.BUFSIZE), dtype=object) + with pytest.raises(ValueError): + # Force cast inputs, but the buffered cast of `arr` to intp fails: + np.add(arr, arr, dtype=np.intp, casting="unsafe") + + +@pytest.mark.parametrize("bad_offset", [0, int(np.BUFSIZE * 1.5)]) +def test_ufunc_input_floatingpoint_error(bad_offset): + value = 123 + arr = np.array([value] * bad_offset + + [np.nan] + + [value] * int(1.5 * np.BUFSIZE)) + with np.errstate(invalid="raise"), pytest.raises(FloatingPointError): + # Force cast inputs, but the buffered cast of `arr` to intp fails: + np.add(arr, arr, dtype=np.intp, casting="unsafe") + + def test_trivial_loop_invalid_cast(): # This tests the fast-path "invalid cast", see gh-19904. 
with pytest.raises(TypeError, diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py index 7b6e2ee92..a696fceb8 100644 --- a/numpy/core/tests/test_umath.py +++ b/numpy/core/tests/test_umath.py @@ -327,7 +327,9 @@ class TestDivision: a_lst, b_lst = a.tolist(), b.tolist() c_div = lambda n, d: ( - 0 if d == 0 or (n and n == fo.min and d == -1) else n//d + 0 if d == 0 else ( + fo.min if (n and n == fo.min and d == -1) else n//d + ) ) with np.errstate(divide='ignore'): ac = a.copy() @@ -342,7 +344,7 @@ class TestDivision: for divisor in divisors: ac = a.copy() - with np.errstate(divide='ignore'): + with np.errstate(divide='ignore', over='ignore'): div_a = a // divisor ac //= divisor div_lst = [c_div(i, divisor) for i in a_lst] @@ -350,21 +352,25 @@ class TestDivision: assert all(div_a == div_lst), msg assert all(ac == div_lst), msg_eq - with np.errstate(divide='raise'): - if 0 in b or (fo.min and -1 in b and fo.min in a): + with np.errstate(divide='raise', over='raise'): + if 0 in b: # Verify overflow case - with pytest.raises(FloatingPointError): + with pytest.raises(FloatingPointError, + match="divide by zero encountered in floor_divide"): a // b else: a // b if fo.min and fo.min in a: - with pytest.raises(FloatingPointError): + with pytest.raises(FloatingPointError, + match='overflow encountered in floor_divide'): a // -1 elif fo.min: a // -1 - with pytest.raises(FloatingPointError): + with pytest.raises(FloatingPointError, + match="divide by zero encountered in floor_divide"): a // 0 - with pytest.raises(FloatingPointError): + with pytest.raises(FloatingPointError, + match="divide by zero encountered in floor_divide"): ac = a.copy() ac //= 0 @@ -392,11 +398,13 @@ class TestDivision: msg = "Reduce floor integer division check" assert div_a == div_lst, msg - with np.errstate(divide='raise'): - with pytest.raises(FloatingPointError): + with np.errstate(divide='raise', over='raise'): + with pytest.raises(FloatingPointError, + match="divide by zero 
encountered in reduce"): np.floor_divide.reduce(np.arange(-100, 100, dtype=dtype)) if fo.min: - with pytest.raises(FloatingPointError): + with pytest.raises(FloatingPointError, + match='overflow encountered in reduce'): np.floor_divide.reduce( np.array([fo.min, 1, -1], dtype=dtype) ) diff --git a/numpy/core/tests/test_unicode.py b/numpy/core/tests/test_unicode.py index 8e0dd47cb..12de25771 100644 --- a/numpy/core/tests/test_unicode.py +++ b/numpy/core/tests/test_unicode.py @@ -1,3 +1,5 @@ +import pytest + import numpy as np from numpy.testing import assert_, assert_equal, assert_array_equal @@ -33,8 +35,11 @@ def test_string_cast(): uni_arr1 = str_arr.astype('>U') uni_arr2 = str_arr.astype('<U') - assert_(str_arr != uni_arr1) - assert_(str_arr != uni_arr2) + with pytest.warns(FutureWarning): + assert str_arr != uni_arr1 + with pytest.warns(FutureWarning): + assert str_arr != uni_arr2 + assert_array_equal(uni_arr1, uni_arr2) diff --git a/numpy/distutils/ccompiler_opt.py b/numpy/distutils/ccompiler_opt.py index befc83c16..2019dcb25 100644 --- a/numpy/distutils/ccompiler_opt.py +++ b/numpy/distutils/ccompiler_opt.py @@ -955,51 +955,57 @@ class _CCompiler: def __init__(self): if hasattr(self, "cc_is_cached"): return - # attr regex + # attr regex compiler-expression detect_arch = ( - ("cc_on_x64", ".*(x|x86_|amd)64.*"), - ("cc_on_x86", ".*(win32|x86|i386|i686).*"), - ("cc_on_ppc64le", ".*(powerpc|ppc)64(el|le).*"), - ("cc_on_ppc64", ".*(powerpc|ppc)64.*"), - ("cc_on_aarch64", ".*(aarch64|arm64).*"), - ("cc_on_armhf", ".*arm.*"), - ("cc_on_s390x", ".*s390x.*"), + ("cc_on_x64", ".*(x|x86_|amd)64.*", ""), + ("cc_on_x86", ".*(win32|x86|i386|i686).*", ""), + ("cc_on_ppc64le", ".*(powerpc|ppc)64(el|le).*", ""), + ("cc_on_ppc64", ".*(powerpc|ppc)64.*", ""), + ("cc_on_aarch64", ".*(aarch64|arm64).*", ""), + ("cc_on_armhf", ".*arm.*", "defined(__ARM_ARCH_7__) || " + "defined(__ARM_ARCH_7A__)"), + ("cc_on_s390x", ".*s390x.*", ""), # undefined platform - ("cc_on_noarch", ""), + 
("cc_on_noarch", "", ""), ) detect_compiler = ( - ("cc_is_gcc", r".*(gcc|gnu\-g).*"), - ("cc_is_clang", ".*clang.*"), - ("cc_is_iccw", ".*(intelw|intelemw|iccw).*"), # intel msvc like - ("cc_is_icc", ".*(intel|icc).*"), # intel unix like - ("cc_is_msvc", ".*msvc.*"), + ("cc_is_gcc", r".*(gcc|gnu\-g).*", ""), + ("cc_is_clang", ".*clang.*", ""), + # intel msvc like + ("cc_is_iccw", ".*(intelw|intelemw|iccw).*", ""), + ("cc_is_icc", ".*(intel|icc).*", ""), # intel unix like + ("cc_is_msvc", ".*msvc.*", ""), # undefined compiler will be treat it as gcc - ("cc_is_nocc", ""), + ("cc_is_nocc", "", ""), ) detect_args = ( - ("cc_has_debug", ".*(O0|Od|ggdb|coverage|debug:full).*"), - ("cc_has_native", ".*(-march=native|-xHost|/QxHost).*"), + ("cc_has_debug", ".*(O0|Od|ggdb|coverage|debug:full).*", ""), + ("cc_has_native", ".*(-march=native|-xHost|/QxHost).*", ""), # in case if the class run with -DNPY_DISABLE_OPTIMIZATION - ("cc_noopt", ".*DISABLE_OPT.*"), + ("cc_noopt", ".*DISABLE_OPT.*", ""), ) dist_info = self.dist_info() platform, compiler_info, extra_args = dist_info # set False to all attrs for section in (detect_arch, detect_compiler, detect_args): - for attr, rgex in section: + for attr, rgex, cexpr in section: setattr(self, attr, False) for detect, searchin in ((detect_arch, platform), (detect_compiler, compiler_info)): - for attr, rgex in detect: + for attr, rgex, cexpr in detect: if rgex and not re.match(rgex, searchin, re.IGNORECASE): continue + if cexpr and not self.cc_test_cexpr(cexpr): + continue setattr(self, attr, True) break - for attr, rgex in detect_args: + for attr, rgex, cexpr in detect_args: if rgex and not re.match(rgex, extra_args, re.IGNORECASE): continue + if cexpr and not self.cc_test_cexpr(cexpr): + continue setattr(self, attr, True) if self.cc_on_noarch: @@ -1071,6 +1077,25 @@ class _CCompiler: self.dist_log("testing failed", stderr=True) return test + @_Cache.me + def cc_test_cexpr(self, cexpr, flags=[]): + """ + Same as the above but supports 
compile-time expressions. + """ + self.dist_log("testing compiler expression", cexpr) + test_path = os.path.join(self.conf_tmp_path, "npy_dist_test_cexpr.c") + with open(test_path, "w") as fd: + fd.write(textwrap.dedent(f"""\ + #if !({cexpr}) + #error "unsupported expression" + #endif + int dummy; + """)) + test = self.dist_test(test_path, flags) + if not test: + self.dist_log("testing failed", stderr=True) + return test + def cc_normalize_flags(self, flags): """ Remove the conflicts that caused due gathering implied features flags. diff --git a/numpy/distutils/checks/cpu_asimd.c b/numpy/distutils/checks/cpu_asimd.c index 8df556b6c..6bc9022a5 100644 --- a/numpy/distutils/checks/cpu_asimd.c +++ b/numpy/distutils/checks/cpu_asimd.c @@ -3,9 +3,10 @@ #endif #include <arm_neon.h> -int main(void) +int main(int argc, char **argv) { - float32x4_t v1 = vdupq_n_f32(1.0f), v2 = vdupq_n_f32(2.0f); + float *src = (float*)argv[argc-1]; + float32x4_t v1 = vdupq_n_f32(src[0]), v2 = vdupq_n_f32(src[1]); /* MAXMIN */ int ret = (int)vgetq_lane_f32(vmaxnmq_f32(v1, v2), 0); ret += (int)vgetq_lane_f32(vminnmq_f32(v1, v2), 0); @@ -13,7 +14,8 @@ int main(void) ret += (int)vgetq_lane_f32(vrndq_f32(v1), 0); #ifdef __aarch64__ { - float64x2_t vd1 = vdupq_n_f64(1.0), vd2 = vdupq_n_f64(2.0); + double *src2 = (double*)argv[argc-1]; + float64x2_t vd1 = vdupq_n_f64(src2[0]), vd2 = vdupq_n_f64(src2[1]); /* MAXMIN */ ret += (int)vgetq_lane_f64(vmaxnmq_f64(vd1, vd2), 0); ret += (int)vgetq_lane_f64(vminnmq_f64(vd1, vd2), 0); diff --git a/numpy/distutils/checks/cpu_asimddp.c b/numpy/distutils/checks/cpu_asimddp.c index 0158d1354..e7068ce02 100644 --- a/numpy/distutils/checks/cpu_asimddp.c +++ b/numpy/distutils/checks/cpu_asimddp.c @@ -3,9 +3,10 @@ #endif #include <arm_neon.h> -int main(void) +int main(int argc, char **argv) { - uint8x16_t v1 = vdupq_n_u8((unsigned char)1), v2 = vdupq_n_u8((unsigned char)2); + unsigned char *src = (unsigned char*)argv[argc-1]; + uint8x16_t v1 = vdupq_n_u8(src[0]), v2 = 
vdupq_n_u8(src[1]); uint32x4_t va = vdupq_n_u32(3); int ret = (int)vgetq_lane_u32(vdotq_u32(va, v1, v2), 0); #ifdef __aarch64__ diff --git a/numpy/distutils/checks/cpu_asimdfhm.c b/numpy/distutils/checks/cpu_asimdfhm.c index cb49751c4..54e328098 100644 --- a/numpy/distutils/checks/cpu_asimdfhm.c +++ b/numpy/distutils/checks/cpu_asimdfhm.c @@ -3,12 +3,14 @@ #endif #include <arm_neon.h> -int main(void) +int main(int argc, char **argv) { - float16x8_t vhp = vdupq_n_f16((float16_t)1); - float16x4_t vlhp = vdup_n_f16((float16_t)1); - float32x4_t vf = vdupq_n_f32(1.0f); - float32x2_t vlf = vdup_n_f32(1.0f); + float16_t *src = (float16_t*)argv[argc-1]; + float *src2 = (float*)argv[argc-2]; + float16x8_t vhp = vdupq_n_f16(src[0]); + float16x4_t vlhp = vdup_n_f16(src[1]); + float32x4_t vf = vdupq_n_f32(src2[0]); + float32x2_t vlf = vdup_n_f32(src2[1]); int ret = (int)vget_lane_f32(vfmlal_low_f16(vlf, vlhp, vlhp), 0); ret += (int)vgetq_lane_f32(vfmlslq_high_f16(vf, vhp, vhp), 0); diff --git a/numpy/distutils/checks/cpu_asimdhp.c b/numpy/distutils/checks/cpu_asimdhp.c index 80b94000f..e2de0306e 100644 --- a/numpy/distutils/checks/cpu_asimdhp.c +++ b/numpy/distutils/checks/cpu_asimdhp.c @@ -3,10 +3,11 @@ #endif #include <arm_neon.h> -int main(void) +int main(int argc, char **argv) { - float16x8_t vhp = vdupq_n_f16((float16_t)-1); - float16x4_t vlhp = vdup_n_f16((float16_t)-1); + float16_t *src = (float16_t*)argv[argc-1]; + float16x8_t vhp = vdupq_n_f16(src[0]); + float16x4_t vlhp = vdup_n_f16(src[1]); int ret = (int)vgetq_lane_f16(vabdq_f16(vhp, vhp), 0); ret += (int)vget_lane_f16(vabd_f16(vlhp, vlhp), 0); diff --git a/numpy/distutils/checks/cpu_neon.c b/numpy/distutils/checks/cpu_neon.c index 4eab1f384..8c64f864d 100644 --- a/numpy/distutils/checks/cpu_neon.c +++ b/numpy/distutils/checks/cpu_neon.c @@ -3,12 +3,16 @@ #endif #include <arm_neon.h> -int main(void) +int main(int argc, char **argv) { - float32x4_t v1 = vdupq_n_f32(1.0f), v2 = vdupq_n_f32(2.0f); + // passing from 
untraced pointers to avoid optimizing out any constants + // so we can test against the linker. + float *src = (float*)argv[argc-1]; + float32x4_t v1 = vdupq_n_f32(src[0]), v2 = vdupq_n_f32(src[1]); int ret = (int)vgetq_lane_f32(vmulq_f32(v1, v2), 0); #ifdef __aarch64__ - float64x2_t vd1 = vdupq_n_f64(1.0), vd2 = vdupq_n_f64(2.0); + double *src2 = (double*)argv[argc-2]; + float64x2_t vd1 = vdupq_n_f64(src2[0]), vd2 = vdupq_n_f64(src2[1]); ret += (int)vgetq_lane_f64(vmulq_f64(vd1, vd2), 0); #endif return ret; diff --git a/numpy/distutils/checks/cpu_neon_fp16.c b/numpy/distutils/checks/cpu_neon_fp16.c index 745d2e793..f3b949770 100644 --- a/numpy/distutils/checks/cpu_neon_fp16.c +++ b/numpy/distutils/checks/cpu_neon_fp16.c @@ -3,9 +3,9 @@ #endif #include <arm_neon.h> -int main(void) +int main(int argc, char **argv) { - short z4[] = {0, 0, 0, 0, 0, 0, 0, 0}; - float32x4_t v_z4 = vcvt_f32_f16((float16x4_t)vld1_s16((const short*)z4)); + short *src = (short*)argv[argc-1]; + float32x4_t v_z4 = vcvt_f32_f16((float16x4_t)vld1_s16(src)); return (int)vgetq_lane_f32(v_z4, 0); } diff --git a/numpy/distutils/checks/cpu_neon_vfpv4.c b/numpy/distutils/checks/cpu_neon_vfpv4.c index 45f7b5d69..a039159dd 100644 --- a/numpy/distutils/checks/cpu_neon_vfpv4.c +++ b/numpy/distutils/checks/cpu_neon_vfpv4.c @@ -3,16 +3,18 @@ #endif #include <arm_neon.h> -int main(void) +int main(int argc, char **argv) { - float32x4_t v1 = vdupq_n_f32(1.0f); - float32x4_t v2 = vdupq_n_f32(2.0f); - float32x4_t v3 = vdupq_n_f32(3.0f); + float *src = (float*)argv[argc-1]; + float32x4_t v1 = vdupq_n_f32(src[0]); + float32x4_t v2 = vdupq_n_f32(src[1]); + float32x4_t v3 = vdupq_n_f32(src[2]); int ret = (int)vgetq_lane_f32(vfmaq_f32(v1, v2, v3), 0); #ifdef __aarch64__ - float64x2_t vd1 = vdupq_n_f64(1.0); - float64x2_t vd2 = vdupq_n_f64(2.0); - float64x2_t vd3 = vdupq_n_f64(3.0); + double *src2 = (double*)argv[argc-2]; + float64x2_t vd1 = vdupq_n_f64(src2[0]); + float64x2_t vd2 = vdupq_n_f64(src2[1]); + 
float64x2_t vd3 = vdupq_n_f64(src2[2]); ret += (int)vgetq_lane_f64(vfmaq_f64(vd1, vd2, vd3), 0); #endif return ret; diff --git a/numpy/distutils/misc_util.py b/numpy/distutils/misc_util.py index 78665d351..b3916a2c8 100644 --- a/numpy/distutils/misc_util.py +++ b/numpy/distutils/misc_util.py @@ -358,7 +358,7 @@ if terminal_has_colors(): fgcode = 30 + _colour_codes.get(fg.lower(), 0) seq.append(str(fgcode)) if bg: - bgcode = 40 + _colour_codes.get(fg.lower(), 7) + bgcode = 40 + _colour_codes.get(bg.lower(), 7) seq.append(str(bgcode)) if seq: return '\x1b[%sm%s\x1b[0m' % (';'.join(seq), s) diff --git a/numpy/f2py/capi_maps.py b/numpy/f2py/capi_maps.py index e5dc2331a..f07066a09 100644 --- a/numpy/f2py/capi_maps.py +++ b/numpy/f2py/capi_maps.py @@ -176,6 +176,7 @@ f2cmap_all = {'real': {'': 'float', '4': 'float', '8': 'double', f2cmap_default = copy.deepcopy(f2cmap_all) +f2cmap_mapped = [] def load_f2cmap_file(f2cmap_file): global f2cmap_all @@ -212,6 +213,7 @@ def load_f2cmap_file(f2cmap_file): f2cmap_all[k][k1] = d[k][k1] outmess('\tMapping "%s(kind=%s)" to "%s"\n' % (k, k1, d[k][k1])) + f2cmap_mapped.append(d[k][k1]) else: errmess("\tIgnoring map {'%s':{'%s':'%s'}}: '%s' must be in %s\n" % ( k, k1, d[k][k1], d[k][k1], list(c2py_map.keys()))) diff --git a/numpy/f2py/rules.py b/numpy/f2py/rules.py index c9c3b2383..63c48a878 100755 --- a/numpy/f2py/rules.py +++ b/numpy/f2py/rules.py @@ -1323,6 +1323,9 @@ def buildmodule(m, um): rd = dictappend(rd, ar) needs = cfuncs.get_needs() + # Add mapped definitions + needs['typedefs'] += [cvar for cvar in capi_maps.f2cmap_mapped # + if cvar in typedef_need_dict.values()] code = {} for n in needs.keys(): code[n] = [] diff --git a/numpy/f2py/src/fortranobject.h b/numpy/f2py/src/fortranobject.h index 376b83dad..abd699c2f 100644 --- a/numpy/f2py/src/fortranobject.h +++ b/numpy/f2py/src/fortranobject.h @@ -6,7 +6,9 @@ extern "C" { #include <Python.h> -#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION +#ifndef NPY_NO_DEPRECATED_API 
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#endif #ifdef FORTRANOBJECT_C #define NO_IMPORT_ARRAY #endif diff --git a/numpy/f2py/tests/src/f2cmap/.f2py_f2cmap b/numpy/f2py/tests/src/f2cmap/.f2py_f2cmap new file mode 100644 index 000000000..a4425f887 --- /dev/null +++ b/numpy/f2py/tests/src/f2cmap/.f2py_f2cmap @@ -0,0 +1 @@ +dict(real=dict(real32='float', real64='double'), integer=dict(int64='long_long')) diff --git a/numpy/f2py/tests/src/f2cmap/isoFortranEnvMap.f90 b/numpy/f2py/tests/src/f2cmap/isoFortranEnvMap.f90 new file mode 100644 index 000000000..3f0e12c76 --- /dev/null +++ b/numpy/f2py/tests/src/f2cmap/isoFortranEnvMap.f90 @@ -0,0 +1,9 @@ + subroutine func1(n, x, res) + use, intrinsic :: iso_fortran_env, only: int64, real64 + implicit none + integer(int64), intent(in) :: n + real(real64), intent(in) :: x(n) + real(real64), intent(out) :: res +Cf2py intent(hide) :: n + res = sum(x) + end diff --git a/numpy/f2py/tests/test_f2cmap.py b/numpy/f2py/tests/test_f2cmap.py new file mode 100644 index 000000000..d2967e4f7 --- /dev/null +++ b/numpy/f2py/tests/test_f2cmap.py @@ -0,0 +1,15 @@ +from . import util +import numpy as np + +class TestF2Cmap(util.F2PyTest): + sources = [ + util.getpath("tests", "src", "f2cmap", "isoFortranEnvMap.f90"), + util.getpath("tests", "src", "f2cmap", ".f2py_f2cmap") + ] + + # gh-15095 + def test_long_long_map(self): + inp = np.ones(3) + out = self.module.func1(inp) + exp_out = 3 + assert out == exp_out diff --git a/numpy/lib/tests/test_loadtxt.py b/numpy/lib/tests/test_loadtxt.py index 8839ef0a8..0b8fe3c47 100644 --- a/numpy/lib/tests/test_loadtxt.py +++ b/numpy/lib/tests/test_loadtxt.py @@ -5,6 +5,7 @@ These tests complement those found in `test_io.py`. 
""" import sys +import os import pytest from tempfile import NamedTemporaryFile, mkstemp from io import StringIO @@ -252,7 +253,7 @@ def test_ragged_usecols(): txt = StringIO("0,0,XXX\n0\n0,XXX,XXX,0,XXX\n") with pytest.raises(ValueError, - match="invalid column index -2 at row 1 with 2 columns"): + match="invalid column index -2 at row 2 with 1 columns"): # There is no -2 column in the second row: np.loadtxt(txt, dtype=float, delimiter=",", usecols=[0, -2]) @@ -960,9 +961,11 @@ def test_parametric_unit_discovery( # file-obj path fd, fname = mkstemp() + os.close(fd) with open(fname, "w") as fh: fh.write("\n".join(data)) a = np.loadtxt(fname, dtype=unitless_dtype) + os.remove(fname) assert a.dtype == expected.dtype assert_equal(a, expected) @@ -982,9 +985,11 @@ def test_str_dtype_unit_discovery_with_converter(): # file-obj path fd, fname = mkstemp() + os.close(fd) with open(fname, "w") as fh: fh.write("\n".join(data)) a = np.loadtxt(fname, dtype="U", converters=conv, encoding=None) + os.remove(fname) assert a.dtype == expected.dtype assert_equal(a, expected) diff --git a/numpy/linalg/lapack_lite/f2c.c b/numpy/linalg/lapack_lite/f2c.c index 9a1e9cec1..f1d3fdfbe 100644 --- a/numpy/linalg/lapack_lite/f2c.c +++ b/numpy/linalg/lapack_lite/f2c.c @@ -14,9 +14,9 @@ #include "f2c.h" -extern void s_wsfe(cilist *f) {;} -extern void e_wsfe(void) {;} -extern void do_fio(integer *c, char *s, ftnlen l) {;} +extern int s_wsfe(cilist *f) {return 0;} +extern int e_wsfe(void) {return 0;} +extern int do_fio(integer *c, char *s, ftnlen l) {return 0;} /* You'll want this if you redo the f2c_*.c files with the -C option * to f2c for checking array subscripts. 
(It's not suggested you do that @@ -377,7 +377,7 @@ p->i = p1.i; #endif /* NO_OVERWRITE */ - VOID + int #ifdef KR_headers s_cat(lp, rpp, rnp, np, ll) char *lp, *rpp[]; ftnlen rnp[], *np, ll; #else @@ -485,9 +485,9 @@ return(0); /* assign strings: a = b */ #ifdef KR_headers -VOID s_copy(a, b, la, lb) register char *a, *b; ftnlen la, lb; +int s_copy(a, b, la, lb) register char *a, *b; ftnlen la, lb; #else -void s_copy(register char *a, register char *b, ftnlen la, ftnlen lb) +int s_copy(register char *a, register char *b, ftnlen la, ftnlen lb) #endif { register char *aend, *bend; @@ -524,6 +524,7 @@ void s_copy(register char *a, register char *b, ftnlen la, ftnlen lb) while(a < aend) *a++ = ' '; } + return 0; } diff --git a/numpy/linalg/lapack_lite/f2c.h b/numpy/linalg/lapack_lite/f2c.h index d3fbfc177..b44aaac44 100644 --- a/numpy/linalg/lapack_lite/f2c.h +++ b/numpy/linalg/lapack_lite/f2c.h @@ -263,7 +263,7 @@ extern double d_tan(double *); extern double d_tanh(double *); extern double derf_(double *); extern double derfc_(double *); -extern void do_fio(ftnint *, char *, ftnlen); +extern int do_fio(ftnint *, char *, ftnlen); extern integer do_lio(ftnint *, ftnint *, char *, ftnlen); extern integer do_uio(ftnint *, char *, ftnlen); extern integer e_rdfe(void); @@ -275,7 +275,7 @@ extern integer e_rsli(void); extern integer e_rsue(void); extern integer e_wdfe(void); extern integer e_wdue(void); -extern void e_wsfe(void); +extern int e_wsfe(void); extern integer e_wsfi(void); extern integer e_wsle(void); extern integer e_wsli(void); @@ -350,9 +350,9 @@ extern double r_sinh(float *); extern double r_sqrt(float *); extern double r_tan(float *); extern double r_tanh(float *); -extern void s_cat(char *, char **, integer *, integer *, ftnlen); +extern int s_cat(char *, char **, integer *, integer *, ftnlen); extern integer s_cmp(char *, char *, ftnlen, ftnlen); -extern void s_copy(char *, char *, ftnlen, ftnlen); +extern int s_copy(char *, char *, ftnlen, ftnlen); extern 
int s_paus(char *, ftnlen); extern integer s_rdfe(cilist *); extern integer s_rdue(cilist *); @@ -367,7 +367,7 @@ extern integer s_rsue(cilist *); extern int s_stop(char *, ftnlen); extern integer s_wdfe(cilist *); extern integer s_wdue(cilist *); -extern void s_wsfe( cilist *); +extern int s_wsfe( cilist *); extern integer s_wsfi(icilist *); extern integer s_wsle(cilist *); extern integer s_wsli(icilist *); diff --git a/numpy/linalg/setup.py b/numpy/linalg/setup.py index dc62dff8f..1c4e1295e 100644 --- a/numpy/linalg/setup.py +++ b/numpy/linalg/setup.py @@ -1,5 +1,6 @@ import os import sys +import sysconfig def configuration(parent_package='', top_path=None): from numpy.distutils.misc_util import Configuration @@ -38,7 +39,14 @@ def configuration(parent_package='', top_path=None): class numpy_linalg_lapack_lite(system_info): def calc_info(self): info = {'language': 'c'} - if sys.maxsize > 2**32: + size_t_size = sysconfig.get_config_var("SIZEOF_SIZE_T") + if size_t_size: + maxsize = 2**(size_t_size - 1) - 1 + else: + # We prefer using sysconfig as it allows cross-compilation + # but the information may be missing (e.g. on windows). + maxsize = sys.maxsize + if maxsize > 2**32: # Build lapack-lite in 64-bit integer mode. # The suffix is arbitrary (lapack_lite symbols follow it), # but use the "64_" convention here. 
diff --git a/numpy/ma/core.py b/numpy/ma/core.py index 78333ed02..d8fd4f389 100644 --- a/numpy/ma/core.py +++ b/numpy/ma/core.py @@ -31,6 +31,7 @@ from functools import reduce import numpy as np import numpy.core.umath as umath import numpy.core.numerictypes as ntypes +from numpy.core import multiarray as mu from numpy import ndarray, amax, amin, iscomplexobj, bool_, _NoValue from numpy import array as narray from numpy.lib.function_base import angle @@ -5289,14 +5290,22 @@ class MaskedArray(ndarray): """ kwargs = {} if keepdims is np._NoValue else {'keepdims': keepdims} - if self._mask is nomask: result = super().mean(axis=axis, dtype=dtype, **kwargs)[()] else: + is_float16_result = False + if dtype is None: + if issubclass(self.dtype.type, (ntypes.integer, ntypes.bool_)): + dtype = mu.dtype('f8') + elif issubclass(self.dtype.type, ntypes.float16): + dtype = mu.dtype('f4') + is_float16_result = True dsum = self.sum(axis=axis, dtype=dtype, **kwargs) cnt = self.count(axis=axis, **kwargs) if cnt.shape == () and (cnt == 0): result = masked + elif is_float16_result: + result = self.dtype.type(dsum * 1. / cnt) else: result = dsum * 1. / cnt if out is not None: diff --git a/numpy/ma/tests/test_core.py b/numpy/ma/tests/test_core.py index 0dada104d..4fac897de 100644 --- a/numpy/ma/tests/test_core.py +++ b/numpy/ma/tests/test_core.py @@ -4036,6 +4036,12 @@ class TestMaskedArrayMathMethods: assert_equal(a.max(-1), [3, 6]) assert_equal(a.max(1), [3, 6]) + def test_mean_overflow(self): + # Test overflow in masked arrays + # gh-20272 + a = masked_array(np.full((10000, 10000), 65535, dtype=np.uint16), + mask=np.zeros((10000, 10000))) + assert_equal(a.mean(), 65535.0) class TestMaskedArrayMathMethodsComplex: # Test class for miscellaneous MaskedArrays methods. 
@@ -4158,7 +4164,11 @@ class TestMaskedArrayFunctions: # test that masked_where on a structured array sets a structured # mask (see issue #2972) a = np.zeros(10, dtype=[("A", "<f2"), ("B", "<f4")]) - am = np.ma.masked_where(a["A"] < 5, a) + with np.errstate(over="ignore"): + # NOTE: The float16 "uses" 1e20 as mask, which overflows to inf + # and warns. Unrelated to this test, but probably undesired. + # But NumPy previously did not warn for this overflow. + am = np.ma.masked_where(a["A"] < 5, a) assert_equal(am.mask.dtype.names, am.dtype.names) assert_equal(am["A"], np.ma.masked_array(np.zeros(10), np.ones(10))) @@ -4334,7 +4344,10 @@ class TestMaskedArrayFunctions: tmp[(xm <= 2).filled(True)] = True assert_equal(d._mask, tmp) - ixm = xm.astype(int) + with np.errstate(invalid="warn"): + # The fill value is 1e20, it cannot be converted to `int`: + with pytest.warns(RuntimeWarning, match="invalid value"): + ixm = xm.astype(int) d = where(ixm > 2, ixm, masked) assert_equal(d, [-9, -9, -9, -9, -9, 4, -9, -9, 10, -9, -9, 3]) assert_equal(d.dtype, ixm.dtype) diff --git a/numpy/polynomial/__init__.py b/numpy/polynomial/__init__.py index 5a3addf4c..c4e7baf2c 100644 --- a/numpy/polynomial/__init__.py +++ b/numpy/polynomial/__init__.py @@ -156,17 +156,17 @@ def set_default_printstyle(style): >>> c = np.polynomial.Chebyshev([1, 2, 3]) >>> np.polynomial.set_default_printstyle('unicode') >>> print(p) - 1.0 + 2.0·x¹ + 3.0·x² + 1.0 + 2.0·x + 3.0·x² >>> print(c) 1.0 + 2.0·T₁(x) + 3.0·T₂(x) >>> np.polynomial.set_default_printstyle('ascii') >>> print(p) - 1.0 + 2.0 x**1 + 3.0 x**2 + 1.0 + 2.0 x + 3.0 x**2 >>> print(c) 1.0 + 2.0 T_1(x) + 3.0 T_2(x) >>> # Formatting supersedes all class/package-level defaults >>> print(f"{p:unicode}") - 1.0 + 2.0·x¹ + 3.0·x² + 1.0 + 2.0·x + 3.0·x² """ if style not in ('unicode', 'ascii'): raise ValueError( diff --git a/numpy/polynomial/_polybase.py b/numpy/polynomial/_polybase.py index 6382732dc..9674dee0b 100644 --- a/numpy/polynomial/_polybase.py 
+++ b/numpy/polynomial/_polybase.py @@ -366,7 +366,7 @@ class ABCPolyBase(abc.ABC): linewidth = np.get_printoptions().get('linewidth', 75) if linewidth < 1: linewidth = 1 - out = f"{self.coef[0]}" + out = pu.format_float(self.coef[0]) for i, coef in enumerate(self.coef[1:]): out += " " power = str(i + 1) @@ -376,9 +376,9 @@ class ABCPolyBase(abc.ABC): # complex). In this case, represent the coefficient as-is. try: if coef >= 0: - next_term = f"+ {coef}" + next_term = f"+ " + pu.format_float(coef, parens=True) else: - next_term = f"- {-coef}" + next_term = f"- " + pu.format_float(-coef, parens=True) except TypeError: next_term = f"+ {coef}" # Polynomial term @@ -432,10 +432,10 @@ class ABCPolyBase(abc.ABC): return f"{{{cls.basis_name}}}_{{{i}}}({arg_str})" @staticmethod - def _repr_latex_scalar(x): + def _repr_latex_scalar(x, parens=False): # TODO: we're stuck with disabling math formatting until we handle # exponents in this function - return r'\text{{{}}}'.format(x) + return r'\text{{{}}}'.format(pu.format_float(x, parens=parens)) def _repr_latex_(self): # get the scaled argument string to the basis functions @@ -466,9 +466,9 @@ class ABCPolyBase(abc.ABC): elif not isinstance(c, numbers.Real): coef_str = f" + ({self._repr_latex_scalar(c)})" elif not np.signbit(c): - coef_str = f" + {self._repr_latex_scalar(c)}" + coef_str = f" + {self._repr_latex_scalar(c, parens=True)}" else: - coef_str = f" - {self._repr_latex_scalar(-c)}" + coef_str = f" - {self._repr_latex_scalar(-c, parens=True)}" # produce the string for the term term_str = self._repr_latex_term(i, term, needs_parens) diff --git a/numpy/polynomial/polynomial.py b/numpy/polynomial/polynomial.py index b4741355f..8e2c6f002 100644 --- a/numpy/polynomial/polynomial.py +++ b/numpy/polynomial/polynomial.py @@ -1512,11 +1512,17 @@ class Polynomial(ABCPolyBase): @classmethod def _str_term_unicode(cls, i, arg_str): - return f"·{arg_str}{i.translate(cls._superscript_mapping)}" + if i == '1': + return f"·{arg_str}" + 
else: + return f"·{arg_str}{i.translate(cls._superscript_mapping)}" @staticmethod def _str_term_ascii(i, arg_str): - return f" {arg_str}**{i}" + if i == '1': + return f" {arg_str}" + else: + return f" {arg_str}**{i}" @staticmethod def _repr_latex_term(i, arg_str, needs_parens): diff --git a/numpy/polynomial/polyutils.py b/numpy/polynomial/polyutils.py index a2bc75a4d..482913892 100644 --- a/numpy/polynomial/polyutils.py +++ b/numpy/polynomial/polyutils.py @@ -32,9 +32,13 @@ import warnings import numpy as np +from numpy.core.multiarray import dragon4_positional, dragon4_scientific +from numpy.core.umath import absolute + __all__ = [ 'RankWarning', 'as_series', 'trimseq', - 'trimcoef', 'getdomain', 'mapdomain', 'mapparms'] + 'trimcoef', 'getdomain', 'mapdomain', 'mapparms', + 'format_float'] # # Warnings and Exceptions @@ -748,3 +752,38 @@ def _deprecate_as_int(x, desc): return ix raise TypeError(f"{desc} must be an integer") from e + + +def format_float(x, parens=False): + if not np.issubdtype(type(x), np.floating): + return str(x) + + opts = np.get_printoptions() + + if np.isnan(x): + return opts['nanstr'] + elif np.isinf(x): + return opts['infstr'] + + exp_format = False + if x != 0: + a = absolute(x) + if a >= 1.e8 or a < 10**min(0, -(opts['precision']-1)//2): + exp_format = True + + trim, unique = '0', True + if opts['floatmode'] == 'fixed': + trim, unique = 'k', False + + if exp_format: + s = dragon4_scientific(x, precision=opts['precision'], + unique=unique, trim=trim, + sign=opts['sign'] == '+') + if parens: + s = '(' + s + ')' + else: + s = dragon4_positional(x, precision=opts['precision'], + fractional=True, + unique=unique, trim=trim, + sign=opts['sign'] == '+') + return s diff --git a/numpy/polynomial/polyutils.pyi b/numpy/polynomial/polyutils.pyi index 06260a9f1..c0bcc6784 100644 --- a/numpy/polynomial/polyutils.pyi +++ b/numpy/polynomial/polyutils.pyi @@ -8,3 +8,4 @@ def trimcoef(c, tol=...): ... def getdomain(x): ... def mapparms(old, new): ... 
def mapdomain(x, old, new): ... +def format_float(x, parens=...): ... diff --git a/numpy/polynomial/tests/test_printing.py b/numpy/polynomial/tests/test_printing.py index 0c4316223..990a0d179 100644 --- a/numpy/polynomial/tests/test_printing.py +++ b/numpy/polynomial/tests/test_printing.py @@ -1,3 +1,4 @@ +from math import nan, inf import pytest from numpy.core import array, arange, printoptions import numpy.polynomial as poly @@ -15,9 +16,9 @@ class TestStrUnicodeSuperSubscripts: poly.set_default_printstyle('unicode') @pytest.mark.parametrize(('inp', 'tgt'), ( - ([1, 2, 3], "1.0 + 2.0·x¹ + 3.0·x²"), - ([-1, 0, 3, -1], "-1.0 + 0.0·x¹ + 3.0·x² - 1.0·x³"), - (arange(12), ("0.0 + 1.0·x¹ + 2.0·x² + 3.0·x³ + 4.0·x⁴ + 5.0·x⁵ + " + ([1, 2, 3], "1.0 + 2.0·x + 3.0·x²"), + ([-1, 0, 3, -1], "-1.0 + 0.0·x + 3.0·x² - 1.0·x³"), + (arange(12), ("0.0 + 1.0·x + 2.0·x² + 3.0·x³ + 4.0·x⁴ + 5.0·x⁵ + " "6.0·x⁶ + 7.0·x⁷ +\n8.0·x⁸ + 9.0·x⁹ + 10.0·x¹⁰ + " "11.0·x¹¹")), )) @@ -89,9 +90,9 @@ class TestStrAscii: poly.set_default_printstyle('ascii') @pytest.mark.parametrize(('inp', 'tgt'), ( - ([1, 2, 3], "1.0 + 2.0 x**1 + 3.0 x**2"), - ([-1, 0, 3, -1], "-1.0 + 0.0 x**1 + 3.0 x**2 - 1.0 x**3"), - (arange(12), ("0.0 + 1.0 x**1 + 2.0 x**2 + 3.0 x**3 + 4.0 x**4 + " + ([1, 2, 3], "1.0 + 2.0 x + 3.0 x**2"), + ([-1, 0, 3, -1], "-1.0 + 0.0 x + 3.0 x**2 - 1.0 x**3"), + (arange(12), ("0.0 + 1.0 x + 2.0 x**2 + 3.0 x**3 + 4.0 x**4 + " "5.0 x**5 + 6.0 x**6 +\n7.0 x**7 + 8.0 x**8 + " "9.0 x**9 + 10.0 x**10 + 11.0 x**11")), )) @@ -168,51 +169,51 @@ class TestLinebreaking: def test_single_line_one_less(self): # With 'ascii' style, len(str(p)) is default linewidth - 1 (i.e. 
74) - p = poly.Polynomial([123456789, 123456789, 123456789, 1234, 1]) + p = poly.Polynomial([12345678, 12345678, 12345678, 12345678, 123]) assert_equal(len(str(p)), 74) assert_equal(str(p), ( - '123456789.0 + 123456789.0 x**1 + 123456789.0 x**2 + ' - '1234.0 x**3 + 1.0 x**4' + '12345678.0 + 12345678.0 x + 12345678.0 x**2 + ' + '12345678.0 x**3 + 123.0 x**4' )) def test_num_chars_is_linewidth(self): # len(str(p)) == default linewidth == 75 - p = poly.Polynomial([123456789, 123456789, 123456789, 1234, 10]) + p = poly.Polynomial([12345678, 12345678, 12345678, 12345678, 1234]) assert_equal(len(str(p)), 75) assert_equal(str(p), ( - '123456789.0 + 123456789.0 x**1 + 123456789.0 x**2 + ' - '1234.0 x**3 +\n10.0 x**4' + '12345678.0 + 12345678.0 x + 12345678.0 x**2 + ' + '12345678.0 x**3 +\n1234.0 x**4' )) def test_first_linebreak_multiline_one_less_than_linewidth(self): # Multiline str where len(first_line) + len(next_term) == lw - 1 == 74 p = poly.Polynomial( - [123456789, 123456789, 123456789, 12, 1, 123456789] + [12345678, 12345678, 12345678, 12345678, 1, 12345678] ) assert_equal(len(str(p).split('\n')[0]), 74) assert_equal(str(p), ( - '123456789.0 + 123456789.0 x**1 + 123456789.0 x**2 + ' - '12.0 x**3 + 1.0 x**4 +\n123456789.0 x**5' + '12345678.0 + 12345678.0 x + 12345678.0 x**2 + ' + '12345678.0 x**3 + 1.0 x**4 +\n12345678.0 x**5' )) def test_first_linebreak_multiline_on_linewidth(self): # First line is one character longer than previous test p = poly.Polynomial( - [123456789, 123456789, 123456789, 123, 1, 123456789] + [12345678, 12345678, 12345678, 12345678.12, 1, 12345678] ) assert_equal(str(p), ( - '123456789.0 + 123456789.0 x**1 + 123456789.0 x**2 + ' - '123.0 x**3 +\n1.0 x**4 + 123456789.0 x**5' + '12345678.0 + 12345678.0 x + 12345678.0 x**2 + ' + '12345678.12 x**3 +\n1.0 x**4 + 12345678.0 x**5' )) @pytest.mark.parametrize(('lw', 'tgt'), ( - (75, ('0.0 + 10.0 x**1 + 200.0 x**2 + 3000.0 x**3 + 40000.0 x**4 +\n' - '500000.0 x**5 + 600000.0 x**6 + 70000.0 x**7 + 
8000.0 x**8 + ' + (75, ('0.0 + 10.0 x + 200.0 x**2 + 3000.0 x**3 + 40000.0 x**4 + ' + '500000.0 x**5 +\n600000.0 x**6 + 70000.0 x**7 + 8000.0 x**8 + ' '900.0 x**9')), - (45, ('0.0 + 10.0 x**1 + 200.0 x**2 + 3000.0 x**3 +\n40000.0 x**4 + ' + (45, ('0.0 + 10.0 x + 200.0 x**2 + 3000.0 x**3 +\n40000.0 x**4 + ' '500000.0 x**5 +\n600000.0 x**6 + 70000.0 x**7 + 8000.0 x**8 +\n' '900.0 x**9')), - (132, ('0.0 + 10.0 x**1 + 200.0 x**2 + 3000.0 x**3 + 40000.0 x**4 + ' + (132, ('0.0 + 10.0 x + 200.0 x**2 + 3000.0 x**3 + 40000.0 x**4 + ' '500000.0 x**5 + 600000.0 x**6 + 70000.0 x**7 + 8000.0 x**8 + ' '900.0 x**9')), )) @@ -230,10 +231,10 @@ def test_set_default_printoptions(): p = poly.Polynomial([1, 2, 3]) c = poly.Chebyshev([1, 2, 3]) poly.set_default_printstyle('ascii') - assert_equal(str(p), "1.0 + 2.0 x**1 + 3.0 x**2") + assert_equal(str(p), "1.0 + 2.0 x + 3.0 x**2") assert_equal(str(c), "1.0 + 2.0 T_1(x) + 3.0 T_2(x)") poly.set_default_printstyle('unicode') - assert_equal(str(p), "1.0 + 2.0·x¹ + 3.0·x²") + assert_equal(str(p), "1.0 + 2.0·x + 3.0·x²") assert_equal(str(c), "1.0 + 2.0·T₁(x) + 3.0·T₂(x)") with pytest.raises(ValueError): poly.set_default_printstyle('invalid_input') @@ -247,22 +248,22 @@ def test_complex_coefficients(): # Python complex p2 = poly.Polynomial(array(coefs, dtype=object)) poly.set_default_printstyle('unicode') - assert_equal(str(p1), "1j + (1+1j)·x¹ - (2-2j)·x² + (3+0j)·x³") - assert_equal(str(p2), "1j + (1+1j)·x¹ + (-2+2j)·x² + (3+0j)·x³") + assert_equal(str(p1), "1j + (1+1j)·x - (2-2j)·x² + (3+0j)·x³") + assert_equal(str(p2), "1j + (1+1j)·x + (-2+2j)·x² + (3+0j)·x³") poly.set_default_printstyle('ascii') - assert_equal(str(p1), "1j + (1+1j) x**1 - (2-2j) x**2 + (3+0j) x**3") - assert_equal(str(p2), "1j + (1+1j) x**1 + (-2+2j) x**2 + (3+0j) x**3") + assert_equal(str(p1), "1j + (1+1j) x - (2-2j) x**2 + (3+0j) x**3") + assert_equal(str(p2), "1j + (1+1j) x + (-2+2j) x**2 + (3+0j) x**3") @pytest.mark.parametrize(('coefs', 'tgt'), ( (array([Fraction(1, 
2), Fraction(3, 4)], dtype=object), ( - "1/2 + 3/4·x¹" + "1/2 + 3/4·x" )), (array([1, 2, Fraction(5, 7)], dtype=object), ( - "1 + 2·x¹ + 5/7·x²" + "1 + 2·x + 5/7·x²" )), (array([Decimal('1.00'), Decimal('2.2'), 3], dtype=object), ( - "1.00 + 2.2·x¹ + 3·x²" + "1.00 + 2.2·x + 3·x²" )), )) def test_numeric_object_coefficients(coefs, tgt): @@ -272,8 +273,8 @@ def test_numeric_object_coefficients(coefs, tgt): @pytest.mark.parametrize(('coefs', 'tgt'), ( - (array([1, 2, 'f'], dtype=object), '1 + 2·x¹ + f·x²'), - (array([1, 2, [3, 4]], dtype=object), '1 + 2·x¹ + [3, 4]·x²'), + (array([1, 2, 'f'], dtype=object), '1 + 2·x + f·x²'), + (array([1, 2, [3, 4]], dtype=object), '1 + 2·x + [3, 4]·x²'), )) def test_nonnumeric_object_coefficients(coefs, tgt): """ @@ -288,20 +289,20 @@ class TestFormat: def test_format_unicode(self): poly.set_default_printstyle('ascii') p = poly.Polynomial([1, 2, 0, -1]) - assert_equal(format(p, 'unicode'), "1.0 + 2.0·x¹ + 0.0·x² - 1.0·x³") + assert_equal(format(p, 'unicode'), "1.0 + 2.0·x + 0.0·x² - 1.0·x³") def test_format_ascii(self): poly.set_default_printstyle('unicode') p = poly.Polynomial([1, 2, 0, -1]) assert_equal( - format(p, 'ascii'), "1.0 + 2.0 x**1 + 0.0 x**2 - 1.0 x**3" + format(p, 'ascii'), "1.0 + 2.0 x + 0.0 x**2 - 1.0 x**3" ) def test_empty_formatstr(self): poly.set_default_printstyle('ascii') p = poly.Polynomial([1, 2, 3]) - assert_equal(format(p), "1.0 + 2.0 x**1 + 3.0 x**2") - assert_equal(f"{p}", "1.0 + 2.0 x**1 + 3.0 x**2") + assert_equal(format(p), "1.0 + 2.0 x + 3.0 x**2") + assert_equal(f"{p}", "1.0 + 2.0 x + 3.0 x**2") def test_bad_formatstr(self): p = poly.Polynomial([1, 2, 0, -1]) @@ -310,7 +311,7 @@ class TestFormat: @pytest.mark.parametrize(('poly', 'tgt'), ( - (poly.Polynomial, '1.0 + 2.0·z¹ + 3.0·z²'), + (poly.Polynomial, '1.0 + 2.0·z + 3.0·z²'), (poly.Chebyshev, '1.0 + 2.0·T₁(z) + 3.0·T₂(z)'), (poly.Hermite, '1.0 + 2.0·H₁(z) + 3.0·H₂(z)'), (poly.HermiteE, '1.0 + 2.0·He₁(z) + 3.0·He₂(z)'), @@ -379,7 +380,7 @@ class 
TestLatexRepr: # right now we ignore the formatting of scalars in our tests, since # it makes them too verbose. Ideally, the formatting of scalars will # be fixed such that tests below continue to pass - obj._repr_latex_scalar = lambda x: str(x) + obj._repr_latex_scalar = lambda x, parens=False: str(x) try: return obj._repr_latex_() finally: @@ -455,3 +456,71 @@ class TestLatexRepr: r'\left(1.0 + 2.0z\right)^{2}$' ), ) + + +SWITCH_TO_EXP = ( + '1.0 + (1.0e-01) x + (1.0e-02) x**2', + '1.2 + (1.2e-01) x + (1.2e-02) x**2', + '1.23 + 0.12 x + (1.23e-02) x**2 + (1.23e-03) x**3', + '1.235 + 0.123 x + (1.235e-02) x**2 + (1.235e-03) x**3', + '1.2346 + 0.1235 x + 0.0123 x**2 + (1.2346e-03) x**3 + (1.2346e-04) x**4', + '1.23457 + 0.12346 x + 0.01235 x**2 + (1.23457e-03) x**3 + ' + '(1.23457e-04) x**4', + '1.234568 + 0.123457 x + 0.012346 x**2 + 0.001235 x**3 + ' + '(1.234568e-04) x**4 + (1.234568e-05) x**5', + '1.2345679 + 0.1234568 x + 0.0123457 x**2 + 0.0012346 x**3 + ' + '(1.2345679e-04) x**4 + (1.2345679e-05) x**5') + +class TestPrintOptions: + """ + Test the output is properly configured via printoptions. + The exponential notation is enabled automatically when the values + are too small or too large. 
+ """ + + def test_str(self): + p = poly.Polynomial([1/2, 1/7, 1/7*10**8, 1/7*10**9]) + assert_equal(str(p), '0.5 + 0.14285714 x + 14285714.28571429 x**2 ' + '+ (1.42857143e+08) x**3') + + with printoptions(precision=3): + assert_equal(str(p), '0.5 + 0.143 x + 14285714.286 x**2 ' + '+ (1.429e+08) x**3') + + def test_latex(self): + p = poly.Polynomial([1/2, 1/7, 1/7*10**8, 1/7*10**9]) + assert_equal(p._repr_latex_(), + r'$x \mapsto \text{0.5} + \text{0.14285714}\,x + ' + r'\text{14285714.28571429}\,x^{2} + ' + r'\text{(1.42857143e+08)}\,x^{3}$') + + with printoptions(precision=3): + assert_equal(p._repr_latex_(), + r'$x \mapsto \text{0.5} + \text{0.143}\,x + ' + r'\text{14285714.286}\,x^{2} + \text{(1.429e+08)}\,x^{3}$') + + def test_fixed(self): + p = poly.Polynomial([1/2]) + assert_equal(str(p), '0.5') + + with printoptions(floatmode='fixed'): + assert_equal(str(p), '0.50000000') + + with printoptions(floatmode='fixed', precision=4): + assert_equal(str(p), '0.5000') + + def test_switch_to_exp(self): + for i, s in enumerate(SWITCH_TO_EXP): + with printoptions(precision=i): + p = poly.Polynomial([1.23456789*10**-i + for i in range(i//2+3)]) + assert str(p).replace('\n', ' ') == s + + def test_non_finite(self): + p = poly.Polynomial([nan, inf]) + assert str(p) == 'nan + inf x' + assert p._repr_latex_() == r'$x \mapsto \text{nan} + \text{inf}\,x$' + with printoptions(nanstr='NAN', infstr='INF'): + assert str(p) == 'NAN + INF x' + assert p._repr_latex_() == \ + r'$x \mapsto \text{NAN} + \text{INF}\,x$' diff --git a/numpy/random/_generator.pyx b/numpy/random/_generator.pyx index b54fe3610..0019c4bcd 100644 --- a/numpy/random/_generator.pyx +++ b/numpy/random/_generator.pyx @@ -3660,6 +3660,11 @@ cdef class Generator: # Check preconditions on arguments mean = np.array(mean) cov = np.array(cov) + + if (np.issubdtype(mean.dtype, np.complexfloating) or + np.issubdtype(cov.dtype, np.complexfloating)): + raise TypeError("mean and cov must not be complex") + if size is None: 
shape = [] elif isinstance(size, (int, long, np.integer)): diff --git a/numpy/random/tests/test_generator_mt19937.py b/numpy/random/tests/test_generator_mt19937.py index 3ccb9103c..fa55ac0ee 100644 --- a/numpy/random/tests/test_generator_mt19937.py +++ b/numpy/random/tests/test_generator_mt19937.py @@ -1452,6 +1452,12 @@ class TestRandomDist: mu, np.empty((3, 2))) assert_raises(ValueError, random.multivariate_normal, mu, np.eye(3)) + + @pytest.mark.parametrize('mean, cov', [([0], [[1+1j]]), ([0j], [[1]])]) + def test_multivariate_normal_disallow_complex(self, mean, cov): + random = Generator(MT19937(self.seed)) + with pytest.raises(TypeError, match="must not be complex"): + random.multivariate_normal(mean, cov) @pytest.mark.parametrize("method", ["svd", "eigh", "cholesky"]) def test_multivariate_normal_basic_stats(self, method): diff --git a/numpy/typing/tests/data/pass/arithmetic.py b/numpy/typing/tests/data/pass/arithmetic.py index 4ed69c923..07a990127 100644 --- a/numpy/typing/tests/data/pass/arithmetic.py +++ b/numpy/typing/tests/data/pass/arithmetic.py @@ -2,6 +2,7 @@ from __future__ import annotations from typing import Any import numpy as np +import pytest c16 = np.complex128(1) f8 = np.float64(1) @@ -330,8 +331,9 @@ AR_O **= AR_LIKE_O -f4 -i8 -i4 --u8 --u4 +with pytest.warns(RuntimeWarning): + -u8 + -u4 -td -AR_f |