diff options
Diffstat (limited to 'numpy/core/src/common')
54 files changed, 3635 insertions, 380 deletions
diff --git a/numpy/core/src/common/common.hpp b/numpy/core/src/common/common.hpp new file mode 100644 index 000000000..44ba449d8 --- /dev/null +++ b/numpy/core/src/common/common.hpp @@ -0,0 +1,14 @@ +#ifndef NUMPY_CORE_SRC_COMMON_COMMON_HPP +#define NUMPY_CORE_SRC_COMMON_COMMON_HPP +/* + * The following C++ headers are safe to be used standalone, however, + * they are gathered to make it easy for us and for the future need to support PCH. + */ +#include "npdef.hpp" +#include "utils.hpp" +#include "npstd.hpp" +#include "half.hpp" +#include "meta.hpp" +#include "float_status.hpp" + +#endif // NUMPY_CORE_SRC_COMMON_COMMON_HPP diff --git a/numpy/core/src/common/dlpack/dlpack.h b/numpy/core/src/common/dlpack/dlpack.h index 29209aee1..f0cbf6136 100644 --- a/numpy/core/src/common/dlpack/dlpack.h +++ b/numpy/core/src/common/dlpack/dlpack.h @@ -1,5 +1,5 @@ // Taken from: -// https://github.com/dmlc/dlpack/blob/9b6176fdecb55e9bf39b16f08b96913ed3f275b4/include/dlpack/dlpack.h +// https://github.com/dmlc/dlpack/blob/ca4d00ad3e2e0f410eeab3264d21b8a39397f362/include/dlpack/dlpack.h /*! * Copyright (c) 2017 by Contributors * \file dlpack.h @@ -8,14 +8,20 @@ #ifndef DLPACK_DLPACK_H_ #define DLPACK_DLPACK_H_ +/** + * \brief Compatibility with C++ + */ #ifdef __cplusplus #define DLPACK_EXTERN_C extern "C" #else #define DLPACK_EXTERN_C #endif -/*! \brief The current version of dlpack */ -#define DLPACK_VERSION 050 +/*! \brief The current major version of dlpack */ +#define DLPACK_MAJOR_VERSION 1 + +/*! \brief The current minor version of dlpack */ +#define DLPACK_MINOR_VERSION 0 /*! \brief DLPACK_DLL prefix for windows */ #ifdef _WIN32 @@ -34,10 +40,41 @@ #ifdef __cplusplus extern "C" { #endif + +/*! + * \brief The DLPack version. + * + * A change in major version indicates that we have changed the + * data layout of the ABI - DLManagedTensorVersioned. + * + * A change in minor version indicates that we have added new + * code, such as a new device type, but the ABI is kept the same. + * + * If an obtained DLPack tensor has a major version that disagrees + * with the version number specified in this header file + * (i.e. major != DLPACK_MAJOR_VERSION), the consumer must call the deleter + * (and it is safe to do so). It is not safe to access any other fields + * as the memory layout will have changed. + * + * In the case of a minor version mismatch, the tensor can be safely used as + * long as the consumer knows how to interpret all fields. Minor version + * updates indicate the addition of enumeration values. + */ +typedef struct { + /*! \brief DLPack major version. */ + uint32_t major; + /*! \brief DLPack minor version. */ + uint32_t minor; +} DLPackVersion; + /*! * \brief The device type in DLDevice. */ +#ifdef __cplusplus +typedef enum : int32_t { +#else typedef enum { +#endif /*! \brief CPU device */ kDLCPU = 1, /*! \brief CUDA GPU device */ @@ -70,6 +107,17 @@ typedef enum { * \brief CUDA managed/unified memory allocated by cudaMallocManaged */ kDLCUDAManaged = 13, + /*! + * \brief Unified shared memory allocated on a oneAPI non-partititioned + * device. Call to oneAPI runtime is required to determine the device + * type, the USM allocation type and the sycl context it is bound to. + * + */ + kDLOneAPI = 14, + /*! \brief GPU support for next generation WebGPU standard. */ + kDLWebGPU = 15, + /*! \brief Qualcomm Hexagon DSP */ + kDLHexagon = 16, } DLDeviceType; /*! @@ -82,7 +130,7 @@ typedef struct { * \brief The device index. * For vanilla CPU memory, pinned memory, or managed memory, this is set to 0. */ - int device_id; + int32_t device_id; } DLDevice; /*! @@ -107,16 +155,21 @@ typedef enum { * (C/C++/Python layout: compact struct per complex number) */ kDLComplex = 5U, + /*! \brief boolean */ + kDLBool = 6U, } DLDataTypeCode; /*! - * \brief The data type the tensor can hold. + * \brief The data type the tensor can hold. The data type is assumed to follow the + * native endian-ness. An explicit error message should be raised when attempting to + * export an array with non-native endianness * * Examples - * - float: type_code = 2, bits = 32, lanes=1 - * - float4(vectorized 4 float): type_code = 2, bits = 32, lanes=4 - * - int8: type_code = 0, bits = 8, lanes=1 + * - float: type_code = 2, bits = 32, lanes = 1 + * - float4(vectorized 4 float): type_code = 2, bits = 32, lanes = 4 + * - int8: type_code = 0, bits = 8, lanes = 1 * - std::complex<float>: type_code = 5, bits = 64, lanes = 1 + * - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library convention, the underlying storage size of bool is 8 bits) */ typedef struct { /*! @@ -138,9 +191,16 @@ typedef struct { */ typedef struct { /*! - * \brief The opaque data pointer points to the allocated data. This will be - * CUDA device pointer or cl_mem handle in OpenCL. This pointer is always - * aligned to 256 bytes as in CUDA. + * \brief The data pointer points to the allocated data. This will be CUDA + * device pointer or cl_mem handle in OpenCL. It may be opaque on some device + * types. This pointer is always aligned to 256 bytes as in CUDA. The + * `byte_offset` field should be used to point to the beginning of the data. + * + * Note that as of Nov 2021, multiply libraries (CuPy, PyTorch, TensorFlow, + * TVM, perhaps others) do not adhere to this 256 byte alignment requirement + * on CPU/CUDA/ROCm, and always use `byte_offset=0`. This must be fixed + * (after which this note will be updated); at the moment it is recommended + * to not rely on the data pointer being correctly aligned. * * For given DLTensor, the size of memory required to store the contents of * data is calculated as follows: @@ -160,7 +220,7 @@ typedef struct { /*! \brief The device of the tensor */ DLDevice device; /*! \brief Number of dimensions */ - int ndim; + int32_t ndim; /*! \brief The data type of the pointer*/ DLDataType dtype; /*! \brief The shape of the tensor */ @@ -180,6 +240,13 @@ typedef struct { * not meant to transfer the tensor. When the borrowing framework doesn't need * the tensor, it should call the deleter to notify the host that the resource * is no longer needed. + * + * \note This data structure is used as Legacy DLManagedTensor + * in DLPack exchange and is deprecated after DLPack v0.8 + * Use DLManagedTensorVersioned instead. + * This data structure may get renamed or deleted in future versions. + * + * \sa DLManagedTensorVersioned */ typedef struct DLManagedTensor { /*! \brief DLTensor which is being memory managed */ @@ -188,13 +255,65 @@ typedef struct DLManagedTensor { * which DLManagedTensor is used in the framework. It can also be NULL. */ void * manager_ctx; - /*! \brief Destructor signature void (*)(void*) - this should be called - * to destruct manager_ctx which holds the DLManagedTensor. It can be NULL - * if there is no way for the caller to provide a reasonable destructor. - * The destructors deletes the argument self as well. + /*! + * \brief Destructor - this should be called + * to destruct the manager_ctx which backs the DLManagedTensor. It can be + * NULL if there is no way for the caller to provide a reasonable destructor. + * The destructors deletes the argument self as well. */ void (*deleter)(struct DLManagedTensor * self); } DLManagedTensor; + +// bit masks used in in the DLManagedTensorVersioned + +/*! \brief bit mask to indicate that the tensor is read only. */ +#define DLPACK_FLAG_BITMASK_READ_ONLY (1UL << 0UL) + +/*! + * \brief A versioned and managed C Tensor object, manage memory of DLTensor. + * + * This data structure is intended to facilitate the borrowing of DLTensor by + * another framework. It is not meant to transfer the tensor. When the borrowing + * framework doesn't need the tensor, it should call the deleter to notify the + * host that the resource is no longer needed. + * + * \note This is the current standard DLPack exchange data structure. + */ +struct DLManagedTensorVersioned { + /*! + * \brief The API and ABI version of the current managed Tensor + */ + DLPackVersion version; + /*! + * \brief the context of the original host framework. + * + * Stores DLManagedTensorVersioned is used in the + * framework. It can also be NULL. + */ + void *manager_ctx; + /*! + * \brief Destructor. + * + * This should be called to destruct manager_ctx which holds the DLManagedTensorVersioned. + * It can be NULL if there is no way for the caller to provide a reasonable + * destructor. The destructors deletes the argument self as well. + */ + void (*deleter)(struct DLManagedTensorVersioned *self); + /*! + * \brief Additional bitmask flags information about the tensor. + * + * By default the flags should be set to 0. + * + * \note Future ABI changes should keep everything until this field + * stable, to ensure that deleter can be correctly called. + * + * \sa DLPACK_FLAG_BITMASK_READ_ONLY + */ + uint64_t flags; + /*! \brief DLTensor which is being memory managed */ + DLTensor dl_tensor; +}; + #ifdef __cplusplus } // DLPACK_EXTERN_C #endif diff --git a/numpy/core/src/common/float_status.hpp b/numpy/core/src/common/float_status.hpp new file mode 100644 index 000000000..8e4d5e06a --- /dev/null +++ b/numpy/core/src/common/float_status.hpp @@ -0,0 +1,134 @@ +#ifndef NUMPY_CORE_SRC_COMMON_FLOAT_STATUS_HPP +#define NUMPY_CORE_SRC_COMMON_FLOAT_STATUS_HPP + +#include "npstd.hpp" + +#include <fenv.h> + +namespace np { + +/// @addtogroup cpp_core_utility +/// @{ +/** + * Class wraps floating-point environment operations, + * provides lazy access to its functionality. + */ +class FloatStatus { + public: +/* + * According to the C99 standard FE_DIVBYZERO, etc. may not be provided when + * unsupported. In such cases NumPy will not report these correctly, but we + * should still allow compiling (whether tests pass or not). + * By defining them as 0 locally, we make them no-ops. Unlike these defines, + * for example `musl` still defines all of the functions (as no-ops): + * https://git.musl-libc.org/cgit/musl/tree/src/fenv/fenv.c + * and does similar replacement in its tests: + * http://nsz.repo.hu/git/?p=libc-test;a=blob;f=src/common/mtest.h;h=706c1ba23ea8989b17a2f72ed1a919e187c06b6a;hb=HEAD#l30 + */ +#ifdef FE_DIVBYZERO + static constexpr int kDivideByZero = FE_DIVBYZERO; +#else + static constexpr int kDivideByZero = 0; +#endif +#ifdef FE_INVALID + static constexpr int kInvalid = FE_INVALID; +#else + static constexpr int kInvalid = 0; +#endif +#ifdef FE_INEXACT + static constexpr int kInexact = FE_INEXACT; +#else + static constexpr int kInexact = 0; +#endif +#ifdef FE_OVERFLOW + static constexpr int kOverflow = FE_OVERFLOW; +#else + static constexpr int kOverflow = 0; +#endif +#ifdef FE_UNDERFLOW + static constexpr int kUnderflow = FE_UNDERFLOW; +#else + static constexpr int kUnderflow = 0; +#endif + static constexpr int kAllExcept = (kDivideByZero | kInvalid | kInexact | + kOverflow | kUnderflow); + + FloatStatus(bool clear_on_dst=true) + : clear_on_dst_(clear_on_dst) + { + if constexpr (kAllExcept != 0) { + fpstatus_ = fetestexcept(kAllExcept); + } + else { + fpstatus_ = 0; + } + } + ~FloatStatus() + { + if constexpr (kAllExcept != 0) { + if (fpstatus_ != 0 && clear_on_dst_) { + feclearexcept(kAllExcept); + } + } + } + constexpr bool IsDivideByZero() const + { + return (fpstatus_ & kDivideByZero) != 0; + } + constexpr bool IsInexact() const + { + return (fpstatus_ & kInexact) != 0; + } + constexpr bool IsInvalid() const + { + return (fpstatus_ & kInvalid) != 0; + } + constexpr bool IsOverFlow() const + { + return (fpstatus_ & kOverflow) != 0; + } + constexpr bool IsUnderFlow() const + { + return (fpstatus_ & kUnderflow) != 0; + } + static void RaiseDivideByZero() + { + if constexpr (kDivideByZero != 0) { + feraiseexcept(kDivideByZero); + } + } + static void RaiseInexact() + { + if constexpr (kInexact != 0) { + feraiseexcept(kInexact); + } + } + static void RaiseInvalid() + { + if constexpr (kInvalid != 0) { + feraiseexcept(kInvalid); + } + } + static void RaiseOverflow() + { + if constexpr (kOverflow != 0) { + feraiseexcept(kOverflow); + } + } + static void RaiseUnderflow() + { + if constexpr (kUnderflow != 0) { + feraiseexcept(kUnderflow); + } + } + + private: + bool clear_on_dst_; + int fpstatus_; +}; + +/// @} cpp_core_utility +} // namespace np + +#endif // NUMPY_CORE_SRC_COMMON_FLOAT_STATUS_HPP + diff --git a/numpy/core/src/common/get_attr_string.h b/numpy/core/src/common/get_attr_string.h index 90eca5ee6..36d39189f 100644 --- a/numpy/core/src/common/get_attr_string.h +++ b/numpy/core/src/common/get_attr_string.h @@ -4,7 +4,7 @@ #include <Python.h> #include "ufunc_object.h" -static NPY_INLINE npy_bool +static inline npy_bool _is_basic_python_type(PyTypeObject *tp) { return ( @@ -46,7 +46,7 @@ _is_basic_python_type(PyTypeObject *tp) * * In future, could be made more like _Py_LookupSpecial */ -static NPY_INLINE PyObject * +static inline PyObject * PyArray_LookupSpecial(PyObject *obj, PyObject *name_unicode) { PyTypeObject *tp = Py_TYPE(obj); @@ -73,7 +73,7 @@ PyArray_LookupSpecial(PyObject *obj, PyObject *name_unicode) * * Kept for backwards compatibility. In future, we should deprecate this. */ -static NPY_INLINE PyObject * +static inline PyObject * PyArray_LookupSpecial_OnInstance(PyObject *obj, PyObject *name_unicode) { PyTypeObject *tp = Py_TYPE(obj); diff --git a/numpy/core/src/common/half.hpp b/numpy/core/src/common/half.hpp new file mode 100644 index 000000000..4d16e3bcc --- /dev/null +++ b/numpy/core/src/common/half.hpp @@ -0,0 +1,264 @@ +#ifndef NUMPY_CORE_SRC_COMMON_HALF_HPP +#define NUMPY_CORE_SRC_COMMON_HALF_HPP + +#include "npstd.hpp" + +#include "npy_cpu_dispatch.h" // NPY_HAVE_CPU_FEATURES +#include "half_private.hpp" + +// TODO(@seiko2plus): +// - covers half-precision operations that being supported by numpy/halffloat.h +// - add support for arithmetic operations +// - enables __fp16 causes massive FP exceptions on aarch64, +// needs a deep investigation + +namespace np { + +/// @addtogroup cpp_core_types +/// @{ + +/// Provides a type that implements 16-bit floating point (half-precision). +/// This type is ensured to be 16-bit size. +#if 1 // ndef __ARM_FP16_FORMAT_IEEE +class Half final { + public: + /// Whether `Half` has a full native HW support. + static constexpr bool kNative = false; + /// Whether `Half` has a native HW support for single/double conversion. + template<typename T> + static constexpr bool kNativeConversion = ( + ( + std::is_same_v<T, float> && + #if defined(NPY_HAVE_FP16) || defined(NPY_HAVE_VSX3) + true + #else + false + #endif + ) || ( + std::is_same_v<T, double> && + #if defined(NPY_HAVE_AVX512FP16) || defined(NPY_HAVE_VSX3) + true + #else + false + #endif + ) + ); + + /// Default constructor. initialize nothing. + Half() = default; + + /// Constract from float + /// If there are no hardware optimization available, rounding will always + /// be set to ties to even. + explicit Half(float f) + { + #if defined(NPY_HAVE_FP16) + __m128 mf = _mm_load_ss(&f); + bits_ = static_cast<uint16_t>(_mm_cvtsi128_si32(_mm_cvtps_ph(mf, _MM_FROUND_TO_NEAREST_INT))); + #elif defined(NPY_HAVE_VSX3) && defined(NPY_HAVE_VSX_ASM) + __vector float vf32 = vec_splats(f); + __vector unsigned short vf16; + __asm__ __volatile__ ("xvcvsphp %x0,%x1" : "=wa" (vf16) : "wa" (vf32)); + bits_ = vec_extract(vf16, 0); + #else + bits_ = half_private::FromFloatBits(BitCast<uint32_t>(f)); + #endif + } + + /// Construct from double. + /// If there are no hardware optimization available, rounding will always + /// be set to ties to even. + explicit Half(double f) + { + #if defined(NPY_HAVE_AVX512FP16) + __m128d md = _mm_load_sd(&f); + bits_ = static_cast<uint16_t>(_mm_cvtsi128_si32(_mm_castph_si128(_mm_cvtpd_ph(md)))); + #elif defined(NPY_HAVE_VSX3) && defined(NPY_HAVE_VSX_ASM) + __vector double vf64 = vec_splats(f); + __vector unsigned short vf16; + __asm__ __volatile__ ("xvcvdphp %x0,%x1" : "=wa" (vf16) : "wa" (vf64)); + bits_ = vec_extract(vf16, 0); + #else + bits_ = half_private::FromDoubleBits(BitCast<uint64_t>(f)); + #endif + } + + /// Cast to float + explicit operator float() const + { + #if defined(NPY_HAVE_FP16) + float ret; + _mm_store_ss(&ret, _mm_cvtph_ps(_mm_cvtsi32_si128(bits_))); + return ret; + #elif defined(NPY_HAVE_VSX3) && defined(vec_extract_fp_from_shorth) + return vec_extract(vec_extract_fp_from_shorth(vec_splats(bits_)), 0); + #elif defined(NPY_HAVE_VSX3) && defined(NPY_HAVE_VSX_ASM) + __vector float vf32; + __asm__ __volatile__("xvcvhpsp %x0,%x1" + : "=wa"(vf32) + : "wa"(vec_splats(bits_.u))); + return vec_extract(vf32, 0); + #else + return BitCast<float>(half_private::ToFloatBits(bits_)); + #endif + } + + /// Cast to double + explicit operator double() const + { + #if defined(NPY_HAVE_AVX512FP16) + double ret; + _mm_store_sd(&ret, _mm_cvtph_pd(_mm_castsi128_ph(_mm_cvtsi32_si128(bits_)))); + return ret; + #elif defined(NPY_HAVE_VSX3) && defined(NPY_HAVE_VSX_ASM) + __vector float vf64; + __asm__ __volatile__("xvcvhpdp %x0,%x1" + : "=wa"(vf32) + : "wa"(vec_splats(bits_))); + return vec_extract(vf64, 0); + #else + return BitCast<double>(half_private::ToDoubleBits(bits_)); + #endif + } + + /// Returns a new Half constracted from the IEEE 754 binary16. + static constexpr Half FromBits(uint16_t bits) + { + Half h{}; + h.bits_ = bits; + return h; + } + /// Returns the IEEE 754 binary16 representation. + constexpr uint16_t Bits() const + { + return bits_; + } + + /// @name Comparison operators (orderd) + /// @{ + constexpr bool operator==(Half r) const + { + return !(IsNaN() || r.IsNaN()) && Equal(r); + } + constexpr bool operator<(Half r) const + { + return !(IsNaN() || r.IsNaN()) && Less(r); + } + constexpr bool operator<=(Half r) const + { + return !(IsNaN() || r.IsNaN()) && LessEqual(r); + } + constexpr bool operator>(Half r) const + { + return r < *this; + } + constexpr bool operator>=(Half r) const + { + return r <= *this; + } + /// @} + + /// @name Comparison operators (unorderd) + /// @{ + constexpr bool operator!=(Half r) const + { + return !(*this == r); + } + /// @} Comparison operators + + /// @name Comparison with no guarantee of NaN behavior + /// @{ + constexpr bool Less(Half r) const + { + uint_fast16_t a = static_cast<uint_fast16_t>(bits_), + b = static_cast<uint_fast16_t>(r.bits_); + bool sign_a = (a & 0x8000u) == 0x8000u; + bool sign_b = (b & 0x8000u) == 0x8000u; + // if both `a` and `b` have same sign + // Test if `a` > `b` when `a` has the sign + // or `a` < `b` when is not. + // And make sure they are not equal to each other + // in case of both are equal to +-0 + // else + // Test if `a` has the sign. + // and `a` != -0.0 and `b` != 0.0 + return (sign_a == sign_b) ? (sign_a ^ (a < b)) && (a != b) + : sign_a && ((a | b) != 0x8000u); + } + constexpr bool LessEqual(Half r) const + { + uint_fast16_t a = static_cast<uint_fast16_t>(bits_), + b = static_cast<uint_fast16_t>(r.bits_); + bool sign_a = (a & 0x8000u) == 0x8000u; + bool sign_b = (b & 0x8000u) == 0x8000u; + // if both `a` and `b` have same sign + // Test if `a` > `b` when `a` has the sign + // or `a` < `b` when is not. + // or a == b (needed even if we used <= above instead + // since testing +-0 still required) + // else + // Test if `a` has the sign + // or `a` and `b` equal to +-0.0 + return (sign_a == sign_b) ? (sign_a ^ (a < b)) || (a == b) + : sign_a || ((a | b) == 0x8000u); + } + constexpr bool Equal(Half r) const + { + // fast16 cast is not worth it, since unpack op should involved. + uint16_t a = bits_, b = r.bits_; + return a == b || ((a | b) == 0x8000u); + } + /// @} Comparison + + /// @name Properties + // @{ + constexpr bool IsNaN() const + { + return ((bits_ & 0x7c00u) == 0x7c00u) && + ((bits_ & 0x03ffu) != 0); + } + /// @} Properties + + private: + uint16_t bits_; +}; +#else // __ARM_FP16_FORMAT_IEEE +class Half final { + public: + static constexpr bool kNative = true; + template<typename T> + static constexpr bool kNativeConversion = ( + std::is_same_v<T, float> || std::is_same_v<T, double> + ); + Half() = default; + constexpr Half(__fp16 h) : half_(h) + {} + constexpr operator __fp16() const + { return half_; } + static Half FromBits(uint16_t bits) + { + Half h; + h.half_ = BitCast<__fp16>(bits); + return h; + } + uint16_t Bits() const + { return BitCast<uint16_t>(half_); } + constexpr bool Less(Half r) const + { return half_ < r.half_; } + constexpr bool LessEqual(Half r) const + { return half_ <= r.half_; } + constexpr bool Equal(Half r) const + { return half_ == r.half_; } + constexpr bool IsNaN() const + { return half_ != half_; } + + private: + __fp16 half_; +}; +#endif // __ARM_FP16_FORMAT_IEEE + +/// @} cpp_core_types + +} // namespace np + +#endif // NUMPY_CORE_SRC_COMMON_HALF_HPP diff --git a/numpy/core/src/common/half_private.hpp b/numpy/core/src/common/half_private.hpp new file mode 100644 index 000000000..7a64eb397 --- /dev/null +++ b/numpy/core/src/common/half_private.hpp @@ -0,0 +1,330 @@ +#ifndef NUMPY_CORE_SRC_COMMON_HALF_PRIVATE_HPP +#define NUMPY_CORE_SRC_COMMON_HALF_PRIVATE_HPP + +#include "npstd.hpp" +#include "float_status.hpp" + +/* + * The following functions that emulating float/double/half conversions + * are copied from npymath without any changes to its functionalty. + */ +namespace np { namespace half_private { + +template<bool gen_overflow=true, bool gen_underflow=true, bool round_even=true> +inline uint16_t FromFloatBits(uint32_t f) +{ + uint32_t f_exp, f_sig; + uint16_t h_sgn, h_exp, h_sig; + + h_sgn = (uint16_t) ((f&0x80000000u) >> 16); + f_exp = (f&0x7f800000u); + + /* Exponent overflow/NaN converts to signed inf/NaN */ + if (f_exp >= 0x47800000u) { + if (f_exp == 0x7f800000u) { + /* Inf or NaN */ + f_sig = (f&0x007fffffu); + if (f_sig != 0) { + /* NaN - propagate the flag in the significand... */ + uint16_t ret = (uint16_t) (0x7c00u + (f_sig >> 13)); + /* ...but make sure it stays a NaN */ + if (ret == 0x7c00u) { + ret++; + } + return h_sgn + ret; + } else { + /* signed inf */ + return (uint16_t) (h_sgn + 0x7c00u); + } + } else { + if constexpr (gen_overflow) { + /* overflow to signed inf */ + FloatStatus::RaiseOverflow(); + } + return (uint16_t) (h_sgn + 0x7c00u); + } + } + + /* Exponent underflow converts to a subnormal half or signed zero */ + if (f_exp <= 0x38000000u) { + /* + * Signed zeros, subnormal floats, and floats with small + * exponents all convert to signed zero half-floats. + */ + if (f_exp < 0x33000000u) { + if constexpr (gen_underflow) { + /* If f != 0, it underflowed to 0 */ + if ((f&0x7fffffff) != 0) { + FloatStatus::RaiseUnderflow(); + } + } + return h_sgn; + } + /* Make the subnormal significand */ + f_exp >>= 23; + f_sig = (0x00800000u + (f&0x007fffffu)); + if constexpr (gen_underflow) { + /* If it's not exactly represented, it underflowed */ + if ((f_sig&(((uint32_t)1 << (126 - f_exp)) - 1)) != 0) { + FloatStatus::RaiseUnderflow(); + } + } + /* + * Usually the significand is shifted by 13. For subnormals an + * additional shift needs to occur. This shift is one for the largest + * exponent giving a subnormal `f_exp = 0x38000000 >> 23 = 112`, which + * offsets the new first bit. At most the shift can be 1+10 bits. + */ + f_sig >>= (113 - f_exp); + /* Handle rounding by adding 1 to the bit beyond half precision */ + if constexpr (round_even) { + /* + * If the last bit in the half significand is 0 (already even), and + * the remaining bit pattern is 1000...0, then we do not add one + * to the bit after the half significand. However, the (113 - f_exp) + * shift can lose up to 11 bits, so the || checks them in the original. + * In all other cases, we can just add one. + */ + if (((f_sig&0x00003fffu) != 0x00001000u) || (f&0x000007ffu)) { + f_sig += 0x00001000u; + } + } + else { + f_sig += 0x00001000u; + } + h_sig = (uint16_t) (f_sig >> 13); + /* + * If the rounding causes a bit to spill into h_exp, it will + * increment h_exp from zero to one and h_sig will be zero. + * This is the correct result. + */ + return (uint16_t) (h_sgn + h_sig); + } + + /* Regular case with no overflow or underflow */ + h_exp = (uint16_t) ((f_exp - 0x38000000u) >> 13); + /* Handle rounding by adding 1 to the bit beyond half precision */ + f_sig = (f&0x007fffffu); + if constexpr (round_even) { + /* + * If the last bit in the half significand is 0 (already even), and + * the remaining bit pattern is 1000...0, then we do not add one + * to the bit after the half significand. In all other cases, we do. + */ + if ((f_sig&0x00003fffu) != 0x00001000u) { + f_sig += 0x00001000u; + } + } + else { + f_sig += 0x00001000u; + } + h_sig = (uint16_t) (f_sig >> 13); + /* + * If the rounding causes a bit to spill into h_exp, it will + * increment h_exp by one and h_sig will be zero. This is the + * correct result. h_exp may increment to 15, at greatest, in + * which case the result overflows to a signed inf. + */ + if constexpr (gen_overflow) { + h_sig += h_exp; + if (h_sig == 0x7c00u) { + FloatStatus::RaiseOverflow(); + } + return h_sgn + h_sig; + } + else { + return h_sgn + h_exp + h_sig; + } +} + +template<bool gen_overflow=true, bool gen_underflow=true, bool round_even=true> +inline uint16_t FromDoubleBits(uint64_t d) +{ + uint64_t d_exp, d_sig; + uint16_t h_sgn, h_exp, h_sig; + + h_sgn = (d&0x8000000000000000ULL) >> 48; + d_exp = (d&0x7ff0000000000000ULL); + + /* Exponent overflow/NaN converts to signed inf/NaN */ + if (d_exp >= 0x40f0000000000000ULL) { + if (d_exp == 0x7ff0000000000000ULL) { + /* Inf or NaN */ + d_sig = (d&0x000fffffffffffffULL); + if (d_sig != 0) { + /* NaN - propagate the flag in the significand... */ + uint16_t ret = (uint16_t) (0x7c00u + (d_sig >> 42)); + /* ...but make sure it stays a NaN */ + if (ret == 0x7c00u) { + ret++; + } + return h_sgn + ret; + } else { + /* signed inf */ + return h_sgn + 0x7c00u; + } + } else { + /* overflow to signed inf */ + if constexpr (gen_overflow) { + FloatStatus::RaiseOverflow(); + } + return h_sgn + 0x7c00u; + } + } + + /* Exponent underflow converts to subnormal half or signed zero */ + if (d_exp <= 0x3f00000000000000ULL) { + /* + * Signed zeros, subnormal floats, and floats with small + * exponents all convert to signed zero half-floats. + */ + if (d_exp < 0x3e60000000000000ULL) { + if constexpr (gen_underflow) { + /* If d != 0, it underflowed to 0 */ + if ((d&0x7fffffffffffffffULL) != 0) { + FloatStatus::RaiseUnderflow(); + } + } + return h_sgn; + } + /* Make the subnormal significand */ + d_exp >>= 52; + d_sig = (0x0010000000000000ULL + (d&0x000fffffffffffffULL)); + if constexpr (gen_underflow) { + /* If it's not exactly represented, it underflowed */ + if ((d_sig&(((uint64_t)1 << (1051 - d_exp)) - 1)) != 0) { + FloatStatus::RaiseUnderflow(); + } + } + /* + * Unlike floats, doubles have enough room to shift left to align + * the subnormal significand leading to no loss of the last bits. + * The smallest possible exponent giving a subnormal is: + * `d_exp = 0x3e60000000000000 >> 52 = 998`. All larger subnormals are + * shifted with respect to it. This adds a shift of 10+1 bits the final + * right shift when comparing it to the one in the normal branch. + */ + assert(d_exp - 998 >= 0); + d_sig <<= (d_exp - 998); + /* Handle rounding by adding 1 to the bit beyond half precision */ + if constexpr (round_even) { + /* + * If the last bit in the half significand is 0 (already even), and + * the remaining bit pattern is 1000...0, then we do not add one + * to the bit after the half significand. In all other cases, we do. + */ + if ((d_sig&0x003fffffffffffffULL) != 0x0010000000000000ULL) { + d_sig += 0x0010000000000000ULL; + } + } + else { + d_sig += 0x0010000000000000ULL; + } + h_sig = (uint16_t) (d_sig >> 53); + /* + * If the rounding causes a bit to spill into h_exp, it will + * increment h_exp from zero to one and h_sig will be zero. + * This is the correct result. + */ + return h_sgn + h_sig; + } + + /* Regular case with no overflow or underflow */ + h_exp = (uint16_t) ((d_exp - 0x3f00000000000000ULL) >> 42); + /* Handle rounding by adding 1 to the bit beyond half precision */ + d_sig = (d&0x000fffffffffffffULL); + if constexpr (round_even) { + /* + * If the last bit in the half significand is 0 (already even), and + * the remaining bit pattern is 1000...0, then we do not add one + * to the bit after the half significand. In all other cases, we do. + */ + if ((d_sig&0x000007ffffffffffULL) != 0x0000020000000000ULL) { + d_sig += 0x0000020000000000ULL; + } + } + else { + d_sig += 0x0000020000000000ULL; + } + h_sig = (uint16_t) (d_sig >> 42); + + /* + * If the rounding causes a bit to spill into h_exp, it will + * increment h_exp by one and h_sig will be zero. This is the + * correct result. h_exp may increment to 15, at greatest, in + * which case the result overflows to a signed inf. + */ + if constexpr (gen_overflow) { + h_sig += h_exp; + if (h_sig == 0x7c00u) { + FloatStatus::RaiseOverflow(); + } + return h_sgn + h_sig; + } + else { + return h_sgn + h_exp + h_sig; + } +} + +constexpr uint32_t ToFloatBits(uint16_t h) +{ + uint16_t h_exp = (h&0x7c00u); + uint32_t f_sgn = ((uint32_t)h&0x8000u) << 16; + switch (h_exp) { + case 0x0000u: { // 0 or subnormal + uint16_t h_sig = (h&0x03ffu); + // Signed zero + if (h_sig == 0) { + return f_sgn; + } + // Subnormal + h_sig <<= 1; + while ((h_sig&0x0400u) == 0) { + h_sig <<= 1; + h_exp++; + } + uint32_t f_exp = ((uint32_t)(127 - 15 - h_exp)) << 23; + uint32_t f_sig = ((uint32_t)(h_sig&0x03ffu)) << 13; + return f_sgn + f_exp + f_sig; + } + case 0x7c00u: // inf or NaN + // All-ones exponent and a copy of the significand + return f_sgn + 0x7f800000u + (((uint32_t)(h&0x03ffu)) << 13); + default: // normalized + // Just need to adjust the exponent and shift + return f_sgn + (((uint32_t)(h&0x7fffu) + 0x1c000u) << 13); + } +} + +constexpr uint64_t ToDoubleBits(uint16_t h) +{ + uint16_t h_exp = (h&0x7c00u); + uint64_t d_sgn = ((uint64_t)h&0x8000u) << 48; + switch (h_exp) { + case 0x0000u: { // 0 or subnormal + uint16_t h_sig = (h&0x03ffu); + // Signed zero + if (h_sig == 0) { + return d_sgn; + } + // Subnormal + h_sig <<= 1; + while ((h_sig&0x0400u) == 0) { + h_sig <<= 1; + h_exp++; + } + uint64_t d_exp = ((uint64_t)(1023 - 15 - h_exp)) << 52; + uint64_t d_sig = ((uint64_t)(h_sig&0x03ffu)) << 42; + return d_sgn + d_exp + d_sig; + } + case 0x7c00u: // inf or NaN + // All-ones exponent and a copy of the significand + return d_sgn + 0x7ff0000000000000ULL + (((uint64_t)(h&0x03ffu)) << 42); + default: // normalized + // Just need to adjust the exponent and shift + return d_sgn + (((uint64_t)(h&0x7fffu) + 0xfc000u) << 42); + } +} + +}} // namespace np::half_private +#endif // NUMPY_CORE_SRC_COMMON_HALF_PRIVATE_HPP diff --git a/numpy/core/src/common/lowlevel_strided_loops.h b/numpy/core/src/common/lowlevel_strided_loops.h index 924a34db5..4ad110663 100644 --- a/numpy/core/src/common/lowlevel_strided_loops.h +++ b/numpy/core/src/common/lowlevel_strided_loops.h @@ -425,7 +425,7 @@ PyArray_PrepareThreeRawArrayIter(int ndim, npy_intp const *shape, * blocking. See the 'npy_blocked_end' function documentation below for an * example of how this function is used. */ -static NPY_INLINE npy_intp +static inline npy_intp npy_aligned_block_offset(const void * addr, const npy_uintp esize, const npy_uintp alignment, const npy_uintp nvals) { @@ -458,7 +458,7 @@ npy_aligned_block_offset(const void * addr, const npy_uintp esize, * for(; i < n; i++) * <scalar-op> */ -static NPY_INLINE npy_intp +static inline npy_intp npy_blocked_end(const npy_uintp peel, const npy_uintp esize, const npy_uintp vsz, const npy_uintp nvals) { @@ -472,7 +472,7 @@ npy_blocked_end(const npy_uintp peel, const npy_uintp esize, /* byte swapping functions */ -static NPY_INLINE npy_uint16 +static inline npy_uint16 npy_bswap2(npy_uint16 x) { return ((x & 0xffu) << 8) | (x >> 8); @@ -482,7 +482,7 @@ npy_bswap2(npy_uint16 x) * treat as int16 and byteswap unaligned memory, * some cpus don't support unaligned access */ -static NPY_INLINE void +static inline void npy_bswap2_unaligned(char * x) { char a = x[0]; @@ -490,7 +490,7 @@ npy_bswap2_unaligned(char * x) x[1] = a; } -static NPY_INLINE npy_uint32 +static inline npy_uint32 npy_bswap4(npy_uint32 x) { #ifdef HAVE___BUILTIN_BSWAP32 @@ -501,7 +501,7 @@ npy_bswap4(npy_uint32 x) #endif } -static NPY_INLINE void +static inline void npy_bswap4_unaligned(char * x) { char a = x[0]; @@ -512,7 +512,7 @@ npy_bswap4_unaligned(char * x) x[2] = a; } -static NPY_INLINE npy_uint64 +static inline npy_uint64 npy_bswap8(npy_uint64 x) { #ifdef HAVE___BUILTIN_BSWAP64 @@ -529,7 +529,7 @@ npy_bswap8(npy_uint64 x) #endif } -static NPY_INLINE void +static inline void npy_bswap8_unaligned(char * x) { char a = x[0]; x[0] = x[7]; x[7] = a; @@ -685,7 +685,7 @@ npy_bswap8_unaligned(char * x) size == 1 ? 0 : ((PyArray_NDIM(arr) == 1) ? \ PyArray_STRIDE(arr, 0) : PyArray_ITEMSIZE(arr))) -static NPY_INLINE int +static inline int PyArray_EQUIVALENTLY_ITERABLE_OVERLAP_OK(PyArrayObject *arr1, PyArrayObject *arr2, int arr1_read, int arr2_read) { diff --git a/numpy/core/src/common/meta.hpp b/numpy/core/src/common/meta.hpp new file mode 100644 index 000000000..27ea1857e --- /dev/null +++ b/numpy/core/src/common/meta.hpp @@ -0,0 +1,54 @@ +#ifndef NUMPY_CORE_SRC_COMMON_META_HPP +#define NUMPY_CORE_SRC_COMMON_META_HPP + +#include "npstd.hpp" + +namespace np { namespace meta { +/// @addtogroup cpp_core_meta +/// @{ + +namespace details { +template<int size, bool unsig> +struct IntBySize; + +template<bool unsig> +struct IntBySize<sizeof(uint8_t), unsig> { + using Type = typename std::conditional< + unsig, uint8_t, int8_t>::type; +}; +template<bool unsig> +struct IntBySize<sizeof(uint16_t), unsig> { + using Type = typename std::conditional< + unsig, uint16_t, int16_t>::type; +}; +template<bool unsig> +struct IntBySize<sizeof(uint32_t), unsig> { + using Type = typename std::conditional< + unsig, uint32_t, int32_t>::type; +}; +template<bool unsig> +struct IntBySize<sizeof(uint64_t), unsig> { + using Type = typename std::conditional< + unsig, uint64_t, int64_t>::type; +}; +} // namespace details + +/// Provides safe conversion of any integer type synonyms +/// to a fixed-width integer type. +template<typename T> +struct FixedWidth { + using TF_ = typename details::IntBySize< + sizeof(T), std::is_unsigned<T>::value + >::Type; + + using Type = typename std::conditional< + std::is_integral<T>::value, TF_, T + >::type; +}; + +/// @} cpp_core_meta + +}} // namespace np::meta + +#endif // NUMPY_CORE_SRC_COMMON_META_HPP + diff --git a/numpy/core/src/common/npdef.hpp b/numpy/core/src/common/npdef.hpp new file mode 100644 index 000000000..56a0df52e --- /dev/null +++ b/numpy/core/src/common/npdef.hpp @@ -0,0 +1,28 @@ +#ifndef NUMPY_CORE_SRC_COMMON_NPDEF_HPP +#define NUMPY_CORE_SRC_COMMON_NPDEF_HPP + +#if !defined(__cplusplus) || __cplusplus < 201703L + #error "NumPy requires a compiler with at least C++17 enabled" +#endif + +/// @addtogroup cpp_core_defs +/// @{ + +/// Whether compiler supports C++20 +#if __cplusplus > 202002L + #define NP_HAS_CPP20 1 +#else + #define NP_HAS_CPP20 0 +#endif + +/// Wraps `__has_builtin` +#if defined(__has_builtin) + #define NP_HAS_BUILTIN(INTRIN) __has_builtin(INTRIN) +#else + #define NP_HAS_BUILTIN(INTRIN) 0 +#endif + +/// @} cpp_core_defs + +#endif // NUMPY_CORE_SRC_COMMON_NPDEF_HPP + diff --git a/numpy/core/src/common/npstd.hpp b/numpy/core/src/common/npstd.hpp new file mode 100644 index 000000000..92e3d5cc5 --- /dev/null +++ b/numpy/core/src/common/npstd.hpp @@ -0,0 +1,57 @@ +#ifndef NUMPY_CORE_SRC_COMMON_NPSTD_HPP +#define NUMPY_CORE_SRC_COMMON_NPSTD_HPP + +#include <cstddef> +#include <cstring> +#include <cctype> +#include <cstdint> + +#include <string> +#include <algorithm> +#include <utility> +#include <cstdlib> +#include <cmath> +#include <complex> +#include <type_traits> + +#include <numpy/npy_common.h> + +#include "npy_config.h" + +namespace np { +/// @addtogroup cpp_core_types +/// @{ +using std::uint8_t; +using std::int8_t; +using std::uint16_t; +using std::int16_t; +using std::uint32_t; +using std::int32_t; +using std::uint64_t; +using std::int64_t; +using std::uintptr_t; +using std::intptr_t; +using std::complex; +using std::uint_fast16_t; +using std::uint_fast32_t; + +/** Guard for long double. + * + * The C implementation defines long double as double + * on MinGW to provide compatibility with MSVC to unify + * one behavior under Windows OS, which makes npy_longdouble + * not fit to be used with template specialization or overloading. + * + * This type will be set to `void` when `npy_longdouble` is not defined + * as `long double`. + */ +using LongDouble = typename std::conditional< + !std::is_same<npy_longdouble, long double>::value, + void, npy_longdouble +>::type; +/// @} cpp_core_types + +} // namespace np + +#endif // NUMPY_CORE_SRC_COMMON_NPSTD_HPP + diff --git a/numpy/core/src/common/npy_cblas.h b/numpy/core/src/common/npy_cblas.h index 30fec1a65..751854b6e 100644 --- a/numpy/core/src/common/npy_cblas.h +++ b/numpy/core/src/common/npy_cblas.h @@ -66,7 +66,7 @@ enum CBLAS_SIDE {CblasLeft=141, CblasRight=142}; * Convert NumPy stride to BLAS stride. Returns 0 if conversion cannot be done * (BLAS won't handle negative or zero strides the way we want). */ -static NPY_INLINE CBLAS_INT +static inline CBLAS_INT blas_stride(npy_intp stride, unsigned itemsize) { /* diff --git a/numpy/core/src/common/npy_config.h b/numpy/core/src/common/npy_config.h index d6886c5ea..715b17777 100644 --- a/numpy/core/src/common/npy_config.h +++ b/numpy/core/src/common/npy_config.h @@ -2,6 +2,7 @@ #define NUMPY_CORE_SRC_COMMON_NPY_CONFIG_H_ #include "config.h" +#include "npy_cpu_dispatch.h" // brings NPY_HAVE_[CPU features] #include "numpy/numpyconfig.h" #include "numpy/utils.h" #include "numpy/npy_os.h" diff --git a/numpy/core/src/common/npy_cpu_features.c b/numpy/core/src/common/npy_cpu_features.c index 773f4af86..64a85f6fb 100644 --- a/numpy/core/src/common/npy_cpu_features.c +++ b/numpy/core/src/common/npy_cpu_features.c @@ -1,6 +1,6 @@ #include "npy_cpu_features.h" #include "npy_cpu_dispatch.h" // To guarantee the CPU baseline definitions are in scope. -#include "numpy/npy_common.h" // for NPY_INLINE +#include "numpy/npy_common.h" #include "numpy/npy_cpu.h" // To guarantee the CPU definitions are in scope. /******************** Private Definitions *********************/ @@ -14,13 +14,15 @@ static unsigned char npy__cpu_have[NPY_CPU_FEATURE_MAX]; static void npy__cpu_init_features(void); /* - * Disable CPU dispatched features at runtime if environment variable - * 'NPY_DISABLE_CPU_FEATURES' is defined. + * Enable or disable CPU dispatched features at runtime if the environment variable + * `NPY_ENABLE_CPU_FEATURES` or `NPY_DISABLE_CPU_FEATURES` + * depends on the value of boolean parameter `disable`(toggle). + * * Multiple features can be present, and separated by space, comma, or tab. - * Raises an error if parsing fails or if the feature was not enabled + * Raises an error if parsing fails or if the feature was not enabled or disabled */ static int -npy__cpu_try_disable_env(void); +npy__cpu_check_env(int disable, const char *env); /* Ensure the build's CPU baseline features are supported at runtime */ static int @@ -43,9 +45,22 @@ npy_cpu_init(void) if (npy__cpu_validate_baseline() < 0) { return -1; } - if (npy__cpu_try_disable_env() < 0) { + char *enable_env = getenv("NPY_ENABLE_CPU_FEATURES"); + char *disable_env = getenv("NPY_DISABLE_CPU_FEATURES"); + int is_enable = enable_env && enable_env[0]; + int is_disable = disable_env && disable_env[0]; + if (is_enable & is_disable) { + PyErr_Format(PyExc_ImportError, + "Both NPY_DISABLE_CPU_FEATURES and NPY_ENABLE_CPU_FEATURES " + "environment variables cannot be set simultaneously." + ); return -1; } + if (is_enable | is_disable) { + if (npy__cpu_check_env(is_disable, is_disable ? disable_env : enable_env) < 0) { + return -1; + } + } return 0; } @@ -81,12 +96,14 @@ static struct { {NPY_CPU_FEATURE_AVX512VBMI, "AVX512VBMI"}, {NPY_CPU_FEATURE_AVX512VBMI2, "AVX512VBMI2"}, {NPY_CPU_FEATURE_AVX512BITALG, "AVX512BITALG"}, + {NPY_CPU_FEATURE_AVX512FP16 , "AVX512FP16"}, {NPY_CPU_FEATURE_AVX512_KNL, "AVX512_KNL"}, {NPY_CPU_FEATURE_AVX512_KNM, "AVX512_KNM"}, {NPY_CPU_FEATURE_AVX512_SKX, "AVX512_SKX"}, {NPY_CPU_FEATURE_AVX512_CLX, "AVX512_CLX"}, {NPY_CPU_FEATURE_AVX512_CNL, "AVX512_CNL"}, {NPY_CPU_FEATURE_AVX512_ICL, "AVX512_ICL"}, + {NPY_CPU_FEATURE_AVX512_SPR, "AVX512_SPR"}, {NPY_CPU_FEATURE_VSX, "VSX"}, {NPY_CPU_FEATURE_VSX2, "VSX2"}, {NPY_CPU_FEATURE_VSX3, "VSX3"}, @@ -166,7 +183,7 @@ npy_cpu_dispatch_list(void) * features that had been configured via --cpu-baseline * otherwise it returns 0 */ -static NPY_INLINE int +static inline int npy__cpu_baseline_fid(const char *feature) { #if !defined(NPY_DISABLE_OPTIMIZATION) && NPY_WITH_CPU_BASELINE_N > 0 @@ -179,7 +196,7 @@ npy__cpu_baseline_fid(const char *feature) * features that had been configured via --cpu-dispatch * otherwise it returns 0 */ -static NPY_INLINE int +static inline int npy__cpu_dispatch_fid(const char *feature) { #if !defined(NPY_DISABLE_OPTIMIZATION) && NPY_WITH_CPU_DISPATCH_N > 0 @@ -208,7 +225,8 @@ npy__cpu_validate_baseline(void) *(fptr-1) = '\0'; // trim the last space PyErr_Format(PyExc_RuntimeError, "NumPy was built with baseline optimizations: \n" - "(" NPY_WITH_CPU_BASELINE ") but your machine doesn't support:\n(%s).", + "(" NPY_WITH_CPU_BASELINE ") but your machine " + "doesn't support:\n(%s).", baseline_failure ); return -1; @@ -218,27 +236,31 @@ npy__cpu_validate_baseline(void) } static int -npy__cpu_try_disable_env(void) -{ - char *disenv = getenv("NPY_DISABLE_CPU_FEATURES"); - if (disenv == NULL || disenv[0] == 0) { - return 0; - } - #define NPY__CPU_ENV_ERR_HEAD \ - "During parsing environment variable 'NPY_DISABLE_CPU_FEATURES':\n" +npy__cpu_check_env(int disable, const char *env) { + + static const char *names[] = { + "enable", "disable", + "NPY_ENABLE_CPU_FEATURES", "NPY_DISABLE_CPU_FEATURES", + "During parsing environment variable: 'NPY_ENABLE_CPU_FEATURES':\n", + "During parsing environment variable: 'NPY_DISABLE_CPU_FEATURES':\n" + }; + disable = disable ? 1 : 0; + const char *act_name = names[disable]; + const char *env_name = names[disable + 2]; + const char *err_head = names[disable + 4]; #if !defined(NPY_DISABLE_OPTIMIZATION) && NPY_WITH_CPU_DISPATCH_N > 0 #define NPY__MAX_VAR_LEN 1024 // More than enough for this era - size_t var_len = strlen(disenv) + 1; + size_t var_len = strlen(env) + 1; if (var_len > NPY__MAX_VAR_LEN) { PyErr_Format(PyExc_RuntimeError, - "Length of environment variable 'NPY_DISABLE_CPU_FEATURES' is %d, only %d accepted", - var_len, NPY__MAX_VAR_LEN - 1 + "Length of environment variable '%s' is %d, only %d accepted", + env_name, var_len, NPY__MAX_VAR_LEN ); return -1; } - char disable_features[NPY__MAX_VAR_LEN]; - memcpy(disable_features, disenv, var_len); + char features[NPY__MAX_VAR_LEN]; + memcpy(features, env, var_len); char nexist[NPY__MAX_VAR_LEN]; char *nexist_cur = &nexist[0]; @@ -248,17 +270,19 @@ npy__cpu_try_disable_env(void) //comma and space including (htab, vtab, CR, LF, FF) const char *delim = ", \t\v\r\n\f"; - char *feature = strtok(disable_features, delim); + char *feature = strtok(features, delim); while (feature) { - if (npy__cpu_baseline_fid(feature) > 0) { - PyErr_Format(PyExc_RuntimeError, - NPY__CPU_ENV_ERR_HEAD - "You cannot disable CPU feature '%s', since it is part of " - "the baseline optimizations:\n" - "(" NPY_WITH_CPU_BASELINE ").", - feature - ); - return -1; + if (npy__cpu_baseline_fid(feature) > 0){ + if (disable) { + PyErr_Format(PyExc_RuntimeError, + "%s" + "You cannot disable CPU feature '%s', since it is part of " + "the baseline optimizations:\n" + "(" NPY_WITH_CPU_BASELINE ").", + err_head, feature + ); + return -1; + } goto next; } // check if the feature is part of dispatched features int feature_id = npy__cpu_dispatch_fid(feature); @@ -275,47 +299,58 @@ npy__cpu_try_disable_env(void) notsupp_cur[flen] = ' '; notsupp_cur += flen + 1; goto next; } - // Finally we can disable it - npy__cpu_have[feature_id] = 0; + // Finally we can disable or mark for enabling + npy__cpu_have[feature_id] = disable ? 0:2; next: feature = strtok(NULL, delim); } + if (!disable){ + // Disables any unmarked dispatched feature. + #define NPY__CPU_DISABLE_DISPATCH_CB(FEATURE, DUMMY) \ + if(npy__cpu_have[NPY_CAT(NPY_CPU_FEATURE_, FEATURE)] != 0)\ + {npy__cpu_have[NPY_CAT(NPY_CPU_FEATURE_, FEATURE)]--;}\ + + NPY_WITH_CPU_DISPATCH_CALL(NPY__CPU_DISABLE_DISPATCH_CB, DUMMY) // extra arg for msvc + } *nexist_cur = '\0'; if (nexist[0] != '\0') { *(nexist_cur-1) = '\0'; // trim the last space - if (PyErr_WarnFormat(PyExc_RuntimeWarning, 1, - NPY__CPU_ENV_ERR_HEAD - "You cannot disable CPU features (%s), since " - "they are not part of the dispatched optimizations\n" - "(" NPY_WITH_CPU_DISPATCH ").", - nexist + if (PyErr_WarnFormat(PyExc_ImportWarning, 1, + "%sYou cannot %s CPU features (%s), since " + "they are not part of the dispatched optimizations\n" + "(" NPY_WITH_CPU_DISPATCH ").", + err_head, act_name, nexist ) < 0) { return -1; } + return 0; } + #define NOTSUPP_BODY \ + "%s" \ + "You cannot %s CPU features (%s), since " \ + "they are not supported by your machine.", \ + err_head, act_name, notsupp + *notsupp_cur = '\0'; if (notsupp[0] != '\0') { *(notsupp_cur-1) = '\0'; // trim the last space - if (PyErr_WarnFormat(PyExc_RuntimeWarning, 1, - NPY__CPU_ENV_ERR_HEAD - "You cannot disable CPU features (%s), since " - "they are not supported by your machine.", - notsupp - ) < 0) { + if (!disable){ + PyErr_Format(PyExc_RuntimeError, NOTSUPP_BODY); return -1; } } #else - if (PyErr_WarnFormat(PyExc_RuntimeWarning, 1, - NPY__CPU_ENV_ERR_HEAD - "You cannot use environment variable 'NPY_DISABLE_CPU_FEATURES', since " + if (PyErr_WarnFormat(PyExc_ImportWarning, 1, + "%s" + "You cannot use environment variable '%s', since " #ifdef NPY_DISABLE_OPTIMIZATION - "the NumPy library was compiled with optimization disabled." + "the NumPy library was compiled with optimization disabled.", #else - "the NumPy library was compiled without any dispatched optimizations." + "the NumPy library was compiled without any dispatched optimizations.", #endif + err_head, env_name, act_name ) < 0) { return -1; } @@ -506,6 +541,11 @@ npy__cpu_init_features(void) npy__cpu_have[NPY_CPU_FEATURE_AVX512VBMI2] && npy__cpu_have[NPY_CPU_FEATURE_AVX512BITALG] && npy__cpu_have[NPY_CPU_FEATURE_AVX512VPOPCNTDQ]; + // Sapphire Rapids + npy__cpu_have[NPY_CPU_FEATURE_AVX512FP16] = (reg[3] & (1 << 23)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_AVX512_SPR] = npy__cpu_have[NPY_CPU_FEATURE_AVX512_ICL] && + npy__cpu_have[NPY_CPU_FEATURE_AVX512FP16]; + } } @@ -513,7 +553,10 @@ npy__cpu_init_features(void) #elif defined(NPY_CPU_PPC64) || defined(NPY_CPU_PPC64LE) -#ifdef __linux__ +#if defined(__linux__) || defined(__FreeBSD__) + #ifdef __FreeBSD__ + #include <machine/cpu.h> // defines PPC_FEATURE_HAS_VSX + #endif #include <sys/auxv.h> #ifndef AT_HWCAP2 #define AT_HWCAP2 26 @@ -533,12 +576,21 @@ static void npy__cpu_init_features(void) { memset(npy__cpu_have, 0, sizeof(npy__cpu_have[0]) * NPY_CPU_FEATURE_MAX); +#if defined(__linux__) || defined(__FreeBSD__) #ifdef __linux__ unsigned int hwcap = getauxval(AT_HWCAP); if ((hwcap & PPC_FEATURE_HAS_VSX) == 0) return; hwcap = getauxval(AT_HWCAP2); +#else + unsigned long hwcap; + elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap)); + if ((hwcap & PPC_FEATURE_HAS_VSX) == 0) + return; + + elf_aux_info(AT_HWCAP2, &hwcap, sizeof(hwcap)); +#endif // __linux__ if (hwcap & PPC_FEATURE2_ARCH_3_1) { npy__cpu_have[NPY_CPU_FEATURE_VSX] = @@ -551,7 +603,7 @@ npy__cpu_init_features(void) npy__cpu_have[NPY_CPU_FEATURE_VSX2] = (hwcap & PPC_FEATURE2_ARCH_2_07) != 0; npy__cpu_have[NPY_CPU_FEATURE_VSX3] = (hwcap & PPC_FEATURE2_ARCH_3_00) != 0; npy__cpu_have[NPY_CPU_FEATURE_VSX4] = (hwcap & PPC_FEATURE2_ARCH_3_1) != 0; -// TODO: AIX, FreeBSD +// TODO: AIX, OpenBSD #else npy__cpu_have[NPY_CPU_FEATURE_VSX] = 1; #if defined(NPY_CPU_PPC64LE) || defined(NPY_HAVE_VSX2) @@ -606,7 +658,7 @@ npy__cpu_init_features(void) #elif defined(__arm__) || defined(__aarch64__) -static NPY_INLINE void +static inline void npy__cpu_init_features_arm8(void) { npy__cpu_have[NPY_CPU_FEATURE_NEON] = diff --git a/numpy/core/src/common/npy_cpu_features.h b/numpy/core/src/common/npy_cpu_features.h index 96c543e70..9180670c4 100644 --- a/numpy/core/src/common/npy_cpu_features.h +++ b/numpy/core/src/common/npy_cpu_features.h @@ -43,6 +43,7 @@ enum npy_cpu_features NPY_CPU_FEATURE_AVX512VNNI = 42, NPY_CPU_FEATURE_AVX512VBMI2 = 43, NPY_CPU_FEATURE_AVX512BITALG = 44, + NPY_CPU_FEATURE_AVX512FP16 = 45, // X86 CPU Groups // Knights Landing (F,CD,ER,PF) @@ -57,6 +58,8 @@ enum npy_cpu_features NPY_CPU_FEATURE_AVX512_CNL = 105, // Ice Lake (F,CD,BW,DQ,VL,IFMA,VBMI,VNNI,VBMI2,BITALG,VPOPCNTDQ) NPY_CPU_FEATURE_AVX512_ICL = 106, + // Sapphire Rapids (Ice Lake, AVX512FP16) + NPY_CPU_FEATURE_AVX512_SPR = 107, // IBM/POWER VSX // POWER7 @@ -103,13 +106,25 @@ enum npy_cpu_features * - detects runtime CPU features * - check that baseline CPU features are present * - uses 'NPY_DISABLE_CPU_FEATURES' to disable dispatchable features + * - uses 'NPY_ENABLE_CPU_FEATURES' to enable dispatchable features * * It will set a RuntimeError when * - CPU baseline features from the build are not supported at runtime * - 'NPY_DISABLE_CPU_FEATURES' tries to disable a baseline feature - * and will warn if 'NPY_DISABLE_CPU_FEATURES' tries to disable a feature that - * is not disabled (the machine or build does not support it, or the project was - * not built with any feature optimization support) + * - 'NPY_DISABLE_CPU_FEATURES' and 'NPY_ENABLE_CPU_FEATURES' are + * simultaneously set + * - 'NPY_ENABLE_CPU_FEATURES' tries to enable a feature that is not supported + * by the machine or build + * - 'NPY_ENABLE_CPU_FEATURES' tries to enable a feature when the project was + * not built with any feature optimization support + * + * It will set an ImportWarning when: + * - 'NPY_DISABLE_CPU_FEATURES' tries to disable a feature that is not supported + * by the machine or build + * - 'NPY_DISABLE_CPU_FEATURES' or 'NPY_ENABLE_CPU_FEATURES' tries to + * disable/enable a feature when the project was not built with any feature + * optimization support + * * return 0 on success otherwise return -1 */ NPY_VISIBILITY_HIDDEN int diff --git a/numpy/core/src/common/npy_ctypes.h b/numpy/core/src/common/npy_ctypes.h index 05761cad3..4b5634574 100644 --- a/numpy/core/src/common/npy_ctypes.h +++ b/numpy/core/src/common/npy_ctypes.h @@ -14,7 +14,7 @@ * This entire function is just a wrapper around the Python function of the * same name. */ -NPY_INLINE static int +static inline int npy_ctypes_check(PyTypeObject *obj) { static PyObject *py_func = NULL; diff --git a/numpy/core/src/common/npy_extint128.h b/numpy/core/src/common/npy_extint128.h index d563c2ac8..776d71c77 100644 --- a/numpy/core/src/common/npy_extint128.h +++ b/numpy/core/src/common/npy_extint128.h @@ -9,7 +9,7 @@ typedef struct { /* Integer addition with overflow checking */ -static NPY_INLINE npy_int64 +static inline npy_int64 safe_add(npy_int64 a, npy_int64 b, char *overflow_flag) { if (a > 0 && b > NPY_MAX_INT64 - a) { @@ -23,7 +23,7 @@ safe_add(npy_int64 a, npy_int64 b, char *overflow_flag) /* Integer subtraction with overflow checking */ -static NPY_INLINE npy_int64 +static inline npy_int64 safe_sub(npy_int64 a, npy_int64 b, char *overflow_flag) { if (a >= 0 && b < a - NPY_MAX_INT64) { @@ -37,7 +37,7 @@ safe_sub(npy_int64 a, npy_int64 b, char *overflow_flag) /* Integer multiplication with overflow checking */ -static NPY_INLINE npy_int64 +static inline npy_int64 safe_mul(npy_int64 a, npy_int64 b, char *overflow_flag) { if (a > 0) { @@ -58,7 +58,7 @@ safe_mul(npy_int64 a, npy_int64 b, char *overflow_flag) /* Long integer init */ -static NPY_INLINE npy_extint128_t +static inline npy_extint128_t to_128(npy_int64 x) { npy_extint128_t result; @@ -74,7 +74,7 @@ to_128(npy_int64 x) } -static NPY_INLINE npy_int64 +static inline npy_int64 to_64(npy_extint128_t x, char *overflow) { if (x.hi != 0 || @@ -87,7 +87,7 @@ to_64(npy_extint128_t x, char *overflow) /* Long integer multiply */ -static NPY_INLINE npy_extint128_t +static inline npy_extint128_t mul_64_64(npy_int64 a, npy_int64 b) { npy_extint128_t x, y, z; @@ -127,7 +127,7 @@ mul_64_64(npy_int64 a, npy_int64 b) /* Long integer add */ -static NPY_INLINE npy_extint128_t +static inline npy_extint128_t add_128(npy_extint128_t x, npy_extint128_t y, char *overflow) { npy_extint128_t z; @@ -170,7 +170,7 @@ add_128(npy_extint128_t x, npy_extint128_t y, char *overflow) /* Long integer negation */ -static NPY_INLINE npy_extint128_t +static inline npy_extint128_t neg_128(npy_extint128_t x) { npy_extint128_t z = x; @@ -179,14 +179,14 @@ neg_128(npy_extint128_t x) } -static NPY_INLINE npy_extint128_t +static inline npy_extint128_t sub_128(npy_extint128_t x, npy_extint128_t y, char *overflow) { return add_128(x, neg_128(y), overflow); } -static NPY_INLINE npy_extint128_t +static inline npy_extint128_t shl_128(npy_extint128_t v) { npy_extint128_t z; @@ -198,7 +198,7 @@ shl_128(npy_extint128_t v) } -static NPY_INLINE npy_extint128_t +static inline npy_extint128_t shr_128(npy_extint128_t v) { npy_extint128_t z; @@ -209,7 +209,7 @@ shr_128(npy_extint128_t v) return z; } -static NPY_INLINE int +static inline int gt_128(npy_extint128_t a, npy_extint128_t b) { if (a.sign > 0 && b.sign > 0) { @@ -228,7 +228,7 @@ gt_128(npy_extint128_t a, npy_extint128_t b) /* Long integer divide */ -static NPY_INLINE npy_extint128_t +static inline npy_extint128_t divmod_128_64(npy_extint128_t x, npy_int64 b, npy_int64 *mod) { npy_extint128_t remainder, pointer, result, divisor; @@ -284,7 +284,7 @@ divmod_128_64(npy_extint128_t x, npy_int64 b, npy_int64 *mod) /* Divide and round down (positive divisor; no overflows) */ -static NPY_INLINE npy_extint128_t +static inline npy_extint128_t floordiv_128_64(npy_extint128_t a, npy_int64 b) { npy_extint128_t result; @@ -300,7 +300,7 @@ floordiv_128_64(npy_extint128_t a, npy_int64 b) /* Divide and round up (positive divisor; no overflows) */ -static NPY_INLINE npy_extint128_t +static inline npy_extint128_t ceildiv_128_64(npy_extint128_t a, npy_int64 b) { npy_extint128_t result; diff --git a/numpy/core/src/common/npy_hashtable.c b/numpy/core/src/common/npy_hashtable.c index af82c0f85..14f6cca1b 100644 --- a/numpy/core/src/common/npy_hashtable.c +++ b/numpy/core/src/common/npy_hashtable.c @@ -35,7 +35,7 @@ * * Users cannot control pointers, so we do not have to worry about DoS attacks? */ -static NPY_INLINE Py_hash_t +static inline Py_hash_t identity_list_hash(PyObject *const *v, int len) { Py_uhash_t acc = _NpyHASH_XXPRIME_5; @@ -58,7 +58,7 @@ identity_list_hash(PyObject *const *v, int len) #undef _NpyHASH_XXROTATE -static NPY_INLINE PyObject ** +static inline PyObject ** find_item(PyArrayIdentityHash const *tb, PyObject *const *key) { Py_hash_t hash = identity_list_hash(key, tb->key_len); diff --git a/numpy/core/src/common/npy_import.h b/numpy/core/src/common/npy_import.h index f36b6924a..58b4ba0bc 100644 --- a/numpy/core/src/common/npy_import.h +++ b/numpy/core/src/common/npy_import.h @@ -16,7 +16,7 @@ * @param attr module attribute to cache. * @param cache Storage location for imported function. */ -NPY_INLINE static void +static inline void npy_cache_import(const char *module, const char *attr, PyObject **cache) { if (NPY_UNLIKELY(*cache == NULL)) { diff --git a/numpy/core/src/common/npy_pycompat.h b/numpy/core/src/common/npy_pycompat.h index 6641cd591..ce6c34fa1 100644 --- a/numpy/core/src/common/npy_pycompat.h +++ b/numpy/core/src/common/npy_pycompat.h @@ -11,7 +11,7 @@ #if PY_VERSION_HEX > 0x030a00a6 #define Npy_HashDouble _Py_HashDouble #else -static NPY_INLINE Py_hash_t +static inline Py_hash_t Npy_HashDouble(PyObject *NPY_UNUSED(identity), double val) { return _Py_HashDouble(val); diff --git a/numpy/core/src/common/npy_sort.h.src b/numpy/core/src/common/npy_sort.h.src index a3f556f56..d6e435722 100644 --- a/numpy/core/src/common/npy_sort.h.src +++ b/numpy/core/src/common/npy_sort.h.src @@ -9,7 +9,7 @@ #define NPY_ENOMEM 1 #define NPY_ECOMP 2 -static NPY_INLINE int npy_get_msb(npy_uintp unum) +static inline int npy_get_msb(npy_uintp unum) { int depth_limit = 0; while (unum >>= 1) { diff --git a/numpy/core/src/common/numpyos.c b/numpy/core/src/common/numpyos.c index 5ee95191d..2fec06e1c 100644 --- a/numpy/core/src/common/numpyos.c +++ b/numpy/core/src/common/numpyos.c @@ -779,3 +779,25 @@ NumPyOS_strtoull(const char *str, char **endptr, int base) return strtoull(str, endptr, base); } +#ifdef _MSC_VER + +#include <stdlib.h> + +#if _MSC_VER >= 1900 +/* npy3k_compat.h uses this function in the _Py_BEGIN/END_SUPPRESS_IPH + * macros. It does not need to be defined when building using MSVC + * earlier than 14.0 (_MSC_VER == 1900). + */ + +static void __cdecl _silent_invalid_parameter_handler( + wchar_t const* expression, + wchar_t const* function, + wchar_t const* file, + unsigned int line, + uintptr_t pReserved) { } + +_invalid_parameter_handler _Py_silent_invalid_parameter_handler = _silent_invalid_parameter_handler; + +#endif + +#endif diff --git a/numpy/core/src/common/simd/avx2/arithmetic.h b/numpy/core/src/common/simd/avx2/arithmetic.h index ad9688338..58d842a6d 100644 --- a/numpy/core/src/common/simd/avx2/arithmetic.h +++ b/numpy/core/src/common/simd/avx2/arithmetic.h @@ -247,6 +247,10 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor) // negate multiply and subtract, -(a*b) - c #define npyv_nmulsub_f32 _mm256_fnmsub_ps #define npyv_nmulsub_f64 _mm256_fnmsub_pd + // multiply, add for odd elements and subtract even elements. + // (a * b) -+ c + #define npyv_muladdsub_f32 _mm256_fmaddsub_ps + #define npyv_muladdsub_f64 _mm256_fmaddsub_pd #else // multiply and add, a*b + c NPY_FINLINE npyv_f32 npyv_muladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) @@ -274,6 +278,13 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor) npyv_f64 neg_a = npyv_xor_f64(a, npyv_setall_f64(-0.0)); return npyv_sub_f64(npyv_mul_f64(neg_a, b), c); } + // multiply, add for odd elements and subtract even elements. + // (a * b) -+ c + NPY_FINLINE npyv_f32 npyv_muladdsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) + { return _mm256_addsub_ps(npyv_mul_f32(a, b), c); } + NPY_FINLINE npyv_f64 npyv_muladdsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) + { return _mm256_addsub_pd(npyv_mul_f64(a, b), c); } + #endif // !NPY_HAVE_FMA3 /*************************** diff --git a/numpy/core/src/common/simd/avx2/avx2.h b/numpy/core/src/common/simd/avx2/avx2.h index 8cb74df2b..d64f3c6d6 100644 --- a/numpy/core/src/common/simd/avx2/avx2.h +++ b/numpy/core/src/common/simd/avx2/avx2.h @@ -11,6 +11,7 @@ #define NPY_SIMD_FMA3 0 // fast emulated #endif #define NPY_SIMD_BIGENDIAN 0 +#define NPY_SIMD_CMPSIGNAL 0 // Enough limit to allow us to use _mm256_i32gather_* #define NPY_SIMD_MAXLOAD_STRIDE32 (0x7fffffff / 8) diff --git a/numpy/core/src/common/simd/avx2/memory.h b/numpy/core/src/common/simd/avx2/memory.h index 410c35dc8..81144a36b 100644 --- a/numpy/core/src/common/simd/avx2/memory.h +++ b/numpy/core/src/common/simd/avx2/memory.h @@ -84,17 +84,6 @@ NPY_FINLINE npyv_s32 npyv_loadn_s32(const npy_int32 *ptr, npy_intp stride) NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride) { return _mm256_castsi256_ps(npyv_loadn_u32((const npy_uint32*)ptr, stride)); } //// 64 -#if 0 // slower -NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride) -{ - const __m256i idx = npyv_set_s64(0, 1*stride, 2*stride, 3*stride); - return _mm256_i64gather_epi64((const void*)ptr, idx, 8); -} -NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride) -{ return npyv_loadn_u64((const npy_uint64*)ptr, stride); } -NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride) -{ return _mm256_castsi256_pd(npyv_loadn_u64((const npy_uint64*)ptr, stride)); } -#endif NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride) { __m128d a0 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)ptr)); @@ -107,6 +96,32 @@ NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride) { return _mm256_castpd_si256(npyv_loadn_f64((const double*)ptr, stride)); } NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride) { return _mm256_castpd_si256(npyv_loadn_f64((const double*)ptr, stride)); } + +//// 64-bit load over 32-bit stride +NPY_FINLINE npyv_f32 npyv_loadn2_f32(const float *ptr, npy_intp stride) +{ + __m128d a0 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)ptr)); + __m128d a2 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)(ptr + stride*2))); + __m128d a01 = _mm_loadh_pd(a0, (const double*)(ptr + stride)); + __m128d a23 = _mm_loadh_pd(a2, (const double*)(ptr + stride*3)); + return _mm256_castpd_ps(_mm256_insertf128_pd(_mm256_castpd128_pd256(a01), a23, 1)); +} +NPY_FINLINE npyv_u32 npyv_loadn2_u32(const npy_uint32 *ptr, npy_intp stride) +{ return _mm256_castps_si256(npyv_loadn2_f32((const float*)ptr, stride)); } +NPY_FINLINE npyv_s32 npyv_loadn2_s32(const npy_int32 *ptr, npy_intp stride) +{ return _mm256_castps_si256(npyv_loadn2_f32((const float*)ptr, stride)); } + +//// 128-bit load over 64-bit stride +NPY_FINLINE npyv_f64 npyv_loadn2_f64(const double *ptr, npy_intp stride) +{ + __m256d a = npyv_loadl_f64(ptr); + return _mm256_insertf128_pd(a, _mm_loadu_pd(ptr + stride), 1); +} +NPY_FINLINE npyv_u64 npyv_loadn2_u64(const npy_uint64 *ptr, npy_intp stride) +{ return _mm256_castpd_si256(npyv_loadn2_f64((const double*)ptr, stride)); } +NPY_FINLINE npyv_s64 npyv_loadn2_s64(const npy_int64 *ptr, npy_intp stride) +{ return _mm256_castpd_si256(npyv_loadn2_f64((const double*)ptr, stride)); } + /*************************** * Non-contiguous Store ***************************/ @@ -143,6 +158,32 @@ NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a) NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a) { npyv_storen_f64((double*)ptr, stride, _mm256_castsi256_pd(a)); } +//// 64-bit store over 32-bit stride +NPY_FINLINE void npyv_storen2_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a) +{ + __m128d a0 = _mm256_castpd256_pd128(_mm256_castsi256_pd(a)); + __m128d a1 = _mm256_extractf128_pd(_mm256_castsi256_pd(a), 1); + _mm_storel_pd((double*)ptr, a0); + _mm_storeh_pd((double*)(ptr + stride), a0); + _mm_storel_pd((double*)(ptr + stride*2), a1); + _mm_storeh_pd((double*)(ptr + stride*3), a1); +} +NPY_FINLINE void npyv_storen2_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a) +{ npyv_storen2_u32((npy_uint32*)ptr, stride, a); } +NPY_FINLINE void npyv_storen2_f32(float *ptr, npy_intp stride, npyv_f32 a) +{ npyv_storen2_u32((npy_uint32*)ptr, stride, _mm256_castps_si256(a)); } + +//// 128-bit store over 64-bit stride +NPY_FINLINE void npyv_storen2_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a) +{ + npyv_storel_u64(ptr, a); + npyv_storeh_u64(ptr + stride, a); +} +NPY_FINLINE void npyv_storen2_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a) +{ npyv_storen2_u64((npy_uint64*)ptr, stride, a); } +NPY_FINLINE void npyv_storen2_f64(double *ptr, npy_intp stride, npyv_f64 a) +{ npyv_storen2_u64((npy_uint64*)ptr, stride, _mm256_castpd_si256(a)); } + /********************************* * Partial Load *********************************/ @@ -174,7 +215,7 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n const __m256i steps = npyv_set_s64(0, 1, 2, 3); __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane); __m256i mask = _mm256_cmpgt_epi64(vnlane, steps); - __m256i payload = _mm256_maskload_epi64((const void*)ptr, mask); + __m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask); return _mm256_blendv_epi8(vfill, payload, mask); } // fill zero to rest lanes @@ -184,7 +225,45 @@ NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) const __m256i steps = npyv_set_s64(0, 1, 2, 3); __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane); __m256i mask = _mm256_cmpgt_epi64(vnlane, steps); - return _mm256_maskload_epi64((const void*)ptr, mask); + return _mm256_maskload_epi64((const long long*)ptr, mask); +} + +//// 64-bit nlane +NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane, + npy_int32 fill_lo, npy_int32 fill_hi) +{ + assert(nlane > 0); + const __m256i vfill = npyv_set_s32( + fill_lo, fill_hi, fill_lo, fill_hi, + fill_lo, fill_hi, fill_lo, fill_hi + ); + const __m256i steps = npyv_set_s64(0, 1, 2, 3); + __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane); + __m256i mask = _mm256_cmpgt_epi64(vnlane, steps); + __m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask); + return _mm256_blendv_epi8(vfill, payload, mask); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) +{ return npyv_load_tillz_s64((const npy_int64*)ptr, nlane); } + +/// 128-bit nlane +NPY_FINLINE npyv_u64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) +{ + assert(nlane > 0); + npy_int64 m = -((npy_int64)(nlane > 1)); + __m256i mask = npyv_set_s64(-1, -1, m, m); + return _mm256_maskload_epi64((const long long*)ptr, mask); +} +// fill zero to rest lanes +NPY_FINLINE npyv_u64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane, + npy_int64 fill_lo, npy_int64 fill_hi) +{ + const __m256i vfill = npyv_set_s64(0, 0, fill_lo, fill_hi); + npy_int64 m = -((npy_int64)(nlane > 1)); + __m256i mask = npyv_set_s64(-1, -1, m, m); + __m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask); + return _mm256_blendv_epi8(vfill, payload, mask); } /********************************* * Non-contiguous partial load @@ -216,12 +295,53 @@ npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_ const __m256i steps = npyv_set_s64(0, 1, 2, 3); __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane); __m256i mask = _mm256_cmpgt_epi64(vnlane, steps); - return _mm256_mask_i64gather_epi64(vfill, (const void*)ptr, idx, mask, 8); + return _mm256_mask_i64gather_epi64(vfill, (const long long*)ptr, idx, mask, 8); } // fill zero to rest lanes NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane) { return npyv_loadn_till_s64(ptr, stride, nlane, 0); } + +//// 64-bit load over 32-bit stride +NPY_FINLINE npyv_s64 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, + npy_int32 fill_lo, npy_int32 fill_hi) +{ + assert(nlane > 0); + const __m256i vfill = npyv_set_s32( + fill_lo, fill_hi, fill_lo, fill_hi, + fill_lo, fill_hi, fill_lo, fill_hi + ); + const __m256i idx = npyv_set_s64(0, 1*stride, 2*stride, 3*stride); + const __m256i steps = npyv_set_s64(0, 1, 2, 3); + __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane); + __m256i mask = _mm256_cmpgt_epi64(vnlane, steps); + return _mm256_mask_i64gather_epi64(vfill, (const long long*)ptr, idx, mask, 4); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane) +{ return npyv_loadn2_till_s32(ptr, stride, nlane, 0, 0); } + +//// 128-bit load over 64-bit stride +NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, + npy_int64 fill_lo, npy_int64 fill_hi) +{ + assert(nlane > 0); + __m256i a = npyv_loadl_s64(ptr); +#if defined(_MSC_VER) && defined(_M_IX86) + __m128i fill =_mm_setr_epi32( + (int)fill_lo, (int)(fill_lo >> 32), + (int)fill_hi, (int)(fill_hi >> 32) + ); +#else + __m128i fill = _mm_set_epi64x(fill_hi, fill_lo); +#endif + __m128i b = nlane > 1 ? _mm_loadu_si128((const __m128i*)(ptr + stride)) : fill; + return _mm256_inserti128_si256(a, b, 1); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane) +{ return npyv_loadn2_till_s64(ptr, stride, nlane, 0, 0); } + /********************************* * Partial store *********************************/ @@ -241,7 +361,21 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a const __m256i steps = npyv_set_s64(0, 1, 2, 3); __m256i vnlane = npyv_setall_s64(nlane > 8 ? 8 : (int)nlane); __m256i mask = _mm256_cmpgt_epi64(vnlane, steps); - _mm256_maskstore_epi64((void*)ptr, mask, a); + _mm256_maskstore_epi64((long long*)ptr, mask, a); +} + +//// 64-bit nlane +NPY_FINLINE void npyv_store2_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a) +{ npyv_store_till_s64((npy_int64*)ptr, nlane, a); } + +//// 128-bit nlane +NPY_FINLINE void npyv_store2_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a) +{ + assert(nlane > 0); + npyv_storel_s64(ptr, a); + if (nlane > 1) { + npyv_storeh_s64(ptr + 2, a); + } } /********************************* * Non-contiguous partial store @@ -252,23 +386,52 @@ NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp assert(nlane > 0); __m128i a0 = _mm256_castsi256_si128(a); __m128i a1 = _mm256_extracti128_si256(a, 1); + + ptr[stride*0] = _mm_extract_epi32(a0, 0); switch(nlane) { - default: - ptr[stride*7] = _mm_extract_epi32(a1, 3); - case 7: - ptr[stride*6] = _mm_extract_epi32(a1, 2); - case 6: - ptr[stride*5] = _mm_extract_epi32(a1, 1); + case 1: + return; + case 2: + ptr[stride*1] = _mm_extract_epi32(a0, 1); + return; + case 3: + ptr[stride*1] = _mm_extract_epi32(a0, 1); + ptr[stride*2] = _mm_extract_epi32(a0, 2); + return; + case 4: + ptr[stride*1] = _mm_extract_epi32(a0, 1); + ptr[stride*2] = _mm_extract_epi32(a0, 2); + ptr[stride*3] = _mm_extract_epi32(a0, 3); + return; case 5: + ptr[stride*1] = _mm_extract_epi32(a0, 1); + ptr[stride*2] = _mm_extract_epi32(a0, 2); + ptr[stride*3] = _mm_extract_epi32(a0, 3); ptr[stride*4] = _mm_extract_epi32(a1, 0); - case 4: + return; + case 6: + ptr[stride*1] = _mm_extract_epi32(a0, 1); + ptr[stride*2] = _mm_extract_epi32(a0, 2); ptr[stride*3] = _mm_extract_epi32(a0, 3); - case 3: + ptr[stride*4] = _mm_extract_epi32(a1, 0); + ptr[stride*5] = _mm_extract_epi32(a1, 1); + return; + case 7: + ptr[stride*1] = _mm_extract_epi32(a0, 1); ptr[stride*2] = _mm_extract_epi32(a0, 2); - case 2: + ptr[stride*3] = _mm_extract_epi32(a0, 3); + ptr[stride*4] = _mm_extract_epi32(a1, 0); + ptr[stride*5] = _mm_extract_epi32(a1, 1); + ptr[stride*6] = _mm_extract_epi32(a1, 2); + return; + default: ptr[stride*1] = _mm_extract_epi32(a0, 1); - case 1: - ptr[stride*0] = _mm_extract_epi32(a0, 0); + ptr[stride*2] = _mm_extract_epi32(a0, 2); + ptr[stride*3] = _mm_extract_epi32(a0, 3); + ptr[stride*4] = _mm_extract_epi32(a1, 0); + ptr[stride*5] = _mm_extract_epi32(a1, 1); + ptr[stride*6] = _mm_extract_epi32(a1, 2); + ptr[stride*7] = _mm_extract_epi32(a1, 3); } } //// 64 @@ -277,19 +440,60 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp assert(nlane > 0); __m128d a0 = _mm256_castpd256_pd128(_mm256_castsi256_pd(a)); __m128d a1 = _mm256_extractf128_pd(_mm256_castsi256_pd(a), 1); + double *dptr = (double*)ptr; + _mm_storel_pd(dptr, a0); switch(nlane) { - default: - _mm_storeh_pd(dptr + stride * 3, a1); + case 1: + return; + case 2: + _mm_storeh_pd(dptr + stride * 1, a0); + return; case 3: + _mm_storeh_pd(dptr + stride * 1, a0); _mm_storel_pd(dptr + stride * 2, a1); - case 2: + return; + default: _mm_storeh_pd(dptr + stride * 1, a0); + _mm_storel_pd(dptr + stride * 2, a1); + _mm_storeh_pd(dptr + stride * 3, a1); + } +} + +//// 64-bit store over 32-bit stride +NPY_FINLINE void npyv_storen2_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a) +{ + assert(nlane > 0); + __m128d a0 = _mm256_castpd256_pd128(_mm256_castsi256_pd(a)); + __m128d a1 = _mm256_extractf128_pd(_mm256_castsi256_pd(a), 1); + + _mm_storel_pd((double*)ptr, a0); + switch(nlane) { case 1: - _mm_storel_pd(dptr + stride * 0, a0); + return; + case 2: + _mm_storeh_pd((double*)(ptr + stride * 1), a0); + return; + case 3: + _mm_storeh_pd((double*)(ptr + stride * 1), a0); + _mm_storel_pd((double*)(ptr + stride * 2), a1); + return; + default: + _mm_storeh_pd((double*)(ptr + stride * 1), a0); + _mm_storel_pd((double*)(ptr + stride * 2), a1); + _mm_storeh_pd((double*)(ptr + stride * 3), a1); } } +//// 128-bit store over 64-bit stride +NPY_FINLINE void npyv_storen2_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a) +{ + assert(nlane > 0); + npyv_storel_s64(ptr, a); + if (nlane > 1) { + npyv_storeh_s64(ptr + stride, a); + } +} /***************************************************************************** * Implement partial load/store for u32/f32/u64/f64... via reinterpret cast *****************************************************************************/ @@ -300,7 +504,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp union { \ npyv_lanetype_##F_SFX from_##F_SFX; \ npyv_lanetype_##T_SFX to_##T_SFX; \ - } pun = {.from_##F_SFX = fill}; \ + } pun; \ + pun.from_##F_SFX = fill; \ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX( \ (const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX \ )); \ @@ -312,7 +517,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp union { \ npyv_lanetype_##F_SFX from_##F_SFX; \ npyv_lanetype_##T_SFX to_##T_SFX; \ - } pun = {.from_##F_SFX = fill}; \ + } pun; \ + pun.from_##F_SFX = fill; \ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX( \ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX \ )); \ @@ -353,6 +559,110 @@ NPYV_IMPL_AVX2_REST_PARTIAL_TYPES(f32, s32) NPYV_IMPL_AVX2_REST_PARTIAL_TYPES(u64, s64) NPYV_IMPL_AVX2_REST_PARTIAL_TYPES(f64, s64) +// 128-bit/64-bit stride (load/store pair) +#define NPYV_IMPL_AVX2_REST_PARTIAL_TYPES_PAIR(F_SFX, T_SFX) \ + NPY_FINLINE npyv_##F_SFX npyv_load2_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, \ + npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \ + { \ + union pun { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + }; \ + union pun pun_lo; \ + union pun pun_hi; \ + pun_lo.from_##F_SFX = fill_lo; \ + pun_hi.from_##F_SFX = fill_hi; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane, pun_lo.to_##T_SFX, pun_hi.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn2_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \ + npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \ + { \ + union pun { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + }; \ + union pun pun_lo; \ + union pun pun_hi; \ + pun_lo.from_##F_SFX = fill_lo; \ + pun_hi.from_##F_SFX = fill_hi; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun_lo.to_##T_SFX, \ + pun_hi.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_load2_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \ + { \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn2_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \ + { \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \ + )); \ + } \ + NPY_FINLINE void npyv_store2_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_store2_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } \ + NPY_FINLINE void npyv_storen2_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_storen2_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } + +NPYV_IMPL_AVX2_REST_PARTIAL_TYPES_PAIR(u32, s32) +NPYV_IMPL_AVX2_REST_PARTIAL_TYPES_PAIR(f32, s32) +NPYV_IMPL_AVX2_REST_PARTIAL_TYPES_PAIR(u64, s64) +NPYV_IMPL_AVX2_REST_PARTIAL_TYPES_PAIR(f64, s64) + +/************************************************************ + * de-interlave load / interleave contiguous store + ************************************************************/ +// two channels +#define NPYV_IMPL_AVX2_MEM_INTERLEAVE(SFX, ZSFX) \ + NPY_FINLINE npyv_##ZSFX##x2 npyv_zip_##ZSFX(npyv_##ZSFX, npyv_##ZSFX); \ + NPY_FINLINE npyv_##ZSFX##x2 npyv_unzip_##ZSFX(npyv_##ZSFX, npyv_##ZSFX); \ + NPY_FINLINE npyv_##SFX##x2 npyv_load_##SFX##x2( \ + const npyv_lanetype_##SFX *ptr \ + ) { \ + return npyv_unzip_##ZSFX( \ + npyv_load_##SFX(ptr), npyv_load_##SFX(ptr+npyv_nlanes_##SFX) \ + ); \ + } \ + NPY_FINLINE void npyv_store_##SFX##x2( \ + npyv_lanetype_##SFX *ptr, npyv_##SFX##x2 v \ + ) { \ + npyv_##SFX##x2 zip = npyv_zip_##ZSFX(v.val[0], v.val[1]); \ + npyv_store_##SFX(ptr, zip.val[0]); \ + npyv_store_##SFX(ptr + npyv_nlanes_##SFX, zip.val[1]); \ + } + +NPYV_IMPL_AVX2_MEM_INTERLEAVE(u8, u8) +NPYV_IMPL_AVX2_MEM_INTERLEAVE(s8, u8) +NPYV_IMPL_AVX2_MEM_INTERLEAVE(u16, u16) +NPYV_IMPL_AVX2_MEM_INTERLEAVE(s16, u16) +NPYV_IMPL_AVX2_MEM_INTERLEAVE(u32, u32) +NPYV_IMPL_AVX2_MEM_INTERLEAVE(s32, u32) +NPYV_IMPL_AVX2_MEM_INTERLEAVE(u64, u64) +NPYV_IMPL_AVX2_MEM_INTERLEAVE(s64, u64) +NPYV_IMPL_AVX2_MEM_INTERLEAVE(f32, f32) +NPYV_IMPL_AVX2_MEM_INTERLEAVE(f64, f64) + /********************************* * Lookup tables *********************************/ diff --git a/numpy/core/src/common/simd/avx2/operators.h b/numpy/core/src/common/simd/avx2/operators.h index c10267b21..7b9b6a344 100644 --- a/numpy/core/src/common/simd/avx2/operators.h +++ b/numpy/core/src/common/simd/avx2/operators.h @@ -205,7 +205,7 @@ NPY_FINLINE __m256i npyv_cmpge_u32(__m256i a, __m256i b) #define npyv_cmple_u64(A, B) npyv_cmpge_u64(B, A) #define npyv_cmple_s64(A, B) npyv_cmpge_s64(B, A) -// precision comparison +// precision comparison (ordered) #define npyv_cmpeq_f32(A, B) _mm256_castps_si256(_mm256_cmp_ps(A, B, _CMP_EQ_OQ)) #define npyv_cmpeq_f64(A, B) _mm256_castpd_si256(_mm256_cmp_pd(A, B, _CMP_EQ_OQ)) #define npyv_cmpneq_f32(A, B) _mm256_castps_si256(_mm256_cmp_ps(A, B, _CMP_NEQ_UQ)) diff --git a/numpy/core/src/common/simd/avx2/reorder.h b/numpy/core/src/common/simd/avx2/reorder.h index 4d6ec8f75..9ebe0e7f4 100644 --- a/numpy/core/src/common/simd/avx2/reorder.h +++ b/numpy/core/src/common/simd/avx2/reorder.h @@ -94,6 +94,75 @@ NPY_FINLINE npyv_f64x2 npyv_zip_f64(__m256d a, __m256d b) return npyv_combine_f64(ab0, ab1); } +// deinterleave two vectors +NPY_FINLINE npyv_u8x2 npyv_unzip_u8(npyv_u8 ab0, npyv_u8 ab1) +{ + const __m256i idx = _mm256_setr_epi8( + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 + ); + __m256i ab_03 = _mm256_shuffle_epi8(ab0, idx); + __m256i ab_12 = _mm256_shuffle_epi8(ab1, idx); + npyv_u8x2 ab_lh = npyv_combine_u8(ab_03, ab_12); + npyv_u8x2 r; + r.val[0] = _mm256_unpacklo_epi64(ab_lh.val[0], ab_lh.val[1]); + r.val[1] = _mm256_unpackhi_epi64(ab_lh.val[0], ab_lh.val[1]); + return r; +} +#define npyv_unzip_s8 npyv_unzip_u8 + +NPY_FINLINE npyv_u16x2 npyv_unzip_u16(npyv_u16 ab0, npyv_u16 ab1) +{ + const __m256i idx = _mm256_setr_epi8( + 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15, + 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15 + ); + __m256i ab_03 = _mm256_shuffle_epi8(ab0, idx); + __m256i ab_12 = _mm256_shuffle_epi8(ab1, idx); + npyv_u16x2 ab_lh = npyv_combine_u16(ab_03, ab_12); + npyv_u16x2 r; + r.val[0] = _mm256_unpacklo_epi64(ab_lh.val[0], ab_lh.val[1]); + r.val[1] = _mm256_unpackhi_epi64(ab_lh.val[0], ab_lh.val[1]); + return r; +} +#define npyv_unzip_s16 npyv_unzip_u16 + +NPY_FINLINE npyv_u32x2 npyv_unzip_u32(npyv_u32 ab0, npyv_u32 ab1) +{ + const __m256i idx = npyv_set_u32(0, 2, 4, 6, 1, 3, 5, 7); + __m256i abl = _mm256_permutevar8x32_epi32(ab0, idx); + __m256i abh = _mm256_permutevar8x32_epi32(ab1, idx); + return npyv_combine_u32(abl, abh); +} +#define npyv_unzip_s32 npyv_unzip_u32 + +NPY_FINLINE npyv_u64x2 npyv_unzip_u64(npyv_u64 ab0, npyv_u64 ab1) +{ + npyv_u64x2 ab_lh = npyv_combine_u64(ab0, ab1); + npyv_u64x2 r; + r.val[0] = _mm256_unpacklo_epi64(ab_lh.val[0], ab_lh.val[1]); + r.val[1] = _mm256_unpackhi_epi64(ab_lh.val[0], ab_lh.val[1]); + return r; +} +#define npyv_unzip_s64 npyv_unzip_u64 + +NPY_FINLINE npyv_f32x2 npyv_unzip_f32(npyv_f32 ab0, npyv_f32 ab1) +{ + const __m256i idx = npyv_set_u32(0, 2, 4, 6, 1, 3, 5, 7); + __m256 abl = _mm256_permutevar8x32_ps(ab0, idx); + __m256 abh = _mm256_permutevar8x32_ps(ab1, idx); + return npyv_combine_f32(abl, abh); +} + +NPY_FINLINE npyv_f64x2 npyv_unzip_f64(npyv_f64 ab0, npyv_f64 ab1) +{ + npyv_f64x2 ab_lh = npyv_combine_f64(ab0, ab1); + npyv_f64x2 r; + r.val[0] = _mm256_unpacklo_pd(ab_lh.val[0], ab_lh.val[1]); + r.val[1] = _mm256_unpackhi_pd(ab_lh.val[0], ab_lh.val[1]); + return r; +} + // Reverse elements of each 64-bit lane NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a) { @@ -126,4 +195,22 @@ NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a) return _mm256_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)); } +// Permuting the elements of each 128-bit lane by immediate index for +// each element. +#define npyv_permi128_u32(A, E0, E1, E2, E3) \ + _mm256_shuffle_epi32(A, _MM_SHUFFLE(E3, E2, E1, E0)) + +#define npyv_permi128_s32 npyv_permi128_u32 + +#define npyv_permi128_u64(A, E0, E1) \ + _mm256_shuffle_epi32(A, _MM_SHUFFLE(((E1)<<1)+1, ((E1)<<1), ((E0)<<1)+1, ((E0)<<1))) + +#define npyv_permi128_s64 npyv_permi128_u64 + +#define npyv_permi128_f32(A, E0, E1, E2, E3) \ + _mm256_permute_ps(A, _MM_SHUFFLE(E3, E2, E1, E0)) + +#define npyv_permi128_f64(A, E0, E1) \ + _mm256_permute_pd(A, ((E1)<<3) | ((E0)<<2) | ((E1)<<1) | (E0)) + #endif // _NPY_SIMD_AVX2_REORDER_H diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h b/numpy/core/src/common/simd/avx512/arithmetic.h index 1290dc0ad..a63da87d0 100644 --- a/numpy/core/src/common/simd/avx512/arithmetic.h +++ b/numpy/core/src/common/simd/avx512/arithmetic.h @@ -349,6 +349,10 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor) // negate multiply and subtract, -(a*b) - c #define npyv_nmulsub_f32 _mm512_fnmsub_ps #define npyv_nmulsub_f64 _mm512_fnmsub_pd +// multiply, add for odd elements and subtract even elements. +// (a * b) -+ c +#define npyv_muladdsub_f32 _mm512_fmaddsub_ps +#define npyv_muladdsub_f64 _mm512_fmaddsub_pd /*************************** * Summation: Calculates the sum of all vector elements. diff --git a/numpy/core/src/common/simd/avx512/avx512.h b/numpy/core/src/common/simd/avx512/avx512.h index 0946e6443..aa6abe256 100644 --- a/numpy/core/src/common/simd/avx512/avx512.h +++ b/numpy/core/src/common/simd/avx512/avx512.h @@ -7,6 +7,7 @@ #define NPY_SIMD_F64 1 #define NPY_SIMD_FMA3 1 // native support #define NPY_SIMD_BIGENDIAN 0 +#define NPY_SIMD_CMPSIGNAL 0 // Enough limit to allow us to use _mm512_i32gather_* and _mm512_i32scatter_* #define NPY_SIMD_MAXLOAD_STRIDE32 (0x7fffffff / 16) #define NPY_SIMD_MAXSTORE_STRIDE32 (0x7fffffff / 16) diff --git a/numpy/core/src/common/simd/avx512/maskop.h b/numpy/core/src/common/simd/avx512/maskop.h index d1c188390..88fa4a68c 100644 --- a/numpy/core/src/common/simd/avx512/maskop.h +++ b/numpy/core/src/common/simd/avx512/maskop.h @@ -51,4 +51,17 @@ NPYV_IMPL_AVX512_MASK_ADDSUB(s64, b64, epi64) NPYV_IMPL_AVX512_MASK_ADDSUB(f32, b32, ps) NPYV_IMPL_AVX512_MASK_ADDSUB(f64, b64, pd) +// division, m ? a / b : c +NPY_FINLINE npyv_f32 npyv_ifdiv_f32(npyv_b32 m, npyv_f32 a, npyv_f32 b, npyv_f32 c) +{ return _mm512_mask_div_ps(c, m, a, b); } +// conditional division, m ? a / b : 0 +NPY_FINLINE npyv_f32 npyv_ifdivz_f32(npyv_b32 m, npyv_f32 a, npyv_f32 b) +{ return _mm512_maskz_div_ps(m, a, b); } +// division, m ? a / b : c +NPY_FINLINE npyv_f64 npyv_ifdiv_f64(npyv_b32 m, npyv_f64 a, npyv_f64 b, npyv_f64 c) +{ return _mm512_mask_div_pd(c, m, a, b); } +// conditional division, m ? a / b : 0 +NPY_FINLINE npyv_f64 npyv_ifdivz_f64(npyv_b32 m, npyv_f64 a, npyv_f64 b) +{ return _mm512_maskz_div_pd(m, a, b); } + #endif // _NPY_SIMD_AVX512_MASKOP_H diff --git a/numpy/core/src/common/simd/avx512/memory.h b/numpy/core/src/common/simd/avx512/memory.h index 03fcb4630..fdf96a92c 100644 --- a/numpy/core/src/common/simd/avx512/memory.h +++ b/numpy/core/src/common/simd/avx512/memory.h @@ -120,6 +120,52 @@ NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride) { return npyv_loadn_u64((const npy_uint64*)ptr, stride); } NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride) { return _mm512_castsi512_pd(npyv_loadn_u64((const npy_uint64*)ptr, stride)); } + +//// 64-bit load over 32-bit stride +NPY_FINLINE npyv_u32 npyv_loadn2_u32(const npy_uint32 *ptr, npy_intp stride) +{ + __m128d a = _mm_loadh_pd( + _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)ptr)), + (const double*)(ptr + stride) + ); + __m128d b = _mm_loadh_pd( + _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)(ptr + stride*2))), + (const double*)(ptr + stride*3) + ); + __m128d c = _mm_loadh_pd( + _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)(ptr + stride*4))), + (const double*)(ptr + stride*5) + ); + __m128d d = _mm_loadh_pd( + _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)(ptr + stride*6))), + (const double*)(ptr + stride*7) + ); + return _mm512_castpd_si512(npyv512_combine_pd256( + _mm256_insertf128_pd(_mm256_castpd128_pd256(a), b, 1), + _mm256_insertf128_pd(_mm256_castpd128_pd256(c), d, 1) + )); +} +NPY_FINLINE npyv_s32 npyv_loadn2_s32(const npy_int32 *ptr, npy_intp stride) +{ return npyv_loadn2_u32((const npy_uint32*)ptr, stride); } +NPY_FINLINE npyv_f32 npyv_loadn2_f32(const float *ptr, npy_intp stride) +{ return _mm512_castsi512_ps(npyv_loadn2_u32((const npy_uint32*)ptr, stride)); } + +//// 128-bit load over 64-bit stride +NPY_FINLINE npyv_f64 npyv_loadn2_f64(const double *ptr, npy_intp stride) +{ + __m128d a = _mm_loadu_pd(ptr); + __m128d b = _mm_loadu_pd(ptr + stride); + __m128d c = _mm_loadu_pd(ptr + stride * 2); + __m128d d = _mm_loadu_pd(ptr + stride * 3); + return npyv512_combine_pd256( + _mm256_insertf128_pd(_mm256_castpd128_pd256(a), b, 1), + _mm256_insertf128_pd(_mm256_castpd128_pd256(c), d, 1) + ); +} +NPY_FINLINE npyv_u64 npyv_loadn2_u64(const npy_uint64 *ptr, npy_intp stride) +{ return npyv_reinterpret_u64_f64(npyv_loadn2_f64((const double*)ptr, stride)); } +NPY_FINLINE npyv_s64 npyv_loadn2_s64(const npy_int64 *ptr, npy_intp stride) +{ return npyv_loadn2_u64((const npy_uint64*)ptr, stride); } /*************************** * Non-contiguous Store ***************************/ @@ -151,6 +197,48 @@ NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a) NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a) { npyv_storen_u64((npy_uint64*)ptr, stride, _mm512_castpd_si512(a)); } +//// 64-bit store over 32-bit stride +NPY_FINLINE void npyv_storen2_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a) +{ + __m256d lo = _mm512_castpd512_pd256(_mm512_castsi512_pd(a)); + __m256d hi = _mm512_extractf64x4_pd(_mm512_castsi512_pd(a), 1); + __m128d e0 = _mm256_castpd256_pd128(lo); + __m128d e1 = _mm256_extractf128_pd(lo, 1); + __m128d e2 = _mm256_castpd256_pd128(hi); + __m128d e3 = _mm256_extractf128_pd(hi, 1); + _mm_storel_pd((double*)(ptr + stride * 0), e0); + _mm_storeh_pd((double*)(ptr + stride * 1), e0); + _mm_storel_pd((double*)(ptr + stride * 2), e1); + _mm_storeh_pd((double*)(ptr + stride * 3), e1); + _mm_storel_pd((double*)(ptr + stride * 4), e2); + _mm_storeh_pd((double*)(ptr + stride * 5), e2); + _mm_storel_pd((double*)(ptr + stride * 6), e3); + _mm_storeh_pd((double*)(ptr + stride * 7), e3); +} +NPY_FINLINE void npyv_storen2_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a) +{ npyv_storen2_u32((npy_uint32*)ptr, stride, a); } +NPY_FINLINE void npyv_storen2_f32(float *ptr, npy_intp stride, npyv_f32 a) +{ npyv_storen2_u32((npy_uint32*)ptr, stride, _mm512_castps_si512(a)); } + +//// 128-bit store over 64-bit stride +NPY_FINLINE void npyv_storen2_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a) +{ + __m256i lo = npyv512_lower_si256(a); + __m256i hi = npyv512_higher_si256(a); + __m128i e0 = _mm256_castsi256_si128(lo); + __m128i e1 = _mm256_extracti128_si256(lo, 1); + __m128i e2 = _mm256_castsi256_si128(hi); + __m128i e3 = _mm256_extracti128_si256(hi, 1); + _mm_storeu_si128((__m128i*)(ptr + stride * 0), e0); + _mm_storeu_si128((__m128i*)(ptr + stride * 1), e1); + _mm_storeu_si128((__m128i*)(ptr + stride * 2), e2); + _mm_storeu_si128((__m128i*)(ptr + stride * 3), e3); +} +NPY_FINLINE void npyv_storen2_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a) +{ npyv_storen2_u64((npy_uint64*)ptr, stride, a); } +NPY_FINLINE void npyv_storen2_f64(double *ptr, npy_intp stride, npyv_f64 a) +{ npyv_storen2_u64((npy_uint64*)ptr, stride, _mm512_castpd_si512(a)); } + /********************************* * Partial Load *********************************/ @@ -159,14 +247,14 @@ NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, n { assert(nlane > 0); const __m512i vfill = _mm512_set1_epi32(fill); - const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1; + const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1; return _mm512_mask_loadu_epi32(vfill, mask, (const __m512i*)ptr); } // fill zero to rest lanes NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) { assert(nlane > 0); - const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1; + const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1; return _mm512_maskz_loadu_epi32(mask, (const __m512i*)ptr); } //// 64 @@ -174,14 +262,44 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n { assert(nlane > 0); const __m512i vfill = npyv_setall_s64(fill); - const __mmask8 mask = nlane > 31 ? -1 : (1 << nlane) - 1; + const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1; return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr); } // fill zero to rest lanes NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) { assert(nlane > 0); - const __mmask8 mask = nlane > 15 ? -1 : (1 << nlane) - 1; + const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1; + return _mm512_maskz_loadu_epi64(mask, (const __m512i*)ptr); +} + +//// 64-bit nlane +NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane, + npy_int32 fill_lo, npy_int32 fill_hi) +{ + assert(nlane > 0); + const __m512i vfill = _mm512_set4_epi32(fill_hi, fill_lo, fill_hi, fill_lo); + const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1; + return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) +{ return npyv_load_tillz_s64((const npy_int64*)ptr, nlane); } + +//// 128-bit nlane +NPY_FINLINE npyv_u64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane, + npy_int64 fill_lo, npy_int64 fill_hi) +{ + assert(nlane > 0); + const __m512i vfill = _mm512_set4_epi64(fill_hi, fill_lo, fill_hi, fill_lo); + const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1; + return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) +{ + assert(nlane > 0); + const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1; return _mm512_maskz_loadu_epi64(mask, (const __m512i*)ptr); } /********************************* @@ -198,7 +316,7 @@ npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_ ); const __m512i idx = _mm512_mullo_epi32(steps, _mm512_set1_epi32((int)stride)); const __m512i vfill = _mm512_set1_epi32(fill); - const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1; + const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1; return _mm512_mask_i32gather_epi32(vfill, mask, idx, (const __m512i*)ptr, 4); } // fill zero to rest lanes @@ -215,13 +333,48 @@ npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_ 4*stride, 5*stride, 6*stride, 7*stride ); const __m512i vfill = npyv_setall_s64(fill); - const __mmask8 mask = nlane > 31 ? -1 : (1 << nlane) - 1; + const __mmask8 mask = nlane > 15 ? -1 : (1 << nlane) - 1; return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 8); } // fill zero to rest lanes NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane) { return npyv_loadn_till_s64(ptr, stride, nlane, 0); } + +//// 64-bit load over 32-bit stride +NPY_FINLINE npyv_s64 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, + npy_int32 fill_lo, npy_int32 fill_hi) +{ + assert(nlane > 0); + const __m512i idx = npyv_set_s64( + 0*stride, 1*stride, 2*stride, 3*stride, + 4*stride, 5*stride, 6*stride, 7*stride + ); + const __m512i vfill = _mm512_set4_epi32(fill_hi, fill_lo, fill_hi, fill_lo); + const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1; + return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 4); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane) +{ return npyv_loadn2_till_s32(ptr, stride, nlane, 0, 0); } + +//// 128-bit load over 64-bit stride +NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, + npy_int64 fill_lo, npy_int64 fill_hi) +{ + assert(nlane > 0); + const __m512i idx = npyv_set_s64( + 0, 1, stride, stride+1, + stride*2, stride*2+1, stride*3, stride*3+1 + ); + const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1; + const __m512i vfill = _mm512_set4_epi64(fill_hi, fill_lo, fill_hi, fill_lo); + return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 8); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane) +{ return npyv_loadn2_till_s64(ptr, stride, nlane, 0, 0); } + /********************************* * Partial store *********************************/ @@ -229,14 +382,30 @@ npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane) NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a) { assert(nlane > 0); - const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1; + const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1; _mm512_mask_storeu_epi32((__m512i*)ptr, mask, a); } //// 64 NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a) { assert(nlane > 0); - const __mmask8 mask = nlane > 15 ? -1 : (1 << nlane) - 1; + const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1; + _mm512_mask_storeu_epi64((__m512i*)ptr, mask, a); +} + +//// 64-bit nlane +NPY_FINLINE void npyv_store2_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a) +{ + assert(nlane > 0); + const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1; + _mm512_mask_storeu_epi64((__m512i*)ptr, mask, a); +} + +//// 128-bit nlane +NPY_FINLINE void npyv_store2_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a) +{ + assert(nlane > 0); + const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1; _mm512_mask_storeu_epi64((__m512i*)ptr, mask, a); } /********************************* @@ -251,7 +420,7 @@ NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ); const __m512i idx = _mm512_mullo_epi32(steps, _mm512_set1_epi32((int)stride)); - const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1; + const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1; _mm512_mask_i32scatter_epi32((__m512i*)ptr, mask, idx, a, 4); } //// 64 @@ -262,7 +431,31 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp 0*stride, 1*stride, 2*stride, 3*stride, 4*stride, 5*stride, 6*stride, 7*stride ); - const __mmask8 mask = nlane > 15 ? -1 : (1 << nlane) - 1; + const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1; + _mm512_mask_i64scatter_epi64((__m512i*)ptr, mask, idx, a, 8); +} + +//// 64-bit store over 32-bit stride +NPY_FINLINE void npyv_storen2_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a) +{ + assert(nlane > 0); + const __m512i idx = npyv_set_s64( + 0*stride, 1*stride, 2*stride, 3*stride, + 4*stride, 5*stride, 6*stride, 7*stride + ); + const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1; + _mm512_mask_i64scatter_epi64((__m512i*)ptr, mask, idx, a, 4); +} + +//// 128-bit store over 64-bit stride +NPY_FINLINE void npyv_storen2_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a) +{ + assert(nlane > 0); + const __m512i idx = npyv_set_s64( + 0, 1, stride, stride+1, + 2*stride, 2*stride+1, 3*stride, 3*stride+1 + ); + const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1; _mm512_mask_i64scatter_epi64((__m512i*)ptr, mask, idx, a, 8); } @@ -331,6 +524,110 @@ NPYV_IMPL_AVX512_REST_PARTIAL_TYPES(f32, s32) NPYV_IMPL_AVX512_REST_PARTIAL_TYPES(u64, s64) NPYV_IMPL_AVX512_REST_PARTIAL_TYPES(f64, s64) +// 128-bit/64-bit stride (pair load/store) +#define NPYV_IMPL_AVX512_REST_PARTIAL_TYPES_PAIR(F_SFX, T_SFX) \ + NPY_FINLINE npyv_##F_SFX npyv_load2_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, \ + npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \ + { \ + union pun { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + }; \ + union pun pun_lo; \ + union pun pun_hi; \ + pun_lo.from_##F_SFX = fill_lo; \ + pun_hi.from_##F_SFX = fill_hi; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane, pun_lo.to_##T_SFX, pun_hi.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn2_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \ + npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \ + { \ + union pun { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + }; \ + union pun pun_lo; \ + union pun pun_hi; \ + pun_lo.from_##F_SFX = fill_lo; \ + pun_hi.from_##F_SFX = fill_hi; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun_lo.to_##T_SFX, \ + pun_hi.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_load2_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \ + { \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn2_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \ + { \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \ + )); \ + } \ + NPY_FINLINE void npyv_store2_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_store2_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } \ + NPY_FINLINE void npyv_storen2_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_storen2_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } + +NPYV_IMPL_AVX512_REST_PARTIAL_TYPES_PAIR(u32, s32) +NPYV_IMPL_AVX512_REST_PARTIAL_TYPES_PAIR(f32, s32) +NPYV_IMPL_AVX512_REST_PARTIAL_TYPES_PAIR(u64, s64) +NPYV_IMPL_AVX512_REST_PARTIAL_TYPES_PAIR(f64, s64) + +/************************************************************ + * de-interlave load / interleave contiguous store + ************************************************************/ +// two channels +#define NPYV_IMPL_AVX512_MEM_INTERLEAVE(SFX, ZSFX) \ + NPY_FINLINE npyv_##ZSFX##x2 npyv_zip_##ZSFX(npyv_##ZSFX, npyv_##ZSFX); \ + NPY_FINLINE npyv_##ZSFX##x2 npyv_unzip_##ZSFX(npyv_##ZSFX, npyv_##ZSFX); \ + NPY_FINLINE npyv_##SFX##x2 npyv_load_##SFX##x2( \ + const npyv_lanetype_##SFX *ptr \ + ) { \ + return npyv_unzip_##ZSFX( \ + npyv_load_##SFX(ptr), npyv_load_##SFX(ptr+npyv_nlanes_##SFX) \ + ); \ + } \ + NPY_FINLINE void npyv_store_##SFX##x2( \ + npyv_lanetype_##SFX *ptr, npyv_##SFX##x2 v \ + ) { \ + npyv_##SFX##x2 zip = npyv_zip_##ZSFX(v.val[0], v.val[1]); \ + npyv_store_##SFX(ptr, zip.val[0]); \ + npyv_store_##SFX(ptr + npyv_nlanes_##SFX, zip.val[1]); \ + } + +NPYV_IMPL_AVX512_MEM_INTERLEAVE(u8, u8) +NPYV_IMPL_AVX512_MEM_INTERLEAVE(s8, u8) +NPYV_IMPL_AVX512_MEM_INTERLEAVE(u16, u16) +NPYV_IMPL_AVX512_MEM_INTERLEAVE(s16, u16) +NPYV_IMPL_AVX512_MEM_INTERLEAVE(u32, u32) +NPYV_IMPL_AVX512_MEM_INTERLEAVE(s32, u32) +NPYV_IMPL_AVX512_MEM_INTERLEAVE(u64, u64) +NPYV_IMPL_AVX512_MEM_INTERLEAVE(s64, u64) +NPYV_IMPL_AVX512_MEM_INTERLEAVE(f32, f32) +NPYV_IMPL_AVX512_MEM_INTERLEAVE(f64, f64) + /************************************************** * Lookup table *************************************************/ diff --git a/numpy/core/src/common/simd/avx512/reorder.h b/numpy/core/src/common/simd/avx512/reorder.h index c0b2477f3..27e66b5e7 100644 --- a/numpy/core/src/common/simd/avx512/reorder.h +++ b/numpy/core/src/common/simd/avx512/reorder.h @@ -167,6 +167,140 @@ NPY_FINLINE npyv_f64x2 npyv_zip_f64(__m512d a, __m512d b) return r; } +// deinterleave two vectors +NPY_FINLINE npyv_u8x2 npyv_unzip_u8(npyv_u8 ab0, npyv_u8 ab1) +{ + npyv_u8x2 r; +#ifdef NPY_HAVE_AVX512VBMI + const __m512i idx_a = npyv_set_u8( + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, + 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, + 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126 + ); + const __m512i idx_b = npyv_set_u8( + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, + 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, + 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95, + 97, 99, 101, 103, 105, 107, 109, 111, 113, 115, 117, 119, 121, 123, 125, 127 + ); + r.val[0] = _mm512_permutex2var_epi8(ab0, idx_a, ab1); + r.val[1] = _mm512_permutex2var_epi8(ab0, idx_b, ab1); +#else + #ifdef NPY_HAVE_AVX512BW + const __m512i idx = npyv_set_u8( + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 + ); + __m512i abl = _mm512_shuffle_epi8(ab0, idx); + __m512i abh = _mm512_shuffle_epi8(ab1, idx); + #else + const __m256i idx = _mm256_setr_epi8( + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 + ); + __m256i abl_lo = _mm256_shuffle_epi8(npyv512_lower_si256(ab0), idx); + __m256i abl_hi = _mm256_shuffle_epi8(npyv512_higher_si256(ab0), idx); + __m256i abh_lo = _mm256_shuffle_epi8(npyv512_lower_si256(ab1), idx); + __m256i abh_hi = _mm256_shuffle_epi8(npyv512_higher_si256(ab1), idx); + __m512i abl = npyv512_combine_si256(abl_lo, abl_hi); + __m512i abh = npyv512_combine_si256(abh_lo, abh_hi); + #endif + const __m512i idx_a = npyv_set_u64(0, 2, 4, 6, 8, 10, 12, 14); + const __m512i idx_b = npyv_set_u64(1, 3, 5, 7, 9, 11, 13, 15); + r.val[0] = _mm512_permutex2var_epi64(abl, idx_a, abh); + r.val[1] = _mm512_permutex2var_epi64(abl, idx_b, abh); +#endif + return r; +} +#define npyv_unzip_s8 npyv_unzip_u8 + +NPY_FINLINE npyv_u16x2 npyv_unzip_u16(npyv_u16 ab0, npyv_u16 ab1) +{ + npyv_u16x2 r; +#ifdef NPY_HAVE_AVX512BW + const __m512i idx_a = npyv_set_u16( + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62 + ); + const __m512i idx_b = npyv_set_u16( + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, + 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63 + ); + r.val[0] = _mm512_permutex2var_epi16(ab0, idx_a, ab1); + r.val[1] = _mm512_permutex2var_epi16(ab0, idx_b, ab1); +#else + const __m256i idx = _mm256_setr_epi8( + 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15, + 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15 + ); + __m256i abl_lo = _mm256_shuffle_epi8(npyv512_lower_si256(ab0), idx); + __m256i abl_hi = _mm256_shuffle_epi8(npyv512_higher_si256(ab0), idx); + __m256i abh_lo = _mm256_shuffle_epi8(npyv512_lower_si256(ab1), idx); + __m256i abh_hi = _mm256_shuffle_epi8(npyv512_higher_si256(ab1), idx); + __m512i abl = npyv512_combine_si256(abl_lo, abl_hi); + __m512i abh = npyv512_combine_si256(abh_lo, abh_hi); + + const __m512i idx_a = npyv_set_u64(0, 2, 4, 6, 8, 10, 12, 14); + const __m512i idx_b = npyv_set_u64(1, 3, 5, 7, 9, 11, 13, 15); + r.val[0] = _mm512_permutex2var_epi64(abl, idx_a, abh); + r.val[1] = _mm512_permutex2var_epi64(abl, idx_b, abh); +#endif + return r; +} +#define npyv_unzip_s16 npyv_unzip_u16 + +NPY_FINLINE npyv_u32x2 npyv_unzip_u32(npyv_u32 ab0, npyv_u32 ab1) +{ + const __m512i idx_a = npyv_set_u32( + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + ); + const __m512i idx_b = npyv_set_u32( + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 + ); + npyv_u32x2 r; + r.val[0] = _mm512_permutex2var_epi32(ab0, idx_a, ab1); + r.val[1] = _mm512_permutex2var_epi32(ab0, idx_b, ab1); + return r; +} +#define npyv_unzip_s32 npyv_unzip_u32 + +NPY_FINLINE npyv_u64x2 npyv_unzip_u64(npyv_u64 ab0, npyv_u64 ab1) +{ + const __m512i idx_a = npyv_set_u64(0, 2, 4, 6, 8, 10, 12, 14); + const __m512i idx_b = npyv_set_u64(1, 3, 5, 7, 9, 11, 13, 15); + npyv_u64x2 r; + r.val[0] = _mm512_permutex2var_epi64(ab0, idx_a, ab1); + r.val[1] = _mm512_permutex2var_epi64(ab0, idx_b, ab1); + return r; +} +#define npyv_unzip_s64 npyv_unzip_u64 + +NPY_FINLINE npyv_f32x2 npyv_unzip_f32(npyv_f32 ab0, npyv_f32 ab1) +{ + const __m512i idx_a = npyv_set_u32( + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + ); + const __m512i idx_b = npyv_set_u32( + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 + ); + npyv_f32x2 r; + r.val[0] = _mm512_permutex2var_ps(ab0, idx_a, ab1); + r.val[1] = _mm512_permutex2var_ps(ab0, idx_b, ab1); + return r; +} +NPY_FINLINE npyv_f64x2 npyv_unzip_f64(npyv_f64 ab0, npyv_f64 ab1) +{ + const __m512i idx_a = npyv_set_u64(0, 2, 4, 6, 8, 10, 12, 14); + const __m512i idx_b = npyv_set_u64(1, 3, 5, 7, 9, 11, 13, 15); + npyv_f64x2 r; + r.val[0] = _mm512_permutex2var_pd(ab0, idx_a, ab1); + r.val[1] = _mm512_permutex2var_pd(ab0, idx_b, ab1); + return r; +} + // Reverse elements of each 64-bit lane NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a) { @@ -223,4 +357,22 @@ NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a) return _mm512_shuffle_ps(a, a, (_MM_PERM_ENUM)_MM_SHUFFLE(2, 3, 0, 1)); } +// Permuting the elements of each 128-bit lane by immediate index for +// each element. +#define npyv_permi128_u32(A, E0, E1, E2, E3) \ + _mm512_shuffle_epi32(A, _MM_SHUFFLE(E3, E2, E1, E0)) + +#define npyv_permi128_s32 npyv_permi128_u32 + +#define npyv_permi128_u64(A, E0, E1) \ + _mm512_shuffle_epi32(A, _MM_SHUFFLE(((E1)<<1)+1, ((E1)<<1), ((E0)<<1)+1, ((E0)<<1))) + +#define npyv_permi128_s64 npyv_permi128_u64 + +#define npyv_permi128_f32(A, E0, E1, E2, E3) \ + _mm512_permute_ps(A, _MM_SHUFFLE(E3, E2, E1, E0)) + +#define npyv_permi128_f64(A, E0, E1) \ + _mm512_permute_pd(A, (((E1)<<7) | ((E0)<<6) | ((E1)<<5) | ((E0)<<4) | ((E1)<<3) | ((E0)<<2) | ((E1)<<1) | (E0))) + #endif // _NPY_SIMD_AVX512_REORDER_H diff --git a/numpy/core/src/common/simd/emulate_maskop.h b/numpy/core/src/common/simd/emulate_maskop.h index 2a808a153..6627e5010 100644 --- a/numpy/core/src/common/simd/emulate_maskop.h +++ b/numpy/core/src/common/simd/emulate_maskop.h @@ -42,5 +42,39 @@ NPYV_IMPL_EMULATE_MASK_ADDSUB(s64, b64) #if NPY_SIMD_F64 NPYV_IMPL_EMULATE_MASK_ADDSUB(f64, b64) #endif +#if NPY_SIMD_F32 + // conditional division, m ? a / b : c + NPY_FINLINE npyv_f32 + npyv_ifdiv_f32(npyv_b32 m, npyv_f32 a, npyv_f32 b, npyv_f32 c) + { + const npyv_f32 one = npyv_setall_f32(1.0f); + npyv_f32 div = npyv_div_f32(a, npyv_select_f32(m, b, one)); + return npyv_select_f32(m, div, c); + } + // conditional division, m ? a / b : 0 + NPY_FINLINE npyv_f32 + npyv_ifdivz_f32(npyv_b32 m, npyv_f32 a, npyv_f32 b) + { + const npyv_f32 zero = npyv_zero_f32(); + return npyv_ifdiv_f32(m, a, b, zero); + } +#endif +#if NPY_SIMD_F64 + // conditional division, m ? a / b : c + NPY_FINLINE npyv_f64 + npyv_ifdiv_f64(npyv_b64 m, npyv_f64 a, npyv_f64 b, npyv_f64 c) + { + const npyv_f64 one = npyv_setall_f64(1.0); + npyv_f64 div = npyv_div_f64(a, npyv_select_f64(m, b, one)); + return npyv_select_f64(m, div, c); + } + // conditional division, m ? a / b : 0 + NPY_FINLINE npyv_f64 + npyv_ifdivz_f64(npyv_b64 m, npyv_f64 a, npyv_f64 b) + { + const npyv_f64 zero = npyv_zero_f64(); + return npyv_ifdiv_f64(m, a, b, zero); + } +#endif #endif // _NPY_SIMD_EMULATE_MASKOP_H diff --git a/numpy/core/src/common/simd/neon/arithmetic.h b/numpy/core/src/common/simd/neon/arithmetic.h index 00994806d..683620111 100644 --- a/numpy/core/src/common/simd/neon/arithmetic.h +++ b/numpy/core/src/common/simd/neon/arithmetic.h @@ -265,6 +265,14 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor) NPY_FINLINE npyv_f32 npyv_nmulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) { return vmlsq_f32(vnegq_f32(c), a, b); } #endif +// multiply, add for odd elements and subtract even elements. +// (a * b) -+ c +NPY_FINLINE npyv_f32 npyv_muladdsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) +{ + const npyv_f32 msign = npyv_set_f32(-0.0f, 0.0f, -0.0f, 0.0f); + return npyv_muladd_f32(a, b, npyv_xor_f32(msign, c)); +} + /*************************** * FUSED F64 ***************************/ @@ -277,6 +285,11 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor) { return vfmsq_f64(c, a, b); } NPY_FINLINE npyv_f64 npyv_nmulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) { return vfmsq_f64(vnegq_f64(c), a, b); } + NPY_FINLINE npyv_f64 npyv_muladdsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) + { + const npyv_f64 msign = npyv_set_f64(-0.0, 0.0); + return npyv_muladd_f64(a, b, npyv_xor_f64(msign, c)); + } #endif // NPY_SIMD_F64 /*************************** diff --git a/numpy/core/src/common/simd/neon/math.h b/numpy/core/src/common/simd/neon/math.h index c0a771b5d..58d14809f 100644 --- a/numpy/core/src/common/simd/neon/math.h +++ b/numpy/core/src/common/simd/neon/math.h @@ -278,20 +278,25 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a) return vrndnq_f32(a); #else // ARMv7 NEON only supports fp to int truncate conversion. - // a magic trick of adding 1.5 * 2**23 is used for rounding + // a magic trick of adding 1.5 * 2^23 is used for rounding // to nearest even and then subtract this magic number to get // the integer. - const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f)); - const npyv_f32 magic = vdupq_n_f32(12582912.0f); // 1.5 * 2**23 - npyv_f32 round = vsubq_f32(vaddq_f32(a, magic), magic); - npyv_b32 overflow = vcleq_f32(vabsq_f32(a), vreinterpretq_f32_u32(vdupq_n_u32(0x4b000000))); - round = vbslq_f32(overflow, round, a); - // signed zero - round = vreinterpretq_f32_s32(vorrq_s32( - vreinterpretq_s32_f32(round), - vandq_s32(vreinterpretq_s32_f32(a), szero) - )); - return round; + // + const npyv_u32 szero = vreinterpretq_u32_f32(vdupq_n_f32(-0.0f)); + const npyv_u32 sign_mask = vandq_u32(vreinterpretq_u32_f32(a), szero); + const npyv_f32 two_power_23 = vdupq_n_f32(8388608.0); // 2^23 + const npyv_f32 two_power_23h = vdupq_n_f32(12582912.0f); // 1.5 * 2^23 + npyv_u32 nnan_mask = vceqq_f32(a, a); + // eliminate nans to avoid invalid fp errors + npyv_f32 abs_x = vabsq_f32(vreinterpretq_f32_u32(vandq_u32(nnan_mask, vreinterpretq_u32_f32(a)))); + // round by add magic number 1.5 * 2^23 + npyv_f32 round = vsubq_f32(vaddq_f32(two_power_23h, abs_x), two_power_23h); + // copysign + round = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(round), sign_mask )); + // a if |a| >= 2^23 or a == NaN + npyv_u32 mask = vcleq_f32(abs_x, two_power_23); + mask = vandq_u32(mask, nnan_mask); + return vbslq_f32(mask, round, a); #endif } #if NPY_SIMD_F64 @@ -302,33 +307,30 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a) #ifdef NPY_HAVE_ASIMD #define npyv_ceil_f32 vrndpq_f32 #else - NPY_FINLINE npyv_f32 npyv_ceil_f32(npyv_f32 a) - { - const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f)); + NPY_FINLINE npyv_f32 npyv_ceil_f32(npyv_f32 a) + { const npyv_u32 one = vreinterpretq_u32_f32(vdupq_n_f32(1.0f)); - const npyv_s32 max_int = vdupq_n_s32(0x7fffffff); - /** - * On armv7, vcvtq.f32 handles special cases as follows: - * NaN return 0 - * +inf or +outrange return 0x80000000(-0.0f) - * -inf or -outrange return 0x7fffffff(nan) - */ - npyv_s32 roundi = vcvtq_s32_f32(a); - npyv_f32 round = vcvtq_f32_s32(roundi); + const npyv_u32 szero = vreinterpretq_u32_f32(vdupq_n_f32(-0.0f)); + const npyv_u32 sign_mask = vandq_u32(vreinterpretq_u32_f32(a), szero); + const npyv_f32 two_power_23 = vdupq_n_f32(8388608.0); // 2^23 + const npyv_f32 two_power_23h = vdupq_n_f32(12582912.0f); // 1.5 * 2^23 + npyv_u32 nnan_mask = vceqq_f32(a, a); + npyv_f32 x = vreinterpretq_f32_u32(vandq_u32(nnan_mask, vreinterpretq_u32_f32(a))); + // eliminate nans to avoid invalid fp errors + npyv_f32 abs_x = vabsq_f32(x); + // round by add magic number 1.5 * 2^23 + npyv_f32 round = vsubq_f32(vaddq_f32(two_power_23h, abs_x), two_power_23h); + // copysign + round = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(round), sign_mask)); npyv_f32 ceil = vaddq_f32(round, vreinterpretq_f32_u32( - vandq_u32(vcltq_f32(round, a), one)) - ); - // respect signed zero, e.g. -0.5 -> -0.0 - npyv_f32 rzero = vreinterpretq_f32_s32(vorrq_s32( - vreinterpretq_s32_f32(ceil), - vandq_s32(vreinterpretq_s32_f32(a), szero) - )); - // if nan or overflow return a - npyv_u32 nnan = npyv_notnan_f32(a); - npyv_u32 overflow = vorrq_u32( - vceqq_s32(roundi, szero), vceqq_s32(roundi, max_int) + vandq_u32(vcltq_f32(round, x), one)) ); - return vbslq_f32(vbicq_u32(nnan, overflow), rzero, a); + // respects signed zero + ceil = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(ceil), sign_mask)); + // a if |a| >= 2^23 or a == NaN + npyv_u32 mask = vcleq_f32(abs_x, two_power_23); + mask = vandq_u32(mask, nnan_mask); + return vbslq_f32(mask, ceil, a); } #endif #if NPY_SIMD_F64 @@ -339,29 +341,37 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a) #ifdef NPY_HAVE_ASIMD #define npyv_trunc_f32 vrndq_f32 #else - NPY_FINLINE npyv_f32 npyv_trunc_f32(npyv_f32 a) - { - const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f)); + NPY_FINLINE npyv_f32 npyv_trunc_f32(npyv_f32 a) + { const npyv_s32 max_int = vdupq_n_s32(0x7fffffff); + const npyv_u32 exp_mask = vdupq_n_u32(0xff000000); + const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f)); + const npyv_u32 sign_mask = vandq_u32( + vreinterpretq_u32_f32(a), vreinterpretq_u32_s32(szero)); + + npyv_u32 nfinite_mask = vshlq_n_u32(vreinterpretq_u32_f32(a), 1); + nfinite_mask = vandq_u32(nfinite_mask, exp_mask); + nfinite_mask = vceqq_u32(nfinite_mask, exp_mask); + // eliminate nans/inf to avoid invalid fp errors + npyv_f32 x = vreinterpretq_f32_u32( + veorq_u32(nfinite_mask, vreinterpretq_u32_f32(a))); /** * On armv7, vcvtq.f32 handles special cases as follows: * NaN return 0 * +inf or +outrange return 0x80000000(-0.0f) * -inf or -outrange return 0x7fffffff(nan) */ - npyv_s32 roundi = vcvtq_s32_f32(a); - npyv_f32 round = vcvtq_f32_s32(roundi); + npyv_s32 trunci = vcvtq_s32_f32(x); + npyv_f32 trunc = vcvtq_f32_s32(trunci); // respect signed zero, e.g. -0.5 -> -0.0 - npyv_f32 rzero = vreinterpretq_f32_s32(vorrq_s32( - vreinterpretq_s32_f32(round), - vandq_s32(vreinterpretq_s32_f32(a), szero) - )); - // if nan or overflow return a - npyv_u32 nnan = npyv_notnan_f32(a); - npyv_u32 overflow = vorrq_u32( - vceqq_s32(roundi, szero), vceqq_s32(roundi, max_int) + trunc = vreinterpretq_f32_u32( + vorrq_u32(vreinterpretq_u32_f32(trunc), sign_mask)); + // if overflow return a + npyv_u32 overflow_mask = vorrq_u32( + vceqq_s32(trunci, szero), vceqq_s32(trunci, max_int) ); - return vbslq_f32(vbicq_u32(nnan, overflow), rzero, a); + // a if a overflow or nonfinite + return vbslq_f32(vorrq_u32(nfinite_mask, overflow_mask), a, trunc); } #endif #if NPY_SIMD_F64 @@ -372,28 +382,31 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a) #ifdef NPY_HAVE_ASIMD #define npyv_floor_f32 vrndmq_f32 #else - NPY_FINLINE npyv_f32 npyv_floor_f32(npyv_f32 a) - { - const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f)); + NPY_FINLINE npyv_f32 npyv_floor_f32(npyv_f32 a) + { const npyv_u32 one = vreinterpretq_u32_f32(vdupq_n_f32(1.0f)); - const npyv_s32 max_int = vdupq_n_s32(0x7fffffff); + const npyv_u32 szero = vreinterpretq_u32_f32(vdupq_n_f32(-0.0f)); + const npyv_u32 sign_mask = vandq_u32(vreinterpretq_u32_f32(a), szero); + const npyv_f32 two_power_23 = vdupq_n_f32(8388608.0); // 2^23 + const npyv_f32 two_power_23h = vdupq_n_f32(12582912.0f); // 1.5 * 2^23 - npyv_s32 roundi = vcvtq_s32_f32(a); - npyv_f32 round = vcvtq_f32_s32(roundi); + npyv_u32 nnan_mask = vceqq_f32(a, a); + npyv_f32 x = vreinterpretq_f32_u32(vandq_u32(nnan_mask, vreinterpretq_u32_f32(a))); + // eliminate nans to avoid invalid fp errors + npyv_f32 abs_x = vabsq_f32(x); + // round by add magic number 1.5 * 2^23 + npyv_f32 round = vsubq_f32(vaddq_f32(two_power_23h, abs_x), two_power_23h); + // copysign + round = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(round), sign_mask)); npyv_f32 floor = vsubq_f32(round, vreinterpretq_f32_u32( - vandq_u32(vcgtq_f32(round, a), one) - )); - // respect signed zero - npyv_f32 rzero = vreinterpretq_f32_s32(vorrq_s32( - vreinterpretq_s32_f32(floor), - vandq_s32(vreinterpretq_s32_f32(a), szero) + vandq_u32(vcgtq_f32(round, x), one) )); - npyv_u32 nnan = npyv_notnan_f32(a); - npyv_u32 overflow = vorrq_u32( - vceqq_s32(roundi, szero), vceqq_s32(roundi, max_int) - ); - - return vbslq_f32(vbicq_u32(nnan, overflow), rzero, a); + // respects signed zero + floor = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(floor), sign_mask)); + // a if |a| >= 2^23 or a == NaN + npyv_u32 mask = vcleq_f32(abs_x, two_power_23); + mask = vandq_u32(mask, nnan_mask); + return vbslq_f32(mask, floor, a); } #endif // NPY_HAVE_ASIMD #if NPY_SIMD_F64 diff --git a/numpy/core/src/common/simd/neon/memory.h b/numpy/core/src/common/simd/neon/memory.h index 7060ea628..6163440c3 100644 --- a/numpy/core/src/common/simd/neon/memory.h +++ b/numpy/core/src/common/simd/neon/memory.h @@ -102,6 +102,29 @@ NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride) ); } #endif + +//// 64-bit load over 32-bit stride +NPY_FINLINE npyv_u32 npyv_loadn2_u32(const npy_uint32 *ptr, npy_intp stride) +{ + return vcombine_u32( + vld1_u32((const uint32_t*)ptr), vld1_u32((const uint32_t*)ptr + stride) + ); +} +NPY_FINLINE npyv_s32 npyv_loadn2_s32(const npy_int32 *ptr, npy_intp stride) +{ return npyv_reinterpret_s32_u32(npyv_loadn2_u32((const npy_uint32*)ptr, stride)); } +NPY_FINLINE npyv_f32 npyv_loadn2_f32(const float *ptr, npy_intp stride) +{ return npyv_reinterpret_f32_u32(npyv_loadn2_u32((const npy_uint32*)ptr, stride)); } + +//// 128-bit load over 64-bit stride +NPY_FINLINE npyv_u64 npyv_loadn2_u64(const npy_uint64 *ptr, npy_intp stride) +{ (void)stride; return npyv_load_u64(ptr); } +NPY_FINLINE npyv_s64 npyv_loadn2_s64(const npy_int64 *ptr, npy_intp stride) +{ (void)stride; return npyv_load_s64(ptr); } +#if NPY_SIMD_F64 +NPY_FINLINE npyv_f64 npyv_loadn2_f64(const double *ptr, npy_intp stride) +{ (void)stride; return npyv_load_f64(ptr); } +#endif + /*************************** * Non-contiguous Store ***************************/ @@ -131,6 +154,32 @@ NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a) { npyv_storen_s64((npy_int64*)ptr, stride, npyv_reinterpret_s64_f64(a)); } #endif +//// 64-bit store over 32-bit stride +NPY_FINLINE void npyv_storen2_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a) +{ +#if NPY_SIMD_F64 + vst1q_lane_u64((uint64_t*)ptr, npyv_reinterpret_u64_u32(a), 0); + vst1q_lane_u64((uint64_t*)(ptr + stride), npyv_reinterpret_u64_u32(a), 1); +#else + // armhf strict to alignment + vst1_u32((uint32_t*)ptr, vget_low_u32(a)); + vst1_u32((uint32_t*)ptr + stride, vget_high_u32(a)); +#endif +} +NPY_FINLINE void npyv_storen2_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a) +{ npyv_storen2_u32((npy_uint32*)ptr, stride, npyv_reinterpret_u32_s32(a)); } +NPY_FINLINE void npyv_storen2_f32(float *ptr, npy_intp stride, npyv_f32 a) +{ npyv_storen2_u32((npy_uint32*)ptr, stride, npyv_reinterpret_u32_f32(a)); } + +//// 128-bit store over 64-bit stride +NPY_FINLINE void npyv_storen2_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a) +{ (void)stride; npyv_store_u64(ptr, a); } +NPY_FINLINE void npyv_storen2_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a) +{ (void)stride; npyv_store_s64(ptr, a); } +#if NPY_SIMD_F64 +NPY_FINLINE void npyv_storen2_f64(double *ptr, npy_intp stride, npyv_f64 a) +{ (void)stride; npyv_store_f64(ptr, a); } +#endif /********************************* * Partial Load *********************************/ @@ -168,6 +217,29 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) { return npyv_load_till_s64(ptr, nlane, 0); } +//// 64-bit nlane +NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane, + npy_int32 fill_lo, npy_int32 fill_hi) +{ + assert(nlane > 0); + if (nlane == 1) { + const int32_t NPY_DECL_ALIGNED(16) fill[2] = {fill_lo, fill_hi}; + return vcombine_s32(vld1_s32((const int32_t*)ptr), vld1_s32(fill)); + } + return npyv_load_s32(ptr); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) +{ return vreinterpretq_s32_s64(npyv_load_tillz_s64((const npy_int64*)ptr, nlane)); } + +//// 128-bit nlane +NPY_FINLINE npyv_s64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane, + npy_int64 fill_lo, npy_int64 fill_hi) +{ (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); } + +NPY_FINLINE npyv_s64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) +{ (void)nlane; return npyv_load_s64(ptr); } + /********************************* * Non-contiguous partial load *********************************/ @@ -206,6 +278,34 @@ npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_ NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane) { return npyv_loadn_till_s64(ptr, stride, nlane, 0); } +//// 64-bit load over 32-bit stride +NPY_FINLINE npyv_s32 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, + npy_int32 fill_lo, npy_int32 fill_hi) +{ + assert(nlane > 0); + if (nlane == 1) { + const int32_t NPY_DECL_ALIGNED(16) fill[2] = {fill_lo, fill_hi}; + return vcombine_s32(vld1_s32((const int32_t*)ptr), vld1_s32(fill)); + } + return npyv_loadn2_s32(ptr, stride); +} +NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane) +{ + assert(nlane > 0); + if (nlane == 1) { + return vcombine_s32(vld1_s32((const int32_t*)ptr), vdup_n_s32(0)); + } + return npyv_loadn2_s32(ptr, stride); +} + +//// 128-bit load over 64-bit stride +NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, + npy_int64 fill_lo, npy_int64 fill_hi) +{ assert(nlane > 0); (void)stride; (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); } + +NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane) +{ assert(nlane > 0); (void)stride; (void)nlane; return npyv_load_s64(ptr); } + /********************************* * Partial store *********************************/ @@ -238,6 +338,30 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a } npyv_store_s64(ptr, a); } + +//// 64-bit nlane +NPY_FINLINE void npyv_store2_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a) +{ + assert(nlane > 0); + if (nlane == 1) { + // armhf strict to alignment, may cause bus error + #if NPY_SIMD_F64 + vst1q_lane_s64((int64_t*)ptr, npyv_reinterpret_s64_s32(a), 0); + #else + npyv_storel_s32(ptr, a); + #endif + return; + } + npyv_store_s32(ptr, a); +} + +//// 128-bit nlane +NPY_FINLINE void npyv_store2_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a) +{ + assert(nlane > 0); (void)nlane; + npyv_store_s64(ptr, a); +} + /********************************* * Non-contiguous partial store *********************************/ @@ -245,16 +369,21 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a) { assert(nlane > 0); + vst1q_lane_s32((int32_t*)ptr, a, 0); switch(nlane) { - default: - vst1q_lane_s32((int32_t*)ptr + stride*3, a, 3); + case 1: + return; + case 2: + vst1q_lane_s32((int32_t*)ptr + stride, a, 1); + return; case 3: + vst1q_lane_s32((int32_t*)ptr + stride, a, 1); vst1q_lane_s32((int32_t*)ptr + stride*2, a, 2); - case 2: + return; + default: vst1q_lane_s32((int32_t*)ptr + stride, a, 1); - case 1: - vst1q_lane_s32((int32_t*)ptr, a, 0); - break; + vst1q_lane_s32((int32_t*)ptr + stride*2, a, 2); + vst1q_lane_s32((int32_t*)ptr + stride*3, a, 3); } } //// 64 @@ -268,6 +397,27 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp npyv_storen_s64(ptr, stride, a); } +//// 64-bit store over 32-bit stride +NPY_FINLINE void npyv_storen2_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a) +{ + assert(nlane > 0); +#if NPY_SIMD_F64 + vst1q_lane_s64((int64_t*)ptr, npyv_reinterpret_s64_s32(a), 0); + if (nlane > 1) { + vst1q_lane_s64((int64_t*)(ptr + stride), npyv_reinterpret_s64_s32(a), 1); + } +#else + npyv_storel_s32(ptr, a); + if (nlane > 1) { + npyv_storeh_s32(ptr + stride, a); + } +#endif +} + +//// 128-bit store over 64-bit stride +NPY_FINLINE void npyv_storen2_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a) +{ assert(nlane > 0); (void)stride; (void)nlane; npyv_store_s64(ptr, a); } + /***************************************************************** * Implement partial load/store for u32/f32/u64/f64... via casting *****************************************************************/ @@ -278,7 +428,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp union { \ npyv_lanetype_##F_SFX from_##F_SFX; \ npyv_lanetype_##T_SFX to_##T_SFX; \ - } pun = {.from_##F_SFX = fill}; \ + } pun; \ + pun.from_##F_SFX = fill; \ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX( \ (const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX \ )); \ @@ -290,7 +441,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp union { \ npyv_lanetype_##F_SFX from_##F_SFX; \ npyv_lanetype_##T_SFX to_##T_SFX; \ - } pun = {.from_##F_SFX = fill}; \ + } pun; \ + pun.from_##F_SFX = fill; \ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX( \ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX \ )); \ @@ -332,6 +484,131 @@ NPYV_IMPL_NEON_REST_PARTIAL_TYPES(u64, s64) #if NPY_SIMD_F64 NPYV_IMPL_NEON_REST_PARTIAL_TYPES(f64, s64) #endif + +// 128-bit/64-bit stride +#define NPYV_IMPL_NEON_REST_PARTIAL_TYPES_PAIR(F_SFX, T_SFX) \ + NPY_FINLINE npyv_##F_SFX npyv_load2_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, \ + npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \ + { \ + union pun { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + }; \ + union pun pun_lo; \ + union pun pun_hi; \ + pun_lo.from_##F_SFX = fill_lo; \ + pun_hi.from_##F_SFX = fill_hi; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane, pun_lo.to_##T_SFX, pun_hi.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn2_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \ + npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \ + { \ + union pun { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + }; \ + union pun pun_lo; \ + union pun pun_hi; \ + pun_lo.from_##F_SFX = fill_lo; \ + pun_hi.from_##F_SFX = fill_hi; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun_lo.to_##T_SFX, \ + pun_hi.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_load2_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \ + { \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn2_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \ + { \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \ + )); \ + } \ + NPY_FINLINE void npyv_store2_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_store2_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } \ + NPY_FINLINE void npyv_storen2_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_storen2_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } + +NPYV_IMPL_NEON_REST_PARTIAL_TYPES_PAIR(u32, s32) +NPYV_IMPL_NEON_REST_PARTIAL_TYPES_PAIR(f32, s32) +NPYV_IMPL_NEON_REST_PARTIAL_TYPES_PAIR(u64, s64) +#if NPY_SIMD_F64 +NPYV_IMPL_NEON_REST_PARTIAL_TYPES_PAIR(f64, s64) +#endif + +/************************************************************ + * de-interlave load / interleave contiguous store + ************************************************************/ +// two channels +#define NPYV_IMPL_NEON_MEM_INTERLEAVE(SFX, T_PTR) \ + NPY_FINLINE npyv_##SFX##x2 npyv_load_##SFX##x2( \ + const npyv_lanetype_##SFX *ptr \ + ) { \ + return vld2q_##SFX((const T_PTR*)ptr); \ + } \ + NPY_FINLINE void npyv_store_##SFX##x2( \ + npyv_lanetype_##SFX *ptr, npyv_##SFX##x2 v \ + ) { \ + vst2q_##SFX((T_PTR*)ptr, v); \ + } + +NPYV_IMPL_NEON_MEM_INTERLEAVE(u8, uint8_t) +NPYV_IMPL_NEON_MEM_INTERLEAVE(s8, int8_t) +NPYV_IMPL_NEON_MEM_INTERLEAVE(u16, uint16_t) +NPYV_IMPL_NEON_MEM_INTERLEAVE(s16, int16_t) +NPYV_IMPL_NEON_MEM_INTERLEAVE(u32, uint32_t) +NPYV_IMPL_NEON_MEM_INTERLEAVE(s32, int32_t) +NPYV_IMPL_NEON_MEM_INTERLEAVE(f32, float) + +#if NPY_SIMD_F64 + NPYV_IMPL_NEON_MEM_INTERLEAVE(f64, double) + NPYV_IMPL_NEON_MEM_INTERLEAVE(u64, uint64_t) + NPYV_IMPL_NEON_MEM_INTERLEAVE(s64, int64_t) +#else + #define NPYV_IMPL_NEON_MEM_INTERLEAVE_64(SFX) \ + NPY_FINLINE npyv_##SFX##x2 npyv_load_##SFX##x2( \ + const npyv_lanetype_##SFX *ptr) \ + { \ + npyv_##SFX a = npyv_load_##SFX(ptr); \ + npyv_##SFX b = npyv_load_##SFX(ptr + 2); \ + npyv_##SFX##x2 r; \ + r.val[0] = vcombine_##SFX(vget_low_##SFX(a), vget_low_##SFX(b)); \ + r.val[1] = vcombine_##SFX(vget_high_##SFX(a), vget_high_##SFX(b)); \ + return r; \ + } \ + NPY_FINLINE void npyv_store_##SFX##x2( \ + npyv_lanetype_##SFX *ptr, npyv_##SFX##x2 v) \ + { \ + npyv_store_##SFX(ptr, vcombine_##SFX( \ + vget_low_##SFX(v.val[0]), vget_low_##SFX(v.val[1]))); \ + npyv_store_##SFX(ptr + 2, vcombine_##SFX( \ + vget_high_##SFX(v.val[0]), vget_high_##SFX(v.val[1]))); \ + } + NPYV_IMPL_NEON_MEM_INTERLEAVE_64(u64) + NPYV_IMPL_NEON_MEM_INTERLEAVE_64(s64) +#endif /********************************* * Lookup table *********************************/ diff --git a/numpy/core/src/common/simd/neon/neon.h b/numpy/core/src/common/simd/neon/neon.h index b08071527..49c35c415 100644 --- a/numpy/core/src/common/simd/neon/neon.h +++ b/numpy/core/src/common/simd/neon/neon.h @@ -16,6 +16,7 @@ #define NPY_SIMD_FMA3 0 // HW emulated #endif #define NPY_SIMD_BIGENDIAN 0 +#define NPY_SIMD_CMPSIGNAL 1 typedef uint8x16_t npyv_u8; typedef int8x16_t npyv_s8; diff --git a/numpy/core/src/common/simd/neon/operators.h b/numpy/core/src/common/simd/neon/operators.h index 249621bd6..e18ea94b8 100644 --- a/numpy/core/src/common/simd/neon/operators.h +++ b/numpy/core/src/common/simd/neon/operators.h @@ -240,10 +240,35 @@ // check special cases NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a) -{ return vceqq_f32(a, a); } +{ +#if defined(__clang__) +/** + * To avoid signaling qNaN, workaround for clang symmetric inputs bug + * check https://github.com/numpy/numpy/issues/22933, + * for more clarification. + */ + npyv_b32 ret; + #if NPY_SIMD_F64 + __asm("fcmeq %0.4s, %1.4s, %1.4s" : "=w" (ret) : "w" (a)); + #else + __asm("vceq.f32 %q0, %q1, %q1" : "=w" (ret) : "w" (a)); + #endif + return ret; +#else + return vceqq_f32(a, a); +#endif +} #if NPY_SIMD_F64 NPY_FINLINE npyv_b64 npyv_notnan_f64(npyv_f64 a) - { return vceqq_f64(a, a); } + { + #if defined(__clang__) + npyv_b64 ret; + __asm("fcmeq %0.2d, %1.2d, %1.2d" : "=w" (ret) : "w" (a)); + return ret; + #else + return vceqq_f64(a, a); + #endif + } #endif // Test cross all vector lanes diff --git a/numpy/core/src/common/simd/neon/reorder.h b/numpy/core/src/common/simd/neon/reorder.h index 50b06ed11..8bf68f5be 100644 --- a/numpy/core/src/common/simd/neon/reorder.h +++ b/numpy/core/src/common/simd/neon/reorder.h @@ -76,36 +76,45 @@ NPYV_IMPL_NEON_COMBINE(npyv_f32, f32) NPYV_IMPL_NEON_COMBINE(npyv_f64, f64) #endif -// interleave two vectors -#define NPYV_IMPL_NEON_ZIP(T_VEC, SFX) \ - NPY_FINLINE T_VEC##x2 npyv_zip_##SFX(T_VEC a, T_VEC b) \ - { \ - T_VEC##x2 r; \ - r.val[0] = vzip1q_##SFX(a, b); \ - r.val[1] = vzip2q_##SFX(a, b); \ - return r; \ - } - +// interleave & deinterleave two vectors #ifdef __aarch64__ - NPYV_IMPL_NEON_ZIP(npyv_u8, u8) - NPYV_IMPL_NEON_ZIP(npyv_s8, s8) - NPYV_IMPL_NEON_ZIP(npyv_u16, u16) - NPYV_IMPL_NEON_ZIP(npyv_s16, s16) - NPYV_IMPL_NEON_ZIP(npyv_u32, u32) - NPYV_IMPL_NEON_ZIP(npyv_s32, s32) - NPYV_IMPL_NEON_ZIP(npyv_f32, f32) - NPYV_IMPL_NEON_ZIP(npyv_f64, f64) + #define NPYV_IMPL_NEON_ZIP(T_VEC, SFX) \ + NPY_FINLINE T_VEC##x2 npyv_zip_##SFX(T_VEC a, T_VEC b) \ + { \ + T_VEC##x2 r; \ + r.val[0] = vzip1q_##SFX(a, b); \ + r.val[1] = vzip2q_##SFX(a, b); \ + return r; \ + } \ + NPY_FINLINE T_VEC##x2 npyv_unzip_##SFX(T_VEC a, T_VEC b) \ + { \ + T_VEC##x2 r; \ + r.val[0] = vuzp1q_##SFX(a, b); \ + r.val[1] = vuzp2q_##SFX(a, b); \ + return r; \ + } #else - #define npyv_zip_u8 vzipq_u8 - #define npyv_zip_s8 vzipq_s8 - #define npyv_zip_u16 vzipq_u16 - #define npyv_zip_s16 vzipq_s16 - #define npyv_zip_u32 vzipq_u32 - #define npyv_zip_s32 vzipq_s32 - #define npyv_zip_f32 vzipq_f32 + #define NPYV_IMPL_NEON_ZIP(T_VEC, SFX) \ + NPY_FINLINE T_VEC##x2 npyv_zip_##SFX(T_VEC a, T_VEC b) \ + { return vzipq_##SFX(a, b); } \ + NPY_FINLINE T_VEC##x2 npyv_unzip_##SFX(T_VEC a, T_VEC b) \ + { return vuzpq_##SFX(a, b); } #endif + +NPYV_IMPL_NEON_ZIP(npyv_u8, u8) +NPYV_IMPL_NEON_ZIP(npyv_s8, s8) +NPYV_IMPL_NEON_ZIP(npyv_u16, u16) +NPYV_IMPL_NEON_ZIP(npyv_s16, s16) +NPYV_IMPL_NEON_ZIP(npyv_u32, u32) +NPYV_IMPL_NEON_ZIP(npyv_s32, s32) +NPYV_IMPL_NEON_ZIP(npyv_f32, f32) + #define npyv_zip_u64 npyv_combine_u64 #define npyv_zip_s64 npyv_combine_s64 +#define npyv_zip_f64 npyv_combine_f64 +#define npyv_unzip_u64 npyv_combine_u64 +#define npyv_unzip_s64 npyv_combine_s64 +#define npyv_unzip_f64 npyv_combine_f64 // Reverse elements of each 64-bit lane #define npyv_rev64_u8 vrev64q_u8 @@ -116,4 +125,65 @@ NPYV_IMPL_NEON_COMBINE(npyv_f64, f64) #define npyv_rev64_s32 vrev64q_s32 #define npyv_rev64_f32 vrev64q_f32 +// Permuting the elements of each 128-bit lane by immediate index for +// each element. +#ifdef __clang__ + #define npyv_permi128_u32(A, E0, E1, E2, E3) \ + __builtin_shufflevector(A, A, E0, E1, E2, E3) +#elif defined(__GNUC__) + #define npyv_permi128_u32(A, E0, E1, E2, E3) \ + __builtin_shuffle(A, npyv_set_u32(E0, E1, E2, E3)) +#else + #define npyv_permi128_u32(A, E0, E1, E2, E3) \ + npyv_set_u32( \ + vgetq_lane_u32(A, E0), vgetq_lane_u32(A, E1), \ + vgetq_lane_u32(A, E2), vgetq_lane_u32(A, E3) \ + ) + #define npyv_permi128_s32(A, E0, E1, E2, E3) \ + npyv_set_s32( \ + vgetq_lane_s32(A, E0), vgetq_lane_s32(A, E1), \ + vgetq_lane_s32(A, E2), vgetq_lane_s32(A, E3) \ + ) + #define npyv_permi128_f32(A, E0, E1, E2, E3) \ + npyv_set_f32( \ + vgetq_lane_f32(A, E0), vgetq_lane_f32(A, E1), \ + vgetq_lane_f32(A, E2), vgetq_lane_f32(A, E3) \ + ) +#endif + +#if defined(__clang__) || defined(__GNUC__) + #define npyv_permi128_s32 npyv_permi128_u32 + #define npyv_permi128_f32 npyv_permi128_u32 +#endif + +#ifdef __clang__ + #define npyv_permi128_u64(A, E0, E1) \ + __builtin_shufflevector(A, A, E0, E1) +#elif defined(__GNUC__) + #define npyv_permi128_u64(A, E0, E1) \ + __builtin_shuffle(A, npyv_set_u64(E0, E1)) +#else + #define npyv_permi128_u64(A, E0, E1) \ + npyv_set_u64( \ + vgetq_lane_u64(A, E0), vgetq_lane_u64(A, E1) \ + ) + #define npyv_permi128_s64(A, E0, E1) \ + npyv_set_s64( \ + vgetq_lane_s64(A, E0), vgetq_lane_s64(A, E1) \ + ) + #define npyv_permi128_f64(A, E0, E1) \ + npyv_set_f64( \ + vgetq_lane_f64(A, E0), vgetq_lane_f64(A, E1) \ + ) +#endif + +#if defined(__clang__) || defined(__GNUC__) + #define npyv_permi128_s64 npyv_permi128_u64 + #define npyv_permi128_f64 npyv_permi128_u64 +#endif + +#if !NPY_SIMD_F64 + #undef npyv_permi128_f64 +#endif + #endif // _NPY_SIMD_NEON_REORDER_H diff --git a/numpy/core/src/common/simd/simd.h b/numpy/core/src/common/simd/simd.h index 92a77ad80..8c9b14251 100644 --- a/numpy/core/src/common/simd/simd.h +++ b/numpy/core/src/common/simd/simd.h @@ -82,6 +82,9 @@ typedef double npyv_lanetype_f64; #define NPY_SIMD_FMA3 0 /// 1 if the enabled SIMD extension is running on big-endian mode otherwise 0. #define NPY_SIMD_BIGENDIAN 0 + /// 1 if the supported comparison intrinsics(lt, le, gt, ge) + /// raises FP invalid exception for quite NaNs. + #define NPY_SIMD_CMPSIGNAL 0 #endif // enable emulated mask operations for all SIMD extension except for AVX512 diff --git a/numpy/core/src/common/simd/sse/arithmetic.h b/numpy/core/src/common/simd/sse/arithmetic.h index bced35108..72a87eac1 100644 --- a/numpy/core/src/common/simd/sse/arithmetic.h +++ b/numpy/core/src/common/simd/sse/arithmetic.h @@ -282,6 +282,10 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor) // negate multiply and subtract, -(a*b) - c #define npyv_nmulsub_f32 _mm_fnmsub_ps #define npyv_nmulsub_f64 _mm_fnmsub_pd + // multiply, add for odd elements and subtract even elements. + // (a * b) -+ c + #define npyv_muladdsub_f32 _mm_fmaddsub_ps + #define npyv_muladdsub_f64 _mm_fmaddsub_pd #elif defined(NPY_HAVE_FMA4) // multiply and add, a*b + c #define npyv_muladd_f32 _mm_macc_ps @@ -292,6 +296,10 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor) // negate multiply and add, -(a*b) + c #define npyv_nmuladd_f32 _mm_nmacc_ps #define npyv_nmuladd_f64 _mm_nmacc_pd + // multiply, add for odd elements and subtract even elements. + // (a * b) -+ c + #define npyv_muladdsub_f32 _mm_maddsub_ps + #define npyv_muladdsub_f64 _mm_maddsub_pd #else // multiply and add, a*b + c NPY_FINLINE npyv_f32 npyv_muladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) @@ -308,6 +316,28 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor) { return npyv_sub_f32(c, npyv_mul_f32(a, b)); } NPY_FINLINE npyv_f64 npyv_nmuladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) { return npyv_sub_f64(c, npyv_mul_f64(a, b)); } + // multiply, add for odd elements and subtract even elements. + // (a * b) -+ c + NPY_FINLINE npyv_f32 npyv_muladdsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) + { + npyv_f32 m = npyv_mul_f32(a, b); + #if NPY_HAVE_SSE3 + return _mm_addsub_ps(m, c); + #else + const npyv_f32 msign = npyv_set_f32(-0.0f, 0.0f, -0.0f, 0.0f); + return npyv_add_f32(m, npyv_xor_f32(msign, c)); + #endif + } + NPY_FINLINE npyv_f64 npyv_muladdsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) + { + npyv_f64 m = npyv_mul_f64(a, b); + #if NPY_HAVE_SSE3 + return _mm_addsub_pd(m, c); + #else + const npyv_f64 msign = npyv_set_f64(-0.0, 0.0); + return npyv_add_f64(m, npyv_xor_f64(msign, c)); + #endif + } #endif // NPY_HAVE_FMA3 #ifndef NPY_HAVE_FMA3 // for FMA4 and NON-FMA3 // negate multiply and subtract, -(a*b) - c diff --git a/numpy/core/src/common/simd/sse/math.h b/numpy/core/src/common/simd/sse/math.h index b7f8e6ebb..b51c935af 100644 --- a/numpy/core/src/common/simd/sse/math.h +++ b/numpy/core/src/common/simd/sse/math.h @@ -269,13 +269,23 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a) #ifdef NPY_HAVE_SSE41 return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); #else - const npyv_f32 szero = _mm_set1_ps(-0.0f); - __m128i roundi = _mm_cvtps_epi32(a); - __m128i overflow = _mm_cmpeq_epi32(roundi, _mm_castps_si128(szero)); - __m128 r = _mm_cvtepi32_ps(roundi); - // respect sign of zero - r = _mm_or_ps(r, _mm_and_ps(a, szero)); - return npyv_select_f32(overflow, a, r); + const __m128 szero = _mm_set1_ps(-0.0f); + const __m128i exp_mask = _mm_set1_epi32(0xff000000); + + __m128i nfinite_mask = _mm_slli_epi32(_mm_castps_si128(a), 1); + nfinite_mask = _mm_and_si128(nfinite_mask, exp_mask); + nfinite_mask = _mm_cmpeq_epi32(nfinite_mask, exp_mask); + + // eliminate nans/inf to avoid invalid fp errors + __m128 x = _mm_xor_ps(a, _mm_castsi128_ps(nfinite_mask)); + __m128i roundi = _mm_cvtps_epi32(x); + __m128 round = _mm_cvtepi32_ps(roundi); + // respect signed zero + round = _mm_or_ps(round, _mm_and_ps(a, szero)); + // if overflow return a + __m128i overflow_mask = _mm_cmpeq_epi32(roundi, _mm_castps_si128(szero)); + // a if a overflow or nonfinite + return npyv_select_f32(_mm_or_si128(nfinite_mask, overflow_mask), a, round); #endif } @@ -285,16 +295,22 @@ NPY_FINLINE npyv_f64 npyv_rint_f64(npyv_f64 a) #ifdef NPY_HAVE_SSE41 return _mm_round_pd(a, _MM_FROUND_TO_NEAREST_INT); #else - const npyv_f64 szero = _mm_set1_pd(-0.0); - const npyv_f64 two_power_52 = _mm_set1_pd(0x10000000000000); - npyv_f64 sign_two52 = _mm_or_pd(two_power_52, _mm_and_pd(a, szero)); + const __m128d szero = _mm_set1_pd(-0.0); + const __m128d two_power_52 = _mm_set1_pd(0x10000000000000); + __m128d nan_mask = _mm_cmpunord_pd(a, a); + // eliminate nans to avoid invalid fp errors within cmpge + __m128d abs_x = npyv_abs_f64(_mm_xor_pd(nan_mask, a)); // round by add magic number 2^52 - npyv_f64 round = _mm_sub_pd(_mm_add_pd(a, sign_two52), sign_two52); - // respect signed zero, e.g. -0.5 -> -0.0 - return _mm_or_pd(round, _mm_and_pd(a, szero)); + // assuming that MXCSR register is set to rounding + __m128d round = _mm_sub_pd(_mm_add_pd(two_power_52, abs_x), two_power_52); + // copysign + round = _mm_or_pd(round, _mm_and_pd(a, szero)); + // a if |a| >= 2^52 or a == NaN + __m128d mask = _mm_cmpge_pd(abs_x, two_power_52); + mask = _mm_or_pd(mask, nan_mask); + return npyv_select_f64(_mm_castpd_si128(mask), a, round); #endif } - // ceil #ifdef NPY_HAVE_SSE41 #define npyv_ceil_f32 _mm_ceil_ps @@ -302,27 +318,48 @@ NPY_FINLINE npyv_f64 npyv_rint_f64(npyv_f64 a) #else NPY_FINLINE npyv_f32 npyv_ceil_f32(npyv_f32 a) { - const npyv_f32 szero = _mm_set1_ps(-0.0f); - const npyv_f32 one = _mm_set1_ps(1.0f); - npyv_s32 roundi = _mm_cvttps_epi32(a); - npyv_f32 round = _mm_cvtepi32_ps(roundi); - npyv_f32 ceil = _mm_add_ps(round, _mm_and_ps(_mm_cmplt_ps(round, a), one)); - // respect signed zero, e.g. -0.5 -> -0.0 - npyv_f32 rzero = _mm_or_ps(ceil, _mm_and_ps(a, szero)); + const __m128 one = _mm_set1_ps(1.0f); + const __m128 szero = _mm_set1_ps(-0.0f); + const __m128i exp_mask = _mm_set1_epi32(0xff000000); + + __m128i nfinite_mask = _mm_slli_epi32(_mm_castps_si128(a), 1); + nfinite_mask = _mm_and_si128(nfinite_mask, exp_mask); + nfinite_mask = _mm_cmpeq_epi32(nfinite_mask, exp_mask); + + // eliminate nans/inf to avoid invalid fp errors + __m128 x = _mm_xor_ps(a, _mm_castsi128_ps(nfinite_mask)); + __m128i roundi = _mm_cvtps_epi32(x); + __m128 round = _mm_cvtepi32_ps(roundi); + __m128 ceil = _mm_add_ps(round, _mm_and_ps(_mm_cmplt_ps(round, x), one)); + // respect signed zero + ceil = _mm_or_ps(ceil, _mm_and_ps(a, szero)); // if overflow return a - return npyv_select_f32(_mm_cmpeq_epi32(roundi, _mm_castps_si128(szero)), a, rzero); + __m128i overflow_mask = _mm_cmpeq_epi32(roundi, _mm_castps_si128(szero)); + // a if a overflow or nonfinite + return npyv_select_f32(_mm_or_si128(nfinite_mask, overflow_mask), a, ceil); } NPY_FINLINE npyv_f64 npyv_ceil_f64(npyv_f64 a) { - const npyv_f64 szero = _mm_set1_pd(-0.0); - const npyv_f64 one = _mm_set1_pd(1.0); - const npyv_f64 two_power_52 = _mm_set1_pd(0x10000000000000); - npyv_f64 sign_two52 = _mm_or_pd(two_power_52, _mm_and_pd(a, szero)); + const __m128d one = _mm_set1_pd(1.0); + const __m128d szero = _mm_set1_pd(-0.0); + const __m128d two_power_52 = _mm_set1_pd(0x10000000000000); + __m128d nan_mask = _mm_cmpunord_pd(a, a); + // eliminate nans to avoid invalid fp errors within cmpge + __m128d x = _mm_xor_pd(nan_mask, a); + __m128d abs_x = npyv_abs_f64(x); + __m128d sign_x = _mm_and_pd(x, szero); // round by add magic number 2^52 - npyv_f64 round = _mm_sub_pd(_mm_add_pd(a, sign_two52), sign_two52); - npyv_f64 ceil = _mm_add_pd(round, _mm_and_pd(_mm_cmplt_pd(round, a), one)); - // respect signed zero, e.g. -0.5 -> -0.0 - return _mm_or_pd(ceil, _mm_and_pd(a, szero)); + // assuming that MXCSR register is set to rounding + __m128d round = _mm_sub_pd(_mm_add_pd(two_power_52, abs_x), two_power_52); + // copysign + round = _mm_or_pd(round, sign_x); + __m128d ceil = _mm_add_pd(round, _mm_and_pd(_mm_cmplt_pd(round, x), one)); + // respects sign of 0.0 + ceil = _mm_or_pd(ceil, sign_x); + // a if |a| >= 2^52 or a == NaN + __m128d mask = _mm_cmpge_pd(abs_x, two_power_52); + mask = _mm_or_pd(mask, nan_mask); + return npyv_select_f64(_mm_castpd_si128(mask), a, ceil); } #endif @@ -333,24 +370,43 @@ NPY_FINLINE npyv_f64 npyv_rint_f64(npyv_f64 a) #else NPY_FINLINE npyv_f32 npyv_trunc_f32(npyv_f32 a) { - const npyv_f32 szero = _mm_set1_ps(-0.0f); - npyv_s32 roundi = _mm_cvttps_epi32(a); - npyv_f32 trunc = _mm_cvtepi32_ps(roundi); + const __m128 szero = _mm_set1_ps(-0.0f); + const __m128i exp_mask = _mm_set1_epi32(0xff000000); + + __m128i nfinite_mask = _mm_slli_epi32(_mm_castps_si128(a), 1); + nfinite_mask = _mm_and_si128(nfinite_mask, exp_mask); + nfinite_mask = _mm_cmpeq_epi32(nfinite_mask, exp_mask); + + // eliminate nans/inf to avoid invalid fp errors + __m128 x = _mm_xor_ps(a, _mm_castsi128_ps(nfinite_mask)); + __m128i trunci = _mm_cvttps_epi32(x); + __m128 trunc = _mm_cvtepi32_ps(trunci); // respect signed zero, e.g. -0.5 -> -0.0 - npyv_f32 rzero = _mm_or_ps(trunc, _mm_and_ps(a, szero)); + trunc = _mm_or_ps(trunc, _mm_and_ps(a, szero)); // if overflow return a - return npyv_select_f32(_mm_cmpeq_epi32(roundi, _mm_castps_si128(szero)), a, rzero); + __m128i overflow_mask = _mm_cmpeq_epi32(trunci, _mm_castps_si128(szero)); + // a if a overflow or nonfinite + return npyv_select_f32(_mm_or_si128(nfinite_mask, overflow_mask), a, trunc); } NPY_FINLINE npyv_f64 npyv_trunc_f64(npyv_f64 a) { - const npyv_f64 szero = _mm_set1_pd(-0.0); - const npyv_f64 one = _mm_set1_pd(1.0); - const npyv_f64 two_power_52 = _mm_set1_pd(0x10000000000000); - npyv_f64 abs_a = npyv_abs_f64(a); + const __m128d one = _mm_set1_pd(1.0); + const __m128d szero = _mm_set1_pd(-0.0); + const __m128d two_power_52 = _mm_set1_pd(0x10000000000000); + __m128d nan_mask = _mm_cmpunord_pd(a, a); + // eliminate nans to avoid invalid fp errors within cmpge + __m128d abs_x = npyv_abs_f64(_mm_xor_pd(nan_mask, a)); // round by add magic number 2^52 - npyv_f64 abs_round = _mm_sub_pd(_mm_add_pd(abs_a, two_power_52), two_power_52); - npyv_f64 subtrahend = _mm_and_pd(_mm_cmpgt_pd(abs_round, abs_a), one); - return _mm_or_pd(_mm_sub_pd(abs_round, subtrahend), _mm_and_pd(a, szero)); + // assuming that MXCSR register is set to rounding + __m128d abs_round = _mm_sub_pd(_mm_add_pd(two_power_52, abs_x), two_power_52); + __m128d subtrahend = _mm_and_pd(_mm_cmpgt_pd(abs_round, abs_x), one); + __m128d trunc = _mm_sub_pd(abs_round, subtrahend); + // copysign + trunc = _mm_or_pd(trunc, _mm_and_pd(a, szero)); + // a if |a| >= 2^52 or a == NaN + __m128d mask = _mm_cmpge_pd(abs_x, two_power_52); + mask = _mm_or_pd(mask, nan_mask); + return npyv_select_f64(_mm_castpd_si128(mask), a, trunc); } #endif @@ -361,15 +417,46 @@ NPY_FINLINE npyv_f64 npyv_rint_f64(npyv_f64 a) #else NPY_FINLINE npyv_f32 npyv_floor_f32(npyv_f32 a) { - const npyv_f32 one = _mm_set1_ps(1.0f); - npyv_f32 round = npyv_rint_f32(a); - return _mm_sub_ps(round, _mm_and_ps(_mm_cmpgt_ps(round, a), one)); + const __m128 one = _mm_set1_ps(1.0f); + const __m128 szero = _mm_set1_ps(-0.0f); + const __m128i exp_mask = _mm_set1_epi32(0xff000000); + + __m128i nfinite_mask = _mm_slli_epi32(_mm_castps_si128(a), 1); + nfinite_mask = _mm_and_si128(nfinite_mask, exp_mask); + nfinite_mask = _mm_cmpeq_epi32(nfinite_mask, exp_mask); + + // eliminate nans/inf to avoid invalid fp errors + __m128 x = _mm_xor_ps(a, _mm_castsi128_ps(nfinite_mask)); + __m128i roundi = _mm_cvtps_epi32(x); + __m128 round = _mm_cvtepi32_ps(roundi); + __m128 floor = _mm_sub_ps(round, _mm_and_ps(_mm_cmpgt_ps(round, x), one)); + // respect signed zero + floor = _mm_or_ps(floor, _mm_and_ps(a, szero)); + // if overflow return a + __m128i overflow_mask = _mm_cmpeq_epi32(roundi, _mm_castps_si128(szero)); + // a if a overflow or nonfinite + return npyv_select_f32(_mm_or_si128(nfinite_mask, overflow_mask), a, floor); } NPY_FINLINE npyv_f64 npyv_floor_f64(npyv_f64 a) { - const npyv_f64 one = _mm_set1_pd(1.0); - npyv_f64 round = npyv_rint_f64(a); - return _mm_sub_pd(round, _mm_and_pd(_mm_cmpgt_pd(round, a), one)); + const __m128d one = _mm_set1_pd(1.0f); + const __m128d szero = _mm_set1_pd(-0.0f); + const __m128d two_power_52 = _mm_set1_pd(0x10000000000000); + __m128d nan_mask = _mm_cmpunord_pd(a, a); + // eliminate nans to avoid invalid fp errors within cmpge + __m128d x = _mm_xor_pd(nan_mask, a); + __m128d abs_x = npyv_abs_f64(x); + __m128d sign_x = _mm_and_pd(x, szero); + // round by add magic number 2^52 + // assuming that MXCSR register is set to rounding + __m128d round = _mm_sub_pd(_mm_add_pd(two_power_52, abs_x), two_power_52); + // copysign + round = _mm_or_pd(round, sign_x); + __m128d floor = _mm_sub_pd(round, _mm_and_pd(_mm_cmpgt_pd(round, x), one)); + // a if |a| >= 2^52 or a == NaN + __m128d mask = _mm_cmpge_pd(abs_x, two_power_52); + mask = _mm_or_pd(mask, nan_mask); + return npyv_select_f64(_mm_castpd_si128(mask), a, floor); } #endif // NPY_HAVE_SSE41 diff --git a/numpy/core/src/common/simd/sse/memory.h b/numpy/core/src/common/simd/sse/memory.h index 3ff64848d..4c8e86a6f 100644 --- a/numpy/core/src/common/simd/sse/memory.h +++ b/numpy/core/src/common/simd/sse/memory.h @@ -103,6 +103,28 @@ NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride) { return _mm_castpd_si128(npyv_loadn_f64((const double*)ptr, stride)); } NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride) { return _mm_castpd_si128(npyv_loadn_f64((const double*)ptr, stride)); } + +//// 64-bit load over 32-bit stride +NPY_FINLINE npyv_f32 npyv_loadn2_f32(const float *ptr, npy_intp stride) +{ + __m128d r = _mm_loadh_pd( + npyv_loadl_f64((const double*)ptr), (const double*)(ptr + stride) + ); + return _mm_castpd_ps(r); +} +NPY_FINLINE npyv_u32 npyv_loadn2_u32(const npy_uint32 *ptr, npy_intp stride) +{ return _mm_castps_si128(npyv_loadn2_f32((const float*)ptr, stride)); } +NPY_FINLINE npyv_s32 npyv_loadn2_s32(const npy_int32 *ptr, npy_intp stride) +{ return _mm_castps_si128(npyv_loadn2_f32((const float*)ptr, stride)); } + +//// 128-bit load over 64-bit stride +NPY_FINLINE npyv_f64 npyv_loadn2_f64(const double *ptr, npy_intp stride) +{ (void)stride; return npyv_load_f64(ptr); } +NPY_FINLINE npyv_u64 npyv_loadn2_u64(const npy_uint64 *ptr, npy_intp stride) +{ (void)stride; return npyv_load_u64(ptr); } +NPY_FINLINE npyv_s64 npyv_loadn2_s64(const npy_int64 *ptr, npy_intp stride) +{ (void)stride; return npyv_load_s64(ptr); } + /*************************** * Non-contiguous Store ***************************/ @@ -135,6 +157,24 @@ NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a) NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a) { npyv_storen_f64((double*)ptr, stride, _mm_castsi128_pd(a)); } +//// 64-bit store over 32-bit stride +NPY_FINLINE void npyv_storen2_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a) +{ + _mm_storel_pd((double*)ptr, _mm_castsi128_pd(a)); + _mm_storeh_pd((double*)(ptr + stride), _mm_castsi128_pd(a)); +} +NPY_FINLINE void npyv_storen2_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a) +{ npyv_storen2_u32((npy_uint32*)ptr, stride, a); } +NPY_FINLINE void npyv_storen2_f32(float *ptr, npy_intp stride, npyv_f32 a) +{ npyv_storen2_u32((npy_uint32*)ptr, stride, _mm_castps_si128(a)); } + +//// 128-bit store over 64-bit stride +NPY_FINLINE void npyv_storen2_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a) +{ (void)stride; npyv_store_u64(ptr, a); } +NPY_FINLINE void npyv_storen2_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a) +{ (void)stride; npyv_store_s64(ptr, a); } +NPY_FINLINE void npyv_storen2_f64(double *ptr, npy_intp stride, npyv_f64 a) +{ (void)stride; npyv_store_f64(ptr, a); } /********************************* * Partial Load *********************************/ @@ -204,13 +244,14 @@ NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) return _mm_cvtsi32_si128(*ptr); case 2: return _mm_loadl_epi64((const __m128i*)ptr); - case 3:; - npyv_s32 a = _mm_loadl_epi64((const __m128i*)ptr); - #ifdef NPY_HAVE_SSE41 - return _mm_insert_epi32(a, ptr[2], 2); - #else - return _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[2])); - #endif + case 3: { + npyv_s32 a = _mm_loadl_epi64((const __m128i*)ptr); + #ifdef NPY_HAVE_SSE41 + return _mm_insert_epi32(a, ptr[2], 2); + #else + return _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[2])); + #endif + } default: return npyv_load_s32(ptr); } @@ -246,6 +287,32 @@ NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) } return npyv_load_s64(ptr); } + +//// 64-bit nlane +NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane, + npy_int32 fill_lo, npy_int32 fill_hi) +{ + assert(nlane > 0); + if (nlane == 1) { + const __m128i vfill = npyv_set_s32(fill_lo, fill_hi, fill_lo, fill_hi); + return _mm_castpd_si128( + _mm_loadl_pd(_mm_castsi128_pd(vfill), (double*)ptr) + ); + } + return npyv_load_s32(ptr); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) +{ return npyv_load_tillz_s64((const npy_int64*)ptr, nlane); } + +//// 128-bit nlane +NPY_FINLINE npyv_s64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane, + npy_int64 fill_lo, npy_int64 fill_hi) +{ (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); } + +NPY_FINLINE npyv_s64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) +{ (void)nlane; return npyv_load_s64(ptr); } + /********************************* * Non-contiguous partial load *********************************/ @@ -305,23 +372,27 @@ npyv_loadn_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane) case 1: return _mm_cvtsi32_si128(ptr[0]); case 2:; - npyv_s32 a = _mm_cvtsi32_si128(ptr[0]); -#ifdef NPY_HAVE_SSE41 - return _mm_insert_epi32(a, ptr[stride], 1); -#else - return _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride])); -#endif // NPY_HAVE_SSE41 - case 3:; - a = _mm_cvtsi32_si128(ptr[0]); -#ifdef NPY_HAVE_SSE41 - a = _mm_insert_epi32(a, ptr[stride], 1); - a = _mm_insert_epi32(a, ptr[stride*2], 2); - return a; -#else - a = _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride])); - a = _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[stride*2])); - return a; -#endif // NPY_HAVE_SSE41 + { + npyv_s32 a = _mm_cvtsi32_si128(ptr[0]); + #ifdef NPY_HAVE_SSE41 + return _mm_insert_epi32(a, ptr[stride], 1); + #else + return _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride])); + #endif // NPY_HAVE_SSE41 + } + case 3: + { + npyv_s32 a = _mm_cvtsi32_si128(ptr[0]); + #ifdef NPY_HAVE_SSE41 + a = _mm_insert_epi32(a, ptr[stride], 1); + a = _mm_insert_epi32(a, ptr[stride*2], 2); + return a; + #else + a = _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride])); + a = _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[stride*2])); + return a; + #endif // NPY_HAVE_SSE41 + } default: return npyv_loadn_s32(ptr, stride); } @@ -358,6 +429,37 @@ NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, } return npyv_loadn_s64(ptr, stride); } + +//// 64-bit load over 32-bit stride +NPY_FINLINE npyv_s32 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, + npy_int32 fill_lo, npy_int32 fill_hi) +{ + assert(nlane > 0); + if (nlane == 1) { + const __m128i vfill = npyv_set_s32(0, 0, fill_lo, fill_hi); + return _mm_castpd_si128( + _mm_loadl_pd(_mm_castsi128_pd(vfill), (double*)ptr) + ); + } + return npyv_loadn2_s32(ptr, stride); +} +NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane) +{ + assert(nlane > 0); + if (nlane == 1) { + return _mm_loadl_epi64((const __m128i*)ptr); + } + return npyv_loadn2_s32(ptr, stride); +} + +//// 128-bit load over 64-bit stride +NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, + npy_int64 fill_lo, npy_int64 fill_hi) +{ assert(nlane > 0); (void)stride; (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); } + +NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane) +{ assert(nlane > 0); (void)stride; (void)nlane; return npyv_load_s64(ptr); } + /********************************* * Partial store *********************************/ @@ -394,6 +496,17 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a } npyv_store_s64(ptr, a); } +//// 64-bit nlane +NPY_FINLINE void npyv_store2_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a) +{ npyv_store_till_s64((npy_int64*)ptr, nlane, a); } + +//// 128-bit nlane +NPY_FINLINE void npyv_store2_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a) +{ + assert(nlane > 0); (void)nlane; + npyv_store_s64(ptr, a); +} + /********************************* * Non-contiguous partial store *********************************/ @@ -401,25 +514,35 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a) { assert(nlane > 0); + ptr[stride*0] = _mm_cvtsi128_si32(a); switch(nlane) { + case 1: + return; #ifdef NPY_HAVE_SSE41 - default: - ptr[stride*3] = _mm_extract_epi32(a, 3); + case 2: + ptr[stride*1] = _mm_extract_epi32(a, 1); + return; case 3: + ptr[stride*1] = _mm_extract_epi32(a, 1); ptr[stride*2] = _mm_extract_epi32(a, 2); - case 2: + return; + default: ptr[stride*1] = _mm_extract_epi32(a, 1); + ptr[stride*2] = _mm_extract_epi32(a, 2); + ptr[stride*3] = _mm_extract_epi32(a, 3); #else - default: - ptr[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 3))); + case 2: + ptr[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 1))); + return; case 3: + ptr[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 1))); ptr[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 2))); - case 2: + return; + default: ptr[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 1))); + ptr[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 2))); + ptr[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 3))); #endif - case 1: - ptr[stride*0] = _mm_cvtsi128_si32(a); - break; } } //// 64 @@ -432,6 +555,21 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp } npyv_storen_s64(ptr, stride, a); } + +//// 64-bit store over 32-bit stride +NPY_FINLINE void npyv_storen2_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a) +{ + assert(nlane > 0); + npyv_storel_s32(ptr, a); + if (nlane > 1) { + npyv_storeh_s32(ptr + stride, a); + } +} + +//// 128-bit store over 64-bit stride +NPY_FINLINE void npyv_storen2_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a) +{ assert(nlane > 0); (void)stride; (void)nlane; npyv_store_s64(ptr, a); } + /***************************************************************** * Implement partial load/store for u32/f32/u64/f64... via casting *****************************************************************/ @@ -442,7 +580,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp union { \ npyv_lanetype_##F_SFX from_##F_SFX; \ npyv_lanetype_##T_SFX to_##T_SFX; \ - } pun = {.from_##F_SFX = fill}; \ + } pun; \ + pun.from_##F_SFX = fill; \ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX( \ (const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX \ )); \ @@ -454,7 +593,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp union { \ npyv_lanetype_##F_SFX from_##F_SFX; \ npyv_lanetype_##T_SFX to_##T_SFX; \ - } pun = {.from_##F_SFX = fill}; \ + } pun; \ + pun.from_##F_SFX = fill; \ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX( \ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX \ )); \ @@ -495,6 +635,110 @@ NPYV_IMPL_SSE_REST_PARTIAL_TYPES(f32, s32) NPYV_IMPL_SSE_REST_PARTIAL_TYPES(u64, s64) NPYV_IMPL_SSE_REST_PARTIAL_TYPES(f64, s64) +// 128-bit/64-bit stride +#define NPYV_IMPL_SSE_REST_PARTIAL_TYPES_PAIR(F_SFX, T_SFX) \ + NPY_FINLINE npyv_##F_SFX npyv_load2_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, \ + npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \ + { \ + union pun { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + }; \ + union pun pun_lo; \ + union pun pun_hi; \ + pun_lo.from_##F_SFX = fill_lo; \ + pun_hi.from_##F_SFX = fill_hi; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane, pun_lo.to_##T_SFX, pun_hi.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn2_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \ + npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \ + { \ + union pun { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + }; \ + union pun pun_lo; \ + union pun pun_hi; \ + pun_lo.from_##F_SFX = fill_lo; \ + pun_hi.from_##F_SFX = fill_hi; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun_lo.to_##T_SFX, \ + pun_hi.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_load2_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \ + { \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn2_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \ + { \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \ + )); \ + } \ + NPY_FINLINE void npyv_store2_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_store2_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } \ + NPY_FINLINE void npyv_storen2_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_storen2_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } + +NPYV_IMPL_SSE_REST_PARTIAL_TYPES_PAIR(u32, s32) +NPYV_IMPL_SSE_REST_PARTIAL_TYPES_PAIR(f32, s32) +NPYV_IMPL_SSE_REST_PARTIAL_TYPES_PAIR(u64, s64) +NPYV_IMPL_SSE_REST_PARTIAL_TYPES_PAIR(f64, s64) + +/************************************************************ + * de-interlave load / interleave contiguous store + ************************************************************/ +// two channels +#define NPYV_IMPL_SSE_MEM_INTERLEAVE(SFX, ZSFX) \ + NPY_FINLINE npyv_##ZSFX##x2 npyv_zip_##ZSFX(npyv_##ZSFX, npyv_##ZSFX); \ + NPY_FINLINE npyv_##ZSFX##x2 npyv_unzip_##ZSFX(npyv_##ZSFX, npyv_##ZSFX); \ + NPY_FINLINE npyv_##SFX##x2 npyv_load_##SFX##x2( \ + const npyv_lanetype_##SFX *ptr \ + ) { \ + return npyv_unzip_##ZSFX( \ + npyv_load_##SFX(ptr), npyv_load_##SFX(ptr+npyv_nlanes_##SFX) \ + ); \ + } \ + NPY_FINLINE void npyv_store_##SFX##x2( \ + npyv_lanetype_##SFX *ptr, npyv_##SFX##x2 v \ + ) { \ + npyv_##SFX##x2 zip = npyv_zip_##ZSFX(v.val[0], v.val[1]); \ + npyv_store_##SFX(ptr, zip.val[0]); \ + npyv_store_##SFX(ptr + npyv_nlanes_##SFX, zip.val[1]); \ + } + +NPYV_IMPL_SSE_MEM_INTERLEAVE(u8, u8) +NPYV_IMPL_SSE_MEM_INTERLEAVE(s8, u8) +NPYV_IMPL_SSE_MEM_INTERLEAVE(u16, u16) +NPYV_IMPL_SSE_MEM_INTERLEAVE(s16, u16) +NPYV_IMPL_SSE_MEM_INTERLEAVE(u32, u32) +NPYV_IMPL_SSE_MEM_INTERLEAVE(s32, u32) +NPYV_IMPL_SSE_MEM_INTERLEAVE(u64, u64) +NPYV_IMPL_SSE_MEM_INTERLEAVE(s64, u64) +NPYV_IMPL_SSE_MEM_INTERLEAVE(f32, f32) +NPYV_IMPL_SSE_MEM_INTERLEAVE(f64, f64) + /********************************* * Lookup table *********************************/ diff --git a/numpy/core/src/common/simd/sse/reorder.h b/numpy/core/src/common/simd/sse/reorder.h index d96ab9c56..9a57f6489 100644 --- a/numpy/core/src/common/simd/sse/reorder.h +++ b/numpy/core/src/common/simd/sse/reorder.h @@ -81,6 +81,75 @@ NPYV_IMPL_SSE_ZIP(npyv_s64, s64, epi64) NPYV_IMPL_SSE_ZIP(npyv_f32, f32, ps) NPYV_IMPL_SSE_ZIP(npyv_f64, f64, pd) +// deinterleave two vectors +NPY_FINLINE npyv_u8x2 npyv_unzip_u8(npyv_u8 ab0, npyv_u8 ab1) +{ +#ifdef NPY_HAVE_SSSE3 + const __m128i idx = _mm_setr_epi8( + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 + ); + __m128i abl = _mm_shuffle_epi8(ab0, idx); + __m128i abh = _mm_shuffle_epi8(ab1, idx); + return npyv_combine_u8(abl, abh); +#else + __m128i ab_083b = _mm_unpacklo_epi8(ab0, ab1); + __m128i ab_4c6e = _mm_unpackhi_epi8(ab0, ab1); + __m128i ab_048c = _mm_unpacklo_epi8(ab_083b, ab_4c6e); + __m128i ab_36be = _mm_unpackhi_epi8(ab_083b, ab_4c6e); + __m128i ab_0346 = _mm_unpacklo_epi8(ab_048c, ab_36be); + __m128i ab_8bc8 = _mm_unpackhi_epi8(ab_048c, ab_36be); + npyv_u8x2 r; + r.val[0] = _mm_unpacklo_epi8(ab_0346, ab_8bc8); + r.val[1] = _mm_unpackhi_epi8(ab_0346, ab_8bc8); + return r; +#endif +} +#define npyv_unzip_s8 npyv_unzip_u8 + +NPY_FINLINE npyv_u16x2 npyv_unzip_u16(npyv_u16 ab0, npyv_u16 ab1) +{ +#ifdef NPY_HAVE_SSSE3 + const __m128i idx = _mm_setr_epi8( + 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15 + ); + __m128i abl = _mm_shuffle_epi8(ab0, idx); + __m128i abh = _mm_shuffle_epi8(ab1, idx); + return npyv_combine_u16(abl, abh); +#else + __m128i ab_0415 = _mm_unpacklo_epi16(ab0, ab1); + __m128i ab_263f = _mm_unpackhi_epi16(ab0, ab1); + __m128i ab_0246 = _mm_unpacklo_epi16(ab_0415, ab_263f); + __m128i ab_135f = _mm_unpackhi_epi16(ab_0415, ab_263f); + npyv_u16x2 r; + r.val[0] = _mm_unpacklo_epi16(ab_0246, ab_135f); + r.val[1] = _mm_unpackhi_epi16(ab_0246, ab_135f); + return r; +#endif +} +#define npyv_unzip_s16 npyv_unzip_u16 + +NPY_FINLINE npyv_u32x2 npyv_unzip_u32(npyv_u32 ab0, npyv_u32 ab1) +{ + __m128i abl = _mm_shuffle_epi32(ab0, _MM_SHUFFLE(3, 1, 2, 0)); + __m128i abh = _mm_shuffle_epi32(ab1, _MM_SHUFFLE(3, 1, 2, 0)); + return npyv_combine_u32(abl, abh); +} +#define npyv_unzip_s32 npyv_unzip_u32 + +NPY_FINLINE npyv_u64x2 npyv_unzip_u64(npyv_u64 ab0, npyv_u64 ab1) +{ return npyv_combine_u64(ab0, ab1); } +#define npyv_unzip_s64 npyv_unzip_u64 + +NPY_FINLINE npyv_f32x2 npyv_unzip_f32(npyv_f32 ab0, npyv_f32 ab1) +{ + npyv_f32x2 r; + r.val[0] = _mm_shuffle_ps(ab0, ab1, _MM_SHUFFLE(2, 0, 2, 0)); + r.val[1] = _mm_shuffle_ps(ab0, ab1, _MM_SHUFFLE(3, 1, 3, 1)); + return r; +} +NPY_FINLINE npyv_f64x2 npyv_unzip_f64(npyv_f64 ab0, npyv_f64 ab1) +{ return npyv_combine_f64(ab0, ab1); } + // Reverse elements of each 64-bit lane NPY_FINLINE npyv_u16 npyv_rev64_u16(npyv_u16 a) { @@ -122,4 +191,22 @@ NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a) return _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)); } +// Permuting the elements of each 128-bit lane by immediate index for +// each element. +#define npyv_permi128_u32(A, E0, E1, E2, E3) \ + _mm_shuffle_epi32(A, _MM_SHUFFLE(E3, E2, E1, E0)) + +#define npyv_permi128_s32 npyv_permi128_u32 + +#define npyv_permi128_u64(A, E0, E1) \ + _mm_shuffle_epi32(A, _MM_SHUFFLE(((E1)<<1)+1, ((E1)<<1), ((E0)<<1)+1, ((E0)<<1))) + +#define npyv_permi128_s64 npyv_permi128_u64 + +#define npyv_permi128_f32(A, E0, E1, E2, E3) \ + _mm_shuffle_ps(A, A, _MM_SHUFFLE(E3, E2, E1, E0)) + +#define npyv_permi128_f64(A, E0, E1) \ + _mm_shuffle_pd(A, A, _MM_SHUFFLE2(E1, E0)) + #endif // _NPY_SIMD_SSE_REORDER_H diff --git a/numpy/core/src/common/simd/sse/sse.h b/numpy/core/src/common/simd/sse/sse.h index c21bbfda7..0c6b8cdba 100644 --- a/numpy/core/src/common/simd/sse/sse.h +++ b/numpy/core/src/common/simd/sse/sse.h @@ -12,6 +12,7 @@ #define NPY_SIMD_FMA3 0 // fast emulated #endif #define NPY_SIMD_BIGENDIAN 0 +#define NPY_SIMD_CMPSIGNAL 1 typedef __m128i npyv_u8; typedef __m128i npyv_s8; diff --git a/numpy/core/src/common/simd/vec/arithmetic.h b/numpy/core/src/common/simd/vec/arithmetic.h index a2e9d07eb..85f4d6b26 100644 --- a/numpy/core/src/common/simd/vec/arithmetic.h +++ b/numpy/core/src/common/simd/vec/arithmetic.h @@ -322,6 +322,20 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor) NPY_FINLINE npyv_f64 npyv_nmulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) { return vec_neg(vec_madd(a, b, c)); } #endif +// multiply, add for odd elements and subtract even elements. +// (a * b) -+ c +#if NPY_SIMD_F32 +NPY_FINLINE npyv_f32 npyv_muladdsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) +{ + const npyv_f32 msign = npyv_set_f32(-0.0f, 0.0f, -0.0f, 0.0f); + return npyv_muladd_f32(a, b, npyv_xor_f32(msign, c)); +} +#endif +NPY_FINLINE npyv_f64 npyv_muladdsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) +{ + const npyv_f64 msign = npyv_set_f64(-0.0, 0.0); + return npyv_muladd_f64(a, b, npyv_xor_f64(msign, c)); +} /*************************** * Summation ***************************/ @@ -352,6 +366,7 @@ NPY_FINLINE float npyv_sum_f32(npyv_f32 a) { npyv_f32 sum = vec_add(a, npyv_combineh_f32(a, a)); return vec_extract(sum, 0) + vec_extract(sum, 1); + (void)sum; } #endif @@ -372,6 +387,7 @@ NPY_FINLINE npy_uint16 npyv_sumup_u8(npyv_u8 a) npyv_u32 four = vec_sum4s(a, zero); npyv_s32 one = vec_sums((npyv_s32)four, (npyv_s32)zero); return (npy_uint16)vec_extract(one, 3); + (void)one; #endif } @@ -386,6 +402,7 @@ NPY_FINLINE npy_uint32 npyv_sumup_u16(npyv_u16 a) npyv_u32 four = vec_add(eight.val[0], eight.val[1]); npyv_s32 one = vec_sums((npyv_s32)four, zero); return (npy_uint32)vec_extract(one, 3); + (void)one; #endif } diff --git a/numpy/core/src/common/simd/vec/conversion.h b/numpy/core/src/common/simd/vec/conversion.h index f0d625c55..922109f7b 100644 --- a/numpy/core/src/common/simd/vec/conversion.h +++ b/numpy/core/src/common/simd/vec/conversion.h @@ -96,6 +96,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d, #else return vec_extract(r, 4); #endif + // to suppress ambiguous warning: variable `r` but not used [-Wunused-but-set-variable] + (void)r; } NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a) { @@ -106,6 +108,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d, #else return vec_extract(r, 8); #endif + // to suppress ambiguous warning: variable `r` but not used [-Wunused-but-set-variable] + (void)r; } NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a) { @@ -120,6 +124,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d, #else return vec_extract(r, 8); #endif + // to suppress ambiguous warning: variable `r` but not used [-Wunused-but-set-variable] + (void)r; } NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a) { @@ -134,6 +140,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d, #else return vec_extract(r, 8); #endif + // to suppress ambiguous warning: variable `r` but not used [-Wunused-but-set-variable] + (void)r; } #else NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a) @@ -170,7 +178,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d, #ifdef NPY_HAVE_VXE2 return vec_signed(a); #elif defined(NPY_HAVE_VXE) - return vec_packs(vec_signed(npyv_doublee(a)), vec_signed(npyv_doublee(vec_mergel(a, a)))); + return vec_packs(vec_signed(npyv_doublee(vec_mergeh(a,a))), + vec_signed(npyv_doublee(vec_mergel(a, a)))); // VSX #elif defined(__IBMC__) return vec_cts(a, 0); diff --git a/numpy/core/src/common/simd/vec/math.h b/numpy/core/src/common/simd/vec/math.h index 95b16fdf7..85690f76c 100644 --- a/numpy/core/src/common/simd/vec/math.h +++ b/numpy/core/src/common/simd/vec/math.h @@ -186,6 +186,7 @@ NPY_IMPL_VEC_REDUCE_MINMAX(max, int32, s32) { \ npyv_##SFX r = vec_##INTRIN(a, vec_sld(a, a, 8)); \ return (npy_##STYPE)vec_extract(r, 0); \ + (void)r; \ } NPY_IMPL_VEC_REDUCE_MINMAX(min, uint64, u64) NPY_IMPL_VEC_REDUCE_MINMAX(max, uint64, u64) @@ -225,6 +226,7 @@ NPY_IMPL_VEC_REDUCE_MINMAX(max, int64, s64) { \ npyv_f64 r = vec_##INTRIN(a, vec_sld(a, a, 8)); \ return vec_extract(r, 0); \ + (void)r; \ } \ NPY_FINLINE double npyv_reduce_##INTRIN##n_f64(npyv_f64 a) \ { \ diff --git a/numpy/core/src/common/simd/vec/memory.h b/numpy/core/src/common/simd/vec/memory.h index e8f588ef2..1ad39cead 100644 --- a/numpy/core/src/common/simd/vec/memory.h +++ b/numpy/core/src/common/simd/vec/memory.h @@ -46,14 +46,8 @@ #endif // avoid aliasing rules -#ifdef __cplusplus - template<typename T_PTR> - NPY_FINLINE npy_uint64 *npyv__ptr2u64(const T_PTR *ptr) - { npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr64; } -#else - NPY_FINLINE npy_uint64 *npyv__ptr2u64(const void *ptr) - { npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr64; } -#endif // __cplusplus +NPY_FINLINE npy_uint64 *npyv__ptr2u64(const void *ptr) +{ npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr64; } // load lower part NPY_FINLINE npyv_u64 npyv__loadl(const void *ptr) @@ -136,6 +130,24 @@ NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride) { return npyv_set_s64(ptr[0], ptr[stride]); } NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride) { return npyv_set_f64(ptr[0], ptr[stride]); } + +//// 64-bit load over 32-bit stride +NPY_FINLINE npyv_u32 npyv_loadn2_u32(const npy_uint32 *ptr, npy_intp stride) +{ return (npyv_u32)npyv_set_u64(*(npy_uint64*)ptr, *(npy_uint64*)(ptr + stride)); } +NPY_FINLINE npyv_s32 npyv_loadn2_s32(const npy_int32 *ptr, npy_intp stride) +{ return (npyv_s32)npyv_set_u64(*(npy_uint64*)ptr, *(npy_uint64*)(ptr + stride)); } +#if NPY_SIMD_F32 +NPY_FINLINE npyv_f32 npyv_loadn2_f32(const float *ptr, npy_intp stride) +{ return (npyv_f32)npyv_set_u64(*(npy_uint64*)ptr, *(npy_uint64*)(ptr + stride)); } +#endif +//// 128-bit load over 64-bit stride +NPY_FINLINE npyv_u64 npyv_loadn2_u64(const npy_uint64 *ptr, npy_intp stride) +{ (void)stride; return npyv_load_u64(ptr); } +NPY_FINLINE npyv_s64 npyv_loadn2_s64(const npy_int64 *ptr, npy_intp stride) +{ (void)stride; return npyv_load_s64(ptr); } +NPY_FINLINE npyv_f64 npyv_loadn2_f64(const double *ptr, npy_intp stride) +{ (void)stride; return npyv_load_f64(ptr); } + /*************************** * Non-contiguous Store ***************************/ @@ -164,6 +176,26 @@ NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a) NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a) { npyv_storen_u64((npy_uint64*)ptr, stride, (npyv_u64)a); } +//// 64-bit store over 32-bit stride +NPY_FINLINE void npyv_storen2_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a) +{ + *(npy_uint64*)ptr = vec_extract((npyv_u64)a, 0); + *(npy_uint64*)(ptr + stride) = vec_extract((npyv_u64)a, 1); +} +NPY_FINLINE void npyv_storen2_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a) +{ npyv_storen2_u32((npy_uint32*)ptr, stride, (npyv_u32)a); } +#if NPY_SIMD_F32 +NPY_FINLINE void npyv_storen2_f32(float *ptr, npy_intp stride, npyv_f32 a) +{ npyv_storen2_u32((npy_uint32*)ptr, stride, (npyv_u32)a); } +#endif +//// 128-bit store over 64-bit stride +NPY_FINLINE void npyv_storen2_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a) +{ (void)stride; npyv_store_u64(ptr, a); } +NPY_FINLINE void npyv_storen2_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a) +{ (void)stride; npyv_store_s64(ptr, a); } +NPY_FINLINE void npyv_storen2_f64(double *ptr, npy_intp stride, npyv_f64 a) +{ (void)stride; npyv_store_f64(ptr, a); } + /********************************* * Partial Load *********************************/ @@ -173,9 +205,9 @@ NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, n assert(nlane > 0); npyv_s32 vfill = npyv_setall_s32(fill); #ifdef NPY_HAVE_VX - const unsigned blane = (unsigned short)nlane; + const unsigned blane = (nlane > 4) ? 4 : nlane; const npyv_u32 steps = npyv_set_u32(0, 1, 2, 3); - const npyv_u32 vlane = npyv_setall_u32((unsigned)blane); + const npyv_u32 vlane = npyv_setall_u32(blane); const npyv_b32 mask = vec_cmpgt(vlane, steps); npyv_s32 a = vec_load_len(ptr, blane*4-1); return vec_sel(vfill, a, mask); @@ -201,8 +233,8 @@ NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, n NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) { #ifdef NPY_HAVE_VX - unsigned blane = ((unsigned short)nlane)*4 - 1; - return vec_load_len(ptr, blane); + unsigned blane = (nlane > 4) ? 4 : nlane; + return vec_load_len(ptr, blane*4-1); #else return npyv_load_till_s32(ptr, nlane, 0); #endif @@ -220,12 +252,33 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) { #ifdef NPY_HAVE_VX - unsigned blane = (unsigned short)nlane; + unsigned blane = (nlane > 2) ? 2 : nlane; return vec_load_len((const signed long long*)ptr, blane*8-1); #else return npyv_load_till_s64(ptr, nlane, 0); #endif } +//// 64-bit nlane +NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane, + npy_int32 fill_lo, npy_int32 fill_hi) +{ + assert(nlane > 0); + if (nlane == 1) { + return npyv_set_s32(ptr[0], ptr[1], fill_lo, fill_hi); + } + return npyv_load_s32(ptr); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) +{ return (npyv_s32)npyv_load_tillz_s64((const npy_int64*)ptr, nlane); } + +//// 128-bit nlane +NPY_FINLINE npyv_s64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane, + npy_int64 fill_lo, npy_int64 fill_hi) +{ (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); } + +NPY_FINLINE npyv_s64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) +{ (void)nlane; return npyv_load_s64(ptr); } /********************************* * Non-contiguous partial load *********************************/ @@ -265,6 +318,34 @@ npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_ // fill zero to rest lanes NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane) { return npyv_loadn_till_s64(ptr, stride, nlane, 0); } + +//// 64-bit load over 32-bit stride +NPY_FINLINE npyv_s32 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, + npy_int32 fill_lo, npy_int32 fill_hi) +{ + assert(nlane > 0); + if (nlane == 1) { + return npyv_set_s32(ptr[0], ptr[1], fill_lo, fill_hi); + } + return npyv_loadn2_s32(ptr, stride); +} +NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane) +{ + assert(nlane > 0); + if (nlane == 1) { + return (npyv_s32)npyv_set_s64(*(npy_int64*)ptr, 0); + } + return npyv_loadn2_s32(ptr, stride); +} + +//// 128-bit load over 64-bit stride +NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, + npy_int64 fill_lo, npy_int64 fill_hi) +{ assert(nlane > 0); (void)stride; (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); } + +NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane) +{ assert(nlane > 0); (void)stride; (void)nlane; return npyv_load_s64(ptr); } + /********************************* * Partial store *********************************/ @@ -273,7 +354,7 @@ NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a { assert(nlane > 0); #ifdef NPY_HAVE_VX - unsigned blane = (unsigned short)nlane; + unsigned blane = (nlane > 4) ? 4 : nlane; vec_store_len(a, ptr, blane*4-1); #else switch(nlane) { @@ -297,7 +378,7 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a { assert(nlane > 0); #ifdef NPY_HAVE_VX - unsigned blane = (unsigned short)nlane; + unsigned blane = (nlane > 2) ? 2 : nlane; vec_store_len(a, (signed long long*)ptr, blane*8-1); #else if (nlane == 1) { @@ -307,6 +388,18 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a npyv_store_s64(ptr, a); #endif } + +//// 64-bit nlane +NPY_FINLINE void npyv_store2_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a) +{ npyv_store_till_s64((npy_int64*)ptr, nlane, (npyv_s64)a); } + +//// 128-bit nlane +NPY_FINLINE void npyv_store2_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a) +{ + assert(nlane > 0); (void)nlane; + npyv_store_s64(ptr, a); +} + /********************************* * Non-contiguous partial store *********************************/ @@ -314,16 +407,21 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a) { assert(nlane > 0); + ptr[stride*0] = vec_extract(a, 0); switch(nlane) { - default: - ptr[stride*3] = vec_extract(a, 3); - case 3: - ptr[stride*2] = vec_extract(a, 2); + case 1: + return; case 2: ptr[stride*1] = vec_extract(a, 1); - case 1: - ptr[stride*0] = vec_extract(a, 0); - break; + return; + case 3: + ptr[stride*1] = vec_extract(a, 1); + ptr[stride*2] = vec_extract(a, 2); + return; + default: + ptr[stride*1] = vec_extract(a, 1); + ptr[stride*2] = vec_extract(a, 2); + ptr[stride*3] = vec_extract(a, 3); } } //// 64 @@ -336,6 +434,21 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp } npyv_storen_s64(ptr, stride, a); } + +//// 64-bit store over 32-bit stride +NPY_FINLINE void npyv_storen2_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a) +{ + assert(nlane > 0); + npyv_storel_s32(ptr, a); + if (nlane > 1) { + npyv_storeh_s32(ptr + stride, a); + } +} + +//// 128-bit store over 64-bit stride +NPY_FINLINE void npyv_storen2_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a) +{ assert(nlane > 0); (void)stride; (void)nlane; npyv_store_s64(ptr, a); } + /***************************************************************** * Implement partial load/store for u32/f32/u64/f64... via casting *****************************************************************/ @@ -346,7 +459,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp union { \ npyv_lanetype_##F_SFX from_##F_SFX; \ npyv_lanetype_##T_SFX to_##T_SFX; \ - } pun = {.from_##F_SFX = fill}; \ + } pun; \ + pun.from_##F_SFX = fill; \ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX( \ (const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX \ )); \ @@ -358,7 +472,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp union { \ npyv_lanetype_##F_SFX from_##F_SFX; \ npyv_lanetype_##T_SFX to_##T_SFX; \ - } pun = {.from_##F_SFX = fill}; \ + } pun; \ + pun.from_##F_SFX = fill; \ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX( \ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX \ )); \ @@ -401,6 +516,114 @@ NPYV_IMPL_VEC_REST_PARTIAL_TYPES(f32, s32) NPYV_IMPL_VEC_REST_PARTIAL_TYPES(u64, s64) NPYV_IMPL_VEC_REST_PARTIAL_TYPES(f64, s64) +// 128-bit/64-bit stride +#define NPYV_IMPL_VEC_REST_PARTIAL_TYPES_PAIR(F_SFX, T_SFX) \ + NPY_FINLINE npyv_##F_SFX npyv_load2_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, \ + npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \ + { \ + union pun { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + }; \ + union pun pun_lo; \ + union pun pun_hi; \ + pun_lo.from_##F_SFX = fill_lo; \ + pun_hi.from_##F_SFX = fill_hi; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane, pun_lo.to_##T_SFX, pun_hi.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn2_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \ + npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \ + { \ + union pun { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + }; \ + union pun pun_lo; \ + union pun pun_hi; \ + pun_lo.from_##F_SFX = fill_lo; \ + pun_hi.from_##F_SFX = fill_hi; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun_lo.to_##T_SFX, \ + pun_hi.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_load2_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \ + { \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn2_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \ + { \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \ + )); \ + } \ + NPY_FINLINE void npyv_store2_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_store2_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } \ + NPY_FINLINE void npyv_storen2_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_storen2_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } + +NPYV_IMPL_VEC_REST_PARTIAL_TYPES_PAIR(u32, s32) +#if NPY_SIMD_F32 +NPYV_IMPL_VEC_REST_PARTIAL_TYPES_PAIR(f32, s32) +#endif +NPYV_IMPL_VEC_REST_PARTIAL_TYPES_PAIR(u64, s64) +NPYV_IMPL_VEC_REST_PARTIAL_TYPES_PAIR(f64, s64) + +/************************************************************ + * de-interlave load / interleave contiguous store + ************************************************************/ +// two channels +#define NPYV_IMPL_VEC_MEM_INTERLEAVE(SFX) \ + NPY_FINLINE npyv_##SFX##x2 npyv_zip_##SFX(npyv_##SFX, npyv_##SFX); \ + NPY_FINLINE npyv_##SFX##x2 npyv_unzip_##SFX(npyv_##SFX, npyv_##SFX); \ + NPY_FINLINE npyv_##SFX##x2 npyv_load_##SFX##x2( \ + const npyv_lanetype_##SFX *ptr \ + ) { \ + return npyv_unzip_##SFX( \ + npyv_load_##SFX(ptr), npyv_load_##SFX(ptr+npyv_nlanes_##SFX) \ + ); \ + } \ + NPY_FINLINE void npyv_store_##SFX##x2( \ + npyv_lanetype_##SFX *ptr, npyv_##SFX##x2 v \ + ) { \ + npyv_##SFX##x2 zip = npyv_zip_##SFX(v.val[0], v.val[1]); \ + npyv_store_##SFX(ptr, zip.val[0]); \ + npyv_store_##SFX(ptr + npyv_nlanes_##SFX, zip.val[1]); \ + } + +NPYV_IMPL_VEC_MEM_INTERLEAVE(u8) +NPYV_IMPL_VEC_MEM_INTERLEAVE(s8) +NPYV_IMPL_VEC_MEM_INTERLEAVE(u16) +NPYV_IMPL_VEC_MEM_INTERLEAVE(s16) +NPYV_IMPL_VEC_MEM_INTERLEAVE(u32) +NPYV_IMPL_VEC_MEM_INTERLEAVE(s32) +NPYV_IMPL_VEC_MEM_INTERLEAVE(u64) +NPYV_IMPL_VEC_MEM_INTERLEAVE(s64) +#if NPY_SIMD_F32 +NPYV_IMPL_VEC_MEM_INTERLEAVE(f32) +#endif +NPYV_IMPL_VEC_MEM_INTERLEAVE(f64) + /********************************* * Lookup table *********************************/ diff --git a/numpy/core/src/common/simd/vec/misc.h b/numpy/core/src/common/simd/vec/misc.h index 7ea0f21f6..79c194d90 100644 --- a/numpy/core/src/common/simd/vec/misc.h +++ b/numpy/core/src/common/simd/vec/misc.h @@ -26,28 +26,28 @@ #define NPYV_IMPL_VEC_SPLTW(T_VEC, V) ((T_VEC){V, V, V, V}) #define NPYV_IMPL_VEC_SPLTD(T_VEC, V) ((T_VEC){V, V}) -#define npyv_setall_u8(VAL) NPYV_IMPL_VEC_SPLTB(npyv_u8, (unsigned char)VAL) -#define npyv_setall_s8(VAL) NPYV_IMPL_VEC_SPLTB(npyv_s8, (signed char)VAL) -#define npyv_setall_u16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_u16, (unsigned short)VAL) -#define npyv_setall_s16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_s16, (short)VAL) -#define npyv_setall_u32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_u32, (unsigned int)VAL) -#define npyv_setall_s32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_s32, (int)VAL) +#define npyv_setall_u8(VAL) NPYV_IMPL_VEC_SPLTB(npyv_u8, (unsigned char)(VAL)) +#define npyv_setall_s8(VAL) NPYV_IMPL_VEC_SPLTB(npyv_s8, (signed char)(VAL)) +#define npyv_setall_u16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_u16, (unsigned short)(VAL)) +#define npyv_setall_s16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_s16, (short)(VAL)) +#define npyv_setall_u32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_u32, (unsigned int)(VAL)) +#define npyv_setall_s32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_s32, (int)(VAL)) #if NPY_SIMD_F32 - #define npyv_setall_f32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_f32, VAL) + #define npyv_setall_f32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_f32, (VAL)) #endif -#define npyv_setall_u64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_u64, (npy_uint64)VAL) -#define npyv_setall_s64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_s64, (npy_int64)VAL) +#define npyv_setall_u64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_u64, (npy_uint64)(VAL)) +#define npyv_setall_s64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_s64, (npy_int64)(VAL)) #define npyv_setall_f64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_f64, VAL) // vector with specific values set to each lane and // set a specific value to all remained lanes -#define npyv_setf_u8(FILL, ...) ((npyv_u8){NPYV__SET_FILL_16(char, FILL, __VA_ARGS__)}) -#define npyv_setf_s8(FILL, ...) ((npyv_s8){NPYV__SET_FILL_16(char, FILL, __VA_ARGS__)}) -#define npyv_setf_u16(FILL, ...) ((npyv_u16){NPYV__SET_FILL_8(short, FILL, __VA_ARGS__)}) +#define npyv_setf_u8(FILL, ...) ((npyv_u8){NPYV__SET_FILL_16(unsigned char, FILL, __VA_ARGS__)}) +#define npyv_setf_s8(FILL, ...) ((npyv_s8){NPYV__SET_FILL_16(signed char, FILL, __VA_ARGS__)}) +#define npyv_setf_u16(FILL, ...) ((npyv_u16){NPYV__SET_FILL_8(unsigned short, FILL, __VA_ARGS__)}) #define npyv_setf_s16(FILL, ...) ((npyv_s16){NPYV__SET_FILL_8(short, FILL, __VA_ARGS__)}) -#define npyv_setf_u32(FILL, ...) ((npyv_u32){NPYV__SET_FILL_4(int, FILL, __VA_ARGS__)}) +#define npyv_setf_u32(FILL, ...) ((npyv_u32){NPYV__SET_FILL_4(unsigned int, FILL, __VA_ARGS__)}) #define npyv_setf_s32(FILL, ...) ((npyv_s32){NPYV__SET_FILL_4(int, FILL, __VA_ARGS__)}) -#define npyv_setf_u64(FILL, ...) ((npyv_u64){NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__)}) +#define npyv_setf_u64(FILL, ...) ((npyv_u64){NPYV__SET_FILL_2(npy_uint64, FILL, __VA_ARGS__)}) #define npyv_setf_s64(FILL, ...) ((npyv_s64){NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__)}) #if NPY_SIMD_F32 #define npyv_setf_f32(FILL, ...) ((npyv_f32){NPYV__SET_FILL_4(float, FILL, __VA_ARGS__)}) diff --git a/numpy/core/src/common/simd/vec/reorder.h b/numpy/core/src/common/simd/vec/reorder.h index b60b9287d..3910980a2 100644 --- a/numpy/core/src/common/simd/vec/reorder.h +++ b/numpy/core/src/common/simd/vec/reorder.h @@ -68,6 +68,85 @@ NPYV_IMPL_VEC_COMBINE_ZIP(npyv_s64, s64) #endif NPYV_IMPL_VEC_COMBINE_ZIP(npyv_f64, f64) +// deinterleave two vectors +NPY_FINLINE npyv_u8x2 npyv_unzip_u8(npyv_u8 ab0, npyv_u8 ab1) +{ + const npyv_u8 idx_even = npyv_set_u8( + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + ); + const npyv_u8 idx_odd = npyv_set_u8( + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 + ); + npyv_u8x2 r; + r.val[0] = vec_perm(ab0, ab1, idx_even); + r.val[1] = vec_perm(ab0, ab1, idx_odd); + return r; +} +NPY_FINLINE npyv_s8x2 npyv_unzip_s8(npyv_s8 ab0, npyv_s8 ab1) +{ + npyv_u8x2 ru = npyv_unzip_u8((npyv_u8)ab0, (npyv_u8)ab1); + npyv_s8x2 r; + r.val[0] = (npyv_s8)ru.val[0]; + r.val[1] = (npyv_s8)ru.val[1]; + return r; +} +NPY_FINLINE npyv_u16x2 npyv_unzip_u16(npyv_u16 ab0, npyv_u16 ab1) +{ + const npyv_u8 idx_even = npyv_set_u8( + 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 + ); + const npyv_u8 idx_odd = npyv_set_u8( + 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 + ); + npyv_u16x2 r; + r.val[0] = vec_perm(ab0, ab1, idx_even); + r.val[1] = vec_perm(ab0, ab1, idx_odd); + return r; +} +NPY_FINLINE npyv_s16x2 npyv_unzip_s16(npyv_s16 ab0, npyv_s16 ab1) +{ + npyv_u16x2 ru = npyv_unzip_u16((npyv_u16)ab0, (npyv_u16)ab1); + npyv_s16x2 r; + r.val[0] = (npyv_s16)ru.val[0]; + r.val[1] = (npyv_s16)ru.val[1]; + return r; +} +NPY_FINLINE npyv_u32x2 npyv_unzip_u32(npyv_u32 ab0, npyv_u32 ab1) +{ + npyv_u32 m0 = vec_mergeh(ab0, ab1); + npyv_u32 m1 = vec_mergel(ab0, ab1); + npyv_u32 r0 = vec_mergeh(m0, m1); + npyv_u32 r1 = vec_mergel(m0, m1); + npyv_u32x2 r; + r.val[0] = r0; + r.val[1] = r1; + return r; +} +NPY_FINLINE npyv_s32x2 npyv_unzip_s32(npyv_s32 ab0, npyv_s32 ab1) +{ + npyv_u32x2 ru = npyv_unzip_u32((npyv_u32)ab0, (npyv_u32)ab1); + npyv_s32x2 r; + r.val[0] = (npyv_s32)ru.val[0]; + r.val[1] = (npyv_s32)ru.val[1]; + return r; +} +#if NPY_SIMD_F32 + NPY_FINLINE npyv_f32x2 npyv_unzip_f32(npyv_f32 ab0, npyv_f32 ab1) + { + npyv_u32x2 ru = npyv_unzip_u32((npyv_u32)ab0, (npyv_u32)ab1); + npyv_f32x2 r; + r.val[0] = (npyv_f32)ru.val[0]; + r.val[1] = (npyv_f32)ru.val[1]; + return r; + } +#endif +NPY_FINLINE npyv_u64x2 npyv_unzip_u64(npyv_u64 ab0, npyv_u64 ab1) +{ return npyv_combine_u64(ab0, ab1); } +NPY_FINLINE npyv_s64x2 npyv_unzip_s64(npyv_s64 ab0, npyv_s64 ab1) +{ return npyv_combine_s64(ab0, ab1); } +NPY_FINLINE npyv_f64x2 npyv_unzip_f64(npyv_f64 ab0, npyv_f64 ab1) +{ return npyv_combine_f64(ab0, ab1); } + // Reverse elements of each 64-bit lane NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a) { @@ -111,4 +190,24 @@ NPY_FINLINE npyv_s32 npyv_rev64_s32(npyv_s32 a) { return (npyv_f32)npyv_rev64_u32((npyv_u32)a); } #endif +// Permuting the elements of each 128-bit lane by immediate index for +// each element. +#define npyv_permi128_u32(A, E0, E1, E2, E3) \ + vec_perm(A, A, npyv_set_u8( \ + (E0<<2), (E0<<2)+1, (E0<<2)+2, (E0<<2)+3, \ + (E1<<2), (E1<<2)+1, (E1<<2)+2, (E1<<2)+3, \ + (E2<<2), (E2<<2)+1, (E2<<2)+2, (E2<<2)+3, \ + (E3<<2), (E3<<2)+1, (E3<<2)+2, (E3<<2)+3 \ + )) +#define npyv_permi128_s32 npyv_permi128_u32 +#define npyv_permi128_f32 npyv_permi128_u32 + +#if defined(__IBMC__) || defined(vec_permi) + #define npyv_permi128_u64(A, E0, E1) vec_permi(A, A, ((E0)<<1) | (E1)) +#else + #define npyv_permi128_u64(A, E0, E1) vec_xxpermdi(A, A, ((E0)<<1) | (E1)) +#endif +#define npyv_permi128_s64 npyv_permi128_u64 +#define npyv_permi128_f64 npyv_permi128_u64 + #endif // _NPY_SIMD_VEC_REORDER_H diff --git a/numpy/core/src/common/simd/vec/vec.h b/numpy/core/src/common/simd/vec/vec.h index abcd33ce1..1d4508669 100644 --- a/numpy/core/src/common/simd/vec/vec.h +++ b/numpy/core/src/common/simd/vec/vec.h @@ -39,8 +39,10 @@ #ifdef NPY_HAVE_VX #define NPY_SIMD_BIGENDIAN 1 + #define NPY_SIMD_CMPSIGNAL 0 #else #define NPY_SIMD_BIGENDIAN 0 + #define NPY_SIMD_CMPSIGNAL 1 #endif typedef __vector unsigned char npyv_u8; diff --git a/numpy/core/src/common/templ_common.h.src b/numpy/core/src/common/templ_common.h.src index 1eea8eb37..d0d1917fe 100644 --- a/numpy/core/src/common/templ_common.h.src +++ b/numpy/core/src/common/templ_common.h.src @@ -4,6 +4,7 @@ /* utility functions that profit from templates */ #include "numpy/npy_common.h" +#include <assert.h> /**begin repeat * #name = int, uint, long, ulong, @@ -24,7 +25,7 @@ * that is not checked. Could use absolute values and adjust the sign if that * functionality was desired. */ -static NPY_INLINE int +static inline int npy_mul_with_overflow_@name@(@type@ * r, @type@ a, @type@ b) { #ifdef HAVE___BUILTIN_MUL_OVERFLOW @@ -51,14 +52,14 @@ npy_mul_with_overflow_@name@(@type@ * r, @type@ a, @type@ b) } /**end repeat**/ -static NPY_INLINE int +static inline int npy_mul_sizes_with_overflow (npy_intp * r, npy_intp a, npy_intp b) { #ifdef HAVE___BUILTIN_MUL_OVERFLOW return __builtin_mul_overflow(a, b, r); #else - - assert(a >= 0 && b >= 0, "this function only supports non-negative numbers"); + /* this function only supports non-negative numbers */ + assert(a >= 0 && b >= 0); const npy_intp half_sz = ((npy_intp)1 << ((sizeof(a) * 8 - 1 ) / 2)); *r = a * b; diff --git a/numpy/core/src/common/ucsnarrow.h b/numpy/core/src/common/ucsnarrow.h index 6fe157199..4b17a2809 100644 --- a/numpy/core/src/common/ucsnarrow.h +++ b/numpy/core/src/common/ucsnarrow.h @@ -2,6 +2,6 @@ #define NUMPY_CORE_SRC_COMMON_NPY_UCSNARROW_H_ NPY_NO_EXPORT PyUnicodeObject * -PyUnicode_FromUCS4(char *src, Py_ssize_t size, int swap, int align); +PyUnicode_FromUCS4(char const *src, Py_ssize_t size, int swap, int align); #endif /* NUMPY_CORE_SRC_COMMON_NPY_UCSNARROW_H_ */ diff --git a/numpy/core/src/common/utils.hpp b/numpy/core/src/common/utils.hpp new file mode 100644 index 000000000..f847cab44 --- /dev/null +++ b/numpy/core/src/common/utils.hpp @@ -0,0 +1,51 @@ +#ifndef NUMPY_CORE_SRC_COMMON_UTILS_HPP +#define NUMPY_CORE_SRC_COMMON_UTILS_HPP + +#include "npdef.hpp" + +#if NP_HAS_CPP20 + #include <bit> +#endif + +#include <type_traits> +#include <string.h> + +namespace np { + +/** Create a value of type `To` from the bits of `from`. + * + * similar to `std::bit_cast` but compatible with C++17, + * should perform similar to `*reinterpret_cast<To*>(&from)` + * or through punning without expecting any undefined behaviors. + */ +template<typename To, typename From> +#if NP_HAS_BUILTIN(__builtin_bit_cast) || NP_HAS_CPP20 +[[nodiscard]] constexpr +#else +inline +#endif +To BitCast(const From &from) noexcept +{ + static_assert( + sizeof(To) == sizeof(From), + "both data types must have the same size"); + + static_assert( + std::is_trivially_copyable_v<To> && + std::is_trivially_copyable_v<From>, + "both data types must be trivially copyable"); + +#if NP_HAS_CPP20 + return std::bit_cast<To>(from); +#elif NP_HAS_BUILTIN(__builtin_bit_cast) + return __builtin_bit_cast(To, from); +#else + To to; + memcpy(&to, &from, sizeof(from)); + return to; +#endif +} + +} // namespace np +#endif // NUMPY_CORE_SRC_COMMON_UTILS_HPP + |