path: root/numpy/core/src/common
Diffstat (limited to 'numpy/core/src/common')
-rw-r--r--  numpy/core/src/common/common.hpp | 14
-rw-r--r--  numpy/core/src/common/dlpack/dlpack.h | 151
-rw-r--r--  numpy/core/src/common/float_status.hpp | 134
-rw-r--r--  numpy/core/src/common/get_attr_string.h | 6
-rw-r--r--  numpy/core/src/common/half.hpp | 264
-rw-r--r--  numpy/core/src/common/half_private.hpp | 330
-rw-r--r--  numpy/core/src/common/lowlevel_strided_loops.h | 18
-rw-r--r--  numpy/core/src/common/meta.hpp | 54
-rw-r--r--  numpy/core/src/common/npdef.hpp | 28
-rw-r--r--  numpy/core/src/common/npstd.hpp | 57
-rw-r--r--  numpy/core/src/common/npy_cblas.h | 2
-rw-r--r--  numpy/core/src/common/npy_config.h | 1
-rw-r--r--  numpy/core/src/common/npy_cpu_features.c | 160
-rw-r--r--  numpy/core/src/common/npy_cpu_features.h | 21
-rw-r--r--  numpy/core/src/common/npy_ctypes.h | 2
-rw-r--r--  numpy/core/src/common/npy_extint128.h | 30
-rw-r--r--  numpy/core/src/common/npy_hashtable.c | 4
-rw-r--r--  numpy/core/src/common/npy_import.h | 2
-rw-r--r--  numpy/core/src/common/npy_pycompat.h | 2
-rw-r--r--  numpy/core/src/common/npy_sort.h.src | 2
-rw-r--r--  numpy/core/src/common/numpyos.c | 22
-rw-r--r--  numpy/core/src/common/simd/avx2/arithmetic.h | 11
-rw-r--r--  numpy/core/src/common/simd/avx2/avx2.h | 1
-rw-r--r--  numpy/core/src/common/simd/avx2/memory.h | 374
-rw-r--r--  numpy/core/src/common/simd/avx2/operators.h | 2
-rw-r--r--  numpy/core/src/common/simd/avx2/reorder.h | 87
-rw-r--r--  numpy/core/src/common/simd/avx512/arithmetic.h | 4
-rw-r--r--  numpy/core/src/common/simd/avx512/avx512.h | 1
-rw-r--r--  numpy/core/src/common/simd/avx512/maskop.h | 13
-rw-r--r--  numpy/core/src/common/simd/avx512/memory.h | 317
-rw-r--r--  numpy/core/src/common/simd/avx512/reorder.h | 152
-rw-r--r--  numpy/core/src/common/simd/emulate_maskop.h | 34
-rw-r--r--  numpy/core/src/common/simd/neon/arithmetic.h | 13
-rw-r--r--  numpy/core/src/common/simd/neon/math.h | 149
-rw-r--r--  numpy/core/src/common/simd/neon/memory.h | 293
-rw-r--r--  numpy/core/src/common/simd/neon/neon.h | 1
-rw-r--r--  numpy/core/src/common/simd/neon/operators.h | 29
-rw-r--r--  numpy/core/src/common/simd/neon/reorder.h | 120
-rw-r--r--  numpy/core/src/common/simd/simd.h | 3
-rw-r--r--  numpy/core/src/common/simd/sse/arithmetic.h | 30
-rw-r--r--  numpy/core/src/common/simd/sse/math.h | 183
-rw-r--r--  numpy/core/src/common/simd/sse/memory.h | 314
-rw-r--r--  numpy/core/src/common/simd/sse/reorder.h | 87
-rw-r--r--  numpy/core/src/common/simd/sse/sse.h | 1
-rw-r--r--  numpy/core/src/common/simd/vec/arithmetic.h | 17
-rw-r--r--  numpy/core/src/common/simd/vec/conversion.h | 11
-rw-r--r--  numpy/core/src/common/simd/vec/math.h | 2
-rw-r--r--  numpy/core/src/common/simd/vec/memory.h | 271
-rw-r--r--  numpy/core/src/common/simd/vec/misc.h | 28
-rw-r--r--  numpy/core/src/common/simd/vec/reorder.h | 99
-rw-r--r--  numpy/core/src/common/simd/vec/vec.h | 2
-rw-r--r--  numpy/core/src/common/templ_common.h.src | 9
-rw-r--r--  numpy/core/src/common/ucsnarrow.h | 2
-rw-r--r--  numpy/core/src/common/utils.hpp | 51
54 files changed, 3635 insertions(+), 380 deletions(-)
diff --git a/numpy/core/src/common/common.hpp b/numpy/core/src/common/common.hpp
new file mode 100644
index 000000000..44ba449d8
--- /dev/null
+++ b/numpy/core/src/common/common.hpp
@@ -0,0 +1,14 @@
+#ifndef NUMPY_CORE_SRC_COMMON_COMMON_HPP
+#define NUMPY_CORE_SRC_COMMON_COMMON_HPP
+/*
+ * The following C++ headers are safe to use standalone; however, they are
+ * gathered here to make it easy to adopt precompiled headers (PCH) in the future.
+ */
+#include "npdef.hpp"
+#include "utils.hpp"
+#include "npstd.hpp"
+#include "half.hpp"
+#include "meta.hpp"
+#include "float_status.hpp"
+
+#endif // NUMPY_CORE_SRC_COMMON_COMMON_HPP
diff --git a/numpy/core/src/common/dlpack/dlpack.h b/numpy/core/src/common/dlpack/dlpack.h
index 29209aee1..f0cbf6136 100644
--- a/numpy/core/src/common/dlpack/dlpack.h
+++ b/numpy/core/src/common/dlpack/dlpack.h
@@ -1,5 +1,5 @@
// Taken from:
-// https://github.com/dmlc/dlpack/blob/9b6176fdecb55e9bf39b16f08b96913ed3f275b4/include/dlpack/dlpack.h
+// https://github.com/dmlc/dlpack/blob/ca4d00ad3e2e0f410eeab3264d21b8a39397f362/include/dlpack/dlpack.h
/*!
* Copyright (c) 2017 by Contributors
* \file dlpack.h
@@ -8,14 +8,20 @@
#ifndef DLPACK_DLPACK_H_
#define DLPACK_DLPACK_H_
+/**
+ * \brief Compatibility with C++
+ */
#ifdef __cplusplus
#define DLPACK_EXTERN_C extern "C"
#else
#define DLPACK_EXTERN_C
#endif
-/*! \brief The current version of dlpack */
-#define DLPACK_VERSION 050
+/*! \brief The current major version of dlpack */
+#define DLPACK_MAJOR_VERSION 1
+
+/*! \brief The current minor version of dlpack */
+#define DLPACK_MINOR_VERSION 0
/*! \brief DLPACK_DLL prefix for windows */
#ifdef _WIN32
@@ -34,10 +40,41 @@
#ifdef __cplusplus
extern "C" {
#endif
+
+/*!
+ * \brief The DLPack version.
+ *
+ * A change in major version indicates that we have changed the
+ * data layout of the ABI - DLManagedTensorVersioned.
+ *
+ * A change in minor version indicates that we have added new
+ * code, such as a new device type, but the ABI is kept the same.
+ *
+ * If an obtained DLPack tensor has a major version that disagrees
+ * with the version number specified in this header file
+ * (i.e. major != DLPACK_MAJOR_VERSION), the consumer must call the deleter
+ * (and it is safe to do so). It is not safe to access any other fields
+ * as the memory layout will have changed.
+ *
+ * In the case of a minor version mismatch, the tensor can be safely used as
+ * long as the consumer knows how to interpret all fields. Minor version
+ * updates indicate the addition of enumeration values.
+ */
+typedef struct {
+ /*! \brief DLPack major version. */
+ uint32_t major;
+ /*! \brief DLPack minor version. */
+ uint32_t minor;
+} DLPackVersion;
+
/*!
* \brief The device type in DLDevice.
*/
+#ifdef __cplusplus
+typedef enum : int32_t {
+#else
typedef enum {
+#endif
/*! \brief CPU device */
kDLCPU = 1,
/*! \brief CUDA GPU device */
@@ -70,6 +107,17 @@ typedef enum {
* \brief CUDA managed/unified memory allocated by cudaMallocManaged
*/
kDLCUDAManaged = 13,
+ /*!
+   * \brief Unified shared memory allocated on a oneAPI non-partitioned
+   * device. A call to the oneAPI runtime is required to determine the device
+   * type, the USM allocation type, and the SYCL context it is bound to.
+ *
+ */
+ kDLOneAPI = 14,
+ /*! \brief GPU support for next generation WebGPU standard. */
+ kDLWebGPU = 15,
+ /*! \brief Qualcomm Hexagon DSP */
+ kDLHexagon = 16,
} DLDeviceType;
/*!
@@ -82,7 +130,7 @@ typedef struct {
* \brief The device index.
* For vanilla CPU memory, pinned memory, or managed memory, this is set to 0.
*/
- int device_id;
+ int32_t device_id;
} DLDevice;
/*!
@@ -107,16 +155,21 @@ typedef enum {
* (C/C++/Python layout: compact struct per complex number)
*/
kDLComplex = 5U,
+ /*! \brief boolean */
+ kDLBool = 6U,
} DLDataTypeCode;
/*!
- * \brief The data type the tensor can hold.
+ * \brief The data type the tensor can hold. The data type is assumed to follow the
+ * native endianness. An explicit error message should be raised when attempting to
+ * export an array with non-native endianness.
*
* Examples
- * - float: type_code = 2, bits = 32, lanes=1
- * - float4(vectorized 4 float): type_code = 2, bits = 32, lanes=4
- * - int8: type_code = 0, bits = 8, lanes=1
+ * - float: type_code = 2, bits = 32, lanes = 1
+ * - float4(vectorized 4 float): type_code = 2, bits = 32, lanes = 4
+ * - int8: type_code = 0, bits = 8, lanes = 1
* - std::complex<float>: type_code = 5, bits = 64, lanes = 1
+ * - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library convention, the underlying storage size of bool is 8 bits)
*/
typedef struct {
/*!
@@ -138,9 +191,16 @@ typedef struct {
*/
typedef struct {
/*!
- * \brief The opaque data pointer points to the allocated data. This will be
- * CUDA device pointer or cl_mem handle in OpenCL. This pointer is always
- * aligned to 256 bytes as in CUDA.
+ * \brief The data pointer points to the allocated data. This will be CUDA
+ * device pointer or cl_mem handle in OpenCL. It may be opaque on some device
+ * types. This pointer is always aligned to 256 bytes as in CUDA. The
+ * `byte_offset` field should be used to point to the beginning of the data.
+ *
+   * Note that as of Nov 2021, multiple libraries (CuPy, PyTorch, TensorFlow,
+ * TVM, perhaps others) do not adhere to this 256 byte alignment requirement
+ * on CPU/CUDA/ROCm, and always use `byte_offset=0`. This must be fixed
+ * (after which this note will be updated); at the moment it is recommended
+ * to not rely on the data pointer being correctly aligned.
*
* For given DLTensor, the size of memory required to store the contents of
* data is calculated as follows:
@@ -160,7 +220,7 @@ typedef struct {
/*! \brief The device of the tensor */
DLDevice device;
/*! \brief Number of dimensions */
- int ndim;
+ int32_t ndim;
/*! \brief The data type of the pointer*/
DLDataType dtype;
/*! \brief The shape of the tensor */
@@ -180,6 +240,13 @@ typedef struct {
* not meant to transfer the tensor. When the borrowing framework doesn't need
* the tensor, it should call the deleter to notify the host that the resource
* is no longer needed.
+ *
+ * \note This data structure is used as Legacy DLManagedTensor
+ * in DLPack exchange and is deprecated after DLPack v0.8.
+ * Use DLManagedTensorVersioned instead.
+ * This data structure may get renamed or deleted in future versions.
+ *
+ * \sa DLManagedTensorVersioned
*/
typedef struct DLManagedTensor {
/*! \brief DLTensor which is being memory managed */
@@ -188,13 +255,65 @@ typedef struct DLManagedTensor {
* which DLManagedTensor is used in the framework. It can also be NULL.
*/
void * manager_ctx;
- /*! \brief Destructor signature void (*)(void*) - this should be called
- * to destruct manager_ctx which holds the DLManagedTensor. It can be NULL
- * if there is no way for the caller to provide a reasonable destructor.
- * The destructors deletes the argument self as well.
+ /*!
+ * \brief Destructor - this should be called
+ * to destruct the manager_ctx which backs the DLManagedTensor. It can be
+ * NULL if there is no way for the caller to provide a reasonable destructor.
+   * The destructor deletes the argument self as well.
*/
void (*deleter)(struct DLManagedTensor * self);
} DLManagedTensor;
+
+// bit masks used in the DLManagedTensorVersioned
+
+/*! \brief bit mask to indicate that the tensor is read only. */
+#define DLPACK_FLAG_BITMASK_READ_ONLY (1UL << 0UL)
+
+/*!
+ * \brief A versioned and managed C Tensor object, manage memory of DLTensor.
+ *
+ * This data structure is intended to facilitate the borrowing of DLTensor by
+ * another framework. It is not meant to transfer the tensor. When the borrowing
+ * framework doesn't need the tensor, it should call the deleter to notify the
+ * host that the resource is no longer needed.
+ *
+ * \note This is the current standard DLPack exchange data structure.
+ */
+struct DLManagedTensorVersioned {
+ /*!
+ * \brief The API and ABI version of the current managed Tensor
+ */
+ DLPackVersion version;
+ /*!
+ * \brief the context of the original host framework.
+ *
+   * Stores the context in which DLManagedTensorVersioned is used in the
+   * framework. It can also be NULL.
+ */
+ void *manager_ctx;
+ /*!
+ * \brief Destructor.
+ *
+ * This should be called to destruct manager_ctx which holds the DLManagedTensorVersioned.
+ * It can be NULL if there is no way for the caller to provide a reasonable
+   * destructor. The destructor deletes the argument self as well.
+ */
+ void (*deleter)(struct DLManagedTensorVersioned *self);
+ /*!
+ * \brief Additional bitmask flags information about the tensor.
+ *
+ * By default the flags should be set to 0.
+ *
+   * \note Future ABI changes should keep everything up to this field
+   * stable, to ensure that the deleter can be correctly called.
+ *
+ * \sa DLPACK_FLAG_BITMASK_READ_ONLY
+ */
+ uint64_t flags;
+ /*! \brief DLTensor which is being memory managed */
+ DLTensor dl_tensor;
+};
+
#ifdef __cplusplus
} // DLPACK_EXTERN_C
#endif
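
The comments above spell out the consumer-side contract for `DLManagedTensorVersioned`: on a major-version mismatch only the deleter may be called, and the read-only bit must be honored. A minimal C++ sketch of such a check (the `consume_versioned` helper is hypothetical and not part of this header):

    #include <cstdio>
    // Assumes the dlpack.h declarations above are in scope.
    static int consume_versioned(DLManagedTensorVersioned *t)
    {
        if (t->version.major != DLPACK_MAJOR_VERSION) {
            // Incompatible ABI: only the deleter is safe to call.
            if (t->deleter != nullptr) {
                t->deleter(t);
            }
            return -1;
        }
        if (t->flags & DLPACK_FLAG_BITMASK_READ_ONLY) {
            std::printf("tensor is read-only\n");
        }
        // ... read t->dl_tensor here, then release the borrowed tensor:
        if (t->deleter != nullptr) {
            t->deleter(t);
        }
        return 0;
    }
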
diff --git a/numpy/core/src/common/float_status.hpp b/numpy/core/src/common/float_status.hpp
new file mode 100644
index 000000000..8e4d5e06a
--- /dev/null
+++ b/numpy/core/src/common/float_status.hpp
@@ -0,0 +1,134 @@
+#ifndef NUMPY_CORE_SRC_COMMON_FLOAT_STATUS_HPP
+#define NUMPY_CORE_SRC_COMMON_FLOAT_STATUS_HPP
+
+#include "npstd.hpp"
+
+#include <fenv.h>
+
+namespace np {
+
+/// @addtogroup cpp_core_utility
+/// @{
+/**
+ * Class wraps floating-point environment operations,
+ * provides lazy access to its functionality.
+ */
+class FloatStatus {
+ public:
+/*
+ * According to the C99 standard, FE_DIVBYZERO etc. may not be provided when
+ * unsupported. In such cases NumPy will not report these correctly, but we
+ * should still allow compiling (whether tests pass or not).
+ * By defining them as 0 locally, we make them no-ops. Note that `musl`, for
+ * example, still defines all of the fenv functions (as no-ops):
+ * https://git.musl-libc.org/cgit/musl/tree/src/fenv/fenv.c
+ * and does a similar replacement in its tests:
+ * http://nsz.repo.hu/git/?p=libc-test;a=blob;f=src/common/mtest.h;h=706c1ba23ea8989b17a2f72ed1a919e187c06b6a;hb=HEAD#l30
+ */
+#ifdef FE_DIVBYZERO
+ static constexpr int kDivideByZero = FE_DIVBYZERO;
+#else
+ static constexpr int kDivideByZero = 0;
+#endif
+#ifdef FE_INVALID
+ static constexpr int kInvalid = FE_INVALID;
+#else
+ static constexpr int kInvalid = 0;
+#endif
+#ifdef FE_INEXACT
+ static constexpr int kInexact = FE_INEXACT;
+#else
+ static constexpr int kInexact = 0;
+#endif
+#ifdef FE_OVERFLOW
+ static constexpr int kOverflow = FE_OVERFLOW;
+#else
+ static constexpr int kOverflow = 0;
+#endif
+#ifdef FE_UNDERFLOW
+ static constexpr int kUnderflow = FE_UNDERFLOW;
+#else
+ static constexpr int kUnderflow = 0;
+#endif
+ static constexpr int kAllExcept = (kDivideByZero | kInvalid | kInexact |
+ kOverflow | kUnderflow);
+
+ FloatStatus(bool clear_on_dst=true)
+ : clear_on_dst_(clear_on_dst)
+ {
+ if constexpr (kAllExcept != 0) {
+ fpstatus_ = fetestexcept(kAllExcept);
+ }
+ else {
+ fpstatus_ = 0;
+ }
+ }
+ ~FloatStatus()
+ {
+ if constexpr (kAllExcept != 0) {
+ if (fpstatus_ != 0 && clear_on_dst_) {
+ feclearexcept(kAllExcept);
+ }
+ }
+ }
+ constexpr bool IsDivideByZero() const
+ {
+ return (fpstatus_ & kDivideByZero) != 0;
+ }
+ constexpr bool IsInexact() const
+ {
+ return (fpstatus_ & kInexact) != 0;
+ }
+ constexpr bool IsInvalid() const
+ {
+ return (fpstatus_ & kInvalid) != 0;
+ }
+ constexpr bool IsOverFlow() const
+ {
+ return (fpstatus_ & kOverflow) != 0;
+ }
+ constexpr bool IsUnderFlow() const
+ {
+ return (fpstatus_ & kUnderflow) != 0;
+ }
+ static void RaiseDivideByZero()
+ {
+ if constexpr (kDivideByZero != 0) {
+ feraiseexcept(kDivideByZero);
+ }
+ }
+ static void RaiseInexact()
+ {
+ if constexpr (kInexact != 0) {
+ feraiseexcept(kInexact);
+ }
+ }
+ static void RaiseInvalid()
+ {
+ if constexpr (kInvalid != 0) {
+ feraiseexcept(kInvalid);
+ }
+ }
+ static void RaiseOverflow()
+ {
+ if constexpr (kOverflow != 0) {
+ feraiseexcept(kOverflow);
+ }
+ }
+ static void RaiseUnderflow()
+ {
+ if constexpr (kUnderflow != 0) {
+ feraiseexcept(kUnderflow);
+ }
+ }
+
+ private:
+ bool clear_on_dst_;
+ int fpstatus_;
+};
+
+/// @} cpp_core_utility
+} // namespace np
+
+#endif // NUMPY_CORE_SRC_COMMON_FLOAT_STATUS_HPP
+
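
A rough usage sketch for the class above (assuming `float_status.hpp` as added in this patch): the constructor snapshots the currently raised FP exceptions, the `Is*()` accessors test that snapshot, and the destructor clears the flags when `clear_on_dst` is true.

    #include "float_status.hpp"
    #include <cstdio>

    static void report_div(double a, double b)
    {
        volatile double r = a / b;   // may raise FE_DIVBYZERO
        (void)r;
        np::FloatStatus st;          // captures raised flags; clears them on destruction
        if (st.IsDivideByZero()) {
            std::printf("divide-by-zero was raised\n");
        }
    }
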
diff --git a/numpy/core/src/common/get_attr_string.h b/numpy/core/src/common/get_attr_string.h
index 90eca5ee6..36d39189f 100644
--- a/numpy/core/src/common/get_attr_string.h
+++ b/numpy/core/src/common/get_attr_string.h
@@ -4,7 +4,7 @@
#include <Python.h>
#include "ufunc_object.h"
-static NPY_INLINE npy_bool
+static inline npy_bool
_is_basic_python_type(PyTypeObject *tp)
{
return (
@@ -46,7 +46,7 @@ _is_basic_python_type(PyTypeObject *tp)
*
* In future, could be made more like _Py_LookupSpecial
*/
-static NPY_INLINE PyObject *
+static inline PyObject *
PyArray_LookupSpecial(PyObject *obj, PyObject *name_unicode)
{
PyTypeObject *tp = Py_TYPE(obj);
@@ -73,7 +73,7 @@ PyArray_LookupSpecial(PyObject *obj, PyObject *name_unicode)
*
* Kept for backwards compatibility. In future, we should deprecate this.
*/
-static NPY_INLINE PyObject *
+static inline PyObject *
PyArray_LookupSpecial_OnInstance(PyObject *obj, PyObject *name_unicode)
{
PyTypeObject *tp = Py_TYPE(obj);
diff --git a/numpy/core/src/common/half.hpp b/numpy/core/src/common/half.hpp
new file mode 100644
index 000000000..4d16e3bcc
--- /dev/null
+++ b/numpy/core/src/common/half.hpp
@@ -0,0 +1,264 @@
+#ifndef NUMPY_CORE_SRC_COMMON_HALF_HPP
+#define NUMPY_CORE_SRC_COMMON_HALF_HPP
+
+#include "npstd.hpp"
+
+#include "npy_cpu_dispatch.h" // NPY_HAVE_CPU_FEATURES
+#include "half_private.hpp"
+
+// TODO(@seiko2plus):
+// - cover the half-precision operations currently supported by numpy/halffloat.h
+// - add support for arithmetic operations
+// - enabling __fp16 causes massive FP exceptions on aarch64;
+//   this needs a deeper investigation
+
+namespace np {
+
+/// @addtogroup cpp_core_types
+/// @{
+
+/// Provides a type that implements 16-bit floating point (half-precision).
+/// This type is guaranteed to be 16 bits in size.
+#if 1 // ndef __ARM_FP16_FORMAT_IEEE
+class Half final {
+ public:
+ /// Whether `Half` has a full native HW support.
+ static constexpr bool kNative = false;
+ /// Whether `Half` has a native HW support for single/double conversion.
+ template<typename T>
+ static constexpr bool kNativeConversion = (
+ (
+ std::is_same_v<T, float> &&
+ #if defined(NPY_HAVE_FP16) || defined(NPY_HAVE_VSX3)
+ true
+ #else
+ false
+ #endif
+ ) || (
+ std::is_same_v<T, double> &&
+ #if defined(NPY_HAVE_AVX512FP16) || defined(NPY_HAVE_VSX3)
+ true
+ #else
+ false
+ #endif
+ )
+ );
+
+    /// Default constructor; initializes nothing.
+ Half() = default;
+
+    /// Construct from float.
+    /// If no hardware optimization is available, rounding will always
+    /// be ties to even.
+ explicit Half(float f)
+ {
+ #if defined(NPY_HAVE_FP16)
+ __m128 mf = _mm_load_ss(&f);
+ bits_ = static_cast<uint16_t>(_mm_cvtsi128_si32(_mm_cvtps_ph(mf, _MM_FROUND_TO_NEAREST_INT)));
+ #elif defined(NPY_HAVE_VSX3) && defined(NPY_HAVE_VSX_ASM)
+ __vector float vf32 = vec_splats(f);
+ __vector unsigned short vf16;
+ __asm__ __volatile__ ("xvcvsphp %x0,%x1" : "=wa" (vf16) : "wa" (vf32));
+ bits_ = vec_extract(vf16, 0);
+ #else
+ bits_ = half_private::FromFloatBits(BitCast<uint32_t>(f));
+ #endif
+ }
+
+ /// Construct from double.
+    /// If no hardware optimization is available, rounding will always
+    /// be ties to even.
+ explicit Half(double f)
+ {
+ #if defined(NPY_HAVE_AVX512FP16)
+ __m128d md = _mm_load_sd(&f);
+ bits_ = static_cast<uint16_t>(_mm_cvtsi128_si32(_mm_castph_si128(_mm_cvtpd_ph(md))));
+ #elif defined(NPY_HAVE_VSX3) && defined(NPY_HAVE_VSX_ASM)
+ __vector double vf64 = vec_splats(f);
+ __vector unsigned short vf16;
+ __asm__ __volatile__ ("xvcvdphp %x0,%x1" : "=wa" (vf16) : "wa" (vf64));
+ bits_ = vec_extract(vf16, 0);
+ #else
+ bits_ = half_private::FromDoubleBits(BitCast<uint64_t>(f));
+ #endif
+ }
+
+ /// Cast to float
+ explicit operator float() const
+ {
+ #if defined(NPY_HAVE_FP16)
+ float ret;
+ _mm_store_ss(&ret, _mm_cvtph_ps(_mm_cvtsi32_si128(bits_)));
+ return ret;
+ #elif defined(NPY_HAVE_VSX3) && defined(vec_extract_fp_from_shorth)
+ return vec_extract(vec_extract_fp_from_shorth(vec_splats(bits_)), 0);
+ #elif defined(NPY_HAVE_VSX3) && defined(NPY_HAVE_VSX_ASM)
+ __vector float vf32;
+ __asm__ __volatile__("xvcvhpsp %x0,%x1"
+ : "=wa"(vf32)
+ : "wa"(vec_splats(bits_.u)));
+ return vec_extract(vf32, 0);
+ #else
+ return BitCast<float>(half_private::ToFloatBits(bits_));
+ #endif
+ }
+
+ /// Cast to double
+ explicit operator double() const
+ {
+ #if defined(NPY_HAVE_AVX512FP16)
+ double ret;
+ _mm_store_sd(&ret, _mm_cvtph_pd(_mm_castsi128_ph(_mm_cvtsi32_si128(bits_))));
+ return ret;
+ #elif defined(NPY_HAVE_VSX3) && defined(NPY_HAVE_VSX_ASM)
+        __vector double vf64;
+        __asm__ __volatile__("xvcvhpdp %x0,%x1"
+                             : "=wa"(vf64)
+ : "wa"(vec_splats(bits_)));
+ return vec_extract(vf64, 0);
+ #else
+ return BitCast<double>(half_private::ToDoubleBits(bits_));
+ #endif
+ }
+
+    /// Returns a new Half constructed from the IEEE 754 binary16.
+ static constexpr Half FromBits(uint16_t bits)
+ {
+ Half h{};
+ h.bits_ = bits;
+ return h;
+ }
+ /// Returns the IEEE 754 binary16 representation.
+ constexpr uint16_t Bits() const
+ {
+ return bits_;
+ }
+
+    /// @name Comparison operators (ordered)
+ /// @{
+ constexpr bool operator==(Half r) const
+ {
+ return !(IsNaN() || r.IsNaN()) && Equal(r);
+ }
+ constexpr bool operator<(Half r) const
+ {
+ return !(IsNaN() || r.IsNaN()) && Less(r);
+ }
+ constexpr bool operator<=(Half r) const
+ {
+ return !(IsNaN() || r.IsNaN()) && LessEqual(r);
+ }
+ constexpr bool operator>(Half r) const
+ {
+ return r < *this;
+ }
+ constexpr bool operator>=(Half r) const
+ {
+ return r <= *this;
+ }
+ /// @}
+
+    /// @name Comparison operators (unordered)
+ /// @{
+ constexpr bool operator!=(Half r) const
+ {
+ return !(*this == r);
+ }
+ /// @} Comparison operators
+
+ /// @name Comparison with no guarantee of NaN behavior
+ /// @{
+ constexpr bool Less(Half r) const
+ {
+ uint_fast16_t a = static_cast<uint_fast16_t>(bits_),
+ b = static_cast<uint_fast16_t>(r.bits_);
+ bool sign_a = (a & 0x8000u) == 0x8000u;
+ bool sign_b = (b & 0x8000u) == 0x8000u;
+        // if both `a` and `b` have the same sign
+        //     test if `a` > `b` when `a` has the sign bit set,
+        //     or `a` < `b` when it does not,
+        //     and make sure they are not equal to each other
+        //     in case both are equal to +-0
+        // else
+        //     test that `a` has the sign bit set
+        //     and that `a`, `b` are not -0.0 and +0.0
+ return (sign_a == sign_b) ? (sign_a ^ (a < b)) && (a != b)
+ : sign_a && ((a | b) != 0x8000u);
+ }
+ constexpr bool LessEqual(Half r) const
+ {
+ uint_fast16_t a = static_cast<uint_fast16_t>(bits_),
+ b = static_cast<uint_fast16_t>(r.bits_);
+ bool sign_a = (a & 0x8000u) == 0x8000u;
+ bool sign_b = (b & 0x8000u) == 0x8000u;
+        // if both `a` and `b` have the same sign
+        //     test if `a` > `b` when `a` has the sign bit set,
+        //     or `a` < `b` when it does not,
+        //     or a == b (needed even if we used <= above instead,
+        //     since testing +-0 is still required)
+        // else
+        //     test that `a` has the sign bit set,
+        //     or that `a` and `b` are both equal to +-0.0
+ return (sign_a == sign_b) ? (sign_a ^ (a < b)) || (a == b)
+ : sign_a || ((a | b) == 0x8000u);
+ }
+ constexpr bool Equal(Half r) const
+ {
+        // a fast16 cast is not worth it, since an unpack op would be involved.
+ uint16_t a = bits_, b = r.bits_;
+ return a == b || ((a | b) == 0x8000u);
+ }
+ /// @} Comparison
+
+ /// @name Properties
+    /// @{
+ constexpr bool IsNaN() const
+ {
+ return ((bits_ & 0x7c00u) == 0x7c00u) &&
+ ((bits_ & 0x03ffu) != 0);
+ }
+ /// @} Properties
+
+ private:
+ uint16_t bits_;
+};
+#else // __ARM_FP16_FORMAT_IEEE
+class Half final {
+ public:
+ static constexpr bool kNative = true;
+ template<typename T>
+ static constexpr bool kNativeConversion = (
+ std::is_same_v<T, float> || std::is_same_v<T, double>
+ );
+ Half() = default;
+ constexpr Half(__fp16 h) : half_(h)
+ {}
+ constexpr operator __fp16() const
+ { return half_; }
+ static Half FromBits(uint16_t bits)
+ {
+ Half h;
+ h.half_ = BitCast<__fp16>(bits);
+ return h;
+ }
+ uint16_t Bits() const
+ { return BitCast<uint16_t>(half_); }
+ constexpr bool Less(Half r) const
+ { return half_ < r.half_; }
+ constexpr bool LessEqual(Half r) const
+ { return half_ <= r.half_; }
+ constexpr bool Equal(Half r) const
+ { return half_ == r.half_; }
+ constexpr bool IsNaN() const
+ { return half_ != half_; }
+
+ private:
+ __fp16 half_;
+};
+#endif // __ARM_FP16_FORMAT_IEEE
+
+/// @} cpp_core_types
+
+} // namespace np
+
+#endif // NUMPY_CORE_SRC_COMMON_HALF_HPP
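
A short usage sketch for the software path of `Half` above (no native `__fp16`); the bit patterns follow directly from IEEE 754 binary16, and the ordered comparison operators return false whenever either operand is NaN:

    #include "half.hpp"
    #include <cassert>

    static void half_demo()
    {
        np::Half one(1.0f);                        // round-to-nearest-even conversion
        np::Half nan = np::Half::FromBits(0x7e00); // a quiet NaN bit pattern
        assert(one.Bits() == 0x3c00);              // binary16 encoding of 1.0
        assert(static_cast<float>(one) == 1.0f);
        assert(!(nan == nan));                     // ordered: false for NaN operands
        assert(nan != nan);                        // unordered: true for NaN operands
        // -0.0 and +0.0 compare equal
        assert(np::Half::FromBits(0x8000) == np::Half::FromBits(0x0000));
    }
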
diff --git a/numpy/core/src/common/half_private.hpp b/numpy/core/src/common/half_private.hpp
new file mode 100644
index 000000000..7a64eb397
--- /dev/null
+++ b/numpy/core/src/common/half_private.hpp
@@ -0,0 +1,330 @@
+#ifndef NUMPY_CORE_SRC_COMMON_HALF_PRIVATE_HPP
+#define NUMPY_CORE_SRC_COMMON_HALF_PRIVATE_HPP
+
+#include "npstd.hpp"
+#include "float_status.hpp"
+
+/*
+ * The following functions, which emulate float/double/half conversions,
+ * are copied from npymath without any changes to their functionality.
+ */
+namespace np { namespace half_private {
+
+template<bool gen_overflow=true, bool gen_underflow=true, bool round_even=true>
+inline uint16_t FromFloatBits(uint32_t f)
+{
+ uint32_t f_exp, f_sig;
+ uint16_t h_sgn, h_exp, h_sig;
+
+ h_sgn = (uint16_t) ((f&0x80000000u) >> 16);
+ f_exp = (f&0x7f800000u);
+
+ /* Exponent overflow/NaN converts to signed inf/NaN */
+ if (f_exp >= 0x47800000u) {
+ if (f_exp == 0x7f800000u) {
+ /* Inf or NaN */
+ f_sig = (f&0x007fffffu);
+ if (f_sig != 0) {
+ /* NaN - propagate the flag in the significand... */
+ uint16_t ret = (uint16_t) (0x7c00u + (f_sig >> 13));
+ /* ...but make sure it stays a NaN */
+ if (ret == 0x7c00u) {
+ ret++;
+ }
+ return h_sgn + ret;
+ } else {
+ /* signed inf */
+ return (uint16_t) (h_sgn + 0x7c00u);
+ }
+ } else {
+ if constexpr (gen_overflow) {
+ /* overflow to signed inf */
+ FloatStatus::RaiseOverflow();
+ }
+ return (uint16_t) (h_sgn + 0x7c00u);
+ }
+ }
+
+ /* Exponent underflow converts to a subnormal half or signed zero */
+ if (f_exp <= 0x38000000u) {
+ /*
+ * Signed zeros, subnormal floats, and floats with small
+ * exponents all convert to signed zero half-floats.
+ */
+ if (f_exp < 0x33000000u) {
+ if constexpr (gen_underflow) {
+ /* If f != 0, it underflowed to 0 */
+ if ((f&0x7fffffff) != 0) {
+ FloatStatus::RaiseUnderflow();
+ }
+ }
+ return h_sgn;
+ }
+ /* Make the subnormal significand */
+ f_exp >>= 23;
+ f_sig = (0x00800000u + (f&0x007fffffu));
+ if constexpr (gen_underflow) {
+ /* If it's not exactly represented, it underflowed */
+ if ((f_sig&(((uint32_t)1 << (126 - f_exp)) - 1)) != 0) {
+ FloatStatus::RaiseUnderflow();
+ }
+ }
+ /*
+ * Usually the significand is shifted by 13. For subnormals an
+ * additional shift needs to occur. This shift is one for the largest
+ * exponent giving a subnormal `f_exp = 0x38000000 >> 23 = 112`, which
+ * offsets the new first bit. At most the shift can be 1+10 bits.
+ */
+ f_sig >>= (113 - f_exp);
+ /* Handle rounding by adding 1 to the bit beyond half precision */
+ if constexpr (round_even) {
+ /*
+ * If the last bit in the half significand is 0 (already even), and
+ * the remaining bit pattern is 1000...0, then we do not add one
+ * to the bit after the half significand. However, the (113 - f_exp)
+ * shift can lose up to 11 bits, so the || checks them in the original.
+ * In all other cases, we can just add one.
+ */
+ if (((f_sig&0x00003fffu) != 0x00001000u) || (f&0x000007ffu)) {
+ f_sig += 0x00001000u;
+ }
+ }
+ else {
+ f_sig += 0x00001000u;
+ }
+ h_sig = (uint16_t) (f_sig >> 13);
+ /*
+ * If the rounding causes a bit to spill into h_exp, it will
+ * increment h_exp from zero to one and h_sig will be zero.
+ * This is the correct result.
+ */
+ return (uint16_t) (h_sgn + h_sig);
+ }
+
+ /* Regular case with no overflow or underflow */
+ h_exp = (uint16_t) ((f_exp - 0x38000000u) >> 13);
+ /* Handle rounding by adding 1 to the bit beyond half precision */
+ f_sig = (f&0x007fffffu);
+ if constexpr (round_even) {
+ /*
+ * If the last bit in the half significand is 0 (already even), and
+ * the remaining bit pattern is 1000...0, then we do not add one
+ * to the bit after the half significand. In all other cases, we do.
+ */
+ if ((f_sig&0x00003fffu) != 0x00001000u) {
+ f_sig += 0x00001000u;
+ }
+ }
+ else {
+ f_sig += 0x00001000u;
+ }
+ h_sig = (uint16_t) (f_sig >> 13);
+ /*
+ * If the rounding causes a bit to spill into h_exp, it will
+ * increment h_exp by one and h_sig will be zero. This is the
+ * correct result. h_exp may increment to 15, at greatest, in
+ * which case the result overflows to a signed inf.
+ */
+ if constexpr (gen_overflow) {
+ h_sig += h_exp;
+ if (h_sig == 0x7c00u) {
+ FloatStatus::RaiseOverflow();
+ }
+ return h_sgn + h_sig;
+ }
+ else {
+ return h_sgn + h_exp + h_sig;
+ }
+}
+
+template<bool gen_overflow=true, bool gen_underflow=true, bool round_even=true>
+inline uint16_t FromDoubleBits(uint64_t d)
+{
+ uint64_t d_exp, d_sig;
+ uint16_t h_sgn, h_exp, h_sig;
+
+ h_sgn = (d&0x8000000000000000ULL) >> 48;
+ d_exp = (d&0x7ff0000000000000ULL);
+
+ /* Exponent overflow/NaN converts to signed inf/NaN */
+ if (d_exp >= 0x40f0000000000000ULL) {
+ if (d_exp == 0x7ff0000000000000ULL) {
+ /* Inf or NaN */
+ d_sig = (d&0x000fffffffffffffULL);
+ if (d_sig != 0) {
+ /* NaN - propagate the flag in the significand... */
+ uint16_t ret = (uint16_t) (0x7c00u + (d_sig >> 42));
+ /* ...but make sure it stays a NaN */
+ if (ret == 0x7c00u) {
+ ret++;
+ }
+ return h_sgn + ret;
+ } else {
+ /* signed inf */
+ return h_sgn + 0x7c00u;
+ }
+ } else {
+ /* overflow to signed inf */
+ if constexpr (gen_overflow) {
+ FloatStatus::RaiseOverflow();
+ }
+ return h_sgn + 0x7c00u;
+ }
+ }
+
+ /* Exponent underflow converts to subnormal half or signed zero */
+ if (d_exp <= 0x3f00000000000000ULL) {
+ /*
+ * Signed zeros, subnormal floats, and floats with small
+ * exponents all convert to signed zero half-floats.
+ */
+ if (d_exp < 0x3e60000000000000ULL) {
+ if constexpr (gen_underflow) {
+ /* If d != 0, it underflowed to 0 */
+ if ((d&0x7fffffffffffffffULL) != 0) {
+ FloatStatus::RaiseUnderflow();
+ }
+ }
+ return h_sgn;
+ }
+ /* Make the subnormal significand */
+ d_exp >>= 52;
+ d_sig = (0x0010000000000000ULL + (d&0x000fffffffffffffULL));
+ if constexpr (gen_underflow) {
+ /* If it's not exactly represented, it underflowed */
+ if ((d_sig&(((uint64_t)1 << (1051 - d_exp)) - 1)) != 0) {
+ FloatStatus::RaiseUnderflow();
+ }
+ }
+ /*
+ * Unlike floats, doubles have enough room to shift left to align
+ * the subnormal significand leading to no loss of the last bits.
+ * The smallest possible exponent giving a subnormal is:
+ * `d_exp = 0x3e60000000000000 >> 52 = 998`. All larger subnormals are
+ * shifted with respect to it. This adds a shift of 10+1 bits the final
+ * right shift when comparing it to the one in the normal branch.
+ */
+        assert(d_exp >= 998);
+ d_sig <<= (d_exp - 998);
+ /* Handle rounding by adding 1 to the bit beyond half precision */
+ if constexpr (round_even) {
+ /*
+ * If the last bit in the half significand is 0 (already even), and
+ * the remaining bit pattern is 1000...0, then we do not add one
+ * to the bit after the half significand. In all other cases, we do.
+ */
+ if ((d_sig&0x003fffffffffffffULL) != 0x0010000000000000ULL) {
+ d_sig += 0x0010000000000000ULL;
+ }
+ }
+ else {
+ d_sig += 0x0010000000000000ULL;
+ }
+ h_sig = (uint16_t) (d_sig >> 53);
+ /*
+ * If the rounding causes a bit to spill into h_exp, it will
+ * increment h_exp from zero to one and h_sig will be zero.
+ * This is the correct result.
+ */
+ return h_sgn + h_sig;
+ }
+
+ /* Regular case with no overflow or underflow */
+ h_exp = (uint16_t) ((d_exp - 0x3f00000000000000ULL) >> 42);
+ /* Handle rounding by adding 1 to the bit beyond half precision */
+ d_sig = (d&0x000fffffffffffffULL);
+ if constexpr (round_even) {
+ /*
+ * If the last bit in the half significand is 0 (already even), and
+ * the remaining bit pattern is 1000...0, then we do not add one
+ * to the bit after the half significand. In all other cases, we do.
+ */
+ if ((d_sig&0x000007ffffffffffULL) != 0x0000020000000000ULL) {
+ d_sig += 0x0000020000000000ULL;
+ }
+ }
+ else {
+ d_sig += 0x0000020000000000ULL;
+ }
+ h_sig = (uint16_t) (d_sig >> 42);
+
+ /*
+ * If the rounding causes a bit to spill into h_exp, it will
+ * increment h_exp by one and h_sig will be zero. This is the
+ * correct result. h_exp may increment to 15, at greatest, in
+ * which case the result overflows to a signed inf.
+ */
+ if constexpr (gen_overflow) {
+ h_sig += h_exp;
+ if (h_sig == 0x7c00u) {
+ FloatStatus::RaiseOverflow();
+ }
+ return h_sgn + h_sig;
+ }
+ else {
+ return h_sgn + h_exp + h_sig;
+ }
+}
+
+constexpr uint32_t ToFloatBits(uint16_t h)
+{
+ uint16_t h_exp = (h&0x7c00u);
+ uint32_t f_sgn = ((uint32_t)h&0x8000u) << 16;
+ switch (h_exp) {
+ case 0x0000u: { // 0 or subnormal
+ uint16_t h_sig = (h&0x03ffu);
+ // Signed zero
+ if (h_sig == 0) {
+ return f_sgn;
+ }
+ // Subnormal
+ h_sig <<= 1;
+ while ((h_sig&0x0400u) == 0) {
+ h_sig <<= 1;
+ h_exp++;
+ }
+ uint32_t f_exp = ((uint32_t)(127 - 15 - h_exp)) << 23;
+ uint32_t f_sig = ((uint32_t)(h_sig&0x03ffu)) << 13;
+ return f_sgn + f_exp + f_sig;
+ }
+ case 0x7c00u: // inf or NaN
+ // All-ones exponent and a copy of the significand
+ return f_sgn + 0x7f800000u + (((uint32_t)(h&0x03ffu)) << 13);
+ default: // normalized
+ // Just need to adjust the exponent and shift
+ return f_sgn + (((uint32_t)(h&0x7fffu) + 0x1c000u) << 13);
+ }
+}
+
+constexpr uint64_t ToDoubleBits(uint16_t h)
+{
+ uint16_t h_exp = (h&0x7c00u);
+ uint64_t d_sgn = ((uint64_t)h&0x8000u) << 48;
+ switch (h_exp) {
+ case 0x0000u: { // 0 or subnormal
+ uint16_t h_sig = (h&0x03ffu);
+ // Signed zero
+ if (h_sig == 0) {
+ return d_sgn;
+ }
+ // Subnormal
+ h_sig <<= 1;
+ while ((h_sig&0x0400u) == 0) {
+ h_sig <<= 1;
+ h_exp++;
+ }
+ uint64_t d_exp = ((uint64_t)(1023 - 15 - h_exp)) << 52;
+ uint64_t d_sig = ((uint64_t)(h_sig&0x03ffu)) << 42;
+ return d_sgn + d_exp + d_sig;
+ }
+ case 0x7c00u: // inf or NaN
+ // All-ones exponent and a copy of the significand
+ return d_sgn + 0x7ff0000000000000ULL + (((uint64_t)(h&0x03ffu)) << 42);
+ default: // normalized
+ // Just need to adjust the exponent and shift
+ return d_sgn + (((uint64_t)(h&0x7fffu) + 0xfc000u) << 42);
+ }
+}
+
+}} // namespace np::half_private
+#endif // NUMPY_CORE_SRC_COMMON_HALF_PRIVATE_HPP
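
As a sanity check on the bit-level conversions above (the values follow from the IEEE 754 encodings, not from any NumPy test): 1.5f is exactly representable in binary16, so `FromFloatBits`/`ToFloatBits` round-trip it without raising any FP status flags.

    #include "half_private.hpp"
    #include <cassert>
    #include <cstring>

    static void roundtrip_demo()
    {
        float f = 1.5f;
        np::uint32_t fbits;
        std::memcpy(&fbits, &f, sizeof(fbits));            // 0x3fc00000
        np::uint16_t h = np::half_private::FromFloatBits(fbits);
        assert(h == 0x3e00);                               // binary16 encoding of 1.5
        assert(np::half_private::ToFloatBits(h) == fbits); // exact round trip
    }
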
diff --git a/numpy/core/src/common/lowlevel_strided_loops.h b/numpy/core/src/common/lowlevel_strided_loops.h
index 924a34db5..4ad110663 100644
--- a/numpy/core/src/common/lowlevel_strided_loops.h
+++ b/numpy/core/src/common/lowlevel_strided_loops.h
@@ -425,7 +425,7 @@ PyArray_PrepareThreeRawArrayIter(int ndim, npy_intp const *shape,
* blocking. See the 'npy_blocked_end' function documentation below for an
* example of how this function is used.
*/
-static NPY_INLINE npy_intp
+static inline npy_intp
npy_aligned_block_offset(const void * addr, const npy_uintp esize,
const npy_uintp alignment, const npy_uintp nvals)
{
@@ -458,7 +458,7 @@ npy_aligned_block_offset(const void * addr, const npy_uintp esize,
* for(; i < n; i++)
* <scalar-op>
*/
-static NPY_INLINE npy_intp
+static inline npy_intp
npy_blocked_end(const npy_uintp peel, const npy_uintp esize,
const npy_uintp vsz, const npy_uintp nvals)
{
@@ -472,7 +472,7 @@ npy_blocked_end(const npy_uintp peel, const npy_uintp esize,
/* byte swapping functions */
-static NPY_INLINE npy_uint16
+static inline npy_uint16
npy_bswap2(npy_uint16 x)
{
return ((x & 0xffu) << 8) | (x >> 8);
@@ -482,7 +482,7 @@ npy_bswap2(npy_uint16 x)
* treat as int16 and byteswap unaligned memory,
* some cpus don't support unaligned access
*/
-static NPY_INLINE void
+static inline void
npy_bswap2_unaligned(char * x)
{
char a = x[0];
@@ -490,7 +490,7 @@ npy_bswap2_unaligned(char * x)
x[1] = a;
}
-static NPY_INLINE npy_uint32
+static inline npy_uint32
npy_bswap4(npy_uint32 x)
{
#ifdef HAVE___BUILTIN_BSWAP32
@@ -501,7 +501,7 @@ npy_bswap4(npy_uint32 x)
#endif
}
-static NPY_INLINE void
+static inline void
npy_bswap4_unaligned(char * x)
{
char a = x[0];
@@ -512,7 +512,7 @@ npy_bswap4_unaligned(char * x)
x[2] = a;
}
-static NPY_INLINE npy_uint64
+static inline npy_uint64
npy_bswap8(npy_uint64 x)
{
#ifdef HAVE___BUILTIN_BSWAP64
@@ -529,7 +529,7 @@ npy_bswap8(npy_uint64 x)
#endif
}
-static NPY_INLINE void
+static inline void
npy_bswap8_unaligned(char * x)
{
char a = x[0]; x[0] = x[7]; x[7] = a;
@@ -685,7 +685,7 @@ npy_bswap8_unaligned(char * x)
size == 1 ? 0 : ((PyArray_NDIM(arr) == 1) ? \
PyArray_STRIDE(arr, 0) : PyArray_ITEMSIZE(arr)))
-static NPY_INLINE int
+static inline int
PyArray_EQUIVALENTLY_ITERABLE_OVERLAP_OK(PyArrayObject *arr1, PyArrayObject *arr2,
int arr1_read, int arr2_read)
{
diff --git a/numpy/core/src/common/meta.hpp b/numpy/core/src/common/meta.hpp
new file mode 100644
index 000000000..27ea1857e
--- /dev/null
+++ b/numpy/core/src/common/meta.hpp
@@ -0,0 +1,54 @@
+#ifndef NUMPY_CORE_SRC_COMMON_META_HPP
+#define NUMPY_CORE_SRC_COMMON_META_HPP
+
+#include "npstd.hpp"
+
+namespace np { namespace meta {
+/// @addtogroup cpp_core_meta
+/// @{
+
+namespace details {
+template<int size, bool unsig>
+struct IntBySize;
+
+template<bool unsig>
+struct IntBySize<sizeof(uint8_t), unsig> {
+ using Type = typename std::conditional<
+ unsig, uint8_t, int8_t>::type;
+};
+template<bool unsig>
+struct IntBySize<sizeof(uint16_t), unsig> {
+ using Type = typename std::conditional<
+ unsig, uint16_t, int16_t>::type;
+};
+template<bool unsig>
+struct IntBySize<sizeof(uint32_t), unsig> {
+ using Type = typename std::conditional<
+ unsig, uint32_t, int32_t>::type;
+};
+template<bool unsig>
+struct IntBySize<sizeof(uint64_t), unsig> {
+ using Type = typename std::conditional<
+ unsig, uint64_t, int64_t>::type;
+};
+} // namespace details
+
+/// Provides safe conversion of any integer type synonym
+/// to a fixed-width integer type.
+template<typename T>
+struct FixedWidth {
+ using TF_ = typename details::IntBySize<
+ sizeof(T), std::is_unsigned<T>::value
+ >::Type;
+
+ using Type = typename std::conditional<
+ std::is_integral<T>::value, TF_, T
+ >::type;
+};
+
+/// @} cpp_core_meta
+
+}} // namespace np::meta
+
+#endif // NUMPY_CORE_SRC_COMMON_META_HPP
+
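
A compile-time sketch of what `FixedWidth` above resolves to (the 8-bit `signed char` and 16-bit `short` sizes are the usual assumption, not something this header enforces): integral synonyms collapse onto fixed-width types of the same size and signedness, while non-integral types pass through unchanged.

    #include "meta.hpp"
    #include <type_traits>

    static_assert(std::is_same_v<np::meta::FixedWidth<signed char>::Type, np::int8_t>);
    static_assert(std::is_same_v<np::meta::FixedWidth<unsigned short>::Type,
                                 np::uint16_t>);  // assumes a 16-bit short
    static_assert(std::is_same_v<np::meta::FixedWidth<double>::Type, double>);
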
diff --git a/numpy/core/src/common/npdef.hpp b/numpy/core/src/common/npdef.hpp
new file mode 100644
index 000000000..56a0df52e
--- /dev/null
+++ b/numpy/core/src/common/npdef.hpp
@@ -0,0 +1,28 @@
+#ifndef NUMPY_CORE_SRC_COMMON_NPDEF_HPP
+#define NUMPY_CORE_SRC_COMMON_NPDEF_HPP
+
+#if !defined(__cplusplus) || __cplusplus < 201703L
+ #error "NumPy requires a compiler with at least C++17 enabled"
+#endif
+
+/// @addtogroup cpp_core_defs
+/// @{
+
+/// Whether compiler supports C++20
+#if __cplusplus > 202002L
+ #define NP_HAS_CPP20 1
+#else
+ #define NP_HAS_CPP20 0
+#endif
+
+/// Wraps `__has_builtin`
+#if defined(__has_builtin)
+ #define NP_HAS_BUILTIN(INTRIN) __has_builtin(INTRIN)
+#else
+ #define NP_HAS_BUILTIN(INTRIN) 0
+#endif
+
+/// @} cpp_core_defs
+
+#endif // NUMPY_CORE_SRC_COMMON_NPDEF_HPP
+
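
For illustration only (this usage is not part of the patch), `NP_HAS_BUILTIN` lets a compiler intrinsic be gated with a portable fallback:

    #include "npdef.hpp"

    inline bool likely_nonzero(long x)
    {
    #if NP_HAS_BUILTIN(__builtin_expect)
        return __builtin_expect(x != 0, 1);   // hint: usually true
    #else
        return x != 0;
    #endif
    }
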
diff --git a/numpy/core/src/common/npstd.hpp b/numpy/core/src/common/npstd.hpp
new file mode 100644
index 000000000..92e3d5cc5
--- /dev/null
+++ b/numpy/core/src/common/npstd.hpp
@@ -0,0 +1,57 @@
+#ifndef NUMPY_CORE_SRC_COMMON_NPSTD_HPP
+#define NUMPY_CORE_SRC_COMMON_NPSTD_HPP
+
+#include <cstddef>
+#include <cstring>
+#include <cctype>
+#include <cstdint>
+
+#include <string>
+#include <algorithm>
+#include <utility>
+#include <cstdlib>
+#include <cmath>
+#include <complex>
+#include <type_traits>
+
+#include <numpy/npy_common.h>
+
+#include "npy_config.h"
+
+namespace np {
+/// @addtogroup cpp_core_types
+/// @{
+using std::uint8_t;
+using std::int8_t;
+using std::uint16_t;
+using std::int16_t;
+using std::uint32_t;
+using std::int32_t;
+using std::uint64_t;
+using std::int64_t;
+using std::uintptr_t;
+using std::intptr_t;
+using std::complex;
+using std::uint_fast16_t;
+using std::uint_fast32_t;
+
+/** Guard for long double.
+ *
+ * The C implementation defines long double as double
+ * on MinGW to provide compatibility with MSVC and to unify
+ * behavior under Windows, which makes npy_longdouble
+ * unsuitable for template specialization or overloading.
+ *
+ * This type will be set to `void` when `npy_longdouble` is not defined
+ * as `long double`.
+ */
+using LongDouble = typename std::conditional<
+ !std::is_same<npy_longdouble, long double>::value,
+ void, npy_longdouble
+>::type;
+/// @} cpp_core_types
+
+} // namespace np
+
+#endif // NUMPY_CORE_SRC_COMMON_NPSTD_HPP
+
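
A minimal sketch of how the `LongDouble` guard above can be used (the `twice` overload is a made-up example): because the alias becomes `void` when `npy_longdouble` is just `double`, a `long double` overload can be disabled instead of colliding with the `double` one.

    #include "npstd.hpp"
    #include <type_traits>

    // Only participates in overload resolution when np::LongDouble is a real
    // `long double`, i.e. not the `void` placeholder.
    template <typename T,
              typename = std::enable_if_t<std::is_same_v<T, np::LongDouble> &&
                                          !std::is_void_v<T>>>
    inline T twice(T x) { return x + x; }
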
diff --git a/numpy/core/src/common/npy_cblas.h b/numpy/core/src/common/npy_cblas.h
index 30fec1a65..751854b6e 100644
--- a/numpy/core/src/common/npy_cblas.h
+++ b/numpy/core/src/common/npy_cblas.h
@@ -66,7 +66,7 @@ enum CBLAS_SIDE {CblasLeft=141, CblasRight=142};
* Convert NumPy stride to BLAS stride. Returns 0 if conversion cannot be done
* (BLAS won't handle negative or zero strides the way we want).
*/
-static NPY_INLINE CBLAS_INT
+static inline CBLAS_INT
blas_stride(npy_intp stride, unsigned itemsize)
{
/*
diff --git a/numpy/core/src/common/npy_config.h b/numpy/core/src/common/npy_config.h
index d6886c5ea..715b17777 100644
--- a/numpy/core/src/common/npy_config.h
+++ b/numpy/core/src/common/npy_config.h
@@ -2,6 +2,7 @@
#define NUMPY_CORE_SRC_COMMON_NPY_CONFIG_H_
#include "config.h"
+#include "npy_cpu_dispatch.h" // brings NPY_HAVE_[CPU features]
#include "numpy/numpyconfig.h"
#include "numpy/utils.h"
#include "numpy/npy_os.h"
diff --git a/numpy/core/src/common/npy_cpu_features.c b/numpy/core/src/common/npy_cpu_features.c
index 773f4af86..64a85f6fb 100644
--- a/numpy/core/src/common/npy_cpu_features.c
+++ b/numpy/core/src/common/npy_cpu_features.c
@@ -1,6 +1,6 @@
#include "npy_cpu_features.h"
#include "npy_cpu_dispatch.h" // To guarantee the CPU baseline definitions are in scope.
-#include "numpy/npy_common.h" // for NPY_INLINE
+#include "numpy/npy_common.h"
#include "numpy/npy_cpu.h" // To guarantee the CPU definitions are in scope.
/******************** Private Definitions *********************/
@@ -14,13 +14,15 @@ static unsigned char npy__cpu_have[NPY_CPU_FEATURE_MAX];
static void
npy__cpu_init_features(void);
/*
- * Disable CPU dispatched features at runtime if environment variable
- * 'NPY_DISABLE_CPU_FEATURES' is defined.
+ * Enable or disable CPU dispatched features at runtime, driven by the environment
+ * variable `NPY_ENABLE_CPU_FEATURES` or `NPY_DISABLE_CPU_FEATURES`,
+ * depending on the value of the boolean parameter `disable` (toggle).
+ *
* Multiple features can be present, and separated by space, comma, or tab.
- * Raises an error if parsing fails or if the feature was not enabled
+ * Raises an error if parsing fails or if a feature could not be enabled or disabled.
*/
static int
-npy__cpu_try_disable_env(void);
+npy__cpu_check_env(int disable, const char *env);
/* Ensure the build's CPU baseline features are supported at runtime */
static int
@@ -43,9 +45,22 @@ npy_cpu_init(void)
if (npy__cpu_validate_baseline() < 0) {
return -1;
}
- if (npy__cpu_try_disable_env() < 0) {
+ char *enable_env = getenv("NPY_ENABLE_CPU_FEATURES");
+ char *disable_env = getenv("NPY_DISABLE_CPU_FEATURES");
+ int is_enable = enable_env && enable_env[0];
+ int is_disable = disable_env && disable_env[0];
+ if (is_enable & is_disable) {
+ PyErr_Format(PyExc_ImportError,
+ "Both NPY_DISABLE_CPU_FEATURES and NPY_ENABLE_CPU_FEATURES "
+ "environment variables cannot be set simultaneously."
+ );
return -1;
}
+ if (is_enable | is_disable) {
+ if (npy__cpu_check_env(is_disable, is_disable ? disable_env : enable_env) < 0) {
+ return -1;
+ }
+ }
return 0;
}
@@ -81,12 +96,14 @@ static struct {
{NPY_CPU_FEATURE_AVX512VBMI, "AVX512VBMI"},
{NPY_CPU_FEATURE_AVX512VBMI2, "AVX512VBMI2"},
{NPY_CPU_FEATURE_AVX512BITALG, "AVX512BITALG"},
+ {NPY_CPU_FEATURE_AVX512FP16 , "AVX512FP16"},
{NPY_CPU_FEATURE_AVX512_KNL, "AVX512_KNL"},
{NPY_CPU_FEATURE_AVX512_KNM, "AVX512_KNM"},
{NPY_CPU_FEATURE_AVX512_SKX, "AVX512_SKX"},
{NPY_CPU_FEATURE_AVX512_CLX, "AVX512_CLX"},
{NPY_CPU_FEATURE_AVX512_CNL, "AVX512_CNL"},
{NPY_CPU_FEATURE_AVX512_ICL, "AVX512_ICL"},
+ {NPY_CPU_FEATURE_AVX512_SPR, "AVX512_SPR"},
{NPY_CPU_FEATURE_VSX, "VSX"},
{NPY_CPU_FEATURE_VSX2, "VSX2"},
{NPY_CPU_FEATURE_VSX3, "VSX3"},
@@ -166,7 +183,7 @@ npy_cpu_dispatch_list(void)
* features that had been configured via --cpu-baseline
* otherwise it returns 0
*/
-static NPY_INLINE int
+static inline int
npy__cpu_baseline_fid(const char *feature)
{
#if !defined(NPY_DISABLE_OPTIMIZATION) && NPY_WITH_CPU_BASELINE_N > 0
@@ -179,7 +196,7 @@ npy__cpu_baseline_fid(const char *feature)
* features that had been configured via --cpu-dispatch
* otherwise it returns 0
*/
-static NPY_INLINE int
+static inline int
npy__cpu_dispatch_fid(const char *feature)
{
#if !defined(NPY_DISABLE_OPTIMIZATION) && NPY_WITH_CPU_DISPATCH_N > 0
@@ -208,7 +225,8 @@ npy__cpu_validate_baseline(void)
*(fptr-1) = '\0'; // trim the last space
PyErr_Format(PyExc_RuntimeError,
"NumPy was built with baseline optimizations: \n"
- "(" NPY_WITH_CPU_BASELINE ") but your machine doesn't support:\n(%s).",
+ "(" NPY_WITH_CPU_BASELINE ") but your machine "
+ "doesn't support:\n(%s).",
baseline_failure
);
return -1;
@@ -218,27 +236,31 @@ npy__cpu_validate_baseline(void)
}
static int
-npy__cpu_try_disable_env(void)
-{
- char *disenv = getenv("NPY_DISABLE_CPU_FEATURES");
- if (disenv == NULL || disenv[0] == 0) {
- return 0;
- }
- #define NPY__CPU_ENV_ERR_HEAD \
- "During parsing environment variable 'NPY_DISABLE_CPU_FEATURES':\n"
+npy__cpu_check_env(int disable, const char *env) {
+
+ static const char *names[] = {
+ "enable", "disable",
+ "NPY_ENABLE_CPU_FEATURES", "NPY_DISABLE_CPU_FEATURES",
+ "During parsing environment variable: 'NPY_ENABLE_CPU_FEATURES':\n",
+ "During parsing environment variable: 'NPY_DISABLE_CPU_FEATURES':\n"
+ };
+ disable = disable ? 1 : 0;
+ const char *act_name = names[disable];
+ const char *env_name = names[disable + 2];
+ const char *err_head = names[disable + 4];
#if !defined(NPY_DISABLE_OPTIMIZATION) && NPY_WITH_CPU_DISPATCH_N > 0
#define NPY__MAX_VAR_LEN 1024 // More than enough for this era
- size_t var_len = strlen(disenv) + 1;
+ size_t var_len = strlen(env) + 1;
if (var_len > NPY__MAX_VAR_LEN) {
PyErr_Format(PyExc_RuntimeError,
- "Length of environment variable 'NPY_DISABLE_CPU_FEATURES' is %d, only %d accepted",
- var_len, NPY__MAX_VAR_LEN - 1
+ "Length of environment variable '%s' is %d, only %d accepted",
+ env_name, var_len, NPY__MAX_VAR_LEN
);
return -1;
}
- char disable_features[NPY__MAX_VAR_LEN];
- memcpy(disable_features, disenv, var_len);
+ char features[NPY__MAX_VAR_LEN];
+ memcpy(features, env, var_len);
char nexist[NPY__MAX_VAR_LEN];
char *nexist_cur = &nexist[0];
@@ -248,17 +270,19 @@ npy__cpu_try_disable_env(void)
//comma and space including (htab, vtab, CR, LF, FF)
const char *delim = ", \t\v\r\n\f";
- char *feature = strtok(disable_features, delim);
+ char *feature = strtok(features, delim);
while (feature) {
- if (npy__cpu_baseline_fid(feature) > 0) {
- PyErr_Format(PyExc_RuntimeError,
- NPY__CPU_ENV_ERR_HEAD
- "You cannot disable CPU feature '%s', since it is part of "
- "the baseline optimizations:\n"
- "(" NPY_WITH_CPU_BASELINE ").",
- feature
- );
- return -1;
+ if (npy__cpu_baseline_fid(feature) > 0){
+ if (disable) {
+ PyErr_Format(PyExc_RuntimeError,
+ "%s"
+ "You cannot disable CPU feature '%s', since it is part of "
+ "the baseline optimizations:\n"
+ "(" NPY_WITH_CPU_BASELINE ").",
+ err_head, feature
+ );
+ return -1;
+ } goto next;
}
// check if the feature is part of dispatched features
int feature_id = npy__cpu_dispatch_fid(feature);
@@ -275,47 +299,58 @@ npy__cpu_try_disable_env(void)
notsupp_cur[flen] = ' '; notsupp_cur += flen + 1;
goto next;
}
- // Finally we can disable it
- npy__cpu_have[feature_id] = 0;
+ // Finally we can disable or mark for enabling
+ npy__cpu_have[feature_id] = disable ? 0:2;
next:
feature = strtok(NULL, delim);
}
+ if (!disable){
+ // Disables any unmarked dispatched feature.
+ #define NPY__CPU_DISABLE_DISPATCH_CB(FEATURE, DUMMY) \
+ if(npy__cpu_have[NPY_CAT(NPY_CPU_FEATURE_, FEATURE)] != 0)\
+ {npy__cpu_have[NPY_CAT(NPY_CPU_FEATURE_, FEATURE)]--;}\
+
+ NPY_WITH_CPU_DISPATCH_CALL(NPY__CPU_DISABLE_DISPATCH_CB, DUMMY) // extra arg for msvc
+ }
*nexist_cur = '\0';
if (nexist[0] != '\0') {
*(nexist_cur-1) = '\0'; // trim the last space
- if (PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
- NPY__CPU_ENV_ERR_HEAD
- "You cannot disable CPU features (%s), since "
- "they are not part of the dispatched optimizations\n"
- "(" NPY_WITH_CPU_DISPATCH ").",
- nexist
+ if (PyErr_WarnFormat(PyExc_ImportWarning, 1,
+ "%sYou cannot %s CPU features (%s), since "
+ "they are not part of the dispatched optimizations\n"
+ "(" NPY_WITH_CPU_DISPATCH ").",
+ err_head, act_name, nexist
) < 0) {
return -1;
}
+ return 0;
}
+ #define NOTSUPP_BODY \
+ "%s" \
+ "You cannot %s CPU features (%s), since " \
+ "they are not supported by your machine.", \
+ err_head, act_name, notsupp
+
*notsupp_cur = '\0';
if (notsupp[0] != '\0') {
*(notsupp_cur-1) = '\0'; // trim the last space
- if (PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
- NPY__CPU_ENV_ERR_HEAD
- "You cannot disable CPU features (%s), since "
- "they are not supported by your machine.",
- notsupp
- ) < 0) {
+ if (!disable){
+ PyErr_Format(PyExc_RuntimeError, NOTSUPP_BODY);
return -1;
}
}
#else
- if (PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
- NPY__CPU_ENV_ERR_HEAD
- "You cannot use environment variable 'NPY_DISABLE_CPU_FEATURES', since "
+ if (PyErr_WarnFormat(PyExc_ImportWarning, 1,
+ "%s"
+ "You cannot use environment variable '%s', since "
#ifdef NPY_DISABLE_OPTIMIZATION
- "the NumPy library was compiled with optimization disabled."
+ "the NumPy library was compiled with optimization disabled.",
#else
- "the NumPy library was compiled without any dispatched optimizations."
+ "the NumPy library was compiled without any dispatched optimizations.",
#endif
+ err_head, env_name, act_name
) < 0) {
return -1;
}
@@ -506,6 +541,11 @@ npy__cpu_init_features(void)
npy__cpu_have[NPY_CPU_FEATURE_AVX512VBMI2] &&
npy__cpu_have[NPY_CPU_FEATURE_AVX512BITALG] &&
npy__cpu_have[NPY_CPU_FEATURE_AVX512VPOPCNTDQ];
+ // Sapphire Rapids
+ npy__cpu_have[NPY_CPU_FEATURE_AVX512FP16] = (reg[3] & (1 << 23)) != 0;
+ npy__cpu_have[NPY_CPU_FEATURE_AVX512_SPR] = npy__cpu_have[NPY_CPU_FEATURE_AVX512_ICL] &&
+ npy__cpu_have[NPY_CPU_FEATURE_AVX512FP16];
+
}
}
@@ -513,7 +553,10 @@ npy__cpu_init_features(void)
#elif defined(NPY_CPU_PPC64) || defined(NPY_CPU_PPC64LE)
-#ifdef __linux__
+#if defined(__linux__) || defined(__FreeBSD__)
+ #ifdef __FreeBSD__
+ #include <machine/cpu.h> // defines PPC_FEATURE_HAS_VSX
+ #endif
#include <sys/auxv.h>
#ifndef AT_HWCAP2
#define AT_HWCAP2 26
@@ -533,12 +576,21 @@ static void
npy__cpu_init_features(void)
{
memset(npy__cpu_have, 0, sizeof(npy__cpu_have[0]) * NPY_CPU_FEATURE_MAX);
+#if defined(__linux__) || defined(__FreeBSD__)
#ifdef __linux__
unsigned int hwcap = getauxval(AT_HWCAP);
if ((hwcap & PPC_FEATURE_HAS_VSX) == 0)
return;
hwcap = getauxval(AT_HWCAP2);
+#else
+ unsigned long hwcap;
+ elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
+ if ((hwcap & PPC_FEATURE_HAS_VSX) == 0)
+ return;
+
+ elf_aux_info(AT_HWCAP2, &hwcap, sizeof(hwcap));
+#endif // __linux__
if (hwcap & PPC_FEATURE2_ARCH_3_1)
{
npy__cpu_have[NPY_CPU_FEATURE_VSX] =
@@ -551,7 +603,7 @@ npy__cpu_init_features(void)
npy__cpu_have[NPY_CPU_FEATURE_VSX2] = (hwcap & PPC_FEATURE2_ARCH_2_07) != 0;
npy__cpu_have[NPY_CPU_FEATURE_VSX3] = (hwcap & PPC_FEATURE2_ARCH_3_00) != 0;
npy__cpu_have[NPY_CPU_FEATURE_VSX4] = (hwcap & PPC_FEATURE2_ARCH_3_1) != 0;
-// TODO: AIX, FreeBSD
+// TODO: AIX, OpenBSD
#else
npy__cpu_have[NPY_CPU_FEATURE_VSX] = 1;
#if defined(NPY_CPU_PPC64LE) || defined(NPY_HAVE_VSX2)
@@ -606,7 +658,7 @@ npy__cpu_init_features(void)
#elif defined(__arm__) || defined(__aarch64__)
-static NPY_INLINE void
+static inline void
npy__cpu_init_features_arm8(void)
{
npy__cpu_have[NPY_CPU_FEATURE_NEON] =
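
The environment-variable handling in `npy__cpu_check_env` above ultimately splits the string on comma, space, and the other whitespace delimiters and looks each token up against the baseline and dispatched feature tables. A standalone sketch of just the tokenizing step (the `print_features` helper is illustrative, not NumPy API):

    #include <cstring>
    #include <cstdio>

    static void print_features(const char *env)
    {
        char buf[1024];                      // mirrors NPY__MAX_VAR_LEN
        std::strncpy(buf, env, sizeof(buf) - 1);
        buf[sizeof(buf) - 1] = '\0';
        const char *delim = ", \t\v\r\n\f";  // comma, space, htab, vtab, CR, LF, FF
        for (char *tok = std::strtok(buf, delim); tok != nullptr;
             tok = std::strtok(nullptr, delim)) {
            std::printf("feature: %s\n", tok);
        }
    }
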
diff --git a/numpy/core/src/common/npy_cpu_features.h b/numpy/core/src/common/npy_cpu_features.h
index 96c543e70..9180670c4 100644
--- a/numpy/core/src/common/npy_cpu_features.h
+++ b/numpy/core/src/common/npy_cpu_features.h
@@ -43,6 +43,7 @@ enum npy_cpu_features
NPY_CPU_FEATURE_AVX512VNNI = 42,
NPY_CPU_FEATURE_AVX512VBMI2 = 43,
NPY_CPU_FEATURE_AVX512BITALG = 44,
+ NPY_CPU_FEATURE_AVX512FP16 = 45,
// X86 CPU Groups
// Knights Landing (F,CD,ER,PF)
@@ -57,6 +58,8 @@ enum npy_cpu_features
NPY_CPU_FEATURE_AVX512_CNL = 105,
// Ice Lake (F,CD,BW,DQ,VL,IFMA,VBMI,VNNI,VBMI2,BITALG,VPOPCNTDQ)
NPY_CPU_FEATURE_AVX512_ICL = 106,
+ // Sapphire Rapids (Ice Lake, AVX512FP16)
+ NPY_CPU_FEATURE_AVX512_SPR = 107,
// IBM/POWER VSX
// POWER7
@@ -103,13 +106,25 @@ enum npy_cpu_features
* - detects runtime CPU features
* - check that baseline CPU features are present
* - uses 'NPY_DISABLE_CPU_FEATURES' to disable dispatchable features
+ * - uses 'NPY_ENABLE_CPU_FEATURES' to enable dispatchable features
*
* It will set a RuntimeError when
* - CPU baseline features from the build are not supported at runtime
* - 'NPY_DISABLE_CPU_FEATURES' tries to disable a baseline feature
- * and will warn if 'NPY_DISABLE_CPU_FEATURES' tries to disable a feature that
- * is not disabled (the machine or build does not support it, or the project was
- * not built with any feature optimization support)
+ * - 'NPY_DISABLE_CPU_FEATURES' and 'NPY_ENABLE_CPU_FEATURES' are
+ * simultaneously set
+ * - 'NPY_ENABLE_CPU_FEATURES' tries to enable a feature that is not supported
+ * by the machine or build
+ * - 'NPY_ENABLE_CPU_FEATURES' tries to enable a feature when the project was
+ * not built with any feature optimization support
+ *
+ * It will set an ImportWarning when:
+ * - 'NPY_DISABLE_CPU_FEATURES' tries to disable a feature that is not supported
+ * by the machine or build
+ * - 'NPY_DISABLE_CPU_FEATURES' or 'NPY_ENABLE_CPU_FEATURES' tries to
+ * disable/enable a feature when the project was not built with any feature
+ * optimization support
+ *
* return 0 on success otherwise return -1
*/
NPY_VISIBILITY_HIDDEN int
diff --git a/numpy/core/src/common/npy_ctypes.h b/numpy/core/src/common/npy_ctypes.h
index 05761cad3..4b5634574 100644
--- a/numpy/core/src/common/npy_ctypes.h
+++ b/numpy/core/src/common/npy_ctypes.h
@@ -14,7 +14,7 @@
* This entire function is just a wrapper around the Python function of the
* same name.
*/
-NPY_INLINE static int
+static inline int
npy_ctypes_check(PyTypeObject *obj)
{
static PyObject *py_func = NULL;
diff --git a/numpy/core/src/common/npy_extint128.h b/numpy/core/src/common/npy_extint128.h
index d563c2ac8..776d71c77 100644
--- a/numpy/core/src/common/npy_extint128.h
+++ b/numpy/core/src/common/npy_extint128.h
@@ -9,7 +9,7 @@ typedef struct {
/* Integer addition with overflow checking */
-static NPY_INLINE npy_int64
+static inline npy_int64
safe_add(npy_int64 a, npy_int64 b, char *overflow_flag)
{
if (a > 0 && b > NPY_MAX_INT64 - a) {
@@ -23,7 +23,7 @@ safe_add(npy_int64 a, npy_int64 b, char *overflow_flag)
/* Integer subtraction with overflow checking */
-static NPY_INLINE npy_int64
+static inline npy_int64
safe_sub(npy_int64 a, npy_int64 b, char *overflow_flag)
{
if (a >= 0 && b < a - NPY_MAX_INT64) {
@@ -37,7 +37,7 @@ safe_sub(npy_int64 a, npy_int64 b, char *overflow_flag)
/* Integer multiplication with overflow checking */
-static NPY_INLINE npy_int64
+static inline npy_int64
safe_mul(npy_int64 a, npy_int64 b, char *overflow_flag)
{
if (a > 0) {
@@ -58,7 +58,7 @@ safe_mul(npy_int64 a, npy_int64 b, char *overflow_flag)
/* Long integer init */
-static NPY_INLINE npy_extint128_t
+static inline npy_extint128_t
to_128(npy_int64 x)
{
npy_extint128_t result;
@@ -74,7 +74,7 @@ to_128(npy_int64 x)
}
-static NPY_INLINE npy_int64
+static inline npy_int64
to_64(npy_extint128_t x, char *overflow)
{
if (x.hi != 0 ||
@@ -87,7 +87,7 @@ to_64(npy_extint128_t x, char *overflow)
/* Long integer multiply */
-static NPY_INLINE npy_extint128_t
+static inline npy_extint128_t
mul_64_64(npy_int64 a, npy_int64 b)
{
npy_extint128_t x, y, z;
@@ -127,7 +127,7 @@ mul_64_64(npy_int64 a, npy_int64 b)
/* Long integer add */
-static NPY_INLINE npy_extint128_t
+static inline npy_extint128_t
add_128(npy_extint128_t x, npy_extint128_t y, char *overflow)
{
npy_extint128_t z;
@@ -170,7 +170,7 @@ add_128(npy_extint128_t x, npy_extint128_t y, char *overflow)
/* Long integer negation */
-static NPY_INLINE npy_extint128_t
+static inline npy_extint128_t
neg_128(npy_extint128_t x)
{
npy_extint128_t z = x;
@@ -179,14 +179,14 @@ neg_128(npy_extint128_t x)
}
-static NPY_INLINE npy_extint128_t
+static inline npy_extint128_t
sub_128(npy_extint128_t x, npy_extint128_t y, char *overflow)
{
return add_128(x, neg_128(y), overflow);
}
-static NPY_INLINE npy_extint128_t
+static inline npy_extint128_t
shl_128(npy_extint128_t v)
{
npy_extint128_t z;
@@ -198,7 +198,7 @@ shl_128(npy_extint128_t v)
}
-static NPY_INLINE npy_extint128_t
+static inline npy_extint128_t
shr_128(npy_extint128_t v)
{
npy_extint128_t z;
@@ -209,7 +209,7 @@ shr_128(npy_extint128_t v)
return z;
}
-static NPY_INLINE int
+static inline int
gt_128(npy_extint128_t a, npy_extint128_t b)
{
if (a.sign > 0 && b.sign > 0) {
@@ -228,7 +228,7 @@ gt_128(npy_extint128_t a, npy_extint128_t b)
/* Long integer divide */
-static NPY_INLINE npy_extint128_t
+static inline npy_extint128_t
divmod_128_64(npy_extint128_t x, npy_int64 b, npy_int64 *mod)
{
npy_extint128_t remainder, pointer, result, divisor;
@@ -284,7 +284,7 @@ divmod_128_64(npy_extint128_t x, npy_int64 b, npy_int64 *mod)
/* Divide and round down (positive divisor; no overflows) */
-static NPY_INLINE npy_extint128_t
+static inline npy_extint128_t
floordiv_128_64(npy_extint128_t a, npy_int64 b)
{
npy_extint128_t result;
@@ -300,7 +300,7 @@ floordiv_128_64(npy_extint128_t a, npy_int64 b)
/* Divide and round up (positive divisor; no overflows) */
-static NPY_INLINE npy_extint128_t
+static inline npy_extint128_t
ceildiv_128_64(npy_extint128_t a, npy_int64 b)
{
npy_extint128_t result;
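
Note: the safe_* helpers above report overflow through a caller-supplied flag rather than relying on signed wrap-around. A minimal stand-alone sketch of the same checking pattern, in plain C with illustrative names:

#include <stdint.h>
#include <stdio.h>

/* Overflow-checked 64-bit addition: the operands are tested against the
 * representable range *before* adding, so the check itself never triggers
 * undefined signed overflow. */
static int64_t
checked_add64(int64_t a, int64_t b, char *overflow_flag)
{
    if (a > 0 && b > INT64_MAX - a) {
        *overflow_flag = 1;                       /* would wrap above INT64_MAX */
    }
    else if (a < 0 && b < INT64_MIN - a) {
        *overflow_flag = 1;                       /* would wrap below INT64_MIN */
    }
    return (int64_t)((uint64_t)a + (uint64_t)b);  /* unsigned wrap is well defined */
}

int main(void)
{
    char overflow = 0;
    checked_add64(INT64_MAX, 1, &overflow);
    printf("overflow=%d\n", overflow);  /* prints overflow=1 */
    return 0;
}
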
diff --git a/numpy/core/src/common/npy_hashtable.c b/numpy/core/src/common/npy_hashtable.c
index af82c0f85..14f6cca1b 100644
--- a/numpy/core/src/common/npy_hashtable.c
+++ b/numpy/core/src/common/npy_hashtable.c
@@ -35,7 +35,7 @@
*
* Users cannot control pointers, so we do not have to worry about DoS attacks?
*/
-static NPY_INLINE Py_hash_t
+static inline Py_hash_t
identity_list_hash(PyObject *const *v, int len)
{
Py_uhash_t acc = _NpyHASH_XXPRIME_5;
@@ -58,7 +58,7 @@ identity_list_hash(PyObject *const *v, int len)
#undef _NpyHASH_XXROTATE
-static NPY_INLINE PyObject **
+static inline PyObject **
find_item(PyArrayIdentityHash const *tb, PyObject *const *key)
{
Py_hash_t hash = identity_list_hash(key, tb->key_len);
diff --git a/numpy/core/src/common/npy_import.h b/numpy/core/src/common/npy_import.h
index f36b6924a..58b4ba0bc 100644
--- a/numpy/core/src/common/npy_import.h
+++ b/numpy/core/src/common/npy_import.h
@@ -16,7 +16,7 @@
* @param attr module attribute to cache.
* @param cache Storage location for imported function.
*/
-NPY_INLINE static void
+static inline void
npy_cache_import(const char *module, const char *attr, PyObject **cache)
{
if (NPY_UNLIKELY(*cache == NULL)) {
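
npy_cache_import resolves module.attr once and stores the result in a caller-provided slot, so later calls skip the import machinery. The same lazy-cache pattern as a dependency-free sketch (plain C; expensive_lookup and the attribute name are purely illustrative):

#include <stdio.h>

/* Stand-in for an expensive one-time lookup (e.g. resolving module.attr). */
static const char *
expensive_lookup(const char *name)
{
    printf("resolving %s ...\n", name);   /* printed only on the first call */
    return name;
}

/* Resolve on first use, then keep serving the cached result. */
static void
cache_lookup(const char *name, const char **cache)
{
    if (*cache == NULL) {
        *cache = expensive_lookup(name);
    }
}

int main(void)
{
    static const char *cached = NULL;
    cache_lookup("some.module.attr", &cached);
    cache_lookup("some.module.attr", &cached);  /* no second resolution */
    printf("cached: %s\n", cached);
    return 0;
}
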
diff --git a/numpy/core/src/common/npy_pycompat.h b/numpy/core/src/common/npy_pycompat.h
index 6641cd591..ce6c34fa1 100644
--- a/numpy/core/src/common/npy_pycompat.h
+++ b/numpy/core/src/common/npy_pycompat.h
@@ -11,7 +11,7 @@
#if PY_VERSION_HEX > 0x030a00a6
#define Npy_HashDouble _Py_HashDouble
#else
-static NPY_INLINE Py_hash_t
+static inline Py_hash_t
Npy_HashDouble(PyObject *NPY_UNUSED(identity), double val)
{
return _Py_HashDouble(val);
diff --git a/numpy/core/src/common/npy_sort.h.src b/numpy/core/src/common/npy_sort.h.src
index a3f556f56..d6e435722 100644
--- a/numpy/core/src/common/npy_sort.h.src
+++ b/numpy/core/src/common/npy_sort.h.src
@@ -9,7 +9,7 @@
#define NPY_ENOMEM 1
#define NPY_ECOMP 2
-static NPY_INLINE int npy_get_msb(npy_uintp unum)
+static inline int npy_get_msb(npy_uintp unum)
{
int depth_limit = 0;
while (unum >>= 1) {
diff --git a/numpy/core/src/common/numpyos.c b/numpy/core/src/common/numpyos.c
index 5ee95191d..2fec06e1c 100644
--- a/numpy/core/src/common/numpyos.c
+++ b/numpy/core/src/common/numpyos.c
@@ -779,3 +779,25 @@ NumPyOS_strtoull(const char *str, char **endptr, int base)
return strtoull(str, endptr, base);
}
+#ifdef _MSC_VER
+
+#include <stdlib.h>
+
+#if _MSC_VER >= 1900
+/* npy3k_compat.h uses this function in the _Py_BEGIN/END_SUPPRESS_IPH
+ * macros. It does not need to be defined when building using MSVC
+ * earlier than 14.0 (_MSC_VER == 1900).
+ */
+
+static void __cdecl _silent_invalid_parameter_handler(
+ wchar_t const* expression,
+ wchar_t const* function,
+ wchar_t const* file,
+ unsigned int line,
+ uintptr_t pReserved) { }
+
+_invalid_parameter_handler _Py_silent_invalid_parameter_handler = _silent_invalid_parameter_handler;
+
+#endif
+
+#endif
diff --git a/numpy/core/src/common/simd/avx2/arithmetic.h b/numpy/core/src/common/simd/avx2/arithmetic.h
index ad9688338..58d842a6d 100644
--- a/numpy/core/src/common/simd/avx2/arithmetic.h
+++ b/numpy/core/src/common/simd/avx2/arithmetic.h
@@ -247,6 +247,10 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
// negate multiply and subtract, -(a*b) - c
#define npyv_nmulsub_f32 _mm256_fnmsub_ps
#define npyv_nmulsub_f64 _mm256_fnmsub_pd
+ // multiply, then add c to odd-indexed elements and subtract it from even-indexed elements.
+ // (a * b) -+ c
+ #define npyv_muladdsub_f32 _mm256_fmaddsub_ps
+ #define npyv_muladdsub_f64 _mm256_fmaddsub_pd
#else
// multiply and add, a*b + c
NPY_FINLINE npyv_f32 npyv_muladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
@@ -274,6 +278,13 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
npyv_f64 neg_a = npyv_xor_f64(a, npyv_setall_f64(-0.0));
return npyv_sub_f64(npyv_mul_f64(neg_a, b), c);
}
+ // multiply, then add c to odd-indexed elements and subtract it from even-indexed elements.
+ // (a * b) -+ c
+ NPY_FINLINE npyv_f32 npyv_muladdsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ { return _mm256_addsub_ps(npyv_mul_f32(a, b), c); }
+ NPY_FINLINE npyv_f64 npyv_muladdsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ { return _mm256_addsub_pd(npyv_mul_f64(a, b), c); }
+
#endif // !NPY_HAVE_FMA3
/***************************
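
The new npyv_muladdsub_* intrinsics follow the fmaddsub convention: even-indexed lanes compute a*b - c and odd-indexed lanes compute a*b + c. A scalar reference of that semantic, as a minimal sketch in plain C:

#include <stdio.h>

/* Scalar model of (a * b) -+ c: subtract c on even indices, add c on odd ones. */
static void
muladdsub_ref(const float *a, const float *b, const float *c, float *out, int n)
{
    for (int i = 0; i < n; ++i) {
        out[i] = (i % 2 == 0) ? a[i] * b[i] - c[i]
                              : a[i] * b[i] + c[i];
    }
}

int main(void)
{
    float a[4] = {1, 2, 3, 4}, b[4] = {2, 2, 2, 2}, c[4] = {5, 5, 5, 5};
    float out[4];
    muladdsub_ref(a, b, c, out, 4);
    for (int i = 0; i < 4; ++i)
        printf("%g ", out[i]);   /* -3 9 1 13 */
    printf("\n");
    return 0;
}
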
diff --git a/numpy/core/src/common/simd/avx2/avx2.h b/numpy/core/src/common/simd/avx2/avx2.h
index 8cb74df2b..d64f3c6d6 100644
--- a/numpy/core/src/common/simd/avx2/avx2.h
+++ b/numpy/core/src/common/simd/avx2/avx2.h
@@ -11,6 +11,7 @@
#define NPY_SIMD_FMA3 0 // fast emulated
#endif
#define NPY_SIMD_BIGENDIAN 0
+#define NPY_SIMD_CMPSIGNAL 0
// Enough limit to allow us to use _mm256_i32gather_*
#define NPY_SIMD_MAXLOAD_STRIDE32 (0x7fffffff / 8)
diff --git a/numpy/core/src/common/simd/avx2/memory.h b/numpy/core/src/common/simd/avx2/memory.h
index 410c35dc8..81144a36b 100644
--- a/numpy/core/src/common/simd/avx2/memory.h
+++ b/numpy/core/src/common/simd/avx2/memory.h
@@ -84,17 +84,6 @@ NPY_FINLINE npyv_s32 npyv_loadn_s32(const npy_int32 *ptr, npy_intp stride)
NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride)
{ return _mm256_castsi256_ps(npyv_loadn_u32((const npy_uint32*)ptr, stride)); }
//// 64
-#if 0 // slower
-NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
-{
- const __m256i idx = npyv_set_s64(0, 1*stride, 2*stride, 3*stride);
- return _mm256_i64gather_epi64((const void*)ptr, idx, 8);
-}
-NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
-{ return npyv_loadn_u64((const npy_uint64*)ptr, stride); }
-NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride)
-{ return _mm256_castsi256_pd(npyv_loadn_u64((const npy_uint64*)ptr, stride)); }
-#endif
NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride)
{
__m128d a0 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)ptr));
@@ -107,6 +96,32 @@ NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
{ return _mm256_castpd_si256(npyv_loadn_f64((const double*)ptr, stride)); }
NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
{ return _mm256_castpd_si256(npyv_loadn_f64((const double*)ptr, stride)); }
+
+//// 64-bit load over 32-bit stride
+NPY_FINLINE npyv_f32 npyv_loadn2_f32(const float *ptr, npy_intp stride)
+{
+ __m128d a0 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)ptr));
+ __m128d a2 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)(ptr + stride*2)));
+ __m128d a01 = _mm_loadh_pd(a0, (const double*)(ptr + stride));
+ __m128d a23 = _mm_loadh_pd(a2, (const double*)(ptr + stride*3));
+ return _mm256_castpd_ps(_mm256_insertf128_pd(_mm256_castpd128_pd256(a01), a23, 1));
+}
+NPY_FINLINE npyv_u32 npyv_loadn2_u32(const npy_uint32 *ptr, npy_intp stride)
+{ return _mm256_castps_si256(npyv_loadn2_f32((const float*)ptr, stride)); }
+NPY_FINLINE npyv_s32 npyv_loadn2_s32(const npy_int32 *ptr, npy_intp stride)
+{ return _mm256_castps_si256(npyv_loadn2_f32((const float*)ptr, stride)); }
+
+//// 128-bit load over 64-bit stride
+NPY_FINLINE npyv_f64 npyv_loadn2_f64(const double *ptr, npy_intp stride)
+{
+ __m256d a = npyv_loadl_f64(ptr);
+ return _mm256_insertf128_pd(a, _mm_loadu_pd(ptr + stride), 1);
+}
+NPY_FINLINE npyv_u64 npyv_loadn2_u64(const npy_uint64 *ptr, npy_intp stride)
+{ return _mm256_castpd_si256(npyv_loadn2_f64((const double*)ptr, stride)); }
+NPY_FINLINE npyv_s64 npyv_loadn2_s64(const npy_int64 *ptr, npy_intp stride)
+{ return _mm256_castpd_si256(npyv_loadn2_f64((const double*)ptr, stride)); }
+
/***************************
* Non-contiguous Store
***************************/
@@ -143,6 +158,32 @@ NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
{ npyv_storen_f64((double*)ptr, stride, _mm256_castsi256_pd(a)); }
+//// 64-bit store over 32-bit stride
+NPY_FINLINE void npyv_storen2_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a)
+{
+ __m128d a0 = _mm256_castpd256_pd128(_mm256_castsi256_pd(a));
+ __m128d a1 = _mm256_extractf128_pd(_mm256_castsi256_pd(a), 1);
+ _mm_storel_pd((double*)ptr, a0);
+ _mm_storeh_pd((double*)(ptr + stride), a0);
+ _mm_storel_pd((double*)(ptr + stride*2), a1);
+ _mm_storeh_pd((double*)(ptr + stride*3), a1);
+}
+NPY_FINLINE void npyv_storen2_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a)
+{ npyv_storen2_u32((npy_uint32*)ptr, stride, a); }
+NPY_FINLINE void npyv_storen2_f32(float *ptr, npy_intp stride, npyv_f32 a)
+{ npyv_storen2_u32((npy_uint32*)ptr, stride, _mm256_castps_si256(a)); }
+
+//// 128-bit store over 64-bit stride
+NPY_FINLINE void npyv_storen2_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
+{
+ npyv_storel_u64(ptr, a);
+ npyv_storeh_u64(ptr + stride, a);
+}
+NPY_FINLINE void npyv_storen2_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
+{ npyv_storen2_u64((npy_uint64*)ptr, stride, a); }
+NPY_FINLINE void npyv_storen2_f64(double *ptr, npy_intp stride, npyv_f64 a)
+{ npyv_storen2_u64((npy_uint64*)ptr, stride, _mm256_castpd_si256(a)); }
+
/*********************************
* Partial Load
*********************************/
@@ -174,7 +215,7 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n
const __m256i steps = npyv_set_s64(0, 1, 2, 3);
__m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
- __m256i payload = _mm256_maskload_epi64((const void*)ptr, mask);
+ __m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask);
return _mm256_blendv_epi8(vfill, payload, mask);
}
// fill zero to rest lanes
@@ -184,7 +225,45 @@ NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
const __m256i steps = npyv_set_s64(0, 1, 2, 3);
__m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
- return _mm256_maskload_epi64((const void*)ptr, mask);
+ return _mm256_maskload_epi64((const long long*)ptr, mask);
+}
+
+//// 64-bit nlane
+NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane,
+ npy_int32 fill_lo, npy_int32 fill_hi)
+{
+ assert(nlane > 0);
+ const __m256i vfill = npyv_set_s32(
+ fill_lo, fill_hi, fill_lo, fill_hi,
+ fill_lo, fill_hi, fill_lo, fill_hi
+ );
+ const __m256i steps = npyv_set_s64(0, 1, 2, 3);
+ __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
+ __m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
+ __m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask);
+ return _mm256_blendv_epi8(vfill, payload, mask);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
+{ return npyv_load_tillz_s64((const npy_int64*)ptr, nlane); }
+
+//// 128-bit nlane
+NPY_FINLINE npyv_u64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
+{
+ assert(nlane > 0);
+ npy_int64 m = -((npy_int64)(nlane > 1));
+ __m256i mask = npyv_set_s64(-1, -1, m, m);
+ return _mm256_maskload_epi64((const long long*)ptr, mask);
+}
+// fill the rest of the lanes with the fill_lo/fill_hi pair
+NPY_FINLINE npyv_u64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane,
+ npy_int64 fill_lo, npy_int64 fill_hi)
+{
+ const __m256i vfill = npyv_set_s64(0, 0, fill_lo, fill_hi);
+ npy_int64 m = -((npy_int64)(nlane > 1));
+ __m256i mask = npyv_set_s64(-1, -1, m, m);
+ __m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask);
+ return _mm256_blendv_epi8(vfill, payload, mask);
}
/*********************************
* Non-contiguous partial load
@@ -216,12 +295,53 @@ npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_
const __m256i steps = npyv_set_s64(0, 1, 2, 3);
__m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
- return _mm256_mask_i64gather_epi64(vfill, (const void*)ptr, idx, mask, 8);
+ return _mm256_mask_i64gather_epi64(vfill, (const long long*)ptr, idx, mask, 8);
}
// fill zero to rest lanes
NPY_FINLINE npyv_s64
npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
{ return npyv_loadn_till_s64(ptr, stride, nlane, 0); }
+
+//// 64-bit load over 32-bit stride
+NPY_FINLINE npyv_s64 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane,
+ npy_int32 fill_lo, npy_int32 fill_hi)
+{
+ assert(nlane > 0);
+ const __m256i vfill = npyv_set_s32(
+ fill_lo, fill_hi, fill_lo, fill_hi,
+ fill_lo, fill_hi, fill_lo, fill_hi
+ );
+ const __m256i idx = npyv_set_s64(0, 1*stride, 2*stride, 3*stride);
+ const __m256i steps = npyv_set_s64(0, 1, 2, 3);
+ __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
+ __m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
+ return _mm256_mask_i64gather_epi64(vfill, (const long long*)ptr, idx, mask, 4);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
+{ return npyv_loadn2_till_s32(ptr, stride, nlane, 0, 0); }
+
+//// 128-bit load over 64-bit stride
+NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane,
+ npy_int64 fill_lo, npy_int64 fill_hi)
+{
+ assert(nlane > 0);
+ __m256i a = npyv_loadl_s64(ptr);
+#if defined(_MSC_VER) && defined(_M_IX86)
+ __m128i fill =_mm_setr_epi32(
+ (int)fill_lo, (int)(fill_lo >> 32),
+ (int)fill_hi, (int)(fill_hi >> 32)
+ );
+#else
+ __m128i fill = _mm_set_epi64x(fill_hi, fill_lo);
+#endif
+ __m128i b = nlane > 1 ? _mm_loadu_si128((const __m128i*)(ptr + stride)) : fill;
+ return _mm256_inserti128_si256(a, b, 1);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
+{ return npyv_loadn2_till_s64(ptr, stride, nlane, 0, 0); }
+
/*********************************
* Partial store
*********************************/
@@ -241,7 +361,21 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a
const __m256i steps = npyv_set_s64(0, 1, 2, 3);
__m256i vnlane = npyv_setall_s64(nlane > 8 ? 8 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
- _mm256_maskstore_epi64((void*)ptr, mask, a);
+ _mm256_maskstore_epi64((long long*)ptr, mask, a);
+}
+
+//// 64-bit nlane
+NPY_FINLINE void npyv_store2_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a)
+{ npyv_store_till_s64((npy_int64*)ptr, nlane, a); }
+
+//// 128-bit nlane
+NPY_FINLINE void npyv_store2_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0);
+ npyv_storel_s64(ptr, a);
+ if (nlane > 1) {
+ npyv_storeh_s64(ptr + 2, a);
+ }
}
/*********************************
* Non-contiguous partial store
@@ -252,23 +386,52 @@ NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp
assert(nlane > 0);
__m128i a0 = _mm256_castsi256_si128(a);
__m128i a1 = _mm256_extracti128_si256(a, 1);
+
+ ptr[stride*0] = _mm_extract_epi32(a0, 0);
switch(nlane) {
- default:
- ptr[stride*7] = _mm_extract_epi32(a1, 3);
- case 7:
- ptr[stride*6] = _mm_extract_epi32(a1, 2);
- case 6:
- ptr[stride*5] = _mm_extract_epi32(a1, 1);
+ case 1:
+ return;
+ case 2:
+ ptr[stride*1] = _mm_extract_epi32(a0, 1);
+ return;
+ case 3:
+ ptr[stride*1] = _mm_extract_epi32(a0, 1);
+ ptr[stride*2] = _mm_extract_epi32(a0, 2);
+ return;
+ case 4:
+ ptr[stride*1] = _mm_extract_epi32(a0, 1);
+ ptr[stride*2] = _mm_extract_epi32(a0, 2);
+ ptr[stride*3] = _mm_extract_epi32(a0, 3);
+ return;
case 5:
+ ptr[stride*1] = _mm_extract_epi32(a0, 1);
+ ptr[stride*2] = _mm_extract_epi32(a0, 2);
+ ptr[stride*3] = _mm_extract_epi32(a0, 3);
ptr[stride*4] = _mm_extract_epi32(a1, 0);
- case 4:
+ return;
+ case 6:
+ ptr[stride*1] = _mm_extract_epi32(a0, 1);
+ ptr[stride*2] = _mm_extract_epi32(a0, 2);
ptr[stride*3] = _mm_extract_epi32(a0, 3);
- case 3:
+ ptr[stride*4] = _mm_extract_epi32(a1, 0);
+ ptr[stride*5] = _mm_extract_epi32(a1, 1);
+ return;
+ case 7:
+ ptr[stride*1] = _mm_extract_epi32(a0, 1);
ptr[stride*2] = _mm_extract_epi32(a0, 2);
- case 2:
+ ptr[stride*3] = _mm_extract_epi32(a0, 3);
+ ptr[stride*4] = _mm_extract_epi32(a1, 0);
+ ptr[stride*5] = _mm_extract_epi32(a1, 1);
+ ptr[stride*6] = _mm_extract_epi32(a1, 2);
+ return;
+ default:
ptr[stride*1] = _mm_extract_epi32(a0, 1);
- case 1:
- ptr[stride*0] = _mm_extract_epi32(a0, 0);
+ ptr[stride*2] = _mm_extract_epi32(a0, 2);
+ ptr[stride*3] = _mm_extract_epi32(a0, 3);
+ ptr[stride*4] = _mm_extract_epi32(a1, 0);
+ ptr[stride*5] = _mm_extract_epi32(a1, 1);
+ ptr[stride*6] = _mm_extract_epi32(a1, 2);
+ ptr[stride*7] = _mm_extract_epi32(a1, 3);
}
}
//// 64
@@ -277,19 +440,60 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
assert(nlane > 0);
__m128d a0 = _mm256_castpd256_pd128(_mm256_castsi256_pd(a));
__m128d a1 = _mm256_extractf128_pd(_mm256_castsi256_pd(a), 1);
+
double *dptr = (double*)ptr;
+ _mm_storel_pd(dptr, a0);
switch(nlane) {
- default:
- _mm_storeh_pd(dptr + stride * 3, a1);
+ case 1:
+ return;
+ case 2:
+ _mm_storeh_pd(dptr + stride * 1, a0);
+ return;
case 3:
+ _mm_storeh_pd(dptr + stride * 1, a0);
_mm_storel_pd(dptr + stride * 2, a1);
- case 2:
+ return;
+ default:
_mm_storeh_pd(dptr + stride * 1, a0);
+ _mm_storel_pd(dptr + stride * 2, a1);
+ _mm_storeh_pd(dptr + stride * 3, a1);
+ }
+}
+
+//// 64-bit store over 32-bit stride
+NPY_FINLINE void npyv_storen2_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+ __m128d a0 = _mm256_castpd256_pd128(_mm256_castsi256_pd(a));
+ __m128d a1 = _mm256_extractf128_pd(_mm256_castsi256_pd(a), 1);
+
+ _mm_storel_pd((double*)ptr, a0);
+ switch(nlane) {
case 1:
- _mm_storel_pd(dptr + stride * 0, a0);
+ return;
+ case 2:
+ _mm_storeh_pd((double*)(ptr + stride * 1), a0);
+ return;
+ case 3:
+ _mm_storeh_pd((double*)(ptr + stride * 1), a0);
+ _mm_storel_pd((double*)(ptr + stride * 2), a1);
+ return;
+ default:
+ _mm_storeh_pd((double*)(ptr + stride * 1), a0);
+ _mm_storel_pd((double*)(ptr + stride * 2), a1);
+ _mm_storeh_pd((double*)(ptr + stride * 3), a1);
}
}
+//// 128-bit store over 64-bit stride
+NPY_FINLINE void npyv_storen2_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0);
+ npyv_storel_s64(ptr, a);
+ if (nlane > 1) {
+ npyv_storeh_s64(ptr + stride, a);
+ }
+}
/*****************************************************************************
* Implement partial load/store for u32/f32/u64/f64... via reinterpret cast
*****************************************************************************/
@@ -300,7 +504,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
union { \
npyv_lanetype_##F_SFX from_##F_SFX; \
npyv_lanetype_##T_SFX to_##T_SFX; \
- } pun = {.from_##F_SFX = fill}; \
+ } pun; \
+ pun.from_##F_SFX = fill; \
return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX( \
(const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX \
)); \
@@ -312,7 +517,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
union { \
npyv_lanetype_##F_SFX from_##F_SFX; \
npyv_lanetype_##T_SFX to_##T_SFX; \
- } pun = {.from_##F_SFX = fill}; \
+ } pun; \
+ pun.from_##F_SFX = fill; \
return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX( \
(const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX \
)); \
@@ -353,6 +559,110 @@ NPYV_IMPL_AVX2_REST_PARTIAL_TYPES(f32, s32)
NPYV_IMPL_AVX2_REST_PARTIAL_TYPES(u64, s64)
NPYV_IMPL_AVX2_REST_PARTIAL_TYPES(f64, s64)
+// 128-bit/64-bit stride (load/store pair)
+#define NPYV_IMPL_AVX2_REST_PARTIAL_TYPES_PAIR(F_SFX, T_SFX) \
+ NPY_FINLINE npyv_##F_SFX npyv_load2_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, \
+ npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \
+ { \
+ union pun { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ }; \
+ union pun pun_lo; \
+ union pun pun_hi; \
+ pun_lo.from_##F_SFX = fill_lo; \
+ pun_hi.from_##F_SFX = fill_hi; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane, pun_lo.to_##T_SFX, pun_hi.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn2_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \
+ npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \
+ { \
+ union pun { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ }; \
+ union pun pun_lo; \
+ union pun pun_hi; \
+ pun_lo.from_##F_SFX = fill_lo; \
+ pun_hi.from_##F_SFX = fill_hi; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun_lo.to_##T_SFX, \
+ pun_hi.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_load2_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn2_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \
+ )); \
+ } \
+ NPY_FINLINE void npyv_store2_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_store2_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ } \
+ NPY_FINLINE void npyv_storen2_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_storen2_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ }
+
+NPYV_IMPL_AVX2_REST_PARTIAL_TYPES_PAIR(u32, s32)
+NPYV_IMPL_AVX2_REST_PARTIAL_TYPES_PAIR(f32, s32)
+NPYV_IMPL_AVX2_REST_PARTIAL_TYPES_PAIR(u64, s64)
+NPYV_IMPL_AVX2_REST_PARTIAL_TYPES_PAIR(f64, s64)
+
+/************************************************************
+ * de-interleave load / interleave contiguous store
+ ************************************************************/
+// two channels
+#define NPYV_IMPL_AVX2_MEM_INTERLEAVE(SFX, ZSFX) \
+ NPY_FINLINE npyv_##ZSFX##x2 npyv_zip_##ZSFX(npyv_##ZSFX, npyv_##ZSFX); \
+ NPY_FINLINE npyv_##ZSFX##x2 npyv_unzip_##ZSFX(npyv_##ZSFX, npyv_##ZSFX); \
+ NPY_FINLINE npyv_##SFX##x2 npyv_load_##SFX##x2( \
+ const npyv_lanetype_##SFX *ptr \
+ ) { \
+ return npyv_unzip_##ZSFX( \
+ npyv_load_##SFX(ptr), npyv_load_##SFX(ptr+npyv_nlanes_##SFX) \
+ ); \
+ } \
+ NPY_FINLINE void npyv_store_##SFX##x2( \
+ npyv_lanetype_##SFX *ptr, npyv_##SFX##x2 v \
+ ) { \
+ npyv_##SFX##x2 zip = npyv_zip_##ZSFX(v.val[0], v.val[1]); \
+ npyv_store_##SFX(ptr, zip.val[0]); \
+ npyv_store_##SFX(ptr + npyv_nlanes_##SFX, zip.val[1]); \
+ }
+
+NPYV_IMPL_AVX2_MEM_INTERLEAVE(u8, u8)
+NPYV_IMPL_AVX2_MEM_INTERLEAVE(s8, u8)
+NPYV_IMPL_AVX2_MEM_INTERLEAVE(u16, u16)
+NPYV_IMPL_AVX2_MEM_INTERLEAVE(s16, u16)
+NPYV_IMPL_AVX2_MEM_INTERLEAVE(u32, u32)
+NPYV_IMPL_AVX2_MEM_INTERLEAVE(s32, u32)
+NPYV_IMPL_AVX2_MEM_INTERLEAVE(u64, u64)
+NPYV_IMPL_AVX2_MEM_INTERLEAVE(s64, u64)
+NPYV_IMPL_AVX2_MEM_INTERLEAVE(f32, f32)
+NPYV_IMPL_AVX2_MEM_INTERLEAVE(f64, f64)
+
/*********************************
* Lookup tables
*********************************/
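
The partial load/store wrappers above drop the designated-initializer union (.from_... = fill) in favour of a plain member assignment so the header also compiles as C++. A stand-alone sketch of the type-punning pattern, assuming the usual union-punning support of the targeted compilers:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    /* Reinterpret a float fill value as the same-width integer type by
     * writing one union member and reading the other; the assignment form
     * (instead of a C99 designated initializer) is accepted by both the
     * C and C++ compilers used for these headers. */
    union {
        float   from_f32;
        int32_t to_s32;
    } pun;
    pun.from_f32 = 1.0f;
    printf("0x%08x\n", (unsigned)pun.to_s32);  /* 0x3f800000 */
    return 0;
}
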
diff --git a/numpy/core/src/common/simd/avx2/operators.h b/numpy/core/src/common/simd/avx2/operators.h
index c10267b21..7b9b6a344 100644
--- a/numpy/core/src/common/simd/avx2/operators.h
+++ b/numpy/core/src/common/simd/avx2/operators.h
@@ -205,7 +205,7 @@ NPY_FINLINE __m256i npyv_cmpge_u32(__m256i a, __m256i b)
#define npyv_cmple_u64(A, B) npyv_cmpge_u64(B, A)
#define npyv_cmple_s64(A, B) npyv_cmpge_s64(B, A)
-// precision comparison
+// precision comparison (ordered)
#define npyv_cmpeq_f32(A, B) _mm256_castps_si256(_mm256_cmp_ps(A, B, _CMP_EQ_OQ))
#define npyv_cmpeq_f64(A, B) _mm256_castpd_si256(_mm256_cmp_pd(A, B, _CMP_EQ_OQ))
#define npyv_cmpneq_f32(A, B) _mm256_castps_si256(_mm256_cmp_ps(A, B, _CMP_NEQ_UQ))
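
The float comparisons differ in how NaN operands are treated: _CMP_EQ_OQ is ordered (false when either input is NaN) while _CMP_NEQ_UQ is unordered (true in that case). A scalar illustration in plain C:

#include <stdio.h>
#include <math.h>

int main(void)
{
    float x = NAN, y = 1.0f;
    /* Ordered equal: false if either operand is NaN (mirrors _CMP_EQ_OQ). */
    int eq_ordered    = (x == y);
    /* Unordered not-equal: true if either operand is NaN (mirrors _CMP_NEQ_UQ). */
    int neq_unordered = !(x == y);
    printf("eq=%d neq=%d\n", eq_ordered, neq_unordered);  /* eq=0 neq=1 */
    return 0;
}
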
diff --git a/numpy/core/src/common/simd/avx2/reorder.h b/numpy/core/src/common/simd/avx2/reorder.h
index 4d6ec8f75..9ebe0e7f4 100644
--- a/numpy/core/src/common/simd/avx2/reorder.h
+++ b/numpy/core/src/common/simd/avx2/reorder.h
@@ -94,6 +94,75 @@ NPY_FINLINE npyv_f64x2 npyv_zip_f64(__m256d a, __m256d b)
return npyv_combine_f64(ab0, ab1);
}
+// deinterleave two vectors
+NPY_FINLINE npyv_u8x2 npyv_unzip_u8(npyv_u8 ab0, npyv_u8 ab1)
+{
+ const __m256i idx = _mm256_setr_epi8(
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+ );
+ __m256i ab_03 = _mm256_shuffle_epi8(ab0, idx);
+ __m256i ab_12 = _mm256_shuffle_epi8(ab1, idx);
+ npyv_u8x2 ab_lh = npyv_combine_u8(ab_03, ab_12);
+ npyv_u8x2 r;
+ r.val[0] = _mm256_unpacklo_epi64(ab_lh.val[0], ab_lh.val[1]);
+ r.val[1] = _mm256_unpackhi_epi64(ab_lh.val[0], ab_lh.val[1]);
+ return r;
+}
+#define npyv_unzip_s8 npyv_unzip_u8
+
+NPY_FINLINE npyv_u16x2 npyv_unzip_u16(npyv_u16 ab0, npyv_u16 ab1)
+{
+ const __m256i idx = _mm256_setr_epi8(
+ 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15,
+ 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15
+ );
+ __m256i ab_03 = _mm256_shuffle_epi8(ab0, idx);
+ __m256i ab_12 = _mm256_shuffle_epi8(ab1, idx);
+ npyv_u16x2 ab_lh = npyv_combine_u16(ab_03, ab_12);
+ npyv_u16x2 r;
+ r.val[0] = _mm256_unpacklo_epi64(ab_lh.val[0], ab_lh.val[1]);
+ r.val[1] = _mm256_unpackhi_epi64(ab_lh.val[0], ab_lh.val[1]);
+ return r;
+}
+#define npyv_unzip_s16 npyv_unzip_u16
+
+NPY_FINLINE npyv_u32x2 npyv_unzip_u32(npyv_u32 ab0, npyv_u32 ab1)
+{
+ const __m256i idx = npyv_set_u32(0, 2, 4, 6, 1, 3, 5, 7);
+ __m256i abl = _mm256_permutevar8x32_epi32(ab0, idx);
+ __m256i abh = _mm256_permutevar8x32_epi32(ab1, idx);
+ return npyv_combine_u32(abl, abh);
+}
+#define npyv_unzip_s32 npyv_unzip_u32
+
+NPY_FINLINE npyv_u64x2 npyv_unzip_u64(npyv_u64 ab0, npyv_u64 ab1)
+{
+ npyv_u64x2 ab_lh = npyv_combine_u64(ab0, ab1);
+ npyv_u64x2 r;
+ r.val[0] = _mm256_unpacklo_epi64(ab_lh.val[0], ab_lh.val[1]);
+ r.val[1] = _mm256_unpackhi_epi64(ab_lh.val[0], ab_lh.val[1]);
+ return r;
+}
+#define npyv_unzip_s64 npyv_unzip_u64
+
+NPY_FINLINE npyv_f32x2 npyv_unzip_f32(npyv_f32 ab0, npyv_f32 ab1)
+{
+ const __m256i idx = npyv_set_u32(0, 2, 4, 6, 1, 3, 5, 7);
+ __m256 abl = _mm256_permutevar8x32_ps(ab0, idx);
+ __m256 abh = _mm256_permutevar8x32_ps(ab1, idx);
+ return npyv_combine_f32(abl, abh);
+}
+
+NPY_FINLINE npyv_f64x2 npyv_unzip_f64(npyv_f64 ab0, npyv_f64 ab1)
+{
+ npyv_f64x2 ab_lh = npyv_combine_f64(ab0, ab1);
+ npyv_f64x2 r;
+ r.val[0] = _mm256_unpacklo_pd(ab_lh.val[0], ab_lh.val[1]);
+ r.val[1] = _mm256_unpackhi_pd(ab_lh.val[0], ab_lh.val[1]);
+ return r;
+}
+
// Reverse elements of each 64-bit lane
NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a)
{
@@ -126,4 +195,22 @@ NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a)
return _mm256_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1));
}
+// Permuting the elements of each 128-bit lane by immediate index for
+// each element.
+#define npyv_permi128_u32(A, E0, E1, E2, E3) \
+ _mm256_shuffle_epi32(A, _MM_SHUFFLE(E3, E2, E1, E0))
+
+#define npyv_permi128_s32 npyv_permi128_u32
+
+#define npyv_permi128_u64(A, E0, E1) \
+ _mm256_shuffle_epi32(A, _MM_SHUFFLE(((E1)<<1)+1, ((E1)<<1), ((E0)<<1)+1, ((E0)<<1)))
+
+#define npyv_permi128_s64 npyv_permi128_u64
+
+#define npyv_permi128_f32(A, E0, E1, E2, E3) \
+ _mm256_permute_ps(A, _MM_SHUFFLE(E3, E2, E1, E0))
+
+#define npyv_permi128_f64(A, E0, E1) \
+ _mm256_permute_pd(A, ((E1)<<3) | ((E0)<<2) | ((E1)<<1) | (E0))
+
#endif // _NPY_SIMD_AVX2_REORDER_H
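
The new npyv_unzip_* helpers are the inverse of npyv_zip_*: they split two interleaved inputs back into even-position and odd-position streams. A scalar reference of the deinterleave step, as an illustrative sketch in plain C:

#include <stdio.h>

/* Deinterleave 2*n elements (a0,b0,a1,b1,...) into a[] and b[]. */
static void
unzip_ref(const int *ab, int n, int *a, int *b)
{
    for (int i = 0; i < n; ++i) {
        a[i] = ab[2 * i];       /* even positions -> first output  */
        b[i] = ab[2 * i + 1];   /* odd positions  -> second output */
    }
}

int main(void)
{
    int ab[8] = {0, 10, 1, 11, 2, 12, 3, 13};
    int a[4], b[4];
    unzip_ref(ab, 4, a, b);
    for (int i = 0; i < 4; ++i)
        printf("(%d,%d) ", a[i], b[i]);  /* (0,10) (1,11) (2,12) (3,13) */
    printf("\n");
    return 0;
}
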
diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h b/numpy/core/src/common/simd/avx512/arithmetic.h
index 1290dc0ad..a63da87d0 100644
--- a/numpy/core/src/common/simd/avx512/arithmetic.h
+++ b/numpy/core/src/common/simd/avx512/arithmetic.h
@@ -349,6 +349,10 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
// negate multiply and subtract, -(a*b) - c
#define npyv_nmulsub_f32 _mm512_fnmsub_ps
#define npyv_nmulsub_f64 _mm512_fnmsub_pd
+// multiply, then add c to odd-indexed elements and subtract it from even-indexed elements.
+// (a * b) -+ c
+#define npyv_muladdsub_f32 _mm512_fmaddsub_ps
+#define npyv_muladdsub_f64 _mm512_fmaddsub_pd
/***************************
* Summation: Calculates the sum of all vector elements.
diff --git a/numpy/core/src/common/simd/avx512/avx512.h b/numpy/core/src/common/simd/avx512/avx512.h
index 0946e6443..aa6abe256 100644
--- a/numpy/core/src/common/simd/avx512/avx512.h
+++ b/numpy/core/src/common/simd/avx512/avx512.h
@@ -7,6 +7,7 @@
#define NPY_SIMD_F64 1
#define NPY_SIMD_FMA3 1 // native support
#define NPY_SIMD_BIGENDIAN 0
+#define NPY_SIMD_CMPSIGNAL 0
// Enough limit to allow us to use _mm512_i32gather_* and _mm512_i32scatter_*
#define NPY_SIMD_MAXLOAD_STRIDE32 (0x7fffffff / 16)
#define NPY_SIMD_MAXSTORE_STRIDE32 (0x7fffffff / 16)
diff --git a/numpy/core/src/common/simd/avx512/maskop.h b/numpy/core/src/common/simd/avx512/maskop.h
index d1c188390..88fa4a68c 100644
--- a/numpy/core/src/common/simd/avx512/maskop.h
+++ b/numpy/core/src/common/simd/avx512/maskop.h
@@ -51,4 +51,17 @@ NPYV_IMPL_AVX512_MASK_ADDSUB(s64, b64, epi64)
NPYV_IMPL_AVX512_MASK_ADDSUB(f32, b32, ps)
NPYV_IMPL_AVX512_MASK_ADDSUB(f64, b64, pd)
+// division, m ? a / b : c
+NPY_FINLINE npyv_f32 npyv_ifdiv_f32(npyv_b32 m, npyv_f32 a, npyv_f32 b, npyv_f32 c)
+{ return _mm512_mask_div_ps(c, m, a, b); }
+// conditional division, m ? a / b : 0
+NPY_FINLINE npyv_f32 npyv_ifdivz_f32(npyv_b32 m, npyv_f32 a, npyv_f32 b)
+{ return _mm512_maskz_div_ps(m, a, b); }
+// division, m ? a / b : c
+NPY_FINLINE npyv_f64 npyv_ifdiv_f64(npyv_b32 m, npyv_f64 a, npyv_f64 b, npyv_f64 c)
+{ return _mm512_mask_div_pd(c, m, a, b); }
+// conditional division, m ? a / b : 0
+NPY_FINLINE npyv_f64 npyv_ifdivz_f64(npyv_b32 m, npyv_f64 a, npyv_f64 b)
+{ return _mm512_maskz_div_pd(m, a, b); }
+
#endif // _NPY_SIMD_AVX512_MASKOP_H
diff --git a/numpy/core/src/common/simd/avx512/memory.h b/numpy/core/src/common/simd/avx512/memory.h
index 03fcb4630..fdf96a92c 100644
--- a/numpy/core/src/common/simd/avx512/memory.h
+++ b/numpy/core/src/common/simd/avx512/memory.h
@@ -120,6 +120,52 @@ NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
{ return npyv_loadn_u64((const npy_uint64*)ptr, stride); }
NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride)
{ return _mm512_castsi512_pd(npyv_loadn_u64((const npy_uint64*)ptr, stride)); }
+
+//// 64-bit load over 32-bit stride
+NPY_FINLINE npyv_u32 npyv_loadn2_u32(const npy_uint32 *ptr, npy_intp stride)
+{
+ __m128d a = _mm_loadh_pd(
+ _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)ptr)),
+ (const double*)(ptr + stride)
+ );
+ __m128d b = _mm_loadh_pd(
+ _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)(ptr + stride*2))),
+ (const double*)(ptr + stride*3)
+ );
+ __m128d c = _mm_loadh_pd(
+ _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)(ptr + stride*4))),
+ (const double*)(ptr + stride*5)
+ );
+ __m128d d = _mm_loadh_pd(
+ _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)(ptr + stride*6))),
+ (const double*)(ptr + stride*7)
+ );
+ return _mm512_castpd_si512(npyv512_combine_pd256(
+ _mm256_insertf128_pd(_mm256_castpd128_pd256(a), b, 1),
+ _mm256_insertf128_pd(_mm256_castpd128_pd256(c), d, 1)
+ ));
+}
+NPY_FINLINE npyv_s32 npyv_loadn2_s32(const npy_int32 *ptr, npy_intp stride)
+{ return npyv_loadn2_u32((const npy_uint32*)ptr, stride); }
+NPY_FINLINE npyv_f32 npyv_loadn2_f32(const float *ptr, npy_intp stride)
+{ return _mm512_castsi512_ps(npyv_loadn2_u32((const npy_uint32*)ptr, stride)); }
+
+//// 128-bit load over 64-bit stride
+NPY_FINLINE npyv_f64 npyv_loadn2_f64(const double *ptr, npy_intp stride)
+{
+ __m128d a = _mm_loadu_pd(ptr);
+ __m128d b = _mm_loadu_pd(ptr + stride);
+ __m128d c = _mm_loadu_pd(ptr + stride * 2);
+ __m128d d = _mm_loadu_pd(ptr + stride * 3);
+ return npyv512_combine_pd256(
+ _mm256_insertf128_pd(_mm256_castpd128_pd256(a), b, 1),
+ _mm256_insertf128_pd(_mm256_castpd128_pd256(c), d, 1)
+ );
+}
+NPY_FINLINE npyv_u64 npyv_loadn2_u64(const npy_uint64 *ptr, npy_intp stride)
+{ return npyv_reinterpret_u64_f64(npyv_loadn2_f64((const double*)ptr, stride)); }
+NPY_FINLINE npyv_s64 npyv_loadn2_s64(const npy_int64 *ptr, npy_intp stride)
+{ return npyv_loadn2_u64((const npy_uint64*)ptr, stride); }
/***************************
* Non-contiguous Store
***************************/
@@ -151,6 +197,48 @@ NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a)
{ npyv_storen_u64((npy_uint64*)ptr, stride, _mm512_castpd_si512(a)); }
+//// 64-bit store over 32-bit stride
+NPY_FINLINE void npyv_storen2_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a)
+{
+ __m256d lo = _mm512_castpd512_pd256(_mm512_castsi512_pd(a));
+ __m256d hi = _mm512_extractf64x4_pd(_mm512_castsi512_pd(a), 1);
+ __m128d e0 = _mm256_castpd256_pd128(lo);
+ __m128d e1 = _mm256_extractf128_pd(lo, 1);
+ __m128d e2 = _mm256_castpd256_pd128(hi);
+ __m128d e3 = _mm256_extractf128_pd(hi, 1);
+ _mm_storel_pd((double*)(ptr + stride * 0), e0);
+ _mm_storeh_pd((double*)(ptr + stride * 1), e0);
+ _mm_storel_pd((double*)(ptr + stride * 2), e1);
+ _mm_storeh_pd((double*)(ptr + stride * 3), e1);
+ _mm_storel_pd((double*)(ptr + stride * 4), e2);
+ _mm_storeh_pd((double*)(ptr + stride * 5), e2);
+ _mm_storel_pd((double*)(ptr + stride * 6), e3);
+ _mm_storeh_pd((double*)(ptr + stride * 7), e3);
+}
+NPY_FINLINE void npyv_storen2_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a)
+{ npyv_storen2_u32((npy_uint32*)ptr, stride, a); }
+NPY_FINLINE void npyv_storen2_f32(float *ptr, npy_intp stride, npyv_f32 a)
+{ npyv_storen2_u32((npy_uint32*)ptr, stride, _mm512_castps_si512(a)); }
+
+//// 128-bit store over 64-bit stride
+NPY_FINLINE void npyv_storen2_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
+{
+ __m256i lo = npyv512_lower_si256(a);
+ __m256i hi = npyv512_higher_si256(a);
+ __m128i e0 = _mm256_castsi256_si128(lo);
+ __m128i e1 = _mm256_extracti128_si256(lo, 1);
+ __m128i e2 = _mm256_castsi256_si128(hi);
+ __m128i e3 = _mm256_extracti128_si256(hi, 1);
+ _mm_storeu_si128((__m128i*)(ptr + stride * 0), e0);
+ _mm_storeu_si128((__m128i*)(ptr + stride * 1), e1);
+ _mm_storeu_si128((__m128i*)(ptr + stride * 2), e2);
+ _mm_storeu_si128((__m128i*)(ptr + stride * 3), e3);
+}
+NPY_FINLINE void npyv_storen2_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
+{ npyv_storen2_u64((npy_uint64*)ptr, stride, a); }
+NPY_FINLINE void npyv_storen2_f64(double *ptr, npy_intp stride, npyv_f64 a)
+{ npyv_storen2_u64((npy_uint64*)ptr, stride, _mm512_castpd_si512(a)); }
+
/*********************************
* Partial Load
*********************************/
@@ -159,14 +247,14 @@ NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, n
{
assert(nlane > 0);
const __m512i vfill = _mm512_set1_epi32(fill);
- const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
+ const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
return _mm512_mask_loadu_epi32(vfill, mask, (const __m512i*)ptr);
}
// fill zero to rest lanes
NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
{
assert(nlane > 0);
- const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
+ const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
return _mm512_maskz_loadu_epi32(mask, (const __m512i*)ptr);
}
//// 64
@@ -174,14 +262,44 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n
{
assert(nlane > 0);
const __m512i vfill = npyv_setall_s64(fill);
- const __mmask8 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
+ const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr);
}
// fill zero to rest lanes
NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
{
assert(nlane > 0);
- const __mmask8 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
+ const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
+ return _mm512_maskz_loadu_epi64(mask, (const __m512i*)ptr);
+}
+
+//// 64-bit nlane
+NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane,
+ npy_int32 fill_lo, npy_int32 fill_hi)
+{
+ assert(nlane > 0);
+ const __m512i vfill = _mm512_set4_epi32(fill_hi, fill_lo, fill_hi, fill_lo);
+ const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
+ return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
+{ return npyv_load_tillz_s64((const npy_int64*)ptr, nlane); }
+
+//// 128-bit nlane
+NPY_FINLINE npyv_u64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane,
+ npy_int64 fill_lo, npy_int64 fill_hi)
+{
+ assert(nlane > 0);
+ const __m512i vfill = _mm512_set4_epi64(fill_hi, fill_lo, fill_hi, fill_lo);
+ const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1;
+ return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
+{
+ assert(nlane > 0);
+ const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1;
return _mm512_maskz_loadu_epi64(mask, (const __m512i*)ptr);
}
/*********************************
@@ -198,7 +316,7 @@ npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_
);
const __m512i idx = _mm512_mullo_epi32(steps, _mm512_set1_epi32((int)stride));
const __m512i vfill = _mm512_set1_epi32(fill);
- const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
+ const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
return _mm512_mask_i32gather_epi32(vfill, mask, idx, (const __m512i*)ptr, 4);
}
// fill zero to rest lanes
@@ -215,13 +333,48 @@ npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_
4*stride, 5*stride, 6*stride, 7*stride
);
const __m512i vfill = npyv_setall_s64(fill);
- const __mmask8 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
+ const __mmask8 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 8);
}
// fill zero to rest lanes
NPY_FINLINE npyv_s64
npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
{ return npyv_loadn_till_s64(ptr, stride, nlane, 0); }
+
+//// 64-bit load over 32-bit stride
+NPY_FINLINE npyv_s64 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane,
+ npy_int32 fill_lo, npy_int32 fill_hi)
+{
+ assert(nlane > 0);
+ const __m512i idx = npyv_set_s64(
+ 0*stride, 1*stride, 2*stride, 3*stride,
+ 4*stride, 5*stride, 6*stride, 7*stride
+ );
+ const __m512i vfill = _mm512_set4_epi32(fill_hi, fill_lo, fill_hi, fill_lo);
+ const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
+ return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 4);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
+{ return npyv_loadn2_till_s32(ptr, stride, nlane, 0, 0); }
+
+//// 128-bit load over 64-bit stride
+NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane,
+ npy_int64 fill_lo, npy_int64 fill_hi)
+{
+ assert(nlane > 0);
+ const __m512i idx = npyv_set_s64(
+ 0, 1, stride, stride+1,
+ stride*2, stride*2+1, stride*3, stride*3+1
+ );
+ const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1;
+ const __m512i vfill = _mm512_set4_epi64(fill_hi, fill_lo, fill_hi, fill_lo);
+ return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 8);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
+{ return npyv_loadn2_till_s64(ptr, stride, nlane, 0, 0); }
+
/*********************************
* Partial store
*********************************/
@@ -229,14 +382,30 @@ npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a)
{
assert(nlane > 0);
- const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
+ const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
_mm512_mask_storeu_epi32((__m512i*)ptr, mask, a);
}
//// 64
NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
{
assert(nlane > 0);
- const __mmask8 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
+ const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
+ _mm512_mask_storeu_epi64((__m512i*)ptr, mask, a);
+}
+
+//// 64-bit nlane
+NPY_FINLINE void npyv_store2_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+ const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
+ _mm512_mask_storeu_epi64((__m512i*)ptr, mask, a);
+}
+
+//// 128-bit nlane
+NPY_FINLINE void npyv_store2_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0);
+ const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1;
_mm512_mask_storeu_epi64((__m512i*)ptr, mask, a);
}
/*********************************
@@ -251,7 +420,7 @@ NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
);
const __m512i idx = _mm512_mullo_epi32(steps, _mm512_set1_epi32((int)stride));
- const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
+ const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
_mm512_mask_i32scatter_epi32((__m512i*)ptr, mask, idx, a, 4);
}
//// 64
@@ -262,7 +431,31 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
0*stride, 1*stride, 2*stride, 3*stride,
4*stride, 5*stride, 6*stride, 7*stride
);
- const __mmask8 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
+ const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
+ _mm512_mask_i64scatter_epi64((__m512i*)ptr, mask, idx, a, 8);
+}
+
+//// 64-bit store over 32-bit stride
+NPY_FINLINE void npyv_storen2_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+ const __m512i idx = npyv_set_s64(
+ 0*stride, 1*stride, 2*stride, 3*stride,
+ 4*stride, 5*stride, 6*stride, 7*stride
+ );
+ const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
+ _mm512_mask_i64scatter_epi64((__m512i*)ptr, mask, idx, a, 4);
+}
+
+//// 128-bit store over 64-bit stride
+NPY_FINLINE void npyv_storen2_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0);
+ const __m512i idx = npyv_set_s64(
+ 0, 1, stride, stride+1,
+ 2*stride, 2*stride+1, 3*stride, 3*stride+1
+ );
+ const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1;
_mm512_mask_i64scatter_epi64((__m512i*)ptr, mask, idx, a, 8);
}
@@ -331,6 +524,110 @@ NPYV_IMPL_AVX512_REST_PARTIAL_TYPES(f32, s32)
NPYV_IMPL_AVX512_REST_PARTIAL_TYPES(u64, s64)
NPYV_IMPL_AVX512_REST_PARTIAL_TYPES(f64, s64)
+// 128-bit/64-bit stride (pair load/store)
+#define NPYV_IMPL_AVX512_REST_PARTIAL_TYPES_PAIR(F_SFX, T_SFX) \
+ NPY_FINLINE npyv_##F_SFX npyv_load2_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, \
+ npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \
+ { \
+ union pun { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ }; \
+ union pun pun_lo; \
+ union pun pun_hi; \
+ pun_lo.from_##F_SFX = fill_lo; \
+ pun_hi.from_##F_SFX = fill_hi; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane, pun_lo.to_##T_SFX, pun_hi.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn2_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \
+ npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \
+ { \
+ union pun { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ }; \
+ union pun pun_lo; \
+ union pun pun_hi; \
+ pun_lo.from_##F_SFX = fill_lo; \
+ pun_hi.from_##F_SFX = fill_hi; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun_lo.to_##T_SFX, \
+ pun_hi.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_load2_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn2_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \
+ )); \
+ } \
+ NPY_FINLINE void npyv_store2_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_store2_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ } \
+ NPY_FINLINE void npyv_storen2_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_storen2_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ }
+
+NPYV_IMPL_AVX512_REST_PARTIAL_TYPES_PAIR(u32, s32)
+NPYV_IMPL_AVX512_REST_PARTIAL_TYPES_PAIR(f32, s32)
+NPYV_IMPL_AVX512_REST_PARTIAL_TYPES_PAIR(u64, s64)
+NPYV_IMPL_AVX512_REST_PARTIAL_TYPES_PAIR(f64, s64)
+
+/************************************************************
+ * de-interleave load / interleave contiguous store
+ ************************************************************/
+// two channels
+#define NPYV_IMPL_AVX512_MEM_INTERLEAVE(SFX, ZSFX) \
+ NPY_FINLINE npyv_##ZSFX##x2 npyv_zip_##ZSFX(npyv_##ZSFX, npyv_##ZSFX); \
+ NPY_FINLINE npyv_##ZSFX##x2 npyv_unzip_##ZSFX(npyv_##ZSFX, npyv_##ZSFX); \
+ NPY_FINLINE npyv_##SFX##x2 npyv_load_##SFX##x2( \
+ const npyv_lanetype_##SFX *ptr \
+ ) { \
+ return npyv_unzip_##ZSFX( \
+ npyv_load_##SFX(ptr), npyv_load_##SFX(ptr+npyv_nlanes_##SFX) \
+ ); \
+ } \
+ NPY_FINLINE void npyv_store_##SFX##x2( \
+ npyv_lanetype_##SFX *ptr, npyv_##SFX##x2 v \
+ ) { \
+ npyv_##SFX##x2 zip = npyv_zip_##ZSFX(v.val[0], v.val[1]); \
+ npyv_store_##SFX(ptr, zip.val[0]); \
+ npyv_store_##SFX(ptr + npyv_nlanes_##SFX, zip.val[1]); \
+ }
+
+NPYV_IMPL_AVX512_MEM_INTERLEAVE(u8, u8)
+NPYV_IMPL_AVX512_MEM_INTERLEAVE(s8, u8)
+NPYV_IMPL_AVX512_MEM_INTERLEAVE(u16, u16)
+NPYV_IMPL_AVX512_MEM_INTERLEAVE(s16, u16)
+NPYV_IMPL_AVX512_MEM_INTERLEAVE(u32, u32)
+NPYV_IMPL_AVX512_MEM_INTERLEAVE(s32, u32)
+NPYV_IMPL_AVX512_MEM_INTERLEAVE(u64, u64)
+NPYV_IMPL_AVX512_MEM_INTERLEAVE(s64, u64)
+NPYV_IMPL_AVX512_MEM_INTERLEAVE(f32, f32)
+NPYV_IMPL_AVX512_MEM_INTERLEAVE(f64, f64)
+
/**************************************************
* Lookup table
*************************************************/
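
The mask fixes above clamp nlane at the real lane count (16 for 32-bit lanes, 8 for 64-bit lanes) so the shift used to build the partial-load mask never exceeds the mask width. A small sketch of the corrected mask logic in plain C:

#include <stdio.h>
#include <stdint.h>

/* Build a "first nlane lanes active" bit mask for a vector of nlanes lanes
 * (nlanes is 16 or 8 here, matching __mmask16 / __mmask8). */
static uint16_t
partial_mask(unsigned nlane, unsigned nlanes)
{
    /* Clamp first so the shift amount stays below the mask width. */
    if (nlane >= nlanes) {
        return (uint16_t)((1u << nlanes) - 1);
    }
    return (uint16_t)((1u << nlane) - 1);
}

int main(void)
{
    printf("0x%04x\n", partial_mask(3, 16));   /* 0x0007 */
    printf("0x%04x\n", partial_mask(20, 16));  /* 0xffff */
    printf("0x%02x\n", partial_mask(5, 8));    /* 0x1f   */
    return 0;
}
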
diff --git a/numpy/core/src/common/simd/avx512/reorder.h b/numpy/core/src/common/simd/avx512/reorder.h
index c0b2477f3..27e66b5e7 100644
--- a/numpy/core/src/common/simd/avx512/reorder.h
+++ b/numpy/core/src/common/simd/avx512/reorder.h
@@ -167,6 +167,140 @@ NPY_FINLINE npyv_f64x2 npyv_zip_f64(__m512d a, __m512d b)
return r;
}
+// deinterleave two vectors
+NPY_FINLINE npyv_u8x2 npyv_unzip_u8(npyv_u8 ab0, npyv_u8 ab1)
+{
+ npyv_u8x2 r;
+#ifdef NPY_HAVE_AVX512VBMI
+ const __m512i idx_a = npyv_set_u8(
+ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
+ 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
+ 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94,
+ 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126
+ );
+ const __m512i idx_b = npyv_set_u8(
+ 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
+ 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63,
+ 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95,
+ 97, 99, 101, 103, 105, 107, 109, 111, 113, 115, 117, 119, 121, 123, 125, 127
+ );
+ r.val[0] = _mm512_permutex2var_epi8(ab0, idx_a, ab1);
+ r.val[1] = _mm512_permutex2var_epi8(ab0, idx_b, ab1);
+#else
+ #ifdef NPY_HAVE_AVX512BW
+ const __m512i idx = npyv_set_u8(
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+ );
+ __m512i abl = _mm512_shuffle_epi8(ab0, idx);
+ __m512i abh = _mm512_shuffle_epi8(ab1, idx);
+ #else
+ const __m256i idx = _mm256_setr_epi8(
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+ );
+ __m256i abl_lo = _mm256_shuffle_epi8(npyv512_lower_si256(ab0), idx);
+ __m256i abl_hi = _mm256_shuffle_epi8(npyv512_higher_si256(ab0), idx);
+ __m256i abh_lo = _mm256_shuffle_epi8(npyv512_lower_si256(ab1), idx);
+ __m256i abh_hi = _mm256_shuffle_epi8(npyv512_higher_si256(ab1), idx);
+ __m512i abl = npyv512_combine_si256(abl_lo, abl_hi);
+ __m512i abh = npyv512_combine_si256(abh_lo, abh_hi);
+ #endif
+ const __m512i idx_a = npyv_set_u64(0, 2, 4, 6, 8, 10, 12, 14);
+ const __m512i idx_b = npyv_set_u64(1, 3, 5, 7, 9, 11, 13, 15);
+ r.val[0] = _mm512_permutex2var_epi64(abl, idx_a, abh);
+ r.val[1] = _mm512_permutex2var_epi64(abl, idx_b, abh);
+#endif
+ return r;
+}
+#define npyv_unzip_s8 npyv_unzip_u8
+
+NPY_FINLINE npyv_u16x2 npyv_unzip_u16(npyv_u16 ab0, npyv_u16 ab1)
+{
+ npyv_u16x2 r;
+#ifdef NPY_HAVE_AVX512BW
+ const __m512i idx_a = npyv_set_u16(
+ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
+ 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
+ );
+ const __m512i idx_b = npyv_set_u16(
+ 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
+ 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63
+ );
+ r.val[0] = _mm512_permutex2var_epi16(ab0, idx_a, ab1);
+ r.val[1] = _mm512_permutex2var_epi16(ab0, idx_b, ab1);
+#else
+ const __m256i idx = _mm256_setr_epi8(
+ 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15,
+ 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15
+ );
+ __m256i abl_lo = _mm256_shuffle_epi8(npyv512_lower_si256(ab0), idx);
+ __m256i abl_hi = _mm256_shuffle_epi8(npyv512_higher_si256(ab0), idx);
+ __m256i abh_lo = _mm256_shuffle_epi8(npyv512_lower_si256(ab1), idx);
+ __m256i abh_hi = _mm256_shuffle_epi8(npyv512_higher_si256(ab1), idx);
+ __m512i abl = npyv512_combine_si256(abl_lo, abl_hi);
+ __m512i abh = npyv512_combine_si256(abh_lo, abh_hi);
+
+ const __m512i idx_a = npyv_set_u64(0, 2, 4, 6, 8, 10, 12, 14);
+ const __m512i idx_b = npyv_set_u64(1, 3, 5, 7, 9, 11, 13, 15);
+ r.val[0] = _mm512_permutex2var_epi64(abl, idx_a, abh);
+ r.val[1] = _mm512_permutex2var_epi64(abl, idx_b, abh);
+#endif
+ return r;
+}
+#define npyv_unzip_s16 npyv_unzip_u16
+
+NPY_FINLINE npyv_u32x2 npyv_unzip_u32(npyv_u32 ab0, npyv_u32 ab1)
+{
+ const __m512i idx_a = npyv_set_u32(
+ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ );
+ const __m512i idx_b = npyv_set_u32(
+ 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+ );
+ npyv_u32x2 r;
+ r.val[0] = _mm512_permutex2var_epi32(ab0, idx_a, ab1);
+ r.val[1] = _mm512_permutex2var_epi32(ab0, idx_b, ab1);
+ return r;
+}
+#define npyv_unzip_s32 npyv_unzip_u32
+
+NPY_FINLINE npyv_u64x2 npyv_unzip_u64(npyv_u64 ab0, npyv_u64 ab1)
+{
+ const __m512i idx_a = npyv_set_u64(0, 2, 4, 6, 8, 10, 12, 14);
+ const __m512i idx_b = npyv_set_u64(1, 3, 5, 7, 9, 11, 13, 15);
+ npyv_u64x2 r;
+ r.val[0] = _mm512_permutex2var_epi64(ab0, idx_a, ab1);
+ r.val[1] = _mm512_permutex2var_epi64(ab0, idx_b, ab1);
+ return r;
+}
+#define npyv_unzip_s64 npyv_unzip_u64
+
+NPY_FINLINE npyv_f32x2 npyv_unzip_f32(npyv_f32 ab0, npyv_f32 ab1)
+{
+ const __m512i idx_a = npyv_set_u32(
+ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ );
+ const __m512i idx_b = npyv_set_u32(
+ 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+ );
+ npyv_f32x2 r;
+ r.val[0] = _mm512_permutex2var_ps(ab0, idx_a, ab1);
+ r.val[1] = _mm512_permutex2var_ps(ab0, idx_b, ab1);
+ return r;
+}
+NPY_FINLINE npyv_f64x2 npyv_unzip_f64(npyv_f64 ab0, npyv_f64 ab1)
+{
+ const __m512i idx_a = npyv_set_u64(0, 2, 4, 6, 8, 10, 12, 14);
+ const __m512i idx_b = npyv_set_u64(1, 3, 5, 7, 9, 11, 13, 15);
+ npyv_f64x2 r;
+ r.val[0] = _mm512_permutex2var_pd(ab0, idx_a, ab1);
+ r.val[1] = _mm512_permutex2var_pd(ab0, idx_b, ab1);
+ return r;
+}
+
// Reverse elements of each 64-bit lane
NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a)
{
@@ -223,4 +357,22 @@ NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a)
return _mm512_shuffle_ps(a, a, (_MM_PERM_ENUM)_MM_SHUFFLE(2, 3, 0, 1));
}
+// Permuting the elements of each 128-bit lane by immediate index for
+// each element.
+#define npyv_permi128_u32(A, E0, E1, E2, E3) \
+ _mm512_shuffle_epi32(A, _MM_SHUFFLE(E3, E2, E1, E0))
+
+#define npyv_permi128_s32 npyv_permi128_u32
+
+#define npyv_permi128_u64(A, E0, E1) \
+ _mm512_shuffle_epi32(A, _MM_SHUFFLE(((E1)<<1)+1, ((E1)<<1), ((E0)<<1)+1, ((E0)<<1)))
+
+#define npyv_permi128_s64 npyv_permi128_u64
+
+#define npyv_permi128_f32(A, E0, E1, E2, E3) \
+ _mm512_permute_ps(A, _MM_SHUFFLE(E3, E2, E1, E0))
+
+#define npyv_permi128_f64(A, E0, E1) \
+ _mm512_permute_pd(A, (((E1)<<7) | ((E0)<<6) | ((E1)<<5) | ((E0)<<4) | ((E1)<<3) | ((E0)<<2) | ((E1)<<1) | (E0)))
+
#endif // _NPY_SIMD_AVX512_REORDER_H
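
npyv_permi128_* permutes elements independently within each 128-bit lane, using compile-time indices relative to that lane. A scalar model of the 32-bit variant, as an illustrative sketch in plain C:

#include <stdio.h>
#include <stdint.h>

/* Permute every group of four 32-bit elements (one 128-bit lane) by the
 * same immediate-style indices e0..e3, each in the range 0..3. */
static void
permi128_u32_ref(const uint32_t *in, uint32_t *out, int nlanes128,
                 int e0, int e1, int e2, int e3)
{
    for (int l = 0; l < nlanes128; ++l) {
        const uint32_t *src = in  + 4 * l;
        uint32_t       *dst = out + 4 * l;
        dst[0] = src[e0];
        dst[1] = src[e1];
        dst[2] = src[e2];
        dst[3] = src[e3];
    }
}

int main(void)
{
    uint32_t in[8] = {0, 1, 2, 3, 10, 11, 12, 13};  /* two 128-bit lanes */
    uint32_t out[8];
    permi128_u32_ref(in, out, 2, 3, 2, 1, 0);       /* reverse inside each lane */
    for (int i = 0; i < 8; ++i)
        printf("%u ", out[i]);   /* 3 2 1 0 13 12 11 10 */
    printf("\n");
    return 0;
}
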
diff --git a/numpy/core/src/common/simd/emulate_maskop.h b/numpy/core/src/common/simd/emulate_maskop.h
index 2a808a153..6627e5010 100644
--- a/numpy/core/src/common/simd/emulate_maskop.h
+++ b/numpy/core/src/common/simd/emulate_maskop.h
@@ -42,5 +42,39 @@ NPYV_IMPL_EMULATE_MASK_ADDSUB(s64, b64)
#if NPY_SIMD_F64
NPYV_IMPL_EMULATE_MASK_ADDSUB(f64, b64)
#endif
+#if NPY_SIMD_F32
+ // conditional division, m ? a / b : c
+ NPY_FINLINE npyv_f32
+ npyv_ifdiv_f32(npyv_b32 m, npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ {
+ const npyv_f32 one = npyv_setall_f32(1.0f);
+ npyv_f32 div = npyv_div_f32(a, npyv_select_f32(m, b, one));
+ return npyv_select_f32(m, div, c);
+ }
+ // conditional division, m ? a / b : 0
+ NPY_FINLINE npyv_f32
+ npyv_ifdivz_f32(npyv_b32 m, npyv_f32 a, npyv_f32 b)
+ {
+ const npyv_f32 zero = npyv_zero_f32();
+ return npyv_ifdiv_f32(m, a, b, zero);
+ }
+#endif
+#if NPY_SIMD_F64
+ // conditional division, m ? a / b : c
+ NPY_FINLINE npyv_f64
+ npyv_ifdiv_f64(npyv_b64 m, npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ {
+ const npyv_f64 one = npyv_setall_f64(1.0);
+ npyv_f64 div = npyv_div_f64(a, npyv_select_f64(m, b, one));
+ return npyv_select_f64(m, div, c);
+ }
+ // conditional division, m ? a / b : 0
+ NPY_FINLINE npyv_f64
+ npyv_ifdivz_f64(npyv_b64 m, npyv_f64 a, npyv_f64 b)
+ {
+ const npyv_f64 zero = npyv_zero_f64();
+ return npyv_ifdiv_f64(m, a, b, zero);
+ }
+#endif
#endif // _NPY_SIMD_EMULATE_MASKOP_H
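
The emulated npyv_ifdiv_* substitutes 1.0 for the divisor on inactive lanes before dividing, so masked-out lanes cannot raise divide-by-zero or invalid FP exceptions; their results are then discarded by the final select. A scalar sketch of the trick in plain C:

#include <stdio.h>

/* Emulated masked division without native per-lane masking:
 * divide by a "safe" divisor (1.0 on inactive lanes), then select. */
static void
ifdiv_emul_ref(const int *m, const float *a, const float *b, const float *c,
               float *out, int n)
{
    for (int i = 0; i < n; ++i) {
        float safe_b = m[i] ? b[i] : 1.0f;   /* never 0 on inactive lanes */
        float div    = a[i] / safe_b;        /* computed on every lane    */
        out[i]       = m[i] ? div : c[i];    /* inactive lanes take c     */
    }
}

int main(void)
{
    int   m[4] = {1, 0, 1, 0};
    float a[4] = {8, 8, 9, 9}, b[4] = {2, 0, 3, 0}, c[4] = {7, 7, 7, 7};
    float out[4];
    ifdiv_emul_ref(m, a, b, c, out, 4);
    for (int i = 0; i < 4; ++i)
        printf("%g ", out[i]);   /* 4 7 3 7 */
    printf("\n");
    return 0;
}
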
diff --git a/numpy/core/src/common/simd/neon/arithmetic.h b/numpy/core/src/common/simd/neon/arithmetic.h
index 00994806d..683620111 100644
--- a/numpy/core/src/common/simd/neon/arithmetic.h
+++ b/numpy/core/src/common/simd/neon/arithmetic.h
@@ -265,6 +265,14 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
NPY_FINLINE npyv_f32 npyv_nmulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
{ return vmlsq_f32(vnegq_f32(c), a, b); }
#endif
+// multiply, then add c to odd-indexed elements and subtract it from even-indexed elements.
+// (a * b) -+ c
+NPY_FINLINE npyv_f32 npyv_muladdsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+{
+ const npyv_f32 msign = npyv_set_f32(-0.0f, 0.0f, -0.0f, 0.0f);
+ return npyv_muladd_f32(a, b, npyv_xor_f32(msign, c));
+}
+
/***************************
* FUSED F64
***************************/
@@ -277,6 +285,11 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
{ return vfmsq_f64(c, a, b); }
NPY_FINLINE npyv_f64 npyv_nmulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
{ return vfmsq_f64(vnegq_f64(c), a, b); }
+ NPY_FINLINE npyv_f64 npyv_muladdsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ {
+ const npyv_f64 msign = npyv_set_f64(-0.0, 0.0);
+ return npyv_muladd_f64(a, b, npyv_xor_f64(msign, c));
+ }
#endif // NPY_SIMD_F64
/***************************
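
Without a native fmaddsub, the NEON path derives (a * b) -+ c from a single fused multiply-add by flipping the sign of c on even-indexed lanes: XOR with the bit pattern of -0.0 toggles only the sign bit. A scalar sketch of that sign-flip trick in plain C:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* Flip (or keep) the sign bit of x by XOR-ing its bit pattern with sign. */
static float
xor_sign(float x, float sign)
{
    uint32_t xi, si;
    memcpy(&xi, &x, sizeof xi);
    memcpy(&si, &sign, sizeof si);
    xi ^= si;                    /* -0.0f is 0x80000000: only the sign bit set */
    memcpy(&x, &xi, sizeof xi);
    return x;
}

int main(void)
{
    float a[4] = {1, 2, 3, 4}, b[4] = {10, 10, 10, 10}, c[4] = {1, 1, 1, 1};
    float msign[4] = {-0.0f, 0.0f, -0.0f, 0.0f};
    float out[4];
    for (int i = 0; i < 4; ++i) {
        /* muladd(a, b, c ^ msign): even lanes give a*b - c, odd lanes a*b + c */
        out[i] = a[i] * b[i] + xor_sign(c[i], msign[i]);
    }
    for (int i = 0; i < 4; ++i)
        printf("%g ", out[i]);   /* 9 21 29 41 */
    printf("\n");
    return 0;
}
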
diff --git a/numpy/core/src/common/simd/neon/math.h b/numpy/core/src/common/simd/neon/math.h
index c0a771b5d..58d14809f 100644
--- a/numpy/core/src/common/simd/neon/math.h
+++ b/numpy/core/src/common/simd/neon/math.h
@@ -278,20 +278,25 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a)
return vrndnq_f32(a);
#else
// ARMv7 NEON only supports fp to int truncate conversion.
- // a magic trick of adding 1.5 * 2**23 is used for rounding
+ // a magic trick of adding 1.5 * 2^23 is used for rounding
// to nearest even and then subtract this magic number to get
// the integer.
- const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f));
- const npyv_f32 magic = vdupq_n_f32(12582912.0f); // 1.5 * 2**23
- npyv_f32 round = vsubq_f32(vaddq_f32(a, magic), magic);
- npyv_b32 overflow = vcleq_f32(vabsq_f32(a), vreinterpretq_f32_u32(vdupq_n_u32(0x4b000000)));
- round = vbslq_f32(overflow, round, a);
- // signed zero
- round = vreinterpretq_f32_s32(vorrq_s32(
- vreinterpretq_s32_f32(round),
- vandq_s32(vreinterpretq_s32_f32(a), szero)
- ));
- return round;
+ //
+ const npyv_u32 szero = vreinterpretq_u32_f32(vdupq_n_f32(-0.0f));
+ const npyv_u32 sign_mask = vandq_u32(vreinterpretq_u32_f32(a), szero);
+ const npyv_f32 two_power_23 = vdupq_n_f32(8388608.0f); // 2^23
+ const npyv_f32 two_power_23h = vdupq_n_f32(12582912.0f); // 1.5 * 2^23
+ npyv_u32 nnan_mask = vceqq_f32(a, a);
+ // eliminate nans to avoid invalid fp errors
+ npyv_f32 abs_x = vabsq_f32(vreinterpretq_f32_u32(vandq_u32(nnan_mask, vreinterpretq_u32_f32(a))));
+ // round by add magic number 1.5 * 2^23
+ npyv_f32 round = vsubq_f32(vaddq_f32(two_power_23h, abs_x), two_power_23h);
+ // copysign
+ round = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(round), sign_mask));
+ // a if |a| >= 2^23 or a == NaN
+ npyv_u32 mask = vcleq_f32(abs_x, two_power_23);
+ mask = vandq_u32(mask, nnan_mask);
+ return vbslq_f32(mask, round, a);
#endif
}
#if NPY_SIMD_F64
@@ -302,33 +307,30 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a)
#ifdef NPY_HAVE_ASIMD
#define npyv_ceil_f32 vrndpq_f32
#else
- NPY_FINLINE npyv_f32 npyv_ceil_f32(npyv_f32 a)
- {
- const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f));
+ NPY_FINLINE npyv_f32 npyv_ceil_f32(npyv_f32 a)
+ {
const npyv_u32 one = vreinterpretq_u32_f32(vdupq_n_f32(1.0f));
- const npyv_s32 max_int = vdupq_n_s32(0x7fffffff);
- /**
- * On armv7, vcvtq.f32 handles special cases as follows:
- * NaN return 0
- * +inf or +outrange return 0x80000000(-0.0f)
- * -inf or -outrange return 0x7fffffff(nan)
- */
- npyv_s32 roundi = vcvtq_s32_f32(a);
- npyv_f32 round = vcvtq_f32_s32(roundi);
+ const npyv_u32 szero = vreinterpretq_u32_f32(vdupq_n_f32(-0.0f));
+ const npyv_u32 sign_mask = vandq_u32(vreinterpretq_u32_f32(a), szero);
+ const npyv_f32 two_power_23 = vdupq_n_f32(8388608.0f); // 2^23
+ const npyv_f32 two_power_23h = vdupq_n_f32(12582912.0f); // 1.5 * 2^23
+ npyv_u32 nnan_mask = vceqq_f32(a, a);
+ npyv_f32 x = vreinterpretq_f32_u32(vandq_u32(nnan_mask, vreinterpretq_u32_f32(a)));
+ // eliminate nans to avoid invalid fp errors
+ npyv_f32 abs_x = vabsq_f32(x);
+ // round by add magic number 1.5 * 2^23
+ npyv_f32 round = vsubq_f32(vaddq_f32(two_power_23h, abs_x), two_power_23h);
+ // copysign
+ round = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(round), sign_mask));
npyv_f32 ceil = vaddq_f32(round, vreinterpretq_f32_u32(
- vandq_u32(vcltq_f32(round, a), one))
- );
- // respect signed zero, e.g. -0.5 -> -0.0
- npyv_f32 rzero = vreinterpretq_f32_s32(vorrq_s32(
- vreinterpretq_s32_f32(ceil),
- vandq_s32(vreinterpretq_s32_f32(a), szero)
- ));
- // if nan or overflow return a
- npyv_u32 nnan = npyv_notnan_f32(a);
- npyv_u32 overflow = vorrq_u32(
- vceqq_s32(roundi, szero), vceqq_s32(roundi, max_int)
+ vandq_u32(vcltq_f32(round, x), one))
);
- return vbslq_f32(vbicq_u32(nnan, overflow), rzero, a);
+ // respects signed zero
+ ceil = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(ceil), sign_mask));
+ // a if |a| >= 2^23 or a == NaN
+ npyv_u32 mask = vcleq_f32(abs_x, two_power_23);
+ mask = vandq_u32(mask, nnan_mask);
+ return vbslq_f32(mask, ceil, a);
}
#endif
#if NPY_SIMD_F64
@@ -339,29 +341,37 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a)
#ifdef NPY_HAVE_ASIMD
#define npyv_trunc_f32 vrndq_f32
#else
- NPY_FINLINE npyv_f32 npyv_trunc_f32(npyv_f32 a)
- {
- const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f));
+ NPY_FINLINE npyv_f32 npyv_trunc_f32(npyv_f32 a)
+ {
const npyv_s32 max_int = vdupq_n_s32(0x7fffffff);
+ const npyv_u32 exp_mask = vdupq_n_u32(0xff000000);
+ const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f));
+ const npyv_u32 sign_mask = vandq_u32(
+ vreinterpretq_u32_f32(a), vreinterpretq_u32_s32(szero));
+
+ npyv_u32 nfinite_mask = vshlq_n_u32(vreinterpretq_u32_f32(a), 1);
+ nfinite_mask = vandq_u32(nfinite_mask, exp_mask);
+ nfinite_mask = vceqq_u32(nfinite_mask, exp_mask);
+ // eliminate nans/inf to avoid invalid fp errors
+ npyv_f32 x = vreinterpretq_f32_u32(
+ veorq_u32(nfinite_mask, vreinterpretq_u32_f32(a)));
/**
* On armv7, vcvtq.f32 handles special cases as follows:
* NaN return 0
* +inf or +outrange return 0x80000000(-0.0f)
* -inf or -outrange return 0x7fffffff(nan)
*/
- npyv_s32 roundi = vcvtq_s32_f32(a);
- npyv_f32 round = vcvtq_f32_s32(roundi);
+ npyv_s32 trunci = vcvtq_s32_f32(x);
+ npyv_f32 trunc = vcvtq_f32_s32(trunci);
// respect signed zero, e.g. -0.5 -> -0.0
- npyv_f32 rzero = vreinterpretq_f32_s32(vorrq_s32(
- vreinterpretq_s32_f32(round),
- vandq_s32(vreinterpretq_s32_f32(a), szero)
- ));
- // if nan or overflow return a
- npyv_u32 nnan = npyv_notnan_f32(a);
- npyv_u32 overflow = vorrq_u32(
- vceqq_s32(roundi, szero), vceqq_s32(roundi, max_int)
+ trunc = vreinterpretq_f32_u32(
+ vorrq_u32(vreinterpretq_u32_f32(trunc), sign_mask));
+ // if overflow return a
+ npyv_u32 overflow_mask = vorrq_u32(
+ vceqq_s32(trunci, szero), vceqq_s32(trunci, max_int)
);
- return vbslq_f32(vbicq_u32(nnan, overflow), rzero, a);
+ // a if a overflows or is nonfinite
+ return vbslq_f32(vorrq_u32(nfinite_mask, overflow_mask), a, trunc);
}
#endif
#if NPY_SIMD_F64
@@ -372,28 +382,31 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a)
#ifdef NPY_HAVE_ASIMD
#define npyv_floor_f32 vrndmq_f32
#else
- NPY_FINLINE npyv_f32 npyv_floor_f32(npyv_f32 a)
- {
- const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f));
+ NPY_FINLINE npyv_f32 npyv_floor_f32(npyv_f32 a)
+ {
const npyv_u32 one = vreinterpretq_u32_f32(vdupq_n_f32(1.0f));
- const npyv_s32 max_int = vdupq_n_s32(0x7fffffff);
+ const npyv_u32 szero = vreinterpretq_u32_f32(vdupq_n_f32(-0.0f));
+ const npyv_u32 sign_mask = vandq_u32(vreinterpretq_u32_f32(a), szero);
+ const npyv_f32 two_power_23 = vdupq_n_f32(8388608.0f); // 2^23
+ const npyv_f32 two_power_23h = vdupq_n_f32(12582912.0f); // 1.5 * 2^23
- npyv_s32 roundi = vcvtq_s32_f32(a);
- npyv_f32 round = vcvtq_f32_s32(roundi);
+ npyv_u32 nnan_mask = vceqq_f32(a, a);
+ npyv_f32 x = vreinterpretq_f32_u32(vandq_u32(nnan_mask, vreinterpretq_u32_f32(a)));
+ // eliminate nans to avoid invalid fp errors
+ npyv_f32 abs_x = vabsq_f32(x);
+ // round by add magic number 1.5 * 2^23
+ npyv_f32 round = vsubq_f32(vaddq_f32(two_power_23h, abs_x), two_power_23h);
+ // copysign
+ round = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(round), sign_mask));
npyv_f32 floor = vsubq_f32(round, vreinterpretq_f32_u32(
- vandq_u32(vcgtq_f32(round, a), one)
- ));
- // respect signed zero
- npyv_f32 rzero = vreinterpretq_f32_s32(vorrq_s32(
- vreinterpretq_s32_f32(floor),
- vandq_s32(vreinterpretq_s32_f32(a), szero)
+ vandq_u32(vcgtq_f32(round, x), one)
));
- npyv_u32 nnan = npyv_notnan_f32(a);
- npyv_u32 overflow = vorrq_u32(
- vceqq_s32(roundi, szero), vceqq_s32(roundi, max_int)
- );
-
- return vbslq_f32(vbicq_u32(nnan, overflow), rzero, a);
+ // respects signed zero
+ floor = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(floor), sign_mask));
+ // a if |a| >= 2^23 or a == NaN
+ npyv_u32 mask = vcleq_f32(abs_x, two_power_23);
+ mask = vandq_u32(mask, nnan_mask);
+ return vbslq_f32(mask, floor, a);
}
#endif // NPY_HAVE_ASIMD
#if NPY_SIMD_F64
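
A scalar sketch of the magic-number rounding used in the ARMv7 fallbacks above. It assumes the default round-to-nearest-even FP mode; values with |x| >= 2^23 (already integral) and NaNs are returned unchanged, which is what the vcleq/vbslq selection does in the vector code.

#include <math.h>

static float rint_magic(float x)
{
    const float magic = 12582912.0f;   // 1.5 * 2^23
    float ax = fabsf(x);
    if (!(ax < 8388608.0f)) {          // |x| >= 2^23, or x is NaN
        return x;
    }
    float r = (ax + magic) - magic;    // add/subtract of the magic number rounds to nearest even
    return copysignf(r, x);            // restore the sign, keeping -0.0f for negative inputs
}
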
diff --git a/numpy/core/src/common/simd/neon/memory.h b/numpy/core/src/common/simd/neon/memory.h
index 7060ea628..6163440c3 100644
--- a/numpy/core/src/common/simd/neon/memory.h
+++ b/numpy/core/src/common/simd/neon/memory.h
@@ -102,6 +102,29 @@ NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride)
);
}
#endif
+
+//// 64-bit load over 32-bit stride
+NPY_FINLINE npyv_u32 npyv_loadn2_u32(const npy_uint32 *ptr, npy_intp stride)
+{
+ return vcombine_u32(
+ vld1_u32((const uint32_t*)ptr), vld1_u32((const uint32_t*)ptr + stride)
+ );
+}
+NPY_FINLINE npyv_s32 npyv_loadn2_s32(const npy_int32 *ptr, npy_intp stride)
+{ return npyv_reinterpret_s32_u32(npyv_loadn2_u32((const npy_uint32*)ptr, stride)); }
+NPY_FINLINE npyv_f32 npyv_loadn2_f32(const float *ptr, npy_intp stride)
+{ return npyv_reinterpret_f32_u32(npyv_loadn2_u32((const npy_uint32*)ptr, stride)); }
+
+//// 128-bit load over 64-bit stride
+NPY_FINLINE npyv_u64 npyv_loadn2_u64(const npy_uint64 *ptr, npy_intp stride)
+{ (void)stride; return npyv_load_u64(ptr); }
+NPY_FINLINE npyv_s64 npyv_loadn2_s64(const npy_int64 *ptr, npy_intp stride)
+{ (void)stride; return npyv_load_s64(ptr); }
+#if NPY_SIMD_F64
+NPY_FINLINE npyv_f64 npyv_loadn2_f64(const double *ptr, npy_intp stride)
+{ (void)stride; return npyv_load_f64(ptr); }
+#endif
+
/***************************
* Non-contiguous Store
***************************/
@@ -131,6 +154,32 @@ NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a)
{ npyv_storen_s64((npy_int64*)ptr, stride, npyv_reinterpret_s64_f64(a)); }
#endif
+//// 64-bit store over 32-bit stride
+NPY_FINLINE void npyv_storen2_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a)
+{
+#if NPY_SIMD_F64
+ vst1q_lane_u64((uint64_t*)ptr, npyv_reinterpret_u64_u32(a), 0);
+ vst1q_lane_u64((uint64_t*)(ptr + stride), npyv_reinterpret_u64_u32(a), 1);
+#else
+ // armhf is strict about alignment
+ vst1_u32((uint32_t*)ptr, vget_low_u32(a));
+ vst1_u32((uint32_t*)ptr + stride, vget_high_u32(a));
+#endif
+}
+NPY_FINLINE void npyv_storen2_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a)
+{ npyv_storen2_u32((npy_uint32*)ptr, stride, npyv_reinterpret_u32_s32(a)); }
+NPY_FINLINE void npyv_storen2_f32(float *ptr, npy_intp stride, npyv_f32 a)
+{ npyv_storen2_u32((npy_uint32*)ptr, stride, npyv_reinterpret_u32_f32(a)); }
+
+//// 128-bit store over 64-bit stride
+NPY_FINLINE void npyv_storen2_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
+{ (void)stride; npyv_store_u64(ptr, a); }
+NPY_FINLINE void npyv_storen2_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
+{ (void)stride; npyv_store_s64(ptr, a); }
+#if NPY_SIMD_F64
+NPY_FINLINE void npyv_storen2_f64(double *ptr, npy_intp stride, npyv_f64 a)
+{ (void)stride; npyv_store_f64(ptr, a); }
+#endif
/*********************************
* Partial Load
*********************************/
@@ -168,6 +217,29 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n
NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
{ return npyv_load_till_s64(ptr, nlane, 0); }
+//// 64-bit nlane
+NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane,
+ npy_int32 fill_lo, npy_int32 fill_hi)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ const int32_t NPY_DECL_ALIGNED(16) fill[2] = {fill_lo, fill_hi};
+ return vcombine_s32(vld1_s32((const int32_t*)ptr), vld1_s32(fill));
+ }
+ return npyv_load_s32(ptr);
+}
+// fill the remaining lanes with zero
+NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
+{ return vreinterpretq_s32_s64(npyv_load_tillz_s64((const npy_int64*)ptr, nlane)); }
+
+//// 128-bit nlane
+NPY_FINLINE npyv_s64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane,
+ npy_int64 fill_lo, npy_int64 fill_hi)
+{ (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); }
+
+NPY_FINLINE npyv_s64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
+{ (void)nlane; return npyv_load_s64(ptr); }
+
/*********************************
* Non-contiguous partial load
*********************************/
@@ -206,6 +278,34 @@ npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_
NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
{ return npyv_loadn_till_s64(ptr, stride, nlane, 0); }
+//// 64-bit load over 32-bit stride
+NPY_FINLINE npyv_s32 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane,
+ npy_int32 fill_lo, npy_int32 fill_hi)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ const int32_t NPY_DECL_ALIGNED(16) fill[2] = {fill_lo, fill_hi};
+ return vcombine_s32(vld1_s32((const int32_t*)ptr), vld1_s32(fill));
+ }
+ return npyv_loadn2_s32(ptr, stride);
+}
+NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ return vcombine_s32(vld1_s32((const int32_t*)ptr), vdup_n_s32(0));
+ }
+ return npyv_loadn2_s32(ptr, stride);
+}
+
+//// 128-bit load over 64-bit stride
+NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane,
+ npy_int64 fill_lo, npy_int64 fill_hi)
+{ assert(nlane > 0); (void)stride; (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); }
+
+NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
+{ assert(nlane > 0); (void)stride; (void)nlane; return npyv_load_s64(ptr); }
+
/*********************************
* Partial store
*********************************/
@@ -238,6 +338,30 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a
}
npyv_store_s64(ptr, a);
}
+
+//// 64-bit nlane
+NPY_FINLINE void npyv_store2_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ // armhf is strict about alignment; a misaligned store may cause a bus error
+ #if NPY_SIMD_F64
+ vst1q_lane_s64((int64_t*)ptr, npyv_reinterpret_s64_s32(a), 0);
+ #else
+ npyv_storel_s32(ptr, a);
+ #endif
+ return;
+ }
+ npyv_store_s32(ptr, a);
+}
+
+//// 128-bit nlane
+NPY_FINLINE void npyv_store2_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0); (void)nlane;
+ npyv_store_s64(ptr, a);
+}
+
/*********************************
* Non-contiguous partial store
*********************************/
@@ -245,16 +369,21 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a
NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
{
assert(nlane > 0);
+ vst1q_lane_s32((int32_t*)ptr, a, 0);
switch(nlane) {
- default:
- vst1q_lane_s32((int32_t*)ptr + stride*3, a, 3);
+ case 1:
+ return;
+ case 2:
+ vst1q_lane_s32((int32_t*)ptr + stride, a, 1);
+ return;
case 3:
+ vst1q_lane_s32((int32_t*)ptr + stride, a, 1);
vst1q_lane_s32((int32_t*)ptr + stride*2, a, 2);
- case 2:
+ return;
+ default:
vst1q_lane_s32((int32_t*)ptr + stride, a, 1);
- case 1:
- vst1q_lane_s32((int32_t*)ptr, a, 0);
- break;
+ vst1q_lane_s32((int32_t*)ptr + stride*2, a, 2);
+ vst1q_lane_s32((int32_t*)ptr + stride*3, a, 3);
}
}
//// 64
@@ -268,6 +397,27 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
npyv_storen_s64(ptr, stride, a);
}
+//// 64-bit store over 32-bit stride
+NPY_FINLINE void npyv_storen2_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+#if NPY_SIMD_F64
+ vst1q_lane_s64((int64_t*)ptr, npyv_reinterpret_s64_s32(a), 0);
+ if (nlane > 1) {
+ vst1q_lane_s64((int64_t*)(ptr + stride), npyv_reinterpret_s64_s32(a), 1);
+ }
+#else
+ npyv_storel_s32(ptr, a);
+ if (nlane > 1) {
+ npyv_storeh_s32(ptr + stride, a);
+ }
+#endif
+}
+
+//// 128-bit store over 64-bit stride
+NPY_FINLINE void npyv_storen2_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a)
+{ assert(nlane > 0); (void)stride; (void)nlane; npyv_store_s64(ptr, a); }
+
/*****************************************************************
* Implement partial load/store for u32/f32/u64/f64... via casting
*****************************************************************/
@@ -278,7 +428,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
union { \
npyv_lanetype_##F_SFX from_##F_SFX; \
npyv_lanetype_##T_SFX to_##T_SFX; \
- } pun = {.from_##F_SFX = fill}; \
+ } pun; \
+ pun.from_##F_SFX = fill; \
return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX( \
(const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX \
)); \
@@ -290,7 +441,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
union { \
npyv_lanetype_##F_SFX from_##F_SFX; \
npyv_lanetype_##T_SFX to_##T_SFX; \
- } pun = {.from_##F_SFX = fill}; \
+ } pun; \
+ pun.from_##F_SFX = fill; \
return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX( \
(const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX \
)); \
@@ -332,6 +484,131 @@ NPYV_IMPL_NEON_REST_PARTIAL_TYPES(u64, s64)
#if NPY_SIMD_F64
NPYV_IMPL_NEON_REST_PARTIAL_TYPES(f64, s64)
#endif
+
+// 128-bit/64-bit stride
+#define NPYV_IMPL_NEON_REST_PARTIAL_TYPES_PAIR(F_SFX, T_SFX) \
+ NPY_FINLINE npyv_##F_SFX npyv_load2_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, \
+ npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \
+ { \
+ union pun { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ }; \
+ union pun pun_lo; \
+ union pun pun_hi; \
+ pun_lo.from_##F_SFX = fill_lo; \
+ pun_hi.from_##F_SFX = fill_hi; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane, pun_lo.to_##T_SFX, pun_hi.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn2_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \
+ npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \
+ { \
+ union pun { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ }; \
+ union pun pun_lo; \
+ union pun pun_hi; \
+ pun_lo.from_##F_SFX = fill_lo; \
+ pun_hi.from_##F_SFX = fill_hi; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun_lo.to_##T_SFX, \
+ pun_hi.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_load2_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn2_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \
+ )); \
+ } \
+ NPY_FINLINE void npyv_store2_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_store2_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ } \
+ NPY_FINLINE void npyv_storen2_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_storen2_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ }
+
+NPYV_IMPL_NEON_REST_PARTIAL_TYPES_PAIR(u32, s32)
+NPYV_IMPL_NEON_REST_PARTIAL_TYPES_PAIR(f32, s32)
+NPYV_IMPL_NEON_REST_PARTIAL_TYPES_PAIR(u64, s64)
+#if NPY_SIMD_F64
+NPYV_IMPL_NEON_REST_PARTIAL_TYPES_PAIR(f64, s64)
+#endif
+
+/************************************************************
+ * de-interleave load / interleave contiguous store
+ ************************************************************/
+// two channels
+#define NPYV_IMPL_NEON_MEM_INTERLEAVE(SFX, T_PTR) \
+ NPY_FINLINE npyv_##SFX##x2 npyv_load_##SFX##x2( \
+ const npyv_lanetype_##SFX *ptr \
+ ) { \
+ return vld2q_##SFX((const T_PTR*)ptr); \
+ } \
+ NPY_FINLINE void npyv_store_##SFX##x2( \
+ npyv_lanetype_##SFX *ptr, npyv_##SFX##x2 v \
+ ) { \
+ vst2q_##SFX((T_PTR*)ptr, v); \
+ }
+
+NPYV_IMPL_NEON_MEM_INTERLEAVE(u8, uint8_t)
+NPYV_IMPL_NEON_MEM_INTERLEAVE(s8, int8_t)
+NPYV_IMPL_NEON_MEM_INTERLEAVE(u16, uint16_t)
+NPYV_IMPL_NEON_MEM_INTERLEAVE(s16, int16_t)
+NPYV_IMPL_NEON_MEM_INTERLEAVE(u32, uint32_t)
+NPYV_IMPL_NEON_MEM_INTERLEAVE(s32, int32_t)
+NPYV_IMPL_NEON_MEM_INTERLEAVE(f32, float)
+
+#if NPY_SIMD_F64
+ NPYV_IMPL_NEON_MEM_INTERLEAVE(f64, double)
+ NPYV_IMPL_NEON_MEM_INTERLEAVE(u64, uint64_t)
+ NPYV_IMPL_NEON_MEM_INTERLEAVE(s64, int64_t)
+#else
+ #define NPYV_IMPL_NEON_MEM_INTERLEAVE_64(SFX) \
+ NPY_FINLINE npyv_##SFX##x2 npyv_load_##SFX##x2( \
+ const npyv_lanetype_##SFX *ptr) \
+ { \
+ npyv_##SFX a = npyv_load_##SFX(ptr); \
+ npyv_##SFX b = npyv_load_##SFX(ptr + 2); \
+ npyv_##SFX##x2 r; \
+ r.val[0] = vcombine_##SFX(vget_low_##SFX(a), vget_low_##SFX(b)); \
+ r.val[1] = vcombine_##SFX(vget_high_##SFX(a), vget_high_##SFX(b)); \
+ return r; \
+ } \
+ NPY_FINLINE void npyv_store_##SFX##x2( \
+ npyv_lanetype_##SFX *ptr, npyv_##SFX##x2 v) \
+ { \
+ npyv_store_##SFX(ptr, vcombine_##SFX( \
+ vget_low_##SFX(v.val[0]), vget_low_##SFX(v.val[1]))); \
+ npyv_store_##SFX(ptr + 2, vcombine_##SFX( \
+ vget_high_##SFX(v.val[0]), vget_high_##SFX(v.val[1]))); \
+ }
+ NPYV_IMPL_NEON_MEM_INTERLEAVE_64(u64)
+ NPYV_IMPL_NEON_MEM_INTERLEAVE_64(s64)
+#endif
/*********************************
* Lookup table
*********************************/
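
The partial-load wrappers above now fill lanes through a plain union member assignment instead of a designated initializer (presumably for C++ compatibility; that is an assumption, the hunk itself gives no reason). A minimal sketch of the type punning the macro performs:

#include <stdint.h>

// reinterpret a float fill value as the int32 bit pattern used by the
// signed-integer implementation, without breaking strict aliasing
static int32_t pun_f32_to_s32(float fill)
{
    union {
        float   from_f32;
        int32_t to_s32;
    } pun;
    pun.from_f32 = fill;   // member assignment, as in the macro above
    return pun.to_s32;
}
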
diff --git a/numpy/core/src/common/simd/neon/neon.h b/numpy/core/src/common/simd/neon/neon.h
index b08071527..49c35c415 100644
--- a/numpy/core/src/common/simd/neon/neon.h
+++ b/numpy/core/src/common/simd/neon/neon.h
@@ -16,6 +16,7 @@
#define NPY_SIMD_FMA3 0 // HW emulated
#endif
#define NPY_SIMD_BIGENDIAN 0
+#define NPY_SIMD_CMPSIGNAL 1
typedef uint8x16_t npyv_u8;
typedef int8x16_t npyv_s8;
diff --git a/numpy/core/src/common/simd/neon/operators.h b/numpy/core/src/common/simd/neon/operators.h
index 249621bd6..e18ea94b8 100644
--- a/numpy/core/src/common/simd/neon/operators.h
+++ b/numpy/core/src/common/simd/neon/operators.h
@@ -240,10 +240,35 @@
// check special cases
NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a)
-{ return vceqq_f32(a, a); }
+{
+#if defined(__clang__)
+/**
+ * Workaround for a clang bug with symmetric comparison inputs, to avoid
+ * signaling on quiet NaNs; see https://github.com/numpy/numpy/issues/22933
+ * for more details.
+ */
+ npyv_b32 ret;
+ #if NPY_SIMD_F64
+ __asm("fcmeq %0.4s, %1.4s, %1.4s" : "=w" (ret) : "w" (a));
+ #else
+ __asm("vceq.f32 %q0, %q1, %q1" : "=w" (ret) : "w" (a));
+ #endif
+ return ret;
+#else
+ return vceqq_f32(a, a);
+#endif
+}
#if NPY_SIMD_F64
NPY_FINLINE npyv_b64 npyv_notnan_f64(npyv_f64 a)
- { return vceqq_f64(a, a); }
+ {
+ #if defined(__clang__)
+ npyv_b64 ret;
+ __asm("fcmeq %0.2d, %1.2d, %1.2d" : "=w" (ret) : "w" (a));
+ return ret;
+ #else
+ return vceqq_f64(a, a);
+ #endif
+ }
#endif
// Test cross all vector lanes
diff --git a/numpy/core/src/common/simd/neon/reorder.h b/numpy/core/src/common/simd/neon/reorder.h
index 50b06ed11..8bf68f5be 100644
--- a/numpy/core/src/common/simd/neon/reorder.h
+++ b/numpy/core/src/common/simd/neon/reorder.h
@@ -76,36 +76,45 @@ NPYV_IMPL_NEON_COMBINE(npyv_f32, f32)
NPYV_IMPL_NEON_COMBINE(npyv_f64, f64)
#endif
-// interleave two vectors
-#define NPYV_IMPL_NEON_ZIP(T_VEC, SFX) \
- NPY_FINLINE T_VEC##x2 npyv_zip_##SFX(T_VEC a, T_VEC b) \
- { \
- T_VEC##x2 r; \
- r.val[0] = vzip1q_##SFX(a, b); \
- r.val[1] = vzip2q_##SFX(a, b); \
- return r; \
- }
-
+// interleave & deinterleave two vectors
#ifdef __aarch64__
- NPYV_IMPL_NEON_ZIP(npyv_u8, u8)
- NPYV_IMPL_NEON_ZIP(npyv_s8, s8)
- NPYV_IMPL_NEON_ZIP(npyv_u16, u16)
- NPYV_IMPL_NEON_ZIP(npyv_s16, s16)
- NPYV_IMPL_NEON_ZIP(npyv_u32, u32)
- NPYV_IMPL_NEON_ZIP(npyv_s32, s32)
- NPYV_IMPL_NEON_ZIP(npyv_f32, f32)
- NPYV_IMPL_NEON_ZIP(npyv_f64, f64)
+ #define NPYV_IMPL_NEON_ZIP(T_VEC, SFX) \
+ NPY_FINLINE T_VEC##x2 npyv_zip_##SFX(T_VEC a, T_VEC b) \
+ { \
+ T_VEC##x2 r; \
+ r.val[0] = vzip1q_##SFX(a, b); \
+ r.val[1] = vzip2q_##SFX(a, b); \
+ return r; \
+ } \
+ NPY_FINLINE T_VEC##x2 npyv_unzip_##SFX(T_VEC a, T_VEC b) \
+ { \
+ T_VEC##x2 r; \
+ r.val[0] = vuzp1q_##SFX(a, b); \
+ r.val[1] = vuzp2q_##SFX(a, b); \
+ return r; \
+ }
#else
- #define npyv_zip_u8 vzipq_u8
- #define npyv_zip_s8 vzipq_s8
- #define npyv_zip_u16 vzipq_u16
- #define npyv_zip_s16 vzipq_s16
- #define npyv_zip_u32 vzipq_u32
- #define npyv_zip_s32 vzipq_s32
- #define npyv_zip_f32 vzipq_f32
+ #define NPYV_IMPL_NEON_ZIP(T_VEC, SFX) \
+ NPY_FINLINE T_VEC##x2 npyv_zip_##SFX(T_VEC a, T_VEC b) \
+ { return vzipq_##SFX(a, b); } \
+ NPY_FINLINE T_VEC##x2 npyv_unzip_##SFX(T_VEC a, T_VEC b) \
+ { return vuzpq_##SFX(a, b); }
#endif
+
+NPYV_IMPL_NEON_ZIP(npyv_u8, u8)
+NPYV_IMPL_NEON_ZIP(npyv_s8, s8)
+NPYV_IMPL_NEON_ZIP(npyv_u16, u16)
+NPYV_IMPL_NEON_ZIP(npyv_s16, s16)
+NPYV_IMPL_NEON_ZIP(npyv_u32, u32)
+NPYV_IMPL_NEON_ZIP(npyv_s32, s32)
+NPYV_IMPL_NEON_ZIP(npyv_f32, f32)
+
#define npyv_zip_u64 npyv_combine_u64
#define npyv_zip_s64 npyv_combine_s64
+#define npyv_zip_f64 npyv_combine_f64
+#define npyv_unzip_u64 npyv_combine_u64
+#define npyv_unzip_s64 npyv_combine_s64
+#define npyv_unzip_f64 npyv_combine_f64
// Reverse elements of each 64-bit lane
#define npyv_rev64_u8 vrev64q_u8
@@ -116,4 +125,65 @@ NPYV_IMPL_NEON_COMBINE(npyv_f64, f64)
#define npyv_rev64_s32 vrev64q_s32
#define npyv_rev64_f32 vrev64q_f32
+// Permute the elements of each 128-bit lane by an immediate index
+// per element.
+#ifdef __clang__
+ #define npyv_permi128_u32(A, E0, E1, E2, E3) \
+ __builtin_shufflevector(A, A, E0, E1, E2, E3)
+#elif defined(__GNUC__)
+ #define npyv_permi128_u32(A, E0, E1, E2, E3) \
+ __builtin_shuffle(A, npyv_set_u32(E0, E1, E2, E3))
+#else
+ #define npyv_permi128_u32(A, E0, E1, E2, E3) \
+ npyv_set_u32( \
+ vgetq_lane_u32(A, E0), vgetq_lane_u32(A, E1), \
+ vgetq_lane_u32(A, E2), vgetq_lane_u32(A, E3) \
+ )
+ #define npyv_permi128_s32(A, E0, E1, E2, E3) \
+ npyv_set_s32( \
+ vgetq_lane_s32(A, E0), vgetq_lane_s32(A, E1), \
+ vgetq_lane_s32(A, E2), vgetq_lane_s32(A, E3) \
+ )
+ #define npyv_permi128_f32(A, E0, E1, E2, E3) \
+ npyv_set_f32( \
+ vgetq_lane_f32(A, E0), vgetq_lane_f32(A, E1), \
+ vgetq_lane_f32(A, E2), vgetq_lane_f32(A, E3) \
+ )
+#endif
+
+#if defined(__clang__) || defined(__GNUC__)
+ #define npyv_permi128_s32 npyv_permi128_u32
+ #define npyv_permi128_f32 npyv_permi128_u32
+#endif
+
+#ifdef __clang__
+ #define npyv_permi128_u64(A, E0, E1) \
+ __builtin_shufflevector(A, A, E0, E1)
+#elif defined(__GNUC__)
+ #define npyv_permi128_u64(A, E0, E1) \
+ __builtin_shuffle(A, npyv_set_u64(E0, E1))
+#else
+ #define npyv_permi128_u64(A, E0, E1) \
+ npyv_set_u64( \
+ vgetq_lane_u64(A, E0), vgetq_lane_u64(A, E1) \
+ )
+ #define npyv_permi128_s64(A, E0, E1) \
+ npyv_set_s64( \
+ vgetq_lane_s64(A, E0), vgetq_lane_s64(A, E1) \
+ )
+ #define npyv_permi128_f64(A, E0, E1) \
+ npyv_set_f64( \
+ vgetq_lane_f64(A, E0), vgetq_lane_f64(A, E1) \
+ )
+#endif
+
+#if defined(__clang__) || defined(__GNUC__)
+ #define npyv_permi128_s64 npyv_permi128_u64
+ #define npyv_permi128_f64 npyv_permi128_u64
+#endif
+
+#if !NPY_SIMD_F64
+ #undef npyv_permi128_f64
+#endif
+
#endif // _NPY_SIMD_NEON_REORDER_H
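
A scalar reference for the lane mapping of the new npyv_permi128_u32 macro, as implied by the shuffle-based definitions above: lane i of the result takes lane Ei of the input. Illustrative only.

#include <stdint.h>

static void permi128_u32_ref(const uint32_t a[4], uint32_t r[4],
                             int e0, int e1, int e2, int e3)
{
    r[0] = a[e0];
    r[1] = a[e1];
    r[2] = a[e2];
    r[3] = a[e3];   // e.g. (1, 0, 3, 2) swaps adjacent 32-bit lanes
}
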
diff --git a/numpy/core/src/common/simd/simd.h b/numpy/core/src/common/simd/simd.h
index 92a77ad80..8c9b14251 100644
--- a/numpy/core/src/common/simd/simd.h
+++ b/numpy/core/src/common/simd/simd.h
@@ -82,6 +82,9 @@ typedef double npyv_lanetype_f64;
#define NPY_SIMD_FMA3 0
/// 1 if the enabled SIMD extension is running on big-endian mode otherwise 0.
#define NPY_SIMD_BIGENDIAN 0
+ /// 1 if the supported comparison intrinsics (lt, le, gt, ge)
+ /// raise an FP invalid exception for quiet NaNs, otherwise 0.
+ #define NPY_SIMD_CMPSIGNAL 0
#endif
// enable emulated mask operations for all SIMD extension except for AVX512
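
A hedged sketch of how a kernel might consult NPY_SIMD_CMPSIGNAL: when the flag is set, quiet-NaN lanes are cleared before the comparison so no FP invalid exception is raised, and those lanes are forced to compare false afterwards. The helper name is hypothetical and not part of this patch.

NPY_FINLINE npyv_b32 cmplt_quiet_f32(npyv_f32 a, npyv_f32 b)
{
#if NPY_SIMD_CMPSIGNAL
    // pre-clean NaN lanes, then mask them out of the result
    npyv_b32 nnan_a = npyv_notnan_f32(a);
    npyv_b32 nnan_b = npyv_notnan_f32(b);
    npyv_f32 ca = npyv_select_f32(nnan_a, a, npyv_zero_f32());
    npyv_f32 cb = npyv_select_f32(nnan_b, b, npyv_zero_f32());
    return npyv_and_b32(npyv_and_b32(nnan_a, nnan_b), npyv_cmplt_f32(ca, cb));
#else
    return npyv_cmplt_f32(a, b);
#endif
}
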
diff --git a/numpy/core/src/common/simd/sse/arithmetic.h b/numpy/core/src/common/simd/sse/arithmetic.h
index bced35108..72a87eac1 100644
--- a/numpy/core/src/common/simd/sse/arithmetic.h
+++ b/numpy/core/src/common/simd/sse/arithmetic.h
@@ -282,6 +282,10 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
// negate multiply and subtract, -(a*b) - c
#define npyv_nmulsub_f32 _mm_fnmsub_ps
#define npyv_nmulsub_f64 _mm_fnmsub_pd
+ // multiply, add for odd elements and subtract for even elements.
+ // (a * b) -+ c
+ #define npyv_muladdsub_f32 _mm_fmaddsub_ps
+ #define npyv_muladdsub_f64 _mm_fmaddsub_pd
#elif defined(NPY_HAVE_FMA4)
// multiply and add, a*b + c
#define npyv_muladd_f32 _mm_macc_ps
@@ -292,6 +296,10 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
// negate multiply and add, -(a*b) + c
#define npyv_nmuladd_f32 _mm_nmacc_ps
#define npyv_nmuladd_f64 _mm_nmacc_pd
+ // multiply, add for odd elements and subtract for even elements.
+ // (a * b) -+ c
+ #define npyv_muladdsub_f32 _mm_maddsub_ps
+ #define npyv_muladdsub_f64 _mm_maddsub_pd
#else
// multiply and add, a*b + c
NPY_FINLINE npyv_f32 npyv_muladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
@@ -308,6 +316,28 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
{ return npyv_sub_f32(c, npyv_mul_f32(a, b)); }
NPY_FINLINE npyv_f64 npyv_nmuladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
{ return npyv_sub_f64(c, npyv_mul_f64(a, b)); }
+ // multiply, add for odd elements and subtract for even elements.
+ // (a * b) -+ c
+ NPY_FINLINE npyv_f32 npyv_muladdsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ {
+ npyv_f32 m = npyv_mul_f32(a, b);
+ #if NPY_HAVE_SSE3
+ return _mm_addsub_ps(m, c);
+ #else
+ const npyv_f32 msign = npyv_set_f32(-0.0f, 0.0f, -0.0f, 0.0f);
+ return npyv_add_f32(m, npyv_xor_f32(msign, c));
+ #endif
+ }
+ NPY_FINLINE npyv_f64 npyv_muladdsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ {
+ npyv_f64 m = npyv_mul_f64(a, b);
+ #if NPY_HAVE_SSE3
+ return _mm_addsub_pd(m, c);
+ #else
+ const npyv_f64 msign = npyv_set_f64(-0.0, 0.0);
+ return npyv_add_f64(m, npyv_xor_f64(msign, c));
+ #endif
+ }
#endif // NPY_HAVE_FMA3
#ifndef NPY_HAVE_FMA3 // for FMA4 and NON-FMA3
// negate multiply and subtract, -(a*b) - c
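
One natural consumer of the new npyv_muladdsub_* intrinsics is interleaved complex multiplication, where the real lane needs a subtraction and the imaginary lane an addition; this is an assumption for illustration, not something the hunk states. A scalar reference of that lane pattern:

// interleaved complex product: (re, im) pairs stored contiguously
static void cmul_ref(const float *a, const float *b, float *out, int npairs)
{
    for (int i = 0; i < npairs; ++i) {
        float ar = a[2*i], ai = a[2*i + 1];
        float br = b[2*i], bi = b[2*i + 1];
        out[2*i]     = ar * br - ai * bi;   // even lane: subtract, as in (a*b) -+ c
        out[2*i + 1] = ar * bi + ai * br;   // odd lane: add
    }
}
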
diff --git a/numpy/core/src/common/simd/sse/math.h b/numpy/core/src/common/simd/sse/math.h
index b7f8e6ebb..b51c935af 100644
--- a/numpy/core/src/common/simd/sse/math.h
+++ b/numpy/core/src/common/simd/sse/math.h
@@ -269,13 +269,23 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a)
#ifdef NPY_HAVE_SSE41
return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT);
#else
- const npyv_f32 szero = _mm_set1_ps(-0.0f);
- __m128i roundi = _mm_cvtps_epi32(a);
- __m128i overflow = _mm_cmpeq_epi32(roundi, _mm_castps_si128(szero));
- __m128 r = _mm_cvtepi32_ps(roundi);
- // respect sign of zero
- r = _mm_or_ps(r, _mm_and_ps(a, szero));
- return npyv_select_f32(overflow, a, r);
+ const __m128 szero = _mm_set1_ps(-0.0f);
+ const __m128i exp_mask = _mm_set1_epi32(0xff000000);
+
+ __m128i nfinite_mask = _mm_slli_epi32(_mm_castps_si128(a), 1);
+ nfinite_mask = _mm_and_si128(nfinite_mask, exp_mask);
+ nfinite_mask = _mm_cmpeq_epi32(nfinite_mask, exp_mask);
+
+ // eliminate nans/inf to avoid invalid fp errors
+ __m128 x = _mm_xor_ps(a, _mm_castsi128_ps(nfinite_mask));
+ __m128i roundi = _mm_cvtps_epi32(x);
+ __m128 round = _mm_cvtepi32_ps(roundi);
+ // respect signed zero
+ round = _mm_or_ps(round, _mm_and_ps(a, szero));
+ // if overflow return a
+ __m128i overflow_mask = _mm_cmpeq_epi32(roundi, _mm_castps_si128(szero));
+ // a if a overflows or is nonfinite
+ return npyv_select_f32(_mm_or_si128(nfinite_mask, overflow_mask), a, round);
#endif
}
@@ -285,16 +295,22 @@ NPY_FINLINE npyv_f64 npyv_rint_f64(npyv_f64 a)
#ifdef NPY_HAVE_SSE41
return _mm_round_pd(a, _MM_FROUND_TO_NEAREST_INT);
#else
- const npyv_f64 szero = _mm_set1_pd(-0.0);
- const npyv_f64 two_power_52 = _mm_set1_pd(0x10000000000000);
- npyv_f64 sign_two52 = _mm_or_pd(two_power_52, _mm_and_pd(a, szero));
+ const __m128d szero = _mm_set1_pd(-0.0);
+ const __m128d two_power_52 = _mm_set1_pd(0x10000000000000);
+ __m128d nan_mask = _mm_cmpunord_pd(a, a);
+ // eliminate nans to avoid invalid fp errors within cmpge
+ __m128d abs_x = npyv_abs_f64(_mm_xor_pd(nan_mask, a));
// round by add magic number 2^52
- npyv_f64 round = _mm_sub_pd(_mm_add_pd(a, sign_two52), sign_two52);
- // respect signed zero, e.g. -0.5 -> -0.0
- return _mm_or_pd(round, _mm_and_pd(a, szero));
+ // assumes the MXCSR register is set to round-to-nearest
+ __m128d round = _mm_sub_pd(_mm_add_pd(two_power_52, abs_x), two_power_52);
+ // copysign
+ round = _mm_or_pd(round, _mm_and_pd(a, szero));
+ // a if |a| >= 2^52 or a == NaN
+ __m128d mask = _mm_cmpge_pd(abs_x, two_power_52);
+ mask = _mm_or_pd(mask, nan_mask);
+ return npyv_select_f64(_mm_castpd_si128(mask), a, round);
#endif
}
-
// ceil
#ifdef NPY_HAVE_SSE41
#define npyv_ceil_f32 _mm_ceil_ps
@@ -302,27 +318,48 @@ NPY_FINLINE npyv_f64 npyv_rint_f64(npyv_f64 a)
#else
NPY_FINLINE npyv_f32 npyv_ceil_f32(npyv_f32 a)
{
- const npyv_f32 szero = _mm_set1_ps(-0.0f);
- const npyv_f32 one = _mm_set1_ps(1.0f);
- npyv_s32 roundi = _mm_cvttps_epi32(a);
- npyv_f32 round = _mm_cvtepi32_ps(roundi);
- npyv_f32 ceil = _mm_add_ps(round, _mm_and_ps(_mm_cmplt_ps(round, a), one));
- // respect signed zero, e.g. -0.5 -> -0.0
- npyv_f32 rzero = _mm_or_ps(ceil, _mm_and_ps(a, szero));
+ const __m128 one = _mm_set1_ps(1.0f);
+ const __m128 szero = _mm_set1_ps(-0.0f);
+ const __m128i exp_mask = _mm_set1_epi32(0xff000000);
+
+ __m128i nfinite_mask = _mm_slli_epi32(_mm_castps_si128(a), 1);
+ nfinite_mask = _mm_and_si128(nfinite_mask, exp_mask);
+ nfinite_mask = _mm_cmpeq_epi32(nfinite_mask, exp_mask);
+
+ // eliminate nans/inf to avoid invalid fp errors
+ __m128 x = _mm_xor_ps(a, _mm_castsi128_ps(nfinite_mask));
+ __m128i roundi = _mm_cvtps_epi32(x);
+ __m128 round = _mm_cvtepi32_ps(roundi);
+ __m128 ceil = _mm_add_ps(round, _mm_and_ps(_mm_cmplt_ps(round, x), one));
+ // respect signed zero
+ ceil = _mm_or_ps(ceil, _mm_and_ps(a, szero));
// if overflow return a
- return npyv_select_f32(_mm_cmpeq_epi32(roundi, _mm_castps_si128(szero)), a, rzero);
+ __m128i overflow_mask = _mm_cmpeq_epi32(roundi, _mm_castps_si128(szero));
+ // a if a overflows or is nonfinite
+ return npyv_select_f32(_mm_or_si128(nfinite_mask, overflow_mask), a, ceil);
}
NPY_FINLINE npyv_f64 npyv_ceil_f64(npyv_f64 a)
{
- const npyv_f64 szero = _mm_set1_pd(-0.0);
- const npyv_f64 one = _mm_set1_pd(1.0);
- const npyv_f64 two_power_52 = _mm_set1_pd(0x10000000000000);
- npyv_f64 sign_two52 = _mm_or_pd(two_power_52, _mm_and_pd(a, szero));
+ const __m128d one = _mm_set1_pd(1.0);
+ const __m128d szero = _mm_set1_pd(-0.0);
+ const __m128d two_power_52 = _mm_set1_pd(0x10000000000000);
+ __m128d nan_mask = _mm_cmpunord_pd(a, a);
+ // eliminate nans to avoid invalid fp errors within cmpge
+ __m128d x = _mm_xor_pd(nan_mask, a);
+ __m128d abs_x = npyv_abs_f64(x);
+ __m128d sign_x = _mm_and_pd(x, szero);
// round by add magic number 2^52
- npyv_f64 round = _mm_sub_pd(_mm_add_pd(a, sign_two52), sign_two52);
- npyv_f64 ceil = _mm_add_pd(round, _mm_and_pd(_mm_cmplt_pd(round, a), one));
- // respect signed zero, e.g. -0.5 -> -0.0
- return _mm_or_pd(ceil, _mm_and_pd(a, szero));
+ // assumes the MXCSR register is set to round-to-nearest
+ __m128d round = _mm_sub_pd(_mm_add_pd(two_power_52, abs_x), two_power_52);
+ // copysign
+ round = _mm_or_pd(round, sign_x);
+ __m128d ceil = _mm_add_pd(round, _mm_and_pd(_mm_cmplt_pd(round, x), one));
+ // respects sign of 0.0
+ ceil = _mm_or_pd(ceil, sign_x);
+ // a if |a| >= 2^52 or a == NaN
+ __m128d mask = _mm_cmpge_pd(abs_x, two_power_52);
+ mask = _mm_or_pd(mask, nan_mask);
+ return npyv_select_f64(_mm_castpd_si128(mask), a, ceil);
}
#endif
@@ -333,24 +370,43 @@ NPY_FINLINE npyv_f64 npyv_rint_f64(npyv_f64 a)
#else
NPY_FINLINE npyv_f32 npyv_trunc_f32(npyv_f32 a)
{
- const npyv_f32 szero = _mm_set1_ps(-0.0f);
- npyv_s32 roundi = _mm_cvttps_epi32(a);
- npyv_f32 trunc = _mm_cvtepi32_ps(roundi);
+ const __m128 szero = _mm_set1_ps(-0.0f);
+ const __m128i exp_mask = _mm_set1_epi32(0xff000000);
+
+ __m128i nfinite_mask = _mm_slli_epi32(_mm_castps_si128(a), 1);
+ nfinite_mask = _mm_and_si128(nfinite_mask, exp_mask);
+ nfinite_mask = _mm_cmpeq_epi32(nfinite_mask, exp_mask);
+
+ // eliminate nans/inf to avoid invalid fp errors
+ __m128 x = _mm_xor_ps(a, _mm_castsi128_ps(nfinite_mask));
+ __m128i trunci = _mm_cvttps_epi32(x);
+ __m128 trunc = _mm_cvtepi32_ps(trunci);
// respect signed zero, e.g. -0.5 -> -0.0
- npyv_f32 rzero = _mm_or_ps(trunc, _mm_and_ps(a, szero));
+ trunc = _mm_or_ps(trunc, _mm_and_ps(a, szero));
// if overflow return a
- return npyv_select_f32(_mm_cmpeq_epi32(roundi, _mm_castps_si128(szero)), a, rzero);
+ __m128i overflow_mask = _mm_cmpeq_epi32(trunci, _mm_castps_si128(szero));
+ // a if a overflows or is nonfinite
+ return npyv_select_f32(_mm_or_si128(nfinite_mask, overflow_mask), a, trunc);
}
NPY_FINLINE npyv_f64 npyv_trunc_f64(npyv_f64 a)
{
- const npyv_f64 szero = _mm_set1_pd(-0.0);
- const npyv_f64 one = _mm_set1_pd(1.0);
- const npyv_f64 two_power_52 = _mm_set1_pd(0x10000000000000);
- npyv_f64 abs_a = npyv_abs_f64(a);
+ const __m128d one = _mm_set1_pd(1.0);
+ const __m128d szero = _mm_set1_pd(-0.0);
+ const __m128d two_power_52 = _mm_set1_pd(0x10000000000000);
+ __m128d nan_mask = _mm_cmpunord_pd(a, a);
+ // eliminate nans to avoid invalid fp errors within cmpge
+ __m128d abs_x = npyv_abs_f64(_mm_xor_pd(nan_mask, a));
// round by add magic number 2^52
- npyv_f64 abs_round = _mm_sub_pd(_mm_add_pd(abs_a, two_power_52), two_power_52);
- npyv_f64 subtrahend = _mm_and_pd(_mm_cmpgt_pd(abs_round, abs_a), one);
- return _mm_or_pd(_mm_sub_pd(abs_round, subtrahend), _mm_and_pd(a, szero));
+ // assumes the MXCSR register is set to round-to-nearest
+ __m128d abs_round = _mm_sub_pd(_mm_add_pd(two_power_52, abs_x), two_power_52);
+ __m128d subtrahend = _mm_and_pd(_mm_cmpgt_pd(abs_round, abs_x), one);
+ __m128d trunc = _mm_sub_pd(abs_round, subtrahend);
+ // copysign
+ trunc = _mm_or_pd(trunc, _mm_and_pd(a, szero));
+ // a if |a| >= 2^52 or a == NaN
+ __m128d mask = _mm_cmpge_pd(abs_x, two_power_52);
+ mask = _mm_or_pd(mask, nan_mask);
+ return npyv_select_f64(_mm_castpd_si128(mask), a, trunc);
}
#endif
@@ -361,15 +417,46 @@ NPY_FINLINE npyv_f64 npyv_rint_f64(npyv_f64 a)
#else
NPY_FINLINE npyv_f32 npyv_floor_f32(npyv_f32 a)
{
- const npyv_f32 one = _mm_set1_ps(1.0f);
- npyv_f32 round = npyv_rint_f32(a);
- return _mm_sub_ps(round, _mm_and_ps(_mm_cmpgt_ps(round, a), one));
+ const __m128 one = _mm_set1_ps(1.0f);
+ const __m128 szero = _mm_set1_ps(-0.0f);
+ const __m128i exp_mask = _mm_set1_epi32(0xff000000);
+
+ __m128i nfinite_mask = _mm_slli_epi32(_mm_castps_si128(a), 1);
+ nfinite_mask = _mm_and_si128(nfinite_mask, exp_mask);
+ nfinite_mask = _mm_cmpeq_epi32(nfinite_mask, exp_mask);
+
+ // eliminate nans/inf to avoid invalid fp errors
+ __m128 x = _mm_xor_ps(a, _mm_castsi128_ps(nfinite_mask));
+ __m128i roundi = _mm_cvtps_epi32(x);
+ __m128 round = _mm_cvtepi32_ps(roundi);
+ __m128 floor = _mm_sub_ps(round, _mm_and_ps(_mm_cmpgt_ps(round, x), one));
+ // respect signed zero
+ floor = _mm_or_ps(floor, _mm_and_ps(a, szero));
+ // if overflow return a
+ __m128i overflow_mask = _mm_cmpeq_epi32(roundi, _mm_castps_si128(szero));
+ // a if a overflows or is nonfinite
+ return npyv_select_f32(_mm_or_si128(nfinite_mask, overflow_mask), a, floor);
}
NPY_FINLINE npyv_f64 npyv_floor_f64(npyv_f64 a)
{
- const npyv_f64 one = _mm_set1_pd(1.0);
- npyv_f64 round = npyv_rint_f64(a);
- return _mm_sub_pd(round, _mm_and_pd(_mm_cmpgt_pd(round, a), one));
+ const __m128d one = _mm_set1_pd(1.0);
+ const __m128d szero = _mm_set1_pd(-0.0);
+ const __m128d two_power_52 = _mm_set1_pd(0x10000000000000);
+ __m128d nan_mask = _mm_cmpunord_pd(a, a);
+ // eliminate nans to avoid invalid fp errors within cmpge
+ __m128d x = _mm_xor_pd(nan_mask, a);
+ __m128d abs_x = npyv_abs_f64(x);
+ __m128d sign_x = _mm_and_pd(x, szero);
+ // round by add magic number 2^52
+ // assumes the MXCSR register is set to round-to-nearest
+ __m128d round = _mm_sub_pd(_mm_add_pd(two_power_52, abs_x), two_power_52);
+ // copysign
+ round = _mm_or_pd(round, sign_x);
+ __m128d floor = _mm_sub_pd(round, _mm_and_pd(_mm_cmpgt_pd(round, x), one));
+ // a if |a| >= 2^52 or a == NaN
+ __m128d mask = _mm_cmpge_pd(abs_x, two_power_52);
+ mask = _mm_or_pd(mask, nan_mask);
+ return npyv_select_f64(_mm_castpd_si128(mask), a, floor);
}
#endif // NPY_HAVE_SSE41
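
A scalar sketch of the non-finite test used throughout the SSE fallbacks above: shifting the raw bits left by one drops the sign, and the value is Inf or NaN exactly when all eight exponent bits survive the 0xff000000 mask.

#include <stdint.h>
#include <string.h>

static int is_nonfinite_f32(float x)
{
    uint32_t bits;
    memcpy(&bits, &x, sizeof bits);                     // bit-exact view of the float
    return ((bits << 1) & 0xff000000u) == 0xff000000u;  // exponent all ones => Inf/NaN
}
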
diff --git a/numpy/core/src/common/simd/sse/memory.h b/numpy/core/src/common/simd/sse/memory.h
index 3ff64848d..4c8e86a6f 100644
--- a/numpy/core/src/common/simd/sse/memory.h
+++ b/numpy/core/src/common/simd/sse/memory.h
@@ -103,6 +103,28 @@ NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
{ return _mm_castpd_si128(npyv_loadn_f64((const double*)ptr, stride)); }
NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
{ return _mm_castpd_si128(npyv_loadn_f64((const double*)ptr, stride)); }
+
+//// 64-bit load over 32-bit stride
+NPY_FINLINE npyv_f32 npyv_loadn2_f32(const float *ptr, npy_intp stride)
+{
+ __m128d r = _mm_loadh_pd(
+ npyv_loadl_f64((const double*)ptr), (const double*)(ptr + stride)
+ );
+ return _mm_castpd_ps(r);
+}
+NPY_FINLINE npyv_u32 npyv_loadn2_u32(const npy_uint32 *ptr, npy_intp stride)
+{ return _mm_castps_si128(npyv_loadn2_f32((const float*)ptr, stride)); }
+NPY_FINLINE npyv_s32 npyv_loadn2_s32(const npy_int32 *ptr, npy_intp stride)
+{ return _mm_castps_si128(npyv_loadn2_f32((const float*)ptr, stride)); }
+
+//// 128-bit load over 64-bit stride
+NPY_FINLINE npyv_f64 npyv_loadn2_f64(const double *ptr, npy_intp stride)
+{ (void)stride; return npyv_load_f64(ptr); }
+NPY_FINLINE npyv_u64 npyv_loadn2_u64(const npy_uint64 *ptr, npy_intp stride)
+{ (void)stride; return npyv_load_u64(ptr); }
+NPY_FINLINE npyv_s64 npyv_loadn2_s64(const npy_int64 *ptr, npy_intp stride)
+{ (void)stride; return npyv_load_s64(ptr); }
+
/***************************
* Non-contiguous Store
***************************/
@@ -135,6 +157,24 @@ NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
{ npyv_storen_f64((double*)ptr, stride, _mm_castsi128_pd(a)); }
+//// 64-bit store over 32-bit stride
+NPY_FINLINE void npyv_storen2_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a)
+{
+ _mm_storel_pd((double*)ptr, _mm_castsi128_pd(a));
+ _mm_storeh_pd((double*)(ptr + stride), _mm_castsi128_pd(a));
+}
+NPY_FINLINE void npyv_storen2_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a)
+{ npyv_storen2_u32((npy_uint32*)ptr, stride, a); }
+NPY_FINLINE void npyv_storen2_f32(float *ptr, npy_intp stride, npyv_f32 a)
+{ npyv_storen2_u32((npy_uint32*)ptr, stride, _mm_castps_si128(a)); }
+
+//// 128-bit store over 64-bit stride
+NPY_FINLINE void npyv_storen2_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
+{ (void)stride; npyv_store_u64(ptr, a); }
+NPY_FINLINE void npyv_storen2_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
+{ (void)stride; npyv_store_s64(ptr, a); }
+NPY_FINLINE void npyv_storen2_f64(double *ptr, npy_intp stride, npyv_f64 a)
+{ (void)stride; npyv_store_f64(ptr, a); }
/*********************************
* Partial Load
*********************************/
@@ -204,13 +244,14 @@ NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
return _mm_cvtsi32_si128(*ptr);
case 2:
return _mm_loadl_epi64((const __m128i*)ptr);
- case 3:;
- npyv_s32 a = _mm_loadl_epi64((const __m128i*)ptr);
- #ifdef NPY_HAVE_SSE41
- return _mm_insert_epi32(a, ptr[2], 2);
- #else
- return _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[2]));
- #endif
+ case 3: {
+ npyv_s32 a = _mm_loadl_epi64((const __m128i*)ptr);
+ #ifdef NPY_HAVE_SSE41
+ return _mm_insert_epi32(a, ptr[2], 2);
+ #else
+ return _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[2]));
+ #endif
+ }
default:
return npyv_load_s32(ptr);
}
@@ -246,6 +287,32 @@ NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
}
return npyv_load_s64(ptr);
}
+
+//// 64-bit nlane
+NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane,
+ npy_int32 fill_lo, npy_int32 fill_hi)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ const __m128i vfill = npyv_set_s32(fill_lo, fill_hi, fill_lo, fill_hi);
+ return _mm_castpd_si128(
+ _mm_loadl_pd(_mm_castsi128_pd(vfill), (double*)ptr)
+ );
+ }
+ return npyv_load_s32(ptr);
+}
+// fill the remaining lanes with zero
+NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
+{ return npyv_load_tillz_s64((const npy_int64*)ptr, nlane); }
+
+//// 128-bit nlane
+NPY_FINLINE npyv_s64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane,
+ npy_int64 fill_lo, npy_int64 fill_hi)
+{ (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); }
+
+NPY_FINLINE npyv_s64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
+{ (void)nlane; return npyv_load_s64(ptr); }
+
/*********************************
* Non-contiguous partial load
*********************************/
@@ -305,23 +372,27 @@ npyv_loadn_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
case 1:
return _mm_cvtsi32_si128(ptr[0]);
case 2:;
- npyv_s32 a = _mm_cvtsi32_si128(ptr[0]);
-#ifdef NPY_HAVE_SSE41
- return _mm_insert_epi32(a, ptr[stride], 1);
-#else
- return _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride]));
-#endif // NPY_HAVE_SSE41
- case 3:;
- a = _mm_cvtsi32_si128(ptr[0]);
-#ifdef NPY_HAVE_SSE41
- a = _mm_insert_epi32(a, ptr[stride], 1);
- a = _mm_insert_epi32(a, ptr[stride*2], 2);
- return a;
-#else
- a = _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride]));
- a = _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[stride*2]));
- return a;
-#endif // NPY_HAVE_SSE41
+ {
+ npyv_s32 a = _mm_cvtsi32_si128(ptr[0]);
+ #ifdef NPY_HAVE_SSE41
+ return _mm_insert_epi32(a, ptr[stride], 1);
+ #else
+ return _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride]));
+ #endif // NPY_HAVE_SSE41
+ }
+ case 3:
+ {
+ npyv_s32 a = _mm_cvtsi32_si128(ptr[0]);
+ #ifdef NPY_HAVE_SSE41
+ a = _mm_insert_epi32(a, ptr[stride], 1);
+ a = _mm_insert_epi32(a, ptr[stride*2], 2);
+ return a;
+ #else
+ a = _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride]));
+ a = _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[stride*2]));
+ return a;
+ #endif // NPY_HAVE_SSE41
+ }
default:
return npyv_loadn_s32(ptr, stride);
}
@@ -358,6 +429,37 @@ NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride,
}
return npyv_loadn_s64(ptr, stride);
}
+
+//// 64-bit load over 32-bit stride
+NPY_FINLINE npyv_s32 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane,
+ npy_int32 fill_lo, npy_int32 fill_hi)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ const __m128i vfill = npyv_set_s32(0, 0, fill_lo, fill_hi);
+ return _mm_castpd_si128(
+ _mm_loadl_pd(_mm_castsi128_pd(vfill), (double*)ptr)
+ );
+ }
+ return npyv_loadn2_s32(ptr, stride);
+}
+NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ return _mm_loadl_epi64((const __m128i*)ptr);
+ }
+ return npyv_loadn2_s32(ptr, stride);
+}
+
+//// 128-bit load over 64-bit stride
+NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane,
+ npy_int64 fill_lo, npy_int64 fill_hi)
+{ assert(nlane > 0); (void)stride; (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); }
+
+NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
+{ assert(nlane > 0); (void)stride; (void)nlane; return npyv_load_s64(ptr); }
+
/*********************************
* Partial store
*********************************/
@@ -394,6 +496,17 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a
}
npyv_store_s64(ptr, a);
}
+//// 64-bit nlane
+NPY_FINLINE void npyv_store2_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a)
+{ npyv_store_till_s64((npy_int64*)ptr, nlane, a); }
+
+//// 128-bit nlane
+NPY_FINLINE void npyv_store2_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0); (void)nlane;
+ npyv_store_s64(ptr, a);
+}
+
/*********************************
* Non-contiguous partial store
*********************************/
@@ -401,25 +514,35 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a
NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
{
assert(nlane > 0);
+ ptr[stride*0] = _mm_cvtsi128_si32(a);
switch(nlane) {
+ case 1:
+ return;
#ifdef NPY_HAVE_SSE41
- default:
- ptr[stride*3] = _mm_extract_epi32(a, 3);
+ case 2:
+ ptr[stride*1] = _mm_extract_epi32(a, 1);
+ return;
case 3:
+ ptr[stride*1] = _mm_extract_epi32(a, 1);
ptr[stride*2] = _mm_extract_epi32(a, 2);
- case 2:
+ return;
+ default:
ptr[stride*1] = _mm_extract_epi32(a, 1);
+ ptr[stride*2] = _mm_extract_epi32(a, 2);
+ ptr[stride*3] = _mm_extract_epi32(a, 3);
#else
- default:
- ptr[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 3)));
+ case 2:
+ ptr[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 1)));
+ return;
case 3:
+ ptr[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 1)));
ptr[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 2)));
- case 2:
+ return;
+ default:
ptr[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 1)));
+ ptr[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 2)));
+ ptr[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 3)));
#endif
- case 1:
- ptr[stride*0] = _mm_cvtsi128_si32(a);
- break;
}
}
//// 64
@@ -432,6 +555,21 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
}
npyv_storen_s64(ptr, stride, a);
}
+
+//// 64-bit store over 32-bit stride
+NPY_FINLINE void npyv_storen2_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+ npyv_storel_s32(ptr, a);
+ if (nlane > 1) {
+ npyv_storeh_s32(ptr + stride, a);
+ }
+}
+
+//// 128-bit store over 64-bit stride
+NPY_FINLINE void npyv_storen2_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a)
+{ assert(nlane > 0); (void)stride; (void)nlane; npyv_store_s64(ptr, a); }
+
/*****************************************************************
* Implement partial load/store for u32/f32/u64/f64... via casting
*****************************************************************/
@@ -442,7 +580,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
union { \
npyv_lanetype_##F_SFX from_##F_SFX; \
npyv_lanetype_##T_SFX to_##T_SFX; \
- } pun = {.from_##F_SFX = fill}; \
+ } pun; \
+ pun.from_##F_SFX = fill; \
return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX( \
(const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX \
)); \
@@ -454,7 +593,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
union { \
npyv_lanetype_##F_SFX from_##F_SFX; \
npyv_lanetype_##T_SFX to_##T_SFX; \
- } pun = {.from_##F_SFX = fill}; \
+ } pun; \
+ pun.from_##F_SFX = fill; \
return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX( \
(const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX \
)); \
@@ -495,6 +635,110 @@ NPYV_IMPL_SSE_REST_PARTIAL_TYPES(f32, s32)
NPYV_IMPL_SSE_REST_PARTIAL_TYPES(u64, s64)
NPYV_IMPL_SSE_REST_PARTIAL_TYPES(f64, s64)
+// 128-bit/64-bit stride
+#define NPYV_IMPL_SSE_REST_PARTIAL_TYPES_PAIR(F_SFX, T_SFX) \
+ NPY_FINLINE npyv_##F_SFX npyv_load2_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, \
+ npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \
+ { \
+ union pun { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ }; \
+ union pun pun_lo; \
+ union pun pun_hi; \
+ pun_lo.from_##F_SFX = fill_lo; \
+ pun_hi.from_##F_SFX = fill_hi; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane, pun_lo.to_##T_SFX, pun_hi.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn2_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \
+ npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \
+ { \
+ union pun { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ }; \
+ union pun pun_lo; \
+ union pun pun_hi; \
+ pun_lo.from_##F_SFX = fill_lo; \
+ pun_hi.from_##F_SFX = fill_hi; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun_lo.to_##T_SFX, \
+ pun_hi.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_load2_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn2_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \
+ )); \
+ } \
+ NPY_FINLINE void npyv_store2_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_store2_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ } \
+ NPY_FINLINE void npyv_storen2_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_storen2_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ }
+
+NPYV_IMPL_SSE_REST_PARTIAL_TYPES_PAIR(u32, s32)
+NPYV_IMPL_SSE_REST_PARTIAL_TYPES_PAIR(f32, s32)
+NPYV_IMPL_SSE_REST_PARTIAL_TYPES_PAIR(u64, s64)
+NPYV_IMPL_SSE_REST_PARTIAL_TYPES_PAIR(f64, s64)
+
+/************************************************************
+ * de-interleave load / interleave contiguous store
+ ************************************************************/
+// two channels
+#define NPYV_IMPL_SSE_MEM_INTERLEAVE(SFX, ZSFX) \
+ NPY_FINLINE npyv_##ZSFX##x2 npyv_zip_##ZSFX(npyv_##ZSFX, npyv_##ZSFX); \
+ NPY_FINLINE npyv_##ZSFX##x2 npyv_unzip_##ZSFX(npyv_##ZSFX, npyv_##ZSFX); \
+ NPY_FINLINE npyv_##SFX##x2 npyv_load_##SFX##x2( \
+ const npyv_lanetype_##SFX *ptr \
+ ) { \
+ return npyv_unzip_##ZSFX( \
+ npyv_load_##SFX(ptr), npyv_load_##SFX(ptr+npyv_nlanes_##SFX) \
+ ); \
+ } \
+ NPY_FINLINE void npyv_store_##SFX##x2( \
+ npyv_lanetype_##SFX *ptr, npyv_##SFX##x2 v \
+ ) { \
+ npyv_##SFX##x2 zip = npyv_zip_##ZSFX(v.val[0], v.val[1]); \
+ npyv_store_##SFX(ptr, zip.val[0]); \
+ npyv_store_##SFX(ptr + npyv_nlanes_##SFX, zip.val[1]); \
+ }
+
+NPYV_IMPL_SSE_MEM_INTERLEAVE(u8, u8)
+NPYV_IMPL_SSE_MEM_INTERLEAVE(s8, u8)
+NPYV_IMPL_SSE_MEM_INTERLEAVE(u16, u16)
+NPYV_IMPL_SSE_MEM_INTERLEAVE(s16, u16)
+NPYV_IMPL_SSE_MEM_INTERLEAVE(u32, u32)
+NPYV_IMPL_SSE_MEM_INTERLEAVE(s32, u32)
+NPYV_IMPL_SSE_MEM_INTERLEAVE(u64, u64)
+NPYV_IMPL_SSE_MEM_INTERLEAVE(s64, u64)
+NPYV_IMPL_SSE_MEM_INTERLEAVE(f32, f32)
+NPYV_IMPL_SSE_MEM_INTERLEAVE(f64, f64)
+
/*********************************
* Lookup table
*********************************/
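
A scalar reference of the two-channel contract behind the new npyv_load_*x2 / npyv_store_*x2 pairs above: the load de-interleaves even- and odd-indexed elements into val[0] and val[1], and the store is its inverse. Illustrative only.

// de-interleave 2*n contiguous floats into two channels, then re-interleave
static void load_x2_ref(const float *ptr, float *ch0, float *ch1, int n)
{
    for (int i = 0; i < n; ++i) {
        ch0[i] = ptr[2*i];       // val[0]: elements 0, 2, 4, ...
        ch1[i] = ptr[2*i + 1];   // val[1]: elements 1, 3, 5, ...
    }
}
static void store_x2_ref(float *ptr, const float *ch0, const float *ch1, int n)
{
    for (int i = 0; i < n; ++i) {
        ptr[2*i]     = ch0[i];
        ptr[2*i + 1] = ch1[i];
    }
}
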
diff --git a/numpy/core/src/common/simd/sse/reorder.h b/numpy/core/src/common/simd/sse/reorder.h
index d96ab9c56..9a57f6489 100644
--- a/numpy/core/src/common/simd/sse/reorder.h
+++ b/numpy/core/src/common/simd/sse/reorder.h
@@ -81,6 +81,75 @@ NPYV_IMPL_SSE_ZIP(npyv_s64, s64, epi64)
NPYV_IMPL_SSE_ZIP(npyv_f32, f32, ps)
NPYV_IMPL_SSE_ZIP(npyv_f64, f64, pd)
+// deinterleave two vectors
+NPY_FINLINE npyv_u8x2 npyv_unzip_u8(npyv_u8 ab0, npyv_u8 ab1)
+{
+#ifdef NPY_HAVE_SSSE3
+ const __m128i idx = _mm_setr_epi8(
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+ );
+ __m128i abl = _mm_shuffle_epi8(ab0, idx);
+ __m128i abh = _mm_shuffle_epi8(ab1, idx);
+ return npyv_combine_u8(abl, abh);
+#else
+ __m128i ab_083b = _mm_unpacklo_epi8(ab0, ab1);
+ __m128i ab_4c6e = _mm_unpackhi_epi8(ab0, ab1);
+ __m128i ab_048c = _mm_unpacklo_epi8(ab_083b, ab_4c6e);
+ __m128i ab_36be = _mm_unpackhi_epi8(ab_083b, ab_4c6e);
+ __m128i ab_0346 = _mm_unpacklo_epi8(ab_048c, ab_36be);
+ __m128i ab_8bc8 = _mm_unpackhi_epi8(ab_048c, ab_36be);
+ npyv_u8x2 r;
+ r.val[0] = _mm_unpacklo_epi8(ab_0346, ab_8bc8);
+ r.val[1] = _mm_unpackhi_epi8(ab_0346, ab_8bc8);
+ return r;
+#endif
+}
+#define npyv_unzip_s8 npyv_unzip_u8
+
+NPY_FINLINE npyv_u16x2 npyv_unzip_u16(npyv_u16 ab0, npyv_u16 ab1)
+{
+#ifdef NPY_HAVE_SSSE3
+ const __m128i idx = _mm_setr_epi8(
+ 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15
+ );
+ __m128i abl = _mm_shuffle_epi8(ab0, idx);
+ __m128i abh = _mm_shuffle_epi8(ab1, idx);
+ return npyv_combine_u16(abl, abh);
+#else
+ __m128i ab_0415 = _mm_unpacklo_epi16(ab0, ab1);
+ __m128i ab_263f = _mm_unpackhi_epi16(ab0, ab1);
+ __m128i ab_0246 = _mm_unpacklo_epi16(ab_0415, ab_263f);
+ __m128i ab_135f = _mm_unpackhi_epi16(ab_0415, ab_263f);
+ npyv_u16x2 r;
+ r.val[0] = _mm_unpacklo_epi16(ab_0246, ab_135f);
+ r.val[1] = _mm_unpackhi_epi16(ab_0246, ab_135f);
+ return r;
+#endif
+}
+#define npyv_unzip_s16 npyv_unzip_u16
+
+NPY_FINLINE npyv_u32x2 npyv_unzip_u32(npyv_u32 ab0, npyv_u32 ab1)
+{
+ __m128i abl = _mm_shuffle_epi32(ab0, _MM_SHUFFLE(3, 1, 2, 0));
+ __m128i abh = _mm_shuffle_epi32(ab1, _MM_SHUFFLE(3, 1, 2, 0));
+ return npyv_combine_u32(abl, abh);
+}
+#define npyv_unzip_s32 npyv_unzip_u32
+
+NPY_FINLINE npyv_u64x2 npyv_unzip_u64(npyv_u64 ab0, npyv_u64 ab1)
+{ return npyv_combine_u64(ab0, ab1); }
+#define npyv_unzip_s64 npyv_unzip_u64
+
+NPY_FINLINE npyv_f32x2 npyv_unzip_f32(npyv_f32 ab0, npyv_f32 ab1)
+{
+ npyv_f32x2 r;
+ r.val[0] = _mm_shuffle_ps(ab0, ab1, _MM_SHUFFLE(2, 0, 2, 0));
+ r.val[1] = _mm_shuffle_ps(ab0, ab1, _MM_SHUFFLE(3, 1, 3, 1));
+ return r;
+}
+NPY_FINLINE npyv_f64x2 npyv_unzip_f64(npyv_f64 ab0, npyv_f64 ab1)
+{ return npyv_combine_f64(ab0, ab1); }
+
// Reverse elements of each 64-bit lane
NPY_FINLINE npyv_u16 npyv_rev64_u16(npyv_u16 a)
{
@@ -122,4 +191,22 @@ NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a)
return _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1));
}
+// Permute the elements of each 128-bit lane using an immediate index
+// supplied for each destination element.
+#define npyv_permi128_u32(A, E0, E1, E2, E3) \
+ _mm_shuffle_epi32(A, _MM_SHUFFLE(E3, E2, E1, E0))
+
+#define npyv_permi128_s32 npyv_permi128_u32
+
+#define npyv_permi128_u64(A, E0, E1) \
+ _mm_shuffle_epi32(A, _MM_SHUFFLE(((E1)<<1)+1, ((E1)<<1), ((E0)<<1)+1, ((E0)<<1)))
+
+#define npyv_permi128_s64 npyv_permi128_u64
+
+#define npyv_permi128_f32(A, E0, E1, E2, E3) \
+ _mm_shuffle_ps(A, A, _MM_SHUFFLE(E3, E2, E1, E0))
+
+#define npyv_permi128_f64(A, E0, E1) \
+ _mm_shuffle_pd(A, A, _MM_SHUFFLE2(E1, E0))
+
#endif // _NPY_SIMD_SSE_REORDER_H
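The npyv_permi128_* macros added above select every destination element by a compile-time index into the same 128-bit register. A small sketch of what they evaluate to, assuming <emmintrin.h> and this header are in scope; `permi128_demo` is an illustrative name only.

#include <emmintrin.h>

static void permi128_demo(void)
{
    __m128  v     = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128  bcast = npyv_permi128_f32(v, 0, 0, 0, 0);   /* {1, 1, 1, 1} */
    __m128  rev   = npyv_permi128_f32(v, 3, 2, 1, 0);   /* {4, 3, 2, 1} */
    __m128d d     = _mm_setr_pd(10.0, 20.0);
    __m128d swp   = npyv_permi128_f64(d, 1, 0);         /* {20, 10}     */
    (void)bcast; (void)rev; (void)swp;
}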
diff --git a/numpy/core/src/common/simd/sse/sse.h b/numpy/core/src/common/simd/sse/sse.h
index c21bbfda7..0c6b8cdba 100644
--- a/numpy/core/src/common/simd/sse/sse.h
+++ b/numpy/core/src/common/simd/sse/sse.h
@@ -12,6 +12,7 @@
#define NPY_SIMD_FMA3 0 // fast emulated
#endif
#define NPY_SIMD_BIGENDIAN 0
+#define NPY_SIMD_CMPSIGNAL 1
typedef __m128i npyv_u8;
typedef __m128i npyv_s8;
diff --git a/numpy/core/src/common/simd/vec/arithmetic.h b/numpy/core/src/common/simd/vec/arithmetic.h
index a2e9d07eb..85f4d6b26 100644
--- a/numpy/core/src/common/simd/vec/arithmetic.h
+++ b/numpy/core/src/common/simd/vec/arithmetic.h
@@ -322,6 +322,20 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
NPY_FINLINE npyv_f64 npyv_nmulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
{ return vec_neg(vec_madd(a, b, c)); }
#endif
+// multiply, then add `c` to the odd-indexed elements and subtract it
+// from the even-indexed elements: (a * b) -+ c
+#if NPY_SIMD_F32
+NPY_FINLINE npyv_f32 npyv_muladdsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+{
+ const npyv_f32 msign = npyv_set_f32(-0.0f, 0.0f, -0.0f, 0.0f);
+ return npyv_muladd_f32(a, b, npyv_xor_f32(msign, c));
+}
+#endif
+NPY_FINLINE npyv_f64 npyv_muladdsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+{
+ const npyv_f64 msign = npyv_set_f64(-0.0, 0.0);
+ return npyv_muladd_f64(a, b, npyv_xor_f64(msign, c));
+}
/***************************
* Summation
***************************/
@@ -352,6 +366,7 @@ NPY_FINLINE float npyv_sum_f32(npyv_f32 a)
{
npyv_f32 sum = vec_add(a, npyv_combineh_f32(a, a));
return vec_extract(sum, 0) + vec_extract(sum, 1);
+ (void)sum;
}
#endif
@@ -372,6 +387,7 @@ NPY_FINLINE npy_uint16 npyv_sumup_u8(npyv_u8 a)
npyv_u32 four = vec_sum4s(a, zero);
npyv_s32 one = vec_sums((npyv_s32)four, (npyv_s32)zero);
return (npy_uint16)vec_extract(one, 3);
+ (void)one;
#endif
}
@@ -386,6 +402,7 @@ NPY_FINLINE npy_uint32 npyv_sumup_u16(npyv_u16 a)
npyv_u32 four = vec_add(eight.val[0], eight.val[1]);
npyv_s32 one = vec_sums((npyv_s32)four, zero);
return (npy_uint32)vec_extract(one, 3);
+ (void)one;
#endif
}
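For the npyv_muladdsub_* helpers added to this header, a scalar reference of the intended lane-wise behaviour may help: the {-0.0, 0.0} mask flips the sign of `c` in the even-indexed lanes only. `muladdsub_reference` is an illustrative name, not part of the patch.

/* Scalar reference for npyv_muladdsub_f64 on a 2-lane vector. */
static void muladdsub_reference(const double a[2], const double b[2],
                                const double c[2], double out[2])
{
    out[0] = a[0]*b[0] - c[0];   /* even lane: xor with -0.0 flips the sign of c */
    out[1] = a[1]*b[1] + c[1];   /* odd lane:  c is left untouched               */
}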
diff --git a/numpy/core/src/common/simd/vec/conversion.h b/numpy/core/src/common/simd/vec/conversion.h
index f0d625c55..922109f7b 100644
--- a/numpy/core/src/common/simd/vec/conversion.h
+++ b/numpy/core/src/common/simd/vec/conversion.h
@@ -96,6 +96,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
#else
return vec_extract(r, 4);
#endif
+ // suppress the spurious warning: variable `r` set but not used [-Wunused-but-set-variable]
+ (void)r;
}
NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a)
{
@@ -106,6 +108,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
#else
return vec_extract(r, 8);
#endif
+ // suppress the spurious warning: variable `r` set but not used [-Wunused-but-set-variable]
+ (void)r;
}
NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
{
@@ -120,6 +124,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
#else
return vec_extract(r, 8);
#endif
+ // suppress the spurious warning: variable `r` set but not used [-Wunused-but-set-variable]
+ (void)r;
}
NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
{
@@ -134,6 +140,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
#else
return vec_extract(r, 8);
#endif
+ // suppress the spurious warning: variable `r` set but not used [-Wunused-but-set-variable]
+ (void)r;
}
#else
NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
@@ -170,7 +178,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
#ifdef NPY_HAVE_VXE2
return vec_signed(a);
#elif defined(NPY_HAVE_VXE)
- return vec_packs(vec_signed(npyv_doublee(a)), vec_signed(npyv_doublee(vec_mergel(a, a))));
+ return vec_packs(vec_signed(npyv_doublee(vec_mergeh(a,a))),
+ vec_signed(npyv_doublee(vec_mergel(a, a))));
// VSX
#elif defined(__IBMC__)
return vec_cts(a, 0);
diff --git a/numpy/core/src/common/simd/vec/math.h b/numpy/core/src/common/simd/vec/math.h
index 95b16fdf7..85690f76c 100644
--- a/numpy/core/src/common/simd/vec/math.h
+++ b/numpy/core/src/common/simd/vec/math.h
@@ -186,6 +186,7 @@ NPY_IMPL_VEC_REDUCE_MINMAX(max, int32, s32)
{ \
npyv_##SFX r = vec_##INTRIN(a, vec_sld(a, a, 8)); \
return (npy_##STYPE)vec_extract(r, 0); \
+ (void)r; \
}
NPY_IMPL_VEC_REDUCE_MINMAX(min, uint64, u64)
NPY_IMPL_VEC_REDUCE_MINMAX(max, uint64, u64)
@@ -225,6 +226,7 @@ NPY_IMPL_VEC_REDUCE_MINMAX(max, int64, s64)
{ \
npyv_f64 r = vec_##INTRIN(a, vec_sld(a, a, 8)); \
return vec_extract(r, 0); \
+ (void)r; \
} \
NPY_FINLINE double npyv_reduce_##INTRIN##n_f64(npyv_f64 a) \
{ \
diff --git a/numpy/core/src/common/simd/vec/memory.h b/numpy/core/src/common/simd/vec/memory.h
index e8f588ef2..1ad39cead 100644
--- a/numpy/core/src/common/simd/vec/memory.h
+++ b/numpy/core/src/common/simd/vec/memory.h
@@ -46,14 +46,8 @@
#endif
// avoid aliasing rules
-#ifdef __cplusplus
- template<typename T_PTR>
- NPY_FINLINE npy_uint64 *npyv__ptr2u64(const T_PTR *ptr)
- { npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr64; }
-#else
- NPY_FINLINE npy_uint64 *npyv__ptr2u64(const void *ptr)
- { npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr64; }
-#endif // __cplusplus
+NPY_FINLINE npy_uint64 *npyv__ptr2u64(const void *ptr)
+{ npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr64; }
// load lower part
NPY_FINLINE npyv_u64 npyv__loadl(const void *ptr)
@@ -136,6 +130,24 @@ NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
{ return npyv_set_s64(ptr[0], ptr[stride]); }
NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride)
{ return npyv_set_f64(ptr[0], ptr[stride]); }
+
+//// 64-bit load over 32-bit stride
+NPY_FINLINE npyv_u32 npyv_loadn2_u32(const npy_uint32 *ptr, npy_intp stride)
+{ return (npyv_u32)npyv_set_u64(*(npy_uint64*)ptr, *(npy_uint64*)(ptr + stride)); }
+NPY_FINLINE npyv_s32 npyv_loadn2_s32(const npy_int32 *ptr, npy_intp stride)
+{ return (npyv_s32)npyv_set_u64(*(npy_uint64*)ptr, *(npy_uint64*)(ptr + stride)); }
+#if NPY_SIMD_F32
+NPY_FINLINE npyv_f32 npyv_loadn2_f32(const float *ptr, npy_intp stride)
+{ return (npyv_f32)npyv_set_u64(*(npy_uint64*)ptr, *(npy_uint64*)(ptr + stride)); }
+#endif
+//// 128-bit load over 64-bit stride
+NPY_FINLINE npyv_u64 npyv_loadn2_u64(const npy_uint64 *ptr, npy_intp stride)
+{ (void)stride; return npyv_load_u64(ptr); }
+NPY_FINLINE npyv_s64 npyv_loadn2_s64(const npy_int64 *ptr, npy_intp stride)
+{ (void)stride; return npyv_load_s64(ptr); }
+NPY_FINLINE npyv_f64 npyv_loadn2_f64(const double *ptr, npy_intp stride)
+{ (void)stride; return npyv_load_f64(ptr); }
+
/***************************
* Non-contiguous Store
***************************/
@@ -164,6 +176,26 @@ NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a)
{ npyv_storen_u64((npy_uint64*)ptr, stride, (npyv_u64)a); }
+//// 64-bit store over 32-bit stride
+NPY_FINLINE void npyv_storen2_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a)
+{
+ *(npy_uint64*)ptr = vec_extract((npyv_u64)a, 0);
+ *(npy_uint64*)(ptr + stride) = vec_extract((npyv_u64)a, 1);
+}
+NPY_FINLINE void npyv_storen2_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a)
+{ npyv_storen2_u32((npy_uint32*)ptr, stride, (npyv_u32)a); }
+#if NPY_SIMD_F32
+NPY_FINLINE void npyv_storen2_f32(float *ptr, npy_intp stride, npyv_f32 a)
+{ npyv_storen2_u32((npy_uint32*)ptr, stride, (npyv_u32)a); }
+#endif
+//// 128-bit store over 64-bit stride
+NPY_FINLINE void npyv_storen2_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
+{ (void)stride; npyv_store_u64(ptr, a); }
+NPY_FINLINE void npyv_storen2_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
+{ (void)stride; npyv_store_s64(ptr, a); }
+NPY_FINLINE void npyv_storen2_f64(double *ptr, npy_intp stride, npyv_f64 a)
+{ (void)stride; npyv_store_f64(ptr, a); }
+
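The strided-pair accessors added in the two hunks above (npyv_loadn2_* / npyv_storen2_*) move contiguous pairs of 32-bit lanes with a stride between the pairs. A scalar sketch of the lane layout as read from the implementation; `loadn2_u32_reference` is an illustrative name only.

static void loadn2_u32_reference(const npy_uint32 *ptr, npy_intp stride,
                                 npy_uint32 lanes[4])
{
    lanes[0] = ptr[0];            /* first pair, loaded as one 64-bit word       */
    lanes[1] = ptr[1];
    lanes[2] = ptr[stride];       /* second pair, `stride` counted in u32 lanes  */
    lanes[3] = ptr[stride + 1];
}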
/*********************************
* Partial Load
*********************************/
@@ -173,9 +205,9 @@ NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, n
assert(nlane > 0);
npyv_s32 vfill = npyv_setall_s32(fill);
#ifdef NPY_HAVE_VX
- const unsigned blane = (unsigned short)nlane;
+ const unsigned blane = (nlane > 4) ? 4 : nlane;
const npyv_u32 steps = npyv_set_u32(0, 1, 2, 3);
- const npyv_u32 vlane = npyv_setall_u32((unsigned)blane);
+ const npyv_u32 vlane = npyv_setall_u32(blane);
const npyv_b32 mask = vec_cmpgt(vlane, steps);
npyv_s32 a = vec_load_len(ptr, blane*4-1);
return vec_sel(vfill, a, mask);
@@ -201,8 +233,8 @@ NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, n
NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
{
#ifdef NPY_HAVE_VX
- unsigned blane = ((unsigned short)nlane)*4 - 1;
- return vec_load_len(ptr, blane);
+ unsigned blane = (nlane > 4) ? 4 : nlane;
+ return vec_load_len(ptr, blane*4-1);
#else
return npyv_load_till_s32(ptr, nlane, 0);
#endif
@@ -220,12 +252,33 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n
NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
{
#ifdef NPY_HAVE_VX
- unsigned blane = (unsigned short)nlane;
+ unsigned blane = (nlane > 2) ? 2 : nlane;
return vec_load_len((const signed long long*)ptr, blane*8-1);
#else
return npyv_load_till_s64(ptr, nlane, 0);
#endif
}
+//// 64-bit nlane
+NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane,
+ npy_int32 fill_lo, npy_int32 fill_hi)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ return npyv_set_s32(ptr[0], ptr[1], fill_lo, fill_hi);
+ }
+ return npyv_load_s32(ptr);
+}
+// fill the remaining lanes with zero
+NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
+{ return (npyv_s32)npyv_load_tillz_s64((const npy_int64*)ptr, nlane); }
+
+//// 128-bit nlane
+NPY_FINLINE npyv_s64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane,
+ npy_int64 fill_lo, npy_int64 fill_hi)
+{ (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); }
+
+NPY_FINLINE npyv_s64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
+{ (void)nlane; return npyv_load_s64(ptr); }
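The pair-wise partial loads above treat `nlane` as a count of 64-bit pairs. A scalar sketch of npyv_load2_till_s32 on this 128-bit target, where `load2_till_s32_reference` is an illustrative name only: with one pair requested it loads {ptr[0], ptr[1]} and pads the second pair with the two fill values, otherwise it is a full 128-bit load.

static void load2_till_s32_reference(const npy_int32 *ptr, npy_uintp nlane,
                                     npy_int32 fill_lo, npy_int32 fill_hi,
                                     npy_int32 lanes[4])
{
    lanes[0] = ptr[0];
    lanes[1] = ptr[1];
    lanes[2] = nlane > 1 ? ptr[2] : fill_lo;
    lanes[3] = nlane > 1 ? ptr[3] : fill_hi;
}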
/*********************************
* Non-contiguous partial load
*********************************/
@@ -265,6 +318,34 @@ npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_
// fill zero to rest lanes
NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
{ return npyv_loadn_till_s64(ptr, stride, nlane, 0); }
+
+//// 64-bit load over 32-bit stride
+NPY_FINLINE npyv_s32 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane,
+ npy_int32 fill_lo, npy_int32 fill_hi)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ return npyv_set_s32(ptr[0], ptr[1], fill_lo, fill_hi);
+ }
+ return npyv_loadn2_s32(ptr, stride);
+}
+NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ return (npyv_s32)npyv_set_s64(*(npy_int64*)ptr, 0);
+ }
+ return npyv_loadn2_s32(ptr, stride);
+}
+
+//// 128-bit load over 64-bit stride
+NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane,
+ npy_int64 fill_lo, npy_int64 fill_hi)
+{ assert(nlane > 0); (void)stride; (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); }
+
+NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
+{ assert(nlane > 0); (void)stride; (void)nlane; return npyv_load_s64(ptr); }
+
/*********************************
* Partial store
*********************************/
@@ -273,7 +354,7 @@ NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a
{
assert(nlane > 0);
#ifdef NPY_HAVE_VX
- unsigned blane = (unsigned short)nlane;
+ unsigned blane = (nlane > 4) ? 4 : nlane;
vec_store_len(a, ptr, blane*4-1);
#else
switch(nlane) {
@@ -297,7 +378,7 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a
{
assert(nlane > 0);
#ifdef NPY_HAVE_VX
- unsigned blane = (unsigned short)nlane;
+ unsigned blane = (nlane > 2) ? 2 : nlane;
vec_store_len(a, (signed long long*)ptr, blane*8-1);
#else
if (nlane == 1) {
@@ -307,6 +388,18 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a
npyv_store_s64(ptr, a);
#endif
}
+
+//// 64-bit nlane
+NPY_FINLINE void npyv_store2_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a)
+{ npyv_store_till_s64((npy_int64*)ptr, nlane, (npyv_s64)a); }
+
+//// 128-bit nlane
+NPY_FINLINE void npyv_store2_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0); (void)nlane;
+ npyv_store_s64(ptr, a);
+}
+
/*********************************
* Non-contiguous partial store
*********************************/
@@ -314,16 +407,21 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a
NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
{
assert(nlane > 0);
+ ptr[stride*0] = vec_extract(a, 0);
switch(nlane) {
- default:
- ptr[stride*3] = vec_extract(a, 3);
- case 3:
- ptr[stride*2] = vec_extract(a, 2);
+ case 1:
+ return;
case 2:
ptr[stride*1] = vec_extract(a, 1);
- case 1:
- ptr[stride*0] = vec_extract(a, 0);
- break;
+ return;
+ case 3:
+ ptr[stride*1] = vec_extract(a, 1);
+ ptr[stride*2] = vec_extract(a, 2);
+ return;
+ default:
+ ptr[stride*1] = vec_extract(a, 1);
+ ptr[stride*2] = vec_extract(a, 2);
+ ptr[stride*3] = vec_extract(a, 3);
}
}
//// 64
@@ -336,6 +434,21 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
}
npyv_storen_s64(ptr, stride, a);
}
+
+//// 64-bit store over 32-bit stride
+NPY_FINLINE void npyv_storen2_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+ npyv_storel_s32(ptr, a);
+ if (nlane > 1) {
+ npyv_storeh_s32(ptr + stride, a);
+ }
+}
+
+//// 128-bit store over 64-bit stride
+NPY_FINLINE void npyv_storen2_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a)
+{ assert(nlane > 0); (void)stride; (void)nlane; npyv_store_s64(ptr, a); }
+
/*****************************************************************
* Implement partial load/store for u32/f32/u64/f64... via casting
*****************************************************************/
@@ -346,7 +459,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
union { \
npyv_lanetype_##F_SFX from_##F_SFX; \
npyv_lanetype_##T_SFX to_##T_SFX; \
- } pun = {.from_##F_SFX = fill}; \
+ } pun; \
+ pun.from_##F_SFX = fill; \
return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX( \
(const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX \
)); \
@@ -358,7 +472,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
union { \
npyv_lanetype_##F_SFX from_##F_SFX; \
npyv_lanetype_##T_SFX to_##T_SFX; \
- } pun = {.from_##F_SFX = fill}; \
+ } pun; \
+ pun.from_##F_SFX = fill; \
return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX( \
(const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX \
)); \
@@ -401,6 +516,114 @@ NPYV_IMPL_VEC_REST_PARTIAL_TYPES(f32, s32)
NPYV_IMPL_VEC_REST_PARTIAL_TYPES(u64, s64)
NPYV_IMPL_VEC_REST_PARTIAL_TYPES(f64, s64)
+// 128-bit/64-bit stride
+#define NPYV_IMPL_VEC_REST_PARTIAL_TYPES_PAIR(F_SFX, T_SFX) \
+ NPY_FINLINE npyv_##F_SFX npyv_load2_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, \
+ npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \
+ { \
+ union pun { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ }; \
+ union pun pun_lo; \
+ union pun pun_hi; \
+ pun_lo.from_##F_SFX = fill_lo; \
+ pun_hi.from_##F_SFX = fill_hi; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane, pun_lo.to_##T_SFX, pun_hi.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn2_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \
+ npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \
+ { \
+ union pun { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ }; \
+ union pun pun_lo; \
+ union pun pun_hi; \
+ pun_lo.from_##F_SFX = fill_lo; \
+ pun_hi.from_##F_SFX = fill_hi; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun_lo.to_##T_SFX, \
+ pun_hi.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_load2_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn2_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \
+ )); \
+ } \
+ NPY_FINLINE void npyv_store2_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_store2_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ } \
+ NPY_FINLINE void npyv_storen2_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_storen2_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ }
+
+NPYV_IMPL_VEC_REST_PARTIAL_TYPES_PAIR(u32, s32)
+#if NPY_SIMD_F32
+NPYV_IMPL_VEC_REST_PARTIAL_TYPES_PAIR(f32, s32)
+#endif
+NPYV_IMPL_VEC_REST_PARTIAL_TYPES_PAIR(u64, s64)
+NPYV_IMPL_VEC_REST_PARTIAL_TYPES_PAIR(f64, s64)
+
+/************************************************************
+ * de-interleave load / interleave contiguous store
+ ************************************************************/
+// two channels
+#define NPYV_IMPL_VEC_MEM_INTERLEAVE(SFX) \
+ NPY_FINLINE npyv_##SFX##x2 npyv_zip_##SFX(npyv_##SFX, npyv_##SFX); \
+ NPY_FINLINE npyv_##SFX##x2 npyv_unzip_##SFX(npyv_##SFX, npyv_##SFX); \
+ NPY_FINLINE npyv_##SFX##x2 npyv_load_##SFX##x2( \
+ const npyv_lanetype_##SFX *ptr \
+ ) { \
+ return npyv_unzip_##SFX( \
+ npyv_load_##SFX(ptr), npyv_load_##SFX(ptr+npyv_nlanes_##SFX) \
+ ); \
+ } \
+ NPY_FINLINE void npyv_store_##SFX##x2( \
+ npyv_lanetype_##SFX *ptr, npyv_##SFX##x2 v \
+ ) { \
+ npyv_##SFX##x2 zip = npyv_zip_##SFX(v.val[0], v.val[1]); \
+ npyv_store_##SFX(ptr, zip.val[0]); \
+ npyv_store_##SFX(ptr + npyv_nlanes_##SFX, zip.val[1]); \
+ }
+
+NPYV_IMPL_VEC_MEM_INTERLEAVE(u8)
+NPYV_IMPL_VEC_MEM_INTERLEAVE(s8)
+NPYV_IMPL_VEC_MEM_INTERLEAVE(u16)
+NPYV_IMPL_VEC_MEM_INTERLEAVE(s16)
+NPYV_IMPL_VEC_MEM_INTERLEAVE(u32)
+NPYV_IMPL_VEC_MEM_INTERLEAVE(s32)
+NPYV_IMPL_VEC_MEM_INTERLEAVE(u64)
+NPYV_IMPL_VEC_MEM_INTERLEAVE(s64)
+#if NPY_SIMD_F32
+NPYV_IMPL_VEC_MEM_INTERLEAVE(f32)
+#endif
+NPYV_IMPL_VEC_MEM_INTERLEAVE(f64)
+
/*********************************
* Lookup table
*********************************/
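The partial-load wrappers in this file, both the existing NPYV_IMPL_VEC_REST_PARTIAL_TYPES and the new _PAIR variant, now fill the pun union by assignment rather than with a designated initializer so the header also builds as C++. A sketch of the pattern in isolation; `fill_f32_as_s32` is an illustrative name, and reading the inactive union member is the C idiom these headers rely on.

static npy_int32 fill_f32_as_s32(float fill)
{
    union {
        float     from_f32;
        npy_int32 to_s32;
    } pun;
    pun.from_f32 = fill;     /* plain assignment instead of a designated initializer */
    return pun.to_s32;       /* same bit pattern, reinterpreted as s32               */
}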
diff --git a/numpy/core/src/common/simd/vec/misc.h b/numpy/core/src/common/simd/vec/misc.h
index 7ea0f21f6..79c194d90 100644
--- a/numpy/core/src/common/simd/vec/misc.h
+++ b/numpy/core/src/common/simd/vec/misc.h
@@ -26,28 +26,28 @@
#define NPYV_IMPL_VEC_SPLTW(T_VEC, V) ((T_VEC){V, V, V, V})
#define NPYV_IMPL_VEC_SPLTD(T_VEC, V) ((T_VEC){V, V})
-#define npyv_setall_u8(VAL) NPYV_IMPL_VEC_SPLTB(npyv_u8, (unsigned char)VAL)
-#define npyv_setall_s8(VAL) NPYV_IMPL_VEC_SPLTB(npyv_s8, (signed char)VAL)
-#define npyv_setall_u16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_u16, (unsigned short)VAL)
-#define npyv_setall_s16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_s16, (short)VAL)
-#define npyv_setall_u32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_u32, (unsigned int)VAL)
-#define npyv_setall_s32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_s32, (int)VAL)
+#define npyv_setall_u8(VAL) NPYV_IMPL_VEC_SPLTB(npyv_u8, (unsigned char)(VAL))
+#define npyv_setall_s8(VAL) NPYV_IMPL_VEC_SPLTB(npyv_s8, (signed char)(VAL))
+#define npyv_setall_u16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_u16, (unsigned short)(VAL))
+#define npyv_setall_s16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_s16, (short)(VAL))
+#define npyv_setall_u32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_u32, (unsigned int)(VAL))
+#define npyv_setall_s32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_s32, (int)(VAL))
#if NPY_SIMD_F32
- #define npyv_setall_f32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_f32, VAL)
+ #define npyv_setall_f32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_f32, (VAL))
#endif
-#define npyv_setall_u64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_u64, (npy_uint64)VAL)
-#define npyv_setall_s64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_s64, (npy_int64)VAL)
+#define npyv_setall_u64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_u64, (npy_uint64)(VAL))
+#define npyv_setall_s64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_s64, (npy_int64)(VAL))
#define npyv_setall_f64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_f64, VAL)
// vector with specific values set to each lane and
 // set a specific value to all remaining lanes
-#define npyv_setf_u8(FILL, ...) ((npyv_u8){NPYV__SET_FILL_16(char, FILL, __VA_ARGS__)})
-#define npyv_setf_s8(FILL, ...) ((npyv_s8){NPYV__SET_FILL_16(char, FILL, __VA_ARGS__)})
-#define npyv_setf_u16(FILL, ...) ((npyv_u16){NPYV__SET_FILL_8(short, FILL, __VA_ARGS__)})
+#define npyv_setf_u8(FILL, ...) ((npyv_u8){NPYV__SET_FILL_16(unsigned char, FILL, __VA_ARGS__)})
+#define npyv_setf_s8(FILL, ...) ((npyv_s8){NPYV__SET_FILL_16(signed char, FILL, __VA_ARGS__)})
+#define npyv_setf_u16(FILL, ...) ((npyv_u16){NPYV__SET_FILL_8(unsigned short, FILL, __VA_ARGS__)})
#define npyv_setf_s16(FILL, ...) ((npyv_s16){NPYV__SET_FILL_8(short, FILL, __VA_ARGS__)})
-#define npyv_setf_u32(FILL, ...) ((npyv_u32){NPYV__SET_FILL_4(int, FILL, __VA_ARGS__)})
+#define npyv_setf_u32(FILL, ...) ((npyv_u32){NPYV__SET_FILL_4(unsigned int, FILL, __VA_ARGS__)})
#define npyv_setf_s32(FILL, ...) ((npyv_s32){NPYV__SET_FILL_4(int, FILL, __VA_ARGS__)})
-#define npyv_setf_u64(FILL, ...) ((npyv_u64){NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__)})
+#define npyv_setf_u64(FILL, ...) ((npyv_u64){NPYV__SET_FILL_2(npy_uint64, FILL, __VA_ARGS__)})
#define npyv_setf_s64(FILL, ...) ((npyv_s64){NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__)})
#if NPY_SIMD_F32
#define npyv_setf_f32(FILL, ...) ((npyv_f32){NPYV__SET_FILL_4(float, FILL, __VA_ARGS__)})
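The added parentheses around VAL in the npyv_setall_* macros matter whenever the argument is a compound expression, because a bare cast binds only to the first operand. A small sketch, assuming the vec universal intrinsic types are in scope; `splat_half` is illustrative only.

static npyv_u8 splat_half(unsigned x)
{
    /* old expansion: (unsigned char)x >> 1   -- truncates x first, then shifts
     * new expansion: (unsigned char)(x >> 1) -- shifts first, then truncates  */
    return npyv_setall_u8(x >> 1);
}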
diff --git a/numpy/core/src/common/simd/vec/reorder.h b/numpy/core/src/common/simd/vec/reorder.h
index b60b9287d..3910980a2 100644
--- a/numpy/core/src/common/simd/vec/reorder.h
+++ b/numpy/core/src/common/simd/vec/reorder.h
@@ -68,6 +68,85 @@ NPYV_IMPL_VEC_COMBINE_ZIP(npyv_s64, s64)
#endif
NPYV_IMPL_VEC_COMBINE_ZIP(npyv_f64, f64)
+// deinterleave two vectors
+NPY_FINLINE npyv_u8x2 npyv_unzip_u8(npyv_u8 ab0, npyv_u8 ab1)
+{
+ const npyv_u8 idx_even = npyv_set_u8(
+ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ );
+ const npyv_u8 idx_odd = npyv_set_u8(
+ 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+ );
+ npyv_u8x2 r;
+ r.val[0] = vec_perm(ab0, ab1, idx_even);
+ r.val[1] = vec_perm(ab0, ab1, idx_odd);
+ return r;
+}
+NPY_FINLINE npyv_s8x2 npyv_unzip_s8(npyv_s8 ab0, npyv_s8 ab1)
+{
+ npyv_u8x2 ru = npyv_unzip_u8((npyv_u8)ab0, (npyv_u8)ab1);
+ npyv_s8x2 r;
+ r.val[0] = (npyv_s8)ru.val[0];
+ r.val[1] = (npyv_s8)ru.val[1];
+ return r;
+}
+NPY_FINLINE npyv_u16x2 npyv_unzip_u16(npyv_u16 ab0, npyv_u16 ab1)
+{
+ const npyv_u8 idx_even = npyv_set_u8(
+ 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
+ );
+ const npyv_u8 idx_odd = npyv_set_u8(
+ 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
+ );
+ npyv_u16x2 r;
+ r.val[0] = vec_perm(ab0, ab1, idx_even);
+ r.val[1] = vec_perm(ab0, ab1, idx_odd);
+ return r;
+}
+NPY_FINLINE npyv_s16x2 npyv_unzip_s16(npyv_s16 ab0, npyv_s16 ab1)
+{
+ npyv_u16x2 ru = npyv_unzip_u16((npyv_u16)ab0, (npyv_u16)ab1);
+ npyv_s16x2 r;
+ r.val[0] = (npyv_s16)ru.val[0];
+ r.val[1] = (npyv_s16)ru.val[1];
+ return r;
+}
+NPY_FINLINE npyv_u32x2 npyv_unzip_u32(npyv_u32 ab0, npyv_u32 ab1)
+{
+ npyv_u32 m0 = vec_mergeh(ab0, ab1);
+ npyv_u32 m1 = vec_mergel(ab0, ab1);
+ npyv_u32 r0 = vec_mergeh(m0, m1);
+ npyv_u32 r1 = vec_mergel(m0, m1);
+ npyv_u32x2 r;
+ r.val[0] = r0;
+ r.val[1] = r1;
+ return r;
+}
+NPY_FINLINE npyv_s32x2 npyv_unzip_s32(npyv_s32 ab0, npyv_s32 ab1)
+{
+ npyv_u32x2 ru = npyv_unzip_u32((npyv_u32)ab0, (npyv_u32)ab1);
+ npyv_s32x2 r;
+ r.val[0] = (npyv_s32)ru.val[0];
+ r.val[1] = (npyv_s32)ru.val[1];
+ return r;
+}
+#if NPY_SIMD_F32
+ NPY_FINLINE npyv_f32x2 npyv_unzip_f32(npyv_f32 ab0, npyv_f32 ab1)
+ {
+ npyv_u32x2 ru = npyv_unzip_u32((npyv_u32)ab0, (npyv_u32)ab1);
+ npyv_f32x2 r;
+ r.val[0] = (npyv_f32)ru.val[0];
+ r.val[1] = (npyv_f32)ru.val[1];
+ return r;
+ }
+#endif
+NPY_FINLINE npyv_u64x2 npyv_unzip_u64(npyv_u64 ab0, npyv_u64 ab1)
+{ return npyv_combine_u64(ab0, ab1); }
+NPY_FINLINE npyv_s64x2 npyv_unzip_s64(npyv_s64 ab0, npyv_s64 ab1)
+{ return npyv_combine_s64(ab0, ab1); }
+NPY_FINLINE npyv_f64x2 npyv_unzip_f64(npyv_f64 ab0, npyv_f64 ab1)
+{ return npyv_combine_f64(ab0, ab1); }
+
// Reverse elements of each 64-bit lane
NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a)
{
@@ -111,4 +190,24 @@ NPY_FINLINE npyv_s32 npyv_rev64_s32(npyv_s32 a)
{ return (npyv_f32)npyv_rev64_u32((npyv_u32)a); }
#endif
+// Permute the elements of each 128-bit lane using an immediate index
+// supplied for each destination element.
+#define npyv_permi128_u32(A, E0, E1, E2, E3) \
+ vec_perm(A, A, npyv_set_u8( \
+ (E0<<2), (E0<<2)+1, (E0<<2)+2, (E0<<2)+3, \
+ (E1<<2), (E1<<2)+1, (E1<<2)+2, (E1<<2)+3, \
+ (E2<<2), (E2<<2)+1, (E2<<2)+2, (E2<<2)+3, \
+ (E3<<2), (E3<<2)+1, (E3<<2)+2, (E3<<2)+3 \
+ ))
+#define npyv_permi128_s32 npyv_permi128_u32
+#define npyv_permi128_f32 npyv_permi128_u32
+
+#if defined(__IBMC__) || defined(vec_permi)
+ #define npyv_permi128_u64(A, E0, E1) vec_permi(A, A, ((E0)<<1) | (E1))
+#else
+ #define npyv_permi128_u64(A, E0, E1) vec_xxpermdi(A, A, ((E0)<<1) | (E1))
+#endif
+#define npyv_permi128_s64 npyv_permi128_u64
+#define npyv_permi128_f64 npyv_permi128_u64
+
#endif // _NPY_SIMD_VEC_REORDER_H
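The npyv_unzip_* family added here, like its SSE counterpart earlier in this patch, splits two registers holding interleaved data into an even-index and an odd-index channel, the inverse of npyv_zip_*. A scalar reference of that contract for the 4-lane case; `unzip_u32_reference` is an illustrative name only.

static void unzip_u32_reference(const npy_uint32 ab0[4], const npy_uint32 ab1[4],
                                npy_uint32 even[4], npy_uint32 odd[4])
{
    /* view {ab0, ab1} as one 8-lane sequence and split it by lane parity */
    const npy_uint32 cat[8] = {ab0[0], ab0[1], ab0[2], ab0[3],
                               ab1[0], ab1[1], ab1[2], ab1[3]};
    for (int i = 0; i < 4; ++i) {
        even[i] = cat[2*i];
        odd[i]  = cat[2*i + 1];
    }
}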
diff --git a/numpy/core/src/common/simd/vec/vec.h b/numpy/core/src/common/simd/vec/vec.h
index abcd33ce1..1d4508669 100644
--- a/numpy/core/src/common/simd/vec/vec.h
+++ b/numpy/core/src/common/simd/vec/vec.h
@@ -39,8 +39,10 @@
#ifdef NPY_HAVE_VX
#define NPY_SIMD_BIGENDIAN 1
+ #define NPY_SIMD_CMPSIGNAL 0
#else
#define NPY_SIMD_BIGENDIAN 0
+ #define NPY_SIMD_CMPSIGNAL 1
#endif
typedef __vector unsigned char npyv_u8;
diff --git a/numpy/core/src/common/templ_common.h.src b/numpy/core/src/common/templ_common.h.src
index 1eea8eb37..d0d1917fe 100644
--- a/numpy/core/src/common/templ_common.h.src
+++ b/numpy/core/src/common/templ_common.h.src
@@ -4,6 +4,7 @@
/* utility functions that profit from templates */
#include "numpy/npy_common.h"
+#include <assert.h>
/**begin repeat
* #name = int, uint, long, ulong,
@@ -24,7 +25,7 @@
* that is not checked. Could use absolute values and adjust the sign if that
* functionality was desired.
*/
-static NPY_INLINE int
+static inline int
npy_mul_with_overflow_@name@(@type@ * r, @type@ a, @type@ b)
{
#ifdef HAVE___BUILTIN_MUL_OVERFLOW
@@ -51,14 +52,14 @@ npy_mul_with_overflow_@name@(@type@ * r, @type@ a, @type@ b)
}
/**end repeat**/
-static NPY_INLINE int
+static inline int
npy_mul_sizes_with_overflow (npy_intp * r, npy_intp a, npy_intp b)
{
#ifdef HAVE___BUILTIN_MUL_OVERFLOW
return __builtin_mul_overflow(a, b, r);
#else
-
- assert(a >= 0 && b >= 0, "this function only supports non-negative numbers");
+ /* this function only supports non-negative numbers */
+ assert(a >= 0 && b >= 0);
const npy_intp half_sz = ((npy_intp)1 << ((sizeof(a) * 8 - 1 ) / 2));
*r = a * b;
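npy_mul_sizes_with_overflow, like the per-type npy_mul_with_overflow_* variants above, returns non-zero when the product does not fit, either via __builtin_mul_overflow or the half-size heuristic. A usage sketch assuming this template header is included; `total_size` is a hypothetical caller.

static int total_size(const npy_intp *shape, int ndim, npy_intp *out)
{
    npy_intp total = 1;
    for (int i = 0; i < ndim; ++i) {
        if (npy_mul_sizes_with_overflow(&total, total, shape[i])) {
            return -1;   /* product overflowed npy_intp */
        }
    }
    *out = total;
    return 0;
}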
diff --git a/numpy/core/src/common/ucsnarrow.h b/numpy/core/src/common/ucsnarrow.h
index 6fe157199..4b17a2809 100644
--- a/numpy/core/src/common/ucsnarrow.h
+++ b/numpy/core/src/common/ucsnarrow.h
@@ -2,6 +2,6 @@
#define NUMPY_CORE_SRC_COMMON_NPY_UCSNARROW_H_
NPY_NO_EXPORT PyUnicodeObject *
-PyUnicode_FromUCS4(char *src, Py_ssize_t size, int swap, int align);
+PyUnicode_FromUCS4(char const *src, Py_ssize_t size, int swap, int align);
#endif /* NUMPY_CORE_SRC_COMMON_NPY_UCSNARROW_H_ */
diff --git a/numpy/core/src/common/utils.hpp b/numpy/core/src/common/utils.hpp
new file mode 100644
index 000000000..f847cab44
--- /dev/null
+++ b/numpy/core/src/common/utils.hpp
@@ -0,0 +1,51 @@
+#ifndef NUMPY_CORE_SRC_COMMON_UTILS_HPP
+#define NUMPY_CORE_SRC_COMMON_UTILS_HPP
+
+#include "npdef.hpp"
+
+#if NP_HAS_CPP20
+ #include <bit>
+#endif
+
+#include <type_traits>
+#include <string.h>
+
+namespace np {
+
+/** Create a value of type `To` from the bits of `from`.
+ *
+ * Similar to `std::bit_cast` but compatible with C++17. It should behave
+ * like `*reinterpret_cast<To*>(&from)` or type punning, without invoking
+ * any undefined behavior.
+ */
+template<typename To, typename From>
+#if NP_HAS_BUILTIN(__builtin_bit_cast) || NP_HAS_CPP20
+[[nodiscard]] constexpr
+#else
+inline
+#endif
+To BitCast(const From &from) noexcept
+{
+ static_assert(
+ sizeof(To) == sizeof(From),
+ "both data types must have the same size");
+
+ static_assert(
+ std::is_trivially_copyable_v<To> &&
+ std::is_trivially_copyable_v<From>,
+ "both data types must be trivially copyable");
+
+#if NP_HAS_CPP20
+ return std::bit_cast<To>(from);
+#elif NP_HAS_BUILTIN(__builtin_bit_cast)
+ return __builtin_bit_cast(To, from);
+#else
+ To to;
+ memcpy(&to, &from, sizeof(from));
+ return to;
+#endif
+}
+
+} // namespace np
+#endif // NUMPY_CORE_SRC_COMMON_UTILS_HPP
+
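A short usage sketch for np::BitCast in C++, assuming this header is included as "utils.hpp"; `DoubleBits` is an illustrative name. It recovers the IEEE-754 bit pattern of a double without going through reinterpret_cast.

#include <cstdint>
#include "utils.hpp"

inline std::uint64_t DoubleBits(double x)
{
    // constexpr-friendly on C++20 or when __builtin_bit_cast is available
    return np::BitCast<std::uint64_t>(x);
}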