author    Sayed Adel <seiko@imavr.com>  2020-11-16 21:20:57 +0000
committer Sayed Adel <seiko@imavr.com>  2020-12-22 20:34:43 +0000
commit    150d459e0cf7ce8c92c971260e4aa88cbae43a2c (patch)
tree      eda5c3aabdeea61c99b0d242897d3c3cab9a1e1f
parent    cfb6a4d88d88307d507a86d1d70cd7d84c611406 (diff)
download  numpy-150d459e0cf7ce8c92c971260e4aa88cbae43a2c.tar.gz
ENH, SIMD: Add new NPYV intrinsics pack(1)
- add bitwise logical operations for boolean vectors
- add round conversion for float vectors
- add NaN test for float vectors
- add conditional addition and subtraction
- add the #define NPY_SIMD_FMA3 to check for native FUSED (FMA3) support
- add test cases for all of the above
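For orientation, a minimal sketch of how the new intrinsics compose (not part of the patch; the kernel, array names, and the assumption that len is a multiple of the lane count are illustrative only):

#include "simd/simd.h" // NPYV universal intrinsics (NumPy-internal header)

#if NPY_SIMD
// hypothetical kernel: per lane, out = m ? a + b : b, plus a NaN test
// and a round-to-nearest-integer conversion of `a`
static void example_kernel(const float *a, const float *b,
                           float *out, npy_int32 *iout, npy_intp len)
{
    // assumes len is a multiple of npyv_nlanes_f32 (no tail handling)
    for (npy_intp i = 0; i < len; i += npyv_nlanes_f32) {
        npyv_f32 va  = npyv_load_f32(a + i);
        npyv_f32 vb  = npyv_load_f32(b + i);
        npyv_b32 m   = npyv_notnan_f32(va);           // new: NaN test
        npyv_f32 sum = npyv_ifadd_f32(m, va, vb, vb); // new: conditional addition
        npyv_store_f32(out + i, sum);
        npyv_store_s32(iout + i, npyv_round_s32_f32(va)); // new: round conversion
    }
}
#endif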
-rw-r--r--  numpy/core/src/_simd/_simd.dispatch.c.src        87
-rw-r--r--  numpy/core/src/_simd/_simd_easyintrin.inc        30
-rw-r--r--  numpy/core/src/common/simd/avx2/avx2.h            5
-rw-r--r--  numpy/core/src/common/simd/avx2/conversion.h      8
-rw-r--r--  numpy/core/src/common/simd/avx2/operators.h      22
-rw-r--r--  numpy/core/src/common/simd/avx512/avx512.h        2
-rw-r--r--  numpy/core/src/common/simd/avx512/conversion.h    8
-rw-r--r--  numpy/core/src/common/simd/avx512/maskop.h       54
-rw-r--r--  numpy/core/src/common/simd/avx512/operators.h    63
-rw-r--r--  numpy/core/src/common/simd/emulate_maskop.h      44
-rw-r--r--  numpy/core/src/common/simd/neon/conversion.h     20
-rw-r--r--  numpy/core/src/common/simd/neon/neon.h            5
-rw-r--r--  numpy/core/src/common/simd/neon/operators.h      25
-rw-r--r--  numpy/core/src/common/simd/simd.h                 7
-rw-r--r--  numpy/core/src/common/simd/sse/conversion.h       8
-rw-r--r--  numpy/core/src/common/simd/sse/operators.h       22
-rw-r--r--  numpy/core/src/common/simd/sse/sse.h              6
-rw-r--r--  numpy/core/src/common/simd/vsx/conversion.h      50
-rw-r--r--  numpy/core/src/common/simd/vsx/operators.h       18
-rw-r--r--  numpy/core/src/common/simd/vsx/vsx.h              1
-rw-r--r--  numpy/core/tests/test_simd.py                   118
21 files changed, 598 insertions, 5 deletions
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src
index e3dbcdece..eaff81338 100644
--- a/numpy/core/src/_simd/_simd.dispatch.c.src
+++ b/numpy/core/src/_simd/_simd.dispatch.c.src
@@ -372,16 +372,58 @@ SIMD_IMPL_INTRIN_1(@intrin@_@sfx@, v@sfx@, v@sfx@)
/**end repeat1**/
#endif
+/***************************
+ * Mask operations
+ ***************************/
+/**begin repeat1
+ * #intrin = ifadd, ifsub#
+ */
+ SIMD_IMPL_INTRIN_4(@intrin@_@sfx@, v@sfx@, v@bsfx@, v@sfx@, v@sfx@, v@sfx@)
+/**end repeat1**/
+
#endif // simd_sup
/**end repeat**/
/*************************************************************************
* Variant
************************************************************************/
SIMD_IMPL_INTRIN_0N(cleanup)
+
+/*************************************************************************
+ * A special section for f32/f64 intrinsics outside the main repeater
+ ************************************************************************/
+/***************************
+ * Operators
+ ***************************/
+// check special cases
+SIMD_IMPL_INTRIN_1(notnan_f32, vb32, vf32)
+#if NPY_SIMD_F64
+ SIMD_IMPL_INTRIN_1(notnan_f64, vb64, vf64)
+#endif
+/***************************
+ * Conversions
+ ***************************/
+// round to nearest integer (assume even)
+SIMD_IMPL_INTRIN_1(round_s32_f32, vs32, vf32)
+#if NPY_SIMD_F64
+ SIMD_IMPL_INTRIN_2(round_s32_f64, vs32, vf64, vf64)
+#endif
+
/*************************************************************************
* A special section for boolean intrinsics outside the main repeater
************************************************************************/
/***************************
+ * Operators
+ ***************************/
+// Logical
+/**begin repeat
+ * #bsfx = b8, b16, b32, b64#
+ */
+SIMD_IMPL_INTRIN_2(and_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@)
+SIMD_IMPL_INTRIN_2(or_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@)
+SIMD_IMPL_INTRIN_2(xor_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@)
+SIMD_IMPL_INTRIN_1(not_@bsfx@, v@bsfx@, v@bsfx@)
+/**end repeat**/
+/***************************
* Conversions
***************************/
// Convert mask vector to integer bitfield
@@ -534,6 +576,16 @@ SIMD_INTRIN_DEF(sum_@sfx@)
SIMD_INTRIN_DEF(@intrin@_@sfx@)
/**end repeat1**/
#endif
+
+/***************************
+ * Mask operations
+ ***************************/
+/**begin repeat1
+ * #intrin = ifadd, ifsub#
+ */
+ SIMD_INTRIN_DEF(@intrin@_@sfx@)
+/**end repeat1**/
+
#endif // simd_sup
/**end repeat**/
/*************************************************************************
@@ -542,9 +594,41 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@)
SIMD_INTRIN_DEF(cleanup)
/*************************************************************************
+ * A special section for f32/f64 intrinsics outside the main repeater
+ ************************************************************************/
+/***************************
+ * Operators
+ ***************************/
+// check special cases
+SIMD_INTRIN_DEF(notnan_f32)
+#if NPY_SIMD_F64
+ SIMD_INTRIN_DEF(notnan_f64)
+#endif
+/***************************
+ * Conversions
+ ***************************/
+// round to nearest integer (assume even)
+SIMD_INTRIN_DEF(round_s32_f32)
+#if NPY_SIMD_F64
+ SIMD_INTRIN_DEF(round_s32_f64)
+#endif
+
+/*************************************************************************
* A special section for boolean intrinsics outside the main repeater
************************************************************************/
/***************************
+ * Operators
+ ***************************/
+// Logical
+/**begin repeat
+ * #bsfx = b8, b16, b32, b64#
+ */
+SIMD_INTRIN_DEF(and_@bsfx@)
+SIMD_INTRIN_DEF(or_@bsfx@)
+SIMD_INTRIN_DEF(xor_@bsfx@)
+SIMD_INTRIN_DEF(not_@bsfx@)
+/**end repeat**/
+/***************************
* Conversions
***************************/
// Convert mask vector to integer bitfield
@@ -590,6 +674,9 @@ NPY_CPU_DISPATCH_CURFX(simd_create_module)(void)
if (PyModule_AddIntConstant(m, "simd_f64", NPY_SIMD_F64)) {
goto err;
}
+ if (PyModule_AddIntConstant(m, "simd_fma3", NPY_SIMD_FMA3)) {
+ goto err;
+ }
if (PyModule_AddIntConstant(m, "simd_width", NPY_SIMD_WIDTH)) {
goto err;
}
diff --git a/numpy/core/src/_simd/_simd_easyintrin.inc b/numpy/core/src/_simd/_simd_easyintrin.inc
index 54e7ccf01..f83d7a286 100644
--- a/numpy/core/src/_simd/_simd_easyintrin.inc
+++ b/numpy/core/src/_simd/_simd_easyintrin.inc
@@ -123,6 +123,36 @@
}; \
return simd_arg_to_obj(&ret); \
}
+
+#define SIMD_IMPL_INTRIN_4(NAME, RET, IN0, IN1, IN2, IN3) \
+ static PyObject *simd__intrin_##NAME \
+ (PyObject* NPY_UNUSED(self), PyObject *args) \
+ { \
+ simd_arg arg1 = {.dtype = simd_data_##IN0}; \
+ simd_arg arg2 = {.dtype = simd_data_##IN1}; \
+ simd_arg arg3 = {.dtype = simd_data_##IN2}; \
+ simd_arg arg4 = {.dtype = simd_data_##IN3}; \
+ if (!PyArg_ParseTuple( \
+ args, "O&O&O&O&:"NPY_TOSTRING(NAME), \
+ simd_arg_converter, &arg1, \
+ simd_arg_converter, &arg2, \
+ simd_arg_converter, &arg3, \
+ simd_arg_converter, &arg4 \
+ )) return NULL; \
+ simd_data data = {.RET = npyv_##NAME( \
+ arg1.data.IN0, arg2.data.IN1, \
+ arg3.data.IN2, arg4.data.IN3 \
+ )}; \
+ simd_arg_free(&arg1); \
+ simd_arg_free(&arg2); \
+ simd_arg_free(&arg3); \
+ simd_arg_free(&arg4); \
+ simd_arg ret = { \
+ .data = data, .dtype = simd_data_##RET \
+ }; \
+ return simd_arg_to_obj(&ret); \
+ }
+
/**
 * Helper macros for repeating and expanding a certain macro.
* Mainly used for converting a scalar to an immediate constant.
diff --git a/numpy/core/src/common/simd/avx2/avx2.h b/numpy/core/src/common/simd/avx2/avx2.h
index 6f0d3c0d9..bcd90b110 100644
--- a/numpy/core/src/common/simd/avx2/avx2.h
+++ b/numpy/core/src/common/simd/avx2/avx2.h
@@ -5,6 +5,11 @@
#define NPY_SIMD 256
#define NPY_SIMD_WIDTH 32
#define NPY_SIMD_F64 1
+#ifdef NPY_HAVE_FMA3
+ #define NPY_SIMD_FMA3 1 // native support
+#else
+ #define NPY_SIMD_FMA3 0 // fast emulated
+#endif
// Enough limit to allow us to use _mm256_i32gather_*
#define NPY_SIMD_MAXLOAD_STRIDE32 (0x7fffffff / 8)
diff --git a/numpy/core/src/common/simd/avx2/conversion.h b/numpy/core/src/common/simd/avx2/conversion.h
index f72678b54..dc6b18766 100644
--- a/numpy/core/src/common/simd/avx2/conversion.h
+++ b/numpy/core/src/common/simd/avx2/conversion.h
@@ -43,4 +43,12 @@ NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
{ return (npy_uint8)_mm256_movemask_pd(_mm256_castsi256_pd(a)); }
+// round to nearest integer (assuming even)
+#define npyv_round_s32_f32 _mm256_cvtps_epi32
+NPY_FINLINE npyv_s32 npyv_round_s32_f64(npyv_f64 a, npyv_f64 b)
+{
+ __m128i lo = _mm256_cvtpd_epi32(a), hi = _mm256_cvtpd_epi32(b);
+ return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
+}
+
#endif // _NPY_SIMD_AVX2_CVT_H
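As a rough scalar reference (an illustration, not code from the patch), npyv_round_s32_f64 rounds the lanes of two f64 vectors and packs them into a single s32 vector, the first argument filling the lower half. Assuming the default round-to-nearest mode, the AVX2 variant behaves like:

#include <math.h>
#include <stdint.h>

// scalar sketch for a 256-bit register (4 f64 lanes per input);
// rint() rounds to nearest with ties to even, matching the vector conversion
static void round_s32_f64_ref(const double a[4], const double b[4], int32_t out[8])
{
    for (int i = 0; i < 4; ++i) {
        out[i]     = (int32_t)rint(a[i]);
        out[i + 4] = (int32_t)rint(b[i]);
    }
}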
diff --git a/numpy/core/src/common/simd/avx2/operators.h b/numpy/core/src/common/simd/avx2/operators.h
index c1d30413f..5fc7719e9 100644
--- a/numpy/core/src/common/simd/avx2/operators.h
+++ b/numpy/core/src/common/simd/avx2/operators.h
@@ -61,6 +61,10 @@ NPY_FINLINE __m256i npyv_shr_s64(__m256i a, int c)
#define npyv_and_s64 _mm256_and_si256
#define npyv_and_f32 _mm256_and_ps
#define npyv_and_f64 _mm256_and_pd
+#define npyv_and_b8 _mm256_and_si256
+#define npyv_and_b16 _mm256_and_si256
+#define npyv_and_b32 _mm256_and_si256
+#define npyv_and_b64 _mm256_and_si256
// OR
#define npyv_or_u8 _mm256_or_si256
@@ -73,6 +77,10 @@ NPY_FINLINE __m256i npyv_shr_s64(__m256i a, int c)
#define npyv_or_s64 _mm256_or_si256
#define npyv_or_f32 _mm256_or_ps
#define npyv_or_f64 _mm256_or_pd
+#define npyv_or_b8 _mm256_or_si256
+#define npyv_or_b16 _mm256_or_si256
+#define npyv_or_b32 _mm256_or_si256
+#define npyv_or_b64 _mm256_or_si256
// XOR
#define npyv_xor_u8 _mm256_xor_si256
@@ -85,6 +93,10 @@ NPY_FINLINE __m256i npyv_shr_s64(__m256i a, int c)
#define npyv_xor_s64 _mm256_xor_si256
#define npyv_xor_f32 _mm256_xor_ps
#define npyv_xor_f64 _mm256_xor_pd
+#define npyv_xor_b8 _mm256_xor_si256
+#define npyv_xor_b16 _mm256_xor_si256
+#define npyv_xor_b32 _mm256_xor_si256
+#define npyv_xor_b64 _mm256_xor_si256
// NOT
#define npyv_not_u8(A) _mm256_xor_si256(A, _mm256_set1_epi32(-1))
@@ -97,6 +109,10 @@ NPY_FINLINE __m256i npyv_shr_s64(__m256i a, int c)
#define npyv_not_s64 npyv_not_u8
#define npyv_not_f32(A) _mm256_xor_ps(A, _mm256_castsi256_ps(_mm256_set1_epi32(-1)))
#define npyv_not_f64(A) _mm256_xor_pd(A, _mm256_castsi256_pd(_mm256_set1_epi32(-1)))
+#define npyv_not_b8 npyv_not_u8
+#define npyv_not_b16 npyv_not_u8
+#define npyv_not_b32 npyv_not_u8
+#define npyv_not_b64 npyv_not_u8
/***************************
* Comparison
@@ -197,4 +213,10 @@ NPY_FINLINE __m256i npyv_cmpge_u32(__m256i a, __m256i b)
#define npyv_cmpge_f32(A, B) _mm256_castps_si256(_mm256_cmp_ps(A, B, _CMP_GE_OQ))
#define npyv_cmpge_f64(A, B) _mm256_castpd_si256(_mm256_cmp_pd(A, B, _CMP_GE_OQ))
+// check special cases
+NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a)
+{ return _mm256_castps_si256(_mm256_cmp_ps(a, a, _CMP_ORD_Q)); }
+NPY_FINLINE npyv_b64 npyv_notnan_f64(npyv_f64 a)
+{ return _mm256_castpd_si256(_mm256_cmp_pd(a, a, _CMP_ORD_Q)); }
+
#endif // _NPY_SIMD_AVX2_OPERATORS_H
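The NaN test above relies on the standard self-comparison trick: an ordered compare of a value with itself is true for every lane except NaN. A scalar analogue (illustrative only, assuming IEEE-754 semantics and no -ffast-math):

#include <stdbool.h>

// x == x is false only when x is NaN
static bool notnan_scalar(float x)
{
    return x == x;
}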
diff --git a/numpy/core/src/common/simd/avx512/avx512.h b/numpy/core/src/common/simd/avx512/avx512.h
index 2de33765a..f38686834 100644
--- a/numpy/core/src/common/simd/avx512/avx512.h
+++ b/numpy/core/src/common/simd/avx512/avx512.h
@@ -4,6 +4,7 @@
#define NPY_SIMD 512
#define NPY_SIMD_WIDTH 64
#define NPY_SIMD_F64 1
+#define NPY_SIMD_FMA3 1 // native support
// Enough limit to allow us to use _mm512_i32gather_* and _mm512_i32scatter_*
#define NPY_SIMD_MAXLOAD_STRIDE32 (0x7fffffff / 16)
#define NPY_SIMD_MAXSTORE_STRIDE32 (0x7fffffff / 16)
@@ -73,3 +74,4 @@ typedef struct { __m512d val[3]; } npyv_f64x3;
#include "conversion.h"
#include "arithmetic.h"
#include "math.h"
+#include "maskop.h"
diff --git a/numpy/core/src/common/simd/avx512/conversion.h b/numpy/core/src/common/simd/avx512/conversion.h
index 6ad299dd5..1d71d7b49 100644
--- a/numpy/core/src/common/simd/avx512/conversion.h
+++ b/numpy/core/src/common/simd/avx512/conversion.h
@@ -82,4 +82,12 @@ NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
{ return (npy_uint8)a; }
+// round to nearest integer (assuming even)
+#define npyv_round_s32_f32 _mm512_cvtps_epi32
+NPY_FINLINE npyv_s32 npyv_round_s32_f64(npyv_f64 a, npyv_f64 b)
+{
+ __m256i lo = _mm512_cvtpd_epi32(a), hi = _mm512_cvtpd_epi32(b);
+ return npyv512_combine_si256(lo, hi);
+}
+
#endif // _NPY_SIMD_AVX512_CVT_H
diff --git a/numpy/core/src/common/simd/avx512/maskop.h b/numpy/core/src/common/simd/avx512/maskop.h
new file mode 100644
index 000000000..d1c188390
--- /dev/null
+++ b/numpy/core/src/common/simd/avx512/maskop.h
@@ -0,0 +1,54 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header, use simd/simd.h instead"
+#endif
+
+#ifndef _NPY_SIMD_AVX512_MASKOP_H
+#define _NPY_SIMD_AVX512_MASKOP_H
+
+/**
+ * Implements conditional addition and subtraction.
+ * e.g. npyv_ifadd_f32(m, a, b, c) -> m ? a + b : c
+ * e.g. npyv_ifsub_f32(m, a, b, c) -> m ? a - b : c
+ */
+#define NPYV_IMPL_AVX512_EMULATE_MASK_ADDSUB(SFX, BSFX) \
+ NPY_FINLINE npyv_##SFX npyv_ifadd_##SFX \
+ (npyv_##BSFX m, npyv_##SFX a, npyv_##SFX b, npyv_##SFX c) \
+ { \
+ npyv_##SFX add = npyv_add_##SFX(a, b); \
+ return npyv_select_##SFX(m, add, c); \
+ } \
+ NPY_FINLINE npyv_##SFX npyv_ifsub_##SFX \
+ (npyv_##BSFX m, npyv_##SFX a, npyv_##SFX b, npyv_##SFX c) \
+ { \
+ npyv_##SFX sub = npyv_sub_##SFX(a, b); \
+ return npyv_select_##SFX(m, sub, c); \
+ }
+
+#define NPYV_IMPL_AVX512_MASK_ADDSUB(SFX, BSFX, ZSFX) \
+ NPY_FINLINE npyv_##SFX npyv_ifadd_##SFX \
+ (npyv_##BSFX m, npyv_##SFX a, npyv_##SFX b, npyv_##SFX c) \
+ { return _mm512_mask_add_##ZSFX(c, m, a, b); } \
+ NPY_FINLINE npyv_##SFX npyv_ifsub_##SFX \
+ (npyv_##BSFX m, npyv_##SFX a, npyv_##SFX b, npyv_##SFX c) \
+ { return _mm512_mask_sub_##ZSFX(c, m, a, b); }
+
+#ifdef NPY_HAVE_AVX512BW
+ NPYV_IMPL_AVX512_MASK_ADDSUB(u8, b8, epi8)
+ NPYV_IMPL_AVX512_MASK_ADDSUB(s8, b8, epi8)
+ NPYV_IMPL_AVX512_MASK_ADDSUB(u16, b16, epi16)
+ NPYV_IMPL_AVX512_MASK_ADDSUB(s16, b16, epi16)
+#else
+ NPYV_IMPL_AVX512_EMULATE_MASK_ADDSUB(u8, b8)
+ NPYV_IMPL_AVX512_EMULATE_MASK_ADDSUB(s8, b8)
+ NPYV_IMPL_AVX512_EMULATE_MASK_ADDSUB(u16, b16)
+ NPYV_IMPL_AVX512_EMULATE_MASK_ADDSUB(s16, b16)
+#endif
+
+NPYV_IMPL_AVX512_MASK_ADDSUB(u32, b32, epi32)
+NPYV_IMPL_AVX512_MASK_ADDSUB(s32, b32, epi32)
+NPYV_IMPL_AVX512_MASK_ADDSUB(u64, b64, epi64)
+NPYV_IMPL_AVX512_MASK_ADDSUB(s64, b64, epi64)
+NPYV_IMPL_AVX512_MASK_ADDSUB(f32, b32, ps)
+NPYV_IMPL_AVX512_MASK_ADDSUB(f64, b64, pd)
+
+#endif // _NPY_SIMD_AVX512_MASKOP_H
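To make the macros above concrete, NPYV_IMPL_AVX512_MASK_ADDSUB(f32, b32, ps) expands to roughly the following (whitespace adjusted):

// native masked add/sub: lanes where m is 0 take the value of c
NPY_FINLINE npyv_f32 npyv_ifadd_f32(npyv_b32 m, npyv_f32 a, npyv_f32 b, npyv_f32 c)
{ return _mm512_mask_add_ps(c, m, a, b); }

NPY_FINLINE npyv_f32 npyv_ifsub_f32(npyv_b32 m, npyv_f32 a, npyv_f32 b, npyv_f32 c)
{ return _mm512_mask_sub_ps(c, m, a, b); }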
diff --git a/numpy/core/src/common/simd/avx512/operators.h b/numpy/core/src/common/simd/avx512/operators.h
index f76ea5e2d..5f1771770 100644
--- a/numpy/core/src/common/simd/avx512/operators.h
+++ b/numpy/core/src/common/simd/avx512/operators.h
@@ -90,6 +90,20 @@
NPYV_IMPL_AVX512_FROM_SI512_PS_2ARG(npyv_and_f32, _mm512_and_si512)
NPYV_IMPL_AVX512_FROM_SI512_PD_2ARG(npyv_and_f64, _mm512_and_si512)
#endif
+#ifdef NPY_HAVE_AVX512BW_MASK
+ #define npyv_and_b8 _kand_mask64
+ #define npyv_and_b16 _kand_mask32
+#elif defined(NPY_HAVE_AVX512BW)
+ NPY_FINLINE npyv_b8 npyv_and_b8(npyv_b8 a, npyv_b8 b)
+ { return a & b; }
+ NPY_FINLINE npyv_b16 npyv_and_b16(npyv_b16 a, npyv_b16 b)
+ { return a & b; }
+#else
+ #define npyv_and_b8 _mm512_and_si512
+ #define npyv_and_b16 _mm512_and_si512
+#endif
+#define npyv_and_b32 _mm512_kand
+#define npyv_and_b64 _mm512_kand
// OR
#define npyv_or_u8 _mm512_or_si512
@@ -107,6 +121,20 @@
NPYV_IMPL_AVX512_FROM_SI512_PS_2ARG(npyv_or_f32, _mm512_or_si512)
NPYV_IMPL_AVX512_FROM_SI512_PD_2ARG(npyv_or_f64, _mm512_or_si512)
#endif
+#ifdef NPY_HAVE_AVX512BW_MASK
+ #define npyv_or_b8 _kor_mask64
+ #define npyv_or_b16 _kor_mask32
+#elif defined(NPY_HAVE_AVX512BW)
+ NPY_FINLINE npyv_b8 npyv_or_b8(npyv_b8 a, npyv_b8 b)
+ { return a | b; }
+ NPY_FINLINE npyv_b16 npyv_or_b16(npyv_b16 a, npyv_b16 b)
+ { return a | b; }
+#else
+ #define npyv_or_b8 _mm512_or_si512
+ #define npyv_or_b16 _mm512_or_si512
+#endif
+#define npyv_or_b32 _mm512_kor
+#define npyv_or_b64 _mm512_kor
// XOR
#define npyv_xor_u8 _mm512_xor_si512
@@ -124,6 +152,20 @@
NPYV_IMPL_AVX512_FROM_SI512_PS_2ARG(npyv_xor_f32, _mm512_xor_si512)
NPYV_IMPL_AVX512_FROM_SI512_PD_2ARG(npyv_xor_f64, _mm512_xor_si512)
#endif
+#ifdef NPY_HAVE_AVX512BW_MASK
+ #define npyv_xor_b8 _kxor_mask64
+ #define npyv_xor_b16 _kxor_mask32
+#elif defined(NPY_HAVE_AVX512BW)
+ NPY_FINLINE npyv_b8 npyv_xor_b8(npyv_b8 a, npyv_b8 b)
+ { return a ^ b; }
+ NPY_FINLINE npyv_b16 npyv_xor_b16(npyv_b16 a, npyv_b16 b)
+ { return a ^ b; }
+#else
+ #define npyv_xor_b8 _mm512_xor_si512
+ #define npyv_xor_b16 _mm512_xor_si512
+#endif
+#define npyv_xor_b32 _mm512_kxor
+#define npyv_xor_b64 _mm512_kxor
// NOT
#define npyv_not_u8(A) _mm512_xor_si512(A, _mm512_set1_epi32(-1))
@@ -141,6 +183,21 @@
#define npyv_not_f32(A) _mm512_castsi512_ps(npyv_not_u32(_mm512_castps_si512(A)))
#define npyv_not_f64(A) _mm512_castsi512_pd(npyv_not_u64(_mm512_castpd_si512(A)))
#endif
+#ifdef NPY_HAVE_AVX512BW_MASK
+ #define npyv_not_b8 _knot_mask64
+ #define npyv_not_b16 _knot_mask32
+#elif defined(NPY_HAVE_AVX512BW)
+ NPY_FINLINE npyv_b8 npyv_not_b8(npyv_b8 a)
+ { return ~a; }
+ NPY_FINLINE npyv_b16 npyv_not_b16(npyv_b16 a)
+ { return ~a; }
+#else
+ #define npyv_not_b8 npyv_not_u8
+ #define npyv_not_b16 npyv_not_u8
+#endif
+#define npyv_not_b32 _mm512_knot
+#define npyv_not_b64 _mm512_knot
+
/***************************
* Comparison
@@ -256,4 +313,10 @@
#define npyv_cmpge_f32(A, B) _mm512_cmp_ps_mask(A, B, _CMP_GE_OQ)
#define npyv_cmpge_f64(A, B) _mm512_cmp_pd_mask(A, B, _CMP_GE_OQ)
+// check special cases
+NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a)
+{ return _mm512_cmp_ps_mask(a, a, _CMP_ORD_Q); }
+NPY_FINLINE npyv_b64 npyv_notnan_f64(npyv_f64 a)
+{ return _mm512_cmp_pd_mask(a, a, _CMP_ORD_Q); }
+
#endif // _NPY_SIMD_AVX512_OPERATORS_H
diff --git a/numpy/core/src/common/simd/emulate_maskop.h b/numpy/core/src/common/simd/emulate_maskop.h
new file mode 100644
index 000000000..7e7446bc5
--- /dev/null
+++ b/numpy/core/src/common/simd/emulate_maskop.h
@@ -0,0 +1,44 @@
+/**
+ * This header is used internally by all currently supported SIMD extensions,
+ * except for AVX512.
+ */
+#ifndef NPY_SIMD
+ #error "Not a standalone header, use simd/simd.h instead"
+#endif
+
+#ifndef _NPY_SIMD_EMULATE_MASKOP_H
+#define _NPY_SIMD_EMULATE_MASKOP_H
+
+/**
+ * Implements conditional addition and subtraction.
+ * e.g. npyv_ifadd_f32(mask, a, b, c) -> mask ? a + b : c
+ * e.g. npyv_ifsub_f32(mask, a, b, c) -> mask ? a - b : c
+ */
+#define NPYV_IMPL_EMULATE_MASK_ADDSUB(SFX, BSFX) \
+ NPY_FINLINE npyv_##SFX npyv_ifadd_##SFX \
+ (npyv_##BSFX m, npyv_##SFX a, npyv_##SFX b, npyv_##SFX c) \
+ { \
+ npyv_##SFX add = npyv_add_##SFX(a, b); \
+ return npyv_select_##SFX(m, add, c); \
+ } \
+ NPY_FINLINE npyv_##SFX npyv_ifsub_##SFX \
+ (npyv_##BSFX m, npyv_##SFX a, npyv_##SFX b, npyv_##SFX c) \
+ { \
+ npyv_##SFX sub = npyv_sub_##SFX(a, b); \
+ return npyv_select_##SFX(m, sub, c); \
+ }
+
+NPYV_IMPL_EMULATE_MASK_ADDSUB(u8, b8)
+NPYV_IMPL_EMULATE_MASK_ADDSUB(s8, b8)
+NPYV_IMPL_EMULATE_MASK_ADDSUB(u16, b16)
+NPYV_IMPL_EMULATE_MASK_ADDSUB(s16, b16)
+NPYV_IMPL_EMULATE_MASK_ADDSUB(u32, b32)
+NPYV_IMPL_EMULATE_MASK_ADDSUB(s32, b32)
+NPYV_IMPL_EMULATE_MASK_ADDSUB(u64, b64)
+NPYV_IMPL_EMULATE_MASK_ADDSUB(s64, b64)
+NPYV_IMPL_EMULATE_MASK_ADDSUB(f32, b32)
+#if NPY_SIMD_F64
+ NPYV_IMPL_EMULATE_MASK_ADDSUB(f64, b64)
+#endif
+
+#endif // _NPY_SIMD_EMULATE_MASKOP_H
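For comparison with the native AVX512 path, NPYV_IMPL_EMULATE_MASK_ADDSUB(f32, b32) expands to roughly:

// emulated masked add: compute unconditionally, then blend by the mask
NPY_FINLINE npyv_f32 npyv_ifadd_f32(npyv_b32 m, npyv_f32 a, npyv_f32 b, npyv_f32 c)
{
    npyv_f32 add = npyv_add_f32(a, b);
    return npyv_select_f32(m, add, c); // m ? a + b : c
}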
diff --git a/numpy/core/src/common/simd/neon/conversion.h b/numpy/core/src/common/simd/neon/conversion.h
index f9840b1cb..f92910b66 100644
--- a/numpy/core/src/common/simd/neon/conversion.h
+++ b/numpy/core/src/common/simd/neon/conversion.h
@@ -71,4 +71,24 @@ NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
return vgetq_lane_u64(bit, 0) | ((int)vgetq_lane_u64(bit, 1) << 1);
}
+// round to nearest integer
+#if NPY_SIMD_F64
+ #define npyv_round_s32_f32 vcvtnq_s32_f32
+ NPY_FINLINE npyv_s32 npyv_round_s32_f64(npyv_f64 a, npyv_f64 b)
+ {
+ npyv_s64 lo = vcvtnq_s64_f64(a), hi = vcvtnq_s64_f64(b);
+ return vcombine_s32(vmovn_s64(lo), vmovn_s64(hi));
+ }
+#else
+ NPY_FINLINE npyv_s32 npyv_round_s32_f32(npyv_f32 a)
+ {
+ // halves are rounded away from zero; it's very costly to obey the
+ // IEEE round-to-nearest-even on Armv7, so tests should allow a +-1 difference
+ const npyv_u32 sign = vdupq_n_u32(0x80000000);
+ const npyv_f32 half = vdupq_n_f32(0.5f);
+ npyv_f32 sign_half = vbslq_f32(sign, a, half);
+ return vcvtq_s32_f32(vaddq_f32(a, sign_half));
+ }
+#endif
+
#endif // _NPY_SIMD_NEON_CVT_H
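On Armv7 (no vcvtnq_s32_f32), the fallback above blends the sign bit of each lane into 0.5 and adds it before the truncating conversion, so halves round away from zero. A scalar analogue (illustrative only):

#include <math.h>
#include <stdint.h>

// add sign-copied 0.5, then truncate toward zero:
// 0.5 -> 1 and -0.5 -> -1 (away from zero), rather than ties-to-even
static int32_t round_half_away(float x)
{
    return (int32_t)(x + copysignf(0.5f, x));
}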
diff --git a/numpy/core/src/common/simd/neon/neon.h b/numpy/core/src/common/simd/neon/neon.h
index c8ddc92ad..e6f6a7324 100644
--- a/numpy/core/src/common/simd/neon/neon.h
+++ b/numpy/core/src/common/simd/neon/neon.h
@@ -10,6 +10,11 @@
#else
#define NPY_SIMD_F64 0
#endif
+#ifdef NPY_HAVE_NEON_VFPV4
+ #define NPY_SIMD_FMA3 1 // native support
+#else
+ #define NPY_SIMD_FMA3 0 // HW emulated
+#endif
typedef uint8x16_t npyv_u8;
typedef int8x16_t npyv_s8;
diff --git a/numpy/core/src/common/simd/neon/operators.h b/numpy/core/src/common/simd/neon/operators.h
index c1ad4ba12..280c5e0da 100644
--- a/numpy/core/src/common/simd/neon/operators.h
+++ b/numpy/core/src/common/simd/neon/operators.h
@@ -58,6 +58,10 @@
vreinterpretq_f32_u8(vandq_u8(vreinterpretq_u8_f32(A), vreinterpretq_u8_f32(B)))
#define npyv_and_f64(A, B) \
vreinterpretq_f64_u8(vandq_u8(vreinterpretq_u8_f64(A), vreinterpretq_u8_f64(B)))
+#define npyv_and_b8 vandq_u8
+#define npyv_and_b16 vandq_u16
+#define npyv_and_b32 vandq_u32
+#define npyv_and_b64 vandq_u64
// OR
#define npyv_or_u8 vorrq_u8
@@ -72,6 +76,11 @@
vreinterpretq_f32_u8(vorrq_u8(vreinterpretq_u8_f32(A), vreinterpretq_u8_f32(B)))
#define npyv_or_f64(A, B) \
vreinterpretq_f64_u8(vorrq_u8(vreinterpretq_u8_f64(A), vreinterpretq_u8_f64(B)))
+#define npyv_or_b8 vorrq_u8
+#define npyv_or_b16 vorrq_u16
+#define npyv_or_b32 vorrq_u32
+#define npyv_or_b64 vorrq_u64
+
// XOR
#define npyv_xor_u8 veorq_u8
@@ -86,6 +95,10 @@
vreinterpretq_f32_u8(veorq_u8(vreinterpretq_u8_f32(A), vreinterpretq_u8_f32(B)))
#define npyv_xor_f64(A, B) \
vreinterpretq_f64_u8(veorq_u8(vreinterpretq_u8_f64(A), vreinterpretq_u8_f64(B)))
+#define npyv_xor_b8 veorq_u8
+#define npyv_xor_b16 veorq_u16
+#define npyv_xor_b32 veorq_u32
+#define npyv_xor_b64 veorq_u64
// NOT
#define npyv_not_u8 vmvnq_u8
@@ -98,6 +111,10 @@
#define npyv_not_s64(A) vreinterpretq_s64_u8(vmvnq_u8(vreinterpretq_u8_s64(A)))
#define npyv_not_f32(A) vreinterpretq_f32_u8(vmvnq_u8(vreinterpretq_u8_f32(A)))
#define npyv_not_f64(A) vreinterpretq_f64_u8(vmvnq_u8(vreinterpretq_u8_f64(A)))
+#define npyv_not_b8 vmvnq_u8
+#define npyv_not_b16 vmvnq_u16
+#define npyv_not_b32 vmvnq_u32
+#define npyv_not_b64 npyv_not_u64
/***************************
* Comparison
@@ -215,4 +232,12 @@
#define npyv_cmple_f32(A, B) npyv_cmpge_f32(B, A)
#define npyv_cmple_f64(A, B) npyv_cmpge_f64(B, A)
+// check special cases
+NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a)
+{ return vceqq_f32(a, a); }
+#if NPY_SIMD_F64
+ NPY_FINLINE npyv_b64 npyv_notnan_f64(npyv_f64 a)
+ { return vceqq_f64(a, a); }
+#endif
+
#endif // _NPY_SIMD_NEON_OPERATORS_H
diff --git a/numpy/core/src/common/simd/simd.h b/numpy/core/src/common/simd/simd.h
index 8804223c9..d6c14228d 100644
--- a/numpy/core/src/common/simd/simd.h
+++ b/numpy/core/src/common/simd/simd.h
@@ -48,7 +48,14 @@ typedef double npyv_lanetype_f64;
#define NPY_SIMD 0
#define NPY_SIMD_WIDTH 0
#define NPY_SIMD_F64 0
+ #define NPY_SIMD_FMA3 0
#endif
+
+// enable emulated mask operations for all SIMD extensions except AVX512
+#if !defined(NPY_HAVE_AVX512F) && NPY_SIMD && NPY_SIMD < 512
+ #include "emulate_maskop.h"
+#endif
+
/**
 * Some SIMD extensions (currently AVX2, AVX512F) require, de facto,
 * a maximum number of stride sizes when dealing with non-contiguous memory access.
diff --git a/numpy/core/src/common/simd/sse/conversion.h b/numpy/core/src/common/simd/sse/conversion.h
index ab4beea96..d690ec313 100644
--- a/numpy/core/src/common/simd/sse/conversion.h
+++ b/numpy/core/src/common/simd/sse/conversion.h
@@ -42,4 +42,12 @@ NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
{ return (npy_uint8)_mm_movemask_pd(_mm_castsi128_pd(a)); }
+// round to nearest integer (assuming even)
+#define npyv_round_s32_f32 _mm_cvtps_epi32
+NPY_FINLINE npyv_s32 npyv_round_s32_f64(npyv_f64 a, npyv_f64 b)
+{
+ __m128i lo = _mm_cvtpd_epi32(a), hi = _mm_cvtpd_epi32(b);
+ return _mm_unpacklo_epi64(lo, hi);
+}
+
#endif // _NPY_SIMD_SSE_CVT_H
diff --git a/numpy/core/src/common/simd/sse/operators.h b/numpy/core/src/common/simd/sse/operators.h
index 6e32ca4fd..51c84fb4e 100644
--- a/numpy/core/src/common/simd/sse/operators.h
+++ b/numpy/core/src/common/simd/sse/operators.h
@@ -62,6 +62,10 @@ NPY_FINLINE __m128i npyv_shr_s64(__m128i a, int c)
#define npyv_and_s64 _mm_and_si128
#define npyv_and_f32 _mm_and_ps
#define npyv_and_f64 _mm_and_pd
+#define npyv_and_b8 _mm_and_si128
+#define npyv_and_b16 _mm_and_si128
+#define npyv_and_b32 _mm_and_si128
+#define npyv_and_b64 _mm_and_si128
// OR
#define npyv_or_u8 _mm_or_si128
@@ -74,6 +78,10 @@ NPY_FINLINE __m128i npyv_shr_s64(__m128i a, int c)
#define npyv_or_s64 _mm_or_si128
#define npyv_or_f32 _mm_or_ps
#define npyv_or_f64 _mm_or_pd
+#define npyv_or_b8 _mm_or_si128
+#define npyv_or_b16 _mm_or_si128
+#define npyv_or_b32 _mm_or_si128
+#define npyv_or_b64 _mm_or_si128
// XOR
#define npyv_xor_u8 _mm_xor_si128
@@ -86,6 +94,10 @@ NPY_FINLINE __m128i npyv_shr_s64(__m128i a, int c)
#define npyv_xor_s64 _mm_xor_si128
#define npyv_xor_f32 _mm_xor_ps
#define npyv_xor_f64 _mm_xor_pd
+#define npyv_xor_b8 _mm_xor_si128
+#define npyv_xor_b16 _mm_xor_si128
+#define npyv_xor_b32 _mm_xor_si128
+#define npyv_xor_b64 _mm_xor_si128
// NOT
#define npyv_not_u8(A) _mm_xor_si128(A, _mm_set1_epi32(-1))
@@ -98,6 +110,10 @@ NPY_FINLINE __m128i npyv_shr_s64(__m128i a, int c)
#define npyv_not_s64 npyv_not_u8
#define npyv_not_f32(A) _mm_xor_ps(A, _mm_castsi128_ps(_mm_set1_epi32(-1)))
#define npyv_not_f64(A) _mm_xor_pd(A, _mm_castsi128_pd(_mm_set1_epi32(-1)))
+#define npyv_not_b8 npyv_not_u8
+#define npyv_not_b16 npyv_not_u8
+#define npyv_not_b32 npyv_not_u8
+#define npyv_not_b64 npyv_not_u8
/***************************
* Comparison
@@ -255,4 +271,10 @@ NPY_FINLINE __m128i npyv_shr_s64(__m128i a, int c)
#define npyv_cmpge_f32(a, b) _mm_castps_si128(_mm_cmpge_ps(a, b))
#define npyv_cmpge_f64(a, b) _mm_castpd_si128(_mm_cmpge_pd(a, b))
+// check special cases
+NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a)
+{ return _mm_castps_si128(_mm_cmpord_ps(a, a)); }
+NPY_FINLINE npyv_b64 npyv_notnan_f64(npyv_f64 a)
+{ return _mm_castpd_si128(_mm_cmpord_pd(a, a)); }
+
#endif // _NPY_SIMD_SSE_OPERATORS_H
diff --git a/numpy/core/src/common/simd/sse/sse.h b/numpy/core/src/common/simd/sse/sse.h
index 132d3d347..dc0b62f73 100644
--- a/numpy/core/src/common/simd/sse/sse.h
+++ b/numpy/core/src/common/simd/sse/sse.h
@@ -5,7 +5,11 @@
#define NPY_SIMD 128
#define NPY_SIMD_WIDTH 16
#define NPY_SIMD_F64 1
-
+#if defined(NPY_HAVE_FMA3) || defined(NPY_HAVE_FMA4)
+ #define NPY_SIMD_FMA3 1 // native support
+#else
+ #define NPY_SIMD_FMA3 0 // fast emulated
+#endif
typedef __m128i npyv_u8;
typedef __m128i npyv_s8;
typedef __m128i npyv_u16;
diff --git a/numpy/core/src/common/simd/vsx/conversion.h b/numpy/core/src/common/simd/vsx/conversion.h
index 5803e1cdd..72fe10e69 100644
--- a/numpy/core/src/common/simd/vsx/conversion.h
+++ b/numpy/core/src/common/simd/vsx/conversion.h
@@ -51,4 +51,54 @@ NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
return vec_extract(bit, 0) | (int)vec_extract(bit, 1) << 1;
}
+// truncate, compatible with all compilers (internal use for now)
+NPY_FINLINE npyv_s32 npyv__trunc_s32_f32(npyv_f32 a)
+{
+#ifdef __IBMC__
+ return vec_cts(a, 0);
+#elif defined(__clang__)
+ /**
+ * old versions of Clang don't support %x<n> in the inline asm template,
+ * which fixes the register number when using any of the register constraints wa, wd, wf;
+ * therefore, we rely on built-in functions.
+ */
+ return __builtin_convertvector(a, npyv_s32);
+#else // gcc
+ npyv_s32 ret;
+ __asm__ ("xvcvspsxws %x0,%x1" : "=wa" (ret) : "wa" (a));
+ return ret;
+#endif
+}
+NPY_FINLINE npyv_s32 npyv__trunc_s32_f64(npyv_f64 a, npyv_f64 b)
+{
+#ifdef __IBMC__
+ const npyv_u8 seq_even = npyv_set_u8(0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27);
+ // unfortunately, XLC is missing the asm VSX register fixer
+ // hopefully, XLC can optimize around the big-endian compatibility
+ npyv_s32 lo_even = vec_cts(a, 0);
+ npyv_s32 hi_even = vec_cts(b, 0);
+ return vec_perm(lo_even, hi_even, seq_even);
+#else
+ const npyv_u8 seq_odd = npyv_set_u8(4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31);
+ #ifdef __clang__
+ // __builtin_convertvector doesn't support this conversion on a wide range of versions
+ // fortunately, almost all versions have a direct builtin for 'xvcvdpsxws'
+ npyv_s32 lo_odd = __builtin_vsx_xvcvdpsxws(a);
+ npyv_s32 hi_odd = __builtin_vsx_xvcvdpsxws(b);
+ #else // gcc
+ npyv_s32 lo_odd, hi_odd;
+ __asm__ ("xvcvdpsxws %x0,%x1" : "=wa" (lo_odd) : "wa" (a));
+ __asm__ ("xvcvdpsxws %x0,%x1" : "=wa" (hi_odd) : "wa" (b));
+ #endif
+ return vec_perm(lo_odd, hi_odd, seq_odd);
+#endif
+}
+
+// round to nearest integer (assuming even)
+NPY_FINLINE npyv_s32 npyv_round_s32_f32(npyv_f32 a)
+{ return npyv__trunc_s32_f32(vec_rint(a)); }
+
+NPY_FINLINE npyv_s32 npyv_round_s32_f64(npyv_f64 a, npyv_f64 b)
+{ return npyv__trunc_s32_f64(vec_rint(a), vec_rint(b)); }
+
#endif // _NPY_SIMD_VSX_CVT_H
diff --git a/numpy/core/src/common/simd/vsx/operators.h b/numpy/core/src/common/simd/vsx/operators.h
index ca020d9e0..230610129 100644
--- a/numpy/core/src/common/simd/vsx/operators.h
+++ b/numpy/core/src/common/simd/vsx/operators.h
@@ -56,6 +56,10 @@
#define npyv_and_s64 vec_and
#define npyv_and_f32 vec_and
#define npyv_and_f64 vec_and
+#define npyv_and_b8 vec_and
+#define npyv_and_b16 vec_and
+#define npyv_and_b32 vec_and
+#define npyv_and_b64 vec_and
// OR
#define npyv_or_u8 vec_or
@@ -68,6 +72,10 @@
#define npyv_or_s64 vec_or
#define npyv_or_f32 vec_or
#define npyv_or_f64 vec_or
+#define npyv_or_b8 vec_or
+#define npyv_or_b16 vec_or
+#define npyv_or_b32 vec_or
+#define npyv_or_b64 vec_or
// XOR
#define npyv_xor_u8 vec_xor
@@ -80,6 +88,10 @@
#define npyv_xor_s64 vec_xor
#define npyv_xor_f32 vec_xor
#define npyv_xor_f64 vec_xor
+#define npyv_xor_b8 vec_xor
+#define npyv_xor_b16 vec_xor
+#define npyv_xor_b32 vec_xor
+#define npyv_xor_b64 vec_xor
// NOT
// note: we implement npyv_not_b* (boolean types) for internal use
@@ -213,4 +225,10 @@ NPY_FINLINE npyv_f64 npyv_not_f64(npyv_f64 a)
#define npyv_cmple_f32(A, B) npyv_cmpge_f32(B, A)
#define npyv_cmple_f64(A, B) npyv_cmpge_f64(B, A)
+// check special cases
+NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a)
+{ return vec_cmpeq(a, a); }
+NPY_FINLINE npyv_b64 npyv_notnan_f64(npyv_f64 a)
+{ return vec_cmpeq(a, a); }
+
#endif // _NPY_SIMD_VSX_OPERATORS_H
diff --git a/numpy/core/src/common/simd/vsx/vsx.h b/numpy/core/src/common/simd/vsx/vsx.h
index 27dde98e7..769f5a08f 100644
--- a/numpy/core/src/common/simd/vsx/vsx.h
+++ b/numpy/core/src/common/simd/vsx/vsx.h
@@ -5,6 +5,7 @@
#define NPY_SIMD 128
#define NPY_SIMD_WIDTH 16
#define NPY_SIMD_F64 1
+#define NPY_SIMD_FMA3 1 // native support
typedef __vector unsigned char npyv_u8;
typedef __vector signed char npyv_s8;
diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py
index 196003cdd..2f378667d 100644
--- a/numpy/core/tests/test_simd.py
+++ b/numpy/core/tests/test_simd.py
@@ -1,13 +1,16 @@
# NOTE: Please avoid the use of numpy.testing since NPYV intrinsics
# may be involved in their functionality.
-import pytest, math
+import pytest, math, re
from numpy.core._simd import targets
+from numpy.core._multiarray_umath import __cpu_baseline__
class _Test_Utility:
- # submodule of the desired SIMD extention, e.g. targets["AVX512F"]
+ # submodule of the desired SIMD extension, e.g. targets["AVX512F"]
npyv = None
# the current data type suffix e.g. 's8'
sfx = None
+ # target name can be 'baseline' or one or more CPU features
+ target_name = None
def __getattr__(self, attr):
"""
@@ -92,6 +95,14 @@ class _Test_Utility:
v = self.npyv.setall_u32(0x7fc00000)
return self.npyv.reinterpret_f32_u32(v)[0]
+ def _cpu_features(self):
+ target = self.target_name
+ if target == "baseline":
+ target = __cpu_baseline__
+ else:
+ target = target.split('__') # multi-target separator
+ return ' '.join(target)
+
class _SIMD_BOOL(_Test_Utility):
"""
To test all boolean vector types at once
@@ -110,6 +121,32 @@ class _SIMD_BOOL(_Test_Utility):
cvt = getattr(self.npyv, f"cvt_b{len_str}_u{len_str}")
return cvt(load(data))
+ def test_operators_logical(self):
+ """
+ Logical operations for boolean types.
+ Test intrinsics:
+ npyv_xor_##SFX, npyv_and_##SFX, npyv_or_##SFX, npyv_not_##SFX
+ """
+ data_a = self._data()
+ data_b = self._data(reverse=True)
+ vdata_a = self._load_b(data_a)
+ vdata_b = self._load_b(data_b)
+
+ data_and = [a & b for a, b in zip(data_a, data_b)]
+ vand = getattr(self, "and")(vdata_a, vdata_b)
+ assert vand == data_and
+
+ data_or = [a | b for a, b in zip(data_a, data_b)]
+ vor = getattr(self, "or")(vdata_a, vdata_b)
+ assert vor == data_or
+
+ data_xor = [a ^ b for a, b in zip(data_a, data_b)]
+ vxor = getattr(self, "xor")(vdata_a, vdata_b)
+ assert vxor == data_xor
+
+ vnot = getattr(self, "not")(vdata_a)
+ assert vnot == data_b
+
def test_tobits(self):
data2bits = lambda data: sum([int(x != 0) << i for i, x in enumerate(data, 0)])
for data in (self._data(), self._data(reverse=True)):
@@ -164,6 +201,46 @@ class _SIMD_INT(_Test_Utility):
subs = self.subs(vdata_a, vdata_b)
assert subs == data_subs
+class _SIMD_FP32(_Test_Utility):
+ """
+ To only test single precision
+ """
+ def test_conversions(self):
+ """
+ Round to the nearest even integer, assuming the CPU control register is set to round-to-nearest.
+ Test intrinsics:
+ npyv_round_s32_##SFX
+ """
+ features = self._cpu_features()
+ if not self.npyv.simd_f64 and re.match(r".*(NEON|ASIMD)", features):
+ # very costly to emulate nearest even on Armv7
+ # instead, halves are rounded away from zero, e.g. 0.5 -> 1, -0.5 -> -1
+ _round = lambda v: int(v + (0.5 if v >= 0 else -0.5))
+ else:
+ _round = round
+ vdata_a = self.load(self._data())
+ vdata_a = self.sub(vdata_a, self.setall(0.5))
+ data_round = [_round(x) for x in vdata_a]
+ vround = self.round_s32(vdata_a)
+ assert vround == data_round
+
+class _SIMD_FP64(_Test_Utility):
+ """
+ To only test double precision
+ """
+ def test_conversions(self):
+ """
+ Round to the nearest even integer, assuming the CPU control register is set to round-to-nearest.
+ Test intrinsics:
+ npyv_round_s32_##SFX
+ """
+ vdata_a = self.load(self._data())
+ vdata_a = self.sub(vdata_a, self.setall(0.5))
+ vdata_b = self.mul(vdata_a, self.setall(-1.5))
+ data_round = [round(x) for x in list(vdata_a) + list(vdata_b)]
+ vround = self.round_s32(vdata_a, vdata_b)
+ assert vround == data_round
+
class _SIMD_FP(_Test_Utility):
"""
To test all float vector types at once
@@ -247,6 +324,14 @@ class _SIMD_FP(_Test_Utility):
recip = self.recip(vdata)
assert recip == data_recip
+ def test_special_cases(self):
+ """
+ Compare not-NaN. Test intrinsics:
+ npyv_notnan_##SFX
+ """
+ nnan = self.notnan(self.setall(self._nan()))
+ assert nnan == [0]*self.nlanes
+
class _SIMD_ALL(_Test_Utility):
"""
To test all vector types at once
@@ -440,7 +525,7 @@ class _SIMD_ALL(_Test_Utility):
vsetf = self.setf(10, *data_a)
assert vsetf == data_a
- # We're testing the sainty of _simd's type-vector,
+ # We're testing the sanity of _simd's type-vector,
# reinterpret* intrinsics itself are tested via compiler
# during the build of _simd module
sfxes = ["u8", "s8", "u16", "s16", "u32", "s32", "u64", "s64", "f32"]
@@ -632,6 +717,29 @@ class _SIMD_ALL(_Test_Utility):
vsum = self.sum(vdata)
assert vsum == data_sum
+ def test_mask_conditional(self):
+ """
+ Conditional addition and subtraction for all supported data types.
+ Test intrinsics:
+ npyv_ifadd_##SFX, npyv_ifsub_##SFX
+ """
+ vdata_a = self.load(self._data())
+ vdata_b = self.load(self._data(reverse=True))
+ true_mask = self.cmpeq(self.zero(), self.zero())
+ false_mask = self.cmpneq(self.zero(), self.zero())
+
+ data_sub = self.sub(vdata_b, vdata_a)
+ ifsub = self.ifsub(true_mask, vdata_b, vdata_a, vdata_b)
+ assert ifsub == data_sub
+ ifsub = self.ifsub(false_mask, vdata_a, vdata_b, vdata_b)
+ assert ifsub == vdata_b
+
+ data_add = self.add(vdata_b, vdata_a)
+ ifadd = self.ifadd(true_mask, vdata_b, vdata_a, vdata_b)
+ assert ifadd == data_add
+ ifadd = self.ifadd(false_mask, vdata_a, vdata_b, vdata_b)
+ assert ifadd == vdata_b
+
bool_sfx = ("b8", "b16", "b32", "b64")
int_sfx = ("u8", "s8", "u16", "s16", "u32", "s32", "u64", "s64")
fp_sfx = ("f32", "f64")
@@ -640,6 +748,8 @@ tests_registry = {
bool_sfx: _SIMD_BOOL,
int_sfx : _SIMD_INT,
fp_sfx : _SIMD_FP,
+ ("f32",): _SIMD_FP32,
+ ("f64",): _SIMD_FP64,
all_sfx : _SIMD_ALL
}
for target_name, npyv in targets.items():
@@ -664,7 +774,7 @@ for target_name, npyv in targets.items():
for sfx in sfxes:
skip_m = skip_sfx.get(sfx, skip)
inhr = (cls,)
- attr = dict(npyv=targets[target_name], sfx=sfx)
+ attr = dict(npyv=targets[target_name], sfx=sfx, target_name=target_name)
tcls = type(f"Test{cls.__name__}_{simd_width}_{target_name}_{sfx}", inhr, attr)
if skip_m:
pytest.mark.skip(reason=skip_m)(tcls)