author    | Sayed Adel <seiko@imavr.com> | 2020-11-16 21:20:57 +0000
committer | Sayed Adel <seiko@imavr.com> | 2020-12-22 20:34:43 +0000
commit    | 150d459e0cf7ce8c92c971260e4aa88cbae43a2c (patch)
tree      | eda5c3aabdeea61c99b0d242897d3c3cab9a1e1f
parent    | cfb6a4d88d88307d507a86d1d70cd7d84c611406 (diff)
download  | numpy-150d459e0cf7ce8c92c971260e4aa88cbae43a2c.tar.gz
ENH, SIMD: Add new NPYV intrinsics pack(1)
- add bitwise logical operations for boolean vectors
- add round conversion for float vectors
- add NaN test for float vectors
- add conditional addition and subtraction
- add the NPY_SIMD_FMA3 definition to check for native support of fused multiply-add (FMA3)
- add testing cases for all of the above
21 files changed, 598 insertions, 5 deletions
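Before the diff itself, here is a minimal sketch (not part of the patch) of how the new intrinsics compose. Besides the notnan/ifadd intrinsics added here, it assumes only pre-existing NPYV intrinsics (npyv_load_f32, npyv_store_f32, npyv_setall_f32, npyv_nlanes_f32); the helper name add_ignore_nan is hypothetical:

#include "simd/simd.h"

#if NPY_SIMD
// out[i] = a[i] + b[i] where a[i] is not NaN, otherwise `fill`
// (processes only full vectors; the scalar tail loop is omitted)
static void add_ignore_nan(float *out, const float *a, const float *b,
                           float fill, int len)
{
    const int vstep = npyv_nlanes_f32;
    const npyv_f32 vfill = npyv_setall_f32(fill);
    for (int i = 0; i + vstep <= len; i += vstep) {
        npyv_f32 va   = npyv_load_f32(a + i);
        npyv_f32 vb   = npyv_load_f32(b + i);
        npyv_b32 nnan = npyv_notnan_f32(va);  // lanes where va is ordered
        // new mask operation: nnan ? va + vb : vfill
        npyv_store_f32(out + i, npyv_ifadd_f32(nnan, va, vb, vfill));
    }
}
#endif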
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src
index e3dbcdece..eaff81338 100644
--- a/numpy/core/src/_simd/_simd.dispatch.c.src
+++ b/numpy/core/src/_simd/_simd.dispatch.c.src
@@ -372,16 +372,58 @@ SIMD_IMPL_INTRIN_1(@intrin@_@sfx@, v@sfx@, v@sfx@)
 /**end repeat1**/
 #endif
+/***************************
+ * Mask operations
+ ***************************/
+/**begin repeat1
+ * #intrin = ifadd, ifsub#
+ */
+SIMD_IMPL_INTRIN_4(@intrin@_@sfx@, v@sfx@, v@bsfx@, v@sfx@, v@sfx@, v@sfx@)
+/**end repeat1**/
+
 #endif // simd_sup
 /**end repeat**/
 /*************************************************************************
  * Variant
  ************************************************************************/
 SIMD_IMPL_INTRIN_0N(cleanup)
+
+/*************************************************************************
+ * A special section for f32/f64 intrinsics outside the main repeater
+ ************************************************************************/
+/***************************
+ * Operators
+ ***************************/
+// check special cases
+SIMD_IMPL_INTRIN_1(notnan_f32, vb32, vf32)
+#if NPY_SIMD_F64
+    SIMD_IMPL_INTRIN_1(notnan_f64, vb64, vf64)
+#endif
+/***************************
+ * Conversions
+ ***************************/
+// round to nearest integer (assume even)
+SIMD_IMPL_INTRIN_1(round_s32_f32, vs32, vf32)
+#if NPY_SIMD_F64
+    SIMD_IMPL_INTRIN_2(round_s32_f64, vs32, vf64, vf64)
+#endif
+
 /*************************************************************************
  * A special section for boolean intrinsics outside the main repeater
  ************************************************************************/
 /***************************
+ * Operators
+ ***************************/
+// Logical
+/**begin repeat
+ * #bsfx = b8, b16, b32, b64#
+ */
+SIMD_IMPL_INTRIN_2(and_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@)
+SIMD_IMPL_INTRIN_2(or_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@)
+SIMD_IMPL_INTRIN_2(xor_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@)
+SIMD_IMPL_INTRIN_1(not_@bsfx@, v@bsfx@, v@bsfx@)
+/**end repeat**/
+/***************************
  * Conversions
  ***************************/
 // Convert mask vector to integer bitfield
@@ -534,6 +576,16 @@ SIMD_INTRIN_DEF(sum_@sfx@)
 SIMD_INTRIN_DEF(@intrin@_@sfx@)
 /**end repeat1**/
 #endif
+
+/***************************
+ * Mask operations
+ ***************************/
+/**begin repeat1
+ * #intrin = ifadd, ifsub#
+ */
+SIMD_INTRIN_DEF(@intrin@_@sfx@)
+/**end repeat1**/
+
 #endif // simd_sup
 /**end repeat**/
 /*************************************************************************
@@ -542,9 +594,41 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@)
 SIMD_INTRIN_DEF(cleanup)
 /*************************************************************************
+ * A special section for f32/f64 intrinsics outside the main repeater
+ ************************************************************************/
+/***************************
+ * Operators
+ ***************************/
+// check special cases
+SIMD_INTRIN_DEF(notnan_f32)
+#if NPY_SIMD_F64
+    SIMD_INTRIN_DEF(notnan_f64)
+#endif
+/***************************
+ * Conversions
+ ***************************/
+// round to nearest integer (assume even)
+SIMD_INTRIN_DEF(round_s32_f32)
+#if NPY_SIMD_F64
+    SIMD_INTRIN_DEF(round_s32_f64)
+#endif
+
+/*************************************************************************
  * A special section for boolean intrinsics outside the main repeater
  ************************************************************************/
 /***************************
+ * Operators
+ ***************************/
+// Logical
+/**begin repeat
+ * #bsfx = b8, b16, b32, b64#
+ */
+SIMD_INTRIN_DEF(and_@bsfx@)
+SIMD_INTRIN_DEF(or_@bsfx@)
+SIMD_INTRIN_DEF(xor_@bsfx@)
+SIMD_INTRIN_DEF(not_@bsfx@)
+/**end repeat**/
+/***************************
  * Conversions
  ***************************/
 // Convert mask vector to integer bitfield
@@ -590,6 +674,9 @@ NPY_CPU_DISPATCH_CURFX(simd_create_module)(void)
     if (PyModule_AddIntConstant(m, "simd_f64", NPY_SIMD_F64)) {
         goto err;
     }
+    if (PyModule_AddIntConstant(m, "simd_fma3", NPY_SIMD_FMA3)) {
+        goto err;
+    }
     if (PyModule_AddIntConstant(m, "simd_width", NPY_SIMD_WIDTH)) {
         goto err;
     }
diff --git a/numpy/core/src/_simd/_simd_easyintrin.inc b/numpy/core/src/_simd/_simd_easyintrin.inc
index 54e7ccf01..f83d7a286 100644
--- a/numpy/core/src/_simd/_simd_easyintrin.inc
+++ b/numpy/core/src/_simd/_simd_easyintrin.inc
@@ -123,6 +123,36 @@
     }; \
     return simd_arg_to_obj(&ret); \
 }
+
+#define SIMD_IMPL_INTRIN_4(NAME, RET, IN0, IN1, IN2, IN3) \
+    static PyObject *simd__intrin_##NAME \
+    (PyObject* NPY_UNUSED(self), PyObject *args) \
+    { \
+        simd_arg arg1 = {.dtype = simd_data_##IN0}; \
+        simd_arg arg2 = {.dtype = simd_data_##IN1}; \
+        simd_arg arg3 = {.dtype = simd_data_##IN2}; \
+        simd_arg arg4 = {.dtype = simd_data_##IN3}; \
+        if (!PyArg_ParseTuple( \
+            args, "O&O&O&O&:"NPY_TOSTRING(NAME), \
+            simd_arg_converter, &arg1, \
+            simd_arg_converter, &arg2, \
+            simd_arg_converter, &arg3, \
+            simd_arg_converter, &arg4 \
+        )) return NULL; \
+        simd_data data = {.RET = npyv_##NAME( \
+            arg1.data.IN0, arg2.data.IN1, \
+            arg3.data.IN2, arg4.data.IN3 \
+        )}; \
+        simd_arg_free(&arg1); \
+        simd_arg_free(&arg2); \
+        simd_arg_free(&arg3); \
+        simd_arg_free(&arg4); \
+        simd_arg ret = { \
+            .data = data, .dtype = simd_data_##RET \
+        }; \
+        return simd_arg_to_obj(&ret); \
+    }
+
 /**
  * Helper macros for repeating and expand a certain macro.
  * Mainly used for converting a scalar to an immediate constant.
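To make the new four-argument wrapper concrete, here is roughly what SIMD_IMPL_INTRIN_4(ifadd_f32, vf32, vb32, vf32, vf32, vf32) expands to after preprocessing (hand-expanded for illustration, whitespace simplified; not part of the patch):

static PyObject *simd__intrin_ifadd_f32
(PyObject* NPY_UNUSED(self), PyObject *args)
{
    simd_arg arg1 = {.dtype = simd_data_vb32}; // the mask
    simd_arg arg2 = {.dtype = simd_data_vf32};
    simd_arg arg3 = {.dtype = simd_data_vf32};
    simd_arg arg4 = {.dtype = simd_data_vf32};
    if (!PyArg_ParseTuple(
        args, "O&O&O&O&:ifadd_f32",
        simd_arg_converter, &arg1,
        simd_arg_converter, &arg2,
        simd_arg_converter, &arg3,
        simd_arg_converter, &arg4
    )) return NULL;
    simd_data data = {.vf32 = npyv_ifadd_f32(
        arg1.data.vb32, arg2.data.vf32,
        arg3.data.vf32, arg4.data.vf32
    )};
    simd_arg_free(&arg1);
    simd_arg_free(&arg2);
    simd_arg_free(&arg3);
    simd_arg_free(&arg4);
    simd_arg ret = {
        .data = data, .dtype = simd_data_vf32
    };
    return simd_arg_to_obj(&ret);
}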
diff --git a/numpy/core/src/common/simd/avx2/avx2.h b/numpy/core/src/common/simd/avx2/avx2.h
index 6f0d3c0d9..bcd90b110 100644
--- a/numpy/core/src/common/simd/avx2/avx2.h
+++ b/numpy/core/src/common/simd/avx2/avx2.h
@@ -5,6 +5,11 @@
 #define NPY_SIMD 256
 #define NPY_SIMD_WIDTH 32
 #define NPY_SIMD_F64 1
+#ifdef NPY_HAVE_FMA3
+    #define NPY_SIMD_FMA3 1 // native support
+#else
+    #define NPY_SIMD_FMA3 0 // fast emulated
+#endif
 // Enough limit to allow us to use _mm256_i32gather_*
 #define NPY_SIMD_MAXLOAD_STRIDE32 (0x7fffffff / 8)
diff --git a/numpy/core/src/common/simd/avx2/conversion.h b/numpy/core/src/common/simd/avx2/conversion.h
index f72678b54..dc6b18766 100644
--- a/numpy/core/src/common/simd/avx2/conversion.h
+++ b/numpy/core/src/common/simd/avx2/conversion.h
@@ -43,4 +43,12 @@ NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
 NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
 { return (npy_uint8)_mm256_movemask_pd(_mm256_castsi256_pd(a)); }
 
+// round to nearest integer (assuming even)
+#define npyv_round_s32_f32 _mm256_cvtps_epi32
+NPY_FINLINE npyv_s32 npyv_round_s32_f64(npyv_f64 a, npyv_f64 b)
+{
+    __m128i lo = _mm256_cvtpd_epi32(a), hi = _mm256_cvtpd_epi32(b);
+    return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
+}
+
 #endif // _NPY_SIMD_AVX2_CVT_H
diff --git a/numpy/core/src/common/simd/avx2/operators.h b/numpy/core/src/common/simd/avx2/operators.h
index c1d30413f..5fc7719e9 100644
--- a/numpy/core/src/common/simd/avx2/operators.h
+++ b/numpy/core/src/common/simd/avx2/operators.h
@@ -61,6 +61,10 @@ NPY_FINLINE __m256i npyv_shr_s64(__m256i a, int c)
 #define npyv_and_s64 _mm256_and_si256
 #define npyv_and_f32 _mm256_and_ps
 #define npyv_and_f64 _mm256_and_pd
+#define npyv_and_b8  _mm256_and_si256
+#define npyv_and_b16 _mm256_and_si256
+#define npyv_and_b32 _mm256_and_si256
+#define npyv_and_b64 _mm256_and_si256
 
 // OR
 #define npyv_or_u8  _mm256_or_si256
@@ -73,6 +77,10 @@ NPY_FINLINE __m256i npyv_shr_s64(__m256i a, int c)
 #define npyv_or_s64 _mm256_or_si256
 #define npyv_or_f32 _mm256_or_ps
 #define npyv_or_f64 _mm256_or_pd
+#define npyv_or_b8  _mm256_or_si256
+#define npyv_or_b16 _mm256_or_si256
+#define npyv_or_b32 _mm256_or_si256
+#define npyv_or_b64 _mm256_or_si256
 
 // XOR
 #define npyv_xor_u8  _mm256_xor_si256
@@ -85,6 +93,10 @@ NPY_FINLINE __m256i npyv_shr_s64(__m256i a, int c)
 #define npyv_xor_s64 _mm256_xor_si256
 #define npyv_xor_f32 _mm256_xor_ps
 #define npyv_xor_f64 _mm256_xor_pd
+#define npyv_xor_b8  _mm256_xor_si256
+#define npyv_xor_b16 _mm256_xor_si256
+#define npyv_xor_b32 _mm256_xor_si256
+#define npyv_xor_b64 _mm256_xor_si256
 
 // NOT
 #define npyv_not_u8(A) _mm256_xor_si256(A, _mm256_set1_epi32(-1))
@@ -97,6 +109,10 @@ NPY_FINLINE __m256i npyv_shr_s64(__m256i a, int c)
 #define npyv_not_s64 npyv_not_u8
 #define npyv_not_f32(A) _mm256_xor_ps(A, _mm256_castsi256_ps(_mm256_set1_epi32(-1)))
 #define npyv_not_f64(A) _mm256_xor_pd(A, _mm256_castsi256_pd(_mm256_set1_epi32(-1)))
+#define npyv_not_b8  npyv_not_u8
+#define npyv_not_b16 npyv_not_u8
+#define npyv_not_b32 npyv_not_u8
+#define npyv_not_b64 npyv_not_u8
 
 /***************************
  * Comparison
@@ -197,4 +213,10 @@ NPY_FINLINE __m256i npyv_cmpge_u32(__m256i a, __m256i b)
 #define npyv_cmpge_f32(A, B) _mm256_castps_si256(_mm256_cmp_ps(A, B, _CMP_GE_OQ))
 #define npyv_cmpge_f64(A, B) _mm256_castpd_si256(_mm256_cmp_pd(A, B, _CMP_GE_OQ))
 
+// check special cases
+NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a)
+{ return _mm256_castps_si256(_mm256_cmp_ps(a, a, _CMP_ORD_Q)); }
+NPY_FINLINE npyv_b64 npyv_notnan_f64(npyv_f64 a)
+{ return _mm256_castpd_si256(_mm256_cmp_pd(a, a, _CMP_ORD_Q)); }
+
 #endif // _NPY_SIMD_AVX2_OPERATORS_H
diff --git a/numpy/core/src/common/simd/avx512/avx512.h b/numpy/core/src/common/simd/avx512/avx512.h
index 2de33765a..f38686834 100644
--- a/numpy/core/src/common/simd/avx512/avx512.h
+++ b/numpy/core/src/common/simd/avx512/avx512.h
@@ -4,6 +4,7 @@
 #define NPY_SIMD 512
 #define NPY_SIMD_WIDTH 64
 #define NPY_SIMD_F64 1
+#define NPY_SIMD_FMA3 1 // native support
 // Enough limit to allow us to use _mm512_i32gather_* and _mm512_i32scatter_*
 #define NPY_SIMD_MAXLOAD_STRIDE32  (0x7fffffff / 16)
 #define NPY_SIMD_MAXSTORE_STRIDE32 (0x7fffffff / 16)
@@ -73,3 +74,4 @@ typedef struct { __m512d val[3]; } npyv_f64x3;
 #include "conversion.h"
 #include "arithmetic.h"
 #include "math.h"
+#include "maskop.h"
diff --git a/numpy/core/src/common/simd/avx512/conversion.h b/numpy/core/src/common/simd/avx512/conversion.h
index 6ad299dd5..1d71d7b49 100644
--- a/numpy/core/src/common/simd/avx512/conversion.h
+++ b/numpy/core/src/common/simd/avx512/conversion.h
@@ -82,4 +82,12 @@ NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
 NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
 { return (npy_uint8)a; }
 
+// round to nearest integer (assuming even)
+#define npyv_round_s32_f32 _mm512_cvtps_epi32
+NPY_FINLINE npyv_s32 npyv_round_s32_f64(npyv_f64 a, npyv_f64 b)
+{
+    __m256i lo = _mm512_cvtpd_epi32(a), hi = _mm512_cvtpd_epi32(b);
+    return npyv512_combine_si256(lo, hi);
+}
+
 #endif // _NPY_SIMD_AVX512_CVT_H
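As a scalar model (an illustration, not part of the patch): npyv_round_s32_f64 packs the rounded halves of two f64 vectors into one s32 vector, so each input contributes half of the output lanes. Assuming the default round-to-nearest FP environment:

#include <stdint.h>
#include <math.h>

// model of npyv_round_s32_f64(a, b) with `half` lanes per f64 vector
// (2 for SSE, 4 for AVX2, 8 for AVX512); nearbyint() honors the current
// rounding mode, which defaults to round-to-nearest, ties-to-even
static void round_s32_f64_model(int32_t *out, const double *a,
                                const double *b, int half)
{
    for (int i = 0; i < half; ++i) {
        out[i]        = (int32_t)nearbyint(a[i]); // low lanes  <- a
        out[half + i] = (int32_t)nearbyint(b[i]); // high lanes <- b
    }
}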
diff --git a/numpy/core/src/common/simd/avx512/maskop.h b/numpy/core/src/common/simd/avx512/maskop.h
new file mode 100644
index 000000000..d1c188390
--- /dev/null
+++ b/numpy/core/src/common/simd/avx512/maskop.h
@@ -0,0 +1,54 @@
+#ifndef NPY_SIMD
+    #error "Not a standalone header, use simd/simd.h instead"
+#endif
+
+#ifndef _NPY_SIMD_AVX512_MASKOP_H
+#define _NPY_SIMD_AVX512_MASKOP_H
+
+/**
+ * Implements conditional addition and subtraction.
+ * e.g. npyv_ifadd_f32(m, a, b, c) -> m ? a + b : c
+ * e.g. npyv_ifsub_f32(m, a, b, c) -> m ? a - b : c
+ */
+#define NPYV_IMPL_AVX512_EMULATE_MASK_ADDSUB(SFX, BSFX) \
+    NPY_FINLINE npyv_##SFX npyv_ifadd_##SFX \
+    (npyv_##BSFX m, npyv_##SFX a, npyv_##SFX b, npyv_##SFX c) \
+    { \
+        npyv_##SFX add = npyv_add_##SFX(a, b); \
+        return npyv_select_##SFX(m, add, c); \
+    } \
+    NPY_FINLINE npyv_##SFX npyv_ifsub_##SFX \
+    (npyv_##BSFX m, npyv_##SFX a, npyv_##SFX b, npyv_##SFX c) \
+    { \
+        npyv_##SFX sub = npyv_sub_##SFX(a, b); \
+        return npyv_select_##SFX(m, sub, c); \
+    }
+
+#define NPYV_IMPL_AVX512_MASK_ADDSUB(SFX, BSFX, ZSFX) \
+    NPY_FINLINE npyv_##SFX npyv_ifadd_##SFX \
+    (npyv_##BSFX m, npyv_##SFX a, npyv_##SFX b, npyv_##SFX c) \
+    { return _mm512_mask_add_##ZSFX(c, m, a, b); } \
+    NPY_FINLINE npyv_##SFX npyv_ifsub_##SFX \
+    (npyv_##BSFX m, npyv_##SFX a, npyv_##SFX b, npyv_##SFX c) \
+    { return _mm512_mask_sub_##ZSFX(c, m, a, b); }
+
+#ifdef NPY_HAVE_AVX512BW
+    NPYV_IMPL_AVX512_MASK_ADDSUB(u8,  b8,  epi8)
+    NPYV_IMPL_AVX512_MASK_ADDSUB(s8,  b8,  epi8)
+    NPYV_IMPL_AVX512_MASK_ADDSUB(u16, b16, epi16)
+    NPYV_IMPL_AVX512_MASK_ADDSUB(s16, b16, epi16)
+#else
+    NPYV_IMPL_AVX512_EMULATE_MASK_ADDSUB(u8,  b8)
+    NPYV_IMPL_AVX512_EMULATE_MASK_ADDSUB(s8,  b8)
+    NPYV_IMPL_AVX512_EMULATE_MASK_ADDSUB(u16, b16)
+    NPYV_IMPL_AVX512_EMULATE_MASK_ADDSUB(s16, b16)
+#endif
+
+NPYV_IMPL_AVX512_MASK_ADDSUB(u32, b32, epi32)
+NPYV_IMPL_AVX512_MASK_ADDSUB(s32, b32, epi32)
+NPYV_IMPL_AVX512_MASK_ADDSUB(u64, b64, epi64)
+NPYV_IMPL_AVX512_MASK_ADDSUB(s64, b64, epi64)
+NPYV_IMPL_AVX512_MASK_ADDSUB(f32, b32, ps)
+NPYV_IMPL_AVX512_MASK_ADDSUB(f64, b64, pd)
+
+#endif // _NPY_SIMD_AVX512_MASKOP_H
diff --git a/numpy/core/src/common/simd/avx512/operators.h b/numpy/core/src/common/simd/avx512/operators.h
index f76ea5e2d..5f1771770 100644
--- a/numpy/core/src/common/simd/avx512/operators.h
+++ b/numpy/core/src/common/simd/avx512/operators.h
@@ -90,6 +90,20 @@
     NPYV_IMPL_AVX512_FROM_SI512_PS_2ARG(npyv_and_f32, _mm512_and_si512)
     NPYV_IMPL_AVX512_FROM_SI512_PD_2ARG(npyv_and_f64, _mm512_and_si512)
 #endif
+#ifdef NPY_HAVE_AVX512BW_MASK
+    #define npyv_and_b8  _kand_mask64
+    #define npyv_and_b16 _kand_mask32
+#elif defined(NPY_HAVE_AVX512BW)
+    NPY_FINLINE npyv_b8 npyv_and_b8(npyv_b8 a, npyv_b8 b)
+    { return a & b; }
+    NPY_FINLINE npyv_b16 npyv_and_b16(npyv_b16 a, npyv_b16 b)
+    { return a & b; }
+#else
+    #define npyv_and_b8  _mm512_and_si512
+    #define npyv_and_b16 _mm512_and_si512
+#endif
+#define npyv_and_b32 _mm512_kand
+#define npyv_and_b64 _mm512_kand
 
 // OR
 #define npyv_or_u8 _mm512_or_si512
@@ -107,6 +121,20 @@
     NPYV_IMPL_AVX512_FROM_SI512_PS_2ARG(npyv_or_f32, _mm512_or_si512)
     NPYV_IMPL_AVX512_FROM_SI512_PD_2ARG(npyv_or_f64, _mm512_or_si512)
 #endif
+#ifdef NPY_HAVE_AVX512BW_MASK
+    #define npyv_or_b8  _kor_mask64
+    #define npyv_or_b16 _kor_mask32
+#elif defined(NPY_HAVE_AVX512BW)
+    NPY_FINLINE npyv_b8 npyv_or_b8(npyv_b8 a, npyv_b8 b)
+    { return a | b; }
+    NPY_FINLINE npyv_b16 npyv_or_b16(npyv_b16 a, npyv_b16 b)
+    { return a | b; }
+#else
+    #define npyv_or_b8  _mm512_or_si512
+    #define npyv_or_b16 _mm512_or_si512
+#endif
+#define npyv_or_b32 _mm512_kor
+#define npyv_or_b64 _mm512_kor
 
 // XOR
 #define npyv_xor_u8 _mm512_xor_si512
@@ -124,6 +152,20 @@
     NPYV_IMPL_AVX512_FROM_SI512_PS_2ARG(npyv_xor_f32, _mm512_xor_si512)
     NPYV_IMPL_AVX512_FROM_SI512_PD_2ARG(npyv_xor_f64, _mm512_xor_si512)
 #endif
+#ifdef NPY_HAVE_AVX512BW_MASK
+    #define npyv_xor_b8  _kxor_mask64
+    #define npyv_xor_b16 _kxor_mask32
+#elif defined(NPY_HAVE_AVX512BW)
+    NPY_FINLINE npyv_b8 npyv_xor_b8(npyv_b8 a, npyv_b8 b)
+    { return a ^ b; }
+    NPY_FINLINE npyv_b16 npyv_xor_b16(npyv_b16 a, npyv_b16 b)
+    { return a ^ b; }
+#else
+    #define npyv_xor_b8  _mm512_xor_si512
+    #define npyv_xor_b16 _mm512_xor_si512
+#endif
+#define npyv_xor_b32 _mm512_kxor
+#define npyv_xor_b64 _mm512_kxor
 
 // NOT
 #define npyv_not_u8(A) _mm512_xor_si512(A, _mm512_set1_epi32(-1))
@@ -141,6 +183,21 @@
     #define npyv_not_f32(A) _mm512_castsi512_ps(npyv_not_u32(_mm512_castps_si512(A)))
     #define npyv_not_f64(A) _mm512_castsi512_pd(npyv_not_u64(_mm512_castpd_si512(A)))
 #endif
+#ifdef NPY_HAVE_AVX512BW_MASK
+    #define npyv_not_b8  _knot_mask64
+    #define npyv_not_b16 _knot_mask32
+#elif defined(NPY_HAVE_AVX512BW)
+    NPY_FINLINE npyv_b8 npyv_not_b8(npyv_b8 a)
+    { return ~a; }
+    NPY_FINLINE npyv_b16 npyv_not_b16(npyv_b16 a)
+    { return ~a; }
+#else
+    #define npyv_not_b8  npyv_not_u8
+    #define npyv_not_b16 npyv_not_u8
+#endif
+#define npyv_not_b32 _mm512_knot
+#define npyv_not_b64 _mm512_knot
+
 /***************************
  * Comparison
@@ -256,4 +313,10 @@
 #define npyv_cmpge_f32(A, B) _mm512_cmp_ps_mask(A, B, _CMP_GE_OQ)
 #define npyv_cmpge_f64(A, B) _mm512_cmp_pd_mask(A, B, _CMP_GE_OQ)
 
+// check special cases
+NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a)
+{ return _mm512_cmp_ps_mask(a, a, _CMP_ORD_Q); }
+NPY_FINLINE npyv_b64 npyv_notnan_f64(npyv_f64 a)
+{ return _mm512_cmp_pd_mask(a, a, _CMP_ORD_Q); }
+
 #endif // _NPY_SIMD_AVX512_OPERATORS_H
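The mask operations have a simple per-lane model (a sketch, not from the patch): on AVX512 they map directly to the masked add/sub instructions above, while every other extension emulates them with add/sub plus select, as the emulate_maskop.h header that follows shows.

// per-lane semantics of the new conditional arithmetic:
//   npyv_ifadd_##SFX(m, a, b, c) -> m ? a + b : c
//   npyv_ifsub_##SFX(m, a, b, c) -> m ? a - b : c
static inline float ifadd_f32_lane(int m, float a, float b, float c)
{ return m ? a + b : c; }
static inline float ifsub_f32_lane(int m, float a, float b, float c)
{ return m ? a - b : c; }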
diff --git a/numpy/core/src/common/simd/emulate_maskop.h b/numpy/core/src/common/simd/emulate_maskop.h
new file mode 100644
index 000000000..7e7446bc5
--- /dev/null
+++ b/numpy/core/src/common/simd/emulate_maskop.h
@@ -0,0 +1,44 @@
+/**
+ * This header is used internally by all currently supported SIMD extensions,
+ * except for AVX512.
+ */
+#ifndef NPY_SIMD
+    #error "Not a standalone header, use simd/simd.h instead"
+#endif
+
+#ifndef _NPY_SIMD_EMULATE_MASKOP_H
+#define _NPY_SIMD_EMULATE_MASKOP_H
+
+/**
+ * Implements conditional addition and subtraction.
+ * e.g. npyv_ifadd_f32(mask, a, b, c) -> mask ? a + b : c
+ * e.g. npyv_ifsub_f32(mask, a, b, c) -> mask ? a - b : c
+ */
+#define NPYV_IMPL_EMULATE_MASK_ADDSUB(SFX, BSFX) \
+    NPY_FINLINE npyv_##SFX npyv_ifadd_##SFX \
+    (npyv_##BSFX m, npyv_##SFX a, npyv_##SFX b, npyv_##SFX c) \
+    { \
+        npyv_##SFX add = npyv_add_##SFX(a, b); \
+        return npyv_select_##SFX(m, add, c); \
+    } \
+    NPY_FINLINE npyv_##SFX npyv_ifsub_##SFX \
+    (npyv_##BSFX m, npyv_##SFX a, npyv_##SFX b, npyv_##SFX c) \
+    { \
+        npyv_##SFX sub = npyv_sub_##SFX(a, b); \
+        return npyv_select_##SFX(m, sub, c); \
+    }
+
+NPYV_IMPL_EMULATE_MASK_ADDSUB(u8,  b8)
+NPYV_IMPL_EMULATE_MASK_ADDSUB(s8,  b8)
+NPYV_IMPL_EMULATE_MASK_ADDSUB(u16, b16)
+NPYV_IMPL_EMULATE_MASK_ADDSUB(s16, b16)
+NPYV_IMPL_EMULATE_MASK_ADDSUB(u32, b32)
+NPYV_IMPL_EMULATE_MASK_ADDSUB(s32, b32)
+NPYV_IMPL_EMULATE_MASK_ADDSUB(u64, b64)
+NPYV_IMPL_EMULATE_MASK_ADDSUB(s64, b64)
+NPYV_IMPL_EMULATE_MASK_ADDSUB(f32, b32)
+#if NPY_SIMD_F64
+    NPYV_IMPL_EMULATE_MASK_ADDSUB(f64, b64)
+#endif
+
+#endif // _NPY_SIMD_EMULATE_MASKOP_H
diff --git a/numpy/core/src/common/simd/neon/conversion.h b/numpy/core/src/common/simd/neon/conversion.h
index f9840b1cb..f92910b66 100644
--- a/numpy/core/src/common/simd/neon/conversion.h
+++ b/numpy/core/src/common/simd/neon/conversion.h
@@ -71,4 +71,24 @@ NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
     return vgetq_lane_u64(bit, 0) | ((int)vgetq_lane_u64(bit, 1) << 1);
 }
 
+// round to nearest integer
+#if NPY_SIMD_F64
+    #define npyv_round_s32_f32 vcvtnq_s32_f32
+    NPY_FINLINE npyv_s32 npyv_round_s32_f64(npyv_f64 a, npyv_f64 b)
+    {
+        npyv_s64 lo = vcvtnq_s64_f64(a), hi = vcvtnq_s64_f64(b);
+        return vcombine_s32(vmovn_s64(lo), vmovn_s64(hi));
+    }
+#else
+    NPY_FINLINE npyv_s32 npyv_round_s32_f32(npyv_f32 a)
+    {
+        // halves are rounded away from zero; it's very costly to obey
+        // the IEEE standard on Armv7, so tests should allow a +-1 difference
+        const npyv_u32 sign = vdupq_n_u32(0x80000000);
+        const npyv_f32 half = vdupq_n_f32(0.5f);
+        npyv_f32 sign_half = vbslq_f32(sign, a, half);
+        return vcvtq_s32_f32(vaddq_f32(a, sign_half));
+    }
+#endif
+
 #endif // _NPY_SIMD_NEON_CVT_H
diff --git a/numpy/core/src/common/simd/neon/neon.h b/numpy/core/src/common/simd/neon/neon.h
index c8ddc92ad..e6f6a7324 100644
--- a/numpy/core/src/common/simd/neon/neon.h
+++ b/numpy/core/src/common/simd/neon/neon.h
@@ -10,6 +10,11 @@
 #else
     #define NPY_SIMD_F64 0
 #endif
+#ifdef NPY_HAVE_NEON_VFPV4
+    #define NPY_SIMD_FMA3 1 // native support
+#else
+    #define NPY_SIMD_FMA3 0 // HW emulated
+#endif
 
 typedef uint8x16_t npyv_u8;
 typedef int8x16_t  npyv_s8;
diff --git a/numpy/core/src/common/simd/neon/operators.h b/numpy/core/src/common/simd/neon/operators.h
index c1ad4ba12..280c5e0da 100644
--- a/numpy/core/src/common/simd/neon/operators.h
+++ b/numpy/core/src/common/simd/neon/operators.h
@@ -58,6 +58,10 @@
     vreinterpretq_f32_u8(vandq_u8(vreinterpretq_u8_f32(A), vreinterpretq_u8_f32(B)))
 #define npyv_and_f64(A, B) \
     vreinterpretq_f64_u8(vandq_u8(vreinterpretq_u8_f64(A), vreinterpretq_u8_f64(B)))
+#define npyv_and_b8  vandq_u8
+#define npyv_and_b16 vandq_u16
+#define npyv_and_b32 vandq_u32
+#define npyv_and_b64 vandq_u64
 
 // OR
 #define npyv_or_u8  vorrq_u8
@@ -72,6 +76,11 @@
     vreinterpretq_f32_u8(vorrq_u8(vreinterpretq_u8_f32(A), vreinterpretq_u8_f32(B)))
 #define npyv_or_f64(A, B) \
     vreinterpretq_f64_u8(vorrq_u8(vreinterpretq_u8_f64(A), vreinterpretq_u8_f64(B)))
+#define npyv_or_b8  vorrq_u8
+#define npyv_or_b16 vorrq_u16
+#define npyv_or_b32 vorrq_u32
+#define npyv_or_b64 vorrq_u64
+
 // XOR
 #define npyv_xor_u8  veorq_u8
@@ -86,6 +95,10 @@
     vreinterpretq_f32_u8(veorq_u8(vreinterpretq_u8_f32(A), vreinterpretq_u8_f32(B)))
 #define npyv_xor_f64(A, B) \
     vreinterpretq_f64_u8(veorq_u8(vreinterpretq_u8_f64(A), vreinterpretq_u8_f64(B)))
+#define npyv_xor_b8  veorq_u8
+#define npyv_xor_b16 veorq_u16
+#define npyv_xor_b32 veorq_u32
+#define npyv_xor_b64 veorq_u64
 
 // NOT
 #define npyv_not_u8 vmvnq_u8
@@ -98,6 +111,10 @@
 #define npyv_not_s64(A) vreinterpretq_s64_u8(vmvnq_u8(vreinterpretq_u8_s64(A)))
 #define npyv_not_f32(A) vreinterpretq_f32_u8(vmvnq_u8(vreinterpretq_u8_f32(A)))
 #define npyv_not_f64(A) vreinterpretq_f64_u8(vmvnq_u8(vreinterpretq_u8_f64(A)))
+#define npyv_not_b8  vmvnq_u8
+#define npyv_not_b16 vmvnq_u16
+#define npyv_not_b32 vmvnq_u32
+#define npyv_not_b64 npyv_not_u64
 
 /***************************
  * Comparison
@@ -215,4 +232,12 @@
 #define npyv_cmple_f32(A, B) npyv_cmpge_f32(B, A)
 #define npyv_cmple_f64(A, B) npyv_cmpge_f64(B, A)
 
+// check special cases
+NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a)
+{ return vceqq_f32(a, a); }
+#if NPY_SIMD_F64
+    NPY_FINLINE npyv_b64 npyv_notnan_f64(npyv_f64 a)
+    { return vceqq_f64(a, a); }
+#endif
+
 #endif // _NPY_SIMD_NEON_OPERATORS_H
diff --git a/numpy/core/src/common/simd/simd.h b/numpy/core/src/common/simd/simd.h
index 8804223c9..d6c14228d 100644
--- a/numpy/core/src/common/simd/simd.h
+++ b/numpy/core/src/common/simd/simd.h
@@ -48,7 +48,14 @@ typedef double npyv_lanetype_f64;
     #define NPY_SIMD 0
     #define NPY_SIMD_WIDTH 0
     #define NPY_SIMD_F64 0
+    #define NPY_SIMD_FMA3 0
 #endif
+
+// enable emulated mask operations for all SIMD extensions except for AVX512
+#if !defined(NPY_HAVE_AVX512F) && NPY_SIMD && NPY_SIMD < 512
+    #include "emulate_maskop.h"
+#endif
+
 /**
  * Some SIMD extensions currently(AVX2, AVX512F) require (de facto)
  * a maximum number of strides sizes when dealing with non-contiguous memory access.
diff --git a/numpy/core/src/common/simd/sse/conversion.h b/numpy/core/src/common/simd/sse/conversion.h
index ab4beea96..d690ec313 100644
--- a/numpy/core/src/common/simd/sse/conversion.h
+++ b/numpy/core/src/common/simd/sse/conversion.h
@@ -42,4 +42,12 @@ NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
 NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
 { return (npy_uint8)_mm_movemask_pd(_mm_castsi128_pd(a)); }
 
+// round to nearest integer (assuming even)
+#define npyv_round_s32_f32 _mm_cvtps_epi32
+NPY_FINLINE npyv_s32 npyv_round_s32_f64(npyv_f64 a, npyv_f64 b)
+{
+    __m128i lo = _mm_cvtpd_epi32(a), hi = _mm_cvtpd_epi32(b);
+    return _mm_unpacklo_epi64(lo, hi);
+}
+
 #endif // _NPY_SIMD_SSE_CVT_H
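The Armv7 fallback above (no vcvtnq_s32_f32) deserves a scalar model (illustration only): vbslq_f32(sign, a, half) builds 0.5f carrying the sign bit of a, so the sequence rounds halves away from zero rather than to even:

#include <math.h>

// scalar model of the non-VFPV4 NEON npyv_round_s32_f32 path
static inline int round_away_model(float a)
{
    float sign_half = copysignf(0.5f, a); // vbslq_f32(sign, a, half)
    return (int)(a + sign_half);          // vcvtq truncates toward zero
}
// e.g. round_away_model(0.5f) == 1, round_away_model(-0.5f) == -1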
diff --git a/numpy/core/src/common/simd/sse/operators.h b/numpy/core/src/common/simd/sse/operators.h
index 6e32ca4fd..51c84fb4e 100644
--- a/numpy/core/src/common/simd/sse/operators.h
+++ b/numpy/core/src/common/simd/sse/operators.h
@@ -62,6 +62,10 @@ NPY_FINLINE __m128i npyv_shr_s64(__m128i a, int c)
 #define npyv_and_s64 _mm_and_si128
 #define npyv_and_f32 _mm_and_ps
 #define npyv_and_f64 _mm_and_pd
+#define npyv_and_b8  _mm_and_si128
+#define npyv_and_b16 _mm_and_si128
+#define npyv_and_b32 _mm_and_si128
+#define npyv_and_b64 _mm_and_si128
 
 // OR
 #define npyv_or_u8  _mm_or_si128
@@ -74,6 +78,10 @@ NPY_FINLINE __m128i npyv_shr_s64(__m128i a, int c)
 #define npyv_or_s64 _mm_or_si128
 #define npyv_or_f32 _mm_or_ps
 #define npyv_or_f64 _mm_or_pd
+#define npyv_or_b8  _mm_or_si128
+#define npyv_or_b16 _mm_or_si128
+#define npyv_or_b32 _mm_or_si128
+#define npyv_or_b64 _mm_or_si128
 
 // XOR
 #define npyv_xor_u8  _mm_xor_si128
@@ -86,6 +94,10 @@ NPY_FINLINE __m128i npyv_shr_s64(__m128i a, int c)
 #define npyv_xor_s64 _mm_xor_si128
 #define npyv_xor_f32 _mm_xor_ps
 #define npyv_xor_f64 _mm_xor_pd
+#define npyv_xor_b8  _mm_xor_si128
+#define npyv_xor_b16 _mm_xor_si128
+#define npyv_xor_b32 _mm_xor_si128
+#define npyv_xor_b64 _mm_xor_si128
 
 // NOT
 #define npyv_not_u8(A) _mm_xor_si128(A, _mm_set1_epi32(-1))
@@ -98,6 +110,10 @@ NPY_FINLINE __m128i npyv_shr_s64(__m128i a, int c)
 #define npyv_not_s64 npyv_not_u8
 #define npyv_not_f32(A) _mm_xor_ps(A, _mm_castsi128_ps(_mm_set1_epi32(-1)))
 #define npyv_not_f64(A) _mm_xor_pd(A, _mm_castsi128_pd(_mm_set1_epi32(-1)))
+#define npyv_not_b8  npyv_not_u8
+#define npyv_not_b16 npyv_not_u8
+#define npyv_not_b32 npyv_not_u8
+#define npyv_not_b64 npyv_not_u8
 
 /***************************
  * Comparison
@@ -255,4 +271,10 @@ NPY_FINLINE __m128i npyv_shr_s64(__m128i a, int c)
 #define npyv_cmpge_f32(a, b) _mm_castps_si128(_mm_cmpge_ps(a, b))
 #define npyv_cmpge_f64(a, b) _mm_castpd_si128(_mm_cmpge_pd(a, b))
 
+// check special cases
+NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a)
+{ return _mm_castps_si128(_mm_cmpord_ps(a, a)); }
+NPY_FINLINE npyv_b64 npyv_notnan_f64(npyv_f64 a)
+{ return _mm_castpd_si128(_mm_cmpord_pd(a, a)); }
+
 #endif // _NPY_SIMD_SSE_OPERATORS_H
diff --git a/numpy/core/src/common/simd/sse/sse.h b/numpy/core/src/common/simd/sse/sse.h
index 132d3d347..dc0b62f73 100644
--- a/numpy/core/src/common/simd/sse/sse.h
+++ b/numpy/core/src/common/simd/sse/sse.h
@@ -5,7 +5,11 @@
 #define NPY_SIMD 128
 #define NPY_SIMD_WIDTH 16
 #define NPY_SIMD_F64 1
-
+#if defined(NPY_HAVE_FMA3) || defined(NPY_HAVE_FMA4)
+    #define NPY_SIMD_FMA3 1 // native support
+#else
+    #define NPY_SIMD_FMA3 0 // fast emulated
+#endif
 typedef __m128i npyv_u8;
 typedef __m128i npyv_s8;
 typedef __m128i npyv_u16;
diff --git a/numpy/core/src/common/simd/vsx/conversion.h b/numpy/core/src/common/simd/vsx/conversion.h
index 5803e1cdd..72fe10e69 100644
--- a/numpy/core/src/common/simd/vsx/conversion.h
+++ b/numpy/core/src/common/simd/vsx/conversion.h
@@ -51,4 +51,54 @@ NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
     return vec_extract(bit, 0) | (int)vec_extract(bit, 1) << 1;
 }
 
+// truncate, compatible with all compilers (internal use for now)
+NPY_FINLINE npyv_s32 npyv__trunc_s32_f32(npyv_f32 a)
+{
+#ifdef __IBMC__
+    return vec_cts(a, 0);
+#elif defined(__clang__)
+    /**
+     * old versions of Clang don't support %x<n> in the inline asm template,
+     * which fixes the register number when using any of the register
+     * constraints wa, wd, wf; therefore, we count on built-in functions.
+     */
+    return __builtin_convertvector(a, npyv_s32);
+#else // gcc
+    npyv_s32 ret;
+    __asm__ ("xvcvspsxws %x0,%x1" : "=wa" (ret) : "wa" (a));
+    return ret;
+#endif
+}
+NPY_FINLINE npyv_s32 npyv__trunc_s32_f64(npyv_f64 a, npyv_f64 b)
+{
+#ifdef __IBMC__
+    const npyv_u8 seq_even = npyv_set_u8(0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27);
+    // unfortunately, XLC is missing the asm vsx register fixer;
+    // hopefully, XLC can optimize around big-endian compatibility
+    npyv_s32 lo_even = vec_cts(a, 0);
+    npyv_s32 hi_even = vec_cts(b, 0);
+    return vec_perm(lo_even, hi_even, seq_even);
+#else
+    const npyv_u8 seq_odd = npyv_set_u8(4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31);
+    #ifdef __clang__
+        // __builtin_convertvector doesn't support this conversion on a wide
+        // range of versions; fortunately, almost all versions have a direct
+        // builtin for 'xvcvdpsxws'
+        npyv_s32 lo_odd = __builtin_vsx_xvcvdpsxws(a);
+        npyv_s32 hi_odd = __builtin_vsx_xvcvdpsxws(b);
+    #else // gcc
+        npyv_s32 lo_odd, hi_odd;
+        __asm__ ("xvcvdpsxws %x0,%x1" : "=wa" (lo_odd) : "wa" (a));
+        __asm__ ("xvcvdpsxws %x0,%x1" : "=wa" (hi_odd) : "wa" (b));
+    #endif
+    return vec_perm(lo_odd, hi_odd, seq_odd);
+#endif
+}
+
+// round to nearest integer (assuming even)
+NPY_FINLINE npyv_s32 npyv_round_s32_f32(npyv_f32 a)
+{ return npyv__trunc_s32_f32(vec_rint(a)); }
+
+NPY_FINLINE npyv_s32 npyv_round_s32_f64(npyv_f64 a, npyv_f64 b)
+{ return npyv__trunc_s32_f64(vec_rint(a), vec_rint(b)); }
+
 #endif // _NPY_SIMD_VSX_CVT_H
diff --git a/numpy/core/src/common/simd/vsx/operators.h b/numpy/core/src/common/simd/vsx/operators.h
index ca020d9e0..230610129 100644
--- a/numpy/core/src/common/simd/vsx/operators.h
+++ b/numpy/core/src/common/simd/vsx/operators.h
@@ -56,6 +56,10 @@
 #define npyv_and_s64 vec_and
 #define npyv_and_f32 vec_and
 #define npyv_and_f64 vec_and
+#define npyv_and_b8  vec_and
+#define npyv_and_b16 vec_and
+#define npyv_and_b32 vec_and
+#define npyv_and_b64 vec_and
 
 // OR
 #define npyv_or_u8  vec_or
@@ -68,6 +72,10 @@
 #define npyv_or_s64 vec_or
 #define npyv_or_f32 vec_or
 #define npyv_or_f64 vec_or
+#define npyv_or_b8  vec_or
+#define npyv_or_b16 vec_or
+#define npyv_or_b32 vec_or
+#define npyv_or_b64 vec_or
 
 // XOR
 #define npyv_xor_u8  vec_xor
@@ -80,6 +88,10 @@
 #define npyv_xor_s64 vec_xor
 #define npyv_xor_f32 vec_xor
 #define npyv_xor_f64 vec_xor
+#define npyv_xor_b8  vec_xor
+#define npyv_xor_b16 vec_xor
+#define npyv_xor_b32 vec_xor
+#define npyv_xor_b64 vec_xor
 
 // NOT
 // note: we implement npyv_not_b*(boolen types) for internal use*/
@@ -213,4 +225,10 @@ NPY_FINLINE npyv_f64 npyv_not_f64(npyv_f64 a)
 #define npyv_cmple_f32(A, B) npyv_cmpge_f32(B, A)
 #define npyv_cmple_f64(A, B) npyv_cmpge_f64(B, A)
 
+// check special cases
+NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a)
+{ return vec_cmpeq(a, a); }
+NPY_FINLINE npyv_b64 npyv_notnan_f64(npyv_f64 a)
+{ return vec_cmpeq(a, a); }
+
 #endif // _NPY_SIMD_VSX_OPERATORS_H
diff --git a/numpy/core/src/common/simd/vsx/vsx.h b/numpy/core/src/common/simd/vsx/vsx.h
index 27dde98e7..769f5a08f 100644
--- a/numpy/core/src/common/simd/vsx/vsx.h
+++ b/numpy/core/src/common/simd/vsx/vsx.h
@@ -5,6 +5,7 @@
 #define NPY_SIMD 128
 #define NPY_SIMD_WIDTH 16
 #define NPY_SIMD_F64 1
+#define NPY_SIMD_FMA3 1 // native support
 
 typedef __vector unsigned char npyv_u8;
 typedef __vector signed char npyv_s8;
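All four backends implement notnan the same way; the trick, shown here as a scalar sketch (not from the patch), is that an ordered self-comparison is true for every value except NaN, since NaN != NaN:

// per-lane model of npyv_notnan_##SFX: vec_cmpeq(a, a) on VSX,
// vceqq on NEON, _CMP_ORD_Q / _mm_cmpord on x86
static inline int notnan_lane(double x)
{ return x == x; } // false (an all-zero mask lane) only when x is NaN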
diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py
index 196003cdd..2f378667d 100644
--- a/numpy/core/tests/test_simd.py
+++ b/numpy/core/tests/test_simd.py
@@ -1,13 +1,16 @@
 # NOTE: Please avoid the use of numpy.testing since NPYV intrinsics
 # may be involved in their functionality.
-import pytest, math
+import pytest, math, re
 from numpy.core._simd import targets
+from numpy.core._multiarray_umath import __cpu_baseline__
 
 class _Test_Utility:
-    # submodule of the desired SIMD extention, e.g. targets["AVX512F"]
+    # submodule of the desired SIMD extension, e.g. targets["AVX512F"]
     npyv = None
     # the current data type suffix e.g. 's8'
     sfx = None
+    # target name can be 'baseline' or one or more of CPU features
+    target_name = None
 
     def __getattr__(self, attr):
         """
@@ -92,6 +95,14 @@ class _Test_Utility:
         v = self.npyv.setall_u32(0x7fc00000)
         return self.npyv.reinterpret_f32_u32(v)[0]
 
+    def _cpu_features(self):
+        target = self.target_name
+        if target == "baseline":
+            target = __cpu_baseline__
+        else:
+            target = target.split('__') # multi-target separator
+        return ' '.join(target)
+
 class _SIMD_BOOL(_Test_Utility):
     """
     To test all boolean vector types at once
@@ -110,6 +121,32 @@ class _SIMD_BOOL(_Test_Utility):
         cvt = getattr(self.npyv, f"cvt_b{len_str}_u{len_str}")
         return cvt(load(data))
 
+    def test_operators_logical(self):
+        """
+        Logical operations for boolean types.
+        Test intrinsics:
+            npyv_xor_##SFX, npyv_and_##SFX, npyv_or_##SFX, npyv_not_##SFX
+        """
+        data_a = self._data()
+        data_b = self._data(reverse=True)
+        vdata_a = self._load_b(data_a)
+        vdata_b = self._load_b(data_b)
+
+        data_and = [a & b for a, b in zip(data_a, data_b)]
+        vand = getattr(self, "and")(vdata_a, vdata_b)
+        assert vand == data_and
+
+        data_or = [a | b for a, b in zip(data_a, data_b)]
+        vor = getattr(self, "or")(vdata_a, vdata_b)
+        assert vor == data_or
+
+        data_xor = [a ^ b for a, b in zip(data_a, data_b)]
+        vxor = getattr(self, "xor")(vdata_a, vdata_b)
+        assert vxor == data_xor
+
+        vnot = getattr(self, "not")(vdata_a)
+        assert vnot == data_b
+
     def test_tobits(self):
         data2bits = lambda data: sum([int(x != 0) << i for i, x in enumerate(data, 0)])
         for data in (self._data(), self._data(reverse=True)):
@@ -164,6 +201,46 @@ class _SIMD_INT(_Test_Utility):
         subs = self.subs(vdata_a, vdata_b)
         assert subs == data_subs
 
+class _SIMD_FP32(_Test_Utility):
+    """
+    To only test single precision
+    """
+    def test_conversions(self):
+        """
+        Round to nearest even integer, assumes the CPU control register
+        is set to round-to-nearest.
+        Test intrinsics:
+            npyv_round_s32_##SFX
+        """
+        features = self._cpu_features()
+        if not self.npyv.simd_f64 and re.match(r".*(NEON|ASIMD)", features):
+            # very costly to emulate nearest even on Armv7;
+            # instead we round halves away from zero, e.g. 0.5 -> 1, -0.5 -> -1
+            _round = lambda v: int(v + (0.5 if v >= 0 else -0.5))
+        else:
+            _round = round
+        vdata_a = self.load(self._data())
+        vdata_a = self.sub(vdata_a, self.setall(0.5))
+        data_round = [_round(x) for x in vdata_a]
+        vround = self.round_s32(vdata_a)
+        assert vround == data_round
+
+class _SIMD_FP64(_Test_Utility):
+    """
+    To only test double precision
+    """
+    def test_conversions(self):
+        """
+        Round to nearest even integer, assumes the CPU control register
+        is set to round-to-nearest.
+        Test intrinsics:
+            npyv_round_s32_##SFX
+        """
+        vdata_a = self.load(self._data())
+        vdata_a = self.sub(vdata_a, self.setall(0.5))
+        vdata_b = self.mul(vdata_a, self.setall(-1.5))
+        data_round = [round(x) for x in list(vdata_a) + list(vdata_b)]
+        vround = self.round_s32(vdata_a, vdata_b)
+        assert vround == data_round
+
 class _SIMD_FP(_Test_Utility):
     """
     To test all float vector types at once
@@ -247,6 +324,14 @@ class _SIMD_FP(_Test_Utility):
         recip = self.recip(vdata)
         assert recip == data_recip
 
+    def test_special_cases(self):
+        """
+        Compare Not NaN. Test intrinsics:
+            npyv_notnan_##SFX
+        """
+        nnan = self.notnan(self.setall(self._nan()))
+        assert nnan == [0]*self.nlanes
+
 class _SIMD_ALL(_Test_Utility):
     """
     To test all vector types at once
@@ -440,7 +525,7 @@ class _SIMD_ALL(_Test_Utility):
         vsetf = self.setf(10, *data_a)
         assert vsetf == data_a
 
-        # We're testing the sainty of _simd's type-vector,
+        # We're testing the sanity of _simd's type-vector,
         # reinterpret* intrinsics itself are tested via compiler
         # during the build of _simd module
         sfxes = ["u8", "s8", "u16", "s16", "u32", "s32", "u64", "s64", "f32"]
@@ -632,6 +717,29 @@ class _SIMD_ALL(_Test_Utility):
         vsum = self.sum(vdata)
         assert vsum == data_sum
 
+    def test_mask_conditional(self):
+        """
+        Conditional addition and subtraction for all supported data types.
+        Test intrinsics:
+            npyv_ifadd_##SFX, npyv_ifsub_##SFX
+        """
+        vdata_a = self.load(self._data())
+        vdata_b = self.load(self._data(reverse=True))
+        true_mask  = self.cmpeq(self.zero(), self.zero())
+        false_mask = self.cmpneq(self.zero(), self.zero())
+
+        data_sub = self.sub(vdata_b, vdata_a)
+        ifsub = self.ifsub(true_mask, vdata_b, vdata_a, vdata_b)
+        assert ifsub == data_sub
+        ifsub = self.ifsub(false_mask, vdata_a, vdata_b, vdata_b)
+        assert ifsub == vdata_b
+
+        data_add = self.add(vdata_b, vdata_a)
+        ifadd = self.ifadd(true_mask, vdata_b, vdata_a, vdata_b)
+        assert ifadd == data_add
+        ifadd = self.ifadd(false_mask, vdata_a, vdata_b, vdata_b)
+        assert ifadd == vdata_b
+
 bool_sfx = ("b8", "b16", "b32", "b64")
 int_sfx = ("u8", "s8", "u16", "s16", "u32", "s32", "u64", "s64")
 fp_sfx  = ("f32", "f64")
@@ -640,6 +748,8 @@ tests_registry = {
     bool_sfx: _SIMD_BOOL,
     int_sfx : _SIMD_INT,
     fp_sfx  : _SIMD_FP,
+    ("f32",): _SIMD_FP32,
+    ("f64",): _SIMD_FP64,
     all_sfx : _SIMD_ALL
 }
 for target_name, npyv in targets.items():
@@ -664,7 +774,7 @@ for target_name, npyv in targets.items():
     for sfx in sfxes:
         skip_m = skip_sfx.get(sfx, skip)
         inhr = (cls,)
-        attr = dict(npyv=targets[target_name], sfx=sfx)
+        attr = dict(npyv=targets[target_name], sfx=sfx, target_name=target_name)
         tcls = type(f"Test{cls.__name__}_{simd_width}_{target_name}_{sfx}", inhr, attr)
         if skip_m:
             pytest.mark.skip(reason=skip_m)(tcls)
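The tests above assume the default FP environment; a quick check (hypothetical snippet, not from the test suite) of the ties-to-even behavior they rely on:

#include <math.h>
#include <stdio.h>

int main(void)
{
    // with round-to-nearest (the default), halves go to the even integer
    printf("%.0f %.0f %.0f\n", rint(0.5), rint(1.5), rint(2.5)); // 0 2 2
    return 0;
}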