author      Matti Picus <matti.picus@gmail.com>        2022-09-25 07:22:52 +0300
committer   GitHub <noreply@github.com>                2022-09-25 07:22:52 +0300
commit      d66ca35de0cae4e41471dc44793c18511eb45109 (patch)
tree        36b66d3907ab8b26cdbdfcf661888f3d2af747fa /numpy/core
parent      2ec7a5d917dbeedf28ee4054e73f92ecebb44975 (diff)
parent      6ef4c8bc1459f5d4f548ed87715651c6bc75fc49 (diff)
download    numpy-d66ca35de0cae4e41471dc44793c18511eb45109.tar.gz
Merge pull request #22306 from seiko2plus/npyv_new_intrinsics_sep2022_vol0
ENH: Implement essential intrinsics required by the upcoming SIMD optimizations(0)
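Among the additions are cross-lane boolean tests (npyv_any_*/npyv_all_*) and a first-lane extractor (npyv_extract0_*) for every data type on every backend. A minimal usage sketch, not part of the patch, follows; the helper name any_negative_s32 is hypothetical, the buffer length is assumed to be a multiple of npyv_nlanes_s32, and every npyv_* call except npyv_any_b32 predates this merge.

#include "simd/simd.h"

#if NPY_SIMD
// Return NPY_TRUE if any element of `src` is negative (illustrative sketch only).
static npy_bool
any_negative_s32(const npy_int32 *src, npy_intp len)
{
    for (npy_intp i = 0; i < len; i += npyv_nlanes_s32) {
        npyv_s32 v = npyv_load_s32(src + i);
        // new in this merge: collapse a boolean vector across all of its lanes
        if (npyv_any_b32(npyv_cmplt_s32(v, npyv_zero_s32()))) {
            return NPY_TRUE;
        }
    }
    return NPY_FALSE;
}
#endif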
Diffstat (limited to 'numpy/core')
21 files changed, 1276 insertions, 251 deletions
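Most of the insertions implement horizontal min/max reductions (npyv_reduce_min_*/npyv_reduce_max_*, plus the NaN-ignoring "p" and NaN-propagating "n" variants) and NaN-propagating pairwise min/max (npyv_minn_*/npyv_maxn_*) across the SSE, AVX2, AVX-512, NEON and VSX/VX backends. The sketch below is illustrative rather than taken from the patch: the helper name is hypothetical, the length is assumed to be a non-zero multiple of npyv_nlanes_f32, and only npyv_reduce_maxp_f32 is new here.

#include "simd/simd.h"

#if NPY_SIMD
// NaN-ignoring maximum of a float32 buffer (illustrative sketch only).
static float
max_ignoring_nan_f32(const float *src, npy_intp len)
{
    npyv_f32 acc = npyv_load_f32(src);
    for (npy_intp i = npyv_nlanes_f32; i < len; i += npyv_nlanes_f32) {
        // npyv_maxp_f32 keeps the non-NaN operand in each lane
        acc = npyv_maxp_f32(acc, npyv_load_f32(src + i));
    }
    // new in this merge: reduce the accumulator to a scalar with the same
    // NaN-ignoring semantics (an all-NaN vector falls back to its first lane)
    return npyv_reduce_maxp_f32(acc);
}
#endif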
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src index 997205957..b6af8e6a9 100644 --- a/numpy/core/src/_simd/_simd.dispatch.c.src +++ b/numpy/core/src/_simd/_simd.dispatch.c.src @@ -247,6 +247,7 @@ SIMD_IMPL_INTRIN_2(lut16_@sfx@, v@sfx@, q@sfx@, vu@size@) * Misc ***************************/ SIMD_IMPL_INTRIN_0(zero_@sfx@, v@sfx@) +SIMD_IMPL_INTRIN_1(extract0_@sfx@, @sfx@, v@sfx@) SIMD_IMPL_INTRIN_1(setall_@sfx@, v@sfx@, @sfx@) SIMD_IMPL_INTRIN_3(select_@sfx@, v@sfx@, v@bsfx@, v@sfx@, v@sfx@) @@ -340,6 +341,12 @@ SIMD_IMPL_INTRIN_2(orc_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@) SIMD_IMPL_INTRIN_2(xnor_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@) #endif +// test cross all vector lanes +/**begin repeat1 + * #intrin = any, all# + */ +SIMD_IMPL_INTRIN_1(@intrin@_@sfx@, u8, v@sfx@) +/**end repeat1**/ /*************************** * Conversion ***************************/ @@ -409,13 +416,16 @@ SIMD_IMPL_INTRIN_1(@intrin@_@sfx@, v@sfx@, v@sfx@) * #intrin = max, min# */ SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@) +SIMD_IMPL_INTRIN_1(reduce_@intrin@_@sfx@, @sfx@, v@sfx@) /**end repeat1**/ #if @fp_only@ /**begin repeat1 - * #intrin = maxp, minp# + * #intrin = maxp, minp, maxn, minn# */ SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@) +SIMD_IMPL_INTRIN_1(reduce_@intrin@_@sfx@, @sfx@, v@sfx@) +/**end repeat1**/ /**end repeat1**/ #endif @@ -465,14 +475,20 @@ SIMD_IMPL_INTRIN_0N(cleanup) /*************************** * Operators ***************************/ -// Logical /**begin repeat * #bsfx = b8, b16, b32, b64# */ +// Logical SIMD_IMPL_INTRIN_2(and_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@) SIMD_IMPL_INTRIN_2(or_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@) SIMD_IMPL_INTRIN_2(xor_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@) SIMD_IMPL_INTRIN_1(not_@bsfx@, v@bsfx@, v@bsfx@) +// test cross vector's lanes +/**begin repeat1 + * #intrin = any, all# + */ +SIMD_IMPL_INTRIN_1(@intrin@_@bsfx@, u8, v@bsfx@) +/**end repeat1**/ /**end repeat**/ /*************************** * Conversions @@ -559,7 +575,7 @@ SIMD_INTRIN_DEF(reinterpret_@sfx_to@_@sfx@) /**end repeat1**/ /**begin repeat1 - * # intrin = set, setf, setall, zero, select# + * # intrin = set, setf, setall, zero, select, extract0# */ SIMD_INTRIN_DEF(@intrin@_@sfx@) /**end repeat1**/ @@ -589,7 +605,8 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@) #endif // shl_imm /**begin repeat1 - * #intrin = and, or, xor, not, cmpeq, cmpneq, cmpgt, cmpge, cmplt, cmple# + * #intrin = and, or, xor, not, cmpeq, cmpneq, cmpgt, cmpge, cmplt, cmple, + * any, all# */ SIMD_INTRIN_DEF(@intrin@_@sfx@) /**end repeat1**/ @@ -669,13 +686,16 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@) * #intrin = max, min# */ SIMD_INTRIN_DEF(@intrin@_@sfx@) +SIMD_INTRIN_DEF(reduce_@intrin@_@sfx@) /**end repeat1**/ #if @fp_only@ /**begin repeat1 - * #intrin = maxp, minp# + * #intrin = maxp, minp, maxn, minn# */ SIMD_INTRIN_DEF(@intrin@_@sfx@) +SIMD_INTRIN_DEF(reduce_@intrin@_@sfx@) +/**end repeat1**/ /**end repeat1**/ #endif @@ -725,14 +745,20 @@ SIMD_INTRIN_DEF(cleanup) /*************************** * Operators ***************************/ -// Logical /**begin repeat * #bsfx = b8, b16, b32, b64# */ +// Logical SIMD_INTRIN_DEF(and_@bsfx@) SIMD_INTRIN_DEF(or_@bsfx@) SIMD_INTRIN_DEF(xor_@bsfx@) SIMD_INTRIN_DEF(not_@bsfx@) +// test cross vector's lanes +/**begin repeat1 + * #intrin = any, all# + */ +SIMD_INTRIN_DEF(@intrin@_@bsfx@) +/**end repeat1**/ /**end repeat**/ /*************************** * Conversions diff --git a/numpy/core/src/common/simd/avx2/math.h b/numpy/core/src/common/simd/avx2/math.h 
index deaf4ad11..5c869f911 100644 --- a/numpy/core/src/common/simd/avx2/math.h +++ b/numpy/core/src/common/simd/avx2/math.h @@ -55,6 +55,21 @@ NPY_FINLINE npyv_f64 npyv_maxp_f64(npyv_f64 a, npyv_f64 b) __m256d max = _mm256_max_pd(a, b); return _mm256_blendv_pd(a, max, nn); } +// Maximum, propagates NaNs +// If any of corresponded elements is NaN, NaN is set. +NPY_FINLINE npyv_f32 npyv_maxn_f32(npyv_f32 a, npyv_f32 b) +{ + __m256 nn = _mm256_cmp_ps(a, a, _CMP_ORD_Q); + __m256 max = _mm256_max_ps(a, b); + return _mm256_blendv_ps(a, max, nn); +} +NPY_FINLINE npyv_f64 npyv_maxn_f64(npyv_f64 a, npyv_f64 b) +{ + __m256d nn = _mm256_cmp_pd(a, a, _CMP_ORD_Q); + __m256d max = _mm256_max_pd(a, b); + return _mm256_blendv_pd(a, max, nn); +} + // Maximum, integer operations #define npyv_max_u8 _mm256_max_epu8 #define npyv_max_s8 _mm256_max_epi8 @@ -89,6 +104,20 @@ NPY_FINLINE npyv_f64 npyv_minp_f64(npyv_f64 a, npyv_f64 b) __m256d min = _mm256_min_pd(a, b); return _mm256_blendv_pd(a, min, nn); } +// Minimum, propagates NaNs +// If any of corresponded element is NaN, NaN is set. +NPY_FINLINE npyv_f32 npyv_minn_f32(npyv_f32 a, npyv_f32 b) +{ + __m256 nn = _mm256_cmp_ps(a, a, _CMP_ORD_Q); + __m256 min = _mm256_min_ps(a, b); + return _mm256_blendv_ps(a, min, nn); +} +NPY_FINLINE npyv_f64 npyv_minn_f64(npyv_f64 a, npyv_f64 b) +{ + __m256d nn = _mm256_cmp_pd(a, a, _CMP_ORD_Q); + __m256d min = _mm256_min_pd(a, b); + return _mm256_blendv_pd(a, min, nn); +} // Minimum, integer operations #define npyv_min_u8 _mm256_min_epu8 #define npyv_min_s8 _mm256_min_epi8 @@ -104,6 +133,106 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b) { return _mm256_blendv_epi8(a, b, _mm256_cmpgt_epi64(a, b)); } +// reduce min&max for 32&64-bits +#define NPY_IMPL_AVX2_REDUCE_MINMAX(STYPE, INTRIN, VINTRIN) \ + NPY_FINLINE STYPE##32 npyv_reduce_##INTRIN##32(__m256i a) \ + { \ + __m128i v128 = _mm_##VINTRIN##32(_mm256_castsi256_si128(a), _mm256_extracti128_si256(a, 1)); \ + __m128i v64 = _mm_##VINTRIN##32(v128, _mm_shuffle_epi32(v128, _MM_SHUFFLE(0, 0, 3, 2))); \ + __m128i v32 = _mm_##VINTRIN##32(v64, _mm_shuffle_epi32(v64, _MM_SHUFFLE(0, 0, 0, 1))); \ + return (STYPE##32)_mm_cvtsi128_si32(v32); \ + } \ + NPY_FINLINE STYPE##64 npyv_reduce_##INTRIN##64(__m256i a) \ + { \ + __m256i v128 = npyv_##INTRIN##64(a, _mm256_permute2f128_si256(a, a, _MM_SHUFFLE(0, 0, 0, 1))); \ + __m256i v64 = npyv_##INTRIN##64(v128, _mm256_shuffle_epi32(v128, _MM_SHUFFLE(0, 0, 3, 2))); \ + return (STYPE##64)npyv_extract0_u64(v64); \ + } +NPY_IMPL_AVX2_REDUCE_MINMAX(npy_uint, min_u, min_epu) +NPY_IMPL_AVX2_REDUCE_MINMAX(npy_int, min_s, min_epi) +NPY_IMPL_AVX2_REDUCE_MINMAX(npy_uint, max_u, max_epu) +NPY_IMPL_AVX2_REDUCE_MINMAX(npy_int, max_s, max_epi) +#undef NPY_IMPL_AVX2_REDUCE_MINMAX + +// reduce min&max for ps & pd +#define NPY_IMPL_AVX2_REDUCE_MINMAX(INTRIN, INF, INF64) \ + NPY_FINLINE float npyv_reduce_##INTRIN##_f32(npyv_f32 a) \ + { \ + __m128 v128 = _mm_##INTRIN##_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 1)); \ + __m128 v64 = _mm_##INTRIN##_ps(v128, _mm_shuffle_ps(v128, v128, _MM_SHUFFLE(0, 0, 3, 2))); \ + __m128 v32 = _mm_##INTRIN##_ps(v64, _mm_shuffle_ps(v64, v64, _MM_SHUFFLE(0, 0, 0, 1))); \ + return _mm_cvtss_f32(v32); \ + } \ + NPY_FINLINE double npyv_reduce_##INTRIN##_f64(npyv_f64 a) \ + { \ + __m128d v128 = _mm_##INTRIN##_pd(_mm256_castpd256_pd128(a), _mm256_extractf128_pd(a, 1)); \ + __m128d v64 = _mm_##INTRIN##_pd(v128, _mm_shuffle_pd(v128, v128, _MM_SHUFFLE(0, 0, 0, 1))); \ + return _mm_cvtsd_f64(v64); \ + } \ + NPY_FINLINE 
float npyv_reduce_##INTRIN##p_f32(npyv_f32 a) \ + { \ + npyv_b32 notnan = npyv_notnan_f32(a); \ + if (NPY_UNLIKELY(!npyv_any_b32(notnan))) { \ + return _mm_cvtss_f32(_mm256_castps256_ps128(a)); \ + } \ + a = npyv_select_f32(notnan, a, npyv_reinterpret_f32_u32(npyv_setall_u32(INF))); \ + return npyv_reduce_##INTRIN##_f32(a); \ + } \ + NPY_FINLINE double npyv_reduce_##INTRIN##p_f64(npyv_f64 a) \ + { \ + npyv_b64 notnan = npyv_notnan_f64(a); \ + if (NPY_UNLIKELY(!npyv_any_b64(notnan))) { \ + return _mm_cvtsd_f64(_mm256_castpd256_pd128(a)); \ + } \ + a = npyv_select_f64(notnan, a, npyv_reinterpret_f64_u64(npyv_setall_u64(INF64))); \ + return npyv_reduce_##INTRIN##_f64(a); \ + } \ + NPY_FINLINE float npyv_reduce_##INTRIN##n_f32(npyv_f32 a) \ + { \ + npyv_b32 notnan = npyv_notnan_f32(a); \ + if (NPY_UNLIKELY(!npyv_all_b32(notnan))) { \ + const union { npy_uint32 i; float f;} pnan = {0x7fc00000UL}; \ + return pnan.f; \ + } \ + return npyv_reduce_##INTRIN##_f32(a); \ + } \ + NPY_FINLINE double npyv_reduce_##INTRIN##n_f64(npyv_f64 a) \ + { \ + npyv_b64 notnan = npyv_notnan_f64(a); \ + if (NPY_UNLIKELY(!npyv_all_b64(notnan))) { \ + const union { npy_uint64 i; double d;} pnan = {0x7ff8000000000000ull}; \ + return pnan.d; \ + } \ + return npyv_reduce_##INTRIN##_f64(a); \ + } +NPY_IMPL_AVX2_REDUCE_MINMAX(min, 0x7f800000, 0x7ff0000000000000) +NPY_IMPL_AVX2_REDUCE_MINMAX(max, 0xff800000, 0xfff0000000000000) +#undef NPY_IMPL_AVX2_REDUCE_MINMAX + +// reduce min&max for 8&16-bits +#define NPY_IMPL_AVX256_REDUCE_MINMAX(STYPE, INTRIN, VINTRIN) \ + NPY_FINLINE STYPE##16 npyv_reduce_##INTRIN##16(__m256i a) \ + { \ + __m128i v128 = _mm_##VINTRIN##16(_mm256_castsi256_si128(a), _mm256_extracti128_si256(a, 1)); \ + __m128i v64 = _mm_##VINTRIN##16(v128, _mm_shuffle_epi32(v128, _MM_SHUFFLE(0, 0, 3, 2))); \ + __m128i v32 = _mm_##VINTRIN##16(v64, _mm_shuffle_epi32(v64, _MM_SHUFFLE(0, 0, 0, 1))); \ + __m128i v16 = _mm_##VINTRIN##16(v32, _mm_shufflelo_epi16(v32, _MM_SHUFFLE(0, 0, 0, 1))); \ + return (STYPE##16)_mm_cvtsi128_si32(v16); \ + } \ + NPY_FINLINE STYPE##8 npyv_reduce_##INTRIN##8(__m256i a) \ + { \ + __m128i v128 = _mm_##VINTRIN##8(_mm256_castsi256_si128(a), _mm256_extracti128_si256(a, 1)); \ + __m128i v64 = _mm_##VINTRIN##8(v128, _mm_shuffle_epi32(v128, _MM_SHUFFLE(0, 0, 3, 2))); \ + __m128i v32 = _mm_##VINTRIN##8(v64, _mm_shuffle_epi32(v64, _MM_SHUFFLE(0, 0, 0, 1))); \ + __m128i v16 = _mm_##VINTRIN##8(v32, _mm_shufflelo_epi16(v32, _MM_SHUFFLE(0, 0, 0, 1))); \ + __m128i v8 = _mm_##VINTRIN##8(v16, _mm_srli_epi16(v16, 8)); \ + return (STYPE##16)_mm_cvtsi128_si32(v8); \ + } +NPY_IMPL_AVX256_REDUCE_MINMAX(npy_uint, min_u, min_epu) +NPY_IMPL_AVX256_REDUCE_MINMAX(npy_int, min_s, min_epi) +NPY_IMPL_AVX256_REDUCE_MINMAX(npy_uint, max_u, max_epu) +NPY_IMPL_AVX256_REDUCE_MINMAX(npy_int, max_s, max_epi) +#undef NPY_IMPL_AVX256_REDUCE_MINMAX // round to nearest intger even #define npyv_rint_f32(A) _mm256_round_ps(A, _MM_FROUND_TO_NEAREST_INT) diff --git a/numpy/core/src/common/simd/avx2/misc.h b/numpy/core/src/common/simd/avx2/misc.h index 5e91e91b3..41e788c75 100644 --- a/numpy/core/src/common/simd/avx2/misc.h +++ b/numpy/core/src/common/simd/avx2/misc.h @@ -31,7 +31,7 @@ NPY_FINLINE __m256i npyv__setr_epi64(npy_int64, npy_int64, npy_int64, npy_int64) NPY_FINLINE npyv_u64 npyv_setall_u64(npy_uint64 a) { npy_int64 ai = (npy_int64)a; -#if defined(_MSC_VER) && defined(_M_IX86) +#if defined(_MSC_VER) && defined(_M_IX86) return npyv__setr_epi64(ai, ai, ai, ai); #else return _mm256_set1_epi64x(ai); @@ -130,6 +130,18 @@ 
NPY_FINLINE __m256d npyv__setr_pd(double i0, double i1, double i2, double i3) #define npyv_select_f32(MASK, A, B) _mm256_blendv_ps(B, A, _mm256_castsi256_ps(MASK)) #define npyv_select_f64(MASK, A, B) _mm256_blendv_pd(B, A, _mm256_castsi256_pd(MASK)) +// extract the first vector's lane +#define npyv_extract0_u8(A) ((npy_uint8)_mm_cvtsi128_si32(_mm256_castsi256_si128(A))) +#define npyv_extract0_s8(A) ((npy_int8)_mm_cvtsi128_si32(_mm256_castsi256_si128(A))) +#define npyv_extract0_u16(A) ((npy_uint16)_mm_cvtsi128_si32(_mm256_castsi256_si128(A))) +#define npyv_extract0_s16(A) ((npy_int16)_mm_cvtsi128_si32(_mm256_castsi256_si128(A))) +#define npyv_extract0_u32(A) ((npy_uint32)_mm_cvtsi128_si32(_mm256_castsi256_si128(A))) +#define npyv_extract0_s32(A) ((npy_int32)_mm_cvtsi128_si32(_mm256_castsi256_si128(A))) +#define npyv_extract0_u64(A) ((npy_uint64)npyv128_cvtsi128_si64(_mm256_castsi256_si128(A))) +#define npyv_extract0_s64(A) ((npy_int64)npyv128_cvtsi128_si64(_mm256_castsi256_si128(A))) +#define npyv_extract0_f32(A) _mm_cvtss_f32(_mm256_castps256_ps128(A)) +#define npyv_extract0_f64(A) _mm_cvtsd_f64(_mm256_castpd256_pd128(A)) + // Reinterpret #define npyv_reinterpret_u8_u8(X) X #define npyv_reinterpret_u8_s8(X) X diff --git a/numpy/core/src/common/simd/avx2/operators.h b/numpy/core/src/common/simd/avx2/operators.h index 7682b24cb..c10267b21 100644 --- a/numpy/core/src/common/simd/avx2/operators.h +++ b/numpy/core/src/common/simd/avx2/operators.h @@ -225,4 +225,58 @@ NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a) NPY_FINLINE npyv_b64 npyv_notnan_f64(npyv_f64 a) { return _mm256_castpd_si256(_mm256_cmp_pd(a, a, _CMP_ORD_Q)); } +// Test cross all vector lanes +// any: returns true if any of the elements is not equal to zero +// all: returns true if all elements are not equal to zero +#define NPYV_IMPL_AVX2_ANYALL(SFX) \ + NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a) \ + { return _mm256_movemask_epi8(a) != 0; } \ + NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a) \ + { return _mm256_movemask_epi8(a) == -1; } +NPYV_IMPL_AVX2_ANYALL(b8) +NPYV_IMPL_AVX2_ANYALL(b16) +NPYV_IMPL_AVX2_ANYALL(b32) +NPYV_IMPL_AVX2_ANYALL(b64) +#undef NPYV_IMPL_AVX2_ANYALL + +#define NPYV_IMPL_AVX2_ANYALL(SFX) \ + NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a) \ + { \ + return _mm256_movemask_epi8( \ + npyv_cmpeq_##SFX(a, npyv_zero_##SFX()) \ + ) != -1; \ + } \ + NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a) \ + { \ + return _mm256_movemask_epi8( \ + npyv_cmpeq_##SFX(a, npyv_zero_##SFX()) \ + ) == 0; \ + } +NPYV_IMPL_AVX2_ANYALL(u8) +NPYV_IMPL_AVX2_ANYALL(s8) +NPYV_IMPL_AVX2_ANYALL(u16) +NPYV_IMPL_AVX2_ANYALL(s16) +NPYV_IMPL_AVX2_ANYALL(u32) +NPYV_IMPL_AVX2_ANYALL(s32) +NPYV_IMPL_AVX2_ANYALL(u64) +NPYV_IMPL_AVX2_ANYALL(s64) +#undef NPYV_IMPL_AVX2_ANYALL + +#define NPYV_IMPL_AVX2_ANYALL(SFX, XSFX, MASK) \ + NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a) \ + { \ + return _mm256_movemask_##XSFX( \ + _mm256_cmp_##XSFX(a, npyv_zero_##SFX(), _CMP_EQ_OQ) \ + ) != MASK; \ + } \ + NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a) \ + { \ + return _mm256_movemask_##XSFX( \ + _mm256_cmp_##XSFX(a, npyv_zero_##SFX(), _CMP_EQ_OQ) \ + ) == 0; \ + } +NPYV_IMPL_AVX2_ANYALL(f32, ps, 0xff) +NPYV_IMPL_AVX2_ANYALL(f64, pd, 0xf) +#undef NPYV_IMPL_AVX2_ANYALL + #endif // _NPY_SIMD_AVX2_OPERATORS_H diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h b/numpy/core/src/common/simd/avx512/arithmetic.h index 850a0c05a..1290dc0ad 100644 --- a/numpy/core/src/common/simd/avx512/arithmetic.h +++ b/numpy/core/src/common/simd/avx512/arithmetic.h @@ -371,79 
+371,7 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor) #define npyv_sum_u64 _mm512_reduce_add_epi64 #define npyv_sum_f32 _mm512_reduce_add_ps #define npyv_sum_f64 _mm512_reduce_add_pd - #define npyv_reducemin_u32 _mm512_reduce_min_epu32 - #define npyv_reducemin_s32 _mm512_reduce_min_epi32 - #define npyv_reducemin_f32 _mm512_reduce_min_ps - #define npyv_reducemax_u32 _mm512_reduce_max_epu32 - #define npyv_reducemax_s32 _mm512_reduce_max_epi32 - #define npyv_reducemax_f32 _mm512_reduce_max_ps #else - NPY_FINLINE npy_uint32 npyv_reducemax_u32(npyv_u32 a) - { - const npyv_u32 idx1 = _mm512_set_epi32(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); - const npyv_u32 idx2 = _mm512_set_epi32(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12); - npyv_u32 a1 = _mm512_max_epu32(a, _mm512_permutex2var_epi32(a, idx1, a)); - npyv_u32 a2 = _mm512_max_epu32(a1, _mm512_permutex2var_epi32(a1, idx2, a1)); - npyv_u32 a3 = _mm512_max_epu32(a2, _mm512_shuffle_epi32(a2, (_MM_PERM_ENUM)(1<<6 | 0<<4 | 3<<2 | 2))); - npyv_u32 a4 = _mm512_max_epu32(a3, _mm512_shuffle_epi32(a3, (_MM_PERM_ENUM)(2<<6 | 3<<4 | 0<<2 | 1))); - return _mm_cvtsi128_si32(_mm512_extracti32x4_epi32(a4, 0x00)); - } - - NPY_FINLINE npy_int32 npyv_reducemax_s32(npyv_s32 a) - { - const npyv_u32 idx1 = _mm512_set_epi32(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); - const npyv_u32 idx2 = _mm512_set_epi32(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12); - npyv_s32 a1 = _mm512_max_epi32(a, _mm512_permutex2var_epi32(a, idx1, a)); - npyv_s32 a2 = _mm512_max_epi32(a1, _mm512_permutex2var_epi32(a1, idx2, a1)); - npyv_s32 a3 = _mm512_max_epi32(a2, _mm512_shuffle_epi32(a2, (_MM_PERM_ENUM)(1<<6 | 0<<4 | 3<<2 | 2))); - npyv_s32 a4 = _mm512_max_epi32(a3, _mm512_shuffle_epi32(a3, (_MM_PERM_ENUM)(2<<6 | 3<<4 | 0<<2 | 1))); - return _mm_cvtsi128_si32(_mm512_extracti32x4_epi32(a4, 0x00)); - } - - NPY_FINLINE npy_float npyv_reducemax_f32(npyv_f32 a) - { - const npyv_u32 idx1 = _mm512_set_epi32(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); - const npyv_u32 idx2 = _mm512_set_epi32(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12); - npyv_f32 a1 = _mm512_max_ps(a, _mm512_permutex2var_ps(a, idx1, a)); - npyv_f32 a2 = _mm512_max_ps(a1, _mm512_permutex2var_ps(a1, idx2, a1)); - npyv_f32 a3 = _mm512_max_ps(a2, _mm512_shuffle_ps(a2, a2, (_MM_PERM_ENUM)(1<<6 | 0<<4 | 3<<2 | 2))); - npyv_f32 a4 = _mm512_max_ps(a3, _mm512_shuffle_ps(a3, a3, (_MM_PERM_ENUM)(2<<6 | 3<<4 | 0<<2 | 1))); - return _mm_cvtss_f32(_mm512_extractf32x4_ps(a4, 0x00)); - } - - NPY_FINLINE npy_uint32 npyv_reducemin_u32(npyv_u32 a) - { - const npyv_u32 idx1 = _mm512_set_epi32(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); - const npyv_u32 idx2 = _mm512_set_epi32(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12); - npyv_u32 a1 = _mm512_min_epu32(a, _mm512_permutex2var_epi32(a, idx1, a)); - npyv_u32 a2 = _mm512_min_epu32(a1, _mm512_permutex2var_epi32(a1, idx2, a1)); - npyv_u32 a3 = _mm512_min_epu32(a2, _mm512_shuffle_epi32(a2, (_MM_PERM_ENUM)(1<<6 | 0<<4 | 3<<2 | 2))); - npyv_u32 a4 = _mm512_min_epu32(a3, _mm512_shuffle_epi32(a3, (_MM_PERM_ENUM)(2<<6 | 3<<4 | 0<<2 | 1))); - return _mm_cvtsi128_si32(_mm512_extracti32x4_epi32(a4, 0x00)); - } - - NPY_FINLINE npy_int32 npyv_reducemin_s32(npyv_s32 a) - { - const npyv_u32 idx1 = _mm512_set_epi32(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); - const npyv_u32 idx2 = _mm512_set_epi32(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12); - npyv_s32 a1 = _mm512_min_epi32(a, 
_mm512_permutex2var_epi32(a, idx1, a)); - npyv_s32 a2 = _mm512_min_epi32(a1, _mm512_permutex2var_epi32(a1, idx2, a1)); - npyv_s32 a3 = _mm512_min_epi32(a2, _mm512_shuffle_epi32(a2, (_MM_PERM_ENUM)(1<<6 | 0<<4 | 3<<2 | 2))); - npyv_s32 a4 = _mm512_min_epi32(a3, _mm512_shuffle_epi32(a3, (_MM_PERM_ENUM)(2<<6 | 3<<4 | 0<<2 | 1))); - return _mm_cvtsi128_si32(_mm512_extracti32x4_epi32(a4, 0x00)); - } - - NPY_FINLINE npy_float npyv_reducemin_f32(npyv_f32 a) - { - const npyv_u32 idx1 = _mm512_set_epi32(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); - const npyv_u32 idx2 = _mm512_set_epi32(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12); - npyv_f32 a1 = _mm512_min_ps(a, _mm512_permutex2var_ps(a, idx1, a)); - npyv_f32 a2 = _mm512_min_ps(a1, _mm512_permutex2var_ps(a1, idx2, a1)); - npyv_f32 a3 = _mm512_min_ps(a2, _mm512_shuffle_ps(a2, a2, (_MM_PERM_ENUM)(1<<6 | 0<<4 | 3<<2 | 2))); - npyv_f32 a4 = _mm512_min_ps(a3, _mm512_shuffle_ps(a3, a3, (_MM_PERM_ENUM)(2<<6 | 3<<4 | 0<<2 | 1))); - return _mm_cvtss_f32(_mm512_extractf32x4_ps(a4, 0x00)); - } - NPY_FINLINE npy_uint32 npyv_sum_u32(npyv_u32 a) { __m256i half = _mm256_add_epi32(npyv512_lower_si256(a), npyv512_higher_si256(a)); @@ -483,6 +411,7 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor) __m512d sum8 = _mm512_add_pd(sum16, h16); return _mm_cvtsd_f64(_mm512_castpd512_pd128(sum8)); } + #endif // expand the source vector and performs sum reduce diff --git a/numpy/core/src/common/simd/avx512/math.h b/numpy/core/src/common/simd/avx512/math.h index 5a6cb6dcd..97fd2d641 100644 --- a/numpy/core/src/common/simd/avx512/math.h +++ b/numpy/core/src/common/simd/avx512/math.h @@ -62,6 +62,18 @@ NPY_FINLINE npyv_f64 npyv_maxp_f64(npyv_f64 a, npyv_f64 b) __mmask8 nn = _mm512_cmp_pd_mask(b, b, _CMP_ORD_Q); return _mm512_mask_max_pd(a, nn, a, b); } +// Maximum, propagates NaNs +// If any of corresponded element is NaN, NaN is set. +NPY_FINLINE npyv_f32 npyv_maxn_f32(npyv_f32 a, npyv_f32 b) +{ + __mmask16 nn = _mm512_cmp_ps_mask(a, a, _CMP_ORD_Q); + return _mm512_mask_max_ps(a, nn, a, b); +} +NPY_FINLINE npyv_f64 npyv_maxn_f64(npyv_f64 a, npyv_f64 b) +{ + __mmask8 nn = _mm512_cmp_pd_mask(a, a, _CMP_ORD_Q); + return _mm512_mask_max_pd(a, nn, a, b); +} // Maximum, integer operations #ifdef NPY_HAVE_AVX512BW #define npyv_max_u8 _mm512_max_epu8 @@ -95,6 +107,18 @@ NPY_FINLINE npyv_f64 npyv_minp_f64(npyv_f64 a, npyv_f64 b) __mmask8 nn = _mm512_cmp_pd_mask(b, b, _CMP_ORD_Q); return _mm512_mask_min_pd(a, nn, a, b); } +// Minimum, propagates NaNs +// If any of corresponded element is NaN, NaN is set. 
+NPY_FINLINE npyv_f32 npyv_minn_f32(npyv_f32 a, npyv_f32 b) +{ + __mmask16 nn = _mm512_cmp_ps_mask(a, a, _CMP_ORD_Q); + return _mm512_mask_min_ps(a, nn, a, b); +} +NPY_FINLINE npyv_f64 npyv_minn_f64(npyv_f64 a, npyv_f64 b) +{ + __mmask8 nn = _mm512_cmp_pd_mask(a, a, _CMP_ORD_Q); + return _mm512_mask_min_pd(a, nn, a, b); +} // Minimum, integer operations #ifdef NPY_HAVE_AVX512BW #define npyv_min_u8 _mm512_min_epu8 @@ -112,6 +136,160 @@ NPY_FINLINE npyv_f64 npyv_minp_f64(npyv_f64 a, npyv_f64 b) #define npyv_min_u64 _mm512_min_epu64 #define npyv_min_s64 _mm512_min_epi64 +#ifdef NPY_HAVE_AVX512F_REDUCE + #define npyv_reduce_min_u32 _mm512_reduce_min_epu32 + #define npyv_reduce_min_s32 _mm512_reduce_min_epi32 + #define npyv_reduce_min_u64 _mm512_reduce_min_epu64 + #define npyv_reduce_min_s64 _mm512_reduce_min_epi64 + #define npyv_reduce_min_f32 _mm512_reduce_min_ps + #define npyv_reduce_min_f64 _mm512_reduce_min_pd + #define npyv_reduce_max_u32 _mm512_reduce_max_epu32 + #define npyv_reduce_max_s32 _mm512_reduce_max_epi32 + #define npyv_reduce_max_u64 _mm512_reduce_max_epu64 + #define npyv_reduce_max_s64 _mm512_reduce_max_epi64 + #define npyv_reduce_max_f32 _mm512_reduce_max_ps + #define npyv_reduce_max_f64 _mm512_reduce_max_pd +#else + // reduce min&max for 32&64-bits + #define NPY_IMPL_AVX512_REDUCE_MINMAX(STYPE, INTRIN, VINTRIN) \ + NPY_FINLINE STYPE##32 npyv_reduce_##INTRIN##32(__m512i a) \ + { \ + __m256i v256 = _mm256_##VINTRIN##32(npyv512_lower_si256(a), \ + npyv512_higher_si256(a)); \ + __m128i v128 = _mm_##VINTRIN##32(_mm256_castsi256_si128(v256), \ + _mm256_extracti128_si256(v256, 1)); \ + __m128i v64 = _mm_##VINTRIN##32(v128, _mm_shuffle_epi32(v128, \ + (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 3, 2))); \ + __m128i v32 = _mm_##VINTRIN##32(v64, _mm_shuffle_epi32(v64, \ + (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 0, 1))); \ + return (STYPE##32)_mm_cvtsi128_si32(v32); \ + } \ + NPY_FINLINE STYPE##64 npyv_reduce_##INTRIN##64(__m512i a) \ + { \ + __m512i v256 = _mm512_##VINTRIN##64(a, \ + _mm512_shuffle_i64x2(a, a, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 3, 2))); \ + __m512i v128 = _mm512_##VINTRIN##64(v256, \ + _mm512_shuffle_i64x2(v256, v256, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 0, 1))); \ + __m512i v64 = _mm512_##VINTRIN##64(v128, \ + _mm512_shuffle_epi32(v128, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 3, 2))); \ + return (STYPE##64)npyv_extract0_u64(v64); \ + } + + NPY_IMPL_AVX512_REDUCE_MINMAX(npy_uint, min_u, min_epu) + NPY_IMPL_AVX512_REDUCE_MINMAX(npy_int, min_s, min_epi) + NPY_IMPL_AVX512_REDUCE_MINMAX(npy_uint, max_u, max_epu) + NPY_IMPL_AVX512_REDUCE_MINMAX(npy_int, max_s, max_epi) + #undef NPY_IMPL_AVX512_REDUCE_MINMAX + // reduce min&max for ps & pd + #define NPY_IMPL_AVX512_REDUCE_MINMAX(INTRIN) \ + NPY_FINLINE float npyv_reduce_##INTRIN##_f32(npyv_f32 a) \ + { \ + __m256 v256 = _mm256_##INTRIN##_ps( \ + npyv512_lower_ps256(a), npyv512_higher_ps256(a)); \ + __m128 v128 = _mm_##INTRIN##_ps( \ + _mm256_castps256_ps128(v256), _mm256_extractf128_ps(v256, 1)); \ + __m128 v64 = _mm_##INTRIN##_ps(v128, \ + _mm_shuffle_ps(v128, v128, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 3, 2))); \ + __m128 v32 = _mm_##INTRIN##_ps(v64, \ + _mm_shuffle_ps(v64, v64, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 0, 1))); \ + return _mm_cvtss_f32(v32); \ + } \ + NPY_FINLINE double npyv_reduce_##INTRIN##_f64(npyv_f64 a) \ + { \ + __m256d v256 = _mm256_##INTRIN##_pd( \ + npyv512_lower_pd256(a), npyv512_higher_pd256(a)); \ + __m128d v128 = _mm_##INTRIN##_pd( \ + _mm256_castpd256_pd128(v256), _mm256_extractf128_pd(v256, 1)); \ + __m128d v64 = 
_mm_##INTRIN##_pd(v128, \ + _mm_shuffle_pd(v128, v128, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 0, 1))); \ + return _mm_cvtsd_f64(v64); \ + } + + NPY_IMPL_AVX512_REDUCE_MINMAX(min) + NPY_IMPL_AVX512_REDUCE_MINMAX(max) + #undef NPY_IMPL_AVX512_REDUCE_MINMAX +#endif +#define NPY_IMPL_AVX512_REDUCE_MINMAX(INTRIN, INF, INF64) \ + NPY_FINLINE float npyv_reduce_##INTRIN##p_f32(npyv_f32 a) \ + { \ + npyv_b32 notnan = npyv_notnan_f32(a); \ + if (NPY_UNLIKELY(!npyv_any_b32(notnan))) { \ + return _mm_cvtss_f32(_mm512_castps512_ps128(a)); \ + } \ + a = npyv_select_f32(notnan, a, \ + npyv_reinterpret_f32_u32(npyv_setall_u32(INF))); \ + return npyv_reduce_##INTRIN##_f32(a); \ + } \ + NPY_FINLINE double npyv_reduce_##INTRIN##p_f64(npyv_f64 a) \ + { \ + npyv_b64 notnan = npyv_notnan_f64(a); \ + if (NPY_UNLIKELY(!npyv_any_b64(notnan))) { \ + return _mm_cvtsd_f64(_mm512_castpd512_pd128(a)); \ + } \ + a = npyv_select_f64(notnan, a, \ + npyv_reinterpret_f64_u64(npyv_setall_u64(INF64))); \ + return npyv_reduce_##INTRIN##_f64(a); \ + } \ + NPY_FINLINE float npyv_reduce_##INTRIN##n_f32(npyv_f32 a) \ + { \ + npyv_b32 notnan = npyv_notnan_f32(a); \ + if (NPY_UNLIKELY(!npyv_all_b32(notnan))) { \ + const union { npy_uint32 i; float f;} pnan = { \ + 0x7fc00000ul \ + }; \ + return pnan.f; \ + } \ + return npyv_reduce_##INTRIN##_f32(a); \ + } \ + NPY_FINLINE double npyv_reduce_##INTRIN##n_f64(npyv_f64 a) \ + { \ + npyv_b64 notnan = npyv_notnan_f64(a); \ + if (NPY_UNLIKELY(!npyv_all_b64(notnan))) { \ + const union { npy_uint64 i; double d;} pnan = { \ + 0x7ff8000000000000ull \ + }; \ + return pnan.d; \ + } \ + return npyv_reduce_##INTRIN##_f64(a); \ + } + +NPY_IMPL_AVX512_REDUCE_MINMAX(min, 0x7f800000, 0x7ff0000000000000) +NPY_IMPL_AVX512_REDUCE_MINMAX(max, 0xff800000, 0xfff0000000000000) +#undef NPY_IMPL_AVX512_REDUCE_MINMAX + +// reduce min&max for 8&16-bits +#define NPY_IMPL_AVX512_REDUCE_MINMAX(STYPE, INTRIN, VINTRIN) \ + NPY_FINLINE STYPE##16 npyv_reduce_##INTRIN##16(__m512i a) \ + { \ + __m256i v256 = _mm256_##VINTRIN##16(npyv512_lower_si256(a), npyv512_higher_si256(a)); \ + __m128i v128 = _mm_##VINTRIN##16(_mm256_castsi256_si128(v256), _mm256_extracti128_si256(v256, 1)); \ + __m128i v64 = _mm_##VINTRIN##16(v128, _mm_shuffle_epi32(v128, \ + (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 3, 2))); \ + __m128i v32 = _mm_##VINTRIN##16(v64, _mm_shuffle_epi32(v64, \ + (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 0, 1))); \ + __m128i v16 = _mm_##VINTRIN##16(v32, _mm_shufflelo_epi16(v32, \ + (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 0, 1))); \ + return (STYPE##16)_mm_cvtsi128_si32(v16); \ + } \ + NPY_FINLINE STYPE##8 npyv_reduce_##INTRIN##8(__m512i a) \ + { \ + __m256i v256 = _mm256_##VINTRIN##8(npyv512_lower_si256(a), npyv512_higher_si256(a)); \ + __m128i v128 = _mm_##VINTRIN##8(_mm256_castsi256_si128(v256), _mm256_extracti128_si256(v256, 1)); \ + __m128i v64 = _mm_##VINTRIN##8(v128, _mm_shuffle_epi32(v128, \ + (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 3, 2))); \ + __m128i v32 = _mm_##VINTRIN##8(v64, _mm_shuffle_epi32(v64, \ + (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 0, 1))); \ + __m128i v16 = _mm_##VINTRIN##8(v32, _mm_shufflelo_epi16(v32, \ + (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 0, 1))); \ + __m128i v8 = _mm_##VINTRIN##8(v16, _mm_srli_epi16(v16, 8)); \ + return (STYPE##16)_mm_cvtsi128_si32(v8); \ + } +NPY_IMPL_AVX512_REDUCE_MINMAX(npy_uint, min_u, min_epu) +NPY_IMPL_AVX512_REDUCE_MINMAX(npy_int, min_s, min_epi) +NPY_IMPL_AVX512_REDUCE_MINMAX(npy_uint, max_u, max_epu) +NPY_IMPL_AVX512_REDUCE_MINMAX(npy_int, max_s, max_epi) +#undef NPY_IMPL_AVX512_REDUCE_MINMAX + // round to nearest 
integer even #define npyv_rint_f32(A) _mm512_roundscale_ps(A, _MM_FROUND_TO_NEAREST_INT) #define npyv_rint_f64(A) _mm512_roundscale_pd(A, _MM_FROUND_TO_NEAREST_INT) diff --git a/numpy/core/src/common/simd/avx512/misc.h b/numpy/core/src/common/simd/avx512/misc.h index c3039ecfe..d9190870e 100644 --- a/numpy/core/src/common/simd/avx512/misc.h +++ b/numpy/core/src/common/simd/avx512/misc.h @@ -34,7 +34,7 @@ NPY_FINLINE __m512i npyv__setr_epi64( NPY_FINLINE npyv_u64 npyv_setall_u64(npy_uint64 a) { npy_int64 ai = (npy_int64)a; -#if defined(_MSC_VER) && defined(_M_IX86) +#if defined(_MSC_VER) && defined(_M_IX86) return npyv__setr_epi64(ai, ai, ai, ai, ai, ai, ai, ai); #else return _mm512_set1_epi64(ai); @@ -160,6 +160,18 @@ NPY_FINLINE __m512d npyv__setr_pd(double i0, double i1, double i2, double i3, #define npyv_select_f32(MASK, A, B) _mm512_mask_blend_ps(MASK, B, A) #define npyv_select_f64(MASK, A, B) _mm512_mask_blend_pd(MASK, B, A) +// extract the first vector's lane +#define npyv_extract0_u8(A) ((npy_uint8)_mm_cvtsi128_si32(_mm512_castsi512_si128(A))) +#define npyv_extract0_s8(A) ((npy_int8)_mm_cvtsi128_si32(_mm512_castsi512_si128(A))) +#define npyv_extract0_u16(A) ((npy_uint16)_mm_cvtsi128_si32(_mm512_castsi512_si128(A))) +#define npyv_extract0_s16(A) ((npy_int16)_mm_cvtsi128_si32(_mm512_castsi512_si128(A))) +#define npyv_extract0_u32(A) ((npy_uint32)_mm_cvtsi128_si32(_mm512_castsi512_si128(A))) +#define npyv_extract0_s32(A) ((npy_int32)_mm_cvtsi128_si32(_mm512_castsi512_si128(A))) +#define npyv_extract0_u64(A) ((npy_uint64)npyv128_cvtsi128_si64(_mm512_castsi512_si128(A))) +#define npyv_extract0_s64(A) ((npy_int64)npyv128_cvtsi128_si64(_mm512_castsi512_si128(A))) +#define npyv_extract0_f32(A) _mm_cvtss_f32(_mm512_castps512_ps128(A)) +#define npyv_extract0_f64(A) _mm_cvtsd_f64(_mm512_castpd512_pd128(A)) + // reinterpret #define npyv_reinterpret_u8_u8(X) X #define npyv_reinterpret_u8_s8(X) X diff --git a/numpy/core/src/common/simd/avx512/operators.h b/numpy/core/src/common/simd/avx512/operators.h index 804cd24e8..c70932d5f 100644 --- a/numpy/core/src/common/simd/avx512/operators.h +++ b/numpy/core/src/common/simd/avx512/operators.h @@ -5,6 +5,8 @@ #ifndef _NPY_SIMD_AVX512_OPERATORS_H #define _NPY_SIMD_AVX512_OPERATORS_H +#include "conversion.h" // tobits + /*************************** * Shifting ***************************/ @@ -336,4 +338,43 @@ NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a) NPY_FINLINE npyv_b64 npyv_notnan_f64(npyv_f64 a) { return _mm512_cmp_pd_mask(a, a, _CMP_ORD_Q); } +// Test cross all vector lanes +// any: returns true if any of the elements is not equal to zero +// all: returns true if all elements are not equal to zero +#define NPYV_IMPL_AVX512_ANYALL(SFX, MASK) \ + NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a) \ + { return npyv_tobits_##SFX(a) != 0; } \ + NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a) \ + { return npyv_tobits_##SFX(a) == MASK; } +NPYV_IMPL_AVX512_ANYALL(b8, 0xffffffffffffffffull) +NPYV_IMPL_AVX512_ANYALL(b16, 0xfffffffful) +NPYV_IMPL_AVX512_ANYALL(b32, 0xffff) +NPYV_IMPL_AVX512_ANYALL(b64, 0xff) +#undef NPYV_IMPL_AVX512_ANYALL + +#define NPYV_IMPL_AVX512_ANYALL(SFX, BSFX, MASK) \ + NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a) \ + { \ + return npyv_tobits_##BSFX( \ + npyv_cmpeq_##SFX(a, npyv_zero_##SFX()) \ + ) != MASK; \ + } \ + NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a) \ + { \ + return npyv_tobits_##BSFX( \ + npyv_cmpeq_##SFX(a, npyv_zero_##SFX()) \ + ) == 0; \ + } +NPYV_IMPL_AVX512_ANYALL(u8, b8, 0xffffffffffffffffull) 
+NPYV_IMPL_AVX512_ANYALL(s8, b8, 0xffffffffffffffffull) +NPYV_IMPL_AVX512_ANYALL(u16, b16, 0xfffffffful) +NPYV_IMPL_AVX512_ANYALL(s16, b16, 0xfffffffful) +NPYV_IMPL_AVX512_ANYALL(u32, b32, 0xffff) +NPYV_IMPL_AVX512_ANYALL(s32, b32, 0xffff) +NPYV_IMPL_AVX512_ANYALL(u64, b64, 0xff) +NPYV_IMPL_AVX512_ANYALL(s64, b64, 0xff) +NPYV_IMPL_AVX512_ANYALL(f32, b32, 0xffff) +NPYV_IMPL_AVX512_ANYALL(f64, b64, 0xff) +#undef NPYV_IMPL_AVX512_ANYALL + #endif // _NPY_SIMD_AVX512_OPERATORS_H diff --git a/numpy/core/src/common/simd/neon/math.h b/numpy/core/src/common/simd/neon/math.h index 8f4680c8f..c0a771b5d 100644 --- a/numpy/core/src/common/simd/neon/math.h +++ b/numpy/core/src/common/simd/neon/math.h @@ -99,8 +99,12 @@ NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a) return vmaxq_f32(vbslq_f32(nn_a, a, b), vbslq_f32(nn_b, b, a)); } #endif +// Max, propagates NaNs +// If any of corresponded element is NaN, NaN is set. +#define npyv_maxn_f32 vmaxq_f32 #if NPY_SIMD_F64 #define npyv_maxp_f64 vmaxnmq_f64 + #define npyv_maxn_f64 vmaxq_f64 #endif // NPY_SIMD_F64 // Maximum, integer operations #define npyv_max_u8 vmaxq_u8 @@ -134,9 +138,14 @@ NPY_FINLINE npyv_s64 npyv_max_s64(npyv_s64 a, npyv_s64 b) return vminq_f32(vbslq_f32(nn_a, a, b), vbslq_f32(nn_b, b, a)); } #endif +// Min, propagates NaNs +// If any of corresponded element is NaN, NaN is set. +#define npyv_minn_f32 vminq_f32 #if NPY_SIMD_F64 #define npyv_minp_f64 vminnmq_f64 + #define npyv_minn_f64 vminq_f64 #endif // NPY_SIMD_F64 + // Minimum, integer operations #define npyv_min_u8 vminq_u8 #define npyv_min_s8 vminq_s8 @@ -152,6 +161,115 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b) { return vbslq_s64(npyv_cmplt_s64(a, b), a, b); } +// reduce min/max for all data types +#if NPY_SIMD_F64 + #define npyv_reduce_max_u8 vmaxvq_u8 + #define npyv_reduce_max_s8 vmaxvq_s8 + #define npyv_reduce_max_u16 vmaxvq_u16 + #define npyv_reduce_max_s16 vmaxvq_s16 + #define npyv_reduce_max_u32 vmaxvq_u32 + #define npyv_reduce_max_s32 vmaxvq_s32 + + #define npyv_reduce_max_f32 vmaxvq_f32 + #define npyv_reduce_max_f64 vmaxvq_f64 + #define npyv_reduce_maxn_f32 vmaxvq_f32 + #define npyv_reduce_maxn_f64 vmaxvq_f64 + #define npyv_reduce_maxp_f32 vmaxnmvq_f32 + #define npyv_reduce_maxp_f64 vmaxnmvq_f64 + + #define npyv_reduce_min_u8 vminvq_u8 + #define npyv_reduce_min_s8 vminvq_s8 + #define npyv_reduce_min_u16 vminvq_u16 + #define npyv_reduce_min_s16 vminvq_s16 + #define npyv_reduce_min_u32 vminvq_u32 + #define npyv_reduce_min_s32 vminvq_s32 + + #define npyv_reduce_min_f32 vminvq_f32 + #define npyv_reduce_min_f64 vminvq_f64 + #define npyv_reduce_minn_f32 vminvq_f32 + #define npyv_reduce_minn_f64 vminvq_f64 + #define npyv_reduce_minp_f32 vminnmvq_f32 + #define npyv_reduce_minp_f64 vminnmvq_f64 +#else + #define NPY_IMPL_NEON_REDUCE_MINMAX(INTRIN, STYPE, SFX) \ + NPY_FINLINE npy_##STYPE npyv_reduce_##INTRIN##_##SFX(npyv_##SFX a) \ + { \ + STYPE##x8_t r = vp##INTRIN##_##SFX(vget_low_##SFX(a), vget_high_##SFX(a)); \ + r = vp##INTRIN##_##SFX(r, r); \ + r = vp##INTRIN##_##SFX(r, r); \ + r = vp##INTRIN##_##SFX(r, r); \ + return (npy_##STYPE)vget_lane_##SFX(r, 0); \ + } + NPY_IMPL_NEON_REDUCE_MINMAX(min, uint8, u8) + NPY_IMPL_NEON_REDUCE_MINMAX(max, uint8, u8) + NPY_IMPL_NEON_REDUCE_MINMAX(min, int8, s8) + NPY_IMPL_NEON_REDUCE_MINMAX(max, int8, s8) + #undef NPY_IMPL_NEON_REDUCE_MINMAX + + #define NPY_IMPL_NEON_REDUCE_MINMAX(INTRIN, STYPE, SFX) \ + NPY_FINLINE npy_##STYPE npyv_reduce_##INTRIN##_##SFX(npyv_##SFX a) \ + { \ + STYPE##x4_t r = vp##INTRIN##_##SFX(vget_low_##SFX(a), 
vget_high_##SFX(a)); \ + r = vp##INTRIN##_##SFX(r, r); \ + r = vp##INTRIN##_##SFX(r, r); \ + return (npy_##STYPE)vget_lane_##SFX(r, 0); \ + } + NPY_IMPL_NEON_REDUCE_MINMAX(min, uint16, u16) + NPY_IMPL_NEON_REDUCE_MINMAX(max, uint16, u16) + NPY_IMPL_NEON_REDUCE_MINMAX(min, int16, s16) + NPY_IMPL_NEON_REDUCE_MINMAX(max, int16, s16) + #undef NPY_IMPL_NEON_REDUCE_MINMAX + + #define NPY_IMPL_NEON_REDUCE_MINMAX(INTRIN, STYPE, SFX) \ + NPY_FINLINE npy_##STYPE npyv_reduce_##INTRIN##_##SFX(npyv_##SFX a) \ + { \ + STYPE##x2_t r = vp##INTRIN##_##SFX(vget_low_##SFX(a), vget_high_##SFX(a)); \ + r = vp##INTRIN##_##SFX(r, r); \ + return (npy_##STYPE)vget_lane_##SFX(r, 0); \ + } + NPY_IMPL_NEON_REDUCE_MINMAX(min, uint32, u32) + NPY_IMPL_NEON_REDUCE_MINMAX(max, uint32, u32) + NPY_IMPL_NEON_REDUCE_MINMAX(min, int32, s32) + NPY_IMPL_NEON_REDUCE_MINMAX(max, int32, s32) + #undef NPY_IMPL_NEON_REDUCE_MINMAX + + #define NPY_IMPL_NEON_REDUCE_MINMAX(INTRIN, INF) \ + NPY_FINLINE float npyv_reduce_##INTRIN##_f32(npyv_f32 a) \ + { \ + float32x2_t r = vp##INTRIN##_f32(vget_low_f32(a), vget_high_f32(a));\ + r = vp##INTRIN##_f32(r, r); \ + return vget_lane_f32(r, 0); \ + } \ + NPY_FINLINE float npyv_reduce_##INTRIN##p_f32(npyv_f32 a) \ + { \ + npyv_b32 notnan = npyv_notnan_f32(a); \ + if (NPY_UNLIKELY(!npyv_any_b32(notnan))) { \ + return vgetq_lane_f32(a, 0); \ + } \ + a = npyv_select_f32(notnan, a, \ + npyv_reinterpret_f32_u32(npyv_setall_u32(INF))); \ + return npyv_reduce_##INTRIN##_f32(a); \ + } \ + NPY_FINLINE float npyv_reduce_##INTRIN##n_f32(npyv_f32 a) \ + { \ + return npyv_reduce_##INTRIN##_f32(a); \ + } + NPY_IMPL_NEON_REDUCE_MINMAX(min, 0x7f800000) + NPY_IMPL_NEON_REDUCE_MINMAX(max, 0xff800000) + #undef NPY_IMPL_NEON_REDUCE_MINMAX +#endif // NPY_SIMD_F64 +#define NPY_IMPL_NEON_REDUCE_MINMAX(INTRIN, STYPE, SFX, OP) \ + NPY_FINLINE STYPE npyv_reduce_##INTRIN##_##SFX(npyv_##SFX a) \ + { \ + STYPE al = (STYPE)vget_low_##SFX(a); \ + STYPE ah = (STYPE)vget_high_##SFX(a); \ + return al OP ah ? 
al : ah; \ + } +NPY_IMPL_NEON_REDUCE_MINMAX(max, npy_uint64, u64, >) +NPY_IMPL_NEON_REDUCE_MINMAX(max, npy_int64, s64, >) +NPY_IMPL_NEON_REDUCE_MINMAX(min, npy_uint64, u64, <) +NPY_IMPL_NEON_REDUCE_MINMAX(min, npy_int64, s64, <) +#undef NPY_IMPL_NEON_REDUCE_MINMAX // round to nearest integer even NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a) diff --git a/numpy/core/src/common/simd/neon/misc.h b/numpy/core/src/common/simd/neon/misc.h index 51b0c3858..5fe109c13 100644 --- a/numpy/core/src/common/simd/neon/misc.h +++ b/numpy/core/src/common/simd/neon/misc.h @@ -138,6 +138,18 @@ NPY_FINLINE float64x2_t npyv__set_f64(double i0, double i1) #define npyv_select_f32 vbslq_f32 #define npyv_select_f64 vbslq_f64 +// extract the first vector's lane +#define npyv_extract0_u8(A) ((npy_uint8)vgetq_lane_u8(A, 0)) +#define npyv_extract0_s8(A) ((npy_int8)vgetq_lane_s8(A, 0)) +#define npyv_extract0_u16(A) ((npy_uint16)vgetq_lane_u16(A, 0)) +#define npyv_extract0_s16(A) ((npy_int16)vgetq_lane_s16(A, 0)) +#define npyv_extract0_u32(A) ((npy_uint32)vgetq_lane_u32(A, 0)) +#define npyv_extract0_s32(A) ((npy_int32)vgetq_lane_s32(A, 0)) +#define npyv_extract0_u64(A) ((npy_uint64)vgetq_lane_u64(A, 0)) +#define npyv_extract0_s64(A) ((npy_int64)vgetq_lane_s64(A, 0)) +#define npyv_extract0_f32(A) vgetq_lane_f32(A, 0) +#define npyv_extract0_f64(A) vgetq_lane_f64(A, 0) + // Reinterpret #define npyv_reinterpret_u8_u8(X) X #define npyv_reinterpret_u8_s8 vreinterpretq_u8_s8 diff --git a/numpy/core/src/common/simd/neon/operators.h b/numpy/core/src/common/simd/neon/operators.h index a08fa5390..249621bd6 100644 --- a/numpy/core/src/common/simd/neon/operators.h +++ b/numpy/core/src/common/simd/neon/operators.h @@ -246,4 +246,129 @@ NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a) { return vceqq_f64(a, a); } #endif +// Test cross all vector lanes +// any: returns true if any of the elements is not equal to zero +// all: returns true if all elements are not equal to zero +#if NPY_SIMD_F64 + #define NPYV_IMPL_NEON_ANYALL(LEN) \ + NPY_FINLINE bool npyv_any_b##LEN(npyv_b##LEN a) \ + { return vmaxvq_u##LEN(a) != 0; } \ + NPY_FINLINE bool npyv_all_b##LEN(npyv_b##LEN a) \ + { return vminvq_u##LEN(a) != 0; } + NPYV_IMPL_NEON_ANYALL(8) + NPYV_IMPL_NEON_ANYALL(16) + NPYV_IMPL_NEON_ANYALL(32) + #undef NPYV_IMPL_NEON_ANYALL + + #define NPYV_IMPL_NEON_ANYALL(SFX, USFX, BSFX) \ + NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a) \ + { return npyv_any_##BSFX(npyv_reinterpret_##USFX##_##SFX(a)); } \ + NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a) \ + { return npyv_all_##BSFX(npyv_reinterpret_##USFX##_##SFX(a)); } + NPYV_IMPL_NEON_ANYALL(u8, u8, b8) + NPYV_IMPL_NEON_ANYALL(s8, u8, b8) + NPYV_IMPL_NEON_ANYALL(u16, u16, b16) + NPYV_IMPL_NEON_ANYALL(s16, u16, b16) + NPYV_IMPL_NEON_ANYALL(u32, u32, b32) + NPYV_IMPL_NEON_ANYALL(s32, u32, b32) + #undef NPYV_IMPL_NEON_ANYALL + + NPY_FINLINE bool npyv_any_b64(npyv_b64 a) + { return vmaxvq_u32(vreinterpretq_u32_u64(a)) != 0; } + NPY_FINLINE bool npyv_all_b64(npyv_b64 a) + { return vminvq_u32(vreinterpretq_u32_u64(a)) != 0; } + #define npyv_any_u64 npyv_any_b64 + NPY_FINLINE bool npyv_all_u64(npyv_u64 a) + { + uint32x4_t a32 = vreinterpretq_u32_u64(a); + a32 = vorrq_u32(a32, vrev64q_u32(a32)); + return vminvq_u32(a32) != 0; + } + NPY_FINLINE bool npyv_any_s64(npyv_s64 a) + { return npyv_any_u64(vreinterpretq_u64_s64(a)); } + NPY_FINLINE bool npyv_all_s64(npyv_s64 a) + { return npyv_all_u64(vreinterpretq_u64_s64(a)); } + + #define NPYV_IMPL_NEON_ANYALL(SFX, BSFX) \ + NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX 
a) \ + { return !npyv_all_##BSFX(npyv_cmpeq_##SFX(a, npyv_zero_##SFX())); } \ + NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a) \ + { return !npyv_any_##BSFX(npyv_cmpeq_##SFX(a, npyv_zero_##SFX())); } + NPYV_IMPL_NEON_ANYALL(f32, b32) + NPYV_IMPL_NEON_ANYALL(f64, b64) + #undef NPYV_IMPL_NEON_ANYALL +#else + #define NPYV_IMPL_NEON_ANYALL(LEN) \ + NPY_FINLINE bool npyv_any_b##LEN(npyv_b##LEN a) \ + { \ + int64x2_t a64 = vreinterpretq_s64_u##LEN(a); \ + return ( \ + vgetq_lane_s64(a64, 0) | \ + vgetq_lane_s64(a64, 1) \ + ) != 0; \ + } \ + NPY_FINLINE bool npyv_all_b##LEN(npyv_b##LEN a) \ + { \ + int64x2_t a64 = vreinterpretq_s64_u##LEN(a); \ + return ( \ + vgetq_lane_s64(a64, 0) & \ + vgetq_lane_s64(a64, 1) \ + ) == -1; \ + } + NPYV_IMPL_NEON_ANYALL(8) + NPYV_IMPL_NEON_ANYALL(16) + NPYV_IMPL_NEON_ANYALL(32) + NPYV_IMPL_NEON_ANYALL(64) + #undef NPYV_IMPL_NEON_ANYALL + + #define NPYV_IMPL_NEON_ANYALL(SFX, USFX) \ + NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a) \ + { \ + int64x2_t a64 = vreinterpretq_s64_##SFX(a); \ + return ( \ + vgetq_lane_s64(a64, 0) | \ + vgetq_lane_s64(a64, 1) \ + ) != 0; \ + } \ + NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a) \ + { \ + npyv_##USFX tz = npyv_cmpeq_##SFX( \ + a, npyv_zero_##SFX() \ + ); \ + int64x2_t a64 = vreinterpretq_s64_##USFX(tz); \ + return ( \ + vgetq_lane_s64(a64, 0) | \ + vgetq_lane_s64(a64, 1) \ + ) == 0; \ + } + NPYV_IMPL_NEON_ANYALL(u8, u8) + NPYV_IMPL_NEON_ANYALL(s8, u8) + NPYV_IMPL_NEON_ANYALL(u16, u16) + NPYV_IMPL_NEON_ANYALL(s16, u16) + NPYV_IMPL_NEON_ANYALL(u32, u32) + NPYV_IMPL_NEON_ANYALL(s32, u32) + #undef NPYV_IMPL_NEON_ANYALL + + NPY_FINLINE bool npyv_any_f32(npyv_f32 a) + { + uint32x4_t tz = npyv_cmpeq_f32(a, npyv_zero_f32()); + int64x2_t a64 = vreinterpretq_s64_u32(tz); + return (vgetq_lane_s64(a64, 0) & vgetq_lane_s64(a64, 1)) != -1ll; + } + NPY_FINLINE bool npyv_all_f32(npyv_f32 a) + { + uint32x4_t tz = npyv_cmpeq_f32(a, npyv_zero_f32()); + int64x2_t a64 = vreinterpretq_s64_u32(tz); + return (vgetq_lane_s64(a64, 0) | vgetq_lane_s64(a64, 1)) == 0; + } + NPY_FINLINE bool npyv_any_s64(npyv_s64 a) + { return (vgetq_lane_s64(a, 0) | vgetq_lane_s64(a, 1)) != 0; } + NPY_FINLINE bool npyv_all_s64(npyv_s64 a) + { return vgetq_lane_s64(a, 0) && vgetq_lane_s64(a, 1); } + NPY_FINLINE bool npyv_any_u64(npyv_u64 a) + { return (vgetq_lane_u64(a, 0) | vgetq_lane_u64(a, 1)) != 0; } + NPY_FINLINE bool npyv_all_u64(npyv_u64 a) + { return vgetq_lane_u64(a, 0) && vgetq_lane_u64(a, 1); } +#endif // NPY_SIMD_F64 + #endif // _NPY_SIMD_NEON_OPERATORS_H diff --git a/numpy/core/src/common/simd/simd.h b/numpy/core/src/common/simd/simd.h index b1492500f..92a77ad80 100644 --- a/numpy/core/src/common/simd/simd.h +++ b/numpy/core/src/common/simd/simd.h @@ -8,6 +8,10 @@ * TODO: Add an independent sphinx doc. */ #include "numpy/npy_common.h" +#ifndef __cplusplus + #include <stdbool.h> +#endif + #include "npy_cpu_dispatch.h" #include "simd_utils.h" diff --git a/numpy/core/src/common/simd/sse/math.h b/numpy/core/src/common/simd/sse/math.h index e4b77b671..b7f8e6ebb 100644 --- a/numpy/core/src/common/simd/sse/math.h +++ b/numpy/core/src/common/simd/sse/math.h @@ -45,15 +45,27 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a) // - Only if both corresponded elements are NaN, NaN is set. 
NPY_FINLINE npyv_f32 npyv_maxp_f32(npyv_f32 a, npyv_f32 b) { - __m128 nn = _mm_cmpord_ps(b, b); + __m128i nn = npyv_notnan_f32(b); __m128 max = _mm_max_ps(a, b); - return npyv_select_f32(_mm_castps_si128(nn), max, a); + return npyv_select_f32(nn, max, a); } NPY_FINLINE npyv_f64 npyv_maxp_f64(npyv_f64 a, npyv_f64 b) { - __m128d nn = _mm_cmpord_pd(b, b); + __m128i nn = npyv_notnan_f64(b); __m128d max = _mm_max_pd(a, b); - return npyv_select_f64(_mm_castpd_si128(nn), max, a); + return npyv_select_f64(nn, max, a); +} +NPY_FINLINE npyv_f32 npyv_maxn_f32(npyv_f32 a, npyv_f32 b) +{ + __m128i nn = npyv_notnan_f32(a); + __m128 max = _mm_max_ps(a, b); + return npyv_select_f32(nn, max, a); +} +NPY_FINLINE npyv_f64 npyv_maxn_f64(npyv_f64 a, npyv_f64 b) +{ + __m128i nn = npyv_notnan_f64(a); + __m128d max = _mm_max_pd(a, b); + return npyv_select_f64(nn, max, a); } // Maximum, integer operations #ifdef NPY_HAVE_SSE41 @@ -98,15 +110,27 @@ NPY_FINLINE npyv_s64 npyv_max_s64(npyv_s64 a, npyv_s64 b) // - Only if both corresponded elements are NaN, NaN is set. NPY_FINLINE npyv_f32 npyv_minp_f32(npyv_f32 a, npyv_f32 b) { - __m128 nn = _mm_cmpord_ps(b, b); + __m128i nn = npyv_notnan_f32(b); __m128 min = _mm_min_ps(a, b); - return npyv_select_f32(_mm_castps_si128(nn), min, a); + return npyv_select_f32(nn, min, a); } NPY_FINLINE npyv_f64 npyv_minp_f64(npyv_f64 a, npyv_f64 b) { - __m128d nn = _mm_cmpord_pd(b, b); + __m128i nn = npyv_notnan_f64(b); __m128d min = _mm_min_pd(a, b); - return npyv_select_f64(_mm_castpd_si128(nn), min, a); + return npyv_select_f64(nn, min, a); +} +NPY_FINLINE npyv_f32 npyv_minn_f32(npyv_f32 a, npyv_f32 b) +{ + __m128i nn = npyv_notnan_f32(a); + __m128 min = _mm_min_ps(a, b); + return npyv_select_f32(nn, min, a); +} +NPY_FINLINE npyv_f64 npyv_minn_f64(npyv_f64 a, npyv_f64 b) +{ + __m128i nn = npyv_notnan_f64(a); + __m128d min = _mm_min_pd(a, b); + return npyv_select_f64(nn, min, a); } // Minimum, integer operations #ifdef NPY_HAVE_SSE41 @@ -143,6 +167,102 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b) return npyv_select_s64(npyv_cmplt_s64(a, b), a, b); } +// reduce min&max for 32&64-bits +#define NPY_IMPL_SSE_REDUCE_MINMAX(STYPE, INTRIN, VINTRIN) \ + NPY_FINLINE STYPE##32 npyv_reduce_##INTRIN##32(__m128i a) \ + { \ + __m128i v64 = npyv_##INTRIN##32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 2))); \ + __m128i v32 = npyv_##INTRIN##32(v64, _mm_shuffle_epi32(v64, _MM_SHUFFLE(0, 0, 0, 1))); \ + return (STYPE##32)_mm_cvtsi128_si32(v32); \ + } \ + NPY_FINLINE STYPE##64 npyv_reduce_##INTRIN##64(__m128i a) \ + { \ + __m128i v64 = npyv_##INTRIN##64(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 2))); \ + return (STYPE##64)npyv_extract0_u64(v64); \ + } + +NPY_IMPL_SSE_REDUCE_MINMAX(npy_uint, min_u, min_epu) +NPY_IMPL_SSE_REDUCE_MINMAX(npy_int, min_s, min_epi) +NPY_IMPL_SSE_REDUCE_MINMAX(npy_uint, max_u, max_epu) +NPY_IMPL_SSE_REDUCE_MINMAX(npy_int, max_s, max_epi) +#undef NPY_IMPL_SSE_REDUCE_MINMAX +// reduce min&max for ps & pd +#define NPY_IMPL_SSE_REDUCE_MINMAX(INTRIN, INF, INF64) \ + NPY_FINLINE float npyv_reduce_##INTRIN##_f32(npyv_f32 a) \ + { \ + __m128 v64 = _mm_##INTRIN##_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 3, 2))); \ + __m128 v32 = _mm_##INTRIN##_ps(v64, _mm_shuffle_ps(v64, v64, _MM_SHUFFLE(0, 0, 0, 1))); \ + return _mm_cvtss_f32(v32); \ + } \ + NPY_FINLINE double npyv_reduce_##INTRIN##_f64(npyv_f64 a) \ + { \ + __m128d v64 = _mm_##INTRIN##_pd(a, _mm_shuffle_pd(a, a, _MM_SHUFFLE(0, 0, 0, 1))); \ + return _mm_cvtsd_f64(v64); \ + } \ + NPY_FINLINE float 
npyv_reduce_##INTRIN##p_f32(npyv_f32 a) \ + { \ + npyv_b32 notnan = npyv_notnan_f32(a); \ + if (NPY_UNLIKELY(!npyv_any_b32(notnan))) { \ + return _mm_cvtss_f32(a); \ + } \ + a = npyv_select_f32(notnan, a, npyv_reinterpret_f32_u32(npyv_setall_u32(INF))); \ + return npyv_reduce_##INTRIN##_f32(a); \ + } \ + NPY_FINLINE double npyv_reduce_##INTRIN##p_f64(npyv_f64 a) \ + { \ + npyv_b64 notnan = npyv_notnan_f64(a); \ + if (NPY_UNLIKELY(!npyv_any_b64(notnan))) { \ + return _mm_cvtsd_f64(a); \ + } \ + a = npyv_select_f64(notnan, a, npyv_reinterpret_f64_u64(npyv_setall_u64(INF64))); \ + return npyv_reduce_##INTRIN##_f64(a); \ + } \ + NPY_FINLINE float npyv_reduce_##INTRIN##n_f32(npyv_f32 a) \ + { \ + npyv_b32 notnan = npyv_notnan_f32(a); \ + if (NPY_UNLIKELY(!npyv_all_b32(notnan))) { \ + const union { npy_uint32 i; float f;} pnan = {0x7fc00000UL}; \ + return pnan.f; \ + } \ + return npyv_reduce_##INTRIN##_f32(a); \ + } \ + NPY_FINLINE double npyv_reduce_##INTRIN##n_f64(npyv_f64 a) \ + { \ + npyv_b64 notnan = npyv_notnan_f64(a); \ + if (NPY_UNLIKELY(!npyv_all_b64(notnan))) { \ + const union { npy_uint64 i; double d;} pnan = {0x7ff8000000000000ull}; \ + return pnan.d; \ + } \ + return npyv_reduce_##INTRIN##_f64(a); \ + } + +NPY_IMPL_SSE_REDUCE_MINMAX(min, 0x7f800000, 0x7ff0000000000000) +NPY_IMPL_SSE_REDUCE_MINMAX(max, 0xff800000, 0xfff0000000000000) +#undef NPY_IMPL_SSE_REDUCE_MINMAX + +// reduce min&max for 8&16-bits +#define NPY_IMPL_SSE_REDUCE_MINMAX(STYPE, INTRIN) \ + NPY_FINLINE STYPE##16 npyv_reduce_##INTRIN##16(__m128i a) \ + { \ + __m128i v64 = npyv_##INTRIN##16(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 2))); \ + __m128i v32 = npyv_##INTRIN##16(v64, _mm_shuffle_epi32(v64, _MM_SHUFFLE(0, 0, 0, 1))); \ + __m128i v16 = npyv_##INTRIN##16(v32, _mm_shufflelo_epi16(v32, _MM_SHUFFLE(0, 0, 0, 1))); \ + return (STYPE##16)_mm_cvtsi128_si32(v16); \ + } \ + NPY_FINLINE STYPE##8 npyv_reduce_##INTRIN##8(__m128i a) \ + { \ + __m128i v64 = npyv_##INTRIN##8(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 2))); \ + __m128i v32 = npyv_##INTRIN##8(v64, _mm_shuffle_epi32(v64, _MM_SHUFFLE(0, 0, 0, 1))); \ + __m128i v16 = npyv_##INTRIN##8(v32, _mm_shufflelo_epi16(v32, _MM_SHUFFLE(0, 0, 0, 1))); \ + __m128i v8 = npyv_##INTRIN##8(v16, _mm_srli_epi16(v16, 8)); \ + return (STYPE##16)_mm_cvtsi128_si32(v8); \ + } +NPY_IMPL_SSE_REDUCE_MINMAX(npy_uint, min_u) +NPY_IMPL_SSE_REDUCE_MINMAX(npy_int, min_s) +NPY_IMPL_SSE_REDUCE_MINMAX(npy_uint, max_u) +NPY_IMPL_SSE_REDUCE_MINMAX(npy_int, max_s) +#undef NPY_IMPL_SSE_REDUCE_MINMAX + // round to nearest integer even NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a) { diff --git a/numpy/core/src/common/simd/sse/misc.h b/numpy/core/src/common/simd/sse/misc.h index 7d13fbf55..b01ff1722 100644 --- a/numpy/core/src/common/simd/sse/misc.h +++ b/numpy/core/src/common/simd/sse/misc.h @@ -129,6 +129,18 @@ NPY_FINLINE __m128d npyv__setr_pd(double i0, double i1) #define npyv_select_u64 npyv_select_u8 #define npyv_select_s64 npyv_select_u8 +// extract the first vector's lane +#define npyv_extract0_u8(A) ((npy_uint8)_mm_cvtsi128_si32(A)) +#define npyv_extract0_s8(A) ((npy_int8)_mm_cvtsi128_si32(A)) +#define npyv_extract0_u16(A) ((npy_uint16)_mm_cvtsi128_si32(A)) +#define npyv_extract0_s16(A) ((npy_int16)_mm_cvtsi128_si32(A)) +#define npyv_extract0_u32(A) ((npy_uint32)_mm_cvtsi128_si32(A)) +#define npyv_extract0_s32(A) ((npy_int32)_mm_cvtsi128_si32(A)) +#define npyv_extract0_u64(A) ((npy_uint64)npyv128_cvtsi128_si64(A)) +#define npyv_extract0_s64(A) ((npy_int64)npyv128_cvtsi128_si64(A)) +#define 
npyv_extract0_f32 _mm_cvtss_f32 +#define npyv_extract0_f64 _mm_cvtsd_f64 + // Reinterpret #define npyv_reinterpret_u8_u8(X) X #define npyv_reinterpret_u8_s8(X) X diff --git a/numpy/core/src/common/simd/sse/operators.h b/numpy/core/src/common/simd/sse/operators.h index 86dbcfea5..59182679e 100644 --- a/numpy/core/src/common/simd/sse/operators.h +++ b/numpy/core/src/common/simd/sse/operators.h @@ -283,4 +283,60 @@ NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a) NPY_FINLINE npyv_b64 npyv_notnan_f64(npyv_f64 a) { return _mm_castpd_si128(_mm_cmpord_pd(a, a)); } +// Test cross all vector lanes +// any: returns true if any of the elements is not equal to zero +// all: returns true if all elements are not equal to zero +#define NPYV_IMPL_SSE_ANYALL(SFX) \ + NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a) \ + { return _mm_movemask_epi8(a) != 0; } \ + NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a) \ + { return _mm_movemask_epi8(a) == 0xffff; } +NPYV_IMPL_SSE_ANYALL(b8) +NPYV_IMPL_SSE_ANYALL(b16) +NPYV_IMPL_SSE_ANYALL(b32) +NPYV_IMPL_SSE_ANYALL(b64) +#undef NPYV_IMPL_SSE_ANYALL + +#define NPYV_IMPL_SSE_ANYALL(SFX, MSFX, TSFX, MASK) \ + NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a) \ + { \ + return _mm_movemask_##MSFX( \ + _mm_cmpeq_##TSFX(a, npyv_zero_##SFX()) \ + ) != MASK; \ + } \ + NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a) \ + { \ + return _mm_movemask_##MSFX( \ + _mm_cmpeq_##TSFX(a, npyv_zero_##SFX()) \ + ) == 0; \ + } +NPYV_IMPL_SSE_ANYALL(u8, epi8, epi8, 0xffff) +NPYV_IMPL_SSE_ANYALL(s8, epi8, epi8, 0xffff) +NPYV_IMPL_SSE_ANYALL(u16, epi8, epi16, 0xffff) +NPYV_IMPL_SSE_ANYALL(s16, epi8, epi16, 0xffff) +NPYV_IMPL_SSE_ANYALL(u32, epi8, epi32, 0xffff) +NPYV_IMPL_SSE_ANYALL(s32, epi8, epi32, 0xffff) +#ifdef NPY_HAVE_SSE41 + NPYV_IMPL_SSE_ANYALL(u64, epi8, epi64, 0xffff) + NPYV_IMPL_SSE_ANYALL(s64, epi8, epi64, 0xffff) +#else + NPY_FINLINE bool npyv_any_u64(npyv_u64 a) + { + return _mm_movemask_epi8( + _mm_cmpeq_epi32(a, npyv_zero_u64()) + ) != 0xffff; + } + NPY_FINLINE bool npyv_all_u64(npyv_u64 a) + { + a = _mm_cmpeq_epi32(a, npyv_zero_u64()); + a = _mm_and_si128(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(2, 3, 0, 1))); + return _mm_movemask_epi8(a) == 0; + } + #define npyv_any_s64 npyv_any_u64 + #define npyv_all_s64 npyv_all_u64 +#endif +NPYV_IMPL_SSE_ANYALL(f32, ps, ps, 0xf) +NPYV_IMPL_SSE_ANYALL(f64, pd, pd, 0x3) +#undef NPYV_IMPL_SSE_ANYALL + #endif // _NPY_SIMD_SSE_OPERATORS_H diff --git a/numpy/core/src/common/simd/vec/math.h b/numpy/core/src/common/simd/vec/math.h index 7714a612d..95b16fdf7 100644 --- a/numpy/core/src/common/simd/vec/math.h +++ b/numpy/core/src/common/simd/vec/math.h @@ -63,6 +63,23 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a) return vec_max(vec_sel(b, a, nn_a), vec_sel(a, b, nn_b)); } #endif +#if NPY_SIMD_F32 + NPY_FINLINE npyv_f32 npyv_maxn_f32(npyv_f32 a, npyv_f32 b) + { + npyv_b32 nn_a = npyv_notnan_f32(a); + npyv_b32 nn_b = npyv_notnan_f32(b); + npyv_f32 max = vec_max(a, b); + return vec_sel(b, vec_sel(a, max, nn_a), nn_b); + } +#endif +NPY_FINLINE npyv_f64 npyv_maxn_f64(npyv_f64 a, npyv_f64 b) +{ + npyv_b64 nn_a = npyv_notnan_f64(a); + npyv_b64 nn_b = npyv_notnan_f64(b); + npyv_f64 max = vec_max(a, b); + return vec_sel(b, vec_sel(a, max, nn_a), nn_b); +} + // Maximum, integer operations #define npyv_max_u8 vec_max #define npyv_max_s8 vec_max @@ -95,6 +112,23 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a) return vec_min(vec_sel(b, a, nn_a), vec_sel(a, b, nn_b)); } #endif +#if NPY_SIMD_F32 + NPY_FINLINE npyv_f32 npyv_minn_f32(npyv_f32 a, npyv_f32 b) + { + 
npyv_b32 nn_a = npyv_notnan_f32(a); + npyv_b32 nn_b = npyv_notnan_f32(b); + npyv_f32 min = vec_min(a, b); + return vec_sel(b, vec_sel(a, min, nn_a), nn_b); + } +#endif +NPY_FINLINE npyv_f64 npyv_minn_f64(npyv_f64 a, npyv_f64 b) +{ + npyv_b64 nn_a = npyv_notnan_f64(a); + npyv_b64 nn_b = npyv_notnan_f64(b); + npyv_f64 min = vec_min(a, b); + return vec_sel(b, vec_sel(a, min, nn_a), nn_b); +} + // Minimum, integer operations #define npyv_min_u8 vec_min #define npyv_min_s8 vec_min @@ -105,6 +139,132 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a) #define npyv_min_u64 vec_min #define npyv_min_s64 vec_min +#define NPY_IMPL_VEC_REDUCE_MINMAX(INTRIN, STYPE, SFX) \ + NPY_FINLINE npy_##STYPE npyv_reduce_##INTRIN##_##SFX(npyv_##SFX a) \ + { \ + npyv_##SFX r = vec_##INTRIN(a, vec_sld(a, a, 8)); \ + r = vec_##INTRIN(r, vec_sld(r, r, 4)); \ + r = vec_##INTRIN(r, vec_sld(r, r, 2)); \ + r = vec_##INTRIN(r, vec_sld(r, r, 1)); \ + return (npy_##STYPE)vec_extract(r, 0); \ + } +NPY_IMPL_VEC_REDUCE_MINMAX(min, uint8, u8) +NPY_IMPL_VEC_REDUCE_MINMAX(max, uint8, u8) +NPY_IMPL_VEC_REDUCE_MINMAX(min, int8, s8) +NPY_IMPL_VEC_REDUCE_MINMAX(max, int8, s8) +#undef NPY_IMPL_VEC_REDUCE_MINMAX + +#define NPY_IMPL_VEC_REDUCE_MINMAX(INTRIN, STYPE, SFX) \ + NPY_FINLINE npy_##STYPE npyv_reduce_##INTRIN##_##SFX(npyv_##SFX a) \ + { \ + npyv_##SFX r = vec_##INTRIN(a, vec_sld(a, a, 8)); \ + r = vec_##INTRIN(r, vec_sld(r, r, 4)); \ + r = vec_##INTRIN(r, vec_sld(r, r, 2)); \ + return (npy_##STYPE)vec_extract(r, 0); \ + } +NPY_IMPL_VEC_REDUCE_MINMAX(min, uint16, u16) +NPY_IMPL_VEC_REDUCE_MINMAX(max, uint16, u16) +NPY_IMPL_VEC_REDUCE_MINMAX(min, int16, s16) +NPY_IMPL_VEC_REDUCE_MINMAX(max, int16, s16) +#undef NPY_IMPL_VEC_REDUCE_MINMAX + +#define NPY_IMPL_VEC_REDUCE_MINMAX(INTRIN, STYPE, SFX) \ + NPY_FINLINE npy_##STYPE npyv_reduce_##INTRIN##_##SFX(npyv_##SFX a) \ + { \ + npyv_##SFX r = vec_##INTRIN(a, vec_sld(a, a, 8)); \ + r = vec_##INTRIN(r, vec_sld(r, r, 4)); \ + return (npy_##STYPE)vec_extract(r, 0); \ + } +NPY_IMPL_VEC_REDUCE_MINMAX(min, uint32, u32) +NPY_IMPL_VEC_REDUCE_MINMAX(max, uint32, u32) +NPY_IMPL_VEC_REDUCE_MINMAX(min, int32, s32) +NPY_IMPL_VEC_REDUCE_MINMAX(max, int32, s32) +#undef NPY_IMPL_VEC_REDUCE_MINMAX + +#define NPY_IMPL_VEC_REDUCE_MINMAX(INTRIN, STYPE, SFX) \ + NPY_FINLINE npy_##STYPE npyv_reduce_##INTRIN##_##SFX(npyv_##SFX a) \ + { \ + npyv_##SFX r = vec_##INTRIN(a, vec_sld(a, a, 8)); \ + return (npy_##STYPE)vec_extract(r, 0); \ + } +NPY_IMPL_VEC_REDUCE_MINMAX(min, uint64, u64) +NPY_IMPL_VEC_REDUCE_MINMAX(max, uint64, u64) +NPY_IMPL_VEC_REDUCE_MINMAX(min, int64, s64) +NPY_IMPL_VEC_REDUCE_MINMAX(max, int64, s64) +#undef NPY_IMPL_VEC_REDUCE_MINMAX + +#if NPY_SIMD_F32 + #define NPY_IMPL_VEC_REDUCE_MINMAX(INTRIN, INF) \ + NPY_FINLINE float npyv_reduce_##INTRIN##_f32(npyv_f32 a) \ + { \ + npyv_f32 r = vec_##INTRIN(a, vec_sld(a, a, 8)); \ + r = vec_##INTRIN(r, vec_sld(r, r, 4)); \ + return vec_extract(r, 0); \ + } \ + NPY_FINLINE float npyv_reduce_##INTRIN##p_f32(npyv_f32 a) \ + { \ + return npyv_reduce_##INTRIN##_f32(a); \ + } \ + NPY_FINLINE float npyv_reduce_##INTRIN##n_f32(npyv_f32 a) \ + { \ + npyv_b32 notnan = npyv_notnan_f32(a); \ + if (NPY_UNLIKELY(!npyv_all_b32(notnan))) { \ + const union { npy_uint32 i; float f;} \ + pnan = {0x7fc00000UL}; \ + return pnan.f; \ + } \ + return npyv_reduce_##INTRIN##_f32(a); \ + } + NPY_IMPL_VEC_REDUCE_MINMAX(min, 0x7f800000) + NPY_IMPL_VEC_REDUCE_MINMAX(max, 0xff800000) + #undef NPY_IMPL_VEC_REDUCE_MINMAX +#endif // NPY_SIMD_F32 + +#define 
NPY_IMPL_VEC_REDUCE_MINMAX(INTRIN, INF) \ + NPY_FINLINE double npyv_reduce_##INTRIN##_f64(npyv_f64 a) \ + { \ + npyv_f64 r = vec_##INTRIN(a, vec_sld(a, a, 8)); \ + return vec_extract(r, 0); \ + } \ + NPY_FINLINE double npyv_reduce_##INTRIN##n_f64(npyv_f64 a) \ + { \ + npyv_b64 notnan = npyv_notnan_f64(a); \ + if (NPY_UNLIKELY(!npyv_all_b64(notnan))) { \ + const union { npy_uint64 i; double f;} \ + pnan = {0x7ff8000000000000ull}; \ + return pnan.f; \ + } \ + return npyv_reduce_##INTRIN##_f64(a); \ + } +NPY_IMPL_VEC_REDUCE_MINMAX(min, 0x7ff0000000000000) +NPY_IMPL_VEC_REDUCE_MINMAX(max, 0xfff0000000000000) +#undef NPY_IMPL_VEC_REDUCE_MINMAX + +#if defined(NPY_HAVE_VXE) || defined(NPY_HAVE_VSX) + #define npyv_reduce_minp_f64 npyv_reduce_min_f64 + #define npyv_reduce_maxp_f64 npyv_reduce_max_f64 +#else + NPY_FINLINE double npyv_reduce_minp_f64(npyv_f64 a) + { + npyv_b64 notnan = npyv_notnan_f64(a); + if (NPY_UNLIKELY(!npyv_any_b64(notnan))) { + return vec_extract(a, 0); + } + a = npyv_select_f64(notnan, a, npyv_reinterpret_f64_u64( + npyv_setall_u64(0x7ff0000000000000))); + return npyv_reduce_min_f64(a); + } + NPY_FINLINE double npyv_reduce_maxp_f64(npyv_f64 a) + { + npyv_b64 notnan = npyv_notnan_f64(a); + if (NPY_UNLIKELY(!npyv_any_b64(notnan))) { + return vec_extract(a, 0); + } + a = npyv_select_f64(notnan, a, npyv_reinterpret_f64_u64( + npyv_setall_u64(0xfff0000000000000))); + return npyv_reduce_max_f64(a); + } +#endif // round to nearest int even #define npyv_rint_f64 vec_rint // ceil diff --git a/numpy/core/src/common/simd/vec/misc.h b/numpy/core/src/common/simd/vec/misc.h index c4f35cfc0..7ea0f21f6 100644 --- a/numpy/core/src/common/simd/vec/misc.h +++ b/numpy/core/src/common/simd/vec/misc.h @@ -83,6 +83,20 @@ #endif #define npyv_select_f64 npyv_select_u8 +// extract the first vector's lane +#define npyv_extract0_u8(A) ((npy_uint8)vec_extract(A, 0)) +#define npyv_extract0_s8(A) ((npy_int8)vec_extract(A, 0)) +#define npyv_extract0_u16(A) ((npy_uint16)vec_extract(A, 0)) +#define npyv_extract0_s16(A) ((npy_int16)vec_extract(A, 0)) +#define npyv_extract0_u32(A) ((npy_uint32)vec_extract(A, 0)) +#define npyv_extract0_s32(A) ((npy_int32)vec_extract(A, 0)) +#define npyv_extract0_u64(A) ((npy_uint64)vec_extract(A, 0)) +#define npyv_extract0_s64(A) ((npy_int64)vec_extract(A, 0)) +#if NPY_SIMD_F32 + #define npyv_extract0_f32(A) vec_extract(A, 0) +#endif +#define npyv_extract0_f64(A) vec_extract(A, 0) + // Reinterpret #define npyv_reinterpret_u8_u8(X) X #define npyv_reinterpret_u8_s8(X) ((npyv_u8)X) diff --git a/numpy/core/src/common/simd/vec/operators.h b/numpy/core/src/common/simd/vec/operators.h index 8b58676e7..50dac20f6 100644 --- a/numpy/core/src/common/simd/vec/operators.h +++ b/numpy/core/src/common/simd/vec/operators.h @@ -274,4 +274,30 @@ NPY_FINLINE npyv_f64 npyv_not_f64(npyv_f64 a) NPY_FINLINE npyv_b64 npyv_notnan_f64(npyv_f64 a) { return vec_cmpeq(a, a); } +// Test cross all vector lanes +// any: returns true if any of the elements is not equal to zero +// all: returns true if all elements are not equal to zero +#define NPYV_IMPL_VEC_ANYALL(SFX, SFX2) \ + NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a) \ + { return vec_any_ne(a, (npyv_##SFX)npyv_zero_##SFX2()); } \ + NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a) \ + { return vec_all_ne(a, (npyv_##SFX)npyv_zero_##SFX2()); } +NPYV_IMPL_VEC_ANYALL(b8, u8) +NPYV_IMPL_VEC_ANYALL(b16, u16) +NPYV_IMPL_VEC_ANYALL(b32, u32) +NPYV_IMPL_VEC_ANYALL(b64, u64) +NPYV_IMPL_VEC_ANYALL(u8, u8) +NPYV_IMPL_VEC_ANYALL(s8, s8) +NPYV_IMPL_VEC_ANYALL(u16, u16) 
+NPYV_IMPL_VEC_ANYALL(s16, s16) +NPYV_IMPL_VEC_ANYALL(u32, u32) +NPYV_IMPL_VEC_ANYALL(s32, s32) +NPYV_IMPL_VEC_ANYALL(u64, u64) +NPYV_IMPL_VEC_ANYALL(s64, s64) +#if NPY_SIMD_F32 + NPYV_IMPL_VEC_ANYALL(f32, f32) +#endif +NPYV_IMPL_VEC_ANYALL(f64, f64) +#undef NPYV_IMPL_VEC_ANYALL + #endif // _NPY_SIMD_VEC_OPERATORS_H diff --git a/numpy/core/src/npysort/x86-qsort.dispatch.cpp b/numpy/core/src/npysort/x86-qsort.dispatch.cpp index 01fa16e3e..39c24d522 100644 --- a/numpy/core/src/npysort/x86-qsort.dispatch.cpp +++ b/numpy/core/src/npysort/x86-qsort.dispatch.cpp @@ -137,8 +137,8 @@ struct vector<npy_int> { { return _mm512_permutexvar_epi32(idx, zmm); } - static type_t reducemax(zmm_t v) { return npyv_reducemax_s32(v); } - static type_t reducemin(zmm_t v) { return npyv_reducemin_s32(v); } + static type_t reducemax(zmm_t v) { return npyv_reduce_max_s32(v); } + static type_t reducemin(zmm_t v) { return npyv_reduce_min_s32(v); } static zmm_t set1(type_t v) { return _mm512_set1_epi32(v); } template<__mmask16 mask> static zmm_t shuffle(zmm_t zmm) @@ -196,8 +196,8 @@ struct vector<npy_uint> { { return _mm512_permutexvar_epi32(idx, zmm); } - static type_t reducemax(zmm_t v) { return npyv_reducemax_u32(v); } - static type_t reducemin(zmm_t v) { return npyv_reducemin_u32(v); } + static type_t reducemax(zmm_t v) { return npyv_reduce_max_u32(v); } + static type_t reducemin(zmm_t v) { return npyv_reduce_min_u32(v); } static zmm_t set1(type_t v) { return _mm512_set1_epi32(v); } template<__mmask16 mask> static zmm_t shuffle(zmm_t zmm) @@ -255,8 +255,8 @@ struct vector<npy_float> { { return _mm512_permutexvar_ps(idx, zmm); } - static type_t reducemax(zmm_t v) { return npyv_reducemax_f32(v); } - static type_t reducemin(zmm_t v) { return npyv_reducemin_f32(v); } + static type_t reducemax(zmm_t v) { return npyv_reduce_max_f32(v); } + static type_t reducemin(zmm_t v) { return npyv_reduce_min_f32(v); } static zmm_t set1(type_t v) { return _mm512_set1_ps(v); } template<__mmask16 mask> static zmm_t shuffle(zmm_t zmm) diff --git a/numpy/core/src/umath/loops_minmax.dispatch.c.src b/numpy/core/src/umath/loops_minmax.dispatch.c.src index b4fb205a0..237c8e933 100644 --- a/numpy/core/src/umath/loops_minmax.dispatch.c.src +++ b/numpy/core/src/umath/loops_minmax.dispatch.c.src @@ -95,104 +95,6 @@ NPY_FINLINE @type@ scalar_@op@_@c_sfx@(@type@ a, @type@ b) { #endif /******************************************************************************* - ** extra SIMD intrinsics - ******************************************************************************/ - -#if NPY_SIMD -/**begin repeat - * #sfx = s8, u8, s16, u16, s32, u32, s64, u64# - * #is_64 = 0*6, 1*2# - */ -#if defined(NPY_HAVE_ASIMD) && defined(__aarch64__) - #if !@is_64@ - #define npyv_reduce_min_@sfx@ vminvq_@sfx@ - #define npyv_reduce_max_@sfx@ vmaxvq_@sfx@ - #else - NPY_FINLINE npyv_lanetype_@sfx@ npyv_reduce_min_@sfx@(npyv_@sfx@ v) - { - npyv_lanetype_@sfx@ a = vgetq_lane_@sfx@(v, 0); - npyv_lanetype_@sfx@ b = vgetq_lane_@sfx@(v, 1); - npyv_lanetype_@sfx@ result = (a < b) ? a : b; - return result; - } - NPY_FINLINE npyv_lanetype_@sfx@ npyv_reduce_max_@sfx@(npyv_@sfx@ v) - { - npyv_lanetype_@sfx@ a = vgetq_lane_@sfx@(v, 0); - npyv_lanetype_@sfx@ b = vgetq_lane_@sfx@(v, 1); - npyv_lanetype_@sfx@ result = (a > b) ? 
a : b; - return result; - } - #endif // !@is_64@ -#else - /**begin repeat1 - * #intrin = min, max# - */ - NPY_FINLINE npyv_lanetype_@sfx@ npyv_reduce_@intrin@_@sfx@(npyv_@sfx@ v) - { - npyv_lanetype_@sfx@ NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) s[npyv_nlanes_@sfx@]; - npyv_storea_@sfx@(s, v); - npyv_lanetype_@sfx@ result = s[0]; - for(int i=1; i<npyv_nlanes_@sfx@; ++i){ - result = scalar_@intrin@_i(result, s[i]); - } - return result; - } - /**end repeat1**/ -#endif -/**end repeat**/ -#endif // NPY_SIMD - -/**begin repeat - * #sfx = f32, f64# - * #bsfx = b32, b64# - * #simd_chk = NPY_SIMD_F32, NPY_SIMD_F64# - * #scalar_sfx = f, d# - */ -#if @simd_chk@ -#if defined(NPY_HAVE_ASIMD) && defined(__aarch64__) - #define npyv_minn_@sfx@ vminq_@sfx@ - #define npyv_maxn_@sfx@ vmaxq_@sfx@ - #define npyv_reduce_minn_@sfx@ vminvq_@sfx@ - #define npyv_reduce_maxn_@sfx@ vmaxvq_@sfx@ - #define npyv_reduce_minp_@sfx@ vminnmvq_@sfx@ - #define npyv_reduce_maxp_@sfx@ vmaxnmvq_@sfx@ -#else - /**begin repeat1 - * #intrin = min, max# - */ - // propagates NaNs - NPY_FINLINE npyv_@sfx@ npyv_@intrin@n_@sfx@(npyv_@sfx@ a, npyv_@sfx@ b) - { - npyv_@sfx@ result = npyv_@intrin@_@sfx@(a, b); - // result = npyv_select_@sfx@(npyv_notnan_@sfx@(b), result, b); - // X86 handle second operand - #ifndef NPY_HAVE_SSE2 - result = npyv_select_@sfx@(npyv_notnan_@sfx@(b), result, b); - #endif - result = npyv_select_@sfx@(npyv_notnan_@sfx@(a), result, a); - return result; - } - /**end repeat1**/ - /**begin repeat1 - * #intrin = minn, maxn, minp, maxp# - * #scalar_intrin = min, max, minp, maxp# - */ - NPY_FINLINE npyv_lanetype_@sfx@ npyv_reduce_@intrin@_@sfx@(npyv_@sfx@ v) - { - npyv_lanetype_@sfx@ NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) s[npyv_nlanes_@sfx@]; - npyv_storea_@sfx@(s, v); - npyv_lanetype_@sfx@ result = s[0]; - for(int i=1; i<npyv_nlanes_@sfx@; ++i){ - result = scalar_@scalar_intrin@_@scalar_sfx@(result, s[i]); - } - return result; - } - /**end repeat1**/ -#endif -#endif // simd_chk -/**end repeat**/ - -/******************************************************************************* ** Defining the SIMD kernels ******************************************************************************/ /**begin repeat diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py index c4488533a..4aeaf0da3 100644 --- a/numpy/core/tests/test_simd.py +++ b/numpy/core/tests/test_simd.py @@ -105,10 +105,12 @@ class _SIMD_BOOL(_Test_Utility): """ To test all boolean vector types at once """ + def _nlanes(self): + return getattr(self.npyv, "nlanes_u" + self.sfx[1:]) + def _data(self, start=None, count=None, reverse=False): - nlanes = getattr(self.npyv, "nlanes_u" + self.sfx[1:]) true_mask = self._true_mask() - rng = range(nlanes) + rng = range(self._nlanes()) if reverse: rng = reversed(rng) return [true_mask if x % 2 else 0 for x in rng] @@ -202,6 +204,26 @@ class _SIMD_BOOL(_Test_Utility): vdata, vdata, vdata, vdata) assert vpack == spack + @pytest.mark.parametrize("intrin", ["any", "all"]) + @pytest.mark.parametrize("data", ( + [-1, 0], + [0, -1], + [-1], + [0] + )) + def test_operators_crosstest(self, intrin, data): + """ + Test intrinsics: + npyv_any_##SFX + npyv_all_##SFX + """ + data_a = self._load_b(data * self._nlanes()) + func = eval(intrin) + intrin = getattr(self, intrin) + desired = func(data_a) + simd = intrin(data_a) + assert not not simd == desired + class _SIMD_INT(_Test_Utility): """ To test all integer vector types at once @@ -268,6 +290,18 @@ class _SIMD_INT(_Test_Utility): simd_min = self.min(vdata_a, vdata_b) assert 
simd_min == data_min + @pytest.mark.parametrize("start", [-100, -10000, 0, 100, 10000]) + def test_reduce_max_min(self, start): + """ + Test intrinsics: + npyv_reduce_max_##sfx + npyv_reduce_min_##sfx + """ + vdata_a = self.load(self._data(start)) + assert self.reduce_max(vdata_a) == max(vdata_a) + assert self.reduce_min(vdata_a) == min(vdata_a) + + class _SIMD_FP32(_Test_Utility): """ To only test single precision @@ -414,67 +448,77 @@ class _SIMD_FP(_Test_Utility): data_round = self._to_unsigned(self.setall(-0.0)) assert _round == data_round - def test_max(self): - """ - Test intrinsics: - npyv_max_##SFX - npyv_maxp_##SFX - """ - data_a = self._data() - data_b = self._data(self.nlanes) - vdata_a, vdata_b = self.load(data_a), self.load(data_b) - data_max = [max(a, b) for a, b in zip(data_a, data_b)] - _max = self.max(vdata_a, vdata_b) - assert _max == data_max - maxp = self.maxp(vdata_a, vdata_b) - assert maxp == data_max - # test IEEE standards - pinf, ninf, nan = self._pinfinity(), self._ninfinity(), self._nan() - max_cases = ((nan, nan, nan), (nan, 10, 10), (10, nan, 10), - (pinf, pinf, pinf), (pinf, 10, pinf), (10, pinf, pinf), - (ninf, ninf, ninf), (ninf, 10, 10), (10, ninf, 10), - (10, 0, 10), (10, -10, 10)) - for case_operand1, case_operand2, desired in max_cases: - data_max = [desired]*self.nlanes - vdata_a = self.setall(case_operand1) - vdata_b = self.setall(case_operand2) - maxp = self.maxp(vdata_a, vdata_b) - assert maxp == pytest.approx(data_max, nan_ok=True) - if nan in (case_operand1, case_operand2, desired): - continue - _max = self.max(vdata_a, vdata_b) - assert _max == data_max - - def test_min(self): + @pytest.mark.parametrize("intrin", [ + "max", "maxp", "maxn", "min", "minp", "minn" + ]) + def test_max_min(self, intrin): """ Test intrinsics: - npyv_min_##SFX - npyv_minp_##SFX + npyv_max_##sfx + npyv_maxp_##sfx + npyv_maxn_##sfx + npyv_min_##sfx + npyv_minp_##sfx + npyv_minn_##sfx + npyv_reduce_max_##sfx + npyv_reduce_maxp_##sfx + npyv_reduce_maxn_##sfx + npyv_reduce_min_##sfx + npyv_reduce_minp_##sfx + npyv_reduce_minn_##sfx """ - data_a = self._data() - data_b = self._data(self.nlanes) - vdata_a, vdata_b = self.load(data_a), self.load(data_b) - data_min = [min(a, b) for a, b in zip(data_a, data_b)] - _min = self.min(vdata_a, vdata_b) - assert _min == data_min - minp = self.minp(vdata_a, vdata_b) - assert minp == data_min - # test IEEE standards pinf, ninf, nan = self._pinfinity(), self._ninfinity(), self._nan() - min_cases = ((nan, nan, nan), (nan, 10, 10), (10, nan, 10), - (pinf, pinf, pinf), (pinf, 10, 10), (10, pinf, 10), - (ninf, ninf, ninf), (ninf, 10, ninf), (10, ninf, ninf), - (10, 0, 0), (10, -10, -10)) - for case_operand1, case_operand2, desired in min_cases: - data_min = [desired]*self.nlanes - vdata_a = self.setall(case_operand1) - vdata_b = self.setall(case_operand2) - minp = self.minp(vdata_a, vdata_b) - assert minp == pytest.approx(data_min, nan_ok=True) - if nan in (case_operand1, case_operand2, desired): - continue - _min = self.min(vdata_a, vdata_b) - assert _min == data_min + chk_nan = {"xp": 1, "np": 1, "nn": 2, "xn": 2}.get(intrin[-2:], 0) + func = eval(intrin[:3]) + reduce_intrin = getattr(self, "reduce_" + intrin) + intrin = getattr(self, intrin) + hf_nlanes = self.nlanes//2 + + cases = ( + ([0.0, -0.0], [-0.0, 0.0]), + ([10, -10], [10, -10]), + ([pinf, 10], [10, ninf]), + ([10, pinf], [ninf, 10]), + ([10, -10], [10, -10]), + ([-10, 10], [-10, 10]) + ) + for op1, op2 in cases: + vdata_a = self.load(op1*hf_nlanes) + vdata_b = 
self.load(op2*hf_nlanes) + data = func(vdata_a, vdata_b) + simd = intrin(vdata_a, vdata_b) + assert simd == data + data = func(vdata_a) + simd = reduce_intrin(vdata_a) + assert simd == data + + if not chk_nan: + return + if chk_nan == 1: + test_nan = lambda a, b: ( + b if math.isnan(a) else a if math.isnan(b) else b + ) + else: + test_nan = lambda a, b: ( + nan if math.isnan(a) or math.isnan(b) else b + ) + cases = ( + (nan, 10), + (10, nan), + (nan, pinf), + (pinf, nan), + (nan, nan) + ) + for op1, op2 in cases: + vdata_ab = self.load([op1, op2]*hf_nlanes) + data = test_nan(op1, op2) + simd = reduce_intrin(vdata_ab) + assert simd == pytest.approx(data, nan_ok=True) + vdata_a = self.setall(op1) + vdata_b = self.setall(op2) + data = [data] * self.nlanes + simd = intrin(vdata_a, vdata_b) + assert simd == pytest.approx(data, nan_ok=True) def test_reciprocal(self): pinf, ninf, nan = self._pinfinity(), self._ninfinity(), self._nan() @@ -527,6 +571,30 @@ class _SIMD_FP(_Test_Utility): data_cmp = [py_comp(a, b) for a, b in zip(data_a, data_b)] assert vcmp == data_cmp + @pytest.mark.parametrize("intrin", ["any", "all"]) + @pytest.mark.parametrize("data", ( + [float("nan"), 0], + [0, float("nan")], + [float("nan"), 1], + [1, float("nan")], + [float("nan"), float("nan")], + [0.0, -0.0], + [-0.0, 0.0], + [1.0, -0.0] + )) + def test_operators_crosstest(self, intrin, data): + """ + Test intrinsics: + npyv_any_##SFX + npyv_all_##SFX + """ + data_a = self.load(data * self.nlanes) + func = eval(intrin) + intrin = getattr(self, intrin) + desired = func(data_a) + simd = intrin(data_a) + assert not not simd == desired + class _SIMD_ALL(_Test_Utility): """ To test all vector types at once @@ -759,6 +827,9 @@ class _SIMD_ALL(_Test_Utility): select_b = self.select(self.cmpneq(self.zero(), self.zero()), vdata_a, vdata_b) assert select_b == data_b + # test extract elements + assert self.extract0(vdata_b) == vdata_b[0] + # cleanup intrinsic is only used with AVX for # zeroing registers to avoid the AVX-SSE transition penalty, # so nothing to test here @@ -874,6 +945,30 @@ class _SIMD_ALL(_Test_Utility): vandc = cast(getattr(self, "andc")(vdata_a, vdata_b)) assert vandc == data_andc + @pytest.mark.parametrize("intrin", ["any", "all"]) + @pytest.mark.parametrize("data", ( + [1, 2, 3, 4], + [-1, -2, -3, -4], + [0, 1, 2, 3, 4], + [0x7f, 0x7fff, 0x7fffffff, 0x7fffffffffffffff], + [0, -1, -2, -3, 4], + [0], + [1], + [-1] + )) + def test_operators_crosstest(self, intrin, data): + """ + Test intrinsics: + npyv_any_##SFX + npyv_all_##SFX + """ + data_a = self.load(data * self.nlanes) + func = eval(intrin) + intrin = getattr(self, intrin) + desired = func(data_a) + simd = intrin(data_a) + assert not not simd == desired + def test_conversion_boolean(self): bsfx = "b" + self.sfx[1:] to_boolean = getattr(self.npyv, "cvt_%s_%s" % (bsfx, self.sfx)) |
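Taken together, the hunks above give every backend the same set of lane-crossing helpers: npyv_extract0_##SFX reads lane 0, npyv_any_##SFX / npyv_all_##SFX reduce a vector to a single boolean, and npyv_reduce_min/max_##SFX (plus the NaN-ignoring minp/maxp and NaN-propagating minn/maxn float variants) reduce it to a single lane value. A minimal usage sketch follows; it is not part of the patch. It assumes the pre-existing npyv_load_f32, npyv_nlanes_f32, npyv_notnan_f32 and npyv_cleanup helpers from simd/simd.h, and the function name as well as the requirement that len be a non-zero multiple of the vector width are illustrative choices only.

#include "simd/simd.h"

#if NPY_SIMD_F32
// Fold the maximum of `len` floats down to a scalar, propagating NaNs via the
// new npyv_maxn_f32 / npyv_reduce_maxn_f32 pair.  `len` is assumed to be a
// non-zero multiple of the vector width (illustrative simplification).
static float
reduce_max_f32_sketch(const float *src, npy_intp len)
{
    const int vstep = npyv_nlanes_f32;
    npyv_f32 acc = npyv_load_f32(src);
    for (npy_intp i = vstep; i < len; i += vstep) {
        // npyv_maxn_f32 (added here) returns NaN when either operand is NaN
        acc = npyv_maxn_f32(acc, npyv_load_f32(src + i));
    }
    // npyv_reduce_maxn_f32 (added here) collapses the accumulator to a single
    // lane and returns NaN if any lane is NaN; npyv_reduce_maxp_f32 ignores
    // NaNs instead, and plain npyv_reduce_max_f32 gives no NaN guarantee.
    float r = npyv_reduce_maxn_f32(acc);
    // The new boolean reductions compose with the existing notnan test, e.g.
    // an early exit could use:  if (!npyv_all_b32(npyv_notnan_f32(acc))) ...
    npyv_cleanup();
    return r;
}
#endif // NPY_SIMD_F32

The integer lane types follow the same pattern with plain npyv_reduce_min/max only, which is what the new test_reduce_max_min case above exercises.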