author     Matti Picus <matti.picus@gmail.com>  2022-09-25 07:22:52 +0300
committer  GitHub <noreply@github.com>          2022-09-25 07:22:52 +0300
commit     d66ca35de0cae4e41471dc44793c18511eb45109 (patch)
tree       36b66d3907ab8b26cdbdfcf661888f3d2af747fa /numpy/core
parent     2ec7a5d917dbeedf28ee4054e73f92ecebb44975 (diff)
parent     6ef4c8bc1459f5d4f548ed87715651c6bc75fc49 (diff)
download   numpy-d66ca35de0cae4e41471dc44793c18511eb45109.tar.gz
Merge pull request #22306 from seiko2plus/npyv_new_intrinsics_sep2022_vol0
ENH: Implement essential intrinsics required by the upcoming SIMD optimizations(0)
Diffstat (limited to 'numpy/core')
-rw-r--r--  numpy/core/src/_simd/_simd.dispatch.c.src  38
-rw-r--r--  numpy/core/src/common/simd/avx2/math.h  129
-rw-r--r--  numpy/core/src/common/simd/avx2/misc.h  14
-rw-r--r--  numpy/core/src/common/simd/avx2/operators.h  54
-rw-r--r--  numpy/core/src/common/simd/avx512/arithmetic.h  73
-rw-r--r--  numpy/core/src/common/simd/avx512/math.h  178
-rw-r--r--  numpy/core/src/common/simd/avx512/misc.h  14
-rw-r--r--  numpy/core/src/common/simd/avx512/operators.h  41
-rw-r--r--  numpy/core/src/common/simd/neon/math.h  118
-rw-r--r--  numpy/core/src/common/simd/neon/misc.h  12
-rw-r--r--  numpy/core/src/common/simd/neon/operators.h  125
-rw-r--r--  numpy/core/src/common/simd/simd.h  4
-rw-r--r--  numpy/core/src/common/simd/sse/math.h  136
-rw-r--r--  numpy/core/src/common/simd/sse/misc.h  12
-rw-r--r--  numpy/core/src/common/simd/sse/operators.h  56
-rw-r--r--  numpy/core/src/common/simd/vec/math.h  160
-rw-r--r--  numpy/core/src/common/simd/vec/misc.h  14
-rw-r--r--  numpy/core/src/common/simd/vec/operators.h  26
-rw-r--r--  numpy/core/src/npysort/x86-qsort.dispatch.cpp  12
-rw-r--r--  numpy/core/src/umath/loops_minmax.dispatch.c.src  98
-rw-r--r--  numpy/core/tests/test_simd.py  213
21 files changed, 1276 insertions, 251 deletions
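
The merge adds several families of universal intrinsics: cross-lane tests (npyv_any_*/npyv_all_*), horizontal min/max reductions (npyv_reduce_min_*/npyv_reduce_max_*, plus NaN-ignoring ...p and NaN-propagating ...n float variants), NaN-propagating pairwise min/max (npyv_minn_*/npyv_maxn_*), and first-lane extraction (npyv_extract0_*). The sketch below is not part of the patch; it only illustrates the intended call pattern, assuming it is compiled inside NumPy's build where "simd/simd.h" and the pre-existing npyv helpers (npyv_load_f32, npyv_zero_f32, npyv_cmpgt_f32, npyv_nlanes_f32) are available and NPY_SIMD is non-zero.

#include "simd/simd.h"

#if NPY_SIMD
// Horizontal maximum of one vector's worth of floats; the "p" variant ignores
// NaNs unless every lane is NaN (matching the semantics documented in the
// math.h hunks below).
static float demo_reduce_maxp(const float *src /* length >= npyv_nlanes_f32 */)
{
    npyv_f32 v = npyv_load_f32(src);
    return npyv_reduce_maxp_f32(v);
}

// Cross-lane test: true if at least one lane of the comparison mask is set.
static int demo_any_positive(const float *src)
{
    npyv_f32 v  = npyv_load_f32(src);
    npyv_b32 gt = npyv_cmpgt_f32(v, npyv_zero_f32());
    return (int)npyv_any_b32(gt);
}
#endif // NPY_SIMD
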
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src
index 997205957..b6af8e6a9 100644
--- a/numpy/core/src/_simd/_simd.dispatch.c.src
+++ b/numpy/core/src/_simd/_simd.dispatch.c.src
@@ -247,6 +247,7 @@ SIMD_IMPL_INTRIN_2(lut16_@sfx@, v@sfx@, q@sfx@, vu@size@)
* Misc
***************************/
SIMD_IMPL_INTRIN_0(zero_@sfx@, v@sfx@)
+SIMD_IMPL_INTRIN_1(extract0_@sfx@, @sfx@, v@sfx@)
SIMD_IMPL_INTRIN_1(setall_@sfx@, v@sfx@, @sfx@)
SIMD_IMPL_INTRIN_3(select_@sfx@, v@sfx@, v@bsfx@, v@sfx@, v@sfx@)
@@ -340,6 +341,12 @@ SIMD_IMPL_INTRIN_2(orc_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@)
SIMD_IMPL_INTRIN_2(xnor_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@)
#endif
+// test across all vector lanes
+/**begin repeat1
+ * #intrin = any, all#
+ */
+SIMD_IMPL_INTRIN_1(@intrin@_@sfx@, u8, v@sfx@)
+/**end repeat1**/
/***************************
* Conversion
***************************/
@@ -409,13 +416,16 @@ SIMD_IMPL_INTRIN_1(@intrin@_@sfx@, v@sfx@, v@sfx@)
* #intrin = max, min#
*/
SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@)
+SIMD_IMPL_INTRIN_1(reduce_@intrin@_@sfx@, @sfx@, v@sfx@)
/**end repeat1**/
#if @fp_only@
/**begin repeat1
- * #intrin = maxp, minp#
+ * #intrin = maxp, minp, maxn, minn#
*/
SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@)
+SIMD_IMPL_INTRIN_1(reduce_@intrin@_@sfx@, @sfx@, v@sfx@)
+/**end repeat1**/
/**end repeat1**/
#endif
@@ -465,14 +475,20 @@ SIMD_IMPL_INTRIN_0N(cleanup)
/***************************
* Operators
***************************/
-// Logical
/**begin repeat
* #bsfx = b8, b16, b32, b64#
*/
+// Logical
SIMD_IMPL_INTRIN_2(and_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@)
SIMD_IMPL_INTRIN_2(or_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@)
SIMD_IMPL_INTRIN_2(xor_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@)
SIMD_IMPL_INTRIN_1(not_@bsfx@, v@bsfx@, v@bsfx@)
+// test across all vector lanes
+/**begin repeat1
+ * #intrin = any, all#
+ */
+SIMD_IMPL_INTRIN_1(@intrin@_@bsfx@, u8, v@bsfx@)
+/**end repeat1**/
/**end repeat**/
/***************************
* Conversions
@@ -559,7 +575,7 @@ SIMD_INTRIN_DEF(reinterpret_@sfx_to@_@sfx@)
/**end repeat1**/
/**begin repeat1
- * # intrin = set, setf, setall, zero, select#
+ * # intrin = set, setf, setall, zero, select, extract0#
*/
SIMD_INTRIN_DEF(@intrin@_@sfx@)
/**end repeat1**/
@@ -589,7 +605,8 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@)
#endif // shl_imm
/**begin repeat1
- * #intrin = and, or, xor, not, cmpeq, cmpneq, cmpgt, cmpge, cmplt, cmple#
+ * #intrin = and, or, xor, not, cmpeq, cmpneq, cmpgt, cmpge, cmplt, cmple,
+ * any, all#
*/
SIMD_INTRIN_DEF(@intrin@_@sfx@)
/**end repeat1**/
@@ -669,13 +686,16 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@)
* #intrin = max, min#
*/
SIMD_INTRIN_DEF(@intrin@_@sfx@)
+SIMD_INTRIN_DEF(reduce_@intrin@_@sfx@)
/**end repeat1**/
#if @fp_only@
/**begin repeat1
- * #intrin = maxp, minp#
+ * #intrin = maxp, minp, maxn, minn#
*/
SIMD_INTRIN_DEF(@intrin@_@sfx@)
+SIMD_INTRIN_DEF(reduce_@intrin@_@sfx@)
+/**end repeat1**/
/**end repeat1**/
#endif
@@ -725,14 +745,20 @@ SIMD_INTRIN_DEF(cleanup)
/***************************
* Operators
***************************/
-// Logical
/**begin repeat
* #bsfx = b8, b16, b32, b64#
*/
+// Logical
SIMD_INTRIN_DEF(and_@bsfx@)
SIMD_INTRIN_DEF(or_@bsfx@)
SIMD_INTRIN_DEF(xor_@bsfx@)
SIMD_INTRIN_DEF(not_@bsfx@)
+// test across all vector lanes
+/**begin repeat1
+ * #intrin = any, all#
+ */
+SIMD_INTRIN_DEF(@intrin@_@bsfx@)
+/**end repeat1**/
/**end repeat**/
/***************************
* Conversions
diff --git a/numpy/core/src/common/simd/avx2/math.h b/numpy/core/src/common/simd/avx2/math.h
index deaf4ad11..5c869f911 100644
--- a/numpy/core/src/common/simd/avx2/math.h
+++ b/numpy/core/src/common/simd/avx2/math.h
@@ -55,6 +55,21 @@ NPY_FINLINE npyv_f64 npyv_maxp_f64(npyv_f64 a, npyv_f64 b)
__m256d max = _mm256_max_pd(a, b);
return _mm256_blendv_pd(a, max, nn);
}
+// Maximum, propagates NaNs
+// If any of the corresponding elements is NaN, NaN is set.
+NPY_FINLINE npyv_f32 npyv_maxn_f32(npyv_f32 a, npyv_f32 b)
+{
+ __m256 nn = _mm256_cmp_ps(a, a, _CMP_ORD_Q);
+ __m256 max = _mm256_max_ps(a, b);
+ return _mm256_blendv_ps(a, max, nn);
+}
+NPY_FINLINE npyv_f64 npyv_maxn_f64(npyv_f64 a, npyv_f64 b)
+{
+ __m256d nn = _mm256_cmp_pd(a, a, _CMP_ORD_Q);
+ __m256d max = _mm256_max_pd(a, b);
+ return _mm256_blendv_pd(a, max, nn);
+}
+
// Maximum, integer operations
#define npyv_max_u8 _mm256_max_epu8
#define npyv_max_s8 _mm256_max_epi8
@@ -89,6 +104,20 @@ NPY_FINLINE npyv_f64 npyv_minp_f64(npyv_f64 a, npyv_f64 b)
__m256d min = _mm256_min_pd(a, b);
return _mm256_blendv_pd(a, min, nn);
}
+// Minimum, propagates NaNs
+// If any of the corresponding elements is NaN, NaN is set.
+NPY_FINLINE npyv_f32 npyv_minn_f32(npyv_f32 a, npyv_f32 b)
+{
+ __m256 nn = _mm256_cmp_ps(a, a, _CMP_ORD_Q);
+ __m256 min = _mm256_min_ps(a, b);
+ return _mm256_blendv_ps(a, min, nn);
+}
+NPY_FINLINE npyv_f64 npyv_minn_f64(npyv_f64 a, npyv_f64 b)
+{
+ __m256d nn = _mm256_cmp_pd(a, a, _CMP_ORD_Q);
+ __m256d min = _mm256_min_pd(a, b);
+ return _mm256_blendv_pd(a, min, nn);
+}
// Minimum, integer operations
#define npyv_min_u8 _mm256_min_epu8
#define npyv_min_s8 _mm256_min_epi8
@@ -104,6 +133,106 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b)
{
return _mm256_blendv_epi8(a, b, _mm256_cmpgt_epi64(a, b));
}
+// reduce min&max for 32&64-bits
+#define NPY_IMPL_AVX2_REDUCE_MINMAX(STYPE, INTRIN, VINTRIN) \
+ NPY_FINLINE STYPE##32 npyv_reduce_##INTRIN##32(__m256i a) \
+ { \
+ __m128i v128 = _mm_##VINTRIN##32(_mm256_castsi256_si128(a), _mm256_extracti128_si256(a, 1)); \
+ __m128i v64 = _mm_##VINTRIN##32(v128, _mm_shuffle_epi32(v128, _MM_SHUFFLE(0, 0, 3, 2))); \
+ __m128i v32 = _mm_##VINTRIN##32(v64, _mm_shuffle_epi32(v64, _MM_SHUFFLE(0, 0, 0, 1))); \
+ return (STYPE##32)_mm_cvtsi128_si32(v32); \
+ } \
+ NPY_FINLINE STYPE##64 npyv_reduce_##INTRIN##64(__m256i a) \
+ { \
+ __m256i v128 = npyv_##INTRIN##64(a, _mm256_permute2f128_si256(a, a, _MM_SHUFFLE(0, 0, 0, 1))); \
+ __m256i v64 = npyv_##INTRIN##64(v128, _mm256_shuffle_epi32(v128, _MM_SHUFFLE(0, 0, 3, 2))); \
+ return (STYPE##64)npyv_extract0_u64(v64); \
+ }
+NPY_IMPL_AVX2_REDUCE_MINMAX(npy_uint, min_u, min_epu)
+NPY_IMPL_AVX2_REDUCE_MINMAX(npy_int, min_s, min_epi)
+NPY_IMPL_AVX2_REDUCE_MINMAX(npy_uint, max_u, max_epu)
+NPY_IMPL_AVX2_REDUCE_MINMAX(npy_int, max_s, max_epi)
+#undef NPY_IMPL_AVX2_REDUCE_MINMAX
+
+// reduce min&max for ps & pd
+#define NPY_IMPL_AVX2_REDUCE_MINMAX(INTRIN, INF, INF64) \
+ NPY_FINLINE float npyv_reduce_##INTRIN##_f32(npyv_f32 a) \
+ { \
+ __m128 v128 = _mm_##INTRIN##_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 1)); \
+ __m128 v64 = _mm_##INTRIN##_ps(v128, _mm_shuffle_ps(v128, v128, _MM_SHUFFLE(0, 0, 3, 2))); \
+ __m128 v32 = _mm_##INTRIN##_ps(v64, _mm_shuffle_ps(v64, v64, _MM_SHUFFLE(0, 0, 0, 1))); \
+ return _mm_cvtss_f32(v32); \
+ } \
+ NPY_FINLINE double npyv_reduce_##INTRIN##_f64(npyv_f64 a) \
+ { \
+ __m128d v128 = _mm_##INTRIN##_pd(_mm256_castpd256_pd128(a), _mm256_extractf128_pd(a, 1)); \
+ __m128d v64 = _mm_##INTRIN##_pd(v128, _mm_shuffle_pd(v128, v128, _MM_SHUFFLE(0, 0, 0, 1))); \
+ return _mm_cvtsd_f64(v64); \
+ } \
+ NPY_FINLINE float npyv_reduce_##INTRIN##p_f32(npyv_f32 a) \
+ { \
+ npyv_b32 notnan = npyv_notnan_f32(a); \
+ if (NPY_UNLIKELY(!npyv_any_b32(notnan))) { \
+ return _mm_cvtss_f32(_mm256_castps256_ps128(a)); \
+ } \
+ a = npyv_select_f32(notnan, a, npyv_reinterpret_f32_u32(npyv_setall_u32(INF))); \
+ return npyv_reduce_##INTRIN##_f32(a); \
+ } \
+ NPY_FINLINE double npyv_reduce_##INTRIN##p_f64(npyv_f64 a) \
+ { \
+ npyv_b64 notnan = npyv_notnan_f64(a); \
+ if (NPY_UNLIKELY(!npyv_any_b64(notnan))) { \
+ return _mm_cvtsd_f64(_mm256_castpd256_pd128(a)); \
+ } \
+ a = npyv_select_f64(notnan, a, npyv_reinterpret_f64_u64(npyv_setall_u64(INF64))); \
+ return npyv_reduce_##INTRIN##_f64(a); \
+ } \
+ NPY_FINLINE float npyv_reduce_##INTRIN##n_f32(npyv_f32 a) \
+ { \
+ npyv_b32 notnan = npyv_notnan_f32(a); \
+ if (NPY_UNLIKELY(!npyv_all_b32(notnan))) { \
+ const union { npy_uint32 i; float f;} pnan = {0x7fc00000UL}; \
+ return pnan.f; \
+ } \
+ return npyv_reduce_##INTRIN##_f32(a); \
+ } \
+ NPY_FINLINE double npyv_reduce_##INTRIN##n_f64(npyv_f64 a) \
+ { \
+ npyv_b64 notnan = npyv_notnan_f64(a); \
+ if (NPY_UNLIKELY(!npyv_all_b64(notnan))) { \
+ const union { npy_uint64 i; double d;} pnan = {0x7ff8000000000000ull}; \
+ return pnan.d; \
+ } \
+ return npyv_reduce_##INTRIN##_f64(a); \
+ }
+NPY_IMPL_AVX2_REDUCE_MINMAX(min, 0x7f800000, 0x7ff0000000000000)
+NPY_IMPL_AVX2_REDUCE_MINMAX(max, 0xff800000, 0xfff0000000000000)
+#undef NPY_IMPL_AVX2_REDUCE_MINMAX
+
+// reduce min&max for 8&16-bits
+#define NPY_IMPL_AVX256_REDUCE_MINMAX(STYPE, INTRIN, VINTRIN) \
+ NPY_FINLINE STYPE##16 npyv_reduce_##INTRIN##16(__m256i a) \
+ { \
+ __m128i v128 = _mm_##VINTRIN##16(_mm256_castsi256_si128(a), _mm256_extracti128_si256(a, 1)); \
+ __m128i v64 = _mm_##VINTRIN##16(v128, _mm_shuffle_epi32(v128, _MM_SHUFFLE(0, 0, 3, 2))); \
+ __m128i v32 = _mm_##VINTRIN##16(v64, _mm_shuffle_epi32(v64, _MM_SHUFFLE(0, 0, 0, 1))); \
+ __m128i v16 = _mm_##VINTRIN##16(v32, _mm_shufflelo_epi16(v32, _MM_SHUFFLE(0, 0, 0, 1))); \
+ return (STYPE##16)_mm_cvtsi128_si32(v16); \
+ } \
+ NPY_FINLINE STYPE##8 npyv_reduce_##INTRIN##8(__m256i a) \
+ { \
+ __m128i v128 = _mm_##VINTRIN##8(_mm256_castsi256_si128(a), _mm256_extracti128_si256(a, 1)); \
+ __m128i v64 = _mm_##VINTRIN##8(v128, _mm_shuffle_epi32(v128, _MM_SHUFFLE(0, 0, 3, 2))); \
+ __m128i v32 = _mm_##VINTRIN##8(v64, _mm_shuffle_epi32(v64, _MM_SHUFFLE(0, 0, 0, 1))); \
+ __m128i v16 = _mm_##VINTRIN##8(v32, _mm_shufflelo_epi16(v32, _MM_SHUFFLE(0, 0, 0, 1))); \
+ __m128i v8 = _mm_##VINTRIN##8(v16, _mm_srli_epi16(v16, 8)); \
+ return (STYPE##16)_mm_cvtsi128_si32(v8); \
+ }
+NPY_IMPL_AVX256_REDUCE_MINMAX(npy_uint, min_u, min_epu)
+NPY_IMPL_AVX256_REDUCE_MINMAX(npy_int, min_s, min_epi)
+NPY_IMPL_AVX256_REDUCE_MINMAX(npy_uint, max_u, max_epu)
+NPY_IMPL_AVX256_REDUCE_MINMAX(npy_int, max_s, max_epi)
+#undef NPY_IMPL_AVX256_REDUCE_MINMAX
// round to nearest integer even
#define npyv_rint_f32(A) _mm256_round_ps(A, _MM_FROUND_TO_NEAREST_INT)
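
The difference between the existing ...p ("ignore NaN") and the new ...n ("propagate NaN") pairwise variants is easiest to see on a scalar example. A hedged sketch, not part of the patch, assuming NPY_NANF from "numpy/npy_math.h" and the npyv_setall_*/npyv_extract0_* helpers this merge defines:

#include "numpy/npy_math.h"   // NPY_NANF
#include "simd/simd.h"

#if NPY_SIMD
static void demo_nan_handling(void)
{
    npyv_f32 a = npyv_setall_f32(NPY_NANF);
    npyv_f32 b = npyv_setall_f32(1.0f);
    // maxp: NaN lanes of `a` fall back to the other operand -> 1.0f
    float p = npyv_extract0_f32(npyv_maxp_f32(a, b));
    // maxn: a NaN in either operand propagates -> NaN
    float n = npyv_extract0_f32(npyv_maxn_f32(a, b));
    (void)p; (void)n;
}
#endif
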
diff --git a/numpy/core/src/common/simd/avx2/misc.h b/numpy/core/src/common/simd/avx2/misc.h
index 5e91e91b3..41e788c75 100644
--- a/numpy/core/src/common/simd/avx2/misc.h
+++ b/numpy/core/src/common/simd/avx2/misc.h
@@ -31,7 +31,7 @@ NPY_FINLINE __m256i npyv__setr_epi64(npy_int64, npy_int64, npy_int64, npy_int64)
NPY_FINLINE npyv_u64 npyv_setall_u64(npy_uint64 a)
{
npy_int64 ai = (npy_int64)a;
-#if defined(_MSC_VER) && defined(_M_IX86)
+#if defined(_MSC_VER) && defined(_M_IX86)
return npyv__setr_epi64(ai, ai, ai, ai);
#else
return _mm256_set1_epi64x(ai);
@@ -130,6 +130,18 @@ NPY_FINLINE __m256d npyv__setr_pd(double i0, double i1, double i2, double i3)
#define npyv_select_f32(MASK, A, B) _mm256_blendv_ps(B, A, _mm256_castsi256_ps(MASK))
#define npyv_select_f64(MASK, A, B) _mm256_blendv_pd(B, A, _mm256_castsi256_pd(MASK))
+// extract the vector's first lane
+#define npyv_extract0_u8(A) ((npy_uint8)_mm_cvtsi128_si32(_mm256_castsi256_si128(A)))
+#define npyv_extract0_s8(A) ((npy_int8)_mm_cvtsi128_si32(_mm256_castsi256_si128(A)))
+#define npyv_extract0_u16(A) ((npy_uint16)_mm_cvtsi128_si32(_mm256_castsi256_si128(A)))
+#define npyv_extract0_s16(A) ((npy_int16)_mm_cvtsi128_si32(_mm256_castsi256_si128(A)))
+#define npyv_extract0_u32(A) ((npy_uint32)_mm_cvtsi128_si32(_mm256_castsi256_si128(A)))
+#define npyv_extract0_s32(A) ((npy_int32)_mm_cvtsi128_si32(_mm256_castsi256_si128(A)))
+#define npyv_extract0_u64(A) ((npy_uint64)npyv128_cvtsi128_si64(_mm256_castsi256_si128(A)))
+#define npyv_extract0_s64(A) ((npy_int64)npyv128_cvtsi128_si64(_mm256_castsi256_si128(A)))
+#define npyv_extract0_f32(A) _mm_cvtss_f32(_mm256_castps256_ps128(A))
+#define npyv_extract0_f64(A) _mm_cvtsd_f64(_mm256_castpd256_pd128(A))
+
// Reinterpret
#define npyv_reinterpret_u8_u8(X) X
#define npyv_reinterpret_u8_s8(X) X
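
On every backend npyv_extract0_* compiles down to a cast or register move of lane 0, so it replaces the old pattern of storing the vector to a stack buffer just to read its first element. A minimal sketch (not from the patch), assuming npyv_load_u32 from the existing memory API:

#if NPY_SIMD
static npy_uint32 demo_first_lane(const npy_uint32 *src)
{
    npyv_u32 v = npyv_load_u32(src);
    return npyv_extract0_u32(v);   // lane 0, no memory round-trip
}
#endif
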
diff --git a/numpy/core/src/common/simd/avx2/operators.h b/numpy/core/src/common/simd/avx2/operators.h
index 7682b24cb..c10267b21 100644
--- a/numpy/core/src/common/simd/avx2/operators.h
+++ b/numpy/core/src/common/simd/avx2/operators.h
@@ -225,4 +225,58 @@ NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a)
NPY_FINLINE npyv_b64 npyv_notnan_f64(npyv_f64 a)
{ return _mm256_castpd_si256(_mm256_cmp_pd(a, a, _CMP_ORD_Q)); }
+// Test across all vector lanes
+// any: returns true if any of the elements is not equal to zero
+// all: returns true if all elements are not equal to zero
+#define NPYV_IMPL_AVX2_ANYALL(SFX) \
+ NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a) \
+ { return _mm256_movemask_epi8(a) != 0; } \
+ NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a) \
+ { return _mm256_movemask_epi8(a) == -1; }
+NPYV_IMPL_AVX2_ANYALL(b8)
+NPYV_IMPL_AVX2_ANYALL(b16)
+NPYV_IMPL_AVX2_ANYALL(b32)
+NPYV_IMPL_AVX2_ANYALL(b64)
+#undef NPYV_IMPL_AVX2_ANYALL
+
+#define NPYV_IMPL_AVX2_ANYALL(SFX) \
+ NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a) \
+ { \
+ return _mm256_movemask_epi8( \
+ npyv_cmpeq_##SFX(a, npyv_zero_##SFX()) \
+ ) != -1; \
+ } \
+ NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a) \
+ { \
+ return _mm256_movemask_epi8( \
+ npyv_cmpeq_##SFX(a, npyv_zero_##SFX()) \
+ ) == 0; \
+ }
+NPYV_IMPL_AVX2_ANYALL(u8)
+NPYV_IMPL_AVX2_ANYALL(s8)
+NPYV_IMPL_AVX2_ANYALL(u16)
+NPYV_IMPL_AVX2_ANYALL(s16)
+NPYV_IMPL_AVX2_ANYALL(u32)
+NPYV_IMPL_AVX2_ANYALL(s32)
+NPYV_IMPL_AVX2_ANYALL(u64)
+NPYV_IMPL_AVX2_ANYALL(s64)
+#undef NPYV_IMPL_AVX2_ANYALL
+
+#define NPYV_IMPL_AVX2_ANYALL(SFX, XSFX, MASK) \
+ NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a) \
+ { \
+ return _mm256_movemask_##XSFX( \
+ _mm256_cmp_##XSFX(a, npyv_zero_##SFX(), _CMP_EQ_OQ) \
+ ) != MASK; \
+ } \
+ NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a) \
+ { \
+ return _mm256_movemask_##XSFX( \
+ _mm256_cmp_##XSFX(a, npyv_zero_##SFX(), _CMP_EQ_OQ) \
+ ) == 0; \
+ }
+NPYV_IMPL_AVX2_ANYALL(f32, ps, 0xff)
+NPYV_IMPL_AVX2_ANYALL(f64, pd, 0xf)
+#undef NPYV_IMPL_AVX2_ANYALL
+
#endif // _NPY_SIMD_AVX2_OPERATORS_H
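
The any/all helpers come in two flavours: for boolean vectors (b8..b64) they test the mask bits directly, while for numeric vectors they test whether lanes are non-zero, as the cmpeq-against-zero implementations above show. A short hedged sketch of both, reusing only names introduced by this patch or already present in operators.h:

#if NPY_SIMD
// Boolean flavour: operate on a mask produced by a comparison.
static int demo_all_finite(npyv_f32 v)
{ return (int)npyv_all_b32(npyv_notnan_f32(v)); }

// Numeric flavour: true only if every lane is non-zero.
static int demo_all_nonzero(npyv_u32 v)
{ return (int)npyv_all_u32(v); }
#endif
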
diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h b/numpy/core/src/common/simd/avx512/arithmetic.h
index 850a0c05a..1290dc0ad 100644
--- a/numpy/core/src/common/simd/avx512/arithmetic.h
+++ b/numpy/core/src/common/simd/avx512/arithmetic.h
@@ -371,79 +371,7 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
#define npyv_sum_u64 _mm512_reduce_add_epi64
#define npyv_sum_f32 _mm512_reduce_add_ps
#define npyv_sum_f64 _mm512_reduce_add_pd
- #define npyv_reducemin_u32 _mm512_reduce_min_epu32
- #define npyv_reducemin_s32 _mm512_reduce_min_epi32
- #define npyv_reducemin_f32 _mm512_reduce_min_ps
- #define npyv_reducemax_u32 _mm512_reduce_max_epu32
- #define npyv_reducemax_s32 _mm512_reduce_max_epi32
- #define npyv_reducemax_f32 _mm512_reduce_max_ps
#else
- NPY_FINLINE npy_uint32 npyv_reducemax_u32(npyv_u32 a)
- {
- const npyv_u32 idx1 = _mm512_set_epi32(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
- const npyv_u32 idx2 = _mm512_set_epi32(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
- npyv_u32 a1 = _mm512_max_epu32(a, _mm512_permutex2var_epi32(a, idx1, a));
- npyv_u32 a2 = _mm512_max_epu32(a1, _mm512_permutex2var_epi32(a1, idx2, a1));
- npyv_u32 a3 = _mm512_max_epu32(a2, _mm512_shuffle_epi32(a2, (_MM_PERM_ENUM)(1<<6 | 0<<4 | 3<<2 | 2)));
- npyv_u32 a4 = _mm512_max_epu32(a3, _mm512_shuffle_epi32(a3, (_MM_PERM_ENUM)(2<<6 | 3<<4 | 0<<2 | 1)));
- return _mm_cvtsi128_si32(_mm512_extracti32x4_epi32(a4, 0x00));
- }
-
- NPY_FINLINE npy_int32 npyv_reducemax_s32(npyv_s32 a)
- {
- const npyv_u32 idx1 = _mm512_set_epi32(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
- const npyv_u32 idx2 = _mm512_set_epi32(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
- npyv_s32 a1 = _mm512_max_epi32(a, _mm512_permutex2var_epi32(a, idx1, a));
- npyv_s32 a2 = _mm512_max_epi32(a1, _mm512_permutex2var_epi32(a1, idx2, a1));
- npyv_s32 a3 = _mm512_max_epi32(a2, _mm512_shuffle_epi32(a2, (_MM_PERM_ENUM)(1<<6 | 0<<4 | 3<<2 | 2)));
- npyv_s32 a4 = _mm512_max_epi32(a3, _mm512_shuffle_epi32(a3, (_MM_PERM_ENUM)(2<<6 | 3<<4 | 0<<2 | 1)));
- return _mm_cvtsi128_si32(_mm512_extracti32x4_epi32(a4, 0x00));
- }
-
- NPY_FINLINE npy_float npyv_reducemax_f32(npyv_f32 a)
- {
- const npyv_u32 idx1 = _mm512_set_epi32(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
- const npyv_u32 idx2 = _mm512_set_epi32(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
- npyv_f32 a1 = _mm512_max_ps(a, _mm512_permutex2var_ps(a, idx1, a));
- npyv_f32 a2 = _mm512_max_ps(a1, _mm512_permutex2var_ps(a1, idx2, a1));
- npyv_f32 a3 = _mm512_max_ps(a2, _mm512_shuffle_ps(a2, a2, (_MM_PERM_ENUM)(1<<6 | 0<<4 | 3<<2 | 2)));
- npyv_f32 a4 = _mm512_max_ps(a3, _mm512_shuffle_ps(a3, a3, (_MM_PERM_ENUM)(2<<6 | 3<<4 | 0<<2 | 1)));
- return _mm_cvtss_f32(_mm512_extractf32x4_ps(a4, 0x00));
- }
-
- NPY_FINLINE npy_uint32 npyv_reducemin_u32(npyv_u32 a)
- {
- const npyv_u32 idx1 = _mm512_set_epi32(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
- const npyv_u32 idx2 = _mm512_set_epi32(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
- npyv_u32 a1 = _mm512_min_epu32(a, _mm512_permutex2var_epi32(a, idx1, a));
- npyv_u32 a2 = _mm512_min_epu32(a1, _mm512_permutex2var_epi32(a1, idx2, a1));
- npyv_u32 a3 = _mm512_min_epu32(a2, _mm512_shuffle_epi32(a2, (_MM_PERM_ENUM)(1<<6 | 0<<4 | 3<<2 | 2)));
- npyv_u32 a4 = _mm512_min_epu32(a3, _mm512_shuffle_epi32(a3, (_MM_PERM_ENUM)(2<<6 | 3<<4 | 0<<2 | 1)));
- return _mm_cvtsi128_si32(_mm512_extracti32x4_epi32(a4, 0x00));
- }
-
- NPY_FINLINE npy_int32 npyv_reducemin_s32(npyv_s32 a)
- {
- const npyv_u32 idx1 = _mm512_set_epi32(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
- const npyv_u32 idx2 = _mm512_set_epi32(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
- npyv_s32 a1 = _mm512_min_epi32(a, _mm512_permutex2var_epi32(a, idx1, a));
- npyv_s32 a2 = _mm512_min_epi32(a1, _mm512_permutex2var_epi32(a1, idx2, a1));
- npyv_s32 a3 = _mm512_min_epi32(a2, _mm512_shuffle_epi32(a2, (_MM_PERM_ENUM)(1<<6 | 0<<4 | 3<<2 | 2)));
- npyv_s32 a4 = _mm512_min_epi32(a3, _mm512_shuffle_epi32(a3, (_MM_PERM_ENUM)(2<<6 | 3<<4 | 0<<2 | 1)));
- return _mm_cvtsi128_si32(_mm512_extracti32x4_epi32(a4, 0x00));
- }
-
- NPY_FINLINE npy_float npyv_reducemin_f32(npyv_f32 a)
- {
- const npyv_u32 idx1 = _mm512_set_epi32(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
- const npyv_u32 idx2 = _mm512_set_epi32(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
- npyv_f32 a1 = _mm512_min_ps(a, _mm512_permutex2var_ps(a, idx1, a));
- npyv_f32 a2 = _mm512_min_ps(a1, _mm512_permutex2var_ps(a1, idx2, a1));
- npyv_f32 a3 = _mm512_min_ps(a2, _mm512_shuffle_ps(a2, a2, (_MM_PERM_ENUM)(1<<6 | 0<<4 | 3<<2 | 2)));
- npyv_f32 a4 = _mm512_min_ps(a3, _mm512_shuffle_ps(a3, a3, (_MM_PERM_ENUM)(2<<6 | 3<<4 | 0<<2 | 1)));
- return _mm_cvtss_f32(_mm512_extractf32x4_ps(a4, 0x00));
- }
-
NPY_FINLINE npy_uint32 npyv_sum_u32(npyv_u32 a)
{
__m256i half = _mm256_add_epi32(npyv512_lower_si256(a), npyv512_higher_si256(a));
@@ -483,6 +411,7 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
__m512d sum8 = _mm512_add_pd(sum16, h16);
return _mm_cvtsd_f64(_mm512_castpd512_pd128(sum8));
}
+
#endif
// expand the source vector and performs sum reduce
diff --git a/numpy/core/src/common/simd/avx512/math.h b/numpy/core/src/common/simd/avx512/math.h
index 5a6cb6dcd..97fd2d641 100644
--- a/numpy/core/src/common/simd/avx512/math.h
+++ b/numpy/core/src/common/simd/avx512/math.h
@@ -62,6 +62,18 @@ NPY_FINLINE npyv_f64 npyv_maxp_f64(npyv_f64 a, npyv_f64 b)
__mmask8 nn = _mm512_cmp_pd_mask(b, b, _CMP_ORD_Q);
return _mm512_mask_max_pd(a, nn, a, b);
}
+// Maximum, propagates NaNs
+// If any of the corresponding elements is NaN, NaN is set.
+NPY_FINLINE npyv_f32 npyv_maxn_f32(npyv_f32 a, npyv_f32 b)
+{
+ __mmask16 nn = _mm512_cmp_ps_mask(a, a, _CMP_ORD_Q);
+ return _mm512_mask_max_ps(a, nn, a, b);
+}
+NPY_FINLINE npyv_f64 npyv_maxn_f64(npyv_f64 a, npyv_f64 b)
+{
+ __mmask8 nn = _mm512_cmp_pd_mask(a, a, _CMP_ORD_Q);
+ return _mm512_mask_max_pd(a, nn, a, b);
+}
// Maximum, integer operations
#ifdef NPY_HAVE_AVX512BW
#define npyv_max_u8 _mm512_max_epu8
@@ -95,6 +107,18 @@ NPY_FINLINE npyv_f64 npyv_minp_f64(npyv_f64 a, npyv_f64 b)
__mmask8 nn = _mm512_cmp_pd_mask(b, b, _CMP_ORD_Q);
return _mm512_mask_min_pd(a, nn, a, b);
}
+// Minimum, propagates NaNs
+// If any of the corresponding elements is NaN, NaN is set.
+NPY_FINLINE npyv_f32 npyv_minn_f32(npyv_f32 a, npyv_f32 b)
+{
+ __mmask16 nn = _mm512_cmp_ps_mask(a, a, _CMP_ORD_Q);
+ return _mm512_mask_min_ps(a, nn, a, b);
+}
+NPY_FINLINE npyv_f64 npyv_minn_f64(npyv_f64 a, npyv_f64 b)
+{
+ __mmask8 nn = _mm512_cmp_pd_mask(a, a, _CMP_ORD_Q);
+ return _mm512_mask_min_pd(a, nn, a, b);
+}
// Minimum, integer operations
#ifdef NPY_HAVE_AVX512BW
#define npyv_min_u8 _mm512_min_epu8
@@ -112,6 +136,160 @@ NPY_FINLINE npyv_f64 npyv_minp_f64(npyv_f64 a, npyv_f64 b)
#define npyv_min_u64 _mm512_min_epu64
#define npyv_min_s64 _mm512_min_epi64
+#ifdef NPY_HAVE_AVX512F_REDUCE
+ #define npyv_reduce_min_u32 _mm512_reduce_min_epu32
+ #define npyv_reduce_min_s32 _mm512_reduce_min_epi32
+ #define npyv_reduce_min_u64 _mm512_reduce_min_epu64
+ #define npyv_reduce_min_s64 _mm512_reduce_min_epi64
+ #define npyv_reduce_min_f32 _mm512_reduce_min_ps
+ #define npyv_reduce_min_f64 _mm512_reduce_min_pd
+ #define npyv_reduce_max_u32 _mm512_reduce_max_epu32
+ #define npyv_reduce_max_s32 _mm512_reduce_max_epi32
+ #define npyv_reduce_max_u64 _mm512_reduce_max_epu64
+ #define npyv_reduce_max_s64 _mm512_reduce_max_epi64
+ #define npyv_reduce_max_f32 _mm512_reduce_max_ps
+ #define npyv_reduce_max_f64 _mm512_reduce_max_pd
+#else
+ // reduce min&max for 32&64-bits
+ #define NPY_IMPL_AVX512_REDUCE_MINMAX(STYPE, INTRIN, VINTRIN) \
+ NPY_FINLINE STYPE##32 npyv_reduce_##INTRIN##32(__m512i a) \
+ { \
+ __m256i v256 = _mm256_##VINTRIN##32(npyv512_lower_si256(a), \
+ npyv512_higher_si256(a)); \
+ __m128i v128 = _mm_##VINTRIN##32(_mm256_castsi256_si128(v256), \
+ _mm256_extracti128_si256(v256, 1)); \
+ __m128i v64 = _mm_##VINTRIN##32(v128, _mm_shuffle_epi32(v128, \
+ (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 3, 2))); \
+ __m128i v32 = _mm_##VINTRIN##32(v64, _mm_shuffle_epi32(v64, \
+ (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 0, 1))); \
+ return (STYPE##32)_mm_cvtsi128_si32(v32); \
+ } \
+ NPY_FINLINE STYPE##64 npyv_reduce_##INTRIN##64(__m512i a) \
+ { \
+ __m512i v256 = _mm512_##VINTRIN##64(a, \
+ _mm512_shuffle_i64x2(a, a, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 3, 2))); \
+ __m512i v128 = _mm512_##VINTRIN##64(v256, \
+ _mm512_shuffle_i64x2(v256, v256, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 0, 1))); \
+ __m512i v64 = _mm512_##VINTRIN##64(v128, \
+ _mm512_shuffle_epi32(v128, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 3, 2))); \
+ return (STYPE##64)npyv_extract0_u64(v64); \
+ }
+
+ NPY_IMPL_AVX512_REDUCE_MINMAX(npy_uint, min_u, min_epu)
+ NPY_IMPL_AVX512_REDUCE_MINMAX(npy_int, min_s, min_epi)
+ NPY_IMPL_AVX512_REDUCE_MINMAX(npy_uint, max_u, max_epu)
+ NPY_IMPL_AVX512_REDUCE_MINMAX(npy_int, max_s, max_epi)
+ #undef NPY_IMPL_AVX512_REDUCE_MINMAX
+ // reduce min&max for ps & pd
+ #define NPY_IMPL_AVX512_REDUCE_MINMAX(INTRIN) \
+ NPY_FINLINE float npyv_reduce_##INTRIN##_f32(npyv_f32 a) \
+ { \
+ __m256 v256 = _mm256_##INTRIN##_ps( \
+ npyv512_lower_ps256(a), npyv512_higher_ps256(a)); \
+ __m128 v128 = _mm_##INTRIN##_ps( \
+ _mm256_castps256_ps128(v256), _mm256_extractf128_ps(v256, 1)); \
+ __m128 v64 = _mm_##INTRIN##_ps(v128, \
+ _mm_shuffle_ps(v128, v128, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 3, 2))); \
+ __m128 v32 = _mm_##INTRIN##_ps(v64, \
+ _mm_shuffle_ps(v64, v64, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 0, 1))); \
+ return _mm_cvtss_f32(v32); \
+ } \
+ NPY_FINLINE double npyv_reduce_##INTRIN##_f64(npyv_f64 a) \
+ { \
+ __m256d v256 = _mm256_##INTRIN##_pd( \
+ npyv512_lower_pd256(a), npyv512_higher_pd256(a)); \
+ __m128d v128 = _mm_##INTRIN##_pd( \
+ _mm256_castpd256_pd128(v256), _mm256_extractf128_pd(v256, 1)); \
+ __m128d v64 = _mm_##INTRIN##_pd(v128, \
+ _mm_shuffle_pd(v128, v128, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 0, 1))); \
+ return _mm_cvtsd_f64(v64); \
+ }
+
+ NPY_IMPL_AVX512_REDUCE_MINMAX(min)
+ NPY_IMPL_AVX512_REDUCE_MINMAX(max)
+ #undef NPY_IMPL_AVX512_REDUCE_MINMAX
+#endif
+#define NPY_IMPL_AVX512_REDUCE_MINMAX(INTRIN, INF, INF64) \
+ NPY_FINLINE float npyv_reduce_##INTRIN##p_f32(npyv_f32 a) \
+ { \
+ npyv_b32 notnan = npyv_notnan_f32(a); \
+ if (NPY_UNLIKELY(!npyv_any_b32(notnan))) { \
+ return _mm_cvtss_f32(_mm512_castps512_ps128(a)); \
+ } \
+ a = npyv_select_f32(notnan, a, \
+ npyv_reinterpret_f32_u32(npyv_setall_u32(INF))); \
+ return npyv_reduce_##INTRIN##_f32(a); \
+ } \
+ NPY_FINLINE double npyv_reduce_##INTRIN##p_f64(npyv_f64 a) \
+ { \
+ npyv_b64 notnan = npyv_notnan_f64(a); \
+ if (NPY_UNLIKELY(!npyv_any_b64(notnan))) { \
+ return _mm_cvtsd_f64(_mm512_castpd512_pd128(a)); \
+ } \
+ a = npyv_select_f64(notnan, a, \
+ npyv_reinterpret_f64_u64(npyv_setall_u64(INF64))); \
+ return npyv_reduce_##INTRIN##_f64(a); \
+ } \
+ NPY_FINLINE float npyv_reduce_##INTRIN##n_f32(npyv_f32 a) \
+ { \
+ npyv_b32 notnan = npyv_notnan_f32(a); \
+ if (NPY_UNLIKELY(!npyv_all_b32(notnan))) { \
+ const union { npy_uint32 i; float f;} pnan = { \
+ 0x7fc00000ul \
+ }; \
+ return pnan.f; \
+ } \
+ return npyv_reduce_##INTRIN##_f32(a); \
+ } \
+ NPY_FINLINE double npyv_reduce_##INTRIN##n_f64(npyv_f64 a) \
+ { \
+ npyv_b64 notnan = npyv_notnan_f64(a); \
+ if (NPY_UNLIKELY(!npyv_all_b64(notnan))) { \
+ const union { npy_uint64 i; double d;} pnan = { \
+ 0x7ff8000000000000ull \
+ }; \
+ return pnan.d; \
+ } \
+ return npyv_reduce_##INTRIN##_f64(a); \
+ }
+
+NPY_IMPL_AVX512_REDUCE_MINMAX(min, 0x7f800000, 0x7ff0000000000000)
+NPY_IMPL_AVX512_REDUCE_MINMAX(max, 0xff800000, 0xfff0000000000000)
+#undef NPY_IMPL_AVX512_REDUCE_MINMAX
+
+// reduce min&max for 8&16-bits
+#define NPY_IMPL_AVX512_REDUCE_MINMAX(STYPE, INTRIN, VINTRIN) \
+ NPY_FINLINE STYPE##16 npyv_reduce_##INTRIN##16(__m512i a) \
+ { \
+ __m256i v256 = _mm256_##VINTRIN##16(npyv512_lower_si256(a), npyv512_higher_si256(a)); \
+ __m128i v128 = _mm_##VINTRIN##16(_mm256_castsi256_si128(v256), _mm256_extracti128_si256(v256, 1)); \
+ __m128i v64 = _mm_##VINTRIN##16(v128, _mm_shuffle_epi32(v128, \
+ (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 3, 2))); \
+ __m128i v32 = _mm_##VINTRIN##16(v64, _mm_shuffle_epi32(v64, \
+ (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 0, 1))); \
+ __m128i v16 = _mm_##VINTRIN##16(v32, _mm_shufflelo_epi16(v32, \
+ (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 0, 1))); \
+ return (STYPE##16)_mm_cvtsi128_si32(v16); \
+ } \
+ NPY_FINLINE STYPE##8 npyv_reduce_##INTRIN##8(__m512i a) \
+ { \
+ __m256i v256 = _mm256_##VINTRIN##8(npyv512_lower_si256(a), npyv512_higher_si256(a)); \
+ __m128i v128 = _mm_##VINTRIN##8(_mm256_castsi256_si128(v256), _mm256_extracti128_si256(v256, 1)); \
+ __m128i v64 = _mm_##VINTRIN##8(v128, _mm_shuffle_epi32(v128, \
+ (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 3, 2))); \
+ __m128i v32 = _mm_##VINTRIN##8(v64, _mm_shuffle_epi32(v64, \
+ (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 0, 1))); \
+ __m128i v16 = _mm_##VINTRIN##8(v32, _mm_shufflelo_epi16(v32, \
+ (_MM_PERM_ENUM)_MM_SHUFFLE(0, 0, 0, 1))); \
+ __m128i v8 = _mm_##VINTRIN##8(v16, _mm_srli_epi16(v16, 8)); \
+ return (STYPE##16)_mm_cvtsi128_si32(v8); \
+ }
+NPY_IMPL_AVX512_REDUCE_MINMAX(npy_uint, min_u, min_epu)
+NPY_IMPL_AVX512_REDUCE_MINMAX(npy_int, min_s, min_epi)
+NPY_IMPL_AVX512_REDUCE_MINMAX(npy_uint, max_u, max_epu)
+NPY_IMPL_AVX512_REDUCE_MINMAX(npy_int, max_s, max_epi)
+#undef NPY_IMPL_AVX512_REDUCE_MINMAX
+
// round to nearest integer even
#define npyv_rint_f32(A) _mm512_roundscale_ps(A, _MM_FROUND_TO_NEAREST_INT)
#define npyv_rint_f64(A) _mm512_roundscale_pd(A, _MM_FROUND_TO_NEAREST_INT)
diff --git a/numpy/core/src/common/simd/avx512/misc.h b/numpy/core/src/common/simd/avx512/misc.h
index c3039ecfe..d9190870e 100644
--- a/numpy/core/src/common/simd/avx512/misc.h
+++ b/numpy/core/src/common/simd/avx512/misc.h
@@ -34,7 +34,7 @@ NPY_FINLINE __m512i npyv__setr_epi64(
NPY_FINLINE npyv_u64 npyv_setall_u64(npy_uint64 a)
{
npy_int64 ai = (npy_int64)a;
-#if defined(_MSC_VER) && defined(_M_IX86)
+#if defined(_MSC_VER) && defined(_M_IX86)
return npyv__setr_epi64(ai, ai, ai, ai, ai, ai, ai, ai);
#else
return _mm512_set1_epi64(ai);
@@ -160,6 +160,18 @@ NPY_FINLINE __m512d npyv__setr_pd(double i0, double i1, double i2, double i3,
#define npyv_select_f32(MASK, A, B) _mm512_mask_blend_ps(MASK, B, A)
#define npyv_select_f64(MASK, A, B) _mm512_mask_blend_pd(MASK, B, A)
+// extract the vector's first lane
+#define npyv_extract0_u8(A) ((npy_uint8)_mm_cvtsi128_si32(_mm512_castsi512_si128(A)))
+#define npyv_extract0_s8(A) ((npy_int8)_mm_cvtsi128_si32(_mm512_castsi512_si128(A)))
+#define npyv_extract0_u16(A) ((npy_uint16)_mm_cvtsi128_si32(_mm512_castsi512_si128(A)))
+#define npyv_extract0_s16(A) ((npy_int16)_mm_cvtsi128_si32(_mm512_castsi512_si128(A)))
+#define npyv_extract0_u32(A) ((npy_uint32)_mm_cvtsi128_si32(_mm512_castsi512_si128(A)))
+#define npyv_extract0_s32(A) ((npy_int32)_mm_cvtsi128_si32(_mm512_castsi512_si128(A)))
+#define npyv_extract0_u64(A) ((npy_uint64)npyv128_cvtsi128_si64(_mm512_castsi512_si128(A)))
+#define npyv_extract0_s64(A) ((npy_int64)npyv128_cvtsi128_si64(_mm512_castsi512_si128(A)))
+#define npyv_extract0_f32(A) _mm_cvtss_f32(_mm512_castps512_ps128(A))
+#define npyv_extract0_f64(A) _mm_cvtsd_f64(_mm512_castpd512_pd128(A))
+
// reinterpret
#define npyv_reinterpret_u8_u8(X) X
#define npyv_reinterpret_u8_s8(X) X
diff --git a/numpy/core/src/common/simd/avx512/operators.h b/numpy/core/src/common/simd/avx512/operators.h
index 804cd24e8..c70932d5f 100644
--- a/numpy/core/src/common/simd/avx512/operators.h
+++ b/numpy/core/src/common/simd/avx512/operators.h
@@ -5,6 +5,8 @@
#ifndef _NPY_SIMD_AVX512_OPERATORS_H
#define _NPY_SIMD_AVX512_OPERATORS_H
+#include "conversion.h" // tobits
+
/***************************
* Shifting
***************************/
@@ -336,4 +338,43 @@ NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a)
NPY_FINLINE npyv_b64 npyv_notnan_f64(npyv_f64 a)
{ return _mm512_cmp_pd_mask(a, a, _CMP_ORD_Q); }
+// Test across all vector lanes
+// any: returns true if any of the elements is not equal to zero
+// all: returns true if all elements are not equal to zero
+#define NPYV_IMPL_AVX512_ANYALL(SFX, MASK) \
+ NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a) \
+ { return npyv_tobits_##SFX(a) != 0; } \
+ NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a) \
+ { return npyv_tobits_##SFX(a) == MASK; }
+NPYV_IMPL_AVX512_ANYALL(b8, 0xffffffffffffffffull)
+NPYV_IMPL_AVX512_ANYALL(b16, 0xfffffffful)
+NPYV_IMPL_AVX512_ANYALL(b32, 0xffff)
+NPYV_IMPL_AVX512_ANYALL(b64, 0xff)
+#undef NPYV_IMPL_AVX512_ANYALL
+
+#define NPYV_IMPL_AVX512_ANYALL(SFX, BSFX, MASK) \
+ NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a) \
+ { \
+ return npyv_tobits_##BSFX( \
+ npyv_cmpeq_##SFX(a, npyv_zero_##SFX()) \
+ ) != MASK; \
+ } \
+ NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a) \
+ { \
+ return npyv_tobits_##BSFX( \
+ npyv_cmpeq_##SFX(a, npyv_zero_##SFX()) \
+ ) == 0; \
+ }
+NPYV_IMPL_AVX512_ANYALL(u8, b8, 0xffffffffffffffffull)
+NPYV_IMPL_AVX512_ANYALL(s8, b8, 0xffffffffffffffffull)
+NPYV_IMPL_AVX512_ANYALL(u16, b16, 0xfffffffful)
+NPYV_IMPL_AVX512_ANYALL(s16, b16, 0xfffffffful)
+NPYV_IMPL_AVX512_ANYALL(u32, b32, 0xffff)
+NPYV_IMPL_AVX512_ANYALL(s32, b32, 0xffff)
+NPYV_IMPL_AVX512_ANYALL(u64, b64, 0xff)
+NPYV_IMPL_AVX512_ANYALL(s64, b64, 0xff)
+NPYV_IMPL_AVX512_ANYALL(f32, b32, 0xffff)
+NPYV_IMPL_AVX512_ANYALL(f64, b64, 0xff)
+#undef NPYV_IMPL_AVX512_ANYALL
+
#endif // _NPY_SIMD_AVX512_OPERATORS_H
diff --git a/numpy/core/src/common/simd/neon/math.h b/numpy/core/src/common/simd/neon/math.h
index 8f4680c8f..c0a771b5d 100644
--- a/numpy/core/src/common/simd/neon/math.h
+++ b/numpy/core/src/common/simd/neon/math.h
@@ -99,8 +99,12 @@ NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a)
return vmaxq_f32(vbslq_f32(nn_a, a, b), vbslq_f32(nn_b, b, a));
}
#endif
+// Max, propagates NaNs
+// If any of the corresponding elements is NaN, NaN is set.
+#define npyv_maxn_f32 vmaxq_f32
#if NPY_SIMD_F64
#define npyv_maxp_f64 vmaxnmq_f64
+ #define npyv_maxn_f64 vmaxq_f64
#endif // NPY_SIMD_F64
// Maximum, integer operations
#define npyv_max_u8 vmaxq_u8
@@ -134,9 +138,14 @@ NPY_FINLINE npyv_s64 npyv_max_s64(npyv_s64 a, npyv_s64 b)
return vminq_f32(vbslq_f32(nn_a, a, b), vbslq_f32(nn_b, b, a));
}
#endif
+// Min, propagates NaNs
+// If any of the corresponding elements is NaN, NaN is set.
+#define npyv_minn_f32 vminq_f32
#if NPY_SIMD_F64
#define npyv_minp_f64 vminnmq_f64
+ #define npyv_minn_f64 vminq_f64
#endif // NPY_SIMD_F64
+
// Minimum, integer operations
#define npyv_min_u8 vminq_u8
#define npyv_min_s8 vminq_s8
@@ -152,6 +161,115 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b)
{
return vbslq_s64(npyv_cmplt_s64(a, b), a, b);
}
+// reduce min/max for all data types
+#if NPY_SIMD_F64
+ #define npyv_reduce_max_u8 vmaxvq_u8
+ #define npyv_reduce_max_s8 vmaxvq_s8
+ #define npyv_reduce_max_u16 vmaxvq_u16
+ #define npyv_reduce_max_s16 vmaxvq_s16
+ #define npyv_reduce_max_u32 vmaxvq_u32
+ #define npyv_reduce_max_s32 vmaxvq_s32
+
+ #define npyv_reduce_max_f32 vmaxvq_f32
+ #define npyv_reduce_max_f64 vmaxvq_f64
+ #define npyv_reduce_maxn_f32 vmaxvq_f32
+ #define npyv_reduce_maxn_f64 vmaxvq_f64
+ #define npyv_reduce_maxp_f32 vmaxnmvq_f32
+ #define npyv_reduce_maxp_f64 vmaxnmvq_f64
+
+ #define npyv_reduce_min_u8 vminvq_u8
+ #define npyv_reduce_min_s8 vminvq_s8
+ #define npyv_reduce_min_u16 vminvq_u16
+ #define npyv_reduce_min_s16 vminvq_s16
+ #define npyv_reduce_min_u32 vminvq_u32
+ #define npyv_reduce_min_s32 vminvq_s32
+
+ #define npyv_reduce_min_f32 vminvq_f32
+ #define npyv_reduce_min_f64 vminvq_f64
+ #define npyv_reduce_minn_f32 vminvq_f32
+ #define npyv_reduce_minn_f64 vminvq_f64
+ #define npyv_reduce_minp_f32 vminnmvq_f32
+ #define npyv_reduce_minp_f64 vminnmvq_f64
+#else
+ #define NPY_IMPL_NEON_REDUCE_MINMAX(INTRIN, STYPE, SFX) \
+ NPY_FINLINE npy_##STYPE npyv_reduce_##INTRIN##_##SFX(npyv_##SFX a) \
+ { \
+ STYPE##x8_t r = vp##INTRIN##_##SFX(vget_low_##SFX(a), vget_high_##SFX(a)); \
+ r = vp##INTRIN##_##SFX(r, r); \
+ r = vp##INTRIN##_##SFX(r, r); \
+ r = vp##INTRIN##_##SFX(r, r); \
+ return (npy_##STYPE)vget_lane_##SFX(r, 0); \
+ }
+ NPY_IMPL_NEON_REDUCE_MINMAX(min, uint8, u8)
+ NPY_IMPL_NEON_REDUCE_MINMAX(max, uint8, u8)
+ NPY_IMPL_NEON_REDUCE_MINMAX(min, int8, s8)
+ NPY_IMPL_NEON_REDUCE_MINMAX(max, int8, s8)
+ #undef NPY_IMPL_NEON_REDUCE_MINMAX
+
+ #define NPY_IMPL_NEON_REDUCE_MINMAX(INTRIN, STYPE, SFX) \
+ NPY_FINLINE npy_##STYPE npyv_reduce_##INTRIN##_##SFX(npyv_##SFX a) \
+ { \
+ STYPE##x4_t r = vp##INTRIN##_##SFX(vget_low_##SFX(a), vget_high_##SFX(a)); \
+ r = vp##INTRIN##_##SFX(r, r); \
+ r = vp##INTRIN##_##SFX(r, r); \
+ return (npy_##STYPE)vget_lane_##SFX(r, 0); \
+ }
+ NPY_IMPL_NEON_REDUCE_MINMAX(min, uint16, u16)
+ NPY_IMPL_NEON_REDUCE_MINMAX(max, uint16, u16)
+ NPY_IMPL_NEON_REDUCE_MINMAX(min, int16, s16)
+ NPY_IMPL_NEON_REDUCE_MINMAX(max, int16, s16)
+ #undef NPY_IMPL_NEON_REDUCE_MINMAX
+
+ #define NPY_IMPL_NEON_REDUCE_MINMAX(INTRIN, STYPE, SFX) \
+ NPY_FINLINE npy_##STYPE npyv_reduce_##INTRIN##_##SFX(npyv_##SFX a) \
+ { \
+ STYPE##x2_t r = vp##INTRIN##_##SFX(vget_low_##SFX(a), vget_high_##SFX(a)); \
+ r = vp##INTRIN##_##SFX(r, r); \
+ return (npy_##STYPE)vget_lane_##SFX(r, 0); \
+ }
+ NPY_IMPL_NEON_REDUCE_MINMAX(min, uint32, u32)
+ NPY_IMPL_NEON_REDUCE_MINMAX(max, uint32, u32)
+ NPY_IMPL_NEON_REDUCE_MINMAX(min, int32, s32)
+ NPY_IMPL_NEON_REDUCE_MINMAX(max, int32, s32)
+ #undef NPY_IMPL_NEON_REDUCE_MINMAX
+
+ #define NPY_IMPL_NEON_REDUCE_MINMAX(INTRIN, INF) \
+ NPY_FINLINE float npyv_reduce_##INTRIN##_f32(npyv_f32 a) \
+ { \
+ float32x2_t r = vp##INTRIN##_f32(vget_low_f32(a), vget_high_f32(a));\
+ r = vp##INTRIN##_f32(r, r); \
+ return vget_lane_f32(r, 0); \
+ } \
+ NPY_FINLINE float npyv_reduce_##INTRIN##p_f32(npyv_f32 a) \
+ { \
+ npyv_b32 notnan = npyv_notnan_f32(a); \
+ if (NPY_UNLIKELY(!npyv_any_b32(notnan))) { \
+ return vgetq_lane_f32(a, 0); \
+ } \
+ a = npyv_select_f32(notnan, a, \
+ npyv_reinterpret_f32_u32(npyv_setall_u32(INF))); \
+ return npyv_reduce_##INTRIN##_f32(a); \
+ } \
+ NPY_FINLINE float npyv_reduce_##INTRIN##n_f32(npyv_f32 a) \
+ { \
+ return npyv_reduce_##INTRIN##_f32(a); \
+ }
+ NPY_IMPL_NEON_REDUCE_MINMAX(min, 0x7f800000)
+ NPY_IMPL_NEON_REDUCE_MINMAX(max, 0xff800000)
+ #undef NPY_IMPL_NEON_REDUCE_MINMAX
+#endif // NPY_SIMD_F64
+#define NPY_IMPL_NEON_REDUCE_MINMAX(INTRIN, STYPE, SFX, OP) \
+ NPY_FINLINE STYPE npyv_reduce_##INTRIN##_##SFX(npyv_##SFX a) \
+ { \
+ STYPE al = (STYPE)vget_low_##SFX(a); \
+ STYPE ah = (STYPE)vget_high_##SFX(a); \
+ return al OP ah ? al : ah; \
+ }
+NPY_IMPL_NEON_REDUCE_MINMAX(max, npy_uint64, u64, >)
+NPY_IMPL_NEON_REDUCE_MINMAX(max, npy_int64, s64, >)
+NPY_IMPL_NEON_REDUCE_MINMAX(min, npy_uint64, u64, <)
+NPY_IMPL_NEON_REDUCE_MINMAX(min, npy_int64, s64, <)
+#undef NPY_IMPL_NEON_REDUCE_MINMAX
// round to nearest integer even
NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a)
diff --git a/numpy/core/src/common/simd/neon/misc.h b/numpy/core/src/common/simd/neon/misc.h
index 51b0c3858..5fe109c13 100644
--- a/numpy/core/src/common/simd/neon/misc.h
+++ b/numpy/core/src/common/simd/neon/misc.h
@@ -138,6 +138,18 @@ NPY_FINLINE float64x2_t npyv__set_f64(double i0, double i1)
#define npyv_select_f32 vbslq_f32
#define npyv_select_f64 vbslq_f64
+// extract the vector's first lane
+#define npyv_extract0_u8(A) ((npy_uint8)vgetq_lane_u8(A, 0))
+#define npyv_extract0_s8(A) ((npy_int8)vgetq_lane_s8(A, 0))
+#define npyv_extract0_u16(A) ((npy_uint16)vgetq_lane_u16(A, 0))
+#define npyv_extract0_s16(A) ((npy_int16)vgetq_lane_s16(A, 0))
+#define npyv_extract0_u32(A) ((npy_uint32)vgetq_lane_u32(A, 0))
+#define npyv_extract0_s32(A) ((npy_int32)vgetq_lane_s32(A, 0))
+#define npyv_extract0_u64(A) ((npy_uint64)vgetq_lane_u64(A, 0))
+#define npyv_extract0_s64(A) ((npy_int64)vgetq_lane_s64(A, 0))
+#define npyv_extract0_f32(A) vgetq_lane_f32(A, 0)
+#define npyv_extract0_f64(A) vgetq_lane_f64(A, 0)
+
// Reinterpret
#define npyv_reinterpret_u8_u8(X) X
#define npyv_reinterpret_u8_s8 vreinterpretq_u8_s8
diff --git a/numpy/core/src/common/simd/neon/operators.h b/numpy/core/src/common/simd/neon/operators.h
index a08fa5390..249621bd6 100644
--- a/numpy/core/src/common/simd/neon/operators.h
+++ b/numpy/core/src/common/simd/neon/operators.h
@@ -246,4 +246,129 @@ NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a)
{ return vceqq_f64(a, a); }
#endif
+// Test across all vector lanes
+// any: returns true if any of the elements is not equal to zero
+// all: returns true if all elements are not equal to zero
+#if NPY_SIMD_F64
+ #define NPYV_IMPL_NEON_ANYALL(LEN) \
+ NPY_FINLINE bool npyv_any_b##LEN(npyv_b##LEN a) \
+ { return vmaxvq_u##LEN(a) != 0; } \
+ NPY_FINLINE bool npyv_all_b##LEN(npyv_b##LEN a) \
+ { return vminvq_u##LEN(a) != 0; }
+ NPYV_IMPL_NEON_ANYALL(8)
+ NPYV_IMPL_NEON_ANYALL(16)
+ NPYV_IMPL_NEON_ANYALL(32)
+ #undef NPYV_IMPL_NEON_ANYALL
+
+ #define NPYV_IMPL_NEON_ANYALL(SFX, USFX, BSFX) \
+ NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a) \
+ { return npyv_any_##BSFX(npyv_reinterpret_##USFX##_##SFX(a)); } \
+ NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a) \
+ { return npyv_all_##BSFX(npyv_reinterpret_##USFX##_##SFX(a)); }
+ NPYV_IMPL_NEON_ANYALL(u8, u8, b8)
+ NPYV_IMPL_NEON_ANYALL(s8, u8, b8)
+ NPYV_IMPL_NEON_ANYALL(u16, u16, b16)
+ NPYV_IMPL_NEON_ANYALL(s16, u16, b16)
+ NPYV_IMPL_NEON_ANYALL(u32, u32, b32)
+ NPYV_IMPL_NEON_ANYALL(s32, u32, b32)
+ #undef NPYV_IMPL_NEON_ANYALL
+
+ NPY_FINLINE bool npyv_any_b64(npyv_b64 a)
+ { return vmaxvq_u32(vreinterpretq_u32_u64(a)) != 0; }
+ NPY_FINLINE bool npyv_all_b64(npyv_b64 a)
+ { return vminvq_u32(vreinterpretq_u32_u64(a)) != 0; }
+ #define npyv_any_u64 npyv_any_b64
+ NPY_FINLINE bool npyv_all_u64(npyv_u64 a)
+ {
+ uint32x4_t a32 = vreinterpretq_u32_u64(a);
+ a32 = vorrq_u32(a32, vrev64q_u32(a32));
+ return vminvq_u32(a32) != 0;
+ }
+ NPY_FINLINE bool npyv_any_s64(npyv_s64 a)
+ { return npyv_any_u64(vreinterpretq_u64_s64(a)); }
+ NPY_FINLINE bool npyv_all_s64(npyv_s64 a)
+ { return npyv_all_u64(vreinterpretq_u64_s64(a)); }
+
+ #define NPYV_IMPL_NEON_ANYALL(SFX, BSFX) \
+ NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a) \
+ { return !npyv_all_##BSFX(npyv_cmpeq_##SFX(a, npyv_zero_##SFX())); } \
+ NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a) \
+ { return !npyv_any_##BSFX(npyv_cmpeq_##SFX(a, npyv_zero_##SFX())); }
+ NPYV_IMPL_NEON_ANYALL(f32, b32)
+ NPYV_IMPL_NEON_ANYALL(f64, b64)
+ #undef NPYV_IMPL_NEON_ANYALL
+#else
+ #define NPYV_IMPL_NEON_ANYALL(LEN) \
+ NPY_FINLINE bool npyv_any_b##LEN(npyv_b##LEN a) \
+ { \
+ int64x2_t a64 = vreinterpretq_s64_u##LEN(a); \
+ return ( \
+ vgetq_lane_s64(a64, 0) | \
+ vgetq_lane_s64(a64, 1) \
+ ) != 0; \
+ } \
+ NPY_FINLINE bool npyv_all_b##LEN(npyv_b##LEN a) \
+ { \
+ int64x2_t a64 = vreinterpretq_s64_u##LEN(a); \
+ return ( \
+ vgetq_lane_s64(a64, 0) & \
+ vgetq_lane_s64(a64, 1) \
+ ) == -1; \
+ }
+ NPYV_IMPL_NEON_ANYALL(8)
+ NPYV_IMPL_NEON_ANYALL(16)
+ NPYV_IMPL_NEON_ANYALL(32)
+ NPYV_IMPL_NEON_ANYALL(64)
+ #undef NPYV_IMPL_NEON_ANYALL
+
+ #define NPYV_IMPL_NEON_ANYALL(SFX, USFX) \
+ NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a) \
+ { \
+ int64x2_t a64 = vreinterpretq_s64_##SFX(a); \
+ return ( \
+ vgetq_lane_s64(a64, 0) | \
+ vgetq_lane_s64(a64, 1) \
+ ) != 0; \
+ } \
+ NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a) \
+ { \
+ npyv_##USFX tz = npyv_cmpeq_##SFX( \
+ a, npyv_zero_##SFX() \
+ ); \
+ int64x2_t a64 = vreinterpretq_s64_##USFX(tz); \
+ return ( \
+ vgetq_lane_s64(a64, 0) | \
+ vgetq_lane_s64(a64, 1) \
+ ) == 0; \
+ }
+ NPYV_IMPL_NEON_ANYALL(u8, u8)
+ NPYV_IMPL_NEON_ANYALL(s8, u8)
+ NPYV_IMPL_NEON_ANYALL(u16, u16)
+ NPYV_IMPL_NEON_ANYALL(s16, u16)
+ NPYV_IMPL_NEON_ANYALL(u32, u32)
+ NPYV_IMPL_NEON_ANYALL(s32, u32)
+ #undef NPYV_IMPL_NEON_ANYALL
+
+ NPY_FINLINE bool npyv_any_f32(npyv_f32 a)
+ {
+ uint32x4_t tz = npyv_cmpeq_f32(a, npyv_zero_f32());
+ int64x2_t a64 = vreinterpretq_s64_u32(tz);
+ return (vgetq_lane_s64(a64, 0) & vgetq_lane_s64(a64, 1)) != -1ll;
+ }
+ NPY_FINLINE bool npyv_all_f32(npyv_f32 a)
+ {
+ uint32x4_t tz = npyv_cmpeq_f32(a, npyv_zero_f32());
+ int64x2_t a64 = vreinterpretq_s64_u32(tz);
+ return (vgetq_lane_s64(a64, 0) | vgetq_lane_s64(a64, 1)) == 0;
+ }
+ NPY_FINLINE bool npyv_any_s64(npyv_s64 a)
+ { return (vgetq_lane_s64(a, 0) | vgetq_lane_s64(a, 1)) != 0; }
+ NPY_FINLINE bool npyv_all_s64(npyv_s64 a)
+ { return vgetq_lane_s64(a, 0) && vgetq_lane_s64(a, 1); }
+ NPY_FINLINE bool npyv_any_u64(npyv_u64 a)
+ { return (vgetq_lane_u64(a, 0) | vgetq_lane_u64(a, 1)) != 0; }
+ NPY_FINLINE bool npyv_all_u64(npyv_u64 a)
+ { return vgetq_lane_u64(a, 0) && vgetq_lane_u64(a, 1); }
+#endif // NPY_SIMD_F64
+
#endif // _NPY_SIMD_NEON_OPERATORS_H
diff --git a/numpy/core/src/common/simd/simd.h b/numpy/core/src/common/simd/simd.h
index b1492500f..92a77ad80 100644
--- a/numpy/core/src/common/simd/simd.h
+++ b/numpy/core/src/common/simd/simd.h
@@ -8,6 +8,10 @@
* TODO: Add an independent sphinx doc.
*/
#include "numpy/npy_common.h"
+#ifndef __cplusplus
+ #include <stdbool.h>
+#endif
+
#include "npy_cpu_dispatch.h"
#include "simd_utils.h"
diff --git a/numpy/core/src/common/simd/sse/math.h b/numpy/core/src/common/simd/sse/math.h
index e4b77b671..b7f8e6ebb 100644
--- a/numpy/core/src/common/simd/sse/math.h
+++ b/numpy/core/src/common/simd/sse/math.h
@@ -45,15 +45,27 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
// - Only if both corresponding elements are NaN, NaN is set.
NPY_FINLINE npyv_f32 npyv_maxp_f32(npyv_f32 a, npyv_f32 b)
{
- __m128 nn = _mm_cmpord_ps(b, b);
+ __m128i nn = npyv_notnan_f32(b);
__m128 max = _mm_max_ps(a, b);
- return npyv_select_f32(_mm_castps_si128(nn), max, a);
+ return npyv_select_f32(nn, max, a);
}
NPY_FINLINE npyv_f64 npyv_maxp_f64(npyv_f64 a, npyv_f64 b)
{
- __m128d nn = _mm_cmpord_pd(b, b);
+ __m128i nn = npyv_notnan_f64(b);
__m128d max = _mm_max_pd(a, b);
- return npyv_select_f64(_mm_castpd_si128(nn), max, a);
+ return npyv_select_f64(nn, max, a);
+}
+NPY_FINLINE npyv_f32 npyv_maxn_f32(npyv_f32 a, npyv_f32 b)
+{
+ __m128i nn = npyv_notnan_f32(a);
+ __m128 max = _mm_max_ps(a, b);
+ return npyv_select_f32(nn, max, a);
+}
+NPY_FINLINE npyv_f64 npyv_maxn_f64(npyv_f64 a, npyv_f64 b)
+{
+ __m128i nn = npyv_notnan_f64(a);
+ __m128d max = _mm_max_pd(a, b);
+ return npyv_select_f64(nn, max, a);
}
// Maximum, integer operations
#ifdef NPY_HAVE_SSE41
@@ -98,15 +110,27 @@ NPY_FINLINE npyv_s64 npyv_max_s64(npyv_s64 a, npyv_s64 b)
// - Only if both corresponding elements are NaN, NaN is set.
NPY_FINLINE npyv_f32 npyv_minp_f32(npyv_f32 a, npyv_f32 b)
{
- __m128 nn = _mm_cmpord_ps(b, b);
+ __m128i nn = npyv_notnan_f32(b);
__m128 min = _mm_min_ps(a, b);
- return npyv_select_f32(_mm_castps_si128(nn), min, a);
+ return npyv_select_f32(nn, min, a);
}
NPY_FINLINE npyv_f64 npyv_minp_f64(npyv_f64 a, npyv_f64 b)
{
- __m128d nn = _mm_cmpord_pd(b, b);
+ __m128i nn = npyv_notnan_f64(b);
__m128d min = _mm_min_pd(a, b);
- return npyv_select_f64(_mm_castpd_si128(nn), min, a);
+ return npyv_select_f64(nn, min, a);
+}
+NPY_FINLINE npyv_f32 npyv_minn_f32(npyv_f32 a, npyv_f32 b)
+{
+ __m128i nn = npyv_notnan_f32(a);
+ __m128 min = _mm_min_ps(a, b);
+ return npyv_select_f32(nn, min, a);
+}
+NPY_FINLINE npyv_f64 npyv_minn_f64(npyv_f64 a, npyv_f64 b)
+{
+ __m128i nn = npyv_notnan_f64(a);
+ __m128d min = _mm_min_pd(a, b);
+ return npyv_select_f64(nn, min, a);
}
// Minimum, integer operations
#ifdef NPY_HAVE_SSE41
@@ -143,6 +167,102 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b)
return npyv_select_s64(npyv_cmplt_s64(a, b), a, b);
}
+// reduce min&max for 32&64-bits
+#define NPY_IMPL_SSE_REDUCE_MINMAX(STYPE, INTRIN, VINTRIN) \
+ NPY_FINLINE STYPE##32 npyv_reduce_##INTRIN##32(__m128i a) \
+ { \
+ __m128i v64 = npyv_##INTRIN##32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 2))); \
+ __m128i v32 = npyv_##INTRIN##32(v64, _mm_shuffle_epi32(v64, _MM_SHUFFLE(0, 0, 0, 1))); \
+ return (STYPE##32)_mm_cvtsi128_si32(v32); \
+ } \
+ NPY_FINLINE STYPE##64 npyv_reduce_##INTRIN##64(__m128i a) \
+ { \
+ __m128i v64 = npyv_##INTRIN##64(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 2))); \
+ return (STYPE##64)npyv_extract0_u64(v64); \
+ }
+
+NPY_IMPL_SSE_REDUCE_MINMAX(npy_uint, min_u, min_epu)
+NPY_IMPL_SSE_REDUCE_MINMAX(npy_int, min_s, min_epi)
+NPY_IMPL_SSE_REDUCE_MINMAX(npy_uint, max_u, max_epu)
+NPY_IMPL_SSE_REDUCE_MINMAX(npy_int, max_s, max_epi)
+#undef NPY_IMPL_SSE_REDUCE_MINMAX
+// reduce min&max for ps & pd
+#define NPY_IMPL_SSE_REDUCE_MINMAX(INTRIN, INF, INF64) \
+ NPY_FINLINE float npyv_reduce_##INTRIN##_f32(npyv_f32 a) \
+ { \
+ __m128 v64 = _mm_##INTRIN##_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 3, 2))); \
+ __m128 v32 = _mm_##INTRIN##_ps(v64, _mm_shuffle_ps(v64, v64, _MM_SHUFFLE(0, 0, 0, 1))); \
+ return _mm_cvtss_f32(v32); \
+ } \
+ NPY_FINLINE double npyv_reduce_##INTRIN##_f64(npyv_f64 a) \
+ { \
+ __m128d v64 = _mm_##INTRIN##_pd(a, _mm_shuffle_pd(a, a, _MM_SHUFFLE(0, 0, 0, 1))); \
+ return _mm_cvtsd_f64(v64); \
+ } \
+ NPY_FINLINE float npyv_reduce_##INTRIN##p_f32(npyv_f32 a) \
+ { \
+ npyv_b32 notnan = npyv_notnan_f32(a); \
+ if (NPY_UNLIKELY(!npyv_any_b32(notnan))) { \
+ return _mm_cvtss_f32(a); \
+ } \
+ a = npyv_select_f32(notnan, a, npyv_reinterpret_f32_u32(npyv_setall_u32(INF))); \
+ return npyv_reduce_##INTRIN##_f32(a); \
+ } \
+ NPY_FINLINE double npyv_reduce_##INTRIN##p_f64(npyv_f64 a) \
+ { \
+ npyv_b64 notnan = npyv_notnan_f64(a); \
+ if (NPY_UNLIKELY(!npyv_any_b64(notnan))) { \
+ return _mm_cvtsd_f64(a); \
+ } \
+ a = npyv_select_f64(notnan, a, npyv_reinterpret_f64_u64(npyv_setall_u64(INF64))); \
+ return npyv_reduce_##INTRIN##_f64(a); \
+ } \
+ NPY_FINLINE float npyv_reduce_##INTRIN##n_f32(npyv_f32 a) \
+ { \
+ npyv_b32 notnan = npyv_notnan_f32(a); \
+ if (NPY_UNLIKELY(!npyv_all_b32(notnan))) { \
+ const union { npy_uint32 i; float f;} pnan = {0x7fc00000UL}; \
+ return pnan.f; \
+ } \
+ return npyv_reduce_##INTRIN##_f32(a); \
+ } \
+ NPY_FINLINE double npyv_reduce_##INTRIN##n_f64(npyv_f64 a) \
+ { \
+ npyv_b64 notnan = npyv_notnan_f64(a); \
+ if (NPY_UNLIKELY(!npyv_all_b64(notnan))) { \
+ const union { npy_uint64 i; double d;} pnan = {0x7ff8000000000000ull}; \
+ return pnan.d; \
+ } \
+ return npyv_reduce_##INTRIN##_f64(a); \
+ }
+
+NPY_IMPL_SSE_REDUCE_MINMAX(min, 0x7f800000, 0x7ff0000000000000)
+NPY_IMPL_SSE_REDUCE_MINMAX(max, 0xff800000, 0xfff0000000000000)
+#undef NPY_IMPL_SSE_REDUCE_MINMAX
+
+// reduce min&max for 8&16-bits
+#define NPY_IMPL_SSE_REDUCE_MINMAX(STYPE, INTRIN) \
+ NPY_FINLINE STYPE##16 npyv_reduce_##INTRIN##16(__m128i a) \
+ { \
+ __m128i v64 = npyv_##INTRIN##16(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 2))); \
+ __m128i v32 = npyv_##INTRIN##16(v64, _mm_shuffle_epi32(v64, _MM_SHUFFLE(0, 0, 0, 1))); \
+ __m128i v16 = npyv_##INTRIN##16(v32, _mm_shufflelo_epi16(v32, _MM_SHUFFLE(0, 0, 0, 1))); \
+ return (STYPE##16)_mm_cvtsi128_si32(v16); \
+ } \
+ NPY_FINLINE STYPE##8 npyv_reduce_##INTRIN##8(__m128i a) \
+ { \
+ __m128i v64 = npyv_##INTRIN##8(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 2))); \
+ __m128i v32 = npyv_##INTRIN##8(v64, _mm_shuffle_epi32(v64, _MM_SHUFFLE(0, 0, 0, 1))); \
+ __m128i v16 = npyv_##INTRIN##8(v32, _mm_shufflelo_epi16(v32, _MM_SHUFFLE(0, 0, 0, 1))); \
+ __m128i v8 = npyv_##INTRIN##8(v16, _mm_srli_epi16(v16, 8)); \
+ return (STYPE##16)_mm_cvtsi128_si32(v8); \
+ }
+NPY_IMPL_SSE_REDUCE_MINMAX(npy_uint, min_u)
+NPY_IMPL_SSE_REDUCE_MINMAX(npy_int, min_s)
+NPY_IMPL_SSE_REDUCE_MINMAX(npy_uint, max_u)
+NPY_IMPL_SSE_REDUCE_MINMAX(npy_int, max_s)
+#undef NPY_IMPL_SSE_REDUCE_MINMAX
+
// round to nearest integer even
NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a)
{
diff --git a/numpy/core/src/common/simd/sse/misc.h b/numpy/core/src/common/simd/sse/misc.h
index 7d13fbf55..b01ff1722 100644
--- a/numpy/core/src/common/simd/sse/misc.h
+++ b/numpy/core/src/common/simd/sse/misc.h
@@ -129,6 +129,18 @@ NPY_FINLINE __m128d npyv__setr_pd(double i0, double i1)
#define npyv_select_u64 npyv_select_u8
#define npyv_select_s64 npyv_select_u8
+// extract the vector's first lane
+#define npyv_extract0_u8(A) ((npy_uint8)_mm_cvtsi128_si32(A))
+#define npyv_extract0_s8(A) ((npy_int8)_mm_cvtsi128_si32(A))
+#define npyv_extract0_u16(A) ((npy_uint16)_mm_cvtsi128_si32(A))
+#define npyv_extract0_s16(A) ((npy_int16)_mm_cvtsi128_si32(A))
+#define npyv_extract0_u32(A) ((npy_uint32)_mm_cvtsi128_si32(A))
+#define npyv_extract0_s32(A) ((npy_int32)_mm_cvtsi128_si32(A))
+#define npyv_extract0_u64(A) ((npy_uint64)npyv128_cvtsi128_si64(A))
+#define npyv_extract0_s64(A) ((npy_int64)npyv128_cvtsi128_si64(A))
+#define npyv_extract0_f32 _mm_cvtss_f32
+#define npyv_extract0_f64 _mm_cvtsd_f64
+
// Reinterpret
#define npyv_reinterpret_u8_u8(X) X
#define npyv_reinterpret_u8_s8(X) X
diff --git a/numpy/core/src/common/simd/sse/operators.h b/numpy/core/src/common/simd/sse/operators.h
index 86dbcfea5..59182679e 100644
--- a/numpy/core/src/common/simd/sse/operators.h
+++ b/numpy/core/src/common/simd/sse/operators.h
@@ -283,4 +283,60 @@ NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a)
NPY_FINLINE npyv_b64 npyv_notnan_f64(npyv_f64 a)
{ return _mm_castpd_si128(_mm_cmpord_pd(a, a)); }
+// Test across all vector lanes
+// any: returns true if any of the elements is not equal to zero
+// all: returns true if all elements are not equal to zero
+#define NPYV_IMPL_SSE_ANYALL(SFX) \
+ NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a) \
+ { return _mm_movemask_epi8(a) != 0; } \
+ NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a) \
+ { return _mm_movemask_epi8(a) == 0xffff; }
+NPYV_IMPL_SSE_ANYALL(b8)
+NPYV_IMPL_SSE_ANYALL(b16)
+NPYV_IMPL_SSE_ANYALL(b32)
+NPYV_IMPL_SSE_ANYALL(b64)
+#undef NPYV_IMPL_SSE_ANYALL
+
+#define NPYV_IMPL_SSE_ANYALL(SFX, MSFX, TSFX, MASK) \
+ NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a) \
+ { \
+ return _mm_movemask_##MSFX( \
+ _mm_cmpeq_##TSFX(a, npyv_zero_##SFX()) \
+ ) != MASK; \
+ } \
+ NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a) \
+ { \
+ return _mm_movemask_##MSFX( \
+ _mm_cmpeq_##TSFX(a, npyv_zero_##SFX()) \
+ ) == 0; \
+ }
+NPYV_IMPL_SSE_ANYALL(u8, epi8, epi8, 0xffff)
+NPYV_IMPL_SSE_ANYALL(s8, epi8, epi8, 0xffff)
+NPYV_IMPL_SSE_ANYALL(u16, epi8, epi16, 0xffff)
+NPYV_IMPL_SSE_ANYALL(s16, epi8, epi16, 0xffff)
+NPYV_IMPL_SSE_ANYALL(u32, epi8, epi32, 0xffff)
+NPYV_IMPL_SSE_ANYALL(s32, epi8, epi32, 0xffff)
+#ifdef NPY_HAVE_SSE41
+ NPYV_IMPL_SSE_ANYALL(u64, epi8, epi64, 0xffff)
+ NPYV_IMPL_SSE_ANYALL(s64, epi8, epi64, 0xffff)
+#else
+ NPY_FINLINE bool npyv_any_u64(npyv_u64 a)
+ {
+ return _mm_movemask_epi8(
+ _mm_cmpeq_epi32(a, npyv_zero_u64())
+ ) != 0xffff;
+ }
+ NPY_FINLINE bool npyv_all_u64(npyv_u64 a)
+ {
+ a = _mm_cmpeq_epi32(a, npyv_zero_u64());
+ a = _mm_and_si128(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(2, 3, 0, 1)));
+ return _mm_movemask_epi8(a) == 0;
+ }
+ #define npyv_any_s64 npyv_any_u64
+ #define npyv_all_s64 npyv_all_u64
+#endif
+NPYV_IMPL_SSE_ANYALL(f32, ps, ps, 0xf)
+NPYV_IMPL_SSE_ANYALL(f64, pd, pd, 0x3)
+#undef NPYV_IMPL_SSE_ANYALL
+
#endif // _NPY_SIMD_SSE_OPERATORS_H
diff --git a/numpy/core/src/common/simd/vec/math.h b/numpy/core/src/common/simd/vec/math.h
index 7714a612d..95b16fdf7 100644
--- a/numpy/core/src/common/simd/vec/math.h
+++ b/numpy/core/src/common/simd/vec/math.h
@@ -63,6 +63,23 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
return vec_max(vec_sel(b, a, nn_a), vec_sel(a, b, nn_b));
}
#endif
+#if NPY_SIMD_F32
+ NPY_FINLINE npyv_f32 npyv_maxn_f32(npyv_f32 a, npyv_f32 b)
+ {
+ npyv_b32 nn_a = npyv_notnan_f32(a);
+ npyv_b32 nn_b = npyv_notnan_f32(b);
+ npyv_f32 max = vec_max(a, b);
+ return vec_sel(b, vec_sel(a, max, nn_a), nn_b);
+ }
+#endif
+NPY_FINLINE npyv_f64 npyv_maxn_f64(npyv_f64 a, npyv_f64 b)
+{
+ npyv_b64 nn_a = npyv_notnan_f64(a);
+ npyv_b64 nn_b = npyv_notnan_f64(b);
+ npyv_f64 max = vec_max(a, b);
+ return vec_sel(b, vec_sel(a, max, nn_a), nn_b);
+}
+
// Maximum, integer operations
#define npyv_max_u8 vec_max
#define npyv_max_s8 vec_max
@@ -95,6 +112,23 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
return vec_min(vec_sel(b, a, nn_a), vec_sel(a, b, nn_b));
}
#endif
+#if NPY_SIMD_F32
+ NPY_FINLINE npyv_f32 npyv_minn_f32(npyv_f32 a, npyv_f32 b)
+ {
+ npyv_b32 nn_a = npyv_notnan_f32(a);
+ npyv_b32 nn_b = npyv_notnan_f32(b);
+ npyv_f32 min = vec_min(a, b);
+ return vec_sel(b, vec_sel(a, min, nn_a), nn_b);
+ }
+#endif
+NPY_FINLINE npyv_f64 npyv_minn_f64(npyv_f64 a, npyv_f64 b)
+{
+ npyv_b64 nn_a = npyv_notnan_f64(a);
+ npyv_b64 nn_b = npyv_notnan_f64(b);
+ npyv_f64 min = vec_min(a, b);
+ return vec_sel(b, vec_sel(a, min, nn_a), nn_b);
+}
+
// Minimum, integer operations
#define npyv_min_u8 vec_min
#define npyv_min_s8 vec_min
@@ -105,6 +139,132 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
#define npyv_min_u64 vec_min
#define npyv_min_s64 vec_min
+#define NPY_IMPL_VEC_REDUCE_MINMAX(INTRIN, STYPE, SFX) \
+ NPY_FINLINE npy_##STYPE npyv_reduce_##INTRIN##_##SFX(npyv_##SFX a) \
+ { \
+ npyv_##SFX r = vec_##INTRIN(a, vec_sld(a, a, 8)); \
+ r = vec_##INTRIN(r, vec_sld(r, r, 4)); \
+ r = vec_##INTRIN(r, vec_sld(r, r, 2)); \
+ r = vec_##INTRIN(r, vec_sld(r, r, 1)); \
+ return (npy_##STYPE)vec_extract(r, 0); \
+ }
+NPY_IMPL_VEC_REDUCE_MINMAX(min, uint8, u8)
+NPY_IMPL_VEC_REDUCE_MINMAX(max, uint8, u8)
+NPY_IMPL_VEC_REDUCE_MINMAX(min, int8, s8)
+NPY_IMPL_VEC_REDUCE_MINMAX(max, int8, s8)
+#undef NPY_IMPL_VEC_REDUCE_MINMAX
+
+#define NPY_IMPL_VEC_REDUCE_MINMAX(INTRIN, STYPE, SFX) \
+ NPY_FINLINE npy_##STYPE npyv_reduce_##INTRIN##_##SFX(npyv_##SFX a) \
+ { \
+ npyv_##SFX r = vec_##INTRIN(a, vec_sld(a, a, 8)); \
+ r = vec_##INTRIN(r, vec_sld(r, r, 4)); \
+ r = vec_##INTRIN(r, vec_sld(r, r, 2)); \
+ return (npy_##STYPE)vec_extract(r, 0); \
+ }
+NPY_IMPL_VEC_REDUCE_MINMAX(min, uint16, u16)
+NPY_IMPL_VEC_REDUCE_MINMAX(max, uint16, u16)
+NPY_IMPL_VEC_REDUCE_MINMAX(min, int16, s16)
+NPY_IMPL_VEC_REDUCE_MINMAX(max, int16, s16)
+#undef NPY_IMPL_VEC_REDUCE_MINMAX
+
+#define NPY_IMPL_VEC_REDUCE_MINMAX(INTRIN, STYPE, SFX) \
+ NPY_FINLINE npy_##STYPE npyv_reduce_##INTRIN##_##SFX(npyv_##SFX a) \
+ { \
+ npyv_##SFX r = vec_##INTRIN(a, vec_sld(a, a, 8)); \
+ r = vec_##INTRIN(r, vec_sld(r, r, 4)); \
+ return (npy_##STYPE)vec_extract(r, 0); \
+ }
+NPY_IMPL_VEC_REDUCE_MINMAX(min, uint32, u32)
+NPY_IMPL_VEC_REDUCE_MINMAX(max, uint32, u32)
+NPY_IMPL_VEC_REDUCE_MINMAX(min, int32, s32)
+NPY_IMPL_VEC_REDUCE_MINMAX(max, int32, s32)
+#undef NPY_IMPL_VEC_REDUCE_MINMAX
+
+#define NPY_IMPL_VEC_REDUCE_MINMAX(INTRIN, STYPE, SFX) \
+ NPY_FINLINE npy_##STYPE npyv_reduce_##INTRIN##_##SFX(npyv_##SFX a) \
+ { \
+ npyv_##SFX r = vec_##INTRIN(a, vec_sld(a, a, 8)); \
+ return (npy_##STYPE)vec_extract(r, 0); \
+ }
+NPY_IMPL_VEC_REDUCE_MINMAX(min, uint64, u64)
+NPY_IMPL_VEC_REDUCE_MINMAX(max, uint64, u64)
+NPY_IMPL_VEC_REDUCE_MINMAX(min, int64, s64)
+NPY_IMPL_VEC_REDUCE_MINMAX(max, int64, s64)
+#undef NPY_IMPL_VEC_REDUCE_MINMAX
+
+#if NPY_SIMD_F32
+ #define NPY_IMPL_VEC_REDUCE_MINMAX(INTRIN, INF) \
+ NPY_FINLINE float npyv_reduce_##INTRIN##_f32(npyv_f32 a) \
+ { \
+ npyv_f32 r = vec_##INTRIN(a, vec_sld(a, a, 8)); \
+ r = vec_##INTRIN(r, vec_sld(r, r, 4)); \
+ return vec_extract(r, 0); \
+ } \
+ NPY_FINLINE float npyv_reduce_##INTRIN##p_f32(npyv_f32 a) \
+ { \
+ return npyv_reduce_##INTRIN##_f32(a); \
+ } \
+ NPY_FINLINE float npyv_reduce_##INTRIN##n_f32(npyv_f32 a) \
+ { \
+ npyv_b32 notnan = npyv_notnan_f32(a); \
+ if (NPY_UNLIKELY(!npyv_all_b32(notnan))) { \
+ const union { npy_uint32 i; float f;} \
+ pnan = {0x7fc00000UL}; \
+ return pnan.f; \
+ } \
+ return npyv_reduce_##INTRIN##_f32(a); \
+ }
+ NPY_IMPL_VEC_REDUCE_MINMAX(min, 0x7f800000)
+ NPY_IMPL_VEC_REDUCE_MINMAX(max, 0xff800000)
+ #undef NPY_IMPL_VEC_REDUCE_MINMAX
+#endif // NPY_SIMD_F32
+
+#define NPY_IMPL_VEC_REDUCE_MINMAX(INTRIN, INF) \
+ NPY_FINLINE double npyv_reduce_##INTRIN##_f64(npyv_f64 a) \
+ { \
+ npyv_f64 r = vec_##INTRIN(a, vec_sld(a, a, 8)); \
+ return vec_extract(r, 0); \
+ } \
+ NPY_FINLINE double npyv_reduce_##INTRIN##n_f64(npyv_f64 a) \
+ { \
+ npyv_b64 notnan = npyv_notnan_f64(a); \
+ if (NPY_UNLIKELY(!npyv_all_b64(notnan))) { \
+ const union { npy_uint64 i; double f;} \
+ pnan = {0x7ff8000000000000ull}; \
+ return pnan.f; \
+ } \
+ return npyv_reduce_##INTRIN##_f64(a); \
+ }
+NPY_IMPL_VEC_REDUCE_MINMAX(min, 0x7ff0000000000000)
+NPY_IMPL_VEC_REDUCE_MINMAX(max, 0xfff0000000000000)
+#undef NPY_IMPL_VEC_REDUCE_MINMAX
+
+#if defined(NPY_HAVE_VXE) || defined(NPY_HAVE_VSX)
+ #define npyv_reduce_minp_f64 npyv_reduce_min_f64
+ #define npyv_reduce_maxp_f64 npyv_reduce_max_f64
+#else
+ NPY_FINLINE double npyv_reduce_minp_f64(npyv_f64 a)
+ {
+ npyv_b64 notnan = npyv_notnan_f64(a);
+ if (NPY_UNLIKELY(!npyv_any_b64(notnan))) {
+ return vec_extract(a, 0);
+ }
+ a = npyv_select_f64(notnan, a, npyv_reinterpret_f64_u64(
+ npyv_setall_u64(0x7ff0000000000000)));
+ return npyv_reduce_min_f64(a);
+ }
+ NPY_FINLINE double npyv_reduce_maxp_f64(npyv_f64 a)
+ {
+ npyv_b64 notnan = npyv_notnan_f64(a);
+ if (NPY_UNLIKELY(!npyv_any_b64(notnan))) {
+ return vec_extract(a, 0);
+ }
+ a = npyv_select_f64(notnan, a, npyv_reinterpret_f64_u64(
+ npyv_setall_u64(0xfff0000000000000)));
+ return npyv_reduce_max_f64(a);
+ }
+#endif
// round to nearest int even
#define npyv_rint_f64 vec_rint
// ceil
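The vec/math.h additions above cover two things: NaN-propagating npyv_maxn/npyv_minn built from two nested vec_sel calls, and reductions that repeatedly fold the register with vec_sld. A scalar reference for the float flavours and a short folding trace (both illustrative sketches; maxp_ref/maxn_ref are hypothetical names, not patch code):

    #include <math.h>

    /* npyv_maxp_* prefers the non-NaN operand; npyv_maxn_* propagates NaN. */
    static double maxp_ref(double a, double b)
    { return isnan(a) ? b : isnan(b) ? a : (a > b ? a : b); }

    static double maxn_ref(double a, double b)
    { return (isnan(a) || isnan(b)) ? NAN : (a > b ? a : b); }

    /* Folding trace of npyv_reduce_min_u32 on lanes {9, 3, 7, 1}:
     *   vec_min(a, vec_sld(a, a, 8)) pairs lanes 8 bytes apart -> {7, 1, 7, 1}
     *   vec_min(r, vec_sld(r, r, 4)) pairs lanes 4 bytes apart -> {1, 1, 1, 1}
     *   vec_extract(r, 0)                                      -> 1
     */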
diff --git a/numpy/core/src/common/simd/vec/misc.h b/numpy/core/src/common/simd/vec/misc.h
index c4f35cfc0..7ea0f21f6 100644
--- a/numpy/core/src/common/simd/vec/misc.h
+++ b/numpy/core/src/common/simd/vec/misc.h
@@ -83,6 +83,20 @@
#endif
#define npyv_select_f64 npyv_select_u8
+// extract the vector's first lane
+#define npyv_extract0_u8(A) ((npy_uint8)vec_extract(A, 0))
+#define npyv_extract0_s8(A) ((npy_int8)vec_extract(A, 0))
+#define npyv_extract0_u16(A) ((npy_uint16)vec_extract(A, 0))
+#define npyv_extract0_s16(A) ((npy_int16)vec_extract(A, 0))
+#define npyv_extract0_u32(A) ((npy_uint32)vec_extract(A, 0))
+#define npyv_extract0_s32(A) ((npy_int32)vec_extract(A, 0))
+#define npyv_extract0_u64(A) ((npy_uint64)vec_extract(A, 0))
+#define npyv_extract0_s64(A) ((npy_int64)vec_extract(A, 0))
+#if NPY_SIMD_F32
+ #define npyv_extract0_f32(A) vec_extract(A, 0)
+#endif
+#define npyv_extract0_f64(A) vec_extract(A, 0)
+
// Reinterpret
#define npyv_reinterpret_u8_u8(X) X
#define npyv_reinterpret_u8_s8(X) ((npyv_u8)X)
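The extract0 helpers above read lane 0 directly through vec_extract. An equivalent formulation through an aligned store (a sketch only, mirroring the store-and-read pattern already used elsewhere in this patch; extract0_via_store_u32 is a hypothetical name):

    NPY_FINLINE npy_uint32 extract0_via_store_u32(npyv_u32 a)
    {
        npy_uint32 NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) lanes[npyv_nlanes_u32];
        npyv_storea_u32(lanes, a);
        return lanes[0];  /* expected to equal npyv_extract0_u32(a) */
    }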
diff --git a/numpy/core/src/common/simd/vec/operators.h b/numpy/core/src/common/simd/vec/operators.h
index 8b58676e7..50dac20f6 100644
--- a/numpy/core/src/common/simd/vec/operators.h
+++ b/numpy/core/src/common/simd/vec/operators.h
@@ -274,4 +274,30 @@ NPY_FINLINE npyv_f64 npyv_not_f64(npyv_f64 a)
NPY_FINLINE npyv_b64 npyv_notnan_f64(npyv_f64 a)
{ return vec_cmpeq(a, a); }
+// Test across all vector lanes
+// any: returns true if any of the elements is not equal to zero
+// all: returns true if all elements are not equal to zero
+#define NPYV_IMPL_VEC_ANYALL(SFX, SFX2) \
+ NPY_FINLINE bool npyv_any_##SFX(npyv_##SFX a) \
+ { return vec_any_ne(a, (npyv_##SFX)npyv_zero_##SFX2()); } \
+ NPY_FINLINE bool npyv_all_##SFX(npyv_##SFX a) \
+ { return vec_all_ne(a, (npyv_##SFX)npyv_zero_##SFX2()); }
+NPYV_IMPL_VEC_ANYALL(b8, u8)
+NPYV_IMPL_VEC_ANYALL(b16, u16)
+NPYV_IMPL_VEC_ANYALL(b32, u32)
+NPYV_IMPL_VEC_ANYALL(b64, u64)
+NPYV_IMPL_VEC_ANYALL(u8, u8)
+NPYV_IMPL_VEC_ANYALL(s8, s8)
+NPYV_IMPL_VEC_ANYALL(u16, u16)
+NPYV_IMPL_VEC_ANYALL(s16, s16)
+NPYV_IMPL_VEC_ANYALL(u32, u32)
+NPYV_IMPL_VEC_ANYALL(s32, s32)
+NPYV_IMPL_VEC_ANYALL(u64, u64)
+NPYV_IMPL_VEC_ANYALL(s64, s64)
+#if NPY_SIMD_F32
+ NPYV_IMPL_VEC_ANYALL(f32, f32)
+#endif
+NPYV_IMPL_VEC_ANYALL(f64, f64)
+#undef NPYV_IMPL_VEC_ANYALL
+
#endif // _NPY_SIMD_VEC_OPERATORS_H
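Because the predicates compare lanes against zero, a NaN lane counts as nonzero while -0.0 counts as zero in the floating-point variants, which is exactly what the new test cases below exercise. A scalar reference (illustrative sketch; any_ref_f32/all_ref_f32 are hypothetical names):

    static int any_ref_f32(const float *lanes, int nlanes)
    {
        for (int i = 0; i < nlanes; ++i) {
            if (lanes[i] != 0.0f) {   /* NaN != 0.0f is true, -0.0f != 0.0f is false */
                return 1;
            }
        }
        return 0;
    }

    static int all_ref_f32(const float *lanes, int nlanes)
    {
        for (int i = 0; i < nlanes; ++i) {
            if (lanes[i] == 0.0f) {   /* only +/-0.0 lanes hit this branch */
                return 0;
            }
        }
        return 1;
    }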
diff --git a/numpy/core/src/npysort/x86-qsort.dispatch.cpp b/numpy/core/src/npysort/x86-qsort.dispatch.cpp
index 01fa16e3e..39c24d522 100644
--- a/numpy/core/src/npysort/x86-qsort.dispatch.cpp
+++ b/numpy/core/src/npysort/x86-qsort.dispatch.cpp
@@ -137,8 +137,8 @@ struct vector<npy_int> {
{
return _mm512_permutexvar_epi32(idx, zmm);
}
- static type_t reducemax(zmm_t v) { return npyv_reducemax_s32(v); }
- static type_t reducemin(zmm_t v) { return npyv_reducemin_s32(v); }
+ static type_t reducemax(zmm_t v) { return npyv_reduce_max_s32(v); }
+ static type_t reducemin(zmm_t v) { return npyv_reduce_min_s32(v); }
static zmm_t set1(type_t v) { return _mm512_set1_epi32(v); }
template<__mmask16 mask>
static zmm_t shuffle(zmm_t zmm)
@@ -196,8 +196,8 @@ struct vector<npy_uint> {
{
return _mm512_permutexvar_epi32(idx, zmm);
}
- static type_t reducemax(zmm_t v) { return npyv_reducemax_u32(v); }
- static type_t reducemin(zmm_t v) { return npyv_reducemin_u32(v); }
+ static type_t reducemax(zmm_t v) { return npyv_reduce_max_u32(v); }
+ static type_t reducemin(zmm_t v) { return npyv_reduce_min_u32(v); }
static zmm_t set1(type_t v) { return _mm512_set1_epi32(v); }
template<__mmask16 mask>
static zmm_t shuffle(zmm_t zmm)
@@ -255,8 +255,8 @@ struct vector<npy_float> {
{
return _mm512_permutexvar_ps(idx, zmm);
}
- static type_t reducemax(zmm_t v) { return npyv_reducemax_f32(v); }
- static type_t reducemin(zmm_t v) { return npyv_reducemin_f32(v); }
+ static type_t reducemax(zmm_t v) { return npyv_reduce_max_f32(v); }
+ static type_t reducemin(zmm_t v) { return npyv_reduce_min_f32(v); }
static zmm_t set1(type_t v) { return _mm512_set1_ps(v); }
template<__mmask16 mask>
static zmm_t shuffle(zmm_t zmm)
diff --git a/numpy/core/src/umath/loops_minmax.dispatch.c.src b/numpy/core/src/umath/loops_minmax.dispatch.c.src
index b4fb205a0..237c8e933 100644
--- a/numpy/core/src/umath/loops_minmax.dispatch.c.src
+++ b/numpy/core/src/umath/loops_minmax.dispatch.c.src
@@ -95,104 +95,6 @@ NPY_FINLINE @type@ scalar_@op@_@c_sfx@(@type@ a, @type@ b) {
#endif
/*******************************************************************************
- ** extra SIMD intrinsics
- ******************************************************************************/
-
-#if NPY_SIMD
-/**begin repeat
- * #sfx = s8, u8, s16, u16, s32, u32, s64, u64#
- * #is_64 = 0*6, 1*2#
- */
-#if defined(NPY_HAVE_ASIMD) && defined(__aarch64__)
- #if !@is_64@
- #define npyv_reduce_min_@sfx@ vminvq_@sfx@
- #define npyv_reduce_max_@sfx@ vmaxvq_@sfx@
- #else
- NPY_FINLINE npyv_lanetype_@sfx@ npyv_reduce_min_@sfx@(npyv_@sfx@ v)
- {
- npyv_lanetype_@sfx@ a = vgetq_lane_@sfx@(v, 0);
- npyv_lanetype_@sfx@ b = vgetq_lane_@sfx@(v, 1);
- npyv_lanetype_@sfx@ result = (a < b) ? a : b;
- return result;
- }
- NPY_FINLINE npyv_lanetype_@sfx@ npyv_reduce_max_@sfx@(npyv_@sfx@ v)
- {
- npyv_lanetype_@sfx@ a = vgetq_lane_@sfx@(v, 0);
- npyv_lanetype_@sfx@ b = vgetq_lane_@sfx@(v, 1);
- npyv_lanetype_@sfx@ result = (a > b) ? a : b;
- return result;
- }
- #endif // !@is_64@
-#else
- /**begin repeat1
- * #intrin = min, max#
- */
- NPY_FINLINE npyv_lanetype_@sfx@ npyv_reduce_@intrin@_@sfx@(npyv_@sfx@ v)
- {
- npyv_lanetype_@sfx@ NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) s[npyv_nlanes_@sfx@];
- npyv_storea_@sfx@(s, v);
- npyv_lanetype_@sfx@ result = s[0];
- for(int i=1; i<npyv_nlanes_@sfx@; ++i){
- result = scalar_@intrin@_i(result, s[i]);
- }
- return result;
- }
- /**end repeat1**/
-#endif
-/**end repeat**/
-#endif // NPY_SIMD
-
-/**begin repeat
- * #sfx = f32, f64#
- * #bsfx = b32, b64#
- * #simd_chk = NPY_SIMD_F32, NPY_SIMD_F64#
- * #scalar_sfx = f, d#
- */
-#if @simd_chk@
-#if defined(NPY_HAVE_ASIMD) && defined(__aarch64__)
- #define npyv_minn_@sfx@ vminq_@sfx@
- #define npyv_maxn_@sfx@ vmaxq_@sfx@
- #define npyv_reduce_minn_@sfx@ vminvq_@sfx@
- #define npyv_reduce_maxn_@sfx@ vmaxvq_@sfx@
- #define npyv_reduce_minp_@sfx@ vminnmvq_@sfx@
- #define npyv_reduce_maxp_@sfx@ vmaxnmvq_@sfx@
-#else
- /**begin repeat1
- * #intrin = min, max#
- */
- // propagates NaNs
- NPY_FINLINE npyv_@sfx@ npyv_@intrin@n_@sfx@(npyv_@sfx@ a, npyv_@sfx@ b)
- {
- npyv_@sfx@ result = npyv_@intrin@_@sfx@(a, b);
- // result = npyv_select_@sfx@(npyv_notnan_@sfx@(b), result, b);
- // X86 handle second operand
- #ifndef NPY_HAVE_SSE2
- result = npyv_select_@sfx@(npyv_notnan_@sfx@(b), result, b);
- #endif
- result = npyv_select_@sfx@(npyv_notnan_@sfx@(a), result, a);
- return result;
- }
- /**end repeat1**/
- /**begin repeat1
- * #intrin = minn, maxn, minp, maxp#
- * #scalar_intrin = min, max, minp, maxp#
- */
- NPY_FINLINE npyv_lanetype_@sfx@ npyv_reduce_@intrin@_@sfx@(npyv_@sfx@ v)
- {
- npyv_lanetype_@sfx@ NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) s[npyv_nlanes_@sfx@];
- npyv_storea_@sfx@(s, v);
- npyv_lanetype_@sfx@ result = s[0];
- for(int i=1; i<npyv_nlanes_@sfx@; ++i){
- result = scalar_@scalar_intrin@_@scalar_sfx@(result, s[i]);
- }
- return result;
- }
- /**end repeat1**/
-#endif
-#endif // simd_chk
-/**end repeat**/
-
-/*******************************************************************************
** Defining the SIMD kernels
******************************************************************************/
/**begin repeat
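With the local fallbacks above removed, kernels in this file can call the universal min/max and reduce intrinsics directly. A minimal reduction loop in that style (a hypothetical sketch, not the kernel the dispatcher actually emits; assumes len >= 1):

    #if NPY_SIMD_F32
    static float reduce_max_f32_sketch(const float *src, npy_intp len)
    {
        npy_intp i = 0;
        npyv_f32 acc = npyv_setall_f32(src[0]);
        for (; i + npyv_nlanes_f32 <= len; i += npyv_nlanes_f32) {
            acc = npyv_max_f32(acc, npyv_load_f32(src + i));
        }
        float r = npyv_reduce_max_f32(acc);
        for (; i < len; ++i) {   /* scalar tail */
            r = src[i] > r ? src[i] : r;
        }
        return r;
    }
    #endif // NPY_SIMD_F32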
diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py
index c4488533a..4aeaf0da3 100644
--- a/numpy/core/tests/test_simd.py
+++ b/numpy/core/tests/test_simd.py
@@ -105,10 +105,12 @@ class _SIMD_BOOL(_Test_Utility):
"""
To test all boolean vector types at once
"""
+ def _nlanes(self):
+ return getattr(self.npyv, "nlanes_u" + self.sfx[1:])
+
def _data(self, start=None, count=None, reverse=False):
- nlanes = getattr(self.npyv, "nlanes_u" + self.sfx[1:])
true_mask = self._true_mask()
- rng = range(nlanes)
+ rng = range(self._nlanes())
if reverse:
rng = reversed(rng)
return [true_mask if x % 2 else 0 for x in rng]
@@ -202,6 +204,26 @@ class _SIMD_BOOL(_Test_Utility):
vdata, vdata, vdata, vdata)
assert vpack == spack
+ @pytest.mark.parametrize("intrin", ["any", "all"])
+ @pytest.mark.parametrize("data", (
+ [-1, 0],
+ [0, -1],
+ [-1],
+ [0]
+ ))
+ def test_operators_crosstest(self, intrin, data):
+ """
+ Test intrinsics:
+ npyv_any_##SFX
+ npyv_all_##SFX
+ """
+ data_a = self._load_b(data * self._nlanes())
+ func = eval(intrin)
+ intrin = getattr(self, intrin)
+ desired = func(data_a)
+ simd = intrin(data_a)
+ assert bool(simd) == desired
+
class _SIMD_INT(_Test_Utility):
"""
To test all integer vector types at once
@@ -268,6 +290,18 @@ class _SIMD_INT(_Test_Utility):
simd_min = self.min(vdata_a, vdata_b)
assert simd_min == data_min
+ @pytest.mark.parametrize("start", [-100, -10000, 0, 100, 10000])
+ def test_reduce_max_min(self, start):
+ """
+ Test intrinsics:
+ npyv_reduce_max_##sfx
+ npyv_reduce_min_##sfx
+ """
+ vdata_a = self.load(self._data(start))
+ assert self.reduce_max(vdata_a) == max(vdata_a)
+ assert self.reduce_min(vdata_a) == min(vdata_a)
+
+
class _SIMD_FP32(_Test_Utility):
"""
To only test single precision
@@ -414,67 +448,77 @@ class _SIMD_FP(_Test_Utility):
data_round = self._to_unsigned(self.setall(-0.0))
assert _round == data_round
- def test_max(self):
- """
- Test intrinsics:
- npyv_max_##SFX
- npyv_maxp_##SFX
- """
- data_a = self._data()
- data_b = self._data(self.nlanes)
- vdata_a, vdata_b = self.load(data_a), self.load(data_b)
- data_max = [max(a, b) for a, b in zip(data_a, data_b)]
- _max = self.max(vdata_a, vdata_b)
- assert _max == data_max
- maxp = self.maxp(vdata_a, vdata_b)
- assert maxp == data_max
- # test IEEE standards
- pinf, ninf, nan = self._pinfinity(), self._ninfinity(), self._nan()
- max_cases = ((nan, nan, nan), (nan, 10, 10), (10, nan, 10),
- (pinf, pinf, pinf), (pinf, 10, pinf), (10, pinf, pinf),
- (ninf, ninf, ninf), (ninf, 10, 10), (10, ninf, 10),
- (10, 0, 10), (10, -10, 10))
- for case_operand1, case_operand2, desired in max_cases:
- data_max = [desired]*self.nlanes
- vdata_a = self.setall(case_operand1)
- vdata_b = self.setall(case_operand2)
- maxp = self.maxp(vdata_a, vdata_b)
- assert maxp == pytest.approx(data_max, nan_ok=True)
- if nan in (case_operand1, case_operand2, desired):
- continue
- _max = self.max(vdata_a, vdata_b)
- assert _max == data_max
-
- def test_min(self):
+ @pytest.mark.parametrize("intrin", [
+ "max", "maxp", "maxn", "min", "minp", "minn"
+ ])
+ def test_max_min(self, intrin):
"""
Test intrinsics:
- npyv_min_##SFX
- npyv_minp_##SFX
+ npyv_max_##sfx
+ npyv_maxp_##sfx
+ npyv_maxn_##sfx
+ npyv_min_##sfx
+ npyv_minp_##sfx
+ npyv_minn_##sfx
+ npyv_reduce_max_##sfx
+ npyv_reduce_maxp_##sfx
+ npyv_reduce_maxn_##sfx
+ npyv_reduce_min_##sfx
+ npyv_reduce_minp_##sfx
+ npyv_reduce_minn_##sfx
"""
- data_a = self._data()
- data_b = self._data(self.nlanes)
- vdata_a, vdata_b = self.load(data_a), self.load(data_b)
- data_min = [min(a, b) for a, b in zip(data_a, data_b)]
- _min = self.min(vdata_a, vdata_b)
- assert _min == data_min
- minp = self.minp(vdata_a, vdata_b)
- assert minp == data_min
- # test IEEE standards
pinf, ninf, nan = self._pinfinity(), self._ninfinity(), self._nan()
- min_cases = ((nan, nan, nan), (nan, 10, 10), (10, nan, 10),
- (pinf, pinf, pinf), (pinf, 10, 10), (10, pinf, 10),
- (ninf, ninf, ninf), (ninf, 10, ninf), (10, ninf, ninf),
- (10, 0, 0), (10, -10, -10))
- for case_operand1, case_operand2, desired in min_cases:
- data_min = [desired]*self.nlanes
- vdata_a = self.setall(case_operand1)
- vdata_b = self.setall(case_operand2)
- minp = self.minp(vdata_a, vdata_b)
- assert minp == pytest.approx(data_min, nan_ok=True)
- if nan in (case_operand1, case_operand2, desired):
- continue
- _min = self.min(vdata_a, vdata_b)
- assert _min == data_min
+ chk_nan = {"xp": 1, "np": 1, "nn": 2, "xn": 2}.get(intrin[-2:], 0)
+ func = eval(intrin[:3])
+ reduce_intrin = getattr(self, "reduce_" + intrin)
+ intrin = getattr(self, intrin)
+ hf_nlanes = self.nlanes//2
+
+ cases = (
+ ([0.0, -0.0], [-0.0, 0.0]),
+ ([10, -10], [10, -10]),
+ ([pinf, 10], [10, ninf]),
+ ([10, pinf], [ninf, 10]),
+ ([10, -10], [10, -10]),
+ ([-10, 10], [-10, 10])
+ )
+ for op1, op2 in cases:
+ vdata_a = self.load(op1*hf_nlanes)
+ vdata_b = self.load(op2*hf_nlanes)
+ data = func(vdata_a, vdata_b)
+ simd = intrin(vdata_a, vdata_b)
+ assert simd == data
+ data = func(vdata_a)
+ simd = reduce_intrin(vdata_a)
+ assert simd == data
+
+ if not chk_nan:
+ return
+ if chk_nan == 1:
+ test_nan = lambda a, b: (
+ b if math.isnan(a) else a if math.isnan(b) else b
+ )
+ else:
+ test_nan = lambda a, b: (
+ nan if math.isnan(a) or math.isnan(b) else b
+ )
+ cases = (
+ (nan, 10),
+ (10, nan),
+ (nan, pinf),
+ (pinf, nan),
+ (nan, nan)
+ )
+ for op1, op2 in cases:
+ vdata_ab = self.load([op1, op2]*hf_nlanes)
+ data = test_nan(op1, op2)
+ simd = reduce_intrin(vdata_ab)
+ assert simd == pytest.approx(data, nan_ok=True)
+ vdata_a = self.setall(op1)
+ vdata_b = self.setall(op2)
+ data = [data] * self.nlanes
+ simd = intrin(vdata_a, vdata_b)
+ assert simd == pytest.approx(data, nan_ok=True)
def test_reciprocal(self):
pinf, ninf, nan = self._pinfinity(), self._ninfinity(), self._nan()
@@ -527,6 +571,30 @@ class _SIMD_FP(_Test_Utility):
data_cmp = [py_comp(a, b) for a, b in zip(data_a, data_b)]
assert vcmp == data_cmp
+ @pytest.mark.parametrize("intrin", ["any", "all"])
+ @pytest.mark.parametrize("data", (
+ [float("nan"), 0],
+ [0, float("nan")],
+ [float("nan"), 1],
+ [1, float("nan")],
+ [float("nan"), float("nan")],
+ [0.0, -0.0],
+ [-0.0, 0.0],
+ [1.0, -0.0]
+ ))
+ def test_operators_crosstest(self, intrin, data):
+ """
+ Test intrinsics:
+ npyv_any_##SFX
+ npyv_all_##SFX
+ """
+ data_a = self.load(data * self.nlanes)
+ func = eval(intrin)
+ intrin = getattr(self, intrin)
+ desired = func(data_a)
+ simd = intrin(data_a)
+ assert bool(simd) == desired
+
class _SIMD_ALL(_Test_Utility):
"""
To test all vector types at once
@@ -759,6 +827,9 @@ class _SIMD_ALL(_Test_Utility):
select_b = self.select(self.cmpneq(self.zero(), self.zero()), vdata_a, vdata_b)
assert select_b == data_b
+ # test extract elements
+ assert self.extract0(vdata_b) == vdata_b[0]
+
# cleanup intrinsic is only used with AVX for
# zeroing registers to avoid the AVX-SSE transition penalty,
# so nothing to test here
@@ -874,6 +945,30 @@ class _SIMD_ALL(_Test_Utility):
vandc = cast(getattr(self, "andc")(vdata_a, vdata_b))
assert vandc == data_andc
+ @pytest.mark.parametrize("intrin", ["any", "all"])
+ @pytest.mark.parametrize("data", (
+ [1, 2, 3, 4],
+ [-1, -2, -3, -4],
+ [0, 1, 2, 3, 4],
+ [0x7f, 0x7fff, 0x7fffffff, 0x7fffffffffffffff],
+ [0, -1, -2, -3, 4],
+ [0],
+ [1],
+ [-1]
+ ))
+ def test_operators_crosstest(self, intrin, data):
+ """
+ Test intrinsics:
+ npyv_any_##SFX
+ npyv_all_##SFX
+ """
+ data_a = self.load(data * self.nlanes)
+ func = eval(intrin)
+ intrin = getattr(self, intrin)
+ desired = func(data_a)
+ simd = intrin(data_a)
+ assert bool(simd) == desired
+
def test_conversion_boolean(self):
bsfx = "b" + self.sfx[1:]
to_boolean = getattr(self.npyv, "cvt_%s_%s" % (bsfx, self.sfx))