path: root/numpy/core/src/common/simd
Diffstat (limited to 'numpy/core/src/common/simd')
-rw-r--r--  numpy/core/src/common/simd/avx2/arithmetic.h  11
-rw-r--r--  numpy/core/src/common/simd/avx2/avx2.h  1
-rw-r--r--  numpy/core/src/common/simd/avx2/memory.h  374
-rw-r--r--  numpy/core/src/common/simd/avx2/operators.h  2
-rw-r--r--  numpy/core/src/common/simd/avx2/reorder.h  87
-rw-r--r--  numpy/core/src/common/simd/avx512/arithmetic.h  4
-rw-r--r--  numpy/core/src/common/simd/avx512/avx512.h  1
-rw-r--r--  numpy/core/src/common/simd/avx512/maskop.h  13
-rw-r--r--  numpy/core/src/common/simd/avx512/memory.h  317
-rw-r--r--  numpy/core/src/common/simd/avx512/reorder.h  152
-rw-r--r--  numpy/core/src/common/simd/emulate_maskop.h  34
-rw-r--r--  numpy/core/src/common/simd/neon/arithmetic.h  13
-rw-r--r--  numpy/core/src/common/simd/neon/math.h  149
-rw-r--r--  numpy/core/src/common/simd/neon/memory.h  293
-rw-r--r--  numpy/core/src/common/simd/neon/neon.h  1
-rw-r--r--  numpy/core/src/common/simd/neon/operators.h  29
-rw-r--r--  numpy/core/src/common/simd/neon/reorder.h  120
-rw-r--r--  numpy/core/src/common/simd/simd.h  3
-rw-r--r--  numpy/core/src/common/simd/sse/arithmetic.h  30
-rw-r--r--  numpy/core/src/common/simd/sse/math.h  183
-rw-r--r--  numpy/core/src/common/simd/sse/memory.h  314
-rw-r--r--  numpy/core/src/common/simd/sse/reorder.h  87
-rw-r--r--  numpy/core/src/common/simd/sse/sse.h  1
-rw-r--r--  numpy/core/src/common/simd/vec/arithmetic.h  17
-rw-r--r--  numpy/core/src/common/simd/vec/conversion.h  11
-rw-r--r--  numpy/core/src/common/simd/vec/math.h  2
-rw-r--r--  numpy/core/src/common/simd/vec/memory.h  271
-rw-r--r--  numpy/core/src/common/simd/vec/misc.h  28
-rw-r--r--  numpy/core/src/common/simd/vec/reorder.h  99
-rw-r--r--  numpy/core/src/common/simd/vec/vec.h  2
30 files changed, 2381 insertions, 268 deletions
diff --git a/numpy/core/src/common/simd/avx2/arithmetic.h b/numpy/core/src/common/simd/avx2/arithmetic.h
index ad9688338..58d842a6d 100644
--- a/numpy/core/src/common/simd/avx2/arithmetic.h
+++ b/numpy/core/src/common/simd/avx2/arithmetic.h
@@ -247,6 +247,10 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
// negate multiply and subtract, -(a*b) - c
#define npyv_nmulsub_f32 _mm256_fnmsub_ps
#define npyv_nmulsub_f64 _mm256_fnmsub_pd
+ // multiply, add for odd elements and subtract even elements.
+ // (a * b) -+ c
+ #define npyv_muladdsub_f32 _mm256_fmaddsub_ps
+ #define npyv_muladdsub_f64 _mm256_fmaddsub_pd
#else
// multiply and add, a*b + c
NPY_FINLINE npyv_f32 npyv_muladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
@@ -274,6 +278,13 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
npyv_f64 neg_a = npyv_xor_f64(a, npyv_setall_f64(-0.0));
return npyv_sub_f64(npyv_mul_f64(neg_a, b), c);
}
+ // multiply, add for odd elements and subtract even elements.
+ // (a * b) -+ c
+ NPY_FINLINE npyv_f32 npyv_muladdsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ { return _mm256_addsub_ps(npyv_mul_f32(a, b), c); }
+ NPY_FINLINE npyv_f64 npyv_muladdsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ { return _mm256_addsub_pd(npyv_mul_f64(a, b), c); }
+
#endif // !NPY_HAVE_FMA3
/***************************
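For reference, npyv_muladdsub follows the FMA3 fmaddsub convention: even-indexed lanes compute (a*b) - c and odd-indexed lanes compute (a*b) + c, the usual building block for complex multiplication. A minimal scalar sketch of that contract (the helper name and loop form are illustrative, not taken from these headers):
#include <stddef.h>
/* Scalar model of npyv_muladdsub_f32 over n lanes:
 * even lanes yield a*b - c, odd lanes yield a*b + c. */
static void muladdsub_f32_ref(const float *a, const float *b,
                              const float *c, float *out, size_t n)
{
    for (size_t i = 0; i < n; ++i) {
        out[i] = (i & 1) ? a[i] * b[i] + c[i]   /* odd lane:  add c      */
                         : a[i] * b[i] - c[i];  /* even lane: subtract c */
    }
}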
diff --git a/numpy/core/src/common/simd/avx2/avx2.h b/numpy/core/src/common/simd/avx2/avx2.h
index 8cb74df2b..d64f3c6d6 100644
--- a/numpy/core/src/common/simd/avx2/avx2.h
+++ b/numpy/core/src/common/simd/avx2/avx2.h
@@ -11,6 +11,7 @@
#define NPY_SIMD_FMA3 0 // fast emulated
#endif
#define NPY_SIMD_BIGENDIAN 0
+#define NPY_SIMD_CMPSIGNAL 0
// Enough limit to allow us to use _mm256_i32gather_*
#define NPY_SIMD_MAXLOAD_STRIDE32 (0x7fffffff / 8)
diff --git a/numpy/core/src/common/simd/avx2/memory.h b/numpy/core/src/common/simd/avx2/memory.h
index 410c35dc8..81144a36b 100644
--- a/numpy/core/src/common/simd/avx2/memory.h
+++ b/numpy/core/src/common/simd/avx2/memory.h
@@ -84,17 +84,6 @@ NPY_FINLINE npyv_s32 npyv_loadn_s32(const npy_int32 *ptr, npy_intp stride)
NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride)
{ return _mm256_castsi256_ps(npyv_loadn_u32((const npy_uint32*)ptr, stride)); }
//// 64
-#if 0 // slower
-NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
-{
- const __m256i idx = npyv_set_s64(0, 1*stride, 2*stride, 3*stride);
- return _mm256_i64gather_epi64((const void*)ptr, idx, 8);
-}
-NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
-{ return npyv_loadn_u64((const npy_uint64*)ptr, stride); }
-NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride)
-{ return _mm256_castsi256_pd(npyv_loadn_u64((const npy_uint64*)ptr, stride)); }
-#endif
NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride)
{
__m128d a0 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)ptr));
@@ -107,6 +96,32 @@ NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
{ return _mm256_castpd_si256(npyv_loadn_f64((const double*)ptr, stride)); }
NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
{ return _mm256_castpd_si256(npyv_loadn_f64((const double*)ptr, stride)); }
+
+//// 64-bit load over 32-bit stride
+NPY_FINLINE npyv_f32 npyv_loadn2_f32(const float *ptr, npy_intp stride)
+{
+ __m128d a0 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)ptr));
+ __m128d a2 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)(ptr + stride*2)));
+ __m128d a01 = _mm_loadh_pd(a0, (const double*)(ptr + stride));
+ __m128d a23 = _mm_loadh_pd(a2, (const double*)(ptr + stride*3));
+ return _mm256_castpd_ps(_mm256_insertf128_pd(_mm256_castpd128_pd256(a01), a23, 1));
+}
+NPY_FINLINE npyv_u32 npyv_loadn2_u32(const npy_uint32 *ptr, npy_intp stride)
+{ return _mm256_castps_si256(npyv_loadn2_f32((const float*)ptr, stride)); }
+NPY_FINLINE npyv_s32 npyv_loadn2_s32(const npy_int32 *ptr, npy_intp stride)
+{ return _mm256_castps_si256(npyv_loadn2_f32((const float*)ptr, stride)); }
+
+//// 128-bit load over 64-bit stride
+NPY_FINLINE npyv_f64 npyv_loadn2_f64(const double *ptr, npy_intp stride)
+{
+ __m256d a = npyv_loadl_f64(ptr);
+ return _mm256_insertf128_pd(a, _mm_loadu_pd(ptr + stride), 1);
+}
+NPY_FINLINE npyv_u64 npyv_loadn2_u64(const npy_uint64 *ptr, npy_intp stride)
+{ return _mm256_castpd_si256(npyv_loadn2_f64((const double*)ptr, stride)); }
+NPY_FINLINE npyv_s64 npyv_loadn2_s64(const npy_int64 *ptr, npy_intp stride)
+{ return _mm256_castpd_si256(npyv_loadn2_f64((const double*)ptr, stride)); }
+
/***************************
* Non-contiguous Store
***************************/
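The loadn2 family reads lane pairs: stride is still counted in single lanes, and each step of it supplies two contiguous lanes of the result. A scalar sketch of the 32-bit AVX2 variant (hypothetical reference helper, fixed at 8 lanes / 4 pairs):
#include <stddef.h>
/* Scalar model of npyv_loadn2_u32 on AVX2 (8 lanes = 4 pairs):
 * pair p of the result is read contiguously from ptr + stride*p. */
static void loadn2_u32_ref(const unsigned *ptr, ptrdiff_t stride, unsigned out[8])
{
    for (int p = 0; p < 4; ++p) {
        out[2*p + 0] = ptr[stride*p + 0];
        out[2*p + 1] = ptr[stride*p + 1];
    }
}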
@@ -143,6 +158,32 @@ NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
{ npyv_storen_f64((double*)ptr, stride, _mm256_castsi256_pd(a)); }
+//// 64-bit store over 32-bit stride
+NPY_FINLINE void npyv_storen2_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a)
+{
+ __m128d a0 = _mm256_castpd256_pd128(_mm256_castsi256_pd(a));
+ __m128d a1 = _mm256_extractf128_pd(_mm256_castsi256_pd(a), 1);
+ _mm_storel_pd((double*)ptr, a0);
+ _mm_storeh_pd((double*)(ptr + stride), a0);
+ _mm_storel_pd((double*)(ptr + stride*2), a1);
+ _mm_storeh_pd((double*)(ptr + stride*3), a1);
+}
+NPY_FINLINE void npyv_storen2_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a)
+{ npyv_storen2_u32((npy_uint32*)ptr, stride, a); }
+NPY_FINLINE void npyv_storen2_f32(float *ptr, npy_intp stride, npyv_f32 a)
+{ npyv_storen2_u32((npy_uint32*)ptr, stride, _mm256_castps_si256(a)); }
+
+//// 128-bit store over 64-bit stride
+NPY_FINLINE void npyv_storen2_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
+{
+ npyv_storel_u64(ptr, a);
+ npyv_storeh_u64(ptr + stride, a);
+}
+NPY_FINLINE void npyv_storen2_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
+{ npyv_storen2_u64((npy_uint64*)ptr, stride, a); }
+NPY_FINLINE void npyv_storen2_f64(double *ptr, npy_intp stride, npyv_f64 a)
+{ npyv_storen2_u64((npy_uint64*)ptr, stride, _mm256_castpd_si256(a)); }
+
/*********************************
* Partial Load
*********************************/
@@ -174,7 +215,7 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n
const __m256i steps = npyv_set_s64(0, 1, 2, 3);
__m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
- __m256i payload = _mm256_maskload_epi64((const void*)ptr, mask);
+ __m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask);
return _mm256_blendv_epi8(vfill, payload, mask);
}
// fill zero to rest lanes
@@ -184,7 +225,45 @@ NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
const __m256i steps = npyv_set_s64(0, 1, 2, 3);
__m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
- return _mm256_maskload_epi64((const void*)ptr, mask);
+ return _mm256_maskload_epi64((const long long*)ptr, mask);
+}
+
+//// 64-bit nlane
+NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane,
+ npy_int32 fill_lo, npy_int32 fill_hi)
+{
+ assert(nlane > 0);
+ const __m256i vfill = npyv_set_s32(
+ fill_lo, fill_hi, fill_lo, fill_hi,
+ fill_lo, fill_hi, fill_lo, fill_hi
+ );
+ const __m256i steps = npyv_set_s64(0, 1, 2, 3);
+ __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
+ __m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
+ __m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask);
+ return _mm256_blendv_epi8(vfill, payload, mask);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
+{ return npyv_load_tillz_s64((const npy_int64*)ptr, nlane); }
+
+/// 128-bit nlane
+NPY_FINLINE npyv_u64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
+{
+ assert(nlane > 0);
+ npy_int64 m = -((npy_int64)(nlane > 1));
+ __m256i mask = npyv_set_s64(-1, -1, m, m);
+ return _mm256_maskload_epi64((const long long*)ptr, mask);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_u64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane,
+ npy_int64 fill_lo, npy_int64 fill_hi)
+{
+ const __m256i vfill = npyv_set_s64(0, 0, fill_lo, fill_hi);
+ npy_int64 m = -((npy_int64)(nlane > 1));
+ __m256i mask = npyv_set_s64(-1, -1, m, m);
+ __m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask);
+ return _mm256_blendv_epi8(vfill, payload, mask);
}
/*********************************
* Non-contiguous partial load
@@ -216,12 +295,53 @@ npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_
const __m256i steps = npyv_set_s64(0, 1, 2, 3);
__m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
- return _mm256_mask_i64gather_epi64(vfill, (const void*)ptr, idx, mask, 8);
+ return _mm256_mask_i64gather_epi64(vfill, (const long long*)ptr, idx, mask, 8);
}
// fill zero to rest lanes
NPY_FINLINE npyv_s64
npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
{ return npyv_loadn_till_s64(ptr, stride, nlane, 0); }
+
+//// 64-bit load over 32-bit stride
+NPY_FINLINE npyv_s64 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane,
+ npy_int32 fill_lo, npy_int32 fill_hi)
+{
+ assert(nlane > 0);
+ const __m256i vfill = npyv_set_s32(
+ fill_lo, fill_hi, fill_lo, fill_hi,
+ fill_lo, fill_hi, fill_lo, fill_hi
+ );
+ const __m256i idx = npyv_set_s64(0, 1*stride, 2*stride, 3*stride);
+ const __m256i steps = npyv_set_s64(0, 1, 2, 3);
+ __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
+ __m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
+ return _mm256_mask_i64gather_epi64(vfill, (const long long*)ptr, idx, mask, 4);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
+{ return npyv_loadn2_till_s32(ptr, stride, nlane, 0, 0); }
+
+//// 128-bit load over 64-bit stride
+NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane,
+ npy_int64 fill_lo, npy_int64 fill_hi)
+{
+ assert(nlane > 0);
+ __m256i a = npyv_loadl_s64(ptr);
+#if defined(_MSC_VER) && defined(_M_IX86)
+ __m128i fill =_mm_setr_epi32(
+ (int)fill_lo, (int)(fill_lo >> 32),
+ (int)fill_hi, (int)(fill_hi >> 32)
+ );
+#else
+ __m128i fill = _mm_set_epi64x(fill_hi, fill_lo);
+#endif
+ __m128i b = nlane > 1 ? _mm_loadu_si128((const __m128i*)(ptr + stride)) : fill;
+ return _mm256_inserti128_si256(a, b, 1);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
+{ return npyv_loadn2_till_s64(ptr, stride, nlane, 0, 0); }
+
/*********************************
* Partial store
*********************************/
@@ -241,7 +361,21 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a
const __m256i steps = npyv_set_s64(0, 1, 2, 3);
__m256i vnlane = npyv_setall_s64(nlane > 8 ? 8 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
- _mm256_maskstore_epi64((void*)ptr, mask, a);
+ _mm256_maskstore_epi64((long long*)ptr, mask, a);
+}
+
+//// 64-bit nlane
+NPY_FINLINE void npyv_store2_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a)
+{ npyv_store_till_s64((npy_int64*)ptr, nlane, a); }
+
+//// 128-bit nlane
+NPY_FINLINE void npyv_store2_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0);
+ npyv_storel_s64(ptr, a);
+ if (nlane > 1) {
+ npyv_storeh_s64(ptr + 2, a);
+ }
}
/*********************************
* Non-contiguous partial store
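In the store2 partial stores, nlane counts pairs rather than single lanes: the low pair is always written and the high pair only when more than one pair is requested. A scalar sketch of the 64-bit AVX2 case (illustrative helper, 4 lanes = 2 pairs assumed):
#include <stddef.h>
/* Scalar model of npyv_store2_till_s64 on AVX2 (4 lanes = 2 pairs). */
static void store2_till_s64_ref(long long *ptr, size_t nlane, const long long vec[4])
{
    ptr[0] = vec[0];        /* low pair is always stored   */
    ptr[1] = vec[1];
    if (nlane > 1) {        /* high pair only when asked   */
        ptr[2] = vec[2];
        ptr[3] = vec[3];
    }
}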
@@ -252,23 +386,52 @@ NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp
assert(nlane > 0);
__m128i a0 = _mm256_castsi256_si128(a);
__m128i a1 = _mm256_extracti128_si256(a, 1);
+
+ ptr[stride*0] = _mm_extract_epi32(a0, 0);
switch(nlane) {
- default:
- ptr[stride*7] = _mm_extract_epi32(a1, 3);
- case 7:
- ptr[stride*6] = _mm_extract_epi32(a1, 2);
- case 6:
- ptr[stride*5] = _mm_extract_epi32(a1, 1);
+ case 1:
+ return;
+ case 2:
+ ptr[stride*1] = _mm_extract_epi32(a0, 1);
+ return;
+ case 3:
+ ptr[stride*1] = _mm_extract_epi32(a0, 1);
+ ptr[stride*2] = _mm_extract_epi32(a0, 2);
+ return;
+ case 4:
+ ptr[stride*1] = _mm_extract_epi32(a0, 1);
+ ptr[stride*2] = _mm_extract_epi32(a0, 2);
+ ptr[stride*3] = _mm_extract_epi32(a0, 3);
+ return;
case 5:
+ ptr[stride*1] = _mm_extract_epi32(a0, 1);
+ ptr[stride*2] = _mm_extract_epi32(a0, 2);
+ ptr[stride*3] = _mm_extract_epi32(a0, 3);
ptr[stride*4] = _mm_extract_epi32(a1, 0);
- case 4:
+ return;
+ case 6:
+ ptr[stride*1] = _mm_extract_epi32(a0, 1);
+ ptr[stride*2] = _mm_extract_epi32(a0, 2);
ptr[stride*3] = _mm_extract_epi32(a0, 3);
- case 3:
+ ptr[stride*4] = _mm_extract_epi32(a1, 0);
+ ptr[stride*5] = _mm_extract_epi32(a1, 1);
+ return;
+ case 7:
+ ptr[stride*1] = _mm_extract_epi32(a0, 1);
ptr[stride*2] = _mm_extract_epi32(a0, 2);
- case 2:
+ ptr[stride*3] = _mm_extract_epi32(a0, 3);
+ ptr[stride*4] = _mm_extract_epi32(a1, 0);
+ ptr[stride*5] = _mm_extract_epi32(a1, 1);
+ ptr[stride*6] = _mm_extract_epi32(a1, 2);
+ return;
+ default:
ptr[stride*1] = _mm_extract_epi32(a0, 1);
- case 1:
- ptr[stride*0] = _mm_extract_epi32(a0, 0);
+ ptr[stride*2] = _mm_extract_epi32(a0, 2);
+ ptr[stride*3] = _mm_extract_epi32(a0, 3);
+ ptr[stride*4] = _mm_extract_epi32(a1, 0);
+ ptr[stride*5] = _mm_extract_epi32(a1, 1);
+ ptr[stride*6] = _mm_extract_epi32(a1, 2);
+ ptr[stride*7] = _mm_extract_epi32(a1, 3);
}
}
//// 64
@@ -277,19 +440,60 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
assert(nlane > 0);
__m128d a0 = _mm256_castpd256_pd128(_mm256_castsi256_pd(a));
__m128d a1 = _mm256_extractf128_pd(_mm256_castsi256_pd(a), 1);
+
double *dptr = (double*)ptr;
+ _mm_storel_pd(dptr, a0);
switch(nlane) {
- default:
- _mm_storeh_pd(dptr + stride * 3, a1);
+ case 1:
+ return;
+ case 2:
+ _mm_storeh_pd(dptr + stride * 1, a0);
+ return;
case 3:
+ _mm_storeh_pd(dptr + stride * 1, a0);
_mm_storel_pd(dptr + stride * 2, a1);
- case 2:
+ return;
+ default:
_mm_storeh_pd(dptr + stride * 1, a0);
+ _mm_storel_pd(dptr + stride * 2, a1);
+ _mm_storeh_pd(dptr + stride * 3, a1);
+ }
+}
+
+//// 64-bit store over 32-bit stride
+NPY_FINLINE void npyv_storen2_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+ __m128d a0 = _mm256_castpd256_pd128(_mm256_castsi256_pd(a));
+ __m128d a1 = _mm256_extractf128_pd(_mm256_castsi256_pd(a), 1);
+
+ _mm_storel_pd((double*)ptr, a0);
+ switch(nlane) {
case 1:
- _mm_storel_pd(dptr + stride * 0, a0);
+ return;
+ case 2:
+ _mm_storeh_pd((double*)(ptr + stride * 1), a0);
+ return;
+ case 3:
+ _mm_storeh_pd((double*)(ptr + stride * 1), a0);
+ _mm_storel_pd((double*)(ptr + stride * 2), a1);
+ return;
+ default:
+ _mm_storeh_pd((double*)(ptr + stride * 1), a0);
+ _mm_storel_pd((double*)(ptr + stride * 2), a1);
+ _mm_storeh_pd((double*)(ptr + stride * 3), a1);
}
}
+//// 128-bit store over 64-bit stride
+NPY_FINLINE void npyv_storen2_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0);
+ npyv_storel_s64(ptr, a);
+ if (nlane > 1) {
+ npyv_storeh_s64(ptr + stride, a);
+ }
+}
/*****************************************************************************
* Implement partial load/store for u32/f32/u64/f64... via reinterpret cast
*****************************************************************************/
@@ -300,7 +504,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
union { \
npyv_lanetype_##F_SFX from_##F_SFX; \
npyv_lanetype_##T_SFX to_##T_SFX; \
- } pun = {.from_##F_SFX = fill}; \
+ } pun; \
+ pun.from_##F_SFX = fill; \
return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX( \
(const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX \
)); \
@@ -312,7 +517,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
union { \
npyv_lanetype_##F_SFX from_##F_SFX; \
npyv_lanetype_##T_SFX to_##T_SFX; \
- } pun = {.from_##F_SFX = fill}; \
+ } pun; \
+ pun.from_##F_SFX = fill; \
return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX( \
(const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX \
)); \
@@ -353,6 +559,110 @@ NPYV_IMPL_AVX2_REST_PARTIAL_TYPES(f32, s32)
NPYV_IMPL_AVX2_REST_PARTIAL_TYPES(u64, s64)
NPYV_IMPL_AVX2_REST_PARTIAL_TYPES(f64, s64)
+// 128-bit/64-bit stride (load/store pair)
+#define NPYV_IMPL_AVX2_REST_PARTIAL_TYPES_PAIR(F_SFX, T_SFX) \
+ NPY_FINLINE npyv_##F_SFX npyv_load2_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, \
+ npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \
+ { \
+ union pun { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ }; \
+ union pun pun_lo; \
+ union pun pun_hi; \
+ pun_lo.from_##F_SFX = fill_lo; \
+ pun_hi.from_##F_SFX = fill_hi; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane, pun_lo.to_##T_SFX, pun_hi.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn2_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \
+ npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \
+ { \
+ union pun { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ }; \
+ union pun pun_lo; \
+ union pun pun_hi; \
+ pun_lo.from_##F_SFX = fill_lo; \
+ pun_hi.from_##F_SFX = fill_hi; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun_lo.to_##T_SFX, \
+ pun_hi.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_load2_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn2_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \
+ )); \
+ } \
+ NPY_FINLINE void npyv_store2_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_store2_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ } \
+ NPY_FINLINE void npyv_storen2_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_storen2_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ }
+
+NPYV_IMPL_AVX2_REST_PARTIAL_TYPES_PAIR(u32, s32)
+NPYV_IMPL_AVX2_REST_PARTIAL_TYPES_PAIR(f32, s32)
+NPYV_IMPL_AVX2_REST_PARTIAL_TYPES_PAIR(u64, s64)
+NPYV_IMPL_AVX2_REST_PARTIAL_TYPES_PAIR(f64, s64)
+
+/************************************************************
+ * de-interleave load / interleave contiguous store
+ ************************************************************/
+// two channels
+#define NPYV_IMPL_AVX2_MEM_INTERLEAVE(SFX, ZSFX) \
+ NPY_FINLINE npyv_##ZSFX##x2 npyv_zip_##ZSFX(npyv_##ZSFX, npyv_##ZSFX); \
+ NPY_FINLINE npyv_##ZSFX##x2 npyv_unzip_##ZSFX(npyv_##ZSFX, npyv_##ZSFX); \
+ NPY_FINLINE npyv_##SFX##x2 npyv_load_##SFX##x2( \
+ const npyv_lanetype_##SFX *ptr \
+ ) { \
+ return npyv_unzip_##ZSFX( \
+ npyv_load_##SFX(ptr), npyv_load_##SFX(ptr+npyv_nlanes_##SFX) \
+ ); \
+ } \
+ NPY_FINLINE void npyv_store_##SFX##x2( \
+ npyv_lanetype_##SFX *ptr, npyv_##SFX##x2 v \
+ ) { \
+ npyv_##SFX##x2 zip = npyv_zip_##ZSFX(v.val[0], v.val[1]); \
+ npyv_store_##SFX(ptr, zip.val[0]); \
+ npyv_store_##SFX(ptr + npyv_nlanes_##SFX, zip.val[1]); \
+ }
+
+NPYV_IMPL_AVX2_MEM_INTERLEAVE(u8, u8)
+NPYV_IMPL_AVX2_MEM_INTERLEAVE(s8, u8)
+NPYV_IMPL_AVX2_MEM_INTERLEAVE(u16, u16)
+NPYV_IMPL_AVX2_MEM_INTERLEAVE(s16, u16)
+NPYV_IMPL_AVX2_MEM_INTERLEAVE(u32, u32)
+NPYV_IMPL_AVX2_MEM_INTERLEAVE(s32, u32)
+NPYV_IMPL_AVX2_MEM_INTERLEAVE(u64, u64)
+NPYV_IMPL_AVX2_MEM_INTERLEAVE(s64, u64)
+NPYV_IMPL_AVX2_MEM_INTERLEAVE(f32, f32)
+NPYV_IMPL_AVX2_MEM_INTERLEAVE(f64, f64)
+
/*********************************
* Lookup tables
*********************************/
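The two-channel helpers defined by NPYV_IMPL_AVX2_MEM_INTERLEAVE de-interleave 2*nlanes contiguous elements into two vectors on load and re-interleave them on store. A scalar sketch of the contract (reference helpers only; the real code goes through npyv_unzip/npyv_zip):
#include <stddef.h>
/* Scalar model of npyv_load_f32x2 / npyv_store_f32x2. */
static void load_f32x2_ref(const float *ptr, float *ch0, float *ch1, size_t nlanes)
{
    for (size_t i = 0; i < nlanes; ++i) {
        ch0[i] = ptr[2*i + 0];   /* even elements -> first channel  */
        ch1[i] = ptr[2*i + 1];   /* odd elements  -> second channel */
    }
}
static void store_f32x2_ref(float *ptr, const float *ch0, const float *ch1, size_t nlanes)
{
    for (size_t i = 0; i < nlanes; ++i) {
        ptr[2*i + 0] = ch0[i];
        ptr[2*i + 1] = ch1[i];
    }
}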
diff --git a/numpy/core/src/common/simd/avx2/operators.h b/numpy/core/src/common/simd/avx2/operators.h
index c10267b21..7b9b6a344 100644
--- a/numpy/core/src/common/simd/avx2/operators.h
+++ b/numpy/core/src/common/simd/avx2/operators.h
@@ -205,7 +205,7 @@ NPY_FINLINE __m256i npyv_cmpge_u32(__m256i a, __m256i b)
#define npyv_cmple_u64(A, B) npyv_cmpge_u64(B, A)
#define npyv_cmple_s64(A, B) npyv_cmpge_s64(B, A)
-// precision comparison
+// precision comparison (ordered)
#define npyv_cmpeq_f32(A, B) _mm256_castps_si256(_mm256_cmp_ps(A, B, _CMP_EQ_OQ))
#define npyv_cmpeq_f64(A, B) _mm256_castpd_si256(_mm256_cmp_pd(A, B, _CMP_EQ_OQ))
#define npyv_cmpneq_f32(A, B) _mm256_castps_si256(_mm256_cmp_ps(A, B, _CMP_NEQ_UQ))
diff --git a/numpy/core/src/common/simd/avx2/reorder.h b/numpy/core/src/common/simd/avx2/reorder.h
index 4d6ec8f75..9ebe0e7f4 100644
--- a/numpy/core/src/common/simd/avx2/reorder.h
+++ b/numpy/core/src/common/simd/avx2/reorder.h
@@ -94,6 +94,75 @@ NPY_FINLINE npyv_f64x2 npyv_zip_f64(__m256d a, __m256d b)
return npyv_combine_f64(ab0, ab1);
}
+// deinterleave two vectors
+NPY_FINLINE npyv_u8x2 npyv_unzip_u8(npyv_u8 ab0, npyv_u8 ab1)
+{
+ const __m256i idx = _mm256_setr_epi8(
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+ );
+ __m256i ab_03 = _mm256_shuffle_epi8(ab0, idx);
+ __m256i ab_12 = _mm256_shuffle_epi8(ab1, idx);
+ npyv_u8x2 ab_lh = npyv_combine_u8(ab_03, ab_12);
+ npyv_u8x2 r;
+ r.val[0] = _mm256_unpacklo_epi64(ab_lh.val[0], ab_lh.val[1]);
+ r.val[1] = _mm256_unpackhi_epi64(ab_lh.val[0], ab_lh.val[1]);
+ return r;
+}
+#define npyv_unzip_s8 npyv_unzip_u8
+
+NPY_FINLINE npyv_u16x2 npyv_unzip_u16(npyv_u16 ab0, npyv_u16 ab1)
+{
+ const __m256i idx = _mm256_setr_epi8(
+ 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15,
+ 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15
+ );
+ __m256i ab_03 = _mm256_shuffle_epi8(ab0, idx);
+ __m256i ab_12 = _mm256_shuffle_epi8(ab1, idx);
+ npyv_u16x2 ab_lh = npyv_combine_u16(ab_03, ab_12);
+ npyv_u16x2 r;
+ r.val[0] = _mm256_unpacklo_epi64(ab_lh.val[0], ab_lh.val[1]);
+ r.val[1] = _mm256_unpackhi_epi64(ab_lh.val[0], ab_lh.val[1]);
+ return r;
+}
+#define npyv_unzip_s16 npyv_unzip_u16
+
+NPY_FINLINE npyv_u32x2 npyv_unzip_u32(npyv_u32 ab0, npyv_u32 ab1)
+{
+ const __m256i idx = npyv_set_u32(0, 2, 4, 6, 1, 3, 5, 7);
+ __m256i abl = _mm256_permutevar8x32_epi32(ab0, idx);
+ __m256i abh = _mm256_permutevar8x32_epi32(ab1, idx);
+ return npyv_combine_u32(abl, abh);
+}
+#define npyv_unzip_s32 npyv_unzip_u32
+
+NPY_FINLINE npyv_u64x2 npyv_unzip_u64(npyv_u64 ab0, npyv_u64 ab1)
+{
+ npyv_u64x2 ab_lh = npyv_combine_u64(ab0, ab1);
+ npyv_u64x2 r;
+ r.val[0] = _mm256_unpacklo_epi64(ab_lh.val[0], ab_lh.val[1]);
+ r.val[1] = _mm256_unpackhi_epi64(ab_lh.val[0], ab_lh.val[1]);
+ return r;
+}
+#define npyv_unzip_s64 npyv_unzip_u64
+
+NPY_FINLINE npyv_f32x2 npyv_unzip_f32(npyv_f32 ab0, npyv_f32 ab1)
+{
+ const __m256i idx = npyv_set_u32(0, 2, 4, 6, 1, 3, 5, 7);
+ __m256 abl = _mm256_permutevar8x32_ps(ab0, idx);
+ __m256 abh = _mm256_permutevar8x32_ps(ab1, idx);
+ return npyv_combine_f32(abl, abh);
+}
+
+NPY_FINLINE npyv_f64x2 npyv_unzip_f64(npyv_f64 ab0, npyv_f64 ab1)
+{
+ npyv_f64x2 ab_lh = npyv_combine_f64(ab0, ab1);
+ npyv_f64x2 r;
+ r.val[0] = _mm256_unpacklo_pd(ab_lh.val[0], ab_lh.val[1]);
+ r.val[1] = _mm256_unpackhi_pd(ab_lh.val[0], ab_lh.val[1]);
+ return r;
+}
+
// Reverse elements of each 64-bit lane
NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a)
{
@@ -126,4 +195,22 @@ NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a)
return _mm256_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1));
}
+// Permuting the elements of each 128-bit lane by immediate index for
+// each element.
+#define npyv_permi128_u32(A, E0, E1, E2, E3) \
+ _mm256_shuffle_epi32(A, _MM_SHUFFLE(E3, E2, E1, E0))
+
+#define npyv_permi128_s32 npyv_permi128_u32
+
+#define npyv_permi128_u64(A, E0, E1) \
+ _mm256_shuffle_epi32(A, _MM_SHUFFLE(((E1)<<1)+1, ((E1)<<1), ((E0)<<1)+1, ((E0)<<1)))
+
+#define npyv_permi128_s64 npyv_permi128_u64
+
+#define npyv_permi128_f32(A, E0, E1, E2, E3) \
+ _mm256_permute_ps(A, _MM_SHUFFLE(E3, E2, E1, E0))
+
+#define npyv_permi128_f64(A, E0, E1) \
+ _mm256_permute_pd(A, ((E1)<<3) | ((E0)<<2) | ((E1)<<1) | (E0))
+
#endif // _NPY_SIMD_AVX2_REORDER_H
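npyv_permi128 shuffles every 128-bit lane independently with the same immediate indices, so e.g. npyv_permi128_u32(a, 1, 0, 3, 2) swaps neighbouring elements inside each lane. A scalar sketch of the 32-bit form on AVX2 (hypothetical helper, 2 lanes of 4 elements):
/* Scalar model of npyv_permi128_u32(A, E0, E1, E2, E3) on AVX2:
 * each 128-bit lane (4 x u32) picks its elements by the same indices. */
static void permi128_u32_ref(const unsigned in[8], unsigned out[8],
                             int e0, int e1, int e2, int e3)
{
    const int idx[4] = {e0, e1, e2, e3};          /* each in 0..3      */
    for (int lane = 0; lane < 2; ++lane) {        /* two 128-bit lanes */
        for (int j = 0; j < 4; ++j)
            out[4*lane + j] = in[4*lane + idx[j]];
    }
}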
diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h b/numpy/core/src/common/simd/avx512/arithmetic.h
index 1290dc0ad..a63da87d0 100644
--- a/numpy/core/src/common/simd/avx512/arithmetic.h
+++ b/numpy/core/src/common/simd/avx512/arithmetic.h
@@ -349,6 +349,10 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
// negate multiply and subtract, -(a*b) - c
#define npyv_nmulsub_f32 _mm512_fnmsub_ps
#define npyv_nmulsub_f64 _mm512_fnmsub_pd
+// multiply, add for odd elements and subtract even elements.
+// (a * b) -+ c
+#define npyv_muladdsub_f32 _mm512_fmaddsub_ps
+#define npyv_muladdsub_f64 _mm512_fmaddsub_pd
/***************************
* Summation: Calculates the sum of all vector elements.
diff --git a/numpy/core/src/common/simd/avx512/avx512.h b/numpy/core/src/common/simd/avx512/avx512.h
index 0946e6443..aa6abe256 100644
--- a/numpy/core/src/common/simd/avx512/avx512.h
+++ b/numpy/core/src/common/simd/avx512/avx512.h
@@ -7,6 +7,7 @@
#define NPY_SIMD_F64 1
#define NPY_SIMD_FMA3 1 // native support
#define NPY_SIMD_BIGENDIAN 0
+#define NPY_SIMD_CMPSIGNAL 0
// Enough limit to allow us to use _mm512_i32gather_* and _mm512_i32scatter_*
#define NPY_SIMD_MAXLOAD_STRIDE32 (0x7fffffff / 16)
#define NPY_SIMD_MAXSTORE_STRIDE32 (0x7fffffff / 16)
diff --git a/numpy/core/src/common/simd/avx512/maskop.h b/numpy/core/src/common/simd/avx512/maskop.h
index d1c188390..88fa4a68c 100644
--- a/numpy/core/src/common/simd/avx512/maskop.h
+++ b/numpy/core/src/common/simd/avx512/maskop.h
@@ -51,4 +51,17 @@ NPYV_IMPL_AVX512_MASK_ADDSUB(s64, b64, epi64)
NPYV_IMPL_AVX512_MASK_ADDSUB(f32, b32, ps)
NPYV_IMPL_AVX512_MASK_ADDSUB(f64, b64, pd)
+// division, m ? a / b : c
+NPY_FINLINE npyv_f32 npyv_ifdiv_f32(npyv_b32 m, npyv_f32 a, npyv_f32 b, npyv_f32 c)
+{ return _mm512_mask_div_ps(c, m, a, b); }
+// conditional division, m ? a / b : 0
+NPY_FINLINE npyv_f32 npyv_ifdivz_f32(npyv_b32 m, npyv_f32 a, npyv_f32 b)
+{ return _mm512_maskz_div_ps(m, a, b); }
+// division, m ? a / b : c
+NPY_FINLINE npyv_f64 npyv_ifdiv_f64(npyv_b32 m, npyv_f64 a, npyv_f64 b, npyv_f64 c)
+{ return _mm512_mask_div_pd(c, m, a, b); }
+// conditional division, m ? a / b : 0
+NPY_FINLINE npyv_f64 npyv_ifdivz_f64(npyv_b32 m, npyv_f64 a, npyv_f64 b)
+{ return _mm512_maskz_div_pd(m, a, b); }
+
#endif // _NPY_SIMD_AVX512_MASKOP_H
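The masked-division helpers mirror the existing mask add/sub ones: per lane, a/b is produced only where the mask is set, and masked-off lanes pass through c (or zero) without performing the division, so they cannot raise FP exceptions. A per-lane scalar sketch of the contract (illustrative only):
/* Per-lane contract of npyv_ifdiv_f32 / npyv_ifdivz_f32. */
static float ifdiv_lane_ref(int m, float a, float b, float c)
{ return m ? a / b : c; }
static float ifdivz_lane_ref(int m, float a, float b)
{ return m ? a / b : 0.0f; }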
diff --git a/numpy/core/src/common/simd/avx512/memory.h b/numpy/core/src/common/simd/avx512/memory.h
index 03fcb4630..fdf96a92c 100644
--- a/numpy/core/src/common/simd/avx512/memory.h
+++ b/numpy/core/src/common/simd/avx512/memory.h
@@ -120,6 +120,52 @@ NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
{ return npyv_loadn_u64((const npy_uint64*)ptr, stride); }
NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride)
{ return _mm512_castsi512_pd(npyv_loadn_u64((const npy_uint64*)ptr, stride)); }
+
+//// 64-bit load over 32-bit stride
+NPY_FINLINE npyv_u32 npyv_loadn2_u32(const npy_uint32 *ptr, npy_intp stride)
+{
+ __m128d a = _mm_loadh_pd(
+ _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)ptr)),
+ (const double*)(ptr + stride)
+ );
+ __m128d b = _mm_loadh_pd(
+ _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)(ptr + stride*2))),
+ (const double*)(ptr + stride*3)
+ );
+ __m128d c = _mm_loadh_pd(
+ _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)(ptr + stride*4))),
+ (const double*)(ptr + stride*5)
+ );
+ __m128d d = _mm_loadh_pd(
+ _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)(ptr + stride*6))),
+ (const double*)(ptr + stride*7)
+ );
+ return _mm512_castpd_si512(npyv512_combine_pd256(
+ _mm256_insertf128_pd(_mm256_castpd128_pd256(a), b, 1),
+ _mm256_insertf128_pd(_mm256_castpd128_pd256(c), d, 1)
+ ));
+}
+NPY_FINLINE npyv_s32 npyv_loadn2_s32(const npy_int32 *ptr, npy_intp stride)
+{ return npyv_loadn2_u32((const npy_uint32*)ptr, stride); }
+NPY_FINLINE npyv_f32 npyv_loadn2_f32(const float *ptr, npy_intp stride)
+{ return _mm512_castsi512_ps(npyv_loadn2_u32((const npy_uint32*)ptr, stride)); }
+
+//// 128-bit load over 64-bit stride
+NPY_FINLINE npyv_f64 npyv_loadn2_f64(const double *ptr, npy_intp stride)
+{
+ __m128d a = _mm_loadu_pd(ptr);
+ __m128d b = _mm_loadu_pd(ptr + stride);
+ __m128d c = _mm_loadu_pd(ptr + stride * 2);
+ __m128d d = _mm_loadu_pd(ptr + stride * 3);
+ return npyv512_combine_pd256(
+ _mm256_insertf128_pd(_mm256_castpd128_pd256(a), b, 1),
+ _mm256_insertf128_pd(_mm256_castpd128_pd256(c), d, 1)
+ );
+}
+NPY_FINLINE npyv_u64 npyv_loadn2_u64(const npy_uint64 *ptr, npy_intp stride)
+{ return npyv_reinterpret_u64_f64(npyv_loadn2_f64((const double*)ptr, stride)); }
+NPY_FINLINE npyv_s64 npyv_loadn2_s64(const npy_int64 *ptr, npy_intp stride)
+{ return npyv_loadn2_u64((const npy_uint64*)ptr, stride); }
/***************************
* Non-contiguous Store
***************************/
@@ -151,6 +197,48 @@ NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a)
{ npyv_storen_u64((npy_uint64*)ptr, stride, _mm512_castpd_si512(a)); }
+//// 64-bit store over 32-bit stride
+NPY_FINLINE void npyv_storen2_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a)
+{
+ __m256d lo = _mm512_castpd512_pd256(_mm512_castsi512_pd(a));
+ __m256d hi = _mm512_extractf64x4_pd(_mm512_castsi512_pd(a), 1);
+ __m128d e0 = _mm256_castpd256_pd128(lo);
+ __m128d e1 = _mm256_extractf128_pd(lo, 1);
+ __m128d e2 = _mm256_castpd256_pd128(hi);
+ __m128d e3 = _mm256_extractf128_pd(hi, 1);
+ _mm_storel_pd((double*)(ptr + stride * 0), e0);
+ _mm_storeh_pd((double*)(ptr + stride * 1), e0);
+ _mm_storel_pd((double*)(ptr + stride * 2), e1);
+ _mm_storeh_pd((double*)(ptr + stride * 3), e1);
+ _mm_storel_pd((double*)(ptr + stride * 4), e2);
+ _mm_storeh_pd((double*)(ptr + stride * 5), e2);
+ _mm_storel_pd((double*)(ptr + stride * 6), e3);
+ _mm_storeh_pd((double*)(ptr + stride * 7), e3);
+}
+NPY_FINLINE void npyv_storen2_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a)
+{ npyv_storen2_u32((npy_uint32*)ptr, stride, a); }
+NPY_FINLINE void npyv_storen2_f32(float *ptr, npy_intp stride, npyv_f32 a)
+{ npyv_storen2_u32((npy_uint32*)ptr, stride, _mm512_castps_si512(a)); }
+
+//// 128-bit store over 64-bit stride
+NPY_FINLINE void npyv_storen2_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
+{
+ __m256i lo = npyv512_lower_si256(a);
+ __m256i hi = npyv512_higher_si256(a);
+ __m128i e0 = _mm256_castsi256_si128(lo);
+ __m128i e1 = _mm256_extracti128_si256(lo, 1);
+ __m128i e2 = _mm256_castsi256_si128(hi);
+ __m128i e3 = _mm256_extracti128_si256(hi, 1);
+ _mm_storeu_si128((__m128i*)(ptr + stride * 0), e0);
+ _mm_storeu_si128((__m128i*)(ptr + stride * 1), e1);
+ _mm_storeu_si128((__m128i*)(ptr + stride * 2), e2);
+ _mm_storeu_si128((__m128i*)(ptr + stride * 3), e3);
+}
+NPY_FINLINE void npyv_storen2_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
+{ npyv_storen2_u64((npy_uint64*)ptr, stride, a); }
+NPY_FINLINE void npyv_storen2_f64(double *ptr, npy_intp stride, npyv_f64 a)
+{ npyv_storen2_u64((npy_uint64*)ptr, stride, _mm512_castpd_si512(a)); }
+
/*********************************
* Partial Load
*********************************/
@@ -159,14 +247,14 @@ NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, n
{
assert(nlane > 0);
const __m512i vfill = _mm512_set1_epi32(fill);
- const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
+ const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
return _mm512_mask_loadu_epi32(vfill, mask, (const __m512i*)ptr);
}
// fill zero to rest lanes
NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
{
assert(nlane > 0);
- const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
+ const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
return _mm512_maskz_loadu_epi32(mask, (const __m512i*)ptr);
}
//// 64
@@ -174,14 +262,44 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n
{
assert(nlane > 0);
const __m512i vfill = npyv_setall_s64(fill);
- const __mmask8 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
+ const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr);
}
// fill zero to rest lanes
NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
{
assert(nlane > 0);
- const __mmask8 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
+ const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
+ return _mm512_maskz_loadu_epi64(mask, (const __m512i*)ptr);
+}
+
+//// 64-bit nlane
+NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane,
+ npy_int32 fill_lo, npy_int32 fill_hi)
+{
+ assert(nlane > 0);
+ const __m512i vfill = _mm512_set4_epi32(fill_hi, fill_lo, fill_hi, fill_lo);
+ const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
+ return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
+{ return npyv_load_tillz_s64((const npy_int64*)ptr, nlane); }
+
+//// 128-bit nlane
+NPY_FINLINE npyv_u64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane,
+ npy_int64 fill_lo, npy_int64 fill_hi)
+{
+ assert(nlane > 0);
+ const __m512i vfill = _mm512_set4_epi64(fill_hi, fill_lo, fill_hi, fill_lo);
+ const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1;
+ return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
+{
+ assert(nlane > 0);
+ const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1;
return _mm512_maskz_loadu_epi64(mask, (const __m512i*)ptr);
}
/*********************************
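For the pair variants the AVX-512 mask covers 2*nlane bits, since each requested 128-bit pair occupies two 64-bit lanes of the __mmask8. A small sketch of that computation (hypothetical helper):
#include <stddef.h>
#include <stdint.h>
/* Mask used by npyv_load2_till_s64 / npyv_store2_till_s64 on AVX-512:
 * nlane counts 128-bit pairs; 4 pairs fill all 8 mask bits. */
static uint8_t pair_mask8(size_t nlane)
{
    return nlane > 3 ? 0xFFu : (uint8_t)((1u << (nlane * 2)) - 1);
}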
@@ -198,7 +316,7 @@ npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_
);
const __m512i idx = _mm512_mullo_epi32(steps, _mm512_set1_epi32((int)stride));
const __m512i vfill = _mm512_set1_epi32(fill);
- const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
+ const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
return _mm512_mask_i32gather_epi32(vfill, mask, idx, (const __m512i*)ptr, 4);
}
// fill zero to rest lanes
@@ -215,13 +333,48 @@ npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_
4*stride, 5*stride, 6*stride, 7*stride
);
const __m512i vfill = npyv_setall_s64(fill);
- const __mmask8 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
+ const __mmask8 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 8);
}
// fill zero to rest lanes
NPY_FINLINE npyv_s64
npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
{ return npyv_loadn_till_s64(ptr, stride, nlane, 0); }
+
+//// 64-bit load over 32-bit stride
+NPY_FINLINE npyv_s64 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane,
+ npy_int32 fill_lo, npy_int32 fill_hi)
+{
+ assert(nlane > 0);
+ const __m512i idx = npyv_set_s64(
+ 0*stride, 1*stride, 2*stride, 3*stride,
+ 4*stride, 5*stride, 6*stride, 7*stride
+ );
+ const __m512i vfill = _mm512_set4_epi32(fill_hi, fill_lo, fill_hi, fill_lo);
+ const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
+ return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 4);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
+{ return npyv_loadn2_till_s32(ptr, stride, nlane, 0, 0); }
+
+//// 128-bit load over 64-bit stride
+NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane,
+ npy_int64 fill_lo, npy_int64 fill_hi)
+{
+ assert(nlane > 0);
+ const __m512i idx = npyv_set_s64(
+ 0, 1, stride, stride+1,
+ stride*2, stride*2+1, stride*3, stride*3+1
+ );
+ const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1;
+ const __m512i vfill = _mm512_set4_epi64(fill_hi, fill_lo, fill_hi, fill_lo);
+ return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 8);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
+{ return npyv_loadn2_till_s64(ptr, stride, nlane, 0, 0); }
+
/*********************************
* Partial store
*********************************/
@@ -229,14 +382,30 @@ npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a)
{
assert(nlane > 0);
- const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
+ const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
_mm512_mask_storeu_epi32((__m512i*)ptr, mask, a);
}
//// 64
NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
{
assert(nlane > 0);
- const __mmask8 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
+ const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
+ _mm512_mask_storeu_epi64((__m512i*)ptr, mask, a);
+}
+
+//// 64-bit nlane
+NPY_FINLINE void npyv_store2_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+ const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
+ _mm512_mask_storeu_epi64((__m512i*)ptr, mask, a);
+}
+
+//// 128-bit nlane
+NPY_FINLINE void npyv_store2_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0);
+ const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1;
_mm512_mask_storeu_epi64((__m512i*)ptr, mask, a);
}
/*********************************
@@ -251,7 +420,7 @@ NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
);
const __m512i idx = _mm512_mullo_epi32(steps, _mm512_set1_epi32((int)stride));
- const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
+ const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
_mm512_mask_i32scatter_epi32((__m512i*)ptr, mask, idx, a, 4);
}
//// 64
@@ -262,7 +431,31 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
0*stride, 1*stride, 2*stride, 3*stride,
4*stride, 5*stride, 6*stride, 7*stride
);
- const __mmask8 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
+ const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
+ _mm512_mask_i64scatter_epi64((__m512i*)ptr, mask, idx, a, 8);
+}
+
+//// 64-bit store over 32-bit stride
+NPY_FINLINE void npyv_storen2_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+ const __m512i idx = npyv_set_s64(
+ 0*stride, 1*stride, 2*stride, 3*stride,
+ 4*stride, 5*stride, 6*stride, 7*stride
+ );
+ const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
+ _mm512_mask_i64scatter_epi64((__m512i*)ptr, mask, idx, a, 4);
+}
+
+//// 128-bit store over 64-bit stride
+NPY_FINLINE void npyv_storen2_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0);
+ const __m512i idx = npyv_set_s64(
+ 0, 1, stride, stride+1,
+ 2*stride, 2*stride+1, 3*stride, 3*stride+1
+ );
+ const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1;
_mm512_mask_i64scatter_epi64((__m512i*)ptr, mask, idx, a, 8);
}
@@ -331,6 +524,110 @@ NPYV_IMPL_AVX512_REST_PARTIAL_TYPES(f32, s32)
NPYV_IMPL_AVX512_REST_PARTIAL_TYPES(u64, s64)
NPYV_IMPL_AVX512_REST_PARTIAL_TYPES(f64, s64)
+// 128-bit/64-bit stride (pair load/store)
+#define NPYV_IMPL_AVX512_REST_PARTIAL_TYPES_PAIR(F_SFX, T_SFX) \
+ NPY_FINLINE npyv_##F_SFX npyv_load2_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, \
+ npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \
+ { \
+ union pun { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ }; \
+ union pun pun_lo; \
+ union pun pun_hi; \
+ pun_lo.from_##F_SFX = fill_lo; \
+ pun_hi.from_##F_SFX = fill_hi; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane, pun_lo.to_##T_SFX, pun_hi.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn2_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \
+ npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \
+ { \
+ union pun { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ }; \
+ union pun pun_lo; \
+ union pun pun_hi; \
+ pun_lo.from_##F_SFX = fill_lo; \
+ pun_hi.from_##F_SFX = fill_hi; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun_lo.to_##T_SFX, \
+ pun_hi.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_load2_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn2_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \
+ )); \
+ } \
+ NPY_FINLINE void npyv_store2_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_store2_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ } \
+ NPY_FINLINE void npyv_storen2_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_storen2_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ }
+
+NPYV_IMPL_AVX512_REST_PARTIAL_TYPES_PAIR(u32, s32)
+NPYV_IMPL_AVX512_REST_PARTIAL_TYPES_PAIR(f32, s32)
+NPYV_IMPL_AVX512_REST_PARTIAL_TYPES_PAIR(u64, s64)
+NPYV_IMPL_AVX512_REST_PARTIAL_TYPES_PAIR(f64, s64)
+
+/************************************************************
+ * de-interleave load / interleave contiguous store
+ ************************************************************/
+// two channels
+#define NPYV_IMPL_AVX512_MEM_INTERLEAVE(SFX, ZSFX) \
+ NPY_FINLINE npyv_##ZSFX##x2 npyv_zip_##ZSFX(npyv_##ZSFX, npyv_##ZSFX); \
+ NPY_FINLINE npyv_##ZSFX##x2 npyv_unzip_##ZSFX(npyv_##ZSFX, npyv_##ZSFX); \
+ NPY_FINLINE npyv_##SFX##x2 npyv_load_##SFX##x2( \
+ const npyv_lanetype_##SFX *ptr \
+ ) { \
+ return npyv_unzip_##ZSFX( \
+ npyv_load_##SFX(ptr), npyv_load_##SFX(ptr+npyv_nlanes_##SFX) \
+ ); \
+ } \
+ NPY_FINLINE void npyv_store_##SFX##x2( \
+ npyv_lanetype_##SFX *ptr, npyv_##SFX##x2 v \
+ ) { \
+ npyv_##SFX##x2 zip = npyv_zip_##ZSFX(v.val[0], v.val[1]); \
+ npyv_store_##SFX(ptr, zip.val[0]); \
+ npyv_store_##SFX(ptr + npyv_nlanes_##SFX, zip.val[1]); \
+ }
+
+NPYV_IMPL_AVX512_MEM_INTERLEAVE(u8, u8)
+NPYV_IMPL_AVX512_MEM_INTERLEAVE(s8, u8)
+NPYV_IMPL_AVX512_MEM_INTERLEAVE(u16, u16)
+NPYV_IMPL_AVX512_MEM_INTERLEAVE(s16, u16)
+NPYV_IMPL_AVX512_MEM_INTERLEAVE(u32, u32)
+NPYV_IMPL_AVX512_MEM_INTERLEAVE(s32, u32)
+NPYV_IMPL_AVX512_MEM_INTERLEAVE(u64, u64)
+NPYV_IMPL_AVX512_MEM_INTERLEAVE(s64, u64)
+NPYV_IMPL_AVX512_MEM_INTERLEAVE(f32, f32)
+NPYV_IMPL_AVX512_MEM_INTERLEAVE(f64, f64)
+
/**************************************************
* Lookup table
*************************************************/
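As with the AVX2 version, the storen2 scatter writes each pair of lanes contiguously at a stride counted in single elements. A scalar sketch of the 32-bit AVX-512 case (reference helper, 16 lanes = 8 pairs):
#include <stddef.h>
/* Scalar model of npyv_storen2_u32 on AVX-512 (16 lanes = 8 pairs):
 * pair p of the vector lands contiguously at ptr + stride*p. */
static void storen2_u32_ref(unsigned *ptr, ptrdiff_t stride, const unsigned vec[16])
{
    for (int p = 0; p < 8; ++p) {
        ptr[stride*p + 0] = vec[2*p + 0];
        ptr[stride*p + 1] = vec[2*p + 1];
    }
}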
diff --git a/numpy/core/src/common/simd/avx512/reorder.h b/numpy/core/src/common/simd/avx512/reorder.h
index c0b2477f3..27e66b5e7 100644
--- a/numpy/core/src/common/simd/avx512/reorder.h
+++ b/numpy/core/src/common/simd/avx512/reorder.h
@@ -167,6 +167,140 @@ NPY_FINLINE npyv_f64x2 npyv_zip_f64(__m512d a, __m512d b)
return r;
}
+// deinterleave two vectors
+NPY_FINLINE npyv_u8x2 npyv_unzip_u8(npyv_u8 ab0, npyv_u8 ab1)
+{
+ npyv_u8x2 r;
+#ifdef NPY_HAVE_AVX512VBMI
+ const __m512i idx_a = npyv_set_u8(
+ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
+ 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
+ 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94,
+ 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126
+ );
+ const __m512i idx_b = npyv_set_u8(
+ 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
+ 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63,
+ 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95,
+ 97, 99, 101, 103, 105, 107, 109, 111, 113, 115, 117, 119, 121, 123, 125, 127
+ );
+ r.val[0] = _mm512_permutex2var_epi8(ab0, idx_a, ab1);
+ r.val[1] = _mm512_permutex2var_epi8(ab0, idx_b, ab1);
+#else
+ #ifdef NPY_HAVE_AVX512BW
+ const __m512i idx = npyv_set_u8(
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+ );
+ __m512i abl = _mm512_shuffle_epi8(ab0, idx);
+ __m512i abh = _mm512_shuffle_epi8(ab1, idx);
+ #else
+ const __m256i idx = _mm256_setr_epi8(
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+ );
+ __m256i abl_lo = _mm256_shuffle_epi8(npyv512_lower_si256(ab0), idx);
+ __m256i abl_hi = _mm256_shuffle_epi8(npyv512_higher_si256(ab0), idx);
+ __m256i abh_lo = _mm256_shuffle_epi8(npyv512_lower_si256(ab1), idx);
+ __m256i abh_hi = _mm256_shuffle_epi8(npyv512_higher_si256(ab1), idx);
+ __m512i abl = npyv512_combine_si256(abl_lo, abl_hi);
+ __m512i abh = npyv512_combine_si256(abh_lo, abh_hi);
+ #endif
+ const __m512i idx_a = npyv_set_u64(0, 2, 4, 6, 8, 10, 12, 14);
+ const __m512i idx_b = npyv_set_u64(1, 3, 5, 7, 9, 11, 13, 15);
+ r.val[0] = _mm512_permutex2var_epi64(abl, idx_a, abh);
+ r.val[1] = _mm512_permutex2var_epi64(abl, idx_b, abh);
+#endif
+ return r;
+}
+#define npyv_unzip_s8 npyv_unzip_u8
+
+NPY_FINLINE npyv_u16x2 npyv_unzip_u16(npyv_u16 ab0, npyv_u16 ab1)
+{
+ npyv_u16x2 r;
+#ifdef NPY_HAVE_AVX512BW
+ const __m512i idx_a = npyv_set_u16(
+ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
+ 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
+ );
+ const __m512i idx_b = npyv_set_u16(
+ 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
+ 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63
+ );
+ r.val[0] = _mm512_permutex2var_epi16(ab0, idx_a, ab1);
+ r.val[1] = _mm512_permutex2var_epi16(ab0, idx_b, ab1);
+#else
+ const __m256i idx = _mm256_setr_epi8(
+ 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15,
+ 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15
+ );
+ __m256i abl_lo = _mm256_shuffle_epi8(npyv512_lower_si256(ab0), idx);
+ __m256i abl_hi = _mm256_shuffle_epi8(npyv512_higher_si256(ab0), idx);
+ __m256i abh_lo = _mm256_shuffle_epi8(npyv512_lower_si256(ab1), idx);
+ __m256i abh_hi = _mm256_shuffle_epi8(npyv512_higher_si256(ab1), idx);
+ __m512i abl = npyv512_combine_si256(abl_lo, abl_hi);
+ __m512i abh = npyv512_combine_si256(abh_lo, abh_hi);
+
+ const __m512i idx_a = npyv_set_u64(0, 2, 4, 6, 8, 10, 12, 14);
+ const __m512i idx_b = npyv_set_u64(1, 3, 5, 7, 9, 11, 13, 15);
+ r.val[0] = _mm512_permutex2var_epi64(abl, idx_a, abh);
+ r.val[1] = _mm512_permutex2var_epi64(abl, idx_b, abh);
+#endif
+ return r;
+}
+#define npyv_unzip_s16 npyv_unzip_u16
+
+NPY_FINLINE npyv_u32x2 npyv_unzip_u32(npyv_u32 ab0, npyv_u32 ab1)
+{
+ const __m512i idx_a = npyv_set_u32(
+ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ );
+ const __m512i idx_b = npyv_set_u32(
+ 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+ );
+ npyv_u32x2 r;
+ r.val[0] = _mm512_permutex2var_epi32(ab0, idx_a, ab1);
+ r.val[1] = _mm512_permutex2var_epi32(ab0, idx_b, ab1);
+ return r;
+}
+#define npyv_unzip_s32 npyv_unzip_u32
+
+NPY_FINLINE npyv_u64x2 npyv_unzip_u64(npyv_u64 ab0, npyv_u64 ab1)
+{
+ const __m512i idx_a = npyv_set_u64(0, 2, 4, 6, 8, 10, 12, 14);
+ const __m512i idx_b = npyv_set_u64(1, 3, 5, 7, 9, 11, 13, 15);
+ npyv_u64x2 r;
+ r.val[0] = _mm512_permutex2var_epi64(ab0, idx_a, ab1);
+ r.val[1] = _mm512_permutex2var_epi64(ab0, idx_b, ab1);
+ return r;
+}
+#define npyv_unzip_s64 npyv_unzip_u64
+
+NPY_FINLINE npyv_f32x2 npyv_unzip_f32(npyv_f32 ab0, npyv_f32 ab1)
+{
+ const __m512i idx_a = npyv_set_u32(
+ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ );
+ const __m512i idx_b = npyv_set_u32(
+ 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+ );
+ npyv_f32x2 r;
+ r.val[0] = _mm512_permutex2var_ps(ab0, idx_a, ab1);
+ r.val[1] = _mm512_permutex2var_ps(ab0, idx_b, ab1);
+ return r;
+}
+NPY_FINLINE npyv_f64x2 npyv_unzip_f64(npyv_f64 ab0, npyv_f64 ab1)
+{
+ const __m512i idx_a = npyv_set_u64(0, 2, 4, 6, 8, 10, 12, 14);
+ const __m512i idx_b = npyv_set_u64(1, 3, 5, 7, 9, 11, 13, 15);
+ npyv_f64x2 r;
+ r.val[0] = _mm512_permutex2var_pd(ab0, idx_a, ab1);
+ r.val[1] = _mm512_permutex2var_pd(ab0, idx_b, ab1);
+ return r;
+}
+
// Reverse elements of each 64-bit lane
NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a)
{
@@ -223,4 +357,22 @@ NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a)
return _mm512_shuffle_ps(a, a, (_MM_PERM_ENUM)_MM_SHUFFLE(2, 3, 0, 1));
}
+// Permuting the elements of each 128-bit lane by immediate index for
+// each element.
+#define npyv_permi128_u32(A, E0, E1, E2, E3) \
+ _mm512_shuffle_epi32(A, _MM_SHUFFLE(E3, E2, E1, E0))
+
+#define npyv_permi128_s32 npyv_permi128_u32
+
+#define npyv_permi128_u64(A, E0, E1) \
+ _mm512_shuffle_epi32(A, _MM_SHUFFLE(((E1)<<1)+1, ((E1)<<1), ((E0)<<1)+1, ((E0)<<1)))
+
+#define npyv_permi128_s64 npyv_permi128_u64
+
+#define npyv_permi128_f32(A, E0, E1, E2, E3) \
+ _mm512_permute_ps(A, _MM_SHUFFLE(E3, E2, E1, E0))
+
+#define npyv_permi128_f64(A, E0, E1) \
+ _mm512_permute_pd(A, (((E1)<<7) | ((E0)<<6) | ((E1)<<5) | ((E0)<<4) | ((E1)<<3) | ((E0)<<2) | ((E1)<<1) | (E0)))
+
#endif // _NPY_SIMD_AVX512_REORDER_H
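npyv_unzip is the inverse of npyv_zip: viewing the two inputs as one concatenated sequence, even-indexed elements go to val[0] and odd-indexed ones to val[1]. A scalar sketch of the 32-bit contract (illustrative helper; nlanes is the per-vector lane count, 16 on AVX-512):
#include <stddef.h>
/* Scalar model of npyv_unzip_u32(ab0, ab1):
 * val0[i] = concat(ab0, ab1)[2*i], val1[i] = concat(ab0, ab1)[2*i + 1]. */
static void unzip_u32_ref(const unsigned *ab0, const unsigned *ab1,
                          unsigned *val0, unsigned *val1, size_t nlanes)
{
    for (size_t i = 0; i < nlanes; ++i) {
        size_t e = 2*i, o = 2*i + 1;  /* positions in the concatenation */
        val0[i] = e < nlanes ? ab0[e] : ab1[e - nlanes];
        val1[i] = o < nlanes ? ab0[o] : ab1[o - nlanes];
    }
}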
diff --git a/numpy/core/src/common/simd/emulate_maskop.h b/numpy/core/src/common/simd/emulate_maskop.h
index 2a808a153..6627e5010 100644
--- a/numpy/core/src/common/simd/emulate_maskop.h
+++ b/numpy/core/src/common/simd/emulate_maskop.h
@@ -42,5 +42,39 @@ NPYV_IMPL_EMULATE_MASK_ADDSUB(s64, b64)
#if NPY_SIMD_F64
NPYV_IMPL_EMULATE_MASK_ADDSUB(f64, b64)
#endif
+#if NPY_SIMD_F32
+ // conditional division, m ? a / b : c
+ NPY_FINLINE npyv_f32
+ npyv_ifdiv_f32(npyv_b32 m, npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ {
+ const npyv_f32 one = npyv_setall_f32(1.0f);
+ npyv_f32 div = npyv_div_f32(a, npyv_select_f32(m, b, one));
+ return npyv_select_f32(m, div, c);
+ }
+ // conditional division, m ? a / b : 0
+ NPY_FINLINE npyv_f32
+ npyv_ifdivz_f32(npyv_b32 m, npyv_f32 a, npyv_f32 b)
+ {
+ const npyv_f32 zero = npyv_zero_f32();
+ return npyv_ifdiv_f32(m, a, b, zero);
+ }
+#endif
+#if NPY_SIMD_F64
+ // conditional division, m ? a / b : c
+ NPY_FINLINE npyv_f64
+ npyv_ifdiv_f64(npyv_b64 m, npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ {
+ const npyv_f64 one = npyv_setall_f64(1.0);
+ npyv_f64 div = npyv_div_f64(a, npyv_select_f64(m, b, one));
+ return npyv_select_f64(m, div, c);
+ }
+ // conditional division, m ? a / b : 0
+ NPY_FINLINE npyv_f64
+ npyv_ifdivz_f64(npyv_b64 m, npyv_f64 a, npyv_f64 b)
+ {
+ const npyv_f64 zero = npyv_zero_f64();
+ return npyv_ifdiv_f64(m, a, b, zero);
+ }
+#endif
#endif // _NPY_SIMD_EMULATE_MASKOP_H
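The emulated path cannot mask the hardware division itself, so it first substitutes 1.0 for the divisor in masked-off lanes; those lanes then divide harmlessly and their result is discarded by the final select, which keeps lanes the caller never asked for from raising spurious FP exceptions. A per-lane scalar sketch of that trick (illustrative only):
/* Per-lane model of the emulated npyv_ifdiv_f32 above. */
static float ifdiv_emulated_ref(int m, float a, float b, float c)
{
    float safe_b = m ? b : 1.0f;  /* npyv_select_f32(m, b, one)  */
    float q = a / safe_b;         /* unconditional npyv_div_f32  */
    return m ? q : c;             /* npyv_select_f32(m, div, c)  */
}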
diff --git a/numpy/core/src/common/simd/neon/arithmetic.h b/numpy/core/src/common/simd/neon/arithmetic.h
index 00994806d..683620111 100644
--- a/numpy/core/src/common/simd/neon/arithmetic.h
+++ b/numpy/core/src/common/simd/neon/arithmetic.h
@@ -265,6 +265,14 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
NPY_FINLINE npyv_f32 npyv_nmulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
{ return vmlsq_f32(vnegq_f32(c), a, b); }
#endif
+// multiply, add for odd elements and subtract even elements.
+// (a * b) -+ c
+NPY_FINLINE npyv_f32 npyv_muladdsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+{
+ const npyv_f32 msign = npyv_set_f32(-0.0f, 0.0f, -0.0f, 0.0f);
+ return npyv_muladd_f32(a, b, npyv_xor_f32(msign, c));
+}
+
/***************************
* FUSED F64
***************************/
@@ -277,6 +285,11 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
{ return vfmsq_f64(c, a, b); }
NPY_FINLINE npyv_f64 npyv_nmulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
{ return vfmsq_f64(vnegq_f64(c), a, b); }
+ NPY_FINLINE npyv_f64 npyv_muladdsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ {
+ const npyv_f64 msign = npyv_set_f64(-0.0, 0.0);
+ return npyv_muladd_f64(a, b, npyv_xor_f64(msign, c));
+ }
#endif // NPY_SIMD_F64
/***************************
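Without a native fmaddsub, the NEON path gets the alternating behaviour by flipping the sign bit of c on even lanes (XOR with -0.0) and then issuing a plain fused multiply-add. A per-lane scalar sketch of that trick (helper name is illustrative):
#include <math.h>
/* Per-lane model of the msign trick: a*b + (c with its sign flipped on
 * even lanes) equals a*b - c there and a*b + c on odd lanes. */
static float muladdsub_lane_ref(float a, float b, float c, int lane)
{
    float c_adj = (lane & 1) ? c : -c;  /* msign = {-0.0f, 0.0f, -0.0f, 0.0f} */
    return fmaf(a, b, c_adj);           /* npyv_muladd_f32                    */
}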
diff --git a/numpy/core/src/common/simd/neon/math.h b/numpy/core/src/common/simd/neon/math.h
index c0a771b5d..58d14809f 100644
--- a/numpy/core/src/common/simd/neon/math.h
+++ b/numpy/core/src/common/simd/neon/math.h
@@ -278,20 +278,25 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a)
return vrndnq_f32(a);
#else
// ARMv7 NEON only supports fp to int truncate conversion.
- // a magic trick of adding 1.5 * 2**23 is used for rounding
+ // a magic trick of adding 1.5 * 2^23 is used for rounding
// to nearest even and then subtract this magic number to get
// the integer.
- const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f));
- const npyv_f32 magic = vdupq_n_f32(12582912.0f); // 1.5 * 2**23
- npyv_f32 round = vsubq_f32(vaddq_f32(a, magic), magic);
- npyv_b32 overflow = vcleq_f32(vabsq_f32(a), vreinterpretq_f32_u32(vdupq_n_u32(0x4b000000)));
- round = vbslq_f32(overflow, round, a);
- // signed zero
- round = vreinterpretq_f32_s32(vorrq_s32(
- vreinterpretq_s32_f32(round),
- vandq_s32(vreinterpretq_s32_f32(a), szero)
- ));
- return round;
+ //
+ const npyv_u32 szero = vreinterpretq_u32_f32(vdupq_n_f32(-0.0f));
+ const npyv_u32 sign_mask = vandq_u32(vreinterpretq_u32_f32(a), szero);
+ const npyv_f32 two_power_23 = vdupq_n_f32(8388608.0); // 2^23
+ const npyv_f32 two_power_23h = vdupq_n_f32(12582912.0f); // 1.5 * 2^23
+ npyv_u32 nnan_mask = vceqq_f32(a, a);
+ // eliminate nans to avoid invalid fp errors
+ npyv_f32 abs_x = vabsq_f32(vreinterpretq_f32_u32(vandq_u32(nnan_mask, vreinterpretq_u32_f32(a))));
+ // round by add magic number 1.5 * 2^23
+ npyv_f32 round = vsubq_f32(vaddq_f32(two_power_23h, abs_x), two_power_23h);
+ // copysign
+ round = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(round), sign_mask ));
+ // a if |a| >= 2^23 or a == NaN
+ npyv_u32 mask = vcleq_f32(abs_x, two_power_23);
+ mask = vandq_u32(mask, nnan_mask);
+ return vbslq_f32(mask, round, a);
#endif
}
#if NPY_SIMD_F64
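The reworked ARMv7 rint relies on the classic magic-number trick: for finite |a| < 2^23, adding and then subtracting 1.5 * 2^23 leaves |a| rounded to nearest-even, after which the sign is OR-ed back in; NaNs and large values fall through to a via the final mask. A scalar sketch of the in-range path (illustrative; assumes round-to-nearest mode and no reassociation by the compiler):
#include <math.h>
/* In-range path of the magic-number rounding used above (|x| < 2^23). */
static float rint_magic_ref(float x)
{
    const float magic = 12582912.0f;        /* 1.5 * 2^23                   */
    float r = (fabsf(x) + magic) - magic;   /* |x| rounded to nearest even  */
    return copysignf(r, x);                 /* restore sign, preserves -0.0 */
}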
@@ -302,33 +307,30 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a)
#ifdef NPY_HAVE_ASIMD
#define npyv_ceil_f32 vrndpq_f32
#else
- NPY_FINLINE npyv_f32 npyv_ceil_f32(npyv_f32 a)
- {
- const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f));
+ NPY_FINLINE npyv_f32 npyv_ceil_f32(npyv_f32 a)
+ {
const npyv_u32 one = vreinterpretq_u32_f32(vdupq_n_f32(1.0f));
- const npyv_s32 max_int = vdupq_n_s32(0x7fffffff);
- /**
- * On armv7, vcvtq.f32 handles special cases as follows:
- * NaN return 0
- * +inf or +outrange return 0x80000000(-0.0f)
- * -inf or -outrange return 0x7fffffff(nan)
- */
- npyv_s32 roundi = vcvtq_s32_f32(a);
- npyv_f32 round = vcvtq_f32_s32(roundi);
+ const npyv_u32 szero = vreinterpretq_u32_f32(vdupq_n_f32(-0.0f));
+ const npyv_u32 sign_mask = vandq_u32(vreinterpretq_u32_f32(a), szero);
+ const npyv_f32 two_power_23 = vdupq_n_f32(8388608.0); // 2^23
+ const npyv_f32 two_power_23h = vdupq_n_f32(12582912.0f); // 1.5 * 2^23
+ npyv_u32 nnan_mask = vceqq_f32(a, a);
+ npyv_f32 x = vreinterpretq_f32_u32(vandq_u32(nnan_mask, vreinterpretq_u32_f32(a)));
+ // eliminate nans to avoid invalid fp errors
+ npyv_f32 abs_x = vabsq_f32(x);
+ // round by add magic number 1.5 * 2^23
+ npyv_f32 round = vsubq_f32(vaddq_f32(two_power_23h, abs_x), two_power_23h);
+ // copysign
+ round = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(round), sign_mask));
npyv_f32 ceil = vaddq_f32(round, vreinterpretq_f32_u32(
- vandq_u32(vcltq_f32(round, a), one))
- );
- // respect signed zero, e.g. -0.5 -> -0.0
- npyv_f32 rzero = vreinterpretq_f32_s32(vorrq_s32(
- vreinterpretq_s32_f32(ceil),
- vandq_s32(vreinterpretq_s32_f32(a), szero)
- ));
- // if nan or overflow return a
- npyv_u32 nnan = npyv_notnan_f32(a);
- npyv_u32 overflow = vorrq_u32(
- vceqq_s32(roundi, szero), vceqq_s32(roundi, max_int)
+ vandq_u32(vcltq_f32(round, x), one))
);
- return vbslq_f32(vbicq_u32(nnan, overflow), rzero, a);
+ // respects signed zero
+ ceil = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(ceil), sign_mask));
+ // a if |a| >= 2^23 or a == NaN
+ npyv_u32 mask = vcleq_f32(abs_x, two_power_23);
+ mask = vandq_u32(mask, nnan_mask);
+ return vbslq_f32(mask, ceil, a);
}
#endif
#if NPY_SIMD_F64
@@ -339,29 +341,37 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a)
#ifdef NPY_HAVE_ASIMD
#define npyv_trunc_f32 vrndq_f32
#else
- NPY_FINLINE npyv_f32 npyv_trunc_f32(npyv_f32 a)
- {
- const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f));
+ NPY_FINLINE npyv_f32 npyv_trunc_f32(npyv_f32 a)
+ {
const npyv_s32 max_int = vdupq_n_s32(0x7fffffff);
+ const npyv_u32 exp_mask = vdupq_n_u32(0xff000000);
+ const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f));
+ const npyv_u32 sign_mask = vandq_u32(
+ vreinterpretq_u32_f32(a), vreinterpretq_u32_s32(szero));
+
+ npyv_u32 nfinite_mask = vshlq_n_u32(vreinterpretq_u32_f32(a), 1);
+ nfinite_mask = vandq_u32(nfinite_mask, exp_mask);
+ nfinite_mask = vceqq_u32(nfinite_mask, exp_mask);
+ // eliminate nans/inf to avoid invalid fp errors
+ npyv_f32 x = vreinterpretq_f32_u32(
+ veorq_u32(nfinite_mask, vreinterpretq_u32_f32(a)));
/**
* On armv7, vcvtq.f32 handles special cases as follows:
* NaN return 0
* +inf or +outrange return 0x80000000(-0.0f)
* -inf or -outrange return 0x7fffffff(nan)
*/
- npyv_s32 roundi = vcvtq_s32_f32(a);
- npyv_f32 round = vcvtq_f32_s32(roundi);
+ npyv_s32 trunci = vcvtq_s32_f32(x);
+ npyv_f32 trunc = vcvtq_f32_s32(trunci);
// respect signed zero, e.g. -0.5 -> -0.0
- npyv_f32 rzero = vreinterpretq_f32_s32(vorrq_s32(
- vreinterpretq_s32_f32(round),
- vandq_s32(vreinterpretq_s32_f32(a), szero)
- ));
- // if nan or overflow return a
- npyv_u32 nnan = npyv_notnan_f32(a);
- npyv_u32 overflow = vorrq_u32(
- vceqq_s32(roundi, szero), vceqq_s32(roundi, max_int)
+ trunc = vreinterpretq_f32_u32(
+ vorrq_u32(vreinterpretq_u32_f32(trunc), sign_mask));
+ // if overflow return a
+ npyv_u32 overflow_mask = vorrq_u32(
+ vceqq_s32(trunci, szero), vceqq_s32(trunci, max_int)
);
- return vbslq_f32(vbicq_u32(nnan, overflow), rzero, a);
+        // return a if it overflows or is non-finite
+ return vbslq_f32(vorrq_u32(nfinite_mask, overflow_mask), a, trunc);
}
#endif
#if NPY_SIMD_F64
@@ -372,28 +382,31 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a)
#ifdef NPY_HAVE_ASIMD
#define npyv_floor_f32 vrndmq_f32
#else
- NPY_FINLINE npyv_f32 npyv_floor_f32(npyv_f32 a)
- {
- const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f));
+ NPY_FINLINE npyv_f32 npyv_floor_f32(npyv_f32 a)
+ {
const npyv_u32 one = vreinterpretq_u32_f32(vdupq_n_f32(1.0f));
- const npyv_s32 max_int = vdupq_n_s32(0x7fffffff);
+ const npyv_u32 szero = vreinterpretq_u32_f32(vdupq_n_f32(-0.0f));
+ const npyv_u32 sign_mask = vandq_u32(vreinterpretq_u32_f32(a), szero);
+ const npyv_f32 two_power_23 = vdupq_n_f32(8388608.0); // 2^23
+ const npyv_f32 two_power_23h = vdupq_n_f32(12582912.0f); // 1.5 * 2^23
- npyv_s32 roundi = vcvtq_s32_f32(a);
- npyv_f32 round = vcvtq_f32_s32(roundi);
+ npyv_u32 nnan_mask = vceqq_f32(a, a);
+ npyv_f32 x = vreinterpretq_f32_u32(vandq_u32(nnan_mask, vreinterpretq_u32_f32(a)));
+ // eliminate nans to avoid invalid fp errors
+ npyv_f32 abs_x = vabsq_f32(x);
+ // round by add magic number 1.5 * 2^23
+ npyv_f32 round = vsubq_f32(vaddq_f32(two_power_23h, abs_x), two_power_23h);
+ // copysign
+ round = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(round), sign_mask));
npyv_f32 floor = vsubq_f32(round, vreinterpretq_f32_u32(
- vandq_u32(vcgtq_f32(round, a), one)
- ));
- // respect signed zero
- npyv_f32 rzero = vreinterpretq_f32_s32(vorrq_s32(
- vreinterpretq_s32_f32(floor),
- vandq_s32(vreinterpretq_s32_f32(a), szero)
+ vandq_u32(vcgtq_f32(round, x), one)
));
- npyv_u32 nnan = npyv_notnan_f32(a);
- npyv_u32 overflow = vorrq_u32(
- vceqq_s32(roundi, szero), vceqq_s32(roundi, max_int)
- );
-
- return vbslq_f32(vbicq_u32(nnan, overflow), rzero, a);
+ // respects signed zero
+ floor = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(floor), sign_mask));
+ // a if |a| >= 2^23 or a == NaN
+ npyv_u32 mask = vcleq_f32(abs_x, two_power_23);
+ mask = vandq_u32(mask, nnan_mask);
+ return vbslq_f32(mask, floor, a);
}
#endif // NPY_HAVE_ASIMD
#if NPY_SIMD_F64
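
The ARMv7 fallbacks above all lean on the magic-number trick described in the rint comment: adding 1.5 * 2^23 to |a| forces rounding to nearest even at integer granularity, and subtracting it back yields the rounded value. A standalone scalar sketch, assuming the default round-to-nearest mode and no -ffast-math (an illustration only, not part of the patch):

#include <math.h>
#include <stdio.h>

// scalar model of the 1.5 * 2^23 rounding trick (round to nearest even)
static float rint_magic_f32(float a)
{
    const float magic = 12582912.0f;    // 1.5 * 2^23
    float abs_a = fabsf(a);
    if (!(abs_a < 8388608.0f)) {        // |a| >= 2^23, inf or NaN: already integral
        return a;
    }
    float r = (magic + abs_a) - magic;  // rounds at integer granularity
    return copysignf(r, a);             // restore the sign, keeps -0.0f
}

int main(void)
{
    printf("%g %g %g %g\n",
           rint_magic_f32(2.5f),        // 2  (ties to even)
           rint_magic_f32(-0.5f),       // -0
           rint_magic_f32(3.7f),        // 4
           rint_magic_f32(1e30f));      // 1e+30 (out of range, returned as-is)
    return 0;
}
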
diff --git a/numpy/core/src/common/simd/neon/memory.h b/numpy/core/src/common/simd/neon/memory.h
index 7060ea628..6163440c3 100644
--- a/numpy/core/src/common/simd/neon/memory.h
+++ b/numpy/core/src/common/simd/neon/memory.h
@@ -102,6 +102,29 @@ NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride)
);
}
#endif
+
+//// 64-bit load over 32-bit stride
+NPY_FINLINE npyv_u32 npyv_loadn2_u32(const npy_uint32 *ptr, npy_intp stride)
+{
+ return vcombine_u32(
+ vld1_u32((const uint32_t*)ptr), vld1_u32((const uint32_t*)ptr + stride)
+ );
+}
+NPY_FINLINE npyv_s32 npyv_loadn2_s32(const npy_int32 *ptr, npy_intp stride)
+{ return npyv_reinterpret_s32_u32(npyv_loadn2_u32((const npy_uint32*)ptr, stride)); }
+NPY_FINLINE npyv_f32 npyv_loadn2_f32(const float *ptr, npy_intp stride)
+{ return npyv_reinterpret_f32_u32(npyv_loadn2_u32((const npy_uint32*)ptr, stride)); }
+
+//// 128-bit load over 64-bit stride
+NPY_FINLINE npyv_u64 npyv_loadn2_u64(const npy_uint64 *ptr, npy_intp stride)
+{ (void)stride; return npyv_load_u64(ptr); }
+NPY_FINLINE npyv_s64 npyv_loadn2_s64(const npy_int64 *ptr, npy_intp stride)
+{ (void)stride; return npyv_load_s64(ptr); }
+#if NPY_SIMD_F64
+NPY_FINLINE npyv_f64 npyv_loadn2_f64(const double *ptr, npy_intp stride)
+{ (void)stride; return npyv_load_f64(ptr); }
+#endif
+
/***************************
* Non-contiguous Store
***************************/
@@ -131,6 +154,32 @@ NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a)
{ npyv_storen_s64((npy_int64*)ptr, stride, npyv_reinterpret_s64_f64(a)); }
#endif
+//// 64-bit store over 32-bit stride
+NPY_FINLINE void npyv_storen2_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a)
+{
+#if NPY_SIMD_F64
+ vst1q_lane_u64((uint64_t*)ptr, npyv_reinterpret_u64_u32(a), 0);
+ vst1q_lane_u64((uint64_t*)(ptr + stride), npyv_reinterpret_u64_u32(a), 1);
+#else
+    // armhf is strict about alignment
+ vst1_u32((uint32_t*)ptr, vget_low_u32(a));
+ vst1_u32((uint32_t*)ptr + stride, vget_high_u32(a));
+#endif
+}
+NPY_FINLINE void npyv_storen2_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a)
+{ npyv_storen2_u32((npy_uint32*)ptr, stride, npyv_reinterpret_u32_s32(a)); }
+NPY_FINLINE void npyv_storen2_f32(float *ptr, npy_intp stride, npyv_f32 a)
+{ npyv_storen2_u32((npy_uint32*)ptr, stride, npyv_reinterpret_u32_f32(a)); }
+
+//// 128-bit store over 64-bit stride
+NPY_FINLINE void npyv_storen2_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
+{ (void)stride; npyv_store_u64(ptr, a); }
+NPY_FINLINE void npyv_storen2_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
+{ (void)stride; npyv_store_s64(ptr, a); }
+#if NPY_SIMD_F64
+NPY_FINLINE void npyv_storen2_f64(double *ptr, npy_intp stride, npyv_f64 a)
+{ (void)stride; npyv_store_f64(ptr, a); }
+#endif
/*********************************
* Partial Load
*********************************/
@@ -168,6 +217,29 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n
NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
{ return npyv_load_till_s64(ptr, nlane, 0); }
+//// 64-bit nlane
+NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane,
+ npy_int32 fill_lo, npy_int32 fill_hi)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ const int32_t NPY_DECL_ALIGNED(16) fill[2] = {fill_lo, fill_hi};
+ return vcombine_s32(vld1_s32((const int32_t*)ptr), vld1_s32(fill));
+ }
+ return npyv_load_s32(ptr);
+}
+// fill the remaining lanes with zero
+NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
+{ return vreinterpretq_s32_s64(npyv_load_tillz_s64((const npy_int64*)ptr, nlane)); }
+
+//// 128-bit nlane
+NPY_FINLINE npyv_s64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane,
+ npy_int64 fill_lo, npy_int64 fill_hi)
+{ (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); }
+
+NPY_FINLINE npyv_s64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
+{ (void)nlane; return npyv_load_s64(ptr); }
+
/*********************************
* Non-contiguous partial load
*********************************/
@@ -206,6 +278,34 @@ npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_
NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
{ return npyv_loadn_till_s64(ptr, stride, nlane, 0); }
+//// 64-bit load over 32-bit stride
+NPY_FINLINE npyv_s32 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane,
+ npy_int32 fill_lo, npy_int32 fill_hi)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ const int32_t NPY_DECL_ALIGNED(16) fill[2] = {fill_lo, fill_hi};
+ return vcombine_s32(vld1_s32((const int32_t*)ptr), vld1_s32(fill));
+ }
+ return npyv_loadn2_s32(ptr, stride);
+}
+NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ return vcombine_s32(vld1_s32((const int32_t*)ptr), vdup_n_s32(0));
+ }
+ return npyv_loadn2_s32(ptr, stride);
+}
+
+//// 128-bit load over 64-bit stride
+NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane,
+ npy_int64 fill_lo, npy_int64 fill_hi)
+{ assert(nlane > 0); (void)stride; (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); }
+
+NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
+{ assert(nlane > 0); (void)stride; (void)nlane; return npyv_load_s64(ptr); }
+
/*********************************
* Partial store
*********************************/
@@ -238,6 +338,30 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a
}
npyv_store_s64(ptr, a);
}
+
+//// 64-bit nlane
+NPY_FINLINE void npyv_store2_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+        // armhf is strict about alignment, a misaligned store may cause a bus error
+ #if NPY_SIMD_F64
+ vst1q_lane_s64((int64_t*)ptr, npyv_reinterpret_s64_s32(a), 0);
+ #else
+ npyv_storel_s32(ptr, a);
+ #endif
+ return;
+ }
+ npyv_store_s32(ptr, a);
+}
+
+//// 128-bit nlane
+NPY_FINLINE void npyv_store2_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0); (void)nlane;
+ npyv_store_s64(ptr, a);
+}
+
/*********************************
* Non-contiguous partial store
*********************************/
@@ -245,16 +369,21 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a
NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
{
assert(nlane > 0);
+ vst1q_lane_s32((int32_t*)ptr, a, 0);
switch(nlane) {
- default:
- vst1q_lane_s32((int32_t*)ptr + stride*3, a, 3);
+ case 1:
+ return;
+ case 2:
+ vst1q_lane_s32((int32_t*)ptr + stride, a, 1);
+ return;
case 3:
+ vst1q_lane_s32((int32_t*)ptr + stride, a, 1);
vst1q_lane_s32((int32_t*)ptr + stride*2, a, 2);
- case 2:
+ return;
+ default:
vst1q_lane_s32((int32_t*)ptr + stride, a, 1);
- case 1:
- vst1q_lane_s32((int32_t*)ptr, a, 0);
- break;
+ vst1q_lane_s32((int32_t*)ptr + stride*2, a, 2);
+ vst1q_lane_s32((int32_t*)ptr + stride*3, a, 3);
}
}
//// 64
@@ -268,6 +397,27 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
npyv_storen_s64(ptr, stride, a);
}
+//// 64-bit store over 32-bit stride
+NPY_FINLINE void npyv_storen2_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+#if NPY_SIMD_F64
+ vst1q_lane_s64((int64_t*)ptr, npyv_reinterpret_s64_s32(a), 0);
+ if (nlane > 1) {
+ vst1q_lane_s64((int64_t*)(ptr + stride), npyv_reinterpret_s64_s32(a), 1);
+ }
+#else
+ npyv_storel_s32(ptr, a);
+ if (nlane > 1) {
+ npyv_storeh_s32(ptr + stride, a);
+ }
+#endif
+}
+
+//// 128-bit store over 64-bit stride
+NPY_FINLINE void npyv_storen2_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a)
+{ assert(nlane > 0); (void)stride; (void)nlane; npyv_store_s64(ptr, a); }
+
/*****************************************************************
* Implement partial load/store for u32/f32/u64/f64... via casting
*****************************************************************/
@@ -278,7 +428,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
union { \
npyv_lanetype_##F_SFX from_##F_SFX; \
npyv_lanetype_##T_SFX to_##T_SFX; \
- } pun = {.from_##F_SFX = fill}; \
+ } pun; \
+ pun.from_##F_SFX = fill; \
return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX( \
(const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX \
)); \
@@ -290,7 +441,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
union { \
npyv_lanetype_##F_SFX from_##F_SFX; \
npyv_lanetype_##T_SFX to_##T_SFX; \
- } pun = {.from_##F_SFX = fill}; \
+ } pun; \
+ pun.from_##F_SFX = fill; \
return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX( \
(const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX \
)); \
@@ -332,6 +484,131 @@ NPYV_IMPL_NEON_REST_PARTIAL_TYPES(u64, s64)
#if NPY_SIMD_F64
NPYV_IMPL_NEON_REST_PARTIAL_TYPES(f64, s64)
#endif
+
+// 128-bit/64-bit stride
+#define NPYV_IMPL_NEON_REST_PARTIAL_TYPES_PAIR(F_SFX, T_SFX) \
+ NPY_FINLINE npyv_##F_SFX npyv_load2_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, \
+ npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \
+ { \
+ union pun { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ }; \
+ union pun pun_lo; \
+ union pun pun_hi; \
+ pun_lo.from_##F_SFX = fill_lo; \
+ pun_hi.from_##F_SFX = fill_hi; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane, pun_lo.to_##T_SFX, pun_hi.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn2_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \
+ npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \
+ { \
+ union pun { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ }; \
+ union pun pun_lo; \
+ union pun pun_hi; \
+ pun_lo.from_##F_SFX = fill_lo; \
+ pun_hi.from_##F_SFX = fill_hi; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun_lo.to_##T_SFX, \
+ pun_hi.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_load2_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn2_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \
+ )); \
+ } \
+ NPY_FINLINE void npyv_store2_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_store2_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ } \
+ NPY_FINLINE void npyv_storen2_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_storen2_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ }
+
+NPYV_IMPL_NEON_REST_PARTIAL_TYPES_PAIR(u32, s32)
+NPYV_IMPL_NEON_REST_PARTIAL_TYPES_PAIR(f32, s32)
+NPYV_IMPL_NEON_REST_PARTIAL_TYPES_PAIR(u64, s64)
+#if NPY_SIMD_F64
+NPYV_IMPL_NEON_REST_PARTIAL_TYPES_PAIR(f64, s64)
+#endif
+
+/************************************************************
+ * de-interleave load / interleave contiguous store
+ ************************************************************/
+// two channels
+#define NPYV_IMPL_NEON_MEM_INTERLEAVE(SFX, T_PTR) \
+ NPY_FINLINE npyv_##SFX##x2 npyv_load_##SFX##x2( \
+ const npyv_lanetype_##SFX *ptr \
+ ) { \
+ return vld2q_##SFX((const T_PTR*)ptr); \
+ } \
+ NPY_FINLINE void npyv_store_##SFX##x2( \
+ npyv_lanetype_##SFX *ptr, npyv_##SFX##x2 v \
+ ) { \
+ vst2q_##SFX((T_PTR*)ptr, v); \
+ }
+
+NPYV_IMPL_NEON_MEM_INTERLEAVE(u8, uint8_t)
+NPYV_IMPL_NEON_MEM_INTERLEAVE(s8, int8_t)
+NPYV_IMPL_NEON_MEM_INTERLEAVE(u16, uint16_t)
+NPYV_IMPL_NEON_MEM_INTERLEAVE(s16, int16_t)
+NPYV_IMPL_NEON_MEM_INTERLEAVE(u32, uint32_t)
+NPYV_IMPL_NEON_MEM_INTERLEAVE(s32, int32_t)
+NPYV_IMPL_NEON_MEM_INTERLEAVE(f32, float)
+
+#if NPY_SIMD_F64
+ NPYV_IMPL_NEON_MEM_INTERLEAVE(f64, double)
+ NPYV_IMPL_NEON_MEM_INTERLEAVE(u64, uint64_t)
+ NPYV_IMPL_NEON_MEM_INTERLEAVE(s64, int64_t)
+#else
+ #define NPYV_IMPL_NEON_MEM_INTERLEAVE_64(SFX) \
+ NPY_FINLINE npyv_##SFX##x2 npyv_load_##SFX##x2( \
+ const npyv_lanetype_##SFX *ptr) \
+ { \
+ npyv_##SFX a = npyv_load_##SFX(ptr); \
+ npyv_##SFX b = npyv_load_##SFX(ptr + 2); \
+ npyv_##SFX##x2 r; \
+ r.val[0] = vcombine_##SFX(vget_low_##SFX(a), vget_low_##SFX(b)); \
+ r.val[1] = vcombine_##SFX(vget_high_##SFX(a), vget_high_##SFX(b)); \
+ return r; \
+ } \
+ NPY_FINLINE void npyv_store_##SFX##x2( \
+ npyv_lanetype_##SFX *ptr, npyv_##SFX##x2 v) \
+ { \
+ npyv_store_##SFX(ptr, vcombine_##SFX( \
+ vget_low_##SFX(v.val[0]), vget_low_##SFX(v.val[1]))); \
+ npyv_store_##SFX(ptr + 2, vcombine_##SFX( \
+ vget_high_##SFX(v.val[0]), vget_high_##SFX(v.val[1]))); \
+ }
+ NPYV_IMPL_NEON_MEM_INTERLEAVE_64(u64)
+ NPYV_IMPL_NEON_MEM_INTERLEAVE_64(s64)
+#endif
/*********************************
* Lookup table
*********************************/
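
The new two-channel npyv_load_*x2 / npyv_store_*x2 helpers de-interleave on load and re-interleave on store. The scalar sketch below pictures that data movement; the re/im pair layout is only an assumed example of pairwise data, not something the patch mandates:

#include <stdio.h>

int main(void)
{
    // interleaved source, e.g. pairs such as (re0, im0, re1, im1, ...)
    float src[8] = {1, 10, 2, 20, 3, 30, 4, 40};
    float ch0[4], ch1[4], dst[8];

    // load ..x2: de-interleave into two channels
    for (int i = 0; i < 4; ++i) {
        ch0[i] = src[2*i + 0];   // 1 2 3 4
        ch1[i] = src[2*i + 1];   // 10 20 30 40
    }
    // store ..x2: interleave the two channels back
    for (int i = 0; i < 4; ++i) {
        dst[2*i + 0] = ch0[i];
        dst[2*i + 1] = ch1[i];
    }
    for (int i = 0; i < 8; ++i)
        printf("%g ", dst[i]);   // prints the original interleaved sequence
    printf("\n");
    return 0;
}
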
diff --git a/numpy/core/src/common/simd/neon/neon.h b/numpy/core/src/common/simd/neon/neon.h
index b08071527..49c35c415 100644
--- a/numpy/core/src/common/simd/neon/neon.h
+++ b/numpy/core/src/common/simd/neon/neon.h
@@ -16,6 +16,7 @@
#define NPY_SIMD_FMA3 0 // HW emulated
#endif
#define NPY_SIMD_BIGENDIAN 0
+#define NPY_SIMD_CMPSIGNAL 1
typedef uint8x16_t npyv_u8;
typedef int8x16_t npyv_s8;
diff --git a/numpy/core/src/common/simd/neon/operators.h b/numpy/core/src/common/simd/neon/operators.h
index 249621bd6..e18ea94b8 100644
--- a/numpy/core/src/common/simd/neon/operators.h
+++ b/numpy/core/src/common/simd/neon/operators.h
@@ -240,10 +240,35 @@
// check special cases
NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a)
-{ return vceqq_f32(a, a); }
+{
+#if defined(__clang__)
+/**
+ * Workaround for a clang bug with symmetric inputs which makes this
+ * self-comparison signal on quiet NaNs.
+ * See https://github.com/numpy/numpy/issues/22933 for more clarification.
+ */
+ npyv_b32 ret;
+ #if NPY_SIMD_F64
+ __asm("fcmeq %0.4s, %1.4s, %1.4s" : "=w" (ret) : "w" (a));
+ #else
+ __asm("vceq.f32 %q0, %q1, %q1" : "=w" (ret) : "w" (a));
+ #endif
+ return ret;
+#else
+ return vceqq_f32(a, a);
+#endif
+}
#if NPY_SIMD_F64
NPY_FINLINE npyv_b64 npyv_notnan_f64(npyv_f64 a)
- { return vceqq_f64(a, a); }
+ {
+ #if defined(__clang__)
+ npyv_b64 ret;
+ __asm("fcmeq %0.2d, %1.2d, %1.2d" : "=w" (ret) : "w" (a));
+ return ret;
+ #else
+ return vceqq_f64(a, a);
+ #endif
+ }
#endif
// Test cross all vector lanes
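
npyv_notnan_* keeps the usual self-comparison semantics; the inline assembly above only pins the instruction clang emits. Per lane the result is an all-ones mask except for NaN, as in this scalar model (an illustration only, not part of the patch):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

// per-lane model: all-ones when the lane is not NaN, all-zeros otherwise
static uint32_t notnan_lane_f32(float x)
{
    return (x == x) ? 0xffffffffu : 0u;   // x == x is false only for NaN
}

int main(void)
{
    float lanes[4] = {1.0f, NAN, INFINITY, -0.0f};
    for (int i = 0; i < 4; ++i)
        printf("0x%08x ", notnan_lane_f32(lanes[i]));
    printf("\n");   // 0xffffffff 0x00000000 0xffffffff 0xffffffff
    return 0;
}
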
diff --git a/numpy/core/src/common/simd/neon/reorder.h b/numpy/core/src/common/simd/neon/reorder.h
index 50b06ed11..8bf68f5be 100644
--- a/numpy/core/src/common/simd/neon/reorder.h
+++ b/numpy/core/src/common/simd/neon/reorder.h
@@ -76,36 +76,45 @@ NPYV_IMPL_NEON_COMBINE(npyv_f32, f32)
NPYV_IMPL_NEON_COMBINE(npyv_f64, f64)
#endif
-// interleave two vectors
-#define NPYV_IMPL_NEON_ZIP(T_VEC, SFX) \
- NPY_FINLINE T_VEC##x2 npyv_zip_##SFX(T_VEC a, T_VEC b) \
- { \
- T_VEC##x2 r; \
- r.val[0] = vzip1q_##SFX(a, b); \
- r.val[1] = vzip2q_##SFX(a, b); \
- return r; \
- }
-
+// interleave & deinterleave two vectors
#ifdef __aarch64__
- NPYV_IMPL_NEON_ZIP(npyv_u8, u8)
- NPYV_IMPL_NEON_ZIP(npyv_s8, s8)
- NPYV_IMPL_NEON_ZIP(npyv_u16, u16)
- NPYV_IMPL_NEON_ZIP(npyv_s16, s16)
- NPYV_IMPL_NEON_ZIP(npyv_u32, u32)
- NPYV_IMPL_NEON_ZIP(npyv_s32, s32)
- NPYV_IMPL_NEON_ZIP(npyv_f32, f32)
- NPYV_IMPL_NEON_ZIP(npyv_f64, f64)
+ #define NPYV_IMPL_NEON_ZIP(T_VEC, SFX) \
+ NPY_FINLINE T_VEC##x2 npyv_zip_##SFX(T_VEC a, T_VEC b) \
+ { \
+ T_VEC##x2 r; \
+ r.val[0] = vzip1q_##SFX(a, b); \
+ r.val[1] = vzip2q_##SFX(a, b); \
+ return r; \
+ } \
+ NPY_FINLINE T_VEC##x2 npyv_unzip_##SFX(T_VEC a, T_VEC b) \
+ { \
+ T_VEC##x2 r; \
+ r.val[0] = vuzp1q_##SFX(a, b); \
+ r.val[1] = vuzp2q_##SFX(a, b); \
+ return r; \
+ }
#else
- #define npyv_zip_u8 vzipq_u8
- #define npyv_zip_s8 vzipq_s8
- #define npyv_zip_u16 vzipq_u16
- #define npyv_zip_s16 vzipq_s16
- #define npyv_zip_u32 vzipq_u32
- #define npyv_zip_s32 vzipq_s32
- #define npyv_zip_f32 vzipq_f32
+ #define NPYV_IMPL_NEON_ZIP(T_VEC, SFX) \
+ NPY_FINLINE T_VEC##x2 npyv_zip_##SFX(T_VEC a, T_VEC b) \
+ { return vzipq_##SFX(a, b); } \
+ NPY_FINLINE T_VEC##x2 npyv_unzip_##SFX(T_VEC a, T_VEC b) \
+ { return vuzpq_##SFX(a, b); }
#endif
+
+NPYV_IMPL_NEON_ZIP(npyv_u8, u8)
+NPYV_IMPL_NEON_ZIP(npyv_s8, s8)
+NPYV_IMPL_NEON_ZIP(npyv_u16, u16)
+NPYV_IMPL_NEON_ZIP(npyv_s16, s16)
+NPYV_IMPL_NEON_ZIP(npyv_u32, u32)
+NPYV_IMPL_NEON_ZIP(npyv_s32, s32)
+NPYV_IMPL_NEON_ZIP(npyv_f32, f32)
+
#define npyv_zip_u64 npyv_combine_u64
#define npyv_zip_s64 npyv_combine_s64
+#define npyv_zip_f64 npyv_combine_f64
+#define npyv_unzip_u64 npyv_combine_u64
+#define npyv_unzip_s64 npyv_combine_s64
+#define npyv_unzip_f64 npyv_combine_f64
// Reverse elements of each 64-bit lane
#define npyv_rev64_u8 vrev64q_u8
@@ -116,4 +125,65 @@ NPYV_IMPL_NEON_COMBINE(npyv_f64, f64)
#define npyv_rev64_s32 vrev64q_s32
#define npyv_rev64_f32 vrev64q_f32
+// Permuting the elements of each 128-bit lane by immediate index for
+// each element.
+#ifdef __clang__
+ #define npyv_permi128_u32(A, E0, E1, E2, E3) \
+ __builtin_shufflevector(A, A, E0, E1, E2, E3)
+#elif defined(__GNUC__)
+ #define npyv_permi128_u32(A, E0, E1, E2, E3) \
+ __builtin_shuffle(A, npyv_set_u32(E0, E1, E2, E3))
+#else
+ #define npyv_permi128_u32(A, E0, E1, E2, E3) \
+ npyv_set_u32( \
+ vgetq_lane_u32(A, E0), vgetq_lane_u32(A, E1), \
+ vgetq_lane_u32(A, E2), vgetq_lane_u32(A, E3) \
+ )
+ #define npyv_permi128_s32(A, E0, E1, E2, E3) \
+ npyv_set_s32( \
+ vgetq_lane_s32(A, E0), vgetq_lane_s32(A, E1), \
+ vgetq_lane_s32(A, E2), vgetq_lane_s32(A, E3) \
+ )
+ #define npyv_permi128_f32(A, E0, E1, E2, E3) \
+ npyv_set_f32( \
+ vgetq_lane_f32(A, E0), vgetq_lane_f32(A, E1), \
+ vgetq_lane_f32(A, E2), vgetq_lane_f32(A, E3) \
+ )
+#endif
+
+#if defined(__clang__) || defined(__GNUC__)
+ #define npyv_permi128_s32 npyv_permi128_u32
+ #define npyv_permi128_f32 npyv_permi128_u32
+#endif
+
+#ifdef __clang__
+ #define npyv_permi128_u64(A, E0, E1) \
+ __builtin_shufflevector(A, A, E0, E1)
+#elif defined(__GNUC__)
+ #define npyv_permi128_u64(A, E0, E1) \
+ __builtin_shuffle(A, npyv_set_u64(E0, E1))
+#else
+ #define npyv_permi128_u64(A, E0, E1) \
+ npyv_set_u64( \
+ vgetq_lane_u64(A, E0), vgetq_lane_u64(A, E1) \
+ )
+ #define npyv_permi128_s64(A, E0, E1) \
+ npyv_set_s64( \
+ vgetq_lane_s64(A, E0), vgetq_lane_s64(A, E1) \
+ )
+ #define npyv_permi128_f64(A, E0, E1) \
+ npyv_set_f64( \
+ vgetq_lane_f64(A, E0), vgetq_lane_f64(A, E1) \
+ )
+#endif
+
+#if defined(__clang__) || defined(__GNUC__)
+ #define npyv_permi128_s64 npyv_permi128_u64
+ #define npyv_permi128_f64 npyv_permi128_u64
+#endif
+
+#if !NPY_SIMD_F64
+ #undef npyv_permi128_f64
+#endif
+
#endif // _NPY_SIMD_NEON_REORDER_H
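
The npyv_unzip_* counterparts added next to npyv_zip_* undo the interleave: zip merges the lanes of two vectors pairwise, while unzip gathers the even lanes and then the odd lanes. A 4-lane scalar model (an illustration only, not part of the patch):

#include <stdio.h>

int main(void)
{
    int a[4] = {0, 1, 2, 3}, b[4] = {4, 5, 6, 7};
    int zip0[4], zip1[4], unz0[4], unz1[4], ab[8];

    // zip: interleave lanes of a and b -> (a0,b0,a1,b1) and (a2,b2,a3,b3)
    for (int i = 0; i < 2; ++i) {
        zip0[2*i] = a[i];       zip0[2*i + 1] = b[i];
        zip1[2*i] = a[i + 2];   zip1[2*i + 1] = b[i + 2];
    }
    // unzip: gather even lanes, then odd lanes, of the concatenation (a|b)
    for (int i = 0; i < 4; ++i) { ab[i] = a[i]; ab[4 + i] = b[i]; }
    for (int i = 0; i < 4; ++i) {
        unz0[i] = ab[2*i];       // 0 2 4 6
        unz1[i] = ab[2*i + 1];   // 1 3 5 7
    }
    for (int i = 0; i < 4; ++i) printf("%d ", zip0[i]);  // 0 4 1 5
    for (int i = 0; i < 4; ++i) printf("%d ", zip1[i]);  // 2 6 3 7
    printf("| ");
    for (int i = 0; i < 4; ++i) printf("%d ", unz0[i]);
    for (int i = 0; i < 4; ++i) printf("%d ", unz1[i]);
    printf("\n");
    return 0;
}
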
diff --git a/numpy/core/src/common/simd/simd.h b/numpy/core/src/common/simd/simd.h
index 92a77ad80..8c9b14251 100644
--- a/numpy/core/src/common/simd/simd.h
+++ b/numpy/core/src/common/simd/simd.h
@@ -82,6 +82,9 @@ typedef double npyv_lanetype_f64;
#define NPY_SIMD_FMA3 0
/// 1 if the enabled SIMD extension is running on big-endian mode otherwise 0.
#define NPY_SIMD_BIGENDIAN 0
+    /// 1 if the supported comparison intrinsics (lt, le, gt, ge)
+    /// raise the FP invalid exception for quiet NaNs.
+ #define NPY_SIMD_CMPSIGNAL 0
#endif
// enable emulated mask operations for all SIMD extension except for AVX512
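
NPY_SIMD_CMPSIGNAL marks backends whose ordered comparisons raise the FP invalid exception on quiet NaNs. The standalone program below illustrates the distinction the macro documents; it relies on IEEE-754 / C Annex F behaviour, so the observed flags can vary with compiler and optimization settings:

#include <fenv.h>
#include <math.h>
#include <stdio.h>

#pragma STDC FENV_ACCESS ON

int main(void)
{
    volatile double qnan = NAN, one = 1.0;

    feclearexcept(FE_INVALID);
    volatile int lt = qnan < one;          // ordered compare: signals on NaN
    int lt_signaled = fetestexcept(FE_INVALID) != 0;

    feclearexcept(FE_INVALID);
    volatile int eq = qnan == qnan;        // quiet compare: no FE_INVALID
    int eq_signaled = fetestexcept(FE_INVALID) != 0;

    printf("lt=%d lt_signaled=%d eq=%d eq_signaled=%d\n",
           lt, lt_signaled, eq, eq_signaled);
    // typical output under IEEE-754 semantics: lt=0 lt_signaled=1 eq=0 eq_signaled=0
    return 0;
}
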
diff --git a/numpy/core/src/common/simd/sse/arithmetic.h b/numpy/core/src/common/simd/sse/arithmetic.h
index bced35108..72a87eac1 100644
--- a/numpy/core/src/common/simd/sse/arithmetic.h
+++ b/numpy/core/src/common/simd/sse/arithmetic.h
@@ -282,6 +282,10 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
// negate multiply and subtract, -(a*b) - c
#define npyv_nmulsub_f32 _mm_fnmsub_ps
#define npyv_nmulsub_f64 _mm_fnmsub_pd
+ // multiply, add for odd elements and subtract even elements.
+ // (a * b) -+ c
+ #define npyv_muladdsub_f32 _mm_fmaddsub_ps
+ #define npyv_muladdsub_f64 _mm_fmaddsub_pd
#elif defined(NPY_HAVE_FMA4)
// multiply and add, a*b + c
#define npyv_muladd_f32 _mm_macc_ps
@@ -292,6 +296,10 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
// negate multiply and add, -(a*b) + c
#define npyv_nmuladd_f32 _mm_nmacc_ps
#define npyv_nmuladd_f64 _mm_nmacc_pd
+ // multiply, add for odd elements and subtract even elements.
+ // (a * b) -+ c
+ #define npyv_muladdsub_f32 _mm_maddsub_ps
+ #define npyv_muladdsub_f64 _mm_maddsub_pd
#else
// multiply and add, a*b + c
NPY_FINLINE npyv_f32 npyv_muladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
@@ -308,6 +316,28 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
{ return npyv_sub_f32(c, npyv_mul_f32(a, b)); }
NPY_FINLINE npyv_f64 npyv_nmuladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
{ return npyv_sub_f64(c, npyv_mul_f64(a, b)); }
+ // multiply, add for odd elements and subtract even elements.
+ // (a * b) -+ c
+ NPY_FINLINE npyv_f32 npyv_muladdsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ {
+ npyv_f32 m = npyv_mul_f32(a, b);
+ #if NPY_HAVE_SSE3
+ return _mm_addsub_ps(m, c);
+ #else
+ const npyv_f32 msign = npyv_set_f32(-0.0f, 0.0f, -0.0f, 0.0f);
+ return npyv_add_f32(m, npyv_xor_f32(msign, c));
+ #endif
+ }
+ NPY_FINLINE npyv_f64 npyv_muladdsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ {
+ npyv_f64 m = npyv_mul_f64(a, b);
+ #if NPY_HAVE_SSE3
+ return _mm_addsub_pd(m, c);
+ #else
+ const npyv_f64 msign = npyv_set_f64(-0.0, 0.0);
+ return npyv_add_f64(m, npyv_xor_f64(msign, c));
+ #endif
+ }
#endif // NPY_HAVE_FMA3
#ifndef NPY_HAVE_FMA3 // for FMA4 and NON-FMA3
// negate multiply and subtract, -(a*b) - c
diff --git a/numpy/core/src/common/simd/sse/math.h b/numpy/core/src/common/simd/sse/math.h
index b7f8e6ebb..b51c935af 100644
--- a/numpy/core/src/common/simd/sse/math.h
+++ b/numpy/core/src/common/simd/sse/math.h
@@ -269,13 +269,23 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a)
#ifdef NPY_HAVE_SSE41
return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT);
#else
- const npyv_f32 szero = _mm_set1_ps(-0.0f);
- __m128i roundi = _mm_cvtps_epi32(a);
- __m128i overflow = _mm_cmpeq_epi32(roundi, _mm_castps_si128(szero));
- __m128 r = _mm_cvtepi32_ps(roundi);
- // respect sign of zero
- r = _mm_or_ps(r, _mm_and_ps(a, szero));
- return npyv_select_f32(overflow, a, r);
+ const __m128 szero = _mm_set1_ps(-0.0f);
+ const __m128i exp_mask = _mm_set1_epi32(0xff000000);
+
+ __m128i nfinite_mask = _mm_slli_epi32(_mm_castps_si128(a), 1);
+ nfinite_mask = _mm_and_si128(nfinite_mask, exp_mask);
+ nfinite_mask = _mm_cmpeq_epi32(nfinite_mask, exp_mask);
+
+ // eliminate nans/inf to avoid invalid fp errors
+ __m128 x = _mm_xor_ps(a, _mm_castsi128_ps(nfinite_mask));
+ __m128i roundi = _mm_cvtps_epi32(x);
+ __m128 round = _mm_cvtepi32_ps(roundi);
+ // respect signed zero
+ round = _mm_or_ps(round, _mm_and_ps(a, szero));
+ // if overflow return a
+ __m128i overflow_mask = _mm_cmpeq_epi32(roundi, _mm_castps_si128(szero));
+    // return a if it overflows or is non-finite
+ return npyv_select_f32(_mm_or_si128(nfinite_mask, overflow_mask), a, round);
#endif
}
@@ -285,16 +295,22 @@ NPY_FINLINE npyv_f64 npyv_rint_f64(npyv_f64 a)
#ifdef NPY_HAVE_SSE41
return _mm_round_pd(a, _MM_FROUND_TO_NEAREST_INT);
#else
- const npyv_f64 szero = _mm_set1_pd(-0.0);
- const npyv_f64 two_power_52 = _mm_set1_pd(0x10000000000000);
- npyv_f64 sign_two52 = _mm_or_pd(two_power_52, _mm_and_pd(a, szero));
+ const __m128d szero = _mm_set1_pd(-0.0);
+ const __m128d two_power_52 = _mm_set1_pd(0x10000000000000);
+ __m128d nan_mask = _mm_cmpunord_pd(a, a);
+ // eliminate nans to avoid invalid fp errors within cmpge
+ __m128d abs_x = npyv_abs_f64(_mm_xor_pd(nan_mask, a));
// round by add magic number 2^52
- npyv_f64 round = _mm_sub_pd(_mm_add_pd(a, sign_two52), sign_two52);
- // respect signed zero, e.g. -0.5 -> -0.0
- return _mm_or_pd(round, _mm_and_pd(a, szero));
+    // assuming that the MXCSR register is set to round-to-nearest
+ __m128d round = _mm_sub_pd(_mm_add_pd(two_power_52, abs_x), two_power_52);
+ // copysign
+ round = _mm_or_pd(round, _mm_and_pd(a, szero));
+ // a if |a| >= 2^52 or a == NaN
+ __m128d mask = _mm_cmpge_pd(abs_x, two_power_52);
+ mask = _mm_or_pd(mask, nan_mask);
+ return npyv_select_f64(_mm_castpd_si128(mask), a, round);
#endif
}
-
// ceil
#ifdef NPY_HAVE_SSE41
#define npyv_ceil_f32 _mm_ceil_ps
@@ -302,27 +318,48 @@ NPY_FINLINE npyv_f64 npyv_rint_f64(npyv_f64 a)
#else
NPY_FINLINE npyv_f32 npyv_ceil_f32(npyv_f32 a)
{
- const npyv_f32 szero = _mm_set1_ps(-0.0f);
- const npyv_f32 one = _mm_set1_ps(1.0f);
- npyv_s32 roundi = _mm_cvttps_epi32(a);
- npyv_f32 round = _mm_cvtepi32_ps(roundi);
- npyv_f32 ceil = _mm_add_ps(round, _mm_and_ps(_mm_cmplt_ps(round, a), one));
- // respect signed zero, e.g. -0.5 -> -0.0
- npyv_f32 rzero = _mm_or_ps(ceil, _mm_and_ps(a, szero));
+ const __m128 one = _mm_set1_ps(1.0f);
+ const __m128 szero = _mm_set1_ps(-0.0f);
+ const __m128i exp_mask = _mm_set1_epi32(0xff000000);
+
+ __m128i nfinite_mask = _mm_slli_epi32(_mm_castps_si128(a), 1);
+ nfinite_mask = _mm_and_si128(nfinite_mask, exp_mask);
+ nfinite_mask = _mm_cmpeq_epi32(nfinite_mask, exp_mask);
+
+ // eliminate nans/inf to avoid invalid fp errors
+ __m128 x = _mm_xor_ps(a, _mm_castsi128_ps(nfinite_mask));
+ __m128i roundi = _mm_cvtps_epi32(x);
+ __m128 round = _mm_cvtepi32_ps(roundi);
+ __m128 ceil = _mm_add_ps(round, _mm_and_ps(_mm_cmplt_ps(round, x), one));
+ // respect signed zero
+ ceil = _mm_or_ps(ceil, _mm_and_ps(a, szero));
// if overflow return a
- return npyv_select_f32(_mm_cmpeq_epi32(roundi, _mm_castps_si128(szero)), a, rzero);
+ __m128i overflow_mask = _mm_cmpeq_epi32(roundi, _mm_castps_si128(szero));
+        // return a if it overflows or is non-finite
+ return npyv_select_f32(_mm_or_si128(nfinite_mask, overflow_mask), a, ceil);
}
NPY_FINLINE npyv_f64 npyv_ceil_f64(npyv_f64 a)
{
- const npyv_f64 szero = _mm_set1_pd(-0.0);
- const npyv_f64 one = _mm_set1_pd(1.0);
- const npyv_f64 two_power_52 = _mm_set1_pd(0x10000000000000);
- npyv_f64 sign_two52 = _mm_or_pd(two_power_52, _mm_and_pd(a, szero));
+ const __m128d one = _mm_set1_pd(1.0);
+ const __m128d szero = _mm_set1_pd(-0.0);
+ const __m128d two_power_52 = _mm_set1_pd(0x10000000000000);
+ __m128d nan_mask = _mm_cmpunord_pd(a, a);
+ // eliminate nans to avoid invalid fp errors within cmpge
+ __m128d x = _mm_xor_pd(nan_mask, a);
+ __m128d abs_x = npyv_abs_f64(x);
+ __m128d sign_x = _mm_and_pd(x, szero);
// round by add magic number 2^52
- npyv_f64 round = _mm_sub_pd(_mm_add_pd(a, sign_two52), sign_two52);
- npyv_f64 ceil = _mm_add_pd(round, _mm_and_pd(_mm_cmplt_pd(round, a), one));
- // respect signed zero, e.g. -0.5 -> -0.0
- return _mm_or_pd(ceil, _mm_and_pd(a, szero));
+        // assuming that the MXCSR register is set to round-to-nearest
+ __m128d round = _mm_sub_pd(_mm_add_pd(two_power_52, abs_x), two_power_52);
+ // copysign
+ round = _mm_or_pd(round, sign_x);
+ __m128d ceil = _mm_add_pd(round, _mm_and_pd(_mm_cmplt_pd(round, x), one));
+ // respects sign of 0.0
+ ceil = _mm_or_pd(ceil, sign_x);
+ // a if |a| >= 2^52 or a == NaN
+ __m128d mask = _mm_cmpge_pd(abs_x, two_power_52);
+ mask = _mm_or_pd(mask, nan_mask);
+ return npyv_select_f64(_mm_castpd_si128(mask), a, ceil);
}
#endif
@@ -333,24 +370,43 @@ NPY_FINLINE npyv_f64 npyv_rint_f64(npyv_f64 a)
#else
NPY_FINLINE npyv_f32 npyv_trunc_f32(npyv_f32 a)
{
- const npyv_f32 szero = _mm_set1_ps(-0.0f);
- npyv_s32 roundi = _mm_cvttps_epi32(a);
- npyv_f32 trunc = _mm_cvtepi32_ps(roundi);
+ const __m128 szero = _mm_set1_ps(-0.0f);
+ const __m128i exp_mask = _mm_set1_epi32(0xff000000);
+
+ __m128i nfinite_mask = _mm_slli_epi32(_mm_castps_si128(a), 1);
+ nfinite_mask = _mm_and_si128(nfinite_mask, exp_mask);
+ nfinite_mask = _mm_cmpeq_epi32(nfinite_mask, exp_mask);
+
+ // eliminate nans/inf to avoid invalid fp errors
+ __m128 x = _mm_xor_ps(a, _mm_castsi128_ps(nfinite_mask));
+ __m128i trunci = _mm_cvttps_epi32(x);
+ __m128 trunc = _mm_cvtepi32_ps(trunci);
// respect signed zero, e.g. -0.5 -> -0.0
- npyv_f32 rzero = _mm_or_ps(trunc, _mm_and_ps(a, szero));
+ trunc = _mm_or_ps(trunc, _mm_and_ps(a, szero));
// if overflow return a
- return npyv_select_f32(_mm_cmpeq_epi32(roundi, _mm_castps_si128(szero)), a, rzero);
+ __m128i overflow_mask = _mm_cmpeq_epi32(trunci, _mm_castps_si128(szero));
+        // return a if it overflows or is non-finite
+ return npyv_select_f32(_mm_or_si128(nfinite_mask, overflow_mask), a, trunc);
}
NPY_FINLINE npyv_f64 npyv_trunc_f64(npyv_f64 a)
{
- const npyv_f64 szero = _mm_set1_pd(-0.0);
- const npyv_f64 one = _mm_set1_pd(1.0);
- const npyv_f64 two_power_52 = _mm_set1_pd(0x10000000000000);
- npyv_f64 abs_a = npyv_abs_f64(a);
+ const __m128d one = _mm_set1_pd(1.0);
+ const __m128d szero = _mm_set1_pd(-0.0);
+ const __m128d two_power_52 = _mm_set1_pd(0x10000000000000);
+ __m128d nan_mask = _mm_cmpunord_pd(a, a);
+ // eliminate nans to avoid invalid fp errors within cmpge
+ __m128d abs_x = npyv_abs_f64(_mm_xor_pd(nan_mask, a));
// round by add magic number 2^52
- npyv_f64 abs_round = _mm_sub_pd(_mm_add_pd(abs_a, two_power_52), two_power_52);
- npyv_f64 subtrahend = _mm_and_pd(_mm_cmpgt_pd(abs_round, abs_a), one);
- return _mm_or_pd(_mm_sub_pd(abs_round, subtrahend), _mm_and_pd(a, szero));
+        // assuming that the MXCSR register is set to round-to-nearest
+ __m128d abs_round = _mm_sub_pd(_mm_add_pd(two_power_52, abs_x), two_power_52);
+ __m128d subtrahend = _mm_and_pd(_mm_cmpgt_pd(abs_round, abs_x), one);
+ __m128d trunc = _mm_sub_pd(abs_round, subtrahend);
+ // copysign
+ trunc = _mm_or_pd(trunc, _mm_and_pd(a, szero));
+ // a if |a| >= 2^52 or a == NaN
+ __m128d mask = _mm_cmpge_pd(abs_x, two_power_52);
+ mask = _mm_or_pd(mask, nan_mask);
+ return npyv_select_f64(_mm_castpd_si128(mask), a, trunc);
}
#endif
@@ -361,15 +417,46 @@ NPY_FINLINE npyv_f64 npyv_rint_f64(npyv_f64 a)
#else
NPY_FINLINE npyv_f32 npyv_floor_f32(npyv_f32 a)
{
- const npyv_f32 one = _mm_set1_ps(1.0f);
- npyv_f32 round = npyv_rint_f32(a);
- return _mm_sub_ps(round, _mm_and_ps(_mm_cmpgt_ps(round, a), one));
+ const __m128 one = _mm_set1_ps(1.0f);
+ const __m128 szero = _mm_set1_ps(-0.0f);
+ const __m128i exp_mask = _mm_set1_epi32(0xff000000);
+
+ __m128i nfinite_mask = _mm_slli_epi32(_mm_castps_si128(a), 1);
+ nfinite_mask = _mm_and_si128(nfinite_mask, exp_mask);
+ nfinite_mask = _mm_cmpeq_epi32(nfinite_mask, exp_mask);
+
+ // eliminate nans/inf to avoid invalid fp errors
+ __m128 x = _mm_xor_ps(a, _mm_castsi128_ps(nfinite_mask));
+ __m128i roundi = _mm_cvtps_epi32(x);
+ __m128 round = _mm_cvtepi32_ps(roundi);
+ __m128 floor = _mm_sub_ps(round, _mm_and_ps(_mm_cmpgt_ps(round, x), one));
+ // respect signed zero
+ floor = _mm_or_ps(floor, _mm_and_ps(a, szero));
+ // if overflow return a
+ __m128i overflow_mask = _mm_cmpeq_epi32(roundi, _mm_castps_si128(szero));
+        // return a if it overflows or is non-finite
+ return npyv_select_f32(_mm_or_si128(nfinite_mask, overflow_mask), a, floor);
}
NPY_FINLINE npyv_f64 npyv_floor_f64(npyv_f64 a)
{
- const npyv_f64 one = _mm_set1_pd(1.0);
- npyv_f64 round = npyv_rint_f64(a);
- return _mm_sub_pd(round, _mm_and_pd(_mm_cmpgt_pd(round, a), one));
+        const __m128d one = _mm_set1_pd(1.0);
+        const __m128d szero = _mm_set1_pd(-0.0);
+ const __m128d two_power_52 = _mm_set1_pd(0x10000000000000);
+ __m128d nan_mask = _mm_cmpunord_pd(a, a);
+ // eliminate nans to avoid invalid fp errors within cmpge
+ __m128d x = _mm_xor_pd(nan_mask, a);
+ __m128d abs_x = npyv_abs_f64(x);
+ __m128d sign_x = _mm_and_pd(x, szero);
+ // round by add magic number 2^52
+        // assuming that the MXCSR register is set to round-to-nearest
+ __m128d round = _mm_sub_pd(_mm_add_pd(two_power_52, abs_x), two_power_52);
+ // copysign
+ round = _mm_or_pd(round, sign_x);
+ __m128d floor = _mm_sub_pd(round, _mm_and_pd(_mm_cmpgt_pd(round, x), one));
+ // a if |a| >= 2^52 or a == NaN
+ __m128d mask = _mm_cmpge_pd(abs_x, two_power_52);
+ mask = _mm_or_pd(mask, nan_mask);
+ return npyv_select_f64(_mm_castpd_si128(mask), a, floor);
}
#endif // NPY_HAVE_SSE41
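
Several SSE fallbacks above detect NaN/inf by shifting the raw float bits left by one (dropping the sign) and testing whether the exponent field is all ones. A scalar sketch of that non-finite test (an illustration only, not part of the patch):

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

// non-finite test: shift out the sign bit, then check for an all-ones exponent
static int is_nonfinite_f32(float x)
{
    uint32_t bits;
    memcpy(&bits, &x, sizeof(bits));
    uint32_t shifted = bits << 1;                    // sign bit discarded
    return (shifted & 0xff000000u) == 0xff000000u;   // exponent == 0xff: inf or NaN
}

int main(void)
{
    printf("%d %d %d %d\n",
           is_nonfinite_f32(1.5f),        // 0
           is_nonfinite_f32(INFINITY),    // 1
           is_nonfinite_f32(-INFINITY),   // 1
           is_nonfinite_f32(NAN));        // 1
    return 0;
}
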
diff --git a/numpy/core/src/common/simd/sse/memory.h b/numpy/core/src/common/simd/sse/memory.h
index 3ff64848d..4c8e86a6f 100644
--- a/numpy/core/src/common/simd/sse/memory.h
+++ b/numpy/core/src/common/simd/sse/memory.h
@@ -103,6 +103,28 @@ NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
{ return _mm_castpd_si128(npyv_loadn_f64((const double*)ptr, stride)); }
NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
{ return _mm_castpd_si128(npyv_loadn_f64((const double*)ptr, stride)); }
+
+//// 64-bit load over 32-bit stride
+NPY_FINLINE npyv_f32 npyv_loadn2_f32(const float *ptr, npy_intp stride)
+{
+ __m128d r = _mm_loadh_pd(
+ npyv_loadl_f64((const double*)ptr), (const double*)(ptr + stride)
+ );
+ return _mm_castpd_ps(r);
+}
+NPY_FINLINE npyv_u32 npyv_loadn2_u32(const npy_uint32 *ptr, npy_intp stride)
+{ return _mm_castps_si128(npyv_loadn2_f32((const float*)ptr, stride)); }
+NPY_FINLINE npyv_s32 npyv_loadn2_s32(const npy_int32 *ptr, npy_intp stride)
+{ return _mm_castps_si128(npyv_loadn2_f32((const float*)ptr, stride)); }
+
+//// 128-bit load over 64-bit stride
+NPY_FINLINE npyv_f64 npyv_loadn2_f64(const double *ptr, npy_intp stride)
+{ (void)stride; return npyv_load_f64(ptr); }
+NPY_FINLINE npyv_u64 npyv_loadn2_u64(const npy_uint64 *ptr, npy_intp stride)
+{ (void)stride; return npyv_load_u64(ptr); }
+NPY_FINLINE npyv_s64 npyv_loadn2_s64(const npy_int64 *ptr, npy_intp stride)
+{ (void)stride; return npyv_load_s64(ptr); }
+
/***************************
* Non-contiguous Store
***************************/
@@ -135,6 +157,24 @@ NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
{ npyv_storen_f64((double*)ptr, stride, _mm_castsi128_pd(a)); }
+//// 64-bit store over 32-bit stride
+NPY_FINLINE void npyv_storen2_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a)
+{
+ _mm_storel_pd((double*)ptr, _mm_castsi128_pd(a));
+ _mm_storeh_pd((double*)(ptr + stride), _mm_castsi128_pd(a));
+}
+NPY_FINLINE void npyv_storen2_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a)
+{ npyv_storen2_u32((npy_uint32*)ptr, stride, a); }
+NPY_FINLINE void npyv_storen2_f32(float *ptr, npy_intp stride, npyv_f32 a)
+{ npyv_storen2_u32((npy_uint32*)ptr, stride, _mm_castps_si128(a)); }
+
+//// 128-bit store over 64-bit stride
+NPY_FINLINE void npyv_storen2_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
+{ (void)stride; npyv_store_u64(ptr, a); }
+NPY_FINLINE void npyv_storen2_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
+{ (void)stride; npyv_store_s64(ptr, a); }
+NPY_FINLINE void npyv_storen2_f64(double *ptr, npy_intp stride, npyv_f64 a)
+{ (void)stride; npyv_store_f64(ptr, a); }
/*********************************
* Partial Load
*********************************/
@@ -204,13 +244,14 @@ NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
return _mm_cvtsi32_si128(*ptr);
case 2:
return _mm_loadl_epi64((const __m128i*)ptr);
- case 3:;
- npyv_s32 a = _mm_loadl_epi64((const __m128i*)ptr);
- #ifdef NPY_HAVE_SSE41
- return _mm_insert_epi32(a, ptr[2], 2);
- #else
- return _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[2]));
- #endif
+ case 3: {
+ npyv_s32 a = _mm_loadl_epi64((const __m128i*)ptr);
+ #ifdef NPY_HAVE_SSE41
+ return _mm_insert_epi32(a, ptr[2], 2);
+ #else
+ return _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[2]));
+ #endif
+ }
default:
return npyv_load_s32(ptr);
}
@@ -246,6 +287,32 @@ NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
}
return npyv_load_s64(ptr);
}
+
+//// 64-bit nlane
+NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane,
+ npy_int32 fill_lo, npy_int32 fill_hi)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ const __m128i vfill = npyv_set_s32(fill_lo, fill_hi, fill_lo, fill_hi);
+ return _mm_castpd_si128(
+ _mm_loadl_pd(_mm_castsi128_pd(vfill), (double*)ptr)
+ );
+ }
+ return npyv_load_s32(ptr);
+}
+// fill the remaining lanes with zero
+NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
+{ return npyv_load_tillz_s64((const npy_int64*)ptr, nlane); }
+
+//// 128-bit nlane
+NPY_FINLINE npyv_s64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane,
+ npy_int64 fill_lo, npy_int64 fill_hi)
+{ (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); }
+
+NPY_FINLINE npyv_s64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
+{ (void)nlane; return npyv_load_s64(ptr); }
+
/*********************************
* Non-contiguous partial load
*********************************/
@@ -305,23 +372,27 @@ npyv_loadn_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
case 1:
return _mm_cvtsi32_si128(ptr[0]);
case 2:;
- npyv_s32 a = _mm_cvtsi32_si128(ptr[0]);
-#ifdef NPY_HAVE_SSE41
- return _mm_insert_epi32(a, ptr[stride], 1);
-#else
- return _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride]));
-#endif // NPY_HAVE_SSE41
- case 3:;
- a = _mm_cvtsi32_si128(ptr[0]);
-#ifdef NPY_HAVE_SSE41
- a = _mm_insert_epi32(a, ptr[stride], 1);
- a = _mm_insert_epi32(a, ptr[stride*2], 2);
- return a;
-#else
- a = _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride]));
- a = _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[stride*2]));
- return a;
-#endif // NPY_HAVE_SSE41
+ {
+ npyv_s32 a = _mm_cvtsi32_si128(ptr[0]);
+ #ifdef NPY_HAVE_SSE41
+ return _mm_insert_epi32(a, ptr[stride], 1);
+ #else
+ return _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride]));
+ #endif // NPY_HAVE_SSE41
+ }
+ case 3:
+ {
+ npyv_s32 a = _mm_cvtsi32_si128(ptr[0]);
+ #ifdef NPY_HAVE_SSE41
+ a = _mm_insert_epi32(a, ptr[stride], 1);
+ a = _mm_insert_epi32(a, ptr[stride*2], 2);
+ return a;
+ #else
+ a = _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride]));
+ a = _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[stride*2]));
+ return a;
+ #endif // NPY_HAVE_SSE41
+ }
default:
return npyv_loadn_s32(ptr, stride);
}
@@ -358,6 +429,37 @@ NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride,
}
return npyv_loadn_s64(ptr, stride);
}
+
+//// 64-bit load over 32-bit stride
+NPY_FINLINE npyv_s32 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane,
+ npy_int32 fill_lo, npy_int32 fill_hi)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ const __m128i vfill = npyv_set_s32(0, 0, fill_lo, fill_hi);
+ return _mm_castpd_si128(
+ _mm_loadl_pd(_mm_castsi128_pd(vfill), (double*)ptr)
+ );
+ }
+ return npyv_loadn2_s32(ptr, stride);
+}
+NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ return _mm_loadl_epi64((const __m128i*)ptr);
+ }
+ return npyv_loadn2_s32(ptr, stride);
+}
+
+//// 128-bit load over 64-bit stride
+NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane,
+ npy_int64 fill_lo, npy_int64 fill_hi)
+{ assert(nlane > 0); (void)stride; (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); }
+
+NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
+{ assert(nlane > 0); (void)stride; (void)nlane; return npyv_load_s64(ptr); }
+
/*********************************
* Partial store
*********************************/
@@ -394,6 +496,17 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a
}
npyv_store_s64(ptr, a);
}
+//// 64-bit nlane
+NPY_FINLINE void npyv_store2_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a)
+{ npyv_store_till_s64((npy_int64*)ptr, nlane, a); }
+
+//// 128-bit nlane
+NPY_FINLINE void npyv_store2_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0); (void)nlane;
+ npyv_store_s64(ptr, a);
+}
+
/*********************************
* Non-contiguous partial store
*********************************/
@@ -401,25 +514,35 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a
NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
{
assert(nlane > 0);
+ ptr[stride*0] = _mm_cvtsi128_si32(a);
switch(nlane) {
+ case 1:
+ return;
#ifdef NPY_HAVE_SSE41
- default:
- ptr[stride*3] = _mm_extract_epi32(a, 3);
+ case 2:
+ ptr[stride*1] = _mm_extract_epi32(a, 1);
+ return;
case 3:
+ ptr[stride*1] = _mm_extract_epi32(a, 1);
ptr[stride*2] = _mm_extract_epi32(a, 2);
- case 2:
+ return;
+ default:
ptr[stride*1] = _mm_extract_epi32(a, 1);
+ ptr[stride*2] = _mm_extract_epi32(a, 2);
+ ptr[stride*3] = _mm_extract_epi32(a, 3);
#else
- default:
- ptr[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 3)));
+ case 2:
+ ptr[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 1)));
+ return;
case 3:
+ ptr[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 1)));
ptr[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 2)));
- case 2:
+ return;
+ default:
ptr[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 1)));
+ ptr[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 2)));
+ ptr[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 3)));
#endif
- case 1:
- ptr[stride*0] = _mm_cvtsi128_si32(a);
- break;
}
}
//// 64
@@ -432,6 +555,21 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
}
npyv_storen_s64(ptr, stride, a);
}
+
+//// 64-bit store over 32-bit stride
+NPY_FINLINE void npyv_storen2_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+ npyv_storel_s32(ptr, a);
+ if (nlane > 1) {
+ npyv_storeh_s32(ptr + stride, a);
+ }
+}
+
+//// 128-bit store over 64-bit stride
+NPY_FINLINE void npyv_storen2_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a)
+{ assert(nlane > 0); (void)stride; (void)nlane; npyv_store_s64(ptr, a); }
+
/*****************************************************************
* Implement partial load/store for u32/f32/u64/f64... via casting
*****************************************************************/
@@ -442,7 +580,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
union { \
npyv_lanetype_##F_SFX from_##F_SFX; \
npyv_lanetype_##T_SFX to_##T_SFX; \
- } pun = {.from_##F_SFX = fill}; \
+ } pun; \
+ pun.from_##F_SFX = fill; \
return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX( \
(const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX \
)); \
@@ -454,7 +593,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
union { \
npyv_lanetype_##F_SFX from_##F_SFX; \
npyv_lanetype_##T_SFX to_##T_SFX; \
- } pun = {.from_##F_SFX = fill}; \
+ } pun; \
+ pun.from_##F_SFX = fill; \
return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX( \
(const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX \
)); \
@@ -495,6 +635,110 @@ NPYV_IMPL_SSE_REST_PARTIAL_TYPES(f32, s32)
NPYV_IMPL_SSE_REST_PARTIAL_TYPES(u64, s64)
NPYV_IMPL_SSE_REST_PARTIAL_TYPES(f64, s64)
+// 128-bit/64-bit stride
+#define NPYV_IMPL_SSE_REST_PARTIAL_TYPES_PAIR(F_SFX, T_SFX) \
+ NPY_FINLINE npyv_##F_SFX npyv_load2_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, \
+ npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \
+ { \
+ union pun { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ }; \
+ union pun pun_lo; \
+ union pun pun_hi; \
+ pun_lo.from_##F_SFX = fill_lo; \
+ pun_hi.from_##F_SFX = fill_hi; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane, pun_lo.to_##T_SFX, pun_hi.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn2_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \
+ npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \
+ { \
+ union pun { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ }; \
+ union pun pun_lo; \
+ union pun pun_hi; \
+ pun_lo.from_##F_SFX = fill_lo; \
+ pun_hi.from_##F_SFX = fill_hi; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun_lo.to_##T_SFX, \
+ pun_hi.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_load2_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn2_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \
+ )); \
+ } \
+ NPY_FINLINE void npyv_store2_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_store2_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ } \
+ NPY_FINLINE void npyv_storen2_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_storen2_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ }
+
+NPYV_IMPL_SSE_REST_PARTIAL_TYPES_PAIR(u32, s32)
+NPYV_IMPL_SSE_REST_PARTIAL_TYPES_PAIR(f32, s32)
+NPYV_IMPL_SSE_REST_PARTIAL_TYPES_PAIR(u64, s64)
+NPYV_IMPL_SSE_REST_PARTIAL_TYPES_PAIR(f64, s64)
+
+/************************************************************
+ * de-interleave load / interleave contiguous store
+ ************************************************************/
+// two channels
+#define NPYV_IMPL_SSE_MEM_INTERLEAVE(SFX, ZSFX) \
+ NPY_FINLINE npyv_##ZSFX##x2 npyv_zip_##ZSFX(npyv_##ZSFX, npyv_##ZSFX); \
+ NPY_FINLINE npyv_##ZSFX##x2 npyv_unzip_##ZSFX(npyv_##ZSFX, npyv_##ZSFX); \
+ NPY_FINLINE npyv_##SFX##x2 npyv_load_##SFX##x2( \
+ const npyv_lanetype_##SFX *ptr \
+ ) { \
+ return npyv_unzip_##ZSFX( \
+ npyv_load_##SFX(ptr), npyv_load_##SFX(ptr+npyv_nlanes_##SFX) \
+ ); \
+ } \
+ NPY_FINLINE void npyv_store_##SFX##x2( \
+ npyv_lanetype_##SFX *ptr, npyv_##SFX##x2 v \
+ ) { \
+ npyv_##SFX##x2 zip = npyv_zip_##ZSFX(v.val[0], v.val[1]); \
+ npyv_store_##SFX(ptr, zip.val[0]); \
+ npyv_store_##SFX(ptr + npyv_nlanes_##SFX, zip.val[1]); \
+ }
+
+NPYV_IMPL_SSE_MEM_INTERLEAVE(u8, u8)
+NPYV_IMPL_SSE_MEM_INTERLEAVE(s8, u8)
+NPYV_IMPL_SSE_MEM_INTERLEAVE(u16, u16)
+NPYV_IMPL_SSE_MEM_INTERLEAVE(s16, u16)
+NPYV_IMPL_SSE_MEM_INTERLEAVE(u32, u32)
+NPYV_IMPL_SSE_MEM_INTERLEAVE(s32, u32)
+NPYV_IMPL_SSE_MEM_INTERLEAVE(u64, u64)
+NPYV_IMPL_SSE_MEM_INTERLEAVE(s64, u64)
+NPYV_IMPL_SSE_MEM_INTERLEAVE(f32, f32)
+NPYV_IMPL_SSE_MEM_INTERLEAVE(f64, f64)
+
/*********************************
* Lookup table
*********************************/
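
The partial load/store wrappers now write the fill value through one union member and read it back through the other, so a float fill reaches the integer-typed implementation bit-exactly. A standalone sketch of that punning; the float/int32 pairing below mirrors the f32/s32 case and is only an example:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    union {
        float   from_f32;   // member written with the caller's fill value
        int32_t to_s32;     // member read by the s32-typed implementation
    } pun;
    pun.from_f32 = -1.0f;
    uint32_t bits = (uint32_t)pun.to_s32;
    printf("0x%08x\n", bits);   // 0xbf800000, the bit pattern of -1.0f
    return 0;
}
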
diff --git a/numpy/core/src/common/simd/sse/reorder.h b/numpy/core/src/common/simd/sse/reorder.h
index d96ab9c56..9a57f6489 100644
--- a/numpy/core/src/common/simd/sse/reorder.h
+++ b/numpy/core/src/common/simd/sse/reorder.h
@@ -81,6 +81,75 @@ NPYV_IMPL_SSE_ZIP(npyv_s64, s64, epi64)
NPYV_IMPL_SSE_ZIP(npyv_f32, f32, ps)
NPYV_IMPL_SSE_ZIP(npyv_f64, f64, pd)
+// deinterleave two vectors
+NPY_FINLINE npyv_u8x2 npyv_unzip_u8(npyv_u8 ab0, npyv_u8 ab1)
+{
+#ifdef NPY_HAVE_SSSE3
+ const __m128i idx = _mm_setr_epi8(
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+ );
+ __m128i abl = _mm_shuffle_epi8(ab0, idx);
+ __m128i abh = _mm_shuffle_epi8(ab1, idx);
+ return npyv_combine_u8(abl, abh);
+#else
+ __m128i ab_083b = _mm_unpacklo_epi8(ab0, ab1);
+ __m128i ab_4c6e = _mm_unpackhi_epi8(ab0, ab1);
+ __m128i ab_048c = _mm_unpacklo_epi8(ab_083b, ab_4c6e);
+ __m128i ab_36be = _mm_unpackhi_epi8(ab_083b, ab_4c6e);
+ __m128i ab_0346 = _mm_unpacklo_epi8(ab_048c, ab_36be);
+ __m128i ab_8bc8 = _mm_unpackhi_epi8(ab_048c, ab_36be);
+ npyv_u8x2 r;
+ r.val[0] = _mm_unpacklo_epi8(ab_0346, ab_8bc8);
+ r.val[1] = _mm_unpackhi_epi8(ab_0346, ab_8bc8);
+ return r;
+#endif
+}
+#define npyv_unzip_s8 npyv_unzip_u8
+
+NPY_FINLINE npyv_u16x2 npyv_unzip_u16(npyv_u16 ab0, npyv_u16 ab1)
+{
+#ifdef NPY_HAVE_SSSE3
+ const __m128i idx = _mm_setr_epi8(
+ 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15
+ );
+ __m128i abl = _mm_shuffle_epi8(ab0, idx);
+ __m128i abh = _mm_shuffle_epi8(ab1, idx);
+ return npyv_combine_u16(abl, abh);
+#else
+ __m128i ab_0415 = _mm_unpacklo_epi16(ab0, ab1);
+ __m128i ab_263f = _mm_unpackhi_epi16(ab0, ab1);
+ __m128i ab_0246 = _mm_unpacklo_epi16(ab_0415, ab_263f);
+ __m128i ab_135f = _mm_unpackhi_epi16(ab_0415, ab_263f);
+ npyv_u16x2 r;
+ r.val[0] = _mm_unpacklo_epi16(ab_0246, ab_135f);
+ r.val[1] = _mm_unpackhi_epi16(ab_0246, ab_135f);
+ return r;
+#endif
+}
+#define npyv_unzip_s16 npyv_unzip_u16
+
+NPY_FINLINE npyv_u32x2 npyv_unzip_u32(npyv_u32 ab0, npyv_u32 ab1)
+{
+ __m128i abl = _mm_shuffle_epi32(ab0, _MM_SHUFFLE(3, 1, 2, 0));
+ __m128i abh = _mm_shuffle_epi32(ab1, _MM_SHUFFLE(3, 1, 2, 0));
+ return npyv_combine_u32(abl, abh);
+}
+#define npyv_unzip_s32 npyv_unzip_u32
+
+NPY_FINLINE npyv_u64x2 npyv_unzip_u64(npyv_u64 ab0, npyv_u64 ab1)
+{ return npyv_combine_u64(ab0, ab1); }
+#define npyv_unzip_s64 npyv_unzip_u64
+
+NPY_FINLINE npyv_f32x2 npyv_unzip_f32(npyv_f32 ab0, npyv_f32 ab1)
+{
+ npyv_f32x2 r;
+ r.val[0] = _mm_shuffle_ps(ab0, ab1, _MM_SHUFFLE(2, 0, 2, 0));
+ r.val[1] = _mm_shuffle_ps(ab0, ab1, _MM_SHUFFLE(3, 1, 3, 1));
+ return r;
+}
+NPY_FINLINE npyv_f64x2 npyv_unzip_f64(npyv_f64 ab0, npyv_f64 ab1)
+{ return npyv_combine_f64(ab0, ab1); }
+
// Reverse elements of each 64-bit lane
NPY_FINLINE npyv_u16 npyv_rev64_u16(npyv_u16 a)
{
@@ -122,4 +191,22 @@ NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a)
return _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1));
}
+// Permuting the elements of each 128-bit lane by immediate index for
+// each element.
+#define npyv_permi128_u32(A, E0, E1, E2, E3) \
+ _mm_shuffle_epi32(A, _MM_SHUFFLE(E3, E2, E1, E0))
+
+#define npyv_permi128_s32 npyv_permi128_u32
+
+#define npyv_permi128_u64(A, E0, E1) \
+ _mm_shuffle_epi32(A, _MM_SHUFFLE(((E1)<<1)+1, ((E1)<<1), ((E0)<<1)+1, ((E0)<<1)))
+
+#define npyv_permi128_s64 npyv_permi128_u64
+
+#define npyv_permi128_f32(A, E0, E1, E2, E3) \
+ _mm_shuffle_ps(A, A, _MM_SHUFFLE(E3, E2, E1, E0))
+
+#define npyv_permi128_f64(A, E0, E1) \
+ _mm_shuffle_pd(A, A, _MM_SHUFFLE2(E1, E0))
+
#endif // _NPY_SIMD_SSE_REORDER_H
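
The npyv_permi128_* macros pick each destination lane by an immediate index into the source vector, so indices (3, 2, 1, 0) reverse a 4-lane vector. A scalar model of that permute (an illustration, not the macro itself):

#include <stdint.h>
#include <stdio.h>

// destination lane i takes the source lane named by the i-th immediate index
static void permi128_u32_ref(const uint32_t src[4], uint32_t dst[4],
                             int e0, int e1, int e2, int e3)
{
    const int idx[4] = {e0, e1, e2, e3};
    for (int i = 0; i < 4; ++i) {
        dst[i] = src[idx[i]];
    }
}

int main(void)
{
    uint32_t a[4] = {10, 11, 12, 13}, r[4];
    permi128_u32_ref(a, r, 3, 2, 1, 0);
    printf("%u %u %u %u\n", r[0], r[1], r[2], r[3]);  // 13 12 11 10
    return 0;
}
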
diff --git a/numpy/core/src/common/simd/sse/sse.h b/numpy/core/src/common/simd/sse/sse.h
index c21bbfda7..0c6b8cdba 100644
--- a/numpy/core/src/common/simd/sse/sse.h
+++ b/numpy/core/src/common/simd/sse/sse.h
@@ -12,6 +12,7 @@
#define NPY_SIMD_FMA3 0 // fast emulated
#endif
#define NPY_SIMD_BIGENDIAN 0
+#define NPY_SIMD_CMPSIGNAL 1
typedef __m128i npyv_u8;
typedef __m128i npyv_s8;
diff --git a/numpy/core/src/common/simd/vec/arithmetic.h b/numpy/core/src/common/simd/vec/arithmetic.h
index a2e9d07eb..85f4d6b26 100644
--- a/numpy/core/src/common/simd/vec/arithmetic.h
+++ b/numpy/core/src/common/simd/vec/arithmetic.h
@@ -322,6 +322,20 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
NPY_FINLINE npyv_f64 npyv_nmulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
{ return vec_neg(vec_madd(a, b, c)); }
#endif
+// multiply, add for odd elements and subtract even elements.
+// (a * b) -+ c
+#if NPY_SIMD_F32
+NPY_FINLINE npyv_f32 npyv_muladdsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+{
+ const npyv_f32 msign = npyv_set_f32(-0.0f, 0.0f, -0.0f, 0.0f);
+ return npyv_muladd_f32(a, b, npyv_xor_f32(msign, c));
+}
+#endif
+NPY_FINLINE npyv_f64 npyv_muladdsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+{
+ const npyv_f64 msign = npyv_set_f64(-0.0, 0.0);
+ return npyv_muladd_f64(a, b, npyv_xor_f64(msign, c));
+}
/***************************
* Summation
***************************/
@@ -352,6 +366,7 @@ NPY_FINLINE float npyv_sum_f32(npyv_f32 a)
{
npyv_f32 sum = vec_add(a, npyv_combineh_f32(a, a));
return vec_extract(sum, 0) + vec_extract(sum, 1);
+ (void)sum;
}
#endif
@@ -372,6 +387,7 @@ NPY_FINLINE npy_uint16 npyv_sumup_u8(npyv_u8 a)
npyv_u32 four = vec_sum4s(a, zero);
npyv_s32 one = vec_sums((npyv_s32)four, (npyv_s32)zero);
return (npy_uint16)vec_extract(one, 3);
+ (void)one;
#endif
}
@@ -386,6 +402,7 @@ NPY_FINLINE npy_uint32 npyv_sumup_u16(npyv_u16 a)
npyv_u32 four = vec_add(eight.val[0], eight.val[1]);
npyv_s32 one = vec_sums((npyv_s32)four, zero);
return (npy_uint32)vec_extract(one, 3);
+ (void)one;
#endif
}
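A scalar model of the new npyv_muladdsub_* helpers: subtract c in even lanes, add c in odd lanes, per the "(a * b) -+ c" comment. The helper below is a hypothetical reference, not NumPy API.

// Scalar reference: even lanes compute a*b - c, odd lanes a*b + c.
static void ref_muladdsub_f64(const double *a, const double *b,
                              const double *c, double *r, int nlanes)
{
    for (int i = 0; i < nlanes; ++i)
        r[i] = (i & 1) ? a[i] * b[i] + c[i]   // odd lane: add
                       : a[i] * b[i] - c[i];  // even lane: subtract
}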
diff --git a/numpy/core/src/common/simd/vec/conversion.h b/numpy/core/src/common/simd/vec/conversion.h
index f0d625c55..922109f7b 100644
--- a/numpy/core/src/common/simd/vec/conversion.h
+++ b/numpy/core/src/common/simd/vec/conversion.h
@@ -96,6 +96,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
#else
return vec_extract(r, 4);
#endif
+    // suppress the 'variable `r` set but not used' warning [-Wunused-but-set-variable]
+ (void)r;
}
NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a)
{
@@ -106,6 +108,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
#else
return vec_extract(r, 8);
#endif
+    // suppress the 'variable `r` set but not used' warning [-Wunused-but-set-variable]
+ (void)r;
}
NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
{
@@ -120,6 +124,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
#else
return vec_extract(r, 8);
#endif
+    // suppress the 'variable `r` set but not used' warning [-Wunused-but-set-variable]
+ (void)r;
}
NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
{
@@ -134,6 +140,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
#else
return vec_extract(r, 8);
#endif
+    // suppress the 'variable `r` set but not used' warning [-Wunused-but-set-variable]
+ (void)r;
}
#else
NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
@@ -170,7 +178,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
#ifdef NPY_HAVE_VXE2
return vec_signed(a);
#elif defined(NPY_HAVE_VXE)
- return vec_packs(vec_signed(npyv_doublee(a)), vec_signed(npyv_doublee(vec_mergel(a, a))));
+ return vec_packs(vec_signed(npyv_doublee(vec_mergeh(a,a))),
+ vec_signed(npyv_doublee(vec_mergel(a, a))));
// VSX
#elif defined(__IBMC__)
return vec_cts(a, 0);
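The VXE fix above routes the float lanes through vec_mergeh/vec_mergel before npyv_doublee, which only widens even-indexed lanes. A scalar model of the merge step, assuming the big-endian element order of the VXE targets this path serves; the helpers are hypothetical.

// mergeh(a, a) duplicates the first two lanes: {a0, a0, a1, a1}, so the even
// slots now expose a0 and a1; mergel(a, a) = {a2, a2, a3, a3} exposes a2 and
// a3. Previously lane 1 never reached the conversion and lane 2 was converted
// twice.
static void ref_mergeh_f32(const float a[4], float r[4])
{ r[0] = a[0]; r[1] = a[0]; r[2] = a[1]; r[3] = a[1]; }

static void ref_mergel_f32(const float a[4], float r[4])
{ r[0] = a[2]; r[1] = a[2]; r[2] = a[3]; r[3] = a[3]; }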
diff --git a/numpy/core/src/common/simd/vec/math.h b/numpy/core/src/common/simd/vec/math.h
index 95b16fdf7..85690f76c 100644
--- a/numpy/core/src/common/simd/vec/math.h
+++ b/numpy/core/src/common/simd/vec/math.h
@@ -186,6 +186,7 @@ NPY_IMPL_VEC_REDUCE_MINMAX(max, int32, s32)
{ \
npyv_##SFX r = vec_##INTRIN(a, vec_sld(a, a, 8)); \
return (npy_##STYPE)vec_extract(r, 0); \
+ (void)r; \
}
NPY_IMPL_VEC_REDUCE_MINMAX(min, uint64, u64)
NPY_IMPL_VEC_REDUCE_MINMAX(max, uint64, u64)
@@ -225,6 +226,7 @@ NPY_IMPL_VEC_REDUCE_MINMAX(max, int64, s64)
{ \
npyv_f64 r = vec_##INTRIN(a, vec_sld(a, a, 8)); \
return vec_extract(r, 0); \
+ (void)r; \
} \
NPY_FINLINE double npyv_reduce_##INTRIN##n_f64(npyv_f64 a) \
{ \
diff --git a/numpy/core/src/common/simd/vec/memory.h b/numpy/core/src/common/simd/vec/memory.h
index e8f588ef2..1ad39cead 100644
--- a/numpy/core/src/common/simd/vec/memory.h
+++ b/numpy/core/src/common/simd/vec/memory.h
@@ -46,14 +46,8 @@
#endif
// avoid aliasing rules
-#ifdef __cplusplus
- template<typename T_PTR>
- NPY_FINLINE npy_uint64 *npyv__ptr2u64(const T_PTR *ptr)
- { npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr64; }
-#else
- NPY_FINLINE npy_uint64 *npyv__ptr2u64(const void *ptr)
- { npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr64; }
-#endif // __cplusplus
+NPY_FINLINE npy_uint64 *npyv__ptr2u64(const void *ptr)
+{ npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr64; }
// load lower part
NPY_FINLINE npyv_u64 npyv__loadl(const void *ptr)
@@ -136,6 +130,24 @@ NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
{ return npyv_set_s64(ptr[0], ptr[stride]); }
NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride)
{ return npyv_set_f64(ptr[0], ptr[stride]); }
+
+//// 64-bit load over 32-bit stride
+NPY_FINLINE npyv_u32 npyv_loadn2_u32(const npy_uint32 *ptr, npy_intp stride)
+{ return (npyv_u32)npyv_set_u64(*(npy_uint64*)ptr, *(npy_uint64*)(ptr + stride)); }
+NPY_FINLINE npyv_s32 npyv_loadn2_s32(const npy_int32 *ptr, npy_intp stride)
+{ return (npyv_s32)npyv_set_u64(*(npy_uint64*)ptr, *(npy_uint64*)(ptr + stride)); }
+#if NPY_SIMD_F32
+NPY_FINLINE npyv_f32 npyv_loadn2_f32(const float *ptr, npy_intp stride)
+{ return (npyv_f32)npyv_set_u64(*(npy_uint64*)ptr, *(npy_uint64*)(ptr + stride)); }
+#endif
+//// 128-bit load over 64-bit stride
+NPY_FINLINE npyv_u64 npyv_loadn2_u64(const npy_uint64 *ptr, npy_intp stride)
+{ (void)stride; return npyv_load_u64(ptr); }
+NPY_FINLINE npyv_s64 npyv_loadn2_s64(const npy_int64 *ptr, npy_intp stride)
+{ (void)stride; return npyv_load_s64(ptr); }
+NPY_FINLINE npyv_f64 npyv_loadn2_f64(const double *ptr, npy_intp stride)
+{ (void)stride; return npyv_load_f64(ptr); }
+
/***************************
* Non-contiguous Store
***************************/
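A hedged usage sketch of the loadn2 helpers added above; the function name and data layout are illustrative, not part of NumPy.

// Gather two adjacent-uint32 pairs that sit `stride` elements apart,
// e.g. the first two fields of two records in an array of structs.
NPY_FINLINE void gather_two_pairs(const npy_uint32 *ptr, npy_intp stride,
                                  npy_uint32 out[4])
{
    // memory order of the lanes: {ptr[0], ptr[1], ptr[stride], ptr[stride+1]}
    npyv_u32 v = npyv_loadn2_u32(ptr, stride);
    npyv_store_u32(out, v);
}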
@@ -164,6 +176,26 @@ NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a)
{ npyv_storen_u64((npy_uint64*)ptr, stride, (npyv_u64)a); }
+//// 64-bit store over 32-bit stride
+NPY_FINLINE void npyv_storen2_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a)
+{
+ *(npy_uint64*)ptr = vec_extract((npyv_u64)a, 0);
+ *(npy_uint64*)(ptr + stride) = vec_extract((npyv_u64)a, 1);
+}
+NPY_FINLINE void npyv_storen2_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a)
+{ npyv_storen2_u32((npy_uint32*)ptr, stride, (npyv_u32)a); }
+#if NPY_SIMD_F32
+NPY_FINLINE void npyv_storen2_f32(float *ptr, npy_intp stride, npyv_f32 a)
+{ npyv_storen2_u32((npy_uint32*)ptr, stride, (npyv_u32)a); }
+#endif
+//// 128-bit store over 64-bit stride
+NPY_FINLINE void npyv_storen2_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
+{ (void)stride; npyv_store_u64(ptr, a); }
+NPY_FINLINE void npyv_storen2_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
+{ (void)stride; npyv_store_s64(ptr, a); }
+NPY_FINLINE void npyv_storen2_f64(double *ptr, npy_intp stride, npyv_f64 a)
+{ (void)stride; npyv_store_f64(ptr, a); }
+
/*********************************
* Partial Load
*********************************/
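Mirror image of the sketch above for the new storen2 helpers; again illustrative only.

// Scatter the two 64-bit halves of a u32 vector to rows that are
// `stride` elements apart.
NPY_FINLINE void scatter_two_pairs(npy_uint32 *ptr, npy_intp stride, npyv_u32 v)
{
    // ptr[0..1] receive lanes 0..1, ptr[stride..stride+1] receive lanes 2..3
    npyv_storen2_u32(ptr, stride, v);
}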
@@ -173,9 +205,9 @@ NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, n
assert(nlane > 0);
npyv_s32 vfill = npyv_setall_s32(fill);
#ifdef NPY_HAVE_VX
- const unsigned blane = (unsigned short)nlane;
+ const unsigned blane = (nlane > 4) ? 4 : nlane;
const npyv_u32 steps = npyv_set_u32(0, 1, 2, 3);
- const npyv_u32 vlane = npyv_setall_u32((unsigned)blane);
+ const npyv_u32 vlane = npyv_setall_u32(blane);
const npyv_b32 mask = vec_cmpgt(vlane, steps);
npyv_s32 a = vec_load_len(ptr, blane*4-1);
return vec_sel(vfill, a, mask);
@@ -201,8 +233,8 @@ NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, n
NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
{
#ifdef NPY_HAVE_VX
- unsigned blane = ((unsigned short)nlane)*4 - 1;
- return vec_load_len(ptr, blane);
+ unsigned blane = (nlane > 4) ? 4 : nlane;
+ return vec_load_len(ptr, blane*4-1);
#else
return npyv_load_till_s32(ptr, nlane, 0);
#endif
@@ -220,12 +252,33 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n
NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
{
#ifdef NPY_HAVE_VX
- unsigned blane = (unsigned short)nlane;
+ unsigned blane = (nlane > 2) ? 2 : nlane;
return vec_load_len((const signed long long*)ptr, blane*8-1);
#else
return npyv_load_till_s64(ptr, nlane, 0);
#endif
}
+//// 64-bit nlane
+NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane,
+ npy_int32 fill_lo, npy_int32 fill_hi)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ return npyv_set_s32(ptr[0], ptr[1], fill_lo, fill_hi);
+ }
+ return npyv_load_s32(ptr);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
+{ return (npyv_s32)npyv_load_tillz_s64((const npy_int64*)ptr, nlane); }
+
+//// 128-bit nlane
+NPY_FINLINE npyv_s64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane,
+ npy_int64 fill_lo, npy_int64 fill_hi)
+{ (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); }
+
+NPY_FINLINE npyv_s64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
+{ (void)nlane; return npyv_load_s64(ptr); }
/*********************************
* Non-contiguous partial load
*********************************/
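A hedged sketch of the pairwise partial loads added above: `nlane` counts 64-bit pairs, so nlane == 1 reads one pair from memory and fills the second pair with (fill_lo, fill_hi). Names and values below are illustrative only.

NPY_FINLINE npyv_s32 demo_load2_till(const npy_int32 *buf)
{
    // with buf = {7, 9, ...}: lanes become {7, 9, -1, -2}
    return npyv_load2_till_s32(buf, 1, -1, -2);
}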
@@ -265,6 +318,34 @@ npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_
// fill zero to rest lanes
NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
{ return npyv_loadn_till_s64(ptr, stride, nlane, 0); }
+
+//// 64-bit load over 32-bit stride
+NPY_FINLINE npyv_s32 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane,
+ npy_int32 fill_lo, npy_int32 fill_hi)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ return npyv_set_s32(ptr[0], ptr[1], fill_lo, fill_hi);
+ }
+ return npyv_loadn2_s32(ptr, stride);
+}
+NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ return (npyv_s32)npyv_set_s64(*(npy_int64*)ptr, 0);
+ }
+ return npyv_loadn2_s32(ptr, stride);
+}
+
+//// 128-bit load over 64-bit stride
+NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane,
+ npy_int64 fill_lo, npy_int64 fill_hi)
+{ assert(nlane > 0); (void)stride; (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); }
+
+NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
+{ assert(nlane > 0); (void)stride; (void)nlane; return npyv_load_s64(ptr); }
+
/*********************************
* Partial store
*********************************/
@@ -273,7 +354,7 @@ NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a
{
assert(nlane > 0);
#ifdef NPY_HAVE_VX
- unsigned blane = (unsigned short)nlane;
+ unsigned blane = (nlane > 4) ? 4 : nlane;
vec_store_len(a, ptr, blane*4-1);
#else
switch(nlane) {
@@ -297,7 +378,7 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a
{
assert(nlane > 0);
#ifdef NPY_HAVE_VX
- unsigned blane = (unsigned short)nlane;
+ unsigned blane = (nlane > 2) ? 2 : nlane;
vec_store_len(a, (signed long long*)ptr, blane*8-1);
#else
if (nlane == 1) {
@@ -307,6 +388,18 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a
npyv_store_s64(ptr, a);
#endif
}
+
+//// 64-bit nlane
+NPY_FINLINE void npyv_store2_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a)
+{ npyv_store_till_s64((npy_int64*)ptr, nlane, (npyv_s64)a); }
+
+//// 128-bit nlane
+NPY_FINLINE void npyv_store2_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0); (void)nlane;
+ npyv_store_s64(ptr, a);
+}
+
/*********************************
* Non-contiguous partial store
*********************************/
@@ -314,16 +407,21 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a
NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
{
assert(nlane > 0);
+ ptr[stride*0] = vec_extract(a, 0);
switch(nlane) {
- default:
- ptr[stride*3] = vec_extract(a, 3);
- case 3:
- ptr[stride*2] = vec_extract(a, 2);
+ case 1:
+ return;
case 2:
ptr[stride*1] = vec_extract(a, 1);
- case 1:
- ptr[stride*0] = vec_extract(a, 0);
- break;
+ return;
+ case 3:
+ ptr[stride*1] = vec_extract(a, 1);
+ ptr[stride*2] = vec_extract(a, 2);
+ return;
+ default:
+ ptr[stride*1] = vec_extract(a, 1);
+ ptr[stride*2] = vec_extract(a, 2);
+ ptr[stride*3] = vec_extract(a, 3);
}
}
//// 64
@@ -336,6 +434,21 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
}
npyv_storen_s64(ptr, stride, a);
}
+
+//// 64-bit store over 32-bit stride
+NPY_FINLINE void npyv_storen2_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+ npyv_storel_s32(ptr, a);
+ if (nlane > 1) {
+ npyv_storeh_s32(ptr + stride, a);
+ }
+}
+
+//// 128-bit store over 64-bit stride
+NPY_FINLINE void npyv_storen2_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a)
+{ assert(nlane > 0); (void)stride; (void)nlane; npyv_store_s64(ptr, a); }
+
/*****************************************************************
* Implement partial load/store for u32/f32/u64/f64... via casting
*****************************************************************/
@@ -346,7 +459,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
union { \
npyv_lanetype_##F_SFX from_##F_SFX; \
npyv_lanetype_##T_SFX to_##T_SFX; \
- } pun = {.from_##F_SFX = fill}; \
+ } pun; \
+ pun.from_##F_SFX = fill; \
return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX( \
(const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX \
)); \
@@ -358,7 +472,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
union { \
npyv_lanetype_##F_SFX from_##F_SFX; \
npyv_lanetype_##T_SFX to_##T_SFX; \
- } pun = {.from_##F_SFX = fill}; \
+ } pun; \
+ pun.from_##F_SFX = fill; \
return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX( \
(const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX \
)); \
@@ -401,6 +516,114 @@ NPYV_IMPL_VEC_REST_PARTIAL_TYPES(f32, s32)
NPYV_IMPL_VEC_REST_PARTIAL_TYPES(u64, s64)
NPYV_IMPL_VEC_REST_PARTIAL_TYPES(f64, s64)
+// 128-bit/64-bit stride
+#define NPYV_IMPL_VEC_REST_PARTIAL_TYPES_PAIR(F_SFX, T_SFX) \
+ NPY_FINLINE npyv_##F_SFX npyv_load2_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, \
+ npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \
+ { \
+ union pun { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ }; \
+ union pun pun_lo; \
+ union pun pun_hi; \
+ pun_lo.from_##F_SFX = fill_lo; \
+ pun_hi.from_##F_SFX = fill_hi; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane, pun_lo.to_##T_SFX, pun_hi.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn2_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \
+ npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \
+ { \
+ union pun { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ }; \
+ union pun pun_lo; \
+ union pun pun_hi; \
+ pun_lo.from_##F_SFX = fill_lo; \
+ pun_hi.from_##F_SFX = fill_hi; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun_lo.to_##T_SFX, \
+ pun_hi.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_load2_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn2_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \
+ )); \
+ } \
+ NPY_FINLINE void npyv_store2_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_store2_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ } \
+ NPY_FINLINE void npyv_storen2_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_storen2_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ }
+
+NPYV_IMPL_VEC_REST_PARTIAL_TYPES_PAIR(u32, s32)
+#if NPY_SIMD_F32
+NPYV_IMPL_VEC_REST_PARTIAL_TYPES_PAIR(f32, s32)
+#endif
+NPYV_IMPL_VEC_REST_PARTIAL_TYPES_PAIR(u64, s64)
+NPYV_IMPL_VEC_REST_PARTIAL_TYPES_PAIR(f64, s64)
+
+/************************************************************
+ * de-interleave load / interleave contiguous store
+ ************************************************************/
+// two channels
+#define NPYV_IMPL_VEC_MEM_INTERLEAVE(SFX) \
+ NPY_FINLINE npyv_##SFX##x2 npyv_zip_##SFX(npyv_##SFX, npyv_##SFX); \
+ NPY_FINLINE npyv_##SFX##x2 npyv_unzip_##SFX(npyv_##SFX, npyv_##SFX); \
+ NPY_FINLINE npyv_##SFX##x2 npyv_load_##SFX##x2( \
+ const npyv_lanetype_##SFX *ptr \
+ ) { \
+ return npyv_unzip_##SFX( \
+ npyv_load_##SFX(ptr), npyv_load_##SFX(ptr+npyv_nlanes_##SFX) \
+ ); \
+ } \
+ NPY_FINLINE void npyv_store_##SFX##x2( \
+ npyv_lanetype_##SFX *ptr, npyv_##SFX##x2 v \
+ ) { \
+ npyv_##SFX##x2 zip = npyv_zip_##SFX(v.val[0], v.val[1]); \
+ npyv_store_##SFX(ptr, zip.val[0]); \
+ npyv_store_##SFX(ptr + npyv_nlanes_##SFX, zip.val[1]); \
+ }
+
+NPYV_IMPL_VEC_MEM_INTERLEAVE(u8)
+NPYV_IMPL_VEC_MEM_INTERLEAVE(s8)
+NPYV_IMPL_VEC_MEM_INTERLEAVE(u16)
+NPYV_IMPL_VEC_MEM_INTERLEAVE(s16)
+NPYV_IMPL_VEC_MEM_INTERLEAVE(u32)
+NPYV_IMPL_VEC_MEM_INTERLEAVE(s32)
+NPYV_IMPL_VEC_MEM_INTERLEAVE(u64)
+NPYV_IMPL_VEC_MEM_INTERLEAVE(s64)
+#if NPY_SIMD_F32
+NPYV_IMPL_VEC_MEM_INTERLEAVE(f32)
+#endif
+NPYV_IMPL_VEC_MEM_INTERLEAVE(f64)
+
/*********************************
* Lookup table
*********************************/
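A hedged usage sketch of the two-channel load/store pair defined above: it conjugates four complex64 values in place. The function name and layout are illustrative, and NPY_SIMD_F32 plus a 128-bit vector width are assumed.

// data points at 4 complex64 values laid out as {re0, im0, re1, im1, ...}.
NPY_FINLINE void conj4_cfloat(float *data)
{
    npyv_f32x2 ri = npyv_load_f32x2(data);   // val[0] = real lanes, val[1] = imag lanes
    ri.val[1] = npyv_xor_f32(ri.val[1], npyv_setall_f32(-0.0f)); // flip imag signs
    npyv_store_f32x2(data, ri);              // zip back to interleaved form
}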
diff --git a/numpy/core/src/common/simd/vec/misc.h b/numpy/core/src/common/simd/vec/misc.h
index 7ea0f21f6..79c194d90 100644
--- a/numpy/core/src/common/simd/vec/misc.h
+++ b/numpy/core/src/common/simd/vec/misc.h
@@ -26,28 +26,28 @@
#define NPYV_IMPL_VEC_SPLTW(T_VEC, V) ((T_VEC){V, V, V, V})
#define NPYV_IMPL_VEC_SPLTD(T_VEC, V) ((T_VEC){V, V})
-#define npyv_setall_u8(VAL) NPYV_IMPL_VEC_SPLTB(npyv_u8, (unsigned char)VAL)
-#define npyv_setall_s8(VAL) NPYV_IMPL_VEC_SPLTB(npyv_s8, (signed char)VAL)
-#define npyv_setall_u16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_u16, (unsigned short)VAL)
-#define npyv_setall_s16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_s16, (short)VAL)
-#define npyv_setall_u32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_u32, (unsigned int)VAL)
-#define npyv_setall_s32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_s32, (int)VAL)
+#define npyv_setall_u8(VAL) NPYV_IMPL_VEC_SPLTB(npyv_u8, (unsigned char)(VAL))
+#define npyv_setall_s8(VAL) NPYV_IMPL_VEC_SPLTB(npyv_s8, (signed char)(VAL))
+#define npyv_setall_u16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_u16, (unsigned short)(VAL))
+#define npyv_setall_s16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_s16, (short)(VAL))
+#define npyv_setall_u32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_u32, (unsigned int)(VAL))
+#define npyv_setall_s32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_s32, (int)(VAL))
#if NPY_SIMD_F32
- #define npyv_setall_f32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_f32, VAL)
+ #define npyv_setall_f32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_f32, (VAL))
#endif
-#define npyv_setall_u64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_u64, (npy_uint64)VAL)
-#define npyv_setall_s64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_s64, (npy_int64)VAL)
+#define npyv_setall_u64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_u64, (npy_uint64)(VAL))
+#define npyv_setall_s64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_s64, (npy_int64)(VAL))
#define npyv_setall_f64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_f64, VAL)
// vector with specific values set to each lane and
// set a specific value to all remained lanes
-#define npyv_setf_u8(FILL, ...) ((npyv_u8){NPYV__SET_FILL_16(char, FILL, __VA_ARGS__)})
-#define npyv_setf_s8(FILL, ...) ((npyv_s8){NPYV__SET_FILL_16(char, FILL, __VA_ARGS__)})
-#define npyv_setf_u16(FILL, ...) ((npyv_u16){NPYV__SET_FILL_8(short, FILL, __VA_ARGS__)})
+#define npyv_setf_u8(FILL, ...) ((npyv_u8){NPYV__SET_FILL_16(unsigned char, FILL, __VA_ARGS__)})
+#define npyv_setf_s8(FILL, ...) ((npyv_s8){NPYV__SET_FILL_16(signed char, FILL, __VA_ARGS__)})
+#define npyv_setf_u16(FILL, ...) ((npyv_u16){NPYV__SET_FILL_8(unsigned short, FILL, __VA_ARGS__)})
#define npyv_setf_s16(FILL, ...) ((npyv_s16){NPYV__SET_FILL_8(short, FILL, __VA_ARGS__)})
-#define npyv_setf_u32(FILL, ...) ((npyv_u32){NPYV__SET_FILL_4(int, FILL, __VA_ARGS__)})
+#define npyv_setf_u32(FILL, ...) ((npyv_u32){NPYV__SET_FILL_4(unsigned int, FILL, __VA_ARGS__)})
#define npyv_setf_s32(FILL, ...) ((npyv_s32){NPYV__SET_FILL_4(int, FILL, __VA_ARGS__)})
-#define npyv_setf_u64(FILL, ...) ((npyv_u64){NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__)})
+#define npyv_setf_u64(FILL, ...) ((npyv_u64){NPYV__SET_FILL_2(npy_uint64, FILL, __VA_ARGS__)})
#define npyv_setf_s64(FILL, ...) ((npyv_s64){NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__)})
#if NPY_SIMD_F32
#define npyv_setf_f32(FILL, ...) ((npyv_f32){NPYV__SET_FILL_4(float, FILL, __VA_ARGS__)})
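The added parentheses around (VAL) matter as soon as the argument is an expression, because a cast binds tighter than most binary operators. A minimal standalone sketch of the pitfall; the values are illustrative.

#include <stdio.h>

int main(void)
{
    int n = 300;
    // old expansion of the u8 splat: the cast applied to `n` only
    unsigned char per_operand = (unsigned char)n / 2;   // (300 & 0xff) / 2 == 22
    // new expansion: the cast applied to the whole argument
    unsigned char whole_arg   = (unsigned char)(n / 2); // 150
    printf("%u %u\n", per_operand, whole_arg);          // prints: 22 150
    return 0;
}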
diff --git a/numpy/core/src/common/simd/vec/reorder.h b/numpy/core/src/common/simd/vec/reorder.h
index b60b9287d..3910980a2 100644
--- a/numpy/core/src/common/simd/vec/reorder.h
+++ b/numpy/core/src/common/simd/vec/reorder.h
@@ -68,6 +68,85 @@ NPYV_IMPL_VEC_COMBINE_ZIP(npyv_s64, s64)
#endif
NPYV_IMPL_VEC_COMBINE_ZIP(npyv_f64, f64)
+// deinterleave two vectors
+NPY_FINLINE npyv_u8x2 npyv_unzip_u8(npyv_u8 ab0, npyv_u8 ab1)
+{
+ const npyv_u8 idx_even = npyv_set_u8(
+ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ );
+ const npyv_u8 idx_odd = npyv_set_u8(
+ 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+ );
+ npyv_u8x2 r;
+ r.val[0] = vec_perm(ab0, ab1, idx_even);
+ r.val[1] = vec_perm(ab0, ab1, idx_odd);
+ return r;
+}
+NPY_FINLINE npyv_s8x2 npyv_unzip_s8(npyv_s8 ab0, npyv_s8 ab1)
+{
+ npyv_u8x2 ru = npyv_unzip_u8((npyv_u8)ab0, (npyv_u8)ab1);
+ npyv_s8x2 r;
+ r.val[0] = (npyv_s8)ru.val[0];
+ r.val[1] = (npyv_s8)ru.val[1];
+ return r;
+}
+NPY_FINLINE npyv_u16x2 npyv_unzip_u16(npyv_u16 ab0, npyv_u16 ab1)
+{
+ const npyv_u8 idx_even = npyv_set_u8(
+ 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
+ );
+ const npyv_u8 idx_odd = npyv_set_u8(
+ 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
+ );
+ npyv_u16x2 r;
+ r.val[0] = vec_perm(ab0, ab1, idx_even);
+ r.val[1] = vec_perm(ab0, ab1, idx_odd);
+ return r;
+}
+NPY_FINLINE npyv_s16x2 npyv_unzip_s16(npyv_s16 ab0, npyv_s16 ab1)
+{
+ npyv_u16x2 ru = npyv_unzip_u16((npyv_u16)ab0, (npyv_u16)ab1);
+ npyv_s16x2 r;
+ r.val[0] = (npyv_s16)ru.val[0];
+ r.val[1] = (npyv_s16)ru.val[1];
+ return r;
+}
+NPY_FINLINE npyv_u32x2 npyv_unzip_u32(npyv_u32 ab0, npyv_u32 ab1)
+{
+ npyv_u32 m0 = vec_mergeh(ab0, ab1);
+ npyv_u32 m1 = vec_mergel(ab0, ab1);
+ npyv_u32 r0 = vec_mergeh(m0, m1);
+ npyv_u32 r1 = vec_mergel(m0, m1);
+ npyv_u32x2 r;
+ r.val[0] = r0;
+ r.val[1] = r1;
+ return r;
+}
+NPY_FINLINE npyv_s32x2 npyv_unzip_s32(npyv_s32 ab0, npyv_s32 ab1)
+{
+ npyv_u32x2 ru = npyv_unzip_u32((npyv_u32)ab0, (npyv_u32)ab1);
+ npyv_s32x2 r;
+ r.val[0] = (npyv_s32)ru.val[0];
+ r.val[1] = (npyv_s32)ru.val[1];
+ return r;
+}
+#if NPY_SIMD_F32
+ NPY_FINLINE npyv_f32x2 npyv_unzip_f32(npyv_f32 ab0, npyv_f32 ab1)
+ {
+ npyv_u32x2 ru = npyv_unzip_u32((npyv_u32)ab0, (npyv_u32)ab1);
+ npyv_f32x2 r;
+ r.val[0] = (npyv_f32)ru.val[0];
+ r.val[1] = (npyv_f32)ru.val[1];
+ return r;
+ }
+#endif
+NPY_FINLINE npyv_u64x2 npyv_unzip_u64(npyv_u64 ab0, npyv_u64 ab1)
+{ return npyv_combine_u64(ab0, ab1); }
+NPY_FINLINE npyv_s64x2 npyv_unzip_s64(npyv_s64 ab0, npyv_s64 ab1)
+{ return npyv_combine_s64(ab0, ab1); }
+NPY_FINLINE npyv_f64x2 npyv_unzip_f64(npyv_f64 ab0, npyv_f64 ab1)
+{ return npyv_combine_f64(ab0, ab1); }
+
// Reverse elements of each 64-bit lane
NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a)
{
@@ -111,4 +190,24 @@ NPY_FINLINE npyv_s32 npyv_rev64_s32(npyv_s32 a)
{ return (npyv_f32)npyv_rev64_u32((npyv_u32)a); }
#endif
+// Permute the elements of each 128-bit lane using an immediate index
+// for each element.
+#define npyv_permi128_u32(A, E0, E1, E2, E3) \
+ vec_perm(A, A, npyv_set_u8( \
+ (E0<<2), (E0<<2)+1, (E0<<2)+2, (E0<<2)+3, \
+ (E1<<2), (E1<<2)+1, (E1<<2)+2, (E1<<2)+3, \
+ (E2<<2), (E2<<2)+1, (E2<<2)+2, (E2<<2)+3, \
+ (E3<<2), (E3<<2)+1, (E3<<2)+2, (E3<<2)+3 \
+ ))
+#define npyv_permi128_s32 npyv_permi128_u32
+#define npyv_permi128_f32 npyv_permi128_u32
+
+#if defined(__IBMC__) || defined(vec_permi)
+ #define npyv_permi128_u64(A, E0, E1) vec_permi(A, A, ((E0)<<1) | (E1))
+#else
+ #define npyv_permi128_u64(A, E0, E1) vec_xxpermdi(A, A, ((E0)<<1) | (E1))
+#endif
+#define npyv_permi128_s64 npyv_permi128_u64
+#define npyv_permi128_f64 npyv_permi128_u64
+
#endif // _NPY_SIMD_VEC_REORDER_H
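A short usage sketch of the new 64-bit permute: selecting lanes (1, 0) swaps the two doubles, a common step when shuffling real and imaginary parts. The function name is illustrative.

NPY_FINLINE npyv_f64 swap_f64_halves(npyv_f64 v)
{
    // result lane 0 = input lane 1, result lane 1 = input lane 0
    return npyv_permi128_f64(v, 1, 0);
}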
diff --git a/numpy/core/src/common/simd/vec/vec.h b/numpy/core/src/common/simd/vec/vec.h
index abcd33ce1..1d4508669 100644
--- a/numpy/core/src/common/simd/vec/vec.h
+++ b/numpy/core/src/common/simd/vec/vec.h
@@ -39,8 +39,10 @@
#ifdef NPY_HAVE_VX
#define NPY_SIMD_BIGENDIAN 1
+ #define NPY_SIMD_CMPSIGNAL 0
#else
#define NPY_SIMD_BIGENDIAN 0
+ #define NPY_SIMD_CMPSIGNAL 1
#endif
typedef __vector unsigned char npyv_u8;
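NPY_SIMD_CMPSIGNAL appears in every backend touched by this diff. A hedged sketch of how a caller might key off it, assuming the flag means the backend's native comparisons can raise FP-invalid on quiet NaNs (which is how the 0/1 split between the AVX2/VX and SSE/VSX definitions reads); the helper names used are existing universal intrinsics, but the function itself is illustrative.

// Exception-free "a < b": when comparisons may signal, replace NaN lanes
// with zero before comparing and mask them out of the result afterwards.
NPY_FINLINE npyv_b64 lt_noexc_f64(npyv_f64 a, npyv_f64 b)
{
#if NPY_SIMD_CMPSIGNAL
    npyv_b64 nn_a = npyv_notnan_f64(a);
    npyv_b64 nn_b = npyv_notnan_f64(b);
    npyv_f64 qa   = npyv_select_f64(nn_a, a, npyv_zero_f64());
    npyv_f64 qb   = npyv_select_f64(nn_b, b, npyv_zero_f64());
    return npyv_and_b64(npyv_and_b64(nn_a, nn_b), npyv_cmplt_f64(qa, qb));
#else
    // quiet comparisons: NaN lanes already compare false
    return npyv_cmplt_f64(a, b);
#endif
}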