Diffstat (limited to 'numpy/core/src/common/simd')
30 files changed, 2381 insertions, 268 deletions
diff --git a/numpy/core/src/common/simd/avx2/arithmetic.h b/numpy/core/src/common/simd/avx2/arithmetic.h index ad9688338..58d842a6d 100644 --- a/numpy/core/src/common/simd/avx2/arithmetic.h +++ b/numpy/core/src/common/simd/avx2/arithmetic.h @@ -247,6 +247,10 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor) // negate multiply and subtract, -(a*b) - c #define npyv_nmulsub_f32 _mm256_fnmsub_ps #define npyv_nmulsub_f64 _mm256_fnmsub_pd + // multiply, add for odd elements and subtract even elements. + // (a * b) -+ c + #define npyv_muladdsub_f32 _mm256_fmaddsub_ps + #define npyv_muladdsub_f64 _mm256_fmaddsub_pd #else // multiply and add, a*b + c NPY_FINLINE npyv_f32 npyv_muladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) @@ -274,6 +278,13 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor) npyv_f64 neg_a = npyv_xor_f64(a, npyv_setall_f64(-0.0)); return npyv_sub_f64(npyv_mul_f64(neg_a, b), c); } + // multiply, add for odd elements and subtract even elements. + // (a * b) -+ c + NPY_FINLINE npyv_f32 npyv_muladdsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) + { return _mm256_addsub_ps(npyv_mul_f32(a, b), c); } + NPY_FINLINE npyv_f64 npyv_muladdsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) + { return _mm256_addsub_pd(npyv_mul_f64(a, b), c); } + #endif // !NPY_HAVE_FMA3 /*************************** diff --git a/numpy/core/src/common/simd/avx2/avx2.h b/numpy/core/src/common/simd/avx2/avx2.h index 8cb74df2b..d64f3c6d6 100644 --- a/numpy/core/src/common/simd/avx2/avx2.h +++ b/numpy/core/src/common/simd/avx2/avx2.h @@ -11,6 +11,7 @@ #define NPY_SIMD_FMA3 0 // fast emulated #endif #define NPY_SIMD_BIGENDIAN 0 +#define NPY_SIMD_CMPSIGNAL 0 // Enough limit to allow us to use _mm256_i32gather_* #define NPY_SIMD_MAXLOAD_STRIDE32 (0x7fffffff / 8) diff --git a/numpy/core/src/common/simd/avx2/memory.h b/numpy/core/src/common/simd/avx2/memory.h index 410c35dc8..81144a36b 100644 --- a/numpy/core/src/common/simd/avx2/memory.h +++ b/numpy/core/src/common/simd/avx2/memory.h @@ -84,17 +84,6 @@ NPY_FINLINE npyv_s32 npyv_loadn_s32(const npy_int32 *ptr, npy_intp stride) NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride) { return _mm256_castsi256_ps(npyv_loadn_u32((const npy_uint32*)ptr, stride)); } //// 64 -#if 0 // slower -NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride) -{ - const __m256i idx = npyv_set_s64(0, 1*stride, 2*stride, 3*stride); - return _mm256_i64gather_epi64((const void*)ptr, idx, 8); -} -NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride) -{ return npyv_loadn_u64((const npy_uint64*)ptr, stride); } -NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride) -{ return _mm256_castsi256_pd(npyv_loadn_u64((const npy_uint64*)ptr, stride)); } -#endif NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride) { __m128d a0 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)ptr)); @@ -107,6 +96,32 @@ NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride) { return _mm256_castpd_si256(npyv_loadn_f64((const double*)ptr, stride)); } NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride) { return _mm256_castpd_si256(npyv_loadn_f64((const double*)ptr, stride)); } + +//// 64-bit load over 32-bit stride +NPY_FINLINE npyv_f32 npyv_loadn2_f32(const float *ptr, npy_intp stride) +{ + __m128d a0 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)ptr)); + __m128d a2 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)(ptr + stride*2))); + __m128d a01 = 
_mm_loadh_pd(a0, (const double*)(ptr + stride)); + __m128d a23 = _mm_loadh_pd(a2, (const double*)(ptr + stride*3)); + return _mm256_castpd_ps(_mm256_insertf128_pd(_mm256_castpd128_pd256(a01), a23, 1)); +} +NPY_FINLINE npyv_u32 npyv_loadn2_u32(const npy_uint32 *ptr, npy_intp stride) +{ return _mm256_castps_si256(npyv_loadn2_f32((const float*)ptr, stride)); } +NPY_FINLINE npyv_s32 npyv_loadn2_s32(const npy_int32 *ptr, npy_intp stride) +{ return _mm256_castps_si256(npyv_loadn2_f32((const float*)ptr, stride)); } + +//// 128-bit load over 64-bit stride +NPY_FINLINE npyv_f64 npyv_loadn2_f64(const double *ptr, npy_intp stride) +{ + __m256d a = npyv_loadl_f64(ptr); + return _mm256_insertf128_pd(a, _mm_loadu_pd(ptr + stride), 1); +} +NPY_FINLINE npyv_u64 npyv_loadn2_u64(const npy_uint64 *ptr, npy_intp stride) +{ return _mm256_castpd_si256(npyv_loadn2_f64((const double*)ptr, stride)); } +NPY_FINLINE npyv_s64 npyv_loadn2_s64(const npy_int64 *ptr, npy_intp stride) +{ return _mm256_castpd_si256(npyv_loadn2_f64((const double*)ptr, stride)); } + /*************************** * Non-contiguous Store ***************************/ @@ -143,6 +158,32 @@ NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a) NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a) { npyv_storen_f64((double*)ptr, stride, _mm256_castsi256_pd(a)); } +//// 64-bit store over 32-bit stride +NPY_FINLINE void npyv_storen2_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a) +{ + __m128d a0 = _mm256_castpd256_pd128(_mm256_castsi256_pd(a)); + __m128d a1 = _mm256_extractf128_pd(_mm256_castsi256_pd(a), 1); + _mm_storel_pd((double*)ptr, a0); + _mm_storeh_pd((double*)(ptr + stride), a0); + _mm_storel_pd((double*)(ptr + stride*2), a1); + _mm_storeh_pd((double*)(ptr + stride*3), a1); +} +NPY_FINLINE void npyv_storen2_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a) +{ npyv_storen2_u32((npy_uint32*)ptr, stride, a); } +NPY_FINLINE void npyv_storen2_f32(float *ptr, npy_intp stride, npyv_f32 a) +{ npyv_storen2_u32((npy_uint32*)ptr, stride, _mm256_castps_si256(a)); } + +//// 128-bit store over 64-bit stride +NPY_FINLINE void npyv_storen2_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a) +{ + npyv_storel_u64(ptr, a); + npyv_storeh_u64(ptr + stride, a); +} +NPY_FINLINE void npyv_storen2_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a) +{ npyv_storen2_u64((npy_uint64*)ptr, stride, a); } +NPY_FINLINE void npyv_storen2_f64(double *ptr, npy_intp stride, npyv_f64 a) +{ npyv_storen2_u64((npy_uint64*)ptr, stride, _mm256_castpd_si256(a)); } + /********************************* * Partial Load *********************************/ @@ -174,7 +215,7 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n const __m256i steps = npyv_set_s64(0, 1, 2, 3); __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane); __m256i mask = _mm256_cmpgt_epi64(vnlane, steps); - __m256i payload = _mm256_maskload_epi64((const void*)ptr, mask); + __m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask); return _mm256_blendv_epi8(vfill, payload, mask); } // fill zero to rest lanes @@ -184,7 +225,45 @@ NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) const __m256i steps = npyv_set_s64(0, 1, 2, 3); __m256i vnlane = npyv_setall_s64(nlane > 4 ? 
4 : (int)nlane); __m256i mask = _mm256_cmpgt_epi64(vnlane, steps); - return _mm256_maskload_epi64((const void*)ptr, mask); + return _mm256_maskload_epi64((const long long*)ptr, mask); +} + +//// 64-bit nlane +NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane, + npy_int32 fill_lo, npy_int32 fill_hi) +{ + assert(nlane > 0); + const __m256i vfill = npyv_set_s32( + fill_lo, fill_hi, fill_lo, fill_hi, + fill_lo, fill_hi, fill_lo, fill_hi + ); + const __m256i steps = npyv_set_s64(0, 1, 2, 3); + __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane); + __m256i mask = _mm256_cmpgt_epi64(vnlane, steps); + __m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask); + return _mm256_blendv_epi8(vfill, payload, mask); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) +{ return npyv_load_tillz_s64((const npy_int64*)ptr, nlane); } + +/// 128-bit nlane +NPY_FINLINE npyv_u64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) +{ + assert(nlane > 0); + npy_int64 m = -((npy_int64)(nlane > 1)); + __m256i mask = npyv_set_s64(-1, -1, m, m); + return _mm256_maskload_epi64((const long long*)ptr, mask); +} +// fill zero to rest lanes +NPY_FINLINE npyv_u64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane, + npy_int64 fill_lo, npy_int64 fill_hi) +{ + const __m256i vfill = npyv_set_s64(0, 0, fill_lo, fill_hi); + npy_int64 m = -((npy_int64)(nlane > 1)); + __m256i mask = npyv_set_s64(-1, -1, m, m); + __m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask); + return _mm256_blendv_epi8(vfill, payload, mask); } /********************************* * Non-contiguous partial load @@ -216,12 +295,53 @@ npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_ const __m256i steps = npyv_set_s64(0, 1, 2, 3); __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane); __m256i mask = _mm256_cmpgt_epi64(vnlane, steps); - return _mm256_mask_i64gather_epi64(vfill, (const void*)ptr, idx, mask, 8); + return _mm256_mask_i64gather_epi64(vfill, (const long long*)ptr, idx, mask, 8); } // fill zero to rest lanes NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane) { return npyv_loadn_till_s64(ptr, stride, nlane, 0); } + +//// 64-bit load over 32-bit stride +NPY_FINLINE npyv_s64 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, + npy_int32 fill_lo, npy_int32 fill_hi) +{ + assert(nlane > 0); + const __m256i vfill = npyv_set_s32( + fill_lo, fill_hi, fill_lo, fill_hi, + fill_lo, fill_hi, fill_lo, fill_hi + ); + const __m256i idx = npyv_set_s64(0, 1*stride, 2*stride, 3*stride); + const __m256i steps = npyv_set_s64(0, 1, 2, 3); + __m256i vnlane = npyv_setall_s64(nlane > 4 ? 
4 : (int)nlane); + __m256i mask = _mm256_cmpgt_epi64(vnlane, steps); + return _mm256_mask_i64gather_epi64(vfill, (const long long*)ptr, idx, mask, 4); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane) +{ return npyv_loadn2_till_s32(ptr, stride, nlane, 0, 0); } + +//// 128-bit load over 64-bit stride +NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, + npy_int64 fill_lo, npy_int64 fill_hi) +{ + assert(nlane > 0); + __m256i a = npyv_loadl_s64(ptr); +#if defined(_MSC_VER) && defined(_M_IX86) + __m128i fill =_mm_setr_epi32( + (int)fill_lo, (int)(fill_lo >> 32), + (int)fill_hi, (int)(fill_hi >> 32) + ); +#else + __m128i fill = _mm_set_epi64x(fill_hi, fill_lo); +#endif + __m128i b = nlane > 1 ? _mm_loadu_si128((const __m128i*)(ptr + stride)) : fill; + return _mm256_inserti128_si256(a, b, 1); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane) +{ return npyv_loadn2_till_s64(ptr, stride, nlane, 0, 0); } + /********************************* * Partial store *********************************/ @@ -241,7 +361,21 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a const __m256i steps = npyv_set_s64(0, 1, 2, 3); __m256i vnlane = npyv_setall_s64(nlane > 8 ? 8 : (int)nlane); __m256i mask = _mm256_cmpgt_epi64(vnlane, steps); - _mm256_maskstore_epi64((void*)ptr, mask, a); + _mm256_maskstore_epi64((long long*)ptr, mask, a); +} + +//// 64-bit nlane +NPY_FINLINE void npyv_store2_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a) +{ npyv_store_till_s64((npy_int64*)ptr, nlane, a); } + +//// 128-bit nlane +NPY_FINLINE void npyv_store2_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a) +{ + assert(nlane > 0); + npyv_storel_s64(ptr, a); + if (nlane > 1) { + npyv_storeh_s64(ptr + 2, a); + } } /********************************* * Non-contiguous partial store @@ -252,23 +386,52 @@ NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp assert(nlane > 0); __m128i a0 = _mm256_castsi256_si128(a); __m128i a1 = _mm256_extracti128_si256(a, 1); + + ptr[stride*0] = _mm_extract_epi32(a0, 0); switch(nlane) { - default: - ptr[stride*7] = _mm_extract_epi32(a1, 3); - case 7: - ptr[stride*6] = _mm_extract_epi32(a1, 2); - case 6: - ptr[stride*5] = _mm_extract_epi32(a1, 1); + case 1: + return; + case 2: + ptr[stride*1] = _mm_extract_epi32(a0, 1); + return; + case 3: + ptr[stride*1] = _mm_extract_epi32(a0, 1); + ptr[stride*2] = _mm_extract_epi32(a0, 2); + return; + case 4: + ptr[stride*1] = _mm_extract_epi32(a0, 1); + ptr[stride*2] = _mm_extract_epi32(a0, 2); + ptr[stride*3] = _mm_extract_epi32(a0, 3); + return; case 5: + ptr[stride*1] = _mm_extract_epi32(a0, 1); + ptr[stride*2] = _mm_extract_epi32(a0, 2); + ptr[stride*3] = _mm_extract_epi32(a0, 3); ptr[stride*4] = _mm_extract_epi32(a1, 0); - case 4: + return; + case 6: + ptr[stride*1] = _mm_extract_epi32(a0, 1); + ptr[stride*2] = _mm_extract_epi32(a0, 2); ptr[stride*3] = _mm_extract_epi32(a0, 3); - case 3: + ptr[stride*4] = _mm_extract_epi32(a1, 0); + ptr[stride*5] = _mm_extract_epi32(a1, 1); + return; + case 7: + ptr[stride*1] = _mm_extract_epi32(a0, 1); ptr[stride*2] = _mm_extract_epi32(a0, 2); - case 2: + ptr[stride*3] = _mm_extract_epi32(a0, 3); + ptr[stride*4] = _mm_extract_epi32(a1, 0); + ptr[stride*5] = _mm_extract_epi32(a1, 1); + ptr[stride*6] = _mm_extract_epi32(a1, 2); + return; + default: ptr[stride*1] = 
_mm_extract_epi32(a0, 1); - case 1: - ptr[stride*0] = _mm_extract_epi32(a0, 0); + ptr[stride*2] = _mm_extract_epi32(a0, 2); + ptr[stride*3] = _mm_extract_epi32(a0, 3); + ptr[stride*4] = _mm_extract_epi32(a1, 0); + ptr[stride*5] = _mm_extract_epi32(a1, 1); + ptr[stride*6] = _mm_extract_epi32(a1, 2); + ptr[stride*7] = _mm_extract_epi32(a1, 3); } } //// 64 @@ -277,19 +440,60 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp assert(nlane > 0); __m128d a0 = _mm256_castpd256_pd128(_mm256_castsi256_pd(a)); __m128d a1 = _mm256_extractf128_pd(_mm256_castsi256_pd(a), 1); + double *dptr = (double*)ptr; + _mm_storel_pd(dptr, a0); switch(nlane) { - default: - _mm_storeh_pd(dptr + stride * 3, a1); + case 1: + return; + case 2: + _mm_storeh_pd(dptr + stride * 1, a0); + return; case 3: + _mm_storeh_pd(dptr + stride * 1, a0); _mm_storel_pd(dptr + stride * 2, a1); - case 2: + return; + default: _mm_storeh_pd(dptr + stride * 1, a0); + _mm_storel_pd(dptr + stride * 2, a1); + _mm_storeh_pd(dptr + stride * 3, a1); + } +} + +//// 64-bit store over 32-bit stride +NPY_FINLINE void npyv_storen2_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a) +{ + assert(nlane > 0); + __m128d a0 = _mm256_castpd256_pd128(_mm256_castsi256_pd(a)); + __m128d a1 = _mm256_extractf128_pd(_mm256_castsi256_pd(a), 1); + + _mm_storel_pd((double*)ptr, a0); + switch(nlane) { case 1: - _mm_storel_pd(dptr + stride * 0, a0); + return; + case 2: + _mm_storeh_pd((double*)(ptr + stride * 1), a0); + return; + case 3: + _mm_storeh_pd((double*)(ptr + stride * 1), a0); + _mm_storel_pd((double*)(ptr + stride * 2), a1); + return; + default: + _mm_storeh_pd((double*)(ptr + stride * 1), a0); + _mm_storel_pd((double*)(ptr + stride * 2), a1); + _mm_storeh_pd((double*)(ptr + stride * 3), a1); } } +//// 128-bit store over 64-bit stride +NPY_FINLINE void npyv_storen2_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a) +{ + assert(nlane > 0); + npyv_storel_s64(ptr, a); + if (nlane > 1) { + npyv_storeh_s64(ptr + stride, a); + } +} /***************************************************************************** * Implement partial load/store for u32/f32/u64/f64... 
via reinterpret cast *****************************************************************************/ @@ -300,7 +504,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp union { \ npyv_lanetype_##F_SFX from_##F_SFX; \ npyv_lanetype_##T_SFX to_##T_SFX; \ - } pun = {.from_##F_SFX = fill}; \ + } pun; \ + pun.from_##F_SFX = fill; \ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX( \ (const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX \ )); \ @@ -312,7 +517,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp union { \ npyv_lanetype_##F_SFX from_##F_SFX; \ npyv_lanetype_##T_SFX to_##T_SFX; \ - } pun = {.from_##F_SFX = fill}; \ + } pun; \ + pun.from_##F_SFX = fill; \ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX( \ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX \ )); \ @@ -353,6 +559,110 @@ NPYV_IMPL_AVX2_REST_PARTIAL_TYPES(f32, s32) NPYV_IMPL_AVX2_REST_PARTIAL_TYPES(u64, s64) NPYV_IMPL_AVX2_REST_PARTIAL_TYPES(f64, s64) +// 128-bit/64-bit stride (load/store pair) +#define NPYV_IMPL_AVX2_REST_PARTIAL_TYPES_PAIR(F_SFX, T_SFX) \ + NPY_FINLINE npyv_##F_SFX npyv_load2_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, \ + npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \ + { \ + union pun { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + }; \ + union pun pun_lo; \ + union pun pun_hi; \ + pun_lo.from_##F_SFX = fill_lo; \ + pun_hi.from_##F_SFX = fill_hi; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane, pun_lo.to_##T_SFX, pun_hi.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn2_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \ + npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \ + { \ + union pun { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + }; \ + union pun pun_lo; \ + union pun pun_hi; \ + pun_lo.from_##F_SFX = fill_lo; \ + pun_hi.from_##F_SFX = fill_hi; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun_lo.to_##T_SFX, \ + pun_hi.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_load2_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \ + { \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn2_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \ + { \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \ + )); \ + } \ + NPY_FINLINE void npyv_store2_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_store2_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } \ + NPY_FINLINE void npyv_storen2_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_storen2_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } + +NPYV_IMPL_AVX2_REST_PARTIAL_TYPES_PAIR(u32, s32) +NPYV_IMPL_AVX2_REST_PARTIAL_TYPES_PAIR(f32, s32) +NPYV_IMPL_AVX2_REST_PARTIAL_TYPES_PAIR(u64, s64) +NPYV_IMPL_AVX2_REST_PARTIAL_TYPES_PAIR(f64, 
s64) + +/************************************************************ + * de-interlave load / interleave contiguous store + ************************************************************/ +// two channels +#define NPYV_IMPL_AVX2_MEM_INTERLEAVE(SFX, ZSFX) \ + NPY_FINLINE npyv_##ZSFX##x2 npyv_zip_##ZSFX(npyv_##ZSFX, npyv_##ZSFX); \ + NPY_FINLINE npyv_##ZSFX##x2 npyv_unzip_##ZSFX(npyv_##ZSFX, npyv_##ZSFX); \ + NPY_FINLINE npyv_##SFX##x2 npyv_load_##SFX##x2( \ + const npyv_lanetype_##SFX *ptr \ + ) { \ + return npyv_unzip_##ZSFX( \ + npyv_load_##SFX(ptr), npyv_load_##SFX(ptr+npyv_nlanes_##SFX) \ + ); \ + } \ + NPY_FINLINE void npyv_store_##SFX##x2( \ + npyv_lanetype_##SFX *ptr, npyv_##SFX##x2 v \ + ) { \ + npyv_##SFX##x2 zip = npyv_zip_##ZSFX(v.val[0], v.val[1]); \ + npyv_store_##SFX(ptr, zip.val[0]); \ + npyv_store_##SFX(ptr + npyv_nlanes_##SFX, zip.val[1]); \ + } + +NPYV_IMPL_AVX2_MEM_INTERLEAVE(u8, u8) +NPYV_IMPL_AVX2_MEM_INTERLEAVE(s8, u8) +NPYV_IMPL_AVX2_MEM_INTERLEAVE(u16, u16) +NPYV_IMPL_AVX2_MEM_INTERLEAVE(s16, u16) +NPYV_IMPL_AVX2_MEM_INTERLEAVE(u32, u32) +NPYV_IMPL_AVX2_MEM_INTERLEAVE(s32, u32) +NPYV_IMPL_AVX2_MEM_INTERLEAVE(u64, u64) +NPYV_IMPL_AVX2_MEM_INTERLEAVE(s64, u64) +NPYV_IMPL_AVX2_MEM_INTERLEAVE(f32, f32) +NPYV_IMPL_AVX2_MEM_INTERLEAVE(f64, f64) + /********************************* * Lookup tables *********************************/ diff --git a/numpy/core/src/common/simd/avx2/operators.h b/numpy/core/src/common/simd/avx2/operators.h index c10267b21..7b9b6a344 100644 --- a/numpy/core/src/common/simd/avx2/operators.h +++ b/numpy/core/src/common/simd/avx2/operators.h @@ -205,7 +205,7 @@ NPY_FINLINE __m256i npyv_cmpge_u32(__m256i a, __m256i b) #define npyv_cmple_u64(A, B) npyv_cmpge_u64(B, A) #define npyv_cmple_s64(A, B) npyv_cmpge_s64(B, A) -// precision comparison +// precision comparison (ordered) #define npyv_cmpeq_f32(A, B) _mm256_castps_si256(_mm256_cmp_ps(A, B, _CMP_EQ_OQ)) #define npyv_cmpeq_f64(A, B) _mm256_castpd_si256(_mm256_cmp_pd(A, B, _CMP_EQ_OQ)) #define npyv_cmpneq_f32(A, B) _mm256_castps_si256(_mm256_cmp_ps(A, B, _CMP_NEQ_UQ)) diff --git a/numpy/core/src/common/simd/avx2/reorder.h b/numpy/core/src/common/simd/avx2/reorder.h index 4d6ec8f75..9ebe0e7f4 100644 --- a/numpy/core/src/common/simd/avx2/reorder.h +++ b/numpy/core/src/common/simd/avx2/reorder.h @@ -94,6 +94,75 @@ NPY_FINLINE npyv_f64x2 npyv_zip_f64(__m256d a, __m256d b) return npyv_combine_f64(ab0, ab1); } +// deinterleave two vectors +NPY_FINLINE npyv_u8x2 npyv_unzip_u8(npyv_u8 ab0, npyv_u8 ab1) +{ + const __m256i idx = _mm256_setr_epi8( + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 + ); + __m256i ab_03 = _mm256_shuffle_epi8(ab0, idx); + __m256i ab_12 = _mm256_shuffle_epi8(ab1, idx); + npyv_u8x2 ab_lh = npyv_combine_u8(ab_03, ab_12); + npyv_u8x2 r; + r.val[0] = _mm256_unpacklo_epi64(ab_lh.val[0], ab_lh.val[1]); + r.val[1] = _mm256_unpackhi_epi64(ab_lh.val[0], ab_lh.val[1]); + return r; +} +#define npyv_unzip_s8 npyv_unzip_u8 + +NPY_FINLINE npyv_u16x2 npyv_unzip_u16(npyv_u16 ab0, npyv_u16 ab1) +{ + const __m256i idx = _mm256_setr_epi8( + 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15, + 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15 + ); + __m256i ab_03 = _mm256_shuffle_epi8(ab0, idx); + __m256i ab_12 = _mm256_shuffle_epi8(ab1, idx); + npyv_u16x2 ab_lh = npyv_combine_u16(ab_03, ab_12); + npyv_u16x2 r; + r.val[0] = _mm256_unpacklo_epi64(ab_lh.val[0], ab_lh.val[1]); + r.val[1] = _mm256_unpackhi_epi64(ab_lh.val[0], ab_lh.val[1]); + return r; +} 
+#define npyv_unzip_s16 npyv_unzip_u16 + +NPY_FINLINE npyv_u32x2 npyv_unzip_u32(npyv_u32 ab0, npyv_u32 ab1) +{ + const __m256i idx = npyv_set_u32(0, 2, 4, 6, 1, 3, 5, 7); + __m256i abl = _mm256_permutevar8x32_epi32(ab0, idx); + __m256i abh = _mm256_permutevar8x32_epi32(ab1, idx); + return npyv_combine_u32(abl, abh); +} +#define npyv_unzip_s32 npyv_unzip_u32 + +NPY_FINLINE npyv_u64x2 npyv_unzip_u64(npyv_u64 ab0, npyv_u64 ab1) +{ + npyv_u64x2 ab_lh = npyv_combine_u64(ab0, ab1); + npyv_u64x2 r; + r.val[0] = _mm256_unpacklo_epi64(ab_lh.val[0], ab_lh.val[1]); + r.val[1] = _mm256_unpackhi_epi64(ab_lh.val[0], ab_lh.val[1]); + return r; +} +#define npyv_unzip_s64 npyv_unzip_u64 + +NPY_FINLINE npyv_f32x2 npyv_unzip_f32(npyv_f32 ab0, npyv_f32 ab1) +{ + const __m256i idx = npyv_set_u32(0, 2, 4, 6, 1, 3, 5, 7); + __m256 abl = _mm256_permutevar8x32_ps(ab0, idx); + __m256 abh = _mm256_permutevar8x32_ps(ab1, idx); + return npyv_combine_f32(abl, abh); +} + +NPY_FINLINE npyv_f64x2 npyv_unzip_f64(npyv_f64 ab0, npyv_f64 ab1) +{ + npyv_f64x2 ab_lh = npyv_combine_f64(ab0, ab1); + npyv_f64x2 r; + r.val[0] = _mm256_unpacklo_pd(ab_lh.val[0], ab_lh.val[1]); + r.val[1] = _mm256_unpackhi_pd(ab_lh.val[0], ab_lh.val[1]); + return r; +} + // Reverse elements of each 64-bit lane NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a) { @@ -126,4 +195,22 @@ NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a) return _mm256_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)); } +// Permuting the elements of each 128-bit lane by immediate index for +// each element. +#define npyv_permi128_u32(A, E0, E1, E2, E3) \ + _mm256_shuffle_epi32(A, _MM_SHUFFLE(E3, E2, E1, E0)) + +#define npyv_permi128_s32 npyv_permi128_u32 + +#define npyv_permi128_u64(A, E0, E1) \ + _mm256_shuffle_epi32(A, _MM_SHUFFLE(((E1)<<1)+1, ((E1)<<1), ((E0)<<1)+1, ((E0)<<1))) + +#define npyv_permi128_s64 npyv_permi128_u64 + +#define npyv_permi128_f32(A, E0, E1, E2, E3) \ + _mm256_permute_ps(A, _MM_SHUFFLE(E3, E2, E1, E0)) + +#define npyv_permi128_f64(A, E0, E1) \ + _mm256_permute_pd(A, ((E1)<<3) | ((E0)<<2) | ((E1)<<1) | (E0)) + #endif // _NPY_SIMD_AVX2_REORDER_H diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h b/numpy/core/src/common/simd/avx512/arithmetic.h index 1290dc0ad..a63da87d0 100644 --- a/numpy/core/src/common/simd/avx512/arithmetic.h +++ b/numpy/core/src/common/simd/avx512/arithmetic.h @@ -349,6 +349,10 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor) // negate multiply and subtract, -(a*b) - c #define npyv_nmulsub_f32 _mm512_fnmsub_ps #define npyv_nmulsub_f64 _mm512_fnmsub_pd +// multiply, add for odd elements and subtract even elements. +// (a * b) -+ c +#define npyv_muladdsub_f32 _mm512_fmaddsub_ps +#define npyv_muladdsub_f64 _mm512_fmaddsub_pd /*************************** * Summation: Calculates the sum of all vector elements. 
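Note: the npyv_muladdsub_f32/f64 intrinsics added in the arithmetic headers implement (a * b) -+ c, i.e. even-indexed lanes compute a*b - c while odd-indexed lanes compute a*b + c; they map to the fmaddsub instructions where FMA3/AVX-512 is available, and otherwise fall back to a multiply followed by addsub (AVX2) or a sign-bit xor plus muladd (NEON), as shown in the hunks above. The following is only a minimal scalar sketch of that lane convention; muladdsub_ref and the 4-lane width are hypothetical and not part of the NumPy API.

#include <stdio.h>

static void muladdsub_ref(const float *a, const float *b, const float *c,
                          float *out, int nlanes)
{
    for (int i = 0; i < nlanes; ++i) {
        /* even lane: multiply and subtract, odd lane: multiply and add */
        out[i] = (i & 1) ? a[i] * b[i] + c[i]
                         : a[i] * b[i] - c[i];
    }
}

int main(void)
{
    float a[4] = {1, 2, 3, 4}, b[4] = {5, 6, 7, 8}, c[4] = {1, 1, 1, 1}, r[4];
    muladdsub_ref(a, b, c, r, 4);
    for (int i = 0; i < 4; ++i)
        printf("%g ", r[i]);   /* prints: 4 13 20 33 */
    printf("\n");
    return 0;
}

This even/odd split is the pattern commonly used for vectorized complex multiplication on interleaved data, where the real component needs a subtract and the imaginary component an add.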
diff --git a/numpy/core/src/common/simd/avx512/avx512.h b/numpy/core/src/common/simd/avx512/avx512.h index 0946e6443..aa6abe256 100644 --- a/numpy/core/src/common/simd/avx512/avx512.h +++ b/numpy/core/src/common/simd/avx512/avx512.h @@ -7,6 +7,7 @@ #define NPY_SIMD_F64 1 #define NPY_SIMD_FMA3 1 // native support #define NPY_SIMD_BIGENDIAN 0 +#define NPY_SIMD_CMPSIGNAL 0 // Enough limit to allow us to use _mm512_i32gather_* and _mm512_i32scatter_* #define NPY_SIMD_MAXLOAD_STRIDE32 (0x7fffffff / 16) #define NPY_SIMD_MAXSTORE_STRIDE32 (0x7fffffff / 16) diff --git a/numpy/core/src/common/simd/avx512/maskop.h b/numpy/core/src/common/simd/avx512/maskop.h index d1c188390..88fa4a68c 100644 --- a/numpy/core/src/common/simd/avx512/maskop.h +++ b/numpy/core/src/common/simd/avx512/maskop.h @@ -51,4 +51,17 @@ NPYV_IMPL_AVX512_MASK_ADDSUB(s64, b64, epi64) NPYV_IMPL_AVX512_MASK_ADDSUB(f32, b32, ps) NPYV_IMPL_AVX512_MASK_ADDSUB(f64, b64, pd) +// division, m ? a / b : c +NPY_FINLINE npyv_f32 npyv_ifdiv_f32(npyv_b32 m, npyv_f32 a, npyv_f32 b, npyv_f32 c) +{ return _mm512_mask_div_ps(c, m, a, b); } +// conditional division, m ? a / b : 0 +NPY_FINLINE npyv_f32 npyv_ifdivz_f32(npyv_b32 m, npyv_f32 a, npyv_f32 b) +{ return _mm512_maskz_div_ps(m, a, b); } +// division, m ? a / b : c +NPY_FINLINE npyv_f64 npyv_ifdiv_f64(npyv_b32 m, npyv_f64 a, npyv_f64 b, npyv_f64 c) +{ return _mm512_mask_div_pd(c, m, a, b); } +// conditional division, m ? a / b : 0 +NPY_FINLINE npyv_f64 npyv_ifdivz_f64(npyv_b32 m, npyv_f64 a, npyv_f64 b) +{ return _mm512_maskz_div_pd(m, a, b); } + #endif // _NPY_SIMD_AVX512_MASKOP_H diff --git a/numpy/core/src/common/simd/avx512/memory.h b/numpy/core/src/common/simd/avx512/memory.h index 03fcb4630..fdf96a92c 100644 --- a/numpy/core/src/common/simd/avx512/memory.h +++ b/numpy/core/src/common/simd/avx512/memory.h @@ -120,6 +120,52 @@ NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride) { return npyv_loadn_u64((const npy_uint64*)ptr, stride); } NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride) { return _mm512_castsi512_pd(npyv_loadn_u64((const npy_uint64*)ptr, stride)); } + +//// 64-bit load over 32-bit stride +NPY_FINLINE npyv_u32 npyv_loadn2_u32(const npy_uint32 *ptr, npy_intp stride) +{ + __m128d a = _mm_loadh_pd( + _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)ptr)), + (const double*)(ptr + stride) + ); + __m128d b = _mm_loadh_pd( + _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)(ptr + stride*2))), + (const double*)(ptr + stride*3) + ); + __m128d c = _mm_loadh_pd( + _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)(ptr + stride*4))), + (const double*)(ptr + stride*5) + ); + __m128d d = _mm_loadh_pd( + _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)(ptr + stride*6))), + (const double*)(ptr + stride*7) + ); + return _mm512_castpd_si512(npyv512_combine_pd256( + _mm256_insertf128_pd(_mm256_castpd128_pd256(a), b, 1), + _mm256_insertf128_pd(_mm256_castpd128_pd256(c), d, 1) + )); +} +NPY_FINLINE npyv_s32 npyv_loadn2_s32(const npy_int32 *ptr, npy_intp stride) +{ return npyv_loadn2_u32((const npy_uint32*)ptr, stride); } +NPY_FINLINE npyv_f32 npyv_loadn2_f32(const float *ptr, npy_intp stride) +{ return _mm512_castsi512_ps(npyv_loadn2_u32((const npy_uint32*)ptr, stride)); } + +//// 128-bit load over 64-bit stride +NPY_FINLINE npyv_f64 npyv_loadn2_f64(const double *ptr, npy_intp stride) +{ + __m128d a = _mm_loadu_pd(ptr); + __m128d b = _mm_loadu_pd(ptr + stride); + __m128d c = _mm_loadu_pd(ptr + stride * 2); + __m128d d = _mm_loadu_pd(ptr + stride * 3); + 
return npyv512_combine_pd256( + _mm256_insertf128_pd(_mm256_castpd128_pd256(a), b, 1), + _mm256_insertf128_pd(_mm256_castpd128_pd256(c), d, 1) + ); +} +NPY_FINLINE npyv_u64 npyv_loadn2_u64(const npy_uint64 *ptr, npy_intp stride) +{ return npyv_reinterpret_u64_f64(npyv_loadn2_f64((const double*)ptr, stride)); } +NPY_FINLINE npyv_s64 npyv_loadn2_s64(const npy_int64 *ptr, npy_intp stride) +{ return npyv_loadn2_u64((const npy_uint64*)ptr, stride); } /*************************** * Non-contiguous Store ***************************/ @@ -151,6 +197,48 @@ NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a) NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a) { npyv_storen_u64((npy_uint64*)ptr, stride, _mm512_castpd_si512(a)); } +//// 64-bit store over 32-bit stride +NPY_FINLINE void npyv_storen2_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a) +{ + __m256d lo = _mm512_castpd512_pd256(_mm512_castsi512_pd(a)); + __m256d hi = _mm512_extractf64x4_pd(_mm512_castsi512_pd(a), 1); + __m128d e0 = _mm256_castpd256_pd128(lo); + __m128d e1 = _mm256_extractf128_pd(lo, 1); + __m128d e2 = _mm256_castpd256_pd128(hi); + __m128d e3 = _mm256_extractf128_pd(hi, 1); + _mm_storel_pd((double*)(ptr + stride * 0), e0); + _mm_storeh_pd((double*)(ptr + stride * 1), e0); + _mm_storel_pd((double*)(ptr + stride * 2), e1); + _mm_storeh_pd((double*)(ptr + stride * 3), e1); + _mm_storel_pd((double*)(ptr + stride * 4), e2); + _mm_storeh_pd((double*)(ptr + stride * 5), e2); + _mm_storel_pd((double*)(ptr + stride * 6), e3); + _mm_storeh_pd((double*)(ptr + stride * 7), e3); +} +NPY_FINLINE void npyv_storen2_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a) +{ npyv_storen2_u32((npy_uint32*)ptr, stride, a); } +NPY_FINLINE void npyv_storen2_f32(float *ptr, npy_intp stride, npyv_f32 a) +{ npyv_storen2_u32((npy_uint32*)ptr, stride, _mm512_castps_si512(a)); } + +//// 128-bit store over 64-bit stride +NPY_FINLINE void npyv_storen2_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a) +{ + __m256i lo = npyv512_lower_si256(a); + __m256i hi = npyv512_higher_si256(a); + __m128i e0 = _mm256_castsi256_si128(lo); + __m128i e1 = _mm256_extracti128_si256(lo, 1); + __m128i e2 = _mm256_castsi256_si128(hi); + __m128i e3 = _mm256_extracti128_si256(hi, 1); + _mm_storeu_si128((__m128i*)(ptr + stride * 0), e0); + _mm_storeu_si128((__m128i*)(ptr + stride * 1), e1); + _mm_storeu_si128((__m128i*)(ptr + stride * 2), e2); + _mm_storeu_si128((__m128i*)(ptr + stride * 3), e3); +} +NPY_FINLINE void npyv_storen2_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a) +{ npyv_storen2_u64((npy_uint64*)ptr, stride, a); } +NPY_FINLINE void npyv_storen2_f64(double *ptr, npy_intp stride, npyv_f64 a) +{ npyv_storen2_u64((npy_uint64*)ptr, stride, _mm512_castpd_si512(a)); } + /********************************* * Partial Load *********************************/ @@ -159,14 +247,14 @@ NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, n { assert(nlane > 0); const __m512i vfill = _mm512_set1_epi32(fill); - const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1; + const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1; return _mm512_mask_loadu_epi32(vfill, mask, (const __m512i*)ptr); } // fill zero to rest lanes NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) { assert(nlane > 0); - const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1; + const __mmask16 mask = nlane > 15 ? 
-1 : (1 << nlane) - 1; return _mm512_maskz_loadu_epi32(mask, (const __m512i*)ptr); } //// 64 @@ -174,14 +262,44 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n { assert(nlane > 0); const __m512i vfill = npyv_setall_s64(fill); - const __mmask8 mask = nlane > 31 ? -1 : (1 << nlane) - 1; + const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1; return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr); } // fill zero to rest lanes NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) { assert(nlane > 0); - const __mmask8 mask = nlane > 15 ? -1 : (1 << nlane) - 1; + const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1; + return _mm512_maskz_loadu_epi64(mask, (const __m512i*)ptr); +} + +//// 64-bit nlane +NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane, + npy_int32 fill_lo, npy_int32 fill_hi) +{ + assert(nlane > 0); + const __m512i vfill = _mm512_set4_epi32(fill_hi, fill_lo, fill_hi, fill_lo); + const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1; + return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) +{ return npyv_load_tillz_s64((const npy_int64*)ptr, nlane); } + +//// 128-bit nlane +NPY_FINLINE npyv_u64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane, + npy_int64 fill_lo, npy_int64 fill_hi) +{ + assert(nlane > 0); + const __m512i vfill = _mm512_set4_epi64(fill_hi, fill_lo, fill_hi, fill_lo); + const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1; + return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) +{ + assert(nlane > 0); + const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1; return _mm512_maskz_loadu_epi64(mask, (const __m512i*)ptr); } /********************************* @@ -198,7 +316,7 @@ npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_ ); const __m512i idx = _mm512_mullo_epi32(steps, _mm512_set1_epi32((int)stride)); const __m512i vfill = _mm512_set1_epi32(fill); - const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1; + const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1; return _mm512_mask_i32gather_epi32(vfill, mask, idx, (const __m512i*)ptr, 4); } // fill zero to rest lanes @@ -215,13 +333,48 @@ npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_ 4*stride, 5*stride, 6*stride, 7*stride ); const __m512i vfill = npyv_setall_s64(fill); - const __mmask8 mask = nlane > 31 ? -1 : (1 << nlane) - 1; + const __mmask8 mask = nlane > 15 ? -1 : (1 << nlane) - 1; return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 8); } // fill zero to rest lanes NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane) { return npyv_loadn_till_s64(ptr, stride, nlane, 0); } + +//// 64-bit load over 32-bit stride +NPY_FINLINE npyv_s64 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, + npy_int32 fill_lo, npy_int32 fill_hi) +{ + assert(nlane > 0); + const __m512i idx = npyv_set_s64( + 0*stride, 1*stride, 2*stride, 3*stride, + 4*stride, 5*stride, 6*stride, 7*stride + ); + const __m512i vfill = _mm512_set4_epi32(fill_hi, fill_lo, fill_hi, fill_lo); + const __mmask8 mask = nlane > 7 ? 
-1 : (1 << nlane) - 1; + return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 4); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane) +{ return npyv_loadn2_till_s32(ptr, stride, nlane, 0, 0); } + +//// 128-bit load over 64-bit stride +NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, + npy_int64 fill_lo, npy_int64 fill_hi) +{ + assert(nlane > 0); + const __m512i idx = npyv_set_s64( + 0, 1, stride, stride+1, + stride*2, stride*2+1, stride*3, stride*3+1 + ); + const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1; + const __m512i vfill = _mm512_set4_epi64(fill_hi, fill_lo, fill_hi, fill_lo); + return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 8); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane) +{ return npyv_loadn2_till_s64(ptr, stride, nlane, 0, 0); } + /********************************* * Partial store *********************************/ @@ -229,14 +382,30 @@ npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane) NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a) { assert(nlane > 0); - const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1; + const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1; _mm512_mask_storeu_epi32((__m512i*)ptr, mask, a); } //// 64 NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a) { assert(nlane > 0); - const __mmask8 mask = nlane > 15 ? -1 : (1 << nlane) - 1; + const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1; + _mm512_mask_storeu_epi64((__m512i*)ptr, mask, a); +} + +//// 64-bit nlane +NPY_FINLINE void npyv_store2_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a) +{ + assert(nlane > 0); + const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1; + _mm512_mask_storeu_epi64((__m512i*)ptr, mask, a); +} + +//// 128-bit nlane +NPY_FINLINE void npyv_store2_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a) +{ + assert(nlane > 0); + const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1; _mm512_mask_storeu_epi64((__m512i*)ptr, mask, a); } /********************************* @@ -251,7 +420,7 @@ NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ); const __m512i idx = _mm512_mullo_epi32(steps, _mm512_set1_epi32((int)stride)); - const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1; + const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1; _mm512_mask_i32scatter_epi32((__m512i*)ptr, mask, idx, a, 4); } //// 64 @@ -262,7 +431,31 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp 0*stride, 1*stride, 2*stride, 3*stride, 4*stride, 5*stride, 6*stride, 7*stride ); - const __mmask8 mask = nlane > 15 ? -1 : (1 << nlane) - 1; + const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1; + _mm512_mask_i64scatter_epi64((__m512i*)ptr, mask, idx, a, 8); +} + +//// 64-bit store over 32-bit stride +NPY_FINLINE void npyv_storen2_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a) +{ + assert(nlane > 0); + const __m512i idx = npyv_set_s64( + 0*stride, 1*stride, 2*stride, 3*stride, + 4*stride, 5*stride, 6*stride, 7*stride + ); + const __mmask8 mask = nlane > 7 ? 
-1 : (1 << nlane) - 1; + _mm512_mask_i64scatter_epi64((__m512i*)ptr, mask, idx, a, 4); +} + +//// 128-bit store over 64-bit stride +NPY_FINLINE void npyv_storen2_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a) +{ + assert(nlane > 0); + const __m512i idx = npyv_set_s64( + 0, 1, stride, stride+1, + 2*stride, 2*stride+1, 3*stride, 3*stride+1 + ); + const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1; _mm512_mask_i64scatter_epi64((__m512i*)ptr, mask, idx, a, 8); } @@ -331,6 +524,110 @@ NPYV_IMPL_AVX512_REST_PARTIAL_TYPES(f32, s32) NPYV_IMPL_AVX512_REST_PARTIAL_TYPES(u64, s64) NPYV_IMPL_AVX512_REST_PARTIAL_TYPES(f64, s64) +// 128-bit/64-bit stride (pair load/store) +#define NPYV_IMPL_AVX512_REST_PARTIAL_TYPES_PAIR(F_SFX, T_SFX) \ + NPY_FINLINE npyv_##F_SFX npyv_load2_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, \ + npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \ + { \ + union pun { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + }; \ + union pun pun_lo; \ + union pun pun_hi; \ + pun_lo.from_##F_SFX = fill_lo; \ + pun_hi.from_##F_SFX = fill_hi; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane, pun_lo.to_##T_SFX, pun_hi.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn2_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \ + npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \ + { \ + union pun { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + }; \ + union pun pun_lo; \ + union pun pun_hi; \ + pun_lo.from_##F_SFX = fill_lo; \ + pun_hi.from_##F_SFX = fill_hi; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun_lo.to_##T_SFX, \ + pun_hi.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_load2_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \ + { \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn2_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \ + { \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \ + )); \ + } \ + NPY_FINLINE void npyv_store2_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_store2_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } \ + NPY_FINLINE void npyv_storen2_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_storen2_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } + +NPYV_IMPL_AVX512_REST_PARTIAL_TYPES_PAIR(u32, s32) +NPYV_IMPL_AVX512_REST_PARTIAL_TYPES_PAIR(f32, s32) +NPYV_IMPL_AVX512_REST_PARTIAL_TYPES_PAIR(u64, s64) +NPYV_IMPL_AVX512_REST_PARTIAL_TYPES_PAIR(f64, s64) + +/************************************************************ + * de-interlave load / interleave contiguous store + ************************************************************/ +// two channels +#define NPYV_IMPL_AVX512_MEM_INTERLEAVE(SFX, ZSFX) \ + NPY_FINLINE npyv_##ZSFX##x2 npyv_zip_##ZSFX(npyv_##ZSFX, npyv_##ZSFX); \ + NPY_FINLINE npyv_##ZSFX##x2 
npyv_unzip_##ZSFX(npyv_##ZSFX, npyv_##ZSFX); \ + NPY_FINLINE npyv_##SFX##x2 npyv_load_##SFX##x2( \ + const npyv_lanetype_##SFX *ptr \ + ) { \ + return npyv_unzip_##ZSFX( \ + npyv_load_##SFX(ptr), npyv_load_##SFX(ptr+npyv_nlanes_##SFX) \ + ); \ + } \ + NPY_FINLINE void npyv_store_##SFX##x2( \ + npyv_lanetype_##SFX *ptr, npyv_##SFX##x2 v \ + ) { \ + npyv_##SFX##x2 zip = npyv_zip_##ZSFX(v.val[0], v.val[1]); \ + npyv_store_##SFX(ptr, zip.val[0]); \ + npyv_store_##SFX(ptr + npyv_nlanes_##SFX, zip.val[1]); \ + } + +NPYV_IMPL_AVX512_MEM_INTERLEAVE(u8, u8) +NPYV_IMPL_AVX512_MEM_INTERLEAVE(s8, u8) +NPYV_IMPL_AVX512_MEM_INTERLEAVE(u16, u16) +NPYV_IMPL_AVX512_MEM_INTERLEAVE(s16, u16) +NPYV_IMPL_AVX512_MEM_INTERLEAVE(u32, u32) +NPYV_IMPL_AVX512_MEM_INTERLEAVE(s32, u32) +NPYV_IMPL_AVX512_MEM_INTERLEAVE(u64, u64) +NPYV_IMPL_AVX512_MEM_INTERLEAVE(s64, u64) +NPYV_IMPL_AVX512_MEM_INTERLEAVE(f32, f32) +NPYV_IMPL_AVX512_MEM_INTERLEAVE(f64, f64) + /************************************************** * Lookup table *************************************************/ diff --git a/numpy/core/src/common/simd/avx512/reorder.h b/numpy/core/src/common/simd/avx512/reorder.h index c0b2477f3..27e66b5e7 100644 --- a/numpy/core/src/common/simd/avx512/reorder.h +++ b/numpy/core/src/common/simd/avx512/reorder.h @@ -167,6 +167,140 @@ NPY_FINLINE npyv_f64x2 npyv_zip_f64(__m512d a, __m512d b) return r; } +// deinterleave two vectors +NPY_FINLINE npyv_u8x2 npyv_unzip_u8(npyv_u8 ab0, npyv_u8 ab1) +{ + npyv_u8x2 r; +#ifdef NPY_HAVE_AVX512VBMI + const __m512i idx_a = npyv_set_u8( + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, + 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, + 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126 + ); + const __m512i idx_b = npyv_set_u8( + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, + 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, + 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95, + 97, 99, 101, 103, 105, 107, 109, 111, 113, 115, 117, 119, 121, 123, 125, 127 + ); + r.val[0] = _mm512_permutex2var_epi8(ab0, idx_a, ab1); + r.val[1] = _mm512_permutex2var_epi8(ab0, idx_b, ab1); +#else + #ifdef NPY_HAVE_AVX512BW + const __m512i idx = npyv_set_u8( + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 + ); + __m512i abl = _mm512_shuffle_epi8(ab0, idx); + __m512i abh = _mm512_shuffle_epi8(ab1, idx); + #else + const __m256i idx = _mm256_setr_epi8( + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 + ); + __m256i abl_lo = _mm256_shuffle_epi8(npyv512_lower_si256(ab0), idx); + __m256i abl_hi = _mm256_shuffle_epi8(npyv512_higher_si256(ab0), idx); + __m256i abh_lo = _mm256_shuffle_epi8(npyv512_lower_si256(ab1), idx); + __m256i abh_hi = _mm256_shuffle_epi8(npyv512_higher_si256(ab1), idx); + __m512i abl = npyv512_combine_si256(abl_lo, abl_hi); + __m512i abh = npyv512_combine_si256(abh_lo, abh_hi); + #endif + const __m512i idx_a = npyv_set_u64(0, 2, 4, 6, 8, 10, 12, 14); + const __m512i idx_b = npyv_set_u64(1, 3, 5, 7, 9, 11, 13, 15); + r.val[0] = _mm512_permutex2var_epi64(abl, idx_a, abh); + r.val[1] = _mm512_permutex2var_epi64(abl, idx_b, abh); +#endif + return r; +} +#define npyv_unzip_s8 npyv_unzip_u8 + +NPY_FINLINE 
npyv_u16x2 npyv_unzip_u16(npyv_u16 ab0, npyv_u16 ab1) +{ + npyv_u16x2 r; +#ifdef NPY_HAVE_AVX512BW + const __m512i idx_a = npyv_set_u16( + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62 + ); + const __m512i idx_b = npyv_set_u16( + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, + 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63 + ); + r.val[0] = _mm512_permutex2var_epi16(ab0, idx_a, ab1); + r.val[1] = _mm512_permutex2var_epi16(ab0, idx_b, ab1); +#else + const __m256i idx = _mm256_setr_epi8( + 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15, + 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15 + ); + __m256i abl_lo = _mm256_shuffle_epi8(npyv512_lower_si256(ab0), idx); + __m256i abl_hi = _mm256_shuffle_epi8(npyv512_higher_si256(ab0), idx); + __m256i abh_lo = _mm256_shuffle_epi8(npyv512_lower_si256(ab1), idx); + __m256i abh_hi = _mm256_shuffle_epi8(npyv512_higher_si256(ab1), idx); + __m512i abl = npyv512_combine_si256(abl_lo, abl_hi); + __m512i abh = npyv512_combine_si256(abh_lo, abh_hi); + + const __m512i idx_a = npyv_set_u64(0, 2, 4, 6, 8, 10, 12, 14); + const __m512i idx_b = npyv_set_u64(1, 3, 5, 7, 9, 11, 13, 15); + r.val[0] = _mm512_permutex2var_epi64(abl, idx_a, abh); + r.val[1] = _mm512_permutex2var_epi64(abl, idx_b, abh); +#endif + return r; +} +#define npyv_unzip_s16 npyv_unzip_u16 + +NPY_FINLINE npyv_u32x2 npyv_unzip_u32(npyv_u32 ab0, npyv_u32 ab1) +{ + const __m512i idx_a = npyv_set_u32( + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + ); + const __m512i idx_b = npyv_set_u32( + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 + ); + npyv_u32x2 r; + r.val[0] = _mm512_permutex2var_epi32(ab0, idx_a, ab1); + r.val[1] = _mm512_permutex2var_epi32(ab0, idx_b, ab1); + return r; +} +#define npyv_unzip_s32 npyv_unzip_u32 + +NPY_FINLINE npyv_u64x2 npyv_unzip_u64(npyv_u64 ab0, npyv_u64 ab1) +{ + const __m512i idx_a = npyv_set_u64(0, 2, 4, 6, 8, 10, 12, 14); + const __m512i idx_b = npyv_set_u64(1, 3, 5, 7, 9, 11, 13, 15); + npyv_u64x2 r; + r.val[0] = _mm512_permutex2var_epi64(ab0, idx_a, ab1); + r.val[1] = _mm512_permutex2var_epi64(ab0, idx_b, ab1); + return r; +} +#define npyv_unzip_s64 npyv_unzip_u64 + +NPY_FINLINE npyv_f32x2 npyv_unzip_f32(npyv_f32 ab0, npyv_f32 ab1) +{ + const __m512i idx_a = npyv_set_u32( + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + ); + const __m512i idx_b = npyv_set_u32( + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 + ); + npyv_f32x2 r; + r.val[0] = _mm512_permutex2var_ps(ab0, idx_a, ab1); + r.val[1] = _mm512_permutex2var_ps(ab0, idx_b, ab1); + return r; +} +NPY_FINLINE npyv_f64x2 npyv_unzip_f64(npyv_f64 ab0, npyv_f64 ab1) +{ + const __m512i idx_a = npyv_set_u64(0, 2, 4, 6, 8, 10, 12, 14); + const __m512i idx_b = npyv_set_u64(1, 3, 5, 7, 9, 11, 13, 15); + npyv_f64x2 r; + r.val[0] = _mm512_permutex2var_pd(ab0, idx_a, ab1); + r.val[1] = _mm512_permutex2var_pd(ab0, idx_b, ab1); + return r; +} + // Reverse elements of each 64-bit lane NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a) { @@ -223,4 +357,22 @@ NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a) return _mm512_shuffle_ps(a, a, (_MM_PERM_ENUM)_MM_SHUFFLE(2, 3, 0, 1)); } +// Permuting the elements of each 128-bit lane by immediate index for +// each element. 
+#define npyv_permi128_u32(A, E0, E1, E2, E3) \ + _mm512_shuffle_epi32(A, _MM_SHUFFLE(E3, E2, E1, E0)) + +#define npyv_permi128_s32 npyv_permi128_u32 + +#define npyv_permi128_u64(A, E0, E1) \ + _mm512_shuffle_epi32(A, _MM_SHUFFLE(((E1)<<1)+1, ((E1)<<1), ((E0)<<1)+1, ((E0)<<1))) + +#define npyv_permi128_s64 npyv_permi128_u64 + +#define npyv_permi128_f32(A, E0, E1, E2, E3) \ + _mm512_permute_ps(A, _MM_SHUFFLE(E3, E2, E1, E0)) + +#define npyv_permi128_f64(A, E0, E1) \ + _mm512_permute_pd(A, (((E1)<<7) | ((E0)<<6) | ((E1)<<5) | ((E0)<<4) | ((E1)<<3) | ((E0)<<2) | ((E1)<<1) | (E0))) + #endif // _NPY_SIMD_AVX512_REORDER_H diff --git a/numpy/core/src/common/simd/emulate_maskop.h b/numpy/core/src/common/simd/emulate_maskop.h index 2a808a153..6627e5010 100644 --- a/numpy/core/src/common/simd/emulate_maskop.h +++ b/numpy/core/src/common/simd/emulate_maskop.h @@ -42,5 +42,39 @@ NPYV_IMPL_EMULATE_MASK_ADDSUB(s64, b64) #if NPY_SIMD_F64 NPYV_IMPL_EMULATE_MASK_ADDSUB(f64, b64) #endif +#if NPY_SIMD_F32 + // conditional division, m ? a / b : c + NPY_FINLINE npyv_f32 + npyv_ifdiv_f32(npyv_b32 m, npyv_f32 a, npyv_f32 b, npyv_f32 c) + { + const npyv_f32 one = npyv_setall_f32(1.0f); + npyv_f32 div = npyv_div_f32(a, npyv_select_f32(m, b, one)); + return npyv_select_f32(m, div, c); + } + // conditional division, m ? a / b : 0 + NPY_FINLINE npyv_f32 + npyv_ifdivz_f32(npyv_b32 m, npyv_f32 a, npyv_f32 b) + { + const npyv_f32 zero = npyv_zero_f32(); + return npyv_ifdiv_f32(m, a, b, zero); + } +#endif +#if NPY_SIMD_F64 + // conditional division, m ? a / b : c + NPY_FINLINE npyv_f64 + npyv_ifdiv_f64(npyv_b64 m, npyv_f64 a, npyv_f64 b, npyv_f64 c) + { + const npyv_f64 one = npyv_setall_f64(1.0); + npyv_f64 div = npyv_div_f64(a, npyv_select_f64(m, b, one)); + return npyv_select_f64(m, div, c); + } + // conditional division, m ? a / b : 0 + NPY_FINLINE npyv_f64 + npyv_ifdivz_f64(npyv_b64 m, npyv_f64 a, npyv_f64 b) + { + const npyv_f64 zero = npyv_zero_f64(); + return npyv_ifdiv_f64(m, a, b, zero); + } +#endif #endif // _NPY_SIMD_EMULATE_MASKOP_H diff --git a/numpy/core/src/common/simd/neon/arithmetic.h b/numpy/core/src/common/simd/neon/arithmetic.h index 00994806d..683620111 100644 --- a/numpy/core/src/common/simd/neon/arithmetic.h +++ b/numpy/core/src/common/simd/neon/arithmetic.h @@ -265,6 +265,14 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor) NPY_FINLINE npyv_f32 npyv_nmulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) { return vmlsq_f32(vnegq_f32(c), a, b); } #endif +// multiply, add for odd elements and subtract even elements. 
+// (a * b) -+ c +NPY_FINLINE npyv_f32 npyv_muladdsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) +{ + const npyv_f32 msign = npyv_set_f32(-0.0f, 0.0f, -0.0f, 0.0f); + return npyv_muladd_f32(a, b, npyv_xor_f32(msign, c)); +} + /*************************** * FUSED F64 ***************************/ @@ -277,6 +285,11 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor) { return vfmsq_f64(c, a, b); } NPY_FINLINE npyv_f64 npyv_nmulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) { return vfmsq_f64(vnegq_f64(c), a, b); } + NPY_FINLINE npyv_f64 npyv_muladdsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) + { + const npyv_f64 msign = npyv_set_f64(-0.0, 0.0); + return npyv_muladd_f64(a, b, npyv_xor_f64(msign, c)); + } #endif // NPY_SIMD_F64 /*************************** diff --git a/numpy/core/src/common/simd/neon/math.h b/numpy/core/src/common/simd/neon/math.h index c0a771b5d..58d14809f 100644 --- a/numpy/core/src/common/simd/neon/math.h +++ b/numpy/core/src/common/simd/neon/math.h @@ -278,20 +278,25 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a) return vrndnq_f32(a); #else // ARMv7 NEON only supports fp to int truncate conversion. - // a magic trick of adding 1.5 * 2**23 is used for rounding + // a magic trick of adding 1.5 * 2^23 is used for rounding // to nearest even and then subtract this magic number to get // the integer. - const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f)); - const npyv_f32 magic = vdupq_n_f32(12582912.0f); // 1.5 * 2**23 - npyv_f32 round = vsubq_f32(vaddq_f32(a, magic), magic); - npyv_b32 overflow = vcleq_f32(vabsq_f32(a), vreinterpretq_f32_u32(vdupq_n_u32(0x4b000000))); - round = vbslq_f32(overflow, round, a); - // signed zero - round = vreinterpretq_f32_s32(vorrq_s32( - vreinterpretq_s32_f32(round), - vandq_s32(vreinterpretq_s32_f32(a), szero) - )); - return round; + // + const npyv_u32 szero = vreinterpretq_u32_f32(vdupq_n_f32(-0.0f)); + const npyv_u32 sign_mask = vandq_u32(vreinterpretq_u32_f32(a), szero); + const npyv_f32 two_power_23 = vdupq_n_f32(8388608.0); // 2^23 + const npyv_f32 two_power_23h = vdupq_n_f32(12582912.0f); // 1.5 * 2^23 + npyv_u32 nnan_mask = vceqq_f32(a, a); + // eliminate nans to avoid invalid fp errors + npyv_f32 abs_x = vabsq_f32(vreinterpretq_f32_u32(vandq_u32(nnan_mask, vreinterpretq_u32_f32(a)))); + // round by add magic number 1.5 * 2^23 + npyv_f32 round = vsubq_f32(vaddq_f32(two_power_23h, abs_x), two_power_23h); + // copysign + round = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(round), sign_mask )); + // a if |a| >= 2^23 or a == NaN + npyv_u32 mask = vcleq_f32(abs_x, two_power_23); + mask = vandq_u32(mask, nnan_mask); + return vbslq_f32(mask, round, a); #endif } #if NPY_SIMD_F64 @@ -302,33 +307,30 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a) #ifdef NPY_HAVE_ASIMD #define npyv_ceil_f32 vrndpq_f32 #else - NPY_FINLINE npyv_f32 npyv_ceil_f32(npyv_f32 a) - { - const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f)); + NPY_FINLINE npyv_f32 npyv_ceil_f32(npyv_f32 a) + { const npyv_u32 one = vreinterpretq_u32_f32(vdupq_n_f32(1.0f)); - const npyv_s32 max_int = vdupq_n_s32(0x7fffffff); - /** - * On armv7, vcvtq.f32 handles special cases as follows: - * NaN return 0 - * +inf or +outrange return 0x80000000(-0.0f) - * -inf or -outrange return 0x7fffffff(nan) - */ - npyv_s32 roundi = vcvtq_s32_f32(a); - npyv_f32 round = vcvtq_f32_s32(roundi); + const npyv_u32 szero = vreinterpretq_u32_f32(vdupq_n_f32(-0.0f)); + const npyv_u32 sign_mask = vandq_u32(vreinterpretq_u32_f32(a), szero); + const npyv_f32 
two_power_23 = vdupq_n_f32(8388608.0); // 2^23 + const npyv_f32 two_power_23h = vdupq_n_f32(12582912.0f); // 1.5 * 2^23 + npyv_u32 nnan_mask = vceqq_f32(a, a); + npyv_f32 x = vreinterpretq_f32_u32(vandq_u32(nnan_mask, vreinterpretq_u32_f32(a))); + // eliminate nans to avoid invalid fp errors + npyv_f32 abs_x = vabsq_f32(x); + // round by add magic number 1.5 * 2^23 + npyv_f32 round = vsubq_f32(vaddq_f32(two_power_23h, abs_x), two_power_23h); + // copysign + round = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(round), sign_mask)); npyv_f32 ceil = vaddq_f32(round, vreinterpretq_f32_u32( - vandq_u32(vcltq_f32(round, a), one)) - ); - // respect signed zero, e.g. -0.5 -> -0.0 - npyv_f32 rzero = vreinterpretq_f32_s32(vorrq_s32( - vreinterpretq_s32_f32(ceil), - vandq_s32(vreinterpretq_s32_f32(a), szero) - )); - // if nan or overflow return a - npyv_u32 nnan = npyv_notnan_f32(a); - npyv_u32 overflow = vorrq_u32( - vceqq_s32(roundi, szero), vceqq_s32(roundi, max_int) + vandq_u32(vcltq_f32(round, x), one)) ); - return vbslq_f32(vbicq_u32(nnan, overflow), rzero, a); + // respects signed zero + ceil = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(ceil), sign_mask)); + // a if |a| >= 2^23 or a == NaN + npyv_u32 mask = vcleq_f32(abs_x, two_power_23); + mask = vandq_u32(mask, nnan_mask); + return vbslq_f32(mask, ceil, a); } #endif #if NPY_SIMD_F64 @@ -339,29 +341,37 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a) #ifdef NPY_HAVE_ASIMD #define npyv_trunc_f32 vrndq_f32 #else - NPY_FINLINE npyv_f32 npyv_trunc_f32(npyv_f32 a) - { - const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f)); + NPY_FINLINE npyv_f32 npyv_trunc_f32(npyv_f32 a) + { const npyv_s32 max_int = vdupq_n_s32(0x7fffffff); + const npyv_u32 exp_mask = vdupq_n_u32(0xff000000); + const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f)); + const npyv_u32 sign_mask = vandq_u32( + vreinterpretq_u32_f32(a), vreinterpretq_u32_s32(szero)); + + npyv_u32 nfinite_mask = vshlq_n_u32(vreinterpretq_u32_f32(a), 1); + nfinite_mask = vandq_u32(nfinite_mask, exp_mask); + nfinite_mask = vceqq_u32(nfinite_mask, exp_mask); + // eliminate nans/inf to avoid invalid fp errors + npyv_f32 x = vreinterpretq_f32_u32( + veorq_u32(nfinite_mask, vreinterpretq_u32_f32(a))); /** * On armv7, vcvtq.f32 handles special cases as follows: * NaN return 0 * +inf or +outrange return 0x80000000(-0.0f) * -inf or -outrange return 0x7fffffff(nan) */ - npyv_s32 roundi = vcvtq_s32_f32(a); - npyv_f32 round = vcvtq_f32_s32(roundi); + npyv_s32 trunci = vcvtq_s32_f32(x); + npyv_f32 trunc = vcvtq_f32_s32(trunci); // respect signed zero, e.g. 
-0.5 -> -0.0 - npyv_f32 rzero = vreinterpretq_f32_s32(vorrq_s32( - vreinterpretq_s32_f32(round), - vandq_s32(vreinterpretq_s32_f32(a), szero) - )); - // if nan or overflow return a - npyv_u32 nnan = npyv_notnan_f32(a); - npyv_u32 overflow = vorrq_u32( - vceqq_s32(roundi, szero), vceqq_s32(roundi, max_int) + trunc = vreinterpretq_f32_u32( + vorrq_u32(vreinterpretq_u32_f32(trunc), sign_mask)); + // if overflow return a + npyv_u32 overflow_mask = vorrq_u32( + vceqq_s32(trunci, szero), vceqq_s32(trunci, max_int) ); - return vbslq_f32(vbicq_u32(nnan, overflow), rzero, a); + // a if a overflow or nonfinite + return vbslq_f32(vorrq_u32(nfinite_mask, overflow_mask), a, trunc); } #endif #if NPY_SIMD_F64 @@ -372,28 +382,31 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a) #ifdef NPY_HAVE_ASIMD #define npyv_floor_f32 vrndmq_f32 #else - NPY_FINLINE npyv_f32 npyv_floor_f32(npyv_f32 a) - { - const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f)); + NPY_FINLINE npyv_f32 npyv_floor_f32(npyv_f32 a) + { const npyv_u32 one = vreinterpretq_u32_f32(vdupq_n_f32(1.0f)); - const npyv_s32 max_int = vdupq_n_s32(0x7fffffff); + const npyv_u32 szero = vreinterpretq_u32_f32(vdupq_n_f32(-0.0f)); + const npyv_u32 sign_mask = vandq_u32(vreinterpretq_u32_f32(a), szero); + const npyv_f32 two_power_23 = vdupq_n_f32(8388608.0); // 2^23 + const npyv_f32 two_power_23h = vdupq_n_f32(12582912.0f); // 1.5 * 2^23 - npyv_s32 roundi = vcvtq_s32_f32(a); - npyv_f32 round = vcvtq_f32_s32(roundi); + npyv_u32 nnan_mask = vceqq_f32(a, a); + npyv_f32 x = vreinterpretq_f32_u32(vandq_u32(nnan_mask, vreinterpretq_u32_f32(a))); + // eliminate nans to avoid invalid fp errors + npyv_f32 abs_x = vabsq_f32(x); + // round by add magic number 1.5 * 2^23 + npyv_f32 round = vsubq_f32(vaddq_f32(two_power_23h, abs_x), two_power_23h); + // copysign + round = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(round), sign_mask)); npyv_f32 floor = vsubq_f32(round, vreinterpretq_f32_u32( - vandq_u32(vcgtq_f32(round, a), one) - )); - // respect signed zero - npyv_f32 rzero = vreinterpretq_f32_s32(vorrq_s32( - vreinterpretq_s32_f32(floor), - vandq_s32(vreinterpretq_s32_f32(a), szero) + vandq_u32(vcgtq_f32(round, x), one) )); - npyv_u32 nnan = npyv_notnan_f32(a); - npyv_u32 overflow = vorrq_u32( - vceqq_s32(roundi, szero), vceqq_s32(roundi, max_int) - ); - - return vbslq_f32(vbicq_u32(nnan, overflow), rzero, a); + // respects signed zero + floor = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(floor), sign_mask)); + // a if |a| >= 2^23 or a == NaN + npyv_u32 mask = vcleq_f32(abs_x, two_power_23); + mask = vandq_u32(mask, nnan_mask); + return vbslq_f32(mask, floor, a); } #endif // NPY_HAVE_ASIMD #if NPY_SIMD_F64 diff --git a/numpy/core/src/common/simd/neon/memory.h b/numpy/core/src/common/simd/neon/memory.h index 7060ea628..6163440c3 100644 --- a/numpy/core/src/common/simd/neon/memory.h +++ b/numpy/core/src/common/simd/neon/memory.h @@ -102,6 +102,29 @@ NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride) ); } #endif + +//// 64-bit load over 32-bit stride +NPY_FINLINE npyv_u32 npyv_loadn2_u32(const npy_uint32 *ptr, npy_intp stride) +{ + return vcombine_u32( + vld1_u32((const uint32_t*)ptr), vld1_u32((const uint32_t*)ptr + stride) + ); +} +NPY_FINLINE npyv_s32 npyv_loadn2_s32(const npy_int32 *ptr, npy_intp stride) +{ return npyv_reinterpret_s32_u32(npyv_loadn2_u32((const npy_uint32*)ptr, stride)); } +NPY_FINLINE npyv_f32 npyv_loadn2_f32(const float *ptr, npy_intp stride) +{ return 
npyv_reinterpret_f32_u32(npyv_loadn2_u32((const npy_uint32*)ptr, stride)); } + +//// 128-bit load over 64-bit stride +NPY_FINLINE npyv_u64 npyv_loadn2_u64(const npy_uint64 *ptr, npy_intp stride) +{ (void)stride; return npyv_load_u64(ptr); } +NPY_FINLINE npyv_s64 npyv_loadn2_s64(const npy_int64 *ptr, npy_intp stride) +{ (void)stride; return npyv_load_s64(ptr); } +#if NPY_SIMD_F64 +NPY_FINLINE npyv_f64 npyv_loadn2_f64(const double *ptr, npy_intp stride) +{ (void)stride; return npyv_load_f64(ptr); } +#endif + /*************************** * Non-contiguous Store ***************************/ @@ -131,6 +154,32 @@ NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a) { npyv_storen_s64((npy_int64*)ptr, stride, npyv_reinterpret_s64_f64(a)); } #endif +//// 64-bit store over 32-bit stride +NPY_FINLINE void npyv_storen2_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a) +{ +#if NPY_SIMD_F64 + vst1q_lane_u64((uint64_t*)ptr, npyv_reinterpret_u64_u32(a), 0); + vst1q_lane_u64((uint64_t*)(ptr + stride), npyv_reinterpret_u64_u32(a), 1); +#else + // armhf strict to alignment + vst1_u32((uint32_t*)ptr, vget_low_u32(a)); + vst1_u32((uint32_t*)ptr + stride, vget_high_u32(a)); +#endif +} +NPY_FINLINE void npyv_storen2_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a) +{ npyv_storen2_u32((npy_uint32*)ptr, stride, npyv_reinterpret_u32_s32(a)); } +NPY_FINLINE void npyv_storen2_f32(float *ptr, npy_intp stride, npyv_f32 a) +{ npyv_storen2_u32((npy_uint32*)ptr, stride, npyv_reinterpret_u32_f32(a)); } + +//// 128-bit store over 64-bit stride +NPY_FINLINE void npyv_storen2_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a) +{ (void)stride; npyv_store_u64(ptr, a); } +NPY_FINLINE void npyv_storen2_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a) +{ (void)stride; npyv_store_s64(ptr, a); } +#if NPY_SIMD_F64 +NPY_FINLINE void npyv_storen2_f64(double *ptr, npy_intp stride, npyv_f64 a) +{ (void)stride; npyv_store_f64(ptr, a); } +#endif /********************************* * Partial Load *********************************/ @@ -168,6 +217,29 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) { return npyv_load_till_s64(ptr, nlane, 0); } +//// 64-bit nlane +NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane, + npy_int32 fill_lo, npy_int32 fill_hi) +{ + assert(nlane > 0); + if (nlane == 1) { + const int32_t NPY_DECL_ALIGNED(16) fill[2] = {fill_lo, fill_hi}; + return vcombine_s32(vld1_s32((const int32_t*)ptr), vld1_s32(fill)); + } + return npyv_load_s32(ptr); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) +{ return vreinterpretq_s32_s64(npyv_load_tillz_s64((const npy_int64*)ptr, nlane)); } + +//// 128-bit nlane +NPY_FINLINE npyv_s64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane, + npy_int64 fill_lo, npy_int64 fill_hi) +{ (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); } + +NPY_FINLINE npyv_s64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) +{ (void)nlane; return npyv_load_s64(ptr); } + /********************************* * Non-contiguous partial load *********************************/ @@ -206,6 +278,34 @@ npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_ NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane) { return npyv_loadn_till_s64(ptr, stride, nlane, 0); } +//// 64-bit load over 32-bit 
stride +NPY_FINLINE npyv_s32 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, + npy_int32 fill_lo, npy_int32 fill_hi) +{ + assert(nlane > 0); + if (nlane == 1) { + const int32_t NPY_DECL_ALIGNED(16) fill[2] = {fill_lo, fill_hi}; + return vcombine_s32(vld1_s32((const int32_t*)ptr), vld1_s32(fill)); + } + return npyv_loadn2_s32(ptr, stride); +} +NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane) +{ + assert(nlane > 0); + if (nlane == 1) { + return vcombine_s32(vld1_s32((const int32_t*)ptr), vdup_n_s32(0)); + } + return npyv_loadn2_s32(ptr, stride); +} + +//// 128-bit load over 64-bit stride +NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, + npy_int64 fill_lo, npy_int64 fill_hi) +{ assert(nlane > 0); (void)stride; (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); } + +NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane) +{ assert(nlane > 0); (void)stride; (void)nlane; return npyv_load_s64(ptr); } + /********************************* * Partial store *********************************/ @@ -238,6 +338,30 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a } npyv_store_s64(ptr, a); } + +//// 64-bit nlane +NPY_FINLINE void npyv_store2_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a) +{ + assert(nlane > 0); + if (nlane == 1) { + // armhf strict to alignment, may cause bus error + #if NPY_SIMD_F64 + vst1q_lane_s64((int64_t*)ptr, npyv_reinterpret_s64_s32(a), 0); + #else + npyv_storel_s32(ptr, a); + #endif + return; + } + npyv_store_s32(ptr, a); +} + +//// 128-bit nlane +NPY_FINLINE void npyv_store2_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a) +{ + assert(nlane > 0); (void)nlane; + npyv_store_s64(ptr, a); +} + /********************************* * Non-contiguous partial store *********************************/ @@ -245,16 +369,21 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a) { assert(nlane > 0); + vst1q_lane_s32((int32_t*)ptr, a, 0); switch(nlane) { - default: - vst1q_lane_s32((int32_t*)ptr + stride*3, a, 3); + case 1: + return; + case 2: + vst1q_lane_s32((int32_t*)ptr + stride, a, 1); + return; case 3: + vst1q_lane_s32((int32_t*)ptr + stride, a, 1); vst1q_lane_s32((int32_t*)ptr + stride*2, a, 2); - case 2: + return; + default: vst1q_lane_s32((int32_t*)ptr + stride, a, 1); - case 1: - vst1q_lane_s32((int32_t*)ptr, a, 0); - break; + vst1q_lane_s32((int32_t*)ptr + stride*2, a, 2); + vst1q_lane_s32((int32_t*)ptr + stride*3, a, 3); } } //// 64 @@ -268,6 +397,27 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp npyv_storen_s64(ptr, stride, a); } +//// 64-bit store over 32-bit stride +NPY_FINLINE void npyv_storen2_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a) +{ + assert(nlane > 0); +#if NPY_SIMD_F64 + vst1q_lane_s64((int64_t*)ptr, npyv_reinterpret_s64_s32(a), 0); + if (nlane > 1) { + vst1q_lane_s64((int64_t*)(ptr + stride), npyv_reinterpret_s64_s32(a), 1); + } +#else + npyv_storel_s32(ptr, a); + if (nlane > 1) { + npyv_storeh_s32(ptr + stride, a); + } +#endif +} + +//// 128-bit store over 64-bit stride +NPY_FINLINE void npyv_storen2_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a) +{ assert(nlane > 0); (void)stride; (void)nlane; npyv_store_s64(ptr, a); } + 
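The paired (*2) partial load/store intrinsics above let strided loops treat two adjacent lanes as a single element, which is the access pattern complex-number kernels need. A minimal usage sketch, not part of the patch: a hypothetical kernel that conjugates strided complex64 data, assuming NPY_SIMD is enabled and using the f32 wrappers (npyv_loadn2_till_f32 / npyv_storen2_till_f32) generated by the casting macros that follow.

    // stride is given in float lanes (2 for contiguous complex64), len in complex elements
    static void conj_cfloat_strided(float *dst, const float *src, npy_intp stride, npy_intp len)
    {
        // flip only the sign bit of the imaginary (odd) lanes;
        // 4-lane mask as on NEON, wider extensions would repeat the pattern
        const npyv_f32 conj_mask = npyv_set_f32(0.0f, -0.0f, 0.0f, -0.0f);
        const npy_intp vstep = npyv_nlanes_f32 / 2;     // complex elements per vector
        for (npy_intp i = 0; i < len; i += vstep) {
            npy_uintp nlane = (npy_uintp)(len - i);     // remaining complex elements
            npyv_f32 a = npyv_loadn2_till_f32(src + i*stride, stride, nlane, 0.0f, 0.0f);
            npyv_f32 r = npyv_xor_f32(a, conj_mask);
            npyv_storen2_till_f32(dst + i*stride, stride, nlane, r);
        }
    }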
/***************************************************************** * Implement partial load/store for u32/f32/u64/f64... via casting *****************************************************************/ @@ -278,7 +428,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp union { \ npyv_lanetype_##F_SFX from_##F_SFX; \ npyv_lanetype_##T_SFX to_##T_SFX; \ - } pun = {.from_##F_SFX = fill}; \ + } pun; \ + pun.from_##F_SFX = fill; \ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX( \ (const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX \ )); \ @@ -290,7 +441,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp union { \ npyv_lanetype_##F_SFX from_##F_SFX; \ npyv_lanetype_##T_SFX to_##T_SFX; \ - } pun = {.from_##F_SFX = fill}; \ + } pun; \ + pun.from_##F_SFX = fill; \ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX( \ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX \ )); \ @@ -332,6 +484,131 @@ NPYV_IMPL_NEON_REST_PARTIAL_TYPES(u64, s64) #if NPY_SIMD_F64 NPYV_IMPL_NEON_REST_PARTIAL_TYPES(f64, s64) #endif + +// 128-bit/64-bit stride +#define NPYV_IMPL_NEON_REST_PARTIAL_TYPES_PAIR(F_SFX, T_SFX) \ + NPY_FINLINE npyv_##F_SFX npyv_load2_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, \ + npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \ + { \ + union pun { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + }; \ + union pun pun_lo; \ + union pun pun_hi; \ + pun_lo.from_##F_SFX = fill_lo; \ + pun_hi.from_##F_SFX = fill_hi; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane, pun_lo.to_##T_SFX, pun_hi.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn2_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \ + npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \ + { \ + union pun { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + }; \ + union pun pun_lo; \ + union pun pun_hi; \ + pun_lo.from_##F_SFX = fill_lo; \ + pun_hi.from_##F_SFX = fill_hi; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun_lo.to_##T_SFX, \ + pun_hi.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_load2_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \ + { \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn2_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \ + { \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \ + )); \ + } \ + NPY_FINLINE void npyv_store2_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_store2_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } \ + NPY_FINLINE void npyv_storen2_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_storen2_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } + +NPYV_IMPL_NEON_REST_PARTIAL_TYPES_PAIR(u32, s32) +NPYV_IMPL_NEON_REST_PARTIAL_TYPES_PAIR(f32, s32) 
+NPYV_IMPL_NEON_REST_PARTIAL_TYPES_PAIR(u64, s64) +#if NPY_SIMD_F64 +NPYV_IMPL_NEON_REST_PARTIAL_TYPES_PAIR(f64, s64) +#endif + +/************************************************************ + * de-interlave load / interleave contiguous store + ************************************************************/ +// two channels +#define NPYV_IMPL_NEON_MEM_INTERLEAVE(SFX, T_PTR) \ + NPY_FINLINE npyv_##SFX##x2 npyv_load_##SFX##x2( \ + const npyv_lanetype_##SFX *ptr \ + ) { \ + return vld2q_##SFX((const T_PTR*)ptr); \ + } \ + NPY_FINLINE void npyv_store_##SFX##x2( \ + npyv_lanetype_##SFX *ptr, npyv_##SFX##x2 v \ + ) { \ + vst2q_##SFX((T_PTR*)ptr, v); \ + } + +NPYV_IMPL_NEON_MEM_INTERLEAVE(u8, uint8_t) +NPYV_IMPL_NEON_MEM_INTERLEAVE(s8, int8_t) +NPYV_IMPL_NEON_MEM_INTERLEAVE(u16, uint16_t) +NPYV_IMPL_NEON_MEM_INTERLEAVE(s16, int16_t) +NPYV_IMPL_NEON_MEM_INTERLEAVE(u32, uint32_t) +NPYV_IMPL_NEON_MEM_INTERLEAVE(s32, int32_t) +NPYV_IMPL_NEON_MEM_INTERLEAVE(f32, float) + +#if NPY_SIMD_F64 + NPYV_IMPL_NEON_MEM_INTERLEAVE(f64, double) + NPYV_IMPL_NEON_MEM_INTERLEAVE(u64, uint64_t) + NPYV_IMPL_NEON_MEM_INTERLEAVE(s64, int64_t) +#else + #define NPYV_IMPL_NEON_MEM_INTERLEAVE_64(SFX) \ + NPY_FINLINE npyv_##SFX##x2 npyv_load_##SFX##x2( \ + const npyv_lanetype_##SFX *ptr) \ + { \ + npyv_##SFX a = npyv_load_##SFX(ptr); \ + npyv_##SFX b = npyv_load_##SFX(ptr + 2); \ + npyv_##SFX##x2 r; \ + r.val[0] = vcombine_##SFX(vget_low_##SFX(a), vget_low_##SFX(b)); \ + r.val[1] = vcombine_##SFX(vget_high_##SFX(a), vget_high_##SFX(b)); \ + return r; \ + } \ + NPY_FINLINE void npyv_store_##SFX##x2( \ + npyv_lanetype_##SFX *ptr, npyv_##SFX##x2 v) \ + { \ + npyv_store_##SFX(ptr, vcombine_##SFX( \ + vget_low_##SFX(v.val[0]), vget_low_##SFX(v.val[1]))); \ + npyv_store_##SFX(ptr + 2, vcombine_##SFX( \ + vget_high_##SFX(v.val[0]), vget_high_##SFX(v.val[1]))); \ + } + NPYV_IMPL_NEON_MEM_INTERLEAVE_64(u64) + NPYV_IMPL_NEON_MEM_INTERLEAVE_64(s64) +#endif /********************************* * Lookup table *********************************/ diff --git a/numpy/core/src/common/simd/neon/neon.h b/numpy/core/src/common/simd/neon/neon.h index b08071527..49c35c415 100644 --- a/numpy/core/src/common/simd/neon/neon.h +++ b/numpy/core/src/common/simd/neon/neon.h @@ -16,6 +16,7 @@ #define NPY_SIMD_FMA3 0 // HW emulated #endif #define NPY_SIMD_BIGENDIAN 0 +#define NPY_SIMD_CMPSIGNAL 1 typedef uint8x16_t npyv_u8; typedef int8x16_t npyv_s8; diff --git a/numpy/core/src/common/simd/neon/operators.h b/numpy/core/src/common/simd/neon/operators.h index 249621bd6..e18ea94b8 100644 --- a/numpy/core/src/common/simd/neon/operators.h +++ b/numpy/core/src/common/simd/neon/operators.h @@ -240,10 +240,35 @@ // check special cases NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a) -{ return vceqq_f32(a, a); } +{ +#if defined(__clang__) +/** + * To avoid signaling qNaN, workaround for clang symmetric inputs bug + * check https://github.com/numpy/numpy/issues/22933, + * for more clarification. 
+ */ + npyv_b32 ret; + #if NPY_SIMD_F64 + __asm("fcmeq %0.4s, %1.4s, %1.4s" : "=w" (ret) : "w" (a)); + #else + __asm("vceq.f32 %q0, %q1, %q1" : "=w" (ret) : "w" (a)); + #endif + return ret; +#else + return vceqq_f32(a, a); +#endif +} #if NPY_SIMD_F64 NPY_FINLINE npyv_b64 npyv_notnan_f64(npyv_f64 a) - { return vceqq_f64(a, a); } + { + #if defined(__clang__) + npyv_b64 ret; + __asm("fcmeq %0.2d, %1.2d, %1.2d" : "=w" (ret) : "w" (a)); + return ret; + #else + return vceqq_f64(a, a); + #endif + } #endif // Test cross all vector lanes diff --git a/numpy/core/src/common/simd/neon/reorder.h b/numpy/core/src/common/simd/neon/reorder.h index 50b06ed11..8bf68f5be 100644 --- a/numpy/core/src/common/simd/neon/reorder.h +++ b/numpy/core/src/common/simd/neon/reorder.h @@ -76,36 +76,45 @@ NPYV_IMPL_NEON_COMBINE(npyv_f32, f32) NPYV_IMPL_NEON_COMBINE(npyv_f64, f64) #endif -// interleave two vectors -#define NPYV_IMPL_NEON_ZIP(T_VEC, SFX) \ - NPY_FINLINE T_VEC##x2 npyv_zip_##SFX(T_VEC a, T_VEC b) \ - { \ - T_VEC##x2 r; \ - r.val[0] = vzip1q_##SFX(a, b); \ - r.val[1] = vzip2q_##SFX(a, b); \ - return r; \ - } - +// interleave & deinterleave two vectors #ifdef __aarch64__ - NPYV_IMPL_NEON_ZIP(npyv_u8, u8) - NPYV_IMPL_NEON_ZIP(npyv_s8, s8) - NPYV_IMPL_NEON_ZIP(npyv_u16, u16) - NPYV_IMPL_NEON_ZIP(npyv_s16, s16) - NPYV_IMPL_NEON_ZIP(npyv_u32, u32) - NPYV_IMPL_NEON_ZIP(npyv_s32, s32) - NPYV_IMPL_NEON_ZIP(npyv_f32, f32) - NPYV_IMPL_NEON_ZIP(npyv_f64, f64) + #define NPYV_IMPL_NEON_ZIP(T_VEC, SFX) \ + NPY_FINLINE T_VEC##x2 npyv_zip_##SFX(T_VEC a, T_VEC b) \ + { \ + T_VEC##x2 r; \ + r.val[0] = vzip1q_##SFX(a, b); \ + r.val[1] = vzip2q_##SFX(a, b); \ + return r; \ + } \ + NPY_FINLINE T_VEC##x2 npyv_unzip_##SFX(T_VEC a, T_VEC b) \ + { \ + T_VEC##x2 r; \ + r.val[0] = vuzp1q_##SFX(a, b); \ + r.val[1] = vuzp2q_##SFX(a, b); \ + return r; \ + } #else - #define npyv_zip_u8 vzipq_u8 - #define npyv_zip_s8 vzipq_s8 - #define npyv_zip_u16 vzipq_u16 - #define npyv_zip_s16 vzipq_s16 - #define npyv_zip_u32 vzipq_u32 - #define npyv_zip_s32 vzipq_s32 - #define npyv_zip_f32 vzipq_f32 + #define NPYV_IMPL_NEON_ZIP(T_VEC, SFX) \ + NPY_FINLINE T_VEC##x2 npyv_zip_##SFX(T_VEC a, T_VEC b) \ + { return vzipq_##SFX(a, b); } \ + NPY_FINLINE T_VEC##x2 npyv_unzip_##SFX(T_VEC a, T_VEC b) \ + { return vuzpq_##SFX(a, b); } #endif + +NPYV_IMPL_NEON_ZIP(npyv_u8, u8) +NPYV_IMPL_NEON_ZIP(npyv_s8, s8) +NPYV_IMPL_NEON_ZIP(npyv_u16, u16) +NPYV_IMPL_NEON_ZIP(npyv_s16, s16) +NPYV_IMPL_NEON_ZIP(npyv_u32, u32) +NPYV_IMPL_NEON_ZIP(npyv_s32, s32) +NPYV_IMPL_NEON_ZIP(npyv_f32, f32) + #define npyv_zip_u64 npyv_combine_u64 #define npyv_zip_s64 npyv_combine_s64 +#define npyv_zip_f64 npyv_combine_f64 +#define npyv_unzip_u64 npyv_combine_u64 +#define npyv_unzip_s64 npyv_combine_s64 +#define npyv_unzip_f64 npyv_combine_f64 // Reverse elements of each 64-bit lane #define npyv_rev64_u8 vrev64q_u8 @@ -116,4 +125,65 @@ NPYV_IMPL_NEON_COMBINE(npyv_f64, f64) #define npyv_rev64_s32 vrev64q_s32 #define npyv_rev64_f32 vrev64q_f32 +// Permuting the elements of each 128-bit lane by immediate index for +// each element. 
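For illustration (lane values here are made up, not from the patch): result lane k takes the source lane selected by the k-th immediate index, and the indices must be compile-time constants since they feed the builtin shuffles / lane extracts below.

    npyv_u32 a   = npyv_set_u32(10, 11, 12, 13);
    npyv_u32 rev = npyv_permi128_u32(a, 3, 2, 1, 0);   // {13, 12, 11, 10}
    npyv_u32 dup = npyv_permi128_u32(a, 0, 0, 2, 2);   // {10, 10, 12, 12}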
+#ifdef __clang__ + #define npyv_permi128_u32(A, E0, E1, E2, E3) \ + __builtin_shufflevector(A, A, E0, E1, E2, E3) +#elif defined(__GNUC__) + #define npyv_permi128_u32(A, E0, E1, E2, E3) \ + __builtin_shuffle(A, npyv_set_u32(E0, E1, E2, E3)) +#else + #define npyv_permi128_u32(A, E0, E1, E2, E3) \ + npyv_set_u32( \ + vgetq_lane_u32(A, E0), vgetq_lane_u32(A, E1), \ + vgetq_lane_u32(A, E2), vgetq_lane_u32(A, E3) \ + ) + #define npyv_permi128_s32(A, E0, E1, E2, E3) \ + npyv_set_s32( \ + vgetq_lane_s32(A, E0), vgetq_lane_s32(A, E1), \ + vgetq_lane_s32(A, E2), vgetq_lane_s32(A, E3) \ + ) + #define npyv_permi128_f32(A, E0, E1, E2, E3) \ + npyv_set_f32( \ + vgetq_lane_f32(A, E0), vgetq_lane_f32(A, E1), \ + vgetq_lane_f32(A, E2), vgetq_lane_f32(A, E3) \ + ) +#endif + +#if defined(__clang__) || defined(__GNUC__) + #define npyv_permi128_s32 npyv_permi128_u32 + #define npyv_permi128_f32 npyv_permi128_u32 +#endif + +#ifdef __clang__ + #define npyv_permi128_u64(A, E0, E1) \ + __builtin_shufflevector(A, A, E0, E1) +#elif defined(__GNUC__) + #define npyv_permi128_u64(A, E0, E1) \ + __builtin_shuffle(A, npyv_set_u64(E0, E1)) +#else + #define npyv_permi128_u64(A, E0, E1) \ + npyv_set_u64( \ + vgetq_lane_u64(A, E0), vgetq_lane_u64(A, E1) \ + ) + #define npyv_permi128_s64(A, E0, E1) \ + npyv_set_s64( \ + vgetq_lane_s64(A, E0), vgetq_lane_s64(A, E1) \ + ) + #define npyv_permi128_f64(A, E0, E1) \ + npyv_set_f64( \ + vgetq_lane_f64(A, E0), vgetq_lane_f64(A, E1) \ + ) +#endif + +#if defined(__clang__) || defined(__GNUC__) + #define npyv_permi128_s64 npyv_permi128_u64 + #define npyv_permi128_f64 npyv_permi128_u64 +#endif + +#if !NPY_SIMD_F64 + #undef npyv_permi128_f64 +#endif + #endif // _NPY_SIMD_NEON_REORDER_H diff --git a/numpy/core/src/common/simd/simd.h b/numpy/core/src/common/simd/simd.h index 92a77ad80..8c9b14251 100644 --- a/numpy/core/src/common/simd/simd.h +++ b/numpy/core/src/common/simd/simd.h @@ -82,6 +82,9 @@ typedef double npyv_lanetype_f64; #define NPY_SIMD_FMA3 0 /// 1 if the enabled SIMD extension is running on big-endian mode otherwise 0. #define NPY_SIMD_BIGENDIAN 0 + /// 1 if the supported comparison intrinsics(lt, le, gt, ge) + /// raises FP invalid exception for quite NaNs. + #define NPY_SIMD_CMPSIGNAL 0 #endif // enable emulated mask operations for all SIMD extension except for AVX512 diff --git a/numpy/core/src/common/simd/sse/arithmetic.h b/numpy/core/src/common/simd/sse/arithmetic.h index bced35108..72a87eac1 100644 --- a/numpy/core/src/common/simd/sse/arithmetic.h +++ b/numpy/core/src/common/simd/sse/arithmetic.h @@ -282,6 +282,10 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor) // negate multiply and subtract, -(a*b) - c #define npyv_nmulsub_f32 _mm_fnmsub_ps #define npyv_nmulsub_f64 _mm_fnmsub_pd + // multiply, add for odd elements and subtract even elements. + // (a * b) -+ c + #define npyv_muladdsub_f32 _mm_fmaddsub_ps + #define npyv_muladdsub_f64 _mm_fmaddsub_pd #elif defined(NPY_HAVE_FMA4) // multiply and add, a*b + c #define npyv_muladd_f32 _mm_macc_ps @@ -292,6 +296,10 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor) // negate multiply and add, -(a*b) + c #define npyv_nmuladd_f32 _mm_nmacc_ps #define npyv_nmuladd_f64 _mm_nmacc_pd + // multiply, add for odd elements and subtract even elements. 
+ // (a * b) -+ c + #define npyv_muladdsub_f32 _mm_maddsub_ps + #define npyv_muladdsub_f64 _mm_maddsub_pd #else // multiply and add, a*b + c NPY_FINLINE npyv_f32 npyv_muladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) @@ -308,6 +316,28 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor) { return npyv_sub_f32(c, npyv_mul_f32(a, b)); } NPY_FINLINE npyv_f64 npyv_nmuladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) { return npyv_sub_f64(c, npyv_mul_f64(a, b)); } + // multiply, add for odd elements and subtract even elements. + // (a * b) -+ c + NPY_FINLINE npyv_f32 npyv_muladdsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) + { + npyv_f32 m = npyv_mul_f32(a, b); + #if NPY_HAVE_SSE3 + return _mm_addsub_ps(m, c); + #else + const npyv_f32 msign = npyv_set_f32(-0.0f, 0.0f, -0.0f, 0.0f); + return npyv_add_f32(m, npyv_xor_f32(msign, c)); + #endif + } + NPY_FINLINE npyv_f64 npyv_muladdsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) + { + npyv_f64 m = npyv_mul_f64(a, b); + #if NPY_HAVE_SSE3 + return _mm_addsub_pd(m, c); + #else + const npyv_f64 msign = npyv_set_f64(-0.0, 0.0); + return npyv_add_f64(m, npyv_xor_f64(msign, c)); + #endif + } #endif // NPY_HAVE_FMA3 #ifndef NPY_HAVE_FMA3 // for FMA4 and NON-FMA3 // negate multiply and subtract, -(a*b) - c diff --git a/numpy/core/src/common/simd/sse/math.h b/numpy/core/src/common/simd/sse/math.h index b7f8e6ebb..b51c935af 100644 --- a/numpy/core/src/common/simd/sse/math.h +++ b/numpy/core/src/common/simd/sse/math.h @@ -269,13 +269,23 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a) #ifdef NPY_HAVE_SSE41 return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); #else - const npyv_f32 szero = _mm_set1_ps(-0.0f); - __m128i roundi = _mm_cvtps_epi32(a); - __m128i overflow = _mm_cmpeq_epi32(roundi, _mm_castps_si128(szero)); - __m128 r = _mm_cvtepi32_ps(roundi); - // respect sign of zero - r = _mm_or_ps(r, _mm_and_ps(a, szero)); - return npyv_select_f32(overflow, a, r); + const __m128 szero = _mm_set1_ps(-0.0f); + const __m128i exp_mask = _mm_set1_epi32(0xff000000); + + __m128i nfinite_mask = _mm_slli_epi32(_mm_castps_si128(a), 1); + nfinite_mask = _mm_and_si128(nfinite_mask, exp_mask); + nfinite_mask = _mm_cmpeq_epi32(nfinite_mask, exp_mask); + + // eliminate nans/inf to avoid invalid fp errors + __m128 x = _mm_xor_ps(a, _mm_castsi128_ps(nfinite_mask)); + __m128i roundi = _mm_cvtps_epi32(x); + __m128 round = _mm_cvtepi32_ps(roundi); + // respect signed zero + round = _mm_or_ps(round, _mm_and_ps(a, szero)); + // if overflow return a + __m128i overflow_mask = _mm_cmpeq_epi32(roundi, _mm_castps_si128(szero)); + // a if a overflow or nonfinite + return npyv_select_f32(_mm_or_si128(nfinite_mask, overflow_mask), a, round); #endif } @@ -285,16 +295,22 @@ NPY_FINLINE npyv_f64 npyv_rint_f64(npyv_f64 a) #ifdef NPY_HAVE_SSE41 return _mm_round_pd(a, _MM_FROUND_TO_NEAREST_INT); #else - const npyv_f64 szero = _mm_set1_pd(-0.0); - const npyv_f64 two_power_52 = _mm_set1_pd(0x10000000000000); - npyv_f64 sign_two52 = _mm_or_pd(two_power_52, _mm_and_pd(a, szero)); + const __m128d szero = _mm_set1_pd(-0.0); + const __m128d two_power_52 = _mm_set1_pd(0x10000000000000); + __m128d nan_mask = _mm_cmpunord_pd(a, a); + // eliminate nans to avoid invalid fp errors within cmpge + __m128d abs_x = npyv_abs_f64(_mm_xor_pd(nan_mask, a)); // round by add magic number 2^52 - npyv_f64 round = _mm_sub_pd(_mm_add_pd(a, sign_two52), sign_two52); - // respect signed zero, e.g. 
-0.5 -> -0.0 - return _mm_or_pd(round, _mm_and_pd(a, szero)); + // assuming that MXCSR register is set to rounding + __m128d round = _mm_sub_pd(_mm_add_pd(two_power_52, abs_x), two_power_52); + // copysign + round = _mm_or_pd(round, _mm_and_pd(a, szero)); + // a if |a| >= 2^52 or a == NaN + __m128d mask = _mm_cmpge_pd(abs_x, two_power_52); + mask = _mm_or_pd(mask, nan_mask); + return npyv_select_f64(_mm_castpd_si128(mask), a, round); #endif } - // ceil #ifdef NPY_HAVE_SSE41 #define npyv_ceil_f32 _mm_ceil_ps @@ -302,27 +318,48 @@ NPY_FINLINE npyv_f64 npyv_rint_f64(npyv_f64 a) #else NPY_FINLINE npyv_f32 npyv_ceil_f32(npyv_f32 a) { - const npyv_f32 szero = _mm_set1_ps(-0.0f); - const npyv_f32 one = _mm_set1_ps(1.0f); - npyv_s32 roundi = _mm_cvttps_epi32(a); - npyv_f32 round = _mm_cvtepi32_ps(roundi); - npyv_f32 ceil = _mm_add_ps(round, _mm_and_ps(_mm_cmplt_ps(round, a), one)); - // respect signed zero, e.g. -0.5 -> -0.0 - npyv_f32 rzero = _mm_or_ps(ceil, _mm_and_ps(a, szero)); + const __m128 one = _mm_set1_ps(1.0f); + const __m128 szero = _mm_set1_ps(-0.0f); + const __m128i exp_mask = _mm_set1_epi32(0xff000000); + + __m128i nfinite_mask = _mm_slli_epi32(_mm_castps_si128(a), 1); + nfinite_mask = _mm_and_si128(nfinite_mask, exp_mask); + nfinite_mask = _mm_cmpeq_epi32(nfinite_mask, exp_mask); + + // eliminate nans/inf to avoid invalid fp errors + __m128 x = _mm_xor_ps(a, _mm_castsi128_ps(nfinite_mask)); + __m128i roundi = _mm_cvtps_epi32(x); + __m128 round = _mm_cvtepi32_ps(roundi); + __m128 ceil = _mm_add_ps(round, _mm_and_ps(_mm_cmplt_ps(round, x), one)); + // respect signed zero + ceil = _mm_or_ps(ceil, _mm_and_ps(a, szero)); // if overflow return a - return npyv_select_f32(_mm_cmpeq_epi32(roundi, _mm_castps_si128(szero)), a, rzero); + __m128i overflow_mask = _mm_cmpeq_epi32(roundi, _mm_castps_si128(szero)); + // a if a overflow or nonfinite + return npyv_select_f32(_mm_or_si128(nfinite_mask, overflow_mask), a, ceil); } NPY_FINLINE npyv_f64 npyv_ceil_f64(npyv_f64 a) { - const npyv_f64 szero = _mm_set1_pd(-0.0); - const npyv_f64 one = _mm_set1_pd(1.0); - const npyv_f64 two_power_52 = _mm_set1_pd(0x10000000000000); - npyv_f64 sign_two52 = _mm_or_pd(two_power_52, _mm_and_pd(a, szero)); + const __m128d one = _mm_set1_pd(1.0); + const __m128d szero = _mm_set1_pd(-0.0); + const __m128d two_power_52 = _mm_set1_pd(0x10000000000000); + __m128d nan_mask = _mm_cmpunord_pd(a, a); + // eliminate nans to avoid invalid fp errors within cmpge + __m128d x = _mm_xor_pd(nan_mask, a); + __m128d abs_x = npyv_abs_f64(x); + __m128d sign_x = _mm_and_pd(x, szero); // round by add magic number 2^52 - npyv_f64 round = _mm_sub_pd(_mm_add_pd(a, sign_two52), sign_two52); - npyv_f64 ceil = _mm_add_pd(round, _mm_and_pd(_mm_cmplt_pd(round, a), one)); - // respect signed zero, e.g. 
-0.5 -> -0.0 - return _mm_or_pd(ceil, _mm_and_pd(a, szero)); + // assuming that MXCSR register is set to rounding + __m128d round = _mm_sub_pd(_mm_add_pd(two_power_52, abs_x), two_power_52); + // copysign + round = _mm_or_pd(round, sign_x); + __m128d ceil = _mm_add_pd(round, _mm_and_pd(_mm_cmplt_pd(round, x), one)); + // respects sign of 0.0 + ceil = _mm_or_pd(ceil, sign_x); + // a if |a| >= 2^52 or a == NaN + __m128d mask = _mm_cmpge_pd(abs_x, two_power_52); + mask = _mm_or_pd(mask, nan_mask); + return npyv_select_f64(_mm_castpd_si128(mask), a, ceil); } #endif @@ -333,24 +370,43 @@ NPY_FINLINE npyv_f64 npyv_rint_f64(npyv_f64 a) #else NPY_FINLINE npyv_f32 npyv_trunc_f32(npyv_f32 a) { - const npyv_f32 szero = _mm_set1_ps(-0.0f); - npyv_s32 roundi = _mm_cvttps_epi32(a); - npyv_f32 trunc = _mm_cvtepi32_ps(roundi); + const __m128 szero = _mm_set1_ps(-0.0f); + const __m128i exp_mask = _mm_set1_epi32(0xff000000); + + __m128i nfinite_mask = _mm_slli_epi32(_mm_castps_si128(a), 1); + nfinite_mask = _mm_and_si128(nfinite_mask, exp_mask); + nfinite_mask = _mm_cmpeq_epi32(nfinite_mask, exp_mask); + + // eliminate nans/inf to avoid invalid fp errors + __m128 x = _mm_xor_ps(a, _mm_castsi128_ps(nfinite_mask)); + __m128i trunci = _mm_cvttps_epi32(x); + __m128 trunc = _mm_cvtepi32_ps(trunci); // respect signed zero, e.g. -0.5 -> -0.0 - npyv_f32 rzero = _mm_or_ps(trunc, _mm_and_ps(a, szero)); + trunc = _mm_or_ps(trunc, _mm_and_ps(a, szero)); // if overflow return a - return npyv_select_f32(_mm_cmpeq_epi32(roundi, _mm_castps_si128(szero)), a, rzero); + __m128i overflow_mask = _mm_cmpeq_epi32(trunci, _mm_castps_si128(szero)); + // a if a overflow or nonfinite + return npyv_select_f32(_mm_or_si128(nfinite_mask, overflow_mask), a, trunc); } NPY_FINLINE npyv_f64 npyv_trunc_f64(npyv_f64 a) { - const npyv_f64 szero = _mm_set1_pd(-0.0); - const npyv_f64 one = _mm_set1_pd(1.0); - const npyv_f64 two_power_52 = _mm_set1_pd(0x10000000000000); - npyv_f64 abs_a = npyv_abs_f64(a); + const __m128d one = _mm_set1_pd(1.0); + const __m128d szero = _mm_set1_pd(-0.0); + const __m128d two_power_52 = _mm_set1_pd(0x10000000000000); + __m128d nan_mask = _mm_cmpunord_pd(a, a); + // eliminate nans to avoid invalid fp errors within cmpge + __m128d abs_x = npyv_abs_f64(_mm_xor_pd(nan_mask, a)); // round by add magic number 2^52 - npyv_f64 abs_round = _mm_sub_pd(_mm_add_pd(abs_a, two_power_52), two_power_52); - npyv_f64 subtrahend = _mm_and_pd(_mm_cmpgt_pd(abs_round, abs_a), one); - return _mm_or_pd(_mm_sub_pd(abs_round, subtrahend), _mm_and_pd(a, szero)); + // assuming that MXCSR register is set to rounding + __m128d abs_round = _mm_sub_pd(_mm_add_pd(two_power_52, abs_x), two_power_52); + __m128d subtrahend = _mm_and_pd(_mm_cmpgt_pd(abs_round, abs_x), one); + __m128d trunc = _mm_sub_pd(abs_round, subtrahend); + // copysign + trunc = _mm_or_pd(trunc, _mm_and_pd(a, szero)); + // a if |a| >= 2^52 or a == NaN + __m128d mask = _mm_cmpge_pd(abs_x, two_power_52); + mask = _mm_or_pd(mask, nan_mask); + return npyv_select_f64(_mm_castpd_si128(mask), a, trunc); } #endif @@ -361,15 +417,46 @@ NPY_FINLINE npyv_f64 npyv_rint_f64(npyv_f64 a) #else NPY_FINLINE npyv_f32 npyv_floor_f32(npyv_f32 a) { - const npyv_f32 one = _mm_set1_ps(1.0f); - npyv_f32 round = npyv_rint_f32(a); - return _mm_sub_ps(round, _mm_and_ps(_mm_cmpgt_ps(round, a), one)); + const __m128 one = _mm_set1_ps(1.0f); + const __m128 szero = _mm_set1_ps(-0.0f); + const __m128i exp_mask = _mm_set1_epi32(0xff000000); + + __m128i nfinite_mask = _mm_slli_epi32(_mm_castps_si128(a), 1); + 
nfinite_mask = _mm_and_si128(nfinite_mask, exp_mask); + nfinite_mask = _mm_cmpeq_epi32(nfinite_mask, exp_mask); + + // eliminate nans/inf to avoid invalid fp errors + __m128 x = _mm_xor_ps(a, _mm_castsi128_ps(nfinite_mask)); + __m128i roundi = _mm_cvtps_epi32(x); + __m128 round = _mm_cvtepi32_ps(roundi); + __m128 floor = _mm_sub_ps(round, _mm_and_ps(_mm_cmpgt_ps(round, x), one)); + // respect signed zero + floor = _mm_or_ps(floor, _mm_and_ps(a, szero)); + // if overflow return a + __m128i overflow_mask = _mm_cmpeq_epi32(roundi, _mm_castps_si128(szero)); + // a if a overflow or nonfinite + return npyv_select_f32(_mm_or_si128(nfinite_mask, overflow_mask), a, floor); } NPY_FINLINE npyv_f64 npyv_floor_f64(npyv_f64 a) { - const npyv_f64 one = _mm_set1_pd(1.0); - npyv_f64 round = npyv_rint_f64(a); - return _mm_sub_pd(round, _mm_and_pd(_mm_cmpgt_pd(round, a), one)); + const __m128d one = _mm_set1_pd(1.0f); + const __m128d szero = _mm_set1_pd(-0.0f); + const __m128d two_power_52 = _mm_set1_pd(0x10000000000000); + __m128d nan_mask = _mm_cmpunord_pd(a, a); + // eliminate nans to avoid invalid fp errors within cmpge + __m128d x = _mm_xor_pd(nan_mask, a); + __m128d abs_x = npyv_abs_f64(x); + __m128d sign_x = _mm_and_pd(x, szero); + // round by add magic number 2^52 + // assuming that MXCSR register is set to rounding + __m128d round = _mm_sub_pd(_mm_add_pd(two_power_52, abs_x), two_power_52); + // copysign + round = _mm_or_pd(round, sign_x); + __m128d floor = _mm_sub_pd(round, _mm_and_pd(_mm_cmpgt_pd(round, x), one)); + // a if |a| >= 2^52 or a == NaN + __m128d mask = _mm_cmpge_pd(abs_x, two_power_52); + mask = _mm_or_pd(mask, nan_mask); + return npyv_select_f64(_mm_castpd_si128(mask), a, floor); } #endif // NPY_HAVE_SSE41 diff --git a/numpy/core/src/common/simd/sse/memory.h b/numpy/core/src/common/simd/sse/memory.h index 3ff64848d..4c8e86a6f 100644 --- a/numpy/core/src/common/simd/sse/memory.h +++ b/numpy/core/src/common/simd/sse/memory.h @@ -103,6 +103,28 @@ NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride) { return _mm_castpd_si128(npyv_loadn_f64((const double*)ptr, stride)); } NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride) { return _mm_castpd_si128(npyv_loadn_f64((const double*)ptr, stride)); } + +//// 64-bit load over 32-bit stride +NPY_FINLINE npyv_f32 npyv_loadn2_f32(const float *ptr, npy_intp stride) +{ + __m128d r = _mm_loadh_pd( + npyv_loadl_f64((const double*)ptr), (const double*)(ptr + stride) + ); + return _mm_castpd_ps(r); +} +NPY_FINLINE npyv_u32 npyv_loadn2_u32(const npy_uint32 *ptr, npy_intp stride) +{ return _mm_castps_si128(npyv_loadn2_f32((const float*)ptr, stride)); } +NPY_FINLINE npyv_s32 npyv_loadn2_s32(const npy_int32 *ptr, npy_intp stride) +{ return _mm_castps_si128(npyv_loadn2_f32((const float*)ptr, stride)); } + +//// 128-bit load over 64-bit stride +NPY_FINLINE npyv_f64 npyv_loadn2_f64(const double *ptr, npy_intp stride) +{ (void)stride; return npyv_load_f64(ptr); } +NPY_FINLINE npyv_u64 npyv_loadn2_u64(const npy_uint64 *ptr, npy_intp stride) +{ (void)stride; return npyv_load_u64(ptr); } +NPY_FINLINE npyv_s64 npyv_loadn2_s64(const npy_int64 *ptr, npy_intp stride) +{ (void)stride; return npyv_load_s64(ptr); } + /*************************** * Non-contiguous Store ***************************/ @@ -135,6 +157,24 @@ NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a) NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a) { npyv_storen_f64((double*)ptr, stride, 
_mm_castsi128_pd(a)); } +//// 64-bit store over 32-bit stride +NPY_FINLINE void npyv_storen2_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a) +{ + _mm_storel_pd((double*)ptr, _mm_castsi128_pd(a)); + _mm_storeh_pd((double*)(ptr + stride), _mm_castsi128_pd(a)); +} +NPY_FINLINE void npyv_storen2_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a) +{ npyv_storen2_u32((npy_uint32*)ptr, stride, a); } +NPY_FINLINE void npyv_storen2_f32(float *ptr, npy_intp stride, npyv_f32 a) +{ npyv_storen2_u32((npy_uint32*)ptr, stride, _mm_castps_si128(a)); } + +//// 128-bit store over 64-bit stride +NPY_FINLINE void npyv_storen2_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a) +{ (void)stride; npyv_store_u64(ptr, a); } +NPY_FINLINE void npyv_storen2_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a) +{ (void)stride; npyv_store_s64(ptr, a); } +NPY_FINLINE void npyv_storen2_f64(double *ptr, npy_intp stride, npyv_f64 a) +{ (void)stride; npyv_store_f64(ptr, a); } /********************************* * Partial Load *********************************/ @@ -204,13 +244,14 @@ NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) return _mm_cvtsi32_si128(*ptr); case 2: return _mm_loadl_epi64((const __m128i*)ptr); - case 3:; - npyv_s32 a = _mm_loadl_epi64((const __m128i*)ptr); - #ifdef NPY_HAVE_SSE41 - return _mm_insert_epi32(a, ptr[2], 2); - #else - return _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[2])); - #endif + case 3: { + npyv_s32 a = _mm_loadl_epi64((const __m128i*)ptr); + #ifdef NPY_HAVE_SSE41 + return _mm_insert_epi32(a, ptr[2], 2); + #else + return _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[2])); + #endif + } default: return npyv_load_s32(ptr); } @@ -246,6 +287,32 @@ NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) } return npyv_load_s64(ptr); } + +//// 64-bit nlane +NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane, + npy_int32 fill_lo, npy_int32 fill_hi) +{ + assert(nlane > 0); + if (nlane == 1) { + const __m128i vfill = npyv_set_s32(fill_lo, fill_hi, fill_lo, fill_hi); + return _mm_castpd_si128( + _mm_loadl_pd(_mm_castsi128_pd(vfill), (double*)ptr) + ); + } + return npyv_load_s32(ptr); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) +{ return npyv_load_tillz_s64((const npy_int64*)ptr, nlane); } + +//// 128-bit nlane +NPY_FINLINE npyv_s64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane, + npy_int64 fill_lo, npy_int64 fill_hi) +{ (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); } + +NPY_FINLINE npyv_s64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) +{ (void)nlane; return npyv_load_s64(ptr); } + /********************************* * Non-contiguous partial load *********************************/ @@ -305,23 +372,27 @@ npyv_loadn_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane) case 1: return _mm_cvtsi32_si128(ptr[0]); case 2:; - npyv_s32 a = _mm_cvtsi32_si128(ptr[0]); -#ifdef NPY_HAVE_SSE41 - return _mm_insert_epi32(a, ptr[stride], 1); -#else - return _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride])); -#endif // NPY_HAVE_SSE41 - case 3:; - a = _mm_cvtsi32_si128(ptr[0]); -#ifdef NPY_HAVE_SSE41 - a = _mm_insert_epi32(a, ptr[stride], 1); - a = _mm_insert_epi32(a, ptr[stride*2], 2); - return a; -#else - a = _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride])); - a = _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[stride*2])); - return a; -#endif // NPY_HAVE_SSE41 + { + npyv_s32 a = _mm_cvtsi32_si128(ptr[0]); + 
#ifdef NPY_HAVE_SSE41 + return _mm_insert_epi32(a, ptr[stride], 1); + #else + return _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride])); + #endif // NPY_HAVE_SSE41 + } + case 3: + { + npyv_s32 a = _mm_cvtsi32_si128(ptr[0]); + #ifdef NPY_HAVE_SSE41 + a = _mm_insert_epi32(a, ptr[stride], 1); + a = _mm_insert_epi32(a, ptr[stride*2], 2); + return a; + #else + a = _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride])); + a = _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[stride*2])); + return a; + #endif // NPY_HAVE_SSE41 + } default: return npyv_loadn_s32(ptr, stride); } @@ -358,6 +429,37 @@ NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, } return npyv_loadn_s64(ptr, stride); } + +//// 64-bit load over 32-bit stride +NPY_FINLINE npyv_s32 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, + npy_int32 fill_lo, npy_int32 fill_hi) +{ + assert(nlane > 0); + if (nlane == 1) { + const __m128i vfill = npyv_set_s32(0, 0, fill_lo, fill_hi); + return _mm_castpd_si128( + _mm_loadl_pd(_mm_castsi128_pd(vfill), (double*)ptr) + ); + } + return npyv_loadn2_s32(ptr, stride); +} +NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane) +{ + assert(nlane > 0); + if (nlane == 1) { + return _mm_loadl_epi64((const __m128i*)ptr); + } + return npyv_loadn2_s32(ptr, stride); +} + +//// 128-bit load over 64-bit stride +NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, + npy_int64 fill_lo, npy_int64 fill_hi) +{ assert(nlane > 0); (void)stride; (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); } + +NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane) +{ assert(nlane > 0); (void)stride; (void)nlane; return npyv_load_s64(ptr); } + /********************************* * Partial store *********************************/ @@ -394,6 +496,17 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a } npyv_store_s64(ptr, a); } +//// 64-bit nlane +NPY_FINLINE void npyv_store2_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a) +{ npyv_store_till_s64((npy_int64*)ptr, nlane, a); } + +//// 128-bit nlane +NPY_FINLINE void npyv_store2_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a) +{ + assert(nlane > 0); (void)nlane; + npyv_store_s64(ptr, a); +} + /********************************* * Non-contiguous partial store *********************************/ @@ -401,25 +514,35 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a) { assert(nlane > 0); + ptr[stride*0] = _mm_cvtsi128_si32(a); switch(nlane) { + case 1: + return; #ifdef NPY_HAVE_SSE41 - default: - ptr[stride*3] = _mm_extract_epi32(a, 3); + case 2: + ptr[stride*1] = _mm_extract_epi32(a, 1); + return; case 3: + ptr[stride*1] = _mm_extract_epi32(a, 1); ptr[stride*2] = _mm_extract_epi32(a, 2); - case 2: + return; + default: ptr[stride*1] = _mm_extract_epi32(a, 1); + ptr[stride*2] = _mm_extract_epi32(a, 2); + ptr[stride*3] = _mm_extract_epi32(a, 3); #else - default: - ptr[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 3))); + case 2: + ptr[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 1))); + return; case 3: + ptr[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 1))); ptr[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 2))); - 
case 2: + return; + default: ptr[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 1))); + ptr[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 2))); + ptr[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 3))); #endif - case 1: - ptr[stride*0] = _mm_cvtsi128_si32(a); - break; } } //// 64 @@ -432,6 +555,21 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp } npyv_storen_s64(ptr, stride, a); } + +//// 64-bit store over 32-bit stride +NPY_FINLINE void npyv_storen2_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a) +{ + assert(nlane > 0); + npyv_storel_s32(ptr, a); + if (nlane > 1) { + npyv_storeh_s32(ptr + stride, a); + } +} + +//// 128-bit store over 64-bit stride +NPY_FINLINE void npyv_storen2_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a) +{ assert(nlane > 0); (void)stride; (void)nlane; npyv_store_s64(ptr, a); } + /***************************************************************** * Implement partial load/store for u32/f32/u64/f64... via casting *****************************************************************/ @@ -442,7 +580,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp union { \ npyv_lanetype_##F_SFX from_##F_SFX; \ npyv_lanetype_##T_SFX to_##T_SFX; \ - } pun = {.from_##F_SFX = fill}; \ + } pun; \ + pun.from_##F_SFX = fill; \ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX( \ (const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX \ )); \ @@ -454,7 +593,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp union { \ npyv_lanetype_##F_SFX from_##F_SFX; \ npyv_lanetype_##T_SFX to_##T_SFX; \ - } pun = {.from_##F_SFX = fill}; \ + } pun; \ + pun.from_##F_SFX = fill; \ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX( \ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX \ )); \ @@ -495,6 +635,110 @@ NPYV_IMPL_SSE_REST_PARTIAL_TYPES(f32, s32) NPYV_IMPL_SSE_REST_PARTIAL_TYPES(u64, s64) NPYV_IMPL_SSE_REST_PARTIAL_TYPES(f64, s64) +// 128-bit/64-bit stride +#define NPYV_IMPL_SSE_REST_PARTIAL_TYPES_PAIR(F_SFX, T_SFX) \ + NPY_FINLINE npyv_##F_SFX npyv_load2_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, \ + npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \ + { \ + union pun { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + }; \ + union pun pun_lo; \ + union pun pun_hi; \ + pun_lo.from_##F_SFX = fill_lo; \ + pun_hi.from_##F_SFX = fill_hi; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane, pun_lo.to_##T_SFX, pun_hi.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn2_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \ + npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \ + { \ + union pun { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + }; \ + union pun pun_lo; \ + union pun pun_hi; \ + pun_lo.from_##F_SFX = fill_lo; \ + pun_hi.from_##F_SFX = fill_hi; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun_lo.to_##T_SFX, \ + pun_hi.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_load2_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \ + { \ + return 
npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn2_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \ + { \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \ + )); \ + } \ + NPY_FINLINE void npyv_store2_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_store2_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } \ + NPY_FINLINE void npyv_storen2_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_storen2_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } + +NPYV_IMPL_SSE_REST_PARTIAL_TYPES_PAIR(u32, s32) +NPYV_IMPL_SSE_REST_PARTIAL_TYPES_PAIR(f32, s32) +NPYV_IMPL_SSE_REST_PARTIAL_TYPES_PAIR(u64, s64) +NPYV_IMPL_SSE_REST_PARTIAL_TYPES_PAIR(f64, s64) + +/************************************************************ + * de-interlave load / interleave contiguous store + ************************************************************/ +// two channels +#define NPYV_IMPL_SSE_MEM_INTERLEAVE(SFX, ZSFX) \ + NPY_FINLINE npyv_##ZSFX##x2 npyv_zip_##ZSFX(npyv_##ZSFX, npyv_##ZSFX); \ + NPY_FINLINE npyv_##ZSFX##x2 npyv_unzip_##ZSFX(npyv_##ZSFX, npyv_##ZSFX); \ + NPY_FINLINE npyv_##SFX##x2 npyv_load_##SFX##x2( \ + const npyv_lanetype_##SFX *ptr \ + ) { \ + return npyv_unzip_##ZSFX( \ + npyv_load_##SFX(ptr), npyv_load_##SFX(ptr+npyv_nlanes_##SFX) \ + ); \ + } \ + NPY_FINLINE void npyv_store_##SFX##x2( \ + npyv_lanetype_##SFX *ptr, npyv_##SFX##x2 v \ + ) { \ + npyv_##SFX##x2 zip = npyv_zip_##ZSFX(v.val[0], v.val[1]); \ + npyv_store_##SFX(ptr, zip.val[0]); \ + npyv_store_##SFX(ptr + npyv_nlanes_##SFX, zip.val[1]); \ + } + +NPYV_IMPL_SSE_MEM_INTERLEAVE(u8, u8) +NPYV_IMPL_SSE_MEM_INTERLEAVE(s8, u8) +NPYV_IMPL_SSE_MEM_INTERLEAVE(u16, u16) +NPYV_IMPL_SSE_MEM_INTERLEAVE(s16, u16) +NPYV_IMPL_SSE_MEM_INTERLEAVE(u32, u32) +NPYV_IMPL_SSE_MEM_INTERLEAVE(s32, u32) +NPYV_IMPL_SSE_MEM_INTERLEAVE(u64, u64) +NPYV_IMPL_SSE_MEM_INTERLEAVE(s64, u64) +NPYV_IMPL_SSE_MEM_INTERLEAVE(f32, f32) +NPYV_IMPL_SSE_MEM_INTERLEAVE(f64, f64) + /********************************* * Lookup table *********************************/ diff --git a/numpy/core/src/common/simd/sse/reorder.h b/numpy/core/src/common/simd/sse/reorder.h index d96ab9c56..9a57f6489 100644 --- a/numpy/core/src/common/simd/sse/reorder.h +++ b/numpy/core/src/common/simd/sse/reorder.h @@ -81,6 +81,75 @@ NPYV_IMPL_SSE_ZIP(npyv_s64, s64, epi64) NPYV_IMPL_SSE_ZIP(npyv_f32, f32, ps) NPYV_IMPL_SSE_ZIP(npyv_f64, f64, pd) +// deinterleave two vectors +NPY_FINLINE npyv_u8x2 npyv_unzip_u8(npyv_u8 ab0, npyv_u8 ab1) +{ +#ifdef NPY_HAVE_SSSE3 + const __m128i idx = _mm_setr_epi8( + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 + ); + __m128i abl = _mm_shuffle_epi8(ab0, idx); + __m128i abh = _mm_shuffle_epi8(ab1, idx); + return npyv_combine_u8(abl, abh); +#else + __m128i ab_083b = _mm_unpacklo_epi8(ab0, ab1); + __m128i ab_4c6e = _mm_unpackhi_epi8(ab0, ab1); + __m128i ab_048c = _mm_unpacklo_epi8(ab_083b, ab_4c6e); + __m128i ab_36be = _mm_unpackhi_epi8(ab_083b, ab_4c6e); + __m128i ab_0346 = _mm_unpacklo_epi8(ab_048c, ab_36be); + __m128i ab_8bc8 = _mm_unpackhi_epi8(ab_048c, ab_36be); + npyv_u8x2 r; + r.val[0] = _mm_unpacklo_epi8(ab_0346, ab_8bc8); 
+ r.val[1] = _mm_unpackhi_epi8(ab_0346, ab_8bc8); + return r; +#endif +} +#define npyv_unzip_s8 npyv_unzip_u8 + +NPY_FINLINE npyv_u16x2 npyv_unzip_u16(npyv_u16 ab0, npyv_u16 ab1) +{ +#ifdef NPY_HAVE_SSSE3 + const __m128i idx = _mm_setr_epi8( + 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15 + ); + __m128i abl = _mm_shuffle_epi8(ab0, idx); + __m128i abh = _mm_shuffle_epi8(ab1, idx); + return npyv_combine_u16(abl, abh); +#else + __m128i ab_0415 = _mm_unpacklo_epi16(ab0, ab1); + __m128i ab_263f = _mm_unpackhi_epi16(ab0, ab1); + __m128i ab_0246 = _mm_unpacklo_epi16(ab_0415, ab_263f); + __m128i ab_135f = _mm_unpackhi_epi16(ab_0415, ab_263f); + npyv_u16x2 r; + r.val[0] = _mm_unpacklo_epi16(ab_0246, ab_135f); + r.val[1] = _mm_unpackhi_epi16(ab_0246, ab_135f); + return r; +#endif +} +#define npyv_unzip_s16 npyv_unzip_u16 + +NPY_FINLINE npyv_u32x2 npyv_unzip_u32(npyv_u32 ab0, npyv_u32 ab1) +{ + __m128i abl = _mm_shuffle_epi32(ab0, _MM_SHUFFLE(3, 1, 2, 0)); + __m128i abh = _mm_shuffle_epi32(ab1, _MM_SHUFFLE(3, 1, 2, 0)); + return npyv_combine_u32(abl, abh); +} +#define npyv_unzip_s32 npyv_unzip_u32 + +NPY_FINLINE npyv_u64x2 npyv_unzip_u64(npyv_u64 ab0, npyv_u64 ab1) +{ return npyv_combine_u64(ab0, ab1); } +#define npyv_unzip_s64 npyv_unzip_u64 + +NPY_FINLINE npyv_f32x2 npyv_unzip_f32(npyv_f32 ab0, npyv_f32 ab1) +{ + npyv_f32x2 r; + r.val[0] = _mm_shuffle_ps(ab0, ab1, _MM_SHUFFLE(2, 0, 2, 0)); + r.val[1] = _mm_shuffle_ps(ab0, ab1, _MM_SHUFFLE(3, 1, 3, 1)); + return r; +} +NPY_FINLINE npyv_f64x2 npyv_unzip_f64(npyv_f64 ab0, npyv_f64 ab1) +{ return npyv_combine_f64(ab0, ab1); } + // Reverse elements of each 64-bit lane NPY_FINLINE npyv_u16 npyv_rev64_u16(npyv_u16 a) { @@ -122,4 +191,22 @@ NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a) return _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)); } +// Permuting the elements of each 128-bit lane by immediate index for +// each element. +#define npyv_permi128_u32(A, E0, E1, E2, E3) \ + _mm_shuffle_epi32(A, _MM_SHUFFLE(E3, E2, E1, E0)) + +#define npyv_permi128_s32 npyv_permi128_u32 + +#define npyv_permi128_u64(A, E0, E1) \ + _mm_shuffle_epi32(A, _MM_SHUFFLE(((E1)<<1)+1, ((E1)<<1), ((E0)<<1)+1, ((E0)<<1))) + +#define npyv_permi128_s64 npyv_permi128_u64 + +#define npyv_permi128_f32(A, E0, E1, E2, E3) \ + _mm_shuffle_ps(A, A, _MM_SHUFFLE(E3, E2, E1, E0)) + +#define npyv_permi128_f64(A, E0, E1) \ + _mm_shuffle_pd(A, A, _MM_SHUFFLE2(E1, E0)) + #endif // _NPY_SIMD_SSE_REORDER_H diff --git a/numpy/core/src/common/simd/sse/sse.h b/numpy/core/src/common/simd/sse/sse.h index c21bbfda7..0c6b8cdba 100644 --- a/numpy/core/src/common/simd/sse/sse.h +++ b/numpy/core/src/common/simd/sse/sse.h @@ -12,6 +12,7 @@ #define NPY_SIMD_FMA3 0 // fast emulated #endif #define NPY_SIMD_BIGENDIAN 0 +#define NPY_SIMD_CMPSIGNAL 1 typedef __m128i npyv_u8; typedef __m128i npyv_s8; diff --git a/numpy/core/src/common/simd/vec/arithmetic.h b/numpy/core/src/common/simd/vec/arithmetic.h index a2e9d07eb..85f4d6b26 100644 --- a/numpy/core/src/common/simd/vec/arithmetic.h +++ b/numpy/core/src/common/simd/vec/arithmetic.h @@ -322,6 +322,20 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor) NPY_FINLINE npyv_f64 npyv_nmulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) { return vec_neg(vec_madd(a, b, c)); } #endif +// multiply, add for odd elements and subtract even elements. 
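Lane-wise this means r[0] = a[0]*b[0] - c[0], r[1] = a[1]*b[1] + c[1], r[2] = a[2]*b[2] - c[2], r[3] = a[3]*b[3] + c[3]; for instance a = {1, 2, 1, 2}, b = {3, 3, 3, 3}, c = {10, 10, 10, 10} gives {-7, 16, -7, 16}. The emulation below obtains this by XORing the even lanes of c with -0.0 (flipping only their sign bit) and then issuing a plain fused multiply-add; the interleaved subtract/add pattern is the shape needed when even lanes carry real parts and odd lanes carry imaginary parts, e.g. in complex multiplication.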
+// (a * b) -+ c +#if NPY_SIMD_F32 +NPY_FINLINE npyv_f32 npyv_muladdsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) +{ + const npyv_f32 msign = npyv_set_f32(-0.0f, 0.0f, -0.0f, 0.0f); + return npyv_muladd_f32(a, b, npyv_xor_f32(msign, c)); +} +#endif +NPY_FINLINE npyv_f64 npyv_muladdsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) +{ + const npyv_f64 msign = npyv_set_f64(-0.0, 0.0); + return npyv_muladd_f64(a, b, npyv_xor_f64(msign, c)); +} /*************************** * Summation ***************************/ @@ -352,6 +366,7 @@ NPY_FINLINE float npyv_sum_f32(npyv_f32 a) { npyv_f32 sum = vec_add(a, npyv_combineh_f32(a, a)); return vec_extract(sum, 0) + vec_extract(sum, 1); + (void)sum; } #endif @@ -372,6 +387,7 @@ NPY_FINLINE npy_uint16 npyv_sumup_u8(npyv_u8 a) npyv_u32 four = vec_sum4s(a, zero); npyv_s32 one = vec_sums((npyv_s32)four, (npyv_s32)zero); return (npy_uint16)vec_extract(one, 3); + (void)one; #endif } @@ -386,6 +402,7 @@ NPY_FINLINE npy_uint32 npyv_sumup_u16(npyv_u16 a) npyv_u32 four = vec_add(eight.val[0], eight.val[1]); npyv_s32 one = vec_sums((npyv_s32)four, zero); return (npy_uint32)vec_extract(one, 3); + (void)one; #endif } diff --git a/numpy/core/src/common/simd/vec/conversion.h b/numpy/core/src/common/simd/vec/conversion.h index f0d625c55..922109f7b 100644 --- a/numpy/core/src/common/simd/vec/conversion.h +++ b/numpy/core/src/common/simd/vec/conversion.h @@ -96,6 +96,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d, #else return vec_extract(r, 4); #endif + // to suppress ambiguous warning: variable `r` but not used [-Wunused-but-set-variable] + (void)r; } NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a) { @@ -106,6 +108,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d, #else return vec_extract(r, 8); #endif + // to suppress ambiguous warning: variable `r` but not used [-Wunused-but-set-variable] + (void)r; } NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a) { @@ -120,6 +124,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d, #else return vec_extract(r, 8); #endif + // to suppress ambiguous warning: variable `r` but not used [-Wunused-but-set-variable] + (void)r; } NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a) { @@ -134,6 +140,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d, #else return vec_extract(r, 8); #endif + // to suppress ambiguous warning: variable `r` but not used [-Wunused-but-set-variable] + (void)r; } #else NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a) @@ -170,7 +178,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d, #ifdef NPY_HAVE_VXE2 return vec_signed(a); #elif defined(NPY_HAVE_VXE) - return vec_packs(vec_signed(npyv_doublee(a)), vec_signed(npyv_doublee(vec_mergel(a, a)))); + return vec_packs(vec_signed(npyv_doublee(vec_mergeh(a,a))), + vec_signed(npyv_doublee(vec_mergel(a, a)))); // VSX #elif defined(__IBMC__) return vec_cts(a, 0); diff --git a/numpy/core/src/common/simd/vec/math.h b/numpy/core/src/common/simd/vec/math.h index 95b16fdf7..85690f76c 100644 --- a/numpy/core/src/common/simd/vec/math.h +++ b/numpy/core/src/common/simd/vec/math.h @@ -186,6 +186,7 @@ NPY_IMPL_VEC_REDUCE_MINMAX(max, int32, s32) { \ npyv_##SFX r = vec_##INTRIN(a, vec_sld(a, a, 8)); \ return (npy_##STYPE)vec_extract(r, 0); \ + (void)r; \ } NPY_IMPL_VEC_REDUCE_MINMAX(min, uint64, u64) NPY_IMPL_VEC_REDUCE_MINMAX(max, uint64, u64) @@ -225,6 +226,7 @@ NPY_IMPL_VEC_REDUCE_MINMAX(max, int64, s64) { \ npyv_f64 r = vec_##INTRIN(a, vec_sld(a, a, 8)); \ return vec_extract(r, 0); \ + (void)r; 
\ } \ NPY_FINLINE double npyv_reduce_##INTRIN##n_f64(npyv_f64 a) \ { \ diff --git a/numpy/core/src/common/simd/vec/memory.h b/numpy/core/src/common/simd/vec/memory.h index e8f588ef2..1ad39cead 100644 --- a/numpy/core/src/common/simd/vec/memory.h +++ b/numpy/core/src/common/simd/vec/memory.h @@ -46,14 +46,8 @@ #endif // avoid aliasing rules -#ifdef __cplusplus - template<typename T_PTR> - NPY_FINLINE npy_uint64 *npyv__ptr2u64(const T_PTR *ptr) - { npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr64; } -#else - NPY_FINLINE npy_uint64 *npyv__ptr2u64(const void *ptr) - { npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr64; } -#endif // __cplusplus +NPY_FINLINE npy_uint64 *npyv__ptr2u64(const void *ptr) +{ npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr64; } // load lower part NPY_FINLINE npyv_u64 npyv__loadl(const void *ptr) @@ -136,6 +130,24 @@ NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride) { return npyv_set_s64(ptr[0], ptr[stride]); } NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride) { return npyv_set_f64(ptr[0], ptr[stride]); } + +//// 64-bit load over 32-bit stride +NPY_FINLINE npyv_u32 npyv_loadn2_u32(const npy_uint32 *ptr, npy_intp stride) +{ return (npyv_u32)npyv_set_u64(*(npy_uint64*)ptr, *(npy_uint64*)(ptr + stride)); } +NPY_FINLINE npyv_s32 npyv_loadn2_s32(const npy_int32 *ptr, npy_intp stride) +{ return (npyv_s32)npyv_set_u64(*(npy_uint64*)ptr, *(npy_uint64*)(ptr + stride)); } +#if NPY_SIMD_F32 +NPY_FINLINE npyv_f32 npyv_loadn2_f32(const float *ptr, npy_intp stride) +{ return (npyv_f32)npyv_set_u64(*(npy_uint64*)ptr, *(npy_uint64*)(ptr + stride)); } +#endif +//// 128-bit load over 64-bit stride +NPY_FINLINE npyv_u64 npyv_loadn2_u64(const npy_uint64 *ptr, npy_intp stride) +{ (void)stride; return npyv_load_u64(ptr); } +NPY_FINLINE npyv_s64 npyv_loadn2_s64(const npy_int64 *ptr, npy_intp stride) +{ (void)stride; return npyv_load_s64(ptr); } +NPY_FINLINE npyv_f64 npyv_loadn2_f64(const double *ptr, npy_intp stride) +{ (void)stride; return npyv_load_f64(ptr); } + /*************************** * Non-contiguous Store ***************************/ @@ -164,6 +176,26 @@ NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a) NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a) { npyv_storen_u64((npy_uint64*)ptr, stride, (npyv_u64)a); } +//// 64-bit store over 32-bit stride +NPY_FINLINE void npyv_storen2_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a) +{ + *(npy_uint64*)ptr = vec_extract((npyv_u64)a, 0); + *(npy_uint64*)(ptr + stride) = vec_extract((npyv_u64)a, 1); +} +NPY_FINLINE void npyv_storen2_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a) +{ npyv_storen2_u32((npy_uint32*)ptr, stride, (npyv_u32)a); } +#if NPY_SIMD_F32 +NPY_FINLINE void npyv_storen2_f32(float *ptr, npy_intp stride, npyv_f32 a) +{ npyv_storen2_u32((npy_uint32*)ptr, stride, (npyv_u32)a); } +#endif +//// 128-bit store over 64-bit stride +NPY_FINLINE void npyv_storen2_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a) +{ (void)stride; npyv_store_u64(ptr, a); } +NPY_FINLINE void npyv_storen2_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a) +{ (void)stride; npyv_store_s64(ptr, a); } +NPY_FINLINE void npyv_storen2_f64(double *ptr, npy_intp stride, npyv_f64 a) +{ (void)stride; npyv_store_f64(ptr, a); } + /********************************* * Partial Load *********************************/ @@ -173,9 +205,9 @@ NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, n assert(nlane > 0); npyv_s32 vfill = 
npyv_setall_s32(fill); #ifdef NPY_HAVE_VX - const unsigned blane = (unsigned short)nlane; + const unsigned blane = (nlane > 4) ? 4 : nlane; const npyv_u32 steps = npyv_set_u32(0, 1, 2, 3); - const npyv_u32 vlane = npyv_setall_u32((unsigned)blane); + const npyv_u32 vlane = npyv_setall_u32(blane); const npyv_b32 mask = vec_cmpgt(vlane, steps); npyv_s32 a = vec_load_len(ptr, blane*4-1); return vec_sel(vfill, a, mask); @@ -201,8 +233,8 @@ NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, n NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) { #ifdef NPY_HAVE_VX - unsigned blane = ((unsigned short)nlane)*4 - 1; - return vec_load_len(ptr, blane); + unsigned blane = (nlane > 4) ? 4 : nlane; + return vec_load_len(ptr, blane*4-1); #else return npyv_load_till_s32(ptr, nlane, 0); #endif @@ -220,12 +252,33 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) { #ifdef NPY_HAVE_VX - unsigned blane = (unsigned short)nlane; + unsigned blane = (nlane > 2) ? 2 : nlane; return vec_load_len((const signed long long*)ptr, blane*8-1); #else return npyv_load_till_s64(ptr, nlane, 0); #endif } +//// 64-bit nlane +NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane, + npy_int32 fill_lo, npy_int32 fill_hi) +{ + assert(nlane > 0); + if (nlane == 1) { + return npyv_set_s32(ptr[0], ptr[1], fill_lo, fill_hi); + } + return npyv_load_s32(ptr); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) +{ return (npyv_s32)npyv_load_tillz_s64((const npy_int64*)ptr, nlane); } + +//// 128-bit nlane +NPY_FINLINE npyv_s64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane, + npy_int64 fill_lo, npy_int64 fill_hi) +{ (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); } + +NPY_FINLINE npyv_s64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) +{ (void)nlane; return npyv_load_s64(ptr); } /********************************* * Non-contiguous partial load *********************************/ @@ -265,6 +318,34 @@ npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_ // fill zero to rest lanes NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane) { return npyv_loadn_till_s64(ptr, stride, nlane, 0); } + +//// 64-bit load over 32-bit stride +NPY_FINLINE npyv_s32 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, + npy_int32 fill_lo, npy_int32 fill_hi) +{ + assert(nlane > 0); + if (nlane == 1) { + return npyv_set_s32(ptr[0], ptr[1], fill_lo, fill_hi); + } + return npyv_loadn2_s32(ptr, stride); +} +NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane) +{ + assert(nlane > 0); + if (nlane == 1) { + return (npyv_s32)npyv_set_s64(*(npy_int64*)ptr, 0); + } + return npyv_loadn2_s32(ptr, stride); +} + +//// 128-bit load over 64-bit stride +NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, + npy_int64 fill_lo, npy_int64 fill_hi) +{ assert(nlane > 0); (void)stride; (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); } + +NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane) +{ assert(nlane > 0); (void)stride; (void)nlane; return npyv_load_s64(ptr); } + /********************************* * Partial store 
*********************************/ @@ -273,7 +354,7 @@ NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a { assert(nlane > 0); #ifdef NPY_HAVE_VX - unsigned blane = (unsigned short)nlane; + unsigned blane = (nlane > 4) ? 4 : nlane; vec_store_len(a, ptr, blane*4-1); #else switch(nlane) { @@ -297,7 +378,7 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a { assert(nlane > 0); #ifdef NPY_HAVE_VX - unsigned blane = (unsigned short)nlane; + unsigned blane = (nlane > 2) ? 2 : nlane; vec_store_len(a, (signed long long*)ptr, blane*8-1); #else if (nlane == 1) { @@ -307,6 +388,18 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a npyv_store_s64(ptr, a); #endif } + +//// 64-bit nlane +NPY_FINLINE void npyv_store2_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a) +{ npyv_store_till_s64((npy_int64*)ptr, nlane, (npyv_s64)a); } + +//// 128-bit nlane +NPY_FINLINE void npyv_store2_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a) +{ + assert(nlane > 0); (void)nlane; + npyv_store_s64(ptr, a); +} + /********************************* * Non-contiguous partial store *********************************/ @@ -314,16 +407,21 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a) { assert(nlane > 0); + ptr[stride*0] = vec_extract(a, 0); switch(nlane) { - default: - ptr[stride*3] = vec_extract(a, 3); - case 3: - ptr[stride*2] = vec_extract(a, 2); + case 1: + return; case 2: ptr[stride*1] = vec_extract(a, 1); - case 1: - ptr[stride*0] = vec_extract(a, 0); - break; + return; + case 3: + ptr[stride*1] = vec_extract(a, 1); + ptr[stride*2] = vec_extract(a, 2); + return; + default: + ptr[stride*1] = vec_extract(a, 1); + ptr[stride*2] = vec_extract(a, 2); + ptr[stride*3] = vec_extract(a, 3); } } //// 64 @@ -336,6 +434,21 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp } npyv_storen_s64(ptr, stride, a); } + +//// 64-bit store over 32-bit stride +NPY_FINLINE void npyv_storen2_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a) +{ + assert(nlane > 0); + npyv_storel_s32(ptr, a); + if (nlane > 1) { + npyv_storeh_s32(ptr + stride, a); + } +} + +//// 128-bit store over 64-bit stride +NPY_FINLINE void npyv_storen2_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a) +{ assert(nlane > 0); (void)stride; (void)nlane; npyv_store_s64(ptr, a); } + /***************************************************************** * Implement partial load/store for u32/f32/u64/f64... 
via casting *****************************************************************/ @@ -346,7 +459,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp union { \ npyv_lanetype_##F_SFX from_##F_SFX; \ npyv_lanetype_##T_SFX to_##T_SFX; \ - } pun = {.from_##F_SFX = fill}; \ + } pun; \ + pun.from_##F_SFX = fill; \ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX( \ (const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX \ )); \ @@ -358,7 +472,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp union { \ npyv_lanetype_##F_SFX from_##F_SFX; \ npyv_lanetype_##T_SFX to_##T_SFX; \ - } pun = {.from_##F_SFX = fill}; \ + } pun; \ + pun.from_##F_SFX = fill; \ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX( \ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX \ )); \ @@ -401,6 +516,114 @@ NPYV_IMPL_VEC_REST_PARTIAL_TYPES(f32, s32) NPYV_IMPL_VEC_REST_PARTIAL_TYPES(u64, s64) NPYV_IMPL_VEC_REST_PARTIAL_TYPES(f64, s64) +// 128-bit/64-bit stride +#define NPYV_IMPL_VEC_REST_PARTIAL_TYPES_PAIR(F_SFX, T_SFX) \ + NPY_FINLINE npyv_##F_SFX npyv_load2_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, \ + npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \ + { \ + union pun { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + }; \ + union pun pun_lo; \ + union pun pun_hi; \ + pun_lo.from_##F_SFX = fill_lo; \ + pun_hi.from_##F_SFX = fill_hi; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane, pun_lo.to_##T_SFX, pun_hi.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn2_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \ + npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \ + { \ + union pun { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + }; \ + union pun pun_lo; \ + union pun pun_hi; \ + pun_lo.from_##F_SFX = fill_lo; \ + pun_hi.from_##F_SFX = fill_hi; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun_lo.to_##T_SFX, \ + pun_hi.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_load2_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \ + { \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn2_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \ + { \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \ + )); \ + } \ + NPY_FINLINE void npyv_store2_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_store2_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } \ + NPY_FINLINE void npyv_storen2_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_storen2_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } + +NPYV_IMPL_VEC_REST_PARTIAL_TYPES_PAIR(u32, s32) +#if NPY_SIMD_F32 +NPYV_IMPL_VEC_REST_PARTIAL_TYPES_PAIR(f32, s32) +#endif +NPYV_IMPL_VEC_REST_PARTIAL_TYPES_PAIR(u64, s64) +NPYV_IMPL_VEC_REST_PARTIAL_TYPES_PAIR(f64, s64) + 
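The pair-wise (2-lane) load/store helpers and the fused multiply add/subtract introduced above are easiest to read as per-lane scalar operations. The sketch below is a hedged scalar model of that behaviour, not part of the NumPy sources; the ref_* names are purely illustrative.

#include <stddef.h>

/* (a * b) -+ c: subtract c on even lanes, add c on odd lanes, matching
 * npyv_muladdsub_f32/_f64 (and _mm256_fmaddsub_ps on the AVX2 path). */
static void ref_muladdsub_f32(const float *a, const float *b, const float *c,
                              float *r, size_t nlanes)
{
    for (size_t i = 0; i < nlanes; ++i) {
        r[i] = (i % 2 == 0) ? a[i] * b[i] - c[i]
                            : a[i] * b[i] + c[i];
    }
}

/* npyv_loadn2_u32: pair i is read contiguously from ptr[stride*i] and
 * ptr[stride*i + 1]; the stride is counted in 32-bit lanes. */
static void ref_loadn2_u32(const unsigned *ptr, ptrdiff_t stride,
                           unsigned *lanes, size_t npairs)
{
    for (size_t i = 0; i < npairs; ++i) {
        lanes[2 * i + 0] = ptr[stride * i + 0];
        lanes[2 * i + 1] = ptr[stride * i + 1];
    }
}

/* npyv_load2_till_s32 on a 128-bit register: when only one pair remains,
 * the tail pair is filled with (fill_lo, fill_hi) instead of reading
 * past the end of the buffer. */
static void ref_load2_till_s32(const int *ptr, size_t npairs_left,
                               int fill_lo, int fill_hi, int lanes[4])
{
    lanes[0] = ptr[0];
    lanes[1] = ptr[1];
    lanes[2] = npairs_left > 1 ? ptr[2] : fill_lo;
    lanes[3] = npairs_left > 1 ? ptr[3] : fill_hi;
}

The even-subtract/odd-add pattern and the whole-pair accessors are the kind of primitives that typically back interleaved (e.g. complex-valued) inner loops, where each step moves a contiguous two-lane element.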
+/************************************************************ + * de-interlave load / interleave contiguous store + ************************************************************/ +// two channels +#define NPYV_IMPL_VEC_MEM_INTERLEAVE(SFX) \ + NPY_FINLINE npyv_##SFX##x2 npyv_zip_##SFX(npyv_##SFX, npyv_##SFX); \ + NPY_FINLINE npyv_##SFX##x2 npyv_unzip_##SFX(npyv_##SFX, npyv_##SFX); \ + NPY_FINLINE npyv_##SFX##x2 npyv_load_##SFX##x2( \ + const npyv_lanetype_##SFX *ptr \ + ) { \ + return npyv_unzip_##SFX( \ + npyv_load_##SFX(ptr), npyv_load_##SFX(ptr+npyv_nlanes_##SFX) \ + ); \ + } \ + NPY_FINLINE void npyv_store_##SFX##x2( \ + npyv_lanetype_##SFX *ptr, npyv_##SFX##x2 v \ + ) { \ + npyv_##SFX##x2 zip = npyv_zip_##SFX(v.val[0], v.val[1]); \ + npyv_store_##SFX(ptr, zip.val[0]); \ + npyv_store_##SFX(ptr + npyv_nlanes_##SFX, zip.val[1]); \ + } + +NPYV_IMPL_VEC_MEM_INTERLEAVE(u8) +NPYV_IMPL_VEC_MEM_INTERLEAVE(s8) +NPYV_IMPL_VEC_MEM_INTERLEAVE(u16) +NPYV_IMPL_VEC_MEM_INTERLEAVE(s16) +NPYV_IMPL_VEC_MEM_INTERLEAVE(u32) +NPYV_IMPL_VEC_MEM_INTERLEAVE(s32) +NPYV_IMPL_VEC_MEM_INTERLEAVE(u64) +NPYV_IMPL_VEC_MEM_INTERLEAVE(s64) +#if NPY_SIMD_F32 +NPYV_IMPL_VEC_MEM_INTERLEAVE(f32) +#endif +NPYV_IMPL_VEC_MEM_INTERLEAVE(f64) + /********************************* * Lookup table *********************************/ diff --git a/numpy/core/src/common/simd/vec/misc.h b/numpy/core/src/common/simd/vec/misc.h index 7ea0f21f6..79c194d90 100644 --- a/numpy/core/src/common/simd/vec/misc.h +++ b/numpy/core/src/common/simd/vec/misc.h @@ -26,28 +26,28 @@ #define NPYV_IMPL_VEC_SPLTW(T_VEC, V) ((T_VEC){V, V, V, V}) #define NPYV_IMPL_VEC_SPLTD(T_VEC, V) ((T_VEC){V, V}) -#define npyv_setall_u8(VAL) NPYV_IMPL_VEC_SPLTB(npyv_u8, (unsigned char)VAL) -#define npyv_setall_s8(VAL) NPYV_IMPL_VEC_SPLTB(npyv_s8, (signed char)VAL) -#define npyv_setall_u16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_u16, (unsigned short)VAL) -#define npyv_setall_s16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_s16, (short)VAL) -#define npyv_setall_u32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_u32, (unsigned int)VAL) -#define npyv_setall_s32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_s32, (int)VAL) +#define npyv_setall_u8(VAL) NPYV_IMPL_VEC_SPLTB(npyv_u8, (unsigned char)(VAL)) +#define npyv_setall_s8(VAL) NPYV_IMPL_VEC_SPLTB(npyv_s8, (signed char)(VAL)) +#define npyv_setall_u16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_u16, (unsigned short)(VAL)) +#define npyv_setall_s16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_s16, (short)(VAL)) +#define npyv_setall_u32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_u32, (unsigned int)(VAL)) +#define npyv_setall_s32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_s32, (int)(VAL)) #if NPY_SIMD_F32 - #define npyv_setall_f32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_f32, VAL) + #define npyv_setall_f32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_f32, (VAL)) #endif -#define npyv_setall_u64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_u64, (npy_uint64)VAL) -#define npyv_setall_s64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_s64, (npy_int64)VAL) +#define npyv_setall_u64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_u64, (npy_uint64)(VAL)) +#define npyv_setall_s64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_s64, (npy_int64)(VAL)) #define npyv_setall_f64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_f64, VAL) // vector with specific values set to each lane and // set a specific value to all remained lanes -#define npyv_setf_u8(FILL, ...) ((npyv_u8){NPYV__SET_FILL_16(char, FILL, __VA_ARGS__)}) -#define npyv_setf_s8(FILL, ...) ((npyv_s8){NPYV__SET_FILL_16(char, FILL, __VA_ARGS__)}) -#define npyv_setf_u16(FILL, ...) ((npyv_u16){NPYV__SET_FILL_8(short, FILL, __VA_ARGS__)}) +#define npyv_setf_u8(FILL, ...) 
((npyv_u8){NPYV__SET_FILL_16(unsigned char, FILL, __VA_ARGS__)}) +#define npyv_setf_s8(FILL, ...) ((npyv_s8){NPYV__SET_FILL_16(signed char, FILL, __VA_ARGS__)}) +#define npyv_setf_u16(FILL, ...) ((npyv_u16){NPYV__SET_FILL_8(unsigned short, FILL, __VA_ARGS__)}) #define npyv_setf_s16(FILL, ...) ((npyv_s16){NPYV__SET_FILL_8(short, FILL, __VA_ARGS__)}) -#define npyv_setf_u32(FILL, ...) ((npyv_u32){NPYV__SET_FILL_4(int, FILL, __VA_ARGS__)}) +#define npyv_setf_u32(FILL, ...) ((npyv_u32){NPYV__SET_FILL_4(unsigned int, FILL, __VA_ARGS__)}) #define npyv_setf_s32(FILL, ...) ((npyv_s32){NPYV__SET_FILL_4(int, FILL, __VA_ARGS__)}) -#define npyv_setf_u64(FILL, ...) ((npyv_u64){NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__)}) +#define npyv_setf_u64(FILL, ...) ((npyv_u64){NPYV__SET_FILL_2(npy_uint64, FILL, __VA_ARGS__)}) #define npyv_setf_s64(FILL, ...) ((npyv_s64){NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__)}) #if NPY_SIMD_F32 #define npyv_setf_f32(FILL, ...) ((npyv_f32){NPYV__SET_FILL_4(float, FILL, __VA_ARGS__)}) diff --git a/numpy/core/src/common/simd/vec/reorder.h b/numpy/core/src/common/simd/vec/reorder.h index b60b9287d..3910980a2 100644 --- a/numpy/core/src/common/simd/vec/reorder.h +++ b/numpy/core/src/common/simd/vec/reorder.h @@ -68,6 +68,85 @@ NPYV_IMPL_VEC_COMBINE_ZIP(npyv_s64, s64) #endif NPYV_IMPL_VEC_COMBINE_ZIP(npyv_f64, f64) +// deinterleave two vectors +NPY_FINLINE npyv_u8x2 npyv_unzip_u8(npyv_u8 ab0, npyv_u8 ab1) +{ + const npyv_u8 idx_even = npyv_set_u8( + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + ); + const npyv_u8 idx_odd = npyv_set_u8( + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 + ); + npyv_u8x2 r; + r.val[0] = vec_perm(ab0, ab1, idx_even); + r.val[1] = vec_perm(ab0, ab1, idx_odd); + return r; +} +NPY_FINLINE npyv_s8x2 npyv_unzip_s8(npyv_s8 ab0, npyv_s8 ab1) +{ + npyv_u8x2 ru = npyv_unzip_u8((npyv_u8)ab0, (npyv_u8)ab1); + npyv_s8x2 r; + r.val[0] = (npyv_s8)ru.val[0]; + r.val[1] = (npyv_s8)ru.val[1]; + return r; +} +NPY_FINLINE npyv_u16x2 npyv_unzip_u16(npyv_u16 ab0, npyv_u16 ab1) +{ + const npyv_u8 idx_even = npyv_set_u8( + 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 + ); + const npyv_u8 idx_odd = npyv_set_u8( + 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 + ); + npyv_u16x2 r; + r.val[0] = vec_perm(ab0, ab1, idx_even); + r.val[1] = vec_perm(ab0, ab1, idx_odd); + return r; +} +NPY_FINLINE npyv_s16x2 npyv_unzip_s16(npyv_s16 ab0, npyv_s16 ab1) +{ + npyv_u16x2 ru = npyv_unzip_u16((npyv_u16)ab0, (npyv_u16)ab1); + npyv_s16x2 r; + r.val[0] = (npyv_s16)ru.val[0]; + r.val[1] = (npyv_s16)ru.val[1]; + return r; +} +NPY_FINLINE npyv_u32x2 npyv_unzip_u32(npyv_u32 ab0, npyv_u32 ab1) +{ + npyv_u32 m0 = vec_mergeh(ab0, ab1); + npyv_u32 m1 = vec_mergel(ab0, ab1); + npyv_u32 r0 = vec_mergeh(m0, m1); + npyv_u32 r1 = vec_mergel(m0, m1); + npyv_u32x2 r; + r.val[0] = r0; + r.val[1] = r1; + return r; +} +NPY_FINLINE npyv_s32x2 npyv_unzip_s32(npyv_s32 ab0, npyv_s32 ab1) +{ + npyv_u32x2 ru = npyv_unzip_u32((npyv_u32)ab0, (npyv_u32)ab1); + npyv_s32x2 r; + r.val[0] = (npyv_s32)ru.val[0]; + r.val[1] = (npyv_s32)ru.val[1]; + return r; +} +#if NPY_SIMD_F32 + NPY_FINLINE npyv_f32x2 npyv_unzip_f32(npyv_f32 ab0, npyv_f32 ab1) + { + npyv_u32x2 ru = npyv_unzip_u32((npyv_u32)ab0, (npyv_u32)ab1); + npyv_f32x2 r; + r.val[0] = (npyv_f32)ru.val[0]; + r.val[1] = (npyv_f32)ru.val[1]; + return r; + } +#endif +NPY_FINLINE npyv_u64x2 npyv_unzip_u64(npyv_u64 ab0, npyv_u64 ab1) +{ return npyv_combine_u64(ab0, ab1); } +NPY_FINLINE npyv_s64x2 npyv_unzip_s64(npyv_s64 
ab0, npyv_s64 ab1) +{ return npyv_combine_s64(ab0, ab1); } +NPY_FINLINE npyv_f64x2 npyv_unzip_f64(npyv_f64 ab0, npyv_f64 ab1) +{ return npyv_combine_f64(ab0, ab1); } + // Reverse elements of each 64-bit lane NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a) { @@ -111,4 +190,24 @@ NPY_FINLINE npyv_s32 npyv_rev64_s32(npyv_s32 a) { return (npyv_f32)npyv_rev64_u32((npyv_u32)a); } #endif +// Permuting the elements of each 128-bit lane by immediate index for +// each element. +#define npyv_permi128_u32(A, E0, E1, E2, E3) \ + vec_perm(A, A, npyv_set_u8( \ + (E0<<2), (E0<<2)+1, (E0<<2)+2, (E0<<2)+3, \ + (E1<<2), (E1<<2)+1, (E1<<2)+2, (E1<<2)+3, \ + (E2<<2), (E2<<2)+1, (E2<<2)+2, (E2<<2)+3, \ + (E3<<2), (E3<<2)+1, (E3<<2)+2, (E3<<2)+3 \ + )) +#define npyv_permi128_s32 npyv_permi128_u32 +#define npyv_permi128_f32 npyv_permi128_u32 + +#if defined(__IBMC__) || defined(vec_permi) + #define npyv_permi128_u64(A, E0, E1) vec_permi(A, A, ((E0)<<1) | (E1)) +#else + #define npyv_permi128_u64(A, E0, E1) vec_xxpermdi(A, A, ((E0)<<1) | (E1)) +#endif +#define npyv_permi128_s64 npyv_permi128_u64 +#define npyv_permi128_f64 npyv_permi128_u64 + #endif // _NPY_SIMD_VEC_REORDER_H diff --git a/numpy/core/src/common/simd/vec/vec.h b/numpy/core/src/common/simd/vec/vec.h index abcd33ce1..1d4508669 100644 --- a/numpy/core/src/common/simd/vec/vec.h +++ b/numpy/core/src/common/simd/vec/vec.h @@ -39,8 +39,10 @@ #ifdef NPY_HAVE_VX #define NPY_SIMD_BIGENDIAN 1 + #define NPY_SIMD_CMPSIGNAL 0 #else #define NPY_SIMD_BIGENDIAN 0 + #define NPY_SIMD_CMPSIGNAL 1 #endif typedef __vector unsigned char npyv_u8; |
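Taken together with the reorder additions, the two-channel interleave helpers and the permi128 macros reduce to simple index arithmetic per 128-bit register. The following is a hedged scalar sketch of that behaviour; the ref_* helpers are illustrative only and assume the conventional lane numbering used throughout these headers.

#include <stddef.h>

/* npyv_load_u32x2 / npyv_store_u32x2: split an interleaved stream
 * {a0, b0, a1, b1, ...} into two channels and merge it back. */
static void ref_load_x2_u32(const unsigned *ptr, unsigned *ch0,
                            unsigned *ch1, size_t nlanes)
{
    for (size_t i = 0; i < nlanes; ++i) {
        ch0[i] = ptr[2 * i + 0];   /* even positions -> first channel  */
        ch1[i] = ptr[2 * i + 1];   /* odd positions  -> second channel */
    }
}
static void ref_store_x2_u32(unsigned *ptr, const unsigned *ch0,
                             const unsigned *ch1, size_t nlanes)
{
    for (size_t i = 0; i < nlanes; ++i) {
        ptr[2 * i + 0] = ch0[i];
        ptr[2 * i + 1] = ch1[i];
    }
}

/* npyv_permi128_u32(A, E0, E1, E2, E3): output lane i of the 128-bit
 * register takes lane E_i of the input, with E_i in 0..3. */
static void ref_permi128_u32(const unsigned a[4], const int e[4], unsigned r[4])
{
    for (int i = 0; i < 4; ++i) {
        r[i] = a[e[i]];
    }
}

For 64-bit lanes, where an interleaved "pair" spans a whole register half, deinterleaving degenerates into npyv_combine, which is why the u64/s64/f64 unzip cases above simply forward to it.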