author    Sebastian Berg <sebastian@sipsolutions.net>  2022-09-26 08:26:08 +0200
committer GitHub <noreply@github.com>                  2022-09-26 08:26:08 +0200
commit    c27bb662aa75be518f173b24e0f647248310d0ce (patch)
tree      153703f9cb24a0efff5537b6a358617a68c62c70 /numpy/core
parent    380fbb2b89e43fe6793d0653989b8fbde1b4ddb7 (diff)
parent    ee5d890007f8937a49d47cc219cf6013c13bf27f (diff)
Merge pull request #22325 from seiko2plus/npyv_speedup_neon_vec_init
SIMD: Improve the performance of NEON vector initializer
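
In short: on Clang and GCC the lane-by-lane initializers become GNU vector compound literals, which the compiler can materialize directly in a SIMD register, instead of spilling the lanes to an aligned stack array and reloading them with vld1q. A minimal sketch of the two strategies for the 32-bit unsigned case (illustrative names, not NumPy's; assumes an AArch64 GCC/Clang toolchain):

    #include <arm_neon.h>

    /* Fallback strategy (any compiler): write the lanes to an aligned
     * stack array, then load the vector back with vld1q_u32 -- a round
     * trip through memory. */
    static inline uint32x4_t set_u32_via_load(uint32_t i0, uint32_t i1,
                                              uint32_t i2, uint32_t i3)
    {
        const uint32_t __attribute__((aligned(16))) data[4] = {i0, i1, i2, i3};
        return vld1q_u32(data);
    }

    /* Patched strategy (GCC/Clang only): a vector compound literal,
     * typically lowered to register moves or immediates with no stack
     * traffic. */
    #define SET_U32(i0, i1, i2, i3) \
        ((uint32x4_t){(uint32_t)(i0), (uint32_t)(i1), (uint32_t)(i2), (uint32_t)(i3)})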
Diffstat (limited to 'numpy/core')
-rw-r--r--  numpy/core/src/common/simd/neon/misc.h | 168
1 file changed, 88 insertions, 80 deletions
diff --git a/numpy/core/src/common/simd/neon/misc.h b/numpy/core/src/common/simd/neon/misc.h
index 5fe109c13..9dac0cfaf 100644
--- a/numpy/core/src/common/simd/neon/misc.h
+++ b/numpy/core/src/common/simd/neon/misc.h
@@ -31,86 +31,94 @@
 // vector with specific values set to each lane and
 // set a specific value to all remained lanes
-NPY_FINLINE uint8x16_t npyv__set_u8(npy_uint8 i0, npy_uint8 i1, npy_uint8 i2, npy_uint8 i3,
-    npy_uint8 i4, npy_uint8 i5, npy_uint8 i6, npy_uint8 i7, npy_uint8 i8, npy_uint8 i9,
-    npy_uint8 i10, npy_uint8 i11, npy_uint8 i12, npy_uint8 i13, npy_uint8 i14, npy_uint8 i15)
-{
-    const uint8_t NPY_DECL_ALIGNED(16) data[16] = {
-        i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15
-    };
-    return vld1q_u8(data);
-}
-#define npyv_setf_u8(FILL, ...) npyv__set_u8(NPYV__SET_FILL_16(npy_uint8, FILL, __VA_ARGS__))
-
-NPY_FINLINE int8x16_t npyv__set_s8(npy_int8 i0, npy_int8 i1, npy_int8 i2, npy_int8 i3,
-    npy_int8 i4, npy_int8 i5, npy_int8 i6, npy_int8 i7, npy_int8 i8, npy_int8 i9,
-    npy_int8 i10, npy_int8 i11, npy_int8 i12, npy_int8 i13, npy_int8 i14, npy_int8 i15)
-{
-    const int8_t NPY_DECL_ALIGNED(16) data[16] = {
-        i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15
-    };
-    return vld1q_s8(data);
-}
-#define npyv_setf_s8(FILL, ...) npyv__set_s8(NPYV__SET_FILL_16(npy_int8, FILL, __VA_ARGS__))
-
-NPY_FINLINE uint16x8_t npyv__set_u16(npy_uint16 i0, npy_uint16 i1, npy_uint16 i2, npy_uint16 i3,
-    npy_uint16 i4, npy_uint16 i5, npy_uint16 i6, npy_uint16 i7)
-{
-    const uint16_t NPY_DECL_ALIGNED(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
-    return vld1q_u16(data);
-}
-#define npyv_setf_u16(FILL, ...) npyv__set_u16(NPYV__SET_FILL_8(npy_uint16, FILL, __VA_ARGS__))
-
-NPY_FINLINE int16x8_t npyv__set_s16(npy_int16 i0, npy_int16 i1, npy_int16 i2, npy_int16 i3,
-    npy_int16 i4, npy_int16 i5, npy_int16 i6, npy_int16 i7)
-{
-    const int16_t NPY_DECL_ALIGNED(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
-    return vld1q_s16(data);
-}
-#define npyv_setf_s16(FILL, ...) npyv__set_s16(NPYV__SET_FILL_8(npy_int16, FILL, __VA_ARGS__))
-
-NPY_FINLINE uint32x4_t npyv__set_u32(npy_uint32 i0, npy_uint32 i1, npy_uint32 i2, npy_uint32 i3)
-{
-    const uint32_t NPY_DECL_ALIGNED(16) data[4] = {i0, i1, i2, i3};
-    return vld1q_u32(data);
-}
-#define npyv_setf_u32(FILL, ...) npyv__set_u32(NPYV__SET_FILL_4(npy_uint32, FILL, __VA_ARGS__))
-
-NPY_FINLINE int32x4_t npyv__set_s32(npy_int32 i0, npy_int32 i1, npy_int32 i2, npy_int32 i3)
-{
-    const int32_t NPY_DECL_ALIGNED(16) data[4] = {i0, i1, i2, i3};
-    return vld1q_s32(data);
-}
-#define npyv_setf_s32(FILL, ...) npyv__set_s32(NPYV__SET_FILL_4(npy_int32, FILL, __VA_ARGS__))
-
-NPY_FINLINE uint64x2_t npyv__set_u64(npy_uint64 i0, npy_uint64 i1)
-{
-    const uint64_t NPY_DECL_ALIGNED(16) data[2] = {i0, i1};
-    return vld1q_u64(data);
-}
-#define npyv_setf_u64(FILL, ...) npyv__set_u64(NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__))
-
-NPY_FINLINE int64x2_t npyv__set_s64(npy_int64 i0, npy_int64 i1)
-{
-    const int64_t NPY_DECL_ALIGNED(16) data[2] = {i0, i1};
-    return vld1q_s64(data);
-}
-#define npyv_setf_s64(FILL, ...) npyv__set_s64(NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__))
-
-NPY_FINLINE float32x4_t npyv__set_f32(float i0, float i1, float i2, float i3)
-{
-    const float NPY_DECL_ALIGNED(16) data[4] = {i0, i1, i2, i3};
-    return vld1q_f32(data);
-}
-#define npyv_setf_f32(FILL, ...) npyv__set_f32(NPYV__SET_FILL_4(float, FILL, __VA_ARGS__))
-
-#ifdef __aarch64__
-NPY_FINLINE float64x2_t npyv__set_f64(double i0, double i1)
-{
-    const double NPY_DECL_ALIGNED(16) data[2] = {i0, i1};
-    return vld1q_f64(data);
-}
-#define npyv_setf_f64(FILL, ...) npyv__set_f64(NPYV__SET_FILL_2(double, FILL, __VA_ARGS__))
+#if defined(__clang__) || defined(__GNUC__)
+    #define npyv_setf_u8(FILL, ...) ((uint8x16_t){NPYV__SET_FILL_16(uint8_t, FILL, __VA_ARGS__)})
+    #define npyv_setf_s8(FILL, ...) ((int8x16_t){NPYV__SET_FILL_16(int8_t, FILL, __VA_ARGS__)})
+    #define npyv_setf_u16(FILL, ...) ((uint16x8_t){NPYV__SET_FILL_8(uint16_t, FILL, __VA_ARGS__)})
+    #define npyv_setf_s16(FILL, ...) ((int16x8_t){NPYV__SET_FILL_8(int16_t, FILL, __VA_ARGS__)})
+    #define npyv_setf_u32(FILL, ...) ((uint32x4_t){NPYV__SET_FILL_4(uint32_t, FILL, __VA_ARGS__)})
+    #define npyv_setf_s32(FILL, ...) ((int32x4_t){NPYV__SET_FILL_4(int32_t, FILL, __VA_ARGS__)})
+    #define npyv_setf_u64(FILL, ...) ((uint64x2_t){NPYV__SET_FILL_2(uint64_t, FILL, __VA_ARGS__)})
+    #define npyv_setf_s64(FILL, ...) ((int64x2_t){NPYV__SET_FILL_2(int64_t, FILL, __VA_ARGS__)})
+    #define npyv_setf_f32(FILL, ...) ((float32x4_t){NPYV__SET_FILL_4(float, FILL, __VA_ARGS__)})
+    #if NPY_SIMD_F64
+        #define npyv_setf_f64(FILL, ...) ((float64x2_t){NPYV__SET_FILL_2(double, FILL, __VA_ARGS__)})
+    #endif
+#else
+    NPY_FINLINE uint8x16_t npyv__set_u8(npy_uint8 i0, npy_uint8 i1, npy_uint8 i2, npy_uint8 i3,
+        npy_uint8 i4, npy_uint8 i5, npy_uint8 i6, npy_uint8 i7, npy_uint8 i8, npy_uint8 i9,
+        npy_uint8 i10, npy_uint8 i11, npy_uint8 i12, npy_uint8 i13, npy_uint8 i14, npy_uint8 i15)
+    {
+        const uint8_t NPY_DECL_ALIGNED(16) data[16] = {
+            i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15
+        };
+        return vld1q_u8(data);
+    }
+    NPY_FINLINE int8x16_t npyv__set_s8(npy_int8 i0, npy_int8 i1, npy_int8 i2, npy_int8 i3,
+        npy_int8 i4, npy_int8 i5, npy_int8 i6, npy_int8 i7, npy_int8 i8, npy_int8 i9,
+        npy_int8 i10, npy_int8 i11, npy_int8 i12, npy_int8 i13, npy_int8 i14, npy_int8 i15)
+    {
+        const int8_t NPY_DECL_ALIGNED(16) data[16] = {
+            i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15
+        };
+        return vld1q_s8(data);
+    }
+    NPY_FINLINE uint16x8_t npyv__set_u16(npy_uint16 i0, npy_uint16 i1, npy_uint16 i2, npy_uint16 i3,
+        npy_uint16 i4, npy_uint16 i5, npy_uint16 i6, npy_uint16 i7)
+    {
+        const uint16_t NPY_DECL_ALIGNED(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
+        return vld1q_u16(data);
+    }
+    NPY_FINLINE int16x8_t npyv__set_s16(npy_int16 i0, npy_int16 i1, npy_int16 i2, npy_int16 i3,
+        npy_int16 i4, npy_int16 i5, npy_int16 i6, npy_int16 i7)
+    {
+        const int16_t NPY_DECL_ALIGNED(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
+        return vld1q_s16(data);
+    }
+    NPY_FINLINE uint32x4_t npyv__set_u32(npy_uint32 i0, npy_uint32 i1, npy_uint32 i2, npy_uint32 i3)
+    {
+        const uint32_t NPY_DECL_ALIGNED(16) data[4] = {i0, i1, i2, i3};
+        return vld1q_u32(data);
+    }
+    NPY_FINLINE int32x4_t npyv__set_s32(npy_int32 i0, npy_int32 i1, npy_int32 i2, npy_int32 i3)
+    {
+        const int32_t NPY_DECL_ALIGNED(16) data[4] = {i0, i1, i2, i3};
+        return vld1q_s32(data);
+    }
+    NPY_FINLINE uint64x2_t npyv__set_u64(npy_uint64 i0, npy_uint64 i1)
+    {
+        const uint64_t NPY_DECL_ALIGNED(16) data[2] = {i0, i1};
+        return vld1q_u64(data);
+    }
+    NPY_FINLINE int64x2_t npyv__set_s64(npy_int64 i0, npy_int64 i1)
+    {
+        const int64_t NPY_DECL_ALIGNED(16) data[2] = {i0, i1};
+        return vld1q_s64(data);
+    }
+    NPY_FINLINE float32x4_t npyv__set_f32(float i0, float i1, float i2, float i3)
+    {
+        const float NPY_DECL_ALIGNED(16) data[4] = {i0, i1, i2, i3};
+        return vld1q_f32(data);
+    }
+    #if NPY_SIMD_F64
+    NPY_FINLINE float64x2_t npyv__set_f64(double i0, double i1)
+    {
+        const double NPY_DECL_ALIGNED(16) data[2] = {i0, i1};
+        return vld1q_f64(data);
+    }
+    #endif
+    #define npyv_setf_u8(FILL, ...) npyv__set_u8(NPYV__SET_FILL_16(npy_uint8, FILL, __VA_ARGS__))
+    #define npyv_setf_s8(FILL, ...) npyv__set_s8(NPYV__SET_FILL_16(npy_int8, FILL, __VA_ARGS__))
+    #define npyv_setf_u16(FILL, ...) npyv__set_u16(NPYV__SET_FILL_8(npy_uint16, FILL, __VA_ARGS__))
+    #define npyv_setf_s16(FILL, ...) npyv__set_s16(NPYV__SET_FILL_8(npy_int16, FILL, __VA_ARGS__))
+    #define npyv_setf_u32(FILL, ...) npyv__set_u32(NPYV__SET_FILL_4(npy_uint32, FILL, __VA_ARGS__))
+    #define npyv_setf_s32(FILL, ...) npyv__set_s32(NPYV__SET_FILL_4(npy_int32, FILL, __VA_ARGS__))
+    #define npyv_setf_u64(FILL, ...) npyv__set_u64(NPYV__SET_FILL_2(npy_uint64, FILL, __VA_ARGS__))
+    #define npyv_setf_s64(FILL, ...) npyv__set_s64(NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__))
+    #define npyv_setf_f32(FILL, ...) npyv__set_f32(NPYV__SET_FILL_4(float, FILL, __VA_ARGS__))
+    #if NPY_SIMD_F64
+        #define npyv_setf_f64(FILL, ...) npyv__set_f64(NPYV__SET_FILL_2(double, FILL, __VA_ARGS__))
+    #endif
 #endif
 
 // vector with specific values set to each lane and
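
For reference, the npyv_setf_* semantics from the header comment ("set a specific value to all remained lanes"): the caller-supplied lanes are used in order and FILL pads the rest out to the vector width via the NPYV__SET_FILL_* helpers defined elsewhere in the NumPy SIMD layer. A hypothetical hand expansion of what a three-lane call could lower to under the new compound-literal macros (the exact expansion is owned by NPYV__SET_FILL_4 and may differ):

    #include <arm_neon.h>
    #include <stdio.h>

    int main(void)
    {
        /* Hypothetical expansion of npyv_setf_u32(9, 1, 2, 3), i.e.
         * FILL = 9 with three explicit lanes: a compound literal with
         * the explicit lanes first and the fill value padding the last
         * lane. Here the padding is written out by hand. */
        uint32x4_t v = (uint32x4_t){1, 2, 3, 9};
        uint32_t out[4];
        vst1q_u32(out, v);
        printf("%u %u %u %u\n", out[0], out[1], out[2], out[3]); /* 1 2 3 9 */
        return 0;
    }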