#ifndef NPY_SIMD
    #error "Not a standalone header"
#endif

#ifndef _NPY_SIMD_NEON_REORDER_H
#define _NPY_SIMD_NEON_REORDER_H

// combine lower part of two vectors
#ifdef __aarch64__
    #define npyv_combinel_u8(A, B)  vreinterpretq_u8_u64(vzip1q_u64(vreinterpretq_u64_u8(A), vreinterpretq_u64_u8(B)))
    #define npyv_combinel_s8(A, B)  vreinterpretq_s8_u64(vzip1q_u64(vreinterpretq_u64_s8(A), vreinterpretq_u64_s8(B)))
    #define npyv_combinel_u16(A, B) vreinterpretq_u16_u64(vzip1q_u64(vreinterpretq_u64_u16(A), vreinterpretq_u64_u16(B)))
    #define npyv_combinel_s16(A, B) vreinterpretq_s16_u64(vzip1q_u64(vreinterpretq_u64_s16(A), vreinterpretq_u64_s16(B)))
    #define npyv_combinel_u32(A, B) vreinterpretq_u32_u64(vzip1q_u64(vreinterpretq_u64_u32(A), vreinterpretq_u64_u32(B)))
    #define npyv_combinel_s32(A, B) vreinterpretq_s32_u64(vzip1q_u64(vreinterpretq_u64_s32(A), vreinterpretq_u64_s32(B)))
    #define npyv_combinel_u64       vzip1q_u64
    #define npyv_combinel_s64       vzip1q_s64
    #define npyv_combinel_f32(A, B) vreinterpretq_f32_u64(vzip1q_u64(vreinterpretq_u64_f32(A), vreinterpretq_u64_f32(B)))
    #define npyv_combinel_f64       vzip1q_f64
#else
    #define npyv_combinel_u8(A, B)  vcombine_u8(vget_low_u8(A), vget_low_u8(B))
    #define npyv_combinel_s8(A, B)  vcombine_s8(vget_low_s8(A), vget_low_s8(B))
    #define npyv_combinel_u16(A, B) vcombine_u16(vget_low_u16(A), vget_low_u16(B))
    #define npyv_combinel_s16(A, B) vcombine_s16(vget_low_s16(A), vget_low_s16(B))
    #define npyv_combinel_u32(A, B) vcombine_u32(vget_low_u32(A), vget_low_u32(B))
    #define npyv_combinel_s32(A, B) vcombine_s32(vget_low_s32(A), vget_low_s32(B))
    #define npyv_combinel_u64(A, B) vcombine_u64(vget_low_u64(A), vget_low_u64(B))
    #define npyv_combinel_s64(A, B) vcombine_s64(vget_low_s64(A), vget_low_s64(B))
    #define npyv_combinel_f32(A, B) vcombine_f32(vget_low_f32(A), vget_low_f32(B))
#endif

// combine higher part of two vectors
#ifdef __aarch64__
    #define npyv_combineh_u8(A, B)  vreinterpretq_u8_u64(vzip2q_u64(vreinterpretq_u64_u8(A), vreinterpretq_u64_u8(B)))
    #define npyv_combineh_s8(A, B)  vreinterpretq_s8_u64(vzip2q_u64(vreinterpretq_u64_s8(A), vreinterpretq_u64_s8(B)))
    #define npyv_combineh_u16(A, B) vreinterpretq_u16_u64(vzip2q_u64(vreinterpretq_u64_u16(A), vreinterpretq_u64_u16(B)))
    #define npyv_combineh_s16(A, B) vreinterpretq_s16_u64(vzip2q_u64(vreinterpretq_u64_s16(A), vreinterpretq_u64_s16(B)))
    #define npyv_combineh_u32(A, B) vreinterpretq_u32_u64(vzip2q_u64(vreinterpretq_u64_u32(A), vreinterpretq_u64_u32(B)))
    #define npyv_combineh_s32(A, B) vreinterpretq_s32_u64(vzip2q_u64(vreinterpretq_u64_s32(A), vreinterpretq_u64_s32(B)))
    #define npyv_combineh_u64       vzip2q_u64
    #define npyv_combineh_s64       vzip2q_s64
    #define npyv_combineh_f32(A, B) vreinterpretq_f32_u64(vzip2q_u64(vreinterpretq_u64_f32(A), vreinterpretq_u64_f32(B)))
    #define npyv_combineh_f64       vzip2q_f64
#else
    #define npyv_combineh_u8(A, B)  vcombine_u8(vget_high_u8(A), vget_high_u8(B))
    #define npyv_combineh_s8(A, B)  vcombine_s8(vget_high_s8(A), vget_high_s8(B))
    #define npyv_combineh_u16(A, B) vcombine_u16(vget_high_u16(A), vget_high_u16(B))
    #define npyv_combineh_s16(A, B) vcombine_s16(vget_high_s16(A), vget_high_s16(B))
    #define npyv_combineh_u32(A, B) vcombine_u32(vget_high_u32(A), vget_high_u32(B))
    #define npyv_combineh_s32(A, B) vcombine_s32(vget_high_s32(A), vget_high_s32(B))
    #define npyv_combineh_u64(A, B) vcombine_u64(vget_high_u64(A), vget_high_u64(B))
    #define npyv_combineh_s64(A, B) vcombine_s64(vget_high_s64(A), vget_high_s64(B))
    #define npyv_combineh_f32(A, B) vcombine_f32(vget_high_f32(A), vget_high_f32(B))
#endif
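/*
 * Lane-semantics sketch (illustrative comment only, not part of the API).
 * Assuming npyv_u32 vectors a = {a0, a1, a2, a3} and b = {b0, b1, b2, b3},
 * the combine-half operations pick one 64-bit half from each input:
 *
 *     npyv_u32 lo = npyv_combinel_u32(a, b); // {a0, a1, b0, b1}
 *     npyv_u32 hi = npyv_combineh_u32(a, b); // {a2, a3, b2, b3}
 *
 * On aarch64 both map to a single zip of 64-bit lanes (vzip1q_u64 /
 * vzip2q_u64); on 32-bit ARM they fall back to vget_low/vget_high plus
 * vcombine, which typically lower to plain register moves since each
 * Q register is a pair of D registers.
 */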
// combine two vectors from lower and higher parts of two other vectors
#define NPYV_IMPL_NEON_COMBINE(T_VEC, SFX)                     \
    NPY_FINLINE T_VEC##x2 npyv_combine_##SFX(T_VEC a, T_VEC b) \
    {                                                          \
        T_VEC##x2 r;                                           \
        r.val[0] = NPY_CAT(npyv_combinel_, SFX)(a, b);         \
        r.val[1] = NPY_CAT(npyv_combineh_, SFX)(a, b);         \
        return r;                                              \
    }
NPYV_IMPL_NEON_COMBINE(npyv_u8,  u8)
NPYV_IMPL_NEON_COMBINE(npyv_s8,  s8)
NPYV_IMPL_NEON_COMBINE(npyv_u16, u16)
NPYV_IMPL_NEON_COMBINE(npyv_s16, s16)
NPYV_IMPL_NEON_COMBINE(npyv_u32, u32)
NPYV_IMPL_NEON_COMBINE(npyv_s32, s32)
NPYV_IMPL_NEON_COMBINE(npyv_u64, u64)
NPYV_IMPL_NEON_COMBINE(npyv_s64, s64)
NPYV_IMPL_NEON_COMBINE(npyv_f32, f32)
#ifdef __aarch64__
NPYV_IMPL_NEON_COMBINE(npyv_f64, f64)
#endif

// interleave two vectors
#define NPYV_IMPL_NEON_ZIP(T_VEC, SFX)                         \
    NPY_FINLINE T_VEC##x2 npyv_zip_##SFX(T_VEC a, T_VEC b)     \
    {                                                          \
        T_VEC##x2 r;                                           \
        r.val[0] = vzip1q_##SFX(a, b);                         \
        r.val[1] = vzip2q_##SFX(a, b);                         \
        return r;                                              \
    }
#ifdef __aarch64__
    NPYV_IMPL_NEON_ZIP(npyv_u8,  u8)
    NPYV_IMPL_NEON_ZIP(npyv_s8,  s8)
    NPYV_IMPL_NEON_ZIP(npyv_u16, u16)
    NPYV_IMPL_NEON_ZIP(npyv_s16, s16)
    NPYV_IMPL_NEON_ZIP(npyv_u32, u32)
    NPYV_IMPL_NEON_ZIP(npyv_s32, s32)
    NPYV_IMPL_NEON_ZIP(npyv_f32, f32)
    NPYV_IMPL_NEON_ZIP(npyv_f64, f64)
#else
    #define npyv_zip_u8  vzipq_u8
    #define npyv_zip_s8  vzipq_s8
    #define npyv_zip_u16 vzipq_u16
    #define npyv_zip_s16 vzipq_s16
    #define npyv_zip_u32 vzipq_u32
    #define npyv_zip_s32 vzipq_s32
    #define npyv_zip_f32 vzipq_f32
#endif
#define npyv_zip_u64 npyv_combine_u64
#define npyv_zip_s64 npyv_combine_s64

// Reverse elements of each 64-bit lane
#define npyv_rev64_u8  vrev64q_u8
#define npyv_rev64_s8  vrev64q_s8
#define npyv_rev64_u16 vrev64q_u16
#define npyv_rev64_s16 vrev64q_s16
#define npyv_rev64_u32 vrev64q_u32
#define npyv_rev64_s32 vrev64q_s32
#define npyv_rev64_f32 vrev64q_f32

#endif // _NPY_SIMD_NEON_REORDER_H
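/*
 * Usage sketch (illustrative comment only, not part of the API). With
 * npyv_u32 vectors a = {a0, a1, a2, a3} and b = {b0, b1, b2, b3}:
 *
 *     npyv_u32x2 c = npyv_combine_u32(a, b);
 *     // c.val[0] = {a0, a1, b0, b1}, c.val[1] = {a2, a3, b2, b3}
 *
 *     npyv_u32x2 z = npyv_zip_u32(a, b);
 *     // z.val[0] = {a0, b0, a1, b1}, z.val[1] = {a2, b2, a3, b3}
 *
 *     npyv_u32 r = npyv_rev64_u32(a); // {a1, a0, a3, a2}
 *
 * For 64-bit lanes, interleaving and combining are the same permutation,
 * which is why npyv_zip_u64/npyv_zip_s64 alias npyv_combine_u64/
 * npyv_combine_s64 above.
 */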