#ifndef NPY_SIMD
    #error "Not a standalone header"
#endif

#ifndef _NPY_SIMD_NEON_REORDER_H
#define _NPY_SIMD_NEON_REORDER_H

// combine lower part of two vectors
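// For example, with 32-bit lanes (the other widths behave analogously):
//   npyv_combinel_u32({a0, a1, a2, a3}, {b0, b1, b2, b3}) -> {a0, a1, b0, b1}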
#ifdef __aarch64__
#define npyv_combinel_u8(A, B) vreinterpretq_u8_u64(vzip1q_u64(vreinterpretq_u64_u8(A), vreinterpretq_u64_u8(B)))
#define npyv_combinel_s8(A, B) vreinterpretq_s8_u64(vzip1q_u64(vreinterpretq_u64_s8(A), vreinterpretq_u64_s8(B)))
#define npyv_combinel_u16(A, B) vreinterpretq_u16_u64(vzip1q_u64(vreinterpretq_u64_u16(A), vreinterpretq_u64_u16(B)))
#define npyv_combinel_s16(A, B) vreinterpretq_s16_u64(vzip1q_u64(vreinterpretq_u64_s16(A), vreinterpretq_u64_s16(B)))
#define npyv_combinel_u32(A, B) vreinterpretq_u32_u64(vzip1q_u64(vreinterpretq_u64_u32(A), vreinterpretq_u64_u32(B)))
#define npyv_combinel_s32(A, B) vreinterpretq_s32_u64(vzip1q_u64(vreinterpretq_u64_s32(A), vreinterpretq_u64_s32(B)))
#define npyv_combinel_u64 vzip1q_u64
#define npyv_combinel_s64 vzip1q_s64
#define npyv_combinel_f32(A, B) vreinterpretq_f32_u64(vzip1q_u64(vreinterpretq_u64_f32(A), vreinterpretq_u64_f32(B)))
#define npyv_combinel_f64 vzip1q_f64
#else
#define npyv_combinel_u8(A, B) vcombine_u8(vget_low_u8(A), vget_low_u8(B))
#define npyv_combinel_s8(A, B) vcombine_s8(vget_low_s8(A), vget_low_s8(B))
#define npyv_combinel_u16(A, B) vcombine_u16(vget_low_u16(A), vget_low_u16(B))
#define npyv_combinel_s16(A, B) vcombine_s16(vget_low_s16(A), vget_low_s16(B))
#define npyv_combinel_u32(A, B) vcombine_u32(vget_low_u32(A), vget_low_u32(B))
#define npyv_combinel_s32(A, B) vcombine_s32(vget_low_s32(A), vget_low_s32(B))
#define npyv_combinel_u64(A, B) vcombine_u64(vget_low_u64(A), vget_low_u64(B))
#define npyv_combinel_s64(A, B) vcombine_s64(vget_low_s64(A), vget_low_s64(B))
#define npyv_combinel_f32(A, B) vcombine_f32(vget_low_f32(A), vget_low_f32(B))
#endif
// combine higher part of two vectors
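// For example, with 32-bit lanes:
//   npyv_combineh_u32({a0, a1, a2, a3}, {b0, b1, b2, b3}) -> {a2, a3, b2, b3}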
#ifdef __aarch64__
#define npyv_combineh_u8(A, B) vreinterpretq_u8_u64(vzip2q_u64(vreinterpretq_u64_u8(A), vreinterpretq_u64_u8(B)))
#define npyv_combineh_s8(A, B) vreinterpretq_s8_u64(vzip2q_u64(vreinterpretq_u64_s8(A), vreinterpretq_u64_s8(B)))
#define npyv_combineh_u16(A, B) vreinterpretq_u16_u64(vzip2q_u64(vreinterpretq_u64_u16(A), vreinterpretq_u64_u16(B)))
#define npyv_combineh_s16(A, B) vreinterpretq_s16_u64(vzip2q_u64(vreinterpretq_u64_s16(A), vreinterpretq_u64_s16(B)))
#define npyv_combineh_u32(A, B) vreinterpretq_u32_u64(vzip2q_u64(vreinterpretq_u64_u32(A), vreinterpretq_u64_u32(B)))
#define npyv_combineh_s32(A, B) vreinterpretq_s32_u64(vzip2q_u64(vreinterpretq_u64_s32(A), vreinterpretq_u64_s32(B)))
#define npyv_combineh_u64 vzip2q_u64
#define npyv_combineh_s64 vzip2q_s64
#define npyv_combineh_f32(A, B) vreinterpretq_f32_u64(vzip2q_u64(vreinterpretq_u64_f32(A), vreinterpretq_u64_f32(B)))
#define npyv_combineh_f64 vzip2q_f64
#else
#define npyv_combineh_u8(A, B) vcombine_u8(vget_high_u8(A), vget_high_u8(B))
#define npyv_combineh_s8(A, B) vcombine_s8(vget_high_s8(A), vget_high_s8(B))
#define npyv_combineh_u16(A, B) vcombine_u16(vget_high_u16(A), vget_high_u16(B))
#define npyv_combineh_s16(A, B) vcombine_s16(vget_high_s16(A), vget_high_s16(B))
#define npyv_combineh_u32(A, B) vcombine_u32(vget_high_u32(A), vget_high_u32(B))
#define npyv_combineh_s32(A, B) vcombine_s32(vget_high_s32(A), vget_high_s32(B))
#define npyv_combineh_u64(A, B) vcombine_u64(vget_high_u64(A), vget_high_u64(B))
#define npyv_combineh_s64(A, B) vcombine_s64(vget_high_s64(A), vget_high_s64(B))
#define npyv_combineh_f32(A, B) vcombine_f32(vget_high_f32(A), vget_high_f32(B))
#endif
// combine two vectors from lower and higher parts of two other vectors
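// The result pairs the two lower halves in val[0] and the two higher halves
// in val[1], e.g. npyv_combine_u32({a0, a1, a2, a3}, {b0, b1, b2, b3}) ->
//   {val[0]: {a0, a1, b0, b1}, val[1]: {a2, a3, b2, b3}}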
#define NPYV_IMPL_NEON_COMBINE(T_VEC, SFX)                     \
    NPY_FINLINE T_VEC##x2 npyv_combine_##SFX(T_VEC a, T_VEC b) \
    {                                                          \
        T_VEC##x2 r;                                           \
        r.val[0] = NPY_CAT(npyv_combinel_, SFX)(a, b);         \
        r.val[1] = NPY_CAT(npyv_combineh_, SFX)(a, b);         \
        return r;                                              \
    }
NPYV_IMPL_NEON_COMBINE(npyv_u8, u8)
NPYV_IMPL_NEON_COMBINE(npyv_s8, s8)
NPYV_IMPL_NEON_COMBINE(npyv_u16, u16)
NPYV_IMPL_NEON_COMBINE(npyv_s16, s16)
NPYV_IMPL_NEON_COMBINE(npyv_u32, u32)
NPYV_IMPL_NEON_COMBINE(npyv_s32, s32)
NPYV_IMPL_NEON_COMBINE(npyv_u64, u64)
NPYV_IMPL_NEON_COMBINE(npyv_s64, s64)
NPYV_IMPL_NEON_COMBINE(npyv_f32, f32)
#ifdef __aarch64__
NPYV_IMPL_NEON_COMBINE(npyv_f64, f64)
#endif
// interleave two vectors
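// For example, with 32-bit lanes:
//   npyv_zip_u32({a0, a1, a2, a3}, {b0, b1, b2, b3}) ->
//   {val[0]: {a0, b0, a1, b1}, val[1]: {a2, b2, a3, b3}}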
#define NPYV_IMPL_NEON_ZIP(T_VEC, SFX)                     \
    NPY_FINLINE T_VEC##x2 npyv_zip_##SFX(T_VEC a, T_VEC b) \
    {                                                      \
        T_VEC##x2 r;                                       \
        r.val[0] = vzip1q_##SFX(a, b);                     \
        r.val[1] = vzip2q_##SFX(a, b);                     \
        return r;                                          \
    }
#ifdef __aarch64__
NPYV_IMPL_NEON_ZIP(npyv_u8, u8)
NPYV_IMPL_NEON_ZIP(npyv_s8, s8)
NPYV_IMPL_NEON_ZIP(npyv_u16, u16)
NPYV_IMPL_NEON_ZIP(npyv_s16, s16)
NPYV_IMPL_NEON_ZIP(npyv_u32, u32)
NPYV_IMPL_NEON_ZIP(npyv_s32, s32)
NPYV_IMPL_NEON_ZIP(npyv_f32, f32)
NPYV_IMPL_NEON_ZIP(npyv_f64, f64)
#else
#define npyv_zip_u8 vzipq_u8
#define npyv_zip_s8 vzipq_s8
#define npyv_zip_u16 vzipq_u16
#define npyv_zip_s16 vzipq_s16
#define npyv_zip_u32 vzipq_u32
#define npyv_zip_s32 vzipq_s32
#define npyv_zip_f32 vzipq_f32
#endif
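// For 64-bit lanes, interleaving and combining are the same operation: each
// half of a vector holds a single lane, so zip({a0, a1}, {b0, b1}) yields
// {val[0]: {a0, b0}, val[1]: {a1, b1}} either way.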
#define npyv_zip_u64 npyv_combine_u64
#define npyv_zip_s64 npyv_combine_s64
// Reverse elements of each 64-bit lane
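// e.g. npyv_rev64_u32({a0, a1, a2, a3}) -> {a1, a0, a3, a2}, and
//      npyv_rev64_u16({a0, a1, a2, a3, a4, a5, a6, a7})
//                                      -> {a3, a2, a1, a0, a7, a6, a5, a4}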
#define npyv_rev64_u8 vrev64q_u8
#define npyv_rev64_s8 vrev64q_s8
#define npyv_rev64_u16 vrev64q_u16
#define npyv_rev64_s16 vrev64q_s16
#define npyv_rev64_u32 vrev64q_u32
#define npyv_rev64_s32 vrev64q_s32
#define npyv_rev64_f32 vrev64q_f32
#endif // _NPY_SIMD_NEON_REORDER_H