// numpy/core/src/common/simd/vec/conversion.h
#ifndef NPY_SIMD
    #error "Not a standalone header"
#endif

#ifndef _NPY_SIMD_VEC_CVT_H
#define _NPY_SIMD_VEC_CVT_H

// convert boolean vectors to integer vectors
#define npyv_cvt_u8_b8(BL)   ((npyv_u8)  BL)
#define npyv_cvt_s8_b8(BL)   ((npyv_s8)  BL)
#define npyv_cvt_u16_b16(BL) ((npyv_u16) BL)
#define npyv_cvt_s16_b16(BL) ((npyv_s16) BL)
#define npyv_cvt_u32_b32(BL) ((npyv_u32) BL)
#define npyv_cvt_s32_b32(BL) ((npyv_s32) BL)
#define npyv_cvt_u64_b64(BL) ((npyv_u64) BL)
#define npyv_cvt_s64_b64(BL) ((npyv_s64) BL)
#if NPY_SIMD_F32
    #define npyv_cvt_f32_b32(BL) ((npyv_f32) BL)
#endif
#define npyv_cvt_f64_b64(BL) ((npyv_f64) BL)

// convert integer vectors to boolean vectors
#define npyv_cvt_b8_u8(A)   ((npyv_b8)  A)
#define npyv_cvt_b8_s8(A)   ((npyv_b8)  A)
#define npyv_cvt_b16_u16(A) ((npyv_b16) A)
#define npyv_cvt_b16_s16(A) ((npyv_b16) A)
#define npyv_cvt_b32_u32(A) ((npyv_b32) A)
#define npyv_cvt_b32_s32(A) ((npyv_b32) A)
#define npyv_cvt_b64_u64(A) ((npyv_b64) A)
#define npyv_cvt_b64_s64(A) ((npyv_b64) A)
#if NPY_SIMD_F32
    #define npyv_cvt_b32_f32(A) ((npyv_b32) A)
#endif
#define npyv_cvt_b64_f64(A) ((npyv_b64) A)
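
// Illustrative usage of the casts above (a sketch, not part of this header's
// API): comparisons yield boolean vectors, and these zero-cost reinterpret
// casts move between the mask view and the integer view of the same register.
//
//     npyv_u8 a      = npyv_setall_u8(1), b = npyv_setall_u8(2);
//     npyv_b8 mask   = npyv_cmpeq_u8(a, b);    // all lanes false here
//     npyv_u8 as_int = npyv_cvt_u8_b8(mask);   // mask as 0x00/0xFF bytes
//     npyv_b8 again  = npyv_cvt_b8_u8(as_int); // and back, still a no-op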

// expand
NPY_FINLINE npyv_u16x2 npyv_expand_u16_u8(npyv_u8 data)
{
    npyv_u16x2 r;
#ifdef NPY_HAVE_VX
    r.val[0] = vec_unpackh(data);
    r.val[1] = vec_unpackl(data);
#else
    npyv_u8 zero = npyv_zero_u8();
    r.val[0] = (npyv_u16)vec_mergeh(data, zero);
    r.val[1] = (npyv_u16)vec_mergel(data, zero);
#endif
    return r;
}

NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data)
{
    npyv_u32x2 r;
#ifdef NPY_HAVE_VX
    r.val[0] = vec_unpackh(data);
    r.val[1] = vec_unpackl(data);
#else
    npyv_u16 zero = npyv_zero_u16();
    r.val[0] = (npyv_u32)vec_mergeh(data, zero);
    r.val[1] = (npyv_u32)vec_mergel(data, zero);
#endif
    return r;
}
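
// Illustrative sketch of the expand helpers (assuming a 128-bit register):
// the sixteen unsigned bytes are zero-extended into two eight-lane 16-bit
// vectors, returned together in an npyv_u16x2 pair.
//
//     npyv_u8    bytes = npyv_setall_u8(200);
//     npyv_u16x2 words = npyv_expand_u16_u8(bytes); // every lane == 200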

// pack two 16-bit boolean vectors into one 8-bit boolean vector
NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b) {
    return vec_pack(a, b);
}

// pack four 32-bit boolean vectors into one 8-bit boolean vector
NPY_FINLINE npyv_b8 npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d) {
    npyv_b16 ab = vec_pack(a, b);
    npyv_b16 cd = vec_pack(c, d);
    return npyv_pack_b8_b16(ab, cd);
}

// pack eight 64-bit boolean vectors into one 8-bit boolean vector
NPY_FINLINE npyv_b8
npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
                 npyv_b64 e, npyv_b64 f, npyv_b64 g, npyv_b64 h) {
    npyv_b32 ab = vec_pack(a, b);
    npyv_b32 cd = vec_pack(c, d);
    npyv_b32 ef = vec_pack(e, f);
    npyv_b32 gh = vec_pack(g, h);
    return npyv_pack_b8_b32(ab, cd, ef, gh);
}
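
// Illustrative sketch of the pack helpers: four 32-bit comparison masks are
// narrowed back into a single byte-lane mask, e.g. after comparing four
// npyv_s32 blocks that together cover one npyv_u8's worth of lanes.
//
//     npyv_b32 m0 = npyv_cmpgt_s32(x0, y0), m1 = npyv_cmpgt_s32(x1, y1);
//     npyv_b32 m2 = npyv_cmpgt_s32(x2, y2), m3 = npyv_cmpgt_s32(x3, y3);
//     npyv_b8  m  = npyv_pack_b8_b32(m0, m1, m2, m3);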

// convert boolean vector to integer bitfield
#if defined(NPY_HAVE_VXE) || defined(NPY_HAVE_VSX2)
    NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
    {
        const npyv_u8 qperm = npyv_set_u8(120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0);
        npyv_u16 r = (npyv_u16)vec_vbpermq((npyv_u8)a, qperm);
    #ifdef NPY_HAVE_VXE
        return vec_extract(r, 3);
    #else
        return vec_extract(r, 4);
    #endif
        // suppress the warning: variable `r` set but not used [-Wunused-but-set-variable]
        (void)r;
    }
    NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a)
    {
        const npyv_u8 qperm = npyv_setf_u8(128, 112, 96, 80, 64, 48, 32, 16, 0);
        npyv_u8 r = (npyv_u8)vec_vbpermq((npyv_u8)a, qperm);
    #ifdef NPY_HAVE_VXE
        return vec_extract(r, 6);
    #else
        return vec_extract(r, 8);
    #endif
        // suppress the warning: variable `r` set but not used [-Wunused-but-set-variable]
        (void)r;
    }
    NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
    {
    #ifdef NPY_HAVE_VXE
        const npyv_u8 qperm = npyv_setf_u8(128, 128, 128, 128, 128, 96, 64, 32, 0);
    #else
        const npyv_u8 qperm = npyv_setf_u8(128, 96, 64, 32, 0);
    #endif
        npyv_u8 r = (npyv_u8)vec_vbpermq((npyv_u8)a, qperm);
    #ifdef NPY_HAVE_VXE
        return vec_extract(r, 6);
    #else
        return vec_extract(r, 8);
    #endif
        // suppress the warning: variable `r` set but not used [-Wunused-but-set-variable]
        (void)r;
    }
    NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
    {
    #ifdef NPY_HAVE_VXE
        const npyv_u8 qperm = npyv_setf_u8(128, 128, 128, 128, 128, 128, 128, 64, 0);
    #else
        const npyv_u8 qperm = npyv_setf_u8(128, 64, 0);
    #endif
        npyv_u8 r = (npyv_u8)vec_vbpermq((npyv_u8)a, qperm);
    #ifdef NPY_HAVE_VXE
        return vec_extract(r, 6);
    #else
        return vec_extract(r, 8);
    #endif
        // suppress the warning: variable `r` set but not used [-Wunused-but-set-variable]
        (void)r;
    }
#else
    NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
    {
        const npyv_u8 scale = npyv_set_u8(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
        npyv_u8 seq_scale = vec_and((npyv_u8)a, scale);
        npyv_u64 sum = vec_sum2(vec_sum4(seq_scale, npyv_zero_u8()), npyv_zero_u32());
        return vec_extract(sum, 0) + ((int)vec_extract(sum, 1) << 8);
    }
    NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a)
    {
        const npyv_u16 scale = npyv_set_u16(1, 2, 4, 8, 16, 32, 64, 128);
        npyv_u16 seq_scale = vec_and((npyv_u16)a, scale);
        npyv_u64 sum = vec_sum2(seq_scale, npyv_zero_u16());
        return vec_extract(vec_sum_u128(sum, npyv_zero_u64()), 15);
    }
    NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
    {
        const npyv_u32 scale = npyv_set_u32(1, 2, 4, 8);
        npyv_u32 seq_scale = vec_and((npyv_u32)a, scale);
        return vec_extract(vec_sum_u128(seq_scale, npyv_zero_u32()), 15);
    }
    NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
    {
        const npyv_u64 scale = npyv_set_u64(1, 2);
        npyv_u64 seq_scale = vec_and((npyv_u64)a, scale);
        return vec_extract(vec_sum_u128(seq_scale, npyv_zero_u64()), 15);
    }
#endif
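
// Illustrative sketch of tobits (assuming a 128-bit register): the result
// packs one bit per lane, bit `i` set iff lane `i` is true, so a byte mask
// yields 16 significant bits that a scalar loop or popcount can consume.
//
//     npyv_b8    m    = npyv_cmpeq_u8(a, b);
//     npy_uint64 bits = npyv_tobits_b8(m);
//     if (bits != 0) { /* at least one lane matched */ }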
// truncation compatible with all compilers (internal use for now)
#if NPY_SIMD_F32
    NPY_FINLINE npyv_s32 npyv__trunc_s32_f32(npyv_f32 a)
    {
    #ifdef NPY_HAVE_VXE2
        return vec_signed(a);
    #elif defined(NPY_HAVE_VXE)
        return vec_packs(vec_signed(npyv_doublee(vec_mergeh(a, a))),
                         vec_signed(npyv_doublee(vec_mergel(a, a))));
    // VSX
    #elif defined(__IBMC__)
        return vec_cts(a, 0);
    #elif defined(__clang__)
        /**
         * Old versions of Clang don't support %x<n> in the inline asm template,
         * which fixes the register number when using any of the register
         * constraints wa, wd, wf; therefore, we rely on the builtin conversion.
         */
        return __builtin_convertvector(a, npyv_s32);
    #else // gcc
        npyv_s32 ret;
        __asm__ ("xvcvspsxws %x0,%x1" : "=wa" (ret) : "wa" (a));
        return ret;
    #endif
    }
#endif
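
// Usage note for the internal helper above (a sketch, not a public API):
// truncation rounds toward zero, matching a scalar C cast, e.g.
//
//     npyv_s32 t = npyv__trunc_s32_f32(npyv_setall_f32(2.9f)); // all lanes 2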

NPY_FINLINE npyv_s32 npyv__trunc_s32_f64(npyv_f64 a, npyv_f64 b)
{
#ifdef NPY_HAVE_VX
    return vec_packs(vec_signed(a), vec_signed(b));
// VSX
#elif defined(__IBMC__)
    const npyv_u8 seq_even = npyv_set_u8(0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27);
    // unfortunately, XLC is missing the asm register fixer for VSX;
    // hopefully, XLC can optimize away the big-endian-compatible permute
    npyv_s32 lo_even = vec_cts(a, 0);
    npyv_s32 hi_even = vec_cts(b, 0);
    return vec_perm(lo_even, hi_even, seq_even);
#else
    const npyv_u8 seq_odd = npyv_set_u8(4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31);
    #ifdef __clang__
        // __builtin_convertvector doesn't support this conversion on a wide range of versions;
        // fortunately, almost all versions provide the direct builtin for 'xvcvdpsxws'
        npyv_s32 lo_odd = __builtin_vsx_xvcvdpsxws(a);
        npyv_s32 hi_odd = __builtin_vsx_xvcvdpsxws(b);
    #else // gcc
        npyv_s32 lo_odd, hi_odd;
        __asm__ ("xvcvdpsxws %x0,%x1" : "=wa" (lo_odd) : "wa" (a));
        __asm__ ("xvcvdpsxws %x0,%x1" : "=wa" (hi_odd) : "wa" (b));
    #endif
    return vec_perm(lo_odd, hi_odd, seq_odd);
#endif
}

// round to nearest integer (assuming the default round-to-even mode)
#if NPY_SIMD_F32
    NPY_FINLINE npyv_s32 npyv_round_s32_f32(npyv_f32 a)
    { return npyv__trunc_s32_f32(vec_rint(a)); }
#endif
NPY_FINLINE npyv_s32 npyv_round_s32_f64(npyv_f64 a, npyv_f64 b)
{ return npyv__trunc_s32_f64(vec_rint(a), vec_rint(b)); }
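
// Illustrative sketch: vec_rint rounds to nearest with ties to even under
// the default rounding mode, so 0.5 -> 0 and 1.5 -> 2, after which the
// truncation above is exact.
//
//     npyv_s32 r = npyv_round_s32_f64(npyv_setall_f64(0.5), npyv_setall_f64(1.5));
//     // the lanes packed from `a` are 0, those from `b` are 2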

#endif // _NPY_SIMD_VEC_CVT_H