/*@targets #simd_test*/
#include "_simd.h"
#include "_simd_inc.h"

#if NPY_SIMD
#include "_simd_data.inc"
#include "_simd_convert.inc"
#include "_simd_vector.inc"
#include "_simd_arg.inc"
#include "_simd_easyintrin.inc"

//#########################################################################
//## Defining NPYV intrinsics as module functions
//#########################################################################
/**begin repeat
 * #sfx       = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
 * #bsfx      = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64#
 * #esfx      = u16, s8, u32, s16, u32, s32, u64, s64, f32, f64#
 * #size      = 8,  8,  16,  16,  32,  32,  64,  64,  32,  64#
 * #expand_sup= 1,  0,  1,   0,   0,   0,   0,   0,   0,   0#
 * #simd_sup  = 1,  1,  1,   1,   1,   1,   1,   1,   NPY_SIMD_F32, NPY_SIMD_F64#
 * #fp_only   = 0,  0,  0,   0,   0,   0,   0,   0,   1,   1#
 * #sat_sup   = 1,  1,  1,   1,   0,   0,   0,   0,   0,   0#
 * #mul_sup   = 1,  1,  1,   1,   1,   1,   0,   0,   1,   1#
 * #div_sup   = 0,  0,  0,   0,   0,   0,   0,   0,   1,   1#
 * #fused_sup = 0,  0,  0,   0,   0,   0,   0,   0,   1,   1#
 * #sumup_sup = 1,  0,  1,   0,   0,   0,   0,   0,   0,   0#
 * #sum_sup   = 0,  0,  0,   0,   1,   0,   1,   0,   1,   1#
 * #rev64_sup = 1,  1,  1,   1,   1,   1,   0,   0,   1,   0#
 * #ncont_sup = 0,  0,  0,   0,   1,   1,   1,   1,   1,   1#
 * #intdiv_sup= 1,  1,  1,   1,   1,   1,   1,   1,   0,   0#
 * #shl_imm   = 0,  0,  15,  15,  31,  31,  63,  63,  0,   0#
 * #shr_imm   = 0,  0,  16,  16,  32,  32,  64,  64,  0,   0#
 * #bitw8b_sup= 1,  0,  0,   0,   0,   0,   0,   0,   0,   0#
 */
#if @simd_sup@
/***************************
 * Memory
 ***************************/
/**begin repeat1
 * # intrin = load, loada, loads, loadl#
 */
SIMD_IMPL_INTRIN_1(@intrin@_@sfx@, v@sfx@, q@sfx@)
/**end repeat1**/
SIMD_IMPL_INTRIN_1(load_@sfx@x2, v@sfx@x2, q@sfx@)

/**begin repeat1
 * # intrin = store, storea, stores, storel, storeh, store#
 * # x      = ,,,,, x2#
 */
// special definition due to the nature of @intrin@
static PyObject *
simd__intrin_@intrin@_@sfx@@x@(PyObject* NPY_UNUSED(self), PyObject *args)
{
    simd_arg seq_arg = {.dtype = simd_data_q@sfx@};
    simd_arg vec_arg = {.dtype = simd_data_v@sfx@@x@};
    if (!PyArg_ParseTuple(
        args, "O&O&:@intrin@_@sfx@@x@",
        simd_arg_converter, &seq_arg,
        simd_arg_converter, &vec_arg
    )) {
        return NULL;
    }
    npyv_@intrin@_@sfx@@x@(seq_arg.data.q@sfx@, vec_arg.data.v@sfx@@x@);
    // write-back
    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.q@sfx@, simd_data_q@sfx@)) {
        simd_arg_free(&seq_arg);
        return NULL;
    }
    simd_arg_free(&seq_arg);
    Py_RETURN_NONE;
}
/**end repeat1**/
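/*
 * Illustrative expansion (for reference only, not generated output): in the
 * u32 instantiation of the outer repeat, the load/loada/loads/loadl block
 * above is expected to come out roughly as
 *
 *     SIMD_IMPL_INTRIN_1(load_u32, vu32, qu32)
 *     SIMD_IMPL_INTRIN_1(loada_u32, vu32, qu32)
 *     SIMD_IMPL_INTRIN_1(loads_u32, vu32, qu32)
 *     SIMD_IMPL_INTRIN_1(loadl_u32, vu32, qu32)
 *
 * i.e. one module-level wrapper per load flavor and lane type.
 */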
/****************************************
 * Non-contiguous/Partial Memory access
 ****************************************/
#if @ncont_sup@
// Partial Load
SIMD_IMPL_INTRIN_3(load_till_@sfx@, v@sfx@, q@sfx@, u32, @sfx@)
SIMD_IMPL_INTRIN_2(load_tillz_@sfx@, v@sfx@, q@sfx@, u32)
#if @size@ == 32
    SIMD_IMPL_INTRIN_4(load2_till_@sfx@, v@sfx@, q@sfx@, u32, @sfx@, @sfx@)
    SIMD_IMPL_INTRIN_2(load2_tillz_@sfx@, v@sfx@, q@sfx@, u32)
#else
    SIMD_IMPL_INTRIN_4(load2_till_@sfx@, v@sfx@, q@sfx@, u32, @sfx@, @sfx@)
    SIMD_IMPL_INTRIN_2(load2_tillz_@sfx@, v@sfx@, q@sfx@, u32)
#endif

// Partial Store
/**begin repeat1
 * #intrin = store_till, store2_till, store2_till#
 * #chksize= 0, 32, 64#
 */
#if !@chksize@ || @chksize@ == @size@
static PyObject *
simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
{
    simd_arg seq_arg   = {.dtype = simd_data_q@sfx@};
    simd_arg nlane_arg = {.dtype = simd_data_u32};
    simd_arg vec_arg   = {.dtype = simd_data_v@sfx@};
    if (!PyArg_ParseTuple(
        args, "O&O&O&:@intrin@_@sfx@",
        simd_arg_converter, &seq_arg,
        simd_arg_converter, &nlane_arg,
        simd_arg_converter, &vec_arg
    )) {
        return NULL;
    }
    npyv_@intrin@_@sfx@(
        seq_arg.data.q@sfx@, nlane_arg.data.u32, vec_arg.data.v@sfx@
    );
    // write-back
    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.q@sfx@, simd_data_q@sfx@)) {
        simd_arg_free(&seq_arg);
        return NULL;
    }
    simd_arg_free(&seq_arg);
    Py_RETURN_NONE;
}
#endif // chksize
/**end repeat1**/

// Non-contiguous Load
/**begin repeat1
 * #intrin = loadn, loadn2, loadn2,
 *           loadn_till, loadn2_till, loadn2_till,
 *           loadn_tillz, loadn2_tillz, loadn2_tillz#
 * #scale  = 1,2,2, 1,2,2, 1,2,2#
 * #till   = 0*3, 1*3, 1*3#
 * #fill   = 0*3, 1*3, 0*3#
 * #fill2  = 0*3, 0,1,1, 0*3#
 * #format = ,,, O&O&, O&O&O&*2, O&*3#
 * #chksize= 0,32,64, 0,32,64, 0,32,64#
 */
#if !@chksize@ || @chksize@ == @size@
static PyObject *
simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
{
    simd_arg seq_arg    = {.dtype = simd_data_q@sfx@};
    simd_arg stride_arg = {.dtype = simd_data_s64};
#if @till@
    simd_arg nlane_arg  = {.dtype = simd_data_u32};
#endif // till
#if @fill@
    simd_arg fill_arg   = {.dtype = simd_data_@sfx@};
#endif
#if @fill2@
    simd_arg fill2_arg  = {.dtype = simd_data_@sfx@};
#endif
    if (!PyArg_ParseTuple(
        args, "@format@O&O&:@intrin@_@sfx@",
        simd_arg_converter, &seq_arg,
        simd_arg_converter, &stride_arg
#if @till@
        ,simd_arg_converter, &nlane_arg
#endif
#if @fill@
        ,simd_arg_converter, &fill_arg
#endif
#if @fill2@
        ,simd_arg_converter, &fill2_arg
#endif
    )) {
        return NULL;
    }
    npyv_lanetype_@sfx@ *seq_ptr = seq_arg.data.q@sfx@;
    npy_intp stride = (npy_intp)stride_arg.data.s64;
    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
    Py_ssize_t min_seq_len = stride * npyv_nlanes_@sfx@;
    if (stride < 0) {
        seq_ptr += cur_seq_len - 1 * @scale@;
        min_seq_len = -min_seq_len;
    }
    if (cur_seq_len < min_seq_len) {
        PyErr_Format(PyExc_ValueError,
            "@intrin@_@sfx@(), according to provided stride %d, the "
            "minimum acceptable size of the required sequence is %d, given(%d)",
            stride, min_seq_len, cur_seq_len
        );
        goto err;
    }
    npyv_@sfx@ rvec = npyv_@intrin@_@sfx@(
        seq_ptr, stride
    #if @till@
        , nlane_arg.data.u32
    #endif
    #if @fill@
        , fill_arg.data.@sfx@
    #endif
    #if @fill2@
        , fill2_arg.data.@sfx@
    #endif
    );
    simd_arg ret = {
        .dtype = simd_data_v@sfx@, .data = {.v@sfx@ = rvec}
    };
    simd_arg_free(&seq_arg);
    return simd_arg_to_obj(&ret);
err:
    simd_arg_free(&seq_arg);
    return NULL;
}
#endif // chksize
/**end repeat1**/
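/*
 * Worked example of the bound check above, assuming a 128-bit build where
 * npyv_nlanes_u32 is 4: calling loadn_u32 with stride = -2 yields
 * min_seq_len = -2 * 4 = -8; the negative-stride branch flips that to 8 and
 * points seq_ptr at the last element, so any sequence shorter than 8
 * elements is rejected with ValueError before the intrinsic is reached.
 */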
// Non-contiguous Store
/**begin repeat1
 * #intrin = storen, storen2, storen2,
 *           storen_till, storen2_till, storen2_till#
 * #scale  = 1,2,2, 1,2,2#
 * #till   = 0*3, 1*3#
 * #format = ,,, O&*3#
 * #chksize= 0,32,64, 0,32,64#
 */
#if !@chksize@ || @chksize@ == @size@
static PyObject *
simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
{
    simd_arg seq_arg    = {.dtype = simd_data_q@sfx@};
    simd_arg stride_arg = {.dtype = simd_data_s64};
    simd_arg vec_arg    = {.dtype = simd_data_v@sfx@};
#if @till@
    simd_arg nlane_arg  = {.dtype = simd_data_u32};
#endif
    if (!PyArg_ParseTuple(
        args, "@format@O&O&O&:@intrin@_@sfx@",
        simd_arg_converter, &seq_arg,
        simd_arg_converter, &stride_arg
#if @till@
        ,simd_arg_converter, &nlane_arg
#endif
        ,simd_arg_converter, &vec_arg
    )) {
        return NULL;
    }
    npyv_lanetype_@sfx@ *seq_ptr = seq_arg.data.q@sfx@;
    npy_intp stride = (npy_intp)stride_arg.data.s64;
    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
    Py_ssize_t min_seq_len = stride * npyv_nlanes_@sfx@;
    if (stride < 0) {
        seq_ptr += cur_seq_len - 1 * @scale@;
        min_seq_len = -min_seq_len;
    }
    // overflow guard
    if (cur_seq_len < min_seq_len) {
        PyErr_Format(PyExc_ValueError,
            "@intrin@_@sfx@(), according to provided stride %d, the "
            "minimum acceptable size of the required sequence is %d, given(%d)",
            stride, min_seq_len, cur_seq_len
        );
        goto err;
    }
    npyv_@intrin@_@sfx@(
        seq_ptr, stride
    #if @till@
        ,nlane_arg.data.u32
    #endif
        ,vec_arg.data.v@sfx@
    );
    // write-back
    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.q@sfx@, simd_data_q@sfx@)) {
        goto err;
    }
    simd_arg_free(&seq_arg);
    Py_RETURN_NONE;
err:
    simd_arg_free(&seq_arg);
    return NULL;
}
#endif // chksize
/**end repeat1**/
#endif // @ncont_sup@

/****************************
 * Lookup tables
 ****************************/
#if @size@ == 32
SIMD_IMPL_INTRIN_2(lut32_@sfx@, v@sfx@, q@sfx@, vu@size@)
#endif
#if @size@ == 64
SIMD_IMPL_INTRIN_2(lut16_@sfx@, v@sfx@, q@sfx@, vu@size@)
#endif

/***************************
 * Misc
 ***************************/
SIMD_IMPL_INTRIN_0(zero_@sfx@, v@sfx@)
SIMD_IMPL_INTRIN_1(extract0_@sfx@, @sfx@, v@sfx@)
SIMD_IMPL_INTRIN_1(setall_@sfx@, v@sfx@, @sfx@)
SIMD_IMPL_INTRIN_3(select_@sfx@, v@sfx@, v@bsfx@, v@sfx@, v@sfx@)

/**begin repeat1
 * #sfx_to    = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
 * #simd_sup2 = 1,  1,  1,   1,   1,   1,   1,   1,   NPY_SIMD_F32, NPY_SIMD_F64#
 */
#if @simd_sup2@
SIMD_IMPL_INTRIN_1(reinterpret_@sfx_to@_@sfx@, v@sfx_to@, v@sfx@)
#endif // simd_sup2
/**end repeat1**/

/**
 * special definition due to the nature of the intrinsics
 * npyv_setf_@sfx@ and npyv_set_@sfx@.
 */
/**begin repeat1
 * #intrin = setf, set#
 */
static PyObject *
simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
{
    npyv_lanetype_@sfx@ *data = simd_sequence_from_iterable(args, simd_data_q@sfx@, npyv_nlanes_@sfx@);
    if (data == NULL) {
        return NULL;
    }
    simd_data r = {.v@sfx@ = npyv_@intrin@_@sfx@(
        data[0],  data[1],  data[2],  data[3],  data[4],  data[5],  data[6],  data[7],
        data[8],  data[9],  data[10], data[11], data[12], data[13], data[14], data[15],
        data[16], data[17], data[18], data[19], data[20], data[21], data[22], data[23],
        data[24], data[25], data[26], data[27], data[28], data[29], data[30], data[31],
        data[32], data[33], data[34], data[35], data[36], data[37], data[38], data[39],
        data[40], data[41], data[42], data[43], data[44], data[45], data[46], data[47],
        data[48], data[49], data[50], data[51], data[52], data[53], data[54], data[55],
        data[56], data[57], data[58], data[59], data[60], data[61], data[62], data[63],
        data[64] // for setf
    )};
    simd_sequence_free(data);
    return (PyObject*)PySIMDVector_FromData(r, simd_data_v@sfx@);
}
/**end repeat1**/
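/*
 * Note on the set/setf wrappers above: 65 values are always forwarded
 * because the widest case is 64 lanes (8-bit lanes on a 512-bit build) and
 * setf appears to take one extra leading argument (the fill value for the
 * unspecified lanes); set only consumes the first 64, and the underlying
 * variadic macros are expected to ignore any surplus on narrower vectors.
 */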
/***************************
 * Reorder
 ***************************/
/**begin repeat1
 * # intrin = combinel, combineh#
 */
SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@)
/**end repeat1**/

/**begin repeat1
 * # intrin = combine, zip, unzip#
 */
SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@x2, v@sfx@, v@sfx@)
/**end repeat1**/

#if @rev64_sup@
SIMD_IMPL_INTRIN_1(rev64_@sfx@, v@sfx@, v@sfx@)
#endif

// special implementation to convert runtime constants to immediate values
#if @size@ == 32
// one call per element index, then gather the results within one vector
// instead of unrolling the 255 possible cases.
NPY_FINLINE npyv_@sfx@
npyv_permi128_@sfx@_(npyv_@sfx@ a, unsigned e0, unsigned e1, unsigned e2, unsigned e3)
{
    /**begin repeat1
     * # en = e0, e1, e2, e3#
     */
    npyv_@sfx@ v@en@;
    npyv_lanetype_@sfx@ d@en@[npyv_nlanes_@sfx@];
    if (0) {}
    /**begin repeat2
     * # imm = 1, 2, 3#
     */
    else if (@en@ == @imm@) {
        v@en@ = npyv_permi128_@sfx@(a, @imm@, @imm@, @imm@, @imm@);
    }
    /**end repeat2**/
    else {
        v@en@ = npyv_permi128_@sfx@(a, 0, 0, 0, 0);
    }
    npyv_store_@sfx@(d@en@, v@en@);
    /**end repeat1**/
    if (e0 == e1 && e0 == e2 && e0 == e3) {
        return ve0;
    }
    for (int i = 0; i < npyv_nlanes_@sfx@; i += 4) {
        de0[i+1] = de1[i+1];
        de0[i+2] = de2[i+2];
        de0[i+3] = de3[i+3];
    }
    return npyv_load_@sfx@(de0);
}
SIMD_IMPL_INTRIN_5(permi128_@sfx@_, v@sfx@, v@sfx@, u8, u8, u8, u8)
#elif @size@ == 64
NPY_FINLINE npyv_@sfx@
npyv_permi128_@sfx@_(npyv_@sfx@ a, unsigned e0, unsigned e1)
{
    if (e0 == 1 && e1 == 0) {
        return npyv_permi128_@sfx@(a, 1, 0);
    }
    else if (e0 == 0 && e1 == 1) {
        return npyv_permi128_@sfx@(a, 0, 1);
    }
    else if (e0 == 1 && e1 == 1) {
        return npyv_permi128_@sfx@(a, 1, 1);
    }
    return npyv_permi128_@sfx@(a, 0, 0);
}
SIMD_IMPL_INTRIN_3(permi128_@sfx@_, v@sfx@, v@sfx@, u8, u8)
#endif
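/*
 * Illustration of the 32-bit helper above: for the u32 instantiation the
 * nested repeats unroll, per element index, into a small runtime dispatch
 * such as (shown for e0 only):
 *
 *     npyv_u32 ve0;
 *     npyv_lanetype_u32 de0[npyv_nlanes_u32];
 *     if (0) {}
 *     else if (e0 == 1) { ve0 = npyv_permi128_u32(a, 1, 1, 1, 1); }
 *     else if (e0 == 2) { ve0 = npyv_permi128_u32(a, 2, 2, 2, 2); }
 *     else if (e0 == 3) { ve0 = npyv_permi128_u32(a, 3, 3, 3, 3); }
 *     else              { ve0 = npyv_permi128_u32(a, 0, 0, 0, 0); }
 *     npyv_store_u32(de0, ve0);
 *
 * so each runtime index is mapped onto a call with immediate constants, and
 * the four per-element results are then merged lane by lane.
 */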
/***************************
 * Operators
 ***************************/
#if @shl_imm@ > 0
SIMD_IMPL_INTRIN_2(shl_@sfx@, v@sfx@, v@sfx@, u8)
SIMD_IMPL_INTRIN_2(shr_@sfx@, v@sfx@, v@sfx@, u8)
// immediate constant
SIMD_IMPL_INTRIN_2IMM(shli_@sfx@, v@sfx@, v@sfx@, @shl_imm@)
SIMD_IMPL_INTRIN_2IMM(shri_@sfx@, v@sfx@, v@sfx@, @shr_imm@)
#endif // shl_imm

/**begin repeat1
 * #intrin = and, or, xor#
 */
SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@)
/**end repeat1**/

SIMD_IMPL_INTRIN_1(not_@sfx@, v@sfx@, v@sfx@)

/**begin repeat1
 * #intrin = cmpeq, cmpneq, cmpgt, cmpge, cmplt, cmple#
 */
SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@bsfx@, v@sfx@, v@sfx@)
/**end repeat1**/

#if @bitw8b_sup@
SIMD_IMPL_INTRIN_2(andc_@sfx@, v@sfx@, v@sfx@, v@sfx@)
SIMD_IMPL_INTRIN_2(andc_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@)
SIMD_IMPL_INTRIN_2(orc_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@)
SIMD_IMPL_INTRIN_2(xnor_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@)
#endif

// test across all vector lanes
/**begin repeat1
 * #intrin = any, all#
 */
SIMD_IMPL_INTRIN_1(@intrin@_@sfx@, u8, v@sfx@)
/**end repeat1**/

/***************************
 * Conversion
 ***************************/
SIMD_IMPL_INTRIN_1(cvt_@sfx@_@bsfx@, v@sfx@, v@bsfx@)
SIMD_IMPL_INTRIN_1(cvt_@bsfx@_@sfx@, v@bsfx@, v@sfx@)
#if @expand_sup@
SIMD_IMPL_INTRIN_1(expand_@esfx@_@sfx@, v@esfx@x2, v@sfx@)
#endif // expand_sup

/***************************
 * Arithmetic
 ***************************/
/**begin repeat1
 * #intrin = add, sub#
 */
SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@)
/**end repeat1**/

#if @sat_sup@
/**begin repeat1
 * #intrin = adds, subs#
 */
SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@)
/**end repeat1**/
#endif // sat_sup

#if @mul_sup@
SIMD_IMPL_INTRIN_2(mul_@sfx@, v@sfx@, v@sfx@, v@sfx@)
#endif // mul_sup

#if @div_sup@
SIMD_IMPL_INTRIN_2(div_@sfx@, v@sfx@, v@sfx@, v@sfx@)
#endif // div_sup

#if @intdiv_sup@
SIMD_IMPL_INTRIN_1(divisor_@sfx@, v@sfx@x3, @sfx@)
SIMD_IMPL_INTRIN_2(divc_@sfx@, v@sfx@, v@sfx@, v@sfx@x3)
#endif // intdiv_sup

#if @fused_sup@
/**begin repeat1
 * #intrin = muladd, mulsub, nmuladd, nmulsub, muladdsub#
 */
SIMD_IMPL_INTRIN_3(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@, v@sfx@)
/**end repeat1**/
#endif // fused_sup

#if @sum_sup@
SIMD_IMPL_INTRIN_1(sum_@sfx@, @sfx@, v@sfx@)
#endif // sum_sup

#if @sumup_sup@
SIMD_IMPL_INTRIN_1(sumup_@sfx@, @esfx@, v@sfx@)
#endif // sumup_sup

/***************************
 * Math
 ***************************/
#if @fp_only@
/**begin repeat1
 * #intrin = sqrt, recip, abs, square, rint, ceil, trunc, floor#
 */
SIMD_IMPL_INTRIN_1(@intrin@_@sfx@, v@sfx@, v@sfx@)
/**end repeat1**/
#endif

/**begin repeat1
 * #intrin = max, min#
 */
SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@)
SIMD_IMPL_INTRIN_1(reduce_@intrin@_@sfx@, @sfx@, v@sfx@)
/**end repeat1**/

#if @fp_only@
/**begin repeat1
 * #intrin = maxp, minp, maxn, minn#
 */
SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@)
SIMD_IMPL_INTRIN_1(reduce_@intrin@_@sfx@, @sfx@, v@sfx@)
/**end repeat1**/
#endif

/***************************
 * Mask operations
 ***************************/
/**begin repeat1
 * #intrin = ifadd, ifsub#
 */
SIMD_IMPL_INTRIN_4(@intrin@_@sfx@, v@sfx@, v@bsfx@, v@sfx@, v@sfx@, v@sfx@)
/**end repeat1**/
#if @fp_only@
SIMD_IMPL_INTRIN_4(ifdiv_@sfx@, v@sfx@, v@bsfx@, v@sfx@, v@sfx@, v@sfx@)
SIMD_IMPL_INTRIN_3(ifdivz_@sfx@, v@sfx@, v@bsfx@, v@sfx@, v@sfx@)
#endif

#endif // simd_sup
/**end repeat**/

/*************************************************************************
 * Variant
 ************************************************************************/
SIMD_IMPL_INTRIN_0N(cleanup)

/*************************************************************************
 * A special section for f32/f64 intrinsics outside the main repeater
 ************************************************************************/
/***************************
 * Operators
 ***************************/
// check special cases
#if NPY_SIMD_F32
    SIMD_IMPL_INTRIN_1(notnan_f32, vb32, vf32)
#endif
#if NPY_SIMD_F64
    SIMD_IMPL_INTRIN_1(notnan_f64, vb64, vf64)
#endif
/***************************
 * Conversions
 ***************************/
// round to nearest integer (assume even)
#if NPY_SIMD_F32
    SIMD_IMPL_INTRIN_1(round_s32_f32, vs32, vf32)
#endif
#if NPY_SIMD_F64
    SIMD_IMPL_INTRIN_2(round_s32_f64, vs32, vf64, vf64)
#endif

/*************************************************************************
 * A special section for boolean intrinsics outside the main repeater
 ************************************************************************/
/***************************
 * Operators
 ***************************/
/**begin repeat
 * #bsfx = b8, b16, b32, b64#
 */
// Logical
SIMD_IMPL_INTRIN_2(and_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@)
SIMD_IMPL_INTRIN_2(or_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@)
SIMD_IMPL_INTRIN_2(xor_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@)
SIMD_IMPL_INTRIN_1(not_@bsfx@, v@bsfx@, v@bsfx@)
// test across all vector lanes
/**begin repeat1
 * #intrin = any, all#
 */
SIMD_IMPL_INTRIN_1(@intrin@_@bsfx@, u8, v@bsfx@)
/**end repeat1**/
/**end repeat**/
/***************************
 * Conversions
 ***************************/
// Convert mask vector to integer bitfield
/**begin repeat
 * #bsfx = b8, b16, b32, b64#
 */
SIMD_IMPL_INTRIN_1(tobits_@bsfx@, u64, v@bsfx@)
/**end repeat**/

SIMD_IMPL_INTRIN_2(pack_b8_b16, vb8, vb16, vb16)
SIMD_IMPL_INTRIN_4(pack_b8_b32, vb8, vb32, vb32, vb32, vb32)
SIMD_IMPL_INTRIN_8(pack_b8_b64, vb8, vb64, vb64, vb64, vb64, vb64, vb64, vb64, vb64)

//#########################################################################
//## Attach module functions
//#########################################################################
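/*
 * The method table below is populated almost entirely via SIMD_INTRIN_DEF,
 * which presumably expands to a PyMethodDef entry pairing the intrinsic name
 * with its simd__intrin_* wrapper, along the lines of
 *
 *     { "add_u32", simd__intrin_add_u32, METH_VARARGS, NULL },
 *
 * so every wrapper defined above becomes a callable attribute of the module.
 */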
static PyMethodDef simd__intrinsics_methods[] = {
/**begin repeat
 * #sfx       = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
 * #bsfx      = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64#
 * #size      = 8,  8,  16,  16,  32,  32,  64,  64,  32,  64#
 * #esfx      = u16, s8, u32, s16, u32, s32, u64, s64, f32, f64#
 * #expand_sup= 1,  0,  1,   0,   0,   0,   0,   0,   0,   0#
 * #simd_sup  = 1,  1,  1,   1,   1,   1,   1,   1,   NPY_SIMD_F32, NPY_SIMD_F64#
 * #fp_only   = 0,  0,  0,   0,   0,   0,   0,   0,   1,   1#
 * #sat_sup   = 1,  1,  1,   1,   0,   0,   0,   0,   0,   0#
 * #mul_sup   = 1,  1,  1,   1,   1,   1,   0,   0,   1,   1#
 * #div_sup   = 0,  0,  0,   0,   0,   0,   0,   0,   1,   1#
 * #fused_sup = 0,  0,  0,   0,   0,   0,   0,   0,   1,   1#
 * #sumup_sup = 1,  0,  1,   0,   0,   0,   0,   0,   0,   0#
 * #sum_sup   = 0,  0,  0,   0,   1,   0,   1,   0,   1,   1#
 * #rev64_sup = 1,  1,  1,   1,   1,   1,   0,   0,   1,   0#
 * #ncont_sup = 0,  0,  0,   0,   1,   1,   1,   1,   1,   1#
 * #intdiv_sup= 1,  1,  1,   1,   1,   1,   1,   1,   0,   0#
 * #shl_imm   = 0,  0,  15,  15,  31,  31,  63,  63,  0,   0#
 * #shr_imm   = 0,  0,  16,  16,  32,  32,  64,  64,  0,   0#
 * #bitw8b_sup= 1,  0,  0,   0,   0,   0,   0,   0,   0,   0#
 */
#if @simd_sup@
/***************************
 * Memory
 ***************************/
/**begin repeat1
 * # intrin = load, loada, loads, loadl, store, storea, stores, storel, storeh#
 */
SIMD_INTRIN_DEF(@intrin@_@sfx@)
/**end repeat1**/

/**begin repeat1
 * # intrin = load, store#
 */
SIMD_INTRIN_DEF(@intrin@_@sfx@x2)
/**end repeat1**/

/****************************************
 * Non-contiguous/Partial Memory access
 ****************************************/
#if @ncont_sup@
/**begin repeat1
 * #intrin = load_till, load_tillz, loadn, loadn_till, loadn_tillz,
 *           store_till, storen, storen_till#
 */
SIMD_INTRIN_DEF(@intrin@_@sfx@)
/**end repeat1**/
#if @size@ == 32
/**begin repeat1
 * #intrin = load2_till, load2_tillz, loadn2, loadn2_till, loadn2_tillz,
 *           store2_till, storen2, storen2_till#
 */
SIMD_INTRIN_DEF(@intrin@_@sfx@)
/**end repeat1**/
#else
/**begin repeat1
 * #intrin = load2_till, load2_tillz, loadn2, loadn2_till, loadn2_tillz,
 *           store2_till, storen2, storen2_till#
 */
SIMD_INTRIN_DEF(@intrin@_@sfx@)
/**end repeat1**/
#endif
#endif // ncont_sup

/****************************
 * Lookup tables
 ****************************/
#if @size@ == 32
SIMD_INTRIN_DEF(lut32_@sfx@)
#endif
#if @size@ == 64
SIMD_INTRIN_DEF(lut16_@sfx@)
#endif

/***************************
 * Misc
 ***************************/
/**begin repeat1
 * #sfx_to    = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
 * #simd_sup2 = 1,  1,  1,   1,   1,   1,   1,   1,   NPY_SIMD_F32, NPY_SIMD_F64#
 */
#if @simd_sup2@
SIMD_INTRIN_DEF(reinterpret_@sfx_to@_@sfx@)
#endif // simd_sup2
/**end repeat1**/

/**begin repeat1
 * # intrin = set, setf, setall, zero, select, extract0#
 */
SIMD_INTRIN_DEF(@intrin@_@sfx@)
/**end repeat1**/

/***************************
 * Reorder
 ***************************/
/**begin repeat1
 * # intrin = combinel, combineh, combine, zip, unzip#
 */
SIMD_INTRIN_DEF(@intrin@_@sfx@)
/**end repeat1**/

#if @rev64_sup@
SIMD_INTRIN_DEF(rev64_@sfx@)
#endif

#if @size@ > 16
{ "permi128_@sfx@", simd__intrin_permi128_@sfx@_, METH_VARARGS, NULL },
#endif
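/*
 * The permi128 entry above (32/64-bit lane types only) is written out by
 * hand rather than through SIMD_INTRIN_DEF: the helper is registered under
 * the plain "permi128_..." name while its C wrapper symbol carries a
 * trailing underscore (it wraps the npyv_permi128_..._ shim defined
 * earlier), so the name/symbol pair has to be spelled out explicitly.
 */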
/***************************
 * Operators
 ***************************/
#if @shl_imm@ > 0
/**begin repeat1
 * # intrin = shl, shr, shli, shri#
 */
SIMD_INTRIN_DEF(@intrin@_@sfx@)
/**end repeat1**/
#endif // shl_imm

/**begin repeat1
 * #intrin = and, or, xor, not, cmpeq, cmpneq, cmpgt, cmpge, cmplt, cmple,
 *           any, all#
 */
SIMD_INTRIN_DEF(@intrin@_@sfx@)
/**end repeat1**/

#if @bitw8b_sup@
SIMD_INTRIN_DEF(andc_@sfx@)
SIMD_INTRIN_DEF(andc_@bsfx@)
SIMD_INTRIN_DEF(orc_@bsfx@)
SIMD_INTRIN_DEF(xnor_@bsfx@)
#endif

/***************************
 * Conversion
 ***************************/
SIMD_INTRIN_DEF(cvt_@sfx@_@bsfx@)
SIMD_INTRIN_DEF(cvt_@bsfx@_@sfx@)
#if @expand_sup@
SIMD_INTRIN_DEF(expand_@esfx@_@sfx@)
#endif // expand_sup

/***************************
 * Arithmetic
 ***************************/
/**begin repeat1
 * #intrin = add, sub#
 */
SIMD_INTRIN_DEF(@intrin@_@sfx@)
/**end repeat1**/

#if @sat_sup@
/**begin repeat1
 * #intrin = adds, subs#
 */
SIMD_INTRIN_DEF(@intrin@_@sfx@)
/**end repeat1**/
#endif // sat_sup

#if @mul_sup@
SIMD_INTRIN_DEF(mul_@sfx@)
#endif // mul_sup

#if @div_sup@
SIMD_INTRIN_DEF(div_@sfx@)
#endif // div_sup

#if @intdiv_sup@
SIMD_INTRIN_DEF(divisor_@sfx@)
SIMD_INTRIN_DEF(divc_@sfx@)
#endif // intdiv_sup

#if @fused_sup@
/**begin repeat1
 * #intrin = muladd, mulsub, nmuladd, nmulsub, muladdsub#
 */
SIMD_INTRIN_DEF(@intrin@_@sfx@)
/**end repeat1**/
#endif // fused_sup

#if @sum_sup@
SIMD_INTRIN_DEF(sum_@sfx@)
#endif // sum_sup

#if @sumup_sup@
SIMD_INTRIN_DEF(sumup_@sfx@)
#endif // sumup_sup

/***************************
 * Math
 ***************************/
#if @fp_only@
/**begin repeat1
 * #intrin = sqrt, recip, abs, square, rint, ceil, trunc, floor#
 */
SIMD_INTRIN_DEF(@intrin@_@sfx@)
/**end repeat1**/
#endif

/**begin repeat1
 * #intrin = max, min#
 */
SIMD_INTRIN_DEF(@intrin@_@sfx@)
SIMD_INTRIN_DEF(reduce_@intrin@_@sfx@)
/**end repeat1**/

#if @fp_only@
/**begin repeat1
 * #intrin = maxp, minp, maxn, minn#
 */
SIMD_INTRIN_DEF(@intrin@_@sfx@)
SIMD_INTRIN_DEF(reduce_@intrin@_@sfx@)
/**end repeat1**/
#endif

/***************************
 * Mask operations
 ***************************/
/**begin repeat1
 * #intrin = ifadd, ifsub#
 */
SIMD_INTRIN_DEF(@intrin@_@sfx@)
/**end repeat1**/
#if @fp_only@
/**begin repeat1
 * #intrin = ifdiv, ifdivz#
 */
SIMD_INTRIN_DEF(@intrin@_@sfx@)
/**end repeat1**/
#endif

#endif // simd_sup
/**end repeat**/

/*************************************************************************
 * Variant
 ************************************************************************/
SIMD_INTRIN_DEF(cleanup)

/*************************************************************************
 * A special section for f32/f64 intrinsics outside the main repeater
 ************************************************************************/
/***************************
 * Operators
 ***************************/
// check special cases
#if NPY_SIMD_F32
    SIMD_INTRIN_DEF(notnan_f32)
#endif
#if NPY_SIMD_F64
    SIMD_INTRIN_DEF(notnan_f64)
#endif
/***************************
 * Conversions
 ***************************/
// round to nearest integer (assume even)
#if NPY_SIMD_F32
    SIMD_INTRIN_DEF(round_s32_f32)
#endif
#if NPY_SIMD_F64
    SIMD_INTRIN_DEF(round_s32_f64)
#endif

/*************************************************************************
 * A special section for boolean intrinsics outside the main repeater
 ************************************************************************/
/***************************
 * Operators
 ***************************/
/**begin repeat
 * #bsfx = b8, b16, b32, b64#
 */
// Logical
SIMD_INTRIN_DEF(and_@bsfx@)
SIMD_INTRIN_DEF(or_@bsfx@)
SIMD_INTRIN_DEF(xor_@bsfx@)
SIMD_INTRIN_DEF(not_@bsfx@)
// test across all vector lanes
/**begin repeat1
 * #intrin = any, all#
 */
SIMD_INTRIN_DEF(@intrin@_@bsfx@)
/**end repeat1**/
/**end repeat**/
/***************************
 * Conversions
 ***************************/
// Convert mask vector to integer bitfield
/**begin repeat
 * #bsfx = b8, b16, b32, b64#
 */
SIMD_INTRIN_DEF(tobits_@bsfx@)
/**end repeat**/

// Pack multiple vectors into one
SIMD_INTRIN_DEF(pack_b8_b16)
SIMD_INTRIN_DEF(pack_b8_b32)
SIMD_INTRIN_DEF(pack_b8_b64)
/************************************************************************/
{NULL, NULL, 0, NULL}
}; // PyMethodDef

#endif // NPY_SIMD

//#########################################################################
//## Defining a separate module for each target
//#########################################################################
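/*
 * Naming sketch: NPY_TOSTRING stringifies the current dispatch target, so a
 * build pass where NPY__CPU_TARGET_CURRENT is, say, AVX512_SKX is expected
 * to register the module below as "numpy.core._simd.AVX512_SKX", while the
 * baseline (non-dispatched) pass falls back to "numpy.core._simd.baseline".
 */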
NPY_VISIBILITY_HIDDEN PyObject *
NPY_CPU_DISPATCH_CURFX(simd_create_module)(void)
{
    static struct PyModuleDef defs = {
        .m_base = PyModuleDef_HEAD_INIT,
    #ifdef NPY__CPU_TARGET_CURRENT
        .m_name = "numpy.core._simd." NPY_TOSTRING(NPY__CPU_TARGET_CURRENT),
    #else
        .m_name = "numpy.core._simd.baseline",
    #endif
        .m_size = -1,
    #if NPY_SIMD
        .m_methods = simd__intrinsics_methods
    #else
        .m_methods = NULL
    #endif
    };
    PyObject *m = PyModule_Create(&defs);
    if (m == NULL) {
        return NULL;
    }
    if (PyModule_AddIntConstant(m, "simd", NPY_SIMD)) {
        goto err;
    }
    if (PyModule_AddIntConstant(m, "simd_f64", NPY_SIMD_F64)) {
        goto err;
    }
    if (PyModule_AddIntConstant(m, "simd_f32", NPY_SIMD_F32)) {
        goto err;
    }
    if (PyModule_AddIntConstant(m, "simd_fma3", NPY_SIMD_FMA3)) {
        goto err;
    }
    if (PyModule_AddIntConstant(m, "simd_width", NPY_SIMD_WIDTH)) {
        goto err;
    }
    if (PyModule_AddIntConstant(m, "simd_bigendian", NPY_SIMD_BIGENDIAN)) {
        goto err;
    }
#if NPY_SIMD
    if (PySIMDVectorType_Init(m)) {
        goto err;
    }
    /**begin repeat
     * #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
     */
    if (PyModule_AddIntConstant(m, "nlanes_@sfx@", npyv_nlanes_@sfx@)) {
        goto err;
    }
    /**end repeat**/
#endif // NPY_SIMD
    return m;
err:
    Py_DECREF(m);
    return NULL;
}
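/*
 * For reference, the repeat block inside simd_create_module expands into one
 * PyModule_AddIntConstant call per lane type, exposing the constants
 * "nlanes_u8" through "nlanes_f64" on the module so callers can query the
 * lane count of the active SIMD width at runtime.
 */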