diff options
author | DumbMice <bangchengyang@hotmail.com> | 2020-07-31 13:25:26 +0800 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-07-31 08:25:26 +0300 |
commit | 6f0436d745d1a10b53c0fdbc484fbd942386f9ea (patch) | |
tree | 6b7530bb1d095468996ca45f27c6cdbf5035e726 /numpy | |
parent | b66f02bed6380f6f88a21adf77ca4ce54e4a9052 (diff) | |
download | numpy-6f0436d745d1a10b53c0fdbc484fbd942386f9ea.tar.gz |
ENH: Add Neon SIMD implementations for add, sub, mul, and div (#16969)
* ENH: Add universal SIMD implementations of add, sub, mul, div for non-X86
Diffstat (limited to 'numpy')
-rw-r--r-- | numpy/core/src/umath/simd.inc.src | 114 |
1 files changed, 113 insertions, 1 deletions
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index 7866f8143..40bb76914 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -28,6 +28,7 @@ #undef __AVX512F__ #endif #endif +#include "simd/simd.h" #include <assert.h> #include <stdlib.h> #include <float.h> @@ -505,6 +506,7 @@ run_unary_avx512f_log_DOUBLE(char **args, npy_intp const *dimensions, npy_intp c * #type = npy_float, npy_double, npy_longdouble# * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# * #vector = 1, 1, 0# + * #VECTOR = NPY_SIMD, NPY_SIMD_F64, 0 # */ /**begin repeat1 @@ -553,6 +555,18 @@ static void sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n); +#elif @VECTOR@ + +static void +simd_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, + npy_intp n); +static void +simd_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, + npy_intp n); +static void +simd_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, + npy_intp n); + #endif static NPY_INLINE int @@ -584,6 +598,25 @@ run_binary_simd_@kind@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp sse2_binary_@kind@_@TYPE@(op, ip1, ip2, n); return 1; } +#elif @VECTOR@ + @type@ * ip1 = (@type@ *)args[0]; + @type@ * ip2 = (@type@ *)args[1]; + @type@ * op = (@type@ *)args[2]; + npy_intp n = dimensions[0]; + /* argument one scalar */ + if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), NPY_SIMD_WIDTH)) { + simd_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n); + return 1; + } + /* argument two scalar */ + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH)) { + simd_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n); + return 1; + } + else if (IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH)) { + simd_binary_@kind@_@TYPE@(op, ip1, ip2, n); + return 1; + } #endif return 0; } @@ -3694,7 +3727,86 @@ sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n) /**end repeat**/ #undef VECTOR_SIZE_BYTES +#else /* NPY_HAVE_SSE2_INTRINSICS */ -#endif /* NPY_HAVE_SSE2_INTRINSICS */ +/**begin repeat + * #type = npy_float, npy_double# + * #TYPE = FLOAT, DOUBLE# + * #sfx = f32, f64# + * #CHK = , _F64# + */ + +#if NPY_SIMD@CHK@ +/**begin repeat1 +* Arithmetic +* # kind = add, subtract, multiply, divide# +* # OP = +, -, *, /# +* # VOP = add, sub, mul, div# +*/ + +static void +simd_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) +{ + LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) { + op[i] = ip1[i] @OP@ ip2[i]; + } + /* lots of specializations, to squeeze out max performance */ + if (ip1 == ip2) { + LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) { + npyv_@sfx@ a = npyv_load_@sfx@(&ip1[i]); + npyv_@sfx@ c = npyv_@VOP@_@sfx@(a, a); + npyv_store_@sfx@(&op[i], c); + } + } + else { + LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) { + npyv_@sfx@ a = npyv_load_@sfx@(&ip1[i]); + npyv_@sfx@ b = npyv_load_@sfx@(&ip2[i]); + npyv_@sfx@ c = npyv_@VOP@_@sfx@(a, b); + npyv_store_@sfx@(&op[i], c); + } + } + LOOP_BLOCKED_END { + op[i] = ip1[i] @OP@ ip2[i]; + } +} + +static void +simd_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) +{ + const npyv_@sfx@ v1 = npyv_setall_@sfx@(ip1[0]); + LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) { + op[i] = ip1[0] @OP@ ip2[i]; + } + LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) { + npyv_@sfx@ v2 = npyv_load_@sfx@(&ip2[i]); + npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2); + npyv_store_@sfx@(&op[i], v3); + } + LOOP_BLOCKED_END { + op[i] = ip1[0] @OP@ ip2[i]; + } +} + +static void +simd_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) +{ + const npyv_@sfx@ v2 = npyv_setall_@sfx@(ip2[0]); + LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) { + op[i] = ip1[i] @OP@ ip2[0]; + } + LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) { + npyv_@sfx@ v1 = npyv_load_@sfx@(&ip1[i]); + npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2); + npyv_store_@sfx@(&op[i], v3); + } + LOOP_BLOCKED_END { + op[i] = ip1[i] @OP@ ip2[0]; + } +} +/**end repeat1**/ +#endif /* NPY_SIMD@CHK@ */ +/**end repeat**/ +#endif #endif |