author     DumbMice <bangchengyang@hotmail.com>    2020-07-31 13:25:26 +0800
committer  GitHub <noreply@github.com>             2020-07-31 08:25:26 +0300
commit     6f0436d745d1a10b53c0fdbc484fbd942386f9ea (patch)
tree       6b7530bb1d095468996ca45f27c6cdbf5035e726 /numpy
parent     b66f02bed6380f6f88a21adf77ca4ce54e4a9052 (diff)
download   numpy-6f0436d745d1a10b53c0fdbc484fbd942386f9ea.tar.gz
ENH: Add Neon SIMD implementations for add, sub, mul, and div (#16969)
* ENH: Add universal SIMD implementations of add, sub, mul, div for non-X86
Diffstat (limited to 'numpy')
-rw-r--r--  numpy/core/src/umath/simd.inc.src  114
1 file changed, 113 insertions, 1 deletion
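
This patch routes non-x86 builds through NumPy's universal intrinsics layer
(`simd/simd.h`), so one templated source compiles to NEON on ARM and to
whatever SIMD ISA the target provides. As a rough sketch of the pattern the
templates below expand to, using only the `npyv_*` names and macros that
appear in this diff (and assuming NumPy's internal headers are available),
the blocked middle of a vectorized float add looks like:

    /* Hedged sketch, not part of the patch: the blocked vector body of a
     * float add. Real code additionally peels for alignment and handles a
     * scalar tail, as the functions below show. */
    static void add_f32_body(npy_float *op, const npy_float *ip1,
                             const npy_float *ip2, npy_intp n)
    {
        /* NPY_SIMD_WIDTH is the vector width in bytes */
        const npy_intp vstep = (npy_intp)(NPY_SIMD_WIDTH / sizeof(npy_float));
        for (npy_intp i = 0; i + vstep <= n; i += vstep) {
            npyv_f32 a = npyv_load_f32(&ip1[i]);
            npyv_f32 b = npyv_load_f32(&ip2[i]);
            npyv_store_f32(&op[i], npyv_add_f32(a, b));
        }
    }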
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 7866f8143..40bb76914 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -28,6 +28,7 @@
#undef __AVX512F__
#endif
#endif
+#include "simd/simd.h"
#include <assert.h>
#include <stdlib.h>
#include <float.h>
@@ -505,6 +506,7 @@ run_unary_avx512f_log_DOUBLE(char **args, npy_intp const *dimensions, npy_intp c
* #type = npy_float, npy_double, npy_longdouble#
* #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
* #vector = 1, 1, 0#
+ * #VECTOR = NPY_SIMD, NPY_SIMD_F64, 0 #
*/
/**begin repeat1
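
For readers unfamiliar with NumPy's `.src` templating: each
`/**begin repeat ... */` spec lists parallel substitution values, and the
generator emits one copy of the block per column, rewriting every `@name@`
token. The new `#VECTOR#` row gates each type on universal-SIMD support
(`NPY_SIMD` for float, `NPY_SIMD_F64` for double, disabled for long double).
A sketch of one expansion of the prototypes added below, for kind=add and
TYPE=FLOAT:

    /* Hedged sketch of the generated code (not in the patch verbatim) */
    static void
    simd_binary_add_FLOAT(npy_float * op, npy_float * ip1, npy_float * ip2,
                          npy_intp n);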
@@ -553,6 +555,18 @@ static void
sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2,
npy_intp n);
+#elif @VECTOR@
+
+static void
+simd_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2,
+ npy_intp n);
+static void
+simd_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2,
+ npy_intp n);
+static void
+simd_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2,
+ npy_intp n);
+
#endif
static NPY_INLINE int
@@ -584,6 +598,25 @@ run_binary_simd_@kind@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp
sse2_binary_@kind@_@TYPE@(op, ip1, ip2, n);
return 1;
}
+#elif @VECTOR@
+ @type@ * ip1 = (@type@ *)args[0];
+ @type@ * ip2 = (@type@ *)args[1];
+ @type@ * op = (@type@ *)args[2];
+ npy_intp n = dimensions[0];
+ /* argument one scalar */
+ if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), NPY_SIMD_WIDTH)) {
+ simd_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n);
+ return 1;
+ }
+ /* argument two scalar */
+ else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH)) {
+ simd_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n);
+ return 1;
+ }
+ else if (IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH)) {
+ simd_binary_@kind@_@TYPE@(op, ip1, ip2, n);
+ return 1;
+ }
#endif
return 0;
}
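
The dispatcher above returns 1 when one of the blockable SIMD paths handled
the whole loop and 0 when the caller must fall back to generic strided code.
A hedged sketch of how a ufunc inner loop might use it; `FLOAT_add_inner`
here is a hypothetical wrapper for illustration, not part of this patch:

    /* Hypothetical caller, for illustration only */
    static void
    FLOAT_add_inner(char **args, npy_intp const *dimensions,
                    npy_intp const *steps)
    {
        if (run_binary_simd_add_FLOAT(args, dimensions, steps)) {
            return;  /* blockable case was handled with SIMD */
        }
        /* generic strided fallback */
        char *ip1 = args[0], *ip2 = args[1], *op = args[2];
        npy_intp n = dimensions[0];
        for (npy_intp i = 0; i < n; i++,
                 ip1 += steps[0], ip2 += steps[1], op += steps[2]) {
            *(npy_float *)op = *(npy_float *)ip1 + *(npy_float *)ip2;
        }
    }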
@@ -3694,7 +3727,86 @@ sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n)
/**end repeat**/
#undef VECTOR_SIZE_BYTES
-#endif /* NPY_HAVE_SSE2_INTRINSICS */
+#else /* NPY_HAVE_SSE2_INTRINSICS */
+/**begin repeat
+ * #type = npy_float, npy_double#
+ * #TYPE = FLOAT, DOUBLE#
+ * #sfx = f32, f64#
+ * #CHK = , _F64#
+ */
+
+#if NPY_SIMD@CHK@
+/**begin repeat1
+* Arithmetic
+* # kind = add, subtract, multiply, divide#
+* # OP = +, -, *, /#
+* # VOP = add, sub, mul, div#
+*/
+
+static void
+simd_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
+{
+ LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) {
+ op[i] = ip1[i] @OP@ ip2[i];
+ }
+ /* lots of specializations, to squeeze out max performance */
+ if (ip1 == ip2) {
+ LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
+ npyv_@sfx@ a = npyv_load_@sfx@(&ip1[i]);
+ npyv_@sfx@ c = npyv_@VOP@_@sfx@(a, a);
+ npyv_store_@sfx@(&op[i], c);
+ }
+ }
+ else {
+ LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
+ npyv_@sfx@ a = npyv_load_@sfx@(&ip1[i]);
+ npyv_@sfx@ b = npyv_load_@sfx@(&ip2[i]);
+ npyv_@sfx@ c = npyv_@VOP@_@sfx@(a, b);
+ npyv_store_@sfx@(&op[i], c);
+ }
+ }
+ LOOP_BLOCKED_END {
+ op[i] = ip1[i] @OP@ ip2[i];
+ }
+}
+
+static void
+simd_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
+{
+ const npyv_@sfx@ v1 = npyv_setall_@sfx@(ip1[0]);
+ LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) {
+ op[i] = ip1[0] @OP@ ip2[i];
+ }
+ LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
+ npyv_@sfx@ v2 = npyv_load_@sfx@(&ip2[i]);
+ npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2);
+ npyv_store_@sfx@(&op[i], v3);
+ }
+ LOOP_BLOCKED_END {
+ op[i] = ip1[0] @OP@ ip2[i];
+ }
+}
+
+static void
+simd_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
+{
+ const npyv_@sfx@ v2 = npyv_setall_@sfx@(ip2[0]);
+ LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) {
+ op[i] = ip1[i] @OP@ ip2[0];
+ }
+ LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
+ npyv_@sfx@ v1 = npyv_load_@sfx@(&ip1[i]);
+ npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2);
+ npyv_store_@sfx@(&op[i], v3);
+ }
+ LOOP_BLOCKED_END {
+ op[i] = ip1[i] @OP@ ip2[0];
+ }
+}
+/**end repeat1**/
+#endif /* NPY_SIMD@CHK@ */
+/**end repeat**/
+#endif
#endif
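
On AArch64 the `npyv_*` calls above lower to NEON instructions. For a
self-contained picture of the three-phase shape that
`LOOP_BLOCK_ALIGN_VAR`/`LOOP_BLOCKED`/`LOOP_BLOCKED_END` implement
(align-peel, blocked vector body, scalar tail), here is a hedged
illustration written directly against NEON, independent of NumPy's headers:

    #include <arm_neon.h>
    #include <stddef.h>
    #include <stdint.h>

    /* Illustration only, not NumPy code: float add with the same
     * peel/block/tail structure as simd_binary_add_FLOAT. */
    static void neon_add_f32(float *op, const float *ip1,
                             const float *ip2, size_t n)
    {
        size_t i = 0;
        /* peel scalars until the output pointer is 16-byte aligned */
        while (i < n && ((uintptr_t)&op[i] % 16u) != 0) {
            op[i] = ip1[i] + ip2[i];
            i++;
        }
        /* blocked body: 4 float lanes per 128-bit NEON vector */
        for (; i + 4 <= n; i += 4) {
            float32x4_t a = vld1q_f32(&ip1[i]);
            float32x4_t b = vld1q_f32(&ip2[i]);
            vst1q_f32(&op[i], vaddq_f32(a, b));
        }
        /* scalar tail */
        for (; i < n; i++) {
            op[i] = ip1[i] + ip2[i];
        }
    }

The `ip1 == ip2` branch in the patch is one of the "lots of specializations"
its comment mentions: when both inputs alias (e.g. adding an array to
itself), it saves one vector load per iteration.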