author    | Ganesh Kathiresan <ganesh3597@gmail.com> | 2021-05-11 21:38:51 +0530
committer | Sayed Adel <seiko@imavr.com>             | 2021-05-20 23:19:50 +0200
commit    | 7c163672933d42e76dd643065acbe36a7274dc00 (patch)
tree      | 74e4b40c40a7d0ff2e42095397acff886f98dda3 /numpy
parent    | b6b32674d634b6dfe9d92212e8a6ced0f1e14319 (diff)
download  | numpy-7c163672933d42e76dd643065acbe36a7274dc00.tar.gz
SIMD: Separate signed and unsigned loops
Diffstat (limited to 'numpy')
-rw-r--r-- | numpy/core/src/umath/loops_arithmetic.dispatch.c.src | 181
1 file changed, 105 insertions, 76 deletions
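
The diff below rewrites the template blocks in loops_arithmetic.dispatch.c.src so that signed and unsigned lane types get their own /**begin repeat**/ sections instead of sharing one block gated on a #signed# flag. For readers unfamiliar with NumPy's .c.src templating, here is a minimal illustrative sketch of how such a block expands at build time; the function name and parameter values are invented for the example and do not appear in the patch:

/**begin repeat
 * #sfx   = s32,       u32#
 * #stype = npy_int32, npy_uint32#
 */
/* The .c.src preprocessor emits one copy of the body per column above,
 * substituting @sfx@ and @stype@, so this produces example_negate_s32
 * and example_negate_u32. */
static @stype@ example_negate_@sfx@(@stype@ x)
{
    return (@stype@)(0 - x);
}
/**end repeat**/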
diff --git a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
index 5e54a45de..a52bb36b7 100644
--- a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
+++ b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
@@ -36,41 +36,35 @@
  ********************************************************************************/
 #if NPY_SIMD
 /**begin repeat
- * #sfx    = u8, u16, u32, u64, s8, s16, s32, s64#
- * #len    = 8,  16,  32,  64,  8,  16,  32,  64#
- * #signed = 0*4, 1*4#
+ * Signed types
+ * #sfx = s8, s16, s32, s64#
+ * #len = 8, 16, 32, 64#
  */
-#if @signed@
 static NPY_INLINE void
 simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
 {
-    npyv_@sfx@ a, nsign_d, nsign_a, diff_sign, to_ninf, trunc, floor, neg, vzero;
-    npyv_b@len@ greater_min, noverflow;
-    npy_bool raise;
-    npy_uint64 tobits;
-
     npyv_lanetype_@sfx@ *src   = (npyv_lanetype_@sfx@ *) args[0];
     npyv_lanetype_@sfx@ scalar = *(npyv_lanetype_@sfx@ *) args[1];
     npyv_lanetype_@sfx@ *dst   = (npyv_lanetype_@sfx@ *) args[2];
     const int vstep            = npyv_nlanes_@sfx@;
     const npyv_@sfx@x3 divisor = npyv_divisor_@sfx@(scalar);
-    if (NPY_UNLIKELY(-1 == scalar)) {
-        noverflow = npyv_cvt_b@len@_@sfx@(npyv_setall_@sfx@(-1));
-        vzero = npyv_zero_@sfx@();
+    if (scalar == (npyv_lanetype_@sfx@)-1) {
+        npyv_b@len@ noverflow = npyv_cvt_b@len@_@sfx@(npyv_setall_@sfx@(-1));
+        npyv_@sfx@ vzero = npyv_zero_@sfx@();
         for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
-            a = npyv_load_@sfx@(src);
-            greater_min = npyv_cmpgt_@sfx@(a, npyv_setall_@sfx@(NPY_MIN_INT@len@));
-            noverflow = npyv_and_b@len@(noverflow, greater_min);
-            neg = npyv_ifsub_@sfx@(greater_min, vzero, a, vzero);
+            npyv_@sfx@ a = npyv_load_@sfx@(src);
+            npyv_b@len@ greater_min = npyv_cmpgt_@sfx@(a, npyv_setall_@sfx@(NPY_MIN_INT@len@));
+            noverflow = npyv_and_b@len@(noverflow, greater_min);
+            npyv_@sfx@ neg = npyv_ifsub_@sfx@(greater_min, vzero, a, vzero);
             npyv_store_@sfx@(dst, neg);
         }
-        tobits = npyv_tobits_b@len@(noverflow);
+        npy_uint64 tobits = npyv_tobits_b@len@(noverflow);
 #if npyv_nlanes_@sfx@ == 64
-        raise = (~tobits) != 0;
+        int raise = (~tobits) != 0;
 #else
-        raise = tobits != (1ULL << vstep)-1;
+        int raise = tobits != (1ULL << vstep)-1;
 #endif
         for (; len > 0; --len, ++src, ++dst) {
@@ -87,36 +81,37 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
         }
     }
     else {
         for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
-            nsign_d = npyv_setall_@sfx@(scalar < 0);
-            a = npyv_load_@sfx@(src);
-            nsign_a = npyv_cvt_@sfx@_b@len@(npyv_cmplt_@sfx@(a, nsign_d));
-            nsign_a = npyv_and_@sfx@(nsign_a, npyv_setall_@sfx@(1));
-            diff_sign = npyv_sub_@sfx@(nsign_a, nsign_d);
-            to_ninf = npyv_xor_@sfx@(nsign_a, nsign_d);
-            trunc = npyv_divc_@sfx@(npyv_add_@sfx@(a, diff_sign), divisor);
-            floor = npyv_sub_@sfx@(trunc, to_ninf);
+            npyv_@sfx@ nsign_d = npyv_setall_@sfx@(scalar < 0);
+            npyv_@sfx@ a = npyv_load_@sfx@(src);
+            npyv_@sfx@ nsign_a = npyv_cvt_@sfx@_b@len@(npyv_cmplt_@sfx@(a, nsign_d));
+            nsign_a = npyv_and_@sfx@(nsign_a, npyv_setall_@sfx@(1));
+            npyv_@sfx@ diff_sign = npyv_sub_@sfx@(nsign_a, nsign_d);
+            npyv_@sfx@ to_ninf = npyv_xor_@sfx@(nsign_a, nsign_d);
+            npyv_@sfx@ trunc = npyv_divc_@sfx@(npyv_add_@sfx@(a, diff_sign), divisor);
+            npyv_@sfx@ floor = npyv_sub_@sfx@(trunc, to_ninf);
             npyv_store_@sfx@(dst, floor);
         }
         for (; len > 0; --len, ++src, ++dst) {
             const npyv_lanetype_@sfx@ a = *src;
-            if (scalar == 0 || (a == (npyv_lanetype_@sfx@)NPY_MIN_INT@len@ && scalar == (npyv_lanetype_@sfx@)-1)) {
-                npy_set_floatstatus_divbyzero();
-                *dst = 0;
-            } else {
-                *dst = a / scalar;
-                /* Negative quotients needs to be rounded down */
-                if (((a > 0) != (scalar > 0)) && (*dst * scalar != a)) {
-                    *dst = *dst - 1;
-                }
+            *dst = a / scalar;
+            /* Negative quotients needs to be rounded down */
+            if (((a > 0) != (scalar > 0)) && (*dst * scalar != a)) {
+                *dst = *dst - 1;
             }
         }
     }
     npyv_cleanup();
 }
-#else
+/**end repeat**/
+
+/**begin repeat
+ * Unsigned types
+ * #sfx = u8, u16, u32, u64#
+ * #len = 8, 16, 32, 64#
+ */
 static NPY_INLINE void
 simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
 {
@@ -134,17 +129,11 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
     for (; len > 0; --len, ++src, ++dst) {
         const npyv_lanetype_@sfx@ a = *src;
-        if (scalar == 0 || (a == (npyv_lanetype_@sfx@)NPY_MIN_INT@len@ && scalar == (npyv_lanetype_@sfx@)-1)) {
-            npy_set_floatstatus_divbyzero();
-            *dst = 0;
-        } else {
-            *dst = a / scalar;
-        }
+        *dst = a / scalar;
     }
     npyv_cleanup();
 }
-#endif
 /**end repeat**/
 #endif
@@ -153,31 +142,78 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
  ********************************************************************************/
 /**begin repeat
- * Unsigned types
+ * Signed types
  * #type = byte, short, int, long, longlong#
  * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG#
  */
-
+#undef TO_SIMD_SFX
+#if 0
 /**begin repeat1
- * #signed = 1, 0#
+ * #len = 8, 16, 32, 64#
+ */
+#elif NPY_BITSOF_@TYPE@ == @len@
+    #define TO_SIMD_SFX(X) X##_s@len@
+/**end repeat1**/
+#endif
+
+#if NPY_BITSOF_@TYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
+    #undef TO_SIMD_SFX
+#endif
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    if (IS_BINARY_REDUCE) {
+        BINARY_REDUCE_LOOP(npy_@type@) {
+            const npy_@type@ d = *(npy_@type@ *)ip2;
+            if (NPY_UNLIKELY(d == 0 || (io1 == (npy_@type@)NPY_MIN_@TYPE@ && d == (npy_@type@)-1))) {
+                npy_set_floatstatus_divbyzero();
+                io1 = 0;
+            } else {
+                io1 /= d;
+            }
+        }
+        *((npy_@type@ *)iop1) = io1;
+    }
+#if NPY_SIMD && defined(TO_SIMD_SFX)
+    // for contiguous block of memory, divisor is a scalar and not 0
+    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_@type@), NPY_SIMD_WIDTH) &&
+             (*(npy_@type@ *)args[1]) != 0) {
+        TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]);
+    }
+#endif
+    else {
+        BINARY_LOOP {
+            const npy_@type@ in1 = *(npy_@type@ *)ip1;
+            const npy_@type@ in2 = *(npy_@type@ *)ip2;
+            if (NPY_UNLIKELY(in2 == 0 || (in1 == (npy_@type@)NPY_MIN_@TYPE@ && in2 == (npy_@type@)-1))) {
+                npy_set_floatstatus_divbyzero();
+                *((npy_@type@ *)op1) = 0;
+            } else{
+                *((npy_@type@ *)op1) = in1 / in2;
+                /* Negative quotients needs to be rounded down */
+                if (((in1 > 0) != (in2 > 0)) && (*((npy_@type@ *)op1) * in2 != in1)) {
+                    *((npy_@type@ *)op1) = *((npy_@type@ *)op1) - 1;
+                }
+            }
+        }
+    }
+}
+/**end repeat**/
+
+/**begin repeat
+ * Unsigned types
+ * #type = byte, short, int, long, longlong#
+ * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG#
  */
 #undef TO_SIMD_SFX
-#undef SIMD_TYPE
-#undef SIMD_DIVIDE
 #if 0
-/**begin repeat2
+/**begin repeat1
  * #len = 8, 16, 32, 64#
  */
-#elif NPY_BITSOF_@TYPE@ == @len@ && @signed@
-    #define TO_SIMD_SFX(X) X##_s@len@
-    #define SIMD_TYPE npy_@type@
-    #define SIMD_DIVIDE @TYPE@_divide
 #elif NPY_BITSOF_@TYPE@ == @len@
     #define TO_SIMD_SFX(X) X##_u@len@
-    #define SIMD_TYPE npy_u@type@
-    #define SIMD_DIVIDE U@TYPE@_divide
-/**end repeat2**/
+/**end repeat1**/
 #endif
 /*
  * For 64-bit division on Armv7, Aarch64, and IBM/Power, NPYV fall-backs to the scalar division
@@ -190,46 +226,39 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
 #if NPY_BITSOF_@TYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
     #undef TO_SIMD_SFX
 #endif
-NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SIMD_DIVIDE)
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(U@TYPE@_divide)
 (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
     if (IS_BINARY_REDUCE) {
-        BINARY_REDUCE_LOOP(SIMD_TYPE) {
-            const SIMD_TYPE d = *(SIMD_TYPE *)ip2;
-            if (NPY_UNLIKELY(d == 0 || (io1 == (SIMD_TYPE)NPY_MIN_@TYPE@ && d == (SIMD_TYPE)-1))) {
+        BINARY_REDUCE_LOOP(npy_u@type@) {
+            const npy_u@type@ d = *(npy_u@type@ *)ip2;
+            if (NPY_UNLIKELY(d == 0 || (io1 == (npy_u@type@)NPY_MIN_@TYPE@ && d == (npy_u@type@)-1))) {
                 npy_set_floatstatus_divbyzero();
                 io1 = 0;
             } else {
                 io1 /= d;
             }
         }
-        *((SIMD_TYPE *)iop1) = io1;
+        *((npy_u@type@ *)iop1) = io1;
     }
 #if NPY_SIMD && defined(TO_SIMD_SFX)
     // for contiguous block of memory, divisor is a scalar and not 0
-    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(SIMD_TYPE), NPY_SIMD_WIDTH) &&
-             (*(SIMD_TYPE *)args[1]) != 0) {
+    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_u@type@), NPY_SIMD_WIDTH) &&
+             (*(npy_u@type@ *)args[1]) != 0) {
         TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]);
     }
 #endif
     else {
         BINARY_LOOP {
-            const SIMD_TYPE in1 = *(SIMD_TYPE *)ip1;
-            const SIMD_TYPE in2 = *(SIMD_TYPE *)ip2;
-            if (NPY_UNLIKELY(in2 == 0 || (in1 == (SIMD_TYPE)NPY_MIN_@TYPE@ && in2 == (SIMD_TYPE)-1))) {
+            const npy_u@type@ in1 = *(npy_u@type@ *)ip1;
+            const npy_u@type@ in2 = *(npy_u@type@ *)ip2;
+            if (NPY_UNLIKELY(in2 == 0 || (in1 == (npy_u@type@)NPY_MIN_@TYPE@ && in2 == (npy_u@type@)-1))) {
                 npy_set_floatstatus_divbyzero();
-                *((SIMD_TYPE *)op1) = 0;
+                *((npy_u@type@ *)op1) = 0;
             } else{
-                *((SIMD_TYPE *)op1) = in1 / in2;
-#if @signed@
-                /* Negative quotients needs to be rounded down */
-                if (((in1 > 0) != (in2 > 0)) && (*((SIMD_TYPE *)op1) * in2 != in1)) {
-                    *((SIMD_TYPE *)op1) = *((SIMD_TYPE *)op1) - 1;
-                }
-#endif
+                *((npy_u@type@ *)op1) = in1 / in2;
             }
         }
     }
 }
-/**end repeat1**/
 /**end repeat**/
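
Background on the scalar fallback paths in the patch: C integer division truncates toward zero, while NumPy's floor_divide rounds toward negative infinity, so the loops decrement the quotient when the operands have opposite signs and the division is inexact; division by zero and the NPY_MIN / -1 overflow case set the divide-by-zero floating-point status and produce 0. A minimal standalone sketch of the same rule for one fixed width (the helper name and the use of plain int are illustrative, not from the patch):

#include <limits.h>
#include <stdio.h>

/* Floor division for signed 32-bit ints, mirroring the scalar tail loop:
 * truncate, then round down when the signs differ and the result is inexact.
 * The zero-divisor and INT_MIN / -1 cases return 0, as the dispatch loops do
 * (NumPy additionally raises the divide-by-zero status there). */
static int floor_div_i32(int a, int b)
{
    if (b == 0 || (a == INT_MIN && b == -1)) {
        return 0;
    }
    int q = a / b;                 /* C truncates toward zero */
    if (((a > 0) != (b > 0)) && (q * b != a)) {
        q -= 1;                    /* inexact negative quotient: round down */
    }
    return q;
}

int main(void)
{
    printf("%d\n", floor_div_i32(-7, 2));  /* prints -4, not -3 */
    printf("%d\n", floor_div_i32(7, -2));  /* prints -4 */
    printf("%d\n", floor_div_i32(-8, 2));  /* prints -4 (exact, no adjustment) */
    return 0;
}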