Diffstat (limited to 'numpy/core')
-rw-r--r-- | numpy/core/src/_simd/_simd.dispatch.c.src            |  22 |
-rw-r--r-- | numpy/core/src/common/simd/avx2/operators.h          |   7 |
-rw-r--r-- | numpy/core/src/common/simd/avx512/conversion.h       |   4 |
-rw-r--r-- | numpy/core/src/common/simd/avx512/operators.h        |  15 |
-rw-r--r-- | numpy/core/src/common/simd/neon/operators.h          |   5 |
-rw-r--r-- | numpy/core/src/common/simd/sse/operators.h           |   7 |
-rw-r--r-- | numpy/core/src/common/simd/vsx/operators.h           |   5 |
-rw-r--r-- | numpy/core/src/umath/loops_comparison.dispatch.c.src | 201 |
8 files changed, 150 insertions, 116 deletions
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src
index f8a0a3196..0f3e4fc8f 100644
--- a/numpy/core/src/_simd/_simd.dispatch.c.src
+++ b/numpy/core/src/_simd/_simd.dispatch.c.src
@@ -31,6 +31,7 @@
  * #intdiv_sup= 1, 1, 1, 1, 1, 1, 1, 1, 0, 0#
  * #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
  * #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0#
+ * #bitw8b_sup= 1, 0, 0, 0, 0, 0, 0, 0, 0, 0#
  */
 #if @simd_sup@
 /***************************
@@ -332,6 +333,13 @@ SIMD_IMPL_INTRIN_1(not_@sfx@, v@sfx@, v@sfx@)
 SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@bsfx@, v@sfx@, v@sfx@)
 /**end repeat1**/
 
+#if @bitw8b_sup@
+SIMD_IMPL_INTRIN_2(andc_@sfx@, v@sfx@, v@sfx@, v@sfx@)
+SIMD_IMPL_INTRIN_2(andc_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@)
+SIMD_IMPL_INTRIN_2(orc_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@)
+SIMD_IMPL_INTRIN_2(xnor_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@)
+#endif
+
 /***************************
  * Conversion
  ***************************/
@@ -462,9 +470,6 @@ SIMD_IMPL_INTRIN_2(or_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@)
 SIMD_IMPL_INTRIN_2(xor_@bsfx@, v@bsfx@, v@bsfx@, v@bsfx@)
 SIMD_IMPL_INTRIN_1(not_@bsfx@, v@bsfx@, v@bsfx@)
 /**end repeat**/
-SIMD_IMPL_INTRIN_2(andc_b8, vb8, vb8, vb8)
-SIMD_IMPL_INTRIN_2(orc_b8, vb8, vb8, vb8)
-SIMD_IMPL_INTRIN_2(xnor_b8, vb8, vb8, vb8)
 /***************************
  * Conversions
  ***************************/
@@ -503,6 +508,7 @@ static PyMethodDef simd__intrinsics_methods[] = {
  * #intdiv_sup= 1, 1, 1, 1, 1, 1, 1, 1, 0, 0#
  * #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
  * #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0#
+ * #bitw8b_sup= 1, 0, 0, 0, 0, 0, 0, 0, 0, 0#
  */
 #if @simd_sup@
 
@@ -584,6 +590,13 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@)
 SIMD_INTRIN_DEF(@intrin@_@sfx@)
 /**end repeat1**/
 
+#if @bitw8b_sup@
+SIMD_INTRIN_DEF(andc_@sfx@)
+SIMD_INTRIN_DEF(andc_@bsfx@)
+SIMD_INTRIN_DEF(orc_@bsfx@)
+SIMD_INTRIN_DEF(xnor_@bsfx@)
+#endif
+
 /***************************
  * Conversion
  ***************************/
@@ -713,9 +726,6 @@ SIMD_INTRIN_DEF(or_@bsfx@)
 SIMD_INTRIN_DEF(xor_@bsfx@)
 SIMD_INTRIN_DEF(not_@bsfx@)
 /**end repeat**/
-SIMD_INTRIN_DEF(andc_b8)
-SIMD_INTRIN_DEF(orc_b8)
-SIMD_INTRIN_DEF(xnor_b8)
 /***************************
  * Conversions
  ***************************/
diff --git a/numpy/core/src/common/simd/avx2/operators.h b/numpy/core/src/common/simd/avx2/operators.h
index 0e77fc6be..99ef76dcb 100644
--- a/numpy/core/src/common/simd/avx2/operators.h
+++ b/numpy/core/src/common/simd/avx2/operators.h
@@ -115,9 +115,10 @@ NPY_FINLINE __m256i npyv_shr_s64(__m256i a, int c)
 #define npyv_not_b64 npyv_not_u8
 
 // ANDC, ORC and XNOR
-#define npyv_andc_b8(A, B) _mm256_andnot_si256(A, B)
-#define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(A), B)
-#define npyv_xnor_b8(A, B) npyv_not_b8(npyv_xor_b8(A, B))
+#define npyv_andc_u8(A, B) _mm256_andnot_si256(B, A)
+#define npyv_andc_b8(A, B) _mm256_andnot_si256(B, A)
+#define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(B), A)
+#define npyv_xnor_b8 _mm256_cmpeq_epi8
 
 /***************************
  * Comparison
diff --git a/numpy/core/src/common/simd/avx512/conversion.h b/numpy/core/src/common/simd/avx512/conversion.h
index a2f56b2ae..474aee446 100644
--- a/numpy/core/src/common/simd/avx512/conversion.h
+++ b/numpy/core/src/common/simd/avx512/conversion.h
@@ -104,8 +104,8 @@ NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b) {
 NPY_FINLINE npyv_b8 npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d) {
 #ifdef NPY_HAVE_AVX512BW
-    __mmask32 ab = (__mmask64)_mm512_kunpackw((__mmask32)b, (__mmask32)a);
-    __mmask32 cd = (__mmask64)_mm512_kunpackw((__mmask32)d, (__mmask32)c);
+    __mmask32 ab = _mm512_kunpackw((__mmask32)b, (__mmask32)a);
+    __mmask32 cd = _mm512_kunpackw((__mmask32)d, (__mmask32)c);
     return npyv_pack_b8_b16(ab, cd);
 #else
     const __m512i idx = _mm512_setr_epi32(
diff --git a/numpy/core/src/common/simd/avx512/operators.h b/numpy/core/src/common/simd/avx512/operators.h
index 8c98b72dd..b856b345a 100644
--- a/numpy/core/src/common/simd/avx512/operators.h
+++ b/numpy/core/src/common/simd/avx512/operators.h
@@ -140,6 +140,9 @@
     #define npyv_not_f64(A) _mm512_castsi512_pd(npyv_not_u64(_mm512_castpd_si512(A)))
 #endif
 
+// ANDC
+#define npyv_andc_u8(A, B) _mm512_andnot_si512(B, A)
+
 /***************************
  * Logical (boolean)
  ***************************/
@@ -152,8 +155,8 @@
     #define npyv_xor_b16 _kxor_mask32
     #define npyv_not_b8 _knot_mask64
     #define npyv_not_b16 _knot_mask32
-    #define npyv_andc_b8 _kandn_mask64
-    #define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(A), B)
+    #define npyv_andc_b8(A, B) _kandn_mask64(B, A)
+    #define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(B), A)
     #define npyv_xnor_b8 _kxnor_mask64
 #elif defined(NPY_HAVE_AVX512BW)
     NPY_FINLINE npyv_b8 npyv_and_b8(npyv_b8 a, npyv_b8 b)
@@ -173,9 +176,9 @@
     NPY_FINLINE npyv_b16 npyv_not_b16(npyv_b16 a)
     { return ~a; }
     NPY_FINLINE npyv_b8 npyv_andc_b8(npyv_b8 a, npyv_b8 b)
-    { return (~a) & b; }
+    { return a & (~b); }
     NPY_FINLINE npyv_b8 npyv_orc_b8(npyv_b8 a, npyv_b8 b)
-    { return (~a) | b; }
+    { return a | (~b); }
     NPY_FINLINE npyv_b8 npyv_xnor_b8(npyv_b8 a, npyv_b8 b)
     { return ~(a ^ b); }
 #else
@@ -187,8 +190,8 @@
     #define npyv_xor_b16 _mm512_xor_si512
     #define npyv_not_b8 npyv_not_u8
     #define npyv_not_b16 npyv_not_u8
-    #define npyv_andc_b8 _mm512_andnot_si512
-    #define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(A), B)
+    #define npyv_andc_b8(A, B) _mm512_andnot_si512(B, A)
+    #define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(B), A)
     #define npyv_xnor_b8(A, B) npyv_not_b8(npyv_xor_b8(A, B))
 #endif
diff --git a/numpy/core/src/common/simd/neon/operators.h b/numpy/core/src/common/simd/neon/operators.h
index 6c155fc67..a08fa5390 100644
--- a/numpy/core/src/common/simd/neon/operators.h
+++ b/numpy/core/src/common/simd/neon/operators.h
@@ -117,8 +117,9 @@
 #define npyv_not_b64 npyv_not_u64
 
 // ANDC, ORC and XNOR
-#define npyv_andc_b8(A, B) vbicq_u8(B, A)
-#define npyv_orc_b8(A, B) vornq_u8(B, A)
+#define npyv_andc_u8 vbicq_u8
+#define npyv_andc_b8 vbicq_u8
+#define npyv_orc_b8 vornq_u8
 #define npyv_xnor_b8 vceqq_u8
 
 /***************************
diff --git a/numpy/core/src/common/simd/sse/operators.h b/numpy/core/src/common/simd/sse/operators.h
index 51bdca356..86dbcfea5 100644
--- a/numpy/core/src/common/simd/sse/operators.h
+++ b/numpy/core/src/common/simd/sse/operators.h
@@ -116,9 +116,10 @@ NPY_FINLINE __m128i npyv_shr_s64(__m128i a, int c)
 #define npyv_not_b64 npyv_not_u8
 
 // ANDC, ORC and XNOR
-#define npyv_andc_b8(A, B) _mm_andnot_si128(A, B)
-#define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(A), B)
-#define npyv_xnor_b8(A, B) npyv_not_b8(npyv_xor_b8(A, B))
+#define npyv_andc_u8(A, B) _mm_andnot_si128(B, A)
+#define npyv_andc_b8(A, B) _mm_andnot_si128(B, A)
+#define npyv_orc_b8(A, B) npyv_or_b8(npyv_not_b8(B), A)
+#define npyv_xnor_b8 _mm_cmpeq_epi8
 
 /***************************
  * Comparison
diff --git a/numpy/core/src/common/simd/vsx/operators.h b/numpy/core/src/common/simd/vsx/operators.h
index fc29ba920..b01d85321 100644
--- a/numpy/core/src/common/simd/vsx/operators.h
+++ b/numpy/core/src/common/simd/vsx/operators.h
@@ -134,8 +134,9 @@ NPY_FINLINE npyv_f64 npyv_not_f64(npyv_f64 a)
 { return vec_nor(a, a); }
 
 // ANDC, ORC and XNOR
-#define npyv_andc_b8(A, B) vec_andc(B, A)
-#define npyv_orc_b8(A, B) vec_orc(B, A)
+#define npyv_andc_u8 vec_andc
+#define npyv_andc_b8 vec_andc
+#define npyv_orc_b8 vec_orc
 #define npyv_xnor_b8 vec_eqv
 
 /***************************
diff --git a/numpy/core/src/umath/loops_comparison.dispatch.c.src b/numpy/core/src/umath/loops_comparison.dispatch.c.src
index 07bbf0354..01d58fbf9 100644
--- a/numpy/core/src/umath/loops_comparison.dispatch.c.src
+++ b/numpy/core/src/umath/loops_comparison.dispatch.c.src
@@ -1,6 +1,6 @@
 /*@targets
  ** $maxopt baseline
- ** sse2 sse41 avx2 avx512f avx512_skx
+ ** sse2 sse42 avx2 avx512f avx512_skx
  ** vsx2 vsx3
  ** neon
  **/
@@ -15,18 +15,23 @@
 // Provides the various *_LOOP macros
 #include "fast_loop_macros.h"
 
+/********************************************************************************
+ ** Defining the SIMD kernels
+ ********************************************************************************/
 /**begin repeat
  * #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
  * #len = 8, 8, 16, 16, 32, 32, 64, 64, 32, 64#
+ * #signed = 0, 1, 0, 1, 0, 1, 0, 1, 0, 0#
  * #VECTOR = NPY_SIMD*9, NPY_SIMD_F64#
  */
 /**begin repeat1
- * #kind = equal, not_equal, less, less_equal, greater, greater_equal#
- * #OP = ==, !=, <, <=, >, >=#
- * #VOP = cmpeq, cmpneq, cmplt, cmple, cmpgt, cmpge#
+ * #kind = equal, not_equal, less, less_equal#
+ * #eq = 1, 0, 0, 0#
+ * #neq = 0, 1, 0, 0#
+ * #OP = ==, !=, <, <=#
+ * #VOP = cmpeq, cmpneq, cmplt, cmple#
  */
-
-#if @VECTOR@
+#if @VECTOR@ && !((@eq@ || @neq@) && @signed@)
 static void simd_binary_@kind@_@sfx@(char **args, npy_intp len)
 {
     npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0];
@@ -205,10 +210,11 @@ static void simd_binary_scalar2_@kind@_@sfx@(char **args, npy_intp len)
 /**end repeat**/
 
 /**begin repeat
- * #kind = equal, not_equal, less, less_equal, greater, greater_equal#
- * #OP = ==, !=, <, <=, >, >=#
- * #VOP = xnor, xor, andc, orc, andc, orc#
- * #rev = 0, 0, 0, 0, 1, 1#
+ * #kind = equal, not_equal, less, less_equal#
+ * #eq = 1, 0, 0, 0#
+ * #neq = 0, 1, 0, 0#
+ * #OP = ==, !=, <, <=#
+ * #VOP = xnor, xor, andc, orc#
  */
 
 #if NPY_SIMD
@@ -224,14 +230,10 @@ static void simd_binary_@kind@_b8(char **args, npy_intp len)
     for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, dst += vstep) {
         // Whatever element in src != 0x0 is converted to 0xFF
-        npyv_b8 a = npyv_cmpneq_u8(npyv_load_u8(src1), vzero);
-        npyv_b8 b = npyv_cmpneq_u8(npyv_load_u8(src2), vzero);
-#if !@rev@
+        npyv_b8 a = npyv_cmpeq_u8(npyv_load_u8(src1), vzero);
+        npyv_b8 b = npyv_cmpeq_u8(npyv_load_u8(src2), vzero);
         npyv_b8 c = npyv_@VOP@_b8(a, b);
-#else
-        npyv_b8 c = npyv_@VOP@_b8(b, a);
-#endif
-        npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask));
+        npyv_store_u8(dst, npyv_andc_u8(npyv_cvt_u8_b8(c), truemask));
     }
 
     for (; len > 0; --len, ++src1, ++src2, ++dst) {
@@ -248,18 +250,14 @@ static void simd_binary_scalar1_@kind@_b8(char **args, npy_intp len)
     npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2];
     const npyv_u8 vzero = npyv_setall_u8(0x0);
    const npyv_u8 vscalar = npyv_setall_u8(scalar);
-    const npyv_b8 a = npyv_cmpneq_u8(vscalar, vzero);
+    const npyv_b8 a = npyv_cmpeq_u8(vscalar, vzero);
     const npyv_u8 truemask = npyv_setall_u8(0x1);
     const int vstep = npyv_nlanes_u8;
 
     for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
-        npyv_b8 b = npyv_cmpneq_u8(npyv_load_u8(src), vzero);
-#if !@rev@
+        npyv_b8 b = npyv_cmpeq_u8(npyv_load_u8(src), vzero);
         npyv_b8 c = npyv_@VOP@_b8(a, b);
-#else
-        npyv_b8 c = npyv_@VOP@_b8(b, a);
-#endif
-        npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask));
+        npyv_store_u8(dst, npyv_andc_u8(npyv_cvt_u8_b8(c), truemask));
     }
 
     for (; len > 0; --len, ++src, ++dst) {
@@ -275,18 +273,14 @@ static void simd_binary_scalar2_@kind@_b8(char **args, npy_intp len)
     npyv_lanetype_u8 *dst = (npyv_lanetype_u8 *) args[2];
     const npyv_u8 vzero = npyv_setall_u8(0x0);
     const npyv_u8 vscalar = npyv_setall_u8(scalar);
-    const npyv_b8 b = npyv_cmpneq_u8(vscalar, vzero);
+    const npyv_b8 b = npyv_cmpeq_u8(vscalar, vzero);
     const npyv_u8 truemask = npyv_setall_u8(0x1);
     const int vstep = npyv_nlanes_u8;
 
     for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
-        npyv_b8 a = npyv_cmpneq_u8(npyv_load_u8(src), vzero);
-#if !@rev@
+        npyv_b8 a = npyv_cmpeq_u8(npyv_load_u8(src), vzero);
         npyv_b8 c = npyv_@VOP@_b8(a, b);
-#else
-        npyv_b8 c = npyv_@VOP@_b8(b, a);
-#endif
-        npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask));
+        npyv_store_u8(dst, npyv_andc_u8(npyv_cvt_u8_b8(c), truemask));
    }
 
     for (; len > 0; --len, ++src, ++dst) {
@@ -297,73 +291,73 @@ static void simd_binary_scalar2_@kind@_b8(char **args, npy_intp len)
 #endif
 /**end repeat**/
-
 /**begin repeat
  * #type = npy_ubyte*2, npy_byte, npy_ushort, npy_short, npy_uint, npy_int, npy_ulonglong, npy_longlong, npy_float, npy_double#
  * #sfx = b8, u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
+ * #bool = 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0#
+ * #fp = 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
+ * #signed = 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0#
  * #VECTOR = NPY_SIMD*10, NPY_SIMD_F64#
  */
 /**begin repeat1
- * #kind = equal, not_equal, less, less_equal, greater, greater_equal#
+ * #kind = equal, not_equal, less, less_equal#
+ * #eq = 1, 0, 0, 0#
+ * #neq = 0, 1, 0, 0#
+ * #OP = ==, !=, <, <=#
  */
-static NPY_INLINE int
+#if !((@eq@ || @neq@) && @signed@)
+static NPY_INLINE void
 run_binary_simd_@kind@_@sfx@(char **args, npy_intp const *dimensions, npy_intp const *steps)
 {
 #if @VECTOR@
     /* argument one scalar */
     if (IS_BLOCKABLE_BINARY_SCALAR1_BOOL(sizeof(@type@), NPY_SIMD_WIDTH)) {
         simd_binary_scalar1_@kind@_@sfx@(args, dimensions[0]);
-        return 1;
+        return;
     }
     /* argument two scalar */
     else if (IS_BLOCKABLE_BINARY_SCALAR2_BOOL(sizeof(@type@), NPY_SIMD_WIDTH)) {
         simd_binary_scalar2_@kind@_@sfx@(args, dimensions[0]);
-        return 1;
+        return;
     }
     else if (IS_BLOCKABLE_BINARY_BOOL(sizeof(@type@), NPY_SIMD_WIDTH)) {
         simd_binary_@kind@_@sfx@(args, dimensions[0]);
-        return 1;
+        return;
     }
 #endif
-    return 0;
-}
-/**end repeat1**/
-/**end repeat**/
-
-/*
- *****************************************************************************
- ** BOOLEAN LOOPS **
- *****************************************************************************
- */
-
-/**begin repeat
- * #kind = equal, not_equal, less, less_equal, greater, greater_equal#
- * #OP = ==, !=, <, <=, >, >=#
- */
-NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_@kind@)
-(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    if (!run_binary_simd_@kind@_b8(args, dimensions, steps)) {
-        BINARY_LOOP {
-            npy_bool in1 = *((npy_bool *)ip1) != 0;
-            npy_bool in2 = *((npy_bool *)ip2) != 0;
-            *((npy_bool *)op1)= in1 @OP@ in2;
-        }
+    BINARY_LOOP {
+#if @bool@
+        npy_bool in1 = *((npy_bool *)ip1) != 0;
+        npy_bool in2 = *((npy_bool *)ip2) != 0;
+#else
+        const @type@ in1 = *(@type@ *)ip1;
+        const @type@ in2 = *(@type@ *)ip2;
+#endif
+        *((npy_bool *)op1) = in1 @OP@ in2;
     }
 }
+#endif
+/**end repeat1**/
 /**end repeat**/
 
+/********************************************************************************
+ ** Defining ufunc inner functions
+ ********************************************************************************/
+
 /*
- *****************************************************************************
- ** INTEGER LOOPS
- *****************************************************************************
+ * In order to reduce the size of the binary generated from this source, the
+ * following rules are applied: 1) each data type implements its function
+ * 'greater' as a call to the function 'less' but with the arguments swapped,
+ * the same applies to the function 'greater_equal', which is implemented
+ * with a call to the function 'less_equal', and 2) for the integer datatypes
+ * of the same size (eg 8-bit), a single kernel of the functions 'equal' and
+ * 'not_equal' is used to implement both signed and unsigned types.
  */
 /**begin repeat
  * Signed and Unsigned types
- * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
- *         npy_byte, npy_short, npy_int, npy_long, npy_longlong#
  * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG,
  *         BYTE, SHORT, INT, LONG, LONGLONG#
  * #STYPE = BYTE, SHORT, INT, LONG, LONGLONG,
@@ -371,11 +365,13 @@
  * #signed = 0, 0, 0, 0, 0, 1, 1, 1, 1, 1#
  */
 #undef TO_SIMD_SFX
+#undef TO_SIMD_UTYPE
 #if 0
 /**begin repeat1
  * #len = 8, 16, 32, 64#
  */
 #elif NPY_BITSOF_@STYPE@ == @len@
+    #define TO_SIMD_UTYPE(X) X##_u@len@
     #if @signed@
         #define TO_SIMD_SFX(X) X##_s@len@
     #else
@@ -385,50 +381,71 @@
 #endif
 
 /**begin repeat1
- * #kind = equal, not_equal, less, less_equal, greater, greater_equal#
- * #OP = ==, !=, <, <=, >, >=#
+ * #kind = greater, greater_equal#
+ * #kind_to = less, less_equal#
 */
 NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
 (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
-    if (!TO_SIMD_SFX(run_binary_simd_@kind@)(args, dimensions, steps)) {
-        BINARY_LOOP {
-            const @type@ in1 = *(@type@ *)ip1;
-            const @type@ in2 = *(@type@ *)ip2;
-            *((npy_bool *)op1) = in1 @OP@ in2;
-        }
-    }
+    char *nargs[3] = {args[1], args[0], args[2]};
+    npy_intp nsteps[3] = {steps[1], steps[0], steps[2]};
+    TO_SIMD_SFX(run_binary_simd_@kind_to@)(nargs, dimensions, nsteps);
 }
 /**end repeat1**/
-/**end repeat**/
 
-/*
- *****************************************************************************
- ** FLOAT LOOPS **
- *****************************************************************************
+/**begin repeat1
+ * #kind = less, less_equal#
  */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    TO_SIMD_SFX(run_binary_simd_@kind@)(args, dimensions, steps);
+}
+/**end repeat1**/
+
+/**begin repeat1
+ * #kind = equal, not_equal#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    TO_SIMD_UTYPE(run_binary_simd_@kind@)(args, dimensions, steps);
+}
+/**end repeat1**/
+/**end repeat**/
 
 /**begin repeat
- * Float types
- * #type = npy_float, npy_double#
- * #TYPE = FLOAT, DOUBLE#
- * #sfx = f32, f64#
+ * Boolean & Float types
+ * #TYPE = BOOL, FLOAT, DOUBLE#
+ * #sfx = b8, f32, f64#
+ * #fp = 0, 1, 1#
  */
 /**begin repeat1
- * #kind = equal, not_equal, less, less_equal, greater, greater_equal#
- * #OP = ==, !=, <, <=, >, >=#
+ * #kind = greater, greater_equal#
+ * #kind_to = less, less_equal#
 */
 NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
 (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
-    if (!run_binary_simd_@kind@_@sfx@(args, dimensions, steps)) {
-        BINARY_LOOP {
-            const @type@ in1 = *(@type@ *)ip1;
-            const @type@ in2 = *(@type@ *)ip2;
-            *((npy_bool *)op1) = in1 @OP@ in2;
-        }
-    }
+    char *nargs[3] = {args[1], args[0], args[2]};
+    npy_intp nsteps[3] = {steps[1], steps[0], steps[2]};
+    run_binary_simd_@kind_to@_@sfx@(nargs, dimensions, nsteps);
+#if @fp@
     npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+/**end repeat1**/
+
+/**begin repeat1
+ * #kind = equal, not_equal, less, less_equal#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    run_binary_simd_@kind@_@sfx@(args, dimensions, steps);
+#if @fp@
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
 }
 /**end repeat1**/
 /**end repeat**/
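Note on the operand order used above: after this change the complemented operand of `npyv_andc_*` and `npyv_orc_*` is the second argument, as the AVX512BW fallbacks spell out (`a & (~b)` and `a | (~b)`), whereas the old `npyv_andc_b8` complemented its first argument. The following is only a minimal scalar sketch of those boolean identities, not part of NumPy's SIMD API; the helper names are hypothetical.

```c
#include <assert.h>
#include <stdint.h>

/* Scalar models of the identities the SIMD macros map to after this diff. */
static uint8_t andc_u8(uint8_t a, uint8_t b) { return a & (uint8_t)~b; }  /* a AND NOT b */
static uint8_t orc_u8 (uint8_t a, uint8_t b) { return a | (uint8_t)~b; }  /* a OR  NOT b */
static uint8_t xnor_u8(uint8_t a, uint8_t b) { return (uint8_t)~(a ^ b); }/* NOT (a XOR b) */

int main(void)
{
    /* The second argument is the one that gets complemented. */
    assert(andc_u8(0xFF, 0x0F) == 0xF0);
    assert(orc_u8 (0x00, 0xF0) == 0x0F);
    /* xnor is symmetric: identical inputs give all-ones. */
    assert(xnor_u8(0xAA, 0xAA) == 0xFF);
    assert(xnor_u8(0xFF, 0x00) == 0x00);
    return 0;
}
```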
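The comment added to loops_comparison.dispatch.c.src describes the size-reduction rule: `greater` and `greater_equal` are not compiled as separate kernels; the inner function swaps the two input pointers and their strides and calls the `less` / `less_equal` runner. A stand-alone sketch of that argument-swapping trick is shown below, using plain C types and hypothetical names rather than NumPy's dispatched loops.

```c
#include <stddef.h>
#include <stdio.h>

/* Hypothetical strided kernel: out[i] = (a[i] < b[i]) over int data. */
static void less_loop(char **args, const size_t *dims, const ptrdiff_t *steps)
{
    char *ip1 = args[0], *ip2 = args[1], *op = args[2];
    for (size_t i = 0; i < dims[0]; ++i,
             ip1 += steps[0], ip2 += steps[1], op += steps[2]) {
        *(unsigned char *)op = *(const int *)ip1 < *(const int *)ip2;
    }
}

/* "greater" reuses the same kernel with the inputs (and their strides) swapped. */
static void greater_loop(char **args, const size_t *dims, const ptrdiff_t *steps)
{
    char *nargs[3]      = {args[1], args[0], args[2]};
    ptrdiff_t nsteps[3] = {steps[1], steps[0], steps[2]};
    less_loop(nargs, dims, nsteps);
}

int main(void)
{
    int a[3] = {1, 5, 3}, b[3] = {2, 4, 3};
    unsigned char out[3];
    char *args[3] = {(char *)a, (char *)b, (char *)out};
    size_t dims[1] = {3};
    ptrdiff_t steps[3] = {sizeof(int), sizeof(int), 1};
    greater_loop(args, dims, steps);
    printf("%d %d %d\n", out[0], out[1], out[2]); /* prints: 0 1 0 */
    return 0;
}
```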