Merge branch 'simd-runtime-detection' into 'master'

Support for runtime detection of SIMD. This merge request adds support for runtime SIMD detection. The idea is that you would build gf-complete with full SIMD support, and gf_init will select the appropriate function at runtime based on the capabilities of the target machine. This would eliminate the need to build different versions of the code for different processors (you still need to build for different archs). Ceph, for example, has 3-4 flavors of jerasure on Intel (and does not support PCLMUL optimizations as a result of using too many binaries). Numerous libraries have followed a similar approach, including zlib. When reviewing this merge request I recommend that you look at each of the 5 commits independently. The first 3 commits don't change the existing logic. Instead they add debugging functions and test scripts that facilitate testing of the 4th commit. The 4th commit is where all the new logic goes along with tests. The 5th commit fixes build scripts. I've tested this on x86_64, arm, and aarch64 using QEMU. Numerous tests have been added that help this code and could help with future testing of gf-complete. Also I've compared the functions selected with the old code (prior to runtime SIMD support) with the new code and all functions are identical. Here's a gist with the test results prior to SIMD extensions: https://gist.github.com/bassamtabbara/d9a6dcf0a749b7ab01bc2953a359edec. See merge request !18
author: bassamtabbara <bassam.tabbara@quantum.com> 2016-09-14 20:22:27 +0000
committer: bassamtabbara <bassam.tabbara@quantum.com> 2016-09-14 20:22:27 +0000
commit: a6847973cba329ae079d3bd26341a4ec2906f012 (patch)
tree: cbdb3947d9d86f2fa7d9cee84d3b773e1bb8f2b2 /src/gf_w4.c
parent: 185295f247698f727fd3bb11c4795e1741bb359e (diff)
parent: 0690ba86a81faff99a3383b5907ddc02a317eea0 (diff)
download: gf-complete-a6847973cba329ae079d3bd26341a4ec2906f012.tar.gz
1 files changed, 84 insertions, 88 deletions
diff --git a/src/gf_w4.c b/src/gf_w4.c
index 0e86aa8..3a7b953 100644
--- a/src/gf_w4.c
+++ b/src/gf_w4.c
@@ -12,6 +12,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include "gf_w4.h"
+#include "gf_cpu.h"
 
 #define AB2(ip, am1 ,am2, b, t1, t2) {\
   t1 = (b << 1) & am1;\
@@ -134,6 +135,7 @@ gf_w4_shift_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
 
 /* Ben: This function works, but it is 33% slower than the normal shift mult */
 
+#if defined(INTEL_SSE4_PCLMUL)
 static
 inline
 gf_val_32_t
@@ -141,8 +143,6 @@ gf_w4_clm_multiply (gf_t *gf, gf_val_32_t a4, gf_val_32_t b4)
 {
   gf_val_32_t rv = 0;
 
-#if defined(INTEL_SSE4_PCLMUL)
-
   __m128i         a, b;
   __m128i         result;
   __m128i         prim_poly;
@@ -173,9 +173,9 @@ gf_w4_clm_multiply (gf_t *gf, gf_val_32_t a4, gf_val_32_t b4)
   /* Extracts 32 bit value from result. */
 
   rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
-#endif
   return rv;
 }
+#endif
 
 static
 void
@@ -311,10 +311,10 @@ int gf_w4_log_init(gf_t *gf)
     return 0;
   }
     
-  gf->inverse.w32 = gf_w4_inverse_from_divide;
-  gf->divide.w32 = gf_w4_log_divide;
-  gf->multiply.w32 = gf_w4_log_multiply;
-  gf->multiply_region.w32 = gf_w4_log_multiply_region;
+  SET_FUNCTION(gf,inverse,w32,gf_w4_inverse_from_divide)
+  SET_FUNCTION(gf,divide,w32,gf_w4_log_divide)
+  SET_FUNCTION(gf,multiply,w32,gf_w4_log_multiply)
+  SET_FUNCTION(gf,multiply_region,w32,gf_w4_log_multiply_region)
   return 1;
 }
 
@@ -444,21 +444,22 @@ int gf_w4_single_table_init(gf_t *gf)
     }
   }
 
-  gf->inverse.w32 = NULL;
-  gf->divide.w32 = gf_w4_single_table_divide;
-  gf->multiply.w32 = gf_w4_single_table_multiply;
-  #if defined(INTEL_SSSE3) || defined(ARM_NEON)
-    if(h->region_type & (GF_REGION_NOSIMD | GF_REGION_CAUCHY))
-      gf->multiply_region.w32 = gf_w4_single_table_multiply_region;
-    else
-    #if defined(INTEL_SSSE3)
-      gf->multiply_region.w32 = gf_w4_single_table_sse_multiply_region;
-    #elif defined(ARM_NEON)
+  SET_FUNCTION(gf,inverse,w32,NULL)
+  SET_FUNCTION(gf,divide,w32,gf_w4_single_table_divide)
+  SET_FUNCTION(gf,multiply,w32,gf_w4_single_table_multiply)
+  #if defined(INTEL_SSSE3)
+    if (gf_cpu_supports_intel_ssse3 && !(h->region_type & (GF_REGION_NOSIMD | GF_REGION_CAUCHY))) {
+      SET_FUNCTION(gf,multiply_region,w32,gf_w4_single_table_sse_multiply_region)
+    } else {
+  #elif defined(ARM_NEON)
+    if (gf_cpu_supports_arm_neon && !(h->region_type & (GF_REGION_NOSIMD | GF_REGION_CAUCHY))) {
       gf_w4_neon_single_table_init(gf);
-    #endif
-  #else
-    gf->multiply_region.w32 = gf_w4_single_table_multiply_region;
-    if (h->region_type & GF_REGION_SIMD) return 0;
+    } else {
+  #endif
+      SET_FUNCTION(gf,multiply_region,w32,gf_w4_single_table_multiply_region)
+      if (h->region_type & GF_REGION_SIMD) return 0;
+  #if defined(INTEL_SSSE3) || defined(ARM_NEON)
+    }
   #endif
 
   return 1;
@@ -548,10 +549,10 @@ int gf_w4_double_table_init(gf_t *gf)
     }
   }
 
-  gf->inverse.w32 = NULL;
-  gf->divide.w32 = gf_w4_double_table_divide;
-  gf->multiply.w32 = gf_w4_double_table_multiply;
-  gf->multiply_region.w32 = gf_w4_double_table_multiply_region;
+  SET_FUNCTION(gf,inverse,w32,NULL)
+  SET_FUNCTION(gf,divide,w32,gf_w4_double_table_divide)
+  SET_FUNCTION(gf,multiply,w32,gf_w4_double_table_multiply)
+  SET_FUNCTION(gf,multiply_region,w32,gf_w4_double_table_multiply_region)
   return 1;
 }
 
@@ -682,10 +683,10 @@ int gf_w4_quad_table_init(gf_t *gf)
     }
   }
 
-  gf->inverse.w32 = NULL;
-  gf->divide.w32 = gf_w4_quad_table_divide;
-  gf->multiply.w32 = gf_w4_quad_table_multiply;
-  gf->multiply_region.w32 = gf_w4_quad_table_multiply_region;
+  SET_FUNCTION(gf,inverse,w32,NULL)
+  SET_FUNCTION(gf,divide,w32,gf_w4_quad_table_divide)
+  SET_FUNCTION(gf,multiply,w32,gf_w4_quad_table_multiply)
+  SET_FUNCTION(gf,multiply_region,w32,gf_w4_quad_table_multiply_region)
   return 1;
 }
 static 
@@ -724,10 +725,10 @@ int gf_w4_quad_table_lazy_init(gf_t *gf)
     }
   }
 
-  gf->inverse.w32 = NULL;
-  gf->divide.w32 = gf_w4_quad_table_lazy_divide;
-  gf->multiply.w32 = gf_w4_quad_table_lazy_multiply;
-  gf->multiply_region.w32 = gf_w4_quad_table_multiply_region;
+  SET_FUNCTION(gf,inverse,w32,NULL)
+  SET_FUNCTION(gf,divide,w32,gf_w4_quad_table_lazy_divide)
+  SET_FUNCTION(gf,multiply,w32,gf_w4_quad_table_lazy_multiply)
+  SET_FUNCTION(gf,multiply_region,w32,gf_w4_quad_table_multiply_region)
   return 1;
 }
 
@@ -736,16 +737,13 @@ int gf_w4_table_init(gf_t *gf)
 {
   int rt;
   gf_internal_t *h;
-  int simd = 0;
-
-#if defined(INTEL_SSSE3) || defined(ARM_NEON)
-  simd = 1;
-#endif
 
   h = (gf_internal_t *) gf->scratch;
   rt = (h->region_type);
 
-  if (h->mult_type == GF_MULT_DEFAULT && !simd) rt |= GF_REGION_DOUBLE_TABLE;
+  if (h->mult_type == GF_MULT_DEFAULT && 
+    !(gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon)) 
+      rt |= GF_REGION_DOUBLE_TABLE;
 
   if (rt & GF_REGION_DOUBLE_TABLE) {
     return gf_w4_double_table_init(gf);
@@ -929,11 +927,11 @@ gf_w4_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v
 #endif
 
 /*
+#ifdef INTEL_SSE2
 static
 void 
 gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
 {
-#ifdef INTEL_SSE2
   uint8_t *d8, *s8, tb;
   __m128i pp, m1, m2, t1, t2, va, vb;
   struct gf_bytwo_data *btd;
@@ -990,8 +988,8 @@ gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v
     }
   }
   gf_do_final_region_alignment(&rd);
-#endif
 }
+#endif
 */
 
 #ifdef INTEL_SSE2
@@ -1865,28 +1863,30 @@ int gf_w4_bytwo_init(gf_t *gf)
   }
 
   if (h->mult_type == GF_MULT_BYTWO_p) {
-    gf->multiply.w32 = gf_w4_bytwo_p_multiply;
+    SET_FUNCTION(gf,multiply,w32,gf_w4_bytwo_p_multiply)
     #ifdef INTEL_SSE2
-      if (h->region_type & GF_REGION_NOSIMD)
-        gf->multiply_region.w32 = gf_w4_bytwo_p_nosse_multiply_region;
-      else
-        gf->multiply_region.w32 = gf_w4_bytwo_p_sse_multiply_region;
-    #else
-      gf->multiply_region.w32 = gf_w4_bytwo_p_nosse_multiply_region;
-      if (h->region_type & GF_REGION_SIMD)
-        return 0;
+      if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
+        SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_p_sse_multiply_region)
+      } else {
+    #endif
+        SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_p_nosse_multiply_region)
+        if (h->region_type & GF_REGION_SIMD)
+          return 0;
+    #ifdef INTEL_SSE2
+      }
     #endif
   } else {
-    gf->multiply.w32 = gf_w4_bytwo_b_multiply;
+    SET_FUNCTION(gf,multiply,w32,gf_w4_bytwo_b_multiply)
     #ifdef INTEL_SSE2
-      if (h->region_type & GF_REGION_NOSIMD)
-        gf->multiply_region.w32 = gf_w4_bytwo_b_nosse_multiply_region;
-      else
-        gf->multiply_region.w32 = gf_w4_bytwo_b_sse_multiply_region;
-    #else
-      gf->multiply_region.w32 = gf_w4_bytwo_b_nosse_multiply_region;
-      if (h->region_type & GF_REGION_SIMD)
-        return 0;
+      if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
+        SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_b_sse_multiply_region)
+      } else {
+    #endif
+        SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_b_nosse_multiply_region)
+        if (h->region_type & GF_REGION_SIMD)
+          return 0;
+    #ifdef INTEL_SSE2
+      }
     #endif
   }
   return 1;
@@ -1897,10 +1897,14 @@ static
 int gf_w4_cfm_init(gf_t *gf)
 {
 #if defined(INTEL_SSE4_PCLMUL)
-  gf->multiply.w32 = gf_w4_clm_multiply;
-  return 1;
+  if (gf_cpu_supports_intel_pclmul) {
+    SET_FUNCTION(gf,multiply,w32,gf_w4_clm_multiply)
+    return 1;
+  }
 #elif defined(ARM_NEON)
-  return gf_w4_neon_cfm_init(gf);
+  if (gf_cpu_supports_arm_neon) {
+    return gf_w4_neon_cfm_init(gf);
+  }
 #endif
   return 0;
 }
@@ -1908,7 +1912,7 @@ int gf_w4_cfm_init(gf_t *gf)
 static 
 int gf_w4_shift_init(gf_t *gf)
 {
-  gf->multiply.w32 = gf_w4_shift_multiply;
+  SET_FUNCTION(gf,multiply,w32,gf_w4_shift_multiply)
   return 1;
 }
 
@@ -1917,15 +1921,6 @@ int gf_w4_shift_init(gf_t *gf)
 
 int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
 {
-  int issse3 = 0, isneon = 0;
-
-#ifdef INTEL_SSSE3
-  issse3 = 1;
-#endif
-#ifdef ARM_NEON
-  isneon = 1;
-#endif
-
   switch(mult_type)
   {
     case GF_MULT_BYTWO_p:
@@ -1938,7 +1933,8 @@ int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1
         return sizeof(gf_internal_t) + sizeof(struct gf_single_table_data) + 64;
       }
 
-      if (mult_type == GF_MULT_DEFAULT && !(issse3 || isneon))
+      if (mult_type == GF_MULT_DEFAULT && 
+          !(gf_cpu_supports_arm_neon || gf_cpu_supports_intel_ssse3))
           region_type = GF_REGION_DOUBLE_TABLE;
 
       if (region_type & GF_REGION_DOUBLE_TABLE) {
@@ -1977,11 +1973,11 @@ gf_w4_init (gf_t *gf)
   h = (gf_internal_t *) gf->scratch;
   if (h->prim_poly == 0) h->prim_poly = 0x13;
   h->prim_poly |= 0x10;
-  gf->multiply.w32 = NULL;
-  gf->divide.w32 = NULL;
-  gf->inverse.w32 = NULL;
-  gf->multiply_region.w32 = NULL;
-  gf->extract_word.w32 = gf_w4_extract_word;
+  SET_FUNCTION(gf,multiply,w32,NULL)
+  SET_FUNCTION(gf,divide,w32,NULL)
+  SET_FUNCTION(gf,inverse,w32,NULL)
+  SET_FUNCTION(gf,multiply_region,w32,NULL)
+  SET_FUNCTION(gf,extract_word,w32,gf_w4_extract_word)
 
   switch(h->mult_type) {
     case GF_MULT_CARRY_FREE: if (gf_w4_cfm_init(gf) == 0) return 0; break;
@@ -1995,27 +1991,27 @@ gf_w4_init (gf_t *gf)
   }
 
   if (h->divide_type == GF_DIVIDE_EUCLID) {
-    gf->divide.w32 = gf_w4_divide_from_inverse;
-    gf->inverse.w32 = gf_w4_euclid;
+    SET_FUNCTION(gf,divide,w32,gf_w4_divide_from_inverse)
+    SET_FUNCTION(gf,inverse,w32,gf_w4_euclid)
   } else if (h->divide_type == GF_DIVIDE_MATRIX) {
-    gf->divide.w32 = gf_w4_divide_from_inverse;
-    gf->inverse.w32 = gf_w4_matrix;
+    SET_FUNCTION(gf,divide,w32,gf_w4_divide_from_inverse)
+    SET_FUNCTION(gf,inverse,w32,gf_w4_matrix)
   }
 
   if (gf->divide.w32 == NULL) {
-    gf->divide.w32 = gf_w4_divide_from_inverse;
-    if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w4_euclid;
+    SET_FUNCTION(gf,divide,w32,gf_w4_divide_from_inverse)
+    if (gf->inverse.w32 == NULL) SET_FUNCTION(gf,inverse,w32,gf_w4_euclid)
   }
 
-  if (gf->inverse.w32 == NULL)  gf->inverse.w32 = gf_w4_inverse_from_divide;
+  if (gf->inverse.w32 == NULL)  SET_FUNCTION(gf,inverse,w32,gf_w4_inverse_from_divide)
 
   if (h->region_type == GF_REGION_CAUCHY) {
-    gf->multiply_region.w32 = gf_wgen_cauchy_region;
-    gf->extract_word.w32 = gf_wgen_extract_word;
+    SET_FUNCTION(gf,multiply_region,w32,gf_wgen_cauchy_region)
+    SET_FUNCTION(gf,extract_word,w32,gf_wgen_extract_word)
   }
 
   if (gf->multiply_region.w32 == NULL) {
-    gf->multiply_region.w32 = gf_w4_multiply_region_from_single;
+    SET_FUNCTION(gf,multiply_region,w32,gf_w4_multiply_region_from_single)
   }
 
   return 1;
author	bassamtabbara <bassam.tabbara@quantum.com>	2016-09-14 20:22:27 +0000
committer	bassamtabbara <bassam.tabbara@quantum.com>	2016-09-14 20:22:27 +0000
commit	a6847973cba329ae079d3bd26341a4ec2906f012 (patch)
tree	cbdb3947d9d86f2fa7d9cee84d3b773e1bb8f2b2 /src/gf_w4.c
parent	185295f247698f727fd3bb11c4795e1741bb359e (diff)
parent	0690ba86a81faff99a3383b5907ddc02a317eea0 (diff)
download	gf-complete-a6847973cba329ae079d3bd26341a4ec2906f012.tar.gz