summaryrefslogtreecommitdiff
path: root/src/gf_w4.c
diff options
context:
space:
mode:
authorbassamtabbara <bassam.tabbara@quantum.com>2016-09-14 20:22:27 +0000
committerbassamtabbara <bassam.tabbara@quantum.com>2016-09-14 20:22:27 +0000
commita6847973cba329ae079d3bd26341a4ec2906f012 (patch)
treecbdb3947d9d86f2fa7d9cee84d3b773e1bb8f2b2 /src/gf_w4.c
parent185295f247698f727fd3bb11c4795e1741bb359e (diff)
parent0690ba86a81faff99a3383b5907ddc02a317eea0 (diff)
downloadgf-complete-a6847973cba329ae079d3bd26341a4ec2906f012.tar.gz
Merge branch 'simd-runtime-detection' into 'master'
Support for runtime detection of SIMD. This merge request adds support for runtime SIMD detection. The idea is that you would build gf-complete with full SIMD support, and gf_init will select the appropriate function at runtime based on the capabilities of the target machine. This would eliminate the need to build different versions of the code for different processors (you still need to build for different archs). Ceph, for example, has 3-4 flavors of jerasure on Intel (and does not support PCLMUL optimizations as a result of using too many binaries). Numerous libraries have followed a similar approach, including zlib. When reviewing this merge request I recommend that you look at each of the 5 commits independently. The first 3 commits don't change the existing logic. Instead they add debugging functions and test scripts that facilitate testing of the 4th commit. The 4th commit is where all the new logic goes along with tests. The 5th commit fixes build scripts. I've tested this on x86_64, arm, and aarch64 using QEMU. Numerous tests have been added that help this code and could help with future testing of gf-complete. Also I've compared the functions selected with the old code (prior to runtime SIMD support) with the new code and all functions are identical. Here's a gist with the test results prior to SIMD extensions: https://gist.github.com/bassamtabbara/d9a6dcf0a749b7ab01bc2953a359edec. See merge request !18
Diffstat (limited to 'src/gf_w4.c')
-rw-r--r--src/gf_w4.c172
1 files changed, 84 insertions, 88 deletions
diff --git a/src/gf_w4.c b/src/gf_w4.c
index 0e86aa8..3a7b953 100644
--- a/src/gf_w4.c
+++ b/src/gf_w4.c
@@ -12,6 +12,7 @@
#include <stdio.h>
#include <stdlib.h>
#include "gf_w4.h"
+#include "gf_cpu.h"
#define AB2(ip, am1 ,am2, b, t1, t2) {\
t1 = (b << 1) & am1;\
@@ -134,6 +135,7 @@ gf_w4_shift_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
/* Ben: This function works, but it is 33% slower than the normal shift mult */
+#if defined(INTEL_SSE4_PCLMUL)
static
inline
gf_val_32_t
@@ -141,8 +143,6 @@ gf_w4_clm_multiply (gf_t *gf, gf_val_32_t a4, gf_val_32_t b4)
{
gf_val_32_t rv = 0;
-#if defined(INTEL_SSE4_PCLMUL)
-
__m128i a, b;
__m128i result;
__m128i prim_poly;
@@ -173,9 +173,9 @@ gf_w4_clm_multiply (gf_t *gf, gf_val_32_t a4, gf_val_32_t b4)
/* Extracts 32 bit value from result. */
rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
-#endif
return rv;
}
+#endif
static
void
@@ -311,10 +311,10 @@ int gf_w4_log_init(gf_t *gf)
return 0;
}
- gf->inverse.w32 = gf_w4_inverse_from_divide;
- gf->divide.w32 = gf_w4_log_divide;
- gf->multiply.w32 = gf_w4_log_multiply;
- gf->multiply_region.w32 = gf_w4_log_multiply_region;
+ SET_FUNCTION(gf,inverse,w32,gf_w4_inverse_from_divide)
+ SET_FUNCTION(gf,divide,w32,gf_w4_log_divide)
+ SET_FUNCTION(gf,multiply,w32,gf_w4_log_multiply)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w4_log_multiply_region)
return 1;
}
@@ -444,21 +444,22 @@ int gf_w4_single_table_init(gf_t *gf)
}
}
- gf->inverse.w32 = NULL;
- gf->divide.w32 = gf_w4_single_table_divide;
- gf->multiply.w32 = gf_w4_single_table_multiply;
- #if defined(INTEL_SSSE3) || defined(ARM_NEON)
- if(h->region_type & (GF_REGION_NOSIMD | GF_REGION_CAUCHY))
- gf->multiply_region.w32 = gf_w4_single_table_multiply_region;
- else
- #if defined(INTEL_SSSE3)
- gf->multiply_region.w32 = gf_w4_single_table_sse_multiply_region;
- #elif defined(ARM_NEON)
+ SET_FUNCTION(gf,inverse,w32,NULL)
+ SET_FUNCTION(gf,divide,w32,gf_w4_single_table_divide)
+ SET_FUNCTION(gf,multiply,w32,gf_w4_single_table_multiply)
+ #if defined(INTEL_SSSE3)
+ if (gf_cpu_supports_intel_ssse3 && !(h->region_type & (GF_REGION_NOSIMD | GF_REGION_CAUCHY))) {
+ SET_FUNCTION(gf,multiply_region,w32,gf_w4_single_table_sse_multiply_region)
+ } else {
+ #elif defined(ARM_NEON)
+ if (gf_cpu_supports_arm_neon && !(h->region_type & (GF_REGION_NOSIMD | GF_REGION_CAUCHY))) {
gf_w4_neon_single_table_init(gf);
- #endif
- #else
- gf->multiply_region.w32 = gf_w4_single_table_multiply_region;
- if (h->region_type & GF_REGION_SIMD) return 0;
+ } else {
+ #endif
+ SET_FUNCTION(gf,multiply_region,w32,gf_w4_single_table_multiply_region)
+ if (h->region_type & GF_REGION_SIMD) return 0;
+ #if defined(INTEL_SSSE3) || defined(ARM_NEON)
+ }
#endif
return 1;
@@ -548,10 +549,10 @@ int gf_w4_double_table_init(gf_t *gf)
}
}
- gf->inverse.w32 = NULL;
- gf->divide.w32 = gf_w4_double_table_divide;
- gf->multiply.w32 = gf_w4_double_table_multiply;
- gf->multiply_region.w32 = gf_w4_double_table_multiply_region;
+ SET_FUNCTION(gf,inverse,w32,NULL)
+ SET_FUNCTION(gf,divide,w32,gf_w4_double_table_divide)
+ SET_FUNCTION(gf,multiply,w32,gf_w4_double_table_multiply)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w4_double_table_multiply_region)
return 1;
}
@@ -682,10 +683,10 @@ int gf_w4_quad_table_init(gf_t *gf)
}
}
- gf->inverse.w32 = NULL;
- gf->divide.w32 = gf_w4_quad_table_divide;
- gf->multiply.w32 = gf_w4_quad_table_multiply;
- gf->multiply_region.w32 = gf_w4_quad_table_multiply_region;
+ SET_FUNCTION(gf,inverse,w32,NULL)
+ SET_FUNCTION(gf,divide,w32,gf_w4_quad_table_divide)
+ SET_FUNCTION(gf,multiply,w32,gf_w4_quad_table_multiply)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w4_quad_table_multiply_region)
return 1;
}
static
@@ -724,10 +725,10 @@ int gf_w4_quad_table_lazy_init(gf_t *gf)
}
}
- gf->inverse.w32 = NULL;
- gf->divide.w32 = gf_w4_quad_table_lazy_divide;
- gf->multiply.w32 = gf_w4_quad_table_lazy_multiply;
- gf->multiply_region.w32 = gf_w4_quad_table_multiply_region;
+ SET_FUNCTION(gf,inverse,w32,NULL)
+ SET_FUNCTION(gf,divide,w32,gf_w4_quad_table_lazy_divide)
+ SET_FUNCTION(gf,multiply,w32,gf_w4_quad_table_lazy_multiply)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w4_quad_table_multiply_region)
return 1;
}
@@ -736,16 +737,13 @@ int gf_w4_table_init(gf_t *gf)
{
int rt;
gf_internal_t *h;
- int simd = 0;
-
-#if defined(INTEL_SSSE3) || defined(ARM_NEON)
- simd = 1;
-#endif
h = (gf_internal_t *) gf->scratch;
rt = (h->region_type);
- if (h->mult_type == GF_MULT_DEFAULT && !simd) rt |= GF_REGION_DOUBLE_TABLE;
+ if (h->mult_type == GF_MULT_DEFAULT &&
+ !(gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon))
+ rt |= GF_REGION_DOUBLE_TABLE;
if (rt & GF_REGION_DOUBLE_TABLE) {
return gf_w4_double_table_init(gf);
@@ -929,11 +927,11 @@ gf_w4_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v
#endif
/*
+#ifdef INTEL_SSE2
static
void
gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
-#ifdef INTEL_SSE2
uint8_t *d8, *s8, tb;
__m128i pp, m1, m2, t1, t2, va, vb;
struct gf_bytwo_data *btd;
@@ -990,8 +988,8 @@ gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v
}
}
gf_do_final_region_alignment(&rd);
-#endif
}
+#endif
*/
#ifdef INTEL_SSE2
@@ -1865,28 +1863,30 @@ int gf_w4_bytwo_init(gf_t *gf)
}
if (h->mult_type == GF_MULT_BYTWO_p) {
- gf->multiply.w32 = gf_w4_bytwo_p_multiply;
+ SET_FUNCTION(gf,multiply,w32,gf_w4_bytwo_p_multiply)
#ifdef INTEL_SSE2
- if (h->region_type & GF_REGION_NOSIMD)
- gf->multiply_region.w32 = gf_w4_bytwo_p_nosse_multiply_region;
- else
- gf->multiply_region.w32 = gf_w4_bytwo_p_sse_multiply_region;
- #else
- gf->multiply_region.w32 = gf_w4_bytwo_p_nosse_multiply_region;
- if (h->region_type & GF_REGION_SIMD)
- return 0;
+ if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
+ SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_p_sse_multiply_region)
+ } else {
+ #endif
+ SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_p_nosse_multiply_region)
+ if (h->region_type & GF_REGION_SIMD)
+ return 0;
+ #ifdef INTEL_SSE2
+ }
#endif
} else {
- gf->multiply.w32 = gf_w4_bytwo_b_multiply;
+ SET_FUNCTION(gf,multiply,w32,gf_w4_bytwo_b_multiply)
#ifdef INTEL_SSE2
- if (h->region_type & GF_REGION_NOSIMD)
- gf->multiply_region.w32 = gf_w4_bytwo_b_nosse_multiply_region;
- else
- gf->multiply_region.w32 = gf_w4_bytwo_b_sse_multiply_region;
- #else
- gf->multiply_region.w32 = gf_w4_bytwo_b_nosse_multiply_region;
- if (h->region_type & GF_REGION_SIMD)
- return 0;
+ if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
+ SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_b_sse_multiply_region)
+ } else {
+ #endif
+ SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_b_nosse_multiply_region)
+ if (h->region_type & GF_REGION_SIMD)
+ return 0;
+ #ifdef INTEL_SSE2
+ }
#endif
}
return 1;
@@ -1897,10 +1897,14 @@ static
int gf_w4_cfm_init(gf_t *gf)
{
#if defined(INTEL_SSE4_PCLMUL)
- gf->multiply.w32 = gf_w4_clm_multiply;
- return 1;
+ if (gf_cpu_supports_intel_pclmul) {
+ SET_FUNCTION(gf,multiply,w32,gf_w4_clm_multiply)
+ return 1;
+ }
#elif defined(ARM_NEON)
- return gf_w4_neon_cfm_init(gf);
+ if (gf_cpu_supports_arm_neon) {
+ return gf_w4_neon_cfm_init(gf);
+ }
#endif
return 0;
}
@@ -1908,7 +1912,7 @@ int gf_w4_cfm_init(gf_t *gf)
static
int gf_w4_shift_init(gf_t *gf)
{
- gf->multiply.w32 = gf_w4_shift_multiply;
+ SET_FUNCTION(gf,multiply,w32,gf_w4_shift_multiply)
return 1;
}
@@ -1917,15 +1921,6 @@ int gf_w4_shift_init(gf_t *gf)
int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
{
- int issse3 = 0, isneon = 0;
-
-#ifdef INTEL_SSSE3
- issse3 = 1;
-#endif
-#ifdef ARM_NEON
- isneon = 1;
-#endif
-
switch(mult_type)
{
case GF_MULT_BYTWO_p:
@@ -1938,7 +1933,8 @@ int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1
return sizeof(gf_internal_t) + sizeof(struct gf_single_table_data) + 64;
}
- if (mult_type == GF_MULT_DEFAULT && !(issse3 || isneon))
+ if (mult_type == GF_MULT_DEFAULT &&
+ !(gf_cpu_supports_arm_neon || gf_cpu_supports_intel_ssse3))
region_type = GF_REGION_DOUBLE_TABLE;
if (region_type & GF_REGION_DOUBLE_TABLE) {
@@ -1977,11 +1973,11 @@ gf_w4_init (gf_t *gf)
h = (gf_internal_t *) gf->scratch;
if (h->prim_poly == 0) h->prim_poly = 0x13;
h->prim_poly |= 0x10;
- gf->multiply.w32 = NULL;
- gf->divide.w32 = NULL;
- gf->inverse.w32 = NULL;
- gf->multiply_region.w32 = NULL;
- gf->extract_word.w32 = gf_w4_extract_word;
+ SET_FUNCTION(gf,multiply,w32,NULL)
+ SET_FUNCTION(gf,divide,w32,NULL)
+ SET_FUNCTION(gf,inverse,w32,NULL)
+ SET_FUNCTION(gf,multiply_region,w32,NULL)
+ SET_FUNCTION(gf,extract_word,w32,gf_w4_extract_word)
switch(h->mult_type) {
case GF_MULT_CARRY_FREE: if (gf_w4_cfm_init(gf) == 0) return 0; break;
@@ -1995,27 +1991,27 @@ gf_w4_init (gf_t *gf)
}
if (h->divide_type == GF_DIVIDE_EUCLID) {
- gf->divide.w32 = gf_w4_divide_from_inverse;
- gf->inverse.w32 = gf_w4_euclid;
+ SET_FUNCTION(gf,divide,w32,gf_w4_divide_from_inverse)
+ SET_FUNCTION(gf,inverse,w32,gf_w4_euclid)
} else if (h->divide_type == GF_DIVIDE_MATRIX) {
- gf->divide.w32 = gf_w4_divide_from_inverse;
- gf->inverse.w32 = gf_w4_matrix;
+ SET_FUNCTION(gf,divide,w32,gf_w4_divide_from_inverse)
+ SET_FUNCTION(gf,inverse,w32,gf_w4_matrix)
}
if (gf->divide.w32 == NULL) {
- gf->divide.w32 = gf_w4_divide_from_inverse;
- if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w4_euclid;
+ SET_FUNCTION(gf,divide,w32,gf_w4_divide_from_inverse)
+ if (gf->inverse.w32 == NULL) SET_FUNCTION(gf,inverse,w32,gf_w4_euclid)
}
- if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w4_inverse_from_divide;
+ if (gf->inverse.w32 == NULL) SET_FUNCTION(gf,inverse,w32,gf_w4_inverse_from_divide)
if (h->region_type == GF_REGION_CAUCHY) {
- gf->multiply_region.w32 = gf_wgen_cauchy_region;
- gf->extract_word.w32 = gf_wgen_extract_word;
+ SET_FUNCTION(gf,multiply_region,w32,gf_wgen_cauchy_region)
+ SET_FUNCTION(gf,extract_word,w32,gf_wgen_extract_word)
}
if (gf->multiply_region.w32 == NULL) {
- gf->multiply_region.w32 = gf_w4_multiply_region_from_single;
+ SET_FUNCTION(gf,multiply_region,w32,gf_w4_multiply_region_from_single)
}
return 1;