diff options
-rw-r--r-- | examples/gf_example_2.c | 4 | ||||
-rw-r--r-- | examples/gf_example_5.c | 1 | ||||
-rw-r--r-- | examples/gf_example_6.c | 1 | ||||
-rw-r--r-- | examples/gf_example_7.c | 1 | ||||
-rw-r--r-- | src/gf.c | 43 | ||||
-rw-r--r-- | src/gf_general.c | 7 | ||||
-rw-r--r-- | src/gf_general.h | 61 | ||||
-rw-r--r-- | src/gf_int.h | 200 | ||||
-rw-r--r-- | src/gf_method.c | 3 | ||||
-rw-r--r-- | src/gf_rand.h | 22 | ||||
-rw-r--r-- | src/gf_w128.c | 61 | ||||
-rw-r--r-- | src/gf_w16.c | 76 | ||||
-rw-r--r-- | src/gf_w32.c | 89 | ||||
-rw-r--r-- | src/gf_w4.c | 203 | ||||
-rw-r--r-- | src/gf_w64.c | 90 | ||||
-rw-r--r-- | src/gf_w8.c | 100 | ||||
-rw-r--r-- | src/gf_wgen.c | 13 | ||||
-rw-r--r-- | test/gf_unit.c | 20 | ||||
-rw-r--r-- | tools/gf_add.c | 2 | ||||
-rw-r--r-- | tools/gf_inline_time.c | 5 | ||||
-rw-r--r-- | tools/gf_methods.c | 2 | ||||
-rw-r--r-- | tools/gf_poly.c | 6 | ||||
-rw-r--r-- | tools/gf_time.c | 15 |
23 files changed, 303 insertions, 722 deletions
diff --git a/examples/gf_example_2.c b/examples/gf_example_2.c index e98774a..576d9a5 100644 --- a/examples/gf_example_2.c +++ b/examples/gf_example_2.c @@ -28,8 +28,8 @@ int main(int argc, char **argv) { uint32_t a, b, c; uint8_t *r1, *r2; - uint16_t *r16; - uint32_t *r32; + uint16_t *r16 = NULL; + uint32_t *r32 = NULL; int w, i; gf_t gf; diff --git a/examples/gf_example_5.c b/examples/gf_example_5.c index 8e7dd4e..da6e9ca 100644 --- a/examples/gf_example_5.c +++ b/examples/gf_example_5.c @@ -74,4 +74,5 @@ int main(int argc, char **argv) gf.extract_word.w32(&gf, a, 30*2, i+15), gf.extract_word.w32(&gf, b, 30*2, i+15)); } + return 0; } diff --git a/examples/gf_example_6.c b/examples/gf_example_6.c index 54cdf83..800a35f 100644 --- a/examples/gf_example_6.c +++ b/examples/gf_example_6.c @@ -80,4 +80,5 @@ int main(int argc, char **argv) gf.extract_word.w32(&gf, a, 30*4, i+15), gf.extract_word.w32(&gf, b, 30*4, i+15)); } + return 0; } diff --git a/examples/gf_example_7.c b/examples/gf_example_7.c index cd5c44b..ee07d53 100644 --- a/examples/gf_example_7.c +++ b/examples/gf_example_7.c @@ -71,4 +71,5 @@ int main(int argc, char **argv) gf.extract_word.w32(&gf, a, 3, i), gf.extract_word.w32(&gf, b, 3, i)); } + return 0; } @@ -179,13 +179,11 @@ uint64_t gf_composite_get_default_poly(gf_t *base) int gf_error_check(int w, int mult_type, int region_type, int divide_type, int arg1, int arg2, uint64_t poly, gf_t *base) { - int sse4 = 0; int sse3 = 0; int sse2 = 0; int pclmul = 0; int rdouble, rquad, rlazy, rsse, rnosse, raltmap, rcauchy, tmp; - uint64_t pp; - gf_internal_t *sub, *subsub, *subsubsub; + gf_internal_t *sub; rdouble = (region_type & GF_REGION_DOUBLE_TABLE); rquad = (region_type & GF_REGION_QUAD_TABLE); @@ -214,10 +212,6 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type, sse3 = 1; #endif -#ifdef INTEL_SSE4 - sse4 = 1; -#endif - #ifdef INTEL_SSE4_PCLMUL pclmul = 1; #endif @@ -488,7 +482,7 @@ int gf_init_hard(gf_t *gf, int w, int mult_type, h->arg2 = arg2; h->base_gf = base_gf; h->private = (void *) gf->scratch; - h->private += (sizeof(gf_internal_t)); + h->private = (char*)h->private + (sizeof(gf_internal_t)); gf->extract_word.w32 = NULL; switch(w) { @@ -525,7 +519,7 @@ void gf_alignment_error(char *s, int a) static void gf_invert_binary_matrix(uint32_t *mat, uint32_t *inv, int rows) { - int cols, i, j, k; + int cols, i, j; uint32_t tmp; cols = rows; @@ -594,7 +588,7 @@ uint32_t gf_bitmatrix_inverse(uint32_t y, int w, uint32_t pp) void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base) { uint64_t a, prod; - int j, xor; + int xor; uint64_t *s64, *d64, *top; s64 = rd->s_start; @@ -693,8 +687,8 @@ static void gf_slow_multiply_region(gf_region_data *rd, void *src, void *dest, v fprintf(stderr, "Error: gf_slow_multiply_region: w=%d not implemented.\n", h->w); exit(1); } - src += wb; - dest += wb; + src = (char*)src + wb; + dest = (char*)dest + wb; } } @@ -773,8 +767,7 @@ void gf_set_region_data(gf_region_data *rd, int xor, int align) { - uint8_t *s8, *d8; - gf_internal_t *h; + gf_internal_t *h = NULL; int wb; uint32_t a; unsigned long uls, uld; @@ -802,7 +795,7 @@ void gf_set_region_data(gf_region_data *rd, if (align == -1) { /* JSP: This is cauchy. Error check bytes, then set up the pointers so that there are no alignment regions. */ - if (bytes % h->w != 0) { + if (h != NULL && bytes % h->w != 0) { fprintf(stderr, "Error in region multiply operation.\n"); fprintf(stderr, "The size must be a multiple of %d bytes.\n", h->w); exit(1); @@ -810,8 +803,8 @@ void gf_set_region_data(gf_region_data *rd, rd->s_start = src; rd->d_start = dest; - rd->s_top = src + bytes; - rd->d_top = src + bytes; + rd->s_top = (char*)src + bytes; + rd->d_top = (char*)src + bytes; return; } @@ -840,12 +833,12 @@ void gf_set_region_data(gf_region_data *rd, uls %= a; if (uls != 0) uls = (a-uls); - rd->s_start = rd->src + uls; - rd->d_start = rd->dest + uls; + rd->s_start = (char*)rd->src + uls; + rd->d_start = (char*)rd->dest + uls; bytes -= uls; bytes -= (bytes % align); - rd->s_top = rd->s_start + bytes; - rd->d_top = rd->d_start + bytes; + rd->s_top = (char*)rd->s_start + bytes; + rd->d_top = (char*)rd->d_start + bytes; } @@ -856,7 +849,7 @@ void gf_do_initial_region_alignment(gf_region_data *rd) void gf_do_final_region_alignment(gf_region_data *rd) { - gf_slow_multiply_region(rd, rd->s_top, rd->d_top, rd->src+rd->bytes); + gf_slow_multiply_region(rd, rd->s_top, rd->d_top, (char*)rd->src+rd->bytes); } void gf_multby_zero(void *dest, int bytes, int xor) @@ -897,9 +890,8 @@ void gf_multby_one(void *src, void *dest, int bytes, int xor) __m128i ms, md; #endif unsigned long uls, uld; - uint8_t *s8, *d8, *dtop8; + uint8_t *s8, *d8; uint64_t *s64, *d64, *dtop64; - int abytes; gf_region_data rd; if (!xor) { @@ -910,6 +902,7 @@ void gf_multby_one(void *src, void *dest, int bytes, int xor) uld = (unsigned long) dest; #ifdef INTEL_SSE2 + int abytes; s8 = (uint8_t *) src; d8 = (uint8_t *) dest; if (uls % 16 == uld % 16) { @@ -1025,7 +1018,7 @@ static void gf_unaligned_xor(void *src, void *dest, int bytes) } d8 = (uint8_t *) d64; - while (d8 < (uint8_t *) (dest+bytes)) { + while (d8 < (uint8_t *) ((char*)dest+bytes)) { *d8 ^= *s8; d8++; s8++; diff --git a/src/gf_general.c b/src/gf_general.c index d9d1700..c410598 100644 --- a/src/gf_general.c +++ b/src/gf_general.c @@ -267,7 +267,6 @@ void gf_general_do_region_check(gf_t *gf, gf_general_t *a, void *orig_a, void *o int w, words, i; gf_general_t oa, ot, ft, sb; char sa[50], soa[50], sot[50], sft[50], ssb[50]; - uint8_t *p; h = (gf_internal_t *) gf->scratch; w = h->w; @@ -327,7 +326,7 @@ void gf_general_set_up_single_timing_test(int w, void *ra, void *rb, int size) uint64_t *r64; int i; - top = rb+size; + top = (char*)rb+size; /* If w is 8, 16, 32, 64 or 128, fill the regions with random bytes. However, don't allow for zeros in rb, because that will screw up @@ -366,7 +365,7 @@ void gf_general_set_up_single_timing_test(int w, void *ra, void *rb, int size) r64[1] = g.w128[1]; break; } - rb += (w/8); + rb = (char*)rb + (w/8); } } else if (w == 4) { r8a = (uint8_t *) ra; @@ -408,7 +407,7 @@ int gf_general_do_single_timing_test(gf_t *gf, void *ra, void *rb, int size, cha h = (gf_internal_t *) gf->scratch; w = h->w; - top = ra + size; + top = (char*)ra + size; if (w == 8 || w == 4) { r8a = (uint8_t *) ra; diff --git a/src/gf_general.h b/src/gf_general.h deleted file mode 100644 index 9a5de52..0000000 --- a/src/gf_general.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic - * James S. Plank, Ethan L. Miller, Kevin M. Greenan, - * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. - * - * gf_general.h - * - * This file has helper routines for doing basic GF operations with any - * legal value of w. The problem is that w <= 32, w=64 and w=128 all have - * different data types, which is a pain. The procedures in this file try - * to alleviate that pain. They are used in gf_unit and gf_time. - */ - -#pragma once - -#include <stdio.h> -#include <getopt.h> -#include <stdint.h> -#include <string.h> -#include <stdlib.h> -#include <time.h> - -#include "gf_complete.h" - -typedef union { - uint32_t w32; - uint64_t w64; - uint64_t w128[2]; -} gf_general_t; - -void gf_general_set_zero(gf_general_t *v, int w); -void gf_general_set_one(gf_general_t *v, int w); -void gf_general_set_two(gf_general_t *v, int w); - -int gf_general_is_zero(gf_general_t *v, int w); -int gf_general_is_one(gf_general_t *v, int w); -int gf_general_are_equal(gf_general_t *v1, gf_general_t *v2, int w); - -void gf_general_val_to_s(gf_general_t *v, int w, char *s, int hex); -int gf_general_s_to_val(gf_general_t *v, int w, char *s, int hex); - -void gf_general_set_random(gf_general_t *v, int w, int zero_ok); - -void gf_general_add(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c); -void gf_general_multiply(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c); -void gf_general_divide(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c); -void gf_general_inverse(gf_t *gf, gf_general_t *a, gf_general_t *b); - -void gf_general_do_region_multiply(gf_t *gf, gf_general_t *a, - void *ra, void *rb, - int bytes, int xor); - -void gf_general_do_region_check(gf_t *gf, gf_general_t *a, - void *orig_a, void *orig_target, void *final_target, - int bytes, int xor); - - -/* Which is M, D or I for multiply, divide or inverse. */ - -void gf_general_set_up_single_timing_test(int w, void *ra, void *rb, int size); -int gf_general_do_single_timing_test(gf_t *gf, void *ra, void *rb, int size, char which); diff --git a/src/gf_int.h b/src/gf_int.h deleted file mode 100644 index 9221569..0000000 --- a/src/gf_int.h +++ /dev/null @@ -1,200 +0,0 @@ -/* - * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic - * James S. Plank, Ethan L. Miller, Kevin M. Greenan, - * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. - * - * gf_int.h - * - * Internal code for Galois field routines. This is not meant for - * users to include, but for the internal GF files to use. - */ - -#pragma once - -#include "gf_complete.h" - -#include <string.h> - -extern void timer_start (double *t); -extern double timer_split (const double *t); -extern void galois_fill_random (void *buf, int len, unsigned int seed); - -typedef struct { - int mult_type; - int region_type; - int divide_type; - int w; - uint64_t prim_poly; - int free_me; - int arg1; - int arg2; - gf_t *base_gf; - void *private; -} gf_internal_t; - -extern int gf_w4_init (gf_t *gf); -extern int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2); - -extern int gf_w8_init (gf_t *gf); -extern int gf_w8_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2); - -extern int gf_w16_init (gf_t *gf); -extern int gf_w16_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2); - -extern int gf_w32_init (gf_t *gf); -extern int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2); - -extern int gf_w64_init (gf_t *gf); -extern int gf_w64_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2); - -extern int gf_w128_init (gf_t *gf); -extern int gf_w128_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2); - -extern int gf_wgen_init (gf_t *gf); -extern int gf_wgen_scratch_size(int w, int mult_type, int region_type, int divide_type, int arg1, int arg2); - -void gf_wgen_cauchy_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor); -gf_val_32_t gf_wgen_extract_word(gf_t *gf, void *start, int bytes, int index); - -extern void gf_alignment_error(char *s, int a); - -extern uint32_t gf_bitmatrix_inverse(uint32_t y, int w, uint32_t pp); - -/* This returns the correct default for prim_poly when base is used as the base - field for COMPOSITE. It returns 0 if we don't have a default prim_poly. */ - -extern uint64_t gf_composite_get_default_poly(gf_t *base); - -/* This structure lets you define a region multiply. It helps because you can handle - unaligned portions of the data with the procedures below, which really cleans - up the code. */ - -typedef struct { - gf_t *gf; - void *src; - void *dest; - int bytes; - uint64_t val; - int xor; - int align; /* The number of bytes to which to align. */ - void *s_start; /* The start and the top of the aligned region. */ - void *d_start; - void *s_top; - void *d_top; -} gf_region_data; - -/* This lets you set up one of these in one call. It also sets the start/top pointers. */ - -void gf_set_region_data(gf_region_data *rd, - gf_t *gf, - void *src, - void *dest, - int bytes, - uint64_t val, - int xor, - int align); - -/* This performs gf->multiply.32() on all of the unaligned bytes in the beginning of the region */ - -extern void gf_do_initial_region_alignment(gf_region_data *rd); - -/* This performs gf->multiply.32() on all of the unaligned bytes in the end of the region */ - -extern void gf_do_final_region_alignment(gf_region_data *rd); - -extern void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base); - -extern void gf_multby_zero(void *dest, int bytes, int xor); -extern void gf_multby_one(void *src, void *dest, int bytes, int xor); - -typedef enum {GF_E_MDEFDIV, /* Dev != Default && Mult == Default */ - GF_E_MDEFREG, /* Reg != Default && Mult == Default */ - GF_E_MDEFARG, /* Args != Default && Mult == Default */ - GF_E_DIVCOMP, /* Mult == Composite && Div != Default */ - GF_E_CAUCOMP, /* Mult == Composite && Reg == CAUCHY */ - GF_E_DOUQUAD, /* Reg == DOUBLE && Reg == QUAD */ - GF_E_SSE__NO, /* Reg == SSE && Reg == NOSSE */ - GF_E_CAUCHYB, /* Reg == CAUCHY && Other Reg */ - GF_E_CAUGT32, /* Reg == CAUCHY && w > 32*/ - GF_E_ARG1SET, /* Arg1 != 0 && Mult \notin COMPOSITE/SPLIT/GROUP */ - GF_E_ARG2SET, /* Arg2 != 0 && Mult \notin SPLIT/GROUP */ - GF_E_MATRIXW, /* Div == MATRIX && w > 32 */ - GF_E_BAD___W, /* Illegal w */ - GF_E_DOUBLET, /* Reg == DOUBLE && Mult != TABLE */ - GF_E_DOUBLEW, /* Reg == DOUBLE && w \notin {4,8} */ - GF_E_DOUBLEJ, /* Reg == DOUBLE && other Reg */ - GF_E_DOUBLEL, /* Reg == DOUBLE & LAZY but w = 4 */ - GF_E_QUAD__T, /* Reg == QUAD && Mult != TABLE */ - GF_E_QUAD__W, /* Reg == QUAD && w != 4 */ - GF_E_QUAD__J, /* Reg == QUAD && other Reg */ - GF_E_LAZY__X, /* Reg == LAZY && not DOUBLE or QUAD*/ - GF_E_ALTSHIF, /* Mult == Shift && Reg == ALTMAP */ - GF_E_SSESHIF, /* Mult == Shift && Reg == SSE|NOSSE */ - GF_E_ALT_CFM, /* Mult == CARRY_FREE && Reg == ALTMAP */ - GF_E_SSE_CFM, /* Mult == CARRY_FREE && Reg == SSE|NOSSE */ - GF_E_PCLMULX, /* Mult == Carry_Free && No PCLMUL */ - GF_E_ALT_BY2, /* Mult == Bytwo_x && Reg == ALTMAP */ - GF_E_BY2_SSE, /* Mult == Bytwo_x && Reg == SSE && No SSE2 */ - GF_E_LOGBADW, /* Mult == LOGx, w too big*/ - GF_E_LOG___J, /* Mult == LOGx, && Reg == SSE|ALTMAP|NOSSE */ - GF_E_ZERBADW, /* Mult == LOG_ZERO, w \notin {8,16} */ - GF_E_ZEXBADW, /* Mult == LOG_ZERO_EXT, w != 8 */ - GF_E_LOGPOLY, /* Mult == LOG & poly not primitive */ - GF_E_GR_ARGX, /* Mult == GROUP, Bad arg1/2 */ - GF_E_GR_W_48, /* Mult == GROUP, w \in { 4, 8 } */ - GF_E_GR_W_16, /* Mult == GROUP, w == 16, arg1 != 4 || arg2 != 4 */ - GF_E_GR_128A, /* Mult == GROUP, w == 128, bad args */ - GF_E_GR_A_27, /* Mult == GROUP, either arg > 27 */ - GF_E_GR_AR_W, /* Mult == GROUP, either arg > w */ - GF_E_GR____J, /* Mult == GROUP, Reg == SSE|ALTMAP|NOSSE */ - GF_E_TABLE_W, /* Mult == TABLE, w too big */ - GF_E_TAB_SSE, /* Mult == TABLE, SSE|NOSSE only apply to w == 4 */ - GF_E_TABSSE3, /* Mult == TABLE, Need SSSE3 for SSE */ - GF_E_TAB_ALT, /* Mult == TABLE, Reg == ALTMAP */ - GF_E_SP128AR, /* Mult == SPLIT, w=128, Bad arg1/arg2 */ - GF_E_SP128AL, /* Mult == SPLIT, w=128, SSE requires ALTMAP */ - GF_E_SP128AS, /* Mult == SPLIT, w=128, ALTMAP requires SSE */ - GF_E_SP128_A, /* Mult == SPLIT, w=128, SSE only with 4/128 */ - GF_E_SP128_S, /* Mult == SPLIT, w=128, ALTMAP only with 4/128 */ - GF_E_SPLIT_W, /* Mult == SPLIT, Bad w (8, 16, 32, 64, 128) */ - GF_E_SP_16AR, /* Mult == SPLIT, w=16, Bad arg1/arg2 */ - GF_E_SP_16_A, /* Mult == SPLIT, w=16, ALTMAP only with 4/16 */ - GF_E_SP_16_S, /* Mult == SPLIT, w=16, SSE only with 4/16 */ - GF_E_SP_32AR, /* Mult == SPLIT, w=32, Bad arg1/arg2 */ - GF_E_SP_32AS, /* Mult == SPLIT, w=32, ALTMAP requires SSE */ - GF_E_SP_32_A, /* Mult == SPLIT, w=32, ALTMAP only with 4/32 */ - GF_E_SP_32_S, /* Mult == SPLIT, w=32, SSE only with 4/32 */ - GF_E_SP_64AR, /* Mult == SPLIT, w=64, Bad arg1/arg2 */ - GF_E_SP_64AS, /* Mult == SPLIT, w=64, ALTMAP requires SSE */ - GF_E_SP_64_A, /* Mult == SPLIT, w=64, ALTMAP only with 4/64 */ - GF_E_SP_64_S, /* Mult == SPLIT, w=64, SSE only with 4/64 */ - GF_E_SP_8_AR, /* Mult == SPLIT, w=8, Bad arg1/arg2 */ - GF_E_SP_8__A, /* Mult == SPLIT, w=8, no ALTMAP */ - GF_E_SP_SSE3, /* Mult == SPLIT, Need SSSE3 for SSE */ - GF_E_COMP_A2, /* Mult == COMP, arg1 must be = 2 */ - GF_E_COMP_SS, /* Mult == COMP, SSE|NOSSE */ - GF_E_COMP__W, /* Mult == COMP, Bad w. */ - GF_E_UNKFLAG, /* Unknown flag in create_from.... */ - GF_E_UNKNOWN, /* Unknown mult_type. */ - GF_E_UNK_REG, /* Unknown region_type. */ - GF_E_UNK_DIV, /* Unknown divide_type. */ - GF_E_CFM___W, /* Mult == CFM, Bad w. */ - GF_E_CFM4POL, /* Mult == CFM & Prim Poly has high bits set. */ - GF_E_CFM8POL, /* Mult == CFM & Prim Poly has high bits set. */ - GF_E_CF16POL, /* Mult == CFM & Prim Poly has high bits set. */ - GF_E_CF32POL, /* Mult == CFM & Prim Poly has high bits set. */ - GF_E_CF64POL, /* Mult == CFM & Prim Poly has high bits set. */ - GF_E_FEWARGS, /* Too few args in argc/argv. */ - GF_E_BADPOLY, /* Bad primitive polynomial -- too many bits set. */ - GF_E_COMP_PP, /* Bad primitive polynomial -- bigger than sub-field. */ - GF_E_COMPXPP, /* Can't derive a default pp for composite field. */ - GF_E_BASE__W, /* Composite -- Base field is the wrong size. */ - GF_E_TWOMULT, /* In create_from... two -m's. */ - GF_E_TWO_DIV, /* In create_from... two -d's. */ - GF_E_POLYSPC, /* Bad numbera after -p. */ - GF_E_SPLITAR, /* Ran out of arguments in SPLIT */ - GF_E_SPLITNU, /* Arguments not integers in SPLIT. */ - GF_E_GROUPAR, /* Ran out of arguments in GROUP */ - GF_E_GROUPNU, /* Arguments not integers in GROUP. */ - GF_E_DEFAULT } gf_error_type_t; - diff --git a/src/gf_method.c b/src/gf_method.c index 36ec3c4..a7bcacf 100644 --- a/src/gf_method.c +++ b/src/gf_method.c @@ -21,10 +21,9 @@ int create_gf_from_argv(gf_t *gf, int w, int argc, char **argv, int starting) { int mult_type, divide_type, region_type; - int arg1, arg2, subrg_size; + int arg1, arg2; uint64_t prim_poly; gf_t *base; - char *crt, *x, *y; mult_type = GF_MULT_DEFAULT; region_type = GF_REGION_DEFAULT; diff --git a/src/gf_rand.h b/src/gf_rand.h deleted file mode 100644 index 24294ad..0000000 --- a/src/gf_rand.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic - * James S. Plank, Ethan L. Miller, Kevin M. Greenan, - * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. - * - * gf_rand.h - * - * Random number generation, using the "Mother of All" random number generator. */ - -#pragma once -#include <stdint.h> -#include <stdio.h> -#include <stdlib.h> - -/* These are all pretty self-explanatory */ -uint32_t MOA_Random_32(); -uint64_t MOA_Random_64(); -void MOA_Random_128(uint64_t *x); -uint32_t MOA_Random_W(int w, int zero_ok); -void MOA_Fill_Random_Region (void *reg, int size); /* reg should be aligned to 4 bytes, but - size can be anything. */ -void MOA_Seed(uint32_t seed); diff --git a/src/gf_w128.c b/src/gf_w128.c index fae9f5c..881df00 100644 --- a/src/gf_w128.c +++ b/src/gf_w128.c @@ -81,6 +81,7 @@ int xor) } } +#if defined(INTEL_SSE4_PCLMUL) static void gf_w128_clm_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, @@ -89,9 +90,7 @@ int xor) int i; gf_val_128_t s128; gf_val_128_t d128; - uint64_t c128[2]; gf_region_data rd; -#if defined(INTEL_SSE4_PCLMUL) __m128i a,b; __m128i result0,result1; __m128i prim_poly; @@ -106,8 +105,6 @@ int xor) if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; } } - set_zero(c128, 0); - s128 = (gf_val_128_t) src; d128 = (gf_val_128_t) dest; @@ -184,8 +181,8 @@ int xor) d128[i+1] = (uint64_t)_mm_extract_epi64(result1,0); } } -#endif } +#endif /* * Some w128 notes: @@ -384,7 +381,7 @@ gf_w128_sse_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_ { #if defined(INTEL_SSE4) int i; - __m128i a, b, pp, one, prod, amask, l_middle_one, u_middle_one; + __m128i a, b, pp, prod, amask, u_middle_one; /*John: pmask is always the highest bit set, and the rest zeros. amask changes, it's a countdown.*/ uint32_t topbit, middlebit, pmask; /* this is used as a boolean value */ gf_internal_t *h; @@ -400,7 +397,6 @@ gf_w128_sse_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_ pmask = 0x80000000; amask = _mm_insert_epi32(prod, 0x80000000, 0x3); u_middle_one = _mm_insert_epi32(prod, 1, 0x2); - l_middle_one = _mm_insert_epi32(prod, 1 << 31, 0x1); for (i = 0; i < 64; i++) { topbit = (_mm_extract_epi32(prod, 0x3) & pmask); @@ -599,13 +595,13 @@ gf_w128_split_4_128_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_ } } +#ifdef INTEL_SSSE3 static void gf_w128_split_4_128_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor) { -#ifdef INTEL_SSSE3 gf_internal_t *h; - int i, m, j, k, tindex; + int i, j, k; uint64_t pp, v[2], s, *s64, *d64, *top; __m128i p, tables[32][16]; struct gf_w128_split_4_128_data *ld; @@ -624,7 +620,7 @@ gf_w128_split_4_128_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_ /* Doing this instead of gf_do_initial_region_alignment() because that doesn't hold 128-bit vals */ - gf_w128_multiply_region_from_single(gf, src, dest, val, (rd.s_start-src), xor); + gf_w128_multiply_region_from_single(gf, src, dest, val, ((char*)rd.s_start-(char*)src), xor); s64 = (uint64_t *) rd.s_start; d64 = (uint64_t *) rd.d_start; @@ -694,18 +690,18 @@ gf_w128_split_4_128_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_ /* Doing this instead of gf_do_final_region_alignment() because that doesn't hold 128-bit vals */ - gf_w128_multiply_region_from_single(gf, rd.s_top, rd.d_top, val, (src+bytes)-rd.s_top, xor); -#endif + gf_w128_multiply_region_from_single(gf, rd.s_top, rd.d_top, val, ((char*)src+bytes)-(char*)rd.s_top, xor); } +#endif +#ifdef INTEL_SSSE3 static void gf_w128_split_4_128_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor) { -#ifdef INTEL_SSSE3 gf_internal_t *h; - int i, m, j, k, tindex; - uint64_t pp, v[2], s, *s64, *d64, *top; + int i, j, k; + uint64_t pp, v[2], *s64, *d64, *top; __m128i si, tables[32][16], p[16], v0, mask1; struct gf_w128_split_4_128_data *ld; uint8_t btable[16]; @@ -724,7 +720,7 @@ gf_w128_split_4_128_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, /* Doing this instead of gf_do_initial_region_alignment() because that doesn't hold 128-bit vals */ - gf_w128_multiply_region_from_single(gf, src, dest, val, (rd.s_start-src), xor); + gf_w128_multiply_region_from_single(gf, src, dest, val, ((char*)rd.s_start-(char*)src), xor); s64 = (uint64_t *) rd.s_start; d64 = (uint64_t *) rd.d_start; @@ -804,9 +800,9 @@ gf_w128_split_4_128_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, } /* Doing this instead of gf_do_final_region_alignment() because that doesn't hold 128-bit vals */ - gf_w128_multiply_region_from_single(gf, rd.s_top, rd.d_top, val, (src+bytes)-rd.s_top, xor); -#endif + gf_w128_multiply_region_from_single(gf, rd.s_top, rd.d_top, val, ((char*)src+bytes)-(char*)rd.s_top, xor); } +#endif static void @@ -886,7 +882,7 @@ gf_w128_split_8_128_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_ void gf_w128_bytwo_b_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor) { - uint64_t bmask, pp, vmask; + uint64_t bmask, pp; gf_internal_t *h; uint64_t a[2], c[2], b[2], *s64, *d64, *top; gf_region_data rd; @@ -987,7 +983,7 @@ void gf_w128_group_m_init(gf_t *gf, gf_val_128_t b128) void gf_w128_group_multiply(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) { - int i,j; + int i; /* index_r, index_m, total_m (if g_r > g_m) */ int i_r, i_m, t_m; int mask_m, mask_r; @@ -1162,11 +1158,12 @@ gf_w128_euclid(GFP gf, gf_val_128_t a128, gf_val_128_t b128) uint64_t c_i[2]; uint64_t *b; uint64_t one = 1; - uint64_t buf, buf1; /* This needs to return some sort of error (in b128?) */ if (a128[0] == 0 && a128[1] == 0) return; + b = (uint64_t *) b128; + e_im1[0] = 0; e_im1[1] = ((gf_internal_t *) (gf->scratch))->prim_poly; e_i[0] = a128[0]; @@ -1240,7 +1237,6 @@ gf_w128_euclid(GFP gf, gf_val_128_t a128, gf_val_128_t b128) d_i = d_ip1; } - b = (uint64_t *) b128; b[0] = y_i[0]; b[1] = y_i[1]; return; @@ -1326,7 +1322,6 @@ static void gf_w128_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor) { - unsigned long uls, uld; gf_internal_t *h = (gf_internal_t *) gf->scratch; gf_t *base_gf = h->base_gf; uint64_t b0 = val[1]; @@ -1381,14 +1376,13 @@ gf_w128_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_12 gf_internal_t *h = (gf_internal_t *) gf->scratch; gf_t *base_gf = h->base_gf; gf_val_64_t val0 = val[1]; gf_val_64_t val1 = val[0]; - uint64_t *l, *hi; uint8_t *slow, *shigh; uint8_t *dlow, *dhigh, *top; int sub_reg_size; gf_region_data rd; gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 64); - gf_w128_multiply_region_from_single(gf, src, dest, val, (rd.s_start-src), xor); + gf_w128_multiply_region_from_single(gf, src, dest, val, ((char*)rd.s_start-(char*)src), xor); slow = (uint8_t *) rd.s_start; dlow = (uint8_t *) rd.d_start; @@ -1404,7 +1398,7 @@ gf_w128_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_12 base_gf->multiply_region.w64(base_gf, shigh, dhigh, base_gf->multiply.w64(base_gf, h->prim_poly, val1 ), sub_reg_size, 1); - gf_w128_multiply_region_from_single(gf, rd.s_top, rd.d_top, val, (src+bytes)-rd.s_top, xor); + gf_w128_multiply_region_from_single(gf, rd.s_top, rd.d_top, val, ((char*)src+bytes)-(char*)rd.s_top, xor); } @@ -1419,8 +1413,6 @@ int gf_w128_composite_init(gf_t *gf) gf->multiply_region.w128 = gf_w128_composite_multiply_region; } - gf_internal_t *base_h = (gf_internal_t *) h->base_gf->scratch; - gf->multiply.w128 = gf_w128_composite_multiply; gf->divide.w128 = gf_w128_divide_from_inverse; gf->inverse.w128 = gf_w128_composite_inverse; @@ -1444,8 +1436,6 @@ int gf_w128_cfm_init(gf_t *gf) static int gf_w128_shift_init(gf_t *gf) { - gf_internal_t *h; - h = (gf_internal_t*) gf->scratch; gf->multiply.w128 = gf_w128_shift_multiply; gf->inverse.w128 = gf_w128_euclid; gf->multiply_region.w128 = gf_w128_multiply_region_from_single; @@ -1501,10 +1491,10 @@ void gf_w128_group_r_init(gf_t *gf) return; } +#if 0 // defined(INTEL_SSE4) static void gf_w128_group_r_sse_init(gf_t *gf) { -#if defined(INTEL_SSE4) int i, j; int g_r; uint64_t pp; @@ -1526,8 +1516,8 @@ void gf_w128_group_r_sse_init(gf_t *gf) } } return; -#endif } +#endif static int gf_w128_split_init(gf_t *gf) @@ -1587,12 +1577,10 @@ int gf_w128_group_init(gf_t *gf) { gf_internal_t *scratch; gf_group_tables_t *gt; - int g_m, g_r, size_r; - long tmp; + int g_r, size_r; scratch = (gf_internal_t *) gf->scratch; gt = scratch->private; - g_m = scratch->arg1; g_r = scratch->arg2; size_r = (1 << g_r); @@ -1690,7 +1678,6 @@ void gf_w128_composite_extract_word(gf_t *gf, void *start, int bytes, int index, int gf_w128_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) { int size_m, size_r; - int w = 128; if (divide_type==GF_DIVIDE_MATRIX) return 0; switch(mult_type) @@ -1739,7 +1726,7 @@ int gf_w128_scratch_size(int mult_type, int region_type, int divide_type, int ar int gf_w128_init(gf_t *gf) { - gf_internal_t *h, *h_base, *h_base_base, *h_base_base_base; + gf_internal_t *h; int no_default_flag = 0; h = (gf_internal_t *) gf->scratch; diff --git a/src/gf_w16.c b/src/gf_w16.c index 454c6cc..f1fb650 100644 --- a/src/gf_w16.c +++ b/src/gf_w16.c @@ -125,6 +125,7 @@ gf_w16_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_32_t gf_do_final_region_alignment(&rd); } +#if defined(INTEL_SSE4_PCLMUL) static void gf_w16_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) @@ -132,8 +133,6 @@ gf_w16_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val gf_region_data rd; uint16_t *s16; uint16_t *d16; - -#if defined(INTEL_SSE4_PCLMUL) __m128i a, b; __m128i result; __m128i prim_poly; @@ -186,9 +185,10 @@ gf_w16_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val } } gf_do_final_region_alignment(&rd); -#endif } +#endif +#if defined(INTEL_SSE4_PCLMUL) static void gf_w16_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) @@ -197,8 +197,6 @@ gf_w16_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, gf_val uint16_t *s16; uint16_t *d16; -#if defined(INTEL_SSE4_PCLMUL) - __m128i a, b; __m128i result; __m128i prim_poly; @@ -255,9 +253,10 @@ gf_w16_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, gf_val } } gf_do_final_region_alignment(&rd); -#endif } +#endif +#if defined(INTEL_SSE4_PCLMUL) static void gf_w16_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) @@ -266,8 +265,6 @@ gf_w16_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val uint16_t *s16; uint16_t *d16; -#if defined(INTEL_SSE4_PCLMUL) - __m128i a, b; __m128i result; __m128i prim_poly; @@ -328,8 +325,8 @@ gf_w16_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val } } gf_do_final_region_alignment(&rd); -#endif } +#endif static inline @@ -453,7 +450,7 @@ gf_w16_clm_multiply_2 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16) __m128i a, b; __m128i result; __m128i prim_poly; - __m128i v, w; + __m128i w; gf_internal_t * h = gf->scratch; a = _mm_insert_epi32 (_mm_setzero_si128(), a16, 0); @@ -500,7 +497,7 @@ gf_w16_clm_multiply_3 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16) __m128i a, b; __m128i result; __m128i prim_poly; - __m128i v, w; + __m128i w; gf_internal_t * h = gf->scratch; a = _mm_insert_epi32 (_mm_setzero_si128(), a16, 0); @@ -540,7 +537,7 @@ gf_w16_clm_multiply_4 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16) __m128i a, b; __m128i result; __m128i prim_poly; - __m128i v, w; + __m128i w; gf_internal_t * h = gf->scratch; a = _mm_insert_epi32 (_mm_setzero_si128(), a16, 0); @@ -605,13 +602,13 @@ int gf_w16_shift_init(gf_t *gf) static int gf_w16_cfm_init(gf_t *gf) { +#if defined(INTEL_SSE4_PCLMUL) gf_internal_t *h; h = (gf_internal_t *) gf->scratch; /*Ben: Determining how many reductions to do */ -#if defined(INTEL_SSE4_PCLMUL) if ((0xfe00 & h->prim_poly) == 0) { gf->multiply.w32 = gf_w16_clm_multiply_2; gf->multiply_region.w32 = gf_w16_clm_multiply_region_from_single_2; @@ -774,9 +771,8 @@ static void gf_w16_split_4_16_lazy_nosse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { - uint64_t i, j, a, b, c, prod; + uint64_t i, j, c, prod; uint8_t *s8, *d8, *top; - gf_internal_t *h; uint16_t table[4][16]; gf_region_data rd; @@ -786,8 +782,6 @@ gf_w16_split_4_16_lazy_nosse_altmap_multiply_region(gf_t *gf, void *src, void *d gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32); gf_do_initial_region_alignment(&rd); - h = (gf_internal_t *) gf->scratch; - /*Ben: Constructs lazy multiplication table*/ for (j = 0; j < 16; j++) { @@ -840,7 +834,6 @@ gf_w16_split_4_16_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_3 { uint64_t i, j, a, c, prod; uint16_t *s16, *d16, *top; - gf_internal_t *h; uint16_t table[4][16]; gf_region_data rd; @@ -850,8 +843,6 @@ gf_w16_split_4_16_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_3 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); gf_do_initial_region_alignment(&rd); - h = (gf_internal_t *) gf->scratch; - for (j = 0; j < 16; j++) { for (i = 0; i < 4; i++) { c = (j << (i*4)); @@ -880,7 +871,7 @@ static void gf_w16_split_8_16_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { - uint64_t j, k, v, a, c, prod, *s64, *d64, *top64; + uint64_t j, k, v, a, prod, *s64, *d64, *top64; gf_internal_t *h; uint64_t htable[256], ltable[256]; gf_region_data rd; @@ -966,7 +957,7 @@ gf_w16_split_8_16_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_3 static void gf_w16_table_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { - uint64_t j, a, c, pp; + uint64_t c; gf_internal_t *h; struct gf_w16_lazytable_data *ltd; gf_region_data rd; @@ -1010,12 +1001,12 @@ gf_w16_split_4_16_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_v { #ifdef INTEL_SSSE3 uint64_t i, j, *s64, *d64, *top64;; - uint64_t a, c, prod; + uint64_t c, prod; uint8_t low[4][16]; uint8_t high[4][16]; gf_region_data rd; - __m128i mask, ta, tb, ti, tpl, tph, tlow[4], thigh[4], tta, ttb, shuffler, unshuffler, lmask; + __m128i mask, ta, tb, ti, tpl, tph, tlow[4], thigh[4], tta, ttb, lmask; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } @@ -1147,7 +1138,6 @@ gf_w16_split_4_16_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *des uint8_t low[4][16]; uint8_t high[4][16]; gf_region_data rd; - struct gf_single_table_data *std; __m128i mask, ta, tb, ti, tpl, tph, tlow[4], thigh[4]; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } @@ -1358,11 +1348,8 @@ issse3 = 0; static int gf_w16_table_init(gf_t *gf) { - gf_internal_t *h; gf_w16_log_init(gf); - h = (gf_internal_t *) gf->scratch; - gf->multiply_region.w32 = gf_w16_table_lazy_multiply_region; return 1; } @@ -1557,15 +1544,14 @@ gf_w16_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_ prod = _mm_xor_si128(prod, t1); \ v = _mm_srli_epi64(v, 1); } +#ifdef INTEL_SSE2 static void gf_w16_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSE2 int i; uint8_t *s8, *d8; uint32_t vrev; - uint64_t amask; __m128i pp, m1, m2, ta, prod, t1, t2, tp, one, v; struct gf_w16_bytwo_data *btd; gf_region_data rd; @@ -1618,17 +1604,16 @@ gf_w16_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t s8 += 16; } gf_do_final_region_alignment(&rd); -#endif } +#endif +#ifdef INTEL_SSE2 static void gf_w16_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w16_bytwo_data *btd) { -#ifdef INTEL_SSE2 - int i; - uint8_t *d8, *s8, tb; - __m128i pp, m1, m2, t1, t2, va, vb; + uint8_t *d8, *s8; + __m128i pp, m1, m2, t1, t2, va; s8 = (uint8_t *) rd->s_start; d8 = (uint8_t *) rd->d_start; @@ -1644,16 +1629,15 @@ gf_w16_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w16_bytwo_data * d8 += 16; s8 += 16; } -#endif } +#endif +#ifdef INTEL_SSE2 static void gf_w16_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w16_bytwo_data *btd) { -#ifdef INTEL_SSE2 - int i; - uint8_t *d8, *s8, tb; + uint8_t *d8, *s8; __m128i pp, m1, m2, t1, t2, va, vb; s8 = (uint8_t *) rd->s_start; @@ -1672,15 +1656,15 @@ gf_w16_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w16_bytwo_data *bt d8 += 16; s8 += 16; } -#endif } +#endif +#ifdef INTEL_SSE2 static void gf_w16_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSE2 int itb; uint8_t *d8, *s8; __m128i pp, m1, m2, t1, t2, va, vb; @@ -1728,14 +1712,13 @@ gf_w16_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t } gf_do_final_region_alignment(&rd); -#endif } +#endif static void gf_w16_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { - int i; uint64_t *s64, *d64, t1, t2, ta, tb, prod; struct gf_w16_bytwo_data *btd; gf_region_data rd; @@ -1988,7 +1971,6 @@ gf_val_32_t gf_w16_composite_multiply_inline(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; uint8_t b0 = b & 0x00ff; uint8_t b1 = (b & 0xff00) >> 8; uint8_t a0 = a & 0x00ff; @@ -2072,7 +2054,6 @@ static void gf_w16_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { - unsigned long uls, uld; gf_internal_t *h = (gf_internal_t *) gf->scratch; gf_t *base_gf = h->base_gf; uint8_t b0 = val & 0x00ff; @@ -2080,7 +2061,6 @@ gf_w16_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t va uint16_t *s16, *d16, *top; uint8_t a0, a1, a1b1, *mt; gf_region_data rd; - struct gf_w16_logtable_data *ltd; struct gf_w16_composite_data *cd; cd = (struct gf_w16_composite_data *) h->private; @@ -2237,7 +2217,6 @@ inline gf_val_32_t gf_w16_group_4_4_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { - int i; uint16_t p, l, ind, r, a16; struct gf_w16_group_4_4_data *d44; @@ -2270,7 +2249,6 @@ gf_w16_group_4_4_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) static void gf_w16_group_4_4_region_multiply(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { - int i; uint16_t p, l, ind, r, a16, p16; struct gf_w16_group_4_4_data *d44; gf_region_data rd; @@ -2475,10 +2453,8 @@ int gf_w16_init(gf_t *gf) uint16_t *gf_w16_get_log_table(gf_t *gf) { - gf_internal_t *h; struct gf_w16_logtable_data *ltd; - h = (gf_internal_t *) gf->scratch; if (gf->multiply.w32 == gf_w16_log_multiply) { ltd = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private; return (uint16_t *) ltd->log_tbl; diff --git a/src/gf_w32.c b/src/gf_w32.c index 03f285f..1503c72 100644 --- a/src/gf_w32.c +++ b/src/gf_w32.c @@ -120,13 +120,13 @@ xor) } } +#if defined(INTEL_SSE4_PCLMUL) + static void gf_w32_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) { -#if defined(INTEL_SSE4_PCLMUL) - int i; uint32_t *s32; uint32_t *d32; @@ -167,16 +167,16 @@ gf_w32_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, uint32 d32[i] = ((gf_val_32_t)_mm_extract_epi32(result, 0)); } } -#endif } +#endif + +#if defined(INTEL_SSE4_PCLMUL) static void gf_w32_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) { -#if defined(INTEL_SSE4_PCLMUL) - int i; uint32_t *s32; uint32_t *d32; @@ -222,14 +222,14 @@ gf_w32_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, uint32 d32[i] = ((gf_val_32_t)_mm_extract_epi32(result, 0)); } } -#endif } +#endif +#if defined(INTEL_SSE4_PCLMUL) static void gf_w32_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) { -#if defined(INTEL_SSE4_PCLMUL) int i; uint32_t *s32; uint32_t *d32; @@ -279,8 +279,8 @@ gf_w32_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, uint32 d32[i] = ((gf_val_32_t)_mm_extract_epi32(result, 0)); } } -#endif } +#endif static inline @@ -414,7 +414,7 @@ gf_w32_clm_multiply_2 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32) __m128i a, b; __m128i result; __m128i prim_poly; - __m128i v, w; + __m128i w; gf_internal_t * h = gf->scratch; @@ -458,7 +458,7 @@ gf_w32_clm_multiply_3 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32) __m128i a, b; __m128i result; __m128i prim_poly; - __m128i v, w; + __m128i w; gf_internal_t * h = gf->scratch; @@ -497,7 +497,7 @@ gf_w32_clm_multiply_4 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32) __m128i a, b; __m128i result; __m128i prim_poly; - __m128i v, w; + __m128i w; gf_internal_t * h = gf->scratch; @@ -555,10 +555,6 @@ gf_w32_shift_multiply (gf_t *gf, uint32_t a32, uint32_t b32) static int gf_w32_cfm_init(gf_t *gf) { - gf_internal_t *h; - - h = (gf_internal_t *) gf->scratch; - gf->inverse.w32 = gf_w32_euclid; gf->multiply_region.w32 = gf_w32_multiply_region_from_single; @@ -566,6 +562,10 @@ int gf_w32_cfm_init(gf_t *gf) /*Ben: Check to see how many reduction steps it will take*/ #if defined(INTEL_SSE4_PCLMUL) + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + if ((0xfffe0000 & h->prim_poly) == 0){ gf->multiply.w32 = gf_w32_clm_multiply_2; gf->multiply_region.w32 = gf_w32_clm_multiply_region_from_single_2; @@ -616,9 +616,8 @@ gf_w32_group_set_shift_tables(uint32_t *shift, uint32_t val, gf_internal_t *h) static void gf_w32_group_s_equals_r_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { - int i; int leftover, rs; - uint32_t p, l, ind, r, a32; + uint32_t p, l, ind, a32; int bits_left; int g_s; gf_region_data rd; @@ -741,9 +740,8 @@ inline gf_val_32_t gf_w32_group_s_equals_r_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { - int i; int leftover, rs; - uint32_t p, l, ind, r, a32; + uint32_t p, l, ind, a32; int bits_left; int g_s; @@ -781,8 +779,7 @@ inline gf_val_32_t gf_w32_group_4_4_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { - int i; - uint32_t p, l, ind, r, a32; + uint32_t p, l, ind, a32; struct gf_w32_group_data *d44; gf_internal_t *h = (gf_internal_t *) gf->scratch; @@ -832,7 +829,7 @@ gf_w32_group_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { int i; int leftover; - uint64_t p, l, r, mask; + uint64_t p, l, r; uint32_t a32, ind; int g_s, g_r; struct gf_w32_group_data *gd; @@ -986,15 +983,14 @@ gf_w32_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_ prod = _mm_xor_si128(prod, t1); \ v = _mm_srli_epi64(v, 1); } +#ifdef INTEL_SSE2 static void gf_w32_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSE2 int i; uint8_t *s8, *d8; uint32_t vrev; - uint64_t amask; __m128i pp, m1, m2, ta, prod, t1, t2, tp, one, v; struct gf_w32_bytwo_data *btd; gf_region_data rd; @@ -1039,14 +1035,13 @@ gf_w32_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t s8 += 16; } gf_do_final_region_alignment(&rd); -#endif } +#endif static void gf_w32_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { - int i; uint64_t *s64, *d64, t1, t2, ta, tb, prod; struct gf_w32_bytwo_data *btd; gf_region_data rd; @@ -1181,14 +1176,13 @@ gf_w32_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_ gf_do_final_region_alignment(&rd); } +#ifdef INTEL_SSE2 static void gf_w32_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w32_bytwo_data *btd) { -#ifdef INTEL_SSE2 - int i; - uint8_t *d8, *s8, tb; - __m128i pp, m1, m2, t1, t2, va, vb; + uint8_t *d8, *s8; + __m128i pp, m1, m2, t1, t2, va; s8 = (uint8_t *) rd->s_start; d8 = (uint8_t *) rd->d_start; @@ -1204,16 +1198,15 @@ gf_w32_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w32_bytwo_data * d8 += 16; s8 += 16; } -#endif } +#endif +#ifdef INTEL_SSE2 static void gf_w32_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w32_bytwo_data *btd) { -#ifdef INTEL_SSE2 - int i; - uint8_t *d8, *s8, tb; + uint8_t *d8, *s8; __m128i pp, m1, m2, t1, t2, va, vb; s8 = (uint8_t *) rd->s_start; @@ -1232,15 +1225,15 @@ gf_w32_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w32_bytwo_data *bt d8 += 16; s8 += 16; } -#endif } +#endif +#ifdef INTEL_SSE2 static void gf_w32_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSE2 uint32_t itb; uint8_t *d8, *s8; __m128i pp, m1, m2, t1, t2, va, vb; @@ -1288,8 +1281,8 @@ gf_w32_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t } gf_do_final_region_alignment(&rd); -#endif } +#endif static int gf_w32_bytwo_init(gf_t *gf) @@ -1556,14 +1549,14 @@ gf_w32_split_2_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t gf_do_final_region_alignment(&rd); } +#ifdef INTEL_SSSE3 static void gf_w32_split_2_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) { -#ifdef INTEL_SSSE3 gf_internal_t *h; - int i, m, j, tindex; - uint32_t pp, v, v2, s, *s32, *d32, *top; + int i, tindex; + uint32_t pp, v, v2, *s32, *d32, *top; __m128i vi, si, pi, shuffler, tables[16], adder, xi, mask1, mask2; gf_region_data rd; @@ -1635,8 +1628,8 @@ gf_w32_split_2_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint gf_do_final_region_alignment(&rd); -#endif } +#endif static void @@ -1699,8 +1692,8 @@ gf_w32_split_4_32_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *des { #ifdef INTEL_SSSE3 gf_internal_t *h; - int i, m, j, k, tindex; - uint32_t pp, v, s, *s32, *d32, *top, *realtop; + int i, j, k; + uint32_t pp, v, *s32, *d32, *top; __m128i si, tables[8][4], p0, p1, p2, p3, mask1, v0, v1, v2, v3; struct gf_split_4_32_lazy_data *ld; uint8_t btable[16]; @@ -1891,9 +1884,9 @@ gf_w32_split_4_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint { #ifdef INTEL_SSSE3 gf_internal_t *h; - int i, m, j, k, tindex; - uint32_t pp, v, s, *s32, *d32, *top, tmp_table[16]; - __m128i vi, si, tables[8][4], p0, p1, p2, p3, mask1, v0, v1, v2, v3, mask8; + int i, j, k; + uint32_t pp, v, *s32, *d32, *top, tmp_table[16]; + __m128i si, tables[8][4], p0, p1, p2, p3, mask1, v0, v1, v2, v3, mask8; __m128i tv1, tv2, tv3, tv0; uint8_t btable[16]; gf_region_data rd; @@ -2378,7 +2371,6 @@ uint32_t gf_w32_composite_multiply_inline(gf_t *gf, uint32_t a, uint32_t b) { gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; uint32_t b0 = b & 0x0000ffff; uint32_t b1 = b >> 16; uint32_t a0 = a & 0x0000ffff; @@ -2620,11 +2612,8 @@ int gf_w32_composite_init(gf_t *gf) int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) { - int ss; int issse3 = 0; - ss = (GF_REGION_SSE | GF_REGION_NOSSE); - #ifdef INTEL_SSSE3 issse3 = 1; #endif diff --git a/src/gf_w4.c b/src/gf_w4.c index 2504ec6..65cbf23 100644 --- a/src/gf_w4.c +++ b/src/gf_w4.c @@ -61,7 +61,7 @@ struct gf_bytwo_data { t2 = ((t2 << 1) - (t2 >> (GF_FIELD_WIDTH-1))); \ b = (t1 ^ (t2 & ip));} -#define SSE_AB2(pp, m1 ,m2, va, t1, t2) {\ +#define SSE_AB2(pp, m1, va, t1, t2) {\ t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); \ t2 = _mm_and_si128(va, _mm_set1_epi8(0x88)); \ t2 = _mm_sub_epi64 (_mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); \ @@ -414,14 +414,14 @@ gf_w4_single_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t #define MM_PRINT(s, r) { uint8_t blah[16]; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (i = 0; i < 16; i++) printf(" %02x", blah[i]); printf("\n"); } +#ifdef INTEL_SSSE3 static void gf_w4_single_table_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSSE3 gf_region_data rd; uint8_t *base, *sptr, *dptr, *top; - __m128i tl, loset, h4, r, va, th; + __m128i tl, loset, r, va, th; struct gf_single_table_data *std; @@ -460,15 +460,15 @@ gf_w4_single_table_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_3 } gf_do_final_region_alignment(&rd); -#endif } +#endif static int gf_w4_single_table_init(gf_t *gf) { gf_internal_t *h; struct gf_single_table_data *std; - int a, b, prod, loga, logb; + int a, b, prod; h = (gf_internal_t *) gf->scratch; @@ -531,7 +531,6 @@ static void gf_w4_double_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { - unsigned long uls, uld; int i; uint8_t *s8, *d8, *base; gf_region_data rd; @@ -560,7 +559,7 @@ int gf_w4_double_table_init(gf_t *gf) { gf_internal_t *h; struct gf_double_table_data *std; - int a, b, c, prod, loga, logb, ab; + int a, b, c, prod, ab; uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE]; h = (gf_internal_t *) gf->scratch; @@ -687,7 +686,7 @@ int gf_w4_quad_table_init(gf_t *gf) { gf_internal_t *h; struct gf_quad_table_data *std; - int prod, loga, logb, ab, val, a, b, c, d, va, vb, vc, vd; + int prod, val, a, b, c, d, va, vb, vc, vd; uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE]; h = (gf_internal_t *) gf->scratch; @@ -731,10 +730,9 @@ int gf_w4_quad_table_lazy_init(gf_t *gf) { gf_internal_t *h; struct gf_quad_table_lazy_data *std; - int a, b, c, prod, loga, logb, ab; + int a, b, prod, loga, logb; uint8_t log_tbl[GF_FIELD_SIZE]; uint8_t antilog_tbl[GF_FIELD_SIZE*2]; - uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE]; h = (gf_internal_t *) gf->scratch; std = (struct gf_quad_table_lazy_data *)h->private; @@ -911,23 +909,22 @@ gf_w4_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t } #define BYTWO_P_ONESTEP {\ - SSE_AB2(pp, m1 ,m2, prod, t1, t2); \ + SSE_AB2(pp, m1, prod, t1, t2); \ t1 = _mm_and_si128(v, one); \ t1 = _mm_sub_epi8(t1, one); \ t1 = _mm_and_si128(t1, ta); \ prod = _mm_xor_si128(prod, t1); \ v = _mm_srli_epi64(v, 1); } +#ifdef INTEL_SSE2 static void gf_w4_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSE2 int i; uint8_t *s8, *d8; uint8_t vrev; - uint64_t amask; - __m128i pp, m1, m2, ta, prod, t1, t2, tp, one, v; + __m128i pp, m1, ta, prod, t1, t2, tp, one, v; struct gf_bytwo_data *btd; gf_region_data rd; @@ -950,7 +947,6 @@ gf_w4_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v pp = _mm_set1_epi8(btd->prim_poly&0xff); m1 = _mm_set1_epi8((btd->mask1)&0xff); - m2 = _mm_set1_epi8((btd->mask2)&0xff); one = _mm_set1_epi8(1); while (d8 < (uint8_t *) rd.d_top) { @@ -967,8 +963,8 @@ gf_w4_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v s8 += 16; } gf_do_final_region_alignment(&rd); -#endif } +#endif /* static @@ -1036,354 +1032,330 @@ gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v } */ +#ifdef INTEL_SSE2 static void gf_w4_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) { -#ifdef INTEL_SSE2 - int i; - uint8_t *d8, *s8, tb; - __m128i pp, m1, m2, t1, t2, va, vb; + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va; s8 = (uint8_t *) rd->s_start; d8 = (uint8_t *) rd->d_start; pp = _mm_set1_epi8(btd->prim_poly&0xff); m1 = _mm_set1_epi8((btd->mask1)&0xff); - m2 = _mm_set1_epi8((btd->mask2)&0xff); while (d8 < (uint8_t *) rd->d_top) { va = _mm_load_si128 ((__m128i *)(s8)); - SSE_AB2(pp, m1, m2, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); _mm_store_si128((__m128i *)d8, va); d8 += 16; s8 += 16; } -#endif } +#endif +#ifdef INTEL_SSE2 static void gf_w4_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_bytwo_data *btd) { -#ifdef INTEL_SSE2 - int i; - uint8_t *d8, *s8, tb; - __m128i pp, m1, m2, t1, t2, va, vb; + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va, vb; s8 = (uint8_t *) rd->s_start; d8 = (uint8_t *) rd->d_start; pp = _mm_set1_epi8(btd->prim_poly&0xff); m1 = _mm_set1_epi8((btd->mask1)&0xff); - m2 = _mm_set1_epi8((btd->mask2)&0xff); while (d8 < (uint8_t *) rd->d_top) { va = _mm_load_si128 ((__m128i *)(s8)); - SSE_AB2(pp, m1, m2, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); vb = _mm_load_si128 ((__m128i *)(d8)); vb = _mm_xor_si128(vb, va); _mm_store_si128((__m128i *)d8, vb); d8 += 16; s8 += 16; } -#endif } +#endif +#ifdef INTEL_SSE2 static void gf_w4_bytwo_b_sse_region_4_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) { -#ifdef INTEL_SSE2 - int i; - uint8_t *d8, *s8, tb; - __m128i pp, m1, m2, t1, t2, va, vb; + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va; s8 = (uint8_t *) rd->s_start; d8 = (uint8_t *) rd->d_start; pp = _mm_set1_epi8(btd->prim_poly&0xff); m1 = _mm_set1_epi8((btd->mask1)&0xff); - m2 = _mm_set1_epi8((btd->mask2)&0xff); while (d8 < (uint8_t *) rd->d_top) { va = _mm_load_si128 ((__m128i *)(s8)); - SSE_AB2(pp, m1, m2, va, t1, t2); - SSE_AB2(pp, m1, m2, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); _mm_store_si128((__m128i *)d8, va); d8 += 16; s8 += 16; } -#endif } +#endif +#ifdef INTEL_SSE2 static void gf_w4_bytwo_b_sse_region_4_xor(gf_region_data *rd, struct gf_bytwo_data *btd) { -#ifdef INTEL_SSE2 - int i; - uint8_t *d8, *s8, tb; - __m128i pp, m1, m2, t1, t2, va, vb; + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va, vb; s8 = (uint8_t *) rd->s_start; d8 = (uint8_t *) rd->d_start; pp = _mm_set1_epi8(btd->prim_poly&0xff); m1 = _mm_set1_epi8((btd->mask1)&0xff); - m2 = _mm_set1_epi8((btd->mask2)&0xff); while (d8 < (uint8_t *) rd->d_top) { va = _mm_load_si128 ((__m128i *)(s8)); - SSE_AB2(pp, m1, m2, va, t1, t2); - SSE_AB2(pp, m1, m2, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); vb = _mm_load_si128 ((__m128i *)(d8)); vb = _mm_xor_si128(vb, va); _mm_store_si128((__m128i *)d8, vb); d8 += 16; s8 += 16; } -#endif } +#endif +#ifdef INTEL_SSE2 static void gf_w4_bytwo_b_sse_region_3_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) { -#ifdef INTEL_SSE2 - int i; - uint8_t *d8, *s8, tb; - __m128i pp, m1, m2, t1, t2, va, vb; + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va, vb; s8 = (uint8_t *) rd->s_start; d8 = (uint8_t *) rd->d_start; pp = _mm_set1_epi8(btd->prim_poly&0xff); m1 = _mm_set1_epi8((btd->mask1)&0xff); - m2 = _mm_set1_epi8((btd->mask2)&0xff); while (d8 < (uint8_t *) rd->d_top) { va = _mm_load_si128 ((__m128i *)(s8)); vb = va; - SSE_AB2(pp, m1, m2, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); va = _mm_xor_si128(va, vb); _mm_store_si128((__m128i *)d8, va); d8 += 16; s8 += 16; } -#endif } +#endif +#ifdef INTEL_SSE2 static void gf_w4_bytwo_b_sse_region_3_xor(gf_region_data *rd, struct gf_bytwo_data *btd) { -#ifdef INTEL_SSE2 - int i; - uint8_t *d8, *s8, tb; - __m128i pp, m1, m2, t1, t2, va, vb; + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va, vb; s8 = (uint8_t *) rd->s_start; d8 = (uint8_t *) rd->d_start; pp = _mm_set1_epi8(btd->prim_poly&0xff); m1 = _mm_set1_epi8((btd->mask1)&0xff); - m2 = _mm_set1_epi8((btd->mask2)&0xff); while (d8 < (uint8_t *) rd->d_top) { va = _mm_load_si128 ((__m128i *)(s8)); vb = _mm_xor_si128(_mm_load_si128 ((__m128i *)(d8)), va); - SSE_AB2(pp, m1, m2, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); vb = _mm_xor_si128(vb, va); _mm_store_si128((__m128i *)d8, vb); d8 += 16; s8 += 16; } -#endif } +#endif +#ifdef INTEL_SSE2 static void gf_w4_bytwo_b_sse_region_5_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) { -#ifdef INTEL_SSE2 - int i; - uint8_t *d8, *s8, tb; - __m128i pp, m1, m2, t1, t2, va, vb; + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va, vb; s8 = (uint8_t *) rd->s_start; d8 = (uint8_t *) rd->d_start; pp = _mm_set1_epi8(btd->prim_poly&0xff); m1 = _mm_set1_epi8((btd->mask1)&0xff); - m2 = _mm_set1_epi8((btd->mask2)&0xff); while (d8 < (uint8_t *) rd->d_top) { va = _mm_load_si128 ((__m128i *)(s8)); vb = va; - SSE_AB2(pp, m1, m2, va, t1, t2); - SSE_AB2(pp, m1, m2, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); va = _mm_xor_si128(va, vb); _mm_store_si128((__m128i *)d8, va); d8 += 16; s8 += 16; } -#endif } +#endif +#ifdef INTEL_SSE2 static void gf_w4_bytwo_b_sse_region_5_xor(gf_region_data *rd, struct gf_bytwo_data *btd) { -#ifdef INTEL_SSE2 - int i; - uint8_t *d8, *s8, tb; - __m128i pp, m1, m2, t1, t2, va, vb; + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va, vb; s8 = (uint8_t *) rd->s_start; d8 = (uint8_t *) rd->d_start; pp = _mm_set1_epi8(btd->prim_poly&0xff); m1 = _mm_set1_epi8((btd->mask1)&0xff); - m2 = _mm_set1_epi8((btd->mask2)&0xff); while (d8 < (uint8_t *) rd->d_top) { va = _mm_load_si128 ((__m128i *)(s8)); vb = _mm_xor_si128(_mm_load_si128 ((__m128i *)(d8)), va); - SSE_AB2(pp, m1, m2, va, t1, t2); - SSE_AB2(pp, m1, m2, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); vb = _mm_xor_si128(vb, va); _mm_store_si128((__m128i *)d8, vb); d8 += 16; s8 += 16; } -#endif } +#endif +#ifdef INTEL_SSE2 static void gf_w4_bytwo_b_sse_region_7_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) { -#ifdef INTEL_SSE2 - int i; - uint8_t *d8, *s8, tb; - __m128i pp, m1, m2, t1, t2, va, vb; + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va, vb; s8 = (uint8_t *) rd->s_start; d8 = (uint8_t *) rd->d_start; pp = _mm_set1_epi8(btd->prim_poly&0xff); m1 = _mm_set1_epi8((btd->mask1)&0xff); - m2 = _mm_set1_epi8((btd->mask2)&0xff); while (d8 < (uint8_t *) rd->d_top) { va = _mm_load_si128 ((__m128i *)(s8)); vb = va; - SSE_AB2(pp, m1, m2, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); vb = _mm_xor_si128(va, vb); - SSE_AB2(pp, m1, m2, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); va = _mm_xor_si128(va, vb); _mm_store_si128((__m128i *)d8, va); d8 += 16; s8 += 16; } -#endif } +#endif +#ifdef INTEL_SSE2 static void gf_w4_bytwo_b_sse_region_7_xor(gf_region_data *rd, struct gf_bytwo_data *btd) { -#ifdef INTEL_SSE2 - int i; - uint8_t *d8, *s8, tb; - __m128i pp, m1, m2, t1, t2, va, vb; + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va, vb; s8 = (uint8_t *) rd->s_start; d8 = (uint8_t *) rd->d_start; pp = _mm_set1_epi8(btd->prim_poly&0xff); m1 = _mm_set1_epi8((btd->mask1)&0xff); - m2 = _mm_set1_epi8((btd->mask2)&0xff); while (d8 < (uint8_t *) rd->d_top) { va = _mm_load_si128 ((__m128i *)(s8)); vb = _mm_xor_si128(_mm_load_si128 ((__m128i *)(d8)), va); - SSE_AB2(pp, m1, m2, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); vb = _mm_xor_si128(vb, va); - SSE_AB2(pp, m1, m2, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); vb = _mm_xor_si128(vb, va); _mm_store_si128((__m128i *)d8, vb); d8 += 16; s8 += 16; } -#endif } +#endif +#ifdef INTEL_SSE2 static void gf_w4_bytwo_b_sse_region_6_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) { -#ifdef INTEL_SSE2 - int i; - uint8_t *d8, *s8, tb; - __m128i pp, m1, m2, t1, t2, va, vb; + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va, vb; s8 = (uint8_t *) rd->s_start; d8 = (uint8_t *) rd->d_start; pp = _mm_set1_epi8(btd->prim_poly&0xff); m1 = _mm_set1_epi8((btd->mask1)&0xff); - m2 = _mm_set1_epi8((btd->mask2)&0xff); while (d8 < (uint8_t *) rd->d_top) { va = _mm_load_si128 ((__m128i *)(s8)); - SSE_AB2(pp, m1, m2, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); vb = va; - SSE_AB2(pp, m1, m2, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); va = _mm_xor_si128(va, vb); _mm_store_si128((__m128i *)d8, va); d8 += 16; s8 += 16; } -#endif } +#endif +#ifdef INTEL_SSE2 static void gf_w4_bytwo_b_sse_region_6_xor(gf_region_data *rd, struct gf_bytwo_data *btd) { -#ifdef INTEL_SSE2 - int i; - uint8_t *d8, *s8, tb; - __m128i pp, m1, m2, t1, t2, va, vb; + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va, vb; s8 = (uint8_t *) rd->s_start; d8 = (uint8_t *) rd->d_start; pp = _mm_set1_epi8(btd->prim_poly&0xff); m1 = _mm_set1_epi8((btd->mask1)&0xff); - m2 = _mm_set1_epi8((btd->mask2)&0xff); while (d8 < (uint8_t *) rd->d_top) { va = _mm_load_si128 ((__m128i *)(s8)); - SSE_AB2(pp, m1, m2, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); vb = _mm_xor_si128(_mm_load_si128 ((__m128i *)(d8)), va); - SSE_AB2(pp, m1, m2, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); vb = _mm_xor_si128(vb, va); _mm_store_si128((__m128i *)d8, vb); d8 += 16; s8 += 16; } -#endif } +#endif +#ifdef INTEL_SSE2 static void gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSE2 uint8_t *d8, *s8, tb; __m128i pp, m1, m2, t1, t2, va, vb; struct gf_bytwo_data *btd; @@ -1464,7 +1436,7 @@ gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v if (tb & 1) vb = _mm_xor_si128(vb, va); tb >>= 1; if (tb == 0) break; - SSE_AB2(pp, m1, m2, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); } _mm_store_si128((__m128i *)d8, vb); d8 += 16; @@ -1491,16 +1463,13 @@ gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v } } gf_do_final_region_alignment(&rd); -#endif } +#endif static void gf_w4_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { - unsigned long uls, uld; - int i; - uint8_t *s8, *d8, *top; uint64_t *s64, *d64, t1, t2, ta, tb, prod; struct gf_bytwo_data *btd; gf_region_data rd; @@ -1963,10 +1932,6 @@ int gf_w4_bytwo_init(gf_t *gf) static int gf_w4_cfm_init(gf_t *gf) { - gf_internal_t *h; - - h = (gf_internal_t *) gf->scratch; - #if defined(INTEL_SSE4_PCLMUL) gf->multiply.w32 = gf_w4_clm_multiply; return 1; @@ -1986,8 +1951,6 @@ int gf_w4_shift_init(gf_t *gf) int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) { - int region_tbl_size; - int ss; int issse3 = 0; #ifdef INTEL_SSSE3 diff --git a/src/gf_w64.c b/src/gf_w64.c index 73bf164..f04daf0 100644 --- a/src/gf_w64.c +++ b/src/gf_w64.c @@ -87,20 +87,19 @@ xor) } } +#if defined(INTEL_SSE4_PCLMUL) static void gf_w64_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) { - int i, size; gf_val_64_t *s64, *d64, *top; gf_region_data rd; -#if defined(INTEL_SSE4_PCLMUL) __m128i a, b; __m128i result, r1; __m128i prim_poly; - __m128i v, w; + __m128i w; __m128i m1, m2, m3, m4; gf_internal_t * h = gf->scratch; @@ -121,7 +120,6 @@ xor) s64 = (gf_val_64_t *) rd.s_start; d64 = (gf_val_64_t *) rd.d_start; top = (gf_val_64_t *) rd.d_top; - size = bytes/sizeof(gf_val_64_t); if (xor) { while (d64 != top) { @@ -175,19 +173,18 @@ xor) } } gf_do_final_region_alignment(&rd); -#endif } +#endif +#if defined(INTEL_SSE4_PCLMUL) static void gf_w64_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) { - int i, size; gf_val_64_t *s64, *d64, *top; gf_region_data rd; -#if defined(INTEL_SSE4_PCLMUL) __m128i a, b; __m128i result, r1; __m128i prim_poly; @@ -210,7 +207,6 @@ xor) s64 = (gf_val_64_t *) rd.s_start; d64 = (gf_val_64_t *) rd.d_start; top = (gf_val_64_t *) rd.d_top; - size = bytes/sizeof(gf_val_64_t); if (xor) { while (d64 != top) { @@ -263,8 +259,8 @@ xor) } } gf_do_final_region_alignment(&rd); -#endif } +#endif static inline @@ -321,7 +317,7 @@ inline gf_val_64_t gf_w64_shift_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64) { - uint64_t pl, pr, ppl, ppr, i, pp, a, bl, br, one, lbit; + uint64_t pl, pr, ppl, ppr, i, a, bl, br, one, lbit; gf_internal_t *h; h = (gf_internal_t *) gf->scratch; @@ -468,9 +464,7 @@ gf_w64_clm_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int by { #if defined(INTEL_SSE4_PCLMUL) gf_internal_t *h; - int i, j, k; uint8_t *s8, *d8, *dtop; - uint64_t *s64, *d64; gf_region_data rd; __m128i v, b, m, prim_poly, c, fr, w, result; @@ -492,7 +486,6 @@ gf_w64_clm_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int by if (xor) { while (d8 != dtop) { - s64 = (uint64_t *) s8; b = _mm_load_si128((__m128i *) s8); result = _mm_clmulepi64_si128 (b, v, 0); c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0); @@ -521,7 +514,6 @@ gf_w64_clm_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int by } } else { while (d8 < dtop) { - s64 = (uint64_t *) s8; b = _mm_load_si128((__m128i *) s8); result = _mm_clmulepi64_si128 (b, v, 0); c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0); @@ -741,8 +733,6 @@ gf_w64_split_16_64_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint64_ static int gf_w64_shift_init(gf_t *gf) { - gf_internal_t *h; - gf->multiply.w64 = gf_w64_shift_multiply; gf->inverse.w64 = gf_w64_euclid; gf->multiply_region.w64 = gf_w64_multiply_region_from_single; @@ -752,14 +742,14 @@ int gf_w64_shift_init(gf_t *gf) static int gf_w64_cfm_init(gf_t *gf) { - gf_internal_t *h; - - h = (gf_internal_t *) gf->scratch; - gf->inverse.w64 = gf_w64_euclid; gf->multiply_region.w64 = gf_w64_multiply_region_from_single; #if defined(INTEL_SSE4_PCLMUL) + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + if ((0xfffffffe00000000ULL & h->prim_poly) == 0){ gf->multiply.w64 = gf_w64_clm_multiply_2; gf->multiply_region.w64 = gf_w64_clm_multiply_region_from_single_2; @@ -803,7 +793,6 @@ inline gf_val_64_t gf_w64_group_multiply(gf_t *gf, gf_val_64_t a, gf_val_64_t b) { - int i; uint64_t top, bot, mask, tp; int g_s, g_r, lshift, rshift; struct gf_w64_group_data *gd; @@ -854,7 +843,7 @@ static void gf_w64_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) { int i, fzb; - uint64_t a64, smask, rmask, top, bot, tp, one; + uint64_t a64, smask, rmask, top, bot, tp; int lshift, rshift, g_s, g_r; gf_region_data rd; uint64_t *s64, *d64, *dtop; @@ -936,9 +925,8 @@ inline gf_val_64_t gf_w64_group_s_equals_r_multiply(gf_t *gf, gf_val_64_t a, gf_val_64_t b) { - int i; int leftover, rs; - uint64_t p, l, ind, r, a64; + uint64_t p, l, ind, a64; int bits_left; int g_s; @@ -974,9 +962,8 @@ gf_w64_group_s_equals_r_multiply(gf_t *gf, gf_val_64_t a, gf_val_64_t b) static void gf_w64_group_s_equals_r_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) { - int i; int leftover, rs; - uint64_t p, l, ind, r, a64; + uint64_t p, l, ind, a64; int bits_left; int g_s; gf_region_data rd; @@ -1189,7 +1176,7 @@ static void gf_w64_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) { - uint64_t *s64, *d64, t1, t2, ta, prod, amask, pmask, pp; + uint64_t *s64, *d64, ta, prod, amask, pmask, pp; gf_region_data rd; gf_internal_t *h; @@ -1243,7 +1230,7 @@ static void gf_w64_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) { - uint64_t *s64, *d64, t1, t2, ta, tb, prod, amask, bmask, pp; + uint64_t *s64, *d64, ta, tb, prod, bmask, pp; gf_region_data rd; gf_internal_t *h; @@ -1374,14 +1361,13 @@ void gf_w64_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_ #endif } +#ifdef INTEL_SSE2 static void gf_w64_bytwo_b_sse_region_2_xor(gf_region_data *rd) { -#ifdef INTEL_SSE2 - int i; uint64_t one64, amask; - uint8_t *d8, *s8, tb; + uint8_t *d8, *s8; __m128i pp, m1, m2, t1, t2, va, vb; gf_internal_t *h; @@ -1405,17 +1391,16 @@ gf_w64_bytwo_b_sse_region_2_xor(gf_region_data *rd) d8 += 16; s8 += 16; } -#endif } +#endif +#ifdef INTEL_SSE2 static void gf_w64_bytwo_b_sse_region_2_noxor(gf_region_data *rd) { -#ifdef INTEL_SSE2 - int i; uint64_t one64, amask; - uint8_t *d8, *s8, tb; + uint8_t *d8, *s8; __m128i pp, m1, m2, t1, t2, va; gf_internal_t *h; @@ -1437,18 +1422,17 @@ gf_w64_bytwo_b_sse_region_2_noxor(gf_region_data *rd) d8 += 16; s8 += 16; } -#endif } +#endif +#ifdef INTEL_SSE2 static void gf_w64_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) { -#ifdef INTEL_SSE2 uint64_t itb, amask, one64; uint8_t *d8, *s8; __m128i pp, m1, m2, t1, t2, va, vb; - struct gf_w32_bytwo_data *btd; gf_region_data rd; gf_internal_t *h; @@ -1495,8 +1479,8 @@ gf_w64_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t } gf_do_final_region_alignment(&rd); -#endif } +#endif static @@ -1620,17 +1604,13 @@ static void gf_w64_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) { - unsigned long uls, uld; gf_internal_t *h = (gf_internal_t *) gf->scratch; gf_t *base_gf = h->base_gf; - int i=0; uint32_t b0 = val & 0x00000000ffffffff; uint32_t b1 = (val & 0xffffffff00000000) >> 32; uint64_t *s64, *d64; uint64_t *top; uint64_t a0, a1, a1b1; - int num_syms = bytes / 8; - int sym_divisible = bytes % 4; gf_region_data rd; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } @@ -1721,14 +1701,14 @@ int gf_w64_composite_init(gf_t *gf) return 1; } +#ifdef INTEL_SSSE3 static void gf_w64_split_4_64_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor) { -#ifdef INTEL_SSSE3 gf_internal_t *h; - int i, m, j, k, tindex; - uint64_t pp, v, s, *s64, *d64, *top; + int i, j, k; + uint64_t pp, v, *s64, *d64, *top; __m128i si, tables[16][8], p[8], v0, mask1; struct gf_split_4_64_lazy_data *ld; uint8_t btable[16]; @@ -1802,18 +1782,18 @@ gf_w64_split_4_64_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *des } } gf_do_final_region_alignment(&rd); -#endif } +#endif +#ifdef INTEL_SSE4 static void gf_w64_split_4_64_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor) { -#ifdef INTEL_SSE4 gf_internal_t *h; - int i, m, j, k, tindex; - uint64_t pp, v, s, *s64, *d64, *top; - __m128i si, tables[16][8], p[8], st[8], mask1, mask8, mask16, t1, t2; + int i, j, k; + uint64_t pp, v, *s64, *d64, *top; + __m128i si, tables[16][8], p[8], st[8], mask1, mask8, mask16, t1; struct gf_split_4_64_lazy_data *ld; uint8_t btable[16]; gf_region_data rd; @@ -2006,8 +1986,8 @@ gf_w64_split_4_64_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint } gf_do_final_region_alignment(&rd); -#endif } +#endif #define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : (p) << 1); @@ -2141,8 +2121,6 @@ int gf_w64_split_init(gf_t *gf) int gf_w64_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) { - int issse4; - switch(mult_type) { case GF_MULT_SHIFT: @@ -2162,11 +2140,9 @@ int gf_w64_scratch_size(int mult_type, int region_type, int divide_type, int arg * then fall through to split table scratch size code. */ #ifdef INTEL_SSE4 - issse4 = 1; arg1 = 64; arg2 = 4; #else - issse4 = 0; arg1 = 64; arg2 = 8; #endif @@ -2202,7 +2178,7 @@ int gf_w64_scratch_size(int mult_type, int region_type, int divide_type, int arg int gf_w64_init(gf_t *gf) { - gf_internal_t *h, *h_base, *h_base_base, *h_base_base_base; + gf_internal_t *h; int no_default_flag = 0; h = (gf_internal_t *) gf->scratch; diff --git a/src/gf_w8.c b/src/gf_w8.c index 7661aad..89ef6a2 100644 --- a/src/gf_w8.c +++ b/src/gf_w8.c @@ -216,7 +216,7 @@ gf_w8_clm_multiply_2 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8) __m128i a, b; __m128i result; __m128i prim_poly; - __m128i v, w; + __m128i w; gf_internal_t * h = gf->scratch; a = _mm_insert_epi32 (_mm_setzero_si128(), a8, 0); @@ -262,7 +262,7 @@ gf_w8_clm_multiply_3 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8) __m128i a, b; __m128i result; __m128i prim_poly; - __m128i v, w; + __m128i w; gf_internal_t * h = gf->scratch; a = _mm_insert_epi32 (_mm_setzero_si128(), a8, 0); @@ -301,7 +301,7 @@ gf_w8_clm_multiply_4 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8) __m128i a, b; __m128i result; __m128i prim_poly; - __m128i v, w; + __m128i w; gf_internal_t * h = gf->scratch; a = _mm_insert_epi32 (_mm_setzero_si128(), a8, 0); @@ -364,6 +364,7 @@ gf_w8_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_32_t v gf_do_final_region_alignment(&rd); } +#if defined(INTEL_SSE4_PCLMUL) static void gf_w8_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int @@ -373,12 +374,10 @@ gf_w8_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_ uint8_t *s8; uint8_t *d8; -#if defined(INTEL_SSE4_PCLMUL) - __m128i a, b; __m128i result; __m128i prim_poly; - __m128i v, w; + __m128i w; gf_internal_t * h = gf->scratch; prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL)); @@ -420,9 +419,10 @@ gf_w8_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_ } } gf_do_final_region_alignment(&rd); -#endif } +#endif +#if defined(INTEL_SSE4_PCLMUL) static void gf_w8_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int @@ -432,12 +432,10 @@ gf_w8_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, gf_val_ uint8_t *s8; uint8_t *d8; -#if defined(INTEL_SSE4_PCLMUL) - __m128i a, b; __m128i result; __m128i prim_poly; - __m128i v, w; + __m128i w; gf_internal_t * h = gf->scratch; prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL)); @@ -483,9 +481,10 @@ gf_w8_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, gf_val_ } } gf_do_final_region_alignment(&rd); -#endif } +#endif +#if defined(INTEL_SSE4_PCLMUL) static void gf_w8_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int @@ -495,12 +494,10 @@ gf_w8_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val_ uint8_t *s8; uint8_t *d8; -#if defined(INTEL_SSE4_PCLMUL) - __m128i a, b; __m128i result; __m128i prim_poly; - __m128i v, w; + __m128i w; gf_internal_t * h = gf->scratch; prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL)); @@ -550,8 +547,8 @@ gf_w8_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val_ } } gf_do_final_region_alignment(&rd); -#endif } +#endif /* ------------------------------------------------------------ IMPLEMENTATION: SHIFT: @@ -588,11 +585,11 @@ gf_w8_shift_multiply (gf_t *gf, uint32_t a8, uint32_t b8) static int gf_w8_cfm_init(gf_t *gf) { +#if defined(INTEL_SSE4_PCLMUL) gf_internal_t *h; h = (gf_internal_t *) gf->scratch; -#if defined(INTEL_SSE4_PCLMUL) if ((0xe0 & h->prim_poly) == 0){ gf->multiply.w32 = gf_w8_clm_multiply_2; gf->multiply_region.w32 = gf_w8_clm_multiply_region_from_single_2; @@ -731,7 +728,7 @@ static gf_w8_log_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) { int i; - uint8_t lv, b, c; + uint8_t lv; uint8_t *s8, *d8; struct gf_w8_logtable_data *ltd; @@ -760,7 +757,7 @@ static gf_w8_logzero_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) { int i; - uint8_t lv, b, c; + uint8_t lv; uint8_t *s8, *d8; struct gf_w8_logzero_table_data *ltd; struct gf_w8_logzero_small_table_data *std; @@ -802,9 +799,9 @@ gf_w8_logzero_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int int gf_w8_log_init(gf_t *gf) { gf_internal_t *h; - struct gf_w8_logtable_data *ltd; - struct gf_w8_logzero_table_data *ztd; - struct gf_w8_logzero_small_table_data *std; + struct gf_w8_logtable_data *ltd = NULL; + struct gf_w8_logzero_table_data *ztd = NULL; + struct gf_w8_logzero_small_table_data *std = NULL; uint8_t *alt; uint8_t *inv; int i, b; @@ -941,6 +938,7 @@ gf_w8_default_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) return (ftd->multtable[a][b]); } +#ifdef INTEL_SSSE3 static gf_val_32_t gf_w8_default_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) @@ -950,6 +948,7 @@ gf_w8_default_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) ftd = (struct gf_w8_default_data *) ((gf_internal_t *) gf->scratch)->private; return (ftd->divtable[a][b]); } +#endif static gf_val_32_t @@ -976,7 +975,7 @@ static gf_w8_double_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { uint16_t *base; - uint32_t b, c, prod, vc, vb; + uint32_t b, c, vc, vb; gf_internal_t *h; struct gf_w8_double_table_data *dtd; struct gf_w8_double_table_lazy_data *ltd; @@ -1033,7 +1032,6 @@ static gf_w8_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { int i; - uint8_t lv, b, c; uint8_t *s8, *d8; struct gf_w8_single_table_data *ftd; @@ -1055,14 +1053,13 @@ gf_w8_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, in } } +#ifdef INTEL_SSSE3 static void gf_w8_split_multiply_region_sse(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSSE3 - uint8_t *s8, *d8, *bh, *bl, *sptr, *dptr, *top; - __m128i tbl, loset, t1, r, va, mth, mtl; - uint64_t altable[4]; + uint8_t *bh, *bl, *sptr, *dptr; + __m128i loset, t1, r, va, mth, mtl; struct gf_w8_half_table_data *htd; gf_region_data rd; @@ -1115,8 +1112,8 @@ gf_w8_split_multiply_region_sse(gf_t *gf, void *src, void *dest, gf_val_32_t val } gf_do_final_region_alignment(&rd); -#endif } +#endif /* ------------------------------------------------------------ @@ -1137,9 +1134,7 @@ static void gf_w8_split_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { - unsigned long uls, uld; int i; - uint8_t lv, b, c; uint8_t *s8, *d8; struct gf_w8_half_table_data *htd; @@ -1167,11 +1162,10 @@ int gf_w8_split_init(gf_t *gf) { gf_internal_t *h; struct gf_w8_half_table_data *htd; - int a, b, pp; + int a, b; h = (gf_internal_t *) gf->scratch; htd = (struct gf_w8_half_table_data *)h->private; - pp = h->prim_poly; bzero(htd->high, sizeof(uint8_t)*GF_FIELD_SIZE*GF_HALF_SIZE); bzero(htd->low, sizeof(uint8_t)*GF_FIELD_SIZE*GF_HALF_SIZE); @@ -1325,13 +1319,13 @@ gf_w8_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_32_t gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32); gf_do_initial_region_alignment(&rd); - sub_reg_size = (rd.d_top - rd.d_start) / 2; + sub_reg_size = ((char*)rd.d_top - (char*)rd.d_start) / 2; base_gf->multiply_region.w32(base_gf, rd.s_start, rd.d_start, val0, sub_reg_size, xor); - base_gf->multiply_region.w32(base_gf, rd.s_start+sub_reg_size, rd.d_start, val1, sub_reg_size, 1); - base_gf->multiply_region.w32(base_gf, rd.s_start, rd.d_start+sub_reg_size, val1, sub_reg_size, xor); - base_gf->multiply_region.w32(base_gf, rd.s_start+sub_reg_size, rd.d_start+sub_reg_size, val0, sub_reg_size, 1); - base_gf->multiply_region.w32(base_gf, rd.s_start+sub_reg_size, rd.d_start+sub_reg_size, base_gf->multiply.w32(base_gf, h->prim_poly, val1), sub_reg_size, 1); + base_gf->multiply_region.w32(base_gf, (char*)rd.s_start+sub_reg_size, rd.d_start, val1, sub_reg_size, 1); + base_gf->multiply_region.w32(base_gf, rd.s_start, (char*)rd.d_start+sub_reg_size, val1, sub_reg_size, xor); + base_gf->multiply_region.w32(base_gf, (char*)rd.s_start+sub_reg_size, (char*)rd.d_start+sub_reg_size, val0, sub_reg_size, 1); + base_gf->multiply_region.w32(base_gf, (char*)rd.s_start+sub_reg_size, (char*)rd.d_start+sub_reg_size, base_gf->multiply.w32(base_gf, h->prim_poly, val1), sub_reg_size, 1); gf_do_final_region_alignment(&rd); } @@ -1361,7 +1355,6 @@ gf_val_32_t gf_w8_composite_multiply_inline(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; uint8_t b0 = b & 0x0f; uint8_t b1 = (b & 0xf0) >> 4; uint8_t a0 = a & 0x0f; @@ -1674,15 +1667,14 @@ gf_w8_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t prod = _mm_xor_si128(prod, t1); \ v = _mm_srli_epi64(v, 1); } +#ifdef INTEL_SSE2 static void gf_w8_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSE2 int i; uint8_t *s8, *d8; uint8_t vrev; - uint64_t amask; __m128i pp, m1, m2, ta, prod, t1, t2, tp, one, v; struct gf_w8_bytwo_data *btd; gf_region_data rd; @@ -1727,17 +1719,16 @@ gf_w8_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v s8 += 16; } gf_do_final_region_alignment(&rd); -#endif } +#endif +#ifdef INTEL_SSE2 static void gf_w8_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w8_bytwo_data *btd) { -#ifdef INTEL_SSE2 - int i; - uint8_t *d8, *s8, tb; - __m128i pp, m1, m2, t1, t2, va, vb; + uint8_t *d8, *s8; + __m128i pp, m1, m2, t1, t2, va; s8 = (uint8_t *) rd->s_start; d8 = (uint8_t *) rd->d_start; @@ -1753,16 +1744,15 @@ gf_w8_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w8_bytwo_data *bt d8 += 16; s8 += 16; } -#endif } +#endif +#ifdef INTEL_SSE2 static void gf_w8_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w8_bytwo_data *btd) { -#ifdef INTEL_SSE2 - int i; - uint8_t *d8, *s8, tb; + uint8_t *d8, *s8; __m128i pp, m1, m2, t1, t2, va, vb; s8 = (uint8_t *) rd->s_start; @@ -1781,15 +1771,15 @@ gf_w8_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w8_bytwo_data *btd) d8 += 16; s8 += 16; } -#endif } +#endif +#ifdef INTEL_SSE2 static void gf_w8_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSE2 int itb; uint8_t *d8, *s8; __m128i pp, m1, m2, t1, t2, va, vb; @@ -1837,15 +1827,13 @@ gf_w8_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v } gf_do_final_region_alignment(&rd); -#endif } +#endif static void gf_w8_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { - int i; - uint8_t *s8, *d8, *top; uint64_t *s64, *d64, t1, t2, ta, tb, prod; struct gf_w8_bytwo_data *btd; gf_region_data rd; @@ -2362,7 +2350,7 @@ int gf_w8_scratch_size(int mult_type, int region_type, int divide_type, int arg1 int gf_w8_init(gf_t *gf) { - gf_internal_t *h, *h_base; + gf_internal_t *h; h = (gf_internal_t *) gf->scratch; @@ -2454,11 +2442,9 @@ uint8_t *gf_w8_get_mult_table(gf_t *gf) uint8_t *gf_w8_get_div_table(gf_t *gf) { - gf_internal_t *h; struct gf_w8_default_data *ftd; struct gf_w8_single_table_data *std; - h = (gf_internal_t *) gf->scratch; if (gf->multiply.w32 == gf_w8_default_multiply) { ftd = (struct gf_w8_default_data *) ((gf_internal_t *) gf->scratch)->private; return (uint8_t *) ftd->divtable; diff --git a/src/gf_wgen.c b/src/gf_wgen.c index f5e22e0..68c6bb0 100644 --- a/src/gf_wgen.c +++ b/src/gf_wgen.c @@ -284,9 +284,8 @@ inline gf_val_32_t gf_wgen_group_s_equals_r_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { - int i; int leftover, rs; - uint32_t p, l, ind, r, a32; + uint32_t p, l, ind, a32; int bits_left; int g_s; int w; @@ -362,7 +361,7 @@ gf_wgen_group_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { int i; int leftover; - uint64_t p, l, r, mask; + uint64_t p, l, r; uint32_t a32, ind; int g_s, g_r; struct gf_wgen_group_data *gd; @@ -496,7 +495,7 @@ int gf_wgen_table_8_init(gf_t *gf) gf_internal_t *h; int w; struct gf_wgen_table_w8_data *std; - uint32_t a, b, p, pp; + uint32_t a, b, p; h = (gf_internal_t *) gf->scratch; w = h->w; @@ -557,7 +556,7 @@ int gf_wgen_table_16_init(gf_t *gf) gf_internal_t *h; int w; struct gf_wgen_table_w16_data *std; - uint32_t a, b, p, pp; + uint32_t a, b, p; h = (gf_internal_t *) gf->scratch; w = h->w; @@ -917,11 +916,11 @@ gf_wgen_cauchy_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int byte for (i = 0; i < h->w; i++) { for (j = 0; j < h->w; j++) { if (val & (1 << j)) { - gf_multby_one(src, dest + j*rs, rs, (written & (1 << j))); + gf_multby_one(src, ((char*)dest) + j*rs, rs, (written & (1 << j))); written |= (1 << j); } } - src += rs; + src = (char*)src + rs; val = gf->multiply.w32(gf, val, 2); } } diff --git a/test/gf_unit.c b/test/gf_unit.c index cf466fe..deaaced 100644 --- a/test/gf_unit.c +++ b/test/gf_unit.c @@ -70,16 +70,16 @@ int main(int argc, char **argv) { signal(SIGSEGV, SigHandler); - int w, i, verbose, single, region, tested, top; + int w, i, verbose, single, region, top; int s_start, d_start, bytes, xor, alignment_test; gf_t gf, gf_def; time_t t0; gf_internal_t *h; - gf_general_t *a, *b, *c, *d, *ai, *bi; - uint8_t a8, b8, c8, *mult4, *div4, *mult8, *div8; - uint16_t a16, b16, c16, d16, *log16, *alog16; - char as[50], bs[50], cs[50], ds[50], ais[50], bis[50]; - uint32_t mask; + gf_general_t *a, *b, *c, *d; + uint8_t a8, b8, c8, *mult4 = NULL, *mult8 = NULL; + uint16_t a16, b16, c16, *log16 = NULL, *alog16 = NULL; + char as[50], bs[50], cs[50], ds[50]; + uint32_t mask = 0; char *ra, *rb, *rc, *rd, *target; int align; @@ -115,8 +115,6 @@ int main(int argc, char **argv) b = (gf_general_t *) malloc(sizeof(gf_general_t)); c = (gf_general_t *) malloc(sizeof(gf_general_t)); d = (gf_general_t *) malloc(sizeof(gf_general_t)); - ai = (gf_general_t *) malloc(sizeof(gf_general_t)); - bi = (gf_general_t *) malloc(sizeof(gf_general_t)); //15 bytes extra to make sure it's 16byte aligned ra = (char *) malloc(sizeof(char)*REGION_SIZE+15); @@ -145,12 +143,10 @@ int main(int argc, char **argv) problem("No default for this value of w"); if (w == 4) { mult4 = gf_w4_get_mult_table(&gf); - div4 = gf_w4_get_div_table(&gf); } if (w == 8) { mult8 = gf_w8_get_mult_table(&gf); - div8 = gf_w8_get_div_table(&gf); } if (w == 16) { @@ -240,7 +236,6 @@ int main(int argc, char **argv) } } - tested = 0; gf_general_multiply(&gf, a, b, c); /* If w is 4, 8 or 16, then there are inline multiplication/division methods. @@ -285,7 +280,6 @@ int main(int argc, char **argv) /* If this is not composite, then first test against the default: */ if (h->mult_type != GF_MULT_COMPOSITE) { - tested = 1; gf_general_multiply(&gf_def, a, b, d); if (!gf_general_are_equal(c, d, w)) { @@ -306,7 +300,6 @@ int main(int argc, char **argv) if (gf_general_is_zero(a, w) || gf_general_is_zero(b, w) || gf_general_is_one(a, w) || gf_general_is_one(b, w)) { - tested = 1; if (((gf_general_is_zero(a, w) || gf_general_is_zero(b, w)) && !gf_general_is_zero(c, w)) || (gf_general_is_one(a, w) && !gf_general_are_equal(b, c, w)) || (gf_general_is_one(b, w) && !gf_general_are_equal(a, c, w))) { @@ -429,4 +422,5 @@ int main(int argc, char **argv) gf_general_do_region_check(&gf, a, rc+s_start, rd+d_start, target+d_start, bytes, xor); } } + return 0; } diff --git a/tools/gf_add.c b/tools/gf_add.c index b900e69..28cc12c 100644 --- a/tools/gf_add.c +++ b/tools/gf_add.c @@ -62,7 +62,7 @@ void print_128(uint64_t *v) int main(int argc, char **argv) { - int hex, al, bl, w; + int hex, w; uint32_t a, b, c, top; uint64_t a64, b64, c64; uint64_t a128[2], b128[2], c128[2]; diff --git a/tools/gf_inline_time.c b/tools/gf_inline_time.c index e64f0b3..c81e8a9 100644 --- a/tools/gf_inline_time.c +++ b/tools/gf_inline_time.c @@ -62,8 +62,8 @@ int main(int argc, char **argv) int w, j, i, size, iterations; gf_t gf; double timer, elapsed, dnum, num; - uint8_t *ra, *rb, *mult4, *mult8; - uint16_t *ra16, *rb16, *log16, *alog16; + uint8_t *ra = NULL, *rb = NULL, *mult4, *mult8; + uint16_t *ra16 = NULL, *rb16 = NULL, *log16, *alog16; time_t t0; if (argc != 5) usage(NULL); @@ -164,4 +164,5 @@ int main(int argc, char **argv) printf("Inline mult: %10.6lf s Mops: %10.3lf %10.3lf Mega-ops/s\n", elapsed, dnum/1024.0/1024.0, dnum/1024.0/1024.0/elapsed); } + return 0; } diff --git a/tools/gf_methods.c b/tools/gf_methods.c index 3afb438..6664bec 100644 --- a/tools/gf_methods.c +++ b/tools/gf_methods.c @@ -76,7 +76,7 @@ int main(int argc, char *argv[]) int listing; char *gf_argv[50], *x; gf_t gf; - char divs[200], ks[10], ls[10]; + char ls[10]; char * w_str; if (argc != 4) usage(NULL); diff --git a/tools/gf_poly.c b/tools/gf_poly.c index e19706c..44a24ac 100644 --- a/tools/gf_poly.c +++ b/tools/gf_poly.c @@ -84,7 +84,6 @@ int gcd_one(gf_t *gf, int w, int n, gf_general_t *poly, gf_general_t *prod) { gf_general_t *a, *b, zero, factor, p; int i, j, da, db; - char buf[30]; gf_general_set_zero(&zero, w); @@ -123,7 +122,6 @@ void x_to_q_to_i_minus_x(gf_t *gf, int w, int n, gf_general_t *poly, int logq, i gf_general_t *product; gf_general_t p, zero, factor; int j, k, lq; - char buf[20]; gf_general_set_zero(&zero, w); product = (gf_general_t *) malloc(sizeof(gf_general_t) * n*2); @@ -181,9 +179,9 @@ void x_to_q_to_i_minus_x(gf_t *gf, int w, int n, gf_general_t *poly, int logq, i free(x_to_q); } -main(int argc, char **argv) +int main(int argc, char **argv) { - int w, i, power, n, ap, success, j; + int w, i, power, n, ap, success; gf_t gf; gf_general_t *poly, *prod; char *string, *ptr; diff --git a/tools/gf_time.c b/tools/gf_time.c index 2bd2d04..4becc8d 100644 --- a/tools/gf_time.c +++ b/tools/gf_time.c @@ -119,7 +119,7 @@ int main(int argc, char **argv) if (!create_gf_from_argv(&gf, w, argc, argv, 6)) usage(BM); strcpy(tests, ""); - for (i = 0; i < argv[2][i] != '\0'; i++) { + for (i = 0; argv[2][i] != '\0'; i++) { switch(argv[2][i]) { case 'A': strcat(tests, single_tests); strcat(tests, region_tests); @@ -163,8 +163,8 @@ int main(int argc, char **argv) for (i = 0; i < 3; i++) { test = single_tests[i]; if (strchr(tests, test) != NULL) { - if (tmethods[test] == NULL) { - printf("No %s method.\n", tstrings[test]); + if (tmethods[(int)test] == NULL) { + printf("No %s method.\n", tstrings[(int)test]); } else { elapsed = 0; dnum = 0; @@ -176,7 +176,7 @@ int main(int argc, char **argv) elapsed += timer_split(&timer); } printf("%14s: %10.6lf s Mops: %10.3lf %10.3lf Mega-ops/s\n", - tstrings[test], elapsed, + tstrings[(int)test], elapsed, dnum/1024.0/1024.0, dnum/1024.0/1024.0/elapsed); } } @@ -185,8 +185,8 @@ int main(int argc, char **argv) for (i = 0; i < 4; i++) { test = region_tests[i]; if (strchr(tests, test) != NULL) { - if (tmethods[test] == NULL) { - printf("No %s method.\n", tstrings[test]); + if (tmethods[(int)test] == NULL) { + printf("No %s method.\n", tstrings[(int)test]); } else { elapsed = 0; @@ -204,10 +204,11 @@ int main(int argc, char **argv) elapsed += timer_split(&timer); } printf("%14s: XOR: %d %10.6lf s MB: %10.3lf %10.3lf MB/s\n", - tstrings[test], xor, elapsed, + tstrings[(int)test], xor, elapsed, ds*di/1024.0/1024.0, ds*di/1024.0/1024.0/elapsed); } } } } + return 0; } |