From d315d20d569f7da176eb7445b8c21ea055083f06 Mon Sep 17 00:00:00 2001 From: Janne Grunau Date: Thu, 7 Aug 2014 00:52:55 +0200 Subject: build: make CFLAGS user setable There is no need to force the non-default CFLAGS on users trying to set them via enviroment variable or on configure command. --- configure.ac | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/configure.ac b/configure.ac index 9f33852..02a62b8 100644 --- a/configure.ac +++ b/configure.ac @@ -3,6 +3,9 @@ # FIXME - add project url as the last argument AC_INIT(gf-complete, 1.0) +# Override default CFLAGS +: ${CFLAGS="-Wall -Wpointer-arith -O3 -g"} + AC_PREREQ([2.61]) AM_INIT_AUTOMAKE([no-dependencies foreign]) @@ -16,9 +19,6 @@ AC_CONFIG_MACRO_DIR([m4]) # This prevents './configure; make' from trying to run autotools. AM_MAINTAINER_MODE([disable]) -# Override default CFLAGS -CFLAGS="-Wall -Wpointer-arith -O3 -g" - dnl Compiling with per-target flags requires AM_PROG_CC_C_O. AC_PROG_CC -- cgit v1.2.1 From f6828cfbc1bf24d686e6e24ce9822e69f824351d Mon Sep 17 00:00:00 2001 From: Janne Grunau Date: Thu, 7 Aug 2014 00:54:21 +0200 Subject: build: fix out of source tree build --- examples/Makefile.am | 4 ++-- src/Makefile.am | 4 ++-- test/Makefile.am | 4 ++-- tools/Makefile.am | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/Makefile.am b/examples/Makefile.am index fd547d2..a420bda 100644 --- a/examples/Makefile.am +++ b/examples/Makefile.am @@ -1,7 +1,7 @@ # GF-Complete 'examples' AM file -AM_CPPFLAGS=-I./ -I../include -AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC $(INCLUDES) +AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include +AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC bin_PROGRAMS = gf_example_1 gf_example_2 gf_example_3 gf_example_4 \ gf_example_5 gf_example_6 gf_example_7 diff --git a/src/Makefile.am b/src/Makefile.am index ba3ad5e..34633ea 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1,8 +1,8 @@ # GF-Complete 'core' AM file # Creates the library -AM_CPPFLAGS=-I./ -I../include -AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC $(INCLUDES) +AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include +AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC lib_LTLIBRARIES = libgf_complete.la libgf_complete_la_SOURCES = gf.c gf_method.c gf_wgen.c gf_w4.c gf_w8.c gf_w16.c gf_w32.c \ diff --git a/test/Makefile.am b/test/Makefile.am index 7f39a48..2791528 100644 --- a/test/Makefile.am +++ b/test/Makefile.am @@ -1,7 +1,7 @@ # GF-Complete 'test' AM file -AM_CPPFLAGS=-I./ -I../include -AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC $(INCLUDES) +AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include +AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC bin_PROGRAMS = gf_unit diff --git a/tools/Makefile.am b/tools/Makefile.am index 7c55d65..d502623 100644 --- a/tools/Makefile.am +++ b/tools/Makefile.am @@ -1,7 +1,7 @@ # GF-Complete 'tools' AM file -AM_CPPFLAGS=-I./ -I../include -AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC $(INCLUDES) +AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include +AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC TESTS=run-tests.sh -- cgit v1.2.1 From 2a2f1e306f1759e5e52771a643a3a2df54552069 Mon Sep 17 00:00:00 2001 From: Janne Grunau Date: Wed, 24 Sep 2014 16:02:19 +0200 Subject: check: split unit tests and support paralell execution --- configure.ac | 2 +- tools/Makefile.am | 18 ++++++++++++++++-- tools/run-tests.sh | 9 --------- 3 files changed, 17 insertions(+), 12 deletions(-) delete mode 100755 tools/run-tests.sh diff --git a/configure.ac b/configure.ac index 02a62b8..47d5d62 100644 --- a/configure.ac +++ b/configure.ac @@ -8,7 +8,7 @@ AC_INIT(gf-complete, 1.0) AC_PREREQ([2.61]) -AM_INIT_AUTOMAKE([no-dependencies foreign]) +AM_INIT_AUTOMAKE([no-dependencies foreign parallel-tests]) LT_INIT # libtool AC_CONFIG_HEADER(include/config.h) diff --git a/tools/Makefile.am b/tools/Makefile.am index d502623..eb27d4a 100644 --- a/tools/Makefile.am +++ b/tools/Makefile.am @@ -3,8 +3,6 @@ AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC -TESTS=run-tests.sh - bin_PROGRAMS = gf_mult gf_div gf_add gf_time gf_methods gf_poly gf_inline_time gf_mult_SOURCES = gf_mult.c @@ -35,3 +33,19 @@ gf_inline_time_SOURCES = gf_inline_time.c #gf_inline_time_LDFLAGS = -lgf_complete gf_inline_time_LDADD = ../src/libgf_complete.la +# gf_unit tests as generated by gf_methods +gf_unit_w%.sh: gf_methods + ./$^ $(@:gf_unit_w%.sh=%) -A -U > $@ || rm $@ + +TESTS = gf_unit_w128.sh \ + gf_unit_w64.sh \ + gf_unit_w32.sh \ + gf_unit_w16.sh \ + gf_unit_w8.sh \ + gf_unit_w4.sh + +TEST_EXTENSIONS = .sh +SH_LOG_COMPILER = $(SHELL) +AM_SH_LOG_FLAGS = -e + +CLEANFILES = $(TESTS) diff --git a/tools/run-tests.sh b/tools/run-tests.sh deleted file mode 100755 index bd3cc60..0000000 --- a/tools/run-tests.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - -for w in 4 8 16 32 64 128 ; do - ./gf_methods $w -A -U | sh -e - if [ $? != "0" ] ; then - echo "Failed unit tests for w=$w" - break - fi -done -- cgit v1.2.1 From 568df90edc6ae07744de45de8665fb86ce6c84ee Mon Sep 17 00:00:00 2001 From: Janne Grunau Date: Fri, 19 Sep 2014 12:30:57 +0200 Subject: simd: rename the region flags from SSE to SIMD SSE is not the only supported SIMD instruction set. Keep the old names for backward compatibility. --- include/gf_complete.h | 2 ++ include/gf_int.h | 10 +++--- src/gf.c | 91 ++++++++++++++++++++++++++------------------------- src/gf_method.c | 10 ++++-- src/gf_w128.c | 6 ++-- src/gf_w16.c | 14 ++++---- src/gf_w32.c | 14 ++++---- src/gf_w4.c | 12 +++---- src/gf_w64.c | 16 ++++----- src/gf_w8.c | 12 +++---- tools/gf_methods.c | 2 +- 11 files changed, 99 insertions(+), 90 deletions(-) diff --git a/include/gf_complete.h b/include/gf_complete.h index 5806625..e8ea2ca 100644 --- a/include/gf_complete.h +++ b/include/gf_complete.h @@ -61,7 +61,9 @@ typedef enum {GF_MULT_DEFAULT, #define GF_REGION_DOUBLE_TABLE (0x1) #define GF_REGION_QUAD_TABLE (0x2) #define GF_REGION_LAZY (0x4) +#define GF_REGION_SIMD (0x8) #define GF_REGION_SSE (0x8) +#define GF_REGION_NOSIMD (0x10) #define GF_REGION_NOSSE (0x10) #define GF_REGION_ALTMAP (0x20) #define GF_REGION_CAUCHY (0x40) diff --git a/include/gf_int.h b/include/gf_int.h index 98294cc..32866f4 100644 --- a/include/gf_int.h +++ b/include/gf_int.h @@ -113,7 +113,7 @@ typedef enum {GF_E_MDEFDIV, /* Dev != Default && Mult == Default */ GF_E_DIVCOMP, /* Mult == Composite && Div != Default */ GF_E_CAUCOMP, /* Mult == Composite && Reg == CAUCHY */ GF_E_DOUQUAD, /* Reg == DOUBLE && Reg == QUAD */ - GF_E_SSE__NO, /* Reg == SSE && Reg == NOSSE */ + GF_E_SIMD_NO, /* Reg == SIMD && Reg == NOSIMD */ GF_E_CAUCHYB, /* Reg == CAUCHY && Other Reg */ GF_E_CAUGT32, /* Reg == CAUCHY && w > 32*/ GF_E_ARG1SET, /* Arg1 != 0 && Mult \notin COMPOSITE/SPLIT/GROUP */ @@ -129,9 +129,9 @@ typedef enum {GF_E_MDEFDIV, /* Dev != Default && Mult == Default */ GF_E_QUAD__J, /* Reg == QUAD && other Reg */ GF_E_LAZY__X, /* Reg == LAZY && not DOUBLE or QUAD*/ GF_E_ALTSHIF, /* Mult == Shift && Reg == ALTMAP */ - GF_E_SSESHIF, /* Mult == Shift && Reg == SSE|NOSSE */ + GF_E_SSESHIF, /* Mult == Shift && Reg == SIMD|NOSIMD */ GF_E_ALT_CFM, /* Mult == CARRY_FREE && Reg == ALTMAP */ - GF_E_SSE_CFM, /* Mult == CARRY_FREE && Reg == SSE|NOSSE */ + GF_E_SSE_CFM, /* Mult == CARRY_FREE && Reg == SIMD|NOSIMD */ GF_E_PCLMULX, /* Mult == Carry_Free && No PCLMUL */ GF_E_ALT_BY2, /* Mult == Bytwo_x && Reg == ALTMAP */ GF_E_BY2_SSE, /* Mult == Bytwo_x && Reg == SSE && No SSE2 */ @@ -148,7 +148,7 @@ typedef enum {GF_E_MDEFDIV, /* Dev != Default && Mult == Default */ GF_E_GR_AR_W, /* Mult == GROUP, either arg > w */ GF_E_GR____J, /* Mult == GROUP, Reg == SSE|ALTMAP|NOSSE */ GF_E_TABLE_W, /* Mult == TABLE, w too big */ - GF_E_TAB_SSE, /* Mult == TABLE, SSE|NOSSE only apply to w == 4 */ + GF_E_TAB_SSE, /* Mult == TABLE, SIMD|NOSIMD only apply to w == 4 */ GF_E_TABSSE3, /* Mult == TABLE, Need SSSE3 for SSE */ GF_E_TAB_ALT, /* Mult == TABLE, Reg == ALTMAP */ GF_E_SP128AR, /* Mult == SPLIT, w=128, Bad arg1/arg2 */ @@ -172,7 +172,7 @@ typedef enum {GF_E_MDEFDIV, /* Dev != Default && Mult == Default */ GF_E_SP_8__A, /* Mult == SPLIT, w=8, no ALTMAP */ GF_E_SP_SSE3, /* Mult == SPLIT, Need SSSE3 for SSE */ GF_E_COMP_A2, /* Mult == COMP, arg1 must be = 2 */ - GF_E_COMP_SS, /* Mult == COMP, SSE|NOSSE */ + GF_E_COMP_SS, /* Mult == COMP, SIMD|NOSIMD */ GF_E_COMP__W, /* Mult == COMP, Bad w. */ GF_E_UNKFLAG, /* Unknown flag in create_from.... */ GF_E_UNKNOWN, /* Unknown mult_type. */ diff --git a/src/gf.c b/src/gf.c index 10c9b3c..ca6a7f8 100644 --- a/src/gf.c +++ b/src/gf.c @@ -41,7 +41,7 @@ void gf_error() case GF_E_MDEFARG: s = "If multiplication method == default, can't use arg1/arg2."; break; case GF_E_DIVCOMP: s = "Cannot change the division technique with -m COMPOSITE."; break; case GF_E_DOUQUAD: s = "Cannot specify -r DOUBLE and -r QUAD."; break; - case GF_E_SSE__NO: s = "Cannot specify -r SSE and -r NOSSE."; break; + case GF_E_SIMD_NO: s = "Cannot specify -r SIMD and -r NOSIMD."; break; case GF_E_CAUCHYB: s = "Cannot specify -r CAUCHY and any other -r."; break; case GF_E_CAUCOMP: s = "Cannot specify -m COMPOSITE and -r CAUCHY."; break; case GF_E_CAUGT32: s = "Cannot specify -r CAUCHY with w > 32."; break; @@ -51,23 +51,23 @@ void gf_error() case GF_E_BAD___W: s = "W must be 1-32, 64 or 128."; break; case GF_E_DOUBLET: s = "Can only specify -r DOUBLE with -m TABLE."; break; case GF_E_DOUBLEW: s = "Can only specify -r DOUBLE w = 4 or w = 8."; break; - case GF_E_DOUBLEJ: s = "Cannot specify -r DOUBLE with -r ALTMAP|SSE|NOSSE."; break; + case GF_E_DOUBLEJ: s = "Cannot specify -r DOUBLE with -r ALTMAP|SIMD|NOSIMD."; break; case GF_E_DOUBLEL: s = "Can only specify -r DOUBLE -r LAZY with w = 8"; break; case GF_E_QUAD__T: s = "Can only specify -r QUAD with -m TABLE."; break; case GF_E_QUAD__W: s = "Can only specify -r QUAD w = 4."; break; - case GF_E_QUAD__J: s = "Cannot specify -r QUAD with -r ALTMAP|SSE|NOSSE."; break; + case GF_E_QUAD__J: s = "Cannot specify -r QUAD with -r ALTMAP|SIMD|NOSIMD."; break; case GF_E_BADPOLY: s = "Bad primitive polynomial (high bits set)."; break; case GF_E_COMP_PP: s = "Bad primitive polynomial -- bigger than sub-field."; break; case GF_E_LAZY__X: s = "If -r LAZY, then -r must be DOUBLE or QUAD."; break; case GF_E_ALTSHIF: s = "Cannot specify -m SHIFT and -r ALTMAP."; break; - case GF_E_SSESHIF: s = "Cannot specify -m SHIFT and -r SSE|NOSSE."; break; + case GF_E_SSESHIF: s = "Cannot specify -m SHIFT and -r SIMD|NOSIMD."; break; case GF_E_ALT_CFM: s = "Cannot specify -m CARRY_FREE and -r ALTMAP."; break; - case GF_E_SSE_CFM: s = "Cannot specify -m CARRY_FREE and -r SSE|NOSSE."; break; + case GF_E_SSE_CFM: s = "Cannot specify -m CARRY_FREE and -r SIMD|NOSIMD."; break; case GF_E_PCLMULX: s = "Specified -m CARRY_FREE, but PCLMUL is not supported."; break; case GF_E_ALT_BY2: s = "Cannot specify -m BYTWO_x and -r ALTMAP."; break; - case GF_E_BY2_SSE: s = "Specified -m BYTWO_x -r SSE, but SSE2 is not supported."; break; + case GF_E_BY2_SSE: s = "Specified -m BYTWO_x -r SIMD, but SSE2 is not supported."; break; case GF_E_LOGBADW: s = "With Log Tables, w must be <= 27."; break; - case GF_E_LOG___J: s = "Cannot use Log tables with -r ALTMAP|SSE|NOSSE."; break; + case GF_E_LOG___J: s = "Cannot use Log tables with -r ALTMAP|SIMD|NOSIMD."; break; case GF_E_LOGPOLY: s = "Cannot use Log tables because the polynomial is not primitive."; break; case GF_E_ZERBADW: s = "With -m LOG_ZERO, w must be 8 or 16."; break; case GF_E_ZEXBADW: s = "With -m LOG_ZERO_EXT, w must be 8."; break; @@ -77,33 +77,33 @@ void gf_error() case GF_E_GR_128A: s = "With -m GROUP, w == 128, arg1 must be 4, and arg2 in { 4,8,16 }."; break; case GF_E_GR_A_27: s = "With -m GROUP, arg1 and arg2 must be <= 27."; break; case GF_E_GR_AR_W: s = "With -m GROUP, arg1 and arg2 must be <= w."; break; - case GF_E_GR____J: s = "Cannot use GROUP with -r ALTMAP|SSE|NOSSE."; break; + case GF_E_GR____J: s = "Cannot use GROUP with -r ALTMAP|SIMD|NOSIMD."; break; case GF_E_TABLE_W: s = "With -m TABLE, w must be < 15, or == 16."; break; - case GF_E_TAB_SSE: s = "With -m TABLE, SSE|NOSSE only applies to w=4."; break; - case GF_E_TABSSE3: s = "With -m TABLE, -r SSE, you need SSSE3 supported."; break; + case GF_E_TAB_SSE: s = "With -m TABLE, SIMD|NOSIMD only applies to w=4."; break; + case GF_E_TABSSE3: s = "With -m TABLE, -r SIMD, you need SSSE3 supported."; break; case GF_E_TAB_ALT: s = "With -m TABLE, you cannot use ALTMAP."; break; case GF_E_SP128AR: s = "With -m SPLIT, w=128, bad arg1/arg2."; break; - case GF_E_SP128AL: s = "With -m SPLIT, w=128, -r SSE requires -r ALTMAP."; break; + case GF_E_SP128AL: s = "With -m SPLIT, w=128, -r SIMD requires -r ALTMAP."; break; case GF_E_SP128AS: s = "With -m SPLIT, w=128, ALTMAP needs SSSE3 supported."; break; case GF_E_SP128_A: s = "With -m SPLIT, w=128, -r ALTMAP only with arg1/arg2 = 4/128."; break; - case GF_E_SP128_S: s = "With -m SPLIT, w=128, -r SSE|NOSSE only with arg1/arg2 = 4/128."; break; + case GF_E_SP128_S: s = "With -m SPLIT, w=128, -r SIMD|NOSIMD only with arg1/arg2 = 4/128."; break; case GF_E_SPLIT_W: s = "With -m SPLIT, w must be in {8, 16, 32, 64, 128}."; break; case GF_E_SP_16AR: s = "With -m SPLIT, w=16, Bad arg1/arg2."; break; case GF_E_SP_16_A: s = "With -m SPLIT, w=16, -r ALTMAP only with arg1/arg2 = 4/16."; break; - case GF_E_SP_16_S: s = "With -m SPLIT, w=16, -r SSE|NOSSE only with arg1/arg2 = 4/16."; break; + case GF_E_SP_16_S: s = "With -m SPLIT, w=16, -r SIMD|NOSIMD only with arg1/arg2 = 4/16."; break; case GF_E_SP_32AR: s = "With -m SPLIT, w=32, Bad arg1/arg2."; break; case GF_E_SP_32AS: s = "With -m SPLIT, w=32, -r ALTMAP needs SSSE3 supported."; break; case GF_E_SP_32_A: s = "With -m SPLIT, w=32, -r ALTMAP only with arg1/arg2 = 4/32."; break; - case GF_E_SP_32_S: s = "With -m SPLIT, w=32, -r SSE|NOSSE only with arg1/arg2 = 4/32."; break; + case GF_E_SP_32_S: s = "With -m SPLIT, w=32, -r SIMD|NOSIMD only with arg1/arg2 = 4/32."; break; case GF_E_SP_64AR: s = "With -m SPLIT, w=64, Bad arg1/arg2."; break; case GF_E_SP_64AS: s = "With -m SPLIT, w=64, -r ALTMAP needs SSSE3 supported."; break; case GF_E_SP_64_A: s = "With -m SPLIT, w=64, -r ALTMAP only with arg1/arg2 = 4/64."; break; - case GF_E_SP_64_S: s = "With -m SPLIT, w=64, -r SSE|NOSSE only with arg1/arg2 = 4/64."; break; + case GF_E_SP_64_S: s = "With -m SPLIT, w=64, -r SIMD|NOSIMD only with arg1/arg2 = 4/64."; break; case GF_E_SP_8_AR: s = "With -m SPLIT, w=8, Bad arg1/arg2."; break; case GF_E_SP_8__A: s = "With -m SPLIT, w=8, Can't have -r ALTMAP."; break; - case GF_E_SP_SSE3: s = "With -m SPLIT, Need SSSE3 support for SSE."; break; + case GF_E_SP_SSE3: s = "With -m SPLIT, Need SSSE3 support for SIMD."; break; case GF_E_COMP_A2: s = "With -m COMPOSITE, arg1 must equal 2."; break; - case GF_E_COMP_SS: s = "With -m COMPOSITE, -r SSE and -r NOSSE do not apply."; break; + case GF_E_COMP_SS: s = "With -m COMPOSITE, -r SIMD and -r NOSIMD do not apply."; break; case GF_E_COMP__W: s = "With -m COMPOSITE, w must be 8, 16, 32, 64 or 128."; break; case GF_E_UNKFLAG: s = "Unknown method flag - should be -m, -d, -r or -p."; break; case GF_E_UNKNOWN: s = "Unknown multiplication type."; break; @@ -182,14 +182,14 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type, int sse3 = 0; int sse2 = 0; int pclmul = 0; - int rdouble, rquad, rlazy, rsse, rnosse, raltmap, rcauchy, tmp; + int rdouble, rquad, rlazy, rsimd, rnosimd, raltmap, rcauchy, tmp; gf_internal_t *sub; rdouble = (region_type & GF_REGION_DOUBLE_TABLE); rquad = (region_type & GF_REGION_QUAD_TABLE); rlazy = (region_type & GF_REGION_LAZY); - rsse = (region_type & GF_REGION_SSE); - rnosse = (region_type & GF_REGION_NOSSE); + rsimd = (region_type & GF_REGION_SIMD); + rnosimd = (region_type & GF_REGION_NOSIMD); raltmap = (region_type & GF_REGION_ALTMAP); rcauchy = (region_type & GF_REGION_CAUCHY); @@ -201,7 +201,8 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type, } tmp = ( GF_REGION_DOUBLE_TABLE | GF_REGION_QUAD_TABLE | GF_REGION_LAZY | - GF_REGION_SSE | GF_REGION_NOSSE | GF_REGION_ALTMAP | GF_REGION_CAUCHY ); + GF_REGION_SIMD | GF_REGION_NOSIMD | GF_REGION_ALTMAP | + GF_REGION_CAUCHY ); if (region_type & (~tmp)) { _gf_errno = GF_E_UNK_REG; return 0; } #ifdef INTEL_SSE2 @@ -230,7 +231,7 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type, return 1; } - if (rsse && rnosse) { _gf_errno = GF_E_SSE__NO; return 0; } + if (rsimd && rnosimd) { _gf_errno = GF_E_SIMD_NO; return 0; } if (rcauchy && w > 32) { _gf_errno = GF_E_CAUGT32; return 0; } if (rcauchy && region_type != GF_REGION_CAUCHY) { _gf_errno = GF_E_CAUCHYB; return 0; } if (rcauchy && mult_type == GF_MULT_COMPOSITE) { _gf_errno = GF_E_CAUCOMP; return 0; } @@ -252,7 +253,7 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type, if (rquad) { _gf_errno = GF_E_DOUQUAD; return 0; } if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_DOUBLET; return 0; } if (w != 4 && w != 8) { _gf_errno = GF_E_DOUBLEW; return 0; } - if (rsse || rnosse || raltmap) { _gf_errno = GF_E_DOUBLEJ; return 0; } + if (rsimd || rnosimd || raltmap) { _gf_errno = GF_E_DOUBLEJ; return 0; } if (rlazy && w == 4) { _gf_errno = GF_E_DOUBLEL; return 0; } return 1; } @@ -260,7 +261,7 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type, if (rquad) { if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_QUAD__T; return 0; } if (w != 4) { _gf_errno = GF_E_QUAD__W; return 0; } - if (rsse || rnosse || raltmap) { _gf_errno = GF_E_QUAD__J; return 0; } + if (rsimd || rnosimd || raltmap) { _gf_errno = GF_E_QUAD__J; return 0; } return 1; } @@ -268,7 +269,7 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type, if (mult_type == GF_MULT_SHIFT) { if (raltmap) { _gf_errno = GF_E_ALTSHIF; return 0; } - if (rsse || rnosse) { _gf_errno = GF_E_SSESHIF; return 0; } + if (rsimd || rnosimd) { _gf_errno = GF_E_SSESHIF; return 0; } return 1; } @@ -281,7 +282,7 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type, if (w == 32 && (poly & 0xfe000000)) { _gf_errno = GF_E_CF32POL; return 0; } if (w == 64 && (poly & 0xfffe000000000000ULL)) { _gf_errno = GF_E_CF64POL; return 0; } if (raltmap) { _gf_errno = GF_E_ALT_CFM; return 0; } - if (rsse || rnosse) { _gf_errno = GF_E_SSE_CFM; return 0; } + if (rsimd || rnosimd) { _gf_errno = GF_E_SSE_CFM; return 0; } if (!pclmul) { _gf_errno = GF_E_PCLMULX; return 0; } return 1; } @@ -290,21 +291,21 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type, if (w != 4 && w != 8 && w != 16 && w != 32 && w != 64 && w != 128) { _gf_errno = GF_E_CFM___W; return 0; } if (raltmap) { _gf_errno = GF_E_ALT_CFM; return 0; } - if (rsse || rnosse) { _gf_errno = GF_E_SSE_CFM; return 0; } + if (rsimd || rnosimd) { _gf_errno = GF_E_SSE_CFM; return 0; } if (!pclmul) { _gf_errno = GF_E_PCLMULX; return 0; } return 1; } if (mult_type == GF_MULT_BYTWO_p || mult_type == GF_MULT_BYTWO_b) { if (raltmap) { _gf_errno = GF_E_ALT_BY2; return 0; } - if (rsse && !sse2) { _gf_errno = GF_E_BY2_SSE; return 0; } + if (rsimd && !sse2) { _gf_errno = GF_E_BY2_SSE; return 0; } return 1; } if (mult_type == GF_MULT_LOG_TABLE || mult_type == GF_MULT_LOG_ZERO || mult_type == GF_MULT_LOG_ZERO_EXT ) { if (w > 27) { _gf_errno = GF_E_LOGBADW; return 0; } - if (raltmap || rsse || rnosse) { _gf_errno = GF_E_LOG___J; return 0; } + if (raltmap || rsimd || rnosimd) { _gf_errno = GF_E_LOG___J; return 0; } if (mult_type == GF_MULT_LOG_TABLE) return 1; @@ -324,14 +325,14 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type, (arg2 != 4 && arg2 != 8 && arg2 != 16))) { _gf_errno = GF_E_GR_128A; return 0; } if (arg1 > 27 || arg2 > 27) { _gf_errno = GF_E_GR_A_27; return 0; } if (arg1 > w || arg2 > w) { _gf_errno = GF_E_GR_AR_W; return 0; } - if (raltmap || rsse || rnosse) { _gf_errno = GF_E_GR____J; return 0; } + if (raltmap || rsimd || rnosimd) { _gf_errno = GF_E_GR____J; return 0; } return 1; } if (mult_type == GF_MULT_TABLE) { if (w != 16 && w >= 15) { _gf_errno = GF_E_TABLE_W; return 0; } - if (w != 4 && (rsse || rnosse)) { _gf_errno = GF_E_TAB_SSE; return 0; } - if (rsse && !sse3) { _gf_errno = GF_E_TABSSE3; return 0; } + if (w != 4 && (rsimd || rnosimd)) { _gf_errno = GF_E_TAB_SSE; return 0; } + if (rsimd && !sse3) { _gf_errno = GF_E_TABSSE3; return 0; } if (raltmap) { _gf_errno = GF_E_TAB_ALT; return 0; } return 1; } @@ -344,46 +345,46 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type, } if (w == 8) { if (arg1 != 4 || arg2 != 8) { _gf_errno = GF_E_SP_8_AR; return 0; } - if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } + if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } if (raltmap) { _gf_errno = GF_E_SP_8__A; return 0; } } else if (w == 16) { if ((arg1 == 8 && arg2 == 8) || (arg1 == 8 && arg2 == 16)) { - if (rsse || rnosse) { _gf_errno = GF_E_SP_16_S; return 0; } + if (rsimd || rnosimd) { _gf_errno = GF_E_SP_16_S; return 0; } if (raltmap) { _gf_errno = GF_E_SP_16_A; return 0; } } else if (arg1 == 4 && arg2 == 16) { - if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } + if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } } else { _gf_errno = GF_E_SP_16AR; return 0; } } else if (w == 32) { if ((arg1 == 8 && arg2 == 8) || (arg1 == 8 && arg2 == 32) || (arg1 == 16 && arg2 == 32)) { - if (rsse || rnosse) { _gf_errno = GF_E_SP_32_S; return 0; } + if (rsimd || rnosimd) { _gf_errno = GF_E_SP_32_S; return 0; } if (raltmap) { _gf_errno = GF_E_SP_32_A; return 0; } } else if (arg1 == 4 && arg2 == 32) { - if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } + if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } if (raltmap && !sse3) { _gf_errno = GF_E_SP_32AS; return 0; } - if (raltmap && rnosse) { _gf_errno = GF_E_SP_32AS; return 0; } + if (raltmap && rnosimd) { _gf_errno = GF_E_SP_32AS; return 0; } } else { _gf_errno = GF_E_SP_32AR; return 0; } } else if (w == 64) { if ((arg1 == 8 && arg2 == 8) || (arg1 == 8 && arg2 == 64) || (arg1 == 16 && arg2 == 64)) { - if (rsse || rnosse) { _gf_errno = GF_E_SP_64_S; return 0; } + if (rsimd || rnosimd) { _gf_errno = GF_E_SP_64_S; return 0; } if (raltmap) { _gf_errno = GF_E_SP_64_A; return 0; } } else if (arg1 == 4 && arg2 == 64) { - if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } + if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } if (raltmap && !sse3) { _gf_errno = GF_E_SP_64AS; return 0; } - if (raltmap && rnosse) { _gf_errno = GF_E_SP_64AS; return 0; } + if (raltmap && rnosimd) { _gf_errno = GF_E_SP_64AS; return 0; } } else { _gf_errno = GF_E_SP_64AR; return 0; } } else if (w == 128) { if (arg1 == 8 && arg2 == 128) { - if (rsse || rnosse) { _gf_errno = GF_E_SP128_S; return 0; } + if (rsimd || rnosimd) { _gf_errno = GF_E_SP128_S; return 0; } if (raltmap) { _gf_errno = GF_E_SP128_A; return 0; } } else if (arg1 == 4 && arg2 == 128) { - if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } + if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } if (raltmap && !sse3) { _gf_errno = GF_E_SP128AS; return 0; } - if (raltmap && rnosse) { _gf_errno = GF_E_SP128AS; return 0; } + if (raltmap && rnosimd) { _gf_errno = GF_E_SP128AS; return 0; } } else { _gf_errno = GF_E_SP128AR; return 0; } } else { _gf_errno = GF_E_SPLIT_W; return 0; } return 1; @@ -395,7 +396,7 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type, if (w < 128 && (poly >> (w/2)) != 0) { _gf_errno = GF_E_COMP_PP; return 0; } if (divide_type != GF_DIVIDE_DEFAULT) { _gf_errno = GF_E_DIVCOMP; return 0; } if (arg1 != 2) { _gf_errno = GF_E_COMP_A2; return 0; } - if (rsse || rnosse) { _gf_errno = GF_E_COMP_SS; return 0; } + if (rsimd || rnosimd) { _gf_errno = GF_E_COMP_SS; return 0; } if (base != NULL) { sub = (gf_internal_t *) base->scratch; if (sub->w != w/2) { _gf_errno = GF_E_BASE__W; return 0; } diff --git a/src/gf_method.c b/src/gf_method.c index 2548a63..2210305 100644 --- a/src/gf_method.c +++ b/src/gf_method.c @@ -121,11 +121,17 @@ int create_gf_from_argv(gf_t *gf, int w, int argc, char **argv, int starting) } else if (strcmp(argv[starting], "LAZY") == 0) { region_type |= GF_REGION_LAZY; starting++; + } else if (strcmp(argv[starting], "SIMD") == 0) { + region_type |= GF_REGION_SIMD; + starting++; + } else if (strcmp(argv[starting], "NOSIMD") == 0) { + region_type |= GF_REGION_NOSIMD; + starting++; } else if (strcmp(argv[starting], "SSE") == 0) { - region_type |= GF_REGION_SSE; + region_type |= GF_REGION_SIMD; starting++; } else if (strcmp(argv[starting], "NOSSE") == 0) { - region_type |= GF_REGION_NOSSE; + region_type |= GF_REGION_NOSIMD; starting++; } else if (strcmp(argv[starting], "CAUCHY") == 0) { region_type |= GF_REGION_CAUCHY; diff --git a/src/gf_w128.c b/src/gf_w128.c index 66f9422..190f6b0 100644 --- a/src/gf_w128.c +++ b/src/gf_w128.c @@ -1527,7 +1527,7 @@ int gf_w128_split_init(gf_t *gf) gf->multiply.w128 = gf_w128_bytwo_p_multiply; #if defined(INTEL_SSE4_PCLMUL) - if (!(h->region_type & GF_REGION_NOSSE)){ + if (!(h->region_type & GF_REGION_NOSIMD)){ gf->multiply.w128 = gf_w128_clm_multiply; } #endif @@ -1546,7 +1546,7 @@ int gf_w128_split_init(gf_t *gf) if((h->region_type & GF_REGION_ALTMAP)) { #ifdef INTEL_SSE4 - if(!(h->region_type & GF_REGION_NOSSE)) + if(!(h->region_type & GF_REGION_NOSIMD)) gf->multiply_region.w128 = gf_w128_split_4_128_sse_altmap_multiply_region; else return 0; @@ -1556,7 +1556,7 @@ int gf_w128_split_init(gf_t *gf) } else { #ifdef INTEL_SSE4 - if(!(h->region_type & GF_REGION_NOSSE)) + if(!(h->region_type & GF_REGION_NOSIMD)) gf->multiply_region.w128 = gf_w128_split_4_128_sse_multiply_region; else gf->multiply_region.w128 = gf_w128_split_4_128_multiply_region; diff --git a/src/gf_w16.c b/src/gf_w16.c index c4cd22d..0904115 100644 --- a/src/gf_w16.c +++ b/src/gf_w16.c @@ -1327,14 +1327,14 @@ int gf_w16_split_init(gf_t *gf) } else if ((h->arg1 == 4 && h->arg2 == 16) || (h->arg2 == 4 && h->arg1 == 16)) { if (issse3) { - if(h->region_type & GF_REGION_ALTMAP && h->region_type & GF_REGION_NOSSE) + if(h->region_type & GF_REGION_ALTMAP && h->region_type & GF_REGION_NOSIMD) gf->multiply_region.w32 = gf_w16_split_4_16_lazy_nosse_altmap_multiply_region; - else if(h->region_type & GF_REGION_NOSSE) + else if(h->region_type & GF_REGION_NOSIMD) gf->multiply_region.w32 = gf_w16_split_4_16_lazy_multiply_region; else if(h->region_type & GF_REGION_ALTMAP) gf->multiply_region.w32 = gf_w16_split_4_16_lazy_sse_altmap_multiply_region; } else { - if(h->region_type & GF_REGION_SSE) + if(h->region_type & GF_REGION_SIMD) return 0; else if(h->region_type & GF_REGION_ALTMAP) gf->multiply_region.w32 = gf_w16_split_4_16_lazy_nosse_altmap_multiply_region; @@ -1884,25 +1884,25 @@ int gf_w16_bytwo_init(gf_t *gf) if (h->mult_type == GF_MULT_BYTWO_p) { gf->multiply.w32 = gf_w16_bytwo_p_multiply; #ifdef INTEL_SSE2 - if (h->region_type & GF_REGION_NOSSE) + if (h->region_type & GF_REGION_NOSIMD) gf->multiply_region.w32 = gf_w16_bytwo_p_nosse_multiply_region; else gf->multiply_region.w32 = gf_w16_bytwo_p_sse_multiply_region; #else gf->multiply_region.w32 = gf_w16_bytwo_p_nosse_multiply_region; - if(h->region_type & GF_REGION_SSE) + if(h->region_type & GF_REGION_SIMD) return 0; #endif } else { gf->multiply.w32 = gf_w16_bytwo_b_multiply; #ifdef INTEL_SSE2 - if (h->region_type & GF_REGION_NOSSE) + if (h->region_type & GF_REGION_NOSIMD) gf->multiply_region.w32 = gf_w16_bytwo_b_nosse_multiply_region; else gf->multiply_region.w32 = gf_w16_bytwo_b_sse_multiply_region; #else gf->multiply_region.w32 = gf_w16_bytwo_b_nosse_multiply_region; - if(h->region_type & GF_REGION_SSE) + if(h->region_type & GF_REGION_SIMD) return 0; #endif } diff --git a/src/gf_w32.c b/src/gf_w32.c index 5ec2aa7..8e7c741 100644 --- a/src/gf_w32.c +++ b/src/gf_w32.c @@ -1434,25 +1434,25 @@ int gf_w32_bytwo_init(gf_t *gf) if (h->mult_type == GF_MULT_BYTWO_p) { gf->multiply.w32 = gf_w32_bytwo_p_multiply; #ifdef INTEL_SSE2 - if (h->region_type & GF_REGION_NOSSE) + if (h->region_type & GF_REGION_NOSIMD) gf->multiply_region.w32 = gf_w32_bytwo_p_nosse_multiply_region; else gf->multiply_region.w32 = gf_w32_bytwo_p_sse_multiply_region; #else gf->multiply_region.w32 = gf_w32_bytwo_p_nosse_multiply_region; - if(h->region_type & GF_REGION_SSE) + if(h->region_type & GF_REGION_SIMD) return 0; #endif } else { gf->multiply.w32 = gf_w32_bytwo_b_multiply; #ifdef INTEL_SSE2 - if (h->region_type & GF_REGION_NOSSE) + if (h->region_type & GF_REGION_NOSIMD) gf->multiply_region.w32 = gf_w32_bytwo_b_nosse_multiply_region; else gf->multiply_region.w32 = gf_w32_bytwo_b_sse_multiply_region; #else gf->multiply_region.w32 = gf_w32_bytwo_b_nosse_multiply_region; - if(h->region_type & GF_REGION_SSE) + if(h->region_type & GF_REGION_SIMD) return 0; #endif } @@ -2335,13 +2335,13 @@ int gf_w32_split_init(gf_t *gf) ld2 = (struct gf_split_2_32_lazy_data *) h->private; ld2->last_value = 0; #ifdef INTEL_SSSE3 - if (!(h->region_type & GF_REGION_NOSSE)) + if (!(h->region_type & GF_REGION_NOSIMD)) gf->multiply_region.w32 = gf_w32_split_2_32_lazy_sse_multiply_region; else gf->multiply_region.w32 = gf_w32_split_2_32_lazy_multiply_region; #else gf->multiply_region.w32 = gf_w32_split_2_32_lazy_multiply_region; - if(h->region_type & GF_REGION_SSE) return 0; + if(h->region_type & GF_REGION_SIMD) return 0; #endif return 1; } @@ -2352,7 +2352,7 @@ int gf_w32_split_init(gf_t *gf) (issse3 && h->mult_type == GF_REGION_DEFAULT)) { ld4 = (struct gf_split_4_32_lazy_data *) h->private; ld4->last_value = 0; - if ((h->region_type & GF_REGION_NOSSE) || !issse3) { + if ((h->region_type & GF_REGION_NOSIMD) || !issse3) { gf->multiply_region.w32 = gf_w32_split_4_32_lazy_multiply_region; } else if (h->region_type & GF_REGION_ALTMAP) { gf->multiply_region.w32 = gf_w32_split_4_32_lazy_sse_altmap_multiply_region; diff --git a/src/gf_w4.c b/src/gf_w4.c index 6bc79d0..f098323 100644 --- a/src/gf_w4.c +++ b/src/gf_w4.c @@ -490,13 +490,13 @@ int gf_w4_single_table_init(gf_t *gf) gf->divide.w32 = gf_w4_single_table_divide; gf->multiply.w32 = gf_w4_single_table_multiply; #ifdef INTEL_SSSE3 - if(h->region_type & (GF_REGION_NOSSE | GF_REGION_CAUCHY)) + if(h->region_type & (GF_REGION_NOSIMD | GF_REGION_CAUCHY)) gf->multiply_region.w32 = gf_w4_single_table_multiply_region; else gf->multiply_region.w32 = gf_w4_single_table_sse_multiply_region; #else gf->multiply_region.w32 = gf_w4_single_table_multiply_region; - if (h->region_type & GF_REGION_SSE) return 0; + if (h->region_type & GF_REGION_SIMD) return 0; #endif return 1; @@ -1905,25 +1905,25 @@ int gf_w4_bytwo_init(gf_t *gf) if (h->mult_type == GF_MULT_BYTWO_p) { gf->multiply.w32 = gf_w4_bytwo_p_multiply; #ifdef INTEL_SSE2 - if (h->region_type & GF_REGION_NOSSE) + if (h->region_type & GF_REGION_NOSIMD) gf->multiply_region.w32 = gf_w4_bytwo_p_nosse_multiply_region; else gf->multiply_region.w32 = gf_w4_bytwo_p_sse_multiply_region; #else gf->multiply_region.w32 = gf_w4_bytwo_p_nosse_multiply_region; - if (h->region_type & GF_REGION_SSE) + if (h->region_type & GF_REGION_SIMD) return 0; #endif } else { gf->multiply.w32 = gf_w4_bytwo_b_multiply; #ifdef INTEL_SSE2 - if (h->region_type & GF_REGION_NOSSE) + if (h->region_type & GF_REGION_NOSIMD) gf->multiply_region.w32 = gf_w4_bytwo_b_nosse_multiply_region; else gf->multiply_region.w32 = gf_w4_bytwo_b_sse_multiply_region; #else gf->multiply_region.w32 = gf_w4_bytwo_b_nosse_multiply_region; - if (h->region_type & GF_REGION_SSE) + if (h->region_type & GF_REGION_SIMD) return 0; #endif } diff --git a/src/gf_w64.c b/src/gf_w64.c index fdc4a7c..fe1c75d 100644 --- a/src/gf_w64.c +++ b/src/gf_w64.c @@ -1488,25 +1488,25 @@ int gf_w64_bytwo_init(gf_t *gf) if (h->mult_type == GF_MULT_BYTWO_p) { gf->multiply.w64 = gf_w64_bytwo_p_multiply; #ifdef INTEL_SSE2 - if (h->region_type & GF_REGION_NOSSE) + if (h->region_type & GF_REGION_NOSIMD) gf->multiply_region.w64 = gf_w64_bytwo_p_nosse_multiply_region; else gf->multiply_region.w64 = gf_w64_bytwo_p_sse_multiply_region; #else gf->multiply_region.w64 = gf_w64_bytwo_p_nosse_multiply_region; - if(h->region_type & GF_REGION_SSE) + if(h->region_type & GF_REGION_SIMD) return 0; #endif } else { gf->multiply.w64 = gf_w64_bytwo_b_multiply; #ifdef INTEL_SSE2 - if (h->region_type & GF_REGION_NOSSE) + if (h->region_type & GF_REGION_NOSIMD) gf->multiply_region.w64 = gf_w64_bytwo_b_nosse_multiply_region; else gf->multiply_region.w64 = gf_w64_bytwo_b_sse_multiply_region; #else gf->multiply_region.w64 = gf_w64_bytwo_b_nosse_multiply_region; - if(h->region_type & GF_REGION_SSE) + if(h->region_type & GF_REGION_SIMD) return 0; #endif } @@ -2006,7 +2006,7 @@ int gf_w64_split_init(gf_t *gf) gf->multiply.w64 = gf_w64_bytwo_p_multiply; #if defined(INTEL_SSE4_PCLMUL) - if ((!(h->region_type & GF_REGION_NOSSE) && + if ((!(h->region_type & GF_REGION_NOSIMD) && (h->arg1 == 64 || h->arg2 == 64)) || h->mult_type == GF_MULT_DEFAULT){ @@ -2045,7 +2045,7 @@ int gf_w64_split_init(gf_t *gf) d4 = (struct gf_split_4_64_lazy_data *) h->private; d4->last_value = 0; - if((h->region_type & GF_REGION_ALTMAP) && (h->region_type & GF_REGION_NOSSE)) return 0; + if((h->region_type & GF_REGION_ALTMAP) && (h->region_type & GF_REGION_NOSIMD)) return 0; if(h->region_type & GF_REGION_ALTMAP) { #ifdef INTEL_SSSE3 @@ -2057,13 +2057,13 @@ int gf_w64_split_init(gf_t *gf) else //no altmap { #ifdef INTEL_SSE4 - if(h->region_type & GF_REGION_NOSSE) + if(h->region_type & GF_REGION_NOSIMD) gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region; else gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_multiply_region; #else gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region; - if(h->region_type & GF_REGION_SSE) + if(h->region_type & GF_REGION_SIMD) return 0; #endif } diff --git a/src/gf_w8.c b/src/gf_w8.c index 67fd688..bc4f5d1 100644 --- a/src/gf_w8.c +++ b/src/gf_w8.c @@ -1180,13 +1180,13 @@ int gf_w8_split_init(gf_t *gf) gf->multiply.w32 = gf_w8_split_multiply; #ifdef INTEL_SSSE3 - if (h->region_type & GF_REGION_NOSSE) + if (h->region_type & GF_REGION_NOSIMD) gf->multiply_region.w32 = gf_w8_split_multiply_region; else gf->multiply_region.w32 = gf_w8_split_multiply_region_sse; #else gf->multiply_region.w32 = gf_w8_split_multiply_region; - if(h->region_type & GF_REGION_SSE) + if(h->region_type & GF_REGION_SIMD) return 0; #endif @@ -2259,25 +2259,25 @@ int gf_w8_bytwo_init(gf_t *gf) if (h->mult_type == GF_MULT_BYTWO_p) { gf->multiply.w32 = gf_w8_bytwo_p_multiply; #ifdef INTEL_SSE2 - if (h->region_type & GF_REGION_NOSSE) + if (h->region_type & GF_REGION_NOSIMD) gf->multiply_region.w32 = gf_w8_bytwo_p_nosse_multiply_region; else gf->multiply_region.w32 = gf_w8_bytwo_p_sse_multiply_region; #else gf->multiply_region.w32 = gf_w8_bytwo_p_nosse_multiply_region; - if(h->region_type & GF_REGION_SSE) + if(h->region_type & GF_REGION_SIMD) return 0; #endif } else { gf->multiply.w32 = gf_w8_bytwo_b_multiply; #ifdef INTEL_SSE2 - if (h->region_type & GF_REGION_NOSSE) + if (h->region_type & GF_REGION_NOSIMD) gf->multiply_region.w32 = gf_w8_bytwo_b_nosse_multiply_region; else gf->multiply_region.w32 = gf_w8_bytwo_b_sse_multiply_region; #else gf->multiply_region.w32 = gf_w8_bytwo_b_nosse_multiply_region; - if(h->region_type & GF_REGION_SSE) + if(h->region_type & GF_REGION_SIMD) return 0; #endif } diff --git a/tools/gf_methods.c b/tools/gf_methods.c index 43589ac..c7d3d58 100644 --- a/tools/gf_methods.c +++ b/tools/gf_methods.c @@ -28,7 +28,7 @@ static char *MULTS[NMULTS] = { "SHIFT", "CARRY_FREE", "CARRY_FREE_GK", "GROUP44" /* Make sure CAUCHY is last */ #define NREGIONS (7) -static char *REGIONS[NREGIONS] = { "DOUBLE", "QUAD", "LAZY", "SSE", "NOSSE", +static char *REGIONS[NREGIONS] = { "DOUBLE", "QUAD", "LAZY", "SIMD", "NOSIMD", "ALTMAP", "CAUCHY" }; #define BNREGIONS (4) -- cgit v1.2.1 From eb5ce0ca4206ed4f74009c1b9a3a72407693448b Mon Sep 17 00:00:00 2001 From: Janne Grunau Date: Thu, 4 Sep 2014 18:29:58 +0200 Subject: configure: add ARM/AArch64 NEON support Checks for arm_neon.h header. --- configure.ac | 21 +++++++++++++++++++++ include/gf_complete.h | 4 ++++ m4/ax_ext.m4 | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 74 insertions(+) diff --git a/configure.ac b/configure.ac index 47d5d62..31ab1fa 100644 --- a/configure.ac +++ b/configure.ac @@ -24,6 +24,27 @@ AC_PROG_CC AX_EXT() +AC_ARG_ENABLE([neon], + AS_HELP_STRING([--disable-neon], [Build without NEON optimizations])) + +AS_IF([test "x$enable_neon" != "xno"], + [noneon_CPPFLAGS=$CPPFLAGS + CPPFLAGS="$CPPFLAGS $SIMD_FLAGS" + AC_CHECK_HEADER([arm_neon.h], + [have_neon=yes], + [have_neon=no + CPPFLAGS=$noneon_CPPFLAGS])], + [have_neon=no + AS_IF([test "x$ax_cv_have_neon_ext" = "xyes"], + [SIMD_FLAGS=""]) + ]) + +AS_IF([test "x$have_neon" = "xno"], + [AS_IF([test "x$enable_neon" = "xyes"], + [AC_MSG_ERROR([neon requested but arm_neon.h not found])]) + ]) +AM_CONDITIONAL([HAVE_NEON], [test "x$have_neon" = "xyes"]) + AC_ARG_ENABLE([sse], AS_HELP_STRING([--disable-sse], [Build without SSE optimizations]), [if test "x$enableval" = "xno" ; then diff --git a/include/gf_complete.h b/include/gf_complete.h index e8ea2ca..c4783e8 100644 --- a/include/gf_complete.h +++ b/include/gf_complete.h @@ -33,6 +33,10 @@ #include #endif +#if defined(ARM_NEON) + #include +#endif + /* These are the different ways to perform multiplication. Not all are implemented for all values of w. diff --git a/m4/ax_ext.m4 b/m4/ax_ext.m4 index cfbb797..c03ccef 100644 --- a/m4/ax_ext.m4 +++ b/m4/ax_ext.m4 @@ -41,6 +41,55 @@ AC_DEFUN([AX_EXT], AC_REQUIRE([AC_CANONICAL_HOST]) case $host_cpu in + aarch64*) + AC_DEFINE(HAVE_ARCH_AARCH64,,[targeting AArch64]) + SIMD_FLAGS="$SIMD_FLAGS -DARCH_AARCH64" + + AC_CACHE_CHECK([whether NEON is supported], [ax_cv_have_neon_ext], + [ + # TODO: detect / cross-compile + ax_cv_have_neon_ext=yes + ]) + AC_CACHE_CHECK([whether cryptographic extension is supported], [ax_cv_have_arm_crypt_ext], + [ + # TODO: detect / cross-compile + ax_cv_have_arm_crypt_ext=yes + ]) + + if test "$ax_cv_have_arm_crypt_ext" = yes; then + AC_DEFINE(HAVE_ARM_CRYPT_EXT,,[Support ARM cryptographic extension]) + fi + + if test "$ax_cv_have_neon_ext" = yes; then + AC_DEFINE(HAVE_NEON,,[Support NEON instructions]) + fi + + if test "$ax_cv_have_arm_crypt_ext" = yes && test "$ax_cv_have_neon_ext" = yes; then + AX_CHECK_COMPILE_FLAG(-march=armv8-a+simd+crypto, + SIMD_FLAGS="$SIMD_FLAGS -march=armv8-a+simd+crypto -DARM_CRYPT -DARM_NEON", []) + elif test "$ax_cv_have_arm_crypt_ext" = yes; then + AX_CHECK_COMPILE_FLAG(-march=armv8-a+crypto, + SIMD_FLAGS="$SIMD_FLAGS -march=armv8-a+crypto -DARM_CRYPT", []) + elif test "$ax_cv_have_neon_ext" = yes; then + AX_CHECK_COMPILE_FLAG(-march=armv8-a+simd, + SIMD_FLAGS="$SIMD_FLAGS -march=armv8-a+simd -DARM_NEON", []) + fi + ;; + + arm*) + AC_CACHE_CHECK([whether NEON is supported], [ax_cv_have_neon_ext], + [ + # TODO: detect / cross-compile + ax_cv_have_neon_ext=yes + ]) + + if test "$ax_cv_have_neon_ext" = yes; then + AC_DEFINE(HAVE_NEON,,[Support NEON instructions]) + AX_CHECK_COMPILE_FLAG(-mfpu=neon, + SIMD_FLAGS="$SIMD_FLAGS -mfpu=neon -DARM_NEON", []) + fi + ;; + powerpc*) AC_CACHE_CHECK([whether altivec is supported], [ax_cv_have_altivec_ext], [ -- cgit v1.2.1 From 36e75c3efec08b1e9bdb9c1f69a5b0018abd8ac7 Mon Sep 17 00:00:00 2001 From: Janne Grunau Date: Fri, 5 Sep 2014 13:33:04 +0200 Subject: use posix_memalign to align memory for SIMD region tests Properly emulate aligned allocation if posix_memalign is not available. --- configure.ac | 7 +++++++ test/gf_unit.c | 48 +++++++++++++++++++++++++++++++++++++----------- tools/gf_time.c | 24 ++++++++++++++++++++++-- 3 files changed, 66 insertions(+), 13 deletions(-) diff --git a/configure.ac b/configure.ac index 31ab1fa..ad7bb83 100644 --- a/configure.ac +++ b/configure.ac @@ -22,6 +22,13 @@ AM_MAINTAINER_MODE([disable]) dnl Compiling with per-target flags requires AM_PROG_CC_C_O. AC_PROG_CC +# Check for functions to provide aligned memory +# +AC_CHECK_FUNCS([posix_memalign], + [found_memalign=yes; break]) + +AS_IF([test "x$found_memalign" != "xyes"], [AC_MSG_WARN([No function for aligned memory allocation found])]) + AX_EXT() AC_ARG_ENABLE([neon], diff --git a/test/gf_unit.c b/test/gf_unit.c index 98ff98c..db26849 100644 --- a/test/gf_unit.c +++ b/test/gf_unit.c @@ -8,6 +8,14 @@ * Performs unit testing for gf arithmetic */ +#include "config.h" + +#ifdef HAVE_POSIX_MEMALIGN +#ifndef _XOPEN_SOURCE +#define _XOPEN_SOURCE 600 +#endif +#endif + #include #include #include @@ -82,6 +90,9 @@ int main(int argc, char **argv) uint32_t mask = 0; char *ra, *rb, *rc, *rd, *target; int align; +#ifndef HAVE_POSIX_MEMALIGN + char *malloc_ra, *malloc_rb, *malloc_rc, *malloc_rd; +#endif if (argc < 4) usage(NULL); @@ -116,18 +127,26 @@ int main(int argc, char **argv) c = (gf_general_t *) malloc(sizeof(gf_general_t)); d = (gf_general_t *) malloc(sizeof(gf_general_t)); +#if HAVE_POSIX_MEMALIGN + if (posix_memalign((void **) &ra, 16, sizeof(char)*REGION_SIZE)) + ra = NULL; + if (posix_memalign((void **) &rb, 16, sizeof(char)*REGION_SIZE)) + rb = NULL; + if (posix_memalign((void **) &rc, 16, sizeof(char)*REGION_SIZE)) + rc = NULL; + if (posix_memalign((void **) &rd, 16, sizeof(char)*REGION_SIZE)) + rd = NULL; +#else //15 bytes extra to make sure it's 16byte aligned - ra = (char *) malloc(sizeof(char)*REGION_SIZE+15); - rb = (char *) malloc(sizeof(char)*REGION_SIZE+15); - rc = (char *) malloc(sizeof(char)*REGION_SIZE+15); - rd = (char *) malloc(sizeof(char)*REGION_SIZE+15); - - //this still assumes 8 byte aligned pointer from malloc - //(which is usual on 32-bit machines) - ra += (uint64_t)ra & 0xf; - rb += (uint64_t)rb & 0xf; - rc += (uint64_t)rc & 0xf; - rd += (uint64_t)rd & 0xf; + malloc_ra = (char *) malloc(sizeof(char)*REGION_SIZE+15); + malloc_rb = (char *) malloc(sizeof(char)*REGION_SIZE+15); + malloc_rc = (char *) malloc(sizeof(char)*REGION_SIZE+15); + malloc_rd = (char *) malloc(sizeof(char)*REGION_SIZE+15); + ra = (uint8_t *) (((uintptr_t) malloc_ra + 15) & ~((uintptr_t) 0xf)); + rb = (uint8_t *) (((uintptr_t) malloc_rb + 15) & ~((uintptr_t) 0xf)); + rc = (uint8_t *) (((uintptr_t) malloc_rc + 15) & ~((uintptr_t) 0xf)); + rd = (uint8_t *) (((uintptr_t) malloc_rd + 15) & ~((uintptr_t) 0xf)); +#endif if (w <= 32) { mask = 0; @@ -423,10 +442,17 @@ int main(int argc, char **argv) free(b); free(c); free(d); +#ifdef HAVE_POSIX_MEMALIGN free(ra); free(rb); free(rc); free(rd); +#else + free(malloc_ra); + free(malloc_rb); + free(malloc_rc); + free(malloc_rd); +#endif return 0; } diff --git a/tools/gf_time.c b/tools/gf_time.c index d17a7c2..7402ab5 100644 --- a/tools/gf_time.c +++ b/tools/gf_time.c @@ -8,6 +8,14 @@ * Performs timing for gf arithmetic */ +#include "config.h" + +#ifdef HAVE_POSIX_MEMALIGN +#ifndef _XOPEN_SOURCE +#define _XOPEN_SOURCE 600 +#endif +#endif + #include #include #include @@ -95,6 +103,9 @@ int main(int argc, char **argv) time_t t0; uint8_t *ra, *rb; gf_general_t a; +#ifndef HAVE_POSIX_MEMALIGN + uint8_t *malloc_ra, *malloc_rb; +#endif if (argc < 6) usage(NULL); @@ -155,8 +166,17 @@ int main(int argc, char **argv) printf("Seed: %ld\n", t0); - ra = (uint8_t *) malloc(size); - rb = (uint8_t *) malloc(size); +#ifdef HAVE_POSIX_MEMALIGN + if (posix_memalign((void **) &ra, 16, size)) + ra = NULL; + if (posix_memalign((void **) &rb, 16, size)) + rb = NULL; +#else + malloc_ra = (uint8_t *) malloc(size + 15); + malloc_rb = (uint8_t *) malloc(size + 15); + ra = (uint8_t *) (((uintptr_t) malloc_ra + 15) & ~((uintptr_t) 0xf)); + rb = (uint8_t *) (((uintptr_t) malloc_rb + 15) & ~((uintptr_t) 0xf)); +#endif if (ra == NULL || rb == NULL) { perror("malloc"); exit(1); } -- cgit v1.2.1 From 3a1be40ea87ecc81e737aee6819ff96a6721f011 Mon Sep 17 00:00:00 2001 From: Janne Grunau Date: Thu, 4 Sep 2014 10:47:10 +0200 Subject: arm: NEON optimisations for XOR in gf_multby_one --- src/gf.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/src/gf.c b/src/gf.c index ca6a7f8..c3801e7 100644 --- a/src/gf.c +++ b/src/gf.c @@ -954,7 +954,42 @@ void gf_multby_one(void *src, void *dest, int bytes, int xor) } return; #endif +#if defined(ARM_NEON) + s8 = (uint8_t *) src; + d8 = (uint8_t *) dest; + if (uls % 16 == uld % 16) { + gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16); + while (s8 != rd.s_start) { + *d8 ^= *s8; + s8++; + d8++; + } + while (s8 < (uint8_t *) rd.s_top) { + uint8x16_t vs = vld1q_u8 (s8); + uint8x16_t vd = vld1q_u8 (d8); + uint8x16_t vr = veorq_u8 (vs, vd); + vst1q_u8 (d8, vr); + s8 += 16; + d8 += 16; + } + } else { + while (s8 + 15 < (uint8_t *) src + bytes) { + uint8x16_t vs = vld1q_u8 (s8); + uint8x16_t vd = vld1q_u8 (d8); + uint8x16_t vr = veorq_u8 (vs, vd); + vst1q_u8 (d8, vr); + s8 += 16; + d8 += 16; + } + } + while (s8 < (uint8_t *) src + bytes) { + *d8 ^= *s8; + s8++; + d8++; + } + return; +#endif if (uls % 8 != uld % 8) { gf_unaligned_xor(src, dest, bytes); return; -- cgit v1.2.1 From 1311a44f7a27b38217a94e9d7a5dbe3ae3dde035 Mon Sep 17 00:00:00 2001 From: Janne Grunau Date: Wed, 17 Sep 2014 15:12:05 +0200 Subject: arm: NEON optimisations for gf_w4 Optimisations for the single table region multiplication and carry less multiplication using NEON's polynomial multiplication of 8-bit values. The single polynomial multiplication is not that useful but vector version is for region multiplication. Selected time_tool.sh results for a 1.7GHz cortex-a9: Region Best (MB/s): 672.72 W-Method: 4 -m CARRY_FREE - Region Best (MB/s): 265.84 W-Method: 4 -m BYTWO_p - Region Best (MB/s): 329.41 W-Method: 4 -m TABLE -r DOUBLE - Region Best (MB/s): 278.63 W-Method: 4 -m TABLE -r QUAD - Region Best (MB/s): 329.81 W-Method: 4 -m TABLE -r QUAD -r LAZY - Region Best (MB/s): 1318.03 W-Method: 4 -m TABLE -r SIMD - Region Best (MB/s): 165.15 W-Method: 4 -m TABLE -r NOSIMD - Region Best (MB/s): 99.73 W-Method: 4 -m LOG - --- include/gf_w4.h | 63 +++++++++++++ src/Makefile.am | 7 ++ src/gf_w4.c | 68 ++++---------- src/neon/gf_w4_neon.c | 247 ++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 335 insertions(+), 50 deletions(-) create mode 100644 include/gf_w4.h create mode 100644 src/neon/gf_w4_neon.c diff --git a/include/gf_w4.h b/include/gf_w4.h new file mode 100644 index 0000000..8ee94a3 --- /dev/null +++ b/include/gf_w4.h @@ -0,0 +1,63 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * gf_w4.h + * + * Defines and data structures for 4-bit Galois fields + */ + +#ifndef GF_COMPLETE_GF_W4_H +#define GF_COMPLETE_GF_W4_H + +#include + +#define GF_FIELD_WIDTH 4 +#define GF_DOUBLE_WIDTH (GF_FIELD_WIDTH*2) +#define GF_FIELD_SIZE (1 << GF_FIELD_WIDTH) +#define GF_MULT_GROUP_SIZE (GF_FIELD_SIZE-1) + +/* ------------------------------------------------------------ + JSP: Each implementation has its own data, which is allocated + at one time as part of the handle. For that reason, it + shouldn't be hierarchical -- i.e. one should be able to + allocate it with one call to malloc. */ + +struct gf_logtable_data { + uint8_t log_tbl[GF_FIELD_SIZE]; + uint8_t antilog_tbl[GF_FIELD_SIZE * 2]; + uint8_t *antilog_tbl_div; +}; + +struct gf_single_table_data { + uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE]; + uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE]; +}; + +struct gf_double_table_data { + uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE]; + uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE*GF_FIELD_SIZE]; +}; +struct gf_quad_table_data { + uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE]; + uint16_t mult[GF_FIELD_SIZE][(1<<16)]; +}; + +struct gf_quad_table_lazy_data { + uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE]; + uint8_t smult[GF_FIELD_SIZE][GF_FIELD_SIZE]; + uint16_t mult[(1 << 16)]; +}; + +struct gf_bytwo_data { + uint64_t prim_poly; + uint64_t mask1; + uint64_t mask2; +}; + +// ARM NEON init functions +int gf_w4_neon_cfm_init(gf_t *gf); +void gf_w4_neon_single_table_init(gf_t *gf); + +#endif /* GF_COMPLETE_GF_W4_H */ diff --git a/src/Makefile.am b/src/Makefile.am index 34633ea..5352d12 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1,11 +1,18 @@ # GF-Complete 'core' AM file # Creates the library +AUTOMAKE_OPTIONS = subdir-objects + AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC lib_LTLIBRARIES = libgf_complete.la libgf_complete_la_SOURCES = gf.c gf_method.c gf_wgen.c gf_w4.c gf_w8.c gf_w16.c gf_w32.c \ gf_w64.c gf_w128.c gf_rand.c gf_general.c + +if HAVE_NEON +libgf_complete_la_SOURCES += neon/gf_w4_neon.c +endif + libgf_complete_la_LDFLAGS = -version-info 1:0:0 diff --git a/src/gf_w4.c b/src/gf_w4.c index f098323..0e86aa8 100644 --- a/src/gf_w4.c +++ b/src/gf_w4.c @@ -11,49 +11,7 @@ #include "gf_int.h" #include #include - -#define GF_FIELD_WIDTH 4 -#define GF_DOUBLE_WIDTH (GF_FIELD_WIDTH*2) -#define GF_FIELD_SIZE (1 << GF_FIELD_WIDTH) -#define GF_MULT_GROUP_SIZE (GF_FIELD_SIZE-1) - -/* ------------------------------------------------------------ - JSP: Each implementation has its own data, which is allocated - at one time as part of the handle. For that reason, it - shouldn't be hierarchical -- i.e. one should be able to - allocate it with one call to malloc. */ - -struct gf_logtable_data { - uint8_t log_tbl[GF_FIELD_SIZE]; - uint8_t antilog_tbl[GF_FIELD_SIZE * 2]; - uint8_t *antilog_tbl_div; -}; - -struct gf_single_table_data { - uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE]; - uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE]; -}; - -struct gf_double_table_data { - uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE]; - uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE*GF_FIELD_SIZE]; -}; -struct gf_quad_table_data { - uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE]; - uint16_t mult[GF_FIELD_SIZE][(1<<16)]; -}; - -struct gf_quad_table_lazy_data { - uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE]; - uint8_t smult[GF_FIELD_SIZE][GF_FIELD_SIZE]; - uint16_t mult[(1 << 16)]; -}; - -struct gf_bytwo_data { - uint64_t prim_poly; - uint64_t mask1; - uint64_t mask2; -}; +#include "gf_w4.h" #define AB2(ip, am1 ,am2, b, t1, t2) {\ t1 = (b << 1) & am1;\ @@ -489,11 +447,15 @@ int gf_w4_single_table_init(gf_t *gf) gf->inverse.w32 = NULL; gf->divide.w32 = gf_w4_single_table_divide; gf->multiply.w32 = gf_w4_single_table_multiply; - #ifdef INTEL_SSSE3 + #if defined(INTEL_SSSE3) || defined(ARM_NEON) if(h->region_type & (GF_REGION_NOSIMD | GF_REGION_CAUCHY)) gf->multiply_region.w32 = gf_w4_single_table_multiply_region; else + #if defined(INTEL_SSSE3) gf->multiply_region.w32 = gf_w4_single_table_sse_multiply_region; + #elif defined(ARM_NEON) + gf_w4_neon_single_table_init(gf); + #endif #else gf->multiply_region.w32 = gf_w4_single_table_multiply_region; if (h->region_type & GF_REGION_SIMD) return 0; @@ -774,16 +736,16 @@ int gf_w4_table_init(gf_t *gf) { int rt; gf_internal_t *h; - int issse3 = 0; + int simd = 0; -#ifdef INTEL_SSSE3 - issse3 = 1; +#if defined(INTEL_SSSE3) || defined(ARM_NEON) + simd = 1; #endif h = (gf_internal_t *) gf->scratch; rt = (h->region_type); - if (h->mult_type == GF_MULT_DEFAULT && !issse3) rt |= GF_REGION_DOUBLE_TABLE; + if (h->mult_type == GF_MULT_DEFAULT && !simd) rt |= GF_REGION_DOUBLE_TABLE; if (rt & GF_REGION_DOUBLE_TABLE) { return gf_w4_double_table_init(gf); @@ -1937,6 +1899,8 @@ int gf_w4_cfm_init(gf_t *gf) #if defined(INTEL_SSE4_PCLMUL) gf->multiply.w32 = gf_w4_clm_multiply; return 1; +#elif defined(ARM_NEON) + return gf_w4_neon_cfm_init(gf); #endif return 0; } @@ -1953,11 +1917,14 @@ int gf_w4_shift_init(gf_t *gf) int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) { - int issse3 = 0; + int issse3 = 0, isneon = 0; #ifdef INTEL_SSSE3 issse3 = 1; #endif +#ifdef ARM_NEON + isneon = 1; +#endif switch(mult_type) { @@ -1971,7 +1938,8 @@ int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1 return sizeof(gf_internal_t) + sizeof(struct gf_single_table_data) + 64; } - if (mult_type == GF_MULT_DEFAULT && !issse3) region_type = GF_REGION_DOUBLE_TABLE; + if (mult_type == GF_MULT_DEFAULT && !(issse3 || isneon)) + region_type = GF_REGION_DOUBLE_TABLE; if (region_type & GF_REGION_DOUBLE_TABLE) { return sizeof(gf_internal_t) + sizeof(struct gf_double_table_data) + 64; diff --git a/src/neon/gf_w4_neon.c b/src/neon/gf_w4_neon.c new file mode 100644 index 0000000..3a21432 --- /dev/null +++ b/src/neon/gf_w4_neon.c @@ -0,0 +1,247 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * Copyright (c) 2014: Janne Grunau + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * - Neither the name of the University of Tennessee nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * gf_w4_neon.c + * + * Neon routines for 4-bit Galois fields + * + */ + +#include "gf_int.h" +#include +#include +#include "gf_w4.h" + +static +gf_val_32_t +gf_w4_neon_clm_multiply (gf_t *gf, gf_val_32_t a4, gf_val_32_t b4) +{ + gf_val_32_t rv = 0; + poly8x8_t result, prim_poly; + poly8x8_t a, b, w; + uint8x8_t v; + gf_internal_t * h = gf->scratch; + + a = vdup_n_p8 (a4); + b = vdup_n_p8 (b4); + + prim_poly = vdup_n_p8 ((uint32_t)(h->prim_poly & 0x1fULL)); + + /* Do the initial multiply */ + result = vmul_p8 (a, b); + v = vshr_n_u8 (vreinterpret_u8_p8(result), 4); + w = vmul_p8 (prim_poly, vreinterpret_p8_u8(v)); + result = vreinterpret_p8_u8 (veor_u8 (vreinterpret_u8_p8(result), vreinterpret_u8_p8(w))); + + /* Extracts 32 bit value from result. */ + rv = (gf_val_32_t)vget_lane_u8 (vreinterpret_u8_p8 (result), 0); + + return rv; +} + +static inline void +neon_clm_multiply_region_from_single (gf_t *gf, uint8_t *s8, uint8_t *d8, + gf_val_32_t val, uint8_t *d_end, int xor) +{ + gf_internal_t * h = gf->scratch; + poly8x8_t prim_poly; + poly8x8_t a, w, even, odd; + uint8x8_t b, c, v, mask; + + a = vdup_n_p8 (val); + mask = vdup_n_u8 (0xf); + prim_poly = vdup_n_p8 ((uint8_t)(h->prim_poly & 0x1fULL)); + + while (d8 < d_end) { + b = vld1_u8 (s8); + + even = vreinterpret_p8_u8 (vand_u8 (b, mask)); + odd = vreinterpret_p8_u8 (vshr_n_u8 (b, 4)); + + if (xor) + c = vld1_u8 (d8); + + even = vmul_p8 (a, even); + odd = vmul_p8 (a, odd); + + v = vshr_n_u8 (vreinterpret_u8_p8(even), 4); + w = vmul_p8 (prim_poly, vreinterpret_p8_u8(v)); + even = vreinterpret_p8_u8 (veor_u8 (vreinterpret_u8_p8(even), vreinterpret_u8_p8(w))); + + v = vshr_n_u8 (vreinterpret_u8_p8(odd), 4); + w = vmul_p8 (prim_poly, vreinterpret_p8_u8(v)); + odd = vreinterpret_p8_u8 (veor_u8 (vreinterpret_u8_p8(odd), vreinterpret_u8_p8(w))); + + v = veor_u8 (vreinterpret_u8_p8 (even), vshl_n_u8 (vreinterpret_u8_p8 (odd), 4)); + + if (xor) + v = veor_u8 (c, v); + + vst1_u8 (d8, v); + + d8 += 8; + s8 += 8; + } +} + + +static void +gf_w4_neon_clm_multiply_region_from_single (gf_t *gf, void *src, void *dest, + gf_val_32_t val, int bytes, int xor) +{ + gf_region_data rd; + uint8_t *s8; + uint8_t *d8; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + + if (xor) + neon_clm_multiply_region_from_single (gf, s8, d8, val, rd.d_top, 1); + else + neon_clm_multiply_region_from_single (gf, s8, d8, val, rd.d_top, 0); + + gf_do_final_region_alignment(&rd); +} + +#ifndef ARCH_AARCH64 +#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)), \ + vtbl2_u8(tbl, vget_high_u8(v))) +#endif + +static +inline +void +w4_single_table_multiply_region_neon(gf_t *gf, uint8_t *src, uint8_t *dst, + uint8_t * d_end, gf_val_32_t val, int xor) +{ + struct gf_single_table_data *std; + uint8_t *base; + uint8x16_t r, va, vh, vl, loset; + +#ifdef ARCH_AARCH64 + uint8x16_t th, tl; +#else + uint8x8x2_t th, tl; +#endif + + std = (struct gf_single_table_data *) ((gf_internal_t *) (gf->scratch))->private; + base = (uint8_t *) std->mult; + base += (val << GF_FIELD_WIDTH); + +#ifdef ARCH_AARCH64 + tl = vld1q_u8 (base); + th = vshlq_n_u8 (tl, 4); +#else + tl.val[0] = vld1_u8 (base); + tl.val[1] = vld1_u8 (base + 8); + th.val[0] = vshl_n_u8 (tl.val[0], 4); + th.val[1] = vshl_n_u8 (tl.val[1], 4); +#endif + + loset = vdupq_n_u8(0xf); + + while (dst < d_end) { + va = vld1q_u8 (src); + + vh = vshrq_n_u8 (va, 4); + vl = vandq_u8 (va, loset); + + if (xor) + va = vld1q_u8 (dst); + + vh = vqtbl1q_u8 (th, vh); + vl = vqtbl1q_u8 (tl, vl); + + r = veorq_u8 (vh, vl); + + if (xor) + r = veorq_u8 (va, r); + + vst1q_u8 (dst, r); + + dst += 16; + src += 16; + } +} + +static +void +gf_w4_single_table_multiply_region_neon(gf_t *gf, void *src, void *dest, + gf_val_32_t val, int bytes, int xor) +{ + gf_region_data rd; + uint8_t *sptr, *dptr, *top; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + sptr = rd.s_start; + dptr = rd.d_start; + top = rd.d_top; + + if (xor) + w4_single_table_multiply_region_neon(gf, sptr, dptr, top, val, 1); + else + w4_single_table_multiply_region_neon(gf, sptr, dptr, top, val, 0); + + gf_do_final_region_alignment(&rd); + +} + + +int gf_w4_neon_cfm_init(gf_t *gf) +{ + // single clm multiplication probably pointless + gf->multiply.w32 = gf_w4_neon_clm_multiply; + gf->multiply_region.w32 = gf_w4_neon_clm_multiply_region_from_single; + + return 1; +} + +void gf_w4_neon_single_table_init(gf_t *gf) +{ + gf->multiply_region.w32 = gf_w4_single_table_multiply_region_neon; +} -- cgit v1.2.1 From bec15359de5273d06673c43b8e73c70f97396041 Mon Sep 17 00:00:00 2001 From: Janne Grunau Date: Wed, 3 Sep 2014 16:57:06 +0200 Subject: arm: NEON optimisations for gf_w8 Optimisations for the 4,4 split table region multiplication and carry less multiplication using NEON's polynomial long multiplication. arm: w8: NEON carry less multiplication Selected time_tool.sh results for a 1.7GHz cortex-a9: Region Best (MB/s): 375.86 W-Method: 8 -m CARRY_FREE - Region Best (MB/s): 142.94 W-Method: 8 -m TABLE - Region Best (MB/s): 225.01 W-Method: 8 -m TABLE -r DOUBLE - Region Best (MB/s): 211.23 W-Method: 8 -m TABLE -r DOUBLE -r LAZY - Region Best (MB/s): 160.09 W-Method: 8 -m LOG - Region Best (MB/s): 123.61 W-Method: 8 -m LOG_ZERO - Region Best (MB/s): 123.85 W-Method: 8 -m LOG_ZERO_EXT - Region Best (MB/s): 1183.79 W-Method: 8 -m SPLIT 8 4 -r SIMD - Region Best (MB/s): 177.68 W-Method: 8 -m SPLIT 8 4 -r NOSIMD - Region Best (MB/s): 87.85 W-Method: 8 -m COMPOSITE 2 - - Region Best (MB/s): 428.59 W-Method: 8 -m COMPOSITE 2 - -r ALTMAP - --- include/gf_w8.h | 99 +++++++++++++++++ src/Makefile.am | 3 +- src/gf.c | 5 + src/gf_w8.c | 108 ++++-------------- src/neon/gf_w8_neon.c | 302 ++++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 428 insertions(+), 89 deletions(-) create mode 100644 include/gf_w8.h create mode 100644 src/neon/gf_w8_neon.c diff --git a/include/gf_w8.h b/include/gf_w8.h new file mode 100644 index 0000000..938fcfd --- /dev/null +++ b/include/gf_w8.h @@ -0,0 +1,99 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * gf_w8.c + * + * Defines and data stuctures for 8-bit Galois fields + */ + +#ifndef GF_COMPLETE_GF_W8_H +#define GF_COMPLETE_GF_W8_H + +#include "gf_int.h" +#include + +#define GF_FIELD_WIDTH (8) +#define GF_FIELD_SIZE (1 << GF_FIELD_WIDTH) +#define GF_HALF_SIZE (1 << (GF_FIELD_WIDTH/2)) +#define GF_MULT_GROUP_SIZE GF_FIELD_SIZE-1 + +#define GF_BASE_FIELD_WIDTH (4) +#define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH) + +struct gf_w8_logtable_data { + uint8_t log_tbl[GF_FIELD_SIZE]; + uint8_t antilog_tbl[GF_FIELD_SIZE * 2]; + uint8_t inv_tbl[GF_FIELD_SIZE]; +}; + +struct gf_w8_logzero_table_data { + short log_tbl[GF_FIELD_SIZE]; /* Make this signed, so that we can divide easily */ + uint8_t antilog_tbl[512+512+1]; + uint8_t *div_tbl; + uint8_t *inv_tbl; +}; + +struct gf_w8_logzero_small_table_data { + short log_tbl[GF_FIELD_SIZE]; /* Make this signed, so that we can divide easily */ + uint8_t antilog_tbl[255*3]; + uint8_t inv_tbl[GF_FIELD_SIZE]; + uint8_t *div_tbl; +}; + +struct gf_w8_composite_data { + uint8_t *mult_table; +}; + +/* Don't change the order of these relative to gf_w8_half_table_data */ + +struct gf_w8_default_data { + uint8_t high[GF_FIELD_SIZE][GF_HALF_SIZE]; + uint8_t low[GF_FIELD_SIZE][GF_HALF_SIZE]; + uint8_t divtable[GF_FIELD_SIZE][GF_FIELD_SIZE]; + uint8_t multtable[GF_FIELD_SIZE][GF_FIELD_SIZE]; +}; + +struct gf_w8_half_table_data { + uint8_t high[GF_FIELD_SIZE][GF_HALF_SIZE]; + uint8_t low[GF_FIELD_SIZE][GF_HALF_SIZE]; +}; + +struct gf_w8_single_table_data { + uint8_t divtable[GF_FIELD_SIZE][GF_FIELD_SIZE]; + uint8_t multtable[GF_FIELD_SIZE][GF_FIELD_SIZE]; +}; + +struct gf_w8_double_table_data { + uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE]; + uint16_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE*GF_FIELD_SIZE]; +}; + +struct gf_w8_double_table_lazy_data { + uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE]; + uint8_t smult[GF_FIELD_SIZE][GF_FIELD_SIZE]; + uint16_t mult[GF_FIELD_SIZE*GF_FIELD_SIZE]; +}; + +struct gf_w4_logtable_data { + uint8_t log_tbl[GF_BASE_FIELD_SIZE]; + uint8_t antilog_tbl[GF_BASE_FIELD_SIZE * 2]; + uint8_t *antilog_tbl_div; +}; + +struct gf_w4_single_table_data { + uint8_t div[GF_BASE_FIELD_SIZE][GF_BASE_FIELD_SIZE]; + uint8_t mult[GF_BASE_FIELD_SIZE][GF_BASE_FIELD_SIZE]; +}; + +struct gf_w8_bytwo_data { + uint64_t prim_poly; + uint64_t mask1; + uint64_t mask2; +}; + +int gf_w8_neon_cfm_init(gf_t *gf); +void gf_w8_neon_split_init(gf_t *gf); + +#endif /* GF_COMPLETE_GF_W8_H */ diff --git a/src/Makefile.am b/src/Makefile.am index 5352d12..3e568d9 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -11,7 +11,8 @@ libgf_complete_la_SOURCES = gf.c gf_method.c gf_wgen.c gf_w4.c gf_w8.c gf_w16.c gf_w64.c gf_w128.c gf_rand.c gf_general.c if HAVE_NEON -libgf_complete_la_SOURCES += neon/gf_w4_neon.c +libgf_complete_la_SOURCES += neon/gf_w4_neon.c \ + neon/gf_w8_neon.c endif libgf_complete_la_LDFLAGS = -version-info 1:0:0 diff --git a/src/gf.c b/src/gf.c index c3801e7..6d34c46 100644 --- a/src/gf.c +++ b/src/gf.c @@ -217,6 +217,11 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type, pclmul = 1; #endif +#ifdef ARM_NEON + pclmul = 1; + sse3 = 1; +#endif + if (w < 1 || (w > 32 && w != 64 && w != 128)) { _gf_errno = GF_E_BAD___W; return 0; } diff --git a/src/gf_w8.c b/src/gf_w8.c index bc4f5d1..8449298 100644 --- a/src/gf_w8.c +++ b/src/gf_w8.c @@ -9,88 +9,10 @@ */ #include "gf_int.h" +#include "gf_w8.h" #include #include -#define GF_FIELD_WIDTH (8) -#define GF_FIELD_SIZE (1 << GF_FIELD_WIDTH) -#define GF_HALF_SIZE (1 << (GF_FIELD_WIDTH/2)) -#define GF_MULT_GROUP_SIZE GF_FIELD_SIZE-1 - -#define GF_BASE_FIELD_WIDTH (4) -#define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH) - -struct gf_w8_logtable_data { - uint8_t log_tbl[GF_FIELD_SIZE]; - uint8_t antilog_tbl[GF_FIELD_SIZE * 2]; - uint8_t inv_tbl[GF_FIELD_SIZE]; -}; - -struct gf_w8_logzero_table_data { - short log_tbl[GF_FIELD_SIZE]; /* Make this signed, so that we can divide easily */ - uint8_t antilog_tbl[512+512+1]; - uint8_t *div_tbl; - uint8_t *inv_tbl; -}; - -struct gf_w8_logzero_small_table_data { - short log_tbl[GF_FIELD_SIZE]; /* Make this signed, so that we can divide easily */ - uint8_t antilog_tbl[255*3]; - uint8_t inv_tbl[GF_FIELD_SIZE]; - uint8_t *div_tbl; -}; - -struct gf_w8_composite_data { - uint8_t *mult_table; -}; - -/* Don't change the order of these relative to gf_w8_half_table_data */ - -struct gf_w8_default_data { - uint8_t high[GF_FIELD_SIZE][GF_HALF_SIZE]; - uint8_t low[GF_FIELD_SIZE][GF_HALF_SIZE]; - uint8_t divtable[GF_FIELD_SIZE][GF_FIELD_SIZE]; - uint8_t multtable[GF_FIELD_SIZE][GF_FIELD_SIZE]; -}; - -struct gf_w8_half_table_data { - uint8_t high[GF_FIELD_SIZE][GF_HALF_SIZE]; - uint8_t low[GF_FIELD_SIZE][GF_HALF_SIZE]; -}; - -struct gf_w8_single_table_data { - uint8_t divtable[GF_FIELD_SIZE][GF_FIELD_SIZE]; - uint8_t multtable[GF_FIELD_SIZE][GF_FIELD_SIZE]; -}; - -struct gf_w8_double_table_data { - uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE]; - uint16_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE*GF_FIELD_SIZE]; -}; - -struct gf_w8_double_table_lazy_data { - uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE]; - uint8_t smult[GF_FIELD_SIZE][GF_FIELD_SIZE]; - uint16_t mult[GF_FIELD_SIZE*GF_FIELD_SIZE]; -}; - -struct gf_w4_logtable_data { - uint8_t log_tbl[GF_BASE_FIELD_SIZE]; - uint8_t antilog_tbl[GF_BASE_FIELD_SIZE * 2]; - uint8_t *antilog_tbl_div; -}; - -struct gf_w4_single_table_data { - uint8_t div[GF_BASE_FIELD_SIZE][GF_BASE_FIELD_SIZE]; - uint8_t mult[GF_BASE_FIELD_SIZE][GF_BASE_FIELD_SIZE]; -}; - -struct gf_w8_bytwo_data { - uint64_t prim_poly; - uint64_t mask1; - uint64_t mask2; -}; - #define AB2(ip, am1 ,am2, b, t1, t2) {\ t1 = (b << 1) & am1;\ t2 = b & am2; \ @@ -603,6 +525,8 @@ int gf_w8_cfm_init(gf_t *gf) return 0; } return 1; +#elif defined(ARM_NEON) + return gf_w8_neon_cfm_init(gf); #endif return 0; @@ -938,7 +862,7 @@ gf_w8_default_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) return (ftd->multtable[a][b]); } -#ifdef INTEL_SSSE3 +#if defined(INTEL_SSSE3) || defined(ARM_NEON) static gf_val_32_t gf_w8_default_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) @@ -1179,11 +1103,15 @@ int gf_w8_split_init(gf_t *gf) gf->multiply.w32 = gf_w8_split_multiply; - #ifdef INTEL_SSSE3 + #if defined(INTEL_SSSE3) || defined(ARM_NEON) if (h->region_type & GF_REGION_NOSIMD) gf->multiply_region.w32 = gf_w8_split_multiply_region; else + #if defined(INTEL_SSSE3) gf->multiply_region.w32 = gf_w8_split_multiply_region_sse; + #elif defined(ARM_NEON) + gf_w8_neon_split_init(gf); + #endif #else gf->multiply_region.w32 = gf_w8_split_multiply_region; if(h->region_type & GF_REGION_SIMD) @@ -1205,17 +1133,17 @@ int gf_w8_table_init(gf_t *gf) struct gf_w8_double_table_data *dtd = NULL; struct gf_w8_double_table_lazy_data *ltd = NULL; struct gf_w8_default_data *dd = NULL; - int a, b, c, prod, scase, issse; + int a, b, c, prod, scase, use_simd; h = (gf_internal_t *) gf->scratch; -#ifdef INTEL_SSSE3 - issse = 1; +#if defined(INTEL_SSSE3) || defined(ARM_NEON) + use_simd = 1; #else - issse = 0; + use_simd = 0; #endif - if (h->mult_type == GF_MULT_DEFAULT && issse) { + if (h->mult_type == GF_MULT_DEFAULT && use_simd) { dd = (struct gf_w8_default_data *)h->private; scase = 3; bzero(dd->high, sizeof(uint8_t) * GF_FIELD_SIZE * GF_HALF_SIZE); @@ -1290,10 +1218,14 @@ int gf_w8_table_init(gf_t *gf) gf->multiply_region.w32 = gf_w8_double_table_multiply_region; break; case 3: -#ifdef INTEL_SSSE3 +#if defined(INTEL_SSSE3) || defined(ARM_NEON) gf->divide.w32 = gf_w8_default_divide; gf->multiply.w32 = gf_w8_default_multiply; +#if defined(INTEL_SSSE3) gf->multiply_region.w32 = gf_w8_split_multiply_region_sse; +#elif defined(ARM_NEON) + gf_w8_neon_split_init(gf); +#endif #endif break; } @@ -2296,7 +2228,7 @@ int gf_w8_scratch_size(int mult_type, int region_type, int divide_type, int arg1 switch(mult_type) { case GF_MULT_DEFAULT: -#ifdef INTEL_SSSE3 +#if defined(INTEL_SSSE3) || defined(ARM_NEON) return sizeof(gf_internal_t) + sizeof(struct gf_w8_default_data) + 64; #endif return sizeof(gf_internal_t) + sizeof(struct gf_w8_single_table_data) + 64; diff --git a/src/neon/gf_w8_neon.c b/src/neon/gf_w8_neon.c new file mode 100644 index 0000000..930a916 --- /dev/null +++ b/src/neon/gf_w8_neon.c @@ -0,0 +1,302 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * Copyright (c) 2014: Janne Grunau + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * - Neither the name of the University of Tennessee nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * gf_w8_neon.c + * + * Neon optimized routines for 8-bit Galois fields + * + */ + +#include "gf_int.h" +#include "gf_w8.h" +#include +#include + +/* ARM NEON reducing macro for the carry free multiplication + * vmull_p8 is the carryless multiply operation. Here vshrn_n_u16 shifts + * the result to the right by 1 byte. This allows us to multiply + * the prim_poly by the leading bits of the result. We then xor the result + * of that operation back with the result. */ +#define NEON_CFM_REDUCE(v, w, result, prim_poly, initial) \ + do { \ + if (initial) \ + v = vshrn_n_u16 (vreinterpretq_u16_p16(result), 8); \ + else \ + v = veor_u8 (v, vshrn_n_u16 (vreinterpretq_u16_p16(result), 8)); \ + w = vmull_p8 (prim_poly, vreinterpret_p8_u8(v)); \ + result = vreinterpretq_p16_u16 (veorq_u16 (vreinterpretq_u16_p16(result), vreinterpretq_u16_p16(w))); \ + } while (0) + +static +inline +gf_val_32_t +gf_w8_neon_clm_multiply_x (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8, int x) +{ + gf_val_32_t rv = 0; + poly8x8_t a, b; + uint8x8_t v; + poly16x8_t result; + poly8x8_t prim_poly; + poly16x8_t w; + gf_internal_t * h = gf->scratch; + + a = vdup_n_p8 (a8); + b = vdup_n_p8 (b8); + + prim_poly = vdup_n_p8 ((uint32_t)(h->prim_poly & 0x1ffULL)); + + /* Do the initial multiply */ + result = vmull_p8 (a, b); + + /* Ben: Do prim_poly reduction twice. We are guaranteed that we will only + have to do the reduction at most twice, because (w-2)/z == 2. Where + z is equal to the number of zeros after the leading 1 */ + NEON_CFM_REDUCE (v, w, result, prim_poly, 1); + NEON_CFM_REDUCE (v, w, result, prim_poly, 0); + if (x >= 3) { + NEON_CFM_REDUCE (v, w, result, prim_poly, 0); + } + if (x >= 4) { + NEON_CFM_REDUCE (v, w, result, prim_poly, 0); + } + /* Extracts 32 bit value from result. */ + rv = (gf_val_32_t)vget_lane_u8 (vmovn_u16 (vreinterpretq_u16_p16 (result)), 0); + + return rv; +} + +#define CLM_MULTIPLY(x) \ +static gf_val_32_t gf_w8_neon_clm_multiply_ ## x (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8) \ +{\ + return gf_w8_neon_clm_multiply_x (gf, a8, b8, x);\ +} + +CLM_MULTIPLY(2) +CLM_MULTIPLY(3) +CLM_MULTIPLY(4) + +static inline void +neon_clm_multiply_region_from_single_x(gf_t *gf, uint8_t *s8, uint8_t *d8, + gf_val_32_t val, uint8_t *d_end, + int xor, int x) +{ + gf_internal_t * h = gf->scratch; + poly8x8_t a, b; + uint8x8_t c, v; + poly16x8_t result; + poly8x8_t prim_poly; + poly16x8_t w; + + a = vdup_n_p8 (val); + prim_poly = vdup_n_p8 ((uint8_t)(h->prim_poly & 0xffULL)); + + while (d8 < d_end) { + b = vld1_p8 ((poly8_t *) s8); + + if (xor) + c = vld1_u8 (d8); + + result = vmull_p8 (a, b); + + NEON_CFM_REDUCE(v, w, result, prim_poly, 1); + NEON_CFM_REDUCE (v, w, result, prim_poly, 0); + if (x >= 3) { + NEON_CFM_REDUCE (v, w, result, prim_poly, 0); + } + if (x >= 4) { + NEON_CFM_REDUCE (v, w, result, prim_poly, 0); + } + v = vmovn_u16 (vreinterpretq_u16_p16 (result)); + if (xor) + v = veor_u8 (c, v); + + vst1_u8 (d8, v); + + d8 += 8; + s8 += 8; + } +} + +#define CLM_MULT_REGION(x) \ +static void \ +gf_w8_neon_clm_multiply_region_from_single_ ## x (gf_t *gf, void *src, \ + void *dest, \ + gf_val_32_t val, int bytes, \ + int xor) \ +{ \ + gf_region_data rd; \ + uint8_t *s8; \ + uint8_t *d8; \ + \ + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } \ + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } \ + \ + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); \ + gf_do_initial_region_alignment(&rd); \ + s8 = (uint8_t *) rd.s_start; \ + d8 = (uint8_t *) rd.d_start; \ + \ + if (xor) \ + neon_clm_multiply_region_from_single_x (gf, s8, d8, val, rd.d_top, 1, x); \ + else \ + neon_clm_multiply_region_from_single_x (gf, s8, d8, val, rd.d_top, 0, x);\ + gf_do_final_region_alignment(&rd); \ +} + +CLM_MULT_REGION(2) +CLM_MULT_REGION(3) +CLM_MULT_REGION(4) + + +int gf_w8_neon_cfm_init(gf_t *gf) +{ + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + + if ((0xe0 & h->prim_poly) == 0){ + gf->multiply.w32 = gf_w8_neon_clm_multiply_2; + gf->multiply_region.w32 = gf_w8_neon_clm_multiply_region_from_single_2; + }else if ((0xc0 & h->prim_poly) == 0){ + gf->multiply.w32 = gf_w8_neon_clm_multiply_3; + gf->multiply_region.w32 = gf_w8_neon_clm_multiply_region_from_single_3; + }else if ((0x80 & h->prim_poly) == 0){ + gf->multiply.w32 = gf_w8_neon_clm_multiply_4; + gf->multiply_region.w32 = gf_w8_neon_clm_multiply_region_from_single_4; + }else{ + return 0; + } + return 1; +} + +#ifndef ARCH_AARCH64 +#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)), \ + vtbl2_u8(tbl, vget_high_u8(v))) +#endif + +static +void +gf_w8_split_multiply_region_neon(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint8_t *bh, *bl, *sptr, *dptr; + uint8x16_t r, va, vh, vl, loset; +#ifdef ARCH_AARCH64 + uint8x16_t mth, mtl; +#else + uint8x8x2_t mth, mtl; +#endif + struct gf_w8_half_table_data *htd; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + htd = (struct gf_w8_half_table_data *) ((gf_internal_t *) (gf->scratch))->private; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + bh = (uint8_t *) htd->high; + bh += (val << 4); + bl = (uint8_t *) htd->low; + bl += (val << 4); + + sptr = rd.s_start; + dptr = rd.d_start; + +#ifdef ARCH_AARCH64 + mth = vld1q_u8 (bh); + mtl = vld1q_u8 (bl); +#else + mth.val[0] = vld1_u8 (bh); + mtl.val[0] = vld1_u8 (bl); + mth.val[1] = vld1_u8 (bh + 8); + mtl.val[1] = vld1_u8 (bl + 8); +#endif + + loset = vdupq_n_u8(0xf); + + if (xor) { + while (sptr < (uint8_t *) rd.s_top) { + va = vld1q_u8 (sptr); + + vh = vshrq_n_u8 (va, 4); + vl = vandq_u8 (va, loset); + va = vld1q_u8 (dptr); + + vh = vqtbl1q_u8 (mth, vh); + vl = vqtbl1q_u8 (mtl, vl); + + r = veorq_u8 (vh, vl); + + vst1q_u8 (dptr, veorq_u8 (va, r)); + + dptr += 16; + sptr += 16; + } + } else { + while (sptr < (uint8_t *) rd.s_top) { + va = vld1q_u8 (sptr); + + vh = vshrq_n_u8 (va, 4); + vl = vandq_u8 (va, loset); +#ifdef ARCH_AARCH64 + vh = vqtbl1q_u8 (mth, vh); + vl = vqtbl1q_u8 (mtl, vl); +#else + vh = vcombine_u8 (vtbl2_u8 (mth, vget_low_u8 (vh)), + vtbl2_u8 (mth, vget_high_u8 (vh))); + vl = vcombine_u8 (vtbl2_u8 (mtl, vget_low_u8 (vl)), + vtbl2_u8 (mtl, vget_high_u8 (vl))); +#endif + + r = veorq_u8 (vh, vl); + + vst1q_u8(dptr, r); + + dptr += 16; + sptr += 16; + } + } + + gf_do_final_region_alignment(&rd); +} + + +void gf_w8_neon_split_init(gf_t *gf) +{ + gf->multiply_region.w32 = gf_w8_split_multiply_region_neon; +} -- cgit v1.2.1 From 474010a91d35fef5ca7dea77205b6a5c7e68c3e9 Mon Sep 17 00:00:00 2001 From: Janne Grunau Date: Wed, 17 Sep 2014 16:10:25 +0200 Subject: arm: NEON optimisations for gf_w16 Optimisations for the 4,16 split table region multiplications. Selected time_tool.sh 16 -A -B results for a 1.7 GHz cortex-a9: Region Best (MB/s): 532.14 W-Method: 16 -m SPLIT 16 4 -r SIMD - Region Best (MB/s): 212.34 W-Method: 16 -m SPLIT 16 4 -r NOSIMD - Region Best (MB/s): 801.36 W-Method: 16 -m SPLIT 16 4 -r SIMD -r ALTMAP - Region Best (MB/s): 93.20 W-Method: 16 -m SPLIT 16 4 -r NOSIMD -r ALTMAP - Region Best (MB/s): 273.99 W-Method: 16 -m SPLIT 16 8 - Region Best (MB/s): 270.81 W-Method: 16 -m SPLIT 8 8 - Region Best (MB/s): 70.42 W-Method: 16 -m COMPOSITE 2 - - Region Best (MB/s): 393.54 W-Method: 16 -m COMPOSITE 2 - -r ALTMAP - --- include/gf_w16.h | 66 +++++++++ src/Makefile.am | 3 +- src/gf_w16.c | 61 ++------- src/neon/gf_w16_neon.c | 356 +++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 435 insertions(+), 51 deletions(-) create mode 100644 include/gf_w16.h create mode 100644 src/neon/gf_w16_neon.c diff --git a/include/gf_w16.h b/include/gf_w16.h new file mode 100644 index 0000000..fb4c0e9 --- /dev/null +++ b/include/gf_w16.h @@ -0,0 +1,66 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * gf_w16.h + * + * Defines and data structures for 16-bit Galois fields + */ + +#ifndef GF_COMPLETE_GF_W16_H +#define GF_COMPLETE_GF_W16_H + +#include + +#define GF_FIELD_WIDTH (16) +#define GF_FIELD_SIZE (1 << GF_FIELD_WIDTH) +#define GF_MULT_GROUP_SIZE GF_FIELD_SIZE-1 + +#define GF_BASE_FIELD_WIDTH (8) +#define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH) + +struct gf_w16_logtable_data { + uint16_t log_tbl[GF_FIELD_SIZE]; + uint16_t antilog_tbl[GF_FIELD_SIZE * 2]; + uint16_t inv_tbl[GF_FIELD_SIZE]; + uint16_t *d_antilog; +}; + +struct gf_w16_zero_logtable_data { + int log_tbl[GF_FIELD_SIZE]; + uint16_t _antilog_tbl[GF_FIELD_SIZE * 4]; + uint16_t *antilog_tbl; + uint16_t inv_tbl[GF_FIELD_SIZE]; +}; + +struct gf_w16_lazytable_data { + uint16_t log_tbl[GF_FIELD_SIZE]; + uint16_t antilog_tbl[GF_FIELD_SIZE * 2]; + uint16_t inv_tbl[GF_FIELD_SIZE]; + uint16_t *d_antilog; + uint16_t lazytable[GF_FIELD_SIZE]; +}; + +struct gf_w16_bytwo_data { + uint64_t prim_poly; + uint64_t mask1; + uint64_t mask2; +}; + +struct gf_w16_split_8_8_data { + uint16_t tables[3][256][256]; +}; + +struct gf_w16_group_4_4_data { + uint16_t reduce[16]; + uint16_t shift[16]; +}; + +struct gf_w16_composite_data { + uint8_t *mult_table; +}; + +void gf_w16_neon_split_init(gf_t *gf); + +#endif /* GF_COMPLETE_GF_W16_H */ diff --git a/src/Makefile.am b/src/Makefile.am index 3e568d9..f04042b 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -12,7 +12,8 @@ libgf_complete_la_SOURCES = gf.c gf_method.c gf_wgen.c gf_w4.c gf_w8.c gf_w16.c if HAVE_NEON libgf_complete_la_SOURCES += neon/gf_w4_neon.c \ - neon/gf_w8_neon.c + neon/gf_w8_neon.c \ + neon/gf_w16_neon.c endif libgf_complete_la_LDFLAGS = -version-info 1:0:0 diff --git a/src/gf_w16.c b/src/gf_w16.c index 0904115..ce47849 100644 --- a/src/gf_w16.c +++ b/src/gf_w16.c @@ -11,54 +11,7 @@ #include "gf_int.h" #include #include - -#define GF_FIELD_WIDTH (16) -#define GF_FIELD_SIZE (1 << GF_FIELD_WIDTH) -#define GF_MULT_GROUP_SIZE GF_FIELD_SIZE-1 - -#define GF_BASE_FIELD_WIDTH (8) -#define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH) - -struct gf_w16_logtable_data { - uint16_t log_tbl[GF_FIELD_SIZE]; - uint16_t antilog_tbl[GF_FIELD_SIZE * 2]; - uint16_t inv_tbl[GF_FIELD_SIZE]; - uint16_t *d_antilog; -}; - -struct gf_w16_zero_logtable_data { - int log_tbl[GF_FIELD_SIZE]; - uint16_t _antilog_tbl[GF_FIELD_SIZE * 4]; - uint16_t *antilog_tbl; - uint16_t inv_tbl[GF_FIELD_SIZE]; -}; - -struct gf_w16_lazytable_data { - uint16_t log_tbl[GF_FIELD_SIZE]; - uint16_t antilog_tbl[GF_FIELD_SIZE * 2]; - uint16_t inv_tbl[GF_FIELD_SIZE]; - uint16_t *d_antilog; - uint16_t lazytable[GF_FIELD_SIZE]; -}; - -struct gf_w16_bytwo_data { - uint64_t prim_poly; - uint64_t mask1; - uint64_t mask2; -}; - -struct gf_w16_split_8_8_data { - uint16_t tables[3][256][256]; -}; - -struct gf_w16_group_4_4_data { - uint16_t reduce[16]; - uint16_t shift[16]; -}; - -struct gf_w16_composite_data { - uint8_t *mult_table; -}; +#include "gf_w16.h" #define AB2(ip, am1 ,am2, b, t1, t2) {\ t1 = (b << 1) & am1;\ @@ -1264,6 +1217,7 @@ int gf_w16_split_init(gf_t *gf) gf_internal_t *h; struct gf_w16_split_8_8_data *d8; int i, j, exp, issse3; + int isneon = 0; uint32_t p, basep; h = (gf_internal_t *) gf->scratch; @@ -1273,6 +1227,9 @@ int gf_w16_split_init(gf_t *gf) #else issse3 = 0; #endif +#ifdef ARM_NEON + isneon = 1; +#endif if (h->arg1 == 8 && h->arg2 == 8) { d8 = (struct gf_w16_split_8_8_data *) h->private; @@ -1317,6 +1274,10 @@ int gf_w16_split_init(gf_t *gf) if (issse3) { gf->multiply_region.w32 = gf_w16_split_4_16_lazy_sse_multiply_region; + } else if (isneon) { +#ifdef ARM_NEON + gf_w16_neon_split_init(gf); +#endif } else { gf->multiply_region.w32 = gf_w16_split_8_16_lazy_multiply_region; } @@ -1326,12 +1287,12 @@ int gf_w16_split_init(gf_t *gf) gf->multiply_region.w32 = gf_w16_split_8_16_lazy_multiply_region; } else if ((h->arg1 == 4 && h->arg2 == 16) || (h->arg2 == 4 && h->arg1 == 16)) { - if (issse3) { + if (issse3 || isneon) { if(h->region_type & GF_REGION_ALTMAP && h->region_type & GF_REGION_NOSIMD) gf->multiply_region.w32 = gf_w16_split_4_16_lazy_nosse_altmap_multiply_region; else if(h->region_type & GF_REGION_NOSIMD) gf->multiply_region.w32 = gf_w16_split_4_16_lazy_multiply_region; - else if(h->region_type & GF_REGION_ALTMAP) + else if(h->region_type & GF_REGION_ALTMAP && issse3) gf->multiply_region.w32 = gf_w16_split_4_16_lazy_sse_altmap_multiply_region; } else { if(h->region_type & GF_REGION_SIMD) diff --git a/src/neon/gf_w16_neon.c b/src/neon/gf_w16_neon.c new file mode 100644 index 0000000..95bfd80 --- /dev/null +++ b/src/neon/gf_w16_neon.c @@ -0,0 +1,356 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * Copyright (c) 2014: Janne Grunau + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * - Neither the name of the University of Tennessee nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * + * gf_w16_neon.c + * + * Neon routines for 16-bit Galois fields + * + */ + +#include "gf_int.h" +#include +#include +#include "gf_w16.h" + +#ifdef ARCH_AARCH64 +static +inline +void +neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst, + uint16_t *d_end, uint8_t *tbl, + gf_val_32_t val, int xor) +{ + unsigned i; + uint8_t *high = tbl + 4 * 16; + uint16x8_t va0, va1, r0, r1; + uint8x16_t loset, rl, rh; + uint8x16x2_t va; + + uint8x16_t tbl_h[4], tbl_l[4]; + for (i = 0; i < 4; i++) { + tbl_l[i] = vld1q_u8(tbl + i*16); + tbl_h[i] = vld1q_u8(high + i*16); + } + + loset = vdupq_n_u8(0xf); + + while (dst < d_end) { + va0 = vld1q_u16(src); + va1 = vld1q_u16(src + 8); + + va = vtrnq_u8(vreinterpretq_u8_u16(va0), vreinterpretq_u8_u16(va1)); + + rl = vqtbl1q_u8(tbl_l[0], vandq_u8(va.val[0], loset)); + rh = vqtbl1q_u8(tbl_h[0], vandq_u8(va.val[0], loset)); + rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[2], vandq_u8(va.val[1], loset))); + rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[2], vandq_u8(va.val[1], loset))); + + va.val[0] = vshrq_n_u8(va.val[0], 4); + va.val[1] = vshrq_n_u8(va.val[1], 4); + + rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[1], va.val[0])); + rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[1], va.val[0])); + rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], va.val[1])); + rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], va.val[1])); + + va = vtrnq_u8(rl, rh); + r0 = vreinterpretq_u16_u8(va.val[0]); + r1 = vreinterpretq_u16_u8(va.val[1]); + + if (xor) { + va0 = vld1q_u16(dst); + va1 = vld1q_u16(dst + 8); + r0 = veorq_u16(r0, va0); + r1 = veorq_u16(r1, va1); + } + vst1q_u16(dst, r0); + vst1q_u16(dst + 8, r1); + + src += 16; + dst += 16; + } +} + +static +inline +void +neon_w16_split_4_altmap_multiply_region(gf_t *gf, uint8_t *src, + uint8_t *dst, uint8_t *d_end, + uint8_t *tbl, gf_val_32_t val, + int xor) +{ + unsigned i; + uint8_t *high = tbl + 4 * 16; + uint8x16_t vh, vl, rh, rl; + uint8x16_t loset; + + uint8x16_t tbl_h[4], tbl_l[4]; + for (i = 0; i < 4; i++) { + tbl_l[i] = vld1q_u8(tbl + i*16); + tbl_h[i] = vld1q_u8(high + i*16); + } + + loset = vdupq_n_u8(0xf); + + while (dst < d_end) { + vh = vld1q_u8(src); + vl = vld1q_u8(src + 16); + + rl = vqtbl1q_u8(tbl_l[0], vandq_u8(vl, loset)); + rh = vqtbl1q_u8(tbl_h[0], vandq_u8(vl, loset)); + rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[2], vandq_u8(vh, loset))); + rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[2], vandq_u8(vh, loset))); + + vl = vshrq_n_u8(vl, 4); + vh = vshrq_n_u8(vh, 4); + + rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[1], vl)); + rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[1], vl)); + rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], vh)); + rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], vh)); + + if (xor) { + vh = vld1q_u8(dst); + vl = vld1q_u8(dst + 16); + rh = veorq_u8(rh, vh); + rl = veorq_u8(rl, vl); + } + vst1q_u8(dst, rh); + vst1q_u8(dst + 16, rl); + + src += 32; + dst += 32; + } +} + +#else /* ARCH_AARCH64 */ + +static +inline +void +neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst, + uint16_t *d_end, uint8_t *tbl, + gf_val_32_t val, int xor) +{ + unsigned i; + uint8_t *high = tbl + 4 * 16; + uint16x8_t va, r; + uint8x8_t loset, vb, vc, rl, rh; + + uint8x8x2_t tbl_h[4], tbl_l[4]; + for (i = 0; i < 4; i++) { + tbl_l[i].val[0] = vld1_u8(tbl + i*16); + tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8); + tbl_h[i].val[0] = vld1_u8(high + i*16); + tbl_h[i].val[1] = vld1_u8(high + i*16 + 8); + } + + loset = vdup_n_u8(0xf); + + while (dst < d_end) { + va = vld1q_u16(src); + + vb = vmovn_u16(va); + vc = vshrn_n_u16(va, 8); + + rl = vtbl2_u8(tbl_l[0], vand_u8(vb, loset)); + rh = vtbl2_u8(tbl_h[0], vand_u8(vb, loset)); + vb = vshr_n_u8(vb, 4); + rl = veor_u8(rl, vtbl2_u8(tbl_l[2], vand_u8(vc, loset))); + rh = veor_u8(rh, vtbl2_u8(tbl_h[2], vand_u8(vc, loset))); + vc = vshr_n_u8(vc, 4); + rl = veor_u8(rl, vtbl2_u8(tbl_l[1], vb)); + rh = veor_u8(rh, vtbl2_u8(tbl_h[1], vb)); + rl = veor_u8(rl, vtbl2_u8(tbl_l[3], vc)); + rh = veor_u8(rh, vtbl2_u8(tbl_h[3], vc)); + + r = vmovl_u8(rl); + r = vorrq_u16(r, vshll_n_u8(rh, 8)); + + if (xor) { + va = vld1q_u16(dst); + r = veorq_u16(r, va); + } + vst1q_u16(dst, r); + + src += 8; + dst += 8; + } +} + +static +inline +void +neon_w16_split_4_altmap_multiply_region(gf_t *gf, uint8_t *src, + uint8_t *dst, uint8_t *d_end, + uint8_t *tbl, gf_val_32_t val, + int xor) +{ + unsigned i; + uint8_t *high = tbl + 4 * 16; + uint8x8_t vh0, vh1, vl0, vl1, r0, r1, r2, r3; + uint8x8_t loset; + + uint8x8x2_t tbl_h[4], tbl_l[4]; + for (i = 0; i < 4; i++) { + tbl_l[i].val[0] = vld1_u8(tbl + i*16); + tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8); + tbl_h[i].val[0] = vld1_u8(high + i*16); + tbl_h[i].val[1] = vld1_u8(high + i*16 + 8); + } + + loset = vdup_n_u8(0xf); + + while (dst < d_end) { + vh0 = vld1_u8(src); + vh1 = vld1_u8(src + 8); + vl0 = vld1_u8(src + 16); + vl1 = vld1_u8(src + 24); + + r0 = vtbl2_u8(tbl_l[0], vand_u8(vh0, loset)); + r1 = vtbl2_u8(tbl_h[0], vand_u8(vh1, loset)); + r2 = vtbl2_u8(tbl_l[2], vand_u8(vl0, loset)); + r3 = vtbl2_u8(tbl_h[2], vand_u8(vl1, loset)); + + vh0 = vshr_n_u8(vh0, 4); + vh1 = vshr_n_u8(vh1, 4); + vl0 = vshr_n_u8(vl0, 4); + vl1 = vshr_n_u8(vl1, 4); + + r0 = veor_u8(r0, vtbl2_u8(tbl_l[1], vh0)); + r1 = veor_u8(r1, vtbl2_u8(tbl_h[1], vh1)); + r2 = veor_u8(r2, vtbl2_u8(tbl_l[3], vl0)); + r3 = veor_u8(r3, vtbl2_u8(tbl_h[3], vl1)); + + if (xor) { + vh0 = vld1_u8(dst); + vh1 = vld1_u8(dst + 8); + vl0 = vld1_u8(dst + 16); + vl1 = vld1_u8(dst + 24); + r0 = veor_u8(r0, vh0); + r1 = veor_u8(r1, vh1); + r2 = veor_u8(r2, vl0); + r3 = veor_u8(r3, vl1); + } + vst1_u8(dst, r0); + vst1_u8(dst + 8, r1); + vst1_u8(dst + 16, r2); + vst1_u8(dst + 24, r3); + + src += 32; + dst += 32; + } +} +#endif /* ARCH_AARCH64 */ + +static +inline +void +neon_w16_split_4_16_lazy_multiply_region(gf_t *gf, void *src, void *dest, + gf_val_32_t val, int bytes, int xor, + int altmap) +{ + gf_region_data rd; + unsigned i, j; + uint64_t c, prod; + uint8_t tbl[2 * 4 * 16]; + uint8_t *high = tbl + 4 * 16; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + for (i = 0; i < 4; i++) { + for (j = 0; j < 16; j++) { + c = (j << (i*4)); + prod = gf->multiply.w32(gf, c, val); + tbl[i*16 + j] = prod & 0xff; + high[i*16 + j] = prod >> 8; + } + } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32); + gf_do_initial_region_alignment(&rd); + + if (altmap) { + uint8_t *s8 = rd.s_start; + uint8_t *d8 = rd.d_start; + uint8_t *end8 = rd.d_top; + if (xor) + neon_w16_split_4_altmap_multiply_region(gf, s8, d8, end8, tbl, val, 1); + else + neon_w16_split_4_altmap_multiply_region(gf, s8, d8, end8, tbl, val, 0); + } else { + uint16_t *s16 = rd.s_start; + uint16_t *d16 = rd.d_start; + uint16_t *end16 = rd.d_top; + if (xor) + neon_w16_split_4_multiply_region(gf, s16, d16, end16, tbl, val, 1); + else + neon_w16_split_4_multiply_region(gf, s16, d16, end16, tbl, val, 0); + } + + gf_do_final_region_alignment(&rd); +} + +static +void +gf_w16_split_4_16_lazy_multiply_region_neon(gf_t *gf, void *src, void *dest, + gf_val_32_t val, int bytes, int xor) +{ + neon_w16_split_4_16_lazy_multiply_region(gf, src, dest, val, bytes, xor, 0); +} + +static +void +gf_w16_split_4_16_lazy_altmap_multiply_region_neon(gf_t *gf, void *src, + void *dest, + gf_val_32_t val, int bytes, + int xor) +{ + neon_w16_split_4_16_lazy_multiply_region(gf, src, dest, val, bytes, xor, 1); +} + + +void gf_w16_neon_split_init(gf_t *gf) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + + if (h->region_type & GF_REGION_ALTMAP) + gf->multiply_region.w32 = gf_w16_split_4_16_lazy_altmap_multiply_region_neon; + else + gf->multiply_region.w32 = gf_w16_split_4_16_lazy_multiply_region_neon; +} -- cgit v1.2.1 From 370c88b9015cbe874aca81442a5d8f6f99bfb654 Mon Sep 17 00:00:00 2001 From: Janne Grunau Date: Wed, 17 Sep 2014 16:13:02 +0200 Subject: arm: NEON optimisations for gf_w32 Optimisations for 4,32 split table multiplications. Selected time_tool.sh results on a 1.7 GHz cortex-a9: Region Best (MB/s): 346.67 W-Method: 32 -m SPLIT 32 4 -r SIMD - Region Best (MB/s): 92.89 W-Method: 32 -m SPLIT 32 4 -r NOSIMD - Region Best (MB/s): 258.17 W-Method: 32 -m SPLIT 32 4 -r SIMD -r ALTMAP - Region Best (MB/s): 162.00 W-Method: 32 -m SPLIT 32 8 - Region Best (MB/s): 160.53 W-Method: 32 -m SPLIT 8 8 - Region Best (MB/s): 32.74 W-Method: 32 -m COMPOSITE 2 - - Region Best (MB/s): 199.79 W-Method: 32 -m COMPOSITE 2 - -r ALTMAP - --- include/gf_w32.h | 71 +++++++++++++ src/Makefile.am | 3 +- src/gf_w32.c | 72 +++---------- src/neon/gf_w32_neon.c | 269 +++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 358 insertions(+), 57 deletions(-) create mode 100644 include/gf_w32.h create mode 100644 src/neon/gf_w32_neon.c diff --git a/include/gf_w32.h b/include/gf_w32.h new file mode 100644 index 0000000..3396402 --- /dev/null +++ b/include/gf_w32.h @@ -0,0 +1,71 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * gf_w32.h + * + * Defines and data structures for 32-bit Galois fields + */ + +#ifndef GF_COMPLETE_GF_W32_H +#define GF_COMPLETE_GF_W32_H + +#include + +#define GF_FIELD_WIDTH (32) +#define GF_FIRST_BIT (1 << 31) + +#define GF_BASE_FIELD_WIDTH (16) +#define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH) +#define GF_BASE_FIELD_GROUP_SIZE GF_BASE_FIELD_SIZE-1 +#define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : (p) << 1) + +struct gf_split_2_32_lazy_data { + uint32_t tables[16][4]; + uint32_t last_value; +}; + +struct gf_w32_split_8_8_data { + uint32_t tables[7][256][256]; + uint32_t region_tables[4][256]; + uint32_t last_value; +}; + +struct gf_w32_group_data { + uint32_t *reduce; + uint32_t *shift; + int tshift; + uint64_t rmask; + uint32_t *memory; +}; + +struct gf_split_16_32_lazy_data { + uint32_t tables[2][(1<<16)]; + uint32_t last_value; +}; + +struct gf_split_8_32_lazy_data { + uint32_t tables[4][256]; + uint32_t last_value; +}; + +struct gf_split_4_32_lazy_data { + uint32_t tables[8][16]; + uint32_t last_value; +}; + +struct gf_w32_bytwo_data { + uint64_t prim_poly; + uint64_t mask1; + uint64_t mask2; +}; + +struct gf_w32_composite_data { + uint16_t *log; + uint16_t *alog; +}; + +void gf_w32_neon_split_init(gf_t *gf); + +#endif /* GF_COMPLETE_GF_W32_H */ diff --git a/src/Makefile.am b/src/Makefile.am index f04042b..a7f7ced 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -13,7 +13,8 @@ libgf_complete_la_SOURCES = gf.c gf_method.c gf_wgen.c gf_w4.c gf_w8.c gf_w16.c if HAVE_NEON libgf_complete_la_SOURCES += neon/gf_w4_neon.c \ neon/gf_w8_neon.c \ - neon/gf_w16_neon.c + neon/gf_w16_neon.c \ + neon/gf_w32_neon.c endif libgf_complete_la_LDFLAGS = -version-info 1:0:0 diff --git a/src/gf_w32.c b/src/gf_w32.c index 8e7c741..2e187fd 100644 --- a/src/gf_w32.c +++ b/src/gf_w32.c @@ -12,59 +12,7 @@ #include "gf_int.h" #include #include - -#define GF_FIELD_WIDTH (32) -#define GF_FIRST_BIT (1 << 31) - -#define GF_BASE_FIELD_WIDTH (16) -#define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH) -#define GF_BASE_FIELD_GROUP_SIZE GF_BASE_FIELD_SIZE-1 -#define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : (p) << 1) - -struct gf_split_2_32_lazy_data { - uint32_t tables[16][4]; - uint32_t last_value; -}; - -struct gf_w32_split_8_8_data { - uint32_t tables[7][256][256]; - uint32_t region_tables[4][256]; - uint32_t last_value; -}; - -struct gf_w32_group_data { - uint32_t *reduce; - uint32_t *shift; - int tshift; - uint64_t rmask; - uint32_t *memory; -}; - -struct gf_split_16_32_lazy_data { - uint32_t tables[2][(1<<16)]; - uint32_t last_value; -}; - -struct gf_split_8_32_lazy_data { - uint32_t tables[4][256]; - uint32_t last_value; -}; - -struct gf_split_4_32_lazy_data { - uint32_t tables[8][16]; - uint32_t last_value; -}; - -struct gf_w32_bytwo_data { - uint64_t prim_poly; - uint64_t mask1; - uint64_t mask2; -}; - -struct gf_w32_composite_data { - uint16_t *log; - uint16_t *alog; -}; +#include "gf_w32.h" #define MM_PRINT32(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 4) printf(" %02x%02x%02x%02x", blah[15-ii], blah[14-ii], blah[13-ii], blah[12-ii]); printf("\n"); } @@ -2283,6 +2231,7 @@ int gf_w32_split_init(gf_t *gf) struct gf_split_16_32_lazy_data *d16; uint32_t p, basep; int i, j, exp, ispclmul, issse3; + int isneon = 0; #if defined(INTEL_SSE4_PCLMUL) ispclmul = 1; @@ -2295,6 +2244,9 @@ int gf_w32_split_init(gf_t *gf) #else issse3 = 0; #endif +#ifdef ARM_NEON + isneon = 1; +#endif h = (gf_internal_t *) gf->scratch; @@ -2349,11 +2301,15 @@ int gf_w32_split_init(gf_t *gf) /* 4/32 or Default + SSE - There is no ALTMAP/NOSSE. */ if ((h->arg1 == 4 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 4) || - (issse3 && h->mult_type == GF_REGION_DEFAULT)) { + ((issse3 || isneon) && h->mult_type == GF_REGION_DEFAULT)) { ld4 = (struct gf_split_4_32_lazy_data *) h->private; ld4->last_value = 0; - if ((h->region_type & GF_REGION_NOSIMD) || !issse3) { + if ((h->region_type & GF_REGION_NOSIMD) || !(issse3 || isneon)) { gf->multiply_region.w32 = gf_w32_split_4_32_lazy_multiply_region; + } else if (isneon) { +#ifdef ARM_NEON + gf_w32_neon_split_init(gf); +#endif } else if (h->region_type & GF_REGION_ALTMAP) { gf->multiply_region.w32 = gf_w32_split_4_32_lazy_sse_altmap_multiply_region; } else { @@ -2731,10 +2687,14 @@ int gf_w32_composite_init(gf_t *gf) int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) { int issse3 = 0; + int isneon = 0; #ifdef INTEL_SSSE3 issse3 = 1; #endif +#ifdef ARM_NEON + isneon = 1; +#endif switch(mult_type) { @@ -2760,7 +2720,7 @@ int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg return sizeof(gf_internal_t) + sizeof(struct gf_split_2_32_lazy_data) + 64; } if ((arg1 == 8 && arg2 == 32) || (arg2 == 8 && arg1 == 32) || - (mult_type == GF_MULT_DEFAULT && !issse3)) { + (mult_type == GF_MULT_DEFAULT && !(issse3 || isneon))) { return sizeof(gf_internal_t) + sizeof(struct gf_split_8_32_lazy_data) + 64; } if ((arg1 == 4 && arg2 == 32) || diff --git a/src/neon/gf_w32_neon.c b/src/neon/gf_w32_neon.c new file mode 100644 index 0000000..8231eb3 --- /dev/null +++ b/src/neon/gf_w32_neon.c @@ -0,0 +1,269 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * Copyright (c) 2014: Janne Grunau + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * - Neither the name of the University of Tennessee nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * gf_w32_neon.c + * + * Neon routines for 32-bit Galois fields + * + */ + + +#include "gf_int.h" +#include +#include +#include "gf_w32.h" + +#ifndef ARCH_AARCH64 +#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)), \ + vtbl2_u8(tbl, vget_high_u8(v))) +#endif + +static +void +neon_w32_split_4_32_multiply_region(gf_t *gf, uint32_t *src, uint32_t *dst, + uint32_t *d_end, uint8_t btable[8][4][16], + uint32_t val, int xor, int altmap) +{ + int i, j; +#ifdef ARCH_AARCH64 + uint8x16_t tables[8][4]; +#else + uint8x8x2_t tables[8][4]; +#endif + uint32x4_t v0, v1, v2, v3, s0, s1, s2, s3; + uint8x16_t p0, p1, p2, p3, si, mask1; + uint16x8x2_t r0, r1; + uint8x16x2_t q0, q1; + + for (i = 0; i < 8; i++) { + for (j = 0; j < 4; j++) { +#ifdef ARCH_AARCH64 + tables[i][j] = vld1q_u8(btable[i][j]); +#else + tables[i][j].val[0] = vld1_u8(btable[i][j]); + tables[i][j].val[1] = vld1_u8(btable[i][j] + 8); +#endif + } + } + + mask1 = vdupq_n_u8(0xf); + + while (dst < d_end) { + + v0 = vld1q_u32(src); src += 4; + v1 = vld1q_u32(src); src += 4; + v2 = vld1q_u32(src); src += 4; + v3 = vld1q_u32(src); src += 4; + + if (altmap) { + q0.val[0] = vreinterpretq_u8_u32(v0); + q0.val[1] = vreinterpretq_u8_u32(v1); + q1.val[0] = vreinterpretq_u8_u32(v2); + q1.val[1] = vreinterpretq_u8_u32(v3); + } else { + r0 = vtrnq_u16(vreinterpretq_u16_u32(v0), vreinterpretq_u16_u32(v2)); + r1 = vtrnq_u16(vreinterpretq_u16_u32(v1), vreinterpretq_u16_u32(v3)); + + q0 = vtrnq_u8(vreinterpretq_u8_u16(r0.val[0]), + vreinterpretq_u8_u16(r1.val[0])); + q1 = vtrnq_u8(vreinterpretq_u8_u16(r0.val[1]), + vreinterpretq_u8_u16(r1.val[1])); + } + + si = vandq_u8(q0.val[0], mask1); + p0 = vqtbl1q_u8(tables[0][0], si); + p1 = vqtbl1q_u8(tables[0][1], si); + p2 = vqtbl1q_u8(tables[0][2], si); + p3 = vqtbl1q_u8(tables[0][3], si); + + si = vshrq_n_u8(q0.val[0], 4); + p0 = veorq_u8(p0, vqtbl1q_u8(tables[1][0], si)); + p1 = veorq_u8(p1, vqtbl1q_u8(tables[1][1], si)); + p2 = veorq_u8(p2, vqtbl1q_u8(tables[1][2], si)); + p3 = veorq_u8(p3, vqtbl1q_u8(tables[1][3], si)); + + si = vandq_u8(q0.val[1], mask1); + p0 = veorq_u8(p0, vqtbl1q_u8(tables[2][0], si)); + p1 = veorq_u8(p1, vqtbl1q_u8(tables[2][1], si)); + p2 = veorq_u8(p2, vqtbl1q_u8(tables[2][2], si)); + p3 = veorq_u8(p3, vqtbl1q_u8(tables[2][3], si)); + + si = vshrq_n_u8(q0.val[1], 4); + p0 = veorq_u8(p0, vqtbl1q_u8(tables[3][0], si)); + p1 = veorq_u8(p1, vqtbl1q_u8(tables[3][1], si)); + p2 = veorq_u8(p2, vqtbl1q_u8(tables[3][2], si)); + p3 = veorq_u8(p3, vqtbl1q_u8(tables[3][3], si)); + + si = vandq_u8(q1.val[0], mask1); + p0 = veorq_u8(p0, vqtbl1q_u8(tables[4][0], si)); + p1 = veorq_u8(p1, vqtbl1q_u8(tables[4][1], si)); + p2 = veorq_u8(p2, vqtbl1q_u8(tables[4][2], si)); + p3 = veorq_u8(p3, vqtbl1q_u8(tables[4][3], si)); + + si = vshrq_n_u8(q1.val[0], 4); + p0 = veorq_u8(p0, vqtbl1q_u8(tables[5][0], si)); + p1 = veorq_u8(p1, vqtbl1q_u8(tables[5][1], si)); + p2 = veorq_u8(p2, vqtbl1q_u8(tables[5][2], si)); + p3 = veorq_u8(p3, vqtbl1q_u8(tables[5][3], si)); + + si = vandq_u8(q1.val[1], mask1); + p0 = veorq_u8(p0, vqtbl1q_u8(tables[6][0], si)); + p1 = veorq_u8(p1, vqtbl1q_u8(tables[6][1], si)); + p2 = veorq_u8(p2, vqtbl1q_u8(tables[6][2], si)); + p3 = veorq_u8(p3, vqtbl1q_u8(tables[6][3], si)); + + si = vshrq_n_u8(q1.val[1], 4); + p0 = veorq_u8(p0, vqtbl1q_u8(tables[7][0], si)); + p1 = veorq_u8(p1, vqtbl1q_u8(tables[7][1], si)); + p2 = veorq_u8(p2, vqtbl1q_u8(tables[7][2], si)); + p3 = veorq_u8(p3, vqtbl1q_u8(tables[7][3], si)); + + if (altmap) { + s0 = vreinterpretq_u32_u8(p0); + s1 = vreinterpretq_u32_u8(p1); + s2 = vreinterpretq_u32_u8(p2); + s3 = vreinterpretq_u32_u8(p3); + } else { + q0 = vtrnq_u8(p0, p1); + q1 = vtrnq_u8(p2, p3); + + r0 = vtrnq_u16(vreinterpretq_u16_u8(q0.val[0]), + vreinterpretq_u16_u8(q1.val[0])); + r1 = vtrnq_u16(vreinterpretq_u16_u8(q0.val[1]), + vreinterpretq_u16_u8(q1.val[1])); + + s0 = vreinterpretq_u32_u16(r0.val[0]); + s1 = vreinterpretq_u32_u16(r1.val[0]); + s2 = vreinterpretq_u32_u16(r0.val[1]); + s3 = vreinterpretq_u32_u16(r1.val[1]); + } + + if (xor) { + v0 = vld1q_u32(dst); + v1 = vld1q_u32(dst + 4); + v2 = vld1q_u32(dst + 8); + v3 = vld1q_u32(dst + 12); + s0 = veorq_u32(s0, v0); + s1 = veorq_u32(s1, v1); + s2 = veorq_u32(s2, v2); + s3 = veorq_u32(s3, v3); + } + + vst1q_u32(dst, s0); + vst1q_u32(dst + 4, s1); + vst1q_u32(dst + 8, s2); + vst1q_u32(dst + 12, s3); + + dst += 16; + } +} + +static +inline +void +neon_w32_split_4_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor, int altmap) +{ + gf_internal_t *h; + int i, j, k; + uint32_t pp, v, *s32, *d32, *top, tmp_table[16]; + uint8_t btable[8][4][16]; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 64); + gf_do_initial_region_alignment(&rd); + + s32 = (uint32_t *) rd.s_start; + d32 = (uint32_t *) rd.d_start; + top = (uint32_t *) rd.d_top; + + v = val; + for (i = 0; i < 8; i++) { + tmp_table[0] = 0; + for (j = 1; j < 16; j <<= 1) { + for (k = 0; k < j; k++) { + tmp_table[k^j] = (v ^ tmp_table[k]); + } + v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1); + } + for (j = 0; j < 4; j++) { + for (k = 0; k < 16; k++) { + btable[i][j][k] = (uint8_t) tmp_table[k]; + tmp_table[k] >>= 8; + } + } + } + + if (xor) + neon_w32_split_4_32_multiply_region(gf, s32, d32, top, btable, val, 1, altmap); + else + neon_w32_split_4_32_multiply_region(gf, s32, d32, top, btable, val, 0, altmap); + + gf_do_final_region_alignment(&rd); +} + +static +void +gf_w32_split_4_32_lazy_multiply_region_neon(gf_t *gf, void *src, void *dest, + gf_val_32_t val, int bytes, int xor) +{ + neon_w32_split_4_32_lazy_multiply_region(gf, src, dest, val, bytes, xor, 0); +} + +static +void +gf_w32_split_4_32_lazy_altmap_multiply_region_neon(gf_t *gf, void *src, + void *dest, gf_val_32_t val, + int bytes, int xor) +{ + neon_w32_split_4_32_lazy_multiply_region(gf, src, dest, val, bytes, xor, 1); +} + +void gf_w32_neon_split_init(gf_t *gf) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + + if (h->region_type & GF_REGION_ALTMAP) + gf->multiply_region.w32 = gf_w32_split_4_32_lazy_altmap_multiply_region_neon; + else + gf->multiply_region.w32 = gf_w32_split_4_32_lazy_multiply_region_neon; + +} -- cgit v1.2.1 From 6fdd8bc3d32cb2f7fa55d2de9dc7cc5bb2f885aa Mon Sep 17 00:00:00 2001 From: Janne Grunau Date: Wed, 17 Sep 2014 16:15:27 +0200 Subject: arm: NEON optimisations for gf_w64 Optimisations for 4,64 split table region multiplications. Only used on ARMv8-A since it is not faster on ARMv7-A. --- include/gf_w64.h | 50 ++++++++ src/Makefile.am | 3 +- src/gf_w64.c | 51 +++----- src/neon/gf_w64_neon.c | 333 +++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 400 insertions(+), 37 deletions(-) create mode 100644 include/gf_w64.h create mode 100644 src/neon/gf_w64_neon.c diff --git a/include/gf_w64.h b/include/gf_w64.h new file mode 100644 index 0000000..9a74a81 --- /dev/null +++ b/include/gf_w64.h @@ -0,0 +1,50 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * gf_w64.h + * + * Defines and data structures for 64-bit Galois fields + */ + +#ifndef GF_COMPLETE_GF_W64_H +#define GF_COMPLETE_GF_W64_H + +#include + +#define GF_FIELD_WIDTH (64) +#define GF_FIRST_BIT (1ULL << 63) + +#define GF_BASE_FIELD_WIDTH (32) +#define GF_BASE_FIELD_SIZE (1ULL << GF_BASE_FIELD_WIDTH) +#define GF_BASE_FIELD_GROUP_SIZE GF_BASE_FIELD_SIZE-1 + +struct gf_w64_group_data { + uint64_t *reduce; + uint64_t *shift; + uint64_t *memory; +}; + +struct gf_split_4_64_lazy_data { + uint64_t tables[16][16]; + uint64_t last_value; +}; + +struct gf_split_8_64_lazy_data { + uint64_t tables[8][(1<<8)]; + uint64_t last_value; +}; + +struct gf_split_16_64_lazy_data { + uint64_t tables[4][(1<<16)]; + uint64_t last_value; +}; + +struct gf_split_8_8_data { + uint64_t tables[15][256][256]; +}; + +void gf_w64_neon_split_init(gf_t *gf); + +#endif /* GF_COMPLETE_GF_W64_H */ diff --git a/src/Makefile.am b/src/Makefile.am index a7f7ced..240c1fe 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -14,7 +14,8 @@ if HAVE_NEON libgf_complete_la_SOURCES += neon/gf_w4_neon.c \ neon/gf_w8_neon.c \ neon/gf_w16_neon.c \ - neon/gf_w32_neon.c + neon/gf_w32_neon.c \ + neon/gf_w64_neon.c endif libgf_complete_la_LDFLAGS = -version-info 1:0:0 diff --git a/src/gf_w64.c b/src/gf_w64.c index fe1c75d..6e75f5e 100644 --- a/src/gf_w64.c +++ b/src/gf_w64.c @@ -11,38 +11,7 @@ #include "gf_int.h" #include #include - -#define GF_FIELD_WIDTH (64) -#define GF_FIRST_BIT (1ULL << 63) - -#define GF_BASE_FIELD_WIDTH (32) -#define GF_BASE_FIELD_SIZE (1ULL << GF_BASE_FIELD_WIDTH) -#define GF_BASE_FIELD_GROUP_SIZE GF_BASE_FIELD_SIZE-1 - -struct gf_w64_group_data { - uint64_t *reduce; - uint64_t *shift; - uint64_t *memory; -}; - -struct gf_split_4_64_lazy_data { - uint64_t tables[16][16]; - uint64_t last_value; -}; - -struct gf_split_8_64_lazy_data { - uint64_t tables[8][(1<<8)]; - uint64_t last_value; -}; - -struct gf_split_16_64_lazy_data { - uint64_t tables[4][(1<<16)]; - uint64_t last_value; -}; - -struct gf_split_8_8_data { - uint64_t tables[15][256][256]; -}; +#include "gf_w64.h" static inline @@ -2027,11 +1996,15 @@ int gf_w64_split_init(gf_t *gf) /* Allen: set region pointers for default mult type. Single pointers are * taken care of above (explicitly for sse, implicitly for no sse). */ -#ifdef INTEL_SSE4 +#if defined(INTEL_SSE4) || defined(ARCH_AARCH64) if (h->mult_type == GF_MULT_DEFAULT) { d4 = (struct gf_split_4_64_lazy_data *) h->private; d4->last_value = 0; +#if defined(INTEL_SSE4) gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_multiply_region; +#elif defined(ARCH_AARCH64) + gf_w64_neon_split_init(gf); +#endif } #else if (h->mult_type == GF_MULT_DEFAULT) { @@ -2050,17 +2023,23 @@ int gf_w64_split_init(gf_t *gf) { #ifdef INTEL_SSSE3 gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_altmap_multiply_region; + #elif defined(ARCH_AARCH64) + gf_w64_neon_split_init(gf); #else return 0; #endif } else //no altmap { - #ifdef INTEL_SSE4 + #if defined(INTEL_SSE4) || defined(ARCH_AARCH64) if(h->region_type & GF_REGION_NOSIMD) gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region; else - gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_multiply_region; + #if defined(INTEL_SSE4) + gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_multiply_region; + #elif defined(ARCH_AARCH64) + gf_w64_neon_split_init(gf); + #endif #else gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region; if(h->region_type & GF_REGION_SIMD) @@ -2134,7 +2113,7 @@ int gf_w64_scratch_size(int mult_type, int region_type, int divide_type, int arg /* Allen: set the *local* arg1 and arg2, just for scratch size purposes, * then fall through to split table scratch size code. */ -#ifdef INTEL_SSE4 +#if defined(INTEL_SSE4) || defined(ARCH_AARCH64) arg1 = 64; arg2 = 4; #else diff --git a/src/neon/gf_w64_neon.c b/src/neon/gf_w64_neon.c new file mode 100644 index 0000000..0eca9c7 --- /dev/null +++ b/src/neon/gf_w64_neon.c @@ -0,0 +1,333 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * Copyright (c) 2014: Janne Grunau + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * - Neither the name of the University of Tennessee nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * gf_w64_neon.c + * + * Neon routines for 64-bit Galois fields + * + */ + +#include "gf_int.h" +#include +#include +#include "gf_w64.h" + + +#ifndef ARCH_AARCH64 +#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)), \ + vtbl2_u8(tbl, vget_high_u8(v))) +#endif + +static +inline +void +neon_w64_split_4_lazy_altmap_multiply_region(gf_t *gf, uint64_t *src, + uint64_t *dst, uint64_t *d_end, + uint64_t val, int xor) +{ + unsigned i, j, k; + uint8_t btable[16]; +#ifdef ARCH_AARCH64 + uint8x16_t tables[16][8]; +#else + uint8x8x2_t tables[16][8]; +#endif + uint8x16_t p[8], mask1, si; + + gf_internal_t *h = (gf_internal_t *) gf->scratch; + struct gf_split_4_64_lazy_data *ld = (struct gf_split_4_64_lazy_data *) h->private; + + for (i = 0; i < 16; i++) { + for (j = 0; j < 8; j++) { + for (k = 0; k < 16; k++) { + btable[k] = (uint8_t) ld->tables[i][k]; + ld->tables[i][k] >>= 8; + } +#ifdef ARCH_AARCH64 + tables[i][j] = vld1q_u8(btable); +#else + tables[i][j].val[0] = vld1_u8(btable); + tables[i][j].val[1] = vld1_u8(btable + 8); +#endif + } + } + + mask1 = vdupq_n_u8(0xf); + + while (dst < d_end) { + + if (xor) { + for (i = 0; i < 8; i++) + p[i] = vld1q_u8((uint8_t *) (dst + i * 2)); + } else { + for (i = 0; i < 8; i++) + p[i] = vdupq_n_u8(0); + } + + i = 0; + for (k = 0; k < 8; k++) { + uint8x16_t v0 = vld1q_u8((uint8_t *) src); + src += 2; + + si = vandq_u8(v0, mask1); + for (j = 0; j < 8; j++) { + p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[i][j], si)); + } + i++; + si = vshrq_n_u8(v0, 4); + for (j = 0; j < 8; j++) { + p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[i][j], si)); + } + i++; + + } + for (i = 0; i < 8; i++) { + vst1q_u8((uint8_t *) dst, p[i]); + dst += 2; + } + } +} + +static +inline +void +neon_w64_split_4_lazy_multiply_region(gf_t *gf, uint64_t *src, uint64_t *dst, + uint64_t *d_end, uint64_t val, int xor) +{ + unsigned i, j, k; + uint8_t btable[16]; +#ifdef ARCH_AARCH64 + uint8x16_t tables[16][8]; +#else + uint8x8x2_t tables[16][8]; +#endif + uint8x16_t p[8], mask1, si; + uint64x2_t st[8]; + uint32x4x2_t s32[4]; + uint16x8x2_t s16[4]; + uint8x16x2_t s8[4]; + + gf_internal_t *h = (gf_internal_t *) gf->scratch; + struct gf_split_4_64_lazy_data *ld = (struct gf_split_4_64_lazy_data *) h->private; + + for (i = 0; i < 16; i++) { + for (j = 0; j < 8; j++) { + for (k = 0; k < 16; k++) { + btable[k] = (uint8_t) ld->tables[i][k]; + ld->tables[i][k] >>= 8; + } +#ifdef ARCH_AARCH64 + tables[i][j] = vld1q_u8(btable); +#else + tables[i][j].val[0] = vld1_u8(btable); + tables[i][j].val[1] = vld1_u8(btable + 8); +#endif + } + } + + mask1 = vdupq_n_u8(0xf); + + while (dst < d_end) { + + for (k = 0; k < 8; k++) { + st[k] = vld1q_u64(src); + src += 2; + p[k] = vdupq_n_u8(0); + } + + s32[0] = vuzpq_u32(vreinterpretq_u32_u64(st[0]), + vreinterpretq_u32_u64(st[1])); + s32[1] = vuzpq_u32(vreinterpretq_u32_u64(st[2]), + vreinterpretq_u32_u64(st[3])); + s32[2] = vuzpq_u32(vreinterpretq_u32_u64(st[4]), + vreinterpretq_u32_u64(st[5])); + s32[3] = vuzpq_u32(vreinterpretq_u32_u64(st[6]), + vreinterpretq_u32_u64(st[7])); + + s16[0] = vuzpq_u16(vreinterpretq_u16_u32(s32[0].val[0]), + vreinterpretq_u16_u32(s32[1].val[0])); + s16[1] = vuzpq_u16(vreinterpretq_u16_u32(s32[2].val[0]), + vreinterpretq_u16_u32(s32[3].val[0])); + s16[2] = vuzpq_u16(vreinterpretq_u16_u32(s32[0].val[1]), + vreinterpretq_u16_u32(s32[1].val[1])); + s16[3] = vuzpq_u16(vreinterpretq_u16_u32(s32[2].val[1]), + vreinterpretq_u16_u32(s32[3].val[1])); + + s8[0] = vuzpq_u8(vreinterpretq_u8_u16(s16[0].val[0]), + vreinterpretq_u8_u16(s16[1].val[0])); + s8[1] = vuzpq_u8(vreinterpretq_u8_u16(s16[0].val[1]), + vreinterpretq_u8_u16(s16[1].val[1])); + s8[2] = vuzpq_u8(vreinterpretq_u8_u16(s16[2].val[0]), + vreinterpretq_u8_u16(s16[3].val[0])); + s8[3] = vuzpq_u8(vreinterpretq_u8_u16(s16[2].val[1]), + vreinterpretq_u8_u16(s16[3].val[1])); + + i = 0; + for (k = 0; k < 8; k++) { + si = vandq_u8(s8[k >> 1].val[k & 1], mask1); + for (j = 0; j < 8; j++) { + p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[i][j], si)); + } + i++; + si = vshrq_n_u8(s8[k >> 1].val[k & 1], 4); + for (j = 0; j < 8; j++) { + p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[i][j], si)); + } + i++; + } + + s8[0] = vzipq_u8(p[0], p[1]); + s8[1] = vzipq_u8(p[2], p[3]); + s8[2] = vzipq_u8(p[4], p[5]); + s8[3] = vzipq_u8(p[6], p[7]); + + s16[0] = vzipq_u16(vreinterpretq_u16_u8(s8[0].val[0]), + vreinterpretq_u16_u8(s8[1].val[0])); + s16[1] = vzipq_u16(vreinterpretq_u16_u8(s8[2].val[0]), + vreinterpretq_u16_u8(s8[3].val[0])); + s16[2] = vzipq_u16(vreinterpretq_u16_u8(s8[0].val[1]), + vreinterpretq_u16_u8(s8[1].val[1])); + s16[3] = vzipq_u16(vreinterpretq_u16_u8(s8[2].val[1]), + vreinterpretq_u16_u8(s8[3].val[1])); + + s32[0] = vzipq_u32(vreinterpretq_u32_u16(s16[0].val[0]), + vreinterpretq_u32_u16(s16[1].val[0])); + s32[1] = vzipq_u32(vreinterpretq_u32_u16(s16[0].val[1]), + vreinterpretq_u32_u16(s16[1].val[1])); + s32[2] = vzipq_u32(vreinterpretq_u32_u16(s16[2].val[0]), + vreinterpretq_u32_u16(s16[3].val[0])); + s32[3] = vzipq_u32(vreinterpretq_u32_u16(s16[2].val[1]), + vreinterpretq_u32_u16(s16[3].val[1])); + + for (k = 0; k < 8; k ++) { + st[k] = vreinterpretq_u64_u32(s32[k >> 1].val[k & 1]); + } + + if (xor) { + for (i = 0; i < 8; i++) { + uint64x2_t t1 = vld1q_u64(dst); + vst1q_u64(dst, veorq_u64(st[i], t1)); + dst += 2; + } + } else { + for (i = 0; i < 8; i++) { + vst1q_u64(dst, st[i]); + dst += 2; + } + } + + } +} + +static +void +gf_w64_neon_split_4_lazy_multiply_region(gf_t *gf, void *src, void *dest, + uint64_t val, int bytes, int xor, + int altmap) +{ + gf_internal_t *h; + int i, j, k; + uint64_t pp, v, *s64, *d64, *top; + struct gf_split_4_64_lazy_data *ld; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 128); + gf_do_initial_region_alignment(&rd); + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + top = (uint64_t *) rd.d_top; + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + ld = (struct gf_split_4_64_lazy_data *) h->private; + + v = val; + for (i = 0; i < 16; i++) { + ld->tables[i][0] = 0; + for (j = 1; j < 16; j <<= 1) { + for (k = 0; k < j; k++) { + ld->tables[i][k^j] = (v ^ ld->tables[i][k]); + } + v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1); + } + } + + if (altmap) { + if (xor) + neon_w64_split_4_lazy_altmap_multiply_region(gf, s64, d64, top, val, 1); + else + neon_w64_split_4_lazy_altmap_multiply_region(gf, s64, d64, top, val, 0); + } else { + if (xor) + neon_w64_split_4_lazy_multiply_region(gf, s64, d64, top, val, 1); + else + neon_w64_split_4_lazy_multiply_region(gf, s64, d64, top, val, 0); + } + + gf_do_final_region_alignment(&rd); +} + +static +void +gf_w64_split_4_64_lazy_multiply_region_neon(gf_t *gf, void *src, void *dest, + uint64_t val, int bytes, int xor) +{ + gf_w64_neon_split_4_lazy_multiply_region(gf, src, dest, val, bytes, xor, 0); +} + +static +void +gf_w64_split_4_64_lazy_altmap_multiply_region_neon(gf_t *gf, void *src, + void *dest, uint64_t val, + int bytes, int xor) +{ + gf_w64_neon_split_4_lazy_multiply_region(gf, src, dest, val, bytes, xor, 1); +} + +void gf_w64_neon_split_init(gf_t *gf) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + + if (h->region_type & GF_REGION_ALTMAP) + gf->multiply_region.w64 = gf_w64_split_4_64_lazy_altmap_multiply_region_neon; + else + gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region_neon; + +} -- cgit v1.2.1