From d315d20d569f7da176eb7445b8c21ea055083f06 Mon Sep 17 00:00:00 2001
From: Janne Grunau <j@jannau.net>
Date: Thu, 7 Aug 2014 00:52:55 +0200
Subject: build: make CFLAGS user setable

There is no need to force the non-default CFLAGS on users trying to set
them via enviroment variable or on configure command.
---
 configure.ac | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/configure.ac b/configure.ac
index 9f33852..02a62b8 100644
--- a/configure.ac
+++ b/configure.ac
@@ -3,6 +3,9 @@
 # FIXME - add project url as the last argument
 AC_INIT(gf-complete, 1.0)
 
+# Override default CFLAGS
+: ${CFLAGS="-Wall -Wpointer-arith -O3 -g"}
+
 AC_PREREQ([2.61])
 
 AM_INIT_AUTOMAKE([no-dependencies foreign])
@@ -16,9 +19,6 @@ AC_CONFIG_MACRO_DIR([m4])
 # This prevents './configure; make' from trying to run autotools.
 AM_MAINTAINER_MODE([disable])
 
-# Override default CFLAGS
-CFLAGS="-Wall -Wpointer-arith -O3 -g"
-
 dnl Compiling with per-target flags requires AM_PROG_CC_C_O.
 AC_PROG_CC
 
-- 
cgit v1.2.1


From f6828cfbc1bf24d686e6e24ce9822e69f824351d Mon Sep 17 00:00:00 2001
From: Janne Grunau <j@jannau.net>
Date: Thu, 7 Aug 2014 00:54:21 +0200
Subject: build: fix out of source tree build

---
 examples/Makefile.am | 4 ++--
 src/Makefile.am      | 4 ++--
 test/Makefile.am     | 4 ++--
 tools/Makefile.am    | 4 ++--
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/examples/Makefile.am b/examples/Makefile.am
index fd547d2..a420bda 100644
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -1,7 +1,7 @@
 # GF-Complete 'examples' AM file
 
-AM_CPPFLAGS=-I./ -I../include
-AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC $(INCLUDES)
+AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include
+AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC
 
 bin_PROGRAMS = gf_example_1 gf_example_2 gf_example_3 gf_example_4 \
                gf_example_5 gf_example_6 gf_example_7
diff --git a/src/Makefile.am b/src/Makefile.am
index ba3ad5e..34633ea 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -1,8 +1,8 @@
 # GF-Complete 'core' AM file
 # Creates the library
 
-AM_CPPFLAGS=-I./ -I../include
-AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC $(INCLUDES)
+AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include
+AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC
 
 lib_LTLIBRARIES = libgf_complete.la
 libgf_complete_la_SOURCES = gf.c gf_method.c gf_wgen.c gf_w4.c gf_w8.c gf_w16.c gf_w32.c \
diff --git a/test/Makefile.am b/test/Makefile.am
index 7f39a48..2791528 100644
--- a/test/Makefile.am
+++ b/test/Makefile.am
@@ -1,7 +1,7 @@
 # GF-Complete 'test' AM file
 
-AM_CPPFLAGS=-I./ -I../include
-AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC $(INCLUDES)
+AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include
+AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC
 
 bin_PROGRAMS = gf_unit 
 
diff --git a/tools/Makefile.am b/tools/Makefile.am
index 7c55d65..d502623 100644
--- a/tools/Makefile.am
+++ b/tools/Makefile.am
@@ -1,7 +1,7 @@
 # GF-Complete 'tools' AM file
 
-AM_CPPFLAGS=-I./ -I../include
-AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC $(INCLUDES)
+AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include
+AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC
 
 TESTS=run-tests.sh
 
-- 
cgit v1.2.1


From 2a2f1e306f1759e5e52771a643a3a2df54552069 Mon Sep 17 00:00:00 2001
From: Janne Grunau <j@jannau.net>
Date: Wed, 24 Sep 2014 16:02:19 +0200
Subject: check: split unit tests and support paralell execution

---
 configure.ac       |  2 +-
 tools/Makefile.am  | 18 ++++++++++++++++--
 tools/run-tests.sh |  9 ---------
 3 files changed, 17 insertions(+), 12 deletions(-)
 delete mode 100755 tools/run-tests.sh

diff --git a/configure.ac b/configure.ac
index 02a62b8..47d5d62 100644
--- a/configure.ac
+++ b/configure.ac
@@ -8,7 +8,7 @@ AC_INIT(gf-complete, 1.0)
 
 AC_PREREQ([2.61])
 
-AM_INIT_AUTOMAKE([no-dependencies foreign])
+AM_INIT_AUTOMAKE([no-dependencies foreign parallel-tests])
 LT_INIT # libtool
 
 AC_CONFIG_HEADER(include/config.h)
diff --git a/tools/Makefile.am b/tools/Makefile.am
index d502623..eb27d4a 100644
--- a/tools/Makefile.am
+++ b/tools/Makefile.am
@@ -3,8 +3,6 @@
 AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include
 AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC
 
-TESTS=run-tests.sh
-
 bin_PROGRAMS = gf_mult gf_div gf_add gf_time gf_methods gf_poly gf_inline_time
 
 gf_mult_SOURCES = gf_mult.c
@@ -35,3 +33,19 @@ gf_inline_time_SOURCES = gf_inline_time.c
 #gf_inline_time_LDFLAGS = -lgf_complete
 gf_inline_time_LDADD = ../src/libgf_complete.la
 
+# gf_unit tests as generated by gf_methods
+gf_unit_w%.sh: gf_methods
+	./$^ $(@:gf_unit_w%.sh=%) -A -U > $@ || rm $@
+
+TESTS = gf_unit_w128.sh \
+        gf_unit_w64.sh  \
+        gf_unit_w32.sh  \
+        gf_unit_w16.sh  \
+        gf_unit_w8.sh   \
+        gf_unit_w4.sh
+
+TEST_EXTENSIONS = .sh
+SH_LOG_COMPILER = $(SHELL)
+AM_SH_LOG_FLAGS = -e
+
+CLEANFILES = $(TESTS)
diff --git a/tools/run-tests.sh b/tools/run-tests.sh
deleted file mode 100755
index bd3cc60..0000000
--- a/tools/run-tests.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/bash
-
-for w in 4 8 16 32 64 128 ; do
-    ./gf_methods $w -A -U | sh -e
-    if [ $? != "0" ] ; then
-      echo "Failed unit tests for w=$w"
-      break
-    fi
-done
-- 
cgit v1.2.1


From 568df90edc6ae07744de45de8665fb86ce6c84ee Mon Sep 17 00:00:00 2001
From: Janne Grunau <j@jannau.net>
Date: Fri, 19 Sep 2014 12:30:57 +0200
Subject: simd: rename the region flags from SSE to SIMD

SSE is not the only supported SIMD instruction set. Keep the old names
for backward compatibility.
---
 include/gf_complete.h |  2 ++
 include/gf_int.h      | 10 +++---
 src/gf.c              | 91 ++++++++++++++++++++++++++-------------------------
 src/gf_method.c       | 10 ++++--
 src/gf_w128.c         |  6 ++--
 src/gf_w16.c          | 14 ++++----
 src/gf_w32.c          | 14 ++++----
 src/gf_w4.c           | 12 +++----
 src/gf_w64.c          | 16 ++++-----
 src/gf_w8.c           | 12 +++----
 tools/gf_methods.c    |  2 +-
 11 files changed, 99 insertions(+), 90 deletions(-)

diff --git a/include/gf_complete.h b/include/gf_complete.h
index 5806625..e8ea2ca 100644
--- a/include/gf_complete.h
+++ b/include/gf_complete.h
@@ -61,7 +61,9 @@ typedef enum {GF_MULT_DEFAULT,
 #define GF_REGION_DOUBLE_TABLE (0x1)
 #define GF_REGION_QUAD_TABLE   (0x2)
 #define GF_REGION_LAZY         (0x4)
+#define GF_REGION_SIMD         (0x8)
 #define GF_REGION_SSE          (0x8)
+#define GF_REGION_NOSIMD       (0x10)
 #define GF_REGION_NOSSE        (0x10)
 #define GF_REGION_ALTMAP       (0x20)
 #define GF_REGION_CAUCHY       (0x40)
diff --git a/include/gf_int.h b/include/gf_int.h
index 98294cc..32866f4 100644
--- a/include/gf_int.h
+++ b/include/gf_int.h
@@ -113,7 +113,7 @@ typedef enum {GF_E_MDEFDIV, /* Dev != Default && Mult == Default */
               GF_E_DIVCOMP, /* Mult == Composite && Div != Default */
               GF_E_CAUCOMP, /* Mult == Composite && Reg == CAUCHY */
               GF_E_DOUQUAD, /* Reg == DOUBLE && Reg == QUAD */
-              GF_E_SSE__NO, /* Reg == SSE && Reg == NOSSE */
+              GF_E_SIMD_NO, /* Reg == SIMD && Reg == NOSIMD */
               GF_E_CAUCHYB, /* Reg == CAUCHY && Other Reg */
               GF_E_CAUGT32, /* Reg == CAUCHY && w > 32*/
               GF_E_ARG1SET, /* Arg1 != 0 && Mult \notin COMPOSITE/SPLIT/GROUP */
@@ -129,9 +129,9 @@ typedef enum {GF_E_MDEFDIV, /* Dev != Default && Mult == Default */
               GF_E_QUAD__J, /* Reg == QUAD && other Reg */
               GF_E_LAZY__X, /* Reg == LAZY && not DOUBLE or QUAD*/
               GF_E_ALTSHIF, /* Mult == Shift && Reg == ALTMAP */
-              GF_E_SSESHIF, /* Mult == Shift && Reg == SSE|NOSSE */
+              GF_E_SSESHIF, /* Mult == Shift && Reg == SIMD|NOSIMD */
               GF_E_ALT_CFM, /* Mult == CARRY_FREE && Reg == ALTMAP */
-              GF_E_SSE_CFM, /* Mult == CARRY_FREE && Reg == SSE|NOSSE */
+              GF_E_SSE_CFM, /* Mult == CARRY_FREE && Reg == SIMD|NOSIMD */
               GF_E_PCLMULX, /* Mult == Carry_Free && No PCLMUL */
               GF_E_ALT_BY2, /* Mult == Bytwo_x && Reg == ALTMAP */
               GF_E_BY2_SSE, /* Mult == Bytwo_x && Reg == SSE && No SSE2 */
@@ -148,7 +148,7 @@ typedef enum {GF_E_MDEFDIV, /* Dev != Default && Mult == Default */
               GF_E_GR_AR_W, /* Mult == GROUP, either arg > w  */
               GF_E_GR____J, /* Mult == GROUP, Reg == SSE|ALTMAP|NOSSE */
               GF_E_TABLE_W, /* Mult == TABLE, w too big */
-              GF_E_TAB_SSE, /* Mult == TABLE, SSE|NOSSE only apply to w == 4 */
+              GF_E_TAB_SSE, /* Mult == TABLE, SIMD|NOSIMD only apply to w == 4 */
               GF_E_TABSSE3, /* Mult == TABLE, Need SSSE3 for SSE */
               GF_E_TAB_ALT, /* Mult == TABLE, Reg == ALTMAP */
               GF_E_SP128AR, /* Mult == SPLIT, w=128, Bad arg1/arg2 */
@@ -172,7 +172,7 @@ typedef enum {GF_E_MDEFDIV, /* Dev != Default && Mult == Default */
               GF_E_SP_8__A, /* Mult == SPLIT, w=8, no ALTMAP */
               GF_E_SP_SSE3, /* Mult == SPLIT, Need SSSE3 for SSE */
               GF_E_COMP_A2, /* Mult == COMP, arg1 must be = 2 */
-              GF_E_COMP_SS, /* Mult == COMP, SSE|NOSSE */
+              GF_E_COMP_SS, /* Mult == COMP, SIMD|NOSIMD */
               GF_E_COMP__W, /* Mult == COMP, Bad w. */
               GF_E_UNKFLAG, /* Unknown flag in create_from.... */
               GF_E_UNKNOWN, /* Unknown mult_type. */
diff --git a/src/gf.c b/src/gf.c
index 10c9b3c..ca6a7f8 100644
--- a/src/gf.c
+++ b/src/gf.c
@@ -41,7 +41,7 @@ void gf_error()
     case GF_E_MDEFARG: s = "If multiplication method == default, can't use arg1/arg2."; break;
     case GF_E_DIVCOMP: s = "Cannot change the division technique with -m COMPOSITE."; break;
     case GF_E_DOUQUAD: s = "Cannot specify -r DOUBLE and -r QUAD."; break;
-    case GF_E_SSE__NO: s = "Cannot specify -r SSE and -r NOSSE."; break;
+    case GF_E_SIMD_NO: s = "Cannot specify -r SIMD and -r NOSIMD."; break;
     case GF_E_CAUCHYB: s = "Cannot specify -r CAUCHY and any other -r."; break;
     case GF_E_CAUCOMP: s = "Cannot specify -m COMPOSITE and -r CAUCHY."; break;
     case GF_E_CAUGT32: s = "Cannot specify -r CAUCHY with w > 32."; break;
@@ -51,23 +51,23 @@ void gf_error()
     case GF_E_BAD___W: s = "W must be 1-32, 64 or 128."; break;
     case GF_E_DOUBLET: s = "Can only specify -r DOUBLE with -m TABLE."; break;
     case GF_E_DOUBLEW: s = "Can only specify -r DOUBLE w = 4 or w = 8."; break;
-    case GF_E_DOUBLEJ: s = "Cannot specify -r DOUBLE with -r ALTMAP|SSE|NOSSE."; break;
+    case GF_E_DOUBLEJ: s = "Cannot specify -r DOUBLE with -r ALTMAP|SIMD|NOSIMD."; break;
     case GF_E_DOUBLEL: s = "Can only specify -r DOUBLE -r LAZY with w = 8"; break;
     case GF_E_QUAD__T: s = "Can only specify -r QUAD with -m TABLE."; break;
     case GF_E_QUAD__W: s = "Can only specify -r QUAD w = 4."; break;
-    case GF_E_QUAD__J: s = "Cannot specify -r QUAD with -r ALTMAP|SSE|NOSSE."; break;
+    case GF_E_QUAD__J: s = "Cannot specify -r QUAD with -r ALTMAP|SIMD|NOSIMD."; break;
     case GF_E_BADPOLY: s = "Bad primitive polynomial (high bits set)."; break;
     case GF_E_COMP_PP: s = "Bad primitive polynomial -- bigger than sub-field."; break;
     case GF_E_LAZY__X: s = "If -r LAZY, then -r must be DOUBLE or QUAD."; break;
     case GF_E_ALTSHIF: s = "Cannot specify -m SHIFT and -r ALTMAP."; break;
-    case GF_E_SSESHIF: s = "Cannot specify -m SHIFT and -r SSE|NOSSE."; break;
+    case GF_E_SSESHIF: s = "Cannot specify -m SHIFT and -r SIMD|NOSIMD."; break;
     case GF_E_ALT_CFM: s = "Cannot specify -m CARRY_FREE and -r ALTMAP."; break;
-    case GF_E_SSE_CFM: s = "Cannot specify -m CARRY_FREE and -r SSE|NOSSE."; break;
+    case GF_E_SSE_CFM: s = "Cannot specify -m CARRY_FREE and -r SIMD|NOSIMD."; break;
     case GF_E_PCLMULX: s = "Specified -m CARRY_FREE, but PCLMUL is not supported."; break;
     case GF_E_ALT_BY2: s = "Cannot specify -m BYTWO_x and -r ALTMAP."; break;
-    case GF_E_BY2_SSE: s = "Specified -m BYTWO_x -r SSE, but SSE2 is not supported."; break;
+    case GF_E_BY2_SSE: s = "Specified -m BYTWO_x -r SIMD, but SSE2 is not supported."; break;
     case GF_E_LOGBADW: s = "With Log Tables, w must be <= 27."; break;
-    case GF_E_LOG___J: s = "Cannot use Log tables with -r ALTMAP|SSE|NOSSE."; break;
+    case GF_E_LOG___J: s = "Cannot use Log tables with -r ALTMAP|SIMD|NOSIMD."; break;
     case GF_E_LOGPOLY: s = "Cannot use Log tables because the polynomial is not primitive."; break;
     case GF_E_ZERBADW: s = "With -m LOG_ZERO, w must be 8 or 16."; break;
     case GF_E_ZEXBADW: s = "With -m LOG_ZERO_EXT, w must be 8."; break;
@@ -77,33 +77,33 @@ void gf_error()
     case GF_E_GR_128A: s = "With -m GROUP, w == 128, arg1 must be 4, and arg2 in { 4,8,16 }."; break;
     case GF_E_GR_A_27: s = "With -m GROUP, arg1 and arg2 must be <= 27."; break;
     case GF_E_GR_AR_W: s = "With -m GROUP, arg1 and arg2 must be <= w."; break;
-    case GF_E_GR____J: s = "Cannot use GROUP with -r ALTMAP|SSE|NOSSE."; break;
+    case GF_E_GR____J: s = "Cannot use GROUP with -r ALTMAP|SIMD|NOSIMD."; break;
     case GF_E_TABLE_W: s = "With -m TABLE, w must be < 15, or == 16."; break;
-    case GF_E_TAB_SSE: s = "With -m TABLE, SSE|NOSSE only applies to w=4."; break;
-    case GF_E_TABSSE3: s = "With -m TABLE, -r SSE, you need SSSE3 supported."; break;
+    case GF_E_TAB_SSE: s = "With -m TABLE, SIMD|NOSIMD only applies to w=4."; break;
+    case GF_E_TABSSE3: s = "With -m TABLE, -r SIMD, you need SSSE3 supported."; break;
     case GF_E_TAB_ALT: s = "With -m TABLE, you cannot use ALTMAP."; break;
     case GF_E_SP128AR: s = "With -m SPLIT, w=128, bad arg1/arg2."; break;
-    case GF_E_SP128AL: s = "With -m SPLIT, w=128, -r SSE requires -r ALTMAP."; break;
+    case GF_E_SP128AL: s = "With -m SPLIT, w=128, -r SIMD requires -r ALTMAP."; break;
     case GF_E_SP128AS: s = "With -m SPLIT, w=128, ALTMAP needs SSSE3 supported."; break;
     case GF_E_SP128_A: s = "With -m SPLIT, w=128, -r ALTMAP only with arg1/arg2 = 4/128."; break;
-    case GF_E_SP128_S: s = "With -m SPLIT, w=128, -r SSE|NOSSE only with arg1/arg2 = 4/128."; break;
+    case GF_E_SP128_S: s = "With -m SPLIT, w=128, -r SIMD|NOSIMD only with arg1/arg2 = 4/128."; break;
     case GF_E_SPLIT_W: s = "With -m SPLIT, w must be in {8, 16, 32, 64, 128}."; break;
     case GF_E_SP_16AR: s = "With -m SPLIT, w=16, Bad arg1/arg2."; break;
     case GF_E_SP_16_A: s = "With -m SPLIT, w=16, -r ALTMAP only with arg1/arg2 = 4/16."; break;
-    case GF_E_SP_16_S: s = "With -m SPLIT, w=16, -r SSE|NOSSE only with arg1/arg2 = 4/16."; break;
+    case GF_E_SP_16_S: s = "With -m SPLIT, w=16, -r SIMD|NOSIMD only with arg1/arg2 = 4/16."; break;
     case GF_E_SP_32AR: s = "With -m SPLIT, w=32, Bad arg1/arg2."; break;
     case GF_E_SP_32AS: s = "With -m SPLIT, w=32, -r ALTMAP needs SSSE3 supported."; break;
     case GF_E_SP_32_A: s = "With -m SPLIT, w=32, -r ALTMAP only with arg1/arg2 = 4/32."; break;
-    case GF_E_SP_32_S: s = "With -m SPLIT, w=32, -r SSE|NOSSE only with arg1/arg2 = 4/32."; break;
+    case GF_E_SP_32_S: s = "With -m SPLIT, w=32, -r SIMD|NOSIMD only with arg1/arg2 = 4/32."; break;
     case GF_E_SP_64AR: s = "With -m SPLIT, w=64, Bad arg1/arg2."; break;
     case GF_E_SP_64AS: s = "With -m SPLIT, w=64, -r ALTMAP needs SSSE3 supported."; break;
     case GF_E_SP_64_A: s = "With -m SPLIT, w=64, -r ALTMAP only with arg1/arg2 = 4/64."; break;
-    case GF_E_SP_64_S: s = "With -m SPLIT, w=64, -r SSE|NOSSE only with arg1/arg2 = 4/64."; break;
+    case GF_E_SP_64_S: s = "With -m SPLIT, w=64, -r SIMD|NOSIMD only with arg1/arg2 = 4/64."; break;
     case GF_E_SP_8_AR: s = "With -m SPLIT, w=8, Bad arg1/arg2."; break;
     case GF_E_SP_8__A: s = "With -m SPLIT, w=8, Can't have -r ALTMAP."; break;
-    case GF_E_SP_SSE3: s = "With -m SPLIT, Need SSSE3 support for SSE."; break;
+    case GF_E_SP_SSE3: s = "With -m SPLIT, Need SSSE3 support for SIMD."; break;
     case GF_E_COMP_A2: s = "With -m COMPOSITE, arg1 must equal 2."; break;
-    case GF_E_COMP_SS: s = "With -m COMPOSITE, -r SSE and -r NOSSE do not apply."; break;
+    case GF_E_COMP_SS: s = "With -m COMPOSITE, -r SIMD and -r NOSIMD do not apply."; break;
     case GF_E_COMP__W: s = "With -m COMPOSITE, w must be 8, 16, 32, 64 or 128."; break;
     case GF_E_UNKFLAG: s = "Unknown method flag - should be -m, -d, -r or -p."; break;
     case GF_E_UNKNOWN: s = "Unknown multiplication type."; break;
@@ -182,14 +182,14 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
   int sse3 = 0;
   int sse2 = 0;
   int pclmul = 0;
-  int rdouble, rquad, rlazy, rsse, rnosse, raltmap, rcauchy, tmp;
+  int rdouble, rquad, rlazy, rsimd, rnosimd, raltmap, rcauchy, tmp;
   gf_internal_t *sub;
 
   rdouble = (region_type & GF_REGION_DOUBLE_TABLE);
   rquad   = (region_type & GF_REGION_QUAD_TABLE);
   rlazy   = (region_type & GF_REGION_LAZY);
-  rsse    = (region_type & GF_REGION_SSE);
-  rnosse  = (region_type & GF_REGION_NOSSE);
+  rsimd   = (region_type & GF_REGION_SIMD);
+  rnosimd = (region_type & GF_REGION_NOSIMD);
   raltmap = (region_type & GF_REGION_ALTMAP);
   rcauchy = (region_type & GF_REGION_CAUCHY);
 
@@ -201,7 +201,8 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
   }
 
   tmp = ( GF_REGION_DOUBLE_TABLE | GF_REGION_QUAD_TABLE | GF_REGION_LAZY |
-          GF_REGION_SSE | GF_REGION_NOSSE | GF_REGION_ALTMAP | GF_REGION_CAUCHY );
+          GF_REGION_SIMD | GF_REGION_NOSIMD | GF_REGION_ALTMAP |
+          GF_REGION_CAUCHY );
   if (region_type & (~tmp)) { _gf_errno = GF_E_UNK_REG; return 0; }
 
 #ifdef INTEL_SSE2
@@ -230,7 +231,7 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
     return 1;
   }
   
-  if (rsse && rnosse)                                { _gf_errno = GF_E_SSE__NO; return 0; }
+  if (rsimd && rnosimd)                              { _gf_errno = GF_E_SIMD_NO; return 0; }
   if (rcauchy && w > 32)                             { _gf_errno = GF_E_CAUGT32; return 0; }
   if (rcauchy && region_type != GF_REGION_CAUCHY)    { _gf_errno = GF_E_CAUCHYB; return 0; }
   if (rcauchy && mult_type == GF_MULT_COMPOSITE)     { _gf_errno = GF_E_CAUCOMP; return 0; }
@@ -252,7 +253,7 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
     if (rquad)                      { _gf_errno = GF_E_DOUQUAD; return 0; }
     if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_DOUBLET; return 0; }
     if (w != 4 && w != 8)           { _gf_errno = GF_E_DOUBLEW; return 0; }
-    if (rsse || rnosse || raltmap)  { _gf_errno = GF_E_DOUBLEJ; return 0; }
+    if (rsimd || rnosimd || raltmap) { _gf_errno = GF_E_DOUBLEJ; return 0; }
     if (rlazy && w == 4)            { _gf_errno = GF_E_DOUBLEL; return 0; }
     return 1;
   }
@@ -260,7 +261,7 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
   if (rquad) {
     if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_QUAD__T; return 0; }
     if (w != 4)                     { _gf_errno = GF_E_QUAD__W; return 0; }
-    if (rsse || rnosse || raltmap)  { _gf_errno = GF_E_QUAD__J; return 0; }
+    if (rsimd || rnosimd || raltmap) { _gf_errno = GF_E_QUAD__J; return 0; }
     return 1;
   }
 
@@ -268,7 +269,7 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
 
   if (mult_type == GF_MULT_SHIFT) {
     if (raltmap)                    { _gf_errno = GF_E_ALTSHIF; return 0; }
-    if (rsse || rnosse)             { _gf_errno = GF_E_SSESHIF; return 0; }
+    if (rsimd || rnosimd)           { _gf_errno = GF_E_SSESHIF; return 0; }
     return 1;
   }
 
@@ -281,7 +282,7 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
     if (w == 32 && (poly & 0xfe000000))            { _gf_errno = GF_E_CF32POL; return 0; }
     if (w == 64 && (poly & 0xfffe000000000000ULL)) { _gf_errno = GF_E_CF64POL; return 0; }
     if (raltmap)                                   { _gf_errno = GF_E_ALT_CFM; return 0; }
-    if (rsse || rnosse)                            { _gf_errno = GF_E_SSE_CFM; return 0; }
+    if (rsimd || rnosimd)                          { _gf_errno = GF_E_SSE_CFM; return 0; }
     if (!pclmul)                                   { _gf_errno = GF_E_PCLMULX; return 0; }
     return 1;
   }
@@ -290,21 +291,21 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
     if (w != 4 && w != 8 && w != 16 &&
         w != 32 && w != 64 && w != 128)            { _gf_errno = GF_E_CFM___W; return 0; }
     if (raltmap)                                   { _gf_errno = GF_E_ALT_CFM; return 0; }
-    if (rsse || rnosse)                            { _gf_errno = GF_E_SSE_CFM; return 0; }
+    if (rsimd || rnosimd)                          { _gf_errno = GF_E_SSE_CFM; return 0; }
     if (!pclmul)                                   { _gf_errno = GF_E_PCLMULX; return 0; }
     return 1;
   }
 
   if (mult_type == GF_MULT_BYTWO_p || mult_type == GF_MULT_BYTWO_b) {
     if (raltmap)                    { _gf_errno = GF_E_ALT_BY2; return 0; }
-    if (rsse && !sse2)              { _gf_errno = GF_E_BY2_SSE; return 0; }
+    if (rsimd && !sse2)              { _gf_errno = GF_E_BY2_SSE; return 0; }
     return 1;
   }
 
   if (mult_type == GF_MULT_LOG_TABLE || mult_type == GF_MULT_LOG_ZERO
                                      || mult_type == GF_MULT_LOG_ZERO_EXT ) {
     if (w > 27)                     { _gf_errno = GF_E_LOGBADW; return 0; }
-    if (raltmap || rsse || rnosse)  { _gf_errno = GF_E_LOG___J; return 0; }
+    if (raltmap || rsimd || rnosimd) { _gf_errno = GF_E_LOG___J; return 0; }
 
     if (mult_type == GF_MULT_LOG_TABLE) return 1;
 
@@ -324,14 +325,14 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
        (arg2 != 4 && arg2 != 8 && arg2 != 16))) { _gf_errno = GF_E_GR_128A; return 0; }
     if (arg1 > 27 || arg2 > 27)                 { _gf_errno = GF_E_GR_A_27; return 0; }
     if (arg1 > w || arg2 > w)                   { _gf_errno = GF_E_GR_AR_W; return 0; }
-    if (raltmap || rsse || rnosse)              { _gf_errno = GF_E_GR____J; return 0; }
+    if (raltmap || rsimd || rnosimd)            { _gf_errno = GF_E_GR____J; return 0; }
     return 1;
   }
   
   if (mult_type == GF_MULT_TABLE) {
     if (w != 16 && w >= 15)                     { _gf_errno = GF_E_TABLE_W; return 0; }
-    if (w != 4 && (rsse || rnosse))             { _gf_errno = GF_E_TAB_SSE; return 0; }
-    if (rsse && !sse3)                          { _gf_errno = GF_E_TABSSE3; return 0; }
+    if (w != 4 && (rsimd || rnosimd))           { _gf_errno = GF_E_TAB_SSE; return 0; }
+    if (rsimd && !sse3)                         { _gf_errno = GF_E_TABSSE3; return 0; }
     if (raltmap)                                { _gf_errno = GF_E_TAB_ALT; return 0; }
     return 1;
   }
@@ -344,46 +345,46 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
     }
     if (w == 8) {
       if (arg1 != 4 || arg2 != 8)               { _gf_errno = GF_E_SP_8_AR; return 0; }
-      if (rsse && !sse3)                        { _gf_errno = GF_E_SP_SSE3; return 0; }
+      if (rsimd && !sse3)                       { _gf_errno = GF_E_SP_SSE3; return 0; }
       if (raltmap)                              { _gf_errno = GF_E_SP_8__A; return 0; }
     } else if (w == 16) {
       if ((arg1 == 8 && arg2 == 8) ||
           (arg1 == 8 && arg2 == 16)) {
-        if (rsse || rnosse)                     { _gf_errno = GF_E_SP_16_S; return 0; }
+        if (rsimd || rnosimd)                   { _gf_errno = GF_E_SP_16_S; return 0; }
         if (raltmap)                            { _gf_errno = GF_E_SP_16_A; return 0; }
       } else if (arg1 == 4 && arg2 == 16) {
-        if (rsse && !sse3)                      { _gf_errno = GF_E_SP_SSE3; return 0; }
+        if (rsimd && !sse3)                     { _gf_errno = GF_E_SP_SSE3; return 0; }
       } else                                    { _gf_errno = GF_E_SP_16AR; return 0; }
     } else if (w == 32) {
       if ((arg1 == 8 && arg2 == 8) ||
           (arg1 == 8 && arg2 == 32) ||
           (arg1 == 16 && arg2 == 32)) {
-        if (rsse || rnosse)                     { _gf_errno = GF_E_SP_32_S; return 0; }
+        if (rsimd || rnosimd)                   { _gf_errno = GF_E_SP_32_S; return 0; }
         if (raltmap)                            { _gf_errno = GF_E_SP_32_A; return 0; }
       } else if (arg1 == 4 && arg2 == 32) {
-        if (rsse && !sse3)                      { _gf_errno = GF_E_SP_SSE3; return 0; }
+        if (rsimd && !sse3)                     { _gf_errno = GF_E_SP_SSE3; return 0; }
         if (raltmap && !sse3)                   { _gf_errno = GF_E_SP_32AS; return 0; }
-        if (raltmap && rnosse)                  { _gf_errno = GF_E_SP_32AS; return 0; }
+        if (raltmap && rnosimd)                 { _gf_errno = GF_E_SP_32AS; return 0; }
       } else                                    { _gf_errno = GF_E_SP_32AR; return 0; }
     } else if (w == 64) {
       if ((arg1 == 8 && arg2 == 8) ||
           (arg1 == 8 && arg2 == 64) ||
           (arg1 == 16 && arg2 == 64)) {
-        if (rsse || rnosse)                     { _gf_errno = GF_E_SP_64_S; return 0; }
+        if (rsimd || rnosimd)                   { _gf_errno = GF_E_SP_64_S; return 0; }
         if (raltmap)                            { _gf_errno = GF_E_SP_64_A; return 0; }
       } else if (arg1 == 4 && arg2 == 64) {
-        if (rsse && !sse3)                      { _gf_errno = GF_E_SP_SSE3; return 0; }
+        if (rsimd && !sse3)                     { _gf_errno = GF_E_SP_SSE3; return 0; }
         if (raltmap && !sse3)                   { _gf_errno = GF_E_SP_64AS; return 0; }
-        if (raltmap && rnosse)                  { _gf_errno = GF_E_SP_64AS; return 0; }
+        if (raltmap && rnosimd)                 { _gf_errno = GF_E_SP_64AS; return 0; }
       } else                                    { _gf_errno = GF_E_SP_64AR; return 0; }
     } else if (w == 128) {
       if (arg1 == 8 && arg2 == 128) {
-        if (rsse || rnosse)                     { _gf_errno = GF_E_SP128_S; return 0; }
+        if (rsimd || rnosimd)                   { _gf_errno = GF_E_SP128_S; return 0; }
         if (raltmap)                            { _gf_errno = GF_E_SP128_A; return 0; }
       } else if (arg1 == 4 && arg2 == 128) {
-        if (rsse && !sse3)                      { _gf_errno = GF_E_SP_SSE3; return 0; }
+        if (rsimd && !sse3)                     { _gf_errno = GF_E_SP_SSE3; return 0; }
         if (raltmap && !sse3)                   { _gf_errno = GF_E_SP128AS; return 0; }
-        if (raltmap && rnosse)                  { _gf_errno = GF_E_SP128AS; return 0; }
+        if (raltmap && rnosimd)                 { _gf_errno = GF_E_SP128AS; return 0; }
       } else                                    { _gf_errno = GF_E_SP128AR; return 0; }
     } else                                      { _gf_errno = GF_E_SPLIT_W; return 0; }
     return 1;
@@ -395,7 +396,7 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
     if (w < 128 && (poly >> (w/2)) != 0)                   { _gf_errno = GF_E_COMP_PP; return 0; }
     if (divide_type != GF_DIVIDE_DEFAULT)       { _gf_errno = GF_E_DIVCOMP; return 0; }
     if (arg1 != 2)                              { _gf_errno = GF_E_COMP_A2; return 0; }
-    if (rsse || rnosse)                         { _gf_errno = GF_E_COMP_SS; return 0; }
+    if (rsimd || rnosimd)                       { _gf_errno = GF_E_COMP_SS; return 0; }
     if (base != NULL) {
       sub = (gf_internal_t *) base->scratch;
       if (sub->w != w/2)                      { _gf_errno = GF_E_BASE__W; return 0; }
diff --git a/src/gf_method.c b/src/gf_method.c
index 2548a63..2210305 100644
--- a/src/gf_method.c
+++ b/src/gf_method.c
@@ -121,11 +121,17 @@ int create_gf_from_argv(gf_t *gf, int w, int argc, char **argv, int starting)
         } else if (strcmp(argv[starting], "LAZY") == 0) {
           region_type |= GF_REGION_LAZY;
           starting++;
+        } else if (strcmp(argv[starting], "SIMD") == 0) {
+          region_type |= GF_REGION_SIMD;
+          starting++;
+        } else if (strcmp(argv[starting], "NOSIMD") == 0) {
+          region_type |= GF_REGION_NOSIMD;
+          starting++;
         } else if (strcmp(argv[starting], "SSE") == 0) {
-          region_type |= GF_REGION_SSE;
+          region_type |= GF_REGION_SIMD;
           starting++;
         } else if (strcmp(argv[starting], "NOSSE") == 0) {
-          region_type |= GF_REGION_NOSSE;
+          region_type |= GF_REGION_NOSIMD;
           starting++;
         } else if (strcmp(argv[starting], "CAUCHY") == 0) {
           region_type |= GF_REGION_CAUCHY;
diff --git a/src/gf_w128.c b/src/gf_w128.c
index 66f9422..190f6b0 100644
--- a/src/gf_w128.c
+++ b/src/gf_w128.c
@@ -1527,7 +1527,7 @@ int gf_w128_split_init(gf_t *gf)
 
   gf->multiply.w128 = gf_w128_bytwo_p_multiply;
 #if defined(INTEL_SSE4_PCLMUL)
-  if (!(h->region_type & GF_REGION_NOSSE)){
+  if (!(h->region_type & GF_REGION_NOSIMD)){
     gf->multiply.w128 = gf_w128_clm_multiply;
   }
 #endif
@@ -1546,7 +1546,7 @@ int gf_w128_split_init(gf_t *gf)
     if((h->region_type & GF_REGION_ALTMAP))
     {
       #ifdef INTEL_SSE4
-        if(!(h->region_type & GF_REGION_NOSSE))
+        if(!(h->region_type & GF_REGION_NOSIMD))
           gf->multiply_region.w128 = gf_w128_split_4_128_sse_altmap_multiply_region;
         else
           return 0;
@@ -1556,7 +1556,7 @@ int gf_w128_split_init(gf_t *gf)
     }
     else {
       #ifdef INTEL_SSE4
-        if(!(h->region_type & GF_REGION_NOSSE))
+        if(!(h->region_type & GF_REGION_NOSIMD))
           gf->multiply_region.w128 = gf_w128_split_4_128_sse_multiply_region;
         else
           gf->multiply_region.w128 = gf_w128_split_4_128_multiply_region;
diff --git a/src/gf_w16.c b/src/gf_w16.c
index c4cd22d..0904115 100644
--- a/src/gf_w16.c
+++ b/src/gf_w16.c
@@ -1327,14 +1327,14 @@ int gf_w16_split_init(gf_t *gf)
 
   } else if ((h->arg1 == 4 && h->arg2 == 16) || (h->arg2 == 4 && h->arg1 == 16)) {
     if (issse3) {
-      if(h->region_type & GF_REGION_ALTMAP && h->region_type & GF_REGION_NOSSE)
+      if(h->region_type & GF_REGION_ALTMAP && h->region_type & GF_REGION_NOSIMD)
         gf->multiply_region.w32 = gf_w16_split_4_16_lazy_nosse_altmap_multiply_region;
-      else if(h->region_type & GF_REGION_NOSSE)
+      else if(h->region_type & GF_REGION_NOSIMD)
         gf->multiply_region.w32 = gf_w16_split_4_16_lazy_multiply_region;
       else if(h->region_type & GF_REGION_ALTMAP)
         gf->multiply_region.w32 = gf_w16_split_4_16_lazy_sse_altmap_multiply_region;
     } else {
-      if(h->region_type & GF_REGION_SSE)
+      if(h->region_type & GF_REGION_SIMD)
         return 0;
       else if(h->region_type & GF_REGION_ALTMAP)
         gf->multiply_region.w32 = gf_w16_split_4_16_lazy_nosse_altmap_multiply_region;
@@ -1884,25 +1884,25 @@ int gf_w16_bytwo_init(gf_t *gf)
   if (h->mult_type == GF_MULT_BYTWO_p) {
     gf->multiply.w32 = gf_w16_bytwo_p_multiply;
     #ifdef INTEL_SSE2
-      if (h->region_type & GF_REGION_NOSSE)
+      if (h->region_type & GF_REGION_NOSIMD)
         gf->multiply_region.w32 = gf_w16_bytwo_p_nosse_multiply_region;
       else
         gf->multiply_region.w32 = gf_w16_bytwo_p_sse_multiply_region;
     #else
       gf->multiply_region.w32 = gf_w16_bytwo_p_nosse_multiply_region;
-      if(h->region_type & GF_REGION_SSE)
+      if(h->region_type & GF_REGION_SIMD)
         return 0;
     #endif
   } else {
     gf->multiply.w32 = gf_w16_bytwo_b_multiply;
     #ifdef INTEL_SSE2
-      if (h->region_type & GF_REGION_NOSSE)
+      if (h->region_type & GF_REGION_NOSIMD)
         gf->multiply_region.w32 = gf_w16_bytwo_b_nosse_multiply_region;
       else
         gf->multiply_region.w32 = gf_w16_bytwo_b_sse_multiply_region;
     #else
       gf->multiply_region.w32 = gf_w16_bytwo_b_nosse_multiply_region;
-      if(h->region_type & GF_REGION_SSE)
+      if(h->region_type & GF_REGION_SIMD)
         return 0;
     #endif
   }
diff --git a/src/gf_w32.c b/src/gf_w32.c
index 5ec2aa7..8e7c741 100644
--- a/src/gf_w32.c
+++ b/src/gf_w32.c
@@ -1434,25 +1434,25 @@ int gf_w32_bytwo_init(gf_t *gf)
   if (h->mult_type == GF_MULT_BYTWO_p) {
     gf->multiply.w32 = gf_w32_bytwo_p_multiply;
     #ifdef INTEL_SSE2
-      if (h->region_type & GF_REGION_NOSSE)
+      if (h->region_type & GF_REGION_NOSIMD)
         gf->multiply_region.w32 = gf_w32_bytwo_p_nosse_multiply_region; 
       else
         gf->multiply_region.w32 = gf_w32_bytwo_p_sse_multiply_region; 
     #else
       gf->multiply_region.w32 = gf_w32_bytwo_p_nosse_multiply_region; 
-      if(h->region_type & GF_REGION_SSE)
+      if(h->region_type & GF_REGION_SIMD)
         return 0;
     #endif
   } else {
     gf->multiply.w32 = gf_w32_bytwo_b_multiply; 
     #ifdef INTEL_SSE2
-      if (h->region_type & GF_REGION_NOSSE)
+      if (h->region_type & GF_REGION_NOSIMD)
         gf->multiply_region.w32 = gf_w32_bytwo_b_nosse_multiply_region; 
       else
         gf->multiply_region.w32 = gf_w32_bytwo_b_sse_multiply_region; 
     #else
       gf->multiply_region.w32 = gf_w32_bytwo_b_nosse_multiply_region; 
-      if(h->region_type & GF_REGION_SSE)
+      if(h->region_type & GF_REGION_SIMD)
         return 0;
     #endif
   }
@@ -2335,13 +2335,13 @@ int gf_w32_split_init(gf_t *gf)
     ld2 = (struct gf_split_2_32_lazy_data *) h->private;
     ld2->last_value = 0;
     #ifdef INTEL_SSSE3
-      if (!(h->region_type & GF_REGION_NOSSE))
+      if (!(h->region_type & GF_REGION_NOSIMD))
         gf->multiply_region.w32 = gf_w32_split_2_32_lazy_sse_multiply_region;
       else
         gf->multiply_region.w32 = gf_w32_split_2_32_lazy_multiply_region;
     #else
       gf->multiply_region.w32 = gf_w32_split_2_32_lazy_multiply_region;
-      if(h->region_type & GF_REGION_SSE) return 0;
+      if(h->region_type & GF_REGION_SIMD) return 0;
     #endif
     return 1;
   } 
@@ -2352,7 +2352,7 @@ int gf_w32_split_init(gf_t *gf)
       (issse3 && h->mult_type == GF_REGION_DEFAULT)) {
     ld4 = (struct gf_split_4_32_lazy_data *) h->private;
     ld4->last_value = 0;
-    if ((h->region_type & GF_REGION_NOSSE) || !issse3) {
+    if ((h->region_type & GF_REGION_NOSIMD) || !issse3) {
       gf->multiply_region.w32 = gf_w32_split_4_32_lazy_multiply_region;
     } else if (h->region_type & GF_REGION_ALTMAP) {
       gf->multiply_region.w32 = gf_w32_split_4_32_lazy_sse_altmap_multiply_region;
diff --git a/src/gf_w4.c b/src/gf_w4.c
index 6bc79d0..f098323 100644
--- a/src/gf_w4.c
+++ b/src/gf_w4.c
@@ -490,13 +490,13 @@ int gf_w4_single_table_init(gf_t *gf)
   gf->divide.w32 = gf_w4_single_table_divide;
   gf->multiply.w32 = gf_w4_single_table_multiply;
   #ifdef INTEL_SSSE3
-    if(h->region_type & (GF_REGION_NOSSE | GF_REGION_CAUCHY))
+    if(h->region_type & (GF_REGION_NOSIMD | GF_REGION_CAUCHY))
       gf->multiply_region.w32 = gf_w4_single_table_multiply_region;
     else
       gf->multiply_region.w32 = gf_w4_single_table_sse_multiply_region;
   #else
     gf->multiply_region.w32 = gf_w4_single_table_multiply_region;
-    if (h->region_type & GF_REGION_SSE) return 0;
+    if (h->region_type & GF_REGION_SIMD) return 0;
   #endif
 
   return 1;
@@ -1905,25 +1905,25 @@ int gf_w4_bytwo_init(gf_t *gf)
   if (h->mult_type == GF_MULT_BYTWO_p) {
     gf->multiply.w32 = gf_w4_bytwo_p_multiply;
     #ifdef INTEL_SSE2
-      if (h->region_type & GF_REGION_NOSSE)
+      if (h->region_type & GF_REGION_NOSIMD)
         gf->multiply_region.w32 = gf_w4_bytwo_p_nosse_multiply_region;
       else
         gf->multiply_region.w32 = gf_w4_bytwo_p_sse_multiply_region;
     #else
       gf->multiply_region.w32 = gf_w4_bytwo_p_nosse_multiply_region;
-      if (h->region_type & GF_REGION_SSE)
+      if (h->region_type & GF_REGION_SIMD)
         return 0;
     #endif
   } else {
     gf->multiply.w32 = gf_w4_bytwo_b_multiply;
     #ifdef INTEL_SSE2
-      if (h->region_type & GF_REGION_NOSSE)
+      if (h->region_type & GF_REGION_NOSIMD)
         gf->multiply_region.w32 = gf_w4_bytwo_b_nosse_multiply_region;
       else
         gf->multiply_region.w32 = gf_w4_bytwo_b_sse_multiply_region;
     #else
       gf->multiply_region.w32 = gf_w4_bytwo_b_nosse_multiply_region;
-      if (h->region_type & GF_REGION_SSE)
+      if (h->region_type & GF_REGION_SIMD)
         return 0;
     #endif
   }
diff --git a/src/gf_w64.c b/src/gf_w64.c
index fdc4a7c..fe1c75d 100644
--- a/src/gf_w64.c
+++ b/src/gf_w64.c
@@ -1488,25 +1488,25 @@ int gf_w64_bytwo_init(gf_t *gf)
   if (h->mult_type == GF_MULT_BYTWO_p) {
     gf->multiply.w64 = gf_w64_bytwo_p_multiply;
     #ifdef INTEL_SSE2 
-      if (h->region_type & GF_REGION_NOSSE)
+      if (h->region_type & GF_REGION_NOSIMD)
         gf->multiply_region.w64 = gf_w64_bytwo_p_nosse_multiply_region; 
       else
         gf->multiply_region.w64 = gf_w64_bytwo_p_sse_multiply_region; 
     #else
       gf->multiply_region.w64 = gf_w64_bytwo_p_nosse_multiply_region; 
-      if(h->region_type & GF_REGION_SSE)
+      if(h->region_type & GF_REGION_SIMD)
         return 0;
     #endif
   } else {
     gf->multiply.w64 = gf_w64_bytwo_b_multiply;
     #ifdef INTEL_SSE2 
-      if (h->region_type & GF_REGION_NOSSE)
+      if (h->region_type & GF_REGION_NOSIMD)
         gf->multiply_region.w64 = gf_w64_bytwo_b_nosse_multiply_region; 
       else
         gf->multiply_region.w64 = gf_w64_bytwo_b_sse_multiply_region; 
     #else
       gf->multiply_region.w64 = gf_w64_bytwo_b_nosse_multiply_region; 
-      if(h->region_type & GF_REGION_SSE)
+      if(h->region_type & GF_REGION_SIMD)
         return 0;
     #endif
   }
@@ -2006,7 +2006,7 @@ int gf_w64_split_init(gf_t *gf)
   gf->multiply.w64 = gf_w64_bytwo_p_multiply; 
 
 #if defined(INTEL_SSE4_PCLMUL) 
-  if ((!(h->region_type & GF_REGION_NOSSE) &&
+  if ((!(h->region_type & GF_REGION_NOSIMD) &&
      (h->arg1 == 64 || h->arg2 == 64)) ||
      h->mult_type == GF_MULT_DEFAULT){
    
@@ -2045,7 +2045,7 @@ int gf_w64_split_init(gf_t *gf)
     d4 = (struct gf_split_4_64_lazy_data *) h->private;
     d4->last_value = 0;
 
-    if((h->region_type & GF_REGION_ALTMAP) && (h->region_type & GF_REGION_NOSSE)) return 0;
+    if((h->region_type & GF_REGION_ALTMAP) && (h->region_type & GF_REGION_NOSIMD)) return 0;
     if(h->region_type & GF_REGION_ALTMAP)
     {
       #ifdef INTEL_SSSE3
@@ -2057,13 +2057,13 @@ int gf_w64_split_init(gf_t *gf)
     else //no altmap
     {
       #ifdef INTEL_SSE4
-        if(h->region_type & GF_REGION_NOSSE)
+        if(h->region_type & GF_REGION_NOSIMD)
           gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region;
         else
           gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_multiply_region; 
       #else
         gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region;
-        if(h->region_type & GF_REGION_SSE)
+        if(h->region_type & GF_REGION_SIMD)
           return 0;
       #endif
     }
diff --git a/src/gf_w8.c b/src/gf_w8.c
index 67fd688..bc4f5d1 100644
--- a/src/gf_w8.c
+++ b/src/gf_w8.c
@@ -1180,13 +1180,13 @@ int gf_w8_split_init(gf_t *gf)
   gf->multiply.w32 = gf_w8_split_multiply;
   
   #ifdef INTEL_SSSE3
-    if (h->region_type & GF_REGION_NOSSE)
+    if (h->region_type & GF_REGION_NOSIMD)
       gf->multiply_region.w32 = gf_w8_split_multiply_region;
     else
       gf->multiply_region.w32 = gf_w8_split_multiply_region_sse;
   #else
     gf->multiply_region.w32 = gf_w8_split_multiply_region;
-    if(h->region_type & GF_REGION_SSE)
+    if(h->region_type & GF_REGION_SIMD)
       return 0;
   #endif
 
@@ -2259,25 +2259,25 @@ int gf_w8_bytwo_init(gf_t *gf)
   if (h->mult_type == GF_MULT_BYTWO_p) {
     gf->multiply.w32 = gf_w8_bytwo_p_multiply;
 #ifdef INTEL_SSE2
-    if (h->region_type & GF_REGION_NOSSE)
+    if (h->region_type & GF_REGION_NOSIMD)
       gf->multiply_region.w32 = gf_w8_bytwo_p_nosse_multiply_region;
     else
       gf->multiply_region.w32 = gf_w8_bytwo_p_sse_multiply_region;
 #else
     gf->multiply_region.w32 = gf_w8_bytwo_p_nosse_multiply_region;
-    if(h->region_type & GF_REGION_SSE)
+    if(h->region_type & GF_REGION_SIMD)
       return 0;
 #endif
   } else {
     gf->multiply.w32 = gf_w8_bytwo_b_multiply;
 #ifdef INTEL_SSE2
-    if (h->region_type & GF_REGION_NOSSE)
+    if (h->region_type & GF_REGION_NOSIMD)
       gf->multiply_region.w32 = gf_w8_bytwo_b_nosse_multiply_region;
     else
       gf->multiply_region.w32 = gf_w8_bytwo_b_sse_multiply_region;
 #else
     gf->multiply_region.w32 = gf_w8_bytwo_b_nosse_multiply_region;
-    if(h->region_type & GF_REGION_SSE)
+    if(h->region_type & GF_REGION_SIMD)
       return 0;
 #endif
   }
diff --git a/tools/gf_methods.c b/tools/gf_methods.c
index 43589ac..c7d3d58 100644
--- a/tools/gf_methods.c
+++ b/tools/gf_methods.c
@@ -28,7 +28,7 @@ static char *MULTS[NMULTS] = { "SHIFT", "CARRY_FREE", "CARRY_FREE_GK", "GROUP44"
 /* Make sure CAUCHY is last */
 
 #define NREGIONS (7) 
-static char *REGIONS[NREGIONS] = { "DOUBLE", "QUAD", "LAZY", "SSE", "NOSSE", 
+static char *REGIONS[NREGIONS] = { "DOUBLE", "QUAD", "LAZY", "SIMD", "NOSIMD",
                                    "ALTMAP", "CAUCHY" };
 
 #define BNREGIONS (4) 
-- 
cgit v1.2.1


From eb5ce0ca4206ed4f74009c1b9a3a72407693448b Mon Sep 17 00:00:00 2001
From: Janne Grunau <j@jannau.net>
Date: Thu, 4 Sep 2014 18:29:58 +0200
Subject: configure: add ARM/AArch64 NEON support

Checks for arm_neon.h header.
---
 configure.ac          | 21 +++++++++++++++++++++
 include/gf_complete.h |  4 ++++
 m4/ax_ext.m4          | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 74 insertions(+)

diff --git a/configure.ac b/configure.ac
index 47d5d62..31ab1fa 100644
--- a/configure.ac
+++ b/configure.ac
@@ -24,6 +24,27 @@ AC_PROG_CC
 
 AX_EXT()
 
+AC_ARG_ENABLE([neon],
+              AS_HELP_STRING([--disable-neon], [Build without NEON optimizations]))
+
+AS_IF([test "x$enable_neon" != "xno"],
+      [noneon_CPPFLAGS=$CPPFLAGS
+       CPPFLAGS="$CPPFLAGS $SIMD_FLAGS"
+       AC_CHECK_HEADER([arm_neon.h],
+                       [have_neon=yes],
+                       [have_neon=no
+                        CPPFLAGS=$noneon_CPPFLAGS])],
+      [have_neon=no
+       AS_IF([test "x$ax_cv_have_neon_ext" = "xyes"],
+             [SIMD_FLAGS=""])
+      ])
+
+AS_IF([test "x$have_neon" = "xno"],
+      [AS_IF([test "x$enable_neon" = "xyes"],
+             [AC_MSG_ERROR([neon requested but arm_neon.h not found])])
+      ])
+AM_CONDITIONAL([HAVE_NEON], [test "x$have_neon" = "xyes"])
+
 AC_ARG_ENABLE([sse],
               AS_HELP_STRING([--disable-sse], [Build without SSE optimizations]),
               [if   test "x$enableval" = "xno" ; then
diff --git a/include/gf_complete.h b/include/gf_complete.h
index e8ea2ca..c4783e8 100644
--- a/include/gf_complete.h
+++ b/include/gf_complete.h
@@ -33,6 +33,10 @@
   #include <wmmintrin.h>
 #endif
 
+#if defined(ARM_NEON)
+  #include <arm_neon.h>
+#endif
+
 
 /* These are the different ways to perform multiplication.
    Not all are implemented for all values of w.
diff --git a/m4/ax_ext.m4 b/m4/ax_ext.m4
index cfbb797..c03ccef 100644
--- a/m4/ax_ext.m4
+++ b/m4/ax_ext.m4
@@ -41,6 +41,55 @@ AC_DEFUN([AX_EXT],
   AC_REQUIRE([AC_CANONICAL_HOST])
 
   case $host_cpu in
+    aarch64*)
+      AC_DEFINE(HAVE_ARCH_AARCH64,,[targeting AArch64])
+      SIMD_FLAGS="$SIMD_FLAGS -DARCH_AARCH64"
+
+      AC_CACHE_CHECK([whether NEON is supported], [ax_cv_have_neon_ext],
+          [
+            # TODO: detect / cross-compile
+            ax_cv_have_neon_ext=yes
+          ])
+      AC_CACHE_CHECK([whether cryptographic extension is supported], [ax_cv_have_arm_crypt_ext],
+          [
+            # TODO: detect / cross-compile
+            ax_cv_have_arm_crypt_ext=yes
+          ])
+
+      if test "$ax_cv_have_arm_crypt_ext" = yes; then
+        AC_DEFINE(HAVE_ARM_CRYPT_EXT,,[Support ARM cryptographic extension])
+      fi
+
+      if test "$ax_cv_have_neon_ext" = yes; then
+        AC_DEFINE(HAVE_NEON,,[Support NEON instructions])
+      fi
+
+      if test "$ax_cv_have_arm_crypt_ext" = yes && test "$ax_cv_have_neon_ext" = yes; then
+          AX_CHECK_COMPILE_FLAG(-march=armv8-a+simd+crypto,
+                                SIMD_FLAGS="$SIMD_FLAGS -march=armv8-a+simd+crypto -DARM_CRYPT -DARM_NEON", [])
+      elif test "$ax_cv_have_arm_crypt_ext" = yes; then
+          AX_CHECK_COMPILE_FLAG(-march=armv8-a+crypto,
+                                SIMD_FLAGS="$SIMD_FLAGS -march=armv8-a+crypto -DARM_CRYPT", [])
+      elif test "$ax_cv_have_neon_ext" = yes; then
+          AX_CHECK_COMPILE_FLAG(-march=armv8-a+simd,
+                                SIMD_FLAGS="$SIMD_FLAGS -march=armv8-a+simd -DARM_NEON", [])
+      fi
+    ;;
+
+    arm*)
+      AC_CACHE_CHECK([whether NEON is supported], [ax_cv_have_neon_ext],
+          [
+            # TODO: detect / cross-compile
+            ax_cv_have_neon_ext=yes
+          ])
+
+      if test "$ax_cv_have_neon_ext" = yes; then
+        AC_DEFINE(HAVE_NEON,,[Support NEON instructions])
+        AX_CHECK_COMPILE_FLAG(-mfpu=neon,
+                                SIMD_FLAGS="$SIMD_FLAGS -mfpu=neon -DARM_NEON", [])
+      fi
+    ;;
+
     powerpc*)
       AC_CACHE_CHECK([whether altivec is supported], [ax_cv_have_altivec_ext],
           [
-- 
cgit v1.2.1


From 36e75c3efec08b1e9bdb9c1f69a5b0018abd8ac7 Mon Sep 17 00:00:00 2001
From: Janne Grunau <j@jannau.net>
Date: Fri, 5 Sep 2014 13:33:04 +0200
Subject: use posix_memalign to align memory for SIMD region tests

Properly emulate aligned allocation if posix_memalign is not available.
---
 configure.ac    |  7 +++++++
 test/gf_unit.c  | 48 +++++++++++++++++++++++++++++++++++++-----------
 tools/gf_time.c | 24 ++++++++++++++++++++++--
 3 files changed, 66 insertions(+), 13 deletions(-)

diff --git a/configure.ac b/configure.ac
index 31ab1fa..ad7bb83 100644
--- a/configure.ac
+++ b/configure.ac
@@ -22,6 +22,13 @@ AM_MAINTAINER_MODE([disable])
 dnl Compiling with per-target flags requires AM_PROG_CC_C_O.
 AC_PROG_CC
 
+# Check for functions to provide aligned memory
+#
+AC_CHECK_FUNCS([posix_memalign],
+ [found_memalign=yes; break])
+
+AS_IF([test "x$found_memalign" != "xyes"], [AC_MSG_WARN([No function for aligned memory allocation found])])
+
 AX_EXT()
 
 AC_ARG_ENABLE([neon],
diff --git a/test/gf_unit.c b/test/gf_unit.c
index 98ff98c..db26849 100644
--- a/test/gf_unit.c
+++ b/test/gf_unit.c
@@ -8,6 +8,14 @@
  * Performs unit testing for gf arithmetic
  */
 
+#include "config.h"
+
+#ifdef HAVE_POSIX_MEMALIGN
+#ifndef _XOPEN_SOURCE
+#define _XOPEN_SOURCE 600
+#endif
+#endif
+
 #include <stdio.h>
 #include <getopt.h>
 #include <stdint.h>
@@ -82,6 +90,9 @@ int main(int argc, char **argv)
   uint32_t mask = 0;
   char *ra, *rb, *rc, *rd, *target;
   int align;
+#ifndef HAVE_POSIX_MEMALIGN
+  char *malloc_ra, *malloc_rb, *malloc_rc, *malloc_rd;
+#endif
 
 
   if (argc < 4) usage(NULL);
@@ -116,18 +127,26 @@ int main(int argc, char **argv)
   c = (gf_general_t *) malloc(sizeof(gf_general_t));
   d = (gf_general_t *) malloc(sizeof(gf_general_t));
 
+#if HAVE_POSIX_MEMALIGN
+  if (posix_memalign((void **) &ra, 16, sizeof(char)*REGION_SIZE))
+    ra = NULL;
+  if (posix_memalign((void **) &rb, 16, sizeof(char)*REGION_SIZE))
+    rb = NULL;
+  if (posix_memalign((void **) &rc, 16, sizeof(char)*REGION_SIZE))
+    rc = NULL;
+  if (posix_memalign((void **) &rd, 16, sizeof(char)*REGION_SIZE))
+    rd = NULL;
+#else
   //15 bytes extra to make sure it's 16byte aligned
-  ra = (char *) malloc(sizeof(char)*REGION_SIZE+15);
-  rb = (char *) malloc(sizeof(char)*REGION_SIZE+15);
-  rc = (char *) malloc(sizeof(char)*REGION_SIZE+15);
-  rd = (char *) malloc(sizeof(char)*REGION_SIZE+15);
-
-  //this still assumes 8 byte aligned pointer from malloc
-  //(which is usual on 32-bit machines)
-  ra += (uint64_t)ra & 0xf;
-  rb += (uint64_t)rb & 0xf;
-  rc += (uint64_t)rc & 0xf;
-  rd += (uint64_t)rd & 0xf;
+  malloc_ra = (char *) malloc(sizeof(char)*REGION_SIZE+15);
+  malloc_rb = (char *) malloc(sizeof(char)*REGION_SIZE+15);
+  malloc_rc = (char *) malloc(sizeof(char)*REGION_SIZE+15);
+  malloc_rd = (char *) malloc(sizeof(char)*REGION_SIZE+15);
+  ra = (uint8_t *) (((uintptr_t) malloc_ra + 15) & ~((uintptr_t) 0xf));
+  rb = (uint8_t *) (((uintptr_t) malloc_rb + 15) & ~((uintptr_t) 0xf));
+  rc = (uint8_t *) (((uintptr_t) malloc_rc + 15) & ~((uintptr_t) 0xf));
+  rd = (uint8_t *) (((uintptr_t) malloc_rd + 15) & ~((uintptr_t) 0xf));
+#endif
 
   if (w <= 32) {
     mask = 0;
@@ -423,10 +442,17 @@ int main(int argc, char **argv)
   free(b);
   free(c);
   free(d);
+#ifdef HAVE_POSIX_MEMALIGN
   free(ra);
   free(rb);
   free(rc);
   free(rd);
+#else
+  free(malloc_ra);
+  free(malloc_rb);
+  free(malloc_rc);
+  free(malloc_rd);
+#endif
   
   return 0;
 }
diff --git a/tools/gf_time.c b/tools/gf_time.c
index d17a7c2..7402ab5 100644
--- a/tools/gf_time.c
+++ b/tools/gf_time.c
@@ -8,6 +8,14 @@
  * Performs timing for gf arithmetic
  */
 
+#include "config.h"
+
+#ifdef HAVE_POSIX_MEMALIGN
+#ifndef _XOPEN_SOURCE
+#define _XOPEN_SOURCE 600
+#endif
+#endif
+
 #include <stdio.h>
 #include <getopt.h>
 #include <stdint.h>
@@ -95,6 +103,9 @@ int main(int argc, char **argv)
   time_t t0;
   uint8_t *ra, *rb;
   gf_general_t a;
+#ifndef HAVE_POSIX_MEMALIGN
+  uint8_t *malloc_ra, *malloc_rb;
+#endif
 
   
   if (argc < 6) usage(NULL);
@@ -155,8 +166,17 @@ int main(int argc, char **argv)
 
   printf("Seed: %ld\n", t0);
 
-  ra = (uint8_t *) malloc(size);
-  rb = (uint8_t *) malloc(size);
+#ifdef HAVE_POSIX_MEMALIGN
+  if (posix_memalign((void **) &ra, 16, size))
+    ra = NULL;
+  if (posix_memalign((void **) &rb, 16, size))
+    rb = NULL;
+#else
+  malloc_ra = (uint8_t *) malloc(size + 15);
+  malloc_rb = (uint8_t *) malloc(size + 15);
+  ra = (uint8_t *) (((uintptr_t) malloc_ra + 15) & ~((uintptr_t) 0xf));
+  rb = (uint8_t *) (((uintptr_t) malloc_rb + 15) & ~((uintptr_t) 0xf));
+#endif
 
   if (ra == NULL || rb == NULL) { perror("malloc"); exit(1); }
 
-- 
cgit v1.2.1


From 3a1be40ea87ecc81e737aee6819ff96a6721f011 Mon Sep 17 00:00:00 2001
From: Janne Grunau <j@jannau.net>
Date: Thu, 4 Sep 2014 10:47:10 +0200
Subject: arm: NEON optimisations for XOR in gf_multby_one

---
 src/gf.c | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/src/gf.c b/src/gf.c
index ca6a7f8..c3801e7 100644
--- a/src/gf.c
+++ b/src/gf.c
@@ -954,7 +954,42 @@ void gf_multby_one(void *src, void *dest, int bytes, int xor)
   }
   return;
 #endif
+#if defined(ARM_NEON)
+  s8 = (uint8_t *) src;
+  d8 = (uint8_t *) dest;
 
+  if (uls % 16 == uld % 16) {
+    gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16);
+    while (s8 != rd.s_start) {
+      *d8 ^= *s8;
+      s8++;
+      d8++;
+    }
+    while (s8 < (uint8_t *) rd.s_top) {
+      uint8x16_t vs = vld1q_u8 (s8);
+      uint8x16_t vd = vld1q_u8 (d8);
+      uint8x16_t vr = veorq_u8 (vs, vd);
+      vst1q_u8 (d8, vr);
+      s8 += 16;
+      d8 += 16;
+    }
+  } else {
+    while (s8 + 15 < (uint8_t *) src + bytes) {
+      uint8x16_t vs = vld1q_u8 (s8);
+      uint8x16_t vd = vld1q_u8 (d8);
+      uint8x16_t vr = veorq_u8 (vs, vd);
+      vst1q_u8 (d8, vr);
+      s8 += 16;
+      d8 += 16;
+    }
+  }
+  while (s8 < (uint8_t *) src + bytes) {
+    *d8 ^= *s8;
+    s8++;
+    d8++;
+  }
+  return;
+#endif
   if (uls % 8 != uld % 8) {
     gf_unaligned_xor(src, dest, bytes);
     return;
-- 
cgit v1.2.1


From 1311a44f7a27b38217a94e9d7a5dbe3ae3dde035 Mon Sep 17 00:00:00 2001
From: Janne Grunau <j@jannau.net>
Date: Wed, 17 Sep 2014 15:12:05 +0200
Subject: arm: NEON optimisations for gf_w4

Optimisations for the single table region multiplication and carry less
multiplication using NEON's polynomial multiplication of 8-bit values.

The single polynomial multiplication is not that useful but vector
version is for region multiplication.

Selected time_tool.sh results for a 1.7GHz cortex-a9:
Region Best (MB/s):   672.72   W-Method: 4 -m CARRY_FREE -
Region Best (MB/s):   265.84   W-Method: 4 -m BYTWO_p -
Region Best (MB/s):   329.41   W-Method: 4 -m TABLE -r DOUBLE -
Region Best (MB/s):   278.63   W-Method: 4 -m TABLE -r QUAD -
Region Best (MB/s):   329.81   W-Method: 4 -m TABLE -r QUAD -r LAZY -
Region Best (MB/s):  1318.03   W-Method: 4 -m TABLE -r SIMD -
Region Best (MB/s):   165.15   W-Method: 4 -m TABLE -r NOSIMD -
Region Best (MB/s):    99.73   W-Method: 4 -m LOG -
---
 include/gf_w4.h       |  63 +++++++++++++
 src/Makefile.am       |   7 ++
 src/gf_w4.c           |  68 ++++----------
 src/neon/gf_w4_neon.c | 247 ++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 335 insertions(+), 50 deletions(-)
 create mode 100644 include/gf_w4.h
 create mode 100644 src/neon/gf_w4_neon.c

diff --git a/include/gf_w4.h b/include/gf_w4.h
new file mode 100644
index 0000000..8ee94a3
--- /dev/null
+++ b/include/gf_w4.h
@@ -0,0 +1,63 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_w4.h
+ *
+ * Defines and data structures for 4-bit Galois fields
+ */
+
+#ifndef GF_COMPLETE_GF_W4_H
+#define GF_COMPLETE_GF_W4_H
+
+#include <stdint.h>
+
+#define GF_FIELD_WIDTH      4
+#define GF_DOUBLE_WIDTH     (GF_FIELD_WIDTH*2)
+#define GF_FIELD_SIZE       (1 << GF_FIELD_WIDTH)
+#define GF_MULT_GROUP_SIZE       (GF_FIELD_SIZE-1)
+
+/* ------------------------------------------------------------
+   JSP: Each implementation has its own data, which is allocated
+   at one time as part of the handle. For that reason, it
+   shouldn't be hierarchical -- i.e. one should be able to
+   allocate it with one call to malloc. */
+
+struct gf_logtable_data {
+    uint8_t      log_tbl[GF_FIELD_SIZE];
+    uint8_t      antilog_tbl[GF_FIELD_SIZE * 2];
+    uint8_t      *antilog_tbl_div;
+};
+
+struct gf_single_table_data {
+    uint8_t      mult[GF_FIELD_SIZE][GF_FIELD_SIZE];
+    uint8_t      div[GF_FIELD_SIZE][GF_FIELD_SIZE];
+};
+
+struct gf_double_table_data {
+    uint8_t      div[GF_FIELD_SIZE][GF_FIELD_SIZE];
+    uint8_t      mult[GF_FIELD_SIZE][GF_FIELD_SIZE*GF_FIELD_SIZE];
+};
+struct gf_quad_table_data {
+    uint8_t      div[GF_FIELD_SIZE][GF_FIELD_SIZE];
+    uint16_t     mult[GF_FIELD_SIZE][(1<<16)];
+};
+
+struct gf_quad_table_lazy_data {
+    uint8_t      div[GF_FIELD_SIZE][GF_FIELD_SIZE];
+    uint8_t      smult[GF_FIELD_SIZE][GF_FIELD_SIZE];
+    uint16_t     mult[(1 << 16)];
+};
+
+struct gf_bytwo_data {
+    uint64_t prim_poly;
+    uint64_t mask1;
+    uint64_t mask2;
+};
+
+// ARM NEON init functions
+int gf_w4_neon_cfm_init(gf_t *gf);
+void gf_w4_neon_single_table_init(gf_t *gf);
+
+#endif /* GF_COMPLETE_GF_W4_H */
diff --git a/src/Makefile.am b/src/Makefile.am
index 34633ea..5352d12 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -1,11 +1,18 @@
 # GF-Complete 'core' AM file
 # Creates the library
 
+AUTOMAKE_OPTIONS = subdir-objects
+
 AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include
 AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC
 
 lib_LTLIBRARIES = libgf_complete.la
 libgf_complete_la_SOURCES = gf.c gf_method.c gf_wgen.c gf_w4.c gf_w8.c gf_w16.c gf_w32.c \
           gf_w64.c gf_w128.c gf_rand.c gf_general.c
+
+if HAVE_NEON
+libgf_complete_la_SOURCES += neon/gf_w4_neon.c
+endif
+
 libgf_complete_la_LDFLAGS = -version-info 1:0:0
 
diff --git a/src/gf_w4.c b/src/gf_w4.c
index f098323..0e86aa8 100644
--- a/src/gf_w4.c
+++ b/src/gf_w4.c
@@ -11,49 +11,7 @@
 #include "gf_int.h"
 #include <stdio.h>
 #include <stdlib.h>
-
-#define GF_FIELD_WIDTH      4
-#define GF_DOUBLE_WIDTH     (GF_FIELD_WIDTH*2)
-#define GF_FIELD_SIZE       (1 << GF_FIELD_WIDTH)
-#define GF_MULT_GROUP_SIZE       (GF_FIELD_SIZE-1)
-
-/* ------------------------------------------------------------
-   JSP: Each implementation has its own data, which is allocated
-   at one time as part of the handle. For that reason, it 
-   shouldn't be hierarchical -- i.e. one should be able to 
-   allocate it with one call to malloc. */
-
-struct gf_logtable_data {
-    uint8_t      log_tbl[GF_FIELD_SIZE];
-    uint8_t      antilog_tbl[GF_FIELD_SIZE * 2];
-    uint8_t      *antilog_tbl_div;
-};
-
-struct gf_single_table_data {
-    uint8_t      mult[GF_FIELD_SIZE][GF_FIELD_SIZE];
-    uint8_t      div[GF_FIELD_SIZE][GF_FIELD_SIZE];
-};
-
-struct gf_double_table_data {
-    uint8_t      div[GF_FIELD_SIZE][GF_FIELD_SIZE];
-    uint8_t      mult[GF_FIELD_SIZE][GF_FIELD_SIZE*GF_FIELD_SIZE];
-};
-struct gf_quad_table_data {
-    uint8_t      div[GF_FIELD_SIZE][GF_FIELD_SIZE];
-    uint16_t     mult[GF_FIELD_SIZE][(1<<16)];
-};
-
-struct gf_quad_table_lazy_data {
-    uint8_t      div[GF_FIELD_SIZE][GF_FIELD_SIZE];
-    uint8_t      smult[GF_FIELD_SIZE][GF_FIELD_SIZE];
-    uint16_t     mult[(1 << 16)];
-};
-
-struct gf_bytwo_data {
-    uint64_t prim_poly;
-    uint64_t mask1;
-    uint64_t mask2;
-};
+#include "gf_w4.h"
 
 #define AB2(ip, am1 ,am2, b, t1, t2) {\
   t1 = (b << 1) & am1;\
@@ -489,11 +447,15 @@ int gf_w4_single_table_init(gf_t *gf)
   gf->inverse.w32 = NULL;
   gf->divide.w32 = gf_w4_single_table_divide;
   gf->multiply.w32 = gf_w4_single_table_multiply;
-  #ifdef INTEL_SSSE3
+  #if defined(INTEL_SSSE3) || defined(ARM_NEON)
     if(h->region_type & (GF_REGION_NOSIMD | GF_REGION_CAUCHY))
       gf->multiply_region.w32 = gf_w4_single_table_multiply_region;
     else
+    #if defined(INTEL_SSSE3)
       gf->multiply_region.w32 = gf_w4_single_table_sse_multiply_region;
+    #elif defined(ARM_NEON)
+      gf_w4_neon_single_table_init(gf);
+    #endif
   #else
     gf->multiply_region.w32 = gf_w4_single_table_multiply_region;
     if (h->region_type & GF_REGION_SIMD) return 0;
@@ -774,16 +736,16 @@ int gf_w4_table_init(gf_t *gf)
 {
   int rt;
   gf_internal_t *h;
-  int issse3 = 0;
+  int simd = 0;
 
-#ifdef INTEL_SSSE3
-  issse3 = 1;
+#if defined(INTEL_SSSE3) || defined(ARM_NEON)
+  simd = 1;
 #endif
 
   h = (gf_internal_t *) gf->scratch;
   rt = (h->region_type);
 
-  if (h->mult_type == GF_MULT_DEFAULT && !issse3) rt |= GF_REGION_DOUBLE_TABLE;
+  if (h->mult_type == GF_MULT_DEFAULT && !simd) rt |= GF_REGION_DOUBLE_TABLE;
 
   if (rt & GF_REGION_DOUBLE_TABLE) {
     return gf_w4_double_table_init(gf);
@@ -1937,6 +1899,8 @@ int gf_w4_cfm_init(gf_t *gf)
 #if defined(INTEL_SSE4_PCLMUL)
   gf->multiply.w32 = gf_w4_clm_multiply;
   return 1;
+#elif defined(ARM_NEON)
+  return gf_w4_neon_cfm_init(gf);
 #endif
   return 0;
 }
@@ -1953,11 +1917,14 @@ int gf_w4_shift_init(gf_t *gf)
 
 int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
 {
-  int issse3 = 0;
+  int issse3 = 0, isneon = 0;
 
 #ifdef INTEL_SSSE3
   issse3 = 1;
 #endif
+#ifdef ARM_NEON
+  isneon = 1;
+#endif
 
   switch(mult_type)
   {
@@ -1971,7 +1938,8 @@ int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1
         return sizeof(gf_internal_t) + sizeof(struct gf_single_table_data) + 64;
       }
 
-      if (mult_type == GF_MULT_DEFAULT && !issse3) region_type = GF_REGION_DOUBLE_TABLE;
+      if (mult_type == GF_MULT_DEFAULT && !(issse3 || isneon))
+          region_type = GF_REGION_DOUBLE_TABLE;
 
       if (region_type & GF_REGION_DOUBLE_TABLE) {
         return sizeof(gf_internal_t) + sizeof(struct gf_double_table_data) + 64;
diff --git a/src/neon/gf_w4_neon.c b/src/neon/gf_w4_neon.c
new file mode 100644
index 0000000..3a21432
--- /dev/null
+++ b/src/neon/gf_w4_neon.c
@@ -0,0 +1,247 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * Copyright (c) 2014: Janne Grunau <j@jannau.net>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ *  - Neither the name of the University of Tennessee nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * gf_w4_neon.c
+ *
+ * Neon routines for 4-bit Galois fields
+ *
+ */
+
+#include "gf_int.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "gf_w4.h"
+
+static
+gf_val_32_t
+gf_w4_neon_clm_multiply (gf_t *gf, gf_val_32_t a4, gf_val_32_t b4)
+{
+  gf_val_32_t rv = 0;
+  poly8x8_t       result, prim_poly;
+  poly8x8_t       a, b, w;
+  uint8x8_t       v;
+  gf_internal_t * h = gf->scratch;
+
+  a =  vdup_n_p8 (a4);
+  b =  vdup_n_p8 (b4);
+
+  prim_poly = vdup_n_p8 ((uint32_t)(h->prim_poly & 0x1fULL));
+
+  /* Do the initial multiply */
+  result = vmul_p8 (a, b);
+  v = vshr_n_u8 (vreinterpret_u8_p8(result), 4);
+  w = vmul_p8 (prim_poly, vreinterpret_p8_u8(v));
+  result = vreinterpret_p8_u8 (veor_u8 (vreinterpret_u8_p8(result), vreinterpret_u8_p8(w)));
+
+  /* Extracts 32 bit value from result. */
+  rv = (gf_val_32_t)vget_lane_u8 (vreinterpret_u8_p8 (result), 0);
+
+  return rv;
+}
+
+static inline void
+neon_clm_multiply_region_from_single (gf_t *gf, uint8_t *s8, uint8_t *d8,
+                                      gf_val_32_t val, uint8_t *d_end, int xor)
+{
+  gf_internal_t * h = gf->scratch;
+  poly8x8_t       prim_poly;
+  poly8x8_t       a, w, even, odd;
+  uint8x8_t       b, c, v, mask;
+
+  a         = vdup_n_p8 (val);
+  mask      = vdup_n_u8 (0xf);
+  prim_poly = vdup_n_p8 ((uint8_t)(h->prim_poly & 0x1fULL));
+
+  while (d8 < d_end) {
+    b = vld1_u8 (s8);
+
+    even = vreinterpret_p8_u8 (vand_u8 (b, mask));
+    odd  = vreinterpret_p8_u8 (vshr_n_u8 (b, 4));
+
+    if (xor)
+        c = vld1_u8 (d8);
+
+    even = vmul_p8 (a, even);
+    odd  = vmul_p8 (a, odd);
+
+    v = vshr_n_u8 (vreinterpret_u8_p8(even), 4);
+    w = vmul_p8 (prim_poly, vreinterpret_p8_u8(v));
+    even = vreinterpret_p8_u8 (veor_u8 (vreinterpret_u8_p8(even), vreinterpret_u8_p8(w)));
+
+    v = vshr_n_u8 (vreinterpret_u8_p8(odd), 4);
+    w = vmul_p8 (prim_poly, vreinterpret_p8_u8(v));
+    odd = vreinterpret_p8_u8 (veor_u8 (vreinterpret_u8_p8(odd), vreinterpret_u8_p8(w)));
+
+    v = veor_u8 (vreinterpret_u8_p8 (even), vshl_n_u8 (vreinterpret_u8_p8 (odd), 4));
+
+    if (xor)
+      v = veor_u8 (c, v);
+
+    vst1_u8 (d8, v);
+
+    d8 += 8;
+    s8 += 8;
+  }
+}
+
+
+static void
+gf_w4_neon_clm_multiply_region_from_single (gf_t *gf, void *src, void *dest,
+                                            gf_val_32_t val, int bytes, int xor)
+{
+  gf_region_data rd;
+  uint8_t *s8;
+  uint8_t *d8;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+  gf_do_initial_region_alignment(&rd);
+
+  s8 = (uint8_t *) rd.s_start;
+  d8 = (uint8_t *) rd.d_start;
+
+  if (xor)
+    neon_clm_multiply_region_from_single (gf, s8, d8, val, rd.d_top, 1);
+  else
+    neon_clm_multiply_region_from_single (gf, s8, d8, val, rd.d_top, 0);
+
+  gf_do_final_region_alignment(&rd);
+}
+
+#ifndef ARCH_AARCH64
+#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)),   \
+                                       vtbl2_u8(tbl, vget_high_u8(v)))
+#endif
+
+static
+inline
+void
+w4_single_table_multiply_region_neon(gf_t *gf, uint8_t *src, uint8_t *dst,
+                                     uint8_t * d_end, gf_val_32_t val, int xor)
+{
+  struct gf_single_table_data *std;
+  uint8_t *base;
+  uint8x16_t r, va, vh, vl, loset;
+
+#ifdef ARCH_AARCH64
+  uint8x16_t th, tl;
+#else
+  uint8x8x2_t th, tl;
+#endif
+
+  std = (struct gf_single_table_data *) ((gf_internal_t *) (gf->scratch))->private;
+  base = (uint8_t *) std->mult;
+  base += (val << GF_FIELD_WIDTH);
+
+#ifdef ARCH_AARCH64
+  tl = vld1q_u8 (base);
+  th = vshlq_n_u8 (tl, 4);
+#else
+  tl.val[0] = vld1_u8 (base);
+  tl.val[1] = vld1_u8 (base + 8);
+  th.val[0] =  vshl_n_u8 (tl.val[0], 4);
+  th.val[1] =  vshl_n_u8 (tl.val[1], 4);
+#endif
+
+  loset = vdupq_n_u8(0xf);
+
+  while (dst < d_end) {
+      va = vld1q_u8 (src);
+
+      vh = vshrq_n_u8 (va, 4);
+      vl = vandq_u8 (va, loset);
+
+      if (xor)
+        va = vld1q_u8 (dst);
+
+      vh = vqtbl1q_u8 (th, vh);
+      vl = vqtbl1q_u8 (tl, vl);
+
+      r = veorq_u8 (vh, vl);
+
+      if (xor)
+        r = veorq_u8 (va, r);
+
+      vst1q_u8 (dst, r);
+
+    dst += 16;
+    src += 16;
+  }
+}
+
+static
+void
+gf_w4_single_table_multiply_region_neon(gf_t *gf, void *src, void *dest,
+                                        gf_val_32_t val, int bytes, int xor)
+{
+  gf_region_data rd;
+  uint8_t *sptr, *dptr, *top;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+  gf_do_initial_region_alignment(&rd);
+
+  sptr = rd.s_start;
+  dptr = rd.d_start;
+  top  = rd.d_top;
+
+  if (xor)
+      w4_single_table_multiply_region_neon(gf, sptr, dptr, top, val, 1);
+  else
+      w4_single_table_multiply_region_neon(gf, sptr, dptr, top, val, 0);
+
+  gf_do_final_region_alignment(&rd);
+
+}
+
+
+int gf_w4_neon_cfm_init(gf_t *gf)
+{
+  // single clm multiplication probably pointless
+  gf->multiply.w32 = gf_w4_neon_clm_multiply;
+  gf->multiply_region.w32 = gf_w4_neon_clm_multiply_region_from_single;
+
+  return 1;
+}
+
+void gf_w4_neon_single_table_init(gf_t *gf)
+{
+  gf->multiply_region.w32 = gf_w4_single_table_multiply_region_neon;
+}
-- 
cgit v1.2.1


From bec15359de5273d06673c43b8e73c70f97396041 Mon Sep 17 00:00:00 2001
From: Janne Grunau <j@jannau.net>
Date: Wed, 3 Sep 2014 16:57:06 +0200
Subject: arm: NEON optimisations for gf_w8

Optimisations for the 4,4 split table region multiplication and carry
less multiplication using NEON's polynomial long multiplication.
arm: w8: NEON carry less multiplication

Selected time_tool.sh results for a 1.7GHz cortex-a9:
Region Best (MB/s):   375.86   W-Method: 8 -m CARRY_FREE -
Region Best (MB/s):   142.94   W-Method: 8 -m TABLE -
Region Best (MB/s):   225.01   W-Method: 8 -m TABLE -r DOUBLE -
Region Best (MB/s):   211.23   W-Method: 8 -m TABLE -r DOUBLE -r LAZY -
Region Best (MB/s):   160.09   W-Method: 8 -m LOG -
Region Best (MB/s):   123.61   W-Method: 8 -m LOG_ZERO -
Region Best (MB/s):   123.85   W-Method: 8 -m LOG_ZERO_EXT -
Region Best (MB/s):  1183.79   W-Method: 8 -m SPLIT 8 4 -r SIMD -
Region Best (MB/s):   177.68   W-Method: 8 -m SPLIT 8 4 -r NOSIMD -
Region Best (MB/s):    87.85   W-Method: 8 -m COMPOSITE 2 - -
Region Best (MB/s):   428.59   W-Method: 8 -m COMPOSITE 2 - -r ALTMAP -
---
 include/gf_w8.h       |  99 +++++++++++++++++
 src/Makefile.am       |   3 +-
 src/gf.c              |   5 +
 src/gf_w8.c           | 108 ++++--------------
 src/neon/gf_w8_neon.c | 302 ++++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 428 insertions(+), 89 deletions(-)
 create mode 100644 include/gf_w8.h
 create mode 100644 src/neon/gf_w8_neon.c

diff --git a/include/gf_w8.h b/include/gf_w8.h
new file mode 100644
index 0000000..938fcfd
--- /dev/null
+++ b/include/gf_w8.h
@@ -0,0 +1,99 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_w8.c
+ *
+ * Defines and data stuctures for 8-bit Galois fields
+ */
+
+#ifndef GF_COMPLETE_GF_W8_H
+#define GF_COMPLETE_GF_W8_H
+
+#include "gf_int.h"
+#include <stdint.h>
+
+#define GF_FIELD_WIDTH (8)
+#define GF_FIELD_SIZE       (1 << GF_FIELD_WIDTH)
+#define GF_HALF_SIZE       (1 << (GF_FIELD_WIDTH/2))
+#define GF_MULT_GROUP_SIZE       GF_FIELD_SIZE-1
+
+#define GF_BASE_FIELD_WIDTH (4)
+#define GF_BASE_FIELD_SIZE       (1 << GF_BASE_FIELD_WIDTH)
+
+struct gf_w8_logtable_data {
+    uint8_t         log_tbl[GF_FIELD_SIZE];
+    uint8_t         antilog_tbl[GF_FIELD_SIZE * 2];
+    uint8_t         inv_tbl[GF_FIELD_SIZE];
+};
+
+struct gf_w8_logzero_table_data {
+    short           log_tbl[GF_FIELD_SIZE];  /* Make this signed, so that we can divide easily */
+    uint8_t         antilog_tbl[512+512+1];
+    uint8_t         *div_tbl;
+    uint8_t         *inv_tbl;
+};
+
+struct gf_w8_logzero_small_table_data {
+    short           log_tbl[GF_FIELD_SIZE];  /* Make this signed, so that we can divide easily */
+    uint8_t         antilog_tbl[255*3];
+    uint8_t         inv_tbl[GF_FIELD_SIZE];
+    uint8_t         *div_tbl;
+};
+
+struct gf_w8_composite_data {
+  uint8_t *mult_table;
+};
+
+/* Don't change the order of these relative to gf_w8_half_table_data */
+
+struct gf_w8_default_data {
+  uint8_t     high[GF_FIELD_SIZE][GF_HALF_SIZE];
+  uint8_t     low[GF_FIELD_SIZE][GF_HALF_SIZE];
+  uint8_t     divtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
+  uint8_t     multtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
+};
+
+struct gf_w8_half_table_data {
+  uint8_t     high[GF_FIELD_SIZE][GF_HALF_SIZE];
+  uint8_t     low[GF_FIELD_SIZE][GF_HALF_SIZE];
+};
+
+struct gf_w8_single_table_data {
+  uint8_t     divtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
+  uint8_t     multtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
+};
+
+struct gf_w8_double_table_data {
+    uint8_t         div[GF_FIELD_SIZE][GF_FIELD_SIZE];
+    uint16_t        mult[GF_FIELD_SIZE][GF_FIELD_SIZE*GF_FIELD_SIZE];
+};
+
+struct gf_w8_double_table_lazy_data {
+    uint8_t         div[GF_FIELD_SIZE][GF_FIELD_SIZE];
+    uint8_t         smult[GF_FIELD_SIZE][GF_FIELD_SIZE];
+    uint16_t        mult[GF_FIELD_SIZE*GF_FIELD_SIZE];
+};
+
+struct gf_w4_logtable_data {
+    uint8_t         log_tbl[GF_BASE_FIELD_SIZE];
+    uint8_t         antilog_tbl[GF_BASE_FIELD_SIZE * 2];
+    uint8_t         *antilog_tbl_div;
+};
+
+struct gf_w4_single_table_data {
+    uint8_t         div[GF_BASE_FIELD_SIZE][GF_BASE_FIELD_SIZE];
+    uint8_t         mult[GF_BASE_FIELD_SIZE][GF_BASE_FIELD_SIZE];
+};
+
+struct gf_w8_bytwo_data {
+    uint64_t prim_poly;
+    uint64_t mask1;
+    uint64_t mask2;
+};
+
+int gf_w8_neon_cfm_init(gf_t *gf);
+void gf_w8_neon_split_init(gf_t *gf);
+
+#endif /* GF_COMPLETE_GF_W8_H */
diff --git a/src/Makefile.am b/src/Makefile.am
index 5352d12..3e568d9 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -11,7 +11,8 @@ libgf_complete_la_SOURCES = gf.c gf_method.c gf_wgen.c gf_w4.c gf_w8.c gf_w16.c
           gf_w64.c gf_w128.c gf_rand.c gf_general.c
 
 if HAVE_NEON
-libgf_complete_la_SOURCES += neon/gf_w4_neon.c
+libgf_complete_la_SOURCES += neon/gf_w4_neon.c  \
+                             neon/gf_w8_neon.c
 endif
 
 libgf_complete_la_LDFLAGS = -version-info 1:0:0
diff --git a/src/gf.c b/src/gf.c
index c3801e7..6d34c46 100644
--- a/src/gf.c
+++ b/src/gf.c
@@ -217,6 +217,11 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
   pclmul = 1;
 #endif
 
+#ifdef ARM_NEON
+  pclmul = 1;
+  sse3 = 1;
+#endif
+
 
   if (w < 1 || (w > 32 && w != 64 && w != 128)) { _gf_errno = GF_E_BAD___W; return 0; }
     
diff --git a/src/gf_w8.c b/src/gf_w8.c
index bc4f5d1..8449298 100644
--- a/src/gf_w8.c
+++ b/src/gf_w8.c
@@ -9,88 +9,10 @@
  */
 
 #include "gf_int.h"
+#include "gf_w8.h"
 #include <stdio.h>
 #include <stdlib.h>
 
-#define GF_FIELD_WIDTH (8)
-#define GF_FIELD_SIZE       (1 << GF_FIELD_WIDTH)
-#define GF_HALF_SIZE       (1 << (GF_FIELD_WIDTH/2))
-#define GF_MULT_GROUP_SIZE       GF_FIELD_SIZE-1
-
-#define GF_BASE_FIELD_WIDTH (4)
-#define GF_BASE_FIELD_SIZE       (1 << GF_BASE_FIELD_WIDTH)
-
-struct gf_w8_logtable_data {
-    uint8_t         log_tbl[GF_FIELD_SIZE];
-    uint8_t         antilog_tbl[GF_FIELD_SIZE * 2];
-    uint8_t         inv_tbl[GF_FIELD_SIZE];
-};
-
-struct gf_w8_logzero_table_data {
-    short           log_tbl[GF_FIELD_SIZE];  /* Make this signed, so that we can divide easily */
-    uint8_t         antilog_tbl[512+512+1];
-    uint8_t         *div_tbl;
-    uint8_t         *inv_tbl;
-};
-
-struct gf_w8_logzero_small_table_data {
-    short           log_tbl[GF_FIELD_SIZE];  /* Make this signed, so that we can divide easily */
-    uint8_t         antilog_tbl[255*3];
-    uint8_t         inv_tbl[GF_FIELD_SIZE];
-    uint8_t         *div_tbl;
-};
-
-struct gf_w8_composite_data {
-  uint8_t *mult_table;
-};
-
-/* Don't change the order of these relative to gf_w8_half_table_data */
-
-struct gf_w8_default_data {
-  uint8_t     high[GF_FIELD_SIZE][GF_HALF_SIZE];
-  uint8_t     low[GF_FIELD_SIZE][GF_HALF_SIZE];
-  uint8_t     divtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
-  uint8_t     multtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
-};
-
-struct gf_w8_half_table_data {
-  uint8_t     high[GF_FIELD_SIZE][GF_HALF_SIZE];
-  uint8_t     low[GF_FIELD_SIZE][GF_HALF_SIZE];
-};
-
-struct gf_w8_single_table_data {
-  uint8_t     divtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
-  uint8_t     multtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
-};
-
-struct gf_w8_double_table_data {
-    uint8_t         div[GF_FIELD_SIZE][GF_FIELD_SIZE];
-    uint16_t        mult[GF_FIELD_SIZE][GF_FIELD_SIZE*GF_FIELD_SIZE];
-};
-
-struct gf_w8_double_table_lazy_data {
-    uint8_t         div[GF_FIELD_SIZE][GF_FIELD_SIZE];
-    uint8_t         smult[GF_FIELD_SIZE][GF_FIELD_SIZE];
-    uint16_t        mult[GF_FIELD_SIZE*GF_FIELD_SIZE];
-};
-
-struct gf_w4_logtable_data {
-    uint8_t         log_tbl[GF_BASE_FIELD_SIZE];
-    uint8_t         antilog_tbl[GF_BASE_FIELD_SIZE * 2];
-    uint8_t         *antilog_tbl_div;
-};
-
-struct gf_w4_single_table_data {
-    uint8_t         div[GF_BASE_FIELD_SIZE][GF_BASE_FIELD_SIZE];
-    uint8_t         mult[GF_BASE_FIELD_SIZE][GF_BASE_FIELD_SIZE];
-};
-
-struct gf_w8_bytwo_data {
-    uint64_t prim_poly;
-    uint64_t mask1;
-    uint64_t mask2;
-};
-
 #define AB2(ip, am1 ,am2, b, t1, t2) {\
   t1 = (b << 1) & am1;\
   t2 = b & am2; \
@@ -603,6 +525,8 @@ int gf_w8_cfm_init(gf_t *gf)
       return 0;
     }
   return 1;
+#elif defined(ARM_NEON)
+  return gf_w8_neon_cfm_init(gf);
 #endif
 
   return 0;
@@ -938,7 +862,7 @@ gf_w8_default_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
   return (ftd->multtable[a][b]);
 }
 
-#ifdef INTEL_SSSE3
+#if defined(INTEL_SSSE3) || defined(ARM_NEON)
 static
   gf_val_32_t
 gf_w8_default_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
@@ -1179,11 +1103,15 @@ int gf_w8_split_init(gf_t *gf)
 
   gf->multiply.w32 = gf_w8_split_multiply;
   
-  #ifdef INTEL_SSSE3
+  #if defined(INTEL_SSSE3) || defined(ARM_NEON)
     if (h->region_type & GF_REGION_NOSIMD)
       gf->multiply_region.w32 = gf_w8_split_multiply_region;
     else
+    #if defined(INTEL_SSSE3)
       gf->multiply_region.w32 = gf_w8_split_multiply_region_sse;
+    #elif defined(ARM_NEON)
+      gf_w8_neon_split_init(gf);
+    #endif
   #else
     gf->multiply_region.w32 = gf_w8_split_multiply_region;
     if(h->region_type & GF_REGION_SIMD)
@@ -1205,17 +1133,17 @@ int gf_w8_table_init(gf_t *gf)
   struct gf_w8_double_table_data *dtd = NULL;
   struct gf_w8_double_table_lazy_data *ltd = NULL;
   struct gf_w8_default_data *dd = NULL;
-  int a, b, c, prod, scase, issse;
+  int a, b, c, prod, scase, use_simd;
 
   h = (gf_internal_t *) gf->scratch;
 
-#ifdef INTEL_SSSE3
-  issse = 1;
+#if defined(INTEL_SSSE3) || defined(ARM_NEON)
+  use_simd = 1;
 #else
-  issse = 0;
+  use_simd = 0;
 #endif
 
-  if (h->mult_type == GF_MULT_DEFAULT && issse) {
+  if (h->mult_type == GF_MULT_DEFAULT && use_simd) {
     dd = (struct gf_w8_default_data *)h->private;
     scase = 3;
     bzero(dd->high, sizeof(uint8_t) * GF_FIELD_SIZE * GF_HALF_SIZE);
@@ -1290,10 +1218,14 @@ int gf_w8_table_init(gf_t *gf)
       gf->multiply_region.w32 = gf_w8_double_table_multiply_region;
       break;
     case 3:
-#ifdef INTEL_SSSE3
+#if defined(INTEL_SSSE3) || defined(ARM_NEON)
       gf->divide.w32 = gf_w8_default_divide;
       gf->multiply.w32 = gf_w8_default_multiply;
+#if defined(INTEL_SSSE3)
       gf->multiply_region.w32 = gf_w8_split_multiply_region_sse;
+#elif defined(ARM_NEON)
+      gf_w8_neon_split_init(gf);
+#endif
 #endif
       break;
   }
@@ -2296,7 +2228,7 @@ int gf_w8_scratch_size(int mult_type, int region_type, int divide_type, int arg1
   switch(mult_type)
   {
     case GF_MULT_DEFAULT:
-#ifdef INTEL_SSSE3
+#if defined(INTEL_SSSE3) || defined(ARM_NEON)
       return sizeof(gf_internal_t) + sizeof(struct gf_w8_default_data) + 64;
 #endif
       return sizeof(gf_internal_t) + sizeof(struct gf_w8_single_table_data) + 64;
diff --git a/src/neon/gf_w8_neon.c b/src/neon/gf_w8_neon.c
new file mode 100644
index 0000000..930a916
--- /dev/null
+++ b/src/neon/gf_w8_neon.c
@@ -0,0 +1,302 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * Copyright (c) 2014: Janne Grunau <j@jannau.net>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ *  - Neither the name of the University of Tennessee nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * gf_w8_neon.c
+ *
+ * Neon optimized routines for 8-bit Galois fields
+ *
+ */
+
+#include "gf_int.h"
+#include "gf_w8.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+/* ARM NEON reducing macro for the carry free multiplication
+ *   vmull_p8 is the carryless multiply operation. Here vshrn_n_u16 shifts
+ *   the result to the right by 1 byte. This allows us to multiply
+ *   the prim_poly by the leading bits of the result. We then xor the result
+ *   of that operation back with the result. */
+#define NEON_CFM_REDUCE(v, w, result, prim_poly, initial)               \
+  do {								        \
+    if (initial)                                                        \
+      v = vshrn_n_u16 (vreinterpretq_u16_p16(result), 8);               \
+    else                                                                \
+      v = veor_u8 (v, vshrn_n_u16 (vreinterpretq_u16_p16(result), 8));  \
+    w = vmull_p8 (prim_poly, vreinterpret_p8_u8(v));                    \
+    result = vreinterpretq_p16_u16 (veorq_u16 (vreinterpretq_u16_p16(result), vreinterpretq_u16_p16(w))); \
+  } while (0)
+
+static
+inline
+gf_val_32_t
+gf_w8_neon_clm_multiply_x (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8, int x)
+{
+  gf_val_32_t rv = 0;
+  poly8x8_t       a, b;
+  uint8x8_t       v;
+  poly16x8_t      result;
+  poly8x8_t       prim_poly;
+  poly16x8_t      w;
+  gf_internal_t * h = gf->scratch;
+
+  a =  vdup_n_p8 (a8);
+  b =  vdup_n_p8 (b8);
+
+  prim_poly = vdup_n_p8 ((uint32_t)(h->prim_poly & 0x1ffULL));
+
+  /* Do the initial multiply */
+  result = vmull_p8 (a, b);
+
+  /* Ben: Do prim_poly reduction twice. We are guaranteed that we will only
+     have to do the reduction at most twice, because (w-2)/z == 2. Where
+     z is equal to the number of zeros after the leading 1 */
+  NEON_CFM_REDUCE (v, w, result, prim_poly, 1);
+  NEON_CFM_REDUCE (v, w, result, prim_poly, 0);
+  if (x >= 3) {
+    NEON_CFM_REDUCE (v, w, result, prim_poly, 0);
+  }
+  if (x >= 4) {
+    NEON_CFM_REDUCE (v, w, result, prim_poly, 0);
+  }
+  /* Extracts 32 bit value from result. */
+  rv = (gf_val_32_t)vget_lane_u8 (vmovn_u16 (vreinterpretq_u16_p16 (result)), 0);
+
+  return rv;
+}
+
+#define CLM_MULTIPLY(x) \
+static gf_val_32_t gf_w8_neon_clm_multiply_ ## x (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8) \
+{\
+    return gf_w8_neon_clm_multiply_x (gf, a8, b8, x);\
+}
+
+CLM_MULTIPLY(2)
+CLM_MULTIPLY(3)
+CLM_MULTIPLY(4)
+
+static inline void
+neon_clm_multiply_region_from_single_x(gf_t *gf, uint8_t *s8, uint8_t *d8,
+                                       gf_val_32_t val, uint8_t *d_end,
+                                       int xor, int x)
+{
+  gf_internal_t * h = gf->scratch;
+  poly8x8_t       a, b;
+  uint8x8_t       c, v;
+  poly16x8_t      result;
+  poly8x8_t       prim_poly;
+  poly16x8_t      w;
+
+  a         = vdup_n_p8 (val);
+  prim_poly = vdup_n_p8 ((uint8_t)(h->prim_poly & 0xffULL));
+
+  while (d8 < d_end) {
+    b = vld1_p8 ((poly8_t *) s8);
+
+    if (xor)
+        c = vld1_u8 (d8);
+
+    result = vmull_p8 (a, b);
+
+    NEON_CFM_REDUCE(v, w, result, prim_poly, 1);
+    NEON_CFM_REDUCE (v, w, result, prim_poly, 0);
+    if (x >= 3) {
+      NEON_CFM_REDUCE (v, w, result, prim_poly, 0);
+    }
+    if (x >= 4) {
+      NEON_CFM_REDUCE (v, w, result, prim_poly, 0);
+    }
+    v = vmovn_u16 (vreinterpretq_u16_p16 (result));
+    if (xor)
+      v = veor_u8 (c, v);
+
+    vst1_u8 (d8, v);
+
+    d8 += 8;
+    s8 += 8;
+  }
+}
+
+#define CLM_MULT_REGION(x)                                              \
+static void                                                             \
+gf_w8_neon_clm_multiply_region_from_single_ ## x (gf_t *gf, void *src,  \
+                                                  void *dest,           \
+                                                  gf_val_32_t val, int bytes, \
+                                                  int xor)              \
+{                                                                       \
+  gf_region_data rd;                                                    \
+  uint8_t *s8;                                                          \
+  uint8_t *d8;                                                          \
+                                                                        \
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }           \
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }       \
+                                                                        \
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);          \
+  gf_do_initial_region_alignment(&rd);                                  \
+  s8 = (uint8_t *) rd.s_start;                                          \
+  d8 = (uint8_t *) rd.d_start;                                          \
+                                                                        \
+  if (xor)                                                              \
+    neon_clm_multiply_region_from_single_x (gf, s8, d8, val, rd.d_top, 1, x); \
+  else                                                                  \
+    neon_clm_multiply_region_from_single_x (gf, s8, d8, val, rd.d_top, 0, x);\
+  gf_do_final_region_alignment(&rd);                                    \
+}
+
+CLM_MULT_REGION(2)
+CLM_MULT_REGION(3)
+CLM_MULT_REGION(4)
+
+
+int gf_w8_neon_cfm_init(gf_t *gf)
+{
+  gf_internal_t *h;
+
+  h = (gf_internal_t *) gf->scratch;
+
+  if ((0xe0 & h->prim_poly) == 0){
+    gf->multiply.w32 = gf_w8_neon_clm_multiply_2;
+    gf->multiply_region.w32 = gf_w8_neon_clm_multiply_region_from_single_2;
+  }else if ((0xc0 & h->prim_poly) == 0){
+    gf->multiply.w32 = gf_w8_neon_clm_multiply_3;
+    gf->multiply_region.w32 = gf_w8_neon_clm_multiply_region_from_single_3;
+  }else if ((0x80 & h->prim_poly) == 0){
+    gf->multiply.w32 = gf_w8_neon_clm_multiply_4;
+    gf->multiply_region.w32 = gf_w8_neon_clm_multiply_region_from_single_4;
+  }else{
+    return 0;
+  }
+  return 1;
+}
+
+#ifndef ARCH_AARCH64
+#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)),   \
+                                       vtbl2_u8(tbl, vget_high_u8(v)))
+#endif
+
+static
+void
+gf_w8_split_multiply_region_neon(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  uint8_t *bh, *bl, *sptr, *dptr;
+  uint8x16_t r, va, vh, vl, loset;
+#ifdef ARCH_AARCH64
+  uint8x16_t mth, mtl;
+#else
+  uint8x8x2_t mth, mtl;
+#endif
+  struct gf_w8_half_table_data *htd;
+  gf_region_data rd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  htd = (struct gf_w8_half_table_data *) ((gf_internal_t *) (gf->scratch))->private;
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+  gf_do_initial_region_alignment(&rd);
+
+  bh = (uint8_t *) htd->high;
+  bh += (val << 4);
+  bl = (uint8_t *) htd->low;
+  bl += (val << 4);
+
+  sptr = rd.s_start;
+  dptr = rd.d_start;
+
+#ifdef ARCH_AARCH64
+  mth = vld1q_u8 (bh);
+  mtl = vld1q_u8 (bl);
+#else
+  mth.val[0] = vld1_u8 (bh);
+  mtl.val[0] = vld1_u8 (bl);
+  mth.val[1] = vld1_u8 (bh + 8);
+  mtl.val[1] = vld1_u8 (bl + 8);
+#endif
+
+  loset = vdupq_n_u8(0xf);
+
+  if (xor) {
+    while (sptr < (uint8_t *) rd.s_top) {
+      va = vld1q_u8 (sptr);
+
+      vh = vshrq_n_u8 (va, 4);
+      vl = vandq_u8 (va, loset);
+      va = vld1q_u8 (dptr);
+
+      vh = vqtbl1q_u8 (mth, vh);
+      vl = vqtbl1q_u8 (mtl, vl);
+
+      r = veorq_u8 (vh, vl);
+
+      vst1q_u8 (dptr, veorq_u8 (va, r));
+
+      dptr += 16;
+      sptr += 16;
+    }
+  } else {
+    while (sptr < (uint8_t *) rd.s_top) {
+      va = vld1q_u8 (sptr);
+
+      vh = vshrq_n_u8 (va, 4);
+      vl = vandq_u8 (va, loset);
+#ifdef ARCH_AARCH64
+      vh = vqtbl1q_u8 (mth, vh);
+      vl = vqtbl1q_u8 (mtl, vl);
+#else
+      vh = vcombine_u8 (vtbl2_u8 (mth, vget_low_u8 (vh)),
+			vtbl2_u8 (mth, vget_high_u8 (vh)));
+      vl = vcombine_u8 (vtbl2_u8 (mtl, vget_low_u8 (vl)),
+			vtbl2_u8 (mtl, vget_high_u8 (vl)));
+#endif
+
+      r = veorq_u8 (vh, vl);
+
+      vst1q_u8(dptr, r);
+
+      dptr += 16;
+      sptr += 16;
+    }
+  }
+
+  gf_do_final_region_alignment(&rd);
+}
+
+
+void gf_w8_neon_split_init(gf_t *gf)
+{
+  gf->multiply_region.w32 = gf_w8_split_multiply_region_neon;
+}
-- 
cgit v1.2.1


From 474010a91d35fef5ca7dea77205b6a5c7e68c3e9 Mon Sep 17 00:00:00 2001
From: Janne Grunau <j@jannau.net>
Date: Wed, 17 Sep 2014 16:10:25 +0200
Subject: arm: NEON optimisations for gf_w16

Optimisations for the 4,16 split table region multiplications.

Selected time_tool.sh 16 -A -B results for a 1.7 GHz cortex-a9:
Region Best (MB/s):   532.14   W-Method: 16 -m SPLIT 16 4 -r SIMD -
Region Best (MB/s):   212.34   W-Method: 16 -m SPLIT 16 4 -r NOSIMD -
Region Best (MB/s):   801.36   W-Method: 16 -m SPLIT 16 4 -r SIMD -r ALTMAP -
Region Best (MB/s):    93.20   W-Method: 16 -m SPLIT 16 4 -r NOSIMD -r ALTMAP -
Region Best (MB/s):   273.99   W-Method: 16 -m SPLIT 16 8 -
Region Best (MB/s):   270.81   W-Method: 16 -m SPLIT 8 8 -
Region Best (MB/s):    70.42   W-Method: 16 -m COMPOSITE 2 - -
Region Best (MB/s):   393.54   W-Method: 16 -m COMPOSITE 2 - -r ALTMAP -
---
 include/gf_w16.h       |  66 +++++++++
 src/Makefile.am        |   3 +-
 src/gf_w16.c           |  61 ++-------
 src/neon/gf_w16_neon.c | 356 +++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 435 insertions(+), 51 deletions(-)
 create mode 100644 include/gf_w16.h
 create mode 100644 src/neon/gf_w16_neon.c

diff --git a/include/gf_w16.h b/include/gf_w16.h
new file mode 100644
index 0000000..fb4c0e9
--- /dev/null
+++ b/include/gf_w16.h
@@ -0,0 +1,66 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_w16.h
+ *
+ * Defines and data structures for 16-bit Galois fields
+ */
+
+#ifndef GF_COMPLETE_GF_W16_H
+#define GF_COMPLETE_GF_W16_H
+
+#include <stdint.h>
+
+#define GF_FIELD_WIDTH (16)
+#define GF_FIELD_SIZE (1 << GF_FIELD_WIDTH)
+#define GF_MULT_GROUP_SIZE GF_FIELD_SIZE-1
+
+#define GF_BASE_FIELD_WIDTH (8)
+#define GF_BASE_FIELD_SIZE       (1 << GF_BASE_FIELD_WIDTH)
+
+struct gf_w16_logtable_data {
+    uint16_t      log_tbl[GF_FIELD_SIZE];
+    uint16_t      antilog_tbl[GF_FIELD_SIZE * 2];
+    uint16_t      inv_tbl[GF_FIELD_SIZE];
+    uint16_t      *d_antilog;
+};
+
+struct gf_w16_zero_logtable_data {
+    int           log_tbl[GF_FIELD_SIZE];
+    uint16_t      _antilog_tbl[GF_FIELD_SIZE * 4];
+    uint16_t      *antilog_tbl;
+    uint16_t      inv_tbl[GF_FIELD_SIZE];
+};
+
+struct gf_w16_lazytable_data {
+    uint16_t      log_tbl[GF_FIELD_SIZE];
+    uint16_t      antilog_tbl[GF_FIELD_SIZE * 2];
+    uint16_t      inv_tbl[GF_FIELD_SIZE];
+    uint16_t      *d_antilog;
+    uint16_t      lazytable[GF_FIELD_SIZE];
+};
+
+struct gf_w16_bytwo_data {
+    uint64_t prim_poly;
+    uint64_t mask1;
+    uint64_t mask2;
+};
+
+struct gf_w16_split_8_8_data {
+    uint16_t      tables[3][256][256];
+};
+
+struct gf_w16_group_4_4_data {
+    uint16_t reduce[16];
+    uint16_t shift[16];
+};
+
+struct gf_w16_composite_data {
+  uint8_t *mult_table;
+};
+
+void gf_w16_neon_split_init(gf_t *gf);
+
+#endif /* GF_COMPLETE_GF_W16_H */
diff --git a/src/Makefile.am b/src/Makefile.am
index 3e568d9..f04042b 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -12,7 +12,8 @@ libgf_complete_la_SOURCES = gf.c gf_method.c gf_wgen.c gf_w4.c gf_w8.c gf_w16.c
 
 if HAVE_NEON
 libgf_complete_la_SOURCES += neon/gf_w4_neon.c  \
-                             neon/gf_w8_neon.c
+                             neon/gf_w8_neon.c  \
+                             neon/gf_w16_neon.c
 endif
 
 libgf_complete_la_LDFLAGS = -version-info 1:0:0
diff --git a/src/gf_w16.c b/src/gf_w16.c
index 0904115..ce47849 100644
--- a/src/gf_w16.c
+++ b/src/gf_w16.c
@@ -11,54 +11,7 @@
 #include "gf_int.h"
 #include <stdio.h>
 #include <stdlib.h>
-
-#define GF_FIELD_WIDTH (16)
-#define GF_FIELD_SIZE (1 << GF_FIELD_WIDTH)
-#define GF_MULT_GROUP_SIZE GF_FIELD_SIZE-1
-
-#define GF_BASE_FIELD_WIDTH (8)
-#define GF_BASE_FIELD_SIZE       (1 << GF_BASE_FIELD_WIDTH)
-
-struct gf_w16_logtable_data {
-    uint16_t      log_tbl[GF_FIELD_SIZE];
-    uint16_t      antilog_tbl[GF_FIELD_SIZE * 2];
-    uint16_t      inv_tbl[GF_FIELD_SIZE];
-    uint16_t      *d_antilog;
-};
-
-struct gf_w16_zero_logtable_data {
-    int           log_tbl[GF_FIELD_SIZE];
-    uint16_t      _antilog_tbl[GF_FIELD_SIZE * 4];
-    uint16_t      *antilog_tbl;
-    uint16_t      inv_tbl[GF_FIELD_SIZE];
-};
-
-struct gf_w16_lazytable_data {
-    uint16_t      log_tbl[GF_FIELD_SIZE];
-    uint16_t      antilog_tbl[GF_FIELD_SIZE * 2];
-    uint16_t      inv_tbl[GF_FIELD_SIZE];
-    uint16_t      *d_antilog;
-    uint16_t      lazytable[GF_FIELD_SIZE];
-};
-
-struct gf_w16_bytwo_data {
-    uint64_t prim_poly;
-    uint64_t mask1;
-    uint64_t mask2;
-};
-
-struct gf_w16_split_8_8_data {
-    uint16_t      tables[3][256][256];
-};
-
-struct gf_w16_group_4_4_data {
-    uint16_t reduce[16];
-    uint16_t shift[16];
-};
-
-struct gf_w16_composite_data {
-  uint8_t *mult_table;
-};
+#include "gf_w16.h"
 
 #define AB2(ip, am1 ,am2, b, t1, t2) {\
   t1 = (b << 1) & am1;\
@@ -1264,6 +1217,7 @@ int gf_w16_split_init(gf_t *gf)
   gf_internal_t *h;
   struct gf_w16_split_8_8_data *d8;
   int i, j, exp, issse3;
+  int isneon = 0;
   uint32_t p, basep;
 
   h = (gf_internal_t *) gf->scratch;
@@ -1273,6 +1227,9 @@ int gf_w16_split_init(gf_t *gf)
 #else
   issse3 = 0;
 #endif
+#ifdef ARM_NEON
+  isneon = 1;
+#endif
 
   if (h->arg1 == 8 && h->arg2 == 8) {
     d8 = (struct gf_w16_split_8_8_data *) h->private;
@@ -1317,6 +1274,10 @@ int gf_w16_split_init(gf_t *gf)
 
   if (issse3) {
     gf->multiply_region.w32 = gf_w16_split_4_16_lazy_sse_multiply_region;
+  } else if (isneon) {
+#ifdef ARM_NEON
+    gf_w16_neon_split_init(gf);
+#endif
   } else {
     gf->multiply_region.w32 = gf_w16_split_8_16_lazy_multiply_region;
   }
@@ -1326,12 +1287,12 @@ int gf_w16_split_init(gf_t *gf)
     gf->multiply_region.w32 = gf_w16_split_8_16_lazy_multiply_region;
 
   } else if ((h->arg1 == 4 && h->arg2 == 16) || (h->arg2 == 4 && h->arg1 == 16)) {
-    if (issse3) {
+    if (issse3 || isneon) {
       if(h->region_type & GF_REGION_ALTMAP && h->region_type & GF_REGION_NOSIMD)
         gf->multiply_region.w32 = gf_w16_split_4_16_lazy_nosse_altmap_multiply_region;
       else if(h->region_type & GF_REGION_NOSIMD)
         gf->multiply_region.w32 = gf_w16_split_4_16_lazy_multiply_region;
-      else if(h->region_type & GF_REGION_ALTMAP)
+      else if(h->region_type & GF_REGION_ALTMAP && issse3)
         gf->multiply_region.w32 = gf_w16_split_4_16_lazy_sse_altmap_multiply_region;
     } else {
       if(h->region_type & GF_REGION_SIMD)
diff --git a/src/neon/gf_w16_neon.c b/src/neon/gf_w16_neon.c
new file mode 100644
index 0000000..95bfd80
--- /dev/null
+++ b/src/neon/gf_w16_neon.c
@@ -0,0 +1,356 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * Copyright (c) 2014: Janne Grunau <j@jannau.net>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ *  - Neither the name of the University of Tennessee nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * gf_w16_neon.c
+ *
+ * Neon routines for 16-bit Galois fields
+ *
+ */
+
+#include "gf_int.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "gf_w16.h"
+
+#ifdef ARCH_AARCH64
+static
+inline
+void
+neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst,
+                                 uint16_t *d_end, uint8_t *tbl,
+                                 gf_val_32_t val, int xor)
+{
+  unsigned i;
+  uint8_t *high = tbl + 4 * 16;
+  uint16x8_t va0, va1, r0, r1;
+  uint8x16_t loset, rl, rh;
+  uint8x16x2_t va;
+
+  uint8x16_t tbl_h[4], tbl_l[4];
+  for (i = 0; i < 4; i++) {
+      tbl_l[i] = vld1q_u8(tbl + i*16);
+      tbl_h[i] = vld1q_u8(high + i*16);
+  }
+
+  loset = vdupq_n_u8(0xf);
+
+  while (dst < d_end) {
+      va0 = vld1q_u16(src);
+      va1 = vld1q_u16(src + 8);
+
+      va = vtrnq_u8(vreinterpretq_u8_u16(va0), vreinterpretq_u8_u16(va1));
+
+      rl = vqtbl1q_u8(tbl_l[0], vandq_u8(va.val[0], loset));
+      rh = vqtbl1q_u8(tbl_h[0], vandq_u8(va.val[0], loset));
+      rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[2], vandq_u8(va.val[1], loset)));
+      rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[2], vandq_u8(va.val[1], loset)));
+
+      va.val[0] = vshrq_n_u8(va.val[0], 4);
+      va.val[1] = vshrq_n_u8(va.val[1], 4);
+
+      rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[1], va.val[0]));
+      rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[1], va.val[0]));
+      rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], va.val[1]));
+      rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], va.val[1]));
+
+      va = vtrnq_u8(rl, rh);
+      r0 = vreinterpretq_u16_u8(va.val[0]);
+      r1 = vreinterpretq_u16_u8(va.val[1]);
+
+      if (xor) {
+          va0 = vld1q_u16(dst);
+          va1 = vld1q_u16(dst + 8);
+          r0 = veorq_u16(r0, va0);
+          r1 = veorq_u16(r1, va1);
+      }
+      vst1q_u16(dst, r0);
+      vst1q_u16(dst + 8, r1);
+
+      src += 16;
+      dst += 16;
+  }
+}
+
+static
+inline
+void
+neon_w16_split_4_altmap_multiply_region(gf_t *gf, uint8_t *src,
+                                        uint8_t *dst, uint8_t *d_end,
+                                        uint8_t *tbl, gf_val_32_t val,
+                                        int xor)
+{
+  unsigned i;
+  uint8_t *high = tbl + 4 * 16;
+  uint8x16_t vh, vl, rh, rl;
+  uint8x16_t loset;
+
+  uint8x16_t tbl_h[4], tbl_l[4];
+  for (i = 0; i < 4; i++) {
+      tbl_l[i] = vld1q_u8(tbl + i*16);
+      tbl_h[i] = vld1q_u8(high + i*16);
+  }
+
+  loset = vdupq_n_u8(0xf);
+
+  while (dst < d_end) {
+      vh = vld1q_u8(src);
+      vl = vld1q_u8(src + 16);
+
+      rl = vqtbl1q_u8(tbl_l[0], vandq_u8(vl, loset));
+      rh = vqtbl1q_u8(tbl_h[0], vandq_u8(vl, loset));
+      rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[2], vandq_u8(vh, loset)));
+      rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[2], vandq_u8(vh, loset)));
+
+      vl = vshrq_n_u8(vl, 4);
+      vh = vshrq_n_u8(vh, 4);
+
+      rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[1], vl));
+      rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[1], vl));
+      rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], vh));
+      rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], vh));
+
+      if (xor) {
+          vh = vld1q_u8(dst);
+          vl = vld1q_u8(dst + 16);
+          rh = veorq_u8(rh, vh);
+          rl = veorq_u8(rl, vl);
+      }
+      vst1q_u8(dst, rh);
+      vst1q_u8(dst + 16, rl);
+
+      src += 32;
+      dst += 32;
+  }
+}
+
+#else /* ARCH_AARCH64 */
+
+static
+inline
+void
+neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst,
+                                 uint16_t *d_end, uint8_t *tbl,
+                                 gf_val_32_t val, int xor)
+{
+  unsigned i;
+  uint8_t *high = tbl + 4 * 16;
+  uint16x8_t va, r;
+  uint8x8_t loset, vb, vc, rl, rh;
+
+  uint8x8x2_t tbl_h[4], tbl_l[4];
+  for (i = 0; i < 4; i++) {
+      tbl_l[i].val[0] = vld1_u8(tbl + i*16);
+      tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8);
+      tbl_h[i].val[0] = vld1_u8(high + i*16);
+      tbl_h[i].val[1] = vld1_u8(high + i*16 + 8);
+  }
+
+  loset = vdup_n_u8(0xf);
+
+  while (dst < d_end) {
+      va = vld1q_u16(src);
+
+      vb = vmovn_u16(va);
+      vc = vshrn_n_u16(va, 8);
+
+      rl = vtbl2_u8(tbl_l[0], vand_u8(vb, loset));
+      rh = vtbl2_u8(tbl_h[0], vand_u8(vb, loset));
+      vb = vshr_n_u8(vb, 4);
+      rl = veor_u8(rl, vtbl2_u8(tbl_l[2], vand_u8(vc, loset)));
+      rh = veor_u8(rh, vtbl2_u8(tbl_h[2], vand_u8(vc, loset)));
+      vc = vshr_n_u8(vc, 4);
+      rl = veor_u8(rl, vtbl2_u8(tbl_l[1], vb));
+      rh = veor_u8(rh, vtbl2_u8(tbl_h[1], vb));
+      rl = veor_u8(rl, vtbl2_u8(tbl_l[3], vc));
+      rh = veor_u8(rh, vtbl2_u8(tbl_h[3], vc));
+
+      r  = vmovl_u8(rl);
+      r  = vorrq_u16(r, vshll_n_u8(rh, 8));
+
+      if (xor) {
+          va = vld1q_u16(dst);
+          r = veorq_u16(r, va);
+      }
+      vst1q_u16(dst, r);
+
+      src += 8;
+      dst += 8;
+  }
+}
+
+static
+inline
+void
+neon_w16_split_4_altmap_multiply_region(gf_t *gf, uint8_t *src,
+                                        uint8_t *dst, uint8_t *d_end,
+                                        uint8_t *tbl, gf_val_32_t val,
+                                        int xor)
+{
+  unsigned i;
+  uint8_t *high = tbl + 4 * 16;
+  uint8x8_t vh0, vh1, vl0, vl1, r0, r1, r2, r3;
+  uint8x8_t loset;
+
+  uint8x8x2_t tbl_h[4], tbl_l[4];
+  for (i = 0; i < 4; i++) {
+      tbl_l[i].val[0] = vld1_u8(tbl + i*16);
+      tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8);
+      tbl_h[i].val[0] = vld1_u8(high + i*16);
+      tbl_h[i].val[1] = vld1_u8(high + i*16 + 8);
+  }
+
+  loset = vdup_n_u8(0xf);
+
+  while (dst < d_end) {
+      vh0 = vld1_u8(src);
+      vh1 = vld1_u8(src + 8);
+      vl0 = vld1_u8(src + 16);
+      vl1 = vld1_u8(src + 24);
+
+      r0 = vtbl2_u8(tbl_l[0], vand_u8(vh0, loset));
+      r1 = vtbl2_u8(tbl_h[0], vand_u8(vh1, loset));
+      r2 = vtbl2_u8(tbl_l[2], vand_u8(vl0, loset));
+      r3 = vtbl2_u8(tbl_h[2], vand_u8(vl1, loset));
+
+      vh0 = vshr_n_u8(vh0, 4);
+      vh1 = vshr_n_u8(vh1, 4);
+      vl0 = vshr_n_u8(vl0, 4);
+      vl1 = vshr_n_u8(vl1, 4);
+
+      r0 = veor_u8(r0, vtbl2_u8(tbl_l[1], vh0));
+      r1 = veor_u8(r1, vtbl2_u8(tbl_h[1], vh1));
+      r2 = veor_u8(r2, vtbl2_u8(tbl_l[3], vl0));
+      r3 = veor_u8(r3, vtbl2_u8(tbl_h[3], vl1));
+
+      if (xor) {
+          vh0 = vld1_u8(dst);
+          vh1 = vld1_u8(dst + 8);
+          vl0 = vld1_u8(dst + 16);
+          vl1 = vld1_u8(dst + 24);
+          r0  = veor_u8(r0, vh0);
+          r1  = veor_u8(r1, vh1);
+          r2  = veor_u8(r2, vl0);
+          r3  = veor_u8(r3, vl1);
+      }
+      vst1_u8(dst,      r0);
+      vst1_u8(dst +  8, r1);
+      vst1_u8(dst + 16, r2);
+      vst1_u8(dst + 24, r3);
+
+      src += 32;
+      dst += 32;
+  }
+}
+#endif /* ARCH_AARCH64 */
+
+static
+inline
+void
+neon_w16_split_4_16_lazy_multiply_region(gf_t *gf, void *src, void *dest,
+                                         gf_val_32_t val, int bytes, int xor,
+                                         int altmap)
+{
+  gf_region_data rd;
+  unsigned i, j;
+  uint64_t c, prod;
+  uint8_t tbl[2 * 4 * 16];
+  uint8_t *high = tbl + 4 * 16;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  for (i = 0; i < 4; i++) {
+    for (j = 0; j < 16; j++) {
+      c = (j << (i*4));
+      prod = gf->multiply.w32(gf, c, val);
+      tbl[i*16 + j]  = prod & 0xff;
+      high[i*16 + j] = prod >> 8;
+    }
+  }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
+  gf_do_initial_region_alignment(&rd);
+
+  if (altmap) {
+    uint8_t *s8   = rd.s_start;
+    uint8_t *d8   = rd.d_start;
+    uint8_t *end8 = rd.d_top;
+    if (xor)
+      neon_w16_split_4_altmap_multiply_region(gf, s8, d8, end8, tbl, val, 1);
+    else
+      neon_w16_split_4_altmap_multiply_region(gf, s8, d8, end8, tbl, val, 0);
+  } else {
+    uint16_t *s16   = rd.s_start;
+    uint16_t *d16   = rd.d_start;
+    uint16_t *end16 = rd.d_top;
+    if (xor)
+      neon_w16_split_4_multiply_region(gf, s16, d16, end16, tbl, val, 1);
+    else
+      neon_w16_split_4_multiply_region(gf, s16, d16, end16, tbl, val, 0);
+  }
+
+  gf_do_final_region_alignment(&rd);
+}
+
+static
+void
+gf_w16_split_4_16_lazy_multiply_region_neon(gf_t *gf, void *src, void *dest,
+                                            gf_val_32_t val, int bytes, int xor)
+{
+  neon_w16_split_4_16_lazy_multiply_region(gf, src, dest, val, bytes, xor, 0);
+}
+
+static
+void
+gf_w16_split_4_16_lazy_altmap_multiply_region_neon(gf_t *gf, void *src,
+                                                   void *dest,
+                                                   gf_val_32_t val, int bytes,
+                                                   int xor)
+{
+  neon_w16_split_4_16_lazy_multiply_region(gf, src, dest, val, bytes, xor, 1);
+}
+
+
+void gf_w16_neon_split_init(gf_t *gf)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+
+  if (h->region_type & GF_REGION_ALTMAP)
+    gf->multiply_region.w32 = gf_w16_split_4_16_lazy_altmap_multiply_region_neon;
+  else
+    gf->multiply_region.w32 = gf_w16_split_4_16_lazy_multiply_region_neon;
+}
-- 
cgit v1.2.1


From 370c88b9015cbe874aca81442a5d8f6f99bfb654 Mon Sep 17 00:00:00 2001
From: Janne Grunau <j@jannau.net>
Date: Wed, 17 Sep 2014 16:13:02 +0200
Subject: arm: NEON optimisations for gf_w32

Optimisations for 4,32 split table multiplications.

Selected time_tool.sh results on a 1.7 GHz cortex-a9:
Region Best (MB/s):   346.67   W-Method: 32 -m SPLIT 32 4 -r SIMD -
Region Best (MB/s):    92.89   W-Method: 32 -m SPLIT 32 4 -r NOSIMD -
Region Best (MB/s):   258.17   W-Method: 32 -m SPLIT 32 4 -r SIMD -r ALTMAP -
Region Best (MB/s):   162.00   W-Method: 32 -m SPLIT 32 8 -
Region Best (MB/s):   160.53   W-Method: 32 -m SPLIT 8 8 -
Region Best (MB/s):    32.74   W-Method: 32 -m COMPOSITE 2 - -
Region Best (MB/s):   199.79   W-Method: 32 -m COMPOSITE 2 - -r ALTMAP -
---
 include/gf_w32.h       |  71 +++++++++++++
 src/Makefile.am        |   3 +-
 src/gf_w32.c           |  72 +++----------
 src/neon/gf_w32_neon.c | 269 +++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 358 insertions(+), 57 deletions(-)
 create mode 100644 include/gf_w32.h
 create mode 100644 src/neon/gf_w32_neon.c

diff --git a/include/gf_w32.h b/include/gf_w32.h
new file mode 100644
index 0000000..3396402
--- /dev/null
+++ b/include/gf_w32.h
@@ -0,0 +1,71 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_w32.h
+ *
+ * Defines and data structures for 32-bit Galois fields
+ */
+
+#ifndef GF_COMPLETE_GF_W32_H
+#define GF_COMPLETE_GF_W32_H
+
+#include <stdint.h>
+
+#define GF_FIELD_WIDTH (32)
+#define GF_FIRST_BIT (1 << 31)
+
+#define GF_BASE_FIELD_WIDTH (16)
+#define GF_BASE_FIELD_SIZE       (1 << GF_BASE_FIELD_WIDTH)
+#define GF_BASE_FIELD_GROUP_SIZE  GF_BASE_FIELD_SIZE-1
+#define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : (p) << 1)
+
+struct gf_split_2_32_lazy_data {
+    uint32_t      tables[16][4];
+    uint32_t      last_value;
+};
+
+struct gf_w32_split_8_8_data {
+    uint32_t      tables[7][256][256];
+    uint32_t      region_tables[4][256];
+    uint32_t      last_value;
+};
+
+struct gf_w32_group_data {
+    uint32_t *reduce;
+    uint32_t *shift;
+    int      tshift;
+    uint64_t rmask;
+    uint32_t *memory;
+};
+
+struct gf_split_16_32_lazy_data {
+    uint32_t      tables[2][(1<<16)];
+    uint32_t      last_value;
+};
+
+struct gf_split_8_32_lazy_data {
+    uint32_t      tables[4][256];
+    uint32_t      last_value;
+};
+
+struct gf_split_4_32_lazy_data {
+    uint32_t      tables[8][16];
+    uint32_t      last_value;
+};
+
+struct gf_w32_bytwo_data {
+    uint64_t prim_poly;
+    uint64_t mask1;
+    uint64_t mask2;
+};
+
+struct gf_w32_composite_data {
+  uint16_t *log;
+  uint16_t *alog;
+};
+
+void gf_w32_neon_split_init(gf_t *gf);
+
+#endif /* GF_COMPLETE_GF_W32_H */
diff --git a/src/Makefile.am b/src/Makefile.am
index f04042b..a7f7ced 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -13,7 +13,8 @@ libgf_complete_la_SOURCES = gf.c gf_method.c gf_wgen.c gf_w4.c gf_w8.c gf_w16.c
 if HAVE_NEON
 libgf_complete_la_SOURCES += neon/gf_w4_neon.c  \
                              neon/gf_w8_neon.c  \
-                             neon/gf_w16_neon.c
+                             neon/gf_w16_neon.c \
+                             neon/gf_w32_neon.c
 endif
 
 libgf_complete_la_LDFLAGS = -version-info 1:0:0
diff --git a/src/gf_w32.c b/src/gf_w32.c
index 8e7c741..2e187fd 100644
--- a/src/gf_w32.c
+++ b/src/gf_w32.c
@@ -12,59 +12,7 @@
 #include "gf_int.h"
 #include <stdio.h>
 #include <stdlib.h>
-
-#define GF_FIELD_WIDTH (32)
-#define GF_FIRST_BIT (1 << 31)
-
-#define GF_BASE_FIELD_WIDTH (16)
-#define GF_BASE_FIELD_SIZE       (1 << GF_BASE_FIELD_WIDTH)
-#define GF_BASE_FIELD_GROUP_SIZE  GF_BASE_FIELD_SIZE-1
-#define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : (p) << 1)
-
-struct gf_split_2_32_lazy_data {
-    uint32_t      tables[16][4];
-    uint32_t      last_value;
-};
-
-struct gf_w32_split_8_8_data {
-    uint32_t      tables[7][256][256];
-    uint32_t      region_tables[4][256];
-    uint32_t      last_value;
-};
-
-struct gf_w32_group_data {
-    uint32_t *reduce;
-    uint32_t *shift;
-    int      tshift;
-    uint64_t rmask;
-    uint32_t *memory;
-};
-
-struct gf_split_16_32_lazy_data {
-    uint32_t      tables[2][(1<<16)];
-    uint32_t      last_value;
-};
-
-struct gf_split_8_32_lazy_data {
-    uint32_t      tables[4][256];
-    uint32_t      last_value;
-};
-
-struct gf_split_4_32_lazy_data {
-    uint32_t      tables[8][16];
-    uint32_t      last_value;
-};
-
-struct gf_w32_bytwo_data {
-    uint64_t prim_poly;
-    uint64_t mask1;
-    uint64_t mask2;
-};
-
-struct gf_w32_composite_data {
-  uint16_t *log;
-  uint16_t *alog;
-};
+#include "gf_w32.h"
 
 #define MM_PRINT32(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 4) printf(" %02x%02x%02x%02x", blah[15-ii], blah[14-ii], blah[13-ii], blah[12-ii]); printf("\n"); }
 
@@ -2283,6 +2231,7 @@ int gf_w32_split_init(gf_t *gf)
   struct gf_split_16_32_lazy_data *d16;
   uint32_t p, basep;
   int i, j, exp, ispclmul, issse3;
+  int isneon = 0;
 
 #if defined(INTEL_SSE4_PCLMUL)
   ispclmul = 1;
@@ -2295,6 +2244,9 @@ int gf_w32_split_init(gf_t *gf)
 #else
   issse3 = 0;
 #endif
+#ifdef ARM_NEON
+  isneon = 1;
+#endif
 
   h = (gf_internal_t *) gf->scratch;
 
@@ -2349,11 +2301,15 @@ int gf_w32_split_init(gf_t *gf)
   /* 4/32 or Default + SSE - There is no ALTMAP/NOSSE. */
 
   if ((h->arg1 == 4 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 4) ||
-      (issse3 && h->mult_type == GF_REGION_DEFAULT)) {
+      ((issse3 || isneon) && h->mult_type == GF_REGION_DEFAULT)) {
     ld4 = (struct gf_split_4_32_lazy_data *) h->private;
     ld4->last_value = 0;
-    if ((h->region_type & GF_REGION_NOSIMD) || !issse3) {
+    if ((h->region_type & GF_REGION_NOSIMD) || !(issse3 || isneon)) {
       gf->multiply_region.w32 = gf_w32_split_4_32_lazy_multiply_region;
+    } else if (isneon) {
+#ifdef ARM_NEON
+      gf_w32_neon_split_init(gf);
+#endif
     } else if (h->region_type & GF_REGION_ALTMAP) {
       gf->multiply_region.w32 = gf_w32_split_4_32_lazy_sse_altmap_multiply_region;
     } else {
@@ -2731,10 +2687,14 @@ int gf_w32_composite_init(gf_t *gf)
 int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
 {
   int issse3 = 0;
+  int isneon = 0;
 
 #ifdef INTEL_SSSE3
   issse3 = 1;
 #endif
+#ifdef ARM_NEON
+  isneon = 1;
+#endif
 
   switch(mult_type)
   {
@@ -2760,7 +2720,7 @@ int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg
           return sizeof(gf_internal_t) + sizeof(struct gf_split_2_32_lazy_data) + 64;
         }
         if ((arg1 == 8 && arg2 == 32) || (arg2 == 8 && arg1 == 32) || 
-             (mult_type == GF_MULT_DEFAULT && !issse3)) {
+             (mult_type == GF_MULT_DEFAULT && !(issse3 || isneon))) {
           return sizeof(gf_internal_t) + sizeof(struct gf_split_8_32_lazy_data) + 64;
         }
         if ((arg1 == 4 && arg2 == 32) || 
diff --git a/src/neon/gf_w32_neon.c b/src/neon/gf_w32_neon.c
new file mode 100644
index 0000000..8231eb3
--- /dev/null
+++ b/src/neon/gf_w32_neon.c
@@ -0,0 +1,269 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * Copyright (c) 2014: Janne Grunau <j@jannau.net>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ *  - Neither the name of the University of Tennessee nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * gf_w32_neon.c
+ *
+ * Neon routines for 32-bit Galois fields
+ *
+ */
+
+
+#include "gf_int.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "gf_w32.h"
+
+#ifndef ARCH_AARCH64
+#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)),   \
+                                       vtbl2_u8(tbl, vget_high_u8(v)))
+#endif
+
+static
+void
+neon_w32_split_4_32_multiply_region(gf_t *gf, uint32_t *src, uint32_t *dst,
+                                    uint32_t *d_end, uint8_t btable[8][4][16],
+                                    uint32_t val, int xor, int altmap)
+{
+  int i, j;
+#ifdef ARCH_AARCH64
+  uint8x16_t tables[8][4];
+#else
+  uint8x8x2_t tables[8][4];
+#endif
+  uint32x4_t v0, v1, v2, v3, s0, s1, s2, s3;
+  uint8x16_t p0, p1, p2, p3, si, mask1;
+  uint16x8x2_t r0, r1;
+  uint8x16x2_t q0, q1;
+
+  for (i = 0; i < 8; i++) {
+    for (j = 0; j < 4; j++) {
+#ifdef ARCH_AARCH64
+      tables[i][j] = vld1q_u8(btable[i][j]);
+#else
+      tables[i][j].val[0] = vld1_u8(btable[i][j]);
+      tables[i][j].val[1] = vld1_u8(btable[i][j] + 8);
+#endif
+    }
+  }
+
+  mask1 = vdupq_n_u8(0xf);
+
+  while (dst < d_end) {
+
+      v0 = vld1q_u32(src); src += 4;
+      v1 = vld1q_u32(src); src += 4;
+      v2 = vld1q_u32(src); src += 4;
+      v3 = vld1q_u32(src); src += 4;
+
+      if (altmap) {
+          q0.val[0] = vreinterpretq_u8_u32(v0);
+          q0.val[1] = vreinterpretq_u8_u32(v1);
+          q1.val[0] = vreinterpretq_u8_u32(v2);
+          q1.val[1] = vreinterpretq_u8_u32(v3);
+      } else {
+          r0 = vtrnq_u16(vreinterpretq_u16_u32(v0), vreinterpretq_u16_u32(v2));
+          r1 = vtrnq_u16(vreinterpretq_u16_u32(v1), vreinterpretq_u16_u32(v3));
+
+          q0 = vtrnq_u8(vreinterpretq_u8_u16(r0.val[0]),
+                        vreinterpretq_u8_u16(r1.val[0]));
+          q1 = vtrnq_u8(vreinterpretq_u8_u16(r0.val[1]),
+                        vreinterpretq_u8_u16(r1.val[1]));
+      }
+
+      si = vandq_u8(q0.val[0], mask1);
+      p0 = vqtbl1q_u8(tables[0][0], si);
+      p1 = vqtbl1q_u8(tables[0][1], si);
+      p2 = vqtbl1q_u8(tables[0][2], si);
+      p3 = vqtbl1q_u8(tables[0][3], si);
+
+      si = vshrq_n_u8(q0.val[0], 4);
+      p0 = veorq_u8(p0, vqtbl1q_u8(tables[1][0], si));
+      p1 = veorq_u8(p1, vqtbl1q_u8(tables[1][1], si));
+      p2 = veorq_u8(p2, vqtbl1q_u8(tables[1][2], si));
+      p3 = veorq_u8(p3, vqtbl1q_u8(tables[1][3], si));
+
+      si = vandq_u8(q0.val[1], mask1);
+      p0 = veorq_u8(p0, vqtbl1q_u8(tables[2][0], si));
+      p1 = veorq_u8(p1, vqtbl1q_u8(tables[2][1], si));
+      p2 = veorq_u8(p2, vqtbl1q_u8(tables[2][2], si));
+      p3 = veorq_u8(p3, vqtbl1q_u8(tables[2][3], si));
+
+      si = vshrq_n_u8(q0.val[1], 4);
+      p0 = veorq_u8(p0, vqtbl1q_u8(tables[3][0], si));
+      p1 = veorq_u8(p1, vqtbl1q_u8(tables[3][1], si));
+      p2 = veorq_u8(p2, vqtbl1q_u8(tables[3][2], si));
+      p3 = veorq_u8(p3, vqtbl1q_u8(tables[3][3], si));
+
+      si = vandq_u8(q1.val[0], mask1);
+      p0 = veorq_u8(p0, vqtbl1q_u8(tables[4][0], si));
+      p1 = veorq_u8(p1, vqtbl1q_u8(tables[4][1], si));
+      p2 = veorq_u8(p2, vqtbl1q_u8(tables[4][2], si));
+      p3 = veorq_u8(p3, vqtbl1q_u8(tables[4][3], si));
+
+      si = vshrq_n_u8(q1.val[0], 4);
+      p0 = veorq_u8(p0, vqtbl1q_u8(tables[5][0], si));
+      p1 = veorq_u8(p1, vqtbl1q_u8(tables[5][1], si));
+      p2 = veorq_u8(p2, vqtbl1q_u8(tables[5][2], si));
+      p3 = veorq_u8(p3, vqtbl1q_u8(tables[5][3], si));
+
+      si = vandq_u8(q1.val[1], mask1);
+      p0 = veorq_u8(p0, vqtbl1q_u8(tables[6][0], si));
+      p1 = veorq_u8(p1, vqtbl1q_u8(tables[6][1], si));
+      p2 = veorq_u8(p2, vqtbl1q_u8(tables[6][2], si));
+      p3 = veorq_u8(p3, vqtbl1q_u8(tables[6][3], si));
+
+      si = vshrq_n_u8(q1.val[1], 4);
+      p0 = veorq_u8(p0, vqtbl1q_u8(tables[7][0], si));
+      p1 = veorq_u8(p1, vqtbl1q_u8(tables[7][1], si));
+      p2 = veorq_u8(p2, vqtbl1q_u8(tables[7][2], si));
+      p3 = veorq_u8(p3, vqtbl1q_u8(tables[7][3], si));
+
+      if (altmap) {
+          s0 = vreinterpretq_u32_u8(p0);
+          s1 = vreinterpretq_u32_u8(p1);
+          s2 = vreinterpretq_u32_u8(p2);
+          s3 = vreinterpretq_u32_u8(p3);
+      } else {
+          q0 = vtrnq_u8(p0, p1);
+          q1 = vtrnq_u8(p2, p3);
+
+          r0 = vtrnq_u16(vreinterpretq_u16_u8(q0.val[0]),
+                         vreinterpretq_u16_u8(q1.val[0]));
+          r1 = vtrnq_u16(vreinterpretq_u16_u8(q0.val[1]),
+                         vreinterpretq_u16_u8(q1.val[1]));
+
+          s0 = vreinterpretq_u32_u16(r0.val[0]);
+          s1 = vreinterpretq_u32_u16(r1.val[0]);
+          s2 = vreinterpretq_u32_u16(r0.val[1]);
+          s3 = vreinterpretq_u32_u16(r1.val[1]);
+      }
+
+      if (xor) {
+          v0 = vld1q_u32(dst);
+          v1 = vld1q_u32(dst + 4);
+          v2 = vld1q_u32(dst + 8);
+          v3 = vld1q_u32(dst + 12);
+          s0 = veorq_u32(s0, v0);
+          s1 = veorq_u32(s1, v1);
+          s2 = veorq_u32(s2, v2);
+          s3 = veorq_u32(s3, v3);
+      }
+
+      vst1q_u32(dst,      s0);
+      vst1q_u32(dst + 4,  s1);
+      vst1q_u32(dst + 8,  s2);
+      vst1q_u32(dst + 12, s3);
+
+      dst += 16;
+  }
+}
+
+static
+inline
+void
+neon_w32_split_4_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor, int altmap)
+{
+  gf_internal_t *h;
+  int i, j, k;
+  uint32_t pp, v, *s32, *d32, *top, tmp_table[16];
+  uint8_t btable[8][4][16];
+  gf_region_data rd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  h = (gf_internal_t *) gf->scratch;
+  pp = h->prim_poly;
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 64);
+  gf_do_initial_region_alignment(&rd);
+
+  s32 = (uint32_t *) rd.s_start;
+  d32 = (uint32_t *) rd.d_start;
+  top = (uint32_t *) rd.d_top;
+
+  v = val;
+  for (i = 0; i < 8; i++) {
+    tmp_table[0] = 0;
+    for (j = 1; j < 16; j <<= 1) {
+      for (k = 0; k < j; k++) {
+        tmp_table[k^j] = (v ^ tmp_table[k]);
+      }
+      v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
+    }
+    for (j = 0; j < 4; j++) {
+      for (k = 0; k < 16; k++) {
+        btable[i][j][k] = (uint8_t) tmp_table[k];
+        tmp_table[k] >>= 8;
+      }
+    }
+  }
+
+  if (xor)
+    neon_w32_split_4_32_multiply_region(gf, s32, d32, top, btable, val, 1, altmap);
+  else
+    neon_w32_split_4_32_multiply_region(gf, s32, d32, top, btable, val, 0, altmap);
+
+  gf_do_final_region_alignment(&rd);
+}
+
+static
+void
+gf_w32_split_4_32_lazy_multiply_region_neon(gf_t *gf, void *src, void *dest,
+                                            gf_val_32_t val, int bytes, int xor)
+{
+  neon_w32_split_4_32_lazy_multiply_region(gf, src, dest, val, bytes, xor, 0);
+}
+
+static
+void
+gf_w32_split_4_32_lazy_altmap_multiply_region_neon(gf_t *gf, void *src,
+                                                   void *dest, gf_val_32_t val,
+                                                   int bytes, int xor)
+{
+  neon_w32_split_4_32_lazy_multiply_region(gf, src, dest, val, bytes, xor, 1);
+}
+
+void gf_w32_neon_split_init(gf_t *gf)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+
+  if (h->region_type & GF_REGION_ALTMAP)
+      gf->multiply_region.w32 = gf_w32_split_4_32_lazy_altmap_multiply_region_neon;
+  else
+      gf->multiply_region.w32 = gf_w32_split_4_32_lazy_multiply_region_neon;
+
+}
-- 
cgit v1.2.1


From 6fdd8bc3d32cb2f7fa55d2de9dc7cc5bb2f885aa Mon Sep 17 00:00:00 2001
From: Janne Grunau <j@jannau.net>
Date: Wed, 17 Sep 2014 16:15:27 +0200
Subject: arm: NEON optimisations for gf_w64

Optimisations for 4,64 split table region multiplications. Only used on
ARMv8-A since it is not faster on ARMv7-A.
---
 include/gf_w64.h       |  50 ++++++++
 src/Makefile.am        |   3 +-
 src/gf_w64.c           |  51 +++-----
 src/neon/gf_w64_neon.c | 333 +++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 400 insertions(+), 37 deletions(-)
 create mode 100644 include/gf_w64.h
 create mode 100644 src/neon/gf_w64_neon.c

diff --git a/include/gf_w64.h b/include/gf_w64.h
new file mode 100644
index 0000000..9a74a81
--- /dev/null
+++ b/include/gf_w64.h
@@ -0,0 +1,50 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_w64.h
+ *
+ * Defines and data structures for 64-bit Galois fields
+ */
+
+#ifndef GF_COMPLETE_GF_W64_H
+#define GF_COMPLETE_GF_W64_H
+
+#include <stdint.h>
+
+#define GF_FIELD_WIDTH (64)
+#define GF_FIRST_BIT (1ULL << 63)
+
+#define GF_BASE_FIELD_WIDTH (32)
+#define GF_BASE_FIELD_SIZE       (1ULL << GF_BASE_FIELD_WIDTH)
+#define GF_BASE_FIELD_GROUP_SIZE  GF_BASE_FIELD_SIZE-1
+
+struct gf_w64_group_data {
+    uint64_t *reduce;
+    uint64_t *shift;
+    uint64_t *memory;
+};
+
+struct gf_split_4_64_lazy_data {
+    uint64_t      tables[16][16];
+    uint64_t      last_value;
+};
+
+struct gf_split_8_64_lazy_data {
+    uint64_t      tables[8][(1<<8)];
+    uint64_t      last_value;
+};
+
+struct gf_split_16_64_lazy_data {
+    uint64_t      tables[4][(1<<16)];
+    uint64_t      last_value;
+};
+
+struct gf_split_8_8_data {
+    uint64_t      tables[15][256][256];
+};
+
+void gf_w64_neon_split_init(gf_t *gf);
+
+#endif /* GF_COMPLETE_GF_W64_H */
diff --git a/src/Makefile.am b/src/Makefile.am
index a7f7ced..240c1fe 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -14,7 +14,8 @@ if HAVE_NEON
 libgf_complete_la_SOURCES += neon/gf_w4_neon.c  \
                              neon/gf_w8_neon.c  \
                              neon/gf_w16_neon.c \
-                             neon/gf_w32_neon.c
+                             neon/gf_w32_neon.c \
+                             neon/gf_w64_neon.c
 endif
 
 libgf_complete_la_LDFLAGS = -version-info 1:0:0
diff --git a/src/gf_w64.c b/src/gf_w64.c
index fe1c75d..6e75f5e 100644
--- a/src/gf_w64.c
+++ b/src/gf_w64.c
@@ -11,38 +11,7 @@
 #include "gf_int.h"
 #include <stdio.h>
 #include <stdlib.h>
-
-#define GF_FIELD_WIDTH (64)
-#define GF_FIRST_BIT (1ULL << 63)
-
-#define GF_BASE_FIELD_WIDTH (32)
-#define GF_BASE_FIELD_SIZE       (1ULL << GF_BASE_FIELD_WIDTH)
-#define GF_BASE_FIELD_GROUP_SIZE  GF_BASE_FIELD_SIZE-1
-
-struct gf_w64_group_data {
-    uint64_t *reduce;
-    uint64_t *shift;
-    uint64_t *memory;
-};
-
-struct gf_split_4_64_lazy_data {
-    uint64_t      tables[16][16];
-    uint64_t      last_value;
-};
-
-struct gf_split_8_64_lazy_data {
-    uint64_t      tables[8][(1<<8)];
-    uint64_t      last_value;
-};
-
-struct gf_split_16_64_lazy_data {
-    uint64_t      tables[4][(1<<16)];
-    uint64_t      last_value;
-};
-
-struct gf_split_8_8_data {
-    uint64_t      tables[15][256][256];
-};
+#include "gf_w64.h"
 
 static
 inline
@@ -2027,11 +1996,15 @@ int gf_w64_split_init(gf_t *gf)
   /* Allen: set region pointers for default mult type. Single pointers are
    * taken care of above (explicitly for sse, implicitly for no sse). */
 
-#ifdef INTEL_SSE4
+#if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
   if (h->mult_type == GF_MULT_DEFAULT) {
     d4 = (struct gf_split_4_64_lazy_data *) h->private;
     d4->last_value = 0;
+#if defined(INTEL_SSE4)
     gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_multiply_region; 
+#elif defined(ARCH_AARCH64)
+    gf_w64_neon_split_init(gf);
+#endif
   }
 #else
   if (h->mult_type == GF_MULT_DEFAULT) {
@@ -2050,17 +2023,23 @@ int gf_w64_split_init(gf_t *gf)
     {
       #ifdef INTEL_SSSE3
         gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_altmap_multiply_region; 
+      #elif defined(ARCH_AARCH64)
+        gf_w64_neon_split_init(gf);
       #else
         return 0;
       #endif
     }
     else //no altmap
     {
-      #ifdef INTEL_SSE4
+      #if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
         if(h->region_type & GF_REGION_NOSIMD)
           gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region;
         else
-          gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_multiply_region; 
+        #if defined(INTEL_SSE4)
+          gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_multiply_region;
+        #elif defined(ARCH_AARCH64)
+          gf_w64_neon_split_init(gf);
+        #endif
       #else
         gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region;
         if(h->region_type & GF_REGION_SIMD)
@@ -2134,7 +2113,7 @@ int gf_w64_scratch_size(int mult_type, int region_type, int divide_type, int arg
       /* Allen: set the *local* arg1 and arg2, just for scratch size purposes,
        * then fall through to split table scratch size code. */
 
-#ifdef INTEL_SSE4
+#if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
       arg1 = 64;
       arg2 = 4;
 #else
diff --git a/src/neon/gf_w64_neon.c b/src/neon/gf_w64_neon.c
new file mode 100644
index 0000000..0eca9c7
--- /dev/null
+++ b/src/neon/gf_w64_neon.c
@@ -0,0 +1,333 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * Copyright (c) 2014: Janne Grunau <j@jannau.net>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ *  - Neither the name of the University of Tennessee nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * gf_w64_neon.c
+ *
+ * Neon routines for 64-bit Galois fields
+ *
+ */
+
+#include "gf_int.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "gf_w64.h"
+
+
+#ifndef ARCH_AARCH64
+#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)),   \
+                                       vtbl2_u8(tbl, vget_high_u8(v)))
+#endif
+
+static
+inline
+void
+neon_w64_split_4_lazy_altmap_multiply_region(gf_t *gf, uint64_t *src,
+                                             uint64_t *dst, uint64_t *d_end,
+                                             uint64_t val, int xor)
+{
+  unsigned i, j, k;
+  uint8_t btable[16];
+#ifdef ARCH_AARCH64
+  uint8x16_t tables[16][8];
+#else
+  uint8x8x2_t tables[16][8];
+#endif
+  uint8x16_t p[8], mask1, si;
+
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  struct gf_split_4_64_lazy_data *ld = (struct gf_split_4_64_lazy_data *) h->private;
+
+  for (i = 0; i < 16; i++) {
+    for (j = 0; j < 8; j++) {
+      for (k = 0; k < 16; k++) {
+        btable[k] = (uint8_t) ld->tables[i][k];
+        ld->tables[i][k] >>= 8;
+      }
+#ifdef ARCH_AARCH64
+      tables[i][j] = vld1q_u8(btable);
+#else
+      tables[i][j].val[0] = vld1_u8(btable);
+      tables[i][j].val[1] = vld1_u8(btable + 8);
+#endif
+    }
+  }
+
+  mask1 = vdupq_n_u8(0xf);
+
+  while (dst < d_end) {
+
+    if (xor) {
+      for (i = 0; i < 8; i++)
+        p[i] = vld1q_u8((uint8_t *) (dst + i * 2));
+    } else {
+      for (i = 0; i < 8; i++)
+        p[i] = vdupq_n_u8(0);
+    }
+
+    i = 0;
+    for (k = 0; k < 8; k++) {
+      uint8x16_t v0 = vld1q_u8((uint8_t *) src);
+      src += 2;
+
+      si = vandq_u8(v0, mask1);
+      for (j = 0; j < 8; j++) {
+        p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[i][j], si));
+      }
+      i++;
+      si = vshrq_n_u8(v0, 4);
+      for (j = 0; j < 8; j++) {
+        p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[i][j], si));
+      }
+      i++;
+
+    }
+    for (i = 0; i < 8; i++) {
+      vst1q_u8((uint8_t *) dst, p[i]);
+      dst += 2;
+    }
+  }
+}
+
+static
+inline
+void
+neon_w64_split_4_lazy_multiply_region(gf_t *gf, uint64_t *src, uint64_t *dst,
+                                      uint64_t *d_end, uint64_t val, int xor)
+{
+  unsigned i, j, k;
+  uint8_t btable[16];
+#ifdef ARCH_AARCH64
+  uint8x16_t tables[16][8];
+#else
+  uint8x8x2_t tables[16][8];
+#endif
+  uint8x16_t p[8], mask1, si;
+  uint64x2_t st[8];
+  uint32x4x2_t s32[4];
+  uint16x8x2_t s16[4];
+  uint8x16x2_t s8[4];
+
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  struct gf_split_4_64_lazy_data *ld = (struct gf_split_4_64_lazy_data *) h->private;
+
+  for (i = 0; i < 16; i++) {
+    for (j = 0; j < 8; j++) {
+      for (k = 0; k < 16; k++) {
+        btable[k] = (uint8_t) ld->tables[i][k];
+        ld->tables[i][k] >>= 8;
+      }
+#ifdef ARCH_AARCH64
+      tables[i][j] = vld1q_u8(btable);
+#else
+      tables[i][j].val[0] = vld1_u8(btable);
+      tables[i][j].val[1] = vld1_u8(btable + 8);
+#endif
+    }
+  }
+
+  mask1 = vdupq_n_u8(0xf);
+
+  while (dst < d_end) {
+
+    for (k = 0; k < 8; k++) {
+      st[k]  = vld1q_u64(src);
+      src += 2;
+      p[k] = vdupq_n_u8(0);
+    }
+
+    s32[0] = vuzpq_u32(vreinterpretq_u32_u64(st[0]),
+                       vreinterpretq_u32_u64(st[1]));
+    s32[1] = vuzpq_u32(vreinterpretq_u32_u64(st[2]),
+                       vreinterpretq_u32_u64(st[3]));
+    s32[2] = vuzpq_u32(vreinterpretq_u32_u64(st[4]),
+                       vreinterpretq_u32_u64(st[5]));
+    s32[3] = vuzpq_u32(vreinterpretq_u32_u64(st[6]),
+                       vreinterpretq_u32_u64(st[7]));
+
+    s16[0] = vuzpq_u16(vreinterpretq_u16_u32(s32[0].val[0]),
+                       vreinterpretq_u16_u32(s32[1].val[0]));
+    s16[1] = vuzpq_u16(vreinterpretq_u16_u32(s32[2].val[0]),
+                       vreinterpretq_u16_u32(s32[3].val[0]));
+    s16[2] = vuzpq_u16(vreinterpretq_u16_u32(s32[0].val[1]),
+                       vreinterpretq_u16_u32(s32[1].val[1]));
+    s16[3] = vuzpq_u16(vreinterpretq_u16_u32(s32[2].val[1]),
+                       vreinterpretq_u16_u32(s32[3].val[1]));
+
+    s8[0]  = vuzpq_u8(vreinterpretq_u8_u16(s16[0].val[0]),
+                      vreinterpretq_u8_u16(s16[1].val[0]));
+    s8[1]  = vuzpq_u8(vreinterpretq_u8_u16(s16[0].val[1]),
+                      vreinterpretq_u8_u16(s16[1].val[1]));
+    s8[2]  = vuzpq_u8(vreinterpretq_u8_u16(s16[2].val[0]),
+                      vreinterpretq_u8_u16(s16[3].val[0]));
+    s8[3]  = vuzpq_u8(vreinterpretq_u8_u16(s16[2].val[1]),
+                      vreinterpretq_u8_u16(s16[3].val[1]));
+
+    i = 0;
+    for (k = 0; k < 8; k++) {
+      si = vandq_u8(s8[k >> 1].val[k & 1], mask1);
+      for (j = 0; j < 8; j++) {
+        p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[i][j], si));
+      }
+      i++;
+      si = vshrq_n_u8(s8[k >> 1].val[k & 1], 4);
+      for (j = 0; j < 8; j++) {
+        p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[i][j], si));
+      }
+      i++;
+    }
+
+    s8[0]  = vzipq_u8(p[0], p[1]);
+    s8[1]  = vzipq_u8(p[2], p[3]);
+    s8[2]  = vzipq_u8(p[4], p[5]);
+    s8[3]  = vzipq_u8(p[6], p[7]);
+
+    s16[0] = vzipq_u16(vreinterpretq_u16_u8(s8[0].val[0]),
+                       vreinterpretq_u16_u8(s8[1].val[0]));
+    s16[1] = vzipq_u16(vreinterpretq_u16_u8(s8[2].val[0]),
+                       vreinterpretq_u16_u8(s8[3].val[0]));
+    s16[2] = vzipq_u16(vreinterpretq_u16_u8(s8[0].val[1]),
+                       vreinterpretq_u16_u8(s8[1].val[1]));
+    s16[3] = vzipq_u16(vreinterpretq_u16_u8(s8[2].val[1]),
+                       vreinterpretq_u16_u8(s8[3].val[1]));
+
+    s32[0] = vzipq_u32(vreinterpretq_u32_u16(s16[0].val[0]),
+                       vreinterpretq_u32_u16(s16[1].val[0]));
+    s32[1] = vzipq_u32(vreinterpretq_u32_u16(s16[0].val[1]),
+                       vreinterpretq_u32_u16(s16[1].val[1]));
+    s32[2] = vzipq_u32(vreinterpretq_u32_u16(s16[2].val[0]),
+                       vreinterpretq_u32_u16(s16[3].val[0]));
+    s32[3] = vzipq_u32(vreinterpretq_u32_u16(s16[2].val[1]),
+                       vreinterpretq_u32_u16(s16[3].val[1]));
+
+    for (k = 0; k < 8; k ++) {
+        st[k] = vreinterpretq_u64_u32(s32[k >> 1].val[k & 1]);
+    }
+
+    if (xor) {
+      for (i = 0; i < 8; i++) {
+        uint64x2_t t1 = vld1q_u64(dst);
+        vst1q_u64(dst, veorq_u64(st[i], t1));
+        dst += 2;
+      }
+    } else {
+      for (i = 0; i < 8; i++) {
+        vst1q_u64(dst, st[i]);
+        dst += 2;
+      }
+    }
+
+  }
+}
+
+static
+void
+gf_w64_neon_split_4_lazy_multiply_region(gf_t *gf, void *src, void *dest,
+                                         uint64_t val, int bytes, int xor,
+                                         int altmap)
+{
+  gf_internal_t *h;
+  int i, j, k;
+  uint64_t pp, v, *s64, *d64, *top;
+  struct gf_split_4_64_lazy_data *ld;
+  gf_region_data rd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 128);
+  gf_do_initial_region_alignment(&rd);
+
+  s64 = (uint64_t *) rd.s_start;
+  d64 = (uint64_t *) rd.d_start;
+  top = (uint64_t *) rd.d_top;
+
+  h = (gf_internal_t *) gf->scratch;
+  pp = h->prim_poly;
+  ld = (struct gf_split_4_64_lazy_data *) h->private;
+
+  v = val;
+  for (i = 0; i < 16; i++) {
+    ld->tables[i][0] = 0;
+    for (j = 1; j < 16; j <<= 1) {
+      for (k = 0; k < j; k++) {
+        ld->tables[i][k^j] = (v ^ ld->tables[i][k]);
+      }
+      v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
+    }
+  }
+
+  if (altmap) {
+    if (xor)
+      neon_w64_split_4_lazy_altmap_multiply_region(gf, s64, d64, top, val, 1);
+    else
+      neon_w64_split_4_lazy_altmap_multiply_region(gf, s64, d64, top, val, 0);
+  } else {
+    if (xor)
+      neon_w64_split_4_lazy_multiply_region(gf, s64, d64, top, val, 1);
+    else
+      neon_w64_split_4_lazy_multiply_region(gf, s64, d64, top, val, 0);
+  }
+
+  gf_do_final_region_alignment(&rd);
+}
+
+static
+void
+gf_w64_split_4_64_lazy_multiply_region_neon(gf_t *gf, void *src, void *dest,
+                                            uint64_t val, int bytes, int xor)
+{
+  gf_w64_neon_split_4_lazy_multiply_region(gf, src, dest, val, bytes, xor, 0);
+}
+
+static
+void
+gf_w64_split_4_64_lazy_altmap_multiply_region_neon(gf_t *gf, void *src,
+                                                   void *dest, uint64_t val,
+                                                   int bytes, int xor)
+{
+  gf_w64_neon_split_4_lazy_multiply_region(gf, src, dest, val, bytes, xor, 1);
+}
+
+void gf_w64_neon_split_init(gf_t *gf)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+
+  if (h->region_type & GF_REGION_ALTMAP)
+      gf->multiply_region.w64 = gf_w64_split_4_64_lazy_altmap_multiply_region_neon;
+  else
+      gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region_neon;
+
+}
-- 
cgit v1.2.1