author    Jim Plank <plank@cs.utk.edu>  2013-10-09 10:36:37 -0400
committer Jim Plank <plank@cs.utk.edu>  2013-10-09 10:36:37 -0400
commit    110523d6f311d7ee81835566b5594e6feb2ce9dd (patch)
tree      8fcbd7d045b47a607ec225b9805bde8025a52667
parent    79a46d18b687f3e470a35becb21a68bab41d53f5 (diff)
download  gf-complete-110523d6f311d7ee81835566b5594e6feb2ce9dd.tar.gz
GF-Complete Release 1.0.
Please see the user's manual for details.
-rw-r--r--  GNUmakefile | 29
-rw-r--r--  Log-Zero-for-w=8.odg | bin 9769 -> 0 bytes
-rw-r--r--  Manual.pdf | bin 0 -> 489205 bytes
-rw-r--r--  README | 1
-rw-r--r--  README.txt | 16
-rw-r--r--  explanation.html | 777
-rw-r--r--  flag_tester/README.txt | 10
-rw-r--r--  flag_tester/flag_test.c | 120
-rw-r--r--  flag_tester/intel_cpu_capabilities.h (renamed from intel_cpu_capabilities.h) | 3
-rw-r--r--  flag_tester/pclmul_test.c | 40
-rw-r--r--  flag_tester/pclmul_test.txt | 8
-rw-r--r--  flag_tester/sse2_test.txt | 30
-rw-r--r--  flag_tester/sse4_test.txt | 35
-rw-r--r--  flag_tester/sse_test.c | 142
-rw-r--r--  flag_tester/ssse3_test.txt | 31
-rw-r--r--  flag_tester/whats_my_sse.c (renamed from whats_my_sse.c) | 0
-rwxr-xr-x  flag_tester/which_compile_flags.sh | 19
-rw-r--r--  gf.c | 660
-rw-r--r--  gf_54.c | 29
-rw-r--r--  gf_add.c | 2
-rw-r--r--  gf_complete.h | 84
-rw-r--r--  gf_example_5.c | 73
-rw-r--r--  gf_example_6.c | 79
-rw-r--r--  gf_example_7.c | 70
-rw-r--r--  gf_general.c | 115
-rw-r--r--  gf_general.h | 4
-rw-r--r--  gf_inline_time.c | 1
-rw-r--r--  gf_int.h | 100
-rw-r--r--  gf_method.c | 307
-rw-r--r--  gf_method.h | 7
-rw-r--r--  gf_methods.c | 129
-rw-r--r--  gf_mult.c | 112
-rw-r--r--  gf_poly.c | 716
-rw-r--r--  gf_time.c | 25
-rw-r--r--  gf_unit.c | 243
-rw-r--r--  gf_w128.c | 1314
-rw-r--r--  gf_w16.c | 1060
-rw-r--r--  gf_w32.c | 989
-rw-r--r--  gf_w4.c | 376
-rw-r--r--  gf_w64.c | 928
-rw-r--r--  gf_w8.c | 1816
-rw-r--r--  gf_wgen.c | 147
-rw-r--r--  release-files.txt | 31
-rw-r--r--  tests.txt | 0
-rw-r--r--  tmp-10-out.txt | 0
-rw-r--r--  tmp-time-test.sh | 14
-rw-r--r--  tmp.c | 1583
-rw-r--r--  tmp.sh | 15
-rw-r--r--  tmp.txt | 162
-rw-r--r--  tmp2.sh | 13
50 files changed, 7017 insertions, 5448 deletions
diff --git a/GNUmakefile b/GNUmakefile
index 0f35276..80cd3d3 100644
--- a/GNUmakefile
+++ b/GNUmakefile
@@ -1,24 +1,23 @@
#
# GNUmakefile for Galois field library
#
-#
+# The default flags do *not* have the SSE instructions enabled.
+# Please cd to flag_tester and run which_compile_flags.sh to see which SSE instructions
+# your machine and compiler support, and which flags you should include below.
+
+CFLAGS = -O3
+LDFLAGS = -O3
SRCS = gf_w4.c gf_w8.c gf_w16.c gf_w32.c gf_w64.c gf_w128.c gf_wgen.c gf.c gf_unit.c \
gf_time.c gf_mult.c gf_method.c gf_methods.c gf_div.c gf_rand.c gf_general.c \
gf_poly.c gf_example_1.c gf_add.c gf_example_2.c gf_example_3.c gf_example_4.c \
- gf_inline_time.c
+ gf_inline_time.c gf_example_5.c gf_example_6.c gf_example_7.c
HDRS = gf_complete.h gf_int.h
EXECUTABLES = gf_mult gf_div gf_add gf_unit gf_time gf_methods gf_poly \
- gf_example_1 gf_example_2 gf_example_3 gf_example_4 gf_inline_time
-
-CFLAGS = -O3 -msse4 -maes -mpclmul -DINTEL_SSE4 -DINTEL_PCLMUL
-LDFLAGS = -O3 -msse4 -maes -mpclmul
-
-# Use these if you don't have INTEL_PCLMUL
-# CFLAGS = -O3 -msse4 -DINTEL_SSE4
-# LDFLAGS = -O3 -msse4
+ gf_example_1 gf_example_2 gf_example_3 gf_example_4 gf_inline_time \
+ gf_example_5 gf_example_6 gf_example_7
RM = /bin/rm -f
@@ -45,6 +44,9 @@ gf_example_1: gf_example_1.o gf_complete.a
gf_example_2: gf_example_2.o gf_complete.a
gf_example_3: gf_example_3.o gf_complete.a
gf_example_4: gf_example_4.o gf_complete.a
+gf_example_5: gf_example_5.o gf_complete.a
+gf_example_6: gf_example_6.o gf_complete.a
+gf_example_7: gf_example_7.o gf_complete.a
gf_mult: gf_mult.o gf_complete.a
gf_div: gf_div.o gf_complete.a
gf_poly: gf_poly.o gf_complete.a
@@ -54,7 +56,8 @@ clean:
$(RM) $(OBJS) gf_div.c
spotless: clean
- $(RM) *~ $(EXECUTABLES)
+ $(RM) *~ $(EXECUTABLES) which_compile_flags
+ $(RM) gf_complete.a
gf_div.o: gf_complete.h gf_method.h
gf_methods.o: gf_complete.h gf_method.h
@@ -71,8 +74,12 @@ gf_example_1.o: gf_complete.h gf_rand.h
gf_example_2.o: gf_complete.h gf_rand.h
gf_example_3.o: gf_complete.h gf_rand.h
gf_example_4.o: gf_complete.h gf_rand.h
+gf_example_5.o: gf_complete.h gf_rand.h
+gf_example_6.o: gf_complete.h gf_rand.h
+gf_example_7.o: gf_complete.h gf_rand.h
gf_general.o: gf_complete.h gf_int.h gf_general.h gf_rand.h
gf_mult.o: gf_complete.h gf_method.h
+gf.o: gf_complete.h gf_int.h
gf_method.o: gf_complete.h
gf_div.c: gf_mult.c
diff --git a/Log-Zero-for-w=8.odg b/Log-Zero-for-w=8.odg
deleted file mode 100644
index 138a673..0000000
--- a/Log-Zero-for-w=8.odg
+++ /dev/null
Binary files differ
diff --git a/Manual.pdf b/Manual.pdf
new file mode 100644
index 0000000..fdc9756
--- /dev/null
+++ b/Manual.pdf
Binary files differ
diff --git a/README b/README
deleted file mode 100644
index 4169e1c..0000000
--- a/README
+++ /dev/null
@@ -1 +0,0 @@
-This is a README file.
diff --git a/README.txt b/README.txt
index 91fecc5..0726922 100644
--- a/README.txt
+++ b/README.txt
@@ -1,5 +1,13 @@
-This is GF-Complete, Revision 0.1.
+This is GF-Complete, Revision 1.0.
+
+The user's manual is in the file Manual.pdf.
+
+There are two online homes for GF-Complete:
+
+ - https://bitbucket.org/jimplank/gf-complete
+ - http://www.cs.utk.edu/~plank/plank/papers/CS-13-716.html
+
+When compiling this for the first time, cd to flag_tester, and
+do "sh which_compile_flags.sh xxx", where xxx is the compiler
+that you will use in the GNUmakefile.
-Please see http://www.cs.utk.edu/~plank/plank/papers/CS-13-703.html for the user's
-manual and other important documentation about this library, including more
-recent revisions.
diff --git a/explanation.html b/explanation.html
deleted file mode 100644
index 72f03d0..0000000
--- a/explanation.html
+++ /dev/null
@@ -1,777 +0,0 @@
-<h3>Code structure as of 7/20/2012</h3>
-
-written by Jim.
-<p>
-Ok -- once again, I have messed with the structure. My goals are flexibility and efficiency.
-It's similar to the stuff before, but better because it makes things like Euclid's
-method much cleaner.
-<p>
-I think we're ready to hack.
-<p>
-<p>
-<hr>
-<h3>Files</h3>
-<UL>
-<LI> <a href=GNUmakefile><b>GNUmakefile</b></a>: Makefile
-<LI> <a href=README><b>README</b></a>: Empty readme
-<LI> <a href=explanation.html><b>explanation.html</b></a>: This file.
-<LI> <a href=gf.c><b>gf.c</b></a>: Main gf routines
-<LI> <a href=gf.h><b>gf.h</b></a>: Main gf prototypes and typedefs
-<LI> <a href=gf_int.h><b>gf_int.h</b></a>: Prototypes and typedefs for common routines for the
- internal gf implementations.
-<LI> <a href=gf_method.c><b>gf_method.c</b></a>: Code to help parse argc/argv to define the method.
- This way, various programs can be consistent with how they handle the command line.
-<LI> <a href=gf_method.h><b>gf_method.h</b></a>: Prototypes for ibid.
-<LI> <a href=gf_methods.c><b>gf_methods.c</b></a>: This program prints out how to define
- the various methods on the command line. My idea is to beef this up so that you can
- give it a method spec on the command line, and it will tell you whether it's valid, or
- why it's invalid. I haven't written that part yet.
-<LI> <a href=gf_mult.c><b>gf_mult.c</b></a>: Program to do single multiplication.
-<LI> <a href=gf_div.c><b>gf_div.c</b></a>: Program to do single divisions -- it's created
- in the makefile with a sed script on gf_mult.c.
-<LI> <a href=gf_time.c><b>gf_time.c</b></a>: Time tester
-<LI> <a href=gf_unit.c><b>gf_unit.c</b></a>: Unit tester
-<LI> <a href=gf_54.c><b>gf_54.c</b></a>: A simple example program that multiplies
- 5 and 4 in GF(2^4).
-<LI> <a href=gf_w4.c><b>gf_w4.c</b></a>: Implementation of code for <i>w</i> = 4.
-(For now, only SHIFT and LOG, plus EUCLID & MATRIX).
-<LI> <a href=gf_w8.c><b>gf_w8.c</b></a>: Implementation of code for <i>w</i> = 8.
-(For now, only SHIFT plus EUCLID & MATRIX).
-<LI> <a href=gf_w16.c><b>gf_w16.c</b></a>: Implementation of code for <i>w</i> = 16.
-(For now, only SHIFT plus EUCLID & MATRIX).
-<LI> <a href=gf_w32.c><b>gf_w32.c</b></a>: Implementation of code for <i>w</i> = 32.
-(For now, only SHIFT plus EUCLID & MATRIX).
-<LI> <a href=gf_w64.c><b>gf_w64.c</b></a>: Implementation of code for <i>w</i> = 64.
-(For now, only SHIFT plus EUCLID.)
-<LI> I don't have gf_w128.c or gf_gen.c yet.
-</UL>
-
-<hr>
-<h3>Prototypes and typedefs in gf.h</h3>
-
-The main structure that users will see is in <b>gf.h</b>, and it is of type
-<b>gf_t</b>:
-
-<p><center><table border=3 cellpadding=3><td><pre>
-typedef struct gf {
- gf_func_a_b multiply;
- gf_func_a_b divide;
- gf_func_a inverse;
- gf_region multiply_region;
- void *scratch;
-} gf_t;
-</pre></td></table></center><p>
-
-We can beef it up later with buf-buf or buf-acc. The problem is that the paper is
-already bloated, so right now, I want to keep it lean.
-<p>
-The types of the procedures are big unions, so that they work with the following
-types of arguments:
-
-<p><center><table border=3 cellpadding=3><td><pre>
-typedef uint8_t gf_val_4_t;
-typedef uint8_t gf_val_8_t;
-typedef uint16_t gf_val_16_t;
-typedef uint32_t gf_val_32_t;
-typedef uint64_t gf_val_64_t;
-typedef uint64_t *gf_val_128_t;
-typedef uint32_t gf_val_gen_t; /* The intent here is for general values <= 32 */
-</pre></td></table></center><p>
-
-To use one of these, you need to create one with <b>gf_init_easy()</b> or
-<b>gf_init_hard()</b>. Let's concentrate on the former:
-
-<p><center><table border=3 cellpadding=3><td><pre>
-extern int gf_init_easy(gf_t *gf, int w, int mult_type);
-</pre></td></table></center><p>
-
-You pass it memory for a <b>gf_t</b>, a value of <b>w</b> and
-a variable that says how to do multiplication. The valid values of <b>mult_type</b>
-are enumerated in <b>gf.h</b>:
-
-<p><center><table border=3 cellpadding=3><td><pre>
-typedef enum {GF_MULT_DEFAULT,
- GF_MULT_SHIFT,
- GF_MULT_GROUP,
- GF_MULT_BYTWO_p,
- GF_MULT_BYTWO_b,
- GF_MULT_TABLE,
- GF_MULT_LOG_TABLE,
- GF_MULT_SPLIT_TABLE,
- GF_MULT_COMPOSITE } gf_mult_type_t;
-</pre></td></table></center><p>
-
-After creating the <b>gf_t</b>, you use its <b>multiply</b> method
-to multiply, using the union's fields to work with the various types.
-It looks easier than my explanation. For example, suppose you wanted to multiply 5 and 4 in <i>GF(2<sup>4</sup>)</i>.
-You can do it as in
-<b><a href=gf_54.c>gf_54.c</a></b>
-
-<p><center><table border=3 cellpadding=3><td><pre>
-#include "gf.h"
-
-main()
-{
- gf_t gf;
-
- gf_init_easy(&gf, 4, GF_MULT_DEFAULT);
- printf("%d\n", gf.multiply.w4(&gf, 5, 4));
- exit(0);
-}
-</pre></td></table></center><p>
-
-
-If you wanted to multiply in <i>GF(2<sup>8</sup>)</i>, then you'd have to use 8 as a parameter
-to <b>gf_init_easy</b>, and call the multiplier as <b>gf.multiply.w8()</b>.
-<p>
-When you're done with your <b>gf_t</b>, you should call <b>gf_free()</b> on it so
-that it can free memory that it has allocated. We'll talk more about memory later, but if you
-create your <b>gf_t</b> with <b>gf_init_easy</b>, then it calls <b>malloc()</b>, and
-if you care about freeing memory, you'll have to call <b>gf_free()</b>.
-<p>
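-For instance, here is a minimal sketch of that pattern for <i>w=8</i> (the values
-are the ones from the <b>gf_mult</b> examples below; the second argument to
-<b>gf_free()</b> is the <b>recursive</b> flag described above, which only matters
-for composite fields):
-
-<p><center><table border=3 cellpadding=3><td><pre>
-#include "gf.h"
-
-main()
-{
-  gf_t gf;
-
-  gf_init_easy(&gf, 8, GF_MULT_DEFAULT);
-  printf("%d\n", gf.multiply.w8(&gf, 200, 211));  /* prints 201 */
-  gf_free(&gf, 0);                                /* 0: no base fields to free */
-  exit(0);
-}
-</pre></td></table></center><p>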
-
-<hr>
-<h3>Memory allocation</h3>
-
-Each implementation of a multiplication technique keeps around its
-own data. For example, <b>GF_MULT_TABLE</b> keeps around
-multiplication and division tables, and <b>GF_MULT_LOG</b> maintains log and
-antilog tables. This data is stored in the pointer <b>scratch</b>. My intent
-is that the memory that is there is all that's required. In other
-words, the <b>multiply()</b>, <b>divide()</b>, <b>inverse()</b> and
-<b>multiply_region()</b> calls don't do any memory allocation.
-Moreover, <b>gf_init_easy()</b> only allocates one chunk of memory --
-the one in <b>scratch</b>.
-<p>
-If you don't want to have the initialization call allocate memory, you can use <b>gf_init_hard()</b>:
-
-<p><center><table border=3 cellpadding=3><td><pre>
-extern int gf_init_hard(gf_t *gf,
- int w,
- int mult_type,
- int region_type,
- int divide_type,
- uint64_t prim_poly,
- int arg1,
- int arg2,
- gf_t *base_gf,
- void *scratch_memory);
-</pre></td></table></center><p>
-
-The first three parameters are the same as <b>gf_init_easy()</b>.
-You can add additional arguments for performing <b>multiply_region</b>, and
-for performing division in the <b>region_type</b> and <b>divide_type</b>
-arguments. Their values are also defined in <b>gf.h</b>. You can
-mix the <b>region_type</b> values (e.g. "DOUBLE" and "SSE"):
-
-<p><center><table border=3 cellpadding=3><td><pre>
-#define GF_REGION_DEFAULT (0x0)
-#define GF_REGION_SINGLE_TABLE (0x1)
-#define GF_REGION_DOUBLE_TABLE (0x2)
-#define GF_REGION_QUAD_TABLE (0x4)
-#define GF_REGION_LAZY (0x8)
-#define GF_REGION_SSE (0x10)
-#define GF_REGION_NOSSE (0x20)
-#define GF_REGION_STDMAP (0x40)
-#define GF_REGION_ALTMAP (0x80)
-#define GF_REGION_CAUCHY (0x100)
-
-typedef uint32_t gf_region_type_t;
-
-typedef enum { GF_DIVIDE_DEFAULT,
- GF_DIVIDE_MATRIX,
- GF_DIVIDE_EUCLID } gf_division_type_t;
-</pre></td></table></center><p>
-You can change
-the primitive polynomial with <b>prim_poly</b>, give additional arguments with
-<b>arg1</b> and <b>arg2</b> and give a base Galois Field for composite fields.
-Finally, you can pass it a pointer to memory in <b>scratch_memory</b>. That
-way, you can avoid having <b>gf_init_hard()</b> call <b>malloc()</b>.
-<p>
-There is a procedure called <b>gf_scratch_size()</b> that lets you know the minimum
-size for <b>scratch_memory</b>, depending on <i>w</i>, the multiplication type
-and the arguments:
-
-<p><center><table border=3 cellpadding=3><td><pre>
-extern int gf_scratch_size(int w,
- int mult_type,
- int region_type,
- int divide_type,
- int arg1,
- int arg2);
-</pre></td></table></center><p>
-
-You can specify default arguments in <b>gf_init_hard()</b>:
-<UL>
-<LI> <b>region_type</b> = <b>GF_REGION_DEFAULT</b>
-<LI> <b>divide_type</b> = <b>GF_DIVIDE_DEFAULT</b>
-<LI> <b>prim_poly</b> = 0
-<LI> <b>arg1</b> = 0
-<LI> <b>arg2</b> = 0
-<LI> <b>base_gf</b> = <b>NULL</b>
-<LI> <b>scratch_memory</b> = <b>NULL</b>
-</UL>
-If any argument is equal to its default, then default actions are taken (e.g. a
-standard primitive polynomial is used, or memory is allocated for <b>scratch_memory</b>).
-In fact, <b>gf_init_easy()</b> simply calls <b>gf_init_hard()</b> with the default
-parameters.
-<p>
-<b>gf_free()</b> frees memory that was allocated with <b>gf_init_easy()</b>
-or <b>gf_init_hard()</b>. The <b>recursive</b> parameter is in case you
-use composite fields, and want to recursively free the base fields.
-If you pass <b>scratch_memory</b> to <b>gf_init_hard()</b>, then you typically
-don't need to call <b>gf_free()</b>. It won't hurt to call it, though.
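-To make that concrete, here is a minimal sketch that supplies its own scratch
-for the <i>w=4</i> LOG method, using the defaults listed above:
-
-<p><center><table border=3 cellpadding=3><td><pre>
-gf_t gf;
-void *scratch;
-int sz;
-
-sz = gf_scratch_size(4, GF_MULT_LOG_TABLE, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT, 0, 0);
-scratch = malloc(sz);                    /* you own this memory, not the library */
-gf_init_hard(&gf, 4, GF_MULT_LOG_TABLE, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT,
-             0, 0, 0, NULL, scratch);    /* default poly, no args, no base field */
-printf("%d\n", gf.multiply.w4(&gf, 5, 4));
-free(scratch);                           /* no gf_free() needed in this case */
-</pre></td></table></center><p>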
-
-<hr>
-<h3>gf_mult and gf_div</h3>
-
-For the moment, I have few things completely implemented, but that's because I want
-to be able to explain the structure, and how to specify methods. In particular, for
-<i>w=4</i>, I have implemented <b>SHIFT</b> and <b>LOG</b>. For <i>w=8, 16, 32, 64</i>
-I have implemented <b>SHIFT</b>. For all <i>w &le; 32</i>, I have implemented both
-Euclid's algorithm for inversion, and the matrix method for inversion. For
-<i>w=64</i>, it's just Euclid. You can
-test these all with <b>gf_mult</b> and <b>gf_div</b>. Here are a few calls:
-
-<pre>
-UNIX> <font color=darkred><b>gf_mult 7 11 4</b></font> - Default
-4
-UNIX> <font color=darkred><b>gf_mult 7 11 4 SHIFT - -</b></font> - Use shift
-4
-UNIX> <font color=darkred><b>gf_mult 7 11 4 LOG - -</b></font> - Use logs
-4
-UNIX> <font color=darkred><b>gf_div 4 7 4</b></font> - Default
-11
-UNIX> <font color=darkred><b>gf_div 4 7 4 LOG - -</b></font> - Use logs
-11
-UNIX> <font color=darkred><b>gf_div 4 7 4 LOG - EUCLID</b></font> - Use Euclid instead of logs
-11
-UNIX> <font color=darkred><b>gf_div 4 7 4 LOG - MATRIX</b></font> - Use Matrix inversion instead of logs
-11
-UNIX> <font color=darkred><b>gf_div 4 7 4 SHIFT - -</b></font> - Default
-11
-UNIX> <font color=darkred><b>gf_div 4 7 4 SHIFT - EUCLID</b></font> - Use Euclid (which is the default)
-11
-UNIX> <font color=darkred><b>gf_div 4 7 4 SHIFT - MATRIX</b></font> - Use Matrix inversion instead of logs
-11
-UNIX> <font color=darkred><b>gf_mult 200 211 8</b></font> - The remainder are shift/Euclid
-201
-UNIX> <font color=darkred><b>gf_div 201 211 8</b></font>
-200
-UNIX> <font color=darkred><b>gf_mult 60000 65111 16</b></font>
-63515
-UNIX> <font color=darkred><b>gf_div 63515 65111 16</b></font>
-60000
-UNIX> <font color=darkred><b>gf_mult abcd0001 9afbf788 32h</b></font>
-b0359681
-UNIX> <font color=darkred><b>gf_div b0359681 9afbf788 32h</b></font>
-abcd0001
-UNIX> <font color=darkred><b>gf_mult abcd00018c8b8c8a 9afbf7887f6d8e5b 64h</b></font>
-3a7def35185bd571
-UNIX> <font color=darkred><b>gf_div 3a7def35185bd571 9afbf7887f6d8e5b 64h</b></font>
-abcd00018c8b8c8a
-UNIX> <font color=darkred><b></b></font>
-</pre>
-
-You can see all the methods with <b>gf_methods</b>. We have a lot of implementing to do:
-
-<pre>
-UNIX> <font color=darkred><b>gf_methods</b></font>
-To specify the methods, do one of the following:
- - leave empty to use defaults
- - use a single dash to use defaults
- - specify MULTIPLY REGION DIVIDE
-
-Legal values of MULTIPLY:
- SHIFT: shift
- GROUP g_mult g_reduce: the Group technique - see the paper
- BYTWO_p: BYTWO doubling the product.
- BYTWO_b: BYTWO doubling b (more efficient than BYTWO_p)
- TABLE: Full multiplication table
- LOG: Discrete logs
- LOG_ZERO: Discrete logs with a large table for zeros
- SPLIT g_a g_b: Split tables defined by g_a and g_b
- COMPOSITE k l [METHOD]: Composite field, recursively specify the
- method of the base field in GF(2^l)
-
-Legal values of REGION: Specify multiples with commas e.g. 'DOUBLE,LAZY'
- -: Use defaults
- SINGLE/DOUBLE/QUAD: Expand tables
- LAZY: Lazily create table (only applies to TABLE and SPLIT)
- SSE/NOSSE: Use 128-bit SSE instructions if you can
- CAUCHY/ALTMAP/STDMAP: Use different memory mappings
-
-Legal values of DIVIDE:
- -: Use defaults
- MATRIX: Use matrix inversion
- EUCLID: Use the extended Euclidean algorithm.
-
-See the user's manual for more information.
-There are many restrictions, so it is better to simply use defaults in most cases.
-UNIX> <font color=darkred><b></b></font>
-</pre>
-
-<hr>
-<h3>gf_unit and gf_time</h3>
-
-<b><a href=gf_unit.c>gf_unit.c</a></b> is a unit tester, and
-<b><a href=gf_time.c>gf_time.c</a></b> is a time tester.
-
-They are called as follows:
-
-<p><center><table border=3 cellpadding=3><td><pre>
-UNIX> <font color=darkred><b>gf_unit w tests seed [METHOD] </b></font>
-UNIX> <font color=darkred><b>gf_time w tests seed size(bytes) iterations [METHOD] </b></font>
-</pre></td></table></center><p>
-
-The <b>tests</b> parameter is one or more of the following characters:
-
-<UL>
-<LI> A: Do all tests
-<LI> S: Test only single operations (multiplication/division)
-<LI> R: Test only region operations
-<LI> V: Verbose Output
-</UL>
-
-<b>seed</b> is a seed for <b>srand48()</b> -- using -1 defaults to the current time.
-<p>
-For example, testing the defaults with w=4:
-
-<pre>
-UNIX> <font color=darkred><b>gf_unit 4 AV 1 LOG - -</b></font>
-Seed: 1
-Testing single multiplications/divisions.
-Testing Inversions.
-Testing buffer-constant, src != dest, xor = 0
-Testing buffer-constant, src != dest, xor = 1
-Testing buffer-constant, src == dest, xor = 0
-Testing buffer-constant, src == dest, xor = 1
-UNIX> <font color=darkred><b>gf_unit 4 AV 1 SHIFT - -</b></font>
-Seed: 1
-Testing single multiplications/divisions.
-Testing Inversions.
-No multiply_region.
-UNIX> <font color=darkred><b></b></font>
-</pre>
-
-There is no <b>multiply_region()</b> method defined for <b>SHIFT</b>.
-Thus, the procedures are <b>NULL</b> and the unit tester ignores them.
-<p>
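-Your own code can mirror what the unit tester does and treat the region method
-as optional. A sketch:
-
-<p><center><table border=3 cellpadding=3><td><pre>
-if (gf.multiply_region.w4 != NULL) {
-  /* bulk path: multiply a whole buffer by a constant */
-} else {
-  /* fall back to calling gf.multiply.w4() on each value */
-}
-</pre></td></table></center><p>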
-At the moment, I only have the unit tester working for w=4.
-<p>
-<b>gf_time</b> takes the size of an array (in bytes) and a number of iterations, and
-tests the speed of both single and region operations. The tests are:
-
-<UL>
-<LI> A: All
-<LI> S: All Single Operations
-<LI> R: All Region Operations
-<LI> M: Single: Multiplications
-<LI> D: Single: Divisions
-<LI> I: Single: Inverses
-<LI> B: Region: Multiply_Region
-</UL>
-
-Here are some examples with <b>SHIFT</b> and <b>LOG</b> on my mac.
-
-<pre>
-UNIX> <font color=darkred><b>gf_time 4 A 1 102400 1024 LOG - -</b></font>
-Seed: 1
-Multiply: 0.538126 s 185.830 Mega-ops/s
-Divide: 0.520825 s 192.003 Mega-ops/s
-Inverse: 0.631198 s 158.429 Mega-ops/s
-Buffer-Const,s!=d,xor=0: 0.478395 s 209.032 MB/s
-Buffer-Const,s!=d,xor=1: 0.524245 s 190.751 MB/s
-Buffer-Const,s==d,xor=0: 0.471851 s 211.931 MB/s
-Buffer-Const,s==d,xor=1: 0.528275 s 189.295 MB/s
-UNIX> <font color=darkred><b>gf_time 4 A 1 102400 1024 LOG - EUCLID</b></font>
-Seed: 1
-Multiply: 0.555512 s 180.014 Mega-ops/s
-Divide: 5.359434 s 18.659 Mega-ops/s
-Inverse: 4.911719 s 20.359 Mega-ops/s
-Buffer-Const,s!=d,xor=0: 0.496097 s 201.573 MB/s
-Buffer-Const,s!=d,xor=1: 0.538536 s 185.689 MB/s
-Buffer-Const,s==d,xor=0: 0.485564 s 205.946 MB/s
-Buffer-Const,s==d,xor=1: 0.540227 s 185.107 MB/s
-UNIX> <font color=darkred><b>gf_time 4 A 1 102400 1024 LOG - MATRIX</b></font>
-Seed: 1
-Multiply: 0.544005 s 183.822 Mega-ops/s
-Divide: 7.602822 s 13.153 Mega-ops/s
-Inverse: 7.000564 s 14.285 Mega-ops/s
-Buffer-Const,s!=d,xor=0: 0.474868 s 210.585 MB/s
-Buffer-Const,s!=d,xor=1: 0.527588 s 189.542 MB/s
-Buffer-Const,s==d,xor=0: 0.473130 s 211.358 MB/s
-Buffer-Const,s==d,xor=1: 0.529877 s 188.723 MB/s
-UNIX> <font color=darkred><b>gf_time 4 A 1 102400 1024 SHIFT - -</b></font>
-Seed: 1
-Multiply: 2.708842 s 36.916 Mega-ops/s
-Divide: 8.756882 s 11.420 Mega-ops/s
-Inverse: 5.695511 s 17.558 Mega-ops/s
-UNIX> <font color=darkred><b></b></font>
-</pre>
-
-At the moment, I only have the timer working for w=4.
-
-<hr>
-<h3>Walking you through <b>LOG</b></h3>
-
-To see how <b>scratch</b> is used to store data, let's look at what happens when
-you call <b>gf_init_easy(&gf, 4, GF_MULT_LOG);</b>
-First, <b>gf_init_easy()</b> calls <b>gf_init_hard()</b> with default parameters.
-This is in <b><a href=gf.c>gf.c</a></b>.
-<p>
-<b>gf_init_hard()</b>'s first job is to set up the scratch.
-The scratch's type is <b>gf_internal_t</b>, defined in
-<b><a href=gf_int.h>gf_int.h</a></b>:
-
-<p><center><table border=3 cellpadding=3><td><pre>
-typedef struct {
- int mult_type;
- int region_type;
- int divide_type;
- int w;
- uint64_t prim_poly;
- int free_me;
- int arg1;
- int arg2;
- gf_t *base_gf;
- void *private;
-} gf_internal_t;
-</pre></td></table></center><p>
-
-All the fields are straightforward, with the exception of <b>private</b>. That is
-a <b>(void *)</b> which points to the implementation's private data.
-<p>
-Here's the code for
-<b>gf_init_hard()</b>:
-
-<p><center><table border=3 cellpadding=3><td><pre>
-int gf_init_hard(gf_t *gf, int w, int mult_type,
- int region_type,
- int divide_type,
- uint64_t prim_poly,
- int arg1, int arg2,
- gf_t *base_gf,
- void *scratch_memory)
-{
- int sz;
- gf_internal_t *h;
-
-
- if (scratch_memory == NULL) {
- sz = gf_scratch_size(w, mult_type, region_type, divide_type, arg1, arg2);
- if (sz &lt;= 0) return 0;
- h = (gf_internal_t *) malloc(sz);
- h-&gt;free_me = 1;
- } else {
- h = scratch_memory;
- h-&gt;free_me = 0;
- }
- gf-&gt;scratch = (void *) h;
- h-&gt;mult_type = mult_type;
- h-&gt;region_type = region_type;
- h-&gt;divide_type = divide_type;
- h-&gt;w = w;
- h-&gt;prim_poly = prim_poly;
- h-&gt;arg1 = arg1;
- h-&gt;arg2 = arg2;
- h-&gt;base_gf = base_gf;
- h-&gt;private = (void *) gf-&gt;scratch;
- h-&gt;private += (sizeof(gf_internal_t));
-
- switch(w) {
- case 4: return gf_w4_init(gf);
- case 8: return gf_w8_init(gf);
- case 16: return gf_w16_init(gf);
- case 32: return gf_w32_init(gf);
- case 64: return gf_w64_init(gf);
- case 128: return gf_dummy_init(gf);
- default: return 0;
- }
-}
-</pre></td></table></center><p>
-
-The first thing it does is determine if it has to allocate space for <b>scratch</b>.
-If it must, it uses <b>gf_scratch_size()</b> to figure out how big the space must be.
-It then sets <b>gf->scratch</b> to this space, and sets all of the fields of the
-scratch to the arguments in <b>gf_init_hard()</b>. The <b>private</b> pointer is
-set to the space just past the <b>gf_internal_t</b> at the front of <b>gf->scratch</b>. Again, it is up to
-<b>gf_scratch_size()</b> to make sure there is enough space for the scratch, and
-for all of the private data needed by the implementation.
-<p>
-Once the scratch is set up, <b>gf_init_hard()</b> calls <b>gf_w4_init()</b>. This is
-in <b><a href=gf_w4.c>gf_w4.c</a></b>, and it is a
-simple dispatcher to the various initialization routines, plus it
-sets <b>EUCLID</b> and <b>MATRIX</b> if need be:
-
-<p><center><table border=3 cellpadding=3><td><pre>
-int gf_w4_init(gf_t *gf)
-{
- gf_internal_t *h;
-
- h = (gf_internal_t *) gf-&gt;scratch;
- if (h-&gt;prim_poly == 0) h-&gt;prim_poly = 0x13;
-
- gf-&gt;multiply.w4 = NULL;
- gf-&gt;divide.w4 = NULL;
- gf-&gt;inverse.w4 = NULL;
- gf-&gt;multiply_region.w4 = NULL;
-
- switch(h-&gt;mult_type) {
- case GF_MULT_SHIFT: if (gf_w4_shift_init(gf) == 0) return 0; break;
- case GF_MULT_LOG_TABLE: if (gf_w4_log_init(gf) == 0) return 0; break;
- case GF_MULT_DEFAULT: if (gf_w4_log_init(gf) == 0) return 0; break;
- default: return 0;
- }
- if (h-&gt;divide_type == GF_DIVIDE_EUCLID) {
- gf-&gt;divide.w4 = gf_w4_divide_from_inverse;
- gf-&gt;inverse.w4 = gf_w4_euclid;
- } else if (h-&gt;divide_type == GF_DIVIDE_MATRIX) {
- gf-&gt;divide.w4 = gf_w4_divide_from_inverse;
- gf-&gt;inverse.w4 = gf_w4_matrix;
- }
-
- if (gf-&gt;inverse.w4 != NULL && gf-&gt;divide.w4 == NULL) {
- gf-&gt;divide.w4 = gf_w4_divide_from_inverse;
- }
- if (gf-&gt;inverse.w4 == NULL && gf-&gt;divide.w4 != NULL) {
- gf-&gt;inverse.w4 = gf_w4_inverse_from_divide;
- }
- return 1;
-}
-</pre></td></table></center><p>
-
-The code in <b>gf_w4_log_init()</b> sets up the log and antilog tables, and sets
-the <b>multiply.w4</b>, <b>divide.w4</b> etc routines to be the ones for logs. The
-tables are put into <b>gf->scratch->private</b>, which is typecast to a <b>struct
-gf_logtable_data *</b>:
-
-<p><center><table border=3 cellpadding=3><td><pre>
-struct gf_logtable_data {
- gf_val_4_t log_tbl[GF_FIELD_SIZE];
- gf_val_4_t antilog_tbl[GF_FIELD_SIZE * 2];
- gf_val_4_t *antilog_tbl_div;
-};
-.......
-
-static
-int gf_w4_log_init(gf_t *gf)
-{
- gf_internal_t *h;
- struct gf_logtable_data *ltd;
- int i, b;
-
- h = (gf_internal_t *) gf-&gt;scratch;
- ltd = h-&gt;private;
-
- ltd-&gt;log_tbl[0] = 0;
-
- ltd-&gt;antilog_tbl_div = ltd-&gt;antilog_tbl + (GF_FIELD_SIZE-1);
- b = 1;
- for (i = 0; i &lt; GF_FIELD_SIZE-1; i++) {
- ltd-&gt;log_tbl[b] = (gf_val_8_t)i;
- ltd-&gt;antilog_tbl[i] = (gf_val_8_t)b;
- ltd-&gt;antilog_tbl[i+GF_FIELD_SIZE-1] = (gf_val_8_t)b;
- b &lt;&lt;= 1;
- if (b & GF_FIELD_SIZE) {
- b = b ^ h-&gt;prim_poly;
- }
- }
-
- gf-&gt;inverse.w4 = gf_w4_inverse_from_divide;
- gf-&gt;divide.w4 = gf_w4_log_divide;
- gf-&gt;multiply.w4 = gf_w4_log_multiply;
- gf-&gt;multiply_region.w4 = gf_w4_log_multiply_region;
- return 1;
-}
-</pre></td></table></center><p>
-
-And of course the individual routines use <b>h->private</b> to access the tables:
-
-<p><center><table border=3 cellpadding=3><td><pre>
-static
-inline
-gf_val_8_t gf_w4_log_multiply (gf_t *gf, gf_val_8_t a, gf_val_8_t b)
-{
- struct gf_logtable_data *ltd;
-
- ltd = (struct gf_logtable_data *) ((gf_internal_t *) (gf-&gt;scratch))-&gt;private;
- return (a == 0 || b == 0) ? 0 : ltd-&gt;antilog_tbl[(unsigned)(ltd-&gt;log_tbl[a] + ltd-&gt;log_tbl[b])];
-}
-</pre></td></table></center><p>
-
-Finally, it's important that the proper sizes are put into
-<b>gf_w4_scratch_size()</b> for each implementation:
-
-<p><center><table border=3 cellpadding=3><td><pre>
-int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
-{
- int region_tbl_size;
- switch(mult_type)
- {
- case GF_MULT_DEFAULT:
- case GF_MULT_LOG_TABLE:
- return sizeof(gf_internal_t) + sizeof(struct gf_logtable_data) + 64;
- break;
- case GF_MULT_SHIFT:
- return sizeof(gf_internal_t);
- break;
- default:
- return -1;
- }
-}
-</pre></td></table></center><p>
-I hope that's enough explanation for y'all to start implementing. Let me know if you have
-problems -- thanks -- Jim
-
-<hr>
-The initial structure has been set for w=4, 8, 16, 32 and 64, with implementations of SHIFT and EUCLID, and for w <= 32, MATRIX. There are some weird caveats:
-
-<UL>
-<LI> For w=32 and w=64, the primitive polynomial does not have the leading one.
-<LI> I'd like for naming to be:
-<p>
-<UL>
- <b>gf_w</b><i>w</i><b>_</b><i>technique</i><b>_</b><i>functionality</i><b>()</b>.
-</UL>
-<p>
-For example, the log techniques for w=4 are:
-<pre>
-gf_w4_log_multiply()
-gf_w4_log_divide()
-gf_w4_log_multiply_region()
-gf_w4_log_init()
-</pre>
-<p>
-<LI> I'd also like a header block on implementations that says who wrote it.
-</UL>
-
-<hr>
-<h3>Things we need to Implement: <i>w=4</i></h3>
-
-<p><table border=3 cellpadding=2>
-<tr> <td> SHIFT </td> <td> Done - Jim </td> </tr>
-<tr> <td> BYTWO_p </td> <td>Done - Jim</td> </tr>
-<tr> <td> BYTWO_b </td> <td>Done - Jim</td> </tr>
-<tr> <td> BYTWO_p, SSE </td> <td>Done - Jim</td> </tr>
-<tr> <td> BYTWO_b, SSE </td> <td>Done - Jim</td> </tr>
-<tr> <td> Single TABLE </td> <td> Done - Jim </td> </tr>
-<tr> <td> Double TABLE </td> <td> Done - Jim </td> </tr>
-<tr> <td> Double TABLE, SSE </td> <td> Done - Jim </td> </tr>
-<tr> <td> Quad TABLE </td> <td>Done - Jim</td> </tr>
-<tr> <td> Lazy Quad TABLE </td> <td>Done - Jim</td> </tr>
-<tr> <td> LOG </td> <td> Done - Jim </td> </tr>
-</table><p>
-
-<hr>
-<h3>Things we need to Implement: <i>w=8</i></h3>
-
-<p><table border=3 cellpadding=2>
-<tr> <td> SHIFT </td> <td> Done - Jim </td> </tr>
-<tr> <td> BYTWO_p </td> <td>Done - Jim </td> </tr>
-<tr> <td> BYTWO_b </td> <td>Done - Jim </td> </tr>
-<tr> <td> BYTWO_p, SSE </td> <td>Done - Jim </td> </tr>
-<tr> <td> BYTWO_b, SSE </td> <td>Done - Jim </td> </tr>
-<tr> <td> Single TABLE </td> <td> Done - Kevin </td> </tr>
-<tr> <td> Double TABLE </td> <td> Done - Jim </td> </tr>
-<tr> <td> Lazy Double TABLE </td> <td> Done - Jim </td> </tr>
-<tr> <td> Split 2 1 (Half) SSE </td> <td>Done - Jim</td> </tr>
-<tr> <td> Composite, k=2 </td> <td> Done - Kevin (alt mapping not passing unit test) </td> </tr>
-<tr> <td> LOG </td> <td> Done - Kevin </td> </tr>
-<tr> <td> LOG ZERO</td> <td> Done - Jim</td> </tr>
-</table><p>
-
-<hr>
-<h3>Things we need to Implement: <i>w=16</i></h3>
-
-<p><table border=3 cellpadding=2>
-<tr> <td> SHIFT </td> <td> Done - Jim </td> </tr>
-<tr> <td> BYTWO_p </td> <td>Done - Jim</td> </tr>
-<tr> <td> BYTWO_b </td> <td>Done - Jim</td> </tr>
-<tr> <td> BYTWO_p, SSE </td> <td>Done - Jim</td> </tr>
-<tr> <td> BYTWO_b, SSE </td> <td>Done - Jim</td> </tr>
-<tr> <td> Lazy TABLE </td> <td>Done - Jim</td> </tr>
-<tr> <td> Split 4 16 No-SSE, lazy </td> <td>Done - Jim</td> </tr>
-<tr> <td> Split 4 16 SSE, lazy </td> <td>Done - Jim</td> </tr>
-<tr> <td> Split 4 16 SSE, lazy, alternate mapping </td> <td>Done - Jim</td> </tr>
-<tr> <td> Split 8 16, lazy </td> <td>Done - Jim</td> </tr>
-<tr> <td> Composite, k=2, stdmap recursive </td> <td> Done - Kevin</td> </tr>
-<tr> <td> Composite, k=2, altmap recursive </td> <td> Done - Kevin</td> </tr>
-<tr> <td> Composite, k=2, stdmap inline </td> <td> Done - Kevin</td> </tr>
-<tr> <td> LOG </td> <td> Done - Kevin </td> </tr>
-<tr> <td> LOG ZERO</td> <td> Done - Kevin </td> </tr>
-<tr> <td> Group 4 4 </td> <td>Done - Jim: I don't see a reason to implement others, although 4-8 will be faster, and 8 8 will have faster region ops. They'll never beat SPLIT.</td> </tr>
-</table><p>
-
-<hr>
-<h3>Things we need to Implement: <i>w=32</i></h3>
-
-<p><table border=3 cellpadding=2>
-<tr> <td> SHIFT </td> <td> Done - Jim </td> </tr>
-<tr> <td> BYTWO_p </td> <td>Done - Jim</td> </tr>
-<tr> <td> BYTWO_b </td> <td>Done - Jim</td> </tr>
-<tr> <td> BYTWO_p, SSE </td> <td>Done - Jim</td> </tr>
-<tr> <td> BYTWO_b, SSE </td> <td>Done - Jim</td> </tr>
-<tr> <td> Split 2 32,lazy </td> <td>Done - Jim</td> </tr>
-<tr> <td> Split 2 32, SSE, lazy </td> <td>Done - Jim</td> </tr>
-<tr> <td> Split 4 32, lazy </td> <td>Done - Jim</td> </tr>
-<tr> <td> Split 4 32, SSE,ALTMAP lazy </td> <td>Done - Jim</td> </tr>
-<tr> <td> Split 4 32, SSE, lazy </td> <td>Done - Jim</td> </tr>
-<tr> <td> Split 8 8 </td> <td>Done - Jim </td> </tr>
-<tr> <td> Group, g_s == g_r </td> <td>Done - Jim</td></tr>
-<tr> <td> Group, any g_s and g_r</td> <td>Done - Jim</td></tr>
-<tr> <td> Composite, k=2, stdmap recursive </td> <td> Done - Kevin</td> </tr>
-<tr> <td> Composite, k=2, altmap recursive </td> <td> Done - Kevin</td> </tr>
-<tr> <td> Composite, k=2, stdmap inline </td> <td> Done - Kevin</td> </tr>
-</table><p>
-<hr>
-<h3>Things we need to Implement: <i>w=64</i></h3>
-
-<p><table border=3 cellpadding=2>
-<tr> <td> SHIFT </td> <td> Done - Jim </td> </tr>
-<tr> <td> BYTWO_p </td> <td> - </td> </tr>
-<tr> <td> BYTWO_b </td> <td> - </td> </tr>
-<tr> <td> BYTWO_p, SSE </td> <td> - </td> </tr>
-<tr> <td> BYTWO_b, SSE </td> <td> - </td> </tr>
-<tr> <td> Split 16 1 SSE, maybe lazy </td> <td> - </td> </tr>
-<tr> <td> Split 8 1 lazy </td> <td> - </td> </tr>
-<tr> <td> Split 8 8 </td> <td> - </td> </tr>
-<tr> <td> Split 8 8 lazy </td> <td> - </td> </tr>
-<tr> <td> Group </td> <td> - </td> </tr>
-<tr> <td> Composite, k=2, alternate mapping </td> <td> - </td> </tr>
-</table><p>
-<hr>
-<h3>Things we need to Implement: <i>w=128</i></h3>
-
-<p><table border=3 cellpadding=2>
-<tr> <td> SHIFT </td> <td> Done - Will </td> </tr>
-<tr> <td> BYTWO_p </td> <td> - </td> </tr>
-<tr> <td> BYTWO_b </td> <td> - </td> </tr>
-<tr> <td> BYTWO_p, SSE </td> <td> - </td> </tr>
-<tr> <td> BYTWO_b, SSE </td> <td> - </td> </tr>
-<tr> <td> Split 32 1 SSE, maybe lazy </td> <td> - </td> </tr>
-<tr> <td> Split 16 1 lazy </td> <td> - </td> </tr>
-<tr> <td> Split 16 16 - Maybe that's insanity</td> <td> - </td> </tr>
-<tr> <td> Split 16 16 lazy </td> <td> - </td> </tr>
-<tr> <td> Group (SSE) </td> <td> - </td> </tr>
-<tr> <td> Composite, k=?, alternate mapping </td> <td> - </td> </tr>
-</table><p>
-<hr>
-<h3>Things we need to Implement: <i>w=general between 1 & 32</i></h3>
-
-<p><table border=3 cellpadding=2>
-<tr> <td> CAUCHY Region (SSE XOR)</td> <td> Done - Jim </td> </tr>
-<tr> <td> SHIFT </td> <td> Done - Jim </td> </tr>
-<tr> <td> TABLE </td> <td> Done - Jim </td> </tr>
-<tr> <td> LOG </td> <td> Done - Jim </td> </tr>
-<tr> <td> BYTWO_p </td> <td>Done - Jim</td> </tr>
-<tr> <td> BYTWO_b </td> <td>Done - Jim</td> </tr>
-<tr> <td> Group, g_s == g_r </td> <td>Done - Jim</td></tr>
-<tr> <td> Group, any g_s and g_r</td> <td>Done - Jim</td></tr>
-<tr> <td> Split - do we need it?</td> <td>Done - Jim</td></tr>
-<tr> <td> Composite - do we need it?</td> <td> - </td></tr>
-<tr> <td> Split - do we need it?</td> <td> - </td></tr>
-<tr> <td> Logzero?</td> <td> - </td></tr>
-</table><p>
diff --git a/flag_tester/README.txt b/flag_tester/README.txt
new file mode 100644
index 0000000..19101ff
--- /dev/null
+++ b/flag_tester/README.txt
@@ -0,0 +1,10 @@
+Run which_compile_flags.sh and it will print out the compile flags to use in
+ GNUmakefile. By default, this script uses "cc" as its compiler but you can
+ pass in the name of your compiler as an argument.
+
+EXAMPLE: "./which_compile_flags.sh clang"
+
+This script will run "clang" in the above example so be warned that if you type
+something like "rm" for that argument, you get what you asked for. Also, make
+sure that the compiler that you pass to which_compile_flags.sh is the same as
+the compiler in GNUmakefile.
diff --git a/flag_tester/flag_test.c b/flag_tester/flag_test.c
new file mode 100644
index 0000000..cecf472
--- /dev/null
+++ b/flag_tester/flag_test.c
@@ -0,0 +1,120 @@
+/*
+ * flag_test.c - copied from whats_my_sse.c to output proper compile
+ * flags for the GNUmakefile
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "intel_cpu_capabilities.h"
+
+void usage()
+{
+ fprintf(stderr, "usage: flag_test <compiler name>\n");
+ exit(EXIT_FAILURE);
+}
+
+int main(int argc, char **argv)
+{
+ //make sure to extend these buffers if more flags are added to this program
+ char cflags[1000], ldflags[1000], buf[1000];
+ FILE *file;
+ char sse_found = 0;
+
+ if(argc != 2)
+ usage();
+
+ sprintf(cflags, "CFLAGS = -O3");
+ sprintf(ldflags, "LDFLAGS = -O3");
+
+ if(cpu_has_feature(CPU_CAP_SSE42))
+ {
+ sprintf(buf, "%s sse_test.c -o sse4 -msse4 -DSSE4 2> /dev/null", argv[1]);
+ system(buf);
+ if(file = fopen("sse4", "r"))
+ {
+ fclose(file);
+
+ //run program and compare to the included output
+ system("./sse4 > temp.txt 2> /dev/null");
+ system("diff sse4_test.txt temp.txt > diff.txt 2> /dev/null");
+ file = fopen("diff.txt", "r");
+ if(fgetc(file) == EOF)
+ {
+ strcat(cflags, " -msse4 -DINTEL_SSE4");
+ strcat(ldflags, " -msse4");
+ sse_found = 1;
+ }
+ fclose(file);
+ }
+ }
+
+ if(cpu_has_feature(CPU_CAP_SSSE3) && !sse_found)
+ {
+ sprintf(buf, "%s sse_test.c -o ssse3 -mssse3 -DSSSE3 2> /dev/null", argv[1]);
+ system(buf);
+ if(file = fopen("ssse3", "r"))
+ {
+ fclose(file);
+
+ //run program and compare to the included output
+ system("./ssse3 > temp.txt 2> /dev/null");
+ system("diff ssse3_test.txt temp.txt > diff.txt 2> /dev/null");
+ file = fopen("diff.txt", "r");
+ if(fgetc(file) == EOF)
+ {
+ strcat(cflags, " -mssse3 -DINTEL_SSSE3");
+ strcat(ldflags, " -mssse3");
+ sse_found = 1;
+ }
+ fclose(file);
+ }
+ }
+
+ if(cpu_has_feature(CPU_CAP_SSE2) && !sse_found)
+ {
+ sprintf(buf, "%s sse_test.c -o sse2 -msse2 -DSSE2 2> /dev/null", argv[1]);
+ system(buf);
+ if(file = fopen("sse2", "r"))
+ {
+ fclose(file);
+
+ //run program and compare to the included output
+ system("./sse2 > temp.txt 2> /dev/null");
+ system("diff sse2_test.txt temp.txt > diff.txt 2> /dev/null");
+ file = fopen("diff.txt", "r");
+ if(fgetc(file) == EOF)
+ {
+ strcat(cflags, " -msse2 -DINTEL_SSE2");
+ strcat(ldflags, " -msse2");
+ sse_found = 1;
+ }
+ fclose(file);
+ }
+ }
+
+ if(cpu_has_feature(CPU_CAP_PCLMULQDQ) && sse_found)
+ {
+ sprintf(buf, "%s pclmul_test.c -o pclmul -maes -mpclmul 2> /dev/null"
+ , argv[1]);
+ system(buf);
+ if(file = fopen("pclmul", "r"))
+ {
+ fclose(file);
+
+ //run program and compare to the included output
+ system("./pclmul > temp.txt 2> /dev/null");
+ system("diff pclmul_test.txt temp.txt > diff.txt 2> /dev/null");
+ file = fopen("diff.txt", "r");
+ if(fgetc(file) == EOF)
+ {
+ strcat(cflags, " -maes -mpclmul -DINTEL_PCLMUL");
+ strcat(ldflags, " -maes -mpclmul");
+ }
+ fclose(file);
+ }
+ }
+
+ printf("%s\n%s\n", cflags, ldflags);
+}
diff --git a/intel_cpu_capabilities.h b/flag_tester/intel_cpu_capabilities.h
index 5fe0fea..6d1bbeb 100644
--- a/intel_cpu_capabilities.h
+++ b/flag_tester/intel_cpu_capabilities.h
@@ -16,7 +16,7 @@
#define CPU_CPSSE 0x2000
#define CPU_CAP_SSE3 (CPU_CPSSE | 0)
#define CPU_CAP_PCLMULQDQ (CPU_CPSSE | 1)
-#define CPU_CAP_SSSE3 (CPU_CPSSE | 10)
+#define CPU_CAP_SSSE3 (CPU_CPSSE | 9)
#define CPU_CAP_SSE41 (CPU_CPSSE | 19)
#define CPU_CAP_SSE42 (CPU_CPSSE | 20)
#define CPU_CAP_AVX (CPU_CPSSE | 28)
@@ -25,7 +25,6 @@
__asm__ __volatile__ ("cpuid":\
"=a" (ax), "=b" (bx), "=c" (cx), "=d" (dx) : "a" (func));
-inline
int
cpu_has_feature (unsigned which)
{
diff --git a/flag_tester/pclmul_test.c b/flag_tester/pclmul_test.c
new file mode 100644
index 0000000..bdae184
--- /dev/null
+++ b/flag_tester/pclmul_test.c
@@ -0,0 +1,40 @@
+#include <wmmintrin.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#define MM_PRINT8(s, r) { uint8_t blah[16], ii; printf("%-20s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 1) printf("%s%02x", (ii%4==0) ? " " : " ", blah[15-ii]); printf("\n"); }
+
+
+int main()
+{
+ uint64_t answer;
+ uint32_t pp;
+ __m128i a, b, c;
+
+ a = _mm_set1_epi8(0x0D);
+ b = _mm_set_epi32(0,0,0,0x0A);
+ pp = 0x13;
+ MM_PRINT8("a", a);
+ MM_PRINT8("b", b);
+
+ c = _mm_clmulepi64_si128(a, b, 0);
+ MM_PRINT8("a clm b", c);
+
+ a = _mm_set1_epi8(0xf0);
+ MM_PRINT8("a", a);
+ b = _mm_and_si128(a, c);
+ b = _mm_srli_epi64(b, 4);
+ MM_PRINT8("shifted", b);
+
+
+ a = _mm_set_epi32(0,0,0,pp);
+ MM_PRINT8("PP", a);
+
+ b = _mm_clmulepi64_si128(a, b, 0);
+ MM_PRINT8("PP clm over", b);
+
+ c = _mm_xor_si128(c,b);
+ MM_PRINT8("Answer", c);
+ //answer = _mm_extract_epi64(c, 0);
+ //printf("%llx\n", answer);
+}
diff --git a/flag_tester/pclmul_test.txt b/flag_tester/pclmul_test.txt
new file mode 100644
index 0000000..6102f94
--- /dev/null
+++ b/flag_tester/pclmul_test.txt
@@ -0,0 +1,8 @@
+a 0d 0d 0d 0d 0d 0d 0d 0d 0d 0d 0d 0d 0d 0d 0d 0d
+b 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 0a
+a clm b 00 00 00 00 00 00 00 00 72 72 72 72 72 72 72 72
+a f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0
+shifted 00 00 00 00 00 00 00 00 07 07 07 07 07 07 07 07
+PP 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 13
+PP clm over 00 00 00 00 00 00 00 00 79 79 79 79 79 79 79 79
+Answer 00 00 00 00 00 00 00 00 0b 0b 0b 0b 0b 0b 0b 0b
diff --git a/flag_tester/sse2_test.txt b/flag_tester/sse2_test.txt
new file mode 100644
index 0000000..f79b6e0
--- /dev/null
+++ b/flag_tester/sse2_test.txt
@@ -0,0 +1,30 @@
+a 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+b 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+c 11 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+d 12 11 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03
+a sl16 3c 38 34 30 2c 28 24 20 1c 18 14 10 0c 08 04 00
+b sl32 40 3c 38 34 30 2c 28 24 20 1c 18 14 10 0c 08 04
+c sl64 44 40 3c 38 34 30 2c 28 24 20 1c 18 14 10 0c 08
+d sl128 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 00 00
+a sr16 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+b sr32 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+c sr64 11 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+d sr128 00 00 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03
+d = a^b 1f 01 03 01 07 01 03 01 0f 01 03 01 07 01 03 01
+d = a-b epi8 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
+d = a-b epi16 fe ff fe ff fe ff fe ff fe ff fe ff fe ff fe ff
+d = a-b epi32 fe fe fe ff fe fe fe ff fe fe fe ff fe fe fe ff
+d = a-b epi64 fe fe fe fe fe fe fe ff fe fe fe fe fe fe fe ff
+d set_epi8 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+d set_epi32 12 34 56 78 9a bc de f0 12 34 56 78 9a bc de f0
+d set1_epi64 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0
+d set1_epi32 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2
+d set1_epi16 af f3 af f3 af f3 af f3 af f3 af f3 af f3 af f3
+d set1_epi8 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5
+d packus_epi16(d,d) 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+c unpackhi(a,d) 00 0f 00 0e 00 0d 00 0c 00 0b 00 0a 00 09 00 08
+b unpacklo(c,a) 07 00 06 0b 05 00 04 0a 03 00 02 09 01 00 00 08
+d and(d,b) 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+a stored to mem: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+d setzero 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+c 05 05 05 05 05 05 05 05 05 05 05 05 05 05 05 05
diff --git a/flag_tester/sse4_test.txt b/flag_tester/sse4_test.txt
new file mode 100644
index 0000000..3f6d7ec
--- /dev/null
+++ b/flag_tester/sse4_test.txt
@@ -0,0 +1,35 @@
+a 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+b 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+c 11 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+d 12 11 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03
+a sl16 3c 38 34 30 2c 28 24 20 1c 18 14 10 0c 08 04 00
+b sl32 40 3c 38 34 30 2c 28 24 20 1c 18 14 10 0c 08 04
+c sl64 44 40 3c 38 34 30 2c 28 24 20 1c 18 14 10 0c 08
+d sl128 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 00 00
+a sr16 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+b sr32 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+c sr64 11 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+d sr128 00 00 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03
+d = a^b 1f 01 03 01 07 01 03 01 0f 01 03 01 07 01 03 01
+d = a-b epi8 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
+d = a-b epi16 fe ff fe ff fe ff fe ff fe ff fe ff fe ff fe ff
+d = a-b epi32 fe fe fe ff fe fe fe ff fe fe fe ff fe fe fe ff
+d = a-b epi64 fe fe fe fe fe fe fe ff fe fe fe fe fe fe fe ff
+d set_epi8 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+d set_epi32 12 34 56 78 9a bc de f0 12 34 56 78 9a bc de f0
+d set1_epi64 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0
+d set1_epi32 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2
+d set1_epi16 af f3 af f3 af f3 af f3 af f3 af f3 af f3 af f3
+d set1_epi8 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5
+d packus_epi16(d,d) 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+c unpackhi(a,d) 00 0f 00 0e 00 0d 00 0c 00 0b 00 0a 00 09 00 08
+b unpacklo(c,a) 07 00 06 0b 05 00 04 0a 03 00 02 09 01 00 00 08
+d and(d,b) 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+a stored to mem: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+d setzero 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+d insert32 @ 2 00 00 00 00 ab cd 12 34 00 00 00 00 00 00 00 00
+extract_epi32 @ 2: abcd1234
+d insert64 @ 0 00 00 00 00 ab cd 12 34 fe dc ba 12 91 82 73 64
+extract_epi64 @ 0: fedcba1291827364
+c 05 05 05 05 05 05 05 05 05 05 05 05 05 05 05 05
+a shuffle(b, c) 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02
diff --git a/flag_tester/sse_test.c b/flag_tester/sse_test.c
new file mode 100644
index 0000000..e40cf25
--- /dev/null
+++ b/flag_tester/sse_test.c
@@ -0,0 +1,142 @@
+#ifdef SSE4
+#define SSSE3
+#include <nmmintrin.h>
+#endif
+
+#ifdef SSSE3
+#define SSE2
+#include <tmmintrin.h>
+#endif
+
+#ifdef SSE2
+#include <emmintrin.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>   /* for malloc() */
+#include <stdint.h>
+#include <inttypes.h>
+
+#define MM_PRINT8(s, r) { uint8_t blah[16], ii; printf("%-20s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 1) printf("%s%02x", (ii%4==0) ? " " : " ", blah[15-ii]); printf("\n"); }
+
+int main()
+{
+ uint32_t u32;
+ uint64_t u64;
+ uint8_t *ui8 = malloc(20), i;
+ __m128i a, b, c, d;
+
+ for(i=0; i < 20; i++)
+ ui8[i] = i;
+
+ a = _mm_load_si128( (__m128i *) ui8 );
+ b = _mm_loadu_si128( (__m128i *) (ui8+1));
+ c = _mm_loadu_si128( (__m128i *) (ui8+2));
+ d = _mm_loadu_si128( (__m128i *) (ui8+3));
+
+ MM_PRINT8("a", a);
+ MM_PRINT8("b", b);
+ MM_PRINT8("c", c);
+ MM_PRINT8("d", d);
+
+ a = _mm_slli_epi16(a, 2);
+ b = _mm_slli_epi32(b, 2);
+ c = _mm_slli_epi64(c, 2);
+ d = _mm_slli_si128(d, 2);
+
+ MM_PRINT8("a sl16", a);
+ MM_PRINT8("b sl32", b);
+ MM_PRINT8("c sl64", c);
+ MM_PRINT8("d sl128", d);
+
+ a = _mm_srli_epi16(a, 2);
+ b = _mm_srli_epi32(b, 2);
+ c = _mm_srli_epi64(c, 2);
+ d = _mm_srli_si128(d, 2);
+
+ MM_PRINT8("a sr16", a);
+ MM_PRINT8("b sr32", b);
+ MM_PRINT8("c sr64", c);
+ MM_PRINT8("d sr128", d);
+
+ d = _mm_xor_si128(a, b);
+ MM_PRINT8("d = a^b", d);
+
+ d = _mm_sub_epi8(a, b);
+ MM_PRINT8("d = a-b epi8", d);
+
+ d = _mm_sub_epi16(a, b);
+ MM_PRINT8("d = a-b epi16", d);
+
+ d = _mm_sub_epi32(a, b);
+ MM_PRINT8("d = a-b epi32", d);
+
+ d = _mm_sub_epi64(a, b);
+ MM_PRINT8("d = a-b epi64", d);
+
+ d = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ MM_PRINT8("d set_epi8", d);
+
+ d = _mm_set_epi32(0x12345678, 0x9abcdef0, 0x12345678, 0x9abcdef0);
+ MM_PRINT8("d set_epi32", d);
+
+ d = _mm_set1_epi64x(0xF0F0F0F0F0F0F0F0ULL);
+ MM_PRINT8("d set1_epi64", d);
+
+ d = _mm_set1_epi32(0xe2e2e2e2);
+ MM_PRINT8("d set1_epi32", d);
+
+ d = _mm_set1_epi16(0xaff3);
+ MM_PRINT8("d set1_epi16", d);
+
+ d = _mm_set1_epi8(0xc5);
+ MM_PRINT8("d set1_epi8", d);
+
+ d = _mm_packus_epi16(d, d);
+ MM_PRINT8("d packus_epi16(d,d)", d);
+
+ c = _mm_unpackhi_epi8(a, d);
+ MM_PRINT8("c unpackhi(a,d)", c);
+
+ b = _mm_unpacklo_epi8(c, a);
+ MM_PRINT8("b unpacklo(c,a)", b);
+
+ d = _mm_and_si128(d, b);
+ MM_PRINT8("d and(d,b)", d);
+
+ _mm_store_si128( (__m128i *) ui8, a);
+ printf("a stored to mem: ");
+ for(i=0; i < 16; i++)
+ printf("%u ", ui8[i]);
+ printf("\n");
+
+ d = _mm_setzero_si128();
+ MM_PRINT8("d setzero", d);
+
+ u32 = 0xABCD1234;
+ u64 = 0xFEDCBA1291827364ULL;
+
+ #ifdef SSE4
+ d = _mm_insert_epi32(d, u32, 2);
+ MM_PRINT8("d insert32 @ 2", d);
+
+ u32 = 0;
+ u32 = _mm_extract_epi32(d, 2);
+ printf("extract_epi32 @ 2: %x\n", u32);
+
+ d = _mm_insert_epi64(d, u64, 0);
+ MM_PRINT8("d insert64 @ 0", d);
+
+ u64 = 0;
+ u64 = _mm_extract_epi64(d, 0);
+ printf("extract_epi64 @ 0: %" PRIx64 "\n", u64);
+ #endif
+
+ c = _mm_set1_epi8(5);
+ MM_PRINT8("c", c);
+
+ #ifdef SSSE3
+ a = _mm_shuffle_epi8(b, c);
+ MM_PRINT8("a shuffle(b, c)", a);
+ #endif
+
+}
diff --git a/flag_tester/ssse3_test.txt b/flag_tester/ssse3_test.txt
new file mode 100644
index 0000000..17bee1a
--- /dev/null
+++ b/flag_tester/ssse3_test.txt
@@ -0,0 +1,31 @@
+a 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+b 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+c 11 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+d 12 11 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03
+a sl16 3c 38 34 30 2c 28 24 20 1c 18 14 10 0c 08 04 00
+b sl32 40 3c 38 34 30 2c 28 24 20 1c 18 14 10 0c 08 04
+c sl64 44 40 3c 38 34 30 2c 28 24 20 1c 18 14 10 0c 08
+d sl128 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 00 00
+a sr16 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+b sr32 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+c sr64 11 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+d sr128 00 00 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03
+d = a^b 1f 01 03 01 07 01 03 01 0f 01 03 01 07 01 03 01
+d = a-b epi8 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
+d = a-b epi16 fe ff fe ff fe ff fe ff fe ff fe ff fe ff fe ff
+d = a-b epi32 fe fe fe ff fe fe fe ff fe fe fe ff fe fe fe ff
+d = a-b epi64 fe fe fe fe fe fe fe ff fe fe fe fe fe fe fe ff
+d set_epi8 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+d set_epi32 12 34 56 78 9a bc de f0 12 34 56 78 9a bc de f0
+d set1_epi64 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0
+d set1_epi32 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2
+d set1_epi16 af f3 af f3 af f3 af f3 af f3 af f3 af f3 af f3
+d set1_epi8 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5
+d packus_epi16(d,d) 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+c unpackhi(a,d) 00 0f 00 0e 00 0d 00 0c 00 0b 00 0a 00 09 00 08
+b unpacklo(c,a) 07 00 06 0b 05 00 04 0a 03 00 02 09 01 00 00 08
+d and(d,b) 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+a stored to mem: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+d setzero 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+c 05 05 05 05 05 05 05 05 05 05 05 05 05 05 05 05
+a shuffle(b, c) 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02
diff --git a/whats_my_sse.c b/flag_tester/whats_my_sse.c
index 8c9192c..8c9192c 100644
--- a/whats_my_sse.c
+++ b/flag_tester/whats_my_sse.c
diff --git a/flag_tester/which_compile_flags.sh b/flag_tester/which_compile_flags.sh
new file mode 100755
index 0000000..f39c609
--- /dev/null
+++ b/flag_tester/which_compile_flags.sh
@@ -0,0 +1,19 @@
+if [ -n "$1" ]; then
+ CC=$1
+else
+ CC=cc
+fi
+
+$CC flag_test.c -o flag_test 2> /dev/null
+if [ -e "flag_test" ]; then
+ OUTPUT=`./flag_test $CC 2> /dev/null`
+ if [ -n "$OUTPUT" ]; then
+ echo "$OUTPUT"
+ else
+ printf "CFLAGS = -O3\nLDFLAGS = -O3\n"
+ fi
+else
+ printf "$CC failed to compile flag_test.c\n"
+fi
+
+rm sse4 sse2 ssse3 pclmul diff.txt flag_test temp.txt 2> /dev/null
diff --git a/gf.c b/gf.c
index 4304e1d..b027473 100644
--- a/gf.c
+++ b/gf.c
@@ -8,6 +8,405 @@
#include <stdio.h>
#include <stdlib.h>
+int _gf_errno = GF_E_DEFAULT;
+
+void gf_error()
+{
+ char *s;
+
+ switch(_gf_errno) {
+ case GF_E_DEFAULT: s = "No Error."; break;
+ case GF_E_TWOMULT: s = "Cannot specify two -m's."; break;
+ case GF_E_TWO_DIV: s = "Cannot specify two -d's."; break;
+ case GF_E_POLYSPC: s = "-p needs to be followed by a number in hex (0x optional)."; break;
+ case GF_E_GROUPAR: s = "Ran out of arguments in -m GROUP."; break;
+ case GF_E_GROUPNU: s = "In -m GROUP g_s g_r -- g_s and g_r need to be numbers."; break;
+ case GF_E_SPLITAR: s = "Ran out of arguments in -m SPLIT."; break;
+ case GF_E_SPLITNU: s = "In -m SPLIT w_a w_b -- w_a and w_b need to be numbers."; break;
+ case GF_E_FEWARGS: s = "Not enough arguments (Perhaps end with '-'?)"; break;
+ case GF_E_CFM___W: s = "-m CARRY_FREE, w must be 4, 8, 16, 32, 64 or 128."; break;
+ case GF_E_COMPXPP: s = "-m COMPOSITE, No poly specified, and we don't have a default for the given sub-field."; break;
+ case GF_E_BASE__W: s = "-m COMPOSITE and the base field is not for w/2."; break;
+ case GF_E_CFM4POL: s = "-m CARRY_FREE, w=4. (Prim-poly & 0xc) must equal 0."; break;
+ case GF_E_CFM8POL: s = "-m CARRY_FREE, w=8. (Prim-poly & 0x80) must equal 0."; break;
+ case GF_E_CF16POL: s = "-m CARRY_FREE, w=16. (Prim-poly & 0xe000) must equal 0."; break;
+ case GF_E_CF32POL: s = "-m CARRY_FREE, w=32. (Prim-poly & 0xfe000000) must equal 0."; break;
+ case GF_E_CF64POL: s = "-m CARRY_FREE, w=64. (Prim-poly & 0xfffe000000000000ULL) must equal 0."; break;
+ case GF_E_MDEFDIV: s = "If multiplication method == default, can't change division."; break;
+ case GF_E_MDEFREG: s = "If multiplication method == default, can't change region."; break;
+ case GF_E_MDEFARG: s = "If multiplication method == default, can't use arg1/arg2."; break;
+ case GF_E_DIVCOMP: s = "Cannot change the division technique with -m COMPOSITE."; break;
+ case GF_E_DOUQUAD: s = "Cannot specify -r DOUBLE and -r QUAD."; break;
+ case GF_E_SSE__NO: s = "Cannot specify -r SSE and -r NOSSE."; break;
+ case GF_E_CAUCHYB: s = "Cannot specify -r CAUCHY and any other -r."; break;
+ case GF_E_CAUCOMP: s = "Cannot specify -m COMPOSITE and -r CAUCHY."; break;
+ case GF_E_CAUGT32: s = "Cannot specify -r CAUCHY with w > 32."; break;
+ case GF_E_ARG1SET: s = "Only use arg1 with SPLIT, GROUP or COMPOSITE."; break;
+ case GF_E_ARG2SET: s = "Only use arg2 with SPLIT or GROUP."; break;
+ case GF_E_MATRIXW: s = "Cannot specify -d MATRIX with w > 32."; break;
+ case GF_E_BAD___W: s = "W must be 1-32, 64 or 128."; break;
+ case GF_E_DOUBLET: s = "Can only specify -r DOUBLE with -m TABLE."; break;
+ case GF_E_DOUBLEW: s = "Can only specify -r DOUBLE w = 4 or w = 8."; break;
+ case GF_E_DOUBLEJ: s = "Cannot specify -r DOUBLE with -r ALTMAP|SSE|NOSSE."; break;
+ case GF_E_DOUBLEL: s = "Can only specify -r DOUBLE -r LAZY with w = 8."; break;
+ case GF_E_QUAD__T: s = "Can only specify -r QUAD with -m TABLE."; break;
+ case GF_E_QUAD__W: s = "Can only specify -r QUAD w = 4."; break;
+ case GF_E_QUAD__J: s = "Cannot specify -r QUAD with -r ALTMAP|SSE|NOSSE."; break;
+ case GF_E_BADPOLY: s = "Bad primitive polynomial (high bits set)."; break;
+ case GF_E_COMP_PP: s = "Bad primitive polynomial -- bigger than sub-field."; break;
+ case GF_E_LAZY__X: s = "If -r LAZY, then -r must be DOUBLE or QUAD."; break;
+ case GF_E_ALTSHIF: s = "Cannot specify -m SHIFT and -r ALTMAP."; break;
+ case GF_E_SSESHIF: s = "Cannot specify -m SHIFT and -r SSE|NOSSE."; break;
+ case GF_E_ALT_CFM: s = "Cannot specify -m CARRY_FREE and -r ALTMAP."; break;
+ case GF_E_SSE_CFM: s = "Cannot specify -m CARRY_FREE and -r SSE|NOSSE."; break;
+ case GF_E_PCLMULX: s = "Specified -m CARRY_FREE, but PCLMUL is not supported."; break;
+ case GF_E_ALT_BY2: s = "Cannot specify -m BYTWO_x and -r ALTMAP."; break;
+ case GF_E_BY2_SSE: s = "Specified -m BYTWO_x -r SSE, but SSE2 is not supported."; break;
+ case GF_E_LOGBADW: s = "With Log Tables, w must be <= 27."; break;
+ case GF_E_LOG___J: s = "Cannot use Log tables with -r ALTMAP|SSE|NOSSE."; break;
+ case GF_E_LOGPOLY: s = "Cannot use Log tables because the polynomial is not primitive."; break;
+ case GF_E_ZERBADW: s = "With -m LOG_ZERO, w must be 8 or 16."; break;
+ case GF_E_ZEXBADW: s = "With -m LOG_ZERO_EXT, w must be 8."; break;
+ case GF_E_GR_ARGX: s = "With -m GROUP, arg1 and arg2 must be >= 0."; break;
+ case GF_E_GR_W_48: s = "With -m GROUP, w cannot be 4 or 8."; break;
+ case GF_E_GR_W_16: s = "With -m GROUP, w == 16, arg1 and arg2 must be 4."; break;
+ case GF_E_GR_128A: s = "With -m GROUP, w == 128, arg1 must be 4, and arg2 in { 4,8,16 }."; break;
+ case GF_E_GR_SSE4: s = "With -m GROUP, w == 128, you need SSE4."; break;
+ case GF_E_GR_A_27: s = "With -m GROUP, arg1 and arg2 must be <= 27."; break;
+ case GF_E_GR_AR_W: s = "With -m GROUP, arg1 and arg2 must be <= w."; break;
+ case GF_E_GR____J: s = "Cannot use GROUP with -r ALTMAP|SSE|NOSSE."; break;
+ case GF_E_TABLE_W: s = "With -m TABLE, w must be < 15, or == 16."; break;
+ case GF_E_TAB_SSE: s = "With -m TABLE, SSE|NOSSE only applies to w=4."; break;
+ case GF_E_TABSSE3: s = "With -m TABLE, -r SSE, you need SSSE3 supported."; break;
+ case GF_E_TAB_ALT: s = "With -m TABLE, you cannot use ALTMAP."; break;
+ case GF_E_SP128AR: s = "With -m SPLIT, w=128, bad arg1/arg2."; break;
+ case GF_E_SP128AL: s = "With -m SPLIT, w=128, -r SSE requires -r ALTMAP."; break;
+ case GF_E_SP128AS: s = "With -m SPLIT, w=128, ALTMAP needs SSSE3 supported."; break;
+ case GF_E_SP128_A: s = "With -m SPLIT, w=128, -r SSE|NOSSE only with arg1/arg2 = 4/128."; break;
+ case GF_E_SP128_S: s = "With -m SPLIT, w=128, -r ALTMAP only with arg1/arg2 = 4/128."; break;
+ case GF_E_SPLIT_W: s = "With -m SPLIT, w must be in {8, 16, 32, 64, 128}."; break;
+    case GF_E_SP_16AR: s = "With -m SPLIT, w=16, bad arg1/arg2."; break;
+    case GF_E_SP_16_A: s = "With -m SPLIT, w=16, -r ALTMAP only with arg1/arg2 = 4/16."; break;
+    case GF_E_SP_16_S: s = "With -m SPLIT, w=16, -r SSE|NOSSE only with arg1/arg2 = 4/16."; break;
+    case GF_E_SP_32AR: s = "With -m SPLIT, w=32, bad arg1/arg2."; break;
+    case GF_E_SP_32AS: s = "With -m SPLIT, w=32, -r ALTMAP needs SSSE3 supported."; break;
+    case GF_E_SP_32_A: s = "With -m SPLIT, w=32, -r ALTMAP only with arg1/arg2 = 4/32."; break;
+    case GF_E_SP_32_S: s = "With -m SPLIT, w=32, -r SSE|NOSSE only with arg1/arg2 = 4/32."; break;
+    case GF_E_SP_64AR: s = "With -m SPLIT, w=64, bad arg1/arg2."; break;
+    case GF_E_SP_64AS: s = "With -m SPLIT, w=64, -r ALTMAP needs SSSE3 supported."; break;
+    case GF_E_SP_64_A: s = "With -m SPLIT, w=64, -r ALTMAP only with arg1/arg2 = 4/64."; break;
+    case GF_E_SP_64_S: s = "With -m SPLIT, w=64, -r SSE|NOSSE only with arg1/arg2 = 4/64."; break;
+    case GF_E_SP_8_AR: s = "With -m SPLIT, w=8, bad arg1/arg2."; break;
+    case GF_E_SP_8__A: s = "With -m SPLIT, w=8, can't have -r ALTMAP."; break;
+ case GF_E_SP_SSE3: s = "With -m SPLIT, Need SSSE3 support for SSE."; break;
+ case GF_E_COMP_A2: s = "With -m COMPOSITE, arg1 must equal 2."; break;
+ case GF_E_COMP_SS: s = "With -m COMPOSITE, -r SSE and -r NOSSE do not apply."; break;
+ case GF_E_COMP__W: s = "With -m COMPOSITE, w must be 8, 16, 32, 64 or 128."; break;
+ case GF_E_UNKFLAG: s = "Unknown method flag - should be -m, -d, -r or -p."; break;
+ case GF_E_UNKNOWN: s = "Unknown multiplication type."; break;
+ case GF_E_UNK_REG: s = "Unknown region type."; break;
+ case GF_E_UNK_DIV: s = "Unknown division type."; break;
+ default: s = "Undefined error.";
+ }
+
+ fprintf(stderr, "%s\n", s);
+}
+
+uint64_t gf_composite_get_default_poly(gf_t *base)
+{
+ gf_internal_t *h;
+ int rv;
+
+ h = (gf_internal_t *) base->scratch;
+ if (h->w == 4) {
+ if (h->mult_type == GF_MULT_COMPOSITE) return 0;
+ if (h->prim_poly == 0x13) return 2;
+ return 0;
+ }
+ if (h->w == 8) {
+ if (h->mult_type == GF_MULT_COMPOSITE) return 0;
+ if (h->prim_poly == 0x11d) return 3;
+ return 0;
+ }
+ if (h->w == 16) {
+ if (h->mult_type == GF_MULT_COMPOSITE) {
+ rv = gf_composite_get_default_poly(h->base_gf);
+ if (rv != h->prim_poly) return 0;
+ if (rv == 3) return 0x105;
+ return 0;
+ } else {
+ if (h->prim_poly == 0x1100b) return 2;
+ if (h->prim_poly == 0x1002d) return 7;
+ return 0;
+ }
+ }
+ if (h->w == 32) {
+ if (h->mult_type == GF_MULT_COMPOSITE) {
+ rv = gf_composite_get_default_poly(h->base_gf);
+ if (rv != h->prim_poly) return 0;
+ if (rv == 2) return 0x10005;
+ if (rv == 7) return 0x10008;
+ if (rv == 0x105) return 0x10002;
+ return 0;
+ } else {
+ if (h->prim_poly == 0x400007) return 2;
+ if (h->prim_poly == 0xc5) return 3;
+ return 0;
+ }
+ }
+ if (h->w == 64) {
+ if (h->mult_type == GF_MULT_COMPOSITE) {
+ rv = gf_composite_get_default_poly(h->base_gf);
+ if (rv != h->prim_poly) return 0;
+ if (rv == 3) return 0x100000009ULL;
+ if (rv == 2) return 0x100000004ULL;
+ if (rv == 0x10005) return 0x100000003ULL;
+ if (rv == 0x10002) return 0x100000005ULL;
+      if (rv == 0x10008) return 0x100000006ULL; /* JSP: (0x100000003 works too,
+                                       but I want to differentiate cases). */
+ return 0;
+ } else {
+ if (h->prim_poly == 0x1bULL) return 2;
+ return 0;
+ }
+ }
+ return 0;
+}
+
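The chaining above means a caller can pass prim_poly = 0 at every level of a
composite stack and let the library fill in defaults.  A minimal sketch,
assuming gf_init_easy() installs the standard 0x1100b polynomial for w=16
(the usual default):

    #include <stdio.h>
    #include "gf_complete.h"

    int main()
    {
      gf_t gf16, gf32;

      if (!gf_init_easy(&gf16, 16)) { gf_error(); return 1; }

      /* prim_poly = 0 makes init consult the table above: a base whose
         polynomial is 0x1100b maps to the composite default 2. */
      if (!gf_init_hard(&gf32, 32, GF_MULT_COMPOSITE, GF_REGION_DEFAULT,
                        GF_DIVIDE_DEFAULT, 0, 2, 0, &gf16, NULL)) {
        gf_error();
        return 1;
      }
      printf("composite GF((2^16)^2) initialized\n");
      return 0;
    }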
+int gf_error_check(int w, int mult_type, int region_type, int divide_type,
+ int arg1, int arg2, uint64_t poly, gf_t *base)
+{
+ int sse4 = 0;
+ int sse3 = 0;
+ int sse2 = 0;
+ int pclmul = 0;
+ int rdouble, rquad, rlazy, rsse, rnosse, raltmap, rcauchy, tmp;
+ uint64_t pp;
+  gf_internal_t *sub;
+
+ rdouble = (region_type & GF_REGION_DOUBLE_TABLE);
+ rquad = (region_type & GF_REGION_QUAD_TABLE);
+ rlazy = (region_type & GF_REGION_LAZY);
+ rsse = (region_type & GF_REGION_SSE);
+ rnosse = (region_type & GF_REGION_NOSSE);
+ raltmap = (region_type & GF_REGION_ALTMAP);
+ rcauchy = (region_type & GF_REGION_CAUCHY);
+
+ if (divide_type != GF_DIVIDE_DEFAULT &&
+ divide_type != GF_DIVIDE_MATRIX &&
+ divide_type != GF_DIVIDE_EUCLID) {
+ _gf_errno = GF_E_UNK_DIV;
+ return 0;
+ }
+
+ tmp = ( GF_REGION_DOUBLE_TABLE | GF_REGION_QUAD_TABLE | GF_REGION_LAZY |
+ GF_REGION_SSE | GF_REGION_NOSSE | GF_REGION_ALTMAP | GF_REGION_CAUCHY );
+ if (region_type & (~tmp)) { _gf_errno = GF_E_UNK_REG; return 0; }
+
+#ifdef INTEL_SSE2
+ sse2 = 1;
+#endif
+
+#ifdef INTEL_SSSE3
+ sse3 = 1;
+#endif
+
+#ifdef INTEL_SSE4
+ sse4 = 1;
+#endif
+
+#ifdef INTEL_PCLMUL
+ pclmul = 1;
+#endif
+
+
+ if (w < 1 || (w > 32 && w != 64 && w != 128)) { _gf_errno = GF_E_BAD___W; return 0; }
+
+ if (mult_type != GF_MULT_COMPOSITE && w < 64) {
+ if ((poly >> (w+1)) != 0) { _gf_errno = GF_E_BADPOLY; return 0; }
+ }
+
+ if (mult_type == GF_MULT_DEFAULT) {
+ if (divide_type != GF_DIVIDE_DEFAULT) { _gf_errno = GF_E_MDEFDIV; return 0; }
+ if (region_type != GF_REGION_DEFAULT) { _gf_errno = GF_E_MDEFREG; return 0; }
+ if (arg1 != 0 || arg2 != 0) { _gf_errno = GF_E_MDEFARG; return 0; }
+ return 1;
+ }
+
+ if (rsse && rnosse) { _gf_errno = GF_E_SSE__NO; return 0; }
+ if (rcauchy && w > 32) { _gf_errno = GF_E_CAUGT32; return 0; }
+ if (rcauchy && region_type != GF_REGION_CAUCHY) { _gf_errno = GF_E_CAUCHYB; return 0; }
+ if (rcauchy && mult_type == GF_MULT_COMPOSITE) { _gf_errno = GF_E_CAUCOMP; return 0; }
+
+ if (arg1 != 0 && mult_type != GF_MULT_COMPOSITE &&
+ mult_type != GF_MULT_SPLIT_TABLE && mult_type != GF_MULT_GROUP) {
+ _gf_errno = GF_E_ARG1SET;
+ return 0;
+ }
+
+ if (arg2 != 0 && mult_type != GF_MULT_SPLIT_TABLE && mult_type != GF_MULT_GROUP) {
+ _gf_errno = GF_E_ARG2SET;
+ return 0;
+ }
+
+ if (divide_type == GF_DIVIDE_MATRIX && w > 32) { _gf_errno = GF_E_MATRIXW; return 0; }
+
+ if (rdouble) {
+ if (rquad) { _gf_errno = GF_E_DOUQUAD; return 0; }
+ if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_DOUBLET; return 0; }
+ if (w != 4 && w != 8) { _gf_errno = GF_E_DOUBLEW; return 0; }
+ if (rsse || rnosse || raltmap) { _gf_errno = GF_E_DOUBLEJ; return 0; }
+ if (rlazy && w == 4) { _gf_errno = GF_E_DOUBLEL; return 0; }
+ return 1;
+ }
+
+ if (rquad) {
+ if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_QUAD__T; return 0; }
+ if (w != 4) { _gf_errno = GF_E_QUAD__W; return 0; }
+ if (rsse || rnosse || raltmap) { _gf_errno = GF_E_QUAD__J; return 0; }
+ return 1;
+ }
+
+ if (rlazy) { _gf_errno = GF_E_LAZY__X; return 0; }
+
+ if (mult_type == GF_MULT_SHIFT) {
+ if (raltmap) { _gf_errno = GF_E_ALTSHIF; return 0; }
+ if (rsse || rnosse) { _gf_errno = GF_E_SSESHIF; return 0; }
+ return 1;
+ }
+
+ if (mult_type == GF_MULT_CARRY_FREE) {
+ if (w != 4 && w != 8 && w != 16 &&
+ w != 32 && w != 64 && w != 128) { _gf_errno = GF_E_CFM___W; return 0; }
+ if (w == 4 && (poly & 0xc)) { _gf_errno = GF_E_CFM4POL; return 0; }
+ if (w == 8 && (poly & 0x80)) { _gf_errno = GF_E_CFM8POL; return 0; }
+ if (w == 16 && (poly & 0xe000)) { _gf_errno = GF_E_CF16POL; return 0; }
+ if (w == 32 && (poly & 0xfe000000)) { _gf_errno = GF_E_CF32POL; return 0; }
+ if (w == 64 && (poly & 0xfffe000000000000ULL)) { _gf_errno = GF_E_CF64POL; return 0; }
+ if (raltmap) { _gf_errno = GF_E_ALT_CFM; return 0; }
+ if (rsse || rnosse) { _gf_errno = GF_E_SSE_CFM; return 0; }
+ if (!pclmul) { _gf_errno = GF_E_PCLMULX; return 0; }
+ return 1;
+ }
+
+ if (mult_type == GF_MULT_BYTWO_p || mult_type == GF_MULT_BYTWO_b) {
+ if (raltmap) { _gf_errno = GF_E_ALT_BY2; return 0; }
+ if (rsse && !sse2) { _gf_errno = GF_E_BY2_SSE; return 0; }
+ return 1;
+ }
+
+ if (mult_type == GF_MULT_LOG_TABLE || mult_type == GF_MULT_LOG_ZERO
+ || mult_type == GF_MULT_LOG_ZERO_EXT ) {
+ if (w > 27) { _gf_errno = GF_E_LOGBADW; return 0; }
+ if (raltmap || rsse || rnosse) { _gf_errno = GF_E_LOG___J; return 0; }
+
+ if (mult_type == GF_MULT_LOG_TABLE) return 1;
+
+ if (w != 8 && w != 16) { _gf_errno = GF_E_ZERBADW; return 0; }
+
+ if (mult_type == GF_MULT_LOG_ZERO) return 1;
+
+ if (w != 8) { _gf_errno = GF_E_ZEXBADW; return 0; }
+ return 1;
+ }
+
+ if (mult_type == GF_MULT_GROUP) {
+ if (arg1 <= 0 || arg2 <= 0) { _gf_errno = GF_E_GR_ARGX; return 0; }
+ if (w == 4 || w == 8) { _gf_errno = GF_E_GR_W_48; return 0; }
+ if (w == 16 && (arg1 != 4 || arg2 != 4)) { _gf_errno = GF_E_GR_W_16; return 0; }
+ if (w == 128 && (arg1 != 4 ||
+ (arg2 != 4 && arg2 != 8 && arg2 != 16))) { _gf_errno = GF_E_GR_128A; return 0; }
+ if (w == 128 && !sse4) { _gf_errno = GF_E_GR_SSE4; return 0; }
+ if (arg1 > 27 || arg2 > 27) { _gf_errno = GF_E_GR_A_27; return 0; }
+ if (arg1 > w || arg2 > w) { _gf_errno = GF_E_GR_AR_W; return 0; }
+ if (raltmap || rsse || rnosse) { _gf_errno = GF_E_GR____J; return 0; }
+ return 1;
+ }
+
+ if (mult_type == GF_MULT_TABLE) {
+ if (w != 16 && w >= 15) { _gf_errno = GF_E_TABLE_W; return 0; }
+ if (w != 4 && (rsse || rnosse)) { _gf_errno = GF_E_TAB_SSE; return 0; }
+ if (rsse && !sse3) { _gf_errno = GF_E_TABSSE3; return 0; }
+ if (raltmap) { _gf_errno = GF_E_TAB_ALT; return 0; }
+ return 1;
+ }
+
+ if (mult_type == GF_MULT_SPLIT_TABLE) {
+ if (arg1 > arg2) {
+ tmp = arg1;
+ arg1 = arg2;
+ arg2 = tmp;
+ }
+ if (w == 8) {
+ if (arg1 != 4 || arg2 != 8) { _gf_errno = GF_E_SP_8_AR; return 0; }
+ if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
+ if (raltmap) { _gf_errno = GF_E_SP_8__A; return 0; }
+ } else if (w == 16) {
+ if (arg1 == 4 && arg2 == 16) {
+ if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
+ } else if (arg1 == 8 && (arg2 == 16 || arg2 == 8)) {
+ if (rsse || rnosse) { _gf_errno = GF_E_SP_16_S; return 0; }
+ if (raltmap) { _gf_errno = GF_E_SP_16_A; return 0; }
+ } else { _gf_errno = GF_E_SP_16AR; return 0; }
+ } else if (w == 32) {
+ if ((arg1 == 8 && arg2 == 8) ||
+ (arg1 == 8 && arg2 == 32) ||
+ (arg1 == 16 && arg2 == 32)) {
+ if (rsse || rnosse) { _gf_errno = GF_E_SP_32_S; return 0; }
+ if (raltmap) { _gf_errno = GF_E_SP_32_A; return 0; }
+      } else if (arg1 == 4 && arg2 == 32) {
+        if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
+        if (raltmap && !sse3) { _gf_errno = GF_E_SP_32AS; return 0; }
+        if (raltmap && rnosse) { _gf_errno = GF_E_SP_32AS; return 0; }
+ } else { _gf_errno = GF_E_SP_32AR; return 0; }
+ } else if (w == 64) {
+ if ((arg1 == 8 && arg2 == 8) ||
+ (arg1 == 8 && arg2 == 64) ||
+ (arg1 == 16 && arg2 == 64)) {
+ if (rsse || rnosse) { _gf_errno = GF_E_SP_64_S; return 0; }
+ if (raltmap) { _gf_errno = GF_E_SP_64_A; return 0; }
+ } else if (arg1 == 4 && arg2 == 64) {
+ if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
+ if (raltmap && !sse3) { _gf_errno = GF_E_SP_64AS; return 0; }
+ if (raltmap && rnosse) { _gf_errno = GF_E_SP_64AS; return 0; }
+ } else { _gf_errno = GF_E_SP_64AR; return 0; }
+ } else if (w == 128) {
+ if (arg1 == 8 && arg2 == 128) {
+ if (rsse || rnosse) { _gf_errno = GF_E_SP128_S; return 0; }
+ if (raltmap) { _gf_errno = GF_E_SP128_A; return 0; }
+ } else if (arg1 == 4 && arg2 == 128) {
+ if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
+ if (raltmap && !sse3) { _gf_errno = GF_E_SP128AS; return 0; }
+ if (raltmap && rnosse) { _gf_errno = GF_E_SP128AS; return 0; }
+ if (!raltmap && rsse) { _gf_errno = GF_E_SP128AL; return 0; }
+ } else { _gf_errno = GF_E_SP128AR; return 0; }
+ } else { _gf_errno = GF_E_SPLIT_W; return 0; }
+ return 1;
+ }
+
+ if (mult_type == GF_MULT_COMPOSITE) {
+ if (w != 8 && w != 16 && w != 32
+ && w != 64 && w != 128) { _gf_errno = GF_E_COMP__W; return 0; }
+ if ((poly >> (w/2)) != 0) { _gf_errno = GF_E_COMP_PP; return 0; }
+ if (divide_type != GF_DIVIDE_DEFAULT) { _gf_errno = GF_E_DIVCOMP; return 0; }
+ if (arg1 != 2) { _gf_errno = GF_E_COMP_A2; return 0; }
+ if (rsse || rnosse) { _gf_errno = GF_E_COMP_SS; return 0; }
+ if (base != NULL) {
+ sub = (gf_internal_t *) base->scratch;
+ if (sub->w != w/2) { _gf_errno = GF_E_BASE__W; return 0; }
+ if (poly == 0) {
+ if (gf_composite_get_default_poly(base) == 0) { _gf_errno = GF_E_COMPXPP; return 0; }
+ }
+ }
+ return 1;
+ }
+
+ _gf_errno = GF_E_UNKNOWN;
+ return 0;
+}
+
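Since gf_init_hard() now routes through gf_error_check() (see below), a
failed initialization leaves a diagnosis in _gf_errno that gf_error() can
print.  A minimal sketch of that flow:

    #include "gf_complete.h"

    int main()
    {
      gf_t gf;

      /* Deliberately illegal: CAUCHY regions only exist for w <= 32, so
         gf_error_check() sets _gf_errno = GF_E_CAUGT32 and init fails. */
      if (gf_init_hard(&gf, 64, GF_MULT_SHIFT, GF_REGION_CAUCHY,
                       GF_DIVIDE_DEFAULT, 0, 0, 0, NULL, NULL) == 0) {
        gf_error();   /* prints: Cannot specify -r CAUCHY with w > 32. */
      }
      return 0;
    }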
int gf_scratch_size(int w,
int mult_type,
int region_type,
@@ -15,6 +414,8 @@ int gf_scratch_size(int w,
int arg1,
int arg2)
{
+ if (gf_error_check(w, mult_type, region_type, divide_type, arg1, arg2, 0, NULL) == 0) return 0;
+
switch(w) {
case 4: return gf_w4_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
case 8: return gf_w8_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
@@ -26,16 +427,31 @@ int gf_scratch_size(int w,
}
}
-int gf_dummy_init(gf_t *gf)
+extern int gf_size(gf_t *gf)
{
- return 0;
+ gf_internal_t *h;
+ int s;
+
+ s = sizeof(gf_t);
+ h = (gf_internal_t *) gf->scratch;
+ s += gf_scratch_size(h->w, h->mult_type, h->region_type, h->divide_type, h->arg1, h->arg2);
+ if (h->mult_type == GF_MULT_COMPOSITE) s += gf_size(h->base_gf);
+ return s;
}
+
int gf_init_easy(gf_t *gf, int w)
{
- return gf_init_hard(gf, w, GF_MULT_DEFAULT, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT, 0, 0, 0, NULL, NULL);
+ return gf_init_hard(gf, w, GF_MULT_DEFAULT, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT,
+ 0, 0, 0, NULL, NULL);
}
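For callers that just want default behavior, gf_init_easy() is the whole
API.  A minimal sketch (the .w32 function pointer is the single-multiply
interface used throughout the examples in this release):

    #include <stdio.h>
    #include "gf_complete.h"

    int main()
    {
      gf_t gf;
      uint32_t c;

      if (!gf_init_easy(&gf, 16)) { gf_error(); return 1; }
      c = gf.multiply.w32(&gf, 0x1234, 0x5678);  /* one product in GF(2^16) */
      printf("0x1234 * 0x5678 = 0x%x\n", c);
      gf_free(&gf, 0);
      return 0;
    }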
+/* Allen: What's going on here is that this function puts info into the
+   scratch memory of gf, and then calls the relevant REAL init
+   function for the word size. Probably done this way to consolidate
+   the aspects of initialization that don't depend on word size,
+   and then take care of the word-size-specific stuff there. */
+
int gf_init_hard(gf_t *gf, int w, int mult_type,
int region_type,
int divide_type,
@@ -46,11 +462,14 @@ int gf_init_hard(gf_t *gf, int w, int mult_type,
{
int sz;
gf_internal_t *h;
-
- sz = gf_scratch_size(w, mult_type, region_type, divide_type, arg1, arg2);
-
- if (sz <= 0) return 0;
+
+ if (gf_error_check(w, mult_type, region_type, divide_type,
+ arg1, arg2, prim_poly, base_gf) == 0) return 0;
+ sz = gf_scratch_size(w, mult_type, region_type, divide_type, arg1, arg2);
+ if (sz <= 0) return 0; /* This shouldn't happen, as all errors should get caught
+ in gf_error_check() */
+
if (scratch_memory == NULL) {
h = (gf_internal_t *) malloc(sz);
h->free_me = 1;
@@ -71,8 +490,6 @@ int gf_init_hard(gf_t *gf, int w, int mult_type,
h->private += (sizeof(gf_internal_t));
gf->extract_word.w32 = NULL;
- //printf("Created w=%d, with mult_type=%d and region_type=%d\n", w, mult_type, region_type);
-
switch(w) {
case 4: return gf_w4_init(gf);
case 8: return gf_w8_init(gf);
@@ -94,6 +511,7 @@ int gf_free(gf_t *gf, int recursive)
free(h->base_gf);
}
if (h->free_me) free(h);
+ return 0; /* Making compiler happy */
}
void gf_alignment_error(char *s, int a)
@@ -105,9 +523,9 @@ void gf_alignment_error(char *s, int a)
}
static
-void gf_invert_binary_matrix(int *mat, int *inv, int rows) {
+void gf_invert_binary_matrix(uint32_t *mat, uint32_t *inv, int rows) {
int cols, i, j, k;
- int tmp;
+ uint32_t tmp;
cols = rows;
@@ -172,34 +590,6 @@ uint32_t gf_bitmatrix_inverse(uint32_t y, int w, uint32_t pp)
return inv[0];
}
-/*
-void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base)
-{
- uint64_t p, ta, shift, tb;
- uint64_t *s64, *d64
-
- s64 = rd->s_start;
- d64 = rd->d_start;
-
- while (s64 < (uint64_t *) rd->s_top) {
- p = (rd->xor) ? *d64 : 0;
- ta = *s64;
-
- shift = 0;
- while (ta != 0) {
- tb = base[ta&0xffff];
- p ^= (tb << shift);
- ta >>= 16;
- shift += 16;
- }
-
- *d64 = p;
- d64++;
- s64++;
- }
-}
-*/
-
void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base)
{
uint64_t a, prod;
@@ -226,8 +616,8 @@ void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base)
prod ^= base[a >> 48];
prod ^= *d64;
*d64 = prod;
- *s64++;
- *d64++;
+ s64++;
+ d64++;
}
} else {
while (d64 != top) {
@@ -243,8 +633,8 @@ void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base)
prod <<= 16;
prod ^= base[a >> 48];
*d64 = prod;
- *s64++;
- *d64++;
+ s64++;
+ d64++;
}
}
}
@@ -307,9 +697,71 @@ static void gf_slow_multiply_region(gf_region_data *rd, void *src, void *dest, v
}
}
-/* If align>16, you align to 16 bytes, but make sure that within the aligned region bytes is a multiple of align. However, you make sure that the region itself is a multiple of align.
+/* JSP - The purpose of this procedure is to error check alignment,
+ and to set up the region operation so that it can best leverage
+ large words.
+
+ It stores its information in rd.
+
+ Assuming you're not doing Cauchy coding, (see below for that),
+ then w will be 4, 8, 16, 32 or 64. It can't be 128 (probably
+ should change that).
+
+ src and dest must then be aligned on ceil(w/8)-byte boundaries.
+ Moreover, bytes must be a multiple of ceil(w/8). If the variable
+ align is equal to ceil(w/8), then we will set s_start = src,
+ d_start = dest, s_top to (src+bytes) and d_top to (dest+bytes).
+ And we return -- the implementation will go ahead and do the
+ multiplication on individual words (e.g. using discrete logs).
+
+ If align is greater than ceil(w/8), then the implementation needs
+ to work on groups of "align" bytes. For example, suppose you are
+ implementing BYTWO, without SSE. Then you will be doing the region
+ multiplication in units of 8 bytes, so align = 8. Or, suppose you
+ are doing a Quad table in GF(2^4). You will be doing the region
+ multiplication in units of 2 bytes, so align = 2. Or, suppose you
+ are doing split multiplication with SSE operations in GF(2^8).
+ Then align = 16. Worse yet, suppose you are doing split
+ multiplication with SSE operations in GF(2^16), with or without
+ ALTMAP. Then, you will be doing the multiplication on 256 bits at
+ a time. So align = 32.
+
+ When align does not equal ceil(w/8), we split the region
+ multiplication into three parts. We are going to make s_start be
+ the first address greater than or equal to src that is a multiple
+ of align. s_top is going to be the largest address >= src+bytes
+ such that (s_top - s_start) is a multiple of align. We do the
+   same with d_start and d_top. When we say that "src and dest must
+   be aligned with respect to each other," we mean that s_start-src
+   must equal d_start-dest.
+
+ Now, the region multiplication is done in three parts -- the part
+ between src and s_start must be done using single words.
+ Similarly, the part between s_top and src+bytes must also be done
+ using single words. The part between s_start and s_top will be
+ done in chunks of "align" bytes.
+
+ One final thing -- if align > 16, then s_start and d_start will be
+ aligned on a 16 byte boundary. Perhaps we should have two
+ variables: align and chunksize. Then we'd have s_start & d_start
+ aligned to "align", and have s_top-s_start be a multiple of
+ chunksize. That may be less confusing, but it would be a big
+ change.
- If align = -1, then this is cauchy. You need to make sure that bytes is a multiple of w. */
+ Finally, if align = -1, then we are doing Cauchy multiplication,
+ using only XOR's. In this case, we're not going to care about
+ alignment because we are just doing XOR's. Instead, the only
+ thing we care about is that bytes must be a multiple of w.
+
+ This is not to say that alignment doesn't matter in performance
+ with XOR's. See that discussion in gf_multby_one().
+
+ After you call gf_set_region_data(), the procedure
+ gf_do_initial_region_alignment() calls gf->multiply.w32() on
+ everything between src and s_start. The procedure
+ gf_do_final_region_alignment() calls gf->multiply.w32() on
+ everything between s_top and src+bytes.
+ */
void gf_set_region_data(gf_region_data *rd,
gf_t *gf,
@@ -326,7 +778,7 @@ void gf_set_region_data(gf_region_data *rd,
uint32_t a;
unsigned long uls, uld;
- if (gf == NULL) {
+ if (gf == NULL) { /* JSP - Can be NULL if you're just doing XOR's */
wb = 1;
} else {
h = gf->scratch;
@@ -347,7 +799,7 @@ void gf_set_region_data(gf_region_data *rd,
a = (align <= 16) ? align : 16;
- if (align == -1) { /* This is cauchy. Error check bytes, then set up the pointers
+ if (align == -1) { /* JSP: This is cauchy. Error check bytes, then set up the pointers
so that there are no alignment regions. */
if (bytes % h->w != 0) {
fprintf(stderr, "Error in region multiply operation.\n");
@@ -386,14 +838,14 @@ void gf_set_region_data(gf_region_data *rd,
}
uls %= a;
- if (uls != 0) uls = (align-uls);
+ if (uls != 0) uls = (a-uls);
rd->s_start = rd->src + uls;
rd->d_start = rd->dest + uls;
bytes -= uls;
-
bytes -= (bytes % align);
rd->s_top = rd->s_start + bytes;
rd->d_top = rd->d_start + bytes;
+
}
void gf_do_initial_region_alignment(gf_region_data *rd)
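To make the head/middle/tail split concrete, here is a hypothetical helper
(not part of GF-Complete) that redoes the same arithmetic for align <= 16.
For src = 0x1006, bytes = 100, align = 16 it reports a 10-byte head, an
80-byte aligned middle, and a 10-byte tail:

    #include <stdio.h>
    #include <stdint.h>
    #include <stddef.h>

    static void show_split(uintptr_t src, size_t bytes, size_t align)
    {
      size_t head, middle;

      head = src % align;                 /* bytes until the first boundary */
      if (head != 0) head = align - head;
      middle = (bytes - head) - ((bytes - head) % align);
      printf("head=%zu middle=%zu tail=%zu\n",
             head, middle, bytes - head - middle);
    }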
@@ -413,25 +865,76 @@ void gf_multby_zero(void *dest, int bytes, int xor)
return;
}
+/* JSP - gf_multby_one tries to do this in the most efficient way
+ possible. If xor = 0, then simply call memcpy() since that
+ should be optimized by the system. Otherwise, try to do the xor
+ in the following order:
+
+ If src and dest are aligned with respect to each other on 16-byte
+ boundaries and you have SSE instructions, then use aligned SSE
+ instructions.
+
+ If they aren't but you still have SSE instructions, use unaligned
+ SSE instructions.
+
+ If there are no SSE instructions, but they are aligned with
+ respect to each other on 8-byte boundaries, then do them with
+ uint64_t's.
+
+ Otherwise, call gf_unaligned_xor(), which does the following:
+ align a destination pointer along an 8-byte boundary, and then
+ memcpy 32 bytes at a time from the src pointer to an array of
+ doubles. I'm not sure if that's the best -- probably needs
+ testing, but this seems like it could be a black hole.
+ */
+
+static void gf_unaligned_xor(void *src, void *dest, int bytes);
+
void gf_multby_one(void *src, void *dest, int bytes, int xor)
{
-#ifdef INTEL_SSE4
+#ifdef INTEL_SSE2
__m128i ms, md;
#endif
+ unsigned long uls, uld;
uint8_t *s8, *d8, *dtop8;
uint64_t *s64, *d64, *dtop64;
int abytes;
-
gf_region_data rd;
+
if (!xor) {
memcpy(dest, src, bytes);
return;
}
+ uls = (unsigned long) src;
+ uld = (unsigned long) dest;
-#ifdef INTEL_SSE4
+#ifdef INTEL_SSE2
s8 = (uint8_t *) src;
d8 = (uint8_t *) dest;
- abytes = bytes & 0xfffffff0;
+ if (uls % 16 == uld % 16) {
+ gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16);
+ while (s8 != rd.s_start) {
+ *d8 ^= *s8;
+ d8++;
+ s8++;
+ }
+ while (s8 < (uint8_t *) rd.s_top) {
+ ms = _mm_load_si128 ((__m128i *)(s8));
+ md = _mm_load_si128 ((__m128i *)(d8));
+ md = _mm_xor_si128(md, ms);
+ _mm_store_si128((__m128i *)(d8), md);
+ s8 += 16;
+ d8 += 16;
+ }
+ while (s8 != (uint8_t *) src + bytes) {
+ *d8 ^= *s8;
+ d8++;
+ s8++;
+ }
+ return;
+ }
+
+ abytes = (bytes & 0xfffffff0);
while (d8 < (uint8_t *) dest + abytes) {
ms = _mm_loadu_si128 ((__m128i *)(s8));
@@ -449,8 +952,11 @@ void gf_multby_one(void *src, void *dest, int bytes, int xor)
return;
#endif
- /* If you don't have SSE, you'd better be aligned..... */
-
+ if (uls % 8 != uld % 8) {
+ gf_unaligned_xor(src, dest, bytes);
+ return;
+ }
+
gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 8);
s8 = (uint8_t *) src;
d8 = (uint8_t *) dest;
@@ -480,3 +986,47 @@ void gf_multby_one(void *src, void *dest, int bytes, int xor)
}
return;
}
+
+#define UNALIGNED_BUFSIZE (8)
+
+static void gf_unaligned_xor(void *src, void *dest, int bytes)
+{
+ uint64_t scopy[UNALIGNED_BUFSIZE], *d64;
+ int i;
+ gf_region_data rd;
+ uint8_t *s8, *d8;
+
+ /* JSP - call gf_set_region_data(), but use dest in both places. This is
+ because I only want to set up dest. If I used src, gf_set_region_data()
+ would fail because src and dest are not aligned to each other wrt
+ 8-byte pointers. I know this will actually align d_start to 16 bytes.
+ If I change gf_set_region_data() to split alignment & chunksize, then
+ I could do this correctly. */
+
+ gf_set_region_data(&rd, NULL, dest, dest, bytes, 1, 1, 8*UNALIGNED_BUFSIZE);
+ s8 = (uint8_t *) src;
+ d8 = (uint8_t *) dest;
+
+ while (d8 < (uint8_t *) rd.d_start) {
+ *d8 ^= *s8;
+ d8++;
+ s8++;
+ }
+
+ d64 = (uint64_t *) d8;
+ while (d64 < (uint64_t *) rd.d_top) {
+ memcpy(scopy, s8, 8*UNALIGNED_BUFSIZE);
+ s8 += 8*UNALIGNED_BUFSIZE;
+ for (i = 0; i < UNALIGNED_BUFSIZE; i++) {
+ *d64 ^= scopy[i];
+ d64++;
+ }
+ }
+
+ d8 = (uint8_t *) d64;
+ while (d8 < (uint8_t *) (dest+bytes)) {
+ *d8 ^= *s8;
+ d8++;
+ s8++;
+ }
+}
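Whichever path gf_multby_one() takes -- aligned SSE, unaligned SSE,
uint64_t's, or gf_unaligned_xor() -- the observable result with xor = 1 is a
plain byte-wise XOR of src into dest.  A reference loop to test the fast
paths against (a sketch, not part of the library):

    #include <stdint.h>
    #include <stddef.h>

    static void multby_one_ref(const uint8_t *src, uint8_t *dest, size_t bytes)
    {
      size_t i;
      for (i = 0; i < bytes; i++) dest[i] ^= src[i];   /* dest ^= src */
    }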
diff --git a/gf_54.c b/gf_54.c
deleted file mode 100644
index fc37783..0000000
--- a/gf_54.c
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Multiplies four and five in GF(2^4).
- */
-
-#include <stdio.h>
-#include <stdint.h>
-#include <stdlib.h>
-
-#include "gf_complete.h"
-
-main()
-{
- gf_t gf;
- void *scratch;
- int size;
-
- size = gf_scratch_size(16, GF_MULT_SPLIT_TABLE,
- GF_REGION_SSE | GF_REGION_ALTMAP,
- GF_DIVIDE_DEFAULT,
- 16, 4);
- if (size == -1) exit(1); /* It failed. That shouldn't happen*/
- scratch = (void *) malloc(size);
- if (scratch == NULL) { perror("malloc"); exit(1); }
- if (!gf_init_hard(&gf, 16, GF_MULT_SPLIT_TABLE,
- GF_REGION_SSE | GF_REGION_ALTMAP,
- GF_DIVIDE_DEFAULT,
- 0, 16, 4, NULL, scratch)) exit(1);
- printf("Yo\n");
-}
diff --git a/gf_add.c b/gf_add.c
index 78d443f..545b4b7 100644
--- a/gf_add.c
+++ b/gf_add.c
@@ -16,7 +16,7 @@ void usage(char *s)
fprintf(stderr, " If w has an h on the end, treat a, b and the sum as hexadecimal (no 0x required)\n");
fprintf(stderr, "\n");
fprintf(stderr, " legal w are: 1-32, 64 and 128\n");
- fprintf(stderr, " 128 is hex only (i.e. '128' will be an error - do '128h')\n");
+ fprintf(stderr, " 128 is hex only (i.e. '128' will be an error - do '128h')\n");
if (s != NULL) fprintf(stderr, "%s", s);
exit(1);
diff --git a/gf_complete.h b/gf_complete.h
index ac6688e..de3b753 100644
--- a/gf_complete.h
+++ b/gf_complete.h
@@ -4,22 +4,30 @@
#pragma once
#include <stdint.h>
-#ifdef INTEL_SSE4
-#include <nmmintrin.h>
-#include <emmintrin.h>
-#include <smmintrin.h>
+#ifdef INTEL_SSE4
+ #define INTEL_SSSE3
+ #include <nmmintrin.h>
#endif
-#ifdef INTEL_PCLMUL
-#include <wmmintrin.h>
+#ifdef INTEL_SSSE3
+ #define INTEL_SSE2
+ #include <tmmintrin.h>
#endif
-/* This does either memcpy or xor, depending on "xor" */
+#ifdef INTEL_SSE2
+ #include <emmintrin.h>
+#endif
-extern void gf_multby_one(void *src, void *dest, int bytes, int xor);
+#ifdef INTEL_PCLMUL
+ #include <wmmintrin.h>
+ #ifdef INTEL_SSE4
+ #define INTEL_SSE4_PCLMUL
+ #endif
+ #ifdef INTEL_SSSE3
+ #define INTEL_SSSE3_PCLMUL
+ #endif
+#endif
-#define GF_W128_IS_ZERO(val) (val[0] == 0 && val[1] == 0)
-#define GF_W128_EQUAL(val1, val2) ((val1[0] == val2[0]) && (val1[1] == val2[1]))
/* These are the different ways to perform multiplication.
Not all are implemented for all values of w.
@@ -27,30 +35,30 @@ extern void gf_multby_one(void *src, void *dest, int bytes, int xor);
typedef enum {GF_MULT_DEFAULT,
GF_MULT_SHIFT,
+ GF_MULT_CARRY_FREE,
GF_MULT_GROUP,
GF_MULT_BYTWO_p,
GF_MULT_BYTWO_b,
GF_MULT_TABLE,
GF_MULT_LOG_TABLE,
+ GF_MULT_LOG_ZERO,
+ GF_MULT_LOG_ZERO_EXT,
GF_MULT_SPLIT_TABLE,
GF_MULT_COMPOSITE } gf_mult_type_t;
/* These are the different ways to optimize region
- operations. They are bits because you can compose them:
- You can mix SINGLE/DOUBLE/QUAD, LAZY, SSE/NOSSE, STDMAP/ALTMAP/CAUCHY.
+ operations. They are bits because you can compose them.
Certain optimizations only apply to certain gf_mult_type_t's.
Again, please see documentation for how to use these */
#define GF_REGION_DEFAULT (0x0)
-#define GF_REGION_SINGLE_TABLE (0x1)
-#define GF_REGION_DOUBLE_TABLE (0x2)
-#define GF_REGION_QUAD_TABLE (0x4)
-#define GF_REGION_LAZY (0x8)
-#define GF_REGION_SSE (0x10)
-#define GF_REGION_NOSSE (0x20)
-#define GF_REGION_STDMAP (0x40)
-#define GF_REGION_ALTMAP (0x80)
-#define GF_REGION_CAUCHY (0x100)
+#define GF_REGION_DOUBLE_TABLE (0x1)
+#define GF_REGION_QUAD_TABLE (0x2)
+#define GF_REGION_LAZY (0x4)
+#define GF_REGION_SSE (0x8)
+#define GF_REGION_NOSSE (0x10)
+#define GF_REGION_ALTMAP (0x20)
+#define GF_REGION_CAUCHY (0x40)
typedef uint32_t gf_region_type_t;
@@ -74,6 +82,9 @@ typedef uint32_t gf_val_32_t;
typedef uint64_t gf_val_64_t;
typedef uint64_t *gf_val_128_t;
+extern int _gf_errno;
+extern void gf_error();
+
typedef struct gf *GFP;
typedef union gf_func_a_b {
@@ -109,8 +120,21 @@ typedef struct gf {
void *scratch;
} gf_t;
+/* Initializes the GF to defaults. Pass it a pointer to a gf_t.
+ Returns 0 on failure, 1 on success. */
+
extern int gf_init_easy(GFP gf, int w);
+/* Initializes the GF changing the defaults.
+ Returns 0 on failure, 1 on success.
+ Pass it a pointer to a gf_t.
+   For mult_type and divide_type, use one of the gf_mult_type_t / gf_divide_type_t values.
+ For region_type, OR together the GF_REGION_xxx's defined above.
+ Use 0 as prim_poly for defaults. Otherwise, the leading 1 is optional.
+ Use NULL for scratch_memory to have init_hard allocate memory. Otherwise,
+ use gf_scratch_size() to determine how big scratch_memory has to be.
+ */
+
extern int gf_init_hard(GFP gf,
int w,
int mult_type,
@@ -122,6 +146,9 @@ extern int gf_init_hard(GFP gf,
GFP base_gf,
void *scratch_memory);
+/* Determines the size for scratch_memory.
+ Returns 0 on failure and non-zero on success. */
+
extern int gf_scratch_size(int w,
int mult_type,
int region_type,
@@ -129,25 +156,32 @@ extern int gf_scratch_size(int w,
int arg1,
int arg2);
+/* This reports the gf_scratch_size of a gf_t that has already been created */
+
+extern int gf_size(GFP gf);
+
+/* Frees scratch memory if gf_init_easy/gf_init_hard called malloc.
+ If recursive = 1, then it calls itself recursively on base_gf. */
+
extern int gf_free(GFP gf, int recursive);
/* This is support for inline single multiplications and divisions.
I know it's yucky, but if you've got to be fast, you've got to be fast.
- We'll support inlines for w=4, w=8 and w=16.
+ We support inlining for w=4, w=8 and w=16.
To use inline multiplication and division with w=4 or 8, you should use the
default gf_t, or one with a single table. Otherwise, gf_w4/8_get_mult_table()
- will return NULL. */
+   will return NULL. Similarly, with w=16, the gf_t must be LOG. */
uint8_t *gf_w4_get_mult_table(GFP gf);
uint8_t *gf_w4_get_div_table(GFP gf);
-#define GF_W4_INLINE_MULTDIV(table, a, b) (table[((a)<<4)|b])
+#define GF_W4_INLINE_MULTDIV(table, a, b) (table[((a)<<4)|(b)])
uint8_t *gf_w8_get_mult_table(GFP gf);
uint8_t *gf_w8_get_div_table(GFP gf);
-#define GF_W8_INLINE_MULTDIV(table, a, b) (table[(((uint32_t) a)<<8)|b])
+#define GF_W8_INLINE_MULTDIV(table, a, b) (table[(((uint32_t) (a))<<8)|(b)])
uint16_t *gf_w16_get_log_table(GFP gf);
uint16_t *gf_w16_get_mult_alog_table(GFP gf);
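A minimal sketch of the inline path for w=8, assuming the default gf_t
resolves to a single TABLE on this build (the NULL check covers builds where
it does not):

    #include <stdio.h>
    #include "gf_complete.h"

    int main()
    {
      gf_t gf;
      uint8_t *mt;

      if (!gf_init_easy(&gf, 8)) { gf_error(); return 1; }
      mt = gf_w8_get_mult_table(&gf);
      if (mt == NULL) return 1;            /* not a single-table gf_t */
      printf("3 * 7 = %u in GF(2^8)\n",
             GF_W8_INLINE_MULTDIV(mt, 3, 7));
      return 0;
    }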
diff --git a/gf_example_5.c b/gf_example_5.c
new file mode 100644
index 0000000..3e303a3
--- /dev/null
+++ b/gf_example_5.c
@@ -0,0 +1,73 @@
+/*
+ * gf_example_5.c
+ *
+ * Demonstrating altmap and extract_word
+ */
+
+#include <stdio.h>
+#include <getopt.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include "gf_complete.h"
+#include "gf_rand.h"
+
+void usage(char *s)
+{
+ fprintf(stderr, "usage: gf_example_5\n");
+ exit(1);
+}
+
+int main(int argc, char **argv)
+{
+ uint16_t *a, *b;
+ int i, j;
+ gf_t gf;
+
+ if (gf_init_hard(&gf, 16, GF_MULT_SPLIT_TABLE, GF_REGION_ALTMAP, GF_DIVIDE_DEFAULT,
+ 0, 16, 4, NULL, NULL) == 0) {
+ fprintf(stderr, "gf_init_hard failed\n");
+ exit(1);
+ }
+
+ a = (uint16_t *) malloc(200);
+ b = (uint16_t *) malloc(200);
+
+ a += 6;
+ b += 6;
+
+ MOA_Seed(0);
+
+ for (i = 0; i < 30; i++) a[i] = MOA_Random_W(16, 1);
+
+ gf.multiply_region.w32(&gf, a, b, 0x1234, 30*2, 0);
+
+ printf("a: 0x%lx b: 0x%lx\n", (unsigned long) a, (unsigned long) b);
+
+ for (i = 0; i < 30; i += 10) {
+ printf("\n");
+ printf(" ");
+ for (j = 0; j < 10; j++) printf(" %4d", i+j);
+ printf("\n");
+
+ printf("a:");
+ for (j = 0; j < 10; j++) printf(" %04x", a[i+j]);
+ printf("\n");
+
+ printf("b:");
+ for (j = 0; j < 10; j++) printf(" %04x", b[i+j]);
+ printf("\n");
+ printf("\n");
+ }
+
+ for (i = 0; i < 15; i ++) {
+ printf("Word %2d: 0x%04x * 0x1234 = 0x%04x ", i,
+ gf.extract_word.w32(&gf, a, 30*2, i),
+ gf.extract_word.w32(&gf, b, 30*2, i));
+ printf("Word %2d: 0x%04x * 0x1234 = 0x%04x\n", i+15,
+ gf.extract_word.w32(&gf, a, 30*2, i+15),
+ gf.extract_word.w32(&gf, b, 30*2, i+15));
+ }
+}
diff --git a/gf_example_6.c b/gf_example_6.c
new file mode 100644
index 0000000..86dda11
--- /dev/null
+++ b/gf_example_6.c
@@ -0,0 +1,79 @@
+/*
+ * gf_example_6.c
+ *
+ * Demonstrating altmap and extract_word
+ */
+
+#include <stdio.h>
+#include <getopt.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include "gf_complete.h"
+#include "gf_rand.h"
+
+void usage(char *s)
+{
+ fprintf(stderr, "usage: gf_example_6\n");
+ exit(1);
+}
+
+int main(int argc, char **argv)
+{
+ uint32_t *a, *b;
+ int i, j;
+ gf_t gf, gf_16;
+
+ if (gf_init_hard(&gf_16, 16, GF_MULT_LOG_TABLE, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT,
+ 0, 0, 0, NULL, NULL) == 0) {
+ fprintf(stderr, "gf_init_hard (6) failed\n");
+ exit(1);
+ }
+
+ if (gf_init_hard(&gf, 32, GF_MULT_COMPOSITE, GF_REGION_ALTMAP, GF_DIVIDE_DEFAULT,
+ 0, 2, 0, &gf_16, NULL) == 0) {
+ fprintf(stderr, "gf_init_hard (32) failed\n");
+ exit(1);
+ }
+
+ a = (uint32_t *) malloc(200);
+ b = (uint32_t *) malloc(200);
+
+ a += 3;
+ b += 3;
+
+ MOA_Seed(0);
+
+ for (i = 0; i < 30; i++) a[i] = MOA_Random_W(32, 1);
+
+ gf.multiply_region.w32(&gf, a, b, 0x12345678, 30*4, 0);
+
+ printf("a: 0x%lx b: 0x%lx\n", (unsigned long) a, (unsigned long) b);
+
+ for (i = 0; i < 30; i += 10) {
+ printf("\n");
+ printf(" ");
+ for (j = 0; j < 10; j++) printf(" %8d", i+j);
+ printf("\n");
+
+ printf("a:");
+ for (j = 0; j < 10; j++) printf(" %08x", a[i+j]);
+ printf("\n");
+
+ printf("b:");
+ for (j = 0; j < 10; j++) printf(" %08x", b[i+j]);
+ printf("\n");
+ printf("\n");
+ }
+
+ for (i = 0; i < 15; i ++) {
+ printf("Word %2d: 0x%08x * 0x12345678 = 0x%08x ", i,
+ gf.extract_word.w32(&gf, a, 30*4, i),
+ gf.extract_word.w32(&gf, b, 30*4, i));
+ printf("Word %2d: 0x%08x * 0x12345678 = 0x%08x\n", i+15,
+ gf.extract_word.w32(&gf, a, 30*4, i+15),
+ gf.extract_word.w32(&gf, b, 30*4, i+15));
+ }
+}
diff --git a/gf_example_7.c b/gf_example_7.c
new file mode 100644
index 0000000..445ae20
--- /dev/null
+++ b/gf_example_7.c
@@ -0,0 +1,70 @@
+/*
+ * gf_example_7.c
+ *
+ * Demonstrating extract_word and Cauchy
+ */
+
+#include <stdio.h>
+#include <getopt.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include "gf_complete.h"
+#include "gf_rand.h"
+
+void usage(char *s)
+{
+ fprintf(stderr, "usage: gf_example_7\n");
+ exit(1);
+}
+
+int main(int argc, char **argv)
+{
+ uint8_t *a, *b;
+ int i, j;
+ gf_t gf;
+
+ if (gf_init_hard(&gf, 3, GF_MULT_TABLE, GF_REGION_CAUCHY, GF_DIVIDE_DEFAULT, 0, 0, 0, NULL, NULL) == 0) {
+ fprintf(stderr, "gf_init_hard failed\n");
+ exit(1);
+ }
+
+ a = (uint8_t *) malloc(3);
+ b = (uint8_t *) malloc(3);
+
+ MOA_Seed(0);
+
+ for (i = 0; i < 3; i++) a[i] = MOA_Random_W(8, 1);
+
+ gf.multiply_region.w32(&gf, a, b, 5, 3, 0);
+
+ printf("a: 0x%lx b: 0x%lx\n", (unsigned long) a, (unsigned long) b);
+
+ printf("\n");
+ printf("a: 0x%02x 0x%02x 0x%02x\n", a[0], a[1], a[2]);
+ printf("b: 0x%02x 0x%02x 0x%02x\n", b[0], b[1], b[2]);
+ printf("\n");
+
+ printf("a bits:");
+ for (i = 0; i < 3; i++) {
+ printf(" ");
+ for (j = 7; j >= 0; j--) printf("%c", (a[i] & (1 << j)) ? '1' : '0');
+ }
+ printf("\n");
+
+ printf("b bits:");
+ for (i = 0; i < 3; i++) {
+ printf(" ");
+ for (j = 7; j >= 0; j--) printf("%c", (b[i] & (1 << j)) ? '1' : '0');
+ }
+ printf("\n");
+
+ printf("\n");
+ for (i = 0; i < 8; i++) {
+ printf("Word %2d: %d * 5 = %d\n", i,
+ gf.extract_word.w32(&gf, a, 3, i),
+ gf.extract_word.w32(&gf, b, 3, i));
+ }
+}
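What extract_word is undoing here, as far as the example's output suggests:
with -r CAUCHY, bytes = 3 and w = 3, the region is three one-byte
bit-planes, and word i is assembled from bit i of each plane.  A reference
sketch of that assumption:

    #include <stdint.h>

    static int cauchy_word(uint8_t *buf, int i)
    {
      return  ((buf[0] >> i) & 1)         /* bit 0 of word i */
           | (((buf[1] >> i) & 1) << 1)   /* bit 1 */
           | (((buf[2] >> i) & 1) << 2);  /* bit 2 */
    }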
diff --git a/gf_general.c b/gf_general.c
index ac0c236..02efdc7 100644
--- a/gf_general.c
+++ b/gf_general.c
@@ -95,12 +95,20 @@ void gf_general_set_random(gf_general_t *v, int w, int zero_ok)
}
}
-void gf_general_val_to_s(gf_general_t *v, int w, char *s)
+void gf_general_val_to_s(gf_general_t *v, int w, char *s, int hex)
{
if (w <= 32) {
- sprintf(s, "%x", v->w32);
+ if (hex) {
+ sprintf(s, "%x", v->w32);
+ } else {
+ sprintf(s, "%d", v->w32);
+ }
} else if (w <= 64) {
- sprintf(s, "%llx", (long long unsigned int) v->w64);
+ if (hex) {
+ sprintf(s, "%llx", (long long unsigned int) v->w64);
+ } else {
+ sprintf(s, "%lld", (long long unsigned int) v->w64);
+ }
} else {
if (v->w128[0] == 0) {
sprintf(s, "%llx", (long long unsigned int) v->w128[1]);
@@ -111,6 +119,64 @@ void gf_general_val_to_s(gf_general_t *v, int w, char *s)
}
}
+int gf_general_s_to_val(gf_general_t *v, int w, char *s, int hex)
+{
+ int l;
+ int save;
+
+ if (w <= 32) {
+ if (hex) {
+ if (sscanf(s, "%x", &(v->w32)) == 0) return 0;
+ } else {
+ if (sscanf(s, "%d", &(v->w32)) == 0) return 0;
+ }
+ if (w == 32) return 1;
+ if (w == 31) {
+ if (v->w32 & (1 << 31)) return 0;
+ return 1;
+ }
+ if (v->w32 & ~((1 << w)-1)) return 0;
+ return 1;
+ } else if (w <= 64) {
+ if (hex) return (sscanf(s, "%llx", &(v->w64)) == 1);
+ return (sscanf(s, "%lld", &(v->w64)) == 1);
+ } else {
+ if (!hex) return 0;
+ l = strlen(s);
+ if (l <= 16) {
+ v->w128[0] = 0;
+ return (sscanf(s, "%llx", &(v->w128[1])) == 1);
+ } else {
+ if (l > 32) return 0;
+ save = s[l-16];
+ s[l-16] = '\0';
+ if (sscanf(s, "%llx", &(v->w128[0])) == 0) {
+ s[l-16] = save;
+ return 0;
+ }
+      s[l-16] = save;   /* restore the NUL'd byte, or the next sscanf reads "" */
+      return (sscanf(s+(l-16), "%llx", &(v->w128[1])) == 1);
+ }
+ }
+}
+
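With the restore fix above, strings longer than 16 hex digits split cleanly:
the leading digits land in w128[0] and the trailing 16 in w128[1].  A quick
sketch:

    #include <stdio.h>
    #include "gf_complete.h"
    #include "gf_general.h"

    int main()
    {
      gf_general_t v;
      char s[] = "123456789abcdef0fedcba9876543210";

      if (gf_general_s_to_val(&v, 128, s, 1)) {
        printf("%llx %llx\n",
               (long long unsigned int) v.w128[0],
               (long long unsigned int) v.w128[1]);
      }
      return 0;
    }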
+void gf_general_add(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c)
+{
+ gf_internal_t *h;
+ int w;
+
+ h = (gf_internal_t *) gf->scratch;
+ w = h->w;
+
+ if (w <= 32) {
+ c->w32 = a->w32 ^ b->w32;
+ } else if (w <= 64) {
+ c->w64 = a->w64 ^ b->w64;
+ } else {
+ c->w128[0] = a->w128[0] ^ b->w128[0];
+ c->w128[1] = a->w128[1] ^ b->w128[1];
+ }
+}
+
void gf_general_multiply(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c)
{
gf_internal_t *h;
@@ -229,19 +295,19 @@ void gf_general_do_region_check(gf_t *gf, gf_general_t *a, void *orig_a, void *o
if (!gf_general_are_equal(&ft, &sb, w)) {
- printf("Problem with region multiply (all values in hex):\n");
- printf(" Target address base: 0x%lx. Word 0x%x of 0x%x. Xor: %d\n",
+ fprintf(stderr,"Problem with region multiply (all values in hex):\n");
+ fprintf(stderr," Target address base: 0x%lx. Word 0x%x of 0x%x. Xor: %d\n",
(unsigned long) final_target, i, words, xor);
- gf_general_val_to_s(a, w, sa);
- gf_general_val_to_s(&oa, w, soa);
- gf_general_val_to_s(&ot, w, sot);
- gf_general_val_to_s(&ft, w, sft);
- gf_general_val_to_s(&sb, w, ssb);
- printf(" Value: %s\n", sa);
- printf(" Original source word: %s\n", soa);
- if (xor) printf(" XOR with target word: %s\n", sot);
- printf(" Product word: %s\n", sft);
- printf(" It should be: %s\n", ssb);
+ gf_general_val_to_s(a, w, sa, 1);
+ gf_general_val_to_s(&oa, w, soa, 1);
+ gf_general_val_to_s(&ot, w, sot, 1);
+ gf_general_val_to_s(&ft, w, sft, 1);
+ gf_general_val_to_s(&sb, w, ssb, 1);
+ fprintf(stderr," Value: %s\n", sa);
+ fprintf(stderr," Original source word: %s\n", soa);
+ if (xor) fprintf(stderr," XOR with target word: %s\n", sot);
+ fprintf(stderr," Product word: %s\n", sft);
+ fprintf(stderr," It should be: %s\n", ssb);
exit(0);
}
}
@@ -251,7 +317,7 @@ void gf_general_set_up_single_timing_test(int w, void *ra, void *rb, int size)
{
void *top;
gf_general_t g;
- uint8_t *r8;
+ uint8_t *r8, *r8a;
uint16_t *r16;
uint32_t *r32;
uint64_t *r64;
@@ -263,6 +329,8 @@ void gf_general_set_up_single_timing_test(int w, void *ra, void *rb, int size)
However, don't allow for zeros in rb, because that will screw up
division.
+ When w is 4, you fill the regions with random 4-bit words in each byte.
+
Otherwise, treat every four bytes as an uint32_t
and fill it with a random value mod (1 << w).
*/
@@ -296,6 +364,17 @@ void gf_general_set_up_single_timing_test(int w, void *ra, void *rb, int size)
}
rb += (w/8);
}
+ } else if (w == 4) {
+ r8a = (uint8_t *) ra;
+ r8 = (uint8_t *) rb;
+ while (r8 < (uint8_t *) top) {
+ gf_general_set_random(&g, w, 1);
+ *r8a = g.w32;
+ gf_general_set_random(&g, w, 0);
+ *r8 = g.w32;
+ r8a++;
+ r8++;
+ }
} else {
r32 = (uint32_t *) ra;
for (i = 0; i < size/4; i++) r32[i] = MOA_Random_W(w, 1);
@@ -306,7 +385,7 @@ void gf_general_set_up_single_timing_test(int w, void *ra, void *rb, int size)
/* This sucks, but in order to time, you really need to avoid putting ifs in
the inner loops. So, I'm doing a separate timing test for each w:
- 8, 16, 32, 64, 128 and everything else. Fortunately, the "everything else"
+ (4 & 8), 16, 32, 64, 128 and everything else. Fortunately, the "everything else"
tests can be equivalent to w=32.
I'm also putting the results back into ra, because otherwise, the optimizer might
@@ -327,7 +406,7 @@ int gf_general_do_single_timing_test(gf_t *gf, void *ra, void *rb, int size, cha
w = h->w;
top = ra + size;
- if (w == 8) {
+ if (w == 8 || w == 4) {
r8a = (uint8_t *) ra;
r8b = (uint8_t *) rb;
top8 = (uint8_t *) top;
diff --git a/gf_general.h b/gf_general.h
index 0848f36..b257348 100644
--- a/gf_general.h
+++ b/gf_general.h
@@ -32,10 +32,12 @@ int gf_general_is_zero(gf_general_t *v, int w);
int gf_general_is_one(gf_general_t *v, int w);
int gf_general_are_equal(gf_general_t *v1, gf_general_t *v2, int w);
-void gf_general_val_to_s(gf_general_t *v, int w, char *s);
+void gf_general_val_to_s(gf_general_t *v, int w, char *s, int hex);
+int gf_general_s_to_val(gf_general_t *v, int w, char *s, int hex);
void gf_general_set_random(gf_general_t *v, int w, int zero_ok);
+void gf_general_add(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c);
void gf_general_multiply(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c);
void gf_general_divide(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c);
void gf_general_inverse(gf_t *gf, gf_general_t *a, gf_general_t *b);
diff --git a/gf_inline_time.c b/gf_inline_time.c
index d52c814..55709cd 100644
--- a/gf_inline_time.c
+++ b/gf_inline_time.c
@@ -9,6 +9,7 @@
#include <string.h>
#include <stdlib.h>
#include <time.h>
+#include <sys/time.h>
#include "gf_complete.h"
#include "gf_rand.h"
diff --git a/gf_int.h b/gf_int.h
index bd544bc..bdff2a2 100644
--- a/gf_int.h
+++ b/gf_int.h
@@ -51,11 +51,15 @@ extern int gf_wgen_scratch_size(int w, int mult_type, int region_type, int divid
void gf_wgen_cauchy_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor);
gf_val_32_t gf_wgen_extract_word(gf_t *gf, void *start, int bytes, int index);
-
extern void gf_alignment_error(char *s, int a);
extern uint32_t gf_bitmatrix_inverse(uint32_t y, int w, uint32_t pp);
+/* This returns the correct default for prim_poly when base is used as the base
+ field for COMPOSITE. It returns 0 if we don't have a default prim_poly. */
+
+extern uint64_t gf_composite_get_default_poly(gf_t *base);
+
/* This structure lets you define a region multiply. It helps because you can handle
unaligned portions of the data with the procedures below, which really cleans
up the code. */
@@ -96,3 +100,97 @@ extern void gf_do_final_region_alignment(gf_region_data *rd);
extern void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base);
extern void gf_multby_zero(void *dest, int bytes, int xor);
+extern void gf_multby_one(void *src, void *dest, int bytes, int xor);
+
+typedef enum {GF_E_MDEFDIV, /* Div != Default && Mult == Default */
+ GF_E_MDEFREG, /* Reg != Default && Mult == Default */
+ GF_E_MDEFARG, /* Args != Default && Mult == Default */
+ GF_E_DIVCOMP, /* Mult == Composite && Div != Default */
+ GF_E_CAUCOMP, /* Mult == Composite && Reg == CAUCHY */
+ GF_E_DOUQUAD, /* Reg == DOUBLE && Reg == QUAD */
+ GF_E_SSE__NO, /* Reg == SSE && Reg == NOSSE */
+ GF_E_CAUCHYB, /* Reg == CAUCHY && Other Reg */
+              GF_E_CAUGT32, /* Reg == CAUCHY && w > 32 */
+ GF_E_ARG1SET, /* Arg1 != 0 && Mult \notin COMPOSITE/SPLIT/GROUP */
+ GF_E_ARG2SET, /* Arg2 != 0 && Mult \notin SPLIT/GROUP */
+ GF_E_MATRIXW, /* Div == MATRIX && w > 32 */
+ GF_E_BAD___W, /* Illegal w */
+ GF_E_DOUBLET, /* Reg == DOUBLE && Mult != TABLE */
+ GF_E_DOUBLEW, /* Reg == DOUBLE && w \notin {4,8} */
+ GF_E_DOUBLEJ, /* Reg == DOUBLE && other Reg */
+              GF_E_DOUBLEL, /* Reg == DOUBLE && LAZY but w == 4 */
+ GF_E_QUAD__T, /* Reg == QUAD && Mult != TABLE */
+ GF_E_QUAD__W, /* Reg == QUAD && w != 4 */
+ GF_E_QUAD__J, /* Reg == QUAD && other Reg */
+              GF_E_LAZY__X, /* Reg == LAZY && not DOUBLE or QUAD */
+ GF_E_ALTSHIF, /* Mult == Shift && Reg == ALTMAP */
+ GF_E_SSESHIF, /* Mult == Shift && Reg == SSE|NOSSE */
+ GF_E_ALT_CFM, /* Mult == CARRY_FREE && Reg == ALTMAP */
+ GF_E_SSE_CFM, /* Mult == CARRY_FREE && Reg == SSE|NOSSE */
+ GF_E_PCLMULX, /* Mult == Carry_Free && No PCLMUL */
+ GF_E_ALT_BY2, /* Mult == Bytwo_x && Reg == ALTMAP */
+ GF_E_BY2_SSE, /* Mult == Bytwo_x && Reg == SSE && No SSE2 */
+ GF_E_LOGBADW, /* Mult == LOGx, w too big*/
+ GF_E_LOG___J, /* Mult == LOGx, && Reg == SSE|ALTMAP|NOSSE */
+ GF_E_ZERBADW, /* Mult == LOG_ZERO, w \notin {8,16} */
+ GF_E_ZEXBADW, /* Mult == LOG_ZERO_EXT, w != 8 */
+ GF_E_LOGPOLY, /* Mult == LOG & poly not primitive */
+ GF_E_GR_ARGX, /* Mult == GROUP, Bad arg1/2 */
+ GF_E_GR_W_48, /* Mult == GROUP, w \in { 4, 8 } */
+ GF_E_GR_W_16, /* Mult == GROUP, w == 16, arg1 != 4 || arg2 != 4 */
+ GF_E_GR_128A, /* Mult == GROUP, w == 128, bad args */
+ GF_E_GR_SSE4, /* Mult == GROUP, w == 128, No SSE4 */
+ GF_E_GR_A_27, /* Mult == GROUP, either arg > 27 */
+ GF_E_GR_AR_W, /* Mult == GROUP, either arg > w */
+ GF_E_GR____J, /* Mult == GROUP, Reg == SSE|ALTMAP|NOSSE */
+ GF_E_TABLE_W, /* Mult == TABLE, w too big */
+ GF_E_TAB_SSE, /* Mult == TABLE, SSE|NOSSE only apply to w == 4 */
+ GF_E_TABSSE3, /* Mult == TABLE, Need SSSE3 for SSE */
+ GF_E_TAB_ALT, /* Mult == TABLE, Reg == ALTMAP */
+ GF_E_SP128AR, /* Mult == SPLIT, w=128, Bad arg1/arg2 */
+ GF_E_SP128AL, /* Mult == SPLIT, w=128, SSE requires ALTMAP */
+ GF_E_SP128AS, /* Mult == SPLIT, w=128, ALTMAP requires SSE */
+ GF_E_SP128_A, /* Mult == SPLIT, w=128, SSE only with 4/128 */
+ GF_E_SP128_S, /* Mult == SPLIT, w=128, ALTMAP only with 4/128 */
+ GF_E_SPLIT_W, /* Mult == SPLIT, Bad w (8, 16, 32, 64, 128) */
+ GF_E_SP_16AR, /* Mult == SPLIT, w=16, Bad arg1/arg2 */
+ GF_E_SP_16_A, /* Mult == SPLIT, w=16, ALTMAP only with 4/16 */
+ GF_E_SP_16_S, /* Mult == SPLIT, w=16, SSE only with 4/16 */
+ GF_E_SP_32AR, /* Mult == SPLIT, w=32, Bad arg1/arg2 */
+ GF_E_SP_32AS, /* Mult == SPLIT, w=32, ALTMAP requires SSE */
+ GF_E_SP_32_A, /* Mult == SPLIT, w=32, ALTMAP only with 4/32 */
+ GF_E_SP_32_S, /* Mult == SPLIT, w=32, SSE only with 4/32 */
+ GF_E_SP_64AR, /* Mult == SPLIT, w=64, Bad arg1/arg2 */
+ GF_E_SP_64AS, /* Mult == SPLIT, w=64, ALTMAP requires SSE */
+ GF_E_SP_64_A, /* Mult == SPLIT, w=64, ALTMAP only with 4/64 */
+ GF_E_SP_64_S, /* Mult == SPLIT, w=64, SSE only with 4/64 */
+ GF_E_SP_8_AR, /* Mult == SPLIT, w=8, Bad arg1/arg2 */
+ GF_E_SP_8__A, /* Mult == SPLIT, w=8, no ALTMAP */
+ GF_E_SP_SSE3, /* Mult == SPLIT, Need SSSE3 for SSE */
+ GF_E_COMP_A2, /* Mult == COMP, arg1 must be = 2 */
+ GF_E_COMP_SS, /* Mult == COMP, SSE|NOSSE */
+ GF_E_COMP__W, /* Mult == COMP, Bad w. */
+ GF_E_UNKFLAG, /* Unknown flag in create_from.... */
+ GF_E_UNKNOWN, /* Unknown mult_type. */
+ GF_E_UNK_REG, /* Unknown region_type. */
+ GF_E_UNK_DIV, /* Unknown divide_type. */
+ GF_E_CFM___W, /* Mult == CFM, Bad w. */
+ GF_E_CFM4POL, /* Mult == CFM & Prim Poly has high bits set. */
+ GF_E_CFM8POL, /* Mult == CFM & Prim Poly has high bits set. */
+ GF_E_CF16POL, /* Mult == CFM & Prim Poly has high bits set. */
+ GF_E_CF32POL, /* Mult == CFM & Prim Poly has high bits set. */
+ GF_E_CF64POL, /* Mult == CFM & Prim Poly has high bits set. */
+ GF_E_FEWARGS, /* Too few args in argc/argv. */
+ GF_E_BADPOLY, /* Bad primitive polynomial -- too many bits set. */
+ GF_E_COMP_PP, /* Bad primitive polynomial -- bigger than sub-field. */
+ GF_E_COMPXPP, /* Can't derive a default pp for composite field. */
+ GF_E_BASE__W, /* Composite -- Base field is the wrong size. */
+ GF_E_TWOMULT, /* In create_from... two -m's. */
+ GF_E_TWO_DIV, /* In create_from... two -d's. */
+              GF_E_POLYSPC, /* Bad number after -p. */
+              GF_E_SPLITAR, /* Ran out of arguments in SPLIT. */
+              GF_E_SPLITNU, /* Arguments not integers in SPLIT. */
+              GF_E_GROUPAR, /* Ran out of arguments in GROUP. */
+              GF_E_GROUPNU, /* Arguments not integers in GROUP. */
+ GF_E_DEFAULT } gf_error_type_t;
+
diff --git a/gf_method.c b/gf_method.c
index f65c4e3..bc9bd35 100644
--- a/gf_method.c
+++ b/gf_method.c
@@ -11,179 +11,172 @@
#include <time.h>
#include "gf_complete.h"
+#include "gf_int.h"
#include "gf_method.h"
-void methods_to_stderr()
-{
- fprintf(stderr, "To specify the methods, do one of the following: \n");
- fprintf(stderr, " - leave empty to use defaults\n");
- fprintf(stderr, " - use a single dash to use defaults\n");
- fprintf(stderr, " - specify MULTIPLY REGION DIVIDE\n");
- fprintf(stderr, "\n");
- fprintf(stderr, "Legal values of MULTIPLY:\n");
- fprintf(stderr, " SHIFT: shift\n");
- fprintf(stderr, " GROUP g_mult g_reduce: the Group technique - see the paper\n");
- fprintf(stderr, " BYTWO_p: BYTWO doubling the product.\n");
- fprintf(stderr, " BYTWO_b: BYTWO doubling b (more efficient thatn BYTWO_p)\n");
- fprintf(stderr, " TABLE: Full multiplication table\n");
- fprintf(stderr, " LOG: Discrete logs\n");
- fprintf(stderr, " LOG_ZERO: Discrete logs with a large table for zeros\n");
- fprintf(stderr, " LOG_ZERO_EXT: Discrete logs with an extra large table for zeros\n");
- fprintf(stderr, " SPLIT g_a g_b: Split tables defined by g_a and g_b\n");
- fprintf(stderr, " COMPOSITE k rec METHOD: Composite field. GF((2^l)^k), l=w/k.\n");
- fprintf(stderr, " rec = 0 means inline single multiplication\n");
- fprintf(stderr, " rec = 1 means recursive single multiplication\n");
- fprintf(stderr, " METHOD is the method of the base field in GF(2^l)\n");
- fprintf(stderr, "\n");
- fprintf(stderr, "Legal values of REGION: Specify multiples with commas e.g. 'DOUBLE,LAZY'\n");
- fprintf(stderr, " -: Use defaults\n");
- fprintf(stderr, " SINGLE/DOUBLE/QUAD: Expand tables\n");
- fprintf(stderr, " LAZY: Lazily create table (only applies to TABLE and SPLIT)\n");
- fprintf(stderr, " SSE/NOSSE: Use 128-bit SSE instructions if you can\n");
- fprintf(stderr, " CAUCHY/ALTMAP/STDMAP: Use different memory mappings\n");
- fprintf(stderr, "\n");
- fprintf(stderr, "Legal values of DIVIDE:\n");
- fprintf(stderr, " -: Use defaults\n");
- fprintf(stderr, " MATRIX: Use matrix inversion\n");
- fprintf(stderr, " EUCLID: Use the extended Euclidian algorithm.\n");
- fprintf(stderr, "\n");
- fprintf(stderr, "See the user's manual for more information.\n");
- fprintf(stderr, "There are many restrictions, so it is better to simply use defaults in most cases.\n");
-}
-
int create_gf_from_argv(gf_t *gf, int w, int argc, char **argv, int starting)
{
int mult_type, divide_type, region_type;
- uint32_t prim_poly = 0;
int arg1, arg2, subrg_size;
+ uint64_t prim_poly;
gf_t *base;
char *crt, *x, *y;
- if (argc <= starting || strcmp(argv[starting], "-") == 0) {
- if (!gf_init_easy(gf, w)) return 0;
- return (argc <= starting) ? starting : starting+1;
- }
-
+ mult_type = GF_MULT_DEFAULT;
region_type = GF_REGION_DEFAULT;
divide_type = GF_DIVIDE_DEFAULT;
-
- arg1 = 0;
- arg2 = 0;
prim_poly = 0;
base = NULL;
- subrg_size = 0;
-
- if (argc < starting+3) return 0;
-
- if (strcmp(argv[starting], "SHIFT") == 0) {
- mult_type = GF_MULT_SHIFT;
- starting++;
- } else if (strcmp(argv[starting], "GROUP") == 0) {
- mult_type = GF_MULT_GROUP;
- if (argc < starting+5) return 0;
- if (sscanf(argv[starting+1], "%d", &arg1) == 0 ||
- sscanf(argv[starting+2], "%d", &arg2) == 0 ||
- arg1 <= 0 || arg2 <= 0 || arg1 >= w || arg2 >= w) return 0;
- starting += 3;
- } else if (strcmp(argv[starting], "BYTWO_p") == 0) {
- mult_type = GF_MULT_BYTWO_p;
- starting++;
- } else if (strcmp(argv[starting], "BYTWO_b") == 0) {
- mult_type = GF_MULT_BYTWO_b;
- starting++;
- } else if (strcmp(argv[starting], "TABLE") == 0) {
- mult_type = GF_MULT_TABLE;
- starting++;
- } else if (strcmp(argv[starting], "LOG") == 0) {
- mult_type = GF_MULT_LOG_TABLE;
- starting++;
- } else if (strcmp(argv[starting], "LOG_ZERO") == 0) {
- mult_type = GF_MULT_LOG_TABLE;
- arg1 = 1;
- starting++;
- } else if (strcmp(argv[starting], "LOG_ZERO_EXT") == 0) {
- mult_type = GF_MULT_LOG_TABLE;
- arg1 = 2;
- starting++;
- } else if (strcmp(argv[starting], "SPLIT") == 0) {
- mult_type = GF_MULT_SPLIT_TABLE;
- if (argc < starting+5) return 0;
- if (sscanf(argv[starting+1], "%d", &arg1) == 0 ||
- sscanf(argv[starting+2], "%d", &arg2) == 0 ||
- arg1 <= 0 || arg2 <= 0 || w % arg1 != 0 || w % arg2 != 0) return 0;
- starting += 3;
- } else if (strcmp(argv[starting], "COMPOSITE") == 0) {
- mult_type = GF_MULT_COMPOSITE;
- if (argc < starting+6) return 0;
- if (sscanf(argv[starting+1], "%d", &arg1) == 0 ||
- sscanf(argv[starting+2], "%d", &arg2) == 0 ||
- arg1 <= 1 || w %arg1 != 0 || ((arg2 | 1) != 1)) return 0;
- base = (gf_t *) malloc(sizeof(gf_t));
- starting = create_gf_from_argv(base, w/arg1, argc, argv, starting+3);
- if (starting == 0) { free(base); return 0; }
- } else {
- return 0;
- }
-
- if (argc < starting+2) {
- if (base != NULL) gf_free(base, 1);
- return 0;
- }
-
- if (strcmp(argv[starting], "-") == 0) {
- region_type = GF_REGION_DEFAULT;
- } else {
- crt = strdup(argv[starting]);
- region_type = 0;
- x = crt;
- do {
- y = strchr(x, ',');
- if (y != NULL) *y = '\0';
- if (strcmp(x, "DOUBLE") == 0) {
- region_type |= GF_REGION_DOUBLE_TABLE;
- } else if (strcmp(x, "QUAD") == 0) {
- region_type |= GF_REGION_QUAD_TABLE;
- } else if (strcmp(x, "SINGLE") == 0) {
- region_type |= GF_REGION_SINGLE_TABLE;
- } else if (strcmp(x, "LAZY") == 0) {
- region_type |= GF_REGION_LAZY;
- } else if (strcmp(x, "SSE") == 0) {
- region_type |= GF_REGION_SSE;
- } else if (strcmp(x, "NOSSE") == 0) {
- region_type |= GF_REGION_NOSSE;
- } else if (strcmp(x, "CAUCHY") == 0) {
- region_type |= GF_REGION_CAUCHY;
- } else if (strcmp(x, "ALTMAP") == 0) {
- region_type |= GF_REGION_ALTMAP;
- } else if (strcmp(x, "STDMAP") == 0) {
- region_type |= GF_REGION_STDMAP;
+ arg1 = 0;
+ arg2 = 0;
+ while (1) {
+ if (argc > starting) {
+ if (strcmp(argv[starting], "-m") == 0) {
+ starting++;
+ if (mult_type != GF_MULT_DEFAULT) {
+ if (base != NULL) gf_free(base, 1);
+ _gf_errno = GF_E_TWOMULT;
+ return 0;
+ }
+ if (strcmp(argv[starting], "SHIFT") == 0) {
+ mult_type = GF_MULT_SHIFT;
+ starting++;
+ } else if (strcmp(argv[starting], "CARRY_FREE") == 0) {
+ mult_type = GF_MULT_CARRY_FREE;
+ starting++;
+ } else if (strcmp(argv[starting], "GROUP") == 0) {
+ mult_type = GF_MULT_GROUP;
+ if (argc < starting + 3) {
+ _gf_errno = GF_E_GROUPAR;
+ return 0;
+ }
+ if (sscanf(argv[starting+1], "%d", &arg1) == 0 ||
+ sscanf(argv[starting+2], "%d", &arg2) == 0) {
+ _gf_errno = GF_E_GROUPNU;
+ return 0;
+ }
+ starting += 3;
+ } else if (strcmp(argv[starting], "BYTWO_p") == 0) {
+ mult_type = GF_MULT_BYTWO_p;
+ starting++;
+ } else if (strcmp(argv[starting], "BYTWO_b") == 0) {
+ mult_type = GF_MULT_BYTWO_b;
+ starting++;
+ } else if (strcmp(argv[starting], "TABLE") == 0) {
+ mult_type = GF_MULT_TABLE;
+ starting++;
+ } else if (strcmp(argv[starting], "LOG") == 0) {
+ mult_type = GF_MULT_LOG_TABLE;
+ starting++;
+ } else if (strcmp(argv[starting], "LOG_ZERO") == 0) {
+ mult_type = GF_MULT_LOG_ZERO;
+ starting++;
+ } else if (strcmp(argv[starting], "LOG_ZERO_EXT") == 0) {
+ mult_type = GF_MULT_LOG_ZERO_EXT;
+ starting++;
+ } else if (strcmp(argv[starting], "SPLIT") == 0) {
+ mult_type = GF_MULT_SPLIT_TABLE;
+ if (argc < starting + 3) {
+ _gf_errno = GF_E_SPLITAR;
+ return 0;
+ }
+ if (sscanf(argv[starting+1], "%d", &arg1) == 0 ||
+ sscanf(argv[starting+2], "%d", &arg2) == 0) {
+ _gf_errno = GF_E_SPLITNU;
+ return 0;
+ }
+ starting += 3;
+ } else if (strcmp(argv[starting], "COMPOSITE") == 0) {
+ mult_type = GF_MULT_COMPOSITE;
+ if (argc < starting + 2) { _gf_errno = GF_E_FEWARGS; return 0; }
+ if (sscanf(argv[starting+1], "%d", &arg1) == 0) {
+ _gf_errno = GF_E_COMP_A2;
+ return 0;
+ }
+ starting += 2;
+ base = (gf_t *) malloc(sizeof(gf_t));
+ starting = create_gf_from_argv(base, w/arg1, argc, argv, starting);
+ if (starting == 0) {
+ free(base);
+ return 0;
+ }
+ } else {
+ if (base != NULL) gf_free(base, 1);
+ _gf_errno = GF_E_UNKNOWN;
+ return 0;
+ }
+ } else if (strcmp(argv[starting], "-r") == 0) {
+ starting++;
+ if (strcmp(argv[starting], "DOUBLE") == 0) {
+ region_type |= GF_REGION_DOUBLE_TABLE;
+ starting++;
+ } else if (strcmp(argv[starting], "QUAD") == 0) {
+ region_type |= GF_REGION_QUAD_TABLE;
+ starting++;
+ } else if (strcmp(argv[starting], "LAZY") == 0) {
+ region_type |= GF_REGION_LAZY;
+ starting++;
+ } else if (strcmp(argv[starting], "SSE") == 0) {
+ region_type |= GF_REGION_SSE;
+ starting++;
+ } else if (strcmp(argv[starting], "NOSSE") == 0) {
+ region_type |= GF_REGION_NOSSE;
+ starting++;
+ } else if (strcmp(argv[starting], "CAUCHY") == 0) {
+ region_type |= GF_REGION_CAUCHY;
+ starting++;
+ } else if (strcmp(argv[starting], "ALTMAP") == 0) {
+ region_type |= GF_REGION_ALTMAP;
+ starting++;
+ } else {
+ if (base != NULL) gf_free(base, 1);
+ _gf_errno = GF_E_UNK_REG;
+ return 0;
+ }
+ } else if (strcmp(argv[starting], "-p") == 0) {
+ starting++;
+ if (sscanf(argv[starting], "%llx", (long long unsigned int *)(&prim_poly)) == 0) {
+ if (base != NULL) gf_free(base, 1);
+ _gf_errno = GF_E_POLYSPC;
+ return 0;
+ }
+ starting++;
+ } else if (strcmp(argv[starting], "-d") == 0) {
+ starting++;
+ if (divide_type != GF_DIVIDE_DEFAULT) {
+ if (base != NULL) gf_free(base, 1);
+ _gf_errno = GF_E_TWO_DIV;
+ return 0;
+ } else if (strcmp(argv[starting], "EUCLID") == 0) {
+ divide_type = GF_DIVIDE_EUCLID;
+ starting++;
+ } else if (strcmp(argv[starting], "MATRIX") == 0) {
+ divide_type = GF_DIVIDE_MATRIX;
+ starting++;
+ } else {
+ _gf_errno = GF_E_UNK_DIV;
+ return 0;
+ }
+ } else if (strcmp(argv[starting], "-") == 0) {
+ /*
+ printf("Scratch size: %d\n", gf_scratch_size(w,
+ mult_type, region_type, divide_type, arg1, arg2));
+ */
+ if (gf_init_hard(gf, w, mult_type, region_type, divide_type,
+ prim_poly, arg1, arg2, base, NULL) == 0) {
+ if (base != NULL) gf_free(base, 1);
+ return 0;
+ } else
+ return starting + 1;
} else {
if (base != NULL) gf_free(base, 1);
- free(crt);
+ _gf_errno = GF_E_UNKFLAG;
return 0;
}
- if (y != NULL) x = y+1;
- } while (y != NULL);
- free(crt);
- }
-
- starting++;
-
- if (strcmp(argv[starting], "-") == 0) {
- divide_type = GF_DIVIDE_DEFAULT;
- } else if (strcmp(argv[starting], "MATRIX") == 0) {
- divide_type = GF_DIVIDE_MATRIX;
- } else if (strcmp(argv[starting], "EUCLID") == 0) {
- divide_type = GF_DIVIDE_EUCLID;
- } else {
- if (base != NULL) gf_free(base, 1);
- return 0;
- }
- starting++;
-
- if (!gf_init_hard(gf, w, mult_type, region_type, divide_type, prim_poly, arg1, arg2, base, NULL)) {
- if (base != NULL) gf_free(base, 1);
- return 0;
+ } else {
+ if (base != NULL) gf_free(base, 1);
+ _gf_errno = GF_E_FEWARGS;
+ return 0;
+ }
}
- return starting;
}
diff --git a/gf_method.h b/gf_method.h
index c7df540..ff29f25 100644
--- a/gf_method.h
+++ b/gf_method.h
@@ -8,8 +8,9 @@
#include "gf_complete.h"
-/* This prints out the error string defining the methods that you can put on argv*/
-extern void methods_to_stderr();
+/* Parses argv starting at "starting".
+
+ Returns 0 on failure.
+ On success, it returns one past the last argument it read in argv. */
-/* Parses argv starting at "starting" */
extern int create_gf_from_argv(gf_t *gf, int w, int argc, char **argv, int starting);
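For reference, here is a minimal sketch of driving this interface from C. The argv tail mirrors the command-line tokens; "-m SPLIT 16 4 -r ALTMAP" is only an illustrative choice and may be rejected on machines without SSE, which is why the return value is checked:

    #include <stdio.h>
    #include "gf_complete.h"
    #include "gf_method.h"

    int main()
    {
      gf_t gf;
      /* Same tokens as on the command line: -m SPLIT 16 4 -r ALTMAP - */
      char *argv[] = { "-m", "SPLIT", "16", "4", "-r", "ALTMAP", "-" };

      if (create_gf_from_argv(&gf, 16, 7, argv, 0) == 0) {
        fprintf(stderr, "Bad Method: ");
        gf_error();              /* reports the failure recorded in _gf_errno */
        return 1;
      }
      printf("product = %x\n", gf.multiply.w32(&gf, 0x1234, 0x5678));
      gf_free(&gf, 1);
      return 0;
    }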
diff --git a/gf_methods.c b/gf_methods.c
index 13aeb8e..c4db5f5 100644
--- a/gf_methods.c
+++ b/gf_methods.c
@@ -11,58 +11,26 @@
#include "gf_complete.h"
#include "gf_method.h"
+#include "gf_int.h"
-#define NMULTS (15)
-static char *mults[NMULTS] = { "SHIFT", "GROUP44", "GROUP48", "BYTWO_p", "BYTWO_b",
- "TABLE", "LOG", "LOG_ZERO", "LOG_ZERO_EXT", "SPLIT2", "SPLIT4", "SPLIT8", "SPLIT88", "COMPOSITE-0", "COMPOSITE-1" };
+#define NMULTS (16)
+static char *mults[NMULTS] = { "SHIFT", "CARRY_FREE", "GROUP44", "GROUP48", "BYTWO_p", "BYTWO_b",
+ "TABLE", "LOG", "LOG_ZERO", "LOG_ZERO_EXT", "SPLIT2",
+ "SPLIT4", "SPLIT8", "SPLIT16", "SPLIT88", "COMPOSITE" };
-#define NREGIONS (96)
-static char *regions[NREGIONS] = { "-", "SINGLE", "DOUBLE", "QUAD",
-"LAZY", "SINGLE,LAZY", "DOUBLE,LAZY", "QUAD,LAZY", "SSE",
-"SINGLE,SSE", "DOUBLE,SSE", "QUAD,SSE", "LAZY,SSE",
-"SINGLE,LAZY,SSE", "DOUBLE,LAZY,SSE", "QUAD,LAZY,SSE", "NOSSE",
-"SINGLE,NOSSE", "DOUBLE,NOSSE", "QUAD,NOSSE", "LAZY,NOSSE",
-"SINGLE,LAZY,NOSSE", "DOUBLE,LAZY,NOSSE", "QUAD,LAZY,NOSSE",
-"STDMAP", "SINGLE,STDMAP", "DOUBLE,STDMAP", "QUAD,STDMAP",
-"LAZY,STDMAP", "SINGLE,LAZY,STDMAP", "DOUBLE,LAZY,STDMAP",
-"QUAD,LAZY,STDMAP", "SSE,STDMAP", "SINGLE,SSE,STDMAP",
-"DOUBLE,SSE,STDMAP", "QUAD,SSE,STDMAP", "LAZY,SSE,STDMAP",
-"SINGLE,LAZY,SSE,STDMAP", "DOUBLE,LAZY,SSE,STDMAP",
-"QUAD,LAZY,SSE,STDMAP", "NOSSE,STDMAP", "SINGLE,NOSSE,STDMAP",
-"DOUBLE,NOSSE,STDMAP", "QUAD,NOSSE,STDMAP", "LAZY,NOSSE,STDMAP",
-"SINGLE,LAZY,NOSSE,STDMAP", "DOUBLE,LAZY,NOSSE,STDMAP",
-"QUAD,LAZY,NOSSE,STDMAP", "ALTMAP", "SINGLE,ALTMAP", "DOUBLE,ALTMAP",
-"QUAD,ALTMAP", "LAZY,ALTMAP", "SINGLE,LAZY,ALTMAP",
-"DOUBLE,LAZY,ALTMAP", "QUAD,LAZY,ALTMAP", "SSE,ALTMAP",
-"SINGLE,SSE,ALTMAP", "DOUBLE,SSE,ALTMAP", "QUAD,SSE,ALTMAP",
-"LAZY,SSE,ALTMAP", "SINGLE,LAZY,SSE,ALTMAP",
-"DOUBLE,LAZY,SSE,ALTMAP", "QUAD,LAZY,SSE,ALTMAP", "NOSSE,ALTMAP",
-"SINGLE,NOSSE,ALTMAP", "DOUBLE,NOSSE,ALTMAP", "QUAD,NOSSE,ALTMAP",
-"LAZY,NOSSE,ALTMAP", "SINGLE,LAZY,NOSSE,ALTMAP",
-"DOUBLE,LAZY,NOSSE,ALTMAP", "QUAD,LAZY,NOSSE,ALTMAP", "CAUCHY",
-"SINGLE,CAUCHY", "DOUBLE,CAUCHY", "QUAD,CAUCHY", "LAZY,CAUCHY",
-"SINGLE,LAZY,CAUCHY", "DOUBLE,LAZY,CAUCHY", "QUAD,LAZY,CAUCHY",
-"SSE,CAUCHY", "SINGLE,SSE,CAUCHY", "DOUBLE,SSE,CAUCHY",
-"QUAD,SSE,CAUCHY", "LAZY,SSE,CAUCHY", "SINGLE,LAZY,SSE,CAUCHY",
-"DOUBLE,LAZY,SSE,CAUCHY", "QUAD,LAZY,SSE,CAUCHY", "NOSSE,CAUCHY",
-"SINGLE,NOSSE,CAUCHY", "DOUBLE,NOSSE,CAUCHY", "QUAD,NOSSE,CAUCHY",
-"LAZY,NOSSE,CAUCHY", "SINGLE,LAZY,NOSSE,CAUCHY",
-"DOUBLE,LAZY,NOSSE,CAUCHY", "QUAD,LAZY,NOSSE,CAUCHY" };
+#define NREGIONS (7)
+static char *regions[NREGIONS] = { "DOUBLE", "QUAD", "LAZY", "SSE", "NOSSE",
+ "ALTMAP", "CAUCHY" };
-#define NDIVS (3)
-static char *divides[NDIVS] = { "-", "MATRIX", "EUCLID" };
+#define NDIVS (2)
+static char *divides[NDIVS] = { "MATRIX", "EUCLID" };
-int main()
+int main()
{
- int m, r, d, w, i, sa, j;
- char *argv[20];
+ int m, r, d, w, i, sa, j, k, reset;
+ char *argv[50];
gf_t gf;
char divs[200], ks[10], ls[10];
-
- methods_to_stderr();
-
- printf("\n");
- printf("Implemented Methods: \n\n");
for (i = 2; i < 8; i++) {
w = (1 << i);
@@ -70,9 +38,14 @@ int main()
if (create_gf_from_argv(&gf, w, 1, argv, 0) > 0) {
printf("w=%d: -\n", w);
gf_free(&gf, 1);
+ } else if (_gf_errno == GF_E_DEFAULT) {
+ fprintf(stderr, "Unlabeled failed method: w=%d: -\n", 2);
+ exit(1);
}
+
for (m = 0; m < NMULTS; m++) {
sa = 0;
+ argv[sa++] = "-m";
if (strcmp(mults[m], "GROUP44") == 0) {
argv[sa++] = "GROUP";
argv[sa++] = "4";
@@ -96,46 +69,66 @@ int main()
sprintf(ls, "%d", w);
argv[sa++] = ls;
argv[sa++] = "8";
+ } else if (strcmp(mults[m], "SPLIT16") == 0) {
+ argv[sa++] = "SPLIT";
+ sprintf(ls, "%d", w);
+ argv[sa++] = ls;
+ argv[sa++] = "16";
} else if (strcmp(mults[m], "SPLIT88") == 0) {
argv[sa++] = "SPLIT";
argv[sa++] = "8";
argv[sa++] = "8";
- } else if (strcmp(mults[m], "COMPOSITE-0") == 0) {
+ } else if (strcmp(mults[m], "COMPOSITE") == 0) {
argv[sa++] = "COMPOSITE";
argv[sa++] = "2";
- argv[sa++] = "0";
- argv[sa++] = "-";
- } else if (strcmp(mults[m], "COMPOSITE-1") == 0) {
- argv[sa++] = "COMPOSITE";
- argv[sa++] = "2";
- argv[sa++] = "1";
argv[sa++] = "-";
} else {
argv[sa++] = mults[m];
}
- for (r = 0; r < NREGIONS; r++) {
- argv[sa++] = regions[r];
- strcpy(divs, "");
+ reset = sa;
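+      /* Each value of r encodes a subset of the NREGIONS region flags: bit k of r
+         selects regions[k], emitted below as a "-r <flag>" argument pair. */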
+ for (r = 0; r < (1 << NREGIONS); r++) {
+ sa = reset;
+ for (k = 0; k < NREGIONS; k++) {
+ if (r & 1 << k) {
+ argv[sa++] = "-r";
+ argv[sa++] = regions[k];
+ }
+ }
+ argv[sa++] = "-";
+ if (create_gf_from_argv(&gf, w, sa, argv, 0) > 0) {
+ printf("w=%d:", w);
+ for (j = 0; j < sa; j++) printf(" %s", argv[j]);
+ printf("\n");
+ gf_free(&gf, 1);
+ } else if (_gf_errno == GF_E_DEFAULT) {
+ fprintf(stderr, "Unlabeled failed method: w=%d:", w);
+ for (j = 0; j < sa; j++) fprintf(stderr, " %s", argv[j]);
+ fprintf(stderr, "\n");
+ exit(1);
+ }
+ sa--;
for (d = 0; d < NDIVS; d++) {
+ argv[sa++] = "-d";
argv[sa++] = divides[d];
-/* printf("w=%d:", w);
- for (j = 0; j < sa; j++) printf(" %s", argv[j]);
- printf("\n"); */
+ /* printf("w=%d:", w);
+ for (j = 0; j < sa; j++) printf(" %s", argv[j]);
+ printf("\n"); */
+ argv[sa++] = "-";
if (create_gf_from_argv(&gf, w, sa, argv, 0) > 0) {
- strcat(divs, "|");
- strcat(divs, divides[d]);
+ printf("w=%d:", w);
+ for (j = 0; j < sa; j++) printf(" %s", argv[j]);
+ printf("\n");
gf_free(&gf, 1);
+ } else if (_gf_errno == GF_E_DEFAULT) {
+ fprintf(stderr, "Unlabeled failed method: w=%d:", w);
+ for (j = 0; j < sa; j++) fprintf(stderr, " %s", argv[j]);
+ fprintf(stderr, "\n");
+ exit(1);
}
- sa--;
+        sa -= 3;
}
- if (strlen(divs) > 0) {
- printf("w=%d:", w);
- for (j = 0; j < sa; j++) printf(" %s", argv[j]);
- printf(" %s\n", divs+1);
- }
- sa--;
}
- sa--;
}
}
+ return 0;
}
diff --git a/gf_mult.c b/gf_mult.c
index dc85cc6..c93a4f9 100644
--- a/gf_mult.c
+++ b/gf_mult.c
@@ -12,105 +12,53 @@
#include "gf_complete.h"
#include "gf_method.h"
+#include "gf_general.h"
-void usage(char *s)
+void usage(int why)
{
fprintf(stderr, "usage: gf_mult a b w [method] - does multiplication of a and b in GF(2^w)\n");
- fprintf(stderr, " If w has an h on the end, treat a, b and the product as hexadecimal (no 0x required)\n");
- fprintf(stderr, "\n");
- fprintf(stderr, " legal w are: 1-32, 64 and 128\n");
- fprintf(stderr, " 128 is hex only (i.e. '128' will be an error - do '128h')\n");
- fprintf(stderr, "\n");
- fprintf(stderr, " For method specification, type gf_methods\n");
-
- if (s != NULL) fprintf(stderr, "%s", s);
- exit(1);
-}
-
-int read_128(char *s, uint64_t *v)
-{
- int l, t;
- char save;
-
- l = strlen(s);
- if (l > 32) return 0;
-
- if (l > 16) {
- if (sscanf(s + (l-16), "%llx", (long long unsigned int *) &(v[1])) == 0) return 0;
- save = s[l-16];
- s[l-16] = '\0';
- t = sscanf(s, "%llx", (long long unsigned int *) &(v[0]));
- s[l-16] = save;
- return t;
- } else {
- v[0] = 0;
- return sscanf(s, "%llx", (long long unsigned int *)&(v[1]));
+ if (why == 'W') {
+ fprintf(stderr, "Bad w.\n");
+ fprintf(stderr, "Legal w are: 1 - 32, 64 and 128.\n");
+ fprintf(stderr, "Append 'h' to w to treat a, b and the product as hexadecimal.\n");
+ fprintf(stderr, "w=128 is hex only (i.e. '128' will be an error - do '128h')\n");
}
- return 1;
-}
-
-void print_128(uint64_t *v)
-{
- if (v[0] > 0) {
- printf("%llx", (long long unsigned int) v[0]);
- printf("%016llx", (long long unsigned int) v[1]);
- } else {
- printf("%llx", (long long unsigned int) v[1]);
+ if (why == 'A') fprintf(stderr, "Bad a\n");
+ if (why == 'B') fprintf(stderr, "Bad b\n");
+ if (why == 'M') {
+ fprintf(stderr, "Bad Method Specification: ");
+ gf_error();
}
- printf("\n");
+ exit(1);
}
-
int main(int argc, char **argv)
{
- int hex, al, bl, w;
- uint32_t a, b, c, top;
- uint64_t a64, b64, c64;
- uint64_t a128[2], b128[2], c128[2];
- char *format;
+ int hex, w;
gf_t gf;
+ gf_general_t a, b, c;
+ char output[50];
- if (argc < 4) usage(NULL);
- if (sscanf(argv[3], "%d", &w) == 0) usage("Bad w\n");
+ if (argc < 4) usage(' ');
- if (w <= 0 || (w > 32 && w != 64 && w != 128)) usage("Bad w");
+ if (sscanf(argv[3], "%d", &w) == 0) usage('W');
+ if (w <= 0 || (w > 32 && w != 64 && w != 128)) usage('W');
hex = (strchr(argv[3], 'h') != NULL);
- if (create_gf_from_argv(&gf, w, argc, argv, 4) == 0) usage("\nBad Method\n");
+ if (!hex && w == 128) usage('W');
- if (!hex && w == 128) usage(NULL);
+ if (argc == 4) {
+ if (gf_init_easy(&gf, w) == 0) usage('M');
+ } else {
+ if (create_gf_from_argv(&gf, w, argc, argv, 4) == 0) usage('M');
+ }
- if (w <= 32) {
- format = (hex) ? "%x" : "%u";
- if (sscanf(argv[1], format, &a) == 0) usage("Bad a\n");
- if (sscanf(argv[2], format, &b) == 0) usage("Bad b\n");
+ if (!gf_general_s_to_val(&a, w, argv[1], hex)) usage('A');
+ if (!gf_general_s_to_val(&b, w, argv[2], hex)) usage('B');
- if (w < 32) {
- top = (w == 31) ? 0x80000000 : (1 << w);
- if (w != 32 && a >= top) usage("a is too large\n");
- if (w != 32 && b >= top) usage("b is too large\n");
- }
+ gf_general_multiply(&gf, &a, &b, &c);
+ gf_general_val_to_s(&c, w, output, hex);
- c = gf.multiply.w32(&gf, a, b);
- printf(format, c);
- printf("\n");
-
- } else if (w == 64) {
- format = (hex) ? "%llx" : "%llu";
- if (sscanf(argv[1], format, &a64) == 0) usage("Bad a\n");
- if (sscanf(argv[2], format, &b64) == 0) usage("Bad b\n");
- c64 = gf.multiply.w64(&gf, a64, b64);
-
- printf(format, c64);
- printf("\n");
-
- } else if (w == 128) {
-
- if (read_128(argv[1], a128) == 0) usage("Bad a\n");
- if (read_128(argv[2], b128) == 0) usage("Bad b\n");
- gf.multiply.w128(&gf, a128, b128, c128);
-
- print_128(c128);
- }
+ printf("%s\n", output);
exit(0);
}
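
A couple of example invocations (a sketch; the second product assumes the library's default polynomial for w=8, 0x11d):

    gf_mult 5 4 8       prints 20    (x^2+1 times x^2 = x^4+x^2; no reduction needed)
    gf_mult 10 10 8h    prints 1d    (x^4 times x^4 = x^8, which reduces to 0x1d)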
diff --git a/gf_poly.c b/gf_poly.c
index c057461..7134b2c 100644
--- a/gf_poly.c
+++ b/gf_poly.c
@@ -1,560 +1,268 @@
/*
- * gf_poly.c - program to help find primitive polynomials in composite fields
+ gf_poly.c - program to help find irreducible polynomials in composite fields,
+ using the Ben-Or algorithm.
+
+ James S. Plank
+
+ Please see the following paper for a
+ description of the Ben-Or algorithm:
+
+ author S. Gao and D. Panario
+ title Tests and Constructions of Irreducible Polynomials over Finite Fields
+ booktitle Foundations of Computational Mathematics
+ year 1997
+ publisher Springer Verlag
+ pages 346-361
+
+ The basic technique is this. You have a polynomial f(x) whose coefficients are
+ in a base field GF(2^w). The polynomial is of degree n. You need to do the
+ following for all i from 1 to n/2:
+
+ Construct x^(2^w)^i modulo f. That will be a polynomial of maximum degree n-1
+ with coefficients in GF(2^w). You construct that polynomial by starting with x
+  and squaring it w times, each time taking the result modulo f. Then you
+ multiply that by itself i times, again each time taking the result modulo f.
+
+ When you're done, you need to "subtract" x -- since addition = subtraction =
+ XOR, that means XOR x.
+
+ Now, find the GCD of that last polynomial and f, using Euclid's algorithm. If
+  the GCD is not one, then f is reducible. If the GCD is one for every such i,
+  then f is irreducible.
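+
+  (In symbols: f of degree n is irreducible over GF(q), q = 2^w, if and only if
+  gcd(f(x), x^(q^i) - x) = 1 for every i from 1 to n/2.)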
+
+ In this code, I am using a gf_general_t to represent elements of GF(2^w). This
+ is so that I can use base fields that are GF(2^64) or GF(2^128).
+
+ I have two main procedures. The first is x_to_q_to_i_minus_x, which calculates
+ x^(2^w)^i - x, putting the result into a gf_general_t * called retval.
+
+ The second is gcd_one, which takes a polynomial of degree n and a second one
+ of degree n-1, and uses Euclid's algorithm to decide if their GCD == 1.
+
+ These can be made faster (e.g. calculate x^(2^w) once and store it).
*/
#include "gf_complete.h"
#include "gf_method.h"
+#include "gf_general.h"
+#include "gf_int.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-#define GF_POLY_COEF_MASK8 0xff
-#define GF_POLY_COEF_MASK16 0xffff
-#define GF_POLY_COEF_MASK32 0xffffffff
-#define GF_POLY_COEF_MASK64 0xffffffffffffffff
-
-#define LLUI (long long unsigned int)
-
-struct gf_poly_coef_s;
-
-typedef struct gf_poly_coef_s {
- uint64_t coef;
- uint64_t power;
- struct gf_poly_coef_s *next;
-} gf_poly_coef_t;
-
-typedef struct gf_poly_s {
- gf_poly_coef_t *leading_coef;
- uint64_t num_coefs;
- gf_t *coef_gf;
- int w;
-} gf_poly_t;
-
-static uint64_t gf_add(int w, uint64_t a, uint64_t b)
-{
- if (w == 8) {
- return (a & GF_POLY_COEF_MASK8) ^ (b & GF_POLY_COEF_MASK8);
- } else if (w == 16) {
- return (a & GF_POLY_COEF_MASK16) ^ (b & GF_POLY_COEF_MASK16);
- } else if (w == 32) {
- return (a & GF_POLY_COEF_MASK32) ^ (b & GF_POLY_COEF_MASK32);
- } else if (w == 64) {
- return (a & GF_POLY_COEF_MASK64) ^ (b & GF_POLY_COEF_MASK64);
- }
-}
-
-static uint64_t gf_mult(int w, gf_t* gf, uint64_t a, uint64_t b)
-{
- if (w <= 32) {
- return gf->multiply.w32(gf, a, b);
- } else if (w == 64) {
- return gf->multiply.w64(gf, a, b);
- }
-}
+char *BM = "Bad Method: ";
-static uint64_t gf_divide(int w, gf_t* gf, uint64_t a, uint64_t b)
+void usage(char *s)
{
- if (w <= 32) {
- return gf->divide.w32(gf, a, b);
- } else if (w == 64) {
- return gf->divide.w64(gf, a, b);
- }
-}
-
-static uint64_t gf_inverse(int w, gf_t* gf, uint64_t a)
-{
- if (w <= 32) {
- return gf->inverse.w32(gf, a);
- } else if (w == 64) {
- return gf->inverse.w64(gf, a);
- }
-}
-
-gf_poly_t* gf_poly_init(int w, gf_t *gf)
-{
- gf_poly_t *gf_poly = (gf_poly_t*)malloc(sizeof(gf_poly_t));
-
- if (gf_poly == NULL || gf == NULL) {
- return NULL;
- }
-
- gf_poly->leading_coef = NULL;
- gf_poly->num_coefs = 0;
- gf_poly->coef_gf = gf;
- gf_poly->w = w;
-
- return gf_poly;
-}
-
-void gf_poly_print(gf_poly_t *gf_poly, char *message)
-{
- gf_poly_coef_t *tmp;
-
- if (gf_poly == NULL) {
- fprintf(stderr, "0 * x^0\n");
- return;
- }
-
- tmp = gf_poly->leading_coef;
-
- while (tmp != NULL) {
- printf("%llu * x^%llu", LLUI tmp->coef, LLUI tmp->power);
- tmp = tmp->next;
- if (tmp) {
- printf(" + ");
+ fprintf(stderr, "usage: gf_poly w(base-field) method power:coef [ power:coef .. ]\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, " use - for the default method.\n");
+ fprintf(stderr, " use 0x in front of the coefficient if it's in hex\n");
+ fprintf(stderr, " \n");
+ fprintf(stderr, " For example, to test whether x^2 + 2x + 1 is irreducible\n");
+ fprintf(stderr, " in GF(2^16), the call is:\n");
+ fprintf(stderr, " \n");
+ fprintf(stderr, " gf_poly 16 - 2:1 1:2 0:1\n");
+ fprintf(stderr, " \n");
+ fprintf(stderr, " See the user's manual for more information.\n");
+ if (s != NULL) {
+ fprintf(stderr, "\n");
+ if (s == BM) {
+ fprintf(stderr, "%s", s);
+ gf_error();
+ } else {
+ fprintf(stderr, "%s\n", s);
}
}
-
- if (message != NULL) {
- printf(": %s\n", message);
- }
+ exit(1);
}
-gf_poly_t* gf_poly_copy(gf_poly_t *poly)
+int gcd_one(gf_t *gf, int w, int n, gf_general_t *poly, gf_general_t *prod)
{
- gf_poly_t *new_poly = (gf_poly_t*)malloc(sizeof(gf_poly_t));
- gf_poly_coef_t *tmp = poly->leading_coef;
-
- if (new_poly == NULL) {
- return NULL;
- }
-
- new_poly->leading_coef = NULL;
- new_poly->num_coefs = 0;
- new_poly->coef_gf = poly->coef_gf;
- new_poly->w = poly->w;
-
- while (tmp != NULL) {
- gf_poly_add_coef(new_poly, tmp->coef, tmp->power);
-
- tmp = tmp->next;
- }
-
- return new_poly;
-}
-
-void gf_poly_clear(gf_poly_t* a)
-{
- while (a->leading_coef != NULL) {
- gf_poly_coef_t *tmp = a->leading_coef;
-
- a->leading_coef = tmp->next;
-
- free(tmp);
- }
-}
-
-void gf_poly_free(gf_poly_t **a)
-{
- gf_poly_clear(*a);
- free(*a);
- *a = NULL;
-}
-
-gf_poly_coef_t* gf_poly_create_node(uint64_t coef, uint64_t power)
-{
- gf_poly_coef_t* node = (gf_poly_coef_t*)malloc(sizeof(gf_poly_coef_t));
-
- if (node == NULL) {
- return NULL;
- }
-
- node->coef = coef;
- node->power = power;
- node->next = NULL;
-
- return node;
-}
-
-int gf_poly_remove_node(gf_poly_t *gf_poly, uint64_t power)
-{
- gf_poly_coef_t* iter = gf_poly->leading_coef;
-
- if (iter->power == power) {
- gf_poly->leading_coef = iter->next;
- free(iter);
- return 0;
- }
-
- while (iter->next != NULL) {
- if (iter->next->power == power) {
- gf_poly_coef_t* tmp = iter->next;
- iter->next = iter->next->next;
- free(tmp);
- return 0;
+ gf_general_t *a, *b, zero, factor, p;
+ int i, j, da, db;
+ char buf[30];
+
+ gf_general_set_zero(&zero, w);
+
+  a = (gf_general_t *) malloc(sizeof(gf_general_t) * (n+1));
+ b = (gf_general_t *) malloc(sizeof(gf_general_t) * n);
+ for (i = 0; i <= n; i++) gf_general_add(gf, &zero, poly+i, a+i);
+ for (i = 0; i < n; i++) gf_general_add(gf, &zero, prod+i, b+i);
+
+ da = n;
+ while (1) {
+ for (db = n-1; db >= 0 && gf_general_is_zero(b+db, w); db--) ;
+ if (db < 0) return 0;
+ if (db == 0) return 1;
+ for (j = da; j >= db; j--) {
+ if (!gf_general_is_zero(a+j, w)) {
+ gf_general_divide(gf, a+j, b+db, &factor);
+ for (i = 0; i <= db; i++) {
+ gf_general_multiply(gf, b+i, &factor, &p);
+ gf_general_add(gf, &p, a+(i+j-db), a+(i+j-db));
+ }
+ }
+ }
+ for (i = 0; i < n; i++) {
+ gf_general_add(gf, a+i, &zero, &p);
+ gf_general_add(gf, b+i, &zero, a+i);
+ gf_general_add(gf, &p, &zero, b+i);
}
- iter = iter->next;
}
- return -1;
}
-int gf_poly_add_coef(gf_poly_t *gf_poly, uint64_t coef_val, uint64_t power)
+void x_to_q_to_i_minus_x(gf_t *gf, int w, int n, gf_general_t *poly, int logq, int i, gf_general_t *retval)
{
- gf_poly_coef_t* node;
- gf_poly_coef_t* iter = gf_poly->leading_coef;
-
- /*
- * The new node has the highest power, or there are no terms
- */
- if (gf_poly->leading_coef == NULL || gf_poly->leading_coef->power < power) {
- node = gf_poly_create_node(coef_val, power);
- node->next = gf_poly->leading_coef;
- gf_poly->leading_coef = node;
- return 0;
- }
-
- /*
- * The new node is of the same power, add the coefs
- */
- if (gf_poly->leading_coef->power == power) {
- gf_poly->leading_coef->coef = gf_add(gf_poly->w, gf_poly->leading_coef->coef, coef_val);
- if (gf_poly->leading_coef->coef == 0) {
- gf_poly_remove_node(gf_poly, power);
+ gf_general_t x;
+ gf_general_t *x_to_q;
+ gf_general_t *product;
+ gf_general_t p, zero, factor;
+ int j, k, lq;
+ char buf[20];
+
+ gf_general_set_zero(&zero, w);
+ product = (gf_general_t *) malloc(sizeof(gf_general_t) * n*2);
+ x_to_q = (gf_general_t *) malloc(sizeof(gf_general_t) * n);
+ for (j = 0; j < n; j++) gf_general_set_zero(x_to_q+j, w);
+ gf_general_set_one(x_to_q+1, w);
+
+ for (lq = 0; lq < logq; lq++) {
+ for (j = 0; j < n*2; j++) gf_general_set_zero(product+j, w);
+ for (j = 0; j < n; j++) {
+ for (k = 0; k < n; k++) {
+ gf_general_multiply(gf, x_to_q+j, x_to_q+k, &p);
+ gf_general_add(gf, product+(j+k), &p, product+(j+k));
+ }
+ }
+ for (j = n*2-1; j >= n; j--) {
+ if (!gf_general_is_zero(product+j, w)) {
+ gf_general_add(gf, product+j, &zero, &factor);
+ for (k = 0; k <= n; k++) {
+ gf_general_multiply(gf, poly+k, &factor, &p);
+ gf_general_add(gf, product+(j-n+k), &p, product+(j-n+k));
+ }
+ }
}
- return 0;
+ for (j = 0; j < n; j++) gf_general_add(gf, product+j, &zero, x_to_q+j);
}
-
- while (iter->next != NULL) {
- if (iter->next->power == power) {
- iter->next->coef = gf_add(gf_poly->w, iter->next->coef, coef_val);
-
- if (iter->next->coef == 0) {
- gf_poly_remove_node(gf_poly, power);
+ for (j = 0; j < n; j++) gf_general_set_zero(retval+j, w);
+ gf_general_set_one(retval, w);
+
+ while (i > 0) {
+ for (j = 0; j < n*2; j++) gf_general_set_zero(product+j, w);
+ for (j = 0; j < n; j++) {
+ for (k = 0; k < n; k++) {
+ gf_general_multiply(gf, x_to_q+j, retval+k, &p);
+ gf_general_add(gf, product+(j+k), &p, product+(j+k));
}
-
- return 0;
}
- if (iter->next->power < power) {
- node = gf_poly_create_node(coef_val, power);
- node->next = iter->next;
- iter->next = node;
- return 0;
+ for (j = n*2-1; j >= n; j--) {
+ if (!gf_general_is_zero(product+j, w)) {
+ gf_general_add(gf, product+j, &zero, &factor);
+ for (k = 0; k <= n; k++) {
+ gf_general_multiply(gf, poly+k, &factor, &p);
+ gf_general_add(gf, product+(j-n+k), &p, product+(j-n+k));
+ }
+ }
}
- iter = iter->next;
+ for (j = 0; j < n; j++) gf_general_add(gf, product+j, &zero, retval+j);
+ i--;
}
-
- /*
- * The power passed in is lower than any in the existing poly
- */
- node = gf_poly_create_node(coef_val, power);
- iter->next = node;
-
- return 0;
-}
-
-/*
- * Compute a+b and store in a
- */
-int gf_poly_add(gf_poly_t* a, gf_poly_t* b)
-{
- gf_poly_coef_t* iter = b->leading_coef;
- while (iter != NULL) {
- gf_poly_add_coef(a, iter->coef, iter->power);
- iter = iter->next;
- }
+ gf_general_set_one(&x, w);
+ gf_general_add(gf, &x, retval+1, retval+1);
- return 0;
+ free(product);
+ free(x_to_q);
}
-/*
- * Compute a*b and store in a
- */
-int gf_poly_mult(gf_poly_t* a, gf_poly_t* b)
+int main(int argc, char **argv)
{
- gf_poly_coef_t* a_iter = a->leading_coef;
+ int w, i, power, n, ap, success, j;
+ gf_t gf;
+ gf_general_t *poly, *prod;
+ char *string, *ptr;
+ char buf[100];
- /*
- * Remove one node at a time from 'a', starting with
- * highest power. Multiply the removed (coef,power)
- * by every entry of 'b,' adding each product into 'a.'
- */
- while (a_iter != NULL) {
- gf_poly_coef_t* tmp = a_iter;
- gf_poly_coef_t* b_iter = b->leading_coef;
+ if (argc < 4) usage(NULL);
- uint64_t a_power = a_iter->power;
- uint64_t a_coef = a_iter->coef;
- a_iter = a_iter->next;
- gf_poly_remove_node(a, tmp->power);
+ if (sscanf(argv[1], "%d", &w) != 1 || w <= 0) usage("Bad w.");
+ ap = create_gf_from_argv(&gf, w, argc, argv, 2);
- while (b_iter != NULL) {
- uint64_t new_power = b_iter->power + a_power;
- uint64_t new_coef = gf_mult(a->w, a->coef_gf, b_iter->coef, a_coef);
+ if (ap == 0) usage(BM);
- gf_poly_add_coef(a, new_coef, new_power);
+ if (ap == argc) usage("No powers/coefficients given.");
- b_iter = b_iter->next;
+ n = -1;
+ for (i = ap; i < argc; i++) {
+ if (strchr(argv[i], ':') == NULL || sscanf(argv[i], "%d:", &power) != 1) {
+      string = (char *) malloc(sizeof(char)*(strlen(argv[i])+100));
+ sprintf(string, "Argument '%s' not in proper format of power:coefficient\n", argv[i]);
+ usage(string);
}
+ if (power < 0) usage("Can't have negative powers\n");
+ if (power > n) n = power;
}
- return 0;
-}
-
-/*
- * Compute a % b and store in a
- */
-int gf_poly_reduce(gf_poly_t* a, gf_poly_t* b)
-{
- gf_poly_t* c = gf_poly_init(a->w, a->coef_gf);
- gf_poly_coef_t* a_iter = a->leading_coef;
- gf_poly_coef_t* b_iter = b->leading_coef;
-
- /*
- * Reduce until the degree of 'a' is less than
- * the degree of 'b.' At that point 'a' will
- * contain the remainder of a / b.
- */
- while (a_iter && (a_iter->power >= b_iter->power)) {
-
- /*
- * Get the degree and leading coef of the current
- * 'b'.
- */
- uint64_t reduce_power = a_iter->power - b_iter->power;
- uint64_t reduce_coef = gf_divide(a->w, a->coef_gf, a_iter->coef, b_iter->coef);
-
- /*
- * Create a poly that will get rid of leading power
- * of 'b' when added: c*x^(n-m)*b(x), where c
- * is the leading coef of 'a', n is the deg of 'a'
- * and m is the degree of 'b'.
- */
- gf_poly_add_coef(c, reduce_coef, reduce_power);
- gf_poly_mult(c, b);
-
- /*
- * Add the newly created poly, which will reduce
- * a(x) by at least one term (leading term).
- */
- gf_poly_add(a, c);
-
- gf_poly_clear(c);
-
- /*
- * Grab the new leading term of 'a'
- */
- a_iter = a->leading_coef;
- }
-}
-
-/*
- * Get the GCD of a and b, return the result
- */
-gf_poly_t* gf_poly_gcd(gf_poly_t* a, gf_poly_t* b)
-{
- gf_poly_t *r1, *r2;
- gf_poly_t* tmp_swp;
-
- if (a->leading_coef == NULL || b->leading_coef == NULL) {
- return NULL;
- }
-
- if (a->leading_coef->power > b->leading_coef->power) {
- r1 = a;
- r2 = b;
- } else {
- r1 = b;
- r2 = a;
- }
-
- while ( 1 ) {
- if (r2->leading_coef == NULL) {
- break;
- }
- if (r2->leading_coef->power == 0 && r2->leading_coef->coef <= 1) {
- break;
- }
-
- gf_poly_reduce(r1, r2);
- tmp_swp = r1;
- r1 = r2;
- r2 = tmp_swp;
- }
-
- return r1;
-}
-
-/*
- * The Ben-Or algorithm for determining irreducibility
- */
-int gf_poly_is_irred(gf_poly_t* poly)
-{
- gf_poly_t *gcd;
- gf_poly_t *prod_of_irred;
- uint64_t prod_of_irred_power = ((unsigned long long) 1) << poly->w;
- int n = poly->leading_coef->power / 2;
- int i;
- int ret = 0;
- gf_poly_t *a = gf_poly_copy(poly);
-
- prod_of_irred = gf_poly_init(a->w, a->coef_gf);
+ poly = (gf_general_t *) malloc(sizeof(gf_general_t)*(n+1));
+ for (i = 0; i <= n; i++) gf_general_set_zero(poly+i, w);
+ prod = (gf_general_t *) malloc(sizeof(gf_general_t)*n);
- for (i = 1; i <= n; i++) {
- gf_poly_add_coef(prod_of_irred, 1, prod_of_irred_power);
- gf_poly_add_coef(prod_of_irred, 1, 1);
-
- gf_poly_reduce(prod_of_irred, a);
-
- gcd = gf_poly_gcd(a, prod_of_irred);
-
- /*
- * It is irreducible if it is not the product of
- * non-trivial factors (non-constant). Therefore,
- * the GCD of the poly and prod_of_irred should be
- * a constant (0 or 0-degree polynomial).
- */
- if (gcd == NULL) {
- ret = -1;
- break;
- } else if (gcd->leading_coef->power != 0) {
- ret = -1;
- break;
- } else if (gcd->leading_coef->power == 0) {
- ret = 0;
- break;
+ for (i = ap; i < argc; i++) {
+ sscanf(argv[i], "%d:", &power);
+ ptr = strchr(argv[i], ':');
+ ptr++;
+ if (strncmp(ptr, "0x", 2) == 0) {
+ success = gf_general_s_to_val(poly+power, w, ptr+2, 1);
} else {
- ret = -1;
- break;
+ success = gf_general_s_to_val(poly+power, w, ptr, 0);
}
-
- // Need if to avoid a overflow error
- if ((i + 1) <= n) {
- prod_of_irred_power *= prod_of_irred_power;
+ if (success == 0) {
+      string = (char *) malloc(sizeof(char)*(strlen(argv[i])+100));
+ sprintf(string, "Argument '%s' not in proper format of power:coefficient\n", argv[i]);
+ usage(string);
}
- gf_poly_clear(prod_of_irred);
}
- gf_poly_free(&a);
-
- return ret;
-}
-
-int is_suitible_s(int w, gf_t *gf, uint64_t s)
-{
- uint64_t num_elems = ((unsigned long long) 1) << w;
- uint64_t i = 2;
- uint64_t i_inv;
-
- for (; i < num_elems; i++) {
- i_inv = gf_inverse(w, gf, i);
- if ((i ^ i_inv) == s) {
- fprintf(stderr, "Bailed on %llu ^ %llu = %llu\n", LLUI i, LLUI i_inv, LLUI s);
- return -1;
+ printf("Poly:");
+ for (power = n; power >= 0; power--) {
+ if (!gf_general_is_zero(poly+power, w)) {
+ printf("%s", (power == n) ? " " : " + ");
+ if (!gf_general_is_one(poly+power, w)) {
+ gf_general_val_to_s(poly+power, w, buf, 1);
+ if (n > 0) {
+ printf("(0x%s)", buf);
+ } else {
+ printf("0x%s", buf);
+ }
+ }
+ if (power == 0) {
+ if (gf_general_is_one(poly+power, w)) printf("1");
+ } else if (power == 1) {
+ printf("x");
+ } else {
+ printf("x^%d", power);
+ }
}
- if (i % 1000000000 == 0) fprintf(stderr, "Processed %llu\n", LLUI i);
- }
-
- return 0;
-}
-
-static void
-usage(char *cmd)
-{
- fprintf(stderr, "%s w <GF args> S <s value>\n", cmd);
- fprintf(stderr, "\t will build a trinomial x^2+S*x+1\n");
- fprintf(stderr, "OR\n");
- fprintf(stderr, "%s w <GF args> G coef1,power1 <coef2,power2> ... <coefn,powern>\n", cmd);
- fprintf(stderr, "\t will build a polynomial coef1^(power1) + ... + coefn^(powern)\n");
- fprintf(stderr, "Example: ./gf_poly 8 - - - G 1,2 2,1 1,0\n");
- fprintf(stderr, "\t will build a polynomial x^2+2*x+1 with coefs from GF(2^8)\n");
-}
-
-/*
- * Find irred poly of form x^2+sx+1
- * a_n*x^n + a_(n-1)*x^(n-1) + ...
- *
- * Terms are specified as: a_i,i a_j,j, ... where
- * i is the degree of the term and a_i is the coef
- *
- */
-int main(int argc, char **argv)
-{
- gf_t gf;
- int ret;
- int w;
- int i;
- uint64_t irred_coef_s;
- gf_poly_t *irred_poly;
- char *term;
-
- bzero(&gf, sizeof(gf_t));
-
- if (argc < 4) {
- usage(argv[0]);
- return -1;
}
-
- w = atoi(argv[1]);
-
- ret = create_gf_from_argv(&gf, w, argc, argv, 3);
-
- if (ret <= 0) {
- fprintf(stderr, "Could not create a GF\n");
- return -1;
- }
-
- irred_poly = gf_poly_init(w, &gf);
-
- i = ret + 1;
+ printf("\n");
- if (strlen(argv[i]) > 1) {
- usage(argv[0]);
- exit(1);
+ if (!gf_general_is_one(poly+n, w)) {
+ printf("\n");
+ printf("Can't do Ben-Or, because the polynomial is not monic.\n");
+ exit(0);
}
- if (argv[i][0] == 'S') {
- i++;
- irred_coef_s = (uint64_t)strtoull(argv[i], NULL, 10);
-
- /*
- * If this is a trinomial of the form x^2+s*x+1, then
- * we can do a quick pre-check to see if this may be
- * an irreducible polynomial.
- */
- if (is_suitible_s(w, &gf, irred_coef_s) < 0) {
- fprintf(stderr, "%llu is not a suitable coeffient!\n", LLUI irred_coef_s);
- return -1;
- } else {
- fprintf(stderr, "%llu IS A suitable coeffient!\n", LLUI irred_coef_s);
+ for (i = 1; i <= n/2; i++) {
+ x_to_q_to_i_minus_x(&gf, w, n, poly, w, i, prod);
+ if (!gcd_one(&gf, w, n, poly, prod)) {
+ printf("Reducible.\n");
+ exit(0);
}
-
-
- gf_poly_add_coef(irred_poly, 1, 2);
- gf_poly_add_coef(irred_poly, irred_coef_s, 1);
- gf_poly_add_coef(irred_poly, 1, 0);
-
- } else if (argv[i][0] == 'G') {
- term = argv[++i];
-
-
- while (term != NULL) {
- uint64_t coef = strtoull(strtok(term, ","), NULL, 10);
- uint64_t power = strtoull(strtok(NULL, ","), NULL, 10);
-
- gf_poly_add_coef(irred_poly, coef, power);
-
- if (i < argc) {
- term = argv[++i];
- } else {
- break;
- }
- }
- } else {
- usage(argv[0]);
- exit(1);
}
- gf_poly_print(irred_poly, " specified via the command line\n");
-
- ret = gf_poly_is_irred(irred_poly);
-
- if (ret < 0) {
- gf_poly_print(irred_poly, " IS NOT irreducible\n");
- } else {
- gf_poly_print(irred_poly, " IS irreducible\n");
- }
-
- return 0;
+ printf("Irreducible.\n");
+ exit(0);
}
diff --git a/gf_time.c b/gf_time.c
index 8313b05..55f3e11 100644
--- a/gf_time.c
+++ b/gf_time.c
@@ -9,7 +9,7 @@
#include <stdint.h>
#include <string.h>
#include <stdlib.h>
-#include <time.h>
+#include <sys/time.h>
#include "gf_complete.h"
#include "gf_method.h"
@@ -43,10 +43,14 @@ void problem(char *s)
exit(1);
}
+char *BM = "Bad Method: ";
+
void usage(char *s)
{
fprintf(stderr, "usage: gf_time w tests seed size(bytes) iterations [method [params]] - does timing\n");
fprintf(stderr, "\n");
+ fprintf(stderr, "does unit testing in GF(2^w)\n");
+ fprintf(stderr, "\n");
fprintf(stderr, "Legal w are: 1 - 32, 64 and 128\n");
fprintf(stderr, "\n");
fprintf(stderr, "Tests may be any combination of:\n");
@@ -63,9 +67,12 @@ void usage(char *s)
fprintf(stderr, "\n");
fprintf(stderr, "Use -1 for time(0) as a seed.\n");
fprintf(stderr, "\n");
- fprintf(stderr, "For method specification, type gf_methods\n");
- fprintf(stderr, "\n");
- if (s != NULL) fprintf(stderr, "%s\n", s);
+ if (s == BM) {
+ fprintf(stderr, "%s", BM);
+ gf_error();
+ } else if (s != NULL) {
+ fprintf(stderr, "%s\n", s);
+ }
exit(1);
}
@@ -84,9 +91,15 @@ int main(int argc, char **argv)
time_t t0;
uint8_t *ra, *rb;
gf_general_t a;
+
if (argc < 6) usage(NULL);
- if (sscanf(argv[1], "%d", &w) == 0) usage("Bad w\n");
+
+ if (sscanf(argv[1], "%d", &w) == 0){
+ usage("Bad w[-pp]\n");
+ }
+
+
if (sscanf(argv[3], "%ld", &t0) == 0) usage("Bad seed\n");
if (sscanf(argv[4], "%d", &size) == 0) usage("Bad size\n");
if (sscanf(argv[5], "%d", &iterations) == 0) usage("Bad iterations\n");
@@ -99,7 +112,7 @@ int main(int argc, char **argv)
if ((w > 32 && w != 64 && w != 128) || w < 0) usage("Bad w");
if ((size * 8) % w != 0) usage ("Bad size -- must be a multiple of w*8\n");
- if (!create_gf_from_argv(&gf, w, argc, argv, 6)) usage("Bad Method");
+ if (!create_gf_from_argv(&gf, w, argc, argv, 6)) usage(BM);
strcpy(tests, "");
for (i = 0; i < argv[2][i] != '\0'; i++) {
diff --git a/gf_unit.c b/gf_unit.c
index 03911c4..fbc21f9 100644
--- a/gf_unit.c
+++ b/gf_unit.c
@@ -10,6 +10,7 @@
#include <string.h>
#include <stdlib.h>
#include <time.h>
+#include <signal.h>
#include "gf_complete.h"
#include "gf_int.h"
@@ -18,6 +19,8 @@
#include "gf_general.h"
#define REGION_SIZE (16384)
+#define RMASK (0x00000000ffffffffLL)
+#define LMASK (0xffffffff00000000LL)
void problem(char *s)
{
@@ -26,11 +29,14 @@ void problem(char *s)
exit(1);
}
+char *BM = "Bad Method: ";
+
void usage(char *s)
{
fprintf(stderr, "usage: gf_unit w tests seed [method] - does unit testing in GF(2^w)\n");
- fprintf(stderr, "\n");
+ fprintf(stderr, "\n");
fprintf(stderr, "Legal w are: 1 - 32, 64 and 128\n");
+ fprintf(stderr, " 128 is hex only (i.e. '128' will be an error - do '128h')\n");
fprintf(stderr, "\n");
fprintf(stderr, "Tests may be any combination of:\n");
fprintf(stderr, " A: All\n");
@@ -40,16 +46,28 @@ void usage(char *s)
fprintf(stderr, "\n");
fprintf(stderr, "Use -1 for time(0) as a seed.\n");
fprintf(stderr, "\n");
- fprintf(stderr, "For method specification, type gf_methods\n");
- fprintf(stderr, "\n");
- if (s != NULL) fprintf(stderr, "%s\n", s);
+ if (s == BM) {
+ fprintf(stderr, "%s", BM);
+ gf_error();
+ } else if (s != NULL) {
+ fprintf(stderr, "%s\n", s);
+ }
exit(1);
}
+void SigHandler(int v)
+{
+ fprintf(stderr, "Problem: SegFault!\n");
+ fflush(stdout);
+ exit(2);
+}
+
int main(int argc, char **argv)
{
+ signal(SIGSEGV, SigHandler);
+
int w, i, verbose, single, region, tested, top;
- int start, end, xor;
+ int s_start, d_start, bytes, xor, alignment_test;
gf_t gf, gf_def;
time_t t0;
gf_internal_t *h;
@@ -61,15 +79,21 @@ int main(int argc, char **argv)
char *ra, *rb, *rc, *rd, *target;
int align;
+
if (argc < 4) usage(NULL);
- if (sscanf(argv[1], "%d", &w) == 0) usage("Bad w\n");
+
+ if (sscanf(argv[1], "%d", &w) == 0){
+ usage("Bad w\n");
+ }
+
if (sscanf(argv[3], "%ld", &t0) == 0) usage("Bad seed\n");
if (t0 == -1) t0 = time(0);
MOA_Seed(t0);
if (w > 32 && w != 64 && w != 128) usage("Bad w");
- if (create_gf_from_argv(&gf, w, argc, argv, 4) == 0) usage("Bad Method");
+ if (create_gf_from_argv(&gf, w, argc, argv, 4) == 0) usage(BM);
+ printf("Size (bytes): %d\n", gf_size(&gf));
for (i = 0; i < strlen(argv[2]); i++) {
if (strchr("ASRV", argv[2][i]) == NULL) usage("Bad test\n");
@@ -83,10 +107,18 @@ int main(int argc, char **argv)
ai = (gf_general_t *) malloc(sizeof(gf_general_t));
bi = (gf_general_t *) malloc(sizeof(gf_general_t));
- ra = (char *) malloc(sizeof(char)*REGION_SIZE);
- rb = (char *) malloc(sizeof(char)*REGION_SIZE);
- rc = (char *) malloc(sizeof(char)*REGION_SIZE);
- rd = (char *) malloc(sizeof(char)*REGION_SIZE);
+  //15 bytes extra to make sure it's 16-byte aligned
+ ra = (char *) malloc(sizeof(char)*REGION_SIZE+15);
+ rb = (char *) malloc(sizeof(char)*REGION_SIZE+15);
+ rc = (char *) malloc(sizeof(char)*REGION_SIZE+15);
+ rd = (char *) malloc(sizeof(char)*REGION_SIZE+15);
+
+ //this still assumes 8 byte aligned pointer from malloc
+ //(which is usual on 32-bit machines)
+ ra += (uint64_t)ra & 0xf;
+ rb += (uint64_t)rb & 0xf;
+ rc += (uint64_t)rc & 0xf;
+ rd += (uint64_t)rd & 0xf;
if (w <= 32) {
mask = 0;
@@ -97,8 +129,9 @@ int main(int argc, char **argv)
single = (strchr(argv[2], 'S') != NULL || strchr(argv[2], 'A') != NULL);
region = (strchr(argv[2], 'R') != NULL || strchr(argv[2], 'A') != NULL);
- if (!gf_init_easy(&gf_def, w)) problem("No default for this value of w");
-
+ if (!gf_init_hard(&gf_def, w, GF_MULT_DEFAULT, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT,
+ (h->mult_type != GF_MULT_COMPOSITE) ? h->prim_poly : 0, 0, 0, NULL, NULL))
+ problem("No default for this value of w");
if (w == 4) {
mult4 = gf_w4_get_mult_table(&gf);
div4 = gf_w4_get_div_table(&gf);
@@ -129,21 +162,71 @@ int main(int argc, char **argv)
if (w <= 10) {
a->w32 = i % (1 << w);
b->w32 = (i >> w);
- } else if (i < 10) {
- gf_general_set_zero(a, w);
- gf_general_set_random(b, w, 1);
- } else if (i < 20) {
- gf_general_set_random(a, w, 1);
- gf_general_set_zero(b, w);
- } else if (i < 30) {
- gf_general_set_one(a, w);
- gf_general_set_random(b, w, 1);
- } else if (i < 40) {
- gf_general_set_random(a, w, 1);
- gf_general_set_one(b, w);
+
+      //Allen: these special cases were previously run only 10 times each, which seemed
+      //like far too few. Selecting them with i mod 32 makes the number of runs both
+      //larger and proportional to the total size of the run.
} else {
- gf_general_set_random(a, w, 1);
- gf_general_set_random(b, w, 1);
+ switch (i % 32)
+ {
+ case 0:
+ gf_general_set_zero(a, w);
+ gf_general_set_random(b, w, 1);
+ break;
+ case 1:
+ gf_general_set_random(a, w, 1);
+ gf_general_set_zero(b, w);
+ break;
+ case 2:
+ gf_general_set_one(a, w);
+ gf_general_set_random(b, w, 1);
+ break;
+ case 3:
+ gf_general_set_random(a, w, 1);
+ gf_general_set_one(b, w);
+ break;
+ default:
+ gf_general_set_random(a, w, 1);
+ gf_general_set_random(b, w, 1);
+ }
+ }
+
+ //Allen: the following special cases for w=64 are based on the code below for w=128.
+ //These w=64 cases are based on Dr. Plank's suggestion because some of the methods for w=64
+ //involve splitting it in two. I think they're less likely to give errors than the 128-bit case
+ //though, because the 128 bit case is always split in two.
+ //As with w=128, I'm arbitrarily deciding to do this sort of thing with a quarter of the cases
+ if (w == 64) {
+ switch (i % 32)
+ {
+ case 0: if (!gf_general_is_one(a, w)) a->w64 &= RMASK; break;
+ case 1: if (!gf_general_is_one(a, w)) a->w64 &= LMASK; break;
+ case 2: if (!gf_general_is_one(a, w)) a->w64 &= RMASK; if (!gf_general_is_one(b, w)) b->w64 &= RMASK; break;
+ case 3: if (!gf_general_is_one(a, w)) a->w64 &= RMASK; if (!gf_general_is_one(b, w)) b->w64 &= LMASK; break;
+ case 4: if (!gf_general_is_one(a, w)) a->w64 &= LMASK; if (!gf_general_is_one(b, w)) b->w64 &= RMASK; break;
+ case 5: if (!gf_general_is_one(a, w)) a->w64 &= LMASK; if (!gf_general_is_one(b, w)) b->w64 &= LMASK; break;
+ case 6: if (!gf_general_is_one(b, w)) b->w64 &= RMASK; break;
+ case 7: if (!gf_general_is_one(b, w)) b->w64 &= LMASK; break;
+ }
+ }
+
+ //Allen: for w=128, we have important special cases where one half or the other of the number is all
+      //zeros. The probability of hitting such a number randomly is 2^-64, so if we don't force these cases
+ //we'll probably never hit them. This could be implemented more efficiently by changing the set-random
+ //function for w=128, but I think this is easier to follow.
+ //I'm arbitrarily deciding to do this sort of thing with a quarter of the cases
+ if (w == 128) {
+ switch (i % 32)
+ {
+ case 0: if (!gf_general_is_one(a, w)) a->w128[0] = 0; break;
+ case 1: if (!gf_general_is_one(a, w)) a->w128[1] = 0; break;
+ case 2: if (!gf_general_is_one(a, w)) a->w128[0] = 0; if (!gf_general_is_one(b, w)) b->w128[0] = 0; break;
+ case 3: if (!gf_general_is_one(a, w)) a->w128[0] = 0; if (!gf_general_is_one(b, w)) b->w128[1] = 0; break;
+ case 4: if (!gf_general_is_one(a, w)) a->w128[1] = 0; if (!gf_general_is_one(b, w)) b->w128[0] = 0; break;
+ case 5: if (!gf_general_is_one(a, w)) a->w128[1] = 0; if (!gf_general_is_one(b, w)) b->w128[1] = 0; break;
+ case 6: if (!gf_general_is_one(b, w)) b->w128[0] = 0; break;
+ case 7: if (!gf_general_is_one(b, w)) b->w128[1] = 0; break;
+ }
}
tested = 0;
@@ -195,10 +278,10 @@ int main(int argc, char **argv)
gf_general_multiply(&gf_def, a, b, d);
if (!gf_general_are_equal(c, d, w)) {
- gf_general_val_to_s(a, w, as);
- gf_general_val_to_s(b, w, bs);
- gf_general_val_to_s(c, w, cs);
- gf_general_val_to_s(d, w, ds);
+ gf_general_val_to_s(a, w, as, 1);
+ gf_general_val_to_s(b, w, bs, 1);
+ gf_general_val_to_s(c, w, cs, 1);
+ gf_general_val_to_s(d, w, ds, 1);
printf("Error in single multiplication (all numbers in hex):\n\n");
printf(" gf.multiply(gf, %s, %s) = %s\n", as, bs, cs);
printf(" The default gf multiplier returned %s\n", ds);
@@ -216,9 +299,9 @@ int main(int argc, char **argv)
if (((gf_general_is_zero(a, w) || gf_general_is_zero(b, w)) && !gf_general_is_zero(c, w)) ||
(gf_general_is_one(a, w) && !gf_general_are_equal(b, c, w)) ||
(gf_general_is_one(b, w) && !gf_general_are_equal(a, c, w))) {
- gf_general_val_to_s(a, w, as);
- gf_general_val_to_s(b, w, bs);
- gf_general_val_to_s(c, w, cs);
+ gf_general_val_to_s(a, w, as, 1);
+ gf_general_val_to_s(b, w, bs, 1);
+ gf_general_val_to_s(c, w, cs, 1);
printf("Error in single multiplication (all numbers in hex):\n\n");
printf(" gf.multiply(gf, %s, %s) = %s, which is clearly wrong.\n", as, bs, cs);
;
@@ -229,9 +312,9 @@ int main(int argc, char **argv)
/* Dumb check to make sure that it's not returning numbers that are too big: */
if (w < 32 && (c->w32 & mask) != c->w32) {
- gf_general_val_to_s(a, w, as);
- gf_general_val_to_s(b, w, bs);
- gf_general_val_to_s(c, w, cs);
+ gf_general_val_to_s(a, w, as, 1);
+ gf_general_val_to_s(b, w, bs, 1);
+ gf_general_val_to_s(c, w, cs, 1);
printf("Error in single multiplication (all numbers in hex):\n\n");
printf(" gf.multiply.w32(gf, %s, %s) = %s, which is too big.\n", as, bs, cs);
exit(1);
@@ -242,10 +325,10 @@ int main(int argc, char **argv)
if (!gf_general_is_zero(a, w)) {
gf_general_divide(&gf, c, a, d);
if (!gf_general_are_equal(b, d, w)) {
- gf_general_val_to_s(a, w, as);
- gf_general_val_to_s(b, w, bs);
- gf_general_val_to_s(c, w, cs);
- gf_general_val_to_s(d, w, ds);
+ gf_general_val_to_s(a, w, as, 1);
+ gf_general_val_to_s(b, w, bs, 1);
+ gf_general_val_to_s(c, w, cs, 1);
+ gf_general_val_to_s(d, w, ds, 1);
printf("Error in single multiplication/division (all numbers in hex):\n\n");
printf(" gf.multiply(gf, %s, %s) = %s, but gf.divide(gf, %s, %s) = %s\n", as, bs, cs, cs, as, ds);
exit(1);
@@ -257,40 +340,82 @@ int main(int argc, char **argv)
if (region) {
if (verbose) { printf("Testing region multiplications\n"); fflush(stdout); }
- for (i = 0; i < 1000; i++) {
- if (i < 20) {
- gf_general_set_zero(a, w);
- } else if (i < 40) {
- gf_general_set_one(a, w);
- } else if (i < 60) {
- gf_general_set_two(a, w);
- } else {
- gf_general_set_random(a, w, 1);
+ for (i = 0; i < 1024; i++) {
+      //Allen: switching on i mod 32, as with the single ops, so the special cases stay proportional
+ switch (i % 32)
+ {
+ case 0:
+ gf_general_set_zero(a, w);
+ break;
+ case 1:
+ gf_general_set_one(a, w);
+ break;
+ case 2:
+ gf_general_set_two(a, w);
+ break;
+ default:
+ gf_general_set_random(a, w, 1);
}
MOA_Fill_Random_Region(ra, REGION_SIZE);
MOA_Fill_Random_Region(rb, REGION_SIZE);
- xor = i%2;
+ xor = (i/32)%2;
align = w/8;
if (align == 0) align = 1;
if (align > 16) align = 16;
+
+ /* JSP - Cauchy test. When w < 32 & it doesn't equal 4, 8 or 16, the default is
+ equal to GF_REGION_CAUCHY, even if GF_REGION_CAUCHY is not set. We are testing
+ three alignments here:
+
+ 1. Anything goes -- no alignment guaranteed.
+ 2. Perfect alignment. Here src and dest must be aligned wrt each other,
+ and bytes must be a multiple of 16*w.
+ 3. Imperfect alignment. Here we'll have src and dest be aligned wrt each
+ other, but bytes is simply a multiple of w. That means some XOR's will
+ be aligned, and some won't.
+ */
+
if ((h->region_type & GF_REGION_CAUCHY) || (w < 32 && w != 4 && w != 8 && w != 16)) {
- start = MOA_Random_W(5, 1);
- end = REGION_SIZE - MOA_Random_W(5, 1);
+ alignment_test = (i%3);
+
+ s_start = MOA_Random_W(5, 1);
+ if (alignment_test == 0) {
+ d_start = MOA_Random_W(5, 1);
+ } else {
+ d_start = s_start;
+ }
+
+ bytes = (d_start > s_start) ? REGION_SIZE - d_start : REGION_SIZE - s_start;
+ bytes -= MOA_Random_W(5, 1);
+ if (alignment_test == 1) {
+ bytes -= (bytes % (w*16));
+ } else {
+ bytes -= (bytes % w);
+ }
+
target = rb;
- while ((end-start)%w != 0) end--;
+
+ /* JSP - Otherwise, we're testing a non-cauchy test, and alignment
+ must be more strict. We have to make sure that the regions are
+ aligned wrt each other on 16-byte pointers. */
+
} else {
- start = MOA_Random_W(5, 1) * align;
- end = REGION_SIZE - (MOA_Random_W(5, 1) * align);
+ s_start = MOA_Random_W(5, 1) * align;
+ d_start = s_start;
+ bytes = REGION_SIZE - s_start - MOA_Random_W(5, 1);
+ bytes -= (bytes % align);
+
if (h->mult_type == GF_MULT_COMPOSITE && (h->region_type & GF_REGION_ALTMAP)) {
target = rb ;
} else {
- target = ((i%4)/2) ? rb : ra;
+ target = (i/64)%2 ? rb : ra;
}
}
+
memcpy(rc, ra, REGION_SIZE);
memcpy(rd, target, REGION_SIZE);
- gf_general_do_region_multiply(&gf, a, ra+start, target+start, end-start, xor);
- gf_general_do_region_check(&gf, a, rc+start, rd+start, target+start, end-start, xor);
+ gf_general_do_region_multiply(&gf, a, ra+s_start, target+d_start, bytes, xor);
+ gf_general_do_region_check(&gf, a, rc+s_start, rd+d_start, target+d_start, bytes, xor);
}
}
}
diff --git a/gf_w128.c b/gf_w128.c
index 0a2a93f..1465be5 100644
--- a/gf_w128.c
+++ b/gf_w128.c
@@ -12,7 +12,7 @@
#define two_x(a) {\
a[0] <<= 1; \
- if (a[1] & (uint64_t) 1 << 63) a[0] ^= 1; \
+ if (a[1] & 1ULL << 63) a[0] ^= 1; \
a[1] <<= 1; }
#define a_get_b(a, i, b, j) {\
@@ -28,11 +28,18 @@ struct gf_w128_split_4_128_data {
uint64_t tables[2][32][16];
};
+struct gf_w128_split_8_128_data {
+ uint64_t last_value[2];
+ uint64_t tables[2][16][256];
+};
+
typedef struct gf_group_tables_s {
gf_val_128_t m_table;
gf_val_128_t r_table;
} gf_group_tables_t;
+#define MM_PRINT8(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 1) printf("%s%02x", (ii%4==0) ? " " : " ", blah[15-ii]); printf("\n"); }
+
static
void
gf_w128_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes,
@@ -70,11 +77,120 @@ int xor)
}
}
+static
+void
+gf_w128_clm_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes,
+int xor)
+{
+ int i;
+ gf_val_128_t s128;
+ gf_val_128_t d128;
+ uint64_t c128[2];
+ gf_region_data rd;
+#ifdef INTEL_SSE4_PCLMUL
+ __m128i a,b;
+ __m128i result0,result1;
+ __m128i prim_poly;
+ __m128i c,d,e,f;
+ gf_internal_t * h = gf->scratch;
+ prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)h->prim_poly);
+ /* We only do this to check on alignment. */
+ gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);
+
+ if (val[0] == 0) {
+ if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
+ if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+ }
+
+ set_zero(c128, 0);
+
+ s128 = (gf_val_128_t) src;
+ d128 = (gf_val_128_t) dest;
+
+ if (xor) {
+ for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) {
+ a = _mm_insert_epi64 (_mm_setzero_si128(), s128[i+1], 0);
+ b = _mm_insert_epi64 (a, val[1], 0);
+ a = _mm_insert_epi64 (a, s128[i], 1);
+ b = _mm_insert_epi64 (b, val[0], 1);
+
+ c = _mm_clmulepi64_si128 (a, b, 0x00); /*low-low*/
+ f = _mm_clmulepi64_si128 (a, b, 0x01); /*high-low*/
+ e = _mm_clmulepi64_si128 (a, b, 0x10); /*low-high*/
+ d = _mm_clmulepi64_si128 (a, b, 0x11); /*high-high*/
+
+ /* now reusing a and b as temporary variables*/
+ result0 = _mm_setzero_si128();
+ result1 = result0;
+
+ result0 = _mm_xor_si128 (result0, _mm_insert_epi64 (d, 0, 0));
+ a = _mm_xor_si128 (_mm_srli_si128 (e, 8), _mm_insert_epi64 (d, 0, 1));
+ result0 = _mm_xor_si128 (result0, _mm_xor_si128 (_mm_srli_si128 (f, 8), a));
+
+ a = _mm_xor_si128 (_mm_slli_si128 (e, 8), _mm_insert_epi64 (c, 0, 0));
+ result1 = _mm_xor_si128 (result1, _mm_xor_si128 (_mm_slli_si128 (f, 8), a));
+ result1 = _mm_xor_si128 (result1, _mm_insert_epi64 (c, 0, 1));
+ /* now we have constructed our 'result' with result0 being the carry bits, and we have to reduce. */
+
+ a = _mm_srli_si128 (result0, 8);
+ b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
+ result0 = _mm_xor_si128 (result0, _mm_srli_si128 (b, 8));
+ result1 = _mm_xor_si128 (result1, _mm_slli_si128 (b, 8));
+
+ a = _mm_insert_epi64 (result0, 0, 1);
+ b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
+ result1 = _mm_xor_si128 (result1, b);
+ d128[i] ^= (uint64_t)_mm_extract_epi64(result1,1);
+ d128[i+1] ^= (uint64_t)_mm_extract_epi64(result1,0);
+ }
+ } else {
+ for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) {
+ a = _mm_insert_epi64 (_mm_setzero_si128(), s128[i+1], 0);
+ b = _mm_insert_epi64 (a, val[1], 0);
+ a = _mm_insert_epi64 (a, s128[i], 1);
+ b = _mm_insert_epi64 (b, val[0], 1);
+
+ c = _mm_clmulepi64_si128 (a, b, 0x00); /*low-low*/
+ f = _mm_clmulepi64_si128 (a, b, 0x01); /*high-low*/
+ e = _mm_clmulepi64_si128 (a, b, 0x10); /*low-high*/
+ d = _mm_clmulepi64_si128 (a, b, 0x11); /*high-high*/
+
+ /* now reusing a and b as temporary variables*/
+ result0 = _mm_setzero_si128();
+ result1 = result0;
+
+ result0 = _mm_xor_si128 (result0, _mm_insert_epi64 (d, 0, 0));
+ a = _mm_xor_si128 (_mm_srli_si128 (e, 8), _mm_insert_epi64 (d, 0, 1));
+ result0 = _mm_xor_si128 (result0, _mm_xor_si128 (_mm_srli_si128 (f, 8), a));
+
+ a = _mm_xor_si128 (_mm_slli_si128 (e, 8), _mm_insert_epi64 (c, 0, 0));
+ result1 = _mm_xor_si128 (result1, _mm_xor_si128 (_mm_slli_si128 (f, 8), a));
+ result1 = _mm_xor_si128 (result1, _mm_insert_epi64 (c, 0, 1));
+ /* now we have constructed our 'result' with result0 being the carry bits, and we have to reduce.*/
+
+ a = _mm_srli_si128 (result0, 8);
+ b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
+ result0 = _mm_xor_si128 (result0, _mm_srli_si128 (b, 8));
+ result1 = _mm_xor_si128 (result1, _mm_slli_si128 (b, 8));
+
+ a = _mm_insert_epi64 (result0, 0, 1);
+ b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
+ result1 = _mm_xor_si128 (result1, b);
+ d128[i] = (uint64_t)_mm_extract_epi64(result1,1);
+ d128[i+1] = (uint64_t)_mm_extract_epi64(result1,0);
+ }
+ }
+#endif
+}
+
/*
* Some w128 notes:
* --Big Endian
* --return values allocated beforehand
*/
+
+#define GF_W128_IS_ZERO(val) (val[0] == 0 && val[1] == 0)
+
void
gf_w128_shift_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
{
@@ -99,6 +215,7 @@ gf_w128_shift_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_12
set_zero(pl, 0);
set_zero(pr, 0);
+ /* Allen: a*b for right half of a */
for (i = 0; i < GF_FIELD_WIDTH/2; i++) {
if (a[1] & (one << i)) {
pl[1] ^= bl[1];
@@ -112,6 +229,7 @@ gf_w128_shift_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_12
br[1] <<= 1;
}
+ /* Allen: a*b for left half of a */
for (i = 0; i < GF_FIELD_WIDTH/2; i++) {
if (a[0] & (one << i)) {
pl[0] ^= bl[0];
@@ -125,10 +243,11 @@ gf_w128_shift_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_12
br[0] <<= 1;
}
- one = lbit;
- ppl[0] = lbit;
- ppl[1] = h->prim_poly >> 1;
- ppr[0] = lbit;
+ /* Allen: do first half of reduction (based on left quarter of initial product) */
+ one = lbit >> 1;
+ ppl[0] = one; /* Allen: introduce leading one of primitive polynomial */
+ ppl[1] = h->prim_poly >> 2;
+ ppr[0] = h->prim_poly << (GF_FIELD_WIDTH/2-2);
ppr[1] = 0;
while (one != 0) {
if (pl[0] & one) {
@@ -147,6 +266,7 @@ gf_w128_shift_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_12
ppl[0] >>= 1;
}
+ /* Allen: final half of reduction */
one = lbit;
while (one != 0) {
if (pl[1] & one) {
@@ -162,6 +282,7 @@ gf_w128_shift_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_12
ppl[1] >>= 1;
}
+ /* Allen: if we really want to optimize this we can just be using c128 instead of pr all along */
c128[0] = pr[0];
c128[1] = pr[1];
@@ -169,6 +290,191 @@ gf_w128_shift_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_12
}
void
+gf_w128_clm_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
+{
+#ifdef INTEL_SSE4_PCLMUL
+
+ __m128i a,b;
+ __m128i result0,result1;
+ __m128i prim_poly;
+ __m128i c,d,e,f;
+ gf_internal_t * h = gf->scratch;
+
+ a = _mm_insert_epi64 (_mm_setzero_si128(), a128[1], 0);
+ b = _mm_insert_epi64 (a, b128[1], 0);
+ a = _mm_insert_epi64 (a, a128[0], 1);
+ b = _mm_insert_epi64 (b, b128[0], 1);
+
+ prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)h->prim_poly);
+
+ /* we need to test algorithm 2 later*/
+ c = _mm_clmulepi64_si128 (a, b, 0x00); /*low-low*/
+ f = _mm_clmulepi64_si128 (a, b, 0x01); /*high-low*/
+ e = _mm_clmulepi64_si128 (a, b, 0x10); /*low-high*/
+ d = _mm_clmulepi64_si128 (a, b, 0x11); /*high-high*/
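+
+  /* Schoolbook assembly of the 256-bit carryless product from four 64x64 halves:
+     (aH x^64 + aL)(bH x^64 + bL) = d x^128 + (e ^ f) x^64 + c.
+     result0 will hold the high (carry) 128 bits, result1 the low 128 bits. */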
+
+ /* now reusing a and b as temporary variables*/
+ result0 = _mm_setzero_si128();
+ result1 = result0;
+
+ result0 = _mm_xor_si128 (result0, _mm_insert_epi64 (d, 0, 0));
+ a = _mm_xor_si128 (_mm_srli_si128 (e, 8), _mm_insert_epi64 (d, 0, 1));
+ result0 = _mm_xor_si128 (result0, _mm_xor_si128 (_mm_srli_si128 (f, 8), a));
+
+ a = _mm_xor_si128 (_mm_slli_si128 (e, 8), _mm_insert_epi64 (c, 0, 0));
+ result1 = _mm_xor_si128 (result1, _mm_xor_si128 (_mm_slli_si128 (f, 8), a));
+ result1 = _mm_xor_si128 (result1, _mm_insert_epi64 (c, 0, 1));
+ /* now we have constructed our 'result' with result0 being the carry bits, and we have to reduce.*/
+
+ a = _mm_srli_si128 (result0, 8);
+ b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
+ result0 = _mm_xor_si128 (result0, _mm_srli_si128 (b, 8));
+ result1 = _mm_xor_si128 (result1, _mm_slli_si128 (b, 8));
+
+ a = _mm_insert_epi64 (result0, 0, 1);
+ b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
+ result1 = _mm_xor_si128 (result1, b);
+
+ c128[0] = (uint64_t)_mm_extract_epi64(result1,1);
+ c128[1] = (uint64_t)_mm_extract_epi64(result1,0);
+#endif
+return;
+}
+
+void
+gf_w128_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
+{
+ uint64_t amask[2], pmask, pp, prod[2]; /*John: pmask is always the highest bit set, and the rest zeros. amask changes, it's a countdown.*/
+ uint64_t topbit; /* this is used as a boolean value */
+ gf_internal_t *h;
+
+ h = (gf_internal_t *) gf->scratch;
+ pp = h->prim_poly;
+ prod[0] = 0;
+ prod[1] = 0;
+ pmask = 0x8000000000000000ULL;
+ amask[0] = 0x8000000000000000ULL;
+ amask[1] = 0;
+
+ while (amask[1] != 0 || amask[0] != 0) {
+ topbit = (prod[0] & pmask);
+ prod[0] <<= 1;
+ if (prod[1] & pmask) prod[0] ^= 1;
+ prod[1] <<= 1;
+ if (topbit) prod[1] ^= pp;
+ if ((a128[0] & amask[0]) || (a128[1] & amask[1])) {
+ prod[0] ^= b128[0];
+ prod[1] ^= b128[1];
+ }
+ amask[1] >>= 1;
+ if (amask[0] & 1) amask[1] ^= pmask;
+ amask[0] >>= 1;
+ }
+ c128[0] = prod [0];
+ c128[1] = prod [1];
+ return;
+}
+
+void
+gf_w128_sse_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
+{
+#ifdef INTEL_SSE4
+ int i;
+ __m128i a, b, pp, one, prod, amask, l_middle_one, u_middle_one;
+ /*John: pmask is always the highest bit set, and the rest zeros. amask changes, it's a countdown.*/
+ uint32_t topbit, middlebit, pmask; /* this is used as a boolean value */
+ gf_internal_t *h;
+
+
+ h = (gf_internal_t *) gf->scratch;
+ pp = _mm_set_epi32(0, 0, 0, (uint32_t)h->prim_poly);
+ prod = _mm_setzero_si128();
+ a = _mm_insert_epi64(prod, a128[1], 0x0);
+ a = _mm_insert_epi64(a, a128[0], 0x1);
+ b = _mm_insert_epi64(prod, b128[1], 0x0);
+ b = _mm_insert_epi64(b, b128[0], 0x1);
+ pmask = 0x80000000;
+ amask = _mm_insert_epi32(prod, 0x80000000, 0x3);
+ u_middle_one = _mm_insert_epi32(prod, 1, 0x2);
+ l_middle_one = _mm_insert_epi32(prod, 1 << 31, 0x1);
+
+ for (i = 0; i < 64; i++) {
+ topbit = (_mm_extract_epi32(prod, 0x3) & pmask);
+ middlebit = (_mm_extract_epi32(prod, 0x1) & pmask);
+ prod = _mm_slli_epi64(prod, 1); /* this instruction loses the middle bit */
+ if (middlebit) {
+ prod = _mm_xor_si128(prod, u_middle_one);
+ }
+ if (topbit) {
+ prod = _mm_xor_si128(prod, pp);
+ }
+ if (((uint64_t)_mm_extract_epi64(_mm_and_si128(a, amask), 1))) {
+ prod = _mm_xor_si128(prod, b);
+ }
+ amask = _mm_srli_epi64(amask, 1); /*so does this one, but we can just replace after loop*/
+ }
+ amask = _mm_insert_epi32(amask, 1 << 31, 0x1);
+ for (i = 64; i < 128; i++) {
+ topbit = (_mm_extract_epi32(prod, 0x3) & pmask);
+ middlebit = (_mm_extract_epi32(prod, 0x1) & pmask);
+ prod = _mm_slli_epi64(prod, 1);
+ if (middlebit) prod = _mm_xor_si128(prod, u_middle_one);
+ if (topbit) prod = _mm_xor_si128(prod, pp);
+ if (((uint64_t)_mm_extract_epi64(_mm_and_si128(a, amask), 0))) {
+ prod = _mm_xor_si128(prod, b);
+ }
+ amask = _mm_srli_epi64(amask, 1);
+ }
+ c128[0] = (uint64_t)_mm_extract_epi64(prod, 1);
+ c128[1] = (uint64_t)_mm_extract_epi64(prod, 0);
+#endif
+ return;
+}
+
+
+/* Ben: This function implements bytwo_b with SSE instructions; it turns out to be slower than the scalar version (see gf_w128_bytwo_init below). */
+void
+gf_w128_sse_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
+{
+#ifdef INTEL_SSE4
+ __m128i a, b, lmask, hmask, pp, c, middle_one;
+ gf_internal_t *h;
+ uint64_t topbit, middlebit;
+
+ h = (gf_internal_t *) gf->scratch;
+
+ c = _mm_setzero_si128();
+ lmask = _mm_insert_epi64(c, 1ULL << 63, 0);
+ hmask = _mm_insert_epi64(c, 1ULL << 63, 1);
+ b = _mm_insert_epi64(c, a128[0], 1);
+ b = _mm_insert_epi64(b, a128[1], 0);
+ a = _mm_insert_epi64(c, b128[0], 1);
+ a = _mm_insert_epi64(a, b128[1], 0);
+ pp = _mm_insert_epi64(c, h->prim_poly, 0);
+ middle_one = _mm_insert_epi64(c, 1, 0x1);
+
+ while (1) {
+ if (_mm_extract_epi32(a, 0x0) & 1) {
+ c = _mm_xor_si128(c, b);
+ }
+ middlebit = (_mm_extract_epi32(a, 0x2) & 1);
+ a = _mm_srli_epi64(a, 1);
+ if (middlebit) a = _mm_xor_si128(a, lmask);
+ if ((_mm_extract_epi64(a, 0x1) == 0ULL) && (_mm_extract_epi64(a, 0x0) == 0ULL)){
+ c128[0] = _mm_extract_epi64(c, 0x1);
+ c128[1] = _mm_extract_epi64(c, 0x0);
+ return;
+ }
+ topbit = (_mm_extract_epi64(_mm_and_si128(b, hmask), 1));
+ middlebit = (_mm_extract_epi64(_mm_and_si128(b, lmask), 0));
+ b = _mm_slli_epi64(b, 1);
+ if (middlebit) b = _mm_xor_si128(b, middle_one);
+ if (topbit) b = _mm_xor_si128(b, pp);
+ }
+#endif
+}
+
+void
gf_w128_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
{
uint64_t bmask, pp;
@@ -177,7 +483,7 @@ gf_w128_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_
h = (gf_internal_t *) gf->scratch;
- bmask = (1L << 63);
+ bmask = (1ULL << 63);
set_zero(c, 0);
b[0] = a128[0];
b[1] = a128[1];
@@ -243,9 +549,9 @@ gf_w128_split_4_128_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_
ld->tables[0][i][k^j] = (v[0] ^ ld->tables[0][i][k]);
ld->tables[1][i][k^j] = (v[1] ^ ld->tables[1][i][k]);
}
- pp = (v[0] & (1L << 63));
+ pp = (v[0] & (1ULL << 63));
v[0] <<= 1;
- if (v[1] & (1L << 63)) v[0] ^= 1;
+ if (v[1] & (1ULL << 63)) v[0] ^= 1;
v[1] <<= 1;
if (pp) v[1] ^= h->prim_poly;
}
@@ -254,6 +560,15 @@ gf_w128_split_4_128_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_
ld->last_value[0] = val[0];
ld->last_value[1] = val[1];
+/*
+ for (i = 0; i < 32; i++) {
+ for (j = 0; j < 16; j++) {
+ printf("%2d %2d %016llx %016llx\n", i, j, ld->tables[0][i][j], ld->tables[1][i][j]);
+ }
+ printf("\n");
+ }
+ */
+ i = 0;
while (d64 < top) {
v[0] = (xor) ? d64[0] : 0;
v[1] = (xor) ? d64[1] : 0;
@@ -280,6 +595,191 @@ gf_w128_split_4_128_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_
}
}
+static
+void
+gf_w128_split_4_128_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
+{
+#ifdef INTEL_SSSE3
+ gf_internal_t *h;
+ int i, m, j, k, tindex;
+ uint64_t pp, v[2], s, *s64, *d64, *top;
+ __m128i si, tables[32][16], p[16], v0, mask1;
+ struct gf_w128_split_4_128_data *ld;
+ uint8_t btable[16];
+ gf_region_data rd;
+
+ if (val[0] == 0) {
+ if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
+ if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+ }
+
+ h = (gf_internal_t *) gf->scratch;
+ pp = h->prim_poly;
+
+ /* We only do this to check on alignment. */
+ gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 256);
+
+ /* Done by hand instead of gf_do_initial_region_alignment(), because that function cannot carry 128-bit values. */
+
+ gf_w128_multiply_region_from_single(gf, src, dest, val, (rd.s_start-src), xor);
+
+ s64 = (uint64_t *) rd.s_start;
+ d64 = (uint64_t *) rd.d_start;
+ top = (uint64_t *) rd.d_top;
+
+ ld = (struct gf_w128_split_4_128_data *) h->private;
+
+ if (val[0] != ld->last_value[0] || val[1] != ld->last_value[1]) {
+ v[0] = val[0];
+ v[1] = val[1];
+ for (i = 0; i < 32; i++) {
+ ld->tables[0][i][0] = 0;
+ ld->tables[1][i][0] = 0;
+ for (j = 1; j < 16; j <<= 1) {
+ for (k = 0; k < j; k++) {
+ ld->tables[0][i][k^j] = (v[0] ^ ld->tables[0][i][k]);
+ ld->tables[1][i][k^j] = (v[1] ^ ld->tables[1][i][k]);
+ }
+ pp = (v[0] & (1ULL << 63));
+ v[0] <<= 1;
+ if (v[1] & (1ULL << 63)) v[0] ^= 1;
+ v[1] <<= 1;
+ if (pp) v[1] ^= h->prim_poly;
+ }
+ }
+ }
+
+ ld->last_value[0] = val[0];
+ ld->last_value[1] = val[1];
+
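+ /* Transpose the 64-bit table entries into 32x16 byte-lookup tables:
+ tables[i][j] holds byte j of the products of all 16 values of nibble i, so
+ _mm_shuffle_epi8 can look up sixteen products at once. Note that the
+ entries of ld->tables are consumed (shifted out) in the process. */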
+ for (i = 0; i < 32; i++) {
+ for (j = 0; j < 16; j++) {
+ for (k = 0; k < 16; k++) {
+ btable[k] = (uint8_t) ld->tables[1-(j/8)][i][k];
+ ld->tables[1-(j/8)][i][k] >>= 8;
+ }
+ tables[i][j] = _mm_loadu_si128((__m128i *) btable);
+/*
+ printf("%2d %2d: ", i, j);
+ MM_PRINT8("", tables[i][j]);
+ */
+ }
+ }
+
+
+ mask1 = _mm_set1_epi8(0xf);
+
+ while (d64 != top) {
+
+ if (xor) {
+ for (i = 0; i < 16; i++) p[i] = _mm_load_si128 ((__m128i *) (d64+i*2));
+ } else {
+ for (i = 0; i < 16; i++) p[i] = _mm_setzero_si128();
+ }
+ i = 0;
+ for (k = 0; k < 16; k++) {
+ v0 = _mm_load_si128((__m128i *) s64);
+ s64 += 2;
+
+ si = _mm_and_si128(v0, mask1);
+
+ for (j = 0; j < 16; j++) {
+ p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si));
+ }
+ i++;
+ v0 = _mm_srli_epi32(v0, 4);
+ si = _mm_and_si128(v0, mask1);
+ for (j = 0; j < 16; j++) {
+ p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si));
+ }
+ i++;
+ }
+ for (i = 0; i < 16; i++) {
+ _mm_store_si128((__m128i *) d64, p[i]);
+ d64 += 2;
+ }
+ }
+ /* Done by hand instead of gf_do_final_region_alignment(), because that function cannot carry 128-bit values. */
+
+ gf_w128_multiply_region_from_single(gf, rd.s_top, rd.d_top, val, (src+bytes)-rd.s_top, xor);
+#endif
+}
+
+static
+void
+gf_w128_split_8_128_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
+{
+ int i, j, k;
+ uint64_t pp;
+ gf_internal_t *h;
+ uint64_t *s64, *d64, *top;
+ gf_region_data rd;
+ uint64_t v[2], s;
+ struct gf_w128_split_8_128_data *ld;
+
+ /* Set up the region data; this also checks alignment. */
+ gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);
+
+ if (val[0] == 0) {
+ if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
+ if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+ }
+
+ h = (gf_internal_t *) gf->scratch;
+ ld = (struct gf_w128_split_8_128_data *) h->private;
+
+ s64 = (uint64_t *) rd.s_start;
+ d64 = (uint64_t *) rd.d_start;
+ top = (uint64_t *) rd.d_top;
+
+ if (val[0] != ld->last_value[0] || val[1] != ld->last_value[1]) {
+ v[0] = val[0];
+ v[1] = val[1];
+ for (i = 0; i < 16; i++) {
+ ld->tables[0][i][0] = 0;
+ ld->tables[1][i][0] = 0;
+ for (j = 1; j < (1 << 8); j <<= 1) {
+ for (k = 0; k < j; k++) {
+ ld->tables[0][i][k^j] = (v[0] ^ ld->tables[0][i][k]);
+ ld->tables[1][i][k^j] = (v[1] ^ ld->tables[1][i][k]);
+ }
+ pp = (v[0] & (1ULL << 63));
+ v[0] <<= 1;
+ if (v[1] & (1ULL << 63)) v[0] ^= 1;
+ v[1] <<= 1;
+ if (pp) v[1] ^= h->prim_poly;
+ }
+ }
+ }
+ ld->last_value[0] = val[0];
+ ld->last_value[1] = val[1];
+
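+ /* One 128-bit word per iteration: byte i of each source word indexes table i,
+ whose entries are val times (that byte shifted up by 8i bits), already
+ reduced; xoring the sixteen lookups gives the full product. */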
+ while (d64 < top) {
+ v[0] = (xor) ? d64[0] : 0;
+ v[1] = (xor) ? d64[1] : 0;
+ s = s64[1];
+ i = 0;
+ while (s != 0) {
+ v[0] ^= ld->tables[0][i][s&0xff];
+ v[1] ^= ld->tables[1][i][s&0xff];
+ s >>= 8;
+ i++;
+ }
+ s = s64[0];
+ i = 8;
+ while (s != 0) {
+ v[0] ^= ld->tables[0][i][s&0xff];
+ v[1] ^= ld->tables[1][i][s&0xff];
+ s >>= 8;
+ i++;
+ }
+ d64[0] = v[0];
+ d64[1] = v[1];
+ s64 += 2;
+ d64 += 2;
+ }
+}
+
void
gf_w128_bytwo_b_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
{
@@ -300,7 +800,7 @@ gf_w128_bytwo_b_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t va
s64 = (uint64_t *) rd.s_start;
d64 = (uint64_t *) rd.d_start;
top = (uint64_t *) rd.d_top;
- bmask = (1L << 63);
+ bmask = (1ULL << 63);
while (d64 < top) {
set_zero(c, 0);
@@ -359,11 +859,7 @@ void gf_w128_group_m_init(gf_t *gf, gf_val_128_t b128)
uint64_t a128[2];
scratch = (gf_internal_t *) gf->scratch;
gt = scratch->private;
- if (scratch->mult_type == GF_MULT_DEFAULT) {
- g_m = 4;
- } else {
- g_m = scratch->arg1;
- }
+ g_m = scratch->arg1;
prim_poly = scratch->prim_poly;
@@ -385,10 +881,49 @@ void gf_w128_group_m_init(gf_t *gf, gf_val_128_t b128)
return;
}
+static
+void gf_w128_group_m_sse_init(gf_t *gf, gf_val_128_t b128)
+{
+#ifdef INTEL_SSE4
+ int i, j;
+ int g_m;
+ uint64_t lbit, middlebit;
+ gf_internal_t *scratch;
+ gf_group_tables_t *gt;
+ scratch = (gf_internal_t *) gf->scratch;
+ gt = scratch->private;
+ g_m = scratch->arg1;
+
+ __m128i *table = (__m128i *)(gt->m_table), b, a, ubit, prim_poly;
+ prim_poly = _mm_insert_epi64(_mm_setzero_si128(), scratch->prim_poly, 0);
+ b = _mm_loadu_si128((__m128i *)(b128));
+
+ table[0] = _mm_setzero_si128();
+ table[1] = table[0];
+ table[1] = _mm_insert_epi64(table[1],b128[0],1);
+ table[1] = _mm_insert_epi64(table[1],b128[1],0);
+ lbit = 1;
+ lbit <<= 63;
+ ubit = _mm_set_epi32(0, 1, 0, 0);
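+
+ /* Build table[i] = i * b over GF(2^128): each power of two is the previous
+ entry doubled (shifted left with the lane-crossing bit restored via ubit,
+ and reduced by prim_poly on overflow); all other entries are xors of
+ entries already built. */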
+ for (i = 2; i < (1 << g_m); i <<= 1) {
+ a = table[(i >> 1)];
+ middlebit = (_mm_extract_epi64(a, 0x0) & lbit);
+ a = _mm_slli_epi64(a, 1);
+ if (middlebit) a = _mm_xor_si128(a, ubit);
+ table[i] = a;
+ if (_mm_extract_epi64(table[i >> 1], 0x1) & lbit) table[i] = _mm_xor_si128(table[i], prim_poly);
+ for (j = 0; j < i; j++) {
+ table[i + j] = _mm_xor_si128(table[i], table[j]);
+ }
+ }
+ return;
+#endif
+}
+
void
gf_w128_group_multiply(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
{
- int i;
+ int i,j;
/* index_r, index_m, total_m (if g_r > g_m) */
int i_r, i_m, t_m;
int mask_m, mask_r;
@@ -399,13 +934,8 @@ gf_w128_group_multiply(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_
scratch = (gf_internal_t *) gf->scratch;
gt = scratch->private;
- if (scratch->mult_type == GF_MULT_DEFAULT) {
- g_m = 4;
- g_r = 8;
- } else {
- g_m = scratch->arg1;
- g_r = scratch->arg2;
- }
+ g_m = scratch->arg1;
+ g_r = scratch->arg2;
mask_m = (1 << g_m) - 1;
mask_r = (1 << g_r) - 1;
@@ -413,7 +943,7 @@ gf_w128_group_multiply(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_
if (b128[0] != gt->m_table[2] || b128[1] != gt->m_table[3]) {
gf_w128_group_m_init(gf, b128);
}
-
+
p_i[0] = 0;
p_i[1] = 0;
a[0] = a128[0];
@@ -458,11 +988,92 @@ gf_w128_group_multiply(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_
i_r <<= g_m;
}
}
-
c128[0] = p_i[0];
c128[1] = p_i[1];
}
+void
+gf_w128_group_sse_multiply(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
+{
+#ifdef INTEL_SSE4
+ int i,j;
+ int i_r, i_m, t_m;
+ int mask_m, mask_r, mask_s;
+ int g_m, g_r;
+ uint32_t shiftbits;
+ uint64_t a[2], tbit = 1;
+ tbit <<= 63;
+ gf_internal_t *scratch;
+ gf_group_tables_t *gt;
+ __m128i p_i, *m_table, *r_table, zero;
+
+ zero = _mm_setzero_si128();
+ scratch = (gf_internal_t *) gf->scratch;
+ gt = scratch->private;
+ m_table = (__m128i *)(gt->m_table);
+ r_table = (__m128i *)(gt->r_table);
+ g_m = scratch->arg1;
+ g_r = scratch->arg2;
+
+ mask_m = (1 << g_m) - 1;
+ mask_r = (1 << g_r) - 1;
+ mask_s = mask_m << (32-g_m); /*sets g_m leftmost bits to 1*/
+ if (b128[0] != _mm_extract_epi64(m_table[1], 1) || b128[1] != _mm_extract_epi64(m_table[1], 0)) {
+ gf_w128_group_m_sse_init(gf, b128);
+ }
+
+ p_i = zero;
+ a[0] = a128[0];
+ a[1] = a128[1];
+
+ t_m = 0;
+ i_r = 0;
+
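+ /* _mm_slli_epi64 shifts the two 64-bit lanes independently, so the g_m bits
+ that would cross the lane boundary are pulled out first (shiftbits) and
+ xored back into the low end of the high lane. */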
+ /* Top 64 bits */
+ for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) {
+ i_m = (a[0] >> (i * g_m)) & mask_m;
+ i_r ^= ((uint64_t)_mm_extract_epi64(p_i, 1) >> (64 - g_m)) & mask_r;
+
+ shiftbits = _mm_extract_epi32(p_i, 1) & mask_s;
+ shiftbits >>= 32-g_m;
+ p_i = _mm_slli_epi64(p_i, g_m);
+ p_i = _mm_xor_si128(p_i, _mm_insert_epi32(zero, shiftbits, 2));
+
+ p_i = _mm_xor_si128(p_i, m_table[i_m]);
+ t_m += g_m;
+ if (t_m == g_r) {
+ p_i = _mm_xor_si128(p_i, r_table[i_r]);
+ t_m = 0;
+ i_r = 0;
+ } else {
+ i_r <<= g_m;
+ }
+ }
+
+ for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) {
+ i_m = (a[1] >> (i * g_m)) & mask_m;
+ i_r ^= (((uint64_t)_mm_extract_epi64(p_i,1)) >> (64 - g_m)) & mask_r;
+
+ shiftbits = _mm_extract_epi32(p_i, 1) & mask_s;
+ shiftbits >>= 32-g_m;
+ p_i = _mm_slli_epi64(p_i, g_m);
+ p_i = _mm_xor_si128(p_i, _mm_insert_epi32(zero, shiftbits, 2));
+
+ p_i = _mm_xor_si128(p_i, m_table[i_m]);
+ t_m += g_m;
+ if (t_m == g_r) {
+ p_i = _mm_xor_si128(p_i, r_table[i_r]);
+ t_m = 0;
+ i_r = 0;
+ } else {
+ i_r <<= g_m;
+ }
+ }
+ c128[0] = _mm_extract_epi64(p_i, 1);
+ c128[1] = _mm_extract_epi64(p_i, 0);
+#endif
+}
+
static
void
gf_w128_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
@@ -487,13 +1098,8 @@ gf_w128_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val,
scratch = (gf_internal_t *) gf->scratch;
gt = scratch->private;
- if (scratch->mult_type == GF_MULT_DEFAULT) {
- g_m = 4;
- g_r = 8;
- } else {
- g_m = scratch->arg1;
- g_r = scratch->arg2;
- }
+ g_m = scratch->arg1;
+ g_r = scratch->arg2;
mask_m = (1 << g_m) - 1;
mask_r = (1 << g_r) - 1;
@@ -522,6 +1128,7 @@ gf_w128_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val,
p_i[0] <<= g_m;
p_i[0] ^= (p_i[1] >> (64-g_m));
p_i[1] <<= g_m;
+
p_i[0] ^= gt->m_table[2 * i_m];
p_i[1] ^= gt->m_table[(2 * i_m) + 1];
t_m += g_m;
@@ -533,7 +1140,6 @@ gf_w128_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val,
i_r <<= g_m;
}
}
-
for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) {
i_m = (a[1] >> (i * g_m)) & mask_m;
i_r ^= (p_i[0] >> (64 - g_m)) & mask_r;
@@ -564,9 +1170,162 @@ gf_w128_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val,
}
}
+static
+void
+gf_w128_group_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
+{
+#ifdef INTEL_SSE4
+ int i;
+ int i_r, i_m, t_m;
+ int mask_m, mask_r, mask_s;
+ int g_m, g_r;
+ uint32_t shiftbits;
+ uint64_t a[2];
+ gf_internal_t *scratch;
+ gf_group_tables_t *gt;
+ gf_region_data rd;
+ uint64_t *a128, *c128, *top;
+ __m128i *m_table, *r_table, p_i, zero;
+ zero = _mm_setzero_si128();
+ /* We only do this to check on alignment. */
+ gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);
+
+ if (val[0] == 0) {
+ if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
+ if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+ }
+
+ scratch = (gf_internal_t *) gf->scratch;
+ gt = scratch->private;
+ m_table = (__m128i *)(gt->m_table);
+ r_table = (__m128i *)(gt->r_table);
+ g_m = scratch->arg1;
+ g_r = scratch->arg2;
+
+ mask_m = (1 << g_m) - 1;
+ mask_r = (1 << g_r) - 1;
+ mask_s = mask_m << (32-g_m);
+
+ if (val[0] != _mm_extract_epi64(m_table[1], 1) || val[1] != _mm_extract_epi64(m_table[1], 0)) {
+ gf_w128_group_m_sse_init(gf, val);
+ }
+
+ a128 = (uint64_t *) src;
+ c128 = (uint64_t *) dest;
+ top = (uint64_t *) rd.d_top;
+
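+ /* The xor and non-xor branches share the same inner loop; see
+ gf_w128_group_sse_multiply() above for how the 128-bit shift is emulated. */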
+ if (xor){
+ while (c128 < top) {
+ p_i = zero;
+ a[0] = a128[0];
+ a[1] = a128[1];
+
+ t_m = 0;
+ i_r = 0;
+ /* Top 64 bits */
+ for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) {
+ i_m = (a[0] >> (i * g_m)) & mask_m;
+ i_r ^= ((uint64_t)_mm_extract_epi64(p_i, 1) >> (64 - g_m)) & mask_r;
+
+ shiftbits = _mm_extract_epi32(p_i, 1) & mask_s;
+ shiftbits >>= 32-g_m;
+ p_i = _mm_slli_epi64(p_i, g_m);
+ p_i = _mm_xor_si128(p_i, _mm_insert_epi32(zero, shiftbits, 2));
+ p_i = _mm_xor_si128(p_i, m_table[i_m]);
+ t_m += g_m;
+ if (t_m == g_r) {
+ p_i = _mm_xor_si128(p_i, r_table[i_r]);
+ t_m = 0;
+ i_r = 0;
+ } else {
+ i_r <<= g_m;
+ }
+ }
+
+ for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) {
+ i_m = (a[1] >> (i * g_m)) & mask_m;
+ i_r ^= ((uint64_t)_mm_extract_epi64(p_i, 1) >> (64 - g_m)) & mask_r;
+
+ shiftbits = _mm_extract_epi32(p_i, 1) & mask_s;
+ shiftbits >>= 32-g_m;
+ p_i = _mm_slli_epi64(p_i, g_m);
+ p_i = _mm_xor_si128(p_i, _mm_insert_epi32(zero, shiftbits, 2));
+
+ p_i = _mm_xor_si128(p_i, m_table[i_m]);
+ t_m += g_m;
+ if (t_m == g_r) {
+ p_i = _mm_xor_si128(p_i, r_table[i_r]);
+ t_m = 0;
+ i_r = 0;
+ } else {
+ i_r <<= g_m;
+ }
+ }
+
+ c128[0] ^= _mm_extract_epi64(p_i, 1);
+ c128[1] ^= _mm_extract_epi64(p_i, 0);
+ a128 += 2;
+ c128 += 2;
+ }
+ }else{
+ while (c128 < top) {
+ p_i = zero;
+ a[0] = a128[0];
+ a[1] = a128[1];
+
+ t_m = 0;
+ i_r = 0;
+ /* Top 64 bits */
+ for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) {
+ i_m = (a[0] >> (i * g_m)) & mask_m;
+ i_r ^= ((uint64_t)_mm_extract_epi64(p_i, 1) >> (64 - g_m)) & mask_r;
+
+ shiftbits = _mm_extract_epi32(p_i, 1) & mask_s;
+ shiftbits >>= 32-g_m;
+ p_i = _mm_slli_epi64(p_i, g_m);
+ p_i = _mm_xor_si128(p_i, _mm_insert_epi32(zero, shiftbits, 2));
+ p_i = _mm_xor_si128(p_i, m_table[i_m]);
+ t_m += g_m;
+ if (t_m == g_r) {
+ p_i = _mm_xor_si128(p_i, r_table[i_r]);
+ t_m = 0;
+ i_r = 0;
+ } else {
+ i_r <<= g_m;
+ }
+ }
+
+ for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) {
+ i_m = (a[1] >> (i * g_m)) & mask_m;
+ i_r ^= ((uint64_t)_mm_extract_epi64(p_i, 1) >> (64 - g_m)) & mask_r;
+
+ shiftbits = _mm_extract_epi32(p_i, 1) & mask_s;
+ shiftbits >>= 32-g_m;
+ p_i = _mm_slli_epi64(p_i, g_m);
+ p_i = _mm_xor_si128(p_i, _mm_insert_epi32(zero, shiftbits, 2));
+
+ p_i = _mm_xor_si128(p_i, m_table[i_m]);
+ t_m += g_m;
+ if (t_m == g_r) {
+ p_i = _mm_xor_si128(p_i, r_table[i_r]);
+ t_m = 0;
+ i_r = 0;
+ } else {
+ i_r <<= g_m;
+ }
+ }
+
+ c128[0] = _mm_extract_epi64(p_i, 1);
+ c128[1] = _mm_extract_epi64(p_i, 0);
+ a128 += 2;
+ c128 += 2;
+ }
+ }
+#endif
+}
/* a^-1 -> b */
-void
+ void
gf_w128_euclid(GFP gf, gf_val_128_t a128, gf_val_128_t b128)
{
uint64_t e_i[2], e_im1[2], e_ip1[2];
@@ -585,10 +1344,26 @@ gf_w128_euclid(GFP gf, gf_val_128_t a128, gf_val_128_t b128)
e_i[0] = a128[0];
e_i[1] = a128[1];
d_im1 = 128;
+
+ //Allen: I think d_i starts at 63 here, and checks each bit of a, starting at the MSB, looking for the first nonzero bit.
+ //So d_i should be 0 if this half of a is all zeros; otherwise it is the position (from the right) of the first-from-left nonzero bit of this half of a.
+ //BUT if d_i is 0 at the end, we do not yet know whether the rightmost bit of this half is 1 or not.
+
for (d_i = (d_im1-1) % 64; ((one << d_i) & e_i[0]) == 0 && d_i > 0; d_i--) ;
+
+ //Allen: this is testing just the first half of the stop condition above, so if it holds we know we did not find a nonzero bit yet
+
if (!((one << d_i) & e_i[0])) {
- for (d_i = (d_im1-1) % 64; ((one << d_i) & e_i[1] == 0); d_i--) ;
+
+ //Allen: this is doing the same thing on the other half of a. In other words, we're still searching for a nonzero bit of a.
+ // but not bothering to test if d_i hits zero, which is fine because we've already tested for a=0.
+
+ for (d_i = (d_im1-1) % 64; ((one << d_i) & e_i[1]) == 0; d_i--) ;
+
} else {
+
+ //Allen: if a 1 was found in more-significant half of a, make d_i the ACTUAL index of the first nonzero bit in the entire a.
+
d_i += 64;
}
y_i[0] = 0;
@@ -614,11 +1389,11 @@ gf_w128_euclid(GFP gf, gf_val_128_t a128, gf_val_128_t b128)
if (d_ip1 - d_i > 0) e_ip1[0] ^= (e_i[1] >> (64 - (d_ip1 - d_i)));
e_ip1[1] ^= (e_i[1] << (d_ip1 - d_i));
}
- d_ip1--;
+ d_ip1--;
+ if (e_ip1[0] == 0 && e_ip1[1] == 0) { b[0] = 0; b[1] = 0; return; }
while (d_ip1 >= 64 && (e_ip1[0] & (one << (d_ip1 - 64))) == 0) d_ip1--;
while (d_ip1 < 64 && (e_ip1[1] & (one << d_ip1)) == 0) d_ip1--;
}
-
gf->multiply.w128(gf, c_i, y_i, y_ip1);
y_ip1[0] ^= y_im1[0];
y_ip1[1] ^= y_im1[1];
@@ -640,11 +1415,10 @@ gf_w128_euclid(GFP gf, gf_val_128_t a128, gf_val_128_t b128)
b = (uint64_t *) b128;
b[0] = y_i[0];
b[1] = y_i[1];
-
return;
}
-void
+ void
gf_w128_divide_from_inverse(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
{
uint64_t d[2];
@@ -653,7 +1427,7 @@ gf_w128_divide_from_inverse(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val
return;
}
-void
+ void
gf_w128_inverse_from_divide(GFP gf, gf_val_128_t a128, gf_val_128_t b128)
{
uint64_t one128[2];
@@ -663,21 +1437,209 @@ gf_w128_inverse_from_divide(GFP gf, gf_val_128_t a128, gf_val_128_t b128)
return;
}
+
+static
+ void
+gf_w128_composite_inverse(gf_t *gf, gf_val_128_t a, gf_val_128_t inv)
+{
+ gf_internal_t *h = (gf_internal_t *) gf->scratch;
+ gf_t *base_gf = h->base_gf;
+ uint64_t a0 = a[1];
+ uint64_t a1 = a[0];
+ uint64_t c0, c1, d, tmp;
+ uint64_t a0inv, a1inv;
+
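+ /* Inversion in the composite field GF((2^64)^2) defined by x^2 = s*x + 1,
+ s = h->prim_poly: for a = a1*x + a0 we solve (a1*x + a0)(c1*x + c0) = 1.
+ If a0 = 0 this collapses to c1 = 1/a1, c0 = s/a1; if a1 = 0 it is just the
+ base-field inverse. */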
+ if (a0 == 0) {
+ a1inv = base_gf->inverse.w64(base_gf, a1);
+ c0 = base_gf->multiply.w64(base_gf, a1inv, h->prim_poly);
+ c1 = a1inv;
+ } else if (a1 == 0) {
+ c0 = base_gf->inverse.w64(base_gf, a0);
+ c1 = 0;
+ } else {
+ a1inv = base_gf->inverse.w64(base_gf, a1);
+ a0inv = base_gf->inverse.w64(base_gf, a0);
+
+ d = base_gf->multiply.w64(base_gf, a1, a0inv);
+
+ tmp = (base_gf->multiply.w64(base_gf, a1, a0inv) ^ base_gf->multiply.w64(base_gf, a0, a1inv) ^ h->prim_poly);
+ tmp = base_gf->inverse.w64(base_gf, tmp);
+
+ d = base_gf->multiply.w64(base_gf, d, tmp);
+
+ c0 = base_gf->multiply.w64(base_gf, (d^1), a0inv);
+ c1 = base_gf->multiply.w64(base_gf, d, a1inv);
+ }
+ inv[0] = c1;
+ inv[1] = c0;
+}
+
+static
+ void
+gf_w128_composite_multiply(gf_t *gf, gf_val_128_t a, gf_val_128_t b, gf_val_128_t rv)
+{
+ gf_internal_t *h = (gf_internal_t *) gf->scratch;
+ gf_t *base_gf = h->base_gf;
+ uint64_t b0 = b[1];
+ uint64_t b1 = b[0];
+ uint64_t a0 = a[1];
+ uint64_t a1 = a[0];
+ uint64_t a1b1;
+
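+ /* Multiplication in GF((2^64)^2) with x^2 = s*x + 1, s = h->prim_poly:
+ (a1*x + a0)(b1*x + b0) = (a1*b0 ^ a0*b1 ^ s*a1*b1)*x + (a0*b0 ^ a1*b1). */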
+ a1b1 = base_gf->multiply.w64(base_gf, a1, b1);
+
+ rv[1] = (base_gf->multiply.w64(base_gf, a0, b0) ^ a1b1);
+ rv[0] = base_gf->multiply.w64(base_gf, a1, b0) ^
+ base_gf->multiply.w64(base_gf, a0, b1) ^
+ base_gf->multiply.w64(base_gf, a1b1, h->prim_poly);
+}
+
+static
+ void
+gf_w128_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
+{
+ unsigned long uls, uld;
+ gf_internal_t *h = (gf_internal_t *) gf->scratch;
+ gf_t *base_gf = h->base_gf;
+ uint64_t b0 = val[1];
+ uint64_t b1 = val[0];
+ uint64_t *s64, *d64;
+ uint64_t *top;
+ uint64_t a0, a1, a1b1;
+ gf_region_data rd;
+
+ if (val[0] == 0 && val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
+
+ gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);
+
+ s64 = rd.s_start;
+ d64 = rd.d_start;
+ top = rd.d_top;
+
+ if (xor) {
+ while (d64 < top) {
+ a1 = s64[0];
+ a0 = s64[1];
+ a1b1 = base_gf->multiply.w64(base_gf, a1, b1);
+
+ d64[1] ^= (base_gf->multiply.w64(base_gf, a0, b0) ^ a1b1);
+ d64[0] ^= (base_gf->multiply.w64(base_gf, a1, b0) ^
+ base_gf->multiply.w64(base_gf, a0, b1) ^
+ base_gf->multiply.w64(base_gf, a1b1, h->prim_poly));
+ s64 += 2;
+ d64 += 2;
+ }
+ } else {
+ while (d64 < top) {
+ a1 = s64[0];
+ a0 = s64[1];
+ a1b1 = base_gf->multiply.w64(base_gf, a1, b1);
+
+ d64[1] = (base_gf->multiply.w64(base_gf, a0, b0) ^ a1b1);
+ d64[0] = (base_gf->multiply.w64(base_gf, a1, b0) ^
+ base_gf->multiply.w64(base_gf, a0, b1) ^
+ base_gf->multiply.w64(base_gf, a1b1, h->prim_poly));
+ s64 += 2;
+ d64 += 2;
+ }
+ }
+}
+
+static
+void
+gf_w128_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int
+ xor)
+{
+ gf_internal_t *h = (gf_internal_t *) gf->scratch; gf_t *base_gf = h->base_gf;
+ gf_val_64_t val0 = val[1];
+ gf_val_64_t val1 = val[0];
+ uint64_t *l, *hi;
+ uint8_t *slow, *shigh;
+ uint8_t *dlow, *dhigh, *top;
+ int sub_reg_size;
+ gf_region_data rd;
+
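+ /* ALTMAP layout: all low 64-bit halves of the region are stored
+ contiguously, followed by all high halves, so each term of the composite
+ product becomes a single base-field multiply_region call:
+ dlow = val0*slow ^ val1*shigh
+ dhigh = val1*slow ^ val0*shigh ^ (s*val1)*shigh */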
+ gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 64);
+ gf_w128_multiply_region_from_single(gf, src, dest, val, (rd.s_start-src), xor);
+
+ slow = (uint8_t *) rd.s_start;
+ dlow = (uint8_t *) rd.d_start;
+ top = (uint8_t*) rd.d_top;
+ sub_reg_size = (top - dlow)/2;
+ shigh = slow + sub_reg_size;
+ dhigh = dlow + sub_reg_size;
+
+ base_gf->multiply_region.w64(base_gf, slow, dlow, val0, sub_reg_size, xor);
+ base_gf->multiply_region.w64(base_gf, shigh, dlow, val1, sub_reg_size, 1);
+ base_gf->multiply_region.w64(base_gf, slow, dhigh, val1, sub_reg_size, xor);
+ base_gf->multiply_region.w64(base_gf, shigh, dhigh, val0, sub_reg_size, 1);
+ base_gf->multiply_region.w64(base_gf, shigh, dhigh, base_gf->multiply.w64(base_gf, h->prim_poly, val1
+ ), sub_reg_size, 1);
+
+ gf_w128_multiply_region_from_single(gf, rd.s_top, rd.d_top, val, (src+bytes)-rd.s_top, xor);
+}
+
+
+ static
+int gf_w128_composite_init(gf_t *gf)
+{
+ gf_internal_t *h = (gf_internal_t *) gf->scratch;
+
+ if (h->region_type & GF_REGION_ALTMAP) {
+ gf->multiply_region.w128 = gf_w128_composite_multiply_region_alt;
+ } else {
+ gf->multiply_region.w128 = gf_w128_composite_multiply_region;
+ }
+
+ gf_internal_t *base_h = (gf_internal_t *) h->base_gf->scratch;
+
+ gf->multiply.w128 = gf_w128_composite_multiply;
+ gf->divide.w128 = gf_w128_divide_from_inverse;
+ gf->inverse.w128 = gf_w128_composite_inverse;
+
+ return 1;
+}
+
+static
+int gf_w128_cfm_init(gf_t *gf)
+{
+#ifdef INTEL_SSE4_PCLMUL
+ gf->inverse.w128 = gf_w128_euclid;
+ gf->multiply.w128 = gf_w128_clm_multiply;
+ gf->multiply_region.w128 = gf_w128_clm_multiply_region_from_single;
+ return 1;
+#endif
+
+ return 0;
+}
+
static
int gf_w128_shift_init(gf_t *gf)
{
+ gf_internal_t *h;
+ h = (gf_internal_t*) gf->scratch;
gf->multiply.w128 = gf_w128_shift_multiply;
gf->inverse.w128 = gf_w128_euclid;
gf->multiply_region.w128 = gf_w128_multiply_region_from_single;
return 1;
}
-static
+ static
int gf_w128_bytwo_init(gf_t *gf)
{
- gf->multiply.w128 = gf_w128_bytwo_b_multiply;
+ gf_internal_t *h;
+ h = (gf_internal_t *) gf->scratch;
+
+ if (h->mult_type == GF_MULT_BYTWO_p) {
+ gf->multiply.w128 = gf_w128_bytwo_p_multiply;
+ /* gf->multiply.w128 = gf_w128_sse_bytwo_p_multiply; */
+ /* John: the SSE function is slower. */
+ } else {
+ gf->multiply.w128 = gf_w128_bytwo_b_multiply;
+ /* gf->multiply.w128 = gf_w128_sse_bytwo_b_multiply;
+ Ben: This SSE function is also slower. */
+ }
gf->inverse.w128 = gf_w128_euclid;
- gf->multiply_region.w128 = gf_w128_multiply_region_from_single;
gf->multiply_region.w128 = gf_w128_bytwo_b_multiply_region;
return 1;
}
@@ -686,7 +1648,7 @@ int gf_w128_bytwo_init(gf_t *gf)
* Because the prim poly is only 8 bits and we are limiting g_r to 16, I do not need the high 64
* bits in all of these numbers.
*/
-static
+ static
void gf_w128_group_r_init(gf_t *gf)
{
int i, j;
@@ -696,11 +1658,7 @@ void gf_w128_group_r_init(gf_t *gf)
gf_group_tables_t *gt;
scratch = (gf_internal_t *) gf->scratch;
gt = scratch->private;
- if (scratch->mult_type == GF_MULT_DEFAULT) {
- g_r = 8;
- } else {
- g_r = scratch->arg2;
- }
+ g_r = scratch->arg2;
pp = scratch->prim_poly;
gt->r_table[0] = 0;
@@ -715,20 +1673,76 @@ void gf_w128_group_r_init(gf_t *gf)
return;
}
-static
+ static
+void gf_w128_group_r_sse_init(gf_t *gf)
+{
+#ifdef INTEL_SSE4
+ int i, j;
+ int g_r;
+ uint64_t pp;
+ gf_internal_t *scratch;
+ gf_group_tables_t *gt;
+ scratch = (gf_internal_t *) gf->scratch;
+ gt = scratch->private;
+ __m128i zero = _mm_setzero_si128();
+ __m128i *table = (__m128i *)(gt->r_table);
+ g_r = scratch->arg2;
+ pp = scratch->prim_poly;
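+
+ /* r_table[i] is the carryless product of i and pp; since pp is at most 8
+ bits and g_r at most 16, it fits in the low 64 bits (see the note above
+ gf_w128_group_r_init). */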
+ table[0] = zero;
+ for (i = 1; i < (1 << g_r); i++) {
+ table[i] = zero;
+ for (j = 0; j < g_r; j++) {
+ if (i & (1 << j)) {
+ table[i] = _mm_xor_si128(table[i], _mm_insert_epi64(zero, pp << j, 0));
+ }
+ }
+ }
+ return;
+#endif
+}
+
+ static
int gf_w128_split_init(gf_t *gf)
{
- struct gf_w128_split_4_128_data *sd;
+ struct gf_w128_split_4_128_data *sd4;
+ struct gf_w128_split_8_128_data *sd8;
gf_internal_t *h;
h = (gf_internal_t *) gf->scratch;
- sd = (struct gf_w128_split_4_128_data *) h->private;
- sd->last_value[0] = 0;
- sd->last_value[1] = 0;
- gf->multiply.w128 = gf_w128_bytwo_b_multiply;
+ gf->multiply.w128 = gf_w128_bytwo_p_multiply;
+#ifdef INTEL_SSE4_PCLMUL
+ if (!(h->region_type & GF_REGION_NOSSE)){
+ gf->multiply.w128 = gf_w128_clm_multiply;
+ }
+#endif
+
gf->inverse.w128 = gf_w128_euclid;
- gf->multiply_region.w128 = gf_w128_split_4_128_multiply_region;
+
+ if ((h->arg1 != 4 && h->arg2 != 4) || h->mult_type == GF_MULT_DEFAULT) {
+ sd8 = (struct gf_w128_split_8_128_data *) h->private;
+ sd8->last_value[0] = 0;
+ sd8->last_value[1] = 0;
+ gf->multiply_region.w128 = gf_w128_split_8_128_multiply_region;
+ } else {
+ sd4 = (struct gf_w128_split_4_128_data *) h->private;
+ sd4->last_value[0] = 0;
+ sd4->last_value[1] = 0;
+ if((h->region_type & GF_REGION_ALTMAP))
+ {
+ #ifdef INTEL_SSE4
+ if(!(h->region_type & GF_REGION_NOSSE))
+ gf->multiply_region.w128 = gf_w128_split_4_128_sse_altmap_multiply_region;
+ else
+ return 0;
+ #else
+ return 0;
+ #endif
+ }
+ else {
+ gf->multiply_region.w128 = gf_w128_split_4_128_multiply_region;
+ }
+ }
return 1;
}
@@ -739,16 +1753,12 @@ int gf_w128_group_init(gf_t *gf)
gf_internal_t *scratch;
gf_group_tables_t *gt;
int g_m, g_r, size_r;
+ long tmp;
scratch = (gf_internal_t *) gf->scratch;
gt = scratch->private;
- if (scratch->mult_type == GF_MULT_DEFAULT) {
- g_m = 4;
- g_r = 8;
- } else {
- g_m = scratch->arg1;
- g_r = scratch->arg2;
- }
+ g_m = scratch->arg1;
+ g_r = scratch->arg2;
size_r = (1 << g_r);
gt->r_table = scratch->private + (2 * sizeof(uint64_t *));
@@ -756,11 +1766,30 @@ int gf_w128_group_init(gf_t *gf)
gt->m_table[2] = 0;
gt->m_table[3] = 0;
- gf_w128_group_r_init(gf);
-
gf->multiply.w128 = gf_w128_group_multiply;
gf->inverse.w128 = gf_w128_euclid;
gf->multiply_region.w128 = gf_w128_group_multiply_region;
+
+ #ifdef INTEL_SSE4
+ if(!(scratch->region_type & GF_REGION_NOSSE))
+ {
+ if ((g_m != 4) || ((g_r != 4) && (g_r != 8)))
+ return 0;
+ gt->r_table = (void *)(((uint64_t)gt->r_table + 15) & (~0xfULL)); /* aligns gt->r_table on a 16-byte boundary */
+ gt->m_table = gt->r_table + 2*size_r;
+ gt->m_table[2] = 0;
+ gt->m_table[3] = 0;
+ gf->multiply.w128 = gf_w128_group_sse_multiply;
+ gf->multiply_region.w128 = gf_w128_group_sse_multiply_region;
+ gf_w128_group_r_sse_init(gf);
+ }
+ else
+ gf_w128_group_r_init(gf);
+ #else
+ if(scratch->region_type & GF_REGION_SSE) return 0;
+ else gf_w128_group_r_init(gf);
+ #endif
+
return 1;
}
@@ -773,88 +1802,175 @@ void gf_w128_extract_word(gf_t *gf, void *start, int bytes, int index, gf_val_12
memcpy(rv, s, 16);
}
+static void gf_w128_split_extract_word(gf_t *gf, void *start, int bytes, int index, gf_val_128_t rv)
+{
+ int i, blocks;
+ uint64_t *r64, tmp;
+ uint8_t *r8;
+ gf_region_data rd;
+
+ gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 256);
+ r64 = (uint64_t *) start;
+ if ((r64 + index*2 < (uint64_t *) rd.d_start) ||
+ (r64 + index*2 >= (uint64_t *) rd.d_top)) {
+ memcpy(rv, r64+(index*2), 16);
+ return;
+ }
+
+ index -= (((uint64_t *) rd.d_start) - r64)/2;
+ r64 = (uint64_t *) rd.d_start;
+
+ blocks = index/16;
+ r64 += (blocks*32);
+ index %= 16;
+ r8 = (uint8_t *) r64;
+ r8 += index;
+ rv[0] = 0;
+ rv[1] = 0;
+
+ for (i = 0; i < 8; i++) {
+ tmp = *r8;
+ rv[1] |= (tmp << (i*8));
+ r8 += 16;
+ }
+
+ for (i = 0; i < 8; i++) {
+ tmp = *r8;
+ rv[0] |= (tmp << (i*8));
+ r8 += 16;
+ }
+ return;
+}
+
+ static
+void gf_w128_composite_extract_word(gf_t *gf, void *start, int bytes, int index, gf_val_128_t rv)
+{
+ int sub_size;
+ gf_internal_t *h;
+ uint8_t *r8, *top;
+ uint64_t *r64;
+ gf_region_data rd;
+
+ h = (gf_internal_t *) gf->scratch;
+ gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 64);
+ r64 = (uint64_t *) start;
+ if ((r64 + index*2 < (uint64_t *) rd.d_start) ||
+ (r64 + index*2 >= (uint64_t *) rd.d_top)) {
+ memcpy(rv, r64+(index*2), 16);
+ return;
+ }
+ index -= (((uint64_t *) rd.d_start) - r64)/2;
+ r8 = (uint8_t *) rd.d_start;
+ top = (uint8_t *) rd.d_top;
+ sub_size = (top-r8)/2;
+
+ rv[1] = h->base_gf->extract_word.w64(h->base_gf, r8, sub_size, index);
+ rv[0] = h->base_gf->extract_word.w64(h->base_gf, r8+sub_size, sub_size, index);
+
+ return;
+}
+
int gf_w128_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
{
int size_m, size_r;
int w = 128;
+ if (divide_type == GF_DIVIDE_MATRIX) return 0;
+
switch(mult_type)
{
+ case GF_MULT_CARRY_FREE:
+ return sizeof(gf_internal_t);
+ break;
case GF_MULT_SHIFT:
- if (arg1 != 0 || arg2 != 0 || region_type != 0) return -1;
return sizeof(gf_internal_t);
break;
+ case GF_MULT_BYTWO_p:
case GF_MULT_BYTWO_b:
- if (arg1 != 0 || arg2 != 0 || region_type != 0) return -1;
return sizeof(gf_internal_t);
break;
+ case GF_MULT_DEFAULT:
case GF_MULT_SPLIT_TABLE:
- if (region_type != 0) return -1;
if ((arg1 == 4 && arg2 == 128) || (arg1 == 128 && arg2 == 4)) {
return sizeof(gf_internal_t) + sizeof(struct gf_w128_split_4_128_data) + 64;
+ } else if ((arg1 == 8 && arg2 == 128) || (arg1 == 128 && arg2 == 8) || mult_type == GF_MULT_DEFAULT) {
+ return sizeof(gf_internal_t) + sizeof(struct gf_w128_split_8_128_data) + 64;
}
- return -1;
+ return 0;
break;
- case GF_MULT_DEFAULT:
- arg1 = 4;
- arg2 = 8;
case GF_MULT_GROUP:
-
- /* arg1 == mult size, arg2 == reduce size */
- /* Should prevent anything over arg1 > 16 || arg2 > 16 */
- if (region_type != 0) return -1;
- if (arg1 <= 0 || arg2 <= 0 || arg1 > 16 || arg2 > 16) return -1;
- if (GF_FIELD_WIDTH % arg1 != 0 || GF_FIELD_WIDTH % arg2 != 0) return -1;
- /*
- * Currently implementing code where g_m and g_r are the same or where g_r is larger, as
- * these it is more efficient to have g_r as large as possible (but still not > 16)
- */
- if (arg1 > arg2) return -1;
-
- /* size of each group, 128 bits */
+ /* JSP: We've already error-checked the arguments. */
size_m = (1 << arg1) * 2 * sizeof(uint64_t);
- /* The PP is only 8 bits and we are limiting g_r to 16, so only uint64_t */
- size_r = (1 << arg2) * sizeof(uint64_t);
-
+ size_r = (1 << arg2) * 2 * sizeof(uint64_t);
/*
* two pointers prepend the table data for structure
* because the tables are of dynamic size
*/
- return sizeof(gf_internal_t) + size_m + size_r + 2 * sizeof(uint64_t *);
+ return sizeof(gf_internal_t) + size_m + size_r + 4 * sizeof(uint64_t *);
+ break;
+ case GF_MULT_COMPOSITE:
+ if (arg1 == 2) {
+ return sizeof(gf_internal_t) + 4;
+ } else {
+ return 0;
+ }
+ break;
+
default:
- return -1;
+ return 0;
}
}
int gf_w128_init(gf_t *gf)
{
- gf_internal_t *h;
+ gf_internal_t *h, *h_base, *h_base_base, *h_base_base_base;
+ int no_default_flag = 0;
h = (gf_internal_t *) gf->scratch;
- if (h->prim_poly == 0) h->prim_poly = 0x87; /* Omitting the leftmost 1 as in w=32 */
+
+ /* Allen: set default primitive polynomial / irreducible polynomial if needed */
+
+ if (h->prim_poly == 0) {
+ if (h->mult_type == GF_MULT_COMPOSITE) {
+ h->prim_poly = gf_composite_get_default_poly(h->base_gf);
+ if (h->prim_poly == 0) return 0; /* This shouldn't happen */
+ } else {
+ h->prim_poly = 0x87; /* Omitting the leftmost 1 as in w=32 */
+ }
+ if (no_default_flag == 1) {
+ fprintf(stderr,"Code contains no default irreducible polynomial for given base field\n");
+ return 0;
+ }
+ }
gf->multiply.w128 = NULL;
gf->divide.w128 = NULL;
gf->inverse.w128 = NULL;
gf->multiply_region.w128 = NULL;
-
switch(h->mult_type) {
+ case GF_MULT_BYTWO_p:
case GF_MULT_BYTWO_b: if (gf_w128_bytwo_init(gf) == 0) return 0; break;
+ case GF_MULT_CARRY_FREE: if (gf_w128_cfm_init(gf) == 0) return 0; break;
case GF_MULT_SHIFT: if (gf_w128_shift_init(gf) == 0) return 0; break;
- case GF_MULT_DEFAULT:
case GF_MULT_GROUP: if (gf_w128_group_init(gf) == 0) return 0; break;
+ case GF_MULT_DEFAULT:
case GF_MULT_SPLIT_TABLE: if (gf_w128_split_init(gf) == 0) return 0; break;
+ case GF_MULT_COMPOSITE: if (gf_w128_composite_init(gf) == 0) return 0; break;
default: return 0;
}
- gf->extract_word.w128 = gf_w128_extract_word;
+ /* Ben: This used to test h->region_type == GF_REGION_ALTMAP, but that failed
+ because h->region_type can hold multiple flags. */
+ if (h->mult_type == GF_MULT_SPLIT_TABLE && (h->region_type & GF_REGION_ALTMAP)) {
+ gf->extract_word.w128 = gf_w128_split_extract_word;
+ } else if (h->mult_type == GF_MULT_COMPOSITE && h->region_type == GF_REGION_ALTMAP) {
+ gf->extract_word.w128 = gf_w128_composite_extract_word;
+ } else {
+ gf->extract_word.w128 = gf_w128_extract_word;
+ }
if (h->divide_type == GF_DIVIDE_EUCLID) {
gf->divide.w128 = gf_w128_divide_from_inverse;
- gf->inverse.w128 = gf_w128_euclid;
- } /* } else if (h->divide_type == GF_DIVIDE_MATRIX) {
- gf->divide.w128 = gf_w128_divide_from_inverse;
- gf->inverse.w128 = gf_w128_matrix;
- } */
+ }
if (gf->inverse.w128 != NULL && gf->divide.w128 == NULL) {
gf->divide.w128 = gf_w128_divide_from_inverse;
diff --git a/gf_w16.c b/gf_w16.c
index e8b48fd..6bc25a6 100644
--- a/gf_w16.c
+++ b/gf_w16.c
@@ -14,50 +14,47 @@
#define GF_BASE_FIELD_WIDTH (8)
#define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH)
-#define GF_S_GF_8_2 (63)
-struct gf_logtable_data {
+struct gf_w16_logtable_data {
uint16_t log_tbl[GF_FIELD_SIZE];
uint16_t antilog_tbl[GF_FIELD_SIZE * 2];
uint16_t inv_tbl[GF_FIELD_SIZE];
uint16_t *d_antilog;
};
-struct gf_zero_logtable_data {
- int log_tbl[GF_FIELD_SIZE];
+struct gf_w16_zero_logtable_data {
+ int log_tbl[GF_FIELD_SIZE];
uint16_t _antilog_tbl[GF_FIELD_SIZE * 4];
uint16_t *antilog_tbl;
uint16_t inv_tbl[GF_FIELD_SIZE];
};
-struct gf_lazytable_data {
- int log_tbl[GF_FIELD_SIZE];
+struct gf_w16_lazytable_data {
+ uint16_t log_tbl[GF_FIELD_SIZE];
uint16_t antilog_tbl[GF_FIELD_SIZE * 2];
uint16_t inv_tbl[GF_FIELD_SIZE];
uint16_t lazytable[GF_FIELD_SIZE];
};
-struct gf_w8_logtable_data {
- uint8_t log_tbl[GF_BASE_FIELD_SIZE];
- uint8_t antilog_tbl[GF_BASE_FIELD_SIZE * 2];
- uint8_t *antilog_tbl_div;
-};
-
-struct gf_w8_single_table_data {
- uint8_t mult[GF_BASE_FIELD_SIZE][GF_BASE_FIELD_SIZE];
-};
-
struct gf_w16_bytwo_data {
uint64_t prim_poly;
uint64_t mask1;
uint64_t mask2;
};
+struct gf_w16_split_8_8_data {
+ uint16_t tables[3][256][256];
+};
+
struct gf_w16_group_4_4_data {
uint16_t reduce[16];
uint16_t shift[16];
};
+struct gf_w16_composite_data {
+ uint8_t *mult_table;
+};
+
#define AB2(ip, am1 ,am2, b, t1, t2) {\
t1 = (b << 1) & am1;\
t2 = b & am2; \
@@ -72,6 +69,9 @@ struct gf_w16_group_4_4_data {
#define MM_PRINT(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 2) printf(" %02x %02x", blah[15-ii], blah[14-ii]); printf("\n"); }
+#define GF_FIRST_BIT (1 << 15)
+#define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : (p) << 1)
+
static
inline
gf_val_32_t gf_w16_inverse_from_divide (gf_t *gf, gf_val_32_t a)
@@ -121,6 +121,212 @@ gf_w16_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_32_t
}
static
+void
+gf_w16_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+ gf_region_data rd;
+ uint16_t *s16;
+ uint16_t *d16;
+
+#ifdef INTEL_SSE4_PCLMUL
+ __m128i a, b;
+ __m128i result;
+ __m128i prim_poly;
+ __m128i w;
+ gf_internal_t * h = gf->scratch;
+ prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL));
+
+ if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+ if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+ gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2);
+ gf_do_initial_region_alignment(&rd);
+
+ a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);
+
+ s16 = (uint16_t *) rd.s_start;
+ d16 = (uint16_t *) rd.d_start;
+
+ if (xor) {
+ while (d16 < ((uint16_t *) rd.d_top)) {
+
+ /* see gf_w16_clm_multiply() to see explanation of method */
+
+ b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0);
+ result = _mm_clmulepi64_si128 (a, b, 0);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+ result = _mm_xor_si128 (result, w);
+
+ *d16 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));
+ d16++;
+ s16++;
+ }
+ } else {
+ while (d16 < ((uint16_t *) rd.d_top)) {
+
+ /* see gf_w16_clm_multiply() to see explanation of method */
+
+ b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0);
+ result = _mm_clmulepi64_si128 (a, b, 0);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+ result = _mm_xor_si128 (result, w);
+
+ *d16 = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+ d16++;
+ s16++;
+ }
+ }
+ gf_do_final_region_alignment(&rd);
+#endif
+}
+
+static
+void
+gf_w16_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+ gf_region_data rd;
+ uint16_t *s16;
+ uint16_t *d16;
+
+#ifdef INTEL_SSE4_PCLMUL
+
+ __m128i a, b;
+ __m128i result;
+ __m128i prim_poly;
+ __m128i w;
+ gf_internal_t * h = gf->scratch;
+ prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL));
+
+ if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+ if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+ a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);
+
+ gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2);
+ gf_do_initial_region_alignment(&rd);
+
+ s16 = (uint16_t *) rd.s_start;
+ d16 = (uint16_t *) rd.d_start;
+
+ if (xor) {
+ while (d16 < ((uint16_t *) rd.d_top)) {
+
+ /* see gf_w16_clm_multiply() to see explanation of method */
+
+ b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0);
+ result = _mm_clmulepi64_si128 (a, b, 0);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+ result = _mm_xor_si128 (result, w);
+
+ *d16 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));
+ d16++;
+ s16++;
+ }
+ } else {
+ while (d16 < ((uint16_t *) rd.d_top)) {
+
+ /* see gf_w16_clm_multiply() to see explanation of method */
+
+ b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0);
+ result = _mm_clmulepi64_si128 (a, b, 0);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+ result = _mm_xor_si128 (result, w);
+
+ *d16 = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+ d16++;
+ s16++;
+ }
+ }
+ gf_do_final_region_alignment(&rd);
+#endif
+}
+
+static
+void
+gf_w16_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+ gf_region_data rd;
+ uint16_t *s16;
+ uint16_t *d16;
+
+#ifdef INTEL_SSE4_PCLMUL
+
+ __m128i a, b;
+ __m128i result;
+ __m128i prim_poly;
+ __m128i w;
+ gf_internal_t * h = gf->scratch;
+ prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL));
+
+ if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+ if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+ gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2);
+ gf_do_initial_region_alignment(&rd);
+
+ a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);
+
+ s16 = (uint16_t *) rd.s_start;
+ d16 = (uint16_t *) rd.d_start;
+
+ if (xor) {
+ while (d16 < ((uint16_t *) rd.d_top)) {
+
+ /* see gf_w16_clm_multiply() to see explanation of method */
+
+ b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0);
+ result = _mm_clmulepi64_si128 (a, b, 0);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+ result = _mm_xor_si128 (result, w);
+
+ *d16 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));
+ d16++;
+ s16++;
+ }
+ } else {
+ while (d16 < ((uint16_t *) rd.d_top)) {
+
+ /* see gf_w16_clm_multiply() to see explanation of method */
+
+ b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0);
+ result = _mm_clmulepi64_si128 (a, b, 0);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+ result = _mm_xor_si128 (result, w);
+
+ *d16 = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+ d16++;
+ s16++;
+ }
+ }
+ gf_do_final_region_alignment(&rd);
+#endif
+}
+
+static
inline
gf_val_32_t gf_w16_euclid (gf_t *gf, gf_val_32_t b)
{
@@ -146,6 +352,7 @@ gf_val_32_t gf_w16_euclid (gf_t *gf, gf_val_32_t b)
while (d_ip1 >= d_i) {
c_i ^= (1 << (d_ip1 - d_i));
e_ip1 ^= (e_i << (d_ip1 - d_i));
+ if (e_ip1 == 0) return 0;
while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--;
}
@@ -227,16 +434,146 @@ gf_val_32_t gf_w16_matrix (gf_t *gf, gf_val_32_t b)
/* JSP: GF_MULT_SHIFT: The world's dumbest multiplication algorithm. I only
include it for completeness. It does have the feature that it requires no
extra memory.
-*/
+ */
static
inline
gf_val_32_t
+gf_w16_clm_multiply_2 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
+{
+ gf_val_32_t rv = 0;
+
+#ifdef INTEL_SSE4_PCLMUL
+
+ __m128i a, b;
+ __m128i result;
+ __m128i prim_poly;
+ __m128i v, w;
+ gf_internal_t * h = gf->scratch;
+
+ a = _mm_insert_epi32 (_mm_setzero_si128(), a16, 0);
+ b = _mm_insert_epi32 (a, b16, 0);
+
+ prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL));
+
+ /* Do the initial multiply */
+
+ result = _mm_clmulepi64_si128 (a, b, 0);
+
+ /* Ben: Do the prim_poly reduction twice. We are guaranteed to need at most
+ two reductions, because (w-2)/z == 2, where z is the number of zeros
+ after the leading 1 of the primitive polynomial.
+
+ _mm_clmulepi64_si128 is the carryless multiply operation. Here
+ _mm_srli_si128 shifts the result to the right by 2 bytes, which lets us
+ multiply prim_poly by the leading bits of the result; we then xor that
+ product back into the result. */
+
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+ result = _mm_xor_si128 (result, w);
+
+ /* Extracts 32 bit value from result. */
+
+ rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+
+
+#endif
+ return rv;
+}
+
+static
+inline
+gf_val_32_t
+gf_w16_clm_multiply_3 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
+{
+ gf_val_32_t rv = 0;
+
+#ifdef INTEL_SSE4_PCLMUL
+
+ __m128i a, b;
+ __m128i result;
+ __m128i prim_poly;
+ __m128i v, w;
+ gf_internal_t * h = gf->scratch;
+
+ a = _mm_insert_epi32 (_mm_setzero_si128(), a16, 0);
+ b = _mm_insert_epi32 (a, b16, 0);
+
+ prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL));
+
+ /* Do the initial multiply */
+
+ result = _mm_clmulepi64_si128 (a, b, 0);
+
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+ result = _mm_xor_si128 (result, w);
+
+ /* Extracts 32 bit value from result. */
+
+ rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+
+
+#endif
+ return rv;
+}
+
+static
+inline
+gf_val_32_t
+gf_w16_clm_multiply_4 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
+{
+ gf_val_32_t rv = 0;
+
+#ifdef INTEL_SSE4_PCLMUL
+
+ __m128i a, b;
+ __m128i result;
+ __m128i prim_poly;
+ __m128i v, w;
+ gf_internal_t * h = gf->scratch;
+
+ a = _mm_insert_epi32 (_mm_setzero_si128(), a16, 0);
+ b = _mm_insert_epi32 (a, b16, 0);
+
+ prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL));
+
+ /* Do the initial multiply */
+
+ result = _mm_clmulepi64_si128 (a, b, 0);
+
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
+ result = _mm_xor_si128 (result, w);
+
+ /* Extracts 32 bit value from result. */
+
+ rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+
+
+#endif
+ return rv;
+}
+
+
+static
+inline
+ gf_val_32_t
gf_w16_shift_multiply (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
{
gf_val_32_t product, i, pp, a, b;
gf_internal_t *h;
-
+
a = a16;
b = b16;
h = (gf_internal_t *) gf->scratch;
@@ -247,7 +584,7 @@ gf_w16_shift_multiply (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
for (i = 0; i < GF_FIELD_WIDTH; i++) {
if (a & (1 << i)) product ^= (b << i);
}
- for (i = (GF_FIELD_WIDTH*2-1); i >= GF_FIELD_WIDTH; i--) {
+ for (i = (GF_FIELD_WIDTH*2-2); i >= GF_FIELD_WIDTH; i--) {
if (product & (1 << i)) product ^= (pp << (i-GF_FIELD_WIDTH));
}
return product;
@@ -257,11 +594,37 @@ static
int gf_w16_shift_init(gf_t *gf)
{
gf->multiply.w32 = gf_w16_shift_multiply;
- gf->inverse.w32 = gf_w16_euclid;
- gf->multiply_region.w32 = gf_w16_multiply_region_from_single;
return 1;
}
+static
+int gf_w16_cfm_init(gf_t *gf)
+{
+ gf_internal_t *h;
+
+ h = (gf_internal_t *) gf->scratch;
+
+ /* Ben: Determine how many reduction passes are needed. */
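+
+ /* Following the (w-2)/z rule from gf_w16_clm_multiply_2(): the fewer high
+ bits set in the low 16 bits of the polynomial, the larger z and the fewer
+ carryless folds needed -- 2, 3, or 4, depending on which mask below the
+ polynomial fits under. */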
+
+#ifdef INTEL_SSE4_PCLMUL
+ if ((0xfe00 & h->prim_poly) == 0) {
+ gf->multiply.w32 = gf_w16_clm_multiply_2;
+ gf->multiply_region.w32 = gf_w16_clm_multiply_region_from_single_2;
+ } else if((0xf000 & h->prim_poly) == 0) {
+ gf->multiply.w32 = gf_w16_clm_multiply_3;
+ gf->multiply_region.w32 = gf_w16_clm_multiply_region_from_single_3;
+ } else if ((0xe000 & h->prim_poly) == 0) {
+ gf->multiply.w32 = gf_w16_clm_multiply_4;
+ gf->multiply_region.w32 = gf_w16_clm_multiply_region_from_single_4;
+ } else {
+ return 0;
+ }
+ return 1;
+#endif
+
+ return 0;
+}
+
/* KMG: GF_MULT_LOGTABLE: */
static
@@ -270,7 +633,7 @@ gf_w16_log_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int
{
uint16_t *s16, *d16;
int lv;
- struct gf_logtable_data *ltd;
+ struct gf_w16_logtable_data *ltd;
gf_region_data rd;
if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
@@ -279,7 +642,7 @@ gf_w16_log_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int
gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2);
gf_do_initial_region_alignment(&rd);
- ltd = (struct gf_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
+ ltd = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
s16 = (uint16_t *) rd.s_start;
d16 = (uint16_t *) rd.d_start;
@@ -306,9 +669,9 @@ inline
gf_val_32_t
gf_w16_log_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
- struct gf_logtable_data *ltd;
+ struct gf_w16_logtable_data *ltd;
- ltd = (struct gf_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
+ ltd = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
return (a == 0 || b == 0) ? 0 : ltd->antilog_tbl[(int) ltd->log_tbl[a] + (int) ltd->log_tbl[b]];
}
@@ -318,10 +681,10 @@ gf_val_32_t
gf_w16_log_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
int log_sum = 0;
- struct gf_logtable_data *ltd;
+ struct gf_w16_logtable_data *ltd;
if (a == 0 || b == 0) return 0;
- ltd = (struct gf_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
+ ltd = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
log_sum = (int) ltd->log_tbl[a] - (int) ltd->log_tbl[b];
return (ltd->d_antilog[log_sum]);
@@ -331,9 +694,9 @@ static
gf_val_32_t
gf_w16_log_inverse(gf_t *gf, gf_val_32_t a)
{
- struct gf_logtable_data *ltd;
+ struct gf_w16_logtable_data *ltd;
- ltd = (struct gf_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
+ ltd = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
return (ltd->inv_tbl[a]);
}
@@ -341,17 +704,20 @@ static
int gf_w16_log_init(gf_t *gf)
{
gf_internal_t *h;
- struct gf_logtable_data *ltd;
+ struct gf_w16_logtable_data *ltd;
int i, b;
+ int check = 0;
h = (gf_internal_t *) gf->scratch;
ltd = h->private;
-
- ltd->log_tbl[0] = 0;
+
+ for (i = 0; i < GF_MULT_GROUP_SIZE+1; i++)
+ ltd->log_tbl[i] = 0;
ltd->d_antilog = ltd->antilog_tbl + GF_MULT_GROUP_SIZE;
b = 1;
for (i = 0; i < GF_MULT_GROUP_SIZE; i++) {
+ if (ltd->log_tbl[b] != 0) check = 1;
ltd->log_tbl[b] = i;
ltd->antilog_tbl[i] = b;
ltd->antilog_tbl[i+GF_MULT_GROUP_SIZE] = b;
@@ -360,6 +726,24 @@ int gf_w16_log_init(gf_t *gf)
b = b ^ h->prim_poly;
}
}
+
+ /* If you can't construct the log table, there's a problem. This code is used for
+ some other implementations (e.g. in SPLIT), so if the log table doesn't work in
+ that instance, use CARRY_FREE / SHIFT instead. */
+
+ if (check) {
+ if (h->mult_type != GF_MULT_LOG_TABLE) {
+
+#ifdef INTEL_SSE4_PCLMUL
+ return gf_w16_cfm_init(gf);
+#endif
+ return gf_w16_shift_init(gf);
+ } else {
+ _gf_errno = GF_E_LOGPOLY;
+ return 0;
+ }
+ }
+
ltd->inv_tbl[0] = 0; /* Not really, but we need to fill it with something */
ltd->inv_tbl[1] = 1;
for (i = 2; i < GF_FIELD_SIZE; i++) {
@@ -377,8 +761,76 @@ int gf_w16_log_init(gf_t *gf)
/* JSP: GF_MULT_SPLIT_TABLE: Using 8 multiplication tables to leverage SSE instructions.
*/
-static
+
+/* Ben: Does alternate mapping multiplication using a split table in the
+ lazy method without sse instructions*/
+
+static
void
+gf_w16_split_4_16_lazy_nosse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+ uint64_t i, j, a, b, c, prod;
+ uint8_t *s8, *d8, *top;
+ gf_internal_t *h;
+ uint16_t table[4][16];
+ gf_region_data rd;
+
+ if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+ if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+ gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
+ gf_do_initial_region_alignment(&rd);
+
+ h = (gf_internal_t *) gf->scratch;
+
+ /*Ben: Constructs lazy multiplication table*/
+
+ for (j = 0; j < 16; j++) {
+ for (i = 0; i < 4; i++) {
+ c = (j << (i*4));
+ table[i][j] = gf->multiply.w32(gf, c, val);
+ }
+ }
+
+ /*Ben: s8 is the start of source, d8 is the start of dest, top is end of dest region. */
+
+ s8 = (uint8_t *) rd.s_start;
+ d8 = (uint8_t *) rd.d_start;
+ top = (uint8_t *) rd.d_top;
+
+
+ while (d8 < top) {
+
+ /*Ben: Multiplies across 16 two-byte quantities using the alternate mapping:
+ high bytes are on the left, low bytes are on the right. */
+
+ for (j=0;j<16;j++) {
+
+ /*Ben: If the xor flag is set, the product should include what is in dest */
+ prod = (xor) ? ((uint16_t)(*d8)<<8) ^ *(d8+16) : 0;
+
+ /*Ben: xors all 4 table lookups into the product variable*/
+
+ prod ^= ((table[0][*(s8+16)&0xf]) ^
+ (table[1][(*(s8+16)&0xf0)>>4]) ^
+ (table[2][*(s8)&0xf]) ^
+ (table[3][(*(s8)&0xf0)>>4]));
+
+ /*Ben: Stores product in the destination and moves on*/
+
+ *d8 = (uint8_t)(prod >> 8);
+ *(d8+16) = (uint8_t)(prod & 0x00ff);
+ s8++;
+ d8++;
+ }
+ s8+=16;
+ d8+=16;
+ }
+ gf_do_final_region_alignment(&rd);
+}
+
+static
+ void
gf_w16_split_4_16_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
uint64_t i, j, a, c, prod;
@@ -391,14 +843,14 @@ gf_w16_split_4_16_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_3
if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2);
- gf_do_initial_region_alignment(&rd);
+ gf_do_initial_region_alignment(&rd);
h = (gf_internal_t *) gf->scratch;
for (j = 0; j < 16; j++) {
for (i = 0; i < 4; i++) {
c = (j << (i*4));
- table[i][j] = gf_w16_log_multiply(gf, c, val);
+ table[i][j] = gf->multiply.w32(gf, c, val);
}
}
@@ -423,7 +875,7 @@ static
void
gf_w16_split_8_16_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
- uint64_t j, a, c, prod, *s64, *d64, *top64;
+ uint64_t j, k, v, a, c, prod, *s64, *d64, *top64;
gf_internal_t *h;
uint64_t htable[256], ltable[256];
gf_region_data rd;
@@ -436,9 +888,16 @@ gf_w16_split_8_16_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_3
h = (gf_internal_t *) gf->scratch;
- for (j = 0; j < 256; j++) {
- ltable[j] = gf_w16_log_multiply(gf, j, val);
- htable[j] = gf_w16_log_multiply(gf, (j<<8), val);
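+ /* Build both 256-entry tables by doubling: after its loop, ltable[m] equals
+ val*m for m < 256, and the continued doubling of v makes htable[m] equal
+ val*(m << 8). */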
+ v = val;
+ ltable[0] = 0;
+ for (j = 1; j < 256; j <<= 1) {
+ for (k = 0; k < j; k++) ltable[k^j] = (v ^ ltable[k]);
+ v = GF_MULTBY_TWO(v);
+ }
+ htable[0] = 0;
+ for (j = 1; j < 256; j <<= 1) {
+ for (k = 0; k < j; k++) htable[k^j] = (v ^ htable[k]);
+ v = GF_MULTBY_TWO(v);
}
s64 = (uint64_t *) rd.s_start;
@@ -472,8 +931,8 @@ gf_w16_split_8_16_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_3
prod ^= ltable[a >> 56];
prod ^= ((xor) ? *d64 : 0);
*d64 = prod;
- *s64++;
- *d64++;
+ s64++;
+ d64++;
}
*/
@@ -489,10 +948,12 @@ gf_w16_split_8_16_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_3
a <<= 8;
}
+  // JSP: We could move the conditional outside the while loop, but we need to test it fully to understand which is better.
+
prod ^= ((xor) ? *d64 : 0);
*d64 = prod;
- *s64++;
- *d64++;
+ s64++;
+ d64++;
}
gf_do_final_region_alignment(&rd);
}
@@ -502,7 +963,7 @@ gf_w16_table_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v
{
uint64_t j, a, c, pp;
gf_internal_t *h;
- struct gf_lazytable_data *ltd;
+ struct gf_w16_lazytable_data *ltd;
gf_region_data rd;
if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
@@ -512,7 +973,7 @@ gf_w16_table_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v
gf_do_initial_region_alignment(&rd);
h = (gf_internal_t *) gf->scratch;
- ltd = (struct gf_lazytable_data *) h->private;
+ ltd = (struct gf_w16_lazytable_data *) h->private;
ltd->lazytable[0] = 0;
@@ -530,9 +991,8 @@ gf_w16_table_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v
} while (c != 1);
*/
- a = ltd->log_tbl[val];
for (c = 1; c < GF_FIELD_SIZE; c++) {
- ltd->lazytable[c] = ltd->antilog_tbl[ltd->log_tbl[c]+a];
+ ltd->lazytable[c] = gf_w16_shift_multiply(gf, c, val);
}
gf_two_byte_region_table_multiply(&rd, ltd->lazytable);
@@ -543,7 +1003,7 @@ static
void
gf_w16_split_4_16_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
-#ifdef INTEL_SSE4
+#ifdef INTEL_SSSE3
  uint64_t i, j, *s64, *d64, *top64;
uint64_t a, c, prod;
uint8_t low[4][16];
@@ -561,7 +1021,7 @@ gf_w16_split_4_16_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_v
for (j = 0; j < 16; j++) {
for (i = 0; i < 4; i++) {
c = (j << (i*4));
- prod = gf_w16_log_multiply(gf, c, val);
+ prod = gf->multiply.w32(gf, c, val);
low[i][j] = (prod & 0xff);
high[i][j] = (prod >> 8);
}
@@ -676,7 +1136,7 @@ static
void
gf_w16_split_4_16_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
-#ifdef INTEL_SSE4
+#ifdef INTEL_SSSE3
  uint64_t i, j, *s64, *d64, *top64;
uint64_t c, prod;
uint8_t low[4][16];
@@ -694,7 +1154,7 @@ gf_w16_split_4_16_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *des
for (j = 0; j < 16; j++) {
for (i = 0; i < 4; i++) {
c = (j << (i*4));
- prod = gf_w16_log_multiply(gf, c, val);
+ prod = gf->multiply.w32(gf, c, val);
low[i][j] = (prod & 0xff);
high[i][j] = (prod >> 8);
}
@@ -782,32 +1242,111 @@ gf_w16_split_4_16_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *des
#endif
}
+uint32_t
+gf_w16_split_8_8_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+ uint32_t alow, blow;
+ struct gf_w16_split_8_8_data *d8;
+ gf_internal_t *h;
+
+ h = (gf_internal_t *) gf->scratch;
+ d8 = (struct gf_w16_split_8_8_data *) h->private;
+
+ alow = a & 0xff;
+ blow = b & 0xff;
+ a >>= 8;
+ b >>= 8;
+
+ return d8->tables[0][alow][blow] ^
+ d8->tables[1][alow][b] ^
+ d8->tables[1][a][blow] ^
+ d8->tables[2][a][b];
+}
+
static
int gf_w16_split_init(gf_t *gf)
{
gf_internal_t *h;
- gf_w16_log_init(gf);
+ struct gf_w16_split_8_8_data *d8;
+ int i, j, exp, issse3;
+ uint32_t p, basep;
h = (gf_internal_t *) gf->scratch;
- if (h->mult_type == GF_MULT_DEFAULT) {
+  issse3 = 0;
+#ifdef INTEL_SSSE3
+ issse3 = 1;
+#endif
+
+ if (h->arg1 == 8 && h->arg2 == 8) {
+ d8 = (struct gf_w16_split_8_8_data *) h->private;
+ basep = 1;
+ for (exp = 0; exp < 3; exp++) {
+ for (j = 0; j < 256; j++) d8->tables[exp][0][j] = 0;
+ for (i = 0; i < 256; i++) d8->tables[exp][i][0] = 0;
+ d8->tables[exp][1][1] = basep;
+ for (i = 2; i < 256; i++) {
+ if (i&1) {
+ p = d8->tables[exp][i^1][1];
+ d8->tables[exp][i][1] = p ^ basep;
+ } else {
+ p = d8->tables[exp][i>>1][1];
+ d8->tables[exp][i][1] = GF_MULTBY_TWO(p);
+ }
+ }
+ for (i = 1; i < 256; i++) {
+ p = d8->tables[exp][i][1];
+ for (j = 1; j < 256; j++) {
+ if (j&1) {
+ d8->tables[exp][i][j] = d8->tables[exp][i][j^1] ^ p;
+ } else {
+ d8->tables[exp][i][j] = GF_MULTBY_TWO(d8->tables[exp][i][j>>1]);
+ }
+ }
+ }
+ for (i = 0; i < 8; i++) basep = GF_MULTBY_TWO(basep);
+ }
+ gf->multiply.w32 = gf_w16_split_8_8_multiply;
gf->multiply_region.w32 = gf_w16_split_8_16_lazy_multiply_region;
-#ifdef INTEL_SSE4
+ return 1;
+
+ }
+
+  /* We'll be using LOG for multiplication, unless the polynomial isn't
+     actually primitive.  In that case, we'll be using SHIFT. */
+
+ gf_w16_log_init(gf);
+
+ /* Defaults */
+
+ if (issse3) {
gf->multiply_region.w32 = gf_w16_split_4_16_lazy_sse_multiply_region;
-#endif
- } else if ((h->arg1 == 8 && h->arg2 == 16) || (h->arg2 == 8 && h->arg1 == 16)) {
+ } else {
gf->multiply_region.w32 = gf_w16_split_8_16_lazy_multiply_region;
+ }
+
+ if ((h->arg1 == 8 && h->arg2 == 16) || (h->arg2 == 8 && h->arg1 == 16)) {
+ gf->multiply_region.w32 = gf_w16_split_8_16_lazy_multiply_region;
+
} else if ((h->arg1 == 4 && h->arg2 == 16) || (h->arg2 == 4 && h->arg1 == 16)) {
- if (h->region_type & GF_REGION_SSE) {
- if (h->region_type & GF_REGION_ALTMAP) {
+ if (issse3) {
+ if(h->region_type & GF_REGION_ALTMAP && h->region_type & GF_REGION_NOSSE)
+ gf->multiply_region.w32 = gf_w16_split_4_16_lazy_nosse_altmap_multiply_region;
+ else if(h->region_type & GF_REGION_NOSSE)
+ gf->multiply_region.w32 = gf_w16_split_4_16_lazy_multiply_region;
+ else if(h->region_type & GF_REGION_ALTMAP)
gf->multiply_region.w32 = gf_w16_split_4_16_lazy_sse_altmap_multiply_region;
- } else {
- gf->multiply_region.w32 = gf_w16_split_4_16_lazy_sse_multiply_region;
- }
} else {
- gf->multiply_region.w32 = gf_w16_split_4_16_lazy_multiply_region;
+ if(h->region_type & GF_REGION_SSE)
+ return 0;
+ else if(h->region_type & GF_REGION_ALTMAP)
+ gf->multiply_region.w32 = gf_w16_split_4_16_lazy_nosse_altmap_multiply_region;
+ else
+ gf->multiply_region.w32 = gf_w16_split_4_16_lazy_multiply_region;
}
}
+
return 1;
}
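With the init path above, the new SPLIT 8 8 variant can be requested directly.
A hedged usage sketch -- argument order as declared in gf_complete.h, with
zeros taking the defaults:

  #include "gf_complete.h"

  uint32_t split_8_8_demo(void)
  {
    gf_t gf;

    if (!gf_init_hard(&gf, 16, GF_MULT_SPLIT_TABLE, GF_REGION_DEFAULT,
                      GF_DIVIDE_DEFAULT, 0, 8, 8, NULL, NULL)) return 0;
    return gf.multiply.w32(&gf, 0x1234, 0x5678);  /* served by gf_w16_split_8_8_multiply */
  }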
@@ -818,7 +1357,7 @@ int gf_w16_table_init(gf_t *gf)
gf_w16_log_init(gf);
h = (gf_internal_t *) gf->scratch;
- gf->multiply_region.w32 = NULL;
+
gf->multiply_region.w32 = gf_w16_table_lazy_multiply_region;
return 1;
}
@@ -830,7 +1369,7 @@ gf_w16_log_zero_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val
uint16_t lv;
int i;
uint16_t *s16, *d16, *top16;
- struct gf_zero_logtable_data *ltd;
+ struct gf_w16_zero_logtable_data *ltd;
gf_region_data rd;
if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
@@ -839,7 +1378,7 @@ gf_w16_log_zero_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val
gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2);
gf_do_initial_region_alignment(&rd);
- ltd = (struct gf_zero_logtable_data*) ((gf_internal_t *) gf->scratch)->private;
+ ltd = (struct gf_w16_zero_logtable_data*) ((gf_internal_t *) gf->scratch)->private;
s16 = (uint16_t *) rd.s_start;
d16 = (uint16_t *) rd.d_start;
top16 = (uint16_t *) rd.d_top;
@@ -858,18 +1397,20 @@ gf_w16_log_zero_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val
}
/* This isn't necessary. */
+
gf_do_final_region_alignment(&rd);
}
/* Here -- double-check with Kevin */
+
static
inline
gf_val_32_t
gf_w16_log_zero_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
- struct gf_zero_logtable_data *ltd;
+ struct gf_w16_zero_logtable_data *ltd;
- ltd = (struct gf_zero_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
+ ltd = (struct gf_w16_zero_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
return ltd->antilog_tbl[ltd->log_tbl[a] + ltd->log_tbl[b]];
}
@@ -879,10 +1420,10 @@ gf_val_32_t
gf_w16_log_zero_divide (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
int log_sum = 0;
- struct gf_zero_logtable_data *ltd;
+ struct gf_w16_zero_logtable_data *ltd;
if (a == 0 || b == 0) return 0;
- ltd = (struct gf_zero_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
+ ltd = (struct gf_w16_zero_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
log_sum = ltd->log_tbl[a] - ltd->log_tbl[b] + (GF_MULT_GROUP_SIZE);
return (ltd->antilog_tbl[log_sum]);
@@ -892,9 +1433,9 @@ static
gf_val_32_t
gf_w16_log_zero_inverse (gf_t *gf, gf_val_32_t a)
{
- struct gf_zero_logtable_data *ltd;
+ struct gf_w16_zero_logtable_data *ltd;
- ltd = (struct gf_zero_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
+ ltd = (struct gf_w16_zero_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
return (ltd->inv_tbl[a]);
}
@@ -1015,7 +1556,7 @@ static
void
gf_w16_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
-#ifdef INTEL_SSE4
+#ifdef INTEL_SSE2
int i;
uint8_t *s8, *d8;
uint32_t vrev;
@@ -1079,7 +1620,7 @@ static
void
gf_w16_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w16_bytwo_data *btd)
{
-#ifdef INTEL_SSE4
+#ifdef INTEL_SSE2
int i;
uint8_t *d8, *s8, tb;
__m128i pp, m1, m2, t1, t2, va, vb;
@@ -1105,7 +1646,7 @@ static
void
gf_w16_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w16_bytwo_data *btd)
{
-#ifdef INTEL_SSE4
+#ifdef INTEL_SSE2
int i;
uint8_t *d8, *s8, tb;
__m128i pp, m1, m2, t1, t2, va, vb;
@@ -1134,7 +1675,7 @@ static
void
gf_w16_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
-#ifdef INTEL_SSE4
+#ifdef INTEL_SSE2
int itb;
uint8_t *d8, *s8;
__m128i pp, m1, m2, t1, t2, va, vb;
@@ -1352,20 +1893,30 @@ int gf_w16_bytwo_init(gf_t *gf)
if (h->mult_type == GF_MULT_BYTWO_p) {
gf->multiply.w32 = gf_w16_bytwo_p_multiply;
- if (h->region_type == GF_REGION_SSE) {
- gf->multiply_region.w32 = gf_w16_bytwo_p_sse_multiply_region;
- } else {
+ #ifdef INTEL_SSE2
+ if (h->region_type & GF_REGION_NOSSE)
+ gf->multiply_region.w32 = gf_w16_bytwo_p_nosse_multiply_region;
+ else
+ gf->multiply_region.w32 = gf_w16_bytwo_p_sse_multiply_region;
+ #else
gf->multiply_region.w32 = gf_w16_bytwo_p_nosse_multiply_region;
- }
+ if(h->region_type & GF_REGION_SSE)
+ return 0;
+ #endif
} else {
gf->multiply.w32 = gf_w16_bytwo_b_multiply;
- if (h->region_type == GF_REGION_SSE) {
- gf->multiply_region.w32 = gf_w16_bytwo_b_sse_multiply_region;
- } else {
+ #ifdef INTEL_SSE2
+ if (h->region_type & GF_REGION_NOSSE)
+ gf->multiply_region.w32 = gf_w16_bytwo_b_nosse_multiply_region;
+ else
+ gf->multiply_region.w32 = gf_w16_bytwo_b_sse_multiply_region;
+ #else
gf->multiply_region.w32 = gf_w16_bytwo_b_nosse_multiply_region;
- }
+ if(h->region_type & GF_REGION_SSE)
+ return 0;
+ #endif
}
- gf->inverse.w32 = gf_w16_euclid;
+
return 1;
}
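The restructuring above also pins down the region-flag contract for BYTWO: on
an SSE2 build, GF_REGION_NOSSE still selects the portable routine, while on a
build without INTEL_SSE2, asking for GF_REGION_SSE now fails init instead of
silently falling back. A sketch of the caller's view, under the same
gf_init_hard() signature assumed above:

  #include "gf_complete.h"

  int bytwo_region_demo(void)
  {
    gf_t gf;

    /* Portable region code, even on an SSE2-capable build. */
    if (!gf_init_hard(&gf, 16, GF_MULT_BYTWO_b, GF_REGION_NOSSE,
                      GF_DIVIDE_DEFAULT, 0, 0, 0, NULL, NULL)) return 0;

    /* Returns 0 when the library was compiled without INTEL_SSE2. */
    return gf_init_hard(&gf, 16, GF_MULT_BYTWO_b, GF_REGION_SSE,
                        GF_DIVIDE_DEFAULT, 0, 0, 0, NULL, NULL);
  }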
@@ -1373,7 +1924,7 @@ static
int gf_w16_log_zero_init(gf_t *gf)
{
gf_internal_t *h;
- struct gf_zero_logtable_data *ltd;
+ struct gf_w16_zero_logtable_data *ltd;
int i, b;
h = (gf_internal_t *) gf->scratch;
@@ -1423,30 +1974,30 @@ gf_w16_composite_multiply_recursive(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
a1b1 = base_gf->multiply.w32(base_gf, a1, b1);
- rv = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, GF_S_GF_8_2)) << 8));
+ rv = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 8));
return rv;
}
static
gf_val_32_t
-gf_w16_composite_multiply_table(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+gf_w16_composite_multiply_inline(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
gf_internal_t *h = (gf_internal_t *) gf->scratch;
- struct gf_w8_single_table_data * std;
-
+ gf_t *base_gf = h->base_gf;
uint8_t b0 = b & 0x00ff;
uint8_t b1 = (b & 0xff00) >> 8;
uint8_t a0 = a & 0x00ff;
uint8_t a1 = (a & 0xff00) >> 8;
- uint8_t a1b1;
+ uint8_t a1b1, *mt;
uint16_t rv;
+ struct gf_w16_composite_data *cd;
- std = (struct gf_w8_single_table_data *) h->private;
+ cd = (struct gf_w16_composite_data *) h->private;
+ mt = cd->mult_table;
- a1b1 = std->mult[a1][b1];
+ a1b1 = GF_W8_INLINE_MULTDIV(mt, a1, b1);
- rv = ((std->mult[a0][b0] ^ a1b1) |
- ((std->mult[a1][b0] ^ std->mult[a0][b1] ^ std->mult[a1b1][GF_S_GF_8_2]) << 8));
+ rv = ((GF_W8_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) | ((GF_W8_INLINE_MULTDIV(mt, a1, b0) ^ GF_W8_INLINE_MULTDIV(mt, a0, b1) ^ GF_W8_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 8));
return rv;
}
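Taken together, the recursive and inline forms implement GF((2^8)^2)
arithmetic: reading a = a1*x ^ a0 and b = b1*x ^ b0 as degree-1 polynomials
over GF(2^8) and reducing modulo x^2 + s*x + 1, with s = h->prim_poly,

  a*b = a1b1*x^2 ^ (a1b0 ^ a0b1)*x ^ a0b0
      = (a1b0 ^ a0b1 ^ s*a1b1)*x ^ (a0b0 ^ a1b1),  since x^2 = s*x + 1,

which is exactly the (high << 8) | low packing both routines compute. (That
the modulus has the form x^2 + s*x + 1 is inferred from the code; the inverse
routine below is consistent with it.)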
@@ -1472,6 +2023,7 @@ gf_w16_composite_multiply_table(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
*
* a / b = a * c
*/
+
static
gf_val_32_t
gf_w16_composite_inverse(gf_t *gf, gf_val_32_t a)
@@ -1486,7 +2038,7 @@ gf_w16_composite_inverse(gf_t *gf, gf_val_32_t a)
if (a0 == 0) {
a1inv = base_gf->inverse.w32(base_gf, a1);
- c0 = base_gf->multiply.w32(base_gf, a1inv, GF_S_GF_8_2);
+ c0 = base_gf->multiply.w32(base_gf, a1inv, h->prim_poly);
c1 = a1inv;
} else if (a1 == 0) {
c0 = base_gf->inverse.w32(base_gf, a0);
@@ -1497,7 +2049,7 @@ gf_w16_composite_inverse(gf_t *gf, gf_val_32_t a)
d = base_gf->multiply.w32(base_gf, a1, a0inv);
- tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ GF_S_GF_8_2);
+ tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ h->prim_poly);
tmp = base_gf->inverse.w32(base_gf, tmp);
d = base_gf->multiply.w32(base_gf, d, tmp);
@@ -1512,62 +2064,6 @@ gf_w16_composite_inverse(gf_t *gf, gf_val_32_t a)
}
static
-gf_val_32_t
-gf_w16_composite_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
-{
- uint16_t binv;
-
- binv = gf->inverse.w32(gf, b);
- return gf->multiply.w32(gf, a, binv);
-}
-
-static
-void
-gf_w16_composite_multiply_region_inline(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
-{
- gf_internal_t *h = (gf_internal_t *) gf->scratch;
- gf_t *base_gf = h->base_gf;
- struct gf_w8_single_table_data * std;
- uint8_t b0 = val & 0x00ff;
- uint8_t b1 = (val & 0xff00) >> 8;
- uint16_t *s16, *d16, *top;
- uint8_t a0, a1, a1b1;
- struct gf_logtable_data *ltd;
- gf_region_data rd;
-
- if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
-
- std = (struct gf_w8_single_table_data *) h->private;
- gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2);
-
- s16 = rd.s_start;
- d16 = rd.d_start;
- top = rd.d_top;
-
- if (xor) {
- while (d16 < top) {
- a0 = (*s16) & 0x00ff;
- a1 = ((*s16) & 0xff00) >> 8;
- a1b1 = std->mult[a1][b1];
-
- *d16 ^= ((std->mult[a0][b0] ^ a1b1) | ((std->mult[a1][b0] ^ std->mult[a0][b1] ^ std->mult[a1b1][GF_S_GF_8_2]) << 8));
- s16++;
- d16++;
- }
- } else {
- while (d16 < top) {
- a0 = (*s16) & 0x00ff;
- a1 = ((*s16) & 0xff00) >> 8;
- a1b1 = std->mult[a1][b1];
-
- *d16 = ((std->mult[a0][b0] ^ a1b1) | ((std->mult[a1][b0] ^ std->mult[a0][b1] ^ std->mult[a1b1][GF_S_GF_8_2]) << 8));
- s16++;
- d16++;
- }
- }
-}
-
-static
void
gf_w16_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
@@ -1577,9 +2073,13 @@ gf_w16_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t va
uint8_t b0 = val & 0x00ff;
uint8_t b1 = (val & 0xff00) >> 8;
uint16_t *s16, *d16, *top;
- uint8_t a0, a1, a1b1;
+ uint8_t a0, a1, a1b1, *mt;
gf_region_data rd;
- struct gf_logtable_data *ltd;
+ struct gf_w16_logtable_data *ltd;
+ struct gf_w16_composite_data *cd;
+
+ cd = (struct gf_w16_composite_data *) h->private;
+ mt = cd->mult_table;
if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2);
@@ -1588,27 +2088,61 @@ gf_w16_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t va
d16 = rd.d_start;
top = rd.d_top;
- if (xor) {
- while (d16 < top) {
- a0 = (*s16) & 0x00ff;
- a1 = ((*s16) & 0xff00) >> 8;
- a1b1 = base_gf->multiply.w32(base_gf, a1, b1);
-
- (*d16) ^= ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
- ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, GF_S_GF_8_2)) << 8));
- s16++;
- d16++;
+ if (mt == NULL) {
+ if (xor) {
+ while (d16 < top) {
+ a0 = (*s16) & 0x00ff;
+ a1 = ((*s16) & 0xff00) >> 8;
+ a1b1 = base_gf->multiply.w32(base_gf, a1, b1);
+
+ (*d16) ^= ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
+ ((base_gf->multiply.w32(base_gf, a1, b0) ^
+ base_gf->multiply.w32(base_gf, a0, b1) ^
+ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 8));
+ s16++;
+ d16++;
+ }
+ } else {
+ while (d16 < top) {
+ a0 = (*s16) & 0x00ff;
+ a1 = ((*s16) & 0xff00) >> 8;
+ a1b1 = base_gf->multiply.w32(base_gf, a1, b1);
+
+ (*d16) = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
+ ((base_gf->multiply.w32(base_gf, a1, b0) ^
+ base_gf->multiply.w32(base_gf, a0, b1) ^
+ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 8));
+ s16++;
+ d16++;
+ }
}
} else {
- while (d16 < top) {
- a0 = (*s16) & 0x00ff;
- a1 = ((*s16) & 0xff00) >> 8;
- a1b1 = base_gf->multiply.w32(base_gf, a1, b1);
-
- (*d16) = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
- ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, GF_S_GF_8_2)) << 8));
- s16++;
- d16++;
+ if (xor) {
+ while (d16 < top) {
+ a0 = (*s16) & 0x00ff;
+ a1 = ((*s16) & 0xff00) >> 8;
+ a1b1 = GF_W8_INLINE_MULTDIV(mt, a1, b1);
+
+ (*d16) ^= ((GF_W8_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) |
+ ((GF_W8_INLINE_MULTDIV(mt, a1, b0) ^
+ GF_W8_INLINE_MULTDIV(mt, a0, b1) ^
+ GF_W8_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 8));
+ s16++;
+ d16++;
+ }
+ } else {
+ while (d16 < top) {
+ a0 = (*s16) & 0x00ff;
+ a1 = ((*s16) & 0xff00) >> 8;
+ a1b1 = GF_W8_INLINE_MULTDIV(mt, a1, b1);
+
+ (*d16) = ((GF_W8_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) |
+ ((GF_W8_INLINE_MULTDIV(mt, a1, b0) ^
+ GF_W8_INLINE_MULTDIV(mt, a0, b1) ^
+ GF_W8_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 8));
+ s16++;
+ d16++;
+ }
}
}
}
@@ -1645,7 +2179,7 @@ gf_w16_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_32_
base_gf->multiply_region.w32(base_gf, shigh, dlow, val1, sub_reg_size, 1);
base_gf->multiply_region.w32(base_gf, slow, dhigh, val1, sub_reg_size, xor);
base_gf->multiply_region.w32(base_gf, shigh, dhigh, val0, sub_reg_size, 1);
- base_gf->multiply_region.w32(base_gf, shigh, dhigh, base_gf->multiply.w32(base_gf, GF_S_GF_8_2, val1), sub_reg_size, 1);
+ base_gf->multiply_region.w32(base_gf, shigh, dhigh, base_gf->multiply.w32(base_gf, h->prim_poly, val1), sub_reg_size, 1);
gf_do_final_region_alignment(&rd);
}
@@ -1653,34 +2187,26 @@ gf_w16_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_32_
static
int gf_w16_composite_init(gf_t *gf)
{
- struct gf_w8_single_table_data * std;
gf_internal_t *h = (gf_internal_t *) gf->scratch;
- gf_t *base_gf = h->base_gf;
- gf_internal_t *base_h = (gf_internal_t *) base_gf->scratch;
- uint16_t a, b;
+ struct gf_w16_composite_data *cd;
+
+ if (h->base_gf == NULL) return 0;
+
+ cd = (struct gf_w16_composite_data *) h->private;
+ cd->mult_table = gf_w8_get_mult_table(h->base_gf);
if (h->region_type & GF_REGION_ALTMAP) {
gf->multiply_region.w32 = gf_w16_composite_multiply_region_alt;
- } else if (h->arg2 == 0 && base_h->mult_type == GF_MULT_TABLE &&
- base_h->region_type == GF_REGION_DEFAULT) {
- gf->multiply_region.w32 = gf_w16_composite_multiply_region_inline;
} else {
gf->multiply_region.w32 = gf_w16_composite_multiply_region;
}
-
- if (h->arg2 == 0) {
- std = (struct gf_w8_single_table_data *) h->private;
- for (a = 0; a < 256; a++) {
- for (b = 0; b < 256; b++) {
- std->mult[a][b] = base_gf->multiply.w32(base_gf, a, b);
- }
- }
- gf->multiply.w32 = gf_w16_composite_multiply_table;
- } else {
+
+ if (cd->mult_table == NULL) {
gf->multiply.w32 = gf_w16_composite_multiply_recursive;
+ } else {
+ gf->multiply.w32 = gf_w16_composite_multiply_inline;
}
-
- gf->divide.w32 = gf_w16_composite_divide;
+ gf->divide.w32 = NULL;
gf->inverse.w32 = gf_w16_composite_inverse;
return 1;
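Composite init now takes its inner multiplier from the base field at run time
rather than baking in GF_S_GF_8_2. A hedged construction sketch -- arg1 = 2
for the degree-2 extension, prim_poly 0 so gf_composite_get_default_poly()
chooses s, and gf_init_easy() assumed to carry its usual (gf, w) signature:

  #include "gf_complete.h"

  int composite_16_demo(void)
  {
    gf_t base, gf;

    if (!gf_init_easy(&base, 8)) return 0;       /* GF(2^8) base field */
    return gf_init_hard(&gf, 16, GF_MULT_COMPOSITE, GF_REGION_DEFAULT,
                        GF_DIVIDE_DEFAULT, 0, 2, 0, &base, NULL);
  }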
@@ -1815,79 +2341,50 @@ int gf_w16_group_init(gf_t *gf)
int gf_w16_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
{
- int ss;
- int sa;
-
- ss = (GF_REGION_SSE | GF_REGION_NOSSE);
- sa = (GF_REGION_STDMAP | GF_REGION_ALTMAP);
-
switch(mult_type)
{
case GF_MULT_TABLE:
- region_type |= GF_REGION_LAZY;
- if (arg1 != 0 || arg2 != 0 || region_type != GF_REGION_LAZY) return -1;
- return sizeof(gf_internal_t) + sizeof(struct gf_lazytable_data) + 64;
+ return sizeof(gf_internal_t) + sizeof(struct gf_w16_lazytable_data) + 64;
break;
case GF_MULT_BYTWO_p:
case GF_MULT_BYTWO_b:
- if (arg1 != 0 || arg2 != 0 || (region_type | ss) != ss ||
- (region_type & ss) == ss) return -1;
return sizeof(gf_internal_t) + sizeof(struct gf_w16_bytwo_data);
break;
- case GF_MULT_DEFAULT:
+ case GF_MULT_LOG_ZERO:
+ return sizeof(gf_internal_t) + sizeof(struct gf_w16_zero_logtable_data) + 64;
+ break;
case GF_MULT_LOG_TABLE:
- if (arg2 != 0) return -1;
- if (region_type != GF_REGION_DEFAULT) return -1;
- if (arg1 == 1) {
- return sizeof(gf_internal_t) + sizeof(struct gf_zero_logtable_data) + 64;
- } else if (arg1 == 0) {
- return sizeof(gf_internal_t) + sizeof(struct gf_logtable_data) + 64;
- } else {
- return -1;
- }
+ return sizeof(gf_internal_t) + sizeof(struct gf_w16_logtable_data) + 64;
break;
+ case GF_MULT_DEFAULT:
case GF_MULT_SPLIT_TABLE:
- if ((arg1 == 8 && arg2 == 16) || (arg2 == 8 && arg1 == 16)) {
- region_type |= GF_REGION_LAZY;
- if (region_type != GF_REGION_LAZY) return -1;
- return sizeof(gf_internal_t) + sizeof(struct gf_logtable_data) + 64;
- } else if ((arg1 == 4 && arg2 == 16) || (arg2 == 4 && arg1 == 16)) {
- region_type &= (~GF_REGION_LAZY); /* Ignore GF_REGION_LAZY */
- if ((region_type & ss) == ss) return -1;
- if ((region_type & sa) == sa) return -1;
- if ((region_type & ss) == 0) region_type |= GF_REGION_SSE;
- if (region_type & GF_REGION_NOSSE) {
- if (region_type != GF_REGION_NOSSE) return -1;
- return sizeof(gf_internal_t) + sizeof(struct gf_logtable_data) + 64;
- } else {
- if ((region_type | ss | sa) != (ss|sa)) return -1;
- return sizeof(gf_internal_t) + sizeof(struct gf_logtable_data) + 64;
- }
- }
- return -1;
- break;
- case GF_MULT_GROUP:
- if (arg1 == 4 && arg2 == 4) {
- return sizeof(gf_internal_t) + sizeof(struct gf_w16_group_4_4_data) + 64;
+ if (arg1 == 8 && arg2 == 8) {
+ return sizeof(gf_internal_t) + sizeof(struct gf_w16_split_8_8_data) + 64;
+ } else if ((arg1 == 8 && arg2 == 16) || (arg2 == 8 && arg1 == 16)) {
+ return sizeof(gf_internal_t) + sizeof(struct gf_w16_logtable_data) + 64;
+ } else if (mult_type == GF_MULT_DEFAULT ||
+ (arg1 == 4 && arg2 == 16) || (arg2 == 4 && arg1 == 16)) {
+ return sizeof(gf_internal_t) + sizeof(struct gf_w16_logtable_data) + 64;
}
- return -1;
+ return 0;
+ break;
+ case GF_MULT_GROUP:
+ return sizeof(gf_internal_t) + sizeof(struct gf_w16_group_4_4_data) + 64;
+ break;
+ case GF_MULT_CARRY_FREE:
+ return sizeof(gf_internal_t);
+ break;
case GF_MULT_SHIFT:
- if (arg1 != 0 || arg2 != 0 || region_type != 0) return -1;
return sizeof(gf_internal_t);
break;
case GF_MULT_COMPOSITE:
- if (region_type & ~(GF_REGION_ALTMAP | GF_REGION_STDMAP)) return -1;
- if (arg1 == 2 && arg2 == 0) {
- return sizeof(gf_internal_t) + sizeof(struct gf_w8_single_table_data) + 64;
- } else if (arg1 == 2 && arg2 == 1) {
- return sizeof(gf_internal_t) + 64;
- } else {
- return -1;
- }
+ return sizeof(gf_internal_t) + sizeof(struct gf_w16_composite_data) + 64;
+ break;
default:
- return -1;
+ return 0;
}
+ return 0;
}
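Since every size is now reported uniformly as the internal struct plus its
private data plus 64 bytes of alignment slack, a caller can preallocate the
scratch region itself. A sketch via the top-level gf_scratch_size() /
gf_init_hard() pair, assuming their declarations in gf_complete.h:

  #include <stdlib.h>
  #include "gf_complete.h"

  int table_with_own_scratch(void)
  {
    gf_t gf;
    void *scratch;
    int sz = gf_scratch_size(16, GF_MULT_TABLE, GF_REGION_DEFAULT,
                             GF_DIVIDE_DEFAULT, 0, 0);

    if (sz <= 0 || (scratch = malloc(sz)) == NULL) return 0;
    return gf_init_hard(&gf, 16, GF_MULT_TABLE, GF_REGION_DEFAULT,
                        GF_DIVIDE_DEFAULT, 0, 0, 0, NULL, scratch);
  }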
int gf_w16_init(gf_t *gf)
@@ -1895,7 +2392,27 @@ int gf_w16_init(gf_t *gf)
gf_internal_t *h;
h = (gf_internal_t *) gf->scratch;
- if (h->prim_poly == 0) h->prim_poly = 0x1100b;
+
+ /* Allen: set default primitive polynomial / irreducible polynomial if needed */
+
+ if (h->prim_poly == 0) {
+ if (h->mult_type == GF_MULT_COMPOSITE) {
+ h->prim_poly = gf_composite_get_default_poly(h->base_gf);
+ if (h->prim_poly == 0) return 0;
+ } else {
+
+      /* Allen: use the following primitive polynomial to make
+         carryless multiply work more efficiently for GF(2^16):
+
+           h->prim_poly = 0x1002d;
+
+         The following is the traditional primitive polynomial for GF(2^16). */
+
+ h->prim_poly = 0x1100b;
+ }
+ }
+
+ if (h->mult_type != GF_MULT_COMPOSITE) h->prim_poly |= (1 << 16);
gf->multiply.w32 = NULL;
gf->divide.w32 = NULL;
@@ -1903,21 +2420,17 @@ int gf_w16_init(gf_t *gf)
gf->multiply_region.w32 = NULL;
switch(h->mult_type) {
- case GF_MULT_LOG_TABLE:
- if (h->arg1 == 1) {
- if (gf_w16_log_zero_init(gf) == 0) return 0;
- } else {
- if (gf_w16_log_init(gf) == 0) return 0;
- }
- break;
+ case GF_MULT_LOG_ZERO: if (gf_w16_log_zero_init(gf) == 0) return 0; break;
+ case GF_MULT_LOG_TABLE: if (gf_w16_log_init(gf) == 0) return 0; break;
case GF_MULT_DEFAULT:
case GF_MULT_SPLIT_TABLE: if (gf_w16_split_init(gf) == 0) return 0; break;
case GF_MULT_TABLE: if (gf_w16_table_init(gf) == 0) return 0; break;
- case GF_MULT_SHIFT: if (gf_w16_shift_init(gf) == 0) return 0; break;
- case GF_MULT_COMPOSITE: if (gf_w16_composite_init(gf) == 0) return 0; break;
+ case GF_MULT_CARRY_FREE: if (gf_w16_cfm_init(gf) == 0) return 0; break;
+ case GF_MULT_SHIFT: if (gf_w16_shift_init(gf) == 0) return 0; break;
+ case GF_MULT_COMPOSITE: if (gf_w16_composite_init(gf) == 0) return 0; break;
case GF_MULT_BYTWO_p:
- case GF_MULT_BYTWO_b: if (gf_w16_bytwo_init(gf) == 0) return 0; break;
- case GF_MULT_GROUP: if (gf_w16_group_init(gf) == 0) return 0; break;
+ case GF_MULT_BYTWO_b: if (gf_w16_bytwo_init(gf) == 0) return 0; break;
+ case GF_MULT_GROUP: if (gf_w16_group_init(gf) == 0) return 0; break;
default: return 0;
}
if (h->divide_type == GF_DIVIDE_EUCLID) {
@@ -1928,23 +2441,28 @@ int gf_w16_init(gf_t *gf)
gf->inverse.w32 = gf_w16_matrix;
}
- if (gf->inverse.w32== NULL && gf->divide.w32 == NULL) gf->inverse.w32 = gf_w16_euclid;
-
- if (gf->inverse.w32 != NULL && gf->divide.w32 == NULL) {
+ if (gf->divide.w32 == NULL) {
gf->divide.w32 = gf_w16_divide_from_inverse;
+ if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w16_euclid;
}
- if (gf->inverse.w32 == NULL && gf->divide.w32 != NULL) {
- gf->inverse.w32 = gf_w16_inverse_from_divide;
- }
+
+ if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w16_inverse_from_divide;
+
if (h->region_type & GF_REGION_ALTMAP) {
if (h->mult_type == GF_MULT_COMPOSITE) {
gf->extract_word.w32 = gf_w16_composite_extract_word;
} else {
gf->extract_word.w32 = gf_w16_split_extract_word;
}
+ } else if (h->region_type == GF_REGION_CAUCHY) {
+ gf->multiply_region.w32 = gf_wgen_cauchy_region;
+ gf->extract_word.w32 = gf_wgen_extract_word;
} else {
gf->extract_word.w32 = gf_w16_extract_word;
}
+ if (gf->multiply_region.w32 == NULL) {
+ gf->multiply_region.w32 = gf_w16_multiply_region_from_single;
+ }
return 1;
}
@@ -1953,11 +2471,11 @@ int gf_w16_init(gf_t *gf)
uint16_t *gf_w16_get_log_table(gf_t *gf)
{
gf_internal_t *h;
- struct gf_logtable_data *ltd;
+ struct gf_w16_logtable_data *ltd;
h = (gf_internal_t *) gf->scratch;
if (gf->multiply.w32 == gf_w16_log_multiply) {
- ltd = (struct gf_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
+ ltd = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
return (uint16_t *) ltd->log_tbl;
}
return NULL;
@@ -1966,11 +2484,11 @@ uint16_t *gf_w16_get_log_table(gf_t *gf)
uint16_t *gf_w16_get_mult_alog_table(gf_t *gf)
{
gf_internal_t *h;
- struct gf_logtable_data *ltd;
+ struct gf_w16_logtable_data *ltd;
h = (gf_internal_t *) gf->scratch;
if (gf->multiply.w32 == gf_w16_log_multiply) {
- ltd = (struct gf_logtable_data *) h->private;
+ ltd = (struct gf_w16_logtable_data *) h->private;
return (uint16_t *) ltd->antilog_tbl;
}
return NULL;
@@ -1979,11 +2497,11 @@ uint16_t *gf_w16_get_mult_alog_table(gf_t *gf)
uint16_t *gf_w16_get_div_alog_table(gf_t *gf)
{
gf_internal_t *h;
- struct gf_logtable_data *ltd;
+ struct gf_w16_logtable_data *ltd;
h = (gf_internal_t *) gf->scratch;
if (gf->multiply.w32 == gf_w16_log_multiply) {
- ltd = (struct gf_logtable_data *) h->private;
+ ltd = (struct gf_w16_logtable_data *) h->private;
return (uint16_t *) ltd->d_antilog;
}
return NULL;
diff --git a/gf_w32.c b/gf_w32.c
index b0ba8c5..cae188f 100644
--- a/gf_w32.c
+++ b/gf_w32.c
@@ -15,24 +15,14 @@
#define GF_BASE_FIELD_WIDTH (16)
#define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH)
#define GF_BASE_FIELD_GROUP_SIZE (GF_BASE_FIELD_SIZE-1)
-#define GF_S_GF_16_2 (40188)
-#define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : (p) << 1);
-
-
-struct gf_w16_logtable_data {
- int log_tbl[GF_BASE_FIELD_SIZE];
- uint16_t _antilog_tbl[GF_BASE_FIELD_SIZE * 4];
- uint16_t *antilog_tbl;
- uint16_t inv_tbl[GF_BASE_FIELD_SIZE];
- uint32_t log_s;
-};
+#define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : (p) << 1)
struct gf_split_2_32_lazy_data {
uint32_t tables[16][4];
uint32_t last_value;
};
-struct gf_split_8_8_data {
+struct gf_w32_split_8_8_data {
uint32_t tables[7][256][256];
uint32_t region_tables[4][256];
uint32_t last_value;
@@ -67,6 +57,11 @@ struct gf_w32_bytwo_data {
uint64_t mask2;
};
+struct gf_w32_composite_data {
+ uint16_t *log;
+ uint16_t *alog;
+};
+
#define MM_PRINT32(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 4) printf(" %02x%02x%02x%02x", blah[15-ii], blah[14-ii], blah[13-ii], blah[12-ii]); printf("\n"); }
#define MM_PRINT8(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 1) printf("%s%02x", (ii%4==0) ? " " : " ", blah[15-ii]); printf("\n"); }
@@ -121,6 +116,168 @@ xor)
}
}
+static
+void
+gf_w32_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
+{
+
+#ifdef INTEL_SSE4_PCLMUL
+
+ int i;
+ uint32_t *s32;
+ uint32_t *d32;
+
+ __m128i a, b;
+ __m128i result;
+ __m128i prim_poly;
+ __m128i w;
+ gf_internal_t * h = gf->scratch;
+
+ prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL));
+
+ if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+ if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+ a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);
+ s32 = (uint32_t *) src;
+ d32 = (uint32_t *) dest;
+
+ if (xor) {
+ for (i = 0; i < bytes/sizeof(uint32_t); i++) {
+ b = _mm_insert_epi32 (a, s32[i], 0);
+ result = _mm_clmulepi64_si128 (a, b, 0);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+ result = _mm_xor_si128 (result, w);
+ d32[i] ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));
+ }
+ } else {
+ for (i = 0; i < bytes/sizeof(uint32_t); i++) {
+ b = _mm_insert_epi32 (a, s32[i], 0);
+ result = _mm_clmulepi64_si128 (a, b, 0);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+ result = _mm_xor_si128 (result, w);
+ d32[i] = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+ }
+ }
+#endif
+}
+
+static
+void
+gf_w32_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
+{
+
+#ifdef INTEL_SSE4_PCLMUL
+
+ int i;
+ uint32_t *s32;
+ uint32_t *d32;
+
+ __m128i a, b;
+ __m128i result;
+ __m128i prim_poly;
+ __m128i w;
+ gf_internal_t * h = gf->scratch;
+
+ prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL));
+
+ if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+ if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+ a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);
+
+ s32 = (uint32_t *) src;
+ d32 = (uint32_t *) dest;
+
+ if (xor) {
+ for (i = 0; i < bytes/sizeof(uint32_t); i++) {
+ b = _mm_insert_epi32 (a, s32[i], 0);
+ result = _mm_clmulepi64_si128 (a, b, 0);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+ result = _mm_xor_si128 (result, w);
+ d32[i] ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));
+ }
+ } else {
+ for (i = 0; i < bytes/sizeof(uint32_t); i++) {
+ b = _mm_insert_epi32 (a, s32[i], 0);
+ result = _mm_clmulepi64_si128 (a, b, 0);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+ result = _mm_xor_si128 (result, w);
+ d32[i] = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+ }
+ }
+#endif
+}
+
+static
+void
+gf_w32_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
+{
+#ifdef INTEL_SSE4_PCLMUL
+ int i;
+ uint32_t *s32;
+ uint32_t *d32;
+
+ __m128i a, b;
+ __m128i result;
+ __m128i prim_poly;
+ __m128i w;
+ gf_internal_t * h = gf->scratch;
+
+ prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL));
+
+ if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+ if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+ a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);
+
+ s32 = (uint32_t *) src;
+ d32 = (uint32_t *) dest;
+
+ if (xor) {
+ for (i = 0; i < bytes/sizeof(uint32_t); i++) {
+ b = _mm_insert_epi32 (a, s32[i], 0);
+ result = _mm_clmulepi64_si128 (a, b, 0);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+ result = _mm_xor_si128 (result, w);
+ d32[i] ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));
+ }
+ } else {
+ for (i = 0; i < bytes/sizeof(uint32_t); i++) {
+ b = _mm_insert_epi32 (a, s32[i], 0);
+ result = _mm_clmulepi64_si128 (a, b, 0);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+ result = _mm_xor_si128 (result, w);
+ d32[i] = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+ }
+ }
+#endif
+}
+
static
inline
uint32_t gf_w32_euclid (gf_t *gf, uint32_t b)
@@ -131,7 +288,7 @@ uint32_t gf_w32_euclid (gf_t *gf, uint32_t b)
uint32_t c_i;
if (b == 0) return -1;
- e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly;
+ e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly;
e_i = b;
d_im1 = 32;
for (d_i = d_im1-1; ((1 << d_i) & e_i) == 0; d_i--) ;
@@ -148,6 +305,7 @@ uint32_t gf_w32_euclid (gf_t *gf, uint32_t b)
c_i ^= (1 << (d_ip1 - d_i));
e_ip1 ^= (e_i << (d_ip1 - d_i));
d_ip1--;
+ if (e_ip1 == 0) return 0;
while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--;
}
@@ -237,6 +395,134 @@ uint32_t gf_w32_matrix (gf_t *gf, uint32_t b)
extra memory.
*/
+
+
+
+static
+inline
+gf_val_32_t
+gf_w32_clm_multiply_2 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
+{
+ gf_val_32_t rv = 0;
+
+#ifdef INTEL_SSE4_PCLMUL
+
+ __m128i a, b;
+ __m128i result;
+ __m128i prim_poly;
+  __m128i w;
+ gf_internal_t * h = gf->scratch;
+
+
+ a = _mm_insert_epi32 (_mm_setzero_si128(), a32, 0);
+ b = _mm_insert_epi32 (a, b32, 0);
+
+ prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL));
+
+ /* Do the initial multiply */
+
+ result = _mm_clmulepi64_si128 (a, b, 0);
+
+  /* Ben: Do the prim_poly reduction twice. We are guaranteed to need the
+     reduction at most twice, because (w-2)/z == 2, where z is the number
+     of zeros after the leading 1.
+
+     _mm_clmulepi64_si128 is the carryless multiply operation. Here
+     _mm_srli_si128 shifts the result to the right by 4 bytes. This allows
+     us to multiply the prim_poly by the leading bits of the result. We
+     then xor the result of that operation back into the result. */
+
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+ result = _mm_xor_si128 (result, w);
+
+ /* Extracts 32 bit value from result. */
+ rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+#endif
+ return rv;
+}
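The reduction-count arithmetic in the comment above is what gf_w32_cfm_init
(below) tests with its masks. A 64-bit product carries 31 excess bits above
bit 31, and each PCLMUL reduction step retires about z of them, where z is the
number of zero coefficients between x^32 and the polynomial's next term:
(0xfffe0000 & prim_poly) == 0 means a gap of at least 15 bits (two steps),
(0xffc00000 & ...) == 0 at least 10 (three steps), and (0xfe000000 & ...) == 0
at least 7 (four steps); anything denser is rejected.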
+static
+inline
+gf_val_32_t
+gf_w32_clm_multiply_3 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
+{
+ gf_val_32_t rv = 0;
+
+#ifdef INTEL_SSE4_PCLMUL
+
+ __m128i a, b;
+ __m128i result;
+ __m128i prim_poly;
+  __m128i w;
+ gf_internal_t * h = gf->scratch;
+
+
+ a = _mm_insert_epi32 (_mm_setzero_si128(), a32, 0);
+ b = _mm_insert_epi32 (a, b32, 0);
+
+ prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL));
+
+ /* Do the initial multiply */
+
+ result = _mm_clmulepi64_si128 (a, b, 0);
+
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+ result = _mm_xor_si128 (result, w);
+
+ /* Extracts 32 bit value from result. */
+
+ rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+#endif
+ return rv;
+}
+
+static
+inline
+gf_val_32_t
+gf_w32_clm_multiply_4 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
+{
+ gf_val_32_t rv = 0;
+
+#ifdef INTEL_SSE4_PCLMUL
+
+ __m128i a, b;
+ __m128i result;
+ __m128i prim_poly;
+  __m128i w;
+ gf_internal_t * h = gf->scratch;
+
+
+ a = _mm_insert_epi32 (_mm_setzero_si128(), a32, 0);
+ b = _mm_insert_epi32 (a, b32, 0);
+
+ prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL));
+
+ /* Do the initial multiply */
+
+ result = _mm_clmulepi64_si128 (a, b, 0);
+
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+ result = _mm_xor_si128 (result, w);
+
+ /* Extracts 32 bit value from result. */
+
+ rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+#endif
+ return rv;
+}
+
+
static
inline
uint32_t
@@ -244,7 +530,7 @@ gf_w32_shift_multiply (gf_t *gf, uint32_t a32, uint32_t b32)
{
uint64_t product, i, pp, a, b, one;
gf_internal_t *h;
-
+
a = a32;
b = b32;
h = (gf_internal_t *) gf->scratch;
@@ -256,37 +542,63 @@ gf_w32_shift_multiply (gf_t *gf, uint32_t a32, uint32_t b32)
for (i = 0; i < GF_FIELD_WIDTH; i++) {
if (a & (one << i)) product ^= (b << i);
}
- for (i = (GF_FIELD_WIDTH*2-1); i >= GF_FIELD_WIDTH; i--) {
+ for (i = (GF_FIELD_WIDTH*2-2); i >= GF_FIELD_WIDTH; i--) {
if (product & (one << i)) product ^= (pp << (i-GF_FIELD_WIDTH));
}
return product;
}
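The loop bound above changes from 2w-1 to 2w-2 because the product of two
polynomials of degree at most w-1 has degree at most 2w-2, so the old first
iteration could never fire. A standalone sketch of the same shift/reduce loop
for a toy w = 8 field (pp is assumed to carry its x^8 term, e.g. 0x11d):

  static uint32_t shift_mult_w8(uint32_t a, uint32_t b, uint32_t pp)
  {
    uint32_t product = 0;
    int i;

    for (i = 0; i < 8; i++)
      if (a & (1u << i)) product ^= (b << i);
    for (i = 14; i >= 8; i--)          /* 2w-2: bit 15 is never set */
      if (product & (1u << i)) product ^= (pp << (i - 8));
    return product;
  }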
-static
+ static
+int gf_w32_cfm_init(gf_t *gf)
+{
+ gf_internal_t *h;
+
+ h = (gf_internal_t *) gf->scratch;
+
+ gf->inverse.w32 = gf_w32_euclid;
+ gf->multiply_region.w32 = gf_w32_multiply_region_from_single;
+
+  /* Ben: We also check whether the prim poly will work for PCLMUL,
+     and how many reduction steps it will take. */
+
+#ifdef INTEL_SSE4_PCLMUL
+  if ((0xfffe0000 & h->prim_poly) == 0) {
+    gf->multiply.w32 = gf_w32_clm_multiply_2;
+    gf->multiply_region.w32 = gf_w32_clm_multiply_region_from_single_2;
+  } else if ((0xffc00000 & h->prim_poly) == 0) {
+    gf->multiply.w32 = gf_w32_clm_multiply_3;
+    gf->multiply_region.w32 = gf_w32_clm_multiply_region_from_single_3;
+  } else if ((0xfe000000 & h->prim_poly) == 0) {
+    gf->multiply.w32 = gf_w32_clm_multiply_4;
+    gf->multiply_region.w32 = gf_w32_clm_multiply_region_from_single_4;
+  } else {
+    return 0;
+  }
+  return 1;
+#endif
+
+ return 0;
+}
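A hedged usage sketch for the new carry-free path (it fails at init time when
the library was not built with INTEL_SSE4_PCLMUL, or when the polynomial needs
more than four reduction steps):

  #include "gf_complete.h"

  int cfm_demo(void)
  {
    gf_t gf;

    return gf_init_hard(&gf, 32, GF_MULT_CARRY_FREE, GF_REGION_DEFAULT,
                        GF_DIVIDE_DEFAULT, 0, 0, 0, NULL, NULL);
  }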
+
+ static
int gf_w32_shift_init(gf_t *gf)
{
- gf->multiply.w32 = gf_w32_shift_multiply;
gf->inverse.w32 = gf_w32_euclid;
gf->multiply_region.w32 = gf_w32_multiply_region_from_single;
+ gf->multiply.w32 = gf_w32_shift_multiply;
return 1;
}
static
-void
+ void
gf_w32_group_set_shift_tables(uint32_t *shift, uint32_t val, gf_internal_t *h)
{
int i;
uint32_t j;
- int g_s;
shift[0] = 0;
-
- if (h->mult_type == GF_MULT_DEFAULT) {
- g_s = 3;
- } else {
- g_s = h->arg1;
- }
- for (i = 1; i < (1 << g_s); i <<= 1) {
+
+ for (i = 1; i < (1 << h->arg1); i <<= 1) {
for (j = 0; j < i; j++) shift[i|j] = shift[j]^val;
if (val & GF_FIRST_BIT) {
val <<= 1;
@@ -297,7 +609,7 @@ gf_w32_group_set_shift_tables(uint32_t *shift, uint32_t val, gf_internal_t *h)
}
}
-static
+ static
void gf_w32_group_s_equals_r_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
int i;
@@ -333,10 +645,10 @@ void gf_w32_group_s_equals_r_multiply_region(gf_t *gf, void *src, void *dest, gf
ind = a32 >> rs;
a32 <<= leftover;
p = gd->shift[ind];
-
+
bits_left = rs;
rs = 32 - g_s;
-
+
while (bits_left > 0) {
bits_left -= g_s;
ind = a32 >> rs;
@@ -352,7 +664,7 @@ void gf_w32_group_s_equals_r_multiply_region(gf_t *gf, void *src, void *dest, gf
gf_do_final_region_alignment(&rd);
}
-static
+ static
void gf_w32_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
uint32_t *s32, *d32, *top;
@@ -368,13 +680,8 @@ void gf_w32_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v
if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
gf_internal_t *h = (gf_internal_t *) gf->scratch;
- if (h->mult_type == GF_MULT_DEFAULT) {
- g_s = 3;
- g_r = 8;
- } else {
- g_s = h->arg1;
- g_r = h->arg2;
- }
+ g_s = h->arg1;
+ g_r = h->arg2;
gd = (struct gf_w32_group_data *) h->private;
gf_w32_group_set_shift_tables(gd->shift, val, h);
@@ -527,13 +834,8 @@ gf_w32_group_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
struct gf_w32_group_data *gd;
gf_internal_t *h = (gf_internal_t *) gf->scratch;
- if (h->mult_type == GF_MULT_DEFAULT) {
- g_s = 3;
- g_r = 8;
- } else {
- g_s = h->arg1;
- g_r = h->arg2;
- }
+ g_s = h->arg1;
+ g_r = h->arg2;
gd = (struct gf_w32_group_data *) h->private;
gf_w32_group_set_shift_tables(gd->shift, b, h);
@@ -684,7 +986,7 @@ static
void
gf_w32_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
-#ifdef INTEL_SSE4
+#ifdef INTEL_SSE2
int i;
uint8_t *s8, *d8;
uint32_t vrev;
@@ -879,7 +1181,7 @@ static
void
gf_w32_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w32_bytwo_data *btd)
{
-#ifdef INTEL_SSE4
+#ifdef INTEL_SSE2
int i;
uint8_t *d8, *s8, tb;
__m128i pp, m1, m2, t1, t2, va, vb;
@@ -905,7 +1207,7 @@ static
void
gf_w32_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w32_bytwo_data *btd)
{
-#ifdef INTEL_SSE4
+#ifdef INTEL_SSE2
int i;
uint8_t *d8, *s8, tb;
__m128i pp, m1, m2, t1, t2, va, vb;
@@ -934,7 +1236,7 @@ static
void
gf_w32_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
-#ifdef INTEL_SSE4
+#ifdef INTEL_SSE2
uint32_t itb;
uint8_t *d8, *s8;
__m128i pp, m1, m2, t1, t2, va, vb;
@@ -1012,19 +1314,30 @@ int gf_w32_bytwo_init(gf_t *gf)
if (h->mult_type == GF_MULT_BYTWO_p) {
gf->multiply.w32 = gf_w32_bytwo_p_multiply;
- if (h->region_type == GF_REGION_SSE) {
- gf->multiply_region.w32 = gf_w32_bytwo_p_sse_multiply_region;
- } else {
+ #ifdef INTEL_SSE2
+ if (h->region_type & GF_REGION_NOSSE)
+ gf->multiply_region.w32 = gf_w32_bytwo_p_nosse_multiply_region;
+ else
+ gf->multiply_region.w32 = gf_w32_bytwo_p_sse_multiply_region;
+ #else
gf->multiply_region.w32 = gf_w32_bytwo_p_nosse_multiply_region;
- }
+ if(h->region_type & GF_REGION_SSE)
+ return 0;
+ #endif
} else {
gf->multiply.w32 = gf_w32_bytwo_b_multiply;
- if (h->region_type == GF_REGION_SSE) {
- gf->multiply_region.w32 = gf_w32_bytwo_b_sse_multiply_region;
- } else {
+ #ifdef INTEL_SSE2
+ if (h->region_type & GF_REGION_NOSSE)
+ gf->multiply_region.w32 = gf_w32_bytwo_b_nosse_multiply_region;
+ else
+ gf->multiply_region.w32 = gf_w32_bytwo_b_sse_multiply_region;
+ #else
gf->multiply_region.w32 = gf_w32_bytwo_b_nosse_multiply_region;
- }
+ if(h->region_type & GF_REGION_SSE)
+ return 0;
+ #endif
}
+
gf->inverse.w32 = gf_w32_euclid;
return 1;
}
@@ -1036,10 +1349,10 @@ gf_w32_split_8_8_multiply (gf_t *gf, uint32_t a32, uint32_t b32)
{
uint32_t product, i, j, mask, tb;
gf_internal_t *h;
- struct gf_split_8_8_data *d8;
+ struct gf_w32_split_8_8_data *d8;
h = (gf_internal_t *) gf->scratch;
- d8 = (struct gf_split_8_8_data *) h->private;
+ d8 = (struct gf_w32_split_8_8_data *) h->private;
product = 0;
mask = 0xff;
@@ -1062,7 +1375,7 @@ gf_w32_split_8_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t
gf_internal_t *h;
uint32_t *s32, *d32, *top, p, a, v;
struct gf_split_8_32_lazy_data *d8;
- struct gf_split_8_8_data *d88;
+ struct gf_w32_split_8_8_data *d88;
uint32_t *t[4];
int i, j, k, change;
uint32_t pp;
@@ -1072,13 +1385,13 @@ gf_w32_split_8_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t
if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
h = (gf_internal_t *) gf->scratch;
- if (h->arg1 == 32 || h->arg2 == 32) {
+ if (h->arg1 == 32 || h->arg2 == 32 || h->mult_type == GF_MULT_DEFAULT) {
d8 = (struct gf_split_8_32_lazy_data *) h->private;
for (i = 0; i < 4; i++) t[i] = d8->tables[i];
change = (val != d8->last_value);
if (change) d8->last_value = val;
} else {
- d88 = (struct gf_split_8_8_data *) h->private;
+ d88 = (struct gf_w32_split_8_8_data *) h->private;
for (i = 0; i < 4; i++) t[i] = d88->region_tables[i];
change = (val != d88->last_value);
if (change) d88->last_value = val;
@@ -1243,7 +1556,7 @@ static
void
gf_w32_split_2_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
{
-#ifdef INTEL_SSE4
+#ifdef INTEL_SSSE3
gf_internal_t *h;
int i, m, j, tindex;
uint32_t pp, v, v2, s, *s32, *d32, *top;
@@ -1380,7 +1693,7 @@ static
void
gf_w32_split_4_32_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
{
-#ifdef INTEL_SSE4
+#ifdef INTEL_SSSE3
gf_internal_t *h;
int i, m, j, k, tindex;
uint32_t pp, v, s, *s32, *d32, *top, *realtop;
@@ -1572,15 +1885,15 @@ static
void
gf_w32_split_4_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
{
-#ifdef INTEL_SSE4
+#ifdef INTEL_SSSE3
gf_internal_t *h;
int i, m, j, k, tindex;
uint32_t pp, v, s, *s32, *d32, *top, tmp_table[16];
- __m128i vi, si, tables[8][4], p0, p1, p2, p3, mask1, v0, v1, v2, v3, mask8, mask16;
+ __m128i vi, si, tables[8][4], p0, p1, p2, p3, mask1, v0, v1, v2, v3, mask8;
__m128i tv1, tv2, tv3, tv0;
uint8_t btable[16];
gf_region_data rd;
-
+
if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
@@ -1593,7 +1906,7 @@ gf_w32_split_4_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint
s32 = (uint32_t *) rd.s_start;
d32 = (uint32_t *) rd.d_start;
top = (uint32_t *) rd.d_top;
-
+
v = val;
for (i = 0; i < 8; i++) {
tmp_table[0] = 0;
@@ -1614,7 +1927,6 @@ gf_w32_split_4_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint
mask1 = _mm_set1_epi8(0xf);
mask8 = _mm_set1_epi16(0xff);
- mask16 = _mm_set1_epi32(0xffff);
if (xor) {
while (d32 != top) {
@@ -1737,36 +2049,41 @@ gf_w32_split_4_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint
v1 = _mm_load_si128((__m128i *) s32); s32 += 4;
v2 = _mm_load_si128((__m128i *) s32); s32 += 4;
v3 = _mm_load_si128((__m128i *) s32); s32 += 4;
-
+
p0 = _mm_srli_epi16(v0, 8);
p1 = _mm_srli_epi16(v1, 8);
p2 = _mm_srli_epi16(v2, 8);
p3 = _mm_srli_epi16(v3, 8);
-
+
tv0 = _mm_and_si128(v0, mask8);
tv1 = _mm_and_si128(v1, mask8);
tv2 = _mm_and_si128(v2, mask8);
tv3 = _mm_and_si128(v3, mask8);
-
+
v0 = _mm_packus_epi16(p1, p0);
v1 = _mm_packus_epi16(tv1, tv0);
v2 = _mm_packus_epi16(p3, p2);
v3 = _mm_packus_epi16(tv3, tv2);
-
+
p0 = _mm_srli_epi16(v0, 8);
p1 = _mm_srli_epi16(v1, 8);
p2 = _mm_srli_epi16(v2, 8);
p3 = _mm_srli_epi16(v3, 8);
-
+
tv0 = _mm_and_si128(v0, mask8);
tv1 = _mm_and_si128(v1, mask8);
tv2 = _mm_and_si128(v2, mask8);
tv3 = _mm_and_si128(v3, mask8);
-
+
v0 = _mm_packus_epi16(p2, p0);
v1 = _mm_packus_epi16(p3, p1);
v2 = _mm_packus_epi16(tv2, tv0);
v3 = _mm_packus_epi16(tv3, tv1);
+
+ p0 = v0;
+ p1 = v1;
+ p2 = v2;
+ p3 = v3;
si = _mm_and_si128(v0, mask1);
p0 = _mm_shuffle_epi8(tables[6][0], si);
@@ -1818,18 +2135,18 @@ gf_w32_split_4_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint
p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[1][0], si));
p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[1][1], si));
p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[1][2], si));
- p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si));
+ p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si));
tv0 = _mm_unpackhi_epi8(p1, p3);
tv1 = _mm_unpackhi_epi8(p0, p2);
tv2 = _mm_unpacklo_epi8(p1, p3);
tv3 = _mm_unpacklo_epi8(p0, p2);
-
+
p0 = _mm_unpackhi_epi8(tv1, tv0);
p1 = _mm_unpacklo_epi8(tv1, tv0);
p2 = _mm_unpackhi_epi8(tv3, tv2);
p3 = _mm_unpacklo_epi8(tv3, tv2);
-
+
_mm_store_si128((__m128i *) d32, p0);
_mm_store_si128((__m128i *) (d32+4), p1);
_mm_store_si128((__m128i *) (d32+8), p2);
@@ -1848,19 +2165,50 @@ int gf_w32_split_init(gf_t *gf)
gf_internal_t *h;
struct gf_split_2_32_lazy_data *ld2;
struct gf_split_4_32_lazy_data *ld4;
- struct gf_split_8_8_data *d8;
+ struct gf_w32_split_8_8_data *d8;
struct gf_split_8_32_lazy_data *d32;
struct gf_split_16_32_lazy_data *d16;
uint32_t p, basep;
- int i, j, exp;
+ int i, j, exp, ispclmul, issse3;
+
+ ispclmul = 0;
+#ifdef INTEL_SSE4_PCLMUL
+ ispclmul = 1;
+#endif
+
+ issse3 = 0;
+#ifdef INTEL_SSSE3
+ issse3 = 1;
+#endif
h = (gf_internal_t *) gf->scratch;
/* Defaults */
- gf->multiply_region.w32 = gf_w32_multiply_region_from_single;
- gf->multiply.w32 = gf_w32_shift_multiply;
+
gf->inverse.w32 = gf_w32_euclid;
+ /* JSP: First handle single multiplication:
+ If args == 8, then we're doing split 8 8.
+ Otherwise, if PCLMUL, we use that.
+ Otherwise, we use bytwo_p.
+ */
+
+ if (h->arg1 == 8 && h->arg2 == 8) {
+ gf->multiply.w32 = gf_w32_split_8_8_multiply;
+ } else if (ispclmul) {
+ if ((0xfffe0000 & h->prim_poly) == 0){
+ gf->multiply.w32 = gf_w32_clm_multiply_2;
+ } else if ((0xffc00000 & h->prim_poly) == 0){
+ gf->multiply.w32 = gf_w32_clm_multiply_3;
+ } else if ((0xfe000000 & h->prim_poly) == 0){
+ gf->multiply.w32 = gf_w32_clm_multiply_4;
+ }
+ } else {
+ gf->multiply.w32 = gf_w32_bytwo_p_multiply;
+ }
+
+ /* Easy cases: 16/32 and 2/32 */
+
if ((h->arg1 == 16 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 16)) {
d16 = (struct gf_split_16_32_lazy_data *) h->private;
d16->last_value = 0;
@@ -1868,15 +2216,51 @@ int gf_w32_split_init(gf_t *gf)
return 1;
}
- if ((h->arg1 == 8 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 8)) {
+ if ((h->arg1 == 2 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 2)) {
+ ld2 = (struct gf_split_2_32_lazy_data *) h->private;
+ ld2->last_value = 0;
+ #ifdef INTEL_SSSE3
+ if (!(h->region_type & GF_REGION_NOSSE))
+ gf->multiply_region.w32 = gf_w32_split_2_32_lazy_sse_multiply_region;
+ else
+ gf->multiply_region.w32 = gf_w32_split_2_32_lazy_multiply_region;
+ #else
+ gf->multiply_region.w32 = gf_w32_split_2_32_lazy_multiply_region;
+ if(h->region_type & GF_REGION_SSE) return 0;
+ #endif
+ return 1;
+ }
+
+  /* 4/32, or DEFAULT when SSSE3 is available -- the default has no ALTMAP/NOSSE variants. */
+
+ if ((h->arg1 == 4 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 4) ||
+      (issse3 && h->mult_type == GF_MULT_DEFAULT)) {
+ ld4 = (struct gf_split_4_32_lazy_data *) h->private;
+ ld4->last_value = 0;
+ if ((h->region_type & GF_REGION_NOSSE) || !issse3) {
+ gf->multiply_region.w32 = gf_w32_split_4_32_lazy_multiply_region;
+ } else if (h->region_type & GF_REGION_ALTMAP) {
+ gf->multiply_region.w32 = gf_w32_split_4_32_lazy_sse_altmap_multiply_region;
+ } else {
+ gf->multiply_region.w32 = gf_w32_split_4_32_lazy_sse_multiply_region;
+ }
+ return 1;
+ }
+
+ /* 8/32 or Default + no SSE */
+
+ if ((h->arg1 == 8 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 8) ||
+ h->mult_type == GF_MULT_DEFAULT) {
d32 = (struct gf_split_8_32_lazy_data *) h->private;
d32->last_value = 0;
gf->multiply_region.w32 = gf_w32_split_8_32_lazy_multiply_region;
return 1;
}
+  /* Finally, if both args are 8, we have to set up the split 8/8 tables here. */
+
if (h->arg1 == 8 && h->arg2 == 8) {
- d8 = (struct gf_split_8_8_data *) h->private;
+ d8 = (struct gf_w32_split_8_8_data *) h->private;
d8->last_value = 0;
gf->multiply.w32 = gf_w32_split_8_8_multiply;
gf->multiply_region.w32 = gf_w32_split_8_32_lazy_multiply_region;
@@ -1908,31 +2292,10 @@ int gf_w32_split_init(gf_t *gf)
}
return 1;
}
- if ((h->arg1 == 2 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 2)) {
- ld2 = (struct gf_split_2_32_lazy_data *) h->private;
- ld2->last_value = 0;
- if (h->region_type & GF_REGION_SSE) {
- gf->multiply_region.w32 = gf_w32_split_2_32_lazy_sse_multiply_region;
- } else {
- gf->multiply_region.w32 = gf_w32_split_2_32_lazy_multiply_region;
- }
- return 1;
- }
- if ((h->arg1 == 4 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 4)) {
- ld4 = (struct gf_split_4_32_lazy_data *) h->private;
- ld4->last_value = 0;
- if (h->region_type & GF_REGION_SSE) {
- if (h->region_type & GF_REGION_ALTMAP) {
- gf->multiply_region.w32 = gf_w32_split_4_32_lazy_sse_altmap_multiply_region;
- } else {
- gf->multiply_region.w32 = gf_w32_split_4_32_lazy_sse_multiply_region;
- }
- } else {
- gf->multiply_region.w32 = gf_w32_split_4_32_lazy_multiply_region;
- }
- return 1;
- }
- return 1;
+
+ /* If we get here, then the arguments were bad. */
+
+ return 0;
}
static
@@ -1943,13 +2306,8 @@ int gf_w32_group_init(gf_t *gf)
gf_internal_t *h = (gf_internal_t *) gf->scratch;
int g_r, g_s;
- if (h->mult_type == GF_MULT_DEFAULT) {
- g_s = 3;
- g_r = 8;
- } else {
- g_s = h->arg1;
- g_r = h->arg2;
- }
+ g_s = h->arg1;
+ g_r = h->arg2;
gd = (struct gf_w32_group_data *) h->private;
gd->shift = (uint32_t *) (&(gd->memory));
@@ -1983,11 +2341,6 @@ int gf_w32_group_init(gf_t *gf)
} else {
gf->multiply.w32 = gf_w32_group_multiply;
gf->multiply_region.w32 = gf_w32_group_multiply_region;
- if (h->mult_type == GF_MULT_DEFAULT) {
-#ifdef INTEL_SSE4
- gf->multiply_region.w32 = gf_w32_split_4_32_lazy_sse_multiply_region;
-#endif
- }
}
gf->divide.w32 = NULL;
gf->inverse.w32 = gf_w32_euclid;
@@ -1995,62 +2348,53 @@ int gf_w32_group_init(gf_t *gf)
return 1;
}
+
static
uint32_t
-gf_w32_composite_multiply_logtable(gf_t *gf, uint32_t a, uint32_t b)
+gf_w32_composite_multiply_recursive(gf_t *gf, uint32_t a, uint32_t b)
{
gf_internal_t *h = (gf_internal_t *) gf->scratch;
- struct gf_w16_logtable_data * ltd = (struct gf_w16_logtable_data *) h->private;
-
- uint32_t b0 = b & 0xffff;
- uint32_t b1 = b >> 16;
- uint32_t a0 = a & 0xffff;
- uint32_t a1 = a >> 16;
+ gf_t *base_gf = h->base_gf;
+ uint32_t b0 = b & 0x0000ffff;
+ uint32_t b1 = (b & 0xffff0000) >> 16;
+ uint32_t a0 = a & 0x0000ffff;
+ uint32_t a1 = (a & 0xffff0000) >> 16;
uint32_t a1b1;
- uint32_t la0, la1, lb0, lb1, l11;
- uint32_t p;
-
- la0 = ltd->log_tbl[a0];
- la1 = ltd->log_tbl[a1];
- lb0 = ltd->log_tbl[b0];
- lb1 = ltd->log_tbl[b1];
-
- if (a1 && b1) {
- l11 = (la1 + lb1);
- a1b1 = ltd->antilog_tbl[l11];
- l11 = ltd->log_tbl[a1b1];
- p = ltd->antilog_tbl[l11+ltd->log_s];
- } else {
- a1b1 = 0;
- p = 0;
- }
-
- if (a0 && b1) p ^= ltd->antilog_tbl[la0+lb1];
+ uint32_t rv;
+ a1b1 = base_gf->multiply.w32(base_gf, a1, b1);
- if (a1 && b0) p ^= ltd->antilog_tbl[la1+lb0];
- p <<= 16;
- p ^= a1b1;
- if (a0 && b0) p ^= ltd->antilog_tbl[la0+lb0];
- return p;
+ rv = ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 16) | (base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1);
+ return rv;
}
+/* JSP: This could be made faster. Someday, when I'm bored. */
+
static
uint32_t
-gf_w32_composite_multiply_recursive(gf_t *gf, uint32_t a, uint32_t b)
+gf_w32_composite_multiply_inline(gf_t *gf, uint32_t a, uint32_t b)
{
gf_internal_t *h = (gf_internal_t *) gf->scratch;
gf_t *base_gf = h->base_gf;
- uint16_t b0 = b & 0x0000ffff;
- uint16_t b1 = (b & 0xffff0000) >> 16;
- uint16_t a0 = a & 0x0000ffff;
- uint16_t a1 = (a & 0xffff0000) >> 16;
- uint16_t a1b1;
- uint32_t rv;
-
- a1b1 = base_gf->multiply.w32(base_gf, a1, b1);
-
- rv = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, GF_S_GF_16_2)) << 16));
- return rv;
+ uint32_t b0 = b & 0x0000ffff;
+ uint32_t b1 = b >> 16;
+ uint32_t a0 = a & 0x0000ffff;
+ uint32_t a1 = a >> 16;
+ uint32_t a1b1, prod;
+ uint16_t *log, *alog;
+ struct gf_w32_composite_data *cd;
+
+ cd = (struct gf_w32_composite_data *) h->private;
+ log = cd->log;
+ alog = cd->alog;
+
+ a1b1 = GF_W16_INLINE_MULT(log, alog, a1, b1);
+ prod = GF_W16_INLINE_MULT(log, alog, a1, b0);
+ prod ^= GF_W16_INLINE_MULT(log, alog, a0, b1);
+ prod ^= GF_W16_INLINE_MULT(log, alog, a1b1, h->prim_poly);
+ prod <<= 16;
+ prod ^= GF_W16_INLINE_MULT(log, alog, a0, b0);
+ prod ^= a1b1;
+ return prod;
}
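The inline variant replaces the recursive base-field calls with direct log/antilog lookups. GF_W16_INLINE_MULT is defined elsewhere in the library; a plausible expansion, assuming the antilog table is doubled so that log[a] + log[b] never needs reducing mod 2^16 - 1, would be:

static inline uint16_t w16_inline_mult_sketch(const uint16_t *log,
                                              const uint16_t *alog,
                                              uint32_t a, uint32_t b)
{
  /* Zero has no logarithm: the product is zero if either input is. */
  if (a == 0 || b == 0) return 0;
  return alog[(uint32_t)log[a] + (uint32_t)log[b]];
}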
/*
@@ -2075,6 +2419,7 @@ gf_w32_composite_multiply_recursive(gf_t *gf, uint32_t a, uint32_t b)
*
* a / b = a * c
*/
+
static
uint32_t
gf_w32_composite_inverse(gf_t *gf, uint32_t a)
@@ -2089,7 +2434,7 @@ gf_w32_composite_inverse(gf_t *gf, uint32_t a)
if (a0 == 0) {
a1inv = base_gf->inverse.w32(base_gf, a1);
- c0 = base_gf->multiply.w32(base_gf, a1inv, GF_S_GF_16_2);
+ c0 = base_gf->multiply.w32(base_gf, a1inv, h->prim_poly);
c1 = a1inv;
} else if (a1 == 0) {
c0 = base_gf->inverse.w32(base_gf, a0);
@@ -2100,7 +2445,7 @@ gf_w32_composite_inverse(gf_t *gf, uint32_t a)
d = base_gf->multiply.w32(base_gf, a1, a0inv);
- tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ GF_S_GF_16_2);
+ tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ h->prim_poly);
tmp = base_gf->inverse.w32(base_gf, tmp);
d = base_gf->multiply.w32(base_gf, d, tmp);
@@ -2115,114 +2460,88 @@ gf_w32_composite_inverse(gf_t *gf, uint32_t a)
}
static
-uint32_t
-gf_w32_composite_divide(gf_t *gf, uint32_t a, uint32_t b)
-{
- uint32_t binv;
-
- binv = gf->inverse.w32(gf, b);
- return gf->multiply.w32(gf, a, binv);
-}
-
-/* JSP: I'm not using this because I don't think it has value added. */
-static
-void
-gf_w32_composite_multiply_region_inline(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
-{
- unsigned long uls, uld;
- gf_internal_t *h = (gf_internal_t *) gf->scratch;
- gf_t *base_gf = h->base_gf;
- int i=0;
- struct gf_w16_logtable_data * ltd;
- uint16_t b0 = val & 0x0000ffff;
- uint16_t b1 = (val & 0xffff0000) >> 16;
- uint32_t *s32 = (uint32_t *) src;
- uint32_t *d32 = (uint32_t *) dest;
- uint16_t a0, a1, a1b1;
- int num_syms = bytes >> 2;
- int sym_divisible = bytes % 4;
-
- uls = (unsigned long) src;
- uld = (unsigned long) dest;
- if ((uls & 0x7) != (uld & 0x7)) gf_alignment_error("gf_w32_buf_const_log", 2);
- if (sym_divisible) {
- gf_alignment_error("gf_w32_buf_const_log: buffer size not divisible by symbol size = 2 bytes", 2);
- }
-
- if (val == 0) {
- if (xor) return;
- bzero(dest, bytes);
- return;
- }
-
- ltd = (struct gf_w16_logtable_data *) h->private;
-
- if (xor) {
- for (i = 0;i < num_syms; i++) {
- a0 = s32[i] & 0x0000ffff;
- a1 = (s32[i] & 0xffff0000) >> 16;
- a1b1 = ltd->antilog_tbl[ltd->log_tbl[a1] + ltd->log_tbl[b1]];
-
- d32[i] ^= ((ltd->antilog_tbl[ltd->log_tbl[a0] + ltd->log_tbl[b0]] ^ a1b1) |
- ((ltd->antilog_tbl[ltd->log_tbl[a1] + ltd->log_tbl[b0]] ^ ltd->antilog_tbl[ltd->log_tbl[a0] + ltd->log_tbl[b1]] ^
- ltd->antilog_tbl[ltd->log_tbl[a1b1] + ltd->log_tbl[GF_S_GF_16_2]]) << 16));
-
- }
- } else {
- for (i = 0;i < num_syms; i++) {
- a0 = s32[i] & 0x0000ffff;
- a1 = (s32[i] & 0xffff0000) >> 16;
- a1b1 = ltd->antilog_tbl[ltd->log_tbl[a1] + ltd->log_tbl[b1]];
-
- d32[i] = ((ltd->antilog_tbl[ltd->log_tbl[a0] + ltd->log_tbl[b0]] ^ a1b1) |
- ((ltd->antilog_tbl[ltd->log_tbl[a1] + ltd->log_tbl[b0]] ^ ltd->antilog_tbl[ltd->log_tbl[a0] + ltd->log_tbl[b1]] ^
- ltd->antilog_tbl[ltd->log_tbl[a1b1] + ltd->log_tbl[GF_S_GF_16_2]]) << 16));
- }
- }
-}
-
-static
void
gf_w32_composite_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
{
gf_internal_t *h = (gf_internal_t *) gf->scratch;
gf_t *base_gf = h->base_gf;
- struct gf_w16_logtable_data * ltd;
- uint16_t b0 = val & 0x0000ffff;
- uint16_t b1 = (val & 0xffff0000) >> 16;
+ uint32_t b0 = val & 0x0000ffff;
+ uint32_t b1 = (val & 0xffff0000) >> 16;
uint32_t *s32, *d32, *top;
- uint16_t a0, a1, a1b1;
+ uint16_t a0, a1, a1b1, *log, *alog;
+ uint32_t prod;
gf_region_data rd;
+ struct gf_w32_composite_data *cd;
+
+ cd = (struct gf_w32_composite_data *) h->private;
+ log = cd->log;
+ alog = cd->alog;
if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
-
s32 = rd.s_start;
d32 = rd.d_start;
top = rd.d_top;
- if (xor) {
- while (d32 < top) {
- a0 = *s32 & 0x0000ffff;
- a1 = (*s32 & 0xffff0000) >> 16;
- a1b1 = base_gf->multiply.w32(base_gf, a1, b1);
-
- *d32 ^= ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
- ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, GF_S_GF_16_2)) << 16));
- s32++;
- d32++;
+ if (log == NULL) {
+ if (xor) {
+ while (d32 < top) {
+ a0 = *s32 & 0x0000ffff;
+ a1 = (*s32 & 0xffff0000) >> 16;
+ a1b1 = base_gf->multiply.w32(base_gf, a1, b1);
+
+ *d32 ^= ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
+ ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 16));
+ s32++;
+ d32++;
+ }
+ } else {
+ while (d32 < top) {
+ a0 = *s32 & 0x0000ffff;
+ a1 = (*s32 & 0xffff0000) >> 16;
+ a1b1 = base_gf->multiply.w32(base_gf, a1, b1);
+
+ *d32 = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
+ ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 16));
+ s32++;
+ d32++;
+ }
}
} else {
- while (d32 < top) {
- a0 = *s32 & 0x0000ffff;
- a1 = (*s32 & 0xffff0000) >> 16;
- a1b1 = base_gf->multiply.w32(base_gf, a1, b1);
-
- *d32 = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
- ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, GF_S_GF_16_2)) << 16));
- s32++;
- d32++;
+ if (xor) {
+ while (d32 < top) {
+ a0 = *s32 & 0x0000ffff;
+ a1 = (*s32 & 0xffff0000) >> 16;
+ a1b1 = GF_W16_INLINE_MULT(log, alog, a1, b1);
+
+ prod = GF_W16_INLINE_MULT(log, alog, a1, b0);
+ prod ^= GF_W16_INLINE_MULT(log, alog, a0, b1);
+ prod ^= GF_W16_INLINE_MULT(log, alog, a1b1, h->prim_poly);
+ prod <<= 16;
+ prod ^= GF_W16_INLINE_MULT(log, alog, a0, b0);
+ prod ^= a1b1;
+ *d32 ^= prod;
+ s32++;
+ d32++;
+ }
+ } else {
+ while (d32 < top) {
+ a0 = *s32 & 0x0000ffff;
+ a1 = (*s32 & 0xffff0000) >> 16;
+ a1b1 = GF_W16_INLINE_MULT(log, alog, a1, b1);
+
+ prod = GF_W16_INLINE_MULT(log, alog, a1, b0);
+ prod ^= GF_W16_INLINE_MULT(log, alog, a0, b1);
+ prod ^= GF_W16_INLINE_MULT(log, alog, a1b1, h->prim_poly);
+ prod <<= 16;
+ prod ^= GF_W16_INLINE_MULT(log, alog, a0, b0);
+ prod ^= a1b1;
+
+ *d32 = prod;
+ s32++;
+ d32++;
+ }
}
}
}
@@ -2259,7 +2578,7 @@ gf_w32_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, uint32_t v
base_gf->multiply_region.w32(base_gf, shigh, dlow, val1, sub_reg_size, 1);
base_gf->multiply_region.w32(base_gf, slow, dhigh, val1, sub_reg_size, xor);
base_gf->multiply_region.w32(base_gf, shigh, dhigh, val0, sub_reg_size, 1);
- base_gf->multiply_region.w32(base_gf, shigh, dhigh, base_gf->multiply.w32(base_gf, GF_S_GF_16_2, val1), sub_reg_size, 1);
+ base_gf->multiply_region.w32(base_gf, shigh, dhigh, base_gf->multiply.w32(base_gf, h->prim_poly, val1), sub_reg_size, 1);
gf_do_final_region_alignment(&rd);
}
@@ -2267,143 +2586,92 @@ gf_w32_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, uint32_t v
static
int gf_w32_composite_init(gf_t *gf)
{
- struct gf_w16_logtable_data *ltd;
gf_internal_t *h = (gf_internal_t *) gf->scratch;
- gf_t *base_gf = h->base_gf;
- gf_internal_t *base_h = (gf_internal_t *) base_gf->scratch;
- uint32_t a, b;
- uint64_t prim_poly = ((gf_internal_t *) base_gf->scratch)->prim_poly;
- int i;
+ struct gf_w32_composite_data *cd;
+
+ if (h->base_gf == NULL) return 0;
+
+ cd = (struct gf_w32_composite_data *) h->private;
+ cd->log = gf_w16_get_log_table(h->base_gf);
+ cd->alog = gf_w16_get_mult_alog_table(h->base_gf);
if (h->region_type & GF_REGION_ALTMAP) {
gf->multiply_region.w32 = gf_w32_composite_multiply_region_alt;
- } else if (h->arg2 == 0 && base_h->mult_type == GF_MULT_LOG_TABLE &&
- base_h->arg1 == 0) {
- gf->multiply_region.w32 = gf_w32_composite_multiply_region;
-/* It would be this, were that not buggy and I cared:
- gf->multiply_region.w32 = gf_w32_composite_multiply_region_inline; */
} else {
gf->multiply_region.w32 = gf_w32_composite_multiply_region;
}
- if (h->arg2 == 0) {
- ltd = (struct gf_w16_logtable_data *) h->private;
-
- ltd->log_tbl[0] = 0;
-
- bzero(&(ltd->_antilog_tbl[0]), sizeof(ltd->_antilog_tbl));
-
- ltd->antilog_tbl = &(ltd->_antilog_tbl[GF_BASE_FIELD_SIZE * 2]);
-
- b = 1;
- for (i = 0; i < GF_BASE_FIELD_GROUP_SIZE; i++) {
- ltd->log_tbl[b] = (uint16_t)i;
- ltd->antilog_tbl[i] = (uint16_t)b;
- ltd->antilog_tbl[i+GF_BASE_FIELD_GROUP_SIZE] = (uint16_t)b;
- b <<= 1;
- if (b & GF_BASE_FIELD_SIZE) {
- b = b ^ prim_poly;
- }
- }
- ltd->log_s = ltd->log_tbl[GF_S_GF_16_2];
- ltd->inv_tbl[0] = 0; /* Not really, but we need to fill it with something */
- ltd->inv_tbl[1] = 1;
- for (i = 2; i < GF_BASE_FIELD_SIZE; i++) {
- ltd->inv_tbl[i] = ltd->antilog_tbl[GF_BASE_FIELD_GROUP_SIZE-ltd->log_tbl[i]];
- }
- gf->multiply.w32 = gf_w32_composite_multiply_logtable;
- } else {
+ if (cd->log == NULL) {
gf->multiply.w32 = gf_w32_composite_multiply_recursive;
+ } else {
+ gf->multiply.w32 = gf_w32_composite_multiply_inline;
}
-
- gf->divide.w32 = gf_w32_composite_divide;
+ gf->divide.w32 = NULL;
gf->inverse.w32 = gf_w32_composite_inverse;
return 1;
}
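Leaving divide.w32 NULL here hands division to the generic fallback, which presumably does what the deleted gf_w32_composite_divide spelled out: invert, then multiply. A sketch of that path:

/* Division via inverse; equivalent to the removed gf_w32_composite_divide. */
uint32_t divide_via_inverse_sketch(gf_t *gf, uint32_t a, uint32_t b)
{
  uint32_t binv = gf->inverse.w32(gf, b);
  return gf->multiply.w32(gf, a, binv);
}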
+
+
int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
{
- int ss, sa;
+ int ss;
+ int issse3 = 0;
ss = (GF_REGION_SSE | GF_REGION_NOSSE);
- sa = (GF_REGION_STDMAP | GF_REGION_ALTMAP);
+
+#ifdef INTEL_SSSE3
+ issse3 = 1;
+#endif
switch(mult_type)
{
case GF_MULT_BYTWO_p:
case GF_MULT_BYTWO_b:
- if (arg1 != 0 || arg2 != 0) return -1;
- if (region_type != GF_REGION_CAUCHY) {
- if ((region_type | ss) != ss || (region_type & ss) == ss) return -1;
- }
- return sizeof(gf_internal_t) + sizeof(struct gf_w32_bytwo_data);
+ return sizeof(gf_internal_t) + sizeof(struct gf_w32_bytwo_data) + 64;
break;
- case GF_MULT_DEFAULT:
case GF_MULT_GROUP:
- if (mult_type == GF_MULT_DEFAULT) {
- arg1 = 3;
- arg2 = 8;
- }
- if (arg1 <= 0 || arg2 <= 0) return -1;
- if (region_type != GF_REGION_DEFAULT && region_type != GF_REGION_CAUCHY) return -1;
return sizeof(gf_internal_t) + sizeof(struct gf_w32_group_data) +
sizeof(uint32_t) * (1 << arg1) +
sizeof(uint32_t) * (1 << arg2) + 64;
break;
+ case GF_MULT_DEFAULT:
+
case GF_MULT_SPLIT_TABLE:
if (arg1 == 8 && arg2 == 8){
- if (region_type != GF_REGION_DEFAULT && region_type != GF_REGION_CAUCHY) return -1;
- return sizeof(gf_internal_t) + sizeof(struct gf_split_8_8_data) + 64;
+ return sizeof(gf_internal_t) + sizeof(struct gf_w32_split_8_8_data) + 64;
}
if ((arg1 == 16 && arg2 == 32) || (arg2 == 16 && arg1 == 32)) {
- region_type &= (~GF_REGION_LAZY);
- if (region_type != GF_REGION_DEFAULT) return -1;
return sizeof(gf_internal_t) + sizeof(struct gf_split_16_32_lazy_data) + 64;
}
- if ((arg1 == 8 && arg2 == 32) || (arg2 == 8 && arg1 == 32)) {
- region_type &= (~GF_REGION_LAZY);
- if (region_type != GF_REGION_DEFAULT) return -1;
- return sizeof(gf_internal_t) + sizeof(struct gf_split_8_32_lazy_data) + 64;
- }
if ((arg1 == 2 && arg2 == 32) || (arg2 == 2 && arg1 == 32)) {
- region_type &= (~GF_REGION_LAZY);
- if ((region_type & ss) == ss) return -1;
- if ((region_type | ss) != ss) return -1;
return sizeof(gf_internal_t) + sizeof(struct gf_split_2_32_lazy_data) + 64;
}
- if ((arg1 == 4 && arg2 == 32) || (arg2 == 4 && arg1 == 32)) {
- region_type &= (~GF_REGION_LAZY);
- if ((region_type & ss) == ss) return -1;
- if ((region_type & sa) == sa) return -1;
- if (region_type & (~(ss|sa))) return -1;
- if (region_type & GF_REGION_SSE) {
- return sizeof(gf_internal_t) + sizeof(struct gf_split_4_32_lazy_data) + 64;
- } else if (region_type & GF_REGION_ALTMAP) {
- return -1;
- } else {
- return sizeof(gf_internal_t) + sizeof(struct gf_split_4_32_lazy_data) + 64;
- }
+ if ((arg1 == 8 && arg2 == 32) || (arg2 == 8 && arg1 == 32) ||
+ (mult_type == GF_MULT_DEFAULT && !issse3)) {
+ return sizeof(gf_internal_t) + sizeof(struct gf_split_8_32_lazy_data) + 64;
+ }
+ if ((arg1 == 4 && arg2 == 32) ||
+ (arg2 == 4 && arg1 == 32) ||
+ mult_type == GF_MULT_DEFAULT) {
+ return sizeof(gf_internal_t) + sizeof(struct gf_split_4_32_lazy_data) + 64;
}
- return -1;
+ return 0;
+ case GF_MULT_CARRY_FREE:
+ return sizeof(gf_internal_t);
+ break;
case GF_MULT_SHIFT:
- if (arg1 != 0 || arg2 != 0) return -1;
- if (region_type != 0 && region_type != GF_REGION_CAUCHY) return -1;
return sizeof(gf_internal_t);
break;
case GF_MULT_COMPOSITE:
- if (region_type & ~(GF_REGION_ALTMAP | GF_REGION_STDMAP)) return -1;
- if (arg1 == 2 && arg2 == 0) {
- return sizeof(gf_internal_t) + sizeof(struct gf_w16_logtable_data) + 64;
- } else if (arg1 == 2 && arg2 == 1) {
- return sizeof(gf_internal_t) + 64;
- } else {
- return -1;
- }
+ return sizeof(gf_internal_t) + sizeof(struct gf_w32_composite_data) + 64;
+ break;
default:
- return -1;
+ return 0;
}
+ return 0;
}
int gf_w32_init(gf_t *gf)
@@ -2411,22 +2679,43 @@ int gf_w32_init(gf_t *gf)
gf_internal_t *h;
h = (gf_internal_t *) gf->scratch;
- if (h->prim_poly == 0) h->prim_poly = 0x400007;
+
+ /* Allen: set default primitive polynomial / irreducible polynomial if needed */
+
+ if (h->prim_poly == 0) {
+ if (h->mult_type == GF_MULT_COMPOSITE) {
+ h->prim_poly = gf_composite_get_default_poly(h->base_gf);
+ if (h->prim_poly == 0) return 0; /* This shouldn't happen */
+ } else {
+      /* Allen: the commented-out polynomial below would make carryless multiply more efficient for GF(2^32): */
+
+ /* h->prim_poly = 0xc5; */
+
+ /* Allen: The following is the traditional primitive polynomial for GF(2^32) */
+
+ h->prim_poly = 0x400007;
+ }
+ }
+
+  /* Ensure no leading one is stored: w=32 keeps only the low 32 bits of
+     the polynomial (the x^32 term is implicit). */
+
+  if (h->mult_type != GF_MULT_COMPOSITE) h->prim_poly &= 0xffffffff;
+
gf->multiply.w32 = NULL;
gf->divide.w32 = NULL;
gf->inverse.w32 = NULL;
gf->multiply_region.w32 = NULL;
switch(h->mult_type) {
+ case GF_MULT_CARRY_FREE: if (gf_w32_cfm_init(gf) == 0) return 0; break;
case GF_MULT_SHIFT: if (gf_w32_shift_init(gf) == 0) return 0; break;
case GF_MULT_COMPOSITE: if (gf_w32_composite_init(gf) == 0) return 0; break;
- case GF_MULT_SPLIT_TABLE: if (gf_w32_split_init(gf) == 0) return 0; break;
case GF_MULT_DEFAULT:
+ case GF_MULT_SPLIT_TABLE: if (gf_w32_split_init(gf) == 0) return 0; break;
case GF_MULT_GROUP: if (gf_w32_group_init(gf) == 0) return 0; break;
case GF_MULT_BYTWO_p:
case GF_MULT_BYTWO_b: if (gf_w32_bytwo_init(gf) == 0) return 0; break;
-
default: return 0;
}
if (h->divide_type == GF_DIVIDE_EUCLID) {
diff --git a/gf_w4.c b/gf_w4.c
index 1175e01..50f00da 100644
--- a/gf_w4.c
+++ b/gf_w4.c
@@ -100,7 +100,6 @@ gf_val_32_t gf_w4_euclid (gf_t *gf, gf_val_32_t b)
y_im1 = 0;
while (e_i != 1) {
-
e_ip1 = e_im1;
d_ip1 = d_im1;
c_i = 0;
@@ -108,6 +107,7 @@ gf_val_32_t gf_w4_euclid (gf_t *gf, gf_val_32_t b)
while (d_ip1 >= d_i) {
c_i ^= (1 << (d_ip1 - d_i));
e_ip1 ^= (e_i << (d_ip1 - d_i));
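+      /* If the remainder hits zero, b has no inverse (it shares a factor
+         with the field polynomial); return 0 rather than let the bit
+         scan below spin forever. */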
+ if (e_ip1 == 0) return 0;
while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--;
}
@@ -146,6 +146,110 @@ gf_val_32_t gf_w4_matrix (gf_t *gf, gf_val_32_t b)
return gf_bitmatrix_inverse(b, 4, ((gf_internal_t *) (gf->scratch))->prim_poly);
}
+
+static
+inline
+gf_val_32_t
+gf_w4_shift_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+ uint8_t product, i, pp;
+ gf_internal_t *h;
+
+ h = (gf_internal_t *) gf->scratch;
+ pp = h->prim_poly;
+
+ product = 0;
+
+ for (i = 0; i < GF_FIELD_WIDTH; i++) {
+ if (a & (1 << i)) product ^= (b << i);
+ }
+ for (i = (GF_FIELD_WIDTH*2-2); i >= GF_FIELD_WIDTH; i--) {
+ if (product & (1 << i)) product ^= (pp << (i-GF_FIELD_WIDTH));
+ }
+ return product;
+}
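To make the reduction loop concrete: with prim_poly 0x13 (x^4 + x + 1), 0x7 * 0xA expands carrylessly to 0x36, then bit 5 folds (0x36 ^ (0x13 << 1) = 0x10) and bit 4 folds (0x10 ^ 0x13 = 0x03). The same steps as a standalone sketch:

#include <stdint.h>

static uint8_t w4_shift_mult_example(uint8_t a, uint8_t b, uint8_t pp)
{
  uint8_t product = 0;
  int i;

  for (i = 0; i < 4; i++)          /* carryless multiply: 0x7 * 0xA -> 0x36 */
    if (a & (1 << i)) product ^= (b << i);
  for (i = 6; i >= 4; i--)         /* reduce bits 6..4: 0x36 -> 0x10 -> 0x03 */
    if (product & (1 << i)) product ^= (pp << (i - 4));
  return product;                  /* w4_shift_mult_example(0x7, 0xA, 0x13) == 0x3 */
}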
+
+/* Ben: This function works, but it is 33% slower than the normal shift mult */
+
+static
+inline
+gf_val_32_t
+gf_w4_clm_multiply (gf_t *gf, gf_val_32_t a4, gf_val_32_t b4)
+{
+ gf_val_32_t rv = 0;
+
+#ifdef INTEL_SSE4_PCLMUL
+
+ __m128i a, b;
+ __m128i result;
+ __m128i prim_poly;
+ __m128i w;
+ gf_internal_t * h = gf->scratch;
+
+ a = _mm_insert_epi32 (_mm_setzero_si128(), a4, 0);
+ b = _mm_insert_epi32 (a, b4, 0);
+
+ prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1fULL));
+
+ /* Do the initial multiply */
+
+ result = _mm_clmulepi64_si128 (a, b, 0);
+
+  /* Ben/JSP: Do the prim_poly reduction once. We are guaranteed that
+     one reduction suffices, because (w-2)/z == 1, where z is the
+     number of zeros after the leading 1 of the polynomial.
+
+     _mm_clmulepi64_si128 is the carryless multiply operation. Here
+     _mm_srli_epi64 shifts the result to the right by 4 bits, which
+     lets us multiply the prim_poly by the leading bits of the result.
+     We then xor that product back into the result. */
+
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_epi64 (result, 4), 0);
+ result = _mm_xor_si128 (result, w);
+
+ /* Extracts 32 bit value from result. */
+
+ rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+#endif
+ return rv;
+}
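The single-fold claim can be checked without SSE: the 4x4 carryless product occupies at most bits 0..6, and XORing in clmul(prim_poly, product >> 4) cancels bits 6..4 in one step, because the polynomial's low bits (two for 0x13) times a 3-bit value never reach bit 4 again. A scalar model, assuming the 5-bit polynomial carries its leading one:

#include <stdint.h>

static uint16_t clmul8(uint8_t x, uint8_t y)  /* bitwise carryless multiply */
{
  uint16_t r = 0;
  int i;
  for (i = 0; i < 8; i++)
    if (y & (1 << i)) r ^= (uint16_t)x << i;
  return r;
}

static uint8_t w4_clm_mult_sketch(uint8_t a, uint8_t b, uint8_t pp5)
{
  uint16_t r = clmul8(a, b);                /* product sits in bits 0..6 */
  r ^= clmul8(pp5, (uint8_t)(r >> 4));      /* one fold clears bits 6..4 */
  return (uint8_t)(r & 0xf);
}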
+
+static
+void
+gf_w4_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int
+ xor)
+{
+ gf_region_data rd;
+ uint8_t *s8;
+ uint8_t *d8;
+
+ if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+ if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+ gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1);
+ gf_do_initial_region_alignment(&rd);
+
+ s8 = (uint8_t *) rd.s_start;
+ d8 = (uint8_t *) rd.d_start;
+
+ if (xor) {
+ while (d8 < ((uint8_t *) rd.d_top)) {
+ *d8 ^= (gf->multiply.w32(gf, val, (*s8 & 0xf)) |
+ ((gf->multiply.w32(gf, val, (*s8 >> 4))) << 4));
+ d8++;
+ s8++;
+ }
+ } else {
+ while (d8 < ((uint8_t *) rd.d_top)) {
+ *d8 = (gf->multiply.w32(gf, val, (*s8 & 0xf)) |
+ ((gf->multiply.w32(gf, val, (*s8 >> 4))) << 4));
+ d8++;
+ s8++;
+ }
+ }
+ gf_do_final_region_alignment(&rd);
+}
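Because w=4 packs two symbols per byte, the loop above multiplies each nibble independently and repacks them: for val = 0x2 and source byte 0xAB, the destination byte becomes (mult(0x2, 0xA) << 4) | mult(0x2, 0xB). As a one-byte sketch:

static uint8_t w4_mult_byte_sketch(gf_t *gf, uint8_t val, uint8_t s)
{
  /* low and high nibbles are independent GF(16) symbols */
  return gf->multiply.w32(gf, val, s & 0xf) |
         (gf->multiply.w32(gf, val, s >> 4) << 4);
}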
+
/* ------------------------------------------------------------
IMPLEMENTATION: LOG_TABLE:
@@ -220,18 +324,28 @@ int gf_w4_log_init(gf_t *gf)
h = (gf_internal_t *) gf->scratch;
ltd = h->private;
- ltd->log_tbl[0] = 0;
+ for (i = 0; i < GF_FIELD_SIZE; i++)
+ ltd->log_tbl[i]=0;
ltd->antilog_tbl_div = ltd->antilog_tbl + (GF_FIELD_SIZE-1);
b = 1;
- for (i = 0; i < GF_FIELD_SIZE-1; i++) {
- ltd->log_tbl[b] = i;
- ltd->antilog_tbl[i] = b;
- ltd->antilog_tbl[i+GF_FIELD_SIZE-1] = b;
- b <<= 1;
- if (b & GF_FIELD_SIZE) {
- b = b ^ h->prim_poly;
- }
+ i = 0;
+ do {
+ if (ltd->log_tbl[b] != 0 && i != 0) {
+ fprintf(stderr, "Cannot construct log table: Polynomial is not primitive.\n\n");
+ return 0;
+ }
+ ltd->log_tbl[b] = i;
+ ltd->antilog_tbl[i] = b;
+ ltd->antilog_tbl[i+GF_FIELD_SIZE-1] = b;
+ b <<= 1;
+ i++;
+ if (b & GF_FIELD_SIZE) b = b ^ h->prim_poly;
+ } while (b != 1);
+
+ if (i != GF_FIELD_SIZE - 1) {
+ _gf_errno = GF_E_LOGPOLY;
+ return 0;
}
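+  /* The do/while above doubles as a primitivity test: repeated
+     multiplication by x must visit all 15 non-zero elements of GF(16)
+     before b cycles back to 1. A shorter cycle means the polynomial
+     is not primitive, and the log table would be left incomplete. */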
gf->inverse.w32 = gf_w4_inverse_from_divide;
@@ -300,7 +414,7 @@ static
void
gf_w4_single_table_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
-#ifdef INTEL_SSE4
+#ifdef INTEL_SSSE3
gf_region_data rd;
uint8_t *base, *sptr, *dptr, *top;
__m128i tl, loset, h4, r, va, th;
@@ -351,37 +465,17 @@ int gf_w4_single_table_init(gf_t *gf)
gf_internal_t *h;
struct gf_single_table_data *std;
int a, b, prod, loga, logb;
- uint8_t log_tbl[GF_FIELD_SIZE];
- uint8_t antilog_tbl[GF_FIELD_SIZE*2];
- int sse;
- sse = 0;
-#ifdef INTEL_SSE4
- sse = 1;
-#endif
h = (gf_internal_t *) gf->scratch;
std = (struct gf_single_table_data *)h->private;
- b = 1;
- for (a = 0; a < GF_MULT_GROUP_SIZE; a++) {
- log_tbl[b] = a;
- antilog_tbl[a] = b;
- antilog_tbl[a+GF_MULT_GROUP_SIZE] = b;
- b <<= 1;
- if (b & GF_FIELD_SIZE) {
- b = b ^ h->prim_poly;
- }
- }
-
bzero(std->mult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
bzero(std->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
for (a = 1; a < GF_FIELD_SIZE; a++) {
- loga = log_tbl[a];
for (b = 1; b < GF_FIELD_SIZE; b++) {
- logb = log_tbl[b];
- prod = antilog_tbl[loga+logb];
+ prod = gf_w4_shift_multiply(gf, a, b);
std->mult[a][b] = prod;
std->div[prod][b] = a;
}
@@ -390,11 +484,16 @@ int gf_w4_single_table_init(gf_t *gf)
gf->inverse.w32 = NULL;
gf->divide.w32 = gf_w4_single_table_divide;
gf->multiply.w32 = gf_w4_single_table_multiply;
- if ((h->region_type & GF_REGION_SSE) || (h->mult_type == GF_MULT_DEFAULT && sse)) {
- gf->multiply_region.w32 = gf_w4_single_table_sse_multiply_region;
- } else {
+ #ifdef INTEL_SSSE3
+ if(h->region_type & (GF_REGION_NOSSE | GF_REGION_CAUCHY))
+ gf->multiply_region.w32 = gf_w4_single_table_multiply_region;
+ else
+ gf->multiply_region.w32 = gf_w4_single_table_sse_multiply_region;
+ #else
gf->multiply_region.w32 = gf_w4_single_table_multiply_region;
- }
+ if (h->region_type & GF_REGION_SSE) return 0;
+ #endif
+
return 1;
}
@@ -458,32 +557,17 @@ int gf_w4_double_table_init(gf_t *gf)
gf_internal_t *h;
struct gf_double_table_data *std;
int a, b, c, prod, loga, logb, ab;
- uint8_t log_tbl[GF_FIELD_SIZE];
- uint8_t antilog_tbl[GF_FIELD_SIZE*2];
uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE];
h = (gf_internal_t *) gf->scratch;
std = (struct gf_double_table_data *)h->private;
- b = 1;
- for (a = 0; a < GF_MULT_GROUP_SIZE; a++) {
- log_tbl[b] = a;
- antilog_tbl[a] = b;
- antilog_tbl[a+GF_MULT_GROUP_SIZE] = b;
- b <<= 1;
- if (b & GF_FIELD_SIZE) {
- b = b ^ h->prim_poly;
- }
- }
-
bzero(mult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
bzero(std->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
for (a = 1; a < GF_FIELD_SIZE; a++) {
- loga = log_tbl[a];
for (b = 1; b < GF_FIELD_SIZE; b++) {
- logb = log_tbl[b];
- prod = antilog_tbl[loga+logb];
+ prod = gf_w4_shift_multiply(gf, a, b);
mult[a][b] = prod;
std->div[prod][b] = a;
}
@@ -600,32 +684,17 @@ int gf_w4_quad_table_init(gf_t *gf)
gf_internal_t *h;
struct gf_quad_table_data *std;
int prod, loga, logb, ab, val, a, b, c, d, va, vb, vc, vd;
- uint8_t log_tbl[GF_FIELD_SIZE];
- uint8_t antilog_tbl[GF_FIELD_SIZE*2];
uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE];
h = (gf_internal_t *) gf->scratch;
std = (struct gf_quad_table_data *)h->private;
- b = 1;
- for (a = 0; a < GF_MULT_GROUP_SIZE; a++) {
- log_tbl[b] = a;
- antilog_tbl[a] = b;
- antilog_tbl[a+GF_MULT_GROUP_SIZE] = b;
- b <<= 1;
- if (b & GF_FIELD_SIZE) {
- b = b ^ h->prim_poly;
- }
- }
-
bzero(mult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
bzero(std->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
for (a = 1; a < GF_FIELD_SIZE; a++) {
- loga = log_tbl[a];
for (b = 1; b < GF_FIELD_SIZE; b++) {
- logb = log_tbl[b];
- prod = antilog_tbl[loga+logb];
+ prod = gf_w4_shift_multiply(gf, a, b);
mult[a][b] = prod;
std->div[prod][b] = a;
}
@@ -702,13 +771,18 @@ int gf_w4_table_init(gf_t *gf)
{
int rt;
gf_internal_t *h;
+ int issse3 = 0;
+
+#ifdef INTEL_SSSE3
+ issse3 = 1;
+#endif
h = (gf_internal_t *) gf->scratch;
rt = (h->region_type);
- if (rt == 0 || rt == GF_REGION_CAUCHY) rt |= GF_REGION_SINGLE_TABLE;
- if (rt & GF_REGION_SINGLE_TABLE) {
- return gf_w4_single_table_init(gf);
- } else if (rt & GF_REGION_DOUBLE_TABLE) {
+
+ if (h->mult_type == GF_MULT_DEFAULT && !issse3) rt |= GF_REGION_DOUBLE_TABLE;
+
+ if (rt & GF_REGION_DOUBLE_TABLE) {
return gf_w4_double_table_init(gf);
} else if (rt & GF_REGION_QUAD_TABLE) {
if (rt & GF_REGION_LAZY) {
@@ -717,7 +791,9 @@ int gf_w4_table_init(gf_t *gf)
return gf_w4_quad_table_init(gf);
}
return gf_w4_double_table_init(gf);
- }
+ } else {
+ return gf_w4_single_table_init(gf);
+ }
return 0;
}
@@ -842,7 +918,7 @@ static
void
gf_w4_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
-#ifdef INTEL_SSE4
+#ifdef INTEL_SSE2
int i;
uint8_t *s8, *d8;
uint8_t vrev;
@@ -895,7 +971,7 @@ static
void
gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
-#ifdef INTEL_SSE4
+#ifdef INTEL_SSE2
uint8_t *d8, *s8, tb;
__m128i pp, m1, m2, t1, t2, va, vb;
struct gf_bytwo_data *btd;
@@ -960,7 +1036,7 @@ static
void
gf_w4_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_bytwo_data *btd)
{
-#ifdef INTEL_SSE4
+#ifdef INTEL_SSE2
int i;
uint8_t *d8, *s8, tb;
__m128i pp, m1, m2, t1, t2, va, vb;
@@ -986,7 +1062,7 @@ static
void
gf_w4_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_bytwo_data *btd)
{
-#ifdef INTEL_SSE4
+#ifdef INTEL_SSE2
int i;
uint8_t *d8, *s8, tb;
__m128i pp, m1, m2, t1, t2, va, vb;
@@ -1014,7 +1090,7 @@ static
void
gf_w4_bytwo_b_sse_region_4_noxor(gf_region_data *rd, struct gf_bytwo_data *btd)
{
-#ifdef INTEL_SSE4
+#ifdef INTEL_SSE2
int i;
uint8_t *d8, *s8, tb;
__m128i pp, m1, m2, t1, t2, va, vb;
@@ -1041,7 +1117,7 @@ static
void
gf_w4_bytwo_b_sse_region_4_xor(gf_region_data *rd, struct gf_bytwo_data *btd)
{
-#ifdef INTEL_SSE4
+#ifdef INTEL_SSE2
int i;
uint8_t *d8, *s8, tb;
__m128i pp, m1, m2, t1, t2, va, vb;
@@ -1071,7 +1147,7 @@ static
void
gf_w4_bytwo_b_sse_region_3_noxor(gf_region_data *rd, struct gf_bytwo_data *btd)
{
-#ifdef INTEL_SSE4
+#ifdef INTEL_SSE2
int i;
uint8_t *d8, *s8, tb;
__m128i pp, m1, m2, t1, t2, va, vb;
@@ -1099,7 +1175,7 @@ static
void
gf_w4_bytwo_b_sse_region_3_xor(gf_region_data *rd, struct gf_bytwo_data *btd)
{
-#ifdef INTEL_SSE4
+#ifdef INTEL_SSE2
int i;
uint8_t *d8, *s8, tb;
__m128i pp, m1, m2, t1, t2, va, vb;
@@ -1127,7 +1203,7 @@ static
void
gf_w4_bytwo_b_sse_region_5_noxor(gf_region_data *rd, struct gf_bytwo_data *btd)
{
-#ifdef INTEL_SSE4
+#ifdef INTEL_SSE2
int i;
uint8_t *d8, *s8, tb;
__m128i pp, m1, m2, t1, t2, va, vb;
@@ -1156,7 +1232,7 @@ static
void
gf_w4_bytwo_b_sse_region_5_xor(gf_region_data *rd, struct gf_bytwo_data *btd)
{
-#ifdef INTEL_SSE4
+#ifdef INTEL_SSE2
int i;
uint8_t *d8, *s8, tb;
__m128i pp, m1, m2, t1, t2, va, vb;
@@ -1185,7 +1261,7 @@ static
void
gf_w4_bytwo_b_sse_region_7_noxor(gf_region_data *rd, struct gf_bytwo_data *btd)
{
-#ifdef INTEL_SSE4
+#ifdef INTEL_SSE2
int i;
uint8_t *d8, *s8, tb;
__m128i pp, m1, m2, t1, t2, va, vb;
@@ -1215,7 +1291,7 @@ static
void
gf_w4_bytwo_b_sse_region_7_xor(gf_region_data *rd, struct gf_bytwo_data *btd)
{
-#ifdef INTEL_SSE4
+#ifdef INTEL_SSE2
int i;
uint8_t *d8, *s8, tb;
__m128i pp, m1, m2, t1, t2, va, vb;
@@ -1245,7 +1321,7 @@ static
void
gf_w4_bytwo_b_sse_region_6_noxor(gf_region_data *rd, struct gf_bytwo_data *btd)
{
-#ifdef INTEL_SSE4
+#ifdef INTEL_SSE2
int i;
uint8_t *d8, *s8, tb;
__m128i pp, m1, m2, t1, t2, va, vb;
@@ -1274,7 +1350,7 @@ static
void
gf_w4_bytwo_b_sse_region_6_xor(gf_region_data *rd, struct gf_bytwo_data *btd)
{
-#ifdef INTEL_SSE4
+#ifdef INTEL_SSE2
int i;
uint8_t *d8, *s8, tb;
__m128i pp, m1, m2, t1, t2, va, vb;
@@ -1303,7 +1379,7 @@ static
void
gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
-#ifdef INTEL_SSE4
+#ifdef INTEL_SSE2
uint8_t *d8, *s8, tb;
__m128i pp, m1, m2, t1, t2, va, vb;
struct gf_bytwo_data *btd;
@@ -1853,114 +1929,107 @@ int gf_w4_bytwo_init(gf_t *gf)
if (h->mult_type == GF_MULT_BYTWO_p) {
gf->multiply.w32 = gf_w4_bytwo_p_multiply;
- if (h->region_type == GF_REGION_SSE) {
- gf->multiply_region.w32 = gf_w4_bytwo_p_sse_multiply_region;
- } else {
+ #ifdef INTEL_SSE2
+ if (h->region_type & GF_REGION_NOSSE)
+ gf->multiply_region.w32 = gf_w4_bytwo_p_nosse_multiply_region;
+ else
+ gf->multiply_region.w32 = gf_w4_bytwo_p_sse_multiply_region;
+ #else
gf->multiply_region.w32 = gf_w4_bytwo_p_nosse_multiply_region;
- }
+ if (h->region_type & GF_REGION_SSE)
+ return 0;
+ #endif
} else {
gf->multiply.w32 = gf_w4_bytwo_b_multiply;
- if (h->region_type == GF_REGION_SSE) {
- gf->multiply_region.w32 = gf_w4_bytwo_b_sse_multiply_region;
- } else {
+ #ifdef INTEL_SSE2
+ if (h->region_type & GF_REGION_NOSSE)
+ gf->multiply_region.w32 = gf_w4_bytwo_b_nosse_multiply_region;
+ else
+ gf->multiply_region.w32 = gf_w4_bytwo_b_sse_multiply_region;
+ #else
gf->multiply_region.w32 = gf_w4_bytwo_b_nosse_multiply_region;
- }
+ if (h->region_type & GF_REGION_SSE)
+ return 0;
+ #endif
}
- gf->inverse.w32 = gf_w4_euclid;
return 1;
}
-/* ------------------------------------------------------------
- JSP: GF_MULT_SHIFT: The world's dumbest multiplication algorithm. I only
- include it for completeness. It does have the feature that it requires no
- extra memory.
-*/
-
-static
-inline
-gf_val_32_t
-gf_w4_shift_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+static
+int gf_w4_cfm_init(gf_t *gf)
{
- uint8_t product, i, pp;
gf_internal_t *h;
-
- h = (gf_internal_t *) gf->scratch;
- pp = h->prim_poly;
- product = 0;
+ h = (gf_internal_t *) gf->scratch;
- for (i = 0; i < GF_FIELD_WIDTH; i++) {
- if (a & (1 << i)) product ^= (b << i);
- }
- for (i = (GF_FIELD_WIDTH*2-1); i >= GF_FIELD_WIDTH; i--) {
- if (product & (1 << i)) product ^= (pp << (i-GF_FIELD_WIDTH));
- }
- return product;
+#ifdef INTEL_SSE4_PCLMUL
+ gf->multiply.w32 = gf_w4_clm_multiply;
+ return 1;
+#endif
+ return 0;
}
static
int gf_w4_shift_init(gf_t *gf)
{
gf->multiply.w32 = gf_w4_shift_multiply;
- gf->inverse.w32 = gf_w4_euclid;
return 1;
}
+/* JSP: I'm putting all error-checking into gf_error_check(), so you don't
+ have to do error checking in scratch_size or in init */
+
int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
{
int region_tbl_size;
- int sss;
int ss;
+ int issse3 = 0;
- sss = (GF_REGION_SINGLE_TABLE | GF_REGION_SSE | GF_REGION_NOSSE);
- ss = (GF_REGION_SSE | GF_REGION_NOSSE);
+#ifdef INTEL_SSSE3
+ issse3 = 1;
+#endif
switch(mult_type)
{
case GF_MULT_BYTWO_p:
case GF_MULT_BYTWO_b:
- if (arg1 != 0 || arg2 != 0) return -1;
- if (region_type != GF_REGION_CAUCHY) {
- if ((region_type | ss) != ss || (region_type & ss) == ss) return -1;
- }
return sizeof(gf_internal_t) + sizeof(struct gf_bytwo_data);
break;
case GF_MULT_DEFAULT:
case GF_MULT_TABLE:
- if (arg1 != 0 || arg2 != 0) return -1;
- if (region_type == GF_REGION_CAUCHY || region_type == (GF_REGION_CAUCHY | GF_REGION_SINGLE_TABLE)) {
+ if (region_type == GF_REGION_CAUCHY) {
return sizeof(gf_internal_t) + sizeof(struct gf_single_table_data) + 64;
}
- if (mult_type == GF_MULT_DEFAULT || region_type == 0) region_type = GF_REGION_SINGLE_TABLE;
- if (region_type & GF_REGION_SINGLE_TABLE) {
- if ((region_type | sss) != sss) return -1;
- if ((region_type & sss) == sss) return -1;
- return sizeof(gf_internal_t) + sizeof(struct gf_single_table_data) + 64;
- } else if (region_type & GF_REGION_DOUBLE_TABLE) {
- if (region_type != GF_REGION_DOUBLE_TABLE) return -1;
+
+ if (mult_type == GF_MULT_DEFAULT && !issse3) region_type = GF_REGION_DOUBLE_TABLE;
+
+ if (region_type & GF_REGION_DOUBLE_TABLE) {
return sizeof(gf_internal_t) + sizeof(struct gf_double_table_data) + 64;
} else if (region_type & GF_REGION_QUAD_TABLE) {
- if ((region_type | GF_REGION_LAZY) != (GF_REGION_QUAD_TABLE | GF_REGION_LAZY)) return -1;
if ((region_type & GF_REGION_LAZY) == 0) {
return sizeof(gf_internal_t) + sizeof(struct gf_quad_table_data) + 64;
} else {
return sizeof(gf_internal_t) + sizeof(struct gf_quad_table_lazy_data) + 64;
}
+ } else {
+ return sizeof(gf_internal_t) + sizeof(struct gf_single_table_data) + 64;
}
- return -1;
break;
+
case GF_MULT_LOG_TABLE:
- if (arg1 != 0 || arg2 != 0 || (region_type != 0 && region_type != GF_REGION_CAUCHY)) return -1;
return sizeof(gf_internal_t) + sizeof(struct gf_logtable_data) + 64;
break;
+ case GF_MULT_CARRY_FREE:
+ return sizeof(gf_internal_t);
+ break;
case GF_MULT_SHIFT:
- if (arg1 != 0 || arg2 != 0 || (region_type != 0 && region_type != GF_REGION_CAUCHY)) return -1;
return sizeof(gf_internal_t);
break;
default:
- return -1;
+ return 0;
}
+ return 0;
}
int
@@ -1970,7 +2039,7 @@ gf_w4_init (gf_t *gf)
h = (gf_internal_t *) gf->scratch;
if (h->prim_poly == 0) h->prim_poly = 0x13;
-
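+  /* Force the leading (x^4) term: the table builders reduce with
+     "b ^= prim_poly", which relies on that bit being present. */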
+ h->prim_poly |= 0x10;
gf->multiply.w32 = NULL;
gf->divide.w32 = NULL;
gf->inverse.w32 = NULL;
@@ -1978,13 +2047,13 @@ gf_w4_init (gf_t *gf)
gf->extract_word.w32 = gf_w4_extract_word;
switch(h->mult_type) {
- case GF_MULT_SHIFT: if (gf_w4_shift_init(gf) == 0) return 0; break;
+ case GF_MULT_CARRY_FREE: if (gf_w4_cfm_init(gf) == 0) return 0; break;
+ case GF_MULT_SHIFT: if (gf_w4_shift_init(gf) == 0) return 0; break;
case GF_MULT_BYTWO_p:
- case GF_MULT_BYTWO_b:
- if (gf_w4_bytwo_init(gf) == 0) return 0; break;
- case GF_MULT_LOG_TABLE: if (gf_w4_log_init(gf) == 0) return 0; break;
+ case GF_MULT_BYTWO_b: if (gf_w4_bytwo_init(gf) == 0) return 0; break;
+ case GF_MULT_LOG_TABLE: if (gf_w4_log_init(gf) == 0) return 0; break;
case GF_MULT_DEFAULT:
- case GF_MULT_TABLE: if (gf_w4_table_init(gf) == 0) return 0; break;
+ case GF_MULT_TABLE: if (gf_w4_table_init(gf) == 0) return 0; break;
default: return 0;
}
@@ -1996,17 +2065,22 @@ gf_w4_init (gf_t *gf)
gf->inverse.w32 = gf_w4_matrix;
}
- if (gf->inverse.w32 != NULL && gf->divide.w32 == NULL) {
+ if (gf->divide.w32 == NULL) {
gf->divide.w32 = gf_w4_divide_from_inverse;
+ if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w4_euclid;
}
- if (gf->inverse.w32 == NULL && gf->divide.w32 != NULL) {
- gf->inverse.w32 = gf_w4_inverse_from_divide;
- }
+
+ if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w4_inverse_from_divide;
if (h->region_type == GF_REGION_CAUCHY) {
gf->multiply_region.w32 = gf_wgen_cauchy_region;
gf->extract_word.w32 = gf_wgen_extract_word;
}
+
+ if (gf->multiply_region.w32 == NULL) {
+ gf->multiply_region.w32 = gf_w4_multiply_region_from_single;
+ }
+
return 1;
}
diff --git a/gf_w64.c b/gf_w64.c
index 95100f4..12ec5af 100644
--- a/gf_w64.c
+++ b/gf_w64.c
@@ -9,18 +9,12 @@
#include <stdlib.h>
#define GF_FIELD_WIDTH (64)
-#define GF_FIRST_BIT (1L << 63)
+#define GF_FIRST_BIT (1ULL << 63)
#define GF_BASE_FIELD_WIDTH (32)
-#define GF_BASE_FIELD_SIZE (1L << GF_BASE_FIELD_WIDTH)
+#define GF_BASE_FIELD_SIZE (1ULL << GF_BASE_FIELD_WIDTH)
#define GF_BASE_FIELD_GROUP_SIZE GF_BASE_FIELD_SIZE-1
-// 10000587 is a valid s for 2^16^2
-#define GF_S_GF_16_2_2 (1000587)
-
-// 1000012 is a valid s for 2^32
-#define GF_S_GF_32_2 (1000012)
-
struct gf_w64_group_data {
uint64_t *reduce;
uint64_t *shift;
@@ -46,10 +40,6 @@ struct gf_split_8_8_data {
uint64_t tables[15][256][256];
};
-typedef struct w64_composite_int_s {
- uint64_t s; // 's' will be different depending on the base field
-} w64_composite_int_t;
-
static
inline
gf_val_64_t gf_w64_inverse_from_divide (gf_t *gf, gf_val_64_t a)
@@ -79,6 +69,9 @@ xor)
s64 = (gf_val_64_t *) src;
d64 = (gf_val_64_t *) dest;
+ if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+ if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
if (xor) {
for (i = 0; i < bytes/sizeof(gf_val_64_t); i++) {
d64[i] ^= gf->multiply.w64(gf, val, s64[i]);
@@ -91,7 +84,186 @@ xor)
}
static
-inline
+void
+gf_w64_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int
+xor)
+{
+ gf_val_64_t *s64, *d64, *top;
+ gf_region_data rd;
+
+#ifdef INTEL_SSE4_PCLMUL
+ __m128i a, b;
+ __m128i result, r1;
+ __m128i prim_poly;
+ __m128i v, w;
+ __m128i m1, m2, m3, m4;
+ gf_internal_t * h = gf->scratch;
+
+ if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+ if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+ gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+ gf_do_initial_region_alignment(&rd);
+
+ prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL));
+ b = _mm_insert_epi64 (_mm_setzero_si128(), val, 0);
+ m1 = _mm_set_epi32(0, 0, 0, (uint32_t)0xffffffff);
+ m2 = _mm_slli_si128(m1, 4);
+ m2 = _mm_or_si128(m1, m2);
+ m3 = _mm_slli_si128(m1, 8);
+ m4 = _mm_slli_si128(m3, 4);
+
+ s64 = (gf_val_64_t *) rd.s_start;
+ d64 = (gf_val_64_t *) rd.d_start;
+ top = (gf_val_64_t *) rd.d_top;
+
+ if (xor) {
+ while (d64 != top) {
+ a = _mm_load_si128((__m128i *) s64);
+ result = _mm_clmulepi64_si128 (a, b, 1);
+
+ w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
+ r1 = _mm_xor_si128 (result, w);
+
+ result = _mm_clmulepi64_si128 (a, b, 0);
+
+ w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
+ result = _mm_xor_si128 (result, w);
+
+ w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
+ result = _mm_xor_si128 (result, w);
+
+ result = _mm_unpacklo_epi64(result, r1);
+
+ r1 = _mm_load_si128((__m128i *) d64);
+ result = _mm_xor_si128(r1, result);
+ _mm_store_si128((__m128i *) d64, result);
+ d64 += 2;
+ s64 += 2;
+ }
+ } else {
+ while (d64 != top) {
+
+ a = _mm_load_si128((__m128i *) s64);
+ result = _mm_clmulepi64_si128 (a, b, 1);
+
+ w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
+ r1 = _mm_xor_si128 (result, w);
+
+ result = _mm_clmulepi64_si128 (a, b, 0);
+
+ w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
+ result = _mm_xor_si128 (result, w);
+
+ result = _mm_unpacklo_epi64(result, r1);
+
+ _mm_store_si128((__m128i *) d64, result);
+ d64 += 2;
+ s64 += 2;
+ }
+ }
+ gf_do_final_region_alignment(&rd);
+#endif
+}
+
+static
+void
+gf_w64_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int
+xor)
+{
+ gf_val_64_t *s64, *d64, *top;
+ gf_region_data rd;
+
+#ifdef INTEL_SSE4_PCLMUL
+ __m128i a, b;
+ __m128i result, r1;
+ __m128i prim_poly;
+ __m128i w;
+ __m128i m1, m3, m4;
+ gf_internal_t * h = gf->scratch;
+
+ if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+ if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+ gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+ gf_do_initial_region_alignment(&rd);
+
+ prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL));
+ b = _mm_insert_epi64 (_mm_setzero_si128(), val, 0);
+ m1 = _mm_set_epi32(0, 0, 0, (uint32_t)0xffffffff);
+ m3 = _mm_slli_si128(m1, 8);
+ m4 = _mm_slli_si128(m3, 4);
+
+ s64 = (gf_val_64_t *) rd.s_start;
+ d64 = (gf_val_64_t *) rd.d_start;
+ top = (gf_val_64_t *) rd.d_top;
+
+ if (xor) {
+ while (d64 != top) {
+ a = _mm_load_si128((__m128i *) s64);
+ result = _mm_clmulepi64_si128 (a, b, 1);
+
+ w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
+ r1 = _mm_xor_si128 (result, w);
+
+ result = _mm_clmulepi64_si128 (a, b, 0);
+
+ w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
+ result = _mm_xor_si128 (result, w);
+
+ w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
+ result = _mm_xor_si128 (result, w);
+
+ result = _mm_unpacklo_epi64(result, r1);
+
+ r1 = _mm_load_si128((__m128i *) d64);
+ result = _mm_xor_si128(r1, result);
+ _mm_store_si128((__m128i *) d64, result);
+ d64 += 2;
+ s64 += 2;
+ }
+ } else {
+ while (d64 != top) {
+ a = _mm_load_si128((__m128i *) s64);
+ result = _mm_clmulepi64_si128 (a, b, 1);
+
+ w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
+ r1 = _mm_xor_si128 (result, w);
+
+ result = _mm_clmulepi64_si128 (a, b, 0);
+
+ w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
+ result = _mm_xor_si128 (result, w);
+
+ result = _mm_unpacklo_epi64(result, r1);
+
+ _mm_store_si128((__m128i *) d64, result);
+ d64 += 2;
+ s64 += 2;
+ }
+ }
+ gf_do_final_region_alignment(&rd);
+#endif
+}
+
+static
+ inline
gf_val_64_t gf_w64_euclid (gf_t *gf, gf_val_64_t b)
{
gf_val_64_t e_i, e_im1, e_ip1;
@@ -118,6 +290,7 @@ gf_val_64_t gf_w64_euclid (gf_t *gf, gf_val_64_t b)
c_i ^= (one << (d_ip1 - d_i));
e_ip1 ^= (e_i << (d_ip1 - d_i));
d_ip1--;
+ if (e_ip1 == 0) return 0;
while ((e_ip1 & (one << d_ip1)) == 0) d_ip1--;
}
@@ -149,31 +322,41 @@ gf_w64_shift_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
h = (gf_internal_t *) gf->scratch;
ppr = h->prim_poly;
- ppl = 1;
+ /* Allen: set leading one of primitive polynomial */
+
+ ppl = 1;
+
a = a64;
bl = 0;
br = b64;
one = 1;
lbit = (one << 63);
- pl = 0;
- pr = 0;
+ pl = 0; /* Allen: left side of product */
+ pr = 0; /* Allen: right side of product */
+ /* Allen: unlike the corresponding functions for smaller word sizes,
+ * this loop carries out the initial carryless multiply by
+ * shifting b itself rather than simply looking at successively
+ * higher shifts of b */
+
for (i = 0; i < GF_FIELD_WIDTH; i++) {
if (a & (one << i)) {
pl ^= bl;
pr ^= br;
}
- /* printf("P: %016llx %016llx ", pl, pr); printf("B: %016llx %016llx\n", bl, br); */
+
bl <<= 1;
if (br & lbit) bl ^= 1;
br <<= 1;
}
- one = lbit;
- ppl = ((h->prim_poly >> 1) | lbit);
- ppr = lbit;
+ /* Allen: the name of the variable "one" is no longer descriptive at this point */
+
+ one = lbit >> 1;
+ ppl = (h->prim_poly >> 2) | one;
+ ppr = (h->prim_poly << (GF_FIELD_WIDTH-2));
while (one != 0) {
if (pl & one) {
pl ^= ppl;
@@ -190,12 +373,16 @@ gf_w64_shift_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
/*
* ELM: Use the Intel carryless multiply instruction to do very fast 64x64 multiply.
*/
+
static
inline
gf_val_64_t
-gf_w64_clm_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
+gf_w64_clm_multiply_2 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
{
-#ifdef INTEL_PCLMUL
+ gf_val_64_t rv = 0;
+
+#ifdef INTEL_SSE4_PCLMUL
+
__m128i a, b;
__m128i result;
__m128i prim_poly;
@@ -206,10 +393,17 @@ gf_w64_clm_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
b = _mm_insert_epi64 (a, b64, 0);
prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL));
/* Do the initial multiply */
+
result = _mm_clmulepi64_si128 (a, b, 0);
+
/* Mask off the high order 32 bits using subtraction of the polynomial.
* NOTE: this part requires that the polynomial have at least 32 leading 0 bits.
*/
+
+  /* Adam: We can't include the leading one in the 64-bit pclmul, so we
+     split the high 8 bytes of the result into two parts before
+     multiplying them by the prim_poly. */
+
v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0);
w = _mm_clmulepi64_si128 (prim_poly, v, 0);
result = _mm_xor_si128 (result, w);
@@ -217,47 +411,64 @@ gf_w64_clm_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
w = _mm_clmulepi64_si128 (prim_poly, v, 0);
result = _mm_xor_si128 (result, w);
- return ((gf_val_64_t)_mm_extract_epi64(result, 0));
+ rv = ((gf_val_64_t)_mm_extract_epi64(result, 0));
#endif
+ return rv;
}
-
-#ifdef INTEL_PCLMUL
+
+static
inline
-__m128i
-gf_w64_clm_multiply_single (__m128i v, __m128i b, __m128i pp_l, __m128i pp_h)
+gf_val_64_t
+gf_w64_clm_multiply_4 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
{
- __m128i r0, r1, c0, c1, w0, w1;
-
- r0 = _mm_clmulepi64_si128 (b, v, 0);
- c0 = _mm_srli_si128 (r0, 12);
- w0 = _mm_clmulepi64_si128 (pp_h, c0, 0);
- r0 = _mm_xor_si128 (r0, w0);
- c0 = _mm_srli_si128 (_mm_slli_si128 (r0, 4), 12);
- w0 = _mm_clmulepi64_si128 (pp_l, c0, 0);
- r0 = _mm_insert_epi64 (_mm_xor_si128 (r0, w0), 0, 1);
-
- r1 = _mm_clmulepi64_si128 (b, v, 1);
- c1 = _mm_srli_si128 (r1, 12);
- w1 = _mm_clmulepi64_si128 (pp_h, c1, 0);
- r1 = _mm_xor_si128 (r1, w1);
- c1 = _mm_srli_si128 (_mm_slli_si128 (r1, 4), 12);
- w1 = _mm_clmulepi64_si128 (pp_l, c1, 0);
- r1 = _mm_slli_si128 (_mm_xor_si128 (r1, w1), 8);
-
- return (_mm_xor_si128 (r0, r1));
-}
+ gf_val_64_t rv = 0;
+#ifdef INTEL_SSE4_PCLMUL
+
+ __m128i a, b;
+ __m128i result;
+ __m128i prim_poly;
+ __m128i v, w;
+ gf_internal_t * h = gf->scratch;
+
+ a = _mm_insert_epi64 (_mm_setzero_si128(), a64, 0);
+ b = _mm_insert_epi64 (a, b64, 0);
+ prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL));
+
+ /* Do the initial multiply */
+
+ result = _mm_clmulepi64_si128 (a, b, 0);
+
+ v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0);
+ w = _mm_clmulepi64_si128 (prim_poly, v, 0);
+ result = _mm_xor_si128 (result, w);
+ v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1);
+ w = _mm_clmulepi64_si128 (prim_poly, v, 0);
+ result = _mm_xor_si128 (result, w);
+
+ v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0);
+ w = _mm_clmulepi64_si128 (prim_poly, v, 0);
+ result = _mm_xor_si128 (result, w);
+ v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1);
+ w = _mm_clmulepi64_si128 (prim_poly, v, 0);
+ result = _mm_xor_si128 (result, w);
+
+ rv = ((gf_val_64_t)_mm_extract_epi64(result, 0));
#endif
+ return rv;
+}
-void
+
+ void
gf_w64_clm_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
{
-#ifdef INTEL_PCLMUL
+#ifdef INTEL_SSE4_PCLMUL
gf_internal_t *h;
- int i, top;
- uint8_t *s8, *d8;
+ uint8_t *s8, *d8, *dtop;
gf_region_data rd;
- __m128i v, b, xv, pp_l, pp_h, final;
+ __m128i v, b, m, prim_poly, c, fr, w, result;
if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
@@ -269,25 +480,67 @@ gf_w64_clm_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int by
s8 = (uint8_t *) rd.s_start;
d8 = (uint8_t *) rd.d_start;
- top = (uint8_t *) rd.d_top - (uint8_t *)rd.d_start;
+ dtop = (uint8_t *) rd.d_top;
v = _mm_insert_epi64(_mm_setzero_si128(), val, 0);
- pp_l = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL));
- pp_h = _mm_slli_si128 (pp_l, 4);
+ m = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff);
+ prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL));
if (xor) {
- for (i = 0; i < top; i += 16) {
- b = _mm_load_si128((__m128i *) (s8 + i));
- final = gf_w64_clm_multiply_single (v, b, pp_l, pp_h);
- xv = _mm_load_si128((__m128i *) (d8 + i));
- final = _mm_xor_si128 (final, xv);
- _mm_store_si128((__m128i *) (d8 + i), final);
+ while (d8 != dtop) {
+ b = _mm_load_si128((__m128i *) s8);
+ result = _mm_clmulepi64_si128 (b, v, 0);
+ c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0);
+ w = _mm_clmulepi64_si128 (prim_poly, c, 0);
+ result = _mm_xor_si128 (result, w);
+ c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1);
+ w = _mm_clmulepi64_si128 (prim_poly, c, 0);
+ fr = _mm_xor_si128 (result, w);
+ fr = _mm_and_si128 (fr, m);
+
+ result = _mm_clmulepi64_si128 (b, v, 1);
+ c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0);
+ w = _mm_clmulepi64_si128 (prim_poly, c, 0);
+ result = _mm_xor_si128 (result, w);
+ c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1);
+ w = _mm_clmulepi64_si128 (prim_poly, c, 0);
+ result = _mm_xor_si128 (result, w);
+ result = _mm_slli_si128 (result, 8);
+ fr = _mm_xor_si128 (result, fr);
+ result = _mm_load_si128((__m128i *) d8);
+ fr = _mm_xor_si128 (result, fr);
+
+ _mm_store_si128((__m128i *) d8, fr);
+ d8 += 16;
+ s8 += 16;
}
} else {
- for (i = 0; i < top; i += 16) {
- b = _mm_load_si128((__m128i *) (s8 + i));
- final = gf_w64_clm_multiply_single (v, b, pp_l, pp_h);
- _mm_store_si128((__m128i *) (d8 + i), final);
+ while (d8 < dtop) {
+ b = _mm_load_si128((__m128i *) s8);
+ result = _mm_clmulepi64_si128 (b, v, 0);
+ c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0);
+ w = _mm_clmulepi64_si128 (prim_poly, c, 0);
+ result = _mm_xor_si128 (result, w);
+ c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1);
+ w = _mm_clmulepi64_si128 (prim_poly, c, 0);
+ fr = _mm_xor_si128 (result, w);
+ fr = _mm_and_si128 (fr, m);
+
+ result = _mm_clmulepi64_si128 (b, v, 1);
+ c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0);
+ w = _mm_clmulepi64_si128 (prim_poly, c, 0);
+ result = _mm_xor_si128 (result, w);
+ c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1);
+ w = _mm_clmulepi64_si128 (prim_poly, c, 0);
+ result = _mm_xor_si128 (result, w);
+ result = _mm_slli_si128 (result, 8);
+ fr = _mm_xor_si128 (result, fr);
+
+ _mm_store_si128((__m128i *) d8, fr);
+ d8 += 16;
+ s8 += 16;
}
}
gf_do_final_region_alignment(&rd);
@@ -486,18 +739,36 @@ int gf_w64_shift_init(gf_t *gf)
{
gf_internal_t *h;
+ gf->multiply.w64 = gf_w64_shift_multiply;
+ gf->inverse.w64 = gf_w64_euclid;
+ gf->multiply_region.w64 = gf_w64_multiply_region_from_single;
+ return 1;
+}
+
+static
+int gf_w64_cfm_init(gf_t *gf)
+{
+ gf_internal_t *h;
+
h = (gf_internal_t *) gf->scratch;
- gf->multiply.w64 = gf_w64_shift_multiply;
gf->inverse.w64 = gf_w64_euclid;
gf->multiply_region.w64 = gf_w64_multiply_region_from_single;
-#ifdef INTEL_PCLMUL
- if (h->region_type != GF_REGION_NOSSE) gf->multiply.w64 = gf_w64_clm_multiply;
- if (h->region_type != GF_REGION_NOSSE) gf->multiply_region.w64 = gf_w64_clm_multiply_region;
+#ifdef INTEL_SSE4_PCLMUL
+ if ((0xfffffffe00000000ULL & h->prim_poly) == 0){
+ gf->multiply.w64 = gf_w64_clm_multiply_2;
+ gf->multiply_region.w64 = gf_w64_clm_multiply_region_from_single_2;
+ }else if((0xfffe000000000000ULL & h->prim_poly) == 0){
+ gf->multiply.w64 = gf_w64_clm_multiply_4;
+ gf->multiply_region.w64 = gf_w64_clm_multiply_region_from_single_4;
+ } else {
+ return 0;
+ }
+ return 1;
#endif
- return 1;
+ return 0;
}
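The two masks above bound how far each PCLMUL fold can spill: a polynomial below 2^33 (at least 31 leading zero bits, ignoring the implicit x^64 term) is handled by the two-fold routine, one below 2^49 by the four-fold routine, and anything denser makes this init fail. A sketch of the dispatch rule:

#include <stdint.h>

static int w64_clm_folds_needed(uint64_t pp)
{
  if ((pp & 0xfffffffe00000000ULL) == 0) return 2;  /* pp < 2^33 */
  if ((pp & 0xfffe000000000000ULL) == 0) return 4;  /* pp < 2^49 */
  return 0;                /* too dense for this CLM reduction scheme */
}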
static
@@ -509,11 +780,7 @@ gf_w64_group_set_shift_tables(uint64_t *shift, uint64_t val, gf_internal_t *h)
uint64_t one = 1;
int g_s;
- if (h->mult_type == GF_MULT_DEFAULT) {
- g_s = 4;
- } else {
- g_s = h->arg1;
- }
+ g_s = h->arg1;
shift[0] = 0;
for (i = 1; i < (1 << g_s); i <<= 1) {
@@ -538,13 +805,8 @@ gf_w64_group_multiply(gf_t *gf, gf_val_64_t a, gf_val_64_t b)
struct gf_w64_group_data *gd;
gf_internal_t *h = (gf_internal_t *) gf->scratch;
- if (h->mult_type == GF_MULT_DEFAULT) {
- g_s = 4;
- g_r = 8;
- } else {
- g_s = h->arg1;
- g_r = h->arg2;
- }
+ g_s = h->arg1;
+ g_r = h->arg2;
gd = (struct gf_w64_group_data *) h->private;
gf_w64_group_set_shift_tables(gd->shift, b, h);
@@ -599,19 +861,18 @@ void gf_w64_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t v
if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
gd = (struct gf_w64_group_data *) h->private;
- if (h->mult_type == GF_MULT_DEFAULT) {
- g_s = 4;
- g_r = 8;
- } else {
- g_s = h->arg1;
- g_r = h->arg2;
- }
+ g_s = h->arg1;
+ g_r = h->arg2;
gf_w64_group_set_shift_tables(gd->shift, val, h);
- for (i = 63; !(val & (1L << i)); i--) ;
+ for (i = 63; !(val & (1ULL << i)); i--) ;
i += g_s;
- if (i > 64) i = 64; /* i is the bit position of the first zero bit in any element of
+
+ /* i is the bit position of the first zero bit in any element of
gd->shift[] */
+
+ if (i > 64) i = 64;
+
fzb = i;
gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
@@ -770,13 +1031,8 @@ int gf_w64_group_init(gf_t *gf)
gf_internal_t *h = (gf_internal_t *) gf->scratch;
int g_r, g_s;
- if (h->mult_type == GF_MULT_DEFAULT) {
- g_s = 4;
- g_r = 8;
- } else {
- g_s = h->arg1;
- g_r = h->arg2;
- }
+ g_s = h->arg1;
+ g_r = h->arg2;
gd = (struct gf_w64_group_data *) h->private;
gd->shift = (uint64_t *) (&(gd->memory));
@@ -881,8 +1137,7 @@ gf_w64_bytwo_b_multiply (gf_t *gf, gf_val_64_t a, gf_val_64_t b)
pp = h->prim_poly;
prod = 0;
- bmask = 0x80000000;
- bmask <<= 32;
+ bmask = 0x8000000000000000ULL;
while (1) {
if (a & 1) prod ^= b;
@@ -908,10 +1163,11 @@ gf_w64_bytwo_p_multiply (gf_t *gf, gf_val_64_t a, gf_val_64_t b)
pp = h->prim_poly;
prod = 0;
- pmask = 0x80000000;
- pmask <<= 32;
- amask = 0x80000000;
- amask <<= 32;
+
+  /* Set the top-bit masks directly (no declare-then-shift needed). */
+
+ pmask = 0x8000000000000000ULL;
+ amask = 0x8000000000000000ULL;
while (amask != 0) {
if (prod & pmask) {
@@ -1052,7 +1308,7 @@ gf_w64_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_
void gf_w64_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
{
-#ifdef INTEL_SSE4
+#ifdef INTEL_SSE2
int i;
uint8_t *s8, *d8;
uint64_t vrev, one64;
@@ -1118,7 +1374,7 @@ static
void
gf_w64_bytwo_b_sse_region_2_xor(gf_region_data *rd)
{
-#ifdef INTEL_SSE4
+#ifdef INTEL_SSE2
int i;
uint64_t one64, amask;
uint8_t *d8, *s8, tb;
@@ -1152,7 +1408,7 @@ static
void
gf_w64_bytwo_b_sse_region_2_noxor(gf_region_data *rd)
{
-#ifdef INTEL_SSE4
+#ifdef INTEL_SSE2
int i;
uint64_t one64, amask;
uint8_t *d8, *s8, tb;
@@ -1184,7 +1440,7 @@ static
void
gf_w64_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
{
-#ifdef INTEL_SSE4
+#ifdef INTEL_SSE2
uint64_t itb, amask, one64;
uint8_t *d8, *s8;
__m128i pp, m1, m2, t1, t2, va, vb;
@@ -1248,18 +1504,28 @@ int gf_w64_bytwo_init(gf_t *gf)
if (h->mult_type == GF_MULT_BYTWO_p) {
gf->multiply.w64 = gf_w64_bytwo_p_multiply;
- if (h->region_type == GF_REGION_SSE) {
- gf->multiply_region.w64 = gf_w64_bytwo_p_sse_multiply_region;
- } else {
- gf->multiply_region.w64 = gf_w64_bytwo_p_nosse_multiply_region;
- }
+ #ifdef INTEL_SSE2
+ if (h->region_type & GF_REGION_NOSSE)
+ gf->multiply_region.w64 = gf_w64_bytwo_p_nosse_multiply_region;
+ else
+ gf->multiply_region.w64 = gf_w64_bytwo_p_sse_multiply_region;
+ #else
+ gf->multiply_region.w64 = gf_w64_bytwo_p_nosse_multiply_region;
+      if (h->region_type & GF_REGION_SSE)
+ return 0;
+ #endif
} else {
gf->multiply.w64 = gf_w64_bytwo_b_multiply;
- if (h->region_type == GF_REGION_SSE) {
- gf->multiply_region.w64 = gf_w64_bytwo_b_sse_multiply_region;
- } else {
+ #ifdef INTEL_SSE2
+ if (h->region_type & GF_REGION_NOSSE)
+ gf->multiply_region.w64 = gf_w64_bytwo_b_nosse_multiply_region;
+ else
+ gf->multiply_region.w64 = gf_w64_bytwo_b_sse_multiply_region;
+ #else
gf->multiply_region.w64 = gf_w64_bytwo_b_nosse_multiply_region;
- }
+      if (h->region_type & GF_REGION_SSE)
+ return 0;
+ #endif
}
gf->inverse.w64 = gf_w64_euclid;
return 1;
@@ -1277,12 +1543,11 @@ gf_w64_composite_multiply(gf_t *gf, gf_val_64_t a, gf_val_64_t b)
uint32_t a0 = a & 0x00000000ffffffff;
uint32_t a1 = (a & 0xffffffff00000000) >> 32;
uint32_t a1b1;
- w64_composite_int_t *comp_int = (w64_composite_int_t*)h->private;
a1b1 = base_gf->multiply.w32(base_gf, a1, b1);
return ((uint64_t)(base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
- ((uint64_t)(base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, comp_int->s)) << 32));
+ ((uint64_t)(base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 32));
}
/*
@@ -1307,6 +1572,7 @@ gf_w64_composite_multiply(gf_t *gf, gf_val_64_t a, gf_val_64_t b)
*
* a / b = a * c
*/
+
static
gf_val_64_t
gf_w64_composite_inverse(gf_t *gf, gf_val_64_t a)
@@ -1318,11 +1584,10 @@ gf_w64_composite_inverse(gf_t *gf, gf_val_64_t a)
uint32_t c0, c1, d, tmp;
uint64_t c;
uint32_t a0inv, a1inv;
- w64_composite_int_t *comp_int = (w64_composite_int_t*)h->private;
if (a0 == 0) {
a1inv = base_gf->inverse.w32(base_gf, a1);
- c0 = base_gf->multiply.w32(base_gf, a1inv, comp_int->s);
+ c0 = base_gf->multiply.w32(base_gf, a1inv, h->prim_poly);
c1 = a1inv;
} else if (a1 == 0) {
c0 = base_gf->inverse.w32(base_gf, a0);
@@ -1333,7 +1598,7 @@ gf_w64_composite_inverse(gf_t *gf, gf_val_64_t a)
d = base_gf->multiply.w32(base_gf, a1, a0inv);
- tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ comp_int->s);
+ tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ h->prim_poly);
tmp = base_gf->inverse.w32(base_gf, tmp);
d = base_gf->multiply.w32(base_gf, d, tmp);
@@ -1348,17 +1613,6 @@ gf_w64_composite_inverse(gf_t *gf, gf_val_64_t a)
}
static
-gf_val_64_t
-gf_w64_composite_divide(gf_t *gf, gf_val_64_t a, gf_val_64_t b)
-{
- gf_val_64_t binv;
-
- binv = gf_w64_composite_inverse(gf, b);
-
- return gf_w64_composite_multiply(gf, a, binv);
-}
-
-static
void
gf_w64_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
{
@@ -1374,7 +1628,6 @@ gf_w64_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t va
int num_syms = bytes / 8;
int sym_divisible = bytes % 4;
gf_region_data rd;
- w64_composite_int_t *comp_int = (w64_composite_int_t*)h->private;
if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
@@ -1390,7 +1643,7 @@ gf_w64_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t va
a1b1 = base_gf->multiply.w32(base_gf, a1, b1);
*d64 ^= ((uint64_t)(base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
- ((uint64_t)(base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, comp_int->s)) << 32));
+ ((uint64_t)(base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 32));
s64++;
d64++;
}
@@ -1401,7 +1654,7 @@ gf_w64_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t va
a1b1 = base_gf->multiply.w32(base_gf, a1, b1);
*d64 = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
- ((uint64_t)(base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, comp_int->s)) << 32));
+ ((uint64_t)(base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 32));
s64++;
d64++;
}
@@ -1420,7 +1673,6 @@ gf_w64_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_64_
uint8_t *dlow, *dhigh, *top;
int sub_reg_size;
gf_region_data rd;
- w64_composite_int_t *comp_int = (w64_composite_int_t*)h->private;
if (!xor) {
memset(dest, 0, bytes);
@@ -1440,7 +1692,7 @@ gf_w64_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_64_
base_gf->multiply_region.w32(base_gf, shigh, dlow, val1, sub_reg_size, 1);
base_gf->multiply_region.w32(base_gf, slow, dhigh, val1, sub_reg_size, xor);
base_gf->multiply_region.w32(base_gf, shigh, dhigh, val0, sub_reg_size, 1);
- base_gf->multiply_region.w32(base_gf, shigh, dhigh, base_gf->multiply.w32(base_gf, comp_int->s, val1), sub_reg_size, 1);
+ base_gf->multiply_region.w32(base_gf, shigh, dhigh, base_gf->multiply.w32(base_gf, h->prim_poly, val1), sub_reg_size, 1);
gf_do_final_region_alignment(&rd);
}
@@ -1458,29 +1710,18 @@ int gf_w64_composite_init(gf_t *gf)
gf->multiply_region.w64 = gf_w64_composite_multiply_region;
}
- if (h->base_gf != NULL) {
- gf_internal_t *base_h = (gf_internal_t *) h->base_gf->scratch;
- w64_composite_int_t *comp_int = (w64_composite_int_t*)h->private;
-
- if (base_h->mult_type == GF_MULT_COMPOSITE) {
- comp_int->s = GF_S_GF_16_2_2;
- } else {
- comp_int->s = GF_S_GF_32_2;
- }
- }
-
gf->multiply.w64 = gf_w64_composite_multiply;
- gf->divide.w64 = gf_w64_composite_divide;
+ gf->divide.w64 = NULL;
gf->inverse.w64 = gf_w64_composite_inverse;
return 1;
}
static
-void
+ void
gf_w64_split_4_64_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
{
-#ifdef INTEL_SSE4
+#ifdef INTEL_SSSE3
gf_internal_t *h;
int i, m, j, k, tindex;
uint64_t pp, v, s, *s64, *d64, *top;
@@ -1494,7 +1735,7 @@ gf_w64_split_4_64_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *des
h = (gf_internal_t *) gf->scratch;
pp = h->prim_poly;
-
+
gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 128);
gf_do_initial_region_alignment(&rd);
@@ -1534,11 +1775,11 @@ gf_w64_split_4_64_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *des
i = 0;
for (k = 0; k < 8; k++) {
v0 = _mm_load_si128((__m128i *) s64);
+ /* MM_PRINT8("v", v0); */
s64 += 2;
si = _mm_and_si128(v0, mask1);
- /* Happy now? */
for (j = 0; j < 8; j++) {
p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si));
}
@@ -1551,6 +1792,7 @@ gf_w64_split_4_64_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *des
i++;
}
for (i = 0; i < 8; i++) {
+ /* MM_PRINT8("v", p[i]); */
_mm_store_si128((__m128i *) d64, p[i]);
d64 += 2;
}
@@ -1559,6 +1801,210 @@ gf_w64_split_4_64_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *des
#endif
}
+static
+ void
+gf_w64_split_4_64_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
+{
+#ifdef INTEL_SSE4
+ gf_internal_t *h;
+ int i, m, j, k, tindex;
+ uint64_t pp, v, s, *s64, *d64, *top;
+ __m128i si, tables[16][8], p[8], st[8], mask1, mask8, mask16, t1, t2;
+ struct gf_split_4_64_lazy_data *ld;
+ uint8_t btable[16];
+ gf_region_data rd;
+
+ if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+ if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+ h = (gf_internal_t *) gf->scratch;
+ pp = h->prim_poly;
+
+ gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 128);
+ gf_do_initial_region_alignment(&rd);
+
+ s64 = (uint64_t *) rd.s_start;
+ d64 = (uint64_t *) rd.d_start;
+ top = (uint64_t *) rd.d_top;
+
+ ld = (struct gf_split_4_64_lazy_data *) h->private;
+
+ v = val;
+ for (i = 0; i < 16; i++) {
+ ld->tables[i][0] = 0;
+ for (j = 1; j < 16; j <<= 1) {
+ for (k = 0; k < j; k++) {
+ ld->tables[i][k^j] = (v ^ ld->tables[i][k]);
+ }
+ v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
+ }
+ for (j = 0; j < 8; j++) {
+ for (k = 0; k < 16; k++) {
+ btable[k] = (uint8_t) ld->tables[i][k];
+ ld->tables[i][k] >>= 8;
+ }
+ tables[i][j] = _mm_loadu_si128((__m128i *) btable);
+ }
+ }
+
+ mask1 = _mm_set1_epi8(0xf);
+ mask8 = _mm_set1_epi16(0xff);
+ mask16 = _mm_set1_epi32(0xffff);
+
+ while (d64 != top) {
+
+ for (i = 0; i < 8; i++) p[i] = _mm_setzero_si128();
+
+ for (k = 0; k < 8; k++) {
+ st[k] = _mm_load_si128((__m128i *) s64);
+ s64 += 2;
+ }
+
+ for (k = 0; k < 4; k ++) {
+ st[k] = _mm_shuffle_epi32(st[k], _MM_SHUFFLE(3,1,2,0));
+ st[k+4] = _mm_shuffle_epi32(st[k+4], _MM_SHUFFLE(2,0,3,1));
+ t1 = _mm_blend_epi16(st[k], st[k+4], 0xf0);
+ st[k] = _mm_srli_si128(st[k], 8);
+ st[k+4] = _mm_slli_si128(st[k+4], 8);
+ st[k+4] = _mm_blend_epi16(st[k], st[k+4], 0xf0);
+ st[k] = t1;
+ }
+
+/*
+ printf("After pack pass 1\n");
+ for (k = 0; k < 8; k++) {
+ MM_PRINT8("v", st[k]);
+ }
+ printf("\n");
+ */
+
+ t1 = _mm_packus_epi32(_mm_and_si128(st[0], mask16), _mm_and_si128(st[2], mask16));
+ st[2] = _mm_packus_epi32(_mm_srli_epi32(st[0], 16), _mm_srli_epi32(st[2], 16));
+ st[0] = t1;
+ t1 = _mm_packus_epi32(_mm_and_si128(st[1], mask16), _mm_and_si128(st[3], mask16));
+ st[3] = _mm_packus_epi32(_mm_srli_epi32(st[1], 16), _mm_srli_epi32(st[3], 16));
+ st[1] = t1;
+ t1 = _mm_packus_epi32(_mm_and_si128(st[4], mask16), _mm_and_si128(st[6], mask16));
+ st[6] = _mm_packus_epi32(_mm_srli_epi32(st[4], 16), _mm_srli_epi32(st[6], 16));
+ st[4] = t1;
+ t1 = _mm_packus_epi32(_mm_and_si128(st[5], mask16), _mm_and_si128(st[7], mask16));
+ st[7] = _mm_packus_epi32(_mm_srli_epi32(st[5], 16), _mm_srli_epi32(st[7], 16));
+ st[5] = t1;
+
+/*
+ printf("After pack pass 2\n");
+ for (k = 0; k < 8; k++) {
+ MM_PRINT8("v", st[k]);
+ }
+ printf("\n");
+ */
+ t1 = _mm_packus_epi16(_mm_and_si128(st[0], mask8), _mm_and_si128(st[1], mask8));
+ st[1] = _mm_packus_epi16(_mm_srli_epi16(st[0], 8), _mm_srli_epi16(st[1], 8));
+ st[0] = t1;
+ t1 = _mm_packus_epi16(_mm_and_si128(st[2], mask8), _mm_and_si128(st[3], mask8));
+ st[3] = _mm_packus_epi16(_mm_srli_epi16(st[2], 8), _mm_srli_epi16(st[3], 8));
+ st[2] = t1;
+ t1 = _mm_packus_epi16(_mm_and_si128(st[4], mask8), _mm_and_si128(st[5], mask8));
+ st[5] = _mm_packus_epi16(_mm_srli_epi16(st[4], 8), _mm_srli_epi16(st[5], 8));
+ st[4] = t1;
+ t1 = _mm_packus_epi16(_mm_and_si128(st[6], mask8), _mm_and_si128(st[7], mask8));
+ st[7] = _mm_packus_epi16(_mm_srli_epi16(st[6], 8), _mm_srli_epi16(st[7], 8));
+ st[6] = t1;
+
+/*
+ printf("After final pack pass 2\n");
+ for (k = 0; k < 8; k++) {
+ MM_PRINT8("v", st[k]);
+ }
+ */
+ i = 0;
+ for (k = 0; k < 8; k++) {
+ si = _mm_and_si128(st[k], mask1);
+
+ for (j = 0; j < 8; j++) {
+ p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si));
+ }
+ i++;
+ st[k] = _mm_srli_epi32(st[k], 4);
+ si = _mm_and_si128(st[k], mask1);
+ for (j = 0; j < 8; j++) {
+ p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si));
+ }
+ i++;
+ }
+
+ t1 = _mm_unpacklo_epi8(p[0], p[1]);
+ p[1] = _mm_unpackhi_epi8(p[0], p[1]);
+ p[0] = t1;
+ t1 = _mm_unpacklo_epi8(p[2], p[3]);
+ p[3] = _mm_unpackhi_epi8(p[2], p[3]);
+ p[2] = t1;
+ t1 = _mm_unpacklo_epi8(p[4], p[5]);
+ p[5] = _mm_unpackhi_epi8(p[4], p[5]);
+ p[4] = t1;
+ t1 = _mm_unpacklo_epi8(p[6], p[7]);
+ p[7] = _mm_unpackhi_epi8(p[6], p[7]);
+ p[6] = t1;
+
+/*
+ printf("After unpack pass 1:\n");
+ for (i = 0; i < 8; i++) {
+ MM_PRINT8("v", p[i]);
+ }
+ */
+
+ t1 = _mm_unpacklo_epi16(p[0], p[2]);
+ p[2] = _mm_unpackhi_epi16(p[0], p[2]);
+ p[0] = t1;
+ t1 = _mm_unpacklo_epi16(p[1], p[3]);
+ p[3] = _mm_unpackhi_epi16(p[1], p[3]);
+ p[1] = t1;
+ t1 = _mm_unpacklo_epi16(p[4], p[6]);
+ p[6] = _mm_unpackhi_epi16(p[4], p[6]);
+ p[4] = t1;
+ t1 = _mm_unpacklo_epi16(p[5], p[7]);
+ p[7] = _mm_unpackhi_epi16(p[5], p[7]);
+ p[5] = t1;
+
+/*
+ printf("After unpack pass 2:\n");
+ for (i = 0; i < 8; i++) {
+ MM_PRINT8("v", p[i]);
+ }
+ */
+
+ t1 = _mm_unpacklo_epi32(p[0], p[4]);
+ p[4] = _mm_unpackhi_epi32(p[0], p[4]);
+ p[0] = t1;
+ t1 = _mm_unpacklo_epi32(p[1], p[5]);
+ p[5] = _mm_unpackhi_epi32(p[1], p[5]);
+ p[1] = t1;
+ t1 = _mm_unpacklo_epi32(p[2], p[6]);
+ p[6] = _mm_unpackhi_epi32(p[2], p[6]);
+ p[2] = t1;
+ t1 = _mm_unpacklo_epi32(p[3], p[7]);
+ p[7] = _mm_unpackhi_epi32(p[3], p[7]);
+ p[3] = t1;
+
+ if (xor) {
+ for (i = 0; i < 8; i++) {
+ t1 = _mm_load_si128((__m128i *) d64);
+ _mm_store_si128((__m128i *) d64, _mm_xor_si128(p[i], t1));
+ d64 += 2;
+ }
+ } else {
+ for (i = 0; i < 8; i++) {
+ _mm_store_si128((__m128i *) d64, p[i]);
+ d64 += 2;
+ }
+ }
+
+ }
+
+ gf_do_final_region_alignment(&rd);
+#endif
+}
+
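/* Editor's sketch (not part of the patch): the SSE routine above is a
   vectorized 4-bit split multiplication.  After the build loop,
   ld->tables[i][j] holds val * (j << 4*i) in GF(2^64), so a scalar
   product is sixteen table lookups XORed together; the shuffle code
   computes sixteen such products at once, one byte lane per PSHUFB. */

static inline uint64_t
split_4_64_sketch(uint64_t tables[16][16], uint64_t s)
{
  uint64_t p = 0;
  int i;

  for (i = 0; i < 16; i++) p ^= tables[i][(s >> (4*i)) & 0xf];
  return p;
}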
#define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : (p) << 1);
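/* Editor's note (illustration only): GF_MULTBY_TWO is multiplication by x.
   Doubling shifts every coefficient up one degree; if the old top bit was
   set, the shift overflows the word and the truncated primitive polynomial
   (h->prim_poly, leading 1 omitted) is folded back in to reduce the
   result modulo the field polynomial. */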
static
@@ -1575,27 +2021,72 @@ int gf_w64_split_init(gf_t *gf)
h = (gf_internal_t *) gf->scratch;
/* Defaults */
- gf->multiply_region.w64 = gf_w64_multiply_region_from_single;
- gf->multiply.w64 = gf_w64_shift_multiply;
+ gf->multiply_region.w64 = gf_w64_multiply_region_from_single;
-#ifdef INTEL_PCLMUL
- if (h->region_type != GF_REGION_NOSSE) gf->multiply.w64 = gf_w64_clm_multiply;
+ gf->multiply.w64 = gf_w64_bytwo_p_multiply;
+
+#ifdef INTEL_SSE4_PCLMUL
+ if ((!(h->region_type & GF_REGION_NOSSE) &&
+ (h->arg1 == 64 || h->arg2 == 64)) ||
+ h->mult_type == GF_MULT_DEFAULT){
+
+ if ((0xfffffffe00000000ULL & h->prim_poly) == 0){
+ gf->multiply.w64 = gf_w64_clm_multiply_2;
+ gf->multiply_region.w64 = gf_w64_clm_multiply_region_from_single_2;
+ }else if((0xfffe000000000000ULL & h->prim_poly) == 0){
+ gf->multiply.w64 = gf_w64_clm_multiply_4;
+ gf->multiply_region.w64 = gf_w64_clm_multiply_region_from_single_4;
+ }else{
+ return 0;
+ }
+ }
#endif
gf->inverse.w64 = gf_w64_euclid;
+ /* Allen: set region pointers for the default mult type.  The single
+  * multiply pointers were taken care of above (explicitly for SSE,
+  * implicitly for no-SSE). */
+
+#ifdef INTEL_SSE4
+ if (h->mult_type == GF_MULT_DEFAULT) {
+ d4 = (struct gf_split_4_64_lazy_data *) h->private;
+ d4->last_value = 0;
+ gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_multiply_region;
+ }
+#else
+ if (h->mult_type == GF_MULT_DEFAULT) {
+ d8 = (struct gf_split_8_64_lazy_data *) h->private;
+ d8->last_value = 0;
+ gf->multiply_region.w64 = gf_w64_split_8_64_lazy_multiply_region;
+ }
+#endif
+
if ((h->arg1 == 4 && h->arg2 == 64) || (h->arg1 == 64 && h->arg2 == 4)) {
d4 = (struct gf_split_4_64_lazy_data *) h->private;
d4->last_value = 0;
- if (h->region_type & GF_REGION_SSE) {
- if (h->region_type & GF_REGION_ALTMAP) {
+
+ if((h->region_type & GF_REGION_ALTMAP) && (h->region_type & GF_REGION_NOSSE)) return 0;
+ if(h->region_type & GF_REGION_ALTMAP)
+ {
+ #ifdef INTEL_SSSE3
gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_altmap_multiply_region;
- } else {
-/* gf->multiply_region.w32 = gf_w32_split_4_32_lazy_sse_multiply_region; */
- }
- } else {
- gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region;
+ #else
+ return 0;
+ #endif
+ }
+ else //no altmap
+ {
+ #ifdef INTEL_SSE4
+ if(h->region_type & GF_REGION_NOSSE)
+ gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region;
+ else
+ gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_multiply_region;
+ #else
+ gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region;
+ if(h->region_type & GF_REGION_SSE)
+ return 0;
+ #endif
}
}
if ((h->arg1 == 8 && h->arg2 == 64) || (h->arg1 == 64 && h->arg2 == 8)) {
@@ -1611,7 +2102,9 @@ int gf_w64_split_init(gf_t *gf)
if ((h->arg1 == 8 && h->arg2 == 8)) {
d88 = (struct gf_split_8_8_data *) h->private;
gf->multiply.w64 = gf_w64_split_8_8_multiply;
+
/* The performance of this guy sucks, so don't bother with a region op */
+
basep = 1;
for (exp = 0; exp < 15; exp++) {
for (j = 0; j < 256; j++) d88->tables[exp][0][j] = 0;
@@ -1639,94 +2132,93 @@ int gf_w64_split_init(gf_t *gf)
for (i = 0; i < 8; i++) basep = GF_MULTBY_TWO(basep);
}
}
- return -1;
+ return 1;
}
int gf_w64_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
{
- int ss, sa;
+ int issse4;
- ss = (GF_REGION_SSE | GF_REGION_NOSSE);
- sa = (GF_REGION_STDMAP | GF_REGION_ALTMAP);
-
- if (divide_type == GF_DIVIDE_MATRIX) return -1;
switch(mult_type)
{
case GF_MULT_SHIFT:
- if (arg1 != 0 || arg2 != 0) return -1;
- if (region_type != GF_REGION_NOSSE && region_type != GF_REGION_SSE && region_type != GF_REGION_DEFAULT) return -1;
+ return sizeof(gf_internal_t);
+ break;
+ case GF_MULT_CARRY_FREE:
return sizeof(gf_internal_t);
break;
case GF_MULT_BYTWO_p:
case GF_MULT_BYTWO_b:
- if (arg1 != 0 || arg2 != 0) return -1;
- if (region_type != GF_REGION_CAUCHY) {
- if ((region_type | ss) != ss || (region_type & ss) == ss) return -1;
- }
return sizeof(gf_internal_t);
break;
+ case GF_MULT_DEFAULT:
+
+ /* Allen: set the *local* arg1 and arg2, just for scratch size purposes,
+ * then fall through to split table scratch size code. */
+
+#ifdef INTEL_SSE4
+ issse4 = 1;
+ arg1 = 64;
+ arg2 = 4;
+#else
+ issse4 = 0;
+ arg1 = 64;
+ arg2 = 8;
+#endif
+
case GF_MULT_SPLIT_TABLE:
if (arg1 == 8 && arg2 == 8) {
- region_type &= (~GF_REGION_LAZY);
- if (region_type != GF_REGION_DEFAULT) return -1;
return sizeof(gf_internal_t) + sizeof(struct gf_split_8_8_data) + 64;
}
if ((arg1 == 16 && arg2 == 64) || (arg2 == 16 && arg1 == 64)) {
- region_type &= (~GF_REGION_LAZY);
- if (region_type != GF_REGION_DEFAULT) return -1;
return sizeof(gf_internal_t) + sizeof(struct gf_split_16_64_lazy_data) + 64;
}
if ((arg1 == 8 && arg2 == 64) || (arg2 == 8 && arg1 == 64)) {
- region_type &= (~GF_REGION_LAZY);
- if (region_type != GF_REGION_DEFAULT) return -1;
return sizeof(gf_internal_t) + sizeof(struct gf_split_8_64_lazy_data) + 64;
}
- if ((arg1 == 64 && arg2 == 4) || (arg1 == 4 && arg2 == 64)){
- region_type &= (~GF_REGION_LAZY);
- if ((region_type & ss) == ss) return -1;
- if ((region_type & sa) == sa) return -1;
- if (region_type & (~(ss|sa))) return -1;
- if (region_type & GF_REGION_SSE) {
- return sizeof(gf_internal_t) + sizeof(struct gf_split_4_64_lazy_data) + 64;
- } else if (region_type & GF_REGION_ALTMAP) {
- return -1;
- } else {
- return sizeof(gf_internal_t) + sizeof(struct gf_split_4_64_lazy_data) + 64;
- }
+ if ((arg1 == 64 && arg2 == 4) || (arg1 == 4 && arg2 == 64)) {
+ return sizeof(gf_internal_t) + sizeof(struct gf_split_4_64_lazy_data) + 64;
}
- return -1;
-
- case GF_MULT_DEFAULT:
- arg1 = 4;
- arg2 = 8;
+ return 0;
case GF_MULT_GROUP:
- if (arg1 <= 0 || arg2 <= 0) return -1;
- if (region_type != GF_REGION_DEFAULT && region_type != GF_REGION_CAUCHY) return -1;
return sizeof(gf_internal_t) + sizeof(struct gf_w64_group_data) +
sizeof(uint64_t) * (1 << arg1) +
sizeof(uint64_t) * (1 << arg2) + 64;
break;
case GF_MULT_COMPOSITE:
- if (region_type & ~(GF_REGION_ALTMAP | GF_REGION_STDMAP)) return -1;
- if ((arg1 == 2 && arg2 == 0) || (arg1 == 2 && arg2 == 1)) {
- return sizeof(gf_internal_t) + sizeof(w64_composite_int_t) + 4;
- } else {
- return -1;
- }
+ if (arg1 == 2) return sizeof(gf_internal_t) + 64;
+ return 0;
break;
default:
- return -1;
+ return 0;
}
}
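/* Editor's sketch (not part of the patch): gf_w64_scratch_size() now
   returns 0 rather than -1 for unsupported combinations, so callers
   should treat any non-positive value as failure before handing the
   buffer to the init path (gf_init_hard() in gf_complete.h is assumed
   to be that path). */

#include <stdlib.h>

static void *
alloc_w64_scratch_sketch(int mult_type, int region_type, int divide_type,
                         int arg1, int arg2)
{
  int size = gf_w64_scratch_size(mult_type, region_type, divide_type,
                                 arg1, arg2);
  return (size > 0) ? malloc((size_t) size) : NULL;
}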
int gf_w64_init(gf_t *gf)
{
- gf_internal_t *h;
+ gf_internal_t *h, *h_base, *h_base_base, *h_base_base_base;
+ int no_default_flag = 0;
h = (gf_internal_t *) gf->scratch;
- if (h->prim_poly == 0) h->prim_poly = 0x1b; /* Omitting the leftmost 1 as in w=32 */
+
+ /* Allen: set default primitive polynomial / irreducible polynomial if needed */
+
+ /* Omitting the leftmost 1 as in w=32 */
+
+ if (h->prim_poly == 0) {
+ if (h->mult_type == GF_MULT_COMPOSITE) {
+ h->prim_poly = gf_composite_get_default_poly(h->base_gf);
+ if (h->prim_poly == 0) return 0; /* This shouldn't happen */
+ } else {
+ h->prim_poly = 0x1b;
+ }
+ if (no_default_flag == 1) {
+ fprintf(stderr,"Code contains no default irreducible polynomial for given base field\n");
+ return 0;
+ }
+ }
gf->multiply.w64 = NULL;
gf->divide.w64 = NULL;
@@ -1734,10 +2226,11 @@ int gf_w64_init(gf_t *gf)
gf->multiply_region.w64 = NULL;
switch(h->mult_type) {
- case GF_MULT_SHIFT: if (gf_w64_shift_init(gf) == 0) return 0; break;
- case GF_MULT_COMPOSITE: if (gf_w64_composite_init(gf) == 0) return 0; break;
- case GF_MULT_SPLIT_TABLE: if (gf_w64_split_init(gf) == 0) return 0; break;
+ case GF_MULT_CARRY_FREE: if (gf_w64_cfm_init(gf) == 0) return 0; break;
+ case GF_MULT_SHIFT: if (gf_w64_shift_init(gf) == 0) return 0; break;
+ case GF_MULT_COMPOSITE: if (gf_w64_composite_init(gf) == 0) return 0; break;
case GF_MULT_DEFAULT:
+ case GF_MULT_SPLIT_TABLE: if (gf_w64_split_init(gf) == 0) return 0; break;
case GF_MULT_GROUP: if (gf_w64_group_init(gf) == 0) return 0; break;
case GF_MULT_BYTWO_p:
case GF_MULT_BYTWO_b: if (gf_w64_bytwo_init(gf) == 0) return 0; break;
@@ -1748,11 +2241,6 @@ int gf_w64_init(gf_t *gf)
gf->inverse.w64 = gf_w64_euclid;
}
-/* else if (h->divide_type == GF_DIVIDE_MATRIX) {
- gf->divide.w64 = gf_w64_divide_from_inverse;
- gf->inverse.w64 = gf_w64_matrix;
- } */
-
if (gf->inverse.w64 != NULL && gf->divide.w64 == NULL) {
gf->divide.w64 = gf_w64_divide_from_inverse;
}
@@ -1760,6 +2248,8 @@ int gf_w64_init(gf_t *gf)
gf->inverse.w64 = gf_w64_inverse_from_divide;
}
+ if (h->region_type == GF_REGION_CAUCHY) return 0;
+
if (h->region_type & GF_REGION_ALTMAP) {
if (h->mult_type == GF_MULT_COMPOSITE) {
gf->extract_word.w64 = gf_w64_composite_extract_word;
diff --git a/gf_w8.c b/gf_w8.c
index 306f911..45c500f 100644
--- a/gf_w8.c
+++ b/gf_w8.c
@@ -15,7 +15,6 @@
#define GF_BASE_FIELD_WIDTH (4)
#define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH)
-#define GF_S_GF_4_2 (4)
struct gf_w8_logtable_data {
uint8_t log_tbl[GF_FIELD_SIZE];
@@ -37,6 +36,10 @@ struct gf_w8_logzero_small_table_data {
uint8_t *div_tbl;
};
+struct gf_w8_composite_data {
+ uint8_t *mult_table;
+};
+
/* Don't change the order of these relative to gf_w8_half_table_data */
struct gf_w8_default_data {
@@ -139,6 +142,7 @@ uint32_t gf_w8_euclid (gf_t *gf, uint32_t b)
while (d_ip1 >= d_i) {
c_i ^= (1 << (d_ip1 - d_i));
e_ip1 ^= (e_i << (d_ip1 - d_i));
+ if (e_ip1 == 0) return 0;
while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--;
}
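/* Editor's note (illustration only): the new e_ip1 == 0 guard ends
   Euclid's algorithm when the remainder vanishes, which happens exactly
   when b shares a factor with the modulus -- i.e. the polynomial in use
   is not irreducible.  Returning 0 reports "no inverse" instead of
   letting the degree scan below loop forever. */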
@@ -165,28 +169,402 @@ gf_val_32_t gf_w8_extract_word(gf_t *gf, void *start, int bytes, int index)
}
static
+gf_val_32_t gf_w8_composite_extract_word(gf_t *gf, void *start, int bytes, int index)
+{
+ int sub_size;
+ gf_internal_t *h;
+ uint8_t *r8, *top;
+ uint8_t a, b;
+ gf_region_data rd;
+
+ h = (gf_internal_t *) gf->scratch;
+ gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 32);
+ r8 = (uint8_t *) start;
+ if (r8 + index < (uint8_t *) rd.d_start) return r8[index];
+ if (r8 + index >= (uint8_t *) rd.d_top) return r8[index];
+ index -= (((uint8_t *) rd.d_start) - r8);
+ r8 = (uint8_t *) rd.d_start;
+ top = (uint8_t *) rd.d_top;
+ sub_size = (top-r8)/2;
+
+ a = h->base_gf->extract_word.w32(h->base_gf, r8, sub_size, index);
+ b = h->base_gf->extract_word.w32(h->base_gf, r8+sub_size, sub_size, index);
+ return (a | (b << 4));
+}
+
+static
inline
uint32_t gf_w8_matrix (gf_t *gf, uint32_t b)
{
return gf_bitmatrix_inverse(b, 8, ((gf_internal_t *) (gf->scratch))->prim_poly);
}
+
+static
+inline
+gf_val_32_t
+gf_w8_clm_multiply_2 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8)
+{
+ gf_val_32_t rv = 0;
+
+#ifdef INTEL_SSE4_PCLMUL
+
+ __m128i a, b;
+ __m128i result;
+ __m128i prim_poly;
+ __m128i v, w;
+ gf_internal_t * h = gf->scratch;
+
+ a = _mm_insert_epi32 (_mm_setzero_si128(), a8, 0);
+ b = _mm_insert_epi32 (a, b8, 0);
+
+ prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL));
+
+ /* Do the initial multiply */
+
+ result = _mm_clmulepi64_si128 (a, b, 0);
+
+ /* Ben: Do the prim_poly reduction twice.  We are guaranteed to need the
+ reduction at most twice, because (w-2)/z == 2, where z is the number
+ of zeros after the leading 1 of the polynomial.
+
+ _mm_clmulepi64_si128 is the carryless multiply operation.  Here
+ _mm_srli_si128 shifts the result to the right by 1 byte, which lets
+ us multiply the prim_poly by the leading bits of the result.  We
+ then xor that product back into the result. */
+
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+ result = _mm_xor_si128 (result, w);
+
+ /* Extracts 32 bit value from result. */
+
+ rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+
+#endif
+ return rv;
+}
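/* Editor's sketch (not part of the patch): the same reduction in scalar
   code for w=8.  Each pass carryless-multiplies the overflow bits (those
   above bit 7) by the 9-bit polynomial and xors the product back in; the
   polynomial's leading 1 cancels the overflow itself, and because the
   next three coefficients are 0 (the 0xe0 test in the init below), each
   pass shrinks the overflow by at least three bits, so two passes are
   enough.  E.g. 0x80 * 0x80 under 0x11b: 0x4000 -> 0x06c0 -> 0x009a. */

static inline uint8_t
clm_reduce_sketch(uint32_t product, uint32_t prim_poly) /* e.g. 0x11b */
{
  int pass, i;
  uint32_t over, fold;

  for (pass = 0; pass < 2; pass++) {
    over = product >> 8;                      /* bits that overflow w=8 */
    fold = 0;
    for (i = 0; i < 8; i++)                   /* carryless over * poly  */
      if ((over >> i) & 1) fold ^= prim_poly << i;
    product ^= fold;                          /* leading 1 clears over  */
  }
  return (uint8_t) product;
}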
+
+static
+inline
+gf_val_32_t
+gf_w8_clm_multiply_3 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8)
+{
+ gf_val_32_t rv = 0;
+
+#ifdef INTEL_SSE4_PCLMUL
+
+ __m128i a, b;
+ __m128i result;
+ __m128i prim_poly;
+ __m128i v, w;
+ gf_internal_t * h = gf->scratch;
+
+ a = _mm_insert_epi32 (_mm_setzero_si128(), a8, 0);
+ b = _mm_insert_epi32 (a, b8, 0);
+
+ prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL));
+
+ /* Do the initial multiply */
+
+ result = _mm_clmulepi64_si128 (a, b, 0);
+
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+ result = _mm_xor_si128 (result, w);
+
+ /* Extracts 32 bit value from result. */
+
+ rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+
+#endif
+ return rv;
+}
+
+static
+inline
+gf_val_32_t
+gf_w8_clm_multiply_4 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8)
+{
+ gf_val_32_t rv = 0;
+
+#ifdef INTEL_SSE4_PCLMUL
+
+ __m128i a, b;
+ __m128i result;
+ __m128i prim_poly;
+ __m128i v, w;
+ gf_internal_t * h = gf->scratch;
+
+ a = _mm_insert_epi32 (_mm_setzero_si128(), a8, 0);
+ b = _mm_insert_epi32 (a, b8, 0);
+
+ prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL));
+
+ /* Do the initial multiply */
+
+ result = _mm_clmulepi64_si128 (a, b, 0);
+
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+ result = _mm_xor_si128 (result, w);
+
+ /* Extracts 32 bit value from result. */
+ rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+
+#endif
+ return rv;
+}
+
+
+static
+void
+gf_w8_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int
+ xor)
+{
+ gf_region_data rd;
+ uint8_t *s8;
+ uint8_t *d8;
+
+ if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+ if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+ gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1);
+ gf_do_initial_region_alignment(&rd);
+
+ s8 = (uint8_t *) rd.s_start;
+ d8 = (uint8_t *) rd.d_start;
+
+ if (xor) {
+ while (d8 < ((uint8_t *) rd.d_top)) {
+ *d8 ^= gf->multiply.w32(gf, val, *s8);
+ d8++;
+ s8++;
+ }
+ } else {
+ while (d8 < ((uint8_t *) rd.d_top)) {
+ *d8 = gf->multiply.w32(gf, val, *s8);
+ d8++;
+ s8++;
+ }
+ }
+ gf_do_final_region_alignment(&rd);
+}
+
+static
+void
+gf_w8_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int
+ xor)
+{
+ gf_region_data rd;
+ uint8_t *s8;
+ uint8_t *d8;
+
+#ifdef INTEL_SSE4_PCLMUL
+
+ __m128i a, b;
+ __m128i result;
+ __m128i prim_poly;
+ __m128i v, w;
+ gf_internal_t * h = gf->scratch;
+
+ prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL));
+
+ if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+ if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+ a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);
+
+ gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1);
+ gf_do_initial_region_alignment(&rd);
+
+ s8 = (uint8_t *) rd.s_start;
+ d8 = (uint8_t *) rd.d_start;
+
+ if (xor) {
+ while (d8 < ((uint8_t *) rd.d_top)) {
+ b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0);
+ result = _mm_clmulepi64_si128 (a, b, 0);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+ result = _mm_xor_si128 (result, w);
+ *d8 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));
+ d8++;
+ s8++;
+ }
+ } else {
+ while (d8 < ((uint8_t *) rd.d_top)) {
+ b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0);
+ result = _mm_clmulepi64_si128 (a, b, 0);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+ result = _mm_xor_si128 (result, w);
+ *d8 = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+ d8++;
+ s8++;
+ }
+ }
+ gf_do_final_region_alignment(&rd);
+#endif
+}
+
+static
+void
+gf_w8_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int
+ xor)
+{
+ gf_region_data rd;
+ uint8_t *s8;
+ uint8_t *d8;
+
+#ifdef INTEL_SSE4_PCLMUL
+
+ __m128i a, b;
+ __m128i result;
+ __m128i prim_poly;
+ __m128i v, w;
+ gf_internal_t * h = gf->scratch;
+
+ prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL));
+
+ if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+ if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+ a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);
+
+ gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1);
+ gf_do_initial_region_alignment(&rd);
+
+ s8 = (uint8_t *) rd.s_start;
+ d8 = (uint8_t *) rd.d_start;
+
+ if (xor) {
+ while (d8 < ((uint8_t *) rd.d_top)) {
+ b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0);
+ result = _mm_clmulepi64_si128 (a, b, 0);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+ result = _mm_xor_si128 (result, w);
+ *d8 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));
+ d8++;
+ s8++;
+ }
+ } else {
+ while (d8 < ((uint8_t *) rd.d_top)) {
+ b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0);
+ result = _mm_clmulepi64_si128 (a, b, 0);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+ result = _mm_xor_si128 (result, w);
+ *d8 = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+ d8++;
+ s8++;
+ }
+ }
+ gf_do_final_region_alignment(&rd);
+#endif
+}
+
+static
+void
+gf_w8_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int
+ xor)
+{
+ gf_region_data rd;
+ uint8_t *s8;
+ uint8_t *d8;
+
+#ifdef INTEL_SSE4_PCLMUL
+
+ __m128i a, b;
+ __m128i result;
+ __m128i prim_poly;
+ __m128i v, w;
+ gf_internal_t * h = gf->scratch;
+
+ prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL));
+
+ if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+ if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+ a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);
+
+ gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1);
+ gf_do_initial_region_alignment(&rd);
+
+ s8 = (uint8_t *) rd.s_start;
+ d8 = (uint8_t *) rd.d_start;
+
+ if (xor) {
+ while (d8 < ((uint8_t *) rd.d_top)) {
+ b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0);
+ result = _mm_clmulepi64_si128 (a, b, 0);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+ result = _mm_xor_si128 (result, w);
+ *d8 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));
+ d8++;
+ s8++;
+ }
+ } else {
+ while (d8 < ((uint8_t *) rd.d_top)) {
+ b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0);
+ result = _mm_clmulepi64_si128 (a, b, 0);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+ result = _mm_xor_si128 (result, w);
+ w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
+ result = _mm_xor_si128 (result, w);
+ *d8 = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+ d8++;
+ s8++;
+ }
+ }
+ gf_do_final_region_alignment(&rd);
+#endif
+}
+
/* ------------------------------------------------------------
- IMPLEMENTATION: SHIFT:
+IMPLEMENTATION: SHIFT:
- JSP: The world's dumbest multiplication algorithm. I only
- include it for completeness. It does have the feature that it requires no
- extra memory.
-*/
+JSP: The world's dumbest multiplication algorithm. I only
+include it for completeness. It does have the feature that it requires no
+extra memory.
+ */
static
inline
-uint32_t
+ uint32_t
gf_w8_shift_multiply (gf_t *gf, uint32_t a8, uint32_t b8)
{
uint16_t product, i, pp, a, b;
gf_internal_t *h;
-
+
a = a8;
b = b8;
h = (gf_internal_t *) gf->scratch;
@@ -197,29 +575,55 @@ gf_w8_shift_multiply (gf_t *gf, uint32_t a8, uint32_t b8)
for (i = 0; i < GF_FIELD_WIDTH; i++) {
if (a & (1 << i)) product ^= (b << i);
}
- for (i = (GF_FIELD_WIDTH*2-1); i >= GF_FIELD_WIDTH; i--) {
+ for (i = (GF_FIELD_WIDTH*2-2); i >= GF_FIELD_WIDTH; i--) {
if (product & (1 << i)) product ^= (pp << (i-GF_FIELD_WIDTH));
}
return product;
}
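/* Editor's note (illustration only): the loop bound fix above reflects
   that the carry-free product of two degree-(w-1) polynomials has degree
   at most 2w-2, so bit 2w-1 can never be set.  Worked example in GF(2^8):
   3 * 7 = (0b11)(0b111) = 0b111 ^ 0b1110 = 0b1001 = 9; the product fits
   in w bits, so the reduction loop leaves it unchanged. */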
static
+int gf_w8_cfm_init(gf_t *gf)
+{
+ gf_internal_t *h;
+
+ h = (gf_internal_t *) gf->scratch;
+
+#ifdef INTEL_SSE4_PCLMUL
+ if ((0xe0 & h->prim_poly) == 0){
+ gf->multiply.w32 = gf_w8_clm_multiply_2;
+ gf->multiply_region.w32 = gf_w8_clm_multiply_region_from_single_2;
+ }else if ((0xc0 & h->prim_poly) == 0){
+ gf->multiply.w32 = gf_w8_clm_multiply_3;
+ gf->multiply_region.w32 = gf_w8_clm_multiply_region_from_single_3;
+ }else if ((0x80 & h->prim_poly) == 0){
+ gf->multiply.w32 = gf_w8_clm_multiply_4;
+ gf->multiply_region.w32 = gf_w8_clm_multiply_region_from_single_4;
+ }else{
+ return 0;
+ }
+ return 1;
+#endif
+
+ return 0;
+
+}
+
+static
int gf_w8_shift_init(gf_t *gf)
-{
- gf->multiply.w32 = gf_w8_shift_multiply;
- gf->inverse.w32 = gf_w8_euclid;
+{
+ gf->multiply.w32 = gf_w8_shift_multiply; /* The others will be set automatically */
return 1;
}
/* ------------------------------------------------------------
- IMPLEMENTATION: LOG_TABLE:
+IMPLEMENTATION: LOG_TABLE:
- JSP: Kevin wrote this, and I'm converting it to my structure.
- */
+JSP: Kevin wrote this, and I'm converting it to my structure.
+*/
static
inline
-uint32_t
+ uint32_t
gf_w8_logzero_multiply (gf_t *gf, uint32_t a, uint32_t b)
{
struct gf_w8_logzero_table_data *ltd;
@@ -230,7 +634,7 @@ gf_w8_logzero_multiply (gf_t *gf, uint32_t a, uint32_t b)
static
inline
-uint32_t
+ uint32_t
gf_w8_logzero_divide (gf_t *gf, uint32_t a, uint32_t b)
{
struct gf_w8_logzero_table_data *ltd;
@@ -241,7 +645,7 @@ gf_w8_logzero_divide (gf_t *gf, uint32_t a, uint32_t b)
static
inline
-uint32_t
+ uint32_t
gf_w8_logzero_small_multiply (gf_t *gf, uint32_t a, uint32_t b)
{
struct gf_w8_logzero_small_table_data *std;
@@ -253,7 +657,7 @@ gf_w8_logzero_small_multiply (gf_t *gf, uint32_t a, uint32_t b)
static
inline
-uint32_t
+ uint32_t
gf_w8_logzero_small_divide (gf_t *gf, uint32_t a, uint32_t b)
{
struct gf_w8_logzero_small_table_data *std;
@@ -264,7 +668,7 @@ gf_w8_logzero_small_divide (gf_t *gf, uint32_t a, uint32_t b)
static
inline
-uint32_t
+ uint32_t
gf_w8_log_multiply (gf_t *gf, uint32_t a, uint32_t b)
{
struct gf_w8_logtable_data *ltd;
@@ -275,7 +679,7 @@ gf_w8_log_multiply (gf_t *gf, uint32_t a, uint32_t b)
static
inline
-uint32_t
+ uint32_t
gf_w8_log_divide (gf_t *gf, uint32_t a, uint32_t b)
{
int log_sum = 0;
@@ -289,7 +693,7 @@ gf_w8_log_divide (gf_t *gf, uint32_t a, uint32_t b)
}
static
-uint32_t
+ uint32_t
gf_w8_log_inverse (gf_t *gf, uint32_t a)
{
struct gf_w8_logtable_data *ltd;
@@ -299,7 +703,7 @@ gf_w8_log_inverse (gf_t *gf, uint32_t a)
}
static
-uint32_t
+ uint32_t
gf_w8_logzero_inverse (gf_t *gf, uint32_t a)
{
struct gf_w8_logzero_table_data *ltd;
@@ -309,7 +713,7 @@ gf_w8_logzero_inverse (gf_t *gf, uint32_t a)
}
static
-uint32_t
+ uint32_t
gf_w8_logzero_small_inverse (gf_t *gf, uint32_t a)
{
struct gf_w8_logzero_small_table_data *std;
@@ -319,7 +723,7 @@ gf_w8_logzero_small_inverse (gf_t *gf, uint32_t a)
}
static
-void
+ void
gf_w8_log_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
{
int i;
@@ -348,7 +752,7 @@ gf_w8_log_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int byt
}
static
-void
+ void
gf_w8_logzero_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
{
int i;
@@ -390,7 +794,7 @@ gf_w8_logzero_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int
}
}
-static
+ static
int gf_w8_log_init(gf_t *gf)
{
gf_internal_t *h;
@@ -400,13 +804,14 @@ int gf_w8_log_init(gf_t *gf)
uint8_t *alt;
uint8_t *inv;
int i, b;
+ int check = 0;
h = (gf_internal_t *) gf->scratch;
- if (h->arg1 == 0) {
+ if (h->mult_type == GF_MULT_LOG_TABLE) {
ltd = h->private;
alt = ltd->antilog_tbl;
inv = ltd->inv_tbl;
- } else if (h->arg1 == 1) {
+ } else if (h->mult_type == GF_MULT_LOG_ZERO) {
std = h->private;
alt = std->antilog_tbl;
std->div_tbl = (alt + 255);
@@ -418,10 +823,19 @@ int gf_w8_log_init(gf_t *gf)
ztd->div_tbl = (alt + 255);
inv = ztd->inv_tbl;
}
-
- if (h->arg1 == 0) {
+
+ for (i = 0; i < GF_MULT_GROUP_SIZE+1; i++) {
+ if (h->mult_type == GF_MULT_LOG_TABLE)
+ ltd->log_tbl[i] = 0;
+ else if (h->mult_type == GF_MULT_LOG_ZERO)
+ std->log_tbl[i] = 0;
+ else
+ ztd->log_tbl[i] = 0;
+ }
+
+ if (h->mult_type == GF_MULT_LOG_TABLE) {
ltd->log_tbl[0] = 0;
- } else if (h->arg1 == 1) {
+ } else if (h->mult_type == GF_MULT_LOG_ZERO) {
std->log_tbl[0] = 510;
} else {
ztd->log_tbl[0] = 512;
@@ -429,23 +843,31 @@ int gf_w8_log_init(gf_t *gf)
b = 1;
for (i = 0; i < GF_MULT_GROUP_SIZE; i++) {
- if (h->arg1 == 0) {
- ltd->log_tbl[b] = i;
- } else if (h->arg1 == 1) {
- std->log_tbl[b] = i;
- } else {
- ztd->log_tbl[b] = i;
- }
- alt[i] = b;
- alt[i+GF_MULT_GROUP_SIZE] = b;
- b <<= 1;
- if (b & GF_FIELD_SIZE) {
- b = b ^ h->prim_poly;
- }
+ if (h->mult_type == GF_MULT_LOG_TABLE) {
+ if (ltd->log_tbl[b] != 0) check = 1;
+ ltd->log_tbl[b] = i;
+ } else if (h->mult_type == GF_MULT_LOG_ZERO) {
+ if (std->log_tbl[b] != 0) check = 1;
+ std->log_tbl[b] = i;
+ } else {
+ if (ztd->log_tbl[b] != 0) check = 1;
+ ztd->log_tbl[b] = i;
+ }
+ alt[i] = b;
+ alt[i+GF_MULT_GROUP_SIZE] = b;
+ b <<= 1;
+ if (b & GF_FIELD_SIZE) {
+ b = b ^ h->prim_poly;
+ }
}
- if (h->arg1 == 1) bzero(alt+510, 255);
+ if (check) {
+ _gf_errno = GF_E_LOGPOLY;
+ return 0;
+ }
+
+ if (h->mult_type == GF_MULT_LOG_ZERO) bzero(alt+510, 255);
- if (h->arg1 == 2) {
+ if (h->mult_type == GF_MULT_LOG_ZERO_EXT) {
bzero(alt+512, 255);
alt[512+512] = 0;
}
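/* Editor's sketch (not part of the patch): the new `check` logic rejects
   polynomials that are not primitive.  A log table needs x to generate
   all 255 nonzero elements; under a merely irreducible or reducible
   polynomial the sequence 1, x, x^2, ... closes early and revisits a
   slot whose log entry was already set.  A standalone check in the same
   spirit: */

static int
prim_poly_is_primitive_w8_sketch(uint32_t prim_poly) /* 9 bits, e.g. 0x11b */
{
  uint32_t b = 1;
  int i;

  for (i = 0; i < 255; i++) {
    b <<= 1;
    if (b & 0x100) b ^= prim_poly;
    if (b == 1 && i != 254) return 0;  /* cycle closed early */
  }
  return (b == 1);
}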
@@ -459,13 +881,13 @@ int gf_w8_log_init(gf_t *gf)
if (i & (1 << 8)) i ^= h->prim_poly;
b--;
} while (i != 1);
-
- if (h->arg1 == 0) {
+
+ if (h->mult_type == GF_MULT_LOG_TABLE) {
gf->inverse.w32 = gf_w8_log_inverse;
gf->divide.w32 = gf_w8_log_divide;
gf->multiply.w32 = gf_w8_log_multiply;
gf->multiply_region.w32 = gf_w8_log_multiply_region;
- } else if (h->arg1 == 1) {
+ } else if (h->mult_type == GF_MULT_LOG_ZERO) {
gf->inverse.w32 = gf_w8_logzero_small_inverse;
gf->divide.w32 = gf_w8_logzero_small_divide;
gf->multiply.w32 = gf_w8_logzero_small_multiply;
@@ -480,13 +902,13 @@ int gf_w8_log_init(gf_t *gf)
}
/* ------------------------------------------------------------
- IMPLEMENTATION: FULL_TABLE:
+IMPLEMENTATION: FULL_TABLE:
- JSP: Kevin wrote this, and I'm converting it to my structure.
+JSP: Kevin wrote this, and I'm converting it to my structure.
*/
static
-gf_val_32_t
+ gf_val_32_t
gf_w8_table_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
struct gf_w8_single_table_data *ftd;
@@ -496,7 +918,7 @@ gf_w8_table_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
}
static
-gf_val_32_t
+ gf_val_32_t
gf_w8_table_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
struct gf_w8_single_table_data *ftd;
@@ -506,7 +928,7 @@ gf_w8_table_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
}
static
-gf_val_32_t
+ gf_val_32_t
gf_w8_default_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
struct gf_w8_default_data *ftd;
@@ -516,7 +938,7 @@ gf_w8_default_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
}
static
-gf_val_32_t
+ gf_val_32_t
gf_w8_default_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
struct gf_w8_default_data *ftd;
@@ -526,7 +948,7 @@ gf_w8_default_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
}
static
-gf_val_32_t
+ gf_val_32_t
gf_w8_double_table_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
struct gf_w8_double_table_data *ftd;
@@ -536,7 +958,7 @@ gf_w8_double_table_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
}
static
-gf_val_32_t
+ gf_val_32_t
gf_w8_double_table_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
struct gf_w8_double_table_data *ftd;
@@ -546,7 +968,7 @@ gf_w8_double_table_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
}
static
-void
+ void
gf_w8_double_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
uint16_t *base;
@@ -570,7 +992,7 @@ gf_w8_double_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t
base[(b << 8)| c] = (vb | vc);
}
}
-
+
} else {
dtd = (struct gf_w8_double_table_data *) h->private;
base = &(dtd->mult[val][0]);
@@ -583,7 +1005,7 @@ gf_w8_double_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t
}
static
-gf_val_32_t
+ gf_val_32_t
gf_w8_double_table_lazy_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
struct gf_w8_double_table_lazy_data *ftd;
@@ -593,7 +1015,7 @@ gf_w8_double_table_lazy_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
}
static
-gf_val_32_t
+ gf_val_32_t
gf_w8_double_table_lazy_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
struct gf_w8_double_table_lazy_data *ftd;
@@ -603,7 +1025,7 @@ gf_w8_double_table_lazy_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
}
static
-void
+ void
gf_w8_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
int i;
@@ -628,11 +1050,12 @@ gf_w8_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, in
}
}
}
+
static
-void
+ void
gf_w8_split_multiply_region_sse(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
-#ifdef INTEL_SSE4
+#ifdef INTEL_SSSE3
uint8_t *s8, *d8, *bh, *bl, *sptr, *dptr, *top;
__m128i tbl, loset, t1, r, va, mth, mtl;
uint64_t altable[4];
@@ -654,7 +1077,7 @@ gf_w8_split_multiply_region_sse(gf_t *gf, void *src, void *dest, gf_val_32_t val
sptr = rd.s_start;
dptr = rd.d_start;
-
+
mth = _mm_loadu_si128 ((__m128i *)(bh));
mtl = _mm_loadu_si128 ((__m128i *)(bl));
loset = _mm_set1_epi8 (0x0f);
@@ -693,11 +1116,11 @@ gf_w8_split_multiply_region_sse(gf_t *gf, void *src, void *dest, gf_val_32_t val
/* ------------------------------------------------------------
- IMPLEMENTATION: FULL_TABLE:
+IMPLEMENTATION: FULL_TABLE:
*/
static
-gf_val_32_t
+ gf_val_32_t
gf_w8_split_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
struct gf_w8_half_table_data *htd;
@@ -707,7 +1130,7 @@ gf_w8_split_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
}
static
-void
+ void
gf_w8_split_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
unsigned long uls, uld;
@@ -735,12 +1158,12 @@ gf_w8_split_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, in
}
-static
+ static
int gf_w8_split_init(gf_t *gf)
{
gf_internal_t *h;
struct gf_w8_half_table_data *htd;
- int a, b, c, d, pp;
+ int a, b, pp;
h = (gf_internal_t *) gf->scratch;
htd = (struct gf_w8_half_table_data *)h->private;
@@ -748,34 +1171,34 @@ int gf_w8_split_init(gf_t *gf)
bzero(htd->high, sizeof(uint8_t)*GF_FIELD_SIZE*GF_HALF_SIZE);
bzero(htd->low, sizeof(uint8_t)*GF_FIELD_SIZE*GF_HALF_SIZE);
-
- for (a = 1; a < GF_HALF_SIZE; a++) {
- b = 1;
- c = a;
- d = (a << (GF_FIELD_WIDTH/2));
- do {
- htd->low[b][a] = c;
- htd->high[b][a] = d;
- b <<= 1;
- if (b & GF_FIELD_SIZE) b ^= pp;
- c <<= 1;
- if (c & GF_FIELD_SIZE) c ^= pp;
- d <<= 1;
- if (d & GF_FIELD_SIZE) d ^= pp;
- } while (c != a);
+
+ for (a = 1; a < GF_FIELD_SIZE; a++) {
+ for (b = 1; b < GF_HALF_SIZE; b++) {
+ htd->low[a][b] = gf_w8_shift_multiply(gf,a,b);
+ htd->high[a][b] = gf_w8_shift_multiply(gf,a,b<<4);
+ }
}
- gf->inverse.w32 = NULL; /* Will set from divide */
- gf->divide.w32 = NULL; /* Let the user figure it out. */
gf->multiply.w32 = gf_w8_split_multiply;
- if (h->region_type == GF_REGION_NOSSE) {
+
+ #ifdef INTEL_SSSE3
+ if (h->region_type & GF_REGION_NOSSE)
+ gf->multiply_region.w32 = gf_w8_split_multiply_region;
+ else
+ gf->multiply_region.w32 = gf_w8_split_multiply_region_sse;
+ #else
gf->multiply_region.w32 = gf_w8_split_multiply_region;
- } else {
- gf->multiply_region.w32 = gf_w8_split_multiply_region_sse;
- }
+ if(h->region_type & GF_REGION_SSE)
+ return 0;
+ #endif
+
return 1;
}
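/* Editor's sketch (not part of the patch): with the tables rebuilt above,
   low[a][b] = a*b for a low nibble b and high[a][b] = a*(b << 4) for a
   high nibble, so a full 8-bit product is two loads and one xor: */

static inline uint8_t
split_mult_sketch(struct gf_w8_half_table_data *htd, uint8_t a, uint8_t b)
{
  return htd->low[a][b & 0x0f] ^ htd->high[a][b >> 4];
}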
+/* JSP: This is disgusting, but it is what it is. If there is no SSE,
+ then the default is equivalent to single table. If there is SSE, then
+ we use the "gf_w8_default_data" which is a hybrid of SPLIT & TABLE. */
+
static
int gf_w8_table_init(gf_t *gf)
{
@@ -784,19 +1207,24 @@ int gf_w8_table_init(gf_t *gf)
struct gf_w8_double_table_data *dtd = NULL;
struct gf_w8_double_table_lazy_data *ltd = NULL;
struct gf_w8_default_data *dd = NULL;
- int a, b, c, prod, scase;
+ int a, b, c, prod, scase, issse;
h = (gf_internal_t *) gf->scratch;
- if (h->mult_type == GF_MULT_DEFAULT) {
+ issse = 0;
+#ifdef INTEL_SSSE3
+ issse = 1;
+#endif
+
+ if (h->mult_type == GF_MULT_DEFAULT && issse) {
dd = (struct gf_w8_default_data *)h->private;
scase = 3;
bzero(dd->high, sizeof(uint8_t) * GF_FIELD_SIZE * GF_HALF_SIZE);
bzero(dd->low, sizeof(uint8_t) * GF_FIELD_SIZE * GF_HALF_SIZE);
bzero(dd->divtable, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
bzero(dd->multtable, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
- } else if (h->region_type == 0 || (h->region_type & GF_REGION_CAUCHY) ||
- (h->region_type & GF_REGION_SINGLE_TABLE)) {
+ } else if (h->mult_type == GF_MULT_DEFAULT ||
+ h->region_type == 0 || (h->region_type & GF_REGION_CAUCHY)) {
ftd = (struct gf_w8_single_table_data *)h->private;
bzero(ftd->divtable, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
bzero(ftd->multtable, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
@@ -815,93 +1243,98 @@ int gf_w8_table_init(gf_t *gf)
fprintf(stderr, "Internal error in gf_w8_table_init\n");
exit(0);
}
-
+
for (a = 1; a < GF_FIELD_SIZE; a++) {
- b = 1;
- prod = a;
- do {
+ for (b = 1; b < GF_FIELD_SIZE; b++) {
+ prod = gf_w8_shift_multiply(gf,a,b);
switch (scase) {
- case 0:
- ftd->multtable[a][b] = prod;
- ftd->divtable[prod][b] = a;
- break;
- case 1:
- dtd->div[prod][b] = a;
- for (c = 0; c < GF_FIELD_SIZE; c++) {
- dtd->mult[a][(c<<8)|b] |= prod;
- dtd->mult[a][(b<<8)|c] |= (prod<<8);
- }
- break;
- case 2:
- ltd->div[prod][b] = a;
- ltd->smult[a][b] = prod;
- break;
- case 3:
- dd->multtable[a][b] = prod;
- dd->divtable[prod][b] = a;
- if ((b & 0xf) == b) dd->low[a][b] = prod;
- if ((b & 0xf0) == b) dd->high[a][b>>4] = prod;
- break;
+ case 0:
+ ftd->multtable[a][b] = prod;
+ ftd->divtable[prod][b] = a;
+ break;
+ case 1:
+ dtd->div[prod][b] = a;
+ for (c = 0; c < GF_FIELD_SIZE; c++) {
+ dtd->mult[a][(c<<8)|b] |= prod;
+ dtd->mult[a][(b<<8)|c] |= (prod<<8);
+ }
+ break;
+ case 2:
+ ltd->div[prod][b] = a;
+ ltd->smult[a][b] = prod;
+ break;
+ case 3:
+ dd->multtable[a][b] = prod;
+ dd->divtable[prod][b] = a;
+ if ((b & 0xf) == b) { dd->low[a][b] = prod; }
+ if ((b & 0xf0) == b) { dd->high[a][b>>4] = prod; }
+ break;
}
- b <<= 1;
- if (b & GF_FIELD_SIZE) b = b ^ h->prim_poly;
- prod <<= 1;
- if (prod & GF_FIELD_SIZE) prod = prod ^ h->prim_poly;
-
- } while (b != 1);
+ }
}
gf->inverse.w32 = NULL; /* Will set from divide */
switch (scase) {
- case 0:
- gf->divide.w32 = gf_w8_table_divide;
- gf->multiply.w32 = gf_w8_table_multiply;
- gf->multiply_region.w32 = gf_w8_table_multiply_region;
- break;
- case 1:
- gf->divide.w32 = gf_w8_double_table_divide;
- gf->multiply.w32 = gf_w8_double_table_multiply;
- gf->multiply_region.w32 = gf_w8_double_table_multiply_region;
- break;
- case 2:
- gf->divide.w32 = gf_w8_double_table_lazy_divide;
- gf->multiply.w32 = gf_w8_double_table_lazy_multiply;
- gf->multiply_region.w32 = gf_w8_double_table_multiply_region;
- break;
- case 3:
- gf->divide.w32 = gf_w8_default_divide;
- gf->multiply.w32 = gf_w8_default_multiply;
- gf->multiply_region.w32 = gf_w8_split_multiply_region;
-#ifdef INTEL_SSE4
- gf->multiply_region.w32 = gf_w8_split_multiply_region_sse;
+ case 0:
+ gf->divide.w32 = gf_w8_table_divide;
+ gf->multiply.w32 = gf_w8_table_multiply;
+ gf->multiply_region.w32 = gf_w8_table_multiply_region;
+ break;
+ case 1:
+ gf->divide.w32 = gf_w8_double_table_divide;
+ gf->multiply.w32 = gf_w8_double_table_multiply;
+ gf->multiply_region.w32 = gf_w8_double_table_multiply_region;
+ break;
+ case 2:
+ gf->divide.w32 = gf_w8_double_table_lazy_divide;
+ gf->multiply.w32 = gf_w8_double_table_lazy_multiply;
+ gf->multiply_region.w32 = gf_w8_double_table_multiply_region;
+ break;
+ case 3:
+#ifdef INTEL_SSSE3
+ gf->divide.w32 = gf_w8_default_divide;
+ gf->multiply.w32 = gf_w8_default_multiply;
+ gf->multiply_region.w32 = gf_w8_split_multiply_region_sse;
#endif
- break;
+ break;
}
return 1;
}
static
-void
+ void
gf_w8_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
gf_internal_t *h = (gf_internal_t *) gf->scratch;
gf_t *base_gf = h->base_gf;
uint8_t val0 = val & 0x0f;
uint8_t val1 = (val & 0xf0) >> 4;
- int sub_reg_size = bytes / 2;
+ gf_region_data rd;
+ int sub_reg_size;
+
+ if (val == 0) {
+ if (xor) return;
+ bzero(dest, bytes);
+ return;
+ }
+
+ gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
+ gf_do_initial_region_alignment(&rd);
- if (bytes % 2 != 0) gf_alignment_error("gf_w8_composite_multiply_region_alt", 1);
+ sub_reg_size = (rd.d_top - rd.d_start) / 2;
- base_gf->multiply_region.w32(base_gf, src, dest, val0, sub_reg_size, xor);
- base_gf->multiply_region.w32(base_gf, src+sub_reg_size, dest, val1, sub_reg_size, 1);
- base_gf->multiply_region.w32(base_gf, src, dest+sub_reg_size, val1, sub_reg_size, xor);
- base_gf->multiply_region.w32(base_gf, src+sub_reg_size, dest+sub_reg_size, val0, sub_reg_size, 1);
- base_gf->multiply_region.w32(base_gf, src+sub_reg_size, dest+sub_reg_size, base_gf->multiply.w32(base_gf, GF_S_GF_4_2, val1), sub_reg_size, 1);
+ base_gf->multiply_region.w32(base_gf, rd.s_start, rd.d_start, val0, sub_reg_size, xor);
+ base_gf->multiply_region.w32(base_gf, rd.s_start+sub_reg_size, rd.d_start, val1, sub_reg_size, 1);
+ base_gf->multiply_region.w32(base_gf, rd.s_start, rd.d_start+sub_reg_size, val1, sub_reg_size, xor);
+ base_gf->multiply_region.w32(base_gf, rd.s_start+sub_reg_size, rd.d_start+sub_reg_size, val0, sub_reg_size, 1);
+ base_gf->multiply_region.w32(base_gf, rd.s_start+sub_reg_size, rd.d_start+sub_reg_size, base_gf->multiply.w32(base_gf, h->prim_poly, val1), sub_reg_size, 1);
+
+ gf_do_final_region_alignment(&rd);
}
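/* Editor's note (illustration only): in ALTMAP layout the low nibbles of
   the region occupy its first half and the high nibbles its second half,
   so the composite product (lo = a0b0 ^ a1b1, hi = a1b0 ^ a0b1 ^ s*a1b1,
   with s = h->prim_poly) is formed by the five whole-region base-field
   multiplies above instead of per-byte arithmetic. */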
static
gf_val_32_t
-gf_w8_composite_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+gf_w8_composite_multiply_recursive(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
gf_internal_t *h = (gf_internal_t *) gf->scratch;
gf_t *base_gf = h->base_gf;
@@ -912,8 +1345,35 @@ gf_w8_composite_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
uint8_t a1b1;
a1b1 = base_gf->multiply.w32(base_gf, a1, b1);
-
- return ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, GF_S_GF_4_2)) << 4));
+
+ return ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
+ ((base_gf->multiply.w32(base_gf, a1, b0) ^
+ base_gf->multiply.w32(base_gf, a0, b1) ^
+ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 4));
+}
+
+static
+gf_val_32_t
+gf_w8_composite_multiply_inline(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+ gf_internal_t *h = (gf_internal_t *) gf->scratch;
+ gf_t *base_gf = h->base_gf;
+ uint8_t b0 = b & 0x0f;
+ uint8_t b1 = (b & 0xf0) >> 4;
+ uint8_t a0 = a & 0x0f;
+ uint8_t a1 = (a & 0xf0) >> 4;
+ uint8_t a1b1, *mt;
+ struct gf_w8_composite_data *cd;
+
+ cd = (struct gf_w8_composite_data *) h->private;
+ mt = cd->mult_table;
+
+ a1b1 = GF_W4_INLINE_MULTDIV(mt, a1, b1);
+
+ return ((GF_W4_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) |
+ ((GF_W4_INLINE_MULTDIV(mt, a1, b0) ^
+ GF_W4_INLINE_MULTDIV(mt, a0, b1) ^
+ GF_W4_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 4));
}
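/* Editor's note (illustration only): the recursive and inline variants
   differ only in how the five 4-bit products per byte are obtained.
   When gf_w4_get_mult_table() can hand back the base field's 16x16
   multiplication table, GF_W4_INLINE_MULTDIV resolves each product with
   a single load (assumed here to be a flat mt[(a << 4) | b] lookup);
   otherwise each product is an indirect call through
   base_gf->multiply.w32(), which is what the recursive path above does. */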
/*
@@ -938,6 +1398,7 @@ gf_w8_composite_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
*
* a / b = a * c
*/
+
static
gf_val_32_t
gf_w8_composite_inverse(gf_t *gf, gf_val_32_t a)
@@ -949,10 +1410,9 @@ gf_w8_composite_inverse(gf_t *gf, gf_val_32_t a)
uint8_t c0, c1, c, d, tmp;
uint8_t a0inv, a1inv;
-
if (a0 == 0) {
a1inv = base_gf->inverse.w32(base_gf, a1) & 0xf;
- c0 = base_gf->multiply.w32(base_gf, a1inv, GF_S_GF_4_2);
+ c0 = base_gf->multiply.w32(base_gf, a1inv, h->prim_poly);
c1 = a1inv;
} else if (a1 == 0) {
c0 = base_gf->inverse.w32(base_gf, a0);
@@ -963,99 +1423,36 @@ gf_w8_composite_inverse(gf_t *gf, gf_val_32_t a)
d = base_gf->multiply.w32(base_gf, a1, a0inv) & 0xf;
- tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ GF_S_GF_4_2) & 0xf;
+ tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ h->prim_poly) & 0xf;
tmp = base_gf->inverse.w32(base_gf, tmp) & 0xf;
d = base_gf->multiply.w32(base_gf, d, tmp) & 0xf;
-
+
c0 = base_gf->multiply.w32(base_gf, (d^1), a0inv) & 0xf;
c1 = base_gf->multiply.w32(base_gf, d, a1inv) & 0xf;
}
c = c0 | (c1 << 4);
-
- return c;
-}
-
-static
-gf_val_32_t
-gf_w8_composite_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
-{
- gf_val_32_t binv;
- binv = gf_w8_composite_inverse(gf, b);
-
- return gf_w8_composite_multiply(gf, a, binv);
+ return c;
}
static
void
gf_w8_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
- unsigned long uls, uld;
- gf_internal_t *h = (gf_internal_t *) gf->scratch;
- gf_t *base_gf = h->base_gf;
- int i=0;
- struct gf_w4_single_table_data * std;
- uint8_t b0 = val & 0x0f;
- uint8_t b1 = (val & 0xf0) >> 4;
- uint8_t *s8 = (uint8_t *) src;
- uint8_t *d8 = (uint8_t *) dest;
- uint8_t a0, a1, a1b1;
-
- uls = ((unsigned long) src) & 0xf;
- uld = ((unsigned long) dest) & 0xf;
- if ((uls & 0x7) != (uld & 0x7)) gf_alignment_error("gf_w8_composite_multiply_region", 1);
-
- if (val == 0) {
- if (xor) return;
- bzero(dest, bytes);
- return;
- }
-
- std = (struct gf_w4_single_table_data *) h->private;
-
- if (xor) {
- for (i = 0;i < bytes; i++) {
- a0 = s8[i] & 0x0f;
- a1 = (s8[i] & 0xf0) >> 4;
- a1b1 = std->mult[a1][b1];
-
- d8[i] ^= ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
- ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, GF_S_GF_4_2)) << 4));
-
- }
- } else {
- for (i = 0;i < bytes; i++) {
- a0 = s8[i] & 0x0f;
- a1 = (s8[i] & 0xf0) >> 4;
- a1b1 = std->mult[a1][b1];
-
- d8[i] = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
- ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, GF_S_GF_4_2)) << 4));
- }
- }
- return;
-}
-
-static
-void
-gf_w8_composite_multiply_region_table(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
-{
- unsigned long uls, uld;
+ gf_region_data rd;
gf_internal_t *h = (gf_internal_t *) gf->scratch;
gf_t *base_gf = h->base_gf;
- int i=0;
- struct gf_w4_single_table_data * std;
uint8_t b0 = val & 0x0f;
uint8_t b1 = (val & 0xf0) >> 4;
- uint8_t *s8 = (uint8_t *) src;
- uint8_t *d8 = (uint8_t *) dest;
+ uint8_t *s8;
+ uint8_t *d8;
+ uint8_t *mt;
uint8_t a0, a1, a1b1;
+ struct gf_w8_composite_data *cd;
- uls = ((unsigned long) src) & 0xf;
- uld = ((unsigned long) dest) & 0xf;
- if ((uls & 0x7) != (uld & 0x7)) gf_alignment_error("gf_w8_composite_multiply_region", 1);
+ cd = (struct gf_w8_composite_data *) h->private;
if (val == 0) {
if (xor) return;
@@ -1063,74 +1460,115 @@ gf_w8_composite_multiply_region_table(gf_t *gf, void *src, void *dest, gf_val_32
return;
}
- std = (struct gf_w4_single_table_data *) h->private;
-
- if (xor) {
- for (i = 0;i < bytes; i++) {
- a0 = s8[i] & 0x0f;
- a1 = (s8[i] & 0xf0) >> 4;
- a1b1 = std->mult[a1][b1];
+ gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1);
+ gf_do_initial_region_alignment(&rd);
+
+
+ s8 = (uint8_t *) rd.s_start;
+ d8 = (uint8_t *) rd.d_start;
- d8[i] ^= ((std->mult[a0][b0] ^ a1b1) | ((std->mult[a1][b0] ^ std->mult[a0][b1] ^ std->mult[a1b1][GF_S_GF_4_2]) << 4));
-
+ mt = cd->mult_table;
+ if (mt == NULL) {
+ if (xor) {
+ while (d8 < (uint8_t *) rd.d_top) {
+ a0 = *s8 & 0x0f;
+ a1 = (*s8 & 0xf0) >> 4;
+ a1b1 = base_gf->multiply.w32(base_gf, a1, b1);
+
+ *d8 ^= ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
+ ((base_gf->multiply.w32(base_gf, a1, b0) ^
+ base_gf->multiply.w32(base_gf, a0, b1) ^
+ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 4));
+ s8++;
+ d8++;
+ }
+ } else {
+ while (d8 < (uint8_t *) rd.d_top) {
+ a0 = *s8 & 0x0f;
+ a1 = (*s8 & 0xf0) >> 4;
+ a1b1 = base_gf->multiply.w32(base_gf, a1, b1);
+
+ *d8 = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
+ ((base_gf->multiply.w32(base_gf, a1, b0) ^
+ base_gf->multiply.w32(base_gf, a0, b1) ^
+ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 4));
+ s8++;
+ d8++;
+ }
}
} else {
- for (i = 0;i < bytes; i++) {
- a0 = s8[i] & 0x0f;
- a1 = (s8[i] & 0xf0) >> 4;
- a1b1 = std->mult[a1][b1];
-
- d8[i] = ((std->mult[a0][b0] ^ a1b1) | ((std->mult[a1][b0] ^ std->mult[a0][b1] ^ std->mult[a1b1][GF_S_GF_4_2]) << 4));
+ if (xor) {
+ while (d8 < (uint8_t *) rd.d_top) {
+ a0 = *s8 & 0x0f;
+ a1 = (*s8 & 0xf0) >> 4;
+ a1b1 = GF_W4_INLINE_MULTDIV(mt, a1, b1);
+
+ *d8 ^= ((GF_W4_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) |
+ ((GF_W4_INLINE_MULTDIV(mt, a1, b0) ^
+ GF_W4_INLINE_MULTDIV(mt, a0, b1) ^
+ GF_W4_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 4));
+ s8++;
+ d8++;
+ }
+ } else {
+ while (d8 < (uint8_t *) rd.d_top) {
+ a0 = *s8 & 0x0f;
+ a1 = (*s8 & 0xf0) >> 4;
+ a1b1 = GF_W4_INLINE_MULTDIV(mt, a1, b1);
+
+ *d8 = ((GF_W4_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) |
+ ((GF_W4_INLINE_MULTDIV(mt, a1, b0) ^
+ GF_W4_INLINE_MULTDIV(mt, a0, b1) ^
+ GF_W4_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 4));
+ s8++;
+ d8++;
+ }
}
}
+ gf_do_final_region_alignment(&rd);
return;
}
static
int gf_w8_composite_init(gf_t *gf)
{
- struct gf_w4_single_table_data * std;
gf_internal_t *h = (gf_internal_t *) gf->scratch;
- gf_t *base_gf = h->base_gf;
- uint8_t a, b;
+ struct gf_w8_composite_data *cd;
- std = (struct gf_w4_single_table_data *) h->private;
+ if (h->base_gf == NULL) return 0;
+
+ cd = (struct gf_w8_composite_data *) h->private;
+ cd->mult_table = gf_w4_get_mult_table(h->base_gf);
- for (a = 0; a < 16; a++) {
- for (b = 0; b < 16; b++) {
- std->mult[a][b] = base_gf->multiply.w32(base_gf, a, b);
- }
- }
-
if (h->region_type & GF_REGION_ALTMAP) {
gf->multiply_region.w32 = gf_w8_composite_multiply_region_alt;
} else {
- if (h->region_type & GF_REGION_SINGLE_TABLE) {
- gf->multiply_region.w32 = gf_w8_composite_multiply_region_table;
- } else {
- gf->multiply_region.w32 = gf_w8_composite_multiply_region;
- }
+ gf->multiply_region.w32 = gf_w8_composite_multiply_region;
}
- gf->multiply.w32 = gf_w8_composite_multiply;
- gf->divide.w32 = gf_w8_composite_divide;
+ if (cd->mult_table == NULL) {
+ gf->multiply.w32 = gf_w8_composite_multiply_recursive;
+ } else {
+ gf->multiply.w32 = gf_w8_composite_multiply_inline;
+ }
+ gf->divide.w32 = NULL;
gf->inverse.w32 = gf_w8_composite_inverse;
-
+
return 1;
}
static
inline
-gf_val_32_t
+ gf_val_32_t
gf_w8_bytwo_p_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
uint32_t prod, pp, pmask, amask;
gf_internal_t *h;
-
+
h = (gf_internal_t *) gf->scratch;
pp = h->prim_poly;
-
+
prod = 0;
pmask = 0x80;
amask = 0x80;
@@ -1149,12 +1587,12 @@ gf_w8_bytwo_p_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
static
inline
-gf_val_32_t
+ gf_val_32_t
gf_w8_bytwo_b_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
uint32_t prod, pp, bmask;
gf_internal_t *h;
-
+
h = (gf_internal_t *) gf->scratch;
pp = h->prim_poly;
@@ -1174,13 +1612,13 @@ gf_w8_bytwo_b_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
}
static
-void
+ void
gf_w8_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
uint64_t *s64, *d64, t1, t2, ta, prod, amask;
gf_region_data rd;
struct gf_w8_bytwo_data *btd;
-
+
if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
@@ -1225,18 +1663,18 @@ gf_w8_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t
}
#define BYTWO_P_ONESTEP {\
- SSE_AB2(pp, m1 ,m2, prod, t1, t2); \
- t1 = _mm_and_si128(v, one); \
- t1 = _mm_sub_epi8(t1, one); \
- t1 = _mm_and_si128(t1, ta); \
- prod = _mm_xor_si128(prod, t1); \
- v = _mm_srli_epi64(v, 1); }
+ SSE_AB2(pp, m1 ,m2, prod, t1, t2); \
+ t1 = _mm_and_si128(v, one); \
+ t1 = _mm_sub_epi8(t1, one); \
+ t1 = _mm_and_si128(t1, ta); \
+ prod = _mm_xor_si128(prod, t1); \
+ v = _mm_srli_epi64(v, 1); }
static
-void
+ void
gf_w8_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
-#ifdef INTEL_SSE4
+#ifdef INTEL_SSE2
int i;
uint8_t *s8, *d8;
uint8_t vrev;
@@ -1244,7 +1682,7 @@ gf_w8_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v
__m128i pp, m1, m2, ta, prod, t1, t2, tp, one, v;
struct gf_w8_bytwo_data *btd;
gf_region_data rd;
-
+
if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
@@ -1289,10 +1727,10 @@ gf_w8_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v
}
static
-void
+ void
gf_w8_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w8_bytwo_data *btd)
{
-#ifdef INTEL_SSE4
+#ifdef INTEL_SSE2
int i;
uint8_t *d8, *s8, tb;
__m128i pp, m1, m2, t1, t2, va, vb;
@@ -1315,10 +1753,10 @@ gf_w8_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w8_bytwo_data *bt
}
static
-void
+ void
gf_w8_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w8_bytwo_data *btd)
{
-#ifdef INTEL_SSE4
+#ifdef INTEL_SSE2
int i;
uint8_t *d8, *s8, tb;
__m128i pp, m1, m2, t1, t2, va, vb;
@@ -1344,16 +1782,16 @@ gf_w8_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w8_bytwo_data *btd)
static
-void
+ void
gf_w8_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
-#ifdef INTEL_SSE4
+#ifdef INTEL_SSE2
int itb;
uint8_t *d8, *s8;
__m128i pp, m1, m2, t1, t2, va, vb;
struct gf_w8_bytwo_data *btd;
gf_region_data rd;
-
+
if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
@@ -1399,7 +1837,7 @@ gf_w8_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v
}
static
-void
+ void
gf_w8_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
int i;
@@ -1419,349 +1857,349 @@ gf_w8_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t
d64 = (uint64_t *) rd.d_start;
switch (val) {
- case 2:
- if (xor) {
- while (d64 < (uint64_t *) rd.d_top) {
- ta = *s64;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- *d64 ^= ta;
- d64++;
- s64++;
- }
- } else {
- while (d64 < (uint64_t *) rd.d_top) {
- ta = *s64;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- *d64 = ta;
- d64++;
- s64++;
- }
- }
- break;
- case 3:
- if (xor) {
- while (d64 < (uint64_t *) rd.d_top) {
- ta = *s64;
- prod = ta;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- *d64 ^= (ta ^ prod);
- d64++;
- s64++;
- }
- } else {
- while (d64 < (uint64_t *) rd.d_top) {
- ta = *s64;
- prod = ta;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- *d64 = (ta ^ prod);
- d64++;
- s64++;
- }
- }
- break;
- case 4:
- if (xor) {
- while (d64 < (uint64_t *) rd.d_top) {
- ta = *s64;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- *d64 ^= ta;
- d64++;
- s64++;
- }
- } else {
- while (d64 < (uint64_t *) rd.d_top) {
- ta = *s64;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- *d64 = ta;
- d64++;
- s64++;
- }
- }
- break;
- case 5:
- if (xor) {
- while (d64 < (uint64_t *) rd.d_top) {
- ta = *s64;
- prod = ta;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- *d64 ^= (ta ^ prod);
- d64++;
- s64++;
- }
- } else {
- while (d64 < (uint64_t *) rd.d_top) {
- ta = *s64;
- prod = ta;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- *d64 = ta ^ prod;
- d64++;
- s64++;
+ case 2:
+ if (xor) {
+ while (d64 < (uint64_t *) rd.d_top) {
+ ta = *s64;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ *d64 ^= ta;
+ d64++;
+ s64++;
+ }
+ } else {
+ while (d64 < (uint64_t *) rd.d_top) {
+ ta = *s64;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ *d64 = ta;
+ d64++;
+ s64++;
+ }
}
- }
- case 6:
- if (xor) {
- while (d64 < (uint64_t *) rd.d_top) {
- ta = *s64;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- prod = ta;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- *d64 ^= (ta ^ prod);
- d64++;
- s64++;
+ break;
+ case 3:
+ if (xor) {
+ while (d64 < (uint64_t *) rd.d_top) {
+ ta = *s64;
+ prod = ta;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ *d64 ^= (ta ^ prod);
+ d64++;
+ s64++;
+ }
+ } else {
+ while (d64 < (uint64_t *) rd.d_top) {
+ ta = *s64;
+ prod = ta;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ *d64 = (ta ^ prod);
+ d64++;
+ s64++;
+ }
}
- } else {
- while (d64 < (uint64_t *) rd.d_top) {
- ta = *s64;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- prod = ta;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- *d64 = ta ^ prod;
- d64++;
- s64++;
+ break;
+ case 4:
+ if (xor) {
+ while (d64 < (uint64_t *) rd.d_top) {
+ ta = *s64;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ *d64 ^= ta;
+ d64++;
+ s64++;
+ }
+ } else {
+ while (d64 < (uint64_t *) rd.d_top) {
+ ta = *s64;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ *d64 = ta;
+ d64++;
+ s64++;
+ }
}
- }
-/*
- case 7:
- if (xor) {
- while (d64 < (uint64_t *) rd.d_top) {
- ta = *s64;
- prod = ta;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- prod ^= ta;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- *d64 ^= (ta ^ prod);
- d64++;
- s64++;
+ break;
+ case 5:
+ if (xor) {
+ while (d64 < (uint64_t *) rd.d_top) {
+ ta = *s64;
+ prod = ta;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ *d64 ^= (ta ^ prod);
+ d64++;
+ s64++;
+ }
+ } else {
+ while (d64 < (uint64_t *) rd.d_top) {
+ ta = *s64;
+ prod = ta;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ *d64 = ta ^ prod;
+ d64++;
+ s64++;
+ }
}
- } else {
- while (d64 < (uint64_t *) rd.d_top) {
- ta = *s64;
- prod = ta;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- prod ^= ta;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- *d64 = ta ^ prod;
- d64++;
- s64++;
+ case 6:
+ if (xor) {
+ while (d64 < (uint64_t *) rd.d_top) {
+ ta = *s64;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ prod = ta;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ *d64 ^= (ta ^ prod);
+ d64++;
+ s64++;
+ }
+ } else {
+ while (d64 < (uint64_t *) rd.d_top) {
+ ta = *s64;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ prod = ta;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ *d64 = ta ^ prod;
+ d64++;
+ s64++;
+ }
}
+ break;
- }
- break;
- */
- case 8:
- if (xor) {
- while (d64 < (uint64_t *) rd.d_top) {
- ta = *s64;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- *d64 ^= ta;
- d64++;
- s64++;
+ /*
+ case 7:
+ if (xor) {
+ while (d64 < (uint64_t *) rd.d_top) {
+ ta = *s64;
+ prod = ta;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ prod ^= ta;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ *d64 ^= (ta ^ prod);
+ d64++;
+ s64++;
+ }
+ } else {
+ while (d64 < (uint64_t *) rd.d_top) {
+ ta = *s64;
+ prod = ta;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ prod ^= ta;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ *d64 = ta ^ prod;
+ d64++;
+ s64++;
+ }
+ }
+ break;
+ */
+ case 8:
+ if (xor) {
+ while (d64 < (uint64_t *) rd.d_top) {
+ ta = *s64;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ *d64 ^= ta;
+ d64++;
+ s64++;
+ }
+ } else {
+ while (d64 < (uint64_t *) rd.d_top) {
+ ta = *s64;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ *d64 = ta;
+ d64++;
+ s64++;
+ }
}
- } else {
- while (d64 < (uint64_t *) rd.d_top) {
- ta = *s64;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- *d64 = ta;
- d64++;
- s64++;
+ break;
+ /*
+ case 9:
+ if (xor) {
+ while (d64 < (uint64_t *) rd.d_top) {
+ ta = *s64;
+ prod = ta;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ *d64 ^= (ta ^ prod);
+ d64++;
+ s64++;
+ }
+ } else {
+ while (d64 < (uint64_t *) rd.d_top) {
+ ta = *s64;
+ prod = ta;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ *d64 = (ta ^ prod);
+ d64++;
+ s64++;
+ }
+ }
+ break;
+ case 10:
+ if (xor) {
+ while (d64 < (uint64_t *) rd.d_top) {
+ ta = *s64;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ prod = ta;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ *d64 ^= (ta ^ prod);
+ d64++;
+ s64++;
+ }
+ } else {
+ while (d64 < (uint64_t *) rd.d_top) {
+ ta = *s64;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ prod = ta;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ *d64 = (ta ^ prod);
+ d64++;
+ s64++;
+ }
+ }
+ break;
+ case 11:
+ if (xor) {
+ while (d64 < (uint64_t *) rd.d_top) {
+ ta = *s64;
+ prod = ta;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ prod ^= ta;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ *d64 ^= (ta ^ prod);
+ d64++;
+ s64++;
+ }
+ } else {
+ while (d64 < (uint64_t *) rd.d_top) {
+ ta = *s64;
+ prod = ta;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ prod ^= ta;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ *d64 = (ta ^ prod);
+ d64++;
+ s64++;
}
+ }
+ break;
+ case 12:
+ if (xor) {
+ while (d64 < (uint64_t *) rd.d_top) {
+ ta = *s64;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ prod = ta;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ *d64 ^= (ta ^ prod);
+ d64++;
+ s64++;
}
- break;
-/*
- case 9:
- if (xor) {
- while (d64 < (uint64_t *) rd.d_top) {
- ta = *s64;
- prod = ta;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- *d64 ^= (ta ^ prod);
- d64++;
- s64++;
- }
- } else {
- while (d64 < (uint64_t *) rd.d_top) {
- ta = *s64;
- prod = ta;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- *d64 = (ta ^ prod);
- d64++;
- s64++;
- }
+ } else {
+ while (d64 < (uint64_t *) rd.d_top) {
+ ta = *s64;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ prod = ta;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ *d64 = (ta ^ prod);
+ d64++;
+ s64++;
}
- break;
- case 10:
- if (xor) {
- while (d64 < (uint64_t *) rd.d_top) {
- ta = *s64;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- prod = ta;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- *d64 ^= (ta ^ prod);
- d64++;
- s64++;
- }
- } else {
- while (d64 < (uint64_t *) rd.d_top) {
- ta = *s64;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- prod = ta;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- *d64 = (ta ^ prod);
- d64++;
- s64++;
- }
+ }
+ break;
+ case 13:
+ if (xor) {
+ while (d64 < (uint64_t *) rd.d_top) {
+ ta = *s64;
+ prod = ta;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ prod ^= ta;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ *d64 ^= (ta ^ prod);
+ d64++;
+ s64++;
}
- break;
- case 11:
- if (xor) {
- while (d64 < (uint64_t *) rd.d_top) {
- ta = *s64;
- prod = ta;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- prod ^= ta;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- *d64 ^= (ta ^ prod);
- d64++;
- s64++;
- }
- } else {
- while (d64 < (uint64_t *) rd.d_top) {
- ta = *s64;
- prod = ta;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- prod ^= ta;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- *d64 = (ta ^ prod);
- d64++;
- s64++;
- }
+ } else {
+ while (d64 < (uint64_t *) rd.d_top) {
+ ta = *s64;
+ prod = ta;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ prod ^= ta;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ *d64 = (ta ^ prod);
+ d64++;
+ s64++;
}
- break;
- case 12:
- if (xor) {
- while (d64 < (uint64_t *) rd.d_top) {
- ta = *s64;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- prod = ta;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- *d64 ^= (ta ^ prod);
- d64++;
- s64++;
- }
- } else {
- while (d64 < (uint64_t *) rd.d_top) {
- ta = *s64;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- prod = ta;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- *d64 = (ta ^ prod);
- d64++;
- s64++;
- }
+ }
+ break;
+ case 14:
+ if (xor) {
+ while (d64 < (uint64_t *) rd.d_top) {
+ ta = *s64;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ prod = ta;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ prod ^= ta;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ *d64 ^= (ta ^ prod);
+ d64++;
+ s64++;
}
- break;
- case 13:
- if (xor) {
- while (d64 < (uint64_t *) rd.d_top) {
- ta = *s64;
- prod = ta;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- prod ^= ta;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- *d64 ^= (ta ^ prod);
- d64++;
- s64++;
- }
- } else {
- while (d64 < (uint64_t *) rd.d_top) {
- ta = *s64;
- prod = ta;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- prod ^= ta;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- *d64 = (ta ^ prod);
- d64++;
- s64++;
- }
+ } else {
+ while (d64 < (uint64_t *) rd.d_top) {
+ ta = *s64;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ prod = ta;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ prod ^= ta;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ *d64 = (ta ^ prod);
+ d64++;
+ s64++;
}
- break;
- case 14:
- if (xor) {
- while (d64 < (uint64_t *) rd.d_top) {
- ta = *s64;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- prod = ta;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- prod ^= ta;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- *d64 ^= (ta ^ prod);
- d64++;
- s64++;
- }
- } else {
- while (d64 < (uint64_t *) rd.d_top) {
- ta = *s64;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- prod = ta;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- prod ^= ta;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- *d64 = (ta ^ prod);
- d64++;
- s64++;
- }
+ }
+ break;
+ case 15:
+ if (xor) {
+ while (d64 < (uint64_t *) rd.d_top) {
+ ta = *s64;
+ prod = ta;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ prod ^= ta;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ prod ^= ta;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ *d64 ^= (ta ^ prod);
+ d64++;
+ s64++;
}
- break;
- case 15:
- if (xor) {
- while (d64 < (uint64_t *) rd.d_top) {
- ta = *s64;
- prod = ta;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- prod ^= ta;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- prod ^= ta;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- *d64 ^= (ta ^ prod);
- d64++;
- s64++;
- }
- } else {
- while (d64 < (uint64_t *) rd.d_top) {
- ta = *s64;
- prod = ta;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- prod ^= ta;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- prod ^= ta;
- AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
- *d64 = (ta ^ prod);
- d64++;
- s64++;
- }
+ } else {
+ while (d64 < (uint64_t *) rd.d_top) {
+ ta = *s64;
+ prod = ta;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ prod ^= ta;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ prod ^= ta;
+ AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+ *d64 = (ta ^ prod);
+ d64++;
+ s64++;
}
- break;
-*/
- default:
+ }
+ break;
+ */
+ default:
if (xor) {
while (d64 < (uint64_t *) rd.d_top) {
prod = *d64 ;
@@ -1798,7 +2236,7 @@ gf_w8_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t
gf_do_final_region_alignment(&rd);
}
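
The switch above unrolls multiplication by small constants into chains of AB2 steps and xors; for example, val = 6 doubles, snapshots, doubles again, and combines, since 6a = 2a ^ 4a. Each AB2 step multiplies eight packed GF(2^8) bytes by two in one 64-bit operation. A sketch of one step under the replicated constants the bytwo init builds (pp in every byte, mask1 = 0xfefe..., mask2 = 0x8080...); this mirrors the shape of the library's AB2 macro but is only an illustration:

#include <stdint.h>

static inline uint64_t ab2_sketch(uint64_t b, uint64_t pp64,
                                  uint64_t m1, uint64_t m2)
{
  uint64_t t1 = (b << 1) & m1;  /* shift each byte; drop cross-byte carries */
  uint64_t t2 = b & m2;         /* bytes whose top bit was set */
  t2 = (t2 << 1) - (t2 >> 7);   /* turn each such 0x80 into 0xff */
  return t1 ^ (t2 & pp64);      /* reduce exactly those bytes by the poly */
}
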
-static
+ static
int gf_w8_bytwo_init(gf_t *gf)
{
gf_internal_t *h;
@@ -1825,48 +2263,54 @@ int gf_w8_bytwo_init(gf_t *gf)
if (h->mult_type == GF_MULT_BYTWO_p) {
gf->multiply.w32 = gf_w8_bytwo_p_multiply;
- if (h->region_type == GF_REGION_SSE) {
- gf->multiply_region.w32 = gf_w8_bytwo_p_sse_multiply_region;
- } else {
+#ifdef INTEL_SSE2
+ if (h->region_type & GF_REGION_NOSSE)
gf->multiply_region.w32 = gf_w8_bytwo_p_nosse_multiply_region;
- }
+ else
+ gf->multiply_region.w32 = gf_w8_bytwo_p_sse_multiply_region;
+#else
+ gf->multiply_region.w32 = gf_w8_bytwo_p_nosse_multiply_region;
+ if(h->region_type & GF_REGION_SSE)
+ return 0;
+#endif
} else {
gf->multiply.w32 = gf_w8_bytwo_b_multiply;
- if (h->region_type == GF_REGION_SSE) {
- gf->multiply_region.w32 = gf_w8_bytwo_b_sse_multiply_region;
- } else {
+#ifdef INTEL_SSE2
+ if (h->region_type & GF_REGION_NOSSE)
gf->multiply_region.w32 = gf_w8_bytwo_b_nosse_multiply_region;
- }
+ else
+ gf->multiply_region.w32 = gf_w8_bytwo_b_sse_multiply_region;
+#else
+ gf->multiply_region.w32 = gf_w8_bytwo_b_nosse_multiply_region;
+ if(h->region_type & GF_REGION_SSE)
+ return 0;
+#endif
}
- gf->inverse.w32 = gf_w8_euclid;
return 1;
}
/* ------------------------------------------------------------
General procedures.
+ You don't need to error check here or in init, because it's done
+ for you in gf_error_check().
*/
int gf_w8_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
{
- int sse;
-
- sse = (GF_REGION_SSE | GF_REGION_NOSSE);
-
switch(mult_type)
{
case GF_MULT_DEFAULT:
- if (arg1 != 0 || arg2 != 0 || region_type != 0) return -1;
+#ifdef INTEL_SSSE3
return sizeof(gf_internal_t) + sizeof(struct gf_w8_default_data) + 64;
+#endif
+ return sizeof(gf_internal_t) + sizeof(struct gf_w8_single_table_data) + 64;
case GF_MULT_TABLE:
- if (arg1 != 0 || arg2 != 0) return -1;
- if (region_type == GF_REGION_CAUCHY || region_type == (GF_REGION_CAUCHY | GF_REGION_SINGLE_TABLE)) {
+ if (region_type == GF_REGION_CAUCHY) {
return sizeof(gf_internal_t) + sizeof(struct gf_w8_single_table_data) + 64;
}
- if (region_type == 0) region_type = GF_REGION_SINGLE_TABLE;
- if (region_type & GF_REGION_SINGLE_TABLE) {
- if (region_type != GF_REGION_SINGLE_TABLE) return 0;
+ if (region_type == GF_REGION_DEFAULT) {
return sizeof(gf_internal_t) + sizeof(struct gf_w8_single_table_data) + 64;
}
if (region_type & GF_REGION_DOUBLE_TABLE) {
@@ -1875,62 +2319,62 @@ int gf_w8_scratch_size(int mult_type, int region_type, int divide_type, int arg1
} else if (region_type == (GF_REGION_DOUBLE_TABLE | GF_REGION_LAZY)) {
return sizeof(gf_internal_t) + sizeof(struct gf_w8_double_table_lazy_data) + 64;
} else {
- return -1;
+ return 0;
}
}
- return -1;
+ return 0;
break;
case GF_MULT_BYTWO_p:
case GF_MULT_BYTWO_b:
- if (arg1 != 0 || arg2 != 0) return -1;
- if (region_type != GF_REGION_CAUCHY) {
- if ((region_type | sse) != sse || (region_type & sse) == sse) return -1;
- }
return sizeof(gf_internal_t) + sizeof(struct gf_w8_bytwo_data);
break;
case GF_MULT_SPLIT_TABLE:
if ((arg1 == 4 && arg2 == 8) || (arg1 == 8 && arg2 == 4)) {
- if (region_type == GF_REGION_CAUCHY) {
- return sizeof(gf_internal_t) + sizeof(struct gf_w8_half_table_data) + 64;
- }
- if (region_type == 0) region_type = GF_REGION_SSE;
- if ((region_type | sse) != sse) return -1;
- if ((region_type & sse) == sse) return -1;
return sizeof(gf_internal_t) + sizeof(struct gf_w8_half_table_data) + 64;
}
- return -1;
break;
case GF_MULT_LOG_TABLE:
- if ((arg1 != 0 && arg1 != 1 && arg1 != 2) || arg2 != 0) return -1;
- if (region_type != 0 && region_type != GF_REGION_CAUCHY) return -1;
- if (arg1 == 0) return sizeof(gf_internal_t) + sizeof(struct gf_w8_logtable_data) + 64;
- if (arg1 == 1) return sizeof(gf_internal_t) + sizeof(struct gf_w8_logzero_small_table_data) + 64;
+ return sizeof(gf_internal_t) + sizeof(struct gf_w8_logtable_data) + 64;
+ break;
+ case GF_MULT_LOG_ZERO:
+ return sizeof(gf_internal_t) + sizeof(struct gf_w8_logzero_small_table_data) + 64;
+ break;
+ case GF_MULT_LOG_ZERO_EXT:
return sizeof(gf_internal_t) + sizeof(struct gf_w8_logzero_table_data) + 64;
break;
+ case GF_MULT_CARRY_FREE:
+ return sizeof(gf_internal_t);
+ break;
case GF_MULT_SHIFT:
- if (arg1 != 0 || arg2 != 0) return -1;
- if (region_type != 0 && region_type != GF_REGION_CAUCHY) return -1;
return sizeof(gf_internal_t);
break;
case GF_MULT_COMPOSITE:
- if (region_type & ~(GF_REGION_SINGLE_TABLE | GF_REGION_ALTMAP | GF_REGION_STDMAP)) return -1;
- if ((region_type & (GF_REGION_SINGLE_TABLE | GF_REGION_ALTMAP)) == (GF_REGION_SINGLE_TABLE | GF_REGION_ALTMAP)) return -1;
- if (arg1 == 2 && arg2 == 4) {
- return sizeof(gf_internal_t) + sizeof(struct gf_w4_single_table_data) + 64;
- } else {
- return -1;
- }
+ return sizeof(gf_internal_t) + sizeof(struct gf_w8_composite_data) + 64;
default:
- return -1;
- }
+ return 0;
+ }
+ return 0;
}
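
With this change the sizers report an unsupported combination as 0 rather than -1, and argument checking is centralized in gf_error_check() as the comment above notes. A hedged usage sketch through the public wrapper, whose argument order follows the per-width helper here (treat the exact prototype as an assumption):

int s = gf_scratch_size(8, GF_MULT_SPLIT_TABLE, GF_REGION_DEFAULT,
                        GF_DIVIDE_DEFAULT, 4, 8);
if (s == 0) {
  /* combination not supported for w = 8 */
}
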
int gf_w8_init(gf_t *gf)
{
- gf_internal_t *h;
+ gf_internal_t *h, *h_base;
h = (gf_internal_t *) gf->scratch;
- if (h->prim_poly == 0) h->prim_poly = 0x11d;
+
+ /* Allen: set default primitive polynomial / irreducible polynomial if needed */
+
+ if (h->prim_poly == 0) {
+ if (h->mult_type == GF_MULT_COMPOSITE) {
+ h->prim_poly = gf_composite_get_default_poly(h->base_gf);
+ if (h->prim_poly == 0) return 0; /* JSP: This shouldn't happen, but just in case. */
+ } else {
+ h->prim_poly = 0x11d;
+ }
+ }
+ if (h->mult_type != GF_MULT_COMPOSITE) {
+ h->prim_poly |= 0x100;
+ }
gf->multiply.w32 = NULL;
gf->divide.w32 = NULL;
@@ -1939,16 +2383,20 @@ int gf_w8_init(gf_t *gf)
gf->extract_word.w32 = gf_w8_extract_word;
switch(h->mult_type) {
- case GF_MULT_DEFAULT: if (gf_w8_table_init(gf) == 0) return 0; break;
- case GF_MULT_TABLE: if (gf_w8_table_init(gf) == 0) return 0; break;
+ case GF_MULT_DEFAULT:
+ case GF_MULT_TABLE: if (gf_w8_table_init(gf) == 0) return 0; break;
case GF_MULT_BYTWO_p:
- case GF_MULT_BYTWO_b: if (gf_w8_bytwo_init(gf) == 0) return 0; break;
- case GF_MULT_LOG_TABLE: if (gf_w8_log_init(gf) == 0) return 0; break;
- case GF_MULT_SHIFT: if (gf_w8_shift_init(gf) == 0) return 0; break;
- case GF_MULT_SPLIT_TABLE: if (gf_w8_split_init(gf) == 0) return 0; break;
- case GF_MULT_COMPOSITE: if (gf_w8_composite_init(gf) == 0) return 0; break;
+ case GF_MULT_BYTWO_b: if (gf_w8_bytwo_init(gf) == 0) return 0; break;
+ case GF_MULT_LOG_ZERO:
+ case GF_MULT_LOG_ZERO_EXT:
+ case GF_MULT_LOG_TABLE: if (gf_w8_log_init(gf) == 0) return 0; break;
+ case GF_MULT_CARRY_FREE: if (gf_w8_cfm_init(gf) == 0) return 0; break;
+ case GF_MULT_SHIFT: if (gf_w8_shift_init(gf) == 0) return 0; break;
+ case GF_MULT_SPLIT_TABLE: if (gf_w8_split_init(gf) == 0) return 0; break;
+ case GF_MULT_COMPOSITE: if (gf_w8_composite_init(gf) == 0) return 0; break;
default: return 0;
}
+
if (h->divide_type == GF_DIVIDE_EUCLID) {
gf->divide.w32 = gf_w8_divide_from_inverse;
gf->inverse.w32 = gf_w8_euclid;
@@ -1957,11 +2405,15 @@ int gf_w8_init(gf_t *gf)
gf->inverse.w32 = gf_w8_matrix;
}
- if (gf->inverse.w32 != NULL && gf->divide.w32 == NULL) {
+ if (gf->divide.w32 == NULL) {
gf->divide.w32 = gf_w8_divide_from_inverse;
+ if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w8_euclid;
}
- if (gf->inverse.w32 == NULL && gf->divide.w32 != NULL) {
- gf->inverse.w32 = gf_w8_inverse_from_divide;
+
+ if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w8_inverse_from_divide;
+
+ if (h->mult_type == GF_MULT_COMPOSITE && (h->region_type & GF_REGION_ALTMAP)) {
+ gf->extract_word.w32 = gf_w8_composite_extract_word;
}
if (h->region_type == GF_REGION_CAUCHY) {
@@ -1969,6 +2421,10 @@ int gf_w8_init(gf_t *gf)
gf->extract_word.w32 = gf_wgen_extract_word;
}
+ if (gf->multiply_region.w32 == NULL) {
+ gf->multiply_region.w32 = gf_w8_multiply_region_from_single;
+ }
+
return 1;
}
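
After this rewiring, a w = 8 gf_t built through the generic entry points always has multiply, divide, inverse and multiply_region populated. A small usage sketch with the library's simple constructor (error handling trimmed):

#include "gf_complete.h"

int demo(void)
{
  gf_t gf;
  if (!gf_init_easy(&gf, 8)) return 0;     /* default w = 8 method */
  uint8_t p = gf.multiply.w32(&gf, 3, 7);  /* product in GF(2^8) */
  return gf.divide.w32(&gf, p, 3) == 7;    /* (3*7)/3 == 7 */
}
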
@@ -2001,7 +2457,7 @@ uint8_t *gf_w8_get_div_table(gf_t *gf)
h = (gf_internal_t *) gf->scratch;
if (gf->multiply.w32 == gf_w8_default_multiply) {
ftd = (struct gf_w8_default_data *) ((gf_internal_t *) gf->scratch)->private;
- return (uint8_t *) std->divtable;
+ return (uint8_t *) ftd->divtable;
} else if (gf->multiply.w32 == gf_w8_table_multiply) {
std = (struct gf_w8_single_table_data *) ((gf_internal_t *) gf->scratch)->private;
return (uint8_t *) std->divtable;
diff --git a/gf_wgen.c b/gf_wgen.c
index 7d5144b..ede115c 100644
--- a/gf_wgen.c
+++ b/gf_wgen.c
@@ -93,6 +93,7 @@ gf_val_32_t gf_wgen_euclid (gf_t *gf, gf_val_32_t b)
while (d_ip1 >= d_i) {
c_i ^= (1 << (d_ip1 - d_i));
e_ip1 ^= (e_i << (d_ip1 - d_i));
+ if (e_ip1 == 0) return 0; /* b is not invertible: bail out rather than scan forever below */
while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--;
}
@@ -223,7 +224,7 @@ gf_wgen_bytwo_p_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
pp = h->prim_poly;
prod = 0;
- pmask = (1 << (h->w)-1);
+ pmask = (1 << ((h->w)-1)); /* Ben: parenthesized to fix an operator-precedence warning */
amask = pmask;
while (amask != 0) {
@@ -508,16 +509,11 @@ int gf_wgen_table_8_init(gf_t *gf)
}
for (a = 1; a < (1 << w); a++) {
- b = 1;
- p = a;
- do {
+ for (b = 1; b < (1 << w); b++) {
+ p = gf_wgen_shift_multiply(gf, a, b);
std->mult[(a<<w)|b] = p;
- std->div[(p<<w)|b] = a;
- b = (b & (1 << (w-1))) ? (b << 1) ^ h->prim_poly : (b << 1);
- b &= ((1 << w)-1);
- p = (p & (1 << (w-1))) ? (p << 1) ^ h->prim_poly : (p << 1);
- p &= ((1 << w)-1);
- } while (b != 1);
+ std->div[(p<<w)|a] = b;
+ }
}
gf->multiply.w32 = gf_wgen_table_8_multiply;
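
The old cycle-walking loop covered every b only when x generates the whole multiplicative group, i.e. when the polynomial is primitive; the exhaustive double loop above works for any polynomial and keys division as div[(p << w) | a] = b, that is, p / a = b. Lookup is then direct, as in this fragment (std and w as in the function above):

uint32_t prod = std->mult[(a << w) | b];    /* a * b          */
uint32_t quot = std->div[(prod << w) | a];  /* (a*b) / a == b */
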
@@ -572,18 +568,13 @@ int gf_wgen_table_16_init(gf_t *gf)
std->div[a] = 0;
std->div[a<<w] = 0;
}
-
+
for (a = 1; a < (1 << w); a++) {
- b = 1;
- p = a;
- do {
+ for (b = 1; b < (1 << w); b++) {
+ p = gf_wgen_shift_multiply(gf, a, b);
std->mult[(a<<w)|b] = p;
- std->div[(p<<w)|b] = a;
- b = (b & (1 << (w-1))) ? (b << 1) ^ h->prim_poly : (b << 1);
- b &= ((1 << w)-1);
- p = (p & (1 << (w-1))) ? (p << 1) ^ h->prim_poly : (p << 1);
- p &= ((1 << w)-1);
- } while (b != 1);
+ std->div[(p<<w)|a] = b;
+ }
}
gf->multiply.w32 = gf_wgen_table_16_multiply;
@@ -599,6 +590,11 @@ int gf_wgen_table_init(gf_t *gf)
h = (gf_internal_t *) gf->scratch;
if (h->w <= 8) return gf_wgen_table_8_init(gf);
if (h->w <= 14) return gf_wgen_table_16_init(gf);
+
+ /* Returning zero to make the compiler happy, but this is unreachable,
+ because w is validated in _scratch_size. */
+
+ return 0;
}
static
@@ -640,6 +636,7 @@ int gf_wgen_log_8_init(gf_t *gf)
struct gf_wgen_log_w8_data *std;
int w;
uint32_t a, i;
+ int check = 0;
h = (gf_internal_t *) gf->scratch;
w = h->w;
@@ -649,17 +646,27 @@ int gf_wgen_log_8_init(gf_t *gf)
std->anti = std->log + (1<<h->w);
std->danti = std->anti + (1<<h->w)-1;
- i = 0;
+ for (i = 0; i < (1 << w); i++)
+ std->log[i] = 0;
+
a = 1;
- do {
+ for(i=0; i < (1<<w)-1; i++)
+ {
+ if (std->log[a] != 0) check = 1;
std->log[a] = i;
std->anti[i] = a;
std->danti[i] = a;
- i++;
- a = (a & (1 << (w-1))) ? (a << 1) ^ h->prim_poly : (a << 1);
- a &= ((1 << w)-1);
- } while (a != 1);
-
+ a <<= 1;
+ if(a & (1<<w))
+ a ^= h->prim_poly;
+ /* no mask needed: the xor with prim_poly already clears bit w */
+ }
+
+ if (check != 0) {
+ _gf_errno = GF_E_LOGPOLY;
+ return 0;
+ }
+
gf->multiply.w32 = gf_wgen_log_8_multiply;
gf->divide.w32 = gf_wgen_log_8_divide;
return 1;
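
The rewritten loop builds log/antilog by repeatedly multiplying a by x, and the check flag catches a value revisited before all 2^w - 1 nonzero elements appear, which means x is not a generator and the polynomial is not primitive (GF_E_LOGPOLY). A standalone sketch of the same construction; prim_poly is assumed to carry its degree-w bit, as gf_wgen_init now forces:

#include <stdint.h>
#include <string.h>

static int build_log_sketch(int w, uint32_t prim_poly,
                            uint32_t *log, uint32_t *anti)
{
  uint32_t a = 1, i, n = (1u << w) - 1;
  memset(log, 0, (1u << w) * sizeof(uint32_t));
  for (i = 0; i < n; i++) {
    if (i > 0 && log[a] != 0) return 0;   /* revisited: not primitive */
    log[a] = i;
    anti[i] = a;
    a <<= 1;                              /* a *= x ...               */
    if (a & (1u << w)) a ^= prim_poly;    /* ... reduced mod the poly */
  }
  return 1;
}
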
@@ -704,6 +711,7 @@ int gf_wgen_log_16_init(gf_t *gf)
struct gf_wgen_log_w16_data *std;
int w;
uint32_t a, i;
+ int check = 0;
h = (gf_internal_t *) gf->scratch;
w = h->w;
@@ -712,17 +720,28 @@ int gf_wgen_log_16_init(gf_t *gf)
std->log = &(std->base);
std->anti = std->log + (1<<h->w);
std->danti = std->anti + (1<<h->w)-1;
-
- i = 0;
+
+ for (i = 0; i < (1 << w); i++)
+ std->log[i] = 0;
+
a = 1;
- do {
+ for(i=0; i < (1<<w)-1; i++)
+ {
+ if (std->log[a] != 0) check = 1;
std->log[a] = i;
std->anti[i] = a;
std->danti[i] = a;
- i++;
- a = (a & (1 << (w-1))) ? (a << 1) ^ h->prim_poly : (a << 1);
- a &= ((1 << w)-1);
- } while (a != 1);
+ a <<= 1;
+ if(a & (1<<w))
+ a ^= h->prim_poly;
+ /* no mask needed: the xor with prim_poly already clears bit w */
+ }
+
+ if (check) {
+ if (h->mult_type != GF_MULT_LOG_TABLE) return gf_wgen_shift_init(gf);
+ _gf_errno = GF_E_LOGPOLY;
+ return 0;
+ }
gf->multiply.w32 = gf_wgen_log_16_multiply;
gf->divide.w32 = gf_wgen_log_16_divide;
@@ -768,7 +787,8 @@ int gf_wgen_log_32_init(gf_t *gf)
struct gf_wgen_log_w32_data *std;
int w;
uint32_t a, i;
-
+ int check = 0;
+
h = (gf_internal_t *) gf->scratch;
w = h->w;
std = (struct gf_wgen_log_w32_data *) h->private;
@@ -777,17 +797,27 @@ int gf_wgen_log_32_init(gf_t *gf)
std->anti = std->log + (1<<h->w);
std->danti = std->anti + (1<<h->w)-1;
- i = 0;
+ for (i = 0; i < (1 << w); i++)
+ std->log[i] = 0;
+
a = 1;
- do {
+ for(i=0; i < (1<<w)-1; i++)
+ {
+ if (std->log[a] != 0) check = 1;
std->log[a] = i;
std->anti[i] = a;
std->danti[i] = a;
- i++;
- a = (a & (1 << (w-1))) ? (a << 1) ^ h->prim_poly : (a << 1);
- a &= ((1 << w)-1);
- } while (a != 1);
-
+ a <<= 1;
+ if(a & (1<<w))
+ a ^= h->prim_poly;
+ /* no mask needed: the xor with prim_poly already clears bit w */
+ }
+
+ if (check != 0) {
+ _gf_errno = GF_E_LOGPOLY;
+ return 0;
+ }
+
gf->multiply.w32 = gf_wgen_log_32_multiply;
gf->divide.w32 = gf_wgen_log_32_divide;
return 1;
@@ -802,15 +832,16 @@ int gf_wgen_log_init(gf_t *gf)
if (h->w <= 8) return gf_wgen_log_8_init(gf);
if (h->w <= 16) return gf_wgen_log_16_init(gf);
if (h->w <= 32) return gf_wgen_log_32_init(gf);
+
+ /* Returning zero to make the compiler happy, but this is unreachable,
+ because w is validated in _scratch_size. */
+
+ return 0;
}
int gf_wgen_scratch_size(int w, int mult_type, int region_type, int divide_type, int arg1, int arg2)
{
- if (w > 32 || w < 0) return -1;
-
- if ((region_type | GF_REGION_CAUCHY) != GF_REGION_CAUCHY) return -1;
-
switch(mult_type)
{
case GF_MULT_DEFAULT:
@@ -828,40 +859,37 @@ int gf_wgen_scratch_size(int w, int mult_type, int region_type, int divide_type,
case GF_MULT_SHIFT:
case GF_MULT_BYTWO_b:
case GF_MULT_BYTWO_p:
- if (arg1 != 0 || arg2 != 0) return -1;
return sizeof(gf_internal_t);
break;
case GF_MULT_GROUP:
- if (arg1 <= 0 || arg2 <= 0) return -1;
return sizeof(gf_internal_t) + sizeof(struct gf_wgen_group_data) +
sizeof(uint32_t) * (1 << arg1) +
sizeof(uint32_t) * (1 << arg2) + 64;
break;
case GF_MULT_TABLE:
- if (arg1 != 0 || arg2 != 0) return -1;
if (w <= 8) {
return sizeof(gf_internal_t) + sizeof(struct gf_wgen_table_w8_data) +
sizeof(uint8_t)*(1 << w)*(1<<w)*2 + 64;
} else if (w < 15) {
return sizeof(gf_internal_t) + sizeof(struct gf_wgen_table_w16_data) +
sizeof(uint16_t)*(1 << w)*(1<<w)*2 + 64;
- } else return -1;
+ }
+ return 0;
case GF_MULT_LOG_TABLE:
- if (arg1 != 0 || arg2 != 0) return -1;
if (w <= 8) {
return sizeof(gf_internal_t) + sizeof(struct gf_wgen_log_w8_data) +
sizeof(uint8_t)*(1 << w)*3;
} else if (w <= 16) {
return sizeof(gf_internal_t) + sizeof(struct gf_wgen_log_w16_data) +
sizeof(uint16_t)*(1 << w)*3;
- } else if (w <= 29) {
+ } else if (w <= 27) {
return sizeof(gf_internal_t) + sizeof(struct gf_wgen_log_w32_data) +
sizeof(uint32_t)*(1 << w)*3;
- } else return -1;
-
+ } else
+ return 0;
default:
- return -1;
+ return 0;
}
}
@@ -935,6 +963,13 @@ int gf_wgen_init(gf_t *gf)
case 32: h->prim_poly = 00020000007; break;
default: fprintf(stderr, "gf_wgen_init: w not defined yet\n"); exit(1);
}
+ } else {
+ if (h->w == 32) {
+ h->prim_poly &= 0xffffffff;
+ } else {
+ h->prim_poly |= (1 << h->w);
+ if (h->prim_poly & ~((1ULL<<(h->w+1))-1)) return 0;
+ }
}
gf->multiply.w32 = NULL;
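
The new else branch normalizes user-supplied polynomials: at w = 32 the degree bit is implicit and the value is masked to 32 bits; below that, the x^w term is forced on and any term above x^w makes init fail. Sketch of the sub-32 path as a hypothetical helper:

static int normalize_poly_sketch(int w, uint64_t *poly)  /* w < 32 */
{
  *poly |= (1ULL << w);                    /* force the x^w term on    */
  if (*poly & ~((1ULL << (w + 1)) - 1))    /* anything above x^w: fail */
    return 0;
  return 1;
}
/* e.g. w = 8: 0x1d and 0x11d both normalize to 0x11d; 0x211d is rejected. */
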
@@ -950,7 +985,7 @@ int gf_wgen_init(gf_t *gf)
} else if (h->w <= 16) {
if (gf_wgen_log_init(gf) == 0) return 0;
} else {
- if (gf_wgen_group_init(gf) == 0) return 0;
+ if (gf_wgen_bytwo_p_init(gf) == 0) return 0;
}
break;
case GF_MULT_SHIFT: if (gf_wgen_shift_init(gf) == 0) return 0; break;
diff --git a/release-files.txt b/release-files.txt
deleted file mode 100644
index ca25004..0000000
--- a/release-files.txt
+++ /dev/null
@@ -1,31 +0,0 @@
-License.txt
-README.txt
-GNUmakefile
-gf.c
-gf_add.c
-gf_complete.h
-gf_div.c
-gf_example_1.c
-gf_example_2.c
-gf_example_3.c
-gf_example_4.c
-gf_general.c
-gf_general.h
-gf_int.h
-gf_method.c
-gf_method.h
-gf_methods.c
-gf_mult.c
-gf_poly.c
-gf_rand.c
-gf_rand.h
-gf_time.c
-gf_unit.c
-gf_w128.c
-gf_w16.c
-gf_w32.c
-gf_w4.c
-gf_w64.c
-gf_w8.c
-gf_wgen.c
-whats_my_sse.c
diff --git a/tests.txt b/tests.txt
deleted file mode 100644
index e69de29..0000000
--- a/tests.txt
+++ /dev/null
diff --git a/tmp-10-out.txt b/tmp-10-out.txt
deleted file mode 100644
index e69de29..0000000
--- a/tmp-10-out.txt
+++ /dev/null
diff --git a/tmp-time-test.sh b/tmp-time-test.sh
deleted file mode 100644
index e30fca8..0000000
--- a/tmp-time-test.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-if [ $# -lt 4 ]; then
- echo 'usage: sh tmp-test.sh w gf_specs (e.g. LOG - -)' >&2
- exit 1
-fi
-
-w=$1
-shift
-i=1024
-while [ $i -le 134217728 ]; do
- iter=`echo $i | awk '{ print (134217728/$1)*1 }'`
- echo $i $iter $w $* `./gf_time $w G -1 $i $iter $* | head -n 3 | tail -n 2`
- i=`echo $i | awk '{ print $1*2 }'`
-done
-
diff --git a/tmp.c b/tmp.c
deleted file mode 100644
index a6deaab..0000000
--- a/tmp.c
+++ /dev/null
@@ -1,1583 +0,0 @@
-/*
- * gf_w32.c
- *
- * Routines for 32-bit Galois fields
- */
-
-#define MM_PRINT32(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 4) printf(" %02x%02x%02x%02x", blah[15-ii], blah[14-ii], blah[13-ii], blah[12-ii]); printf("\n"); }
-
-#define MM_PRINT8(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 1) printf("%s%02x", (ii%4==0) ? " " : " ", blah[15-ii]); printf("\n"); }
-
-#include "gf_int.h"
-#include <stdio.h>
-#include <stdlib.h>
-
-#define GF_FIELD_WIDTH (32)
-#define GF_FIRST_BIT (1 << 31)
-
-#define GF_BASE_FIELD_WIDTH (16)
-#define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH)
-#define GF_BASE_FIELD_GROUP_SIZE GF_BASE_FIELD_SIZE-1
-#define GF_S_GF_16_2 (40188)
-#define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : (p) << 1);
-
-
-struct gf_w16_logtable_data {
- int log_tbl[GF_BASE_FIELD_SIZE];
- gf_val_16_t _antilog_tbl[GF_BASE_FIELD_SIZE * 4];
- gf_val_16_t *antilog_tbl;
- gf_val_16_t inv_tbl[GF_BASE_FIELD_SIZE];
-};
-
-struct gf_split_2_32_lazy_data {
- gf_val_32_t last_value;
- gf_val_32_t tables[16][4];
-};
-
-struct gf_split_8_8_data {
- gf_val_32_t tables[7][256][256];
-};
-
-struct gf_split_4_32_lazy_data {
- gf_val_32_t last_value;
- gf_val_32_t tables[8][16];
-};
-
-static
-inline
-gf_val_32_t gf_w32_inverse_from_divide (gf_t *gf, gf_val_32_t a)
-{
- return gf->divide.w32(gf, 1, a);
-}
-
-static
-inline
-gf_val_32_t gf_w32_divide_from_inverse (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
-{
- b = gf->inverse.w32(gf, b);
- return gf->multiply.w32(gf, a, b);
-}
-
-static
-void
-gf_w32_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int
-xor)
-{
- int i;
- gf_val_32_t *s32;
- gf_val_32_t *d32;
-
- s32 = (gf_val_32_t *) src;
- d32 = (gf_val_32_t *) dest;
-
- if (xor) {
- for (i = 0; i < bytes/sizeof(gf_val_32_t); i++) {
- d32[i] ^= gf->multiply.w32(gf, val, s32[i]);
- }
- } else {
- for (i = 0; i < bytes/sizeof(gf_val_32_t); i++) {
- d32[i] = gf->multiply.w32(gf, val, s32[i]);
- }
- }
-}
-
-static
-inline
-gf_val_32_t gf_w32_euclid (gf_t *gf, gf_val_32_t b)
-{
- gf_val_32_t e_i, e_im1, e_ip1;
- gf_val_32_t d_i, d_im1, d_ip1;
- gf_val_32_t y_i, y_im1, y_ip1;
- gf_val_32_t c_i;
-
- if (b == 0) return -1;
- e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly;
- e_i = b;
- d_im1 = 32;
- for (d_i = d_im1-1; ((1 << d_i) & e_i) == 0; d_i--) ;
- y_i = 1;
- y_im1 = 0;
-
- while (e_i != 1) {
-
- e_ip1 = e_im1;
- d_ip1 = d_im1;
- c_i = 0;
-
- while (d_ip1 >= d_i) {
- c_i ^= (1 << (d_ip1 - d_i));
- e_ip1 ^= (e_i << (d_ip1 - d_i));
- d_ip1--;
- while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--;
- }
-
- y_ip1 = y_im1 ^ gf->multiply.w32(gf, c_i, y_i);
- y_im1 = y_i;
- y_i = y_ip1;
-
- e_im1 = e_i;
- d_im1 = d_i;
- e_i = e_ip1;
- d_i = d_ip1;
- }
-
- return y_i;
-}
-
-static
-inline
-gf_val_32_t gf_w32_matrix (gf_t *gf, gf_val_32_t b)
-{
- return gf_bitmatrix_inverse(b, 32, ((gf_internal_t *) (gf->scratch))->prim_poly);
-}
-
-/* JSP: GF_MULT_SHIFT: The world's dumbest multiplication algorithm. I only
- include it for completeness. It does have the feature that it requires no
- extra memory.
-*/
-
-static
-inline
-gf_val_32_t
-gf_w32_shift_multiply (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
-{
- uint64_t product, i, pp, a, b, one;
- gf_internal_t *h;
-
- a = a32;
- b = b32;
- h = (gf_internal_t *) gf->scratch;
- one = 1;
- pp = h->prim_poly | (one << 32);
-
- product = 0;
-
- for (i = 0; i < GF_FIELD_WIDTH; i++) {
- if (a & (one << i)) product ^= (b << i);
- }
- for (i = (GF_FIELD_WIDTH*2-1); i >= GF_FIELD_WIDTH; i--) {
- if (product & (one << i)) product ^= (pp << (i-GF_FIELD_WIDTH));
- }
- return product;
-}
-
-static
-int gf_w32_shift_init(gf_t *gf)
-{
- gf->multiply.w32 = gf_w32_shift_multiply;
- gf->inverse.w32 = gf_w32_euclid;
- gf->multiply_region.w32 = gf_w32_multiply_region_from_single;
- return 1;
-}
-
-static
-inline
-gf_val_32_t
-gf_w32_split_8_8_multiply (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
-{
- uint32_t product, i, j, mask, tb;
- gf_internal_t *h;
- struct gf_split_8_8_data *d8;
-
- h = (gf_internal_t *) gf->scratch;
- d8 = (struct gf_split_8_8_data *) h->private;
- product = 0;
- mask = 0xff;
-
- for (i = 0; i < 4; i++) {
- tb = b32;
- for (j = 0; j < 4; j++) {
- product ^= d8->tables[i+j][a32&mask][tb&mask];
- tb >>= 8;
- }
- a32 >>= 8;
- }
- return product;
-}
-
-static
-inline
-void
-gf_w32_split_8_8_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
-{
- uint32_t product, mask, tb, tv, tp;
- gf_internal_t *h;
- struct gf_split_8_8_data *d8;
- uint32_t *p00, *p01, *p02, *p03;
- uint32_t *p10, *p11, *p12, *p13;
- uint32_t *p20, *p21, *p22, *p23;
- uint32_t *p30, *p31, *p32, *p33;
- uint32_t *s32, *d32, *top;
- unsigned long uls, uld;
-
- uls = (unsigned long) src;
- uld = (unsigned long) dest;
- if (uls %4 != 0 || ((uls & 0x7) != (uld & 0x7))) gf_alignment_error("gf_w32_split_8_8_multiply_region", 4);
- if (bytes % 4 != 0) {
- gf_alignment_error("gf_w32_split_8_8_multiply_region: buffer size not divisible by symbol size = 4 bytes", 4);
- }
-
- tv = val;
- h = (gf_internal_t *) gf->scratch;
- d8 = (struct gf_split_8_8_data *) h->private;
- mask = 0xff;
-
- p00 = &(d8->tables[0][val&mask][0]);
- p01 = &(d8->tables[1][val&mask][0]);
- p02 = &(d8->tables[2][val&mask][0]);
- p03 = &(d8->tables[3][val&mask][0]);
- val >>= 8;
- p10 = &(d8->tables[1][val&mask][0]);
- p11 = &(d8->tables[2][val&mask][0]);
- p12 = &(d8->tables[3][val&mask][0]);
- p13 = &(d8->tables[4][val&mask][0]);
- val >>= 8;
- p20 = &(d8->tables[2][val&mask][0]);
- p21 = &(d8->tables[3][val&mask][0]);
- p22 = &(d8->tables[4][val&mask][0]);
- p23 = &(d8->tables[5][val&mask][0]);
- val >>= 8;
- p30 = &(d8->tables[3][val&mask][0]);
- p31 = &(d8->tables[4][val&mask][0]);
- p32 = &(d8->tables[5][val&mask][0]);
- p33 = &(d8->tables[6][val&mask][0]);
-
- s32 = (uint32_t *) src;
- d32 = (uint32_t *) dest;
- top = (d32 + (bytes/4));
-
- while (d32 < top) {
- tb = *s32;
- tp = *d32;
- product = (xor) ? (*d32) : 0;
- product ^= p00[tb&mask];
- product ^= p10[tb&mask];
- product ^= p20[tb&mask];
- product ^= p30[tb&mask];
-
- tb >>= 8;
- product ^= p01[tb&mask];
- product ^= p11[tb&mask];
- product ^= p21[tb&mask];
- product ^= p31[tb&mask];
-
- tb >>= 8;
- product ^= p02[tb&mask];
- product ^= p12[tb&mask];
- product ^= p22[tb&mask];
- product ^= p32[tb&mask];
-
- tb >>= 8;
- product ^= p03[tb&mask];
- product ^= p13[tb&mask];
- product ^= p23[tb&mask];
- product ^= p33[tb&mask];
- *d32 = product;
- s32++;
- d32++;
- }
-}
-
-static
-void
-gf_w32_split_2_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
-{
- unsigned long uls, uld;
- gf_internal_t *h;
- struct gf_split_2_32_lazy_data *ld;
- int i;
- gf_val_32_t pp, v, v2, s, *s32, *d32, *top;
-
- uls = (unsigned long) src;
- uld = (unsigned long) dest;
- if (uls %4 != 0 || ((uls & 0x7) != (uld & 0x7))) gf_alignment_error("gf_w32_split_2_32_lazy_multiply_region", 4);
- if (bytes % 4 != 0) {
- gf_alignment_error("gf_w32_split_2_32_lazy_multiply_region: buffer size not divisible by symbol size = 4 bytes", 4);
- }
-
- if (val == 0) {
- if (xor) return;
- bzero(dest, bytes);
- return;
- }
-
- h = (gf_internal_t *) gf->scratch;
- pp = h->prim_poly;
-
- ld = (struct gf_split_2_32_lazy_data *) h->private;
-
- if (ld->last_value != val) {
- v = val;
- for (i = 0; i < 16; i++) {
- v2 = (v << 1);
- if (v & GF_FIRST_BIT) v2 ^= pp;
- ld->tables[i][0] = 0;
- ld->tables[i][1] = v;
- ld->tables[i][2] = v2;
- ld->tables[i][3] = (v2 ^ v);
- v = (v2 << 1);
- if (v2 & GF_FIRST_BIT) v ^= pp;
- }
- }
- ld->last_value = val;
-
- s32 = (gf_val_32_t *) src;
- d32 = (gf_val_32_t *) dest;
- top = d32 + (bytes/4);
-
- while (d32 != top) {
- v = (xor) ? *d32 : 0;
- s = *s32;
- i = 0;
- while (s != 0) {
- v ^= ld->tables[i][s&3];
- s >>= 2;
- i++;
- }
- *d32 = v;
- d32++;
- s32++;
- }
-}
-
-static
-void
-gf_w32_split_2_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
-{
-#ifdef INTEL_SSE4
- unsigned long uls, uld;
- gf_internal_t *h;
- int i, m, j, tindex;
- gf_val_32_t pp, v, v2, s, *s32, *d32, *top;
- __m128i vi, si, pi, shuffler, tables[16], adder, xi, mask1, mask2;
-
- uls = (unsigned long) src;
- uld = (unsigned long) dest;
- if (uls %4 != 0 || ((uls & 0xf) != (uld & 0xf))) gf_alignment_error("gf_w32_split_2_32_lazy_sse_multiply_region", 4);
- if (bytes % 4 != 0) {
- gf_alignment_error("gf_w32_split_2_32_lazy_sse_multiply_region: buffer size not divisible by symbol size = 4 bytes", 4);
- }
-
- if (val == 0) {
- if (xor) return;
- bzero(dest, bytes);
- return;
- }
-
- h = (gf_internal_t *) gf->scratch;
- pp = h->prim_poly;
-
- uls &= 0xf;
-
- s32 = (gf_val_32_t *) src;
- d32 = (gf_val_32_t *) dest;
- top = d32 + (bytes/4);
-
- if (uls != 0) {
- while (uls != 16) {
- if (xor) {
- *d32 ^= gf->multiply.w32(gf, *s32, val);
- } else {
- *d32 = gf->multiply.w32(gf, *s32, val);
- }
- *s32++;
- *d32++;
- if (d32 == top) return;
- uls += 4;
- }
- }
-
- uld = (unsigned long) top;
- top = (gf_val_32_t *) (uld - (uld & 0xf));
- uld &= 0xf;
-
- v = val;
- for (i = 0; i < 16; i++) {
- v2 = (v << 1);
- if (v & GF_FIRST_BIT) v2 ^= pp;
- tables[i] = _mm_set_epi32(v2 ^ v, v2, v, 0);
- v = (v2 << 1);
- if (v2 & GF_FIRST_BIT) v ^= pp;
- }
-
- shuffler = _mm_set_epi8(0xc, 0xc, 0xc, 0xc, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0);
- adder = _mm_set_epi8(3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0);
- mask1 = _mm_set1_epi8(0x3);
- mask2 = _mm_set1_epi8(0xc);
-
- while (d32 != top) {
- pi = (xor) ? _mm_load_si128 ((__m128i *) d32) : _mm_setzero_si128();
- vi = _mm_load_si128((__m128i *) s32);
-
- tindex = 0;
- for (i = 0; i < 4; i++) {
- si = _mm_shuffle_epi8(vi, shuffler);
-
- xi = _mm_and_si128(si, mask1);
- xi = _mm_slli_epi16(xi, 2);
- xi = _mm_xor_si128(xi, adder);
- pi = _mm_xor_si128(pi, _mm_shuffle_epi8(tables[tindex], xi));
- tindex++;
-
- xi = _mm_and_si128(si, mask2);
- xi = _mm_xor_si128(xi, adder);
- pi = _mm_xor_si128(pi, _mm_shuffle_epi8(tables[tindex], xi));
- si = _mm_srli_epi16(si, 2);
- tindex++;
-
- xi = _mm_and_si128(si, mask2);
- xi = _mm_xor_si128(xi, adder);
- pi = _mm_xor_si128(pi, _mm_shuffle_epi8(tables[tindex], xi));
- si = _mm_srli_epi16(si, 2);
- tindex++;
-
- xi = _mm_and_si128(si, mask2);
- xi = _mm_xor_si128(xi, adder);
- pi = _mm_xor_si128(pi, _mm_shuffle_epi8(tables[tindex], xi));
- si = _mm_srli_epi16(si, 2);
- tindex++;
-
- vi = _mm_srli_epi32(vi, 8);
- }
- _mm_store_si128((__m128i *) d32, pi);
- d32 += 4;
- s32 += 4;
- }
-
- while (uld > 0) {
- if (xor) {
- *d32 ^= gf->multiply.w32(gf, *s32, val);
- } else {
- *d32 = gf->multiply.w32(gf, *s32, val);
- }
- *s32++;
- *d32++;
- uld -= 4;
- }
-
-
-#endif
-}
-
-static
-void
-gf_w32_split_4_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
-{
- unsigned long uls, uld;
- gf_internal_t *h;
- struct gf_split_4_32_lazy_data *ld;
- int i, j, k;
- gf_val_32_t pp, v, s, *s32, *d32, *top;
-
- uls = (unsigned long) src;
- uld = (unsigned long) dest;
- if (uls %4 != 0 || ((uls & 0x7) != (uld & 0x7))) gf_alignment_error("gf_w32_split_4_32_lazy_multiply_region", 4);
- if (bytes % 4 != 0) {
- gf_alignment_error("gf_w32_split_4_32_lazy_multiply_region: buffer size not divisible by symbol size = 4 bytes", 4);
- }
-
- if (val == 0) {
- if (xor) return;
- bzero(dest, bytes);
- return;
- }
-
- h = (gf_internal_t *) gf->scratch;
- pp = h->prim_poly;
-
- ld = (struct gf_split_4_32_lazy_data *) h->private;
-
- if (ld->last_value != val) {
- v = val;
- for (i = 0; i < 8; i++) {
- ld->tables[i][0] = 0;
- for (j = 1; j < 16; j <<= 1) {
- for (k = 0; k < j; k++) {
- ld->tables[i][k^j] = (v ^ ld->tables[i][k]);
- }
- v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
- }
- }
- }
- ld->last_value = val;
-
- s32 = (gf_val_32_t *) src;
- d32 = (gf_val_32_t *) dest;
- top = d32 + (bytes/4);
-
- while (d32 != top) {
- v = (xor) ? *d32 : 0;
- s = *s32;
- i = 0;
- while (s != 0) {
- v ^= ld->tables[i][s&0xf];
- s >>= 4;
- i++;
- }
- *d32 = v;
- d32++;
- s32++;
- }
-}
-
-static
-void
-gf_w32_split_4_32_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
-{
-#ifdef INTEL_SSE4
- unsigned long uls, uld;
- gf_internal_t *h;
- int i, m, j, k, tindex;
- gf_val_32_t pp, v, s, *s32, *d32, *top, *realtop;
- __m128i si, tables[8][4], p0, p1, p2, p3, mask1, v0, v1, v2, v3;
- struct gf_split_4_32_lazy_data *ld;
- uint8_t btable[16];
-
- uls = (unsigned long) src;
- uld = (unsigned long) dest;
- if (uls %4 != 0 || ((uls & 0xf) != (uld & 0xf))) gf_alignment_error("gf_w32_split_4_32_lazy_sse_multiply_region", 4);
- if (bytes % 4 != 0) {
- gf_alignment_error("gf_w32_split_4_32_lazy_sse_multiply_region: buffer size not divisible by symbol size = 4 bytes", 4);
- }
-
- if (val == 0) {
- if (xor) return;
- bzero(dest, bytes);
- return;
- }
-
- h = (gf_internal_t *) gf->scratch;
- pp = h->prim_poly;
-
- uls &= 0xf;
-
- s32 = (gf_val_32_t *) src;
- d32 = (gf_val_32_t *) dest;
- top = d32 + (bytes/4);
-
- if (uls != 0) {
- while (uls != 16) {
- if (xor) {
- *d32 ^= gf->multiply.w32(gf, *s32, val);
- } else {
- *d32 = gf->multiply.w32(gf, *s32, val);
- }
- *s32++;
- *d32++;
- if (d32 == top) return;
- uls += 4;
- }
- }
-
- uld = (unsigned long) top;
- realtop = top;
-
- /* You need the size of this region to be a multiple of 64 bytes */
- bytes = (top - d32);
- bytes -= (bytes & 0xf);
- top = (d32 + bytes);
-
- ld = (struct gf_split_4_32_lazy_data *) h->private;
-
- v = val;
- for (i = 0; i < 8; i++) {
- ld->tables[i][0] = 0;
- for (j = 1; j < 16; j <<= 1) {
- for (k = 0; k < j; k++) {
- ld->tables[i][k^j] = (v ^ ld->tables[i][k]);
- }
- v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
- }
- for (j = 0; j < 4; j++) {
- for (k = 0; k < 16; k++) {
- btable[k] = (uint8_t) ld->tables[i][k];
- ld->tables[i][k] >>= 8;
- }
- tables[i][j] = _mm_loadu_si128((__m128i *) btable);
- }
- }
-
- mask1 = _mm_set1_epi8(0xf);
-
- if (xor) {
- while (d32 != top) {
- p0 = _mm_load_si128 ((__m128i *) d32);
- p1 = _mm_load_si128 ((__m128i *) (d32+4));
- p2 = _mm_load_si128 ((__m128i *) (d32+8));
- p3 = _mm_load_si128 ((__m128i *) (d32+12));
-
- v0 = _mm_load_si128((__m128i *) s32); s32 += 4;
- v1 = _mm_load_si128((__m128i *) s32); s32 += 4;
- v2 = _mm_load_si128((__m128i *) s32); s32 += 4;
- v3 = _mm_load_si128((__m128i *) s32); s32 += 4;
-
- si = _mm_and_si128(v0, mask1);
- p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[0][0], si));
- p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[0][1], si));
- p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[0][2], si));
- p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[0][3], si));
-
- v0 = _mm_srli_epi32(v0, 4);
- si = _mm_and_si128(v0, mask1);
- p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[1][0], si));
- p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[1][1], si));
- p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[1][2], si));
- p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si));
-
- si = _mm_and_si128(v1, mask1);
- p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[2][0], si));
- p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[2][1], si));
- p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[2][2], si));
- p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[2][3], si));
-
- v1 = _mm_srli_epi32(v1, 4);
- si = _mm_and_si128(v1, mask1);
- p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[3][0], si));
- p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[3][1], si));
- p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[3][2], si));
- p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[3][3], si));
-
- si = _mm_and_si128(v2, mask1);
- p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[4][0], si));
- p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[4][1], si));
- p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[4][2], si));
- p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[4][3], si));
-
- v2 = _mm_srli_epi32(v2, 4);
- si = _mm_and_si128(v2, mask1);
- p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[5][0], si));
- p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[5][1], si));
- p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[5][2], si));
- p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[5][3], si));
-
- si = _mm_and_si128(v3, mask1);
- p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[6][0], si));
- p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[6][1], si));
- p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[6][2], si));
- p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[6][3], si));
-
- v3 = _mm_srli_epi32(v3, 4);
- si = _mm_and_si128(v3, mask1);
- p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[7][0], si));
- p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[7][1], si));
- p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[7][2], si));
- p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[7][3], si));
-
- _mm_store_si128((__m128i *) d32, p0);
- _mm_store_si128((__m128i *) (d32+4), p1);
- _mm_store_si128((__m128i *) (d32+8), p2);
- _mm_store_si128((__m128i *) (d32+12), p3);
- d32 += 16;
- }
- } else {
- while (d32 != top) {
-
- v0 = _mm_load_si128((__m128i *) s32); s32 += 4;
- v1 = _mm_load_si128((__m128i *) s32); s32 += 4;
- v2 = _mm_load_si128((__m128i *) s32); s32 += 4;
- v3 = _mm_load_si128((__m128i *) s32); s32 += 4;
-
- si = _mm_and_si128(v0, mask1);
- p0 = _mm_shuffle_epi8(tables[0][0], si);
- p1 = _mm_shuffle_epi8(tables[0][1], si);
- p2 = _mm_shuffle_epi8(tables[0][2], si);
- p3 = _mm_shuffle_epi8(tables[0][3], si);
-
- v0 = _mm_srli_epi32(v0, 4);
- si = _mm_and_si128(v0, mask1);
- p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[1][0], si));
- p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[1][1], si));
- p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[1][2], si));
- p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si));
-
- si = _mm_and_si128(v1, mask1);
- p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[2][0], si));
- p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[2][1], si));
- p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[2][2], si));
- p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[2][3], si));
-
- v1 = _mm_srli_epi32(v1, 4);
- si = _mm_and_si128(v1, mask1);
- p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[3][0], si));
- p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[3][1], si));
- p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[3][2], si));
- p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[3][3], si));
-
- si = _mm_and_si128(v2, mask1);
- p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[4][0], si));
- p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[4][1], si));
- p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[4][2], si));
- p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[4][3], si));
-
- v2 = _mm_srli_epi32(v2, 4);
- si = _mm_and_si128(v2, mask1);
- p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[5][0], si));
- p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[5][1], si));
- p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[5][2], si));
- p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[5][3], si));
-
- si = _mm_and_si128(v3, mask1);
- p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[6][0], si));
- p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[6][1], si));
- p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[6][2], si));
- p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[6][3], si));
-
- v3 = _mm_srli_epi32(v3, 4);
- si = _mm_and_si128(v3, mask1);
- p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[7][0], si));
- p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[7][1], si));
- p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[7][2], si));
- p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[7][3], si));
-
- _mm_store_si128((__m128i *) d32, p0);
- _mm_store_si128((__m128i *) (d32+4), p1);
- _mm_store_si128((__m128i *) (d32+8), p2);
- _mm_store_si128((__m128i *) (d32+12), p3);
- d32 += 16;
- }
- }
-
- while (d32 < realtop) {
- if (xor) {
- *d32 ^= gf->multiply.w32(gf, *s32, val);
- } else {
- *d32 = gf->multiply.w32(gf, *s32, val);
- }
- *s32++;
- *d32++;
- }
-
-#endif
-}
-
-
-static
-void
-gf_w32_split_4_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
-{
-#ifdef INTEL_SSE4
- unsigned long uls, uld;
- gf_internal_t *h;
- int i, m, j, k, tindex;
- gf_val_32_t pp, v, s, *s32, *d32, *top, *realtop;
- __m128i vi, si, tables[8][4], p0, p1, p2, p3, mask1, v0, v1, v2, v3, mask8, mask16;
- __m128i tv1, tv2, tv3, tv0;
- struct gf_split_4_32_lazy_data *ld;
- uint8_t btable[16];
-
- uls = (unsigned long) src;
- uld = (unsigned long) dest;
- if (uls %4 != 0 || ((uls & 0xf) != (uld & 0xf))) gf_alignment_error("gf_w32_split_4_32_lazy_sse_multiply_region", 4);
- if (bytes % 4 != 0) {
- gf_alignment_error("gf_w32_split_4_32_lazy_sse_multiply_region: buffer size not divisible by symbol size = 4 bytes", 4);
- }
-
- if (val == 0) {
- if (xor) return;
- bzero(dest, bytes);
- return;
- }
-
- h = (gf_internal_t *) gf->scratch;
- pp = h->prim_poly;
-
- uls &= 0xf;
-
- s32 = (gf_val_32_t *) src;
- d32 = (gf_val_32_t *) dest;
- top = d32 + (bytes/4);
-
- if (uls != 0) {
- while (uls != 16) {
- if (xor) {
- *d32 ^= gf->multiply.w32(gf, *s32, val);
- } else {
- *d32 = gf->multiply.w32(gf, *s32, val);
- }
- *s32++;
- *d32++;
- if (d32 == top) return;
- uls += 4;
- }
- }
-
- uld = (unsigned long) top;
- realtop = top;
-
- /* You need the size of this region to be a multiple of 64 bytes */
- bytes = (top - d32);
- bytes -= (bytes & 0xf);
- top = (d32 + bytes);
-
- ld = (struct gf_split_4_32_lazy_data *) h->private;
-
- v = val;
- for (i = 0; i < 8; i++) {
- ld->tables[i][0] = 0;
- for (j = 1; j < 16; j <<= 1) {
- for (k = 0; k < j; k++) {
- ld->tables[i][k^j] = (v ^ ld->tables[i][k]);
- }
- v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
- }
- for (j = 0; j < 4; j++) {
- for (k = 0; k < 16; k++) {
- btable[k] = (uint8_t) ld->tables[i][k];
- ld->tables[i][k] >>= 8;
- }
- tables[i][j] = _mm_loadu_si128((__m128i *) btable);
- }
- }
-
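-  /*
-   * What the lazy tables encode, in scalar terms: tables[i][n] is the
-   * GF(2^32) product val * (n << 4*i), so for any 32-bit word w,
-   *
-   *   p = 0;
-   *   for (i = 0; i < 8; i++) p ^= ld->tables[i][(w >> (4*i)) & 0xf];
-   *
-   * yields p = val * w.  The SSE loops below perform 16 such lookups at
-   * a time with _mm_shuffle_epi8 on the byte slices held in tables[i][0..3].
-   */
-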
-  mask1 = _mm_set1_epi8(0xf);
-  mask8 = _mm_set1_epi16(0xff);
-
- if (xor) {
- while (d32 != top) {
- v0 = _mm_load_si128((__m128i *) s32); s32 += 4;
- v1 = _mm_load_si128((__m128i *) s32); s32 += 4;
- v2 = _mm_load_si128((__m128i *) s32); s32 += 4;
- v3 = _mm_load_si128((__m128i *) s32); s32 += 4;
-
-/* printf("Val = %x\n", val);
- MM_PRINT8("Old V0", v0);
- MM_PRINT8("Old V1", v1);
- MM_PRINT8("Old V2", v2);
- MM_PRINT8("Old V3", v3);
- printf("\n"); */
-
- p0 = _mm_srli_epi16(v0, 8);
- p1 = _mm_srli_epi16(v1, 8);
- p2 = _mm_srli_epi16(v2, 8);
- p3 = _mm_srli_epi16(v3, 8);
-
- tv0 = _mm_and_si128(v0, mask8);
- tv1 = _mm_and_si128(v1, mask8);
- tv2 = _mm_and_si128(v2, mask8);
- tv3 = _mm_and_si128(v3, mask8);
-
- v0 = _mm_packus_epi16(p1, p0);
- v1 = _mm_packus_epi16(tv1, tv0);
- v2 = _mm_packus_epi16(p3, p2);
- v3 = _mm_packus_epi16(tv3, tv2);
-
-/* MM_PRINT8("Middle V0", v0);
- MM_PRINT8("Middle V1", v1);
- MM_PRINT8("Middle V2", v2);
- MM_PRINT8("Middle V3", v3);
- printf("\n"); */
-
- p0 = _mm_srli_epi16(v0, 8);
- p1 = _mm_srli_epi16(v1, 8);
- p2 = _mm_srli_epi16(v2, 8);
- p3 = _mm_srli_epi16(v3, 8);
-
- tv0 = _mm_and_si128(v0, mask8);
- tv1 = _mm_and_si128(v1, mask8);
- tv2 = _mm_and_si128(v2, mask8);
- tv3 = _mm_and_si128(v3, mask8);
-
- v0 = _mm_packus_epi16(p2, p0);
- v1 = _mm_packus_epi16(p3, p1);
- v2 = _mm_packus_epi16(tv2, tv0);
- v3 = _mm_packus_epi16(tv3, tv1);
-
-/* MM_PRINT8("New V0", v0);
- MM_PRINT8("New V1", v1);
- MM_PRINT8("New V2", v2);
- MM_PRINT8("New V3", v3);
- printf("\n"); */
-
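-      /* After the two srli/and/packus rounds above, v0..v3 hold bytes
-         3..0 (most to least significant) of the sixteen source words,
-         which is why v0 indexes tables[6] and [7], v1 tables[4] and [5],
-         v2 tables[2] and [3], and v3 tables[0] and [1]. */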
- si = _mm_and_si128(v0, mask1);
- p0 = _mm_shuffle_epi8(tables[6][0], si);
- p1 = _mm_shuffle_epi8(tables[6][1], si);
- p2 = _mm_shuffle_epi8(tables[6][2], si);
- p3 = _mm_shuffle_epi8(tables[6][3], si);
-
- v0 = _mm_srli_epi32(v0, 4);
- si = _mm_and_si128(v0, mask1);
- p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[7][0], si));
- p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[7][1], si));
- p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[7][2], si));
- p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[7][3], si));
-
- si = _mm_and_si128(v1, mask1);
- p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[4][0], si));
- p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[4][1], si));
- p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[4][2], si));
- p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[4][3], si));
-
- v1 = _mm_srli_epi32(v1, 4);
- si = _mm_and_si128(v1, mask1);
- p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[5][0], si));
- p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[5][1], si));
- p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[5][2], si));
- p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[5][3], si));
-
- si = _mm_and_si128(v2, mask1);
- p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[2][0], si));
- p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[2][1], si));
- p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[2][2], si));
- p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[2][3], si));
-
- v2 = _mm_srli_epi32(v2, 4);
- si = _mm_and_si128(v2, mask1);
- p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[3][0], si));
- p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[3][1], si));
- p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[3][2], si));
- p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[3][3], si));
-
- si = _mm_and_si128(v3, mask1);
- p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[0][0], si));
- p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[0][1], si));
- p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[0][2], si));
- p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[0][3], si));
-
- v3 = _mm_srli_epi32(v3, 4);
- si = _mm_and_si128(v3, mask1);
- p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[1][0], si));
- p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[1][1], si));
- p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[1][2], si));
- p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si));
-
-/* MM_PRINT8("Old P0", p0);
- MM_PRINT8("Old P1", p1);
- MM_PRINT8("Old P2", p2);
- MM_PRINT8("Old P3", p3);
- printf("\n"); */
-
- tv0 = _mm_unpackhi_epi8(p1, p3);
- tv1 = _mm_unpackhi_epi8(p0, p2);
- tv2 = _mm_unpacklo_epi8(p1, p3);
- tv3 = _mm_unpacklo_epi8(p0, p2);
-
-/* MM_PRINT8("Middle P0", tv0);
- MM_PRINT8("Middle P1", tv1);
- MM_PRINT8("Middle P2", tv2);
- MM_PRINT8("Middle P3", tv3);
- printf("\n"); */
-
- p0 = _mm_unpackhi_epi8(tv1, tv0);
- p1 = _mm_unpacklo_epi8(tv1, tv0);
- p2 = _mm_unpackhi_epi8(tv3, tv2);
- p3 = _mm_unpacklo_epi8(tv3, tv2);
-
-/* MM_PRINT8("New P0", p0);
- MM_PRINT8("New P1", p1);
- MM_PRINT8("New P2", p2);
- MM_PRINT8("New P3", p3);
- printf("\n");
- exit(1); */
-
- v0 = _mm_load_si128 ((__m128i *) d32);
- v1 = _mm_load_si128 ((__m128i *) (d32+4));
- v2 = _mm_load_si128 ((__m128i *) (d32+8));
- v3 = _mm_load_si128 ((__m128i *) (d32+12));
-
- p0 = _mm_xor_si128(p0, v0);
- p1 = _mm_xor_si128(p1, v1);
- p2 = _mm_xor_si128(p2, v2);
- p3 = _mm_xor_si128(p3, v3);
-
- _mm_store_si128((__m128i *) d32, p0);
- _mm_store_si128((__m128i *) (d32+4), p1);
- _mm_store_si128((__m128i *) (d32+8), p2);
- _mm_store_si128((__m128i *) (d32+12), p3);
- d32 += 16;
- }
- } else {
- while (d32 != top) {
- v0 = _mm_load_si128((__m128i *) s32); s32 += 4;
- v1 = _mm_load_si128((__m128i *) s32); s32 += 4;
- v2 = _mm_load_si128((__m128i *) s32); s32 += 4;
- v3 = _mm_load_si128((__m128i *) s32); s32 += 4;
-
-/* printf("Val = %x\n", val);
- MM_PRINT8("Old V0", v0);
- MM_PRINT8("Old V1", v1);
- MM_PRINT8("Old V2", v2);
- MM_PRINT8("Old V3", v3);
- printf("\n"); */
-
- p0 = _mm_srli_epi16(v0, 8);
- p1 = _mm_srli_epi16(v1, 8);
- p2 = _mm_srli_epi16(v2, 8);
- p3 = _mm_srli_epi16(v3, 8);
-
- tv0 = _mm_and_si128(v0, mask8);
- tv1 = _mm_and_si128(v1, mask8);
- tv2 = _mm_and_si128(v2, mask8);
- tv3 = _mm_and_si128(v3, mask8);
-
- v0 = _mm_packus_epi16(p1, p0);
- v1 = _mm_packus_epi16(tv1, tv0);
- v2 = _mm_packus_epi16(p3, p2);
- v3 = _mm_packus_epi16(tv3, tv2);
-
-/* MM_PRINT8("Middle V0", v0);
- MM_PRINT8("Middle V1", v1);
- MM_PRINT8("Middle V2", v2);
- MM_PRINT8("Middle V3", v3);
- printf("\n"); */
-
- p0 = _mm_srli_epi16(v0, 8);
- p1 = _mm_srli_epi16(v1, 8);
- p2 = _mm_srli_epi16(v2, 8);
- p3 = _mm_srli_epi16(v3, 8);
-
- tv0 = _mm_and_si128(v0, mask8);
- tv1 = _mm_and_si128(v1, mask8);
- tv2 = _mm_and_si128(v2, mask8);
- tv3 = _mm_and_si128(v3, mask8);
-
- v0 = _mm_packus_epi16(p2, p0);
- v1 = _mm_packus_epi16(p3, p1);
- v2 = _mm_packus_epi16(tv2, tv0);
- v3 = _mm_packus_epi16(tv3, tv1);
-
-/* MM_PRINT8("New V0", v0);
- MM_PRINT8("New V1", v1);
- MM_PRINT8("New V2", v2);
- MM_PRINT8("New V3", v3);
- printf("\n"); */
-
- si = _mm_and_si128(v0, mask1);
- p0 = _mm_shuffle_epi8(tables[6][0], si);
- p1 = _mm_shuffle_epi8(tables[6][1], si);
- p2 = _mm_shuffle_epi8(tables[6][2], si);
- p3 = _mm_shuffle_epi8(tables[6][3], si);
-
- v0 = _mm_srli_epi32(v0, 4);
- si = _mm_and_si128(v0, mask1);
- p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[7][0], si));
- p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[7][1], si));
- p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[7][2], si));
- p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[7][3], si));
-
- si = _mm_and_si128(v1, mask1);
- p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[4][0], si));
- p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[4][1], si));
- p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[4][2], si));
- p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[4][3], si));
-
- v1 = _mm_srli_epi32(v1, 4);
- si = _mm_and_si128(v1, mask1);
- p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[5][0], si));
- p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[5][1], si));
- p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[5][2], si));
- p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[5][3], si));
-
- si = _mm_and_si128(v2, mask1);
- p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[2][0], si));
- p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[2][1], si));
- p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[2][2], si));
- p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[2][3], si));
-
- v2 = _mm_srli_epi32(v2, 4);
- si = _mm_and_si128(v2, mask1);
- p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[3][0], si));
- p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[3][1], si));
- p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[3][2], si));
- p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[3][3], si));
-
- si = _mm_and_si128(v3, mask1);
- p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[0][0], si));
- p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[0][1], si));
- p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[0][2], si));
- p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[0][3], si));
-
- v3 = _mm_srli_epi32(v3, 4);
- si = _mm_and_si128(v3, mask1);
- p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[1][0], si));
- p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[1][1], si));
- p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[1][2], si));
- p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si));
-
-/* MM_PRINT8("Old P0", p0);
- MM_PRINT8("Old P1", p1);
- MM_PRINT8("Old P2", p2);
- MM_PRINT8("Old P3", p3);
- printf("\n"); */
-
- tv0 = _mm_unpackhi_epi8(p1, p3);
- tv1 = _mm_unpackhi_epi8(p0, p2);
- tv2 = _mm_unpacklo_epi8(p1, p3);
- tv3 = _mm_unpacklo_epi8(p0, p2);
-
-/* MM_PRINT8("Middle P0", tv0);
- MM_PRINT8("Middle P1", tv1);
- MM_PRINT8("Middle P2", tv2);
- MM_PRINT8("Middle P3", tv3);
- printf("\n"); */
-
- p0 = _mm_unpackhi_epi8(tv1, tv0);
- p1 = _mm_unpacklo_epi8(tv1, tv0);
- p2 = _mm_unpackhi_epi8(tv3, tv2);
- p3 = _mm_unpacklo_epi8(tv3, tv2);
-
-/* MM_PRINT8("New P0", p0);
- MM_PRINT8("New P1", p1);
- MM_PRINT8("New P2", p2);
- MM_PRINT8("New P3", p3);
- printf("\n");
- exit(1); */
-
- _mm_store_si128((__m128i *) d32, p0);
- _mm_store_si128((__m128i *) (d32+4), p1);
- _mm_store_si128((__m128i *) (d32+8), p2);
- _mm_store_si128((__m128i *) (d32+12), p3);
- d32 += 16;
- }
- }
-
- while (d32 < realtop) {
- if (xor) {
- *d32 ^= gf->multiply.w32(gf, *s32, val);
- } else {
- *d32 = gf->multiply.w32(gf, *s32, val);
- }
-    s32++;
-    d32++;
-  }
-
-#endif
-}
-
-static
-int gf_w32_split_init(gf_t *gf)
-{
- gf_internal_t *h;
- struct gf_split_2_32_lazy_data *ld2;
- struct gf_split_4_32_lazy_data *ld4;
- struct gf_split_8_8_data *d8;
- uint32_t p, basep;
- int i, j, exp;
-
- h = (gf_internal_t *) gf->scratch;
-
- /* Defaults */
- gf->multiply_region.w32 = gf_w32_multiply_region_from_single;
- gf->multiply.w32 = gf_w32_shift_multiply;
- gf->inverse.w32 = gf_w32_euclid;
-
- if (h->arg1 == 8 && h->arg2 == 8) {
- gf->multiply.w32 = gf_w32_split_8_8_multiply;
- gf->multiply_region.w32 = gf_w32_split_8_8_multiply_region;
- d8 = (struct gf_split_8_8_data *) h->private;
- basep = 1;
- for (exp = 0; exp < 7; exp++) {
- for (j = 0; j < 256; j++) d8->tables[exp][0][j] = 0;
- for (i = 0; i < 256; i++) d8->tables[exp][i][0] = 0;
- d8->tables[exp][1][1] = basep;
- for (i = 2; i < 256; i++) {
- if (i&1) {
- p = d8->tables[exp][i^1][1];
- d8->tables[exp][i][1] = p ^ basep;
- } else {
- p = d8->tables[exp][i>>1][1];
- d8->tables[exp][i][1] = GF_MULTBY_TWO(p);
- }
- }
- for (i = 1; i < 256; i++) {
- p = d8->tables[exp][i][1];
- for (j = 1; j < 256; j++) {
- if (j&1) {
- d8->tables[exp][i][j] = d8->tables[exp][i][j^1] ^ p;
- } else {
- d8->tables[exp][i][j] = GF_MULTBY_TWO(d8->tables[exp][i][j>>1]);
- }
- }
- }
- for (i = 0; i < 8; i++) basep = GF_MULTBY_TWO(basep);
- }
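-    /* At this point d8->tables[exp][i][j] holds the GF(2^32) product
-       (i*j) * x^(8*exp), so an 8,8 split multiply of a*b can XOR
-       tables[i+j][a_i][b_j] over all pairs of bytes a_i of a and b_j of b. */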
- }
- if ((h->arg1 == 2 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 2)) {
- ld2 = (struct gf_split_2_32_lazy_data *) h->private;
- ld2->last_value = 0;
- if (h->region_type & GF_REGION_SSE) {
- gf->multiply_region.w32 = gf_w32_split_2_32_lazy_sse_multiply_region;
- } else {
- gf->multiply_region.w32 = gf_w32_split_2_32_lazy_multiply_region;
- }
- }
- if ((h->arg1 == 4 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 4)) {
- ld4 = (struct gf_split_4_32_lazy_data *) h->private;
- ld4->last_value = 0;
- if (h->region_type & GF_REGION_SSE) {
- if (h->region_type & GF_REGION_ALTMAP) {
- gf->multiply_region.w32 = gf_w32_split_4_32_lazy_sse_altmap_multiply_region;
- } else {
- gf->multiply_region.w32 = gf_w32_split_4_32_lazy_sse_multiply_region;
- }
- } else {
- gf->multiply_region.w32 = gf_w32_split_4_32_lazy_multiply_region;
- }
- }
- return 1;
-}
-
-static
-gf_val_32_t
-gf_w32_composite_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
-{
- gf_internal_t *h = (gf_internal_t *) gf->scratch;
- gf_t *base_gf = h->base_gf;
- uint16_t b0 = b & 0x0000ffff;
- uint16_t b1 = (b & 0xffff0000) >> 16;
- uint16_t a0 = a & 0x0000ffff;
- uint16_t a1 = (a & 0xffff0000) >> 16;
- uint16_t a1b1;
-
- a1b1 = base_gf->multiply.w16(base_gf, a1, b1);
-
-  return ((base_gf->multiply.w16(base_gf, a0, b0) ^ a1b1) |
-          ((base_gf->multiply.w16(base_gf, a1, b0) ^
-            base_gf->multiply.w16(base_gf, a0, b1) ^
-            base_gf->multiply.w16(base_gf, a1b1, GF_S_GF_16_2)) << 16));
-}
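-
-/*
- * The return value above is the product in the composite field GF((2^16)^2)
- * with p(x) = x^2 + sx + 1 and s = GF_S_GF_16_2.  Writing a = a1*x + a0 and
- * b = b1*x + b0, and reducing with x^2 = s*x + 1:
- *
- *   a*b = (a1*b0 ^ a0*b1 ^ s*a1*b1)*x + (a0*b0 ^ a1*b1)
- *
- * The coefficient of x is packed into the high 16 bits of the result.
- */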
-
-/*
- * Composite field division trick (explained in 2007 tech report)
- *
- * Compute a / b = a*b^-1, where p(x) = x^2 + sx + 1
- *
- * let c = b^-1
- *
- * c*b = (s*b1c1+b1c0+b0c1)x+(b1c1+b0c0)
- *
- * want (s*b1c1+b1c0+b0c1) = 0 and (b1c1+b0c0) = 1
- *
- * let d = b1c1 and d+1 = b0c0
- *
- * solve s*b1c1+b1c0+b0c1 = 0
- *
- * solution: d = (b1b0^-1)(b1b0^-1+b0b1^-1+s)^-1
- *
- * c0 = (d+1)b0^-1
- * c1 = d*b1^-1
- *
- * a / b = a * c
- */
-static
-gf_val_32_t
-gf_w32_composite_inverse(gf_t *gf, gf_val_32_t a)
-{
- gf_internal_t *h = (gf_internal_t *) gf->scratch;
- gf_t *base_gf = h->base_gf;
- uint16_t a0 = a & 0x0000ffff;
- uint16_t a1 = (a & 0xffff0000) >> 16;
- uint16_t c0, c1, d, tmp;
- uint32_t c;
- uint16_t a0inv, a1inv;
-
- if (a0 == 0) {
- a1inv = base_gf->inverse.w16(base_gf, a1);
- c0 = base_gf->multiply.w16(base_gf, a1inv, GF_S_GF_16_2);
- c1 = a1inv;
- } else if (a1 == 0) {
- c0 = base_gf->inverse.w16(base_gf, a0);
- c1 = 0;
- } else {
- a1inv = base_gf->inverse.w16(base_gf, a1);
- a0inv = base_gf->inverse.w16(base_gf, a0);
-
- d = base_gf->multiply.w16(base_gf, a1, a0inv);
-
- tmp = (base_gf->multiply.w16(base_gf, a1, a0inv) ^ base_gf->multiply.w16(base_gf, a0, a1inv) ^ GF_S_GF_16_2);
- tmp = base_gf->inverse.w16(base_gf, tmp);
-
- d = base_gf->multiply.w16(base_gf, d, tmp);
-
- c0 = base_gf->multiply.w16(base_gf, (d^1), a0inv);
- c1 = base_gf->multiply.w16(base_gf, d, a1inv);
- }
-
- c = c0 | (c1 << 16);
-
- return c;
-}
-
-static
-gf_val_32_t
-gf_w32_composite_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
-{
- gf_val_32_t binv;
-
- binv = gf_w32_composite_inverse(gf, b);
-
- return gf_w32_composite_multiply(gf, a, binv);
-}
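-
-/*
- * Sanity sketch: since a / b is computed as a * b^-1, for any nonzero b
- *
- *   gf_w32_composite_multiply(gf, gf_w32_composite_divide(gf, a, b), b)
- *
- * should return a, which makes a handy spot check when modifying the
- * inverse routine above.
- */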
-
-static
-void
-gf_w32_composite_multiply_region_table(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
-{
- unsigned long uls, uld;
- gf_internal_t *h = (gf_internal_t *) gf->scratch;
- gf_t *base_gf = h->base_gf;
- int i=0;
- struct gf_w16_logtable_data * ltd;
- uint16_t b0 = val & 0x0000ffff;
- uint16_t b1 = (val & 0xffff0000) >> 16;
- uint32_t *s32 = (uint32_t *) src;
- uint32_t *d32 = (uint32_t *) dest;
- uint16_t a0, a1, a1b1;
- int num_syms = bytes >> 2;
- int sym_divisible = bytes % 4;
-
- uls = (unsigned long) src;
- uld = (unsigned long) dest;
-  if ((uls & 0x7) != (uld & 0x7)) gf_alignment_error("gf_w32_composite_multiply_region_table", 4);
-  if (sym_divisible) {
-    gf_alignment_error("gf_w32_composite_multiply_region_table: buffer size not divisible by symbol size = 4 bytes", 4);
-  }
-
- if (val == 0) {
- if (xor) return;
- bzero(dest, bytes);
- return;
- }
-
- ltd = (struct gf_w16_logtable_data *) h->private;
-
- if (xor) {
-    for (i = 0; i < num_syms; i++) {
- a0 = s32[i] & 0x0000ffff;
- a1 = (s32[i] & 0xffff0000) >> 16;
- a1b1 = ltd->antilog_tbl[ltd->log_tbl[a1] + ltd->log_tbl[b1]];
-
- d32[i] ^= ((ltd->antilog_tbl[ltd->log_tbl[a0] + ltd->log_tbl[b0]] ^ a1b1) |
- ((ltd->antilog_tbl[ltd->log_tbl[a1] + ltd->log_tbl[b0]] ^ ltd->antilog_tbl[ltd->log_tbl[a0] + ltd->log_tbl[b1]] ^
- ltd->antilog_tbl[ltd->log_tbl[a1b1] + ltd->log_tbl[GF_S_GF_16_2]]) << 16));
-
- }
- } else {
-    for (i = 0; i < num_syms; i++) {
- a0 = s32[i] & 0x0000ffff;
- a1 = (s32[i] & 0xffff0000) >> 16;
- a1b1 = ltd->antilog_tbl[ltd->log_tbl[a1] + ltd->log_tbl[b1]];
-
- d32[i] = ((ltd->antilog_tbl[ltd->log_tbl[a0] + ltd->log_tbl[b0]] ^ a1b1) |
- ((ltd->antilog_tbl[ltd->log_tbl[a1] + ltd->log_tbl[b0]] ^ ltd->antilog_tbl[ltd->log_tbl[a0] + ltd->log_tbl[b1]] ^
- ltd->antilog_tbl[ltd->log_tbl[a1b1] + ltd->log_tbl[GF_S_GF_16_2]]) << 16));
- }
- }
-}
-
-static
-void
-gf_w32_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
-{
- unsigned long uls, uld;
- gf_internal_t *h = (gf_internal_t *) gf->scratch;
- gf_t *base_gf = h->base_gf;
- int i=0;
- struct gf_w16_logtable_data * ltd;
- uint16_t b0 = val & 0x0000ffff;
- uint16_t b1 = (val & 0xffff0000) >> 16;
- uint32_t *s32 = (uint32_t *) src;
- uint32_t *d32 = (uint32_t *) dest;
- uint16_t a0, a1, a1b1;
- int num_syms = bytes >> 2;
- int sym_divisible = bytes % 4;
-
- uls = (unsigned long) src;
- uld = (unsigned long) dest;
-  if ((uls & 0x7) != (uld & 0x7)) gf_alignment_error("gf_w32_composite_multiply_region", 4);
-  if (sym_divisible) {
-    gf_alignment_error("gf_w32_composite_multiply_region: buffer size not divisible by symbol size = 4 bytes", 4);
-  }
-
- if (val == 0) {
- if (xor) return;
- bzero(dest, bytes);
- return;
- }
-
- ltd = (struct gf_w16_logtable_data *) h->private;
-
- if (xor) {
-    for (i = 0; i < num_syms; i++) {
- a0 = s32[i] & 0x0000ffff;
- a1 = (s32[i] & 0xffff0000) >> 16;
- a1b1 = base_gf->multiply.w16(base_gf, a1, b1);
-
- d32[i] ^= ((base_gf->multiply.w16(base_gf, a0, b0) ^ a1b1) |
- ((base_gf->multiply.w16(base_gf, a1, b0) ^ base_gf->multiply.w16(base_gf, a0, b1) ^ base_gf->multiply.w16(base_gf, a1b1, GF_S_GF_16_2)) << 16));
-
- }
- } else {
-    for (i = 0; i < num_syms; i++) {
- a0 = s32[i] & 0x0000ffff;
- a1 = (s32[i] & 0xffff0000) >> 16;
- a1b1 = base_gf->multiply.w16(base_gf, a1, b1);
-
- d32[i] = ((base_gf->multiply.w16(base_gf, a0, b0) ^ a1b1) |
- ((base_gf->multiply.w16(base_gf, a1, b0) ^ base_gf->multiply.w16(base_gf, a0, b1) ^ base_gf->multiply.w16(base_gf, a1b1, GF_S_GF_16_2)) << 16));
- }
- }
-}
-
-
-
-static
-void
-gf_w32_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
-{
- gf_internal_t *h = (gf_internal_t *) gf->scratch;
- gf_t *base_gf = h->base_gf;
- gf_val_16_t val0 = val & 0x0000ffff;
- gf_val_16_t val1 = (val & 0xffff0000) >> 16;
- int sub_reg_size = bytes / 2;
-
- if (bytes % 2 != 0) gf_alignment_error("gf_w32_composite_multiply_region_alt", 1);
- if (sub_reg_size % 2 != 0) gf_alignment_error("gf_w32_composite_multiply_region_alt", 1);
-
- if (!xor) {
- memset(dest, 0, bytes);
- }
-
- base_gf->multiply_region.w16(base_gf, src, dest, val0, sub_reg_size, xor);
- base_gf->multiply_region.w16(base_gf, src+sub_reg_size, dest, val1, sub_reg_size, 1);
- base_gf->multiply_region.w16(base_gf, src, dest+sub_reg_size, val1, sub_reg_size, xor);
- base_gf->multiply_region.w16(base_gf, src+sub_reg_size, dest+sub_reg_size, val0, sub_reg_size, 1);
- base_gf->multiply_region.w16(base_gf, src+sub_reg_size, dest+sub_reg_size, base_gf->multiply.w16(base_gf, GF_S_GF_16_2, val1), sub_reg_size, 1);
-}
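-
-/*
- * Note on the ALTMAP layout assumed above: the region stores the low
- * 16-bit halves of all words first and the high halves second, so the
- * five base-field region calls compute, per word,
- *
- *   dest0 ^= a0*b0 ^ a1*b1                  (low halves)
- *   dest1 ^= a1*b0 ^ a0*b1 ^ s*a1*b1        (high halves)
- *
- * matching gf_w32_composite_multiply.
- */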
-
-static
-int gf_w32_composite_init(gf_t *gf)
-{
- struct gf_w16_logtable_data *ltd;
- gf_internal_t *h = (gf_internal_t *) gf->scratch;
- gf_t *base_gf = h->base_gf;
- gf_val_32_t a, b;
- uint64_t prim_poly = ((gf_internal_t *) base_gf->scratch)->prim_poly;
- int i;
-
- ltd = (struct gf_w16_logtable_data *) h->private;
-
- ltd->log_tbl[0] = 0;
-
- bzero(&(ltd->_antilog_tbl[0]), sizeof(ltd->_antilog_tbl));
-
- ltd->antilog_tbl = &(ltd->_antilog_tbl[GF_BASE_FIELD_SIZE * 2]);
-
- b = 1;
- for (i = 0; i < GF_BASE_FIELD_GROUP_SIZE; i++) {
- ltd->log_tbl[b] = (gf_val_16_t)i;
- ltd->antilog_tbl[i] = (gf_val_16_t)b;
- ltd->antilog_tbl[i+GF_BASE_FIELD_GROUP_SIZE] = (gf_val_16_t)b;
- b <<= 1;
- if (b & GF_BASE_FIELD_SIZE) {
- b = b ^ prim_poly;
- }
- }
- ltd->inv_tbl[0] = 0; /* Not really, but we need to fill it with something */
- ltd->inv_tbl[1] = 1;
- for (i = 2; i < GF_BASE_FIELD_SIZE; i++) {
- ltd->inv_tbl[i] = ltd->antilog_tbl[GF_BASE_FIELD_GROUP_SIZE-ltd->log_tbl[i]];
- }
-
- if (h->region_type & GF_REGION_ALTMAP) {
- gf->multiply_region.w32 = gf_w32_composite_multiply_region_alt;
- } else {
- if (h->region_type & GF_REGION_SINGLE_TABLE) {
- gf->multiply_region.w32 = gf_w32_composite_multiply_region_table;
- } else {
- gf->multiply_region.w32 = gf_w32_composite_multiply_region;
- }
- }
-
- gf->multiply.w32 = gf_w32_composite_multiply;
- gf->divide.w32 = gf_w32_composite_divide;
- gf->inverse.w32 = gf_w32_composite_inverse;
-
- return 1;
-}
-
-int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
-{
- int ss, sa;
-
- ss = (GF_REGION_SSE | GF_REGION_NOSSE);
- sa = (GF_REGION_STDMAP | GF_REGION_ALTMAP);
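-  /* ss and sa are pairs of mutually exclusive region flags: a request may
-     set at most one flag from each pair, and nothing else. */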
-
- switch(mult_type)
- {
- case GF_MULT_SPLIT_TABLE:
- if (arg1 == 8 && arg2 == 8){
- if (region_type != GF_REGION_DEFAULT) return -1;
- return sizeof(gf_internal_t) + sizeof(struct gf_split_8_8_data) + 64;
- }
- if ((arg1 == 2 && arg2 == 32) || (arg2 == 2 && arg1 == 32)) {
- region_type &= (~GF_REGION_LAZY);
- if ((region_type & ss) == ss) return -1;
- if ((region_type | ss) != ss) return -1;
- return sizeof(gf_internal_t) + sizeof(struct gf_split_2_32_lazy_data) + 64;
- }
- if ((arg1 == 4 && arg2 == 32) || (arg2 == 4 && arg1 == 32)) {
- region_type &= (~GF_REGION_LAZY);
- if ((region_type & ss) == ss) return -1;
- if ((region_type & sa) == sa) return -1;
- if (region_type & (~(ss|sa))) return -1;
- if (region_type & GF_REGION_SSE) {
- return sizeof(gf_internal_t) + sizeof(struct gf_split_4_32_lazy_data) + 64;
- } else if (region_type & GF_REGION_ALTMAP) {
- return -1;
- } else {
- return sizeof(gf_internal_t) + sizeof(struct gf_split_4_32_lazy_data) + 64;
- }
- }
- return -1;
- case GF_MULT_DEFAULT:
- case GF_MULT_SHIFT:
- if (arg1 != 0 || arg2 != 0 || region_type != 0) return -1;
- return sizeof(gf_internal_t);
- break;
- case GF_MULT_COMPOSITE:
- if (region_type & ~(GF_REGION_SINGLE_TABLE | GF_REGION_ALTMAP | GF_REGION_STDMAP)) return -1;
- if ((region_type & (GF_REGION_SINGLE_TABLE | GF_REGION_ALTMAP)) == (GF_REGION_SINGLE_TABLE | GF_REGION_ALTMAP)) return -1;
-    if ((arg1 == 2 && arg2 == 16) || (arg2 == 2 && arg1 == 16)) {
- return sizeof(gf_internal_t) + sizeof(struct gf_w16_logtable_data) + 64;
- } else {
- return -1;
- }
- default:
- return -1;
- }
-}
-
-int gf_w32_init(gf_t *gf)
-{
- gf_internal_t *h;
-
- h = (gf_internal_t *) gf->scratch;
- if (h->prim_poly == 0) h->prim_poly = 0x400007;
-
- gf->multiply.w32 = NULL;
- gf->divide.w32 = NULL;
- gf->inverse.w32 = NULL;
- gf->multiply_region.w32 = NULL;
-
- switch(h->mult_type) {
- case GF_MULT_DEFAULT:
- case GF_MULT_SHIFT: if (gf_w32_shift_init(gf) == 0) return 0; break;
- case GF_MULT_COMPOSITE: if (gf_w32_composite_init(gf) == 0) return 0; break;
- case GF_MULT_SPLIT_TABLE: if (gf_w32_split_init(gf) == 0) return 0; break;
- default: return 0;
- }
- if (h->divide_type == GF_DIVIDE_EUCLID) {
- gf->divide.w32 = gf_w32_divide_from_inverse;
- gf->inverse.w32 = gf_w32_euclid;
- } else if (h->divide_type == GF_DIVIDE_MATRIX) {
- gf->divide.w32 = gf_w32_divide_from_inverse;
- gf->inverse.w32 = gf_w32_matrix;
- }
-
- if (gf->inverse.w32 != NULL && gf->divide.w32 == NULL) {
- gf->divide.w32 = gf_w32_divide_from_inverse;
- }
- if (gf->inverse.w32 == NULL && gf->divide.w32 != NULL) {
- gf->inverse.w32 = gf_w32_inverse_from_divide;
- }
- return 1;
-}
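-
-/*
- * Usage sketch (illustrative only, assuming the two-argument gf_init_easy
- * declared in gf_complete.h): once initialization has filled in the
- * function pointers above, callers invoke them as
- *
- *   gf_t gf;
- *   gf_init_easy(&gf, 32);
- *   uint32_t c = gf.multiply.w32(&gf, a, b);
- *   gf.multiply_region.w32(&gf, src, dest, c, bytes, 1);
- *
- * where the final argument xor=1 makes the region call compute dest ^= c*src.
- */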
diff --git a/tmp.sh b/tmp.sh
deleted file mode 100644
index 6bd92b2..0000000
--- a/tmp.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-for i in 5 10 ; do
- sed 's/1 }/'$i' }/' tmp-time-test.sh > tmp2.sh
- sh tmp2.sh 4 LOG - - >> tmp-$i-out.txt
- sh tmp2.sh 4 TABLE - - >> tmp-$i-out.txt
- sh tmp2.sh 4 TABLE SINGLE,SSE - >> tmp-$i-out.txt
- sh tmp2.sh 8 LOG - - >> tmp-$i-out.txt
- sh tmp2.sh 8 TABLE - - >> tmp-$i-out.txt
- sh tmp2.sh 8 SPLIT 8 4 SSE - >> tmp-$i-out.txt
- sh tmp2.sh 16 LOG - - >> tmp-$i-out.txt
- sh tmp2.sh 16 SPLIT 16 4 SSE,STDMAP - >> tmp-$i-out.txt
- sh tmp2.sh 16 SPLIT 16 4 SSE,ALTMAP - >> tmp-$i-out.txt
- sh tmp2.sh 32 SPLIT 8 8 - - >> tmp-$i-out.txt
- sh tmp2.sh 32 SPLIT 32 4 SSE,STDMAP - >> tmp-$i-out.txt
- sh tmp2.sh 32 SPLIT 32 4 SSE,ALTMAP - >> tmp-$i-out.txt
-done
diff --git a/tmp.txt b/tmp.txt
deleted file mode 100644
index 468cf49..0000000
--- a/tmp.txt
+++ /dev/null
@@ -1,162 +0,0 @@
-Tables[0] = 0000000000000000 3b60e7ccf8f4454e 76c1cf99f1e88a9c 4da12855091ccfd2 ed839f33e3d11538 d6e378ff1b255076 9b4250aa12399fa4 a022b766eacddaea db073e67c7a22a6b e067d9ab3f566f25 adc6f1fe364aa0f7 96a61632cebee5b9 3684a15424733f53 0de44698dc877a1d 40456ecdd59bb5cf 7b2589012d6ff081
-Tij 81 cf 1d 53 b9 f7 25 6b ea a4 76 38 d2 9c 4e 00
-Tij f0 b5 7a 3f e5 a0 6f 2a da 9f 50 15 cf 8a 45 00
-Tij 6f 9b 87 73 be 4a 56 a2 cd 39 25 d1 1c e8 f4 00
-Tij 2d d5 dc 24 ce 36 3f c7 ea 12 1b e3 09 f1 f8 00
-Tij 01 cd 98 54 32 fe ab 67 66 aa ff 33 55 99 cc 00
-Tij 89 6e 46 a1 16 f1 d9 3e b7 50 78 9f 28 cf e7 00
-Tij 25 45 e4 84 a6 c6 67 07 22 42 e3 83 a1 c1 60 00
-Tij 7b 40 0d 36 96 ad e0 db a0 9b d6 ed 4d 76 3b 00
-Tables[1] = 0000000000000000 b60e7ccf8f4454cd 6c1cf99f1e88a981 da12855091ccfd4c d839f33e3d115302 6e378ff1b25507cf b4250aa12399fa83 022b766eacddae4e b073e67c7a22a61f 067d9ab3f566f2d2 dc6f1fe364aa0f9e 6a61632cebee5b53 684a15424733f51d de44698dc877a1d0 0456ecdd59bb5c9c b2589012d6ff0851
-Tij 51 9c d0 1d 53 9e d2 1f 4e 83 cf 02 4c 81 cd 00
-Tij 08 5c a1 f5 5b 0f f2 a6 ae fa 07 53 fd a9 54 00
-Tij ff bb 77 33 ee aa 66 22 dd 99 55 11 cc 88 44 00
-Tij d6 59 c8 47 eb 64 f5 7a ac 23 b2 3d 91 1e 8f 00
-Tij 12 dd 8d 42 2c e3 b3 7c 6e a1 f1 3e 50 9f cf 00
-Tij 90 ec 69 15 63 1f 9a e6 76 0a 8f f3 85 f9 7c 00
-Tij 58 56 44 4a 61 6f 7d 73 2b 25 37 39 12 1c 0e 00
-Tij b2 04 de 68 6a dc 06 b0 02 b4 6e d8 da 6c b6 00
-Tables[2] = 0000000000000000 60e7ccf8f4454c25 c1cf99f1e88a984a a12855091ccfd46f 839f33e3d115308f e378ff1b25507caa 4250aa12399fa8c5 22b766eacddae4e0 073e67c7a22a6105 67d9ab3f566f2d20 c6f1fe364aa0f94f a61632cebee5b56a 84a15424733f518a e44698dc877a1daf 456ecdd59bb5c9c0 2589012d6ff085e5
-Tij e5 c0 af 8a 6a 4f 20 05 e0 c5 aa 8f 6f 4a 25 00
-Tij 85 c9 1d 51 b5 f9 2d 61 e4 a8 7c 30 d4 98 4c 00
-Tij f0 b5 7a 3f e5 a0 6f 2a da 9f 50 15 cf 8a 45 00
-Tij 6f 9b 87 73 be 4a 56 a2 cd 39 25 d1 1c e8 f4 00
-Tij 2d d5 dc 24 ce 36 3f c7 ea 12 1b e3 09 f1 f8 00
-Tij 01 cd 98 54 32 fe ab 67 66 aa ff 33 55 99 cc 00
-Tij 89 6e 46 a1 16 f1 d9 3e b7 50 78 9f 28 cf e7 00
-Tij 25 45 e4 84 a6 c6 67 07 22 42 e3 83 a1 c1 60 00
-Tables[3] = 0000000000000000 0e7ccf8f4454c20a 1cf99f1e88a98414 12855091ccfd461e 39f33e3d11530828 378ff1b25507ca22 250aa12399fa8c3c 2b766eacddae4e36 73e67c7a22a61050 7d9ab3f566f2d25a 6f1fe364aa0f9444 61632cebee5b564e 4a15424733f51878 44698dc877a1da72 56ecdd59bb5c9c6c 589012d6ff085e66
-Tij 66 6c 72 78 4e 44 5a 50 36 3c 22 28 1e 14 0a 00
-Tij 5e 9c da 18 56 94 d2 10 4e 8c ca 08 46 84 c2 00
-Tij 08 5c a1 f5 5b 0f f2 a6 ae fa 07 53 fd a9 54 00
-Tij ff bb 77 33 ee aa 66 22 dd 99 55 11 cc 88 44 00
-Tij d6 59 c8 47 eb 64 f5 7a ac 23 b2 3d 91 1e 8f 00
-Tij 12 dd 8d 42 2c e3 b3 7c 6e a1 f1 3e 50 9f cf 00
-Tij 90 ec 69 15 63 1f 9a e6 76 0a 8f f3 85 f9 7c 00
-Tij 58 56 44 4a 61 6f 7d 73 2b 25 37 39 12 1c 0e 00
-Tables[4] = 0000000000000000 e7ccf8f4454c20a0 cf99f1e88a98415b 2855091ccfd461fb 9f33e3d1153082ad 78ff1b25507ca20d 50aa12399fa8c3f6 b766eacddae4e356 3e67c7a22a610541 d9ab3f566f2d25e1 f1fe364aa0f9441a 1632cebee5b564ba a15424733f5187ec 4698dc877a1da74c 6ecdd59bb5c9c6b7 89012d6ff085e617
-Tij 17 b7 4c ec ba 1a e1 41 56 f6 0d ad fb 5b a0 00
-Tij e6 c6 a7 87 64 44 25 05 e3 c3 a2 82 61 41 20 00
-Tij 85 c9 1d 51 b5 f9 2d 61 e4 a8 7c 30 d4 98 4c 00
-Tij f0 b5 7a 3f e5 a0 6f 2a da 9f 50 15 cf 8a 45 00
-Tij 6f 9b 87 73 be 4a 56 a2 cd 39 25 d1 1c e8 f4 00
-Tij 2d d5 dc 24 ce 36 3f c7 ea 12 1b e3 09 f1 f8 00
-Tij 01 cd 98 54 32 fe ab 67 66 aa ff 33 55 99 cc 00
-Tij 89 6e 46 a1 16 f1 d9 3e b7 50 78 9f 28 cf e7 00
-Tables[5] = 0000000000000000 7ccf8f4454c20a82 f99f1e88a9841504 855091ccfd461f86 f33e3d1153082a13 8ff1b25507ca2091 0aa12399fa8c3f17 766eacddae4e3595 e67c7a22a610543d 9ab3f566f2d25ebf 1fe364aa0f944139 632cebee5b564bbb 15424733f5187e2e 698dc877a1da74ac ecdd59bb5c9c6b2a 9012d6ff085e61a8
-Tij a8 2a ac 2e bb 39 bf 3d 95 17 91 13 86 04 82 00
-Tij 61 6b 74 7e 4b 41 5e 54 35 3f 20 2a 1f 15 0a 00
-Tij 5e 9c da 18 56 94 d2 10 4e 8c ca 08 46 84 c2 00
-Tij 08 5c a1 f5 5b 0f f2 a6 ae fa 07 53 fd a9 54 00
-Tij ff bb 77 33 ee aa 66 22 dd 99 55 11 cc 88 44 00
-Tij d6 59 c8 47 eb 64 f5 7a ac 23 b2 3d 91 1e 8f 00
-Tij 12 dd 8d 42 2c e3 b3 7c 6e a1 f1 3e 50 9f cf 00
-Tij 90 ec 69 15 63 1f 9a e6 76 0a 8f f3 85 f9 7c 00
-Tables[6] = 0000000000000000 ccf8f4454c20a861 99f1e88a984150d9 55091ccfd461f8b8 33e3d1153082a1a9 ff1b25507ca209c8 aa12399fa8c3f170 66eacddae4e35911 67c7a22a61054352 ab3f566f2d25eb33 fe364aa0f944138b 32cebee5b564bbea 5424733f5187e2fb 98dc877a1da74a9a cdd59bb5c9c6b222 012d6ff085e61a43
-Tij 43 22 9a fb ea 8b 33 52 11 70 c8 a9 b8 d9 61 00
-Tij 1a b2 4a e2 bb 13 eb 43 59 f1 09 a1 f8 50 a8 00
-Tij e6 c6 a7 87 64 44 25 05 e3 c3 a2 82 61 41 20 00
-Tij 85 c9 1d 51 b5 f9 2d 61 e4 a8 7c 30 d4 98 4c 00
-Tij f0 b5 7a 3f e5 a0 6f 2a da 9f 50 15 cf 8a 45 00
-Tij 6f 9b 87 73 be 4a 56 a2 cd 39 25 d1 1c e8 f4 00
-Tij 2d d5 dc 24 ce 36 3f c7 ea 12 1b e3 09 f1 f8 00
-Tij 01 cd 98 54 32 fe ab 67 66 aa ff 33 55 99 cc 00
-Tables[7] = 0000000000000000 cf8f4454c20a86a4 9f1e88a984150d53 5091ccfd461f8bf7 3e3d1153082a1abd f1b25507ca209c19 a12399fa8c3f17ee 6eacddae4e35914a 7c7a22a61054357a b3f566f2d25eb3de e364aa0f94413829 2cebee5b564bbe8d 424733f5187e2fc7 8dc877a1da74a963 dd59bb5c9c6b2294 12d6ff085e61a430
-Tij 30 94 63 c7 8d 29 de 7a 4a ee 19 bd f7 53 a4 00
-Tij a4 22 a9 2f be 38 b3 35 91 17 9c 1a 8b 0d 86 00
-Tij 61 6b 74 7e 4b 41 5e 54 35 3f 20 2a 1f 15 0a 00
-Tij 5e 9c da 18 56 94 d2 10 4e 8c ca 08 46 84 c2 00
-Tij 08 5c a1 f5 5b 0f f2 a6 ae fa 07 53 fd a9 54 00
-Tij ff bb 77 33 ee aa 66 22 dd 99 55 11 cc 88 44 00
-Tij d6 59 c8 47 eb 64 f5 7a ac 23 b2 3d 91 1e 8f 00
-Tij 12 dd 8d 42 2c e3 b3 7c 6e a1 f1 3e 50 9f cf 00
-Tables[8] = 0000000000000000 f8f4454c20a86af4 f1e88a984150d5f3 091ccfd461f8bf07 e3d1153082a1abfd 1b25507ca209c109 12399fa8c3f17e0e eacddae4e35914fa c7a22a61054357e1 3f566f2d25eb3d15 364aa0f944138212 cebee5b564bbe8e6 24733f5187e2fc1c dc877a1da74a96e8 d59bb5c9c6b229ef 2d6ff085e61a431b
-Tij 1b ef e8 1c e6 12 15 e1 fa 0e 09 fd 07 f3 f4 00
-Tij 43 29 96 fc e8 82 3d 57 14 7e c1 ab bf d5 6a 00
-Tij 1a b2 4a e2 bb 13 eb 43 59 f1 09 a1 f8 50 a8 00
-Tij e6 c6 a7 87 64 44 25 05 e3 c3 a2 82 61 41 20 00
-Tij 85 c9 1d 51 b5 f9 2d 61 e4 a8 7c 30 d4 98 4c 00
-Tij f0 b5 7a 3f e5 a0 6f 2a da 9f 50 15 cf 8a 45 00
-Tij 6f 9b 87 73 be 4a 56 a2 cd 39 25 d1 1c e8 f4 00
-Tij 2d d5 dc 24 ce 36 3f c7 ea 12 1b e3 09 f1 f8 00
-Tables[9] = 0000000000000000 8f4454c20a86afd9 1e88a984150d5fa9 91ccfd461f8bf070 3d1153082a1abf52 b25507ca209c108b 2399fa8c3f17e0fb acddae4e35914f22 7a22a61054357ea4 f566f2d25eb3d17d 64aa0f944138210d ebee5b564bbe8ed4 4733f5187e2fc1f6 c877a1da74a96e2f 59bb5c9c6b229e5f d6ff085e61a43186
-Tij 86 5f 2f f6 d4 0d 7d a4 22 fb 8b 52 70 a9 d9 00
-Tij 31 9e 6e c1 8e 21 d1 7e 4f e0 10 bf f0 5f af 00
-Tij a4 22 a9 2f be 38 b3 35 91 17 9c 1a 8b 0d 86 00
-Tij 61 6b 74 7e 4b 41 5e 54 35 3f 20 2a 1f 15 0a 00
-Tij 5e 9c da 18 56 94 d2 10 4e 8c ca 08 46 84 c2 00
-Tij 08 5c a1 f5 5b 0f f2 a6 ae fa 07 53 fd a9 54 00
-Tij ff bb 77 33 ee aa 66 22 dd 99 55 11 cc 88 44 00
-Tij d6 59 c8 47 eb 64 f5 7a ac 23 b2 3d 91 1e 8f 00
-Tables[10] = 0000000000000000 f4454c20a86afd48 e88a984150d5fa8b 1ccfd461f8bf07c3 d1153082a1abf50d 25507ca209c10845 399fa8c3f17e0f86 cddae4e35914f2ce a22a61054357ea01 566f2d25eb3d1749 4aa0f9441382108a bee5b564bbe8edc2 733f5187e2fc1f0c 877a1da74a96e244 9bb5c9c6b229e587 6ff085e61a4318cf
-Tij cf 87 44 0c c2 8a 49 01 ce 86 45 0d c3 8b 48 00
-Tij 18 e5 e2 1f ed 10 17 ea f2 0f 08 f5 07 fa fd 00
-Tij 43 29 96 fc e8 82 3d 57 14 7e c1 ab bf d5 6a 00
-Tij 1a b2 4a e2 bb 13 eb 43 59 f1 09 a1 f8 50 a8 00
-Tij e6 c6 a7 87 64 44 25 05 e3 c3 a2 82 61 41 20 00
-Tij 85 c9 1d 51 b5 f9 2d 61 e4 a8 7c 30 d4 98 4c 00
-Tij f0 b5 7a 3f e5 a0 6f 2a da 9f 50 15 cf 8a 45 00
-Tij 6f 9b 87 73 be 4a 56 a2 cd 39 25 d1 1c e8 f4 00
-Tables[11] = 0000000000000000 4454c20a86afd419 88a984150d5fa832 ccfd461f8bf07c2b 1153082a1abf507f 5507ca209c108466 99fa8c3f17e0f84d ddae4e35914f2c54 22a61054357ea0fe 66f2d25eb3d174e7 aa0f9441382108cc ee5b564bbe8edcd5 33f5187e2fc1f081 77a1da74a96e2498 bb5c9c6b229e58b3 ff085e61a4318caa
-Tij aa b3 98 81 d5 cc e7 fe 54 4d 66 7f 2b 32 19 00
-Tij 8c 58 24 f0 dc 08 74 a0 2c f8 84 50 7c a8 d4 00
-Tij 31 9e 6e c1 8e 21 d1 7e 4f e0 10 bf f0 5f af 00
-Tij a4 22 a9 2f be 38 b3 35 91 17 9c 1a 8b 0d 86 00
-Tij 61 6b 74 7e 4b 41 5e 54 35 3f 20 2a 1f 15 0a 00
-Tij 5e 9c da 18 56 94 d2 10 4e 8c ca 08 46 84 c2 00
-Tij 08 5c a1 f5 5b 0f f2 a6 ae fa 07 53 fd a9 54 00
-Tij ff bb 77 33 ee aa 66 22 dd 99 55 11 cc 88 44 00
-Tables[12] = 0000000000000000 454c20a86afd41fc 8a984150d5fa83f8 cfd461f8bf07c204 153082a1abf507eb 507ca209c1084617 9fa8c3f17e0f8413 dae4e35914f2c5ef 2a61054357ea0fd6 6f2d25eb3d174e2a a0f9441382108c2e e5b564bbe8edcdd2 3f5187e2fc1f083d 7a1da74a96e249c1 b5c9c6b229e58bc5 f085e61a4318ca39
-Tij 39 c5 c1 3d d2 2e 2a d6 ef 13 17 eb 04 f8 fc 00
-Tij ca 8b 49 08 cd 8c 4e 0f c5 84 46 07 c2 83 41 00
-Tij 18 e5 e2 1f ed 10 17 ea f2 0f 08 f5 07 fa fd 00
-Tij 43 29 96 fc e8 82 3d 57 14 7e c1 ab bf d5 6a 00
-Tij 1a b2 4a e2 bb 13 eb 43 59 f1 09 a1 f8 50 a8 00
-Tij e6 c6 a7 87 64 44 25 05 e3 c3 a2 82 61 41 20 00
-Tij 85 c9 1d 51 b5 f9 2d 61 e4 a8 7c 30 d4 98 4c 00
-Tij f0 b5 7a 3f e5 a0 6f 2a da 9f 50 15 cf 8a 45 00
-Tables[13] = 0000000000000000 54c20a86afd41fac a984150d5fa83f58 fd461f8bf07c20f4 53082a1abf507eab 07ca209c10846107 fa8c3f17e0f841f3 ae4e35914f2c5e5f a61054357ea0fd56 f2d25eb3d174e2fa 0f9441382108c20e 5b564bbe8edcdda2 f5187e2fc1f083fd a1da74a96e249c51 5c9c6b229e58bca5 085e61a4318ca309
-Tij 09 a5 51 fd a2 0e fa 56 5f f3 07 ab f4 58 ac 00
-Tij a3 bc 9c 83 dd c2 e2 fd 5e 41 61 7e 20 3f 1f 00
-Tij 8c 58 24 f0 dc 08 74 a0 2c f8 84 50 7c a8 d4 00
-Tij 31 9e 6e c1 8e 21 d1 7e 4f e0 10 bf f0 5f af 00
-Tij a4 22 a9 2f be 38 b3 35 91 17 9c 1a 8b 0d 86 00
-Tij 61 6b 74 7e 4b 41 5e 54 35 3f 20 2a 1f 15 0a 00
-Tij 5e 9c da 18 56 94 d2 10 4e 8c ca 08 46 84 c2 00
-Tij 08 5c a1 f5 5b 0f f2 a6 ae fa 07 53 fd a9 54 00
-Tables[14] = 0000000000000000 4c20a86afd41fab7 984150d5fa83f56e d461f8bf07c20fd9 3082a1abf507eac7 7ca209c108461070 a8c3f17e0f841fa9 e4e35914f2c5e51e 61054357ea0fd58e 2d25eb3d174e2f39 f9441382108c20e0 b564bbe8edcdda57 5187e2fc1f083f49 1da74a96e249c5fe c9c6b229e58bca27 85e61a4318ca3090
-Tij 90 27 fe 49 57 e0 39 8e 1e a9 70 c7 d9 6e b7 00
-Tij 30 ca c5 3f da 20 2f d5 e5 1f 10 ea 0f f5 fa 00
-Tij ca 8b 49 08 cd 8c 4e 0f c5 84 46 07 c2 83 41 00
-Tij 18 e5 e2 1f ed 10 17 ea f2 0f 08 f5 07 fa fd 00
-Tij 43 29 96 fc e8 82 3d 57 14 7e c1 ab bf d5 6a 00
-Tij 1a b2 4a e2 bb 13 eb 43 59 f1 09 a1 f8 50 a8 00
-Tij e6 c6 a7 87 64 44 25 05 e3 c3 a2 82 61 41 20 00
-Tij 85 c9 1d 51 b5 f9 2d 61 e4 a8 7c 30 d4 98 4c 00
-Tables[15] = 0000000000000000 c20a86afd41fab1c 84150d5fa83f5623 461f8bf07c20fd3f 082a1abf507eac5d ca209c1084610741 8c3f17e0f841fa7e 4e35914f2c5e5162 1054357ea0fd58ba d25eb3d174e2f3a6 9441382108c20e99 564bbe8edcdda585 187e2fc1f083f4e7 da74a96e249c5ffb 9c6b229e58bca2c4 5e61a4318ca309d8
-Tij d8 c4 fb e7 85 99 a6 ba 62 7e 41 5d 3f 23 1c 00
-Tij 09 a2 5f f4 a5 0e f3 58 51 fa 07 ac fd 56 ab 00
-Tij a3 bc 9c 83 dd c2 e2 fd 5e 41 61 7e 20 3f 1f 00
-Tij 8c 58 24 f0 dc 08 74 a0 2c f8 84 50 7c a8 d4 00
-Tij 31 9e 6e c1 8e 21 d1 7e 4f e0 10 bf f0 5f af 00
-Tij a4 22 a9 2f be 38 b3 35 91 17 9c 1a 8b 0d 86 00
-Tij 61 6b 74 7e 4b 41 5e 54 35 3f 20 2a 1f 15 0a 00
-Tij 5e 9c da 18 56 94 d2 10 4e 8c ca 08 46 84 c2 00
-Val= 3b60e7ccf8f4454e
-v0 28 4f 14 e3 1b f7 ee 76 b9 31 47 0a ba 8b 70 fc
-v0 12 56 28 59 66 cd d2 d2 1c 91 30 26 a8 95 0a a9
-v0 ee 5d 14 e3 fb c8 45 23 a9 fd 8c f1 ff c9 2c 93
-v0 65 ce 82 f2 dc ec 6b e2 53 a3 9c fb 07 70 e7 ad
-v0 1b 87 3d 7b 4d 15 1d c2 d2 45 f3 03 4b e4 f4 9b
-v0 3b 01 2b c5 c5 d2 9d a9 68 7c a2 61 c9 5b 49 90
-v0 5d 13 7d ef eb f1 52 da a0 29 89 ef 08 f2 51 3b
-v0 17 05 b3 80 77 3a f2 5e 82 7a c9 39 84 df 8e bf
-
-p0 11 fc 47 f4 6c 01 44 ba ba 62 e7 3f ba fb ba 85
-p0 a6 fc 67 16 5f c3 95 fc 58 51 f4 fd 58 5f 58 a5
-p0 12 fc 1f b3 50 1e 3f 9a fd 5e 83 20 fd 9c fd dd
-p0 d9 fc 1e ee 22 42 10 7f a0 2c f0 7c a0 24 a0 dc
-p0 a2 fc 4c 30 41 ce ad eb 7e 4f c1 f0 7e 6e 7e 8e
-p0 8b fc 7c 7b 9f b5 38 67 35 91 2f 8b 35 a9 35 be
-p0 07 fc 89 1a 3b 21 fd db 54 35 7e 1f 54 74 54 4b
-p0 cf fc 94 5e 40 78 c2 31 10 4e 18 46 10 da 10 56
diff --git a/tmp2.sh b/tmp2.sh
deleted file mode 100644
index d98248f..0000000
--- a/tmp2.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-if [ $# -lt 4 ]; then
-  echo 'usage: sh tmp2.sh w gf_specs (e.g. LOG - -)' >&2
- exit 1
-fi
-
-w=$1
-shift
-i=1024
-while [ $i -le 1073741824 ]; do
- iter=`echo $i | awk '{ print (1073741824/$1)*10 }'`
- echo $i $iter $w $* `gf_time $w R -1 $i $iter $*`
- i=`echo $i | awk '{ print $1*2 }'`
-done