diff options
-rw-r--r-- | Manual.pdf | bin | 501334 -> 0 bytes | |||
-rw-r--r-- | README | 10 | ||||
-rwxr-xr-x | compile | 347 | ||||
-rw-r--r-- | configure.ac | 11 | ||||
-rwxr-xr-x | depcomp | 791 | ||||
-rw-r--r-- | examples/gf_example_2.c | 4 | ||||
-rw-r--r-- | examples/gf_example_5.c | 1 | ||||
-rw-r--r-- | examples/gf_example_6.c | 1 | ||||
-rw-r--r-- | examples/gf_example_7.c | 1 | ||||
-rw-r--r-- | include/gf_complete.h | 15 | ||||
-rw-r--r-- | include/gf_int.h | 4 | ||||
-rw-r--r-- | m4/ltoptions.m4 | 19 | ||||
-rw-r--r-- | src/gf.c | 68 | ||||
-rw-r--r-- | src/gf_general.c | 9 | ||||
-rw-r--r-- | src/gf_general.h | 61 | ||||
-rw-r--r-- | src/gf_int.h | 200 | ||||
-rw-r--r-- | src/gf_method.c | 7 | ||||
-rw-r--r-- | src/gf_rand.h | 22 | ||||
-rw-r--r-- | src/gf_w128.c | 63 | ||||
-rw-r--r-- | src/gf_w16.c | 77 | ||||
-rw-r--r-- | src/gf_w32.c | 229 | ||||
-rw-r--r-- | src/gf_w4.c | 207 | ||||
-rw-r--r-- | src/gf_w64.c | 90 | ||||
-rw-r--r-- | src/gf_w8.c | 102 | ||||
-rw-r--r-- | src/gf_wgen.c | 13 | ||||
-rwxr-xr-x | test-driver | 139 | ||||
-rw-r--r-- | test/gf_unit.c | 20 | ||||
-rw-r--r-- | tools/Makefile.am | 2 | ||||
-rw-r--r-- | tools/gf_add.c | 2 | ||||
-rw-r--r-- | tools/gf_inline_time.c | 5 | ||||
-rw-r--r-- | tools/gf_methods.c | 7 | ||||
-rw-r--r-- | tools/gf_poly.c | 6 | ||||
-rw-r--r-- | tools/gf_time.c | 15 | ||||
-rwxr-xr-x | tools/run-tests.sh | 9 |
34 files changed, 1804 insertions, 753 deletions
diff --git a/Manual.pdf b/Manual.pdf Binary files differdeleted file mode 100644 index 59968bb..0000000 --- a/Manual.pdf +++ /dev/null @@ -8,8 +8,10 @@ Authors: James S. Plank (University of Tennessee) Adam W. Disney (University of Tennessee, Allen C. McBride (University of Tennessee) -The user's manual is in the file Manual.pdf. You may also get a copy of that -manual at http://www.cs.utk.edu/~plank/plank/papers/GF-Complete-Manual-1.02.pdf. +The programmer's manual and tutorial is provided in two places: + +1.) A copy is hosted on BitBucket at https://bitbucket.org/jimplank/gf-complete/downloads/GF-Complete-Manual.pdf +2.) A copy is also available at http://www.cs.utk.edu/~plank/plank/papers/GF-Complete-Manual-1.02.pdf The online home for GF-Complete is: @@ -25,3 +27,7 @@ To compile, do: ./configure make sudo make install + +To run the tests, do: + + make check @@ -0,0 +1,347 @@ +#! /bin/sh +# Wrapper for compilers which do not understand '-c -o'. + +scriptversion=2012-10-14.11; # UTC + +# Copyright (C) 1999-2013 Free Software Foundation, Inc. +# Written by Tom Tromey <tromey@cygnus.com>. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +# As a special exception to the GNU General Public License, if you +# distribute this file as part of a program that contains a +# configuration script generated by Autoconf, you may include it under +# the same distribution terms that you use for the rest of that program. + +# This file is maintained in Automake, please report +# bugs to <bug-automake@gnu.org> or send patches to +# <automake-patches@gnu.org>. + +nl=' +' + +# We need space, tab and new line, in precisely that order. Quoting is +# there to prevent tools from complaining about whitespace usage. +IFS=" "" $nl" + +file_conv= + +# func_file_conv build_file lazy +# Convert a $build file to $host form and store it in $file +# Currently only supports Windows hosts. If the determined conversion +# type is listed in (the comma separated) LAZY, no conversion will +# take place. +func_file_conv () +{ + file=$1 + case $file in + / | /[!/]*) # absolute file, and not a UNC file + if test -z "$file_conv"; then + # lazily determine how to convert abs files + case `uname -s` in + MINGW*) + file_conv=mingw + ;; + CYGWIN*) + file_conv=cygwin + ;; + *) + file_conv=wine + ;; + esac + fi + case $file_conv/,$2, in + *,$file_conv,*) + ;; + mingw/*) + file=`cmd //C echo "$file " | sed -e 's/"\(.*\) " *$/\1/'` + ;; + cygwin/*) + file=`cygpath -m "$file" || echo "$file"` + ;; + wine/*) + file=`winepath -w "$file" || echo "$file"` + ;; + esac + ;; + esac +} + +# func_cl_dashL linkdir +# Make cl look for libraries in LINKDIR +func_cl_dashL () +{ + func_file_conv "$1" + if test -z "$lib_path"; then + lib_path=$file + else + lib_path="$lib_path;$file" + fi + linker_opts="$linker_opts -LIBPATH:$file" +} + +# func_cl_dashl library +# Do a library search-path lookup for cl +func_cl_dashl () +{ + lib=$1 + found=no + save_IFS=$IFS + IFS=';' + for dir in $lib_path $LIB + do + IFS=$save_IFS + if $shared && test -f "$dir/$lib.dll.lib"; then + found=yes + lib=$dir/$lib.dll.lib + break + fi + if test -f "$dir/$lib.lib"; then + found=yes + lib=$dir/$lib.lib + break + fi + if test -f "$dir/lib$lib.a"; then + found=yes + lib=$dir/lib$lib.a + break + fi + done + IFS=$save_IFS + + if test "$found" != yes; then + lib=$lib.lib + fi +} + +# func_cl_wrapper cl arg... +# Adjust compile command to suit cl +func_cl_wrapper () +{ + # Assume a capable shell + lib_path= + shared=: + linker_opts= + for arg + do + if test -n "$eat"; then + eat= + else + case $1 in + -o) + # configure might choose to run compile as 'compile cc -o foo foo.c'. + eat=1 + case $2 in + *.o | *.[oO][bB][jJ]) + func_file_conv "$2" + set x "$@" -Fo"$file" + shift + ;; + *) + func_file_conv "$2" + set x "$@" -Fe"$file" + shift + ;; + esac + ;; + -I) + eat=1 + func_file_conv "$2" mingw + set x "$@" -I"$file" + shift + ;; + -I*) + func_file_conv "${1#-I}" mingw + set x "$@" -I"$file" + shift + ;; + -l) + eat=1 + func_cl_dashl "$2" + set x "$@" "$lib" + shift + ;; + -l*) + func_cl_dashl "${1#-l}" + set x "$@" "$lib" + shift + ;; + -L) + eat=1 + func_cl_dashL "$2" + ;; + -L*) + func_cl_dashL "${1#-L}" + ;; + -static) + shared=false + ;; + -Wl,*) + arg=${1#-Wl,} + save_ifs="$IFS"; IFS=',' + for flag in $arg; do + IFS="$save_ifs" + linker_opts="$linker_opts $flag" + done + IFS="$save_ifs" + ;; + -Xlinker) + eat=1 + linker_opts="$linker_opts $2" + ;; + -*) + set x "$@" "$1" + shift + ;; + *.cc | *.CC | *.cxx | *.CXX | *.[cC]++) + func_file_conv "$1" + set x "$@" -Tp"$file" + shift + ;; + *.c | *.cpp | *.CPP | *.lib | *.LIB | *.Lib | *.OBJ | *.obj | *.[oO]) + func_file_conv "$1" mingw + set x "$@" "$file" + shift + ;; + *) + set x "$@" "$1" + shift + ;; + esac + fi + shift + done + if test -n "$linker_opts"; then + linker_opts="-link$linker_opts" + fi + exec "$@" $linker_opts + exit 1 +} + +eat= + +case $1 in + '') + echo "$0: No command. Try '$0 --help' for more information." 1>&2 + exit 1; + ;; + -h | --h*) + cat <<\EOF +Usage: compile [--help] [--version] PROGRAM [ARGS] + +Wrapper for compilers which do not understand '-c -o'. +Remove '-o dest.o' from ARGS, run PROGRAM with the remaining +arguments, and rename the output as expected. + +If you are trying to build a whole package this is not the +right script to run: please start by reading the file 'INSTALL'. + +Report bugs to <bug-automake@gnu.org>. +EOF + exit $? + ;; + -v | --v*) + echo "compile $scriptversion" + exit $? + ;; + cl | *[/\\]cl | cl.exe | *[/\\]cl.exe ) + func_cl_wrapper "$@" # Doesn't return... + ;; +esac + +ofile= +cfile= + +for arg +do + if test -n "$eat"; then + eat= + else + case $1 in + -o) + # configure might choose to run compile as 'compile cc -o foo foo.c'. + # So we strip '-o arg' only if arg is an object. + eat=1 + case $2 in + *.o | *.obj) + ofile=$2 + ;; + *) + set x "$@" -o "$2" + shift + ;; + esac + ;; + *.c) + cfile=$1 + set x "$@" "$1" + shift + ;; + *) + set x "$@" "$1" + shift + ;; + esac + fi + shift +done + +if test -z "$ofile" || test -z "$cfile"; then + # If no '-o' option was seen then we might have been invoked from a + # pattern rule where we don't need one. That is ok -- this is a + # normal compilation that the losing compiler can handle. If no + # '.c' file was seen then we are probably linking. That is also + # ok. + exec "$@" +fi + +# Name of file we expect compiler to create. +cofile=`echo "$cfile" | sed 's|^.*[\\/]||; s|^[a-zA-Z]:||; s/\.c$/.o/'` + +# Create the lock directory. +# Note: use '[/\\:.-]' here to ensure that we don't use the same name +# that we are using for the .o file. Also, base the name on the expected +# object file name, since that is what matters with a parallel build. +lockdir=`echo "$cofile" | sed -e 's|[/\\:.-]|_|g'`.d +while true; do + if mkdir "$lockdir" >/dev/null 2>&1; then + break + fi + sleep 1 +done +# FIXME: race condition here if user kills between mkdir and trap. +trap "rmdir '$lockdir'; exit 1" 1 2 15 + +# Run the compile. +"$@" +ret=$? + +if test -f "$cofile"; then + test "$cofile" = "$ofile" || mv "$cofile" "$ofile" +elif test -f "${cofile}bj"; then + test "${cofile}bj" = "$ofile" || mv "${cofile}bj" "$ofile" +fi + +rmdir "$lockdir" +exit $ret + +# Local Variables: +# mode: shell-script +# sh-indentation: 2 +# eval: (add-hook 'write-file-hooks 'time-stamp) +# time-stamp-start: "scriptversion=" +# time-stamp-format: "%:y-%02m-%02d.%02H" +# time-stamp-time-zone: "UTC" +# time-stamp-end: "; # UTC" +# End: diff --git a/configure.ac b/configure.ac index 058e71f..9f33852 100644 --- a/configure.ac +++ b/configure.ac @@ -17,13 +17,20 @@ AC_CONFIG_MACRO_DIR([m4]) AM_MAINTAINER_MODE([disable]) # Override default CFLAGS -CFLAGS="-O3 -g" -CXXFLAGS="-O3 -g" +CFLAGS="-Wall -Wpointer-arith -O3 -g" dnl Compiling with per-target flags requires AM_PROG_CC_C_O. AC_PROG_CC AX_EXT() +AC_ARG_ENABLE([sse], + AS_HELP_STRING([--disable-sse], [Build without SSE optimizations]), + [if test "x$enableval" = "xno" ; then + SIMD_FLAGS="" + echo "DISABLED SSE!!!" + fi] +) + AC_CONFIG_FILES([Makefile src/Makefile tools/Makefile test/Makefile examples/Makefile]) AC_OUTPUT @@ -0,0 +1,791 @@ +#! /bin/sh +# depcomp - compile a program generating dependencies as side-effects + +scriptversion=2013-05-30.07; # UTC + +# Copyright (C) 1999-2013 Free Software Foundation, Inc. + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +# As a special exception to the GNU General Public License, if you +# distribute this file as part of a program that contains a +# configuration script generated by Autoconf, you may include it under +# the same distribution terms that you use for the rest of that program. + +# Originally written by Alexandre Oliva <oliva@dcc.unicamp.br>. + +case $1 in + '') + echo "$0: No command. Try '$0 --help' for more information." 1>&2 + exit 1; + ;; + -h | --h*) + cat <<\EOF +Usage: depcomp [--help] [--version] PROGRAM [ARGS] + +Run PROGRAMS ARGS to compile a file, generating dependencies +as side-effects. + +Environment variables: + depmode Dependency tracking mode. + source Source file read by 'PROGRAMS ARGS'. + object Object file output by 'PROGRAMS ARGS'. + DEPDIR directory where to store dependencies. + depfile Dependency file to output. + tmpdepfile Temporary file to use when outputting dependencies. + libtool Whether libtool is used (yes/no). + +Report bugs to <bug-automake@gnu.org>. +EOF + exit $? + ;; + -v | --v*) + echo "depcomp $scriptversion" + exit $? + ;; +esac + +# Get the directory component of the given path, and save it in the +# global variables '$dir'. Note that this directory component will +# be either empty or ending with a '/' character. This is deliberate. +set_dir_from () +{ + case $1 in + */*) dir=`echo "$1" | sed -e 's|/[^/]*$|/|'`;; + *) dir=;; + esac +} + +# Get the suffix-stripped basename of the given path, and save it the +# global variable '$base'. +set_base_from () +{ + base=`echo "$1" | sed -e 's|^.*/||' -e 's/\.[^.]*$//'` +} + +# If no dependency file was actually created by the compiler invocation, +# we still have to create a dummy depfile, to avoid errors with the +# Makefile "include basename.Plo" scheme. +make_dummy_depfile () +{ + echo "#dummy" > "$depfile" +} + +# Factor out some common post-processing of the generated depfile. +# Requires the auxiliary global variable '$tmpdepfile' to be set. +aix_post_process_depfile () +{ + # If the compiler actually managed to produce a dependency file, + # post-process it. + if test -f "$tmpdepfile"; then + # Each line is of the form 'foo.o: dependency.h'. + # Do two passes, one to just change these to + # $object: dependency.h + # and one to simply output + # dependency.h: + # which is needed to avoid the deleted-header problem. + { sed -e "s,^.*\.[$lower]*:,$object:," < "$tmpdepfile" + sed -e "s,^.*\.[$lower]*:[$tab ]*,," -e 's,$,:,' < "$tmpdepfile" + } > "$depfile" + rm -f "$tmpdepfile" + else + make_dummy_depfile + fi +} + +# A tabulation character. +tab=' ' +# A newline character. +nl=' +' +# Character ranges might be problematic outside the C locale. +# These definitions help. +upper=ABCDEFGHIJKLMNOPQRSTUVWXYZ +lower=abcdefghijklmnopqrstuvwxyz +digits=0123456789 +alpha=${upper}${lower} + +if test -z "$depmode" || test -z "$source" || test -z "$object"; then + echo "depcomp: Variables source, object and depmode must be set" 1>&2 + exit 1 +fi + +# Dependencies for sub/bar.o or sub/bar.obj go into sub/.deps/bar.Po. +depfile=${depfile-`echo "$object" | + sed 's|[^\\/]*$|'${DEPDIR-.deps}'/&|;s|\.\([^.]*\)$|.P\1|;s|Pobj$|Po|'`} +tmpdepfile=${tmpdepfile-`echo "$depfile" | sed 's/\.\([^.]*\)$/.T\1/'`} + +rm -f "$tmpdepfile" + +# Avoid interferences from the environment. +gccflag= dashmflag= + +# Some modes work just like other modes, but use different flags. We +# parameterize here, but still list the modes in the big case below, +# to make depend.m4 easier to write. Note that we *cannot* use a case +# here, because this file can only contain one case statement. +if test "$depmode" = hp; then + # HP compiler uses -M and no extra arg. + gccflag=-M + depmode=gcc +fi + +if test "$depmode" = dashXmstdout; then + # This is just like dashmstdout with a different argument. + dashmflag=-xM + depmode=dashmstdout +fi + +cygpath_u="cygpath -u -f -" +if test "$depmode" = msvcmsys; then + # This is just like msvisualcpp but w/o cygpath translation. + # Just convert the backslash-escaped backslashes to single forward + # slashes to satisfy depend.m4 + cygpath_u='sed s,\\\\,/,g' + depmode=msvisualcpp +fi + +if test "$depmode" = msvc7msys; then + # This is just like msvc7 but w/o cygpath translation. + # Just convert the backslash-escaped backslashes to single forward + # slashes to satisfy depend.m4 + cygpath_u='sed s,\\\\,/,g' + depmode=msvc7 +fi + +if test "$depmode" = xlc; then + # IBM C/C++ Compilers xlc/xlC can output gcc-like dependency information. + gccflag=-qmakedep=gcc,-MF + depmode=gcc +fi + +case "$depmode" in +gcc3) +## gcc 3 implements dependency tracking that does exactly what +## we want. Yay! Note: for some reason libtool 1.4 doesn't like +## it if -MD -MP comes after the -MF stuff. Hmm. +## Unfortunately, FreeBSD c89 acceptance of flags depends upon +## the command line argument order; so add the flags where they +## appear in depend2.am. Note that the slowdown incurred here +## affects only configure: in makefiles, %FASTDEP% shortcuts this. + for arg + do + case $arg in + -c) set fnord "$@" -MT "$object" -MD -MP -MF "$tmpdepfile" "$arg" ;; + *) set fnord "$@" "$arg" ;; + esac + shift # fnord + shift # $arg + done + "$@" + stat=$? + if test $stat -ne 0; then + rm -f "$tmpdepfile" + exit $stat + fi + mv "$tmpdepfile" "$depfile" + ;; + +gcc) +## Note that this doesn't just cater to obsosete pre-3.x GCC compilers. +## but also to in-use compilers like IMB xlc/xlC and the HP C compiler. +## (see the conditional assignment to $gccflag above). +## There are various ways to get dependency output from gcc. Here's +## why we pick this rather obscure method: +## - Don't want to use -MD because we'd like the dependencies to end +## up in a subdir. Having to rename by hand is ugly. +## (We might end up doing this anyway to support other compilers.) +## - The DEPENDENCIES_OUTPUT environment variable makes gcc act like +## -MM, not -M (despite what the docs say). Also, it might not be +## supported by the other compilers which use the 'gcc' depmode. +## - Using -M directly means running the compiler twice (even worse +## than renaming). + if test -z "$gccflag"; then + gccflag=-MD, + fi + "$@" -Wp,"$gccflag$tmpdepfile" + stat=$? + if test $stat -ne 0; then + rm -f "$tmpdepfile" + exit $stat + fi + rm -f "$depfile" + echo "$object : \\" > "$depfile" + # The second -e expression handles DOS-style file names with drive + # letters. + sed -e 's/^[^:]*: / /' \ + -e 's/^['$alpha']:\/[^:]*: / /' < "$tmpdepfile" >> "$depfile" +## This next piece of magic avoids the "deleted header file" problem. +## The problem is that when a header file which appears in a .P file +## is deleted, the dependency causes make to die (because there is +## typically no way to rebuild the header). We avoid this by adding +## dummy dependencies for each header file. Too bad gcc doesn't do +## this for us directly. +## Some versions of gcc put a space before the ':'. On the theory +## that the space means something, we add a space to the output as +## well. hp depmode also adds that space, but also prefixes the VPATH +## to the object. Take care to not repeat it in the output. +## Some versions of the HPUX 10.20 sed can't process this invocation +## correctly. Breaking it into two sed invocations is a workaround. + tr ' ' "$nl" < "$tmpdepfile" \ + | sed -e 's/^\\$//' -e '/^$/d' -e "s|.*$object$||" -e '/:$/d' \ + | sed -e 's/$/ :/' >> "$depfile" + rm -f "$tmpdepfile" + ;; + +hp) + # This case exists only to let depend.m4 do its work. It works by + # looking at the text of this script. This case will never be run, + # since it is checked for above. + exit 1 + ;; + +sgi) + if test "$libtool" = yes; then + "$@" "-Wp,-MDupdate,$tmpdepfile" + else + "$@" -MDupdate "$tmpdepfile" + fi + stat=$? + if test $stat -ne 0; then + rm -f "$tmpdepfile" + exit $stat + fi + rm -f "$depfile" + + if test -f "$tmpdepfile"; then # yes, the sourcefile depend on other files + echo "$object : \\" > "$depfile" + # Clip off the initial element (the dependent). Don't try to be + # clever and replace this with sed code, as IRIX sed won't handle + # lines with more than a fixed number of characters (4096 in + # IRIX 6.2 sed, 8192 in IRIX 6.5). We also remove comment lines; + # the IRIX cc adds comments like '#:fec' to the end of the + # dependency line. + tr ' ' "$nl" < "$tmpdepfile" \ + | sed -e 's/^.*\.o://' -e 's/#.*$//' -e '/^$/ d' \ + | tr "$nl" ' ' >> "$depfile" + echo >> "$depfile" + # The second pass generates a dummy entry for each header file. + tr ' ' "$nl" < "$tmpdepfile" \ + | sed -e 's/^.*\.o://' -e 's/#.*$//' -e '/^$/ d' -e 's/$/:/' \ + >> "$depfile" + else + make_dummy_depfile + fi + rm -f "$tmpdepfile" + ;; + +xlc) + # This case exists only to let depend.m4 do its work. It works by + # looking at the text of this script. This case will never be run, + # since it is checked for above. + exit 1 + ;; + +aix) + # The C for AIX Compiler uses -M and outputs the dependencies + # in a .u file. In older versions, this file always lives in the + # current directory. Also, the AIX compiler puts '$object:' at the + # start of each line; $object doesn't have directory information. + # Version 6 uses the directory in both cases. + set_dir_from "$object" + set_base_from "$object" + if test "$libtool" = yes; then + tmpdepfile1=$dir$base.u + tmpdepfile2=$base.u + tmpdepfile3=$dir.libs/$base.u + "$@" -Wc,-M + else + tmpdepfile1=$dir$base.u + tmpdepfile2=$dir$base.u + tmpdepfile3=$dir$base.u + "$@" -M + fi + stat=$? + if test $stat -ne 0; then + rm -f "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" + exit $stat + fi + + for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" + do + test -f "$tmpdepfile" && break + done + aix_post_process_depfile + ;; + +tcc) + # tcc (Tiny C Compiler) understand '-MD -MF file' since version 0.9.26 + # FIXME: That version still under development at the moment of writing. + # Make that this statement remains true also for stable, released + # versions. + # It will wrap lines (doesn't matter whether long or short) with a + # trailing '\', as in: + # + # foo.o : \ + # foo.c \ + # foo.h \ + # + # It will put a trailing '\' even on the last line, and will use leading + # spaces rather than leading tabs (at least since its commit 0394caf7 + # "Emit spaces for -MD"). + "$@" -MD -MF "$tmpdepfile" + stat=$? + if test $stat -ne 0; then + rm -f "$tmpdepfile" + exit $stat + fi + rm -f "$depfile" + # Each non-empty line is of the form 'foo.o : \' or ' dep.h \'. + # We have to change lines of the first kind to '$object: \'. + sed -e "s|.*:|$object :|" < "$tmpdepfile" > "$depfile" + # And for each line of the second kind, we have to emit a 'dep.h:' + # dummy dependency, to avoid the deleted-header problem. + sed -n -e 's|^ *\(.*\) *\\$|\1:|p' < "$tmpdepfile" >> "$depfile" + rm -f "$tmpdepfile" + ;; + +## The order of this option in the case statement is important, since the +## shell code in configure will try each of these formats in the order +## listed in this file. A plain '-MD' option would be understood by many +## compilers, so we must ensure this comes after the gcc and icc options. +pgcc) + # Portland's C compiler understands '-MD'. + # Will always output deps to 'file.d' where file is the root name of the + # source file under compilation, even if file resides in a subdirectory. + # The object file name does not affect the name of the '.d' file. + # pgcc 10.2 will output + # foo.o: sub/foo.c sub/foo.h + # and will wrap long lines using '\' : + # foo.o: sub/foo.c ... \ + # sub/foo.h ... \ + # ... + set_dir_from "$object" + # Use the source, not the object, to determine the base name, since + # that's sadly what pgcc will do too. + set_base_from "$source" + tmpdepfile=$base.d + + # For projects that build the same source file twice into different object + # files, the pgcc approach of using the *source* file root name can cause + # problems in parallel builds. Use a locking strategy to avoid stomping on + # the same $tmpdepfile. + lockdir=$base.d-lock + trap " + echo '$0: caught signal, cleaning up...' >&2 + rmdir '$lockdir' + exit 1 + " 1 2 13 15 + numtries=100 + i=$numtries + while test $i -gt 0; do + # mkdir is a portable test-and-set. + if mkdir "$lockdir" 2>/dev/null; then + # This process acquired the lock. + "$@" -MD + stat=$? + # Release the lock. + rmdir "$lockdir" + break + else + # If the lock is being held by a different process, wait + # until the winning process is done or we timeout. + while test -d "$lockdir" && test $i -gt 0; do + sleep 1 + i=`expr $i - 1` + done + fi + i=`expr $i - 1` + done + trap - 1 2 13 15 + if test $i -le 0; then + echo "$0: failed to acquire lock after $numtries attempts" >&2 + echo "$0: check lockdir '$lockdir'" >&2 + exit 1 + fi + + if test $stat -ne 0; then + rm -f "$tmpdepfile" + exit $stat + fi + rm -f "$depfile" + # Each line is of the form `foo.o: dependent.h', + # or `foo.o: dep1.h dep2.h \', or ` dep3.h dep4.h \'. + # Do two passes, one to just change these to + # `$object: dependent.h' and one to simply `dependent.h:'. + sed "s,^[^:]*:,$object :," < "$tmpdepfile" > "$depfile" + # Some versions of the HPUX 10.20 sed can't process this invocation + # correctly. Breaking it into two sed invocations is a workaround. + sed 's,^[^:]*: \(.*\)$,\1,;s/^\\$//;/^$/d;/:$/d' < "$tmpdepfile" \ + | sed -e 's/$/ :/' >> "$depfile" + rm -f "$tmpdepfile" + ;; + +hp2) + # The "hp" stanza above does not work with aCC (C++) and HP's ia64 + # compilers, which have integrated preprocessors. The correct option + # to use with these is +Maked; it writes dependencies to a file named + # 'foo.d', which lands next to the object file, wherever that + # happens to be. + # Much of this is similar to the tru64 case; see comments there. + set_dir_from "$object" + set_base_from "$object" + if test "$libtool" = yes; then + tmpdepfile1=$dir$base.d + tmpdepfile2=$dir.libs/$base.d + "$@" -Wc,+Maked + else + tmpdepfile1=$dir$base.d + tmpdepfile2=$dir$base.d + "$@" +Maked + fi + stat=$? + if test $stat -ne 0; then + rm -f "$tmpdepfile1" "$tmpdepfile2" + exit $stat + fi + + for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2" + do + test -f "$tmpdepfile" && break + done + if test -f "$tmpdepfile"; then + sed -e "s,^.*\.[$lower]*:,$object:," "$tmpdepfile" > "$depfile" + # Add 'dependent.h:' lines. + sed -ne '2,${ + s/^ *// + s/ \\*$// + s/$/:/ + p + }' "$tmpdepfile" >> "$depfile" + else + make_dummy_depfile + fi + rm -f "$tmpdepfile" "$tmpdepfile2" + ;; + +tru64) + # The Tru64 compiler uses -MD to generate dependencies as a side + # effect. 'cc -MD -o foo.o ...' puts the dependencies into 'foo.o.d'. + # At least on Alpha/Redhat 6.1, Compaq CCC V6.2-504 seems to put + # dependencies in 'foo.d' instead, so we check for that too. + # Subdirectories are respected. + set_dir_from "$object" + set_base_from "$object" + + if test "$libtool" = yes; then + # Libtool generates 2 separate objects for the 2 libraries. These + # two compilations output dependencies in $dir.libs/$base.o.d and + # in $dir$base.o.d. We have to check for both files, because + # one of the two compilations can be disabled. We should prefer + # $dir$base.o.d over $dir.libs/$base.o.d because the latter is + # automatically cleaned when .libs/ is deleted, while ignoring + # the former would cause a distcleancheck panic. + tmpdepfile1=$dir$base.o.d # libtool 1.5 + tmpdepfile2=$dir.libs/$base.o.d # Likewise. + tmpdepfile3=$dir.libs/$base.d # Compaq CCC V6.2-504 + "$@" -Wc,-MD + else + tmpdepfile1=$dir$base.d + tmpdepfile2=$dir$base.d + tmpdepfile3=$dir$base.d + "$@" -MD + fi + + stat=$? + if test $stat -ne 0; then + rm -f "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" + exit $stat + fi + + for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" + do + test -f "$tmpdepfile" && break + done + # Same post-processing that is required for AIX mode. + aix_post_process_depfile + ;; + +msvc7) + if test "$libtool" = yes; then + showIncludes=-Wc,-showIncludes + else + showIncludes=-showIncludes + fi + "$@" $showIncludes > "$tmpdepfile" + stat=$? + grep -v '^Note: including file: ' "$tmpdepfile" + if test $stat -ne 0; then + rm -f "$tmpdepfile" + exit $stat + fi + rm -f "$depfile" + echo "$object : \\" > "$depfile" + # The first sed program below extracts the file names and escapes + # backslashes for cygpath. The second sed program outputs the file + # name when reading, but also accumulates all include files in the + # hold buffer in order to output them again at the end. This only + # works with sed implementations that can handle large buffers. + sed < "$tmpdepfile" -n ' +/^Note: including file: *\(.*\)/ { + s//\1/ + s/\\/\\\\/g + p +}' | $cygpath_u | sort -u | sed -n ' +s/ /\\ /g +s/\(.*\)/'"$tab"'\1 \\/p +s/.\(.*\) \\/\1:/ +H +$ { + s/.*/'"$tab"'/ + G + p +}' >> "$depfile" + echo >> "$depfile" # make sure the fragment doesn't end with a backslash + rm -f "$tmpdepfile" + ;; + +msvc7msys) + # This case exists only to let depend.m4 do its work. It works by + # looking at the text of this script. This case will never be run, + # since it is checked for above. + exit 1 + ;; + +#nosideeffect) + # This comment above is used by automake to tell side-effect + # dependency tracking mechanisms from slower ones. + +dashmstdout) + # Important note: in order to support this mode, a compiler *must* + # always write the preprocessed file to stdout, regardless of -o. + "$@" || exit $? + + # Remove the call to Libtool. + if test "$libtool" = yes; then + while test "X$1" != 'X--mode=compile'; do + shift + done + shift + fi + + # Remove '-o $object'. + IFS=" " + for arg + do + case $arg in + -o) + shift + ;; + $object) + shift + ;; + *) + set fnord "$@" "$arg" + shift # fnord + shift # $arg + ;; + esac + done + + test -z "$dashmflag" && dashmflag=-M + # Require at least two characters before searching for ':' + # in the target name. This is to cope with DOS-style filenames: + # a dependency such as 'c:/foo/bar' could be seen as target 'c' otherwise. + "$@" $dashmflag | + sed "s|^[$tab ]*[^:$tab ][^:][^:]*:[$tab ]*|$object: |" > "$tmpdepfile" + rm -f "$depfile" + cat < "$tmpdepfile" > "$depfile" + # Some versions of the HPUX 10.20 sed can't process this sed invocation + # correctly. Breaking it into two sed invocations is a workaround. + tr ' ' "$nl" < "$tmpdepfile" \ + | sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' \ + | sed -e 's/$/ :/' >> "$depfile" + rm -f "$tmpdepfile" + ;; + +dashXmstdout) + # This case only exists to satisfy depend.m4. It is never actually + # run, as this mode is specially recognized in the preamble. + exit 1 + ;; + +makedepend) + "$@" || exit $? + # Remove any Libtool call + if test "$libtool" = yes; then + while test "X$1" != 'X--mode=compile'; do + shift + done + shift + fi + # X makedepend + shift + cleared=no eat=no + for arg + do + case $cleared in + no) + set ""; shift + cleared=yes ;; + esac + if test $eat = yes; then + eat=no + continue + fi + case "$arg" in + -D*|-I*) + set fnord "$@" "$arg"; shift ;; + # Strip any option that makedepend may not understand. Remove + # the object too, otherwise makedepend will parse it as a source file. + -arch) + eat=yes ;; + -*|$object) + ;; + *) + set fnord "$@" "$arg"; shift ;; + esac + done + obj_suffix=`echo "$object" | sed 's/^.*\././'` + touch "$tmpdepfile" + ${MAKEDEPEND-makedepend} -o"$obj_suffix" -f"$tmpdepfile" "$@" + rm -f "$depfile" + # makedepend may prepend the VPATH from the source file name to the object. + # No need to regex-escape $object, excess matching of '.' is harmless. + sed "s|^.*\($object *:\)|\1|" "$tmpdepfile" > "$depfile" + # Some versions of the HPUX 10.20 sed can't process the last invocation + # correctly. Breaking it into two sed invocations is a workaround. + sed '1,2d' "$tmpdepfile" \ + | tr ' ' "$nl" \ + | sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' \ + | sed -e 's/$/ :/' >> "$depfile" + rm -f "$tmpdepfile" "$tmpdepfile".bak + ;; + +cpp) + # Important note: in order to support this mode, a compiler *must* + # always write the preprocessed file to stdout. + "$@" || exit $? + + # Remove the call to Libtool. + if test "$libtool" = yes; then + while test "X$1" != 'X--mode=compile'; do + shift + done + shift + fi + + # Remove '-o $object'. + IFS=" " + for arg + do + case $arg in + -o) + shift + ;; + $object) + shift + ;; + *) + set fnord "$@" "$arg" + shift # fnord + shift # $arg + ;; + esac + done + + "$@" -E \ + | sed -n -e '/^# [0-9][0-9]* "\([^"]*\)".*/ s:: \1 \\:p' \ + -e '/^#line [0-9][0-9]* "\([^"]*\)".*/ s:: \1 \\:p' \ + | sed '$ s: \\$::' > "$tmpdepfile" + rm -f "$depfile" + echo "$object : \\" > "$depfile" + cat < "$tmpdepfile" >> "$depfile" + sed < "$tmpdepfile" '/^$/d;s/^ //;s/ \\$//;s/$/ :/' >> "$depfile" + rm -f "$tmpdepfile" + ;; + +msvisualcpp) + # Important note: in order to support this mode, a compiler *must* + # always write the preprocessed file to stdout. + "$@" || exit $? + + # Remove the call to Libtool. + if test "$libtool" = yes; then + while test "X$1" != 'X--mode=compile'; do + shift + done + shift + fi + + IFS=" " + for arg + do + case "$arg" in + -o) + shift + ;; + $object) + shift + ;; + "-Gm"|"/Gm"|"-Gi"|"/Gi"|"-ZI"|"/ZI") + set fnord "$@" + shift + shift + ;; + *) + set fnord "$@" "$arg" + shift + shift + ;; + esac + done + "$@" -E 2>/dev/null | + sed -n '/^#line [0-9][0-9]* "\([^"]*\)"/ s::\1:p' | $cygpath_u | sort -u > "$tmpdepfile" + rm -f "$depfile" + echo "$object : \\" > "$depfile" + sed < "$tmpdepfile" -n -e 's% %\\ %g' -e '/^\(.*\)$/ s::'"$tab"'\1 \\:p' >> "$depfile" + echo "$tab" >> "$depfile" + sed < "$tmpdepfile" -n -e 's% %\\ %g' -e '/^\(.*\)$/ s::\1\::p' >> "$depfile" + rm -f "$tmpdepfile" + ;; + +msvcmsys) + # This case exists only to let depend.m4 do its work. It works by + # looking at the text of this script. This case will never be run, + # since it is checked for above. + exit 1 + ;; + +none) + exec "$@" + ;; + +*) + echo "Unknown depmode $depmode" 1>&2 + exit 1 + ;; +esac + +exit 0 + +# Local Variables: +# mode: shell-script +# sh-indentation: 2 +# eval: (add-hook 'write-file-hooks 'time-stamp) +# time-stamp-start: "scriptversion=" +# time-stamp-format: "%:y-%02m-%02d.%02H" +# time-stamp-time-zone: "UTC" +# time-stamp-end: "; # UTC" +# End: diff --git a/examples/gf_example_2.c b/examples/gf_example_2.c index e98774a..576d9a5 100644 --- a/examples/gf_example_2.c +++ b/examples/gf_example_2.c @@ -28,8 +28,8 @@ int main(int argc, char **argv) { uint32_t a, b, c; uint8_t *r1, *r2; - uint16_t *r16; - uint32_t *r32; + uint16_t *r16 = NULL; + uint32_t *r32 = NULL; int w, i; gf_t gf; diff --git a/examples/gf_example_5.c b/examples/gf_example_5.c index 8e7dd4e..da6e9ca 100644 --- a/examples/gf_example_5.c +++ b/examples/gf_example_5.c @@ -74,4 +74,5 @@ int main(int argc, char **argv) gf.extract_word.w32(&gf, a, 30*2, i+15), gf.extract_word.w32(&gf, b, 30*2, i+15)); } + return 0; } diff --git a/examples/gf_example_6.c b/examples/gf_example_6.c index 54cdf83..800a35f 100644 --- a/examples/gf_example_6.c +++ b/examples/gf_example_6.c @@ -80,4 +80,5 @@ int main(int argc, char **argv) gf.extract_word.w32(&gf, a, 30*4, i+15), gf.extract_word.w32(&gf, b, 30*4, i+15)); } + return 0; } diff --git a/examples/gf_example_7.c b/examples/gf_example_7.c index cd5c44b..ee07d53 100644 --- a/examples/gf_example_7.c +++ b/examples/gf_example_7.c @@ -71,4 +71,5 @@ int main(int argc, char **argv) gf.extract_word.w32(&gf, a, 3, i), gf.extract_word.w32(&gf, b, 3, i)); } + return 0; } diff --git a/include/gf_complete.h b/include/gf_complete.h index 57b439e..0469b77 100644 --- a/include/gf_complete.h +++ b/include/gf_complete.h @@ -33,17 +33,18 @@ Not all are implemented for all values of w. See the paper for an explanation of how they work. */ -typedef enum {GF_MULT_DEFAULT, - GF_MULT_SHIFT, - GF_MULT_CARRY_FREE, - GF_MULT_GROUP, +typedef enum {GF_MULT_DEFAULT, + GF_MULT_SHIFT, + GF_MULT_CARRY_FREE, + GF_MULT_CARRY_FREE_GK, //ADAM + GF_MULT_GROUP, GF_MULT_BYTWO_p, GF_MULT_BYTWO_b, - GF_MULT_TABLE, - GF_MULT_LOG_TABLE, + GF_MULT_TABLE, + GF_MULT_LOG_TABLE, GF_MULT_LOG_ZERO, GF_MULT_LOG_ZERO_EXT, - GF_MULT_SPLIT_TABLE, + GF_MULT_SPLIT_TABLE, GF_MULT_COMPOSITE } gf_mult_type_t; /* These are the different ways to optimize region diff --git a/include/gf_int.h b/include/gf_int.h index 9221569..98294cc 100644 --- a/include/gf_int.h +++ b/include/gf_int.h @@ -154,8 +154,8 @@ typedef enum {GF_E_MDEFDIV, /* Dev != Default && Mult == Default */ GF_E_SP128AR, /* Mult == SPLIT, w=128, Bad arg1/arg2 */ GF_E_SP128AL, /* Mult == SPLIT, w=128, SSE requires ALTMAP */ GF_E_SP128AS, /* Mult == SPLIT, w=128, ALTMAP requires SSE */ - GF_E_SP128_A, /* Mult == SPLIT, w=128, SSE only with 4/128 */ - GF_E_SP128_S, /* Mult == SPLIT, w=128, ALTMAP only with 4/128 */ + GF_E_SP128_A, /* Mult == SPLIT, w=128, ALTMAP only with 4/128 */ + GF_E_SP128_S, /* Mult == SPLIT, w=128, SSE only with 4/128 */ GF_E_SPLIT_W, /* Mult == SPLIT, Bad w (8, 16, 32, 64, 128) */ GF_E_SP_16AR, /* Mult == SPLIT, w=16, Bad arg1/arg2 */ GF_E_SP_16_A, /* Mult == SPLIT, w=16, ALTMAP only with 4/16 */ diff --git a/m4/ltoptions.m4 b/m4/ltoptions.m4 index 17cfd51..5d9acd8 100644 --- a/m4/ltoptions.m4 +++ b/m4/ltoptions.m4 @@ -326,9 +326,24 @@ dnl AC_DEFUN([AM_DISABLE_FAST_INSTALL], []) # MODE is either `yes' or `no'. If omitted, it defaults to `both'. m4_define([_LT_WITH_PIC], [AC_ARG_WITH([pic], - [AS_HELP_STRING([--with-pic], + [AS_HELP_STRING([--with-pic@<:@=PKGS@:>@], [try to use only PIC/non-PIC objects @<:@default=use both@:>@])], - [pic_mode="$withval"], + [lt_p=${PACKAGE-default} + case $withval in + yes|no) pic_mode=$withval ;; + *) + pic_mode=default + # Look at the argument we got. We use all the common list separators. + lt_save_ifs="$IFS"; IFS="${IFS}$PATH_SEPARATOR," + for lt_pkg in $withval; do + IFS="$lt_save_ifs" + if test "X$lt_pkg" = "X$lt_p"; then + pic_mode=yes + fi + done + IFS="$lt_save_ifs" + ;; + esac], [pic_mode=default]) test -z "$pic_mode" && pic_mode=m4_default([$1], [default]) @@ -85,8 +85,8 @@ void gf_error() case GF_E_SP128AR: s = "With -m SPLIT, w=128, bad arg1/arg2."; break; case GF_E_SP128AL: s = "With -m SPLIT, w=128, -r SSE requires -r ALTMAP."; break; case GF_E_SP128AS: s = "With -m SPLIT, w=128, ALTMAP needs SSSE3 supported."; break; - case GF_E_SP128_A: s = "With -m SPLIT, w=128, -r SSE|NOSSE only with arg1/arg2 = 4/128."; break; - case GF_E_SP128_S: s = "With -m SPLIT, w=128, -r ALTMAP only with arg1/arg2 = 4/128."; break; + case GF_E_SP128_A: s = "With -m SPLIT, w=128, -r ALTMAP only with arg1/arg2 = 4/128."; break; + case GF_E_SP128_S: s = "With -m SPLIT, w=128, -r SSE|NOSSE only with arg1/arg2 = 4/128."; break; case GF_E_SPLIT_W: s = "With -m SPLIT, w must be in {8, 16, 32, 64, 128}."; break; case GF_E_SP_16AR: s = "With -m SPLIT, w=16, Bad arg1/arg2."; break; case GF_E_SP_16_A: s = "With -m SPLIT, w=16, -r ALTMAP only with arg1/arg2 = 4/16."; break; @@ -179,13 +179,11 @@ uint64_t gf_composite_get_default_poly(gf_t *base) int gf_error_check(int w, int mult_type, int region_type, int divide_type, int arg1, int arg2, uint64_t poly, gf_t *base) { - int sse4 = 0; int sse3 = 0; int sse2 = 0; int pclmul = 0; int rdouble, rquad, rlazy, rsse, rnosse, raltmap, rcauchy, tmp; - uint64_t pp; - gf_internal_t *sub, *subsub, *subsubsub; + gf_internal_t *sub; rdouble = (region_type & GF_REGION_DOUBLE_TABLE); rquad = (region_type & GF_REGION_QUAD_TABLE); @@ -214,10 +212,6 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type, sse3 = 1; #endif -#ifdef INTEL_SSE4 - sse4 = 1; -#endif - #ifdef INTEL_SSE4_PCLMUL pclmul = 1; #endif @@ -292,6 +286,16 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type, return 1; } + //ADAM + if (mult_type == GF_MULT_CARRY_FREE_GK) { + if (w != 4 && w != 8 && w != 16 && + w != 32 && w != 64 && w != 128) { _gf_errno = GF_E_CFM___W; return 0; } + if (raltmap) { _gf_errno = GF_E_ALT_CFM; return 0; } + if (rsse || rnosse) { _gf_errno = GF_E_SSE_CFM; return 0; } + if (!pclmul) { _gf_errno = GF_E_PCLMULX; return 0; } + return 1; + } + if (mult_type == GF_MULT_BYTWO_p || mult_type == GF_MULT_BYTWO_b) { if (raltmap) { _gf_errno = GF_E_ALT_BY2; return 0; } if (rsse && !sse2) { _gf_errno = GF_E_BY2_SSE; return 0; } @@ -344,11 +348,12 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type, if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } if (raltmap) { _gf_errno = GF_E_SP_8__A; return 0; } } else if (w == 16) { - if (arg1 == 4 && arg2 == 16) { - if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } - } else if (arg1 == 8 && (arg2 == 16 || arg2 == 8)) { + if ((arg1 == 8 && arg2 == 8) || + (arg1 == 8 && arg2 == 16)) { if (rsse || rnosse) { _gf_errno = GF_E_SP_16_S; return 0; } if (raltmap) { _gf_errno = GF_E_SP_16_A; return 0; } + } else if (arg1 == 4 && arg2 == 16) { + if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } } else { _gf_errno = GF_E_SP_16AR; return 0; } } else if (w == 32) { if ((arg1 == 8 && arg2 == 8) || @@ -356,10 +361,8 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type, (arg1 == 16 && arg2 == 32)) { if (rsse || rnosse) { _gf_errno = GF_E_SP_32_S; return 0; } if (raltmap) { _gf_errno = GF_E_SP_32_A; return 0; } - } else if ((arg1 == 4 && arg2 == 32) || - (arg1 == 4 && arg2 == 32)) { + } else if (arg1 == 4 && arg2 == 32) { if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } - if (raltmap && arg1 != 4) { _gf_errno = GF_E_SP_32_A; return 0; } if (raltmap && !sse3) { _gf_errno = GF_E_SP_32AS; return 0; } if (raltmap && rnosse) { _gf_errno = GF_E_SP_32AS; return 0; } } else { _gf_errno = GF_E_SP_32AR; return 0; } @@ -488,7 +491,7 @@ int gf_init_hard(gf_t *gf, int w, int mult_type, h->arg2 = arg2; h->base_gf = base_gf; h->private = (void *) gf->scratch; - h->private += (sizeof(gf_internal_t)); + h->private = (uint8_t *)h->private + (sizeof(gf_internal_t)); gf->extract_word.w32 = NULL; switch(w) { @@ -525,7 +528,7 @@ void gf_alignment_error(char *s, int a) static void gf_invert_binary_matrix(uint32_t *mat, uint32_t *inv, int rows) { - int cols, i, j, k; + int cols, i, j; uint32_t tmp; cols = rows; @@ -594,7 +597,7 @@ uint32_t gf_bitmatrix_inverse(uint32_t y, int w, uint32_t pp) void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base) { uint64_t a, prod; - int j, xor; + int xor; uint64_t *s64, *d64, *top; s64 = rd->s_start; @@ -693,8 +696,8 @@ static void gf_slow_multiply_region(gf_region_data *rd, void *src, void *dest, v fprintf(stderr, "Error: gf_slow_multiply_region: w=%d not implemented.\n", h->w); exit(1); } - src += wb; - dest += wb; + src = (uint8_t *)src + wb; + dest = (uint8_t *)dest + wb; } } @@ -773,8 +776,7 @@ void gf_set_region_data(gf_region_data *rd, int xor, int align) { - uint8_t *s8, *d8; - gf_internal_t *h; + gf_internal_t *h = NULL; int wb; uint32_t a; unsigned long uls, uld; @@ -802,7 +804,7 @@ void gf_set_region_data(gf_region_data *rd, if (align == -1) { /* JSP: This is cauchy. Error check bytes, then set up the pointers so that there are no alignment regions. */ - if (bytes % h->w != 0) { + if (h != NULL && bytes % h->w != 0) { fprintf(stderr, "Error in region multiply operation.\n"); fprintf(stderr, "The size must be a multiple of %d bytes.\n", h->w); exit(1); @@ -810,8 +812,8 @@ void gf_set_region_data(gf_region_data *rd, rd->s_start = src; rd->d_start = dest; - rd->s_top = src + bytes; - rd->d_top = src + bytes; + rd->s_top = (uint8_t *)src + bytes; + rd->d_top = (uint8_t *)src + bytes; return; } @@ -840,12 +842,12 @@ void gf_set_region_data(gf_region_data *rd, uls %= a; if (uls != 0) uls = (a-uls); - rd->s_start = rd->src + uls; - rd->d_start = rd->dest + uls; + rd->s_start = (uint8_t *)rd->src + uls; + rd->d_start = (uint8_t *)rd->dest + uls; bytes -= uls; bytes -= (bytes % align); - rd->s_top = rd->s_start + bytes; - rd->d_top = rd->d_start + bytes; + rd->s_top = (uint8_t *)rd->s_start + bytes; + rd->d_top = (uint8_t *)rd->d_start + bytes; } @@ -856,7 +858,7 @@ void gf_do_initial_region_alignment(gf_region_data *rd) void gf_do_final_region_alignment(gf_region_data *rd) { - gf_slow_multiply_region(rd, rd->s_top, rd->d_top, rd->src+rd->bytes); + gf_slow_multiply_region(rd, rd->s_top, rd->d_top, (uint8_t *)rd->src+rd->bytes); } void gf_multby_zero(void *dest, int bytes, int xor) @@ -897,9 +899,8 @@ void gf_multby_one(void *src, void *dest, int bytes, int xor) __m128i ms, md; #endif unsigned long uls, uld; - uint8_t *s8, *d8, *dtop8; + uint8_t *s8, *d8; uint64_t *s64, *d64, *dtop64; - int abytes; gf_region_data rd; if (!xor) { @@ -910,6 +911,7 @@ void gf_multby_one(void *src, void *dest, int bytes, int xor) uld = (unsigned long) dest; #ifdef INTEL_SSE2 + int abytes; s8 = (uint8_t *) src; d8 = (uint8_t *) dest; if (uls % 16 == uld % 16) { @@ -1025,7 +1027,7 @@ static void gf_unaligned_xor(void *src, void *dest, int bytes) } d8 = (uint8_t *) d64; - while (d8 < (uint8_t *) (dest+bytes)) { + while (d8 < (uint8_t *) ((uint8_t *)dest+bytes)) { *d8 ^= *s8; d8++; s8++; diff --git a/src/gf_general.c b/src/gf_general.c index d9d1700..8fcc737 100644 --- a/src/gf_general.c +++ b/src/gf_general.c @@ -240,7 +240,7 @@ int gf_general_are_equal(gf_general_t *v1, gf_general_t *v2, int w) return (v1->w64 == v2->w64); } else { return (v1->w128[0] == v2->w128[0] && - v1->w128[0] == v2->w128[0]); + v1->w128[1] == v2->w128[1]); } } @@ -267,7 +267,6 @@ void gf_general_do_region_check(gf_t *gf, gf_general_t *a, void *orig_a, void *o int w, words, i; gf_general_t oa, ot, ft, sb; char sa[50], soa[50], sot[50], sft[50], ssb[50]; - uint8_t *p; h = (gf_internal_t *) gf->scratch; w = h->w; @@ -327,7 +326,7 @@ void gf_general_set_up_single_timing_test(int w, void *ra, void *rb, int size) uint64_t *r64; int i; - top = rb+size; + top = (uint8_t *)rb+size; /* If w is 8, 16, 32, 64 or 128, fill the regions with random bytes. However, don't allow for zeros in rb, because that will screw up @@ -366,7 +365,7 @@ void gf_general_set_up_single_timing_test(int w, void *ra, void *rb, int size) r64[1] = g.w128[1]; break; } - rb += (w/8); + rb = (uint8_t *)rb + (w/8); } } else if (w == 4) { r8a = (uint8_t *) ra; @@ -408,7 +407,7 @@ int gf_general_do_single_timing_test(gf_t *gf, void *ra, void *rb, int size, cha h = (gf_internal_t *) gf->scratch; w = h->w; - top = ra + size; + top = (uint8_t *)ra + size; if (w == 8 || w == 4) { r8a = (uint8_t *) ra; diff --git a/src/gf_general.h b/src/gf_general.h deleted file mode 100644 index 9a5de52..0000000 --- a/src/gf_general.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic - * James S. Plank, Ethan L. Miller, Kevin M. Greenan, - * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. - * - * gf_general.h - * - * This file has helper routines for doing basic GF operations with any - * legal value of w. The problem is that w <= 32, w=64 and w=128 all have - * different data types, which is a pain. The procedures in this file try - * to alleviate that pain. They are used in gf_unit and gf_time. - */ - -#pragma once - -#include <stdio.h> -#include <getopt.h> -#include <stdint.h> -#include <string.h> -#include <stdlib.h> -#include <time.h> - -#include "gf_complete.h" - -typedef union { - uint32_t w32; - uint64_t w64; - uint64_t w128[2]; -} gf_general_t; - -void gf_general_set_zero(gf_general_t *v, int w); -void gf_general_set_one(gf_general_t *v, int w); -void gf_general_set_two(gf_general_t *v, int w); - -int gf_general_is_zero(gf_general_t *v, int w); -int gf_general_is_one(gf_general_t *v, int w); -int gf_general_are_equal(gf_general_t *v1, gf_general_t *v2, int w); - -void gf_general_val_to_s(gf_general_t *v, int w, char *s, int hex); -int gf_general_s_to_val(gf_general_t *v, int w, char *s, int hex); - -void gf_general_set_random(gf_general_t *v, int w, int zero_ok); - -void gf_general_add(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c); -void gf_general_multiply(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c); -void gf_general_divide(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c); -void gf_general_inverse(gf_t *gf, gf_general_t *a, gf_general_t *b); - -void gf_general_do_region_multiply(gf_t *gf, gf_general_t *a, - void *ra, void *rb, - int bytes, int xor); - -void gf_general_do_region_check(gf_t *gf, gf_general_t *a, - void *orig_a, void *orig_target, void *final_target, - int bytes, int xor); - - -/* Which is M, D or I for multiply, divide or inverse. */ - -void gf_general_set_up_single_timing_test(int w, void *ra, void *rb, int size); -int gf_general_do_single_timing_test(gf_t *gf, void *ra, void *rb, int size, char which); diff --git a/src/gf_int.h b/src/gf_int.h deleted file mode 100644 index 9221569..0000000 --- a/src/gf_int.h +++ /dev/null @@ -1,200 +0,0 @@ -/* - * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic - * James S. Plank, Ethan L. Miller, Kevin M. Greenan, - * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. - * - * gf_int.h - * - * Internal code for Galois field routines. This is not meant for - * users to include, but for the internal GF files to use. - */ - -#pragma once - -#include "gf_complete.h" - -#include <string.h> - -extern void timer_start (double *t); -extern double timer_split (const double *t); -extern void galois_fill_random (void *buf, int len, unsigned int seed); - -typedef struct { - int mult_type; - int region_type; - int divide_type; - int w; - uint64_t prim_poly; - int free_me; - int arg1; - int arg2; - gf_t *base_gf; - void *private; -} gf_internal_t; - -extern int gf_w4_init (gf_t *gf); -extern int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2); - -extern int gf_w8_init (gf_t *gf); -extern int gf_w8_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2); - -extern int gf_w16_init (gf_t *gf); -extern int gf_w16_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2); - -extern int gf_w32_init (gf_t *gf); -extern int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2); - -extern int gf_w64_init (gf_t *gf); -extern int gf_w64_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2); - -extern int gf_w128_init (gf_t *gf); -extern int gf_w128_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2); - -extern int gf_wgen_init (gf_t *gf); -extern int gf_wgen_scratch_size(int w, int mult_type, int region_type, int divide_type, int arg1, int arg2); - -void gf_wgen_cauchy_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor); -gf_val_32_t gf_wgen_extract_word(gf_t *gf, void *start, int bytes, int index); - -extern void gf_alignment_error(char *s, int a); - -extern uint32_t gf_bitmatrix_inverse(uint32_t y, int w, uint32_t pp); - -/* This returns the correct default for prim_poly when base is used as the base - field for COMPOSITE. It returns 0 if we don't have a default prim_poly. */ - -extern uint64_t gf_composite_get_default_poly(gf_t *base); - -/* This structure lets you define a region multiply. It helps because you can handle - unaligned portions of the data with the procedures below, which really cleans - up the code. */ - -typedef struct { - gf_t *gf; - void *src; - void *dest; - int bytes; - uint64_t val; - int xor; - int align; /* The number of bytes to which to align. */ - void *s_start; /* The start and the top of the aligned region. */ - void *d_start; - void *s_top; - void *d_top; -} gf_region_data; - -/* This lets you set up one of these in one call. It also sets the start/top pointers. */ - -void gf_set_region_data(gf_region_data *rd, - gf_t *gf, - void *src, - void *dest, - int bytes, - uint64_t val, - int xor, - int align); - -/* This performs gf->multiply.32() on all of the unaligned bytes in the beginning of the region */ - -extern void gf_do_initial_region_alignment(gf_region_data *rd); - -/* This performs gf->multiply.32() on all of the unaligned bytes in the end of the region */ - -extern void gf_do_final_region_alignment(gf_region_data *rd); - -extern void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base); - -extern void gf_multby_zero(void *dest, int bytes, int xor); -extern void gf_multby_one(void *src, void *dest, int bytes, int xor); - -typedef enum {GF_E_MDEFDIV, /* Dev != Default && Mult == Default */ - GF_E_MDEFREG, /* Reg != Default && Mult == Default */ - GF_E_MDEFARG, /* Args != Default && Mult == Default */ - GF_E_DIVCOMP, /* Mult == Composite && Div != Default */ - GF_E_CAUCOMP, /* Mult == Composite && Reg == CAUCHY */ - GF_E_DOUQUAD, /* Reg == DOUBLE && Reg == QUAD */ - GF_E_SSE__NO, /* Reg == SSE && Reg == NOSSE */ - GF_E_CAUCHYB, /* Reg == CAUCHY && Other Reg */ - GF_E_CAUGT32, /* Reg == CAUCHY && w > 32*/ - GF_E_ARG1SET, /* Arg1 != 0 && Mult \notin COMPOSITE/SPLIT/GROUP */ - GF_E_ARG2SET, /* Arg2 != 0 && Mult \notin SPLIT/GROUP */ - GF_E_MATRIXW, /* Div == MATRIX && w > 32 */ - GF_E_BAD___W, /* Illegal w */ - GF_E_DOUBLET, /* Reg == DOUBLE && Mult != TABLE */ - GF_E_DOUBLEW, /* Reg == DOUBLE && w \notin {4,8} */ - GF_E_DOUBLEJ, /* Reg == DOUBLE && other Reg */ - GF_E_DOUBLEL, /* Reg == DOUBLE & LAZY but w = 4 */ - GF_E_QUAD__T, /* Reg == QUAD && Mult != TABLE */ - GF_E_QUAD__W, /* Reg == QUAD && w != 4 */ - GF_E_QUAD__J, /* Reg == QUAD && other Reg */ - GF_E_LAZY__X, /* Reg == LAZY && not DOUBLE or QUAD*/ - GF_E_ALTSHIF, /* Mult == Shift && Reg == ALTMAP */ - GF_E_SSESHIF, /* Mult == Shift && Reg == SSE|NOSSE */ - GF_E_ALT_CFM, /* Mult == CARRY_FREE && Reg == ALTMAP */ - GF_E_SSE_CFM, /* Mult == CARRY_FREE && Reg == SSE|NOSSE */ - GF_E_PCLMULX, /* Mult == Carry_Free && No PCLMUL */ - GF_E_ALT_BY2, /* Mult == Bytwo_x && Reg == ALTMAP */ - GF_E_BY2_SSE, /* Mult == Bytwo_x && Reg == SSE && No SSE2 */ - GF_E_LOGBADW, /* Mult == LOGx, w too big*/ - GF_E_LOG___J, /* Mult == LOGx, && Reg == SSE|ALTMAP|NOSSE */ - GF_E_ZERBADW, /* Mult == LOG_ZERO, w \notin {8,16} */ - GF_E_ZEXBADW, /* Mult == LOG_ZERO_EXT, w != 8 */ - GF_E_LOGPOLY, /* Mult == LOG & poly not primitive */ - GF_E_GR_ARGX, /* Mult == GROUP, Bad arg1/2 */ - GF_E_GR_W_48, /* Mult == GROUP, w \in { 4, 8 } */ - GF_E_GR_W_16, /* Mult == GROUP, w == 16, arg1 != 4 || arg2 != 4 */ - GF_E_GR_128A, /* Mult == GROUP, w == 128, bad args */ - GF_E_GR_A_27, /* Mult == GROUP, either arg > 27 */ - GF_E_GR_AR_W, /* Mult == GROUP, either arg > w */ - GF_E_GR____J, /* Mult == GROUP, Reg == SSE|ALTMAP|NOSSE */ - GF_E_TABLE_W, /* Mult == TABLE, w too big */ - GF_E_TAB_SSE, /* Mult == TABLE, SSE|NOSSE only apply to w == 4 */ - GF_E_TABSSE3, /* Mult == TABLE, Need SSSE3 for SSE */ - GF_E_TAB_ALT, /* Mult == TABLE, Reg == ALTMAP */ - GF_E_SP128AR, /* Mult == SPLIT, w=128, Bad arg1/arg2 */ - GF_E_SP128AL, /* Mult == SPLIT, w=128, SSE requires ALTMAP */ - GF_E_SP128AS, /* Mult == SPLIT, w=128, ALTMAP requires SSE */ - GF_E_SP128_A, /* Mult == SPLIT, w=128, SSE only with 4/128 */ - GF_E_SP128_S, /* Mult == SPLIT, w=128, ALTMAP only with 4/128 */ - GF_E_SPLIT_W, /* Mult == SPLIT, Bad w (8, 16, 32, 64, 128) */ - GF_E_SP_16AR, /* Mult == SPLIT, w=16, Bad arg1/arg2 */ - GF_E_SP_16_A, /* Mult == SPLIT, w=16, ALTMAP only with 4/16 */ - GF_E_SP_16_S, /* Mult == SPLIT, w=16, SSE only with 4/16 */ - GF_E_SP_32AR, /* Mult == SPLIT, w=32, Bad arg1/arg2 */ - GF_E_SP_32AS, /* Mult == SPLIT, w=32, ALTMAP requires SSE */ - GF_E_SP_32_A, /* Mult == SPLIT, w=32, ALTMAP only with 4/32 */ - GF_E_SP_32_S, /* Mult == SPLIT, w=32, SSE only with 4/32 */ - GF_E_SP_64AR, /* Mult == SPLIT, w=64, Bad arg1/arg2 */ - GF_E_SP_64AS, /* Mult == SPLIT, w=64, ALTMAP requires SSE */ - GF_E_SP_64_A, /* Mult == SPLIT, w=64, ALTMAP only with 4/64 */ - GF_E_SP_64_S, /* Mult == SPLIT, w=64, SSE only with 4/64 */ - GF_E_SP_8_AR, /* Mult == SPLIT, w=8, Bad arg1/arg2 */ - GF_E_SP_8__A, /* Mult == SPLIT, w=8, no ALTMAP */ - GF_E_SP_SSE3, /* Mult == SPLIT, Need SSSE3 for SSE */ - GF_E_COMP_A2, /* Mult == COMP, arg1 must be = 2 */ - GF_E_COMP_SS, /* Mult == COMP, SSE|NOSSE */ - GF_E_COMP__W, /* Mult == COMP, Bad w. */ - GF_E_UNKFLAG, /* Unknown flag in create_from.... */ - GF_E_UNKNOWN, /* Unknown mult_type. */ - GF_E_UNK_REG, /* Unknown region_type. */ - GF_E_UNK_DIV, /* Unknown divide_type. */ - GF_E_CFM___W, /* Mult == CFM, Bad w. */ - GF_E_CFM4POL, /* Mult == CFM & Prim Poly has high bits set. */ - GF_E_CFM8POL, /* Mult == CFM & Prim Poly has high bits set. */ - GF_E_CF16POL, /* Mult == CFM & Prim Poly has high bits set. */ - GF_E_CF32POL, /* Mult == CFM & Prim Poly has high bits set. */ - GF_E_CF64POL, /* Mult == CFM & Prim Poly has high bits set. */ - GF_E_FEWARGS, /* Too few args in argc/argv. */ - GF_E_BADPOLY, /* Bad primitive polynomial -- too many bits set. */ - GF_E_COMP_PP, /* Bad primitive polynomial -- bigger than sub-field. */ - GF_E_COMPXPP, /* Can't derive a default pp for composite field. */ - GF_E_BASE__W, /* Composite -- Base field is the wrong size. */ - GF_E_TWOMULT, /* In create_from... two -m's. */ - GF_E_TWO_DIV, /* In create_from... two -d's. */ - GF_E_POLYSPC, /* Bad numbera after -p. */ - GF_E_SPLITAR, /* Ran out of arguments in SPLIT */ - GF_E_SPLITNU, /* Arguments not integers in SPLIT. */ - GF_E_GROUPAR, /* Ran out of arguments in GROUP */ - GF_E_GROUPNU, /* Arguments not integers in GROUP. */ - GF_E_DEFAULT } gf_error_type_t; - diff --git a/src/gf_method.c b/src/gf_method.c index 36ec3c4..90d62af 100644 --- a/src/gf_method.c +++ b/src/gf_method.c @@ -21,10 +21,9 @@ int create_gf_from_argv(gf_t *gf, int w, int argc, char **argv, int starting) { int mult_type, divide_type, region_type; - int arg1, arg2, subrg_size; + int arg1, arg2; uint64_t prim_poly; gf_t *base; - char *crt, *x, *y; mult_type = GF_MULT_DEFAULT; region_type = GF_REGION_DEFAULT; @@ -48,6 +47,10 @@ int create_gf_from_argv(gf_t *gf, int w, int argc, char **argv, int starting) } else if (strcmp(argv[starting], "CARRY_FREE") == 0) { mult_type = GF_MULT_CARRY_FREE; starting++; + //ADAM + } else if (strcmp(argv[starting], "CARRY_FREE_GK") == 0) { + mult_type = GF_MULT_CARRY_FREE_GK; + starting++; } else if (strcmp(argv[starting], "GROUP") == 0) { mult_type = GF_MULT_GROUP; if (argc < starting + 3) { diff --git a/src/gf_rand.h b/src/gf_rand.h deleted file mode 100644 index 24294ad..0000000 --- a/src/gf_rand.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic - * James S. Plank, Ethan L. Miller, Kevin M. Greenan, - * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. - * - * gf_rand.h - * - * Random number generation, using the "Mother of All" random number generator. */ - -#pragma once -#include <stdint.h> -#include <stdio.h> -#include <stdlib.h> - -/* These are all pretty self-explanatory */ -uint32_t MOA_Random_32(); -uint64_t MOA_Random_64(); -void MOA_Random_128(uint64_t *x); -uint32_t MOA_Random_W(int w, int zero_ok); -void MOA_Fill_Random_Region (void *reg, int size); /* reg should be aligned to 4 bytes, but - size can be anything. */ -void MOA_Seed(uint32_t seed); diff --git a/src/gf_w128.c b/src/gf_w128.c index fae9f5c..61cf3d7 100644 --- a/src/gf_w128.c +++ b/src/gf_w128.c @@ -81,6 +81,7 @@ int xor) } } +#if defined(INTEL_SSE4_PCLMUL) static void gf_w128_clm_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, @@ -89,9 +90,7 @@ int xor) int i; gf_val_128_t s128; gf_val_128_t d128; - uint64_t c128[2]; gf_region_data rd; -#if defined(INTEL_SSE4_PCLMUL) __m128i a,b; __m128i result0,result1; __m128i prim_poly; @@ -106,8 +105,6 @@ int xor) if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; } } - set_zero(c128, 0); - s128 = (gf_val_128_t) src; d128 = (gf_val_128_t) dest; @@ -184,8 +181,8 @@ int xor) d128[i+1] = (uint64_t)_mm_extract_epi64(result1,0); } } -#endif } +#endif /* * Some w128 notes: @@ -384,7 +381,7 @@ gf_w128_sse_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_ { #if defined(INTEL_SSE4) int i; - __m128i a, b, pp, one, prod, amask, l_middle_one, u_middle_one; + __m128i a, b, pp, prod, amask, u_middle_one; /*John: pmask is always the highest bit set, and the rest zeros. amask changes, it's a countdown.*/ uint32_t topbit, middlebit, pmask; /* this is used as a boolean value */ gf_internal_t *h; @@ -400,7 +397,6 @@ gf_w128_sse_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_ pmask = 0x80000000; amask = _mm_insert_epi32(prod, 0x80000000, 0x3); u_middle_one = _mm_insert_epi32(prod, 1, 0x2); - l_middle_one = _mm_insert_epi32(prod, 1 << 31, 0x1); for (i = 0; i < 64; i++) { topbit = (_mm_extract_epi32(prod, 0x3) & pmask); @@ -599,13 +595,13 @@ gf_w128_split_4_128_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_ } } +#if defined(INTEL_SSSE3) && defined(INTEL_SSE4) static void gf_w128_split_4_128_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor) { -#ifdef INTEL_SSSE3 gf_internal_t *h; - int i, m, j, k, tindex; + int i, j, k; uint64_t pp, v[2], s, *s64, *d64, *top; __m128i p, tables[32][16]; struct gf_w128_split_4_128_data *ld; @@ -624,7 +620,7 @@ gf_w128_split_4_128_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_ /* Doing this instead of gf_do_initial_region_alignment() because that doesn't hold 128-bit vals */ - gf_w128_multiply_region_from_single(gf, src, dest, val, (rd.s_start-src), xor); + gf_w128_multiply_region_from_single(gf, src, dest, val, ((uint8_t *)rd.s_start-(uint8_t *)src), xor); s64 = (uint64_t *) rd.s_start; d64 = (uint64_t *) rd.d_start; @@ -694,18 +690,18 @@ gf_w128_split_4_128_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_ /* Doing this instead of gf_do_final_region_alignment() because that doesn't hold 128-bit vals */ - gf_w128_multiply_region_from_single(gf, rd.s_top, rd.d_top, val, (src+bytes)-rd.s_top, xor); -#endif + gf_w128_multiply_region_from_single(gf, rd.s_top, rd.d_top, val, ((uint8_t *)src+bytes)-(uint8_t *)rd.s_top, xor); } +#endif +#if defined(INTEL_SSSE3) && defined(INTEL_SSE4) static void gf_w128_split_4_128_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor) { -#ifdef INTEL_SSSE3 gf_internal_t *h; - int i, m, j, k, tindex; - uint64_t pp, v[2], s, *s64, *d64, *top; + int i, j, k; + uint64_t pp, v[2], *s64, *d64, *top; __m128i si, tables[32][16], p[16], v0, mask1; struct gf_w128_split_4_128_data *ld; uint8_t btable[16]; @@ -724,7 +720,7 @@ gf_w128_split_4_128_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, /* Doing this instead of gf_do_initial_region_alignment() because that doesn't hold 128-bit vals */ - gf_w128_multiply_region_from_single(gf, src, dest, val, (rd.s_start-src), xor); + gf_w128_multiply_region_from_single(gf, src, dest, val, ((uint8_t *)rd.s_start-(uint8_t *)src), xor); s64 = (uint64_t *) rd.s_start; d64 = (uint64_t *) rd.d_start; @@ -804,9 +800,9 @@ gf_w128_split_4_128_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, } /* Doing this instead of gf_do_final_region_alignment() because that doesn't hold 128-bit vals */ - gf_w128_multiply_region_from_single(gf, rd.s_top, rd.d_top, val, (src+bytes)-rd.s_top, xor); -#endif + gf_w128_multiply_region_from_single(gf, rd.s_top, rd.d_top, val, ((uint8_t *)src+bytes)-(uint8_t *)rd.s_top, xor); } +#endif static void @@ -886,7 +882,7 @@ gf_w128_split_8_128_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_ void gf_w128_bytwo_b_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor) { - uint64_t bmask, pp, vmask; + uint64_t bmask, pp; gf_internal_t *h; uint64_t a[2], c[2], b[2], *s64, *d64, *top; gf_region_data rd; @@ -987,7 +983,7 @@ void gf_w128_group_m_init(gf_t *gf, gf_val_128_t b128) void gf_w128_group_multiply(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) { - int i,j; + int i; /* index_r, index_m, total_m (if g_r > g_m) */ int i_r, i_m, t_m; int mask_m, mask_r; @@ -1162,11 +1158,12 @@ gf_w128_euclid(GFP gf, gf_val_128_t a128, gf_val_128_t b128) uint64_t c_i[2]; uint64_t *b; uint64_t one = 1; - uint64_t buf, buf1; /* This needs to return some sort of error (in b128?) */ if (a128[0] == 0 && a128[1] == 0) return; + b = (uint64_t *) b128; + e_im1[0] = 0; e_im1[1] = ((gf_internal_t *) (gf->scratch))->prim_poly; e_i[0] = a128[0]; @@ -1240,7 +1237,6 @@ gf_w128_euclid(GFP gf, gf_val_128_t a128, gf_val_128_t b128) d_i = d_ip1; } - b = (uint64_t *) b128; b[0] = y_i[0]; b[1] = y_i[1]; return; @@ -1326,7 +1322,6 @@ static void gf_w128_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor) { - unsigned long uls, uld; gf_internal_t *h = (gf_internal_t *) gf->scratch; gf_t *base_gf = h->base_gf; uint64_t b0 = val[1]; @@ -1381,14 +1376,13 @@ gf_w128_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_12 gf_internal_t *h = (gf_internal_t *) gf->scratch; gf_t *base_gf = h->base_gf; gf_val_64_t val0 = val[1]; gf_val_64_t val1 = val[0]; - uint64_t *l, *hi; uint8_t *slow, *shigh; uint8_t *dlow, *dhigh, *top; int sub_reg_size; gf_region_data rd; gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 64); - gf_w128_multiply_region_from_single(gf, src, dest, val, (rd.s_start-src), xor); + gf_w128_multiply_region_from_single(gf, src, dest, val, ((uint8_t *)rd.s_start-(uint8_t *)src), xor); slow = (uint8_t *) rd.s_start; dlow = (uint8_t *) rd.d_start; @@ -1404,7 +1398,7 @@ gf_w128_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_12 base_gf->multiply_region.w64(base_gf, shigh, dhigh, base_gf->multiply.w64(base_gf, h->prim_poly, val1 ), sub_reg_size, 1); - gf_w128_multiply_region_from_single(gf, rd.s_top, rd.d_top, val, (src+bytes)-rd.s_top, xor); + gf_w128_multiply_region_from_single(gf, rd.s_top, rd.d_top, val, ((uint8_t *)src+bytes)-(uint8_t *)rd.s_top, xor); } @@ -1419,8 +1413,6 @@ int gf_w128_composite_init(gf_t *gf) gf->multiply_region.w128 = gf_w128_composite_multiply_region; } - gf_internal_t *base_h = (gf_internal_t *) h->base_gf->scratch; - gf->multiply.w128 = gf_w128_composite_multiply; gf->divide.w128 = gf_w128_divide_from_inverse; gf->inverse.w128 = gf_w128_composite_inverse; @@ -1444,8 +1436,6 @@ int gf_w128_cfm_init(gf_t *gf) static int gf_w128_shift_init(gf_t *gf) { - gf_internal_t *h; - h = (gf_internal_t*) gf->scratch; gf->multiply.w128 = gf_w128_shift_multiply; gf->inverse.w128 = gf_w128_euclid; gf->multiply_region.w128 = gf_w128_multiply_region_from_single; @@ -1501,10 +1491,10 @@ void gf_w128_group_r_init(gf_t *gf) return; } +#if 0 // defined(INTEL_SSE4) static void gf_w128_group_r_sse_init(gf_t *gf) { -#if defined(INTEL_SSE4) int i, j; int g_r; uint64_t pp; @@ -1526,8 +1516,8 @@ void gf_w128_group_r_sse_init(gf_t *gf) } } return; -#endif } +#endif static int gf_w128_split_init(gf_t *gf) @@ -1587,16 +1577,14 @@ int gf_w128_group_init(gf_t *gf) { gf_internal_t *scratch; gf_group_tables_t *gt; - int g_m, g_r, size_r; - long tmp; + int g_r, size_r; scratch = (gf_internal_t *) gf->scratch; gt = scratch->private; - g_m = scratch->arg1; g_r = scratch->arg2; size_r = (1 << g_r); - gt->r_table = scratch->private + (2 * sizeof(uint64_t *)); + gt->r_table = (gf_val_128_t)((uint8_t *)scratch->private + (2 * sizeof(uint64_t *))); gt->m_table = gt->r_table + size_r; gt->m_table[2] = 0; gt->m_table[3] = 0; @@ -1690,7 +1678,6 @@ void gf_w128_composite_extract_word(gf_t *gf, void *start, int bytes, int index, int gf_w128_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) { int size_m, size_r; - int w = 128; if (divide_type==GF_DIVIDE_MATRIX) return 0; switch(mult_type) @@ -1739,7 +1726,7 @@ int gf_w128_scratch_size(int mult_type, int region_type, int divide_type, int ar int gf_w128_init(gf_t *gf) { - gf_internal_t *h, *h_base, *h_base_base, *h_base_base_base; + gf_internal_t *h; int no_default_flag = 0; h = (gf_internal_t *) gf->scratch; diff --git a/src/gf_w16.c b/src/gf_w16.c index 454c6cc..272a95b 100644 --- a/src/gf_w16.c +++ b/src/gf_w16.c @@ -125,6 +125,7 @@ gf_w16_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_32_t gf_do_final_region_alignment(&rd); } +#if defined(INTEL_SSE4_PCLMUL) static void gf_w16_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) @@ -132,8 +133,6 @@ gf_w16_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val gf_region_data rd; uint16_t *s16; uint16_t *d16; - -#if defined(INTEL_SSE4_PCLMUL) __m128i a, b; __m128i result; __m128i prim_poly; @@ -186,9 +185,10 @@ gf_w16_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val } } gf_do_final_region_alignment(&rd); -#endif } +#endif +#if defined(INTEL_SSE4_PCLMUL) static void gf_w16_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) @@ -197,8 +197,6 @@ gf_w16_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, gf_val uint16_t *s16; uint16_t *d16; -#if defined(INTEL_SSE4_PCLMUL) - __m128i a, b; __m128i result; __m128i prim_poly; @@ -255,9 +253,10 @@ gf_w16_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, gf_val } } gf_do_final_region_alignment(&rd); -#endif } +#endif +#if defined(INTEL_SSE4_PCLMUL) static void gf_w16_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) @@ -266,8 +265,6 @@ gf_w16_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val uint16_t *s16; uint16_t *d16; -#if defined(INTEL_SSE4_PCLMUL) - __m128i a, b; __m128i result; __m128i prim_poly; @@ -328,8 +325,8 @@ gf_w16_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val } } gf_do_final_region_alignment(&rd); -#endif } +#endif static inline @@ -453,7 +450,7 @@ gf_w16_clm_multiply_2 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16) __m128i a, b; __m128i result; __m128i prim_poly; - __m128i v, w; + __m128i w; gf_internal_t * h = gf->scratch; a = _mm_insert_epi32 (_mm_setzero_si128(), a16, 0); @@ -500,7 +497,7 @@ gf_w16_clm_multiply_3 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16) __m128i a, b; __m128i result; __m128i prim_poly; - __m128i v, w; + __m128i w; gf_internal_t * h = gf->scratch; a = _mm_insert_epi32 (_mm_setzero_si128(), a16, 0); @@ -540,7 +537,7 @@ gf_w16_clm_multiply_4 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16) __m128i a, b; __m128i result; __m128i prim_poly; - __m128i v, w; + __m128i w; gf_internal_t * h = gf->scratch; a = _mm_insert_epi32 (_mm_setzero_si128(), a16, 0); @@ -605,13 +602,13 @@ int gf_w16_shift_init(gf_t *gf) static int gf_w16_cfm_init(gf_t *gf) { +#if defined(INTEL_SSE4_PCLMUL) gf_internal_t *h; h = (gf_internal_t *) gf->scratch; /*Ben: Determining how many reductions to do */ -#if defined(INTEL_SSE4_PCLMUL) if ((0xfe00 & h->prim_poly) == 0) { gf->multiply.w32 = gf_w16_clm_multiply_2; gf->multiply_region.w32 = gf_w16_clm_multiply_region_from_single_2; @@ -774,9 +771,8 @@ static void gf_w16_split_4_16_lazy_nosse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { - uint64_t i, j, a, b, c, prod; + uint64_t i, j, c, prod; uint8_t *s8, *d8, *top; - gf_internal_t *h; uint16_t table[4][16]; gf_region_data rd; @@ -786,8 +782,6 @@ gf_w16_split_4_16_lazy_nosse_altmap_multiply_region(gf_t *gf, void *src, void *d gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32); gf_do_initial_region_alignment(&rd); - h = (gf_internal_t *) gf->scratch; - /*Ben: Constructs lazy multiplication table*/ for (j = 0; j < 16; j++) { @@ -840,7 +834,6 @@ gf_w16_split_4_16_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_3 { uint64_t i, j, a, c, prod; uint16_t *s16, *d16, *top; - gf_internal_t *h; uint16_t table[4][16]; gf_region_data rd; @@ -850,8 +843,6 @@ gf_w16_split_4_16_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_3 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); gf_do_initial_region_alignment(&rd); - h = (gf_internal_t *) gf->scratch; - for (j = 0; j < 16; j++) { for (i = 0; i < 4; i++) { c = (j << (i*4)); @@ -880,7 +871,7 @@ static void gf_w16_split_8_16_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { - uint64_t j, k, v, a, c, prod, *s64, *d64, *top64; + uint64_t j, k, v, a, prod, *s64, *d64, *top64; gf_internal_t *h; uint64_t htable[256], ltable[256]; gf_region_data rd; @@ -966,7 +957,7 @@ gf_w16_split_8_16_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_3 static void gf_w16_table_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { - uint64_t j, a, c, pp; + uint64_t c; gf_internal_t *h; struct gf_w16_lazytable_data *ltd; gf_region_data rd; @@ -1010,12 +1001,12 @@ gf_w16_split_4_16_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_v { #ifdef INTEL_SSSE3 uint64_t i, j, *s64, *d64, *top64;; - uint64_t a, c, prod; + uint64_t c, prod; uint8_t low[4][16]; uint8_t high[4][16]; gf_region_data rd; - __m128i mask, ta, tb, ti, tpl, tph, tlow[4], thigh[4], tta, ttb, shuffler, unshuffler, lmask; + __m128i mask, ta, tb, ti, tpl, tph, tlow[4], thigh[4], tta, ttb, lmask; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } @@ -1147,7 +1138,6 @@ gf_w16_split_4_16_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *des uint8_t low[4][16]; uint8_t high[4][16]; gf_region_data rd; - struct gf_single_table_data *std; __m128i mask, ta, tb, ti, tpl, tph, tlow[4], thigh[4]; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } @@ -1358,11 +1348,8 @@ issse3 = 0; static int gf_w16_table_init(gf_t *gf) { - gf_internal_t *h; gf_w16_log_init(gf); - h = (gf_internal_t *) gf->scratch; - gf->multiply_region.w32 = gf_w16_table_lazy_multiply_region; return 1; } @@ -1557,15 +1544,14 @@ gf_w16_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_ prod = _mm_xor_si128(prod, t1); \ v = _mm_srli_epi64(v, 1); } +#ifdef INTEL_SSE2 static void gf_w16_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSE2 int i; uint8_t *s8, *d8; uint32_t vrev; - uint64_t amask; __m128i pp, m1, m2, ta, prod, t1, t2, tp, one, v; struct gf_w16_bytwo_data *btd; gf_region_data rd; @@ -1618,17 +1604,16 @@ gf_w16_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t s8 += 16; } gf_do_final_region_alignment(&rd); -#endif } +#endif +#ifdef INTEL_SSE2 static void gf_w16_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w16_bytwo_data *btd) { -#ifdef INTEL_SSE2 - int i; - uint8_t *d8, *s8, tb; - __m128i pp, m1, m2, t1, t2, va, vb; + uint8_t *d8, *s8; + __m128i pp, m1, m2, t1, t2, va; s8 = (uint8_t *) rd->s_start; d8 = (uint8_t *) rd->d_start; @@ -1644,16 +1629,15 @@ gf_w16_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w16_bytwo_data * d8 += 16; s8 += 16; } -#endif } +#endif +#ifdef INTEL_SSE2 static void gf_w16_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w16_bytwo_data *btd) { -#ifdef INTEL_SSE2 - int i; - uint8_t *d8, *s8, tb; + uint8_t *d8, *s8; __m128i pp, m1, m2, t1, t2, va, vb; s8 = (uint8_t *) rd->s_start; @@ -1672,15 +1656,15 @@ gf_w16_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w16_bytwo_data *bt d8 += 16; s8 += 16; } -#endif } +#endif +#ifdef INTEL_SSE2 static void gf_w16_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSE2 int itb; uint8_t *d8, *s8; __m128i pp, m1, m2, t1, t2, va, vb; @@ -1728,14 +1712,13 @@ gf_w16_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t } gf_do_final_region_alignment(&rd); -#endif } +#endif static void gf_w16_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { - int i; uint64_t *s64, *d64, t1, t2, ta, tb, prod; struct gf_w16_bytwo_data *btd; gf_region_data rd; @@ -1834,6 +1817,7 @@ gf_w16_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_ s64++; } } + break; default: if (xor) { while (d64 < (uint64_t *) rd.d_top) { @@ -1988,7 +1972,6 @@ gf_val_32_t gf_w16_composite_multiply_inline(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; uint8_t b0 = b & 0x00ff; uint8_t b1 = (b & 0xff00) >> 8; uint8_t a0 = a & 0x00ff; @@ -2072,7 +2055,6 @@ static void gf_w16_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { - unsigned long uls, uld; gf_internal_t *h = (gf_internal_t *) gf->scratch; gf_t *base_gf = h->base_gf; uint8_t b0 = val & 0x00ff; @@ -2080,7 +2062,6 @@ gf_w16_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t va uint16_t *s16, *d16, *top; uint8_t a0, a1, a1b1, *mt; gf_region_data rd; - struct gf_w16_logtable_data *ltd; struct gf_w16_composite_data *cd; cd = (struct gf_w16_composite_data *) h->private; @@ -2237,7 +2218,6 @@ inline gf_val_32_t gf_w16_group_4_4_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { - int i; uint16_t p, l, ind, r, a16; struct gf_w16_group_4_4_data *d44; @@ -2270,7 +2250,6 @@ gf_w16_group_4_4_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) static void gf_w16_group_4_4_region_multiply(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { - int i; uint16_t p, l, ind, r, a16, p16; struct gf_w16_group_4_4_data *d44; gf_region_data rd; @@ -2475,10 +2454,8 @@ int gf_w16_init(gf_t *gf) uint16_t *gf_w16_get_log_table(gf_t *gf) { - gf_internal_t *h; struct gf_w16_logtable_data *ltd; - h = (gf_internal_t *) gf->scratch; if (gf->multiply.w32 == gf_w16_log_multiply) { ltd = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private; return (uint16_t *) ltd->log_tbl; diff --git a/src/gf_w32.c b/src/gf_w32.c index 03f285f..c90c7fb 100644 --- a/src/gf_w32.c +++ b/src/gf_w32.c @@ -120,13 +120,13 @@ xor) } } +#if defined(INTEL_SSE4_PCLMUL) + static void gf_w32_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) { -#if defined(INTEL_SSE4_PCLMUL) - int i; uint32_t *s32; uint32_t *d32; @@ -167,16 +167,16 @@ gf_w32_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, uint32 d32[i] = ((gf_val_32_t)_mm_extract_epi32(result, 0)); } } -#endif } +#endif + +#if defined(INTEL_SSE4_PCLMUL) static void gf_w32_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) { -#if defined(INTEL_SSE4_PCLMUL) - int i; uint32_t *s32; uint32_t *d32; @@ -222,14 +222,14 @@ gf_w32_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, uint32 d32[i] = ((gf_val_32_t)_mm_extract_epi32(result, 0)); } } -#endif } +#endif +#if defined(INTEL_SSE4_PCLMUL) static void gf_w32_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) { -#if defined(INTEL_SSE4_PCLMUL) int i; uint32_t *s32; uint32_t *d32; @@ -279,8 +279,8 @@ gf_w32_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, uint32 d32[i] = ((gf_val_32_t)_mm_extract_epi32(result, 0)); } } -#endif } +#endif static inline @@ -399,7 +399,94 @@ uint32_t gf_w32_matrix (gf_t *gf, uint32_t b) extra memory. */ +//ADAM +static +inline +gf_val_32_t +gf_w32_cfmgk_multiply (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32) +{ + gf_val_32_t rv = 0; + +#if defined(INTEL_SSE4_PCLMUL) + + __m128i a, b; + __m128i result; + __m128i w; + __m128i g, q; + gf_internal_t * h = gf->scratch; + uint64_t g_star, q_plus; + + q_plus = *(uint64_t *) h->private; + g_star = *((uint64_t *) h->private + 1); + + a = _mm_insert_epi32 (_mm_setzero_si128(), a32, 0); + b = _mm_insert_epi32 (a, b32, 0); + g = _mm_insert_epi64 (a, g_star, 0); + q = _mm_insert_epi64 (a, q_plus, 0); + + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (q, _mm_srli_si128 (result, 4), 0); + w = _mm_clmulepi64_si128 (g, _mm_srli_si128 (w, 4), 0); + result = _mm_xor_si128 (result, w); + + /* Extracts 32 bit value from result. */ + rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); +#endif + return rv; +} + +//ADAM +#if defined(INTEL_SSE4_PCLMUL) +static +void +gf_w32_cfmgk_multiply_region_from_single(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) +{ + + int i; + uint32_t *s32; + uint32_t *d32; + + __m128i a, b; + __m128i result; + __m128i w; + __m128i g, q; + gf_internal_t * h = gf->scratch; + uint64_t g_star, q_plus; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + q_plus = *(uint64_t *) h->private; + g_star = *((uint64_t *) h->private + 1); + + g = _mm_insert_epi64 (a, g_star, 0); + q = _mm_insert_epi64 (a, q_plus, 0); + a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); + s32 = (uint32_t *) src; + d32 = (uint32_t *) dest; + + if (xor) { + for (i = 0; i < bytes/sizeof(uint32_t); i++) { + b = _mm_insert_epi32 (a, s32[i], 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (q, _mm_srli_si128 (result, 4), 0); + w = _mm_clmulepi64_si128 (g, _mm_srli_si128 (w, 4), 0); + result = _mm_xor_si128 (result, w); + d32[i] ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); + } + } else { + for (i = 0; i < bytes/sizeof(uint32_t); i++) { + b = _mm_insert_epi32 (a, s32[i], 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (q, _mm_srli_si128 (result, 4), 0); + w = _mm_clmulepi64_si128 (g, _mm_srli_si128 (w, 4), 0); + result = _mm_xor_si128 (result, w); + d32[i] = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + } + } +} +#endif static @@ -414,7 +501,7 @@ gf_w32_clm_multiply_2 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32) __m128i a, b; __m128i result; __m128i prim_poly; - __m128i v, w; + __m128i w; gf_internal_t * h = gf->scratch; @@ -446,6 +533,7 @@ gf_w32_clm_multiply_2 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32) #endif return rv; } + static inline gf_val_32_t @@ -458,7 +546,7 @@ gf_w32_clm_multiply_3 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32) __m128i a, b; __m128i result; __m128i prim_poly; - __m128i v, w; + __m128i w; gf_internal_t * h = gf->scratch; @@ -497,7 +585,7 @@ gf_w32_clm_multiply_4 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32) __m128i a, b; __m128i result; __m128i prim_poly; - __m128i v, w; + __m128i w; gf_internal_t * h = gf->scratch; @@ -552,13 +640,48 @@ gf_w32_shift_multiply (gf_t *gf, uint32_t a32, uint32_t b32) return product; } +//ADAM static -int gf_w32_cfm_init(gf_t *gf) +int gf_w32_cfmgk_init(gf_t *gf) { + gf->inverse.w32 = gf_w32_euclid; + gf->multiply_region.w32 = gf_w32_multiply_region_from_single; + +#if defined(INTEL_SSE4_PCLMUL) gf_internal_t *h; h = (gf_internal_t *) gf->scratch; + gf->multiply.w32 = gf_w32_cfmgk_multiply; + gf->multiply_region.w32 = gf_w32_cfmgk_multiply_region_from_single; + + //setup in the private section the q+ and g* ADAM + uint64_t *q_plus = (uint64_t *) h->private; + uint64_t *g_star = (uint64_t *) h->private + 1; + //q+ + uint64_t tmp = h->prim_poly << 32; + *q_plus = 1ULL << 32; + + int i; + for(i = 63; i >= 32; i--) + if((1ULL << i) & tmp) + { + *q_plus |= 1ULL << (i-32); + tmp ^= h->prim_poly << (i-32); + } + + //g* + *g_star = h->prim_poly & ((1ULL << 32) - 1); + + return 1; +#endif + + return 0; +} + + static +int gf_w32_cfm_init(gf_t *gf) +{ gf->inverse.w32 = gf_w32_euclid; gf->multiply_region.w32 = gf_w32_multiply_region_from_single; @@ -566,6 +689,10 @@ int gf_w32_cfm_init(gf_t *gf) /*Ben: Check to see how many reduction steps it will take*/ #if defined(INTEL_SSE4_PCLMUL) + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + if ((0xfffe0000 & h->prim_poly) == 0){ gf->multiply.w32 = gf_w32_clm_multiply_2; gf->multiply_region.w32 = gf_w32_clm_multiply_region_from_single_2; @@ -616,9 +743,8 @@ gf_w32_group_set_shift_tables(uint32_t *shift, uint32_t val, gf_internal_t *h) static void gf_w32_group_s_equals_r_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { - int i; int leftover, rs; - uint32_t p, l, ind, r, a32; + uint32_t p, l, ind, a32; int bits_left; int g_s; gf_region_data rd; @@ -741,9 +867,8 @@ inline gf_val_32_t gf_w32_group_s_equals_r_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { - int i; int leftover, rs; - uint32_t p, l, ind, r, a32; + uint32_t p, l, ind, a32; int bits_left; int g_s; @@ -781,8 +906,7 @@ inline gf_val_32_t gf_w32_group_4_4_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { - int i; - uint32_t p, l, ind, r, a32; + uint32_t p, l, ind, a32; struct gf_w32_group_data *d44; gf_internal_t *h = (gf_internal_t *) gf->scratch; @@ -832,7 +956,7 @@ gf_w32_group_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { int i; int leftover; - uint64_t p, l, r, mask; + uint64_t p, l, r; uint32_t a32, ind; int g_s, g_r; struct gf_w32_group_data *gd; @@ -986,15 +1110,14 @@ gf_w32_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_ prod = _mm_xor_si128(prod, t1); \ v = _mm_srli_epi64(v, 1); } +#ifdef INTEL_SSE2 static void gf_w32_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSE2 int i; uint8_t *s8, *d8; uint32_t vrev; - uint64_t amask; __m128i pp, m1, m2, ta, prod, t1, t2, tp, one, v; struct gf_w32_bytwo_data *btd; gf_region_data rd; @@ -1039,14 +1162,13 @@ gf_w32_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t s8 += 16; } gf_do_final_region_alignment(&rd); -#endif } +#endif static void gf_w32_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { - int i; uint64_t *s64, *d64, t1, t2, ta, tb, prod; struct gf_w32_bytwo_data *btd; gf_region_data rd; @@ -1101,6 +1223,7 @@ gf_w32_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_ s64++; } } + break; case 4: if (xor) { while (d64 < (uint64_t *) rd.d_top) { @@ -1144,6 +1267,7 @@ gf_w32_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_ s64++; } } + break; default: if (xor) { while (d64 < (uint64_t *) rd.d_top) { @@ -1181,14 +1305,13 @@ gf_w32_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_ gf_do_final_region_alignment(&rd); } +#ifdef INTEL_SSE2 static void gf_w32_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w32_bytwo_data *btd) { -#ifdef INTEL_SSE2 - int i; - uint8_t *d8, *s8, tb; - __m128i pp, m1, m2, t1, t2, va, vb; + uint8_t *d8, *s8; + __m128i pp, m1, m2, t1, t2, va; s8 = (uint8_t *) rd->s_start; d8 = (uint8_t *) rd->d_start; @@ -1204,16 +1327,15 @@ gf_w32_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w32_bytwo_data * d8 += 16; s8 += 16; } -#endif } +#endif +#ifdef INTEL_SSE2 static void gf_w32_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w32_bytwo_data *btd) { -#ifdef INTEL_SSE2 - int i; - uint8_t *d8, *s8, tb; + uint8_t *d8, *s8; __m128i pp, m1, m2, t1, t2, va, vb; s8 = (uint8_t *) rd->s_start; @@ -1232,15 +1354,15 @@ gf_w32_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w32_bytwo_data *bt d8 += 16; s8 += 16; } -#endif } +#endif +#ifdef INTEL_SSE2 static void gf_w32_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSE2 uint32_t itb; uint8_t *d8, *s8; __m128i pp, m1, m2, t1, t2, va, vb; @@ -1288,8 +1410,8 @@ gf_w32_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t } gf_do_final_region_alignment(&rd); -#endif } +#endif static int gf_w32_bytwo_init(gf_t *gf) @@ -1556,14 +1678,14 @@ gf_w32_split_2_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t gf_do_final_region_alignment(&rd); } +#ifdef INTEL_SSSE3 static void gf_w32_split_2_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) { -#ifdef INTEL_SSSE3 gf_internal_t *h; - int i, m, j, tindex; - uint32_t pp, v, v2, s, *s32, *d32, *top; + int i, tindex; + uint32_t pp, v, v2, *s32, *d32, *top; __m128i vi, si, pi, shuffler, tables[16], adder, xi, mask1, mask2; gf_region_data rd; @@ -1635,8 +1757,8 @@ gf_w32_split_2_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint gf_do_final_region_alignment(&rd); -#endif } +#endif static void @@ -1699,8 +1821,8 @@ gf_w32_split_4_32_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *des { #ifdef INTEL_SSSE3 gf_internal_t *h; - int i, m, j, k, tindex; - uint32_t pp, v, s, *s32, *d32, *top, *realtop; + int i, j, k; + uint32_t pp, v, *s32, *d32, *top; __m128i si, tables[8][4], p0, p1, p2, p3, mask1, v0, v1, v2, v3; struct gf_split_4_32_lazy_data *ld; uint8_t btable[16]; @@ -1891,9 +2013,9 @@ gf_w32_split_4_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint { #ifdef INTEL_SSSE3 gf_internal_t *h; - int i, m, j, k, tindex; - uint32_t pp, v, s, *s32, *d32, *top, tmp_table[16]; - __m128i vi, si, tables[8][4], p0, p1, p2, p3, mask1, v0, v1, v2, v3, mask8; + int i, j, k; + uint32_t pp, v, *s32, *d32, *top, tmp_table[16]; + __m128i si, tables[8][4], p0, p1, p2, p3, mask1, v0, v1, v2, v3, mask8; __m128i tv1, tv2, tv3, tv0; uint8_t btable[16]; gf_region_data rd; @@ -2378,7 +2500,6 @@ uint32_t gf_w32_composite_multiply_inline(gf_t *gf, uint32_t a, uint32_t b) { gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; uint32_t b0 = b & 0x0000ffff; uint32_t b1 = b >> 16; uint32_t a0 = a & 0x0000ffff; @@ -2620,11 +2741,8 @@ int gf_w32_composite_init(gf_t *gf) int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) { - int ss; int issse3 = 0; - ss = (GF_REGION_SSE | GF_REGION_NOSSE); - #ifdef INTEL_SSSE3 issse3 = 1; #endif @@ -2665,6 +2783,10 @@ int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg case GF_MULT_CARRY_FREE: return sizeof(gf_internal_t); break; + //ADAM + case GF_MULT_CARRY_FREE_GK: + return sizeof(gf_internal_t) + sizeof(uint64_t)*2; + break; case GF_MULT_SHIFT: return sizeof(gf_internal_t); break; @@ -2712,14 +2834,15 @@ int gf_w32_init(gf_t *gf) gf->multiply_region.w32 = NULL; switch(h->mult_type) { - case GF_MULT_CARRY_FREE: if (gf_w32_cfm_init(gf) == 0) return 0; break; - case GF_MULT_SHIFT: if (gf_w32_shift_init(gf) == 0) return 0; break; - case GF_MULT_COMPOSITE: if (gf_w32_composite_init(gf) == 0) return 0; break; + case GF_MULT_CARRY_FREE: if (gf_w32_cfm_init(gf) == 0) return 0; break; + case GF_MULT_CARRY_FREE_GK: if (gf_w32_cfmgk_init(gf) == 0) return 0; break; //ADAM + case GF_MULT_SHIFT: if (gf_w32_shift_init(gf) == 0) return 0; break; + case GF_MULT_COMPOSITE: if (gf_w32_composite_init(gf) == 0) return 0; break; case GF_MULT_DEFAULT: - case GF_MULT_SPLIT_TABLE: if (gf_w32_split_init(gf) == 0) return 0; break; - case GF_MULT_GROUP: if (gf_w32_group_init(gf) == 0) return 0; break; + case GF_MULT_SPLIT_TABLE: if (gf_w32_split_init(gf) == 0) return 0; break; + case GF_MULT_GROUP: if (gf_w32_group_init(gf) == 0) return 0; break; case GF_MULT_BYTWO_p: - case GF_MULT_BYTWO_b: if (gf_w32_bytwo_init(gf) == 0) return 0; break; + case GF_MULT_BYTWO_b: if (gf_w32_bytwo_init(gf) == 0) return 0; break; default: return 0; } if (h->divide_type == GF_DIVIDE_EUCLID) { diff --git a/src/gf_w4.c b/src/gf_w4.c index 2504ec6..6bc79d0 100644 --- a/src/gf_w4.c +++ b/src/gf_w4.c @@ -61,7 +61,8 @@ struct gf_bytwo_data { t2 = ((t2 << 1) - (t2 >> (GF_FIELD_WIDTH-1))); \ b = (t1 ^ (t2 & ip));} -#define SSE_AB2(pp, m1 ,m2, va, t1, t2) {\ +// ToDo(KMG/JSP): Why is 0x88 hard-coded? +#define SSE_AB2(pp, m1, va, t1, t2) {\ t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); \ t2 = _mm_and_si128(va, _mm_set1_epi8(0x88)); \ t2 = _mm_sub_epi64 (_mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); \ @@ -414,14 +415,14 @@ gf_w4_single_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t #define MM_PRINT(s, r) { uint8_t blah[16]; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (i = 0; i < 16; i++) printf(" %02x", blah[i]); printf("\n"); } +#ifdef INTEL_SSSE3 static void gf_w4_single_table_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSSE3 gf_region_data rd; uint8_t *base, *sptr, *dptr, *top; - __m128i tl, loset, h4, r, va, th; + __m128i tl, loset, r, va, th; struct gf_single_table_data *std; @@ -460,15 +461,15 @@ gf_w4_single_table_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_3 } gf_do_final_region_alignment(&rd); -#endif } +#endif static int gf_w4_single_table_init(gf_t *gf) { gf_internal_t *h; struct gf_single_table_data *std; - int a, b, prod, loga, logb; + int a, b, prod; h = (gf_internal_t *) gf->scratch; @@ -531,7 +532,6 @@ static void gf_w4_double_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { - unsigned long uls, uld; int i; uint8_t *s8, *d8, *base; gf_region_data rd; @@ -560,7 +560,7 @@ int gf_w4_double_table_init(gf_t *gf) { gf_internal_t *h; struct gf_double_table_data *std; - int a, b, c, prod, loga, logb, ab; + int a, b, c, prod, ab; uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE]; h = (gf_internal_t *) gf->scratch; @@ -687,7 +687,7 @@ int gf_w4_quad_table_init(gf_t *gf) { gf_internal_t *h; struct gf_quad_table_data *std; - int prod, loga, logb, ab, val, a, b, c, d, va, vb, vc, vd; + int prod, val, a, b, c, d, va, vb, vc, vd; uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE]; h = (gf_internal_t *) gf->scratch; @@ -731,10 +731,9 @@ int gf_w4_quad_table_lazy_init(gf_t *gf) { gf_internal_t *h; struct gf_quad_table_lazy_data *std; - int a, b, c, prod, loga, logb, ab; + int a, b, prod, loga, logb; uint8_t log_tbl[GF_FIELD_SIZE]; uint8_t antilog_tbl[GF_FIELD_SIZE*2]; - uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE]; h = (gf_internal_t *) gf->scratch; std = (struct gf_quad_table_lazy_data *)h->private; @@ -794,7 +793,6 @@ int gf_w4_table_init(gf_t *gf) } else { return gf_w4_quad_table_init(gf); } - return gf_w4_double_table_init(gf); } else { return gf_w4_single_table_init(gf); } @@ -911,23 +909,22 @@ gf_w4_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t } #define BYTWO_P_ONESTEP {\ - SSE_AB2(pp, m1 ,m2, prod, t1, t2); \ + SSE_AB2(pp, m1, prod, t1, t2); \ t1 = _mm_and_si128(v, one); \ t1 = _mm_sub_epi8(t1, one); \ t1 = _mm_and_si128(t1, ta); \ prod = _mm_xor_si128(prod, t1); \ v = _mm_srli_epi64(v, 1); } +#ifdef INTEL_SSE2 static void gf_w4_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSE2 int i; uint8_t *s8, *d8; uint8_t vrev; - uint64_t amask; - __m128i pp, m1, m2, ta, prod, t1, t2, tp, one, v; + __m128i pp, m1, ta, prod, t1, t2, tp, one, v; struct gf_bytwo_data *btd; gf_region_data rd; @@ -950,7 +947,6 @@ gf_w4_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v pp = _mm_set1_epi8(btd->prim_poly&0xff); m1 = _mm_set1_epi8((btd->mask1)&0xff); - m2 = _mm_set1_epi8((btd->mask2)&0xff); one = _mm_set1_epi8(1); while (d8 < (uint8_t *) rd.d_top) { @@ -967,8 +963,8 @@ gf_w4_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v s8 += 16; } gf_do_final_region_alignment(&rd); -#endif } +#endif /* static @@ -1036,354 +1032,330 @@ gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v } */ +#ifdef INTEL_SSE2 static void gf_w4_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) { -#ifdef INTEL_SSE2 - int i; - uint8_t *d8, *s8, tb; - __m128i pp, m1, m2, t1, t2, va, vb; + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va; s8 = (uint8_t *) rd->s_start; d8 = (uint8_t *) rd->d_start; pp = _mm_set1_epi8(btd->prim_poly&0xff); m1 = _mm_set1_epi8((btd->mask1)&0xff); - m2 = _mm_set1_epi8((btd->mask2)&0xff); while (d8 < (uint8_t *) rd->d_top) { va = _mm_load_si128 ((__m128i *)(s8)); - SSE_AB2(pp, m1, m2, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); _mm_store_si128((__m128i *)d8, va); d8 += 16; s8 += 16; } -#endif } +#endif +#ifdef INTEL_SSE2 static void gf_w4_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_bytwo_data *btd) { -#ifdef INTEL_SSE2 - int i; - uint8_t *d8, *s8, tb; - __m128i pp, m1, m2, t1, t2, va, vb; + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va, vb; s8 = (uint8_t *) rd->s_start; d8 = (uint8_t *) rd->d_start; pp = _mm_set1_epi8(btd->prim_poly&0xff); m1 = _mm_set1_epi8((btd->mask1)&0xff); - m2 = _mm_set1_epi8((btd->mask2)&0xff); while (d8 < (uint8_t *) rd->d_top) { va = _mm_load_si128 ((__m128i *)(s8)); - SSE_AB2(pp, m1, m2, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); vb = _mm_load_si128 ((__m128i *)(d8)); vb = _mm_xor_si128(vb, va); _mm_store_si128((__m128i *)d8, vb); d8 += 16; s8 += 16; } -#endif } +#endif +#ifdef INTEL_SSE2 static void gf_w4_bytwo_b_sse_region_4_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) { -#ifdef INTEL_SSE2 - int i; - uint8_t *d8, *s8, tb; - __m128i pp, m1, m2, t1, t2, va, vb; + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va; s8 = (uint8_t *) rd->s_start; d8 = (uint8_t *) rd->d_start; pp = _mm_set1_epi8(btd->prim_poly&0xff); m1 = _mm_set1_epi8((btd->mask1)&0xff); - m2 = _mm_set1_epi8((btd->mask2)&0xff); while (d8 < (uint8_t *) rd->d_top) { va = _mm_load_si128 ((__m128i *)(s8)); - SSE_AB2(pp, m1, m2, va, t1, t2); - SSE_AB2(pp, m1, m2, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); _mm_store_si128((__m128i *)d8, va); d8 += 16; s8 += 16; } -#endif } +#endif +#ifdef INTEL_SSE2 static void gf_w4_bytwo_b_sse_region_4_xor(gf_region_data *rd, struct gf_bytwo_data *btd) { -#ifdef INTEL_SSE2 - int i; - uint8_t *d8, *s8, tb; - __m128i pp, m1, m2, t1, t2, va, vb; + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va, vb; s8 = (uint8_t *) rd->s_start; d8 = (uint8_t *) rd->d_start; pp = _mm_set1_epi8(btd->prim_poly&0xff); m1 = _mm_set1_epi8((btd->mask1)&0xff); - m2 = _mm_set1_epi8((btd->mask2)&0xff); while (d8 < (uint8_t *) rd->d_top) { va = _mm_load_si128 ((__m128i *)(s8)); - SSE_AB2(pp, m1, m2, va, t1, t2); - SSE_AB2(pp, m1, m2, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); vb = _mm_load_si128 ((__m128i *)(d8)); vb = _mm_xor_si128(vb, va); _mm_store_si128((__m128i *)d8, vb); d8 += 16; s8 += 16; } -#endif } +#endif +#ifdef INTEL_SSE2 static void gf_w4_bytwo_b_sse_region_3_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) { -#ifdef INTEL_SSE2 - int i; - uint8_t *d8, *s8, tb; - __m128i pp, m1, m2, t1, t2, va, vb; + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va, vb; s8 = (uint8_t *) rd->s_start; d8 = (uint8_t *) rd->d_start; pp = _mm_set1_epi8(btd->prim_poly&0xff); m1 = _mm_set1_epi8((btd->mask1)&0xff); - m2 = _mm_set1_epi8((btd->mask2)&0xff); while (d8 < (uint8_t *) rd->d_top) { va = _mm_load_si128 ((__m128i *)(s8)); vb = va; - SSE_AB2(pp, m1, m2, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); va = _mm_xor_si128(va, vb); _mm_store_si128((__m128i *)d8, va); d8 += 16; s8 += 16; } -#endif } +#endif +#ifdef INTEL_SSE2 static void gf_w4_bytwo_b_sse_region_3_xor(gf_region_data *rd, struct gf_bytwo_data *btd) { -#ifdef INTEL_SSE2 - int i; - uint8_t *d8, *s8, tb; - __m128i pp, m1, m2, t1, t2, va, vb; + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va, vb; s8 = (uint8_t *) rd->s_start; d8 = (uint8_t *) rd->d_start; pp = _mm_set1_epi8(btd->prim_poly&0xff); m1 = _mm_set1_epi8((btd->mask1)&0xff); - m2 = _mm_set1_epi8((btd->mask2)&0xff); while (d8 < (uint8_t *) rd->d_top) { va = _mm_load_si128 ((__m128i *)(s8)); vb = _mm_xor_si128(_mm_load_si128 ((__m128i *)(d8)), va); - SSE_AB2(pp, m1, m2, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); vb = _mm_xor_si128(vb, va); _mm_store_si128((__m128i *)d8, vb); d8 += 16; s8 += 16; } -#endif } +#endif +#ifdef INTEL_SSE2 static void gf_w4_bytwo_b_sse_region_5_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) { -#ifdef INTEL_SSE2 - int i; - uint8_t *d8, *s8, tb; - __m128i pp, m1, m2, t1, t2, va, vb; + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va, vb; s8 = (uint8_t *) rd->s_start; d8 = (uint8_t *) rd->d_start; pp = _mm_set1_epi8(btd->prim_poly&0xff); m1 = _mm_set1_epi8((btd->mask1)&0xff); - m2 = _mm_set1_epi8((btd->mask2)&0xff); while (d8 < (uint8_t *) rd->d_top) { va = _mm_load_si128 ((__m128i *)(s8)); vb = va; - SSE_AB2(pp, m1, m2, va, t1, t2); - SSE_AB2(pp, m1, m2, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); va = _mm_xor_si128(va, vb); _mm_store_si128((__m128i *)d8, va); d8 += 16; s8 += 16; } -#endif } +#endif +#ifdef INTEL_SSE2 static void gf_w4_bytwo_b_sse_region_5_xor(gf_region_data *rd, struct gf_bytwo_data *btd) { -#ifdef INTEL_SSE2 - int i; - uint8_t *d8, *s8, tb; - __m128i pp, m1, m2, t1, t2, va, vb; + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va, vb; s8 = (uint8_t *) rd->s_start; d8 = (uint8_t *) rd->d_start; pp = _mm_set1_epi8(btd->prim_poly&0xff); m1 = _mm_set1_epi8((btd->mask1)&0xff); - m2 = _mm_set1_epi8((btd->mask2)&0xff); while (d8 < (uint8_t *) rd->d_top) { va = _mm_load_si128 ((__m128i *)(s8)); vb = _mm_xor_si128(_mm_load_si128 ((__m128i *)(d8)), va); - SSE_AB2(pp, m1, m2, va, t1, t2); - SSE_AB2(pp, m1, m2, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); vb = _mm_xor_si128(vb, va); _mm_store_si128((__m128i *)d8, vb); d8 += 16; s8 += 16; } -#endif } +#endif +#ifdef INTEL_SSE2 static void gf_w4_bytwo_b_sse_region_7_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) { -#ifdef INTEL_SSE2 - int i; - uint8_t *d8, *s8, tb; - __m128i pp, m1, m2, t1, t2, va, vb; + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va, vb; s8 = (uint8_t *) rd->s_start; d8 = (uint8_t *) rd->d_start; pp = _mm_set1_epi8(btd->prim_poly&0xff); m1 = _mm_set1_epi8((btd->mask1)&0xff); - m2 = _mm_set1_epi8((btd->mask2)&0xff); while (d8 < (uint8_t *) rd->d_top) { va = _mm_load_si128 ((__m128i *)(s8)); vb = va; - SSE_AB2(pp, m1, m2, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); vb = _mm_xor_si128(va, vb); - SSE_AB2(pp, m1, m2, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); va = _mm_xor_si128(va, vb); _mm_store_si128((__m128i *)d8, va); d8 += 16; s8 += 16; } -#endif } +#endif +#ifdef INTEL_SSE2 static void gf_w4_bytwo_b_sse_region_7_xor(gf_region_data *rd, struct gf_bytwo_data *btd) { -#ifdef INTEL_SSE2 - int i; - uint8_t *d8, *s8, tb; - __m128i pp, m1, m2, t1, t2, va, vb; + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va, vb; s8 = (uint8_t *) rd->s_start; d8 = (uint8_t *) rd->d_start; pp = _mm_set1_epi8(btd->prim_poly&0xff); m1 = _mm_set1_epi8((btd->mask1)&0xff); - m2 = _mm_set1_epi8((btd->mask2)&0xff); while (d8 < (uint8_t *) rd->d_top) { va = _mm_load_si128 ((__m128i *)(s8)); vb = _mm_xor_si128(_mm_load_si128 ((__m128i *)(d8)), va); - SSE_AB2(pp, m1, m2, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); vb = _mm_xor_si128(vb, va); - SSE_AB2(pp, m1, m2, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); vb = _mm_xor_si128(vb, va); _mm_store_si128((__m128i *)d8, vb); d8 += 16; s8 += 16; } -#endif } +#endif +#ifdef INTEL_SSE2 static void gf_w4_bytwo_b_sse_region_6_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) { -#ifdef INTEL_SSE2 - int i; - uint8_t *d8, *s8, tb; - __m128i pp, m1, m2, t1, t2, va, vb; + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va, vb; s8 = (uint8_t *) rd->s_start; d8 = (uint8_t *) rd->d_start; pp = _mm_set1_epi8(btd->prim_poly&0xff); m1 = _mm_set1_epi8((btd->mask1)&0xff); - m2 = _mm_set1_epi8((btd->mask2)&0xff); while (d8 < (uint8_t *) rd->d_top) { va = _mm_load_si128 ((__m128i *)(s8)); - SSE_AB2(pp, m1, m2, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); vb = va; - SSE_AB2(pp, m1, m2, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); va = _mm_xor_si128(va, vb); _mm_store_si128((__m128i *)d8, va); d8 += 16; s8 += 16; } -#endif } +#endif +#ifdef INTEL_SSE2 static void gf_w4_bytwo_b_sse_region_6_xor(gf_region_data *rd, struct gf_bytwo_data *btd) { -#ifdef INTEL_SSE2 - int i; - uint8_t *d8, *s8, tb; - __m128i pp, m1, m2, t1, t2, va, vb; + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va, vb; s8 = (uint8_t *) rd->s_start; d8 = (uint8_t *) rd->d_start; pp = _mm_set1_epi8(btd->prim_poly&0xff); m1 = _mm_set1_epi8((btd->mask1)&0xff); - m2 = _mm_set1_epi8((btd->mask2)&0xff); while (d8 < (uint8_t *) rd->d_top) { va = _mm_load_si128 ((__m128i *)(s8)); - SSE_AB2(pp, m1, m2, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); vb = _mm_xor_si128(_mm_load_si128 ((__m128i *)(d8)), va); - SSE_AB2(pp, m1, m2, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); vb = _mm_xor_si128(vb, va); _mm_store_si128((__m128i *)d8, vb); d8 += 16; s8 += 16; } -#endif } +#endif +#ifdef INTEL_SSE2 static void gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSE2 uint8_t *d8, *s8, tb; __m128i pp, m1, m2, t1, t2, va, vb; struct gf_bytwo_data *btd; @@ -1464,7 +1436,7 @@ gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v if (tb & 1) vb = _mm_xor_si128(vb, va); tb >>= 1; if (tb == 0) break; - SSE_AB2(pp, m1, m2, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); } _mm_store_si128((__m128i *)d8, vb); d8 += 16; @@ -1491,16 +1463,13 @@ gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v } } gf_do_final_region_alignment(&rd); -#endif } +#endif static void gf_w4_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { - unsigned long uls, uld; - int i; - uint8_t *s8, *d8, *top; uint64_t *s64, *d64, t1, t2, ta, tb, prod; struct gf_bytwo_data *btd; gf_region_data rd; @@ -1614,6 +1583,7 @@ gf_w4_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t s64++; } } + break; case 6: if (xor) { while (d64 < (uint64_t *) rd.d_top) { @@ -1636,6 +1606,7 @@ gf_w4_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t s64++; } } + break; case 7: if (xor) { while (d64 < (uint64_t *) rd.d_top) { @@ -1963,10 +1934,6 @@ int gf_w4_bytwo_init(gf_t *gf) static int gf_w4_cfm_init(gf_t *gf) { - gf_internal_t *h; - - h = (gf_internal_t *) gf->scratch; - #if defined(INTEL_SSE4_PCLMUL) gf->multiply.w32 = gf_w4_clm_multiply; return 1; @@ -1986,8 +1953,6 @@ int gf_w4_shift_init(gf_t *gf) int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) { - int region_tbl_size; - int ss; int issse3 = 0; #ifdef INTEL_SSSE3 diff --git a/src/gf_w64.c b/src/gf_w64.c index 73bf164..f04daf0 100644 --- a/src/gf_w64.c +++ b/src/gf_w64.c @@ -87,20 +87,19 @@ xor) } } +#if defined(INTEL_SSE4_PCLMUL) static void gf_w64_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) { - int i, size; gf_val_64_t *s64, *d64, *top; gf_region_data rd; -#if defined(INTEL_SSE4_PCLMUL) __m128i a, b; __m128i result, r1; __m128i prim_poly; - __m128i v, w; + __m128i w; __m128i m1, m2, m3, m4; gf_internal_t * h = gf->scratch; @@ -121,7 +120,6 @@ xor) s64 = (gf_val_64_t *) rd.s_start; d64 = (gf_val_64_t *) rd.d_start; top = (gf_val_64_t *) rd.d_top; - size = bytes/sizeof(gf_val_64_t); if (xor) { while (d64 != top) { @@ -175,19 +173,18 @@ xor) } } gf_do_final_region_alignment(&rd); -#endif } +#endif +#if defined(INTEL_SSE4_PCLMUL) static void gf_w64_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) { - int i, size; gf_val_64_t *s64, *d64, *top; gf_region_data rd; -#if defined(INTEL_SSE4_PCLMUL) __m128i a, b; __m128i result, r1; __m128i prim_poly; @@ -210,7 +207,6 @@ xor) s64 = (gf_val_64_t *) rd.s_start; d64 = (gf_val_64_t *) rd.d_start; top = (gf_val_64_t *) rd.d_top; - size = bytes/sizeof(gf_val_64_t); if (xor) { while (d64 != top) { @@ -263,8 +259,8 @@ xor) } } gf_do_final_region_alignment(&rd); -#endif } +#endif static inline @@ -321,7 +317,7 @@ inline gf_val_64_t gf_w64_shift_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64) { - uint64_t pl, pr, ppl, ppr, i, pp, a, bl, br, one, lbit; + uint64_t pl, pr, ppl, ppr, i, a, bl, br, one, lbit; gf_internal_t *h; h = (gf_internal_t *) gf->scratch; @@ -468,9 +464,7 @@ gf_w64_clm_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int by { #if defined(INTEL_SSE4_PCLMUL) gf_internal_t *h; - int i, j, k; uint8_t *s8, *d8, *dtop; - uint64_t *s64, *d64; gf_region_data rd; __m128i v, b, m, prim_poly, c, fr, w, result; @@ -492,7 +486,6 @@ gf_w64_clm_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int by if (xor) { while (d8 != dtop) { - s64 = (uint64_t *) s8; b = _mm_load_si128((__m128i *) s8); result = _mm_clmulepi64_si128 (b, v, 0); c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0); @@ -521,7 +514,6 @@ gf_w64_clm_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int by } } else { while (d8 < dtop) { - s64 = (uint64_t *) s8; b = _mm_load_si128((__m128i *) s8); result = _mm_clmulepi64_si128 (b, v, 0); c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0); @@ -741,8 +733,6 @@ gf_w64_split_16_64_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint64_ static int gf_w64_shift_init(gf_t *gf) { - gf_internal_t *h; - gf->multiply.w64 = gf_w64_shift_multiply; gf->inverse.w64 = gf_w64_euclid; gf->multiply_region.w64 = gf_w64_multiply_region_from_single; @@ -752,14 +742,14 @@ int gf_w64_shift_init(gf_t *gf) static int gf_w64_cfm_init(gf_t *gf) { - gf_internal_t *h; - - h = (gf_internal_t *) gf->scratch; - gf->inverse.w64 = gf_w64_euclid; gf->multiply_region.w64 = gf_w64_multiply_region_from_single; #if defined(INTEL_SSE4_PCLMUL) + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + if ((0xfffffffe00000000ULL & h->prim_poly) == 0){ gf->multiply.w64 = gf_w64_clm_multiply_2; gf->multiply_region.w64 = gf_w64_clm_multiply_region_from_single_2; @@ -803,7 +793,6 @@ inline gf_val_64_t gf_w64_group_multiply(gf_t *gf, gf_val_64_t a, gf_val_64_t b) { - int i; uint64_t top, bot, mask, tp; int g_s, g_r, lshift, rshift; struct gf_w64_group_data *gd; @@ -854,7 +843,7 @@ static void gf_w64_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) { int i, fzb; - uint64_t a64, smask, rmask, top, bot, tp, one; + uint64_t a64, smask, rmask, top, bot, tp; int lshift, rshift, g_s, g_r; gf_region_data rd; uint64_t *s64, *d64, *dtop; @@ -936,9 +925,8 @@ inline gf_val_64_t gf_w64_group_s_equals_r_multiply(gf_t *gf, gf_val_64_t a, gf_val_64_t b) { - int i; int leftover, rs; - uint64_t p, l, ind, r, a64; + uint64_t p, l, ind, a64; int bits_left; int g_s; @@ -974,9 +962,8 @@ gf_w64_group_s_equals_r_multiply(gf_t *gf, gf_val_64_t a, gf_val_64_t b) static void gf_w64_group_s_equals_r_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) { - int i; int leftover, rs; - uint64_t p, l, ind, r, a64; + uint64_t p, l, ind, a64; int bits_left; int g_s; gf_region_data rd; @@ -1189,7 +1176,7 @@ static void gf_w64_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) { - uint64_t *s64, *d64, t1, t2, ta, prod, amask, pmask, pp; + uint64_t *s64, *d64, ta, prod, amask, pmask, pp; gf_region_data rd; gf_internal_t *h; @@ -1243,7 +1230,7 @@ static void gf_w64_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) { - uint64_t *s64, *d64, t1, t2, ta, tb, prod, amask, bmask, pp; + uint64_t *s64, *d64, ta, tb, prod, bmask, pp; gf_region_data rd; gf_internal_t *h; @@ -1374,14 +1361,13 @@ void gf_w64_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_ #endif } +#ifdef INTEL_SSE2 static void gf_w64_bytwo_b_sse_region_2_xor(gf_region_data *rd) { -#ifdef INTEL_SSE2 - int i; uint64_t one64, amask; - uint8_t *d8, *s8, tb; + uint8_t *d8, *s8; __m128i pp, m1, m2, t1, t2, va, vb; gf_internal_t *h; @@ -1405,17 +1391,16 @@ gf_w64_bytwo_b_sse_region_2_xor(gf_region_data *rd) d8 += 16; s8 += 16; } -#endif } +#endif +#ifdef INTEL_SSE2 static void gf_w64_bytwo_b_sse_region_2_noxor(gf_region_data *rd) { -#ifdef INTEL_SSE2 - int i; uint64_t one64, amask; - uint8_t *d8, *s8, tb; + uint8_t *d8, *s8; __m128i pp, m1, m2, t1, t2, va; gf_internal_t *h; @@ -1437,18 +1422,17 @@ gf_w64_bytwo_b_sse_region_2_noxor(gf_region_data *rd) d8 += 16; s8 += 16; } -#endif } +#endif +#ifdef INTEL_SSE2 static void gf_w64_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) { -#ifdef INTEL_SSE2 uint64_t itb, amask, one64; uint8_t *d8, *s8; __m128i pp, m1, m2, t1, t2, va, vb; - struct gf_w32_bytwo_data *btd; gf_region_data rd; gf_internal_t *h; @@ -1495,8 +1479,8 @@ gf_w64_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t } gf_do_final_region_alignment(&rd); -#endif } +#endif static @@ -1620,17 +1604,13 @@ static void gf_w64_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) { - unsigned long uls, uld; gf_internal_t *h = (gf_internal_t *) gf->scratch; gf_t *base_gf = h->base_gf; - int i=0; uint32_t b0 = val & 0x00000000ffffffff; uint32_t b1 = (val & 0xffffffff00000000) >> 32; uint64_t *s64, *d64; uint64_t *top; uint64_t a0, a1, a1b1; - int num_syms = bytes / 8; - int sym_divisible = bytes % 4; gf_region_data rd; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } @@ -1721,14 +1701,14 @@ int gf_w64_composite_init(gf_t *gf) return 1; } +#ifdef INTEL_SSSE3 static void gf_w64_split_4_64_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor) { -#ifdef INTEL_SSSE3 gf_internal_t *h; - int i, m, j, k, tindex; - uint64_t pp, v, s, *s64, *d64, *top; + int i, j, k; + uint64_t pp, v, *s64, *d64, *top; __m128i si, tables[16][8], p[8], v0, mask1; struct gf_split_4_64_lazy_data *ld; uint8_t btable[16]; @@ -1802,18 +1782,18 @@ gf_w64_split_4_64_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *des } } gf_do_final_region_alignment(&rd); -#endif } +#endif +#ifdef INTEL_SSE4 static void gf_w64_split_4_64_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor) { -#ifdef INTEL_SSE4 gf_internal_t *h; - int i, m, j, k, tindex; - uint64_t pp, v, s, *s64, *d64, *top; - __m128i si, tables[16][8], p[8], st[8], mask1, mask8, mask16, t1, t2; + int i, j, k; + uint64_t pp, v, *s64, *d64, *top; + __m128i si, tables[16][8], p[8], st[8], mask1, mask8, mask16, t1; struct gf_split_4_64_lazy_data *ld; uint8_t btable[16]; gf_region_data rd; @@ -2006,8 +1986,8 @@ gf_w64_split_4_64_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint } gf_do_final_region_alignment(&rd); -#endif } +#endif #define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : (p) << 1); @@ -2141,8 +2121,6 @@ int gf_w64_split_init(gf_t *gf) int gf_w64_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) { - int issse4; - switch(mult_type) { case GF_MULT_SHIFT: @@ -2162,11 +2140,9 @@ int gf_w64_scratch_size(int mult_type, int region_type, int divide_type, int arg * then fall through to split table scratch size code. */ #ifdef INTEL_SSE4 - issse4 = 1; arg1 = 64; arg2 = 4; #else - issse4 = 0; arg1 = 64; arg2 = 8; #endif @@ -2202,7 +2178,7 @@ int gf_w64_scratch_size(int mult_type, int region_type, int divide_type, int arg int gf_w64_init(gf_t *gf) { - gf_internal_t *h, *h_base, *h_base_base, *h_base_base_base; + gf_internal_t *h; int no_default_flag = 0; h = (gf_internal_t *) gf->scratch; diff --git a/src/gf_w8.c b/src/gf_w8.c index 7661aad..a2a8600 100644 --- a/src/gf_w8.c +++ b/src/gf_w8.c @@ -216,7 +216,7 @@ gf_w8_clm_multiply_2 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8) __m128i a, b; __m128i result; __m128i prim_poly; - __m128i v, w; + __m128i w; gf_internal_t * h = gf->scratch; a = _mm_insert_epi32 (_mm_setzero_si128(), a8, 0); @@ -262,7 +262,7 @@ gf_w8_clm_multiply_3 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8) __m128i a, b; __m128i result; __m128i prim_poly; - __m128i v, w; + __m128i w; gf_internal_t * h = gf->scratch; a = _mm_insert_epi32 (_mm_setzero_si128(), a8, 0); @@ -301,7 +301,7 @@ gf_w8_clm_multiply_4 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8) __m128i a, b; __m128i result; __m128i prim_poly; - __m128i v, w; + __m128i w; gf_internal_t * h = gf->scratch; a = _mm_insert_epi32 (_mm_setzero_si128(), a8, 0); @@ -364,6 +364,7 @@ gf_w8_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_32_t v gf_do_final_region_alignment(&rd); } +#if defined(INTEL_SSE4_PCLMUL) static void gf_w8_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int @@ -373,12 +374,10 @@ gf_w8_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_ uint8_t *s8; uint8_t *d8; -#if defined(INTEL_SSE4_PCLMUL) - __m128i a, b; __m128i result; __m128i prim_poly; - __m128i v, w; + __m128i w; gf_internal_t * h = gf->scratch; prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL)); @@ -420,9 +419,10 @@ gf_w8_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_ } } gf_do_final_region_alignment(&rd); -#endif } +#endif +#if defined(INTEL_SSE4_PCLMUL) static void gf_w8_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int @@ -432,12 +432,10 @@ gf_w8_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, gf_val_ uint8_t *s8; uint8_t *d8; -#if defined(INTEL_SSE4_PCLMUL) - __m128i a, b; __m128i result; __m128i prim_poly; - __m128i v, w; + __m128i w; gf_internal_t * h = gf->scratch; prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL)); @@ -483,9 +481,10 @@ gf_w8_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, gf_val_ } } gf_do_final_region_alignment(&rd); -#endif } +#endif +#if defined(INTEL_SSE4_PCLMUL) static void gf_w8_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int @@ -495,12 +494,10 @@ gf_w8_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val_ uint8_t *s8; uint8_t *d8; -#if defined(INTEL_SSE4_PCLMUL) - __m128i a, b; __m128i result; __m128i prim_poly; - __m128i v, w; + __m128i w; gf_internal_t * h = gf->scratch; prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL)); @@ -550,8 +547,8 @@ gf_w8_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val_ } } gf_do_final_region_alignment(&rd); -#endif } +#endif /* ------------------------------------------------------------ IMPLEMENTATION: SHIFT: @@ -588,11 +585,11 @@ gf_w8_shift_multiply (gf_t *gf, uint32_t a8, uint32_t b8) static int gf_w8_cfm_init(gf_t *gf) { +#if defined(INTEL_SSE4_PCLMUL) gf_internal_t *h; h = (gf_internal_t *) gf->scratch; -#if defined(INTEL_SSE4_PCLMUL) if ((0xe0 & h->prim_poly) == 0){ gf->multiply.w32 = gf_w8_clm_multiply_2; gf->multiply_region.w32 = gf_w8_clm_multiply_region_from_single_2; @@ -731,7 +728,7 @@ static gf_w8_log_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) { int i; - uint8_t lv, b, c; + uint8_t lv; uint8_t *s8, *d8; struct gf_w8_logtable_data *ltd; @@ -760,7 +757,7 @@ static gf_w8_logzero_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) { int i; - uint8_t lv, b, c; + uint8_t lv; uint8_t *s8, *d8; struct gf_w8_logzero_table_data *ltd; struct gf_w8_logzero_small_table_data *std; @@ -802,9 +799,9 @@ gf_w8_logzero_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int int gf_w8_log_init(gf_t *gf) { gf_internal_t *h; - struct gf_w8_logtable_data *ltd; - struct gf_w8_logzero_table_data *ztd; - struct gf_w8_logzero_small_table_data *std; + struct gf_w8_logtable_data *ltd = NULL; + struct gf_w8_logzero_table_data *ztd = NULL; + struct gf_w8_logzero_small_table_data *std = NULL; uint8_t *alt; uint8_t *inv; int i, b; @@ -941,6 +938,7 @@ gf_w8_default_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) return (ftd->multtable[a][b]); } +#ifdef INTEL_SSSE3 static gf_val_32_t gf_w8_default_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) @@ -950,6 +948,7 @@ gf_w8_default_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) ftd = (struct gf_w8_default_data *) ((gf_internal_t *) gf->scratch)->private; return (ftd->divtable[a][b]); } +#endif static gf_val_32_t @@ -976,7 +975,7 @@ static gf_w8_double_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { uint16_t *base; - uint32_t b, c, prod, vc, vb; + uint32_t b, c, vc, vb; gf_internal_t *h; struct gf_w8_double_table_data *dtd; struct gf_w8_double_table_lazy_data *ltd; @@ -1033,7 +1032,6 @@ static gf_w8_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { int i; - uint8_t lv, b, c; uint8_t *s8, *d8; struct gf_w8_single_table_data *ftd; @@ -1055,14 +1053,13 @@ gf_w8_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, in } } +#ifdef INTEL_SSSE3 static void gf_w8_split_multiply_region_sse(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSSE3 - uint8_t *s8, *d8, *bh, *bl, *sptr, *dptr, *top; - __m128i tbl, loset, t1, r, va, mth, mtl; - uint64_t altable[4]; + uint8_t *bh, *bl, *sptr, *dptr; + __m128i loset, t1, r, va, mth, mtl; struct gf_w8_half_table_data *htd; gf_region_data rd; @@ -1115,8 +1112,8 @@ gf_w8_split_multiply_region_sse(gf_t *gf, void *src, void *dest, gf_val_32_t val } gf_do_final_region_alignment(&rd); -#endif } +#endif /* ------------------------------------------------------------ @@ -1137,9 +1134,7 @@ static void gf_w8_split_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { - unsigned long uls, uld; int i; - uint8_t lv, b, c; uint8_t *s8, *d8; struct gf_w8_half_table_data *htd; @@ -1167,11 +1162,10 @@ int gf_w8_split_init(gf_t *gf) { gf_internal_t *h; struct gf_w8_half_table_data *htd; - int a, b, pp; + int a, b; h = (gf_internal_t *) gf->scratch; htd = (struct gf_w8_half_table_data *)h->private; - pp = h->prim_poly; bzero(htd->high, sizeof(uint8_t)*GF_FIELD_SIZE*GF_HALF_SIZE); bzero(htd->low, sizeof(uint8_t)*GF_FIELD_SIZE*GF_HALF_SIZE); @@ -1325,13 +1319,13 @@ gf_w8_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_32_t gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32); gf_do_initial_region_alignment(&rd); - sub_reg_size = (rd.d_top - rd.d_start) / 2; + sub_reg_size = ((uint8_t *)rd.d_top - (uint8_t *)rd.d_start) / 2; base_gf->multiply_region.w32(base_gf, rd.s_start, rd.d_start, val0, sub_reg_size, xor); - base_gf->multiply_region.w32(base_gf, rd.s_start+sub_reg_size, rd.d_start, val1, sub_reg_size, 1); - base_gf->multiply_region.w32(base_gf, rd.s_start, rd.d_start+sub_reg_size, val1, sub_reg_size, xor); - base_gf->multiply_region.w32(base_gf, rd.s_start+sub_reg_size, rd.d_start+sub_reg_size, val0, sub_reg_size, 1); - base_gf->multiply_region.w32(base_gf, rd.s_start+sub_reg_size, rd.d_start+sub_reg_size, base_gf->multiply.w32(base_gf, h->prim_poly, val1), sub_reg_size, 1); + base_gf->multiply_region.w32(base_gf, (uint8_t *)rd.s_start+sub_reg_size, rd.d_start, val1, sub_reg_size, 1); + base_gf->multiply_region.w32(base_gf, rd.s_start, (uint8_t *)rd.d_start+sub_reg_size, val1, sub_reg_size, xor); + base_gf->multiply_region.w32(base_gf, (uint8_t *)rd.s_start+sub_reg_size, (uint8_t *)rd.d_start+sub_reg_size, val0, sub_reg_size, 1); + base_gf->multiply_region.w32(base_gf, (uint8_t *)rd.s_start+sub_reg_size, (uint8_t *)rd.d_start+sub_reg_size, base_gf->multiply.w32(base_gf, h->prim_poly, val1), sub_reg_size, 1); gf_do_final_region_alignment(&rd); } @@ -1361,7 +1355,6 @@ gf_val_32_t gf_w8_composite_multiply_inline(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; uint8_t b0 = b & 0x0f; uint8_t b1 = (b & 0xf0) >> 4; uint8_t a0 = a & 0x0f; @@ -1674,15 +1667,14 @@ gf_w8_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t prod = _mm_xor_si128(prod, t1); \ v = _mm_srli_epi64(v, 1); } +#ifdef INTEL_SSE2 static void gf_w8_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSE2 int i; uint8_t *s8, *d8; uint8_t vrev; - uint64_t amask; __m128i pp, m1, m2, ta, prod, t1, t2, tp, one, v; struct gf_w8_bytwo_data *btd; gf_region_data rd; @@ -1727,17 +1719,16 @@ gf_w8_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v s8 += 16; } gf_do_final_region_alignment(&rd); -#endif } +#endif +#ifdef INTEL_SSE2 static void gf_w8_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w8_bytwo_data *btd) { -#ifdef INTEL_SSE2 - int i; - uint8_t *d8, *s8, tb; - __m128i pp, m1, m2, t1, t2, va, vb; + uint8_t *d8, *s8; + __m128i pp, m1, m2, t1, t2, va; s8 = (uint8_t *) rd->s_start; d8 = (uint8_t *) rd->d_start; @@ -1753,16 +1744,15 @@ gf_w8_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w8_bytwo_data *bt d8 += 16; s8 += 16; } -#endif } +#endif +#ifdef INTEL_SSE2 static void gf_w8_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w8_bytwo_data *btd) { -#ifdef INTEL_SSE2 - int i; - uint8_t *d8, *s8, tb; + uint8_t *d8, *s8; __m128i pp, m1, m2, t1, t2, va, vb; s8 = (uint8_t *) rd->s_start; @@ -1781,15 +1771,15 @@ gf_w8_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w8_bytwo_data *btd) d8 += 16; s8 += 16; } -#endif } +#endif +#ifdef INTEL_SSE2 static void gf_w8_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSE2 int itb; uint8_t *d8, *s8; __m128i pp, m1, m2, t1, t2, va, vb; @@ -1837,15 +1827,13 @@ gf_w8_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v } gf_do_final_region_alignment(&rd); -#endif } +#endif static void gf_w8_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { - int i; - uint8_t *s8, *d8, *top; uint64_t *s64, *d64, t1, t2, ta, tb, prod; struct gf_w8_bytwo_data *btd; gf_region_data rd; @@ -1944,6 +1932,7 @@ gf_w8_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t s64++; } } + break; case 6: if (xor) { while (d64 < (uint64_t *) rd.d_top) { @@ -1966,6 +1955,7 @@ gf_w8_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t s64++; } } + break; /* case 7: if (xor) { @@ -2362,7 +2352,7 @@ int gf_w8_scratch_size(int mult_type, int region_type, int divide_type, int arg1 int gf_w8_init(gf_t *gf) { - gf_internal_t *h, *h_base; + gf_internal_t *h; h = (gf_internal_t *) gf->scratch; @@ -2454,11 +2444,9 @@ uint8_t *gf_w8_get_mult_table(gf_t *gf) uint8_t *gf_w8_get_div_table(gf_t *gf) { - gf_internal_t *h; struct gf_w8_default_data *ftd; struct gf_w8_single_table_data *std; - h = (gf_internal_t *) gf->scratch; if (gf->multiply.w32 == gf_w8_default_multiply) { ftd = (struct gf_w8_default_data *) ((gf_internal_t *) gf->scratch)->private; return (uint8_t *) ftd->divtable; diff --git a/src/gf_wgen.c b/src/gf_wgen.c index f5e22e0..06f7993 100644 --- a/src/gf_wgen.c +++ b/src/gf_wgen.c @@ -284,9 +284,8 @@ inline gf_val_32_t gf_wgen_group_s_equals_r_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { - int i; int leftover, rs; - uint32_t p, l, ind, r, a32; + uint32_t p, l, ind, a32; int bits_left; int g_s; int w; @@ -362,7 +361,7 @@ gf_wgen_group_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { int i; int leftover; - uint64_t p, l, r, mask; + uint64_t p, l, r; uint32_t a32, ind; int g_s, g_r; struct gf_wgen_group_data *gd; @@ -496,7 +495,7 @@ int gf_wgen_table_8_init(gf_t *gf) gf_internal_t *h; int w; struct gf_wgen_table_w8_data *std; - uint32_t a, b, p, pp; + uint32_t a, b, p; h = (gf_internal_t *) gf->scratch; w = h->w; @@ -557,7 +556,7 @@ int gf_wgen_table_16_init(gf_t *gf) gf_internal_t *h; int w; struct gf_wgen_table_w16_data *std; - uint32_t a, b, p, pp; + uint32_t a, b, p; h = (gf_internal_t *) gf->scratch; w = h->w; @@ -917,11 +916,11 @@ gf_wgen_cauchy_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int byte for (i = 0; i < h->w; i++) { for (j = 0; j < h->w; j++) { if (val & (1 << j)) { - gf_multby_one(src, dest + j*rs, rs, (written & (1 << j))); + gf_multby_one(src, ((uint8_t *)dest) + j*rs, rs, (written & (1 << j))); written |= (1 << j); } } - src += rs; + src = (uint8_t *)src + rs; val = gf->multiply.w32(gf, val, 2); } } diff --git a/test-driver b/test-driver new file mode 100755 index 0000000..d306056 --- /dev/null +++ b/test-driver @@ -0,0 +1,139 @@ +#! /bin/sh +# test-driver - basic testsuite driver script. + +scriptversion=2013-07-13.22; # UTC + +# Copyright (C) 2011-2013 Free Software Foundation, Inc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +# As a special exception to the GNU General Public License, if you +# distribute this file as part of a program that contains a +# configuration script generated by Autoconf, you may include it under +# the same distribution terms that you use for the rest of that program. + +# This file is maintained in Automake, please report +# bugs to <bug-automake@gnu.org> or send patches to +# <automake-patches@gnu.org>. + +# Make unconditional expansion of undefined variables an error. This +# helps a lot in preventing typo-related bugs. +set -u + +usage_error () +{ + echo "$0: $*" >&2 + print_usage >&2 + exit 2 +} + +print_usage () +{ + cat <<END +Usage: + test-driver --test-name=NAME --log-file=PATH --trs-file=PATH + [--expect-failure={yes|no}] [--color-tests={yes|no}] + [--enable-hard-errors={yes|no}] [--] + TEST-SCRIPT [TEST-SCRIPT-ARGUMENTS] +The '--test-name', '--log-file' and '--trs-file' options are mandatory. +END +} + +test_name= # Used for reporting. +log_file= # Where to save the output of the test script. +trs_file= # Where to save the metadata of the test run. +expect_failure=no +color_tests=no +enable_hard_errors=yes +while test $# -gt 0; do + case $1 in + --help) print_usage; exit $?;; + --version) echo "test-driver $scriptversion"; exit $?;; + --test-name) test_name=$2; shift;; + --log-file) log_file=$2; shift;; + --trs-file) trs_file=$2; shift;; + --color-tests) color_tests=$2; shift;; + --expect-failure) expect_failure=$2; shift;; + --enable-hard-errors) enable_hard_errors=$2; shift;; + --) shift; break;; + -*) usage_error "invalid option: '$1'";; + *) break;; + esac + shift +done + +missing_opts= +test x"$test_name" = x && missing_opts="$missing_opts --test-name" +test x"$log_file" = x && missing_opts="$missing_opts --log-file" +test x"$trs_file" = x && missing_opts="$missing_opts --trs-file" +if test x"$missing_opts" != x; then + usage_error "the following mandatory options are missing:$missing_opts" +fi + +if test $# -eq 0; then + usage_error "missing argument" +fi + +if test $color_tests = yes; then + # Keep this in sync with 'lib/am/check.am:$(am__tty_colors)'. + red='[0;31m' # Red. + grn='[0;32m' # Green. + lgn='[1;32m' # Light green. + blu='[1;34m' # Blue. + mgn='[0;35m' # Magenta. + std='[m' # No color. +else + red= grn= lgn= blu= mgn= std= +fi + +do_exit='rm -f $log_file $trs_file; (exit $st); exit $st' +trap "st=129; $do_exit" 1 +trap "st=130; $do_exit" 2 +trap "st=141; $do_exit" 13 +trap "st=143; $do_exit" 15 + +# Test script is run here. +"$@" >$log_file 2>&1 +estatus=$? +if test $enable_hard_errors = no && test $estatus -eq 99; then + estatus=1 +fi + +case $estatus:$expect_failure in + 0:yes) col=$red res=XPASS recheck=yes gcopy=yes;; + 0:*) col=$grn res=PASS recheck=no gcopy=no;; + 77:*) col=$blu res=SKIP recheck=no gcopy=yes;; + 99:*) col=$mgn res=ERROR recheck=yes gcopy=yes;; + *:yes) col=$lgn res=XFAIL recheck=no gcopy=yes;; + *:*) col=$red res=FAIL recheck=yes gcopy=yes;; +esac + +# Report outcome to console. +echo "${col}${res}${std}: $test_name" + +# Register the test result, and other relevant metadata. +echo ":test-result: $res" > $trs_file +echo ":global-test-result: $res" >> $trs_file +echo ":recheck: $recheck" >> $trs_file +echo ":copy-in-global-log: $gcopy" >> $trs_file + +# Local Variables: +# mode: shell-script +# sh-indentation: 2 +# eval: (add-hook 'write-file-hooks 'time-stamp) +# time-stamp-start: "scriptversion=" +# time-stamp-format: "%:y-%02m-%02d.%02H" +# time-stamp-time-zone: "UTC" +# time-stamp-end: "; # UTC" +# End: diff --git a/test/gf_unit.c b/test/gf_unit.c index cf466fe..deaaced 100644 --- a/test/gf_unit.c +++ b/test/gf_unit.c @@ -70,16 +70,16 @@ int main(int argc, char **argv) { signal(SIGSEGV, SigHandler); - int w, i, verbose, single, region, tested, top; + int w, i, verbose, single, region, top; int s_start, d_start, bytes, xor, alignment_test; gf_t gf, gf_def; time_t t0; gf_internal_t *h; - gf_general_t *a, *b, *c, *d, *ai, *bi; - uint8_t a8, b8, c8, *mult4, *div4, *mult8, *div8; - uint16_t a16, b16, c16, d16, *log16, *alog16; - char as[50], bs[50], cs[50], ds[50], ais[50], bis[50]; - uint32_t mask; + gf_general_t *a, *b, *c, *d; + uint8_t a8, b8, c8, *mult4 = NULL, *mult8 = NULL; + uint16_t a16, b16, c16, *log16 = NULL, *alog16 = NULL; + char as[50], bs[50], cs[50], ds[50]; + uint32_t mask = 0; char *ra, *rb, *rc, *rd, *target; int align; @@ -115,8 +115,6 @@ int main(int argc, char **argv) b = (gf_general_t *) malloc(sizeof(gf_general_t)); c = (gf_general_t *) malloc(sizeof(gf_general_t)); d = (gf_general_t *) malloc(sizeof(gf_general_t)); - ai = (gf_general_t *) malloc(sizeof(gf_general_t)); - bi = (gf_general_t *) malloc(sizeof(gf_general_t)); //15 bytes extra to make sure it's 16byte aligned ra = (char *) malloc(sizeof(char)*REGION_SIZE+15); @@ -145,12 +143,10 @@ int main(int argc, char **argv) problem("No default for this value of w"); if (w == 4) { mult4 = gf_w4_get_mult_table(&gf); - div4 = gf_w4_get_div_table(&gf); } if (w == 8) { mult8 = gf_w8_get_mult_table(&gf); - div8 = gf_w8_get_div_table(&gf); } if (w == 16) { @@ -240,7 +236,6 @@ int main(int argc, char **argv) } } - tested = 0; gf_general_multiply(&gf, a, b, c); /* If w is 4, 8 or 16, then there are inline multiplication/division methods. @@ -285,7 +280,6 @@ int main(int argc, char **argv) /* If this is not composite, then first test against the default: */ if (h->mult_type != GF_MULT_COMPOSITE) { - tested = 1; gf_general_multiply(&gf_def, a, b, d); if (!gf_general_are_equal(c, d, w)) { @@ -306,7 +300,6 @@ int main(int argc, char **argv) if (gf_general_is_zero(a, w) || gf_general_is_zero(b, w) || gf_general_is_one(a, w) || gf_general_is_one(b, w)) { - tested = 1; if (((gf_general_is_zero(a, w) || gf_general_is_zero(b, w)) && !gf_general_is_zero(c, w)) || (gf_general_is_one(a, w) && !gf_general_are_equal(b, c, w)) || (gf_general_is_one(b, w) && !gf_general_are_equal(a, c, w))) { @@ -429,4 +422,5 @@ int main(int argc, char **argv) gf_general_do_region_check(&gf, a, rc+s_start, rd+d_start, target+d_start, bytes, xor); } } + return 0; } diff --git a/tools/Makefile.am b/tools/Makefile.am index 31dffae..9e7c564 100644 --- a/tools/Makefile.am +++ b/tools/Makefile.am @@ -3,6 +3,8 @@ INCLUDES=-I./ -I../include AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC $(INCLUDES) +TESTS=run-tests.sh + bin_PROGRAMS = gf_mult gf_div gf_add gf_time gf_methods gf_poly gf_inline_time gf_mult_SOURCES = gf_mult.c diff --git a/tools/gf_add.c b/tools/gf_add.c index b900e69..28cc12c 100644 --- a/tools/gf_add.c +++ b/tools/gf_add.c @@ -62,7 +62,7 @@ void print_128(uint64_t *v) int main(int argc, char **argv) { - int hex, al, bl, w; + int hex, w; uint32_t a, b, c, top; uint64_t a64, b64, c64; uint64_t a128[2], b128[2], c128[2]; diff --git a/tools/gf_inline_time.c b/tools/gf_inline_time.c index e64f0b3..c81e8a9 100644 --- a/tools/gf_inline_time.c +++ b/tools/gf_inline_time.c @@ -62,8 +62,8 @@ int main(int argc, char **argv) int w, j, i, size, iterations; gf_t gf; double timer, elapsed, dnum, num; - uint8_t *ra, *rb, *mult4, *mult8; - uint16_t *ra16, *rb16, *log16, *alog16; + uint8_t *ra = NULL, *rb = NULL, *mult4, *mult8; + uint16_t *ra16 = NULL, *rb16 = NULL, *log16, *alog16; time_t t0; if (argc != 5) usage(NULL); @@ -164,4 +164,5 @@ int main(int argc, char **argv) printf("Inline mult: %10.6lf s Mops: %10.3lf %10.3lf Mega-ops/s\n", elapsed, dnum/1024.0/1024.0, dnum/1024.0/1024.0/elapsed); } + return 0; } diff --git a/tools/gf_methods.c b/tools/gf_methods.c index 3afb438..921febf 100644 --- a/tools/gf_methods.c +++ b/tools/gf_methods.c @@ -20,8 +20,9 @@ #define BNMULTS (8) static char *BMULTS[BNMULTS] = { "CARRY_FREE", "GROUP48", "TABLE", "LOG", "SPLIT4", "SPLIT8", "SPLIT88", "COMPOSITE" }; -#define NMULTS (16) -static char *MULTS[NMULTS] = { "SHIFT", "CARRY_FREE", "GROUP44", "GROUP48", "BYTWO_p", "BYTWO_b", +//ADAM +#define NMULTS (17) +static char *MULTS[NMULTS] = { "SHIFT", "CARRY_FREE", "CARRY_FREE_GK", "GROUP44", "GROUP48", "BYTWO_p", "BYTWO_b", "TABLE", "LOG", "LOG_ZERO", "LOG_ZERO_EXT", "SPLIT2", "SPLIT4", "SPLIT8", "SPLIT16", "SPLIT88", "COMPOSITE" }; @@ -76,7 +77,7 @@ int main(int argc, char *argv[]) int listing; char *gf_argv[50], *x; gf_t gf; - char divs[200], ks[10], ls[10]; + char ls[10]; char * w_str; if (argc != 4) usage(NULL); diff --git a/tools/gf_poly.c b/tools/gf_poly.c index e19706c..44a24ac 100644 --- a/tools/gf_poly.c +++ b/tools/gf_poly.c @@ -84,7 +84,6 @@ int gcd_one(gf_t *gf, int w, int n, gf_general_t *poly, gf_general_t *prod) { gf_general_t *a, *b, zero, factor, p; int i, j, da, db; - char buf[30]; gf_general_set_zero(&zero, w); @@ -123,7 +122,6 @@ void x_to_q_to_i_minus_x(gf_t *gf, int w, int n, gf_general_t *poly, int logq, i gf_general_t *product; gf_general_t p, zero, factor; int j, k, lq; - char buf[20]; gf_general_set_zero(&zero, w); product = (gf_general_t *) malloc(sizeof(gf_general_t) * n*2); @@ -181,9 +179,9 @@ void x_to_q_to_i_minus_x(gf_t *gf, int w, int n, gf_general_t *poly, int logq, i free(x_to_q); } -main(int argc, char **argv) +int main(int argc, char **argv) { - int w, i, power, n, ap, success, j; + int w, i, power, n, ap, success; gf_t gf; gf_general_t *poly, *prod; char *string, *ptr; diff --git a/tools/gf_time.c b/tools/gf_time.c index 2bd2d04..4becc8d 100644 --- a/tools/gf_time.c +++ b/tools/gf_time.c @@ -119,7 +119,7 @@ int main(int argc, char **argv) if (!create_gf_from_argv(&gf, w, argc, argv, 6)) usage(BM); strcpy(tests, ""); - for (i = 0; i < argv[2][i] != '\0'; i++) { + for (i = 0; argv[2][i] != '\0'; i++) { switch(argv[2][i]) { case 'A': strcat(tests, single_tests); strcat(tests, region_tests); @@ -163,8 +163,8 @@ int main(int argc, char **argv) for (i = 0; i < 3; i++) { test = single_tests[i]; if (strchr(tests, test) != NULL) { - if (tmethods[test] == NULL) { - printf("No %s method.\n", tstrings[test]); + if (tmethods[(int)test] == NULL) { + printf("No %s method.\n", tstrings[(int)test]); } else { elapsed = 0; dnum = 0; @@ -176,7 +176,7 @@ int main(int argc, char **argv) elapsed += timer_split(&timer); } printf("%14s: %10.6lf s Mops: %10.3lf %10.3lf Mega-ops/s\n", - tstrings[test], elapsed, + tstrings[(int)test], elapsed, dnum/1024.0/1024.0, dnum/1024.0/1024.0/elapsed); } } @@ -185,8 +185,8 @@ int main(int argc, char **argv) for (i = 0; i < 4; i++) { test = region_tests[i]; if (strchr(tests, test) != NULL) { - if (tmethods[test] == NULL) { - printf("No %s method.\n", tstrings[test]); + if (tmethods[(int)test] == NULL) { + printf("No %s method.\n", tstrings[(int)test]); } else { elapsed = 0; @@ -204,10 +204,11 @@ int main(int argc, char **argv) elapsed += timer_split(&timer); } printf("%14s: XOR: %d %10.6lf s MB: %10.3lf %10.3lf MB/s\n", - tstrings[test], xor, elapsed, + tstrings[(int)test], xor, elapsed, ds*di/1024.0/1024.0, ds*di/1024.0/1024.0/elapsed); } } } } + return 0; } diff --git a/tools/run-tests.sh b/tools/run-tests.sh new file mode 100755 index 0000000..bd3cc60 --- /dev/null +++ b/tools/run-tests.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +for w in 4 8 16 32 64 128 ; do + ./gf_methods $w -A -U | sh -e + if [ $? != "0" ] ; then + echo "Failed unit tests for w=$w" + break + fi +done |