Add generic perfect string hashes, use for directives

Add a generic facility for generating perfect string hashes, where all that is needed is an enum and a string table. The existing mechanism using a custom Perl script wrapped around a module continues to be available for any use case where this particular approach isn't sophisticated enough. Much of this patch comes from renaming "enum directives" to "enum directive" as a result of the string hash generator expecting a set of uniform naming conventions. Signed-off-by: H. Peter Anvin <hpa@zytor.com>
author: H. Peter Anvin <hpa@zytor.com> 2017-04-03 00:09:58 -0700
committer: H. Peter Anvin <hpa@zytor.com> 2017-04-03 00:27:07 -0700
commit: 5253f58c3679b9b21006567ad26a419904791645 (patch)
tree: 538cd02d8ea06b042e97ecf548366883fb9a31ba /nasmlib
parent: b1a5b26477db4ce4270b8de75c36b8f9acfc3f69 (diff)
download: nasm-5253f58c3679b9b21006567ad26a419904791645.tar.gz
3 files changed, 460 insertions, 0 deletions
diff --git a/nasmlib/badenum.c b/nasmlib/badenum.c
new file mode 100644
index 00000000..6f880c85
--- /dev/null
+++ b/nasmlib/badenum.c
@@ -0,0 +1,43 @@
+/* ----------------------------------------------------------------------- *
+ *
+ *   Copyright 2017 The NASM Authors - All Rights Reserved
+ *   See the file AUTHORS included with the NASM distribution for
+ *   the specific copyright holders.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following
+ *   conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above
+ *     copyright notice, this list of conditions and the following
+ *     disclaimer in the documentation and/or other materials provided
+ *     with the distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+ *     CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ *     INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *     MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *     DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ *     CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *     SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ *     NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ *     HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ *     OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ *     EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ----------------------------------------------------------------------- */
+
+#include "nasmlib.h"
+
+/* Used to avoid returning NULL to a debug printing function */
+const char *invalid_enum_str(int x)
+{
+    static char buf[64];
+
+    snprintf(buf, sizeof buf, "<invalid %d>", x);
+    return buf;
+}
diff --git a/nasmlib/perfhash.c b/nasmlib/perfhash.c
new file mode 100644
index 00000000..5cd6714e
--- /dev/null
+++ b/nasmlib/perfhash.c
@@ -0,0 +1,55 @@
+/* ----------------------------------------------------------------------- *
+ *
+ *   Copyright 2017 The NASM Authors - All Rights Reserved
+ *   See the file AUTHORS included with the NASM distribution for
+ *   the specific copyright holders.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following
+ *   conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above
+ *     copyright notice, this list of conditions and the following
+ *     disclaimer in the documentation and/or other materials provided
+ *     with the distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+ *     CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ *     INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *     MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *     DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ *     CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *     SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ *     NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ *     HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ *     OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ *     EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ----------------------------------------------------------------------- */
+
+#include "perfhash.h"
+#include "hashtbl.h"            /* For crc64i() */
+
+int perfhash_find(const struct perfect_hash *hash, const char *str)
+{
+    uint32_t k1, k2;
+    uint64_t crc;
+    uint16_t ix;
+
+    crc = crc64i(hash->crcinit, str);
+    k1 = (uint32_t)crc & hash->hashmask;
+    k2 = (uint32_t)(crc >> 32) & hash->hashmask;
+
+    ix = hash->hashvals[k1] + hash->hashvals[k2 + hash->hashmask + 1];
+
+    if (ix >= hash->tbllen ||
+        !hash->strings[ix] ||
+        nasm_stricmp(str, hash->strings[ix]))
+        return hash->errval;
+
+    return hash->tbloffs + ix;
+}
diff --git a/nasmlib/perfhash.pl b/nasmlib/perfhash.pl
new file mode 100755
index 00000000..639b347f
--- /dev/null
+++ b/nasmlib/perfhash.pl
@@ -0,0 +1,362 @@
+#!/usr/bin/perl
+## --------------------------------------------------------------------------
+##
+##   Copyright 1996-2017 The NASM Authors - All Rights Reserved
+##   See the file AUTHORS included with the NASM distribution for
+##   the specific copyright holders.
+##
+##   Redistribution and use in source and binary forms, with or without
+##   modification, are permitted provided that the following
+##   conditions are met:
+##
+##   * Redistributions of source code must retain the above copyright
+##     notice, this list of conditions and the following disclaimer.
+##   * Redistributions in binary form must reproduce the above
+##     copyright notice, this list of conditions and the following
+##     disclaimer in the documentation and/or other materials provided
+##     with the distribution.
+##
+##     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+##     CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+##     INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+##     MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+##     DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+##     CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+##     SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+##     NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+##     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+##     HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+##     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+##     OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+##     EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+##
+## --------------------------------------------------------------------------
+
+#
+# Generate a perfect hash for general case-insensitive string-to-enum
+# lookup.  This generates an enum and the corresponding hash, but
+# relies on a common function to parse the hash.
+#
+# Usage:
+#      perfhash.pl h foohash.dat foohash.h (to generate C header)
+#      perfhash.pl c foohash.dat foohash.c (to generate C source)
+#
+
+use strict;
+
+require 'phash.ph';
+
+sub basename($) {
+    my($s) = @_;
+    $s =~ s/^.*[^-[:alnum:]_\.]//;	# Remove path component as best we can
+    return $s;
+}
+
+sub intval($) {
+    my($s) = @_;
+
+    if ($s =~ /^0/) {
+	return oct($s);		# Handles octal or hexadecimal
+    } elsif ($s =~ /^\-(0.*)$/) {
+	return -oct($1);
+    } else {
+	return $s + 0;		# Forcibly convert to number
+    }
+}
+
+my($output, $infile, $outfile) = @ARGV;
+my $me = basename($0);
+
+# The following special things are allowed in the input file:
+# #<space> or ; begins a comment
+# #include filename
+# #name str
+#    The name of the hash
+# #prefix str
+#    Defines the prefix before enum
+# #guard str
+#    Defines the header guard string
+# #special str [= value]
+#    Generate an enum value without a corresponding string; not capitalized.
+# #header str
+#    Indicates the name of the .h file to include from the .c file
+# #errval str
+#    Define the value to be returned if a string is not found
+#    (defaults to -1).  This can be any constant C expression,
+#    including one of the enum values.
+#
+# Regular lines are just str [= value]
+#
+# Enumeration is generated in the order listed in the file, just as in C
+# specifying a value causes the values to increase by 1 from that point on
+# unless specified.
+
+my $name;
+my $prefix;
+my $guard;
+my $hfile;
+
+my %strings = ();
+my %specials = ();
+my $next_value = 0;
+my $errval = '-1';
+
+my @incstack =  ();
+my @filenames = ($infile);
+my @linenums =  (0);
+my $dd = undef;
+my $err = 0;
+
+while (scalar(@filenames)) {
+    if (!defined($dd)) {
+	open($dd, '<', $filenames[-1])
+	    or die "$0: cannot open: $filenames[-1]: $!\n";
+    }
+
+    my $line = <$dd>;
+    if (!defined($line)) {
+	close($dd);
+	$dd = pop @incstack;
+	pop @filenames;
+	pop @linenums;
+	next;
+    }
+
+    $linenums[-1]++;
+
+    chomp $line;
+    $line =~ s/\s*(|\;.*|\#\s.*|\#)$//; # Remove comments and trailing space
+    $line =~ s/^\s+//;			# Remove leading space
+    if ($line eq '') {
+	# Do nothing
+    } elsif ($line =~ /^\#name\s+([[:alnum:]_]+)$/) {
+	$name = $1;
+    } elsif ($line =~ /^\#prefix\s+([[:alnum:]_]+)$/) {
+	$prefix = $1;
+    } elsif ($line =~ /^\#guard\s+([[:alnum:]_]+)$/) {
+	$guard = $1;
+    } elsif ($line =~ /^\#errval\s+(\S.*)$/) {
+	$errval = $1;
+    } elsif ($line =~ /^\#header\s+(\"(.+)\"|\S+)$/) {
+	$hfile = ($2 ne '') ? $2 : $1;
+    } elsif ($line =~ /^\#include\s+(\"(.+)\"|\S+)$/) {
+	push @incstack, $dd;
+	push @filenames, (($2 ne '') ? $2 : $1);
+	push @linenums, 0;
+	undef $dd;		# Open a new file
+    } elsif ($line =~ /^(|\#special\s+)(\S+)\s*(|=\s*(\-?(0[Xx][[:xdigit:]]+|0[0-7]*|[0-9]+)))$/) {
+	$next_value = intval($4) if ($4 ne '');
+	if ($1 eq '') {
+	    $strings{$2} = $next_value++;
+	} else {
+	    $specials{$2} = $next_value++;
+	}
+    } else {
+	printf STDERR "%s:%d:%s syntax error: \"%s\"\n",
+	    $filenames[-1], $linenums[-1],
+	    (scalar(@incstack) == 1) ? '' : "(from $infile)", $line;
+	$err++;
+    }
+}
+
+exit 1 if ($err);
+
+# Default name, prefix, and header guard name
+if (!defined($name)) {
+    $name = basename($infile);
+    $name =~ s/(\..*)$//;		# Strip extension, if any
+}
+if (!defined($prefix)) {
+    $prefix = "\U${name}\E_";
+}
+if (!defined($hfile)) {
+    $hfile = $outfile;
+    $hfile =~ s/\.c$/\.h/;
+}
+if (!defined($guard)) {
+    $guard = basename($hfile);
+    $guard =~ s/[^[:alnum:]_]/_/g;
+    $guard =~ s/__+/_/g;
+    $guard =  "\U$guard";
+}
+
+# Verify input.  We can't have more than one constant with the same
+# enumeration value, nor the same enumeration string.
+if (scalar(keys(%strings)) == 0) {
+    die "$0: $infile: no strings to hash!\n";
+}
+
+my %enums;
+my %enumvals;
+my %stringbyval;
+my $max_enum;
+my $tbllen = 0;
+my $tbloffs;
+foreach my $s (keys(%strings)) {
+    my $es = "${prefix}\U${s}";
+    $es =~ s/[^[:alnum:]_]/_/g;
+    $es =~ s/__+/_/g;
+    my $v = $strings{$s};
+    $stringbyval{$v} = $s;
+    if (defined($enums{$es})) {
+	printf STDERR "%s: string \"%s\" duplicates existing enum %s\n",
+	    $infile, $s, $es;
+	$err++;
+    } else {
+	$enums{$es} = $v;
+    }
+    if (defined($enumvals{$v})) {
+	printf STDERR "%s: string \"%s\" duplicates existing enum constant %d\n", $v;
+	$err++;
+    } else {
+	$enumvals{$v} = $es;
+    }
+    $max_enum = $v if ($v > $max_enum || !defined($max_enum));
+    $tbloffs = $v if ($v < $tbloffs || !defined($tbloffs));
+    $tbllen = $v+1 if ($v >= $tbllen || !defined($tbllen));
+}
+foreach my $s (keys(%specials)) {
+    my $es = $prefix . $s;	# No string mangling here
+    my $v = $specials{$s};
+    if (defined($enums{$es})) {
+	printf STDERR "%s: special \"%s\" duplicates existing enum %s\n",
+	    $infile, $s, $es;
+	$err++;
+    } else {
+	$enums{$es} = $v;
+    }
+    if (defined ($enumvals{$v})) {
+	printf STDERR "%s: special \"%s\" duplicates existing enum constant %d\n", $v;
+	$err++;
+    } else {
+	$enumvals{$v} = $es;
+    }
+    $max_enum = $v if ($v > $max_enum || !defined($max_enum));
+}
+
+$tbllen -= $tbloffs;
+if ($tbllen > 65536) {
+    printf STDERR "%s: span of enumeration values too large\n";
+    $err++;
+}
+
+exit 1 if ($err);
+
+open(F, '>', $outfile)
+    or die "$0: cannot create: ${outfile}: $!\n";
+
+if ($output eq 'h') {
+    print F "/*\n";
+    print F " * This file is generated from $infile\n";
+    print F " * by $me; do not edit.\n";
+    print F " */\n";
+    print F "\n";
+
+    print F "#ifndef $guard\n";
+    print F "#define $guard 1\n\n";
+    print F "#include \"perfhash.h\"\n\n";
+
+    my $c = '{';
+    $next_value = 0;
+    print F "enum ${name} ";
+    foreach my $v (sort { $a <=> $b } keys(%enumvals)) {
+	my $s = $enumvals{$v};
+	print F "$c\n    $s";
+	print F " = $v" if ($v != $next_value);
+	$next_value = $v + 1;
+	$c = ',';
+    }
+    print F "\n};\n\n";
+    print F "extern const struct perfect_hash ${name}_hash;\n";
+    printf F "extern const char * const %s_tbl[%d];\n", $name, $tbllen;
+
+    print F "\nstatic inline enum ${name} ${name}_find(const char *str)\n";
+    print F "{\n";
+    print F "    return perfhash_find(&${name}_hash, str);\n";
+    print F "}\n";
+
+    print F "\nstatic inline const char * ${name}_name(enum ${name} x)\n";
+    print F "{\n";
+    printf F  "    size_t ix = (size_t)x - (%d);\n", $tbloffs;
+    printf F "    if (ix >= %d)\n", $tbllen;
+    print F  "        return NULL;\n";
+    print F  "    return ${name}_tbl[ix];\n";
+    print F  "}\n";
+
+    print F "\nstatic inline const char * ${name}_dname(enum ${name} x)\n";
+    print F "{\n";
+    print F "    const char *y = ${name}_name(x);\n";
+    print F "    return y ? y : invalid_enum_str(x);\n";
+    print F "}\n";
+
+    print F "\n#endif /* $guard */\n";
+} elsif ($output eq 'c') {
+    # The strings we hash must all be lower case, even if the string
+    # table doesn't contain them that way.
+
+    my %lcstrings;
+    foreach my $s (keys(%strings)) {
+	my $ls = "\L$s";
+	if (defined($lcstrings{$ls})) {
+	    printf STDERR "%s: strings \"%s\" and \"%s\" differ only in case\n",
+		$infile, $s, $strings{$lcstrings{$s}};
+	} else {
+	    $lcstrings{$ls} = $strings{$s} - $tbloffs;
+	}
+    }
+
+    my @hashinfo = gen_perfect_hash(\%lcstrings);
+    if (!@hashinfo) {
+	die "$0: no hash found\n";
+    }
+
+    # Paranoia...
+    verify_hash_table(\%lcstrings, \@hashinfo);
+
+    my ($n, $sv, $g) = @hashinfo;
+
+    die if ($n & ($n-1));
+
+    print F "/*\n";
+    print F " * This file is generated from $infile\n";
+    print F " * by $me; do not edit.\n";
+    print F " */\n";
+    print F "\n";
+
+    print F "#include \"$hfile\"\n\n";
+
+    printf F "const char * const %s_tbl[%d] = ", $name, $tbllen;
+    my $c = '{';
+    for (my $i = $tbloffs; $i < $tbloffs+$tbllen; $i++) {
+	printf F "%s\n    %s", $c,
+	    defined($stringbyval{$i}) ? '"'.$stringbyval{$i}.'"' : 'NULL';
+	$c = ',';
+    }
+    print F "\n};\n\n";
+
+    print F "#define UNUSED (65536/3)\n\n";
+
+    printf F "static const int16_t %s_hashvals[%d] = ", $name, $n*2;
+    $c = '{';
+    for (my $i = 0; $i < $n; $i++) {
+	my $h = ${$g}[$i*2+0];
+	print F "$c\n    ", defined($h) ? $h : 'UNUSED';
+	$c = ',';
+    }
+    for (my $i = 0; $i < $n; $i++) {
+	my $h = ${$g}[$i*2+1];
+	print F "$c\n    ", defined($h) ? $h : 'UNUSED';
+	$c = ',';
+    }
+    print F "\n};\n\n";
+
+    print F "const struct perfect_hash ${name}_hash = {\n";
+    printf F "    UINT64_C(0x%08x%08x),\n", $$sv[0], $$sv[1]; # crcinit
+    printf F "    UINT32_C(0x%x),\n", $n-1;		      # hashmask
+    printf F "    UINT32_C(%u),\n", $tbllen;		      # tbllen
+    printf F "    %d,\n", $tbloffs;			      # tbloffs
+    printf F "    (%s),\n", $errval;			      # errval
+    printf F "    ${name}_hashvals,\n";			      # hashvals
+    printf F "    ${name}_tbl\n";			      # strings
+    print F "};\n";
+}
author	H. Peter Anvin <hpa@zytor.com>	2017-04-03 00:09:58 -0700
committer	H. Peter Anvin <hpa@zytor.com>	2017-04-03 00:27:07 -0700
commit	5253f58c3679b9b21006567ad26a419904791645 (patch)
tree	538cd02d8ea06b042e97ecf548366883fb9a31ba /nasmlib
parent	b1a5b26477db4ce4270b8de75c36b8f9acfc3f69 (diff)
download	nasm-5253f58c3679b9b21006567ad26a419904791645.tar.gz