summaryrefslogtreecommitdiff
path: root/regen
diff options
context:
space:
mode:
authorYves Orton <demerphq@gmail.com>2018-04-18 22:04:53 -0600
committerKarl Williamson <khw@cpan.org>2018-04-20 01:11:54 -0600
commit1a4d3814836f7aad858cd4e420abf56006e5b6c4 (patch)
tree4e964a8daadf370a00ce71ef7e62dbc1a55bf6a9 /regen
parent394d2d3f374c001b40cce3e2709c7b75de05f55e (diff)
downloadperl-1a4d3814836f7aad858cd4e420abf56006e5b6c4.tar.gz
Add regen/mph.pl for generating a perfect hash
This is copied unchanged from github, https://github.com/demerphq/uni_prop_parser.git where it is more of a stand-alone program. Later commits will customize it for perl use.
Diffstat (limited to 'regen')
-rw-r--r--regen/mph.pl479
1 files changed, 479 insertions, 0 deletions
diff --git a/regen/mph.pl b/regen/mph.pl
new file mode 100644
index 0000000000..464ef38fb0
--- /dev/null
+++ b/regen/mph.pl
@@ -0,0 +1,479 @@
+package MinimalPerfectHash;
+use strict;
+use warnings;
+use Data::Dumper;
+use Carp;
+use Text::Wrap;
+
+my $DEBUG= 0;
+my $RSHIFT= 8;
+my $FNV_CONST= 16777619;
+
+sub _fnv {
+ my ($key, $seed)= @_;
+
+ my $hash = 0+$seed;
+ foreach my $char (split //, $key) {
+ $hash = $hash ^ ord($char);
+ $hash = ($hash * $FNV_CONST) & 0xFFFFFFFF;
+ }
+ return $hash;
+}
+
+sub build_perfect_hash {
+ my ($hash, $split_pos)= @_;
+
+ my $n= 0+keys %$hash;
+ my $max_h= 0xFFFFFFFF;
+ $max_h -= $max_h % $n; # this just avoids a tiny bit bias
+ my $seed1= unpack("N", "Perl") - 1;
+ my $hash_to_key;
+ my $key_to_hash;
+ my $length_all_keys;
+ my $key_buckets;
+ SEED1:
+ for ($seed1++;1;$seed1++) {
+ my %hash_to_key;
+ my %key_to_hash;
+ my %key_buckets;
+ my %high;
+ $length_all_keys= 0;
+ foreach my $key (sort keys %$hash) {
+ $length_all_keys += length $key;
+ my $h= _fnv($key,$seed1);
+ next SEED1 if $h >= $max_h; # check if this hash would bias, and if so find a new seed
+ next SEED1 if exists $hash_to_key{$h};
+ next SEED1 if $high{$h >> $RSHIFT}++;
+ $hash_to_key{$h}= $key;
+ $key_to_hash{$key}= $h;
+ push @{$key_buckets{$h % $n}}, $key;
+ }
+ $hash_to_key= \%hash_to_key;
+ $key_to_hash= \%key_to_hash;
+ $key_buckets= \%key_buckets;
+ last SEED1;
+ }
+
+ my %token;
+ my @first_level;
+ my @second_level;
+ foreach my $first_idx (sort { @{$key_buckets->{$b}} <=> @{$key_buckets->{$a}} || $a <=> $b } keys %$key_buckets) {
+ my $keys= $key_buckets->{$first_idx};
+ #printf "got %d keys in bucket %d\n", 0+@$keys, $first_idx;
+ my $seed2;
+ SEED2:
+ for ($seed2=1;1;$seed2++) {
+ goto FIND_SEED if $seed2 > 0xFFFF;
+ my @idx= map {
+ ( ( ( $key_to_hash->{$_} >> $RSHIFT ) ^ $seed2 ) & 0xFFFFFFFF ) % $n
+ } @$keys;
+ my %seen;
+ next SEED2 if grep { $second_level[$_] || $seen{$_}++ } @idx;
+ $first_level[$first_idx]= $seed2;
+ @second_level[@idx]= map {
+ my $sp= $split_pos->{$_} // die "no split pos for '$_':$!";
+ my ($prefix,$suffix)= unpack "A${sp}A*", $_;
+
+ +{
+ key => $_,
+ prefix => $prefix,
+ suffix => $suffix,
+ hash => $key_to_hash->{$_},
+ value => $hash->{$_},
+ seed2 => 0,
+ }
+ } @$keys;
+ last;
+ }
+
+ }
+ $second_level[$_]{seed2}= $first_level[$_]||0, $second_level[$_]{idx}= $_ for 0 .. $#second_level;
+
+ return $seed1, \@second_level, $length_all_keys;
+}
+
+sub build_split_words {
+ my ($hash, $preprocess, $blob, $old_res)= @_;
+ my %appended;
+ $blob //= "";
+ if ($preprocess) {
+ my %parts;
+ foreach my $key (sort {length($b) <=> length($a) || $a cmp $b } keys %$hash) {
+ my ($prefix,$suffix);
+ if ($key=~/^([^=]+=)([^=]+)\z/) {
+ ($prefix,$suffix)= ($1, $2);
+ $parts{$suffix}++;
+ #$parts{$prefix}++;
+ } else {
+ $prefix= $key;
+ $parts{$prefix}++;
+ }
+
+ }
+ foreach my $key (sort {length($b) <=> length($a) || $a cmp $b } keys %parts) {
+ $blob .= $key . "\0";
+ }
+ printf "Using preprocessing, initial blob size %d\n", length($blob);
+ } else {
+ printf "No preprocessing, initial blob size %d\n", length($blob);
+ }
+ my %res;
+
+ REDO:
+ %res= ();
+ KEY:
+ foreach my $key (
+ sort {
+ (length($b) <=> length($a)) ||
+ ($a cmp $b)
+ }
+ keys %$hash
+ ) {
+ next if exists $res{$key};
+ if (index($blob,$key) >= 0 ) {
+ my $idx= length($key);
+ if ($DEBUG and $old_res and $old_res->{$key} != $idx) {
+ print "changing: $key => $old_res->{$key} : $idx\n";
+ }
+ $res{$key}= $idx;
+ next KEY;
+ }
+ my $best= length($key);
+ my $append= $key;
+ my $min= 0; #length $key >= 4 ? 4 : 0;
+ my $best_prefix;
+ my $best_suffix;
+ foreach my $idx (reverse $min .. length($key)) {
+ my $prefix= substr($key,0,$idx);
+ my $suffix= substr($key,$idx);
+ my $i1= index($blob,$prefix)>=0;
+ my $i2= index($blob,$suffix)>=0;
+ if ($i1 and $i2) {
+ if ($DEBUG and $old_res and $old_res->{$key} != $idx) {
+ print "changing: $key => $old_res->{$key} : $idx\n";
+ }
+ $res{$key}= $idx;
+ $appended{$prefix}++;
+ $appended{$suffix}++;
+ next KEY;
+ } elsif ($i1) {
+ if (length $suffix <= length $append) {
+ $best= $idx;
+ $append= $suffix;
+ $best_prefix= $prefix;
+ $best_suffix= $suffix;
+ }
+ } elsif ($i2) {
+ if (length $prefix <= length $append) {
+ $best= $idx;
+ $append= $prefix;
+ $best_prefix= $prefix;
+ $best_suffix= $suffix;
+ }
+ }
+ }
+ if ($DEBUG and $old_res and $old_res->{$key} != $best) {
+ print "changing: $key => $old_res->{$key} : $best\n";
+ }
+ #print "$best_prefix|$best_suffix => $best => $append\n";
+ $res{$key}= $best;
+ $blob .= $append;
+ $appended{$best_prefix}++;
+ $appended{$best_suffix}++;
+ }
+ my $b2 = "";
+ foreach my $key (sort { length($b) <=> length($a) || $a cmp $b } keys %appended) {
+ $b2 .= $key unless index($b2,$key)>=0;
+ }
+ if (length($b2)<length($blob)) {
+ printf "Length old blob: %d length new blob: %d, recomputing using new blob\n", length($blob),length($b2);
+ $blob= $b2;
+ %appended=();
+ goto REDO;
+ } else {
+ printf "Length old blob: %d length new blob: %d, keeping old blob\n", length($blob),length($b2);
+ }
+ die sprintf "not same size? %d != %d", 0+keys %res, 0+keys %$hash unless keys %res == keys %$hash;
+ return ($blob,\%res);
+}
+
+
+sub blob_as_code {
+ my ($blob,$blob_name)= @_;
+
+ $blob_name ||= "mph_blob";
+
+ # output the blob as C code.
+ my @code= (sprintf "const unsigned char %s[] =\n",$blob_name);
+ my $blob_len= length $blob;
+ while (length($blob)) {
+ push @code, sprintf qq( "%s"), substr($blob,0,65,"");
+ push @code, length $blob ? "\n" : ";\n";
+ }
+ push @code, "/* $blob_name length: $blob_len */\n";
+ return join "",@code;
+}
+
+sub print_includes {
+ my $ofh= shift;
+ print $ofh "#include <stdio.h>\n";
+ print $ofh "#include <string.h>\n";
+ print $ofh "#include <stdint.h>\n";
+ print $ofh "\n";
+}
+
+sub print_defines {
+ my ($ofh,$defines)= @_;
+
+ my $key_len;
+ foreach my $def (keys %$defines) {
+ $key_len //= length $def;
+ $key_len= length $def if $key_len < length $def;
+ }
+ foreach my $def (sort keys %$defines) {
+ printf $ofh "#define %*s %5d\n", -$key_len, $def, $defines->{$def};
+ }
+ print $ofh "\n";
+}
+
+
+sub build_array_of_struct {
+ my ($second_level,$blob)= @_;
+
+ my %defines;
+ my %tests;
+ my @rows;
+ foreach my $row (@$second_level) {
+ $defines{$row->{value}}= $row->{idx}+1;
+ $tests{$row->{key}}= $defines{$row->{value}};
+ my @u16= (
+ $row->{seed2},
+ index($blob,$row->{prefix}//0),
+ index($blob,$row->{suffix}//0),
+ );
+ $_ > 0xFFFF and die "panic: value exceeds range of uint16_t"
+ for @u16;
+ my @u8= (
+ length($row->{prefix}),
+ length($row->{suffix}),
+ );
+ $_ > 0xFF and die "panic: value exceeds range of uint8_t"
+ for @u8;
+ push @rows, sprintf(" { %5d, %5d, %5d, %3d, %3d, %s }",
+ @u16, @u8, $row->{value} );
+ }
+ return \@rows,\%defines,\%tests;
+}
+
+sub print_algo {
+ my ($ofh, $second_level, $seed1, $length_all_keys, $smart_blob, $rows,
+ $blob_name, $struct_name, $table_name, $match_name, $prefix) = @_;
+
+ $blob_name ||= "mph_blob";
+ $struct_name ||= "mph_struct";
+ $table_name ||= "mph_table";
+ $prefix ||= "MPH";
+
+ if (!ref $ofh) {
+ my $file= $ofh;
+ undef $ofh;
+ open $ofh, ">", $file
+ or die "Failed to open '$file': $!";
+ }
+
+ my $n= 0+@$second_level;
+ my $data_size= 0+@$second_level * 8 + length $smart_blob;
+
+ print $ofh "/*\n";
+ printf $ofh "rows: %s\n", $n;
+ printf $ofh "seed: %s\n", $seed1;
+ printf $ofh "full length of keys: %d\n", $length_all_keys;
+ printf $ofh "blob length: %d\n", length $smart_blob;
+ printf $ofh "ref length: %d\n", 0+@$second_level * 8;
+ printf $ofh "data size: %d (%%%.2f)\n", $data_size, ($data_size / $length_all_keys) * 100;
+ print $ofh "*/\n\n";
+
+ print $ofh blob_as_code($smart_blob, $blob_name);
+ print $ofh <<"EOF_CODE";
+
+struct $struct_name {
+ uint16_t seed2;
+ uint16_t pfx;
+ uint16_t sfx;
+ uint8_t pfx_len;
+ uint8_t sfx_len;
+ ${prefix}_VALt value;
+};
+
+EOF_CODE
+
+ print $ofh "#define ${prefix}_RSHIFT $RSHIFT\n";
+ print $ofh "#define ${prefix}_BUCKETS $n\n\n";
+ printf $ofh "const uint32_t ${prefix}_SEED1 = 0x%08x;\n", $seed1;
+ printf $ofh "const uint32_t ${prefix}_FNV_CONST = 0x%08x;\n\n", $FNV_CONST;
+
+ print $ofh "\n";
+ print $ofh "const struct $struct_name $table_name\[${prefix}_BUCKETS] = {\n", join(",\n", @$rows)."\n};\n\n";
+ print $ofh <<"EOF_CODE";
+${prefix}_VALt $match_name( const unsigned char * const key, const uint16_t key_len ) {
+ const unsigned char * ptr= key;
+ const unsigned char * ptr_end= key + key_len;
+ uint32_t h= ${prefix}_SEED1;
+ uint32_t s;
+ uint32_t n;
+ do {
+ h ^= *ptr;
+ h *= ${prefix}_FNV_CONST;
+ } while ( ++ptr < ptr_end );
+ n= h % ${prefix}_BUCKETS;
+ s = $table_name\[n].seed2;
+ if (s) {
+ h= (h >> ${prefix}_RSHIFT) ^ s;
+ n = h % ${prefix}_BUCKETS;
+ if (
+ ( $table_name\[n].pfx_len + $table_name\[n].sfx_len == key_len ) &&
+ ( memcmp($blob_name + $table_name\[n].pfx, key, $table_name\[n].pfx_len) == 0 ) &&
+ ( !$table_name\[n].sfx_len || memcmp($blob_name + $table_name\[n].sfx,
+ key + $table_name\[n].pfx_len, $table_name\[n].sfx_len) == 0 )
+ ) {
+ return $table_name\[n].value;
+ }
+ }
+ return 0;
+}
+EOF_CODE
+}
+
+sub print_main {
+ my ($ofh,$h_file,$match_name,$prefix)=@_;
+ print $ofh <<"EOF_CODE";
+#define ${prefix}_VALt int16_t
+#include "$h_file"
+
+int main(int argc, char *argv[]){
+ int i;
+ for (i=1; i<argc; i++) {
+ unsigned char *key = (unsigned char *)argv[i];
+ int key_len = strlen(argv[i]);
+ printf("key: %s got: %d\\n", key, $match_name((unsigned char *)key,key_len));
+ }
+ return 0;
+}
+EOF_CODE
+}
+
+# output the test Perl code.
+sub print_tests {
+ my ($file, $tests_hash)= @_;
+ open my $ofh, ">", $file
+ or die "Failed to open '$file' for writing: $!";
+ my $num_tests= 2 + keys %$tests_hash;
+ print $ofh "use strict;\nuse warnings;\nuse Test::More tests => $num_tests;\nmy \@res;";
+ my $bytes= 0;
+ my @tests= sort keys %$tests_hash;
+ print $ofh "\@res=`./mph_test '$tests[0]/should-not-match' 'should-not-match/$tests[0]'`;\n";
+ print $ofh "ok( \$res[0] =~ /got: 0/,'proper prefix does not match');\n";
+ print $ofh "ok( \$res[1] =~ /got: 0/,'proper suffix does not match');\n";
+ while (@tests) {
+ my @batch= splice @tests,0,10;
+ my $batch_args= join " ", map { "'$_'" } @batch;
+ print $ofh "\@res=`./mph_test $batch_args`;\n";
+ foreach my $i (0..$#batch) {
+ my $key= $batch[$i];
+ my $want= $tests_hash->{$key};
+ print $ofh "ok(\$res[$i]=~/got: (\\d+)/ && \$1 == $want, '$key');\n";
+ }
+ }
+ close $ofh;
+}
+
+sub print_test_binary {
+ my ($file,$h_file, $second_level, $seed1, $length_all_keys,
+ $smart_blob, $rows, $defines, $match_name, $prefix)= @_;
+ open my $ofh, ">", $file
+ or die "Failed to open '$file': $!";
+ print_includes($ofh);
+ print_defines($ofh, $defines);
+ print_main($ofh,$h_file,$match_name,$prefix);
+ close $ofh;
+}
+
+sub make_mph_from_hash {
+ my $hash= shift;
+
+ # we do this twice because often we can find longer prefixes on the second pass.
+ my @keys= sort {length($b) <=> length($a) || $a cmp $b } keys %$hash;
+
+ my ($smart_blob, $res_to_split)= build_split_words($hash,0);
+ {
+ my ($smart_blob2, $res_to_split2)= build_split_words($hash,1);
+ if (length($smart_blob) > length($smart_blob2)) {
+ printf "Using preprocess-smart blob, length: %d (vs %d)\n", length $smart_blob2, length $smart_blob;
+ $smart_blob= $smart_blob2;
+ $res_to_split= $res_to_split2;
+ } else {
+ printf "Using greedy-smart blob, length: %d (vs %d)\n", length $smart_blob, length $smart_blob2;
+ }
+ }
+ my ($seed1, $second_level, $length_all_keys)= build_perfect_hash($hash, $res_to_split);
+ my ($rows, $defines, $tests)= build_array_of_struct($second_level, $smart_blob);
+ return ($second_level, $seed1, $length_all_keys, $smart_blob, $rows, $defines, $tests);
+}
+
+sub make_files {
+ my ($hash,$base_name)= @_;
+
+ my $h_name= $base_name . "_algo.h";
+ my $c_name= $base_name . "_test.c";
+ my $p_name= $base_name . "_test.pl";
+ my $blob_name= $base_name . "_blob";
+ my $struct_name= $base_name . "_bucket_info";
+ my $table_name= $base_name . "_table";
+ my $match_name= $base_name . "_match";
+ my $prefix= uc($base_name);
+
+ my ($second_level, $seed1, $length_all_keys,
+ $smart_blob, $rows, $defines, $tests)= make_mph_from_hash( $hash );
+ print_algo( $h_name,
+ $second_level, $seed1, $length_all_keys, $smart_blob, $rows,
+ $blob_name, $struct_name, $table_name, $match_name, $prefix );
+ print_test_binary( $c_name, $h_name, $second_level, $seed1, $length_all_keys,
+ $smart_blob, $rows, $defines,
+ $match_name, $prefix );
+ print_tests( $p_name, $tests );
+}
+
+unless (caller) {
+ my %hash;
+ {
+ no warnings;
+ do "../perl/lib/unicore/Heavy.pl";
+ %hash= %utf8::loose_to_file_of;
+ }
+ if ($ENV{MERGE_KEYS}) {
+ my @keys= keys %hash;
+ foreach my $loose (keys %utf8::loose_property_name_of) {
+ my $to= $utf8::loose_property_name_of{$loose};
+ next if $to eq $loose;
+ foreach my $key (@keys) {
+ my $copy= $key;
+ if ($copy=~s/^\Q$to\E(=|\z)/$loose$1/) {
+ #print "$key => $copy\n";
+ $hash{$copy}= $key;
+ }
+ }
+ }
+ }
+ foreach my $key (keys %hash) {
+ my $munged= uc($key);
+ $munged=~s/\W/__/g;
+ $hash{$key} = $munged;
+ }
+
+ my $name= shift @ARGV;
+ $name ||= "mph";
+ make_files(\%hash,$name);
+}
+
+1;
+__END__