diff options
-rw-r--r-- | charclass_invlists.h | 2 | ||||
-rw-r--r-- | lib/unicore/uni_keywords.pl | 2 | ||||
-rw-r--r-- | regen/mk_invlists.pl | 13 | ||||
-rw-r--r-- | regen/mph.pl | 200 | ||||
-rw-r--r-- | uni_keywords.h | 8 |
5 files changed, 127 insertions, 98 deletions
diff --git a/charclass_invlists.h b/charclass_invlists.h index aba4582ebf..73447dc032 100644 --- a/charclass_invlists.h +++ b/charclass_invlists.h @@ -430755,5 +430755,5 @@ static const U8 WB_table[23][23] = { * c72bbdeda99714db1c8024d3311da4aef3c0db3b9b9f11455a7cfe10d5e9aba3 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * 5f8520d3a17ade6317fc0c423f5091470924b1ef425bca0c41ce8e4a9f8460fe regen/mk_PL_charclass.pl - * f53911aa9a2b154279f8aba208653ed5a83724a117b7964c01f199a026607e83 regen/mk_invlists.pl + * 9c08a09afbb28779be92ff658e0d27e654be4570241048689e2ffc20437a3d91 regen/mk_invlists.pl * ex: set ro: */ diff --git a/lib/unicore/uni_keywords.pl b/lib/unicore/uni_keywords.pl index b7eb4a3c0f..4360373914 100644 --- a/lib/unicore/uni_keywords.pl +++ b/lib/unicore/uni_keywords.pl @@ -1323,5 +1323,5 @@ # c72bbdeda99714db1c8024d3311da4aef3c0db3b9b9f11455a7cfe10d5e9aba3 lib/unicore/version # 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl # 5f8520d3a17ade6317fc0c423f5091470924b1ef425bca0c41ce8e4a9f8460fe regen/mk_PL_charclass.pl -# f53911aa9a2b154279f8aba208653ed5a83724a117b7964c01f199a026607e83 regen/mk_invlists.pl +# 9c08a09afbb28779be92ff658e0d27e654be4570241048689e2ffc20437a3d91 regen/mk_invlists.pl # ex: set ro: diff --git a/regen/mk_invlists.pl b/regen/mk_invlists.pl index e5259aa118..c2e7535ceb 100644 --- a/regen/mk_invlists.pl +++ b/regen/mk_invlists.pl @@ -3367,12 +3367,13 @@ my $keywords_fh = open_new('uni_keywords.h', '>', print $keywords_fh "\n#if defined(PERL_CORE) || defined(PERL_EXT_RE_BUILD)\n\n"; -my ($second_level, $seed1, $length_all_keys, $smart_blob, $rows) - = MinimalPerfectHash::make_mph_from_hash(\%keywords); -print $keywords_fh MinimalPerfectHash::make_algo($second_level, $seed1, - $length_all_keys, $smart_blob, - $rows, undef, undef, undef, - 'match_uniprop' ); +my $mph= MinimalPerfectHash->new( + source_hash => \%keywords, + match_name => "match_uniprop", +); +$mph->make_mph_with_split_keys(); +print $keywords_fh $mph->make_algo(); + print $keywords_fh "\n#endif /* #if defined(PERL_CORE) || defined(PERL_EXT_RE_BUILD) */\n"; push @sources, 'regen/mph.pl'; diff --git a/regen/mph.pl b/regen/mph.pl index 3cc5c01e57..29e65b55d9 100644 --- a/regen/mph.pl +++ b/regen/mph.pl @@ -13,6 +13,13 @@ use warnings 'FATAL' => 'all'; # -isbc -nolc -otr -kis -ci=4 -se -sot -sct -nsbl -pt=2 -fs \ # -fsb='##!' -fse='##.' +# Naming conventions +# * The public API, consisting of methods, uses "normal" sub names with +# no leading underscore. +# * Private subs are prefixed with a single underscore. +# * Private methods are prefixed with two underscores. (There is only +# one at the time of writing this comment) + use constant { FNV32_PRIME => 16777619, U8_MAX => 0xFF, @@ -26,6 +33,34 @@ my $MASK= U32_MAX; my $MAX_SEED2= U16_MAX; # currently the same, but not necessarily. my $IS_32BIT= !eval { pack "Q", 1 }; +sub new { + my ($class, %self)= @_; + + my $source_hash= $self{source_hash} + or die "'source_hash' is a required parameter in $class->new()\n"; + + my $length_all_keys= 0; + $length_all_keys += length($_) for keys %$source_hash; + $self{length_all_keys}= $length_all_keys; + + $self{max_attempts} ||= 16; # pick a number, any number... + + $self{base_name} ||= "mph"; + my $base_name= $self{base_name}; + + $self{prefix} ||= uc($base_name); + + $self{h_file} ||= $base_name . "_algo.h"; + $self{c_file} ||= $base_name . "_test.c"; + $self{t_file} ||= $base_name . "_test.pl"; + $self{blob_name} ||= $base_name . "_blob"; + $self{struct_name} ||= $base_name . "_struct"; + $self{table_name} ||= $base_name . "_table"; + $self{match_name} ||= $base_name . "_match"; + + return bless \%self, $class; +} + # The basic idea is that you have a two level structure, and effectively # hash the key twice. # @@ -69,9 +104,10 @@ sub _fnv1a_32 { } sub build_perfect_hash { - my ($source_hash, $max_attempts)= @_; + my ($self)= @_; - $max_attempts ||= 16; # pick a number, any number... + my $source_hash= $self->{source_hash}; + my $max_attempts= $self->{max_attempts}; my $n= 0 + keys %$source_hash; print "Building a minimal perfect hash from $n keys.\n" @@ -104,8 +140,11 @@ sub build_perfect_hash { } my $second_level= _build_mph_level2($hash_to_key, $key_to_hash, $key_buckets); - return $seed1, $second_level - if $second_level; + if ($second_level) { + $self->{seed1}= $seed1; + $self->{second_level}= $second_level; + return $seed1, $second_level; + } } die sprintf "After %d attempts failed to construct a minimal perfect " . "hash with %d keys.\nWe are using fnv32(), perhaps this " @@ -481,8 +520,12 @@ sub _build_split_words_simple { return ($blob, $res, $length_all_keys); } + sub build_split_words { - my ($hash, $length_all_keys)= @_; + my ($self)= @_; + + my $hash= $self->{source_hash}; + my $length_all_keys= $self->{length_all_keys}; my ($blob, $split_points)= _build_split_words_simple($hash, $length_all_keys, 0); @@ -502,14 +545,16 @@ sub build_split_words { length $blob, length $blob2 if $DEBUG; } + $self->{blob}= $blob; + $self->{split_points}= $split_points; return $blob, $split_points; } sub blob_as_code { - my ($blob, $blob_name)= @_; - - $blob_name ||= "mph_blob"; + my ($self)= @_; + my $blob= $self->{blob}; + my $blob_name= $self->{blob_name}; # output the blob as C code. my @code= (sprintf "STATIC const unsigned char %s[] =\n", $blob_name); @@ -519,11 +564,11 @@ sub blob_as_code { push @code, length $blob ? "\n" : ";\n"; } push @code, "/* $blob_name length: $blob_len */\n"; - return join "", @code; + return $self->{blob_as_code}= join "", @code; } sub print_includes { - my $ofh= shift; + my ($self, $ofh)= @_; print $ofh "#include <stdio.h>\n"; print $ofh "#include <string.h>\n"; print $ofh "#include <stdint.h>\n"; @@ -531,7 +576,8 @@ sub print_includes { } sub print_defines { - my ($ofh, $defines)= @_; + my ($self, $ofh)= @_; + my $defines= $self->{defines_hash}; my $key_len; foreach my $def (keys %$defines) { @@ -545,7 +591,9 @@ sub print_defines { } sub build_array_of_struct { - my ($second_level, $blob)= @_; + my ($self)= @_; + my $second_level= $self->{second_level}; + my $blob= $self->{blob}; my %defines; my %tests; @@ -574,20 +622,25 @@ sub build_array_of_struct { @u16, @u8, $row->{value}, $row->{prefix}, $row->{suffix}; ##. } + $self->{rows_array}= \@rows; + $self->{defines_hash}= \%defines; + $self->{tests_hash}= \%tests; return \@rows, \%defines, \%tests; } sub make_algo { + my ($self)= @_; + my ( $second_level, $seed1, $length_all_keys, $blob, - $rows, $blob_name, $struct_name, $table_name, - $match_name, $prefix - )= @_; - - $blob_name ||= "mph_blob"; - $struct_name ||= "mph_struct"; - $table_name ||= "mph_table"; - $prefix ||= "MPH"; + $rows_array, $blob_name, $struct_name, $table_name, + $match_name, $prefix, + ) + = @{$self}{ qw( + second_level seed1 length_all_keys blob + rows_array blob_name struct_name table_name + match_name prefix + ) }; my $n= 0 + @$second_level; my $data_size= $n * 8 + length $blob; @@ -604,7 +657,7 @@ sub make_algo { ($data_size / $length_all_keys) * 100; push @code, "*/\n\n"; - push @code, blob_as_code($blob, $blob_name); + push @code, $self->blob_as_code(); push @code, <<"EOF_CODE"; struct $struct_name { @@ -627,7 +680,7 @@ EOF_CODE push @code, "/* The comments give the input key for the row it is in */\n"; push @code, "STATIC const struct $struct_name $table_name\[${prefix}_BUCKETS] = {\n", - join(",\n", @$rows) . "\n};\n\n"; + join(",\n", @$rows_array) . "\n};\n\n"; push @code, <<"EOF_CODE"; ${prefix}_VALt $match_name( const unsigned char * const key, const U16 key_len ) { @@ -659,13 +712,13 @@ $match_name( const unsigned char * const key, const U16 key_len ) { } EOF_CODE - return join "", @code; + return $self->{algo_code}= join "", @code; } sub __ofh { - my ($to, $default)= @_; + my ($self, $to, $default_key)= @_; - $to //= $default; + $to //= $self->{$default_key}; my $ofh; if (ref $to) { @@ -678,24 +731,18 @@ sub __ofh { return $ofh; } - sub print_algo { - my ( - $ofh, $second_level, $seed1, $long_blob, $blob, - $rows, $blob_name, $struct_name, $table_name, $match_name - )= @_; + my ($self, $to)= @_; - $ofh= __ofh($ofh, "mph_algo.h"); + my $ofh= $self->__ofh($to, "h_file"); - my $code= make_algo( - $second_level, $seed1, $long_blob, $blob, $rows, - $blob_name, $struct_name, $table_name, $match_name - ); - print $ofh $code; + my $code= $self->make_algo(); + print $to $code; } sub print_main { - my ($ofh, $h_file, $match_name, $prefix)= @_; + my ($self, $ofh)= @_; + my ($h_file, $match_name, $prefix)= @{$self}{qw(h_file match_name prefix)}; print $ofh <<"EOF_CODE"; #include "$h_file" @@ -713,9 +760,10 @@ EOF_CODE # output the test Perl code. sub print_tests { - my ($file, $tests_hash)= @_; + my ($self, $to)= @_; + my $tests_hash= $self->{tests_hash}; - $ofh = __ofh($file, "mph_test.pl"); + my $ofh= $self->__ofh($to, "t_file"); my $num_tests= 2 + keys %$tests_hash; print $ofh @@ -738,75 +786,49 @@ sub print_tests { "ok(\$res[$i]=~/got: (\\d+)/ && \$1 == $want, '$key');\n"; } } - close $ofh; } sub print_test_binary { - my ( - $file, $h_file, $second_level, $seed1, - $length_all_keys, $blob, $rows, $defines, - $match_name, $prefix - )= @_; + my ($self, $to)= @_; - my $ofh= __ofh($file, "mph_test.c"); + my $ofh= $self->__ofh($to, "c_file"); - print_includes($ofh); - print_defines($ofh, $defines); - print_main($ofh, $h_file, $match_name, $prefix); - close $ofh; + $self->print_includes($ofh); + $self->print_defines($ofh); + $self->print_main($ofh); } -sub make_mph_from_hash { - my $hash= shift; +sub make_mph_with_split_keys { + my ($self)= @_; - my $length_all_keys= 0; - $length_all_keys += length($_) for keys %$hash; + my $hash= $self->{source_hash}; + my $length_all_keys= $self->{length_all_keys}; - # we do this twice because often we can find longer prefixes on the second pass. - my ($blob, $res_to_split)= build_split_words($hash, $length_all_keys); + my ($blob, $split_points)= $self->build_split_words(); - my ($seed1, $second_level)= build_perfect_hash($hash, 16); + my ($seed1, $second_level)= $self->build_perfect_hash(); # add prefix/suffix data into the bucket info in @$second_level foreach my $bucket_info (@$second_level) { my $key= $bucket_info->{key}; - my $sp= $res_to_split->{$key} // die "no split pos for '$key'\n"; + my $sp= $split_points->{$key} // die "no split_point data for '$key'\n"; my ($prefix, $suffix)= unpack "A${sp}A*", $key; $bucket_info->{prefix}= $prefix; $bucket_info->{suffix}= $suffix; $bucket_info->{value}= $hash->{$key}; } - my ($rows, $defines, $tests)= build_array_of_struct($second_level, $blob); - return ($second_level, $seed1, $length_all_keys, $blob, $rows, - $defines, $tests); + my ($rows, $defines, $tests)= $self->build_array_of_struct(); + return 1; } -sub make_files { - my ($hash, $base_name)= @_; - - my $h_name= $base_name . "_algo.h"; - my $c_name= $base_name . "_test.c"; - my $p_name= $base_name . "_test.pl"; - my $blob_name= $base_name . "_blob"; - my $struct_name= $base_name . "_bucket_info"; - my $table_name= $base_name . "_table"; - my $match_name= $base_name . "_match"; - my $prefix= uc($base_name); - - my ($second_level, $seed1, $length_all_keys, $blob, $rows, $defines, $tests) - = make_mph_from_hash($hash); - print_algo( - $h_name, $second_level, $seed1, $length_all_keys, - $blob, $rows, $blob_name, $struct_name, - $table_name, $match_name, $prefix - ); - print_test_binary( - $c_name, $h_name, $second_level, $seed1, - $length_all_keys, $blob, $rows, $defines, - $match_name, $prefix - ); - print_tests($p_name, $tests); +sub make_files_split_keys { + my ($self)= @_; + + $self->make_mph_with_split_keys(); + $self->print_algo(); + $self->print_test_binary(); + $self->print_tests(); } unless (caller) { @@ -838,7 +860,11 @@ unless (caller) { my $name= shift @ARGV; $name ||= "mph"; - make_files(\%hash, $name); + my $obj= __PACKAGE__->new( + source_hash => \%hash, + base_name => $name + ); + $obj->make_files_split_keys(); } 1; diff --git a/uni_keywords.h b/uni_keywords.h index 26e3da034c..6f17dccfe6 100644 --- a/uni_keywords.h +++ b/uni_keywords.h @@ -10,6 +10,7 @@ #define MPH_VALt I16 /* +generator script: regen/mk_invlists.pl rows: 7420 seed: 1348825710 full length of keys: 104237 @@ -7623,7 +7624,8 @@ STATIC const struct mph_struct mph_table[MPH_BUCKETS] = { { 1, 10058, 9758, 3, 4, UNI_NV__3_SLASH_20 } /* nv=3/20 */ }; -MPH_VALt match_uniprop( const unsigned char * const key, const U16 key_len ) { +MPH_VALt +match_uniprop( const unsigned char * const key, const U16 key_len ) { const unsigned char * ptr= key; const unsigned char * ptr_end= key + key_len; U32 h= MPH_SEED1; @@ -7706,6 +7708,6 @@ MPH_VALt match_uniprop( const unsigned char * const key, const U16 key_len ) { * c72bbdeda99714db1c8024d3311da4aef3c0db3b9b9f11455a7cfe10d5e9aba3 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * 5f8520d3a17ade6317fc0c423f5091470924b1ef425bca0c41ce8e4a9f8460fe regen/mk_PL_charclass.pl - * f53911aa9a2b154279f8aba208653ed5a83724a117b7964c01f199a026607e83 regen/mk_invlists.pl - * fd22f39fc463c10c0b2a3aef4ac32717a000acb209ee133a24da5acfd8a0993a regen/mph.pl + * 9c08a09afbb28779be92ff658e0d27e654be4570241048689e2ffc20437a3d91 regen/mk_invlists.pl + * 193db308cfcc2343ddde130a314afa74f4632a89635607f5297db8dccb64059b regen/mph.pl * ex: set ro: */ |