summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--charclass_invlists.h2
-rw-r--r--lib/unicore/uni_keywords.pl2
-rw-r--r--regen/mk_invlists.pl13
-rw-r--r--regen/mph.pl200
-rw-r--r--uni_keywords.h8
5 files changed, 127 insertions, 98 deletions
diff --git a/charclass_invlists.h b/charclass_invlists.h
index aba4582ebf..73447dc032 100644
--- a/charclass_invlists.h
+++ b/charclass_invlists.h
@@ -430755,5 +430755,5 @@ static const U8 WB_table[23][23] = {
* c72bbdeda99714db1c8024d3311da4aef3c0db3b9b9f11455a7cfe10d5e9aba3 lib/unicore/version
* 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl
* 5f8520d3a17ade6317fc0c423f5091470924b1ef425bca0c41ce8e4a9f8460fe regen/mk_PL_charclass.pl
- * f53911aa9a2b154279f8aba208653ed5a83724a117b7964c01f199a026607e83 regen/mk_invlists.pl
+ * 9c08a09afbb28779be92ff658e0d27e654be4570241048689e2ffc20437a3d91 regen/mk_invlists.pl
* ex: set ro: */
diff --git a/lib/unicore/uni_keywords.pl b/lib/unicore/uni_keywords.pl
index b7eb4a3c0f..4360373914 100644
--- a/lib/unicore/uni_keywords.pl
+++ b/lib/unicore/uni_keywords.pl
@@ -1323,5 +1323,5 @@
# c72bbdeda99714db1c8024d3311da4aef3c0db3b9b9f11455a7cfe10d5e9aba3 lib/unicore/version
# 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl
# 5f8520d3a17ade6317fc0c423f5091470924b1ef425bca0c41ce8e4a9f8460fe regen/mk_PL_charclass.pl
-# f53911aa9a2b154279f8aba208653ed5a83724a117b7964c01f199a026607e83 regen/mk_invlists.pl
+# 9c08a09afbb28779be92ff658e0d27e654be4570241048689e2ffc20437a3d91 regen/mk_invlists.pl
# ex: set ro:
diff --git a/regen/mk_invlists.pl b/regen/mk_invlists.pl
index e5259aa118..c2e7535ceb 100644
--- a/regen/mk_invlists.pl
+++ b/regen/mk_invlists.pl
@@ -3367,12 +3367,13 @@ my $keywords_fh = open_new('uni_keywords.h', '>',
print $keywords_fh "\n#if defined(PERL_CORE) || defined(PERL_EXT_RE_BUILD)\n\n";
-my ($second_level, $seed1, $length_all_keys, $smart_blob, $rows)
- = MinimalPerfectHash::make_mph_from_hash(\%keywords);
-print $keywords_fh MinimalPerfectHash::make_algo($second_level, $seed1,
- $length_all_keys, $smart_blob,
- $rows, undef, undef, undef,
- 'match_uniprop' );
+my $mph= MinimalPerfectHash->new(
+ source_hash => \%keywords,
+ match_name => "match_uniprop",
+);
+$mph->make_mph_with_split_keys();
+print $keywords_fh $mph->make_algo();
+
print $keywords_fh "\n#endif /* #if defined(PERL_CORE) || defined(PERL_EXT_RE_BUILD) */\n";
push @sources, 'regen/mph.pl';
diff --git a/regen/mph.pl b/regen/mph.pl
index 3cc5c01e57..29e65b55d9 100644
--- a/regen/mph.pl
+++ b/regen/mph.pl
@@ -13,6 +13,13 @@ use warnings 'FATAL' => 'all';
# -isbc -nolc -otr -kis -ci=4 -se -sot -sct -nsbl -pt=2 -fs \
# -fsb='##!' -fse='##.'
+# Naming conventions
+# * The public API, consisting of methods, uses "normal" sub names with
+# no leading underscore.
+# * Private subs are prefixed with a single underscore.
+# * Private methods are prefixed with two underscores. (There is only
+# one at the time of writing this comment)
+
use constant {
FNV32_PRIME => 16777619,
U8_MAX => 0xFF,
@@ -26,6 +33,34 @@ my $MASK= U32_MAX;
my $MAX_SEED2= U16_MAX; # currently the same, but not necessarily.
my $IS_32BIT= !eval { pack "Q", 1 };
+sub new {
+ my ($class, %self)= @_;
+
+ my $source_hash= $self{source_hash}
+ or die "'source_hash' is a required parameter in $class->new()\n";
+
+ my $length_all_keys= 0;
+ $length_all_keys += length($_) for keys %$source_hash;
+ $self{length_all_keys}= $length_all_keys;
+
+ $self{max_attempts} ||= 16; # pick a number, any number...
+
+ $self{base_name} ||= "mph";
+ my $base_name= $self{base_name};
+
+ $self{prefix} ||= uc($base_name);
+
+ $self{h_file} ||= $base_name . "_algo.h";
+ $self{c_file} ||= $base_name . "_test.c";
+ $self{t_file} ||= $base_name . "_test.pl";
+ $self{blob_name} ||= $base_name . "_blob";
+ $self{struct_name} ||= $base_name . "_struct";
+ $self{table_name} ||= $base_name . "_table";
+ $self{match_name} ||= $base_name . "_match";
+
+ return bless \%self, $class;
+}
+
# The basic idea is that you have a two level structure, and effectively
# hash the key twice.
#
@@ -69,9 +104,10 @@ sub _fnv1a_32 {
}
sub build_perfect_hash {
- my ($source_hash, $max_attempts)= @_;
+ my ($self)= @_;
- $max_attempts ||= 16; # pick a number, any number...
+ my $source_hash= $self->{source_hash};
+ my $max_attempts= $self->{max_attempts};
my $n= 0 + keys %$source_hash;
print "Building a minimal perfect hash from $n keys.\n"
@@ -104,8 +140,11 @@ sub build_perfect_hash {
}
my $second_level=
_build_mph_level2($hash_to_key, $key_to_hash, $key_buckets);
- return $seed1, $second_level
- if $second_level;
+ if ($second_level) {
+ $self->{seed1}= $seed1;
+ $self->{second_level}= $second_level;
+ return $seed1, $second_level;
+ }
}
die sprintf "After %d attempts failed to construct a minimal perfect "
. "hash with %d keys.\nWe are using fnv32(), perhaps this "
@@ -481,8 +520,12 @@ sub _build_split_words_simple {
return ($blob, $res, $length_all_keys);
}
+
sub build_split_words {
- my ($hash, $length_all_keys)= @_;
+ my ($self)= @_;
+
+ my $hash= $self->{source_hash};
+ my $length_all_keys= $self->{length_all_keys};
my ($blob, $split_points)=
_build_split_words_simple($hash, $length_all_keys, 0);
@@ -502,14 +545,16 @@ sub build_split_words {
length $blob, length $blob2
if $DEBUG;
}
+ $self->{blob}= $blob;
+ $self->{split_points}= $split_points;
return $blob, $split_points;
}
sub blob_as_code {
- my ($blob, $blob_name)= @_;
-
- $blob_name ||= "mph_blob";
+ my ($self)= @_;
+ my $blob= $self->{blob};
+ my $blob_name= $self->{blob_name};
# output the blob as C code.
my @code= (sprintf "STATIC const unsigned char %s[] =\n", $blob_name);
@@ -519,11 +564,11 @@ sub blob_as_code {
push @code, length $blob ? "\n" : ";\n";
}
push @code, "/* $blob_name length: $blob_len */\n";
- return join "", @code;
+ return $self->{blob_as_code}= join "", @code;
}
sub print_includes {
- my $ofh= shift;
+ my ($self, $ofh)= @_;
print $ofh "#include <stdio.h>\n";
print $ofh "#include <string.h>\n";
print $ofh "#include <stdint.h>\n";
@@ -531,7 +576,8 @@ sub print_includes {
}
sub print_defines {
- my ($ofh, $defines)= @_;
+ my ($self, $ofh)= @_;
+ my $defines= $self->{defines_hash};
my $key_len;
foreach my $def (keys %$defines) {
@@ -545,7 +591,9 @@ sub print_defines {
}
sub build_array_of_struct {
- my ($second_level, $blob)= @_;
+ my ($self)= @_;
+ my $second_level= $self->{second_level};
+ my $blob= $self->{blob};
my %defines;
my %tests;
@@ -574,20 +622,25 @@ sub build_array_of_struct {
@u16, @u8, $row->{value}, $row->{prefix}, $row->{suffix};
##.
}
+ $self->{rows_array}= \@rows;
+ $self->{defines_hash}= \%defines;
+ $self->{tests_hash}= \%tests;
return \@rows, \%defines, \%tests;
}
sub make_algo {
+ my ($self)= @_;
+
my (
$second_level, $seed1, $length_all_keys, $blob,
- $rows, $blob_name, $struct_name, $table_name,
- $match_name, $prefix
- )= @_;
-
- $blob_name ||= "mph_blob";
- $struct_name ||= "mph_struct";
- $table_name ||= "mph_table";
- $prefix ||= "MPH";
+ $rows_array, $blob_name, $struct_name, $table_name,
+ $match_name, $prefix,
+ )
+ = @{$self}{ qw(
+ second_level seed1 length_all_keys blob
+ rows_array blob_name struct_name table_name
+ match_name prefix
+ ) };
my $n= 0 + @$second_level;
my $data_size= $n * 8 + length $blob;
@@ -604,7 +657,7 @@ sub make_algo {
($data_size / $length_all_keys) * 100;
push @code, "*/\n\n";
- push @code, blob_as_code($blob, $blob_name);
+ push @code, $self->blob_as_code();
push @code, <<"EOF_CODE";
struct $struct_name {
@@ -627,7 +680,7 @@ EOF_CODE
push @code, "/* The comments give the input key for the row it is in */\n";
push @code,
"STATIC const struct $struct_name $table_name\[${prefix}_BUCKETS] = {\n",
- join(",\n", @$rows) . "\n};\n\n";
+ join(",\n", @$rows_array) . "\n};\n\n";
push @code, <<"EOF_CODE";
${prefix}_VALt
$match_name( const unsigned char * const key, const U16 key_len ) {
@@ -659,13 +712,13 @@ $match_name( const unsigned char * const key, const U16 key_len ) {
}
EOF_CODE
- return join "", @code;
+ return $self->{algo_code}= join "", @code;
}
sub __ofh {
- my ($to, $default)= @_;
+ my ($self, $to, $default_key)= @_;
- $to //= $default;
+ $to //= $self->{$default_key};
my $ofh;
if (ref $to) {
@@ -678,24 +731,18 @@ sub __ofh {
return $ofh;
}
-
sub print_algo {
- my (
- $ofh, $second_level, $seed1, $long_blob, $blob,
- $rows, $blob_name, $struct_name, $table_name, $match_name
- )= @_;
+ my ($self, $to)= @_;
- $ofh= __ofh($ofh, "mph_algo.h");
+ my $ofh= $self->__ofh($to, "h_file");
- my $code= make_algo(
- $second_level, $seed1, $long_blob, $blob, $rows,
- $blob_name, $struct_name, $table_name, $match_name
- );
- print $ofh $code;
+ my $code= $self->make_algo();
+ print $to $code;
}
sub print_main {
- my ($ofh, $h_file, $match_name, $prefix)= @_;
+ my ($self, $ofh)= @_;
+ my ($h_file, $match_name, $prefix)= @{$self}{qw(h_file match_name prefix)};
print $ofh <<"EOF_CODE";
#include "$h_file"
@@ -713,9 +760,10 @@ EOF_CODE
# output the test Perl code.
sub print_tests {
- my ($file, $tests_hash)= @_;
+ my ($self, $to)= @_;
+ my $tests_hash= $self->{tests_hash};
- $ofh = __ofh($file, "mph_test.pl");
+ my $ofh= $self->__ofh($to, "t_file");
my $num_tests= 2 + keys %$tests_hash;
print $ofh
@@ -738,75 +786,49 @@ sub print_tests {
"ok(\$res[$i]=~/got: (\\d+)/ && \$1 == $want, '$key');\n";
}
}
- close $ofh;
}
sub print_test_binary {
- my (
- $file, $h_file, $second_level, $seed1,
- $length_all_keys, $blob, $rows, $defines,
- $match_name, $prefix
- )= @_;
+ my ($self, $to)= @_;
- my $ofh= __ofh($file, "mph_test.c");
+ my $ofh= $self->__ofh($to, "c_file");
- print_includes($ofh);
- print_defines($ofh, $defines);
- print_main($ofh, $h_file, $match_name, $prefix);
- close $ofh;
+ $self->print_includes($ofh);
+ $self->print_defines($ofh);
+ $self->print_main($ofh);
}
-sub make_mph_from_hash {
- my $hash= shift;
+sub make_mph_with_split_keys {
+ my ($self)= @_;
- my $length_all_keys= 0;
- $length_all_keys += length($_) for keys %$hash;
+ my $hash= $self->{source_hash};
+ my $length_all_keys= $self->{length_all_keys};
- # we do this twice because often we can find longer prefixes on the second pass.
- my ($blob, $res_to_split)= build_split_words($hash, $length_all_keys);
+ my ($blob, $split_points)= $self->build_split_words();
- my ($seed1, $second_level)= build_perfect_hash($hash, 16);
+ my ($seed1, $second_level)= $self->build_perfect_hash();
# add prefix/suffix data into the bucket info in @$second_level
foreach my $bucket_info (@$second_level) {
my $key= $bucket_info->{key};
- my $sp= $res_to_split->{$key} // die "no split pos for '$key'\n";
+ my $sp= $split_points->{$key} // die "no split_point data for '$key'\n";
my ($prefix, $suffix)= unpack "A${sp}A*", $key;
$bucket_info->{prefix}= $prefix;
$bucket_info->{suffix}= $suffix;
$bucket_info->{value}= $hash->{$key};
}
- my ($rows, $defines, $tests)= build_array_of_struct($second_level, $blob);
- return ($second_level, $seed1, $length_all_keys, $blob, $rows,
- $defines, $tests);
+ my ($rows, $defines, $tests)= $self->build_array_of_struct();
+ return 1;
}
-sub make_files {
- my ($hash, $base_name)= @_;
-
- my $h_name= $base_name . "_algo.h";
- my $c_name= $base_name . "_test.c";
- my $p_name= $base_name . "_test.pl";
- my $blob_name= $base_name . "_blob";
- my $struct_name= $base_name . "_bucket_info";
- my $table_name= $base_name . "_table";
- my $match_name= $base_name . "_match";
- my $prefix= uc($base_name);
-
- my ($second_level, $seed1, $length_all_keys, $blob, $rows, $defines, $tests)
- = make_mph_from_hash($hash);
- print_algo(
- $h_name, $second_level, $seed1, $length_all_keys,
- $blob, $rows, $blob_name, $struct_name,
- $table_name, $match_name, $prefix
- );
- print_test_binary(
- $c_name, $h_name, $second_level, $seed1,
- $length_all_keys, $blob, $rows, $defines,
- $match_name, $prefix
- );
- print_tests($p_name, $tests);
+sub make_files_split_keys {
+ my ($self)= @_;
+
+ $self->make_mph_with_split_keys();
+ $self->print_algo();
+ $self->print_test_binary();
+ $self->print_tests();
}
unless (caller) {
@@ -838,7 +860,11 @@ unless (caller) {
my $name= shift @ARGV;
$name ||= "mph";
- make_files(\%hash, $name);
+ my $obj= __PACKAGE__->new(
+ source_hash => \%hash,
+ base_name => $name
+ );
+ $obj->make_files_split_keys();
}
1;
diff --git a/uni_keywords.h b/uni_keywords.h
index 26e3da034c..6f17dccfe6 100644
--- a/uni_keywords.h
+++ b/uni_keywords.h
@@ -10,6 +10,7 @@
#define MPH_VALt I16
/*
+generator script: regen/mk_invlists.pl
rows: 7420
seed: 1348825710
full length of keys: 104237
@@ -7623,7 +7624,8 @@ STATIC const struct mph_struct mph_table[MPH_BUCKETS] = {
{ 1, 10058, 9758, 3, 4, UNI_NV__3_SLASH_20 } /* nv=3/20 */
};
-MPH_VALt match_uniprop( const unsigned char * const key, const U16 key_len ) {
+MPH_VALt
+match_uniprop( const unsigned char * const key, const U16 key_len ) {
const unsigned char * ptr= key;
const unsigned char * ptr_end= key + key_len;
U32 h= MPH_SEED1;
@@ -7706,6 +7708,6 @@ MPH_VALt match_uniprop( const unsigned char * const key, const U16 key_len ) {
* c72bbdeda99714db1c8024d3311da4aef3c0db3b9b9f11455a7cfe10d5e9aba3 lib/unicore/version
* 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl
* 5f8520d3a17ade6317fc0c423f5091470924b1ef425bca0c41ce8e4a9f8460fe regen/mk_PL_charclass.pl
- * f53911aa9a2b154279f8aba208653ed5a83724a117b7964c01f199a026607e83 regen/mk_invlists.pl
- * fd22f39fc463c10c0b2a3aef4ac32717a000acb209ee133a24da5acfd8a0993a regen/mph.pl
+ * 9c08a09afbb28779be92ff658e0d27e654be4570241048689e2ffc20437a3d91 regen/mk_invlists.pl
+ * 193db308cfcc2343ddde130a314afa74f4632a89635607f5297db8dccb64059b regen/mph.pl
* ex: set ro: */