summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorYves Orton <demerphq@gmail.com>2022-04-17 14:46:44 +0200
committerKarl Williamson <khw@cpan.org>2022-04-19 05:41:19 -0600
commit44a605b000708fc84ba34c075bc6ba3bb6a3d36d (patch)
treee1d11f64626eaf17d85217cdd52eb3035a52a6d1
parent19ad8281cb889184f365a02bd4c1193d351cc418 (diff)
downloadperl-44a605b000708fc84ba34c075bc6ba3bb6a3d36d.tar.gz
regen/mph.pl & mk_invlists.pl - convert from sub interfaces to OO interfaces
The old sub based API was passing around an awkward number of arguments and it was becoming difficult to enhance in certain ways. This patch changes all the "user servicable" functions into methods, and moves the configuration defaults into the constructor. Note, not all the functions have been converted, the core routines with simple interfaces have not been changed. This is OO for the purpose of encapsulation not inheritance or overloading.
-rw-r--r--charclass_invlists.h2
-rw-r--r--lib/unicore/uni_keywords.pl2
-rw-r--r--regen/mk_invlists.pl13
-rw-r--r--regen/mph.pl200
-rw-r--r--uni_keywords.h8
5 files changed, 127 insertions, 98 deletions
diff --git a/charclass_invlists.h b/charclass_invlists.h
index aba4582ebf..73447dc032 100644
--- a/charclass_invlists.h
+++ b/charclass_invlists.h
@@ -430755,5 +430755,5 @@ static const U8 WB_table[23][23] = {
* c72bbdeda99714db1c8024d3311da4aef3c0db3b9b9f11455a7cfe10d5e9aba3 lib/unicore/version
* 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl
* 5f8520d3a17ade6317fc0c423f5091470924b1ef425bca0c41ce8e4a9f8460fe regen/mk_PL_charclass.pl
- * f53911aa9a2b154279f8aba208653ed5a83724a117b7964c01f199a026607e83 regen/mk_invlists.pl
+ * 9c08a09afbb28779be92ff658e0d27e654be4570241048689e2ffc20437a3d91 regen/mk_invlists.pl
* ex: set ro: */
diff --git a/lib/unicore/uni_keywords.pl b/lib/unicore/uni_keywords.pl
index b7eb4a3c0f..4360373914 100644
--- a/lib/unicore/uni_keywords.pl
+++ b/lib/unicore/uni_keywords.pl
@@ -1323,5 +1323,5 @@
# c72bbdeda99714db1c8024d3311da4aef3c0db3b9b9f11455a7cfe10d5e9aba3 lib/unicore/version
# 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl
# 5f8520d3a17ade6317fc0c423f5091470924b1ef425bca0c41ce8e4a9f8460fe regen/mk_PL_charclass.pl
-# f53911aa9a2b154279f8aba208653ed5a83724a117b7964c01f199a026607e83 regen/mk_invlists.pl
+# 9c08a09afbb28779be92ff658e0d27e654be4570241048689e2ffc20437a3d91 regen/mk_invlists.pl
# ex: set ro:
diff --git a/regen/mk_invlists.pl b/regen/mk_invlists.pl
index e5259aa118..c2e7535ceb 100644
--- a/regen/mk_invlists.pl
+++ b/regen/mk_invlists.pl
@@ -3367,12 +3367,13 @@ my $keywords_fh = open_new('uni_keywords.h', '>',
print $keywords_fh "\n#if defined(PERL_CORE) || defined(PERL_EXT_RE_BUILD)\n\n";
-my ($second_level, $seed1, $length_all_keys, $smart_blob, $rows)
- = MinimalPerfectHash::make_mph_from_hash(\%keywords);
-print $keywords_fh MinimalPerfectHash::make_algo($second_level, $seed1,
- $length_all_keys, $smart_blob,
- $rows, undef, undef, undef,
- 'match_uniprop' );
+my $mph= MinimalPerfectHash->new(
+ source_hash => \%keywords,
+ match_name => "match_uniprop",
+);
+$mph->make_mph_with_split_keys();
+print $keywords_fh $mph->make_algo();
+
print $keywords_fh "\n#endif /* #if defined(PERL_CORE) || defined(PERL_EXT_RE_BUILD) */\n";
push @sources, 'regen/mph.pl';
diff --git a/regen/mph.pl b/regen/mph.pl
index 3cc5c01e57..29e65b55d9 100644
--- a/regen/mph.pl
+++ b/regen/mph.pl
@@ -13,6 +13,13 @@ use warnings 'FATAL' => 'all';
# -isbc -nolc -otr -kis -ci=4 -se -sot -sct -nsbl -pt=2 -fs \
# -fsb='##!' -fse='##.'
+# Naming conventions
+# * The public API, consisting of methods, uses "normal" sub names with
+# no leading underscore.
+# * Private subs are prefixed with a single underscore.
+# * Private methods are prefixed with two underscores. (There is only
+# one at the time of writing this comment)
+
use constant {
FNV32_PRIME => 16777619,
U8_MAX => 0xFF,
@@ -26,6 +33,34 @@ my $MASK= U32_MAX;
my $MAX_SEED2= U16_MAX; # currently the same, but not necessarily.
my $IS_32BIT= !eval { pack "Q", 1 };
+sub new {
+ my ($class, %self)= @_;
+
+ my $source_hash= $self{source_hash}
+ or die "'source_hash' is a required parameter in $class->new()\n";
+
+ my $length_all_keys= 0;
+ $length_all_keys += length($_) for keys %$source_hash;
+ $self{length_all_keys}= $length_all_keys;
+
+ $self{max_attempts} ||= 16; # pick a number, any number...
+
+ $self{base_name} ||= "mph";
+ my $base_name= $self{base_name};
+
+ $self{prefix} ||= uc($base_name);
+
+ $self{h_file} ||= $base_name . "_algo.h";
+ $self{c_file} ||= $base_name . "_test.c";
+ $self{t_file} ||= $base_name . "_test.pl";
+ $self{blob_name} ||= $base_name . "_blob";
+ $self{struct_name} ||= $base_name . "_struct";
+ $self{table_name} ||= $base_name . "_table";
+ $self{match_name} ||= $base_name . "_match";
+
+ return bless \%self, $class;
+}
+
# The basic idea is that you have a two level structure, and effectively
# hash the key twice.
#
@@ -69,9 +104,10 @@ sub _fnv1a_32 {
}
sub build_perfect_hash {
- my ($source_hash, $max_attempts)= @_;
+ my ($self)= @_;
- $max_attempts ||= 16; # pick a number, any number...
+ my $source_hash= $self->{source_hash};
+ my $max_attempts= $self->{max_attempts};
my $n= 0 + keys %$source_hash;
print "Building a minimal perfect hash from $n keys.\n"
@@ -104,8 +140,11 @@ sub build_perfect_hash {
}
my $second_level=
_build_mph_level2($hash_to_key, $key_to_hash, $key_buckets);
- return $seed1, $second_level
- if $second_level;
+ if ($second_level) {
+ $self->{seed1}= $seed1;
+ $self->{second_level}= $second_level;
+ return $seed1, $second_level;
+ }
}
die sprintf "After %d attempts failed to construct a minimal perfect "
. "hash with %d keys.\nWe are using fnv32(), perhaps this "
@@ -481,8 +520,12 @@ sub _build_split_words_simple {
return ($blob, $res, $length_all_keys);
}
+
sub build_split_words {
- my ($hash, $length_all_keys)= @_;
+ my ($self)= @_;
+
+ my $hash= $self->{source_hash};
+ my $length_all_keys= $self->{length_all_keys};
my ($blob, $split_points)=
_build_split_words_simple($hash, $length_all_keys, 0);
@@ -502,14 +545,16 @@ sub build_split_words {
length $blob, length $blob2
if $DEBUG;
}
+ $self->{blob}= $blob;
+ $self->{split_points}= $split_points;
return $blob, $split_points;
}
sub blob_as_code {
- my ($blob, $blob_name)= @_;
-
- $blob_name ||= "mph_blob";
+ my ($self)= @_;
+ my $blob= $self->{blob};
+ my $blob_name= $self->{blob_name};
# output the blob as C code.
my @code= (sprintf "STATIC const unsigned char %s[] =\n", $blob_name);
@@ -519,11 +564,11 @@ sub blob_as_code {
push @code, length $blob ? "\n" : ";\n";
}
push @code, "/* $blob_name length: $blob_len */\n";
- return join "", @code;
+ return $self->{blob_as_code}= join "", @code;
}
sub print_includes {
- my $ofh= shift;
+ my ($self, $ofh)= @_;
print $ofh "#include <stdio.h>\n";
print $ofh "#include <string.h>\n";
print $ofh "#include <stdint.h>\n";
@@ -531,7 +576,8 @@ sub print_includes {
}
sub print_defines {
- my ($ofh, $defines)= @_;
+ my ($self, $ofh)= @_;
+ my $defines= $self->{defines_hash};
my $key_len;
foreach my $def (keys %$defines) {
@@ -545,7 +591,9 @@ sub print_defines {
}
sub build_array_of_struct {
- my ($second_level, $blob)= @_;
+ my ($self)= @_;
+ my $second_level= $self->{second_level};
+ my $blob= $self->{blob};
my %defines;
my %tests;
@@ -574,20 +622,25 @@ sub build_array_of_struct {
@u16, @u8, $row->{value}, $row->{prefix}, $row->{suffix};
##.
}
+ $self->{rows_array}= \@rows;
+ $self->{defines_hash}= \%defines;
+ $self->{tests_hash}= \%tests;
return \@rows, \%defines, \%tests;
}
sub make_algo {
+ my ($self)= @_;
+
my (
$second_level, $seed1, $length_all_keys, $blob,
- $rows, $blob_name, $struct_name, $table_name,
- $match_name, $prefix
- )= @_;
-
- $blob_name ||= "mph_blob";
- $struct_name ||= "mph_struct";
- $table_name ||= "mph_table";
- $prefix ||= "MPH";
+ $rows_array, $blob_name, $struct_name, $table_name,
+ $match_name, $prefix,
+ )
+ = @{$self}{ qw(
+ second_level seed1 length_all_keys blob
+ rows_array blob_name struct_name table_name
+ match_name prefix
+ ) };
my $n= 0 + @$second_level;
my $data_size= $n * 8 + length $blob;
@@ -604,7 +657,7 @@ sub make_algo {
($data_size / $length_all_keys) * 100;
push @code, "*/\n\n";
- push @code, blob_as_code($blob, $blob_name);
+ push @code, $self->blob_as_code();
push @code, <<"EOF_CODE";
struct $struct_name {
@@ -627,7 +680,7 @@ EOF_CODE
push @code, "/* The comments give the input key for the row it is in */\n";
push @code,
"STATIC const struct $struct_name $table_name\[${prefix}_BUCKETS] = {\n",
- join(",\n", @$rows) . "\n};\n\n";
+ join(",\n", @$rows_array) . "\n};\n\n";
push @code, <<"EOF_CODE";
${prefix}_VALt
$match_name( const unsigned char * const key, const U16 key_len ) {
@@ -659,13 +712,13 @@ $match_name( const unsigned char * const key, const U16 key_len ) {
}
EOF_CODE
- return join "", @code;
+ return $self->{algo_code}= join "", @code;
}
sub __ofh {
- my ($to, $default)= @_;
+ my ($self, $to, $default_key)= @_;
- $to //= $default;
+ $to //= $self->{$default_key};
my $ofh;
if (ref $to) {
@@ -678,24 +731,18 @@ sub __ofh {
return $ofh;
}
-
sub print_algo {
- my (
- $ofh, $second_level, $seed1, $long_blob, $blob,
- $rows, $blob_name, $struct_name, $table_name, $match_name
- )= @_;
+ my ($self, $to)= @_;
- $ofh= __ofh($ofh, "mph_algo.h");
+ my $ofh= $self->__ofh($to, "h_file");
- my $code= make_algo(
- $second_level, $seed1, $long_blob, $blob, $rows,
- $blob_name, $struct_name, $table_name, $match_name
- );
- print $ofh $code;
+ my $code= $self->make_algo();
+ print $to $code;
}
sub print_main {
- my ($ofh, $h_file, $match_name, $prefix)= @_;
+ my ($self, $ofh)= @_;
+ my ($h_file, $match_name, $prefix)= @{$self}{qw(h_file match_name prefix)};
print $ofh <<"EOF_CODE";
#include "$h_file"
@@ -713,9 +760,10 @@ EOF_CODE
# output the test Perl code.
sub print_tests {
- my ($file, $tests_hash)= @_;
+ my ($self, $to)= @_;
+ my $tests_hash= $self->{tests_hash};
- $ofh = __ofh($file, "mph_test.pl");
+ my $ofh= $self->__ofh($to, "t_file");
my $num_tests= 2 + keys %$tests_hash;
print $ofh
@@ -738,75 +786,49 @@ sub print_tests {
"ok(\$res[$i]=~/got: (\\d+)/ && \$1 == $want, '$key');\n";
}
}
- close $ofh;
}
sub print_test_binary {
- my (
- $file, $h_file, $second_level, $seed1,
- $length_all_keys, $blob, $rows, $defines,
- $match_name, $prefix
- )= @_;
+ my ($self, $to)= @_;
- my $ofh= __ofh($file, "mph_test.c");
+ my $ofh= $self->__ofh($to, "c_file");
- print_includes($ofh);
- print_defines($ofh, $defines);
- print_main($ofh, $h_file, $match_name, $prefix);
- close $ofh;
+ $self->print_includes($ofh);
+ $self->print_defines($ofh);
+ $self->print_main($ofh);
}
-sub make_mph_from_hash {
- my $hash= shift;
+sub make_mph_with_split_keys {
+ my ($self)= @_;
- my $length_all_keys= 0;
- $length_all_keys += length($_) for keys %$hash;
+ my $hash= $self->{source_hash};
+ my $length_all_keys= $self->{length_all_keys};
- # we do this twice because often we can find longer prefixes on the second pass.
- my ($blob, $res_to_split)= build_split_words($hash, $length_all_keys);
+ my ($blob, $split_points)= $self->build_split_words();
- my ($seed1, $second_level)= build_perfect_hash($hash, 16);
+ my ($seed1, $second_level)= $self->build_perfect_hash();
# add prefix/suffix data into the bucket info in @$second_level
foreach my $bucket_info (@$second_level) {
my $key= $bucket_info->{key};
- my $sp= $res_to_split->{$key} // die "no split pos for '$key'\n";
+ my $sp= $split_points->{$key} // die "no split_point data for '$key'\n";
my ($prefix, $suffix)= unpack "A${sp}A*", $key;
$bucket_info->{prefix}= $prefix;
$bucket_info->{suffix}= $suffix;
$bucket_info->{value}= $hash->{$key};
}
- my ($rows, $defines, $tests)= build_array_of_struct($second_level, $blob);
- return ($second_level, $seed1, $length_all_keys, $blob, $rows,
- $defines, $tests);
+ my ($rows, $defines, $tests)= $self->build_array_of_struct();
+ return 1;
}
-sub make_files {
- my ($hash, $base_name)= @_;
-
- my $h_name= $base_name . "_algo.h";
- my $c_name= $base_name . "_test.c";
- my $p_name= $base_name . "_test.pl";
- my $blob_name= $base_name . "_blob";
- my $struct_name= $base_name . "_bucket_info";
- my $table_name= $base_name . "_table";
- my $match_name= $base_name . "_match";
- my $prefix= uc($base_name);
-
- my ($second_level, $seed1, $length_all_keys, $blob, $rows, $defines, $tests)
- = make_mph_from_hash($hash);
- print_algo(
- $h_name, $second_level, $seed1, $length_all_keys,
- $blob, $rows, $blob_name, $struct_name,
- $table_name, $match_name, $prefix
- );
- print_test_binary(
- $c_name, $h_name, $second_level, $seed1,
- $length_all_keys, $blob, $rows, $defines,
- $match_name, $prefix
- );
- print_tests($p_name, $tests);
+sub make_files_split_keys {
+ my ($self)= @_;
+
+ $self->make_mph_with_split_keys();
+ $self->print_algo();
+ $self->print_test_binary();
+ $self->print_tests();
}
unless (caller) {
@@ -838,7 +860,11 @@ unless (caller) {
my $name= shift @ARGV;
$name ||= "mph";
- make_files(\%hash, $name);
+ my $obj= __PACKAGE__->new(
+ source_hash => \%hash,
+ base_name => $name
+ );
+ $obj->make_files_split_keys();
}
1;
diff --git a/uni_keywords.h b/uni_keywords.h
index 26e3da034c..6f17dccfe6 100644
--- a/uni_keywords.h
+++ b/uni_keywords.h
@@ -10,6 +10,7 @@
#define MPH_VALt I16
/*
+generator script: regen/mk_invlists.pl
rows: 7420
seed: 1348825710
full length of keys: 104237
@@ -7623,7 +7624,8 @@ STATIC const struct mph_struct mph_table[MPH_BUCKETS] = {
{ 1, 10058, 9758, 3, 4, UNI_NV__3_SLASH_20 } /* nv=3/20 */
};
-MPH_VALt match_uniprop( const unsigned char * const key, const U16 key_len ) {
+MPH_VALt
+match_uniprop( const unsigned char * const key, const U16 key_len ) {
const unsigned char * ptr= key;
const unsigned char * ptr_end= key + key_len;
U32 h= MPH_SEED1;
@@ -7706,6 +7708,6 @@ MPH_VALt match_uniprop( const unsigned char * const key, const U16 key_len ) {
* c72bbdeda99714db1c8024d3311da4aef3c0db3b9b9f11455a7cfe10d5e9aba3 lib/unicore/version
* 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl
* 5f8520d3a17ade6317fc0c423f5091470924b1ef425bca0c41ce8e4a9f8460fe regen/mk_PL_charclass.pl
- * f53911aa9a2b154279f8aba208653ed5a83724a117b7964c01f199a026607e83 regen/mk_invlists.pl
- * fd22f39fc463c10c0b2a3aef4ac32717a000acb209ee133a24da5acfd8a0993a regen/mph.pl
+ * 9c08a09afbb28779be92ff658e0d27e654be4570241048689e2ffc20437a3d91 regen/mk_invlists.pl
+ * 193db308cfcc2343ddde130a314afa74f4632a89635607f5297db8dccb64059b regen/mph.pl
* ex: set ro: */