diff options
author | Yves Orton <demerphq@gmail.com> | 2022-07-30 16:50:45 +0200 |
---|---|---|
committer | Yves Orton <demerphq@gmail.com> | 2022-08-03 11:07:09 +0200 |
commit | d1907b9404696dcfd0b4dbd7fe1b07f9beff8585 (patch) | |
tree | ec0dd78b0daba96d45b2e3c344984fb2785d5c87 | |
parent | 4dd48237e57323de056710e155ba3be1fd67c65f (diff) | |
download | perl-d1907b9404696dcfd0b4dbd7fe1b07f9beff8585.tar.gz |
regen/mk_invlists.pl - under DEBUG=1 show some progress output
-rw-r--r-- | charclass_invlists.h | 10 | ||||
-rw-r--r-- | lib/unicore/uni_keywords.pl | 2 | ||||
-rw-r--r-- | regen/mk_invlists.pl | 30 | ||||
-rw-r--r-- | uni_keywords.h | 2 |
4 files changed, 33 insertions, 11 deletions
diff --git a/charclass_invlists.h b/charclass_invlists.h index 1efefac848..973ac9e519 100644 --- a/charclass_invlists.h +++ b/charclass_invlists.h @@ -15,10 +15,10 @@ * encompassing all of the Unicode BMP, and thus including all the economically * important world scripts. At 12 most of them are: including Arabic, * Cyrillic, Greek, Hebrew, Indian subcontinent, Latin, and Thai; but not Han, - * Japanese, nor Korean. (The regarglen structure in regnodes.h is a U8, and - * the trie types TRIEC and AHOCORASICKC are larger than U8 for shift values - * above 12.) Be sure to benchmark before changing, as larger sizes do - * significantly slow down the test suite */ + * Japanese, nor Korean. (The regnode sizing data structure in regnodes.h currently + * uses a U8, and the trie types TRIEC and AHOCORASICKC are larger than U8 for + * shift values above 12.) Be sure to benchmark before changing, as larger sizes + * do significantly slow down the test suite. */ #define NUM_ANYOF_CODE_POINTS (1 << 8) @@ -430756,5 +430756,5 @@ static const U8 WB_table[23][23] = { * c72bbdeda99714db1c8024d3311da4aef3c0db3b9b9f11455a7cfe10d5e9aba3 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl - * 9fcf5cbe9d04768ff18b5ac33c1ec809f2e8b64bc45ff303aad480936a55c35b regen/mk_invlists.pl + * 2a64e8b4ca351f490530bdf8c7b4962c407b7ed6a1123eeb8d9e8e0e4236d16a regen/mk_invlists.pl * ex: set ro: */ diff --git a/lib/unicore/uni_keywords.pl b/lib/unicore/uni_keywords.pl index f50ccc8050..dbca2cccdc 100644 --- a/lib/unicore/uni_keywords.pl +++ b/lib/unicore/uni_keywords.pl @@ -1324,5 +1324,5 @@ # c72bbdeda99714db1c8024d3311da4aef3c0db3b9b9f11455a7cfe10d5e9aba3 lib/unicore/version # 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl # c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e 
regen/mk_PL_charclass.pl -# 9fcf5cbe9d04768ff18b5ac33c1ec809f2e8b64bc45ff303aad480936a55c35b regen/mk_invlists.pl +# 2a64e8b4ca351f490530bdf8c7b4962c407b7ed6a1123eeb8d9e8e0e4236d16a regen/mk_invlists.pl # ex: set ro: diff --git a/regen/mk_invlists.pl b/regen/mk_invlists.pl index bfe37a3803..e771cd28bd 100644 --- a/regen/mk_invlists.pl +++ b/regen/mk_invlists.pl @@ -11,12 +11,15 @@ use Unicode::UCD qw(prop_aliases num charblock ); +use constant DEBUG => $ENV{DEBUG} // 0; require './regen/regen_lib.pl'; require './regen/charset_translations.pl'; require './lib/unicore/UCD.pl'; require './regen/mph.pl'; use re "/aa"; +print "Starting...\n" if DEBUG; + # This program outputs charclass_invlists.h, which contains various inversion # lists in the form of C arrays that are to be used as-is for inversion lists. # Thus, the lists it contains are essentially pre-compiled, and need only a @@ -72,10 +75,10 @@ print $out_fh <<'EOF'; * encompassing all of the Unicode BMP, and thus including all the economically * important world scripts. At 12 most of them are: including Arabic, * Cyrillic, Greek, Hebrew, Indian subcontinent, Latin, and Thai; but not Han, - * Japanese, nor Korean. (The regarglen structure in regnodes.h is a U8, and - * the trie types TRIEC and AHOCORASICKC are larger than U8 for shift values - * above 12.) Be sure to benchmark before changing, as larger sizes do - * significantly slow down the test suite */ + * Japanese, nor Korean. (The regnode sizing data structure in regnodes.h currently + * uses a U8, and the trie types TRIEC and AHOCORASICKC are larger than U8 for + * shift values above 12.) Be sure to benchmark before changing, as larger sizes + * do significantly slow down the test suite. 
*/ EOF @@ -346,6 +349,8 @@ sub output_invlist ($$;$) { my $invlist = shift; # Reference to inversion list array my $charset = shift // ""; # name of character set for comment + print " output_invlist($name) $charset\n" if DEBUG; + die "No inversion list for $name" unless defined $invlist && ref $invlist eq 'ARRAY'; @@ -392,6 +397,8 @@ sub output_invmap ($$$$$$$) { # property's standard possible values my $charset = shift // ""; # name of character set for comment + print " output_invmap($name,$prop_name) $charset\n" if DEBUG; + # Output the inversion map $invmap for property $prop_name, but use $name # as the actual data structure's name. @@ -968,6 +975,7 @@ sub mk_invlist_from_sorted_cp_list { return @invlist; } +print "Reading Case Folding rules.\n" if DEBUG; # Read in the Case Folding rules, and construct arrays of code points for the # properties we need. my ($cp_ref, $folds_ref, $format, $default) = prop_invmap("Case_Folding"); @@ -975,6 +983,9 @@ die "Could not find inversion map for Case_Folding" unless defined $format; die "Incorrect format '$format' for Case_Folding inversion map" unless $format eq 'al' || $format eq 'a'; +print "Finished reading Case Folding rules.\n" if DEBUG; + + sub _Perl_IVCF { # This creates a map of the inversion of case folding. i.e., given a @@ -2460,6 +2471,8 @@ end_file_pound_if; # # An initial & means to use the subroutine from this file instead of an # official inversion list. +# +print "Computing unicode properties\n" if DEBUG; # Below is the list of property names to generate. '&' means to use the # subroutine to generate the inversion list instead of the generic code @@ -3083,6 +3096,8 @@ foreach my $prop (@props) { } } +print "Finished computing unicode properties\n" if DEBUG; + print $out_fh "\nconst char * const deprecated_property_msgs[] = {\n\t"; print $out_fh join ",\n\t", map { "\"$_\"" } @deprecated_messages; print $out_fh "\n};\n"; @@ -3163,6 +3178,7 @@ my %joined_values; # the C compiler. 
my @values_indices; +print "Computing short unicode properties\n" if DEBUG; # Go through each property which is specifiable by \p{prop=value}, and create # a hash with the keys being the canonicalized short property names, and the # values for each property being all possible values that it can take on. @@ -3194,6 +3210,7 @@ for my $property (sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) } } } } +print "Finished computing short unicode properties\n" if DEBUG; # Also include the old style block names, using the recipe given in # Unicode::UCD @@ -3201,6 +3218,7 @@ foreach my $block (prop_values('block')) { push @{$all_values{'blk'}}, charblock((prop_invlist("block=$block"))[0]); } +print "Creating property tables\n" if DEBUG; # Now create output tables for each property in @equals_properties (the keys # in %all_values) each containing that property's possible values as computed # just above. @@ -3280,6 +3298,8 @@ output_WB_table(); end_file_pound_if; +print "Computing fold data\n" if DEBUG; + print $out_fh <<"EOF"; /* More than one code point may have the same code point as their fold. 
This @@ -3353,6 +3373,8 @@ my $uni_pl = open_new('lib/unicore/uni_keywords.pl', '>', read_only_bottom_close_and_rename($uni_pl, \@sources); +print "Computing minimal perfect hash for unicode properties.\n" if DEBUG; + if (my $file= $ENV{DUMP_KEYWORDS_FILE}) { require Data::Dumper; diff --git a/uni_keywords.h b/uni_keywords.h index 819239debe..c209d3d1bf 100644 --- a/uni_keywords.h +++ b/uni_keywords.h @@ -7681,6 +7681,6 @@ match_uniprop( const unsigned char * const key, const U16 key_len ) { * c72bbdeda99714db1c8024d3311da4aef3c0db3b9b9f11455a7cfe10d5e9aba3 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl - * 9fcf5cbe9d04768ff18b5ac33c1ec809f2e8b64bc45ff303aad480936a55c35b regen/mk_invlists.pl + * 2a64e8b4ca351f490530bdf8c7b4962c407b7ed6a1123eeb8d9e8e0e4236d16a regen/mk_invlists.pl * d6987e01ad538d1567394851cf199f99815f7701bebd6092be4bc7a6d8f147c6 regen/mph.pl * ex: set ro: */ |