regen/mk_invlists.pl - under DEBUG=1 show some progress output

author: Yves Orton <demerphq@gmail.com> 2022-07-30 16:50:45 +0200
committer: Yves Orton <demerphq@gmail.com> 2022-08-03 11:07:09 +0200
commit: d1907b9404696dcfd0b4dbd7fe1b07f9beff8585 (patch)
tree: ec0dd78b0daba96d45b2e3c344984fb2785d5c87
parent: 4dd48237e57323de056710e155ba3be1fd67c65f (diff)
download: perl-d1907b9404696dcfd0b4dbd7fe1b07f9beff8585.tar.gz
4 files changed, 33 insertions, 11 deletions
diff --git a/charclass_invlists.h b/charclass_invlists.h
index 1efefac848..973ac9e519 100644
--- a/charclass_invlists.h
+++ b/charclass_invlists.h
@@ -15,10 +15,10 @@
  * encompassing all of the Unicode BMP, and thus including all the economically
  * important world scripts.  At 12 most of them are: including Arabic,
  * Cyrillic, Greek, Hebrew, Indian subcontinent, Latin, and Thai; but not Han,
- * Japanese, nor Korean.  (The regarglen structure in regnodes.h is a U8, and
- * the trie types TRIEC and AHOCORASICKC are larger than U8 for shift values
- * above 12.)  Be sure to benchmark before changing, as larger sizes do
- * significantly slow down the test suite */
+ * Japanese, nor Korean.  (The regnode sizing data structure in regnodes.h currently
+ * uses a U8, and the trie types TRIEC and AHOCORASICKC are larger than U8 for
+ * shift values above 12.)  Be sure to benchmark before changing, as larger sizes
+ * do significantly slow down the test suite. */
 
 #define NUM_ANYOF_CODE_POINTS   (1 << 8)
 
@@ -430756,5 +430756,5 @@ static const U8 WB_table[23][23] = {
  * c72bbdeda99714db1c8024d3311da4aef3c0db3b9b9f11455a7cfe10d5e9aba3 lib/unicore/version
  * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl
  * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl
- * 9fcf5cbe9d04768ff18b5ac33c1ec809f2e8b64bc45ff303aad480936a55c35b regen/mk_invlists.pl
+ * 2a64e8b4ca351f490530bdf8c7b4962c407b7ed6a1123eeb8d9e8e0e4236d16a regen/mk_invlists.pl
  * ex: set ro: */
diff --git a/lib/unicore/uni_keywords.pl b/lib/unicore/uni_keywords.pl
index f50ccc8050..dbca2cccdc 100644
--- a/lib/unicore/uni_keywords.pl
+++ b/lib/unicore/uni_keywords.pl
@@ -1324,5 +1324,5 @@
 # c72bbdeda99714db1c8024d3311da4aef3c0db3b9b9f11455a7cfe10d5e9aba3 lib/unicore/version
 # 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl
 # c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl
-# 9fcf5cbe9d04768ff18b5ac33c1ec809f2e8b64bc45ff303aad480936a55c35b regen/mk_invlists.pl
+# 2a64e8b4ca351f490530bdf8c7b4962c407b7ed6a1123eeb8d9e8e0e4236d16a regen/mk_invlists.pl
 # ex: set ro:
diff --git a/regen/mk_invlists.pl b/regen/mk_invlists.pl
index bfe37a3803..e771cd28bd 100644
--- a/regen/mk_invlists.pl
+++ b/regen/mk_invlists.pl
@@ -11,12 +11,15 @@ use Unicode::UCD qw(prop_aliases
                     num
                     charblock
                    );
+use constant DEBUG => $ENV{DEBUG} // 0;
 require './regen/regen_lib.pl';
 require './regen/charset_translations.pl';
 require './lib/unicore/UCD.pl';
 require './regen/mph.pl';
 use re "/aa";
 
+print "Starting...\n" if DEBUG;
+
 # This program outputs charclass_invlists.h, which contains various inversion
 # lists in the form of C arrays that are to be used as-is for inversion lists.
 # Thus, the lists it contains are essentially pre-compiled, and need only a
@@ -72,10 +75,10 @@ print $out_fh <<'EOF';
  * encompassing all of the Unicode BMP, and thus including all the economically
  * important world scripts.  At 12 most of them are: including Arabic,
  * Cyrillic, Greek, Hebrew, Indian subcontinent, Latin, and Thai; but not Han,
- * Japanese, nor Korean.  (The regarglen structure in regnodes.h is a U8, and
- * the trie types TRIEC and AHOCORASICKC are larger than U8 for shift values
- * above 12.)  Be sure to benchmark before changing, as larger sizes do
- * significantly slow down the test suite */
+ * Japanese, nor Korean.  (The regnode sizing data structure in regnodes.h currently
+ * uses a U8, and the trie types TRIEC and AHOCORASICKC are larger than U8 for
+ * shift values above 12.)  Be sure to benchmark before changing, as larger sizes
+ * do significantly slow down the test suite. */
 
 EOF
 
@@ -346,6 +349,8 @@ sub output_invlist ($$;$) {
     my $invlist = shift;     # Reference to inversion list array
     my $charset = shift // "";  # name of character set for comment
 
+    print "  output_invlist($name) $charset\n" if DEBUG;
+
     die "No inversion list for $name" unless defined $invlist
                                              && ref $invlist eq 'ARRAY';
 
@@ -392,6 +397,8 @@ sub output_invmap ($$$$$$$) {
                                 # property's standard possible values
     my $charset = shift // "";  # name of character set for comment
 
+    print "  output_invmap($name,$prop_name) $charset\n" if DEBUG;
+
     # Output the inversion map $invmap for property $prop_name, but use $name
     # as the actual data structure's name.
 
@@ -968,6 +975,7 @@ sub mk_invlist_from_sorted_cp_list {
     return @invlist;
 }
 
+print "Reading Case Folding rules.\n" if DEBUG;
 # Read in the Case Folding rules, and construct arrays of code points for the
 # properties we need.
 my ($cp_ref, $folds_ref, $format, $default) = prop_invmap("Case_Folding");
@@ -975,6 +983,9 @@ die "Could not find inversion map for Case_Folding" unless defined $format;
 die "Incorrect format '$format' for Case_Folding inversion map"
                                                     unless $format eq 'al'
                                                            || $format eq 'a';
+print "Finished reading Case Folding rules.\n" if DEBUG;
+
+
 sub _Perl_IVCF {
 
     # This creates a map of the inversion of case folding. i.e., given a
@@ -2460,6 +2471,8 @@ end_file_pound_if;
 #
 # An initial & means to use the subroutine from this file instead of an
 # official inversion list.
+#
+print "Computing unicode properties\n" if DEBUG;
 
 # Below is the list of property names to generate.  '&' means to use the
 # subroutine to generate the inversion list instead of the generic code
@@ -3083,6 +3096,8 @@ foreach my $prop (@props) {
     }
 }
 
+print "Finished computing unicode properties\n" if DEBUG;
+
 print $out_fh "\nconst char * const deprecated_property_msgs[] = {\n\t";
 print $out_fh join ",\n\t", map { "\"$_\"" } @deprecated_messages;
 print $out_fh "\n};\n";
@@ -3163,6 +3178,7 @@ my %joined_values;
 # the C compiler.
 my @values_indices;
 
+print "Computing short unicode properties\n" if DEBUG;
 # Go through each property which is specifiable by \p{prop=value}, and create
 # a hash with the keys being the canonicalized short property names, and the
 # values for each property being all possible values that it can take on.
@@ -3194,6 +3210,7 @@ for my $property (sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) }
         }
     }
 }
+print "Finished computing short unicode properties\n" if DEBUG;
 
 # Also include the old style block names, using the recipe given in
 # Unicode::UCD
@@ -3201,6 +3218,7 @@ foreach my $block (prop_values('block')) {
     push @{$all_values{'blk'}}, charblock((prop_invlist("block=$block"))[0]);
 }
 
+print "Creating property tables\n" if DEBUG;
 # Now create output tables for each property in @equals_properties (the keys
 # in %all_values) each containing that property's possible values as computed
 # just above.
@@ -3280,6 +3298,8 @@ output_WB_table();
 
 end_file_pound_if;
 
+print "Computing fold data\n" if DEBUG;
+
 print $out_fh <<"EOF";
 
 /* More than one code point may have the same code point as their fold.  This
@@ -3353,6 +3373,8 @@ my $uni_pl = open_new('lib/unicore/uni_keywords.pl', '>',
 
 read_only_bottom_close_and_rename($uni_pl, \@sources);
 
+print "Computing minimal perfect hash for unicode properties.\n" if DEBUG;
+
 if (my $file= $ENV{DUMP_KEYWORDS_FILE}) {
     require Data::Dumper;
 
diff --git a/uni_keywords.h b/uni_keywords.h
index 819239debe..c209d3d1bf 100644
--- a/uni_keywords.h
+++ b/uni_keywords.h
@@ -7681,6 +7681,6 @@ match_uniprop( const unsigned char * const key, const U16 key_len ) {
  * c72bbdeda99714db1c8024d3311da4aef3c0db3b9b9f11455a7cfe10d5e9aba3 lib/unicore/version
  * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl
  * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl
- * 9fcf5cbe9d04768ff18b5ac33c1ec809f2e8b64bc45ff303aad480936a55c35b regen/mk_invlists.pl
+ * 2a64e8b4ca351f490530bdf8c7b4962c407b7ed6a1123eeb8d9e8e0e4236d16a regen/mk_invlists.pl
  * d6987e01ad538d1567394851cf199f99815f7701bebd6092be4bc7a6d8f147c6 regen/mph.pl
  * ex: set ro: */
author	Yves Orton <demerphq@gmail.com>	2022-07-30 16:50:45 +0200
committer	Yves Orton <demerphq@gmail.com>	2022-08-03 11:07:09 +0200
commit	d1907b9404696dcfd0b4dbd7fe1b07f9beff8585 (patch)
tree	ec0dd78b0daba96d45b2e3c344984fb2785d5c87
parent	4dd48237e57323de056710e155ba3be1fd67c65f (diff)
download	perl-d1907b9404696dcfd0b4dbd7fe1b07f9beff8585.tar.gz