summary refs log tree commit diff
diff options
context:
space:
mode:
author Yves Orton <demerphq@gmail.com> 2022-07-30 16:50:45 +0200
committer Yves Orton <demerphq@gmail.com> 2022-08-03 11:07:09 +0200
commit d1907b9404696dcfd0b4dbd7fe1b07f9beff8585 (patch)
tree ec0dd78b0daba96d45b2e3c344984fb2785d5c87
parent 4dd48237e57323de056710e155ba3be1fd67c65f (diff)
download perl-d1907b9404696dcfd0b4dbd7fe1b07f9beff8585.tar.gz
regen/mk_invlists.pl - under DEBUG=1 show some progress output
-rw-r--r-- charclass_invlists.h 10
-rw-r--r-- lib/unicore/uni_keywords.pl 2
-rw-r--r-- regen/mk_invlists.pl 30
-rw-r--r-- uni_keywords.h 2
4 files changed, 33 insertions, 11 deletions
diff --git a/charclass_invlists.h b/charclass_invlists.h
index 1efefac848..973ac9e519 100644
--- a/charclass_invlists.h
+++ b/charclass_invlists.h
@@ -15,10 +15,10 @@
* encompassing all of the Unicode BMP, and thus including all the economically
* important world scripts. At 12 most of them are: including Arabic,
* Cyrillic, Greek, Hebrew, Indian subcontinent, Latin, and Thai; but not Han,
- * Japanese, nor Korean. (The regarglen structure in regnodes.h is a U8, and
- * the trie types TRIEC and AHOCORASICKC are larger than U8 for shift values
- * above 12.) Be sure to benchmark before changing, as larger sizes do
- * significantly slow down the test suite */
+ * Japanese, nor Korean. The regnode sizing data structure in regnodes.h currently
+ * uses a U8, and the trie types TRIEC and AHOCORASICKC are larger than U8 for
+ * shift values above 12. Be sure to benchmark before changing, as larger sizes
+ * do significantly slow down the test suite. */
#define NUM_ANYOF_CODE_POINTS (1 << 8)
@@ -430756,5 +430756,5 @@ static const U8 WB_table[23][23] = {
* c72bbdeda99714db1c8024d3311da4aef3c0db3b9b9f11455a7cfe10d5e9aba3 lib/unicore/version
* 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl
* c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl
- * 9fcf5cbe9d04768ff18b5ac33c1ec809f2e8b64bc45ff303aad480936a55c35b regen/mk_invlists.pl
+ * 2a64e8b4ca351f490530bdf8c7b4962c407b7ed6a1123eeb8d9e8e0e4236d16a regen/mk_invlists.pl
* ex: set ro: */
diff --git a/lib/unicore/uni_keywords.pl b/lib/unicore/uni_keywords.pl
index f50ccc8050..dbca2cccdc 100644
--- a/lib/unicore/uni_keywords.pl
+++ b/lib/unicore/uni_keywords.pl
@@ -1324,5 +1324,5 @@
# c72bbdeda99714db1c8024d3311da4aef3c0db3b9b9f11455a7cfe10d5e9aba3 lib/unicore/version
# 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl
# c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl
-# 9fcf5cbe9d04768ff18b5ac33c1ec809f2e8b64bc45ff303aad480936a55c35b regen/mk_invlists.pl
+# 2a64e8b4ca351f490530bdf8c7b4962c407b7ed6a1123eeb8d9e8e0e4236d16a regen/mk_invlists.pl
# ex: set ro:
diff --git a/regen/mk_invlists.pl b/regen/mk_invlists.pl
index bfe37a3803..e771cd28bd 100644
--- a/regen/mk_invlists.pl
+++ b/regen/mk_invlists.pl
@@ -11,12 +11,15 @@ use Unicode::UCD qw(prop_aliases
num
charblock
);
+use constant DEBUG => $ENV{DEBUG} // 0;
require './regen/regen_lib.pl';
require './regen/charset_translations.pl';
require './lib/unicore/UCD.pl';
require './regen/mph.pl';
use re "/aa";
+print "Starting...\n" if DEBUG;
+
# This program outputs charclass_invlists.h, which contains various inversion
# lists in the form of C arrays that are to be used as-is for inversion lists.
# Thus, the lists it contains are essentially pre-compiled, and need only a
@@ -72,10 +75,10 @@ print $out_fh <<'EOF';
* encompassing all of the Unicode BMP, and thus including all the economically
* important world scripts. At 12 most of them are: including Arabic,
* Cyrillic, Greek, Hebrew, Indian subcontinent, Latin, and Thai; but not Han,
- * Japanese, nor Korean. (The regarglen structure in regnodes.h is a U8, and
- * the trie types TRIEC and AHOCORASICKC are larger than U8 for shift values
- * above 12.) Be sure to benchmark before changing, as larger sizes do
- * significantly slow down the test suite */
+ * Japanese, nor Korean. The regnode sizing data structure in regnodes.h currently
+ * uses a U8, and the trie types TRIEC and AHOCORASICKC are larger than U8 for
+ * shift values above 12. Be sure to benchmark before changing, as larger sizes
+ * do significantly slow down the test suite. */
EOF
@@ -346,6 +349,8 @@ sub output_invlist ($$;$) {
my $invlist = shift; # Reference to inversion list array
my $charset = shift // ""; # name of character set for comment
+ print " output_invlist($name) $charset\n" if DEBUG;
+
die "No inversion list for $name" unless defined $invlist
&& ref $invlist eq 'ARRAY';
@@ -392,6 +397,8 @@ sub output_invmap ($$$$$$$) {
# property's standard possible values
my $charset = shift // ""; # name of character set for comment
+ print " output_invmap($name,$prop_name) $charset\n" if DEBUG;
+
# Output the inversion map $invmap for property $prop_name, but use $name
# as the actual data structure's name.
@@ -968,6 +975,7 @@ sub mk_invlist_from_sorted_cp_list {
return @invlist;
}
+print "Reading Case Folding rules.\n" if DEBUG;
# Read in the Case Folding rules, and construct arrays of code points for the
# properties we need.
my ($cp_ref, $folds_ref, $format, $default) = prop_invmap("Case_Folding");
@@ -975,6 +983,9 @@ die "Could not find inversion map for Case_Folding" unless defined $format;
die "Incorrect format '$format' for Case_Folding inversion map"
unless $format eq 'al'
|| $format eq 'a';
+print "Finished reading Case Folding rules.\n" if DEBUG;
+
+
sub _Perl_IVCF {
# This creates a map of the inversion of case folding. i.e., given a
@@ -2460,6 +2471,8 @@ end_file_pound_if;
#
# An initial & means to use the subroutine from this file instead of an
# official inversion list.
+#
+print "Computing unicode properties\n" if DEBUG;
# Below is the list of property names to generate. '&' means to use the
# subroutine to generate the inversion list instead of the generic code
@@ -3083,6 +3096,8 @@ foreach my $prop (@props) {
}
}
+print "Finished computing unicode properties\n" if DEBUG;
+
print $out_fh "\nconst char * const deprecated_property_msgs[] = {\n\t";
print $out_fh join ",\n\t", map { "\"$_\"" } @deprecated_messages;
print $out_fh "\n};\n";
@@ -3163,6 +3178,7 @@ my %joined_values;
# the C compiler.
my @values_indices;
+print "Computing short unicode properties\n" if DEBUG;
# Go through each property which is specifiable by \p{prop=value}, and create
# a hash with the keys being the canonicalized short property names, and the
# values for each property being all possible values that it can take on.
@@ -3194,6 +3210,7 @@ for my $property (sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) }
}
}
}
+print "Finished computing short unicode properties\n" if DEBUG;
# Also include the old style block names, using the recipe given in
# Unicode::UCD
@@ -3201,6 +3218,7 @@ foreach my $block (prop_values('block')) {
push @{$all_values{'blk'}}, charblock((prop_invlist("block=$block"))[0]);
}
+print "Creating property tables\n" if DEBUG;
# Now create output tables for each property in @equals_properties (the keys
# in %all_values) each containing that property's possible values as computed
# just above.
@@ -3280,6 +3298,8 @@ output_WB_table();
end_file_pound_if;
+print "Computing fold data\n" if DEBUG;
+
print $out_fh <<"EOF";
/* More than one code point may have the same code point as their fold. This
@@ -3353,6 +3373,8 @@ my $uni_pl = open_new('lib/unicore/uni_keywords.pl', '>',
read_only_bottom_close_and_rename($uni_pl, \@sources);
+print "Computing minimal perfect hash for unicode properties.\n" if DEBUG;
+
if (my $file= $ENV{DUMP_KEYWORDS_FILE}) {
require Data::Dumper;
diff --git a/uni_keywords.h b/uni_keywords.h
index 819239debe..c209d3d1bf 100644
--- a/uni_keywords.h
+++ b/uni_keywords.h
@@ -7681,6 +7681,6 @@ match_uniprop( const unsigned char * const key, const U16 key_len ) {
* c72bbdeda99714db1c8024d3311da4aef3c0db3b9b9f11455a7cfe10d5e9aba3 lib/unicore/version
* 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl
* c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl
- * 9fcf5cbe9d04768ff18b5ac33c1ec809f2e8b64bc45ff303aad480936a55c35b regen/mk_invlists.pl
+ * 2a64e8b4ca351f490530bdf8c7b4962c407b7ed6a1123eeb8d9e8e0e4236d16a regen/mk_invlists.pl
* d6987e01ad538d1567394851cf199f99815f7701bebd6092be4bc7a6d8f147c6 regen/mph.pl
* ex: set ro: */