summaryrefslogtreecommitdiff
path: root/lib/unicore
diff options
context:
space:
mode:
authorKarl Williamson <public@khwilliamson.com>2012-06-28 13:32:17 -0600
committerKarl Williamson <public@khwilliamson.com>2012-07-24 21:13:43 -0600
commite94e94b5ceeb265476690d9992a953b7d876f3a1 (patch)
tree06c23cac92667c6d749c6cf3ccd882aa5f077075 /lib/unicore
parentf792674226f74e98903d6b00d08167effecfd8e9 (diff)
downloadperl-e94e94b5ceeb265476690d9992a953b7d876f3a1.tar.gz
mktables: Generate new table for foldable chars
This table consists of all characters that participate in any way in a fold in the current Unicode version. regcomp.c currently uses the Cased property as a proxy for these. This information is used to limit the number of characters whose folds have to be dealt with in compiling bracketed regex character classes. It turns out that Cased contains more than 1300 more code points than actually do appear in folds, which means potential extra work for compiling. Hence this patch allows that work to be avoided. There are a few characters in this new table that aren't in Cased, which are potential bugs in the old way of doing things. In Unicode 6.1, these are: U+02BC MODIFIER LETTER APOSTROPHE, U+0308 COMBINING DIAERESIS, U+0313 COMBINING COMMA ABOVE, and U+0342 COMBINING GREEK PERISPOMENI. I can't figure out how these might be currently causing a bug, but this patch fixes any such.
Diffstat (limited to 'lib/unicore')
-rw-r--r--lib/unicore/mktables15
1 files changed, 14 insertions, 1 deletions
diff --git a/lib/unicore/mktables b/lib/unicore/mktables
index 5aed5ee669..776741eeba 100644
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -11477,6 +11477,7 @@ sub filter_old_style_case_folding {
# it takes no part in anything we do.
my $to_output_simple;
my $non_final_folds;
+ my $all_folds;
sub setup_case_folding($) {
# Read in the case foldings in CaseFolding.txt. This handles both
@@ -11494,6 +11495,11 @@ sub filter_old_style_case_folding {
Fate => $INTERNAL_ONLY,
Description => "Code points that particpate in a multi-char fold and are not the final character of said fold",
);
+ $all_folds = $perl->add_match_table("_Perl_Any_Folds",
+ Perl_Extension => 1,
+ Fate => $INTERNAL_ONLY,
+ Description => "Code points that particpate in some fold",
+ );
# If we ever wanted to show that these tables were combined, a new
# property method could be created, like set_combined_props()
@@ -11545,14 +11551,21 @@ END
# so that _swash_inversion_hash() is able to construct closures
# without having to worry about F mappings.
if ($type eq 'C' || $type eq 'F' || $type eq 'I' || $type eq 'S') {
+ $all_folds->add_range(hex $range, hex $range); # Assumes range is single
$_ = "$range; Case_Folding; "
. "$CMD_DELIM$REPLACE_CMD=$MULTIPLE_BEFORE$CMD_DELIM$map";
+
if ($type eq 'F') {
my @string = split " ", $map;
for my $i (0 .. @string - 1 -1) {
- $non_final_folds->add_range(hex $string[$i], hex $string[$i]);
+ my $decimal = hex $string[$i];
+ $non_final_folds->add_range($decimal, $decimal);
+ $all_folds->add_range($decimal, $decimal);
}
}
+ else {
+ $all_folds->add_range(hex $map, hex $map);
+ }
}
else {
$_ = "";