summaryrefslogtreecommitdiff
path: root/regen/mk_invlists.pl
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2020-03-18 09:12:34 -0600
committerKarl Williamson <khw@cpan.org>2020-10-16 07:01:41 -0600
commit59142b8bd98e53318226c235b25118b63b24c99f (patch)
tree4904234e51e8b20ef4ff9842f10b67df9498ddd4 /regen/mk_invlists.pl
parent4fad5f9fde90649f8a92ff93e775cf814b118f19 (diff)
downloadperl-59142b8bd98e53318226c235b25118b63b24c99f.tar.gz
charclass_invlists.h: Add some inverse folds.
The MICRO SIGN folds to above the Latin1 range, the only character that does so in Unicode (or ever likely to). This requires special handling. This commit reduces some of the need for that handling by creating the inversion map for it, which can be used in certain instances in pattern matching, without having to have a special case. The actual use of this will come in a future commit.
Diffstat (limited to 'regen/mk_invlists.pl')
-rw-r--r--regen/mk_invlists.pl32
1 files changed, 29 insertions, 3 deletions
diff --git a/regen/mk_invlists.pl b/regen/mk_invlists.pl
index 8c848ccce9..7df5b7d69e 100644
--- a/regen/mk_invlists.pl
+++ b/regen/mk_invlists.pl
@@ -1029,13 +1029,20 @@ sub _Perl_IVCF {
}
# Now go through and make some adjustments. We add synthetic entries for
- # two cases.
- # 1) Two or more code points can fold to the same multiple character,
+ # three cases.
+ # 1) If the fold of a Latin1-range character is above that range, some
+ # coding in regexec.c can be saved by creating a reverse map here. The
+ # impetus for this is that U+B5 (MICRO SIGN) folds to the Greek small
+ # mu (U+3BC). That fold isn't done at regex pattern compilation time
+ # if it means that the pattern would have to be translated into UTF-8,
+ # whose operation is slower. At run time, having this reverse
+ # translation eliminates some special cases in the code.
+ # 2) Two or more code points can fold to the same multiple character,
# sequence, as U+FB05 and U+FB06 both fold to 'st'. This code is only
# for single character folds, but FB05 and FB06 are single characters
# that are equivalent folded, so we add entries so that they are
# considered to fold to each other
- # 2) If two or more above-Latin1 code points fold to the same Latin1 range
+ # 3) If two or more above-Latin1 code points fold to the same Latin1 range
# one, we also add entries so that they are considered to fold to each
# other. This is so that under /aa or /l matching, where folding to
# their Latin1 range code point is illegal, they still can fold to each
@@ -1048,9 +1055,28 @@ sub _Perl_IVCF {
# scalar
if (scalar $new{$fold}->@* == 1) {
$new{$fold} = $new{$fold}[0];
+
+ # Handle case 1) above: if there were a Latin1 range code point
+ # whose fold is above that range, this creates an extra entry that
+ # maps the other direction, and would save some special case code.
+ # (The one current case of this is handled in the else clause
+ # below.)
+ $new{$new{$fold}} = $fold if $new{$fold} < 256 && $fold > 255;
}
else {
+ # Handle case 1) when there are multiple things that fold to an
+ # above-Latin1 code point, at least one of which is in Latin1.
+ if (! $folds_to_string && $fold > 255) {
+ foreach my $cp ($new{$fold}->@*) {
+ if ($cp < 256) {
+ my @new_entry = grep { $_ != $cp } $new{$fold}->@*;
+ push @new_entry, $fold;
+ $new{$cp}->@* = @new_entry;
+ }
+ }
+ }
+
# Otherwise, sort numerically. This places the highest code point
# in the list at the tail end. This is because Unicode keeps the
# lowercase code points as higher ordinals than the uppercase, at