summary | refs | log | tree | commit | diff
path: root/regen/regcharclass_multi_char_folds.pl
diff options
context:
space:
mode:
author Karl Williamson <khw@cpan.org> 2020-04-03 12:12:06 -0600
committer Karl Williamson <khw@cpan.org> 2020-10-16 07:01:41 -0600
commit 114fc8b6cf6259d91d5d2c5cf7509f3f5e8cf35b (patch)
tree cf524c703ebcc123d0024a0e5e40b6bf91ba70d5 /regen/regcharclass_multi_char_folds.pl
parent 70dc0cf11d00e208b9cf7abd3d31a83e245d2b5c (diff)
download perl-114fc8b6cf6259d91d5d2c5cf7509f3f5e8cf35b.tar.gz
regen/regcharclass_multi_char_folds.pl: Use case fold
Prior to this commit, only the uppercase equivalents of Latin1 characters were handled. But what we really want is case folding, and there are a few other characters that fold to Latin1. This commit takes them into account.
Diffstat (limited to 'regen/regcharclass_multi_char_folds.pl')
-rw-r--r-- regen/regcharclass_multi_char_folds.pl 21
1 files changed, 18 insertions, 3 deletions
diff --git a/regen/regcharclass_multi_char_folds.pl b/regen/regcharclass_multi_char_folds.pl
index 8cf9837397..a72e1497ce 100644
--- a/regen/regcharclass_multi_char_folds.pl
+++ b/regen/regcharclass_multi_char_folds.pl
@@ -73,6 +73,19 @@ sub multi_char_folds ($$) {
die "Incorrect format '$format' for Case_Folding inversion map"
unless $format eq 'al';
+ my %inverse_latin1_folds;
+ for my $i (0 .. @$cp_ref - 1) {
+ next if ref $folds_ref->[$i]; # multi-char fold
+ next if $folds_ref->[$i] == 0; # Not folded
+ my $cp_base = $cp_ref->[$i];
+
+ for my $j ($cp_base .. $cp_ref->[$i+1] - 1) {
+ my $folded_base = $folds_ref->[$i];
+ next if $folded_base > 255; # only interested in Latin1
+ push @{$inverse_latin1_folds{$folded_base + $j - $cp_base}}, $j;
+ }
+ }
+
my @folds;
my @output_folds;
@@ -118,9 +131,11 @@ sub multi_char_folds ($$) {
my $this_ord = $this_fold_ref->[$j];
undef $this_fold_ref->[$j];
- if ($this_ord < 256 && chr($this_ord) =~ /\p{Cased}/) {
- my $uc = ord(uc(chr($this_ord)));
- @{$this_fold_ref->[$j]} = ( $this_ord, $uc);
+ # If the fold is to a Latin1-range cased letter, replace the entry
+ # with an array which also includes everything that folds to it.
+ if (exists $inverse_latin1_folds{$this_ord}) {
+ push @{$this_fold_ref->[$j]},
+ ( $this_ord, @{$inverse_latin1_folds{$this_ord}} );
}
else { # Otherwise, just itself. (gen_combinations() needs a ref)
@{$this_fold_ref->[$j]} = ( $this_ord );