mk_PL_charclass.pl: Find non-latin1 folds

The output of this .pl is to be used as the main table in l1_char_class_tab.h. Add a new bit to indicate if a Latin1 character particpates in a a simple fold with a character outside the Latin1 range. This will be used by regcomp.c to make decisions about how to compile regexes.
author: Karl Williamson <public@khwilliamson.com> 2010-11-15 12:53:27 -0700
committer: Father Chrysostomos <sprout@cpan.org> 2010-11-22 13:32:56 -0800
commit: 00c072cfb7c241474c4ff06fecb0de3a4f0e46de (patch)
tree: 2371f52adca6be2cdc8d8d072659f6ca9c55ecac /Porting/mk_PL_charclass.pl
parent: a9ef53327dcfcb93436724193f806bbf675227db (diff)
download: perl-00c072cfb7c241474c4ff06fecb0de3a4f0e46de.tar.gz
1 files changed, 43 insertions, 0 deletions
diff --git a/Porting/mk_PL_charclass.pl b/Porting/mk_PL_charclass.pl
index 25293b988f..697594bb22 100644
--- a/Porting/mk_PL_charclass.pl
+++ b/Porting/mk_PL_charclass.pl
@@ -2,6 +2,7 @@
 use 5.012;
 use strict;
 use warnings;
+use Config;
 
 # This program outputs the 256 lines that form the guts of the PL_charclass
 # table.  The output should be used to manually replace the table contents in
@@ -54,8 +55,50 @@ my @properties = qw(
     XDIGIT_A
 );
 
+# Read in the case fold mappings.
+my %folded_closure;
+my $file="$Config{privlib}/unicore/CaseFolding.txt";
+open my $fh, "<", $file or die "Failed to read '$file': $!";
+while (<$fh>) {
+    chomp;
+
+    # Lines look like (without the initial '#'
+    #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
+    my ($line, $comment) = split / \s+ \# \s+ /x, $_;
+    next if $line eq "" || substr($line, 0, 1) eq '#';
+    my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line;
+
+    my $from = hex $hex_from;
+
+    # Perl only deals with C and F folds
+    next if $fold_type ne 'C' and $fold_type ne 'F';
+    next if $fold_type ne 'C';  # And for now, just single-char folds. XXX
+
+    # Get each code point in the range that participates in this line's fold.
+    # The hash has keys of each code point in the range, and values of what it
+    # folds to and what folds to it
+    foreach my $hex_fold (@folded) {
+        my $fold = hex $hex_fold;
+        push @{$folded_closure{$fold}}, $from if $fold < 256;
+        push @{$folded_closure{$from}}, $fold if $from < 256;
+    }
+}
+
+# Now having read all the lines, combine them into the full closure of each
+# code point in the range by adding lists together that share a common element
+foreach my $folded (keys %folded_closure) {
+    foreach my $from (grep { $_ < 256 } @{$folded_closure{$folded}}) {
+        push @{$folded_closure{$from}}, @{$folded_closure{$folded}};
+    }
+}
+
 my @bits;   # Bit map for each code point
 
+foreach my $folded (keys %folded_closure) {
+    $bits[$folded] = "_CC_NONLATIN1_FOLD" if grep { $_ > 255 }
+                                                @{$folded_closure{$folded}};
+}
+
 for my $ord (0..255) {
     my $char = chr($ord);
     utf8::upgrade($char);   # Important to use Unicode semantics!
author	Karl Williamson <public@khwilliamson.com>	2010-11-15 12:53:27 -0700
committer	Father Chrysostomos <sprout@cpan.org>	2010-11-22 13:32:56 -0800
commit	00c072cfb7c241474c4ff06fecb0de3a4f0e46de (patch)
tree	2371f52adca6be2cdc8d8d072659f6ca9c55ecac /Porting/mk_PL_charclass.pl
parent	a9ef53327dcfcb93436724193f806bbf675227db (diff)
download	perl-00c072cfb7c241474c4ff06fecb0de3a4f0e46de.tar.gz