summaryrefslogtreecommitdiff
path: root/Porting/mk_PL_charclass.pl
diff options
context:
space:
mode:
authorKarl Williamson <public@khwilliamson.com>2010-11-15 12:53:27 -0700
committerFather Chrysostomos <sprout@cpan.org>2010-11-22 13:32:56 -0800
commit00c072cfb7c241474c4ff06fecb0de3a4f0e46de (patch)
tree2371f52adca6be2cdc8d8d072659f6ca9c55ecac /Porting/mk_PL_charclass.pl
parenta9ef53327dcfcb93436724193f806bbf675227db (diff)
downloadperl-00c072cfb7c241474c4ff06fecb0de3a4f0e46de.tar.gz
mk_PL_charclass.pl: Find non-latin1 folds
The output of this .pl is to be used as the main table in l1_char_class_tab.h. Add a new bit to indicate if a Latin1 character particpates in a a simple fold with a character outside the Latin1 range. This will be used by regcomp.c to make decisions about how to compile regexes.
Diffstat (limited to 'Porting/mk_PL_charclass.pl')
-rw-r--r--Porting/mk_PL_charclass.pl43
1 files changed, 43 insertions, 0 deletions
diff --git a/Porting/mk_PL_charclass.pl b/Porting/mk_PL_charclass.pl
index 25293b988f..697594bb22 100644
--- a/Porting/mk_PL_charclass.pl
+++ b/Porting/mk_PL_charclass.pl
@@ -2,6 +2,7 @@
use 5.012;
use strict;
use warnings;
+use Config;
# This program outputs the 256 lines that form the guts of the PL_charclass
# table. The output should be used to manually replace the table contents in
@@ -54,8 +55,50 @@ my @properties = qw(
XDIGIT_A
);
+# Read in the case fold mappings.
+my %folded_closure;
+my $file="$Config{privlib}/unicore/CaseFolding.txt";
+open my $fh, "<", $file or die "Failed to read '$file': $!";
+while (<$fh>) {
+ chomp;
+
+ # Lines look like (without the initial '#'
+ #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
+ my ($line, $comment) = split / \s+ \# \s+ /x, $_;
+ next if $line eq "" || substr($line, 0, 1) eq '#';
+ my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line;
+
+ my $from = hex $hex_from;
+
+ # Perl only deals with C and F folds
+ next if $fold_type ne 'C' and $fold_type ne 'F';
+ next if $fold_type ne 'C'; # And for now, just single-char folds. XXX
+
+ # Get each code point in the range that participates in this line's fold.
+ # The hash has keys of each code point in the range, and values of what it
+ # folds to and what folds to it
+ foreach my $hex_fold (@folded) {
+ my $fold = hex $hex_fold;
+ push @{$folded_closure{$fold}}, $from if $fold < 256;
+ push @{$folded_closure{$from}}, $fold if $from < 256;
+ }
+}
+
+# Now having read all the lines, combine them into the full closure of each
+# code point in the range by adding lists together that share a common element
+foreach my $folded (keys %folded_closure) {
+ foreach my $from (grep { $_ < 256 } @{$folded_closure{$folded}}) {
+ push @{$folded_closure{$from}}, @{$folded_closure{$folded}};
+ }
+}
+
my @bits; # Bit map for each code point
+foreach my $folded (keys %folded_closure) {
+ $bits[$folded] = "_CC_NONLATIN1_FOLD" if grep { $_ > 255 }
+ @{$folded_closure{$folded}};
+}
+
for my $ord (0..255) {
my $char = chr($ord);
utf8::upgrade($char); # Important to use Unicode semantics!