diff options
author | Karl Williamson <public@khwilliamson.com> | 2010-11-15 12:53:27 -0700 |
---|---|---|
committer | Father Chrysostomos <sprout@cpan.org> | 2010-11-22 13:32:56 -0800 |
commit | 00c072cfb7c241474c4ff06fecb0de3a4f0e46de (patch) | |
tree | 2371f52adca6be2cdc8d8d072659f6ca9c55ecac /Porting/mk_PL_charclass.pl | |
parent | a9ef53327dcfcb93436724193f806bbf675227db (diff) | |
download | perl-00c072cfb7c241474c4ff06fecb0de3a4f0e46de.tar.gz |
mk_PL_charclass.pl: Find non-latin1 folds
The output of this .pl is to be used as the main table in
l1_char_class_tab.h. Add a new bit to indicate if a Latin1 character
particpates in a a simple fold with a character outside the Latin1
range. This will be used by regcomp.c to make decisions about how to
compile regexes.
Diffstat (limited to 'Porting/mk_PL_charclass.pl')
-rw-r--r-- | Porting/mk_PL_charclass.pl | 43 |
1 files changed, 43 insertions, 0 deletions
diff --git a/Porting/mk_PL_charclass.pl b/Porting/mk_PL_charclass.pl index 25293b988f..697594bb22 100644 --- a/Porting/mk_PL_charclass.pl +++ b/Porting/mk_PL_charclass.pl @@ -2,6 +2,7 @@ use 5.012; use strict; use warnings; +use Config; # This program outputs the 256 lines that form the guts of the PL_charclass # table. The output should be used to manually replace the table contents in @@ -54,8 +55,50 @@ my @properties = qw( XDIGIT_A ); +# Read in the case fold mappings. +my %folded_closure; +my $file="$Config{privlib}/unicore/CaseFolding.txt"; +open my $fh, "<", $file or die "Failed to read '$file': $!"; +while (<$fh>) { + chomp; + + # Lines look like (without the initial '#' + #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE + my ($line, $comment) = split / \s+ \# \s+ /x, $_; + next if $line eq "" || substr($line, 0, 1) eq '#'; + my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line; + + my $from = hex $hex_from; + + # Perl only deals with C and F folds + next if $fold_type ne 'C' and $fold_type ne 'F'; + next if $fold_type ne 'C'; # And for now, just single-char folds. XXX + + # Get each code point in the range that participates in this line's fold. + # The hash has keys of each code point in the range, and values of what it + # folds to and what folds to it + foreach my $hex_fold (@folded) { + my $fold = hex $hex_fold; + push @{$folded_closure{$fold}}, $from if $fold < 256; + push @{$folded_closure{$from}}, $fold if $from < 256; + } +} + +# Now having read all the lines, combine them into the full closure of each +# code point in the range by adding lists together that share a common element +foreach my $folded (keys %folded_closure) { + foreach my $from (grep { $_ < 256 } @{$folded_closure{$folded}}) { + push @{$folded_closure{$from}}, @{$folded_closure{$folded}}; + } +} + my @bits; # Bit map for each code point +foreach my $folded (keys %folded_closure) { + $bits[$folded] = "_CC_NONLATIN1_FOLD" if grep { $_ > 255 } + @{$folded_closure{$folded}}; +} + for my $ord (0..255) { my $char = chr($ord); utf8::upgrade($char); # Important to use Unicode semantics! |