diff options
author | Karl Williamson <public@khwilliamson.com> | 2012-03-27 09:54:53 -0600 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2012-06-02 08:29:15 -0600 |
commit | dbe1ba6ba7a5cdb7b4922771204f7ec0ed88a7b9 (patch) | |
tree | e4a7b7f8dc0564036fe8f8d8ffa4b075ed8ca7ea /regen | |
parent | 83521fcf4b2a24304c04352af0f6116208474ca7 (diff) | |
download | perl-dbe1ba6ba7a5cdb7b4922771204f7ec0ed88a7b9.tar.gz |
mk_PL_charclass.pl: Allow to work on early Unicodes
If the version of Unicode being compiled doesn't have the modern
casefolding .txt file, get the values from Unicode::UCD. Also for
EBCDIC, where otherwise the file would have to be translated.
Diffstat (limited to 'regen')
-rw-r--r-- | regen/mk_PL_charclass.pl | 45 |
1 files changed, 42 insertions, 3 deletions
diff --git a/regen/mk_PL_charclass.pl b/regen/mk_PL_charclass.pl index 5a3dbbe1f3..6a7dc92c45 100644 --- a/regen/mk_PL_charclass.pl +++ b/regen/mk_PL_charclass.pl @@ -57,8 +57,47 @@ my @properties = qw( # Read in the case fold mappings. my %folded_closure; my $file="lib/unicore/CaseFolding.txt"; -open my $fh, "<", $file or die "Failed to read '$file': $!"; -while (<$fh>) { +my @folds; +use Unicode::UCD; + +# Use the Unicode data file if we are on an ASCII platform (which its data is +# for), and it is in the modern format (starting in Unicode 3.1.0) and it is +# available. This avoids being affected by potential bugs introduced by other +# layers of Perl +if (ord('A') == 65 + && pack("C*", split /\./, Unicode::UCD::UnicodeVersion()) ge v3.1.0 + && open my $fh, "<", $file) +{ + @folds = <$fh>; +} +else { + my ($invlist_ref, $invmap_ref, undef, $default) + = Unicode::UCD::prop_invmap('Case_Folding'); + for my $i (0 .. @$invlist_ref - 1 - 1) { + next if $invmap_ref->[$i] == $default; + my $adjust = -1; + for my $j ($invlist_ref->[$i] .. $invlist_ref->[$i+1] -1) { + $adjust++; + + # Single-code point maps go to a 'C' type + if (! ref $invmap_ref->[$i]) { + push @folds, sprintf("%04X; C; %04X\n", + $j, + $invmap_ref->[$i] + $adjust); + } + else { # Multi-code point maps go to 'F'. prop_invmap() + # guarantees that no adjustment is needed for these, + # as the range will contain just one element + push @folds, sprintf("%04X; F; %s\n", + $j, + join " ", map { sprintf "%04X", $_ } + @{$invmap_ref->[$i]}); + } + } + } +} + +for (@folds) { chomp; # Lines look like (without the initial '#' @@ -230,7 +269,7 @@ my @C1 = qw( my $out_fh = open_new('l1_char_class_tab.h', '>', {style => '*', by => $0, - from => "property definitions and $file"}); + from => "property definitions"}); # Output the table using fairly short names for each char. for my $ord (0..255) { |