summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKarl Williamson <public@khwilliamson.com>2012-03-26 12:31:20 -0600
committerKarl Williamson <public@khwilliamson.com>2012-06-02 08:29:13 -0600
commit727c62fffd30ba6ed698a448b17df0199eecf416 (patch)
tree83072aac10126e29a1198e7dad7d1918cb195050
parent9a8e4a54f8b3668a7ebd8f229cdfb405a1dce77c (diff)
downloadperl-727c62fffd30ba6ed698a448b17df0199eecf416.tar.gz
Unicode::UCD::casefold(): Don't use .txt file for source
This converts this function to using the outputs of prop_invmap() to get its casefolding definitions. This allows it to work on versions of Unicode which don't have this file, allows the file to not have to be installed, and removes this function from having to be different on EBCDIC platforms (which wasn't coded anyway).
-rw-r--r--lib/Unicode/UCD.pm124
1 files changed, 79 insertions, 45 deletions
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm
index cdc08a0870..99152207e6 100644
--- a/lib/Unicode/UCD.pm
+++ b/lib/Unicode/UCD.pm
@@ -1028,54 +1028,88 @@ L<http://www.unicode.org/unicode/reports/tr21>
my %CASEFOLD;
sub _casefold {
- unless (%CASEFOLD) {
- if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
- local $_;
- local $/ = "\n";
- while (<$CASEFOLDFH>) {
- if (/^([0-9A-F]+); ([CFIST]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
- my $code = hex($1);
- $CASEFOLD{$code}{'code'} = $1;
- $CASEFOLD{$code}{'turkic'} = "" unless
- defined $CASEFOLD{$code}{'turkic'};
- if ($2 eq 'C' || $2 eq 'I') { # 'I' is only on 3.1 and
- # earlier Unicodes
- # Both entries there (I
- # only checked 3.1) are
- # the same as C, and
- # there are no other
- # entries for those
- # codepoints, so treat
- # as if C, but override
- # the turkic one for
- # 'I'.
- $CASEFOLD{$code}{'status'} = $2;
- $CASEFOLD{$code}{'full'} = $CASEFOLD{$code}{'simple'} =
- $CASEFOLD{$code}{'mapping'} = $3;
- $CASEFOLD{$code}{'turkic'} = $3 if $2 eq 'I';
- } elsif ($2 eq 'F') {
- $CASEFOLD{$code}{'full'} = $3;
- unless (defined $CASEFOLD{$code}{'simple'}) {
- $CASEFOLD{$code}{'simple'} = "";
- $CASEFOLD{$code}{'mapping'} = $3;
- $CASEFOLD{$code}{'status'} = $2;
- }
- } elsif ($2 eq 'S') {
+ unless (%CASEFOLD) { # Populate the hash
+ my ($full_invlist_ref, $full_invmap_ref, undef, $default)
+ = prop_invmap('Case_Folding');
+
+ # Use the recipe given in the prop_invmap() pod to convert the
+ # inversion map into the hash.
+ for my $i (0 .. @$full_invlist_ref - 1 - 1) {
+ next if $full_invmap_ref->[$i] == $default;
+ my $adjust = -1;
+ for my $j ($full_invlist_ref->[$i] .. $full_invlist_ref->[$i+1] -1) {
+ $adjust++;
+ if (! ref $full_invmap_ref->[$i]) {
+
+ # This is a single character mapping
+ $CASEFOLD{$j}{'status'} = 'C';
+ $CASEFOLD{$j}{'simple'}
+ = $CASEFOLD{$j}{'full'}
+ = $CASEFOLD{$j}{'mapping'}
+ = sprintf("%04X", $full_invmap_ref->[$i] + $adjust);
+ $CASEFOLD{$j}{'code'} = sprintf("%04X", $j);
+ $CASEFOLD{$j}{'turkic'} = "";
+ }
+ else { # prop_invmap ensures that $adjust is 0 for a ref
+ $CASEFOLD{$j}{'status'} = 'F';
+ $CASEFOLD{$j}{'full'}
+ = $CASEFOLD{$j}{'mapping'}
+ = join " ", map { sprintf "%04X", $_ }
+ @{$full_invmap_ref->[$i]};
+ $CASEFOLD{$j}{'simple'} = "";
+ $CASEFOLD{$j}{'code'} = sprintf("%04X", $j);
+ $CASEFOLD{$j}{'turkic'} = "";
+ }
+ }
+ }
+ # We have filled in the full mappings above, assuming there were no
+ # simple ones for the ones with multi-character maps. Now, we find
+ # and fix the cases where that assumption was false.
+ (my ($simple_invlist_ref, $simple_invmap_ref, undef), $default)
+ = prop_invmap('Simple_Case_Folding');
+ for my $i (0 .. @$simple_invlist_ref - 1 - 1) {
+ next if $simple_invmap_ref->[$i] == $default;
+ my $adjust = -1;
+ for my $j ($simple_invlist_ref->[$i]
+ .. $simple_invlist_ref->[$i+1] -1)
+ {
+ $adjust++;
+ next if $CASEFOLD{$j}{'status'} eq 'C';
+ $CASEFOLD{$j}{'status'} = 'S';
+ $CASEFOLD{$j}{'simple'}
+ = $CASEFOLD{$j}{'mapping'}
+ = sprintf("%04X", $simple_invmap_ref->[$i] + $adjust);
+ $CASEFOLD{$j}{'code'} = sprintf("%04X", $j);
+ $CASEFOLD{$j}{'turkic'} = "";
+ }
+ }
- # There can't be a simple without a full, and simple
- # overrides all but full
+ # We hard-code in the turkish rules
+ UnicodeVersion() unless defined $v_unicode_version;
+ if ($v_unicode_version ge v3.2.0) {
- $CASEFOLD{$code}{'simple'} = $3;
- $CASEFOLD{$code}{'mapping'} = $3;
- $CASEFOLD{$code}{'status'} = $2;
- } elsif ($2 eq 'T') {
- $CASEFOLD{$code}{'turkic'} = $3;
- } # else can't happen because only [CIFST] are possible
- }
- }
- close($CASEFOLDFH);
- }
+ # These two code points should already have regular entries, so
+ # just fill in the turkish fields
+ $CASEFOLD{ord('I')}{'turkic'} = '0131';
+ $CASEFOLD{0x130}{'turkic'} = sprintf "%04X", ord('i');
+ }
+ elsif ($v_unicode_version ge v3.1.0) {
+
+ # These two code points don't have entries otherwise.
+ $CASEFOLD{0x130}{'code'} = '0130';
+ $CASEFOLD{0x131}{'code'} = '0131';
+ $CASEFOLD{0x130}{'status'} = $CASEFOLD{0x131}{'status'} = 'I';
+ $CASEFOLD{0x130}{'turkic'}
+ = $CASEFOLD{0x130}{'mapping'}
+ = $CASEFOLD{0x130}{'full'}
+ = $CASEFOLD{0x130}{'simple'}
+ = $CASEFOLD{0x131}{'turkic'}
+ = $CASEFOLD{0x131}{'mapping'}
+ = $CASEFOLD{0x131}{'full'}
+ = $CASEFOLD{0x131}{'simple'}
+ = sprintf "%04X", ord('i');
+ }
}
}