From 60f577e0304ce0cd93ca30edfeb534713ea7ffd9 Mon Sep 17 00:00:00 2001 From: Steve Hay Date: Sun, 1 Sep 2013 14:59:01 +0100 Subject: Upgrade Unicode::Collate from version 0.98 to 0.99 --- cpan/Unicode-Collate/Collate.pm | 108 ++++++++++++++++++++++++++++++---------- 1 file changed, 82 insertions(+), 26 deletions(-) (limited to 'cpan/Unicode-Collate/Collate.pm') diff --git a/cpan/Unicode-Collate/Collate.pm b/cpan/Unicode-Collate/Collate.pm index 388da67e30..48840ecd74 100644 --- a/cpan/Unicode-Collate/Collate.pm +++ b/cpan/Unicode-Collate/Collate.pm @@ -14,7 +14,7 @@ use File::Spec; no warnings 'utf8'; -our $VERSION = '0.98'; +our $VERSION = '0.99'; our $PACKAGE = __PACKAGE__; ### begin XS only ### @@ -106,7 +106,7 @@ my (%VariableOK); our @ChangeOK = qw/ alternate backwards level normalization rearrange katakana_before_hiragana upper_before_lower ignore_level2 - overrideHangul overrideCJK preprocess UCA_Version + overrideCJK overrideHangul overrideOut preprocess UCA_Version hangul_terminator variable identical highestFFFF minimalFFFE /; @@ -497,7 +497,7 @@ sub splitEnt # remove a code point marked as a completely ignorable. 
for (my $i = 0; $i < @src; $i++) { - if (_isIllegal($src[$i]) || $vers <= 20 && _isNonchar($src[$i])) { + if ($vers <= 20 && _isIllegal($src[$i])) { $src[$i] = undef; } elsif ($ver9) { $src[$i] = undef if $map->{ $src[$i] } @@ -621,25 +621,27 @@ sub getWt my $u = shift; my $map = $self->{mapping}; my $der = $self->{derivCode}; + my $out = $self->{overrideOut}; my $uXS = $self->{__useXS}; ### XS only return if !defined $u; return $self->varCE($HighestVCE) if $u eq 0xFFFF && $self->{highestFFFF}; return $self->varCE($minimalVCE) if $u eq 0xFFFE && $self->{minimalFFFE}; - return map($self->varCE($_), @{ $map->{$u} }) if $map->{$u}; + $u = 0xFFFD if $u !~ /;/ && 0x10FFFF < $u && !$out; + + my @ce; + if ($map->{$u}) { + @ce = @{ $map->{$u} }; # $u may be a contraction ### begin XS only ### - return map($self->varCE($_), _fetch_simple($u)) - if $uXS && _exists_simple($u); + } elsif ($uXS && _exists_simple($u)) { + @ce = _fetch_simple($u); ### end XS only ### - - # JCPS must not be a contraction, then it's a code point. - if (Hangul_SIni <= $u && $u <= Hangul_SFin) { + } elsif (Hangul_SIni <= $u && $u <= Hangul_SFin) { my $hang = $self->{overrideHangul}; - my @hangulCE; if ($hang) { - @hangulCE = map _pack_override($_, $u, $der), $hang->($u); + @ce = map _pack_override($_, $u, $der), $hang->($u); } elsif (!defined $hang) { - @hangulCE = $der->($u); + @ce = $der->($u); } else { my $max = $self->{maxlength}; my @decH = _decompHangul($u); @@ -665,25 +667,26 @@ sub getWt } } - @hangulCE = map({ + @ce = map({ $map->{$_} ? @{ $map->{$_} } : $uXS && _exists_simple($_) ? 
_fetch_simple($_) : ### XS only $der->($_); } @decH); } - return map $self->varCE($_), @hangulCE; + } elsif ($out && 0x10FFFF < $u) { + @ce = map _pack_override($_, $u, $der), $out->($u); } else { my $cjk = $self->{overrideCJK}; my $vers = $self->{UCA_Version}; if ($cjk && _isUIdeo($u, $vers)) { - my @cjkCE = map _pack_override($_, $u, $der), $cjk->($u); - return map $self->varCE($_), @cjkCE; - } - if ($vers == 8 && defined $cjk && _isUIdeo($u, 0)) { - return map $self->varCE($_), _uideoCE_8($u); + @ce = map _pack_override($_, $u, $der), $cjk->($u); + } elsif ($vers == 8 && defined $cjk && _isUIdeo($u, 0)) { + @ce = _uideoCE_8($u); + } else { + @ce = $der->($u); } - return map $self->varCE($_), $der->($u); } + return map $self->varCE($_), @ce; } @@ -1095,6 +1098,9 @@ The following revisions are supported. The default is 26. * Noncharacters (e.g. U+FFFF) are not ignored, and can be overridden since C 22. +* Out-of-range codepoints (greater than U+10FFFF) are not ignored, +and can be overridden since C 22. + * Fully ignorable characters were ignored, and would not interrupt contractions with C 9 and 11. @@ -1216,7 +1222,8 @@ almost, but the latter has a problem that you should know which letter is next to C. For a certain language where C as the next letter, C<"abch"> is greater than C<"abc\x{FFFF}">, but less than C<"abd">. -Note: This is equivalent to C 'FFFF ; [.FFFE.0020.0005.FFFF]'>. +Note: +This is equivalent to C<(entry =E 'FFFF ; [.FFFE.0020.0005.FFFF]')>. Any other character than C can be tailored by C. =item identical @@ -1325,7 +1332,8 @@ then C<$a2> and C<$b2> at level 1, as followed. "b\x{FFFE}aaa" "bbb\x{FFFE}a" -Note: This is equivalent to C 'FFFE ; [.0001.0020.0005.FFFE]'>. +Note: +This is equivalent to C<(entry =E 'FFFE ; [.0001.0020.0005.FFFE]')>. Any other character than C can be tailored by C. =item normalization @@ -1425,10 +1433,16 @@ ex. ignores all CJK unified ideographs. 
# where ->eq("Pe\x{4E00}rl", "Perl") is true # as U+4E00 is a CJK unified ideograph and to be ignorable. -If C is passed explicitly as the value for this key, -weights for CJK unified ideographs are treated as undefined. +If a false value (including C) is passed, C +has no effect. +C<$Collator-Echange(overrideCJK =E 0)> resets the old one. + But assignment of weight for CJK unified ideographs in C or C is still valid. +If C is passed explicitly as the value for this key, +weights for CJK unified ideographs are treated as undefined. +However when C E 8, C<(overrideCJK =E undef)> +has no special meaning. B In addition to them, 12 CJK compatibility ideographs (C, C, C, C, C, C, C, C, @@ -1452,12 +1466,54 @@ NFD and NFKD are not appropriate, since NFD and NFKD will decompose Hangul syllables before overriding. FCD may decompose Hangul syllables as the case may be. +If a false value (but not C) is passed, C +has no effect. +C<$Collator-Echange(overrideHangul =E 0)> resets the old one. + If C is passed explicitly as the value for this key, weight for Hangul syllables is treated as undefined without decomposition into Hangul Jamo. But definition of weight for Hangul syllables in C
or C is still valid. +=item overrideOut + +-- see 7.1.1 Handling Ill-Formed Code Unit Sequences, UTS #10. + +Perl seems to allow out-of-range values (greater than 0x10FFFF). +By default, out-of-range values are replaced with C<U+FFFD> +(REPLACEMENT CHARACTER) when C<UCA_Version> E<gt>= 22, +or ignored when C<UCA_Version> E<lt>= 20. + +When C<UCA_Version> E<gt>= 22, the weights of out-of-range values +can be overridden. Though C
or C are available for them, +out-of-range values are too many. + +C<overrideOut> can perform it algorithmically. +This parameter works like C, so see there for examples. + +ex. ignores all out-of-range values. + + overrideOut => sub {()}, # CODEREF returning empty list + +If a false value (including C<undef>) is passed, C<overrideOut> +has no effect. +C<$Collator-E<gt>change(overrideOut =E<gt> 0)> resets the old one. + +UCA recommends that out-of-range values should not be ignored for security +reasons. Say, C<"pe\x{110000}rl"> should not be equal to C<"perl">. +However, C<U+FFFD> is wrongly mapped to a variable collation element +in DUCET for Unicode 6.0.0 to 6.2.0, that means out-of-range values will be +ignored when C isn't C. + +Unicode 6.3.0 will correct the mapping of C<U+FFFD>. +see L. +Such a correction is reproduced by this. + + overrideOut => sub { 0xFFFD }, # CODEREF returning a very large integer + +Since Unicode 6.3.0, C<(overrideOut =E<gt> sub { 0xFFFD })> may be unnecessary. + =item preprocess -- see 5.4 Preprocessing, UTS #10. @@ -1559,7 +1615,7 @@ may be better to avoid namespace conflict. B: When XSUB is used, the DUCET is compiled on building this module, and it may save time at the run time. -Explicit saying C
'allkeys.txt'> (or using another table), +Explicit saying C<(table =E<gt> 'allkeys.txt')>, or using another table, or using C, C, C, C or C will prevent this module from using the compiled DUCET. @@ -1934,7 +1990,7 @@ module (see L). If you need not it (say, in the case when you need not handle any combining characters), -assign C undef> explicitly. +assign C<(normalization =E<gt> undef)> explicitly. -- see 6.5 Avoiding Normalization, UTS #10. -- cgit v1.2.1