diff options
author | Chris 'BinGOs' Williams <chris@bingosnet.co.uk> | 2010-11-07 13:31:59 +0000 |
---|---|---|
committer | Chris 'BinGOs' Williams <chris@bingosnet.co.uk> | 2010-11-07 13:32:48 +0000 |
commit | 584e761d155b7f9055c1f0566f42a2ef083716d8 (patch) | |
tree | 65f48ccaaf4e2d8400cf29e903cce0c9ea8d8822 /cpan/Unicode-Collate/Collate.pm | |
parent | 90e83bc9e824bf1896bb96af89ec7c40ed9eebf2 (diff) | |
download | perl-584e761d155b7f9055c1f0566f42a2ef083716d8.tar.gz |
Update Unicode-Collate to CPAN version 0.66
[DELTA]
0.66 Sun Nov 7 10:47:30 2010
- U::C::Locale newly supports locale: ko.
- added Unicode::Collate::CJK::Korean for ko.
- added t/loc_ko.t.
- 12 compat. ideographs (e.g. U+FA0E) are treated as unified ideographs.
(Though DUCET also does this, Unicode::Collate now does it even without DUCET.)
- added t/compatui.t.
! Ideographs Ext.B (U+20000..U+2A6D6) can be overridden with UCA_Version 8.
This is a long-standing behavior from Unicode::Collate 0.11 to 0.63.
A wrong fix at 0.64 should be abandoned.
Diffstat (limited to 'cpan/Unicode-Collate/Collate.pm')
-rw-r--r-- | cpan/Unicode-Collate/Collate.pm | 120 |
1 files changed, 72 insertions, 48 deletions
diff --git a/cpan/Unicode-Collate/Collate.pm b/cpan/Unicode-Collate/Collate.pm index 0133ab47d5..4345d7d263 100644 --- a/cpan/Unicode-Collate/Collate.pm +++ b/cpan/Unicode-Collate/Collate.pm @@ -14,7 +14,7 @@ use File::Spec; no warnings 'utf8'; -our $VERSION = '0.65'; +our $VERSION = '0.66'; our $PACKAGE = __PACKAGE__; my @Path = qw(Unicode Collate); @@ -108,6 +108,11 @@ use constant CJK_ExtBFin => 0x2A6D6; # Unicode 3.1.0 use constant CJK_ExtCIni => 0x2A700; # Unicode 5.2.0 use constant CJK_ExtCFin => 0x2B734; # Unicode 5.2.0 +my %CompatUI = map +($_ => 1), ( + 0xFA0E, 0xFA0F, 0xFA11, 0xFA13, 0xFA14, 0xFA1F, + 0xFA21, 0xFA23, 0xFA24, 0xFA27, 0xFA28, 0xFA29, +); + # Logical_Order_Exception in PropList.txt my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ]; @@ -667,13 +672,16 @@ sub getWt } @decH); } return map _varCE($vbl, $_), @hangulCE; - } elsif (_isUIdeo($u, $self->{UCA_Version})) { - my $cjk = $self->{overrideCJK}; - my @cjkCE = $cjk ? map(_pack_override($_, $u, $der), $cjk->($u)) - : defined $cjk && $self->{UCA_Version} <= 8 - ? _uideoCE_8($u) : $der->($u); - return map _varCE($vbl, $_), @cjkCE; } else { + my $cjk = $self->{overrideCJK}; + my $vers = $self->{UCA_Version}; + if ($cjk && _isUIdeo($u, $vers)) { + my @cjkCE = map _pack_override($_, $u, $der), $cjk->($u); + return map _varCE($vbl, $_), @cjkCE; + } + if ($vers == 8 && defined $cjk && _isUIdeo($u, 0)) { + return map _varCE($vbl, $_), _uideoCE_8($u); + } return map _varCE($vbl, $_), $der->($u); } } @@ -790,11 +798,13 @@ sub sort { sub _derivCE_20 { my $u = shift; - my $base = (CJK_UidIni <= $u && $u <= CJK_UidF52) ? 0xFB40 : # CJK + my $base = (CJK_UidIni <= $u && $u <= CJK_UidF52 || $CompatUI{$u}) + ? 0xFB40 : # CJK (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin || CJK_ExtBIni <= $u && $u <= CJK_ExtBFin || - CJK_ExtCIni <= $u && $u <= CJK_ExtCFin) ? 0xFB80 # CJK ext. - : 0xFBC0; # others + CJK_ExtCIni <= $u && $u <= CJK_ExtCFin) + ? 0xFB80 # CJK ext. 
+ : 0xFBC0; # others my $aaaa = $base + ($u >> 15); my $bbbb = ($u & 0x7FFF) | 0x8000; return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u), @@ -803,10 +813,12 @@ sub _derivCE_20 { sub _derivCE_18 { my $u = shift; - my $base = (CJK_UidIni <= $u && $u <= CJK_UidF51) ? 0xFB40 : # CJK + my $base = (CJK_UidIni <= $u && $u <= CJK_UidF51 || $CompatUI{$u}) + ? 0xFB40 : # CJK (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin || - CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) ? 0xFB80 # CJK ext. - : 0xFBC0; # others + CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) + ? 0xFB80 # CJK ext. + : 0xFBC0; # others my $aaaa = $base + ($u >> 15); my $bbbb = ($u & 0x7FFF) | 0x8000; return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u), @@ -815,10 +827,12 @@ sub _derivCE_18 { sub _derivCE_14 { my $u = shift; - my $base = (CJK_UidIni <= $u && $u <= CJK_UidF41) ? 0xFB40 : # CJK + my $base = (CJK_UidIni <= $u && $u <= CJK_UidF41 || $CompatUI{$u}) + ? 0xFB40 : # CJK (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin || - CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) ? 0xFB80 # CJK ext. - : 0xFBC0; # others + CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) + ? 0xFB80 # CJK ext. + : 0xFBC0; # others my $aaaa = $base + ($u >> 15); my $bbbb = ($u & 0x7FFF) | 0x8000; return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u), @@ -827,10 +841,12 @@ sub _derivCE_14 { sub _derivCE_9 { my $u = shift; - my $base = (CJK_UidIni <= $u && $u <= CJK_UidFin) ? 0xFB40 : # CJK + my $base = (CJK_UidIni <= $u && $u <= CJK_UidFin || $CompatUI{$u}) + ? 0xFB40 : # CJK (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin || - CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) ? 0xFB80 # CJK ext. - : 0xFBC0; # others + CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) + ? 0xFB80 # CJK ext. 
+ : 0xFBC0; # others my $aaaa = $base + ($u >> 15); my $bbbb = ($u & 0x7FFF) | 0x8000; return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u), @@ -841,9 +857,8 @@ sub _derivCE_8 { my $code = shift; my $aaaa = 0xFF80 + ($code >> 15); my $bbbb = ($code & 0x7FFF) | 0x8000; - return - pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code), - pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code); + return pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code), + pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code); } sub _uideoCE_8 { @@ -852,16 +867,17 @@ sub _uideoCE_8 { } sub _isUIdeo { + # $uca_vers = 0 for _uideoCE_8() my ($u, $uca_vers) = @_; return((CJK_UidIni <= $u && ( $uca_vers >= 20 ? ($u <= CJK_UidF52) : $uca_vers >= 18 ? ($u <= CJK_UidF51) : $uca_vers >= 14 ? ($u <= CJK_UidF41) : - ($u <= CJK_UidFin))) + ($u <= CJK_UidFin))) || $CompatUI{$u} || (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin) || - ($uca_vers >= 9 && CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) + ($uca_vers >= 8 && CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) || ($uca_vers >= 20 && CJK_ExtCIni <= $u && $u <= CJK_ExtCFin) ); @@ -1432,24 +1448,24 @@ B<Unicode::Normalize> is required (see also B<CAVEAT>). -- see 7.1 Derived Collation Elements, UTS #10. -By default, CJK Unified Ideographs are ordered in Unicode codepoint -order but C<CJK Unified Ideographs> are lesser than -C<CJK Unified Ideographs Extension>. +By default, CJK unified ideographs are ordered in Unicode codepoint +order, but those in the CJK Unified Ideographs block are lesser than +those in the CJK Unified Ideographs Extension A etc. - CJK Unified Ideographs: + In CJK Unified Ideographs block: U+4E00..U+9FA5 if UCA_Version is 8 to 11; U+4E00..U+9FBB if UCA_Version is 14 to 16; U+4E00..U+9FC3 if UCA_Version is 18; U+4E00..U+9FCB if UCA_Version is 20. 
- CJK Unified Ideographs Extension: - Ext.A (U+3400..U+4DB5) if UCA_Version is 9 or greater; - Ext.B (U+20000..U+2A6D6) if UCA_Version is 9 or greater; + In CJK Unified Ideographs Extension blocks: + Ext.A (U+3400..U+4DB5) and Ext.B (U+20000..U+2A6D6) in any UCA_Version; Ext.C (U+2A700..U+2B734) if UCA_Version is 20. -Through C<overrideCJK>, ordering of CJK Unified Ideographs can be overrided. +Through C<overrideCJK>, ordering of CJK unified ideographs (including +extensions) can be overrided. -ex. CJK Unified Ideographs in the JIS code point order. +ex. CJK unified ideographs in the JIS code point order. overrideCJK => sub { my $u = shift; # get a Unicode codepoint @@ -1475,37 +1491,45 @@ collation element will be used. The return value may be a list containing zero or more of an arrayref, an integer, or C<undef>. -ex. ignores all CJK Unified Ideographs. +ex. ignores all CJK unified ideographs. overrideCJK => sub {()}, # CODEREF returning empty list # where ->eq("Pe\x{4E00}rl", "Perl") is true - # as U+4E00 is a CJK Unified Ideograph and to be ignorable. + # as U+4E00 is a CJK unified ideograph and to be ignorable. If C<undef> is passed explicitly as the value for this key, -weights for CJK Unified Ideographs are treated as undefined. -But assignment of weight for CJK Unified Ideographs -in <table> or C<entry> is still valid. +weights for CJK unified ideographs are treated as undefined. +But assignment of weight for CJK unified ideographs +in C<table> or C<entry> is still valid. + +B<Note:> In addition to them, 12 CJK compatibility ideographs (C<U+FA0E>, +C<U+FA0F>, C<U+FA11>, C<U+FA13>, C<U+FA14>, C<U+FA1F>, C<U+FA21>, C<U+FA23>, +C<U+FA24>, C<U+FA27>, C<U+FA28>, C<U+FA29>) are also treated as CJK unified +ideographs. But they can't be overrided via C<overrideCJK> when you use +DUCET, as the table includes weights for them. C<table> or C<entry> has +priority over C<overrideCJK>. =item overrideHangul -- see 7.1 Derived Collation Elements, UTS #10. 
-By default, Hangul Syllables are decomposed into Hangul Jamo, +By default, Hangul syllables are decomposed into Hangul Jamo, even if C<(normalization =E<gt> undef)>. -But the mapping of Hangul Syllables may be overrided. +But the mapping of Hangul syllables may be overrided. This parameter works like C<overrideCJK>, so see there for examples. -If you want to override the mapping of Hangul Syllables, -NFD, NFKD, and FCD are not appropriate, -since they will decompose Hangul Syllables before overriding. +If you want to override the mapping of Hangul syllables, +NFD and NFKD are not appropriate, since NFD and NFKD will decompose +Hangul syllables before overriding. FCD may decompose Hangul syllables +as the case may be. If C<undef> is passed explicitly as the value for this key, -weight for Hangul Syllables is treated as undefined +weight for Hangul syllables is treated as undefined without decomposition into Hangul Jamo. -But definition of weight for Hangul Syllables -in <table> or C<entry> is still valid. +But definition of weight for Hangul syllables +in C<table> or C<entry> is still valid. =item preprocess @@ -1560,7 +1584,7 @@ but it is not warned at present.> UTS #35 (LDML). Contractions beginning with the specified characters are suppressed, -even if those contractions are defined in <table> or C<entry>. +even if those contractions are defined in C<table> or C<entry>. An example for Russian and some languages using the Cyrillic script: @@ -1615,7 +1639,7 @@ specified as a comment (following C<#>) on each line. -- see 6.3.4 Reducing the Repertoire, UTS #10. -Undefines the collation element as if it were unassigned in the <table>. +Undefines the collation element as if it were unassigned in the C<table>. This reduces the size of the table. If an unassigned character appears in the string to be collated, the sort key is made from its codepoint |