diff options
author | Chris 'BinGOs' Williams <chris@bingosnet.co.uk> | 2010-11-14 12:07:03 +0000 |
---|---|---|
committer | Chris 'BinGOs' Williams <chris@bingosnet.co.uk> | 2010-11-14 12:07:03 +0000 |
commit | b5d9a95357621a0a9d375ff6a83672c7f150655e (patch) | |
tree | 9f8d4bc2703f353701669efb7159ff490a921d80 /cpan | |
parent | e1603486d4c4f2826ab41e50c9315d9914cc9dfa (diff) | |
download | perl-b5d9a95357621a0a9d375ff6a83672c7f150655e.tar.gz |
Update Unicode-Collate to CPAN version 0.67
[DELTA]
0.67 Sun Nov 14 11:38:59 2010
- supported UCA_Version 22 for Unicode 6.0.0.
* 2B740..2B81D are new CJK unified ideographs.
* noncharacters (e.g. U+FFFF) should be overridable, not be ignored.
! DUCET is NOT updated, as no maint perl supports Unicode 6.0.0.
Thus the default UCA_Version is still 20.
- added t/nonchar.t.
- improved discontiguous contractions of 3 or more characters.
(e.g. 0FB2 0F71 0F80 and 0FB3 0F71 0F80)
- auxiliary: now 'mklocale' also copes with Korean.pm according to DUCET.
Diffstat (limited to 'cpan')
-rw-r--r-- | cpan/Unicode-Collate/Changes | 16 | ||||
-rw-r--r-- | cpan/Unicode-Collate/Collate.pm | 174 | ||||
-rw-r--r-- | cpan/Unicode-Collate/Collate/CJK/Big5.pm | 2 | ||||
-rw-r--r-- | cpan/Unicode-Collate/Collate/Locale.pm | 2 | ||||
-rw-r--r-- | cpan/Unicode-Collate/Collate/Locale/hy.pl | 4 | ||||
-rw-r--r-- | cpan/Unicode-Collate/README | 13 | ||||
-rw-r--r-- | cpan/Unicode-Collate/t/cjkrange.t | 181 | ||||
-rw-r--r-- | cpan/Unicode-Collate/t/compatui.t | 4 | ||||
-rw-r--r-- | cpan/Unicode-Collate/t/contract.t | 113 | ||||
-rw-r--r-- | cpan/Unicode-Collate/t/illegal.t | 147 | ||||
-rw-r--r-- | cpan/Unicode-Collate/t/loc_hy.t | 18 | ||||
-rw-r--r-- | cpan/Unicode-Collate/t/nonchar.t | 120 | ||||
-rw-r--r-- | cpan/Unicode-Collate/t/overcjk0.t | 132 | ||||
-rw-r--r-- | cpan/Unicode-Collate/t/overcjk1.t | 109 |
14 files changed, 639 insertions, 396 deletions
diff --git a/cpan/Unicode-Collate/Changes b/cpan/Unicode-Collate/Changes index 263208b0cb..329bcda838 100644 --- a/cpan/Unicode-Collate/Changes +++ b/cpan/Unicode-Collate/Changes @@ -1,5 +1,16 @@ Revision history for Perl module Unicode::Collate. +0.67 Sun Nov 14 11:38:59 2010 + - supported UCA_Version 22 for Unicode 6.0.0. + * 2B740..2B81D are new CJK unified ideographs. + * noncharacters (e.g. U+FFFF) should be overridable, not be ignored. + ! DUCET is NOT updated, as no maint perl supports Unicode 6.0.0. + Thus the default UCA_Version is still 20. + - added t/nonchar.t. + - improved discontiguous contractions of 3 or more characters. + (e.g. 0FB2 0F71 0F80 and 0FB3 0F71 0F80) + - auxiliary: now 'mklocale' also copes with Korean.pm according to DUCET. + 0.66 Sun Nov 7 10:47:30 2010 - U::C::Locale newly supports locale: ko. - added Unicode::Collate::CJK::Korean for ko. @@ -93,6 +104,7 @@ Revision history for Perl module Unicode::Collate. - supported locales: cs, es, es__traditional, fr, nn, pl. ! added t/locale*.t that uses DUCET. (locale_cs.t, locale_fr.t, locale_nn.t, locale_pl.t, locale_test.t) + - data/*.txt and mklocale for preparation of Locale/*.pl from DUCET. 0.54 Sun Jul 25 21:37:04 2010 - Now UCA Revision 20 (based on Unicode 5.2.0). @@ -100,7 +112,7 @@ Revision history for Perl module Unicode::Collate. which *is required* to test this module. ! Please notice that allkeys.txt will be overwritten if you have had other allkeys.txt already. - - U+9FC4..U+9FCB and U+2A700..U+2B734 are new CJK Unified Ideographs. + - U+9FC4..U+9FCB and U+2A700..U+2B734 are new CJK unified ideographs. - Many hangul jamo are assigned (affecting hangul_terminator). ! DUCET will be compiled when XS is used. Explicit saying @@ -115,7 +127,7 @@ Revision history for Perl module Unicode::Collate. which is not required to test this module. ! Please notice that allkeys.txt will be overwritten if you have had other allkeys.txt already. - - U+9FBC..U+9FC3 are new CJK Unified Ideographs. 
+ - U+9FBC..U+9FC3 are new CJK unified ideographs. 0.52 Thu Oct 13 21:51:09 2005 - The Unicode::Collate->new method does not destroy user's $_ any longer. diff --git a/cpan/Unicode-Collate/Collate.pm b/cpan/Unicode-Collate/Collate.pm index 4345d7d263..058c1a4593 100644 --- a/cpan/Unicode-Collate/Collate.pm +++ b/cpan/Unicode-Collate/Collate.pm @@ -14,7 +14,7 @@ use File::Spec; no warnings 'utf8'; -our $VERSION = '0.66'; +our $VERSION = '0.67'; our $PACKAGE = __PACKAGE__; my @Path = qw(Unicode Collate); @@ -80,33 +80,35 @@ use constant Hangul_LBase => 0x1100; use constant Hangul_LIni => 0x1100; use constant Hangul_LFin => 0x1159; use constant Hangul_LFill => 0x115F; -use constant Hangul_LEnd => 0x115F; # Unicode 5.2.0 +use constant Hangul_LEnd => 0x115F; # Unicode 5.2 use constant Hangul_VBase => 0x1161; use constant Hangul_VIni => 0x1160; # from Vowel Filler use constant Hangul_VFin => 0x11A2; -use constant Hangul_VEnd => 0x11A7; # Unicode 5.2.0 +use constant Hangul_VEnd => 0x11A7; # Unicode 5.2 use constant Hangul_TBase => 0x11A7; # from "no-final" codepoint use constant Hangul_TIni => 0x11A8; use constant Hangul_TFin => 0x11F9; -use constant Hangul_TEnd => 0x11FF; # Unicode 5.2.0 -use constant HangulL2Ini => 0xA960; # Unicode 5.2.0 -use constant HangulL2Fin => 0xA97C; # Unicode 5.2.0 -use constant HangulV2Ini => 0xD7B0; # Unicode 5.2.0 -use constant HangulV2Fin => 0xD7C6; # Unicode 5.2.0 -use constant HangulT2Ini => 0xD7CB; # Unicode 5.2.0 -use constant HangulT2Fin => 0xD7FB; # Unicode 5.2.0 - -use constant CJK_UidIni => 0x4E00; -use constant CJK_UidFin => 0x9FA5; -use constant CJK_UidF41 => 0x9FBB; -use constant CJK_UidF51 => 0x9FC3; -use constant CJK_UidF52 => 0x9FCB; -use constant CJK_ExtAIni => 0x3400; # Unicode 3.0.0 -use constant CJK_ExtAFin => 0x4DB5; # Unicode 3.0.0 -use constant CJK_ExtBIni => 0x20000; # Unicode 3.1.0 -use constant CJK_ExtBFin => 0x2A6D6; # Unicode 3.1.0 -use constant CJK_ExtCIni => 0x2A700; # Unicode 5.2.0 -use constant CJK_ExtCFin => 
0x2B734; # Unicode 5.2.0 +use constant Hangul_TEnd => 0x11FF; # Unicode 5.2 +use constant HangulL2Ini => 0xA960; # Unicode 5.2 +use constant HangulL2Fin => 0xA97C; # Unicode 5.2 +use constant HangulV2Ini => 0xD7B0; # Unicode 5.2 +use constant HangulV2Fin => 0xD7C6; # Unicode 5.2 +use constant HangulT2Ini => 0xD7CB; # Unicode 5.2 +use constant HangulT2Fin => 0xD7FB; # Unicode 5.2 + +use constant CJK_UidIni => 0x4E00; +use constant CJK_UidFin => 0x9FA5; +use constant CJK_UidF41 => 0x9FBB; +use constant CJK_UidF51 => 0x9FC3; +use constant CJK_UidF52 => 0x9FCB; +use constant CJK_ExtAIni => 0x3400; # Unicode 3.0 +use constant CJK_ExtAFin => 0x4DB5; # Unicode 3.0 +use constant CJK_ExtBIni => 0x20000; # Unicode 3.1 +use constant CJK_ExtBFin => 0x2A6D6; # Unicode 3.1 +use constant CJK_ExtCIni => 0x2A700; # Unicode 5.2 +use constant CJK_ExtCFin => 0x2B734; # Unicode 5.2 +use constant CJK_ExtDIni => 0x2B740; # Unicode 6.0 +use constant CJK_ExtDFin => 0x2B81D; # Unicode 6.0 my %CompatUI = map +($_ => 1), ( 0xFA0E, 0xFA0F, 0xFA11, 0xFA13, 0xFA14, 0xFA1F, @@ -145,7 +147,7 @@ our @ChangeOK = qw/ /; our @ChangeNG = qw/ - entry mapping table maxlength + entry mapping table maxlength contraction ignoreChar ignoreName undefChar undefName variableTable versionTable alternateTable backwardsTable forwardsTable rearrangeTable derivCode normCode rearrangeHash backwardsFlag @@ -209,6 +211,7 @@ my %DerivCode = ( 16 => \&_derivCE_14, # 16 == 14 18 => \&_derivCE_18, 20 => \&_derivCE_20, + 22 => \&_derivCE_22, ); sub checkCollator { @@ -428,8 +431,16 @@ sub parseEntry $self->{mapping}{$entry} = $is_L3_ignorable ? 
[] : \@key; if (@uv > 1) { - (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv) - and $self->{maxlength}{$uv[0]} = @uv; + if (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv) { + $self->{maxlength}{$uv[0]} = @uv; + } + } + if (@uv > 2) { + while (@uv) { + pop @uv; + my $fake_entry = join(CODE_SEP, @uv); # in JCPS + $self->{contraction}{$fake_entry} = 1; + } } } @@ -493,7 +504,8 @@ sub splitEnt my $map = $self->{mapping}; my $max = $self->{maxlength}; my $reH = $self->{rearrangeHash}; - my $ver9 = $self->{UCA_Version} >= 9 && $self->{UCA_Version} <= 11; + my $vers = $self->{UCA_Version}; + my $ver9 = $vers >= 9 && $vers <= 11; my ($str, @buf); @@ -527,9 +539,12 @@ sub splitEnt # remove a code point marked as a completely ignorable. for (my $i = 0; $i < @src; $i++) { - $src[$i] = undef - if _isIllegal($src[$i]) || ($ver9 && - $map->{ $src[$i] } && @{ $map->{ $src[$i] } } == 0); + if (_isIllegal($src[$i]) || $vers <= 20 && _isNonchar($src[$i])) { + $src[$i] = undef; + } elsif ($ver9) { + $src[$i] = undef if $map->{ $src[$i] } && + @{ $map->{ $src[$i] } } == 0; + } } for (my $i = 0; $i < @src; $i++) { @@ -561,31 +576,49 @@ sub splitEnt } } - # not-contiguous contraction with Combining Char (cf. UTS#10, S2.1). + # discontiguous contraction with Combining Char (cf. UTS#10, S2.1). # This process requires Unicode::Normalize. # If "normalization" is undef, here should be skipped *always* # (in spite of bool value of $CVgetCombinClass), # since canonical ordering cannot be expected. # Blocked combining character should not be contracted. - if ($self->{normalization}) # $self->{normCode} is false in the case of "prenormalized". - { + if ($self->{normalization}) { + my $cont = $self->{contraction}; my $preCC = 0; - my $curCC = 0; + my $preCC_uc = 0; + my $jcps_uc = $jcps; + my(@out, @out_uc); for (my $p = $i + 1; $p < @src; $p++) { next if ! 
defined $src[$p]; - $curCC = $CVgetCombinClass->($src[$p]); + my $curCC = $CVgetCombinClass->($src[$p]); last unless $curCC; my $tail = CODE_SEP . $src[$p]; + + if ($preCC_uc != $curCC && ($map->{$jcps_uc.$tail} || + $cont->{$jcps_uc.$tail})) { + $jcps_uc .= $tail; + push @out_uc, $p; + } else { + $preCC_uc = $curCC; + } + if ($preCC != $curCC && $map->{$jcps.$tail}) { $jcps .= $tail; - $src[$p] = undef; + push @out, $p; } else { $preCC = $curCC; } } + + if ($map->{$jcps_uc}) { + $jcps = $jcps_uc; + $src[$_] = undef for @out_uc; + } else { + $src[$_] = undef for @out; + } } } @@ -796,6 +829,22 @@ sub sort { } +sub _derivCE_22 { + my $u = shift; + my $base = (CJK_UidIni <= $u && $u <= CJK_UidF52 || $CompatUI{$u}) + ? 0xFB40 : # CJK + (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin || + CJK_ExtBIni <= $u && $u <= CJK_ExtBFin || + CJK_ExtCIni <= $u && $u <= CJK_ExtCFin || + CJK_ExtDIni <= $u && $u <= CJK_ExtDFin) + ? 0xFB80 # CJK ext. + : 0xFBC0; # others + my $aaaa = $base + ($u >> 15); + my $bbbb = ($u & 0x7FFF) | 0x8000; + return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u), + pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u); +} + sub _derivCE_20 { my $u = shift; my $base = (CJK_UidIni <= $u && $u <= CJK_UidF52 || $CompatUI{$u}) @@ -880,6 +929,8 @@ sub _isUIdeo { ($uca_vers >= 8 && CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) || ($uca_vers >= 20 && CJK_ExtCIni <= $u && $u <= CJK_ExtCFin) + || + ($uca_vers >= 22 && CJK_ExtDIni <= $u && $u <= CJK_ExtDFin) ); } @@ -908,12 +959,17 @@ sub _decompHangul { sub _isIllegal { my $code = shift; - return ! defined $code # removed + return((! defined $code) # removed || ($code < 0 || 0x10FFFF < $code) # out of range - || (($code & 0xFFFE) == 0xFFFE) # ??FFF[EF] (cf. utf8.c) + ); +} + +sub _isNonchar { + my $code = shift; + return((($code & 0xFFFE) == 0xFFFE) # ??FFF[EF] (cf. 
utf8.c) || (0xD800 <= $code && $code <= 0xDFFF) # unpaired surrogates || (0xFDD0 <= $code && $code <= 0xFDEF) # other non-characters - ; + ); } # Hangul Syllable Type @@ -1249,12 +1305,11 @@ with no parameters, the collator should do the default collation. If the tracking version number of UCA is given, behavior of that tracking version is emulated on collating. If omitted, the return value of C<UCA_Version()> is used. -C<UCA_Version()> should return the latest tracking version supported. -The supported tracking version: 8, 9, 11, 14, 16, 18 or 20. +The following tracking versions are supported. The default is 20. UCA Unicode Standard DUCET (@version) - --------------------------------------------------- + ------------------------------------------------------- 8 3.1 3.0.1 (3.0.1d9) 9 3.1 with Corrigendum 3 3.1.1 (3.1.1) 11 4.0 4.0.0 (4.0.0) @@ -1262,9 +1317,25 @@ The supported tracking version: 8, 9, 11, 14, 16, 18 or 20. 16 5.0 5.0.0 (5.0.0) 18 5.1.0 5.1.0 (5.1.0) 20 5.2.0 5.2.0 (5.2.0) + 22 6.0.0 6.0.0 (6.0.0) Note: Recent UTS #10 renames "Tracking Version" to "Revision." +* Noncharacters (e.g. U+FFFF) are not ignored, and can be overridden +since C<UCA_Version> 22. + +* Fully ignorable characters were ignored, and would not interrupt +contractions with C<UCA_Version> 9 and 11. + +* Treatment of ignorables after variables and some behaviors +were changed at C<UCA_Version> 9. + +* Characters regarded as CJK unified ideographs (cf. C<overrideCJK>) +depend on C<UCA_Version>. + +* Many hangul jamo are assigned at C<UCA_Version> 20, which will affect +C<hangul_terminator>. + =item alternate -- see 3.2.2 Alternate Weighting, version 8 of UTS #10 @@ -1434,7 +1505,7 @@ B<is not> equivalent to C<(normalization =E<gt> 'NFD')>. In the case of C<(normalization =E<gt> "prenormalized")>, any normalization is not performed, but -non-contiguous contractions with combining characters are performed. +discontiguous contractions with combining characters are performed. 
Therefore C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })> B<is> equivalent to C<(normalization =E<gt> 'NFD')>. @@ -1452,15 +1523,16 @@ By default, CJK unified ideographs are ordered in Unicode codepoint order, but those in the CJK Unified Ideographs block are lesser than those in the CJK Unified Ideographs Extension A etc. - In CJK Unified Ideographs block: - U+4E00..U+9FA5 if UCA_Version is 8 to 11; - U+4E00..U+9FBB if UCA_Version is 14 to 16; - U+4E00..U+9FC3 if UCA_Version is 18; - U+4E00..U+9FCB if UCA_Version is 20. + In the CJK Unified Ideographs block: + U+4E00..U+9FA5 if UCA_Version is 8 to 11. + U+4E00..U+9FBB if UCA_Version is 14 to 16. + U+4E00..U+9FC3 if UCA_Version is 18. + U+4E00..U+9FCB if UCA_Version is 20 or greater. - In CJK Unified Ideographs Extension blocks: - Ext.A (U+3400..U+4DB5) and Ext.B (U+20000..U+2A6D6) in any UCA_Version; - Ext.C (U+2A700..U+2B734) if UCA_Version is 20. + In the CJK Unified Ideographs Extension blocks: + Ext.A (U+3400..U+4DB5) and Ext.B (U+20000..U+2A6D6) in any UCA_Version. + Ext.C (U+2A700..U+2B734) if UCA_Version is 20 or greater. + Ext.D (U+2B740..U+2B81D) if UCA_Version is 22 or greater. Through C<overrideCJK>, ordering of CJK unified ideographs (including extensions) can be overrided. @@ -1919,6 +1991,8 @@ returns C<"unknown">. =item C<UCA_Version()> Returns the tracking version number of UTS #10 this module consults. +C<UCA_Version()> should return the tracking version corresponding +with the DUCET incorporated. 
=item C<Base_Unicode_Version()> diff --git a/cpan/Unicode-Collate/Collate/CJK/Big5.pm b/cpan/Unicode-Collate/Collate/CJK/Big5.pm index 7b309e51bd..2d133fd8b1 100644 --- a/cpan/Unicode-Collate/Collate/CJK/Big5.pm +++ b/cpan/Unicode-Collate/Collate/CJK/Big5.pm @@ -3,7 +3,7 @@ package Unicode::Collate::CJK::Big5; use 5.006; use strict; -our $VERSION = '0.64'; +our $VERSION = '0.65'; my %u2p; my $wt = 0x8000; diff --git a/cpan/Unicode-Collate/Collate/Locale.pm b/cpan/Unicode-Collate/Collate/Locale.pm index c73a5de316..a9eea97990 100644 --- a/cpan/Unicode-Collate/Collate/Locale.pm +++ b/cpan/Unicode-Collate/Collate/Locale.pm @@ -4,7 +4,7 @@ use strict; use Carp; use base qw(Unicode::Collate); -our $VERSION = '0.66'; +our $VERSION = '0.67'; use File::Spec; diff --git a/cpan/Unicode-Collate/Collate/Locale/hy.pl b/cpan/Unicode-Collate/Collate/Locale/hy.pl index 61ee861bee..96a7a5e11c 100644 --- a/cpan/Unicode-Collate/Collate/Locale/hy.pl +++ b/cpan/Unicode-Collate/Collate/Locale/hy.pl @@ -1,6 +1,6 @@ +{ entry => <<'ENTRY', # for DUCET v5.2.0 -0587 ; [.1858.0020.0002.0584][.FFFF.0000.0000.0000] # ARMENIAN SMALL LIGATURE ECH YIWN -0535 0582 ; [.1858.0020.0008.0554][.FFFF.0000.0000.0000] # <ARMENIAN CAPITAL LETTER ECH, ARMENIAN SMALL LETTER YIWN> +0587 ; [.1858.0020.0002.0584][.FFF1.0000.0000.0000] # ARMENIAN SMALL LIGATURE ECH YIWN +0535 0582 ; [.1858.0020.0008.0554][.FFF1.0000.0000.0000] # <ARMENIAN CAPITAL LETTER ECH, ARMENIAN SMALL LETTER YIWN> ENTRY }; diff --git a/cpan/Unicode-Collate/README b/cpan/Unicode-Collate/README index 12dc5d4d10..76e9fa0047 100644 --- a/cpan/Unicode-Collate/README +++ b/cpan/Unicode-Collate/README @@ -1,4 +1,4 @@ -Unicode/Collate version 0.66 +Unicode/Collate version 0.67 =============================== NAME @@ -93,6 +93,17 @@ ABOUT DUCET from http://www.unicode.org/Public/UCA/latest/allkeys.txt to <a place in @INC>/Unicode/Collate/allkeys.txt manually. +HOW TO CHANGE DUCET (NOT WARRANTED) + + 0. 
rewriting UCA_Version and Base_Unicode_Version in Collate.pm + and t/version.t is preferred. + 1. replace Collate/allkeys.txt with a new DUCET. + 2. run mklocale to generate new Locale/*.pl and Korean.pm. + 3. replace Collate/Locale/*.pl with the new Locale/*.pl, + and Collate/CJK/Korean.pm with the new Korean.pm. + 4. make test. + IF FAIL, it may require more changes, not be easy. + AUTHOR, COPYRIGHT AND LICENSE The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki, diff --git a/cpan/Unicode-Collate/t/cjkrange.t b/cpan/Unicode-Collate/t/cjkrange.t index a6118d8e6d..83d92688eb 100644 --- a/cpan/Unicode-Collate/t/cjkrange.t +++ b/cpan/Unicode-Collate/t/cjkrange.t @@ -12,7 +12,7 @@ BEGIN { } use Test; -BEGIN { plan tests => 83 }; +BEGIN { plan tests => 321 }; # 1 + 40 x @Versions use strict; use warnings; @@ -25,112 +25,75 @@ my $Collator = Unicode::Collate->new( normalization => undef, ); -# U+9FC4..U+9FCB are CJK UI since Unicode 5.2.0. -# U+9FBC..U+9FC3 are CJK UI since Unicode 5.1.0. -# U+9FA6..U+9FBB are CJK UI since Unicode 4.1.0. -# CJK UI Ext are greater than any CJK UI. -# U+3400 ..U+4DB5 are CJK UI Ext.A since Unicode 3.0.0. -# U+20000..U+2A6D6 are CJK UI Ext.B since Unicode 3.1.0. -# U+2A700..U+2B734 are CJK UI Ext.C since Unicode 5.2.0. 
- -##### 2..13 -$Collator->change(UCA_Version => 8); -ok($Collator->gt("\x{9FA5}", "\x{3400}")); # UI > ExtA -ok($Collator->gt("\x{9FA6}", "\x{3400}")); # Unassigned > ExtA -ok($Collator->gt("\x{9FBB}", "\x{3400}")); # Unassigned > ExtA -ok($Collator->gt("\x{9FBC}", "\x{3400}")); # Unassigned > ExtA -ok($Collator->gt("\x{9FC3}", "\x{3400}")); # Unassigned > ExtA -ok($Collator->gt("\x{9FC4}", "\x{3400}")); # Unassigned > ExtA -ok($Collator->gt("\x{9FFF}", "\x{3400}")); # Unassigned > ExtA -ok($Collator->lt("\x{9FA6}", "\x{9FBB}")); # new UI > new UI -ok($Collator->lt("\x{3400}","\x{20000}")); # ExtA < Unassigned(ExtB) -ok($Collator->lt("\x{3400}","\x{2A6D6}")); # ExtA < Unassigned(ExtB) -ok($Collator->lt("\x{9FFF}","\x{20000}")); # Unassigned < Unassigned(ExtB) -ok($Collator->lt("\x{9FFF}","\x{2A6D6}")); # Unassigned < Unassigned(ExtB) - -##### 14..25 -$Collator->change(UCA_Version => 9); -ok($Collator->lt("\x{9FA5}", "\x{3400}")); # UI < ExtA -ok($Collator->gt("\x{9FA6}", "\x{3400}")); # Unassigned > ExtA -ok($Collator->gt("\x{9FBB}", "\x{3400}")); # Unassigned > ExtA -ok($Collator->gt("\x{9FBC}", "\x{3400}")); # Unassigned > ExtA -ok($Collator->gt("\x{9FC3}", "\x{3400}")); # Unassigned > ExtA -ok($Collator->gt("\x{9FC4}", "\x{3400}")); # Unassigned > ExtA -ok($Collator->gt("\x{9FFF}", "\x{3400}")); # Unassigned > ExtA -ok($Collator->lt("\x{9FA6}", "\x{9FBB}")); # Unassigned > Unassigned -ok($Collator->lt("\x{3400}","\x{20000}")); # ExtA < ExtB -ok($Collator->lt("\x{3400}","\x{2A6D6}")); # ExtA < ExtB -ok($Collator->gt("\x{9FFF}","\x{20000}")); # Unassigned > ExtB -ok($Collator->gt("\x{9FFF}","\x{2A6D6}")); # Unassigned > ExtB - -##### 26..37 -$Collator->change(UCA_Version => 11); -ok($Collator->lt("\x{9FA5}", "\x{3400}")); # UI < ExtA -ok($Collator->gt("\x{9FA6}", "\x{3400}")); # Unassigned > ExtA -ok($Collator->gt("\x{9FBB}", "\x{3400}")); # Unassigned > ExtA -ok($Collator->gt("\x{9FBC}", "\x{3400}")); # Unassigned > ExtA -ok($Collator->gt("\x{9FC3}", 
"\x{3400}")); # Unassigned > ExtA -ok($Collator->gt("\x{9FC4}", "\x{3400}")); # Unassigned > ExtA -ok($Collator->gt("\x{9FFF}", "\x{3400}")); # Unassigned > ExtA -ok($Collator->lt("\x{9FA6}", "\x{9FBB}")); # Unassigned > Unassigned -ok($Collator->lt("\x{3400}","\x{20000}")); # ExtA < ExtB -ok($Collator->lt("\x{3400}","\x{2A6D6}")); # ExtA < ExtB -ok($Collator->gt("\x{9FFF}","\x{20000}")); # Unassigned > ExtB -ok($Collator->gt("\x{9FFF}","\x{2A6D6}")); # Unassigned > ExtB - - -##### 38..49 -$Collator->change(UCA_Version => 14); -ok($Collator->lt("\x{9FA5}", "\x{3400}")); # UI < ExtA -ok($Collator->lt("\x{9FA6}", "\x{3400}")); # new UI < ExtA -ok($Collator->lt("\x{9FBB}", "\x{3400}")); # new UI < ExtA -ok($Collator->gt("\x{9FBC}", "\x{3400}")); # Unassigned > ExtA -ok($Collator->gt("\x{9FC3}", "\x{3400}")); # Unassigned > ExtA -ok($Collator->gt("\x{9FC4}", "\x{3400}")); # Unassigned > ExtA -ok($Collator->gt("\x{9FFF}", "\x{3400}")); # Unassigned > ExtA -ok($Collator->lt("\x{9FA6}", "\x{9FBB}")); # new UI > new UI -ok($Collator->lt("\x{3400}","\x{20000}")); # ExtA < ExtB -ok($Collator->lt("\x{3400}","\x{2A6D6}")); # ExtA < ExtB -ok($Collator->gt("\x{9FFF}","\x{20000}")); # Unassigned > ExtB -ok($Collator->gt("\x{9FFF}","\x{2A6D6}")); # Unassigned > ExtB - -##### 50..65 -$Collator->change(UCA_Version => 18); -ok($Collator->lt("\x{9FA5}", "\x{3400}")); # UI < ExtA -ok($Collator->lt("\x{9FA6}", "\x{3400}")); # new UI < ExtA -ok($Collator->lt("\x{9FBB}", "\x{3400}")); # new UI < ExtA -ok($Collator->lt("\x{9FBC}", "\x{3400}")); # new UI < ExtA -ok($Collator->lt("\x{9FC3}", "\x{3400}")); # new UI < ExtA -ok($Collator->gt("\x{9FC4}", "\x{3400}")); # Unassigned > ExtA -ok($Collator->gt("\x{9FCB}", "\x{3400}")); # Unassigned > ExtA -ok($Collator->gt("\x{9FCC}", "\x{3400}")); # Unassigned > ExtA -ok($Collator->gt("\x{9FFF}", "\x{3400}")); # Unassigned > ExtA -ok($Collator->lt("\x{9FA6}", "\x{9FBB}")); # new UI > new UI -ok($Collator->lt("\x{3400}","\x{20000}")); # ExtA < ExtB 
-ok($Collator->lt("\x{3400}","\x{2A6D6}")); # ExtA < ExtB -ok($Collator->gt("\x{9FFF}","\x{20000}")); # Unassigned > ExtB -ok($Collator->gt("\x{9FFF}","\x{2A6D6}")); # Unassigned > ExtB -ok($Collator->lt("\x{9FFF}","\x{2A700}")); # Unassigned < Unassigned(ExtC) -ok($Collator->lt("\x{9FFF}","\x{2B734}")); # Unassigned < Unassigned(ExtC) - -##### 65..81 -$Collator->change(UCA_Version => 20); -ok($Collator->lt("\x{9FA5}", "\x{3400}")); # UI < ExtA -ok($Collator->lt("\x{9FA6}", "\x{3400}")); # new UI < ExtA -ok($Collator->lt("\x{9FBB}", "\x{3400}")); # new UI < ExtA -ok($Collator->lt("\x{9FBC}", "\x{3400}")); # new UI < ExtA -ok($Collator->lt("\x{9FC3}", "\x{3400}")); # new UI < ExtA -ok($Collator->lt("\x{9FC4}", "\x{3400}")); # new UI < ExtA -ok($Collator->lt("\x{9FCB}", "\x{3400}")); # new UI < ExtA -ok($Collator->gt("\x{9FCC}", "\x{3400}")); # Unassigned > ExtA -ok($Collator->gt("\x{9FFF}", "\x{3400}")); # Unassigned > ExtA -ok($Collator->lt("\x{9FA6}", "\x{9FBB}")); # new UI > new UI -ok($Collator->lt("\x{3400}","\x{20000}")); # ExtA < ExtB -ok($Collator->lt("\x{3400}","\x{2A6D6}")); # ExtA < ExtB -ok($Collator->gt("\x{9FFF}","\x{20000}")); # Unassigned > ExtB -ok($Collator->gt("\x{9FFF}","\x{2A6D6}")); # Unassigned > ExtB -ok($Collator->gt("\x{9FFF}","\x{2A700}")); # Unassigned > ExtC -ok($Collator->gt("\x{9FFF}","\x{2B734}")); # Unassigned > ExtC -ok($Collator->lt("\x{9FFF}","\x{2B735}")); # Unassigned < Unassigned -ok($Collator->lt("\x{9FFF}","\x{2B73F}")); # Unassigned < Unassigned +# CJK UI Ext > CJK UI. +# [ UCA_Version 8: Ext.A < UI and BMP < Ext.B (code point order) ] +# 4E00..9FA5 are CJK UI. +# 9FA6..9FBB are CJK UI since UCA_Version 14 (Unicode 4.1). +# 9FBC..9FC3 are CJK UI since UCA_Version 18 (Unicode 5.1). +# 9FC4..9FCB are CJK UI since UCA_Version 20 (Unicode 5.2). + +# 3400..4DB5 are CJK UI Ext.A since UCA_Version 8 (Unicode 3.0). +# 20000..2A6D6 are CJK UI Ext.B since UCA_Version 8 (Unicode 3.1). 
+# 2A700..2B734 are CJK UI Ext.C since UCA_Version 20 (Unicode 5.2). +# 2B740..2B81D are CJK UI Ext.D since UCA_Version 22 (Unicode 6.0). + +my @Versions = (8, 9, 11, 14, 16, 18, 20, 22); + +for my $v (@Versions) { +$Collator->change(UCA_Version => $v); + +# Ext.A > UI +ok($Collator->cmp("\x{3400}", "\x{4E00}") == ($v >= 9 ? 1 : -1)); # UI +ok($Collator->cmp("\x{3400}", "\x{9FA5}") == ($v >= 9 ? 1 : -1)); # UI +ok($Collator->cmp("\x{3400}", "\x{9FA6}") == ($v >= 14 ? 1 : -1)); # new +ok($Collator->cmp("\x{3400}", "\x{9FBB}") == ($v >= 14 ? 1 : -1)); # new +ok($Collator->cmp("\x{3400}", "\x{9FBC}") == ($v >= 18 ? 1 : -1)); # new +ok($Collator->cmp("\x{3400}", "\x{9FC3}") == ($v >= 18 ? 1 : -1)); # new +ok($Collator->cmp("\x{3400}", "\x{9FC4}") == ($v >= 20 ? 1 : -1)); # new +ok($Collator->cmp("\x{3400}", "\x{9FCB}") == ($v >= 20 ? 1 : -1)); # new +ok($Collator->cmp("\x{3400}", "\x{9FCC}") == -1); # na +ok($Collator->cmp("\x{3400}", "\x{9FFF}") == -1); # na + +# UI < UI +ok($Collator->cmp("\x{4E00}", "\x{9FA5}") == -1); # UI < UI +ok($Collator->cmp("\x{9FA5}", "\x{9FA6}") == -1); # UI < new +ok($Collator->cmp("\x{9FA6}", "\x{9FBB}") == -1); # new < new +ok($Collator->cmp("\x{9FBB}", "\x{9FBC}") == -1); # new < new +ok($Collator->cmp("\x{9FBC}", "\x{9FC3}") == -1); # new < new +ok($Collator->cmp("\x{9FC3}", "\x{9FC4}") == -1); # new < new +ok($Collator->cmp("\x{9FC4}", "\x{9FCB}") == -1); # new < new +ok($Collator->cmp("\x{9FCB}", "\x{9FCC}") == -1); # new < na +ok($Collator->cmp("\x{9FCC}", "\x{9FFF}") == -1); # na < na + +# Ext.A < Ext.B +ok($Collator->cmp("\x{3400}", "\x{20000}") == -1); + +# Ext.A +ok($Collator->cmp("\x{3400}", "\x{4DB5}") == -1); # A < A +ok($Collator->cmp("\x{2FFF}", "\x{3400}") == ($v >= 8 ? 1 : -1)); # na > A +ok($Collator->cmp("\x{2FFF}", "\x{4DB5}") == ($v >= 8 ? 
1 : -1)); # na > A +ok($Collator->cmp("\x{2FFF}", "\x{4DB6}") == -1); # na < na +ok($Collator->cmp("\x{2FFF}", "\x{4DBF}") == -1); # na < na + +# Ext.B +ok($Collator->cmp("\x{20000}","\x{2A6D6}") == -1); # B < B +ok($Collator->cmp("\x{2FFF}", "\x{20000}") == ($v >= 9 ? 1 : -1)); # na > B +ok($Collator->cmp("\x{2FFF}", "\x{2A6D6}") == ($v >= 9 ? 1 : -1)); # na > B +ok($Collator->cmp("\x{2FFF}", "\x{2A6D7}") == -1); # na < na +ok($Collator->cmp("\x{2FFF}", "\x{2A6DF}") == -1); # na < na + +# Ext.C +ok($Collator->cmp("\x{2A700}","\x{2B734}") == -1); # C < C +ok($Collator->cmp("\x{2FFF}", "\x{2A700}") == ($v >= 20 ? 1 : -1)); # na > C +ok($Collator->cmp("\x{2FFF}", "\x{2B734}") == ($v >= 20 ? 1 : -1)); # na > C +ok($Collator->cmp("\x{2FFF}", "\x{2B735}") == -1); # na < na +ok($Collator->cmp("\x{2FFF}", "\x{2B73F}") == -1); # na < na + +# Ext.D +ok($Collator->cmp("\x{2B740}","\x{2B81D}") == -1); # D < D +ok($Collator->cmp("\x{2FFF}", "\x{2B740}") == ($v >= 22 ? 1 : -1)); # na > D +ok($Collator->cmp("\x{2FFF}", "\x{2B81D}") == ($v >= 22 ? 1 : -1)); # na > D +ok($Collator->cmp("\x{2FFF}", "\x{2B81E}") == -1); # na < na +ok($Collator->cmp("\x{2FFF}", "\x{2B81F}") == -1); # na < na +} diff --git a/cpan/Unicode-Collate/t/compatui.t b/cpan/Unicode-Collate/t/compatui.t index 7169fa3225..66a8735ab6 100644 --- a/cpan/Unicode-Collate/t/compatui.t +++ b/cpan/Unicode-Collate/t/compatui.t @@ -12,7 +12,7 @@ BEGIN { } use Test; -BEGIN { plan tests => 491 }; +BEGIN { plan tests => 561 }; # 1 + 70 x @Versions use strict; use warnings; @@ -20,7 +20,7 @@ use Unicode::Collate; ok(1); -my @Versions = (8, 9, 11, 14, 16, 18, 20); +my @Versions = (8, 9, 11, 14, 16, 18, 20, 22); # 12 compatibility ideographs are treated as unified ideographs: # FA0E, FA0F, FA11, FA13, FA14, FA1F, FA21, FA23, FA24, FA27, FA28, FA29. 
diff --git a/cpan/Unicode-Collate/t/contract.t b/cpan/Unicode-Collate/t/contract.t index e87239f38a..22a1086ca1 100644 --- a/cpan/Unicode-Collate/t/contract.t +++ b/cpan/Unicode-Collate/t/contract.t @@ -1,3 +1,4 @@ + BEGIN { unless ("A" eq pack('U', 0x41)) { print "1..0 # Unicode::Collate " . @@ -11,7 +12,7 @@ BEGIN { } use Test; -BEGIN { plan tests => 44 }; +BEGIN { plan tests => 74 }; use strict; use warnings; @@ -59,6 +60,8 @@ ok($kjeNoN->gt("\x{45C}", "\x{43A}\x{334}\x{301}")); ok($kjeNoN->eq("\x{43A}", "\x{43A}\x{334}\x{301}")); ok($kjeNoN->eq("\x{45C}", "\x{43A}\x{301}\x{334}")); +# 5 + our %sortkeys; $sortkeys{'KAac'} = $kjeNoN->viewSortKey("\x{43A}\x{301}"); @@ -72,10 +75,12 @@ if (!$@) { table => undef, entry => $kjeEntry, ); + ok($kjeNFD->lt("\x{43A}", "\x{43A}\x{301}")); ok($kjeNFD->eq("\x{45C}", "\x{43A}\x{334}\x{301}")); ok($kjeNFD->lt("\x{43A}", "\x{43A}\x{334}\x{301}")); ok($kjeNFD->eq("\x{45C}", "\x{43A}\x{301}\x{334}")); +# 9 my $aaNFD = Unicode::Collate->new( level => 1, @@ -91,6 +96,7 @@ ok($aaNFD->lt("Z", "A\x{327}\x{30A}")); ok($aaNFD->lt("Z", "A\x{30A}\x{327}")); ok($aaNFD->lt("Z", "A\x{31A}\x{30A}")); ok($aaNFD->lt("Z", "A\x{30A}\x{31A}")); +# 17 my $aaPre = Unicode::Collate->new( level => 1, @@ -107,9 +113,9 @@ ok($aaPre->lt("Z", "A\x{327}\x{30A}")); ok($aaPre->lt("Z", "A\x{30A}\x{327}")); ok($aaPre->lt("Z", "A\x{31A}\x{30A}")); ok($aaPre->lt("Z", "A\x{30A}\x{31A}")); -} -else { - ok(1) for 1..20; +# 25 +} else { + ok(1) for 1..20; } # again: loading Unicode::Normalize should not affect $kjeNoN. 
@@ -122,6 +128,8 @@ ok($sortkeys{'KAac'}, $kjeNoN->viewSortKey("\x{43A}\x{301}")); ok($sortkeys{'KAta'}, $kjeNoN->viewSortKey("\x{43A}\x{334}\x{301}")); ok($sortkeys{'KAat'}, $kjeNoN->viewSortKey("\x{43A}\x{301}\x{334}")); +# 32 + my $aaNoN = Unicode::Collate->new( level => 1, table => undef, @@ -138,6 +146,8 @@ ok($aaNoN->lt("Z", "A\x{30A}\x{327}")); ok($aaNoN->eq("A", "A\x{31A}\x{30A}")); ok($aaNoN->lt("Z", "A\x{30A}\x{31A}")); +# 40 + # suppress contractions my $kjeSup = Unicode::Collate->new( @@ -153,3 +163,98 @@ ok($kjeSup->gt("\x{45C}", "\x{43A}\x{301}")); ok($kjeSup->eq("\x{41A}", "\x{41A}\x{301}")); ok($kjeSup->gt("\x{40C}", "\x{41A}\x{301}")); +# 44 + +our $tibetanEntry = <<'ENTRIES'; +0000 ; [.0000.0000.0000.0000] # [0000] NULL (in 6429) +0F71 ; [.206D.0020.0002.0F71] # TIBETAN VOWEL SIGN AA +0F72 ; [.206E.0020.0002.0F72] # TIBETAN VOWEL SIGN I +0F73 ; [.206F.0020.0002.0F73] # TIBETAN VOWEL SIGN II +0F71 0F72 ; [.206F.0020.0002.0F73] # TIBETAN VOWEL SIGN II +0F80 ; [.2070.0020.0002.0F80] # TIBETAN VOWEL SIGN REVERSED I +0F81 ; [.2071.0020.0002.0F81] # TIBETAN VOWEL SIGN REVERSED II +0F71 0F80 ; [.2071.0020.0002.0F81] # TIBETAN VOWEL SIGN REVERSED II +0F74 ; [.2072.0020.0002.0F74] # TIBETAN VOWEL SIGN U +0F75 ; [.2073.0020.0002.0F75] # TIBETAN VOWEL SIGN UU +0F71 0F74 ; [.2073.0020.0002.0F75] # TIBETAN VOWEL SIGN UU +0F76 ; [.2074.0020.0002.0F76] # TIBETAN VOWEL SIGN VOCALIC R +0FB2 0F80 ; [.2074.0020.0002.0F76] # TIBETAN VOWEL SIGN VOCALIC R +0F77 ; [.2075.0020.0002.0F77] # TIBETAN VOWEL SIGN VOCALIC RR +0FB2 0F81 ; [.2075.0020.0002.0F77] # TIBETAN VOWEL SIGN VOCALIC RR +0FB2 0F71 0F80 ; [.2075.0020.0002.0F77] # TIBETAN VOWEL SIGN VOCALIC RR +0F78 ; [.2076.0020.0002.0F78] # TIBETAN VOWEL SIGN VOCALIC L +0FB3 0F80 ; [.2076.0020.0002.0F78] # TIBETAN VOWEL SIGN VOCALIC L +0F79 ; [.2077.0020.0002.0F79] # TIBETAN VOWEL SIGN VOCALIC LL +0FB3 0F81 ; [.2077.0020.0002.0F79] # TIBETAN VOWEL SIGN VOCALIC LL +0FB3 0F71 0F80 ; [.2077.0020.0002.0F79] # TIBETAN VOWEL 
SIGN VOCALIC LL +ENTRIES + +# ccc(0F71) = 129 +# ccc(0F80) = 130 +# 0F76 = 0FB2 0F80 +# 0F78 = 0FB3 0F80 +# 0F81 = 0F71 0F80 +# 0F77 = <compat> 0FB2 0F81 = 0FB2 0F71 0F80 = 0F76 0F71 +# 0F79 = <compat> 0FB3 0F81 = 0FB3 0F71 0F80 = 0F78 0F71 + +eval { require Unicode::Normalize }; +if (!$@) { + my $tibNFD = Unicode::Collate->new( + table => undef, + entry => $tibetanEntry, + ); + + # VOCALIC RR + ok($tibNFD->eq("\x{F77}\0\x{334}", "\x{FB2}\x{334}\x{F81}")); + ok($tibNFD->eq("\x{F77}\0\x{334}", "\x{FB2}\x{F81}\x{334}")); + ok($tibNFD->eq("\x{F77}\0\x{334}", "\x{FB2}\x{F81}\0\x{334}")); + ok($tibNFD->eq("\x{F77}\0\x{334}", "\x{F76}\x{334}\x{F71}")); + ok($tibNFD->eq("\x{F77}\0\x{334}", "\x{F76}\x{F71}\x{334}")); + ok($tibNFD->eq("\x{F77}\0\x{334}", "\x{F76}\x{F71}\0\x{334}")); + ok($tibNFD->eq("\x{F77}\0\x{334}", "\x{FB2}\x{334}\x{F71}\x{F80}")); + ok($tibNFD->eq("\x{F77}\0\x{334}", "\x{FB2}\x{F71}\x{334}\x{F80}")); + ok($tibNFD->eq("\x{F77}\0\x{334}", "\x{FB2}\x{F71}\x{F80}\x{334}")); + ok($tibNFD->eq("\x{F77}\0\x{334}", "\x{FB2}\x{F71}\x{F80}\0\x{334}")); + ok($tibNFD->eq("\x{F77}\0\x{334}", "\x{FB2}\x{334}\x{F80}\x{F71}")); + ok($tibNFD->eq("\x{F77}\0\x{334}", "\x{FB2}\x{F80}\x{334}\x{F71}")); + ok($tibNFD->eq("\x{F77}\0\x{334}", "\x{FB2}\x{F80}\x{F71}\x{334}")); + ok($tibNFD->eq("\x{F77}\0\x{334}", "\x{FB2}\x{F80}\x{F71}\0\x{334}")); +# 58 + + # VOCALIC LL + ok($tibNFD->eq("\x{F79}\0\x{334}", "\x{FB3}\x{334}\x{F81}")); + ok($tibNFD->eq("\x{F79}\0\x{334}", "\x{FB3}\x{F81}\x{334}")); + ok($tibNFD->eq("\x{F79}\0\x{334}", "\x{FB3}\x{F81}\0\x{334}")); + ok($tibNFD->eq("\x{F79}\0\x{334}", "\x{F78}\x{334}\x{F71}")); + ok($tibNFD->eq("\x{F79}\0\x{334}", "\x{F78}\x{F71}\x{334}")); + ok($tibNFD->eq("\x{F79}\0\x{334}", "\x{F78}\x{F71}\0\x{334}")); + ok($tibNFD->eq("\x{F79}\0\x{334}", "\x{FB3}\x{334}\x{F71}\x{F80}")); + ok($tibNFD->eq("\x{F79}\0\x{334}", "\x{FB3}\x{F71}\x{334}\x{F80}")); + ok($tibNFD->eq("\x{F79}\0\x{334}", "\x{FB3}\x{F71}\x{F80}\x{334}")); + 
ok($tibNFD->eq("\x{F79}\0\x{334}", "\x{FB3}\x{F71}\x{F80}\0\x{334}")); + ok($tibNFD->eq("\x{F79}\0\x{334}", "\x{FB3}\x{334}\x{F80}\x{F71}")); + ok($tibNFD->eq("\x{F79}\0\x{334}", "\x{FB3}\x{F80}\x{334}\x{F71}")); + ok($tibNFD->eq("\x{F79}\0\x{334}", "\x{FB3}\x{F80}\x{F71}\x{334}")); + ok($tibNFD->eq("\x{F79}\0\x{334}", "\x{FB3}\x{F80}\x{F71}\0\x{334}")); +# 72 + + my $discontNFD = Unicode::Collate->new( + table => undef, + entry => <<'ENTRIES', +0000 ; [.0000.0000.0000.0000] # [0000] NULL (in 6429) +0301 ; [.0000.0032.0002.0301] # COMBINING ACUTE ACCENT +0300 ; [.0000.0035.0002.0300] # COMBINING GRAVE ACCENT +0327 ; [.0000.0055.0002.0327] # COMBINING CEDILLA +0334 ; [.0000.008B.0002.0334] # COMBINING TILDE OVERLAY +0041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A +0041 0327 0301 ; [.0102.0020.0008.0041] +0041 0300 ; [.0103.0020.0008.0041] +ENTRIES + ); + + ok($discontNFD->eq("A\x{327}\x{301}\0\x{334}", "A\x{334}\x{327}\x{301}")); + ok($discontNFD->eq("A\x{300}\0\x{327}", "A\x{327}\x{300}")); +} else { + ok(1) for 1..30; +} +# 74 diff --git a/cpan/Unicode-Collate/t/illegal.t b/cpan/Unicode-Collate/t/illegal.t index 825177c283..982518ffc8 100644 --- a/cpan/Unicode-Collate/t/illegal.t +++ b/cpan/Unicode-Collate/t/illegal.t @@ -26,7 +26,7 @@ BEGIN { } } -BEGIN { plan tests => 40 }; +BEGIN { plan tests => 65 }; ok(1); @@ -38,8 +38,7 @@ no warnings 'utf8'; # illegal code points should be always ingored # (cf. UCA, 7.1.1 Illegal code points). 
-my $illeg = Unicode::Collate->new( - entry => <<'ENTRIES', +my $entry = <<'ENTRIES'; 0000 ; [.0020.0000.0000.0000] # [0000] NULL 0001 ; [.0021.0000.0000.0001] # [0001] START OF HEADING FFFE ; [.0022.0000.0000.FFFE] # <noncharacter-FFFE> (invalid) @@ -55,9 +54,15 @@ FDEF ; [.0027.0000.0000.FDEF] # <noncharacter-FDEF> (invalid) 0041 0000 ; [.1100.0020.0008.0041] # latin A + NULL 0041 FFFF ; [.1200.0020.0008.0041] # latin A + FFFF (invalid) ENTRIES + +################## + +my $illeg = Unicode::Collate->new( + entry => $entry, level => 1, table => undef, normalization => undef, + UCA_Version => 20, ); # 2..12 @@ -93,85 +98,75 @@ ok($illeg->lt("AA", "A\0")); ################## -my($match, $str, $sub, $ret); +my $nonch = Unicode::Collate->new( + entry => $entry, + level => 1, + table => undef, + normalization => undef, + UCA_Version => 22, +); + +# 27..37 +ok($nonch->lt("", "\x00")); +ok($nonch->lt("", "\x01")); +ok($nonch->lt("", "\x{FFFE}")); +ok($nonch->lt("", "\x{FFFF}")); +ok($nonch->lt("", "\x{D800}")); +ok($nonch->lt("", "\x{DFFF}")); +ok($nonch->lt("", "\x{FDD0}")); +ok($nonch->lt("", "\x{FDEF}")); +ok($nonch->lt("", "\x02")); +ok($nonch->lt("", "\x{10FFFF}")); +ok($nonch->eq("", "\x{110000}")); + +# 38..47 +ok($nonch->lt("\x00", "\x01")); +ok($nonch->lt("\x01", "\x{FFFE}")); +ok($nonch->lt("\x{FFFE}", "\x{FFFF}")); +ok($nonch->lt("\x{FFFF}", "\x{D800}")); +ok($nonch->lt("\x{D800}", "\x{DFFF}")); +ok($nonch->lt("\x{DFFF}", "\x{FDD0}")); +ok($nonch->lt("\x{FDD0}", "\x{FDEF}")); +ok($nonch->lt("\x{FDEF}", "\x02")); +ok($nonch->lt("\x02", "\x{10FFFF}")); +ok($nonch->gt("\x{10FFFF}", "\x{110000}")); + +# 48..51 +ok($nonch->lt("A", "A\x{FFFF}")); +ok($nonch->lt("A\0", "A\x{FFFF}")); +ok($nonch->lt("A", "A\0")); +ok($nonch->lt("AA", "A\0")); + +################## my $Collator = Unicode::Collate->new( table => 'keys.txt', level => 1, normalization => undef, + UCA_Version => 8, ); -$sub = "pe"; - - -$str = "Pe\x{300}\x{301}rl"; -$ret = "Pe\x{300}\x{301}"; -($match) = 
$Collator->match($str, $sub); -ok($match, $ret); - -$str = "Pe\x{300}\0\0\x{301}rl"; -$ret = "Pe\x{300}\0\0\x{301}"; -($match) = $Collator->match($str, $sub); -ok($match, $ret); - -$str = "Pe\x{DA00}\x{301}\x{DFFF}rl"; -$ret = "Pe\x{DA00}\x{301}\x{DFFF}"; -($match) = $Collator->match($str, $sub); -ok($match, $ret); - -$str = "Pe\x{FFFF}\x{301}rl"; -$ret = "Pe\x{FFFF}\x{301}"; -($match) = $Collator->match($str, $sub); -ok($match, $ret); - -$str = "Pe\x{110000}\x{301}rl"; -$ret = "Pe\x{110000}\x{301}"; -($match) = $Collator->match($str, $sub); -ok($match, $ret); - -$str = "Pe\x{300}\x{d801}\x{301}rl"; -$ret = "Pe\x{300}\x{d801}\x{301}"; -($match) = $Collator->match($str, $sub); -ok($match, $ret); - -$str = "Pe\x{300}\x{ffff}\x{301}rl"; -$ret = "Pe\x{300}\x{ffff}\x{301}"; -($match) = $Collator->match($str, $sub); -ok($match, $ret); - -$str = "Pe\x{300}\x{110000}\x{301}rl"; -$ret = "Pe\x{300}\x{110000}\x{301}"; -($match) = $Collator->match($str, $sub); -ok($match, $ret); - -$str = "Pe\x{D9ab}\x{DFFF}rl"; -$ret = "Pe\x{D9ab}\x{DFFF}"; -($match) = $Collator->match($str, $sub); -ok($match, $ret); - -$str = "Pe\x{FFFF}rl"; -$ret = "Pe\x{FFFF}"; -($match) = $Collator->match($str, $sub); -ok($match, $ret); - -$str = "Pe\x{110000}rl"; -$ret = "Pe\x{110000}"; -($match) = $Collator->match($str, $sub); -ok($match, $ret); - -$str = "Pe\x{300}\x{D800}\x{DFFF}rl"; -$ret = "Pe\x{300}\x{D800}\x{DFFF}"; -($match) = $Collator->match($str, $sub); -ok($match, $ret); - -$str = "Pe\x{300}\x{FFFF}rl"; -$ret = "Pe\x{300}\x{FFFF}"; -($match) = $Collator->match($str, $sub); -ok($match, $ret); - -$str = "Pe\x{300}\x{110000}rl"; -$ret = "Pe\x{300}\x{110000}"; -($match) = $Collator->match($str, $sub); -ok($match, $ret); +my @ret = ( + "Pe\x{300}\x{301}", + "Pe\x{300}\0\0\x{301}", + "Pe\x{DA00}\x{301}\x{DFFF}", + "Pe\x{FFFF}\x{301}", + "Pe\x{110000}\x{301}", + "Pe\x{300}\x{d801}\x{301}", + "Pe\x{300}\x{ffff}\x{301}", + "Pe\x{300}\x{110000}\x{301}", + "Pe\x{D9ab}\x{DFFF}", + "Pe\x{FFFF}", + 
"Pe\x{110000}", + "Pe\x{300}\x{D800}\x{DFFF}", + "Pe\x{300}\x{FFFF}", + "Pe\x{300}\x{110000}", +); +# 52..65 +for my $ret (@ret) { + my $str = $ret."rl"; + my($match) = $Collator->match($str, "pe"); + ok($match eq $ret); +} diff --git a/cpan/Unicode-Collate/t/loc_hy.t b/cpan/Unicode-Collate/t/loc_hy.t index fe22adba5a..e3e1e1c5c6 100644 --- a/cpan/Unicode-Collate/t/loc_hy.t +++ b/cpan/Unicode-Collate/t/loc_hy.t @@ -4,7 +4,7 @@ use warnings; use Unicode::Collate::Locale; use Test; -plan tests => 7; +plan tests => 13; my $objHy = Unicode::Collate::Locale-> new(locale => 'HY', normalization => undef); @@ -17,6 +17,12 @@ $objHy->change(level => 1); ok($objHy->lt("\x{584}", "\x{587}")); ok($objHy->gt("\x{585}", "\x{587}")); +ok($objHy->lt("\x{584}\x{4E00}", "\x{587}")); +ok($objHy->lt("\x{584}\x{20000}", "\x{587}")); +ok($objHy->lt("\x{584}\x{10FFFD}","\x{587}")); + +# 7 + $objHy->change(level => 2); ok($objHy->eq("\x{587}", "\x{535}\x{582}")); @@ -29,4 +35,12 @@ $objHy->change(upper_before_lower => 1); ok($objHy->gt("\x{587}", "\x{535}\x{582}")); -# 7 +# 10 + +$objHy->change(UCA_Version => 8); + +ok($objHy->lt("\x{584}\x{4E00}", "\x{587}")); +ok($objHy->lt("\x{584}\x{20000}", "\x{587}")); +ok($objHy->lt("\x{584}\x{10FFFD}","\x{587}")); + +# 13 diff --git a/cpan/Unicode-Collate/t/nonchar.t b/cpan/Unicode-Collate/t/nonchar.t new file mode 100644 index 0000000000..ed11bf26d2 --- /dev/null +++ b/cpan/Unicode-Collate/t/nonchar.t @@ -0,0 +1,120 @@ + +BEGIN { + unless ("A" eq pack('U', 0x41)) { + print "1..0 # Unicode::Collate " . + "cannot stringify a Unicode code point\n"; + exit 0; + } + if ($ENV{PERL_CORE}) { + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + } +} + +use Test; +use strict; +use warnings; + +BEGIN { + use Unicode::Collate; + + unless (exists &Unicode::Collate::bootstrap or 5.008 <= $]) { + print "1..0 # skipped: XSUB, or Perl 5.8.0 or later". 
+ " needed for this test\n"; + print $@; + exit; + } +} + +BEGIN { plan tests => 27 }; + +ok(1); + +######################### + +no warnings 'utf8'; + +# Unicode 6.0 Sorting +# +# Special Database Values. The data files for CLDR provide +# special weights for two noncharacters: +# +# 1. A special noncharacter <HIGH> (U+FFFF) for specification of a range +# in a database, allowing "Sch" <= X <= "Sch<HIGH>" to pick all strings +# starting with "sch" plus those that sort equivalently. +# 2. A special noncharacter <LOW> (U+FFFE) for merged database fields, +# allowing "Disi\x{301}lva<LOW>John" to sort next to "Disilva<LOW>John". + +my $Collator = Unicode::Collate->new( + table => 'keys.txt', + level => 1, + normalization => undef, + UCA_Version => 22, + entry => <<'ENTRIES', +FFFE ; [*0001.0020.0005.FFFE] # <noncharacter-FFFE> +FFFF ; [.FFFE.0020.0005.FFFF] # <noncharacter-FFFF> +ENTRIES +); + +# 2..16 + +ok($Collator->lt("\x{FFFD}", "\x{FFFF}")); +ok($Collator->lt("\x{1FFFD}", "\x{1FFFF}")); +ok($Collator->lt("\x{2FFFD}", "\x{2FFFF}")); +ok($Collator->lt("\x{10FFFD}", "\x{10FFFF}")); + +ok($Collator->lt("perl\x{FFFD}", "perl\x{FFFF}")); +ok($Collator->lt("perl\x{1FFFD}", "perl\x{FFFF}")); +ok($Collator->lt("perl\x{1FFFE}", "perl\x{FFFF}")); +ok($Collator->lt("perl\x{1FFFF}", "perl\x{FFFF}")); +ok($Collator->lt("perl\x{2FFFD}", "perl\x{FFFF}")); +ok($Collator->lt("perl\x{2FFFE}", "perl\x{FFFF}")); +ok($Collator->lt("perl\x{2FFFF}", "perl\x{FFFF}")); +ok($Collator->lt("perl\x{10FFFD}", "perl\x{FFFF}")); +ok($Collator->lt("perl\x{10FFFE}", "perl\x{FFFF}")); +ok($Collator->lt("perl\x{10FFFF}", "perl\x{FFFF}")); + +ok($Collator->gt("perl\x{FFFF}AB", "perl\x{FFFF}")); + +$Collator->change(level => 4); + +# 17..23 + +my @dsf = ( + "di Silva\x{FFFE}Fred", + "diSilva\x{FFFE}Fred", + "di Si\x{301}lva\x{FFFE}Fred", + "diSi\x{301}lva\x{FFFE}Fred", +); +my @dsj = ( + "di Silva\x{FFFE}John", + "diSilva\x{FFFE}John", + "di Si\x{301}lva\x{FFFE}John", + "diSi\x{301}lva\x{FFFE}John", 
+); + +ok($Collator->lt($dsf[0], $dsf[1])); +ok($Collator->lt($dsf[1], $dsf[2])); +ok($Collator->lt($dsf[2], $dsf[3])); + +ok($Collator->lt($dsf[3], $dsj[0])); + +ok($Collator->lt($dsj[0], $dsj[1])); +ok($Collator->lt($dsj[1], $dsj[2])); +ok($Collator->lt($dsj[2], $dsj[3])); + +# 24..27 + +my @ds_j = ( + "di Silva John", + "diSilva John", + "di Si\x{301}lva John", + "diSi\x{301}lva John", +); + +ok($Collator->lt($ds_j[0], $ds_j[1])); +ok($Collator->lt($ds_j[1], $ds_j[2])); +ok($Collator->lt($ds_j[2], $ds_j[3])); + +ok($Collator->lt($dsj[0], $ds_j[0])); + diff --git a/cpan/Unicode-Collate/t/overcjk0.t b/cpan/Unicode-Collate/t/overcjk0.t index 2eec339bb0..016abe08b6 100644 --- a/cpan/Unicode-Collate/t/overcjk0.t +++ b/cpan/Unicode-Collate/t/overcjk0.t @@ -1,3 +1,4 @@ + BEGIN { unless ("A" eq pack('U', 0x41)) { print "1..0 # Unicode::Collate " . @@ -11,7 +12,7 @@ BEGIN { } use Test; -BEGIN { plan tests => 66 }; +BEGIN { plan tests => 246 }; # 6 + 30 x @Versions use strict; use warnings; @@ -38,77 +39,60 @@ ok($ignoreCJK->eq("Pe\x{4E00}rl", "Perl")); # U+4E00 is a CJK. ok($ignoreCJK->gt("\x{4DFF}", "\x{4E00}")); # U+4DFF is not CJK. ok($ignoreCJK->lt("Pe\x{5B57}rl", "Perl")); # 'r' is unassigned. 
-##### 7..20 -ok($ignoreCJK->eq("\x{3400}", "")); -ok($ignoreCJK->eq("\x{4DB5}", "")); -ok($ignoreCJK->eq("\x{9FA5}", "")); -ok($ignoreCJK->eq("\x{9FA6}", "")); # UI since Unicode 4.1.0 -ok($ignoreCJK->eq("\x{9FBB}", "")); # UI since Unicode 4.1.0 -ok($ignoreCJK->eq("\x{9FBC}", "")); # UI since Unicode 5.1.0 -ok($ignoreCJK->eq("\x{9FC3}", "")); # UI since Unicode 5.1.0 -ok($ignoreCJK->eq("\x{9FC4}", "")); # UI since Unicode 5.2.0 -ok($ignoreCJK->eq("\x{9FCB}", "")); # UI since Unicode 5.2.0 -ok($ignoreCJK->gt("\x{9FCC}", "Perl")); -ok($ignoreCJK->eq("\x{20000}", "")); # ExtB since Unicode 3.1.0 -ok($ignoreCJK->eq("\x{2A6D6}", "")); # ExtB since Unicode 3.1.0 -ok($ignoreCJK->eq("\x{2A700}", "")); # ExtC since Unicode 5.2.0 -ok($ignoreCJK->eq("\x{2B734}", "")); # ExtC since Unicode 5.2.0 - -##### 21..30 -$ignoreCJK->change(UCA_Version => 8); -ok($ignoreCJK->eq("\x{3400}", "")); -ok($ignoreCJK->eq("\x{4DB5}", "")); -ok($ignoreCJK->eq("\x{9FA5}", "")); -ok($ignoreCJK->gt("\x{9FA6}", "Perl")); -ok($ignoreCJK->gt("\x{9FBB}", "Perl")); -ok($ignoreCJK->gt("\x{9FBC}", "Perl")); -ok($ignoreCJK->gt("\x{9FC3}", "Perl")); -ok($ignoreCJK->gt("\x{9FC4}", "Perl")); -ok($ignoreCJK->eq("\x{20000}", "")); # ExtB since Unicode 3.1.0 -ok($ignoreCJK->eq("\x{2A6D6}", "")); # ExtB since Unicode 3.1.0 - -##### 31..40 -$ignoreCJK->change(UCA_Version => 9); -ok($ignoreCJK->eq("\x{3400}", "")); -ok($ignoreCJK->eq("\x{4DB5}", "")); -ok($ignoreCJK->eq("\x{9FA5}", "")); -ok($ignoreCJK->gt("\x{9FA6}", "Perl")); -ok($ignoreCJK->gt("\x{9FBB}", "Perl")); -ok($ignoreCJK->gt("\x{9FBC}", "Perl")); -ok($ignoreCJK->gt("\x{9FC3}", "Perl")); -ok($ignoreCJK->gt("\x{9FC4}", "Perl")); -ok($ignoreCJK->eq("\x{20000}", "")); # ExtB since Unicode 3.1.0 -ok($ignoreCJK->eq("\x{2A6D6}", "")); # ExtB since Unicode 3.1.0 - -##### 41..52 -$ignoreCJK->change(UCA_Version => 14); -ok($ignoreCJK->eq("\x{3400}", "")); -ok($ignoreCJK->eq("\x{4DB5}", "")); -ok($ignoreCJK->eq("\x{9FA5}", "")); -ok($ignoreCJK->eq("\x{9FA6}", 
"")); # UI since Unicode 4.1.0 -ok($ignoreCJK->eq("\x{9FBB}", "")); # UI since Unicode 4.1.0 -ok($ignoreCJK->gt("\x{9FBC}", "Perl")); -ok($ignoreCJK->gt("\x{9FC3}", "Perl")); -ok($ignoreCJK->gt("\x{9FC4}", "Perl")); -ok($ignoreCJK->eq("\x{20000}", "")); # ExtB since Unicode 3.1.0 -ok($ignoreCJK->eq("\x{2A6D6}", "")); # ExtB since Unicode 3.1.0 -ok($ignoreCJK->gt("\x{2A700}", "Perl")); -ok($ignoreCJK->gt("\x{2B734}", "Perl")); - -##### 53..66 -$ignoreCJK->change(UCA_Version => 18); -ok($ignoreCJK->eq("\x{3400}", "")); -ok($ignoreCJK->eq("\x{4DB5}", "")); -ok($ignoreCJK->eq("\x{9FA5}", "")); -ok($ignoreCJK->eq("\x{9FA6}", "")); # UI since Unicode 4.1.0 -ok($ignoreCJK->eq("\x{9FBB}", "")); # UI since Unicode 4.1.0 -ok($ignoreCJK->eq("\x{9FBC}", "")); # UI since Unicode 5.1.0 -ok($ignoreCJK->eq("\x{9FC3}", "")); # UI since Unicode 5.1.0 -ok($ignoreCJK->gt("\x{9FC4}", "Perl")); -ok($ignoreCJK->gt("\x{9FCB}", "Perl")); -ok($ignoreCJK->gt("\x{9FCC}", "Perl")); -ok($ignoreCJK->eq("\x{20000}", "")); # ExtB since Unicode 3.1.0 -ok($ignoreCJK->eq("\x{2A6D6}", "")); # ExtB since Unicode 3.1.0 -ok($ignoreCJK->gt("\x{2A700}", "Perl")); -ok($ignoreCJK->gt("\x{2B734}", "Perl")); +##### + +# 4E00..9FA5 are CJK UI. +# 9FA6..9FBB are CJK UI since UCA_Version 14 (Unicode 4.1). +# 9FBC..9FC3 are CJK UI since UCA_Version 18 (Unicode 5.1). +# 9FC4..9FCB are CJK UI since UCA_Version 20 (Unicode 5.2). + +# 3400..4DB5 are CJK UI Ext.A since UCA_Version 8 (Unicode 3.0). +# 20000..2A6D6 are CJK UI Ext.B since UCA_Version 8 (Unicode 3.1). +# 2A700..2B734 are CJK UI Ext.C since UCA_Version 20 (Unicode 5.2). +# 2B740..2B81D are CJK UI Ext.D since UCA_Version 22 (Unicode 6.0). + +my @Versions = (8, 9, 11, 14, 16, 18, 20, 22); + +for my $v (@Versions) { +$ignoreCJK->change(UCA_Version => $v); +# UI +ok($ignoreCJK->cmp("\x{4E00}", "") == 0); +ok($ignoreCJK->cmp("\x{9FA5}", "") == 0); +ok($ignoreCJK->cmp("\x{9FA6}", "") == ($v >= 14 ? 0 : 1)); +ok($ignoreCJK->cmp("\x{9FAF}", "") == ($v >= 14 ? 
0 : 1)); +ok($ignoreCJK->cmp("\x{9FB0}", "") == ($v >= 14 ? 0 : 1)); +ok($ignoreCJK->cmp("\x{9FBB}", "") == ($v >= 14 ? 0 : 1)); +ok($ignoreCJK->cmp("\x{9FBC}", "") == ($v >= 18 ? 0 : 1)); +ok($ignoreCJK->cmp("\x{9FBF}", "") == ($v >= 18 ? 0 : 1)); +ok($ignoreCJK->cmp("\x{9FC0}", "") == ($v >= 18 ? 0 : 1)); +ok($ignoreCJK->cmp("\x{9FC3}", "") == ($v >= 18 ? 0 : 1)); +ok($ignoreCJK->cmp("\x{9FC4}", "") == ($v >= 20 ? 0 : 1)); +ok($ignoreCJK->cmp("\x{9FCB}", "") == ($v >= 20 ? 0 : 1)); +ok($ignoreCJK->cmp("\x{9FCC}", "") == 1); +ok($ignoreCJK->cmp("\x{9FCF}", "") == 1); + +# Ext.A +ok($ignoreCJK->cmp("\x{3400}", "") == 0); +ok($ignoreCJK->cmp("\x{4DB5}", "") == 0); +ok($ignoreCJK->cmp("\x{4DB6}", "") == 1); +ok($ignoreCJK->cmp("\x{4DBF}", "") == 1); + +# Ext.B +ok($ignoreCJK->cmp("\x{20000}","") == 0); +ok($ignoreCJK->cmp("\x{2A6D6}","") == 0); +ok($ignoreCJK->cmp("\x{2A6D7}","") == 1); +ok($ignoreCJK->cmp("\x{2A6DF}","") == 1); + +# Ext.C +ok($ignoreCJK->cmp("\x{2A700}","") == ($v >= 20 ? 0 : 1)); +ok($ignoreCJK->cmp("\x{2B734}","") == ($v >= 20 ? 0 : 1)); +ok($ignoreCJK->cmp("\x{2B735}","") == 1); +ok($ignoreCJK->cmp("\x{2B73F}","") == 1); + +# Ext.D +ok($ignoreCJK->cmp("\x{2B740}","") == ($v >= 22 ? 0 : 1)); +ok($ignoreCJK->cmp("\x{2B81D}","") == ($v >= 22 ? 0 : 1)); +ok($ignoreCJK->cmp("\x{2B81E}","") == 1); +ok($ignoreCJK->cmp("\x{2B81F}","") == 1); +} diff --git a/cpan/Unicode-Collate/t/overcjk1.t b/cpan/Unicode-Collate/t/overcjk1.t index 45daaa2321..b05340e704 100644 --- a/cpan/Unicode-Collate/t/overcjk1.t +++ b/cpan/Unicode-Collate/t/overcjk1.t @@ -1,3 +1,4 @@ + BEGIN { unless ("A" eq pack('U', 0x41)) { print "1..0 # Unicode::Collate " . 
@@ -11,7 +12,7 @@ BEGIN { } use Test; -BEGIN { plan tests => 57 }; +BEGIN { plan tests => 131 }; # 11 + 15 x @Versions use strict; use warnings; @@ -19,15 +20,13 @@ use Unicode::Collate; ok(1); -##### 2..6 +##### 2..11 my $overCJK = Unicode::Collate->new( - table => undef, + table => 'keys.txt', normalization => undef, entry => <<'ENTRIES', -0061 ; [.0101.0020.0002.0061] # latin a -0041 ; [.0101.0020.0008.0041] # LATIN A -4E00 ; [.B1FC.0030.0004.4E00] # Ideograph; B1FC = FFFF - 4E03. +4E01 ; [.B1FC.0030.0004.4E00] # Ideograph; B1FC = FFFF - 4E03. ENTRIES overrideCJK => sub { my $u = 0xFFFF - $_[0]; # reversed @@ -35,74 +34,40 @@ ENTRIES }, ); +ok($overCJK->gt("B", "A")); # diff. at level 1. ok($overCJK->lt("a", "A")); # diff. at level 3. -ok($overCJK->lt( "\x{4E03}", "\x{4E00}")); # diff. at level 2. -ok($overCJK->lt("A\x{4E03}", "A\x{4E00}")); -ok($overCJK->lt("A\x{4E03}", "a\x{4E00}")); -ok($overCJK->lt("a\x{4E03}", "A\x{4E00}")); - -##### 7..17 -ok($overCJK->gt("a\x{3400}", "A\x{4DB5}")); -ok($overCJK->gt("a\x{4DB5}", "A\x{9FA5}")); -ok($overCJK->gt("a\x{9FA5}", "A\x{9FA6}")); # UI since Unicode 4.1.0 -ok($overCJK->gt("a\x{9FA6}", "A\x{9FBB}")); # UI since Unicode 4.1.0 -ok($overCJK->gt("a\x{9FBB}", "A\x{9FBC}")); # UI since Unicode 5.1.0 -ok($overCJK->gt("a\x{9FBC}", "A\x{9FBF}")); # UI since Unicode 5.1.0 -ok($overCJK->gt("a\x{9FBF}", "A\x{9FC3}")); # UI since Unicode 5.1.0 -ok($overCJK->gt("a\x{9FC3}", "A\x{9FC4}")); # UI since Unicode 5.2.0 -ok($overCJK->gt("a\x{9FC4}", "A\x{9FCB}")); # UI since Unicode 5.2.0 -ok($overCJK->lt("a\x{9FCB}", "A\x{9FCC}")); -ok($overCJK->lt("a\x{9FC4}", "A\x{9FCF}")); +ok($overCJK->lt( "\x{4E03}", "\x{4E01}")); # diff. at level 2. 
+ok($overCJK->gt( "\x{4E03}B", "\x{4E01}A")); +ok($overCJK->lt( "\x{4E03}A", "\x{4E01}B")); +ok($overCJK->gt("B\x{4E03}", "A\x{4E01}")); +ok($overCJK->lt("A\x{4E03}", "B\x{4E01}")); +ok($overCJK->lt("A\x{4E03}", "A\x{4E01}")); +ok($overCJK->lt("A\x{4E03}", "a\x{4E01}")); +ok($overCJK->lt("a\x{4E03}", "A\x{4E01}")); -##### 18..26 -$overCJK->change(UCA_Version => 9); -ok($overCJK->gt("a\x{3400}", "A\x{4DB5}")); -ok($overCJK->gt("a\x{4DB5}", "A\x{9FA5}")); -ok($overCJK->lt("a\x{9FA5}", "A\x{9FA6}")); -ok($overCJK->lt("a\x{9FA6}", "A\x{9FBB}")); -ok($overCJK->lt("a\x{9FBB}", "A\x{9FBC}")); -ok($overCJK->lt("a\x{9FBC}", "A\x{9FBF}")); -ok($overCJK->lt("a\x{9FBF}", "A\x{9FC3}")); -ok($overCJK->lt("a\x{9FC3}", "A\x{9FC4}")); -ok($overCJK->lt("a\x{9FC4}", "A\x{9FCF}")); +##### -##### 27..35 -$overCJK->change(UCA_Version => 14); -ok($overCJK->gt("a\x{3400}", "A\x{4DB5}")); -ok($overCJK->gt("a\x{4DB5}", "A\x{9FA5}")); -ok($overCJK->gt("a\x{9FA5}", "A\x{9FA6}")); # UI since Unicode 4.1.0 -ok($overCJK->gt("a\x{9FA6}", "A\x{9FBB}")); # UI since Unicode 4.1.0 -ok($overCJK->lt("a\x{9FBB}", "A\x{9FBC}")); -ok($overCJK->lt("a\x{9FBC}", "A\x{9FBF}")); -ok($overCJK->lt("a\x{9FBF}", "A\x{9FC3}")); -ok($overCJK->lt("a\x{9FC3}", "A\x{9FC4}")); -ok($overCJK->lt("a\x{9FC4}", "A\x{9FCF}")); +# 9FA6..9FBB are CJK UI since UCA_Version 14 (Unicode 4.1). +# 9FBC..9FC3 are CJK UI since UCA_Version 18 (Unicode 5.1). +# 9FC4..9FCB are CJK UI since UCA_Version 20 (Unicode 5.2). 
-##### 36..46 -$overCJK->change(UCA_Version => 18); -ok($overCJK->gt("a\x{3400}", "A\x{4DB5}")); -ok($overCJK->gt("a\x{4DB5}", "A\x{9FA5}")); -ok($overCJK->gt("a\x{9FA5}", "A\x{9FA6}")); # UI since Unicode 4.1.0 -ok($overCJK->gt("a\x{9FA6}", "A\x{9FBB}")); # UI since Unicode 4.1.0 -ok($overCJK->gt("a\x{9FBB}", "A\x{9FBC}")); # UI since Unicode 5.1.0 -ok($overCJK->gt("a\x{9FBC}", "A\x{9FBF}")); # UI since Unicode 5.1.0 -ok($overCJK->gt("a\x{9FBF}", "A\x{9FC3}")); # UI since Unicode 5.1.0 -ok($overCJK->lt("a\x{9FC3}", "A\x{9FC4}")); -ok($overCJK->lt("a\x{9FC3}", "A\x{9FCB}")); -ok($overCJK->lt("a\x{9FC3}", "A\x{9FCC}")); -ok($overCJK->lt("a\x{9FC4}", "A\x{9FCF}")); - -##### 47..57 -$overCJK->change(UCA_Version => 20); -ok($overCJK->gt("a\x{3400}", "A\x{4DB5}")); -ok($overCJK->gt("a\x{4DB5}", "A\x{9FA5}")); -ok($overCJK->gt("a\x{9FA5}", "A\x{9FA6}")); # UI since Unicode 4.1.0 -ok($overCJK->gt("a\x{9FA6}", "A\x{9FBB}")); # UI since Unicode 4.1.0 -ok($overCJK->gt("a\x{9FBB}", "A\x{9FBC}")); # UI since Unicode 5.1.0 -ok($overCJK->gt("a\x{9FBC}", "A\x{9FBF}")); # UI since Unicode 5.1.0 -ok($overCJK->gt("a\x{9FBF}", "A\x{9FC3}")); # UI since Unicode 5.1.0 -ok($overCJK->gt("a\x{9FC3}", "A\x{9FC4}")); # UI since Unicode 5.2.0 -ok($overCJK->gt("a\x{9FC4}", "A\x{9FCB}")); # UI since Unicode 5.2.0 -ok($overCJK->lt("a\x{9FCB}", "A\x{9FCC}")); -ok($overCJK->lt("a\x{9FC4}", "A\x{9FCF}")); +my @Versions = (8, 9, 11, 14, 16, 18, 20, 22); +for my $v (@Versions) { +$overCJK->change(UCA_Version => $v); +ok($overCJK->cmp("a\x{3400}", "A\x{4DB5}") == 1); +ok($overCJK->cmp("a\x{4DB5}", "A\x{4E00}") == 1); +ok($overCJK->cmp("a\x{4E00}", "A\x{9FA5}") == 1); +ok($overCJK->cmp("a\x{9FA5}", "A\x{9FA6}") == ($v >= 14 ? 1 : -1)); +ok($overCJK->cmp("a\x{9FA6}", "A\x{9FAF}") == ($v >= 14 ? 1 : -1)); +ok($overCJK->cmp("a\x{9FAF}", "A\x{9FB0}") == ($v >= 14 ? 1 : -1)); +ok($overCJK->cmp("a\x{9FB0}", "A\x{9FBB}") == ($v >= 14 ? 1 : -1)); +ok($overCJK->cmp("a\x{9FBB}", "A\x{9FBC}") == ($v >= 18 ? 
1 : -1)); +ok($overCJK->cmp("a\x{9FBC}", "A\x{9FBF}") == ($v >= 18 ? 1 : -1)); +ok($overCJK->cmp("a\x{9FBF}", "A\x{9FC3}") == ($v >= 18 ? 1 : -1)); +ok($overCJK->cmp("a\x{9FC3}", "A\x{9FC4}") == ($v >= 20 ? 1 : -1)); +ok($overCJK->cmp("a\x{9FC4}", "A\x{9FCA}") == ($v >= 20 ? 1 : -1)); +ok($overCJK->cmp("a\x{9FCA}", "A\x{9FCB}") == ($v >= 20 ? 1 : -1)); +ok($overCJK->cmp("a\x{9FCB}", "A\x{9FCC}") == -1); +ok($overCJK->cmp("a\x{9FCC}", "A\x{9FCF}") == -1); +} |