diff options
Diffstat (limited to 'lib/Unicode')
-rw-r--r-- | lib/Unicode/Collate.pm | 294 | ||||
-rw-r--r-- | lib/Unicode/Collate/Changes | 12 | ||||
-rw-r--r-- | lib/Unicode/Collate/README | 30 | ||||
-rw-r--r-- | lib/Unicode/Collate/t/altern.t | 7 | ||||
-rw-r--r-- | lib/Unicode/Collate/t/cjkrange.t | 94 | ||||
-rw-r--r-- | lib/Unicode/Collate/t/contract.t | 10 | ||||
-rw-r--r-- | lib/Unicode/Collate/t/hangtype.t | 12 | ||||
-rw-r--r-- | lib/Unicode/Collate/t/hangul.t | 10 | ||||
-rw-r--r-- | lib/Unicode/Collate/t/ignor.t | 158 | ||||
-rw-r--r-- | lib/Unicode/Collate/t/illegal.t | 7 | ||||
-rw-r--r-- | lib/Unicode/Collate/t/illegalp.t | 11 | ||||
-rw-r--r-- | lib/Unicode/Collate/t/index.t | 14 | ||||
-rw-r--r-- | lib/Unicode/Collate/t/normal.t | 9 | ||||
-rw-r--r-- | lib/Unicode/Collate/t/override.t | 187 | ||||
-rw-r--r-- | lib/Unicode/Collate/t/rearrang.t | 10 | ||||
-rw-r--r-- | lib/Unicode/Collate/t/test.t | 294 | ||||
-rw-r--r-- | lib/Unicode/Collate/t/trailwt.t | 7 | ||||
-rw-r--r-- | lib/Unicode/Collate/t/variable.t | 7 | ||||
-rw-r--r-- | lib/Unicode/Collate/t/version.t | 11 | ||||
-rw-r--r-- | lib/Unicode/Collate/t/view.t | 7 |
20 files changed, 736 insertions, 455 deletions
diff --git a/lib/Unicode/Collate.pm b/lib/Unicode/Collate.pm index b87054f3aa..d58fbb5d0e 100644 --- a/lib/Unicode/Collate.pm +++ b/lib/Unicode/Collate.pm @@ -14,7 +14,7 @@ use File::Spec; no warnings 'utf8'; -our $VERSION = '0.40'; +our $VERSION = '0.50'; our $PACKAGE = __PACKAGE__; my @Path = qw(Unicode Collate); @@ -76,9 +76,9 @@ use constant Hangul_LIni => 0x1100; use constant Hangul_LFin => 0x1159; use constant Hangul_LFill => 0x115F; use constant Hangul_VBase => 0x1161; -use constant Hangul_VIni => 0x1160; +use constant Hangul_VIni => 0x1160; # from Vowel Filler use constant Hangul_VFin => 0x11A2; -use constant Hangul_TBase => 0x11A7; +use constant Hangul_TBase => 0x11A7; # from "no-final" codepoint use constant Hangul_TIni => 0x11A8; use constant Hangul_TFin => 0x11F9; use constant Hangul_TCount => 28; @@ -88,6 +88,7 @@ use constant Hangul_SIni => 0xAC00; use constant Hangul_SFin => 0xD7A3; use constant CJK_UidIni => 0x4E00; use constant CJK_UidFin => 0x9FA5; +use constant CJK_UidF41 => 0x9FBB; use constant CJK_ExtAIni => 0x3400; use constant CJK_ExtAFin => 0x4DB5; use constant CJK_ExtBIni => 0x20000; @@ -95,12 +96,11 @@ use constant CJK_ExtBFin => 0x2A6D6; use constant BMP_Max => 0xFFFF; # Logical_Order_Exception in PropList.txt -# TODO: synchronization with change of PropList.txt. my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ]; -sub UCA_Version { "11" } +sub UCA_Version { "14" } -sub Base_Unicode_Version { "4.0" } +sub Base_Unicode_Version { "4.1.0" } ###### @@ -109,7 +109,7 @@ sub pack_U { } sub unpack_U { - return unpack('U*', pack('U*').shift); + return unpack('U*', shift(@_).pack('U*')); } ###### @@ -168,7 +168,7 @@ sub change { } # else => ignored } - $self->checkCollator; + $self->checkCollator(); return wantarray ? %old : $self; } @@ -187,6 +187,7 @@ my %DerivCode = ( 8 => \&_derivCE_8, 9 => \&_derivCE_9, 11 => \&_derivCE_9, # 11 == 9 + 14 => \&_derivCE_14, ); sub checkCollator { @@ -200,7 +201,7 @@ sub checkCollator { $self->{alternateTable} || 'shifted'; $self->{variable} = $self->{alternate} = lc($self->{variable}); exists $VariableOK{ $self->{variable} } - or croak "$PACKAGE unknown variable tag name: $self->{variable}"; + or croak "$PACKAGE unknown variable parameter name: $self->{variable}"; if (! defined $self->{backwards}) { $self->{backwardsFlag} = 0; @@ -262,7 +263,7 @@ sub new # If undef is passed explicitly, no file is read. $self->{table} = $KeyFile if ! exists $self->{table}; - $self->read_table if defined $self->{table}; + $self->read_table() if defined $self->{table}; if ($self->{entry}) { while ($self->{entry} =~ /([^\n]+)/g) { @@ -279,12 +280,13 @@ sub new if ! exists $self->{overrideCJK}; $self->{normalization} = 'NFD' if ! exists $self->{normalization}; - $self->{rearrange} = $self->{rearrangeTable} || $DefaultRearrange + $self->{rearrange} = $self->{rearrangeTable} || + ($self->{UCA_Version} <= 11 ? $DefaultRearrange : []) if ! exists $self->{rearrange}; $self->{backwards} = $self->{backwardsTable} if ! exists $self->{backwards}; - $self->checkCollator; + $self->checkCollator(); return $self; } @@ -380,9 +382,9 @@ sub parseEntry my @wt = _getHexArray($arr); push @key, pack(VCE_TEMPLATE, $var, @wt); $is_L3_ignorable = FALSE - if $wt[0] + $wt[1] + $wt[2] != 0; - # if $arr !~ /[1-9A-Fa-f]/; NG - # Conformance Test shows L3-ignorable is completely ignorable. + if $wt[0] || $wt[1] || $wt[2]; + # Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable + # is completely ignorable. # For expansion, an entry $is_L3_ignorable # if and only if "all" CEs are [.0000.0000.0000]. } @@ -455,7 +457,7 @@ sub splitEnt my $map = $self->{mapping}; my $max = $self->{maxlength}; my $reH = $self->{rearrangeHash}; - my $ver9 = $self->{UCA_Version} >= 9; + my $ver9 = $self->{UCA_Version} >= 9 && $self->{UCA_Version} <= 11; my ($str, @buf); @@ -487,7 +489,7 @@ sub splitEnt } } - # To remove a character marked as a completely ignorable. + # remove a code point marked as a completely ignorable. for (my $i = 0; $i < @src; $i++) { $src[$i] = undef if _isIllegal($src[$i]) || ($ver9 && @@ -496,10 +498,19 @@ sub splitEnt for (my $i = 0; $i < @src; $i++) { my $jcps = $src[$i]; - next if ! defined $jcps; + + # skip removed code point + if (! defined $jcps) { + if ($wLen && @buf) { + $buf[-1][2] = $i + 1; + } + next; + } + my $i_orig = $i; - if ($max->{$jcps}) { # contract + # find contraction + if ($max->{$jcps}) { my $temp_jcps = $jcps; my $jcpsLen = 1; my $maxLen = $max->{$jcps}; @@ -542,10 +553,12 @@ sub splitEnt } } - if ($wLen) { - for (; $i + 1 < @src; $i++) { - last if defined $src[$i + 1]; + # skip completely ignorable + if ($map->{$jcps} && @{ $map->{$jcps} } == 0) { + if ($wLen && @buf) { + $buf[-1][2] = $i + 1; } + next; } push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps; @@ -610,15 +623,13 @@ sub getWt } return map _varCE($vbl, $_), @hangulCE; } - elsif (CJK_UidIni <= $u && $u <= CJK_UidFin || - CJK_ExtAIni <= $u && $u <= CJK_ExtAFin || - CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) { + elsif (_isUIdeo($u, $self->{UCA_Version})) { my $cjk = $self->{overrideCJK}; return map _varCE($vbl, $_), $cjk ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$cjk($u)) - : defined $cjk && $self->{UCA_Version} <= 8 && $u <= BMP_Max - ? pack(VCE_TEMPLATE, NON_VAR, $u, Min2Wt, Min3Wt, $u) + : defined $cjk && $self->{UCA_Version} <= 8 && $u < 0x10000 + ? _uideoCE_8($u) : $der->($u); } else { @@ -635,8 +646,8 @@ sub getSortKey my $self = shift; my $lev = $self->{level}; my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS - my $ver9 = $self->{UCA_Version} >= 9; - my $v2i = $ver9 && $self->{variable} ne 'non-ignorable'; + my $v2i = $self->{UCA_Version} >= 9 && + $self->{variable} ne 'non-ignorable'; my @buf; # weight arrays if ($self->{hangul_terminator}) { @@ -673,6 +684,8 @@ sub getSortKey foreach my $vwt (@buf) { my($var, @wt) = unpack(VCE_TEMPLATE, $vwt); + + # "Ignorable (L1, L2) after Variable" since track. v. 9 if ($v2i) { if ($var) { $last_is_variable = TRUE; @@ -740,6 +753,23 @@ sub sort { } +sub _derivCE_14 { + my $u = shift; + my $base = + (CJK_UidIni <= $u && $u <= CJK_UidF41) + ? 0xFB40 : # CJK + (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin || + CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) + ? 0xFB80 # CJK ext. + : 0xFBC0; # others + + my $aaaa = $base + ($u >> 15); + my $bbbb = ($u & 0x7FFF) | 0x8000; + return + pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u), + pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u); +} + sub _derivCE_9 { my $u = shift; my $base = @@ -766,6 +796,23 @@ sub _derivCE_8 { pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code); } +sub _uideoCE_8 { + my $u = shift; + return pack(VCE_TEMPLATE, NON_VAR, $u, Min2Wt, Min3Wt, $u); +} + +sub _isUIdeo { + my ($u, $uca_vers) = @_; + return( + (CJK_UidIni <= $u && + ($uca_vers >= 14 ? ( $u <= CJK_UidF41) : ($u <= CJK_UidFin))) + || + (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin) + || + (CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) + ); +} + sub getWtHangulTerm { my $self = shift; @@ -785,14 +832,14 @@ sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g } # sub _decompHangul { my $code = shift; - my $SIndex = $code - Hangul_SBase; - my $LIndex = int( $SIndex / Hangul_NCount); - my $VIndex = int(($SIndex % Hangul_NCount) / Hangul_TCount); - my $TIndex = $SIndex % Hangul_TCount; + my $si = $code - Hangul_SBase; + my $li = int( $si / Hangul_NCount); + my $vi = int(($si % Hangul_NCount) / Hangul_TCount); + my $ti = $si % Hangul_TCount; return ( - Hangul_LBase + $LIndex, - Hangul_VBase + $VIndex, - $TIndex ? (Hangul_TBase + $TIndex) : (), + Hangul_LBase + $li, + Hangul_VBase + $vi, + $ti ? (Hangul_TBase + $ti) : (), ); } @@ -874,8 +921,8 @@ sub index my $grob = shift; my $lev = $self->{level}; - my $ver9 = $self->{UCA_Version} >= 9; - my $v2i = $self->{variable} ne 'non-ignorable'; + my $v2i = $self->{UCA_Version} >= 9 && + $self->{variable} ne 'non-ignorable'; if (! @$subE) { my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos; @@ -896,7 +943,8 @@ sub index my($var, @wt) = unpack(VCE_TEMPLATE, $vwt); my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev); - if ($v2i && $ver9) { + # "Ignorable (L1, L2) after Variable" since track. v. 9 + if ($v2i) { if ($var) { $last_is_variable = TRUE; } @@ -928,7 +976,8 @@ sub index my($var, @wt) = unpack(VCE_TEMPLATE, $vwt); my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev); - if ($v2i && $ver9) { + # "Ignorable (L1, L2) after Variable" since track. v. 9 + if ($v2i) { if ($var) { $last_is_variable = TRUE; } @@ -1083,9 +1132,8 @@ Unicode::Collate - Unicode Collation Algorithm =head1 DESCRIPTION -This module is an implementation -of Unicode Technical Standard #10 (UTS #10) -"Unicode Collation Algorithm." +This module is an implementation of Unicode Technical Standard #10 +(a.k.a. UTS #10) - Unicode Collation Algorithm (a.k.a. UCA). =head2 Constructor and Tailoring @@ -1117,14 +1165,27 @@ The C<new> method returns a collator object. =item UCA_Version -If the tracking version number of the older UCA is given, -the older behavior of that tracking version is emulated on collating. +If the tracking version number of UCA is given, +behavior of that tracking version is emulated on collating. If omitted, the return value of C<UCA_Version()> is used. +C<UCA_Version()> should return the latest tracking version supported. + +The supported tracking version: 8, 9, 11, or 14. + + UCA tracking version Unicode version + 8 3.1 + 9 3.1 with Corrigendum 3 + 11 4.0 + 14 4.1.0 + +Note: Recent UTS #10 renames "Tracking Version" to "Revision." + +=item alternate -The supported tracking version: 8, 9, or 11. +-- see 3.2.2 Alternate Weighting, version 8 of UTS #10 -B<This parameter may be removed in the future version, -as switching the algorithm would affect the performance.> +For backward compatibility, C<alternate> (old name) can be used +as an alias for C<variable>. =item backwards @@ -1185,7 +1246,7 @@ ordered between C<c-curl> and C<D>. =item hangul_terminator --- see Condition B.2. in 7.1.4 Trailing Weights, UTS #10. +-- see 7.1.4 Trailing Weights, UTS #10. If a true value is given (non-zero but should be positive), it will be added as a terminator primary weight to the end of @@ -1211,21 +1272,39 @@ automatically terminated with a terminator primary weight. These characters may need terminator included in a collation element table beforehand. -=item ignoreName - =item ignoreChar --- see Completely Ignorable, 3.2.2 Variable Weighting, UTS #10. +=item ignoreName + +-- see 3.2.2 Variable Weighting, UTS #10. Makes the entry in the table completely ignorable; i.e. as if the weights were zero at all level. +Through C<ignoreChar>, any character matching C<qr/$ignoreChar/> +will be ignored. Through C<ignoreName>, any character whose name +(given in the C<table> file as a comment) matches C<qr/$ignoreName/> +will be ignored. + E.g. when 'a' and 'e' are ignorable, 'element' is equal to 'lament' (or 'lmnt'). +=item katakana_before_hiragana + +-- see 7.3.1 Tertiary Weight Table, UTS #10. + +By default, hiragana is before katakana. +If the parameter is made true, this is reversed. + +B<NOTE>: This parameter simplemindedly assumes that any hiragana/katakana +distinctions must occur in level 3, and their weights at level 3 must be +same as those mentioned in 7.3.1, UTS #10. +If you define your collation elements which violate this requirement, +this parameter does not work validly. + =item level --- see 4.3 Form a sort key for each string, UTS #10. +-- see 4.3 Form Sort Key, UTS #10. Set the maximum level. Any higher levels than the specified one are ignored. @@ -1241,7 +1320,7 @@ If omitted, the maximum is the 4th. =item normalization --- see 4.1 Normalize each input string, UTS #10. +-- see 4.1 Normalize, UTS #10. If specified, strings are normalized before preparation of sort keys (the normalization is executed after preprocess). @@ -1285,9 +1364,10 @@ B<Unicode::Normalize> is required (see also B<CAVEAT>). -- see 7.1 Derived Collation Elements, UTS #10. By default, CJK Unified Ideographs are ordered in Unicode codepoint order -(but C<CJK Unified Ideographs> [C<U+4E00> to C<U+9FA5>] are lesser than -C<CJK Unified Ideographs Extension> [C<U+3400> to C<U+4DB5> and -C<U+20000> to C<U+2A6D6>]. +but C<CJK Unified Ideographs> (if C<UCA_Version> is 8 to 11, its range is +C<U+4E00..U+9FA5>; if C<UCA_Version> is 14, its range is C<U+4E00..U+9FBB>) +are lesser than C<CJK Unified Ideographs Extension> (its range is +C<U+3400..U+4DB5> and C<U+20000..U+2A6D6>). Through C<overrideCJK>, ordering of CJK Unified Ideographs can be overrided. @@ -1321,7 +1401,7 @@ By default, Hangul Syllables are decomposed into Hangul Jamo, even if C<(normalization =E<gt> undef)>. But the mapping of Hangul Syllables may be overrided. -This tag works like C<overrideCJK>, so see there for examples. +This parameter works like C<overrideCJK>, so see there for examples. If you want to override the mapping of Hangul Syllables, NFD, NFKD, and FCD are not appropriate, @@ -1356,13 +1436,14 @@ C<preprocess> is performed before C<normalization> (if defined). -- see 3.1.3 Rearrangement, UTS #10. Characters that are not coded in logical order and to be rearranged. -By default, +If C<UCA_Version> is equal to or lesser than 11, default is: rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ], -If you want to disallow any rearrangement, -pass C<undef> or C<[]> (a reference to an empty list) -as the value for this key. +If you want to disallow any rearrangement, pass C<undef> or C<[]> +(a reference to empty list) as the value for this key. + +If C<UCA_Version> is equal to 14, default is C<[]> (i.e. no rearrangement). B<According to the version 9 of UCA, this parameter shall not be used; but it is not warned at present.> @@ -1374,8 +1455,8 @@ but it is not warned at present.> You can use another collation element table if desired. The table file should locate in the F<Unicode/Collate> directory -on C<@INC>. Say, if the filename is F<Foo.txt> -the table file is searched as F<Unicode/Collate/Foo.txt> in <@INC>. +on C<@INC>. Say, if the filename is F<Foo.txt>, +the table file is searched as F<Unicode/Collate/Foo.txt> in C<@INC>. By default, F<allkeys.txt> (as the filename of DUCET) is used. @@ -1397,10 +1478,13 @@ without any file of table: ENTRIES ); -=item undefName +If C<ignoreName> or C<undefName> is used, character names should be +specified as a comment (following C<#>) on each line. =item undefChar +=item undefName + -- see 6.3.4 Reducing the Repertoire, UTS #10. Undefines the collation element as if it were unassigned in the table. @@ -1413,36 +1497,32 @@ as it is greater than any other assigned collation elements But, it'd be better to ignore characters unfamiliar to you and maybe never used. +Through C<undefChar>, any character matching C<qr/$undefChar/> +will be undefined. Through C<undefName>, any character whose name +(given in the C<table> file as a comment) matches C<qr/$undefName/> +will be undefined. + ex. Collation weights for beyond-BMP characters are not stored in object: undefChar => qr/[^\0-\x{fffd}]/, -=item katakana_before_hiragana - =item upper_before_lower --- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTS #10. +-- see 6.6 Case Comparisons, UTS #10. -By default, lowercase is before uppercase -and hiragana is before katakana. +By default, lowercase is before uppercase. +If the parameter is made true, this is reversed. -If the tag is made true, this is reversed. - -B<NOTE>: These tags simplemindedly assume -any lowercase/uppercase or hiragana/katakana distinctions -must occur in level 3, and their weights at level 3 -must be same as those mentioned in 7.3.1, UTS #10. -If you define your collation elements which violate this requirement, -these tags don't work validly. +B<NOTE>: This parameter simplemindedly assumes that any lowercase/uppercase +distinctions must occur in level 3, and their weights at level 3 must be +same as those mentioned in 7.3.1, UTS #10. +If you define your collation elements which differs from this requirement, +this parameter doesn't work validly. =item variable -=item alternate - -- see 3.2.2 Variable Weighting, UTS #10. -(the title in UCA version 8: Alternate Weighting) - This key allows to variable weighting for variable collation elements, which are marked with an ASTERISK in the table (NOTE: Many punction marks and symbols are variable in F<allkeys.txt>). @@ -1464,9 +1544,6 @@ By default (if specification is omitted), 'shifted' is adopted. 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level are trimmed. -For backward compatibility, C<alternate> can be used as an alias -for C<variable>. - =back =head2 Methods for Collation @@ -1506,7 +1583,7 @@ They works like the same name operators as theirs. =item C<$sortKey = $Collator-E<gt>getSortKey($string)> --- see 4.3 Form a sort key for each string, UTS #10. +-- see 4.3 Form Sort Key, UTS #10. Returns a sort key. @@ -1521,6 +1598,9 @@ and get the result of the comparison of the strings using UCA. =item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)> +Converts a sorting key into its representation form. +If C<UCA_Version> is 8, the output is slightly different. + use Unicode::Collate; my $c = Unicode::Collate->new(); print $c->viewSortKey("Perl"),"\n"; @@ -1529,18 +1609,16 @@ and get the result of the comparison of the strings using UCA. # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF] # Level 1 Level 2 Level 3 Level 4 - (If C<UCA_Version> is 8, the output is slightly different.) - =back =head2 Methods for Searching -B<DISCLAIMER:> If C<preprocess> or C<normalization> tag is true +B<DISCLAIMER:> If C<preprocess> or C<normalization> parameter is true for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>, C<subst>, C<gsubst>) is croaked, as the position and the length might differ from those on the specified string. -(And C<rearrange> and C<hangul_terminator> tags are neglected.) +(And C<rearrange> and C<hangul_terminator> parameters are neglected.) The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively, @@ -1573,7 +1651,7 @@ e.g. you say } and get C<"muß"> in C<$match> since C<"muß"> -is primary equal to C<"MÜSS">. +is primary equal to C<"MÜSS">. =item C<$match_ref = $Collator-E<gt>match($string, $substring)> @@ -1597,7 +1675,7 @@ e.g. print "doesn't match.\n"; } - or + or if (($match) = $Collator->match($str, $sub)) { # list context print "matches [$match].\n"; @@ -1639,10 +1717,10 @@ e.g. my $Collator = Unicode::Collate->new( normalization => undef, level => 1 ); # (normalization => undef) is REQUIRED. - my $str = "Camel ass came\x{301}l CAMEL horse cAm\0E\0L..."; + my $str = "Camel donkey zebra came\x{301}l CAMEL horse cAm\0E\0L..."; $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" }); - # now $str is "<b>Camel</b> ass <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cAm\0E\0L</b>..."; + # now $str is "<b>Camel</b> donkey zebra <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cAm\0E\0L</b>..."; # i.e., all the camels are made bold-faced. =back @@ -1696,14 +1774,18 @@ Returns the version number of UTS #10 this module consults. =back -=head2 EXPORT +=head1 EXPORT + +No method will be exported. -None by default. +=head1 CAVEATS + +=over 4 -=head2 CAVEAT +=item Normalization -Use of the C<normalization> parameter requires -the B<Unicode::Normalize> module. +Use of the C<normalization> parameter requires the B<Unicode::Normalize> +module (see L<Unicode::Normalize>). If you need not it (say, in the case when you need not handle any combining characters), @@ -1711,7 +1793,7 @@ assign C<normalization =E<gt> undef> explicitly. -- see 6.5 Avoiding Normalization, UTS #10. -=head2 Conformance Test +=item Conformance Test The Conformance Test for the UCA is available under L<http://www.unicode.org/Public/UCA/>. @@ -1723,16 +1805,16 @@ C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>. B<Unicode::Normalize is required to try The Conformance Test.> +=back + =head1 AUTHOR SADAHIRO Tomoyuki <SADAHIRO@cpan.org> - http://homepage1.nifty.com/nomenclator/perl/ +Copyright(C) 2001-2005, SADAHIRO Tomoyuki. Japan. All rights reserved. - Copyright(C) 2001-2004, SADAHIRO Tomoyuki. Japan. All rights reserved. - - This library is free software; you can redistribute it - and/or modify it under the same terms as Perl itself. +This module is free software; you can redistribute it +and/or modify it under the same terms as Perl itself. =head1 SEE ALSO @@ -1760,8 +1842,6 @@ L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt> L<http://www.unicode.org/reports/tr15/> -=item L<Unicode::Normalize> - =back =cut diff --git a/lib/Unicode/Collate/Changes b/lib/Unicode/Collate/Changes index a59ffa0e55..38f0c9a8fe 100644 --- a/lib/Unicode/Collate/Changes +++ b/lib/Unicode/Collate/Changes @@ -1,5 +1,14 @@ Revision history for Perl module Unicode::Collate. +0.50 Sun May 8 20:26:39 2005 + - Now UCA Revision 14 (based on Unicode 4.1.0). + - Some tests are modified. + - Added cjkrange.t, ignor.t, override.t. + - Added META.yml. + +0.40 Sat Apr 24 06:54:40 2004 + - Now a table file is searched in @INC. + 0.33 Sat Dec 13 14:07:27 2003 - documentation improvement: in "entry", "overrideHangul", etc. @@ -7,7 +16,8 @@ Revision history for Perl module Unicode::Collate. - A matching part from index(), match() etc. will include illegal code points (as well as ignorable characters) following a grapheme. - Contraction with illegal code point will be invalid. - - Added some tests in illegal.t; added view.t. + - Added view.t. + - Added some tests in illegal.t. - Some tests are separated from test.t into altern.t and rearrang.t. - modified XSUB internals. diff --git a/lib/Unicode/Collate/README b/lib/Unicode/Collate/README index 376a0c2c13..db632a2369 100644 --- a/lib/Unicode/Collate/README +++ b/lib/Unicode/Collate/README @@ -1,4 +1,4 @@ -Unicode/Collate version 0.33 +Unicode/Collate version 0.50 =============================== NAME @@ -22,17 +22,23 @@ SYNOPSIS INSTALLATION -Perl 5.6.1 or later -(recommended: Perl 5.8.0 or later) +Perl 5.6.1 or later is required. +Perl 5.8.1 or later is recommended. To use this module, it is recommended to install a table file in the UCA format, by copying it into the directory -where F<Unicode/Collate.pm> is installed; -e.g. into F<perl/lib/Unicode/Collate/> directory -if you have F<perl/lib/Unicode/Collate.pm>. +where Unicode/Collate.pm is installed; +e.g., into perl/lib/Unicode/Collate/ directory +if you will have perl/lib/Unicode/Collate.pm. + +You can install such a table file by adding it +to "Collate" directory (where "keys.txt" is placed) in this distribution +before executing Makefile.PL. The most preferable one is "The Default Unicode Collation Element Table", -available from the Unicode consortium's website: +available from the Unicode Consortium's website: + + http://www.unicode.org/Public/UCA/ http://www.unicode.org/Public/UCA/latest/allkeys.txt (latest version) @@ -72,14 +78,12 @@ DEPENDENCIES The conformant collation requires Unicode::Normalize (v 0.10 or later) although Unicode::Collate can be used without Unicode::Normalize. -COPYRIGHT AND LICENCE +COPYRIGHT AND LICENSE SADAHIRO Tomoyuki <SADAHIRO@cpan.org> - http://homepage1.nifty.com/nomenclator/perl/ - - Copyright(C) 2001-2003, SADAHIRO Tomoyuki. Japan. All rights reserved. +Copyright(C) 2001-2005, SADAHIRO Tomoyuki. Japan. All rights reserved. - This library is free software; you can redistribute it - and/or modify it under the same terms as Perl itself. +This module is free software; you can redistribute it and/or +modify it under the same terms as Perl itself. diff --git a/lib/Unicode/Collate/t/altern.t b/lib/Unicode/Collate/t/altern.t index d48e168b69..c958f4b58f 100644 --- a/lib/Unicode/Collate/t/altern.t +++ b/lib/Unicode/Collate/t/altern.t @@ -5,12 +5,9 @@ BEGIN { "cannot stringify a Unicode code point\n"; exit 0; } -} - -BEGIN { if ($ENV{PERL_CORE}) { - chdir('t') if -d 't'; - @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); } } diff --git a/lib/Unicode/Collate/t/cjkrange.t b/lib/Unicode/Collate/t/cjkrange.t new file mode 100644 index 0000000000..5a39bb8a57 --- /dev/null +++ b/lib/Unicode/Collate/t/cjkrange.t @@ -0,0 +1,94 @@ + +BEGIN { + unless ("A" eq pack('U', 0x41)) { + print "1..0 # Unicode::Collate " . + "cannot stringify a Unicode code point\n"; + exit 0; + } + if ($ENV{PERL_CORE}) { + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + } +} + +use Test; +BEGIN { plan tests => 51 }; + +use strict; +use warnings; +use Unicode::Collate; + +ok(1); + +my $Collator = Unicode::Collate->new( + table => 'keys.txt', + normalization => undef, +); + +# U+9FA6..U+9FBB are CJK UI since Unicode 4.1.0. +# U+3400 is CJK UI ExtA, then greater than any CJK UI. + +##### 2..11 +ok($Collator->lt("\x{9FA5}", "\x{3400}")); # UI < ExtA +ok($Collator->lt("\x{9FA6}", "\x{3400}")); # new UI < ExtA +ok($Collator->lt("\x{9FBB}", "\x{3400}")); # new UI < ExtA +ok($Collator->gt("\x{9FBC}", "\x{3400}")); # Unassigned > ExtA +ok($Collator->gt("\x{9FFF}", "\x{3400}")); # Unassigned > ExtA +ok($Collator->lt("\x{9FA6}", "\x{9FBB}")); # new UI > new UI +ok($Collator->lt("\x{3400}","\x{20000}")); # ExtA < ExtB +ok($Collator->lt("\x{3400}","\x{2A6D6}")); # ExtA < ExtB +ok($Collator->gt("\x{9FFF}","\x{20000}")); # Unassigned > ExtB +ok($Collator->gt("\x{9FFF}","\x{2A6D6}")); # Unassigned > ExtB + +##### 12..21 +$Collator->change(UCA_Version => 11); +ok($Collator->lt("\x{9FA5}", "\x{3400}")); # UI < ExtA +ok($Collator->gt("\x{9FA6}", "\x{3400}")); # Unassigned > ExtA +ok($Collator->gt("\x{9FBB}", "\x{3400}")); # Unassigned > ExtA +ok($Collator->gt("\x{9FBC}", "\x{3400}")); # Unassigned > ExtA +ok($Collator->gt("\x{9FFF}", "\x{3400}")); # Unassigned > ExtA +ok($Collator->lt("\x{9FA6}", "\x{9FBB}")); # Unassigned > Unassigned +ok($Collator->lt("\x{3400}","\x{20000}")); # ExtA < ExtB +ok($Collator->lt("\x{3400}","\x{2A6D6}")); # ExtA < ExtB +ok($Collator->gt("\x{9FFF}","\x{20000}")); # Unassigned > ExtB +ok($Collator->gt("\x{9FFF}","\x{2A6D6}")); # Unassigned > ExtB + +##### 22..31 +$Collator->change(UCA_Version => 9); +ok($Collator->lt("\x{9FA5}", "\x{3400}")); # UI < ExtA +ok($Collator->gt("\x{9FA6}", "\x{3400}")); # Unassigned > ExtA +ok($Collator->gt("\x{9FBB}", "\x{3400}")); # Unassigned > ExtA +ok($Collator->gt("\x{9FBC}", "\x{3400}")); # Unassigned > ExtA +ok($Collator->gt("\x{9FFF}", "\x{3400}")); # Unassigned > ExtA +ok($Collator->lt("\x{9FA6}", "\x{9FBB}")); # Unassigned > Unassigned +ok($Collator->lt("\x{3400}","\x{20000}")); # ExtA < ExtB +ok($Collator->lt("\x{3400}","\x{2A6D6}")); # ExtA < ExtB +ok($Collator->gt("\x{9FFF}","\x{20000}")); # Unassigned > ExtB +ok($Collator->gt("\x{9FFF}","\x{2A6D6}")); # Unassigned > ExtB + +##### 32..41 +$Collator->change(UCA_Version => 8); +ok($Collator->gt("\x{9FA5}", "\x{3400}")); # UI > ExtA +ok($Collator->gt("\x{9FA6}", "\x{3400}")); # Unassigned > ExtA +ok($Collator->gt("\x{9FBB}", "\x{3400}")); # Unassigned > ExtA +ok($Collator->gt("\x{9FBC}", "\x{3400}")); # Unassigned > ExtA +ok($Collator->gt("\x{9FFF}", "\x{3400}")); # Unassigned > ExtA +ok($Collator->lt("\x{9FA6}", "\x{9FBB}")); # new UI > new UI +ok($Collator->lt("\x{3400}","\x{20000}")); # ExtA < Unassigned(ExtB) +ok($Collator->lt("\x{3400}","\x{2A6D6}")); # ExtA < Unassigned(ExtB) +ok($Collator->lt("\x{9FFF}","\x{20000}")); # Unassigned < Unassigned(ExtB) +ok($Collator->lt("\x{9FFF}","\x{2A6D6}")); # Unassigned < Unassigned(ExtB) + +##### 42..51 +$Collator->change(UCA_Version => 14); +ok($Collator->lt("\x{9FA5}", "\x{3400}")); # UI < ExtA +ok($Collator->lt("\x{9FA6}", "\x{3400}")); # new UI < ExtA +ok($Collator->lt("\x{9FBB}", "\x{3400}")); # new UI < ExtA +ok($Collator->gt("\x{9FBC}", "\x{3400}")); # Unassigned > ExtA +ok($Collator->gt("\x{9FFF}", "\x{3400}")); # Unassigned > ExtA +ok($Collator->lt("\x{9FA6}", "\x{9FBB}")); # new UI > new UI +ok($Collator->lt("\x{3400}","\x{20000}")); # ExtA < ExtB +ok($Collator->lt("\x{3400}","\x{2A6D6}")); # ExtA < ExtB +ok($Collator->gt("\x{9FFF}","\x{20000}")); # Unassigned > ExtB +ok($Collator->gt("\x{9FFF}","\x{2A6D6}")); # Unassigned > ExtB + diff --git a/lib/Unicode/Collate/t/contract.t b/lib/Unicode/Collate/t/contract.t index 18a0cfbdc9..9c55ecdc8b 100644 --- a/lib/Unicode/Collate/t/contract.t +++ b/lib/Unicode/Collate/t/contract.t @@ -4,12 +4,9 @@ BEGIN { "cannot stringify a Unicode code point\n"; exit 0; } -} - -BEGIN { if ($ENV{PERL_CORE}) { - chdir('t') if -d 't'; - @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); } } @@ -20,9 +17,6 @@ use strict; use warnings; use Unicode::Collate; -use vars qw($IsEBCDIC); -$IsEBCDIC = ord("A") != 0x41; - our $kjeEntry = <<'ENTRIES'; 0301 ; [.0000.0032.0002.0301] # COMBINING ACUTE ACCENT 0334 ; [.0000.008B.0002.0334] # COMBINING TILDE OVERLAY diff --git a/lib/Unicode/Collate/t/hangtype.t b/lib/Unicode/Collate/t/hangtype.t index b6a46691aa..d8ea74673e 100644 --- a/lib/Unicode/Collate/t/hangtype.t +++ b/lib/Unicode/Collate/t/hangtype.t @@ -4,17 +4,14 @@ BEGIN { "cannot stringify a Unicode code point\n"; exit 0; } -} - -BEGIN { if ($ENV{PERL_CORE}) { - chdir('t') if -d 't'; - @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); } } use Test; -BEGIN { plan tests => 30 }; +BEGIN { plan tests => 33 }; use strict; use warnings; @@ -48,9 +45,12 @@ ok(Unicode::Collate::getHST(0x11FA), ''); ok(Unicode::Collate::getHST(0x11FF), ''); ok(Unicode::Collate::getHST(0x3011), ''); ok(Unicode::Collate::getHST(0x11A7), ''); +ok(Unicode::Collate::getHST(0xABFF), ''); ok(Unicode::Collate::getHST(0xAC00), 'LV'); ok(Unicode::Collate::getHST(0xAC01), 'LVT'); ok(Unicode::Collate::getHST(0xAC1B), 'LVT'); ok(Unicode::Collate::getHST(0xAC1C), 'LV'); ok(Unicode::Collate::getHST(0xD7A3), 'LVT'); +ok(Unicode::Collate::getHST(0xD7A4), ''); +ok(Unicode::Collate::getHST(0xFFFF), ''); diff --git a/lib/Unicode/Collate/t/hangul.t b/lib/Unicode/Collate/t/hangul.t index fd6cc69394..d9f7db9b10 100644 --- a/lib/Unicode/Collate/t/hangul.t +++ b/lib/Unicode/Collate/t/hangul.t @@ -4,12 +4,9 @@ BEGIN { "cannot stringify a Unicode code point\n"; exit 0; } -} - -BEGIN { if ($ENV{PERL_CORE}) { - chdir('t') if -d 't'; - @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); } } @@ -20,9 +17,6 @@ use strict; use warnings; use Unicode::Collate; -use vars qw($IsEBCDIC); -$IsEBCDIC = ord("A") != 0x41; - ######################### ok(1); diff --git a/lib/Unicode/Collate/t/ignor.t b/lib/Unicode/Collate/t/ignor.t new file mode 100644 index 0000000000..4ee47c6f12 --- /dev/null +++ b/lib/Unicode/Collate/t/ignor.t @@ -0,0 +1,158 @@ +BEGIN { + unless ("A" eq pack('U', 0x41)) { + print "1..0 # Unicode::Collate " . + "cannot stringify a Unicode code point\n"; + exit 0; + } + if ($ENV{PERL_CORE}) { + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + } +} + +use Test; +BEGIN { plan tests => 41 }; + +use strict; +use warnings; +use Unicode::Collate; + +ok(1); + +my $trad = Unicode::Collate->new( + table => 'keys.txt', + normalization => undef, + ignoreName => qr/HANGUL|HIRAGANA|KATAKANA|BOPOMOFO/, + level => 3, + entry => << 'ENTRIES', + 0063 0068 ; [.0A3F.0020.0002.0063] % "ch" in traditional Spanish + 0043 0068 ; [.0A3F.0020.0007.0043] # "Ch" in traditional Spanish + 0043 0048 ; [.0A3F.0020.0008.0043] # "CH" in traditional Spanish +ENTRIES +); +# 0063 ; [.0A3D.0020.0002.0063] # LATIN SMALL LETTER C +# 0064 ; [.0A49.0020.0002.0064] # LATIN SMALL LETTER D + +##### 2..3 + +ok( + join(':', $trad->sort( qw/ acha aca ada acia acka / ) ), + join(':', qw/ aca acia acka acha ada / ), +); + +ok( + join(':', $trad->sort( qw/ ACHA ACA ADA ACIA ACKA / ) ), + join(':', qw/ ACA ACIA ACKA ACHA ADA / ), +); + +##### 4..7 + +ok($trad->gt("ocho", "oc\cAho")); # UCA v14 +ok($trad->gt("ocho", "oc\0\cA\0\cBho")); # UCA v14 +ok($trad->eq("-", "")); +ok($trad->gt("ocho", "oc-ho")); + +##### 8..11 + +$trad->change(UCA_Version => 9); + +ok($trad->eq("ocho", "oc\cAho")); # UCA v9 +ok($trad->eq("ocho", "oc\0\cA\0\cBho")); # UCA v9 +ok($trad->eq("-", "")); +ok($trad->gt("ocho", "oc-ho")); + +##### 12..15 + +$trad->change(UCA_Version => 8); + +ok($trad->gt("ocho", "oc\cAho")); +ok($trad->gt("ocho", "oc\0\cA\0\cBho")); +ok($trad->eq("-", "")); +ok($trad->gt("ocho", "oc-ho")); + + +##### 16..19 + +$trad->change(UCA_Version => 9); + +my $hiragana = "\x{3042}\x{3044}"; +my $katakana = "\x{30A2}\x{30A4}"; + +# HIRAGANA and KATAKANA are ignorable via ignoreName +ok($trad->eq($hiragana, "")); +ok($trad->eq("", $katakana)); +ok($trad->eq($hiragana, $katakana)); +ok($trad->eq($katakana, $hiragana)); + + +##### 20..31 + +# According to Conformance Test (UCA_Version == 9 or 11), +# a L3-ignorable is treated as a completely ignorable. + +my $L3ignorable = Unicode::Collate->new( + alternate => 'Non-ignorable', + level => 3, + table => undef, + normalization => undef, + UCA_Version => 9, + entry => <<'ENTRIES', +0000 ; [.0000.0000.0000.0000] # [0000] NULL (in 6429) +0001 ; [.0000.0000.0000.0000] # [0001] START OF HEADING (in 6429) +0591 ; [.0000.0000.0000.0591] # HEBREW ACCENT ETNAHTA +1D165 ; [.0000.0000.0000.1D165] # MUSICAL SYMBOL COMBINING STEM +0021 ; [*024B.0020.0002.0021] # EXCLAMATION MARK +09BE ; [.114E.0020.0002.09BE] # BENGALI VOWEL SIGN AA +09C7 ; [.1157.0020.0002.09C7] # BENGALI VOWEL SIGN E +09CB ; [.1159.0020.0002.09CB] # BENGALI VOWEL SIGN O +09C7 09BE ; [.1159.0020.0002.09CB] # BENGALI VOWEL SIGN O +1D1B9 ; [*098A.0020.0002.1D1B9] # MUSICAL SYMBOL SEMIBREVIS WHITE +1D1BA ; [*098B.0020.0002.1D1BA] # MUSICAL SYMBOL SEMIBREVIS BLACK +1D1BB ; [*098A.0020.0002.1D1B9][.0000.0000.0000.1D165] # M.S. MINIMA +1D1BC ; [*098B.0020.0002.1D1BA][.0000.0000.0000.1D165] # M.S. MINIMA BLACK +ENTRIES +); + +ok($L3ignorable->lt("\cA", "!")); +ok($L3ignorable->lt("\x{591}", "!")); +ok($L3ignorable->eq("\cA", "\x{591}")); +ok($L3ignorable->eq("\x{09C7}\x{09BE}A", "\x{09C7}\cA\x{09BE}A")); +ok($L3ignorable->eq("\x{09C7}\x{09BE}A", "\x{09C7}\x{0591}\x{09BE}A")); +ok($L3ignorable->eq("\x{09C7}\x{09BE}A", "\x{09C7}\x{1D165}\x{09BE}A")); +ok($L3ignorable->eq("\x{09C7}\x{09BE}A", "\x{09CB}A")); +ok($L3ignorable->lt("\x{1D1BB}", "\x{1D1BC}")); +ok($L3ignorable->eq("\x{1D1BB}", "\x{1D1B9}")); +ok($L3ignorable->eq("\x{1D1BC}", "\x{1D1BA}")); +ok($L3ignorable->eq("\x{1D1BB}", "\x{1D1B9}\x{1D165}")); +ok($L3ignorable->eq("\x{1D1BC}", "\x{1D1BA}\x{1D165}")); + +##### 32..41 + +my $c = Unicode::Collate->new( + table => 'keys.txt', + normalization => undef, + level => 1, + UCA_Version => 14, + entry => << 'ENTRIES', +034F ; [.0000.0000.0000.034F] # COMBINING GRAPHEME JOINER +0063 0068 ; [.0A3F.0020.0002.0063] % "ch" in traditional Spanish +0043 0068 ; [.0A3F.0020.0007.0043] # "Ch" in traditional Spanish +0043 0048 ; [.0A3F.0020.0008.0043] # "CH" in traditional Spanish +ENTRIES +); +# 0063 ; [.0A3D.0020.0002.0063] # LATIN SMALL LETTER C +# 0064 ; [.0A49.0020.0002.0064] # LATIN SMALL LETTER D + +ok($c->gt("ocho", "oc\x00\x00ho")); +ok($c->gt("ocho", "oc\cAho")); +ok($c->gt("ocho", "oc\x{034F}ho")); +ok($c->gt("ocio", "oc\x{034F}ho")); +ok($c->lt("ocgo", "oc\x{034F}ho")); +ok($c->lt("oceo", "oc\x{034F}ho")); + +ok($c->viewSortKey("ocho"), "[0B4B 0A3F 0B4B | | |]"); +ok($c->viewSortKey("oc\x00\x00ho"), "[0B4B 0A3D 0AB9 0B4B | | |]"); +ok($c->viewSortKey("oc\cAho"), "[0B4B 0A3D 0AB9 0B4B | | |]"); +ok($c->viewSortKey("oc\x{034F}ho"), "[0B4B 0A3D 0AB9 0B4B | | |]"); + + diff --git a/lib/Unicode/Collate/t/illegal.t b/lib/Unicode/Collate/t/illegal.t index 803e2f6739..825177c283 100644 --- a/lib/Unicode/Collate/t/illegal.t +++ b/lib/Unicode/Collate/t/illegal.t @@ -5,12 +5,9 @@ BEGIN { "cannot stringify a Unicode code point\n"; exit 0; } -} - -BEGIN { if ($ENV{PERL_CORE}) { - chdir('t') if -d 't'; - @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); } } diff --git a/lib/Unicode/Collate/t/illegalp.t b/lib/Unicode/Collate/t/illegalp.t index 690c88d0bb..ff1936d353 100644 --- a/lib/Unicode/Collate/t/illegalp.t +++ b/lib/Unicode/Collate/t/illegalp.t @@ -5,12 +5,9 @@ BEGIN { "cannot stringify a Unicode code point\n"; exit 0; } -} - -BEGIN { if ($ENV{PERL_CORE}) { - chdir('t') if -d 't'; - @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); } } @@ -26,9 +23,9 @@ ok(1); # No test for Unicode::Collate is included in this .t file. # # UCA conformance test requires completely ignorable characters -# (including noncharacters) must be able to be ordered in code point order; +# (including noncharacters) must be able to be sorted in code point order. # If not so, Unicode::Collate must not be compliant with UCA. -# +# # ~~~ CollationTest_SHIFTED.txt in CollationTest-4.0.0 # # 206F 0021; # ! NOMINAL DIGIT SHAPES [| | | 0251] diff --git a/lib/Unicode/Collate/t/index.t b/lib/Unicode/Collate/t/index.t index a1d67d5346..5b6c78d968 100644 --- a/lib/Unicode/Collate/t/index.t +++ b/lib/Unicode/Collate/t/index.t @@ -5,12 +5,9 @@ BEGIN { "cannot stringify a Unicode code point\n"; exit 0; } -} - -BEGIN { if ($ENV{PERL_CORE}) { - chdir('t') if -d 't'; - @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); } } @@ -339,10 +336,10 @@ $ret = $Collator->gsubst($str, "perl", \&strreverse); ok($ret, 2); ok($str, "lr\cB\x{300}e\cBP and LREP."); -$str = "Camel ass came\x{301}l CAMEL horse cAm\0E\0L..."; +$str = "Camel donkey zebra came\x{301}l CAMEL horse cAm\0E\0L..."; $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" }); -ok($str, -"<b>Camel</b> ass <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cAm\0E\0L</b>..."); +ok($str, "<b>Camel</b> donkey zebra <b>came\x{301}l</b> " + . "<b>CAMEL</b> horse <b>cAm\0E\0L</b>..."); $Collator->change(level => 3); @@ -401,3 +398,4 @@ $Collator->change(alternate => 'Non-ignorable'); ($ret) = $Collator->match("A?\x{300}!\x{301}B\x{315}", "?!"); ok($ret, undef); + diff --git a/lib/Unicode/Collate/t/normal.t b/lib/Unicode/Collate/t/normal.t index 026240d6fa..57ea03368b 100644 --- a/lib/Unicode/Collate/t/normal.t +++ b/lib/Unicode/Collate/t/normal.t @@ -4,12 +4,9 @@ BEGIN { "cannot stringify a Unicode code point\n"; exit 0; } -} - -BEGIN { if ($ENV{PERL_CORE}) { - chdir('t') if -d 't'; - @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); } } @@ -44,7 +41,7 @@ FF21; [.0A87.0020.0008] # LATIN CAPITAL LETTER A 00C5; [.0AC5.0020.0008] # LATIN CAPITAL LETTER A WITH RING ABOVE ENTRIES -# Aong < A+ring < Z < fullA+ring < A-ring +# Aong < A+ring < Z < fullA+ring < A-ring ######################### diff --git a/lib/Unicode/Collate/t/override.t b/lib/Unicode/Collate/t/override.t new file mode 100644 index 0000000000..3e48e15f29 --- /dev/null +++ b/lib/Unicode/Collate/t/override.t @@ -0,0 +1,187 @@ +BEGIN { + unless ("A" eq pack('U', 0x41)) { + print "1..0 # Unicode::Collate " . + "cannot stringify a Unicode code point\n"; + exit 0; + } + if ($ENV{PERL_CORE}) { + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + } +} + +use Test; +BEGIN { plan tests => 76 }; + +use strict; +use warnings; +use Unicode::Collate; + +ok(1); + +##### 2..6 + +my $all_undef_8 = Unicode::Collate->new( + table => undef, + normalization => undef, + overrideCJK => undef, + overrideHangul => undef, + UCA_Version => 8, +); + +# All in the Unicode code point order. +# No hangul decomposition. + +ok($all_undef_8->lt("\x{3402}", "\x{4E00}")); +ok($all_undef_8->lt("\x{4DFF}", "\x{4E00}")); +ok($all_undef_8->lt("\x{4E00}", "\x{AC00}")); +ok($all_undef_8->gt("\x{AC00}", "\x{1100}\x{1161}")); +ok($all_undef_8->gt("\x{AC00}", "\x{ABFF}")); + + +##### 7..11 + +my $all_undef_9 = Unicode::Collate->new( + table => undef, + normalization => undef, + overrideCJK => undef, + overrideHangul => undef, + UCA_Version => 9, +); + +# CJK Ideo. < CJK ext A/B < Others. +# No hangul decomposition. + +ok($all_undef_9->lt("\x{4E00}", "\x{3402}")); +ok($all_undef_9->lt("\x{3402}", "\x{20000}")); +ok($all_undef_9->lt("\x{20000}", "\x{AC00}")); +ok($all_undef_9->gt("\x{AC00}", "\x{1100}\x{1161}")); +ok($all_undef_9->gt("\x{AC00}", "\x{ABFF}")); # U+ABFF: not assigned + +##### 12..16 + +my $ignoreHangul = Unicode::Collate->new( + table => undef, + normalization => undef, + overrideHangul => sub {()}, + entry => <<'ENTRIES', +AE00 ; [.0100.0020.0002.AE00] # Hangul GEUL +ENTRIES +); + +# All Hangul Syllables except U+AE00 are ignored. + +ok($ignoreHangul->eq("\x{AC00}", "")); +ok($ignoreHangul->lt("\x{AC00}", "\0")); +ok($ignoreHangul->lt("\x{AC00}", "\x{AE00}")); +ok($ignoreHangul->lt("\x{AC00}", "\x{1100}\x{1161}")); # Jamo are not ignored. +ok($ignoreHangul->lt("Pe\x{AE00}rl", "Perl")); # 'r' is unassigned. + + +my $ignoreCJK = Unicode::Collate->new( + table => undef, + normalization => undef, + overrideCJK => sub {()}, + entry => <<'ENTRIES', +5B57 ; [.0107.0020.0002.5B57] # CJK Ideograph "Letter" +ENTRIES +); + +# All CJK Unified Ideographs except U+5B57 are ignored. + +##### 17..21 +ok($ignoreCJK->eq("\x{4E00}", "")); +ok($ignoreCJK->lt("\x{4E00}", "\0")); +ok($ignoreCJK->eq("Pe\x{4E00}rl", "Perl")); # U+4E00 is a CJK. +ok($ignoreCJK->gt("\x{4DFF}", "\x{4E00}")); # U+4DFF is not CJK. +ok($ignoreCJK->lt("Pe\x{5B57}rl", "Perl")); # 'r' is unassigned. + +##### 22..29 +ok($ignoreCJK->eq("\x{3400}", "")); +ok($ignoreCJK->eq("\x{4DB5}", "")); +ok($ignoreCJK->eq("\x{9FA5}", "")); +ok($ignoreCJK->eq("\x{9FA6}", "")); # UI since Unicode 4.1.0 +ok($ignoreCJK->eq("\x{9FBB}", "")); # UI since Unicode 4.1.0 +ok($ignoreCJK->gt("\x{9FBC}", "Perl")); +ok($ignoreCJK->eq("\x{20000}", "")); +ok($ignoreCJK->eq("\x{2A6D6}", "")); + +##### 30..37 +$ignoreCJK->change(UCA_Version => 9); +ok($ignoreCJK->eq("\x{3400}", "")); +ok($ignoreCJK->eq("\x{4DB5}", "")); +ok($ignoreCJK->eq("\x{9FA5}", "")); +ok($ignoreCJK->gt("\x{9FA6}", "Perl")); +ok($ignoreCJK->gt("\x{9FBB}", "Perl")); +ok($ignoreCJK->gt("\x{9FBC}", "Perl")); +ok($ignoreCJK->eq("\x{20000}", "")); +ok($ignoreCJK->eq("\x{2A6D6}", "")); + +##### 38..45 +$ignoreCJK->change(UCA_Version => 8); +ok($ignoreCJK->eq("\x{3400}", "")); +ok($ignoreCJK->eq("\x{4DB5}", "")); +ok($ignoreCJK->eq("\x{9FA5}", "")); +ok($ignoreCJK->gt("\x{9FA6}", "Perl")); +ok($ignoreCJK->gt("\x{9FBB}", "Perl")); +ok($ignoreCJK->gt("\x{9FBC}", "Perl")); +ok($ignoreCJK->eq("\x{20000}", "")); +ok($ignoreCJK->eq("\x{2A6D6}", "")); + +##### 46..53 +$ignoreCJK->change(UCA_Version => 14); +ok($ignoreCJK->eq("\x{3400}", "")); +ok($ignoreCJK->eq("\x{4DB5}", "")); +ok($ignoreCJK->eq("\x{9FA5}", "")); +ok($ignoreCJK->eq("\x{9FA6}", "")); # UI since Unicode 4.1.0 +ok($ignoreCJK->eq("\x{9FBB}", "")); # UI since Unicode 4.1.0 +ok($ignoreCJK->gt("\x{9FBC}", "Perl")); +ok($ignoreCJK->eq("\x{20000}", "")); +ok($ignoreCJK->eq("\x{2A6D6}", "")); + +##### 54..76 +my $overCJK = Unicode::Collate->new( + table => undef, + normalization => undef, + entry => <<'ENTRIES', +0061 ; [.0101.0020.0002.0061] # latin a +0041 ; [.0101.0020.0008.0041] # LATIN A +4E00 ; [.B1FC.0030.0004.4E00] # Ideograph; B1FC = FFFF - 4E03. +ENTRIES + overrideCJK => sub { + my $u = 0xFFFF - $_[0]; # reversed + [$u, 0x20, 0x2, $u]; + }, +); + +ok($overCJK->lt("a", "A")); # diff. at level 3. +ok($overCJK->lt( "\x{4E03}", "\x{4E00}")); # diff. at level 2. +ok($overCJK->lt("A\x{4E03}", "A\x{4E00}")); +ok($overCJK->lt("A\x{4E03}", "a\x{4E00}")); +ok($overCJK->lt("a\x{4E03}", "A\x{4E00}")); + +ok($overCJK->gt("a\x{3400}", "A\x{4DB5}")); +ok($overCJK->gt("a\x{4DB5}", "A\x{9FA5}")); +ok($overCJK->gt("a\x{9FA5}", "A\x{9FA6}")); +ok($overCJK->gt("a\x{9FA6}", "A\x{9FBB}")); +ok($overCJK->lt("a\x{9FBB}", "A\x{9FBC}")); +ok($overCJK->lt("a\x{9FBC}", "A\x{9FBF}")); + +$overCJK->change(UCA_Version => 9); + +ok($overCJK->gt("a\x{3400}", "A\x{4DB5}")); +ok($overCJK->gt("a\x{4DB5}", "A\x{9FA5}")); +ok($overCJK->lt("a\x{9FA5}", "A\x{9FA6}")); +ok($overCJK->lt("a\x{9FA6}", "A\x{9FBB}")); +ok($overCJK->lt("a\x{9FBB}", "A\x{9FBC}")); +ok($overCJK->lt("a\x{9FBC}", "A\x{9FBF}")); + +$overCJK->change(UCA_Version => 14); + +ok($overCJK->gt("a\x{3400}", "A\x{4DB5}")); +ok($overCJK->gt("a\x{4DB5}", "A\x{9FA5}")); +ok($overCJK->gt("a\x{9FA5}", "A\x{9FA6}")); +ok($overCJK->gt("a\x{9FA6}", "A\x{9FBB}")); +ok($overCJK->lt("a\x{9FBB}", "A\x{9FBC}")); +ok($overCJK->lt("a\x{9FBC}", "A\x{9FBF}")); + diff --git a/lib/Unicode/Collate/t/rearrang.t b/lib/Unicode/Collate/t/rearrang.t index cc02fa9f79..0977db9e48 100644 --- a/lib/Unicode/Collate/t/rearrang.t +++ b/lib/Unicode/Collate/t/rearrang.t @@ -5,12 +5,9 @@ BEGIN { "cannot stringify a Unicode code point\n"; exit 0; } -} - -BEGIN { if ($ENV{PERL_CORE}) { - chdir('t') if -d 't'; - @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); } } @@ -28,6 +25,7 @@ ok(1); my $Collator = Unicode::Collate->new( table => 'keys.txt', normalization => undef, + UCA_Version => 9, ); # rearrange : 0x0E40..0x0E44, 0x0EC0..0x0EC4 (default) @@ -73,6 +71,7 @@ my $no_rearrange = Unicode::Collate->new( table => undef, normalization => undef, rearrange => [], + UCA_Version => 9, ); ok($no_rearrange->lt("A", "B")); @@ -87,6 +86,7 @@ my $undef_rearrange = Unicode::Collate->new( table => undef, normalization => undef, rearrange => undef, + UCA_Version => 9, ); ok($undef_rearrange->lt("A", "B")); diff --git a/lib/Unicode/Collate/t/test.t b/lib/Unicode/Collate/t/test.t index 53fa7ca879..a5337a014f 100644 --- a/lib/Unicode/Collate/t/test.t +++ b/lib/Unicode/Collate/t/test.t @@ -5,17 +5,14 @@ BEGIN { "cannot stringify a Unicode code point\n"; exit 0; } -} - -BEGIN { if ($ENV{PERL_CORE}) { - chdir('t') if -d 't'; - @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); } } use Test; -BEGIN { plan tests => 160 }; +BEGIN { plan tests => 107 }; use strict; use warnings; @@ -23,7 +20,17 @@ use Unicode::Collate; ok(1); -##### 2..6 +sub _pack_U { Unicode::Collate::pack_U(@_) } +sub _unpack_U { Unicode::Collate::unpack_U(@_) } + +my $A_acute = _pack_U(0xC1); +my $a_acute = _pack_U(0xE1); +my $acute = _pack_U(0x0301); + +my $hiragana = "\x{3042}\x{3044}"; +my $katakana = "\x{30A2}\x{30A4}"; + +##### 2..7 my $Collator = Unicode::Collate->new( table => 'keys.txt', @@ -32,28 +39,21 @@ my $Collator = Unicode::Collate->new( ok(ref $Collator, "Unicode::Collate"); - -ok( - join(':', $Collator->sort( - qw/ lib strict Carp ExtUtils CGI Time warnings Math overload Pod CPAN / - ) ), - join(':', - qw/ Carp CGI CPAN ExtUtils lib Math overload Pod strict Time warnings / - ), -); - ok($Collator->cmp("", ""), 0); ok($Collator->eq("", "")); ok($Collator->cmp("", "perl"), -1); -##### 7..17 +ok( + join(':', $Collator->sort( qw/ acha aca ada acia acka / ) ), + join(':', qw/ aca acha acia acka ada / ), +); -sub _pack_U { Unicode::Collate::pack_U(@_) } -sub _unpack_U { Unicode::Collate::unpack_U(@_) } +ok( + join(':', $Collator->sort( qw/ ACHA ACA ADA ACIA ACKA / ) ), + join(':', qw/ ACA ACHA ACIA ACKA ADA / ), +); -my $A_acute = _pack_U(0xC1); -my $a_acute = _pack_U(0xE1); -my $acute = _pack_U(0x0301); +##### 8..18 ok($Collator->cmp("A$acute", $A_acute), 0); # @version 3.1.1 (prev: -1) ok($Collator->cmp($a_acute, $A_acute), -1); @@ -71,84 +71,7 @@ ok($Collator->lt("A", $A_acute)); ok($Collator->lt("A", $a_acute)); ok($Collator->lt($a_acute, $A_acute)); -##### 18..20 - -eval { require Unicode::Normalize }; -if (!$@) { - my $NFD = Unicode::Collate->new( - table => 'keys.txt', - level => 1, - entry => <<'ENTRIES', -0430 ; [.0CB5.0020.0002.0430] # CYRILLIC SMALL LETTER A -0410 ; [.0CB5.0020.0008.0410] # CYRILLIC CAPITAL LETTER A -04D3 ; [.0CBD.0020.0002.04D3] # CYRILLIC SMALL LETTER A WITH DIAERESIS -0430 0308 ; [.0CBD.0020.0002.04D3] # CYRILLIC SMALL LETTER A WITH DIAERESIS -04D2 ; [.0CBD.0020.0008.04D2] # CYRILLIC CAPITAL LETTER A WITH DIAERESIS -0410 0308 ; [.0CBD.0020.0008.04D2] # CYRILLIC CAPITAL LETTER A WITH DIAERESIS -0430 3099 ; [.0CBE.0020.0002.04D3] # A WITH KATAKANA VOICED -0430 3099 0308 ; [.0CBF.0020.0002.04D3] # A WITH KATAKANA VOICED, DIAERESIS -ENTRIES - ); - ok($NFD->eq("\x{4D3}\x{325}", "\x{430}\x{308}\x{325}")); - ok($NFD->lt("\x{430}\x{308}A", "\x{430}\x{308}B")); - ok($NFD->lt("\x{430}\x{3099}B", "\x{430}\x{308}\x{3099}A")); -} -else { - ok(1); - ok(1); - ok(1); -} - -##### 21..34 - -my $trad = Unicode::Collate->new( - table => 'keys.txt', - normalization => undef, - ignoreName => qr/HANGUL|HIRAGANA|KATAKANA|BOPOMOFO/, - level => 3, - entry => << 'ENTRIES', - 0063 0068 ; [.0A3F.0020.0002.0063] % "ch" in traditional Spanish - 0043 0068 ; [.0A3F.0020.0008.0043] # "Ch" in traditional Spanish -ENTRIES -); -# 0063 ; [.0A3D.0020.0002.0063] # LATIN SMALL LETTER C -# 0064 ; [.0A49.0020.0002.0064] # LATIN SMALL LETTER D -# Deutsch sz is included in 'keys.txt'; - -ok( - join(':', $trad->sort( qw/ acha aca ada acia acka / ) ), - join(':', qw/ aca acia acka acha ada / ), -); - -ok( - join(':', $Collator->sort( qw/ acha aca ada acia acka / ) ), - join(':', qw/ aca acha acia acka ada / ), -); - -ok($trad->eq("ocho", "oc\cAho")); # UCA v9 -ok($trad->eq("ocho", "oc\0\cA\0\cBho")); # UCA v9 -ok($trad->eq("-", "")); -ok($trad->gt("ocho", "oc-ho")); - -$trad->change(UCA_Version => 8); - -ok($trad->gt("ocho", "oc\cAho")); -ok($trad->gt("ocho", "oc\0\cA\0\cBho")); -ok($trad->eq("-", "")); -ok($trad->gt("ocho", "oc-ho")); - -$trad->change(UCA_Version => 9); - -my $hiragana = "\x{3042}\x{3044}"; -my $katakana = "\x{30A2}\x{30A4}"; - -# HIRAGANA and KATAKANA are ignorable via ignoreName -ok($trad->eq($hiragana, "")); -ok($trad->eq("", $katakana)); -ok($trad->eq($hiragana, $katakana)); -ok($trad->eq($katakana, $hiragana)); - -##### 35..41 +##### 19..25 $Collator->change(level => 2); @@ -161,7 +84,7 @@ ok( $Collator->cmp($hiragana, $katakana), 0); ok( $Collator->eq($hiragana, $katakana) ); ok( $Collator->ge($hiragana, $katakana) ); -##### 42..47 +##### 26..31 # hangul ok( $Collator->eq("a\x{AC00}b", "a\x{1100}\x{1161}b") ); @@ -171,7 +94,7 @@ ok( $Collator->lt("a\x{AC00}b", "a\x{AE00}b") ); ok( $Collator->gt("a\x{D7A3}b", "a\x{C544}b") ); ok( $Collator->lt("a\x{C544}b", "a\x{30A2}b") ); # hangul < hiragana -##### 48..56 +##### 32..40 $Collator->change(%old_level, katakana_before_hiragana => 1); @@ -186,7 +109,7 @@ ok( $Collator->ne($hiragana, $katakana) ); ok( $Collator->gt($hiragana, $katakana) ); ok( $Collator->ge($hiragana, $katakana) ); -##### 57..62 +##### 41..46 $Collator->change(upper_before_lower => 1); @@ -197,13 +120,15 @@ ok( $Collator->cmp($hiragana, $katakana), 1); ok( $Collator->ge($hiragana, $katakana), 1); ok( $Collator->gt($hiragana, $katakana), 1); -##### 63..68 +##### 47..48 $Collator->change(katakana_before_hiragana => 0); ok( $Collator->cmp("abc", "ABC"), 1); ok( $Collator->cmp($hiragana, $katakana), -1); +##### 49..52 + $Collator->change(upper_before_lower => 0); ok( $Collator->cmp("abc", "ABC"), -1); @@ -211,7 +136,7 @@ ok( $Collator->le("abc", "ABC") ); ok( $Collator->cmp($hiragana, $katakana), -1); ok( $Collator->lt($hiragana, $katakana) ); -##### 69..70 +##### 53..54 my $ignoreAE = Unicode::Collate->new( table => 'keys.txt', @@ -222,7 +147,7 @@ my $ignoreAE = Unicode::Collate->new( ok($ignoreAE->eq("element","lament")); ok($ignoreAE->eq("Perl","ePrl")); -##### 71 +##### 55 my $onlyABC = Unicode::Collate->new( table => undef, @@ -242,7 +167,7 @@ ok( join(':', qw/ A aB Ab ABA BAC cAc cc / ), ); -##### 72..75 +##### 56..59 my $undefAE = Unicode::Collate->new( table => 'keys.txt', @@ -255,7 +180,7 @@ ok($Collator->lt("edge","fog")); ok($undefAE ->gt("lake","like")); ok($Collator->lt("lake","like")); -##### 76..85 +##### 60..69 # Table is undefined, then no entry is defined. @@ -270,7 +195,6 @@ ok($undef_table->lt('', 'A')); ok($undef_table->lt('ABC', 'B')); # Hangul should be decomposed (even w/o Unicode::Normalize). - ok($undef_table->lt("Perl", "\x{AC00}")); ok($undef_table->eq("\x{AC00}", "\x{1100}\x{1161}")); ok($undef_table->eq("\x{AE00}", "\x{1100}\x{1173}\x{11AF}")); @@ -280,7 +204,6 @@ ok($undef_table->lt("\x{AE00}", "\x{3042}")); # U+3042: Hiragana A # Weight for CJK Ideographs is defined, though. - ok($undef_table->lt("", "\x{4E00}")); ok($undef_table->lt("\x{4E8C}","ABC")); ok($undef_table->lt("\x{4E00}","\x{3042}")); @@ -289,7 +212,7 @@ ok($undef_table->lt("\x{4E00}","\x{4E8C}")); # U+4E8C: Ideograph "TWO" -##### 86..90 +##### 70..74 my $few_entries = Unicode::Collate->new( entry => <<'ENTRIES', @@ -320,105 +243,7 @@ ok($few_entries->lt("\x{AE30}", "\x{AC00}")); ok($few_entries->eq("\x{AC00}", "\x{1100}\x{1161}")); -##### 91..95 - -my $all_undef_8 = Unicode::Collate->new( - table => undef, - normalization => undef, - overrideCJK => undef, - overrideHangul => undef, - UCA_Version => 8, -); - -# All in the Unicode code point order. -# No hangul decomposition. - -ok($all_undef_8->lt("\x{3402}", "\x{4E00}")); -ok($all_undef_8->lt("\x{4DFF}", "\x{4E00}")); -ok($all_undef_8->lt("\x{4E00}", "\x{AC00}")); -ok($all_undef_8->gt("\x{AC00}", "\x{1100}\x{1161}")); -ok($all_undef_8->gt("\x{AC00}", "\x{ABFF}")); - -##### 96..100 - -my $all_undef_9 = Unicode::Collate->new( - table => undef, - normalization => undef, - overrideCJK => undef, - overrideHangul => undef, - UCA_Version => 9, -); - -# CJK Ideo. < CJK ext A/B < Others. -# No hangul decomposition. - -ok($all_undef_9->lt("\x{4E00}", "\x{3402}")); -ok($all_undef_9->lt("\x{3402}", "\x{20000}")); -ok($all_undef_9->lt("\x{20000}", "\x{AC00}")); -ok($all_undef_9->gt("\x{AC00}", "\x{1100}\x{1161}")); -ok($all_undef_9->gt("\x{AC00}", "\x{ABFF}")); # U+ABFF: not assigned - -##### 101..105 - -my $ignoreCJK = Unicode::Collate->new( - table => undef, - normalization => undef, - overrideCJK => sub {()}, - entry => <<'ENTRIES', -5B57 ; [.0107.0020.0002.5B57] # CJK Ideograph "Letter" -ENTRIES -); - -# All CJK Unified Ideographs except U+5B57 are ignored. - -ok($ignoreCJK->eq("\x{4E00}", "")); -ok($ignoreCJK->lt("\x{4E00}", "\0")); -ok($ignoreCJK->eq("Pe\x{4E00}rl", "Perl")); # U+4E00 is a CJK. -ok($ignoreCJK->gt("\x{4DFF}", "\x{4E00}")); # U+4DFF is not CJK. -ok($ignoreCJK->lt("Pe\x{5B57}rl", "Perl")); # 'r' is unassigned. - -##### 106..110 - -my $ignoreHangul = Unicode::Collate->new( - table => undef, - normalization => undef, - overrideHangul => sub {()}, - entry => <<'ENTRIES', -AE00 ; [.0100.0020.0002.AE00] # Hangul GEUL -ENTRIES -); - -# All Hangul Syllables except U+AE00 are ignored. - -ok($ignoreHangul->eq("\x{AC00}", "")); -ok($ignoreHangul->lt("\x{AC00}", "\0")); -ok($ignoreHangul->lt("\x{AC00}", "\x{AE00}")); -ok($ignoreHangul->lt("\x{AC00}", "\x{1100}\x{1161}")); # Jamo are not ignored. -ok($ignoreHangul->lt("Pe\x{AE00}rl", "Perl")); # 'r' is unassigned. - -##### 111..115 - -my $overCJK = Unicode::Collate->new( - table => undef, - normalization => undef, - entry => <<'ENTRIES', -0061 ; [.0101.0020.0002.0061] # latin a -0041 ; [.0101.0020.0008.0041] # LATIN A -4E00 ; [.B1FC.0030.0004.4E00] # Ideograph; B1FC = FFFF - 4E03. -ENTRIES - overrideCJK => sub { - my $u = 0xFFFF - $_[0]; # reversed - [$u, 0x20, 0x2, $u]; - }, -); - -ok($overCJK->lt("a", "A")); # diff. at level 3. -ok($overCJK->lt( "\x{4E03}", "\x{4E00}")); # diff. at level 2. -ok($overCJK->lt("A\x{4E03}", "A\x{4E00}")); -ok($overCJK->lt("A\x{4E03}", "a\x{4E00}")); -ok($overCJK->lt("a\x{4E03}", "A\x{4E00}")); - -##### 116..120 +##### 75..79 my $dropArticles = Unicode::Collate->new( table => "keys.txt", @@ -436,7 +261,7 @@ ok($dropArticles->lt("the pen", "a pencil")); ok($Collator->lt("Perl", "The Perl")); ok($Collator->gt("the pen", "a pencil")); -##### 121..122 +##### 80..81 my $backLevel1 = Unicode::Collate->new( table => undef, @@ -449,7 +274,7 @@ my $backLevel1 = Unicode::Collate->new( ok($backLevel1->gt("AB", "BA")); ok($backLevel1->gt("\x{3042}\x{3044}", "\x{3044}\x{3042}")); -##### 123..130 +##### 82..89 my $backLevel2 = Unicode::Collate->new( table => "keys.txt", @@ -471,47 +296,8 @@ ok($backLevel2->lt("\x{4E03}", $katakana)); ok($Collator ->gt("\x{4E00}", $hiragana)); ok($Collator ->gt("\x{4E03}", $katakana)); -##### 131..142 - -# According to Conformance Test, -# a L3-ignorable is treated as a completely ignorable. - -my $L3ignorable = Unicode::Collate->new( - alternate => 'Non-ignorable', - level => 3, - table => undef, - normalization => undef, - entry => <<'ENTRIES', -0000 ; [.0000.0000.0000.0000] # [0000] NULL (in 6429) -0001 ; [.0000.0000.0000.0000] # [0001] START OF HEADING (in 6429) -0591 ; [.0000.0000.0000.0591] # HEBREW ACCENT ETNAHTA -1D165 ; [.0000.0000.0000.1D165] # MUSICAL SYMBOL COMBINING STEM -0021 ; [*024B.0020.0002.0021] # EXCLAMATION MARK -09BE ; [.114E.0020.0002.09BE] # BENGALI VOWEL SIGN AA -09C7 ; [.1157.0020.0002.09C7] # BENGALI VOWEL SIGN E -09CB ; [.1159.0020.0002.09CB] # BENGALI VOWEL SIGN O -09C7 09BE ; [.1159.0020.0002.09CB] # BENGALI VOWEL SIGN O -1D1B9 ; [*098A.0020.0002.1D1B9] # MUSICAL SYMBOL SEMIBREVIS WHITE -1D1BA ; [*098B.0020.0002.1D1BA] # MUSICAL SYMBOL SEMIBREVIS BLACK -1D1BB ; [*098A.0020.0002.1D1B9][.0000.0000.0000.1D165] # M.S. MINIMA -1D1BC ; [*098B.0020.0002.1D1BA][.0000.0000.0000.1D165] # M.S. MINIMA BLACK -ENTRIES -); -ok($L3ignorable->lt("\cA", "!")); -ok($L3ignorable->lt("\x{591}", "!")); -ok($L3ignorable->eq("\cA", "\x{591}")); -ok($L3ignorable->eq("\x{09C7}\x{09BE}A", "\x{09C7}\cA\x{09BE}A")); -ok($L3ignorable->eq("\x{09C7}\x{09BE}A", "\x{09C7}\x{0591}\x{09BE}A")); -ok($L3ignorable->eq("\x{09C7}\x{09BE}A", "\x{09C7}\x{1D165}\x{09BE}A")); -ok($L3ignorable->eq("\x{09C7}\x{09BE}A", "\x{09CB}A")); -ok($L3ignorable->lt("\x{1D1BB}", "\x{1D1BC}")); -ok($L3ignorable->eq("\x{1D1BB}", "\x{1D1B9}")); -ok($L3ignorable->eq("\x{1D1BC}", "\x{1D1BA}")); -ok($L3ignorable->eq("\x{1D1BB}", "\x{1D1B9}\x{1D165}")); -ok($L3ignorable->eq("\x{1D1BC}", "\x{1D1BA}\x{1D165}")); - -##### 143..149 +##### 90..96 my $O_str = Unicode::Collate->new( table => "keys.txt", @@ -545,7 +331,7 @@ ok($Collator->eq("\x{200B}", "\0")); ok($O_str ->gt("\x{200B}", "\0")); ok($O_str ->gt("\x{200B}", "A")); -##### 150..159 +##### 97..107 my %origVer = $Collator->change(UCA_Version => 8); diff --git a/lib/Unicode/Collate/t/trailwt.t b/lib/Unicode/Collate/t/trailwt.t index 463252cf1c..e987f8f509 100644 --- a/lib/Unicode/Collate/t/trailwt.t +++ b/lib/Unicode/Collate/t/trailwt.t @@ -4,12 +4,9 @@ BEGIN { "cannot stringify a Unicode code point\n"; exit 0; } -} - -BEGIN { if ($ENV{PERL_CORE}) { - chdir('t') if -d 't'; - @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); } } diff --git a/lib/Unicode/Collate/t/variable.t b/lib/Unicode/Collate/t/variable.t index 880327a6bd..1a6bd6495b 100644 --- a/lib/Unicode/Collate/t/variable.t +++ b/lib/Unicode/Collate/t/variable.t @@ -5,12 +5,9 @@ BEGIN { "cannot stringify a Unicode code point\n"; exit 0; } -} - -BEGIN { if ($ENV{PERL_CORE}) { - chdir('t') if -d 't'; - @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); } } diff --git a/lib/Unicode/Collate/t/version.t b/lib/Unicode/Collate/t/version.t index fec144c9d7..17adf539ca 100644 --- a/lib/Unicode/Collate/t/version.t +++ b/lib/Unicode/Collate/t/version.t @@ -5,12 +5,9 @@ BEGIN { "cannot stringify a Unicode code point\n"; exit 0; } -} - -BEGIN { if ($ENV{PERL_CORE}) { - chdir('t') if -d 't'; - @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); } } @@ -26,8 +23,8 @@ ok(1); ######################### # Fix me when UCA and/or keys.txt is upgraded. -my $UCA_Version = "11"; -my $Base_Unicode_Version = "4.0"; +my $UCA_Version = "14"; +my $Base_Unicode_Version = "4.1.0"; my $Key_Version = "3.1.1"; ok(Unicode::Collate::UCA_Version, $UCA_Version); diff --git a/lib/Unicode/Collate/t/view.t b/lib/Unicode/Collate/t/view.t index 578d4843e5..44963f4189 100644 --- a/lib/Unicode/Collate/t/view.t +++ b/lib/Unicode/Collate/t/view.t @@ -5,12 +5,9 @@ BEGIN { "cannot stringify a Unicode code point\n"; exit 0; } -} - -BEGIN { if ($ENV{PERL_CORE}) { - chdir('t') if -d 't'; - @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); } } |