diff options
author | Jarkko Hietaniemi <jhi@iki.fi> | 2002-07-25 20:37:16 +0000 |
---|---|---|
committer | Jarkko Hietaniemi <jhi@iki.fi> | 2002-07-25 20:37:16 +0000 |
commit | 0116f5dc667173b72bd4d2214f20e592d19f1c37 (patch) | |
tree | 3afa1e605ff615ecca6bf776f929dca14a96b325 /lib/Unicode | |
parent | c993f866915d3552dc02138441c792f0dccb48d0 (diff) | |
download | perl-0116f5dc667173b72bd4d2214f20e592d19f1c37.tar.gz |
Upgrade to Unicode::Collate 0.20.
p4raw-id: //depot/perl@17655
Diffstat (limited to 'lib/Unicode')
-rw-r--r-- | lib/Unicode/Collate.pm | 409 | ||||
-rw-r--r-- | lib/Unicode/Collate/Changes | 8 | ||||
-rw-r--r-- | lib/Unicode/Collate/README | 6 | ||||
-rw-r--r-- | lib/Unicode/Collate/t/test.t | 211 |
4 files changed, 431 insertions, 203 deletions
diff --git a/lib/Unicode/Collate.pm b/lib/Unicode/Collate.pm index 51c290ec87..fa0ef225fa 100644 --- a/lib/Unicode/Collate.pm +++ b/lib/Unicode/Collate.pm @@ -14,7 +14,7 @@ use File::Spec; require Exporter; -our $VERSION = '0.12'; +our $VERSION = '0.20'; our $PACKAGE = __PACKAGE__; our @ISA = qw(Exporter); @@ -36,7 +36,6 @@ unless ($@) { else { # XXX, Perl 5.6.1 my($f, $fh); foreach my $d (@INC) { - use File::Spec; $f = File::Spec->catfile($d, "unicode", "Unicode.301"); if (open($fh, $f)) { $UNICODE_VERSION = '3.0.1'; @@ -48,53 +47,100 @@ else { # XXX, Perl 5.6.1 our $getCombinClass; # coderef for combining class from Unicode::Normalize -use constant Min2 => 0x20; # minimum weight at level 2 -use constant Min3 => 0x02; # minimum weight at level 3 -use constant UNDEFINED => 0xFF80; # special value for undefined CE's +use constant Min2 => 0x20; # minimum weight at level 2 +use constant Min3 => 0x02; # minimum weight at level 3 -our $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ]; - -sub UCA_Version { "8.0" } +# format for pack +use constant VCE_FORMAT => 'Cn4'; # for variable + CE with 4 levels -sub Base_Unicode_Version { $UNICODE_VERSION || 'unknown' } +# values of variable +use constant NON_VAR => 0; # Non-Variable character +use constant VAR => 1; # Variable character -## -## constructor -## -sub new -{ - my $class = shift; - my $self = bless { @_ }, $class; +our $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ]; - # alternate lowercased - $self->{alternate} = - ! exists $self->{alternate} ? 'shifted' : lc($self->{alternate}); +sub UCA_Version { "9" } - croak "$PACKAGE unknown alternate tag name: $self->{alternate}" - unless $self->{alternate} eq 'blanked' - || $self->{alternate} eq 'non-ignorable' - || $self->{alternate} eq 'shifted' - || $self->{alternate} eq 'shift-trimmed'; +sub Base_Unicode_Version { $UNICODE_VERSION || 'unknown' } - # collation level - $self->{level} ||= 4; +my (%AlternateOK); +@AlternateOK{ qw/ + blanked non-ignorable shifted shift-trimmed + / } = (); + +our @ChangeOK = qw/ + alternate backwards level normalization rearrange + katakana_before_hiragana upper_before_lower + overrideHangul overrideCJK preprocess UCA_Version + /; + +our @ChangeNG = qw/ + entry entries table ignored combining maxlength + ignoreChar ignoreName undefChar undefName + versionTable alternateTable backwardsTable forwardsTable rearrangeTable + derivCode normCode rearrangeHash isShift L3ignorable + /; + +my (%ChangeOK, %ChangeNG); +@ChangeOK{ @ChangeOK } = (); +@ChangeNG{ @ChangeNG } = (); + +sub change { + my $self = shift; + my %hash = @_; + my %old; + foreach my $k (keys %hash) { + if (exists $ChangeOK{$k}) { + $old{$k} = $self->{$k}; + $self->{$k} = $hash{$k}; + } + elsif (exists $ChangeNG{$k}) { + croak "change of $k via change() is not allowed!"; + } + # else => ignored + } + $self->checkCollator; + return wantarray ? %old : $self; +} +sub checkCollator { + my $self = shift; croak "Illegal level lower than 1 (passed $self->{level})." if $self->{level} < 1; croak "A level higher than 4 (passed $self->{level}) is not supported." if 4 < $self->{level}; - # overrideHangul and -CJK - # If true: CODEREF used; '': default; undef: derived elements - $self->{overrideHangul} = '' - if ! exists $self->{overrideHangul}; - $self->{overrideCJK} = '' - if ! exists $self->{overrideCJK}; + $self->{derivCode} = + $self->{UCA_Version} == -1 ? \&broken_derivCE : + $self->{UCA_Version} == 8 ? \&derivCE_8 : + $self->{UCA_Version} == 9 ? \&derivCE_9 : + croak "Illegal UCA version (passed $self->{UCA_Version})."; - # normalization form - $self->{normalization} = 'D' - if ! exists $self->{normalization}; - $self->{UNF} = undef; + $self->{alternate} = lc($self->{alternate}); + croak "$PACKAGE unknown alternate tag name: $self->{alternate}" + unless exists $AlternateOK{ $self->{alternate} }; + + $self->{isShift} = $self->{alternate} eq 'shifted' || + $self->{alternate} eq 'shift-trimmed'; + + $self->{backwards} = [] + if ! defined $self->{backwards}; + $self->{backwards} = [ $self->{backwards} ] + if ! ref $self->{backwards}; + + $self->{rearrange} = [] + if ! defined $self->{rearrange}; + croak "$PACKAGE: A list for rearrangement must be store in an ARRAYREF" + if ! ref $self->{rearrange}; + + # keys of $self->{rearrangeHash} are $self->{rearrange}. + $self->{rearrangeHash} = undef; + + if (@{ $self->{rearrange} }) { + @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = (); + } + + $self->{normCode} = undef; if (defined $self->{normalization}) { eval { require Unicode::Normalize }; @@ -105,7 +151,7 @@ sub new $getCombinClass = \&Unicode::Normalize::getCombinClass if ! $getCombinClass; - $self->{UNF} = + $self->{normCode} = $self->{normalization} =~ /^(?:NF)?C$/ ? \&NFC : $self->{normalization} =~ /^(?:NF)?D$/ ? \&NFD : $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC : @@ -113,37 +159,39 @@ sub new croak "$PACKAGE unknown normalization form name: " . $self->{normalization}; } + return; +} + +sub new +{ + my $class = shift; + my $self = bless { @_ }, $class; - # Open a table file. # If undef is passed explicitly, no file is read. - $self->{table} = $KeyFile - if ! exists $self->{table}; - $self->read_table - if defined $self->{table}; + $self->{table} = $KeyFile if ! exists $self->{table}; + $self->read_table if defined $self->{table}; if ($self->{entry}) { $self->parseEntry($_) foreach split /\n/, $self->{entry}; } - # backwards - $self->{backwards} ||= [ ]; - $self->{backwards} = [ $self->{backwards} ] - if ! ref $self->{backwards}; + $self->{level} ||= 4; + $self->{UCA_Version} ||= UCA_Version(); - # rearrange - $self->{rearrange} = $DefaultRearrange + $self->{overrideHangul} = '' + if ! exists $self->{overrideHangul}; + $self->{overrideCJK} = '' + if ! exists $self->{overrideCJK}; + $self->{normalization} = 'D' + if ! exists $self->{normalization}; + $self->{alternate} = $self->{alternateTable} || 'shifted' + if ! exists $self->{alternate}; + $self->{rearrange} = $self->{rearrangeTable} || $DefaultRearrange if ! exists $self->{rearrange}; - $self->{rearrange} = [] - if ! defined $self->{rearrange}; - croak "$PACKAGE: A list for rearrangement must be store in an ARRAYREF" - if ! ref $self->{rearrange}; + $self->{backwards} = $self->{backwardsTable} + if ! exists $self->{backwards}; - # keys of $self->{rearrangeHash} are $self->{rearrange}. - $self->{rearrangeHash} = undef; - - if (@{ $self->{rearrange} }) { - @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = (); - } + $self->checkCollator; return $self; } @@ -159,17 +207,20 @@ sub read_table { while (<$fk>) { next if /^\s*#/; if (/^\s*\@/) { - if (/^\@version\s*(\S*)/) { - $self->{version} ||= $1; + if (/^\s*\@version\s*(\S*)/) { + $self->{versionTable} ||= $1; + } + elsif (/^\s*\@alternate\s+(\S*)/) { + $self->{alternateTable} ||= $1; } - elsif (/^\@alternate\s+(.*)/) { - $self->{alternate} ||= $1; + elsif (/^\s*\@backwards\s+(\S*)/) { + push @{ $self->{backwardsTable} }, $1; } - elsif (/^\@backwards\s+(.*)/) { - push @{ $self->{backwards} }, $1; + elsif (/^\s*\@forwards\s+(\S*)/) { # parhaps no use + push @{ $self->{forwardsTable} }, $1; } - elsif (/^\@rearrange\s+(.*)/) { - push @{ $self->{rearrange} }, _getHexArray($1); + elsif (/^\s*\@rearrange\s+(.*)/) { # (\S*) is NG + push @{ $self->{rearrangeTable} }, _getHexArray($1); } next; } @@ -201,6 +252,8 @@ sub parseEntry if ! $k; my @e = _getHexArray($e); + return if !@e; + $ele = pack('U*', @e); return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/; @@ -212,27 +265,33 @@ sub parseEntry } else { my $combining = 1; # primary = 0, secondary != 0; + my $level3ingore; foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient. - push @key, $self->altCE($var, _getHexArray($arr)); - $combining = 0 unless $key[-1][0] == 0 && $key[-1][1] != 0; + my @arr = _getHexArray($arr); + push @key, pack(VCE_FORMAT, $var, @arr); + $combining = 0 unless $arr[0] == 0 && $arr[1] != 0; + $level3ingore = 1 if $arr[0] == 0 && $arr[1] == 0 && $arr[2] == 0; } $self->{entries}{$ele} = \@key; - $self->{combining}{$ele} = 1 if $combining; + + $self->{combining}{$ele} = 1 + if $combining; + + $self->{L3ignorable}{$e[0]} = 1 + if @e == 1 && $level3ingore; } $self->{maxlength}{ord $ele} = scalar @e if @e > 1; } - ## ## arrayref CE = altCE(bool variable?, list[num] weights) ## sub altCE { my $self = shift; - my $var = shift; - my @c = @_; + my($var, @c) = unpack(VCE_FORMAT, shift); $self->{alternate} eq 'blanked' ? $var ? [0,0,0,$c[3]] : \@c : @@ -245,15 +304,18 @@ sub altCE croak "$PACKAGE unknown alternate name: $self->{alternate}"; } -## -## string hex_sortkey = splitCE(string arg) -## sub viewSortKey { my $self = shift; + my $ver = $self->{UCA_Version}; + my $key = $self->getSortKey(@_); my $view = join " ", map sprintf("%04X", $_), unpack 'n*', $key; - $view =~ s/ ?0000 ?/|/g; + if ($ver <= 8) { + $view =~ s/ ?0000 ?/|/g; + } else { + $view =~ s/\b0000\b/|/g; + } return "[$view]"; } @@ -265,10 +327,12 @@ sub splitCE { my $self = shift; my $code = $self->{preprocess}; - my $norm = $self->{UNF}; + my $norm = $self->{normCode}; my $ent = $self->{entries}; my $max = $self->{maxlength}; my $reH = $self->{rearrangeHash}; + my $L3i = $self->{L3ignorable}; + my $ver9 = $self->{UCA_Version} > 8; my $str = ref $code ? &$code(shift) : shift; $str = &$norm($str) if ref $norm; @@ -286,6 +350,10 @@ sub splitCE } } + if ($ver9) { + @src = grep ! $L3i->{$_}, @src; + } + for (my $i = 0; $i < @src; $i++) { my $ch; my $u = $src[$i]; @@ -293,7 +361,10 @@ sub splitCE # non-characters next unless defined $u; next if $u < 0 || 0x10FFFF < $u # out of range - || (0xD800 <= $u && $u <= 0xDFFF); # unpaired surrogates + || (0xD800 <= $u && $u <= 0xDFFF) # unpaired surrogates + || (0xFDD0 <= $u && $u <= 0xFDEF) # non-character + ; + my $four = $u & 0xFFFF; next if $four == 0xFFFE || $four == 0xFFFF; @@ -335,33 +406,38 @@ sub getWt my $ign = $self->{ignored}; my $cjk = $self->{overrideCJK}; my $hang = $self->{overrideHangul}; + my $der = $self->{derivCode}; return if !defined $ch || $ign->{$ch}; # ignored - return @{ $ent->{$ch} } if $ent->{$ch}; + return map($self->altCE($_), @{ $ent->{$ch} }) + if $ent->{$ch}; + my $u = unpack('U', $ch); if (0xAC00 <= $u && $u <= 0xD7A3) { # is_Hangul - return $hang - ? &$hang($u) - : defined $hang - ? map({ - my $v = $_; - my $ar = $ent->{pack('U', $v)}; - $ar ? @$ar : map($self->altCE(0,@$_), _derivCE($v)); - } _decompHangul($u)) - : map($self->altCE(0,@$_), _derivCE($u)); + return map $self->altCE($_), + $hang + ? map(pack(VCE_FORMAT, NON_VAR, @$_), &$hang($u)) + : defined $hang + ? map({ + my $v = $_; + my $vCE = $ent->{pack('U', $v)}; + $vCE ? @$vCE : $der->($v); + } _decompHangul($u)) + : $der->($u); } elsif (0x3400 <= $u && $u <= 0x4DB5 || 0x4E00 <= $u && $u <= 0x9FA5 || - 0x20000 <= $u && $u <= 0x2A6D6) { # is_CJK - return $cjk - ? &$cjk($u) - : defined $cjk && $u <= 0xFFFF - ? $self->altCE(0, ($u, 0x20, 0x02, $u)) - : map($self->altCE(0,@$_), _derivCE($u)); + 0x20000 <= $u && $u <= 0x2A6D6) { # CJK Ideograph + return map $self->altCE($_), + $cjk + ? map(pack(VCE_FORMAT, NON_VAR, @$_), &$cjk($u)) + : defined $cjk && $self->{UCA_Version} <= 8 && $u < 0x10000 + ? pack(VCE_FORMAT, NON_VAR, $u, 0x20, 0x02, $u) + : $der->($u); } else { - return map($self->altCE(0,@$_), _derivCE($u)); + return map $self->altCE($_), $der->($u); } } @@ -398,8 +474,8 @@ sub index while ($i + 1 < @$str && (! defined $str->[$i+1] || $comb->{ $str->[$i+1] }) ) { $i++; - $go_ahead += length $str->[$i]; next if ! defined $str->[$i]; + $go_ahead += length $str->[$i]; push @tmp, grep _ignorableAtLevel($_,$lev), $self->getWt($str->[$i]); } @@ -457,9 +533,27 @@ sub getSortKey my $self = shift; my $lev = $self->{level}; my $rCE = $self->splitCE(shift); # get an arrayref + my $ver9 = $self->{UCA_Version} > 8; + my $sht = $self->{isShift}; # weight arrays - my @buf = grep defined(), map $self->getWt($_), @$rCE; + my (@buf, $last_is_variable); + + foreach my $ce (@$rCE) { + my @t = $self->getWt($ce); + if ($sht && $ver9) { + if (@t == 1 && $t[0][0] == 0) { + if ($t[0][1] == 0 && $t[0][2] == 0) { + $last_is_variable = 1; + } else { + next if $last_is_variable; + } + } else { + $last_is_variable = 0; + } + } + push @buf, @t; + } # make sort key my @ret = ([],[],[],[]); @@ -514,16 +608,38 @@ sub sort { map [ $obj->getSortKey($_), $_ ], @_; } -## -## list[arrayrefs] CE = _derivCE(int codepoint) -## -sub _derivCE { + +sub derivCE_9 { + my $u = shift; + my $base = + (0x4E00 <= $u && $u <= 0x9FA5) # CJK + ? 0xFB40 : + (0x3400 <= $u && $u <= 0x4DB5 || 0x20000 <= $u && $u <= 0x2A6D6) + ? 0xFB80 : 0xFBC0; + + my $aaaa = $base + ($u >> 15); + my $bbbb = ($u & 0x7FFF) | 0x8000; + return + pack(VCE_FORMAT, NON_VAR, $aaaa, Min2, Min3, $u), + pack(VCE_FORMAT, NON_VAR, $bbbb, 0, 0, $u); +} + +sub derivCE_8 { + my $code = shift; + my $aaaa = 0xFF80 + ($code >> 15); + my $bbbb = ($code & 0x7FFF) | 0x8000; + return + pack(VCE_FORMAT, NON_VAR, $aaaa, 2, 1, $code), + pack(VCE_FORMAT, NON_VAR, $bbbb, 0, 0, $code); +} + +sub broken_derivCE { # NG my $code = shift; - my $a = UNDEFINED + ($code >> 15); # ok - my $b = ($code & 0x7FFF) | 0x8000; # ok -# my $a = 0xFFC2 + ($code >> 15); # ng -# my $b = $code & 0x7FFF | 0x1000; # ng - $b ? ([$a,2,1,$code],[$b,0,0,$code]) : [$a,2,1,$code]; + my $aaaa = 0xFFC2 + ($code >> 15); + my $bbbb = $code & 0x7FFF | 0x1000; + return + pack(VCE_FORMAT, NON_VAR, $aaaa, 2, 1, $code), + pack(VCE_FORMAT, NON_VAR, $bbbb, 0, 0, $code); } ## @@ -575,6 +691,7 @@ Unicode::Collate - Unicode Collation Algorithm The C<new> method returns a collator object. $Collator = Unicode::Collate->new( + UCA_Version => $UCA_Version, alternate => $alternate, backwards => $levelNumber, # or \@levelNumbers entry => $element, @@ -597,6 +714,17 @@ The C<new> method returns a collator object. =over 4 +=item UCA_Version + +If the version number of the older UCA is given, +the older behavior of that version is emulated on collating. +If omitted, the return value of C<UCA_Version()> is used. + +The supported version: 8 or 9. + +B<This parameter may be removed in the future version, +as switching the algorithm would affect the performance.> + =item alternate -- see 3.2.2 Alternate Weighting, UTR #10. @@ -772,6 +900,9 @@ If you want to disallow any rearrangement, pass C<undef> or C<[]> (a reference to an empty list) as the value for this key. +B<According to the version 9 of UCA, this parameter shall not be used; +but it is not warned at present.> + =item table -- see 3.2 Default Unicode Collation Element Table, UTR #10. @@ -887,17 +1018,15 @@ and get the result of the comparison of the strings using UCA. =item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)> -Returns a string formalized to display a sort key. -Weights are enclosed with C<'['> and C<']'> -and level boundaries are denoted by C<'|'>. - use Unicode::Collate; my $c = Unicode::Collate->new(); print $c->viewSortKey("Perl"),"\n"; - # output: - # [09B3 08B1 09CB 094F|0020 0020 0020 0020|0008 0002 0002 0002|FFFF FFFF FFFF FFFF] - # Level 1 Level 2 Level 3 Level 4 + # output: + # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF] + # Level 1 Level 2 Level 3 Level 4 + + (If C<UCA_Version> is 8, the output is slightly different.) =item C<$position = $Collator-E<gt>index($string, $substring)> @@ -943,6 +1072,34 @@ is primary equal to C<"m>E<252>C<ss">. =over 4 +=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)> + +Change the value of specified keys and returns the changed part. + + $Collator = Unicode::Collate->new(level => 4); + + $Collator->eq("perl", "PERL"); # false + + %old = $Collator->change(level => 2); # returns (level => 4). + + $Collator->eq("perl", "PERL"); # true + + $Collator->change(%old); # returns (level => 2). + + $Collator->eq("perl", "PERL"); # false + +Not all C<(key,value)>s are allowed to be changed. +See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>. + +In the scalar context, returns the modified collator +(but it is B<not> a clone from the original). + + $Collator->change(level => 2)->eq("perl", "PERL"); # true + + $Collator->eq("perl", "PERL"); # true; now max level is 2nd. + + $Collator->change(level => 4)->eq("perl", "PERL"); # false + =item UCA_Version Returns the version number of Unicode Technical Standard 10 @@ -981,6 +1138,19 @@ assign C<normalization =E<gt> undef> explicitly. -- see 6.5 Avoiding Normalization, UTR #10. +=head2 Conformance Test + +The Conformance Test for the UCA is provided +in L<http://www.unicode.org/reports/tr10/CollationTest.html> +and L<http://www.unicode.org/reports/tr10/CollationTest.zip> + +For F<CollationTest_SHIFTED.txt>, +a collator via C<Unicode::Collate-E<gt>new( )> should be used; +for F<CollationTest_NON_IGNORABLE.txt>, a collator via +C<Unicode::Collate-E<gt>new(alternate =E<gt> "non-ignorable", level =E<gt> 3)>. + +B<Unicode::Normalize is required to try this test.> + =head2 BUGS C<index()> is an experimental method and @@ -1006,19 +1176,24 @@ SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt> =over 4 -=item http://www.unicode.org/unicode/reports/tr10/ +=item http://www.unicode.org/reports/tr10/ Unicode Collation Algorithm - UTR #10 -=item http://www.unicode.org/unicode/reports/tr10/allkeys.txt +=item http://www.unicode.org/reports/tr10/allkeys.txt The Default Unicode Collation Element Table -=item http://www.unicode.org/unicode/reports/tr15/ +=item http://www.unicode.org/reports/tr10/CollationTest.html +http://www.unicode.org/reports/tr10/CollationTest.zip + +The latest versions of the conformance test for the UCA + +=item http://www.unicode.org/reports/tr15/ Unicode Normalization Forms - UAX #15 -=item http://www.unicode.org/unicode/reports/tr18 +=item http://www.unicode.org/reports/tr18 Unicode Regular Expression Guidelines - UTR #18 diff --git a/lib/Unicode/Collate/Changes b/lib/Unicode/Collate/Changes index 997117c670..66676b2feb 100644 --- a/lib/Unicode/Collate/Changes +++ b/lib/Unicode/Collate/Changes @@ -1,5 +1,13 @@ Revision history for Perl extension Unicode::Collate. +0.20 Fri Jul 26 02:15:25 2002 + - now UCA Version 9. + - U+FDD0..U+FDEF are new non-characters. + - fix: whitespace characters before @backwards etc. in a table file. + - now values for 'alternate', 'backwards', etc., + which are explicitly specified via new(), + are preferred to those specified in a table file. + 0.12 Sun May 05 09:43:10 2002 - add new methods, ->UCA_Version and ->Base_Unicode_Version. - test fix: removed the needless requirement of Unicode::Normalize. diff --git a/lib/Unicode/Collate/README b/lib/Unicode/Collate/README index 4d4f12ce97..2867b47c88 100644 --- a/lib/Unicode/Collate/README +++ b/lib/Unicode/Collate/README @@ -1,4 +1,4 @@ -Unicode/Collate version 0.12 +Unicode/Collate version 0.20 =============================== Unicode::Collate - Unicode Collation Algorithm @@ -30,7 +30,7 @@ SYNOPSIS INSTALLATION -Perl 5.006 or later +Perl 5.6.1 or better To install this module type the following: @@ -41,7 +41,7 @@ To install this module type the following: DEPENDENCIES - It's better if you have Unicode::Normalize (v 0.10 or later) + The conformant collation requires Unicode::Normalize (v 0.10 or later) although Unicode::Collate can be used without Unicode::Normalize. COPYRIGHT AND LICENCE diff --git a/lib/Unicode/Collate/t/test.t b/lib/Unicode/Collate/t/test.t index f5a7012ea9..03aed85d91 100644 --- a/lib/Unicode/Collate/t/test.t +++ b/lib/Unicode/Collate/t/test.t @@ -1,7 +1,3 @@ -# Before `make install' is performed this script should be runnable with -# `make test'. After `make install' it should work as `perl test.pl' - -######################### BEGIN { if (ord("A") == 193) { @@ -10,14 +6,22 @@ BEGIN { } } +BEGIN { + if ($ENV{PERL_CORE}) { + chdir('t') if -d 't'; + @INC = qw(../lib); + } +} + use Test; -BEGIN { plan tests => 160 }; +BEGIN { plan tests => 183}; use Unicode::Collate; -ok(1); # If we made it this far, we're ok. ######################### -my $UCA_Version = "8.0"; +ok(1); # If we made it this far, we're ok. + +my $UCA_Version = "9"; ok(Unicode::Collate::UCA_Version, $UCA_Version); ok(Unicode::Collate->UCA_Version, $UCA_Version); @@ -41,14 +45,30 @@ ok( ), ); +ok($Collator->cmp("", ""), 0); +ok($Collator->eq("", "")); +ok($Collator->cmp("", "perl"), -1); + +############## + my $A_acute = pack('U', 0x00C1); +my $a_acute = pack('U', 0x00E1); my $acute = pack('U', 0x0301); ok($Collator->cmp("A$acute", $A_acute), -1); -ok($Collator->cmp("", ""), 0); -ok(! $Collator->ne("", "") ); -ok( $Collator->eq("", "") ); -ok($Collator->cmp("", "perl"), -1); +ok($Collator->cmp($a_acute, $A_acute), -1); + +my %old_level = $Collator->change(level => 1); +ok($Collator->eq("A$acute", $A_acute)); +ok($Collator->eq("A", $A_acute)); + +ok($Collator->change(level => 2)->eq($a_acute, $A_acute)); +ok($Collator->lt("A", $A_acute)); + +ok($Collator->change(%old_level)->lt("A", $A_acute)); +ok($Collator->lt("A", $A_acute)); +ok($Collator->lt("A", $a_acute)); +ok($Collator->lt($a_acute, $A_acute)); ############## @@ -76,6 +96,16 @@ ENTRIES ok($NFD->lt("\x{430}\x{3099}B", "\x{430}\x{308}\x{3099}A")); ok($NFD->eq("\x{0430}\x{3099}\x{309A}\x{0308}", "\x{0430}\x{309A}\x{3099}\x{0308}") ); + + my %old_norm = $NFD->change(normalization => undef); + ok($NFD->lt("A$acute", $A_acute)); + ok($NFD->cmp("A$acute", $A_acute), $Collator->cmp("A$acute", $A_acute)); + + $NFD->change(%old_norm); + ok($NFD->eq("A$acute", $A_acute)); + ok($NFD->change(normalization => undef)->lt("A$acute", $A_acute)); + ok($NFD->change(level => 1)->eq("A$acute", $A_acute)); + } else { ok(1); @@ -83,6 +113,11 @@ else { ok(1); ok(1); ok(1); + ok(1); + ok(1); + ok(1); + ok(1); + ok(1); } ############## @@ -120,9 +155,9 @@ ok($trad->eq($katakana, $hiragana)); ############## -my $old_level = $Collator->{level}; +$Collator->change(level => 2); -$Collator->{level} = 2; +ok($Collator->{level}, 2); ok( $Collator->cmp("ABC","abc"), 0); ok( $Collator->eq("ABC","abc") ); @@ -139,9 +174,9 @@ ok( $Collator->lt("a\x{AC00}b", "a\x{AE00}b") ); ok( $Collator->gt("a\x{D7A3}b", "a\x{C544}b") ); ok( $Collator->lt("a\x{C544}b", "a\x{30A2}b") ); # hangul < hiragana -$Collator->{level} = $old_level; +$Collator->change(%old_level, katakana_before_hiragana => 1); -$Collator->{katakana_before_hiragana} = 1; +ok($Collator->{level}, 4); ok( $Collator->cmp("abc", "ABC"), -1); ok( $Collator->ne("abc", "ABC") ); @@ -152,7 +187,7 @@ ok( $Collator->ne($hiragana, $katakana) ); ok( $Collator->gt($hiragana, $katakana) ); ok( $Collator->ge($hiragana, $katakana) ); -$Collator->{upper_before_lower} = 1; +$Collator->change(upper_before_lower => 1); ok( $Collator->cmp("abc", "ABC"), 1); ok( $Collator->ge("abc", "ABC"), 1); @@ -161,12 +196,12 @@ ok( $Collator->cmp($hiragana, $katakana), 1); ok( $Collator->ge($hiragana, $katakana), 1); ok( $Collator->gt($hiragana, $katakana), 1); -$Collator->{katakana_before_hiragana} = 0; +$Collator->change(katakana_before_hiragana => 0); ok( $Collator->cmp("abc", "ABC"), 1); ok( $Collator->cmp($hiragana, $katakana), -1); -$Collator->{upper_before_lower} = 0; +$Collator->change(upper_before_lower => 0); ok( $Collator->cmp("abc", "ABC"), -1); ok( $Collator->le("abc", "ABC") ); @@ -219,7 +254,7 @@ ok($Collator->lt("lake","like")); ############## -$Collator->{level} = 2; +$Collator->change(level => 2); my $str; @@ -235,7 +270,7 @@ if (my($pos,$len) = $Collator->index($str, $sub)) { ok($str, $ret); -$Collator->{level} = $old_level; +$Collator->change(%old_level); $str = $orig; if (my($pos,$len) = $Collator->index($str, $sub)) { @@ -248,7 +283,7 @@ ok($str, $orig); my $match; -$Collator->{level} = 1; +$Collator->change(level => 1); $str = "Pe\x{300}rl"; $sub = "pe"; @@ -266,11 +301,11 @@ if (my($pos, $len) = $Collator->index($str, $sub)) { } ok($match, "P\x{300}e\x{300}\x{301}\x{303}"); -$Collator->{level} = $old_level; +$Collator->change(%old_level); ############## -$trad->{level} = 1; +%old_level = $trad->change(level => 1); $str = "Ich mu\x{00DF} studieren."; $sub = "m\x{00FC}ss"; @@ -280,7 +315,7 @@ if (my($pos, $len) = $trad->index($str, $sub)) { } ok($match, "mu\x{00DF}"); -$trad->{level} = $old_level; +$trad->change(%old_level); $str = "Ich mu\x{00DF} studieren."; $sub = "m\x{00FC}ss"; @@ -370,21 +405,41 @@ ok($few_entries->eq("\x{AC00}", "\x{1100}\x{1161}")); ############## -my $all_undef = Unicode::Collate->new( +my $all_undef_8 = Unicode::Collate->new( table => undef, normalization => undef, overrideCJK => undef, overrideHangul => undef, + UCA_Version => 8, ); # All in the Unicode code point order. # No hangul decomposition. -ok($all_undef->lt("\x{3042}", "\x{4E00}")); -ok($all_undef->lt("\x{4DFF}", "\x{4E00}")); -ok($all_undef->lt("\x{4E00}", "\x{AC00}")); -ok($all_undef->gt("\x{AC00}", "\x{1100}\x{1161}")); -ok($all_undef->gt("\x{AC00}", "\x{ABFF}")); +ok($all_undef_8->lt("\x{3402}", "\x{4E00}")); +ok($all_undef_8->lt("\x{4DFF}", "\x{4E00}")); +ok($all_undef_8->lt("\x{4E00}", "\x{AC00}")); +ok($all_undef_8->gt("\x{AC00}", "\x{1100}\x{1161}")); +ok($all_undef_8->gt("\x{AC00}", "\x{ABFF}")); + +############## + +my $all_undef_9 = Unicode::Collate->new( + table => undef, + normalization => undef, + overrideCJK => undef, + overrideHangul => undef, + UCA_Version => 9, +); + +# CJK Ideo. < CJK ext A/B < Others. +# No hangul decomposition. + +ok($all_undef_9->lt("\x{4E00}", "\x{3402}")); +ok($all_undef_9->lt("\x{3402}", "\x{20000}")); +ok($all_undef_9->lt("\x{20000}", "\x{AC00}")); +ok($all_undef_9->gt("\x{AC00}", "\x{1100}\x{1161}")); +ok($all_undef_9->gt("\x{AC00}", "\x{ABFF}")); ############## @@ -426,59 +481,41 @@ ok($ignoreHangul->lt("Pe\x{AE00}rl", "Perl")); # 'r' is unassigned. ############## -my $blanked = Unicode::Collate->new( - table => 'keys.txt', - normalization => undef, - alternate => 'Blanked', -); +my %origAlter = $Collator->change(alternate => 'Blanked'); -ok($blanked->lt("death", "de luge")); -ok($blanked->lt("de luge", "de-luge")); -ok($blanked->lt("de-luge", "deluge")); -ok($blanked->lt("deluge", "de\x{2010}luge")); -ok($blanked->lt("deluge", "de Luge")); +ok($Collator->lt("death", "de luge")); +ok($Collator->lt("de luge", "de-luge")); +ok($Collator->lt("de-luge", "deluge")); +ok($Collator->lt("deluge", "de\x{2010}luge")); +ok($Collator->lt("deluge", "de Luge")); -############## - -my $nonIgn = Unicode::Collate->new( - table => 'keys.txt', - normalization => undef, - alternate => 'Non-ignorable', -); +$Collator->change(alternate => 'Non-ignorable'); -ok($nonIgn->lt("de luge", "de Luge")); -ok($nonIgn->lt("de Luge", "de-luge")); -ok($nonIgn->lt("de-Luge", "de\x{2010}luge")); -ok($nonIgn->lt("de-luge", "death")); -ok($nonIgn->lt("death", "deluge")); +ok($Collator->lt("de luge", "de Luge")); +ok($Collator->lt("de Luge", "de-luge")); +ok($Collator->lt("de-Luge", "de\x{2010}luge")); +ok($Collator->lt("de-luge", "death")); +ok($Collator->lt("death", "deluge")); -############## +$Collator->change(alternate => 'Shifted'); -my $shifted = Unicode::Collate->new( - table => 'keys.txt', - normalization => undef, - alternate => 'Shifted', -); +ok($Collator->lt("death", "de luge")); +ok($Collator->lt("de luge", "de-luge")); +ok($Collator->lt("de-luge", "deluge")); +ok($Collator->lt("deluge", "de Luge")); +ok($Collator->lt("de Luge", "deLuge")); -ok($shifted->lt("death", "de luge")); -ok($shifted->lt("de luge", "de-luge")); -ok($shifted->lt("de-luge", "deluge")); -ok($shifted->lt("deluge", "de Luge")); -ok($shifted->lt("de Luge", "deLuge")); +$Collator->change(alternate => 'Shift-Trimmed'); -############## +ok($Collator->lt("death", "deluge")); +ok($Collator->lt("deluge", "de luge")); +ok($Collator->lt("de luge", "de-luge")); +ok($Collator->lt("de-luge", "deLuge")); +ok($Collator->lt("deLuge", "de Luge")); -my $shTrim = Unicode::Collate->new( - table => 'keys.txt', - normalization => undef, - alternate => 'Shift-Trimmed', -); +$Collator->change(%origAlter); -ok($shTrim->lt("death", "deluge")); -ok($shTrim->lt("deluge", "de luge")); -ok($shTrim->lt("de luge", "de-luge")); -ok($shTrim->lt("de-luge", "deLuge")); -ok($shTrim->lt("deLuge", "de Luge")); +ok($Collator->{alternate}, 'shifted'); ############## @@ -504,19 +541,29 @@ ok($overCJK->lt("a\x{4E03}", "A\x{4E00}")); ############## -# rearranged : 0x0E40..0x0E44, 0x0EC0..0x0EC4 +# rearrange : 0x0E40..0x0E44, 0x0EC0..0x0EC4 (default) + +my %old_rearrange = $Collator->change(rearrange => undef); + +ok($Collator->gt("\x{0E41}A", "\x{0E40}B")); +ok($Collator->gt("A\x{0E41}A", "A\x{0E40}B")); + +$Collator->change(rearrange => [ 0x61 ]); # 'a' -ok($Collator->lt("A", "B")); +ok($Collator->gt("ab", "AB")); # as 'ba' > 'AB' + +$Collator->change(%old_rearrange); + +ok($Collator->lt("ab", "AB")); ok($Collator->lt("\x{0E40}", "\x{0E41}")); ok($Collator->lt("\x{0E40}A", "\x{0E41}B")); ok($Collator->lt("\x{0E41}A", "\x{0E40}B")); ok($Collator->lt("A\x{0E41}A", "A\x{0E40}B")); -ok($all_undef->lt("A", "B")); -ok($all_undef->lt("\x{0E40}", "\x{0E41}")); -ok($all_undef->lt("\x{0E40}A", "\x{0E41}B")); -ok($all_undef->lt("\x{0E41}A", "\x{0E40}B")); -ok($all_undef->lt("A\x{0E41}A", "A\x{0E40}B")); +ok($all_undef_8->lt("\x{0E40}", "\x{0E41}")); +ok($all_undef_8->lt("\x{0E40}A", "\x{0E41}B")); +ok($all_undef_8->lt("\x{0E41}A", "\x{0E40}B")); +ok($all_undef_8->lt("A\x{0E41}A", "A\x{0E40}B")); ############## @@ -534,8 +581,6 @@ ok($no_rearrange->gt("A\x{0E41}A", "A\x{0E40}B")); ############## -# equivalent to $no_rearrange - my $undef_rearrange = Unicode::Collate->new( table => undef, normalization => undef, |