summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorJarkko Hietaniemi <jhi@iki.fi>2002-07-25 20:37:16 +0000
committerJarkko Hietaniemi <jhi@iki.fi>2002-07-25 20:37:16 +0000
commit0116f5dc667173b72bd4d2214f20e592d19f1c37 (patch)
tree3afa1e605ff615ecca6bf776f929dca14a96b325 /lib
parentc993f866915d3552dc02138441c792f0dccb48d0 (diff)
downloadperl-0116f5dc667173b72bd4d2214f20e592d19f1c37.tar.gz
Upgrade to Unicode::Collate 0.20.
p4raw-id: //depot/perl@17655
Diffstat (limited to 'lib')
-rw-r--r--lib/Unicode/Collate.pm409
-rw-r--r--lib/Unicode/Collate/Changes8
-rw-r--r--lib/Unicode/Collate/README6
-rw-r--r--lib/Unicode/Collate/t/test.t211
4 files changed, 431 insertions, 203 deletions
diff --git a/lib/Unicode/Collate.pm b/lib/Unicode/Collate.pm
index 51c290ec87..fa0ef225fa 100644
--- a/lib/Unicode/Collate.pm
+++ b/lib/Unicode/Collate.pm
@@ -14,7 +14,7 @@ use File::Spec;
require Exporter;
-our $VERSION = '0.12';
+our $VERSION = '0.20';
our $PACKAGE = __PACKAGE__;
our @ISA = qw(Exporter);
@@ -36,7 +36,6 @@ unless ($@) {
else { # XXX, Perl 5.6.1
my($f, $fh);
foreach my $d (@INC) {
- use File::Spec;
$f = File::Spec->catfile($d, "unicode", "Unicode.301");
if (open($fh, $f)) {
$UNICODE_VERSION = '3.0.1';
@@ -48,53 +47,100 @@ else { # XXX, Perl 5.6.1
our $getCombinClass; # coderef for combining class from Unicode::Normalize
-use constant Min2 => 0x20; # minimum weight at level 2
-use constant Min3 => 0x02; # minimum weight at level 3
-use constant UNDEFINED => 0xFF80; # special value for undefined CE's
+use constant Min2 => 0x20; # minimum weight at level 2
+use constant Min3 => 0x02; # minimum weight at level 3
-our $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
-
-sub UCA_Version { "8.0" }
+# format for pack
+use constant VCE_FORMAT => 'Cn4'; # for variable + CE with 4 levels
-sub Base_Unicode_Version { $UNICODE_VERSION || 'unknown' }
+# values of variable
+use constant NON_VAR => 0; # Non-Variable character
+use constant VAR => 1; # Variable character
-##
-## constructor
-##
-sub new
-{
- my $class = shift;
- my $self = bless { @_ }, $class;
+our $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
- # alternate lowercased
- $self->{alternate} =
- ! exists $self->{alternate} ? 'shifted' : lc($self->{alternate});
+sub UCA_Version { "9" }
- croak "$PACKAGE unknown alternate tag name: $self->{alternate}"
- unless $self->{alternate} eq 'blanked'
- || $self->{alternate} eq 'non-ignorable'
- || $self->{alternate} eq 'shifted'
- || $self->{alternate} eq 'shift-trimmed';
+sub Base_Unicode_Version { $UNICODE_VERSION || 'unknown' }
- # collation level
- $self->{level} ||= 4;
+my (%AlternateOK);
+@AlternateOK{ qw/
+ blanked non-ignorable shifted shift-trimmed
+ / } = ();
+
+our @ChangeOK = qw/
+ alternate backwards level normalization rearrange
+ katakana_before_hiragana upper_before_lower
+ overrideHangul overrideCJK preprocess UCA_Version
+ /;
+
+our @ChangeNG = qw/
+ entry entries table ignored combining maxlength
+ ignoreChar ignoreName undefChar undefName
+ versionTable alternateTable backwardsTable forwardsTable rearrangeTable
+ derivCode normCode rearrangeHash isShift L3ignorable
+ /;
+
+my (%ChangeOK, %ChangeNG);
+@ChangeOK{ @ChangeOK } = ();
+@ChangeNG{ @ChangeNG } = ();
+
+sub change {
+ my $self = shift;
+ my %hash = @_;
+ my %old;
+ foreach my $k (keys %hash) {
+ if (exists $ChangeOK{$k}) {
+ $old{$k} = $self->{$k};
+ $self->{$k} = $hash{$k};
+ }
+ elsif (exists $ChangeNG{$k}) {
+ croak "change of $k via change() is not allowed!";
+ }
+ # else => ignored
+ }
+ $self->checkCollator;
+ return wantarray ? %old : $self;
+}
+sub checkCollator {
+ my $self = shift;
croak "Illegal level lower than 1 (passed $self->{level})."
if $self->{level} < 1;
croak "A level higher than 4 (passed $self->{level}) is not supported."
if 4 < $self->{level};
- # overrideHangul and -CJK
- # If true: CODEREF used; '': default; undef: derived elements
- $self->{overrideHangul} = ''
- if ! exists $self->{overrideHangul};
- $self->{overrideCJK} = ''
- if ! exists $self->{overrideCJK};
+ $self->{derivCode} =
+ $self->{UCA_Version} == -1 ? \&broken_derivCE :
+ $self->{UCA_Version} == 8 ? \&derivCE_8 :
+ $self->{UCA_Version} == 9 ? \&derivCE_9 :
+ croak "Illegal UCA version (passed $self->{UCA_Version}).";
- # normalization form
- $self->{normalization} = 'D'
- if ! exists $self->{normalization};
- $self->{UNF} = undef;
+ $self->{alternate} = lc($self->{alternate});
+ croak "$PACKAGE unknown alternate tag name: $self->{alternate}"
+ unless exists $AlternateOK{ $self->{alternate} };
+
+ $self->{isShift} = $self->{alternate} eq 'shifted' ||
+ $self->{alternate} eq 'shift-trimmed';
+
+ $self->{backwards} = []
+ if ! defined $self->{backwards};
+ $self->{backwards} = [ $self->{backwards} ]
+ if ! ref $self->{backwards};
+
+ $self->{rearrange} = []
+ if ! defined $self->{rearrange};
+ croak "$PACKAGE: A list for rearrangement must be store in an ARRAYREF"
+ if ! ref $self->{rearrange};
+
+ # keys of $self->{rearrangeHash} are $self->{rearrange}.
+ $self->{rearrangeHash} = undef;
+
+ if (@{ $self->{rearrange} }) {
+ @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
+ }
+
+ $self->{normCode} = undef;
if (defined $self->{normalization}) {
eval { require Unicode::Normalize };
@@ -105,7 +151,7 @@ sub new
$getCombinClass = \&Unicode::Normalize::getCombinClass
if ! $getCombinClass;
- $self->{UNF} =
+ $self->{normCode} =
$self->{normalization} =~ /^(?:NF)?C$/ ? \&NFC :
$self->{normalization} =~ /^(?:NF)?D$/ ? \&NFD :
$self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
@@ -113,37 +159,39 @@ sub new
croak "$PACKAGE unknown normalization form name: "
. $self->{normalization};
}
+ return;
+}
+
+sub new
+{
+ my $class = shift;
+ my $self = bless { @_ }, $class;
- # Open a table file.
# If undef is passed explicitly, no file is read.
- $self->{table} = $KeyFile
- if ! exists $self->{table};
- $self->read_table
- if defined $self->{table};
+ $self->{table} = $KeyFile if ! exists $self->{table};
+ $self->read_table if defined $self->{table};
if ($self->{entry}) {
$self->parseEntry($_) foreach split /\n/, $self->{entry};
}
- # backwards
- $self->{backwards} ||= [ ];
- $self->{backwards} = [ $self->{backwards} ]
- if ! ref $self->{backwards};
+ $self->{level} ||= 4;
+ $self->{UCA_Version} ||= UCA_Version();
- # rearrange
- $self->{rearrange} = $DefaultRearrange
+ $self->{overrideHangul} = ''
+ if ! exists $self->{overrideHangul};
+ $self->{overrideCJK} = ''
+ if ! exists $self->{overrideCJK};
+ $self->{normalization} = 'D'
+ if ! exists $self->{normalization};
+ $self->{alternate} = $self->{alternateTable} || 'shifted'
+ if ! exists $self->{alternate};
+ $self->{rearrange} = $self->{rearrangeTable} || $DefaultRearrange
if ! exists $self->{rearrange};
- $self->{rearrange} = []
- if ! defined $self->{rearrange};
- croak "$PACKAGE: A list for rearrangement must be store in an ARRAYREF"
- if ! ref $self->{rearrange};
+ $self->{backwards} = $self->{backwardsTable}
+ if ! exists $self->{backwards};
- # keys of $self->{rearrangeHash} are $self->{rearrange}.
- $self->{rearrangeHash} = undef;
-
- if (@{ $self->{rearrange} }) {
- @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
- }
+ $self->checkCollator;
return $self;
}
@@ -159,17 +207,20 @@ sub read_table {
while (<$fk>) {
next if /^\s*#/;
if (/^\s*\@/) {
- if (/^\@version\s*(\S*)/) {
- $self->{version} ||= $1;
+ if (/^\s*\@version\s*(\S*)/) {
+ $self->{versionTable} ||= $1;
+ }
+ elsif (/^\s*\@alternate\s+(\S*)/) {
+ $self->{alternateTable} ||= $1;
}
- elsif (/^\@alternate\s+(.*)/) {
- $self->{alternate} ||= $1;
+ elsif (/^\s*\@backwards\s+(\S*)/) {
+ push @{ $self->{backwardsTable} }, $1;
}
- elsif (/^\@backwards\s+(.*)/) {
- push @{ $self->{backwards} }, $1;
+ elsif (/^\s*\@forwards\s+(\S*)/) { # parhaps no use
+ push @{ $self->{forwardsTable} }, $1;
}
- elsif (/^\@rearrange\s+(.*)/) {
- push @{ $self->{rearrange} }, _getHexArray($1);
+ elsif (/^\s*\@rearrange\s+(.*)/) { # (\S*) is NG
+ push @{ $self->{rearrangeTable} }, _getHexArray($1);
}
next;
}
@@ -201,6 +252,8 @@ sub parseEntry
if ! $k;
my @e = _getHexArray($e);
+ return if !@e;
+
$ele = pack('U*', @e);
return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
@@ -212,27 +265,33 @@ sub parseEntry
}
else {
my $combining = 1; # primary = 0, secondary != 0;
+ my $level3ingore;
foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
- push @key, $self->altCE($var, _getHexArray($arr));
- $combining = 0 unless $key[-1][0] == 0 && $key[-1][1] != 0;
+ my @arr = _getHexArray($arr);
+ push @key, pack(VCE_FORMAT, $var, @arr);
+ $combining = 0 unless $arr[0] == 0 && $arr[1] != 0;
+ $level3ingore = 1 if $arr[0] == 0 && $arr[1] == 0 && $arr[2] == 0;
}
$self->{entries}{$ele} = \@key;
- $self->{combining}{$ele} = 1 if $combining;
+
+ $self->{combining}{$ele} = 1
+ if $combining;
+
+ $self->{L3ignorable}{$e[0]} = 1
+ if @e == 1 && $level3ingore;
}
$self->{maxlength}{ord $ele} = scalar @e if @e > 1;
}
-
##
## arrayref CE = altCE(bool variable?, list[num] weights)
##
sub altCE
{
my $self = shift;
- my $var = shift;
- my @c = @_;
+ my($var, @c) = unpack(VCE_FORMAT, shift);
$self->{alternate} eq 'blanked' ?
$var ? [0,0,0,$c[3]] : \@c :
@@ -245,15 +304,18 @@ sub altCE
croak "$PACKAGE unknown alternate name: $self->{alternate}";
}
-##
-## string hex_sortkey = splitCE(string arg)
-##
sub viewSortKey
{
my $self = shift;
+ my $ver = $self->{UCA_Version};
+
my $key = $self->getSortKey(@_);
my $view = join " ", map sprintf("%04X", $_), unpack 'n*', $key;
- $view =~ s/ ?0000 ?/|/g;
+ if ($ver <= 8) {
+ $view =~ s/ ?0000 ?/|/g;
+ } else {
+ $view =~ s/\b0000\b/|/g;
+ }
return "[$view]";
}
@@ -265,10 +327,12 @@ sub splitCE
{
my $self = shift;
my $code = $self->{preprocess};
- my $norm = $self->{UNF};
+ my $norm = $self->{normCode};
my $ent = $self->{entries};
my $max = $self->{maxlength};
my $reH = $self->{rearrangeHash};
+ my $L3i = $self->{L3ignorable};
+ my $ver9 = $self->{UCA_Version} > 8;
my $str = ref $code ? &$code(shift) : shift;
$str = &$norm($str) if ref $norm;
@@ -286,6 +350,10 @@ sub splitCE
}
}
+ if ($ver9) {
+ @src = grep ! $L3i->{$_}, @src;
+ }
+
for (my $i = 0; $i < @src; $i++) {
my $ch;
my $u = $src[$i];
@@ -293,7 +361,10 @@ sub splitCE
# non-characters
next unless defined $u;
next if $u < 0 || 0x10FFFF < $u # out of range
- || (0xD800 <= $u && $u <= 0xDFFF); # unpaired surrogates
+ || (0xD800 <= $u && $u <= 0xDFFF) # unpaired surrogates
+ || (0xFDD0 <= $u && $u <= 0xFDEF) # non-character
+ ;
+
my $four = $u & 0xFFFF;
next if $four == 0xFFFE || $four == 0xFFFF;
@@ -335,33 +406,38 @@ sub getWt
my $ign = $self->{ignored};
my $cjk = $self->{overrideCJK};
my $hang = $self->{overrideHangul};
+ my $der = $self->{derivCode};
return if !defined $ch || $ign->{$ch}; # ignored
- return @{ $ent->{$ch} } if $ent->{$ch};
+ return map($self->altCE($_), @{ $ent->{$ch} })
+ if $ent->{$ch};
+
my $u = unpack('U', $ch);
if (0xAC00 <= $u && $u <= 0xD7A3) { # is_Hangul
- return $hang
- ? &$hang($u)
- : defined $hang
- ? map({
- my $v = $_;
- my $ar = $ent->{pack('U', $v)};
- $ar ? @$ar : map($self->altCE(0,@$_), _derivCE($v));
- } _decompHangul($u))
- : map($self->altCE(0,@$_), _derivCE($u));
+ return map $self->altCE($_),
+ $hang
+ ? map(pack(VCE_FORMAT, NON_VAR, @$_), &$hang($u))
+ : defined $hang
+ ? map({
+ my $v = $_;
+ my $vCE = $ent->{pack('U', $v)};
+ $vCE ? @$vCE : $der->($v);
+ } _decompHangul($u))
+ : $der->($u);
}
elsif (0x3400 <= $u && $u <= 0x4DB5 ||
0x4E00 <= $u && $u <= 0x9FA5 ||
- 0x20000 <= $u && $u <= 0x2A6D6) { # is_CJK
- return $cjk
- ? &$cjk($u)
- : defined $cjk && $u <= 0xFFFF
- ? $self->altCE(0, ($u, 0x20, 0x02, $u))
- : map($self->altCE(0,@$_), _derivCE($u));
+ 0x20000 <= $u && $u <= 0x2A6D6) { # CJK Ideograph
+ return map $self->altCE($_),
+ $cjk
+ ? map(pack(VCE_FORMAT, NON_VAR, @$_), &$cjk($u))
+ : defined $cjk && $self->{UCA_Version} <= 8 && $u < 0x10000
+ ? pack(VCE_FORMAT, NON_VAR, $u, 0x20, 0x02, $u)
+ : $der->($u);
}
else {
- return map($self->altCE(0,@$_), _derivCE($u));
+ return map $self->altCE($_), $der->($u);
}
}
@@ -398,8 +474,8 @@ sub index
while ($i + 1 < @$str &&
(! defined $str->[$i+1] || $comb->{ $str->[$i+1] }) ) {
$i++;
- $go_ahead += length $str->[$i];
next if ! defined $str->[$i];
+ $go_ahead += length $str->[$i];
push @tmp,
grep _ignorableAtLevel($_,$lev), $self->getWt($str->[$i]);
}
@@ -457,9 +533,27 @@ sub getSortKey
my $self = shift;
my $lev = $self->{level};
my $rCE = $self->splitCE(shift); # get an arrayref
+ my $ver9 = $self->{UCA_Version} > 8;
+ my $sht = $self->{isShift};
# weight arrays
- my @buf = grep defined(), map $self->getWt($_), @$rCE;
+ my (@buf, $last_is_variable);
+
+ foreach my $ce (@$rCE) {
+ my @t = $self->getWt($ce);
+ if ($sht && $ver9) {
+ if (@t == 1 && $t[0][0] == 0) {
+ if ($t[0][1] == 0 && $t[0][2] == 0) {
+ $last_is_variable = 1;
+ } else {
+ next if $last_is_variable;
+ }
+ } else {
+ $last_is_variable = 0;
+ }
+ }
+ push @buf, @t;
+ }
# make sort key
my @ret = ([],[],[],[]);
@@ -514,16 +608,38 @@ sub sort {
map [ $obj->getSortKey($_), $_ ], @_;
}
-##
-## list[arrayrefs] CE = _derivCE(int codepoint)
-##
-sub _derivCE {
+
+sub derivCE_9 {
+ my $u = shift;
+ my $base =
+ (0x4E00 <= $u && $u <= 0x9FA5) # CJK
+ ? 0xFB40 :
+ (0x3400 <= $u && $u <= 0x4DB5 || 0x20000 <= $u && $u <= 0x2A6D6)
+ ? 0xFB80 : 0xFBC0;
+
+ my $aaaa = $base + ($u >> 15);
+ my $bbbb = ($u & 0x7FFF) | 0x8000;
+ return
+ pack(VCE_FORMAT, NON_VAR, $aaaa, Min2, Min3, $u),
+ pack(VCE_FORMAT, NON_VAR, $bbbb, 0, 0, $u);
+}
+
+sub derivCE_8 {
+ my $code = shift;
+ my $aaaa = 0xFF80 + ($code >> 15);
+ my $bbbb = ($code & 0x7FFF) | 0x8000;
+ return
+ pack(VCE_FORMAT, NON_VAR, $aaaa, 2, 1, $code),
+ pack(VCE_FORMAT, NON_VAR, $bbbb, 0, 0, $code);
+}
+
+sub broken_derivCE { # NG
my $code = shift;
- my $a = UNDEFINED + ($code >> 15); # ok
- my $b = ($code & 0x7FFF) | 0x8000; # ok
-# my $a = 0xFFC2 + ($code >> 15); # ng
-# my $b = $code & 0x7FFF | 0x1000; # ng
- $b ? ([$a,2,1,$code],[$b,0,0,$code]) : [$a,2,1,$code];
+ my $aaaa = 0xFFC2 + ($code >> 15);
+ my $bbbb = $code & 0x7FFF | 0x1000;
+ return
+ pack(VCE_FORMAT, NON_VAR, $aaaa, 2, 1, $code),
+ pack(VCE_FORMAT, NON_VAR, $bbbb, 0, 0, $code);
}
##
@@ -575,6 +691,7 @@ Unicode::Collate - Unicode Collation Algorithm
The C<new> method returns a collator object.
$Collator = Unicode::Collate->new(
+ UCA_Version => $UCA_Version,
alternate => $alternate,
backwards => $levelNumber, # or \@levelNumbers
entry => $element,
@@ -597,6 +714,17 @@ The C<new> method returns a collator object.
=over 4
+=item UCA_Version
+
+If the version number of the older UCA is given,
+the older behavior of that version is emulated on collating.
+If omitted, the return value of C<UCA_Version()> is used.
+
+The supported version: 8 or 9.
+
+B<This parameter may be removed in the future version,
+as switching the algorithm would affect the performance.>
+
=item alternate
-- see 3.2.2 Alternate Weighting, UTR #10.
@@ -772,6 +900,9 @@ If you want to disallow any rearrangement,
pass C<undef> or C<[]> (a reference to an empty list)
as the value for this key.
+B<According to the version 9 of UCA, this parameter shall not be used;
+but it is not warned at present.>
+
=item table
-- see 3.2 Default Unicode Collation Element Table, UTR #10.
@@ -887,17 +1018,15 @@ and get the result of the comparison of the strings using UCA.
=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
-Returns a string formalized to display a sort key.
-Weights are enclosed with C<'['> and C<']'>
-and level boundaries are denoted by C<'|'>.
-
use Unicode::Collate;
my $c = Unicode::Collate->new();
print $c->viewSortKey("Perl"),"\n";
- # output:
- # [09B3 08B1 09CB 094F|0020 0020 0020 0020|0008 0002 0002 0002|FFFF FFFF FFFF FFFF]
- # Level 1 Level 2 Level 3 Level 4
+ # output:
+ # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
+ # Level 1 Level 2 Level 3 Level 4
+
+ (If C<UCA_Version> is 8, the output is slightly different.)
=item C<$position = $Collator-E<gt>index($string, $substring)>
@@ -943,6 +1072,34 @@ is primary equal to C<"m>E<252>C<ss">.
=over 4
+=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
+
+Change the value of specified keys and returns the changed part.
+
+ $Collator = Unicode::Collate->new(level => 4);
+
+ $Collator->eq("perl", "PERL"); # false
+
+ %old = $Collator->change(level => 2); # returns (level => 4).
+
+ $Collator->eq("perl", "PERL"); # true
+
+ $Collator->change(%old); # returns (level => 2).
+
+ $Collator->eq("perl", "PERL"); # false
+
+Not all C<(key,value)>s are allowed to be changed.
+See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
+
+In the scalar context, returns the modified collator
+(but it is B<not> a clone from the original).
+
+ $Collator->change(level => 2)->eq("perl", "PERL"); # true
+
+ $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
+
+ $Collator->change(level => 4)->eq("perl", "PERL"); # false
+
=item UCA_Version
Returns the version number of Unicode Technical Standard 10
@@ -981,6 +1138,19 @@ assign C<normalization =E<gt> undef> explicitly.
-- see 6.5 Avoiding Normalization, UTR #10.
+=head2 Conformance Test
+
+The Conformance Test for the UCA is provided
+in L<http://www.unicode.org/reports/tr10/CollationTest.html>
+and L<http://www.unicode.org/reports/tr10/CollationTest.zip>
+
+For F<CollationTest_SHIFTED.txt>,
+a collator via C<Unicode::Collate-E<gt>new( )> should be used;
+for F<CollationTest_NON_IGNORABLE.txt>, a collator via
+C<Unicode::Collate-E<gt>new(alternate =E<gt> "non-ignorable", level =E<gt> 3)>.
+
+B<Unicode::Normalize is required to try this test.>
+
=head2 BUGS
C<index()> is an experimental method and
@@ -1006,19 +1176,24 @@ SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
=over 4
-=item http://www.unicode.org/unicode/reports/tr10/
+=item http://www.unicode.org/reports/tr10/
Unicode Collation Algorithm - UTR #10
-=item http://www.unicode.org/unicode/reports/tr10/allkeys.txt
+=item http://www.unicode.org/reports/tr10/allkeys.txt
The Default Unicode Collation Element Table
-=item http://www.unicode.org/unicode/reports/tr15/
+=item http://www.unicode.org/reports/tr10/CollationTest.html
+http://www.unicode.org/reports/tr10/CollationTest.zip
+
+The latest versions of the conformance test for the UCA
+
+=item http://www.unicode.org/reports/tr15/
Unicode Normalization Forms - UAX #15
-=item http://www.unicode.org/unicode/reports/tr18
+=item http://www.unicode.org/reports/tr18
Unicode Regular Expression Guidelines - UTR #18
diff --git a/lib/Unicode/Collate/Changes b/lib/Unicode/Collate/Changes
index 997117c670..66676b2feb 100644
--- a/lib/Unicode/Collate/Changes
+++ b/lib/Unicode/Collate/Changes
@@ -1,5 +1,13 @@
Revision history for Perl extension Unicode::Collate.
+0.20 Fri Jul 26 02:15:25 2002
+ - now UCA Version 9.
+ - U+FDD0..U+FDEF are new non-characters.
+ - fix: whitespace characters before @backwards etc. in a table file.
+ - now values for 'alternate', 'backwards', etc.,
+ which are explicitly specified via new(),
+ are preferred to those specified in a table file.
+
0.12 Sun May 05 09:43:10 2002
- add new methods, ->UCA_Version and ->Base_Unicode_Version.
- test fix: removed the needless requirement of Unicode::Normalize.
diff --git a/lib/Unicode/Collate/README b/lib/Unicode/Collate/README
index 4d4f12ce97..2867b47c88 100644
--- a/lib/Unicode/Collate/README
+++ b/lib/Unicode/Collate/README
@@ -1,4 +1,4 @@
-Unicode/Collate version 0.12
+Unicode/Collate version 0.20
===============================
Unicode::Collate - Unicode Collation Algorithm
@@ -30,7 +30,7 @@ SYNOPSIS
INSTALLATION
-Perl 5.006 or later
+Perl 5.6.1 or better
To install this module type the following:
@@ -41,7 +41,7 @@ To install this module type the following:
DEPENDENCIES
- It's better if you have Unicode::Normalize (v 0.10 or later)
+ The conformant collation requires Unicode::Normalize (v 0.10 or later)
although Unicode::Collate can be used without Unicode::Normalize.
COPYRIGHT AND LICENCE
diff --git a/lib/Unicode/Collate/t/test.t b/lib/Unicode/Collate/t/test.t
index f5a7012ea9..03aed85d91 100644
--- a/lib/Unicode/Collate/t/test.t
+++ b/lib/Unicode/Collate/t/test.t
@@ -1,7 +1,3 @@
-# Before `make install' is performed this script should be runnable with
-# `make test'. After `make install' it should work as `perl test.pl'
-
-#########################
BEGIN {
if (ord("A") == 193) {
@@ -10,14 +6,22 @@ BEGIN {
}
}
+BEGIN {
+ if ($ENV{PERL_CORE}) {
+ chdir('t') if -d 't';
+ @INC = qw(../lib);
+ }
+}
+
use Test;
-BEGIN { plan tests => 160 };
+BEGIN { plan tests => 183};
use Unicode::Collate;
-ok(1); # If we made it this far, we're ok.
#########################
-my $UCA_Version = "8.0";
+ok(1); # If we made it this far, we're ok.
+
+my $UCA_Version = "9";
ok(Unicode::Collate::UCA_Version, $UCA_Version);
ok(Unicode::Collate->UCA_Version, $UCA_Version);
@@ -41,14 +45,30 @@ ok(
),
);
+ok($Collator->cmp("", ""), 0);
+ok($Collator->eq("", ""));
+ok($Collator->cmp("", "perl"), -1);
+
+##############
+
my $A_acute = pack('U', 0x00C1);
+my $a_acute = pack('U', 0x00E1);
my $acute = pack('U', 0x0301);
ok($Collator->cmp("A$acute", $A_acute), -1);
-ok($Collator->cmp("", ""), 0);
-ok(! $Collator->ne("", "") );
-ok( $Collator->eq("", "") );
-ok($Collator->cmp("", "perl"), -1);
+ok($Collator->cmp($a_acute, $A_acute), -1);
+
+my %old_level = $Collator->change(level => 1);
+ok($Collator->eq("A$acute", $A_acute));
+ok($Collator->eq("A", $A_acute));
+
+ok($Collator->change(level => 2)->eq($a_acute, $A_acute));
+ok($Collator->lt("A", $A_acute));
+
+ok($Collator->change(%old_level)->lt("A", $A_acute));
+ok($Collator->lt("A", $A_acute));
+ok($Collator->lt("A", $a_acute));
+ok($Collator->lt($a_acute, $A_acute));
##############
@@ -76,6 +96,16 @@ ENTRIES
ok($NFD->lt("\x{430}\x{3099}B", "\x{430}\x{308}\x{3099}A"));
ok($NFD->eq("\x{0430}\x{3099}\x{309A}\x{0308}",
"\x{0430}\x{309A}\x{3099}\x{0308}") );
+
+ my %old_norm = $NFD->change(normalization => undef);
+ ok($NFD->lt("A$acute", $A_acute));
+ ok($NFD->cmp("A$acute", $A_acute), $Collator->cmp("A$acute", $A_acute));
+
+ $NFD->change(%old_norm);
+ ok($NFD->eq("A$acute", $A_acute));
+ ok($NFD->change(normalization => undef)->lt("A$acute", $A_acute));
+ ok($NFD->change(level => 1)->eq("A$acute", $A_acute));
+
}
else {
ok(1);
@@ -83,6 +113,11 @@ else {
ok(1);
ok(1);
ok(1);
+ ok(1);
+ ok(1);
+ ok(1);
+ ok(1);
+ ok(1);
}
##############
@@ -120,9 +155,9 @@ ok($trad->eq($katakana, $hiragana));
##############
-my $old_level = $Collator->{level};
+$Collator->change(level => 2);
-$Collator->{level} = 2;
+ok($Collator->{level}, 2);
ok( $Collator->cmp("ABC","abc"), 0);
ok( $Collator->eq("ABC","abc") );
@@ -139,9 +174,9 @@ ok( $Collator->lt("a\x{AC00}b", "a\x{AE00}b") );
ok( $Collator->gt("a\x{D7A3}b", "a\x{C544}b") );
ok( $Collator->lt("a\x{C544}b", "a\x{30A2}b") ); # hangul < hiragana
-$Collator->{level} = $old_level;
+$Collator->change(%old_level, katakana_before_hiragana => 1);
-$Collator->{katakana_before_hiragana} = 1;
+ok($Collator->{level}, 4);
ok( $Collator->cmp("abc", "ABC"), -1);
ok( $Collator->ne("abc", "ABC") );
@@ -152,7 +187,7 @@ ok( $Collator->ne($hiragana, $katakana) );
ok( $Collator->gt($hiragana, $katakana) );
ok( $Collator->ge($hiragana, $katakana) );
-$Collator->{upper_before_lower} = 1;
+$Collator->change(upper_before_lower => 1);
ok( $Collator->cmp("abc", "ABC"), 1);
ok( $Collator->ge("abc", "ABC"), 1);
@@ -161,12 +196,12 @@ ok( $Collator->cmp($hiragana, $katakana), 1);
ok( $Collator->ge($hiragana, $katakana), 1);
ok( $Collator->gt($hiragana, $katakana), 1);
-$Collator->{katakana_before_hiragana} = 0;
+$Collator->change(katakana_before_hiragana => 0);
ok( $Collator->cmp("abc", "ABC"), 1);
ok( $Collator->cmp($hiragana, $katakana), -1);
-$Collator->{upper_before_lower} = 0;
+$Collator->change(upper_before_lower => 0);
ok( $Collator->cmp("abc", "ABC"), -1);
ok( $Collator->le("abc", "ABC") );
@@ -219,7 +254,7 @@ ok($Collator->lt("lake","like"));
##############
-$Collator->{level} = 2;
+$Collator->change(level => 2);
my $str;
@@ -235,7 +270,7 @@ if (my($pos,$len) = $Collator->index($str, $sub)) {
ok($str, $ret);
-$Collator->{level} = $old_level;
+$Collator->change(%old_level);
$str = $orig;
if (my($pos,$len) = $Collator->index($str, $sub)) {
@@ -248,7 +283,7 @@ ok($str, $orig);
my $match;
-$Collator->{level} = 1;
+$Collator->change(level => 1);
$str = "Pe\x{300}rl";
$sub = "pe";
@@ -266,11 +301,11 @@ if (my($pos, $len) = $Collator->index($str, $sub)) {
}
ok($match, "P\x{300}e\x{300}\x{301}\x{303}");
-$Collator->{level} = $old_level;
+$Collator->change(%old_level);
##############
-$trad->{level} = 1;
+%old_level = $trad->change(level => 1);
$str = "Ich mu\x{00DF} studieren.";
$sub = "m\x{00FC}ss";
@@ -280,7 +315,7 @@ if (my($pos, $len) = $trad->index($str, $sub)) {
}
ok($match, "mu\x{00DF}");
-$trad->{level} = $old_level;
+$trad->change(%old_level);
$str = "Ich mu\x{00DF} studieren.";
$sub = "m\x{00FC}ss";
@@ -370,21 +405,41 @@ ok($few_entries->eq("\x{AC00}", "\x{1100}\x{1161}"));
##############
-my $all_undef = Unicode::Collate->new(
+my $all_undef_8 = Unicode::Collate->new(
table => undef,
normalization => undef,
overrideCJK => undef,
overrideHangul => undef,
+ UCA_Version => 8,
);
# All in the Unicode code point order.
# No hangul decomposition.
-ok($all_undef->lt("\x{3042}", "\x{4E00}"));
-ok($all_undef->lt("\x{4DFF}", "\x{4E00}"));
-ok($all_undef->lt("\x{4E00}", "\x{AC00}"));
-ok($all_undef->gt("\x{AC00}", "\x{1100}\x{1161}"));
-ok($all_undef->gt("\x{AC00}", "\x{ABFF}"));
+ok($all_undef_8->lt("\x{3402}", "\x{4E00}"));
+ok($all_undef_8->lt("\x{4DFF}", "\x{4E00}"));
+ok($all_undef_8->lt("\x{4E00}", "\x{AC00}"));
+ok($all_undef_8->gt("\x{AC00}", "\x{1100}\x{1161}"));
+ok($all_undef_8->gt("\x{AC00}", "\x{ABFF}"));
+
+##############
+
+my $all_undef_9 = Unicode::Collate->new(
+ table => undef,
+ normalization => undef,
+ overrideCJK => undef,
+ overrideHangul => undef,
+ UCA_Version => 9,
+);
+
+# CJK Ideo. < CJK ext A/B < Others.
+# No hangul decomposition.
+
+ok($all_undef_9->lt("\x{4E00}", "\x{3402}"));
+ok($all_undef_9->lt("\x{3402}", "\x{20000}"));
+ok($all_undef_9->lt("\x{20000}", "\x{AC00}"));
+ok($all_undef_9->gt("\x{AC00}", "\x{1100}\x{1161}"));
+ok($all_undef_9->gt("\x{AC00}", "\x{ABFF}"));
##############
@@ -426,59 +481,41 @@ ok($ignoreHangul->lt("Pe\x{AE00}rl", "Perl")); # 'r' is unassigned.
##############
-my $blanked = Unicode::Collate->new(
- table => 'keys.txt',
- normalization => undef,
- alternate => 'Blanked',
-);
+my %origAlter = $Collator->change(alternate => 'Blanked');
-ok($blanked->lt("death", "de luge"));
-ok($blanked->lt("de luge", "de-luge"));
-ok($blanked->lt("de-luge", "deluge"));
-ok($blanked->lt("deluge", "de\x{2010}luge"));
-ok($blanked->lt("deluge", "de Luge"));
+ok($Collator->lt("death", "de luge"));
+ok($Collator->lt("de luge", "de-luge"));
+ok($Collator->lt("de-luge", "deluge"));
+ok($Collator->lt("deluge", "de\x{2010}luge"));
+ok($Collator->lt("deluge", "de Luge"));
-##############
-
-my $nonIgn = Unicode::Collate->new(
- table => 'keys.txt',
- normalization => undef,
- alternate => 'Non-ignorable',
-);
+$Collator->change(alternate => 'Non-ignorable');
-ok($nonIgn->lt("de luge", "de Luge"));
-ok($nonIgn->lt("de Luge", "de-luge"));
-ok($nonIgn->lt("de-Luge", "de\x{2010}luge"));
-ok($nonIgn->lt("de-luge", "death"));
-ok($nonIgn->lt("death", "deluge"));
+ok($Collator->lt("de luge", "de Luge"));
+ok($Collator->lt("de Luge", "de-luge"));
+ok($Collator->lt("de-Luge", "de\x{2010}luge"));
+ok($Collator->lt("de-luge", "death"));
+ok($Collator->lt("death", "deluge"));
-##############
+$Collator->change(alternate => 'Shifted');
-my $shifted = Unicode::Collate->new(
- table => 'keys.txt',
- normalization => undef,
- alternate => 'Shifted',
-);
+ok($Collator->lt("death", "de luge"));
+ok($Collator->lt("de luge", "de-luge"));
+ok($Collator->lt("de-luge", "deluge"));
+ok($Collator->lt("deluge", "de Luge"));
+ok($Collator->lt("de Luge", "deLuge"));
-ok($shifted->lt("death", "de luge"));
-ok($shifted->lt("de luge", "de-luge"));
-ok($shifted->lt("de-luge", "deluge"));
-ok($shifted->lt("deluge", "de Luge"));
-ok($shifted->lt("de Luge", "deLuge"));
+$Collator->change(alternate => 'Shift-Trimmed');
-##############
+ok($Collator->lt("death", "deluge"));
+ok($Collator->lt("deluge", "de luge"));
+ok($Collator->lt("de luge", "de-luge"));
+ok($Collator->lt("de-luge", "deLuge"));
+ok($Collator->lt("deLuge", "de Luge"));
-my $shTrim = Unicode::Collate->new(
- table => 'keys.txt',
- normalization => undef,
- alternate => 'Shift-Trimmed',
-);
+$Collator->change(%origAlter);
-ok($shTrim->lt("death", "deluge"));
-ok($shTrim->lt("deluge", "de luge"));
-ok($shTrim->lt("de luge", "de-luge"));
-ok($shTrim->lt("de-luge", "deLuge"));
-ok($shTrim->lt("deLuge", "de Luge"));
+ok($Collator->{alternate}, 'shifted');
##############
@@ -504,19 +541,29 @@ ok($overCJK->lt("a\x{4E03}", "A\x{4E00}"));
##############
-# rearranged : 0x0E40..0x0E44, 0x0EC0..0x0EC4
+# rearrange : 0x0E40..0x0E44, 0x0EC0..0x0EC4 (default)
+
+my %old_rearrange = $Collator->change(rearrange => undef);
+
+ok($Collator->gt("\x{0E41}A", "\x{0E40}B"));
+ok($Collator->gt("A\x{0E41}A", "A\x{0E40}B"));
+
+$Collator->change(rearrange => [ 0x61 ]); # 'a'
-ok($Collator->lt("A", "B"));
+ok($Collator->gt("ab", "AB")); # as 'ba' > 'AB'
+
+$Collator->change(%old_rearrange);
+
+ok($Collator->lt("ab", "AB"));
ok($Collator->lt("\x{0E40}", "\x{0E41}"));
ok($Collator->lt("\x{0E40}A", "\x{0E41}B"));
ok($Collator->lt("\x{0E41}A", "\x{0E40}B"));
ok($Collator->lt("A\x{0E41}A", "A\x{0E40}B"));
-ok($all_undef->lt("A", "B"));
-ok($all_undef->lt("\x{0E40}", "\x{0E41}"));
-ok($all_undef->lt("\x{0E40}A", "\x{0E41}B"));
-ok($all_undef->lt("\x{0E41}A", "\x{0E40}B"));
-ok($all_undef->lt("A\x{0E41}A", "A\x{0E40}B"));
+ok($all_undef_8->lt("\x{0E40}", "\x{0E41}"));
+ok($all_undef_8->lt("\x{0E40}A", "\x{0E41}B"));
+ok($all_undef_8->lt("\x{0E41}A", "\x{0E40}B"));
+ok($all_undef_8->lt("A\x{0E41}A", "A\x{0E40}B"));
##############
@@ -534,8 +581,6 @@ ok($no_rearrange->gt("A\x{0E41}A", "A\x{0E40}B"));
##############
-# equivalent to $no_rearrange
-
my $undef_rearrange = Unicode::Collate->new(
table => undef,
normalization => undef,