1 files changed, 91 insertions, 49 deletions
diff --git a/cpan/Unicode-Collate/Collate.pm b/cpan/Unicode-Collate/Collate.pm
index 5964f83511..9e1623cf4e 100644
--- a/cpan/Unicode-Collate/Collate.pm
+++ b/cpan/Unicode-Collate/Collate.pm
@@ -14,7 +14,7 @@ use File::Spec;
 
 no warnings 'utf8';
 
-our $VERSION = '0.91';
+our $VERSION = '0.92';
 our $PACKAGE = __PACKAGE__;
 
 ### begin XS only ###
@@ -48,16 +48,14 @@ use constant Min3Wt => 0x02;
 use constant Shift4Wt => 0xFFFF;
 
 # A boolean for Variable and 16-bit weights at 4 levels of Collation Element
-# PROBLEM: The Default Unicode Collation Element Table
-# has weights over 0xFFFF at the 4th level.
-# The tie-breaking in the variable weights
-# other than "shift" (as well as "shift-trimmed") is unreliable.
 use constant VCE_TEMPLATE => 'Cn4';
 
 # A sort key: 16-bit weights
-# See also the PROBLEM on VCE_TEMPLATE above.
 use constant KEY_TEMPLATE => 'n*';
 
+# The tie-breaking: 32-bit weights
+use constant TIE_TEMPLATE => 'N*';
+
 # Level separator in a sort key:
 # i.e. pack(KEY_TEMPLATE, 0)
 use constant LEVEL_SEP => "\0\0";
@@ -105,7 +103,7 @@ our @ChangeOK = qw/
     alternate backwards level normalization rearrange
     katakana_before_hiragana upper_before_lower ignore_level2
     overrideHangul overrideCJK preprocess UCA_Version
-    hangul_terminator variable
+    hangul_terminator variable identical
   /;
 
 our @ChangeNG = qw/
@@ -135,18 +133,18 @@ sub change {
     my $self = shift;
     my %hash = @_;
     my %old;
-    if (exists $hash{variable} && exists $hash{alternate}) {
-	delete $hash{alternate};
-    }
-    elsif (!exists $hash{variable} && exists $hash{alternate}) {
-	$hash{variable} = $hash{alternate};
+    if (exists $hash{alternate}) {
+	if (exists $hash{variable}) {
+	    delete $hash{alternate};
+	} else {
+	    $hash{variable} = $hash{alternate};
+	}
     }
     foreach my $k (keys %hash) {
 	if (exists $ChangeOK{$k}) {
 	    $old{$k} = $self->{$k};
 	    $self->{$k} = $hash{$k};
-	}
-	elsif (exists $ChangeNG{$k}) {
+	} elsif (exists $ChangeNG{$k}) {
 	    croak "change of $k via change() is not allowed!";
 	}
 	# else => ignored
@@ -176,6 +174,7 @@ my %DerivCode = (
    20 => \&_derivCE_20,
    22 => \&_derivCE_22,
    24 => \&_derivCE_24,
+   26 => \&_derivCE_24, # 26 == 24
 );
 
 sub checkCollator {
@@ -193,12 +192,10 @@ sub checkCollator {
 
     if (! defined $self->{backwards}) {
 	$self->{backwardsFlag} = 0;
-    }
-    elsif (! ref $self->{backwards}) {
+    } elsif (! ref $self->{backwards}) {
 	_checkLevel($self->{backwards}, "backwards");
 	$self->{backwardsFlag} = 1 << $self->{backwards};
-    }
-    else {
+    } else {
 	my %level;
 	$self->{backwardsFlag} = 0;
 	for my $b (@{ $self->{backwards} }) {
@@ -443,21 +440,33 @@ sub parseEntry
 sub viewSortKey
 {
     my $self = shift;
-    $self->visualizeSortKey($self->getSortKey(@_));
+    my $str  = shift;
+    $self->visualizeSortKey($self->getSortKey($str));
 }
 
 
+sub process
+{
+    my $self = shift;
+    my $str  = shift;
+    my $prep = $self->{preprocess};
+    my $norm = $self->{normCode};
+
+    $str = &$prep($str) if ref $prep;
+    $str = &$norm($str) if ref $norm;
+    return $str;
+}
+
 ##
 ## arrayref of JCPS   = splitEnt(string to be collated)
-## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, true)
+## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, TRUE)
 ##
 sub splitEnt
 {
     my $self = shift;
-    my $wLen = $_[1];
+    my $str  = shift;
+    my $wLen = shift; # with Length
 
-    my $code = $self->{preprocess};
-    my $norm = $self->{normCode};
     my $map  = $self->{mapping};
     my $max  = $self->{maxlength};
     my $reH  = $self->{rearrangeHash};
@@ -465,20 +474,7 @@ sub splitEnt
     my $ver9 = $vers >= 9 && $vers <= 11;
     my $uXS  = $self->{__useXS}; ### XS only
 
-    my ($str, @buf);
-
-    if ($wLen) {
-	$code and croak "Preprocess breaks character positions. "
-			. "Don't use with index(), match(), etc.";
-	$norm and croak "Normalization breaks character positions. "
-			. "Don't use with index(), match(), etc.";
-	$str = $_[0];
-    }
-    else {
-	$str = $_[0];
-	$str = &$code($str) if ref $code;
-	$str = &$norm($str) if ref $norm;
-    }
+    my @buf;
 
     # get array of Unicode code point of string.
     my @src = unpack_U($str);
@@ -696,9 +692,13 @@ sub getWt
 sub getSortKey
 {
     my $self = shift;
-    my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS
+    my $orig = shift;
+    my $str  = $self->process($orig);
+    my $rEnt = $self->splitEnt($str); # get an arrayref of JCPS
     my $vers = $self->{UCA_Version};
     my $term = $self->{hangul_terminator};
+    my $lev  = $self->{level};
+    my $iden = $self->{identical};
 
     my @buf; # weight arrays
     if ($term) {
@@ -723,7 +723,13 @@ sub getSortKey
 	}
     }
 
-    return $self->mk_SortKey(\@buf); ### XS only
+    my $rkey = $self->mk_SortKey(\@buf); ### XS only
+
+    if ($iden || $vers >= 26 && $lev == MaxLevel) {
+	$rkey .= LEVEL_SEP;
+	$rkey .= pack(TIE_TEMPLATE, unpack_U($str)) if $iden;
+    }
+    return $rkey;
 }
 
 
@@ -798,9 +804,15 @@ sub _eqArray($$$)
 sub index
 {
     my $self = shift;
+    $self->{preprocess} and
+	croak "Don't use Preprocess with index(), match(), etc.";
+    $self->{normCode} and
+	croak "Don't use Normalization with index(), match(), etc.";
+
     my $str  = shift;
     my $len  = length($str);
-    my $subE = $self->splitEnt(shift);
+    my $sub  = shift;
+    my $subE = $self->splitEnt($sub);
     my $pos  = @_ ? shift : 0;
        $pos  = 0 if $pos < 0;
     my $glob = shift;
@@ -1034,6 +1046,7 @@ with no parameters, the collator should do the default collation.
       backwards => $levelNumber, # or \@levelNumbers
       entry => $element,
       hangul_terminator => $term_primary_weight,
+      identical => $bool,
       ignoreName => qr/$ignoreName/,
       ignoreChar => qr/$ignoreChar/,
       ignore_level2 => $bool,
@@ -1074,6 +1087,7 @@ The following revisions are supported.  The default is 24.
      20             5.2.0               5.2.0 (5.2.0)
      22             6.0.0               6.0.0 (6.0.0)
      24             6.1.0               6.1.0 (6.1.0)
+     26             6.2.0               6.2.0 (6.2.0)
 
 * Noncharacters (e.g. U+FFFF) are not ignored, and can be overridden
 since C<UCA_Version> 22.
@@ -1099,7 +1113,7 @@ as an alias for C<variable>.
 
 =item backwards
 
--- see 3.1.2 French Accents, UTS #10.
+-- see 3.4 Backward Accents, UTS #10.
 
      backwards => $levelNumber or \@levelNumbers
 
@@ -1109,7 +1123,7 @@ forwards at all the levels.
 
 =item entry
 
--- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10.
+-- see 5 Tailoring; 3.6.1 File Format, UTS #10.
 
 If the same character (or a sequence of characters) exists
 in the collation element table through C<table>,
@@ -1183,11 +1197,27 @@ automatically terminated with a terminator primary weight.
 These characters may need terminator included in a collation element
 table beforehand.
 
+=item identical
+
+-- see A.3 Deterministic Comparison, UTS #10.
+
+By default, strings whose weights are equal should be equal,
+even though their code points are not equal.
+
+If the parameter is made true, a final, tie-breaking level is used.
+If no difference of weights is found after the comparison through all
+the level (independent of the value of C<level>), the comparison with
+code points will be performed. For the tie-breaking comparision,
+the sort key has code points of the original string appended.
+
+If C<preprocess> and/or C<normalization> is applied, the code points
+of the string after them (in NFD by default) are used.
+
 =item ignoreChar
 
 =item ignoreName
 
--- see 3.2.2 Variable Weighting, UTS #10.
+-- see 3.6.2 Variable Weighting, UTS #10.
 
 Makes the entry in the table completely ignorable;
 i.e. as if the weights were zero at all level.
@@ -1214,7 +1244,7 @@ B<NOTE>: C<level> should be 3 or greater.
 
 =item katakana_before_hiragana
 
--- see 7.3.1 Tertiary Weight Table, UTS #10.
+-- see 7.2 Tertiary Weight Table, UTS #10.
 
 By default, hiragana is before katakana.
 If the parameter is made true, this is reversed.
@@ -1241,6 +1271,13 @@ Any higher levels than the specified one are ignored.
 
 If omitted, the maximum is the 4th.
 
+B<NOTE:> The DUCET includes weights over 0xFFFF at the 4th level.
+But this module only uses weights within 0xFFFF.
+When C<variable> is 'blanked' or 'non-ignorable' (other than 'shifted'
+and 'shift-trimmed'), the level 4 may be unreliable.
+
+See also C<identical>.
+
 =item normalization
 
 -- see 4.1 Normalize, UTS #10.
@@ -1295,7 +1332,7 @@ those in the CJK Unified Ideographs Extension A etc.
     U+4E00..U+9FBB if UCA_Version is 14 or 16.
     U+4E00..U+9FC3 if UCA_Version is 18.
     U+4E00..U+9FCB if UCA_Version is 20 or 22.
-    U+4E00..U+9FCC if UCA_Version is 24.
+    U+4E00..U+9FCC if UCA_Version is 24 or 26.
 
     In the CJK Unified Ideographs Extension blocks:
     Ext.A (U+3400..U+4DB5) and Ext.B (U+20000..U+2A6D6) in any UCA_Version.
@@ -1373,7 +1410,7 @@ in C<table> or C<entry> is still valid.
 
 =item preprocess
 
--- see 5.1 Preprocessing, UTS #10.
+-- see 5.4 Preprocessing, UTS #10.
 
 If specified, the coderef is used to preprocess each string
 before the formation of sort keys.
@@ -1402,7 +1439,7 @@ L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
 
 =item rearrange
 
--- see 3.1.3 Rearrangement, UTS #10.
+-- see 3.5 Rearrangement, UTS #10.
 
 Characters that are not coded in logical order and to be rearranged.
 If C<UCA_Version> is equal to or lesser than 11, default is:
@@ -1458,7 +1495,7 @@ B<NOTE>: Contractions via C<entry> are not be suppressed.
 
 =item table
 
--- see 3.2 Default Unicode Collation Element Table, UTS #10.
+-- see 3.6 Default Unicode Collation Element Table, UTS #10.
 
 You can use another collation element table if desired.
 
@@ -1537,7 +1574,7 @@ this parameter doesn't work validly.
 
 =item variable
 
--- see 3.2.2 Variable Weighting, UTS #10.
+-- see 3.6.2 Variable Weighting, UTS #10.
 
 This key allows for variable weighting of variable collation elements,
 which are marked with an ASTERISK in the table
@@ -1861,6 +1898,11 @@ a collator via C<Unicode::Collate-E<gt>new( )> should be used;
 for F<CollationTest_NON_IGNORABLE.txt>, a collator via
 C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
 
+If C<UCA_Version> is 26 or later, the C<identical> level is preferred;
+C<Unicode::Collate-E<gt>new(identical =E<gt> 1)> and
+C<Unicode::Collate-E<gt>new(identical =E<gt> 1,>
+C<variable =E<gt> "non-ignorable", level =E<gt> 3)> should be used.
+
 B<Unicode::Normalize is required to try The Conformance Test.>
 
 =back