summaryrefslogtreecommitdiff
path: root/cpan/Unicode-Collate/Collate.pm
diff options
context:
space:
mode:
Diffstat (limited to 'cpan/Unicode-Collate/Collate.pm')
-rw-r--r--cpan/Unicode-Collate/Collate.pm140
1 files changed, 91 insertions, 49 deletions
diff --git a/cpan/Unicode-Collate/Collate.pm b/cpan/Unicode-Collate/Collate.pm
index 5964f83511..9e1623cf4e 100644
--- a/cpan/Unicode-Collate/Collate.pm
+++ b/cpan/Unicode-Collate/Collate.pm
@@ -14,7 +14,7 @@ use File::Spec;
no warnings 'utf8';
-our $VERSION = '0.91';
+our $VERSION = '0.92';
our $PACKAGE = __PACKAGE__;
### begin XS only ###
@@ -48,16 +48,14 @@ use constant Min3Wt => 0x02;
use constant Shift4Wt => 0xFFFF;
# A boolean for Variable and 16-bit weights at 4 levels of Collation Element
-# PROBLEM: The Default Unicode Collation Element Table
-# has weights over 0xFFFF at the 4th level.
-# The tie-breaking in the variable weights
-# other than "shift" (as well as "shift-trimmed") is unreliable.
use constant VCE_TEMPLATE => 'Cn4';
# A sort key: 16-bit weights
-# See also the PROBLEM on VCE_TEMPLATE above.
use constant KEY_TEMPLATE => 'n*';
+# The tie-breaking: 32-bit weights
+use constant TIE_TEMPLATE => 'N*';
+
# Level separator in a sort key:
# i.e. pack(KEY_TEMPLATE, 0)
use constant LEVEL_SEP => "\0\0";
@@ -105,7 +103,7 @@ our @ChangeOK = qw/
alternate backwards level normalization rearrange
katakana_before_hiragana upper_before_lower ignore_level2
overrideHangul overrideCJK preprocess UCA_Version
- hangul_terminator variable
+ hangul_terminator variable identical
/;
our @ChangeNG = qw/
@@ -135,18 +133,18 @@ sub change {
my $self = shift;
my %hash = @_;
my %old;
- if (exists $hash{variable} && exists $hash{alternate}) {
- delete $hash{alternate};
- }
- elsif (!exists $hash{variable} && exists $hash{alternate}) {
- $hash{variable} = $hash{alternate};
+ if (exists $hash{alternate}) {
+ if (exists $hash{variable}) {
+ delete $hash{alternate};
+ } else {
+ $hash{variable} = $hash{alternate};
+ }
}
foreach my $k (keys %hash) {
if (exists $ChangeOK{$k}) {
$old{$k} = $self->{$k};
$self->{$k} = $hash{$k};
- }
- elsif (exists $ChangeNG{$k}) {
+ } elsif (exists $ChangeNG{$k}) {
croak "change of $k via change() is not allowed!";
}
# else => ignored
@@ -176,6 +174,7 @@ my %DerivCode = (
20 => \&_derivCE_20,
22 => \&_derivCE_22,
24 => \&_derivCE_24,
+ 26 => \&_derivCE_24, # 26 == 24
);
sub checkCollator {
@@ -193,12 +192,10 @@ sub checkCollator {
if (! defined $self->{backwards}) {
$self->{backwardsFlag} = 0;
- }
- elsif (! ref $self->{backwards}) {
+ } elsif (! ref $self->{backwards}) {
_checkLevel($self->{backwards}, "backwards");
$self->{backwardsFlag} = 1 << $self->{backwards};
- }
- else {
+ } else {
my %level;
$self->{backwardsFlag} = 0;
for my $b (@{ $self->{backwards} }) {
@@ -443,21 +440,33 @@ sub parseEntry
sub viewSortKey
{
my $self = shift;
- $self->visualizeSortKey($self->getSortKey(@_));
+ my $str = shift;
+ $self->visualizeSortKey($self->getSortKey($str));
}
+sub process
+{
+ my $self = shift;
+ my $str = shift;
+ my $prep = $self->{preprocess};
+ my $norm = $self->{normCode};
+
+ $str = &$prep($str) if ref $prep;
+ $str = &$norm($str) if ref $norm;
+ return $str;
+}
+
##
## arrayref of JCPS = splitEnt(string to be collated)
-## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, true)
+## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, TRUE)
##
sub splitEnt
{
my $self = shift;
- my $wLen = $_[1];
+ my $str = shift;
+ my $wLen = shift; # with Length
- my $code = $self->{preprocess};
- my $norm = $self->{normCode};
my $map = $self->{mapping};
my $max = $self->{maxlength};
my $reH = $self->{rearrangeHash};
@@ -465,20 +474,7 @@ sub splitEnt
my $ver9 = $vers >= 9 && $vers <= 11;
my $uXS = $self->{__useXS}; ### XS only
- my ($str, @buf);
-
- if ($wLen) {
- $code and croak "Preprocess breaks character positions. "
- . "Don't use with index(), match(), etc.";
- $norm and croak "Normalization breaks character positions. "
- . "Don't use with index(), match(), etc.";
- $str = $_[0];
- }
- else {
- $str = $_[0];
- $str = &$code($str) if ref $code;
- $str = &$norm($str) if ref $norm;
- }
+ my @buf;
# get array of Unicode code point of string.
my @src = unpack_U($str);
@@ -696,9 +692,13 @@ sub getWt
sub getSortKey
{
my $self = shift;
- my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS
+ my $orig = shift;
+ my $str = $self->process($orig);
+ my $rEnt = $self->splitEnt($str); # get an arrayref of JCPS
my $vers = $self->{UCA_Version};
my $term = $self->{hangul_terminator};
+ my $lev = $self->{level};
+ my $iden = $self->{identical};
my @buf; # weight arrays
if ($term) {
@@ -723,7 +723,13 @@ sub getSortKey
}
}
- return $self->mk_SortKey(\@buf); ### XS only
+ my $rkey = $self->mk_SortKey(\@buf); ### XS only
+
+ if ($iden || $vers >= 26 && $lev == MaxLevel) {
+ $rkey .= LEVEL_SEP;
+ $rkey .= pack(TIE_TEMPLATE, unpack_U($str)) if $iden;
+ }
+ return $rkey;
}
@@ -798,9 +804,15 @@ sub _eqArray($$$)
sub index
{
my $self = shift;
+ $self->{preprocess} and
+ croak "Don't use Preprocess with index(), match(), etc.";
+ $self->{normCode} and
+ croak "Don't use Normalization with index(), match(), etc.";
+
my $str = shift;
my $len = length($str);
- my $subE = $self->splitEnt(shift);
+ my $sub = shift;
+ my $subE = $self->splitEnt($sub);
my $pos = @_ ? shift : 0;
$pos = 0 if $pos < 0;
my $glob = shift;
@@ -1034,6 +1046,7 @@ with no parameters, the collator should do the default collation.
backwards => $levelNumber, # or \@levelNumbers
entry => $element,
hangul_terminator => $term_primary_weight,
+ identical => $bool,
ignoreName => qr/$ignoreName/,
ignoreChar => qr/$ignoreChar/,
ignore_level2 => $bool,
@@ -1074,6 +1087,7 @@ The following revisions are supported. The default is 24.
20 5.2.0 5.2.0 (5.2.0)
22 6.0.0 6.0.0 (6.0.0)
24 6.1.0 6.1.0 (6.1.0)
+ 26 6.2.0 6.2.0 (6.2.0)
* Noncharacters (e.g. U+FFFF) are not ignored, and can be overridden
since C<UCA_Version> 22.
@@ -1099,7 +1113,7 @@ as an alias for C<variable>.
=item backwards
--- see 3.1.2 French Accents, UTS #10.
+-- see 3.4 Backward Accents, UTS #10.
backwards => $levelNumber or \@levelNumbers
@@ -1109,7 +1123,7 @@ forwards at all the levels.
=item entry
--- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10.
+-- see 5 Tailoring; 3.6.1 File Format, UTS #10.
If the same character (or a sequence of characters) exists
in the collation element table through C<table>,
@@ -1183,11 +1197,27 @@ automatically terminated with a terminator primary weight.
These characters may need terminator included in a collation element
table beforehand.
+=item identical
+
+-- see A.3 Deterministic Comparison, UTS #10.
+
+By default, strings whose weights are equal are considered equal,
+even if their code points are not equal.
+
+If the parameter is made true, a final, tie-breaking level is used.
+If no difference of weights is found after the comparison through all
+the levels (independent of the value of C<level>), the comparison with
+code points will be performed. For the tie-breaking comparison,
+the sort key has code points of the original string appended.
+
+If C<preprocess> and/or C<normalization> is applied, the code points
+of the string after them (in NFD by default) are used.
+
=item ignoreChar
=item ignoreName
--- see 3.2.2 Variable Weighting, UTS #10.
+-- see 3.6.2 Variable Weighting, UTS #10.
Makes the entry in the table completely ignorable;
i.e. as if the weights were zero at all levels.
@@ -1214,7 +1244,7 @@ B<NOTE>: C<level> should be 3 or greater.
=item katakana_before_hiragana
--- see 7.3.1 Tertiary Weight Table, UTS #10.
+-- see 7.2 Tertiary Weight Table, UTS #10.
By default, hiragana is before katakana.
If the parameter is made true, this is reversed.
@@ -1241,6 +1271,13 @@ Any higher levels than the specified one are ignored.
If omitted, the maximum is the 4th.
+B<NOTE:> The DUCET includes weights over 0xFFFF at the 4th level.
+But this module only uses weights within 0xFFFF.
+When C<variable> is 'blanked' or 'non-ignorable' (i.e. anything other
+than 'shifted' and 'shift-trimmed'), level 4 may be unreliable.
+
+See also C<identical>.
+
=item normalization
-- see 4.1 Normalize, UTS #10.
@@ -1295,7 +1332,7 @@ those in the CJK Unified Ideographs Extension A etc.
U+4E00..U+9FBB if UCA_Version is 14 or 16.
U+4E00..U+9FC3 if UCA_Version is 18.
U+4E00..U+9FCB if UCA_Version is 20 or 22.
- U+4E00..U+9FCC if UCA_Version is 24.
+ U+4E00..U+9FCC if UCA_Version is 24 or 26.
In the CJK Unified Ideographs Extension blocks:
Ext.A (U+3400..U+4DB5) and Ext.B (U+20000..U+2A6D6) in any UCA_Version.
@@ -1373,7 +1410,7 @@ in C<table> or C<entry> is still valid.
=item preprocess
--- see 5.1 Preprocessing, UTS #10.
+-- see 5.4 Preprocessing, UTS #10.
If specified, the coderef is used to preprocess each string
before the formation of sort keys.
@@ -1402,7 +1439,7 @@ L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
=item rearrange
--- see 3.1.3 Rearrangement, UTS #10.
+-- see 3.5 Rearrangement, UTS #10.
Characters that are not coded in logical order and to be rearranged.
If C<UCA_Version> is less than or equal to 11, default is:
@@ -1458,7 +1495,7 @@ B<NOTE>: Contractions via C<entry> are not be suppressed.
=item table
--- see 3.2 Default Unicode Collation Element Table, UTS #10.
+-- see 3.6 Default Unicode Collation Element Table, UTS #10.
You can use another collation element table if desired.
@@ -1537,7 +1574,7 @@ this parameter doesn't work validly.
=item variable
--- see 3.2.2 Variable Weighting, UTS #10.
+-- see 3.6.2 Variable Weighting, UTS #10.
This key allows for variable weighting of variable collation elements,
which are marked with an ASTERISK in the table
@@ -1861,6 +1898,11 @@ a collator via C<Unicode::Collate-E<gt>new( )> should be used;
for F<CollationTest_NON_IGNORABLE.txt>, a collator via
C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
+If C<UCA_Version> is 26 or later, the C<identical> level is preferred;
+C<Unicode::Collate-E<gt>new(identical =E<gt> 1)> and
+C<Unicode::Collate-E<gt>new(identical =E<gt> 1,>
+C<variable =E<gt> "non-ignorable", level =E<gt> 3)> should be used.
+
B<Unicode::Normalize is required to try The Conformance Test.>
=back