summaryrefslogtreecommitdiff
path: root/cpan/Unicode-Collate
diff options
context:
space:
mode:
authorChris 'BinGOs' Williams <chris@bingosnet.co.uk>2012-11-14 14:46:15 +0000
committerChris 'BinGOs' Williams <chris@bingosnet.co.uk>2012-11-14 14:46:15 +0000
commit750da83817c1e4f89d6495fec7275f6ae3981632 (patch)
tree7aec06cd5ce1d3706fa2fcf19159d2d083e627b9 /cpan/Unicode-Collate
parent265075c0d13cbdf53c9e662decbb83fc34d4e7a4 (diff)
downloadperl-750da83817c1e4f89d6495fec7275f6ae3981632.tar.gz
Update Unicode-Collate to CPAN version 0.92
[DELTA] 0.92 Wed Nov 14 20:58:19 2012 - fix: index() etc. with preprocess/normalization should be always croaked. - doc: referred to the latest UTS #10 and updated its section numbers. - supported the identical level (see 'identical' in POD). - Now UCA_Version 26 (for Unicode 6.2.0) is supported. - added ident.t in t. - modified tests: cjkrange.t, compatui.t, hangtype.t, index.t, overcjk0.t, overcjk1.t, test.t, view.t in t. * But the default UCA_Version is still 24. (In the next release, UCA_Version 26 will be the default.)
Diffstat (limited to 'cpan/Unicode-Collate')
-rw-r--r--cpan/Unicode-Collate/Changes14
-rw-r--r--cpan/Unicode-Collate/Collate.pm140
-rw-r--r--cpan/Unicode-Collate/Collate.xs31
-rw-r--r--cpan/Unicode-Collate/README2
-rw-r--r--cpan/Unicode-Collate/t/cjkrange.t4
-rw-r--r--cpan/Unicode-Collate/t/compatui.t4
-rw-r--r--cpan/Unicode-Collate/t/hangtype.t4
-rw-r--r--cpan/Unicode-Collate/t/ident.t161
-rw-r--r--cpan/Unicode-Collate/t/index.t122
-rw-r--r--cpan/Unicode-Collate/t/overcjk0.t4
-rw-r--r--cpan/Unicode-Collate/t/overcjk1.t4
-rw-r--r--cpan/Unicode-Collate/t/test.t35
-rw-r--r--cpan/Unicode-Collate/t/view.t43
13 files changed, 472 insertions, 96 deletions
diff --git a/cpan/Unicode-Collate/Changes b/cpan/Unicode-Collate/Changes
index 71e8646c26..7225c3bb63 100644
--- a/cpan/Unicode-Collate/Changes
+++ b/cpan/Unicode-Collate/Changes
@@ -1,5 +1,17 @@
Revision history for Perl module Unicode::Collate.
+0.92 Wed Nov 14 20:58:19 2012
+ - fix: index() etc. with preprocess/normalization should be always croaked.
+ - doc: referred to the latest UTS #10 and updated its section numbers.
+ - supported the identical level (see 'identical' in POD).
+ - Now UCA_Version 26 (for Unicode 6.2.0) is supported.
+ - added ident.t in t.
+ - modified tests: cjkrange.t, compatui.t, hangtype.t, index.t,
+ overcjk0.t, overcjk1.t, test.t, view.t in t.
+
+ * But the default UCA_Version is still 24.
+ (In the next release, UCA_Version 26 will be the default.)
+
0.91 Sun Nov 4 17:00:20 2012
- XSUB: use PERL_NO_GET_CONTEXT (see perlguts)
(see [rt.cpan.org #80313])
@@ -310,7 +322,7 @@ Revision history for Perl module Unicode::Collate.
0.29 Mon Oct 13 12:18:23 2003
- now UCA Version 11 (but no functionality is different from Version 9).
- - supported hangul_terminator.
+ - supported 'hangul_terminator'.
- fix: Base_Unicode_Version falsely returns Perl's Unicode version.
C4 in UTS #10 requires UTS's Unicode version.
- For variable weighting, 'variable' is recommended
diff --git a/cpan/Unicode-Collate/Collate.pm b/cpan/Unicode-Collate/Collate.pm
index 5964f83511..9e1623cf4e 100644
--- a/cpan/Unicode-Collate/Collate.pm
+++ b/cpan/Unicode-Collate/Collate.pm
@@ -14,7 +14,7 @@ use File::Spec;
no warnings 'utf8';
-our $VERSION = '0.91';
+our $VERSION = '0.92';
our $PACKAGE = __PACKAGE__;
### begin XS only ###
@@ -48,16 +48,14 @@ use constant Min3Wt => 0x02;
use constant Shift4Wt => 0xFFFF;
# A boolean for Variable and 16-bit weights at 4 levels of Collation Element
-# PROBLEM: The Default Unicode Collation Element Table
-# has weights over 0xFFFF at the 4th level.
-# The tie-breaking in the variable weights
-# other than "shift" (as well as "shift-trimmed") is unreliable.
use constant VCE_TEMPLATE => 'Cn4';
# A sort key: 16-bit weights
-# See also the PROBLEM on VCE_TEMPLATE above.
use constant KEY_TEMPLATE => 'n*';
+# The tie-breaking: 32-bit weights
+use constant TIE_TEMPLATE => 'N*';
+
# Level separator in a sort key:
# i.e. pack(KEY_TEMPLATE, 0)
use constant LEVEL_SEP => "\0\0";
@@ -105,7 +103,7 @@ our @ChangeOK = qw/
alternate backwards level normalization rearrange
katakana_before_hiragana upper_before_lower ignore_level2
overrideHangul overrideCJK preprocess UCA_Version
- hangul_terminator variable
+ hangul_terminator variable identical
/;
our @ChangeNG = qw/
@@ -135,18 +133,18 @@ sub change {
my $self = shift;
my %hash = @_;
my %old;
- if (exists $hash{variable} && exists $hash{alternate}) {
- delete $hash{alternate};
- }
- elsif (!exists $hash{variable} && exists $hash{alternate}) {
- $hash{variable} = $hash{alternate};
+ if (exists $hash{alternate}) {
+ if (exists $hash{variable}) {
+ delete $hash{alternate};
+ } else {
+ $hash{variable} = $hash{alternate};
+ }
}
foreach my $k (keys %hash) {
if (exists $ChangeOK{$k}) {
$old{$k} = $self->{$k};
$self->{$k} = $hash{$k};
- }
- elsif (exists $ChangeNG{$k}) {
+ } elsif (exists $ChangeNG{$k}) {
croak "change of $k via change() is not allowed!";
}
# else => ignored
@@ -176,6 +174,7 @@ my %DerivCode = (
20 => \&_derivCE_20,
22 => \&_derivCE_22,
24 => \&_derivCE_24,
+ 26 => \&_derivCE_24, # 26 == 24
);
sub checkCollator {
@@ -193,12 +192,10 @@ sub checkCollator {
if (! defined $self->{backwards}) {
$self->{backwardsFlag} = 0;
- }
- elsif (! ref $self->{backwards}) {
+ } elsif (! ref $self->{backwards}) {
_checkLevel($self->{backwards}, "backwards");
$self->{backwardsFlag} = 1 << $self->{backwards};
- }
- else {
+ } else {
my %level;
$self->{backwardsFlag} = 0;
for my $b (@{ $self->{backwards} }) {
@@ -443,21 +440,33 @@ sub parseEntry
sub viewSortKey
{
my $self = shift;
- $self->visualizeSortKey($self->getSortKey(@_));
+ my $str = shift;
+ $self->visualizeSortKey($self->getSortKey($str));
}
+sub process
+{
+ my $self = shift;
+ my $str = shift;
+ my $prep = $self->{preprocess};
+ my $norm = $self->{normCode};
+
+ $str = &$prep($str) if ref $prep;
+ $str = &$norm($str) if ref $norm;
+ return $str;
+}
+
##
## arrayref of JCPS = splitEnt(string to be collated)
-## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, true)
+## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, TRUE)
##
sub splitEnt
{
my $self = shift;
- my $wLen = $_[1];
+ my $str = shift;
+ my $wLen = shift; # with Length
- my $code = $self->{preprocess};
- my $norm = $self->{normCode};
my $map = $self->{mapping};
my $max = $self->{maxlength};
my $reH = $self->{rearrangeHash};
@@ -465,20 +474,7 @@ sub splitEnt
my $ver9 = $vers >= 9 && $vers <= 11;
my $uXS = $self->{__useXS}; ### XS only
- my ($str, @buf);
-
- if ($wLen) {
- $code and croak "Preprocess breaks character positions. "
- . "Don't use with index(), match(), etc.";
- $norm and croak "Normalization breaks character positions. "
- . "Don't use with index(), match(), etc.";
- $str = $_[0];
- }
- else {
- $str = $_[0];
- $str = &$code($str) if ref $code;
- $str = &$norm($str) if ref $norm;
- }
+ my @buf;
# get array of Unicode code point of string.
my @src = unpack_U($str);
@@ -696,9 +692,13 @@ sub getWt
sub getSortKey
{
my $self = shift;
- my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS
+ my $orig = shift;
+ my $str = $self->process($orig);
+ my $rEnt = $self->splitEnt($str); # get an arrayref of JCPS
my $vers = $self->{UCA_Version};
my $term = $self->{hangul_terminator};
+ my $lev = $self->{level};
+ my $iden = $self->{identical};
my @buf; # weight arrays
if ($term) {
@@ -723,7 +723,13 @@ sub getSortKey
}
}
- return $self->mk_SortKey(\@buf); ### XS only
+ my $rkey = $self->mk_SortKey(\@buf); ### XS only
+
+ if ($iden || $vers >= 26 && $lev == MaxLevel) {
+ $rkey .= LEVEL_SEP;
+ $rkey .= pack(TIE_TEMPLATE, unpack_U($str)) if $iden;
+ }
+ return $rkey;
}
@@ -798,9 +804,15 @@ sub _eqArray($$$)
sub index
{
my $self = shift;
+ $self->{preprocess} and
+ croak "Don't use Preprocess with index(), match(), etc.";
+ $self->{normCode} and
+ croak "Don't use Normalization with index(), match(), etc.";
+
my $str = shift;
my $len = length($str);
- my $subE = $self->splitEnt(shift);
+ my $sub = shift;
+ my $subE = $self->splitEnt($sub);
my $pos = @_ ? shift : 0;
$pos = 0 if $pos < 0;
my $glob = shift;
@@ -1034,6 +1046,7 @@ with no parameters, the collator should do the default collation.
backwards => $levelNumber, # or \@levelNumbers
entry => $element,
hangul_terminator => $term_primary_weight,
+ identical => $bool,
ignoreName => qr/$ignoreName/,
ignoreChar => qr/$ignoreChar/,
ignore_level2 => $bool,
@@ -1074,6 +1087,7 @@ The following revisions are supported. The default is 24.
20 5.2.0 5.2.0 (5.2.0)
22 6.0.0 6.0.0 (6.0.0)
24 6.1.0 6.1.0 (6.1.0)
+ 26 6.2.0 6.2.0 (6.2.0)
* Noncharacters (e.g. U+FFFF) are not ignored, and can be overridden
since C<UCA_Version> 22.
@@ -1099,7 +1113,7 @@ as an alias for C<variable>.
=item backwards
--- see 3.1.2 French Accents, UTS #10.
+-- see 3.4 Backward Accents, UTS #10.
backwards => $levelNumber or \@levelNumbers
@@ -1109,7 +1123,7 @@ forwards at all the levels.
=item entry
--- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10.
+-- see 5 Tailoring; 3.6.1 File Format, UTS #10.
If the same character (or a sequence of characters) exists
in the collation element table through C<table>,
@@ -1183,11 +1197,27 @@ automatically terminated with a terminator primary weight.
These characters may need terminator included in a collation element
table beforehand.
+=item identical
+
+-- see A.3 Deterministic Comparison, UTS #10.
+
+By default, strings whose weights are equal should be equal,
+even though their code points are not equal.
+
+If the parameter is made true, a final, tie-breaking level is used.
+If no difference of weights is found after the comparison through all
+the level (independent of the value of C<level>), the comparison with
+code points will be performed. For the tie-breaking comparision,
+the sort key has code points of the original string appended.
+
+If C<preprocess> and/or C<normalization> is applied, the code points
+of the string after them (in NFD by default) are used.
+
=item ignoreChar
=item ignoreName
--- see 3.2.2 Variable Weighting, UTS #10.
+-- see 3.6.2 Variable Weighting, UTS #10.
Makes the entry in the table completely ignorable;
i.e. as if the weights were zero at all level.
@@ -1214,7 +1244,7 @@ B<NOTE>: C<level> should be 3 or greater.
=item katakana_before_hiragana
--- see 7.3.1 Tertiary Weight Table, UTS #10.
+-- see 7.2 Tertiary Weight Table, UTS #10.
By default, hiragana is before katakana.
If the parameter is made true, this is reversed.
@@ -1241,6 +1271,13 @@ Any higher levels than the specified one are ignored.
If omitted, the maximum is the 4th.
+B<NOTE:> The DUCET includes weights over 0xFFFF at the 4th level.
+But this module only uses weights within 0xFFFF.
+When C<variable> is 'blanked' or 'non-ignorable' (other than 'shifted'
+and 'shift-trimmed'), the level 4 may be unreliable.
+
+See also C<identical>.
+
=item normalization
-- see 4.1 Normalize, UTS #10.
@@ -1295,7 +1332,7 @@ those in the CJK Unified Ideographs Extension A etc.
U+4E00..U+9FBB if UCA_Version is 14 or 16.
U+4E00..U+9FC3 if UCA_Version is 18.
U+4E00..U+9FCB if UCA_Version is 20 or 22.
- U+4E00..U+9FCC if UCA_Version is 24.
+ U+4E00..U+9FCC if UCA_Version is 24 or 26.
In the CJK Unified Ideographs Extension blocks:
Ext.A (U+3400..U+4DB5) and Ext.B (U+20000..U+2A6D6) in any UCA_Version.
@@ -1373,7 +1410,7 @@ in C<table> or C<entry> is still valid.
=item preprocess
--- see 5.1 Preprocessing, UTS #10.
+-- see 5.4 Preprocessing, UTS #10.
If specified, the coderef is used to preprocess each string
before the formation of sort keys.
@@ -1402,7 +1439,7 @@ L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
=item rearrange
--- see 3.1.3 Rearrangement, UTS #10.
+-- see 3.5 Rearrangement, UTS #10.
Characters that are not coded in logical order and to be rearranged.
If C<UCA_Version> is equal to or lesser than 11, default is:
@@ -1458,7 +1495,7 @@ B<NOTE>: Contractions via C<entry> are not be suppressed.
=item table
--- see 3.2 Default Unicode Collation Element Table, UTS #10.
+-- see 3.6 Default Unicode Collation Element Table, UTS #10.
You can use another collation element table if desired.
@@ -1537,7 +1574,7 @@ this parameter doesn't work validly.
=item variable
--- see 3.2.2 Variable Weighting, UTS #10.
+-- see 3.6.2 Variable Weighting, UTS #10.
This key allows for variable weighting of variable collation elements,
which are marked with an ASTERISK in the table
@@ -1861,6 +1898,11 @@ a collator via C<Unicode::Collate-E<gt>new( )> should be used;
for F<CollationTest_NON_IGNORABLE.txt>, a collator via
C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
+If C<UCA_Version> is 26 or later, the C<identical> level is preferred;
+C<Unicode::Collate-E<gt>new(identical =E<gt> 1)> and
+C<Unicode::Collate-E<gt>new(identical =E<gt> 1,>
+C<variable =E<gt> "non-ignorable", level =E<gt> 3)> should be used.
+
B<Unicode::Normalize is required to try The Conformance Test.>
=back
diff --git a/cpan/Unicode-Collate/Collate.xs b/cpan/Unicode-Collate/Collate.xs
index 94ff6e41f6..4d4ecca3ee 100644
--- a/cpan/Unicode-Collate/Collate.xs
+++ b/cpan/Unicode-Collate/Collate.xs
@@ -617,10 +617,14 @@ varCE (self, vce)
else if (*a == 's') { /* shifted or shift-trimmed */
totwt = d[1] + d[2] + d[3] + d[4] + d[5] + d[6];
if (alen == 7 && totwt != 0) { /* shifted */
- d[7] = (U8)(Shift4Wt >> 8);
- d[8] = (U8)(Shift4Wt & 0xFF);
- }
- else { /* shift-trimmed */
+ if (d[1] == 0 && d[2] == 1) { /* XXX: CollationAuxiliary-6.2.0 */
+ d[7] = d[1]; /* wt level 1 to 4 */
+ d[8] = d[2];
+ } else {
+ d[7] = (U8)(Shift4Wt >> 8);
+ d[8] = (U8)(Shift4Wt & 0xFF);
+ }
+ } else { /* shift-trimmed */
d[7] = d[8] = '\0';
}
}
@@ -642,7 +646,7 @@ visualizeSortKey (self, key)
U8 *s, *e, *d;
STRLEN klen, dlen;
UV uv;
- IV uca_vers;
+ IV uca_vers, sep = 0;
static const char *upperhex = "0123456789ABCDEF";
CODE:
if (SvROK(self) && SvTYPE(SvRV(self)) == SVt_PVHV)
@@ -658,10 +662,13 @@ visualizeSortKey (self, key)
s = (U8*)SvPV(key, klen);
/* slightly *longer* than the need, but I'm afraid of miscounting;
- exactly: (klen / 2) * 5 + MaxLevel * 2 - 1 (excluding '\0')
- = (klen / 2) * 5 - 1 # FFFF (16bit) and ' ' between 16bit units
- + (MaxLevel - 1) * 2 # ' ' and '|' for level boundaries
- + 2 # '[' and ']'
+ = (klen / 2) * 5 - 1
+ # FFFF and ' ' for each 16bit units but ' ' is less by 1;
+ # ' ' and '|' for level boundaries including the identical level
+ + 2 # '[' and ']'
+ + 1 # '\0'
+ (a) if klen is odd (not expected), maybe more 5 bytes.
+ (b) there is not always the identical level.
*/
dlen = (klen / 2) * 5 + MaxLevel * 2 + 2;
dst = newSV(dlen);
@@ -671,18 +678,18 @@ visualizeSortKey (self, key)
*d++ = '[';
for (e = s + klen; s < e; s += 2) {
uv = (U16)(*s << 8 | s[1]);
- if (uv) {
+ if (uv || sep >= MaxLevel) {
if ((d[-1] != '[') && ((9 <= uca_vers) || (d[-1] != '|')))
*d++ = ' ';
*d++ = upperhex[ (s[0] >> 4) & 0xF ];
*d++ = upperhex[ s[0] & 0xF ];
*d++ = upperhex[ (s[1] >> 4) & 0xF ];
*d++ = upperhex[ s[1] & 0xF ];
- }
- else {
+ } else {
if ((9 <= uca_vers) && (d[-1] != '['))
*d++ = ' ';
*d++ = '|';
+ ++sep;
}
}
*d++ = ']';
diff --git a/cpan/Unicode-Collate/README b/cpan/Unicode-Collate/README
index 5947585c0a..26e8ff5e6b 100644
--- a/cpan/Unicode-Collate/README
+++ b/cpan/Unicode-Collate/README
@@ -1,4 +1,4 @@
-Unicode/Collate version 0.91
+Unicode/Collate version 0.92
===============================
NAME
diff --git a/cpan/Unicode-Collate/t/cjkrange.t b/cpan/Unicode-Collate/t/cjkrange.t
index 37fb9fd279..e3d4f38d73 100644
--- a/cpan/Unicode-Collate/t/cjkrange.t
+++ b/cpan/Unicode-Collate/t/cjkrange.t
@@ -13,7 +13,7 @@ BEGIN {
use strict;
use warnings;
-BEGIN { $| = 1; print "1..379\n"; } # 1 + 42 x @Versions
+BEGIN { $| = 1; print "1..421\n"; } # 1 + 42 x @Versions
my $count = 0;
sub ok ($;$) {
my $p = my $r = shift;
@@ -49,7 +49,7 @@ my $coll = Unicode::Collate->new(
# 2A700..2B734 are CJK UI Ext.C since UCA_Version 20 (Unicode 5.2).
# 2B740..2B81D are CJK UI Ext.D since UCA_Version 22 (Unicode 6.0).
-my @Versions = (8, 9, 11, 14, 16, 18, 20, 22, 24);
+my @Versions = (8, 9, 11, 14, 16, 18, 20, 22, 24, 26);
for my $v (@Versions) {
$coll->change(UCA_Version => $v);
diff --git a/cpan/Unicode-Collate/t/compatui.t b/cpan/Unicode-Collate/t/compatui.t
index 6fb01b892c..822743ed8f 100644
--- a/cpan/Unicode-Collate/t/compatui.t
+++ b/cpan/Unicode-Collate/t/compatui.t
@@ -13,7 +13,7 @@ BEGIN {
use strict;
use warnings;
-BEGIN { $| = 1; print "1..631\n"; } # 1 + 70 x @Versions
+BEGIN { $| = 1; print "1..701\n"; } # 1 + 70 x @Versions
my $count = 0;
sub ok ($;$) {
my $p = my $r = shift;
@@ -30,7 +30,7 @@ ok(1);
#########################
-my @Versions = (8, 9, 11, 14, 16, 18, 20, 22, 24);
+my @Versions = (8, 9, 11, 14, 16, 18, 20, 22, 24, 26);
# 12 compatibility ideographs are treated as unified ideographs:
# FA0E, FA0F, FA11, FA13, FA14, FA1F, FA21, FA23, FA24, FA27, FA28, FA29.
diff --git a/cpan/Unicode-Collate/t/hangtype.t b/cpan/Unicode-Collate/t/hangtype.t
index b85a308f12..5aa7d49153 100644
--- a/cpan/Unicode-Collate/t/hangtype.t
+++ b/cpan/Unicode-Collate/t/hangtype.t
@@ -13,7 +13,7 @@ BEGIN {
use strict;
use warnings;
-BEGIN { $| = 1; print "1..451\n"; } # 1 + 50 x @Versions
+BEGIN { $| = 1; print "1..501\n"; } # 1 + 50 x @Versions
my $count = 0;
sub ok ($;$) {
my $p = my $r = shift;
@@ -30,7 +30,7 @@ ok(1);
#########################
-my @Versions = (8, 9, 11, 14, 16, 18, 20, 22, 24);
+my @Versions = (8, 9, 11, 14, 16, 18, 20, 22, 24, 26);
for my $v (@Versions) {
ok(Unicode::Collate::getHST(0x0000, $v), '');
diff --git a/cpan/Unicode-Collate/t/ident.t b/cpan/Unicode-Collate/t/ident.t
new file mode 100644
index 0000000000..4f132d4ec1
--- /dev/null
+++ b/cpan/Unicode-Collate/t/ident.t
@@ -0,0 +1,161 @@
+
+BEGIN {
+ unless ("A" eq pack('U', 0x41)) {
+ print "1..0 # Unicode::Collate " .
+ "cannot stringify a Unicode code point\n";
+ exit 0;
+ }
+ if ($ENV{PERL_CORE}) {
+ chdir('t') if -d 't';
+ @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib);
+ }
+}
+
+use strict;
+use warnings;
+BEGIN { $| = 1; print "1..45\n"; }
+my $count = 0;
+sub ok ($;$) {
+ my $p = my $r = shift;
+ if (@_) {
+ my $x = shift;
+ $p = !defined $x ? !defined $r : !defined $r ? 0 : $r eq $x;
+ }
+ print $p ? "ok" : "not ok", ' ', ++$count, "\n";
+}
+
+use Unicode::Collate;
+
+ok(1);
+
+#########################
+
+my $Collator = Unicode::Collate->new(
+ table => 'keys.txt',
+ normalization => undef,
+);
+
+# [001F] UNIT SEPARATOR
+{
+ ok($Collator->eq("\0", "\x1F"));
+ ok($Collator->eq("\x1F", "\x{200B}"));
+ ok($Collator->eq("\0", "\x{200B}"));
+ ok($Collator->eq("\x{313}", "\x{343}"));
+ ok($Collator->eq("\x{2000}", "\x{2001}"));
+ ok($Collator->eq("\x{200B}", "\x{200C}"));
+ ok($Collator->eq("\x{304C}", "\x{304B}\x{3099}"));
+
+ $Collator->change(identical => 1);
+
+ ok($Collator->lt("\0", "\x1F"));
+ ok($Collator->lt("\x1F", "\x{200B}"));
+ ok($Collator->lt("\0", "\x{200B}"));
+ ok($Collator->lt("\x{313}", "\x{343}"));
+ ok($Collator->lt("\x{2000}", "\x{2001}"));
+ ok($Collator->lt("\x{200B}", "\x{200C}"));
+ ok($Collator->gt("\x{304C}", "\x{304B}\x{3099}"));
+
+ $Collator->change(identical => 0);
+
+ ok($Collator->eq("\0", "\x1F"));
+ ok($Collator->eq("\x1F", "\x{200B}"));
+ ok($Collator->eq("\0", "\x{200B}"));
+ ok($Collator->eq("\x{313}", "\x{343}"));
+ ok($Collator->eq("\x{2000}", "\x{2001}"));
+ ok($Collator->eq("\x{200B}", "\x{200C}"));
+ ok($Collator->eq("\x{304C}", "\x{304B}\x{3099}"));
+}
+
+#### 22
+
+eval { require Unicode::Normalize };
+if (!$@) {
+ $Collator->change(normalization => "NFD");
+
+ $Collator->change(identical => 1);
+
+ ok($Collator->lt("\0", "\x{200B}"));
+ ok($Collator->eq("\x{313}", "\x{343}"));
+ ok($Collator->lt("\x{2000}", "\x{2001}"));
+ ok($Collator->lt("\x{200B}", "\x{200C}"));
+ ok($Collator->eq("\x{304C}", "\x{304B}\x{3099}"));
+
+ $Collator->change(identical => 0);
+
+ ok($Collator->eq("\0", "\x{200B}"));
+ ok($Collator->eq("\x{313}", "\x{343}"));
+ ok($Collator->eq("\x{2000}", "\x{2001}"));
+ ok($Collator->eq("\x{200B}", "\x{200C}"));
+ ok($Collator->eq("\x{304C}", "\x{304B}\x{3099}"));
+} else {
+ ok(1) for 1..10;
+}
+
+$Collator->change(normalization => undef, identical => 1);
+
+##### 32
+
+ok($Collator->viewSortKey("\0"), '[| | | | 0000 0000]');
+ok($Collator->viewSortKey("\x{200B}"), '[| | | | 0000 200B]');
+
+ok($Collator->viewSortKey('a'),
+ '[0A15 | 0020 | 0002 | FFFF | 0000 0061]');
+
+ok($Collator->viewSortKey("\x{304C}"),
+ '[1926 | 0020 013D | 000E 0002 | FFFF FFFF | 0000 304C]');
+
+ok($Collator->viewSortKey("\x{100000}"),
+ '[FBE0 8000 | 0020 | 0002 | FFFF FFFF | 0010 0000]');
+
+eval { require Unicode::Normalize };
+if (!$@) {
+ $Collator->change(normalization => "NFD");
+
+ ok($Collator->viewSortKey("\x{304C}"),
+ '[1926 | 0020 013D | 000E 0002 | FFFF FFFF | 0000 304B 0000 3099]');
+} else {
+ ok(1);
+}
+
+$Collator->change(normalization => undef);
+
+##### 38
+
+$Collator->change(level => 3);
+
+ok($Collator->viewSortKey("\x{304C}"),
+ '[1926 | 0020 013D | 000E 0002 | | 0000 304C]');
+
+$Collator->change(level => 2);
+
+ok($Collator->viewSortKey("\x{304C}"),
+ '[1926 | 0020 013D | | | 0000 304C]');
+
+$Collator->change(level => 1);
+
+ok($Collator->viewSortKey("\x{304C}"),
+ '[1926 | | | | 0000 304C]');
+
+##### 41
+
+$Collator->change(UCA_Version => 8);
+
+ok($Collator->viewSortKey("\x{304C}"),
+ '[1926||||0000 304C]');
+
+$Collator->change(level => 2);
+
+ok($Collator->viewSortKey("\x{304C}"),
+ '[1926|0020 013D|||0000 304C]');
+
+$Collator->change(level => 3);
+
+ok($Collator->viewSortKey("\x{304C}"),
+ '[1926|0020 013D|000E 0002||0000 304C]');
+
+$Collator->change(level => 4);
+
+ok($Collator->viewSortKey("\x{304C}"),
+ '[1926|0020 013D|000E 0002|FFFF FFFF|0000 304C]');
+
+##### 45
diff --git a/cpan/Unicode-Collate/t/index.t b/cpan/Unicode-Collate/t/index.t
index b3433a9e5f..11cf618f5a 100644
--- a/cpan/Unicode-Collate/t/index.t
+++ b/cpan/Unicode-Collate/t/index.t
@@ -13,7 +13,7 @@ BEGIN {
use strict;
use warnings;
-BEGIN { $| = 1; print "1..71\n"; }
+BEGIN { $| = 1; print "1..91\n"; }
my $count = 0;
sub ok ($;$) {
my $p = my $r = shift;
@@ -28,8 +28,6 @@ use Unicode::Collate;
ok(1);
-#########################
-
our $IsEBCDIC = ord("A") != 0x41;
my $Collator = Unicode::Collate->new(
@@ -37,7 +35,7 @@ my $Collator = Unicode::Collate->new(
normalization => undef,
);
-##############
+##### 1
my %old_level = $Collator->change(level => 2);
@@ -64,7 +62,7 @@ if (my($pos,$len) = $Collator->index($str, $sub)) {
ok($str, $orig);
-##############
+##### 3
my $match;
@@ -126,7 +124,7 @@ if (my($pos, $len) = $Collator->index($str, $sub)) {
}
ok($match, $ret);
-##############
+##### 9
$Collator->change(level => 1);
@@ -165,7 +163,7 @@ if (my($pos,$len) = $Collator->index("", "abc")) {
}
ok($match, undef);
-##############
+##### 13
$Collator->change(level => 1);
@@ -201,7 +199,7 @@ if (my($pos, $len) = $Collator->index($str, $sub)) {
}
ok($match, $ret);
-##############
+##### 16
$Collator->change(level => 1);
@@ -246,7 +244,7 @@ ok($match, undef);
$Collator->change(%old_level);
-##############
+##### 22
my @ret;
@@ -318,7 +316,7 @@ ok($ret, undef);
$Collator->change(%old_level);
-##############
+##### 38
$Collator->change(level => 1);
@@ -349,6 +347,8 @@ $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
ok($str, "<b>Camel</b> donkey zebra <b>came\x{301}l</b> "
. "<b>CAMEL</b> horse <b>cAm\0E\0L</b>...");
+##### 47
+
# http://www.xray.mpe.mpg.de/mailing-lists/perl-unicode/2010-09/msg00014.html
# when the substring includes an ignorable element like a space...
@@ -376,6 +376,8 @@ $str = "Camel donkey zebra camex{301}l CAMEL horse cAmEL-horse...";
$Collator->gsubst($str, "ca\x{300}melho\x{302}rse", sub { "=$_[0]=" });
ok($str, "Camel donkey zebra camex{301}l =CAMEL horse= =cAmEL-horse=...");
+##### 53
+
$Collator->change(level => 3);
$str = "P\cBe\x{300}\cBrl and PERL.";
@@ -400,7 +402,7 @@ ok($str, "P\cBe\x{300}\cBrl and PERL.");
$Collator->change(%old_level);
-##############
+##### 61
$str = "Perl and Camel";
$ret = $Collator->gsubst($str, "\cA\cA\0", "AB");
@@ -422,7 +424,7 @@ $ret = $Collator->gsubst($str, 'PP', "ABC");
ok($ret, 2);
ok($str, "ABCABCP");
-##############
+##### 69
# Shifted; ignorable after variable
@@ -434,3 +436,99 @@ $Collator->change(alternate => 'Non-ignorable');
($ret) = $Collator->match("A?\x{300}!\x{301}B\x{315}", "?!");
ok($ret, undef);
+##### 71
+
+# Now preprocess is defined.
+
+$Collator->change(preprocess => sub {''});
+
+eval { $Collator->index("", "") };
+ok($@ && $@ =~ /Don't use Preprocess with index\(\)/);
+
+eval { $Collator->index("a", "a") };
+ok($@ && $@ =~ /Don't use Preprocess with index\(\)/);
+
+eval { $Collator->match("", "") };
+ok($@ && $@ =~ /Don't use Preprocess with.*match\(\)/);
+
+eval { $Collator->match("a", "a") };
+ok($@ && $@ =~ /Don't use Preprocess with.*match\(\)/);
+
+$Collator->change(preprocess => sub { uc shift });
+
+eval { $Collator->index("", "") };
+ok($@ && $@ =~ /Don't use Preprocess with index\(\)/);
+
+eval { $Collator->index("a", "a") };
+ok($@ && $@ =~ /Don't use Preprocess with index\(\)/);
+
+eval { $Collator->match("", "") };
+ok($@ && $@ =~ /Don't use Preprocess with.*match\(\)/);
+
+eval { $Collator->match("a", "a") };
+ok($@ && $@ =~ /Don't use Preprocess with.*match\(\)/);
+
+##### 79
+
+eval { require Unicode::Normalize };
+my $has_norm = !$@;
+
+if ($has_norm) {
+ # Now preprocess and normalization are defined.
+
+ $Collator->change(normalization => 'NFD');
+
+ eval { $Collator->index("", "") };
+ ok($@ && $@ =~ /Don't use Preprocess with index\(\)/);
+
+ eval { $Collator->index("a", "a") };
+ ok($@ && $@ =~ /Don't use Preprocess with index\(\)/);
+
+ eval { $Collator->match("", "") };
+ ok($@ && $@ =~ /Don't use Preprocess with.*match\(\)/);
+
+ eval { $Collator->match("a", "a") };
+ ok($@ && $@ =~ /Don't use Preprocess with.*match\(\)/);
+} else {
+ ok(1) for 1..4;
+}
+
+$Collator->change(preprocess => undef);
+
+if ($has_norm) {
+ # Now only normalization is defined.
+
+ eval { $Collator->index("", "") };
+ ok($@ && $@ =~ /Don't use Normalization with index\(\)/);
+
+ eval { $Collator->index("a", "a") };
+ ok($@ && $@ =~ /Don't use Normalization with index\(\)/);
+
+ eval { $Collator->match("", "") };
+ ok($@ && $@ =~ /Don't use Normalization with.*match\(\)/);
+
+ eval { $Collator->match("a", "a") };
+ ok($@ && $@ =~ /Don't use Normalization with.*match\(\)/);
+
+ $Collator->change(normalization => undef);
+} else {
+ ok(1) for 1..4;
+}
+
+##### 87
+
+# Now preprocess and normalization are undef.
+
+eval { $Collator->index("", "") };
+ok(!$@);
+
+eval { $Collator->index("a", "a") };
+ok(!$@);
+
+eval { $Collator->match("", "") };
+ok(!$@);
+
+eval { $Collator->match("a", "a") };
+ok(!$@);
+
+##### 91
diff --git a/cpan/Unicode-Collate/t/overcjk0.t b/cpan/Unicode-Collate/t/overcjk0.t
index 588e8a8c02..081f57b158 100644
--- a/cpan/Unicode-Collate/t/overcjk0.t
+++ b/cpan/Unicode-Collate/t/overcjk0.t
@@ -13,7 +13,7 @@ BEGIN {
use strict;
use warnings;
-BEGIN { $| = 1; print "1..285\n"; } # 6 + 31 x @Versions
+BEGIN { $| = 1; print "1..316\n"; } # 6 + 31 x @Versions
my $count = 0;
sub ok ($;$) {
my $p = my $r = shift;
@@ -62,7 +62,7 @@ ok($ignoreCJK->lt("Pe\x{5B57}rl", "Perl")); # 'r' is unassigned.
# 2A700..2B734 are CJK UI Ext.C since UCA_Version 20 (Unicode 5.2).
# 2B740..2B81D are CJK UI Ext.D since UCA_Version 22 (Unicode 6.0).
-my @Versions = (8, 9, 11, 14, 16, 18, 20, 22, 24);
+my @Versions = (8, 9, 11, 14, 16, 18, 20, 22, 24, 26);
for my $v (@Versions) {
$ignoreCJK->change(UCA_Version => $v);
diff --git a/cpan/Unicode-Collate/t/overcjk1.t b/cpan/Unicode-Collate/t/overcjk1.t
index dc3ae8f23b..7bee17658a 100644
--- a/cpan/Unicode-Collate/t/overcjk1.t
+++ b/cpan/Unicode-Collate/t/overcjk1.t
@@ -13,7 +13,7 @@ BEGIN {
use strict;
use warnings;
-BEGIN { $| = 1; print "1..155\n"; } # 11 + 16 x @Versions
+BEGIN { $| = 1; print "1..171\n"; } # 11 + 16 x @Versions
my $count = 0;
sub ok ($;$) {
my $p = my $r = shift;
@@ -62,7 +62,7 @@ ok($overCJK->lt("a\x{4E03}", "A\x{4E01}"));
# 9FC4..9FCB are CJK UI since UCA_Version 20 (Unicode 5.2).
# 9FCC is CJK UI since UCA_Version 24 (Unicode 6.1).
-my @Versions = (8, 9, 11, 14, 16, 18, 20, 22, 24);
+my @Versions = (8, 9, 11, 14, 16, 18, 20, 22, 24, 26);
for my $v (@Versions) {
$overCJK->change(UCA_Version => $v);
diff --git a/cpan/Unicode-Collate/t/test.t b/cpan/Unicode-Collate/t/test.t
index 440c3a9da1..552440f919 100644
--- a/cpan/Unicode-Collate/t/test.t
+++ b/cpan/Unicode-Collate/t/test.t
@@ -13,7 +13,7 @@ BEGIN {
use strict;
use warnings;
-BEGIN { $| = 1; print "1..107\n"; }
+BEGIN { $| = 1; print "1..112\n"; }
my $count = 0;
sub ok ($;$) {
my $p = my $r = shift;
@@ -385,5 +385,36 @@ $_ = 'Foo';
@temp = $c->index("perl5", "LR");
ok($_, 'Foo');
-#####
+##### 108..109
+
+{
+ my $caseless = Unicode::Collate->new(
+ table => "keys.txt",
+ normalization => undef,
+ preprocess => sub { uc shift },
+ );
+ ok( $Collator->gt("ABC","abc") );
+ ok( $caseless->eq("ABC","abc") );
+}
+##### 110..112
+
+{
+ eval { require Unicode::Normalize; };
+ if ($@) {
+ eval { my $n1 = Unicode::Collate->new(table => "keys.txt"); };
+ ok($@ =~ /Unicode::Normalize is required/);
+
+ eval { my $n2 = Unicode::Collate->new
+ (table => "keys.txt", normalization => undef); };
+ ok(!$@);
+
+ eval { my $n3 = Unicode::Collate->new
+ (table => "keys.txt", normalization => 'prenormalized'); };
+ ok($@ =~ /Unicode::Normalize is required/);
+ } else {
+ ok(1) for 1..3;
+ }
+}
+
+#####
diff --git a/cpan/Unicode-Collate/t/view.t b/cpan/Unicode-Collate/t/view.t
index 6f7c0fb7ae..4759533823 100644
--- a/cpan/Unicode-Collate/t/view.t
+++ b/cpan/Unicode-Collate/t/view.t
@@ -13,7 +13,7 @@ BEGIN {
use strict;
use warnings;
-BEGIN { $| = 1; print "1..53\n"; }
+BEGIN { $| = 1; print "1..89\n"; }
my $count = 0;
sub ok ($;$) {
my $p = my $r = shift;
@@ -28,15 +28,14 @@ use Unicode::Collate;
ok(1);
-#########################
+##### 1
my $Collator = Unicode::Collate->new(
table => 'keys.txt',
normalization => undef,
+ UCA_Version => 24,
);
-##############
-
ok($Collator->viewSortKey(""), "[| | |]");
ok($Collator->viewSortKey("A"), "[0A15 | 0020 | 0008 | FFFF]");
@@ -60,7 +59,7 @@ ok($Collator->viewSortKey("A"), "[0A15 | 0020 | |]");
$Collator->change(level => 1);
ok($Collator->viewSortKey("A"), "[0A15 | | |]");
-### Version 8
+##### 10
$Collator->change(level => 4, UCA_Version => 8);
@@ -87,7 +86,7 @@ ok($Collator->viewSortKey("A"), "[0A15|0020||]");
$Collator->change(level => 1);
ok($Collator->viewSortKey("A"), "[0A15|||]");
-# Version 9
+##### 19
$Collator->change(level => 3, UCA_Version => 9);
ok($Collator->viewSortKey("A\x{300}z\x{301}"),
@@ -156,7 +155,7 @@ ok($Collator->viewSortKey("?!."), '[| | | 024E 024B 0255]');
$Collator->change(%origVar);
-#####
+##### 37
# Level 3 weight
@@ -197,7 +196,7 @@ ok($Collator->viewSortKey("a\x{3042}"),
ok($Collator->viewSortKey("A\x{30A2}"),
'[0A15 1921 | 0020 0020 | 0008 0011 | FFFF FFFF]');
-#####
+##### 47
our $el = Unicode::Collate->new(
entry => <<'ENTRY',
@@ -214,6 +213,7 @@ FF2C ; [.0B03.0020.0009.FF2C] # FULLWIDTH LATIN CAPITAL LETTER L; QQK
ENTRY
table => undef,
normalization => undef,
+ UCA_Version => 24,
);
our $el12 = '0B03 0B03 0B03 0B03 0B03 | 0020 0020 0020 0020 0020';
@@ -240,5 +240,30 @@ ok($el->viewSortKey("l\x{FF4C}\x{217C}\x{2113}\x{24DB}"),
ok($el->viewSortKey("L\x{FF2C}\x{216C}\x{2112}\x{24C1}"),
"[$el12 | 0008 0009 000A 000B 000C | FFFF FFFF FFFF FFFF FFFF]");
-#####
+##### 53
+
+my @Versions = (9, 11, 14, 16, 18, 20, 22, 24, 26);
+
+for my $v (@Versions) {
+ $Collator->change(UCA_Version => $v);
+ my $app = $v >= 26 ? ' |]' : ']';
+
+ $Collator->change(variable => 'Shifted', level => 4);
+ ok($Collator->viewSortKey("1+2"),
+ '[0A0C 0A0D | 0020 0020 | 0002 0002 | FFFF 039F FFFF'.$app);
+
+ $Collator->change(variable => 'Shift-Trimmed');
+ ok($Collator->viewSortKey("1+2"),
+ '[0A0C 0A0D | 0020 0020 | 0002 0002 | 039F'.$app);
+
+ $Collator->change(variable => 'Non-ignorable', level => 3);
+ ok($Collator->viewSortKey("1+2"),
+ '[0A0C 039F 0A0D | 0020 0020 0020 | 0002 0002 0002 |]');
+
+ $Collator->change(variable => 'Blanked');
+ ok($Collator->viewSortKey("1+2"),
+ '[0A0C 0A0D | 0020 0020 | 0002 0002 |]');
+}
+
+##### 89