diff options
author | Chris 'BinGOs' Williams <chris@bingosnet.co.uk> | 2012-11-14 14:46:15 +0000 |
---|---|---|
committer | Chris 'BinGOs' Williams <chris@bingosnet.co.uk> | 2012-11-14 14:46:15 +0000 |
commit | 750da83817c1e4f89d6495fec7275f6ae3981632 (patch) | |
tree | 7aec06cd5ce1d3706fa2fcf19159d2d083e627b9 /cpan/Unicode-Collate | |
parent | 265075c0d13cbdf53c9e662decbb83fc34d4e7a4 (diff) | |
download | perl-750da83817c1e4f89d6495fec7275f6ae3981632.tar.gz |
Update Unicode-Collate to CPAN version 0.92
[DELTA]
0.92 Wed Nov 14 20:58:19 2012
- fix: index() etc. with preprocess/normalization should be always croaked.
- doc: referred to the latest UTS #10 and updated its section numbers.
- supported the identical level (see 'identical' in POD).
- Now UCA_Version 26 (for Unicode 6.2.0) is supported.
- added ident.t in t.
- modified tests: cjkrange.t, compatui.t, hangtype.t, index.t,
overcjk0.t, overcjk1.t, test.t, view.t in t.
* But the default UCA_Version is still 24.
(In the next release, UCA_Version 26 will be the default.)
Diffstat (limited to 'cpan/Unicode-Collate')
-rw-r--r-- | cpan/Unicode-Collate/Changes | 14 | ||||
-rw-r--r-- | cpan/Unicode-Collate/Collate.pm | 140 | ||||
-rw-r--r-- | cpan/Unicode-Collate/Collate.xs | 31 | ||||
-rw-r--r-- | cpan/Unicode-Collate/README | 2 | ||||
-rw-r--r-- | cpan/Unicode-Collate/t/cjkrange.t | 4 | ||||
-rw-r--r-- | cpan/Unicode-Collate/t/compatui.t | 4 | ||||
-rw-r--r-- | cpan/Unicode-Collate/t/hangtype.t | 4 | ||||
-rw-r--r-- | cpan/Unicode-Collate/t/ident.t | 161 | ||||
-rw-r--r-- | cpan/Unicode-Collate/t/index.t | 122 | ||||
-rw-r--r-- | cpan/Unicode-Collate/t/overcjk0.t | 4 | ||||
-rw-r--r-- | cpan/Unicode-Collate/t/overcjk1.t | 4 | ||||
-rw-r--r-- | cpan/Unicode-Collate/t/test.t | 35 | ||||
-rw-r--r-- | cpan/Unicode-Collate/t/view.t | 43 |
13 files changed, 472 insertions, 96 deletions
diff --git a/cpan/Unicode-Collate/Changes b/cpan/Unicode-Collate/Changes index 71e8646c26..7225c3bb63 100644 --- a/cpan/Unicode-Collate/Changes +++ b/cpan/Unicode-Collate/Changes @@ -1,5 +1,17 @@ Revision history for Perl module Unicode::Collate. +0.92 Wed Nov 14 20:58:19 2012 + - fix: index() etc. with preprocess/normalization should be always croaked. + - doc: referred to the latest UTS #10 and updated its section numbers. + - supported the identical level (see 'identical' in POD). + - Now UCA_Version 26 (for Unicode 6.2.0) is supported. + - added ident.t in t. + - modified tests: cjkrange.t, compatui.t, hangtype.t, index.t, + overcjk0.t, overcjk1.t, test.t, view.t in t. + + * But the default UCA_Version is still 24. + (In the next release, UCA_Version 26 will be the default.) + 0.91 Sun Nov 4 17:00:20 2012 - XSUB: use PERL_NO_GET_CONTEXT (see perlguts) (see [rt.cpan.org #80313]) @@ -310,7 +322,7 @@ Revision history for Perl module Unicode::Collate. 0.29 Mon Oct 13 12:18:23 2003 - now UCA Version 11 (but no functionality is different from Version 9). - - supported hangul_terminator. + - supported 'hangul_terminator'. - fix: Base_Unicode_Version falsely returns Perl's Unicode version. C4 in UTS #10 requires UTS's Unicode version. - For variable weighting, 'variable' is recommended diff --git a/cpan/Unicode-Collate/Collate.pm b/cpan/Unicode-Collate/Collate.pm index 5964f83511..9e1623cf4e 100644 --- a/cpan/Unicode-Collate/Collate.pm +++ b/cpan/Unicode-Collate/Collate.pm @@ -14,7 +14,7 @@ use File::Spec; no warnings 'utf8'; -our $VERSION = '0.91'; +our $VERSION = '0.92'; our $PACKAGE = __PACKAGE__; ### begin XS only ### @@ -48,16 +48,14 @@ use constant Min3Wt => 0x02; use constant Shift4Wt => 0xFFFF; # A boolean for Variable and 16-bit weights at 4 levels of Collation Element -# PROBLEM: The Default Unicode Collation Element Table -# has weights over 0xFFFF at the 4th level. -# The tie-breaking in the variable weights -# other than "shift" (as well as "shift-trimmed") is unreliable. use constant VCE_TEMPLATE => 'Cn4'; # A sort key: 16-bit weights -# See also the PROBLEM on VCE_TEMPLATE above. use constant KEY_TEMPLATE => 'n*'; +# The tie-breaking: 32-bit weights +use constant TIE_TEMPLATE => 'N*'; + # Level separator in a sort key: # i.e. pack(KEY_TEMPLATE, 0) use constant LEVEL_SEP => "\0\0"; @@ -105,7 +103,7 @@ our @ChangeOK = qw/ alternate backwards level normalization rearrange katakana_before_hiragana upper_before_lower ignore_level2 overrideHangul overrideCJK preprocess UCA_Version - hangul_terminator variable + hangul_terminator variable identical /; our @ChangeNG = qw/ @@ -135,18 +133,18 @@ sub change { my $self = shift; my %hash = @_; my %old; - if (exists $hash{variable} && exists $hash{alternate}) { - delete $hash{alternate}; - } - elsif (!exists $hash{variable} && exists $hash{alternate}) { - $hash{variable} = $hash{alternate}; + if (exists $hash{alternate}) { + if (exists $hash{variable}) { + delete $hash{alternate}; + } else { + $hash{variable} = $hash{alternate}; + } } foreach my $k (keys %hash) { if (exists $ChangeOK{$k}) { $old{$k} = $self->{$k}; $self->{$k} = $hash{$k}; - } - elsif (exists $ChangeNG{$k}) { + } elsif (exists $ChangeNG{$k}) { croak "change of $k via change() is not allowed!"; } # else => ignored @@ -176,6 +174,7 @@ my %DerivCode = ( 20 => \&_derivCE_20, 22 => \&_derivCE_22, 24 => \&_derivCE_24, + 26 => \&_derivCE_24, # 26 == 24 ); sub checkCollator { @@ -193,12 +192,10 @@ sub checkCollator { if (! defined $self->{backwards}) { $self->{backwardsFlag} = 0; - } - elsif (! ref $self->{backwards}) { + } elsif (! ref $self->{backwards}) { _checkLevel($self->{backwards}, "backwards"); $self->{backwardsFlag} = 1 << $self->{backwards}; - } - else { + } else { my %level; $self->{backwardsFlag} = 0; for my $b (@{ $self->{backwards} }) { @@ -443,21 +440,33 @@ sub parseEntry sub viewSortKey { my $self = shift; - $self->visualizeSortKey($self->getSortKey(@_)); + my $str = shift; + $self->visualizeSortKey($self->getSortKey($str)); } +sub process +{ + my $self = shift; + my $str = shift; + my $prep = $self->{preprocess}; + my $norm = $self->{normCode}; + + $str = &$prep($str) if ref $prep; + $str = &$norm($str) if ref $norm; + return $str; +} + ## ## arrayref of JCPS = splitEnt(string to be collated) -## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, true) +## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, TRUE) ## sub splitEnt { my $self = shift; - my $wLen = $_[1]; + my $str = shift; + my $wLen = shift; # with Length - my $code = $self->{preprocess}; - my $norm = $self->{normCode}; my $map = $self->{mapping}; my $max = $self->{maxlength}; my $reH = $self->{rearrangeHash}; @@ -465,20 +474,7 @@ sub splitEnt my $ver9 = $vers >= 9 && $vers <= 11; my $uXS = $self->{__useXS}; ### XS only - my ($str, @buf); - - if ($wLen) { - $code and croak "Preprocess breaks character positions. " - . "Don't use with index(), match(), etc."; - $norm and croak "Normalization breaks character positions. " - . "Don't use with index(), match(), etc."; - $str = $_[0]; - } - else { - $str = $_[0]; - $str = &$code($str) if ref $code; - $str = &$norm($str) if ref $norm; - } + my @buf; # get array of Unicode code point of string. my @src = unpack_U($str); @@ -696,9 +692,13 @@ sub getWt sub getSortKey { my $self = shift; - my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS + my $orig = shift; + my $str = $self->process($orig); + my $rEnt = $self->splitEnt($str); # get an arrayref of JCPS my $vers = $self->{UCA_Version}; my $term = $self->{hangul_terminator}; + my $lev = $self->{level}; + my $iden = $self->{identical}; my @buf; # weight arrays if ($term) { @@ -723,7 +723,13 @@ sub getSortKey } } - return $self->mk_SortKey(\@buf); ### XS only + my $rkey = $self->mk_SortKey(\@buf); ### XS only + + if ($iden || $vers >= 26 && $lev == MaxLevel) { + $rkey .= LEVEL_SEP; + $rkey .= pack(TIE_TEMPLATE, unpack_U($str)) if $iden; + } + return $rkey; } @@ -798,9 +804,15 @@ sub _eqArray($$$) sub index { my $self = shift; + $self->{preprocess} and + croak "Don't use Preprocess with index(), match(), etc."; + $self->{normCode} and + croak "Don't use Normalization with index(), match(), etc."; + my $str = shift; my $len = length($str); - my $subE = $self->splitEnt(shift); + my $sub = shift; + my $subE = $self->splitEnt($sub); my $pos = @_ ? shift : 0; $pos = 0 if $pos < 0; my $glob = shift; @@ -1034,6 +1046,7 @@ with no parameters, the collator should do the default collation. backwards => $levelNumber, # or \@levelNumbers entry => $element, hangul_terminator => $term_primary_weight, + identical => $bool, ignoreName => qr/$ignoreName/, ignoreChar => qr/$ignoreChar/, ignore_level2 => $bool, @@ -1074,6 +1087,7 @@ The following revisions are supported. The default is 24. 20 5.2.0 5.2.0 (5.2.0) 22 6.0.0 6.0.0 (6.0.0) 24 6.1.0 6.1.0 (6.1.0) + 26 6.2.0 6.2.0 (6.2.0) * Noncharacters (e.g. U+FFFF) are not ignored, and can be overridden since C<UCA_Version> 22. @@ -1099,7 +1113,7 @@ as an alias for C<variable>. =item backwards --- see 3.1.2 French Accents, UTS #10. +-- see 3.4 Backward Accents, UTS #10. backwards => $levelNumber or \@levelNumbers @@ -1109,7 +1123,7 @@ forwards at all the levels. =item entry --- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10. +-- see 5 Tailoring; 3.6.1 File Format, UTS #10. If the same character (or a sequence of characters) exists in the collation element table through C<table>, @@ -1183,11 +1197,27 @@ automatically terminated with a terminator primary weight. These characters may need terminator included in a collation element table beforehand. +=item identical + +-- see A.3 Deterministic Comparison, UTS #10. + +By default, strings whose weights are equal should be equal, +even though their code points are not equal. + +If the parameter is made true, a final, tie-breaking level is used. +If no difference of weights is found after the comparison through all +the level (independent of the value of C<level>), the comparison with +code points will be performed. For the tie-breaking comparision, +the sort key has code points of the original string appended. + +If C<preprocess> and/or C<normalization> is applied, the code points +of the string after them (in NFD by default) are used. + =item ignoreChar =item ignoreName --- see 3.2.2 Variable Weighting, UTS #10. +-- see 3.6.2 Variable Weighting, UTS #10. Makes the entry in the table completely ignorable; i.e. as if the weights were zero at all level. @@ -1214,7 +1244,7 @@ B<NOTE>: C<level> should be 3 or greater. =item katakana_before_hiragana --- see 7.3.1 Tertiary Weight Table, UTS #10. +-- see 7.2 Tertiary Weight Table, UTS #10. By default, hiragana is before katakana. If the parameter is made true, this is reversed. @@ -1241,6 +1271,13 @@ Any higher levels than the specified one are ignored. If omitted, the maximum is the 4th. +B<NOTE:> The DUCET includes weights over 0xFFFF at the 4th level. +But this module only uses weights within 0xFFFF. +When C<variable> is 'blanked' or 'non-ignorable' (other than 'shifted' +and 'shift-trimmed'), the level 4 may be unreliable. + +See also C<identical>. + =item normalization -- see 4.1 Normalize, UTS #10. @@ -1295,7 +1332,7 @@ those in the CJK Unified Ideographs Extension A etc. U+4E00..U+9FBB if UCA_Version is 14 or 16. U+4E00..U+9FC3 if UCA_Version is 18. U+4E00..U+9FCB if UCA_Version is 20 or 22. - U+4E00..U+9FCC if UCA_Version is 24. + U+4E00..U+9FCC if UCA_Version is 24 or 26. In the CJK Unified Ideographs Extension blocks: Ext.A (U+3400..U+4DB5) and Ext.B (U+20000..U+2A6D6) in any UCA_Version. @@ -1373,7 +1410,7 @@ in C<table> or C<entry> is still valid. =item preprocess --- see 5.1 Preprocessing, UTS #10. +-- see 5.4 Preprocessing, UTS #10. If specified, the coderef is used to preprocess each string before the formation of sort keys. @@ -1402,7 +1439,7 @@ L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>. =item rearrange --- see 3.1.3 Rearrangement, UTS #10. +-- see 3.5 Rearrangement, UTS #10. Characters that are not coded in logical order and to be rearranged. If C<UCA_Version> is equal to or lesser than 11, default is: @@ -1458,7 +1495,7 @@ B<NOTE>: Contractions via C<entry> are not be suppressed. =item table --- see 3.2 Default Unicode Collation Element Table, UTS #10. +-- see 3.6 Default Unicode Collation Element Table, UTS #10. You can use another collation element table if desired. @@ -1537,7 +1574,7 @@ this parameter doesn't work validly. =item variable --- see 3.2.2 Variable Weighting, UTS #10. +-- see 3.6.2 Variable Weighting, UTS #10. This key allows for variable weighting of variable collation elements, which are marked with an ASTERISK in the table @@ -1861,6 +1898,11 @@ a collator via C<Unicode::Collate-E<gt>new( )> should be used; for F<CollationTest_NON_IGNORABLE.txt>, a collator via C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>. +If C<UCA_Version> is 26 or later, the C<identical> level is preferred; +C<Unicode::Collate-E<gt>new(identical =E<gt> 1)> and +C<Unicode::Collate-E<gt>new(identical =E<gt> 1,> +C<variable =E<gt> "non-ignorable", level =E<gt> 3)> should be used. + B<Unicode::Normalize is required to try The Conformance Test.> =back diff --git a/cpan/Unicode-Collate/Collate.xs b/cpan/Unicode-Collate/Collate.xs index 94ff6e41f6..4d4ecca3ee 100644 --- a/cpan/Unicode-Collate/Collate.xs +++ b/cpan/Unicode-Collate/Collate.xs @@ -617,10 +617,14 @@ varCE (self, vce) else if (*a == 's') { /* shifted or shift-trimmed */ totwt = d[1] + d[2] + d[3] + d[4] + d[5] + d[6]; if (alen == 7 && totwt != 0) { /* shifted */ - d[7] = (U8)(Shift4Wt >> 8); - d[8] = (U8)(Shift4Wt & 0xFF); - } - else { /* shift-trimmed */ + if (d[1] == 0 && d[2] == 1) { /* XXX: CollationAuxiliary-6.2.0 */ + d[7] = d[1]; /* wt level 1 to 4 */ + d[8] = d[2]; + } else { + d[7] = (U8)(Shift4Wt >> 8); + d[8] = (U8)(Shift4Wt & 0xFF); + } + } else { /* shift-trimmed */ d[7] = d[8] = '\0'; } } @@ -642,7 +646,7 @@ visualizeSortKey (self, key) U8 *s, *e, *d; STRLEN klen, dlen; UV uv; - IV uca_vers; + IV uca_vers, sep = 0; static const char *upperhex = "0123456789ABCDEF"; CODE: if (SvROK(self) && SvTYPE(SvRV(self)) == SVt_PVHV) @@ -658,10 +662,13 @@ visualizeSortKey (self, key) s = (U8*)SvPV(key, klen); /* slightly *longer* than the need, but I'm afraid of miscounting; - exactly: (klen / 2) * 5 + MaxLevel * 2 - 1 (excluding '\0') - = (klen / 2) * 5 - 1 # FFFF (16bit) and ' ' between 16bit units - + (MaxLevel - 1) * 2 # ' ' and '|' for level boundaries - + 2 # '[' and ']' + = (klen / 2) * 5 - 1 + # FFFF and ' ' for each 16bit units but ' ' is less by 1; + # ' ' and '|' for level boundaries including the identical level + + 2 # '[' and ']' + + 1 # '\0' + (a) if klen is odd (not expected), maybe more 5 bytes. + (b) there is not always the identical level. */ dlen = (klen / 2) * 5 + MaxLevel * 2 + 2; dst = newSV(dlen); @@ -671,18 +678,18 @@ visualizeSortKey (self, key) *d++ = '['; for (e = s + klen; s < e; s += 2) { uv = (U16)(*s << 8 | s[1]); - if (uv) { + if (uv || sep >= MaxLevel) { if ((d[-1] != '[') && ((9 <= uca_vers) || (d[-1] != '|'))) *d++ = ' '; *d++ = upperhex[ (s[0] >> 4) & 0xF ]; *d++ = upperhex[ s[0] & 0xF ]; *d++ = upperhex[ (s[1] >> 4) & 0xF ]; *d++ = upperhex[ s[1] & 0xF ]; - } - else { + } else { if ((9 <= uca_vers) && (d[-1] != '[')) *d++ = ' '; *d++ = '|'; + ++sep; } } *d++ = ']'; diff --git a/cpan/Unicode-Collate/README b/cpan/Unicode-Collate/README index 5947585c0a..26e8ff5e6b 100644 --- a/cpan/Unicode-Collate/README +++ b/cpan/Unicode-Collate/README @@ -1,4 +1,4 @@ -Unicode/Collate version 0.91 +Unicode/Collate version 0.92 =============================== NAME diff --git a/cpan/Unicode-Collate/t/cjkrange.t b/cpan/Unicode-Collate/t/cjkrange.t index 37fb9fd279..e3d4f38d73 100644 --- a/cpan/Unicode-Collate/t/cjkrange.t +++ b/cpan/Unicode-Collate/t/cjkrange.t @@ -13,7 +13,7 @@ BEGIN { use strict; use warnings; -BEGIN { $| = 1; print "1..379\n"; } # 1 + 42 x @Versions +BEGIN { $| = 1; print "1..421\n"; } # 1 + 42 x @Versions my $count = 0; sub ok ($;$) { my $p = my $r = shift; @@ -49,7 +49,7 @@ my $coll = Unicode::Collate->new( # 2A700..2B734 are CJK UI Ext.C since UCA_Version 20 (Unicode 5.2). # 2B740..2B81D are CJK UI Ext.D since UCA_Version 22 (Unicode 6.0). -my @Versions = (8, 9, 11, 14, 16, 18, 20, 22, 24); +my @Versions = (8, 9, 11, 14, 16, 18, 20, 22, 24, 26); for my $v (@Versions) { $coll->change(UCA_Version => $v); diff --git a/cpan/Unicode-Collate/t/compatui.t b/cpan/Unicode-Collate/t/compatui.t index 6fb01b892c..822743ed8f 100644 --- a/cpan/Unicode-Collate/t/compatui.t +++ b/cpan/Unicode-Collate/t/compatui.t @@ -13,7 +13,7 @@ BEGIN { use strict; use warnings; -BEGIN { $| = 1; print "1..631\n"; } # 1 + 70 x @Versions +BEGIN { $| = 1; print "1..701\n"; } # 1 + 70 x @Versions my $count = 0; sub ok ($;$) { my $p = my $r = shift; @@ -30,7 +30,7 @@ ok(1); ######################### -my @Versions = (8, 9, 11, 14, 16, 18, 20, 22, 24); +my @Versions = (8, 9, 11, 14, 16, 18, 20, 22, 24, 26); # 12 compatibility ideographs are treated as unified ideographs: # FA0E, FA0F, FA11, FA13, FA14, FA1F, FA21, FA23, FA24, FA27, FA28, FA29. diff --git a/cpan/Unicode-Collate/t/hangtype.t b/cpan/Unicode-Collate/t/hangtype.t index b85a308f12..5aa7d49153 100644 --- a/cpan/Unicode-Collate/t/hangtype.t +++ b/cpan/Unicode-Collate/t/hangtype.t @@ -13,7 +13,7 @@ BEGIN { use strict; use warnings; -BEGIN { $| = 1; print "1..451\n"; } # 1 + 50 x @Versions +BEGIN { $| = 1; print "1..501\n"; } # 1 + 50 x @Versions my $count = 0; sub ok ($;$) { my $p = my $r = shift; @@ -30,7 +30,7 @@ ok(1); ######################### -my @Versions = (8, 9, 11, 14, 16, 18, 20, 22, 24); +my @Versions = (8, 9, 11, 14, 16, 18, 20, 22, 24, 26); for my $v (@Versions) { ok(Unicode::Collate::getHST(0x0000, $v), ''); diff --git a/cpan/Unicode-Collate/t/ident.t b/cpan/Unicode-Collate/t/ident.t new file mode 100644 index 0000000000..4f132d4ec1 --- /dev/null +++ b/cpan/Unicode-Collate/t/ident.t @@ -0,0 +1,161 @@ + +BEGIN { + unless ("A" eq pack('U', 0x41)) { + print "1..0 # Unicode::Collate " . + "cannot stringify a Unicode code point\n"; + exit 0; + } + if ($ENV{PERL_CORE}) { + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + } +} + +use strict; +use warnings; +BEGIN { $| = 1; print "1..45\n"; } +my $count = 0; +sub ok ($;$) { + my $p = my $r = shift; + if (@_) { + my $x = shift; + $p = !defined $x ? !defined $r : !defined $r ? 0 : $r eq $x; + } + print $p ? "ok" : "not ok", ' ', ++$count, "\n"; +} + +use Unicode::Collate; + +ok(1); + +######################### + +my $Collator = Unicode::Collate->new( + table => 'keys.txt', + normalization => undef, +); + +# [001F] UNIT SEPARATOR +{ + ok($Collator->eq("\0", "\x1F")); + ok($Collator->eq("\x1F", "\x{200B}")); + ok($Collator->eq("\0", "\x{200B}")); + ok($Collator->eq("\x{313}", "\x{343}")); + ok($Collator->eq("\x{2000}", "\x{2001}")); + ok($Collator->eq("\x{200B}", "\x{200C}")); + ok($Collator->eq("\x{304C}", "\x{304B}\x{3099}")); + + $Collator->change(identical => 1); + + ok($Collator->lt("\0", "\x1F")); + ok($Collator->lt("\x1F", "\x{200B}")); + ok($Collator->lt("\0", "\x{200B}")); + ok($Collator->lt("\x{313}", "\x{343}")); + ok($Collator->lt("\x{2000}", "\x{2001}")); + ok($Collator->lt("\x{200B}", "\x{200C}")); + ok($Collator->gt("\x{304C}", "\x{304B}\x{3099}")); + + $Collator->change(identical => 0); + + ok($Collator->eq("\0", "\x1F")); + ok($Collator->eq("\x1F", "\x{200B}")); + ok($Collator->eq("\0", "\x{200B}")); + ok($Collator->eq("\x{313}", "\x{343}")); + ok($Collator->eq("\x{2000}", "\x{2001}")); + ok($Collator->eq("\x{200B}", "\x{200C}")); + ok($Collator->eq("\x{304C}", "\x{304B}\x{3099}")); +} + +#### 22 + +eval { require Unicode::Normalize }; +if (!$@) { + $Collator->change(normalization => "NFD"); + + $Collator->change(identical => 1); + + ok($Collator->lt("\0", "\x{200B}")); + ok($Collator->eq("\x{313}", "\x{343}")); + ok($Collator->lt("\x{2000}", "\x{2001}")); + ok($Collator->lt("\x{200B}", "\x{200C}")); + ok($Collator->eq("\x{304C}", "\x{304B}\x{3099}")); + + $Collator->change(identical => 0); + + ok($Collator->eq("\0", "\x{200B}")); + ok($Collator->eq("\x{313}", "\x{343}")); + ok($Collator->eq("\x{2000}", "\x{2001}")); + ok($Collator->eq("\x{200B}", "\x{200C}")); + ok($Collator->eq("\x{304C}", "\x{304B}\x{3099}")); +} else { + ok(1) for 1..10; +} + +$Collator->change(normalization => undef, identical => 1); + +##### 32 + +ok($Collator->viewSortKey("\0"), '[| | | | 0000 0000]'); +ok($Collator->viewSortKey("\x{200B}"), '[| | | | 0000 200B]'); + +ok($Collator->viewSortKey('a'), + '[0A15 | 0020 | 0002 | FFFF | 0000 0061]'); + +ok($Collator->viewSortKey("\x{304C}"), + '[1926 | 0020 013D | 000E 0002 | FFFF FFFF | 0000 304C]'); + +ok($Collator->viewSortKey("\x{100000}"), + '[FBE0 8000 | 0020 | 0002 | FFFF FFFF | 0010 0000]'); + +eval { require Unicode::Normalize }; +if (!$@) { + $Collator->change(normalization => "NFD"); + + ok($Collator->viewSortKey("\x{304C}"), + '[1926 | 0020 013D | 000E 0002 | FFFF FFFF | 0000 304B 0000 3099]'); +} else { + ok(1); +} + +$Collator->change(normalization => undef); + +##### 38 + +$Collator->change(level => 3); + +ok($Collator->viewSortKey("\x{304C}"), + '[1926 | 0020 013D | 000E 0002 | | 0000 304C]'); + +$Collator->change(level => 2); + +ok($Collator->viewSortKey("\x{304C}"), + '[1926 | 0020 013D | | | 0000 304C]'); + +$Collator->change(level => 1); + +ok($Collator->viewSortKey("\x{304C}"), + '[1926 | | | | 0000 304C]'); + +##### 41 + +$Collator->change(UCA_Version => 8); + +ok($Collator->viewSortKey("\x{304C}"), + '[1926||||0000 304C]'); + +$Collator->change(level => 2); + +ok($Collator->viewSortKey("\x{304C}"), + '[1926|0020 013D|||0000 304C]'); + +$Collator->change(level => 3); + +ok($Collator->viewSortKey("\x{304C}"), + '[1926|0020 013D|000E 0002||0000 304C]'); + +$Collator->change(level => 4); + +ok($Collator->viewSortKey("\x{304C}"), + '[1926|0020 013D|000E 0002|FFFF FFFF|0000 304C]'); + +##### 45 diff --git a/cpan/Unicode-Collate/t/index.t b/cpan/Unicode-Collate/t/index.t index b3433a9e5f..11cf618f5a 100644 --- a/cpan/Unicode-Collate/t/index.t +++ b/cpan/Unicode-Collate/t/index.t @@ -13,7 +13,7 @@ BEGIN { use strict; use warnings; -BEGIN { $| = 1; print "1..71\n"; } +BEGIN { $| = 1; print "1..91\n"; } my $count = 0; sub ok ($;$) { my $p = my $r = shift; @@ -28,8 +28,6 @@ use Unicode::Collate; ok(1); -######################### - our $IsEBCDIC = ord("A") != 0x41; my $Collator = Unicode::Collate->new( @@ -37,7 +35,7 @@ my $Collator = Unicode::Collate->new( normalization => undef, ); -############## +##### 1 my %old_level = $Collator->change(level => 2); @@ -64,7 +62,7 @@ if (my($pos,$len) = $Collator->index($str, $sub)) { ok($str, $orig); -############## +##### 3 my $match; @@ -126,7 +124,7 @@ if (my($pos, $len) = $Collator->index($str, $sub)) { } ok($match, $ret); -############## +##### 9 $Collator->change(level => 1); @@ -165,7 +163,7 @@ if (my($pos,$len) = $Collator->index("", "abc")) { } ok($match, undef); -############## +##### 13 $Collator->change(level => 1); @@ -201,7 +199,7 @@ if (my($pos, $len) = $Collator->index($str, $sub)) { } ok($match, $ret); -############## +##### 16 $Collator->change(level => 1); @@ -246,7 +244,7 @@ ok($match, undef); $Collator->change(%old_level); -############## +##### 22 my @ret; @@ -318,7 +316,7 @@ ok($ret, undef); $Collator->change(%old_level); -############## +##### 38 $Collator->change(level => 1); @@ -349,6 +347,8 @@ $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" }); ok($str, "<b>Camel</b> donkey zebra <b>came\x{301}l</b> " . "<b>CAMEL</b> horse <b>cAm\0E\0L</b>..."); +##### 47 + # http://www.xray.mpe.mpg.de/mailing-lists/perl-unicode/2010-09/msg00014.html # when the substring includes an ignorable element like a space... @@ -376,6 +376,8 @@ $str = "Camel donkey zebra camex{301}l CAMEL horse cAmEL-horse..."; $Collator->gsubst($str, "ca\x{300}melho\x{302}rse", sub { "=$_[0]=" }); ok($str, "Camel donkey zebra camex{301}l =CAMEL horse= =cAmEL-horse=..."); +##### 53 + $Collator->change(level => 3); $str = "P\cBe\x{300}\cBrl and PERL."; @@ -400,7 +402,7 @@ ok($str, "P\cBe\x{300}\cBrl and PERL."); $Collator->change(%old_level); -############## +##### 61 $str = "Perl and Camel"; $ret = $Collator->gsubst($str, "\cA\cA\0", "AB"); @@ -422,7 +424,7 @@ $ret = $Collator->gsubst($str, 'PP', "ABC"); ok($ret, 2); ok($str, "ABCABCP"); -############## +##### 69 # Shifted; ignorable after variable @@ -434,3 +436,99 @@ $Collator->change(alternate => 'Non-ignorable'); ($ret) = $Collator->match("A?\x{300}!\x{301}B\x{315}", "?!"); ok($ret, undef); +##### 71 + +# Now preprocess is defined. + +$Collator->change(preprocess => sub {''}); + +eval { $Collator->index("", "") }; +ok($@ && $@ =~ /Don't use Preprocess with index\(\)/); + +eval { $Collator->index("a", "a") }; +ok($@ && $@ =~ /Don't use Preprocess with index\(\)/); + +eval { $Collator->match("", "") }; +ok($@ && $@ =~ /Don't use Preprocess with.*match\(\)/); + +eval { $Collator->match("a", "a") }; +ok($@ && $@ =~ /Don't use Preprocess with.*match\(\)/); + +$Collator->change(preprocess => sub { uc shift }); + +eval { $Collator->index("", "") }; +ok($@ && $@ =~ /Don't use Preprocess with index\(\)/); + +eval { $Collator->index("a", "a") }; +ok($@ && $@ =~ /Don't use Preprocess with index\(\)/); + +eval { $Collator->match("", "") }; +ok($@ && $@ =~ /Don't use Preprocess with.*match\(\)/); + +eval { $Collator->match("a", "a") }; +ok($@ && $@ =~ /Don't use Preprocess with.*match\(\)/); + +##### 79 + +eval { require Unicode::Normalize }; +my $has_norm = !$@; + +if ($has_norm) { + # Now preprocess and normalization are defined. + + $Collator->change(normalization => 'NFD'); + + eval { $Collator->index("", "") }; + ok($@ && $@ =~ /Don't use Preprocess with index\(\)/); + + eval { $Collator->index("a", "a") }; + ok($@ && $@ =~ /Don't use Preprocess with index\(\)/); + + eval { $Collator->match("", "") }; + ok($@ && $@ =~ /Don't use Preprocess with.*match\(\)/); + + eval { $Collator->match("a", "a") }; + ok($@ && $@ =~ /Don't use Preprocess with.*match\(\)/); +} else { + ok(1) for 1..4; +} + +$Collator->change(preprocess => undef); + +if ($has_norm) { + # Now only normalization is defined. + + eval { $Collator->index("", "") }; + ok($@ && $@ =~ /Don't use Normalization with index\(\)/); + + eval { $Collator->index("a", "a") }; + ok($@ && $@ =~ /Don't use Normalization with index\(\)/); + + eval { $Collator->match("", "") }; + ok($@ && $@ =~ /Don't use Normalization with.*match\(\)/); + + eval { $Collator->match("a", "a") }; + ok($@ && $@ =~ /Don't use Normalization with.*match\(\)/); + + $Collator->change(normalization => undef); +} else { + ok(1) for 1..4; +} + +##### 87 + +# Now preprocess and normalization are undef. + +eval { $Collator->index("", "") }; +ok(!$@); + +eval { $Collator->index("a", "a") }; +ok(!$@); + +eval { $Collator->match("", "") }; +ok(!$@); + +eval { $Collator->match("a", "a") }; +ok(!$@); + +##### 91 diff --git a/cpan/Unicode-Collate/t/overcjk0.t b/cpan/Unicode-Collate/t/overcjk0.t index 588e8a8c02..081f57b158 100644 --- a/cpan/Unicode-Collate/t/overcjk0.t +++ b/cpan/Unicode-Collate/t/overcjk0.t @@ -13,7 +13,7 @@ BEGIN { use strict; use warnings; -BEGIN { $| = 1; print "1..285\n"; } # 6 + 31 x @Versions +BEGIN { $| = 1; print "1..316\n"; } # 6 + 31 x @Versions my $count = 0; sub ok ($;$) { my $p = my $r = shift; @@ -62,7 +62,7 @@ ok($ignoreCJK->lt("Pe\x{5B57}rl", "Perl")); # 'r' is unassigned. # 2A700..2B734 are CJK UI Ext.C since UCA_Version 20 (Unicode 5.2). # 2B740..2B81D are CJK UI Ext.D since UCA_Version 22 (Unicode 6.0). -my @Versions = (8, 9, 11, 14, 16, 18, 20, 22, 24); +my @Versions = (8, 9, 11, 14, 16, 18, 20, 22, 24, 26); for my $v (@Versions) { $ignoreCJK->change(UCA_Version => $v); diff --git a/cpan/Unicode-Collate/t/overcjk1.t b/cpan/Unicode-Collate/t/overcjk1.t index dc3ae8f23b..7bee17658a 100644 --- a/cpan/Unicode-Collate/t/overcjk1.t +++ b/cpan/Unicode-Collate/t/overcjk1.t @@ -13,7 +13,7 @@ BEGIN { use strict; use warnings; -BEGIN { $| = 1; print "1..155\n"; } # 11 + 16 x @Versions +BEGIN { $| = 1; print "1..171\n"; } # 11 + 16 x @Versions my $count = 0; sub ok ($;$) { my $p = my $r = shift; @@ -62,7 +62,7 @@ ok($overCJK->lt("a\x{4E03}", "A\x{4E01}")); # 9FC4..9FCB are CJK UI since UCA_Version 20 (Unicode 5.2). # 9FCC is CJK UI since UCA_Version 24 (Unicode 6.1). -my @Versions = (8, 9, 11, 14, 16, 18, 20, 22, 24); +my @Versions = (8, 9, 11, 14, 16, 18, 20, 22, 24, 26); for my $v (@Versions) { $overCJK->change(UCA_Version => $v); diff --git a/cpan/Unicode-Collate/t/test.t b/cpan/Unicode-Collate/t/test.t index 440c3a9da1..552440f919 100644 --- a/cpan/Unicode-Collate/t/test.t +++ b/cpan/Unicode-Collate/t/test.t @@ -13,7 +13,7 @@ BEGIN { use strict; use warnings; -BEGIN { $| = 1; print "1..107\n"; } +BEGIN { $| = 1; print "1..112\n"; } my $count = 0; sub ok ($;$) { my $p = my $r = shift; @@ -385,5 +385,36 @@ $_ = 'Foo'; @temp = $c->index("perl5", "LR"); ok($_, 'Foo'); -##### +##### 108..109 + +{ + my $caseless = Unicode::Collate->new( + table => "keys.txt", + normalization => undef, + preprocess => sub { uc shift }, + ); + ok( $Collator->gt("ABC","abc") ); + ok( $caseless->eq("ABC","abc") ); +} +##### 110..112 + +{ + eval { require Unicode::Normalize; }; + if ($@) { + eval { my $n1 = Unicode::Collate->new(table => "keys.txt"); }; + ok($@ =~ /Unicode::Normalize is required/); + + eval { my $n2 = Unicode::Collate->new + (table => "keys.txt", normalization => undef); }; + ok(!$@); + + eval { my $n3 = Unicode::Collate->new + (table => "keys.txt", normalization => 'prenormalized'); }; + ok($@ =~ /Unicode::Normalize is required/); + } else { + ok(1) for 1..3; + } +} + +##### diff --git a/cpan/Unicode-Collate/t/view.t b/cpan/Unicode-Collate/t/view.t index 6f7c0fb7ae..4759533823 100644 --- a/cpan/Unicode-Collate/t/view.t +++ b/cpan/Unicode-Collate/t/view.t @@ -13,7 +13,7 @@ BEGIN { use strict; use warnings; -BEGIN { $| = 1; print "1..53\n"; } +BEGIN { $| = 1; print "1..89\n"; } my $count = 0; sub ok ($;$) { my $p = my $r = shift; @@ -28,15 +28,14 @@ use Unicode::Collate; ok(1); -######################### +##### 1 my $Collator = Unicode::Collate->new( table => 'keys.txt', normalization => undef, + UCA_Version => 24, ); -############## - ok($Collator->viewSortKey(""), "[| | |]"); ok($Collator->viewSortKey("A"), "[0A15 | 0020 | 0008 | FFFF]"); @@ -60,7 +59,7 @@ ok($Collator->viewSortKey("A"), "[0A15 | 0020 | |]"); $Collator->change(level => 1); ok($Collator->viewSortKey("A"), "[0A15 | | |]"); -### Version 8 +##### 10 $Collator->change(level => 4, UCA_Version => 8); @@ -87,7 +86,7 @@ ok($Collator->viewSortKey("A"), "[0A15|0020||]"); $Collator->change(level => 1); ok($Collator->viewSortKey("A"), "[0A15|||]"); -# Version 9 +##### 19 $Collator->change(level => 3, UCA_Version => 9); ok($Collator->viewSortKey("A\x{300}z\x{301}"), @@ -156,7 +155,7 @@ ok($Collator->viewSortKey("?!."), '[| | | 024E 024B 0255]'); $Collator->change(%origVar); -##### +##### 37 # Level 3 weight @@ -197,7 +196,7 @@ ok($Collator->viewSortKey("a\x{3042}"), ok($Collator->viewSortKey("A\x{30A2}"), '[0A15 1921 | 0020 0020 | 0008 0011 | FFFF FFFF]'); -##### +##### 47 our $el = Unicode::Collate->new( entry => <<'ENTRY', @@ -214,6 +213,7 @@ FF2C ; [.0B03.0020.0009.FF2C] # FULLWIDTH LATIN CAPITAL LETTER L; QQK ENTRY table => undef, normalization => undef, + UCA_Version => 24, ); our $el12 = '0B03 0B03 0B03 0B03 0B03 | 0020 0020 0020 0020 0020'; @@ -240,5 +240,30 @@ ok($el->viewSortKey("l\x{FF4C}\x{217C}\x{2113}\x{24DB}"), ok($el->viewSortKey("L\x{FF2C}\x{216C}\x{2112}\x{24C1}"), "[$el12 | 0008 0009 000A 000B 000C | FFFF FFFF FFFF FFFF FFFF]"); -##### +##### 53 + +my @Versions = (9, 11, 14, 16, 18, 20, 22, 24, 26); + +for my $v (@Versions) { + $Collator->change(UCA_Version => $v); + my $app = $v >= 26 ? ' |]' : ']'; + + $Collator->change(variable => 'Shifted', level => 4); + ok($Collator->viewSortKey("1+2"), + '[0A0C 0A0D | 0020 0020 | 0002 0002 | FFFF 039F FFFF'.$app); + + $Collator->change(variable => 'Shift-Trimmed'); + ok($Collator->viewSortKey("1+2"), + '[0A0C 0A0D | 0020 0020 | 0002 0002 | 039F'.$app); + + $Collator->change(variable => 'Non-ignorable', level => 3); + ok($Collator->viewSortKey("1+2"), + '[0A0C 039F 0A0D | 0020 0020 0020 | 0002 0002 0002 |]'); + + $Collator->change(variable => 'Blanked'); + ok($Collator->viewSortKey("1+2"), + '[0A0C 0A0D | 0020 0020 | 0002 0002 |]'); +} + +##### 89 |