summaryrefslogtreecommitdiff
path: root/cpan/Unicode-Collate
diff options
context:
space:
mode:
authorSteve Hay <steve.m.hay@googlemail.com>2014-05-28 12:37:53 +0100
committerSteve Hay <steve.m.hay@googlemail.com>2014-05-28 16:58:07 +0100
commitf8187d97b5f7e833d712cb8220883e02b4c2bfa1 (patch)
tree0318c495f96f63934a5cb2b3f6a4f15cd36cd226 /cpan/Unicode-Collate
parent60d2b2aebfceab38ad669bac181bb68e42c05cb2 (diff)
downloadperl-f8187d97b5f7e833d712cb8220883e02b4c2bfa1.tar.gz
Upgrade Unicode::Collate from version 1.04 to 1.07
Diffstat (limited to 'cpan/Unicode-Collate')
-rw-r--r--cpan/Unicode-Collate/Collate.pm89
-rw-r--r--cpan/Unicode-Collate/Collate.xs50
-rw-r--r--cpan/Unicode-Collate/t/contract.t46
3 files changed, 132 insertions, 53 deletions
diff --git a/cpan/Unicode-Collate/Collate.pm b/cpan/Unicode-Collate/Collate.pm
index 0fd2951726..a67cbdff3b 100644
--- a/cpan/Unicode-Collate/Collate.pm
+++ b/cpan/Unicode-Collate/Collate.pm
@@ -17,7 +17,7 @@ use File::Spec;
no warnings 'utf8';
-our $VERSION = '1.04';
+our $VERSION = '1.07';
our $PACKAGE = __PACKAGE__;
### begin XS only ###
@@ -117,6 +117,7 @@ our @ChangeOK = qw/
katakana_before_hiragana upper_before_lower ignore_level2
overrideCJK overrideHangul overrideOut preprocess UCA_Version
hangul_terminator variable identical highestFFFF minimalFFFE
+ long_contraction
/;
our @ChangeNG = qw/
@@ -285,6 +286,7 @@ sub new
}
}
+ # only in new(), not in change()
$self->{level} ||= MaxLevel;
$self->{UCA_Version} ||= UCA_Version();
@@ -299,7 +301,10 @@ sub new
if ! exists $self->{rearrange};
$self->{backwards} = $self->{backwardsTable}
if ! exists $self->{backwards};
+ exists $self->{long_contraction} or $self->{long_contraction}
+ = 22 <= $self->{UCA_Version} && $self->{UCA_Version} <= 24;
+ # checkCollator() will be called in change()
$self->checkCollator();
return $self;
@@ -441,12 +446,10 @@ sub parseEntry
$self->{maxlength}{$uv[0]} = @uv;
}
}
- if (@uv > 2) {
- while (@uv) {
- pop @uv;
- my $fake_entry = join(CODE_SEP, @uv); # in JCPS
- $self->{contraction}{$fake_entry} = 1;
- }
+ while (@uv > 2) {
+ pop @uv;
+ my $fake_entry = join(CODE_SEP, @uv); # in JCPS
+ $self->{contraction}{$fake_entry} = 1;
}
}
@@ -486,6 +489,7 @@ sub splitEnt
my $reH = $self->{rearrangeHash};
my $vers = $self->{UCA_Version};
my $ver9 = $vers >= 9 && $vers <= 11;
+ my $long = $self->{long_contraction};
my $uXS = $self->{__useXS}; ### XS only
my @buf;
@@ -566,6 +570,15 @@ sub splitEnt
last unless $curCC;
my $tail = CODE_SEP . $src[$p];
+ if ($preCC != $curCC && $map->{$jcps.$tail}) {
+ $jcps .= $tail;
+ push @out, $p;
+ } else {
+ $preCC = $curCC;
+ }
+
+ next if !$long;
+
if ($preCC_uc != $curCC && ($map->{$jcps_uc.$tail} ||
$cont->{$jcps_uc.$tail})) {
$jcps_uc .= $tail;
@@ -573,16 +586,9 @@ sub splitEnt
} else {
$preCC_uc = $curCC;
}
-
- if ($preCC != $curCC && $map->{$jcps.$tail}) {
- $jcps .= $tail;
- push @out, $p;
- } else {
- $preCC = $curCC;
- }
}
- if ($map->{$jcps_uc}) {
+ if (@out_uc && $map->{$jcps_uc}) {
$jcps = $jcps_uc;
$src[$_] = undef for @out_uc;
} else {
@@ -1068,6 +1074,7 @@ with no parameters, the collator should do the default collation.
ignore_level2 => $bool,
katakana_before_hiragana => $bool,
level => $collationLevel,
+ long_contraction => $bool,
minimalFFFE => $bool,
normalization => $normalization_form,
overrideCJK => \&overrideCJK,
@@ -1107,6 +1114,8 @@ The following revisions are supported. The default is 28.
26 6.2.0 6.2.0 (6.2.0)
28 6.3.0 6.3.0 (6.3.0)
+* See below C<long_contraction> with C<UCA_Version> 22 and 24.
+
* Noncharacters (e.g. U+FFFF) are not ignored, and can be overridden
since C<UCA_Version> 22.
@@ -1144,7 +1153,7 @@ forwards at all the levels.
=item entry
--- see 5 Tailoring; 3.6.1 File Format, UTS #10.
+-- see 5 Tailoring; 9.1 Allkeys File Format, UTS #10.
If the same character (or a sequence of characters) exists
in the collation element table through C<table>,
@@ -1261,7 +1270,7 @@ of the string after them (in NFD by default) are used.
=item ignoreName
--- see 3.6.2 Variable Weighting, UTS #10.
+-- see 3.6 Variable Weighting, UTS #10.
Makes the entry in the table completely ignorable;
i.e. as if the weights were zero at all level.
@@ -1322,6 +1331,46 @@ and 'shift-trimmed'), the level 4 may be unreliable.
See also C<identical>.
+=item long_contraction
+
+-- see 3.8.2 Well-Formedness of the DUCET, 4.2 Produce Array, UTS #10.
+
+If the parameter is made true, for a contraction with three or more
+characters (here nicknamed "long contraction"), initial substrings
+will be handled.
+For example, a contraction ABC, where A is a starter, and B and C
+are non-starters (character with non-zero combining character class),
+will be detected even if there is not AB as a contraction.
+
+B<Default:> Usually false.
+If C<UCA_Version> is 22 or 24, and the value of C<long_contraction>
+is not specified in C<new()>, a true value is set implicitly.
+This is a workaround to pass Conformance Tests for Unicode 6.0.0 and 6.1.0.
+
+C<change()> handles C<long_contraction> explicitly only.
+If C<long_contraction> is not specified in C<change()>, even though
+C<UCA_Version> is changed, C<long_contraction> will not be changed.
+
+B<Limitation:> Scanning non-starters is one-way (no back tracking).
+If AB is found but not ABC is not found, other long contraction where
+the first character is A and the second is not B may not be found.
+
+Under C<(normalization =E<gt> undef)>, detection step of discontiguous
+contractions are skipped.
+
+B<Note:> The following contractions in DUCET are not considered
+in steps S2.1.1 to S2.1.3, where they are discontiguous.
+
+ 0FB2 0F71 0F80 (TIBETAN VOWEL SIGN VOCALIC RR)
+ 0FB3 0F71 0F80 (TIBETAN VOWEL SIGN VOCALIC LL)
+
+For example C<TIBETAN VOWEL SIGN VOCALIC RR> with C<COMBINING TILDE OVERLAY>
+(C<U+0344>) is C<0FB2 0344 0F71 0F80> in NFD.
+In this case C<0FB2 0F80> (C<TIBETAN VOWEL SIGN VOCALIC R>) is detected,
+instead of C<0FB2 0F71 0F80>.
+Inserted C<0344> makes C<0FB2 0F71 0F80> discontiguous and lack of
+contraction C<0FB2 0F71> prohibits C<0FB2 0F71 0F80> from being detected.
+
=item minimalFFFE
-- see 5.14 Collation Elements, UTS #35.
@@ -1615,7 +1664,7 @@ B<NOTE>: Contractions via C<entry> are not be suppressed.
=item table
--- see 3.6 Default Unicode Collation Element Table, UTS #10.
+-- see 3.8 Default Unicode Collation Element Table, UTS #10.
You can use another collation element table if desired.
@@ -1694,7 +1743,7 @@ this parameter doesn't work validly.
=item variable
--- see 3.6.2 Variable Weighting, UTS #10.
+-- see 3.6 Variable Weighting, UTS #10.
This key allows for variable weighting of variable collation elements,
which are marked with an ASTERISK in the table
@@ -2029,7 +2078,7 @@ B<Unicode::Normalize is required to try The Conformance Test.>
=head1 AUTHOR, COPYRIGHT AND LICENSE
The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki,
-<SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2013,
+<SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2014,
SADAHIRO Tomoyuki. Japan. All rights reserved.
This module is free software; you can redistribute it and/or
diff --git a/cpan/Unicode-Collate/Collate.xs b/cpan/Unicode-Collate/Collate.xs
index af62d0b295..ed0074b7c9 100644
--- a/cpan/Unicode-Collate/Collate.xs
+++ b/cpan/Unicode-Collate/Collate.xs
@@ -11,9 +11,9 @@
#include "ucatbl.h"
/* Perl 5.6.1 ? */
-#ifndef utf8n_to_uvuni
+#ifdef utf8_to_uv
#define utf8n_to_uvuni utf8_to_uv
-#endif /* utf8n_to_uvuni */
+#endif /* utf8_to_uv */
/* UTF8_ALLOW_BOM is used before Perl 5.8.0 */
#ifndef UTF8_ALLOW_BOM
@@ -590,36 +590,28 @@ varCE (self, vce)
/* variable: checked only the first char and the length,
trusting checkCollator() and %VariableOK in Perl ... */
- if (vlen < VCE_Length /* ignore short VCE (unexpected) */
- ||
- *a == 'n') /* non-ignorable */
- 1;
- else if (*v) {
- if (*a == 's') { /* shifted or shift-trimmed */
- d[7] = d[1]; /* wt level 1 to 4 */
- d[8] = d[2];
- } /* else blanked */
-
- d[1] = d[2] = d[3] = d[4] = d[5] = d[6] = '\0';
- }
- else if (*a == 'b') /* blanked */
- 1;
- else if (*a == 's') { /* shifted or shift-trimmed */
- totwt = d[1] + d[2] + d[3] + d[4] + d[5] + d[6];
- if (alen == 7 && totwt != 0) { /* shifted */
- if (d[1] == 0 && d[2] == 1) { /* XXX: CollationAuxiliary-6.2.0 */
+ if (vlen >= VCE_Length && *a != 'n') {
+ if (*v) {
+ if (*a == 's') { /* shifted or shift-trimmed */
d[7] = d[1]; /* wt level 1 to 4 */
d[8] = d[2];
- } else {
- d[7] = (U8)(Shift4Wt >> 8);
- d[8] = (U8)(Shift4Wt & 0xFF);
+ } /* else blanked */
+ d[1] = d[2] = d[3] = d[4] = d[5] = d[6] = '\0';
+ } else if (*a == 's') { /* shifted or shift-trimmed */
+ totwt = d[1] + d[2] + d[3] + d[4] + d[5] + d[6];
+ if (alen == 7 && totwt != 0) { /* shifted */
+ if (d[1] == 0 && d[2] == 1) { /* XXX: CollationAuxiliary-6.2.0 */
+ d[7] = d[1]; /* wt level 1 to 4 */
+ d[8] = d[2];
+ } else {
+ d[7] = (U8)(Shift4Wt >> 8);
+ d[8] = (U8)(Shift4Wt & 0xFF);
+ }
+ } else { /* shift-trimmed or completely ignorable */
+ d[7] = d[8] = '\0';
}
- } else { /* shift-trimmed or completely ignorable */
- d[7] = d[8] = '\0';
- }
- }
- else
- croak("unknown variable value '%s'", a);
+ } /* else blanked */
+ } /* else non-ignorable */
RETVAL = dst;
OUTPUT:
RETVAL
diff --git a/cpan/Unicode-Collate/t/contract.t b/cpan/Unicode-Collate/t/contract.t
index d659562fb0..cec3c8017e 100644
--- a/cpan/Unicode-Collate/t/contract.t
+++ b/cpan/Unicode-Collate/t/contract.t
@@ -16,7 +16,7 @@ BEGIN {
use strict;
use warnings;
-BEGIN { $| = 1; print "1..74\n"; }
+BEGIN { $| = 1; print "1..108\n"; }
my $count = 0;
sub ok ($;$) {
my $p = my $r = shift;
@@ -179,7 +179,9 @@ ok($kjeSup->eq("\x{40C}", "\x{41A}\x{301}"));
# 44
our $tibetanEntry = <<'ENTRIES';
-0000 ; [.0000.0000.0000.0000] # [0000] NULL (in 6429)
+0000 ; [.0000.0000.0000.0000] # [0000] NULL (in 6429)
+0FB2 ; [.205B.0020.0002.0FB2] # TIBETAN SUBJOINED LETTER RA
+0FB3 ; [.205E.0020.0002.0FB3] # TIBETAN SUBJOINED LETTER LA
0F71 ; [.206D.0020.0002.0F71] # TIBETAN VOWEL SIGN AA
0F72 ; [.206E.0020.0002.0F72] # TIBETAN VOWEL SIGN I
0F73 ; [.206F.0020.0002.0F73] # TIBETAN VOWEL SIGN II
@@ -215,6 +217,7 @@ if (!$@) {
my $tibNFD = Unicode::Collate->new(
table => undef,
entry => $tibetanEntry,
+ UCA_Version => 24,
);
# VOCALIC RR
@@ -251,8 +254,43 @@ if (!$@) {
ok($tibNFD->eq("\x{F79}\0\x{334}", "\x{FB3}\x{F80}\x{F71}\0\x{334}"));
# 72
+ my $a1 = "\x{FB2}\x{334}\x{F81}";
+ my $b1 = "\x{F77}\0\x{334}";
+ my $a2 = "\x{FB2}\x{334}\x{F81}";
+ my $b2 = "\x{FB2}\x{F80}\0\x{334}\x{F71}";
+
+ for my $v (qw/20 22 24 26 28/) {
+ my $tib = Unicode::Collate->new(
+ table => undef,
+ entry => $tibetanEntry,
+ UCA_Version => $v,
+ );
+ my $long = 22 <= $v && $v <= 24;
+ ok($tib->cmp($a1, $b1), $long ? 0 : -1);
+ ok($tib->cmp($a2, $b2), $long ? 1 : 0);
+
+ $tib->change(long_contraction => 0);
+ ok($tib->cmp($a1, $b1), -1);
+ ok($tib->cmp($a2, $b2), 0);
+
+ $tib->change(long_contraction => 1);
+ ok($tib->cmp($a1, $b1), 0);
+ ok($tib->cmp($a2, $b2), 1);
+ }
+# 102
+
+ # UCA_Version => 22
+ ok($tibNFD->cmp($a1, $b1), 0);
+ ok($tibNFD->cmp($a2, $b2), 1);
+
+ $tibNFD->change(UCA_Version => 26); # not affect long_contraction
+ ok($tibNFD->cmp($a1, $b1), 0);
+ ok($tibNFD->cmp($a2, $b2), 1);
+# 106
+
my $discontNFD = Unicode::Collate->new(
table => undef,
+ UCA_Version => 22,
entry => <<'ENTRIES',
0000 ; [.0000.0000.0000.0000] # [0000] NULL (in 6429)
0301 ; [.0000.0032.0002.0301] # COMBINING ACUTE ACCENT
@@ -268,6 +306,6 @@ ENTRIES
ok($discontNFD->eq("A\x{327}\x{301}\0\x{334}", "A\x{334}\x{327}\x{301}"));
ok($discontNFD->eq("A\x{300}\0\x{327}", "A\x{327}\x{300}"));
} else {
- ok(1) for 1..30;
+ ok(1) for 1..64;
}
-# 74
+# 108