Upgrade Unicode::Collate from version 1.04 to 1.07

author: Steve Hay <steve.m.hay@googlemail.com> 2014-05-28 12:37:53 +0100
committer: Steve Hay <steve.m.hay@googlemail.com> 2014-05-28 16:58:07 +0100
commit: f8187d97b5f7e833d712cb8220883e02b4c2bfa1 (patch)
tree: 0318c495f96f63934a5cb2b3f6a4f15cd36cd226 /cpan/Unicode-Collate
parent: 60d2b2aebfceab38ad669bac181bb68e42c05cb2 (diff)
download: perl-f8187d97b5f7e833d712cb8220883e02b4c2bfa1.tar.gz
3 files changed, 132 insertions, 53 deletions
diff --git a/cpan/Unicode-Collate/Collate.pm b/cpan/Unicode-Collate/Collate.pm
index 0fd2951726..a67cbdff3b 100644
--- a/cpan/Unicode-Collate/Collate.pm
+++ b/cpan/Unicode-Collate/Collate.pm
@@ -17,7 +17,7 @@ use File::Spec;
 
 no warnings 'utf8';
 
-our $VERSION = '1.04';
+our $VERSION = '1.07';
 our $PACKAGE = __PACKAGE__;
 
 ### begin XS only ###
@@ -117,6 +117,7 @@ our @ChangeOK = qw/
     katakana_before_hiragana upper_before_lower ignore_level2
     overrideCJK overrideHangul overrideOut preprocess UCA_Version
     hangul_terminator variable identical highestFFFF minimalFFFE
+    long_contraction
   /;
 
 our @ChangeNG = qw/
@@ -285,6 +286,7 @@ sub new
 	}
     }
 
+    # only in new(), not in change()
     $self->{level} ||= MaxLevel;
     $self->{UCA_Version} ||= UCA_Version();
 
@@ -299,7 +301,10 @@ sub new
 	if ! exists $self->{rearrange};
     $self->{backwards} = $self->{backwardsTable}
 	if ! exists $self->{backwards};
+    exists $self->{long_contraction} or $self->{long_contraction}
+	= 22 <= $self->{UCA_Version} && $self->{UCA_Version} <= 24;
 
+    # checkCollator() will be called in change()
     $self->checkCollator();
 
     return $self;
@@ -441,12 +446,10 @@ sub parseEntry
 	    $self->{maxlength}{$uv[0]} = @uv;
 	}
     }
-    if (@uv > 2) {
-	while (@uv) {
-	    pop @uv;
-	    my $fake_entry = join(CODE_SEP, @uv); # in JCPS
-	    $self->{contraction}{$fake_entry} = 1;
-	}
+    while (@uv > 2) {
+	pop @uv;
+	my $fake_entry = join(CODE_SEP, @uv); # in JCPS
+	$self->{contraction}{$fake_entry} = 1;
     }
 }
 
@@ -486,6 +489,7 @@ sub splitEnt
     my $reH  = $self->{rearrangeHash};
     my $vers = $self->{UCA_Version};
     my $ver9 = $vers >= 9 && $vers <= 11;
+    my $long = $self->{long_contraction};
     my $uXS  = $self->{__useXS}; ### XS only
 
     my @buf;
@@ -566,6 +570,15 @@ sub splitEnt
 		    last unless $curCC;
 		    my $tail = CODE_SEP . $src[$p];
 
+		    if ($preCC != $curCC && $map->{$jcps.$tail}) {
+			$jcps .= $tail;
+			push @out, $p;
+		    } else {
+			$preCC = $curCC;
+		    }
+
+		    next if !$long;
+
 		    if ($preCC_uc != $curCC && ($map->{$jcps_uc.$tail} ||
 					       $cont->{$jcps_uc.$tail})) {
 			$jcps_uc .= $tail;
@@ -573,16 +586,9 @@ sub splitEnt
 		    } else {
 			$preCC_uc = $curCC;
 		    }
-
-		    if ($preCC != $curCC && $map->{$jcps.$tail}) {
-			$jcps .= $tail;
-			push @out, $p;
-		    } else {
-			$preCC = $curCC;
-		    }
 		}
 
-		if ($map->{$jcps_uc}) {
+		if (@out_uc && $map->{$jcps_uc}) {
 		    $jcps = $jcps_uc;
 		    $src[$_] = undef for @out_uc;
 		} else {
@@ -1068,6 +1074,7 @@ with no parameters, the collator should do the default collation.
       ignore_level2 => $bool,
       katakana_before_hiragana => $bool,
       level => $collationLevel,
+      long_contraction => $bool,
       minimalFFFE => $bool,
       normalization  => $normalization_form,
       overrideCJK => \&overrideCJK,
@@ -1107,6 +1114,8 @@ The following revisions are supported.  The default is 28.
      26             6.2.0               6.2.0 (6.2.0)
      28             6.3.0               6.3.0 (6.3.0)
 
+* See below C<long_contraction> with C<UCA_Version> 22 and 24.
+
 * Noncharacters (e.g. U+FFFF) are not ignored, and can be overridden
 since C<UCA_Version> 22.
 
@@ -1144,7 +1153,7 @@ forwards at all the levels.
 
 =item entry
 
--- see 5 Tailoring; 3.6.1 File Format, UTS #10.
+-- see 5 Tailoring; 9.1 Allkeys File Format, UTS #10.
 
 If the same character (or a sequence of characters) exists
 in the collation element table through C<table>,
@@ -1261,7 +1270,7 @@ of the string after them (in NFD by default) are used.
 
 =item ignoreName
 
--- see 3.6.2 Variable Weighting, UTS #10.
+-- see 3.6 Variable Weighting, UTS #10.
 
 Makes the entry in the table completely ignorable;
 i.e. as if the weights were zero at all level.
@@ -1322,6 +1331,46 @@ and 'shift-trimmed'), the level 4 may be unreliable.
 
 See also C<identical>.
 
+=item long_contraction
+
+-- see 3.8.2 Well-Formedness of the DUCET, 4.2 Produce Array, UTS #10.
+
+If the parameter is made true, for a contraction with three or more
+characters (here nicknamed "long contraction"), initial substrings
+will be handled.
+For example, a contraction ABC, where A is a starter, and B and C
+are non-starters (character with non-zero combining character class),
+will be detected even if there is not AB as a contraction.
+
+B<Default:> Usually false.
+If C<UCA_Version> is 22 or 24, and the value of C<long_contraction>
+is not specified in C<new()>, a true value is set implicitly.
+This is a workaround to pass Conformance Tests for Unicode 6.0.0 and 6.1.0.
+
+C<change()> handles C<long_contraction> explicitly only.
+If C<long_contraction> is not specified in C<change()>, even though
+C<UCA_Version> is changed, C<long_contraction> will not be changed.
+
+B<Limitation:> Scanning non-starters is one-way (no back tracking).
+If AB is found but not ABC is not found, other long contraction where
+the first character is A and the second is not B may not be found.
+
+Under C<(normalization =E<gt> undef)>, detection step of discontiguous
+contractions are skipped.
+
+B<Note:> The following contractions in DUCET are not considered
+in steps S2.1.1 to S2.1.3, where they are discontiguous.
+
+    0FB2 0F71 0F80 (TIBETAN VOWEL SIGN VOCALIC RR)
+    0FB3 0F71 0F80 (TIBETAN VOWEL SIGN VOCALIC LL)
+
+For example C<TIBETAN VOWEL SIGN VOCALIC RR> with C<COMBINING TILDE OVERLAY>
+(C<U+0344>) is C<0FB2 0344 0F71 0F80> in NFD.
+In this case C<0FB2 0F80> (C<TIBETAN VOWEL SIGN VOCALIC R>) is detected,
+instead of C<0FB2 0F71 0F80>.
+Inserted C<0344> makes C<0FB2 0F71 0F80> discontiguous and lack of
+contraction C<0FB2 0F71> prohibits C<0FB2 0F71 0F80> from being detected.
+
 =item minimalFFFE
 
 -- see 5.14 Collation Elements, UTS #35.
@@ -1615,7 +1664,7 @@ B<NOTE>: Contractions via C<entry> are not be suppressed.
 
 =item table
 
--- see 3.6 Default Unicode Collation Element Table, UTS #10.
+-- see 3.8 Default Unicode Collation Element Table, UTS #10.
 
 You can use another collation element table if desired.
 
@@ -1694,7 +1743,7 @@ this parameter doesn't work validly.
 
 =item variable
 
--- see 3.6.2 Variable Weighting, UTS #10.
+-- see 3.6 Variable Weighting, UTS #10.
 
 This key allows for variable weighting of variable collation elements,
 which are marked with an ASTERISK in the table
@@ -2029,7 +2078,7 @@ B<Unicode::Normalize is required to try The Conformance Test.>
 =head1 AUTHOR, COPYRIGHT AND LICENSE
 
 The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki,
-<SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2013,
+<SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2014,
 SADAHIRO Tomoyuki. Japan. All rights reserved.
 
 This module is free software; you can redistribute it and/or
diff --git a/cpan/Unicode-Collate/Collate.xs b/cpan/Unicode-Collate/Collate.xs
index af62d0b295..ed0074b7c9 100644
--- a/cpan/Unicode-Collate/Collate.xs
+++ b/cpan/Unicode-Collate/Collate.xs
@@ -11,9 +11,9 @@
 #include "ucatbl.h"
 
 /* Perl 5.6.1 ? */
-#ifndef utf8n_to_uvuni
+#ifdef utf8_to_uv
 #define utf8n_to_uvuni  utf8_to_uv
-#endif /* utf8n_to_uvuni */
+#endif /* utf8_to_uv */
 
 /* UTF8_ALLOW_BOM is used before Perl 5.8.0 */
 #ifndef UTF8_ALLOW_BOM
@@ -590,36 +590,28 @@ varCE (self, vce)
     /* variable: checked only the first char and the length,
        trusting checkCollator() and %VariableOK in Perl ... */
 
-    if (vlen < VCE_Length /* ignore short VCE (unexpected) */
-	||
-	*a == 'n') /* non-ignorable */
-	1;
-    else if (*v) {
-	if (*a == 's') { /* shifted or shift-trimmed */
-	    d[7] = d[1]; /* wt level 1 to 4 */
-	    d[8] = d[2];
-	} /* else blanked */
-
-	d[1] = d[2] = d[3] = d[4] = d[5] = d[6] = '\0';
-    }
-    else if (*a == 'b') /* blanked */
-	1;
-    else if (*a == 's') { /* shifted or shift-trimmed */
-	totwt = d[1] + d[2] + d[3] + d[4] + d[5] + d[6];
-	if (alen == 7 && totwt != 0) { /* shifted */
-	    if (d[1] == 0 && d[2] == 1) { /* XXX: CollationAuxiliary-6.2.0 */
+    if (vlen >= VCE_Length && *a != 'n') {
+	if (*v) {
+	    if (*a == 's') { /* shifted or shift-trimmed */
 		d[7] = d[1]; /* wt level 1 to 4 */
 		d[8] = d[2];
-	    } else {
-		d[7] = (U8)(Shift4Wt >> 8);
-		d[8] = (U8)(Shift4Wt & 0xFF);
+	    } /* else blanked */
+	    d[1] = d[2] = d[3] = d[4] = d[5] = d[6] = '\0';
+	} else if (*a == 's') { /* shifted or shift-trimmed */
+	    totwt = d[1] + d[2] + d[3] + d[4] + d[5] + d[6];
+	    if (alen == 7 && totwt != 0) { /* shifted */
+		if (d[1] == 0 && d[2] == 1) { /* XXX: CollationAuxiliary-6.2.0 */
+		    d[7] = d[1]; /* wt level 1 to 4 */
+		    d[8] = d[2];
+		} else {
+		    d[7] = (U8)(Shift4Wt >> 8);
+		    d[8] = (U8)(Shift4Wt & 0xFF);
+		}
+	    } else { /* shift-trimmed or completely ignorable */
+		d[7] = d[8] = '\0';
 	    }
-	} else { /* shift-trimmed or completely ignorable */
-	    d[7] = d[8] = '\0';
-	}
-    }
-    else
-	croak("unknown variable value '%s'", a);
+	} /* else blanked */
+    } /* else non-ignorable */
     RETVAL = dst;
 OUTPUT:
     RETVAL
diff --git a/cpan/Unicode-Collate/t/contract.t b/cpan/Unicode-Collate/t/contract.t
index d659562fb0..cec3c8017e 100644
--- a/cpan/Unicode-Collate/t/contract.t
+++ b/cpan/Unicode-Collate/t/contract.t
@@ -16,7 +16,7 @@ BEGIN {
 
 use strict;
 use warnings;
-BEGIN { $| = 1; print "1..74\n"; }
+BEGIN { $| = 1; print "1..108\n"; }
 my $count = 0;
 sub ok ($;$) {
     my $p = my $r = shift;
@@ -179,7 +179,9 @@ ok($kjeSup->eq("\x{40C}", "\x{41A}\x{301}"));
 # 44
 
 our $tibetanEntry = <<'ENTRIES';
-0000  ; [.0000.0000.0000.0000] # [0000] NULL (in 6429)
+0000           ; [.0000.0000.0000.0000] # [0000] NULL (in 6429)
+0FB2           ; [.205B.0020.0002.0FB2] # TIBETAN SUBJOINED LETTER RA
+0FB3           ; [.205E.0020.0002.0FB3] # TIBETAN SUBJOINED LETTER LA
 0F71           ; [.206D.0020.0002.0F71] # TIBETAN VOWEL SIGN AA
 0F72           ; [.206E.0020.0002.0F72] # TIBETAN VOWEL SIGN I
 0F73           ; [.206F.0020.0002.0F73] # TIBETAN VOWEL SIGN II
@@ -215,6 +217,7 @@ if (!$@) {
     my $tibNFD = Unicode::Collate->new(
 	table => undef,
 	entry => $tibetanEntry,
+	UCA_Version => 24,
     );
 
     # VOCALIC RR
@@ -251,8 +254,43 @@ if (!$@) {
     ok($tibNFD->eq("\x{F79}\0\x{334}", "\x{FB3}\x{F80}\x{F71}\0\x{334}"));
 # 72
 
+    my $a1 = "\x{FB2}\x{334}\x{F81}";
+    my $b1 = "\x{F77}\0\x{334}";
+    my $a2 = "\x{FB2}\x{334}\x{F81}";
+    my $b2 = "\x{FB2}\x{F80}\0\x{334}\x{F71}";
+
+    for my $v (qw/20 22 24 26 28/) {
+	my $tib = Unicode::Collate->new(
+	    table => undef,
+	    entry => $tibetanEntry,
+	    UCA_Version => $v,
+	);
+	my $long = 22 <= $v && $v <= 24;
+	ok($tib->cmp($a1, $b1), $long ? 0 : -1);
+	ok($tib->cmp($a2, $b2), $long ? 1 : 0);
+
+	$tib->change(long_contraction => 0);
+	ok($tib->cmp($a1, $b1), -1);
+	ok($tib->cmp($a2, $b2),  0);
+
+	$tib->change(long_contraction => 1);
+	ok($tib->cmp($a1, $b1), 0);
+	ok($tib->cmp($a2, $b2), 1);
+    }
+# 102
+
+    # UCA_Version => 22
+    ok($tibNFD->cmp($a1, $b1), 0);
+    ok($tibNFD->cmp($a2, $b2), 1);
+
+    $tibNFD->change(UCA_Version => 26); # not affect long_contraction
+    ok($tibNFD->cmp($a1, $b1), 0);
+    ok($tibNFD->cmp($a2, $b2), 1);
+# 106
+
     my $discontNFD = Unicode::Collate->new(
 	table => undef,
+	UCA_Version => 22,
 	entry => <<'ENTRIES',
 0000  ; [.0000.0000.0000.0000] # [0000] NULL (in 6429)
 0301  ; [.0000.0032.0002.0301] # COMBINING ACUTE ACCENT
@@ -268,6 +306,6 @@ ENTRIES
     ok($discontNFD->eq("A\x{327}\x{301}\0\x{334}", "A\x{334}\x{327}\x{301}"));
     ok($discontNFD->eq("A\x{300}\0\x{327}",        "A\x{327}\x{300}"));
 } else {
-    ok(1) for 1..30;
+    ok(1) for 1..64;
 }
-# 74
+# 108
author	Steve Hay <steve.m.hay@googlemail.com>	2014-05-28 12:37:53 +0100
committer	Steve Hay <steve.m.hay@googlemail.com>	2014-05-28 16:58:07 +0100
commit	f8187d97b5f7e833d712cb8220883e02b4c2bfa1 (patch)
tree	0318c495f96f63934a5cb2b3f6a4f15cd36cd226 /cpan/Unicode-Collate
parent	60d2b2aebfceab38ad669bac181bb68e42c05cb2 (diff)
download	perl-f8187d97b5f7e833d712cb8220883e02b4c2bfa1.tar.gz