diff options
author | Steve Hay <steve.m.hay@googlemail.com> | 2014-05-28 12:37:53 +0100 |
---|---|---|
committer | Steve Hay <steve.m.hay@googlemail.com> | 2014-05-28 16:58:07 +0100 |
commit | f8187d97b5f7e833d712cb8220883e02b4c2bfa1 (patch) | |
tree | 0318c495f96f63934a5cb2b3f6a4f15cd36cd226 /cpan/Unicode-Collate | |
parent | 60d2b2aebfceab38ad669bac181bb68e42c05cb2 (diff) | |
download | perl-f8187d97b5f7e833d712cb8220883e02b4c2bfa1.tar.gz |
Upgrade Unicode::Collate from version 1.04 to 1.07
Diffstat (limited to 'cpan/Unicode-Collate')
-rw-r--r-- | cpan/Unicode-Collate/Collate.pm | 89 | ||||
-rw-r--r-- | cpan/Unicode-Collate/Collate.xs | 50 | ||||
-rw-r--r-- | cpan/Unicode-Collate/t/contract.t | 46 |
3 files changed, 132 insertions, 53 deletions
diff --git a/cpan/Unicode-Collate/Collate.pm b/cpan/Unicode-Collate/Collate.pm index 0fd2951726..a67cbdff3b 100644 --- a/cpan/Unicode-Collate/Collate.pm +++ b/cpan/Unicode-Collate/Collate.pm @@ -17,7 +17,7 @@ use File::Spec; no warnings 'utf8'; -our $VERSION = '1.04'; +our $VERSION = '1.07'; our $PACKAGE = __PACKAGE__; ### begin XS only ### @@ -117,6 +117,7 @@ our @ChangeOK = qw/ katakana_before_hiragana upper_before_lower ignore_level2 overrideCJK overrideHangul overrideOut preprocess UCA_Version hangul_terminator variable identical highestFFFF minimalFFFE + long_contraction /; our @ChangeNG = qw/ @@ -285,6 +286,7 @@ sub new } } + # only in new(), not in change() $self->{level} ||= MaxLevel; $self->{UCA_Version} ||= UCA_Version(); @@ -299,7 +301,10 @@ sub new if ! exists $self->{rearrange}; $self->{backwards} = $self->{backwardsTable} if ! exists $self->{backwards}; + exists $self->{long_contraction} or $self->{long_contraction} + = 22 <= $self->{UCA_Version} && $self->{UCA_Version} <= 24; + # checkCollator() will be called in change() $self->checkCollator(); return $self; @@ -441,12 +446,10 @@ sub parseEntry $self->{maxlength}{$uv[0]} = @uv; } } - if (@uv > 2) { - while (@uv) { - pop @uv; - my $fake_entry = join(CODE_SEP, @uv); # in JCPS - $self->{contraction}{$fake_entry} = 1; - } + while (@uv > 2) { + pop @uv; + my $fake_entry = join(CODE_SEP, @uv); # in JCPS + $self->{contraction}{$fake_entry} = 1; } } @@ -486,6 +489,7 @@ sub splitEnt my $reH = $self->{rearrangeHash}; my $vers = $self->{UCA_Version}; my $ver9 = $vers >= 9 && $vers <= 11; + my $long = $self->{long_contraction}; my $uXS = $self->{__useXS}; ### XS only my @buf; @@ -566,6 +570,15 @@ sub splitEnt last unless $curCC; my $tail = CODE_SEP . $src[$p]; + if ($preCC != $curCC && $map->{$jcps.$tail}) { + $jcps .= $tail; + push @out, $p; + } else { + $preCC = $curCC; + } + + next if !$long; + if ($preCC_uc != $curCC && ($map->{$jcps_uc.$tail} || $cont->{$jcps_uc.$tail})) { $jcps_uc .= $tail; @@ -573,16 +586,9 @@ sub splitEnt } else { $preCC_uc = $curCC; } - - if ($preCC != $curCC && $map->{$jcps.$tail}) { - $jcps .= $tail; - push @out, $p; - } else { - $preCC = $curCC; - } } - if ($map->{$jcps_uc}) { + if (@out_uc && $map->{$jcps_uc}) { $jcps = $jcps_uc; $src[$_] = undef for @out_uc; } else { @@ -1068,6 +1074,7 @@ with no parameters, the collator should do the default collation. ignore_level2 => $bool, katakana_before_hiragana => $bool, level => $collationLevel, + long_contraction => $bool, minimalFFFE => $bool, normalization => $normalization_form, overrideCJK => \&overrideCJK, @@ -1107,6 +1114,8 @@ The following revisions are supported. The default is 28. 26 6.2.0 6.2.0 (6.2.0) 28 6.3.0 6.3.0 (6.3.0) +* See below C<long_contraction> with C<UCA_Version> 22 and 24. + * Noncharacters (e.g. U+FFFF) are not ignored, and can be overridden since C<UCA_Version> 22. @@ -1144,7 +1153,7 @@ forwards at all the levels. =item entry --- see 5 Tailoring; 3.6.1 File Format, UTS #10. +-- see 5 Tailoring; 9.1 Allkeys File Format, UTS #10. If the same character (or a sequence of characters) exists in the collation element table through C<table>, @@ -1261,7 +1270,7 @@ of the string after them (in NFD by default) are used. =item ignoreName --- see 3.6.2 Variable Weighting, UTS #10. +-- see 3.6 Variable Weighting, UTS #10. Makes the entry in the table completely ignorable; i.e. as if the weights were zero at all level. @@ -1322,6 +1331,46 @@ and 'shift-trimmed'), the level 4 may be unreliable. See also C<identical>. +=item long_contraction + +-- see 3.8.2 Well-Formedness of the DUCET, 4.2 Produce Array, UTS #10. + +If the parameter is made true, for a contraction with three or more +characters (here nicknamed "long contraction"), initial substrings +will be handled. +For example, a contraction ABC, where A is a starter, and B and C +are non-starters (character with non-zero combining character class), +will be detected even if there is not AB as a contraction. + +B<Default:> Usually false. +If C<UCA_Version> is 22 or 24, and the value of C<long_contraction> +is not specified in C<new()>, a true value is set implicitly. +This is a workaround to pass Conformance Tests for Unicode 6.0.0 and 6.1.0. + +C<change()> handles C<long_contraction> explicitly only. +If C<long_contraction> is not specified in C<change()>, even though +C<UCA_Version> is changed, C<long_contraction> will not be changed. + +B<Limitation:> Scanning non-starters is one-way (no back tracking). +If AB is found but not ABC is not found, other long contraction where +the first character is A and the second is not B may not be found. + +Under C<(normalization =E<gt> undef)>, detection step of discontiguous +contractions are skipped. + +B<Note:> The following contractions in DUCET are not considered +in steps S2.1.1 to S2.1.3, where they are discontiguous. + + 0FB2 0F71 0F80 (TIBETAN VOWEL SIGN VOCALIC RR) + 0FB3 0F71 0F80 (TIBETAN VOWEL SIGN VOCALIC LL) + +For example C<TIBETAN VOWEL SIGN VOCALIC RR> with C<COMBINING TILDE OVERLAY> +(C<U+0344>) is C<0FB2 0344 0F71 0F80> in NFD. +In this case C<0FB2 0F80> (C<TIBETAN VOWEL SIGN VOCALIC R>) is detected, +instead of C<0FB2 0F71 0F80>. +Inserted C<0344> makes C<0FB2 0F71 0F80> discontiguous and lack of +contraction C<0FB2 0F71> prohibits C<0FB2 0F71 0F80> from being detected. + =item minimalFFFE -- see 5.14 Collation Elements, UTS #35. @@ -1615,7 +1664,7 @@ B<NOTE>: Contractions via C<entry> are not be suppressed. =item table --- see 3.6 Default Unicode Collation Element Table, UTS #10. +-- see 3.8 Default Unicode Collation Element Table, UTS #10. You can use another collation element table if desired. @@ -1694,7 +1743,7 @@ this parameter doesn't work validly. =item variable --- see 3.6.2 Variable Weighting, UTS #10. +-- see 3.6 Variable Weighting, UTS #10. This key allows for variable weighting of variable collation elements, which are marked with an ASTERISK in the table @@ -2029,7 +2078,7 @@ B<Unicode::Normalize is required to try The Conformance Test.> =head1 AUTHOR, COPYRIGHT AND LICENSE The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki, -<SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2013, +<SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2014, SADAHIRO Tomoyuki. Japan. All rights reserved. This module is free software; you can redistribute it and/or diff --git a/cpan/Unicode-Collate/Collate.xs b/cpan/Unicode-Collate/Collate.xs index af62d0b295..ed0074b7c9 100644 --- a/cpan/Unicode-Collate/Collate.xs +++ b/cpan/Unicode-Collate/Collate.xs @@ -11,9 +11,9 @@ #include "ucatbl.h" /* Perl 5.6.1 ? */ -#ifndef utf8n_to_uvuni +#ifdef utf8_to_uv #define utf8n_to_uvuni utf8_to_uv -#endif /* utf8n_to_uvuni */ +#endif /* utf8_to_uv */ /* UTF8_ALLOW_BOM is used before Perl 5.8.0 */ #ifndef UTF8_ALLOW_BOM @@ -590,36 +590,28 @@ varCE (self, vce) /* variable: checked only the first char and the length, trusting checkCollator() and %VariableOK in Perl ... */ - if (vlen < VCE_Length /* ignore short VCE (unexpected) */ - || - *a == 'n') /* non-ignorable */ - 1; - else if (*v) { - if (*a == 's') { /* shifted or shift-trimmed */ - d[7] = d[1]; /* wt level 1 to 4 */ - d[8] = d[2]; - } /* else blanked */ - - d[1] = d[2] = d[3] = d[4] = d[5] = d[6] = '\0'; - } - else if (*a == 'b') /* blanked */ - 1; - else if (*a == 's') { /* shifted or shift-trimmed */ - totwt = d[1] + d[2] + d[3] + d[4] + d[5] + d[6]; - if (alen == 7 && totwt != 0) { /* shifted */ - if (d[1] == 0 && d[2] == 1) { /* XXX: CollationAuxiliary-6.2.0 */ + if (vlen >= VCE_Length && *a != 'n') { + if (*v) { + if (*a == 's') { /* shifted or shift-trimmed */ d[7] = d[1]; /* wt level 1 to 4 */ d[8] = d[2]; - } else { - d[7] = (U8)(Shift4Wt >> 8); - d[8] = (U8)(Shift4Wt & 0xFF); + } /* else blanked */ + d[1] = d[2] = d[3] = d[4] = d[5] = d[6] = '\0'; + } else if (*a == 's') { /* shifted or shift-trimmed */ + totwt = d[1] + d[2] + d[3] + d[4] + d[5] + d[6]; + if (alen == 7 && totwt != 0) { /* shifted */ + if (d[1] == 0 && d[2] == 1) { /* XXX: CollationAuxiliary-6.2.0 */ + d[7] = d[1]; /* wt level 1 to 4 */ + d[8] = d[2]; + } else { + d[7] = (U8)(Shift4Wt >> 8); + d[8] = (U8)(Shift4Wt & 0xFF); + } + } else { /* shift-trimmed or completely ignorable */ + d[7] = d[8] = '\0'; } - } else { /* shift-trimmed or completely ignorable */ - d[7] = d[8] = '\0'; - } - } - else - croak("unknown variable value '%s'", a); + } /* else blanked */ + } /* else non-ignorable */ RETVAL = dst; OUTPUT: RETVAL diff --git a/cpan/Unicode-Collate/t/contract.t b/cpan/Unicode-Collate/t/contract.t index d659562fb0..cec3c8017e 100644 --- a/cpan/Unicode-Collate/t/contract.t +++ b/cpan/Unicode-Collate/t/contract.t @@ -16,7 +16,7 @@ BEGIN { use strict; use warnings; -BEGIN { $| = 1; print "1..74\n"; } +BEGIN { $| = 1; print "1..108\n"; } my $count = 0; sub ok ($;$) { my $p = my $r = shift; @@ -179,7 +179,9 @@ ok($kjeSup->eq("\x{40C}", "\x{41A}\x{301}")); # 44 our $tibetanEntry = <<'ENTRIES'; -0000 ; [.0000.0000.0000.0000] # [0000] NULL (in 6429) +0000 ; [.0000.0000.0000.0000] # [0000] NULL (in 6429) +0FB2 ; [.205B.0020.0002.0FB2] # TIBETAN SUBJOINED LETTER RA +0FB3 ; [.205E.0020.0002.0FB3] # TIBETAN SUBJOINED LETTER LA 0F71 ; [.206D.0020.0002.0F71] # TIBETAN VOWEL SIGN AA 0F72 ; [.206E.0020.0002.0F72] # TIBETAN VOWEL SIGN I 0F73 ; [.206F.0020.0002.0F73] # TIBETAN VOWEL SIGN II @@ -215,6 +217,7 @@ if (!$@) { my $tibNFD = Unicode::Collate->new( table => undef, entry => $tibetanEntry, + UCA_Version => 24, ); # VOCALIC RR @@ -251,8 +254,43 @@ if (!$@) { ok($tibNFD->eq("\x{F79}\0\x{334}", "\x{FB3}\x{F80}\x{F71}\0\x{334}")); # 72 + my $a1 = "\x{FB2}\x{334}\x{F81}"; + my $b1 = "\x{F77}\0\x{334}"; + my $a2 = "\x{FB2}\x{334}\x{F81}"; + my $b2 = "\x{FB2}\x{F80}\0\x{334}\x{F71}"; + + for my $v (qw/20 22 24 26 28/) { + my $tib = Unicode::Collate->new( + table => undef, + entry => $tibetanEntry, + UCA_Version => $v, + ); + my $long = 22 <= $v && $v <= 24; + ok($tib->cmp($a1, $b1), $long ? 0 : -1); + ok($tib->cmp($a2, $b2), $long ? 1 : 0); + + $tib->change(long_contraction => 0); + ok($tib->cmp($a1, $b1), -1); + ok($tib->cmp($a2, $b2), 0); + + $tib->change(long_contraction => 1); + ok($tib->cmp($a1, $b1), 0); + ok($tib->cmp($a2, $b2), 1); + } +# 102 + + # UCA_Version => 22 + ok($tibNFD->cmp($a1, $b1), 0); + ok($tibNFD->cmp($a2, $b2), 1); + + $tibNFD->change(UCA_Version => 26); # not affect long_contraction + ok($tibNFD->cmp($a1, $b1), 0); + ok($tibNFD->cmp($a2, $b2), 1); +# 106 + my $discontNFD = Unicode::Collate->new( table => undef, + UCA_Version => 22, entry => <<'ENTRIES', 0000 ; [.0000.0000.0000.0000] # [0000] NULL (in 6429) 0301 ; [.0000.0032.0002.0301] # COMBINING ACUTE ACCENT @@ -268,6 +306,6 @@ ENTRIES ok($discontNFD->eq("A\x{327}\x{301}\0\x{334}", "A\x{334}\x{327}\x{301}")); ok($discontNFD->eq("A\x{300}\0\x{327}", "A\x{327}\x{300}")); } else { - ok(1) for 1..30; + ok(1) for 1..64; } -# 74 +# 108 |