diff options
Diffstat (limited to 'cpan/Unicode-Collate/Collate.pm')
-rw-r--r-- | cpan/Unicode-Collate/Collate.pm | 140 |
1 files changed, 91 insertions, 49 deletions
diff --git a/cpan/Unicode-Collate/Collate.pm b/cpan/Unicode-Collate/Collate.pm index 5964f83511..9e1623cf4e 100644 --- a/cpan/Unicode-Collate/Collate.pm +++ b/cpan/Unicode-Collate/Collate.pm @@ -14,7 +14,7 @@ use File::Spec; no warnings 'utf8'; -our $VERSION = '0.91'; +our $VERSION = '0.92'; our $PACKAGE = __PACKAGE__; ### begin XS only ### @@ -48,16 +48,14 @@ use constant Min3Wt => 0x02; use constant Shift4Wt => 0xFFFF; # A boolean for Variable and 16-bit weights at 4 levels of Collation Element -# PROBLEM: The Default Unicode Collation Element Table -# has weights over 0xFFFF at the 4th level. -# The tie-breaking in the variable weights -# other than "shift" (as well as "shift-trimmed") is unreliable. use constant VCE_TEMPLATE => 'Cn4'; # A sort key: 16-bit weights -# See also the PROBLEM on VCE_TEMPLATE above. use constant KEY_TEMPLATE => 'n*'; +# The tie-breaking: 32-bit weights +use constant TIE_TEMPLATE => 'N*'; + # Level separator in a sort key: # i.e. pack(KEY_TEMPLATE, 0) use constant LEVEL_SEP => "\0\0"; @@ -105,7 +103,7 @@ our @ChangeOK = qw/ alternate backwards level normalization rearrange katakana_before_hiragana upper_before_lower ignore_level2 overrideHangul overrideCJK preprocess UCA_Version - hangul_terminator variable + hangul_terminator variable identical /; our @ChangeNG = qw/ @@ -135,18 +133,18 @@ sub change { my $self = shift; my %hash = @_; my %old; - if (exists $hash{variable} && exists $hash{alternate}) { - delete $hash{alternate}; - } - elsif (!exists $hash{variable} && exists $hash{alternate}) { - $hash{variable} = $hash{alternate}; + if (exists $hash{alternate}) { + if (exists $hash{variable}) { + delete $hash{alternate}; + } else { + $hash{variable} = $hash{alternate}; + } } foreach my $k (keys %hash) { if (exists $ChangeOK{$k}) { $old{$k} = $self->{$k}; $self->{$k} = $hash{$k}; - } - elsif (exists $ChangeNG{$k}) { + } elsif (exists $ChangeNG{$k}) { croak "change of $k via change() is not allowed!"; } # else => ignored @@ -176,6 +174,7 @@ my %DerivCode = ( 20 => \&_derivCE_20, 22 => \&_derivCE_22, 24 => \&_derivCE_24, + 26 => \&_derivCE_24, # 26 == 24 ); sub checkCollator { @@ -193,12 +192,10 @@ sub checkCollator { if (! defined $self->{backwards}) { $self->{backwardsFlag} = 0; - } - elsif (! ref $self->{backwards}) { + } elsif (! ref $self->{backwards}) { _checkLevel($self->{backwards}, "backwards"); $self->{backwardsFlag} = 1 << $self->{backwards}; - } - else { + } else { my %level; $self->{backwardsFlag} = 0; for my $b (@{ $self->{backwards} }) { @@ -443,21 +440,33 @@ sub parseEntry sub viewSortKey { my $self = shift; - $self->visualizeSortKey($self->getSortKey(@_)); + my $str = shift; + $self->visualizeSortKey($self->getSortKey($str)); } +sub process +{ + my $self = shift; + my $str = shift; + my $prep = $self->{preprocess}; + my $norm = $self->{normCode}; + + $str = &$prep($str) if ref $prep; + $str = &$norm($str) if ref $norm; + return $str; +} + ## ## arrayref of JCPS = splitEnt(string to be collated) -## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, true) +## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, TRUE) ## sub splitEnt { my $self = shift; - my $wLen = $_[1]; + my $str = shift; + my $wLen = shift; # with Length - my $code = $self->{preprocess}; - my $norm = $self->{normCode}; my $map = $self->{mapping}; my $max = $self->{maxlength}; my $reH = $self->{rearrangeHash}; @@ -465,20 +474,7 @@ sub splitEnt my $ver9 = $vers >= 9 && $vers <= 11; my $uXS = $self->{__useXS}; ### XS only - my ($str, @buf); - - if ($wLen) { - $code and croak "Preprocess breaks character positions. " - . "Don't use with index(), match(), etc."; - $norm and croak "Normalization breaks character positions. " - . "Don't use with index(), match(), etc."; - $str = $_[0]; - } - else { - $str = $_[0]; - $str = &$code($str) if ref $code; - $str = &$norm($str) if ref $norm; - } + my @buf; # get array of Unicode code point of string. my @src = unpack_U($str); @@ -696,9 +692,13 @@ sub getWt sub getSortKey { my $self = shift; - my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS + my $orig = shift; + my $str = $self->process($orig); + my $rEnt = $self->splitEnt($str); # get an arrayref of JCPS my $vers = $self->{UCA_Version}; my $term = $self->{hangul_terminator}; + my $lev = $self->{level}; + my $iden = $self->{identical}; my @buf; # weight arrays if ($term) { @@ -723,7 +723,13 @@ sub getSortKey } } - return $self->mk_SortKey(\@buf); ### XS only + my $rkey = $self->mk_SortKey(\@buf); ### XS only + + if ($iden || $vers >= 26 && $lev == MaxLevel) { + $rkey .= LEVEL_SEP; + $rkey .= pack(TIE_TEMPLATE, unpack_U($str)) if $iden; + } + return $rkey; } @@ -798,9 +804,15 @@ sub _eqArray($$$) sub index { my $self = shift; + $self->{preprocess} and + croak "Don't use Preprocess with index(), match(), etc."; + $self->{normCode} and + croak "Don't use Normalization with index(), match(), etc."; + my $str = shift; my $len = length($str); - my $subE = $self->splitEnt(shift); + my $sub = shift; + my $subE = $self->splitEnt($sub); my $pos = @_ ? shift : 0; $pos = 0 if $pos < 0; my $glob = shift; @@ -1034,6 +1046,7 @@ with no parameters, the collator should do the default collation. backwards => $levelNumber, # or \@levelNumbers entry => $element, hangul_terminator => $term_primary_weight, + identical => $bool, ignoreName => qr/$ignoreName/, ignoreChar => qr/$ignoreChar/, ignore_level2 => $bool, @@ -1074,6 +1087,7 @@ The following revisions are supported. The default is 24. 20 5.2.0 5.2.0 (5.2.0) 22 6.0.0 6.0.0 (6.0.0) 24 6.1.0 6.1.0 (6.1.0) + 26 6.2.0 6.2.0 (6.2.0) * Noncharacters (e.g. U+FFFF) are not ignored, and can be overridden since C<UCA_Version> 22. @@ -1099,7 +1113,7 @@ as an alias for C<variable>. =item backwards --- see 3.1.2 French Accents, UTS #10. +-- see 3.4 Backward Accents, UTS #10. backwards => $levelNumber or \@levelNumbers @@ -1109,7 +1123,7 @@ forwards at all the levels. =item entry --- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10. +-- see 5 Tailoring; 3.6.1 File Format, UTS #10. If the same character (or a sequence of characters) exists in the collation element table through C<table>, @@ -1183,11 +1197,27 @@ automatically terminated with a terminator primary weight. These characters may need terminator included in a collation element table beforehand. +=item identical + +-- see A.3 Deterministic Comparison, UTS #10. + +By default, strings whose weights are equal should be equal, +even though their code points are not equal. + +If the parameter is made true, a final, tie-breaking level is used. +If no difference of weights is found after the comparison through all +the level (independent of the value of C<level>), the comparison with +code points will be performed. For the tie-breaking comparision, +the sort key has code points of the original string appended. + +If C<preprocess> and/or C<normalization> is applied, the code points +of the string after them (in NFD by default) are used. + =item ignoreChar =item ignoreName --- see 3.2.2 Variable Weighting, UTS #10. +-- see 3.6.2 Variable Weighting, UTS #10. Makes the entry in the table completely ignorable; i.e. as if the weights were zero at all level. @@ -1214,7 +1244,7 @@ B<NOTE>: C<level> should be 3 or greater. =item katakana_before_hiragana --- see 7.3.1 Tertiary Weight Table, UTS #10. +-- see 7.2 Tertiary Weight Table, UTS #10. By default, hiragana is before katakana. If the parameter is made true, this is reversed. @@ -1241,6 +1271,13 @@ Any higher levels than the specified one are ignored. If omitted, the maximum is the 4th. +B<NOTE:> The DUCET includes weights over 0xFFFF at the 4th level. +But this module only uses weights within 0xFFFF. +When C<variable> is 'blanked' or 'non-ignorable' (other than 'shifted' +and 'shift-trimmed'), the level 4 may be unreliable. + +See also C<identical>. + =item normalization -- see 4.1 Normalize, UTS #10. @@ -1295,7 +1332,7 @@ those in the CJK Unified Ideographs Extension A etc. U+4E00..U+9FBB if UCA_Version is 14 or 16. U+4E00..U+9FC3 if UCA_Version is 18. U+4E00..U+9FCB if UCA_Version is 20 or 22. - U+4E00..U+9FCC if UCA_Version is 24. + U+4E00..U+9FCC if UCA_Version is 24 or 26. In the CJK Unified Ideographs Extension blocks: Ext.A (U+3400..U+4DB5) and Ext.B (U+20000..U+2A6D6) in any UCA_Version. @@ -1373,7 +1410,7 @@ in C<table> or C<entry> is still valid. =item preprocess --- see 5.1 Preprocessing, UTS #10. +-- see 5.4 Preprocessing, UTS #10. If specified, the coderef is used to preprocess each string before the formation of sort keys. @@ -1402,7 +1439,7 @@ L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>. =item rearrange --- see 3.1.3 Rearrangement, UTS #10. +-- see 3.5 Rearrangement, UTS #10. Characters that are not coded in logical order and to be rearranged. If C<UCA_Version> is equal to or lesser than 11, default is: @@ -1458,7 +1495,7 @@ B<NOTE>: Contractions via C<entry> are not be suppressed. =item table --- see 3.2 Default Unicode Collation Element Table, UTS #10. +-- see 3.6 Default Unicode Collation Element Table, UTS #10. You can use another collation element table if desired. @@ -1537,7 +1574,7 @@ this parameter doesn't work validly. =item variable --- see 3.2.2 Variable Weighting, UTS #10. +-- see 3.6.2 Variable Weighting, UTS #10. This key allows for variable weighting of variable collation elements, which are marked with an ASTERISK in the table @@ -1861,6 +1898,11 @@ a collator via C<Unicode::Collate-E<gt>new( )> should be used; for F<CollationTest_NON_IGNORABLE.txt>, a collator via C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>. +If C<UCA_Version> is 26 or later, the C<identical> level is preferred; +C<Unicode::Collate-E<gt>new(identical =E<gt> 1)> and +C<Unicode::Collate-E<gt>new(identical =E<gt> 1,> +C<variable =E<gt> "non-ignorable", level =E<gt> 3)> should be used. + B<Unicode::Normalize is required to try The Conformance Test.> =back |