diff options
author | Nicholas Clark <nick@ccl4.org> | 2003-10-27 13:11:48 +0000 |
---|---|---|
committer | Nicholas Clark <nick@ccl4.org> | 2003-10-27 13:11:48 +0000 |
commit | 91ae00cbaa2c38b2a8123e4417004ca29a7c9bfc (patch) | |
tree | 819d7d0f114e48b9c739b3f94e9af3735c0c341f /lib/Unicode | |
parent | 0377e16d912288b7c21a9d90350476c453da3e44 (diff) | |
download | perl-91ae00cbaa2c38b2a8123e4417004ca29a7c9bfc.tar.gz |
Sync with Unicode::Collate 0.30
p4raw-id: //depot/perl@21549
Diffstat (limited to 'lib/Unicode')
-rw-r--r-- | lib/Unicode/Collate.pm | 523 | ||||
-rw-r--r-- | lib/Unicode/Collate/Changes | 24 | ||||
-rw-r--r-- | lib/Unicode/Collate/README | 2 | ||||
-rw-r--r-- | lib/Unicode/Collate/t/contract.t | 2 | ||||
-rw-r--r-- | lib/Unicode/Collate/t/hangtype.t | 56 | ||||
-rw-r--r-- | lib/Unicode/Collate/t/hangul.t | 45 | ||||
-rw-r--r-- | lib/Unicode/Collate/t/index.t | 2 | ||||
-rw-r--r-- | lib/Unicode/Collate/t/normal.t | 205 | ||||
-rw-r--r-- | lib/Unicode/Collate/t/test.t | 103 | ||||
-rw-r--r-- | lib/Unicode/Collate/t/trailwt.t | 229 | ||||
-rw-r--r-- | lib/Unicode/Collate/t/variable.t | 108 | ||||
-rw-r--r-- | lib/Unicode/Collate/t/version.t | 61 |
12 files changed, 1121 insertions, 239 deletions
diff --git a/lib/Unicode/Collate.pm b/lib/Unicode/Collate.pm index 18ed44626c..a4d6d80cd1 100644 --- a/lib/Unicode/Collate.pm +++ b/lib/Unicode/Collate.pm @@ -14,7 +14,7 @@ use File::Spec; require Exporter; -our $VERSION = '0.28'; +our $VERSION = '0.30'; our $PACKAGE = __PACKAGE__; our @ISA = qw(Exporter); @@ -26,25 +26,6 @@ our @EXPORT = (); (our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//; our $KeyFile = "allkeys.txt"; -our $UNICODE_VERSION; - -eval { require Unicode::UCD }; - -unless ($@) { - $UNICODE_VERSION = Unicode::UCD::UnicodeVersion(); -} -else { # Perl 5.6.1 - my($f, $fh); - foreach my $d (@INC) { - $f = File::Spec->catfile($d, "unicode", "Unicode.301"); - if (open($fh, $f)) { - $UNICODE_VERSION = '3.0.1'; - close $fh; - last; - } - } -} - # Perl's boolean use constant TRUE => 1; use constant FALSE => ""; @@ -101,13 +82,37 @@ use constant CODE_SEP => ';'; use constant NON_VAR => 0; # Non-Variable character use constant VAR => 1; # Variable character +# specific code points +use constant Hangul_LBase => 0x1100; +use constant Hangul_LIni => 0x1100; +use constant Hangul_LFin => 0x1159; +use constant Hangul_LFill => 0x115F; +use constant Hangul_VBase => 0x1161; +use constant Hangul_VIni => 0x1160; +use constant Hangul_VFin => 0x11A2; +use constant Hangul_TBase => 0x11A7; +use constant Hangul_TIni => 0x11A8; +use constant Hangul_TFin => 0x11F9; +use constant Hangul_TCount => 28; +use constant Hangul_NCount => 588; +use constant Hangul_SBase => 0xAC00; +use constant Hangul_SIni => 0xAC00; +use constant Hangul_SFin => 0xD7A3; +use constant CJK_UidIni => 0x4E00; +use constant CJK_UidFin => 0x9FA5; +use constant CJK_ExtAIni => 0x3400; +use constant CJK_ExtAFin => 0x4DB5; +use constant CJK_ExtBIni => 0x20000; +use constant CJK_ExtBFin => 0x2A6D6; +use constant BMP_Max => 0xFFFF; + # Logical_Order_Exception in PropList.txt # TODO: synchronization with change of PropList.txt. our $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ]; -sub UCA_Version { "9" } +sub UCA_Version { "11" } -sub Base_Unicode_Version { $UNICODE_VERSION || 'unknown' } +sub Base_Unicode_Version { "4.0" } ###### @@ -121,20 +126,21 @@ sub unpack_U { ###### -my (%AlternateOK); -@AlternateOK{ qw/ +my (%VariableOK); +@VariableOK{ qw/ blanked non-ignorable shifted shift-trimmed - / } = (); + / } = (); # keys lowercased our @ChangeOK = qw/ alternate backwards level normalization rearrange katakana_before_hiragana upper_before_lower overrideHangul overrideCJK preprocess UCA_Version + hangul_terminator variable /; our @ChangeNG = qw/ - entry entries table maxlength - ignoreChar ignoreName undefChar undefName + entry mapping table maxlength + ignoreChar ignoreName undefChar undefName variableTable versionTable alternateTable backwardsTable forwardsTable rearrangeTable derivCode normCode rearrangeHash L3_ignorable backwardsFlag @@ -142,6 +148,12 @@ our @ChangeNG = qw/ # The hash key 'ignored' is deleted at v 0.21. # The hash key 'isShift' is deleted at v 0.23. # The hash key 'combining' is deleted at v 0.24. +# The hash key 'entries' is deleted at v 0.30. + +sub version { + my $self = shift; + return $self->{versionTable} || 'unknown'; +} my (%ChangeOK, %ChangeNG); @ChangeOK{ @ChangeOK } = (); @@ -151,6 +163,12 @@ sub change { my $self = shift; my %hash = @_; my %old; + if (exists $hash{variable} && exists $hash{alternate}) { + delete $hash{alternate}; + } + elsif (!exists $hash{variable} && exists $hash{alternate}) { + $hash{variable} = $hash{alternate}; + } foreach my $k (keys %hash) { if (exists $ChangeOK{$k}) { $old{$k} = $self->{$k}; @@ -174,18 +192,24 @@ sub _checkLevel { $level, $key, MaxLevel if MaxLevel < $level; } +my %DerivCode = ( + 8 => \&_derivCE_8, + 9 => \&_derivCE_9, + 11 => \&_derivCE_9, # 11 == 9 +); + sub checkCollator { my $self = shift; _checkLevel($self->{level}, "level"); - $self->{derivCode} = - $self->{UCA_Version} == 8 ? \&_derivCE_8 : - $self->{UCA_Version} == 9 ? \&_derivCE_9 : - croak "Illegal UCA version (passed $self->{UCA_Version})."; + $self->{derivCode} = $DerivCode{ $self->{UCA_Version} } + or croak "Illegal UCA version (passed $self->{UCA_Version})."; - $self->{alternate} = lc($self->{alternate}); - croak "$PACKAGE unknown alternate tag name: $self->{alternate}" - unless exists $AlternateOK{ $self->{alternate} }; + $self->{variable} ||= $self->{alternate} || $self->{variableTable} || + $self->{alternateTable} || $self->{alternate} || 'shifted'; + $self->{variable} = $self->{alternate} = lc($self->{variable}); + exists $VariableOK{ $self->{variable} } + or croak "$PACKAGE unknown variable tag name: $self->{variable}"; if (! defined $self->{backwards}) { $self->{backwardsFlag} = 0; @@ -206,10 +230,9 @@ sub checkCollator { } } - $self->{rearrange} = [] - if ! defined $self->{rearrange}; - croak "$PACKAGE: A list for rearrangement must be store in an ARRAYREF" - if ! ref $self->{rearrange}; + defined $self->{rearrange} or $self->{rearrange} = []; + ref $self->{rearrange} + or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF"; # keys of $self->{rearrangeHash} are $self->{rearrange}. $self->{rearrangeHash} = undef; @@ -222,13 +245,14 @@ sub checkCollator { if (defined $self->{normalization}) { eval { require Unicode::Normalize }; - croak "Unicode/Normalize.pm is required to normalize strings: $@" - if $@; + $@ and croak "Unicode::Normalize is required to normalize strings"; - $CVgetCombinClass = \&Unicode::Normalize::getCombinClass - if ! $CVgetCombinClass; + $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass; - if ($self->{normalization} ne 'prenormalized') { + if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default + $self->{normCode} = \&Unicode::Normalize::NFD; + } + elsif ($self->{normalization} ne 'prenormalized') { my $norm = $self->{normalization}; $self->{normCode} = sub { Unicode::Normalize::normalize($norm, shift); @@ -262,8 +286,6 @@ sub new if ! exists $self->{overrideCJK}; $self->{normalization} = 'NFD' if ! exists $self->{normalization}; - $self->{alternate} = $self->{alternateTable} || 'shifted' - if ! exists $self->{alternate}; $self->{rearrange} = $self->{rearrangeTable} || $DefaultRearrange if ! exists $self->{rearrange}; $self->{backwards} = $self->{backwardsTable} @@ -288,7 +310,10 @@ sub read_table { if (/^\s*\@version\s*(\S*)/) { $self->{versionTable} ||= $1; } - elsif (/^\s*\@alternate\s+(\S*)/) { + elsif (/^\s*\@variable\s+(\S*)/) { # since UTS #10-9 + $self->{variableTable} ||= $1; + } + elsif (/^\s*\@alternate\s+(\S*)/) { # till UTS #10-8 $self->{alternateTable} ||= $1; } elsif (/^\s*\@backwards\s+(\S*)/) { @@ -364,35 +389,39 @@ sub parseEntry # if and only if "all" CEs are [.0000.0000.0000]. } - $self->{entries}{$entry} = \@key; - - $self->{L3_ignorable}{$uv[0]} = TRUE - if @uv == 1 && $is_L3_ignorable; + $self->{mapping}{$entry} = \@key; - # Contraction is to be considered in the range of this maxlength. - $self->{maxlength}{$uv[0]} = scalar @uv - if @uv > 1; + if (@uv > 1) { + (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv) + and $self->{maxlength}{$uv[0]} = @uv; + } + else { + $is_L3_ignorable + ? ($self->{L3_ignorable}{$uv[0]} = TRUE) + : ($self->{L3_ignorable}{$uv[0]} and + $self->{L3_ignorable}{$uv[0]} = FALSE); # &&= stores key. + } } ## -## arrayref[weights] = altCE(VCE) +## arrayref[weights] = varCE(VCE) ## -sub altCE +sub varCE { my $self = shift; my($var, @wt) = unpack(VCE_TEMPLATE, shift); - $self->{alternate} eq 'blanked' ? + $self->{variable} eq 'blanked' ? $var ? [Var1Wt, 0, 0, $wt[3]] : \@wt : - $self->{alternate} eq 'non-ignorable' ? + $self->{variable} eq 'non-ignorable' ? \@wt : - $self->{alternate} eq 'shifted' ? + $self->{variable} eq 'shifted' ? $var ? [Var1Wt, 0, 0, $wt[0] ] : [ @wt[0..2], $wt[0]+$wt[1]+$wt[2] ? Shift4Wt : 0 ] : - $self->{alternate} eq 'shift-trimmed' ? + $self->{variable} eq 'shift-trimmed' ? $var ? [Var1Wt, 0, 0, $wt[0] ] : [ @wt[0..2], 0 ] : - croak "$PACKAGE unknown alternate name: $self->{alternate}"; + croak "$PACKAGE unknown variable name: $self->{variable}"; } sub viewSortKey @@ -416,21 +445,21 @@ sub visualizeSortKey ## -## arrayref of JCPS = splitCE(string to be collated) -## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitCE(string, true) +## arrayref of JCPS = splitEnt(string to be collated) +## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, true) ## -sub splitCE +sub splitEnt { my $self = shift; my $wLen = $_[1]; my $code = $self->{preprocess}; my $norm = $self->{normCode}; - my $ent = $self->{entries}; + my $map = $self->{mapping}; my $max = $self->{maxlength}; my $reH = $self->{rearrangeHash}; my $ign = $self->{L3_ignorable}; - my $ver9 = $self->{UCA_Version} > 8; + my $ver9 = $self->{UCA_Version} >= 9; my ($str, @buf); @@ -473,26 +502,26 @@ sub splitCE next if _isNonCharacter($src[$i]); my $i_orig = $i; - my $ce = $src[$i]; + my $jcps = $src[$i]; - if ($max->{$ce}) { # contract - my $temp_ce = $ce; - my $ceLen = 1; - my $maxLen = $max->{$ce}; + if ($max->{$jcps}) { # contract + my $temp_jcps = $jcps; + my $jcpsLen = 1; + my $maxLen = $max->{$jcps}; - for (my $p = $i + 1; $ceLen < $maxLen && $p < @src; $p++) { + for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) { next if ! defined $src[$p]; - $temp_ce .= CODE_SEP . $src[$p]; - $ceLen++; - if ($ent->{$temp_ce}) { - $ce = $temp_ce; + $temp_jcps .= CODE_SEP . $src[$p]; + $jcpsLen++; + if ($map->{$temp_jcps}) { + $jcps = $temp_jcps; $i = $p; } } # not-contiguous contraction with Combining Char (cf. UTS#10, S2.1). # This process requires Unicode::Normalize. - # If "normalize" is undef, here should be skipped *always* + # If "normalization" is undef, here should be skipped *always* # (in spite of bool value of $CVgetCombinClass), # since canonical ordering cannot be expected. # Blocked combining character should not be contracted. @@ -508,8 +537,8 @@ sub splitCE $curCC = $CVgetCombinClass->($src[$p]); last unless $curCC; my $tail = CODE_SEP . $src[$p]; - if ($preCC != $curCC && $ent->{$ce.$tail}) { - $ce .= $tail; + if ($preCC != $curCC && $map->{$jcps.$tail}) { + $jcps .= $tail; $src[$p] = undef; } else { $preCC = $curCC; @@ -525,7 +554,7 @@ sub splitCE } } - push @buf, $wLen ? [$ce, $i_orig, $i + 1] : $ce; + push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps; } return \@buf; } @@ -537,18 +566,16 @@ sub splitCE sub getWt { my $self = shift; - my $ce = shift; - my $ent = $self->{entries}; + my $u = shift; + my $map = $self->{mapping}; my $der = $self->{derivCode}; - return if !defined $ce; - return map($self->altCE($_), @{ $ent->{$ce} }) - if $ent->{$ce}; - - # CE must not be a contraction, then it's a code point. - my $u = $ce; + return if !defined $u; + return map($self->varCE($_), @{ $map->{$u} }) + if $map->{$u}; - if (0xAC00 <= $u && $u <= 0xD7A3) { # is Hangul Syllale + # JCPS must not be a contraction, then it's a code point. + if (Hangul_SIni <= $u && $u <= Hangul_SFin) { my $hang = $self->{overrideHangul}; my @hangulCE; if ($hang) { @@ -563,45 +590,44 @@ sub getWt if (@decH == 2) { my $contract = join(CODE_SEP, @decH); - @decH = ($contract) if $ent->{$contract}; + @decH = ($contract) if $map->{$contract}; } else { # must be <@decH == 3> if ($max->{$decH[0]}) { my $contract = join(CODE_SEP, @decH); - if ($ent->{$contract}) { + if ($map->{$contract}) { @decH = ($contract); } else { $contract = join(CODE_SEP, @decH[0,1]); - $ent->{$contract} and @decH = ($contract, $decH[2]); + $map->{$contract} and @decH = ($contract, $decH[2]); } # even if V's ignorable, LT contraction is not supported. # If such a situatution were required, NFD should be used. } if (@decH == 3 && $max->{$decH[1]}) { my $contract = join(CODE_SEP, @decH[1,2]); - $ent->{$contract} and @decH = ($decH[0], $contract); + $map->{$contract} and @decH = ($decH[0], $contract); } } @hangulCE = map({ - $ent->{$_} ? @{ $ent->{$_} } : $der->($_); + $map->{$_} ? @{ $map->{$_} } : $der->($_); } @decH); } - return map $self->altCE($_), @hangulCE; + return map $self->varCE($_), @hangulCE; } - elsif (0x3400 <= $u && $u <= 0x4DB5 || - 0x4E00 <= $u && $u <= 0x9FA5 || - 0x20000 <= $u && $u <= 0x2A6D6) # CJK Ideograph - { + elsif (CJK_UidIni <= $u && $u <= CJK_UidFin || + CJK_ExtAIni <= $u && $u <= CJK_ExtAFin || + CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) { my $cjk = $self->{overrideCJK}; - return map $self->altCE($_), + return map $self->varCE($_), $cjk ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$cjk($u)) - : defined $cjk && $self->{UCA_Version} <= 8 && $u < 0x10000 + : defined $cjk && $self->{UCA_Version} <= 8 && $u <= BMP_Max ? pack(VCE_TEMPLATE, NON_VAR, $u, Min2Wt, Min3Wt, $u) : $der->($u); } else { - return map $self->altCE($_), $der->($u); + return map $self->varCE($_), $der->($u); } } @@ -613,14 +639,42 @@ sub getSortKey { my $self = shift; my $lev = $self->{level}; - my $rCE = $self->splitCE(shift); # get an arrayref of JCPS - my $ver9 = $self->{UCA_Version} > 8; - my $v2i = $self->{alternate} ne 'non-ignorable'; + my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS + my $ver9 = $self->{UCA_Version} >= 9; + my $v2i = $self->{variable} ne 'non-ignorable'; # weight arrays - my (@buf, $last_is_variable); + my (@wts, @buf, $last_is_variable); + + if ($self->{hangul_terminator}) { + my $preHST = ''; + foreach my $jcps (@$rEnt) { + # weird things like VL, TL-contraction are not considered! + my $curHST = ''; + foreach my $u (split /;/, $jcps) { + $curHST .= getHST($u); + } + if ($preHST && !$curHST || # hangul before non-hangul + $preHST =~ /L\z/ && $curHST =~ /^T/ || + $preHST =~ /V\z/ && $curHST =~ /^L/ || + $preHST =~ /T\z/ && $curHST =~ /^[LV]/) { - foreach my $wt (map $self->getWt($_), @$rCE) { + push @wts, $self->varCE_HangulTerm; + } + $preHST = $curHST; + + push @wts, $self->getWt($jcps); + } + $preHST # end at hangul + and push @wts, $self->varCE_HangulTerm; + } + else { + foreach my $jcps (@$rEnt) { + push @wts, $self->getWt($jcps); + } + } + + foreach my $wt (@wts) { if ($v2i && $ver9) { if ($wt->[0] == 0) { # ignorable next if $last_is_variable; @@ -694,9 +748,10 @@ sub sort { sub _derivCE_9 { my $u = shift; my $base = - (0x4E00 <= $u && $u <= 0x9FA5) + (CJK_UidIni <= $u && $u <= CJK_UidFin) ? 0xFB40 : # CJK - (0x3400 <= $u && $u <= 0x4DB5 || 0x20000 <= $u && $u <= 0x2A6D6) + (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin || + CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) ? 0xFB80 # CJK ext. : 0xFBC0; # others @@ -716,6 +771,14 @@ sub _derivCE_8 { pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code); } + +sub varCE_HangulTerm { + my $self = shift; + return $self->varCE(pack(VCE_TEMPLATE, + NON_VAR, $self->{hangul_terminator}, 0,0,0)); +} + + ## ## "hhhh hhhh hhhh" to (dddd, dddd, dddd) ## @@ -727,14 +790,14 @@ sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g } # sub _decompHangul { my $code = shift; - my $SIndex = $code - 0xAC00; - my $LIndex = int( $SIndex / 588); - my $VIndex = int(($SIndex % 588) / 28); - my $TIndex = $SIndex % 28; + my $SIndex = $code - Hangul_SBase; + my $LIndex = int( $SIndex / Hangul_NCount); + my $VIndex = int(($SIndex % Hangul_NCount) / Hangul_TCount); + my $TIndex = $SIndex % Hangul_TCount; return ( - 0x1100 + $LIndex, - 0x1161 + $VIndex, - $TIndex ? (0x11A7 + $TIndex) : (), + Hangul_LBase + $LIndex, + Hangul_VBase + $VIndex, + $TIndex ? (Hangul_TBase + $TIndex) : (), ); } @@ -748,6 +811,17 @@ sub _isNonCharacter { ; } +# Hangul Syllable Type +sub getHST { + my $u = shift; + return + Hangul_LIni <= $u && $u <= Hangul_LFin || $u == Hangul_LFill ? "L" : + Hangul_VIni <= $u && $u <= Hangul_VFin ? "V" : + Hangul_TIni <= $u && $u <= Hangul_TFin ? "T" : + Hangul_SIni <= $u && $u <= Hangul_SFin ? + ($u - Hangul_SBase) % Hangul_TCount ? "LVT" : "LV" : ""; +} + ## ## bool _nonIgnorAtLevel(arrayref weights, int level) @@ -796,19 +870,19 @@ sub _eqArray($$$) ## sub index { - my $self = shift; - my $str = shift; - my $len = length($str); - my $subCE = $self->splitCE(shift); - my $pos = @_ ? shift : 0; - $pos = 0 if $pos < 0; - my $grob = shift; - - my $lev = $self->{level}; - my $ver9 = $self->{UCA_Version} > 8; - my $v2i = $self->{alternate} ne 'non-ignorable'; - - if (! @$subCE) { + my $self = shift; + my $str = shift; + my $len = length($str); + my $subE = $self->splitEnt(shift); + my $pos = @_ ? shift : 0; + $pos = 0 if $pos < 0; + my $grob = shift; + + my $lev = $self->{level}; + my $ver9 = $self->{UCA_Version} >= 9; + my $v2i = $self->{variable} ne 'non-ignorable'; + + if (! @$subE) { my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos; return $grob ? map([$_, 0], $temp..$len) @@ -817,15 +891,15 @@ sub index if ($len < $pos) { return wantarray ? () : NOMATCHPOS; } - my $strCE = $self->splitCE($pos ? substr($str, $pos) : $str, TRUE); - if (! @$strCE) { + my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE); + if (! @$strE) { return wantarray ? () : NOMATCHPOS; } my $last_is_variable; my(@strWt, @iniPos, @finPos, @subWt, @g_ret); $last_is_variable = FALSE; - for my $wt (map $self->getWt($_), @$subCE) { + for my $wt (map $self->getWt($_), @$subE) { my $to_be_pushed = _nonIgnorAtLevel($wt,$lev); if ($v2i && $ver9) { @@ -845,7 +919,7 @@ sub index } my $count = 0; - my $end = @$strCE - 1; + my $end = @$strE - 1; $last_is_variable = FALSE; @@ -854,7 +928,7 @@ sub index # fetch a grapheme while ($i <= $end && $found_base == 0) { - for my $wt ($self->getWt($strCE->[$i][0])) { + for my $wt ($self->getWt($strE->[$i][0])) { my $to_be_pushed = _nonIgnorAtLevel($wt,$lev); if ($v2i && $ver9) { @@ -867,13 +941,13 @@ sub index if (@strWt && $wt->[0] == 0) { push @{ $strWt[-1] }, $wt if $to_be_pushed; - $finPos[-1] = $strCE->[$i][2]; + $finPos[-1] = $strE->[$i][2]; } elsif ($to_be_pushed) { $wt->[0] = 0 if $wt->[0] == Var1Wt; push @strWt, [ $wt ]; - push @iniPos, $found_base ? NOMATCHPOS : $strCE->[$i][1]; + push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1]; $finPos[-1] = NOMATCHPOS if $found_base; - push @finPos, $strCE->[$i][2]; + push @finPos, $strE->[$i][2]; $found_base++; } # else ===> no-op @@ -1004,6 +1078,9 @@ Unicode::Collate - Unicode Collation Algorithm #compare $result = $Collator->cmp($a, $b); # returns 1, 0, or -1. + # If %tailoring is false (i.e. empty), + # $Collator should do the default collation. + =head1 DESCRIPTION This module is an implementation @@ -1016,14 +1093,15 @@ The C<new> method returns a collator object. $Collator = Unicode::Collate->new( UCA_Version => $UCA_Version, - alternate => $alternate, + alternate => $alternate, # deprecated: use of 'variable' is recommended. backwards => $levelNumber, # or \@levelNumbers entry => $element, - normalization => $normalization_form, + hangul_terminator => $term_primary_weight, ignoreName => qr/$ignoreName/, ignoreChar => qr/$ignoreChar/, katakana_before_hiragana => $bool, level => $collationLevel, + normalization => $normalization_form, overrideCJK => \&overrideCJK, overrideHangul => \&overrideHangul, preprocess => \&preprocess, @@ -1032,50 +1110,22 @@ The C<new> method returns a collator object. undefName => qr/$undefName/, undefChar => qr/$undefChar/, upper_before_lower => $bool, + variable => $variable, ); - # if %tailoring is false (i.e. empty), - # $Collator should do the default collation. =over 4 =item UCA_Version -If the version number of the older UCA is given, -the older behavior of that version is emulated on collating. +If the tracking version number of the older UCA is given, +the older behavior of that tracking version is emulated on collating. If omitted, the return value of C<UCA_Version()> is used. -The supported version: 8 or 9. +The supported tracking version: 8, 9, or 11. B<This parameter may be removed in the future version, as switching the algorithm would affect the performance.> -=item alternate - --- see 3.2.2 Variable Weighting, UTS #10. - -(the title in UCA version 8: Alternate Weighting) - -This key allows to alternate weighting for variable collation elements, -which are marked with an ASTERISK in the table -(NOTE: Many punction marks and symbols are variable in F<allkeys.txt>). - - alternate => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'. - -These names are case-insensitive. -By default (if specification is omitted), 'shifted' is adopted. - - 'Blanked' Variable elements are made ignorable at levels 1 through 3; - considered at the 4th level. - - 'Non-ignorable' Variable elements are not reset to ignorable. - - 'Shifted' Variable elements are made ignorable at levels 1 through 3 - their level 4 weight is replaced by the old level 1 weight. - Level 4 weight for Non-Variable elements is 0xFFFF. - - 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level - are trimmed. - =item backwards -- see 3.1.2 French Accents, UTS #10. @@ -1089,7 +1139,10 @@ If omitted, forwards at all the levels. -- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10. -Overrides a default order or defines additional collation elements +If the same character (or a sequence of characters) exists +in the collation element table through C<table>, +mapping to collation elements is overrided. +If it does not exist, the mapping is defined additionally. entry => <<'ENTRIES', # use the UCA file format 00E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a><e> @@ -1102,6 +1155,34 @@ B<must> be a Unicode code point, but not a native code point. So C<0063> must always denote C<U+0063>, but not a character of C<"\x63">. +=item hangul_terminator + +-- see Condition B.2. in 7.1.4 Trailing Weights, UTS #10. + +If a true value is given (non-zero but should be positive), +it will be added as a terminator primary weight to the end of +every standard Hangul syllable. Secondary and any higher weights +for terminator are set to zero. +If the value is false or C<hangul_terminator> key does not exist, +insertion of terminator weights will not be performed. + +Boundaries of Hangul syllables are determined +according to conjoining Jamo behavior in F<the Unicode Standard> +and F<HangulSyllableType.txt>. + +B<Implementation Note:> +(1) For expansion mapping (Unicode character mapped +to a sequence of collation elements), a terminator will not be added +between collation elements, even if Hangul syllable boundary exists there. +Addition of terminator is restricted to the next position +to the last collation element. + +(2) Non-conjoining Hangul letters +(Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not +automatically terminated with a terminator primary weight. +These characters may need terminator included in a collation element +table beforehand. + =item ignoreName =item ignoreChar @@ -1124,7 +1205,7 @@ Any higher levels than the specified one are ignored. Level 1: alphabetic ordering Level 2: diacritic ordering Level 3: case ordering - Level 4: tie-breaking (e.g. in the case when alternate is 'shifted') + Level 4: tie-breaking (e.g. in the case when variable is 'shifted') ex.level => 2, @@ -1143,7 +1224,7 @@ Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>. See C<Unicode::Normalize::normalize()> for detail. If omitted, C<'NFD'> is used. -L<normalization> is performed after L<preprocess> (if defined). +C<normalization> is performed after C<preprocess> (if defined). Furthermore, special values, C<undef> and C<"prenormalized">, can be used, though they are not concerned with C<Unicode::Normalize::normalize()>. @@ -1175,9 +1256,12 @@ B<Unicode::Normalize> is required (see also B<CAVEAT>). -- see 7.1 Derived Collation Elements, UTS #10. -By default, mapping of CJK Unified Ideographs -uses the Unicode codepoint order. -But the mapping of CJK Unified Ideographs may be overrided. +By default, CJK Unified Ideographs are ordered in Unicode codepoint order +(but C<CJK Unified Ideographs> [C<U+4E00> to C<U+9FA5>] are lesser than +C<CJK Unified Ideographs Extension> [C<U+3400> to C<U+4DB5> and +C<U+20000> to C<U+2A6D6>]. + +Through C<overrideCJK>, ordering of CJK Unified Ideographs can be overrided. ex. CJK Unified Ideographs in the JIS code point order. @@ -1199,7 +1283,7 @@ ex. ignores all CJK Unified Ideographs. If C<undef> is passed explicitly as the value for this key, weights for CJK Unified Ideographs are treated as undefined. But assignment of weight for CJK Unified Ideographs -in table or L<entry> is still valid. +in table or C<entry> is still valid. =item overrideHangul @@ -1208,7 +1292,7 @@ in table or L<entry> is still valid. By default, Hangul Syllables are decomposed into Hangul Jamo. But the mapping of Hangul Syllables may be overrided. -This tag works like L<overrideCJK>, so see there for examples. +This tag works like C<overrideCJK>, so see there for examples. If you want to override the mapping of Hangul Syllables, the Normalization Forms D and KD are not appropriate @@ -1218,7 +1302,7 @@ If C<undef> is passed explicitly as the value for this key, weight for Hangul Syllables is treated as undefined without decomposition into Hangul Jamo. But definition of weight for Hangul Syllables -in table or L<entry> is still valid. +in table or C<entry> is still valid. =item preprocess @@ -1236,7 +1320,7 @@ Then, "the pen" is before "a pencil". return $str; }, -L<preprocess> is performed before L<normalization> (if defined). +C<preprocess> is performed before C<normalization> (if defined). =item rearrange @@ -1258,7 +1342,7 @@ but it is not warned at present.> -- see 3.2 Default Unicode Collation Element Table, UTS #10. -You can use another element table if desired. +You can use another collation element table if desired. The table file must be put into a directory where F<Unicode/Collate.pm> is installed. E.g. in F<perl/lib/Unicode/Collate> directory @@ -1267,7 +1351,7 @@ when you have F<perl/lib/Unicode/Collate.pm>. By default, the filename F<"allkeys.txt"> is used. If C<undef> is passed explicitly as the value for this key, -no file is read (but you can define collation elements via L<entry>). +no file is read (but you can define collation elements via C<entry>). A typical way to define a collation element table without any file of table: @@ -1318,6 +1402,38 @@ must be same as those mentioned in 7.3.1, UTS #10. If you define your collation elements which violate this requirement, these tags don't work validly. +=item variable + +=item alternate + +-- see 3.2.2 Variable Weighting, UTS #10. + +(the title in UCA version 8: Alternate Weighting) + +This key allows to variable weighting for variable collation elements, +which are marked with an ASTERISK in the table +(NOTE: Many punction marks and symbols are variable in F<allkeys.txt>). + + variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'. + +These names are case-insensitive. +By default (if specification is omitted), 'shifted' is adopted. + + 'Blanked' Variable elements are made ignorable at levels 1 through 3; + considered at the 4th level. + + 'Non-ignorable' Variable elements are not reset to ignorable. + + 'Shifted' Variable elements are made ignorable at levels 1 through 3 + their level 4 weight is replaced by the old level 1 weight. + Level 4 weight for Non-Variable elements is 0xFFFF. + + 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level + are trimmed. + +For backward compatibility, C<alternate> can be used as an alias +for C<variable>. + =back =head2 Methods for Collation @@ -1391,7 +1507,7 @@ for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>, C<subst>, C<gsubst>) is croaked, as the position and the length might differ from those on the specified string. -(And the C<rearrange> tag is neglected.) +(And C<rearrange> and C<hangul_terminator> tags are neglected.) The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively, @@ -1530,14 +1646,20 @@ In the scalar context, returns the modified collator $Collator->change(level => 4)->eq("perl", "PERL"); # false -=item UCA_Version +=item C<$version = $Collator-E<gt>version()> -Returns the version number of UTS #10 this module consults. +Returns the version number (a string) of the Unicode Standard +which the C<table> file used by the collator object is based on. +If the table does not include a version line (starting with C<@version>), +returns C<"unknown">. + +=item C<UCA_Version()> -=item Base_Unicode_Version +Returns the tracking version number of UTS #10 this module consults. -Returns the version number of the Unicode Standard -this module is based on. +=item C<Base_Unicode_Version()> + +Returns the version number of UTS #10 this module consults. =back @@ -1565,7 +1687,7 @@ and L<http://www.unicode.org/reports/tr10/CollationTest.zip> For F<CollationTest_SHIFTED.txt>, a collator via C<Unicode::Collate-E<gt>new( )> should be used; for F<CollationTest_NON_IGNORABLE.txt>, a collator via -C<Unicode::Collate-E<gt>new(alternate =E<gt> "non-ignorable", level =E<gt> 3)>. +C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>. B<Unicode::Normalize is required to try The Conformance Test.> @@ -1584,22 +1706,27 @@ SADAHIRO Tomoyuki, <SADAHIRO@cpan.org> =over 4 -=item http://www.unicode.org/reports/tr10/ +=item Unicode Collation Algorithm - UTS #10 + +L<http://www.unicode.org/reports/tr10/> + +=item The Default Unicode Collation Element Table (DUCET) + +L<http://www.unicode.org/reports/tr10/allkeys.txt> -Unicode Collation Algorithm - UTS #10 +=item The conformance test for the UCA -=item http://www.unicode.org/reports/tr10/allkeys.txt +L<http://www.unicode.org/reports/tr10/CollationTest.html> -The Default Unicode Collation Element Table +L<http://www.unicode.org/reports/tr10/CollationTest.zip> -=item http://www.unicode.org/reports/tr10/CollationTest.html -http://www.unicode.org/reports/tr10/CollationTest.zip +=item Hangul Syllable Type -The latest versions of the conformance test for the UCA +http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt -=item http://www.unicode.org/reports/tr15/ +=item Unicode Normalization Forms - UAX #15 -Unicode Normalization Forms - UAX #15 +L<http://www.unicode.org/reports/tr15/> =item L<Unicode::Normalize> diff --git a/lib/Unicode/Collate/Changes b/lib/Unicode/Collate/Changes index 3d39bbe248..7f92d7aad1 100644 --- a/lib/Unicode/Collate/Changes +++ b/lib/Unicode/Collate/Changes @@ -1,5 +1,22 @@ Revision history for Perl module Unicode::Collate. +0.30 Mon Oct 13 21:26:37 2003 + - fix: Completely ignorable in table should be able to be overrided + by non-ignorable in entry. + - fix: Maximum length for contraction must not be shortened + by a shorter contraction following. + - added normal.t. + - some doc fixes + +0.29 Mon Oct 13 12:18:23 2003 + - supported hangul_terminator. + - fix: Base_Unicode_Version falsely returns Perl's Unicode version. + C4 in UTS #10 requires UTS's Unicode version. + - For variable weighting, 'variable' is recommended + and 'alternate' is deprecated. + - added version() method. + - added hangtype.t, trailwt.t, variable.t, and version.t. + 0.28 Sat Sep 06 20:16:01 2003 - Fixed another inconsistency under (normalization => undef): Non-contiguous contraction is always neglected. @@ -14,9 +31,10 @@ Revision history for Perl module Unicode::Collate. Collation of a large string including a first letter of a contraction that is not a part of that contraction (say, 'c' of 'ca' where 'ch' is defined) was too slow, inefficient. - - A form name for 'normalize', no longer restricted to /^(?:NF)?K?[CD]\z/, - will be allowed as long as Unicode::Normalize::normalize() accepts it. - since Unicode::Normalize or UAX #15 may be changed/enhanced in future. + - A form name for 'normalization', no longer restricted to + /^(?:NF)?K?[CD]\z/, will be allowed as long as + Unicode::Normalize::normalize() accepts it, since Unicode::Normalize + or UAX #15 may be changed/enhanced in future. - When Hangul syllables are decomposed under <normalization => undef>, contraction among jamo (LV, VT, LVT) derived from the same Hangul syllable is allowed. Added hangul.t. diff --git a/lib/Unicode/Collate/README b/lib/Unicode/Collate/README index 3c86573ec3..6a4b712a8b 100644 --- a/lib/Unicode/Collate/README +++ b/lib/Unicode/Collate/README @@ -1,4 +1,4 @@ -Unicode/Collate version 0.28 +Unicode/Collate version 0.30 =============================== NAME diff --git a/lib/Unicode/Collate/t/contract.t b/lib/Unicode/Collate/t/contract.t index c2aaecfaa7..1c6658d572 100644 --- a/lib/Unicode/Collate/t/contract.t +++ b/lib/Unicode/Collate/t/contract.t @@ -51,7 +51,7 @@ ENTRIES ######################### -ok(1); # If we made it this far, we're ok. +ok(1); my $kjeNoN = Unicode::Collate->new( level => 1, diff --git a/lib/Unicode/Collate/t/hangtype.t b/lib/Unicode/Collate/t/hangtype.t new file mode 100644 index 0000000000..b6a46691aa --- /dev/null +++ b/lib/Unicode/Collate/t/hangtype.t @@ -0,0 +1,56 @@ +BEGIN { + unless ("A" eq pack('U', 0x41)) { + print "1..0 # Unicode::Collate " . + "cannot stringify a Unicode code point\n"; + exit 0; + } +} + +BEGIN { + if ($ENV{PERL_CORE}) { + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + } +} + +use Test; +BEGIN { plan tests => 30 }; + +use strict; +use warnings; +use Unicode::Collate; + +ok(1); + +######################### + +ok(Unicode::Collate::getHST(0x0000), ''); +ok(Unicode::Collate::getHST(0x0100), ''); +ok(Unicode::Collate::getHST(0x1000), ''); +ok(Unicode::Collate::getHST(0x10FF), ''); +ok(Unicode::Collate::getHST(0x1100), 'L'); +ok(Unicode::Collate::getHST(0x1101), 'L'); +ok(Unicode::Collate::getHST(0x1159), 'L'); +ok(Unicode::Collate::getHST(0x115A), ''); +ok(Unicode::Collate::getHST(0x115E), ''); +ok(Unicode::Collate::getHST(0x115F), 'L'); +ok(Unicode::Collate::getHST(0x1160), 'V'); +ok(Unicode::Collate::getHST(0x1161), 'V'); +ok(Unicode::Collate::getHST(0x11A0), 'V'); +ok(Unicode::Collate::getHST(0x11A2), 'V'); +ok(Unicode::Collate::getHST(0x11A3), ''); +ok(Unicode::Collate::getHST(0x11A7), ''); +ok(Unicode::Collate::getHST(0x11A8), 'T'); +ok(Unicode::Collate::getHST(0x11AF), 'T'); +ok(Unicode::Collate::getHST(0x11E0), 'T'); +ok(Unicode::Collate::getHST(0x11F9), 'T'); +ok(Unicode::Collate::getHST(0x11FA), ''); +ok(Unicode::Collate::getHST(0x11FF), ''); +ok(Unicode::Collate::getHST(0x3011), ''); +ok(Unicode::Collate::getHST(0x11A7), ''); +ok(Unicode::Collate::getHST(0xAC00), 'LV'); +ok(Unicode::Collate::getHST(0xAC01), 'LVT'); +ok(Unicode::Collate::getHST(0xAC1B), 'LVT'); +ok(Unicode::Collate::getHST(0xAC1C), 'LV'); +ok(Unicode::Collate::getHST(0xD7A3), 'LVT'); + diff --git a/lib/Unicode/Collate/t/hangul.t b/lib/Unicode/Collate/t/hangul.t index be6b0724fb..1b1359e88d 100644 --- a/lib/Unicode/Collate/t/hangul.t +++ b/lib/Unicode/Collate/t/hangul.t @@ -14,7 +14,7 @@ BEGIN { } use Test; -BEGIN { plan tests => 52 }; +BEGIN { plan tests => 72 }; use strict; use warnings; @@ -25,7 +25,7 @@ $IsEBCDIC = ord("A") != 0x41; ######################### -ok(1); # If we made it this far, we're ok. +ok(1); # a standard collator (3.1.1) my $Collator = Unicode::Collate->new( @@ -41,6 +41,7 @@ my $hangul = Unicode::Collate->new( level => 3, table => undef, normalization => undef, + entry => <<'ENTRIES', 0061 ; [.0A15.0020.0002] # LATIN SMALL LETTER A 0041 ; [.0A15.0020.0008] # LATIN CAPITAL LETTER A @@ -65,87 +66,127 @@ ENTRIES ok(ref $hangul, "Unicode::Collate"); +my $trailwt = Unicode::Collate->new( + level => 3, + table => undef, + normalization => undef, + hangul_terminator => 16, + + entry => <<'ENTRIES', # Term < Jongseong < Jungseong < Choseong +0061 ; [.0A15.0020.0002] # LATIN SMALL LETTER A +0041 ; [.0A15.0020.0008] # LATIN CAPITAL LETTER A +11A8 ; [.1801.0020.0002] # HANGUL JONGSEONG KIYEOK +11A9 ; [.1801.0020.0002][.1801.0020.0002] # HANGUL JONGSEONG SSANGKIYEOK +1161 ; [.1831.0020.0002] # HANGUL JUNGSEONG A +1163 ; [.1832.0020.0002] # HANGUL JUNGSEONG YA +1100 ; [.1861.0020.0002] # HANGUL CHOSEONG KIYEOK +1101 ; [.1861.0020.0002][.1861.0020.0002] # HANGUL CHOSEONG SSANGKIYEOK +1102 ; [.1862.0020.0002] # HANGUL CHOSEONG NIEUN +3042 ; [.1921.0020.000E] # HIRAGANA LETTER A +ENTRIES +); + ######################### # L(simp)L(simp) vs L(comp): /GGA/ ok($Collator->lt("\x{1100}\x{1100}\x{1161}", "\x{1101}\x{1161}")); ok($hangul ->eq("\x{1100}\x{1100}\x{1161}", "\x{1101}\x{1161}")); +ok($trailwt ->eq("\x{1100}\x{1100}\x{1161}", "\x{1101}\x{1161}")); # L(simp) vs L(simp)L(simp): /GA/ vs /GGA/ ok($Collator->gt("\x{1100}\x{1161}", "\x{1100}\x{1100}\x{1161}")); ok($hangul ->lt("\x{1100}\x{1161}", "\x{1100}\x{1100}\x{1161}")); +ok($trailwt ->lt("\x{1100}\x{1161}", "\x{1100}\x{1100}\x{1161}")); # T(simp)T(simp) vs T(comp): /AGG/ ok($Collator->lt("\x{1161}\x{11A8}\x{11A8}", "\x{1161}\x{11A9}")); ok($hangul ->eq("\x{1161}\x{11A8}\x{11A8}", "\x{1161}\x{11A9}")); +ok($trailwt ->eq("\x{1161}\x{11A8}\x{11A8}", "\x{1161}\x{11A9}")); # T(simp) vs T(simp)T(simp): /AG/ vs /AGG/ ok($Collator->lt("\x{1161}\x{11A8}", "\x{1161}\x{11A8}\x{11A8}")); ok($hangul ->lt("\x{1161}\x{11A8}", "\x{1161}\x{11A8}\x{11A8}")); +ok($trailwt ->lt("\x{1161}\x{11A8}", "\x{1161}\x{11A8}\x{11A8}")); # LV vs LLV: /GA/ vs /GNA/ ok($Collator->gt("\x{1100}\x{1161}", "\x{1100}\x{1102}\x{1161}")); ok($hangul ->lt("\x{1100}\x{1161}", "\x{1100}\x{1102}\x{1161}")); +ok($trailwt ->lt("\x{1100}\x{1161}", "\x{1100}\x{1102}\x{1161}")); # LVX vs LVV: /GAA/ vs /GA/.latinA ok($Collator->gt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}A")); ok($hangul ->gt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}A")); +ok($trailwt ->gt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}A")); # LVX vs LVV: /GAA/ vs /GA/.hiraganaA ok($Collator->lt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}\x{3042}")); ok($hangul ->gt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}\x{3042}")); +ok($trailwt ->gt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}\x{3042}")); # LVX vs LVV: /GAA/ vs /GA/.hanja ok($Collator->lt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}\x{4E00}")); ok($hangul ->gt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}\x{4E00}")); +ok($trailwt ->gt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}\x{4E00}")); # LVL vs LVT: /GA/./G/ vs /GAG/ ok($Collator->lt("\x{1100}\x{1161}\x{1100}", "\x{1100}\x{1161}\x{11A8}")); ok($hangul ->lt("\x{1100}\x{1161}\x{1100}", "\x{1100}\x{1161}\x{11A8}")); +ok($trailwt ->lt("\x{1100}\x{1161}\x{1100}", "\x{1100}\x{1161}\x{11A8}")); # LVT vs LVX: /GAG/ vs /GA/.latinA ok($Collator->gt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}A")); ok($hangul ->gt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}A")); +ok($trailwt ->gt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}A")); # LVT vs LVX: /GAG/ vs /GA/.hiraganaA ok($Collator->lt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{3042}")); ok($hangul ->gt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{3042}")); +ok($trailwt ->gt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{3042}")); # LVT vs LVX: /GAG/ vs /GA/.hanja ok($Collator->lt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{4E00}")); ok($hangul ->gt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{4E00}")); +ok($trailwt ->gt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{4E00}")); # LVT vs LVV: /GAG/ vs /GAA/ ok($Collator->gt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{1161}")); ok($hangul ->lt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{1161}")); +ok($trailwt ->lt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{1161}")); # LVL vs LVV: /GA/./G/ vs /GAA/ ok($Collator->lt("\x{1100}\x{1161}\x{1100}", "\x{1100}\x{1161}\x{1161}")); ok($hangul ->lt("\x{1100}\x{1161}\x{1100}", "\x{1100}\x{1161}\x{1161}")); +ok($trailwt ->lt("\x{1100}\x{1161}\x{1100}", "\x{1100}\x{1161}\x{1161}")); # LV vs Syl(LV): /GA/ vs /[GA]/ ok($Collator->eq("\x{1100}\x{1161}", "\x{AC00}")); ok($hangul ->eq("\x{1100}\x{1161}", "\x{AC00}")); +ok($trailwt ->eq("\x{1100}\x{1161}", "\x{AC00}")); # LVT vs Syl(LV)T: /GAG/ vs /[GA]G/ ok($Collator->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC00}\x{11A8}")); ok($hangul ->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC00}\x{11A8}")); +ok($trailwt ->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC00}\x{11A8}")); # LVT vs Syl(LVT): /GAG/ vs /[GAG]/ ok($Collator->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC01}")); ok($hangul ->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC01}")); +ok($trailwt ->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC01}")); # LVTT vs Syl(LVTT): /GAGG/ vs /[GAGG]/ ok($Collator->eq("\x{1100}\x{1161}\x{11A9}", "\x{AC02}")); ok($hangul ->eq("\x{1100}\x{1161}\x{11A9}", "\x{AC02}")); +ok($trailwt ->eq("\x{1100}\x{1161}\x{11A9}", "\x{AC02}")); # LVTT vs Syl(LVT).T: /GAGG/ vs /[GAG]G/ ok($Collator->gt("\x{1100}\x{1161}\x{11A9}", "\x{AC01}\x{11A8}")); ok($hangul ->eq("\x{1100}\x{1161}\x{11A9}", "\x{AC01}\x{11A8}")); +ok($trailwt ->eq("\x{1100}\x{1161}\x{11A9}", "\x{AC01}\x{11A8}")); # LLVT vs L.Syl(LVT): /GGAG/ vs /G[GAG]/ ok($Collator->gt("\x{1101}\x{1161}\x{11A8}", "\x{1100}\x{AC01}")); ok($hangul ->eq("\x{1101}\x{1161}\x{11A8}", "\x{1100}\x{AC01}")); +ok($trailwt ->eq("\x{1101}\x{1161}\x{11A8}", "\x{1100}\x{AC01}")); ######################### diff --git a/lib/Unicode/Collate/t/index.t b/lib/Unicode/Collate/t/index.t index d6811c7db6..a1d67d5346 100644 --- a/lib/Unicode/Collate/t/index.t +++ b/lib/Unicode/Collate/t/index.t @@ -25,7 +25,7 @@ our $IsEBCDIC = ord("A") != 0x41; ######################### -ok(1); # If we made it this far, we're ok. +ok(1); my $Collator = Unicode::Collate->new( table => 'keys.txt', diff --git a/lib/Unicode/Collate/t/normal.t b/lib/Unicode/Collate/t/normal.t new file mode 100644 index 0000000000..026240d6fa --- /dev/null +++ b/lib/Unicode/Collate/t/normal.t @@ -0,0 +1,205 @@ +BEGIN { + unless ("A" eq pack('U', 0x41)) { + print "1..0 # Unicode::Collate " . + "cannot stringify a Unicode code point\n"; + exit 0; + } +} + +BEGIN { + if ($ENV{PERL_CORE}) { + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + } +} + +BEGIN { + eval { require Unicode::Normalize; }; + if ($@) { + print "1..0 # skipped: Unicode::Normalize needed for this test\n"; + print $@; + exit; + } +} +use Test; +BEGIN { plan tests => 100 }; + +use strict; +use warnings; +use Unicode::Collate; + +our $Aring = pack('U', 0xC5); +our $aring = pack('U', 0xE5); + +our $entry = <<'ENTRIES'; +030A; [.0000.030A.0002] # COMBINING RING ABOVE +212B; [.002B.0020.0008] # ANGSTROM SIGN +0061; [.0A41.0020.0002] # LATIN SMALL LETTER A +0041; [.0A41.0020.0008] # LATIN CAPITAL LETTER A +007A; [.0A5A.0020.0002] # LATIN SMALL LETTER Z +005A; [.0A5A.0020.0008] # LATIN CAPITAL LETTER Z +FF41; [.0A87.0020.0002] # LATIN SMALL LETTER A +FF21; [.0A87.0020.0008] # LATIN CAPITAL LETTER A +00E5; [.0AC5.0020.0002] # LATIN SMALL LETTER A WITH RING ABOVE +00C5; [.0AC5.0020.0008] # LATIN CAPITAL LETTER A WITH RING ABOVE +ENTRIES + +# Aong < A+ring < Z < fullA+ring < A-ring + +######################### + +our $noN = Unicode::Collate->new( + level => 1, + table => undef, + normalization => undef, + entry => $entry, +); + +our $nfc = Unicode::Collate->new( + level => 1, + table => undef, + normalization => 'NFC', + entry => $entry, +); + +our $nfd = Unicode::Collate->new( + level => 1, + table => undef, + normalization => 'NFD', + entry => $entry, +); + +our $nfkc = Unicode::Collate->new( + level => 1, + table => undef, + normalization => 'NFKC', + entry => $entry, +); + +our $nfkd = Unicode::Collate->new( + level => 1, + table => undef, + normalization => 'NFKD', + entry => $entry, +); + +ok($noN->lt("\x{212B}", "A")); +ok($noN->lt("\x{212B}", $Aring)); +ok($noN->lt("A\x{30A}", $Aring)); +ok($noN->lt("A", "\x{FF21}")); +ok($noN->lt("Z", "\x{FF21}")); +ok($noN->lt("Z", $Aring)); +ok($noN->lt("\x{212B}", $aring)); +ok($noN->lt("A\x{30A}", $aring)); +ok($noN->lt("Z", $aring)); +ok($noN->lt("a\x{30A}", "Z")); + +ok($nfd->eq("\x{212B}", "A")); +ok($nfd->eq("\x{212B}", $Aring)); +ok($nfd->eq("A\x{30A}", $Aring)); +ok($nfd->lt("A", "\x{FF21}")); +ok($nfd->lt("Z", "\x{FF21}")); +ok($nfd->gt("Z", $Aring)); +ok($nfd->eq("\x{212B}", $aring)); +ok($nfd->eq("A\x{30A}", $aring)); +ok($nfd->gt("Z", $aring)); +ok($nfd->lt("a\x{30A}", "Z")); + +ok($nfc->gt("\x{212B}", "A")); +ok($nfc->eq("\x{212B}", $Aring)); +ok($nfc->eq("A\x{30A}", $Aring)); +ok($nfc->lt("A", "\x{FF21}")); +ok($nfc->lt("Z", "\x{FF21}")); +ok($nfc->lt("Z", $Aring)); +ok($nfc->eq("\x{212B}", $aring)); +ok($nfc->eq("A\x{30A}", $aring)); +ok($nfc->lt("Z", $aring)); +ok($nfc->gt("a\x{30A}", "Z")); + +ok($nfkd->eq("\x{212B}", "A")); +ok($nfkd->eq("\x{212B}", $Aring)); +ok($nfkd->eq("A\x{30A}", $Aring)); +ok($nfkd->eq("A", "\x{FF21}")); +ok($nfkd->gt("Z", "\x{FF21}")); +ok($nfkd->gt("Z", $Aring)); +ok($nfkd->eq("\x{212B}", $aring)); +ok($nfkd->eq("A\x{30A}", $aring)); +ok($nfkd->gt("Z", $aring)); +ok($nfkd->lt("a\x{30A}", "Z")); + +ok($nfkc->gt("\x{212B}", "A")); +ok($nfkc->eq("\x{212B}", $Aring)); +ok($nfkc->eq("A\x{30A}", $Aring)); +ok($nfkc->eq("A", "\x{FF21}")); +ok($nfkc->gt("Z", "\x{FF21}")); +ok($nfkc->lt("Z", $Aring)); +ok($nfkc->eq("\x{212B}", $aring)); +ok($nfkc->eq("A\x{30A}", $aring)); +ok($nfkc->lt("Z", $aring)); +ok($nfkc->gt("a\x{30A}", "Z")); + +$nfd->change(normalization => undef); + +ok($nfd->lt("\x{212B}", "A")); +ok($nfd->lt("\x{212B}", $Aring)); +ok($nfd->lt("A\x{30A}", $Aring)); +ok($nfd->lt("A", "\x{FF21}")); +ok($nfd->lt("Z", "\x{FF21}")); +ok($nfd->lt("Z", $Aring)); +ok($nfd->lt("\x{212B}", $aring)); +ok($nfd->lt("A\x{30A}", $aring)); +ok($nfd->lt("Z", $aring)); +ok($nfd->lt("a\x{30A}", "Z")); + +$nfd->change(normalization => 'C'); + +ok($nfd->gt("\x{212B}", "A")); +ok($nfd->eq("\x{212B}", $Aring)); +ok($nfd->eq("A\x{30A}", $Aring)); +ok($nfd->lt("A", "\x{FF21}")); +ok($nfd->lt("Z", "\x{FF21}")); +ok($nfd->lt("Z", $Aring)); +ok($nfd->eq("\x{212B}", $aring)); +ok($nfd->eq("A\x{30A}", $aring)); +ok($nfd->lt("Z", $aring)); +ok($nfd->gt("a\x{30A}", "Z")); + +$nfd->change(normalization => 'D'); + +ok($nfd->eq("\x{212B}", "A")); +ok($nfd->eq("\x{212B}", $Aring)); +ok($nfd->eq("A\x{30A}", $Aring)); +ok($nfd->lt("A", "\x{FF21}")); +ok($nfd->lt("Z", "\x{FF21}")); +ok($nfd->gt("Z", $Aring)); +ok($nfd->eq("\x{212B}", $aring)); +ok($nfd->eq("A\x{30A}", $aring)); +ok($nfd->gt("Z", $aring)); +ok($nfd->lt("a\x{30A}", "Z")); + +$nfd->change(normalization => 'KD'); + +ok($nfd->eq("\x{212B}", "A")); +ok($nfd->eq("\x{212B}", $Aring)); +ok($nfd->eq("A\x{30A}", $Aring)); +ok($nfd->eq("A", "\x{FF21}")); +ok($nfd->gt("Z", "\x{FF21}")); +ok($nfd->gt("Z", $Aring)); +ok($nfd->eq("\x{212B}", $aring)); +ok($nfd->eq("A\x{30A}", $aring)); +ok($nfd->gt("Z", $aring)); +ok($nfd->lt("a\x{30A}", "Z")); + +$nfd->change(normalization => 'KC'); + +ok($nfd->gt("\x{212B}", "A")); +ok($nfd->eq("\x{212B}", $Aring)); +ok($nfd->eq("A\x{30A}", $Aring)); +ok($nfd->eq("A", "\x{FF21}")); +ok($nfd->gt("Z", "\x{FF21}")); +ok($nfd->lt("Z", $Aring)); +ok($nfd->eq("\x{212B}", $aring)); +ok($nfd->eq("A\x{30A}", $aring)); +ok($nfd->lt("Z", $aring)); +ok($nfd->gt("a\x{30A}", "Z")); + diff --git a/lib/Unicode/Collate/t/test.t b/lib/Unicode/Collate/t/test.t index 0c170e422a..8a7eb8b59f 100644 --- a/lib/Unicode/Collate/t/test.t +++ b/lib/Unicode/Collate/t/test.t @@ -15,7 +15,7 @@ BEGIN { } use Test; -BEGIN { plan tests => 200 }; +BEGIN { plan tests => 203 }; use strict; use warnings; @@ -23,14 +23,9 @@ use Unicode::Collate; our $IsEBCDIC = ord("A") != 0x41; -######################### +ok(1); -ok(1); # If we made it this far, we're ok. - -my $UCA_Version = "9"; - -ok(Unicode::Collate::UCA_Version, $UCA_Version); -ok(Unicode::Collate->UCA_Version, $UCA_Version); +##### 2..6 my $Collator = Unicode::Collate->new( table => 'keys.txt', @@ -39,8 +34,6 @@ my $Collator = Unicode::Collate->new( ok(ref $Collator, "Unicode::Collate"); -ok($Collator->UCA_Version, $UCA_Version); -ok($Collator->UCA_Version(), $UCA_Version); ok( join(':', $Collator->sort( @@ -55,7 +48,7 @@ ok($Collator->cmp("", ""), 0); ok($Collator->eq("", "")); ok($Collator->cmp("", "perl"), -1); -############## +##### 7..17 sub _pack_U { Unicode::Collate::pack_U(@_) } sub _unpack_U { Unicode::Collate::unpack_U(@_) } @@ -80,7 +73,7 @@ ok($Collator->lt("A", $A_acute)); ok($Collator->lt("A", $a_acute)); ok($Collator->lt($a_acute, $A_acute)); -############## +##### 17..20 eval { require Unicode::Normalize }; @@ -109,7 +102,7 @@ else { ok(1); } -############## +##### 21..30 my $trad = Unicode::Collate->new( table => 'keys.txt', @@ -148,7 +141,7 @@ ok($trad->eq("", $katakana)); ok($trad->eq($hiragana, $katakana)); ok($trad->eq($katakana, $hiragana)); -############## +##### 31..37 $Collator->change(level => 2); @@ -161,6 +154,8 @@ ok( $Collator->cmp($hiragana, $katakana), 0); ok( $Collator->eq($hiragana, $katakana) ); ok( $Collator->ge($hiragana, $katakana) ); +##### 38..43 + # hangul ok( $Collator->eq("a\x{AC00}b", "a\x{1100}\x{1161}b") ); ok( $Collator->eq("a\x{AE00}b", "a\x{1100}\x{1173}\x{11AF}b") ); @@ -169,6 +164,8 @@ ok( $Collator->lt("a\x{AC00}b", "a\x{AE00}b") ); ok( $Collator->gt("a\x{D7A3}b", "a\x{C544}b") ); ok( $Collator->lt("a\x{C544}b", "a\x{30A2}b") ); # hangul < hiragana +##### 44..52 + $Collator->change(%old_level, katakana_before_hiragana => 1); ok($Collator->{level}, 4); @@ -182,6 +179,8 @@ ok( $Collator->ne($hiragana, $katakana) ); ok( $Collator->gt($hiragana, $katakana) ); ok( $Collator->ge($hiragana, $katakana) ); +##### 53..58 + $Collator->change(upper_before_lower => 1); ok( $Collator->cmp("abc", "ABC"), 1); @@ -191,6 +190,8 @@ ok( $Collator->cmp($hiragana, $katakana), 1); ok( $Collator->ge($hiragana, $katakana), 1); ok( $Collator->gt($hiragana, $katakana), 1); +##### 59..64 + $Collator->change(katakana_before_hiragana => 0); ok( $Collator->cmp("abc", "ABC"), 1); @@ -203,7 +204,7 @@ ok( $Collator->le("abc", "ABC") ); ok( $Collator->cmp($hiragana, $katakana), -1); ok( $Collator->lt($hiragana, $katakana) ); -############## +##### 65..66 my $ignoreAE = Unicode::Collate->new( table => 'keys.txt', @@ -214,7 +215,7 @@ my $ignoreAE = Unicode::Collate->new( ok($ignoreAE->eq("element","lament")); ok($ignoreAE->eq("Perl","ePrl")); -############## +##### 67 my $onlyABC = Unicode::Collate->new( table => undef, @@ -234,7 +235,7 @@ ok( join(':', qw/ A aB Ab ABA BAC cAc cc / ), ); -############## +##### 68..71 my $undefAE = Unicode::Collate->new( table => 'keys.txt', @@ -247,7 +248,7 @@ ok($Collator->lt("edge","fog")); ok($undefAE ->gt("lake","like")); ok($Collator->lt("lake","like")); -############## +##### 72..81 # Table is undefined, then no entry is defined. @@ -281,7 +282,7 @@ ok($undef_table->lt("\x{4E00}","\x{4E8C}")); # U+4E8C: Ideograph "TWO" -############## +##### 82..86 my $few_entries = Unicode::Collate->new( entry => <<'ENTRIES', @@ -312,7 +313,7 @@ ok($few_entries->lt("\x{AE30}", "\x{AC00}")); ok($few_entries->eq("\x{AC00}", "\x{1100}\x{1161}")); -############## +##### 87..91 my $all_undef_8 = Unicode::Collate->new( table => undef, @@ -331,7 +332,7 @@ ok($all_undef_8->lt("\x{4E00}", "\x{AC00}")); ok($all_undef_8->gt("\x{AC00}", "\x{1100}\x{1161}")); ok($all_undef_8->gt("\x{AC00}", "\x{ABFF}")); -############## +##### 92..96 my $all_undef_9 = Unicode::Collate->new( table => undef, @@ -350,7 +351,7 @@ ok($all_undef_9->lt("\x{20000}", "\x{AC00}")); ok($all_undef_9->gt("\x{AC00}", "\x{1100}\x{1161}")); ok($all_undef_9->gt("\x{AC00}", "\x{ABFF}")); # U+ABFF: not assigned -############## +##### 97..101 my $ignoreCJK = Unicode::Collate->new( table => undef, @@ -369,7 +370,7 @@ ok($ignoreCJK->eq("Pe\x{4E00}rl", "Perl")); # U+4E00 is a CJK. ok($ignoreCJK->gt("\x{4DFF}", "\x{4E00}")); # U+4DFF is not CJK. ok($ignoreCJK->lt("Pe\x{5B57}rl", "Perl")); # 'r' is unassigned. -############## +##### 102..106 my $ignoreHangul = Unicode::Collate->new( table => undef, @@ -388,7 +389,7 @@ ok($ignoreHangul->lt("\x{AC00}", "\x{AE00}")); ok($ignoreHangul->lt("\x{AC00}", "\x{1100}\x{1161}")); # Jamo are not ignored. ok($ignoreHangul->lt("Pe\x{AE00}rl", "Perl")); # 'r' is unassigned. -############## +##### 107..127 my %origAlter = $Collator->change(alternate => 'Blanked'); @@ -426,7 +427,7 @@ $Collator->change(%origAlter); ok($Collator->{alternate}, 'shifted'); -############## +##### 128..132 my $overCJK = Unicode::Collate->new( table => undef, @@ -448,7 +449,7 @@ ok($overCJK->lt("A\x{4E03}", "A\x{4E00}")); ok($overCJK->lt("A\x{4E03}", "a\x{4E00}")); ok($overCJK->lt("a\x{4E03}", "A\x{4E00}")); -############## +##### 133..144 # rearrange : 0x0E40..0x0E44, 0x0EC0..0x0EC4 (default) @@ -475,7 +476,7 @@ ok($all_undef_8->lt("\x{0E40}A", "\x{0E41}B")); ok($all_undef_8->lt("\x{0E41}A", "\x{0E40}B")); ok($all_undef_8->lt("A\x{0E41}A", "A\x{0E40}B")); -############## +##### 145..149 my $no_rearrange = Unicode::Collate->new( table => undef, @@ -489,7 +490,7 @@ ok($no_rearrange->lt("\x{0E40}A", "\x{0E41}B")); ok($no_rearrange->gt("\x{0E41}A", "\x{0E40}B")); ok($no_rearrange->gt("A\x{0E41}A", "A\x{0E40}B")); -############## +##### 150..154 my $undef_rearrange = Unicode::Collate->new( table => undef, @@ -503,7 +504,7 @@ ok($undef_rearrange->lt("\x{0E40}A", "\x{0E41}B")); ok($undef_rearrange->gt("\x{0E41}A", "\x{0E40}B")); ok($undef_rearrange->gt("A\x{0E41}A", "A\x{0E40}B")); -############## +##### 155..159 my $dropArticles = Unicode::Collate->new( table => "keys.txt", @@ -521,7 +522,7 @@ ok($dropArticles->lt("the pen", "a pencil")); ok($Collator->lt("Perl", "The Perl")); ok($Collator->gt("the pen", "a pencil")); -############## +##### 160..161 my $backLevel1 = Unicode::Collate->new( table => undef, @@ -534,7 +535,7 @@ my $backLevel1 = Unicode::Collate->new( ok($backLevel1->gt("AB", "BA")); ok($backLevel1->gt("\x{3042}\x{3044}", "\x{3044}\x{3042}")); -############## +##### 162..169 my $backLevel2 = Unicode::Collate->new( table => "keys.txt", @@ -556,7 +557,7 @@ ok($backLevel2->lt("\x{4E03}", $katakana)); ok($Collator ->gt("\x{4E00}", $hiragana)); ok($Collator ->gt("\x{4E03}", $katakana)); -############## +##### 170..184 # ignorable after variable @@ -590,7 +591,7 @@ ok($Collator->lt("\cA", "?")); $Collator->change(alternate => 'Shifted', level => 4); -############## +##### 185..196 # According to Conformance Test, # a L3-ignorable is treated as a completely ignorable. @@ -629,3 +630,39 @@ ok($L3ignorable->eq("\x{1D1BB}", "\x{1D1B9}")); ok($L3ignorable->eq("\x{1D1BC}", "\x{1D1BA}")); ok($L3ignorable->eq("\x{1D1BB}", "\x{1D1B9}\x{1D165}")); ok($L3ignorable->eq("\x{1D1BC}", "\x{1D1BA}\x{1D165}")); + +##### 197..203 + +my $O_str = Unicode::Collate->new( + table => "keys.txt", + normalization => undef, + entry => <<'ENTRIES', +0008 ; [*0008.0000.0000.0000] # BACKSPACE (need to be non-ignorable) +004F 0337 ; [.0B53.0020.0008.004F] # capital O WITH SHORT SOLIDUS OVERLAY +006F 0008 002F ; [.0B53.0020.0002.006F] # LATIN SMALL LETTER O WITH STROKE +004F 0008 002F ; [.0B53.0020.0008.004F] # LATIN CAPITAL LETTER O WITH STROKE +006F 0337 ; [.0B53.0020.0002.004F] # small O WITH SHORT SOLIDUS OVERLAY +200B ; [.2000.0000.0000.0000] # ZERO WIDTH SPACE (may be non-sense but ...) +#00F8 ; [.0B53.0020.0002.00F8] # LATIN SMALL LETTER O WITH STROKE +#00D8 ; [.0B53.0020.0008.00D8] # LATIN CAPITAL LETTER O WITH STROKE +ENTRIES +); + +my $o_BS_slash = _pack_U(0x006F, 0x0008, 0x002F); +my $O_BS_slash = _pack_U(0x004F, 0x0008, 0x002F); +my $o_sol = _pack_U(0x006F, 0x0337); +my $O_sol = _pack_U(0x004F, 0x0337); +my $o_stroke = _pack_U(0x00F8); +my $O_stroke = _pack_U(0x00D8); + +ok($O_str->eq($o_stroke, $o_BS_slash)); +ok($O_str->eq($O_stroke, $O_BS_slash)); + +ok($O_str->eq($o_stroke, $o_sol)); +ok($O_str->eq($O_stroke, $O_sol)); + +ok($Collator->eq("\x{200B}", "\0")); +ok($O_str ->gt("\x{200B}", "\0")); +ok($O_str ->gt("\x{200B}", "A")); + +##### diff --git a/lib/Unicode/Collate/t/trailwt.t b/lib/Unicode/Collate/t/trailwt.t new file mode 100644 index 0000000000..463252cf1c --- /dev/null +++ b/lib/Unicode/Collate/t/trailwt.t @@ -0,0 +1,229 @@ +BEGIN { + unless ("A" eq pack('U', 0x41)) { + print "1..0 # Unicode::Collate " . + "cannot stringify a Unicode code point\n"; + exit 0; + } +} + +BEGIN { + if ($ENV{PERL_CORE}) { + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + } +} + +use Test; +BEGIN { plan tests => 58 }; + +use strict; +use warnings; +use Unicode::Collate; + +######################### + +ok(1); + +# a standard collator (3.1.1) +my $Collator = Unicode::Collate->new( + level => 1, + table => 'keys.txt', + normalization => undef, + + entry => <<'ENTRIES', +326E ; [.1831.0020.0006.326E][.188D.0020.0006.326E] # c.h.s. GA +326F ; [.1833.0020.0006.326F][.188D.0020.0006.326F] # c.h.s. NA +3270 ; [.1834.0020.0006.3270][.188D.0020.0006.3270] # c.h.s. DA +3271 ; [.1836.0020.0006.3271][.188D.0020.0006.3271] # c.h.s. RA +3272 ; [.1837.0020.0006.3272][.188D.0020.0006.3272] # c.h.s. MA +3273 ; [.1838.0020.0006.3273][.188D.0020.0006.3273] # c.h.s. BA +3274 ; [.183A.0020.0006.3274][.188D.0020.0006.3274] # c.h.s. SA +3275 ; [.183C.0020.0006.3275][.188D.0020.0006.3275] # c.h.s. A +3276 ; [.183D.0020.0006.3276][.188D.0020.0006.3276] # c.h.s. JA +3277 ; [.183F.0020.0006.3277][.188D.0020.0006.3277] # c.h.s. CA +3278 ; [.1840.0020.0006.3278][.188D.0020.0006.3278] # c.h.s. KA +3279 ; [.1841.0020.0006.3279][.188D.0020.0006.3279] # c.h.s. TA +327A ; [.1842.0020.0006.327A][.188D.0020.0006.327A] # c.h.s. PA +327B ; [.1843.0020.0006.327B][.188D.0020.0006.327B] # c.h.s. HA +ENTRIES +); + +my $hangul = Unicode::Collate->new( + level => 1, + table => 'keys.txt', + normalization => undef, + hangul_terminator => 16, + + entry => <<'ENTRIES', +326E ; [.1831.0020.0006.326E][.188D.0020.0006.326E] # c.h.s. GA +326F ; [.1833.0020.0006.326F][.188D.0020.0006.326F] # c.h.s. NA +3270 ; [.1834.0020.0006.3270][.188D.0020.0006.3270] # c.h.s. DA +3271 ; [.1836.0020.0006.3271][.188D.0020.0006.3271] # c.h.s. RA +3272 ; [.1837.0020.0006.3272][.188D.0020.0006.3272] # c.h.s. MA +3273 ; [.1838.0020.0006.3273][.188D.0020.0006.3273] # c.h.s. BA +3274 ; [.183A.0020.0006.3274][.188D.0020.0006.3274] # c.h.s. SA +3275 ; [.183C.0020.0006.3275][.188D.0020.0006.3275] # c.h.s. A +3276 ; [.183D.0020.0006.3276][.188D.0020.0006.3276] # c.h.s. JA +3277 ; [.183F.0020.0006.3277][.188D.0020.0006.3277] # c.h.s. CA +3278 ; [.1840.0020.0006.3278][.188D.0020.0006.3278] # c.h.s. KA +3279 ; [.1841.0020.0006.3279][.188D.0020.0006.3279] # c.h.s. TA +327A ; [.1842.0020.0006.327A][.188D.0020.0006.327A] # c.h.s. PA +327B ; [.1843.0020.0006.327B][.188D.0020.0006.327B] # c.h.s. HA +ENTRIES +); + +ok(ref $hangul, "Unicode::Collate"); + +######################### + +# LVX vs LVV: /GAA/ vs /GA/.latinA +ok($Collator->gt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}A")); +ok($hangul ->gt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}A")); + +# LVX vs LVV: /GAA/ vs /GA/.hiraganaA +ok($Collator->lt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}\x{3042}")); +ok($hangul ->gt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}\x{3042}")); + +# LVX vs LVV: /GAA/ vs /GA/.hanja +ok($Collator->lt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}\x{4E00}")); +ok($hangul ->gt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}\x{4E00}")); + +# LVL vs LVT: /GA/./G/ vs /GAG/ +ok($Collator->lt("\x{1100}\x{1161}\x{1100}", "\x{1100}\x{1161}\x{11A8}")); +ok($hangul ->lt("\x{1100}\x{1161}\x{1100}", "\x{1100}\x{1161}\x{11A8}")); + +# LVT vs LVX: /GAG/ vs /GA/.latinA +ok($Collator->gt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}A")); +ok($hangul ->gt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}A")); + +# LVT vs LVX: /GAG/ vs /GA/.hiraganaA +ok($Collator->lt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{3042}")); +ok($hangul ->gt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{3042}")); + +# LVT vs LVX: /GAG/ vs /GA/.hanja +ok($Collator->lt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{4E00}")); +ok($hangul ->gt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{4E00}")); + +# LV vs Syl(LV): /GA/ vs /[GA]/ +ok($Collator->eq("\x{1100}\x{1161}", "\x{AC00}")); +ok($hangul ->eq("\x{1100}\x{1161}", "\x{AC00}")); + +# LVT vs Syl(LV)T: /GAG/ vs /[GA]G/ +ok($Collator->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC00}\x{11A8}")); +ok($hangul ->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC00}\x{11A8}")); + +# LVT vs Syl(LVT): /GAG/ vs /[GAG]/ +ok($Collator->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC01}")); +ok($hangul ->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC01}")); + +# LVTT vs Syl(LVTT): /GAGG/ vs /[GAGG]/ +ok($Collator->eq("\x{1100}\x{1161}\x{11A9}", "\x{AC02}")); +ok($hangul ->eq("\x{1100}\x{1161}\x{11A9}", "\x{AC02}")); + +# Syl(LVT) vs : /GAG/ vs /[GAG]/ +ok($Collator->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC01}")); +ok($hangul ->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC01}")); + +######################### + +my $hangcirc = Unicode::Collate->new( + level => 1, + table => 'keys.txt', + normalization => undef, + hangul_terminator => 16, + + entry => <<'ENTRIES', +326E ; [.1831.0020.0006.326E][.188D.0020.0006.326E][.10.0.0.0] # c.h.s. GA +326F ; [.1833.0020.0006.326F][.188D.0020.0006.326F][.10.0.0.0] # c.h.s. NA +3270 ; [.1834.0020.0006.3270][.188D.0020.0006.3270][.10.0.0.0] # c.h.s. DA +3271 ; [.1836.0020.0006.3271][.188D.0020.0006.3271][.10.0.0.0] # c.h.s. RA +3272 ; [.1837.0020.0006.3272][.188D.0020.0006.3272][.10.0.0.0] # c.h.s. MA +3273 ; [.1838.0020.0006.3273][.188D.0020.0006.3273][.10.0.0.0] # c.h.s. BA +3274 ; [.183A.0020.0006.3274][.188D.0020.0006.3274][.10.0.0.0] # c.h.s. SA +3275 ; [.183C.0020.0006.3275][.188D.0020.0006.3275][.10.0.0.0] # c.h.s. A +3276 ; [.183D.0020.0006.3276][.188D.0020.0006.3276][.10.0.0.0] # c.h.s. JA +3277 ; [.183F.0020.0006.3277][.188D.0020.0006.3277][.10.0.0.0] # c.h.s. CA +3278 ; [.1840.0020.0006.3278][.188D.0020.0006.3278][.10.0.0.0] # c.h.s. KA +3279 ; [.1841.0020.0006.3279][.188D.0020.0006.3279][.10.0.0.0] # c.h.s. TA +327A ; [.1842.0020.0006.327A][.188D.0020.0006.327A][.10.0.0.0] # c.h.s. PA +327B ; [.1843.0020.0006.327B][.188D.0020.0006.327B][.10.0.0.0] # c.h.s. HA +ENTRIES +); + +# LV vs Circled Syl(LV): /GA/ vs /(GA)/ +ok($Collator->eq("\x{1100}\x{1161}", "\x{326E}")); +ok($hangul ->gt("\x{1100}\x{1161}", "\x{326E}")); +ok($hangcirc->eq("\x{1100}\x{1161}", "\x{326E}")); + +# LV vs Circled Syl(LV): followed by latin A +ok($Collator->eq("\x{1100}\x{1161}A", "\x{326E}A")); +ok($hangul ->lt("\x{1100}\x{1161}A", "\x{326E}A")); +ok($hangcirc->eq("\x{1100}\x{1161}A", "\x{326E}A")); + +# LV vs Circled Syl(LV): followed by hiragana A +ok($Collator->eq("\x{1100}\x{1161}\x{3042}", "\x{326E}\x{3042}")); +ok($hangul ->lt("\x{1100}\x{1161}\x{3042}", "\x{326E}\x{3042}")); +ok($hangcirc->eq("\x{1100}\x{1161}\x{3042}", "\x{326E}\x{3042}")); + +# LVT vs LVX: /GAG/ vs /GA/.hanja +ok($Collator->eq("\x{1100}\x{1161}\x{4E00}", "\x{326E}\x{4E00}")); +ok($hangul ->lt("\x{1100}\x{1161}\x{4E00}", "\x{326E}\x{4E00}")); +ok($hangcirc->eq("\x{1100}\x{1161}\x{4E00}", "\x{326E}\x{4E00}")); + +######################### + +# checks contraction in LVT: +# weights of these contractions may be non-sense. + +my $hangcont = Unicode::Collate->new( + level => 1, + table => 'keys.txt', + normalization => undef, + hangul_terminator => 16, + + entry => <<'ENTRIES', +1100 1161 ; [.1831.0020.0002.1100][.188D.0020.0002.1161] # KIYEOK+A +1161 11A8 ; [.188D.0020.0002.1161][.18CF.0020.0002.11A8] # A+KIYEOK +ENTRIES +); + +# cont<LV> vs Syl(LV): /<GA>/ vs /[GA]/ +ok($Collator->eq("\x{1100}\x{1161}", "\x{AC00}")); +ok($hangcont->eq("\x{1100}\x{1161}", "\x{AC00}")); + +# cont<LV>.T vs Syl(LV).T: /<GA>G/ vs /[GA]G/ +ok($Collator->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC00}\x{11A8}")); +ok($hangcont->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC00}\x{11A8}")); + +# cont<LV>.T vs Syl(LVT): /<GA>G/ vs /[GAG]/ +ok($Collator->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC01}")); +ok($hangcont->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC01}")); + +# L.cont<VT> vs Syl(LV).T: /D<AG>/ vs /[DA]G/ +ok($Collator->eq("\x{1103}\x{1161}\x{11A8}", "\x{B2E4}\x{11A8}")); +ok($hangcont->eq("\x{1103}\x{1161}\x{11A8}", "\x{B2E4}\x{11A8}")); + +# L.cont<VT> vs Syl(LVT): /D<AG>/ vs /[DAG]/ +ok($Collator->eq("\x{1103}\x{1161}\x{11A8}", "\x{B2E5}")); +ok($hangcont->eq("\x{1103}\x{1161}\x{11A8}", "\x{B2E5}")); + +##### + +$Collator->change(hangul_terminator => 16); + +ok($Collator->gt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{4E00}")); +ok($Collator->gt("\x{1100}\x{1161}", "\x{326E}")); +ok($Collator->lt("\x{1100}\x{1161}A", "\x{326E}A")); +ok($Collator->lt("\x{1100}\x{1161}\x{3042}", "\x{326E}\x{3042}")); +ok($Collator->lt("\x{1100}\x{1161}\x{4E00}", "\x{326E}\x{4E00}")); + +$Collator->change(hangul_terminator => 0); + +ok($Collator->lt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{4E00}")); +ok($Collator->eq("\x{1100}\x{1161}", "\x{326E}")); +ok($Collator->eq("\x{1100}\x{1161}A", "\x{326E}A")); +ok($Collator->eq("\x{1100}\x{1161}\x{3042}", "\x{326E}\x{3042}")); +ok($Collator->eq("\x{1100}\x{1161}\x{4E00}", "\x{326E}\x{4E00}")); + +1; +__END__ diff --git a/lib/Unicode/Collate/t/variable.t b/lib/Unicode/Collate/t/variable.t new file mode 100644 index 0000000000..880327a6bd --- /dev/null +++ b/lib/Unicode/Collate/t/variable.t @@ -0,0 +1,108 @@ + +BEGIN { + unless ("A" eq pack('U', 0x41)) { + print "1..0 # Unicode::Collate " . + "cannot stringify a Unicode code point\n"; + exit 0; + } +} + +BEGIN { + if ($ENV{PERL_CORE}) { + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + } +} + +use Test; +BEGIN { plan tests => 37 }; + +use strict; +use warnings; +use Unicode::Collate; + +ok(1); + +######################### + +sub _pack_U { Unicode::Collate::pack_U(@_) } +sub _unpack_U { Unicode::Collate::unpack_U(@_) } + +my $A_acute = _pack_U(0xC1); +my $acute = _pack_U(0x0301); + +my $Collator = Unicode::Collate->new( + table => 'keys.txt', + normalization => undef, +); + +my %origVar = $Collator->change(variable => 'Blanked'); + +ok($Collator->lt("death", "de luge")); +ok($Collator->lt("de luge", "de-luge")); +ok($Collator->lt("de-luge", "deluge")); +ok($Collator->lt("deluge", "de\x{2010}luge")); +ok($Collator->lt("deluge", "de Luge")); + +$Collator->change(variable => 'Non-ignorable'); + +ok($Collator->lt("de luge", "de Luge")); +ok($Collator->lt("de Luge", "de-luge")); +ok($Collator->lt("de-Luge", "de\x{2010}luge")); +ok($Collator->lt("de-luge", "death")); +ok($Collator->lt("death", "deluge")); + +$Collator->change(variable => 'Shifted'); + +ok($Collator->lt("death", "de luge")); +ok($Collator->lt("de luge", "de-luge")); +ok($Collator->lt("de-luge", "deluge")); +ok($Collator->lt("deluge", "de Luge")); +ok($Collator->lt("de Luge", "deLuge")); + +$Collator->change(variable => 'Shift-Trimmed'); + +ok($Collator->lt("death", "deluge")); +ok($Collator->lt("deluge", "de luge")); +ok($Collator->lt("de luge", "de-luge")); +ok($Collator->lt("de-luge", "deLuge")); +ok($Collator->lt("deLuge", "de Luge")); + +$Collator->change(%origVar); + +ok($Collator->{variable}, 'shifted'); + +############## + +# ignorable after variable + +# Shifted; +ok($Collator->eq("?\x{300}!\x{301}\x{315}", "?!")); +ok($Collator->eq("?\x{300}A\x{301}", "?$A_acute")); +ok($Collator->eq("?\x{300}", "?")); +ok($Collator->eq("?\x{344}", "?")); # U+0344 has two CEs. + +$Collator->change(level => 3); +ok($Collator->eq("\cA", "?")); + +$Collator->change(variable => 'blanked', level => 4); +ok($Collator->eq("?\x{300}!\x{301}\x{315}", "?!")); +ok($Collator->eq("?\x{300}A\x{301}", "?$A_acute")); +ok($Collator->eq("?\x{300}", "?")); +ok($Collator->eq("?\x{344}", "?")); # U+0344 has two CEs. + +$Collator->change(level => 3); +ok($Collator->eq("\cA", "?")); + +$Collator->change(variable => 'Non-ignorable', level => 4); + +ok($Collator->lt("?\x{300}", "?!")); +ok($Collator->gt("?\x{300}A$acute", "?$A_acute")); +ok($Collator->gt("?\x{300}", "?")); +ok($Collator->gt("?\x{344}", "?")); + +$Collator->change(level => 3); +ok($Collator->lt("\cA", "?")); + +$Collator->change(variable => 'Shifted', level => 4); + diff --git a/lib/Unicode/Collate/t/version.t b/lib/Unicode/Collate/t/version.t new file mode 100644 index 0000000000..0a6d448e1e --- /dev/null +++ b/lib/Unicode/Collate/t/version.t @@ -0,0 +1,61 @@ + +BEGIN { + unless ("A" eq pack('U', 0x41)) { + print "1..0 # Unicode::Collate " . + "cannot stringify a Unicode code point\n"; + exit 0; + } +} + +BEGIN { + if ($ENV{PERL_CORE}) { + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + } +} + +use Test; +BEGIN { plan tests => 17 }; + +use strict; +use warnings; +use Unicode::Collate; + +ok(1); + +######################### + +# Fix me when UCA and/or key.txt is upgraded. +my $UCA_Version = "11"; +my $Base_Unicode_Version = "4.0"; +my $Key_Version = "3.1.1"; + +ok(Unicode::Collate::UCA_Version, $UCA_Version); +ok(Unicode::Collate->UCA_Version, $UCA_Version); +ok(Unicode::Collate::Base_Unicode_Version, $Base_Unicode_Version); +ok(Unicode::Collate->Base_Unicode_Version, $Base_Unicode_Version); + +my $Collator = Unicode::Collate->new( + table => 'keys.txt', + normalization => undef, +); + +ok($Collator->UCA_Version, $UCA_Version); +ok($Collator->UCA_Version(), $UCA_Version); +ok($Collator->Base_Unicode_Version, $Base_Unicode_Version); +ok($Collator->Base_Unicode_Version(), $Base_Unicode_Version); +ok($Collator->version, $Key_Version); +ok($Collator->version(), $Key_Version); + +my $UndefTable = Unicode::Collate->new( + table => undef, + normalization => undef, +); + +ok($UndefTable->UCA_Version, $UCA_Version); +ok($UndefTable->UCA_Version(), $UCA_Version); +ok($UndefTable->Base_Unicode_Version, $Base_Unicode_Version); +ok($UndefTable->Base_Unicode_Version(), $Base_Unicode_Version); +ok($UndefTable->version, "unknown"); +ok($UndefTable->version(), "unknown"); + |