summaryrefslogtreecommitdiff
path: root/lib/Unicode
diff options
context:
space:
mode:
authorNicholas Clark <nick@ccl4.org>2003-10-27 13:11:48 +0000
committerNicholas Clark <nick@ccl4.org>2003-10-27 13:11:48 +0000
commit91ae00cbaa2c38b2a8123e4417004ca29a7c9bfc (patch)
tree819d7d0f114e48b9c739b3f94e9af3735c0c341f /lib/Unicode
parent0377e16d912288b7c21a9d90350476c453da3e44 (diff)
downloadperl-91ae00cbaa2c38b2a8123e4417004ca29a7c9bfc.tar.gz
Sync with Unicode::Collate 0.30
p4raw-id: //depot/perl@21549
Diffstat (limited to 'lib/Unicode')
-rw-r--r--lib/Unicode/Collate.pm523
-rw-r--r--lib/Unicode/Collate/Changes24
-rw-r--r--lib/Unicode/Collate/README2
-rw-r--r--lib/Unicode/Collate/t/contract.t2
-rw-r--r--lib/Unicode/Collate/t/hangtype.t56
-rw-r--r--lib/Unicode/Collate/t/hangul.t45
-rw-r--r--lib/Unicode/Collate/t/index.t2
-rw-r--r--lib/Unicode/Collate/t/normal.t205
-rw-r--r--lib/Unicode/Collate/t/test.t103
-rw-r--r--lib/Unicode/Collate/t/trailwt.t229
-rw-r--r--lib/Unicode/Collate/t/variable.t108
-rw-r--r--lib/Unicode/Collate/t/version.t61
12 files changed, 1121 insertions, 239 deletions
diff --git a/lib/Unicode/Collate.pm b/lib/Unicode/Collate.pm
index 18ed44626c..a4d6d80cd1 100644
--- a/lib/Unicode/Collate.pm
+++ b/lib/Unicode/Collate.pm
@@ -14,7 +14,7 @@ use File::Spec;
require Exporter;
-our $VERSION = '0.28';
+our $VERSION = '0.30';
our $PACKAGE = __PACKAGE__;
our @ISA = qw(Exporter);
@@ -26,25 +26,6 @@ our @EXPORT = ();
(our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
our $KeyFile = "allkeys.txt";
-our $UNICODE_VERSION;
-
-eval { require Unicode::UCD };
-
-unless ($@) {
- $UNICODE_VERSION = Unicode::UCD::UnicodeVersion();
-}
-else { # Perl 5.6.1
- my($f, $fh);
- foreach my $d (@INC) {
- $f = File::Spec->catfile($d, "unicode", "Unicode.301");
- if (open($fh, $f)) {
- $UNICODE_VERSION = '3.0.1';
- close $fh;
- last;
- }
- }
-}
-
# Perl's boolean
use constant TRUE => 1;
use constant FALSE => "";
@@ -101,13 +82,37 @@ use constant CODE_SEP => ';';
use constant NON_VAR => 0; # Non-Variable character
use constant VAR => 1; # Variable character
+# specific code points
+use constant Hangul_LBase => 0x1100;
+use constant Hangul_LIni => 0x1100;
+use constant Hangul_LFin => 0x1159;
+use constant Hangul_LFill => 0x115F;
+use constant Hangul_VBase => 0x1161;
+use constant Hangul_VIni => 0x1160;
+use constant Hangul_VFin => 0x11A2;
+use constant Hangul_TBase => 0x11A7;
+use constant Hangul_TIni => 0x11A8;
+use constant Hangul_TFin => 0x11F9;
+use constant Hangul_TCount => 28;
+use constant Hangul_NCount => 588;
+use constant Hangul_SBase => 0xAC00;
+use constant Hangul_SIni => 0xAC00;
+use constant Hangul_SFin => 0xD7A3;
+use constant CJK_UidIni => 0x4E00;
+use constant CJK_UidFin => 0x9FA5;
+use constant CJK_ExtAIni => 0x3400;
+use constant CJK_ExtAFin => 0x4DB5;
+use constant CJK_ExtBIni => 0x20000;
+use constant CJK_ExtBFin => 0x2A6D6;
+use constant BMP_Max => 0xFFFF;
+
# Logical_Order_Exception in PropList.txt
# TODO: synchronization with change of PropList.txt.
our $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
-sub UCA_Version { "9" }
+sub UCA_Version { "11" }
-sub Base_Unicode_Version { $UNICODE_VERSION || 'unknown' }
+sub Base_Unicode_Version { "4.0" }
######
@@ -121,20 +126,21 @@ sub unpack_U {
######
-my (%AlternateOK);
-@AlternateOK{ qw/
+my (%VariableOK);
+@VariableOK{ qw/
blanked non-ignorable shifted shift-trimmed
- / } = ();
+ / } = (); # keys lowercased
our @ChangeOK = qw/
alternate backwards level normalization rearrange
katakana_before_hiragana upper_before_lower
overrideHangul overrideCJK preprocess UCA_Version
+ hangul_terminator variable
/;
our @ChangeNG = qw/
- entry entries table maxlength
- ignoreChar ignoreName undefChar undefName
+ entry mapping table maxlength
+ ignoreChar ignoreName undefChar undefName variableTable
versionTable alternateTable backwardsTable forwardsTable rearrangeTable
derivCode normCode rearrangeHash L3_ignorable
backwardsFlag
@@ -142,6 +148,12 @@ our @ChangeNG = qw/
# The hash key 'ignored' is deleted at v 0.21.
# The hash key 'isShift' is deleted at v 0.23.
# The hash key 'combining' is deleted at v 0.24.
+# The hash key 'entries' is deleted at v 0.30.
+
+sub version {
+ my $self = shift;
+ return $self->{versionTable} || 'unknown';
+}
my (%ChangeOK, %ChangeNG);
@ChangeOK{ @ChangeOK } = ();
@@ -151,6 +163,12 @@ sub change {
my $self = shift;
my %hash = @_;
my %old;
+ if (exists $hash{variable} && exists $hash{alternate}) {
+ delete $hash{alternate};
+ }
+ elsif (!exists $hash{variable} && exists $hash{alternate}) {
+ $hash{variable} = $hash{alternate};
+ }
foreach my $k (keys %hash) {
if (exists $ChangeOK{$k}) {
$old{$k} = $self->{$k};
@@ -174,18 +192,24 @@ sub _checkLevel {
$level, $key, MaxLevel if MaxLevel < $level;
}
+my %DerivCode = (
+ 8 => \&_derivCE_8,
+ 9 => \&_derivCE_9,
+ 11 => \&_derivCE_9, # 11 == 9
+);
+
sub checkCollator {
my $self = shift;
_checkLevel($self->{level}, "level");
- $self->{derivCode} =
- $self->{UCA_Version} == 8 ? \&_derivCE_8 :
- $self->{UCA_Version} == 9 ? \&_derivCE_9 :
- croak "Illegal UCA version (passed $self->{UCA_Version}).";
+ $self->{derivCode} = $DerivCode{ $self->{UCA_Version} }
+ or croak "Illegal UCA version (passed $self->{UCA_Version}).";
- $self->{alternate} = lc($self->{alternate});
- croak "$PACKAGE unknown alternate tag name: $self->{alternate}"
- unless exists $AlternateOK{ $self->{alternate} };
+ $self->{variable} ||= $self->{alternate} || $self->{variableTable} ||
+ $self->{alternateTable} || $self->{alternate} || 'shifted';
+ $self->{variable} = $self->{alternate} = lc($self->{variable});
+ exists $VariableOK{ $self->{variable} }
+ or croak "$PACKAGE unknown variable tag name: $self->{variable}";
if (! defined $self->{backwards}) {
$self->{backwardsFlag} = 0;
@@ -206,10 +230,9 @@ sub checkCollator {
}
}
- $self->{rearrange} = []
- if ! defined $self->{rearrange};
- croak "$PACKAGE: A list for rearrangement must be store in an ARRAYREF"
- if ! ref $self->{rearrange};
+ defined $self->{rearrange} or $self->{rearrange} = [];
+ ref $self->{rearrange}
+ or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF";
# keys of $self->{rearrangeHash} are $self->{rearrange}.
$self->{rearrangeHash} = undef;
@@ -222,13 +245,14 @@ sub checkCollator {
if (defined $self->{normalization}) {
eval { require Unicode::Normalize };
- croak "Unicode/Normalize.pm is required to normalize strings: $@"
- if $@;
+ $@ and croak "Unicode::Normalize is required to normalize strings";
- $CVgetCombinClass = \&Unicode::Normalize::getCombinClass
- if ! $CVgetCombinClass;
+ $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;
- if ($self->{normalization} ne 'prenormalized') {
+ if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default
+ $self->{normCode} = \&Unicode::Normalize::NFD;
+ }
+ elsif ($self->{normalization} ne 'prenormalized') {
my $norm = $self->{normalization};
$self->{normCode} = sub {
Unicode::Normalize::normalize($norm, shift);
@@ -262,8 +286,6 @@ sub new
if ! exists $self->{overrideCJK};
$self->{normalization} = 'NFD'
if ! exists $self->{normalization};
- $self->{alternate} = $self->{alternateTable} || 'shifted'
- if ! exists $self->{alternate};
$self->{rearrange} = $self->{rearrangeTable} || $DefaultRearrange
if ! exists $self->{rearrange};
$self->{backwards} = $self->{backwardsTable}
@@ -288,7 +310,10 @@ sub read_table {
if (/^\s*\@version\s*(\S*)/) {
$self->{versionTable} ||= $1;
}
- elsif (/^\s*\@alternate\s+(\S*)/) {
+ elsif (/^\s*\@variable\s+(\S*)/) { # since UTS #10-9
+ $self->{variableTable} ||= $1;
+ }
+ elsif (/^\s*\@alternate\s+(\S*)/) { # till UTS #10-8
$self->{alternateTable} ||= $1;
}
elsif (/^\s*\@backwards\s+(\S*)/) {
@@ -364,35 +389,39 @@ sub parseEntry
# if and only if "all" CEs are [.0000.0000.0000].
}
- $self->{entries}{$entry} = \@key;
-
- $self->{L3_ignorable}{$uv[0]} = TRUE
- if @uv == 1 && $is_L3_ignorable;
+ $self->{mapping}{$entry} = \@key;
- # Contraction is to be considered in the range of this maxlength.
- $self->{maxlength}{$uv[0]} = scalar @uv
- if @uv > 1;
+ if (@uv > 1) {
+ (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv)
+ and $self->{maxlength}{$uv[0]} = @uv;
+ }
+ else {
+ $is_L3_ignorable
+ ? ($self->{L3_ignorable}{$uv[0]} = TRUE)
+ : ($self->{L3_ignorable}{$uv[0]} and
+ $self->{L3_ignorable}{$uv[0]} = FALSE); # &&= stores key.
+ }
}
##
-## arrayref[weights] = altCE(VCE)
+## arrayref[weights] = varCE(VCE)
##
-sub altCE
+sub varCE
{
my $self = shift;
my($var, @wt) = unpack(VCE_TEMPLATE, shift);
- $self->{alternate} eq 'blanked' ?
+ $self->{variable} eq 'blanked' ?
$var ? [Var1Wt, 0, 0, $wt[3]] : \@wt :
- $self->{alternate} eq 'non-ignorable' ?
+ $self->{variable} eq 'non-ignorable' ?
\@wt :
- $self->{alternate} eq 'shifted' ?
+ $self->{variable} eq 'shifted' ?
$var ? [Var1Wt, 0, 0, $wt[0] ]
: [ @wt[0..2], $wt[0]+$wt[1]+$wt[2] ? Shift4Wt : 0 ] :
- $self->{alternate} eq 'shift-trimmed' ?
+ $self->{variable} eq 'shift-trimmed' ?
$var ? [Var1Wt, 0, 0, $wt[0] ] : [ @wt[0..2], 0 ] :
- croak "$PACKAGE unknown alternate name: $self->{alternate}";
+ croak "$PACKAGE unknown variable name: $self->{variable}";
}
sub viewSortKey
@@ -416,21 +445,21 @@ sub visualizeSortKey
##
-## arrayref of JCPS = splitCE(string to be collated)
-## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitCE(string, true)
+## arrayref of JCPS = splitEnt(string to be collated)
+## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, true)
##
-sub splitCE
+sub splitEnt
{
my $self = shift;
my $wLen = $_[1];
my $code = $self->{preprocess};
my $norm = $self->{normCode};
- my $ent = $self->{entries};
+ my $map = $self->{mapping};
my $max = $self->{maxlength};
my $reH = $self->{rearrangeHash};
my $ign = $self->{L3_ignorable};
- my $ver9 = $self->{UCA_Version} > 8;
+ my $ver9 = $self->{UCA_Version} >= 9;
my ($str, @buf);
@@ -473,26 +502,26 @@ sub splitCE
next if _isNonCharacter($src[$i]);
my $i_orig = $i;
- my $ce = $src[$i];
+ my $jcps = $src[$i];
- if ($max->{$ce}) { # contract
- my $temp_ce = $ce;
- my $ceLen = 1;
- my $maxLen = $max->{$ce};
+ if ($max->{$jcps}) { # contract
+ my $temp_jcps = $jcps;
+ my $jcpsLen = 1;
+ my $maxLen = $max->{$jcps};
- for (my $p = $i + 1; $ceLen < $maxLen && $p < @src; $p++) {
+ for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
next if ! defined $src[$p];
- $temp_ce .= CODE_SEP . $src[$p];
- $ceLen++;
- if ($ent->{$temp_ce}) {
- $ce = $temp_ce;
+ $temp_jcps .= CODE_SEP . $src[$p];
+ $jcpsLen++;
+ if ($map->{$temp_jcps}) {
+ $jcps = $temp_jcps;
$i = $p;
}
}
# not-contiguous contraction with Combining Char (cf. UTS#10, S2.1).
# This process requires Unicode::Normalize.
- # If "normalize" is undef, here should be skipped *always*
+ # If "normalization" is undef, here should be skipped *always*
# (in spite of bool value of $CVgetCombinClass),
# since canonical ordering cannot be expected.
# Blocked combining character should not be contracted.
@@ -508,8 +537,8 @@ sub splitCE
$curCC = $CVgetCombinClass->($src[$p]);
last unless $curCC;
my $tail = CODE_SEP . $src[$p];
- if ($preCC != $curCC && $ent->{$ce.$tail}) {
- $ce .= $tail;
+ if ($preCC != $curCC && $map->{$jcps.$tail}) {
+ $jcps .= $tail;
$src[$p] = undef;
} else {
$preCC = $curCC;
@@ -525,7 +554,7 @@ sub splitCE
}
}
- push @buf, $wLen ? [$ce, $i_orig, $i + 1] : $ce;
+ push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
}
return \@buf;
}
@@ -537,18 +566,16 @@ sub splitCE
sub getWt
{
my $self = shift;
- my $ce = shift;
- my $ent = $self->{entries};
+ my $u = shift;
+ my $map = $self->{mapping};
my $der = $self->{derivCode};
- return if !defined $ce;
- return map($self->altCE($_), @{ $ent->{$ce} })
- if $ent->{$ce};
-
- # CE must not be a contraction, then it's a code point.
- my $u = $ce;
+ return if !defined $u;
+ return map($self->varCE($_), @{ $map->{$u} })
+ if $map->{$u};
- if (0xAC00 <= $u && $u <= 0xD7A3) { # is Hangul Syllale
+ # JCPS must not be a contraction, then it's a code point.
+ if (Hangul_SIni <= $u && $u <= Hangul_SFin) {
my $hang = $self->{overrideHangul};
my @hangulCE;
if ($hang) {
@@ -563,45 +590,44 @@ sub getWt
if (@decH == 2) {
my $contract = join(CODE_SEP, @decH);
- @decH = ($contract) if $ent->{$contract};
+ @decH = ($contract) if $map->{$contract};
} else { # must be <@decH == 3>
if ($max->{$decH[0]}) {
my $contract = join(CODE_SEP, @decH);
- if ($ent->{$contract}) {
+ if ($map->{$contract}) {
@decH = ($contract);
} else {
$contract = join(CODE_SEP, @decH[0,1]);
- $ent->{$contract} and @decH = ($contract, $decH[2]);
+ $map->{$contract} and @decH = ($contract, $decH[2]);
}
# even if V's ignorable, LT contraction is not supported.
# If such a situatution were required, NFD should be used.
}
if (@decH == 3 && $max->{$decH[1]}) {
my $contract = join(CODE_SEP, @decH[1,2]);
- $ent->{$contract} and @decH = ($decH[0], $contract);
+ $map->{$contract} and @decH = ($decH[0], $contract);
}
}
@hangulCE = map({
- $ent->{$_} ? @{ $ent->{$_} } : $der->($_);
+ $map->{$_} ? @{ $map->{$_} } : $der->($_);
} @decH);
}
- return map $self->altCE($_), @hangulCE;
+ return map $self->varCE($_), @hangulCE;
}
- elsif (0x3400 <= $u && $u <= 0x4DB5 ||
- 0x4E00 <= $u && $u <= 0x9FA5 ||
- 0x20000 <= $u && $u <= 0x2A6D6) # CJK Ideograph
- {
+ elsif (CJK_UidIni <= $u && $u <= CJK_UidFin ||
+ CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
+ CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) {
my $cjk = $self->{overrideCJK};
- return map $self->altCE($_),
+ return map $self->varCE($_),
$cjk
? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$cjk($u))
- : defined $cjk && $self->{UCA_Version} <= 8 && $u < 0x10000
+ : defined $cjk && $self->{UCA_Version} <= 8 && $u <= BMP_Max
? pack(VCE_TEMPLATE, NON_VAR, $u, Min2Wt, Min3Wt, $u)
: $der->($u);
}
else {
- return map $self->altCE($_), $der->($u);
+ return map $self->varCE($_), $der->($u);
}
}
@@ -613,14 +639,42 @@ sub getSortKey
{
my $self = shift;
my $lev = $self->{level};
- my $rCE = $self->splitCE(shift); # get an arrayref of JCPS
- my $ver9 = $self->{UCA_Version} > 8;
- my $v2i = $self->{alternate} ne 'non-ignorable';
+ my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS
+ my $ver9 = $self->{UCA_Version} >= 9;
+ my $v2i = $self->{variable} ne 'non-ignorable';
# weight arrays
- my (@buf, $last_is_variable);
+ my (@wts, @buf, $last_is_variable);
+
+ if ($self->{hangul_terminator}) {
+ my $preHST = '';
+ foreach my $jcps (@$rEnt) {
+ # weird things like VL, TL-contraction are not considered!
+ my $curHST = '';
+ foreach my $u (split /;/, $jcps) {
+ $curHST .= getHST($u);
+ }
+ if ($preHST && !$curHST || # hangul before non-hangul
+ $preHST =~ /L\z/ && $curHST =~ /^T/ ||
+ $preHST =~ /V\z/ && $curHST =~ /^L/ ||
+ $preHST =~ /T\z/ && $curHST =~ /^[LV]/) {
- foreach my $wt (map $self->getWt($_), @$rCE) {
+ push @wts, $self->varCE_HangulTerm;
+ }
+ $preHST = $curHST;
+
+ push @wts, $self->getWt($jcps);
+ }
+ $preHST # end at hangul
+ and push @wts, $self->varCE_HangulTerm;
+ }
+ else {
+ foreach my $jcps (@$rEnt) {
+ push @wts, $self->getWt($jcps);
+ }
+ }
+
+ foreach my $wt (@wts) {
if ($v2i && $ver9) {
if ($wt->[0] == 0) { # ignorable
next if $last_is_variable;
@@ -694,9 +748,10 @@ sub sort {
sub _derivCE_9 {
my $u = shift;
my $base =
- (0x4E00 <= $u && $u <= 0x9FA5)
+ (CJK_UidIni <= $u && $u <= CJK_UidFin)
? 0xFB40 : # CJK
- (0x3400 <= $u && $u <= 0x4DB5 || 0x20000 <= $u && $u <= 0x2A6D6)
+ (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
+ CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
? 0xFB80 # CJK ext.
: 0xFBC0; # others
@@ -716,6 +771,14 @@ sub _derivCE_8 {
pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code);
}
+
+sub varCE_HangulTerm {
+ my $self = shift;
+ return $self->varCE(pack(VCE_TEMPLATE,
+ NON_VAR, $self->{hangul_terminator}, 0,0,0));
+}
+
+
##
## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
##
@@ -727,14 +790,14 @@ sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
#
sub _decompHangul {
my $code = shift;
- my $SIndex = $code - 0xAC00;
- my $LIndex = int( $SIndex / 588);
- my $VIndex = int(($SIndex % 588) / 28);
- my $TIndex = $SIndex % 28;
+ my $SIndex = $code - Hangul_SBase;
+ my $LIndex = int( $SIndex / Hangul_NCount);
+ my $VIndex = int(($SIndex % Hangul_NCount) / Hangul_TCount);
+ my $TIndex = $SIndex % Hangul_TCount;
return (
- 0x1100 + $LIndex,
- 0x1161 + $VIndex,
- $TIndex ? (0x11A7 + $TIndex) : (),
+ Hangul_LBase + $LIndex,
+ Hangul_VBase + $VIndex,
+ $TIndex ? (Hangul_TBase + $TIndex) : (),
);
}
@@ -748,6 +811,17 @@ sub _isNonCharacter {
;
}
+# Hangul Syllable Type
+sub getHST {
+ my $u = shift;
+ return
+ Hangul_LIni <= $u && $u <= Hangul_LFin || $u == Hangul_LFill ? "L" :
+ Hangul_VIni <= $u && $u <= Hangul_VFin ? "V" :
+ Hangul_TIni <= $u && $u <= Hangul_TFin ? "T" :
+ Hangul_SIni <= $u && $u <= Hangul_SFin ?
+ ($u - Hangul_SBase) % Hangul_TCount ? "LVT" : "LV" : "";
+}
+
##
## bool _nonIgnorAtLevel(arrayref weights, int level)
@@ -796,19 +870,19 @@ sub _eqArray($$$)
##
sub index
{
- my $self = shift;
- my $str = shift;
- my $len = length($str);
- my $subCE = $self->splitCE(shift);
- my $pos = @_ ? shift : 0;
- $pos = 0 if $pos < 0;
- my $grob = shift;
-
- my $lev = $self->{level};
- my $ver9 = $self->{UCA_Version} > 8;
- my $v2i = $self->{alternate} ne 'non-ignorable';
-
- if (! @$subCE) {
+ my $self = shift;
+ my $str = shift;
+ my $len = length($str);
+ my $subE = $self->splitEnt(shift);
+ my $pos = @_ ? shift : 0;
+ $pos = 0 if $pos < 0;
+ my $grob = shift;
+
+ my $lev = $self->{level};
+ my $ver9 = $self->{UCA_Version} >= 9;
+ my $v2i = $self->{variable} ne 'non-ignorable';
+
+ if (! @$subE) {
my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
return $grob
? map([$_, 0], $temp..$len)
@@ -817,15 +891,15 @@ sub index
if ($len < $pos) {
return wantarray ? () : NOMATCHPOS;
}
- my $strCE = $self->splitCE($pos ? substr($str, $pos) : $str, TRUE);
- if (! @$strCE) {
+ my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
+ if (! @$strE) {
return wantarray ? () : NOMATCHPOS;
}
my $last_is_variable;
my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
$last_is_variable = FALSE;
- for my $wt (map $self->getWt($_), @$subCE) {
+ for my $wt (map $self->getWt($_), @$subE) {
my $to_be_pushed = _nonIgnorAtLevel($wt,$lev);
if ($v2i && $ver9) {
@@ -845,7 +919,7 @@ sub index
}
my $count = 0;
- my $end = @$strCE - 1;
+ my $end = @$strE - 1;
$last_is_variable = FALSE;
@@ -854,7 +928,7 @@ sub index
# fetch a grapheme
while ($i <= $end && $found_base == 0) {
- for my $wt ($self->getWt($strCE->[$i][0])) {
+ for my $wt ($self->getWt($strE->[$i][0])) {
my $to_be_pushed = _nonIgnorAtLevel($wt,$lev);
if ($v2i && $ver9) {
@@ -867,13 +941,13 @@ sub index
if (@strWt && $wt->[0] == 0) {
push @{ $strWt[-1] }, $wt if $to_be_pushed;
- $finPos[-1] = $strCE->[$i][2];
+ $finPos[-1] = $strE->[$i][2];
} elsif ($to_be_pushed) {
$wt->[0] = 0 if $wt->[0] == Var1Wt;
push @strWt, [ $wt ];
- push @iniPos, $found_base ? NOMATCHPOS : $strCE->[$i][1];
+ push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1];
$finPos[-1] = NOMATCHPOS if $found_base;
- push @finPos, $strCE->[$i][2];
+ push @finPos, $strE->[$i][2];
$found_base++;
}
# else ===> no-op
@@ -1004,6 +1078,9 @@ Unicode::Collate - Unicode Collation Algorithm
#compare
$result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
+ # If %tailoring is false (i.e. empty),
+ # $Collator should do the default collation.
+
=head1 DESCRIPTION
This module is an implementation
@@ -1016,14 +1093,15 @@ The C<new> method returns a collator object.
$Collator = Unicode::Collate->new(
UCA_Version => $UCA_Version,
- alternate => $alternate,
+ alternate => $alternate, # deprecated: use of 'variable' is recommended.
backwards => $levelNumber, # or \@levelNumbers
entry => $element,
- normalization => $normalization_form,
+ hangul_terminator => $term_primary_weight,
ignoreName => qr/$ignoreName/,
ignoreChar => qr/$ignoreChar/,
katakana_before_hiragana => $bool,
level => $collationLevel,
+ normalization => $normalization_form,
overrideCJK => \&overrideCJK,
overrideHangul => \&overrideHangul,
preprocess => \&preprocess,
@@ -1032,50 +1110,22 @@ The C<new> method returns a collator object.
undefName => qr/$undefName/,
undefChar => qr/$undefChar/,
upper_before_lower => $bool,
+ variable => $variable,
);
- # if %tailoring is false (i.e. empty),
- # $Collator should do the default collation.
=over 4
=item UCA_Version
-If the version number of the older UCA is given,
-the older behavior of that version is emulated on collating.
+If the tracking version number of the older UCA is given,
+the older behavior of that tracking version is emulated on collating.
If omitted, the return value of C<UCA_Version()> is used.
-The supported version: 8 or 9.
+The supported tracking version: 8, 9, or 11.
B<This parameter may be removed in the future version,
as switching the algorithm would affect the performance.>
-=item alternate
-
--- see 3.2.2 Variable Weighting, UTS #10.
-
-(the title in UCA version 8: Alternate Weighting)
-
-This key allows to alternate weighting for variable collation elements,
-which are marked with an ASTERISK in the table
-(NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
-
- alternate => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
-
-These names are case-insensitive.
-By default (if specification is omitted), 'shifted' is adopted.
-
- 'Blanked' Variable elements are made ignorable at levels 1 through 3;
- considered at the 4th level.
-
- 'Non-ignorable' Variable elements are not reset to ignorable.
-
- 'Shifted' Variable elements are made ignorable at levels 1 through 3
- their level 4 weight is replaced by the old level 1 weight.
- Level 4 weight for Non-Variable elements is 0xFFFF.
-
- 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
- are trimmed.
-
=item backwards
-- see 3.1.2 French Accents, UTS #10.
@@ -1089,7 +1139,10 @@ If omitted, forwards at all the levels.
-- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10.
-Overrides a default order or defines additional collation elements
+If the same character (or a sequence of characters) exists
+in the collation element table through C<table>,
+mapping to collation elements is overrided.
+If it does not exist, the mapping is defined additionally.
entry => <<'ENTRIES', # use the UCA file format
00E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a><e>
@@ -1102,6 +1155,34 @@ B<must> be a Unicode code point, but not a native code point.
So C<0063> must always denote C<U+0063>,
but not a character of C<"\x63">.
+=item hangul_terminator
+
+-- see Condition B.2. in 7.1.4 Trailing Weights, UTS #10.
+
+If a true value is given (non-zero but should be positive),
+it will be added as a terminator primary weight to the end of
+every standard Hangul syllable. Secondary and any higher weights
+for terminator are set to zero.
+If the value is false or C<hangul_terminator> key does not exist,
+insertion of terminator weights will not be performed.
+
+Boundaries of Hangul syllables are determined
+according to conjoining Jamo behavior in F<the Unicode Standard>
+and F<HangulSyllableType.txt>.
+
+B<Implementation Note:>
+(1) For expansion mapping (Unicode character mapped
+to a sequence of collation elements), a terminator will not be added
+between collation elements, even if Hangul syllable boundary exists there.
+Addition of terminator is restricted to the next position
+to the last collation element.
+
+(2) Non-conjoining Hangul letters
+(Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not
+automatically terminated with a terminator primary weight.
+These characters may need terminator included in a collation element
+table beforehand.
+
=item ignoreName
=item ignoreChar
@@ -1124,7 +1205,7 @@ Any higher levels than the specified one are ignored.
Level 1: alphabetic ordering
Level 2: diacritic ordering
Level 3: case ordering
- Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
+ Level 4: tie-breaking (e.g. in the case when variable is 'shifted')
ex.level => 2,
@@ -1143,7 +1224,7 @@ Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
See C<Unicode::Normalize::normalize()> for detail.
If omitted, C<'NFD'> is used.
-L<normalization> is performed after L<preprocess> (if defined).
+C<normalization> is performed after C<preprocess> (if defined).
Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
though they are not concerned with C<Unicode::Normalize::normalize()>.
@@ -1175,9 +1256,12 @@ B<Unicode::Normalize> is required (see also B<CAVEAT>).
-- see 7.1 Derived Collation Elements, UTS #10.
-By default, mapping of CJK Unified Ideographs
-uses the Unicode codepoint order.
-But the mapping of CJK Unified Ideographs may be overrided.
+By default, CJK Unified Ideographs are ordered in Unicode codepoint order
+(but C<CJK Unified Ideographs> [C<U+4E00> to C<U+9FA5>] are lesser than
+C<CJK Unified Ideographs Extension> [C<U+3400> to C<U+4DB5> and
+C<U+20000> to C<U+2A6D6>].
+
+Through C<overrideCJK>, ordering of CJK Unified Ideographs can be overrided.
ex. CJK Unified Ideographs in the JIS code point order.
@@ -1199,7 +1283,7 @@ ex. ignores all CJK Unified Ideographs.
If C<undef> is passed explicitly as the value for this key,
weights for CJK Unified Ideographs are treated as undefined.
But assignment of weight for CJK Unified Ideographs
-in table or L<entry> is still valid.
+in table or C<entry> is still valid.
=item overrideHangul
@@ -1208,7 +1292,7 @@ in table or L<entry> is still valid.
By default, Hangul Syllables are decomposed into Hangul Jamo.
But the mapping of Hangul Syllables may be overrided.
-This tag works like L<overrideCJK>, so see there for examples.
+This tag works like C<overrideCJK>, so see there for examples.
If you want to override the mapping of Hangul Syllables,
the Normalization Forms D and KD are not appropriate
@@ -1218,7 +1302,7 @@ If C<undef> is passed explicitly as the value for this key,
weight for Hangul Syllables is treated as undefined
without decomposition into Hangul Jamo.
But definition of weight for Hangul Syllables
-in table or L<entry> is still valid.
+in table or C<entry> is still valid.
=item preprocess
@@ -1236,7 +1320,7 @@ Then, "the pen" is before "a pencil".
return $str;
},
-L<preprocess> is performed before L<normalization> (if defined).
+C<preprocess> is performed before C<normalization> (if defined).
=item rearrange
@@ -1258,7 +1342,7 @@ but it is not warned at present.>
-- see 3.2 Default Unicode Collation Element Table, UTS #10.
-You can use another element table if desired.
+You can use another collation element table if desired.
The table file must be put into a directory
where F<Unicode/Collate.pm> is installed.
E.g. in F<perl/lib/Unicode/Collate> directory
@@ -1267,7 +1351,7 @@ when you have F<perl/lib/Unicode/Collate.pm>.
By default, the filename F<"allkeys.txt"> is used.
If C<undef> is passed explicitly as the value for this key,
-no file is read (but you can define collation elements via L<entry>).
+no file is read (but you can define collation elements via C<entry>).
A typical way to define a collation element table
without any file of table:
@@ -1318,6 +1402,38 @@ must be same as those mentioned in 7.3.1, UTS #10.
If you define your collation elements which violate this requirement,
these tags don't work validly.
+=item variable
+
+=item alternate
+
+-- see 3.2.2 Variable Weighting, UTS #10.
+
+(the title in UCA version 8: Alternate Weighting)
+
+This key allows to variable weighting for variable collation elements,
+which are marked with an ASTERISK in the table
+(NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
+
+ variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
+
+These names are case-insensitive.
+By default (if specification is omitted), 'shifted' is adopted.
+
+ 'Blanked' Variable elements are made ignorable at levels 1 through 3;
+ considered at the 4th level.
+
+ 'Non-ignorable' Variable elements are not reset to ignorable.
+
+ 'Shifted' Variable elements are made ignorable at levels 1 through 3
+ their level 4 weight is replaced by the old level 1 weight.
+ Level 4 weight for Non-Variable elements is 0xFFFF.
+
+ 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
+ are trimmed.
+
+For backward compatibility, C<alternate> can be used as an alias
+for C<variable>.
+
=back
=head2 Methods for Collation
@@ -1391,7 +1507,7 @@ for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
C<subst>, C<gsubst>) is croaked,
as the position and the length might differ
from those on the specified string.
-(And the C<rearrange> tag is neglected.)
+(And C<rearrange> and C<hangul_terminator> tags are neglected.)
The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
@@ -1530,14 +1646,20 @@ In the scalar context, returns the modified collator
$Collator->change(level => 4)->eq("perl", "PERL"); # false
-=item UCA_Version
+=item C<$version = $Collator-E<gt>version()>
-Returns the version number of UTS #10 this module consults.
+Returns the version number (a string) of the Unicode Standard
+which the C<table> file used by the collator object is based on.
+If the table does not include a version line (starting with C<@version>),
+returns C<"unknown">.
+
+=item C<UCA_Version()>
-=item Base_Unicode_Version
+Returns the tracking version number of UTS #10 this module consults.
-Returns the version number of the Unicode Standard
-this module is based on.
+=item C<Base_Unicode_Version()>
+
+Returns the version number of UTS #10 this module consults.
=back
@@ -1565,7 +1687,7 @@ and L<http://www.unicode.org/reports/tr10/CollationTest.zip>
For F<CollationTest_SHIFTED.txt>,
a collator via C<Unicode::Collate-E<gt>new( )> should be used;
for F<CollationTest_NON_IGNORABLE.txt>, a collator via
-C<Unicode::Collate-E<gt>new(alternate =E<gt> "non-ignorable", level =E<gt> 3)>.
+C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
B<Unicode::Normalize is required to try The Conformance Test.>
@@ -1584,22 +1706,27 @@ SADAHIRO Tomoyuki, <SADAHIRO@cpan.org>
=over 4
-=item http://www.unicode.org/reports/tr10/
+=item Unicode Collation Algorithm - UTS #10
+
+L<http://www.unicode.org/reports/tr10/>
+
+=item The Default Unicode Collation Element Table (DUCET)
+
+L<http://www.unicode.org/reports/tr10/allkeys.txt>
-Unicode Collation Algorithm - UTS #10
+=item The conformance test for the UCA
-=item http://www.unicode.org/reports/tr10/allkeys.txt
+L<http://www.unicode.org/reports/tr10/CollationTest.html>
-The Default Unicode Collation Element Table
+L<http://www.unicode.org/reports/tr10/CollationTest.zip>
-=item http://www.unicode.org/reports/tr10/CollationTest.html
-http://www.unicode.org/reports/tr10/CollationTest.zip
+=item Hangul Syllable Type
-The latest versions of the conformance test for the UCA
+http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt
-=item http://www.unicode.org/reports/tr15/
+=item Unicode Normalization Forms - UAX #15
-Unicode Normalization Forms - UAX #15
+L<http://www.unicode.org/reports/tr15/>
=item L<Unicode::Normalize>
diff --git a/lib/Unicode/Collate/Changes b/lib/Unicode/Collate/Changes
index 3d39bbe248..7f92d7aad1 100644
--- a/lib/Unicode/Collate/Changes
+++ b/lib/Unicode/Collate/Changes
@@ -1,5 +1,22 @@
Revision history for Perl module Unicode::Collate.
+0.30 Mon Oct 13 21:26:37 2003
+ - fix: Completely ignorable in table should be able to be overrided
+ by non-ignorable in entry.
+ - fix: Maximum length for contraction must not be shortened
+ by a shorter contraction following.
+ - added normal.t.
+ - some doc fixes
+
+0.29 Mon Oct 13 12:18:23 2003
+ - supported hangul_terminator.
+ - fix: Base_Unicode_Version falsely returns Perl's Unicode version.
+ C4 in UTS #10 requires UTS's Unicode version.
+ - For variable weighting, 'variable' is recommended
+ and 'alternate' is deprecated.
+ - added version() method.
+ - added hangtype.t, trailwt.t, variable.t, and version.t.
+
0.28 Sat Sep 06 20:16:01 2003
- Fixed another inconsistency under (normalization => undef):
Non-contiguous contraction is always neglected.
@@ -14,9 +31,10 @@ Revision history for Perl module Unicode::Collate.
Collation of a large string including a first letter of a contraction
that is not a part of that contraction (say, 'c' of 'ca'
where 'ch' is defined) was too slow, inefficient.
- - A form name for 'normalize', no longer restricted to /^(?:NF)?K?[CD]\z/,
- will be allowed as long as Unicode::Normalize::normalize() accepts it.
- since Unicode::Normalize or UAX #15 may be changed/enhanced in future.
+ - A form name for 'normalization', no longer restricted to
+ /^(?:NF)?K?[CD]\z/, will be allowed as long as
+ Unicode::Normalize::normalize() accepts it, since Unicode::Normalize
+ or UAX #15 may be changed/enhanced in future.
- When Hangul syllables are decomposed under <normalization => undef>,
contraction among jamo (LV, VT, LVT) derived from the same
Hangul syllable is allowed. Added hangul.t.
diff --git a/lib/Unicode/Collate/README b/lib/Unicode/Collate/README
index 3c86573ec3..6a4b712a8b 100644
--- a/lib/Unicode/Collate/README
+++ b/lib/Unicode/Collate/README
@@ -1,4 +1,4 @@
-Unicode/Collate version 0.28
+Unicode/Collate version 0.30
===============================
NAME
diff --git a/lib/Unicode/Collate/t/contract.t b/lib/Unicode/Collate/t/contract.t
index c2aaecfaa7..1c6658d572 100644
--- a/lib/Unicode/Collate/t/contract.t
+++ b/lib/Unicode/Collate/t/contract.t
@@ -51,7 +51,7 @@ ENTRIES
#########################
-ok(1); # If we made it this far, we're ok.
+ok(1);
my $kjeNoN = Unicode::Collate->new(
level => 1,
diff --git a/lib/Unicode/Collate/t/hangtype.t b/lib/Unicode/Collate/t/hangtype.t
new file mode 100644
index 0000000000..b6a46691aa
--- /dev/null
+++ b/lib/Unicode/Collate/t/hangtype.t
@@ -0,0 +1,56 @@
+BEGIN {
+ unless ("A" eq pack('U', 0x41)) {
+ print "1..0 # Unicode::Collate " .
+ "cannot stringify a Unicode code point\n";
+ exit 0;
+ }
+}
+
+BEGIN {
+ if ($ENV{PERL_CORE}) {
+ chdir('t') if -d 't';
+ @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib);
+ }
+}
+
+use Test;
+BEGIN { plan tests => 30 };
+
+use strict;
+use warnings;
+use Unicode::Collate;
+
+ok(1);
+
+#########################
+
+ok(Unicode::Collate::getHST(0x0000), '');
+ok(Unicode::Collate::getHST(0x0100), '');
+ok(Unicode::Collate::getHST(0x1000), '');
+ok(Unicode::Collate::getHST(0x10FF), '');
+ok(Unicode::Collate::getHST(0x1100), 'L');
+ok(Unicode::Collate::getHST(0x1101), 'L');
+ok(Unicode::Collate::getHST(0x1159), 'L');
+ok(Unicode::Collate::getHST(0x115A), '');
+ok(Unicode::Collate::getHST(0x115E), '');
+ok(Unicode::Collate::getHST(0x115F), 'L');
+ok(Unicode::Collate::getHST(0x1160), 'V');
+ok(Unicode::Collate::getHST(0x1161), 'V');
+ok(Unicode::Collate::getHST(0x11A0), 'V');
+ok(Unicode::Collate::getHST(0x11A2), 'V');
+ok(Unicode::Collate::getHST(0x11A3), '');
+ok(Unicode::Collate::getHST(0x11A7), '');
+ok(Unicode::Collate::getHST(0x11A8), 'T');
+ok(Unicode::Collate::getHST(0x11AF), 'T');
+ok(Unicode::Collate::getHST(0x11E0), 'T');
+ok(Unicode::Collate::getHST(0x11F9), 'T');
+ok(Unicode::Collate::getHST(0x11FA), '');
+ok(Unicode::Collate::getHST(0x11FF), '');
+ok(Unicode::Collate::getHST(0x3011), '');
+ok(Unicode::Collate::getHST(0x11A7), '');
+ok(Unicode::Collate::getHST(0xAC00), 'LV');
+ok(Unicode::Collate::getHST(0xAC01), 'LVT');
+ok(Unicode::Collate::getHST(0xAC1B), 'LVT');
+ok(Unicode::Collate::getHST(0xAC1C), 'LV');
+ok(Unicode::Collate::getHST(0xD7A3), 'LVT');
+
diff --git a/lib/Unicode/Collate/t/hangul.t b/lib/Unicode/Collate/t/hangul.t
index be6b0724fb..1b1359e88d 100644
--- a/lib/Unicode/Collate/t/hangul.t
+++ b/lib/Unicode/Collate/t/hangul.t
@@ -14,7 +14,7 @@ BEGIN {
}
use Test;
-BEGIN { plan tests => 52 };
+BEGIN { plan tests => 72 };
use strict;
use warnings;
@@ -25,7 +25,7 @@ $IsEBCDIC = ord("A") != 0x41;
#########################
-ok(1); # If we made it this far, we're ok.
+ok(1);
# a standard collator (3.1.1)
my $Collator = Unicode::Collate->new(
@@ -41,6 +41,7 @@ my $hangul = Unicode::Collate->new(
level => 3,
table => undef,
normalization => undef,
+
entry => <<'ENTRIES',
0061 ; [.0A15.0020.0002] # LATIN SMALL LETTER A
0041 ; [.0A15.0020.0008] # LATIN CAPITAL LETTER A
@@ -65,87 +66,127 @@ ENTRIES
ok(ref $hangul, "Unicode::Collate");
+my $trailwt = Unicode::Collate->new(
+ level => 3,
+ table => undef,
+ normalization => undef,
+ hangul_terminator => 16,
+
+ entry => <<'ENTRIES', # Term < Jongseong < Jungseong < Choseong
+0061 ; [.0A15.0020.0002] # LATIN SMALL LETTER A
+0041 ; [.0A15.0020.0008] # LATIN CAPITAL LETTER A
+11A8 ; [.1801.0020.0002] # HANGUL JONGSEONG KIYEOK
+11A9 ; [.1801.0020.0002][.1801.0020.0002] # HANGUL JONGSEONG SSANGKIYEOK
+1161 ; [.1831.0020.0002] # HANGUL JUNGSEONG A
+1163 ; [.1832.0020.0002] # HANGUL JUNGSEONG YA
+1100 ; [.1861.0020.0002] # HANGUL CHOSEONG KIYEOK
+1101 ; [.1861.0020.0002][.1861.0020.0002] # HANGUL CHOSEONG SSANGKIYEOK
+1102 ; [.1862.0020.0002] # HANGUL CHOSEONG NIEUN
+3042 ; [.1921.0020.000E] # HIRAGANA LETTER A
+ENTRIES
+);
+
#########################
# L(simp)L(simp) vs L(comp): /GGA/
ok($Collator->lt("\x{1100}\x{1100}\x{1161}", "\x{1101}\x{1161}"));
ok($hangul ->eq("\x{1100}\x{1100}\x{1161}", "\x{1101}\x{1161}"));
+ok($trailwt ->eq("\x{1100}\x{1100}\x{1161}", "\x{1101}\x{1161}"));
# L(simp) vs L(simp)L(simp): /GA/ vs /GGA/
ok($Collator->gt("\x{1100}\x{1161}", "\x{1100}\x{1100}\x{1161}"));
ok($hangul ->lt("\x{1100}\x{1161}", "\x{1100}\x{1100}\x{1161}"));
+ok($trailwt ->lt("\x{1100}\x{1161}", "\x{1100}\x{1100}\x{1161}"));
# T(simp)T(simp) vs T(comp): /AGG/
ok($Collator->lt("\x{1161}\x{11A8}\x{11A8}", "\x{1161}\x{11A9}"));
ok($hangul ->eq("\x{1161}\x{11A8}\x{11A8}", "\x{1161}\x{11A9}"));
+ok($trailwt ->eq("\x{1161}\x{11A8}\x{11A8}", "\x{1161}\x{11A9}"));
# T(simp) vs T(simp)T(simp): /AG/ vs /AGG/
ok($Collator->lt("\x{1161}\x{11A8}", "\x{1161}\x{11A8}\x{11A8}"));
ok($hangul ->lt("\x{1161}\x{11A8}", "\x{1161}\x{11A8}\x{11A8}"));
+ok($trailwt ->lt("\x{1161}\x{11A8}", "\x{1161}\x{11A8}\x{11A8}"));
# LV vs LLV: /GA/ vs /GNA/
ok($Collator->gt("\x{1100}\x{1161}", "\x{1100}\x{1102}\x{1161}"));
ok($hangul ->lt("\x{1100}\x{1161}", "\x{1100}\x{1102}\x{1161}"));
+ok($trailwt ->lt("\x{1100}\x{1161}", "\x{1100}\x{1102}\x{1161}"));
# LVX vs LVV: /GAA/ vs /GA/.latinA
ok($Collator->gt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}A"));
ok($hangul ->gt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}A"));
+ok($trailwt ->gt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}A"));
# LVX vs LVV: /GAA/ vs /GA/.hiraganaA
ok($Collator->lt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}\x{3042}"));
ok($hangul ->gt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}\x{3042}"));
+ok($trailwt ->gt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}\x{3042}"));
# LVX vs LVV: /GAA/ vs /GA/.hanja
ok($Collator->lt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}\x{4E00}"));
ok($hangul ->gt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}\x{4E00}"));
+ok($trailwt ->gt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}\x{4E00}"));
# LVL vs LVT: /GA/./G/ vs /GAG/
ok($Collator->lt("\x{1100}\x{1161}\x{1100}", "\x{1100}\x{1161}\x{11A8}"));
ok($hangul ->lt("\x{1100}\x{1161}\x{1100}", "\x{1100}\x{1161}\x{11A8}"));
+ok($trailwt ->lt("\x{1100}\x{1161}\x{1100}", "\x{1100}\x{1161}\x{11A8}"));
# LVT vs LVX: /GAG/ vs /GA/.latinA
ok($Collator->gt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}A"));
ok($hangul ->gt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}A"));
+ok($trailwt ->gt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}A"));
# LVT vs LVX: /GAG/ vs /GA/.hiraganaA
ok($Collator->lt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{3042}"));
ok($hangul ->gt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{3042}"));
+ok($trailwt ->gt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{3042}"));
# LVT vs LVX: /GAG/ vs /GA/.hanja
ok($Collator->lt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{4E00}"));
ok($hangul ->gt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{4E00}"));
+ok($trailwt ->gt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{4E00}"));
# LVT vs LVV: /GAG/ vs /GAA/
ok($Collator->gt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{1161}"));
ok($hangul ->lt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{1161}"));
+ok($trailwt ->lt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{1161}"));
# LVL vs LVV: /GA/./G/ vs /GAA/
ok($Collator->lt("\x{1100}\x{1161}\x{1100}", "\x{1100}\x{1161}\x{1161}"));
ok($hangul ->lt("\x{1100}\x{1161}\x{1100}", "\x{1100}\x{1161}\x{1161}"));
+ok($trailwt ->lt("\x{1100}\x{1161}\x{1100}", "\x{1100}\x{1161}\x{1161}"));
# LV vs Syl(LV): /GA/ vs /[GA]/
ok($Collator->eq("\x{1100}\x{1161}", "\x{AC00}"));
ok($hangul ->eq("\x{1100}\x{1161}", "\x{AC00}"));
+ok($trailwt ->eq("\x{1100}\x{1161}", "\x{AC00}"));
# LVT vs Syl(LV)T: /GAG/ vs /[GA]G/
ok($Collator->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC00}\x{11A8}"));
ok($hangul ->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC00}\x{11A8}"));
+ok($trailwt ->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC00}\x{11A8}"));
# LVT vs Syl(LVT): /GAG/ vs /[GAG]/
ok($Collator->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC01}"));
ok($hangul ->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC01}"));
+ok($trailwt ->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC01}"));
# LVTT vs Syl(LVTT): /GAGG/ vs /[GAGG]/
ok($Collator->eq("\x{1100}\x{1161}\x{11A9}", "\x{AC02}"));
ok($hangul ->eq("\x{1100}\x{1161}\x{11A9}", "\x{AC02}"));
+ok($trailwt ->eq("\x{1100}\x{1161}\x{11A9}", "\x{AC02}"));
# LVTT vs Syl(LVT).T: /GAGG/ vs /[GAG]G/
ok($Collator->gt("\x{1100}\x{1161}\x{11A9}", "\x{AC01}\x{11A8}"));
ok($hangul ->eq("\x{1100}\x{1161}\x{11A9}", "\x{AC01}\x{11A8}"));
+ok($trailwt ->eq("\x{1100}\x{1161}\x{11A9}", "\x{AC01}\x{11A8}"));
# LLVT vs L.Syl(LVT): /GGAG/ vs /G[GAG]/
ok($Collator->gt("\x{1101}\x{1161}\x{11A8}", "\x{1100}\x{AC01}"));
ok($hangul ->eq("\x{1101}\x{1161}\x{11A8}", "\x{1100}\x{AC01}"));
+ok($trailwt ->eq("\x{1101}\x{1161}\x{11A8}", "\x{1100}\x{AC01}"));
#########################
diff --git a/lib/Unicode/Collate/t/index.t b/lib/Unicode/Collate/t/index.t
index d6811c7db6..a1d67d5346 100644
--- a/lib/Unicode/Collate/t/index.t
+++ b/lib/Unicode/Collate/t/index.t
@@ -25,7 +25,7 @@ our $IsEBCDIC = ord("A") != 0x41;
#########################
-ok(1); # If we made it this far, we're ok.
+ok(1);
my $Collator = Unicode::Collate->new(
table => 'keys.txt',
diff --git a/lib/Unicode/Collate/t/normal.t b/lib/Unicode/Collate/t/normal.t
new file mode 100644
index 0000000000..026240d6fa
--- /dev/null
+++ b/lib/Unicode/Collate/t/normal.t
@@ -0,0 +1,205 @@
+BEGIN {
+ unless ("A" eq pack('U', 0x41)) {
+ print "1..0 # Unicode::Collate " .
+ "cannot stringify a Unicode code point\n";
+ exit 0;
+ }
+}
+
+BEGIN {
+ if ($ENV{PERL_CORE}) {
+ chdir('t') if -d 't';
+ @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib);
+ }
+}
+
+BEGIN {
+ eval { require Unicode::Normalize; };
+ if ($@) {
+ print "1..0 # skipped: Unicode::Normalize needed for this test\n";
+ print $@;
+ exit;
+ }
+}
+use Test;
+BEGIN { plan tests => 100 };
+
+use strict;
+use warnings;
+use Unicode::Collate;
+
+our $Aring = pack('U', 0xC5);
+our $aring = pack('U', 0xE5);
+
+our $entry = <<'ENTRIES';
+030A; [.0000.030A.0002] # COMBINING RING ABOVE
+212B; [.002B.0020.0008] # ANGSTROM SIGN
+0061; [.0A41.0020.0002] # LATIN SMALL LETTER A
+0041; [.0A41.0020.0008] # LATIN CAPITAL LETTER A
+007A; [.0A5A.0020.0002] # LATIN SMALL LETTER Z
+005A; [.0A5A.0020.0008] # LATIN CAPITAL LETTER Z
+FF41; [.0A87.0020.0002] # LATIN SMALL LETTER A
+FF21; [.0A87.0020.0008] # LATIN CAPITAL LETTER A
+00E5; [.0AC5.0020.0002] # LATIN SMALL LETTER A WITH RING ABOVE
+00C5; [.0AC5.0020.0008] # LATIN CAPITAL LETTER A WITH RING ABOVE
+ENTRIES
+
+# Aong < A+ring < Z < fullA+ring < A-ring
+
+#########################
+
+our $noN = Unicode::Collate->new(
+ level => 1,
+ table => undef,
+ normalization => undef,
+ entry => $entry,
+);
+
+our $nfc = Unicode::Collate->new(
+ level => 1,
+ table => undef,
+ normalization => 'NFC',
+ entry => $entry,
+);
+
+our $nfd = Unicode::Collate->new(
+ level => 1,
+ table => undef,
+ normalization => 'NFD',
+ entry => $entry,
+);
+
+our $nfkc = Unicode::Collate->new(
+ level => 1,
+ table => undef,
+ normalization => 'NFKC',
+ entry => $entry,
+);
+
+our $nfkd = Unicode::Collate->new(
+ level => 1,
+ table => undef,
+ normalization => 'NFKD',
+ entry => $entry,
+);
+
+ok($noN->lt("\x{212B}", "A"));
+ok($noN->lt("\x{212B}", $Aring));
+ok($noN->lt("A\x{30A}", $Aring));
+ok($noN->lt("A", "\x{FF21}"));
+ok($noN->lt("Z", "\x{FF21}"));
+ok($noN->lt("Z", $Aring));
+ok($noN->lt("\x{212B}", $aring));
+ok($noN->lt("A\x{30A}", $aring));
+ok($noN->lt("Z", $aring));
+ok($noN->lt("a\x{30A}", "Z"));
+
+ok($nfd->eq("\x{212B}", "A"));
+ok($nfd->eq("\x{212B}", $Aring));
+ok($nfd->eq("A\x{30A}", $Aring));
+ok($nfd->lt("A", "\x{FF21}"));
+ok($nfd->lt("Z", "\x{FF21}"));
+ok($nfd->gt("Z", $Aring));
+ok($nfd->eq("\x{212B}", $aring));
+ok($nfd->eq("A\x{30A}", $aring));
+ok($nfd->gt("Z", $aring));
+ok($nfd->lt("a\x{30A}", "Z"));
+
+ok($nfc->gt("\x{212B}", "A"));
+ok($nfc->eq("\x{212B}", $Aring));
+ok($nfc->eq("A\x{30A}", $Aring));
+ok($nfc->lt("A", "\x{FF21}"));
+ok($nfc->lt("Z", "\x{FF21}"));
+ok($nfc->lt("Z", $Aring));
+ok($nfc->eq("\x{212B}", $aring));
+ok($nfc->eq("A\x{30A}", $aring));
+ok($nfc->lt("Z", $aring));
+ok($nfc->gt("a\x{30A}", "Z"));
+
+ok($nfkd->eq("\x{212B}", "A"));
+ok($nfkd->eq("\x{212B}", $Aring));
+ok($nfkd->eq("A\x{30A}", $Aring));
+ok($nfkd->eq("A", "\x{FF21}"));
+ok($nfkd->gt("Z", "\x{FF21}"));
+ok($nfkd->gt("Z", $Aring));
+ok($nfkd->eq("\x{212B}", $aring));
+ok($nfkd->eq("A\x{30A}", $aring));
+ok($nfkd->gt("Z", $aring));
+ok($nfkd->lt("a\x{30A}", "Z"));
+
+ok($nfkc->gt("\x{212B}", "A"));
+ok($nfkc->eq("\x{212B}", $Aring));
+ok($nfkc->eq("A\x{30A}", $Aring));
+ok($nfkc->eq("A", "\x{FF21}"));
+ok($nfkc->gt("Z", "\x{FF21}"));
+ok($nfkc->lt("Z", $Aring));
+ok($nfkc->eq("\x{212B}", $aring));
+ok($nfkc->eq("A\x{30A}", $aring));
+ok($nfkc->lt("Z", $aring));
+ok($nfkc->gt("a\x{30A}", "Z"));
+
+$nfd->change(normalization => undef);
+
+ok($nfd->lt("\x{212B}", "A"));
+ok($nfd->lt("\x{212B}", $Aring));
+ok($nfd->lt("A\x{30A}", $Aring));
+ok($nfd->lt("A", "\x{FF21}"));
+ok($nfd->lt("Z", "\x{FF21}"));
+ok($nfd->lt("Z", $Aring));
+ok($nfd->lt("\x{212B}", $aring));
+ok($nfd->lt("A\x{30A}", $aring));
+ok($nfd->lt("Z", $aring));
+ok($nfd->lt("a\x{30A}", "Z"));
+
+$nfd->change(normalization => 'C');
+
+ok($nfd->gt("\x{212B}", "A"));
+ok($nfd->eq("\x{212B}", $Aring));
+ok($nfd->eq("A\x{30A}", $Aring));
+ok($nfd->lt("A", "\x{FF21}"));
+ok($nfd->lt("Z", "\x{FF21}"));
+ok($nfd->lt("Z", $Aring));
+ok($nfd->eq("\x{212B}", $aring));
+ok($nfd->eq("A\x{30A}", $aring));
+ok($nfd->lt("Z", $aring));
+ok($nfd->gt("a\x{30A}", "Z"));
+
+$nfd->change(normalization => 'D');
+
+ok($nfd->eq("\x{212B}", "A"));
+ok($nfd->eq("\x{212B}", $Aring));
+ok($nfd->eq("A\x{30A}", $Aring));
+ok($nfd->lt("A", "\x{FF21}"));
+ok($nfd->lt("Z", "\x{FF21}"));
+ok($nfd->gt("Z", $Aring));
+ok($nfd->eq("\x{212B}", $aring));
+ok($nfd->eq("A\x{30A}", $aring));
+ok($nfd->gt("Z", $aring));
+ok($nfd->lt("a\x{30A}", "Z"));
+
+$nfd->change(normalization => 'KD');
+
+ok($nfd->eq("\x{212B}", "A"));
+ok($nfd->eq("\x{212B}", $Aring));
+ok($nfd->eq("A\x{30A}", $Aring));
+ok($nfd->eq("A", "\x{FF21}"));
+ok($nfd->gt("Z", "\x{FF21}"));
+ok($nfd->gt("Z", $Aring));
+ok($nfd->eq("\x{212B}", $aring));
+ok($nfd->eq("A\x{30A}", $aring));
+ok($nfd->gt("Z", $aring));
+ok($nfd->lt("a\x{30A}", "Z"));
+
+$nfd->change(normalization => 'KC');
+
+ok($nfd->gt("\x{212B}", "A"));
+ok($nfd->eq("\x{212B}", $Aring));
+ok($nfd->eq("A\x{30A}", $Aring));
+ok($nfd->eq("A", "\x{FF21}"));
+ok($nfd->gt("Z", "\x{FF21}"));
+ok($nfd->lt("Z", $Aring));
+ok($nfd->eq("\x{212B}", $aring));
+ok($nfd->eq("A\x{30A}", $aring));
+ok($nfd->lt("Z", $aring));
+ok($nfd->gt("a\x{30A}", "Z"));
+
diff --git a/lib/Unicode/Collate/t/test.t b/lib/Unicode/Collate/t/test.t
index 0c170e422a..8a7eb8b59f 100644
--- a/lib/Unicode/Collate/t/test.t
+++ b/lib/Unicode/Collate/t/test.t
@@ -15,7 +15,7 @@ BEGIN {
}
use Test;
-BEGIN { plan tests => 200 };
+BEGIN { plan tests => 203 };
use strict;
use warnings;
@@ -23,14 +23,9 @@ use Unicode::Collate;
our $IsEBCDIC = ord("A") != 0x41;
-#########################
+ok(1);
-ok(1); # If we made it this far, we're ok.
-
-my $UCA_Version = "9";
-
-ok(Unicode::Collate::UCA_Version, $UCA_Version);
-ok(Unicode::Collate->UCA_Version, $UCA_Version);
+##### 2..6
my $Collator = Unicode::Collate->new(
table => 'keys.txt',
@@ -39,8 +34,6 @@ my $Collator = Unicode::Collate->new(
ok(ref $Collator, "Unicode::Collate");
-ok($Collator->UCA_Version, $UCA_Version);
-ok($Collator->UCA_Version(), $UCA_Version);
ok(
join(':', $Collator->sort(
@@ -55,7 +48,7 @@ ok($Collator->cmp("", ""), 0);
ok($Collator->eq("", ""));
ok($Collator->cmp("", "perl"), -1);
-##############
+##### 7..17
sub _pack_U { Unicode::Collate::pack_U(@_) }
sub _unpack_U { Unicode::Collate::unpack_U(@_) }
@@ -80,7 +73,7 @@ ok($Collator->lt("A", $A_acute));
ok($Collator->lt("A", $a_acute));
ok($Collator->lt($a_acute, $A_acute));
-##############
+##### 17..20
eval { require Unicode::Normalize };
@@ -109,7 +102,7 @@ else {
ok(1);
}
-##############
+##### 21..30
my $trad = Unicode::Collate->new(
table => 'keys.txt',
@@ -148,7 +141,7 @@ ok($trad->eq("", $katakana));
ok($trad->eq($hiragana, $katakana));
ok($trad->eq($katakana, $hiragana));
-##############
+##### 31..37
$Collator->change(level => 2);
@@ -161,6 +154,8 @@ ok( $Collator->cmp($hiragana, $katakana), 0);
ok( $Collator->eq($hiragana, $katakana) );
ok( $Collator->ge($hiragana, $katakana) );
+##### 38..43
+
# hangul
ok( $Collator->eq("a\x{AC00}b", "a\x{1100}\x{1161}b") );
ok( $Collator->eq("a\x{AE00}b", "a\x{1100}\x{1173}\x{11AF}b") );
@@ -169,6 +164,8 @@ ok( $Collator->lt("a\x{AC00}b", "a\x{AE00}b") );
ok( $Collator->gt("a\x{D7A3}b", "a\x{C544}b") );
ok( $Collator->lt("a\x{C544}b", "a\x{30A2}b") ); # hangul < hiragana
+##### 44..52
+
$Collator->change(%old_level, katakana_before_hiragana => 1);
ok($Collator->{level}, 4);
@@ -182,6 +179,8 @@ ok( $Collator->ne($hiragana, $katakana) );
ok( $Collator->gt($hiragana, $katakana) );
ok( $Collator->ge($hiragana, $katakana) );
+##### 53..58
+
$Collator->change(upper_before_lower => 1);
ok( $Collator->cmp("abc", "ABC"), 1);
@@ -191,6 +190,8 @@ ok( $Collator->cmp($hiragana, $katakana), 1);
ok( $Collator->ge($hiragana, $katakana), 1);
ok( $Collator->gt($hiragana, $katakana), 1);
+##### 59..64
+
$Collator->change(katakana_before_hiragana => 0);
ok( $Collator->cmp("abc", "ABC"), 1);
@@ -203,7 +204,7 @@ ok( $Collator->le("abc", "ABC") );
ok( $Collator->cmp($hiragana, $katakana), -1);
ok( $Collator->lt($hiragana, $katakana) );
-##############
+##### 65..66
my $ignoreAE = Unicode::Collate->new(
table => 'keys.txt',
@@ -214,7 +215,7 @@ my $ignoreAE = Unicode::Collate->new(
ok($ignoreAE->eq("element","lament"));
ok($ignoreAE->eq("Perl","ePrl"));
-##############
+##### 67
my $onlyABC = Unicode::Collate->new(
table => undef,
@@ -234,7 +235,7 @@ ok(
join(':', qw/ A aB Ab ABA BAC cAc cc / ),
);
-##############
+##### 68..71
my $undefAE = Unicode::Collate->new(
table => 'keys.txt',
@@ -247,7 +248,7 @@ ok($Collator->lt("edge","fog"));
ok($undefAE ->gt("lake","like"));
ok($Collator->lt("lake","like"));
-##############
+##### 72..81
# Table is undefined, then no entry is defined.
@@ -281,7 +282,7 @@ ok($undef_table->lt("\x{4E00}","\x{4E8C}"));
# U+4E8C: Ideograph "TWO"
-##############
+##### 82..86
my $few_entries = Unicode::Collate->new(
entry => <<'ENTRIES',
@@ -312,7 +313,7 @@ ok($few_entries->lt("\x{AE30}", "\x{AC00}"));
ok($few_entries->eq("\x{AC00}", "\x{1100}\x{1161}"));
-##############
+##### 87..91
my $all_undef_8 = Unicode::Collate->new(
table => undef,
@@ -331,7 +332,7 @@ ok($all_undef_8->lt("\x{4E00}", "\x{AC00}"));
ok($all_undef_8->gt("\x{AC00}", "\x{1100}\x{1161}"));
ok($all_undef_8->gt("\x{AC00}", "\x{ABFF}"));
-##############
+##### 92..96
my $all_undef_9 = Unicode::Collate->new(
table => undef,
@@ -350,7 +351,7 @@ ok($all_undef_9->lt("\x{20000}", "\x{AC00}"));
ok($all_undef_9->gt("\x{AC00}", "\x{1100}\x{1161}"));
ok($all_undef_9->gt("\x{AC00}", "\x{ABFF}")); # U+ABFF: not assigned
-##############
+##### 97..101
my $ignoreCJK = Unicode::Collate->new(
table => undef,
@@ -369,7 +370,7 @@ ok($ignoreCJK->eq("Pe\x{4E00}rl", "Perl")); # U+4E00 is a CJK.
ok($ignoreCJK->gt("\x{4DFF}", "\x{4E00}")); # U+4DFF is not CJK.
ok($ignoreCJK->lt("Pe\x{5B57}rl", "Perl")); # 'r' is unassigned.
-##############
+##### 102..106
my $ignoreHangul = Unicode::Collate->new(
table => undef,
@@ -388,7 +389,7 @@ ok($ignoreHangul->lt("\x{AC00}", "\x{AE00}"));
ok($ignoreHangul->lt("\x{AC00}", "\x{1100}\x{1161}")); # Jamo are not ignored.
ok($ignoreHangul->lt("Pe\x{AE00}rl", "Perl")); # 'r' is unassigned.
-##############
+##### 107..127
my %origAlter = $Collator->change(alternate => 'Blanked');
@@ -426,7 +427,7 @@ $Collator->change(%origAlter);
ok($Collator->{alternate}, 'shifted');
-##############
+##### 128..132
my $overCJK = Unicode::Collate->new(
table => undef,
@@ -448,7 +449,7 @@ ok($overCJK->lt("A\x{4E03}", "A\x{4E00}"));
ok($overCJK->lt("A\x{4E03}", "a\x{4E00}"));
ok($overCJK->lt("a\x{4E03}", "A\x{4E00}"));
-##############
+##### 133..144
# rearrange : 0x0E40..0x0E44, 0x0EC0..0x0EC4 (default)
@@ -475,7 +476,7 @@ ok($all_undef_8->lt("\x{0E40}A", "\x{0E41}B"));
ok($all_undef_8->lt("\x{0E41}A", "\x{0E40}B"));
ok($all_undef_8->lt("A\x{0E41}A", "A\x{0E40}B"));
-##############
+##### 145..149
my $no_rearrange = Unicode::Collate->new(
table => undef,
@@ -489,7 +490,7 @@ ok($no_rearrange->lt("\x{0E40}A", "\x{0E41}B"));
ok($no_rearrange->gt("\x{0E41}A", "\x{0E40}B"));
ok($no_rearrange->gt("A\x{0E41}A", "A\x{0E40}B"));
-##############
+##### 150..154
my $undef_rearrange = Unicode::Collate->new(
table => undef,
@@ -503,7 +504,7 @@ ok($undef_rearrange->lt("\x{0E40}A", "\x{0E41}B"));
ok($undef_rearrange->gt("\x{0E41}A", "\x{0E40}B"));
ok($undef_rearrange->gt("A\x{0E41}A", "A\x{0E40}B"));
-##############
+##### 155..159
my $dropArticles = Unicode::Collate->new(
table => "keys.txt",
@@ -521,7 +522,7 @@ ok($dropArticles->lt("the pen", "a pencil"));
ok($Collator->lt("Perl", "The Perl"));
ok($Collator->gt("the pen", "a pencil"));
-##############
+##### 160..161
my $backLevel1 = Unicode::Collate->new(
table => undef,
@@ -534,7 +535,7 @@ my $backLevel1 = Unicode::Collate->new(
ok($backLevel1->gt("AB", "BA"));
ok($backLevel1->gt("\x{3042}\x{3044}", "\x{3044}\x{3042}"));
-##############
+##### 162..169
my $backLevel2 = Unicode::Collate->new(
table => "keys.txt",
@@ -556,7 +557,7 @@ ok($backLevel2->lt("\x{4E03}", $katakana));
ok($Collator ->gt("\x{4E00}", $hiragana));
ok($Collator ->gt("\x{4E03}", $katakana));
-##############
+##### 170..184
# ignorable after variable
@@ -590,7 +591,7 @@ ok($Collator->lt("\cA", "?"));
$Collator->change(alternate => 'Shifted', level => 4);
-##############
+##### 185..196
# According to Conformance Test,
# a L3-ignorable is treated as a completely ignorable.
@@ -629,3 +630,39 @@ ok($L3ignorable->eq("\x{1D1BB}", "\x{1D1B9}"));
ok($L3ignorable->eq("\x{1D1BC}", "\x{1D1BA}"));
ok($L3ignorable->eq("\x{1D1BB}", "\x{1D1B9}\x{1D165}"));
ok($L3ignorable->eq("\x{1D1BC}", "\x{1D1BA}\x{1D165}"));
+
+##### 197..203
+
+my $O_str = Unicode::Collate->new(
+ table => "keys.txt",
+ normalization => undef,
+ entry => <<'ENTRIES',
+0008 ; [*0008.0000.0000.0000] # BACKSPACE (need to be non-ignorable)
+004F 0337 ; [.0B53.0020.0008.004F] # capital O WITH SHORT SOLIDUS OVERLAY
+006F 0008 002F ; [.0B53.0020.0002.006F] # LATIN SMALL LETTER O WITH STROKE
+004F 0008 002F ; [.0B53.0020.0008.004F] # LATIN CAPITAL LETTER O WITH STROKE
+006F 0337 ; [.0B53.0020.0002.004F] # small O WITH SHORT SOLIDUS OVERLAY
+200B ; [.2000.0000.0000.0000] # ZERO WIDTH SPACE (may be non-sense but ...)
+#00F8 ; [.0B53.0020.0002.00F8] # LATIN SMALL LETTER O WITH STROKE
+#00D8 ; [.0B53.0020.0008.00D8] # LATIN CAPITAL LETTER O WITH STROKE
+ENTRIES
+);
+
+my $o_BS_slash = _pack_U(0x006F, 0x0008, 0x002F);
+my $O_BS_slash = _pack_U(0x004F, 0x0008, 0x002F);
+my $o_sol = _pack_U(0x006F, 0x0337);
+my $O_sol = _pack_U(0x004F, 0x0337);
+my $o_stroke = _pack_U(0x00F8);
+my $O_stroke = _pack_U(0x00D8);
+
+ok($O_str->eq($o_stroke, $o_BS_slash));
+ok($O_str->eq($O_stroke, $O_BS_slash));
+
+ok($O_str->eq($o_stroke, $o_sol));
+ok($O_str->eq($O_stroke, $O_sol));
+
+ok($Collator->eq("\x{200B}", "\0"));
+ok($O_str ->gt("\x{200B}", "\0"));
+ok($O_str ->gt("\x{200B}", "A"));
+
+#####
diff --git a/lib/Unicode/Collate/t/trailwt.t b/lib/Unicode/Collate/t/trailwt.t
new file mode 100644
index 0000000000..463252cf1c
--- /dev/null
+++ b/lib/Unicode/Collate/t/trailwt.t
@@ -0,0 +1,229 @@
+BEGIN {
+ unless ("A" eq pack('U', 0x41)) {
+ print "1..0 # Unicode::Collate " .
+ "cannot stringify a Unicode code point\n";
+ exit 0;
+ }
+}
+
+BEGIN {
+ if ($ENV{PERL_CORE}) {
+ chdir('t') if -d 't';
+ @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib);
+ }
+}
+
+use Test;
+BEGIN { plan tests => 58 };
+
+use strict;
+use warnings;
+use Unicode::Collate;
+
+#########################
+
+ok(1);
+
+# a standard collator (3.1.1)
+my $Collator = Unicode::Collate->new(
+ level => 1,
+ table => 'keys.txt',
+ normalization => undef,
+
+ entry => <<'ENTRIES',
+326E ; [.1831.0020.0006.326E][.188D.0020.0006.326E] # c.h.s. GA
+326F ; [.1833.0020.0006.326F][.188D.0020.0006.326F] # c.h.s. NA
+3270 ; [.1834.0020.0006.3270][.188D.0020.0006.3270] # c.h.s. DA
+3271 ; [.1836.0020.0006.3271][.188D.0020.0006.3271] # c.h.s. RA
+3272 ; [.1837.0020.0006.3272][.188D.0020.0006.3272] # c.h.s. MA
+3273 ; [.1838.0020.0006.3273][.188D.0020.0006.3273] # c.h.s. BA
+3274 ; [.183A.0020.0006.3274][.188D.0020.0006.3274] # c.h.s. SA
+3275 ; [.183C.0020.0006.3275][.188D.0020.0006.3275] # c.h.s. A
+3276 ; [.183D.0020.0006.3276][.188D.0020.0006.3276] # c.h.s. JA
+3277 ; [.183F.0020.0006.3277][.188D.0020.0006.3277] # c.h.s. CA
+3278 ; [.1840.0020.0006.3278][.188D.0020.0006.3278] # c.h.s. KA
+3279 ; [.1841.0020.0006.3279][.188D.0020.0006.3279] # c.h.s. TA
+327A ; [.1842.0020.0006.327A][.188D.0020.0006.327A] # c.h.s. PA
+327B ; [.1843.0020.0006.327B][.188D.0020.0006.327B] # c.h.s. HA
+ENTRIES
+);
+
+my $hangul = Unicode::Collate->new(
+ level => 1,
+ table => 'keys.txt',
+ normalization => undef,
+ hangul_terminator => 16,
+
+ entry => <<'ENTRIES',
+326E ; [.1831.0020.0006.326E][.188D.0020.0006.326E] # c.h.s. GA
+326F ; [.1833.0020.0006.326F][.188D.0020.0006.326F] # c.h.s. NA
+3270 ; [.1834.0020.0006.3270][.188D.0020.0006.3270] # c.h.s. DA
+3271 ; [.1836.0020.0006.3271][.188D.0020.0006.3271] # c.h.s. RA
+3272 ; [.1837.0020.0006.3272][.188D.0020.0006.3272] # c.h.s. MA
+3273 ; [.1838.0020.0006.3273][.188D.0020.0006.3273] # c.h.s. BA
+3274 ; [.183A.0020.0006.3274][.188D.0020.0006.3274] # c.h.s. SA
+3275 ; [.183C.0020.0006.3275][.188D.0020.0006.3275] # c.h.s. A
+3276 ; [.183D.0020.0006.3276][.188D.0020.0006.3276] # c.h.s. JA
+3277 ; [.183F.0020.0006.3277][.188D.0020.0006.3277] # c.h.s. CA
+3278 ; [.1840.0020.0006.3278][.188D.0020.0006.3278] # c.h.s. KA
+3279 ; [.1841.0020.0006.3279][.188D.0020.0006.3279] # c.h.s. TA
+327A ; [.1842.0020.0006.327A][.188D.0020.0006.327A] # c.h.s. PA
+327B ; [.1843.0020.0006.327B][.188D.0020.0006.327B] # c.h.s. HA
+ENTRIES
+);
+
+ok(ref $hangul, "Unicode::Collate");
+
+#########################
+
+# LVX vs LVV: /GAA/ vs /GA/.latinA
+ok($Collator->gt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}A"));
+ok($hangul ->gt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}A"));
+
+# LVX vs LVV: /GAA/ vs /GA/.hiraganaA
+ok($Collator->lt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}\x{3042}"));
+ok($hangul ->gt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}\x{3042}"));
+
+# LVX vs LVV: /GAA/ vs /GA/.hanja
+ok($Collator->lt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}\x{4E00}"));
+ok($hangul ->gt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}\x{4E00}"));
+
+# LVL vs LVT: /GA/./G/ vs /GAG/
+ok($Collator->lt("\x{1100}\x{1161}\x{1100}", "\x{1100}\x{1161}\x{11A8}"));
+ok($hangul ->lt("\x{1100}\x{1161}\x{1100}", "\x{1100}\x{1161}\x{11A8}"));
+
+# LVT vs LVX: /GAG/ vs /GA/.latinA
+ok($Collator->gt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}A"));
+ok($hangul ->gt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}A"));
+
+# LVT vs LVX: /GAG/ vs /GA/.hiraganaA
+ok($Collator->lt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{3042}"));
+ok($hangul ->gt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{3042}"));
+
+# LVT vs LVX: /GAG/ vs /GA/.hanja
+ok($Collator->lt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{4E00}"));
+ok($hangul ->gt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{4E00}"));
+
+# LV vs Syl(LV): /GA/ vs /[GA]/
+ok($Collator->eq("\x{1100}\x{1161}", "\x{AC00}"));
+ok($hangul ->eq("\x{1100}\x{1161}", "\x{AC00}"));
+
+# LVT vs Syl(LV)T: /GAG/ vs /[GA]G/
+ok($Collator->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC00}\x{11A8}"));
+ok($hangul ->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC00}\x{11A8}"));
+
+# LVT vs Syl(LVT): /GAG/ vs /[GAG]/
+ok($Collator->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC01}"));
+ok($hangul ->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC01}"));
+
+# LVTT vs Syl(LVTT): /GAGG/ vs /[GAGG]/
+ok($Collator->eq("\x{1100}\x{1161}\x{11A9}", "\x{AC02}"));
+ok($hangul ->eq("\x{1100}\x{1161}\x{11A9}", "\x{AC02}"));
+
+# Syl(LVT) vs : /GAG/ vs /[GAG]/
+ok($Collator->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC01}"));
+ok($hangul ->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC01}"));
+
+#########################
+
+my $hangcirc = Unicode::Collate->new(
+ level => 1,
+ table => 'keys.txt',
+ normalization => undef,
+ hangul_terminator => 16,
+
+ entry => <<'ENTRIES',
+326E ; [.1831.0020.0006.326E][.188D.0020.0006.326E][.10.0.0.0] # c.h.s. GA
+326F ; [.1833.0020.0006.326F][.188D.0020.0006.326F][.10.0.0.0] # c.h.s. NA
+3270 ; [.1834.0020.0006.3270][.188D.0020.0006.3270][.10.0.0.0] # c.h.s. DA
+3271 ; [.1836.0020.0006.3271][.188D.0020.0006.3271][.10.0.0.0] # c.h.s. RA
+3272 ; [.1837.0020.0006.3272][.188D.0020.0006.3272][.10.0.0.0] # c.h.s. MA
+3273 ; [.1838.0020.0006.3273][.188D.0020.0006.3273][.10.0.0.0] # c.h.s. BA
+3274 ; [.183A.0020.0006.3274][.188D.0020.0006.3274][.10.0.0.0] # c.h.s. SA
+3275 ; [.183C.0020.0006.3275][.188D.0020.0006.3275][.10.0.0.0] # c.h.s. A
+3276 ; [.183D.0020.0006.3276][.188D.0020.0006.3276][.10.0.0.0] # c.h.s. JA
+3277 ; [.183F.0020.0006.3277][.188D.0020.0006.3277][.10.0.0.0] # c.h.s. CA
+3278 ; [.1840.0020.0006.3278][.188D.0020.0006.3278][.10.0.0.0] # c.h.s. KA
+3279 ; [.1841.0020.0006.3279][.188D.0020.0006.3279][.10.0.0.0] # c.h.s. TA
+327A ; [.1842.0020.0006.327A][.188D.0020.0006.327A][.10.0.0.0] # c.h.s. PA
+327B ; [.1843.0020.0006.327B][.188D.0020.0006.327B][.10.0.0.0] # c.h.s. HA
+ENTRIES
+);
+
+# LV vs Circled Syl(LV): /GA/ vs /(GA)/
+ok($Collator->eq("\x{1100}\x{1161}", "\x{326E}"));
+ok($hangul ->gt("\x{1100}\x{1161}", "\x{326E}"));
+ok($hangcirc->eq("\x{1100}\x{1161}", "\x{326E}"));
+
+# LV vs Circled Syl(LV): followed by latin A
+ok($Collator->eq("\x{1100}\x{1161}A", "\x{326E}A"));
+ok($hangul ->lt("\x{1100}\x{1161}A", "\x{326E}A"));
+ok($hangcirc->eq("\x{1100}\x{1161}A", "\x{326E}A"));
+
+# LV vs Circled Syl(LV): followed by hiragana A
+ok($Collator->eq("\x{1100}\x{1161}\x{3042}", "\x{326E}\x{3042}"));
+ok($hangul ->lt("\x{1100}\x{1161}\x{3042}", "\x{326E}\x{3042}"));
+ok($hangcirc->eq("\x{1100}\x{1161}\x{3042}", "\x{326E}\x{3042}"));
+
+# LVT vs LVX: /GAG/ vs /GA/.hanja
+ok($Collator->eq("\x{1100}\x{1161}\x{4E00}", "\x{326E}\x{4E00}"));
+ok($hangul ->lt("\x{1100}\x{1161}\x{4E00}", "\x{326E}\x{4E00}"));
+ok($hangcirc->eq("\x{1100}\x{1161}\x{4E00}", "\x{326E}\x{4E00}"));
+
+#########################
+
+# checks contraction in LVT:
+# weights of these contractions may be non-sense.
+
+my $hangcont = Unicode::Collate->new(
+ level => 1,
+ table => 'keys.txt',
+ normalization => undef,
+ hangul_terminator => 16,
+
+ entry => <<'ENTRIES',
+1100 1161 ; [.1831.0020.0002.1100][.188D.0020.0002.1161] # KIYEOK+A
+1161 11A8 ; [.188D.0020.0002.1161][.18CF.0020.0002.11A8] # A+KIYEOK
+ENTRIES
+);
+
+# cont<LV> vs Syl(LV): /<GA>/ vs /[GA]/
+ok($Collator->eq("\x{1100}\x{1161}", "\x{AC00}"));
+ok($hangcont->eq("\x{1100}\x{1161}", "\x{AC00}"));
+
+# cont<LV>.T vs Syl(LV).T: /<GA>G/ vs /[GA]G/
+ok($Collator->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC00}\x{11A8}"));
+ok($hangcont->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC00}\x{11A8}"));
+
+# cont<LV>.T vs Syl(LVT): /<GA>G/ vs /[GAG]/
+ok($Collator->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC01}"));
+ok($hangcont->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC01}"));
+
+# L.cont<VT> vs Syl(LV).T: /D<AG>/ vs /[DA]G/
+ok($Collator->eq("\x{1103}\x{1161}\x{11A8}", "\x{B2E4}\x{11A8}"));
+ok($hangcont->eq("\x{1103}\x{1161}\x{11A8}", "\x{B2E4}\x{11A8}"));
+
+# L.cont<VT> vs Syl(LVT): /D<AG>/ vs /[DAG]/
+ok($Collator->eq("\x{1103}\x{1161}\x{11A8}", "\x{B2E5}"));
+ok($hangcont->eq("\x{1103}\x{1161}\x{11A8}", "\x{B2E5}"));
+
+#####
+
+$Collator->change(hangul_terminator => 16);
+
+ok($Collator->gt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{4E00}"));
+ok($Collator->gt("\x{1100}\x{1161}", "\x{326E}"));
+ok($Collator->lt("\x{1100}\x{1161}A", "\x{326E}A"));
+ok($Collator->lt("\x{1100}\x{1161}\x{3042}", "\x{326E}\x{3042}"));
+ok($Collator->lt("\x{1100}\x{1161}\x{4E00}", "\x{326E}\x{4E00}"));
+
+$Collator->change(hangul_terminator => 0);
+
+ok($Collator->lt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{4E00}"));
+ok($Collator->eq("\x{1100}\x{1161}", "\x{326E}"));
+ok($Collator->eq("\x{1100}\x{1161}A", "\x{326E}A"));
+ok($Collator->eq("\x{1100}\x{1161}\x{3042}", "\x{326E}\x{3042}"));
+ok($Collator->eq("\x{1100}\x{1161}\x{4E00}", "\x{326E}\x{4E00}"));
+
+1;
+__END__
diff --git a/lib/Unicode/Collate/t/variable.t b/lib/Unicode/Collate/t/variable.t
new file mode 100644
index 0000000000..880327a6bd
--- /dev/null
+++ b/lib/Unicode/Collate/t/variable.t
@@ -0,0 +1,108 @@
+
+BEGIN {
+ unless ("A" eq pack('U', 0x41)) {
+ print "1..0 # Unicode::Collate " .
+ "cannot stringify a Unicode code point\n";
+ exit 0;
+ }
+}
+
+BEGIN {
+ if ($ENV{PERL_CORE}) {
+ chdir('t') if -d 't';
+ @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib);
+ }
+}
+
+use Test;
+BEGIN { plan tests => 37 };
+
+use strict;
+use warnings;
+use Unicode::Collate;
+
+ok(1);
+
+#########################
+
+sub _pack_U { Unicode::Collate::pack_U(@_) }
+sub _unpack_U { Unicode::Collate::unpack_U(@_) }
+
+my $A_acute = _pack_U(0xC1);
+my $acute = _pack_U(0x0301);
+
+my $Collator = Unicode::Collate->new(
+ table => 'keys.txt',
+ normalization => undef,
+);
+
+my %origVar = $Collator->change(variable => 'Blanked');
+
+ok($Collator->lt("death", "de luge"));
+ok($Collator->lt("de luge", "de-luge"));
+ok($Collator->lt("de-luge", "deluge"));
+ok($Collator->lt("deluge", "de\x{2010}luge"));
+ok($Collator->lt("deluge", "de Luge"));
+
+$Collator->change(variable => 'Non-ignorable');
+
+ok($Collator->lt("de luge", "de Luge"));
+ok($Collator->lt("de Luge", "de-luge"));
+ok($Collator->lt("de-Luge", "de\x{2010}luge"));
+ok($Collator->lt("de-luge", "death"));
+ok($Collator->lt("death", "deluge"));
+
+$Collator->change(variable => 'Shifted');
+
+ok($Collator->lt("death", "de luge"));
+ok($Collator->lt("de luge", "de-luge"));
+ok($Collator->lt("de-luge", "deluge"));
+ok($Collator->lt("deluge", "de Luge"));
+ok($Collator->lt("de Luge", "deLuge"));
+
+$Collator->change(variable => 'Shift-Trimmed');
+
+ok($Collator->lt("death", "deluge"));
+ok($Collator->lt("deluge", "de luge"));
+ok($Collator->lt("de luge", "de-luge"));
+ok($Collator->lt("de-luge", "deLuge"));
+ok($Collator->lt("deLuge", "de Luge"));
+
+$Collator->change(%origVar);
+
+ok($Collator->{variable}, 'shifted');
+
+##############
+
+# ignorable after variable
+
+# Shifted;
+ok($Collator->eq("?\x{300}!\x{301}\x{315}", "?!"));
+ok($Collator->eq("?\x{300}A\x{301}", "?$A_acute"));
+ok($Collator->eq("?\x{300}", "?"));
+ok($Collator->eq("?\x{344}", "?")); # U+0344 has two CEs.
+
+$Collator->change(level => 3);
+ok($Collator->eq("\cA", "?"));
+
+$Collator->change(variable => 'blanked', level => 4);
+ok($Collator->eq("?\x{300}!\x{301}\x{315}", "?!"));
+ok($Collator->eq("?\x{300}A\x{301}", "?$A_acute"));
+ok($Collator->eq("?\x{300}", "?"));
+ok($Collator->eq("?\x{344}", "?")); # U+0344 has two CEs.
+
+$Collator->change(level => 3);
+ok($Collator->eq("\cA", "?"));
+
+$Collator->change(variable => 'Non-ignorable', level => 4);
+
+ok($Collator->lt("?\x{300}", "?!"));
+ok($Collator->gt("?\x{300}A$acute", "?$A_acute"));
+ok($Collator->gt("?\x{300}", "?"));
+ok($Collator->gt("?\x{344}", "?"));
+
+$Collator->change(level => 3);
+ok($Collator->lt("\cA", "?"));
+
+$Collator->change(variable => 'Shifted', level => 4);
+
diff --git a/lib/Unicode/Collate/t/version.t b/lib/Unicode/Collate/t/version.t
new file mode 100644
index 0000000000..0a6d448e1e
--- /dev/null
+++ b/lib/Unicode/Collate/t/version.t
@@ -0,0 +1,61 @@
+
+BEGIN {
+ unless ("A" eq pack('U', 0x41)) {
+ print "1..0 # Unicode::Collate " .
+ "cannot stringify a Unicode code point\n";
+ exit 0;
+ }
+}
+
+BEGIN {
+ if ($ENV{PERL_CORE}) {
+ chdir('t') if -d 't';
+ @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib);
+ }
+}
+
+use Test;
+BEGIN { plan tests => 17 };
+
+use strict;
+use warnings;
+use Unicode::Collate;
+
+ok(1);
+
+#########################
+
+# Fix me when UCA and/or key.txt is upgraded.
+my $UCA_Version = "11";
+my $Base_Unicode_Version = "4.0";
+my $Key_Version = "3.1.1";
+
+ok(Unicode::Collate::UCA_Version, $UCA_Version);
+ok(Unicode::Collate->UCA_Version, $UCA_Version);
+ok(Unicode::Collate::Base_Unicode_Version, $Base_Unicode_Version);
+ok(Unicode::Collate->Base_Unicode_Version, $Base_Unicode_Version);
+
+my $Collator = Unicode::Collate->new(
+ table => 'keys.txt',
+ normalization => undef,
+);
+
+ok($Collator->UCA_Version, $UCA_Version);
+ok($Collator->UCA_Version(), $UCA_Version);
+ok($Collator->Base_Unicode_Version, $Base_Unicode_Version);
+ok($Collator->Base_Unicode_Version(), $Base_Unicode_Version);
+ok($Collator->version, $Key_Version);
+ok($Collator->version(), $Key_Version);
+
+my $UndefTable = Unicode::Collate->new(
+ table => undef,
+ normalization => undef,
+);
+
+ok($UndefTable->UCA_Version, $UCA_Version);
+ok($UndefTable->UCA_Version(), $UCA_Version);
+ok($UndefTable->Base_Unicode_Version, $Base_Unicode_Version);
+ok($UndefTable->Base_Unicode_Version(), $Base_Unicode_Version);
+ok($UndefTable->version, "unknown");
+ok($UndefTable->version(), "unknown");
+