summaryrefslogtreecommitdiff
path: root/cpan/Unicode-Collate
diff options
context:
space:
mode:
authorSteve Hay <steve.m.hay@googlemail.com>2013-09-01 14:59:01 +0100
committerSteve Hay <steve.m.hay@googlemail.com>2013-09-01 14:59:01 +0100
commit60f577e0304ce0cd93ca30edfeb534713ea7ffd9 (patch)
treec79c7479400d9cb24c19c152c3f94f5a24651e3b /cpan/Unicode-Collate
parentad434879973009b368013b6390fb5691800a87bb (diff)
downloadperl-60f577e0304ce0cd93ca30edfeb534713ea7ffd9.tar.gz
Upgrade Unicode::Collate from version 0.98 to 0.99
Diffstat (limited to 'cpan/Unicode-Collate')
-rw-r--r--cpan/Unicode-Collate/Changes6
-rw-r--r--cpan/Unicode-Collate/Collate.pm108
-rw-r--r--cpan/Unicode-Collate/Collate.xs18
-rw-r--r--cpan/Unicode-Collate/README2
-rw-r--r--cpan/Unicode-Collate/mkheader8
-rw-r--r--cpan/Unicode-Collate/t/illegal.t50
-rw-r--r--cpan/Unicode-Collate/t/override.t309
7 files changed, 325 insertions, 176 deletions
diff --git a/cpan/Unicode-Collate/Changes b/cpan/Unicode-Collate/Changes
index 0fd0908bf0..b8fa1a9e9a 100644
--- a/cpan/Unicode-Collate/Changes
+++ b/cpan/Unicode-Collate/Changes
@@ -1,5 +1,11 @@
Revision history for Perl module Unicode::Collate.
+0.99 Sun Sep 1 12:46:14 2013
+ - by default out-of-range values are treated as if it were U+FFFD
+ when UCA_Version >= 22.
+ - supported overriding out-of-range values (see 'overrideOut' in POD).
+ - modified tests: override.t, illegal.t in t.
+
0.98 Sat Jun 15 19:44:06 2013
- typo (see [rt.cpan.org #85655] typo fixes)
diff --git a/cpan/Unicode-Collate/Collate.pm b/cpan/Unicode-Collate/Collate.pm
index 388da67e30..48840ecd74 100644
--- a/cpan/Unicode-Collate/Collate.pm
+++ b/cpan/Unicode-Collate/Collate.pm
@@ -14,7 +14,7 @@ use File::Spec;
no warnings 'utf8';
-our $VERSION = '0.98';
+our $VERSION = '0.99';
our $PACKAGE = __PACKAGE__;
### begin XS only ###
@@ -106,7 +106,7 @@ my (%VariableOK);
our @ChangeOK = qw/
alternate backwards level normalization rearrange
katakana_before_hiragana upper_before_lower ignore_level2
- overrideHangul overrideCJK preprocess UCA_Version
+ overrideCJK overrideHangul overrideOut preprocess UCA_Version
hangul_terminator variable identical highestFFFF minimalFFFE
/;
@@ -497,7 +497,7 @@ sub splitEnt
# remove a code point marked as a completely ignorable.
for (my $i = 0; $i < @src; $i++) {
- if (_isIllegal($src[$i]) || $vers <= 20 && _isNonchar($src[$i])) {
+ if ($vers <= 20 && _isIllegal($src[$i])) {
$src[$i] = undef;
} elsif ($ver9) {
$src[$i] = undef if $map->{ $src[$i] }
@@ -621,25 +621,27 @@ sub getWt
my $u = shift;
my $map = $self->{mapping};
my $der = $self->{derivCode};
+ my $out = $self->{overrideOut};
my $uXS = $self->{__useXS}; ### XS only
return if !defined $u;
return $self->varCE($HighestVCE) if $u eq 0xFFFF && $self->{highestFFFF};
return $self->varCE($minimalVCE) if $u eq 0xFFFE && $self->{minimalFFFE};
- return map($self->varCE($_), @{ $map->{$u} }) if $map->{$u};
+ $u = 0xFFFD if $u !~ /;/ && 0x10FFFF < $u && !$out;
+
+ my @ce;
+ if ($map->{$u}) {
+ @ce = @{ $map->{$u} }; # $u may be a contraction
### begin XS only ###
- return map($self->varCE($_), _fetch_simple($u))
- if $uXS && _exists_simple($u);
+ } elsif ($uXS && _exists_simple($u)) {
+ @ce = _fetch_simple($u);
### end XS only ###
-
- # JCPS must not be a contraction, then it's a code point.
- if (Hangul_SIni <= $u && $u <= Hangul_SFin) {
+ } elsif (Hangul_SIni <= $u && $u <= Hangul_SFin) {
my $hang = $self->{overrideHangul};
- my @hangulCE;
if ($hang) {
- @hangulCE = map _pack_override($_, $u, $der), $hang->($u);
+ @ce = map _pack_override($_, $u, $der), $hang->($u);
} elsif (!defined $hang) {
- @hangulCE = $der->($u);
+ @ce = $der->($u);
} else {
my $max = $self->{maxlength};
my @decH = _decompHangul($u);
@@ -665,25 +667,26 @@ sub getWt
}
}
- @hangulCE = map({
+ @ce = map({
$map->{$_} ? @{ $map->{$_} } :
$uXS && _exists_simple($_) ? _fetch_simple($_) : ### XS only
$der->($_);
} @decH);
}
- return map $self->varCE($_), @hangulCE;
+ } elsif ($out && 0x10FFFF < $u) {
+ @ce = map _pack_override($_, $u, $der), $out->($u);
} else {
my $cjk = $self->{overrideCJK};
my $vers = $self->{UCA_Version};
if ($cjk && _isUIdeo($u, $vers)) {
- my @cjkCE = map _pack_override($_, $u, $der), $cjk->($u);
- return map $self->varCE($_), @cjkCE;
- }
- if ($vers == 8 && defined $cjk && _isUIdeo($u, 0)) {
- return map $self->varCE($_), _uideoCE_8($u);
+ @ce = map _pack_override($_, $u, $der), $cjk->($u);
+ } elsif ($vers == 8 && defined $cjk && _isUIdeo($u, 0)) {
+ @ce = _uideoCE_8($u);
+ } else {
+ @ce = $der->($u);
}
- return map $self->varCE($_), $der->($u);
}
+ return map $self->varCE($_), @ce;
}
@@ -1095,6 +1098,9 @@ The following revisions are supported. The default is 26.
* Noncharacters (e.g. U+FFFF) are not ignored, and can be overridden
since C<UCA_Version> 22.
+* Out-of-range codepoints (greater than U+10FFFF) are not ignored,
+and can be overridden since C<UCA_Version> 22.
+
* Fully ignorable characters were ignored, and would not interrupt
contractions with C<UCA_Version> 9 and 11.
@@ -1216,7 +1222,8 @@ almost, but the latter has a problem that you should know which letter is
next to C<c>. For a certain language where C<ch> as the next letter,
C<"abch"> is greater than C<"abc\x{FFFF}">, but less than C<"abd">.
-Note: This is equivalent to C<entry =E<gt> 'FFFF ; [.FFFE.0020.0005.FFFF]'>.
+Note:
+This is equivalent to C<(entry =E<gt> 'FFFF ; [.FFFE.0020.0005.FFFF]')>.
Any other character than C<U+FFFF> can be tailored by C<entry>.
=item identical
@@ -1325,7 +1332,8 @@ then C<$a2> and C<$b2> at level 1, as followed.
"b\x{FFFE}aaa"
"bbb\x{FFFE}a"
-Note: This is equivalent to C<entry =E<gt> 'FFFE ; [.0001.0020.0005.FFFE]'>.
+Note:
+This is equivalent to C<(entry =E<gt> 'FFFE ; [.0001.0020.0005.FFFE]')>.
Any other character than C<U+FFFE> can be tailored by C<entry>.
=item normalization
@@ -1425,10 +1433,16 @@ ex. ignores all CJK unified ideographs.
# where ->eq("Pe\x{4E00}rl", "Perl") is true
# as U+4E00 is a CJK unified ideograph and to be ignorable.
-If C<undef> is passed explicitly as the value for this key,
-weights for CJK unified ideographs are treated as undefined.
+If a false value (including C<undef>) is passed, C<overrideCJK>
+has no effect.
+C<$Collator-E<gt>change(overrideCJK =E<gt> 0)> resets the old one.
+
But assignment of weight for CJK unified ideographs
in C<table> or C<entry> is still valid.
+If C<undef> is passed explicitly as the value for this key,
+weights for CJK unified ideographs are treated as undefined.
+However when C<UCA_Version> E<gt> 8, C<(overrideCJK =E<gt> undef)>
+has no special meaning.
B<Note:> In addition to them, 12 CJK compatibility ideographs (C<U+FA0E>,
C<U+FA0F>, C<U+FA11>, C<U+FA13>, C<U+FA14>, C<U+FA1F>, C<U+FA21>, C<U+FA23>,
@@ -1452,12 +1466,54 @@ NFD and NFKD are not appropriate, since NFD and NFKD will decompose
Hangul syllables before overriding. FCD may decompose Hangul syllables
as the case may be.
+If a false value (but not C<undef>) is passed, C<overrideHangul>
+has no effect.
+C<$Collator-E<gt>change(overrideHangul =E<gt> 0)> resets the old one.
+
If C<undef> is passed explicitly as the value for this key,
weight for Hangul syllables is treated as undefined
without decomposition into Hangul Jamo.
But definition of weight for Hangul syllables
in C<table> or C<entry> is still valid.
+=item overrideOut
+
+-- see 7.1.1 Handling Ill-Formed Code Unit Sequences, UTS #10.
+
+Perl seems to allow out-of-range values (greater than 0x10FFFF).
+By default, out-of-range values are replaced with C<U+FFFD>
+(REPLACEMENT CHARACTER) when C<UCA_Version> E<gt>= 22,
+or ignored when C<UCA_Version> E<lt>= 20.
+
+When C<UCA_Version> E<gt>= 22, the weights of out-of-range values
+can be overridden. Though C<table> or C<entry> are available for them,
+out-of-range values are too many.
+
+C<overrideOut> can perform it algorithmically.
+This parameter works like C<overrideCJK>, so see there for examples.
+
+ex. ignores all out-of-range values.
+
+ overrideOut => sub {()}, # CODEREF returning empty list
+
+If a false value (including C<undef>) is passed, C<overrideOut>
+has no effect.
+C<$Collator-E<gt>change(overrideOut =E<gt> 0)> resets the old one.
+
+UCA recommends that out-of-range values should not be ignored for security
+reasons. Say, C<"pe\x{110000}rl"> should not be equal to C<"perl">.
+However, C<U+FFFD> is wrongly mapped to a variable collation element
+in DUCET for Unicode 6.0.0 to 6.2.0, that means out-of-range values will be
+ignored when C<variable> isn't C<Non-ignorable>.
+
+Unicode 6.3.0 will correct the mapping of C<U+FFFD>.
+see L<http://www.unicode.org/reports/tr10/tr10-27.html#Trailing_Weights>.
+Such a correction is reproduced by this.
+
+ overrideOut => sub { 0xFFFD }, # CODEREF returning a very large integer
+
+Since Unicode 6.3.0, C<(overrideOut =E<gt> sub { 0xFFFD })> may be unnecesssary.
+
=item preprocess
-- see 5.4 Preprocessing, UTS #10.
@@ -1559,7 +1615,7 @@ may be better to avoid namespace conflict.
B<NOTE>: When XSUB is used, the DUCET is compiled on building this
module, and it may save time at the run time.
-Explicit saying C<table =E<gt> 'allkeys.txt'> (or using another table),
+Explicit saying C<(table =E<gt> 'allkeys.txt')>, or using another table,
or using C<ignoreChar>, C<ignoreName>, C<undefChar>, C<undefName> or
C<rewrite> will prevent this module from using the compiled DUCET.
@@ -1934,7 +1990,7 @@ module (see L<Unicode::Normalize>).
If you need not it (say, in the case when you need not
handle any combining characters),
-assign C<normalization =E<gt> undef> explicitly.
+assign C<(normalization =E<gt> undef)> explicitly.
-- see 6.5 Avoiding Normalization, UTS #10.
diff --git a/cpan/Unicode-Collate/Collate.xs b/cpan/Unicode-Collate/Collate.xs
index 27920ed0ea..c339cc7734 100644
--- a/cpan/Unicode-Collate/Collate.xs
+++ b/cpan/Unicode-Collate/Collate.xs
@@ -210,22 +210,8 @@ _isIllegal (sv)
XSRETURN_YES;
uv = SvUVX(sv);
RETVAL = boolSV(
- 0x10FFFF < uv /* out of range */
- );
-OUTPUT:
- RETVAL
-
-
-SV*
-_isNonchar (sv)
- SV* sv
- PREINIT:
- UV uv;
- CODE:
- /* should be called only if ! _isIllegal(sv). */
- uv = SvUVX(sv);
- RETVAL = boolSV(
- ((uv & 0xFFFE) == 0xFFFE) /* ??FFF[EF] (cf. utf8.c) */
+ 0x10FFFF < uv /* out of range */
+ || ((uv & 0xFFFE) == 0xFFFE) /* ??FFF[EF] (cf. utf8.c) */
|| (0xD800 <= uv && uv <= 0xDFFF) /* unpaired surrogates */
|| (0xFDD0 <= uv && uv <= 0xFDEF) /* other non-characters */
);
diff --git a/cpan/Unicode-Collate/README b/cpan/Unicode-Collate/README
index 141de8a305..1de270915b 100644
--- a/cpan/Unicode-Collate/README
+++ b/cpan/Unicode-Collate/README
@@ -1,4 +1,4 @@
-Unicode/Collate version 0.98
+Unicode/Collate version 0.99
===============================
NAME
diff --git a/cpan/Unicode-Collate/mkheader b/cpan/Unicode-Collate/mkheader
index c92d7c9036..c3d0ebf00a 100644
--- a/cpan/Unicode-Collate/mkheader
+++ b/cpan/Unicode-Collate/mkheader
@@ -135,11 +135,11 @@ foreach my $tbl (@tripletable) {
my $null = $tbl->{null};
my $init = $tbl->{init};
- open FH, ">$file" or croak "$PACKAGE: $file can't be made";
- binmode FH; select FH;
+ open my $fh_h, ">$file" or croak "$PACKAGE: $file can't be made";
+ binmode $fh_h; select $fh_h;
my %val;
- print FH << 'EOF';
+ print << 'EOF';
/*
* This file is auto-generated by mkheader.
* Any changes here will be lost!
@@ -189,7 +189,7 @@ EOF
print "\n";
}
print "};\n\n";
- close FH;
+ close $fh_h;
}
1;
diff --git a/cpan/Unicode-Collate/t/illegal.t b/cpan/Unicode-Collate/t/illegal.t
index 5d7999df52..7fa81e43e3 100644
--- a/cpan/Unicode-Collate/t/illegal.t
+++ b/cpan/Unicode-Collate/t/illegal.t
@@ -25,7 +25,7 @@ BEGIN {
use strict;
use warnings;
-BEGIN { $| = 1; print "1..65\n"; }
+BEGIN { $| = 1; print "1..127\n"; } # 77 + 5 x @Versions
my $count = 0;
sub ok ($;$) {
my $p = my $r = shift;
@@ -125,7 +125,7 @@ ok($nonch->lt("", "\x{FDD0}"));
ok($nonch->lt("", "\x{FDEF}"));
ok($nonch->lt("", "\x02"));
ok($nonch->lt("", "\x{10FFFF}"));
-ok($nonch->eq("", "\x{110000}"));
+ok($nonch->lt("", "\x{110000}"));
# 38..47
ok($nonch->lt("\x00", "\x01"));
@@ -137,7 +137,7 @@ ok($nonch->lt("\x{DFFF}", "\x{FDD0}"));
ok($nonch->lt("\x{FDD0}", "\x{FDEF}"));
ok($nonch->lt("\x{FDEF}", "\x02"));
ok($nonch->lt("\x02", "\x{10FFFF}"));
-ok($nonch->gt("\x{10FFFF}", "\x{110000}"));
+ok($nonch->lt("\x{10FFFF}", "\x{110000}"));
# 48..51
ok($nonch->lt("A", "A\x{FFFF}"));
@@ -178,3 +178,47 @@ for my $ret (@ret) {
ok($match eq $ret);
}
+##################
+
+my $out = Unicode::Collate->new(
+ level => 1,
+ table => undef,
+ normalization => undef,
+ overrideOut => sub { 0xFFFD },
+);
+
+my @Versions = (8, 9, 11, 14, 16, 18, 20, 22, 24, 26);
+
+for my $v (@Versions) {
+ $out->change(UCA_Version => $v);
+ ok($out->cmp('', "\x{10FFFF}") == ($v >= 22 ? -1 : 0));
+ ok($out->cmp('', "\x{110000}") == ($v >= 22 ? -1 : 0));
+ ok($out->cmp('ABC', "\x{110000}") == ($v >= 22 ? -1 : 1));
+ ok($out->cmp("\x{10FFFD}", "\x{110000}") == ($v >= 22 ? -1 : 1));
+ ok($out->cmp("\x{11FFFD}", "\x{110000}") == ($v >= 22 ? 0 : 0));
+}
+
+# x+66..x+77
+ok($out->lt('ABC', "\x{123456}"));
+ok($out->lt("\x{FFFD}", "\x{123456}"));
+
+$out->change(overrideOut => sub {()});
+
+ok($out->eq('', "\x{123456}"));
+ok($out->gt('ABC', "\x{123456}"));
+ok($out->gt("\x{FFFD}", "\x{123456}"));
+
+$out->change(overrideOut => undef);
+ok($out->lt('', "\x{123456}"));
+ok($out->eq("\x{FFFD}", "\x{123456}"));
+
+$out->change(overrideOut => sub { 0xFFFD });
+
+ok($out->lt('', "\x{123456}"));
+ok($out->lt('ABC', "\x{123456}"));
+ok($out->lt("\x{FFFD}", "\x{123456}"));
+
+$out->change(overrideOut => 0);
+ok($out->lt('', "\x{123456}"));
+ok($out->eq("\x{FFFD}", "\x{123456}"));
+
diff --git a/cpan/Unicode-Collate/t/override.t b/cpan/Unicode-Collate/t/override.t
index bc6a70c361..025a3698ab 100644
--- a/cpan/Unicode-Collate/t/override.t
+++ b/cpan/Unicode-Collate/t/override.t
@@ -13,7 +13,7 @@ BEGIN {
use strict;
use warnings;
-BEGIN { $| = 1; print "1..35\n"; }
+BEGIN { $| = 1; print "1..65\n"; }
my $count = 0;
sub ok ($;$) {
my $p = my $r = shift;
@@ -30,129 +30,186 @@ ok(1);
#########################
-##### 2..6
-
-my $all_undef_8 = Unicode::Collate->new(
- table => undef,
- normalization => undef,
- overrideCJK => undef,
- overrideHangul => undef,
- UCA_Version => 8,
-);
-
-# All in the Unicode code point order.
-# No hangul decomposition.
-
-ok($all_undef_8->lt("\x{3402}", "\x{4E00}"));
-ok($all_undef_8->lt("\x{4DFF}", "\x{4E00}"));
-ok($all_undef_8->lt("\x{4E00}", "\x{AC00}"));
-ok($all_undef_8->gt("\x{AC00}", "\x{1100}\x{1161}"));
-ok($all_undef_8->gt("\x{AC00}", "\x{ABFF}"));
-
-
-##### 7..11
-
-my $all_undef_9 = Unicode::Collate->new(
- table => undef,
- normalization => undef,
- overrideCJK => undef,
- overrideHangul => undef,
- UCA_Version => 9,
-);
-
-# CJK Ideo. < CJK ext A/B < Others.
-# No hangul decomposition.
-
-ok($all_undef_9->lt("\x{4E00}", "\x{3402}"));
-ok($all_undef_9->lt("\x{3402}", "\x{20000}"));
-ok($all_undef_9->lt("\x{20000}", "\x{AC00}"));
-ok($all_undef_9->gt("\x{AC00}", "\x{1100}\x{1161}"));
-ok($all_undef_9->gt("\x{AC00}", "\x{ABFF}")); # U+ABFF: not assigned
-
-##### 12..16
-
-my $ignoreHangul = Unicode::Collate->new(
- table => undef,
- normalization => undef,
- overrideHangul => sub {()},
- entry => <<'ENTRIES',
-AE00 ; [.0100.0020.0002.AE00] # Hangul GEUL
-ENTRIES
-);
-
-# All Hangul Syllables except U+AE00 are ignored.
-
-ok($ignoreHangul->eq("\x{AC00}", ""));
-ok($ignoreHangul->lt("\x{AC00}", "\0"));
-ok($ignoreHangul->lt("\x{AC00}", "\x{AE00}"));
-ok($ignoreHangul->lt("\x{AC00}", "\x{1100}\x{1161}")); # Jamo are not ignored.
-ok($ignoreHangul->lt("Pe\x{AE00}rl", "Perl")); # 'r' is unassigned.
-
-##### 17..21
-
-my $undefHangul = Unicode::Collate->new(
- table => undef,
- normalization => undef,
- overrideHangul => sub {
- my $u = shift;
- return $u == 0xAE00 ? 0x100 : undef;
- }
-);
-
-# All Hangul Syllables except U+AE00 are undefined.
-
-ok($undefHangul->lt("\x{AE00}", "r"));
-ok($undefHangul->gt("\x{AC00}", "r"));
-ok($undefHangul->gt("\x{AC00}", "\x{1100}\x{1161}"));
-ok($undefHangul->lt("Pe\x{AE00}rl", "Perl")); # 'r' is unassigned.
-ok($undefHangul->lt("\x{AC00}", "\x{B000}"));
-
-##### 22..25
-
-my $undefCJK = Unicode::Collate->new(
- table => undef,
- normalization => undef,
- overrideCJK => sub {
- my $u = shift;
- return $u == 0x4E00 ? 0x100 : undef;
- }
-);
-
-# All CJK Ideographs except U+4E00 are undefined.
-
-ok($undefCJK->lt("\x{4E00}", "r"));
-ok($undefCJK->lt("\x{5000}", "r")); # still CJK < unassigned
-ok($undefCJK->lt("Pe\x{4E00}rl", "Perl")); # 'r' is unassigned.
-ok($undefCJK->lt("\x{5000}", "\x{6000}"));
-
-##### 26..30
-
-my $cpHangul = Unicode::Collate->new(
- table => undef,
- normalization => undef,
- overrideHangul => sub { shift }
-);
-
-ok($cpHangul->lt("\x{AC00}", "\x{AC01}"));
-ok($cpHangul->lt("\x{AC01}", "\x{D7A3}"));
-ok($cpHangul->lt("\x{D7A3}", "r")); # 'r' is unassigned.
-ok($cpHangul->lt("r", "\x{D7A4}"));
-ok($cpHangul->lt("\x{D7A3}", "\x{4E00}"));
-
-##### 31..35
-
-my $arrayHangul = Unicode::Collate->new(
- table => undef,
- normalization => undef,
- overrideHangul => sub {
- my $u = shift;
- return [$u, 0x20, 0x2, $u];
- }
-);
-
-ok($arrayHangul->lt("\x{AC00}", "\x{AC01}"));
-ok($arrayHangul->lt("\x{AC01}", "\x{D7A3}"));
-ok($arrayHangul->lt("\x{D7A3}", "r")); # 'r' is unassigned.
-ok($arrayHangul->lt("r", "\x{D7A4}"));
-ok($arrayHangul->lt("\x{D7A3}", "\x{4E00}"));
+##### 2..31
+
+{
+ my $all_undef_8 = Unicode::Collate->new(
+ table => undef,
+ normalization => undef,
+ overrideCJK => undef,
+ overrideHangul => undef,
+ UCA_Version => 8,
+ );
+ # All in the Unicode code point order.
+ # No hangul decomposition.
+
+ ok($all_undef_8->lt("\x{1100}", "\x{3402}"));
+ ok($all_undef_8->lt("\x{3402}", "\x{4E00}"));
+ ok($all_undef_8->lt("\x{4DFF}", "\x{4E00}"));
+ ok($all_undef_8->lt("\x{4E00}", "\x{AC00}"));
+ ok($all_undef_8->gt("\x{AC00}", "\x{1100}\x{1161}"));
+ ok($all_undef_8->gt("\x{AC00}", "\x{ABFF}"));
+ # U+ABFF: not assigned
+
+ # a hangul syllable is decomposed into jamo.
+ $all_undef_8->change(overrideHangul => 0);
+ ok($all_undef_8->lt("\x{1100}", "\x{3402}"));
+ ok($all_undef_8->lt("\x{3402}", "\x{4E00}"));
+ ok($all_undef_8->lt("\x{4DFF}", "\x{4E00}"));
+ ok($all_undef_8->gt("\x{4E00}", "\x{AC00}"));
+ ok($all_undef_8->eq("\x{AC00}", "\x{1100}\x{1161}"));
+ ok($all_undef_8->lt("\x{AC00}", "\x{ABFF}"));
+
+ # CJK defined < Jamo undefined
+ $all_undef_8->change(overrideCJK => 0);
+ ok($all_undef_8->gt("\x{1100}", "\x{3402}"));
+ ok($all_undef_8->lt("\x{3402}", "\x{4E00}"));
+ ok($all_undef_8->gt("\x{4DFF}", "\x{4E00}"));
+ ok($all_undef_8->lt("\x{4E00}", "\x{AC00}"));
+ ok($all_undef_8->eq("\x{AC00}", "\x{1100}\x{1161}"));
+ ok($all_undef_8->lt("\x{AC00}", "\x{ABFF}"));
+
+ # CJK undefined > Jamo undefined
+ $all_undef_8->change(overrideCJK => undef);
+ ok($all_undef_8->lt("\x{1100}", "\x{3402}"));
+ ok($all_undef_8->lt("\x{3402}", "\x{4E00}"));
+ ok($all_undef_8->lt("\x{4DFF}", "\x{4E00}"));
+ ok($all_undef_8->gt("\x{4E00}", "\x{AC00}"));
+ ok($all_undef_8->eq("\x{AC00}", "\x{1100}\x{1161}"));
+ ok($all_undef_8->lt("\x{AC00}", "\x{ABFF}"));
+
+ $all_undef_8->change(overrideHangul => undef);
+ ok($all_undef_8->lt("\x{1100}", "\x{3402}"));
+ ok($all_undef_8->lt("\x{3402}", "\x{4E00}"));
+ ok($all_undef_8->lt("\x{4DFF}", "\x{4E00}"));
+ ok($all_undef_8->lt("\x{4E00}", "\x{AC00}"));
+ ok($all_undef_8->gt("\x{AC00}", "\x{1100}\x{1161}"));
+ ok($all_undef_8->gt("\x{AC00}", "\x{ABFF}"));
+}
+
+##### 32..38
+
+{
+ my $all_undef_9 = Unicode::Collate->new(
+ table => undef,
+ normalization => undef,
+ overrideCJK => undef,
+ overrideHangul => undef,
+ UCA_Version => 9,
+ );
+ # CJK Ideo. < CJK ext A/B < Others.
+ # No hangul decomposition.
+
+ ok($all_undef_9->lt("\x{4E00}", "\x{3402}"));
+ ok($all_undef_9->lt("\x{3402}", "\x{20000}"));
+ ok($all_undef_9->lt("\x{20000}", "\x{AC00}"));
+ ok($all_undef_9->gt("\x{AC00}", "\x{1100}\x{1161}"));
+ ok($all_undef_9->gt("\x{AC00}", "\x{ABFF}"));
+ # U+ABFF: not assigned
+
+ # a hangul syllable is decomposed into jamo.
+ $all_undef_9->change(overrideHangul => 0);
+ ok($all_undef_9->eq("\x{AC00}", "\x{1100}\x{1161}"));
+ ok($all_undef_9->lt("\x{AC00}", "\x{ABFF}"));
+}
+
+##### 39..46
+
+{
+ my $ignoreHangul = Unicode::Collate->new(
+ table => undef,
+ normalization => undef,
+ overrideHangul => sub {()},
+ entry => 'AE00 ; [.0100.0020.0002.AE00] # Hangul GEUL',
+ );
+ # All Hangul Syllables except U+AE00 are ignored.
+
+ ok($ignoreHangul->eq("\x{AC00}", ""));
+ ok($ignoreHangul->lt("\x{AC00}", "\0"));
+ ok($ignoreHangul->lt("\x{AC00}", "\x{AE00}"));
+ ok($ignoreHangul->lt("\x{AC00}", "\x{1100}\x{1161}")); # Jamo are not ignored.
+ ok($ignoreHangul->eq("Pe\x{AC00}rl", "Perl"));
+ ok($ignoreHangul->lt("Pe\x{AE00}rl", "Perl"));
+ # 'r' is unassigned.
+
+ $ignoreHangul->change(overrideHangul => 0);
+ ok($ignoreHangul->eq("\x{AC00}", "\x{1100}\x{1161}"));
+
+ $ignoreHangul->change(overrideHangul => undef);
+ ok($ignoreHangul->gt("\x{AC00}", "\x{1100}\x{1161}"));
+}
+
+##### 47..51
+
+{
+ my $undefHangul = Unicode::Collate->new(
+ table => undef,
+ normalization => undef,
+ overrideHangul => sub {
+ my $u = shift;
+ return $u == 0xAE00 ? 0x100 : undef;
+ }
+ );
+ # All Hangul Syllables except U+AE00 are undefined.
+
+ ok($undefHangul->lt("\x{AE00}", "r"));
+ ok($undefHangul->gt("\x{AC00}", "r"));
+ ok($undefHangul->gt("\x{AC00}", "\x{1100}\x{1161}"));
+ ok($undefHangul->lt("Pe\x{AE00}rl", "Perl")); # 'r' is unassigned.
+ ok($undefHangul->lt("\x{AC00}", "\x{B000}"));
+}
+
+##### 52..55
+
+{
+ my $undefCJK = Unicode::Collate->new(
+ table => undef,
+ normalization => undef,
+ overrideCJK => sub {
+ my $u = shift;
+ return $u == 0x4E00 ? 0x100 : undef;
+ }
+ );
+ # All CJK Ideographs except U+4E00 are undefined.
+
+ ok($undefCJK->lt("\x{4E00}", "r"));
+ ok($undefCJK->lt("\x{5000}", "r")); # still CJK < unassigned
+ ok($undefCJK->lt("Pe\x{4E00}rl", "Perl"));
+ ok($undefCJK->lt("\x{5000}", "\x{6000}"));
+}
+
+##### 56..60
+
+{
+ my $cpHangul = Unicode::Collate->new(
+ table => undef,
+ normalization => undef,
+ overrideHangul => sub { shift }
+ );
+
+ ok($cpHangul->lt("\x{AC00}", "\x{AC01}"));
+ ok($cpHangul->lt("\x{AC01}", "\x{D7A3}"));
+ ok($cpHangul->lt("\x{D7A3}", "r"));
+ ok($cpHangul->lt("r", "\x{D7A4}"));
+ ok($cpHangul->lt("\x{D7A3}", "\x{4E00}"));
+}
+
+##### 61..65
+
+{
+ my $arrayHangul = Unicode::Collate->new(
+ table => undef,
+ normalization => undef,
+ overrideHangul => sub {
+ my $u = shift;
+ return [$u, 0x20, 0x2, $u];
+ }
+ );
+
+ ok($arrayHangul->lt("\x{AC00}", "\x{AC01}"));
+ ok($arrayHangul->lt("\x{AC01}", "\x{D7A3}"));
+ ok($arrayHangul->lt("\x{D7A3}", "r"));
+ ok($arrayHangul->lt("r", "\x{D7A4}"));
+ ok($arrayHangul->lt("\x{D7A3}", "\x{4E00}"));
+}