diff options
Diffstat (limited to 'lib/Unicode')
-rw-r--r-- | lib/Unicode/Collate.pm | 94 | ||||
-rw-r--r-- | lib/Unicode/Collate/Changes | 15 | ||||
-rw-r--r-- | lib/Unicode/Collate/README | 2 | ||||
-rw-r--r-- | lib/Unicode/Collate/t/hangul.t | 193 |
4 files changed, 272 insertions, 32 deletions
diff --git a/lib/Unicode/Collate.pm b/lib/Unicode/Collate.pm index fa19afef4f..2bcc3155d2 100644 --- a/lib/Unicode/Collate.pm +++ b/lib/Unicode/Collate.pm @@ -14,7 +14,7 @@ use File::Spec; require Exporter; -our $VERSION = '0.26'; +our $VERSION = '0.27'; our $PACKAGE = __PACKAGE__; our @ISA = qw(Exporter); @@ -225,17 +225,18 @@ sub checkCollator { croak "Unicode/Normalize.pm is required to normalize strings: $@" if $@; - Unicode::Normalize->import(); $getCombinClass = \&Unicode::Normalize::getCombinClass if ! $getCombinClass; - $self->{normCode} = - $self->{normalization} =~ /^(?:NF)?C$/ ? \&NFC : - $self->{normalization} =~ /^(?:NF)?D$/ ? \&NFD : - $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC : - $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD : - croak "$PACKAGE unknown normalization form name: " - . $self->{normalization}; + my $norm = $self->{normalization}; + $self->{normCode} = sub { + Unicode::Normalize::normalize($norm, shift); + }; + + eval { $self->{normCode}->("") }; # try + if ($@) { + croak "$PACKAGE unknown normalization form name: $norm"; + } } return; } @@ -477,10 +478,13 @@ sub splitCE if ($max->{$ce}) { # contract my $temp_ce = $ce; + my $ceLen = 1; + my $maxLen = $max->{$ce}; - for (my $p = $i + 1; $p < @src; $p++) { + for (my $p = $i + 1; $ceLen < $maxLen && $p < @src; $p++) { next if ! defined $src[$p]; $temp_ce .= CODE_SEP . $src[$p]; + $ceLen++; if ($ent->{$temp_ce}) { $ce = $temp_ce; $i = $p; @@ -524,8 +528,6 @@ sub getWt my $self = shift; my $ce = shift; my $ent = $self->{entries}; - my $cjk = $self->{overrideCJK}; - my $hang = $self->{overrideHangul}; my $der = $self->{derivCode}; return if !defined $ce; @@ -536,18 +538,50 @@ sub getWt my $u = $ce; if (0xAC00 <= $u && $u <= 0xD7A3) { # is Hangul Syllale - return map $self->altCE($_), - $hang - ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$hang($u)) - : defined $hang - ? map({ - $ent->{$_} ? @{ $ent->{$_} } : $der->($_); - } _decompHangul($u)) - : $der->($u); + my $hang = $self->{overrideHangul}; + my @hangulCE; + if ($hang) { + @hangulCE = map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$hang($u)); + } + elsif (!defined $hang) { + @hangulCE = $der->($u); + } + else { + my $max = $self->{maxlength}; + my @decH = _decompHangul($u); + + if (@decH == 2) { + my $contract = join(CODE_SEP, @decH); + @decH = ($contract) if $ent->{$contract}; + } else { # must be <@decH == 3> + if ($max->{$decH[0]}) { + my $contract = join(CODE_SEP, @decH); + if ($ent->{$contract}) { + @decH = ($contract); + } else { + $contract = join(CODE_SEP, @decH[0,1]); + $ent->{$contract} and @decH = ($contract, $decH[2]); + } + # even if V's ignorable, LT contraction is not supported. + # If such a situatution were required, NFD should be used. + } + if (@decH == 3 && $max->{$decH[1]}) { + my $contract = join(CODE_SEP, @decH[1,2]); + $ent->{$contract} and @decH = ($decH[0], $contract); + } + } + + @hangulCE = map({ + $ent->{$_} ? @{ $ent->{$_} } : $der->($_); + } @decH); + } + return map $self->altCE($_), @hangulCE; } elsif (0x3400 <= $u && $u <= 0x4DB5 || 0x4E00 <= $u && $u <= 0x9FA5 || - 0x20000 <= $u && $u <= 0x2A6D6) { # CJK Ideograph + 0x20000 <= $u && $u <= 0x2A6D6) # CJK Ideograph + { + my $cjk = $self->{overrideCJK}; return map $self->altCE($_), $cjk ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$cjk($u)) @@ -1092,14 +1126,12 @@ If omitted, the maximum is the 4th. If specified, strings are normalized before preparation of sort keys (the normalization is executed after preprocess). -As a form name, one of the following names must be used. +A form name C<Unicode::Normalize::normalize()> accepts will be applied +as C<$normalization_form>. +See C<Unicode::Normalize::normalize()> for detail. +If omitted, C<'NFD'> is used. - 'C' or 'NFC' for Normalization Form C - 'D' or 'NFD' for Normalization Form D - 'KC' or 'NFKC' for Normalization Form KC - 'KD' or 'NFKD' for Normalization Form KD - -If omitted, the string is put into Normalization Form D. +L<normalization> is performed after L<preprocess> (if defined). If C<undef> is passed explicitly as the value for this key, any normalization is not carried out (this may make tailoring easier @@ -1169,9 +1201,11 @@ Then, "the pen" is before "a pencil". preprocess => sub { my $str = shift; $str =~ s/\b(?:an?|the)\s+//gi; - $str; + return $str; }, +L<preprocess> is performed before L<normalization> (if defined). + =item rearrange -- see 3.1.3 Rearrangement, UTS #10. @@ -1505,7 +1539,7 @@ B<Unicode::Normalize is required to try The Conformance Test.> =head1 AUTHOR -SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt> +SADAHIRO Tomoyuki, <SADAHIRO@cpan.org> http://homepage1.nifty.com/nomenclator/perl/ diff --git a/lib/Unicode/Collate/Changes b/lib/Unicode/Collate/Changes index c54933e328..4f61b8332f 100644 --- a/lib/Unicode/Collate/Changes +++ b/lib/Unicode/Collate/Changes @@ -1,8 +1,21 @@ Revision history for Perl module Unicode::Collate. +0.27 Sun Aug 31 22:23:17 2003 + some improvements: + - The maximum length of contracted CE was not checked. + Collation of a large string including a first letter of a contraction + that is not a part of that contraction (say, 'c' of 'ca' + where 'ch' is defined) was too slow, inefficient. + - A form name for 'normalize', no longer restricted to /^(?:NF)?K?[CD]\z/, + will be allowed as long as Unicode::Normalize::normalize() accepts it. + since Unicode::Normalize or UAX #15 may be changed/enhanced in future. + - When Hangul syllables are decomposed under <normalization => undef>, + contraction among jamo (LV, VT, LVT) derived from the same + Hangul syllable is allowed. Added hangul.t. + 0.26 Sun Aug 03 22:23:17 2003 - fix: an expansion in which a CE is level 3 ignorable and others are not - was wrongly made level 3 ignorable as a whole entry. + was wrongly made level 3 ignorable as a whole entry. (In DUCET, some precomposites in Musical Symbols are so) 0.25 Mon Jun 06 23:20:17 2003 diff --git a/lib/Unicode/Collate/README b/lib/Unicode/Collate/README index 7b555fce7a..21e1ff8b48 100644 --- a/lib/Unicode/Collate/README +++ b/lib/Unicode/Collate/README @@ -1,4 +1,4 @@ -Unicode/Collate version 0.26 +Unicode/Collate version 0.27 =============================== NAME diff --git a/lib/Unicode/Collate/t/hangul.t b/lib/Unicode/Collate/t/hangul.t new file mode 100644 index 0000000000..be6b0724fb --- /dev/null +++ b/lib/Unicode/Collate/t/hangul.t @@ -0,0 +1,193 @@ +BEGIN { + unless ("A" eq pack('U', 0x41)) { + print "1..0 # Unicode::Collate " . + "cannot stringify a Unicode code point\n"; + exit 0; + } +} + +BEGIN { + if ($ENV{PERL_CORE}) { + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + } +} + +use Test; +BEGIN { plan tests => 52 }; + +use strict; +use warnings; +use Unicode::Collate; + +use vars qw($IsEBCDIC); +$IsEBCDIC = ord("A") != 0x41; + +######################### + +ok(1); # If we made it this far, we're ok. + +# a standard collator (3.1.1) +my $Collator = Unicode::Collate->new( + table => 'keys.txt', + normalization => undef, +); + + +# a collator for hangul sorting, +# cf. http://std.dkuug.dk/JTC1/SC22/WG20/docs/documents.html +# http://std.dkuug.dk/JTC1/SC22/WG20/docs/n1051-hangulsort.pdf +my $hangul = Unicode::Collate->new( + level => 3, + table => undef, + normalization => undef, + entry => <<'ENTRIES', +0061 ; [.0A15.0020.0002] # LATIN SMALL LETTER A +0041 ; [.0A15.0020.0008] # LATIN CAPITAL LETTER A +#1161 ; [.1800.0020.0002] # <comment> initial jungseong A +#1163 ; [.1801.0020.0002] # <comment> initial jungseong YA +1100 ; [.1831.0020.0002] # choseong KIYEOK +1100 1161 ; [.1831.0020.0002][.1800.0020.0002] # G-A +1100 1163 ; [.1831.0020.0002][.1801.0020.0002] # G-YA +1101 ; [.1831.0020.0002][.1831.0020.0002] # choseong SSANGKIYEOK +1101 1161 ; [.1831.0020.0002][.1831.0020.0002][.1800.0020.0002] # GG-A +1101 1163 ; [.1831.0020.0002][.1831.0020.0002][.1801.0020.0002] # GG-YA +1102 ; [.1833.0020.0002] # choseong NIEUN +1102 1161 ; [.1833.0020.0002][.1800.0020.0002] # N-A +1102 1163 ; [.1833.0020.0002][.1801.0020.0002] # N-YA +3042 ; [.1921.0020.000E] # HIRAGANA LETTER A +11A8 ; [.FE10.0020.0002] # jongseong KIYEOK +11A9 ; [.FE10.0020.0002][.FE10.0020.0002] # jongseong SSANGKIYEOK +1161 ; [.FE20.0020.0002] # jungseong A <non-initial> +1163 ; [.FE21.0020.0002] # jungseong YA <non-initial> +ENTRIES +); + +ok(ref $hangul, "Unicode::Collate"); + +######################### + +# L(simp)L(simp) vs L(comp): /GGA/ +ok($Collator->lt("\x{1100}\x{1100}\x{1161}", "\x{1101}\x{1161}")); +ok($hangul ->eq("\x{1100}\x{1100}\x{1161}", "\x{1101}\x{1161}")); + +# L(simp) vs L(simp)L(simp): /GA/ vs /GGA/ +ok($Collator->gt("\x{1100}\x{1161}", "\x{1100}\x{1100}\x{1161}")); +ok($hangul ->lt("\x{1100}\x{1161}", "\x{1100}\x{1100}\x{1161}")); + +# T(simp)T(simp) vs T(comp): /AGG/ +ok($Collator->lt("\x{1161}\x{11A8}\x{11A8}", "\x{1161}\x{11A9}")); +ok($hangul ->eq("\x{1161}\x{11A8}\x{11A8}", "\x{1161}\x{11A9}")); + +# T(simp) vs T(simp)T(simp): /AG/ vs /AGG/ +ok($Collator->lt("\x{1161}\x{11A8}", "\x{1161}\x{11A8}\x{11A8}")); +ok($hangul ->lt("\x{1161}\x{11A8}", "\x{1161}\x{11A8}\x{11A8}")); + +# LV vs LLV: /GA/ vs /GNA/ +ok($Collator->gt("\x{1100}\x{1161}", "\x{1100}\x{1102}\x{1161}")); +ok($hangul ->lt("\x{1100}\x{1161}", "\x{1100}\x{1102}\x{1161}")); + +# LVX vs LVV: /GAA/ vs /GA/.latinA +ok($Collator->gt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}A")); +ok($hangul ->gt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}A")); + +# LVX vs LVV: /GAA/ vs /GA/.hiraganaA +ok($Collator->lt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}\x{3042}")); +ok($hangul ->gt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}\x{3042}")); + +# LVX vs LVV: /GAA/ vs /GA/.hanja +ok($Collator->lt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}\x{4E00}")); +ok($hangul ->gt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}\x{4E00}")); + +# LVL vs LVT: /GA/./G/ vs /GAG/ +ok($Collator->lt("\x{1100}\x{1161}\x{1100}", "\x{1100}\x{1161}\x{11A8}")); +ok($hangul ->lt("\x{1100}\x{1161}\x{1100}", "\x{1100}\x{1161}\x{11A8}")); + +# LVT vs LVX: /GAG/ vs /GA/.latinA +ok($Collator->gt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}A")); +ok($hangul ->gt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}A")); + +# LVT vs LVX: /GAG/ vs /GA/.hiraganaA +ok($Collator->lt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{3042}")); +ok($hangul ->gt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{3042}")); + +# LVT vs LVX: /GAG/ vs /GA/.hanja +ok($Collator->lt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{4E00}")); +ok($hangul ->gt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{4E00}")); + +# LVT vs LVV: /GAG/ vs /GAA/ +ok($Collator->gt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{1161}")); +ok($hangul ->lt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{1161}")); + +# LVL vs LVV: /GA/./G/ vs /GAA/ +ok($Collator->lt("\x{1100}\x{1161}\x{1100}", "\x{1100}\x{1161}\x{1161}")); +ok($hangul ->lt("\x{1100}\x{1161}\x{1100}", "\x{1100}\x{1161}\x{1161}")); + +# LV vs Syl(LV): /GA/ vs /[GA]/ +ok($Collator->eq("\x{1100}\x{1161}", "\x{AC00}")); +ok($hangul ->eq("\x{1100}\x{1161}", "\x{AC00}")); + +# LVT vs Syl(LV)T: /GAG/ vs /[GA]G/ +ok($Collator->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC00}\x{11A8}")); +ok($hangul ->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC00}\x{11A8}")); + +# LVT vs Syl(LVT): /GAG/ vs /[GAG]/ +ok($Collator->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC01}")); +ok($hangul ->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC01}")); + +# LVTT vs Syl(LVTT): /GAGG/ vs /[GAGG]/ +ok($Collator->eq("\x{1100}\x{1161}\x{11A9}", "\x{AC02}")); +ok($hangul ->eq("\x{1100}\x{1161}\x{11A9}", "\x{AC02}")); + +# LVTT vs Syl(LVT).T: /GAGG/ vs /[GAG]G/ +ok($Collator->gt("\x{1100}\x{1161}\x{11A9}", "\x{AC01}\x{11A8}")); +ok($hangul ->eq("\x{1100}\x{1161}\x{11A9}", "\x{AC01}\x{11A8}")); + +# LLVT vs L.Syl(LVT): /GGAG/ vs /G[GAG]/ +ok($Collator->gt("\x{1101}\x{1161}\x{11A8}", "\x{1100}\x{AC01}")); +ok($hangul ->eq("\x{1101}\x{1161}\x{11A8}", "\x{1100}\x{AC01}")); + +######################### + +# checks contraction in LVT: +# weights of these contractions may be non-sense. + +my $hangcont = Unicode::Collate->new( + level => 3, + table => undef, + normalization => undef, + entry => <<'ENTRIES', +1100 ; [.1831.0020.0002] # HANGUL CHOSEONG KIYEOK +1101 ; [.1832.0020.0002] # HANGUL CHOSEONG SSANGKIYEOK +1161 ; [.188D.0020.0002] # HANGUL JUNGSEONG A +1162 ; [.188E.0020.0002] # HANGUL JUNGSEONG AE +1163 ; [.188F.0020.0002] # HANGUL JUNGSEONG YA +11A8 ; [.18CF.0020.0002] # HANGUL JONGSEONG KIYEOK +11A9 ; [.18D0.0020.0002] # HANGUL JONGSEONG SSANGKIYEOK +1161 11A9 ; [.0000.0000.0000] # A-GG <contraction> +1100 1163 11A8 ; [.1000.0020.0002] # G-YA-G <contraction> eq. U+AC39 +ENTRIES +); + +# contracted into VT +ok($Collator->lt("\x{1101}", "\x{1101}\x{1161}\x{11A9}")); +ok($hangcont->eq("\x{1101}", "\x{1101}\x{1161}\x{11A9}")); + +# not contracted into LVT but into VT +ok($Collator->lt("\x{1100}", "\x{1100}\x{1161}\x{11A9}")); +ok($hangcont->eq("\x{1100}", "\x{1100}\x{1161}\x{11A9}")); + +# contracted into LVT +ok($Collator->gt("\x{1100}\x{1163}\x{11A8}", "\x{1100}")); +ok($hangcont->lt("\x{1100}\x{1163}\x{11A8}", "\x{1100}")); + +# LVTT vs Syl(LVTT): /GAGG/ vs /[GAGG]/ +ok($Collator->eq("\x{1100}\x{1161}\x{11A9}", "\x{AC02}")); +ok($hangcont->eq("\x{1100}\x{1161}\x{11A9}", "\x{AC02}")); + +# LVT vs Syl(LVT): /GYAG/ vs /[GYAG]/ +ok($Collator->eq("\x{1100}\x{1163}\x{11A8}", "\x{AC39}")); +ok($hangcont->eq("\x{1100}\x{1163}\x{11A8}", "\x{AC39}")); + +1; +__END__ |