summaryrefslogtreecommitdiff
path: root/cpan/Unicode-Collate/t/hangul.t
diff options
context:
space:
mode:
Diffstat (limited to 'cpan/Unicode-Collate/t/hangul.t')
-rw-r--r--cpan/Unicode-Collate/t/hangul.t228
1 files changed, 228 insertions, 0 deletions
diff --git a/cpan/Unicode-Collate/t/hangul.t b/cpan/Unicode-Collate/t/hangul.t
new file mode 100644
index 0000000000..d9f7db9b10
--- /dev/null
+++ b/cpan/Unicode-Collate/t/hangul.t
@@ -0,0 +1,228 @@
+BEGIN {
+ unless ("A" eq pack('U', 0x41)) {
+ print "1..0 # Unicode::Collate " .
+ "cannot stringify a Unicode code point\n";
+ exit 0;
+ }
+ if ($ENV{PERL_CORE}) {
+ chdir('t') if -d 't';
+ @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib);
+ }
+}
+
+use Test;
+BEGIN { plan tests => 72 };
+
+use strict;
+use warnings;
+use Unicode::Collate;
+
+#########################
+
+ok(1);
+
+# a standard collator (3.1.1)
+my $Collator = Unicode::Collate->new(
+ table => 'keys.txt',
+ normalization => undef,
+);
+
+
+# a collator for hangul sorting,
+# cf. http://std.dkuug.dk/JTC1/SC22/WG20/docs/documents.html
+# http://std.dkuug.dk/JTC1/SC22/WG20/docs/n1051-hangulsort.pdf
+my $hangul = Unicode::Collate->new(
+ level => 3,
+ table => undef,
+ normalization => undef,
+
+ entry => <<'ENTRIES',
+0061 ; [.0A15.0020.0002] # LATIN SMALL LETTER A
+0041 ; [.0A15.0020.0008] # LATIN CAPITAL LETTER A
+#1161 ; [.1800.0020.0002] # <comment> initial jungseong A
+#1163 ; [.1801.0020.0002] # <comment> initial jungseong YA
+1100 ; [.1831.0020.0002] # choseong KIYEOK
+1100 1161 ; [.1831.0020.0002][.1800.0020.0002] # G-A
+1100 1163 ; [.1831.0020.0002][.1801.0020.0002] # G-YA
+1101 ; [.1831.0020.0002][.1831.0020.0002] # choseong SSANGKIYEOK
+1101 1161 ; [.1831.0020.0002][.1831.0020.0002][.1800.0020.0002] # GG-A
+1101 1163 ; [.1831.0020.0002][.1831.0020.0002][.1801.0020.0002] # GG-YA
+1102 ; [.1833.0020.0002] # choseong NIEUN
+1102 1161 ; [.1833.0020.0002][.1800.0020.0002] # N-A
+1102 1163 ; [.1833.0020.0002][.1801.0020.0002] # N-YA
+3042 ; [.1921.0020.000E] # HIRAGANA LETTER A
+11A8 ; [.FE10.0020.0002] # jongseong KIYEOK
+11A9 ; [.FE10.0020.0002][.FE10.0020.0002] # jongseong SSANGKIYEOK
+1161 ; [.FE20.0020.0002] # jungseong A <non-initial>
+1163 ; [.FE21.0020.0002] # jungseong YA <non-initial>
+ENTRIES
+);
+
+ok(ref $hangul, "Unicode::Collate");
+
+my $trailwt = Unicode::Collate->new(
+ level => 3,
+ table => undef,
+ normalization => undef,
+ hangul_terminator => 16,
+
+ entry => <<'ENTRIES', # Term < Jongseong < Jungseong < Choseong
+0061 ; [.0A15.0020.0002] # LATIN SMALL LETTER A
+0041 ; [.0A15.0020.0008] # LATIN CAPITAL LETTER A
+11A8 ; [.1801.0020.0002] # HANGUL JONGSEONG KIYEOK
+11A9 ; [.1801.0020.0002][.1801.0020.0002] # HANGUL JONGSEONG SSANGKIYEOK
+1161 ; [.1831.0020.0002] # HANGUL JUNGSEONG A
+1163 ; [.1832.0020.0002] # HANGUL JUNGSEONG YA
+1100 ; [.1861.0020.0002] # HANGUL CHOSEONG KIYEOK
+1101 ; [.1861.0020.0002][.1861.0020.0002] # HANGUL CHOSEONG SSANGKIYEOK
+1102 ; [.1862.0020.0002] # HANGUL CHOSEONG NIEUN
+3042 ; [.1921.0020.000E] # HIRAGANA LETTER A
+ENTRIES
+);
+
+#########################
+
+# L(simp)L(simp) vs L(comp): /GGA/
+ok($Collator->lt("\x{1100}\x{1100}\x{1161}", "\x{1101}\x{1161}"));
+ok($hangul ->eq("\x{1100}\x{1100}\x{1161}", "\x{1101}\x{1161}"));
+ok($trailwt ->eq("\x{1100}\x{1100}\x{1161}", "\x{1101}\x{1161}"));
+
+# L(simp) vs L(simp)L(simp): /GA/ vs /GGA/
+ok($Collator->gt("\x{1100}\x{1161}", "\x{1100}\x{1100}\x{1161}"));
+ok($hangul ->lt("\x{1100}\x{1161}", "\x{1100}\x{1100}\x{1161}"));
+ok($trailwt ->lt("\x{1100}\x{1161}", "\x{1100}\x{1100}\x{1161}"));
+
+# T(simp)T(simp) vs T(comp): /AGG/
+ok($Collator->lt("\x{1161}\x{11A8}\x{11A8}", "\x{1161}\x{11A9}"));
+ok($hangul ->eq("\x{1161}\x{11A8}\x{11A8}", "\x{1161}\x{11A9}"));
+ok($trailwt ->eq("\x{1161}\x{11A8}\x{11A8}", "\x{1161}\x{11A9}"));
+
+# T(simp) vs T(simp)T(simp): /AG/ vs /AGG/
+ok($Collator->lt("\x{1161}\x{11A8}", "\x{1161}\x{11A8}\x{11A8}"));
+ok($hangul ->lt("\x{1161}\x{11A8}", "\x{1161}\x{11A8}\x{11A8}"));
+ok($trailwt ->lt("\x{1161}\x{11A8}", "\x{1161}\x{11A8}\x{11A8}"));
+
+# LV vs LLV: /GA/ vs /GNA/
+ok($Collator->gt("\x{1100}\x{1161}", "\x{1100}\x{1102}\x{1161}"));
+ok($hangul ->lt("\x{1100}\x{1161}", "\x{1100}\x{1102}\x{1161}"));
+ok($trailwt ->lt("\x{1100}\x{1161}", "\x{1100}\x{1102}\x{1161}"));
+
+# LVX vs LVV: /GAA/ vs /GA/.latinA
+ok($Collator->gt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}A"));
+ok($hangul ->gt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}A"));
+ok($trailwt ->gt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}A"));
+
+# LVX vs LVV: /GAA/ vs /GA/.hiraganaA
+ok($Collator->lt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}\x{3042}"));
+ok($hangul ->gt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}\x{3042}"));
+ok($trailwt ->gt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}\x{3042}"));
+
+# LVX vs LVV: /GAA/ vs /GA/.hanja
+ok($Collator->lt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}\x{4E00}"));
+ok($hangul ->gt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}\x{4E00}"));
+ok($trailwt ->gt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}\x{4E00}"));
+
+# LVL vs LVT: /GA/./G/ vs /GAG/
+ok($Collator->lt("\x{1100}\x{1161}\x{1100}", "\x{1100}\x{1161}\x{11A8}"));
+ok($hangul ->lt("\x{1100}\x{1161}\x{1100}", "\x{1100}\x{1161}\x{11A8}"));
+ok($trailwt ->lt("\x{1100}\x{1161}\x{1100}", "\x{1100}\x{1161}\x{11A8}"));
+
+# LVT vs LVX: /GAG/ vs /GA/.latinA
+ok($Collator->gt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}A"));
+ok($hangul ->gt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}A"));
+ok($trailwt ->gt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}A"));
+
+# LVT vs LVX: /GAG/ vs /GA/.hiraganaA
+ok($Collator->lt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{3042}"));
+ok($hangul ->gt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{3042}"));
+ok($trailwt ->gt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{3042}"));
+
+# LVT vs LVX: /GAG/ vs /GA/.hanja
+ok($Collator->lt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{4E00}"));
+ok($hangul ->gt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{4E00}"));
+ok($trailwt ->gt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{4E00}"));
+
+# LVT vs LVV: /GAG/ vs /GAA/
+ok($Collator->gt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{1161}"));
+ok($hangul ->lt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{1161}"));
+ok($trailwt ->lt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{1161}"));
+
+# LVL vs LVV: /GA/./G/ vs /GAA/
+ok($Collator->lt("\x{1100}\x{1161}\x{1100}", "\x{1100}\x{1161}\x{1161}"));
+ok($hangul ->lt("\x{1100}\x{1161}\x{1100}", "\x{1100}\x{1161}\x{1161}"));
+ok($trailwt ->lt("\x{1100}\x{1161}\x{1100}", "\x{1100}\x{1161}\x{1161}"));
+
+# LV vs Syl(LV): /GA/ vs /[GA]/
+ok($Collator->eq("\x{1100}\x{1161}", "\x{AC00}"));
+ok($hangul ->eq("\x{1100}\x{1161}", "\x{AC00}"));
+ok($trailwt ->eq("\x{1100}\x{1161}", "\x{AC00}"));
+
+# LVT vs Syl(LV)T: /GAG/ vs /[GA]G/
+ok($Collator->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC00}\x{11A8}"));
+ok($hangul ->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC00}\x{11A8}"));
+ok($trailwt ->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC00}\x{11A8}"));
+
+# LVT vs Syl(LVT): /GAG/ vs /[GAG]/
+ok($Collator->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC01}"));
+ok($hangul ->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC01}"));
+ok($trailwt ->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC01}"));
+
+# LVTT vs Syl(LVTT): /GAGG/ vs /[GAGG]/
+ok($Collator->eq("\x{1100}\x{1161}\x{11A9}", "\x{AC02}"));
+ok($hangul ->eq("\x{1100}\x{1161}\x{11A9}", "\x{AC02}"));
+ok($trailwt ->eq("\x{1100}\x{1161}\x{11A9}", "\x{AC02}"));
+
+# LVTT vs Syl(LVT).T: /GAGG/ vs /[GAG]G/
+ok($Collator->gt("\x{1100}\x{1161}\x{11A9}", "\x{AC01}\x{11A8}"));
+ok($hangul ->eq("\x{1100}\x{1161}\x{11A9}", "\x{AC01}\x{11A8}"));
+ok($trailwt ->eq("\x{1100}\x{1161}\x{11A9}", "\x{AC01}\x{11A8}"));
+
+# LLVT vs L.Syl(LVT): /GGAG/ vs /G[GAG]/
+ok($Collator->gt("\x{1101}\x{1161}\x{11A8}", "\x{1100}\x{AC01}"));
+ok($hangul ->eq("\x{1101}\x{1161}\x{11A8}", "\x{1100}\x{AC01}"));
+ok($trailwt ->eq("\x{1101}\x{1161}\x{11A8}", "\x{1100}\x{AC01}"));
+
+#########################
+
+# checks contraction in LVT:
+# weights of these contractions may be non-sense.
+
+my $hangcont = Unicode::Collate->new(
+ level => 3,
+ table => undef,
+ normalization => undef,
+ entry => <<'ENTRIES',
+1100 ; [.1831.0020.0002] # HANGUL CHOSEONG KIYEOK
+1101 ; [.1832.0020.0002] # HANGUL CHOSEONG SSANGKIYEOK
+1161 ; [.188D.0020.0002] # HANGUL JUNGSEONG A
+1162 ; [.188E.0020.0002] # HANGUL JUNGSEONG AE
+1163 ; [.188F.0020.0002] # HANGUL JUNGSEONG YA
+11A8 ; [.18CF.0020.0002] # HANGUL JONGSEONG KIYEOK
+11A9 ; [.18D0.0020.0002] # HANGUL JONGSEONG SSANGKIYEOK
+1161 11A9 ; [.0000.0000.0000] # A-GG <contraction>
+1100 1163 11A8 ; [.1000.0020.0002] # G-YA-G <contraction> eq. U+AC39
+ENTRIES
+);
+
+# contracted into VT
+ok($Collator->lt("\x{1101}", "\x{1101}\x{1161}\x{11A9}"));
+ok($hangcont->eq("\x{1101}", "\x{1101}\x{1161}\x{11A9}"));
+
+# not contracted into LVT but into VT
+ok($Collator->lt("\x{1100}", "\x{1100}\x{1161}\x{11A9}"));
+ok($hangcont->eq("\x{1100}", "\x{1100}\x{1161}\x{11A9}"));
+
+# contracted into LVT
+ok($Collator->gt("\x{1100}\x{1163}\x{11A8}", "\x{1100}"));
+ok($hangcont->lt("\x{1100}\x{1163}\x{11A8}", "\x{1100}"));
+
+# LVTT vs Syl(LVTT): /GAGG/ vs /[GAGG]/
+ok($Collator->eq("\x{1100}\x{1161}\x{11A9}", "\x{AC02}"));
+ok($hangcont->eq("\x{1100}\x{1161}\x{11A9}", "\x{AC02}"));
+
+# LVT vs Syl(LVT): /GYAG/ vs /[GYAG]/
+ok($Collator->eq("\x{1100}\x{1163}\x{11A8}", "\x{AC39}"));
+ok($hangcont->eq("\x{1100}\x{1163}\x{11A8}", "\x{AC39}"));
+
+1;
+__END__