path: root/ext/Encode/t
diff options
Diffstat (limited to 'ext/Encode/t')
2 files changed, 394 insertions, 0 deletions
diff --git a/ext/Encode/t/Encode.t b/ext/Encode/t/Encode.t
new file mode 100644
index 0000000000..3f5d206685
--- /dev/null
+++ b/ext/Encode/t/Encode.t
@@ -0,0 +1,140 @@
+# Core-test preamble.  It has to run at compile time: the `use Encode`
+# further down is resolved while the file is still being compiled, so the
+# library path and the "not built" skip must already be in effect by then.
+BEGIN {
+    chdir 't' if -d 't';
+    @INC = '../lib';
+    require Config; import Config;
+    # Print a TAP skip plan and stop when Encode is not a built extension.
+    if ($Config{'extensions'} !~ /\bEncode\b/) {
+        print "1..0 # Skip: Encode was not built\n";
+        exit 0;
+    }
+}
+use Test;
+use Encode qw(from_to encode decode
+ encode_utf8 decode_utf8
+ find_encoding is_utf8);
+use charnames qw(greek);
+my @encodings = grep(/iso-?8859/,Encode::encodings());  # every listed name matching iso8859 / iso-8859
+my $n = 2;  # checks made per encoding in the load/round-trip loop
+my @character_set = ('0'..'9', 'A'..'Z', 'a'..'z');  # the 62 alphanumerics used for cross-mapping
+my @source = qw(ascii iso8859-1 cp1250);  # ASCII-based sets (swapped with @destiny on EBCDIC hosts)
+my @destiny = qw(cp1047 cp37 posix-bc);  # EBCDIC sets (swapped with @source on EBCDIC hosts)
+my @ebcdic_sets = qw(cp1047 cp37 posix-bc);  # targets of the full 0..255 round trip
+plan test => 38+$n*@encodings + 2*@source*@destiny*@character_set + 2*@ebcdic_sets*256 + 6;  # 44 fixed checks (38 + 6 is_utf8) + per-encoding + cross-mapping + byte round trips
+my $str = join('',map(chr($_),0x20..0x7E));  # printable ASCII, 0x20 through 0x7E
+my $cpy = $str;
+ok(length($str),from_to($cpy,'iso8859-1','Unicode'),"Length Wrong");  # from_to's return value is compared with the source length
+ok($cpy,$str,"ASCII mangled by translating from iso8859-1 to Unicode");  # $cpy was converted in place and must still equal $str
+$cpy = $str;  # fresh copy for the opposite direction
+ok(from_to($cpy,'Unicode','iso8859-1'),length($str),"Length wrong");
+ok($cpy,$str,"ASCII mangled by translating from Unicode to iso8859-1");
+$str = join('',map(chr($_),0xa0..0xff));  # the upper range 0xA0 through 0xFF
+$cpy = $str;
+ok(length($str),from_to($cpy,'iso8859-1','Unicode'),"Length Wrong");
+my $sym = Encode->getEncoding('symbol');  # encoding object for the 'symbol' table
+my $uni = $sym->decode(encode(ascii => 'a'));  # the byte for 'a', read through the symbol table
+ok("\N{alpha}",substr($uni,0,1),"alpha does not map to symbol 'a'");  # \N{alpha} resolves through `use charnames qw(greek)`
+$str = $sym->encode("\N{Beta}");  # and back: capital Beta written through the symbol table
+ok("B",decode(ascii => substr($str,0,1)),"Symbol 'B' does not map to Beta");
+# Every named table must load, and must carry printable ASCII to Unicode
+# and back unchanged.  $str, $uni and $cpy are the file-level scratch
+# variables declared earlier and are deliberately reused here.
+for my $name ('symbol', 'dingbats', 'ascii', @encodings) {
+    my $codec = Encode->getEncoding($name);
+    ok(1, defined($codec), "Could not load $name");
+    $str = join '', map { chr } 0x20 .. 0x7E;
+    $uni = $codec->decode($str);
+    $cpy = $codec->encode($uni);
+    ok($cpy, $str, "$name mangled translating to Unicode and back");
+}
+# On ASCII based machines see if we can map several codepoints from
+# three distinct ASCII sets to three distinct EBCDIC coded character sets.
+# On EBCDIC machines see if we can map from three EBCDIC sets to three
+# distinct ASCII sets.
+# Code points the alphanumerics are expected to have after conversion, in
+# @character_set order (digits, upper case, lower case).  The default list
+# is for an ASCII host converting into the sets in @destiny.
+my @expectation = (240..249, 193..201,209..217,226..233, 129..137,145..153,162..169);
+# On a host whose native 'A' is not 65 the direction is reversed: swap the
+# source and destination sets and expect the ASCII code points instead.
+# The closing brace of this block must precede the loops, otherwise they
+# would only ever run on such hosts (and the file would not parse).
+if (ord('A') != 65) {
+    my @temp = @destiny;
+    @destiny = @source;
+    @source = @temp;
+    undef(@temp);
+    @expectation = (48..57, 65..90, 97..122);
+}
+foreach my $to (@destiny)
+  {
+    foreach my $from (@source)
+      {
+        # Fresh copy per pair: one expected value is shifted off per character.
+        my @expected = @expectation;
+        foreach my $chr (@character_set)
+          {
+            my $native_chr = $chr;
+            my $cpy = $chr;
+            my $rc = from_to($cpy,$from,$to);
+            ok(1,$rc,"Could not translate from $from to $to");
+            ok(ord($cpy),shift(@expected),"mangled translating $native_chr from $from to $to");
+          }
+      }
+  }
+# On either ASCII or EBCDIC machines ensure we can take the full one
+# byte repertoire to EBCDIC sets and back.
+my $enc_as = 'iso8859-1';
+for my $ebcdic (@ebcdic_sets) {
+    for my $code (0 .. 255) {
+        # Send one byte out to the EBCDIC set and straight back again; the
+        # file-level $str is converted in place by both calls.
+        $str = chr($code);
+        my $status = from_to($str, $enc_as, $ebcdic);
+        $status += from_to($str, $ebcdic, $enc_as);
+        ok($status, 2, "return code for $code $ebcdic -> $enc_as -> $ebcdic was not obtained");
+        ok($code, ord($str), "$enc_as mangled translating $code to $ebcdic and back");
+    }
+}
+# find_encoding() is expected to accept the hyphenated (MIME-style) name,
+# the unhyphenated (X11-style) name and a name containing a space, and to
+# return the same result for all three.
+my $mime = find_encoding('iso-8859-2');
+ok(defined($mime),1,"Cannot find MIME-ish 'iso-8859-2'");
+my $x11 = find_encoding('iso8859-2');
+ok(defined($x11),1,"Cannot find X11-ish 'iso8859-2'");
+ok($mime,$x11,"iso8859-2 and iso-8859-2 not same");
+my $spc = find_encoding('iso 8859-2');
+ok(defined($spc),1,"Cannot find 'iso 8859-2'");
+ok($spc,$mime,"iso 8859-2 and iso-8859-2 not same");
+for my $i (256,128,129,256)  # one code point above 0xFF, two in 0x80..0xFF, then 256 again
+ {
+ my $c = chr($i);
+ my $s = "$c\n".sprintf("%02X",$i);  # the character, a newline, then its hex value, built by interpolation and concatenation
+ ok(utf8::valid($s),1,"concat of $i botched");  # the concatenated string must be internally valid
+ utf8::upgrade($s);
+ ok(utf8::valid($s),1,"concat of $i botched");  # and still valid after utf8::upgrade
+ }
+# Spot check a few points in/out of utf8
+foreach my $code (0x41, 128, 256, 0x20AC) {
+    # encode_utf8/decode_utf8 must be inverses, and must agree with the
+    # generic encode/decode called with the name 'utf8'.
+    my $char   = chr($code);
+    my $octets = encode_utf8($char);
+    ok(decode_utf8($octets), $char, "decode_utf8 not inverse of encode_utf8 for $code");
+    ok(encode('utf8', $char), $octets, "utf8 encode by name broken for $code");
+    ok(decode('utf8', $octets), $char, "utf8 decode by name broken for $code");
+}
+# is_utf8
+ok( is_utf8("\x{100}"));  # a string holding code point 0x100 must report true
+ok(! is_utf8("a"));  # a plain one-character ASCII literal must report false
+ok(! is_utf8(""));  # so must an empty literal
+"\x{100}" =~ /(.)/;  # capture the character into $1 for the next two checks
+ok( is_utf8($1)); # the capture itself must report true (ID 20011127.151)
+$a = $1;  # copy of the capture (package variable $a)
+ok( is_utf8($a));  # the copy must report true as well
+$a = "\x{100}";
+chop $a;  # removes the only character, leaving an empty string
+ok( is_utf8($a)); # weird but true: an empty UTF-8 string
diff --git a/ext/Encode/t/Tcl.t b/ext/Encode/t/Tcl.t
new file mode 100644
index 0000000000..950f658f90
--- /dev/null
+++ b/ext/Encode/t/Tcl.t
@@ -0,0 +1,254 @@
+# Core-test preamble.  It has to run at compile time: the `use Encode` and
+# `use Encode::Tcl` lines below are resolved while the file is still being
+# compiled, so the "not built" skip must already have been evaluated.
+BEGIN {
+    chdir 't' if -d 't';
+#   @INC = '../lib';
+    require Config; import Config;
+    # Print a TAP skip plan and stop when Encode is not a built extension.
+    if ($Config{'extensions'} !~ /\bEncode\b/) {
+        print "1..0 # Skip: Encode was not built\n";
+        exit 0;
+    }
+}
+use Test;
+use Encode qw(encode decode);
+use Encode::Tcl;
+my @encodings = qw(euc-cn euc-jp euc-kr big5 shiftjis); # CJK
+my $n = 2;
+# Two-byte codes of Greek letters in each encoding; the 'utf8' row holds the
+# Unicode code points they are compared against.  Rows are indexed in
+# parallel with @greek; the test loop walks 0..$#greek, i.e. the first 24
+# entries of each row.
+my %greek = (
+    'euc-cn'   => [0xA6A1..0xA6B8,0xA6C1..0xA6D8],
+    'euc-jp'   => [0xA6A1..0xA6B8,0xA6C1..0xA6D8],
+    'euc-kr'   => [0xA5C1..0xA5D8,0xA5E1..0xA5F8],
+    'big5'     => [0xA344..0xA35B,0xA35C..0xA373],
+    'shiftjis' => [0x839F..0x83B6,0x83BF..0x83D6],
+    'utf8'     => [0x0391..0x03A1,0x03A3..0x03A9,0x03B1..0x03C1,0x03C3..0x03C9],
+);
+# Letter names, used only in diagnostic messages.
+my @greek = qw(
+    alpha beta gamma delta epsilon zeta eta
+    theta iota kappa lambda mu nu xi omicron
+    pi rho sigma tau upsilon phi chi psi omega
+);
+# Hex strings: the encoded bytes per encoding, and in the 'utf8' row the
+# Unicode code point (read with hex() by the test loop).
+my %ideodigit = ( # cjk ideograph 'one' to 'ten'
+    'euc-cn'   => [qw(d2bb b6fe c8fd cbc4 cee5 c1f9 c6df b0cb bec5 caae)],
+    'euc-jp'   => [qw(b0ec c6f3 bbb0 bbcd b8de cfbb bcb7 c8ac b6e5 bdbd)],
+    'euc-kr'   => [qw(ece9 eca3 dfb2 decc e7e9 d7bf f6d2 f8a2 cefa e4a8)],
+    'big5'     => [qw(a440 a447 a454 a57c a4ad a4bb a443 a44b a445 a451)],
+    'shiftjis' => [qw(88ea 93f1 8e4f 8e6c 8cdc 985a 8eb5 94aa 8be3 8f5c)],
+    'utf8'     => [qw(4e00 4e8c 4e09 56db 4e94 516d 4e03 516b 4e5d 5341)],
+);
+# Names used only in diagnostic messages.
+my @ideodigit = qw(one two three four five six seven eight nine ten);
+my $jis = '7bit-jis';
+my $kr = '2022-kr';
+# Per encoding: a hash whose keys are hex dumps of the encoded bytes and
+# whose values are hex dumps of the expected 16-bit code points.  Each qw()
+# list is therefore read as key, value, key, value, ...
+my %esc_str;
+$esc_str{$jis} = {qw(
+    1b24422422242424262428242a1b2842
+    3042304430463048304a
+    1b284931323334355d1b2842
+    ff71ff72ff73ff74ff75ff9d
+    1b2442467c4b5c1b2842
+    65e5672c
+    3132331b244234413b7a1b28425065726c
+    0031003200336f225b57005000650072006c
+    546573740a1b24422546253925481b28420a
+    0054006500730074000a30c630b930c8000a
+)};
+$esc_str{$kr} = {qw(
+    1b2429430e2a22213e0f410d0a
+    304200b10041000d000a
+    1b2429430e3021332a34593673383639593b673e46405a0f0d0a
+    ac00b098b2e4b77cb9c8bc14c0acc544c790000d000a
+    1b2429434142430d0a
+    004100420043000d000a
+)};
+# Test count for the escape-sequence loop: $n per encoding (load check and
+# concatenated check) plus $n per string pair (one check in each direction).
+my $num_esc = $n * keys(%esc_str);
+foreach (values %esc_str){ $num_esc += $n * keys %$_ }
+my $FS_preserves_case = 1; # Unix e.g.
+if ($^O eq 'VMS') { # || $^O eq ...
+    $FS_preserves_case = 0;
+}
+# The encoding is requested as 'HZ' unless the platform is flagged as not
+# preserving case, in which case the lower-case name is used.
+# NOTE(review): presumably the name ends up as part of a file name - confirm.
+my $hz = 'HZ'; # HanZi
+if (!$FS_preserves_case) {
+    $hz = 'hz'; # HanZi
+}
+# Three HZ spellings that must all decode to $hz_exp; the first one is also
+# the expected result of encoding $hz_exp.
+my @hz_txt = (
+    "~~in GB.~{<:Ky2;S{#,NpJ)l6HK!#~}Bye.~~",
+    "~~in GB.~{<:Ky2;S{#,~}~\cJ~{NpJ)l6HK!#~}Bye.~~",
+    "~~in GB.~\cJ~{<:Ky2;S{#,NpJ)l6HK!#~}~\cJBye.~~",
+);
+# Expected code points as a hex dump of big-endian 16-bit values.
+my $hz_exp = '007e0069006e002000470042002e5df162404e0d6b32'
+    . 'ff0c52ff65bd65bc4eba3002004200790065002e007e';
+use constant BUFSIZ => 64; # for test
+use constant hiragana => "\x{3042}\x{3044}\x{3046}\x{3048}\x{304A}";
+use constant han_kana => "\x{FF71}\x{FF72}\x{FF73}\x{FF74}\x{FF75}";
+use constant macron => "\x{0100}\x{0112}\x{012a}\x{014c}\x{016a}";
+use constant TAIL => 'bbb';
+use constant YES => 1;
+# Fixtures for the buffered-decode check.  Each row gives the encoding
+# name, the expected decoded text and the encoded byte string.
+my @ary_buff = ( # [ encoding, decoded, encoded ]
+# type-M
+    ["euc-cn", hiragana, "\xA4\xA2\xA4\xA4\xA4\xA6\xA4\xA8\xA4\xAA" ],
+    ["euc-jp", hiragana, "\xA4\xA2\xA4\xA4\xA4\xA6\xA4\xA8\xA4\xAA" ],
+    ["euc-jp", han_kana, "\x8E\xB1\x8E\xB2\x8E\xB3\x8E\xB4\x8E\xB5" ],
+    ["euc-kr", hiragana, "\xAA\xA2\xAA\xA4\xAA\xA6\xAA\xA8\xAA\xAA" ],
+    ["shiftjis", hiragana, "\x82\xA0\x82\xA2\x82\xA4\x82\xA6\x82\xA8" ],
+    ["shiftjis", han_kana, "\xB1\xB2\xB3\xB4\xB5" ],
+# type-E
+    ["2022-cn", hiragana, "\e\$)A\cN". '$"$$$&$($*' . "\cO" ],
+    ["2022-jp", hiragana, "\e\$B".'$"$$$&$($*'."\e(B" ],
+    ["2022-kr", hiragana, "\e\$)C\cN". '*"*$*&*(**' . "\cO" ],
+    [ $jis, han_kana, "\e\(I".'12345'."\e(B" ],
+    ["2022-jp1", macron, "\e\$(D\x2A\x27\x2A\x37\x2A\x45\x2A\x57\x2A\x69\e(B"],
+    ["2022-jp2", "\x{C0}" . macron . "\x{C1}",
+        "\e\$(D\e.A\eN\x40\x2A\x27\x2A\x37\x2A\x45\x2A\x57\x2A\x69\e(B\eN\x41"],
+# type-X
+    ["euc-jp-0212", hiragana, "\xA4\xA2\xA4\xA4\xA4\xA6\xA4\xA8\xA4\xAA" ],
+    ["euc-jp-0212", han_kana, "\x8E\xB1\x8E\xB2\x8E\xB3\x8E\xB4\x8E\xB5" ],
+    ["euc-jp-0212", macron,
+        "\x8F\xAA\xA7\x8F\xAA\xB7\x8F\xAA\xC5\x8F\xAA\xD7\x8F\xAA\xE9" ],
+# type-H
+    [ $hz, hiragana, "~{". '$"$$$&$($*' . "~}" ],
+    [ $hz, hiragana, "~{". '$"$$' ."~\cJ". '$&$($*' . "~}" ],
+);
+# Plan, term by term: load + ASCII round trip per encoding; two checks per
+# Greek letter and per ideograph digit per encoding; the escape-sequence
+# checks; the two fixed HZ checks; one check per HZ text; one per
+# @ary_buff row.
+plan test => $n*@encodings + $n*@encodings*@greek
+    + $n*@encodings*@ideodigit + $num_esc + $n + @hz_txt + @ary_buff;
+# Each encoding must load, and must carry printable ASCII to Unicode and
+# back unchanged.
+for my $name (@encodings) {
+    my $codec = Encode->getEncoding($name);
+    ok(1, defined($codec), "Could not load $name");
+    my $ascii      = join '', map { chr } 0x20 .. 0x7E;
+    my $unicode    = $codec->decode($ascii);
+    my $round_trip = $codec->encode($unicode);
+    ok($round_trip, $ascii, "$name mangled translating to Unicode and back");
+}
+# For every Greek letter: the two-byte native code must decode to the
+# expected Unicode code point, and that code point must encode back to the
+# same native code.
+for my $name (@encodings) {
+    my $codec = Encode->getEncoding($name);
+    for my $idx (0 .. $#greek) {
+        my $letter = $greek[$idx];
+        my $native = $greek{$name}[$idx];
+
+        my $decoded = $codec->decode(pack('n', $native));
+        my $uni     = unpack('U', $decoded);
+        ok($uni, $greek{'utf8'}[$idx],
+            "$name mangled translating to Unicode GREEK $letter");
+
+        my $encoded = $codec->encode(pack('U', $uni));
+        my $cpy     = unpack('n', $encoded);
+        ok($cpy, $native,
+            "$name mangled translating from Unicode GREEK $letter");
+    }
+}
+# For every ideograph digit: the hex-encoded native bytes must decode to the
+# expected Unicode code point, and that code point must encode back to the
+# same bytes (compared as lower-case hex).
+for my $name (@encodings) {
+    my $codec = Encode->getEncoding($name);
+    for my $idx (0 .. $#ideodigit) {
+        my $digit      = $ideodigit[$idx];
+        my $native_hex = $ideodigit{$name}[$idx];
+
+        my $decoded = $codec->decode(pack('H*', $native_hex));
+        my $uni     = unpack('U', $decoded);
+        ok($uni, hex($ideodigit{'utf8'}[$idx]),
+            "$name mangled translating to Unicode CJK IDEOGRAPH $digit");
+
+        my $encoded = $codec->encode(pack('U', $uni));
+        my $cpy     = lc(unpack('H*', $encoded));
+        ok($cpy, $native_hex,
+            "$name mangled translating from Unicode CJK IDEOGRAPH $digit");
+    }
+}
+# to_unicode($enc, @hex): join the hex strings, turn them into bytes,
+# decode those bytes from $enc, and return the resulting code points as a
+# hex dump of big-endian 16-bit values.
+sub to_unicode {
+    my ($enc, @hex_chunks) = @_;
+    my $octets      = pack('H*', join('', @hex_chunks));
+    my @code_points = unpack('U*', decode($enc, $octets));
+    return unpack('H*', pack('n*', @code_points));
+}
+# from_unicode($enc, @hex): join the hex strings, read them as big-endian
+# 16-bit code points, encode the resulting text into $enc, and return the
+# encoded bytes as a hex dump.
+sub from_unicode {
+    my ($enc, @hex_chunks) = @_;
+    my @code_points = unpack('n*', pack('H*', join('', @hex_chunks)));
+    my $octets      = encode($enc, pack('U*', @code_points));
+    return unpack('H*', $octets);
+}
+# Escape-sequence encodings: each encoded sample must decode to its expected
+# code points and encode back to the same bytes; finally all samples joined
+# together must decode to all expected values joined together.
+for my $name (sort keys %esc_str) {
+    my $codec = Encode->getEncoding($name);
+    ok(1, defined($codec), "Could not load $name");
+    my %expected_for = %{ $esc_str{$name} };
+    for my $escaped (sort keys %expected_for) {
+        my $ucs2 = to_unicode($name, $escaped);
+        ok($ucs2, $expected_for{$escaped},
+            "$name mangled translating to Unicode");
+        ok(from_unicode($name, $ucs2), $escaped,
+            "$name mangled translating from Unicode");
+    }
+    # keys and values of the same unmodified hash come back in matching order.
+    ok(to_unicode($name, keys %expected_for), join('', values %expected_for),
+        "$name mangled translating to Unicode");
+}
+# HZ text -> hex dump of the decoded code points as big-endian 16-bit values.
+my $hz_to_unicode = sub {
+    my ($hz_text) = @_;
+    my @code_points = unpack('U*', decode($hz, $hz_text));
+    return unpack('H*', pack('n*', @code_points));
+};
+# Hex dump of big-endian 16-bit code points -> text encoded with $hz.
+my $hz_from_unicode = sub {
+    my ($ucs2_hex) = @_;
+    my @code_points = unpack('n*', pack('H*', $ucs2_hex));
+    return encode($hz, pack('U*', @code_points));
+};
+# HZ: the encoding must load, $hz_exp must encode to the first sample text,
+# and every sample text must decode to $hz_exp.
+for my $name ($hz) {
+    my $codec = Encode->getEncoding($name);
+    ok(1, defined($codec), "Could not load $name");
+    ok($hz_from_unicode->($hz_exp), $hz_txt[0],
+        "$name mangled translating from Unicode");
+    for my $text (@hz_txt) {
+        ok($hz_to_unicode->($text), $hz_exp,
+            "$name mangled translating to Unicode");
+    }
+}
+# Buffered-decode check, one test per @ary_buff row.  The sample is placed
+# after a run of "a" whose length sweeps from BUFSIZ/2 to 2*BUFSIZ+4, so
+# that it starts at many different offsets relative to the BUFSIZ-sized
+# pieces buff_decode() reads.  $NG counts the offsets at which the decoded
+# result differs from the expected text.
+for my $ary (@ary_buff) {
+    my $NG = 0;
+    my $enc = $ary->[0];
+    # $pad is a loop-local name so the file-level $n is not shadowed.
+    for my $pad ( int(BUFSIZ/2) .. 2*BUFSIZ+4 ){
+        my $dst = "a"x$pad. $ary->[1] . TAIL;
+        my $src = "a"x$pad. $ary->[2] . TAIL;
+        my $utf = buff_decode($enc, $src);
+        $NG++ unless $dst eq $utf;
+    }
+    ok($NG, 0, "$enc mangled translating to Unicode");
+}
+sub buff_decode {
+ my($enc, $str) = @_;
+ my $utf8 = '';
+ my $inconv = '';
+ while(length $str){
+ my $buff = $inconv.substr($str,0,BUFSIZ - length $inconv,'');
+ my $decoded = decode($enc, $buff, YES);
+ if(length $decoded){
+ $utf8 .= $decoded;
+ $inconv = $buff;
+ } else {
+ last; # malformed?
+ }
+ }
+ return $utf8;