diff options
Diffstat (limited to 'ext/Encode/t')
-rw-r--r-- | ext/Encode/t/Encode.t | 140 | ||||
-rw-r--r-- | ext/Encode/t/Tcl.t | 254 |
2 files changed, 394 insertions, 0 deletions
diff --git a/ext/Encode/t/Encode.t b/ext/Encode/t/Encode.t new file mode 100644 index 0000000000..3f5d206685 --- /dev/null +++ b/ext/Encode/t/Encode.t @@ -0,0 +1,140 @@ +BEGIN { + chdir 't' if -d 't'; + @INC = '../lib'; + require Config; import Config; + if ($Config{'extensions'} !~ /\bEncode\b/) { + print "1..0 # Skip: Encode was not built\n"; + exit 0; + } +} +use Test; +use Encode qw(from_to encode decode + encode_utf8 decode_utf8 + find_encoding is_utf8); +use charnames qw(greek); +my @encodings = grep(/iso-?8859/,Encode::encodings()); +my $n = 2; +my @character_set = ('0'..'9', 'A'..'Z', 'a'..'z'); +my @source = qw(ascii iso8859-1 cp1250); +my @destiny = qw(cp1047 cp37 posix-bc); +my @ebcdic_sets = qw(cp1047 cp37 posix-bc); +plan test => 38+$n*@encodings + 2*@source*@destiny*@character_set + 2*@ebcdic_sets*256 + 6; +my $str = join('',map(chr($_),0x20..0x7E)); +my $cpy = $str; +ok(length($str),from_to($cpy,'iso8859-1','Unicode'),"Length Wrong"); +ok($cpy,$str,"ASCII mangled by translating from iso8859-1 to Unicode"); +$cpy = $str; +ok(from_to($cpy,'Unicode','iso8859-1'),length($str),"Length wrong"); +ok($cpy,$str,"ASCII mangled by translating from Unicode to iso8859-1"); + +$str = join('',map(chr($_),0xa0..0xff)); +$cpy = $str; +ok(length($str),from_to($cpy,'iso8859-1','Unicode'),"Length Wrong"); + +my $sym = Encode->getEncoding('symbol'); +my $uni = $sym->decode(encode(ascii => 'a')); +ok("\N{alpha}",substr($uni,0,1),"alpha does not map to symbol 'a'"); +$str = $sym->encode("\N{Beta}"); +ok("B",decode(ascii => substr($str,0,1)),"Symbol 'B' does not map to Beta"); + +foreach my $enc (qw(symbol dingbats ascii),@encodings) + { + my $tab = Encode->getEncoding($enc); + ok(1,defined($tab),"Could not load $enc"); + $str = join('',map(chr($_),0x20..0x7E)); + $uni = $tab->decode($str); + $cpy = $tab->encode($uni); + ok($cpy,$str,"$enc mangled translating to Unicode and back"); + } + +# On ASCII based machines see if we can map several codepoints from +# three distinct ASCII sets to three distinct EBCDIC coded character sets. +# On EBCDIC machines see if we can map from three EBCDIC sets to three +# distinct ASCII sets. + +my @expectation = (240..249, 193..201,209..217,226..233, 129..137,145..153,162..169); +if (ord('A') != 65) { + my @temp = @destiny; + @destiny = @source; + @source = @temp; + undef(@temp); + @expectation = (48..57, 65..90, 97..122); +} + +foreach my $to (@destiny) + { + foreach my $from (@source) + { + my @expected = @expectation; + foreach my $chr (@character_set) + { + my $native_chr = $chr; + my $cpy = $chr; + my $rc = from_to($cpy,$from,$to); + ok(1,$rc,"Could not translate from $from to $to"); + ok(ord($cpy),shift(@expected),"mangled translating $native_chr from $from to $to"); + } + } + } + +# On either ASCII or EBCDIC machines ensure we can take the full one +# byte repetoire to EBCDIC sets and back. + +my $enc_as = 'iso8859-1'; +foreach my $enc_eb (@ebcdic_sets) + { + foreach my $ord (0..255) + { + $str = chr($ord); + my $rc = from_to($str,$enc_as,$enc_eb); + $rc += from_to($str,$enc_eb,$enc_as); + ok($rc,2,"return code for $ord $enc_eb -> $enc_as -> $enc_eb was not obtained"); + ok($ord,ord($str),"$enc_as mangled translating $ord to $enc_eb and back"); + } + } + +my $mime = find_encoding('iso-8859-2'); +ok(defined($mime),1,"Cannot find MIME-ish'iso-8859-2'"); +my $x11 = find_encoding('iso8859-2'); +ok(defined($x11),1,"Cannot find X11-ish 'iso8859-2'"); +ok($mime,$x11,"iso8598-2 and iso-8859-2 not same"); +my $spc = find_encoding('iso 8859-2'); +ok(defined($spc),1,"Cannot find 'iso 8859-2'"); +ok($spc,$mime,"iso 8859-2 and iso-8859-2 not same"); + +for my $i (256,128,129,256) + { + my $c = chr($i); + my $s = "$c\n".sprintf("%02X",$i); + ok(utf8::valid($s),1,"concat of $i botched"); + utf8::upgrade($s); + ok(utf8::valid($s),1,"concat of $i botched"); + } + +# Spot check a few points in/out of utf8 +for my $i (0x41,128,256,0x20AC) + { + my $c = chr($i); + my $o = encode_utf8($c); + ok(decode_utf8($o),$c,"decode_utf8 not inverse of encode_utf8 for $i"); + ok(encode('utf8',$c),$o,"utf8 encode by name broken for $i"); + ok(decode('utf8',$o),$c,"utf8 decode by name broken for $i"); + } + + +# is_utf8 + +ok( is_utf8("\x{100}")); +ok(! is_utf8("a")); +ok(! is_utf8("")); +"\x{100}" =~ /(.)/; +ok( is_utf8($1)); # ID 20011127.151 +$a = $1; +ok( is_utf8($a)); +$a = "\x{100}"; +chop $a; +ok( is_utf8($a)); # weird but true: an empty UTF-8 string + + + + diff --git a/ext/Encode/t/Tcl.t b/ext/Encode/t/Tcl.t new file mode 100644 index 0000000000..950f658f90 --- /dev/null +++ b/ext/Encode/t/Tcl.t @@ -0,0 +1,254 @@ +BEGIN { + chdir 't' if -d 't'; +# @INC = '../lib'; + require Config; import Config; + if ($Config{'extensions'} !~ /\bEncode\b/) { + print "1..0 # Skip: Encode was not built\n"; + exit 0; + } +} +use Test; +use Encode qw(encode decode); +use Encode::Tcl; + +my @encodings = qw(euc-cn euc-jp euc-kr big5 shiftjis); # CJK +my $n = 2; + +my %greek = ( + 'euc-cn' => [0xA6A1..0xA6B8,0xA6C1..0xA6D8], + 'euc-jp' => [0xA6A1..0xA6B8,0xA6C1..0xA6D8], + 'euc-kr' => [0xA5C1..0xA5D8,0xA5E1..0xA5F8], + 'big5' => [0xA344..0xA35B,0xA35C..0xA373], + 'shiftjis' => [0x839F..0x83B6,0x83BF..0x83D6], + 'utf8' => [0x0391..0x03A1,0x03A3..0x03A9,0x03B1..0x03C1,0x03C3..0x03C9], +); +my @greek = qw( + ALPHA BETA GAMMA DELTA EPSILON ZETA ETA + THETA IOTA KAPPA LAMBDA MU NU XI OMICRON + PI RHO SIGMA TAU UPSILON PHI CHI PSI OMEGA + alpha beta gamma delta epsilon zeta eta + theta iota kappa lambda mu nu xi omicron + pi rho sigma tau upsilon phi chi psi omega +); + +my %ideodigit = ( # cjk ideograph 'one' to 'ten' + 'euc-cn' => [qw(d2bb b6fe c8fd cbc4 cee5 c1f9 c6df b0cb bec5 caae)], + 'euc-jp' => [qw(b0ec c6f3 bbb0 bbcd b8de cfbb bcb7 c8ac b6e5 bdbd)], + 'euc-kr' => [qw(ece9 eca3 dfb2 decc e7e9 d7bf f6d2 f8a2 cefa e4a8)], + 'big5' => [qw(a440 a447 a454 a57c a4ad a4bb a443 a44b a445 a451)], + 'shiftjis' => [qw(88ea 93f1 8e4f 8e6c 8cdc 985a 8eb5 94aa 8be3 8f5c)], + 'utf8' => [qw(4e00 4e8c 4e09 56db 4e94 516d 4e03 516b 4e5d 5341)], +); +my @ideodigit = qw(one two three four five six seven eight nine ten); + +my $jis = '7bit-jis'; +my $kr = '2022-kr'; +my %esc_str; + +$esc_str{$jis} = {qw( + 1b24422422242424262428242a1b2842 + 3042304430463048304a + 1b284931323334355d1b2842 + ff71ff72ff73ff74ff75ff9d + 1b2442467c4b5c1b2842 + 65e5672c + 3132331b244234413b7a1b28425065726c + 0031003200336f225b57005000650072006c + 546573740a1b24422546253925481b28420a + 0054006500730074000a30c630b930c8000a +)}; + +$esc_str{$kr} = {qw( + 1b2429430e2a22213e0f410d0a + 304200b10041000d000a + 1b2429430e3021332a34593673383639593b673e46405a0f0d0a + ac00b098b2e4b77cb9c8bc14c0acc544c790000d000a + 1b2429434142430d0a + 004100420043000d000a +)}; + +my $num_esc = $n * keys(%esc_str); +foreach (values %esc_str){ $num_esc += $n * keys %$_ } + +my $FS_preserves_case = 1; # Unix e.g. +if ($^O eq 'VMS') { # || $^O eq ... + $FS_preserves_case = 0; +} +my $hz = 'HZ'; # HanZi +if (!$FS_preserves_case) { + $hz = 'hz'; # HanZi +} + +my @hz_txt = ( + "~~in GB.~{<:Ky2;S{#,NpJ)l6HK!#~}Bye.~~", + "~~in GB.~{<:Ky2;S{#,~}~\cJ~{NpJ)l6HK!#~}Bye.~~", + "~~in GB.~\cJ~{<:Ky2;S{#,NpJ)l6HK!#~}~\cJBye.~~", +); + +my $hz_exp = '007e0069006e002000470042002e5df162404e0d6b32' + . 'ff0c52ff65bd65bc4eba3002004200790065002e007e'; + +use constant BUFSIZ => 64; # for test +use constant hiragana => "\x{3042}\x{3044}\x{3046}\x{3048}\x{304A}"; +use constant han_kana => "\x{FF71}\x{FF72}\x{FF73}\x{FF74}\x{FF75}"; +use constant macron => "\x{0100}\x{0112}\x{012a}\x{014c}\x{016a}"; +use constant TAIL => 'bbb'; +use constant YES => 1; + +my @ary_buff = ( # [ encoding, decoded, encoded ] +# type-M + ["euc-cn", hiragana, "\xA4\xA2\xA4\xA4\xA4\xA6\xA4\xA8\xA4\xAA" ], + ["euc-jp", hiragana, "\xA4\xA2\xA4\xA4\xA4\xA6\xA4\xA8\xA4\xAA" ], + ["euc-jp", han_kana, "\x8E\xB1\x8E\xB2\x8E\xB3\x8E\xB4\x8E\xB5" ], + ["euc-kr", hiragana, "\xAA\xA2\xAA\xA4\xAA\xA6\xAA\xA8\xAA\xAA" ], + ["shiftjis", hiragana, "\x82\xA0\x82\xA2\x82\xA4\x82\xA6\x82\xA8" ], + ["shiftjis", han_kana, "\xB1\xB2\xB3\xB4\xB5" ], +# type-E + ["2022-cn", hiragana, "\e\$)A\cN". '$"$$$&$($*' . "\cO" ], + ["2022-jp", hiragana, "\e\$B".'$"$$$&$($*'."\e(B" ], + ["2022-kr", hiragana, "\e\$)C\cN". '*"*$*&*(**' . "\cO" ], + [ $jis, han_kana, "\e\(I".'12345'."\e(B" ], + ["2022-jp1", macron, "\e\$(D\x2A\x27\x2A\x37\x2A\x45\x2A\x57\x2A\x69\e(B"], + ["2022-jp2", "\x{C0}" . macron . "\x{C1}", + "\e\$(D\e.A\eN\x40\x2A\x27\x2A\x37\x2A\x45\x2A\x57\x2A\x69\e(B\eN\x41"], +# type-X + ["euc-jp-0212", hiragana, "\xA4\xA2\xA4\xA4\xA4\xA6\xA4\xA8\xA4\xAA" ], + ["euc-jp-0212", han_kana, "\x8E\xB1\x8E\xB2\x8E\xB3\x8E\xB4\x8E\xB5" ], + ["euc-jp-0212", macron, + "\x8F\xAA\xA7\x8F\xAA\xB7\x8F\xAA\xC5\x8F\xAA\xD7\x8F\xAA\xE9" ], +# type-H + [ $hz, hiragana, "~{". '$"$$$&$($*' . "~}" ], + [ $hz, hiragana, "~{". '$"$$' ."~\cJ". '$&$($*' . "~}" ], +); + +plan test => $n*@encodings + $n*@encodings*@greek + + $n*@encodings*@ideodigit + $num_esc + $n + @hz_txt + @ary_buff; + +foreach my $enc (@encodings) + { + my $tab = Encode->getEncoding($enc); + ok(1,defined($tab),"Could not load $enc"); + my $str = join('',map(chr($_),0x20..0x7E)); + my $uni = $tab->decode($str); + my $cpy = $tab->encode($uni); + ok($cpy,$str,"$enc mangled translating to Unicode and back"); + } + +foreach my $enc (@encodings) + { + my $tab = Encode->getEncoding($enc); + foreach my $gk (0..$#greek) + { + my $uni = unpack 'U', $tab->decode(pack 'n', $greek{$enc}[$gk]); + ok($uni,$greek{'utf8'}[$gk], + "$enc mangled translating to Unicode GREEK $greek[$gk]"); + my $cpy = unpack 'n',$tab->encode(pack 'U',$uni); + ok($cpy,$greek{$enc}[$gk], + "$enc mangled translating from Unicode GREEK $greek[$gk]"); + } + } + +foreach my $enc (@encodings) + { + my $tab = Encode->getEncoding($enc); + foreach my $id (0..$#ideodigit) + { + my $uni = unpack 'U',$tab->decode(pack 'H*', $ideodigit{$enc}[$id]); + ok($uni,hex($ideodigit{'utf8'}[$id]), + "$enc mangled translating to Unicode CJK IDEOGRAPH $ideodigit[$id]"); + my $cpy = lc unpack 'H*', $tab->encode(pack 'U',$uni); + ok($cpy,$ideodigit{$enc}[$id], + "$enc mangled translating from Unicode CJK IDEOGRAPH $ideodigit[$id]"); + } + } + +{ + sub to_unicode + { + my $enc = shift; + return unpack('H*', pack 'n*', unpack 'U*', + decode $enc, pack 'H*', join '', @_); + } + + sub from_unicode + { + my $enc = shift; + return unpack('H*', encode $enc, + pack 'U*', unpack 'n*', pack 'H*', join '', @_); + } + + foreach my $enc (sort keys %esc_str) + { + my $tab = Encode->getEncoding($enc); + ok(1,defined($tab),"Could not load $enc"); + my %strings = %{ $esc_str{$enc} }; + foreach my $estr (sort keys %strings) + { + my $ustr = to_unicode($enc, $estr); + ok($ustr, $strings{$estr}, + "$enc mangled translating to Unicode"); + ok(from_unicode($enc, $ustr), $estr, + "$enc mangled translating from Unicode"); + } + ok(to_unicode($enc, keys %strings), join('', values %strings), + "$enc mangled translating to Unicode"); + } +} + + +{ + my $hz_to_unicode = sub + { + return unpack('H*', pack 'n*', unpack 'U*', decode $hz, shift); + }; + + my $hz_from_unicode = sub + { + return encode($hz, pack 'U*', unpack 'n*', pack 'H*', shift); + }; + + foreach my $enc ($hz) + { + my $tab = Encode->getEncoding($enc); + ok(1,defined($tab),"Could not load $enc"); + + ok(&$hz_from_unicode($hz_exp), $hz_txt[0], + "$enc mangled translating from Unicode"); + + foreach my $str (@hz_txt) + { + ok(&$hz_to_unicode($str), $hz_exp, + "$enc mangled translating to Unicode"); + } + } +} + +for my $ary (@ary_buff) { + my $NG = 0; + my $enc = $ary->[0]; + for my $n ( int(BUFSIZ/2) .. 2*BUFSIZ+4 ){ + my $dst = "a"x$n. $ary->[1] . TAIL; + my $src = "a"x$n. $ary->[2] . TAIL; + my $utf = buff_decode($enc, $src); + $NG++ unless $dst eq $utf; + } + ok($NG, 0, "$enc mangled translating to Unicode"); +} + +sub buff_decode { + my($enc, $str) = @_; + my $utf8 = ''; + my $inconv = ''; + while(length $str){ + my $buff = $inconv.substr($str,0,BUFSIZ - length $inconv,''); + my $decoded = decode($enc, $buff, YES); + if(length $decoded){ + $utf8 .= $decoded; + $inconv = $buff; + } else { + last; # malformed? + } + } + return $utf8; +} + |