diff options
author | SADAHIRO Tomoyuki <BQW10602@nifty.com> | 2001-10-16 10:50:16 +0900 |
---|---|---|
committer | Jarkko Hietaniemi <jhi@iki.fi> | 2001-10-19 19:01:46 +0000 |
commit | 96d6357cc349e1e67175ef1b324a18e782e390b7 (patch) | |
tree | e4f671e34ac03a9dce5f38b08f431dd05e48cc24 /ext/Encode | |
parent | 2cd1776c1779fda5b6df8baba77742b2f09eb9e2 (diff) | |
download | perl-96d6357cc349e1e67175ef1b324a18e782e390b7.tar.gz |
Re: PerlIO and Encode
Message-Id: <20011016014150.0C8E.BQW10602@nifty.com>
p4raw-id: //depot/perl@12509
Diffstat (limited to 'ext/Encode')
-rw-r--r-- | ext/Encode/Encode.pm | 2 | ||||
-rw-r--r-- | ext/Encode/Encode/Tcl.pm | 520 | ||||
-rw-r--r-- | ext/Encode/Encode/Tcl.t | 67 |
3 files changed, 388 insertions, 201 deletions
diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm index 2035e20c15..6ddcb32132 100644 --- a/ext/Encode/Encode.pm +++ b/ext/Encode/Encode.pm @@ -225,7 +225,7 @@ sub decode my $enc = find_encoding($name); croak("Unknown encoding '$name'") unless defined $enc; my $string = $enc->decode($octets,$check); - return undef if ($check && length($octets)); + $_[1] = $octets if $check; return $string; } diff --git a/ext/Encode/Encode/Tcl.pm b/ext/Encode/Encode/Tcl.pm index eb13c5f4fc..460a521bb9 100644 --- a/ext/Encode/Encode/Tcl.pm +++ b/ext/Encode/Encode/Tcl.pm @@ -40,6 +40,23 @@ sub import INC_search(); } +sub no_map_in_encode ($$) + # codepoint, enc-name; +{ + carp sprintf "\"\\N{U+%x}\" does not map to %s", @_; +# /* FIXME: Skip over the character, copy in replacement and continue +# * but that is messy so for now just fail. +# */ + return; +} + +sub no_map_in_decode ($$) + # enc-name, string beginning the malform char; +{ +# /* UTF-8 is supposed to be "Universal" so should not happen */ + croak sprintf "%s '%s' does not map to UTF-8", @_; +} + sub encode { my $obj = shift; @@ -78,11 +95,11 @@ sub loadEncoding $type = substr($line,0,1); last unless $type eq '#'; } - my $class = ref($obj).('::'.( - ($type eq 'X') ? 'Extended' : - ($type eq 'H') ? 'HanZi' : - ($type eq 'E') ? 'Escape' : 'Table' - )); + my $subclass = + ($type eq 'X') ? 'Extended' : + ($type eq 'H') ? 'HanZi' : + ($type eq 'E') ? 'Escape' : 'Table'; + my $class = ref($obj) . '::' . $subclass; # carp "Loading $file"; bless $obj,$class; return $obj if $obj->read($fh,$obj->name,$type); @@ -109,7 +126,8 @@ sub INC_find package Encode::Tcl::Table; use base 'Encode::Encoding'; -use Data::Dumper; +use Carp; +#use Data::Dumper; sub read { @@ -150,8 +168,12 @@ sub read } $touni[$page] = \@page; } - $rep = $type ne 'M' ? $obj->can("rep_$type") : - sub { ($_[0] > 255) || $leading[$_[0]] ? 'n' : 'C'}; + $rep = $type ne 'M' + ? $obj->can("rep_$type") + : sub + { + ($_[0] > 255) || $leading[$_[0]] ? 'n' : 'C'; + }; $obj->{'Rep'} = $rep; $obj->{'ToUni'} = \@touni; $obj->{'FmUni'} = \%fmuni; @@ -175,13 +197,15 @@ sub representation sub decode { - my ($obj,$str,$chk) = @_; + my($obj,$str,$chk) = @_; + my $name = $obj->{'Name'}; my $rep = $obj->{'Rep'}; my $touni = $obj->{'ToUni'}; my $uni; while (length($str)) { - my $ch = ord(substr($str,0,1,'')); + my $cc = substr($str,0,1,''); + my $ch = ord($cc); my $x; if (&$rep($ch) eq 'C') { @@ -189,13 +213,18 @@ sub decode } else { - $x = $touni->[$ch][ord(substr($str,0,1,''))]; + if(! length $str) + { + $str = pack('C',$ch); # split leading byte + last; + } + my $c2 = substr($str,0,1,''); + $cc .= $c2; + $x = $touni->[$ch][ord($c2)]; } unless (defined $x) { - last if $chk; - # What do we do here ? - $x = ''; + Encode::Tcl::no_map_in_decode($name, $cc.$str); } $uni .= $x; } @@ -209,16 +238,20 @@ sub encode my ($obj,$uni,$chk) = @_; my $fmuni = $obj->{'FmUni'}; my $def = $obj->{'Def'}; + my $name = $obj->{'Name'}; my $rep = $obj->{'Rep'}; my $str; while (length($uni)) { my $ch = substr($uni,0,1,''); - my $x = $fmuni->{chr(ord($ch))}; - unless (defined $x) + my $x = $fmuni->{$ch}; + unless(defined $x) { - last if ($chk); - $x = $def; + unless($chk) + { + Encode::Tcl::no_map_in_encode(ord($ch), $name) + } + return undef; } $str .= pack(&$rep($x),$x); } @@ -231,29 +264,41 @@ use base 'Encode::Encoding'; use Carp; +use constant SI => "\cO"; +use constant SO => "\cN"; +use constant SS2 => "\eN"; +use constant SS3 => "\eO"; + sub read { my ($obj,$fh,$name) = @_; my(%tbl, @seq, $enc, @esc, %grp); while (<$fh>) { - my ($key,$val) = /^(\S+)\s+(.*)$/; + next unless /^(\S+)\s+(.*)$/; + my ($key,$val) = ($1,$2); $val =~ s/^\{(.*?)\}/$1/g; $val =~ s/\\x([0-9a-f]{2})/chr(hex($1))/ge; - if($enc = Encode->getEncoding($key)){ + if($enc = Encode->getEncoding($key)) + { $tbl{$val} = ref($enc) eq 'Encode::Tcl' ? $enc->loadEncoding : $enc; push @seq, $val; $grp{$val} = - $val =~ m|[(]| ? 0 : # G0 : SI eq "\cO" - $val =~ m|[)-]| ? 1 : # G1 : SO eq "\cN" - $val =~ m|[*.]| ? 2 : # G2 : SS2 eq "\eN" - $val =~ m|[+/]| ? 3 : # G3 : SS3 eq "\eO" - 0; # G0 - }else{ + $val =~ m|[(]| ? 0 : # G0 : SI eq "\cO" + $val =~ m|[)-]| ? 1 : # G1 : SO eq "\cN" + $val =~ m|[*.]| ? 2 : # G2 : SS2 eq "\eN" + $val =~ m|[+/]| ? 3 : # G3 : SS3 eq "\eO" + 0; # G0 + } + else + { $obj->{$key} = $val; - } - if($val =~ /^\e(.*)/){ push(@esc, quotemeta $1) } + } + if($val =~ /^\e(.*)/) + { + push(@esc, quotemeta $1); + } } $obj->{'Grp'} = \%grp; # graphic chars $obj->{'Seq'} = \@seq; # escape sequences @@ -265,6 +310,7 @@ sub read sub decode { my ($obj,$str,$chk) = @_; + my $name = $obj->{'Name'}; my $tbl = $obj->{'Tbl'}; my $seq = $obj->{'Seq'}; my $grp = $obj->{'Grp'}; @@ -277,45 +323,57 @@ sub decode my $s = 0; # state of SO-SI. 0 (G0) or 1 (G1); my $ss = 0; # state of SS2,SS3. 0 (G0), 2 (G2) or 3 (G3); my $uni; - while (length($str)){ - my $uch = substr($str,0,1,''); - if($uch eq "\e"){ - if($str =~ s/^($esc)//) - { - my $e = "\e$1"; - $sta[ $grp->{$e} ] = $e if $tbl->{$e}; - } + while (length($str)) + { + my $cc = substr($str,0,1,''); + if($cc eq "\e") + { + if($str =~ s/^($esc)//) + { + my $e = "\e$1"; + $sta[ $grp->{$e} ] = $e if $tbl->{$e}; + } # appearance of "\eN\eO" or "\eO\eN" isn't supposed. - elsif($str =~ s/^N//) - { - $ss = 2; - } - elsif($str =~ s/^O//) - { - $ss = 3; - } - else - { - $str =~ s/^([\x20-\x2F]*[\x30-\x7E])//; - carp "unknown escape sequence: ESC $1"; - } - next; - } - if($uch eq "\x0e"){ - $s = 1; next; - } - if($uch eq "\x0f"){ - $s = 0; next; - } + # but in that case, the former will be ignored. + elsif($str =~ s/^N//) + { + $ss = 2; + } + elsif($str =~ s/^O//) + { + $ss = 3; + } + else + { + # strictly, ([\x20-\x2F]*[\x30-\x7E]). '?' for chopped. + $str =~ s/^([\x20-\x2F]*[\x30-\x7E]?)//; + if($chk && ! length $str) + { + $str = "\e$1"; # split sequence + last; + } + croak "unknown escape sequence: ESC $1"; + } + next; + } + if($cc eq SO) + { + $s = 1; next; + } + if($cc eq SI) + { + $s = 0; next; + } $cur = $ss ? $sta[$ss] : $sta[$s]; - if(ref($tbl->{$cur}) eq 'Encode::XS'){ - $uni .= $tbl->{$cur}->decode($uch); + if(ref($tbl->{$cur}) ne 'Encode::Tcl::Table') + { + $uni .= $tbl->{$cur}->decode($cc); $ss = 0; next; - } - my $ch = ord($uch); + } + my $ch = ord($cc); my $rep = $tbl->{$cur}->{'Rep'}; my $touni = $tbl->{$cur}->{'ToUni'}; my $x; @@ -325,24 +383,36 @@ sub decode } else { - $x = $touni->[$ch][ord(substr($str,0,1,''))]; + if(! length $str) + { + $str = $cc; # split leading byte + last; + } + my $c2 = substr($str,0,1,''); + $cc .= $c2; + $x = $touni->[$ch][ord($c2)]; } unless (defined $x) { - last if $chk; - # What do we do here ? - $x = ''; + Encode::Tcl::no_map_in_decode($name, $cc.$str); } $uni .= $x; $ss = 0; } - $_[1] = $str if $chk; - return $uni; + if($chk) + { + my $back = join('', grep defined($_) && $_ ne $std, @sta); + $back .= SO if $s; + $back .= $ss == 2 ? SS2 : SS3 if $ss; + $_[1] = $back.$str; + } + return $uni; } sub encode { my ($obj,$uni,$chk) = @_; + my $name = $obj->{'Name'}; my $tbl = $obj->{'Tbl'}; my $seq = $obj->{'Seq'}; my $grp = $obj->{'Grp'}; @@ -357,39 +427,45 @@ sub encode if($ini && defined $grp->{$ini}) { - $sta[ $grp->{$ini} ] = $ini; + $sta[ $grp->{$ini} ] = $ini; } - while (length($uni)){ - my $ch = substr($uni,0,1,''); - my $x; - foreach my $e_seq (@$seq){ - $x = ref($tbl->{$e_seq}) eq 'Encode::XS' - ? $tbl->{$e_seq}->encode($ch,1) - : $tbl->{$e_seq}->{FmUni}->{$ch}; - $cur = $e_seq, last if defined $x; - } - if(ref($tbl->{$cur}) ne 'Encode::XS') - { - my $def = $tbl->{$cur}->{'Def'}; - my $rep = $tbl->{$cur}->{'Rep'}; - unless (defined $x){ - last if ($chk); - $x = $def; + while (length($uni)) + { + my $ch = substr($uni,0,1,''); + my $x; + foreach my $e_seq (@$seq) + { + $x = ref($tbl->{$e_seq}) eq 'Encode::Tcl::Table' + ? $tbl->{$e_seq}->{FmUni}->{$ch} + : $tbl->{$e_seq}->encode($ch,1); + $cur = $e_seq, last if defined $x; } - $x = pack(&$rep($x),$x); + unless (defined $x) + { + unless($chk) + { + Encode::Tcl::no_map_in_encode(ord($ch), $name) + } + return undef; } - $cG = $grp->{$cur}; - $str .= $sta[$cG] = $cur unless $cG < 2 && $cur eq $sta[$cG]; - - $str .= $cG == 0 && $pG == 1 ? "\cO" : - $cG == 1 && $pG == 0 ? "\cN" : - $cG == 2 ? "\eN" : - $cG == 3 ? "\eO" : ""; - $str .= $x; - $pG = $cG if $cG < 2; - } - $str .= "\cO" if $pG == 1; # back to G0 + if(ref($tbl->{$cur}) eq 'Encode::Tcl::Table') + { + my $def = $tbl->{$cur}->{'Def'}; + my $rep = $tbl->{$cur}->{'Rep'}; + $x = pack(&$rep($x),$x); + } + $cG = $grp->{$cur}; + $str .= $sta[$cG] = $cur unless $cG < 2 && $cur eq $sta[$cG]; + + $str .= $cG == 0 && $pG == 1 ? SI : + $cG == 1 && $pG == 0 ? SO : + $cG == 2 ? SS2 : + $cG == 3 ? SS3 : ""; + $str .= $x; + $pG = $cG if $cG < 2; + } + $str .= SI if $pG == 1; # back to G0 $str .= $std unless $std eq $sta[0]; # GO to ASCII $str .= $fin; # necessary? $_[1] = $uni if $chk; @@ -408,18 +484,21 @@ sub read my(%tbl, $enc, %ssc, @key); while (<$fh>) { - my ($key,$val) = /^(\S+)\s+(.*)$/; + next unless /^(\S+)\s+(.*)$/; + my ($key,$val) = ($1,$2); $val =~ s/\{(.*?)\}/$1/; $val =~ s/\\x([0-9a-f]{2})/chr(hex($1))/ge; - if($enc = Encode->getEncoding($key)){ + if($enc = Encode->getEncoding($key)) + { push @key, $val; - $tbl{$val} = ref($enc) eq 'Encode::Tcl' - ? $enc->loadEncoding : $enc; + $tbl{$val} = ref($enc) eq 'Encode::Tcl' ? $enc->loadEncoding : $enc; $ssc{$val} = substr($val,1) if $val =~ /^>/; - }else{ + } + else + { $obj->{$key} = $val; - } + } } $obj->{'SSC'} = \%ssc; # single shift char $obj->{'Tbl'} = \%tbl; # encoding tables @@ -430,25 +509,28 @@ sub read sub decode { my ($obj,$str,$chk) = @_; - my $tbl = $obj->{'Tbl'}; - my $ssc = $obj->{'SSC'}; + my $name = $obj->{'Name'}; + my $tbl = $obj->{'Tbl'}; + my $ssc = $obj->{'SSC'}; my $cur = ''; # current state my $uni; - while (length($str)){ - my $uch = substr($str,0,1,''); - my $ch = ord($uch); + while (length($str)) + { + my $cc = substr($str,0,1,''); + my $ch = ord($cc); if(!$cur && $ch > 0x7F) { $cur = '>'; - $cur .= $uch, next if $ssc->{$cur.$uch}; + $cur .= $cc, next if $ssc->{$cur.$cc}; } $ch ^= 0x80 if $cur; - if(ref($tbl->{$cur}) eq 'Encode::XS'){ - $uni .= $tbl->{$cur}->decode(chr($ch)); + if(ref($tbl->{$cur}) ne 'Encode::Tcl::Table') + { + $uni .= $tbl->{$cur}->decode($cc); $cur = ''; next; - } + } my $rep = $tbl->{$cur}->{'Rep'}; my $touni = $tbl->{$cur}->{'ToUni'}; my $x; @@ -458,59 +540,74 @@ sub decode } else { - $x = $touni->[$ch][0x80 ^ ord(substr($str,0,1,''))]; + if(! length $str) + { + $str = $cc; # split leading byte + last; + } + my $c2 = substr($str,0,1,''); + $cc .= $c2; + $x = $touni->[$ch][0x80 ^ ord($c2)]; } unless (defined $x) { - last if $chk; - # What do we do here ? - $x = ''; + Encode::Tcl::no_map_in_decode($name, $cc.$str); } $uni .= $x; $cur = ''; } - $_[1] = $str if $chk; + if($chk) + { + $cur =~ s/>//; + $_[1] = $cur ne '' ? $cur.$str : $str; + } return $uni; } sub encode { my ($obj,$uni,$chk) = @_; + my $name = $obj->{'Name'}; my $tbl = $obj->{'Tbl'}; my $ssc = $obj->{'SSC'}; my $key = $obj->{'Key'}; my $str; my $cur; - while (length($uni)){ - my $ch = substr($uni,0,1,''); - my $x; - foreach my $k (@$key){ - $x = ref($tbl->{$k}) eq 'Encode::XS' - ? $k =~ /^>/ - ? $tbl->{$k}->encode(chr(0x80 ^ ord $ch),1) - : $tbl->{$k}->encode($ch,1) - : $tbl->{$k}->{FmUni}->{$ch}; - $cur = $k, last if defined $x; - } - if(ref($tbl->{$cur}) ne 'Encode::XS') - { - my $def = $tbl->{$cur}->{'Def'}; - my $rep = $tbl->{$cur}->{'Rep'}; - unless (defined $x){ - last if ($chk); - $x = $def; - } - my $r = &$rep($x); - $x = pack($r, + while (length($uni)) + { + my $ch = substr($uni,0,1,''); + my $x; + foreach my $k (@$key) + { + $x = ref($tbl->{$k}) ne 'Encode::Tcl::Table' + ? $k =~ /^>/ + ? $tbl->{$k}->encode(chr(0x80 ^ ord $ch),1) + : $tbl->{$k}->encode($ch,1) + : $tbl->{$k}->{FmUni}->{$ch}; + $cur = $k, last if defined $x; + } + unless (defined $x) + { + unless($chk) + { + Encode::Tcl::no_map_in_encode(ord($ch), $name) + } + return undef; + } + if(ref($tbl->{$cur}) eq 'Encode::Tcl::Table') + { + my $def = $tbl->{$cur}->{'Def'}; + my $rep = $tbl->{$cur}->{'Rep'}; + my $r = &$rep($x); + $x = pack($r, $cur =~ /^>/ ? $r eq 'C' ? 0x80 ^ $x : 0x8080 ^ $x : $x); - } - - $str .= $ssc->{$cur} if defined $ssc->{$cur}; - $str .= $x; - } + } + $str .= $ssc->{$cur} if defined $ssc->{$cur}; + $str .= $x; + } $_[1] = $uni if $chk; return $str; } @@ -526,15 +623,19 @@ sub read my(%tbl, @seq, $enc); while (<$fh>) { - my ($key,$val) = /^(\S+)\s+(.*)$/; + next unless /^(\S+)\s+(.*)$/; + my ($key,$val) = ($1,$2); $val =~ s/^\{(.*?)\}/$1/g; $val =~ s/\\x([0-9a-f]{2})/chr(hex($1))/ge; - if($enc = Encode->getEncoding($key)){ + if($enc = Encode->getEncoding($key)) + { $tbl{$val} = ref($enc) eq 'Encode::Tcl' ? $enc->loadEncoding : $enc; push @seq, $val; - }else{ + } + else + { $obj->{$key} = $val; - } + } } $obj->{'Seq'} = \@seq; # escape sequences $obj->{'Tbl'} = \%tbl; # encoding tables @@ -544,39 +645,47 @@ sub read sub decode { my ($obj,$str,$chk) = @_; + my $name = $obj->{'Name'}; my $tbl = $obj->{'Tbl'}; my $seq = $obj->{'Seq'}; my $std = $seq->[0]; my $cur = $std; my $uni; while (length($str)){ - my $uch = substr($str,0,1,''); - if($uch eq "~"){ - if($str =~ s/^\cJ//) - { - next; - } - elsif($str =~ s/^\~//) - { - 1; - } - elsif($str =~ s/^([{}])//) - { - $cur = "~$1"; - next; - } - else - { - $str =~ s/^([^~])//; - carp "unknown HanZi escape sequence: ~$1"; - next; - } - } - if(ref($tbl->{$cur}) eq 'Encode::XS'){ - $uni .= $tbl->{$cur}->decode($uch); + my $cc = substr($str,0,1,''); + if($cc eq "~") + { + if($str =~ s/^\cJ//) + { + next; + } + elsif($str =~ s/^\~//) + { + 1; # no-op + } + elsif($str =~ s/^([{}])//) + { + $cur = "~$1"; + next; + } + elsif(! length $str) + { + $str = '~'; + last; + } + else + { + $str =~ s/^([^~])//; + croak "unknown HanZi escape sequence: ~$1"; + next; + } + } + if(ref($tbl->{$cur}) ne 'Encode::Tcl::Table') + { + $uni .= $tbl->{$cur}->decode($cc); next; - } - my $ch = ord($uch); + } + my $ch = ord($cc); my $rep = $tbl->{$cur}->{'Rep'}; my $touni = $tbl->{$cur}->{'ToUni'}; my $x; @@ -586,23 +695,32 @@ sub decode } else { - $x = $touni->[$ch][ord(substr($str,0,1,''))]; + if(! length $str) + { + $str = $cc; # split leading byte + last; + } + my $c2 = substr($str,0,1,''); + $cc .= $c2; + $x = $touni->[$ch][ord($c2)]; } unless (defined $x) { - last if $chk; - # What do we do here ? - $x = ''; + Encode::Tcl::no_map_in_decode($name, $cc.$str); } $uni .= $x; } - $_[1] = $str if $chk; + if($chk) + { + $_[1] = $cur eq $std ? $str : $cur.$str; + } return $uni; } sub encode { my ($obj,$uni,$chk) = @_; + my $name = $obj->{'Name'}; my $tbl = $obj->{'Tbl'}; my $seq = $obj->{'Seq'}; my $std = $seq->[0]; @@ -610,28 +728,34 @@ sub encode my $pre = $std; my $cur = $pre; - while (length($uni)){ - my $ch = chr(ord(substr($uni,0,1,''))); - my $x; - foreach my $e_seq (@$seq){ - $x = ref($tbl->{$e_seq}) eq 'Encode::XS' - ? $tbl->{$e_seq}->encode($ch,1) - : $tbl->{$e_seq}->{FmUni}->{$ch}; - $cur = $e_seq and last if defined $x; - } - if(ref($tbl->{$cur}) ne 'Encode::XS') - { - my $def = $tbl->{$cur}->{'Def'}; - my $rep = $tbl->{$cur}->{'Rep'}; - unless (defined $x){ - last if ($chk); - $x = $def; + while (length($uni)) + { + my $ch = substr($uni,0,1,''); + my $x; + foreach my $e_seq (@$seq) + { + $x = ref($tbl->{$e_seq}) eq 'Encode::Tcl::Table' + ? $tbl->{$e_seq}->{FmUni}->{$ch} + : $tbl->{$e_seq}->encode($ch,1); + $cur = $e_seq and last if defined $x; } - $x = pack(&$rep($x),$x); - } - $str .= $cur eq $pre ? $x : ($pre = $cur).$x; - $str .= '~' if $x eq '~'; # to '~~' - } + unless (defined $x) + { + unless($chk) + { + Encode::Tcl::no_map_in_encode(ord($ch), $name) + } + return undef; + } + if(ref($tbl->{$cur}) eq 'Encode::Tcl::Table') + { + my $def = $tbl->{$cur}->{'Def'}; + my $rep = $tbl->{$cur}->{'Rep'}; + $x = pack(&$rep($x),$x); + } + $str .= $cur eq $pre ? $x : ($pre = $cur).$x; + $str .= '~' if $x eq '~'; # to '~~' + } $str .= $std unless $cur eq $std; $_[1] = $uni if $chk; return $str; diff --git a/ext/Encode/Encode/Tcl.t b/ext/Encode/Encode/Tcl.t index 7e01ca6c13..950f658f90 100644 --- a/ext/Encode/Encode/Tcl.t +++ b/ext/Encode/Encode/Tcl.t @@ -1,6 +1,6 @@ BEGIN { chdir 't' if -d 't'; - @INC = '../lib'; +# @INC = '../lib'; require Config; import Config; if ($Config{'extensions'} !~ /\bEncode\b/) { print "1..0 # Skip: Encode was not built\n"; @@ -88,8 +88,41 @@ my @hz_txt = ( my $hz_exp = '007e0069006e002000470042002e5df162404e0d6b32' . 'ff0c52ff65bd65bc4eba3002004200790065002e007e'; +use constant BUFSIZ => 64; # for test +use constant hiragana => "\x{3042}\x{3044}\x{3046}\x{3048}\x{304A}"; +use constant han_kana => "\x{FF71}\x{FF72}\x{FF73}\x{FF74}\x{FF75}"; +use constant macron => "\x{0100}\x{0112}\x{012a}\x{014c}\x{016a}"; +use constant TAIL => 'bbb'; +use constant YES => 1; + +my @ary_buff = ( # [ encoding, decoded, encoded ] +# type-M + ["euc-cn", hiragana, "\xA4\xA2\xA4\xA4\xA4\xA6\xA4\xA8\xA4\xAA" ], + ["euc-jp", hiragana, "\xA4\xA2\xA4\xA4\xA4\xA6\xA4\xA8\xA4\xAA" ], + ["euc-jp", han_kana, "\x8E\xB1\x8E\xB2\x8E\xB3\x8E\xB4\x8E\xB5" ], + ["euc-kr", hiragana, "\xAA\xA2\xAA\xA4\xAA\xA6\xAA\xA8\xAA\xAA" ], + ["shiftjis", hiragana, "\x82\xA0\x82\xA2\x82\xA4\x82\xA6\x82\xA8" ], + ["shiftjis", han_kana, "\xB1\xB2\xB3\xB4\xB5" ], +# type-E + ["2022-cn", hiragana, "\e\$)A\cN". '$"$$$&$($*' . "\cO" ], + ["2022-jp", hiragana, "\e\$B".'$"$$$&$($*'."\e(B" ], + ["2022-kr", hiragana, "\e\$)C\cN". '*"*$*&*(**' . "\cO" ], + [ $jis, han_kana, "\e\(I".'12345'."\e(B" ], + ["2022-jp1", macron, "\e\$(D\x2A\x27\x2A\x37\x2A\x45\x2A\x57\x2A\x69\e(B"], + ["2022-jp2", "\x{C0}" . macron . "\x{C1}", + "\e\$(D\e.A\eN\x40\x2A\x27\x2A\x37\x2A\x45\x2A\x57\x2A\x69\e(B\eN\x41"], +# type-X + ["euc-jp-0212", hiragana, "\xA4\xA2\xA4\xA4\xA4\xA6\xA4\xA8\xA4\xAA" ], + ["euc-jp-0212", han_kana, "\x8E\xB1\x8E\xB2\x8E\xB3\x8E\xB4\x8E\xB5" ], + ["euc-jp-0212", macron, + "\x8F\xAA\xA7\x8F\xAA\xB7\x8F\xAA\xC5\x8F\xAA\xD7\x8F\xAA\xE9" ], +# type-H + [ $hz, hiragana, "~{". '$"$$$&$($*' . "~}" ], + [ $hz, hiragana, "~{". '$"$$' ."~\cJ". '$&$($*' . "~}" ], +); + plan test => $n*@encodings + $n*@encodings*@greek - + $n*@encodings*@ideodigit + $num_esc + $n + @hz_txt; + + $n*@encodings*@ideodigit + $num_esc + $n + @hz_txt + @ary_buff; foreach my $enc (@encodings) { @@ -189,3 +222,33 @@ foreach my $enc (@encodings) } } } + +for my $ary (@ary_buff) { + my $NG = 0; + my $enc = $ary->[0]; + for my $n ( int(BUFSIZ/2) .. 2*BUFSIZ+4 ){ + my $dst = "a"x$n. $ary->[1] . TAIL; + my $src = "a"x$n. $ary->[2] . TAIL; + my $utf = buff_decode($enc, $src); + $NG++ unless $dst eq $utf; + } + ok($NG, 0, "$enc mangled translating to Unicode"); +} + +sub buff_decode { + my($enc, $str) = @_; + my $utf8 = ''; + my $inconv = ''; + while(length $str){ + my $buff = $inconv.substr($str,0,BUFSIZ - length $inconv,''); + my $decoded = decode($enc, $buff, YES); + if(length $decoded){ + $utf8 .= $decoded; + $inconv = $buff; + } else { + last; # malformed? + } + } + return $utf8; +} + |