summaryrefslogtreecommitdiff
path: root/ext
diff options
context:
space:
mode:
authorSADAHIRO Tomoyuki <BQW10602@nifty.com>2001-07-10 09:29:55 +0900
committerJarkko Hietaniemi <jhi@iki.fi>2001-07-10 01:16:27 +0000
commit466d6cd3e7b7bbd27604d839f967b18f16a935e5 (patch)
tree5b4186411c6b0a55b0285a13890ad78e7bdc629d /ext
parentd763e2de08a0f2126d4686756b97d0aecd52e13c (diff)
downloadperl-466d6cd3e7b7bbd27604d839f967b18f16a935e5.tar.gz
Encode::Tcl. add "HZ" encoding and bugfix
Message-Id: <20010710002756.B497.BQW10602@nifty.com> p4raw-id: //depot/perl@11244
Diffstat (limited to 'ext')
-rw-r--r--ext/Encode/Encode/7bit-kana.enc4
-rw-r--r--ext/Encode/Encode/HZ.enc7
-rw-r--r--ext/Encode/Encode/Tcl.pm170
-rw-r--r--ext/Encode/Encode/Tcl.t43
-rw-r--r--ext/Encode/Encode/iso2022-jp.enc9
-rw-r--r--ext/Encode/Encode/iso2022-kr.enc2
6 files changed, 192 insertions, 43 deletions
diff --git a/ext/Encode/Encode/7bit-kana.enc b/ext/Encode/Encode/7bit-kana.enc
index 871dbf669a..2ca1d5fad9 100644
--- a/ext/Encode/Encode/7bit-kana.enc
+++ b/ext/Encode/Encode/7bit-kana.enc
@@ -2,8 +2,8 @@
S
0025 0 1
00
-0000000100020003000400050006000700080009000A000B000C000D00000000
-0010001100120013001400150016001700180019001A0000001C001D001E001F
+0000000000000000000000000000000000000000000000000000000000000000
+0000000000000000000000000000000000000000000000000000000000000000
0000FF61FF62FF63FF64FF65FF66FF67FF68FF69FF6AFF6BFF6CFF6DFF6EFF6F
FF70FF71FF72FF73FF74FF75FF76FF77FF78FF79FF7AFF7BFF7CFF7DFF7EFF7F
FF80FF81FF82FF83FF84FF85FF86FF87FF88FF89FF8AFF8BFF8CFF8DFF8EFF8F
diff --git a/ext/Encode/Encode/HZ.enc b/ext/Encode/Encode/HZ.enc
new file mode 100644
index 0000000000..748ee0bd20
--- /dev/null
+++ b/ext/Encode/Encode/HZ.enc
@@ -0,0 +1,7 @@
+# Encoding file: HZ, HanZi
+H
+name HZ
+init {}
+final {}
+ascii \x7e\x7d
+gb2312 \x7e\x7b
diff --git a/ext/Encode/Encode/Tcl.pm b/ext/Encode/Encode/Tcl.pm
index 84d107ff85..d0ff0c8139 100644
--- a/ext/Encode/Encode/Tcl.pm
+++ b/ext/Encode/Encode/Tcl.pm
@@ -78,7 +78,7 @@ sub loadEncoding
$type = substr($line,0,1);
last unless $type eq '#';
}
- my $class = ref($obj).('::'.(($type eq 'E') ? 'Escape' : 'Table'));
+ my $class = ref($obj).('::'.(($type eq 'H') ? 'HanZi' : ($type eq 'E') ? 'Escape' : 'Table'));
# carp "Loading $file";
bless $obj,$class;
return $obj if $obj->read($fh,$obj->name,$type);
@@ -323,46 +323,152 @@ sub encode
while (length($uni)){
my $ch = chr(ord(substr($uni,0,1,'')));
- my $x = ref($tbl->{$pre}) eq 'Encode::XS'
- ? $tbl->{$pre}->encode($ch,1)
- : $tbl->{$pre}->{FmUni}->{$ch};
-
- unless(defined $x){
- foreach my $e_seq (@$seq){
- $x = ref($tbl->{$e_seq}) eq 'Encode::XS'
- ? $tbl->{$e_seq}->encode($ch,1)
- : $tbl->{$e_seq}->{FmUni}->{$ch};
- $cur = $e_seq and last if defined $x;
- }
+ my $x;
+ foreach my $e_seq ($std, $pre, @$seq){
+ $x = ref($tbl->{$e_seq}) eq 'Encode::XS'
+ ? $tbl->{$e_seq}->encode($ch,1)
+ : $tbl->{$e_seq}->{FmUni}->{$ch};
+ $cur = $e_seq and last if defined $x;
}
- if($x == 0x0d && !($ini eq '' && $fin eq '') && substr($uni,0,1) eq "\x0a")
+ if(ref($tbl->{$cur}) ne 'Encode::XS')
{
- $str .= $cur unless $cur eq $pre;
- $str .= $fin."\x0d\x0a".$ini;
- substr($uni,0,1,'');
- $pre = $std;
- next;
+ my $def = $tbl->{$cur}->{'Def'};
+ my $rep = $tbl->{$cur}->{'Rep'};
+ unless (defined $x){
+ last if ($chk);
+ $x = $def;
+ }
+ $x = pack(&$rep($x),$x);
+ }
+ $str .= $cur eq $pre ? $x : ($pre = $cur).$x;
+ }
+ $str .= $std unless $cur eq $std;
+ $str .= $fin;
+ $_[1] = $uni if $chk;
+ return $str;
+}
+
+package Encode::Tcl::HanZi;
+use base 'Encode::Encoding';
+
+use Carp;
+
+sub read
+{
+ my ($obj,$fh,$name) = @_;
+ my(%tbl, @seq, $enc);
+ while (<$fh>)
+ {
+ my ($key,$val) = /^(\S+)\s+(.*)$/;
+ $val =~ s/^\{(.*?)\}/$1/g;
+ $val =~ s/\\x([0-9a-f]{2})/chr(hex($1))/ge;
+ if($enc = Encode->getEncoding($key)){
+ $tbl{$val} = ref($enc) eq 'Encode::Tcl' ? $enc->loadEncoding : $enc;
+ push @seq, $val;
+ }else{
+ $obj->{$key} = $val;
+ }
+ }
+ $obj->{'Seq'} = \@seq; # escape sequences
+ $obj->{'Tbl'} = \%tbl; # encoding tables
+ return $obj;
+}
+
+sub decode
+{
+ my ($obj,$str,$chk) = @_;
+ my $tbl = $obj->{'Tbl'};
+ my $seq = $obj->{'Seq'};
+ my $std = $seq->[0];
+ my $cur = $std;
+ my $uni;
+ while (length($str)){
+ my $uch = substr($str,0,1,'');
+ if($uch eq "~"){
+ if($str =~ s/^\cJ//)
+ {
+ next;
+ }
+ elsif($str =~ s/^\~//)
+ {
+ 1;
+ }
+ elsif($str =~ s/^([{}])//)
+ {
+ $cur = "~$1";
+ next;
+ }
+ else
+ {
+ $str =~ s/^([^~])//;
+ carp "unknown HanZi escape sequence: ~$1";
+ next;
+ }
}
- if(ref($tbl->{$cur}) eq 'Encode::XS'){
- $str .= $cur unless $cur eq $pre;
- $str .= $x; # "DEF" is lost
- $pre = $cur;
- next;
+ if(ref($tbl->{$cur}) eq 'Encode::XS'){
+ $uni .= $tbl->{$cur}->decode($uch);
+ next;
+ }
+ my $ch = ord($uch);
+ my $rep = $tbl->{$cur}->{'Rep'};
+ my $touni = $tbl->{$cur}->{'ToUni'};
+ my $x;
+ if (&$rep($ch) eq 'C')
+ {
+ $x = $touni->[0][$ch];
+ }
+ else
+ {
+ $x = $touni->[$ch][ord(substr($str,0,1,''))];
+ }
+ unless (defined $x)
+ {
+ last if $chk;
+ # What do we do here ?
+ $x = '';
+ }
+ $uni .= $x;
}
- my $def = $tbl->{$cur}->{'Def'};
- my $rep = $tbl->{$cur}->{'Rep'};
- unless (defined $x){
- last if ($chk);
- $x = $def;
+ $_[1] = $str if $chk;
+ return $uni;
+}
+
+sub encode
+{
+ my ($obj,$uni,$chk) = @_;
+ my $tbl = $obj->{'Tbl'};
+ my $seq = $obj->{'Seq'};
+ my $std = $seq->[0];
+ my $str;
+ my $pre = $std;
+ my $cur = $pre;
+
+ while (length($uni)){
+ my $ch = chr(ord(substr($uni,0,1,'')));
+ my $x;
+ foreach my $e_seq (@$seq){
+ $x = ref($tbl->{$e_seq}) eq 'Encode::XS'
+ ? $tbl->{$e_seq}->encode($ch,1)
+ : $tbl->{$e_seq}->{FmUni}->{$ch};
+ $cur = $e_seq and last if defined $x;
}
- $str .= $cur unless $cur eq $pre;
- $str .= pack(&$rep($x),$x);
- $pre = $cur;
+ if(ref($tbl->{$cur}) ne 'Encode::XS')
+ {
+ my $def = $tbl->{$cur}->{'Def'};
+ my $rep = $tbl->{$cur}->{'Rep'};
+ unless (defined $x){
+ last if ($chk);
+ $x = $def;
+ }
+ $x = pack(&$rep($x),$x);
+ }
+ $str .= $cur eq $pre ? $x : ($pre = $cur).$x;
+ $str .= '~' if $x eq '~'; # to '~~'
}
$str .= $std unless $cur eq $std;
- $str .= $fin;
$_[1] = $uni if $chk;
return $str;
}
+
1;
__END__
diff --git a/ext/Encode/Encode/Tcl.t b/ext/Encode/Encode/Tcl.t
index 5e5d8f95e8..1d1692e763 100644
--- a/ext/Encode/Encode/Tcl.t
+++ b/ext/Encode/Encode/Tcl.t
@@ -42,7 +42,7 @@ my %ideodigit = ( # cjk ideograph 'one' to 'ten'
my @ideodigit = qw(one two three four five six seven eight nine ten);
my $jis = '7bit-jis';
-my $kr = '7bit-kr';
+my $kr = 'iso2022-kr';
my %esc_str;
$esc_str{$jis} = {qw(
@@ -70,8 +70,19 @@ $esc_str{$kr} = {qw(
my $num_esc = $n * keys(%esc_str);
foreach (values %esc_str){ $num_esc += $n * keys %$_ }
+my $hz = 'HZ'; # HanZi
+
+my @hz_txt = (
+ "~~in GB.~{<:Ky2;S{#,NpJ)l6HK!#~}Bye.~~",
+ "~~in GB.~{<:Ky2;S{#,~}~\cJ~{NpJ)l6HK!#~}Bye.~~",
+ "~~in GB.~\cJ~{<:Ky2;S{#,NpJ)l6HK!#~}~\cJBye.~~",
+);
+
+my $hz_exp = '007e0069006e002000470042002e5df162404e0d6b32'
+ . 'ff0c52ff65bd65bc4eba3002004200790065002e007e';
+
plan test => $n*@encodings + $n*@encodings*@greek
- + $n*@encodings*@ideodigit + $num_esc;
+ + $n*@encodings*@ideodigit + $num_esc + $n + @hz_txt;
foreach my $enc (@encodings)
{
@@ -143,3 +154,31 @@ foreach my $enc (@encodings)
"$enc mangled translating to Unicode");
}
}
+
+
+{
+ my $hz_to_unicode = sub
+ {
+ return unpack('H*', pack 'n*', unpack 'U*', decode $hz, shift);
+ };
+
+ my $hz_from_unicode = sub
+ {
+ return encode($hz, pack 'U*', unpack 'n*', pack 'H*', shift);
+ };
+
+ foreach my $enc ($hz)
+ {
+ my $tab = Encode->getEncoding($enc);
+ ok(1,defined($tab),"Could not load $enc");
+
+ ok(&$hz_from_unicode($hz_exp), $hz_txt[0],
+ "$enc mangled translating from Unicode");
+
+ foreach my $str (@hz_txt)
+ {
+ ok(&$hz_to_unicode($str), $hz_exp,
+ "$enc mangled translating to Unicode");
+ }
+ }
+}
diff --git a/ext/Encode/Encode/iso2022-jp.enc b/ext/Encode/Encode/iso2022-jp.enc
index a4e455f3ba..15d52d2f44 100644
--- a/ext/Encode/Encode/iso2022-jp.enc
+++ b/ext/Encode/Encode/iso2022-jp.enc
@@ -3,10 +3,7 @@ E
name iso2022-jp
init {}
final {}
-iso8859-1 \x1b(B
-jis0201 \x1b(J
-jis0208 \x1b$@
+ascii \x1b(B
+ascii \x1b(J
jis0208 \x1b$B
-jis0212 \x1b$(D
-gb2312 \x1b$A
-ksc5601 \x1b$(C
+jis0208 \x1b$@
diff --git a/ext/Encode/Encode/iso2022-kr.enc b/ext/Encode/Encode/iso2022-kr.enc
index d20ce2bc12..b894d22da6 100644
--- a/ext/Encode/Encode/iso2022-kr.enc
+++ b/ext/Encode/Encode/iso2022-kr.enc
@@ -3,5 +3,5 @@ E
name iso2022-kr
init \x1b$)C
final {}
-iso8859-1 \x0f
+ascii \x0f
ksc5601 \x0e