summaryrefslogtreecommitdiff
path: root/ext
diff options
context:
space:
mode:
authorJarkko Hietaniemi <jhi@iki.fi>2002-03-05 01:39:29 +0000
committerJarkko Hietaniemi <jhi@iki.fi>2002-03-05 01:39:29 +0000
commitc0d88b767d1f5af863e544cf079429d1c44da957 (patch)
treeb063520a9c9015a4ca0b7fa713b85b3c3527891d /ext
parent8ed7e7ad0a4d2488790fb3d3f698a8d0ae4a798b (diff)
downloadperl-c0d88b767d1f5af863e544cf079429d1c44da957.tar.gz
"The last pieces of Chinese puzzle" from Autrijus.
p4raw-id: //depot/perl@15029
Diffstat (limited to 'ext')
-rw-r--r--ext/Encode/CN/CN.pm15
-rw-r--r--ext/Encode/Encode.pm1
-rw-r--r--ext/Encode/Encode/HZ.enc7
-rw-r--r--ext/Encode/KR/KR.pm5
-rw-r--r--ext/Encode/MANIFEST2
-rw-r--r--ext/Encode/TW/TW.pm11
-rw-r--r--ext/Encode/lib/Encode/CN/HZ.pm50
7 files changed, 74 insertions, 17 deletions
diff --git a/ext/Encode/CN/CN.pm b/ext/Encode/CN/CN.pm
index 7f828d35bd..b2d1795e30 100644
--- a/ext/Encode/CN/CN.pm
+++ b/ext/Encode/CN/CN.pm
@@ -1,9 +1,14 @@
package Encode::CN;
-use Encode;
our $VERSION = '0.02';
+
+use Encode;
+use Encode::CN::HZ;
use XSLoader;
XSLoader::load('Encode::CN',$VERSION);
+local $@;
+eval "use Encode::HanExtra"; # load extra encodings if they exist
+
1;
__END__
=head1 NAME
@@ -25,7 +30,8 @@ Encodings supported are as follows.
gb2312 The raw (low-bit) GB2312 character map
gb12345 Traditional Chinese counterpart to GB2312 (raw)
iso-ir-165 GB2312 + GB6345 + GB8565 + additions
- cp936 Code Page 936, also known as GBK (Extended GuoBiao)
+ cp936 Code Page 936, also known as GBK (Extended GuoBiao)
+ hz 7-bit escaped GB2312 encoding
To find how to use this module in detail, see L<Encode>.
@@ -35,9 +41,10 @@ Due to size concerns, C<GB 18030> (an extension to C<GBK>) is distributed
separately on CPAN, under the name L<Encode::HanExtra>. That module
also contains extra Taiwan-based encodings.
-=head1 BUGS
+This module will automatically load L<Encode::HanExtra> if you have it on
+your machine.
-The C<HZ> (Hanzi) escaped encoding is not supported.
+=head1 BUGS
ASCII part (0x00-0x7f) is preserved for all encodings, even though it
conflicts with mappings by the Unicode Consortium. See
diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm
index d0bb788b06..445dd2401b 100644
--- a/ext/Encode/Encode.pm
+++ b/ext/Encode/Encode.pm
@@ -173,7 +173,6 @@ define_alias( qr/^gbk$/i => '"cp936"');
# TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
# TODO: HP-UX '15' encodings japanese15 korean15 roi15
# TODO: Cyrillic encoding ISO-IR-111 (useful?)
-# TODO: Chinese encodings HZ
# TODO: Armenian encoding ARMSCII-8
# TODO: Hebrew encoding ISO-8859-8-1
# TODO: Thai encoding TCVN
diff --git a/ext/Encode/Encode/HZ.enc b/ext/Encode/Encode/HZ.enc
deleted file mode 100644
index 748ee0bd20..0000000000
--- a/ext/Encode/Encode/HZ.enc
+++ /dev/null
@@ -1,7 +0,0 @@
-# Encoding file: HZ, HanZi
-H
-name HZ
-init {}
-final {}
-ascii \x7e\x7d
-gb2312 \x7e\x7b
diff --git a/ext/Encode/KR/KR.pm b/ext/Encode/KR/KR.pm
index aa2428128d..9936c5d22a 100644
--- a/ext/Encode/KR/KR.pm
+++ b/ext/Encode/KR/KR.pm
@@ -1,6 +1,7 @@
package Encode::KR;
-use Encode;
our $VERSION = '0.02';
+
+use Encode;
use XSLoader;
XSLoader::load('Encode::KR',$VERSION);
@@ -23,7 +24,7 @@ are as follows.
euc-kr EUC (Extended Unix Character)
ksc5601 Korean standard code set
- cp949 Code Page 949 (EUC-KR + Unified Hangul Code)
+ cp949 Code Page 949 (EUC-KR + Unified Hangul Code)
To find how to use this module in detail, see L<Encode>.
diff --git a/ext/Encode/MANIFEST b/ext/Encode/MANIFEST
index bf34b59bf4..6300a735c2 100644
--- a/ext/Encode/MANIFEST
+++ b/ext/Encode/MANIFEST
@@ -95,7 +95,6 @@ Encode/gb12345.enc
Encode/gb1988.enc
Encode/gb2312.enc
Encode/gsm0338.enc
-Encode/HZ.enc
Encode/iso-ir-165.enc
Encode/ir-197.enc
Encode/jis0201.enc
@@ -155,6 +154,7 @@ lib/Encode/ucs2_le.pm
lib/Encode/Unicode.pm
lib/Encode/utf8.pm
lib/Encode/XS.pm
+lib/Encode/CN/HZ.pm
lib/Encode/Tcl/Escape.pm
lib/Encode/Tcl/Extended.pm
lib/Encode/Tcl/HanZi.pm
diff --git a/ext/Encode/TW/TW.pm b/ext/Encode/TW/TW.pm
index 90b046041b..c3f64fdd25 100644
--- a/ext/Encode/TW/TW.pm
+++ b/ext/Encode/TW/TW.pm
@@ -1,9 +1,13 @@
package Encode::TW;
-use Encode;
our $VERSION = '0.02';
+
+use Encode;
use XSLoader;
XSLoader::load('Encode::TW',$VERSION);
+local $@;
+eval "use Encode::HanExtra"; # load extra encodings if they exist
+
1;
__END__
=head1 NAME
@@ -23,7 +27,7 @@ Encodings supported are as follows.
big5 The original Big5 encoding
big5-hkscs Big5 plus Cantonese characters in Hong Kong
- cp950 Code Page 950 (Big5 + Microsoft vendor mappings)
+ cp950 Code Page 950 (Big5 + Microsoft vendor mappings)
To find how to use this module in detail, see L<Encode>.
@@ -33,6 +37,9 @@ Due to size concerns, C<EUC-TW> (Extended Unix Character) and C<BIG5PLUS>
(CMEX's Big5+) are distributed separately on CPAN, under the name
L<Encode::HanExtra>. That module also contains extra China-based encodings.
+This module will automatically load L<Encode::HanExtra> if you have it on
+your machine.
+
=head1 BUGS
The C<CNS11643> encoding files are not complete (only the first two planes,
diff --git a/ext/Encode/lib/Encode/CN/HZ.pm b/ext/Encode/lib/Encode/CN/HZ.pm
new file mode 100644
index 0000000000..a57ae8a971
--- /dev/null
+++ b/ext/Encode/lib/Encode/CN/HZ.pm
@@ -0,0 +1,50 @@
+package Encode::CN::HZ;
+
+use Encode::CN;
+use Encode qw|encode decode|;
+use base 'Encode::Encoding';
+
+use strict;
+
+# HZ is simply GB2312 wrapped in 7-bit escape sequences, so we implement
+# it on top of the raw GB2312 encoding here. Cf. RFC 1842 & 1843.
+
+# Canonical encoding name; stored in the object below and passed to Define().
+my $canon = 'hz';
+# One module-level encoding object: a hash holding the name, blessed into
+# this package so that the decode/encode methods in this file apply to it.
+my $obj = bless {name => $canon}, __PACKAGE__;
+# Define() is not defined in this file; presumably it is inherited from the
+# Encode::Encoding base class and registers $obj under $canon -- TODO confirm.
+$obj->Define($canon);
+
+# Decode an HZ-encoded (RFC 1843) byte string.
+#
+# Arguments:
+#   $obj - this encoding object (not used by the body)
+#   $str - the HZ-encoded input
+#   $chk - Encode CHECK value, handed through to the GB2312 decoder
+# Returns the decoded string.
+#
+# Escape sequences handled:
+#   "~~"       a literal "~"
+#   "~\n"      line continuation, yields nothing
+#   "~{...~}"  a run of raw GB2312 byte pairs, decoded via 'gb2312'
+# A "~" that starts none of the above is dropped.
+sub decode
+{
+    my ($obj,$str,$chk) = @_;
+    my $gb = Encode::find_encoding('gb2312');
+
+    # The GB run is matched as whole byte pairs (first byte 0x21-0x77,
+    # second byte 0x21-0x7E, per RFC 1843) rather than as "anything but ~":
+    # 0x7E ("~") is a legal second byte, so "[^~]*" would stop in the
+    # middle of such a character and mis-parse the rest of the run.
+    #
+    # The captured bytes are copied into a lexical before decoding: the
+    # decoder may modify its argument in place when $chk is set, and the
+    # capture variables are read-only.
+    $str =~ s{~(?:(~)|\n|\{((?:[\x21-\x77][\x21-\x7E])*)~\}|)}
+             {defined $1 ? '~'
+            : defined $2 ? do { my $gb_bytes = $2; $gb->decode($gb_bytes, $chk) }
+            :              ''}eg;
+
+    return $str;
+}
+
+# Encode a character string as HZ (RFC 1843).
+#
+# Arguments:
+#   $obj - this encoding object (not used by the body)
+#   $str - the character string to encode
+#   $chk - Encode CHECK value, handed through to the GB2312 encoder
+# Returns the HZ-encoded string.
+#
+# ASCII is passed through unchanged except that "~" is doubled to "~~".
+# Every run of non-ASCII characters is encoded with the raw 'gb2312'
+# encoding and wrapped in "~{" ... "~}".
+sub encode
+{
+    my ($obj,$str,$chk) = @_;
+    my $gb = Encode::find_encoding('gb2312');
+
+    # Escape literal tildes first; the escapes added below must not be
+    # doubled.
+    $str =~ s/~/~~/g;
+
+    # Hand every non-ASCII run to the GB2312 encoder instead of matching a
+    # fixed list of CJK Unicode blocks: GB2312 also covers fullwidth
+    # punctuation, kana, Greek, Cyrillic and box-drawing characters, which
+    # a block list leaves unencoded in the 7-bit output.
+    #
+    # The captured run is copied into a lexical before encoding: the
+    # encoder may modify its argument in place when $chk is set, and the
+    # capture variables are read-only.
+    $str =~ s/([^\x00-\x7F]+)/
+        my $chars = $1;
+        '~{' . $gb->encode($chars, $chk) . '~}'
+    /eg;
+
+    return $str;
+}
+
+1;
+__END__