diff options
author | Jarkko Hietaniemi <jhi@iki.fi> | 2002-03-05 01:39:29 +0000 |
---|---|---|
committer | Jarkko Hietaniemi <jhi@iki.fi> | 2002-03-05 01:39:29 +0000 |
commit | c0d88b767d1f5af863e544cf079429d1c44da957 (patch) | |
tree | b063520a9c9015a4ca0b7fa713b85b3c3527891d /ext | |
parent | 8ed7e7ad0a4d2488790fb3d3f698a8d0ae4a798b (diff) | |
download | perl-c0d88b767d1f5af863e544cf079429d1c44da957.tar.gz |
"The last pieces of Chinese puzzle" from Autrijus.
p4raw-id: //depot/perl@15029
Diffstat (limited to 'ext')
-rw-r--r-- | ext/Encode/CN/CN.pm | 15 | ||||
-rw-r--r-- | ext/Encode/Encode.pm | 1 | ||||
-rw-r--r-- | ext/Encode/Encode/HZ.enc | 7 | ||||
-rw-r--r-- | ext/Encode/KR/KR.pm | 5 | ||||
-rw-r--r-- | ext/Encode/MANIFEST | 2 | ||||
-rw-r--r-- | ext/Encode/TW/TW.pm | 11 | ||||
-rw-r--r-- | ext/Encode/lib/Encode/CN/HZ.pm | 50 |
7 files changed, 74 insertions, 17 deletions
diff --git a/ext/Encode/CN/CN.pm b/ext/Encode/CN/CN.pm index 7f828d35bd..b2d1795e30 100644 --- a/ext/Encode/CN/CN.pm +++ b/ext/Encode/CN/CN.pm @@ -1,9 +1,14 @@ package Encode::CN; -use Encode; our $VERSION = '0.02'; + +use Encode; +use Encode::CN::HZ; use XSLoader; XSLoader::load('Encode::CN',$VERSION); +local $@; +eval "use Encode::HanExtra"; # load extra encodings if they exist + 1; __END__ =head1 NAME @@ -25,7 +30,8 @@ Encodings supported are as follows. gb2312 The raw (low-bit) GB2312 character map gb12345 Traditional chinese counterpart to GB2312 (raw) iso-ir-165 GB2312 + GB6345 + GB8565 + additions - cp936 Code Page 936, also known as GBK (Extended GuoBiao) + cp936 Code Page 936, also known as GBK (Extended GuoBiao) + hz 7-bit escaped GB2312 encoding To find how to use this module in detail, see L<Encode>. @@ -35,9 +41,10 @@ Due to size concerns, C<GB 18030> (an extension to C<GBK>) is distributed separately on CPAN, under the name L<Encode::HanExtra>. That module also contains extra Taiwan-based encodings. -=head1 BUGS +This module will automatically load L<Encode::HanExtra> if you have it on +your machine. -The C<HZ> (Hanzi) escaped encoding is not supported. +=head1 BUGS ASCII part (0x00-0x7f) is preserved for all encodings, even though it conflicts with mappings by the Unicode Consortium. See diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm index d0bb788b06..445dd2401b 100644 --- a/ext/Encode/Encode.pm +++ b/ext/Encode/Encode.pm @@ -173,7 +173,6 @@ define_alias( qr/^gbk$/i => '"cp936"'); # TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8 # TODO: HP-UX '15' encodings japanese15 korean15 roi15 # TODO: Cyrillic encoding ISO-IR-111 (useful?) -# TODO: Chinese encodings HZ # TODO: Armenian encoding ARMSCII-8 # TODO: Hebrew encoding ISO-8859-8-1 # TODO: Thai encoding TCVN diff --git a/ext/Encode/Encode/HZ.enc b/ext/Encode/Encode/HZ.enc deleted file mode 100644 index 748ee0bd20..0000000000 --- a/ext/Encode/Encode/HZ.enc +++ /dev/null @@ -1,7 +0,0 @@ -# Encoding file: HZ, HanZi -H -name HZ -init {} -final {} -ascii \x7e\x7d -gb2312 \x7e\x7b diff --git a/ext/Encode/KR/KR.pm b/ext/Encode/KR/KR.pm index aa2428128d..9936c5d22a 100644 --- a/ext/Encode/KR/KR.pm +++ b/ext/Encode/KR/KR.pm @@ -1,6 +1,7 @@ package Encode::KR; -use Encode; our $VERSION = '0.02'; + +use Encode; use XSLoader; XSLoader::load('Encode::KR',$VERSION); @@ -23,7 +24,7 @@ are as follows. euc-kr EUC (Extended Unix Character) ksc5601 Korean standard code set - cp949 Code Page 949 (EUC-KR + Unified Hangul Code) + cp949 Code Page 949 (EUC-KR + Unified Hangul Code) To find how to use this module in detail, see L<Encode>. diff --git a/ext/Encode/MANIFEST b/ext/Encode/MANIFEST index bf34b59bf4..6300a735c2 100644 --- a/ext/Encode/MANIFEST +++ b/ext/Encode/MANIFEST @@ -95,7 +95,6 @@ Encode/gb12345.enc Encode/gb1988.enc Encode/gb2312.enc Encode/gsm0338.enc -Encode/HZ.enc Encode/iso-ir-165.enc Encode/ir-197.enc Encode/jis0201.enc @@ -155,6 +154,7 @@ lib/Encode/ucs2_le.pm lib/Encode/Unicode.pm lib/Encode/utf8.pm lib/Encode/XS.pm +lib/Encode/CN/HZ.pm lib/Encode/Tcl/Escape.pm lib/Encode/Tcl/Extended.pm lib/Encode/Tcl/HanZi.pm diff --git a/ext/Encode/TW/TW.pm b/ext/Encode/TW/TW.pm index 90b046041b..c3f64fdd25 100644 --- a/ext/Encode/TW/TW.pm +++ b/ext/Encode/TW/TW.pm @@ -1,9 +1,13 @@ package Encode::TW; -use Encode; our $VERSION = '0.02'; + +use Encode; use XSLoader; XSLoader::load('Encode::TW',$VERSION); +local $@; +eval "use Encode::HanExtra"; # load extra encodings if they exist + 1; __END__ =head1 NAME @@ -23,7 +27,7 @@ Encodings supported are as follows. big5 The original Big5 encoding big5-hkscs Big5 plus Cantonese characters in Hong Kong - cp950 Code Page 950 (Big5 + Microsoft vendor mappings) + cp950 Code Page 950 (Big5 + Microsoft vendor mappings) To find how to use this module in detail, see L<Encode>. @@ -33,6 +37,9 @@ Due to size concerns, C<EUC-TW> (Extended Unix Character) and C<BIG5PLUS> (CMEX's Big5+) are distributed separately on CPAN, under the name L<Encode::HanExtra>. That module also contains extra China-based encodings. +This module will automatically load L<Encode::HanExtra> if you have it on +your machine. + =head1 BUGS The C<CNS11643> encoding files are not complete (only the first two planes, diff --git a/ext/Encode/lib/Encode/CN/HZ.pm b/ext/Encode/lib/Encode/CN/HZ.pm new file mode 100644 index 0000000000..a57ae8a971 --- /dev/null +++ b/ext/Encode/lib/Encode/CN/HZ.pm @@ -0,0 +1,50 @@ +package Encode::CN::HZ; + +use Encode::CN; +use Encode qw|encode decode|; +use base 'Encode::Encoding'; + +use strict; + +# HZ is but escaped GB, so we implement it with the +# GB2312(raw) encoding here. Cf. RFC 1842 & 1843. + +my $canon = 'hz'; +my $obj = bless {name => $canon}, __PACKAGE__; +$obj->Define($canon); + +sub decode +{ + my ($obj,$str,$chk) = @_; + my $gb = Encode::find_encoding('gb2312'); + + $str =~ s{~(?:(~)|\n|{([^~]*)~}|)} + {$1 ? '~' : defined $2 ? $gb->decode($2, $chk) : ''}eg; + + return $str; +} + +sub encode +{ + my ($obj,$str,$chk) = @_; + my $gb = Encode::find_encoding('gb2312'); + + $str =~ s/~/~~/g; + $str =~ s/((?: + \p{InCJKCompatibility}| + \p{InCJKCompatibilityForms}| + \p{InCJKCompatibilityIdeographs}| + \p{InCJKCompatibilityIdeographsSupplement}| + \p{InCJKRadicalsSupplement}| + \p{InCJKSymbolsAndPunctuation}| + \p{InCJKUnifiedIdeographsExtensionA}| + \p{InCJKUnifiedIdeographs}| + \p{InCJKUnifiedIdeographsExtensionB}| + \p{InEnclosedCJKLettersAndMonths} + )+)/'~{'.$gb->encode($1, $chk).'~}'/egx; + + return $str; +} + +1; +__END__ |