From a63c962f6b57d9d07801c81cc6e7f8a1b904a8c5 Mon Sep 17 00:00:00 2001 From: Jarkko Hietaniemi Date: Sun, 24 Mar 2002 15:53:59 +0000 Subject: Upgrade to Encode 0.98, from Dan Kogai. p4raw-id: //depot/perl@15467 --- MANIFEST | 3 +- ext/Encode/CN/CN.pm | 4 +- ext/Encode/Changes | 56 +++++++++- ext/Encode/Encode.pm | 99 +++++++++--------- ext/Encode/JP/JP.pm | 30 +++++- ext/Encode/KR/KR.pm | 4 +- ext/Encode/MANIFEST | 3 +- ext/Encode/TW/TW.pm | 4 +- ext/Encode/lib/Encode/Alias.pm | 58 +++++++---- ext/Encode/lib/Encode/Details.pod | 41 +++----- ext/Encode/lib/Encode/EncFormat.pod | 163 ++++++++++++++++++++++++++++++ ext/Encode/lib/Encode/JP/ISO_2022_JP.pm | 4 +- ext/Encode/lib/Encode/JP/ISO_2022_JP_1.pm | 38 +++++++ ext/Encode/lib/Encode/JP/JIS.pm | 8 +- ext/Encode/lib/Encode/Supported.pod | 115 ++++++++++++++------- ext/Encode/lib/EncodeFormat.pod | 163 ------------------------------ ext/Encode/t/Aliases.t | 124 +++++++++++++---------- 17 files changed, 545 insertions(+), 372 deletions(-) create mode 100644 ext/Encode/lib/Encode/EncFormat.pod create mode 100644 ext/Encode/lib/Encode/JP/ISO_2022_JP_1.pm delete mode 100644 ext/Encode/lib/EncodeFormat.pod diff --git a/MANIFEST b/MANIFEST index 26bbcdac6f..2b4c0bf11c 100644 --- a/MANIFEST +++ b/MANIFEST @@ -355,12 +355,14 @@ ext/Encode/KR/Makefile.PL Encode extension ext/Encode/lib/Encode/Alias.pm Encode extension ext/Encode/lib/Encode/CN/HZ.pm Encode extension ext/Encode/lib/Encode/Details.pod Encode extension +ext/Encode/lib/Encode/EncFormat.pod Encode extension ext/Encode/lib/Encode/Encoding.pm Encode extension ext/Encode/lib/Encode/Internal.pm Encode extension ext/Encode/lib/Encode/iso10646_1.pm Encode extension ext/Encode/lib/Encode/JP/Constants.pm Encode extension ext/Encode/lib/Encode/JP/H2Z.pm Encode extension ext/Encode/lib/Encode/JP/ISO_2022_JP.pm Encode extension +ext/Encode/lib/Encode/JP/ISO_2022_JP_1.pm Encode extension ext/Encode/lib/Encode/JP/JIS.pm Encode extension ext/Encode/lib/Encode/Supported.pod 
Encode extension ext/Encode/lib/Encode/Tcl.pm Encode extension @@ -370,7 +372,6 @@ ext/Encode/lib/Encode/ucs2_le.pm Encode extension ext/Encode/lib/Encode/Unicode.pm Encode extension ext/Encode/lib/Encode/utf8.pm Encode extension ext/Encode/lib/Encode/XS.pm Encode extension -ext/Encode/lib/EncodeFormat.pod Encode extension ext/Encode/Makefile.PL Encode extension makefile writer ext/Encode/MANIFEST Encode extension ext/Encode/README Encode extension diff --git a/ext/Encode/CN/CN.pm b/ext/Encode/CN/CN.pm index 51d90bb5ec..0a468f9f59 100644 --- a/ext/Encode/CN/CN.pm +++ b/ext/Encode/CN/CN.pm @@ -4,7 +4,7 @@ BEGIN { die "Encode::CN not supported on EBCDIC\n"; } } -our $VERSION = do { my @r = (q$Revision: 0.97 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 0.98 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; use Encode; use Encode::CN::HZ; @@ -58,7 +58,7 @@ also contains extra Taiwan-based encodings. ASCII part (0x00-0x7f) is preserved for all encodings, even though it conflicts with mappings by the Unicode Consortium. See -F +L to find why it is implemented that way. diff --git a/ext/Encode/Changes b/ext/Encode/Changes index a981280638..0054d256b9 100644 --- a/ext/Encode/Changes +++ b/ext/Encode/Changes @@ -1,8 +1,62 @@ # Revision history for Perl extension Encode. # -# $Id: Changes,v 0.97 2002/03/23 20:24:42 dankogai Exp dankogai $ +# $Id: Changes,v 0.98 2002/03/24 15:43:37 dankogai Exp dankogai $ # +0.98 Mon Mar 25 2002 +! lib/Encode/Supported.pod + Further pod fixes ++ lib/Encode/JP/ISO_2022_JP_1.pm +! lib/Encode/JP/ISO_2022_JP.pm +! lib/Encode/JP/JIS.pm +! JP/JP.pm + Now Encode::JP is more strict on the difference between ISO-2022-JP + and ISO-2022-JP-1. See JP/JP.pm for details. I hope this move + makes Anton happier :) FYI the previous version implements + ISO-2022-JP as ISO-2022-JP-1 since it had X0212 support. +! lib/Encode/Supported.pod + Further pod fixes +! 
Encode.xs + Avoid core-dump in Encode with PERLIO=mmap by NI-S + Message-Id: <20020324104139.1326.7@bactrian.ni-s.u-net.com> +! CN/CN.pm +! JP/JP.pm +! KR/KR.pm +! TW/TW.pm +! lib/Encode/Supported.pod + pod fixes to replace F to L, + as suggested by Autrijus in: + Message-Id: <20020324083943.GA14901@not.autrijus.org> +! lib/Encode/Supported.pod + fixes and enhancements by Anton + Message-Id: <10632060120.20020324103753@motor.ru> +! lib/Encode/Alias.pm + > define_alias( qr/^GB[- ]?(\d+)$/i => '"gb$1"' ); + added. Suggested by Anton then deobfuscated by Autrijus + Message-Id: <20020324064455.GA3667@not.autrijus.org> +! compile + Further fix by Nicholas Clark + Message-Id: <20020323145840.GD304@Bagpuss.unfortu.net> +- lib/EncodeFormat.pod ++ lib/Encode/EncFormat.pod +! MANIFEST + File renamed as suggested by Autrijus +! Encode.pm +! lib/Encode/Details.pod +! lib/Encode/Supported.pod Sun Mar 24 13:29:35 2002 +! Encode.pm Sun Mar 24 13:43:47 2002 + pod fixes by Autrijus. + Message-Id: <20020324062804.GA3595@not.autrijus.org> + Message-Id: <20020324075627.GB11986@not.autrijus.org> +! t/Aliases.t +! lib/Encode/Alias.pm +! Encode.pm + now more EBCDIC conscious; + %ExtModule on EBCDIC system excludes CJK so that you don't + have to worry about the matched alias resulting cloaking. + t/Aliases.t also revised to reflect changes. Verified by jhi + Message-Id: <20020324022929.D22596@alpha.hut.fi> + 0.97 Sun Mar 24 2002 ! CN/CN.pm !
KR/KR.pm diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm index 7886c63826..39953d0de8 100644 --- a/ext/Encode/Encode.pm +++ b/ext/Encode/Encode.pm @@ -1,6 +1,6 @@ package Encode; use strict; -our $VERSION = do { my @r = (q$Revision: 0.97 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 0.98 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; our $DEBUG = 0; require DynaLoader; @@ -37,6 +37,7 @@ bootstrap Encode (); use Carp; +our $ON_EBCDIC = (ord("A") == 193); use Encode::Alias; # Make a %Encoding package variable to allow a certain amount of cheating @@ -51,27 +52,6 @@ our %ExtModule = 'posix-bc' => 'Encode/EBCDIC.pm', symbol => 'Encode/Symbol.pm', dingbats => 'Encode/Symbol.pm', - 'euc-cn' => 'Encode/CN.pm', - gb2312 => 'Encode/CN.pm', - gb12345 => 'Encode/CN.pm', - gbk => 'Encode/CN.pm', - cp936 => 'Encode/CN.pm', - 'iso-ir-165' => 'Encode/CN.pm', - 'euc-jp' => 'Encode/JP.pm', - 'iso-2022-jp' => 'Encode/JP.pm', - '7bit-jis' => 'Encode/JP.pm', - shiftjis => 'Encode/JP.pm', - macjapan => 'Encode/JP.pm', - cp932 => 'Encode/JP.pm', - 'euc-kr' => 'Encode/KR.pm', - ksc5601 => 'Encode/KR.pm', - cp949 => 'Encode/KR.pm', - big5 => 'Encode/TW.pm', - 'big5-hkscs' => 'Encode/TW.pm', - cp950 => 'Encode/TW.pm', - gb18030 => 'Encode/HanExtra.pm', - big5plus => 'Encode/HanExtra.pm', - 'euc-tw' => 'Encode/HanExtra.pm', ); for my $k (2..11,13..16){ @@ -82,6 +62,34 @@ for my $k (1250..1258){ $ExtModule{"cp$k"} = 'Encode/Byte.pm'; } +unless ($ON_EBCDIC) { # CJK added to autoload unless EBCDIC env +%ExtModule =( + %ExtModule, + 'euc-cn' => 'Encode/CN.pm', + gb2312 => 'Encode/CN.pm', + gb12345 => 'Encode/CN.pm', + gbk => 'Encode/CN.pm', + cp936 => 'Encode/CN.pm', + 'iso-ir-165' => 'Encode/CN.pm', + 'euc-jp' => 'Encode/JP.pm', + 'iso-2022-jp' => 'Encode/JP.pm', + 'iso-2022-jp-1' => 'Encode/JP.pm', + '7bit-jis' => 'Encode/JP.pm', + shiftjis => 'Encode/JP.pm', + macjapan => 'Encode/JP.pm', + cp932 => 'Encode/JP.pm', + 'euc-kr' => 
'Encode/KR.pm', + ksc5601 => 'Encode/KR.pm', + cp949 => 'Encode/KR.pm', + big5 => 'Encode/TW.pm', + 'big5-hkscs' => 'Encode/TW.pm', + cp950 => 'Encode/TW.pm', + gb18030 => 'Encode/HanExtra.pm', + big5plus => 'Encode/HanExtra.pm', + 'euc-tw' => 'Encode/HanExtra.pm', + ); +} + for my $k (qw(centeuro croatian cyrillic dingbats greek iceland roman rumanian sami thai turkish ukraine)) @@ -234,7 +242,7 @@ The C module provides the interfaces between Perl's strings and the rest of the system. Perl strings are sequences of B. To find more about character encodings, please consult -L . This document focuses on programming references. +L. This document focuses on programming references. =head1 PERL ENCODING API @@ -242,9 +250,7 @@ L . This document focuses on programming references. =over 4 -=item * - - $bytes = encode(ENCODING, $string[, CHECK]) +=item $bytes = encode(ENCODING, $string[, CHECK]) Encodes string from Perl's internal form into I and returns a sequence of octets. For CHECK see L. @@ -254,9 +260,7 @@ to octets: $octets = encode("utf8", $unicode); -=item * - - $string = decode(ENCODING, $bytes[, CHECK]) +=item $string = decode(ENCODING, $bytes[, CHECK]) Decode sequence of octets assumed to be in I into Perl's internal form and returns the resulting string. For CHECK see @@ -266,9 +270,7 @@ For example to convert ISO-8859-1 data to UTF-8: $utf8 = decode("latin1", $latin1); -=item * - - from_to($string, FROM_ENCODING, TO_ENCODING[, CHECK]) +=item from_to($string, FROM_ENCODING, TO_ENCODING[, CHECK]) Convert B the data between two encodings. How did the data in $string originally get to be in FROM_ENCODING? Either using @@ -342,32 +344,28 @@ Hybrids of above. Multiple return values rather than in-place modifications. -Index into the string could be pos($str) allowing s/\G...//. +Index into the string could be C allowing C. 
=back =head2 UTF-8 / utf8 The Unicode consortium defines the UTF-8 standard as a way of encoding -the entire Unicode repertiore as sequences of octets. This encoding is -expected to become very widespread. Perl can use this form internaly +the entire Unicode repertoire as sequences of octets. This encoding is +expected to become very widespread. Perl can use this form internally to represent strings, so conversions to and from this form are particularly efficient (as octets in memory do not have to change, just the meta-data that tells Perl how to treat them). =over 4 -=item * - - $bytes = encode_utf8($string); +=item $bytes = encode_utf8($string); The characters that comprise string are encoded in Perl's superset of UTF-8 and the resulting octets returned as a sequence of bytes. All possible characters have a UTF-8 representation so this function cannot fail. -=item * - - $string = decode_utf8($bytes [,CHECK]); +=item $string = decode_utf8($bytes [, CHECK]); The sequence of octets represented by $bytes is decoded from UTF-8 into a sequence of logical characters. Not all sequences of octets @@ -391,16 +389,17 @@ Or you can give the name of specific module. @with_jp = Encode->encodings("Encode/JP.pm"); -Note in this case you have to say "Encode/JP.pm instead of Encode::JP. +Note in this case you have to say C<"Encode/JP.pm"> instead of +C<"Encode::JP">. -To find which encodings are suppoted by this package in details, +To find which encodings are supported by this package in details, see L. =head2 Defining Aliases use Encode; use Encode::Alias; - define_alias( newName => ENCODING); + define_alias(newName => ENCODING); Allows newName to be used as am alias for ENCODING. ENCODING may be either the name of an encoding or and encoding object (as above). @@ -410,7 +409,7 @@ See L on details. 
=head1 Defining Encodings use Encode qw(define_alias); - define_encoding( $object, 'canonicalName' [,alias...]); + define_encoding($object, 'canonicalName' [, alias...]); Causes I to be associated with I<$object>. The object should provide the interface described in L @@ -490,15 +489,13 @@ implementation. As such they are efficient, but may change. =over 4 -=item * is_utf8(STRING [, CHECK]) +=item is_utf8(STRING [, CHECK]) [INTERNAL] Test whether the UTF-8 flag is turned on in the STRING. If CHECK is true, also checks the data in STRING for being well-formed UTF-8. Returns true if successful, false otherwise. -=item * - - _utf8_on(STRING) +=item _utf8_on(STRING) [INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is B checked for being well-formed UTF-8. Do not use unless you @@ -506,9 +503,7 @@ B that the STRING is well-formed UTF-8. Returns the previous state of the UTF-8 flag (so please don't test the return value as I success or failure), or C if STRING is not a string. -=item * - - _utf8_off(STRING) +=item _utf8_off(STRING) [INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously. Returns the previous state of the UTF-8 flag (so please don't test the diff --git a/ext/Encode/JP/JP.pm b/ext/Encode/JP/JP.pm index cff0d98c1d..c4cbac152a 100644 --- a/ext/Encode/JP/JP.pm +++ b/ext/Encode/JP/JP.pm @@ -5,13 +5,14 @@ BEGIN { } } use Encode; -our $VERSION = do { my @r = (q$Revision: 0.96 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 0.98 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; use XSLoader; XSLoader::load('Encode::JP',$VERSION); use Encode::JP::JIS; use Encode::JP::ISO_2022_JP; +use Encode::JP::ISO_2022_JP_1; 1; __END__ @@ -41,6 +42,9 @@ supported are as follows. iso-2022-jp ISO-2022-JP (7bit JIS with all Halfwidth Kana converted to Fullwidth) + iso-2022-jp-1 ISO-2022-JP-1 + (ISO-2022-JP with JIS X 0212-1990 + support. 
See below) macjapan Mac Japan (Shift JIS + Apple vendor mappings) cp932 Code Page 932 (Shift JIS + MS/IBM vendor mappings) -------------------------------------------------------------------- @@ -49,12 +53,34 @@ supported are as follows. To find how to use this module in detail, see L. +=head1 Note on ISO-2022-JP(-1)? + +ISO-2022-JP-1 (RFC2237) is a superset of ISO-2022-JP (RFC1468) which +adds support for JIS X 0212-1990. That means you can use the same +code to decode to utf8 but not vice versa. + + $utf8 = decode('iso-2022-jp-1', $stream); + $utf8 = decode('iso-2022-jp', $stream); + +Yields the same result but + + $with_0212 = encode('iso-2022-jp-1', $utf8); + +is now different from + + $without_0212 = encode('iso-2022-jp', $utf8 ); + +In the latter case, characters that map to 0212 are at first converted +to U+3013 (0xA2AE in EUC-JP; a white square also known as 'Tofu') then +fed to decoding engine. U+FFFD is not used to preserve text layout as +much as possible. + =head1 BUGS ASCII part (0x00-0x7f) is preserved for all encodings, even though it conflicts with mappings by the Unicode Consortium. See -F +L to find why it is implemented that way. diff --git a/ext/Encode/KR/KR.pm b/ext/Encode/KR/KR.pm index 7dcafd0441..9e2e1d3d77 100644 --- a/ext/Encode/KR/KR.pm +++ b/ext/Encode/KR/KR.pm @@ -4,7 +4,7 @@ BEGIN { die "Encode::KR not supported on EBCDIC\n"; } } -our $VERSION = do { my @r = (q$Revision: 0.97 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 0.98 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; use Encode; use XSLoader; @@ -46,7 +46,7 @@ The C (two-byte combination code) encoding is not supported. ASCII part (0x00-0x7f) is preserved for all encodings, even though it conflicts with mappings by the Unicode Consortium. See -F +L to find why it is implemented that way. 
diff --git a/ext/Encode/MANIFEST b/ext/Encode/MANIFEST index 24adacad5c..79ea273241 100644 --- a/ext/Encode/MANIFEST +++ b/ext/Encode/MANIFEST @@ -169,6 +169,7 @@ lib/Encode/Internal.pm Encode extension lib/Encode/JP/Constants.pm Encode extension lib/Encode/JP/H2Z.pm Encode extension lib/Encode/JP/ISO_2022_JP.pm Encode extension +lib/Encode/JP/ISO_2022_JP_1.pm Encode extension lib/Encode/JP/JIS.pm Encode extension lib/Encode/Supported.pod Documents supported encodings lib/Encode/Tcl.pm Encode extension @@ -179,7 +180,7 @@ lib/Encode/XS.pm Encode extension lib/Encode/iso10646_1.pm Encode extension lib/Encode/ucs2_le.pm Encode extension lib/Encode/utf8.pm Encode extension -lib/EncodeFormat.pod Encode extension +lib/Encode/EncFormat.pod Encode extension t/Aliases.t Encode extension test t/CN.t Encode extension test t/Encode.t Encode extension test diff --git a/ext/Encode/TW/TW.pm b/ext/Encode/TW/TW.pm index b44c8d2acb..d1f85c57fb 100644 --- a/ext/Encode/TW/TW.pm +++ b/ext/Encode/TW/TW.pm @@ -4,7 +4,7 @@ BEGIN { die "Encode::TW not supported on EBCDIC\n"; } } -our $VERSION = do { my @r = (q$Revision: 0.97 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 0.98 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; use Encode; use XSLoader; @@ -54,7 +54,7 @@ plane 1-7. ASCII part (0x00-0x7f) is preserved for all encodings, even though it conflicts with mappings by the Unicode Consortium. See -F +L to find why it is implemented that way. 
diff --git a/ext/Encode/lib/Encode/Alias.pm b/ext/Encode/lib/Encode/Alias.pm index 5f7d34575a..2a97261684 100644 --- a/ext/Encode/lib/Encode/Alias.pm +++ b/ext/Encode/lib/Encode/Alias.pm @@ -1,7 +1,7 @@ package Encode::Alias; use strict; use Encode; -our $VERSION = do { my @r = (q$Revision: 0.96 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 0.98 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; our $DEBUG = 0; require Exporter; @@ -31,11 +31,13 @@ sub find_alias my $new; if (ref($alias) eq 'Regexp' && $_ =~ $alias) { + $DEBUG and warn "eval $val"; $new = eval $val; # $@ and warn "$val, $@"; } elsif (ref($alias) eq 'CODE') { + $DEBUG and warn "$alias", "->", "($val)"; $new = $alias->($val); } elsif (lc($_) eq lc($alias)) @@ -45,6 +47,7 @@ sub find_alias if (defined($new)) { next if $new eq $_; # avoid (direct) recursion on bugs + $DEBUG and warn "$alias, $new"; my $enc = (ref($new)) ? $new : Encode::find_encoding($new); if ($enc) { @@ -54,6 +57,15 @@ sub find_alias } } } + if ($DEBUG){ + my $name; + if (my $e = $Alias{$_}){ + $name = $e->name; + }else{ + $name = ""; + } + warn "find_alias($class, $_)->name = $name"; + } return $Alias{$_}; } @@ -69,15 +81,17 @@ sub define_alias for my $k (@a){ if (ref($alias) eq 'Regexp' && $k =~ $alias) { - $DEBUG and warn $k; + $DEBUG and warn "delete \$Alias\{$k\}"; delete $Alias{$k}; } elsif (ref($alias) eq 'CODE') { + $DEBUG and warn "delete \$Alias\{$k\}"; delete $Alias{$alias->($name)}; } } }else{ + $DEBUG and warn "delete \$Alias\{$alias\}"; delete $Alias{$alias}; } } @@ -154,29 +168,29 @@ sub init_aliases define_alias( qr/^macRomanian$/i => '"macRumanian"'); # Standardize on the dashed versions. 
- define_alias( qr/^utf8$/i => 'utf-8' ); + # define_alias( qr/^utf8$/i => 'utf-8' ); define_alias( qr/^koi8r$/i => 'koi8-r' ); define_alias( qr/^koi8u$/i => 'koi8-u' ); -# for Encode::CN - define_alias( qr/euc.*cn$/i => '"euc-cn"' ); - define_alias( qr/cn.*euc/i => '"euc-cn"' ); - -# for Encode::JP - define_alias( qr/euc.*jp$/i => '"euc-jp"' ); - define_alias( qr/jp.*euc/i => '"euc-jp"' ); - define_alias( qr/ujis$/i => '"euc-jp"' ); - define_alias( qr/shift.*jis$/i => '"shiftjis"' ); - define_alias( qr/sjis$/i => '"shiftjis"' ); - define_alias( qr/^jis$/i => '"7bit-jis"' ); - -# for Encode::KR - define_alias( qr/euc.*kr$/i => '"euc-kr"' ); - define_alias( qr/kr.*euc/i => '"euc-kr"' ); - -# for Encode::TW - define_alias( qr/big-?5$/i => '"big5"' ); - define_alias( qr/big5-hk(?:scs)?/i => '"big5-hkscs"' ); + unless ($Encode::ON_EBCDIC){ + # for Encode::CN + define_alias( qr/euc.*cn$/i => '"euc-cn"' ); + define_alias( qr/cn.*euc/i => '"euc-cn"' ); + define_alias( qr/^GB[- ]?(\d+)$/i => '"gb$1"' ); + # for Encode::JP + define_alias( qr/euc.*jp$/i => '"euc-jp"' ); + define_alias( qr/jp.*euc/i => '"euc-jp"' ); + define_alias( qr/ujis$/i => '"euc-jp"' ); + define_alias( qr/shift.*jis$/i => '"shiftjis"' ); + define_alias( qr/sjis$/i => '"shiftjis"' ); + define_alias( qr/^jis$/i => '"7bit-jis"' ); + # for Encode::KR + define_alias( qr/euc.*kr$/i => '"euc-kr"' ); + define_alias( qr/kr.*euc/i => '"euc-kr"' ); + # for Encode::TW + define_alias( qr/big-?5$/i => '"big5"' ); + define_alias( qr/big5-hk(?:scs)?/i => '"big5-hkscs"' ); + } # At last, Map white space and _ to '-' define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' ); diff --git a/ext/Encode/lib/Encode/Details.pod b/ext/Encode/lib/Encode/Details.pod index aa3a0af782..6721484808 100644 --- a/ext/Encode/lib/Encode/Details.pod +++ b/ext/Encode/lib/Encode/Details.pod @@ -1,11 +1,6 @@ - =head1 NAME -Encode - character encodings - -=head1 SYNOPSIS - - use Encode; +Encode::Details - implementation details of Encode.pm =head1 
DESCRIPTION @@ -19,7 +14,7 @@ codepoint" for the character (the exceptions are those platforms where the legacy encoding is some variant of EBCDIC rather than a super-set of ASCII - see L). -Traditionaly computer data has been moved around in 8-bit chunks +Traditionally computer data has been moved around in 8-bit chunks often called "bytes". These chunks are also known as "octets" in networking standards. Perl is widely used to manipulate data of many types - not only strings of characters representing human or @@ -92,7 +87,7 @@ encodings for East Asian languages. Not really very "encoded" encodings. The Unicode code points are just represented as 4-octet integers. None the less because different architectures use different representations of integers -(so called "endian") there at least two disctinct encodings. +(so called "endian") there at least two distinct encodings. =item * Multi-byte encodings @@ -265,7 +260,7 @@ Microsft proprietary. UTF-16 KOI8-U ISO-2022-JP-2 -are IANA-registered preferred MIME names but probably shoule +are IANA-registered preferred MIME names but probably should be avoided as encoding for web pages due to lack of browser support. @@ -412,25 +407,21 @@ Index into the string could be pos($str) allowing s/\G...//. =head2 UTF-8 / utf8 The Unicode consortium defines the UTF-8 standard as a way of encoding -the entire Unicode repertiore as sequences of octets. This encoding is -expected to become very widespread. Perl can use this form internaly +the entire Unicode repertoire as sequences of octets. This encoding is +expected to become very widespread. Perl can use this form internally to represent strings, so conversions to and from this form are particularly efficient (as octets in memory do not have to change, just the meta-data that tells Perl how to treat them). 
=over 4 -=item * - - $bytes = encode_utf8($string); +=item $bytes = encode_utf8($string); The characters that comprise string are encoded in Perl's superset of UTF-8 and the resulting octets returned as a sequence of bytes. All possible characters have a UTF-8 representation so this function cannot fail. -=item * - - $string = decode_utf8($bytes [,CHECK]); +=item $string = decode_utf8($bytes [,CHECK]); The sequence of octets represented by $bytes is decoded from UTF-8 into a sequence of logical characters. Not all sequences of octets @@ -505,10 +496,10 @@ Currently I can be specified in the following ways: define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' ); In this case if I is not a reference it is C-ed to -allow C<$1> etc. to be subsituted. The example is one way to names as +allow C<$1> etc. to be substituted. The example is one way to names as used in X11 font names to alias the MIME names for the iso-8859-* family. Note the double quote inside the single quote. If you are -using regex here, y ou have to do so or it won't work in this case. +using regex here, you have to do so or it won't work in this case. =item As a code reference, e.g.: @@ -622,15 +613,13 @@ implementation. As such they are efficient, but may change. =over 4 -=item * is_utf8(STRING [, CHECK]) +=item is_utf8(STRING [, CHECK]) [INTERNAL] Test whether the UTF-8 flag is turned on in the STRING. If CHECK is true, also checks the data in STRING for being well-formed UTF-8. Returns true if successful, false otherwise. -=item * - - _utf8_on(STRING) +=item _utf8_on(STRING) [INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is B checked for being well-formed UTF-8. Do not use unless you @@ -638,9 +627,7 @@ B that the STRING is well-formed UTF-8. Returns the previous state of the UTF-8 flag (so please don't test the return value as I success or failure), or C if STRING is not a string. 
-=item * - - _utf8_off(STRING) +=item _utf8_off(STRING) [INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously. Returns the previous state of the UTF-8 flag (so please don't test the @@ -816,6 +803,4 @@ to be rationalized. L, L, L, L, L, L, the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt> - =cut - diff --git a/ext/Encode/lib/Encode/EncFormat.pod b/ext/Encode/lib/Encode/EncFormat.pod new file mode 100644 index 0000000000..abb805709b --- /dev/null +++ b/ext/Encode/lib/Encode/EncFormat.pod @@ -0,0 +1,163 @@ +=head1 NAME + +Encode::EncFormat - the format of encoding tables of the Encode/*.enc files + +=head1 DESCRIPTION + +I + +Space would prohibit precompiling into Tcl every possible encoding +algorithm, so many encodings are stored on disk as dynamically-loadable +encoding files. This behavior also allows the user to create additional +encoding files that can be loaded using the same mechanism. These +encoding files contain information about the tables and/or escape +sequences used to map between an external encoding and Unicode. The +external encoding may consist of single-byte, multi-byte, or double-byte +characters. + +Each dynamically-loadable encoding is represented as a text file. The +initial line of the file, beginning with a ``#'' symbol, is a comment +that provides a human-readable description of the file. The next line +identifies the type of encoding file. It can be one of the following +letters: + +=over 4 + +=item [1] B + +A single-byte encoding, where one character is always one byte long in +the encoding. An example is B, used by many European languages. + +=item [2] B + +A double-byte encoding, where one character is always two bytes long in the +encoding. An example is B, used for Chinese text. + +=item [3] B + +A multi-byte encoding, where one character may be either one or two +bytes long. Certain bytes are lead bytes, indicating that another +byte must follow and that together the two bytes represent one +character.
Other bytes are not lead bytes and represent themselves. +An example is B, used by many Japanese computers. + +=item [4] B + +An escape-sequence encoding, specifying that certain sequences of +bytes do not represent characters, but commands that describe how +following bytes should be interpreted. + +=back + +The rest of the lines in the file depend on the type. + +Cases [1], [2], and [3] are collectively referred to as table-based +encoding files. The lines in a table-based encoding file are in the +same format as this example taken from the B encoding (this +is not the complete file): + + # Encoding file: shiftjis, multi-byte + M + 003F 0 40 + 00 + 0000000100020003000400050006000700080009000A000B000C000D000E000F + 0010001100120013001400150016001700180019001A001B001C001D001E001F + 0020002100220023002400250026002700280029002A002B002C002D002E002F + 0030003100320033003400350036003700380039003A003B003C003D003E003F + 0040004100420043004400450046004700480049004A004B004C004D004E004F + 0050005100520053005400550056005700580059005A005B005C005D005E005F + 0060006100620063006400650066006700680069006A006B006C006D006E006F + 0070007100720073007400750076007700780079007A007B007C007D203E007F + 0080000000000000000000000000000000000000000000000000000000000000 + 0000000000000000000000000000000000000000000000000000000000000000 + 0000FF61FF62FF63FF64FF65FF66FF67FF68FF69FF6AFF6BFF6CFF6DFF6EFF6F + FF70FF71FF72FF73FF74FF75FF76FF77FF78FF79FF7AFF7BFF7CFF7DFF7EFF7F + FF80FF81FF82FF83FF84FF85FF86FF87FF88FF89FF8AFF8BFF8CFF8DFF8EFF8F + FF90FF91FF92FF93FF94FF95FF96FF97FF98FF99FF9AFF9BFF9CFF9DFF9EFF9F + 0000000000000000000000000000000000000000000000000000000000000000 + 0000000000000000000000000000000000000000000000000000000000000000 + 81 + 0000000000000000000000000000000000000000000000000000000000000000 + 0000000000000000000000000000000000000000000000000000000000000000 + 0000000000000000000000000000000000000000000000000000000000000000 + 
0000000000000000000000000000000000000000000000000000000000000000 + 300030013002FF0CFF0E30FBFF1AFF1BFF1FFF01309B309C00B4FF4000A8FF3E + FFE3FF3F30FD30FE309D309E30034EDD30053006300730FC20152010FF0F005C + 301C2016FF5C2026202520182019201C201DFF08FF0930143015FF3BFF3DFF5B + FF5D30083009300A300B300C300D300E300F30103011FF0B221200B100D70000 + 00F7FF1D2260FF1CFF1E22662267221E22342642264000B0203220332103FFE5 + FF0400A200A3FF05FF03FF06FF0AFF2000A72606260525CB25CF25CE25C725C6 + 25A125A025B325B225BD25BC203B301221922190219121933013000000000000 + 000000000000000000000000000000002208220B2286228722822283222A2229 + 000000000000000000000000000000002227222800AC21D221D4220022030000 + 0000000000000000000000000000000000000000222022A52312220222072261 + 2252226A226B221A223D221D2235222B222C0000000000000000000000000000 + 212B2030266F266D266A2020202100B6000000000000000025EF000000000000 + +The third line of the file is three numbers. The first number is the +fallback character (in base 16) to use when converting from UTF-8 to +this encoding. The second number is a B<1> if this file represents +the encoding for a symbol font, or B<0> otherwise. The last number +(in base 10) is how many pages of data follow. + +Subsequent lines in the example above are pages that describe how to +map from the encoding into 2-byte Unicode. The first line in a page +identifies the page number. Following it are 256 double-byte numbers, +arranged as 16 rows of 16 numbers. Given a character in the encoding, +the high byte of that character is used to select which page, and the +low byte of that character is used as an index to select one of the +double-byte numbers in that page - the value obtained being the +corresponding Unicode character. By examination of the example above, +one can see that the characters 0x7E and 0x8163 in B map to +203E and 2026 in Unicode, respectively. 
+ +Following the first page will be all the other pages, each in the same +format as the first: one number identifying the page followed by 256 +double-byte Unicode characters. If a character in the encoding maps +to the Unicode character 0000, it means that the character doesn't +actually exist. If all characters on a page would map to 0000, that +page can be omitted. + +Case [4] is the escape-sequence encoding file. The lines in this +type of file are in the same format as this example taken from the +B encoding: + + # Encoding file: iso2022-jp, escape-driven + E + init {} + final {} + iso8859-1 \\x1b(B + jis0201 \\x1b(J + jis0208 \\x1b$@ + jis0208 \\x1b$B + jis0212 \\x1b$(D + gb2312 \\x1b$A + ksc5601 \\x1b$(C + +In the file, the first column represents an option and the second +column is the associated value. B is a string to emit or expect +before the first character is converted, while B is a string to +emit or expect after the last character. All other options are names +of table-based encodings; the associated value is the escape-sequence +that marks that encoding. Tcl syntax is used for the values; in the +above example, for instance, ``B<{}>'' represents the empty string and +``B<\\x1b>'' represents character 27. + +B +When B encounters an encoding I that has not +been loaded, it attempts to load an encoding file called +IB<.enc> from the B subdirectory of each directory +specified in the library path B<$tcl_libPath>. If the encoding file +exists, but is malformed, an error message will be left in I. + +=head1 KEYWORDS + +utf, encoding, convert + +=head1 COPYRIGHT + + # Copyright (c) 1997-1998 Sun Microsystems, Inc. + # See the file "license.terms" for information on usage and redistribution + # of this file, and for a DISCLAIMER OF ALL WARRANTIES.
diff --git a/ext/Encode/lib/Encode/JP/ISO_2022_JP.pm b/ext/Encode/lib/Encode/JP/ISO_2022_JP.pm index 388be5f247..29df75054b 100644 --- a/ext/Encode/lib/Encode/JP/ISO_2022_JP.pm +++ b/ext/Encode/lib/Encode/JP/ISO_2022_JP.pm @@ -5,7 +5,7 @@ use Encode::JP::H2Z; use base 'Encode::Encoding'; use vars qw($VERSION); -$VERSION = do { my @r = (q$Revision: 0.94 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +$VERSION = do { my @r = (q$Revision: 0.98 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; my $canon = 'iso-2022-jp'; my $obj = bless {name => $canon}, __PACKAGE__; @@ -31,7 +31,7 @@ sub encode my ($obj,$str,$chk) = @_; my $euc = Encode::encode('euc-jp', $str, $chk); &Encode::JP::H2Z::h2z(\$euc); - return &Encode::JP::JIS::euc_jis(\$euc); + return &Encode::JP::JIS::euc_jis_nox0212(\$euc); } 1; diff --git a/ext/Encode/lib/Encode/JP/ISO_2022_JP_1.pm b/ext/Encode/lib/Encode/JP/ISO_2022_JP_1.pm new file mode 100644 index 0000000000..9b1c3191e7 --- /dev/null +++ b/ext/Encode/lib/Encode/JP/ISO_2022_JP_1.pm @@ -0,0 +1,38 @@ +package Encode::JP::ISO_2022_JP_1; +use Encode::JP; +use Encode::JP::JIS; +use Encode::JP::H2Z; +use base 'Encode::Encoding'; + +use vars qw($VERSION); +$VERSION = do { my @r = (q$Revision: 0.98 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; + +my $canon = 'iso-2022-jp-1'; +my $obj = bless {name => $canon}, __PACKAGE__; +$obj->Define($canon); + +sub name { return $_[0]->{name}; } + +# +# decode is identical to 7bit-jis +# + +sub decode +{ + my ($obj,$str,$chk) = @_; + return Encode::decode('7bit-jis', $str, $chk); +} + +# iso-2022-jp = 7bit-jis with all x201 (Hankaku) converted to +# x208 equivalent (Zenkaku) + +sub encode +{ + my ($obj,$str,$chk) = @_; + my $euc = Encode::encode('euc-jp', $str, $chk); + &Encode::JP::H2Z::h2z(\$euc); + return &Encode::JP::JIS::euc_jis(\$euc); +} + +1; +__END__ diff --git a/ext/Encode/lib/Encode/JP/JIS.pm b/ext/Encode/lib/Encode/JP/JIS.pm index 6e6dd0fd24..86878216f9 100644 --- a/ext/Encode/lib/Encode/JP/JIS.pm +++ 
b/ext/Encode/lib/Encode/JP/JIS.pm @@ -5,7 +5,7 @@ use base 'Encode::Encoding'; use strict; use vars qw($VERSION); -$VERSION = do { my @r = (q$Revision: 0.94 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +$VERSION = do { my @r = (q$Revision: 0.98 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; # Just for the time being, we implement jis-7bit # encoding via EUC @@ -77,5 +77,11 @@ sub euc_jis{ $$r_str; } +sub euc_jis_nox0212{ + my $r_str = shift; + $$r_str =~ s/$RE{EUC_0212}/$CHARCODE{UNDEF_EUC}/go; + euc_jis($r_str); +} + 1; __END__ diff --git a/ext/Encode/lib/Encode/Supported.pod b/ext/Encode/lib/Encode/Supported.pod index d48d14d342..1a9f88ed01 100644 --- a/ext/Encode/lib/Encode/Supported.pod +++ b/ext/Encode/lib/Encode/Supported.pod @@ -24,7 +24,7 @@ once an operation is in progress. As of Perl 5.8.0, at least the following encodings are recognized. Note that unless otherwise specified, they are all case insensitive -(via alias) and all occurance of spaces are replaced with '-'. In +(via alias) and all occurrences of spaces are replaced with '-'. In other words, "ISO 8859 1" and "iso-8859-1" are identical. Encodings are categorized and implemented in several different modules @@ -51,12 +51,12 @@ extended ASCII. For most cases it uses \x80-\xff (upper half) to map non-ASCII characters. ----------------------- - iso-8859-1 latin + (iso-8859-1 is in built-in) iso-8859-2 latin2 iso-8859-3 latin3 iso-8859-4 latin4 - iso-8859-5 latin - iso-8859-6 latin + iso-8859-5 + iso-8859-6 iso-8859-7 iso-8859-8 iso-8859-9 latin5 @@ -102,8 +102,9 @@ non-ASCII characters. =head2 The CJK: Chinese, Japanese, Korean (Multibyte) Note Vietnamese is listed above. Also read "Encoding vs Charset" -below. Also note these are impelemented in distinct module by -languages, due the the size concerns. See these perldocs also. +below. Also note these are implemented in distinct modules by +languages, due to the size concerns. Please also refer to their +respective document pages.
=over 4 @@ -125,6 +126,7 @@ languages, due the the size concerns. See these perldocs also. cp932 euc-jp ujis iso-2022-jp + iso-2022-jp-1 macjapan shiftjis Shift_JIS, sjis ----------------------- @@ -172,7 +174,7 @@ See perlebcdic for details. posix-bc ----------------------- -=item Enocode::Symbols +=item Encode::Symbols For symbols and dingbats. @@ -193,70 +195,105 @@ Charset determines which characters to be included in a given text. Encoding actually maps charset(s) to stream of bits. -Note a given encoding contains multiple charsets. For instance, -euc-jp contains ASCII, JIS X 0201 (Hankaku Kana), JIS X 0208 (Zenkaku -Kana and Kanji) and JIS X 0212 (Extended Kanji) in a single encoding. +Note a given encoding may contain multiple charsets and complex CJK +encodings are usually implemented that way. + +For instance, euc-jp contains ASCII, JIS X 0201-1978 (Hankaku Kana), +JIS X 0208-1997 (Zenkaku Kana and Kanji) and JIS X 0212-1990 (Extended +Kanji) in a single encoding. As the name suggests, the Encode module supports encodings, not individual charsets. -=head1 Encoding Classification (by Anton Tagunov) +=head1 Encoding Classification (by Anton Tagunov and Dan Kogai) + +This section tries to classify the supported encodings by their +applicability for information exchange over the Internet and to +choose the most suitable aliases to name them in the context of +such communication. + +Encoding names -Encodings + US-ASCII UTF-8 + ISO-8859-* KOI8-R + Shift_JIS EUC-JP ISO-2022-JP ISO-2022-JP-1 + EUC-KR + Big5 - US-ASCII UTF-8 KOI8-R ISO-8859-* - ISO-2022-CN ISO-2022-JP Big5 - EUC-CN EUC-JP EUC-KR +are L-registered as +preferred MIME names and may probably be used over the Internet. -are -registered as -preferred MIME names and may probably be used over the Internet. So is +C<Shift_JIS> is no longer Microsoft proprietary since it has been +officialized by JIS X 0208-1997. It is probably the most +widespread encoding for Japanese on the Internet.
- Shift_JIS + EUC-CN -but despite its wide spread it bears the label of being -Microsft proprietary -- was. Now Shift JIS is official as of -JIS X 0208-1997. +has not been registered with IANA (as of March 2002) but +seems to be supported by major web browsers. (IANA has registered +this encoding as C, but C currently has a different +meaning to the C module. It will probably become an alias to +C in the future; until then it is safer to avoid using +C as encoding name within Perl). - UTF-16 KOI8-U + UTF-16 + KOI8-U (http://www.faqs.org/rfcs/rfc2319.html) -are IANA-registered preferred MIME names but probably -shoule be avoided as encoding for web pages due to lack of -browser support. +are IANA-registered (C even as a preferred MIME name) +but probably should be avoided as encoding for web pages due to +lack of browser support. - ISO-2022 (http://www.ecma.ch/ecma1/STAND/ECMA-035.HTM) - ISO-2022-JP-1 (http://www.faqs.org/rfcs/rfc2237.html) ISO-IR-165 (http://www.faqs.org/rfcs/rfc1345.html) GBK VISCII - GB 12345 (only plains 1 and 2 available) - GB 18030 - CNS 11643 + GB 12345 + GB 18030 (*) (see links below) + EUC-TW (*) are totally valid encodings but not registered at IANA. +The names under which they are listed here are probably the +most widely-known names for these encodings and are recommended +names. + + +=for comment this used to be listed as supported but - BIG5PLUS - EUC-JP-0212 (Encode::lib::Encode::Tcl::Extended) +do not work @15457 when it's clear they will be uncommented +or deleted - Anton +ISO-2022 (http://www.ecma.ch/ecma1/STAND/ECMA-035.HTM) +CNS 11643 (only planes 1 and 2 available) -are a bit proprietary + BIG5PLUS (*) + +is a bit proprietary name. C<(*)>-marked encodings belong to +C available from CPAN.
You may probably get some info on CJK encodings at brief description for most of the mentioned CJK encodings - -F +L several years old, but still useful - -F +L and some in-depth reading for the heroes :-) -F (eq ISO-2022) +L (eq C) + +gives brief info on C, C and mostly on C +F + +The nature of information in this section is most fragile and +error-prone; I is the most popular adverb :) +Please feel free to send your comments, disagreements and +additions to L<...>. (Note however, +that the mission of this document is to cover the +C-supported encodings only. =head1 See Also L, L, -L, L, L, L +L, L, L, L, L, L =cut diff --git a/ext/Encode/lib/EncodeFormat.pod b/ext/Encode/lib/EncodeFormat.pod deleted file mode 100644 index 3a1269dd04..0000000000 --- a/ext/Encode/lib/EncodeFormat.pod +++ /dev/null @@ -1,163 +0,0 @@ -=head1 NAME - -EncodeFormat - the format of encoding tables of the Encode extension - -=head1 DESCRIPTION - -I - -Space would prohibit precompiling into Tcl every possible encoding -algorithm, so many encodings are stored on disk as dynamically-loadable -encoding files. This behavior also allows the user to create additional -encoding files that can be loaded using the same mechanism. These -encoding files contain information about the tables and/or escape -sequences used to map between an external encoding and Unicode. The -external encoding may consist of single-byte, multi-byte, or double-byte -characters. - -Each dynamically-loadable encoding is represented as a text file. The -initial line of the file, beginning with a ``#'' symbol, is a comment -that provides a human-readable description of the file. The next line -identifies the type of encoding file. It can be one of the following -letters: - -=over 4 - -=item [1] B - -A single-byte encoding, where one character is always one byte long in -the encoding. An example is B, used by many European languages. 
- -=item [2] B - -A double-byte encoding, where one character is always two bytes long in the -encoding. An example is B, used for Chinese text. - -=item [3] B - -A multi-byte encoding, where one character may be either one or two -bytes long. Certain bytes are a lead bytes, indicating that another -byte must follow and that together the two bytes represent one -character. Other bytes are not lead bytes and represent themselves. -An example is B, used by many Japanese computers. - -=item [4] B - -An escape-sequence encoding, specifying that certain sequences of -bytes do not represent characters, but commands that describe how -following bytes should be interpreted. - -=back - -The rest of the lines in the file depend on the type. - -Cases [1], [2], and [3] are collectively referred to as table-based -encoding files. The lines in a table-based encoding file are in the -same format as this example taken from the B encoding (this -is not the complete file): - - # Encoding file: shiftjis, multi-byte - M - 003F 0 40 - 00 - 0000000100020003000400050006000700080009000A000B000C000D000E000F - 0010001100120013001400150016001700180019001A001B001C001D001E001F - 0020002100220023002400250026002700280029002A002B002C002D002E002F - 0030003100320033003400350036003700380039003A003B003C003D003E003F - 0040004100420043004400450046004700480049004A004B004C004D004E004F - 0050005100520053005400550056005700580059005A005B005C005D005E005F - 0060006100620063006400650066006700680069006A006B006C006D006E006F - 0070007100720073007400750076007700780079007A007B007C007D203E007F - 0080000000000000000000000000000000000000000000000000000000000000 - 0000000000000000000000000000000000000000000000000000000000000000 - 0000FF61FF62FF63FF64FF65FF66FF67FF68FF69FF6AFF6BFF6CFF6DFF6EFF6F - FF70FF71FF72FF73FF74FF75FF76FF77FF78FF79FF7AFF7BFF7CFF7DFF7EFF7F - FF80FF81FF82FF83FF84FF85FF86FF87FF88FF89FF8AFF8BFF8CFF8DFF8EFF8F - FF90FF91FF92FF93FF94FF95FF96FF97FF98FF99FF9AFF9BFF9CFF9DFF9EFF9F - 
0000000000000000000000000000000000000000000000000000000000000000 - 0000000000000000000000000000000000000000000000000000000000000000 - 81 - 0000000000000000000000000000000000000000000000000000000000000000 - 0000000000000000000000000000000000000000000000000000000000000000 - 0000000000000000000000000000000000000000000000000000000000000000 - 0000000000000000000000000000000000000000000000000000000000000000 - 300030013002FF0CFF0E30FBFF1AFF1BFF1FFF01309B309C00B4FF4000A8FF3E - FFE3FF3F30FD30FE309D309E30034EDD30053006300730FC20152010FF0F005C - 301C2016FF5C2026202520182019201C201DFF08FF0930143015FF3BFF3DFF5B - FF5D30083009300A300B300C300D300E300F30103011FF0B221200B100D70000 - 00F7FF1D2260FF1CFF1E22662267221E22342642264000B0203220332103FFE5 - FF0400A200A3FF05FF03FF06FF0AFF2000A72606260525CB25CF25CE25C725C6 - 25A125A025B325B225BD25BC203B301221922190219121933013000000000000 - 000000000000000000000000000000002208220B2286228722822283222A2229 - 000000000000000000000000000000002227222800AC21D221D4220022030000 - 0000000000000000000000000000000000000000222022A52312220222072261 - 2252226A226B221A223D221D2235222B222C0000000000000000000000000000 - 212B2030266F266D266A2020202100B6000000000000000025EF000000000000 - -The third line of the file is three numbers. The first number is the -fallback character (in base 16) to use when converting from UTF-8 to -this encoding. The second number is a B<1> if this file represents -the encoding for a symbol font, or B<0> otherwise. The last number -(in base 10) is how many pages of data follow. - -Subsequent lines in the example above are pages that describe how to -map from the encoding into 2-byte Unicode. The first line in a page -identifies the page number. Following it are 256 double-byte numbers, -arranged as 16 rows of 16 numbers. 
Given a character in the encoding, -the high byte of that character is used to select which page, and the -low byte of that character is used as an index to select one of the -double-byte numbers in that page - the value obtained being the -corresponding Unicode character. By examination of the example above, -one can see that the characters 0x7E and 0x8163 in B map to -203E and 2026 in Unicode, respectively. - -Following the first page will be all the other pages, each in the same -format as the first: one number identifying the page followed by 256 -double-byte Unicode characters. If a character in the encoding maps -to the Unicode character 0000, it means that the character doesn't -actually exist. If all characters on a page would map to 0000, that -page can be omitted. - -Case [4] is the escape-sequence encoding file. The lines in an this -type of file are in the same format as this example taken from the -B encoding: - - # Encoding file: iso2022-jp, escape-driven - E - init {} - final {} - iso8859-1 \\x1b(B - jis0201 \\x1b(J - jis0208 \\x1b$@ - jis0208 \\x1b$B - jis0212 \\x1b$(D - gb2312 \\x1b$A - ksc5601 \\x1b$(C - -In the file, the first column represents an option and the second -column is the associated value. B is a string to emit or expect -before the first character is converted, while B is a string to -emit or expect after the last character. All other options are names -of table-based encodings; the associated value is the escape-sequence -that marks that encoding. Tcl syntax is used for the values; in the -above example, for instance, ``B<{}>'' represents the empty string and -``B<\\x1b>'' represents character 27. - -B -When B encounters an encoding I that has not -been loaded, it attempts to load an encoding file called -IB<.enc> from the B subdirectory of each directory -specified in the library path B<$tcl_libPath>. If the encoding file -exists, but is malformed, an error message will be left in I. 
- -=head1 KEYWORDS - -utf, encoding, convert - -=head1 COPYRIGHT - - # Copyright (c) 1997-1998 Sun Microsystems, Inc. - # See the file "license.terms" for information on usage and redistribution - # of this file, and for a DISCLAIMER OF ALL WARRANTIES. diff --git a/ext/Encode/t/Aliases.t b/ext/Encode/t/Aliases.t index 3640f4b097..8fe298b0f7 100644 --- a/ext/Encode/t/Aliases.t +++ b/ext/Encode/t/Aliases.t @@ -3,66 +3,72 @@ use strict; use Encode; use Encode::Alias; +my %a2c; +my $ON_EBCDIC; BEGIN { - if (ord("A") == 193) { - print "1..0 # Skip: EBCDIC\n"; - exit 0; + $ON_EBCDIC = ord("A") == 193; + @ARGV and $ON_EBCDIC = $ARGV[0] eq 'EBCDIC'; + $Encode::ON_EBCDIC = $ON_EBCDIC; + + %a2c = ( + 'ascii' => 'US-ascii', + 'cyrillic' => 'iso-8859-5', + 'arabic' => 'iso-8859-6', + 'greek' => 'iso-8859-7', + 'hebrew' => 'iso-8859-8', + 'thai' => 'iso-8859-11', + 'tis620' => 'iso-8859-11', + 'WinLatin1' => 'cp1252', + 'WinLatin2' => 'cp1250', + 'WinCyrillic' => 'cp1251', + 'WinGreek' => 'cp1253', + 'WinTurkish' => 'cp1254', + 'WinHebrew' => 'cp1255', + 'WinArabic' => 'cp1256', + 'WinBaltic' => 'cp1257', + 'WinVietnamese' => 'cp1258', + 'ja_JP.euc' => $ON_EBCDIC ? '' : 'euc-jp', + 'x-euc-jp' => $ON_EBCDIC ? '' : 'euc-jp', + 'zh_CN.euc' => $ON_EBCDIC ? '' : 'euc-cn', + 'x-euc-cn' => $ON_EBCDIC ? '' : 'euc-cn', + 'ko_KR.euc' => $ON_EBCDIC ? '' : 'euc-kr', + 'x-euc-kr' => $ON_EBCDIC ? '' : 'euc-kr', + 'ujis' => $ON_EBCDIC ? '' : 'euc-jp', + 'Shift_JIS' => $ON_EBCDIC ? '' : 'shiftjis', + 'x-sjis' => $ON_EBCDIC ? '' : 'shiftjis', + 'jis' => $ON_EBCDIC ? '' : '7bit-jis', + 'big-5' => $ON_EBCDIC ? '' : 'big5', + 'zh_TW.Big5' => $ON_EBCDIC ? '' : 'big5', + 'big5-hk' => $ON_EBCDIC ? '' : 'big5-hkscs', + ); + + for my $i (1..11,13..16){ + $a2c{"ISO 8859 $i"} = "iso-8859-$i"; + } + for my $i (1..10){ + $a2c{"ISO Latin $i"} = "iso-8859-$Encode::Alias::Latin2iso[$i]"; + } + for my $k (keys %Encode::Alias::Winlatin2cp){ + my $v = $Encode::Alias::Winlatin2cp{$k}; + $a2c{"Win" . 
ucfirst($k)} = "cp" . $v; + $a2c{"IBM-$v"} = $a2c{"MS-$v"} = "cp" . $v; } } -my %a2c; - -BEGIN { - %a2c = ( - 'ascii' => 'US-ascii', - 'cyrillic' => 'iso-8859-5', - 'arabic' => 'iso-8859-6', - 'greek' => 'iso-8859-7', - 'hebrew' => 'iso-8859-8', - 'thai' => 'iso-8859-11', - 'tis620' => 'iso-8859-11', - 'ja_JP.euc' => 'euc-jp', - 'x-euc-jp' => 'euc-jp', - 'zh_CN.euc' => 'euc-cn', - 'x-euc-cn' => 'euc-cn', - 'ko_KR.euc' => 'euc-kr', - 'x-euc-kr' => 'euc-kr', - 'ujis' => 'euc-jp', - 'Shift_JIS' => 'shiftjis', - 'x-sjis' => 'shiftjis', - 'jis' => '7bit-jis', - 'big-5' => 'big5', - 'zh_TW.Big5' => 'big5', - 'big5-hk' => 'big5-hkscs', - 'WinLatin1' => 'cp1252', - 'WinLatin2' => 'cp1250', - 'WinCyrillic' => 'cp1251', - 'WinGreek' => 'cp1253', - 'WinTurkish' => 'cp1254', - 'WinHebrew' => 'cp1255', - 'WinArabic' => 'cp1256', - 'WinBaltic' => 'cp1257', - 'WinVietnamese' => 'cp1258', - ); - - for my $i (1..11,13..16){ - $a2c{"ISO 8859 $i"} = "iso-8859-$i"; - } - for my $i (1..10){ - $a2c{"ISO Latin $i"} = "iso-8859-$Encode::Alias::Latin2iso[$i]"; - } - for my $k (keys %Encode::Alias::Winlatin2cp){ - my $v = $Encode::Alias::Winlatin2cp{$k}; - $a2c{"Win" . ucfirst($k)} = "cp" . $v; - $a2c{"IBM-$v"} = "cp" . $v; - $a2c{"MS-$v"} = "cp" . 
$v; - } +if ($ON_EBCDIC){ + delete @Encode::ExtModule{ + qw(euc-cn gb2312 gb12345 gbk cp936 iso-ir-165 + euc-jp iso-2022-jp 7bit-jis shiftjis macjapan cp932 + euc-kr ksc5601 cp949 + big5 big5-hkscs cp950 + gb18030 big5plus euc-tw) + }; } use Test::More tests => (scalar keys %a2c) * 3; -print "# alias test\n"; +print "# alias test; \$ON_EBCDIC == $ON_EBCDIC\n"; foreach my $a (keys %a2c){ my $e = Encode::find_encoding($a); @@ -71,10 +77,20 @@ foreach my $a (keys %a2c){ # now we override some of the aliases and see if it works fine -define_alias( qr/shift.*jis$/i => '"macjapan"' ); -define_alias( qr/sjis$/i => '"cp932"' ); +define_alias(ascii => 'WinLatin1', + cyrillic => 'WinCyrillic', + arabic => 'WinArabic', + greek => 'WinGreek', + hebrew => 'WinHebrew'); -@a2c{qw(Shift_JIS x-sjis)} = qw(macjapan cp932); +@a2c{qw(ascii cyrillic arabic greek hebrew)} = + qw(cp1252 cp1251 cp1256 cp1253 cp1255); + +unless ($ON_EBCDIC){ + define_alias( qr/shift.*jis$/i => '"macjapan"', + qr/sjis$/i => '"cp932"' ); + @a2c{qw(Shift_JIS x-sjis)} = qw(macjapan cp932); +} print "# alias test with alias overrides\n"; -- cgit v1.2.1