summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJarkko Hietaniemi <jhi@iki.fi>2002-03-24 15:53:59 +0000
committerJarkko Hietaniemi <jhi@iki.fi>2002-03-24 15:53:59 +0000
commita63c962f6b57d9d07801c81cc6e7f8a1b904a8c5 (patch)
tree40959cf93404c9e5d2c4db6e45b9d5fe6e3b22db
parentf54fca96add61189e2fde1d41312bc8885ac2d97 (diff)
downloadperl-a63c962f6b57d9d07801c81cc6e7f8a1b904a8c5.tar.gz
Upgrade to Encode 0.98, from Dan Kogai.
p4raw-id: //depot/perl@15467
-rw-r--r--MANIFEST3
-rw-r--r--ext/Encode/CN/CN.pm4
-rw-r--r--ext/Encode/Changes56
-rw-r--r--ext/Encode/Encode.pm99
-rw-r--r--ext/Encode/JP/JP.pm30
-rw-r--r--ext/Encode/KR/KR.pm4
-rw-r--r--ext/Encode/MANIFEST3
-rw-r--r--ext/Encode/TW/TW.pm4
-rw-r--r--ext/Encode/lib/Encode/Alias.pm58
-rw-r--r--ext/Encode/lib/Encode/Details.pod41
-rw-r--r--ext/Encode/lib/Encode/EncFormat.pod (renamed from ext/Encode/lib/EncodeFormat.pod)2
-rw-r--r--ext/Encode/lib/Encode/JP/ISO_2022_JP.pm4
-rw-r--r--ext/Encode/lib/Encode/JP/ISO_2022_JP_1.pm38
-rw-r--r--ext/Encode/lib/Encode/JP/JIS.pm8
-rw-r--r--ext/Encode/lib/Encode/Supported.pod115
-rw-r--r--ext/Encode/t/Aliases.t124
16 files changed, 383 insertions, 210 deletions
diff --git a/MANIFEST b/MANIFEST
index 26bbcdac6f..2b4c0bf11c 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -355,12 +355,14 @@ ext/Encode/KR/Makefile.PL Encode extension
ext/Encode/lib/Encode/Alias.pm Encode extension
ext/Encode/lib/Encode/CN/HZ.pm Encode extension
ext/Encode/lib/Encode/Details.pod Encode extension
+ext/Encode/lib/Encode/EncFormat.pod Encode extension
ext/Encode/lib/Encode/Encoding.pm Encode extension
ext/Encode/lib/Encode/Internal.pm Encode extension
ext/Encode/lib/Encode/iso10646_1.pm Encode extension
ext/Encode/lib/Encode/JP/Constants.pm Encode extension
ext/Encode/lib/Encode/JP/H2Z.pm Encode extension
ext/Encode/lib/Encode/JP/ISO_2022_JP.pm Encode extension
+ext/Encode/lib/Encode/JP/ISO_2022_JP_1.pm Encode extension
ext/Encode/lib/Encode/JP/JIS.pm Encode extension
ext/Encode/lib/Encode/Supported.pod Encode extension
ext/Encode/lib/Encode/Tcl.pm Encode extension
@@ -370,7 +372,6 @@ ext/Encode/lib/Encode/ucs2_le.pm Encode extension
ext/Encode/lib/Encode/Unicode.pm Encode extension
ext/Encode/lib/Encode/utf8.pm Encode extension
ext/Encode/lib/Encode/XS.pm Encode extension
-ext/Encode/lib/EncodeFormat.pod Encode extension
ext/Encode/Makefile.PL Encode extension makefile writer
ext/Encode/MANIFEST Encode extension
ext/Encode/README Encode extension
diff --git a/ext/Encode/CN/CN.pm b/ext/Encode/CN/CN.pm
index 51d90bb5ec..0a468f9f59 100644
--- a/ext/Encode/CN/CN.pm
+++ b/ext/Encode/CN/CN.pm
@@ -4,7 +4,7 @@ BEGIN {
die "Encode::CN not supported on EBCDIC\n";
}
}
-our $VERSION = do { my @r = (q$Revision: 0.97 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 0.98 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
use Encode;
use Encode::CN::HZ;
@@ -58,7 +58,7 @@ also contains extra Taiwan-based encodings.
ASCII part (0x00-0x7f) is preserved for all encodings, even though it
conflicts with mappings by the Unicode Consortium. See
-F<http://www.debian.or.jp/~kubota/unicode-symbols.html.en>
+L<http://www.debian.or.jp/~kubota/unicode-symbols.html.en>
to find why it is implemented that way.
diff --git a/ext/Encode/Changes b/ext/Encode/Changes
index a981280638..0054d256b9 100644
--- a/ext/Encode/Changes
+++ b/ext/Encode/Changes
@@ -1,8 +1,62 @@
# Revision history for Perl extension Encode.
#
-# $Id: Changes,v 0.97 2002/03/23 20:24:42 dankogai Exp dankogai $
+# $Id: Changes,v 0.98 2002/03/24 15:43:37 dankogai Exp dankogai $
#
+0.98 Mon Mar 25 2002
+! lib/Encode/Supported.pod
+ Further pod fixes
++ lib/Encode/JP/ISO_2022_JP_1.pm
+! lib/Encode/JP/ISO_2022_JP.pm
+! lib/Encode/JP/JIS.pm
+! JP/JP.pm
+ Now Encode::JP is more strict on the difference between ISO-2022-JP
+ and ISO-2022-JP-1. See JP/JP.pm for details. I hope this move
+ makes Anton happier :) FYI the previous version implements
+ ISO-2022-JP as ISO-2022-JP-1 since it had X0212 support.
+! lib/Encode/Supported.pod
+ Further pod fixes
+! Encode.xs
+ Avoid core-dump in Encode with PERLIO=mmap by NI-S
+ Message-Id: <20020324104139.1326.7@bactrian.ni-s.u-net.com>
+! CN/CN.pm
+! JP/JP.pm
+! KR/KR.pm
+! TW/TW.pm
+! lib/Encode/Supported.pod
+ pod fixes to replace F<http://...> with L<http://...>,
+ as suggested by Autrijus in:
+ Message-Id: <20020324083943.GA14901@not.autrijus.org>
+! lib/Encode/Supported.pod
+ fixes and enhancements by Anton
+ Message-Id: <10632060120.20020324103753@motor.ru>
+! lib/Encode/Alias.pm
+ > define_alias( qr/^GB[- ]?(\d+)$/i => '"gb$1"' );
+ added. Suggested by Anton then deobfuscated by Autrijus
+ Message-Id: <20020324064455.GA3667@not.autrijus.org>
+! compile
+ Further fix by Nicholas Clark
+ Message-Id: <20020323145840.GD304@Bagpuss.unfortu.net>
+- lib/EncodeFormat.pod
++ lib/Encode/EncFormat.pod
+! MANIFEST
+ File renamed as suggested by Autrijus
+! Encode.pm
+! lib/Encode/Details.pod
+! lib/Encode/Supported.pod Sun Mar 24 13:29:35 2002
+! Encode.pm Sun Mar 24 13:43:47 2002
+ pod fixes by Autrijus.
+ Message-Id: <20020324062804.GA3595@not.autrijus.org>
+ Message-Id: <20020324075627.GB11986@not.autrijus.org>
+! t/Aliases.t
+! lib/Encode/Alias.pm
+! Encode.pm
+ now more EBCDIC conscious;
+ %ExtModule on EBCDIC system excludes CJK so that you don't
+ have to worry about the matched alias resulting cloaking.
+ t/Aliases.t also revised to reflect changes. Verified by jhi
+ Message-Id: <20020324022929.D22596@alpha.hut.fi>
+
0.97 Sun Mar 24 2002
! CN/CN.pm
! KR/KR.pm
diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm
index 7886c63826..39953d0de8 100644
--- a/ext/Encode/Encode.pm
+++ b/ext/Encode/Encode.pm
@@ -1,6 +1,6 @@
package Encode;
use strict;
-our $VERSION = do { my @r = (q$Revision: 0.97 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 0.98 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
our $DEBUG = 0;
require DynaLoader;
@@ -37,6 +37,7 @@ bootstrap Encode ();
use Carp;
+our $ON_EBCDIC = (ord("A") == 193);
use Encode::Alias;
# Make a %Encoding package variable to allow a certain amount of cheating
@@ -51,27 +52,6 @@ our %ExtModule =
'posix-bc' => 'Encode/EBCDIC.pm',
symbol => 'Encode/Symbol.pm',
dingbats => 'Encode/Symbol.pm',
- 'euc-cn' => 'Encode/CN.pm',
- gb2312 => 'Encode/CN.pm',
- gb12345 => 'Encode/CN.pm',
- gbk => 'Encode/CN.pm',
- cp936 => 'Encode/CN.pm',
- 'iso-ir-165' => 'Encode/CN.pm',
- 'euc-jp' => 'Encode/JP.pm',
- 'iso-2022-jp' => 'Encode/JP.pm',
- '7bit-jis' => 'Encode/JP.pm',
- shiftjis => 'Encode/JP.pm',
- macjapan => 'Encode/JP.pm',
- cp932 => 'Encode/JP.pm',
- 'euc-kr' => 'Encode/KR.pm',
- ksc5601 => 'Encode/KR.pm',
- cp949 => 'Encode/KR.pm',
- big5 => 'Encode/TW.pm',
- 'big5-hkscs' => 'Encode/TW.pm',
- cp950 => 'Encode/TW.pm',
- gb18030 => 'Encode/HanExtra.pm',
- big5plus => 'Encode/HanExtra.pm',
- 'euc-tw' => 'Encode/HanExtra.pm',
);
for my $k (2..11,13..16){
@@ -82,6 +62,34 @@ for my $k (1250..1258){
$ExtModule{"cp$k"} = 'Encode/Byte.pm';
}
+unless ($ON_EBCDIC) { # CJK added to autoload unless EBCDIC env
+%ExtModule =(
+ %ExtModule,
+ 'euc-cn' => 'Encode/CN.pm',
+ gb2312 => 'Encode/CN.pm',
+ gb12345 => 'Encode/CN.pm',
+ gbk => 'Encode/CN.pm',
+ cp936 => 'Encode/CN.pm',
+ 'iso-ir-165' => 'Encode/CN.pm',
+ 'euc-jp' => 'Encode/JP.pm',
+ 'iso-2022-jp' => 'Encode/JP.pm',
+ 'iso-2022-jp-1' => 'Encode/JP.pm',
+ '7bit-jis' => 'Encode/JP.pm',
+ shiftjis => 'Encode/JP.pm',
+ macjapan => 'Encode/JP.pm',
+ cp932 => 'Encode/JP.pm',
+ 'euc-kr' => 'Encode/KR.pm',
+ ksc5601 => 'Encode/KR.pm',
+ cp949 => 'Encode/KR.pm',
+ big5 => 'Encode/TW.pm',
+ 'big5-hkscs' => 'Encode/TW.pm',
+ cp950 => 'Encode/TW.pm',
+ gb18030 => 'Encode/HanExtra.pm',
+ big5plus => 'Encode/HanExtra.pm',
+ 'euc-tw' => 'Encode/HanExtra.pm',
+ );
+}
+
for my $k (qw(centeuro croatian cyrillic dingbats greek
iceland roman rumanian sami
thai turkish ukraine))
@@ -234,7 +242,7 @@ The C<Encode> module provides the interfaces between Perl's strings
and the rest of the system. Perl strings are sequences of B<characters>.
To find more about character encodings, please consult
-L<Encode::Details> . This document focuses on programming references.
+L<Encode::Details>. This document focuses on programming references.
=head1 PERL ENCODING API
@@ -242,9 +250,7 @@ L<Encode::Details> . This document focuses on programming references.
=over 4
-=item *
-
- $bytes = encode(ENCODING, $string[, CHECK])
+=item $bytes = encode(ENCODING, $string[, CHECK])
Encodes string from Perl's internal form into I<ENCODING> and returns
a sequence of octets. For CHECK see L</"Handling Malformed Data">.
@@ -254,9 +260,7 @@ to octets:
$octets = encode("utf8", $unicode);
-=item *
-
- $string = decode(ENCODING, $bytes[, CHECK])
+=item $string = decode(ENCODING, $bytes[, CHECK])
Decode sequence of octets assumed to be in I<ENCODING> into Perl's
internal form and returns the resulting string. For CHECK see
@@ -266,9 +270,7 @@ For example to convert ISO-8859-1 data to UTF-8:
$utf8 = decode("latin1", $latin1);
-=item *
-
- from_to($string, FROM_ENCODING, TO_ENCODING[, CHECK])
+=item from_to($string, FROM_ENCODING, TO_ENCODING[, CHECK])
Convert B<in-place> the data between two encodings. How did the data
in $string originally get to be in FROM_ENCODING? Either using
@@ -342,32 +344,28 @@ Hybrids of above.
Multiple return values rather than in-place modifications.
-Index into the string could be pos($str) allowing s/\G...//.
+Index into the string could be C<pos($str)> allowing C<s/\G...//>.
=back
=head2 UTF-8 / utf8
The Unicode consortium defines the UTF-8 standard as a way of encoding
-the entire Unicode repertiore as sequences of octets. This encoding is
-expected to become very widespread. Perl can use this form internaly
+the entire Unicode repertoire as sequences of octets. This encoding is
+expected to become very widespread. Perl can use this form internally
to represent strings, so conversions to and from this form are
particularly efficient (as octets in memory do not have to change,
just the meta-data that tells Perl how to treat them).
=over 4
-=item *
-
- $bytes = encode_utf8($string);
+=item $bytes = encode_utf8($string);
The characters that comprise string are encoded in Perl's superset of UTF-8
and the resulting octets returned as a sequence of bytes. All possible
characters have a UTF-8 representation so this function cannot fail.
-=item *
-
- $string = decode_utf8($bytes [,CHECK]);
+=item $string = decode_utf8($bytes [, CHECK]);
The sequence of octets represented by $bytes is decoded from UTF-8
into a sequence of logical characters. Not all sequences of octets
@@ -391,16 +389,17 @@ Or you can give the name of specific module.
@with_jp = Encode->encodings("Encode/JP.pm");
-Note in this case you have to say "Encode/JP.pm instead of Encode::JP.
+Note in this case you have to say C<"Encode/JP.pm"> instead of
+C<"Encode::JP">.
-To find which encodings are suppoted by this package in details,
+To find which encodings are supported by this package in detail,
see L<Encode::Supported>.
=head2 Defining Aliases
use Encode;
use Encode::Alias;
- define_alias( newName => ENCODING);
+ define_alias(newName => ENCODING);
Allows newName to be used as am alias for ENCODING. ENCODING may be
either the name of an encoding or and encoding object (as above).
@@ -410,7 +409,7 @@ See L<Encode::Alias> on details.
=head1 Defining Encodings
use Encode qw(define_alias);
- define_encoding( $object, 'canonicalName' [,alias...]);
+ define_encoding($object, 'canonicalName' [, alias...]);
Causes I<canonicalName> to be associated with I<$object>. The object
should provide the interface described in L<Encode::Encoding>
@@ -490,15 +489,13 @@ implementation. As such they are efficient, but may change.
=over 4
-=item * is_utf8(STRING [, CHECK])
+=item is_utf8(STRING [, CHECK])
[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
If CHECK is true, also checks the data in STRING for being well-formed
UTF-8. Returns true if successful, false otherwise.
-=item *
-
- _utf8_on(STRING)
+=item _utf8_on(STRING)
[INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
B<not> checked for being well-formed UTF-8. Do not use unless you
@@ -506,9 +503,7 @@ B<know> that the STRING is well-formed UTF-8. Returns the previous
state of the UTF-8 flag (so please don't test the return value as
I<not> success or failure), or C<undef> if STRING is not a string.
-=item *
-
- _utf8_off(STRING)
+=item _utf8_off(STRING)
[INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
Returns the previous state of the UTF-8 flag (so please don't test the
diff --git a/ext/Encode/JP/JP.pm b/ext/Encode/JP/JP.pm
index cff0d98c1d..c4cbac152a 100644
--- a/ext/Encode/JP/JP.pm
+++ b/ext/Encode/JP/JP.pm
@@ -5,13 +5,14 @@ BEGIN {
}
}
use Encode;
-our $VERSION = do { my @r = (q$Revision: 0.96 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 0.98 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
use XSLoader;
XSLoader::load('Encode::JP',$VERSION);
use Encode::JP::JIS;
use Encode::JP::ISO_2022_JP;
+use Encode::JP::ISO_2022_JP_1;
1;
__END__
@@ -41,6 +42,9 @@ supported are as follows.
iso-2022-jp ISO-2022-JP
(7bit JIS with all Halfwidth Kana
converted to Fullwidth)
+ iso-2022-jp-1 ISO-2022-JP-1
+ (ISO-2022-JP with JIS X 0212-1990
+ support. See below)
macjapan Mac Japan (Shift JIS + Apple vendor mappings)
cp932 Code Page 932 (Shift JIS + MS/IBM vendor mappings)
--------------------------------------------------------------------
@@ -49,12 +53,34 @@ supported are as follows.
To find how to use this module in detail, see L<Encode>.
+=head1 Note on ISO-2022-JP(-1)?
+
+ISO-2022-JP-1 (RFC2237) is a superset of ISO-2022-JP (RFC1468) which
+adds support for JIS X 0212-1990. That means you can use the same
+code to decode to utf8 but not vice versa.
+
+ $utf8 = decode('iso-2022-jp-1', $stream);
+ $utf8 = decode('iso-2022-jp', $stream);
+
+Yields the same result but
+
+ $with_0212 = encode('iso-2022-jp-1', $utf8);
+
+is now different from
+
+ $without_0212 = encode('iso-2022-jp', $utf8 );
+
+In the latter case, characters that map to 0212 are first converted
+to U+3013 (0xA2AE in EUC-JP; a white square also known as 'Tofu') then
+fed to the encoding engine. U+FFFD is not used, in order to preserve
+text layout as much as possible.
+
=head1 BUGS
ASCII part (0x00-0x7f) is preserved for all encodings, even though it
conflicts with mappings by the Unicode Consortium. See
-F<http://www.debian.or.jp/~kubota/unicode-symbols.html.en>
+L<http://www.debian.or.jp/~kubota/unicode-symbols.html.en>
to find why it is implemented that way.
diff --git a/ext/Encode/KR/KR.pm b/ext/Encode/KR/KR.pm
index 7dcafd0441..9e2e1d3d77 100644
--- a/ext/Encode/KR/KR.pm
+++ b/ext/Encode/KR/KR.pm
@@ -4,7 +4,7 @@ BEGIN {
die "Encode::KR not supported on EBCDIC\n";
}
}
-our $VERSION = do { my @r = (q$Revision: 0.97 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 0.98 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
use Encode;
use XSLoader;
@@ -46,7 +46,7 @@ The C<Johab> (two-byte combination code) encoding is not supported.
ASCII part (0x00-0x7f) is preserved for all encodings, even though it
conflicts with mappings by the Unicode Consortium. See
-F<http://www.debian.or.jp/~kubota/unicode-symbols.html.en>
+L<http://www.debian.or.jp/~kubota/unicode-symbols.html.en>
to find why it is implemented that way.
diff --git a/ext/Encode/MANIFEST b/ext/Encode/MANIFEST
index 24adacad5c..79ea273241 100644
--- a/ext/Encode/MANIFEST
+++ b/ext/Encode/MANIFEST
@@ -169,6 +169,7 @@ lib/Encode/Internal.pm Encode extension
lib/Encode/JP/Constants.pm Encode extension
lib/Encode/JP/H2Z.pm Encode extension
lib/Encode/JP/ISO_2022_JP.pm Encode extension
+lib/Encode/JP/ISO_2022_JP_1.pm Encode extension
lib/Encode/JP/JIS.pm Encode extension
lib/Encode/Supported.pod Documents supported encodings
lib/Encode/Tcl.pm Encode extension
@@ -179,7 +180,7 @@ lib/Encode/XS.pm Encode extension
lib/Encode/iso10646_1.pm Encode extension
lib/Encode/ucs2_le.pm Encode extension
lib/Encode/utf8.pm Encode extension
-lib/EncodeFormat.pod Encode extension
+lib/Encode/EncFormat.pod Encode extension
t/Aliases.t Encode extension test
t/CN.t Encode extension test
t/Encode.t Encode extension test
diff --git a/ext/Encode/TW/TW.pm b/ext/Encode/TW/TW.pm
index b44c8d2acb..d1f85c57fb 100644
--- a/ext/Encode/TW/TW.pm
+++ b/ext/Encode/TW/TW.pm
@@ -4,7 +4,7 @@ BEGIN {
die "Encode::TW not supported on EBCDIC\n";
}
}
-our $VERSION = do { my @r = (q$Revision: 0.97 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 0.98 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
use Encode;
use XSLoader;
@@ -54,7 +54,7 @@ plane 1-7.
ASCII part (0x00-0x7f) is preserved for all encodings, even though it
conflicts with mappings by the Unicode Consortium. See
-F<http://www.debian.or.jp/~kubota/unicode-symbols.html.en>
+L<http://www.debian.or.jp/~kubota/unicode-symbols.html.en>
to find why it is implemented that way.
diff --git a/ext/Encode/lib/Encode/Alias.pm b/ext/Encode/lib/Encode/Alias.pm
index 5f7d34575a..2a97261684 100644
--- a/ext/Encode/lib/Encode/Alias.pm
+++ b/ext/Encode/lib/Encode/Alias.pm
@@ -1,7 +1,7 @@
package Encode::Alias;
use strict;
use Encode;
-our $VERSION = do { my @r = (q$Revision: 0.96 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 0.98 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
our $DEBUG = 0;
require Exporter;
@@ -31,11 +31,13 @@ sub find_alias
my $new;
if (ref($alias) eq 'Regexp' && $_ =~ $alias)
{
+ $DEBUG and warn "eval $val";
$new = eval $val;
# $@ and warn "$val, $@";
}
elsif (ref($alias) eq 'CODE')
{
+ $DEBUG and warn "$alias", "->", "($val)";
$new = $alias->($val);
}
elsif (lc($_) eq lc($alias))
@@ -45,6 +47,7 @@ sub find_alias
if (defined($new))
{
next if $new eq $_; # avoid (direct) recursion on bugs
+ $DEBUG and warn "$alias, $new";
my $enc = (ref($new)) ? $new : Encode::find_encoding($new);
if ($enc)
{
@@ -54,6 +57,15 @@ sub find_alias
}
}
}
+ if ($DEBUG){
+ my $name;
+ if (my $e = $Alias{$_}){
+ $name = $e->name;
+ }else{
+ $name = "";
+ }
+ warn "find_alias($class, $_)->name = $name";
+ }
return $Alias{$_};
}
@@ -69,15 +81,17 @@ sub define_alias
for my $k (@a){
if (ref($alias) eq 'Regexp' && $k =~ $alias)
{
- $DEBUG and warn $k;
+ $DEBUG and warn "delete \$Alias\{$k\}";
delete $Alias{$k};
}
elsif (ref($alias) eq 'CODE')
{
+ $DEBUG and warn "delete \$Alias\{$k\}";
delete $Alias{$alias->($name)};
}
}
}else{
+ $DEBUG and warn "delete \$Alias\{$alias\}";
delete $Alias{$alias};
}
}
@@ -154,29 +168,29 @@ sub init_aliases
define_alias( qr/^macRomanian$/i => '"macRumanian"');
# Standardize on the dashed versions.
- define_alias( qr/^utf8$/i => 'utf-8' );
+ # define_alias( qr/^utf8$/i => 'utf-8' );
define_alias( qr/^koi8r$/i => 'koi8-r' );
define_alias( qr/^koi8u$/i => 'koi8-u' );
-# for Encode::CN
- define_alias( qr/euc.*cn$/i => '"euc-cn"' );
- define_alias( qr/cn.*euc/i => '"euc-cn"' );
-
-# for Encode::JP
- define_alias( qr/euc.*jp$/i => '"euc-jp"' );
- define_alias( qr/jp.*euc/i => '"euc-jp"' );
- define_alias( qr/ujis$/i => '"euc-jp"' );
- define_alias( qr/shift.*jis$/i => '"shiftjis"' );
- define_alias( qr/sjis$/i => '"shiftjis"' );
- define_alias( qr/^jis$/i => '"7bit-jis"' );
-
-# for Encode::KR
- define_alias( qr/euc.*kr$/i => '"euc-kr"' );
- define_alias( qr/kr.*euc/i => '"euc-kr"' );
-
-# for Encode::TW
- define_alias( qr/big-?5$/i => '"big5"' );
- define_alias( qr/big5-hk(?:scs)?/i => '"big5-hkscs"' );
+ unless ($Encode::ON_EBCDIC){
+ # for Encode::CN
+ define_alias( qr/euc.*cn$/i => '"euc-cn"' );
+ define_alias( qr/cn.*euc/i => '"euc-cn"' );
+ define_alias( qr/^GB[- ]?(\d+)$/i => '"gb$1"' );
+ # for Encode::JP
+ define_alias( qr/euc.*jp$/i => '"euc-jp"' );
+ define_alias( qr/jp.*euc/i => '"euc-jp"' );
+ define_alias( qr/ujis$/i => '"euc-jp"' );
+ define_alias( qr/shift.*jis$/i => '"shiftjis"' );
+ define_alias( qr/sjis$/i => '"shiftjis"' );
+ define_alias( qr/^jis$/i => '"7bit-jis"' );
+ # for Encode::KR
+ define_alias( qr/euc.*kr$/i => '"euc-kr"' );
+ define_alias( qr/kr.*euc/i => '"euc-kr"' );
+ # for Encode::TW
+ define_alias( qr/big-?5$/i => '"big5"' );
+ define_alias( qr/big5-hk(?:scs)?/i => '"big5-hkscs"' );
+ }
# At last, Map white space and _ to '-'
define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
diff --git a/ext/Encode/lib/Encode/Details.pod b/ext/Encode/lib/Encode/Details.pod
index aa3a0af782..6721484808 100644
--- a/ext/Encode/lib/Encode/Details.pod
+++ b/ext/Encode/lib/Encode/Details.pod
@@ -1,11 +1,6 @@
-
=head1 NAME
-Encode - character encodings
-
-=head1 SYNOPSIS
-
- use Encode;
+Encode::Details - implementation details of Encode.pm
=head1 DESCRIPTION
@@ -19,7 +14,7 @@ codepoint" for the character (the exceptions are those platforms where
the legacy encoding is some variant of EBCDIC rather than a super-set
of ASCII - see L<perlebcdic>).
-Traditionaly computer data has been moved around in 8-bit chunks
+Traditionally computer data has been moved around in 8-bit chunks
often called "bytes". These chunks are also known as "octets" in
networking standards. Perl is widely used to manipulate data of
many types - not only strings of characters representing human or
@@ -92,7 +87,7 @@ encodings for East Asian languages.
Not really very "encoded" encodings. The Unicode code points
are just represented as 4-octet integers. None the less because
different architectures use different representations of integers
-(so called "endian") there at least two disctinct encodings.
+(so called "endian") there at least two distinct encodings.
=item * Multi-byte encodings
@@ -265,7 +260,7 @@ Microsft proprietary.
UTF-16 KOI8-U ISO-2022-JP-2
-are IANA-registered preferred MIME names but probably shoule
+are IANA-registered preferred MIME names but probably should
be avoided as encoding for web pages due to lack of browser
support.
@@ -412,25 +407,21 @@ Index into the string could be pos($str) allowing s/\G...//.
=head2 UTF-8 / utf8
The Unicode consortium defines the UTF-8 standard as a way of encoding
-the entire Unicode repertiore as sequences of octets. This encoding is
-expected to become very widespread. Perl can use this form internaly
+the entire Unicode repertoire as sequences of octets. This encoding is
+expected to become very widespread. Perl can use this form internally
to represent strings, so conversions to and from this form are
particularly efficient (as octets in memory do not have to change,
just the meta-data that tells Perl how to treat them).
=over 4
-=item *
-
- $bytes = encode_utf8($string);
+=item $bytes = encode_utf8($string);
The characters that comprise string are encoded in Perl's superset of UTF-8
and the resulting octets returned as a sequence of bytes. All possible
characters have a UTF-8 representation so this function cannot fail.
-=item *
-
- $string = decode_utf8($bytes [,CHECK]);
+=item $string = decode_utf8($bytes [,CHECK]);
The sequence of octets represented by $bytes is decoded from UTF-8
into a sequence of logical characters. Not all sequences of octets
@@ -505,10 +496,10 @@ Currently I<newName> can be specified in the following ways:
define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
In this case if I<ENCODING> is not a reference it is C<eval>-ed to
-allow C<$1> etc. to be subsituted. The example is one way to names as
+allow C<$1> etc. to be substituted. The example is one way to names as
used in X11 font names to alias the MIME names for the iso-8859-*
family. Note the double quote inside the single quote. If you are
-using regex here, y ou have to do so or it won't work in this case.
+using regex here, you have to do so or it won't work in this case.
=item As a code reference, e.g.:
@@ -622,15 +613,13 @@ implementation. As such they are efficient, but may change.
=over 4
-=item * is_utf8(STRING [, CHECK])
+=item is_utf8(STRING [, CHECK])
[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
If CHECK is true, also checks the data in STRING for being well-formed
UTF-8. Returns true if successful, false otherwise.
-=item *
-
- _utf8_on(STRING)
+=item _utf8_on(STRING)
[INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
B<not> checked for being well-formed UTF-8. Do not use unless you
@@ -638,9 +627,7 @@ B<know> that the STRING is well-formed UTF-8. Returns the previous
state of the UTF-8 flag (so please don't test the return value as
I<not> success or failure), or C<undef> if STRING is not a string.
-=item *
-
- _utf8_off(STRING)
+=item _utf8_off(STRING)
[INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
Returns the previous state of the UTF-8 flag (so please don't test the
@@ -816,6 +803,4 @@ to be rationalized.
L<perlunicode>, L<perlebcdic>, L<perlfunc/open>, L<PerlIO>, L<encoding>,
L<utf8>, the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
-
=cut
-
diff --git a/ext/Encode/lib/EncodeFormat.pod b/ext/Encode/lib/Encode/EncFormat.pod
index 3a1269dd04..abb805709b 100644
--- a/ext/Encode/lib/EncodeFormat.pod
+++ b/ext/Encode/lib/Encode/EncFormat.pod
@@ -1,6 +1,6 @@
=head1 NAME
-EncodeFormat - the format of encoding tables of the Encode extension
+Encode::EncFormat - the format of encoding tables of the Encode/*.enc files
=head1 DESCRIPTION
diff --git a/ext/Encode/lib/Encode/JP/ISO_2022_JP.pm b/ext/Encode/lib/Encode/JP/ISO_2022_JP.pm
index 388be5f247..29df75054b 100644
--- a/ext/Encode/lib/Encode/JP/ISO_2022_JP.pm
+++ b/ext/Encode/lib/Encode/JP/ISO_2022_JP.pm
@@ -5,7 +5,7 @@ use Encode::JP::H2Z;
use base 'Encode::Encoding';
use vars qw($VERSION);
-$VERSION = do { my @r = (q$Revision: 0.94 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+$VERSION = do { my @r = (q$Revision: 0.98 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
my $canon = 'iso-2022-jp';
my $obj = bless {name => $canon}, __PACKAGE__;
@@ -31,7 +31,7 @@ sub encode
my ($obj,$str,$chk) = @_;
my $euc = Encode::encode('euc-jp', $str, $chk);
&Encode::JP::H2Z::h2z(\$euc);
- return &Encode::JP::JIS::euc_jis(\$euc);
+ return &Encode::JP::JIS::euc_jis_nox0212(\$euc);
}
1;
diff --git a/ext/Encode/lib/Encode/JP/ISO_2022_JP_1.pm b/ext/Encode/lib/Encode/JP/ISO_2022_JP_1.pm
new file mode 100644
index 0000000000..9b1c3191e7
--- /dev/null
+++ b/ext/Encode/lib/Encode/JP/ISO_2022_JP_1.pm
@@ -0,0 +1,38 @@
+package Encode::JP::ISO_2022_JP_1;
+use Encode::JP;
+use Encode::JP::JIS;
+use Encode::JP::H2Z;
+use base 'Encode::Encoding';
+
+use vars qw($VERSION);
+$VERSION = do { my @r = (q$Revision: 0.98 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+
+my $canon = 'iso-2022-jp-1';
+my $obj = bless {name => $canon}, __PACKAGE__;
+$obj->Define($canon);
+
+sub name { return $_[0]->{name}; }
+
+#
+# decode is identical to 7bit-jis
+#
+
+sub decode
+{
+ my ($obj,$str,$chk) = @_;
+ return Encode::decode('7bit-jis', $str, $chk);
+}
+
+# iso-2022-jp-1 = 7bit-jis with all x201 (Hankaku) converted to
+# x208 equivalent (Zenkaku); unlike iso-2022-jp, x0212 is kept
+
+sub encode
+{
+ my ($obj,$str,$chk) = @_;
+ my $euc = Encode::encode('euc-jp', $str, $chk);
+ &Encode::JP::H2Z::h2z(\$euc);
+ return &Encode::JP::JIS::euc_jis(\$euc);
+}
+
+1;
+__END__
diff --git a/ext/Encode/lib/Encode/JP/JIS.pm b/ext/Encode/lib/Encode/JP/JIS.pm
index 6e6dd0fd24..86878216f9 100644
--- a/ext/Encode/lib/Encode/JP/JIS.pm
+++ b/ext/Encode/lib/Encode/JP/JIS.pm
@@ -5,7 +5,7 @@ use base 'Encode::Encoding';
use strict;
use vars qw($VERSION);
-$VERSION = do { my @r = (q$Revision: 0.94 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+$VERSION = do { my @r = (q$Revision: 0.98 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
# Just for the time being, we implement jis-7bit
# encoding via EUC
@@ -77,5 +77,11 @@ sub euc_jis{
$$r_str;
}
+sub euc_jis_nox0212{
+ my $r_str = shift;
+ $$r_str =~ s/$RE{EUC_0212}/$CHARCODE{UNDEF_EUC}/go;
+ euc_jis($r_str);
+}
+
1;
__END__
diff --git a/ext/Encode/lib/Encode/Supported.pod b/ext/Encode/lib/Encode/Supported.pod
index d48d14d342..1a9f88ed01 100644
--- a/ext/Encode/lib/Encode/Supported.pod
+++ b/ext/Encode/lib/Encode/Supported.pod
@@ -24,7 +24,7 @@ once an operation is in progress.
As of Perl 5.8.0, at least the following encodings are recognized.
Note that unless otherwise specified, they are all case insensitive
-(via alias) and all occurance of spaces are replaced with '-'. In
+(via alias) and all occurrences of spaces are replaced with '-'. In
other words, "ISO 8859 1" and "iso-8859-1" are identical.
Encodings are categorized and implemented in several different modules
@@ -51,12 +51,12 @@ extended ASCII. For most cases it uses \x80-\xff (upper half) to map
non-ASCII characters.
-----------------------
- iso-8859-1 latin
+ (iso-8859-1 is built-in)
iso-8859-2 latin2
iso-8859-3 latin3
iso-8859-4 latin4
- iso-8859-5 latin
- iso-8859-6 latin
+ iso-8859-5
+ iso-8859-6
iso-8859-7
iso-8859-8
iso-8859-9 latin5
@@ -102,8 +102,9 @@ non-ASCII characters.
=head2 The CJK: Chinese, Japanese, Korean (Multibyte)
Note Vietnamese is listed above. Also read "Encoding vs Charset"
-below. Also note these are impelemented in distinct module by
-languages, due the the size concerns. See these perldocs also.
+below. Also note these are implemented in distinct modules by
+language, due to size concerns. Please also refer to their
+respective documentation pages.
=over 4
@@ -125,6 +126,7 @@ languages, due the the size concerns. See these perldocs also.
cp932
euc-jp ujis
iso-2022-jp
+ iso-2022-jp-1
macjapan
shiftjis Shift_JIS, sjis
-----------------------
@@ -172,7 +174,7 @@ See perlebcdic for details.
posix-bc
-----------------------
-=item Enocode::Symbols
+=item Encode::Symbols
For symbols and dingbats.
@@ -193,70 +195,105 @@ Charset determines which characters to be included in a given text.
Encoding actually maps charset(s) to stream of bits.
-Note a given encoding contains multiple charsets. For instance,
-euc-jp contains ASCII, JIS X 0201 (Hankaku Kana), JIS X 0208 (Zenkaku
-Kana and Kanji) and JIS X 0212 (Extended Kanji) in a single encoding.
+Note a given encoding may contain multiple charsets and complex CJK
+encodings are usually implemented that way.
+
+For instance, euc-jp contains ASCII, JIS X 0201-1978 (Hankaku Kana),
+JIS X 0208-1997 (Zenkaku Kana and Kanji) and JIS X 0212-1990 (Extended
+Kanji) in a single encoding.
As the name suggests, the Encode module supports encodings, not
individual charsets.
-=head1 Encoding Classification (by Anton Tagunov)
+=head1 Encoding Classification (by Anton Tagunov and Dan Kogai)
+
+This section tries to classify the supported encodings by their
+applicability for information exchange over the Internet and to
+choose the most suitable aliases to name them in the context of
+such communication.
+
+Encoding names
-Encodings
+ US-ASCII UTF-8
+ ISO-8859-* KOI8-R
+ Shift_JIS EUC-JP ISO-2022-JP ISO-2022-JP-1
+ EUC-KR
+ Big5
- US-ASCII UTF-8 KOI8-R ISO-8859-*
- ISO-2022-CN ISO-2022-JP Big5
- EUC-CN EUC-JP EUC-KR
+are L<http://www.iana.org/assignments/character-sets>-registered as
+preferred MIME names and may probably be used over the Internet.
-are <http://www.iana.org/assignments/character-sets>-registered as
-preferred MIME names and may probably be used over the Internet. So is
+C<Shift_JIS> is no longer Microsoft-proprietary since it has been
+made official by JIS X 0208-1997. It is probably the most
+widespread encoding for Japanese on the Internet.
- Shift_JIS
+ EUC-CN
-but despite its wide spread it bears the label of being
-Microsft proprietary -- was. Now Shift JIS is official as of
-JIS X 0208-1997.
+has not been registered with IANA (as of March 2002) but
+seems to be supported by major web browsers. (IANA has registered
+this encoding as C<GB2312>, but C<gb2312> currently has a different
+meaning to the C<Encode> module. It will probably become an alias for
+C<EUC-CN> in the future; until then it is safer to avoid using
+C<gb2312> as an encoding name within Perl.)
- UTF-16 KOI8-U
+ UTF-16
+ KOI8-U (http://www.faqs.org/rfcs/rfc2319.html)
-are IANA-registered preferred MIME names but probably
-shoule be avoided as encoding for web pages due to lack of
-browser support.
+are IANA-registered (C<UTF-16> even as a preferred MIME name)
+but probably should be avoided as encoding for web pages due to
+lack of browser support.
- ISO-2022 (http://www.ecma.ch/ecma1/STAND/ECMA-035.HTM)
- ISO-2022-JP-1 (http://www.faqs.org/rfcs/rfc2237.html)
ISO-IR-165 (http://www.faqs.org/rfcs/rfc1345.html)
GBK
VISCII
- GB 12345 (only plains 1 and 2 available)
- GB 18030
- CNS 11643
+ GB 12345
+ GB 18030 (*) (see links below)
+ EUC-TW (*)
are totally valid encodings but not registered at IANA.
+The names under which they are listed here are probably the
+most widely-known names for these encodings and are recommended
+names.
+
+
+=for comment this used to be listed as supported but
- BIG5PLUS
- EUC-JP-0212 (Encode::lib::Encode::Tcl::Extended)
+do not work @15457 when it's clear they will be uncommented
+or deleted - Anton
+ISO-2022 (http://www.ecma.ch/ecma1/STAND/ECMA-035.HTM)
+CNS 11643 (only planes 1 and 2 available)
-are a bit proprietary
+ BIG5PLUS (*)
+
+is a bit proprietary name. C<(*)>-marked encodings belong to
+C<Encode::HanExtra> available from CPAN.
You may probably get some info on CJK encodings at
brief description for most of the mentioned CJK encodings
-
-F<http://www.debian.org.ru/doc/manuals/intro-i18n/ch-codes.html>
+L<http://www.debian.org.ru/doc/manuals/intro-i18n/ch-codes.html>
several years old, but still useful
-
-F<http://www.oreilly.com/people/authors/lunde/cjk_inf.html>
+L<http://www.oreilly.com/people/authors/lunde/cjk_inf.html>
and some in-depth reading for the heroes :-)
-F<http://www.ecma.ch/ecma1/STAND/ECMA-035.HTM> (eq ISO-2022)
+L<http://www.ecma.ch/ecma1/STAND/ECMA-035.HTM> (eq C<ISO-2022>)
+
+gives brief info on C<EUC-CN>, C<GBK> and mostly on C<GB 18030>
+F<ftp://ftp.oreilly.com/pub/examples/nutshell/cjkv/pdf/GB18030_Summary.pdf>
+
+The nature of information in this section is most fragile and
+error-prone; I<probably> is the most popular adverb :)
+Please feel free to send your comments, disagreements and
+additions to L<...>. (Note, however,
+that the mission of this document is to cover the
+C<Encode>-supported encodings only.)
=head1 See Also
L<Encode>,
L<Encode::Byte>,
-L<Encode::CN>, L<Encode::JP>, L<Encode::KR>, L<Encode::TW>
+L<Encode::CN>, L<Encode::JP>, L<Encode::KR>, L<Encode::TW>,
L<Encode::EBCDIC>, L<Encode::Symbol>
=cut
diff --git a/ext/Encode/t/Aliases.t b/ext/Encode/t/Aliases.t
index 3640f4b097..8fe298b0f7 100644
--- a/ext/Encode/t/Aliases.t
+++ b/ext/Encode/t/Aliases.t
@@ -3,66 +3,72 @@
use strict;
use Encode;
use Encode::Alias;
+my %a2c;
+my $ON_EBCDIC;
BEGIN {
- if (ord("A") == 193) {
- print "1..0 # Skip: EBCDIC\n";
- exit 0;
+ $ON_EBCDIC = ord("A") == 193;
+ @ARGV and $ON_EBCDIC = $ARGV[0] eq 'EBCDIC';
+ $Encode::ON_EBCDIC = $ON_EBCDIC;
+
+ %a2c = (
+ 'ascii' => 'US-ascii',
+ 'cyrillic' => 'iso-8859-5',
+ 'arabic' => 'iso-8859-6',
+ 'greek' => 'iso-8859-7',
+ 'hebrew' => 'iso-8859-8',
+ 'thai' => 'iso-8859-11',
+ 'tis620' => 'iso-8859-11',
+ 'WinLatin1' => 'cp1252',
+ 'WinLatin2' => 'cp1250',
+ 'WinCyrillic' => 'cp1251',
+ 'WinGreek' => 'cp1253',
+ 'WinTurkish' => 'cp1254',
+ 'WinHebrew' => 'cp1255',
+ 'WinArabic' => 'cp1256',
+ 'WinBaltic' => 'cp1257',
+ 'WinVietnamese' => 'cp1258',
+ 'ja_JP.euc' => $ON_EBCDIC ? '' : 'euc-jp',
+ 'x-euc-jp' => $ON_EBCDIC ? '' : 'euc-jp',
+ 'zh_CN.euc' => $ON_EBCDIC ? '' : 'euc-cn',
+ 'x-euc-cn' => $ON_EBCDIC ? '' : 'euc-cn',
+ 'ko_KR.euc' => $ON_EBCDIC ? '' : 'euc-kr',
+ 'x-euc-kr' => $ON_EBCDIC ? '' : 'euc-kr',
+ 'ujis' => $ON_EBCDIC ? '' : 'euc-jp',
+ 'Shift_JIS' => $ON_EBCDIC ? '' : 'shiftjis',
+ 'x-sjis' => $ON_EBCDIC ? '' : 'shiftjis',
+ 'jis' => $ON_EBCDIC ? '' : '7bit-jis',
+ 'big-5' => $ON_EBCDIC ? '' : 'big5',
+ 'zh_TW.Big5' => $ON_EBCDIC ? '' : 'big5',
+ 'big5-hk' => $ON_EBCDIC ? '' : 'big5-hkscs',
+ );
+
+ for my $i (1..11,13..16){
+ $a2c{"ISO 8859 $i"} = "iso-8859-$i";
+ }
+ for my $i (1..10){
+ $a2c{"ISO Latin $i"} = "iso-8859-$Encode::Alias::Latin2iso[$i]";
+ }
+ for my $k (keys %Encode::Alias::Winlatin2cp){
+ my $v = $Encode::Alias::Winlatin2cp{$k};
+ $a2c{"Win" . ucfirst($k)} = "cp" . $v;
+ $a2c{"IBM-$v"} = $a2c{"MS-$v"} = "cp" . $v;
}
}
-my %a2c;
-
-BEGIN {
- %a2c = (
- 'ascii' => 'US-ascii',
- 'cyrillic' => 'iso-8859-5',
- 'arabic' => 'iso-8859-6',
- 'greek' => 'iso-8859-7',
- 'hebrew' => 'iso-8859-8',
- 'thai' => 'iso-8859-11',
- 'tis620' => 'iso-8859-11',
- 'ja_JP.euc' => 'euc-jp',
- 'x-euc-jp' => 'euc-jp',
- 'zh_CN.euc' => 'euc-cn',
- 'x-euc-cn' => 'euc-cn',
- 'ko_KR.euc' => 'euc-kr',
- 'x-euc-kr' => 'euc-kr',
- 'ujis' => 'euc-jp',
- 'Shift_JIS' => 'shiftjis',
- 'x-sjis' => 'shiftjis',
- 'jis' => '7bit-jis',
- 'big-5' => 'big5',
- 'zh_TW.Big5' => 'big5',
- 'big5-hk' => 'big5-hkscs',
- 'WinLatin1' => 'cp1252',
- 'WinLatin2' => 'cp1250',
- 'WinCyrillic' => 'cp1251',
- 'WinGreek' => 'cp1253',
- 'WinTurkish' => 'cp1254',
- 'WinHebrew' => 'cp1255',
- 'WinArabic' => 'cp1256',
- 'WinBaltic' => 'cp1257',
- 'WinVietnamese' => 'cp1258',
- );
-
- for my $i (1..11,13..16){
- $a2c{"ISO 8859 $i"} = "iso-8859-$i";
- }
- for my $i (1..10){
- $a2c{"ISO Latin $i"} = "iso-8859-$Encode::Alias::Latin2iso[$i]";
- }
- for my $k (keys %Encode::Alias::Winlatin2cp){
- my $v = $Encode::Alias::Winlatin2cp{$k};
- $a2c{"Win" . ucfirst($k)} = "cp" . $v;
- $a2c{"IBM-$v"} = "cp" . $v;
- $a2c{"MS-$v"} = "cp" . $v;
- }
+if ($ON_EBCDIC){
+ delete @Encode::ExtModule{
+ qw(euc-cn gb2312 gb12345 gbk cp936 iso-ir-165
+ euc-jp iso-2022-jp 7bit-jis shiftjis macjapan cp932
+ euc-kr ksc5601 cp949
+ big5 big5-hkscs cp950
+ gb18030 big5plus euc-tw)
+ };
}
use Test::More tests => (scalar keys %a2c) * 3;
-print "# alias test\n";
+print "# alias test; \$ON_EBCDIC == $ON_EBCDIC\n";
foreach my $a (keys %a2c){
my $e = Encode::find_encoding($a);
@@ -71,10 +77,20 @@ foreach my $a (keys %a2c){
# now we override some of the aliases and see if it works fine
-define_alias( qr/shift.*jis$/i => '"macjapan"' );
-define_alias( qr/sjis$/i => '"cp932"' );
+define_alias(ascii => 'WinLatin1',
+ cyrillic => 'WinCyrillic',
+ arabic => 'WinArabic',
+ greek => 'WinGreek',
+ hebrew => 'WinHebrew');
-@a2c{qw(Shift_JIS x-sjis)} = qw(macjapan cp932);
+@a2c{qw(ascii cyrillic arabic greek hebrew)} =
+ qw(cp1252 cp1251 cp1256 cp1253 cp1255);
+
+unless ($ON_EBCDIC){
+ define_alias( qr/shift.*jis$/i => '"macjapan"',
+ qr/sjis$/i => '"cp932"' );
+ @a2c{qw(Shift_JIS x-sjis)} = qw(macjapan cp932);
+}
print "# alias test with alias overrides\n";