summaryrefslogtreecommitdiff
path: root/ext/Encode
diff options
context:
space:
mode:
authorMarcus Holland-Moritz <mhx-perl@gmx.net>2004-10-18 21:00:26 +0000
committerMarcus Holland-Moritz <mhx-perl@gmx.net>2004-10-18 21:00:26 +0000
commit8f1ed24a4d3af53304bd3890a619bde305168316 (patch)
tree8eeb4928e410f53b411ec5d5712685e1b3cbd9cd /ext/Encode
parentaa534ff5c7e215c8ef5cbc68519f3301b552522c (diff)
downloadperl-8f1ed24a4d3af53304bd3890a619bde305168316.tar.gz
Upgrade to Encode 2.04.
p4raw-id: //depot/perl@23380
Diffstat (limited to 'ext/Encode')
-rw-r--r--ext/Encode/AUTHORS1
-rw-r--r--ext/Encode/Changes28
-rw-r--r--ext/Encode/Encode.pm4
-rw-r--r--ext/Encode/JP/JP.pm4
-rw-r--r--ext/Encode/META.yml2
-rw-r--r--ext/Encode/Makefile.PL4
-rw-r--r--ext/Encode/bin/piconv54
-rw-r--r--ext/Encode/bin/ucmsort2
-rw-r--r--ext/Encode/encoding.pm122
-rw-r--r--ext/Encode/lib/Encode/Alias.pm62
-rw-r--r--ext/Encode/lib/Encode/Supported.pod3
-rw-r--r--ext/Encode/ucm/big5-hkscs.ucm2
12 files changed, 106 insertions, 182 deletions
diff --git a/ext/Encode/AUTHORS b/ext/Encode/AUTHORS
index c0e1df6780..a40701f40c 100644
--- a/ext/Encode/AUTHORS
+++ b/ext/Encode/AUTHORS
@@ -45,6 +45,7 @@ SUGAWARA Hajime <sugawara@hdt.co.jp>
SUZUKI Norio <ZAP00217@nifty.com>
Simon Cozens <simon@netthink.co.uk>
Spider Boardman <spider@web.zk3.dec.com>
+Steve Hay <steve.hay@uk.radan.com>
Tatsuhiko Miyagawa <miyagawa@edge.co.jp>
Vadim Konovalov <vkonovalov@peterstar.ru>
Yitzchak Scott-Thoennes <sthoenna@efn.org>
diff --git a/ext/Encode/Changes b/ext/Encode/Changes
index b6b6041f90..9f15ee5950 100644
--- a/ext/Encode/Changes
+++ b/ext/Encode/Changes
@@ -1,17 +1,35 @@
# Revision history for Perl extension Encode.
#
-# $Id: Changes,v 2.2 2004/08/31 10:55:34 dankogai Exp dankogai $
+# $Id: Changes,v 2.4 2004/10/16 21:22:44 dankogai Exp dankogai $
#
-$Revision: 2.2 $ $Date: 2004/08/31 10:55:34 $
- ucm/big5-hkscs.ucm AUTHORS t/big5-hkscs.enc t/big5-hkscs.utf
+$Revision: 2.4 $ $Date: 2004/10/16 21:22:44 $
+! Makefle.PL
+ From: craigberry@mac.com
+ Subject: [PATCH ext/Encode/Makefile.PL] make Encode.c dependency explicit
+ Message-Id: <41716868.7000102@mac.com>
+
+2.03 2004/10/06 05:07:20
+! lib/Encode/Alias.pm
+ Resolved some alias case sensitivity glitches reported via RT.
+ http://rt.cpan.org/NoAuth/Bug.html?id=7835
+! bin/piconv
+ Resolved Win32 glitches reported via RT.
+ (Fixed by dankogai and tested by Steve Hay)
+ http://rt.cpan.org/Ticket/Display.html?id=7831
+! JP/JP.pm lib/Encode/Alias.pm lib/Encode/Supported.pod AUTHORS
+ /\bwindows-31j$/i is now an alias of CP932, by Steve Hay.
+ http://rt.cpan.org/NoAuth/Bug.html?id=6695
+
+2.02 2004/08/31 10:55:34
+! ucm/big5-hkscs.ucm AUTHORS t/big5-hkscs.enc t/big5-hkscs.utf
New map submitted by Deng Liu and Autrijus. Test data needed
to be upgrade as well, done by dankogai
Message-Id: <20040824204828.GB6999@aut.dyndns.org>
- bin/ucmsort
+! bin/ucmsort
Now works for characters U+10000 and above. This fix was needed
to "tidy" the original map that was submitted.
- bin/enc2xs
+! bin/enc2xs
"ucmsort" now mentioned in pod
2.01 2004/05/25 16:27:14
diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm
index 616f50cd87..ab80547b55 100644
--- a/ext/Encode/Encode.pm
+++ b/ext/Encode/Encode.pm
@@ -1,9 +1,9 @@
#
-# $Id: Encode.pm,v 2.2 2004/08/31 10:52:11 dankogai Exp $
+# $Id: Encode.pm,v 2.4 2004/10/16 21:22:31 dankogai Exp $
#
package Encode;
use strict;
-our $VERSION = do { my @r = (q$Revision: 2.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 2.4 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
sub DEBUG () { 0 }
use XSLoader ();
XSLoader::load(__PACKAGE__, $VERSION);
diff --git a/ext/Encode/JP/JP.pm b/ext/Encode/JP/JP.pm
index 696f9282bf..01ad37f30d 100644
--- a/ext/Encode/JP/JP.pm
+++ b/ext/Encode/JP/JP.pm
@@ -5,7 +5,7 @@ BEGIN {
}
}
use Encode;
-our $VERSION = do { my @r = (q$Revision: 2.0 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 2.1 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
use XSLoader;
XSLoader::load(__PACKAGE__,$VERSION);
@@ -45,7 +45,7 @@ supported are as follows.
= ISO-2022-JP with JIS X 0212-1990
support. See below
MacJapanese Shift JIS + Apple vendor mappings
- cp932 Code Page 932
+ cp932 /\bwindows-31j$/i Code Page 932
= Shift JIS + MS/IBM vendor mappings
jis0201-raw JIS0201, raw format
jis0208-raw JIS0201, raw format
diff --git a/ext/Encode/META.yml b/ext/Encode/META.yml
index 9a3546ecff..b6a65136dd 100644
--- a/ext/Encode/META.yml
+++ b/ext/Encode/META.yml
@@ -1,7 +1,7 @@
# http://module-build.sourceforge.net/META-spec.html
#XXXXXXX This is a prototype!!! It will change in the future!!! XXXXX#
name: Encode
-version: 2.02
+version: 2.04
version_from: Encode.pm
installdirs: perl
requires:
diff --git a/ext/Encode/Makefile.PL b/ext/Encode/Makefile.PL
index 83e4e64ccd..58b307b1b6 100644
--- a/ext/Encode/Makefile.PL
+++ b/ext/Encode/Makefile.PL
@@ -83,8 +83,10 @@ sub postamble
{
my $self = shift;
my $dir = $self->catdir($self->curdir,'ucm');
- my $str = "# Encode\$(OBJ_EXT) depends on .h and .exh files not .c files - but all written by enc2xs\n";
+ my $str = "# Encode\$(OBJ_EXT) does not depend on .c files directly\n";
+ $str .= "# (except Encode.c), but on .h and .exh files written by enc2xs\n";
$str .= $^O eq 'MacOS' ? 'Encode.c.{$(MACPERL_BUILD_EXT_STATIC)}.o :' : 'Encode$(OBJ_EXT) :';
+ $str .= ' Encode.c';
foreach my $table (keys %tables)
{
$str .= " $table.c";
diff --git a/ext/Encode/bin/piconv b/ext/Encode/bin/piconv
index fd546f0428..cb0c236698 100644
--- a/ext/Encode/bin/piconv
+++ b/ext/Encode/bin/piconv
@@ -1,5 +1,5 @@
#!./perl
-# $Id: piconv,v 2.0 2004/05/16 20:55:16 dankogai Exp $
+# $Id: piconv,v 2.1 2004/10/06 05:07:20 dankogai Exp $
#
use 5.8.0;
use strict;
@@ -52,25 +52,39 @@ To: $to => $cto
EOT
}
-# default
-if ($scheme eq 'from_to'){
- while(<>){
- Encode::from_to($_, $from, $to, $Opt{check}); print;
- };
-# step-by-step
-}elsif ($scheme eq 'decode_encode'){
- while(<>){
- my $decoded = decode($from, $_, $Opt{check});
- my $encoded = encode($to, $decoded);
- print $encoded;
- };
-# NI-S favorite
-}elsif ($scheme eq 'perlio'){
- binmode(STDIN, ":encoding($from)");
- binmode(STDOUT, ":encoding($to)");
- while(<>){ print; }
-} else { # won't reach
- die "$name: unknown scheme: $scheme";
+# we do not use <> (or ARGV) for the sake of binmode()
+@ARGV or push @ARGV, \*STDIN;
+
+unless ($scheme eq 'perlio'){
+ binmode STDOUT;
+ for my $argv (@ARGV){
+ my $ifh = ref $argv ? $argv : undef;
+ $ifh or open $ifh, "<", $argv or next;
+ binmode $ifh;
+ if ($scheme eq 'from_to'){ # default
+ while(<$ifh>){
+ Encode::from_to($_, $from, $to, $Opt{check});
+ print;
+ }
+ }elsif ($scheme eq 'decode_encode'){ # step-by-step
+ while(<$ifh>){
+ my $decoded = decode($from, $_, $Opt{check});
+ my $encoded = encode($to, $decoded);
+ print $encoded;
+ }
+ } else { # won't reach
+ die "$name: unknown scheme: $scheme";
+ }
+ }
+}else{
+ # NI-S favorite
+ binmode STDOUT => "raw:encoding($to)";
+ for my $argv (@ARGV){
+ my $ifh = ref $argv ? $argv : undef;
+ $ifh or open $ifh, "<", $argv or next;
+ binmode $ifh => "raw:encoding($from)";
+ print while(<$ifh>);
+ }
}
sub list_encodings{
diff --git a/ext/Encode/bin/ucmsort b/ext/Encode/bin/ucmsort
index 63d495209c..a67ee8ee81 100644
--- a/ext/Encode/bin/ucmsort
+++ b/ext/Encode/bin/ucmsort
@@ -1,6 +1,6 @@
#!/usr/local/bin/perl
#
-# $Id: ucmsort,v 2.1 2004/08/31 10:55:34 dankogai Exp dankogai $
+# $Id: ucmsort,v 2.1 2004/08/31 10:55:34 dankogai Exp $
#
use strict;
my @lines;
diff --git a/ext/Encode/encoding.pm b/ext/Encode/encoding.pm
index b398301513..d1181ff403 100644
--- a/ext/Encode/encoding.pm
+++ b/ext/Encode/encoding.pm
@@ -1,16 +1,15 @@
-# $Id: encoding.pm,v 2.01 2004/05/16 20:55:16 dankogai Exp $
+# $Id: encoding.pm,v 2.0 2004/05/16 20:55:16 dankogai Exp $
package encoding;
-our $VERSION = do { my @r = (q$Revision: 2.01 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 2.0 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
use Encode;
use strict;
-
sub DEBUG () { 0 }
BEGIN {
if (ord("A") == 193) {
require Carp;
- Carp::croak("encoding: pragma does not support EBCDIC platforms");
+ Carp::croak("encoding pragma does not support EBCDIC platforms");
}
}
@@ -31,79 +30,15 @@ sub _exception{
return $Config{perl_patchlevel} ? 0 : 1 # maintperl then no
}
-sub in_locale { $^H & ($locale::hint_bits || 0)}
-
-sub _get_locale_encoding {
- my $locale_encoding;
-
- # I18N::Langinfo isn't available everywhere
- eval {
- require I18N::Langinfo;
- I18N::Langinfo->import(qw(langinfo CODESET));
- $locale_encoding = langinfo(CODESET());
- };
-
- my $country_language;
-
- no warnings 'uninitialized';
-
- if (not $locale_encoding && in_locale()) {
- if ($ENV{LC_ALL} =~ /^([^.]+)\.([^.]+)$/) {
- ($country_language, $locale_encoding) = ($1, $2);
- } elsif ($ENV{LANG} =~ /^([^.]+)\.([^.]+)$/) {
- ($country_language, $locale_encoding) = ($1, $2);
- }
- # LANGUAGE affects only LC_MESSAGES only on glibc
- } elsif (not $locale_encoding) {
- if ($ENV{LC_ALL} =~ /\butf-?8\b/i ||
- $ENV{LANG} =~ /\butf-?8\b/i) {
- $locale_encoding = 'utf8';
- }
- # Could do more heuristics based on the country and language
- # parts of LC_ALL and LANG (the parts before the dot (if any)),
- # since we have Locale::Country and Locale::Language available.
- # TODO: get a database of Language -> Encoding mappings
- # (the Estonian database at http://www.eki.ee/letter/
- # would be excellent!) --jhi
- }
- if (defined $locale_encoding &&
- lc($locale_encoding) eq 'euc' &&
- defined $country_language) {
- if ($country_language =~ /^ja_JP|japan(?:ese)?$/i) {
- $locale_encoding = 'euc-jp';
- } elsif ($country_language =~ /^ko_KR|korean?$/i) {
- $locale_encoding = 'euc-kr';
- } elsif ($country_language =~ /^zh_CN|chin(?:a|ese)?$/i) {
- $locale_encoding = 'euc-cn';
- } elsif ($country_language =~ /^zh_TW|taiwan(?:ese)?$/i) {
- $locale_encoding = 'euc-tw';
- } else {
- require Carp;
- Carp::croak("encoding: Locale encoding '$locale_encoding' too ambiguous");
- }
- }
-
- return $locale_encoding;
-}
-
sub import {
my $class = shift;
my $name = shift;
- if ($name eq ':_get_locale_encoding') { # used by lib/open.pm
- my $caller = caller();
- {
- no strict 'refs';
- *{"${caller}::_get_locale_encoding"} = \&_get_locale_encoding;
- }
- return;
- }
- $name = _get_locale_encoding() if $name eq ':locale';
my %arg = @_;
- $name = $ENV{PERL_ENCODING} unless defined $name;
+ $name ||= $ENV{PERL_ENCODING};
my $enc = find_encoding($name);
unless (defined $enc) {
require Carp;
- Carp::croak("encoding: Unknown encoding '$name'");
+ Carp::croak("Unknown encoding '$name'");
}
$name = $enc->name; # canonize
unless ($arg{Filter}) {
@@ -127,14 +62,13 @@ sub import {
$status ;
});
};
- $@ == '' and DEBUG and warn "Filter installed";
- }
+ } DEBUG and warn "Filter installed";
defined ${^UNICODE} and ${^UNICODE} != 0 and return 1;
for my $h (qw(STDIN STDOUT)){
if ($arg{$h}){
unless (defined find_encoding($arg{$h})) {
require Carp;
- Carp::croak("encoding: Unknown encoding for $h, '$arg{$h}'");
+ Carp::croak("Unknown encoding for $h, '$arg{$h}'");
}
eval { binmode($h, ":raw :encoding($arg{$h})") };
}else{
@@ -199,14 +133,6 @@ encoding - allows you to write your script in non-ascii or non-utf8
use encoding "euc-jp", Filter=>1;
# now you can use kanji identifiers -- in euc-jp!
- # switch on locale -
- # note that this probably means that unless you have a complete control
- # over the environments the application is ever going to be run, you should
- # NOT use the feature of encoding pragma allowing you to write your script
- # in any recognized encoding because changing locale settings will wreck
- # the script; you can of course still use the other features of the pragma.
- use encoding ':locale';
-
=head1 ABSTRACT
Let's start with a bit of history: Perl 5.6.0 introduced Unicode
@@ -584,45 +510,11 @@ Arabic and Hebrew).
=back
-=head2 The Logic of :locale
-
-The logic of C<:locale> is as follows:
-
-=over 4
-
-=item 1.
-
-If the platform supports the langinfo(CODESET) interface, the codeset
-returned is used as the default encoding for the open pragma.
-
-=item 2.
-
-If 1. didn't work but we are under the locale pragma, the environment
-variables LC_ALL and LANG (in that order) are matched for encodings
-(the part after C<.>, if any), and if any found, that is used
-as the default encoding for the open pragma.
-
-=item 3.
-
-If 1. and 2. didn't work, the environment variables LC_ALL and LANG
-(in that order) are matched for anything looking like UTF-8, and if
-any found, C<:utf8> is used as the default encoding for the open
-pragma.
-
-=back
-
-If your locale environment variables (LC_ALL, LC_CTYPE, LANG)
-contain the strings 'UTF-8' or 'UTF8' (case-insensitive matching),
-the default encoding of your STDIN, STDOUT, and STDERR, and of
-B<any subsequent file open>, is UTF-8.
-
=head1 HISTORY
This pragma first appeared in Perl 5.8.0. For features that require
5.8.1 and better, see above.
-The C<:locale> subpragma was implemented in 2.01, or Perl 5.8.6.
-
=head1 SEE ALSO
L<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>,
diff --git a/ext/Encode/lib/Encode/Alias.pm b/ext/Encode/lib/Encode/Alias.pm
index 554adce303..a1cc25355c 100644
--- a/ext/Encode/lib/Encode/Alias.pm
+++ b/ext/Encode/lib/Encode/Alias.pm
@@ -2,7 +2,7 @@ package Encode::Alias;
use strict;
no warnings 'redefine';
use Encode;
-our $VERSION = do { my @r = (q$Revision: 2.1 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 2.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
sub DEBUG () { 0 }
use base qw(Exporter);
@@ -18,45 +18,45 @@ our @EXPORT =
our @Alias; # ordered matching list
our %Alias; # cached known aliases
-sub find_alias
-{
+sub find_alias{
my $class = shift;
my $find = shift;
- unless (exists $Alias{$find})
- {
+ unless (exists $Alias{$find}) {
$Alias{$find} = undef; # Recursion guard
- for (my $i=0; $i < @Alias; $i += 2)
- {
+ for (my $i=0; $i < @Alias; $i += 2){
my $alias = $Alias[$i];
my $val = $Alias[$i+1];
my $new;
- if (ref($alias) eq 'Regexp' && $find =~ $alias)
- {
+ if (ref($alias) eq 'Regexp' && $find =~ $alias){
DEBUG and warn "eval $val";
$new = eval $val;
DEBUG and $@ and warn "$val, $@";
- }
- elsif (ref($alias) eq 'CODE')
- {
+ }elsif (ref($alias) eq 'CODE'){
DEBUG and warn "$alias", "->", "($find)";
$new = $alias->($find);
- }
- elsif (lc($find) eq lc($alias))
- {
+ }elsif (lc($find) eq lc($alias)){
$new = $val;
}
- if (defined($new))
- {
+ if (defined($new)){
next if $new eq $find; # avoid (direct) recursion on bugs
DEBUG and warn "$alias, $new";
my $enc = (ref($new)) ? $new : Encode::find_encoding($new);
- if ($enc)
- {
+ if ($enc){
$Alias{$find} = $enc;
last;
}
}
}
+ # case insensitive search when canonical is not in all lowercase
+ # RT ticket #7835
+ unless ($Alias{$find}){
+ my $lcfind = lc($find);
+ for my $name (keys %Encode::Encoding, keys %Encode::ExtModule){
+ $lcfind eq lc($name) or next;
+ $Alias{$find} = Encode::find_encoding($name);
+ DEBUG and warn "$find => $name";
+ }
+ }
}
if (DEBUG){
my $name;
@@ -70,23 +70,19 @@ sub find_alias
return $Alias{$find};
}
-sub define_alias
-{
- while (@_)
- {
+sub define_alias{
+ while (@_){
my ($alias,$name) = splice(@_,0,2);
unshift(@Alias, $alias => $name); # newer one has precedence
- # clear %Alias cache to allow overrides
if (ref($alias)){
+ # clear %Alias cache to allow overrides
my @a = keys %Alias;
for my $k (@a){
- if (ref($alias) eq 'Regexp' && $k =~ $alias)
- {
+ if (ref($alias) eq 'Regexp' && $k =~ $alias){
DEBUG and warn "delete \$Alias\{$k\}";
delete $Alias{$k};
}
- elsif (ref($alias) eq 'CODE')
- {
+ elsif (ref($alias) eq 'CODE'){
DEBUG and warn "delete \$Alias\{$k\}";
delete $Alias{$alias->($name)};
}
@@ -99,7 +95,7 @@ sub define_alias
}
# Allow latin-1 style names as well
- # 0 1 2 3 4 5 6 7 8 9 10
+# 0 1 2 3 4 5 6 7 8 9 10
our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
# Allow winlatin1 style names as well
our %Winlatin2cp = (
@@ -124,7 +120,6 @@ sub undef_aliases{
sub init_aliases
{
undef_aliases();
-
# Try all-lower-case version should all else fails
define_alias( qr/^(.*)$/ => '"\L$1"' );
@@ -134,9 +129,9 @@ sub init_aliases
define_alias( qr/^UCS-?2-?(BE)?$/i => '"UCS-2BE"',
qr/^UCS-?4-?(BE|LE)?$/i => 'uc("UTF-32$1")',
qr/^iso-10646-1$/i => '"UCS-2BE"' );
- define_alias( qr/^UTF(16|32)-?BE$/i => '"UTF-$1BE"',
- qr/^UTF(16|32)-?LE$/i => '"UTF-$1LE"',
- qr/^UTF(16|32)$/i => '"UTF-$1"',
+ define_alias( qr/^UTF-?(16|32)-?BE$/i => '"UTF-$1BE"',
+ qr/^UTF-?(16|32)-?LE$/i => '"UTF-$1LE"',
+ qr/^UTF-?(16|32)$/i => '"UTF-$1"',
);
# ASCII
define_alias(qr/^(?:US-?)ascii$/i => '"ascii"');
@@ -211,6 +206,7 @@ sub init_aliases
define_alias( qr/\bujis$/i => '"euc-jp"' );
define_alias( qr/\bshift.*jis$/i => '"shiftjis"' );
define_alias( qr/\bsjis$/i => '"shiftjis"' );
+ define_alias( qr/\bwindows-31j$/i => '"cp932"' );
# for Encode::KR
define_alias( qr/\beuc.*kr$/i => '"euc-kr"' );
define_alias( qr/\bkr.*euc$/i => '"euc-kr"' );
diff --git a/ext/Encode/lib/Encode/Supported.pod b/ext/Encode/lib/Encode/Supported.pod
index 9280a97e72..7a535c6d58 100644
--- a/ext/Encode/lib/Encode/Supported.pod
+++ b/ext/Encode/lib/Encode/Supported.pod
@@ -664,7 +664,8 @@ probably has more rights for the name, though it may be objected
that Microsoft shouldn't have used JIS as part of the name
in the first place.
-Unambiguous name: C<CP932>. C<IANA> name (not used?): C<Windows-31J>.
+Unambiguous name: C<CP932>. C<IANA> name (also used by Mozilla, and
+provided as an alias by Encode): C<Windows-31J>.
Encode separately supports C<Shift_JIS> and C<cp932>.
diff --git a/ext/Encode/ucm/big5-hkscs.ucm b/ext/Encode/ucm/big5-hkscs.ucm
index 75bffc4c56..7bc366251f 100644
--- a/ext/Encode/ucm/big5-hkscs.ucm
+++ b/ext/Encode/ucm/big5-hkscs.ucm
@@ -1,5 +1,5 @@
#
-# $Id: big5-hkscs.ucm,v 2.1 2004/08/31 10:55:34 dankogai Exp dankogai $
+# $Id: big5-hkscs.ucm,v 2.1 2004/08/31 10:55:34 dankogai Exp $
#
<code_set_name> "big5-hkscs"
<mb_cur_min> 1