summaryrefslogtreecommitdiff
path: root/ext/Encode
diff options
context:
space:
mode:
authorDan Kogai <dankogai@dan.co.jp>2003-06-28 10:20:59 +0900
committerRafael Garcia-Suarez <rgarciasuarez@gmail.com>2003-06-28 15:47:22 +0000
commit23f3589e21445e9141901c2894bc97b457493332 (patch)
treee937011553ef2e0d3e0eac71acbd0730096ffe29 /ext/Encode
parentb4ab917c3d812d8e61d365bfa48d9bf7675bc113 (diff)
downloadperl-23f3589e21445e9141901c2894bc97b457493332.tar.gz
[Encode] pre-1.97 patches
Message-Id: <56D5BFEE-A8BB-11D7-9092-000393AE4244@dan.co.jp> p4raw-id: //depot/perl@19871
Diffstat (limited to 'ext/Encode')
-rw-r--r--ext/Encode/Changes10
-rw-r--r--ext/Encode/Encode.pm2
-rw-r--r--ext/Encode/lib/Encode/Guess.pm124
3 files changed, 78 insertions, 58 deletions
diff --git a/ext/Encode/Changes b/ext/Encode/Changes
index 18f5788e92..7251f5d365 100644
--- a/ext/Encode/Changes
+++ b/ext/Encode/Changes
@@ -3,6 +3,16 @@
# $Id: Changes,v 1.96 2003/06/18 09:29:02 dankogai Exp $
#
$Revision: 1.96 $ $Date: 2003/06/18 09:29:02 $
+! lib/Encode/Guess.pm
+ $Encode::Guess::NoUTFAutoGuess is added so you can turn off
+ automatic utf(8|16|32) guessing -- originally by Autrijus
+ Message-Id: <20030626162731.GA2077@not.autrijus.org>
+! Encode.pm
+ Addressed the following;
+ Subject: [perl #22835] FB_QUIET doesn't work with Encode::encode
+ Message-Id: <rt-22835-59975.6.8650775354304@rt.perl.org>
+
+1.96 2003/06/18 09:29:02
! lib/Encode/JP/JP.pm t/guess.t
m/(...)/ in void context then $1 is considered a Bad Thing
Message-Id: <B5AB34D0-A019-11D7-AF03-000393AE4244@dan.co.jp>
diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm
index 57bcc2b0d2..db74b6a194 100644
--- a/ext/Encode/Encode.pm
+++ b/ext/Encode/Encode.pm
@@ -147,7 +147,7 @@ sub encode($$;$)
Carp::croak("Unknown encoding '$name'");
}
my $octets = $enc->encode($string,$check);
- return undef if ($check && length($string));
+ $_[1] = $string if $check;
return $octets;
}
diff --git a/ext/Encode/lib/Encode/Guess.pm b/ext/Encode/lib/Encode/Guess.pm
index fc8d267d02..5858f819cd 100644
--- a/ext/Encode/lib/Encode/Guess.pm
+++ b/ext/Encode/lib/Encode/Guess.pm
@@ -18,6 +18,7 @@ sub needs_lines { 1 }
sub perlio_ok { 0 }
our @EXPORT = qw(guess_encoding);
+our $NoUTFAutoGuess = 0;
sub import { # Exporter not used so we do it on our own
my $callpkg = caller;
@@ -70,75 +71,80 @@ sub guess {
return unless defined $octet and length $octet;
# cheat 0: utf8 flag;
- Encode::is_utf8($octet) and return find_encoding('utf8');
+ if ( Encode::is_utf8($octet) ) {
+ return find_encoding('utf8') unless $NoUTFAutoGuess;
+ Encode::_utf8_off($octet);
+ }
# cheat 1: BOM
use Encode::Unicode;
- my $BOM = unpack('n', $octet);
- return find_encoding('UTF-16')
- if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe));
- $BOM = unpack('N', $octet);
- return find_encoding('UTF-32')
- if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe0000));
+ unless ($NoUTFAutoGuess) {
+ my $BOM = unpack('n', $octet);
+ return find_encoding('UTF-16')
+ if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe));
+ $BOM = unpack('N', $octet);
+ return find_encoding('UTF-32')
+ if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe0000));
+ if ($octet =~ /\x00/o){ # if \x00 found, we assume UTF-(16|32)(BE|LE)
+ my $utf;
+ my ($be, $le) = (0, 0);
+ if ($octet =~ /\x00\x00/o){ # UTF-32(BE|LE) assumed
+ $utf = "UTF-32";
+ for my $char (unpack('N*', $octet)){
+ $char & 0x0000ffff and $be++;
+ $char & 0xffff0000 and $le++;
+ }
+ }else{ # UTF-16(BE|LE) assumed
+ $utf = "UTF-16";
+ for my $char (unpack('n*', $octet)){
+ $char & 0x00ff and $be++;
+ $char & 0xff00 and $le++;
+ }
+ }
+ $DEBUG and warn "$utf, be == $be, le == $le";
+ $be == $le
+ and return
+ "Encodings ambiguous between $utf BE and LE ($be, $le)";
+ $utf .= ($be > $le) ? 'BE' : 'LE';
+ return find_encoding($utf);
+ }
+ }
my %try = %{$obj->{Suspects}};
for my $c (@_){
my $e = find_encoding($c) or die "Unknown encoding: $c";
$try{$e->name} = $e;
$DEBUG and warn "Added: ", $e->name;
}
- if ($octet =~ /\x00/o){ # if \x00 found, we assume UTF-(16|32)(BE|LE)
- my $utf;
- my ($be, $le) = (0, 0);
- if ($octet =~ /\x00\x00/o){ # UTF-32(BE|LE) assumed
- $utf = "UTF-32";
- for my $char (unpack('N*', $octet)){
- $char & 0x0000ffff and $be++;
- $char & 0xffff0000 and $le++;
- }
- }else{ # UTF-16(BE|LE) assumed
- $utf = "UTF-16";
- for my $char (unpack('n*', $octet)){
- $char & 0x00ff and $be++;
- $char & 0xff00 and $le++;
+ my $nline = 1;
+ for my $line (split /\r\n?|\n/, $octet){
+ # cheat 2 -- \e in the string
+ if ($line =~ /\e/o){
+ my @keys = keys %try;
+ delete @try{qw/utf8 ascii/};
+ for my $k (@keys){
+ ref($try{$k}) eq 'Encode::XS' and delete $try{$k};
}
}
- $DEBUG and warn "$utf, be == $be, le == $le";
- $be == $le
- and return "Encodings ambiguous between $utf BE and LE ($be, $le)";
- $utf .= ($be > $le) ? 'BE' : 'LE';
- return find_encoding($utf);
- }else{
- my $nline = 1;
- for my $line (split /\r\n?|\n/, $octet){
- # cheat 2 -- \e in the string
- if ($line =~ /\e/o){
- my @keys = keys %try;
- delete @try{qw/utf8 ascii/};
- for my $k (@keys){
- ref($try{$k}) eq 'Encode::XS' and delete $try{$k};
- }
- }
- my %ok = %try;
- # warn join(",", keys %try);
- for my $k (keys %try){
- my $scratch = $line;
- $try{$k}->decode($scratch, FB_QUIET);
- if ($scratch eq ''){
- $DEBUG and warn sprintf("%4d:%-24s ok\n", $nline, $k);
- }else{
- use bytes ();
- $DEBUG and
- warn sprintf("%4d:%-24s not ok; %d bytes left\n",
- $nline, $k, bytes::length($scratch));
- delete $ok{$k};
- }
+ my %ok = %try;
+ # warn join(",", keys %try);
+ for my $k (keys %try){
+ my $scratch = $line;
+ $try{$k}->decode($scratch, FB_QUIET);
+ if ($scratch eq ''){
+ $DEBUG and warn sprintf("%4d:%-24s ok\n", $nline, $k);
+ }else{
+ use bytes ();
+ $DEBUG and
+ warn sprintf("%4d:%-24s not ok; %d bytes left\n",
+ $nline, $k, bytes::length($scratch));
+ delete $ok{$k};
}
- %ok or return "No appropriate encodings found!";
- if (scalar(keys(%ok)) == 1){
- my ($retval) = values(%ok);
- return $retval;
- }
- %try = %ok; $nline++;
}
+ %ok or return "No appropriate encodings found!";
+ if (scalar(keys(%ok)) == 1){
+ my ($retval) = values(%ok);
+ return $retval;
+ }
+ %try = %ok; $nline++;
}
$try{ascii} or
return "Encodings too ambiguous: ", join(" or ", keys %try);
@@ -189,6 +195,10 @@ canonical names or aliases.
# tries all major Japanese Encodings as well
use Encode::Guess qw/euc-jp shiftjis 7bit-jis/;
+If the C<$Encode::Guess::NoUTFAutoGuess> variable is set to a true
+value, no heuristics will be applied to UTF8/16/32, and the result
+will be limited to the suspects and C<ascii>.
+
=over 4
=item Encode::Guess->set_suspects