diff options
author | Dan Kogai <dankogai@dan.co.jp> | 2003-06-28 10:20:59 +0900 |
---|---|---|
committer | Rafael Garcia-Suarez <rgarciasuarez@gmail.com> | 2003-06-28 15:47:22 +0000 |
commit | 23f3589e21445e9141901c2894bc97b457493332 (patch) | |
tree | e937011553ef2e0d3e0eac71acbd0730096ffe29 /ext/Encode | |
parent | b4ab917c3d812d8e61d365bfa48d9bf7675bc113 (diff) | |
download | perl-23f3589e21445e9141901c2894bc97b457493332.tar.gz |
[Encode] pre-1.97 patches
Message-Id: <56D5BFEE-A8BB-11D7-9092-000393AE4244@dan.co.jp>
p4raw-id: //depot/perl@19871
Diffstat (limited to 'ext/Encode')
-rw-r--r-- | ext/Encode/Changes | 10 | ||||
-rw-r--r-- | ext/Encode/Encode.pm | 2 | ||||
-rw-r--r-- | ext/Encode/lib/Encode/Guess.pm | 124 |
3 files changed, 78 insertions, 58 deletions
diff --git a/ext/Encode/Changes b/ext/Encode/Changes index 18f5788e92..7251f5d365 100644 --- a/ext/Encode/Changes +++ b/ext/Encode/Changes @@ -3,6 +3,16 @@ # $Id: Changes,v 1.96 2003/06/18 09:29:02 dankogai Exp $ # $Revision: 1.96 $ $Date: 2003/06/18 09:29:02 $ +! lib/Encode/Guess.pm + $Encode::Guess::NoUTFAutoGuess is added so you can turn off + automatic utf(8|16|32) guessing -- originally by Autrijus + Message-Id: <20030626162731.GA2077@not.autrijus.org> +! Encode.pm + Addressed the following; + Subject: [perl #22835] FB_QUIET doesn't work with Encode::encode + Message-Id: <rt-22835-59975.6.8650775354304@rt.perl.org> + +1.96 2003/06/18 09:29:02 ! lib/Encode/JP/JP.pm t/guess.t m/(...)/ in void context then $1 is considered a Bad Thing Message-Id: <B5AB34D0-A019-11D7-AF03-000393AE4244@dan.co.jp> diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm index 57bcc2b0d2..db74b6a194 100644 --- a/ext/Encode/Encode.pm +++ b/ext/Encode/Encode.pm @@ -147,7 +147,7 @@ sub encode($$;$) Carp::croak("Unknown encoding '$name'"); } my $octets = $enc->encode($string,$check); - return undef if ($check && length($string)); + $_[1] = $string if $check; return $octets; } diff --git a/ext/Encode/lib/Encode/Guess.pm b/ext/Encode/lib/Encode/Guess.pm index fc8d267d02..5858f819cd 100644 --- a/ext/Encode/lib/Encode/Guess.pm +++ b/ext/Encode/lib/Encode/Guess.pm @@ -18,6 +18,7 @@ sub needs_lines { 1 } sub perlio_ok { 0 } our @EXPORT = qw(guess_encoding); +our $NoUTFAutoGuess = 0; sub import { # Exporter not used so we do it on our own my $callpkg = caller; @@ -70,75 +71,80 @@ sub guess { return unless defined $octet and length $octet; # cheat 0: utf8 flag; - Encode::is_utf8($octet) and return find_encoding('utf8'); + if ( Encode::is_utf8($octet) ) { + return find_encoding('utf8') unless $NoUTFAutoGuess; + Encode::_utf8_off($octet); + } # cheat 1: BOM use Encode::Unicode; - my $BOM = unpack('n', $octet); - return find_encoding('UTF-16') - if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe)); - $BOM = unpack('N', $octet); - return find_encoding('UTF-32') - if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe0000)); + unless ($NoUTFAutoGuess) { + my $BOM = unpack('n', $octet); + return find_encoding('UTF-16') + if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe)); + $BOM = unpack('N', $octet); + return find_encoding('UTF-32') + if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe0000)); + if ($octet =~ /\x00/o){ # if \x00 found, we assume UTF-(16|32)(BE|LE) + my $utf; + my ($be, $le) = (0, 0); + if ($octet =~ /\x00\x00/o){ # UTF-32(BE|LE) assumed + $utf = "UTF-32"; + for my $char (unpack('N*', $octet)){ + $char & 0x0000ffff and $be++; + $char & 0xffff0000 and $le++; + } + }else{ # UTF-16(BE|LE) assumed + $utf = "UTF-16"; + for my $char (unpack('n*', $octet)){ + $char & 0x00ff and $be++; + $char & 0xff00 and $le++; + } + } + $DEBUG and warn "$utf, be == $be, le == $le"; + $be == $le + and return + "Encodings ambiguous between $utf BE and LE ($be, $le)"; + $utf .= ($be > $le) ? 'BE' : 'LE'; + return find_encoding($utf); + } + } my %try = %{$obj->{Suspects}}; for my $c (@_){ my $e = find_encoding($c) or die "Unknown encoding: $c"; $try{$e->name} = $e; $DEBUG and warn "Added: ", $e->name; } - if ($octet =~ /\x00/o){ # if \x00 found, we assume UTF-(16|32)(BE|LE) - my $utf; - my ($be, $le) = (0, 0); - if ($octet =~ /\x00\x00/o){ # UTF-32(BE|LE) assumed - $utf = "UTF-32"; - for my $char (unpack('N*', $octet)){ - $char & 0x0000ffff and $be++; - $char & 0xffff0000 and $le++; - } - }else{ # UTF-16(BE|LE) assumed - $utf = "UTF-16"; - for my $char (unpack('n*', $octet)){ - $char & 0x00ff and $be++; - $char & 0xff00 and $le++; + my $nline = 1; + for my $line (split /\r\n?|\n/, $octet){ + # cheat 2 -- \e in the string + if ($line =~ /\e/o){ + my @keys = keys %try; + delete @try{qw/utf8 ascii/}; + for my $k (@keys){ + ref($try{$k}) eq 'Encode::XS' and delete $try{$k}; } } - $DEBUG and warn "$utf, be == $be, le == $le"; - $be == $le - and return "Encodings ambiguous between $utf BE and LE ($be, $le)"; - $utf .= ($be > $le) ? 'BE' : 'LE'; - return find_encoding($utf); - }else{ - my $nline = 1; - for my $line (split /\r\n?|\n/, $octet){ - # cheat 2 -- \e in the string - if ($line =~ /\e/o){ - my @keys = keys %try; - delete @try{qw/utf8 ascii/}; - for my $k (@keys){ - ref($try{$k}) eq 'Encode::XS' and delete $try{$k}; - } - } - my %ok = %try; - # warn join(",", keys %try); - for my $k (keys %try){ - my $scratch = $line; - $try{$k}->decode($scratch, FB_QUIET); - if ($scratch eq ''){ - $DEBUG and warn sprintf("%4d:%-24s ok\n", $nline, $k); - }else{ - use bytes (); - $DEBUG and - warn sprintf("%4d:%-24s not ok; %d bytes left\n", - $nline, $k, bytes::length($scratch)); - delete $ok{$k}; - } + my %ok = %try; + # warn join(",", keys %try); + for my $k (keys %try){ + my $scratch = $line; + $try{$k}->decode($scratch, FB_QUIET); + if ($scratch eq ''){ + $DEBUG and warn sprintf("%4d:%-24s ok\n", $nline, $k); + }else{ + use bytes (); + $DEBUG and + warn sprintf("%4d:%-24s not ok; %d bytes left\n", + $nline, $k, bytes::length($scratch)); + delete $ok{$k}; } - %ok or return "No appropriate encodings found!"; - if (scalar(keys(%ok)) == 1){ - my ($retval) = values(%ok); - return $retval; - } - %try = %ok; $nline++; } + %ok or return "No appropriate encodings found!"; + if (scalar(keys(%ok)) == 1){ + my ($retval) = values(%ok); + return $retval; + } + %try = %ok; $nline++; } $try{ascii} or return "Encodings too ambiguous: ", join(" or ", keys %try); @@ -189,6 +195,10 @@ canonical names or aliases. # tries all major Japanese Encodings as well use Encode::Guess qw/euc-jp shiftjis 7bit-jis/; +If the C<$Encode::Guess::NoUTFAutoGuess> variable is set to a true +value, no heuristics will be applied to UTF8/16/32, and the result +will be limited to the suspects and C<ascii>. + =over 4 =item Encode::Guess->set_suspects |