diff options
author | Neil Bowers <neilb@neilb.org> | 2021-07-22 21:54:10 +0100 |
---|---|---|
committer | Neil Bowers <neilb@neilb.org> | 2021-07-22 21:54:10 +0100 |
commit | 41cb5863c312e0d002f7c4f0abc736434d74c13f (patch) | |
tree | e64a87c1da8d7acf6023227f4560a232bfb48e9a /cpan | |
parent | 994da94187afd0618d0c3e2c538ee84b5f4fca6f (diff) | |
download | perl-41cb5863c312e0d002f7c4f0abc736434d74c13f.tar.gz |
Upgraded Encode from 3.08 to 3.10
Diffstat (limited to 'cpan')
-rw-r--r-- | cpan/Encode/Encode.pm | 48 |
-rw-r--r-- | cpan/Encode/JP/JP.pm | 6 |
-rw-r--r-- | cpan/Encode/Makefile.PL | 2 |
-rw-r--r-- | cpan/Encode/bin/encguess | 2 |
-rw-r--r-- | cpan/Encode/t/Encode.t | 11 |
5 files changed, 32 insertions, 37 deletions
diff --git a/cpan/Encode/Encode.pm b/cpan/Encode/Encode.pm index d3eb3c1b11..a56a99947f 100644 --- a/cpan/Encode/Encode.pm +++ b/cpan/Encode/Encode.pm @@ -1,5 +1,5 @@ # -# $Id: Encode.pm,v 3.08 2020/12/02 01:27:44 dankogai Exp $ +# $Id: Encode.pm,v 3.10 2021/05/18 07:42:45 dankogai Exp dankogai $ # package Encode; use strict; @@ -7,7 +7,7 @@ use warnings; use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG}; our $VERSION; BEGIN { - $VERSION = sprintf "%d.%02d", q$Revision: 3.08 $ =~ /(\d+)/g; + $VERSION = sprintf "%d.%02d", q$Revision: 3.10 $ =~ /(\d+)/g; require XSLoader; XSLoader::load( __PACKAGE__, $VERSION ); } @@ -202,18 +202,6 @@ if ($ON_EBCDIC) { $_[1] = '' if $chk; return $res; } -} else { - package Encode::Internal; - use parent 'Encode::Encoding'; - my $obj = bless { Name => "Internal" } => "Encode::Internal"; - Encode::define_encoding($obj, 'Unicode'); - sub decode { - my ( undef, $str, $chk ) = @_; - utf8::upgrade($str); - $_[1] = '' if $chk; - return $str; - } - *encode = \&decode; } { @@ -499,19 +487,25 @@ followed by C<encode> as follows: $octets = encode_utf8($string); +B<WARNING>: L<This function can produce invalid UTF-8!|/UTF-8 vs. utf8 vs. UTF8> +Do not use it for data exchange. +Unless you want Perl's older "lax" mode, prefer +C<$octets = encode("UTF-8", $string)>. + Equivalent to C<$octets = encode("utf8", $string)>. The characters in $string are encoded in Perl's internal format, and the result is returned as a sequence of octets. Because all possible characters in Perl have a (loose, not strict) utf8 representation, this function cannot fail. -B<WARNING>: do not use this function for data exchange as it can produce -not strict utf8 $octets! For strictly valid UTF-8 output use -C<$octets = encode("UTF-8", $string)>. - =head3 decode_utf8 $string = decode_utf8($octets [, CHECK]); +B<WARNING>: L<This function accepts invalid UTF-8!|/UTF-8 vs. utf8 vs. UTF8> +Do not use it for data exchange. 
+Unless you want Perl's older "lax" mode, prefer +C<$string = decode("UTF-8", $octets [, CHECK])>. + Equivalent to C<$string = decode("utf8", $octets [, CHECK])>. The sequence of octets represented by $octets is decoded from (loose, not strict) utf8 into a sequence of logical characters. @@ -519,10 +513,6 @@ Because not all sequences of octets are valid not strict utf8, it is quite possible for this function to fail. For CHECK, see L</"Handling Malformed Data">. -B<WARNING>: do not use this function for data exchange as it can produce -$string with not strict utf8 representation! For strictly valid UTF-8 -$string representation use C<$string = decode("UTF-8", $octets [, CHECK])>. - B<CAVEAT>: the input I<$octets> might be modified in-place depending on what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be left unchanged. @@ -927,6 +917,20 @@ important distinction between C<"UTF-8"> and C<"utf8">. encode("utf8", "\x{FFFF_FFFF}", 1); # okay encode("UTF-8", "\x{FFFF_FFFF}", 1); # croaks +This distinction is also important for decoding. In the following, +C<$s> stores character U+200000, which exceeds UTF-8's allowed range. +C<$s> thus stores an invalid Unicode code point: + + $s = decode("utf8", "\xf8\x88\x80\x80\x80"); + +C<"UTF-8">, by contrast, will either coerce the input to something valid: + + $s = decode("UTF-8", "\xf8\x88\x80\x80\x80"); # U+FFFD + +.. or croak: + + decode("UTF-8", "\xf8\x88\x80\x80\x80", FB_CROAK|LEAVE_SRC); + In the C<Encode> module, C<"UTF-8"> is actually a canonical name for C<"utf-8-strict">. That hyphen between the C<"UTF"> and the C<"8"> is critical; without it, C<Encode> goes "liberal" and (perhaps overly-)permissive: diff --git a/cpan/Encode/JP/JP.pm b/cpan/Encode/JP/JP.pm index 4251170c56..09efaaa8c3 100644 --- a/cpan/Encode/JP/JP.pm +++ b/cpan/Encode/JP/JP.pm @@ -7,7 +7,7 @@ BEGIN { use strict; use warnings; use Encode; -our $VERSION = do { my @r = ( q$Revision: 2.4 $ =~ /\d+/g ); sprintf "%d." . 
"%02d" x $#r, @r }; +our $VERSION = do { my @r = ( q$Revision: 2.5 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; use XSLoader; XSLoader::load( __PACKAGE__, $VERSION ); @@ -50,8 +50,8 @@ supported are as follows. cp932 /\bwindows-31j$/i Code Page 932 = Shift JIS + MS/IBM vendor mappings jis0201-raw JIS0201, raw format - jis0208-raw JIS0201, raw format - jis0212-raw JIS0201, raw format + jis0208-raw JIS0208, raw format + jis0212-raw JIS0212, raw format -------------------------------------------------------------------- =head1 DESCRIPTION diff --git a/cpan/Encode/Makefile.PL b/cpan/Encode/Makefile.PL index f9c774845f..f711d6dd4f 100644 --- a/cpan/Encode/Makefile.PL +++ b/cpan/Encode/Makefile.PL @@ -1,5 +1,5 @@ # -# $Id: Makefile.PL,v 2.23 2020/12/02 01:28:17 dankogai Exp dankogai $ +# $Id: Makefile.PL,v 2.23 2020/12/02 01:28:17 dankogai Exp $ # use 5.007003; use strict; diff --git a/cpan/Encode/bin/encguess b/cpan/Encode/bin/encguess index 19a0673e76..440733eea0 100644 --- a/cpan/Encode/bin/encguess +++ b/cpan/Encode/bin/encguess @@ -61,7 +61,7 @@ encguess - guess character encodings of files =head1 VERSION -$Id: encguess,v 0.3 2020/12/02 01:28:17 dankogai Exp dankogai $ +$Id: encguess,v 0.3 2020/12/02 01:28:17 dankogai Exp $ =head1 SYNOPSIS diff --git a/cpan/Encode/t/Encode.t b/cpan/Encode/t/Encode.t index 0536b4b714..f60f37d596 100644 --- a/cpan/Encode/t/Encode.t +++ b/cpan/Encode/t/Encode.t @@ -25,19 +25,10 @@ my @character_set = ('0'..'9', 'A'..'Z', 'a'..'z'); my @source = qw(ascii iso8859-1 cp1250); my @destiny = qw(cp1047 cp37 posix-bc); my @ebcdic_sets = qw(cp1047 cp37 posix-bc); -plan tests => 38+$n*@encodings + 2*@source*@destiny*@character_set + 2*@ebcdic_sets*256 + 6 + 3 + 3*8 + 2; +plan tests => 38+$n*@encodings + 2*@source*@destiny*@character_set + 2*@ebcdic_sets*256 + 6 + 3*8; my $str = join('',map(chr($_),0x20..0x7E)); my $cpy = $str; -is length($str),from_to($cpy,'iso8859-1','Unicode'),"Length Wrong"; -is $cpy,$str,"ASCII mangled by translating 
from iso8859-1 to Unicode"; -$cpy = $str; -is from_to($cpy,'Unicode','iso8859-1'),length($str),"Length wrong"; -is $cpy,$str,"ASCII mangled by translating from Unicode to iso8859-1"; - -$str = join('',map(chr($_),0xa0..0xff)); -$cpy = $str; -is length($str),from_to($cpy,'iso8859-1','Unicode'),"Length Wrong"; my $sym = Encode->getEncoding('symbol'); my $uni = $sym->decode(encode(ascii => 'a')); |