Upgraded Encode from 3.08 to 3.10

author: Neil Bowers <neilb@neilb.org> 2021-07-22 21:54:10 +0100
committer: Neil Bowers <neilb@neilb.org> 2021-07-22 21:54:10 +0100
commit: 41cb5863c312e0d002f7c4f0abc736434d74c13f (patch)
tree: e64a87c1da8d7acf6023227f4560a232bfb48e9a /cpan
parent: 994da94187afd0618d0c3e2c538ee84b5f4fca6f (diff)
download: perl-41cb5863c312e0d002f7c4f0abc736434d74c13f.tar.gz
5 files changed, 32 insertions, 37 deletions
diff --git a/cpan/Encode/Encode.pm b/cpan/Encode/Encode.pm
index d3eb3c1b11..a56a99947f 100644
--- a/cpan/Encode/Encode.pm
+++ b/cpan/Encode/Encode.pm
@@ -1,5 +1,5 @@
 #
-# $Id: Encode.pm,v 3.08 2020/12/02 01:27:44 dankogai Exp $
+# $Id: Encode.pm,v 3.10 2021/05/18 07:42:45 dankogai Exp dankogai $
 #
 package Encode;
 use strict;
@@ -7,7 +7,7 @@ use warnings;
 use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG};
 our $VERSION;
 BEGIN {
-    $VERSION = sprintf "%d.%02d", q$Revision: 3.08 $ =~ /(\d+)/g;
+    $VERSION = sprintf "%d.%02d", q$Revision: 3.10 $ =~ /(\d+)/g;
     require XSLoader;
     XSLoader::load( __PACKAGE__, $VERSION );
 }
@@ -202,18 +202,6 @@ if ($ON_EBCDIC) {
         $_[1] = '' if $chk;
         return $res;
     }
-} else {
-    package Encode::Internal;
-    use parent 'Encode::Encoding';
-    my $obj = bless { Name => "Internal" } => "Encode::Internal";
-    Encode::define_encoding($obj, 'Unicode');
-    sub decode {
-        my ( undef, $str, $chk ) = @_;
-        utf8::upgrade($str);
-        $_[1] = '' if $chk;
-        return $str;
-    }
-    *encode = \&decode;
 }
 
 {
@@ -499,19 +487,25 @@ followed by C<encode> as follows:
 
   $octets = encode_utf8($string);
 
+B<WARNING>: L<This function can produce invalid UTF-8!|/UTF-8 vs. utf8 vs. UTF8>
+Do not use it for data exchange.
+Unless you want Perl's older "lax" mode, prefer
+C<$octets = encode("UTF-8", $string)>.
+
 Equivalent to C<$octets = encode("utf8", $string)>.  The characters in
 $string are encoded in Perl's internal format, and the result is returned
 as a sequence of octets.  Because all possible characters in Perl have a
 (loose, not strict) utf8 representation, this function cannot fail.
 
-B<WARNING>: do not use this function for data exchange as it can produce
-not strict utf8 $octets! For strictly valid UTF-8 output use
-C<$octets = encode("UTF-8", $string)>.
-
 =head3 decode_utf8
 
   $string = decode_utf8($octets [, CHECK]);
 
+B<WARNING>: L<This function accepts invalid UTF-8!|/UTF-8 vs. utf8 vs. UTF8>
+Do not use it for data exchange.
+Unless you want Perl's older "lax" mode, prefer
+C<$string = decode("UTF-8", $octets [, CHECK])>.
+
 Equivalent to C<$string = decode("utf8", $octets [, CHECK])>.
 The sequence of octets represented by $octets is decoded
 from (loose, not strict) utf8 into a sequence of logical characters.
@@ -519,10 +513,6 @@ Because not all sequences of octets are valid not strict utf8,
 it is quite possible for this function to fail.
 For CHECK, see L</"Handling Malformed Data">.
 
-B<WARNING>: do not use this function for data exchange as it can produce
-$string with not strict utf8 representation! For strictly valid UTF-8
-$string representation use C<$string = decode("UTF-8", $octets [, CHECK])>.
-
 B<CAVEAT>: the input I<$octets> might be modified in-place depending on
 what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be
 left unchanged.
@@ -927,6 +917,20 @@ important distinction between C<"UTF-8"> and C<"utf8">.
   encode("utf8",  "\x{FFFF_FFFF}", 1); # okay
   encode("UTF-8", "\x{FFFF_FFFF}", 1); # croaks
 
+This distinction is also important for decoding. In the following,
+C<$s> stores character U+200000, which exceeds UTF-8's allowed range.
+C<$s> thus stores an invalid Unicode code point:
+
+  $s = decode("utf8", "\xf8\x88\x80\x80\x80");
+
+C<"UTF-8">, by contrast, will either coerce the input to something valid:
+
+    $s = decode("UTF-8", "\xf8\x88\x80\x80\x80"); # U+FFFD
+
+... or croak:
+
+    decode("UTF-8", "\xf8\x88\x80\x80\x80", FB_CROAK|LEAVE_SRC);
+
 In the C<Encode> module, C<"UTF-8"> is actually a canonical name for
 C<"utf-8-strict">.  That hyphen between the C<"UTF"> and the C<"8"> is
 critical; without it, C<Encode> goes "liberal" and (perhaps overly-)permissive:
diff --git a/cpan/Encode/JP/JP.pm b/cpan/Encode/JP/JP.pm
index 4251170c56..09efaaa8c3 100644
--- a/cpan/Encode/JP/JP.pm
+++ b/cpan/Encode/JP/JP.pm
@@ -7,7 +7,7 @@ BEGIN {
 use strict;
 use warnings;
 use Encode;
-our $VERSION = do { my @r = ( q$Revision: 2.4 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
+our $VERSION = do { my @r = ( q$Revision: 2.5 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
 
 use XSLoader;
 XSLoader::load( __PACKAGE__, $VERSION );
@@ -50,8 +50,8 @@ supported are as follows.
   cp932       /\bwindows-31j$/i Code Page 932
                                 = Shift JIS + MS/IBM vendor mappings
   jis0201-raw                   JIS0201, raw format
-  jis0208-raw                   JIS0201, raw format
-  jis0212-raw                   JIS0201, raw format
+  jis0208-raw                   JIS0208, raw format
+  jis0212-raw                   JIS0212, raw format
   --------------------------------------------------------------------
 
 =head1 DESCRIPTION
diff --git a/cpan/Encode/Makefile.PL b/cpan/Encode/Makefile.PL
index f9c774845f..f711d6dd4f 100644
--- a/cpan/Encode/Makefile.PL
+++ b/cpan/Encode/Makefile.PL
@@ -1,5 +1,5 @@
 #
-# $Id: Makefile.PL,v 2.23 2020/12/02 01:28:17 dankogai Exp dankogai $
+# $Id: Makefile.PL,v 2.23 2020/12/02 01:28:17 dankogai Exp $
 #
 use 5.007003;
 use strict;
diff --git a/cpan/Encode/bin/encguess b/cpan/Encode/bin/encguess
index 19a0673e76..440733eea0 100644
--- a/cpan/Encode/bin/encguess
+++ b/cpan/Encode/bin/encguess
@@ -61,7 +61,7 @@ encguess - guess character encodings of files
 
 =head1 VERSION
 
-$Id: encguess,v 0.3 2020/12/02 01:28:17 dankogai Exp dankogai $
+$Id: encguess,v 0.3 2020/12/02 01:28:17 dankogai Exp $
 
 =head1 SYNOPSIS
 
diff --git a/cpan/Encode/t/Encode.t b/cpan/Encode/t/Encode.t
index 0536b4b714..f60f37d596 100644
--- a/cpan/Encode/t/Encode.t
+++ b/cpan/Encode/t/Encode.t
@@ -25,19 +25,10 @@ my @character_set = ('0'..'9', 'A'..'Z', 'a'..'z');
 my @source = qw(ascii iso8859-1 cp1250);
 my @destiny = qw(cp1047 cp37 posix-bc);
 my @ebcdic_sets = qw(cp1047 cp37 posix-bc);
-plan tests => 38+$n*@encodings + 2*@source*@destiny*@character_set + 2*@ebcdic_sets*256 + 6 + 3 + 3*8 + 2;
+plan tests => 38+$n*@encodings + 2*@source*@destiny*@character_set + 2*@ebcdic_sets*256 + 6 + 3*8;
 
 my $str = join('',map(chr($_),0x20..0x7E));
 my $cpy = $str;
-is length($str),from_to($cpy,'iso8859-1','Unicode'),"Length Wrong";
-is $cpy,$str,"ASCII mangled by translating from iso8859-1 to Unicode";
-$cpy = $str;
-is from_to($cpy,'Unicode','iso8859-1'),length($str),"Length wrong";
-is $cpy,$str,"ASCII mangled by translating from Unicode to iso8859-1";
-
-$str = join('',map(chr($_),0xa0..0xff));
-$cpy = $str;
-is length($str),from_to($cpy,'iso8859-1','Unicode'),"Length Wrong";
 
 my $sym = Encode->getEncoding('symbol');
 my $uni = $sym->decode(encode(ascii => 'a'));
author	Neil Bowers <neilb@neilb.org>	2021-07-22 21:54:10 +0100
committer	Neil Bowers <neilb@neilb.org>	2021-07-22 21:54:10 +0100
commit	41cb5863c312e0d002f7c4f0abc736434d74c13f (patch)
tree	e64a87c1da8d7acf6023227f4560a232bfb48e9a /cpan
parent	994da94187afd0618d0c3e2c538ee84b5f4fca6f (diff)
download	perl-41cb5863c312e0d002f7c4f0abc736434d74c13f.tar.gz