summary refs log tree commit diff
path: root/cpan
diff options
context:
space:
mode:
author Neil Bowers <neilb@neilb.org> 2021-07-22 21:54:10 +0100
committer Neil Bowers <neilb@neilb.org> 2021-07-22 21:54:10 +0100
commit 41cb5863c312e0d002f7c4f0abc736434d74c13f (patch)
tree e64a87c1da8d7acf6023227f4560a232bfb48e9a /cpan
parent 994da94187afd0618d0c3e2c538ee84b5f4fca6f (diff)
download perl-41cb5863c312e0d002f7c4f0abc736434d74c13f.tar.gz
Upgraded Encode from 3.08 to 3.10
Diffstat (limited to 'cpan')
-rw-r--r-- cpan/Encode/Encode.pm 48
-rw-r--r-- cpan/Encode/JP/JP.pm 6
-rw-r--r-- cpan/Encode/Makefile.PL 2
-rw-r--r-- cpan/Encode/bin/encguess 2
-rw-r--r-- cpan/Encode/t/Encode.t 11
5 files changed, 32 insertions, 37 deletions
diff --git a/cpan/Encode/Encode.pm b/cpan/Encode/Encode.pm
index d3eb3c1b11..a56a99947f 100644
--- a/cpan/Encode/Encode.pm
+++ b/cpan/Encode/Encode.pm
@@ -1,5 +1,5 @@
#
-# $Id: Encode.pm,v 3.08 2020/12/02 01:27:44 dankogai Exp $
+# $Id: Encode.pm,v 3.10 2021/05/18 07:42:45 dankogai Exp dankogai $
#
package Encode;
use strict;
@@ -7,7 +7,7 @@ use warnings;
use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG};
our $VERSION;
BEGIN {
- $VERSION = sprintf "%d.%02d", q$Revision: 3.08 $ =~ /(\d+)/g;
+ $VERSION = sprintf "%d.%02d", q$Revision: 3.10 $ =~ /(\d+)/g;
require XSLoader;
XSLoader::load( __PACKAGE__, $VERSION );
}
@@ -202,18 +202,6 @@ if ($ON_EBCDIC) {
$_[1] = '' if $chk;
return $res;
}
-} else {
- package Encode::Internal;
- use parent 'Encode::Encoding';
- my $obj = bless { Name => "Internal" } => "Encode::Internal";
- Encode::define_encoding($obj, 'Unicode');
- sub decode {
- my ( undef, $str, $chk ) = @_;
- utf8::upgrade($str);
- $_[1] = '' if $chk;
- return $str;
- }
- *encode = \&decode;
}
{
@@ -499,19 +487,25 @@ followed by C<encode> as follows:
$octets = encode_utf8($string);
+B<WARNING>: L<This function can produce invalid UTF-8!|/UTF-8 vs. utf8 vs. UTF8>
+Do not use it for data exchange.
+Unless you want Perl's older "lax" mode, prefer
+C<$octets = encode("UTF-8", $string)>.
+
Equivalent to C<$octets = encode("utf8", $string)>. The characters in
$string are encoded in Perl's internal format, and the result is returned
as a sequence of octets. Because all possible characters in Perl have a
(loose, not strict) utf8 representation, this function cannot fail.
-B<WARNING>: do not use this function for data exchange as it can produce
-not strict utf8 $octets! For strictly valid UTF-8 output use
-C<$octets = encode("UTF-8", $string)>.
-
=head3 decode_utf8
$string = decode_utf8($octets [, CHECK]);
+B<WARNING>: L<This function accepts invalid UTF-8!|/UTF-8 vs. utf8 vs. UTF8>
+Do not use it for data exchange.
+Unless you want Perl's older "lax" mode, prefer
+C<$string = decode("UTF-8", $octets [, CHECK])>.
+
Equivalent to C<$string = decode("utf8", $octets [, CHECK])>.
The sequence of octets represented by $octets is decoded
from (loose, not strict) utf8 into a sequence of logical characters.
@@ -519,10 +513,6 @@ Because not all sequences of octets are valid not strict utf8,
it is quite possible for this function to fail.
For CHECK, see L</"Handling Malformed Data">.
-B<WARNING>: do not use this function for data exchange as it can produce
-$string with not strict utf8 representation! For strictly valid UTF-8
-$string representation use C<$string = decode("UTF-8", $octets [, CHECK])>.
-
B<CAVEAT>: the input I<$octets> might be modified in-place depending on
what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be
left unchanged.
@@ -927,6 +917,20 @@ important distinction between C<"UTF-8"> and C<"utf8">.
encode("utf8", "\x{FFFF_FFFF}", 1); # okay
encode("UTF-8", "\x{FFFF_FFFF}", 1); # croaks
+This distinction is also important for decoding. In the following,
+C<$s> stores character U+200000, which exceeds UTF-8's allowed range.
+C<$s> thus stores an invalid Unicode code point:
+
+ $s = decode("utf8", "\xf8\x88\x80\x80\x80");
+
+C<"UTF-8">, by contrast, will either coerce the input to something valid:
+
+ $s = decode("UTF-8", "\xf8\x88\x80\x80\x80"); # U+FFFD
+
+.. or croak:
+
+ decode("UTF-8", "\xf8\x88\x80\x80\x80", FB_CROAK|LEAVE_SRC);
+
In the C<Encode> module, C<"UTF-8"> is actually a canonical name for
C<"utf-8-strict">. That hyphen between the C<"UTF"> and the C<"8"> is
critical; without it, C<Encode> goes "liberal" and (perhaps overly-)permissive:
diff --git a/cpan/Encode/JP/JP.pm b/cpan/Encode/JP/JP.pm
index 4251170c56..09efaaa8c3 100644
--- a/cpan/Encode/JP/JP.pm
+++ b/cpan/Encode/JP/JP.pm
@@ -7,7 +7,7 @@ BEGIN {
use strict;
use warnings;
use Encode;
-our $VERSION = do { my @r = ( q$Revision: 2.4 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
+our $VERSION = do { my @r = ( q$Revision: 2.5 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
use XSLoader;
XSLoader::load( __PACKAGE__, $VERSION );
@@ -50,8 +50,8 @@ supported are as follows.
cp932 /\bwindows-31j$/i Code Page 932
= Shift JIS + MS/IBM vendor mappings
jis0201-raw JIS0201, raw format
- jis0208-raw JIS0201, raw format
- jis0212-raw JIS0201, raw format
+ jis0208-raw JIS0208, raw format
+ jis0212-raw JIS0212, raw format
--------------------------------------------------------------------
=head1 DESCRIPTION
diff --git a/cpan/Encode/Makefile.PL b/cpan/Encode/Makefile.PL
index f9c774845f..f711d6dd4f 100644
--- a/cpan/Encode/Makefile.PL
+++ b/cpan/Encode/Makefile.PL
@@ -1,5 +1,5 @@
#
-# $Id: Makefile.PL,v 2.23 2020/12/02 01:28:17 dankogai Exp dankogai $
+# $Id: Makefile.PL,v 2.23 2020/12/02 01:28:17 dankogai Exp $
#
use 5.007003;
use strict;
diff --git a/cpan/Encode/bin/encguess b/cpan/Encode/bin/encguess
index 19a0673e76..440733eea0 100644
--- a/cpan/Encode/bin/encguess
+++ b/cpan/Encode/bin/encguess
@@ -61,7 +61,7 @@ encguess - guess character encodings of files
=head1 VERSION
-$Id: encguess,v 0.3 2020/12/02 01:28:17 dankogai Exp dankogai $
+$Id: encguess,v 0.3 2020/12/02 01:28:17 dankogai Exp $
=head1 SYNOPSIS
diff --git a/cpan/Encode/t/Encode.t b/cpan/Encode/t/Encode.t
index 0536b4b714..f60f37d596 100644
--- a/cpan/Encode/t/Encode.t
+++ b/cpan/Encode/t/Encode.t
@@ -25,19 +25,10 @@ my @character_set = ('0'..'9', 'A'..'Z', 'a'..'z');
my @source = qw(ascii iso8859-1 cp1250);
my @destiny = qw(cp1047 cp37 posix-bc);
my @ebcdic_sets = qw(cp1047 cp37 posix-bc);
-plan tests => 38+$n*@encodings + 2*@source*@destiny*@character_set + 2*@ebcdic_sets*256 + 6 + 3 + 3*8 + 2;
+plan tests => 38+$n*@encodings + 2*@source*@destiny*@character_set + 2*@ebcdic_sets*256 + 6 + 3*8;
my $str = join('',map(chr($_),0x20..0x7E));
my $cpy = $str;
-is length($str),from_to($cpy,'iso8859-1','Unicode'),"Length Wrong";
-is $cpy,$str,"ASCII mangled by translating from iso8859-1 to Unicode";
-$cpy = $str;
-is from_to($cpy,'Unicode','iso8859-1'),length($str),"Length wrong";
-is $cpy,$str,"ASCII mangled by translating from Unicode to iso8859-1";
-
-$str = join('',map(chr($_),0xa0..0xff));
-$cpy = $str;
-is length($str),from_to($cpy,'iso8859-1','Unicode'),"Length Wrong";
my $sym = Encode->getEncoding('symbol');
my $uni = $sym->decode(encode(ascii => 'a'));