diff options
-rw-r--r-- | MANIFEST | 1 | ||||
-rw-r--r-- | doio.c | 3 | ||||
-rw-r--r-- | ext/Encode/encoding.pm | 3 | ||||
-rw-r--r-- | ext/Encode/t/enc_eucjp.t | 66 | ||||
-rw-r--r-- | ext/Encode/t/enc_utf8.t | 9 | ||||
-rw-r--r-- | pp.c | 13 | ||||
-rw-r--r-- | sv.c | 2 | ||||
-rw-r--r-- | sv.h | 1 | ||||
-rw-r--r-- | t/uni/tr_utf8.t | 3 |
9 files changed, 91 insertions, 10 deletions
@@ -259,6 +259,7 @@ ext/Encode/t/big5-hkscs.utf test data ext/Encode/t/CJKT.t test script ext/Encode/t/Encode.t test script ext/Encode/t/Encoder.t test script +ext/Encode/t/enc_eucjp.t test script ext/Encode/t/enc_utf8.t test script ext/Encode/t/encoding.t test script ext/Encode/t/fallback.t test script @@ -1268,7 +1268,8 @@ Perl_do_print(pTHX_ register SV *sv, PerlIO *fp) default: if (PerlIO_isutf8(fp)) { if (!SvUTF8(sv)) - sv_utf8_upgrade(sv = sv_mortalcopy(sv)); + sv_utf8_upgrade_flags(sv = sv_mortalcopy(sv), + SV_GMAGIC|SV_UTF8_NO_ENCODING); } else if (DO_UTF8(sv)) { if (!sv_utf8_downgrade((sv = sv_mortalcopy(sv)), TRUE) diff --git a/ext/Encode/encoding.pm b/ext/Encode/encoding.pm index e8aa7374d5..1a43790b5c 100644 --- a/ext/Encode/encoding.pm +++ b/ext/Encode/encoding.pm @@ -29,8 +29,7 @@ sub import { Carp::croak("Unknown encoding '$name'"); } unless ($arg{Filter}) { - ${^ENCODING} = $enc # this is all you need, actually. - unless $name =~ /^(?:utf-?(?:8|16|32)|ucs-?(?:2|4))(?:[bl]e)?$/i; + ${^ENCODING} = $enc; $HAS_PERLIO or return 1; for my $h (qw(STDIN STDOUT)){ if ($arg{$h}){ diff --git a/ext/Encode/t/enc_eucjp.t b/ext/Encode/t/enc_eucjp.t new file mode 100644 index 0000000000..019b42606f --- /dev/null +++ b/ext/Encode/t/enc_eucjp.t @@ -0,0 +1,66 @@ +# This is the twin of enc_utf8.t, the only difference is that +# this has "use encoding 'euc-jp'". + +BEGIN { + require Config; import Config; + if ($Config{'extensions'} !~ /\bEncode\b/) { + print "1..0 # Skip: Encode was not built\n"; + exit 0; + } + unless (find PerlIO::Layer 'perlio') { + print "1..0 # Skip: PerlIO was not built\n"; + exit 0; + } + if (ord("A") == 193) { + print "1..0 # encoding pragma does not support EBCDIC platforms\n"; + exit(0); + } +} + +use encoding 'euc-jp'; + +my @c = (127, 128, 255, 256); + +print "1.." . (scalar @c + 1) . "\n"; + +my @f; + +for my $i (0..$#c) { + push @f, "f$i"; + open(F, ">f$i") or die "$0: failed to open 'f$i' for writing: $!"; + binmode(F, ":utf8"); + print F chr($c[$i]); + close F; +} + +my $t = 1; + +for my $i (0..$#c) { + open(F, "<f$i") or die "$0: failed to open 'f$i' for reading: $!"; + binmode(F, ":utf8"); + my $c = <F>; + my $o = ord($c); + print $o == $c[$i] ? "ok $t - utf8 I/O $c[$i]\n" : "not ok $t - utf8 I/O $c[$i]: $o != $c[$i]\n"; + $t++; +} + +my $f = "f" . @f; + +push @f, $f; +open(F, ">$f") or die "$0: failed to open '$f' for writing: $!"; +binmode(F, ":raw"); # Output raw bytes. +print F chr(128); # Output illegal UTF-8. +close F; +open(F, $f) or die "$0: failed to open '$f' for reading: $!"; +binmode(F, ":encoding(utf-8)"); +{ + local $^W = 1; + local $SIG{__WARN__} = sub { $a = shift }; + eval { <F> }; # This should get caught. +} +print $a =~ qr{^utf8 "\\x80" does not map to Unicode} ? + "ok $t - illegal utf8 input\n" : "not ok $t - illegal utf8 input: a = " . unpack("H*", $a) . "\n"; + +END { + 1 while unlink @f; +} diff --git a/ext/Encode/t/enc_utf8.t b/ext/Encode/t/enc_utf8.t index 20eb288400..6271fe607f 100644 --- a/ext/Encode/t/enc_utf8.t +++ b/ext/Encode/t/enc_utf8.t @@ -1,3 +1,6 @@ +# This is the twin of enc_eucjp.t, the only difference is that +# this has "use encoding 'utf8'". + BEGIN { require Config; import Config; if ($Config{'extensions'} !~ /\bEncode\b/) { @@ -37,11 +40,11 @@ for my $i (0..$#c) { binmode(F, ":utf8"); my $c = <F>; my $o = ord($c); - print $o == $c[$i] ? "ok $t\n" : "not ok $t # $o != $c[$i]\n"; + print $o == $c[$i] ? "ok $t - utf8 I/O $c[$i]\n" : "not ok $t - utf8 I/O $c[$$i]: $o != $c[$i]\n"; $t++; } -my $f = "f4"; +my $f = "f" . @f; push @f, $f; open(F, ">$f") or die "$0: failed to open '$f' for writing: $!"; @@ -56,7 +59,7 @@ binmode(F, ":encoding(utf-8)"); eval { <F> }; # This should get caught. } print $a =~ qr{^utf8 "\\x80" does not map to Unicode} ? - "ok $t\n" : "not ok $t: $a\n"; + "ok $t - illegal utf8 input\n" : "not ok $t - illegal utf8 input: a = " . unpack("H*", $a) . "\n"; END { 1 while unlink @f; @@ -3278,8 +3278,19 @@ PP(pp_chr) *tmps++ = (char)value; *tmps = '\0'; (void)SvPOK_only(TARG); - if (PL_encoding) + if (PL_encoding && !IN_BYTES) { sv_recode_to_utf8(TARG, PL_encoding); + tmps = SvPVX(TARG); + if (SvCUR(TARG) == 0 || !is_utf8_string((U8*)tmps, SvCUR(TARG)) || + memEQ(tmps, "\xef\xbf\xbd\0", 4)) { + SvGROW(TARG,3); + SvCUR_set(TARG, 2); + *tmps++ = (U8)UTF8_EIGHT_BIT_HI(value); + *tmps++ = (U8)UTF8_EIGHT_BIT_LO(value); + *tmps = '\0'; + SvUTF8_on(TARG); + } + } XPUSHs(TARG); RETURN; } @@ -3395,7 +3395,7 @@ Perl_sv_utf8_upgrade_flags(pTHX_ register SV *sv, I32 flags) sv_force_normal_flags(sv, 0); } - if (PL_encoding) + if (PL_encoding && !(flags & SV_UTF8_NO_ENCODING)) sv_recode_to_utf8(sv, PL_encoding); else { /* Assume Latin-1/EBCDIC */ /* This function could be much more efficient if we @@ -1030,6 +1030,7 @@ otherwise. #define SV_IMMEDIATE_UNREF 1 #define SV_GMAGIC 2 #define SV_COW_DROP_PV 4 +#define SV_UTF8_NO_ENCODING 8 /* We are about to replace the SV's current value. So if it's copy on write we need to normalise it. Use the SV_COW_DROP_PV flag hint to say that diff --git a/t/uni/tr_utf8.t b/t/uni/tr_utf8.t index ee95c36d2b..12f3516fc2 100644 --- a/t/uni/tr_utf8.t +++ b/t/uni/tr_utf8.t @@ -62,7 +62,6 @@ is($str, $hiragana, "s/// # hiragana -> katakana"); # [perl 16843] my $line = 'abcdefghijklmnopqrstuvwxyz$0123456789'; $line =~ tr/bcdeghijklmnprstvwxyz$02578/בצדעגהיײקלמנפּרסטװשכיזשױתײחא/; -# is($line, "aבצדעfגהיײקלמנoפqּרסuטװשכיזש1ױ34ת6ײח9", "[perl #16843]"); - ok(1, "TODO: Encode 1.84 broke the test for perl #16843"); + is($line, "aבצדעfגהיײקלמנoפqּרסuטװשכיזש1ױ34ת6ײח9", "[perl #16843]"); } __END__ |