Fix 'use encoding' I/O for code points 0x80..0xFF;

code changes from Inaba Hiroto; test tweaks by jhi. p4raw-id: //depot/perl@18496
author: Jarkko Hietaniemi <jhi@iki.fi> 2003-01-16 20:36:23 +0000
committer: Jarkko Hietaniemi <jhi@iki.fi> 2003-01-16 20:36:23 +0000
commit: 88632417a970dff8f92718b0800b1aa1400cb4ae (patch)
tree: ea66b4eab7dfc6a1a4ebdd064c01f1f0095bcfe1
parent: caf774a6b47c93401a4581fab332b04b560c89d5 (diff)
download: perl-88632417a970dff8f92718b0800b1aa1400cb4ae.tar.gz
9 files changed, 91 insertions, 10 deletions
diff --git a/MANIFEST b/MANIFEST
index 9ebb6a64b5..be7882c0f1 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -259,6 +259,7 @@ ext/Encode/t/big5-hkscs.utf	test data
 ext/Encode/t/CJKT.t		test script
 ext/Encode/t/Encode.t		test script
 ext/Encode/t/Encoder.t		test script
+ext/Encode/t/enc_eucjp.t	test script
 ext/Encode/t/enc_utf8.t		test script
 ext/Encode/t/encoding.t		test script
 ext/Encode/t/fallback.t		test script
diff --git a/doio.c b/doio.c
index e23a2ca7d3..3ae3764d58 100644
--- a/doio.c
+++ b/doio.c
@@ -1268,7 +1268,8 @@ Perl_do_print(pTHX_ register SV *sv, PerlIO *fp)
     default:
 	if (PerlIO_isutf8(fp)) {
 	    if (!SvUTF8(sv))
-		sv_utf8_upgrade(sv = sv_mortalcopy(sv));
+		sv_utf8_upgrade_flags(sv = sv_mortalcopy(sv),
+				      SV_GMAGIC|SV_UTF8_NO_ENCODING);
 	}
 	else if (DO_UTF8(sv)) {
 	    if (!sv_utf8_downgrade((sv = sv_mortalcopy(sv)), TRUE)
diff --git a/ext/Encode/encoding.pm b/ext/Encode/encoding.pm
index e8aa7374d5..1a43790b5c 100644
--- a/ext/Encode/encoding.pm
+++ b/ext/Encode/encoding.pm
@@ -29,8 +29,7 @@ sub import {
 	Carp::croak("Unknown encoding '$name'");
     }
     unless ($arg{Filter}) {
-	${^ENCODING} = $enc # this is all you need, actually.
-	    unless $name =~ /^(?:utf-?(?:8|16|32)|ucs-?(?:2|4))(?:[bl]e)?$/i;
+	${^ENCODING} = $enc;
 	$HAS_PERLIO or return 1;
 	for my $h (qw(STDIN STDOUT)){
 	    if ($arg{$h}){
diff --git a/ext/Encode/t/enc_eucjp.t b/ext/Encode/t/enc_eucjp.t
new file mode 100644
index 0000000000..019b42606f
--- /dev/null
+++ b/ext/Encode/t/enc_eucjp.t
@@ -0,0 +1,66 @@
+# This is the twin of enc_utf8.t, the only difference is that
+# this has "use encoding 'euc-jp'".
+
+BEGIN {
+    require Config; import Config;
+    if ($Config{'extensions'} !~ /\bEncode\b/) {
+      print "1..0 # Skip: Encode was not built\n";
+      exit 0;
+    }
+    unless (find PerlIO::Layer 'perlio') {
+	print "1..0 # Skip: PerlIO was not built\n";
+	exit 0;
+    }
+    if (ord("A") == 193) {
+	print "1..0 # encoding pragma does not support EBCDIC platforms\n";
+	exit(0);
+    }
+}
+
+use encoding 'euc-jp';
+
+my @c = (127, 128, 255, 256);
+
+print "1.." . (scalar @c + 1) . "\n";
+
+my @f;
+
+for my $i (0..$#c) {
+  push @f, "f$i";
+  open(F, ">f$i") or die "$0: failed to open 'f$i' for writing: $!";
+  binmode(F, ":utf8");
+  print F chr($c[$i]);
+  close F;
+}
+
+my $t = 1;
+
+for my $i (0..$#c) {
+  open(F, "<f$i") or die "$0: failed to open 'f$i' for reading: $!";
+  binmode(F, ":utf8");
+  my $c = <F>;
+  my $o = ord($c);
+  print $o == $c[$i] ? "ok $t - utf8 I/O $c[$i]\n" : "not ok $t - utf8 I/O $c[$i]: $o != $c[$i]\n";
+  $t++;
+}
+
+my $f = "f" . @f;
+
+push @f, $f;
+open(F, ">$f") or die "$0: failed to open '$f' for writing: $!";
+binmode(F, ":raw"); # Output raw bytes.
+print F chr(128); # Output illegal UTF-8.
+close F;
+open(F, $f) or die "$0: failed to open '$f' for reading: $!";
+binmode(F, ":encoding(utf-8)");
+{
+	local $^W = 1;
+	local $SIG{__WARN__} = sub { $a = shift };
+	eval { <F> }; # This should get caught.
+}
+print $a =~ qr{^utf8 "\\x80" does not map to Unicode} ?
+  "ok $t - illegal utf8 input\n" : "not ok $t - illegal utf8 input: a = " . unpack("H*", $a) . "\n";
+
+END {
+  1 while unlink @f;
+}
diff --git a/ext/Encode/t/enc_utf8.t b/ext/Encode/t/enc_utf8.t
index 20eb288400..6271fe607f 100644
--- a/ext/Encode/t/enc_utf8.t
+++ b/ext/Encode/t/enc_utf8.t
@@ -1,3 +1,6 @@
+# This is the twin of enc_eucjp.t, the only difference is that
+# this has "use encoding 'utf8'".
+
 BEGIN {
     require Config; import Config;
     if ($Config{'extensions'} !~ /\bEncode\b/) {
@@ -37,11 +40,11 @@ for my $i (0..$#c) {
   binmode(F, ":utf8");
   my $c = <F>;
   my $o = ord($c);
-  print $o == $c[$i] ? "ok $t\n" : "not ok $t # $o != $c[$i]\n";
+  print $o == $c[$i] ? "ok $t - utf8 I/O $c[$i]\n" : "not ok $t - utf8 I/O $c[$$i]: $o != $c[$i]\n";
   $t++;
 }
 
-my $f = "f4";
+my $f = "f" . @f;
 
 push @f, $f;
 open(F, ">$f") or die "$0: failed to open '$f' for writing: $!";
@@ -56,7 +59,7 @@ binmode(F, ":encoding(utf-8)");
 	eval { <F> }; # This should get caught.
 }
 print $a =~ qr{^utf8 "\\x80" does not map to Unicode} ?
-  "ok $t\n" : "not ok $t: $a\n";
+  "ok $t - illegal utf8 input\n" : "not ok $t - illegal utf8 input: a = " . unpack("H*", $a) . "\n";
 
 END {
   1 while unlink @f;
diff --git a/pp.c b/pp.c
index c78246e7de..c9d1dc67f0 100644
--- a/pp.c
+++ b/pp.c
@@ -3278,8 +3278,19 @@ PP(pp_chr)
     *tmps++ = (char)value;
     *tmps = '\0';
     (void)SvPOK_only(TARG);
-    if (PL_encoding)
+    if (PL_encoding && !IN_BYTES) {
         sv_recode_to_utf8(TARG, PL_encoding);
+	tmps = SvPVX(TARG);
+	if (SvCUR(TARG) == 0 || !is_utf8_string((U8*)tmps, SvCUR(TARG)) ||
+	    memEQ(tmps, "\xef\xbf\xbd\0", 4)) {
+	    SvGROW(TARG,3);
+	    SvCUR_set(TARG, 2);
+	    *tmps++ = (U8)UTF8_EIGHT_BIT_HI(value);
+	    *tmps++ = (U8)UTF8_EIGHT_BIT_LO(value);
+	    *tmps = '\0';
+	    SvUTF8_on(TARG);
+	}
+    }
     XPUSHs(TARG);
     RETURN;
 }
diff --git a/sv.c b/sv.c
index ffa71e1a68..33e22025b4 100644
--- a/sv.c
+++ b/sv.c
@@ -3395,7 +3395,7 @@ Perl_sv_utf8_upgrade_flags(pTHX_ register SV *sv, I32 flags)
         sv_force_normal_flags(sv, 0);
     }
 
-    if (PL_encoding)
+    if (PL_encoding && !(flags & SV_UTF8_NO_ENCODING))
         sv_recode_to_utf8(sv, PL_encoding);
     else { /* Assume Latin-1/EBCDIC */
 	 /* This function could be much more efficient if we
diff --git a/sv.h b/sv.h
index 7c5e6dca06..598397e31c 100644
--- a/sv.h
+++ b/sv.h
@@ -1030,6 +1030,7 @@ otherwise.
 #define SV_IMMEDIATE_UNREF	1
 #define SV_GMAGIC		2
 #define SV_COW_DROP_PV		4
+#define SV_UTF8_NO_ENCODING	8
 
 /* We are about to replace the SV's current value. So if it's copy on write
    we need to normalise it. Use the SV_COW_DROP_PV flag hint to say that
diff --git a/t/uni/tr_utf8.t b/t/uni/tr_utf8.t
index ee95c36d2b..12f3516fc2 100644
--- a/t/uni/tr_utf8.t
+++ b/t/uni/tr_utf8.t
@@ -62,7 +62,6 @@ is($str, $hiragana, "s/// # hiragana -> katakana");
   # [perl 16843]
   my $line = 'abcdefghijklmnopqrstuvwxyz$0123456789';
   $line =~ tr/bcdeghijklmnprstvwxyz$02578/בצדעגהיײקלמנפּרסטװשכיזשױתײחא/;
-#  is($line, "aבצדעfגהיײקלמנoפqּרסuטװשכיזש1ױ34ת6ײח9", "[perl #16843]");
-   ok(1, "TODO: Encode 1.84 broke the test for perl #16843");
+  is($line, "aבצדעfגהיײקלמנoפqּרסuטװשכיזש1ױ34ת6ײח9", "[perl #16843]");
 }
 __END__
author	Jarkko Hietaniemi <jhi@iki.fi>	2003-01-16 20:36:23 +0000
committer	Jarkko Hietaniemi <jhi@iki.fi>	2003-01-16 20:36:23 +0000
commit	88632417a970dff8f92718b0800b1aa1400cb4ae (patch)
tree	ea66b4eab7dfc6a1a4ebdd064c01f1f0095bcfe1
parent	caf774a6b47c93401a4581fab332b04b560c89d5 (diff)
download	perl-88632417a970dff8f92718b0800b1aa1400cb4ae.tar.gz