[perl #59908] \x, \0, and \N{} not ok in double-quotish when followed by > \x100

Attached is a patch for this problem. The root cause was that S_scan_const() was not recoding to utf8 under some circumstances when it should be. I also changed it so that in all places, the flag that indicates the output is in utf8 is changed from false to true if and only if the destination is recoded to utf8. One place was skipping this, and then setting it unconditionally later on. In one place in the routine, the routine had code to do the recoding itself. In the other places, it called sv_utf8_upgrade(). I changed it to call the subroutine in all cases. I fixed a bug that would appear only on EBCDIC machines where constants of the form \N{U+....} would have been interpreted as EBCDIC. And in inspecting the code, I realized there were problems with growing the scalar value to fit the input. I cleaned those up. I also added a number of comments to document things I found out, and changed some existing ones to be more accurate. Since no one responded to my request for where to put the test cases, and I couldn't figure out a good place to put them, I added a new test file, t/uni/lex_utf8.t.
author: karl williamson <public@khwilliamson.com> 2008-12-08 21:59:05 -0700
committer: Rafael Garcia-Suarez <rgarciasuarez@gmail.com> 2009-01-15 16:59:41 +0100
commit: 77a135fea310715f77af2560dd03830df874b5fa (patch)
tree: 4a036303d45e2123a8c5b3552a472fb00d3c5f0a /t/uni/lex_utf8.t
parent: e23621c75d209a8ee0c88579b09beccd0c4a5610 (diff)
download: perl-77a135fea310715f77af2560dd03830df874b5fa.tar.gz
1 files changed, 44 insertions, 0 deletions
diff --git a/t/uni/lex_utf8.t b/t/uni/lex_utf8.t
new file mode 100644
index 0000000000..c7d1778bef
--- /dev/null
+++ b/t/uni/lex_utf8.t
@@ -0,0 +1,44 @@
+#
+# This script is written intentionally in UTF-8
+
+BEGIN {
+    if (ord("A") == 193) {
+        print "1..0 # Skip: EBCDIC\n";
+        exit 0;
+    }
+    $| = 1;
+}
+
+use strict;
+
+use Test::More tests => 10;
+use charnames ':full';
+
+use utf8;
+
+my $A_with_ogonek = "Ą";
+my $micro_sign = "µ";
+my $hex_first = "a\x{A2}Ą";
+my $hex_last = "aĄ\x{A2}";
+my $name_first = "b\N{MICRO SIGN}Ɓ";
+my $name_last = "bƁ\N{MICRO SIGN}";
+my $uname_first = "b\N{U+00B5}Ɓ";
+my $uname_last = "bƁ\N{U+00B5}";
+my $octal_first = "c\377Ć";
+my $octal_last = "cĆ\377";
+
+do {
+	use bytes;
+	is((join "", unpack("C*", $A_with_ogonek)), "196" . "132", 'single char above 0x100');
+	is((join "", unpack("C*", $micro_sign)), "194" . "181", 'single char in 0x80 .. 0xFF');
+	is((join "", unpack("C*", $hex_first)), "97" . "194" . "162" . "196" . "132", 'a . \x{A2} . char above 0x100');
+	is((join "", unpack("C*", $hex_last)), "97" . "196" . "132" . "194" . "162", 'a . char above 0x100 . \x{A2}');
+	is((join "", unpack("C*", $name_first)), "98" . "194" . "181" . "198" . "129", 'b . \N{MICRO SIGN} . char above 0x100');
+	is((join "", unpack("C*", $name_last)), "98" . "198" . "129" . "194" . "181", 'b . char above 0x100 . \N{MICRO SIGN}');
+	is((join "", unpack("C*", $uname_first)), "98" . "194" . "181" . "198" . "129", 'b . \N{U+00B5} . char above 0x100');
+	is((join "", unpack("C*", $uname_last)), "98" . "198" . "129" . "194" . "181", 'b . char above 0x100 . \N{U+00B5}');
+	is((join "", unpack("C*", $octal_first)), "99" . "195" . "191" . "196" . "134", 'c . \377 . char above 0x100');
+	is((join "", unpack("C*", $octal_last)), "99" . "196" . "134" . "195" . "191", 'c . char above 0x100 . \377');
+}
+__END__
+
author	karl williamson <public@khwilliamson.com>	2008-12-08 21:59:05 -0700
committer	Rafael Garcia-Suarez <rgarciasuarez@gmail.com>	2009-01-15 16:59:41 +0100
commit	77a135fea310715f77af2560dd03830df874b5fa (patch)
tree	4a036303d45e2123a8c5b3552a472fb00d3c5f0a /t/uni/lex_utf8.t
parent	e23621c75d209a8ee0c88579b09beccd0c4a5610 (diff)
download	perl-77a135fea310715f77af2560dd03830df874b5fa.tar.gz