Add details to UTF-8 malformation error messages

I've long been unsatisfied with the information contained in the error/warning messages raised when some input is malformed UTF-8, but have been reluctant to change the text in case some one is relying on it. One reason that someone might be parsing the messages is that there has been no convenient way to otherwise pin down what the exact malformation might be. A few commits from now will add a facility to get the type of malformation unambiguously. This will be a better mechanism to use for those rare modules that need to know what's the exact malformation. So, I will fix and issue pull requests for any module broken by this commit. The messages are changed by now dumping (in \xXY format) the bytes that make up the malformed character, and extra details are added in most cases. Messages about overlongs now display the code point they evaluate to and what the shortest UTF-8 sequence for generating that code point is. Messages about overflowing now just display that it overflows, since the entire byte sequence is now dumped. The previous message displayed just the byte which was being processed where overflow was detected, but that information is not at all meaningfull.
author: Karl Williamson <khw@cpan.org> 2016-09-28 15:05:17 -0600
committer: Karl Williamson <khw@cpan.org> 2016-10-13 11:18:11 -0600
commit: 7cf8d05d1e856f3bd3a392b3ccea008f1c1eb743 (patch)
tree: 3387705d72ad140eb4ab6412aa70adb94afc6bb5
parent: 806547a7dc29226b6a06672e1d42fb136e766510 (diff)
download: perl-7cf8d05d1e856f3bd3a392b3ccea008f1c1eb743.tar.gz
12 files changed, 164 insertions, 64 deletions
diff --git a/embed.fnc b/embed.fnc
index 43a17deee0..4e60cd2a49 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1674,7 +1674,11 @@ ApdD	|UV	|to_utf8_case	|NN const U8 *p					\
 				NULLOK const char *special
 #if defined(PERL_IN_UTF8_C)
 sMR	|char *	|unexpected_non_continuation_text			\
-		|NN const U8 * const s|const STRLEN len
+		|NN const U8 * const s					\
+		|const STRLEN print_len					\
+		|const STRLEN non_cont_byte_pos				\
+		|const STRLEN expect_len
+sM	|char *	|_byte_dump_string|NN const U8 * s|const STRLEN len
 s	|UV	|_to_utf8_case  |const UV uv1					\
 				|NN const U8 *p					\
 				|NN U8* ustrp					\
diff --git a/embed.h b/embed.h
index 6045ec75d2..d9c270920e 100644
--- a/embed.h
+++ b/embed.h
@@ -1821,6 +1821,7 @@
 #define isa_lookup(a,b,c,d)	S_isa_lookup(aTHX_ a,b,c,d)
 #  endif
 #  if defined(PERL_IN_UTF8_C)
+#define _byte_dump_string(a,b)	S__byte_dump_string(aTHX_ a,b)
 #define _to_utf8_case(a,b,c,d,e,f,g)	S__to_utf8_case(aTHX_ a,b,c,d,e,f,g)
 #define check_locale_boundary_crossing(a,b,c,d)	S_check_locale_boundary_crossing(aTHX_ a,b,c,d)
 #define is_utf8_common(a,b,c,d)	S_is_utf8_common(aTHX_ a,b,c,d)
@@ -1828,7 +1829,7 @@
 #define swash_scan_list_line(a,b,c,d,e,f,g)	S_swash_scan_list_line(aTHX_ a,b,c,d,e,f,g)
 #define swatch_get(a,b,c)	S_swatch_get(aTHX_ a,b,c)
 #define to_lower_latin1		S_to_lower_latin1
-#define unexpected_non_continuation_text(a,b)	S_unexpected_non_continuation_text(aTHX_ a,b)
+#define unexpected_non_continuation_text(a,b,c,d)	S_unexpected_non_continuation_text(aTHX_ a,b,c,d)
 #  endif
 #  if defined(PERL_IN_UTF8_C) || defined(PERL_IN_PP_C)
 #define _to_upper_title_latin1(a,b,c,d)	Perl__to_upper_title_latin1(aTHX_ a,b,c,d)
diff --git a/ext/XS-APItest/t/utf8.t b/ext/XS-APItest/t/utf8.t
index 32f60e0d1d..7a951a011a 100644
--- a/ext/XS-APItest/t/utf8.t
+++ b/ext/XS-APItest/t/utf8.t
@@ -1007,7 +1007,7 @@ my @malformations = (
         $UTF8_ALLOW_LONG,
         0,   # NUL
         2,
-        qr/2 bytes, need 1/
+        qr/overlong/
     ],
     [ "overlong malformation, highest 2-byte",
         (isASCII) ? "\xc1\xbf" : I8_to_native("\xc4\xbf"),
@@ -1015,7 +1015,7 @@ my @malformations = (
         $UTF8_ALLOW_LONG,
         (isASCII) ? 0x7F : utf8::unicode_to_native(0xBF),
         2,
-        qr/2 bytes, need 1/
+        qr/overlong/
     ],
     [ "overlong malformation, lowest 3-byte",
         (isASCII) ? "\xe0\x80\x80" : I8_to_native("\xe0\xa0\xa0"),
@@ -1023,7 +1023,7 @@ my @malformations = (
         $UTF8_ALLOW_LONG,
         0,   # NUL
         3,
-        qr/3 bytes, need 1/
+        qr/overlong/
     ],
     [ "overlong malformation, highest 3-byte",
         (isASCII) ? "\xe0\x9f\xbf" : I8_to_native("\xe0\xbf\xbf"),
@@ -1031,7 +1031,7 @@ my @malformations = (
         $UTF8_ALLOW_LONG,
         (isASCII) ? 0x7FF : 0x3FF,
         3,
-        qr/3 bytes, need 2/
+        qr/overlong/
     ],
     [ "overlong malformation, lowest 4-byte",
         (isASCII) ? "\xf0\x80\x80\x80" : I8_to_native("\xf0\xa0\xa0\xa0"),
@@ -1039,7 +1039,7 @@ my @malformations = (
         $UTF8_ALLOW_LONG,
         0,   # NUL
         4,
-        qr/4 bytes, need 1/
+        qr/overlong/
     ],
     [ "overlong malformation, highest 4-byte",
         (isASCII) ? "\xf0\x8F\xbf\xbf" : I8_to_native("\xf0\xaf\xbf\xbf"),
@@ -1047,7 +1047,7 @@ my @malformations = (
         $UTF8_ALLOW_LONG,
         (isASCII) ? 0xFFFF : 0x3FFF,
         4,
-        qr/4 bytes, need 3/
+        qr/overlong/
     ],
     [ "overlong malformation, lowest 5-byte",
         (isASCII)
@@ -1057,7 +1057,7 @@ my @malformations = (
         $UTF8_ALLOW_LONG,
         0,   # NUL
         5,
-        qr/5 bytes, need 1/
+        qr/overlong/
     ],
     [ "overlong malformation, highest 5-byte",
         (isASCII)
@@ -1067,7 +1067,7 @@ my @malformations = (
         $UTF8_ALLOW_LONG,
         (isASCII) ? 0x1FFFFF : 0x3FFFF,
         5,
-        qr/5 bytes, need 4/
+        qr/overlong/
     ],
     [ "overlong malformation, lowest 6-byte",
         (isASCII)
@@ -1077,7 +1077,7 @@ my @malformations = (
         $UTF8_ALLOW_LONG,
         0,   # NUL
         6,
-        qr/6 bytes, need 1/
+        qr/overlong/
     ],
     [ "overlong malformation, highest 6-byte",
         (isASCII)
@@ -1087,7 +1087,7 @@ my @malformations = (
         $UTF8_ALLOW_LONG,
         (isASCII) ? 0x3FFFFFF : 0x3FFFFF,
         6,
-        qr/6 bytes, need 5/
+        qr/overlong/
     ],
     [ "overlong malformation, lowest 7-byte",
         (isASCII)
@@ -1097,7 +1097,7 @@ my @malformations = (
         $UTF8_ALLOW_LONG,
         0,   # NUL
         7,
-        qr/7 bytes, need 1/
+        qr/overlong/
     ],
     [ "overlong malformation, highest 7-byte",
         (isASCII)
@@ -1107,7 +1107,7 @@ my @malformations = (
         $UTF8_ALLOW_LONG,
         (isASCII) ? 0x7FFFFFFF : 0x3FFFFFF,
         7,
-        qr/7 bytes, need 6/
+        qr/overlong/
     ],
 );
 
@@ -1120,7 +1120,7 @@ if (isASCII && ! $is64bit) {    # 32-bit ASCII platform
             0,  # There is no way to allow this malformation
             $REPLACEMENT,
             7,
-            qr/overflow/
+            qr/overflows/
         ],
         [ "overflow malformation, can tell on first byte",
             "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80",
@@ -1128,7 +1128,7 @@ if (isASCII && ! $is64bit) {    # 32-bit ASCII platform
             0,  # There is no way to allow this malformation
             $REPLACEMENT,
             13,
-            qr/overflow/
+            qr/overflows/
         ];
 }
 else {
@@ -1146,7 +1146,7 @@ else {
             $UTF8_ALLOW_LONG,
             0,   # NUL
             (isASCII) ? 13 : 14,
-            qr/1[34] bytes, need 1/,    # 1[34] to work on either ASCII or EBCDIC
+            qr/overlong/,
         ],
         [ "overlong malformation, highest max-byte",
             (isASCII)    # 2**36-1 on ASCII; 2**30-1 on EBCDIC
@@ -1156,7 +1156,7 @@ else {
             $UTF8_ALLOW_LONG,
             (isASCII) ? 0xFFFFFFFFF : 0x3FFFFFFF,
             (isASCII) ? 13 : 14,
-            qr/1[34] bytes, need 7/,
+            qr/overlong/,
         ];
 
     if (! $is64bit) {   # 32-bit EBCDIC
@@ -1167,7 +1167,7 @@ else {
             0,  # There is no way to allow this malformation
             $REPLACEMENT,
             14,
-            qr/overflow/
+            qr/overflows/
         ];
     }
     else {  # 64-bit
@@ -1180,7 +1180,7 @@ else {
                 0,  # There is no way to allow this malformation
                 $REPLACEMENT,
                 (isASCII) ? 13 : 14,
-                qr/overflow/
+                qr/overflows/
             ];
     }
 }
@@ -1656,7 +1656,7 @@ my @tests = (
         "$UTF8_DISALLOW_ABOVE_31_BIT",
         'utf8', 0,
         (! isASCII) ? 14 : ($is64bit) ? 13 : 7,
-        qr/overflow at byte .*, after start byte 0xf/
+        qr/overflows/
     ],
 );
 
diff --git a/lib/utf8.t b/lib/utf8.t
index c9dbb6ee59..0530faf5b3 100644
--- a/lib/utf8.t
+++ b/lib/utf8.t
@@ -165,7 +165,7 @@ no utf8; # Ironic, no?
     use utf8; %a = ("\xE1\xA0"=>"sterling");
     print 'start'; printf '%x,', ord \$_ foreach keys %a; print "end\n";
 BANG
-	      qr/^Malformed UTF-8 character \(\d bytes?, need \d, .+\).*start\d+,end$/sm
+	      qr/^Malformed UTF-8 character: .*? \(too short; got \d bytes?, need \d\).*start\d+,end$/sm
 	     ],
             );
     foreach (@tests) {
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index 2c3986b14f..dbba9efcb5 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -204,7 +204,9 @@ XXX Changes (i.e. rewording) of diagnostic messages go here
 
 =item *
 
-XXX Describe change here
+Details as to the exact problem have been added to the diagnostics that
+occur when malformed UTF-8 is encountered when trying to convert to a
+code point.
 
 =back
 
diff --git a/pod/perldiag.pod b/pod/perldiag.pod
index 8348663e1d..d9f807c733 100644
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -3346,8 +3346,11 @@ so the signature was instead interpreted as a bad prototype.
 
 =item Malformed UTF-8 character (%s)
 
-(S utf8)(F) Perl detected a string that didn't comply with UTF-8
-encoding rules, even though it had the UTF8 flag on.
+(S utf8)(F) Perl detected a string that should be UTF-8, but didn't
+comply with UTF-8 encoding rules, or represents a code point whose
+ordinal integer value doesn't fit into the word size of the current
+platform (overflows).  Details as to the exact malformation are given in
+the variable, C<%s>, part of the message.
 
 One possible cause is that you set the UTF8 flag yourself for data that
 you thought to be in UTF-8 but it wasn't (it was for example legacy
diff --git a/proto.h b/proto.h
index 6491d1f3cd..78519f6ac9 100644
--- a/proto.h
+++ b/proto.h
@@ -5572,6 +5572,9 @@ STATIC bool	S_isa_lookup(pTHX_ HV *stash, const char * const name, STRLEN len, U
 	assert(stash); assert(name)
 #endif
 #if defined(PERL_IN_UTF8_C)
+STATIC char *	S__byte_dump_string(pTHX_ const U8 * s, const STRLEN len);
+#define PERL_ARGS_ASSERT__BYTE_DUMP_STRING	\
+	assert(s)
 STATIC UV	S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, const char *normal, const char *special);
 #define PERL_ARGS_ASSERT__TO_UTF8_CASE	\
 	assert(p); assert(ustrp); assert(swashp); assert(normal)
@@ -5604,7 +5607,7 @@ STATIC SV*	S_swatch_get(pTHX_ SV* swash, UV start, UV span)
 STATIC U8	S_to_lower_latin1(const U8 c, U8 *p, STRLEN *lenp)
 			__attribute__warn_unused_result__;
 
-STATIC char *	S_unexpected_non_continuation_text(pTHX_ const U8 * const s, const STRLEN len)
+STATIC char *	S_unexpected_non_continuation_text(pTHX_ const U8 * const s, const STRLEN print_len, const STRLEN non_cont_byte_pos, const STRLEN expect_len)
 			__attribute__warn_unused_result__;
 #define PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT	\
 	assert(s)
diff --git a/t/io/utf8.t b/t/io/utf8.t
index 282b8e78bd..3909dc854f 100644
--- a/t/io/utf8.t
+++ b/t/io/utf8.t
@@ -174,7 +174,7 @@ SKIP: {
 	local $SIG{__WARN__} = sub { push @warnings, $_[0]; };
 	eval { sprintf "%vd\n", $x };
 	is (scalar @warnings, 1);
-	like ($warnings[0], qr/Malformed UTF-8 character \(unexpected continuation byte 0x82, with no preceding start byte/);
+	like ($warnings[0], qr/Malformed UTF-8 character: \\x82 \(unexpected continuation byte 0x82, with no preceding start byte/);
     }
 }
 
diff --git a/t/lib/warnings/utf8 b/t/lib/warnings/utf8
index 4263c04958..1c782e7235 100644
--- a/t/lib/warnings/utf8
+++ b/t/lib/warnings/utf8
@@ -30,8 +30,8 @@ my $a = "sn�storm" ;
     my $a = "sn�storm";
 }
 EXPECT
-Malformed UTF-8 character (unexpected non-continuation byte 0x73, immediately after start byte 0xf8) at - line 9.
-Malformed UTF-8 character (unexpected non-continuation byte 0x73, immediately after start byte 0xf8) at - line 14.
+Malformed UTF-8 character: \xf8\x73\x74\x6f\x72 (unexpected non-continuation byte 0x73, immediately after start byte 0xf8; need 5 bytes, got 1) at - line 9.
+Malformed UTF-8 character: \xf8\x73\x74\x6f\x72 (unexpected non-continuation byte 0x73, immediately after start byte 0xf8; need 5 bytes, got 1) at - line 14.
 ########
 use warnings 'utf8';
 my $d7ff  = uc(chr(0xD7FF));
@@ -766,4 +766,4 @@ BEGIN{
     }
 {};$^H=2**400}�
 EXPECT
-Malformed UTF-8 character (unexpected non-continuation byte 0x0a, immediately after start byte 0xc2) at - line 6.
+Malformed UTF-8 character: \xc2\x0a (unexpected non-continuation byte 0x0a, immediately after start byte 0xc2; need 2 bytes, got 1) at - line 6.
diff --git a/t/op/pack.t b/t/op/pack.t
index df1646426c..014fbc55ac 100644
--- a/t/op/pack.t
+++ b/t/op/pack.t
@@ -964,7 +964,7 @@ is("@{[unpack('U*', pack('U*', 100, 200))]}", "100 200");
         my $bad = pack("U0C", 202);
         local $SIG{__WARN__} = sub { $@ = "@_" };
         my @null = unpack('U0U', $bad);
-        like($@, qr/^Malformed UTF-8 character /);
+        like($@, qr/^Malformed UTF-8 character: /);
     }
 }
 
diff --git a/t/op/utf8decode.t b/t/op/utf8decode.t
index c8c7dc7dc0..40ec540713 100644
--- a/t/op/utf8decode.t
+++ b/t/op/utf8decode.t
@@ -141,29 +141,29 @@ __DATA__
 3.3.10 n -	5	fd:bf:bf:bf:bf	-	5 bytes, need 6
 3.4	Concatenation of incomplete sequences
 3.4.1 N10 -	30	c0:e0:80:f0:80:80:f8:80:80:80:fc:80:80:80:80:df:ef:bf:f7:bf:bf:fb:bf:bf:bf:fd:bf:bf:bf:bf	-	unexpected non-continuation byte 0xe0, immediately after start byte 0xc0
-3.5	Impossible bytes
-3.5.1 n -	1	fe	-	byte 0xfe
-3.5.2 n -	1	ff	-	byte 0xff
+3.5	Impossible bytes (but not with Perl's extended UTF-8)
+3.5.1 n -	1	fe	-	1 byte, need 7
+3.5.2 n -	1	ff	-	1 byte, need 13
 3.5.3 N4 -	4	fe:fe:ff:ff	-	byte 0xfe
 4	Overlong sequences
 4.1	Examples of an overlong ASCII character
-4.1.1 n -	2	c0:af	-	2 bytes, need 1
-4.1.2 n -	3	e0:80:af	-	3 bytes, need 1
-4.1.3 n -	4	f0:80:80:af	-	4 bytes, need 1
-4.1.4 n -	5	f8:80:80:80:af	-	5 bytes, need 1
-4.1.5 n -	6	fc:80:80:80:80:af	-	6 bytes, need 1
+4.1.1 n -	2	c0:af	-	overlong
+4.1.2 n -	3	e0:80:af	-	overlong
+4.1.3 n -	4	f0:80:80:af	-	overlong
+4.1.4 n -	5	f8:80:80:80:af	-	overlong
+4.1.5 n -	6	fc:80:80:80:80:af	-	overlong
 4.2	Maximum overlong sequences
-4.2.1 n -	2	c1:bf	-	2 bytes, need 1
-4.2.2 n -	3	e0:9f:bf	-	3 bytes, need 2
-4.2.3 n -	4	f0:8f:bf:bf	-	4 bytes, need 3
-4.2.4 n -	5	f8:87:bf:bf:bf	-	5 bytes, need 4
-4.2.5 n -	6	fc:83:bf:bf:bf:bf	-	6 bytes, need 5
+4.2.1 n -	2	c1:bf	-	overlong
+4.2.2 n -	3	e0:9f:bf	-	overlong
+4.2.3 n -	4	f0:8f:bf:bf	-	overlong
+4.2.4 n -	5	f8:87:bf:bf:bf	-	overlong
+4.2.5 n -	6	fc:83:bf:bf:bf:bf	-	overlong
 4.3	Overlong representation of the NUL character
-4.3.1 n -	2	c0:80	-	2 bytes, need 1
-4.3.2 n -	3	e0:80:80	-	3 bytes, need 1
-4.3.3 n -	4	f0:80:80:80	-	4 bytes, need 1
-4.3.4 n -	5	f8:80:80:80:80	-	5 bytes, need 1
-4.3.5 n -	6	fc:80:80:80:80:80	-	6 bytes, need 1
+4.3.1 n -	2	c0:80	-	overlong
+4.3.2 n -	3	e0:80:80	-	overlong
+4.3.3 n -	4	f0:80:80:80	-	overlong
+4.3.4 n -	5	f8:80:80:80:80	-	overlong
+4.3.5 n -	6	fc:80:80:80:80:80	-	overlong
 5	Illegal code positions
 5.1	Single UTF-16 surrogates
 5.1.1 y d800	3	ed:a0:80	1	UTF-16 surrogate 0xd800
diff --git a/utf8.c b/utf8.c
index 2c3ecf7090..1803bd8f91 100644
--- a/utf8.c
+++ b/utf8.c
@@ -656,21 +656,85 @@ Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
 #undef FE_ABOVE_OVERLONG
 #undef FF_OVERLONG_PREFIX
 
+STATIC char *
+S__byte_dump_string(pTHX_ const U8 * s, const STRLEN len)
+{
+    /* Returns a mortalized C string that is a displayable copy of the 'len'
+     * bytes starting at 's', each in a \xXY format. */
+
+    const STRLEN output_len = 4 * len + 1;  /* 4 bytes per each input, plus a
+                                               trailing NUL */
+    const U8 * const e = s + len;
+    char * output;
+    char * d;
+
+    PERL_ARGS_ASSERT__BYTE_DUMP_STRING;
+
+    Newx(output, output_len, char);
+    SAVEFREEPV(output);
+
+    d = output;
+    for (; s < e; s++) {
+        const unsigned high_nibble = (*s & 0xF0) >> 4;
+        const unsigned low_nibble =  (*s & 0x0F);
+
+        *d++ = '\\';
+        *d++ = 'x';
+
+        if (high_nibble < 10) {
+            *d++ = high_nibble + '0';
+        }
+        else {
+            *d++ = high_nibble - 10 + 'a';
+        }
+
+        if (low_nibble < 10) {
+            *d++ = low_nibble + '0';
+        }
+        else {
+            *d++ = low_nibble - 10 + 'a';
+        }
+    }
+
+    *d = '\0';
+    return output;
+}
+
 PERL_STATIC_INLINE char *
-S_unexpected_non_continuation_text(pTHX_ const U8 * const s, const STRLEN len)
+S_unexpected_non_continuation_text(pTHX_ const U8 * const s,
+
+                                         /* How many bytes to print */
+                                         const STRLEN print_len,
+
+                                         /* Which one is the non-continuation */
+                                         const STRLEN non_cont_byte_pos,
+
+                                         /* How many bytes should there be? */
+                                         const STRLEN expect_len)
 {
     /* Return the malformation warning text for an unexpected continuation
      * byte. */
 
-    const char * const where = (len == 1)
+    const char * const where = (non_cont_byte_pos == 1)
                                ? "immediately"
-                               : Perl_form(aTHX_ "%d bytes", (int) len);
+                               : Perl_form(aTHX_ "%d bytes",
+                                                 (int) non_cont_byte_pos);
 
     PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT;
 
-    return Perl_form(aTHX_ "%s (unexpected non-continuation byte 0x%02x,"
-                           " %s after start byte 0x%02x)",
-                           malformed_text, *(s + len), where, *s);
+    /* We don't need to pass this parameter, but since it has already been
+     * calculated, it's likely faster to pass it; verify under DEBUGGING */
+    assert(expect_len == UTF8SKIP(s));
+
+    return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x,"
+                           " %s after start byte 0x%02x; need %d bytes, got %d)",
+                           malformed_text,
+                           _byte_dump_string(s, print_len),
+                           *(s + non_cont_byte_pos),
+                           where,
+                           *s,
+                           (int) expect_len,
+                           (int) non_cont_byte_pos);
 }
 
 /*
@@ -782,7 +846,6 @@ UV
 Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 {
     const U8 * const s0 = s;
-    U8 overflow_byte = '\0';	/* Save byte in case of overflow */
     U8 * send;
     UV uv = *s;
     STRLEN expectlen;
@@ -827,7 +890,8 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 	    return 0;
 	}
 	if (! (flags & UTF8_CHECK_ONLY)) {
-	    sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (empty string)", malformed_text));
+	    sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (empty string)",
+                                                malformed_text));
 	}
 	goto malformed;
     }
@@ -857,7 +921,11 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 	}
 
 	if (! (flags & UTF8_CHECK_ONLY)) {
-	    sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected continuation byte 0x%02x, with no preceding start byte)", malformed_text, *s0));
+	    sv = sv_2mortal(Perl_newSVpvf(aTHX_
+                            "%s: %s (unexpected continuation byte 0x%02x,"
+                            " with no preceding start byte)",
+                            malformed_text,
+                            _byte_dump_string(s0, 1), *s0));
 	}
 	curlen = 1;
 	goto malformed;
@@ -887,7 +955,6 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 		 * Set a flag, but keep going in the loop, so that we absorb
 		 * the rest of the bytes that comprise the character. */
 		overflowed = TRUE;
-		overflow_byte = *s; /* Save for warning message's use */
 	    }
 	    uv = UTF8_ACCUMULATE(uv, *s);
 	}
@@ -924,7 +991,10 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 	if (!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 	    if (! (flags & UTF8_CHECK_ONLY)) {
                 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s",
-                                unexpected_non_continuation_text(s0, curlen)));
+                                unexpected_non_continuation_text(s0,
+                                                             send - s0,
+                                                             s - s0,
+                                                             (int) expectlen)));
 	    }
 	    goto malformed;
 	}
@@ -940,7 +1010,13 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
     else if (UNLIKELY(curlen < expectlen)) {
 	if (! (flags & UTF8_ALLOW_SHORT)) {
 	    if (! (flags & UTF8_CHECK_ONLY)) {
-		sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, *s0));
+		sv = sv_2mortal(Perl_newSVpvf(aTHX_
+                                "%s: %s (too short; got %d byte%s, need %d)",
+                                malformed_text,
+                                _byte_dump_string(s0, send - s0),
+                                (int)curlen,
+                                curlen == 1 ? "" : "s",
+                                (int)expectlen));
 	    }
 	    goto malformed;
 	}
@@ -952,7 +1028,9 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
     }
 
     if (UNLIKELY(overflowed)) {
-	sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (overflow at byte 0x%02x, after start byte 0x%02x)", malformed_text, overflow_byte, *s0));
+        sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s: %s (overflows)",
+                                            malformed_text,
+                                            _byte_dump_string(s0, send - s0)));
 	goto malformed;
     }
 
@@ -965,7 +1043,16 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 	 * value, instead of the replacement character.  This is because this
 	 * value is actually well-defined. */
 	if (! (flags & UTF8_CHECK_ONLY)) {
-	    sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)expectlen, expectlen == 1 ? "": "s", OFFUNISKIP(uv), *s0));
+            U8 tmpbuf[UTF8_MAXBYTES+1];
+            const U8 * const e = uvchr_to_utf8(tmpbuf, uv);
+            sv = sv_2mortal(Perl_newSVpvf(aTHX_
+                  "%s: %s (overlong; instead use %s to represent U+%0*"UVXf")",
+                  malformed_text,
+                  _byte_dump_string(s0, send - s0),
+                  _byte_dump_string(tmpbuf, e - tmpbuf),
+                  ((uv < 256) ? 2 : 4), /* Field width of 2 for small code
+                                           points */
+                  uv));
 	}
 	goto malformed;
     }
@@ -1279,7 +1366,7 @@ Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
                         /* diag_listed_as: Malformed UTF-8 character (%s) */
 			Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
                                     "%s %s%s",
-                                    unexpected_non_continuation_text(u - 1, 1),
+                                    unexpected_non_continuation_text(u - 1, 2, 1, 2),
                                     PL_op ? " in " : "",
                                     PL_op ? OP_DESC(PL_op) : "");
 			return -2;
author	Karl Williamson <khw@cpan.org>	2016-09-28 15:05:17 -0600
committer	Karl Williamson <khw@cpan.org>	2016-10-13 11:18:11 -0600
commit	7cf8d05d1e856f3bd3a392b3ccea008f1c1eb743 (patch)
tree	3387705d72ad140eb4ab6412aa70adb94afc6bb5
parent	806547a7dc29226b6a06672e1d42fb136e766510 (diff)
download	perl-7cf8d05d1e856f3bd3a392b3ccea008f1c1eb743.tar.gz