utf8n_to_uvchr(): Reduce chances of reading beyond buffer

utf8n_to_uvchr() can be called incorrectly, leading it to believe the buffer is longer than it actually is. But often, it will be called with NUL-terminated strings, so it can reduce its chances of being fooled by refusing to read beyond a NUL. The NUL will terminate any UTF-8 byte sequence, and the only reason to read beyond it would be to print all the expected bytes in the sequence. This commit is not the final word, but it is an easy fix for a common case.
author: Karl Williamson <khw@cpan.org> 2016-10-19 21:20:48 -0600
committer: Karl Williamson <khw@cpan.org> 2016-10-19 21:32:13 -0600
commit: 3cc6a05eedade6f51526feb18c12356b0589d77a (patch)
tree: fd4a087fd5513b9149a6f3bfe1627c75e7e41737
parent: 14c482b0d8b7402f1b2b28d2918a55c83577d6ff (diff)
download: perl-3cc6a05eedade6f51526feb18c12356b0589d77a.tar.gz
3 files changed, 16 insertions, 3 deletions
diff --git a/embed.fnc b/embed.fnc
index 94cb9845a9..5cc73b7978 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1685,7 +1685,7 @@ inRP	|bool	|does_utf8_overflow|NN const U8 * const s|NN const U8 * e
 inRP	|bool	|is_utf8_overlong_given_start_byte_ok|NN const U8 * const s|const STRLEN len
 sMR	|char *	|unexpected_non_continuation_text			\
 		|NN const U8 * const s					\
-		|const STRLEN print_len					\
+		|STRLEN print_len					\
 		|const STRLEN non_cont_byte_pos				\
 		|const STRLEN expect_len
 sM	|char *	|_byte_dump_string|NN const U8 * s|const STRLEN len
diff --git a/proto.h b/proto.h
index ec870f73c0..1d79c46e2e 100644
--- a/proto.h
+++ b/proto.h
@@ -5635,7 +5635,7 @@ STATIC SV*	S_swatch_get(pTHX_ SV* swash, UV start, UV span)
 STATIC U8	S_to_lower_latin1(const U8 c, U8 *p, STRLEN *lenp)
 			__attribute__warn_unused_result__;
 
-STATIC char *	S_unexpected_non_continuation_text(pTHX_ const U8 * const s, const STRLEN print_len, const STRLEN non_cont_byte_pos, const STRLEN expect_len)
+STATIC char *	S_unexpected_non_continuation_text(pTHX_ const U8 * const s, STRLEN print_len, const STRLEN non_cont_byte_pos, const STRLEN expect_len)
 			__attribute__warn_unused_result__;
 #define PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT	\
 	assert(s)
diff --git a/utf8.c b/utf8.c
index d7450d79f1..f017f7199a 100644
--- a/utf8.c
+++ b/utf8.c
@@ -735,7 +735,7 @@ PERL_STATIC_INLINE char *
 S_unexpected_non_continuation_text(pTHX_ const U8 * const s,
 
                                          /* How many bytes to print */
-                                         const STRLEN print_len,
+                                         STRLEN print_len,
 
                                          /* Which one is the non-continuation */
                                          const STRLEN non_cont_byte_pos,
@@ -750,6 +750,7 @@ S_unexpected_non_continuation_text(pTHX_ const U8 * const s,
                                ? "immediately"
                                : Perl_form(aTHX_ "%d bytes",
                                                  (int) non_cont_byte_pos);
+    unsigned int i;
 
     PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT;
 
@@ -757,6 +758,18 @@ S_unexpected_non_continuation_text(pTHX_ const U8 * const s,
      * calculated, it's likely faster to pass it; verify under DEBUGGING */
     assert(expect_len == UTF8SKIP(s));
 
+    /* It is possible that utf8n_to_uvchr() was called incorrectly, with a
+     * length that is larger than is actually available in the buffer.  If we
+     * print all the bytes based on that length, we will read past the buffer
+     * end.  Often, the strings are NUL terminated, so to lower the chances of
+     * this happening, print the malformed bytes only up through any NUL. */
+    for (i = 1; i < print_len; i++) {
+        if (*(s + i) == '\0') {
+            print_len = i + 1;  /* +1 gets the NUL printed */
+            break;
+        }
+    }
+
     return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x,"
                            " %s after start byte 0x%02x; need %d bytes, got %d)",
                            malformed_text,
author	Karl Williamson <khw@cpan.org>	2016-10-19 21:20:48 -0600
committer	Karl Williamson <khw@cpan.org>	2016-10-19 21:32:13 -0600
commit	3cc6a05eedade6f51526feb18c12356b0589d77a (patch)
tree	fd4a087fd5513b9149a6f3bfe1627c75e7e41737
parent	14c482b0d8b7402f1b2b28d2918a55c83577d6ff (diff)
download	perl-3cc6a05eedade6f51526feb18c12356b0589d77a.tar.gz