summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Karl Williamson <public@khwilliamson.com> 2011-11-21 17:04:38 -0700
committer: Karl Williamson <public@khwilliamson.com> 2011-11-21 17:20:24 -0700
commit: e032854818ad111e923e4cb101b55494a5bcff56 (patch)
tree: 62de35eab3945636e4ecdad08cc85887c7b544f3
parent: 880d3d38e51b563e24201397c8ab4f960508d33e (diff)
download: perl-e032854818ad111e923e4cb101b55494a5bcff56.tar.gz
PATCH: [perl #32080] is_utf8_string() reads too far
This function and is_utf8_string_loclen() are modified to check before reading beyond the end of the string; and the pod for is_utf8_char() is modified to warn about the buffer overflow potential.
-rw-r--r-- utf8.c 58
1 files changed, 30 insertions, 28 deletions
diff --git a/utf8.c b/utf8.c
index 6831e76105..150af5ce1c 100644
--- a/utf8.c
+++ b/utf8.c
@@ -323,6 +323,9 @@ character. Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines)
character is a valid UTF-8 character. The actual number of bytes in the UTF-8
character will be returned if it is valid, otherwise 0.
+WARNING: use only if you *know* that C<s> has at least UTF8_MAXBYTES bytes, or
+at least UTF8SKIP(s) bytes.
+
=cut */
STRLEN
Perl_is_utf8_char(const U8 *s)
@@ -343,9 +346,9 @@ Perl_is_utf8_char(const U8 *s)
Returns true if first C<len> bytes of the given string form a valid
UTF-8 string, false otherwise. If C<len> is 0, it will be calculated
-using C<strlen(s)>. Note that 'a valid UTF-8 string' does not mean 'a
-string that contains code points above 0x7F encoded in UTF-8' because a
-valid ASCII string is a valid UTF-8 string.
+using C<strlen(s)> (which means that if you use this option, C<s> must have a
+terminating NUL byte). Note that a string consisting entirely of ASCII
+characters is 'a valid UTF-8 string'.
See also is_ascii_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
@@ -361,35 +364,32 @@ Perl_is_utf8_string(const U8 *s, STRLEN len)
PERL_ARGS_ASSERT_IS_UTF8_STRING;
while (x < send) {
- STRLEN c;
/* Inline the easy bits of is_utf8_char() here for speed... */
- if (UTF8_IS_INVARIANT(*x))
- c = 1;
+ if (UTF8_IS_INVARIANT(*x)) {
+ x++;
+ }
else if (!UTF8_IS_START(*x))
- goto out;
+ return FALSE;
else {
/* ... and call is_utf8_char() only if really needed. */
-#ifdef IS_UTF8_CHAR
- c = UTF8SKIP(x);
+ const STRLEN c = UTF8SKIP(x);
+ const U8* const next_char_ptr = x + c;
+
+ if (next_char_ptr > send) {
+ return FALSE;
+ }
+
if (IS_UTF8_CHAR_FAST(c)) {
if (!IS_UTF8_CHAR(x, c))
- c = 0;
+ return FALSE;
}
- else
- c = is_utf8_char_slow(x, c);
-#else
- c = is_utf8_char(x);
-#endif /* #ifdef IS_UTF8_CHAR */
- if (!c)
- goto out;
+ else if (! is_utf8_char_slow(x, c)) {
+ return FALSE;
+ }
+ x = next_char_ptr;
}
- x += c;
}
- out:
- if (x != send)
- return FALSE;
-
return TRUE;
}
@@ -427,27 +427,29 @@ Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
while (x < send) {
+ const U8* next_char_ptr;
+
/* Inline the easy bits of is_utf8_char() here for speed... */
if (UTF8_IS_INVARIANT(*x))
- c = 1;
+ next_char_ptr = x + 1;
else if (!UTF8_IS_START(*x))
goto out;
else {
/* ... and call is_utf8_char() only if really needed. */
-#ifdef IS_UTF8_CHAR
c = UTF8SKIP(x);
+ next_char_ptr = c + x;
+ if (next_char_ptr > send) {
+ goto out;
+ }
if (IS_UTF8_CHAR_FAST(c)) {
if (!IS_UTF8_CHAR(x, c))
c = 0;
} else
c = is_utf8_char_slow(x, c);
-#else
- c = is_utf8_char(x);
-#endif /* #ifdef IS_UTF8_CHAR */
if (!c)
goto out;
}
- x += c;
+ x = next_char_ptr;
outlen++;
}