diff options
author | Karl Williamson <public@khwilliamson.com> | 2011-11-21 17:04:38 -0700 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2011-11-21 17:20:24 -0700 |
commit | e032854818ad111e923e4cb101b55494a5bcff56 (patch) | |
tree | 62de35eab3945636e4ecdad08cc85887c7b544f3 | |
parent | 880d3d38e51b563e24201397c8ab4f960508d33e (diff) | |
download | perl-e032854818ad111e923e4cb101b55494a5bcff56.tar.gz |
PATCH: [perl #32080] is_utf8_string() reads too far
This function and is_utf8_string_loclen() are modified to check before
reading beyond the end of the string; and the pod for is_utf8_char()
is modified to warn about the buffer overflow potential.
-rw-r--r-- | utf8.c | 58 |
1 files changed, 30 insertions, 28 deletions
```diff
@@ -323,6 +323,9 @@ character. Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines)
 character is a valid UTF-8 character. The actual number of bytes in the UTF-8
 character will be returned if it is valid, otherwise 0.
 
+WARNING: use only if you *know* that C<s> has at least either UTF8_MAXBYTES or
+UTF8SKIP(s) bytes.
+
 =cut */
 STRLEN
 Perl_is_utf8_char(const U8 *s)
@@ -343,9 +346,9 @@ Perl_is_utf8_char(const U8 *s)
 
 Returns true if first C<len> bytes of the given string form a valid
 UTF-8 string, false otherwise. If C<len> is 0, it will be calculated
-using C<strlen(s)>. Note that 'a valid UTF-8 string' does not mean 'a
-string that contains code points above 0x7F encoded in UTF-8' because a
-valid ASCII string is a valid UTF-8 string.
+using C<strlen(s)> (which means if you use this option, that C<s> has to have a
+terminating NUL byte). Note that all characters being ASCII constitute 'a
+valid UTF-8 string'.
 
 See also is_ascii_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
 
@@ -361,35 +364,32 @@ Perl_is_utf8_string(const U8 *s, STRLEN len)
     PERL_ARGS_ASSERT_IS_UTF8_STRING;
 
     while (x < send) {
-        STRLEN c;
          /* Inline the easy bits of is_utf8_char() here for speed... */
-         if (UTF8_IS_INVARIANT(*x))
-              c = 1;
+         if (UTF8_IS_INVARIANT(*x)) {
+            x++;
+         }
          else if (!UTF8_IS_START(*x))
-             goto out;
+             return FALSE;
          else {
               /* ... and call is_utf8_char() only if really needed. */
-#ifdef IS_UTF8_CHAR
-             c = UTF8SKIP(x);
+             const STRLEN c = UTF8SKIP(x);
+             const U8* const next_char_ptr = x + c;
+
+             if (next_char_ptr > send) {
+                 return FALSE;
+             }
+
              if (IS_UTF8_CHAR_FAST(c)) {
                  if (!IS_UTF8_CHAR(x, c))
-                     c = 0;
+                     return FALSE;
              }
-             else
-                c = is_utf8_char_slow(x, c);
-#else
-             c = is_utf8_char(x);
-#endif /* #ifdef IS_UTF8_CHAR */
-              if (!c)
-                  goto out;
+             else if (! is_utf8_char_slow(x, c)) {
+                 return FALSE;
+             }
+             x = next_char_ptr;
          }
-        x += c;
     }
 
- out:
-    if (x != send)
-        return FALSE;
-
     return TRUE;
 }
 
@@ -427,27 +427,29 @@ Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
     PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
 
     while (x < send) {
+         const U8* next_char_ptr;
+
          /* Inline the easy bits of is_utf8_char() here for speed... */
          if (UTF8_IS_INVARIANT(*x))
-             c = 1;
+             next_char_ptr = x + 1;
          else if (!UTF8_IS_START(*x))
              goto out;
          else {
              /* ... and call is_utf8_char() only if really needed. */
-#ifdef IS_UTF8_CHAR
              c = UTF8SKIP(x);
+             next_char_ptr = c + x;
+             if (next_char_ptr > send) {
+                 goto out;
+             }
              if (IS_UTF8_CHAR_FAST(c)) {
                  if (!IS_UTF8_CHAR(x, c))
                      c = 0;
              } else
                  c = is_utf8_char_slow(x, c);
-#else
-             c = is_utf8_char(x);
-#endif /* #ifdef IS_UTF8_CHAR */
              if (!c)
                  goto out;
          }
-         x += c;
+         x = next_char_ptr;
          outlen++;
     }
 
```