diff options
author | Karl Williamson <khw@cpan.org> | 2016-11-22 17:47:35 -0700 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2016-11-24 08:29:14 -0700 |
commit | e308b348b63b8c65648ae3d340ce96b3ec19f1a2 (patch) | |
tree | 2b8b5bf146d199751c85d6a43b9b3dae21b7a3f2 /utf8.c | |
parent | 2d0a280183c8525ba909db81b0007830c2f3a118 (diff) | |
download | perl-e308b348b63b8c65648ae3d340ce96b3ec19f1a2.tar.gz |
Split diagnostics for two UTF-8 malformations
Some UTF-8 sequences may have multiple malformations. Commit
2b5e7bc2e60b4c4b5d87aa66e066363d9dce7930 tried to make sure that all
possible ones are raised, instead of abandoning searching after one is
found. Since then, I have realized that there was yet another case of two
malformations for which it returned only one or the other.
An input buffer may be too short to fully express the code point it
purports to. This can be determined by the first byte of the UTF-8
sequence indicating a longer sequence is required than the space
available. But that shortened sequence can also contain the premature
beginning of another character before the point where the input runs
out. This commit causes both of these to be raised, instead of the
previous behavior of noting just one.
Diffstat (limited to 'utf8.c')
-rw-r--r-- | utf8.c | 75 |
1 files changed, 40 insertions, 35 deletions
@@ -1015,6 +1015,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, STRLEN expectlen = 0; /* How long should this sequence be? (initialized to silence compilers' wrong warning) */ + STRLEN avail_len = 0; /* When input is too short, gives what that is */ U32 discard_errors = 0; /* Used to save branches when 'errors' is NULL; this gets set and discarded */ @@ -1101,12 +1102,21 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, * sequence, leaving just the bits that are part of the value. */ uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen); + /* Setup the loop end point, making sure to not look past the end of the + * input string, and flag it as too short if the size isn't big enough. */ + send = (U8*) s0; + if (UNLIKELY(curlen < expectlen)) { + possible_problems |= UTF8_GOT_SHORT; + avail_len = curlen; + send += curlen; + } + else { + send += expectlen; + } + adjusted_send = send; + /* Now, loop through the remaining bytes in the character's sequence, - * accumulating each into the working value as we go. Be sure to not look - * past the end of the input string */ - send = adjusted_send = (U8*) s0 + ((expectlen <= curlen) - ? expectlen - : curlen); + * accumulating each into the working value as we go. */ for (s = s0 + 1; s < send; s++) { if (LIKELY(UTF8_IS_CONTINUATION(*s))) { uv = UTF8_ACCUMULATE(uv, *s); @@ -1116,21 +1126,17 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, /* Here, found a non-continuation before processing all expected bytes. * This byte indicates the beginning of a new character, so quit, even * if allowing this malformation. */ - curlen = s - s0; /* Save how many bytes we actually got */ possible_problems |= UTF8_GOT_NON_CONTINUATION; - goto finish_short; + break; } /* End of loop through the character's bytes */ /* Save how many bytes were actually in the character */ curlen = s - s0; - /* Did we get all the continuation bytes that were expected? Note that we - * know this result even without executing the loop above. 
But we had to - * do the loop to see if there are unexpected non-continuations. */ - if (UNLIKELY(curlen < expectlen)) { - possible_problems |= UTF8_GOT_SHORT; + /* A convenience macro that matches either of the too-short conditions. */ +# define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION) - finish_short: + if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) { uv_so_far = uv; uv = UNICODE_REPLACEMENT; } @@ -1164,10 +1170,6 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, { possible_problems |= UTF8_GOT_LONG; - /* A convenience macro that matches either of the too-short conditions. - * */ -# define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION) - if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) { UV min_uv = uv_so_far; STRLEN i; @@ -1264,6 +1266,9 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, /* At this point: * curlen contains the number of bytes in the sequence that * this call should advance the input by. + * avail_len gives the available number of bytes passed in, but + * only if this is less than the expected number of + * bytes, based on the code point's start byte. * possible_problems' is 0 if there weren't any problems; otherwise a bit * is set in it for each potential problem found. * uv contains the code point the input sequence @@ -1360,22 +1365,6 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, } } } - else if (possible_problems & UTF8_GOT_NON_CONTINUATION) { - possible_problems &= ~UTF8_GOT_NON_CONTINUATION; - *errors |= UTF8_GOT_NON_CONTINUATION; - - if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) { - disallowed = TRUE; - if (ckWARN_d(WARN_UTF8) && ! 
(flags & UTF8_CHECK_ONLY)) { - pack_warn = packWARN(WARN_UTF8); - message = Perl_form(aTHX_ "%s", - unexpected_non_continuation_text(s0, - send - s0, - s - s0, - (int) expectlen)); - } - } - } else if (possible_problems & UTF8_GOT_SHORT) { possible_problems &= ~UTF8_GOT_SHORT; *errors |= UTF8_GOT_SHORT; @@ -1388,13 +1377,29 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, "%s: %s (too short; %d byte%s available, need %d)", malformed_text, _byte_dump_string(s0, send - s0), - (int)curlen, - curlen == 1 ? "" : "s", + (int)avail_len, + avail_len == 1 ? "" : "s", (int)expectlen); } } } + else if (possible_problems & UTF8_GOT_NON_CONTINUATION) { + possible_problems &= ~UTF8_GOT_NON_CONTINUATION; + *errors |= UTF8_GOT_NON_CONTINUATION; + + if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) { + disallowed = TRUE; + if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) { + pack_warn = packWARN(WARN_UTF8); + message = Perl_form(aTHX_ "%s", + unexpected_non_continuation_text(s0, + send - s0, + s - s0, + (int) expectlen)); + } + } + } else if (possible_problems & UTF8_GOT_LONG) { possible_problems &= ~UTF8_GOT_LONG; *errors |= UTF8_GOT_LONG; |