summary refs log tree commit diff
path: root/utf8.c
diff options
context:
space:
mode:
author Karl Williamson <public@khwilliamson.com> 2012-04-27 11:09:14 -0600
committer Ricardo Signes <rjbs@cpan.org> 2012-05-01 19:08:57 -0400
commit 2f8f112e03b73a49c60674d3b5e00b4463f1d5b7 (patch)
tree abb87cf258b6151db2e57d7147562ab893ea0372 /utf8.c
parent 1530a57dfaff29c214be6c42259309d263abc973 (diff)
download perl-2f8f112e03b73a49c60674d3b5e00b4463f1d5b7.tar.gz
utf8n_to_uvuni(): Fix broken malformation interactions
All code points whose UTF-8 representations start with the byte \xFE or \xFF are considered problematic because they are not portable. Many such code points are too large to represent on a 32-bit, or even a 64-bit, platform. Commit eb83ed87110e41de6a4cd4463f75df60798a9243 failed to properly catch overflow when the input flags to this function say to warn on, but otherwise accept, FE and FF sequences. Now overflow is checked for unconditionally.
Diffstat (limited to 'utf8.c')
-rw-r--r-- utf8.c 15
1 files changed, 12 insertions, 3 deletions
diff --git a/utf8.c b/utf8.c
index c01ea4b5b9..83d239735e 100644
--- a/utf8.c
+++ b/utf8.c
@@ -560,6 +560,7 @@ Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
UV pack_warn = 0; /* Save result of packWARN() for later */
bool unexpected_non_continuation = FALSE;
bool overflowed = FALSE;
+ bool do_overlong_test = TRUE; /* May have to skip this test */
const char* const malformed_text = "Malformed UTF-8 character";
@@ -707,6 +708,10 @@ Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
goto malformed;
}
uv = UNICODE_REPLACEMENT;
+
+ /* Skip testing for overlongs, as the REPLACEMENT may not be the same
+ * as what the original expectations were. */
+ do_overlong_test = FALSE;
if (retlen) {
*retlen = curlen;
}
@@ -719,13 +724,14 @@ Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
goto malformed;
}
uv = UNICODE_REPLACEMENT;
+ do_overlong_test = FALSE;
if (retlen) {
*retlen = curlen;
}
}
#ifndef EBCDIC /* EBCDIC allows FE, FF, can't overflow */
- else if ((*s0 & 0xFE) == 0xFE /* matches FE or FF */
+ if ((*s0 & 0xFE) == 0xFE /* matches both FE, FF */
&& (flags & (UTF8_WARN_FE_FF|UTF8_DISALLOW_FE_FF)))
{
/* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary
@@ -740,7 +746,7 @@ Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
goto malformed;
}
}
- else if (overflowed) {
+ if (overflowed) {
/* If the first byte is FF, it will overflow a 32-bit word. If the
* first byte is FE, it will overflow a signed 32-bit word. The
@@ -751,7 +757,10 @@ Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
}
#endif
- else if (expectlen > (STRLEN)UNISKIP(uv) && ! (flags & UTF8_ALLOW_LONG)) {
+ if (do_overlong_test
+ && expectlen > (STRLEN)UNISKIP(uv)
+ && ! (flags & UTF8_ALLOW_LONG))
+ {
/* The overlong malformation has lower precedence than the others.
* Note that if this malformation is allowed, we return the actual
* value, instead of the replacement character. This is because this