summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKarl Williamson <public@khwilliamson.com>2012-03-25 11:05:55 -0600
committerKarl Williamson <public@khwilliamson.com>2012-03-25 19:28:58 -0600
commit92e107508971a4414c83a4cb347f8e50c829b8ac (patch)
treed7e18f3e58ffb30688565db71f44490ee8c9a257
parent37254b39824bccccf4f8b727cb87e556877bc308 (diff)
downloadperl-smoke-me/khw-smoke.tar.gz
-rw-r--r--pp_pack.c1
-rw-r--r--regexec.c4
-rw-r--r--t/op/utf8decode.t8
-rw-r--r--utf8.c310
-rw-r--r--utf8.h83
5 files changed, 254 insertions, 152 deletions
diff --git a/pp_pack.c b/pp_pack.c
index 273908cf98..62aa60055c 100644
--- a/pp_pack.c
+++ b/pp_pack.c
@@ -644,6 +644,7 @@ uni_to_byte(pTHX_ const char **s, const char *end, I32 datumtype)
{
STRLEN retlen;
UV val = utf8n_to_uvchr((U8 *) *s, end-*s, &retlen,
+ /* XXX */
ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
/* We try to process malformed UTF-8 as much as possible (preferably with
warnings), but these two mean we make no progress in the string and
diff --git a/regexec.c b/regexec.c
index 8ccb6f7032..1343488b08 100644
--- a/regexec.c
+++ b/regexec.c
@@ -5257,10 +5257,10 @@ NULL
#ifdef EBCDIC
ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
ckWARN(WARN_UTF8) ?
- 0 : UTF8_ALLOW_ANY);
+ 0 : UTF8_ALLOW_ANYUV);
ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
ckWARN(WARN_UTF8) ?
- 0 : UTF8_ALLOW_ANY);
+ 0 : UTF8_ALLOW_ANYUV);
#else
ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
uniflags);
diff --git a/t/op/utf8decode.t b/t/op/utf8decode.t
index ba785fa10a..6e69db9b62 100644
--- a/t/op/utf8decode.t
+++ b/t/op/utf8decode.t
@@ -1,5 +1,7 @@
#!./perl
+$|=1;
+
BEGIN {
chdir 't' if -d 't';
@INC = '../lib';
@@ -68,7 +70,9 @@ foreach (<DATA>) {
$expect = -$expect;
$::TODO = "Markus Kuhn states that $expect invalid sequences should be signalled";
}
- is(scalar @warnings, $expect, "Expected number of warnings for $id seen");
+ if (! is(scalar @warnings, $expect, "Expected number of warnings for $id seen")) {
+ note("Actual warnings received are:\n" . join("\n", @warnings));
+ }
}
} else {
fail("unknown format '$_'");
@@ -135,7 +139,7 @@ __DATA__
3.3.9 n - 4 fb:bf:bf:bf - 4 bytes, need 5
3.3.10 n - 5 fd:bf:bf:bf:bf - 5 bytes, need 6
3.4 Concatenation of incomplete sequences
-3.4.1 N-10 - 30 c0:e0:80:f0:80:80:f8:80:80:80:fc:80:80:80:80:df:ef:bf:f7:bf:bf:fb:bf:bf:bf:fd:bf:bf:bf:bf - unexpected non-continuation byte 0xe0, immediately after start byte 0xc0
+3.4.1 N10 - 30 c0:e0:80:f0:80:80:f8:80:80:80:fc:80:80:80:80:df:ef:bf:f7:bf:bf:fb:bf:bf:bf:fd:bf:bf:bf:bf - unexpected non-continuation byte 0xe0, immediately after start byte 0xc0
3.5 Impossible bytes
3.5.1 n - 1 fe - byte 0xfe
3.5.2 n - 1 ff - byte 0xff
diff --git a/utf8.c b/utf8.c
index f71040c314..010d966a27 100644
--- a/utf8.c
+++ b/utf8.c
@@ -137,7 +137,8 @@ Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
{
PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
- if (ckWARN_d(WARN_UTF8)) {
+ /* The first problematic code point is the first surrogate */
+ if (uv >= UNICODE_SURROGATE_FIRST && ckWARN_d(WARN_UTF8)) {
if (UNICODE_IS_SURROGATE(uv)) {
if (flags & UNICODE_WARN_SURROGATE) {
Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE),
@@ -484,9 +485,9 @@ character.
The value of C<flags> determines the behavior when C<s> does not point to a
well-formed UTF-8 character. If C<flags> is 0, when a malformation is found,
-C<retlen> is set to the expected length of the UTF-8 character in bytes, zero
-is returned, and if UTF-8 warnings haven't been lexically disabled, a warning
-is raised.
+zero is returned and C<*retlen> is set so that (S<C<s> + C<*retlen>>) is the
+next possible position in C<s> that could begin a non-malformed character.
+Also, if UTF-8 warnings haven't been lexically disabled, a warning is raised.
Various ALLOW flags can be set in C<flags> to allow (and not warn on)
individual types of malformations, such as the sequence being overlong (that
@@ -495,7 +496,7 @@ overlong sequences are expressly forbidden in the UTF-8 standard due to
potential security issues). Another malformation example is the first byte of
a character not being a legal first byte. See F<utf8.h> for the list of such
flags. Of course, the value returned by this function under such conditions is
-not reliable.
+not reliable. XXX
The UTF8_CHECK_ONLY flag overrides the behavior when a non-allowed (by other
flags) malformation is found. If this flag is set, the routine assumes that
@@ -545,11 +546,12 @@ Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
{
dVAR;
const U8 * const s0 = s;
+ U8 overflow_byte; /* Save byte in case of overflow */
+ U8 * send;
UV uv = *s, ouv = 0;
- STRLEN len = 1;
bool dowarn = ckWARN_d(WARN_UTF8);
const UV startbyte = *s;
- STRLEN expectlen = 0;
+ STRLEN expectlen;
U32 warning = 0;
SV* sv = NULL;
@@ -564,104 +566,157 @@ Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
#define UTF8_WARN_OVERFLOW 5
#define UTF8_WARN_LONG 6
- if (curlen == 0 &&
- !(flags & UTF8_ALLOW_EMPTY)) {
+ /* The order of malformation tests here is important. We should consume as
+ * few bytes as possible in order to find the very next valid character.
+ * For example, once we've done a UTF8SKIP, we can tell the expected number
+ * of bytes, and could fail right off the bat if there are too few
+ * available. But the byte that is garbled could be just that initial one,
+ * and a shorter sequence is really being passed in. So instead, we go through
+ * the sequence byte-by-byte, until we reach the end or find a byte that
+ * begins the next character. Then we know how long the character really is.
+ *
+ * We also should not consume too few bytes, otherwise someone could inject
+ * things. For example, an input could be deliberately designed to
+ * overflow, and if this code bailed out immediately upon discovering that,
+ * returning to the caller *retlen pointing to the very next byte (one
+ * which is actually part of of the overflowing sequence), that could look
+ * legitimate to the caller, which could discard the initial partial
+ * sequence and process the rest */
+
+ if (curlen == 0) {
+ if (retlen) {
+ *retlen = 0;
+ }
+
+ /* Zero length strings, if allowed, of necessity are zero */
+ if (flags & UTF8_ALLOW_EMPTY) {
+ return 0;
+ }
warning = UTF8_WARN_EMPTY;
+ curlen = 0;
goto malformed;
}
+ /* A well-formed UTF-8 character, as the vast majority of calls to this
+ * function will be for, has this expected length. For efficiency, set to
+ * return it here, and override it if necessary if a malformation is found
+ */
+ expectlen = UTF8SKIP(s);
+
+ if (retlen) {
+ *retlen = expectlen;
+ }
+
+ /* An invariant is trivially well-formed */
if (UTF8_IS_INVARIANT(uv)) {
- if (retlen)
- *retlen = 1;
return (UV) (NATIVE_TO_UTF(*s));
}
- if (UTF8_IS_CONTINUATION(uv) &&
- !(flags & UTF8_ALLOW_CONTINUATION)) {
- warning = UTF8_WARN_CONTINUATION;
- goto malformed;
- }
+ if (UTF8_IS_CONTINUATION(uv)) {
+ if (flags & UTF8_ALLOW_CONTINUATION) {
+ if (retlen) {
+ *retlen = 1;
+ }
+ return UNICODE_REPLACEMENT;
+ }
- if (LAX_UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
- !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
- warning = UTF8_WARN_NON_CONTINUATION;
+ warning = UTF8_WARN_CONTINUATION;
+ curlen = 1;
goto malformed;
}
#ifdef EBCDIC
uv = NATIVE_TO_UTF(uv);
-#else
- if (uv == 0xfe || uv == 0xff) {
- if (flags & (UTF8_WARN_SUPER|UTF8_WARN_FE_FF)) {
- sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point beginning with byte 0x%02"UVXf" is not Unicode, and not portable", uv));
- flags &= ~UTF8_WARN_SUPER; /* Only warn once on this problem */
- }
- if (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_FE_FF)) {
- goto malformed;
- }
- }
#endif
- if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
- else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
- else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
- else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
-#ifdef EBCDIC
- else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
- else { len = 7; uv &= 0x01; }
-#else
- else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
- else if (!(uv & 0x01)) { len = 7; uv = 0; }
- else { len = 13; uv = 0; } /* whoa! */
-#endif
+ /* Here is not a continuation byte, nor an invariant. The only thing left
+ * is a start byte (possibly for an overlong) */
- if (retlen)
- *retlen = len;
+ /* Remove the leading bits that indicate the number of bytes, leaving just
+ * the bits that are part of the value */
+ uv &= UTF_START_MASK(expectlen);
- expectlen = len;
+ /* Now, loop through the remaining bytes, accumulating each into the working total as we go */
+ /* Be sure to not look past the end of the input string */
+ send = (U8*) s0 + ((expectlen <= curlen) ? expectlen : curlen);
- if ((curlen < expectlen) &&
- !(flags & UTF8_ALLOW_SHORT)) {
- warning = UTF8_WARN_SHORT;
- goto malformed;
- }
-
- len--;
- s++;
ouv = uv; /* ouv is the value from the previous iteration */
- while (len--) {
- if (!UTF8_IS_CONTINUATION(*s) &&
- !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
- s--;
- warning = UTF8_WARN_NON_CONTINUATION;
- goto malformed;
- }
- else
+ for (++s; s < send; s++) {
+ if (UTF8_IS_CONTINUATION(*s)) {
uv = UTF8_ACCUMULATE(uv, *s);
- if (!(uv > ouv)) { /* If the value didn't grow from the previous
- iteration, something is horribly wrong */
- /* These cannot be allowed. */
- if (uv == ouv) {
- if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
- warning = UTF8_WARN_LONG;
- goto malformed;
- }
- }
- else { /* uv < ouv */
- /* This cannot be allowed. */
- warning = UTF8_WARN_OVERFLOW;
- goto malformed;
+ }
+ else {
+ /* Here, found a non-continuation before processing all expected
+ * bytes. This byte begins a new character, so quit, even if
+ * allowing this malformation. This malformation may also be
+ * construed as the too-short malformation, but because the warning
+ * has traditionally been for non-continuation, use it. If
+ * non-continuation is allowed, but shorts are not, code after the
+ * loop will catch this as a too-short */
+ if (!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
+ warning = UTF8_WARN_NON_CONTINUATION;
}
+ uv = UNICODE_REPLACEMENT;
+ break;
+ }
+
+ /* If the value shrank from the previous iteration, it is actually due
+ * to overflow. The original implementors viewed this malformation as
+ * more serious than the others (though I, khw, don't understand why,
+ * since there are others that give very very wrong results), so there
+ * is no way to turn off checking for it. Set a flag, but keep going
+ * in the loop, so that we fully parse the rest of the bytes that
+ * potentially comprise the character. */
+ if (uv < ouv) {
+ warning = UTF8_WARN_OVERFLOW;
+ overflow_byte = *s;
}
- s++;
- ouv = uv;
+
+ ouv = uv; /* Save for next iteration */
+
+ } /* End of loop through the character's bytes */
+
+ /* Save how many bytes were actually in the character */
+ curlen = s - s0;
+
+ /* The non-continuation warning has precedence over any others. This is to
+ * preserve backwards compatibility for applications that may be expecting
+ * them */
+ if (warning == UTF8_WARN_NON_CONTINUATION) {
+ goto malformed;
}
- if ((expectlen > (STRLEN)UNISKIP(uv)) && !(flags & UTF8_ALLOW_LONG)) {
+ /* If there were fewer bytes than expected, this is a too-short. This has
+ * priority over overflow, as, if the input is leading up to an overflow,
+ * but has too few bytes, it should be listed as a too-short */
+ if (curlen < expectlen) {
+ if (! (flags & UTF8_ALLOW_SHORT)) {
+ warning = UTF8_WARN_SHORT;
+ }
+ /* If too-shorts are allowed, the value returned will be much smaller
+ * than anticipated. Oh well... XXX out-dated comment */
+ uv = UNICODE_REPLACEMENT;
+ }
+ else if (! warning
+ && expectlen > (STRLEN)UNISKIP(uv)
+ && ! (flags & UTF8_ALLOW_LONG))
+ {
+ /* Note that if this malformation is allowed, we return the actual value,
+ * instead of the replacement character. This is because this value is
+ * actually well-defined. Note that this is a change in behavior from
+ * earlier, where checking for this malformation in most cases could
+ * not actually be turned off XXX experimental*/
warning = UTF8_WARN_LONG;
+ }
+
+ if (warning) {
goto malformed;
- } else if (flags & (UTF8_DISALLOW_ILLEGAL_INTERCHANGE|UTF8_WARN_ILLEGAL_INTERCHANGE)) {
+ }
+
+ if (uv >= UNICODE_SURROGATE_FIRST /* This is the first problematic code point */
+ && (flags & (UTF8_DISALLOW_ILLEGAL_INTERCHANGE|UTF8_WARN_ILLEGAL_INTERCHANGE)))
+ {
if (UNICODE_IS_SURROGATE(uv)) {
if ((flags & (UTF8_WARN_SURROGATE|UTF8_CHECK_ONLY)) == UTF8_WARN_SURROGATE) {
sv = sv_2mortal(Perl_newSVpvf(aTHX_ "UTF-16 surrogate U+%04"UVXf"", uv));
@@ -679,6 +734,19 @@ Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
}
}
else if ((uv > PERL_UNICODE_MAX)) {
+
+#if defined(HAS_QUAD) && ! defined(EBCDIC)
+ /* U+8000_0000=2**31 requires a 64-bit machine and is not representable in EBCDIC */
+ if (uv > 0x7FFFFFF) {
+ if (flags & (UTF8_WARN_SUPER|UTF8_WARN_FE_FF)) {
+ sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point U+%"UVXf" is not Unicode, and not portable", uv));
+ flags &= ~UTF8_WARN_SUPER; /* Only warn once on this problem */
+ }
+ if (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_FE_FF)) {
+ goto malformed;
+ }
+ }
+#endif
if ((flags & (UTF8_WARN_SUPER|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER) {
sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv));
}
@@ -721,24 +789,21 @@ malformed:
Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
break;
case UTF8_WARN_NON_CONTINUATION:
- if (s == s0)
+ if (curlen == 1)
Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
- (UV)s[1], startbyte);
+ (UV)*s, startbyte);
else {
- const int len = (int)(s-s0);
Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
- (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
+ (UV)*s, (int) curlen, curlen > 1 ? "s" : "", startbyte, (int)expectlen);
}
-
break;
case UTF8_WARN_SHORT:
Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
(int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
- expectlen = curlen; /* distance for caller to skip */
break;
case UTF8_WARN_OVERFLOW:
Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
- ouv, *s, startbyte);
+ ouv, overflow_byte, startbyte);
break;
case UTF8_WARN_LONG:
Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
@@ -760,8 +825,9 @@ malformed:
}
}
- if (retlen)
- *retlen = expectlen ? expectlen : len;
+ if (retlen) {
+ *retlen = curlen;
+ }
return 0;
}
@@ -773,8 +839,11 @@ Returns the native code point of the first character in the string C<s> which
is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
C<retlen> will be set to the length, in bytes, of that character.
-If C<s> does not point to a well-formed UTF-8 character, zero is
-returned and C<retlen> is set, if possible, to -1.
+If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
+enabled, zero is returned and C<retlen> is set, if possible, to -1. If those
+warnings are off, a garbage value is silently returned, and C<retlen> is set,
+if possible, to point to the next byte in C<s> that could be the beginning of a
+legal character.
=cut
*/
@@ -792,16 +861,17 @@ Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
}
/* Like L</utf8_to_uvchr_buf>(), but should only be called when it is known that
- * there are no malformations in the input UTF-8 string C<s>. Currently, some
- * malformations are checked for, but this checking likely will be removed in
- * the future */
+ * there are no malformations in the input UTF-8 string C<s>. surrogates,
+ * non-character code points, and non-Unicode code points are allowed */
UV
Perl_valid_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
{
+ const UV uv = valid_utf8_to_uvuni(s, retlen);
+
PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
- return utf8_to_uvchr_buf(s, s + UTF8_MAXBYTES, retlen);
+ return UNI_TO_NATIVE(uv);
}
/*
@@ -817,8 +887,11 @@ Some, but not all, UTF-8 malformations are detected, and in fact, some
malformed input could cause reading beyond the end of the input buffer, which
is why this function is deprecated. Use L</utf8_to_uvchr_buf> instead.
-If C<s> points to one of the detected malformations, zero is
-returned and C<retlen> is set, if possible, to -1.
+If C<s> points to one of the detected malformations, and UTF8 warnings are
+enabled, zero is returned and C<retlen> is set, if possible, to -1. If those
+warnings are off, a garbage value is silently returned, and C<retlen> is set,
+if possible, to point to the next byte in C<s> that could be the beginning of a
+legal character.
=cut
*/
@@ -828,7 +901,7 @@ Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
{
PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
- return valid_utf8_to_uvchr(s, retlen);
+ return utf8_to_uvchr_buf(s, s + UTF8_MAXBYTES, retlen);
}
/*
@@ -841,8 +914,11 @@ C<retlen> will be set to the length, in bytes, of that character.
This function should only be used when the returned UV is considered
an index into the Unicode semantic tables (e.g. swashes).
-If C<s> does not point to a well-formed UTF-8 character, zero is
-returned and C<retlen> is set, if possible, to -1.
+If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
+enabled, zero is returned and C<retlen> is set, if possible, to -1. If those
+warnings are off, a garbage value is silently returned, and C<retlen> is set,
+if possible, to point to the next byte in C<s> that could be the beginning of a
+legal character.
=cut
*/
@@ -860,16 +936,38 @@ Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
}
/* Like L</utf8_to_uvuni_buf>(), but should only be called when it is known that
- * there are no malformations in the input UTF-8 string C<s>. Currently, some
- * malformations are checked for, but this checking likely will be removed in
- * the future */
+ * there are no malformations in the input UTF-8 string C<s>. surrogates,
+ * non-character code points, and non-Unicode code points are allowed */
UV
Perl_valid_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
{
+ UV expectlen = UTF8SKIP(s);
+ const U8* send = s + expectlen;
+ UV uv = NATIVE_TO_UTF(*s);
+
PERL_ARGS_ASSERT_VALID_UTF8_TO_UVUNI;
- return utf8_to_uvuni_buf(s, s + UTF8_MAXBYTES, retlen);
+ if (retlen) {
+ *retlen = expectlen;
+ }
+
+ /* An invariant is trivially returned */
+ if (expectlen == 1) {
+ return uv;
+ }
+
+ /* Remove the leading bits that indicate the number of bytes, leaving just
+ * the bits that are part of the value */
+ uv &= UTF_START_MASK(expectlen);
+
+ /* Now, loop through the remaining bytes, accumulating each into the
+ * working total as we go */
+ for (++s; s < send; s++) {
+ uv = UTF8_ACCUMULATE(uv, *s);
+ }
+
+ return uv;
}
/*
@@ -885,8 +983,11 @@ Some, but not all, UTF-8 malformations are detected, and in fact, some
malformed input could cause reading beyond the end of the input buffer, which
is why this function is deprecated. Use L</utf8_to_uvuni_buf> instead.
-If C<s> points to one of the detected malformations, zero is
-returned and C<retlen> is set, if possible, to -1.
+If C<s> points to one of the detected malformations, and UTF8 warnings are
+enabled, zero is returned and C<retlen> is set, if possible, to -1. If those
+warnings are off, a garbage value is silently returned, and C<retlen> is set,
+if possible, to point to the next byte in C<s> that could be the beginning of a
+legal character.
=cut
*/
@@ -894,6 +995,7 @@ returned and C<retlen> is set, if possible, to -1.
UV
Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
{
+
PERL_ARGS_ASSERT_UTF8_TO_UVUNI;
return valid_utf8_to_uvuni(s, retlen);
@@ -2234,7 +2336,7 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
if (lenp)
*lenp = len;
- return len ? utf8_to_uvchr(ustrp, 0) : 0;
+ return len ? valid_utf8_to_uvchr(ustrp, 0) : 0;
}
STATIC UV
diff --git a/utf8.h b/utf8.h
index ecabb20c30..a8b81ce946 100644
--- a/utf8.h
+++ b/utf8.h
@@ -61,16 +61,25 @@ START_EXTERN_C
#ifdef DOINIT
EXTCONST unsigned char PL_utf8skip[] = {
-1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ascii */
-1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ascii */
-1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ascii */
-1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ascii */
-1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* bogus */
-1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* bogus */
-2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* scripts */
-3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6, /* cjk etc. */
-7,13, /* Perl extended (not official UTF-8). Up to 72bit allowed (64-bit +
- reserved). */
+/* 0x00 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ascii */
+/* 0x10 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ascii */
+/* 0x20 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ascii */
+/* 0x30 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ascii */
+/* 0x40 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ascii */
+/* 0x50 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ascii */
+/* 0x60 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ascii */
+/* 0x70 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ascii */
+/* 0x80 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* bogus: continuation byte */
+/* 0x90 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* bogus: continuation byte */
+/* 0xA0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* bogus: continuation byte */
+/* 0xB0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* bogus: continuation byte */
+/* 0xC0 */ 2,2, /* overlong */
+/* 0xC2 */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* U+0080 to U+03FF */
+/* 0xD0 */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* U+0400 to U+07FF */
+/* 0xE0 */ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* U+0800 to U+FFFF */
+/* 0xF0 */ 4,4,4,4,4,4,4,4,5,5,5,5,6,6, /* above BMP to 2**31 - 1 */
+/* 0xFE */ 7,13, /* Perl extended (never was official UTF-8). Up to 72bit
+ allowed (64-bit + reserved). */
};
#else
EXTCONST unsigned char PL_utf8skip[];
@@ -111,6 +120,10 @@ END_EXTERN_C
U+10000..U+3FFFF F0 * 90..BF 80..BF 80..BF
U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
+ Below are non-Unicode code points
+ U+110000..U+13FFFF F4 90..8F 80..BF 80..BF
+ U+110000..U+1FFFFF F5..F7 80..8F 80..BF 80..BF
+ U+200000: F8.. * 88..8F 80..BF 80..BF 80..BF
Note the gaps before several of the byte entries above marked by '*'. These are
caused by legal UTF-8 avoiding non-shortest encodings: it is technically
@@ -123,12 +136,12 @@ explicitly forbidden, and the shortest possible encoding should always be used
/*
Another way to look at it, as bits:
- Code Points 1st Byte 2nd Byte 3rd Byte 4th Byte
+ Code Points 1st Byte 2nd Byte 3rd Byte 4th Byte
- 0aaaaaaa 0aaaaaaa
- 00000bbbbbaaaaaa 110bbbbb 10aaaaaa
- ccccbbbbbbaaaaaa 1110cccc 10bbbbbb 10aaaaaa
- 00000dddccccccbbbbbbaaaaaa 11110ddd 10cccccc 10bbbbbb 10aaaaaa
+ 0aaa aaaa 0aaa aaaa
+ 0000 0bbb bbaa aaaa 110b bbbb 10aa aaaa
+ cccc bbbb bbaa aaaa 1110 cccc 10bb bbbb 10aa aaaa
+ 00 000d ddcc cccc bbbb bbaa aaaa 1111 0ddd 10cc cccc 10bb bbbb 10aa aaaa
As you can see, the continuation bytes all begin with C<10>, and the
leading bits of the start byte tell how many bytes there are in the
@@ -140,7 +153,6 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
#define UNI_IS_INVARIANT(c) (((UV)c) < 0x80)
#define UTF8_IS_START(c) (((U8)c) >= 0xc2)
-#define LAX_UTF8_IS_START(c) (((U8)c) >= 0xc0) /* Allows overlong */
#define UTF8_IS_CONTINUATION(c) (((U8)c) >= 0x80 && (((U8)c) <= 0xbf))
#define UTF8_IS_CONTINUED(c) (((U8)c) & 0x80)
@@ -148,6 +160,9 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
#define UTF8_IS_DOWNGRADEABLE_START(c) (((U8)c & 0xfe) == 0xc2)
#define UTF_START_MARK(len) (((len) > 7) ? 0xFF : (0xFE << (7-(len))))
+
+/* Masks out the initial one bits in a start byte, leaving the real data ones.
+ * Doesn't work on an invariant byte */
#define UTF_START_MASK(len) (((len) >= 7) ? 0x00 : (0x1F >> ((len)-2)))
#define UTF_CONTINUATION_MARK 0x80
@@ -471,6 +486,8 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
#define SHARP_S_SKIP 2
#ifndef EBCDIC
+/* If you want to exclude surrogates, and beyond legal Unicode, see the blame
+ * log for earlier versions which gave details for these */
# define IS_UTF8_CHAR_1(p) \
((p)[0] <= 0x7F)
# define IS_UTF8_CHAR_2(p) \
@@ -481,18 +498,7 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
(p)[1] >= 0xA0 && (p)[1] <= 0xBF && \
(p)[2] >= 0x80 && (p)[2] <= 0xBF)
# define IS_UTF8_CHAR_3b(p) \
- ((p)[0] >= 0xE1 && (p)[0] <= 0xEC && \
- (p)[1] >= 0x80 && (p)[1] <= 0xBF && \
- (p)[2] >= 0x80 && (p)[2] <= 0xBF)
-# define IS_UTF8_CHAR_3c(p) \
- ((p)[0] == 0xED && \
- (p)[1] >= 0x80 && (p)[1] <= 0xBF && \
- (p)[2] >= 0x80 && (p)[2] <= 0xBF)
- /* In IS_UTF8_CHAR_3c(p) one could use
- * (p)[1] >= 0x80 && (p)[1] <= 0x9F
- * if one wanted to exclude surrogates. */
-# define IS_UTF8_CHAR_3d(p) \
- ((p)[0] >= 0xEE && (p)[0] <= 0xEF && \
+ ((p)[0] >= 0xE1 && (p)[0] <= 0xEF && \
(p)[1] >= 0x80 && (p)[1] <= 0xBF && \
(p)[2] >= 0x80 && (p)[2] <= 0xBF)
# define IS_UTF8_CHAR_4a(p) \
@@ -500,34 +506,23 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
(p)[1] >= 0x90 && (p)[1] <= 0xBF && \
(p)[2] >= 0x80 && (p)[2] <= 0xBF && \
(p)[3] >= 0x80 && (p)[3] <= 0xBF)
-# define IS_UTF8_CHAR_4b(p) \
- ((p)[0] >= 0xF1 && (p)[0] <= 0xF3 && \
- (p)[1] >= 0x80 && (p)[1] <= 0xBF && \
- (p)[2] >= 0x80 && (p)[2] <= 0xBF && \
- (p)[3] >= 0x80 && (p)[3] <= 0xBF)
-/* In IS_UTF8_CHAR_4c(p) one could use
- * (p)[0] == 0xF4
- * if one wanted to stop at the Unicode limit U+10FFFF.
- * The 0xF7 allows us to go to 0x1fffff (0x200000 would
+/* The 0xF7 allows us to go to 0x1fffff (0x200000 would
* require five bytes). Not doing any further code points
* since that is not needed (and that would not be strict
* UTF-8, anyway). The "slow path" in Perl_is_utf8_char()
* will take care of the "extended UTF-8". */
-# define IS_UTF8_CHAR_4c(p) \
- ((p)[0] >= 0xF4 && (p)[0] <= 0xF7 && \
+# define IS_UTF8_CHAR_4b(p) \
+ ((p)[0] >= 0xF1 && (p)[0] <= 0xF7 && \
(p)[1] >= 0x80 && (p)[1] <= 0xBF && \
(p)[2] >= 0x80 && (p)[2] <= 0xBF && \
(p)[3] >= 0x80 && (p)[3] <= 0xBF)
# define IS_UTF8_CHAR_3(p) \
(IS_UTF8_CHAR_3a(p) || \
- IS_UTF8_CHAR_3b(p) || \
- IS_UTF8_CHAR_3c(p) || \
- IS_UTF8_CHAR_3d(p))
+ IS_UTF8_CHAR_3b(p))
# define IS_UTF8_CHAR_4(p) \
(IS_UTF8_CHAR_4a(p) || \
- IS_UTF8_CHAR_4b(p) || \
- IS_UTF8_CHAR_4c(p))
+ IS_UTF8_CHAR_4b(p))
/* IS_UTF8_CHAR(p) is strictly speaking wrong (not UTF-8) because it
* (1) allows UTF-8 encoded UTF-16 surrogates