summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--doop.c18
-rw-r--r--pp.c11
-rw-r--r--sv.c17
-rw-r--r--toke.c2
-rw-r--r--utf8.c144
-rw-r--r--utf8.h22
6 files changed, 131 insertions, 83 deletions
diff --git a/doop.c b/doop.c
index 868be2266b..f6dbe67a6b 100644
--- a/doop.c
+++ b/doop.c
@@ -833,15 +833,15 @@ Perl_do_chop(pTHX_ register SV *astr, register SV *sv)
char *send = s + len;
char *start = s;
s = send - 1;
- while ((*s & 0xc0) == 0x80)
- --s;
- if (UTF8SKIP(s) != send - s && ckWARN_d(WARN_UTF8))
- Perl_warner(aTHX_ WARN_UTF8, "Malformed UTF-8 character");
- sv_setpvn(astr, s, send - s);
- *s = '\0';
- SvCUR_set(sv, s - start);
- SvNIOK_off(sv);
- SvUTF8_on(astr);
+ while (s > start && UTF8_IS_CONTINUATION(*s))
+ s--;
+ if (utf8_to_uv_simple((U8*)s, 0)) {
+ sv_setpvn(astr, s, send - s);
+ *s = '\0';
+ SvCUR_set(sv, s - start);
+ SvNIOK_off(sv);
+ SvUTF8_on(astr);
+ }
}
else
sv_setpvn(astr, "", 0);
diff --git a/pp.c b/pp.c
index 950d85a459..1ea2a07566 100644
--- a/pp.c
+++ b/pp.c
@@ -3978,20 +3978,17 @@ PP(pp_reverse)
U8* s = (U8*)SvPVX(TARG);
U8* send = (U8*)(s + len);
while (s < send) {
- if (*s < 0x80) {
+ if (UTF8_IS_ASCII(*s)) {
s++;
continue;
}
else {
+ if (!utf8_to_uv_simple(s, 0))
+ break;
up = (char*)s;
s += UTF8SKIP(s);
down = (char*)(s - 1);
- if (s > send || !((*down & 0xc0) == 0x80)) {
- if (ckWARN_d(WARN_UTF8))
- Perl_warner(aTHX_ WARN_UTF8,
- "Malformed UTF-8 character");
- break;
- }
+ /* reverse this character */
while (down > up) {
tmp = *up;
*up++ = *down;
diff --git a/sv.c b/sv.c
index b999e6c322..139d98acca 100644
--- a/sv.c
+++ b/sv.c
@@ -4606,17 +4606,18 @@ Perl_sv_pos_b2u(pTHX_ register SV *sv, I32* offsetp)
s = (U8*)SvPV(sv, len);
if (len < *offsetp)
- Perl_croak(aTHX_ "panic: bad byte offset");
+ Perl_croak(aTHX_ "panic: sv_pos_b2u: bad byte offset");
send = s + *offsetp;
len = 0;
while (s < send) {
- s += UTF8SKIP(s);
- ++len;
- }
- if (s != send) {
- if (ckWARN_d(WARN_UTF8))
- Perl_warner(aTHX_ WARN_UTF8, "Malformed UTF-8 character");
- --len;
+ STRLEN n;
+
+ if (utf8_to_uv(s, UTF8SKIP(s), &n, 0)) {
+ s += n;
+ len++;
+ }
+ else
+ break;
}
*offsetp = len;
return;
diff --git a/toke.c b/toke.c
index a085c702cb..018e235031 100644
--- a/toke.c
+++ b/toke.c
@@ -1551,7 +1551,7 @@ S_scan_const(pTHX_ char *start)
STRLEN len = (STRLEN) -1;
UV uv;
if (this_utf8) {
- uv = utf8_to_uv((U8*)s, send - s, &len, UTF8_CHECK_ONLY);
+ uv = utf8_to_uv((U8*)s, send - s, &len, 0);
}
if (len == (STRLEN)-1) {
/* Illegal UTF8 (a high-bit byte), make it valid. */
diff --git a/utf8.c b/utf8.c
index d1f1d6631f..83e91fcc50 100644
--- a/utf8.c
+++ b/utf8.c
@@ -213,11 +213,24 @@ Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
bool dowarn = ckWARN_d(WARN_UTF8);
#endif
STRLEN expectlen = 0;
-
- if (curlen == 0) {
- if (dowarn)
- Perl_warner(aTHX_ WARN_UTF8,
- "Malformed UTF-8 character (an empty string)");
+ U32 warning = 0;
+
+/* This list is a superset of the UTF8_ALLOW_XXX. */
+
+#define UTF8_WARN_EMPTY 1
+#define UTF8_WARN_CONTINUATION 2
+#define UTF8_WARN_NON_CONTINUATION 3
+#define UTF8_WARN_FE_FF 4
+#define UTF8_WARN_SHORT 5
+#define UTF8_WARN_OVERFLOW 6
+#define UTF8_WARN_SURROGATE 7
+#define UTF8_WARN_BOM 8
+#define UTF8_WARN_LONG 9
+#define UTF8_WARN_FFFF 10
+
+ if (curlen == 0 &&
+ !(flags & UTF8_ALLOW_EMPTY)) {
+ warning = UTF8_WARN_EMPTY;
goto malformed;
}
@@ -229,28 +242,19 @@ Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
if (UTF8_IS_CONTINUATION(uv) &&
!(flags & UTF8_ALLOW_CONTINUATION)) {
- if (dowarn)
- Perl_warner(aTHX_ WARN_UTF8,
- "Malformed UTF-8 character (unexpected continuation byte 0x%02"UVxf")",
- uv);
+ warning = UTF8_WARN_CONTINUATION;
goto malformed;
}
if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
- if (dowarn)
- Perl_warner(aTHX_ WARN_UTF8,
- "Malformed UTF-8 character (unexpected non-continuation byte 0x%02"UVxf" after start byte 0x%02"UVxf")",
- (UV)s[1], uv);
+ warning = UTF8_WARN_NON_CONTINUATION;
goto malformed;
}
if ((uv == 0xfe || uv == 0xff) &&
!(flags & UTF8_ALLOW_FE_FF)) {
- if (dowarn)
- Perl_warner(aTHX_ WARN_UTF8,
- "Malformed UTF-8 character (byte 0x%02"UVxf")",
- uv);
+ warning = UTF8_WARN_FE_FF;
goto malformed;
}
@@ -269,10 +273,7 @@ Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
if ((curlen < expectlen) &&
!(flags & UTF8_ALLOW_SHORT)) {
- if (dowarn)
- Perl_warner(aTHX_ WARN_UTF8,
- "Malformed UTF-8 character (%d byte%s, need %d)",
- curlen, curlen == 1 ? "" : "s", expectlen);
+ warning = UTF8_WARN_SHORT;
goto malformed;
}
@@ -283,21 +284,25 @@ Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
while (len--) {
if (!UTF8_IS_CONTINUATION(*s) &&
!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
- if (dowarn)
- Perl_warner(aTHX_ WARN_UTF8,
- "Malformed UTF-8 character (unexpected non-continuation byte 0x%02x)",
- *s);
+ s--;
+ warning = UTF8_WARN_NON_CONTINUATION;
goto malformed;
}
else
uv = UTF8_ACCUMULATE(uv, *s);
- if (uv < ouv) {
- /* This cannot be allowed. */
- if (dowarn)
- Perl_warner(aTHX_ WARN_UTF8,
- "Malformed UTF-8 character (overflow at 0x%"UVxf", byte 0x%02x)",
- ouv, *s);
- goto malformed;
+ if (!(uv > ouv)) {
+ /* These cannot be allowed. */
+ if (uv == ouv) {
+ if (!(flags & UTF8_ALLOW_LONG)) {
+ warning = UTF8_WARN_LONG;
+ goto malformed;
+ }
+ }
+ else { /* uv < ouv */
+ /* This cannot be allowed. */
+ warning = UTF8_WARN_OVERFLOW;
+ goto malformed;
+ }
}
s++;
ouv = uv;
@@ -305,31 +310,19 @@ Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
if (UNICODE_IS_SURROGATE(uv) &&
!(flags & UTF8_ALLOW_SURROGATE)) {
- if (dowarn)
- Perl_warner(aTHX_ WARN_UTF8,
- "Malformed UTF-8 character (UTF-16 surrogate 0x%04"UVxf")",
- uv);
+ warning = UTF8_WARN_SURROGATE;
goto malformed;
} else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
!(flags & UTF8_ALLOW_BOM)) {
- if (dowarn)
- Perl_warner(aTHX_ WARN_UTF8,
- "Malformed UTF-8 character (byte order mark 0x%04"UVxf")",
- uv);
+ warning = UTF8_WARN_BOM;
goto malformed;
} else if ((expectlen > UNISKIP(uv)) &&
!(flags & UTF8_ALLOW_LONG)) {
- if (dowarn)
- Perl_warner(aTHX_ WARN_UTF8,
- "Malformed UTF-8 character (%d byte%s, need %d)",
- expectlen, expectlen == 1 ? "": "s", UNISKIP(uv));
+ warning = UTF8_WARN_LONG;
goto malformed;
} else if (UNICODE_IS_ILLEGAL(uv) &&
!(flags & UTF8_ALLOW_FFFF)) {
- if (dowarn)
- Perl_warner(aTHX_ WARN_UTF8,
- "Malformed UTF-8 character (character 0x%04"UVxf")",
- uv);
+ warning = UTF8_WARN_FFFF;
goto malformed;
}
@@ -343,6 +336,61 @@ malformed:
return 0;
}
+ if (dowarn) {
+ SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
+
+ switch (warning) {
+ case 0: /* Intentionally empty. */ break;
+ case UTF8_WARN_EMPTY:
+ Perl_sv_catpvf(aTHX_ sv, "(empty string)");
+ break;
+ case UTF8_WARN_CONTINUATION:
+ Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf")", uv);
+ break;
+ case UTF8_WARN_NON_CONTINUATION:
+ Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf" after start byte 0x%02"UVxf")",
+ (UV)s[1], uv);
+ break;
+ case UTF8_WARN_FE_FF:
+ Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
+ break;
+ case UTF8_WARN_SHORT:
+ Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
+ curlen, curlen == 1 ? "" : "s", expectlen);
+ break;
+ case UTF8_WARN_OVERFLOW:
+ Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x)",
+ ouv, *s);
+ break;
+ case UTF8_WARN_SURROGATE:
+ Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
+ break;
+ case UTF8_WARN_BOM:
+ Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
+ break;
+ case UTF8_WARN_LONG:
+ Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
+ expectlen, expectlen == 1 ? "": "s", UNISKIP(uv));
+ break;
+ case UTF8_WARN_FFFF:
+ Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
+ break;
+ default:
+ Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
+ break;
+ }
+
+ if (warning) {
+ char *s = SvPVX(sv);
+
+ if (PL_op)
+ Perl_warner(aTHX_ WARN_UTF8,
+ "%s in %s", s, PL_op_desc[PL_op->op_type]);
+ else
+ Perl_warner(aTHX_ WARN_UTF8, "%s", s);
+ }
+ }
+
if (retlen)
*retlen = expectlen ? expectlen : len;
diff --git a/utf8.h b/utf8.h
index 9061ad55c1..28aa057e36 100644
--- a/utf8.h
+++ b/utf8.h
@@ -33,16 +33,18 @@ END_EXTERN_C
#define IN_BYTE (PL_curcop->op_private & HINT_BYTE)
#define DO_UTF8(sv) (SvUTF8(sv) && !IN_BYTE)
-#define UTF8_ALLOW_CONTINUATION 0x0001
-#define UTF8_ALLOW_NON_CONTINUATION 0x0002
-#define UTF8_ALLOW_FE_FF 0x0004
-#define UTF8_ALLOW_SHORT 0x0008
-#define UTF8_ALLOW_SURROGATE 0x0010
-#define UTF8_ALLOW_BOM 0x0020
-#define UTF8_ALLOW_FFFF 0x0040
-#define UTF8_ALLOW_LONG 0x0080
-#define UTF8_ALLOW_ANYUV (UTF8_ALLOW_FE_FF|UTF8_ALLOW_FFFF \
- |UTF8_ALLOW_BOM|UTF8_ALLOW_SURROGATE)
+#define UTF8_ALLOW_EMPTY 0x0001
+#define UTF8_ALLOW_CONTINUATION 0x0002
+#define UTF8_ALLOW_NON_CONTINUATION 0x0004
+#define UTF8_ALLOW_FE_FF 0x0008
+#define UTF8_ALLOW_SHORT 0x0010
+#define UTF8_ALLOW_SURROGATE 0x0020
+#define UTF8_ALLOW_BOM 0x0040
+#define UTF8_ALLOW_FFFF 0x0080
+#define UTF8_ALLOW_LONG 0x0100
+#define UTF8_ALLOW_ANYUV (UTF8_ALLOW_EMPTY|UTF8_ALLOW_FE_FF|\
+ UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|\
+ UTF8_ALLOW_FFFF|UTF8_ALLOW_LONG)
#define UTF8_ALLOW_ANY 0x00ff
#define UTF8_CHECK_ONLY 0x0100