summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Jarkko Hietaniemi <jhi@iki.fi> 2001-12-18 15:24:50 +0000
committer: Jarkko Hietaniemi <jhi@iki.fi> 2001-12-18 15:24:50 +0000
commit: 097fb8e2acde8522bd4ee4e5e00d3d2b810e2e56 (patch)
tree: 120af15df280bf1c52905c88cd22cd17a10fcd88
parent: 8218ea5d24f5de0a37393f19671aa5631c088be7 (diff)
download: perl-097fb8e2acde8522bd4ee4e5e00d3d2b810e2e56.tar.gz
Make the utf8 malformedness messages more verbose.
p4raw-id: //depot/perl@13757
-rw-r--r-- lib/utf8.t 2
-rw-r--r-- utf8.c 25
2 files changed, 17 insertions, 10 deletions
diff --git a/lib/utf8.t b/lib/utf8.t
index ee3c258cf5..aaa06853f1 100644
--- a/lib/utf8.t
+++ b/lib/utf8.t
@@ -159,7 +159,7 @@ plan tests => 94;
use utf8; %a = ("\xE1\xA0"=>"sterling");
print 'start'; printf '%x,', ord \$_ foreach keys %a; print "end\n";
BANG
- qr/^Malformed UTF-8 character \(2 bytes, need 3\).*start\d+,end$/s
+ qr/^Malformed UTF-8 character \(2 bytes, need 3.+\).*start\d+,end$/s
],
);
foreach (@tests) {
diff --git a/utf8.c b/utf8.c
index af365927e2..4ca7b1c1f0 100644
--- a/utf8.c
+++ b/utf8.c
@@ -251,9 +251,11 @@ Most code should use utf8_to_uvchr() rather than call this directly.
UV
Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
{
+ U8 *s0 = s;
UV uv = *s, ouv = 0;
STRLEN len = 1;
bool dowarn = ckWARN_d(WARN_UTF8);
+ U8 startbyte = *s;
STRLEN expectlen = 0;
U32 warning = 0;
@@ -396,23 +398,28 @@ malformed:
Perl_sv_catpvf(aTHX_ sv, "(empty string)");
break;
case UTF8_WARN_CONTINUATION:
- Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf")", uv);
+ Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
break;
case UTF8_WARN_NON_CONTINUATION:
- Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf" after start byte 0x%02"UVxf")",
- (UV)s[1], uv);
+ if (s == s0)
+ Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
+ (UV)s[1], startbyte);
+ else
+ Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
+ (UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, expectlen);
+
break;
case UTF8_WARN_FE_FF:
Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
break;
case UTF8_WARN_SHORT:
- Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
- curlen, curlen == 1 ? "" : "s", expectlen);
+ Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
+ curlen, curlen == 1 ? "" : "s", expectlen, startbyte);
expectlen = curlen; /* distance for caller to skip */
break;
case UTF8_WARN_OVERFLOW:
- Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x)",
- ouv, *s);
+ Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
+ ouv, *s, startbyte);
break;
case UTF8_WARN_SURROGATE:
Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
@@ -421,8 +428,8 @@ malformed:
Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
break;
case UTF8_WARN_LONG:
- Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
- expectlen, expectlen == 1 ? "": "s", UNISKIP(uv));
+ Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
+ expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
break;
case UTF8_WARN_FFFF:
Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);