diff options
-rw-r--r-- | pod/perldiag.pod | 7 | ||||
-rw-r--r-- | t/lib/warnings/utf8 | 3 | ||||
-rw-r--r-- | utf8.c | 94 |
3 files changed, 57 insertions, 47 deletions
diff --git a/pod/perldiag.pod b/pod/perldiag.pod index a031b245ff..966ecdc406 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -4394,8 +4394,11 @@ representative, who probably put it there in the first place. (W utf8) Certain codepoints, such as U+FFFE and U+FFFF, are defined by the Unicode standard to be non-characters. Those are legal codepoints, but are reserved for internal use; so, applications shouldn't attempt to exchange -them. If you know what you are doing you can turn off this warning by -C<no warnings 'utf8';>. +them. In some cases, this message is also given if you use a codepoint that +isn't in Unicode--that is it is above the legal maximum of U+10FFFF. These +aren't legal at all in Unicode, so they are illegal for interchange, but can be +used internally in a Perl program. If you know what you are doing you can turn +off this warning by C<no warnings 'utf8';>. =item Unknown BYTEORDER diff --git a/t/lib/warnings/utf8 b/t/lib/warnings/utf8 index be49ae9fe5..fe1cbf0133 100644 --- a/t/lib/warnings/utf8 +++ b/t/lib/warnings/utf8 @@ -114,6 +114,7 @@ my $hex4 = "\x{10000}"; my $hex5 = "\x{100000}"; my $maxm1 = "\x{10FFFE}"; my $max = "\x{10FFFF}"; +uc($ffff); no warnings 'utf8'; my $d7ff = "\x{D7FF}"; my $d800 = "\x{D800}"; @@ -127,6 +128,7 @@ my $hex4 = "\x{10000}"; my $hex5 = "\x{100000}"; my $maxm1 = "\x{10FFFE}"; my $max = "\x{10FFFF}"; +uc($ffff); EXPECT UTF-16 surrogate 0xd800 at - line 3. UTF-16 surrogate 0xdfff at - line 4. @@ -134,3 +136,4 @@ Unicode non-character 0xfffe is illegal for interchange at - line 8. Unicode non-character 0xffff is illegal for interchange at - line 9. Unicode non-character 0x10fffe is illegal for interchange at - line 12. Unicode non-character 0x10ffff is illegal for interchange at - line 13. +Unicode non-character 0xffff is illegal for interchange in uc at - line 14. @@ -454,10 +454,11 @@ Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) const UV startbyte = *s; STRLEN expectlen = 0; U32 warning = 0; + SV* sv; PERL_ARGS_ASSERT_UTF8N_TO_UVUNI; -/* This list is a superset of the UTF8_ALLOW_XXX. */ +/* This list is a superset of the UTF8_ALLOW_XXX. BUT it isn't, eg SUPER missing XXX */ #define UTF8_WARN_EMPTY 1 #define UTF8_WARN_CONTINUATION 2 @@ -583,52 +584,55 @@ malformed: } if (dowarn) { - SV* const sv = newSVpvs_flags("Malformed UTF-8 character ", SVs_TEMP); + if (warning == UTF8_WARN_FFFF) { + sv = newSVpvs_flags("Unicode non-character ", SVs_TEMP); + Perl_sv_catpvf(aTHX_ sv, "0x%04"UVxf" is illegal for interchange", uv); + } + else { + sv = newSVpvs_flags("Malformed UTF-8 character ", SVs_TEMP); + + switch (warning) { + case 0: /* Intentionally empty. */ break; + case UTF8_WARN_EMPTY: + sv_catpvs(sv, "(empty string)"); + break; + case UTF8_WARN_CONTINUATION: + Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv); + break; + case UTF8_WARN_NON_CONTINUATION: + if (s == s0) + Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")", + (UV)s[1], startbyte); + else { + const int len = (int)(s-s0); + Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)", + (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen); + } - switch (warning) { - case 0: /* Intentionally empty. */ break; - case UTF8_WARN_EMPTY: - sv_catpvs(sv, "(empty string)"); - break; - case UTF8_WARN_CONTINUATION: - Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv); - break; - case UTF8_WARN_NON_CONTINUATION: - if (s == s0) - Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")", - (UV)s[1], startbyte); - else { - const int len = (int)(s-s0); - Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)", - (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen); + break; + case UTF8_WARN_FE_FF: + Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv); + break; + case UTF8_WARN_SHORT: + Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")", + (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte); + expectlen = curlen; /* distance for caller to skip */ + break; + case UTF8_WARN_OVERFLOW: + Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")", + ouv, *s, startbyte); + break; + case UTF8_WARN_SURROGATE: + Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv); + break; + case UTF8_WARN_LONG: + Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")", + (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte); + break; + default: + sv_catpvs(sv, "(unknown reason)"); + break; } - - break; - case UTF8_WARN_FE_FF: - Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv); - break; - case UTF8_WARN_SHORT: - Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")", - (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte); - expectlen = curlen; /* distance for caller to skip */ - break; - case UTF8_WARN_OVERFLOW: - Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")", - ouv, *s, startbyte); - break; - case UTF8_WARN_SURROGATE: - Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv); - break; - case UTF8_WARN_LONG: - Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")", - (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte); - break; - case UTF8_WARN_FFFF: - Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv); - break; - default: - sv_catpvs(sv, "(unknown reason)"); - break; } if (warning) { |