diff options
author | Karl Williamson <public@khwilliamson.com> | 2014-01-01 20:08:02 -0700 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2014-01-01 20:29:37 -0700 |
commit | ea5ced44af8b967bfce3763b11ba4714d4fcd154 (patch) | |
tree | f6dbae186171fc57cfb675765a07ca8b2bc2eb9f /utf8.c | |
parent | 23dfa30831199ff0adfa3c42488e59e3df455e2c (diff) | |
download | perl-ea5ced44af8b967bfce3763b11ba4714d4fcd154.tar.gz |
Change some warnings in utf8n_to_uvchr()
This bottom-level function decodes the first character of a UTF-8 string
into a code point. Using it directly is discouraged. This
commit cleans up some of the warnings it can raise. Now, tests for
malformations are done before any tests for other potential issues. One
of those issues involves code points so large that they have never
appeared in any official standard (the current standard has scaled back
the highest acceptable code point from earlier versions). It is
possible (though not done in CPAN) to warn about and/or forbid these code
points, while accepting smaller code points that are still above the
legal Unicode maximum. The warning message for this now includes the
code point if representable on the machine. Previously it always
displayed raw bytes, which is what it still does for non-representable
code points.
Diffstat (limited to 'utf8.c')
-rw-r--r-- | utf8.c | 52 |
1 files changed, 26 insertions, 26 deletions
@@ -778,32 +778,8 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) } } -#ifndef EBCDIC /* EBCDIC allows FE, FF, can't overflow */ - if ((*s0 & 0xFE) == 0xFE /* matches both FE, FF */ - && (flags & (UTF8_WARN_FE_FF|UTF8_DISALLOW_FE_FF))) - { - /* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary - * generation of the sv, since no warnings are raised under CHECK */ - if ((flags & (UTF8_WARN_FE_FF|UTF8_CHECK_ONLY)) == UTF8_WARN_FE_FF - && ckWARN_d(WARN_UTF8)) - { - /* This message is deliberately not of the same syntax as the other - * messages for malformations, for backwards compatibility in the - * unlikely event that code is relying on its precise earlier text - */ - sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s Code point beginning with byte 0x%02X is not Unicode, and not portable", malformed_text, *s0)); - pack_warn = packWARN(WARN_UTF8); - } - if (flags & UTF8_DISALLOW_FE_FF) { - goto malformed; - } - } +#ifndef EBCDIC /* EBCDIC can't overflow */ if (UNLIKELY(overflowed)) { - - /* If the first byte is FF, it will overflow a 32-bit word. If the - * first byte is FE, it will overflow a signed 32-bit word. 
The - * above preserves backward compatibility, since its message was used - * in earlier versions of this code in preference to overflow */ sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (overflow at byte 0x%02x, after start byte 0x%02x)", malformed_text, overflow_byte, *s0)); goto malformed; } @@ -830,6 +806,9 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) |UTF8_WARN_ILLEGAL_INTERCHANGE))) { if (UNICODE_IS_SURROGATE(uv)) { + + /* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary + * generation of the sv, since no warnings are raised under CHECK */ if ((flags & (UTF8_WARN_SURROGATE|UTF8_CHECK_ONLY)) == UTF8_WARN_SURROGATE && ckWARN_d(WARN_SURROGATE)) { @@ -842,11 +821,32 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) } else if ((uv > PERL_UNICODE_MAX)) { if ((flags & (UTF8_WARN_SUPER|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER - && ckWARN_d(WARN_NON_UNICODE)) + && ckWARN_d(WARN_NON_UNICODE)) { sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv)); pack_warn = packWARN(WARN_NON_UNICODE); } +#ifndef EBCDIC /* EBCDIC always allows FE, FF */ + + /* The first byte being 0xFE or 0xFF is a subset of the SUPER code + * points. We test for these after the regular SUPER ones, and + * before possibly bailing out, so that the more dire warning + * overrides the regular one, if applicable */ + if ((*s0 & 0xFE) == 0xFE /* matches both FE, FF */ + && (flags & (UTF8_WARN_FE_FF|UTF8_DISALLOW_FE_FF))) + { + if ((flags & (UTF8_WARN_FE_FF|UTF8_CHECK_ONLY)) + == UTF8_WARN_FE_FF + && ckWARN_d(WARN_UTF8)) + { + sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%"UVXf" is not Unicode, and not portable", uv)); + pack_warn = packWARN(WARN_UTF8); + } + if (flags & UTF8_DISALLOW_FE_FF) { + goto disallowed; + } + } +#endif if (flags & UTF8_DISALLOW_SUPER) { goto disallowed; } |