diff options
-rw-r--r-- | doio.c | 6 | ||||
-rw-r--r-- | embed.fnc | 1 | ||||
-rw-r--r-- | embed.h | 1 | ||||
-rw-r--r-- | pod/perldiag.pod | 15 | ||||
-rw-r--r-- | proto.h | 5 | ||||
-rw-r--r-- | utf8.c | 48 |
6 files changed, 76 insertions, 0 deletions
@@ -1224,6 +1224,9 @@ Perl_do_print(pTHX_ register SV *sv, PerlIO *fp) tmpbuf = bytes_to_utf8((const U8*) tmps, &len); tmps = (char *) tmpbuf; } + else if (ckWARN_d(WARN_UTF8)) { + (void) check_utf8_print((const U8*) tmps, len); + } } else if (DO_UTF8(sv)) { STRLEN tmplen = len; @@ -1240,6 +1243,9 @@ Perl_do_print(pTHX_ register SV *sv, PerlIO *fp) "Wide character in %s", PL_op ? OP_DESC(PL_op) : "print" ); + /* Could also check that isn't one of the things to avoid + * in utf8 by using check_utf8_print(), but not doing so, + * since the stream isn't a UTF8 stream */ } } /* To detect whether the process is about to overstep its @@ -1314,6 +1314,7 @@ ApMd |U8* |bytes_from_utf8|NN const U8 *s|NN STRLEN *len|NULLOK bool *is_utf8 ApMd |U8* |bytes_to_utf8 |NN const U8 *s|NN STRLEN *len Apd |UV |utf8_to_uvchr |NN const U8 *s|NULLOK STRLEN *retlen Apd |UV |utf8_to_uvuni |NN const U8 *s|NULLOK STRLEN *retlen +pMd |bool |check_utf8_print |NN const U8 *s|const STRLEN len #ifdef EBCDIC Adp |UV |utf8n_to_uvchr |NN const U8 *s|STRLEN curlen|NULLOK STRLEN *retlen|U32 flags @@ -920,6 +920,7 @@ #define boot_core_UNIVERSAL() Perl_boot_core_UNIVERSAL(aTHX) #define boot_core_mro() Perl_boot_core_mro(aTHX) #define cando(a,b,c) Perl_cando(aTHX_ a,b,c) +#define check_utf8_print(a,b) Perl_check_utf8_print(aTHX_ a,b) #define ck_anoncode(a) Perl_ck_anoncode(aTHX_ a) #define ck_bitop(a) Perl_ck_bitop(aTHX_ a) #define ck_chdir(a) Perl_ck_chdir(aTHX_ a) diff --git a/pod/perldiag.pod b/pod/perldiag.pod index 09dace1cf7..c88df90590 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -1358,6 +1358,17 @@ This subroutine cannot be called. (F) You had a (sub-)template that ends with a '/'. There must be another template code following the slash. See L<perlfunc/pack>. +=item Code point 0x%X is not Unicode, may not be portable + +(W utf8) You had a code point above the Unicode maximum of U+10FFFF. + +Perl allows strings to contain a superset of Unicode code +points, up to the limit of what is storable in an unsigned integer on +your system, but these may not be accepted by other languages/systems. +At one time, it was legal in some standards to have code points up to +0x7FFF_FFFF, but not higher. Code points above 0xFFFF_FFFF require +larger than a 32 bit word. + =item %s: Command not found (A) You've accidentally run your script through B<csh> instead of Perl. @@ -4619,6 +4630,8 @@ representative, who probably put it there in the first place. =item Unicode non-character 0x%x is illegal for interchange +=item Unicode non-character U+%X is illegal for open interchange + (W utf8) Certain codepoints, such as U+FFFE and U+FFFF, are defined by the Unicode standard to be non-characters. Those are legal codepoints, but are reserved for internal use; so, applications shouldn't attempt to exchange @@ -5193,6 +5206,8 @@ exceeded. In the message, the characters in the sequence are separated by dots, and each is shown by its ordinal in hex. Anything to the left of the C<HERE> was retained; anything to the right was discarded. +=item Unicode surrogate U+%X is illegal in UTF-8 + =item UTF-16 surrogate 0x%x (W utf8) You tried to generate half of a UTF-16 surrogate by @@ -255,6 +255,11 @@ PERL_CALLCONV U32 Perl_cast_ulong(pTHX_ NV f) PERL_CALLCONV UV Perl_cast_uv(pTHX_ NV f) __attribute__warn_unused_result__; +PERL_CALLCONV bool Perl_check_utf8_print(pTHX_ const U8 *s, const STRLEN len) + __attribute__nonnull__(pTHX_1); +#define PERL_ARGS_ASSERT_CHECK_UTF8_PRINT \ + assert(s) + PERL_CALLCONV OP * Perl_ck_anoncode(pTHX_ OP *o) __attribute__warn_unused_result__ __attribute__nonnull__(pTHX_1); @@ -2649,6 +2649,54 @@ U32 flags) return UNI_TO_NATIVE(uv); } +bool +Perl_check_utf8_print(pTHX_ register const U8* s, const STRLEN len) +{ + /* May change: warns if surrogates, non-character code points, or + * non-Unicode code points are in s which has length len. Returns TRUE if + * none found; FALSE otherwise. The only other validity check is to make + * sure that this won't exceed the string's length */ + + const U8* const e = s + len; + bool ok = TRUE; + + PERL_ARGS_ASSERT_CHECK_UTF8_PRINT; + + while (s < e) { + if (UTF8SKIP(s) > len) { + Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), + "%s in %s", unees, PL_op ? OP_DESC(PL_op) : "print"); + return FALSE; + } + if (*s >= UTF8_FIRST_PROBLEMATIC_CODE_POINT_FIRST_BYTE) { + STRLEN char_len; + if (UTF8_IS_SUPER(s)) { + UV uv = utf8_to_uvchr(s, &char_len); + Perl_warner(aTHX_ packWARN(WARN_UTF8), + "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv); + ok = FALSE; + } + else if (UTF8_IS_SURROGATE(s)) { + UV uv = utf8_to_uvchr(s, &char_len); + Perl_warner(aTHX_ packWARN(WARN_UTF8), + "Unicode surrogate U+%04"UVXf" is illegal in UTF-8", uv); + ok = FALSE; + } + else if + (UTF8_IS_NONCHAR_GIVEN_THAT_NON_SUPER_AND_GE_PROBLEMATIC(s)) + { + UV uv = utf8_to_uvchr(s, &char_len); + Perl_warner(aTHX_ packWARN(WARN_UTF8), + "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv); + ok = FALSE; + } + } + s += UTF8SKIP(s); + } + + return ok; +} + /* =for apidoc pv_uni_display |