summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--doio.c6
-rw-r--r--embed.fnc1
-rw-r--r--embed.h1
-rw-r--r--pod/perldiag.pod15
-rw-r--r--proto.h5
-rw-r--r--utf8.c48
6 files changed, 76 insertions, 0 deletions
diff --git a/doio.c b/doio.c
index c23780cc12..cecc574ba6 100644
--- a/doio.c
+++ b/doio.c
@@ -1224,6 +1224,9 @@ Perl_do_print(pTHX_ register SV *sv, PerlIO *fp)
tmpbuf = bytes_to_utf8((const U8*) tmps, &len);
tmps = (char *) tmpbuf;
}
+ else if (ckWARN_d(WARN_UTF8)) {
+ (void) check_utf8_print((const U8*) tmps, len);
+ }
}
else if (DO_UTF8(sv)) {
STRLEN tmplen = len;
@@ -1240,6 +1243,9 @@ Perl_do_print(pTHX_ register SV *sv, PerlIO *fp)
"Wide character in %s",
PL_op ? OP_DESC(PL_op) : "print"
);
+ /* Could also check that isn't one of the things to avoid
+ * in utf8 by using check_utf8_print(), but not doing so,
+ * since the stream isn't a UTF8 stream */
}
}
/* To detect whether the process is about to overstep its
diff --git a/embed.fnc b/embed.fnc
index e309acc815..3a4774c590 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1314,6 +1314,7 @@ ApMd |U8* |bytes_from_utf8|NN const U8 *s|NN STRLEN *len|NULLOK bool *is_utf8
ApMd |U8* |bytes_to_utf8 |NN const U8 *s|NN STRLEN *len
Apd |UV |utf8_to_uvchr |NN const U8 *s|NULLOK STRLEN *retlen
Apd |UV |utf8_to_uvuni |NN const U8 *s|NULLOK STRLEN *retlen
+pMd |bool |check_utf8_print |NN const U8 *s|const STRLEN len
#ifdef EBCDIC
Adp |UV |utf8n_to_uvchr |NN const U8 *s|STRLEN curlen|NULLOK STRLEN *retlen|U32 flags
diff --git a/embed.h b/embed.h
index 48eb8bb64e..e446e23b97 100644
--- a/embed.h
+++ b/embed.h
@@ -920,6 +920,7 @@
#define boot_core_UNIVERSAL() Perl_boot_core_UNIVERSAL(aTHX)
#define boot_core_mro() Perl_boot_core_mro(aTHX)
#define cando(a,b,c) Perl_cando(aTHX_ a,b,c)
+#define check_utf8_print(a,b) Perl_check_utf8_print(aTHX_ a,b)
#define ck_anoncode(a) Perl_ck_anoncode(aTHX_ a)
#define ck_bitop(a) Perl_ck_bitop(aTHX_ a)
#define ck_chdir(a) Perl_ck_chdir(aTHX_ a)
diff --git a/pod/perldiag.pod b/pod/perldiag.pod
index 09dace1cf7..c88df90590 100644
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -1358,6 +1358,17 @@ This subroutine cannot be called.
(F) You had a (sub-)template that ends with a '/'. There must be another
template code following the slash. See L<perlfunc/pack>.
+=item Code point 0x%X is not Unicode, may not be portable
+
+(W utf8) You had a code point above the Unicode maximum of U+10FFFF.
+
+Perl allows strings to contain a superset of Unicode code
+points, up to the limit of what is storable in an unsigned integer on
+your system, but these may not be accepted by other languages/systems.
+At one time, it was legal in some standards to have code points up to
+0x7FFF_FFFF, but not higher. Code points above 0xFFFF_FFFF require
+larger than a 32 bit word.
+
=item %s: Command not found
(A) You've accidentally run your script through B<csh> instead of Perl.
@@ -4619,6 +4630,8 @@ representative, who probably put it there in the first place.
=item Unicode non-character 0x%x is illegal for interchange
+=item Unicode non-character U+%X is illegal for open interchange
+
(W utf8) Certain codepoints, such as U+FFFE and U+FFFF, are defined by the
Unicode standard to be non-characters. Those are legal codepoints, but are
reserved for internal use; so, applications shouldn't attempt to exchange
@@ -5193,6 +5206,8 @@ exceeded. In the message, the characters in the sequence are separated by
dots, and each is shown by its ordinal in hex. Anything to the left of the
C<HERE> was retained; anything to the right was discarded.
+=item Unicode surrogate U+%X is illegal in UTF-8
+
=item UTF-16 surrogate 0x%x
(W utf8) You tried to generate half of a UTF-16 surrogate by
diff --git a/proto.h b/proto.h
index 5a32973746..c4a273fe8f 100644
--- a/proto.h
+++ b/proto.h
@@ -255,6 +255,11 @@ PERL_CALLCONV U32 Perl_cast_ulong(pTHX_ NV f)
PERL_CALLCONV UV Perl_cast_uv(pTHX_ NV f)
__attribute__warn_unused_result__;
+PERL_CALLCONV bool Perl_check_utf8_print(pTHX_ const U8 *s, const STRLEN len)
+ __attribute__nonnull__(pTHX_1);
+#define PERL_ARGS_ASSERT_CHECK_UTF8_PRINT \
+ assert(s)
+
PERL_CALLCONV OP * Perl_ck_anoncode(pTHX_ OP *o)
__attribute__warn_unused_result__
__attribute__nonnull__(pTHX_1);
diff --git a/utf8.c b/utf8.c
index 266cb9e5a4..9cc4d8963c 100644
--- a/utf8.c
+++ b/utf8.c
@@ -2649,6 +2649,54 @@ U32 flags)
return UNI_TO_NATIVE(uv);
}
+bool
+Perl_check_utf8_print(pTHX_ register const U8* s, const STRLEN len)
+{
+ /* May change: warns if surrogates, non-character code points, or
+ * non-Unicode code points are in s which has length len. Returns TRUE if
+ * none found; FALSE otherwise. The only other validity check is to make
+ * sure that this won't exceed the string's length */
+
+ const U8* const e = s + len;
+ bool ok = TRUE;
+
+ PERL_ARGS_ASSERT_CHECK_UTF8_PRINT;
+
+ while (s < e) {
+ if (UTF8SKIP(s) > len) {
+ Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
+ "%s in %s", unees, PL_op ? OP_DESC(PL_op) : "print");
+ return FALSE;
+ }
+ if (*s >= UTF8_FIRST_PROBLEMATIC_CODE_POINT_FIRST_BYTE) {
+ STRLEN char_len;
+ if (UTF8_IS_SUPER(s)) {
+ UV uv = utf8_to_uvchr(s, &char_len);
+ Perl_warner(aTHX_ packWARN(WARN_UTF8),
+ "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv);
+ ok = FALSE;
+ }
+ else if (UTF8_IS_SURROGATE(s)) {
+ UV uv = utf8_to_uvchr(s, &char_len);
+ Perl_warner(aTHX_ packWARN(WARN_UTF8),
+ "Unicode surrogate U+%04"UVXf" is illegal in UTF-8", uv);
+ ok = FALSE;
+ }
+ else if
+ (UTF8_IS_NONCHAR_GIVEN_THAT_NON_SUPER_AND_GE_PROBLEMATIC(s))
+ {
+ UV uv = utf8_to_uvchr(s, &char_len);
+ Perl_warner(aTHX_ packWARN(WARN_UTF8),
+ "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv);
+ ok = FALSE;
+ }
+ }
+ s += UTF8SKIP(s);
+ }
+
+ return ok;
+}
+
/*
=for apidoc pv_uni_display