6 files changed, 76 insertions, 0 deletions
diff --git a/doio.c b/doio.c
index c23780cc12..cecc574ba6 100644
--- a/doio.c
+++ b/doio.c
@@ -1224,6 +1224,9 @@ Perl_do_print(pTHX_ register SV *sv, PerlIO *fp)
 		tmpbuf = bytes_to_utf8((const U8*) tmps, &len);
 		tmps = (char *) tmpbuf;
 	    }
+	    else if (ckWARN_d(WARN_UTF8)) {
+		(void) check_utf8_print((const U8*) tmps, len);
+	    }
 	}
 	else if (DO_UTF8(sv)) {
 	    STRLEN tmplen = len;
@@ -1240,6 +1243,9 @@ Perl_do_print(pTHX_ register SV *sv, PerlIO *fp)
 				 "Wide character in %s",
 				   PL_op ? OP_DESC(PL_op) : "print"
 				);
+		    /* Could also check that isn't one of the things to avoid
+		     * in utf8 by using check_utf8_print(), but not doing so,
+		     * since the stream isn't a UTF8 stream */
 	    }
 	}
 	/* To detect whether the process is about to overstep its
diff --git a/embed.fnc b/embed.fnc
index e309acc815..3a4774c590 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1314,6 +1314,7 @@ ApMd	|U8*	|bytes_from_utf8|NN const U8 *s|NN STRLEN *len|NULLOK bool *is_utf8
 ApMd	|U8*	|bytes_to_utf8	|NN const U8 *s|NN STRLEN *len
 Apd	|UV	|utf8_to_uvchr	|NN const U8 *s|NULLOK STRLEN *retlen
 Apd	|UV	|utf8_to_uvuni	|NN const U8 *s|NULLOK STRLEN *retlen
+pMd	|bool	|check_utf8_print	|NN const U8 *s|const STRLEN len
 
 #ifdef EBCDIC
 Adp	|UV	|utf8n_to_uvchr	|NN const U8 *s|STRLEN curlen|NULLOK STRLEN *retlen|U32 flags
diff --git a/embed.h b/embed.h
index 48eb8bb64e..e446e23b97 100644
--- a/embed.h
+++ b/embed.h
@@ -920,6 +920,7 @@
 #define boot_core_UNIVERSAL()	Perl_boot_core_UNIVERSAL(aTHX)
 #define boot_core_mro()		Perl_boot_core_mro(aTHX)
 #define cando(a,b,c)		Perl_cando(aTHX_ a,b,c)
+#define check_utf8_print(a,b)	Perl_check_utf8_print(aTHX_ a,b)
 #define ck_anoncode(a)		Perl_ck_anoncode(aTHX_ a)
 #define ck_bitop(a)		Perl_ck_bitop(aTHX_ a)
 #define ck_chdir(a)		Perl_ck_chdir(aTHX_ a)
diff --git a/pod/perldiag.pod b/pod/perldiag.pod
index 09dace1cf7..c88df90590 100644
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -1358,6 +1358,17 @@ This subroutine cannot be called.
 (F) You had a (sub-)template that ends with a '/'. There must be another
 template code following the slash. See L<perlfunc/pack>.
 
+=item Code point 0x%X is not Unicode, may not be portable
+
+(W utf8) You had a code point above the Unicode maximum of U+10FFFF.
+
+Perl allows strings to contain a superset of Unicode code
+points, up to the limit of what is storable in an unsigned integer on
+your system, but these may not be accepted by other languages/systems.
+At one time, it was legal in some standards to have code points up to
+0x7FFF_FFFF, but not higher.  Code points above 0xFFFF_FFFF require
+larger than a 32 bit word.
+
 =item %s: Command not found
 
 (A) You've accidentally run your script through B<csh> instead of Perl.
@@ -4619,6 +4630,8 @@ representative, who probably put it there in the first place.
 
 =item Unicode non-character 0x%x is illegal for interchange
 
+=item Unicode non-character U+%X is illegal for open interchange
+
 (W utf8) Certain codepoints, such as U+FFFE and U+FFFF, are defined by the
 Unicode standard to be non-characters. Those are legal codepoints, but are
 reserved for internal use; so, applications shouldn't attempt to exchange
@@ -5193,6 +5206,8 @@ exceeded.  In the message, the characters in the sequence are separated by
 dots, and each is shown by its ordinal in hex.  Anything to the left of the
 C<HERE> was retained; anything to the right was discarded.
 
+=item Unicode surrogate U+%X is illegal in UTF-8
+
 =item UTF-16 surrogate 0x%x
 
 (W utf8) You tried to generate half of a UTF-16 surrogate by
diff --git a/proto.h b/proto.h
index 5a32973746..c4a273fe8f 100644
--- a/proto.h
+++ b/proto.h
@@ -255,6 +255,11 @@ PERL_CALLCONV U32	Perl_cast_ulong(pTHX_ NV f)
 PERL_CALLCONV UV	Perl_cast_uv(pTHX_ NV f)
 			__attribute__warn_unused_result__;
 
+PERL_CALLCONV bool	Perl_check_utf8_print(pTHX_ const U8 *s, const STRLEN len)
+			__attribute__nonnull__(pTHX_1);
+#define PERL_ARGS_ASSERT_CHECK_UTF8_PRINT	\
+	assert(s)
+
 PERL_CALLCONV OP *	Perl_ck_anoncode(pTHX_ OP *o)
 			__attribute__warn_unused_result__
 			__attribute__nonnull__(pTHX_1);
diff --git a/utf8.c b/utf8.c
index 266cb9e5a4..9cc4d8963c 100644
--- a/utf8.c
+++ b/utf8.c
@@ -2649,6 +2649,54 @@ U32 flags)
     return UNI_TO_NATIVE(uv);
 }
 
+bool
+Perl_check_utf8_print(pTHX_ register const U8* s, const STRLEN len)
+{
+    /* May change: warns if surrogates, non-character code points, or
+     * non-Unicode code points are in s which has length len.  Returns TRUE if
+     * none found; FALSE otherwise.  The only other validity check is to make
+     * sure that this won't exceed the string's length */
+
+    const U8* const e = s + len;
+    bool ok = TRUE;
+
+    PERL_ARGS_ASSERT_CHECK_UTF8_PRINT;
+
+    while (s < e) {
+	if (UTF8SKIP(s) > len) {
+	    Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
+			   "%s in %s", unees, PL_op ? OP_DESC(PL_op) : "print");
+	    return FALSE;
+	}
+	if (*s >= UTF8_FIRST_PROBLEMATIC_CODE_POINT_FIRST_BYTE) {
+	    STRLEN char_len;
+	    if (UTF8_IS_SUPER(s)) {
+		UV uv = utf8_to_uvchr(s, &char_len);
+		Perl_warner(aTHX_ packWARN(WARN_UTF8),
+		    "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv);
+		ok = FALSE;
+	    }
+	    else if (UTF8_IS_SURROGATE(s)) {
+		UV uv = utf8_to_uvchr(s, &char_len);
+		Perl_warner(aTHX_ packWARN(WARN_UTF8),
+		    "Unicode surrogate U+%04"UVXf" is illegal in UTF-8", uv);
+		ok = FALSE;
+	    }
+	    else if
+		(UTF8_IS_NONCHAR_GIVEN_THAT_NON_SUPER_AND_GE_PROBLEMATIC(s))
+	    {
+		UV uv = utf8_to_uvchr(s, &char_len);
+		Perl_warner(aTHX_ packWARN(WARN_UTF8),
+		    "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv);
+		ok = FALSE;
+	    }
+	}
+	s += UTF8SKIP(s);
+    }
+
+    return ok;
+}
+
 /*
 =for apidoc pv_uni_display