summaryrefslogtreecommitdiff
path: root/utf8.h
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2016-12-10 18:01:39 -0700
committerKarl Williamson <khw@cpan.org>2016-12-23 16:48:35 -0700
commitd60baaa7781e81851a5ac29fea2abebde6730478 (patch)
treed17932730657798e3e2422ec429012270bec6d14 /utf8.h
parent9495395586e6a655057cb766ed00213037dd06c0 (diff)
downloadperl-d60baaa7781e81851a5ac29fea2abebde6730478.tar.gz
Allow allowing UTF-8 overflow malformation
Perl has never allowed the UTF-8 overflow malformation, for some reason. But as long as overflows are turned into the REPLACEMENT CHARACTER, there is no real reason not to. Making it allowable lets code that wants to carry on in the face of malformed input do so without risk of contaminating things, as the REPLACEMENT CHARACTER is the Unicode-prescribed way of handling malformations.
Diffstat (limited to 'utf8.h')
-rw-r--r--utf8.h11
1 files changed, 8 insertions, 3 deletions
diff --git a/utf8.h b/utf8.h
index 3dde45a1dd..d7c4e1ad30 100644
--- a/utf8.h
+++ b/utf8.h
@@ -745,8 +745,8 @@ case any call to string overloading updates the internal UTF-8 encoding flag.
#define UTF8_ALLOW_LONG_AND_ITS_VALUE (UTF8_ALLOW_LONG|0x0020)
#define UTF8_GOT_LONG UTF8_ALLOW_LONG
-/* Currently no way to allow overflow */
-#define UTF8_GOT_OVERFLOW 0x0080
+#define UTF8_ALLOW_OVERFLOW 0x0080
+#define UTF8_GOT_OVERFLOW UTF8_ALLOW_OVERFLOW
#define UTF8_DISALLOW_SURROGATE 0x0100 /* Unicode surrogates */
#define UTF8_GOT_SURROGATE UTF8_DISALLOW_SURROGATE
@@ -790,10 +790,15 @@ case any call to string overloading updates the internal UTF-8 encoding flag.
#define UTF8_WARN_ILLEGAL_INTERCHANGE \
(UTF8_WARN_ILLEGAL_C9_INTERCHANGE|UTF8_WARN_NONCHAR)
+/* This is typically used by code that is willing to accept ill-formed UTF-8
+ * input sequences, for whatever reason. However, all such sequences
+ * evaluate to the REPLACEMENT CHARACTER unless other flags overriding this are
+ * also present. */
#define UTF8_ALLOW_ANY ( UTF8_ALLOW_CONTINUATION \
|UTF8_ALLOW_NON_CONTINUATION \
|UTF8_ALLOW_SHORT \
- |UTF8_ALLOW_LONG)
+ |UTF8_ALLOW_LONG \
+ |UTF8_ALLOW_OVERFLOW)
/* Accept any Perl-extended UTF-8 that evaluates to any UV on the platform, but
* not any malformed. This is the default. (Note that UVs above IV_MAX are