summaryrefslogtreecommitdiff
path: root/utf8.c
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2016-09-10 22:09:44 -0600
committerKarl Williamson <khw@cpan.org>2016-09-17 21:10:49 -0600
commit2b47960981adadbe81b9635d4ca7861c45ccdced (patch)
tree72c757864ba7463c7cb11c4e62d08343f8cc8a72 /utf8.c
parentd566bd20c27a46aecd668d2f739b9515f46ac74f (diff)
downloadperl-2b47960981adadbe81b9635d4ca7861c45ccdced.tar.gz
Enhance and rename is_utf8_char_slow()
This changes the name of this helper function and adds a parameter and functionality to allow it to exclude problematic classes of code points, the same ones excludeable by utf8n_to_uvchar(), like surrogates or non-character code points.
Diffstat (limited to 'utf8.c')
-rw-r--r--utf8.c136
1 files changed, 114 insertions, 22 deletions
diff --git a/utf8.c b/utf8.c
index 2b9ea5b2d2..52b31dc4c2 100644
--- a/utf8.c
+++ b/utf8.c
@@ -423,37 +423,120 @@ S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e)
}
-/*
-
-A helper function for the macro isUTF8_CHAR(), which should be used instead of
-this function. The macro will handle smaller code points directly saving time,
-using this function as a fall-back for higher code points. This function
-assumes that it is not called with an invariant character, and that
-'s + len - 1' is within bounds of the string 's'.
-
-Tests if the string C<s> of at least length 'len' is a valid variant UTF-8
-character. 0 is returned if not, otherwise, 'len' is returned.
-
-It is written in such a way that if 'len' is set to less than a full
-character's length, it will test if the bytes ending there form the legal
-beginning of partial character.
-
-*/
-
STRLEN
-Perl__is_utf8_char_slow(const U8 * const s, const STRLEN len)
+Perl__is_utf8_char_helper(const U8 * const s, const U8 * const e, const U32 flags)
{
- const U8 *e;
+ STRLEN len;
const U8 *x, *y;
- PERL_ARGS_ASSERT__IS_UTF8_CHAR_SLOW;
+ /* A helper function that should not be called directly.
+ *
+ * This function returns non-zero if the string beginning at 's' and
+ * looking no further than 'e - 1' is well-formed Perl-extended-UTF-8 for a
+ * code point; otherwise it returns 0. The examination stops after the
+ * first code point in 's' is validated, not looking at the rest of the
+ * input. If 'e' is such that there are not enough bytes to represent a
+ * complete code point, this function will return non-zero anyway, if the
+ * bytes it does have are well-formed UTF-8 as far as they go, and aren't
+ * excluded by 'flags'.
+ *
+ * A non-zero return gives the number of bytes required to represent the
+ * code point. Be aware that if the input is for a partial character, the
+ * return will be larger than 'e - s'.
+ *
+ * This function assumes that the code point represented is UTF-8 variant.
+ * The caller should have excluded this possibility before calling this
+ * function.
+ *
+ * 'flags' can be 0, or any combination of the UTF8_DISALLOW_foo flags
+ * accepted by L</utf8n_to_uvchr>. If non-zero, this function will return
+ * 0 if the code point represented is well-formed Perl-extended-UTF-8, but
+ * disallowed by the flags. If the input is only for a partial character,
+ * the function will return non-zero if there is any sequence of
+ * well-formed UTF-8 that, when appended to the input sequence, could
+ * result in an allowed code point; otherwise it returns 0. Non characters
+ * cannot be determined based on partial character input. But many of the
+ * other excluded types can be determined with just the first one or two
+ * bytes.
+ *
+ */
+
+ PERL_ARGS_ASSERT__IS_UTF8_CHAR_HELPER;
+ assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
+ |UTF8_DISALLOW_ABOVE_31_BIT)));
+ assert(! UTF8_IS_INVARIANT(*s));
+
+ /* A variant char must begin with a start byte */
if (UNLIKELY(! UTF8_IS_START(*s))) {
return 0;
}
- e = s + len;
+ len = e - s;
+
+ if (flags && isUTF8_POSSIBLY_PROBLEMATIC(*s)) {
+ const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
+
+ /* The code below is derived from this table. Keep in mind that legal
+ * continuation bytes range between \x80..\xBF for UTF-8, and
+ * \xA0..\xBF for I8. Anything above those aren't continuation bytes.
+ * Hence, we don't have to test the upper edge because if any of those
+ * are encountered, the sequence is malformed, and will fail elsewhere
+ * in this function.
+ * UTF-8 UTF-EBCDIC I8
+ * U+D800: \xED\xA0\x80 \xF1\xB6\xA0\xA0 First surrogate
+ * U+DFFF: \xED\xBF\xBF \xF1\xB7\xBF\xBF Final surrogate
+ * U+110000: \xF4\x90\x80\x80 \xF9\xA2\xA0\xA0\xA0 First above Unicode
+ *
+ */
+
+#ifdef EBCDIC /* On EBCDIC, these are actually I8 bytes */
+# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xFA
+# define IS_SUPER_2_BYTE(s0, s1) ((s0) == 0xF9 && (s1) >= 0xA2)
+ /* B6 and B7 */
+# define IS_SURROGATE(s0, s1) ((s0) == 0xF1 && ((s1) & 0xFE ) == 0xB6)
+#else
+# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xF5
+# define IS_SUPER_2_BYTE(s0, s1) ((s0) == 0xF4 && (s1) >= 0x90)
+# define IS_SURROGATE(s0, s1) ((s0) == 0xED && (s1) >= 0xA0)
+#endif
+
+ if ( (flags & UTF8_DISALLOW_SUPER)
+ && UNLIKELY(s0 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER)) {
+ return 0; /* Above Unicode */
+ }
+
+ if ( (flags & UTF8_DISALLOW_ABOVE_31_BIT)
+ && UNLIKELY(is_utf8_cp_above_31_bits(s, e)))
+ {
+ return 0; /* Above 31 bits */
+ }
+
+ if (len > 1) {
+ const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
+
+ if ( (flags & UTF8_DISALLOW_SUPER)
+ && UNLIKELY(IS_SUPER_2_BYTE(s0, s1)))
+ {
+ return 0; /* Above Unicode */
+ }
+
+ if ( (flags & UTF8_DISALLOW_SURROGATE)
+ && UNLIKELY(IS_SURROGATE(s0, s1)))
+ {
+ return 0; /* Surrogate */
+ }
+
+ if ( (flags & UTF8_DISALLOW_NONCHAR)
+ && UNLIKELY(UTF8_IS_NONCHAR(s, e)))
+ {
+ return 0; /* Noncharacter code point */
+ }
+ }
+ }
+
+ /* Make sure that all that follows are continuation bytes */
for (x = s + 1; x < e; x++) {
if (UNLIKELY(! UTF8_IS_CONTINUATION(*x))) {
return 0;
@@ -555,9 +638,18 @@ Perl__is_utf8_char_slow(const U8 * const s, const STRLEN len)
break;
}
- return len;
+ return UTF8SKIP(s);
}
+#undef FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER
+#undef IS_SUPER_2_BYTE
+#undef IS_SURROGATE
+#undef F0_ABOVE_OVERLONG
+#undef F8_ABOVE_OVERLONG
+#undef FC_ABOVE_OVERLONG
+#undef FE_ABOVE_OVERLONG
+#undef FF_OVERLONG_PREFIX
+
/*
=for apidoc utf8n_to_uvchr