summaryrefslogtreecommitdiff
path: root/inline.h
diff options
context:
space:
mode:
author Karl Williamson <khw@cpan.org> 2021-07-01 18:47:45 -0600
committer Karl Williamson <khw@cpan.org> 2021-08-14 06:47:43 -0600
commit 22afef87083fc7ad1b066588f5c20637fd387805 (patch)
tree 74f540e111eb7fbc989150361b1b375861af608a /inline.h
parent 2e5a4e5adeeaafbc8743371293762d71bf27ea15 (diff)
download perl-22afef87083fc7ad1b066588f5c20637fd387805.tar.gz
is_utf8_valid_partial_char_flags: Use DFA
The DFA macro for determining if a sequence is valid UTF-8 was deliberately made general enough to accommodate this use case, in which only a partial character is acceptable. Change the code to use the DFA. The helper function's name is changed to indicate it is private.
Diffstat (limited to 'inline.h')
-rw-r--r-- inline.h 39
1 files changed, 34 insertions, 5 deletions
diff --git a/inline.h b/inline.h
index 7b13be02b6..6c5b1bda46 100644
--- a/inline.h
+++ b/inline.h
@@ -2217,18 +2217,47 @@ determined from just the first one or two bytes.
*/
PERL_STATIC_INLINE bool
-Perl_is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const U32 flags)
+Perl_is_utf8_valid_partial_char_flags(const U8 * const s0, const U8 * const e, const U32 flags)
{
PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS;
-
assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
|UTF8_DISALLOW_PERL_EXTENDED)));
- if (s >= e || s + UTF8SKIP(s) <= e) {
- return FALSE;
+ PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
+ DFA_RETURN_FAILURE_,
+ DFA_TEASE_APART_FF_,
+ NOOP);
+
+ /* The NOOP above causes the DFA to drop down here iff the input was a
+ * partial character. flags=0 => can return TRUE immediately; otherwise we
+ * need to check (not inline) if the partial character is the beginning of
+ * a disallowed one */
+ if (flags == 0) {
+ return TRUE;
+ }
+
+ return cBOOL(is_utf8_char_helper(s0, e, flags));
+
+#ifdef HAS_EXTRA_LONG_UTF8
+
+ tease_apart_FF:
+
+ /* Getting here means the input is either malformed, or, in the case of
+ * PL_extended_utf8_dfa_tab, was for the largest possible start byte. The
+ * latter case has to be extended UTF-8, so can fail immediately if that is
+ * forbidden */
+
+ if ( *s0 != I8_TO_NATIVE_UTF8(0xFF)
+ || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED)))
+ {
+ return 0;
}
- return cBOOL(is_utf8_char_helper(s, e, flags));
+ return is_utf8_FF_helper_(s0, e,
+ TRUE /* Require to be a partial character */
+ );
+#endif
+
}
/*