diff options
author | Karl Williamson <khw@cpan.org> | 2021-07-01 18:47:45 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2021-08-14 06:47:43 -0600 |
commit | 22afef87083fc7ad1b066588f5c20637fd387805 (patch) | |
tree | 74f540e111eb7fbc989150361b1b375861af608a /inline.h | |
parent | 2e5a4e5adeeaafbc8743371293762d71bf27ea15 (diff) | |
download | perl-22afef87083fc7ad1b066588f5c20637fd387805.tar.gz |
is_utf8_valid_partial_char_flags: Use DFA
The DFA macro for determining if a sequence is valid UTF-8 was
deliberately made general enough to accommodate this use-case, in which
only a partial character is acceptable. Change the code to use the DFA.
The helper function's name is changed to indicate that it is private.
Diffstat (limited to 'inline.h')
-rw-r--r-- | inline.h | 39 |
1 files changed, 34 insertions, 5 deletions
@@ -2217,18 +2217,47 @@ determined from just the first one or two bytes. */ PERL_STATIC_INLINE bool -Perl_is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const U32 flags) +Perl_is_utf8_valid_partial_char_flags(const U8 * const s0, const U8 * const e, const U32 flags) { PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS; - assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE |UTF8_DISALLOW_PERL_EXTENDED))); - if (s >= e || s + UTF8SKIP(s) <= e) { - return FALSE; + PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab, + DFA_RETURN_FAILURE_, + DFA_TEASE_APART_FF_, + NOOP); + + /* The NOOP above causes the DFA to drop down here iff the input was a + * partial character. flags=0 => can return TRUE immediately; otherwise we + * need to check (not inline) if the partial character is the beginning of + * a disallowed one */ + if (flags == 0) { + return TRUE; + } + + return cBOOL(is_utf8_char_helper(s0, e, flags)); + +#ifdef HAS_EXTRA_LONG_UTF8 + + tease_apart_FF: + + /* Getting here means the input is either malformed, or, in the case of + * PL_extended_utf8_dfa_tab, was for the largest possible start byte. The + * latter case has to be extended UTF-8, so can fail immediately if that is + * forbidden */ + + if ( *s0 != I8_TO_NATIVE_UTF8(0xFF) + || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED))) + { + return 0; } - return cBOOL(is_utf8_char_helper(s, e, flags)); + return is_utf8_FF_helper_(s0, e, + TRUE /* Require to be a partial character */ + ); +#endif + } /* |