diff options
author | Karl Williamson <khw@cpan.org> | 2021-06-25 13:09:08 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2021-08-14 06:47:43 -0600 |
commit | 22f363ffd253b5142b1138438c30f34da9494d4a (patch) | |
tree | a02ca691b6818aafce1300ab4cdd3706ff7c4087 | |
parent | 22afef87083fc7ad1b066588f5c20637fd387805 (diff) | |
download | perl-22f363ffd253b5142b1138438c30f34da9494d4a.tar.gz |
Make macro isUTF8_CHAR_flags an inline fcn
This makes it use the fast DFA for this functionality.
-rw-r--r-- | embed.fnc | 3 |
-rw-r--r-- | embed.h | 1 |
-rw-r--r-- | inline.h | 67 |
-rw-r--r-- | proto.h | 7 |
-rw-r--r-- | utf8.c | 5 |
-rw-r--r-- | utf8.h | 39 |
6 files changed, 82 insertions, 40 deletions
--- a/embed.fnc
+++ b/embed.fnc
@@ -1164,6 +1164,9 @@ AbTpdD |STRLEN |is_utf8_char |NN const U8 *s
 AbMTpd |STRLEN |is_utf8_char_buf|NN const U8 *buf|NN const U8 *buf_end
 ATidRp |Size_t |isUTF8_CHAR|NN const U8 * const s0 \
                 |NN const U8 * const e
+ATidRp |Size_t |isUTF8_CHAR_flags|NN const U8 * const s0 \
+                |NN const U8 * const e \
+                |const U32 flags
 ATidRp |Size_t |isSTRICT_UTF8_CHAR |NN const U8 * const s0 \
                 |NN const U8 * const e
 ATidRp |Size_t |isC9_STRICT_UTF8_CHAR |NN const U8 * const s0 \
--- a/embed.h
+++ b/embed.h
@@ -268,6 +268,7 @@
 #define isC9_STRICT_UTF8_CHAR Perl_isC9_STRICT_UTF8_CHAR
 #define isSTRICT_UTF8_CHAR Perl_isSTRICT_UTF8_CHAR
 #define isUTF8_CHAR Perl_isUTF8_CHAR
+#define isUTF8_CHAR_flags Perl_isUTF8_CHAR_flags
 #define is_c9strict_utf8_string_loclen Perl_is_c9strict_utf8_string_loclen
 #define is_lvalue_sub() Perl_is_lvalue_sub(aTHX)
 #define is_safe_syscall(a,b,c,d) Perl_is_safe_syscall(aTHX_ a,b,c,d)
--- a/inline.h
+++ b/inline.h
@@ -2169,6 +2169,73 @@ Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end)
 
 /*
 
+=for apidoc Am|STRLEN|isUTF8_CHAR_flags|const U8 *s|const U8 *e| const U32 flags
+
+Evaluates to non-zero if the first few bytes of the string starting at C<s> and
+looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
+that represents some code point, subject to the restrictions given by C<flags>;
+otherwise it evaluates to 0. If non-zero, the value gives how many bytes
+starting at C<s> comprise the code point's representation. Any bytes remaining
+before C<e>, but beyond the ones needed to form the first code point in C<s>,
+are not examined.
+
+If C<flags> is 0, this gives the same results as C<L</isUTF8_CHAR>>;
+if C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
+as C<L</isSTRICT_UTF8_CHAR>>;
+and if C<flags> is C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives
+the same results as C<L</isC9_STRICT_UTF8_CHAR>>.
+Otherwise C<flags> may be any combination of the C<UTF8_DISALLOW_I<foo>> flags
+understood by C<L</utf8n_to_uvchr>>, with the same meanings.
+
+The three alternative macros are for the most commonly needed validations; they
+are likely to run somewhat faster than this more general one, as they can be
+inlined into your code.
+
+Use L</is_utf8_string_flags>, L</is_utf8_string_loc_flags>, and
+L</is_utf8_string_loclen_flags> to check entire strings.
+
+=cut
+*/
+
+PERL_STATIC_INLINE STRLEN
+Perl_isUTF8_CHAR_flags(const U8 * const s0, const U8 * const e, const U32 flags)
+{
+    PERL_ARGS_ASSERT_ISUTF8_CHAR_FLAGS;
+    assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
+                          |UTF8_DISALLOW_PERL_EXTENDED)));
+
+    PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
+                          goto check_success,
+                          DFA_TEASE_APART_FF_,
+                          DFA_RETURN_FAILURE_);
+
+  check_success:
+
+    return is_utf8_char_helper(s0, e, flags);
+
+#ifdef HAS_EXTRA_LONG_UTF8
+
+  tease_apart_FF:
+
+    /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is
+     * either malformed, or was for the largest possible start byte, which
+     * indicates perl extended UTF-8, well above the Unicode maximum */
+    if ( *s0 != I8_TO_NATIVE_UTF8(0xFF)
+        || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED)))
+    {
+        return 0;
+    }
+
+    /* Otherwise examine the sequence not inline */
+    return is_utf8_FF_helper_(s0, e,
+                              FALSE /* require full, not partial char */
+                             );
+#endif
+
+}
+
+/*
+
 =for apidoc is_utf8_valid_partial_char
 
 Returns 0 if the sequence of bytes starting at C<s> and looking no further than
--- a/proto.h
+++ b/proto.h
@@ -1665,6 +1665,13 @@ PERL_STATIC_INLINE Size_t Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const
         assert(s0); assert(e)
 #endif
 
+#ifndef PERL_NO_INLINE_FUNCTIONS
+PERL_STATIC_INLINE Size_t Perl_isUTF8_CHAR_flags(const U8 * const s0, const U8 * const e, const U32 flags)
+                        __attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT_ISUTF8_CHAR_FLAGS \
+        assert(s0); assert(e)
+#endif
+
 /* PERL_CALLCONV bool is_ascii_string(const U8* const s, STRLEN len)
                         __attribute__warn_unused_result__
                         __attribute__pure__; */
--- a/utf8.c
+++ b/utf8.c
@@ -817,7 +817,10 @@ Perl_is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
 
     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
                           |UTF8_DISALLOW_PERL_EXTENDED)));
-    assert(! UTF8_IS_INVARIANT(*s));
+
+    if (UTF8_IS_INVARIANT(*s)) {
+        return 1;
+    }
 
     /* A variant char must begin with a start byte */
     if (UNLIKELY(! UTF8_IS_START(*s))) {
--- a/utf8.h
+++ b/utf8.h
@@ -1219,45 +1219,6 @@ point's representation.
 #define bytes_from_utf8(s, lenp, is_utf8p) \
     bytes_from_utf8_loc(s, lenp, is_utf8p, 0)
 
-/*
-
-=for apidoc Am|STRLEN|isUTF8_CHAR_flags|const U8 *s|const U8 *e| const U32 flags
-
-Evaluates to non-zero if the first few bytes of the string starting at C<s> and
-looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
-that represents some code point, subject to the restrictions given by C<flags>;
-otherwise it evaluates to 0. If non-zero, the value gives how many bytes
-starting at C<s> comprise the code point's representation. Any bytes remaining
-before C<e>, but beyond the ones needed to form the first code point in C<s>,
-are not examined.
-
-If C<flags> is 0, this gives the same results as C<L</isUTF8_CHAR>>;
-if C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
-as C<L</isSTRICT_UTF8_CHAR>>;
-and if C<flags> is C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives
-the same results as C<L</isC9_STRICT_UTF8_CHAR>>.
-Otherwise C<flags> may be any combination of the C<UTF8_DISALLOW_I<foo>> flags
-understood by C<L</utf8n_to_uvchr>>, with the same meanings.
-
-The three alternative macros are for the most commonly needed validations; they
-are likely to run somewhat faster than this more general one, as they can be
-inlined into your code.
-
-Use L</is_utf8_string_flags>, L</is_utf8_string_loc_flags>, and
-L</is_utf8_string_loclen_flags> to check entire strings.
-
-=cut
-*/
-
-#define isUTF8_CHAR_flags(s, e, flags) \
-    (UNLIKELY((e) <= (s)) \
-    ? 0 \
-    : (UTF8_IS_INVARIANT(*s)) \
-      ? 1 \
-      : UNLIKELY(((e) - (s)) < UTF8SKIP(s)) \
-        ? 0 \
-        : is_utf8_char_helper(s, e, flags))
-
 /* Do not use; should be deprecated. Use isUTF8_CHAR() instead; this is
  * retained solely for backwards compatibility */
 #define IS_UTF8_CHAR(p, n) (isUTF8_CHAR(p, (p) + (n)) == n)