summary refs log tree commit diff
diff options
context:
space:
mode:
author: Karl Williamson <khw@cpan.org> 2021-06-25 13:09:08 -0600
committer: Karl Williamson <khw@cpan.org> 2021-08-14 06:47:43 -0600
commit: 22f363ffd253b5142b1138438c30f34da9494d4a (patch)
tree: a02ca691b6818aafce1300ab4cdd3706ff7c4087
parent: 22afef87083fc7ad1b066588f5c20637fd387805 (diff)
download: perl-22f363ffd253b5142b1138438c30f34da9494d4a.tar.gz
Make macro isUTF8_CHAR_flags an inline fcn
This makes it use the fast DFA for this functionality.
-rw-r--r-- embed.fnc 3
-rw-r--r-- embed.h 1
-rw-r--r-- inline.h 67
-rw-r--r-- proto.h 7
-rw-r--r-- utf8.c 5
-rw-r--r-- utf8.h 39
6 files changed, 82 insertions, 40 deletions
diff --git a/embed.fnc b/embed.fnc
index b4fe575f6b..5cea8260ce 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1164,6 +1164,9 @@ AbTpdD |STRLEN |is_utf8_char |NN const U8 *s
AbMTpd |STRLEN |is_utf8_char_buf|NN const U8 *buf|NN const U8 *buf_end
ATidRp |Size_t |isUTF8_CHAR|NN const U8 * const s0 \
|NN const U8 * const e
+ATidRp |Size_t |isUTF8_CHAR_flags|NN const U8 * const s0 \
+ |NN const U8 * const e \
+ |const U32 flags
ATidRp |Size_t |isSTRICT_UTF8_CHAR |NN const U8 * const s0 \
|NN const U8 * const e
ATidRp |Size_t |isC9_STRICT_UTF8_CHAR |NN const U8 * const s0 \
diff --git a/embed.h b/embed.h
index 4578848f7a..f4e0043cf1 100644
--- a/embed.h
+++ b/embed.h
@@ -268,6 +268,7 @@
#define isC9_STRICT_UTF8_CHAR Perl_isC9_STRICT_UTF8_CHAR
#define isSTRICT_UTF8_CHAR Perl_isSTRICT_UTF8_CHAR
#define isUTF8_CHAR Perl_isUTF8_CHAR
+#define isUTF8_CHAR_flags Perl_isUTF8_CHAR_flags
#define is_c9strict_utf8_string_loclen Perl_is_c9strict_utf8_string_loclen
#define is_lvalue_sub() Perl_is_lvalue_sub(aTHX)
#define is_safe_syscall(a,b,c,d) Perl_is_safe_syscall(aTHX_ a,b,c,d)
diff --git a/inline.h b/inline.h
index 6c5b1bda46..9cfa445626 100644
--- a/inline.h
+++ b/inline.h
@@ -2169,6 +2169,73 @@ Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end)
/*
+=for apidoc Am|STRLEN|isUTF8_CHAR_flags|const U8 *s|const U8 *e| const U32 flags
+
+Evaluates to non-zero if the first few bytes of the string starting at C<s> and
+looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
+that represents some code point, subject to the restrictions given by C<flags>;
+otherwise it evaluates to 0. If non-zero, the value gives how many bytes
+starting at C<s> comprise the code point's representation. Any bytes remaining
+before C<e>, but beyond the ones needed to form the first code point in C<s>,
+are not examined.
+
+If C<flags> is 0, this gives the same results as C<L</isUTF8_CHAR>>;
+if C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
+as C<L</isSTRICT_UTF8_CHAR>>;
+and if C<flags> is C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives
+the same results as C<L</isC9_STRICT_UTF8_CHAR>>.
+Otherwise C<flags> may be any combination of the C<UTF8_DISALLOW_I<foo>> flags
+understood by C<L</utf8n_to_uvchr>>, with the same meanings.
+
+The three alternative functions are for the most commonly needed validations;
+like this one they can be inlined into your code, but they are likely to run
+somewhat faster, as they take no C<flags> that have to be checked.
+
+Use L</is_utf8_string_flags>, L</is_utf8_string_loc_flags>, and
+L</is_utf8_string_loclen_flags> to check entire strings.
+
+=cut
+*/
+
+PERL_STATIC_INLINE STRLEN /* NOTE(review): proto.h prototype says Size_t */
+Perl_isUTF8_CHAR_flags(const U8 * const s0, const U8 * const e, const U32 flags)
+{
+ PERL_ARGS_ASSERT_ISUTF8_CHAR_FLAGS; /* expands to assert(s0); assert(e) */
+ assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
+ |UTF8_DISALLOW_PERL_EXTENDED))); /* no other flag bits allowed */
+ /* Check that the bytes from s0 up to (not including) e begin with one
+ * complete UTF-8 character that 'flags' permits. The DFA macro is handed
+ * the table PL_extended_utf8_dfa_tab and three actions; the first of them
+ * jumps to check_success below.
+ * NOTE(review): PERL_IS_UTF8_CHAR_DFA, DFA_TEASE_APART_FF_ and
+ * DFA_RETURN_FAILURE_ are defined elsewhere; presumably the latter two
+ * jump to tease_apart_FF and return 0 respectively -- confirm. */
+ PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
+ goto check_success,
+ DFA_TEASE_APART_FF_,
+ DFA_RETURN_FAILURE_);
+
+ check_success:
+ /* Hand the sequence and 'flags' to the helper; its result is returned */
+ return is_utf8_char_helper(s0, e, flags);
+ /* The rest, up to #endif, is compiled only with HAS_EXTRA_LONG_UTF8 */
+#ifdef HAS_EXTRA_LONG_UTF8
+
+ tease_apart_FF:
+
+ /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is
+ * either malformed, or was for the largest possible start byte, which
+ * indicates perl extended UTF-8, well above the Unicode maximum */
+ if ( *s0 != I8_TO_NATIVE_UTF8(0xFF)
+ || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED)))
+ {
+ return 0;
+ }
+
+ /* Otherwise examine the sequence not inline */
+ return is_utf8_FF_helper_(s0, e,
+ FALSE /* require full, not partial char */
+ );
+#endif
+
+}
+
+/*
+
=for apidoc is_utf8_valid_partial_char
Returns 0 if the sequence of bytes starting at C<s> and looking no further than
diff --git a/proto.h b/proto.h
index 4dd2c192e0..16080febb6 100644
--- a/proto.h
+++ b/proto.h
@@ -1665,6 +1665,13 @@ PERL_STATIC_INLINE Size_t Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const
assert(s0); assert(e)
#endif
+#ifndef PERL_NO_INLINE_FUNCTIONS
+PERL_STATIC_INLINE Size_t Perl_isUTF8_CHAR_flags(const U8 * const s0, const U8 * const e, const U32 flags)
+ __attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT_ISUTF8_CHAR_FLAGS \
+ assert(s0); assert(e)
+#endif
+
/* PERL_CALLCONV bool is_ascii_string(const U8* const s, STRLEN len)
__attribute__warn_unused_result__
__attribute__pure__; */
diff --git a/utf8.c b/utf8.c
index c78ce84ab5..7f26c8645c 100644
--- a/utf8.c
+++ b/utf8.c
@@ -817,7 +817,10 @@ Perl_is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
|UTF8_DISALLOW_PERL_EXTENDED)));
- assert(! UTF8_IS_INVARIANT(*s));
+
+ if (UTF8_IS_INVARIANT(*s)) {
+ return 1;
+ }
/* A variant char must begin with a start byte */
if (UNLIKELY(! UTF8_IS_START(*s))) {
diff --git a/utf8.h b/utf8.h
index c274d2e225..e85e14bc07 100644
--- a/utf8.h
+++ b/utf8.h
@@ -1219,45 +1219,6 @@ point's representation.
#define bytes_from_utf8(s, lenp, is_utf8p) \
bytes_from_utf8_loc(s, lenp, is_utf8p, 0)
-/*
-
-=for apidoc Am|STRLEN|isUTF8_CHAR_flags|const U8 *s|const U8 *e| const U32 flags
-
-Evaluates to non-zero if the first few bytes of the string starting at C<s> and
-looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
-that represents some code point, subject to the restrictions given by C<flags>;
-otherwise it evaluates to 0. If non-zero, the value gives how many bytes
-starting at C<s> comprise the code point's representation. Any bytes remaining
-before C<e>, but beyond the ones needed to form the first code point in C<s>,
-are not examined.
-
-If C<flags> is 0, this gives the same results as C<L</isUTF8_CHAR>>;
-if C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
-as C<L</isSTRICT_UTF8_CHAR>>;
-and if C<flags> is C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives
-the same results as C<L</isC9_STRICT_UTF8_CHAR>>.
-Otherwise C<flags> may be any combination of the C<UTF8_DISALLOW_I<foo>> flags
-understood by C<L</utf8n_to_uvchr>>, with the same meanings.
-
-The three alternative macros are for the most commonly needed validations; they
-are likely to run somewhat faster than this more general one, as they can be
-inlined into your code.
-
-Use L</is_utf8_string_flags>, L</is_utf8_string_loc_flags>, and
-L</is_utf8_string_loclen_flags> to check entire strings.
-
-=cut
-*/
-
-#define isUTF8_CHAR_flags(s, e, flags) \
- (UNLIKELY((e) <= (s)) \
- ? 0 \
- : (UTF8_IS_INVARIANT(*s)) \
- ? 1 \
- : UNLIKELY(((e) - (s)) < UTF8SKIP(s)) \
- ? 0 \
- : is_utf8_char_helper(s, e, flags))
-
/* Do not use; should be deprecated. Use isUTF8_CHAR() instead; this is
* retained solely for backwards compatibility */
#define IS_UTF8_CHAR(p, n) (isUTF8_CHAR(p, (p) + (n)) == n)