summary refs log tree commit diff
diff options
context:
space:
mode:
author Karl Williamson <khw@cpan.org> 2021-07-01 18:48:10 -0600
committer Karl Williamson <khw@cpan.org> 2021-08-14 06:47:43 -0600
commit 1aa501c28abd51b6253fb6da3caeee66320bf274 (patch)
tree 34755745c3dffee562186389cedada7b54c88bc0
parent 22f363ffd253b5142b1138438c30f34da9494d4a (diff)
download perl-1aa501c28abd51b6253fb6da3caeee66320bf274.tar.gz
utf8.c: Refactor is_utf8_char_helper()
Now that the only callers of this function use the DFA, which eliminates the need to check for, e.g., wrong continuation bytes, this function can be refactored to use a switch statement, which makes it clearer, shorter, and faster. The name is changed to indicate its private nature.
-rw-r--r-- embed.fnc 2
-rw-r--r-- embed.h 2
-rw-r--r-- inline.h 4
-rw-r--r-- proto.h 4
-rw-r--r-- utf8.c 193
5 files changed, 88 insertions, 117 deletions
diff --git a/embed.fnc b/embed.fnc
index 5cea8260ce..834faea428 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1115,7 +1115,7 @@ pR |OP* |cmpchain_finish|NN OP* ch
ApR |I32 |is_lvalue_sub
: Used in cop.h
XopR |I32 |was_lvalue_sub
-CpRTP |STRLEN |is_utf8_char_helper|NN const U8 * const s|NN const U8 * e|const U32 flags
+CpRTP |STRLEN |is_utf8_char_helper_|NN const U8 * const s|NN const U8 * e|const U32 flags
CpRTP |Size_t |is_utf8_FF_helper_|NN const U8 * const s0 \
|NN const U8 * const e \
|const bool require_partial
diff --git a/embed.h b/embed.h
index f4e0043cf1..8e9b3779dc 100644
--- a/embed.h
+++ b/embed.h
@@ -277,7 +277,7 @@
#ifndef NO_MATHOMS
#define is_utf8_char Perl_is_utf8_char
#endif
-#define is_utf8_char_helper Perl_is_utf8_char_helper
+#define is_utf8_char_helper_ Perl_is_utf8_char_helper_
#define is_utf8_fixed_width_buf_loclen_flags Perl_is_utf8_fixed_width_buf_loclen_flags
#define is_utf8_invariant_string_loc Perl_is_utf8_invariant_string_loc
#define is_utf8_string_flags Perl_is_utf8_string_flags
diff --git a/inline.h b/inline.h
index 9cfa445626..31c68bd485 100644
--- a/inline.h
+++ b/inline.h
@@ -2211,7 +2211,7 @@ Perl_isUTF8_CHAR_flags(const U8 * const s0, const U8 * const e, const U32 flags)
check_success:
- return is_utf8_char_helper(s0, e, flags);
+ return is_utf8_char_helper_(s0, e, flags);
#ifdef HAS_EXTRA_LONG_UTF8
@@ -2303,7 +2303,7 @@ Perl_is_utf8_valid_partial_char_flags(const U8 * const s0, const U8 * const e, c
return TRUE;
}
- return cBOOL(is_utf8_char_helper(s0, e, flags));
+ return cBOOL(is_utf8_char_helper_(s0, e, flags));
#ifdef HAS_EXTRA_LONG_UTF8
diff --git a/proto.h b/proto.h
index 16080febb6..effb8ea2f2 100644
--- a/proto.h
+++ b/proto.h
@@ -1733,10 +1733,10 @@ PERL_CALLCONV STRLEN Perl_is_utf8_char_buf(const U8 *buf, const U8 *buf_end);
#define PERL_ARGS_ASSERT_IS_UTF8_CHAR_BUF \
assert(buf); assert(buf_end)
#endif
-PERL_CALLCONV STRLEN Perl_is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
+PERL_CALLCONV STRLEN Perl_is_utf8_char_helper_(const U8 * const s, const U8 * e, const U32 flags)
__attribute__warn_unused_result__
__attribute__pure__;
-#define PERL_ARGS_ASSERT_IS_UTF8_CHAR_HELPER \
+#define PERL_ARGS_ASSERT_IS_UTF8_CHAR_HELPER_ \
assert(s); assert(e)
/* PERL_CALLCONV bool is_utf8_fixed_width_buf_flags(const U8 * const s, STRLEN len, const U32 flags); */
diff --git a/utf8.c b/utf8.c
index 7f26c8645c..d9363b3860 100644
--- a/utf8.c
+++ b/utf8.c
@@ -776,149 +776,120 @@ S_does_utf8_overflow(const U8 * const s,
#undef FF_OVERLONG_PREFIX
STRLEN
-Perl_is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
+Perl_is_utf8_char_helper_(const U8 * const s, const U8 * e, const U32 flags)
{
- STRLEN len;
- const U8 *x;
+ SSize_t len, full_len;
- /* A helper function that should not be called directly.
- *
- * This function returns non-zero if the string beginning at 's' and
- * looking no further than 'e - 1' is well-formed Perl-extended-UTF-8 for a
- * code point; otherwise it returns 0. The examination stops after the
- * first code point in 's' is validated, not looking at the rest of the
- * input. If 'e' is such that there are not enough bytes to represent a
- * complete code point, this function will return non-zero anyway, if the
- * bytes it does have are well-formed UTF-8 as far as they go, and aren't
- * excluded by 'flags'.
- *
- * A non-zero return gives the number of bytes required to represent the
- * code point. Be aware that if the input is for a partial character, the
- * return will be larger than 'e - s'.
- *
- * This function assumes that the code point represented is UTF-8 variant.
- * The caller should have excluded the possibility of it being invariant
- * before calling this function.
+ /* An internal helper function.
*
+ * On input:
+ * 's' is a string, which is known to be syntactically valid UTF-8 as far
+ * as (e - 1); e > s must hold.
+ * 'e' This function is allowed to look at any byte from 's'...'e-1', but
+ * nowhere else. The function has to cope as best it can if that
+ * sequence does not form a full character.
* 'flags' can be 0, or any combination of the UTF8_DISALLOW_foo flags
- * accepted by L</utf8n_to_uvchr>. If non-zero, this function will return
- * 0 if the code point represented is well-formed Perl-extended-UTF-8, but
- * disallowed by the flags. If the input is only for a partial character,
- * the function will return non-zero if there is any sequence of
- * well-formed UTF-8 that, when appended to the input sequence, could
- * result in an allowed code point; otherwise it returns 0. Non characters
- * cannot be determined based on partial character input. But many of the
- * other excluded types can be determined with just the first one or two
- * bytes.
+ * accepted by L</utf8n_to_uvchr>. If non-zero, this function returns
+ * 0 if it determines the input will match something disallowed.
+ * On output:
+ * The return is the number of bytes required to represent the code point
+ * if it isn't disallowed by 'flags'; 0 otherwise. Be aware that if the
+ * input is for a partial character, a successful return will be larger
+ * than 'e - s'.
+ *
+ * If *s..*(e-1) is only for a partial character, the function will return
+ * non-zero if there is any sequence of well-formed UTF-8 that, when
+ * appended to the input sequence, could result in an allowed code point;
+ * otherwise it returns 0. Non characters cannot be determined based on
+ * partial character input. But many of the other excluded types can be
+ * determined with just the first one or two bytes.
*
*/
- PERL_ARGS_ASSERT_IS_UTF8_CHAR_HELPER;
+ PERL_ARGS_ASSERT_IS_UTF8_CHAR_HELPER_;
+ assert(e > s);
assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
|UTF8_DISALLOW_PERL_EXTENDED)));
- if (UTF8_IS_INVARIANT(*s)) {
- return 1;
- }
+ full_len = UTF8SKIP(s);
- /* A variant char must begin with a start byte */
- if (UNLIKELY(! UTF8_IS_START(*s))) {
- return 0;
+ len = e - s;
+ if (len > full_len) {
+ e = s + full_len;
+ len = full_len;
}
- /* Examine a maximum of a single whole code point */
- if (e - s > UTF8SKIP(s)) {
- e = s + UTF8SKIP(s);
- }
+ switch (full_len) {
+ bool is_super;
- len = e - s;
+ default: /* Extended */
+ if (flags & UTF8_DISALLOW_PERL_EXTENDED) {
+ return 0;
+ }
- if (flags && isUTF8_POSSIBLY_PROBLEMATIC(*s)) {
- const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
-
- /* Here, we are disallowing some set of largish code points, and the
- * first byte indicates the sequence is for a code point that could be
- * in the excluded set. We generally don't have to look beyond this or
- * the second byte to see if the sequence is actually for one of the
- * excluded classes. The code below is derived from this table:
- *
- * UTF-8 UTF-EBCDIC I8
- * U+D800: \xED\xA0\x80 \xF1\xB6\xA0\xA0 First surrogate
- * U+DFFF: \xED\xBF\xBF \xF1\xB7\xBF\xBF Final surrogate
- * U+110000: \xF4\x90\x80\x80 \xF9\xA2\xA0\xA0\xA0 First above Unicode
- *
- * Keep in mind that legal continuation bytes range between \x80..\xBF
- * for UTF-8, and \xA0..\xBF for I8. Anything above those aren't
- * continuation bytes. Hence, we don't have to test the upper edge
- * because if any of those is encountered, the sequence is malformed,
- * and would fail elsewhere in this function.
- *
- * The code here likewise assumes that there aren't other
- * malformations; again the function should fail elsewhere because of
- * these. For example, an overlong beginning with FC doesn't actually
- * have to be a super; it could actually represent a small code point,
- * even U+0000. But, since overlongs (and other malformations) are
- * illegal, the function should return FALSE in either case.
- */
+ /* FALLTHROUGH */
- if ( (flags & UTF8_DISALLOW_SUPER)
- && UNLIKELY(s0 > UTF_START_BYTE_110000_))
- {
- return 0; /* Above Unicode */
+ case 6 + ONE_IF_EBCDIC_ZERO_IF_NOT: /* above Unicode */
+ case 5 + ONE_IF_EBCDIC_ZERO_IF_NOT: /* above Unicode */
+
+ if (flags & UTF8_DISALLOW_SUPER) {
+ return 0; /* Above Unicode */
}
- if ( (flags & UTF8_DISALLOW_PERL_EXTENDED)
- && UNLIKELY(UTF8_IS_PERL_EXTENDED(s)))
+ return full_len;
+
+ case 4 + ONE_IF_EBCDIC_ZERO_IF_NOT:
+ is_super = ( UNLIKELY(NATIVE_UTF8_TO_I8(s[0]) > UTF_START_BYTE_110000_)
+ || ( len > 1
+ && NATIVE_UTF8_TO_I8(s[0]) == UTF_START_BYTE_110000_
+ && NATIVE_UTF8_TO_I8(s[1])
+ >= UTF_FIRST_CONT_BYTE_110000_));
+ if (is_super) {
+ if (flags & UTF8_DISALLOW_SUPER) {
+ return 0;
+ }
+ }
+ else if ( (flags & UTF8_DISALLOW_NONCHAR)
+ && len == full_len
+ && UNLIKELY(is_LARGER_NON_CHARS_utf8(s)))
{
return 0;
}
- if (len > 1) {
- if ( (flags & UTF8_DISALLOW_SUPER)
- && NATIVE_UTF8_TO_I8(s[0]) >= UTF_START_BYTE_110000_
- && NATIVE_UTF8_TO_I8(s[1]) >= UTF_FIRST_CONT_BYTE_110000_)
- {
- return 0; /* Above Unicode */
- }
+ return full_len;
- if ( (flags & UTF8_DISALLOW_SURROGATE)
- && UNLIKELY(is_SURROGATE_utf8(s)))
- {
- return 0; /* Surrogate */
- }
+ case 3 + ONE_IF_EBCDIC_ZERO_IF_NOT:
- if ( (flags & UTF8_DISALLOW_NONCHAR)
- && UNLIKELY(UTF8_IS_NONCHAR(s, e)))
- {
- return 0; /* Noncharacter code point */
- }
+ if (! isUTF8_POSSIBLY_PROBLEMATIC(s[0]) || len < 2) {
+ return full_len;
}
- }
- /* Make sure that all that follows are continuation bytes */
- for (x = s + 1; x < e; x++) {
- if (UNLIKELY(! UTF8_IS_CONTINUATION(*x))) {
+ if ( (flags & UTF8_DISALLOW_SURROGATE)
+ && UNLIKELY(is_SURROGATE_utf8(s)))
+ {
+ return 0; /* Surrogate */
+ }
+
+ if ( (flags & UTF8_DISALLOW_NONCHAR)
+ && len == full_len
+ && UNLIKELY(is_SHORTER_NON_CHARS_utf8(s)))
+ {
return 0;
}
- }
- /* Here is syntactically valid. Next, make sure this isn't the start of an
- * overlong. */
- if (is_utf8_overlong(s, len) > 0) {
- return 0;
- }
+ return full_len;
- /* And finally, that the code point represented fits in a word on this
- * platform */
- if (0 < does_utf8_overflow(s, e,
- 0 /* Don't consider overlongs */
- ))
- {
- return 0;
- }
+ /* The lower code points don't have any disallowable characters */
+#ifdef EBCDIC
+ case 3:
+ return full_len;
+#endif
- return UTF8SKIP(s);
+ case 2:
+ case 1:
+ return full_len;
+ }
}
Size_t