summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2021-06-26 19:57:03 -0600
committerKarl Williamson <khw@cpan.org>2021-08-07 05:14:44 -0600
commit58b66e89ba80b4f91ff12da18da1301aa182a687 (patch)
treedb91351c2e183774b4b816c78fd842d73872bc98
parent527347e0493f4ee24d63df996fd6616806422d17 (diff)
downloadperl-58b66e89ba80b4f91ff12da18da1301aa182a687.tar.gz
Add helper function for longest UTF8 sequence
This specialized functionality is used to check the validity of Perl's extended-length UTF-8, which has some ideosyncratic characteristics from the shorter sequences. This means this function doesn't have to consider those differences. It will be used in the next commit to avoid some work, and to eventually enable is_utf8_char_helper() to be simplified.
-rw-r--r--embed.fnc3
-rw-r--r--embed.h1
-rw-r--r--proto.h6
-rw-r--r--utf8.c64
4 files changed, 74 insertions, 0 deletions
diff --git a/embed.fnc b/embed.fnc
index 8dba57ce12..ac8f93ab8a 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1116,6 +1116,9 @@ ApR |I32 |is_lvalue_sub
: Used in cop.h
XopR |I32 |was_lvalue_sub
CpRTP |STRLEN |is_utf8_char_helper|NN const U8 * const s|NN const U8 * e|const U32 flags
+CpRTP |Size_t |is_utf8_FF_helper_|NN const U8 * const s0 \
+ |NN const U8 * const e \
+ |const bool require_partial
Cp |UV |to_uni_upper |UV c|NN U8 *p|NN STRLEN *lenp
Cp |UV |to_uni_title |UV c|NN U8 *p|NN STRLEN *lenp
p |void |init_uniprops
diff --git a/embed.h b/embed.h
index cece9a798c..4578848f7a 100644
--- a/embed.h
+++ b/embed.h
@@ -272,6 +272,7 @@
#define is_lvalue_sub() Perl_is_lvalue_sub(aTHX)
#define is_safe_syscall(a,b,c,d) Perl_is_safe_syscall(aTHX_ a,b,c,d)
#define is_strict_utf8_string_loclen Perl_is_strict_utf8_string_loclen
+#define is_utf8_FF_helper_ Perl_is_utf8_FF_helper_
#ifndef NO_MATHOMS
#define is_utf8_char Perl_is_utf8_char
#endif
diff --git a/proto.h b/proto.h
index 7083f13135..26884b042f 100644
--- a/proto.h
+++ b/proto.h
@@ -1708,6 +1708,12 @@ PERL_STATIC_INLINE bool Perl_is_strict_utf8_string_loclen(const U8 *s, STRLEN le
#define PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN \
assert(s)
#endif
+PERL_CALLCONV Size_t Perl_is_utf8_FF_helper_(const U8 * const s0, const U8 * const e, const bool require_partial)
+ __attribute__warn_unused_result__
+ __attribute__pure__;
+#define PERL_ARGS_ASSERT_IS_UTF8_FF_HELPER_ \
+ assert(s0); assert(e)
+
#ifndef NO_MATHOMS
PERL_CALLCONV STRLEN Perl_is_utf8_char(const U8 *s)
__attribute__deprecated__;
diff --git a/utf8.c b/utf8.c
index 5877add9f2..6b157dfaaa 100644
--- a/utf8.c
+++ b/utf8.c
@@ -956,6 +956,70 @@ Perl_is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
return UTF8SKIP(s);
}
+Size_t
+Perl_is_utf8_FF_helper_(const U8 * const s0, const U8 * const e,
+ const bool require_partial)
+{
+ /* This is called to determine if the UTF-8 sequence starting at s0 and
+ * continuing for up to one full character of bytes, but looking no further
+ * than 'e - 1', is legal. *s0 must be 0xFF (or whatever the native
+ * equivalent of FF in I8 on EBCDIC platforms is). This marks it as being
+ * for the largest code points recognized by Perl, the ones that require
+ * the most UTF-8 bytes per character to represent (somewhat less than
+ * twice the size of the next longest kind). This sequence will only ever
+ * be Perl extended UTF-8.
+ *
+ * The routine returns 0 if the sequence is not fully valid, syntactically
+ * or semantically. That means it checks that everything following the
+ * start byte is a continuation byte, and that it doesn't overflow, nor is
+ * an overlong representation.
+ *
+ * If 'require_partial' is FALSE, the routine returns non-zero only if the
+ * input (as far as 'e-1') is a full character. The return is the count of
+ * the bytes in the character.
+ *
+ * If 'require_partial' is TRUE, the routine returns non-zero only if the
+ * input as far as 'e-1' is a partial, not full character, with no
+ * malformations found before position 'e'. The return is either just
+ * FALSE, or TRUE. */
+
+ const U8 *s = s0 + 1;
+ const U8 *send = e;
+
+ PERL_ARGS_ASSERT_IS_UTF8_FF_HELPER_;
+
+ assert(s0 < e);
+ assert(*s0 == I8_TO_NATIVE_UTF8(0xFF));
+
+ send = s + MIN(UTF8_MAXBYTES - 1, e - s);
+ while (s < send) {
+ if (! UTF8_IS_CONTINUATION(*s)) {
+ return 0;
+ }
+
+ s++;
+ }
+
+ if (0 < does_utf8_overflow(s0, e,
+ FALSE /* Don't consider_overlongs */
+ )) {
+ return 0;
+ }
+
+ if (0 < isFF_overlong(s0, e - s0)) {
+ return 0;
+ }
+
+ /* Here, the character is valid as far as it got. Check if got a partial
+ * character */
+ if (s - s0 < UTF8_MAXBYTES) {
+ return (require_partial) ? 1 : 0;
+ }
+
+ /* Here, got a full character */
+ return (require_partial) ? 0 : UTF8_MAXBYTES;
+}
+
char *
Perl__byte_dump_string(pTHX_ const U8 * const start, const STRLEN len, const bool format)
{