summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--embed.fnc3
-rw-r--r--embed.h1
-rw-r--r--proto.h6
-rw-r--r--utf8.c64
4 files changed, 74 insertions, 0 deletions
diff --git a/embed.fnc b/embed.fnc
index 8dba57ce12..ac8f93ab8a 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1116,6 +1116,9 @@ ApR |I32 |is_lvalue_sub
: Used in cop.h
XopR |I32 |was_lvalue_sub
CpRTP |STRLEN |is_utf8_char_helper|NN const U8 * const s|NN const U8 * e|const U32 flags
+CpRTP |Size_t |is_utf8_FF_helper_|NN const U8 * const s0 \
+ |NN const U8 * const e \
+ |const bool require_partial
Cp |UV |to_uni_upper |UV c|NN U8 *p|NN STRLEN *lenp
Cp |UV |to_uni_title |UV c|NN U8 *p|NN STRLEN *lenp
p |void |init_uniprops
diff --git a/embed.h b/embed.h
index cece9a798c..4578848f7a 100644
--- a/embed.h
+++ b/embed.h
@@ -272,6 +272,7 @@
#define is_lvalue_sub() Perl_is_lvalue_sub(aTHX)
#define is_safe_syscall(a,b,c,d) Perl_is_safe_syscall(aTHX_ a,b,c,d)
#define is_strict_utf8_string_loclen Perl_is_strict_utf8_string_loclen
+#define is_utf8_FF_helper_ Perl_is_utf8_FF_helper_
#ifndef NO_MATHOMS
#define is_utf8_char Perl_is_utf8_char
#endif
diff --git a/proto.h b/proto.h
index 7083f13135..26884b042f 100644
--- a/proto.h
+++ b/proto.h
@@ -1708,6 +1708,12 @@ PERL_STATIC_INLINE bool Perl_is_strict_utf8_string_loclen(const U8 *s, STRLEN le
#define PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN \
assert(s)
#endif
+PERL_CALLCONV Size_t Perl_is_utf8_FF_helper_(const U8 * const s0, const U8 * const e, const bool require_partial)
+ __attribute__warn_unused_result__
+ __attribute__pure__;
+#define PERL_ARGS_ASSERT_IS_UTF8_FF_HELPER_ \
+ assert(s0); assert(e)
+
#ifndef NO_MATHOMS
PERL_CALLCONV STRLEN Perl_is_utf8_char(const U8 *s)
__attribute__deprecated__;
diff --git a/utf8.c b/utf8.c
index 5877add9f2..6b157dfaaa 100644
--- a/utf8.c
+++ b/utf8.c
@@ -956,6 +956,70 @@ Perl_is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
return UTF8SKIP(s);
}
+Size_t
+Perl_is_utf8_FF_helper_(const U8 * const s0, const U8 * const e,
+ const bool require_partial)
+{
+ /* This is called to determine if the UTF-8 sequence starting at s0 and
+ * continuing for up to one full character of bytes, but looking no further
+ * than 'e - 1', is legal. *s0 must be 0xFF (or whatever the native
+ * equivalent of FF in I8 on EBCDIC platforms is). This marks it as being
+ * for the largest code points recognized by Perl, the ones that require
+ * the most UTF-8 bytes per character to represent (somewhat less than
+ * twice the size of the next longest kind). This sequence will only ever
+ * be Perl extended UTF-8.
+ *
+ * The routine returns 0 if the sequence is not fully valid, syntactically
+ * or semantically. That means it checks that everything following the
+ * start byte is a continuation byte, and that it doesn't overflow, nor is
+ * an overlong representation.
+ *
+ * If 'require_partial' is FALSE, the routine returns non-zero only if the
+ * input (as far as 'e-1') is a full character. The return is the count of
+ * the bytes in the character.
+ *
+ * If 'require_partial' is TRUE, the routine returns non-zero only if the
+ * input as far as 'e-1' is a partial, not full character, with no
+ * malformations found before position 'e'. The return is either just
+ * FALSE, or TRUE. */
+
+ const U8 *s = s0 + 1;
+ const U8 *send = e;
+
+ PERL_ARGS_ASSERT_IS_UTF8_FF_HELPER_;
+
+ assert(s0 < e);
+ assert(*s0 == I8_TO_NATIVE_UTF8(0xFF));
+
+ send = s + MIN(UTF8_MAXBYTES - 1, e - s);
+ while (s < send) {
+ if (! UTF8_IS_CONTINUATION(*s)) {
+ return 0;
+ }
+
+ s++;
+ }
+
+ if (0 < does_utf8_overflow(s0, e,
+ FALSE /* Don't consider_overlongs */
+ )) {
+ return 0;
+ }
+
+ if (0 < isFF_overlong(s0, e - s0)) {
+ return 0;
+ }
+
+ /* Here, the character is valid as far as it got. Check if got a partial
+ * character */
+ if (s - s0 < UTF8_MAXBYTES) {
+ return (require_partial) ? 1 : 0;
+ }
+
+ /* Here, got a full character */
+ return (require_partial) ? 0 : UTF8_MAXBYTES;
+}
+
char *
Perl__byte_dump_string(pTHX_ const U8 * const start, const STRLEN len, const bool format)
{