diff options
author | Karl Williamson <khw@cpan.org> | 2021-06-26 19:57:03 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2021-08-07 05:14:44 -0600 |
commit | 58b66e89ba80b4f91ff12da18da1301aa182a687 (patch) | |
tree | db91351c2e183774b4b816c78fd842d73872bc98 | |
parent | 527347e0493f4ee24d63df996fd6616806422d17 (diff) | |
download | perl-58b66e89ba80b4f91ff12da18da1301aa182a687.tar.gz |
Add helper function for longest UTF8 sequence
This specialized functionality is used to check the validity of Perl's
extended-length UTF-8, which has some ideosyncratic characteristics from
the shorter sequences. This means this function doesn't have to
consider those differences. It will be used in the next commit to avoid
some work, and to eventually enable is_utf8_char_helper() to be
simplified.
-rw-r--r-- | embed.fnc | 3 | ||||
-rw-r--r-- | embed.h | 1 | ||||
-rw-r--r-- | proto.h | 6 | ||||
-rw-r--r-- | utf8.c | 64 |
4 files changed, 74 insertions, 0 deletions
@@ -1116,6 +1116,9 @@ ApR |I32 |is_lvalue_sub : Used in cop.h XopR |I32 |was_lvalue_sub CpRTP |STRLEN |is_utf8_char_helper|NN const U8 * const s|NN const U8 * e|const U32 flags +CpRTP |Size_t |is_utf8_FF_helper_|NN const U8 * const s0 \ + |NN const U8 * const e \ + |const bool require_partial Cp |UV |to_uni_upper |UV c|NN U8 *p|NN STRLEN *lenp Cp |UV |to_uni_title |UV c|NN U8 *p|NN STRLEN *lenp p |void |init_uniprops @@ -272,6 +272,7 @@ #define is_lvalue_sub() Perl_is_lvalue_sub(aTHX) #define is_safe_syscall(a,b,c,d) Perl_is_safe_syscall(aTHX_ a,b,c,d) #define is_strict_utf8_string_loclen Perl_is_strict_utf8_string_loclen +#define is_utf8_FF_helper_ Perl_is_utf8_FF_helper_ #ifndef NO_MATHOMS #define is_utf8_char Perl_is_utf8_char #endif @@ -1708,6 +1708,12 @@ PERL_STATIC_INLINE bool Perl_is_strict_utf8_string_loclen(const U8 *s, STRLEN le #define PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN \ assert(s) #endif +PERL_CALLCONV Size_t Perl_is_utf8_FF_helper_(const U8 * const s0, const U8 * const e, const bool require_partial) + __attribute__warn_unused_result__ + __attribute__pure__; +#define PERL_ARGS_ASSERT_IS_UTF8_FF_HELPER_ \ + assert(s0); assert(e) + #ifndef NO_MATHOMS PERL_CALLCONV STRLEN Perl_is_utf8_char(const U8 *s) __attribute__deprecated__; @@ -956,6 +956,70 @@ Perl_is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags) return UTF8SKIP(s); } +Size_t +Perl_is_utf8_FF_helper_(const U8 * const s0, const U8 * const e, + const bool require_partial) +{ + /* This is called to determine if the UTF-8 sequence starting at s0 and + * continuing for up to one full character of bytes, but looking no further + * than 'e - 1', is legal. *s0 must be 0xFF (or whatever the native + * equivalent of FF in I8 on EBCDIC platforms is). This marks it as being + * for the largest code points recognized by Perl, the ones that require + * the most UTF-8 bytes per character to represent (somewhat less than + * twice the size of the next longest kind). This sequence will only ever + * be Perl extended UTF-8. + * + * The routine returns 0 if the sequence is not fully valid, syntactically + * or semantically. That means it checks that everything following the + * start byte is a continuation byte, and that it doesn't overflow, nor is + * an overlong representation. + * + * If 'require_partial' is FALSE, the routine returns non-zero only if the + * input (as far as 'e-1') is a full character. The return is the count of + * the bytes in the character. + * + * If 'require_partial' is TRUE, the routine returns non-zero only if the + * input as far as 'e-1' is a partial, not full character, with no + * malformations found before position 'e'. The return is either just + * FALSE, or TRUE. */ + + const U8 *s = s0 + 1; + const U8 *send = e; + + PERL_ARGS_ASSERT_IS_UTF8_FF_HELPER_; + + assert(s0 < e); + assert(*s0 == I8_TO_NATIVE_UTF8(0xFF)); + + send = s + MIN(UTF8_MAXBYTES - 1, e - s); + while (s < send) { + if (! UTF8_IS_CONTINUATION(*s)) { + return 0; + } + + s++; + } + + if (0 < does_utf8_overflow(s0, e, + FALSE /* Don't consider_overlongs */ + )) { + return 0; + } + + if (0 < isFF_overlong(s0, e - s0)) { + return 0; + } + + /* Here, the character is valid as far as it got. Check if got a partial + * character */ + if (s - s0 < UTF8_MAXBYTES) { + return (require_partial) ? 1 : 0; + } + + /* Here, got a full character */ + return (require_partial) ? 0 : UTF8_MAXBYTES; +} + char * Perl__byte_dump_string(pTHX_ const U8 * const start, const STRLEN len, const bool format) { |