diff options
author | Karl Williamson <khw@cpan.org> | 2016-09-10 22:09:44 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2016-09-17 21:10:49 -0600 |
commit | 2b47960981adadbe81b9635d4ca7861c45ccdced (patch) | |
tree | 72c757864ba7463c7cb11c4e62d08343f8cc8a72 | |
parent | d566bd20c27a46aecd668d2f739b9515f46ac74f (diff) | |
download | perl-2b47960981adadbe81b9635d4ca7861c45ccdced.tar.gz |
Enhance and rename is_utf8_char_slow()
This changes the name of this helper function and adds a parameter and
functionality to allow it to exclude problematic classes of code
points, the same ones excludeable by utf8n_to_uvchar(), like surrogates
or non-character code points.
-rw-r--r-- | embed.fnc | 2 | ||||
-rw-r--r-- | embed.h | 2 | ||||
-rw-r--r-- | inline.h | 2 | ||||
-rw-r--r-- | proto.h | 6 | ||||
-rw-r--r-- | utf8.c | 136 | ||||
-rw-r--r-- | utf8.h | 6 |
6 files changed, 123 insertions, 31 deletions
@@ -682,7 +682,7 @@ ApR |I32 |is_lvalue_sub : Used in cop.h XopR |I32 |was_lvalue_sub #ifndef PERL_NO_INLINE_FUNCTIONS -ApMRnP |STRLEN |_is_utf8_char_slow|NN const U8 * const s|const STRLEN len +ApMRnP |STRLEN |_is_utf8_char_helper|NN const U8 * const s|NN const U8 * const e|const U32 flags #endif ADMpPR |U32 |to_uni_upper_lc|U32 c ADMpPR |U32 |to_uni_title_lc|U32 c @@ -787,7 +787,7 @@ #define my_popen(a,b) Perl_my_popen(aTHX_ a,b) #endif #if !defined(PERL_NO_INLINE_FUNCTIONS) -#define _is_utf8_char_slow Perl__is_utf8_char_slow +#define _is_utf8_char_helper Perl__is_utf8_char_helper #define append_utf8_from_native_byte S_append_utf8_from_native_byte #define av_top_index(a) S_av_top_index(aTHX_ a) #define cx_popblock(a) S_cx_popblock(aTHX_ a) @@ -537,7 +537,7 @@ S_is_utf8_valid_partial_char(const U8 * const s, const U8 * const e) return FALSE; } - return cBOOL(_is_utf8_char_slow(s, e - s)); + return cBOOL(_is_utf8_char_helper(s, e, 0)); } /* ------------------------------- perl.h ----------------------------- */ @@ -3810,11 +3810,11 @@ STATIC SV * S_incpush_if_exists(pTHX_ AV *const av, SV *dir, SV *const stem); # endif #endif #if !defined(PERL_NO_INLINE_FUNCTIONS) -PERL_CALLCONV STRLEN Perl__is_utf8_char_slow(const U8 * const s, const STRLEN len) +PERL_CALLCONV STRLEN Perl__is_utf8_char_helper(const U8 * const s, const U8 * const e, const U32 flags) __attribute__warn_unused_result__ __attribute__pure__; -#define PERL_ARGS_ASSERT__IS_UTF8_CHAR_SLOW \ - assert(s) +#define PERL_ARGS_ASSERT__IS_UTF8_CHAR_HELPER \ + assert(s); assert(e) PERL_STATIC_INLINE void S_append_utf8_from_native_byte(const U8 byte, U8** dest); #define PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE \ @@ -423,37 +423,120 @@ S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e) } -/* - -A helper function for the macro isUTF8_CHAR(), which should be used instead of -this function. The macro will handle smaller code points directly saving time, -using this function as a fall-back for higher code points. This function -assumes that it is not called with an invariant character, and that -'s + len - 1' is within bounds of the string 's'. - -Tests if the string C<s> of at least length 'len' is a valid variant UTF-8 -character. 0 is returned if not, otherwise, 'len' is returned. - -It is written in such a way that if 'len' is set to less than a full -character's length, it will test if the bytes ending there form the legal -beginning of partial character. - -*/ - STRLEN -Perl__is_utf8_char_slow(const U8 * const s, const STRLEN len) +Perl__is_utf8_char_helper(const U8 * const s, const U8 * const e, const U32 flags) { - const U8 *e; + STRLEN len; const U8 *x, *y; - PERL_ARGS_ASSERT__IS_UTF8_CHAR_SLOW; + /* A helper function that should not be called directly. + * + * This function returns non-zero if the string beginning at 's' and + * looking no further than 'e - 1' is well-formed Perl-extended-UTF-8 for a + * code point; otherwise it returns 0. The examination stops after the + * first code point in 's' is validated, not looking at the rest of the + * input. If 'e' is such that there are not enough bytes to represent a + * complete code point, this function will return non-zero anyway, if the + * bytes it does have are well-formed UTF-8 as far as they go, and aren't + * excluded by 'flags'. + * + * A non-zero return gives the number of bytes required to represent the + * code point. Be aware that if the input is for a partial character, the + * return will be larger than 'e - s'. + * + * This function assumes that the code point represented is UTF-8 variant. + * The caller should have excluded this possibility before calling this + * function. + * + * 'flags' can be 0, or any combination of the UTF8_DISALLOW_foo flags + * accepted by L</utf8n_to_uvchr>. If non-zero, this function will return + * 0 if the code point represented is well-formed Perl-extended-UTF-8, but + * disallowed by the flags. If the input is only for a partial character, + * the function will return non-zero if there is any sequence of + * well-formed UTF-8 that, when appended to the input sequence, could + * result in an allowed code point; otherwise it returns 0. Non characters + * cannot be determined based on partial character input. But many of the + * other excluded types can be determined with just the first one or two + * bytes. + * + */ + + PERL_ARGS_ASSERT__IS_UTF8_CHAR_HELPER; + assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE + |UTF8_DISALLOW_ABOVE_31_BIT))); + assert(! UTF8_IS_INVARIANT(*s)); + + /* A variant char must begin with a start byte */ if (UNLIKELY(! UTF8_IS_START(*s))) { return 0; } - e = s + len; + len = e - s; + + if (flags && isUTF8_POSSIBLY_PROBLEMATIC(*s)) { + const U8 s0 = NATIVE_UTF8_TO_I8(s[0]); + + /* The code below is derived from this table. Keep in mind that legal + * continuation bytes range between \x80..\xBF for UTF-8, and + * \xA0..\xBF for I8. Anything above those aren't continuation bytes. + * Hence, we don't have to test the upper edge because if any of those + * are encountered, the sequence is malformed, and will fail elsewhere + * in this function. + * UTF-8 UTF-EBCDIC I8 + * U+D800: \xED\xA0\x80 \xF1\xB6\xA0\xA0 First surrogate + * U+DFFF: \xED\xBF\xBF \xF1\xB7\xBF\xBF Final surrogate + * U+110000: \xF4\x90\x80\x80 \xF9\xA2\xA0\xA0\xA0 First above Unicode + * + */ + +#ifdef EBCDIC /* On EBCDIC, these are actually I8 bytes */ +# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xFA +# define IS_SUPER_2_BYTE(s0, s1) ((s0) == 0xF9 && (s1) >= 0xA2) + /* B6 and B7 */ +# define IS_SURROGATE(s0, s1) ((s0) == 0xF1 && ((s1) & 0xFE ) == 0xB6) +#else +# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xF5 +# define IS_SUPER_2_BYTE(s0, s1) ((s0) == 0xF4 && (s1) >= 0x90) +# define IS_SURROGATE(s0, s1) ((s0) == 0xED && (s1) >= 0xA0) +#endif + + if ( (flags & UTF8_DISALLOW_SUPER) + && UNLIKELY(s0 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER)) { + return 0; /* Above Unicode */ + } + + if ( (flags & UTF8_DISALLOW_ABOVE_31_BIT) + && UNLIKELY(is_utf8_cp_above_31_bits(s, e))) + { + return 0; /* Above 31 bits */ + } + + if (len > 1) { + const U8 s1 = NATIVE_UTF8_TO_I8(s[1]); + + if ( (flags & UTF8_DISALLOW_SUPER) + && UNLIKELY(IS_SUPER_2_BYTE(s0, s1))) + { + return 0; /* Above Unicode */ + } + + if ( (flags & UTF8_DISALLOW_SURROGATE) + && UNLIKELY(IS_SURROGATE(s0, s1))) + { + return 0; /* Surrogate */ + } + + if ( (flags & UTF8_DISALLOW_NONCHAR) + && UNLIKELY(UTF8_IS_NONCHAR(s, e))) + { + return 0; /* Noncharacter code point */ + } + } + } + + /* Make sure that all that follows are continuation bytes */ for (x = s + 1; x < e; x++) { if (UNLIKELY(! UTF8_IS_CONTINUATION(*x))) { return 0; @@ -555,9 +638,18 @@ Perl__is_utf8_char_slow(const U8 * const s, const STRLEN len) break; } - return len; + return UTF8SKIP(s); } +#undef FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER +#undef IS_SUPER_2_BYTE +#undef IS_SURROGATE +#undef F0_ABOVE_OVERLONG +#undef F8_ABOVE_OVERLONG +#undef FC_ABOVE_OVERLONG +#undef FE_ABOVE_OVERLONG +#undef FF_OVERLONG_PREFIX + /* =for apidoc utf8n_to_uvchr @@ -755,14 +755,14 @@ fit in an IV on the current machine. && ( NATIVE_UTF8_TO_I8(*(s)) > 0xF9 \ || (NATIVE_UTF8_TO_I8(*(s) + 1) >= 0xA2)) \ && LIKELY((s) + UTF8SKIP(s) <= (e))) \ - ? _is_utf8_char_slow(s, UTF8SKIP(s)) : 0) + ? _is_utf8_char_helper(s, s + UTF8SKIP(s), 0) : 0) #else # define UTF8_IS_SUPER(s, e) \ (( LIKELY((e) > (s) + 3) \ && (*(U8*) (s)) >= 0xF4 \ && ((*(U8*) (s)) > 0xF4 || (*((U8*) (s) + 1) >= 0x90))\ && LIKELY((s) + UTF8SKIP(s) <= (e))) \ - ? _is_utf8_char_slow(s, UTF8SKIP(s)) : 0) + ? _is_utf8_char_helper(s, s + UTF8SKIP(s), 0) : 0) #endif /* These are now machine generated, and the 'given' clause is no longer @@ -919,7 +919,7 @@ is a valid UTF-8 character. ? 0 \ : LIKELY(NATIVE_UTF8_TO_I8(*s) <= _IS_UTF8_CHAR_HIGHEST_START_BYTE) \ ? is_UTF8_CHAR_utf8_no_length_checks(s) \ - : _is_utf8_char_slow(s, UTF8SKIP(s))) + : _is_utf8_char_helper(s, e, 0)) #define is_utf8_char_buf(buf, buf_end) isUTF8_CHAR(buf, buf_end) |