diff options
-rw-r--r-- | embed.fnc | 2 | ||||
-rw-r--r-- | embed.h | 2 | ||||
-rw-r--r-- | inline.h | 30 | ||||
-rw-r--r-- | proto.h | 7 | ||||
-rw-r--r-- | t/lib/warnings/utf8 | 1 | ||||
-rw-r--r-- | utf8.c | 112 | ||||
-rw-r--r-- | utf8.h | 25 |
7 files changed, 136 insertions, 43 deletions
@@ -678,7 +678,7 @@ ApR |I32 |is_lvalue_sub : Used in cop.h XopR |I32 |was_lvalue_sub #ifndef PERL_NO_INLINE_FUNCTIONS -AiMRn |STRLEN |_is_utf8_char_slow|NN const U8 *s|NN const U8 *e +ApMRnP |STRLEN |_is_utf8_char_slow|NN const U8 * const s|const STRLEN len #endif ADMpPR |U32 |to_uni_upper_lc|U32 c ADMpPR |U32 |to_uni_title_lc|U32 c @@ -786,7 +786,7 @@ #define my_popen(a,b) Perl_my_popen(aTHX_ a,b) #endif #if !defined(PERL_NO_INLINE_FUNCTIONS) -#define _is_utf8_char_slow S__is_utf8_char_slow +#define _is_utf8_char_slow Perl__is_utf8_char_slow #define append_utf8_from_native_byte S_append_utf8_from_native_byte #define av_top_index(a) S_av_top_index(aTHX_ a) #define cx_popblock(a) S_cx_popblock(aTHX_ a) @@ -277,36 +277,6 @@ S_append_utf8_from_native_byte(const U8 byte, U8** dest) } /* - -A helper function for the macro isUTF8_CHAR(), which should be used instead of -this function. The macro will handle smaller code points directly saving time, -using this function as a fall-back for higher code points. - -Tests if the first bytes of string C<s> form a valid UTF-8 character. 0 is -returned if the bytes starting at C<s> up to but not including C<e> do not form a -complete well-formed UTF-8 character; otherwise the number of bytes in the -character is returned. - -Note that an INVARIANT (i.e. ASCII on non-EBCDIC) character is a valid UTF-8 -character. - -=cut */ -PERL_STATIC_INLINE STRLEN -S__is_utf8_char_slow(const U8 *s, const U8 *e) -{ - dTHX; /* The function called below requires thread context */ - - STRLEN actual_len; - - PERL_ARGS_ASSERT__IS_UTF8_CHAR_SLOW; - - assert(e >= s); - utf8n_to_uvchr(s, e - s, &actual_len, UTF8_CHECK_ONLY); - - return (actual_len == (STRLEN) -1) ? 0 : actual_len; -} - -/* =for apidoc valid_utf8_to_uvchr Like L</utf8_to_uvchr_buf>(), but should only be called when it is known that the next character in the input UTF-8 string C<s> is well-formed (I<e.g.>, @@ -3800,10 +3800,11 @@ STATIC SV * S_incpush_if_exists(pTHX_ AV *const av, SV *dir, SV *const stem); # endif #endif #if !defined(PERL_NO_INLINE_FUNCTIONS) -PERL_STATIC_INLINE STRLEN S__is_utf8_char_slow(const U8 *s, const U8 *e) - __attribute__warn_unused_result__; +PERL_CALLCONV STRLEN Perl__is_utf8_char_slow(const U8 * const s, const STRLEN len) + __attribute__warn_unused_result__ + __attribute__pure__; #define PERL_ARGS_ASSERT__IS_UTF8_CHAR_SLOW \ - assert(s); assert(e) + assert(s) PERL_STATIC_INLINE void S_append_utf8_from_native_byte(const U8 byte, U8** dest); #define PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE \ diff --git a/t/lib/warnings/utf8 b/t/lib/warnings/utf8 index 4263c04958..947dea467f 100644 --- a/t/lib/warnings/utf8 +++ b/t/lib/warnings/utf8 @@ -756,7 +756,6 @@ Operation "uc" returns its argument for non-Unicode code point 0x7F+ at - line \ Use of code point 0x80+ is deprecated; the permissible max is 0x7F+ at - line \d+. Operation "uc" returns its argument for non-Unicode code point 0x80+ at - line \d+. Code point 0x7F+ is not Unicode, may not be portable in print at - line \d+. -Use of code point 0x80+ is deprecated; the permissible max is 0x7F+ in print at - line \d+. ######## # NAME [perl #127262] BEGIN{ @@ -337,6 +337,118 @@ Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) /* +A helper function for the macro isUTF8_CHAR(), which should be used instead of +this function. The macro will handle smaller code points directly saving time, +using this function as a fall-back for higher code points. This function +assumes that it is not called with an invariant character, and that +'s + len - 1' is within bounds of the string 's'. + +Tests if the string C<s> of at least length 'len' is a valid variant UTF-8 +character. 0 is returned if not, otherwise, 'len' is returned. + +*/ + +STRLEN +Perl__is_utf8_char_slow(const U8 * const s, const STRLEN len) +{ + const U8 *e; + const U8 *x, *y; + + PERL_ARGS_ASSERT__IS_UTF8_CHAR_SLOW; + + if (UNLIKELY(! UTF8_IS_START(*s))) { + return 0; + } + + e = s + len; + + for (x = s + 1; x < e; x++) { + if (UNLIKELY(! UTF8_IS_CONTINUATION(*x))) { + return 0; + } + } + +#ifndef EBCDIC + + /* Here is syntactically valid. Make sure this isn't the start of an + * overlong. These values were found by manually inspecting the UTF-8 + * patterns. See the tables in utf8.h and utfebcdic.h */ + + /* This is not needed on modern perls where C0 and C1 are not considered + * start bytes. */ +#if 0 + if (UNLIKELY(*s < 0xC2)) { + return 0; + } +#endif + + if (len > 1) { + if ( (*s == 0xE0 && UNLIKELY(s[1] < 0xA0)) + || (*s == 0xF0 && UNLIKELY(s[1] < 0x90)) + || (*s == 0xF8 && UNLIKELY(s[1] < 0x88)) + || (*s == 0xFC && UNLIKELY(s[1] < 0x84)) + || (*s == 0xFE && UNLIKELY(s[1] < 0x82))) + { + return 0; + } + if ((len > 6 && UNLIKELY(*s == 0xFF) && UNLIKELY(s[6] < 0x81))) { + return 0; + } + } + +#else /* For EBCDIC, we use I8, which is the same on all code pages */ + { + const U8 s0 = NATIVE_UTF8_TO_I8(*s); + + /* On modern perls C0-C4 aren't considered start bytes */ + if ( /* s0 < 0xC5 || */ s0 == 0xE0) { + return 0; + } + + if (len >= 1) { + const U8 s1 = NATIVE_UTF8_TO_I8(s[1]); + + if ( (s0 == 0xF0 && UNLIKELY(s1 < 0xB0)) + || (s0 == 0xF8 && UNLIKELY(s1 < 0xA8)) + || (s0 == 0xFC && UNLIKELY(s1 < 0xA4)) + || (s0 == 0xFE && UNLIKELY(s1 < 0x82))) + { + return 0; + } + if ((len > 7 && UNLIKELY(s0 == 0xFF) && UNLIKELY(s[7] < 0xA1))) { + return 0; + } + } + } + +#endif + + /* Now see if this would overflow a UV on this platform. See if the UTF8 + * for this code point is larger than that for the highest representable + * code point */ + y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8; + + for (x = s; x < e; x++, y++) { + + /* If the same at this byte, go on to the next */ + if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) == *y)) { + continue; + } + + /* If this is larger, it overflows */ + if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) > *y)) { + return 0; + } + + /* But if smaller, it won't */ + break; + } + + return len; +} + +/* + =for apidoc utf8n_to_uvchr THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES. @@ -826,13 +826,24 @@ case any call to string overloading updates the internal UTF-8 encoding flag. =for apidoc Am|STRLEN|isUTF8_CHAR|const U8 *s|const U8 *e -Returns the number of bytes beginning at C<s> which form a legal UTF-8 (or -UTF-EBCDIC) encoded character, looking no further than S<C<e - s>> bytes into -C<s>. Returns 0 if the sequence starting at C<s> through S<C<e - 1>> is not -well-formed UTF-8. +Evaluates to non-zero if the first few bytes of the string starting at C<s> and +looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some +code point; otherwise it evaluates to 0. If non-zero, the value gives how many +many bytes starting at C<s> comprise the code point's representation. -Note that an INVARIANT character (i.e. ASCII on non-EBCDIC -machines) is a valid UTF-8 character. +The code point can be any that will fit in a UV on this machine, using Perl's +extension to official UTF-8 to represent those higher than the Unicode maximum +of 0x10FFFF. That means that this macro is used to efficiently decide if the +next few bytes in C<s> is legal UTF-8 for a single character. Use +L</is_utf8_string>(), L</is_utf8_string_loclen>(), and +L</is_utf8_string_loc>() to check entire strings. + +Note that it is deprecated to use code points higher than what will fit in an +IV. This macro does not raise any warnings for such code points, treating them +as valid. + +Note also that a UTF-8 INVARIANT character (i.e. ASCII on non-EBCDIC machines) +is a valid UTF-8 character. =cut */ @@ -845,7 +856,7 @@ machines) is a valid UTF-8 character. ? 0 \ : LIKELY(IS_UTF8_CHAR_FAST(UTF8SKIP(s))) \ ? is_UTF8_CHAR_utf8_no_length_checks(s) \ - : _is_utf8_char_slow(s, e)) + : _is_utf8_char_slow(s, UTF8SKIP(s))) #define is_utf8_char_buf(buf, buf_end) isUTF8_CHAR(buf, buf_end) |