diff options
-rw-r--r-- | embed.fnc | 2 | ||||
-rw-r--r-- | embed.h | 1 | ||||
-rw-r--r-- | inline.h | 68 | ||||
-rw-r--r-- | proto.h | 7 | ||||
-rw-r--r-- | regcharclass.h | 2 | ||||
-rwxr-xr-x | regen/regcharclass.pl | 21 | ||||
-rw-r--r-- | utf8.h | 91 |
7 files changed, 82 insertions, 110 deletions
@@ -827,6 +827,8 @@ AbnpdD |STRLEN |is_utf8_char |NN const U8 *s Abmnpd |STRLEN |is_utf8_char_buf|NN const U8 *buf|NN const U8 *buf_end AnidR |Size_t |isUTF8_CHAR|NN const U8 * const s0 \ |NN const U8 * const e +AnidR |Size_t |isSTRICT_UTF8_CHAR |NN const U8 * const s0 \ + |NN const U8 * const e AnmdpR |bool |is_utf8_string |NN const U8 *s|STRLEN len AnidR |bool |is_utf8_string_flags \ |NN const U8 *s|STRLEN len|const U32 flags @@ -259,6 +259,7 @@ #define init_stacks() Perl_init_stacks(aTHX) #define init_tm(a) Perl_init_tm(aTHX_ a) #define intro_my() Perl_intro_my(aTHX) +#define isSTRICT_UTF8_CHAR S_isSTRICT_UTF8_CHAR #define isUTF8_CHAR S_isUTF8_CHAR #define is_c9strict_utf8_string_loclen S_is_c9strict_utf8_string_loclen #define is_lvalue_sub() Perl_is_lvalue_sub(aTHX) @@ -1088,6 +1088,74 @@ S_isUTF8_CHAR(const U8 * const s0, const U8 * const e) /* +=for apidoc isSTRICT_UTF8_CHAR + +Evaluates to non-zero if the first few bytes of the string starting at C<s> and +looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some +Unicode code point completely acceptable for open interchange between all +applications; otherwise it evaluates to 0. If non-zero, the value gives how +many bytes starting at C<s> comprise the code point's representation. Any +bytes remaining before C<e>, but beyond the ones needed to form the first code +point in C<s>, are not examined. + +The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not +be a surrogate nor a non-character code point. Thus this excludes any code +point from Perl's extended UTF-8. + +This is used to efficiently decide if the next few bytes in C<s> is +legal Unicode-acceptable UTF-8 for a single character. + +Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum +#9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable +code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; +and C<L</isUTF8_CHAR_flags>> for a more customized definition. 
+ +Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and +C<L</is_strict_utf8_string_loclen>> to check entire strings. + +=cut + +This uses an adaptation of the tables and algorithm given in +http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive +documentation of the original version. A copyright notice for the original +version is given at the beginning of this file. The Perl adaptation is +documented at the definition of strict_extended_utf8_dfa_tab[]. + +*/ + +PERL_STATIC_INLINE Size_t +S_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e) +{ + const U8 * s = s0; + UV state = 0; + + PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR; + + while (s < e && LIKELY(state != 1)) { + state = strict_utf8_dfa_tab[256 + state + strict_utf8_dfa_tab[*s]]; + + if (state != 0) { + s++; + continue; + } + + return s - s0 + 1; + } + +#ifndef EBCDIC + + /* The dfa above drops out for certain Hanguls; handle them specially */ + if (is_HANGUL_ED_utf8_safe(s0, e)) { + return 3; + } + +#endif + + return 0; +} + +/* + =for apidoc is_strict_utf8_string_loc Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the @@ -1407,6 +1407,13 @@ PERL_CALLCONV bool Perl_io_close(pTHX_ IO* io, GV *gv, bool not_implicit, bool w #define PERL_ARGS_ASSERT_IO_CLOSE \ assert(io) #ifndef PERL_NO_INLINE_FUNCTIONS +PERL_STATIC_INLINE Size_t S_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e) + __attribute__warn_unused_result__; +#define PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR \ + assert(s0); assert(e) +#endif + +#ifndef PERL_NO_INLINE_FUNCTIONS PERL_STATIC_INLINE Size_t S_isUTF8_CHAR(const U8 * const s0, const U8 * const e) __attribute__warn_unused_result__; #define PERL_ARGS_ASSERT_ISUTF8_CHAR \ diff --git a/regcharclass.h b/regcharclass.h index bded7d53cc..3c2a94cb66 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -1910,6 +1910,6 @@ * 6aaacc29ce24746bcb2bf82a920fcf90e07cf92d75325199c50f40754d39bb72 lib/unicore/mktables * 
21653d2744fdd071f9ef138c805393901bb9547cf3e777ebf50215a191f986ea lib/unicore/version * 4bb677187a1a64e39d48f2e341b5ecb6c99857e49d7a79cf503bd8a3c709999b regen/charset_translations.pl - * 0a1a1fad4b43cd9338269aa8cd46d246a33546c5409aa7e75a147e5350cd39ee regen/regcharclass.pl + * 10e68a929fd4a2cb367c0caf2f82104de6b1c769a4fc4db8a5d87408c6b3c07b regen/regcharclass.pl * 393f8d882713a3ba227351ad0f00ea4839fda74fcf77dcd1cdf31519925adba5 regen/regcharclass_multi_char_folds.pl * ex: set ro: */ diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl index 4884d1abf1..3dee00060b 100755 --- a/regen/regcharclass.pl +++ b/regen/regcharclass.pl @@ -1653,27 +1653,6 @@ SURROGATE: Surrogate code points # that includes all Unicode code points. # #STRICT_UTF8_CHAR: Matches legal Unicode UTF-8 variant code points, no surrrogates nor non-character code points -#0x0080 - 0xD7FF -#0xE000 - 0xFDCF -#0xFDF0 - 0xFFFD -#0x10000 - 0x1FFFD -#0x20000 - 0x2FFFD -#0x30000 - 0x3FFFD -#0x40000 - 0x4FFFD -#0x50000 - 0x5FFFD -#0x60000 - 0x6FFFD -#0x70000 - 0x7FFFD -#0x80000 - 0x8FFFD -#0x90000 - 0x9FFFD -#0xA0000 - 0xAFFFD -#0xB0000 - 0xBFFFD -#0xC0000 - 0xCFFFD -#0xD0000 - 0xDFFFD -#0xE0000 - 0xEFFFD -#0xF0000 - 0xFFFFD -#0x100000 - 0x10FFFD -# -#STRICT_UTF8_CHAR: Matches legal Unicode UTF-8 variant code points, no surrrogates nor non-character code points #=> UTF8 :no_length_checks only_ebcdic_platform #0x00A0 - 0xD7FF #0xE000 - 0xFDCF @@ -316,58 +316,12 @@ C<cp> is Unicode if above 255; otherwise is platform-native. #define isUTF8_POSSIBLY_PROBLEMATIC(c) (__ASSERT_(FITS_IN_8_BITS(c)) \ (U8) c >= 0xED) -/* A helper macro for isSTRICT_UTF8_CHAR, so use that one instead of this. +/* A helper macro for isC9_STRICT_UTF8_CHAR, so use that one instead of this. * Like is_UTF8_CHAR_utf8_no_length_checks(), this was moved here and LIKELYs * added manually. 
* - STRICT_UTF8_CHAR: Matches legal Unicode UTF-8 variant code points, no - surrrogates nor non-character code points -*/ -/*** GENERATED CODE ***/ -#define is_STRICT_UTF8_CHAR_utf8_no_length_checks(s) \ -( ( 0xC2 <= ((const U8*)s)[0] && ((const U8*)s)[0] <= 0xDF ) ? \ - ( LIKELY( ( ((const U8*)s)[1] & 0xC0 ) == 0x80 ) ? 2 : 0 ) \ -: ( 0xE0 == ((const U8*)s)[0] ) ? \ - ( LIKELY( ( ( ((const U8*)s)[1] & 0xE0 ) == 0xA0 ) && ( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\ -: ( ( 0xE1 <= ((const U8*)s)[0] && ((const U8*)s)[0] <= 0xEC ) || 0xEE == ((const U8*)s)[0] ) ?\ - ( ( ( ( ((const U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\ -: ( 0xED == ((const U8*)s)[0] ) ? \ - ( LIKELY( ( ( ((const U8*)s)[1] & 0xE0 ) == 0x80 ) && ( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\ -: ( 0xEF == ((const U8*)s)[0] ) ? \ - ( ( ( 0x80 <= ((const U8*)s)[1] && ((const U8*)s)[1] <= 0xB6 ) || ( 0xB8 <= ((const U8*)s)[1] && ((const U8*)s)[1] <= 0xBE ) ) ?\ - ( LIKELY( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) ? 3 : 0 ) \ - : ( 0xB7 == ((const U8*)s)[1] ) ? \ - ( LIKELY( ( ((const U8*)s)[2] & 0xF0 ) == 0x80 || ( ((const U8*)s)[2] & 0xF0 ) == 0xB0 ) ? 3 : 0 )\ - : ( ( 0xBF == ((const U8*)s)[1] ) && ( 0x80 <= ((const U8*)s)[2] && ((const U8*)s)[2] <= 0xBD ) ) ? 3 : 0 )\ -: ( 0xF0 == ((const U8*)s)[0] ) ? \ - ( ( ( 0x90 <= ((const U8*)s)[1] && ((const U8*)s)[1] <= 0x9E ) || ( 0xA0 <= ((const U8*)s)[1] && ((const U8*)s)[1] <= 0xAE ) || ( 0xB0 <= ((const U8*)s)[1] && ((const U8*)s)[1] <= 0xBE ) ) ?\ - ( LIKELY( ( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) && ( ( ((const U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\ - : ( ((const U8*)s)[1] == 0x9F || ( ( ((const U8*)s)[1] & 0xEF ) == 0xAF ) ) ? \ - ( ( 0x80 <= ((const U8*)s)[2] && ((const U8*)s)[2] <= 0xBE ) ? \ - ( LIKELY( ( ((const U8*)s)[3] & 0xC0 ) == 0x80 ) ? 4 : 0 ) \ - : LIKELY( ( 0xBF == ((const U8*)s)[2] ) && ( 0x80 <= ((const U8*)s)[3] && ((const U8*)s)[3] <= 0xBD ) ) ? 
4 : 0 )\ - : 0 ) \ -: ( 0xF1 <= ((const U8*)s)[0] && ((const U8*)s)[0] <= 0xF3 ) ? \ - ( ( ( ( ((const U8*)s)[1] & 0xC8 ) == 0x80 ) || ( ( ((const U8*)s)[1] & 0xCC ) == 0x88 ) || ( ( ((const U8*)s)[1] & 0xCE ) == 0x8C ) || ( ( ((const U8*)s)[1] & 0xCF ) == 0x8E ) ) ?\ - ( LIKELY( ( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) && ( ( ((const U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\ - : ( ( ((const U8*)s)[1] & 0xCF ) == 0x8F ) ? \ - ( ( 0x80 <= ((const U8*)s)[2] && ((const U8*)s)[2] <= 0xBE ) ? \ - ( LIKELY( ( ((const U8*)s)[3] & 0xC0 ) == 0x80 ) ? 4 : 0 ) \ - : LIKELY( ( 0xBF == ((const U8*)s)[2] ) && ( 0x80 <= ((const U8*)s)[3] && ((const U8*)s)[3] <= 0xBD ) ) ? 4 : 0 )\ - : 0 ) \ -: ( 0xF4 == ((const U8*)s)[0] ) ? \ - ( ( 0x80 <= ((const U8*)s)[1] && ((const U8*)s)[1] <= 0x8E ) ? \ - ( LIKELY( ( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) && ( ( ((const U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\ - : ( 0x8F == ((const U8*)s)[1] ) ? \ - ( ( 0x80 <= ((const U8*)s)[2] && ((const U8*)s)[2] <= 0xBE ) ? \ - ( LIKELY( ( ((const U8*)s)[3] & 0xC0 ) == 0x80 ) ? 4 : 0 ) \ - : LIKELY( ( 0xBF == ((const U8*)s)[2] ) && ( 0x80 <= ((const U8*)s)[3] && ((const U8*)s)[3] <= 0xBD ) ) ? 4 : 0 )\ - : 0 ) \ -: 0 ) - -/* Similarly, - C9_STRICT_UTF8_CHAR: Matches legal Unicode UTF-8 variant code - points, no surrogates + C9_STRICT_UTF8_CHAR: Matches legal Unicode UTF-8 variant code points, + no surrogates 0x0080 - 0xD7FF 0xE000 - 0x10FFFF */ @@ -1001,45 +955,6 @@ point's representation. /* -=for apidoc Am|STRLEN|isSTRICT_UTF8_CHAR|const U8 *s|const U8 *e - -Evaluates to non-zero if the first few bytes of the string starting at C<s> and -looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some -Unicode code point completely acceptable for open interchange between all -applications; otherwise it evaluates to 0. If non-zero, the value gives how -many bytes starting at C<s> comprise the code point's representation. 
Any -bytes remaining before C<e>, but beyond the ones needed to form the first code -point in C<s>, are not examined. - -The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not -be a surrogate nor a non-character code point. Thus this excludes any code -point from Perl's extended UTF-8. - -This is used to efficiently decide if the next few bytes in C<s> is -legal Unicode-acceptable UTF-8 for a single character. - -Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum -#9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable -code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; -and C<L</isUTF8_CHAR_flags>> for a more customized definition. - -Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and -C<L</is_strict_utf8_string_loclen>> to check entire strings. - -=cut -*/ - -#define isSTRICT_UTF8_CHAR(s, e) \ - (UNLIKELY((e) <= (s)) \ - ? 0 \ - : (UTF8_IS_INVARIANT(*s)) \ - ? 1 \ - : UNLIKELY(((e) - (s)) < UTF8SKIP(s)) \ - ? 0 \ - : is_STRICT_UTF8_CHAR_utf8_no_length_checks(s)) - -/* - =for apidoc Am|STRLEN|isC9_STRICT_UTF8_CHAR|const U8 *s|const U8 *e Evaluates to non-zero if the first few bytes of the string starting at C<s> and |