diff options
-rwxr-xr-x | regen/regcharclass.pl | 16 | ||||
-rw-r--r-- | utf8.h | 142 |
2 files changed, 63 insertions, 95 deletions
diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl index c4f5951a3c..7d126428ef 100755 --- a/regen/regcharclass.pl +++ b/regen/regcharclass.pl @@ -1161,6 +1161,22 @@ GCB_V: Grapheme_Cluster_Break=V => UTF8 :fast \p{_X_GCB_V} +# This program was run with this enabled, and the results copied to utf8.h; +# then this was commented out because it takes so long to figure out these 2 +# million code points. The results would not change unless utf8.h decides it +# wants a maximum other than 4 bytes, or this program creates better +# optimizations +#UTF8_CHAR: Matches utf8 from 1 to 4 bytes +#=> UTF8 :safe only_ascii_platform +#0x0 - 0x1FFFFF + +# This hasn't been commented out, because we haven't an EBCDIC platform to run +# it on, and the 3 types of EBCDIC allegedly supported by Perl would have +# different results +UTF8_CHAR: Matches utf8 from 1 to 5 bytes +=> UTF8 :safe only_ebcdic_platform +0x0 - 0x3FFFFF: + QUOTEMETA: Meta-characters that \Q should quote => high :fast \p{_Perl_Quotemeta} @@ -451,111 +451,63 @@ Perl's extended UTF-8 means we can have start bytes up to FF. toLOWER((input)[1]) == 's') #define SHARP_S_SKIP 2 -#ifndef EBCDIC /* If you want to exclude surrogates, and beyond legal Unicode, see the blame * log for earlier versions which gave details for these */ -# define IS_UTF8_CHAR_1(p) \ - ((p)[0] <= 0x7F) -# define IS_UTF8_CHAR_2(p) \ - ((p)[0] >= 0xC2 && (p)[0] <= 0xDF && \ - (p)[1] >= 0x80 && (p)[1] <= 0xBF) -# define IS_UTF8_CHAR_3a(p) \ - ((p)[0] == 0xE0 && \ - (p)[1] >= 0xA0 && (p)[1] <= 0xBF && \ - (p)[2] >= 0x80 && (p)[2] <= 0xBF) -# define IS_UTF8_CHAR_3b(p) \ - ((p)[0] >= 0xE1 && (p)[0] <= 0xEF && \ - (p)[1] >= 0x80 && (p)[1] <= 0xBF && \ - (p)[2] >= 0x80 && (p)[2] <= 0xBF) -# define IS_UTF8_CHAR_4a(p) \ - ((p)[0] == 0xF0 && \ - (p)[1] >= 0x90 && (p)[1] <= 0xBF && \ - (p)[2] >= 0x80 && (p)[2] <= 0xBF && \ - (p)[3] >= 0x80 && (p)[3] <= 0xBF) -/* The 0xF7 allows us to go to 0x1fffff (0x200000 would - * require five bytes). 
Not doing any further code points - * since that is not needed (and that would not be strict - * UTF-8, anyway). The "slow path" in Perl_is_utf8_char() - * will take care of the "extended UTF-8". */ -# define IS_UTF8_CHAR_4b(p) \ - ((p)[0] >= 0xF1 && (p)[0] <= 0xF7 && \ - (p)[1] >= 0x80 && (p)[1] <= 0xBF && \ - (p)[2] >= 0x80 && (p)[2] <= 0xBF && \ - (p)[3] >= 0x80 && (p)[3] <= 0xBF) - -# define IS_UTF8_CHAR_3(p) \ - (IS_UTF8_CHAR_3a(p) || \ - IS_UTF8_CHAR_3b(p)) -# define IS_UTF8_CHAR_4(p) \ - (IS_UTF8_CHAR_4a(p) || \ - IS_UTF8_CHAR_4b(p)) + +#ifndef EBCDIC +/* This was generated by regen/regcharclass.pl, and then moved here. The lines + * that generated it were then commented out. This was done solely because it + * takes on the order of 10 minutes to generate, and is never going to change. + * The EBCDIC equivalent hasn't been commented out in regcharclass.pl, so it + * should generate and run the correct stuff */ +/*** GENERATED CODE ***/ +#define is_UTF8_CHAR_utf8_safe(s,e) \ +( ((e)-(s) > 3) ? \ + ( ( ( ((U8*)s)[0] & 0x80 ) == 0x00 ) ? 1 \ + : ( 0xC2 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xDF ) ? \ + ( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ? 2 : 0 ) \ + : ( 0xE0 == ((U8*)s)[0] ) ? \ + ( ( ( ( ((U8*)s)[1] & 0xE0 ) == 0xA0 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\ + : ( 0xE1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xEF ) ? \ + ( ( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\ + : ( 0xF0 == ((U8*)s)[0] ) ? \ + ( ( ( ( 0x90 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0xBF ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\ + : ( 0xF1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xF7 ) ? \ + ( ( ( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\ + : 0 ) \ +: ((e)-(s) > 2) ? \ + ( ( ( ((U8*)s)[0] & 0x80 ) == 0x00 ) ? 1 \ + : ( 0xC2 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xDF ) ? \ + ( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ? 
2 : 0 ) \ + : ( 0xE0 == ((U8*)s)[0] ) ? \ + ( ( ( ( ((U8*)s)[1] & 0xE0 ) == 0xA0 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\ + : ( 0xE1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xEF ) ? \ + ( ( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\ + : 0 ) \ +: ((e)-(s) > 1) ? \ + ( ( ( ((U8*)s)[0] & 0x80 ) == 0x00 ) ? 1 \ + : ( 0xC2 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xDF ) ? \ + ( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ? 2 : 0 ) \ + : 0 ) \ +: ((e)-(s) > 0) ? \ + ( ( ((U8*)s)[0] & 0x80 ) == 0x00 ) \ +: 0 ) +#endif /* IS_UTF8_CHAR(p) is strictly speaking wrong (not UTF-8) because it * (1) allows UTF-8 encoded UTF-16 surrogates * (2) it allows code points past U+10FFFF. * The Perl_is_utf8_char() full "slow" code will handle the Perl * "extended UTF-8". */ -# define IS_UTF8_CHAR(p, n) \ - ((n) == 1 ? IS_UTF8_CHAR_1(p) : \ - (n) == 2 ? IS_UTF8_CHAR_2(p) : \ - (n) == 3 ? IS_UTF8_CHAR_3(p) : \ - (n) == 4 ? IS_UTF8_CHAR_4(p) : 0) - -# define IS_UTF8_CHAR_FAST(n) ((n) <= 4) - -#else /* EBCDIC */ - -/* This is an attempt to port IS_UTF8_CHAR to EBCDIC based on eyeballing. - * untested. 
If want to exclude surrogates and above-Unicode, see the - * definitions for UTF8_IS_SURROGATE and UTF8_IS_SUPER */ -# define IS_UTF8_CHAR_1(p) \ - (NATIVE_TO_ASCII((p)[0]) <= 0x9F) -# define IS_UTF8_CHAR_2(p) \ - (NATIVE_TO_I8((p)[0]) >= 0xC5 && NATIVE_TO_I8((p)[0]) <= 0xDF && \ - NATIVE_TO_I8((p)[1]) >= 0xA0 && NATIVE_TO_I8((p)[1]) <= 0xBF) -# define IS_UTF8_CHAR_3(p) \ - (NATIVE_TO_I8((p)[0]) == 0xE1 && NATIVE_TO_I8((p)[1]) <= 0xEF && \ - NATIVE_TO_I8((p)[1]) >= 0xA0 && NATIVE_TO_I8((p)[1]) <= 0xBF && \ - NATIVE_TO_I8((p)[2]) >= 0xA0 && NATIVE_TO_I8((p)[2]) <= 0xBF) -# define IS_UTF8_CHAR_4a(p) \ - (NATIVE_TO_I8((p)[0]) == 0xF0 && \ - NATIVE_TO_I8((p)[1]) >= 0xB0 && NATIVE_TO_I8((p)[1]) <= 0xBF && \ - NATIVE_TO_I8((p)[2]) >= 0xA0 && NATIVE_TO_I8((p)[2]) <= 0xBF && \ - NATIVE_TO_I8((p)[3]) >= 0xA0 && NATIVE_TO_I8((p)[3]) <= 0xBF) -# define IS_UTF8_CHAR_4b(p) \ - (NATIVE_TO_I8((p)[0]) >= 0xF1 && NATIVE_TO_I8((p)[0]) <= 0xF7 && \ - NATIVE_TO_I8((p)[1]) >= 0xA0 && NATIVE_TO_I8((p)[1]) <= 0xBF && \ - NATIVE_TO_I8((p)[2]) >= 0xA0 && NATIVE_TO_I8((p)[2]) <= 0xBF && \ - NATIVE_TO_I8((p)[3]) >= 0xA0 && NATIVE_TO_I8((p)[3]) <= 0xBF) -# define IS_UTF8_CHAR_5a(p) \ - (NATIVE_TO_I8((p)[0]) == 0xF8 && \ - NATIVE_TO_I8((p)[1]) >= 0xA8 && NATIVE_TO_I8((p)[1]) <= 0xBF && \ - NATIVE_TO_I8((p)[1]) >= 0xA0 && NATIVE_TO_I8((p)[1]) <= 0xBF && \ - NATIVE_TO_I8((p)[2]) >= 0xA0 && NATIVE_TO_I8((p)[2]) <= 0xBF && \ - NATIVE_TO_I8((p)[3]) >= 0xA0 && NATIVE_TO_I8((p)[3]) <= 0xBF) -# define IS_UTF8_CHAR_5b(p) \ - (NATIVE_TO_I8((p)[0]) >= 0xF9 && NATIVE_TO_I8((p)[1]) <= 0xFB && \ - NATIVE_TO_I8((p)[1]) >= 0xA0 && NATIVE_TO_I8((p)[1]) <= 0xBF && \ - NATIVE_TO_I8((p)[1]) >= 0xA0 && NATIVE_TO_I8((p)[1]) <= 0xBF && \ - NATIVE_TO_I8((p)[2]) >= 0xA0 && NATIVE_TO_I8((p)[2]) <= 0xBF && \ - NATIVE_TO_I8((p)[3]) >= 0xA0 && NATIVE_TO_I8((p)[3]) <= 0xBF) - -# define IS_UTF8_CHAR_4(p) \ - (IS_UTF8_CHAR_4a(p) || \ - IS_UTF8_CHAR_4b(p)) -# define IS_UTF8_CHAR_5(p) \ - (IS_UTF8_CHAR_5a(p) || \ - 
IS_UTF8_CHAR_5b(p)) -# define IS_UTF8_CHAR(p, n) \ - ((n) == 1 ? IS_UTF8_CHAR_1(p) : \ - (n) == 2 ? IS_UTF8_CHAR_2(p) : \ - (n) == 3 ? IS_UTF8_CHAR_3(p) : \ - (n) == 4 ? IS_UTF8_CHAR_4(p) : \ - (n) == 5 ? IS_UTF8_CHAR_5(p) : 0) +#define IS_UTF8_CHAR(p, n) (is_UTF8_CHAR_utf8_safe(p, (p) + (n)) == (n)) /* true only when the character recognized at p is exactly n bytes long; (n) is parenthesized because n may be an expression */ +/* regen/regcharclass.pl generates is_UTF8_CHAR_utf8_safe() macros for up to + * this number of bytes. So this has to be coordinated with it */ +#ifdef EBCDIC # define IS_UTF8_CHAR_FAST(n) ((n) <= 5) - -#endif /* IS_UTF8_CHAR() for UTF-8 */ +#else +# define IS_UTF8_CHAR_FAST(n) ((n) <= 4) +#endif /* * Local variables: |