diff options
author | Karl Williamson <public@khwilliamson.com> | 2012-09-05 20:56:09 -0600 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2012-09-13 21:14:04 -0600 |
commit | 4d6461409e812aecb1fa745debb6132ce8e5612d (patch) | |
tree | 233a2c093d46c73bc151240415219e0e7ed41b11 /utf8.h | |
parent | ae1d4929d23a3d6949518058aa41cd90a700a4af (diff) | |
download | perl-4d6461409e812aecb1fa745debb6132ce8e5612d.tar.gz |
utf8.h: Use machine generated IS_UTF8_CHAR()
This takes the output of regen/regcharclass.pl for all the 1-4 byte
UTF8-representations of Unicode code points, and replaces the current
hand-rolled definition there. It does this only for ASCII platforms,
leaving EBCDIC to be machine generated when run on such a platform.
I would rather have both versions be regenerated each time they are
needed, to avoid an EBCDIC dependency, but it takes more than 10 minutes
on my computer to process the 2 billion code points that have to be
checked on ASCII platforms, and currently t/porting/regen.t runs
this program every time; that slowdown would be unacceptable. If
this is ever run under EBCDIC, the macro should be machine computed
(very slowly). So, even though there is an EBCDIC dependency, it has
essentially been solved.
Diffstat (limited to 'utf8.h')
-rw-r--r-- | utf8.h | 142 |
1 files changed, 47 insertions, 95 deletions
```diff
@@ -451,111 +451,63 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
     toLOWER((input)[1]) == 's')
 #define SHARP_S_SKIP 2
-#ifndef EBCDIC
 /* If you want to exclude surrogates, and beyond legal Unicode, see the blame
  * log for earlier versions which gave details for these */
-# define IS_UTF8_CHAR_1(p) \
-    ((p)[0] <= 0x7F)
-# define IS_UTF8_CHAR_2(p) \
-    ((p)[0] >= 0xC2 && (p)[0] <= 0xDF && \
-     (p)[1] >= 0x80 && (p)[1] <= 0xBF)
-# define IS_UTF8_CHAR_3a(p) \
-    ((p)[0] == 0xE0 && \
-     (p)[1] >= 0xA0 && (p)[1] <= 0xBF && \
-     (p)[2] >= 0x80 && (p)[2] <= 0xBF)
-# define IS_UTF8_CHAR_3b(p) \
-    ((p)[0] >= 0xE1 && (p)[0] <= 0xEF && \
-     (p)[1] >= 0x80 && (p)[1] <= 0xBF && \
-     (p)[2] >= 0x80 && (p)[2] <= 0xBF)
-# define IS_UTF8_CHAR_4a(p) \
-    ((p)[0] == 0xF0 && \
-     (p)[1] >= 0x90 && (p)[1] <= 0xBF && \
-     (p)[2] >= 0x80 && (p)[2] <= 0xBF && \
-     (p)[3] >= 0x80 && (p)[3] <= 0xBF)
-/* The 0xF7 allows us to go to 0x1fffff (0x200000 would
- * require five bytes). Not doing any further code points
- * since that is not needed (and that would not be strict
- * UTF-8, anyway). The "slow path" in Perl_is_utf8_char()
- * will take care of the "extended UTF-8". */
-# define IS_UTF8_CHAR_4b(p) \
-    ((p)[0] >= 0xF1 && (p)[0] <= 0xF7 && \
-     (p)[1] >= 0x80 && (p)[1] <= 0xBF && \
-     (p)[2] >= 0x80 && (p)[2] <= 0xBF && \
-     (p)[3] >= 0x80 && (p)[3] <= 0xBF)
-
-# define IS_UTF8_CHAR_3(p) \
-    (IS_UTF8_CHAR_3a(p) || \
-     IS_UTF8_CHAR_3b(p))
-# define IS_UTF8_CHAR_4(p) \
-    (IS_UTF8_CHAR_4a(p) || \
-     IS_UTF8_CHAR_4b(p))
+
+#ifndef EBCDIC
+/* This was generated by regen/regcharclass.pl, and then moved here. The lines
+ * that generated it were then commented out. This was done solely because it
+ * takes on the order of 10 minutes to generate, and is never going to change.
+ * The EBCDIC equivalent hasn't been commented out in regcharclass.pl, so it
+ * should generate and run the correct stuff */
+/*** GENERATED CODE ***/
+#define is_UTF8_CHAR_utf8_safe(s,e) \
+( ((e)-(s) > 3) ? \
+    ( ( ( ((U8*)s)[0] & 0x80 ) == 0x00 ) ? 1 \
+    : ( 0xC2 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xDF ) ? \
+    ( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ? 2 : 0 ) \
+    : ( 0xE0 == ((U8*)s)[0] ) ? \
+    ( ( ( ( ((U8*)s)[1] & 0xE0 ) == 0xA0 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
+    : ( 0xE1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xEF ) ? \
+    ( ( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
+    : ( 0xF0 == ((U8*)s)[0] ) ? \
+    ( ( ( ( 0x90 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0xBF ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
+    : ( 0xF1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xF7 ) ? \
+    ( ( ( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
+    : 0 ) \
+: ((e)-(s) > 2) ? \
+    ( ( ( ((U8*)s)[0] & 0x80 ) == 0x00 ) ? 1 \
+    : ( 0xC2 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xDF ) ? \
+    ( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ? 2 : 0 ) \
+    : ( 0xE0 == ((U8*)s)[0] ) ? \
+    ( ( ( ( ((U8*)s)[1] & 0xE0 ) == 0xA0 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
+    : ( 0xE1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xEF ) ? \
+    ( ( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
+    : 0 ) \
+: ((e)-(s) > 1) ? \
+    ( ( ( ((U8*)s)[0] & 0x80 ) == 0x00 ) ? 1 \
+    : ( 0xC2 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xDF ) ? \
+    ( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ? 2 : 0 ) \
+    : 0 ) \
+: ((e)-(s) > 0) ? \
+    ( ( ((U8*)s)[0] & 0x80 ) == 0x00 ) \
+: 0 )
+#endif
 /* IS_UTF8_CHAR(p) is strictly speaking wrong (not UTF-8) because it
  * (1) allows UTF-8 encoded UTF-16 surrogates
  * (2) it allows code points past U+10FFFF.
  * The Perl_is_utf8_char() full "slow" code will handle the Perl
  * "extended UTF-8". */
-# define IS_UTF8_CHAR(p, n) \
-    ((n) == 1 ? IS_UTF8_CHAR_1(p) : \
-     (n) == 2 ? IS_UTF8_CHAR_2(p) : \
-     (n) == 3 ? IS_UTF8_CHAR_3(p) : \
-     (n) == 4 ? IS_UTF8_CHAR_4(p) : 0)
-
-# define IS_UTF8_CHAR_FAST(n) ((n) <= 4)
-
-#else /* EBCDIC */
-
-/* This is an attempt to port IS_UTF8_CHAR to EBCDIC based on eyeballing.
- * untested. If want to exclude surrogates and above-Unicode, see the
- * definitions for UTF8_IS_SURROGATE and UTF8_IS_SUPER */
-# define IS_UTF8_CHAR_1(p) \
-    (NATIVE_TO_ASCII((p)[0]) <= 0x9F)
-# define IS_UTF8_CHAR_2(p) \
-    (NATIVE_TO_I8((p)[0]) >= 0xC5 && NATIVE_TO_I8((p)[0]) <= 0xDF && \
-     NATIVE_TO_I8((p)[1]) >= 0xA0 && NATIVE_TO_I8((p)[1]) <= 0xBF)
-# define IS_UTF8_CHAR_3(p) \
-    (NATIVE_TO_I8((p)[0]) == 0xE1 && NATIVE_TO_I8((p)[1]) <= 0xEF && \
-     NATIVE_TO_I8((p)[1]) >= 0xA0 && NATIVE_TO_I8((p)[1]) <= 0xBF && \
-     NATIVE_TO_I8((p)[2]) >= 0xA0 && NATIVE_TO_I8((p)[2]) <= 0xBF)
-# define IS_UTF8_CHAR_4a(p) \
-    (NATIVE_TO_I8((p)[0]) == 0xF0 && \
-     NATIVE_TO_I8((p)[1]) >= 0xB0 && NATIVE_TO_I8((p)[1]) <= 0xBF && \
-     NATIVE_TO_I8((p)[2]) >= 0xA0 && NATIVE_TO_I8((p)[2]) <= 0xBF && \
-     NATIVE_TO_I8((p)[3]) >= 0xA0 && NATIVE_TO_I8((p)[3]) <= 0xBF)
-# define IS_UTF8_CHAR_4b(p) \
-    (NATIVE_TO_I8((p)[0]) >= 0xF1 && NATIVE_TO_I8((p)[0]) <= 0xF7 && \
-     NATIVE_TO_I8((p)[1]) >= 0xA0 && NATIVE_TO_I8((p)[1]) <= 0xBF && \
-     NATIVE_TO_I8((p)[2]) >= 0xA0 && NATIVE_TO_I8((p)[2]) <= 0xBF && \
-     NATIVE_TO_I8((p)[3]) >= 0xA0 && NATIVE_TO_I8((p)[3]) <= 0xBF)
-# define IS_UTF8_CHAR_5a(p) \
-    (NATIVE_TO_I8((p)[0]) == 0xF8 && \
-     NATIVE_TO_I8((p)[1]) >= 0xA8 && NATIVE_TO_I8((p)[1]) <= 0xBF && \
-     NATIVE_TO_I8((p)[1]) >= 0xA0 && NATIVE_TO_I8((p)[1]) <= 0xBF && \
-     NATIVE_TO_I8((p)[2]) >= 0xA0 && NATIVE_TO_I8((p)[2]) <= 0xBF && \
-     NATIVE_TO_I8((p)[3]) >= 0xA0 && NATIVE_TO_I8((p)[3]) <= 0xBF)
-# define IS_UTF8_CHAR_5b(p) \
-    (NATIVE_TO_I8((p)[0]) >= 0xF9 && NATIVE_TO_I8((p)[1]) <= 0xFB && \
-     NATIVE_TO_I8((p)[1]) >= 0xA0 && NATIVE_TO_I8((p)[1]) <= 0xBF && \
-     NATIVE_TO_I8((p)[1]) >= 0xA0 && NATIVE_TO_I8((p)[1]) <= 0xBF && \
-     NATIVE_TO_I8((p)[2]) >= 0xA0 && NATIVE_TO_I8((p)[2]) <= 0xBF && \
-     NATIVE_TO_I8((p)[3]) >= 0xA0 && NATIVE_TO_I8((p)[3]) <= 0xBF)
-
-# define IS_UTF8_CHAR_4(p) \
-    (IS_UTF8_CHAR_4a(p) || \
-     IS_UTF8_CHAR_4b(p))
-# define IS_UTF8_CHAR_5(p) \
-    (IS_UTF8_CHAR_5a(p) || \
-     IS_UTF8_CHAR_5b(p))
-# define IS_UTF8_CHAR(p, n) \
-    ((n) == 1 ? IS_UTF8_CHAR_1(p) : \
-     (n) == 2 ? IS_UTF8_CHAR_2(p) : \
-     (n) == 3 ? IS_UTF8_CHAR_3(p) : \
-     (n) == 4 ? IS_UTF8_CHAR_4(p) : \
-     (n) == 5 ? IS_UTF8_CHAR_5(p) : 0)
+#define IS_UTF8_CHAR(p, n) (is_UTF8_CHAR_utf8_safe(p, (p) + (n)) == n)
+/* regen/regcharclass.pl generates is_UTF8_CHAR_utf8_safe() macros for up to
+ * these number of bytes. So this has to be coordinated with it */
+#ifdef EBCDIC
 # define IS_UTF8_CHAR_FAST(n) ((n) <= 5)
-
-#endif /* IS_UTF8_CHAR() for UTF-8 */
+#else
+# define IS_UTF8_CHAR_FAST(n) ((n) <= 4)
+#endif
 /*
  * Local variables:
```