summaryrefslogtreecommitdiff
path: root/utfebcdic.h
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2016-09-03 14:12:27 -0600
committerKarl Williamson <khw@cpan.org>2016-09-17 17:22:25 -0600
commit784d4f31222f1bf7421b1aab87276f4878d60363 (patch)
treea5e9004f9729bc99c19ccc866f834e83bcc8a3fc /utfebcdic.h
parent21cb232c014d719d883ed0f8d6185dc36037859e (diff)
downloadperl-784d4f31222f1bf7421b1aab87276f4878d60363.tar.gz
isUTF8_CHAR(): Bring UTF-EBCDIC to parity with ASCII
This changes the macro isUTF8_CHAR to have the same number of code points built-in for EBCDIC as for ASCII. This obsoletes the IS_UTF8_CHAR_FAST macro, which is removed. Previously, the code generated by regen/regcharclass.pl for ASCII platforms was hand-copied into utf8.h, LIKELY()s were manually added, and then the generating code was commented out. Now the same has been done for EBCDIC platforms as well. This makes regenerating regcharclass.h faster. The copied macro in utf8.h is moved by this commit to within the main code section for non-EBCDIC compiles, cutting down the number of #ifdefs, and the comments about it are changed somewhat.
Diffstat (limited to 'utfebcdic.h')
-rw-r--r--utfebcdic.h51
1 files changed, 51 insertions, 0 deletions
diff --git a/utfebcdic.h b/utfebcdic.h
index a6078188f8..7d37fbccdb 100644
--- a/utfebcdic.h
+++ b/utfebcdic.h
@@ -275,6 +275,57 @@ explicitly forbidden, and the shortest possible encoding should always be used
# define HIGHEST_REPRESENTABLE_UTF8 "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA3\xBF\xBF\xBF\xBF\xBF\xBF"
#endif
+/* A helper macro for isUTF8_CHAR, so use that one instead of this. This was
+ * generated by regen/regcharclass.pl, and then moved here. Then it was
+ * hand-edited to add some LIKELY() calls, presuming that malformations are
+ * unlikely. The lines that generated it were then commented out. This was
+ * done because it takes on the order of 10 minutes to generate, and is never
+ * going to change, unless the generated code is improved, and figuring out
+ * the LIKELYs there would be hard.
+ *
+ UTF8_CHAR: Matches legal UTF-EBCDIC variant code points up through 0x1FFFFF
+
+ 0xA0 - 0x1FFFFF
+*/
+#if '^' == 95 /* CP 1047 */
+
+/*** GENERATED CODE ***
+ *
+ * is_UTF8_CHAR_utf8_no_length_checks(s) -- CP 1047 byte values.
+ *
+ * Evaluates to the byte length (2, 3, 4 or 5) of the multi-byte character
+ * starting at s, or to 0 when the bytes there match none of the accepted
+ * patterns.  The first byte, s[0], selects the branch and therefore the
+ * length; every following byte must fall in the continuation ranges spelled
+ * out in that branch.  A first byte that is not listed in any branch yields
+ * 0.
+ *
+ * The start bytes 0xDC (4-byte form) and 0xED (5-byte form) accept a narrower
+ * range for s[1] than the other start bytes of the same length; presumably
+ * this excludes overlong encodings -- confirm against the output of
+ * regen/regcharclass.pl.
+ *
+ * Caveats for callers:
+ *  - As the name says, no length checks are made: the expansion indexes as
+ *    far as s[4], so the caller is expected to guarantee those bytes may be
+ *    read (TODO confirm how isUTF8_CHAR ensures this).
+ *  - s is expanded many times, and is not parenthesized after the (U8*)
+ *    cast, so pass a plain pointer expression without side effects.
+ *  - The LIKELY() wrappers were added by hand and only mark the well-formed
+ *    path as the expected one.
+ */
+#define is_UTF8_CHAR_utf8_no_length_checks(s) \
+( ( 0x80 == ((U8*)s)[0] || ( 0x8A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0x90 ) || ( 0x9A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xA0 ) || ( 0xAA <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xAC ) || ( 0xAE <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xB6 ) ) ?\
+ ( LIKELY( ( 0x41 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x4A ) || ( 0x51 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x59 ) || ( 0x62 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x6A ) || ( ((U8*)s)[1] & 0xFC ) == 0x70 ) ? 2 : 0 )\
+: ( ( ( ((U8*)s)[0] & 0xFC ) == 0xB8 ) || ((U8*)s)[0] == 0xBC || ( ( ((U8*)s)[0] & 0xFE ) == 0xBE ) || ( ( ((U8*)s)[0] & 0xEE ) == 0xCA ) || ( ( ((U8*)s)[0] & 0xFC ) == 0xCC ) ) ?\
+ ( LIKELY( ( ( 0x41 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x4A ) || ( 0x51 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x59 ) || ( 0x62 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x6A ) || ( ((U8*)s)[1] & 0xFC ) == 0x70 ) && ( ( 0x41 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x4A ) || ( 0x51 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x59 ) || ( 0x62 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x6A ) || ( ((U8*)s)[2] & 0xFC ) == 0x70 ) ) ? 3 : 0 )\
+: ( 0xDC == ((U8*)s)[0] ) ? \
+ ( LIKELY( ( ( ( 0x57 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x59 ) || ( 0x62 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x6A ) || ( ((U8*)s)[1] & 0xFC ) == 0x70 ) && ( ( 0x41 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x4A ) || ( 0x51 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x59 ) || ( 0x62 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x6A ) || ( ((U8*)s)[2] & 0xFC ) == 0x70 ) ) && ( ( 0x41 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x4A ) || ( 0x51 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x59 ) || ( 0x62 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x6A ) || ( ((U8*)s)[3] & 0xFC ) == 0x70 ) ) ? 4 : 0 )\
+: ( ( 0xDD <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xDF ) || 0xE1 == ((U8*)s)[0] || ( 0xEA <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xEC ) ) ?\
+ ( LIKELY( ( ( ( 0x41 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x4A ) || ( 0x51 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x59 ) || ( 0x62 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x6A ) || ( ((U8*)s)[1] & 0xFC ) == 0x70 ) && ( ( 0x41 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x4A ) || ( 0x51 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x59 ) || ( 0x62 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x6A ) || ( ((U8*)s)[2] & 0xFC ) == 0x70 ) ) && ( ( 0x41 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x4A ) || ( 0x51 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x59 ) || ( 0x62 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x6A ) || ( ((U8*)s)[3] & 0xFC ) == 0x70 ) ) ? 4 : 0 )\
+: ( 0xED == ((U8*)s)[0] ) ? \
+ ( LIKELY( ( ( ( ( 0x49 == ((U8*)s)[1] || 0x4A == ((U8*)s)[1] ) || ( 0x51 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x59 ) || ( 0x62 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x6A ) || ( ((U8*)s)[1] & 0xFC ) == 0x70 ) && ( ( 0x41 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x4A ) || ( 0x51 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x59 ) || ( 0x62 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x6A ) || ( ((U8*)s)[2] & 0xFC ) == 0x70 ) ) && ( ( 0x41 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x4A ) || ( 0x51 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x59 ) || ( 0x62 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x6A ) || ( ((U8*)s)[3] & 0xFC ) == 0x70 ) ) && ( ( 0x41 <= ((U8*)s)[4] && ((U8*)s)[4] <= 0x4A ) || ( 0x51 <= ((U8*)s)[4] && ((U8*)s)[4] <= 0x59 ) || ( 0x62 <= ((U8*)s)[4] && ((U8*)s)[4] <= 0x6A ) || ( ((U8*)s)[4] & 0xFC ) == 0x70 ) ) ? 5 : 0 )\
+: ( ( ( ( ( 0xEE == ((U8*)s)[0] ) && LIKELY( ( 0x41 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x4A ) || ( 0x51 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x59 ) || ( 0x62 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x6A ) || ( ((U8*)s)[1] & 0xFC ) == 0x70 ) ) && LIKELY( ( 0x41 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x4A ) || ( 0x51 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x59 ) || ( 0x62 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x6A ) || ( ((U8*)s)[2] & 0xFC ) == 0x70 ) ) && LIKELY( ( 0x41 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x4A ) || ( 0x51 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x59 ) || ( 0x62 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x6A ) || ( ((U8*)s)[3] & 0xFC ) == 0x70 ) ) && LIKELY( ( 0x41 <= ((U8*)s)[4] && ((U8*)s)[4] <= 0x4A ) || ( 0x51 <= ((U8*)s)[4] && ((U8*)s)[4] <= 0x59 ) || ( 0x62 <= ((U8*)s)[4] && ((U8*)s)[4] <= 0x6A ) || ( ((U8*)s)[4] & 0xFC ) == 0x70 ) ) ? 5 : 0 )
+
+#endif
+
+#if '^' == 176 /* CP 037 */
+
+/*** GENERATED CODE ***
+ *
+ * is_UTF8_CHAR_utf8_no_length_checks(s) -- CP 037 byte values.
+ *
+ * Evaluates to the byte length (2, 3, 4 or 5) of the multi-byte character
+ * starting at s, or to 0 when the bytes there match none of the accepted
+ * patterns.  The first byte, s[0], selects the branch and therefore the
+ * length; every following byte must fall in the continuation ranges spelled
+ * out in that branch (for this code page these include the single value
+ * 0x5F and the range 0x70 through 0x72).  A first byte that is not listed in
+ * any branch yields 0.
+ *
+ * The start bytes 0xDC (4-byte form) and 0xED (5-byte form) accept a narrower
+ * range for s[1] than the other start bytes of the same length; presumably
+ * this excludes overlong encodings -- confirm against the output of
+ * regen/regcharclass.pl.
+ *
+ * Caveats for callers:
+ *  - As the name says, no length checks are made: the expansion indexes as
+ *    far as s[4], so the caller is expected to guarantee those bytes may be
+ *    read (TODO confirm how isUTF8_CHAR ensures this).
+ *  - s is expanded many times, and is not parenthesized after the (U8*)
+ *    cast, so pass a plain pointer expression without side effects.
+ *  - The LIKELY() wrappers were added by hand and only mark the well-formed
+ *    path as the expected one.
+ */
+#define is_UTF8_CHAR_utf8_no_length_checks(s) \
+( ( 0x78 == ((U8*)s)[0] || 0x80 == ((U8*)s)[0] || ( 0x8A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0x90 ) || ( 0x9A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xA0 ) || ( 0xAA <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xAF ) || ( 0xB1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xB5 ) ) ?\
+ ( LIKELY( ( 0x41 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x4A ) || ( 0x51 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x59 ) || 0x5F == ((U8*)s)[1] || ( 0x62 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x6A ) || ( 0x70 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x72 ) ) ? 2 : 0 )\
+: ( ((U8*)s)[0] == 0xB7 || ( ( ((U8*)s)[0] & 0xFE ) == 0xB8 ) || ( ( ((U8*)s)[0] & 0xFC ) == 0xBC ) || ( ( ((U8*)s)[0] & 0xEE ) == 0xCA ) || ( ( ((U8*)s)[0] & 0xFC ) == 0xCC ) ) ?\
+ ( LIKELY( ( ( 0x41 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x4A ) || ( 0x51 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x59 ) || 0x5F == ((U8*)s)[1] || ( 0x62 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x6A ) || ( 0x70 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x72 ) ) && ( ( 0x41 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x4A ) || ( 0x51 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x59 ) || 0x5F == ((U8*)s)[2] || ( 0x62 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x6A ) || ( 0x70 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x72 ) ) ) ? 3 : 0 )\
+: ( 0xDC == ((U8*)s)[0] ) ? \
+ ( LIKELY( ( ( ( 0x57 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x59 ) || 0x5F == ((U8*)s)[1] || ( 0x62 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x6A ) || ( 0x70 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x72 ) ) && ( ( 0x41 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x4A ) || ( 0x51 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x59 ) || 0x5F == ((U8*)s)[2] || ( 0x62 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x6A ) || ( 0x70 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x72 ) ) ) && ( ( 0x41 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x4A ) || ( 0x51 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x59 ) || 0x5F == ((U8*)s)[3] || ( 0x62 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x6A ) || ( 0x70 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x72 ) ) ) ? 4 : 0 )\
+: ( ( 0xDD <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xDF ) || 0xE1 == ((U8*)s)[0] || ( 0xEA <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xEC ) ) ?\
+ ( LIKELY( ( ( ( 0x41 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x4A ) || ( 0x51 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x59 ) || 0x5F == ((U8*)s)[1] || ( 0x62 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x6A ) || ( 0x70 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x72 ) ) && ( ( 0x41 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x4A ) || ( 0x51 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x59 ) || 0x5F == ((U8*)s)[2] || ( 0x62 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x6A ) || ( 0x70 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x72 ) ) ) && ( ( 0x41 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x4A ) || ( 0x51 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x59 ) || 0x5F == ((U8*)s)[3] || ( 0x62 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x6A ) || ( 0x70 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x72 ) ) ) ? 4 : 0 )\
+: ( 0xED == ((U8*)s)[0] ) ? \
+ ( LIKELY( ( ( ( ( 0x49 == ((U8*)s)[1] || 0x4A == ((U8*)s)[1] ) || ( 0x51 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x59 ) || 0x5F == ((U8*)s)[1] || ( 0x62 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x6A ) || ( 0x70 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x72 ) ) && ( ( 0x41 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x4A ) || ( 0x51 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x59 ) || 0x5F == ((U8*)s)[2] || ( 0x62 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x6A ) || ( 0x70 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x72 ) ) ) && ( ( 0x41 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x4A ) || ( 0x51 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x59 ) || 0x5F == ((U8*)s)[3] || ( 0x62 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x6A ) || ( 0x70 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x72 ) ) ) && ( ( 0x41 <= ((U8*)s)[4] && ((U8*)s)[4] <= 0x4A ) || ( 0x51 <= ((U8*)s)[4] && ((U8*)s)[4] <= 0x59 ) || 0x5F == ((U8*)s)[4] || ( 0x62 <= ((U8*)s)[4] && ((U8*)s)[4] <= 0x6A ) || ( 0x70 <= ((U8*)s)[4] && ((U8*)s)[4] <= 0x72 ) ) ) ? 5 : 0 )\
+: ( ( ( ( ( 0xEE == ((U8*)s)[0] ) && LIKELY( ( 0x41 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x4A ) || ( 0x51 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x59 ) || 0x5F == ((U8*)s)[1] || ( 0x62 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x6A ) || ( 0x70 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x72 ) ) ) && LIKELY( ( 0x41 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x4A ) || ( 0x51 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x59 ) || 0x5F == ((U8*)s)[2] || ( 0x62 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x6A ) || ( 0x70 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x72 ) ) ) && LIKELY( ( 0x41 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x4A ) || ( 0x51 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x59 ) || 0x5F == ((U8*)s)[3] || ( 0x62 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x6A ) || ( 0x70 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x72 ) ) ) && LIKELY( ( 0x41 <= ((U8*)s)[4] && ((U8*)s)[4] <= 0x4A ) || ( 0x51 <= ((U8*)s)[4] && ((U8*)s)[4] <= 0x59 ) || 0x5F == ((U8*)s)[4] || ( 0x62 <= ((U8*)s)[4] && ((U8*)s)[4] <= 0x6A ) || ( 0x70 <= ((U8*)s)[4] && ((U8*)s)[4] <= 0x72 ) ) ) ? 5 : 0 )
+
+#endif
+
+/* In both code pages, the macro above only handles characters whose start
+ * byte, expressed in I8 rather than in the native code page, is no higher
+ * than this value. */
+#define _IS_UTF8_CHAR_HIGHEST_START_BYTE 0xF9
/*
* ex: set ts=8 sts=4 sw=4 et: