summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rwxr-xr-xregen/regcharclass.pl16
-rw-r--r--utf8.h142
2 files changed, 63 insertions, 95 deletions
diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl
index c4f5951a3c..7d126428ef 100755
--- a/regen/regcharclass.pl
+++ b/regen/regcharclass.pl
@@ -1161,6 +1161,22 @@ GCB_V: Grapheme_Cluster_Break=V
=> UTF8 :fast
\p{_X_GCB_V}
+# This program was run with the definition below enabled, and the results were
+# copied to utf8.h; it was then commented out because it takes so long to
+# process these 2 million code points.  The results would not change unless
+# utf8.h decides it wants a maximum other than 4 bytes, or this program
+# creates better optimizations.
+#UTF8_CHAR: Matches utf8 from 1 to 4 bytes
+#=> UTF8 :safe only_ascii_platform
+#0x0 - 0x1FFFFF
+
+# The definition below has not been commented out, because we don't have an
+# EBCDIC platform to run it on, and the 3 types of EBCDIC allegedly supported
+# by Perl would have different results.
+UTF8_CHAR: Matches utf8 from 1 to 5 bytes
+=> UTF8 :safe only_ebcdic_platform
+0x0 - 0x3FFFFF:
+
QUOTEMETA: Meta-characters that \Q should quote
=> high :fast
\p{_Perl_Quotemeta}
diff --git a/utf8.h b/utf8.h
index a6af5571bf..bf8251a7ce 100644
--- a/utf8.h
+++ b/utf8.h
@@ -451,111 +451,63 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
toLOWER((input)[1]) == 's')
#define SHARP_S_SKIP 2
-#ifndef EBCDIC
/* If you want to exclude surrogates, and beyond legal Unicode, see the blame
* log for earlier versions which gave details for these */
-# define IS_UTF8_CHAR_1(p) \
- ((p)[0] <= 0x7F)
-# define IS_UTF8_CHAR_2(p) \
- ((p)[0] >= 0xC2 && (p)[0] <= 0xDF && \
- (p)[1] >= 0x80 && (p)[1] <= 0xBF)
-# define IS_UTF8_CHAR_3a(p) \
- ((p)[0] == 0xE0 && \
- (p)[1] >= 0xA0 && (p)[1] <= 0xBF && \
- (p)[2] >= 0x80 && (p)[2] <= 0xBF)
-# define IS_UTF8_CHAR_3b(p) \
- ((p)[0] >= 0xE1 && (p)[0] <= 0xEF && \
- (p)[1] >= 0x80 && (p)[1] <= 0xBF && \
- (p)[2] >= 0x80 && (p)[2] <= 0xBF)
-# define IS_UTF8_CHAR_4a(p) \
- ((p)[0] == 0xF0 && \
- (p)[1] >= 0x90 && (p)[1] <= 0xBF && \
- (p)[2] >= 0x80 && (p)[2] <= 0xBF && \
- (p)[3] >= 0x80 && (p)[3] <= 0xBF)
-/* The 0xF7 allows us to go to 0x1fffff (0x200000 would
- * require five bytes). Not doing any further code points
- * since that is not needed (and that would not be strict
- * UTF-8, anyway). The "slow path" in Perl_is_utf8_char()
- * will take care of the "extended UTF-8". */
-# define IS_UTF8_CHAR_4b(p) \
- ((p)[0] >= 0xF1 && (p)[0] <= 0xF7 && \
- (p)[1] >= 0x80 && (p)[1] <= 0xBF && \
- (p)[2] >= 0x80 && (p)[2] <= 0xBF && \
- (p)[3] >= 0x80 && (p)[3] <= 0xBF)
-
-# define IS_UTF8_CHAR_3(p) \
- (IS_UTF8_CHAR_3a(p) || \
- IS_UTF8_CHAR_3b(p))
-# define IS_UTF8_CHAR_4(p) \
- (IS_UTF8_CHAR_4a(p) || \
- IS_UTF8_CHAR_4b(p))
+
+#ifndef EBCDIC
+/* Yields the byte length (1-4) of the sequence at s when it passes the checks
+ * below and ends at or before e, else 0.  Generated by regen/regcharclass.pl
+ * and then moved here; the generating lines were then commented out, solely
+ * because generation takes on the order of 10 minutes and the result is never
+ * going to change.  The EBCDIC equivalent is still generated by that script. */
+/*** GENERATED CODE ***/
+#define is_UTF8_CHAR_utf8_safe(s,e) \
+( ((e)-(s) > 3) ? \
+ ( ( ( ((U8*)s)[0] & 0x80 ) == 0x00 ) ? 1 \
+ : ( 0xC2 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xDF ) ? \
+ ( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ? 2 : 0 ) \
+ : ( 0xE0 == ((U8*)s)[0] ) ? \
+ ( ( ( ( ((U8*)s)[1] & 0xE0 ) == 0xA0 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
+ : ( 0xE1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xEF ) ? \
+ ( ( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
+ : ( 0xF0 == ((U8*)s)[0] ) ? \
+ ( ( ( ( 0x90 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0xBF ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
+ : ( 0xF1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xF7 ) ? \
+ ( ( ( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
+ : 0 ) \
+: ((e)-(s) > 2) ? \
+ ( ( ( ((U8*)s)[0] & 0x80 ) == 0x00 ) ? 1 \
+ : ( 0xC2 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xDF ) ? \
+ ( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ? 2 : 0 ) \
+ : ( 0xE0 == ((U8*)s)[0] ) ? \
+ ( ( ( ( ((U8*)s)[1] & 0xE0 ) == 0xA0 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
+ : ( 0xE1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xEF ) ? \
+ ( ( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
+ : 0 ) \
+: ((e)-(s) > 1) ? \
+ ( ( ( ((U8*)s)[0] & 0x80 ) == 0x00 ) ? 1 \
+ : ( 0xC2 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xDF ) ? \
+ ( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ? 2 : 0 ) \
+ : 0 ) \
+: ((e)-(s) > 0) ? \
+ ( ( ((U8*)s)[0] & 0x80 ) == 0x00 ) \
+: 0 )
+#endif /* !EBCDIC */
/* IS_UTF8_CHAR(p) is strictly speaking wrong (not UTF-8) because it
* (1) allows UTF-8 encoded UTF-16 surrogates
* (2) it allows code points past U+10FFFF.
* The Perl_is_utf8_char() full "slow" code will handle the Perl
* "extended UTF-8". */
-# define IS_UTF8_CHAR(p, n) \
- ((n) == 1 ? IS_UTF8_CHAR_1(p) : \
- (n) == 2 ? IS_UTF8_CHAR_2(p) : \
- (n) == 3 ? IS_UTF8_CHAR_3(p) : \
- (n) == 4 ? IS_UTF8_CHAR_4(p) : 0)
-
-# define IS_UTF8_CHAR_FAST(n) ((n) <= 4)
-
-#else /* EBCDIC */
-
-/* This is an attempt to port IS_UTF8_CHAR to EBCDIC based on eyeballing.
- * untested. If want to exclude surrogates and above-Unicode, see the
- * definitions for UTF8_IS_SURROGATE and UTF8_IS_SUPER */
-# define IS_UTF8_CHAR_1(p) \
- (NATIVE_TO_ASCII((p)[0]) <= 0x9F)
-# define IS_UTF8_CHAR_2(p) \
- (NATIVE_TO_I8((p)[0]) >= 0xC5 && NATIVE_TO_I8((p)[0]) <= 0xDF && \
- NATIVE_TO_I8((p)[1]) >= 0xA0 && NATIVE_TO_I8((p)[1]) <= 0xBF)
-# define IS_UTF8_CHAR_3(p) \
- (NATIVE_TO_I8((p)[0]) == 0xE1 && NATIVE_TO_I8((p)[1]) <= 0xEF && \
- NATIVE_TO_I8((p)[1]) >= 0xA0 && NATIVE_TO_I8((p)[1]) <= 0xBF && \
- NATIVE_TO_I8((p)[2]) >= 0xA0 && NATIVE_TO_I8((p)[2]) <= 0xBF)
-# define IS_UTF8_CHAR_4a(p) \
- (NATIVE_TO_I8((p)[0]) == 0xF0 && \
- NATIVE_TO_I8((p)[1]) >= 0xB0 && NATIVE_TO_I8((p)[1]) <= 0xBF && \
- NATIVE_TO_I8((p)[2]) >= 0xA0 && NATIVE_TO_I8((p)[2]) <= 0xBF && \
- NATIVE_TO_I8((p)[3]) >= 0xA0 && NATIVE_TO_I8((p)[3]) <= 0xBF)
-# define IS_UTF8_CHAR_4b(p) \
- (NATIVE_TO_I8((p)[0]) >= 0xF1 && NATIVE_TO_I8((p)[0]) <= 0xF7 && \
- NATIVE_TO_I8((p)[1]) >= 0xA0 && NATIVE_TO_I8((p)[1]) <= 0xBF && \
- NATIVE_TO_I8((p)[2]) >= 0xA0 && NATIVE_TO_I8((p)[2]) <= 0xBF && \
- NATIVE_TO_I8((p)[3]) >= 0xA0 && NATIVE_TO_I8((p)[3]) <= 0xBF)
-# define IS_UTF8_CHAR_5a(p) \
- (NATIVE_TO_I8((p)[0]) == 0xF8 && \
- NATIVE_TO_I8((p)[1]) >= 0xA8 && NATIVE_TO_I8((p)[1]) <= 0xBF && \
- NATIVE_TO_I8((p)[1]) >= 0xA0 && NATIVE_TO_I8((p)[1]) <= 0xBF && \
- NATIVE_TO_I8((p)[2]) >= 0xA0 && NATIVE_TO_I8((p)[2]) <= 0xBF && \
- NATIVE_TO_I8((p)[3]) >= 0xA0 && NATIVE_TO_I8((p)[3]) <= 0xBF)
-# define IS_UTF8_CHAR_5b(p) \
- (NATIVE_TO_I8((p)[0]) >= 0xF9 && NATIVE_TO_I8((p)[1]) <= 0xFB && \
- NATIVE_TO_I8((p)[1]) >= 0xA0 && NATIVE_TO_I8((p)[1]) <= 0xBF && \
- NATIVE_TO_I8((p)[1]) >= 0xA0 && NATIVE_TO_I8((p)[1]) <= 0xBF && \
- NATIVE_TO_I8((p)[2]) >= 0xA0 && NATIVE_TO_I8((p)[2]) <= 0xBF && \
- NATIVE_TO_I8((p)[3]) >= 0xA0 && NATIVE_TO_I8((p)[3]) <= 0xBF)
-
-# define IS_UTF8_CHAR_4(p) \
- (IS_UTF8_CHAR_4a(p) || \
- IS_UTF8_CHAR_4b(p))
-# define IS_UTF8_CHAR_5(p) \
- (IS_UTF8_CHAR_5a(p) || \
- IS_UTF8_CHAR_5b(p))
-# define IS_UTF8_CHAR(p, n) \
- ((n) == 1 ? IS_UTF8_CHAR_1(p) : \
- (n) == 2 ? IS_UTF8_CHAR_2(p) : \
- (n) == 3 ? IS_UTF8_CHAR_3(p) : \
- (n) == 4 ? IS_UTF8_CHAR_4(p) : \
- (n) == 5 ? IS_UTF8_CHAR_5(p) : 0)
+#define IS_UTF8_CHAR(p, n) (is_UTF8_CHAR_utf8_safe((p), (p) + (n)) == (n))
+/* True if n is within the byte count that regen/regcharclass.pl generates
+ * is_UTF8_CHAR_utf8_safe() for; the limits must be coordinated with it */
+#ifdef EBCDIC
# define IS_UTF8_CHAR_FAST(n) ((n) <= 5)
-
-#endif /* IS_UTF8_CHAR() for UTF-8 */
+#else /* !EBCDIC */
+# define IS_UTF8_CHAR_FAST(n) ((n) <= 4)
+#endif /* EBCDIC */
/*
* Local variables: