utf8.h: Use machine generated IS_UTF8_CHAR()

This takes the output of regen/regcharclass.pl for all the 1-4 byte UTF-8 representations of Unicode code points, and uses it to replace the current hand-rolled definition in utf8.h. It does this only for ASCII platforms, leaving the EBCDIC version to be machine-generated when run on such a platform. I would rather have both versions regenerated each time they are needed, to avoid an EBCDIC dependency, but it takes more than 10 minutes on my computer to process the 2 billion code points that have to be checked for on ASCII platforms, and currently t/porting/regen.t runs this program every time; that slowdown would be unacceptable. If this is ever run under EBCDIC, the macro should be machine-computed (very slowly). So, even though there is an EBCDIC dependency, it has essentially been solved.
author: Karl Williamson <public@khwilliamson.com> 2012-09-05 20:56:09 -0600
committer: Karl Williamson <public@khwilliamson.com> 2012-09-13 21:14:04 -0600
commit: 4d6461409e812aecb1fa745debb6132ce8e5612d (patch)
tree: 233a2c093d46c73bc151240415219e0e7ed41b11 /utf8.h
parent: ae1d4929d23a3d6949518058aa41cd90a700a4af (diff)
download: perl-4d6461409e812aecb1fa745debb6132ce8e5612d.tar.gz
1 files changed, 47 insertions, 95 deletions
diff --git a/utf8.h b/utf8.h
index a6af5571bf..bf8251a7ce 100644
--- a/utf8.h
+++ b/utf8.h
@@ -451,111 +451,63 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
 	 toLOWER((input)[1]) == 's')
 #define SHARP_S_SKIP 2
 
-#ifndef EBCDIC
 /* If you want to exclude surrogates, and beyond legal Unicode, see the blame
  * log for earlier versions which gave details for these */
-#   define IS_UTF8_CHAR_1(p)	\
-	((p)[0] <= 0x7F)
-#   define IS_UTF8_CHAR_2(p)	\
-	((p)[0] >= 0xC2 && (p)[0] <= 0xDF && \
-	 (p)[1] >= 0x80 && (p)[1] <= 0xBF)
-#   define IS_UTF8_CHAR_3a(p)	\
-	((p)[0] == 0xE0 && \
-	 (p)[1] >= 0xA0 && (p)[1] <= 0xBF && \
-	 (p)[2] >= 0x80 && (p)[2] <= 0xBF)
-#   define IS_UTF8_CHAR_3b(p)	\
-	((p)[0] >= 0xE1 && (p)[0] <= 0xEF && \
-	 (p)[1] >= 0x80 && (p)[1] <= 0xBF && \
-	 (p)[2] >= 0x80 && (p)[2] <= 0xBF)
-#   define IS_UTF8_CHAR_4a(p)	\
-	((p)[0] == 0xF0 && \
-	 (p)[1] >= 0x90 && (p)[1] <= 0xBF && \
-	 (p)[2] >= 0x80 && (p)[2] <= 0xBF && \
-	 (p)[3] >= 0x80 && (p)[3] <= 0xBF)
-/* The 0xF7 allows us to go to 0x1fffff (0x200000 would
- * require five bytes).  Not doing any further code points
- * since that is not needed (and that would not be strict
- * UTF-8, anyway).  The "slow path" in Perl_is_utf8_char()
- * will take care of the "extended UTF-8". */
-#   define IS_UTF8_CHAR_4b(p)	\
-	((p)[0] >= 0xF1 && (p)[0] <= 0xF7 && \
-	 (p)[1] >= 0x80 && (p)[1] <= 0xBF && \
-	 (p)[2] >= 0x80 && (p)[2] <= 0xBF && \
-	 (p)[3] >= 0x80 && (p)[3] <= 0xBF)
-
-#   define IS_UTF8_CHAR_3(p)	\
-	(IS_UTF8_CHAR_3a(p) || \
-	 IS_UTF8_CHAR_3b(p))
-#   define IS_UTF8_CHAR_4(p)	\
-	(IS_UTF8_CHAR_4a(p) || \
-	 IS_UTF8_CHAR_4b(p))
+
+#ifndef EBCDIC
+/* Generated by regen/regcharclass.pl, then moved here, and the lines that
+ * generated it commented out, solely because it takes on the order of 10
+ * minutes to generate and is never going to change.  The EBCDIC equivalent is
+ * not commented out there, so should still be generated on such platforms.
+ * Evaluates to the byte length (1-4) of the character at s if it ends by e,
+ * else 0 */
+/*** GENERATED CODE ***/
+#define is_UTF8_CHAR_utf8_safe(s,e)                                         \
+( ((e)-(s) > 3) ?                                                           \
+    ( ( ( ((U8*)s)[0] & 0x80 ) == 0x00 ) ? 1                                \
+    : ( 0xC2 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xDF ) ?                      \
+	( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ? 2 : 0 )                      \
+    : ( 0xE0 == ((U8*)s)[0] ) ?                                             \
+	( ( ( ( ((U8*)s)[1] & 0xE0 ) == 0xA0 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
+    : ( 0xE1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xEF ) ?                      \
+	( ( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
+    : ( 0xF0 == ((U8*)s)[0] ) ?                                             \
+	( ( ( ( 0x90 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0xBF ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
+    : ( 0xF1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xF7 ) ?                      \
+	( ( ( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
+    : 0 )                                                                   \
+: ((e)-(s) > 2) ?                                                           \
+    ( ( ( ((U8*)s)[0] & 0x80 ) == 0x00 ) ? 1                                \
+    : ( 0xC2 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xDF ) ?                      \
+	( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ? 2 : 0 )                      \
+    : ( 0xE0 == ((U8*)s)[0] ) ?                                             \
+	( ( ( ( ((U8*)s)[1] & 0xE0 ) == 0xA0 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
+    : ( 0xE1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xEF ) ?                      \
+	( ( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
+    : 0 )                                                                   \
+: ((e)-(s) > 1) ?                                                           \
+    ( ( ( ((U8*)s)[0] & 0x80 ) == 0x00 ) ? 1                                \
+    : ( 0xC2 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xDF ) ?                      \
+	( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ? 2 : 0 )                      \
+    : 0 )                                                                   \
+: ((e)-(s) > 0) ?                                                           \
+    ( ( ((U8*)s)[0] & 0x80 ) == 0x00 )                                      \
+: 0 )
+#endif
 
 /* IS_UTF8_CHAR(p) is strictly speaking wrong (not UTF-8) because it
  * (1) allows UTF-8 encoded UTF-16 surrogates
  * (2) it allows code points past U+10FFFF.
  * The Perl_is_utf8_char() full "slow" code will handle the Perl
  * "extended UTF-8". */
-#   define IS_UTF8_CHAR(p, n)	\
-	((n) == 1 ? IS_UTF8_CHAR_1(p) : \
- 	 (n) == 2 ? IS_UTF8_CHAR_2(p) : \
-	 (n) == 3 ? IS_UTF8_CHAR_3(p) : \
-	 (n) == 4 ? IS_UTF8_CHAR_4(p) : 0)
-
-#   define IS_UTF8_CHAR_FAST(n) ((n) <= 4)
-
-#else	/* EBCDIC */
-
-/* This is an attempt to port IS_UTF8_CHAR to EBCDIC based on eyeballing.
- * untested.  If want to exclude surrogates and above-Unicode, see the
- * definitions for UTF8_IS_SURROGATE  and UTF8_IS_SUPER */
-#   define IS_UTF8_CHAR_1(p)	\
-	(NATIVE_TO_ASCII((p)[0]) <= 0x9F)
-#   define IS_UTF8_CHAR_2(p)	\
-	(NATIVE_TO_I8((p)[0]) >= 0xC5 && NATIVE_TO_I8((p)[0]) <= 0xDF && \
-	 NATIVE_TO_I8((p)[1]) >= 0xA0 && NATIVE_TO_I8((p)[1]) <= 0xBF)
-#   define IS_UTF8_CHAR_3(p)	\
-	(NATIVE_TO_I8((p)[0]) == 0xE1 && NATIVE_TO_I8((p)[1]) <= 0xEF && \
-	 NATIVE_TO_I8((p)[1]) >= 0xA0 && NATIVE_TO_I8((p)[1]) <= 0xBF && \
-	 NATIVE_TO_I8((p)[2]) >= 0xA0 && NATIVE_TO_I8((p)[2]) <= 0xBF)
-#   define IS_UTF8_CHAR_4a(p)	\
-	(NATIVE_TO_I8((p)[0]) == 0xF0 && \
-	 NATIVE_TO_I8((p)[1]) >= 0xB0 && NATIVE_TO_I8((p)[1]) <= 0xBF && \
-	 NATIVE_TO_I8((p)[2]) >= 0xA0 && NATIVE_TO_I8((p)[2]) <= 0xBF && \
-	 NATIVE_TO_I8((p)[3]) >= 0xA0 && NATIVE_TO_I8((p)[3]) <= 0xBF)
-#   define IS_UTF8_CHAR_4b(p)	\
-	(NATIVE_TO_I8((p)[0]) >= 0xF1 && NATIVE_TO_I8((p)[0]) <= 0xF7 && \
-	 NATIVE_TO_I8((p)[1]) >= 0xA0 && NATIVE_TO_I8((p)[1]) <= 0xBF && \
-	 NATIVE_TO_I8((p)[2]) >= 0xA0 && NATIVE_TO_I8((p)[2]) <= 0xBF && \
-	 NATIVE_TO_I8((p)[3]) >= 0xA0 && NATIVE_TO_I8((p)[3]) <= 0xBF)
-#   define IS_UTF8_CHAR_5a(p)	\
-	(NATIVE_TO_I8((p)[0]) == 0xF8 && \
-	 NATIVE_TO_I8((p)[1]) >= 0xA8 && NATIVE_TO_I8((p)[1]) <= 0xBF && \
-	 NATIVE_TO_I8((p)[1]) >= 0xA0 && NATIVE_TO_I8((p)[1]) <= 0xBF && \
-	 NATIVE_TO_I8((p)[2]) >= 0xA0 && NATIVE_TO_I8((p)[2]) <= 0xBF && \
-	 NATIVE_TO_I8((p)[3]) >= 0xA0 && NATIVE_TO_I8((p)[3]) <= 0xBF)
-#   define IS_UTF8_CHAR_5b(p)	\
-	 (NATIVE_TO_I8((p)[0]) >= 0xF9 && NATIVE_TO_I8((p)[1]) <= 0xFB && \
-	 NATIVE_TO_I8((p)[1]) >= 0xA0 && NATIVE_TO_I8((p)[1]) <= 0xBF && \
-	 NATIVE_TO_I8((p)[1]) >= 0xA0 && NATIVE_TO_I8((p)[1]) <= 0xBF && \
-	 NATIVE_TO_I8((p)[2]) >= 0xA0 && NATIVE_TO_I8((p)[2]) <= 0xBF && \
-	 NATIVE_TO_I8((p)[3]) >= 0xA0 && NATIVE_TO_I8((p)[3]) <= 0xBF)
-
-#   define IS_UTF8_CHAR_4(p)	\
-	(IS_UTF8_CHAR_4a(p) || \
-	 IS_UTF8_CHAR_4b(p))
-#   define IS_UTF8_CHAR_5(p)	\
-	(IS_UTF8_CHAR_5a(p) || \
-	 IS_UTF8_CHAR_5b(p))
-#   define IS_UTF8_CHAR(p, n)	\
-	((n) == 1 ? IS_UTF8_CHAR_1(p) : \
-	 (n) == 2 ? IS_UTF8_CHAR_2(p) : \
-	 (n) == 3 ? IS_UTF8_CHAR_3(p) : \
-	 (n) == 4 ? IS_UTF8_CHAR_4(p) : \
-	 (n) == 5 ? IS_UTF8_CHAR_5(p) : 0)
+#define IS_UTF8_CHAR(p, n)      (is_UTF8_CHAR_utf8_safe(p, (p) + (n)) == (n))
 
+/* regen/regcharclass.pl generates is_UTF8_CHAR_utf8_safe() macros for up to
+ * this number of bytes, so the limits below must be kept coordinated with it */
+#ifdef EBCDIC
 #   define IS_UTF8_CHAR_FAST(n) ((n) <= 5)
-
-#endif /* IS_UTF8_CHAR() for UTF-8 */
+#else
+#   define IS_UTF8_CHAR_FAST(n) ((n) <= 4)
+#endif
 
 /*
  * Local variables:
author	Karl Williamson <public@khwilliamson.com>	2012-09-05 20:56:09 -0600
committer	Karl Williamson <public@khwilliamson.com>	2012-09-13 21:14:04 -0600
commit	4d6461409e812aecb1fa745debb6132ce8e5612d (patch)
tree	233a2c093d46c73bc151240415219e0e7ed41b11 /utf8.h
parent	ae1d4929d23a3d6949518058aa41cd90a700a4af (diff)
download	perl-4d6461409e812aecb1fa745debb6132ce8e5612d.tar.gz