speed up is_utf8_char()

Message-ID: <429F557E.3090007@gmail.com> p4raw-id: //depot/perl@24687
author: Jarkko Hietaniemi <jhi@iki.fi> 2005-06-03 00:52:46 +0300
committer: Rafael Garcia-Suarez <rgarciasuarez@gmail.com> 2005-06-03 08:08:25 +0000
commit: 3b0fc154d4e77cfb1d426144cb362eb2fa6018f1 (patch)
tree: 148a122517822ca9924e52a4b360d78095134af6 /utf8.h
parent: b432a67249666bce4aa3385263660dc667d150d7 (diff)
download: perl-3b0fc154d4e77cfb1d426144cb362eb2fa6018f1.tar.gz
1 files changed, 70 insertions, 0 deletions
diff --git a/utf8.h b/utf8.h
index c87bbf248c..fb44c8576c 100644
--- a/utf8.h
+++ b/utf8.h
@@ -257,3 +257,73 @@ encoded character.
 	 toLOWER((input)[1]) == 's')
 #endif
 #define SHARP_S_SKIP 2
+
+#define IS_UTF8_CHAR_1(p)	/* 1 byte: U+0000..U+007F */ \
+	((p)[0] < 0x80)
+#define IS_UTF8_CHAR_2(p)	/* 2 bytes: U+0080..U+07FF */ \
+	(0xC2 <= (p)[0] && (p)[0] <= 0xDF && \
+	 0x80 <= (p)[1] && (p)[1] <= 0xBF)
+#define IS_UTF8_CHAR_3a(p)	/* 3 bytes, lead E0: U+0800..U+0FFF */ \
+	(0xE0 == (p)[0] && \
+	 0xA0 <= (p)[1] && (p)[1] <= 0xBF && \
+	 0x80 <= (p)[2] && (p)[2] <= 0xBF)
+#define IS_UTF8_CHAR_3b(p)	/* 3 bytes, lead E1..EC: U+1000..U+CFFF */ \
+	(0xE1 <= (p)[0] && (p)[0] <= 0xEC && \
+	 0x80 <= (p)[1] && (p)[1] <= 0xBF && \
+	 0x80 <= (p)[2] && (p)[2] <= 0xBF)
+#define IS_UTF8_CHAR_3c(p)	/* 3 bytes, lead ED: U+D000..U+DFFF */ \
+	(0xED == (p)[0] && \
+	 0x80 <= (p)[1] && (p)[1] <= 0xBF && \
+	 0x80 <= (p)[2] && (p)[2] <= 0xBF)
+/* To reject UTF-8-encoded surrogates, IS_UTF8_CHAR_3c(p) could
+ * narrow its second-byte test to
+ * 0x80 <= (p)[1] && (p)[1] <= 0x9F. */
+#define IS_UTF8_CHAR_3d(p)	/* 3 bytes, lead EE..EF: U+E000..U+FFFF */ \
+	(0xEE <= (p)[0] && (p)[0] <= 0xEF && \
+	 0x80 <= (p)[1] && (p)[1] <= 0xBF && \
+	 0x80 <= (p)[2] && (p)[2] <= 0xBF)
+#define IS_UTF8_CHAR_4a(p)	/* 4 bytes, lead F0: U+10000..U+3FFFF */ \
+	(0xF0 == (p)[0] && \
+	 0x90 <= (p)[1] && (p)[1] <= 0xBF && \
+	 0x80 <= (p)[2] && (p)[2] <= 0xBF && \
+	 0x80 <= (p)[3] && (p)[3] <= 0xBF)
+#define IS_UTF8_CHAR_4b(p)	/* 4 bytes, lead F1..F3: U+40000..U+FFFFF */ \
+	(0xF1 <= (p)[0] && (p)[0] <= 0xF3 && \
+	 0x80 <= (p)[1] && (p)[1] <= 0xBF && \
+	 0x80 <= (p)[2] && (p)[2] <= 0xBF && \
+	 0x80 <= (p)[3] && (p)[3] <= 0xBF)
+/* In IS_UTF8_CHAR_4c(p) one could use
+ * (p)[0] == 0xF4 && (p)[1] >= 0x80 && (p)[1] <= 0x8F
+ * if one wanted to stop at the Unicode limit U+10FFFF.
+ * The 0xF7 allows us to go to 0x1fffff (0x200000 would
+ * require five bytes).  Not doing any further code points
+ * since that is not needed (and that would not be strict
+ * UTF-8, anyway).  The "slow path" in Perl_is_utf8_char()
+ * will take care of the "extended UTF-8". */
+#define IS_UTF8_CHAR_4c(p)	\
+	((p)[0] >= 0xF4 && (p)[0] <= 0xF7 && \
+	 (p)[1] >= 0x80 && (p)[1] <= 0xBF && \
+	 (p)[2] >= 0x80 && (p)[2] <= 0xBF && \
+	 (p)[3] >= 0x80 && (p)[3] <= 0xBF)
+
+#define IS_UTF8_CHAR_3(p)	/* any of the 3-byte forms 3a..3d */ \
+	(   IS_UTF8_CHAR_3a(p)	\
+	 || IS_UTF8_CHAR_3b(p)	\
+	 || IS_UTF8_CHAR_3c(p)	\
+	 || IS_UTF8_CHAR_3d(p))
+#define IS_UTF8_CHAR_4(p)	/* any of the 4-byte forms 4a..4c */ \
+	(   IS_UTF8_CHAR_4a(p)	\
+	 || IS_UTF8_CHAR_4b(p)	\
+	 || IS_UTF8_CHAR_4c(p))
+
+/* IS_UTF8_CHAR(p, n): does p start a well-formed n-byte character
+ * (n in 1..4; any other n yields 0)?  Looser than strict UTF-8: it
+ * accepts encoded UTF-16 surrogates and code points past U+10FFFF.
+ * Perl's "extended UTF-8" is left to the full "slow" code in
+ * Perl_is_utf8_char(). */
+#define IS_UTF8_CHAR(p, n)	\
+	(((n) == 1 && IS_UTF8_CHAR_1(p)) || \
+	 ((n) == 2 && IS_UTF8_CHAR_2(p)) || \
+	 ((n) == 3 && IS_UTF8_CHAR_3(p)) || \
+	 ((n) == 4 && IS_UTF8_CHAR_4(p)))
+
author	Jarkko Hietaniemi <jhi@iki.fi>	2005-06-03 00:52:46 +0300
committer	Rafael Garcia-Suarez <rgarciasuarez@gmail.com>	2005-06-03 08:08:25 +0000
commit	3b0fc154d4e77cfb1d426144cb362eb2fa6018f1 (patch)
tree	148a122517822ca9924e52a4b360d78095134af6 /utf8.h
parent	b432a67249666bce4aa3385263660dc667d150d7 (diff)
download	perl-3b0fc154d4e77cfb1d426144cb362eb2fa6018f1.tar.gz