summaryrefslogtreecommitdiff
path: root/utf8.h
diff options
context:
space:
mode:
authorKarl Williamson <public@khwilliamson.com>2011-11-21 16:28:52 -0700
committerKarl Williamson <public@khwilliamson.com>2011-11-21 17:20:23 -0700
commit2f454f119a96b1967d2f23688383ce7c84a497d6 (patch)
tree95145125a2d8f18f83dc55fa08349b339002704e /utf8.h
parentc78f6c49c89ea5b4fb76bb0e714940a5123e8f1d (diff)
downloadperl-2f454f119a96b1967d2f23688383ce7c84a497d6.tar.gz
utf8.h: define IS_UTF8_CHAR for EBCDIC
This is based on my eyeballing a file I had generated of the encodings for Unicode code points, so it could be wrong. It does compile.
Diffstat (limited to 'utf8.h')
-rw-r--r--utf8.h56
1 files changed, 53 insertions, 3 deletions
diff --git a/utf8.h b/utf8.h
index 5c7b513a3b..c77079d47c 100644
--- a/utf8.h
+++ b/utf8.h
@@ -458,9 +458,7 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
toLOWER((input)[1]) == 's')
#define SHARP_S_SKIP 2
-#ifdef EBCDIC
-/* IS_UTF8_CHAR() is not ported to EBCDIC */
-#else
+#ifndef EBCDIC
# define IS_UTF8_CHAR_1(p) \
((p)[0] <= 0x7F)
# define IS_UTF8_CHAR_2(p) \
@@ -532,6 +530,58 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
# define IS_UTF8_CHAR_FAST(n) ((n) <= 4)
+#else /* EBCDIC */
+
+/* This is an attempt to port IS_UTF8_CHAR to EBCDIC, based on eyeballing;
+ * it is untested.  If you want to exclude surrogates and above-Unicode code
+ * points, see the definitions of UTF8_IS_SURROGATE and UTF8_IS_SUPER */
+/* One-byte case of IS_UTF8_CHAR(): true when the byte at (p)[0], once
+ * translated by NATIVE_TO_ASCII, is no larger than 0x9F. */
+# define IS_UTF8_CHAR_1(p) \
+ (0x9F >= NATIVE_TO_ASCII((p)[0]))
+/* Two-byte case of IS_UTF8_CHAR(): after NATIVE_TO_I8 translation, (p)[0]
+ * must fall in 0xC5..0xDF and (p)[1] in 0xA0..0xBF.  (p)[1] is only looked at
+ * once (p)[0] has passed. */
+# define IS_UTF8_CHAR_2(p) \
+ (0xC5 <= NATIVE_TO_I8((p)[0]) && 0xDF >= NATIVE_TO_I8((p)[0]) \
+ && 0xA0 <= NATIVE_TO_I8((p)[1]) && 0xBF >= NATIVE_TO_I8((p)[1]))
+/* Three-byte case of IS_UTF8_CHAR().  After NATIVE_TO_I8 translation the
+ * start byte (p)[0] must lie in 0xE1..0xEF -- both bounds are tested on
+ * (p)[0] -- and each of the two following bytes must lie in 0xA0..0xBF.
+ * Later bytes are only examined once the earlier ones have passed. */
+# define IS_UTF8_CHAR_3(p) \
+ (NATIVE_TO_I8((p)[0]) >= 0xE1 && NATIVE_TO_I8((p)[0]) <= 0xEF && \
+ NATIVE_TO_I8((p)[1]) >= 0xA0 && NATIVE_TO_I8((p)[1]) <= 0xBF && \
+ NATIVE_TO_I8((p)[2]) >= 0xA0 && NATIVE_TO_I8((p)[2]) <= 0xBF)
+/* Four-byte case, first form: the start byte translates (NATIVE_TO_I8) to
+ * exactly 0xF0, the second byte to 0xB0..0xBF, and the third and fourth
+ * bytes to 0xA0..0xBF. */
+# define IS_UTF8_CHAR_4a(p) \
+ (0xF0 == NATIVE_TO_I8((p)[0]) \
+ && 0xB0 <= NATIVE_TO_I8((p)[1]) && 0xBF >= NATIVE_TO_I8((p)[1]) \
+ && 0xA0 <= NATIVE_TO_I8((p)[2]) && 0xBF >= NATIVE_TO_I8((p)[2]) \
+ && 0xA0 <= NATIVE_TO_I8((p)[3]) && 0xBF >= NATIVE_TO_I8((p)[3]))
+/* Four-byte case, second form: the start byte translates (NATIVE_TO_I8) to
+ * 0xF1..0xF7, and each of the three following bytes to 0xA0..0xBF. */
+# define IS_UTF8_CHAR_4b(p) \
+ (0xF1 <= NATIVE_TO_I8((p)[0]) && 0xF7 >= NATIVE_TO_I8((p)[0]) \
+ && 0xA0 <= NATIVE_TO_I8((p)[1]) && 0xBF >= NATIVE_TO_I8((p)[1]) \
+ && 0xA0 <= NATIVE_TO_I8((p)[2]) && 0xBF >= NATIVE_TO_I8((p)[2]) \
+ && 0xA0 <= NATIVE_TO_I8((p)[3]) && 0xBF >= NATIVE_TO_I8((p)[3]))
+/* Five-byte case, first form: the start byte translates (NATIVE_TO_I8) to
+ * exactly 0xF8, the second byte to 0xA8..0xBF, and each of the third, fourth
+ * and fifth bytes to 0xA0..0xBF.  All five bytes (p)[0]..(p)[4] are checked,
+ * each one only after the preceding ones have passed. */
+# define IS_UTF8_CHAR_5a(p) \
+ (NATIVE_TO_I8((p)[0]) == 0xF8 && \
+ NATIVE_TO_I8((p)[1]) >= 0xA8 && NATIVE_TO_I8((p)[1]) <= 0xBF && \
+ NATIVE_TO_I8((p)[2]) >= 0xA0 && NATIVE_TO_I8((p)[2]) <= 0xBF && \
+ NATIVE_TO_I8((p)[3]) >= 0xA0 && NATIVE_TO_I8((p)[3]) <= 0xBF && \
+ NATIVE_TO_I8((p)[4]) >= 0xA0 && NATIVE_TO_I8((p)[4]) <= 0xBF)
+/* Five-byte case, second form: the start byte translates (NATIVE_TO_I8) to
+ * 0xF9..0xFB -- both bounds are tested on (p)[0] -- and each of the four
+ * following bytes (p)[1]..(p)[4] to 0xA0..0xBF.  Each byte is only examined
+ * after the preceding ones have passed. */
+# define IS_UTF8_CHAR_5b(p) \
+ (NATIVE_TO_I8((p)[0]) >= 0xF9 && NATIVE_TO_I8((p)[0]) <= 0xFB && \
+ NATIVE_TO_I8((p)[1]) >= 0xA0 && NATIVE_TO_I8((p)[1]) <= 0xBF && \
+ NATIVE_TO_I8((p)[2]) >= 0xA0 && NATIVE_TO_I8((p)[2]) <= 0xBF && \
+ NATIVE_TO_I8((p)[3]) >= 0xA0 && NATIVE_TO_I8((p)[3]) <= 0xBF && \
+ NATIVE_TO_I8((p)[4]) >= 0xA0 && NATIVE_TO_I8((p)[4]) <= 0xBF)
+
+/* Four-byte case of IS_UTF8_CHAR(): accepts whatever either of the two
+ * four-byte forms accepts; the 4b form is only tried when 4a fails. */
+# define IS_UTF8_CHAR_4(p) (IS_UTF8_CHAR_4a(p) || IS_UTF8_CHAR_4b(p))
+/* Five-byte case of IS_UTF8_CHAR(): accepts whatever either of the two
+ * five-byte forms accepts; the 5b form is only tried when 5a fails. */
+# define IS_UTF8_CHAR_5(p) (IS_UTF8_CHAR_5a(p) || IS_UTF8_CHAR_5b(p))
+/* Select, by the byte count n, the fixed-length check to apply to the bytes
+ * at p: n of 1 through 5 picks IS_UTF8_CHAR_1 .. IS_UTF8_CHAR_5, and any
+ * other n yields 0.  Both arguments may be evaluated more than once. */
+# define IS_UTF8_CHAR(p, n) \
+ ( (n) == 1 ? IS_UTF8_CHAR_1(p) \
+ : (n) == 2 ? IS_UTF8_CHAR_2(p) \
+ : (n) == 3 ? IS_UTF8_CHAR_3(p) \
+ : (n) == 4 ? IS_UTF8_CHAR_4(p) \
+ : (n) == 5 ? IS_UTF8_CHAR_5(p) \
+ : 0)
+
+/* True when n does not exceed 5, the largest byte count that the EBCDIC
+ * IS_UTF8_CHAR() dispatches on. */
+# define IS_UTF8_CHAR_FAST(n) (5 >= (n))
+
#endif /* IS_UTF8_CHAR() for UTF-8 */
/*