summaryrefslogtreecommitdiff
path: root/utfebcdic.h
diff options
context:
space:
mode:
author Karl Williamson <khw@cpan.org> 2015-09-04 09:57:11 -0600
committer Karl Williamson <khw@cpan.org> 2015-09-04 10:21:17 -0600
commit e4fd731240d0d51e9cda61101a7d593dd9660e22 (patch)
tree f650ee0ba38884ef21d804b61ead185af721db64 /utfebcdic.h
parent 0f1913794d9137557b4ae7771a8a24ab8b5ee247 (diff)
download perl-e4fd731240d0d51e9cda61101a7d593dd9660e22.tar.gz
Change some UTF-EBCDIC macro handling defns
This commit changes the definitions of some macros for UTF-8 handling on EBCDIC platforms. The previous definitions transformed the bytes into I8 and did tests on the transformed values. The change is to use previously unused bits in l1_char_class_tab.h so the transform isn't needed, and generally only one branch is needed. These macros are called from the inner loops of, for example, regex backtracking.
Diffstat (limited to 'utfebcdic.h')
-rw-r--r-- utfebcdic.h 33
1 files changed, 19 insertions, 14 deletions
diff --git a/utfebcdic.h b/utfebcdic.h
index d9e1402ce2..1df7b3827f 100644
--- a/utfebcdic.h
+++ b/utfebcdic.h
@@ -172,23 +172,28 @@ END_EXTERN_C
#define UNI_IS_INVARIANT(c) (((UV)(c)) < 0xA0)
-/* UTF-EBCDIC semantic macros - transform back into I8 and then compare
+/* UTF-EBCDIC semantic macros - We used to transform back into I8 and then
+ * compare, but now only have to do a single lookup by using a bit in
+ * l1_char_class_tab.h.
* Comments as to the meaning of each are given at their corresponding utf8.h
* definitions. */
-#define UTF8_IS_START(c) (NATIVE_UTF8_TO_I8(c) >= 0xC5 \
- && NATIVE_UTF8_TO_I8(c) != 0xE0)
-#define UTF8_IS_CONTINUATION(c) ((NATIVE_UTF8_TO_I8(c) & 0xE0) == 0xA0)
-#define UTF8_IS_CONTINUED(c) (NATIVE_UTF8_TO_I8(c) >= 0xA0)
-
-#define UTF8_IS_DOWNGRADEABLE_START(c) (NATIVE_UTF8_TO_I8(c) >= 0xC5 \
- && NATIVE_UTF8_TO_I8(c) <= 0xC7)
-/* Saying it this way adds a runtime test, but removes 2 run-time lookups */
-/*#define UTF8_IS_DOWNGRADEABLE_START(c) ((c) == I8_TO_NATIVE_UTF8(0xC5) \
- || (c) == I8_TO_NATIVE_UTF8(0xC6) \
- || (c) == I8_TO_NATIVE_UTF8(0xC7))
-*/
-#define UTF8_IS_ABOVE_LATIN1(c) (NATIVE_UTF8_TO_I8(c) >= 0xC8)
+#define UTF8_IS_START(c) _generic_isCC(c, _CC_UTF8_IS_START)
+#define UTF8_IS_CONTINUATION(c) _generic_isCC(c, _CC_UTF8_IS_CONTINUATION)
+
+/* Equivalent to ! UVCHR_IS_INVARIANT(c) */
+#define UTF8_IS_CONTINUED(c) cBOOL(FITS_IN_8_BITS(c) \
+ && ! (PL_charclass[(U8) (c)] & (_CC_mask(_CC_ASCII) | _CC_mask(_CC_CNTRL))))
+
+#define UTF8_IS_DOWNGRADEABLE_START(c) _generic_isCC(c, \
+ _CC_UTF8_IS_DOWNGRADEABLE_START)
+
+/* Equivalent to (UTF8_IS_START(c) && ! UTF8_IS_DOWNGRADEABLE_START(c))
+ * Makes sure that the START bit is set and the DOWNGRADEABLE bit isn't */
+#define UTF8_IS_ABOVE_LATIN1(c) cBOOL(FITS_IN_8_BITS(c) \
+ && ((PL_charclass[(U8) (c)] & ( _CC_mask(_CC_UTF8_IS_START) \
+ |_CC_mask(_CC_UTF8_IS_DOWNGRADEABLE_START))) \
+ == _CC_mask(_CC_UTF8_IS_START)))
/* Can't exceed 7 on EBCDIC platforms */
#define UTF_START_MARK(len) (0xFF & (0xFE << (7-(len))))