summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- utf8.h 43
-rw-r--r-- utfebcdic.h 8
2 files changed, 25 insertions, 26 deletions
diff --git a/utf8.h b/utf8.h
index df106c1da9..c41d51c044 100644
--- a/utf8.h
+++ b/utf8.h
@@ -245,6 +245,21 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
# define UTF8_QUAD_MAX UINT64_C(0x1000000000)
#endif
+/* ^? is defined to be DEL on ASCII systems. See the definition of toCTRL()
+ * for more */
+#define QUESTION_MARK_CTRL DEL_NATIVE
+
+/* Surrogates, non-character code points and above-Unicode code points are
+ * problematic in some contexts. This allows code that needs to check for
+ * those to quickly exclude the vast majority of code points it will
+ * encounter */
+#define isUTF8_POSSIBLY_PROBLEMATIC(c) ((U8) c >= 0xED)
+
+#endif /* EBCDIC vs ASCII */
+
+/* 2**UTF_ACCUMULATION_SHIFT - 1 */
+#define UTF_CONTINUATION_MASK ((U8) ((1U << UTF_ACCUMULATION_SHIFT) - 1))
+
/* Internal macro to be used only in this file to aid in constructing other
* publicly accessible macros.
* The number of bytes required to express this uv in UTF-8, for just those
@@ -275,26 +290,23 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
/* Internal macro to be used only in this file.
* This adds to __COMMON_UNI_SKIP the details at this platform's upper range.
- * For 64-bit ASCII platforms, we need one more test
+ * For any-sized EBCDIC platforms, or 64-bit ASCII ones, we need one more test
* to see if just 7 bytes is needed, or if the maximum is needed. For 32-bit
* ASCII platforms, everything is representable by 7 bytes */
-#ifdef UV_IS_QUAD
+#if defined(UV_IS_QUAD) || defined(EBCDIC)
# define __BASE_UNI_SKIP(uv) (__COMMON_UNI_SKIP(uv) \
(UV) (uv) < ((UV) 1U << (6 * UTF_ACCUMULATION_SHIFT)) ? 7 : UTF8_MAXBYTES)
#else
# define __BASE_UNI_SKIP(uv) (__COMMON_UNI_SKIP(uv) 7)
#endif
-/* ^? is defined to be DEL on ASCII systems. See the definition of toCTRL()
- * for more */
-#define QUESTION_MARK_CTRL DEL_NATIVE
+/* The next two macros use the base macro defined above, and add in the tests
+ * at the low-end of the range, for just 1 byte, yielding complete macros,
+ * publicly accessible. */
+
+/* Input is a true Unicode (not-native) code point */
+#define OFFUNISKIP(uv) (OFFUNI_IS_INVARIANT(uv) ? 1 : __BASE_UNI_SKIP(uv))
-/* Surrogates, non-character code points and above-Unicode code points are
- * problematic in some contexts. This allows code that needs to check for
- * those to to quickly exclude the vast majority of code points it will
- * encounter */
-#define isUTF8_POSSIBLY_PROBLEMATIC(c) ((U8) c >= 0xED)
-#define OFFUNISKIP(uv) ( OFFUNI_IS_INVARIANT(uv) ? 1 : __BASE_UNI_SKIP(uv))
/*
=for apidoc Am|STRLEN|UVCHR_SKIP|UV cp
@@ -306,13 +318,8 @@ encoded as UTF-8. C<cp> is a native (ASCII or EBCDIC) code point if less than
*/
#define UVCHR_SKIP(uv) ( UVCHR_IS_INVARIANT(uv) ? 1 : __BASE_UNI_SKIP(uv))
-
-#endif /* EBCDIC vs ASCII */
-
-/* 2**UTF_ACCUMULATION_SHIFT - 1 */
-#define UTF_CONTINUATION_MASK ((U8) ((1U << UTF_ACCUMULATION_SHIFT) - 1))
-
-/* 32 start bytes with UTF_ACCUMULATION_SHIFT bits of information each */
+/* As explained in the comments for __COMMON_UNI_SKIP, 32 start bytes with
+ * UTF_ACCUMULATION_SHIFT bits of information each */
#define MAX_UTF8_TWO_BYTE (32 * (1U << UTF_ACCUMULATION_SHIFT) - 1)
/* constrained by EBCDIC which has 5 bits per continuation byte */
diff --git a/utfebcdic.h b/utfebcdic.h
index e30612297c..97c0c9d1c6 100644
--- a/utfebcdic.h
+++ b/utfebcdic.h
@@ -193,14 +193,6 @@ above what a 64 bit word can hold */
#define UVCHR_IS_INVARIANT(uv) cBOOL(FITS_IN_8_BITS(uv) \
&& (PL_charclass[(U8) (uv)] & (_CC_mask(_CC_ASCII) | _CC_mask(_CC_CNTRL))))
-/* Internal macro to be used only in the definitions of the next two */
-#define __BASE_UNI_SKIP(uv) ((uv) < 0x400 ? 2 : \
- (uv) < 0x4000 ? 3 : \
- (uv) < 0x40000 ? 4 : \
- (uv) < 0x400000 ? 5 : \
- (uv) < 0x4000000 ? 6 : \
- (uv) < 0x40000000 ? 7 : UTF8_MAXBYTES )
-
/* UTF-EBCDIC semantic macros - We used to transform back into I8 and then
* compare, but now only have to do a single lookup by using a bit in
* l1_char_class_tab.h.