summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Karl Williamson <khw@cpan.org> 2015-12-05 21:56:43 -0700
committer: Karl Williamson <khw@cpan.org> 2015-12-05 22:06:50 -0700
commit: fed423a56a927315e14182b1dac8f0bfad941b37 (patch)
tree: 81bc52779ffc951e5f04f7e63181370fb7b194bd
parent: 7028aebabf3fb76f73327317cf199bc5cde380b1 (diff)
download: perl-fed423a56a927315e14182b1dac8f0bfad941b37.tar.gz
utf8.h: Combine EBCDIC and ASCII macros
Previous commits have set things up so the macros are the same on both platforms. By moving them to the common part of utf8.h, they can share the same definition. Because the moved block is large compared with the text that actually stayed in place, the difference listing instead shows those other, unchanged things as having been moved.
-rw-r--r-- utf8.h 43
-rw-r--r-- utfebcdic.h 8
2 files changed, 25 insertions, 26 deletions
diff --git a/utf8.h b/utf8.h
index df106c1da9..c41d51c044 100644
--- a/utf8.h
+++ b/utf8.h
@@ -245,6 +245,21 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
# define UTF8_QUAD_MAX UINT64_C(0x1000000000)
#endif
+/* ^? is defined to be DEL on ASCII systems. See the definition of toCTRL()
+ * for more */
+#define QUESTION_MARK_CTRL DEL_NATIVE
+
+/* Surrogates, non-character code points and above-Unicode code points are
+ * problematic in some contexts. This allows code that needs to check for
+ * those to quickly exclude the vast majority of code points it will
+ * encounter */
+#define isUTF8_POSSIBLY_PROBLEMATIC(c) ((U8) c >= 0xED)
+
+#endif /* EBCDIC vs ASCII */
+
+/* 2**UTF_ACCUMULATION_SHIFT - 1 */
+#define UTF_CONTINUATION_MASK ((U8) ((1U << UTF_ACCUMULATION_SHIFT) - 1))
+
/* Internal macro to be used only in this file to aid in constructing other
* publicly accessible macros.
* The number of bytes required to express this uv in UTF-8, for just those
@@ -275,26 +290,23 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
/* Internal macro to be used only in this file.
* This adds to __COMMON_UNI_SKIP the details at this platform's upper range.
- * For 64-bit ASCII platforms, we need one more test
+ * For any-sized EBCDIC platforms, or 64-bit ASCII ones, we need one more test
* to see if just 7 bytes is needed, or if the maximum is needed. For 32-bit
* ASCII platforms, everything is representable by 7 bytes */
-#ifdef UV_IS_QUAD
+#if defined(UV_IS_QUAD) || defined(EBCDIC)
# define __BASE_UNI_SKIP(uv) (__COMMON_UNI_SKIP(uv) \
(UV) (uv) < ((UV) 1U << (6 * UTF_ACCUMULATION_SHIFT)) ? 7 : UTF8_MAXBYTES)
#else
# define __BASE_UNI_SKIP(uv) (__COMMON_UNI_SKIP(uv) 7)
#endif
-/* ^? is defined to be DEL on ASCII systems. See the definition of toCTRL()
- * for more */
-#define QUESTION_MARK_CTRL DEL_NATIVE
+/* The next two macros use the base macro defined above, and add in the tests
+ * at the low-end of the range, for just 1 byte, yielding complete macros,
+ * publicly accessible. */
+
+/* Input is a true Unicode (not-native) code point */
+#define OFFUNISKIP(uv) (OFFUNI_IS_INVARIANT(uv) ? 1 : __BASE_UNI_SKIP(uv))
-/* Surrogates, non-character code points and above-Unicode code points are
- * problematic in some contexts. This allows code that needs to check for
- * those to to quickly exclude the vast majority of code points it will
- * encounter */
-#define isUTF8_POSSIBLY_PROBLEMATIC(c) ((U8) c >= 0xED)
-#define OFFUNISKIP(uv) ( OFFUNI_IS_INVARIANT(uv) ? 1 : __BASE_UNI_SKIP(uv))
/*
=for apidoc Am|STRLEN|UVCHR_SKIP|UV cp
@@ -306,13 +318,8 @@ encoded as UTF-8. C<cp> is a native (ASCII or EBCDIC) code point if less than
*/
#define UVCHR_SKIP(uv) ( UVCHR_IS_INVARIANT(uv) ? 1 : __BASE_UNI_SKIP(uv))
-
-#endif /* EBCDIC vs ASCII */
-
-/* 2**UTF_ACCUMULATION_SHIFT - 1 */
-#define UTF_CONTINUATION_MASK ((U8) ((1U << UTF_ACCUMULATION_SHIFT) - 1))
-
-/* 32 start bytes with UTF_ACCUMULATION_SHIFT bits of information each */
+/* As explained in the comments for __COMMON_UNI_SKIP, 32 start bytes with
+ * UTF_ACCUMULATION_SHIFT bits of information each */
#define MAX_UTF8_TWO_BYTE (32 * (1U << UTF_ACCUMULATION_SHIFT) - 1)
/* constrained by EBCDIC which has 5 bits per continuation byte */
diff --git a/utfebcdic.h b/utfebcdic.h
index e30612297c..97c0c9d1c6 100644
--- a/utfebcdic.h
+++ b/utfebcdic.h
@@ -193,14 +193,6 @@ above what a 64 bit word can hold */
#define UVCHR_IS_INVARIANT(uv) cBOOL(FITS_IN_8_BITS(uv) \
&& (PL_charclass[(U8) (uv)] & (_CC_mask(_CC_ASCII) | _CC_mask(_CC_CNTRL))))
-/* Internal macro to be used only in the definitions of the next two */
-#define __BASE_UNI_SKIP(uv) ((uv) < 0x400 ? 2 : \
- (uv) < 0x4000 ? 3 : \
- (uv) < 0x40000 ? 4 : \
- (uv) < 0x400000 ? 5 : \
- (uv) < 0x4000000 ? 6 : \
- (uv) < 0x40000000 ? 7 : UTF8_MAXBYTES )
-
/* UTF-EBCDIC semantic macros - We used to transform back into I8 and then
* compare, but now only have to do a single lookup by using a bit in
* l1_char_class_tab.h.