diff options
-rw-r--r-- | utf8.h | 43 | ||||
-rw-r--r-- | utfebcdic.h | 8 |
2 files changed, 25 insertions, 26 deletions
@@ -245,6 +245,21 @@ Perl's extended UTF-8 means we can have start bytes up to FF. # define UTF8_QUAD_MAX UINT64_C(0x1000000000) #endif +/* ^? is defined to be DEL on ASCII systems. See the definition of toCTRL() + * for more */ +#define QUESTION_MARK_CTRL DEL_NATIVE + +/* Surrogates, non-character code points and above-Unicode code points are + * problematic in some contexts. This allows code that needs to check for + * those to to quickly exclude the vast majority of code points it will + * encounter */ +#define isUTF8_POSSIBLY_PROBLEMATIC(c) ((U8) c >= 0xED) + +#endif /* EBCDIC vs ASCII */ + +/* 2**UTF_ACCUMULATION_SHIFT - 1 */ +#define UTF_CONTINUATION_MASK ((U8) ((1U << UTF_ACCUMULATION_SHIFT) - 1)) + /* Internal macro to be used only in this file to aid in constructing other * publicly accessible macros. * The number of bytes required to express this uv in UTF-8, for just those @@ -275,26 +290,23 @@ Perl's extended UTF-8 means we can have start bytes up to FF. /* Internal macro to be used only in this file. * This adds to __COMMON_UNI_SKIP the details at this platform's upper range. - * For 64-bit ASCII platforms, we need one more test + * For any-sized EBCDIC platforms, or 64-bit ASCII ones, we need one more test * to see if just 7 bytes is needed, or if the maximum is needed. For 32-bit * ASCII platforms, everything is representable by 7 bytes */ -#ifdef UV_IS_QUAD +#if defined(UV_IS_QUAD) || defined(EBCDIC) # define __BASE_UNI_SKIP(uv) (__COMMON_UNI_SKIP(uv) \ (UV) (uv) < ((UV) 1U << (6 * UTF_ACCUMULATION_SHIFT)) ? 7 : UTF8_MAXBYTES) #else # define __BASE_UNI_SKIP(uv) (__COMMON_UNI_SKIP(uv) 7) #endif -/* ^? is defined to be DEL on ASCII systems. See the definition of toCTRL() - * for more */ -#define QUESTION_MARK_CTRL DEL_NATIVE +/* The next two macros use the base macro defined above, and add in the tests + * at the low-end of the range, for just 1 byte, yielding complete macros, + * publicly accessible. */ + +/* Input is a true Unicode (not-native) code point */ +#define OFFUNISKIP(uv) (OFFUNI_IS_INVARIANT(uv) ? 1 : __BASE_UNI_SKIP(uv)) -/* Surrogates, non-character code points and above-Unicode code points are - * problematic in some contexts. This allows code that needs to check for - * those to to quickly exclude the vast majority of code points it will - * encounter */ -#define isUTF8_POSSIBLY_PROBLEMATIC(c) ((U8) c >= 0xED) -#define OFFUNISKIP(uv) ( OFFUNI_IS_INVARIANT(uv) ? 1 : __BASE_UNI_SKIP(uv)) /* =for apidoc Am|STRLEN|UVCHR_SKIP|UV cp @@ -306,13 +318,8 @@ encoded as UTF-8. C<cp> is a native (ASCII or EBCDIC) code point if less than */ #define UVCHR_SKIP(uv) ( UVCHR_IS_INVARIANT(uv) ? 1 : __BASE_UNI_SKIP(uv)) - -#endif /* EBCDIC vs ASCII */ - -/* 2**UTF_ACCUMULATION_SHIFT - 1 */ -#define UTF_CONTINUATION_MASK ((U8) ((1U << UTF_ACCUMULATION_SHIFT) - 1)) - -/* 32 start bytes with UTF_ACCUMULATION_SHIFT bits of information each */ +/* As explained in the comments for __COMMON_UNI_SKIP, 32 start bytes with + * UTF_ACCUMULATION_SHIFT bits of information each */ #define MAX_UTF8_TWO_BYTE (32 * (1U << UTF_ACCUMULATION_SHIFT) - 1) /* constrained by EBCDIC which has 5 bits per continuation byte */ diff --git a/utfebcdic.h b/utfebcdic.h index e30612297c..97c0c9d1c6 100644 --- a/utfebcdic.h +++ b/utfebcdic.h @@ -193,14 +193,6 @@ above what a 64 bit word can hold */ #define UVCHR_IS_INVARIANT(uv) cBOOL(FITS_IN_8_BITS(uv) \ && (PL_charclass[(U8) (uv)] & (_CC_mask(_CC_ASCII) | _CC_mask(_CC_CNTRL)))) -/* Internal macro to be used only in the definitions of the next two */ -#define __BASE_UNI_SKIP(uv) ((uv) < 0x400 ? 2 : \ - (uv) < 0x4000 ? 3 : \ - (uv) < 0x40000 ? 4 : \ - (uv) < 0x400000 ? 5 : \ - (uv) < 0x4000000 ? 6 : \ - (uv) < 0x40000000 ? 7 : UTF8_MAXBYTES ) - /* UTF-EBCDIC semantic macros - We used to transform back into I8 and then * compare, but now only have to do a single lookup by using a bit in * l1_char_class_tab.h. |