diff options
author | Karl Williamson <khw@cpan.org> | 2019-10-02 17:56:01 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2019-10-06 11:07:09 -0600 |
commit | 1df634280fbf565fc9e9ada123c12a82404aa817 (patch) | |
tree | 76c4eef4e7b829a89307e76fa6c3c57468f76cdc | |
parent | 4bab39bc1904f776c12d31a54ff5abe06fc9c103 (diff) | |
download | perl-1df634280fbf565fc9e9ada123c12a82404aa817.tar.gz |
Make the definition of UTF8_IS_ABOVE_LATIN1 common
The macro can be derived from other values, so the separate EBCDIC-specific definition is no longer needed.
-rw-r--r-- | utf8.h | 14 | ||||
-rw-r--r-- | utfebcdic.h | 6 |
2 files changed, 8 insertions, 12 deletions
@@ -308,12 +308,6 @@ C<cp> is Unicode if above 255; otherwise is platform-native.
 #define UTF8_IS_DOWNGRADEABLE_START(c) (__ASSERT_(FITS_IN_8_BITS(c)) \
 (((U8)((c) | 0)) & 0xfe) == 0xc2)

-/* Is the UTF8-encoded byte 'c' the first byte of a sequence of bytes that
- * represent a code point > 255? The |0 makes sure this isn't mistakenly
- * called with a ptr argument */
-#define UTF8_IS_ABOVE_LATIN1(c) (__ASSERT_(FITS_IN_8_BITS(c)) \
- ((U8)((c) | 0)) >= 0xc4)
-
 /* This is the number of low-order bits a continuation byte in a UTF-8 encoded
  * sequence contributes to the specification of the code point. In the bit
  * maps above, you see that the first 2 bits are a constant '10', leaving 6 of
@@ -424,6 +418,14 @@ encoded as UTF-8. C<cp> is a native (ASCII or EBCDIC) code point if less than
 #define UTF8_IS_START(c) (__ASSERT_(FITS_IN_8_BITS(c)) \
 (NATIVE_UTF8_TO_I8(c) >= UTF_MIN_START_BYTE))

+#define UTF_MIN_ABOVE_LATIN1_BYTE \
+ ((0x100 >> UTF_ACCUMULATION_SHIFT) | UTF_START_MARK(2))
+
+/* Is the UTF8-encoded byte 'c' the first byte of a sequence of bytes that
+ * represent a code point > 255? */
+#define UTF8_IS_ABOVE_LATIN1(c) (__ASSERT_(FITS_IN_8_BITS(c)) \
+ (NATIVE_UTF8_TO_I8(c) >= UTF_MIN_ABOVE_LATIN1_BYTE))
+
 /* The largest code point representable by two UTF-8 bytes on this platform.
  * As explained in the comments for __COMMON_UNI_SKIP, 32 start bytes with
  * UTF_ACCUMULATION_SHIFT bits of information each */
diff --git a/utfebcdic.h b/utfebcdic.h
index 8fe4bdc143..751fa0a9bd 100644
--- a/utfebcdic.h
+++ b/utfebcdic.h
@@ -224,12 +224,6 @@ explicitly forbidden, and the shortest possible encoding should always be used
 #define UTF8_IS_DOWNGRADEABLE_START(c) _generic_isCC(c, \
 _CC_UTF8_IS_DOWNGRADEABLE_START)

-/* Equivalent to (UTF8_IS_START(c) && ! UTF8_IS_DOWNGRADEABLE_START(c))
- * Makes sure that the START bit is set and the DOWNGRADEABLE bit isn't */
-#define UTF8_IS_ABOVE_LATIN1(c) cBOOL(FITS_IN_8_BITS(c) \
- && ((PL_charclass[(U8) (c)] & ( _CC_mask(_CC_UTF8_IS_START) \
- |_CC_mask(_CC_UTF8_IS_DOWNGRADEABLE_START))) \
- == _CC_mask(_CC_UTF8_IS_START)))
 #define isUTF8_POSSIBLY_PROBLEMATIC(c) \
 _generic_isCC(c, _CC_UTF8_START_BYTE_IS_FOR_AT_LEAST_SURROGATE)