diff options
author | Karl Williamson <khw@cpan.org> | 2019-10-02 18:03:26 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2019-10-06 11:07:09 -0600 |
commit | 7c88d61e18cab1244ecd155556e1f0b3563a7e4a (patch) | |
tree | 2b1cde8c673ba4aecc210007eaed7192dc81c6ce /utf8.h | |
parent | 1df634280fbf565fc9e9ada123c12a82404aa817 (diff) | |
download | perl-7c88d61e18cab1244ecd155556e1f0b3563a7e4a.tar.gz |
Make defn of UTF8_IS_DOWNGRADEABLE_START common
This macro can now be derived from other values (UTF_MIN_START_BYTE and UTF_MIN_ABOVE_LATIN1_BYTE), removing an EBCDIC dependency.
Diffstat (limited to 'utf8.h')
-rw-r--r-- | utf8.h | 15 |
1 files changed, 7 insertions, 8 deletions
@@ -300,14 +300,6 @@ C<cp> is Unicode if above 255; otherwise is platform-native.
 #define UTF8_IS_CONTINUED(c) (__ASSERT_(FITS_IN_8_BITS(c)) \
 ((U8)((c) | 0)) & UTF_CONTINUATION_MARK)
 
-/* Is the UTF8-encoded byte 'c' the first byte of a two byte sequence? Use
- * UTF8_IS_NEXT_CHAR_DOWNGRADEABLE() instead if the input isn't known to
- * be well-formed. Masking with 0xfe allows the low bit to be 0 or 1; thus
- * this matches 0xc[23]. The |0 makes sure this isn't mistakenly called with a
- * ptr argument */
-#define UTF8_IS_DOWNGRADEABLE_START(c) (__ASSERT_(FITS_IN_8_BITS(c)) \
- (((U8)((c) | 0)) & 0xfe) == 0xc2)
-
 /* This is the number of low-order bits a continuation byte in a UTF-8 encoded
  * sequence contributes to the specification of the code point. In the bit
  * maps above, you see that the first 2 bits are a constant '10', leaving 6 of
@@ -426,6 +418,13 @@ encoded as UTF-8. C<cp> is a native (ASCII or EBCDIC) code point if less than
 #define UTF8_IS_ABOVE_LATIN1(c) (__ASSERT_(FITS_IN_8_BITS(c)) \
 (NATIVE_UTF8_TO_I8(c) >= UTF_MIN_ABOVE_LATIN1_BYTE))
 
+/* Is the UTF8-encoded byte 'c' the first byte of a two byte sequence? Use
+ * UTF8_IS_NEXT_CHAR_DOWNGRADEABLE() instead if the input isn't known to
+ * be well-formed. */
+#define UTF8_IS_DOWNGRADEABLE_START(c) (__ASSERT_(FITS_IN_8_BITS(c)) \
+ inRANGE(NATIVE_UTF8_TO_I8(c), \
+ UTF_MIN_START_BYTE, UTF_MIN_ABOVE_LATIN1_BYTE - 1))
+
 /* The largest code point representable by two UTF-8 bytes on this platform.
  * As explained in the comments for __COMMON_UNI_SKIP, 32 start bytes with
  * UTF_ACCUMULATION_SHIFT bits of information each */