diff options
-rw-r--r-- | utf8.h | 28 |
1 files changed, 18 insertions, 10 deletions
@@ -266,13 +266,15 @@ C<cp> is Unicode if above 255; otherwise is platform-native. /* Misleadingly named: is the UTF8-encoded byte 'c' part of a variant sequence * in UTF-8? This is the inverse of UTF8_IS_INVARIANT. The |0 makes sure this * isn't mistakenly called with a ptr argument */ -#define UTF8_IS_CONTINUED(c) (((U8)((c) | 0)) & UTF_CONTINUATION_MARK) +#define UTF8_IS_CONTINUED(c) (__ASSERT_(FITS_IN_8_BITS(c)) \ + ((U8)((c) | 0)) & UTF_CONTINUATION_MARK) /* Is the byte 'c' the first byte of a multi-byte UTF-8 encoded sequence? * This doesn't catch invariants (they are single-byte). It also excludes the * illegal overlong sequences that begin with C0 and C1. The |0 makes sure * this isn't mistakenly called with a ptr argument */ -#define UTF8_IS_START(c) (((U8)((c) | 0)) >= 0xc2) +#define UTF8_IS_START(c) (__ASSERT_(FITS_IN_8_BITS(c)) \ + ((U8)((c) | 0)) >= 0xc2) /* For use in UTF8_IS_CONTINUATION() below */ #define UTF_IS_CONTINUATION_MASK 0xC0 @@ -280,20 +282,22 @@ C<cp> is Unicode if above 255; otherwise is platform-native. /* Is the byte 'c' part of a multi-byte UTF-8 encoded sequence, and not the * first byte thereof? The |0 makes sure this isn't mistakenly called with a * ptr argument */ -#define UTF8_IS_CONTINUATION(c) \ - ((((U8)((c) | 0)) & UTF_IS_CONTINUATION_MASK) == UTF_CONTINUATION_MARK) +#define UTF8_IS_CONTINUATION(c) (__ASSERT_(FITS_IN_8_BITS(c)) \ + (((U8)((c) | 0)) & UTF_IS_CONTINUATION_MASK) == UTF_CONTINUATION_MARK) /* Is the UTF8-encoded byte 'c' the first byte of a two byte sequence? Use * UTF8_IS_NEXT_CHAR_DOWNGRADEABLE() instead if the input isn't known to * be well-formed. Masking with 0xfe allows the low bit to be 0 or 1; thus * this matches 0xc[23]. 
The |0 makes sure this isn't mistakenly called with a * ptr argument */ -#define UTF8_IS_DOWNGRADEABLE_START(c) ((((U8)((c) | 0)) & 0xfe) == 0xc2) +#define UTF8_IS_DOWNGRADEABLE_START(c) (__ASSERT_(FITS_IN_8_BITS(c)) \ + (((U8)((c) | 0)) & 0xfe) == 0xc2) /* Is the UTF8-encoded byte 'c' the first byte of a sequence of bytes that * represent a code point > 255? The |0 makes sure this isn't mistakenly * called with a ptr argument */ -#define UTF8_IS_ABOVE_LATIN1(c) (((U8)((c) | 0)) >= 0xc4) +#define UTF8_IS_ABOVE_LATIN1(c) (__ASSERT_(FITS_IN_8_BITS(c)) \ + ((U8)((c) | 0)) >= 0xc4) /* This is the number of low-order bits a continuation byte in a UTF-8 encoded * sequence contributes to the specification of the code point. In the bit @@ -309,7 +313,8 @@ C<cp> is Unicode if above 255; otherwise is platform-native. * problematic in some contexts. This allows code that needs to check for * those to quickly exclude the vast majority of code points it will * encounter */ -#define isUTF8_POSSIBLY_PROBLEMATIC(c) ((U8) c >= 0xED) +#define isUTF8_POSSIBLY_PROBLEMATIC(c) (__ASSERT_(FITS_IN_8_BITS(c)) \ + (U8) c >= 0xED) /* A helper macro for isUTF8_CHAR, so use that one instead of this. This was * generated by regen/regcharclass.pl, and then moved here. Then it was @@ -529,7 +534,8 @@ encoded as UTF-8. C<cp> is a native (ASCII or EBCDIC) code point if less than * that this is asymmetric on EBCDIC platforms, in that the 'new' parameter is * the UTF-EBCDIC byte, whereas the 'old' parameter is a Unicode (not EBCDIC) * code point in process of being generated */ -#define UTF8_ACCUMULATE(old, new) (((old) << UTF_ACCUMULATION_SHIFT) \ +#define UTF8_ACCUMULATE(old, new) (__ASSERT_(FITS_IN_8_BITS(new)) \ + ((old) << UTF_ACCUMULATION_SHIFT) \ | ((NATIVE_UTF8_TO_I8((U8)new)) \ & UTF_CONTINUATION_MASK)) @@ -571,8 +577,10 @@ encoded as UTF-8. 
C<cp> is a native (ASCII or EBCDIC) code point if less than * Note that the result can be larger than 255 if the input character is not * downgradable */ #define TWO_BYTE_UTF8_TO_NATIVE(HI, LO) \ - ( __ASSERT_(PL_utf8skip[HI] == 2) \ - __ASSERT_(UTF8_IS_CONTINUATION(LO)) \ + (__ASSERT_(FITS_IN_8_BITS(HI)) \ + __ASSERT_(FITS_IN_8_BITS(LO)) \ + __ASSERT_(PL_utf8skip[HI] == 2) \ + __ASSERT_(UTF8_IS_CONTINUATION(LO)) \ UNI_TO_NATIVE(UTF8_ACCUMULATE((NATIVE_UTF8_TO_I8(HI) & UTF_START_MASK(2)), \ (LO)))) |