summary refs log tree commit diff
path: root/utf8.h
diff options
context:
space:
mode:
author Karl Williamson <khw@cpan.org> 2017-04-26 10:29:58 -0600
committer Karl Williamson <khw@cpan.org> 2017-06-01 07:05:15 -0600
commit a6951642ede4abe605dcf0e94b74948e0a60a56b (patch)
tree 99c9e6be83325f8ad94cd73d397f5890dab88b90 /utf8.h
parent a5ba252751fc7fd7b9d43d0ad4491eb68a14a4a6 (diff)
download perl-a6951642ede4abe605dcf0e94b74948e0a60a56b.tar.gz
utf8.h: Add assertions for macros that take chars
This is inspired by [perl #131190]. The UTF-8 macros whose parameters are characters now have assertions that verify they are not being called with something that won't fit in a char. These assertions should be getting optimized out if the input type is a char or U8.
Diffstat (limited to 'utf8.h')
-rw-r--r-- utf8.h 28
1 files changed, 18 insertions, 10 deletions
diff --git a/utf8.h b/utf8.h
index affa2d67f5..b2e338a80a 100644
--- a/utf8.h
+++ b/utf8.h
@@ -266,13 +266,15 @@ C<cp> is Unicode if above 255; otherwise is platform-native.
/* Misleadingly named: is the UTF8-encoded byte 'c' part of a variant sequence
* in UTF-8? This is the inverse of UTF8_IS_INVARIANT. The |0 makes sure this
* isn't mistakenly called with a ptr argument */
-#define UTF8_IS_CONTINUED(c) (((U8)((c) | 0)) & UTF_CONTINUATION_MARK)
+#define UTF8_IS_CONTINUED(c) (__ASSERT_(FITS_IN_8_BITS(c)) \
+ ((U8)((c) | 0)) & UTF_CONTINUATION_MARK)
/* Is the byte 'c' the first byte of a multi-byte UTF-8 encoded sequence?
* This doesn't catch invariants (they are single-byte). It also excludes the
* illegal overlong sequences that begin with C0 and C1. The |0 makes sure
* this isn't mistakenly called with a ptr argument */
-#define UTF8_IS_START(c) (((U8)((c) | 0)) >= 0xc2)
+#define UTF8_IS_START(c) (__ASSERT_(FITS_IN_8_BITS(c)) \
+ ((U8)((c) | 0)) >= 0xc2)
/* For use in UTF8_IS_CONTINUATION() below */
#define UTF_IS_CONTINUATION_MASK 0xC0
@@ -280,20 +282,22 @@ C<cp> is Unicode if above 255; otherwise is platform-native.
/* Is the byte 'c' part of a multi-byte UTF-8 encoded sequence, and not the
* first byte thereof? The |0 makes sure this isn't mistakenly called with a
* ptr argument */
-#define UTF8_IS_CONTINUATION(c) \
- ((((U8)((c) | 0)) & UTF_IS_CONTINUATION_MASK) == UTF_CONTINUATION_MARK)
+#define UTF8_IS_CONTINUATION(c) (__ASSERT_(FITS_IN_8_BITS(c)) \
+ (((U8)((c) | 0)) & UTF_IS_CONTINUATION_MASK) == UTF_CONTINUATION_MARK)
/* Is the UTF8-encoded byte 'c' the first byte of a two byte sequence? Use
* UTF8_IS_NEXT_CHAR_DOWNGRADEABLE() instead if the input isn't known to
* be well-formed. Masking with 0xfe allows the low bit to be 0 or 1; thus
* this matches 0xc[23]. The |0 makes sure this isn't mistakenly called with a
* ptr argument */
-#define UTF8_IS_DOWNGRADEABLE_START(c) ((((U8)((c) | 0)) & 0xfe) == 0xc2)
+#define UTF8_IS_DOWNGRADEABLE_START(c) (__ASSERT_(FITS_IN_8_BITS(c)) \
+ (((U8)((c) | 0)) & 0xfe) == 0xc2)
/* Is the UTF8-encoded byte 'c' the first byte of a sequence of bytes that
* represent a code point > 255? The |0 makes sure this isn't mistakenly
* called with a ptr argument */
-#define UTF8_IS_ABOVE_LATIN1(c) (((U8)((c) | 0)) >= 0xc4)
+#define UTF8_IS_ABOVE_LATIN1(c) (__ASSERT_(FITS_IN_8_BITS(c)) \
+ ((U8)((c) | 0)) >= 0xc4)
/* This is the number of low-order bits a continuation byte in a UTF-8 encoded
* sequence contributes to the specification of the code point. In the bit
@@ -309,7 +313,8 @@ C<cp> is Unicode if above 255; otherwise is platform-native.
* problematic in some contexts. This allows code that needs to check for
* those to quickly exclude the vast majority of code points it will
* encounter */
-#define isUTF8_POSSIBLY_PROBLEMATIC(c) ((U8) c >= 0xED)
+#define isUTF8_POSSIBLY_PROBLEMATIC(c) (__ASSERT_(FITS_IN_8_BITS(c)) \
+ (U8) c >= 0xED)
/* A helper macro for isUTF8_CHAR, so use that one instead of this. This was
* generated by regen/regcharclass.pl, and then moved here. Then it was
@@ -529,7 +534,8 @@ encoded as UTF-8. C<cp> is a native (ASCII or EBCDIC) code point if less than
* that this is asymmetric on EBCDIC platforms, in that the 'new' parameter is
* the UTF-EBCDIC byte, whereas the 'old' parameter is a Unicode (not EBCDIC)
* code point in process of being generated */
-#define UTF8_ACCUMULATE(old, new) (((old) << UTF_ACCUMULATION_SHIFT) \
+#define UTF8_ACCUMULATE(old, new) (__ASSERT_(FITS_IN_8_BITS(new)) \
+ ((old) << UTF_ACCUMULATION_SHIFT) \
| ((NATIVE_UTF8_TO_I8((U8)new)) \
& UTF_CONTINUATION_MASK))
@@ -571,8 +577,10 @@ encoded as UTF-8. C<cp> is a native (ASCII or EBCDIC) code point if less than
* Note that the result can be larger than 255 if the input character is not
* downgradable */
#define TWO_BYTE_UTF8_TO_NATIVE(HI, LO) \
- ( __ASSERT_(PL_utf8skip[HI] == 2) \
- __ASSERT_(UTF8_IS_CONTINUATION(LO)) \
+ (__ASSERT_(FITS_IN_8_BITS(HI)) \
+ __ASSERT_(FITS_IN_8_BITS(LO)) \
+ __ASSERT_(PL_utf8skip[HI] == 2) \
+ __ASSERT_(UTF8_IS_CONTINUATION(LO)) \
UNI_TO_NATIVE(UTF8_ACCUMULATE((NATIVE_UTF8_TO_I8(HI) & UTF_START_MASK(2)), \
(LO))))