diff options
author | Karl Williamson <khw@cpan.org> | 2016-02-10 10:54:42 -0700 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2016-02-10 21:16:44 -0700 |
commit | 5c06326b193e9f7badbab9e6a04db1f303b7cd6c (patch) | |
tree | 0e39d5afb771f2ece868144aa7ed12b1ad97dbde /utf8.h | |
parent | 0c6c793207762fced18c2d12a733993f0f8e30f7 (diff) | |
download | perl-5c06326b193e9f7badbab9e6a04db1f303b7cd6c.tar.gz |
utf8.h: Guard some macros against improper calls
The UTF8_IS_foo() macros have an inconsistent API. In some, the
parameter is a pointer, and in others it is a byte. In the former case,
a call of the wrong type will not compile, as it will try to dereference
a non-ptr. This commit makes the other ones not compile when called
wrongly, by using the technique shown by Lukas Mai (in
9c903d5937fa3682f21b2aece7f6011b6fcb2750) of ORing the argument with a
constant 0, which should get optimized out.
Diffstat (limited to 'utf8.h')
-rw-r--r-- | utf8.h | 30 |
1 files changed, 18 insertions, 12 deletions
@@ -228,31 +228,36 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
 #define UTF_CONTINUATION_MARK 0x80
 
 /* Misleadingly named: is the UTF8-encoded byte 'c' part of a variant sequence
- * in UTF-8? This is the inverse of UTF8_IS_INVARIANT */
-#define UTF8_IS_CONTINUED(c) (((U8)c) & UTF_CONTINUATION_MARK)
+ * in UTF-8? This is the inverse of UTF8_IS_INVARIANT. The |0 makes sure this
+ * isn't mistakenly called with a ptr argument */
+#define UTF8_IS_CONTINUED(c) (((U8)((c) | 0)) & UTF_CONTINUATION_MARK)
 
 /* Is the byte 'c' the first byte of a multi-byte UTF8-8 encoded sequence?
  * This doesn't catch invariants (they are single-byte). It also excludes the
- * illegal overlong sequences that begin with C0 and C1. */
-#define UTF8_IS_START(c) (((U8)c) >= 0xc2)
+ * illegal overlong sequences that begin with C0 and C1. The |0 makes sure
+ * this isn't mistakenly called with a ptr argument */
+#define UTF8_IS_START(c) (((U8)((c) | 0)) >= 0xc2)
 
 /* For use in UTF8_IS_CONTINUATION() below */
 #define UTF_IS_CONTINUATION_MASK 0xC0
 
 /* Is the byte 'c' part of a multi-byte UTF8-8 encoded sequence, and not the
- * first byte thereof? */
+ * first byte thereof? The |0 makes sure this isn't mistakenly called with a
+ * ptr argument */
 #define UTF8_IS_CONTINUATION(c) \
-    ((((U8)c) & UTF_IS_CONTINUATION_MASK) == UTF_CONTINUATION_MARK)
+    ((((U8)((c) | 0)) & UTF_IS_CONTINUATION_MASK) == UTF_CONTINUATION_MARK)
 
 /* Is the UTF8-encoded byte 'c' the first byte of a two byte sequence? Use
  * UTF8_IS_NEXT_CHAR_DOWNGRADEABLE() instead if the input isn't known to
  * be well-formed. Masking with 0xfe allows the low bit to be 0 or 1; thus
- * this matches 0xc[23]. */
-#define UTF8_IS_DOWNGRADEABLE_START(c) (((U8)(c) & 0xfe) == 0xc2)
+ * this matches 0xc[23]. The |0 makes sure this isn't mistakenly called with a
+ * ptr argument */
+#define UTF8_IS_DOWNGRADEABLE_START(c) ((((U8)((c) | 0)) & 0xfe) == 0xc2)
 
 /* Is the UTF8-encoded byte 'c' the first byte of a sequence of bytes that
- * represent a code point > 255? */
-#define UTF8_IS_ABOVE_LATIN1(c) ((U8)(c) >= 0xc4)
+ * represent a code point > 255? The |0 makes sure this isn't mistakenly
+ * called with a ptr argument */
+#define UTF8_IS_ABOVE_LATIN1(c) (((U8)((c) | 0)) >= 0xc4)
 
 /* This is the number of low-order bits a continuation byte in a UTF-8 encoded
  * sequence contributes to the specification of the code point. In the bit
@@ -464,8 +469,9 @@ only) byte is pointed to by C<s>.
  * each for the exact same set of bit patterns. It is valid on a subset of
  * what UVCHR_IS_INVARIANT is valid on, so can just use that; and the compiler
  * should optimize out anything extraneous given the implementation of the
- * latter */
-#define UTF8_IS_INVARIANT(c) UVCHR_IS_INVARIANT(c)
+ * latter. The |0 makes sure this isn't mistakenly called with a ptr argument.
+ * */
+#define UTF8_IS_INVARIANT(c) UVCHR_IS_INVARIANT((c) | 0)
 
 /* Like the above, but its name implies a non-UTF8 input, which as the comments
  * above show, doesn't matter as to its implementation */