diff options
author | Karl Williamson <khw@cpan.org> | 2016-09-12 16:52:41 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2016-09-17 21:10:50 -0600 |
commit | a82be82b512232b63f28c5865113f7990fb59a3a (patch) | |
tree | 63e6434ecae9cfe89149e4f043dfdcac39a10434 /utf8.h | |
parent | e23e8bc1957a5981b8a507b62471ae38ec06c661 (diff) | |
download | perl-a82be82b512232b63f28c5865113f7990fb59a3a.tar.gz |
Add macro for Unicode Corrigendum #9 strict
This macro follows Unicode Corrigendum #9 to allow non-character code
points. These are still discouraged but not completely forbidden.
It's best for code that isn't intended to operate on arbitrary text
produced by other code to use the original definition, but code that
does operate on such text, such as source code control, should change
to use this definition if it wants to be Unicode-strict.
Perl can't adopt C9 wholesale, as it might create security holes in
existing applications that rely on Perl keeping non-chars out.
Diffstat (limited to 'utf8.h')
-rw-r--r-- | utf8.h | 55 |
1 files changed, 54 insertions, 1 deletions
@@ -383,6 +383,28 @@ C<cp> is Unicode if above 255; otherwise is platform-native. : 0 ) \ : 0 ) +/* Similarly, + C9_STRICT_UTF8_CHAR: Matches legal Unicode UTF-8 variant code + points, no surrogates + 0x0080 - 0xD7FF + 0xE000 - 0x10FFFF +*/ +/*** GENERATED CODE ***/ +#define is_C9_STRICT_UTF8_CHAR_utf8_no_length_checks(s) \ +( ( 0xC2 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xDF ) ? \ + ( LIKELY( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ? 2 : 0 ) \ +: ( 0xE0 == ((U8*)s)[0] ) ? \ + ( LIKELY( ( ( ((U8*)s)[1] & 0xE0 ) == 0xA0 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\ +: ( ( 0xE1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xEC ) || ( ((U8*)s)[0] & 0xFE ) == 0xEE ) ?\ + ( LIKELY( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\ +: ( 0xED == ((U8*)s)[0] ) ? \ + ( LIKELY( ( ( ((U8*)s)[1] & 0xE0 ) == 0x80 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\ +: ( 0xF0 == ((U8*)s)[0] ) ? \ + ( LIKELY( ( ( 0x90 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0xBF ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\ +: ( 0xF1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xF3 ) ? \ + ( LIKELY( ( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\ +: LIKELY( ( ( ( 0xF4 == ((U8*)s)[0] ) && ( ( ((U8*)s)[1] & 0xF0 ) == 0x80 ) ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 ) + #endif /* EBCDIC vs ASCII */ /* 2**UTF_ACCUMULATION_SHIFT - 1 */ @@ -989,7 +1011,8 @@ be a surrogate nor a non-character code point. Thus this excludes any code point from Perl's extended UTF-8. This is used to efficiently decide if the next few bytes in C<s> is -legal Unicode-acceptable UTF-8 for a single character. +legal Unicode-acceptable UTF-8 for a single character. Use +C<L</isC9_STRICT_UTF8_CHAR>> to also accept non-character code points. =cut */ @@ -1003,6 +1026,36 @@ legal Unicode-acceptable UTF-8 for a single character. ? 
0 \ : is_STRICT_UTF8_CHAR_utf8_no_length_checks(s)) +/* + +=for apidoc Am|STRLEN|isC9_STRICT_UTF8_CHAR|const U8 *s|const U8 *e + +Evaluates to non-zero if the first few bytes of the string starting at C<s> and +looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some +Unicode non-surrogate code point; otherwise it evaluates to 0. If non-zero, +the value gives how many bytes starting at C<s> comprise the code point's +representation. + +The largest acceptable code point is the Unicode maximum 0x10FFFF. This +differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character +code points. This corresponds to +L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>. +which said that non-character code points are merely discouraged rather than +completely forbidden in open interchange. See +L<perlunicode/Noncharacter code points>. + +=cut +*/ + +#define isC9_STRICT_UTF8_CHAR(s, e) \ + (UNLIKELY((e) <= (s)) \ + ? 0 \ + : (UTF8_IS_INVARIANT(*s)) \ + ? 1 \ + : UNLIKELY(((e) - (s)) < UTF8SKIP(s)) \ + ? 0 \ + : is_C9_STRICT_UTF8_CHAR_utf8_no_length_checks(s)) + /* Do not use; should be deprecated. Use isUTF8_CHAR() instead; this is * retained solely for backwards compatibility */ #define IS_UTF8_CHAR(p, n) (isUTF8_CHAR(p, (p) + (n)) == n) |