diff options
author | Karl Williamson <khw@cpan.org> | 2021-06-25 12:23:02 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2021-08-07 05:14:44 -0600 |
commit | 6110285c6da30a8505d3b73bbbd0cbf6e0fdecac (patch) | |
tree | a63caafa23d32229358ff0deff4437bc8f5551ef /utf8.h | |
parent | 51b58dba7ba5a13c027c24eee220e82eab517ce5 (diff) | |
download | perl-6110285c6da30a8505d3b73bbbd0cbf6e0fdecac.tar.gz |
utf8.h: Move some #defines around
This moves the defines for things like surrogates, non-character code
points, etc. to a more logical order, with like adjacent to like, and
before they are otherwise used in the file.
Diffstat (limited to 'utf8.h')
-rw-r--r-- | utf8.h | 235 |
1 files changed, 119 insertions, 116 deletions
@@ -851,6 +851,122 @@ case any call to string overloading updates the internal UTF-8 encoding flag. && _is_in_locale_category(FALSE, -1))) \ && (! IN_BYTES)) +#define UNICODE_SURROGATE_FIRST 0xD800 +#define UNICODE_SURROGATE_LAST 0xDFFF + +/* +=for apidoc Am|bool|UTF8_IS_SURROGATE|const U8 *s|const U8 *e + +Evaluates to non-zero if the first few bytes of the string starting at C<s> and +looking no further than S<C<e - 1>> are well-formed UTF-8 that represents one +of the Unicode surrogate code points; otherwise it evaluates to 0. If +non-zero, the value gives how many bytes starting at C<s> comprise the code +point's representation. + +=cut + */ + +/* This matches the 2048 code points between these */ +#define UNICODE_IS_SURROGATE(uv) UNLIKELY(inRANGE(uv, UNICODE_SURROGATE_FIRST, \ + UNICODE_SURROGATE_LAST)) +#define UTF8_IS_SURROGATE(s, e) is_SURROGATE_utf8_safe(s, e) + +/* + +=for apidoc AmnU|UV|UNICODE_REPLACEMENT + +Evaluates to 0xFFFD, the code point of the Unicode REPLACEMENT CHARACTER + +=cut + */ +#define UNICODE_REPLACEMENT 0xFFFD +#define UNICODE_IS_REPLACEMENT(uv) UNLIKELY((UV) (uv) == UNICODE_REPLACEMENT) +#define UTF8_IS_REPLACEMENT(s, send) is_REPLACEMENT_utf8_safe(s,send) + +/* Though our UTF-8 encoding can go beyond this, + * let's be conservative and do as Unicode says. */ +#define PERL_UNICODE_MAX 0x10FFFF + +#define UNICODE_IS_SUPER(uv) UNLIKELY((UV) (uv) > PERL_UNICODE_MAX) + +/* +=for apidoc Am|bool|UTF8_IS_SUPER|const U8 *s|const U8 *e + +Recall that Perl recognizes an extension to UTF-8 that can encode code +points larger than the ones defined by Unicode, which are 0..0x10FFFF. + +This macro evaluates to non-zero if the first few bytes of the string starting +at C<s> and looking no further than S<C<e - 1>> are from this UTF-8 extension; +otherwise it evaluates to 0. If non-zero, the value gives how many bytes +starting at C<s> comprise the code point's representation. + +0 is returned if the bytes are not well-formed extended UTF-8, or if they +represent a code point that cannot fit in a UV on the current platform. Hence +this macro can give different results when run on a 64-bit word machine than on +one with a 32-bit word size. + +Note that it is illegal to have code points that are larger than what can +fit in an IV on the current machine. + +=cut + + * ASCII EBCDIC I8 + * U+10FFFF: \xF4\x8F\xBF\xBF \xF9\xA1\xBF\xBF\xBF max legal Unicode + * U+110000: \xF4\x90\x80\x80 \xF9\xA2\xA0\xA0\xA0 + * U+110001: \xF4\x90\x80\x81 \xF9\xA2\xA0\xA0\xA1 + */ +#ifdef EBCDIC +# define UTF8_IS_SUPER(s, e) \ + (( ((e) > (s) + 4) \ + && (NATIVE_UTF8_TO_I8(*(s)) >= 0xF9) \ + && UNLIKELY( NATIVE_UTF8_TO_I8(*(s)) > 0xF9 \ + || (NATIVE_UTF8_TO_I8(*((s) + 1)) >= 0xA2)) \ + && LIKELY((s) + UTF8SKIP(s) <= (e))) \ + ? is_utf8_char_helper(s, s + UTF8SKIP(s), 0) : 0) +#else +# define UTF8_IS_SUPER(s, e) \ + (( ((e) > (s) + 3) \ + && (*(U8*) (s)) >= 0xF4 \ + && (UNLIKELY( ((*(U8*) (s)) > 0xF4) \ + || (*((U8*) (s) + 1) >= 0x90))) \ + && LIKELY((s) + UTF8SKIP(s) <= (e))) \ + ? is_utf8_char_helper(s, s + UTF8SKIP(s), 0) : 0) +#endif + +/* Is 'uv' one of the 32 contiguous-range noncharacters? */ +#define UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv) \ + UNLIKELY(inRANGE(uv, 0xFDD0, 0xFDEF)) + +/* Is 'uv' one of the 34 plane-ending noncharacters 0xFFFE, 0xFFFF, 0x1FFFE, + * 0x1FFFF, ... 0x10FFFE, 0x10FFFF, given that we know that 'uv' is not above + * the Unicode legal max */ +#define UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv) \ + UNLIKELY(((UV) (uv) & 0xFFFE) == 0xFFFE) + +#define UNICODE_IS_NONCHAR(uv) \ + ( UNLIKELY(UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv)) \ + || ( UNLIKELY(UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)) \ + && LIKELY(! UNICODE_IS_SUPER(uv)))) + +/* These are now machine generated, and the 'given' clause is no longer + * applicable */ +#define UTF8_IS_NONCHAR_GIVEN_THAT_NON_SUPER_AND_GE_PROBLEMATIC(s, e) \ + cBOOL(is_NONCHAR_utf8_safe(s,e)) + +/* +=for apidoc Am|bool|UTF8_IS_NONCHAR|const U8 *s|const U8 *e + +Evaluates to non-zero if the first few bytes of the string starting at C<s> and +looking no further than S<C<e - 1>> are well-formed UTF-8 that represents one +of the Unicode non-character code points; otherwise it evaluates to 0. If +non-zero, the value gives how many bytes starting at C<s> comprise the code +point's representation. + +=cut +*/ +#define UTF8_IS_NONCHAR(s, e) \ + UTF8_IS_NONCHAR_GIVEN_THAT_NON_SUPER_AND_GE_PROBLEMATIC(s, e) + /* Surrogates, non-character code points and above-Unicode code points are * problematic in some contexts. These macros allow code that needs to check * for those to quickly exclude the vast majority of code points it will @@ -883,6 +999,8 @@ case any call to string overloading updates the internal UTF-8 encoding flag. #define UTF8_IS_PERL_EXTENDED(s) \ (UTF8SKIP(s) > 6 + ONE_IF_EBCDIC_ZERO_IF_NOT) +#define MAX_LEGAL_CP ((UV)IV_MAX) + #define UTF8_ALLOW_EMPTY 0x0001 /* Allow a zero length string */ #define UTF8_GOT_EMPTY UTF8_ALLOW_EMPTY @@ -978,100 +1096,6 @@ case any call to string overloading updates the internal UTF-8 encoding flag. #define UTF8_ALLOW_ANYUV 0 #define UTF8_ALLOW_DEFAULT UTF8_ALLOW_ANYUV -/* -=for apidoc Am|bool|UTF8_IS_SURROGATE|const U8 *s|const U8 *e - -Evaluates to non-zero if the first few bytes of the string starting at C<s> and -looking no further than S<C<e - 1>> are well-formed UTF-8 that represents one -of the Unicode surrogate code points; otherwise it evaluates to 0. If -non-zero, the value gives how many bytes starting at C<s> comprise the code -point's representation. - -=cut - */ -#define UTF8_IS_SURROGATE(s, e) is_SURROGATE_utf8_safe(s, e) - - -#define UTF8_IS_REPLACEMENT(s, send) is_REPLACEMENT_utf8_safe(s,send) - -#define MAX_LEGAL_CP ((UV)IV_MAX) - -/* -=for apidoc Am|bool|UTF8_IS_SUPER|const U8 *s|const U8 *e - -Recall that Perl recognizes an extension to UTF-8 that can encode code -points larger than the ones defined by Unicode, which are 0..0x10FFFF. - -This macro evaluates to non-zero if the first few bytes of the string starting -at C<s> and looking no further than S<C<e - 1>> are from this UTF-8 extension; -otherwise it evaluates to 0. If non-zero, the value gives how many bytes -starting at C<s> comprise the code point's representation. - -0 is returned if the bytes are not well-formed extended UTF-8, or if they -represent a code point that cannot fit in a UV on the current platform. Hence -this macro can give different results when run on a 64-bit word machine than on -one with a 32-bit word size. - -Note that it is illegal to have code points that are larger than what can -fit in an IV on the current machine. - -=cut - - * ASCII EBCDIC I8 - * U+10FFFF: \xF4\x8F\xBF\xBF \xF9\xA1\xBF\xBF\xBF max legal Unicode - * U+110000: \xF4\x90\x80\x80 \xF9\xA2\xA0\xA0\xA0 - * U+110001: \xF4\x90\x80\x81 \xF9\xA2\xA0\xA0\xA1 - */ -#ifdef EBCDIC -# define UTF8_IS_SUPER(s, e) \ - (( ((e) > (s) + 4) \ - && (NATIVE_UTF8_TO_I8(*(s)) >= 0xF9) \ - && UNLIKELY( NATIVE_UTF8_TO_I8(*(s)) > 0xF9 \ - || (NATIVE_UTF8_TO_I8(*((s) + 1)) >= 0xA2)) \ - && LIKELY((s) + UTF8SKIP(s) <= (e))) \ - ? is_utf8_char_helper(s, s + UTF8SKIP(s), 0) : 0) -#else -# define UTF8_IS_SUPER(s, e) \ - (( ((e) > (s) + 3) \ - && (*(U8*) (s)) >= 0xF4 \ - && (UNLIKELY( ((*(U8*) (s)) > 0xF4) \ - || (*((U8*) (s) + 1) >= 0x90))) \ - && LIKELY((s) + UTF8SKIP(s) <= (e))) \ - ? is_utf8_char_helper(s, s + UTF8SKIP(s), 0) : 0) -#endif - -/* These are now machine generated, and the 'given' clause is no longer - * applicable */ -#define UTF8_IS_NONCHAR_GIVEN_THAT_NON_SUPER_AND_GE_PROBLEMATIC(s, e) \ - cBOOL(is_NONCHAR_utf8_safe(s,e)) - -/* -=for apidoc Am|bool|UTF8_IS_NONCHAR|const U8 *s|const U8 *e - -Evaluates to non-zero if the first few bytes of the string starting at C<s> and -looking no further than S<C<e - 1>> are well-formed UTF-8 that represents one -of the Unicode non-character code points; otherwise it evaluates to 0. If -non-zero, the value gives how many bytes starting at C<s> comprise the code -point's representation. - -=for apidoc AmnU|UV|UNICODE_REPLACEMENT - -Evaluates to 0xFFFD, the code point of the Unicode REPLACEMENT CHARACTER - -=cut - */ -#define UTF8_IS_NONCHAR(s, e) \ - UTF8_IS_NONCHAR_GIVEN_THAT_NON_SUPER_AND_GE_PROBLEMATIC(s, e) - -#define UNICODE_SURROGATE_FIRST 0xD800 -#define UNICODE_SURROGATE_LAST 0xDFFF -#define UNICODE_REPLACEMENT 0xFFFD -#define UNICODE_BYTE_ORDER_MARK 0xFEFF - -/* Though our UTF-8 encoding can go beyond this, - * let's be conservative and do as Unicode says. */ -#define PERL_UNICODE_MAX 0x10FFFF - #define UNICODE_WARN_SURROGATE 0x0001 /* UTF-16 surrogates */ #define UNICODE_WARN_NONCHAR 0x0002 /* Non-char code points */ #define UNICODE_WARN_SUPER 0x0004 /* Above 0x10FFFF */ @@ -1106,31 +1130,10 @@ Evaluates to 0xFFFD, the code point of the Unicode REPLACEMENT CHARACTER #define UNICODE_ALLOW_SUPER 0 #define UNICODE_ALLOW_ANY 0 -/* This matches the 2048 code points between these */ -#define UNICODE_IS_SURROGATE(uv) UNLIKELY(inRANGE(uv, UNICODE_SURROGATE_FIRST, \ - UNICODE_SURROGATE_LAST)) - -#define UNICODE_IS_REPLACEMENT(uv) UNLIKELY((UV) (uv) == UNICODE_REPLACEMENT) +#define UNICODE_BYTE_ORDER_MARK 0xFEFF #define UNICODE_IS_BYTE_ORDER_MARK(uv) UNLIKELY((UV) (uv) \ == UNICODE_BYTE_ORDER_MARK) -/* Is 'uv' one of the 32 contiguous-range noncharacters? */ -#define UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv) \ - UNLIKELY(inRANGE(uv, 0xFDD0, 0xFDEF)) - -/* Is 'uv' one of the 34 plane-ending noncharacters 0xFFFE, 0xFFFF, 0x1FFFE, - * 0x1FFFF, ... 0x10FFFE, 0x10FFFF, given that we know that 'uv' is not above - * the Unicode legal max */ -#define UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv) \ - UNLIKELY(((UV) (uv) & 0xFFFE) == 0xFFFE) - -#define UNICODE_IS_NONCHAR(uv) \ - ( UNLIKELY(UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv)) \ - || ( UNLIKELY(UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)) \ - && LIKELY(! UNICODE_IS_SUPER(uv)))) - -#define UNICODE_IS_SUPER(uv) UNLIKELY((UV) (uv) > PERL_UNICODE_MAX) - #define LATIN_SMALL_LETTER_SHARP_S LATIN_SMALL_LETTER_SHARP_S_NATIVE #define LATIN_SMALL_LETTER_Y_WITH_DIAERESIS \ LATIN_SMALL_LETTER_Y_WITH_DIAERESIS_NATIVE |