diff options
author | Karl Williamson <khw@cpan.org> | 2021-06-16 20:31:07 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2021-08-07 05:14:44 -0600 |
commit | 6f8b1f9311454d2c11cb8a196b1367e9b3933cee (patch) | |
tree | f44c0148da70b7bde53b53ff7361b98d82cef89a /utf8.h | |
parent | bdcc1e93b12b67f35d05618013410ca92713eaf3 (diff) | |
download | perl-6f8b1f9311454d2c11cb8a196b1367e9b3933cee.tar.gz |
utf8.h: Remove EBCDIC dependency
By generalizing a macro, we can make it serve both ASCII and EBCDIC.
Diffstat (limited to 'utf8.h')
-rw-r--r-- | utf8.h | 17 |
1 files changed, 10 insertions, 7 deletions
@@ -255,13 +255,6 @@ are in the character. */
  * for more */
 #define QUESTION_MARK_CTRL DEL_NATIVE
 
-/* Surrogates, non-character code points and above-Unicode code points are
- * problematic in some contexts. This allows code that needs to check for
- * those to quickly exclude the vast majority of code points it will
- * encounter */
-#define isUTF8_POSSIBLY_PROBLEMATIC(c) (__ASSERT_(FITS_IN_8_BITS(c)) \
-                                        (U8) c >= 0xED)
-
 #endif /* EBCDIC vs ASCII */
 
 /* It turns out that in a number of cases, that handling ASCII vs EBCDIC is a
@@ -858,6 +851,16 @@ case any call to string overloading updates the internal UTF-8 encoding flag.
             && _is_in_locale_category(FALSE, -1))) \
     && (! IN_BYTES))
 
+/* Surrogates, non-character code points and above-Unicode code points are
+ * problematic in some contexts. These macros allow code that needs to check
+ * for those to quickly exclude the vast majority of code points it will
+ * encounter.
+ *
+ * The lowest such code point is the smallest surrogate, U+D800. We calculate
+ * the start byte of that. 0xD800 occupies 16 bits. */
+#define isUNICODE_POSSIBLY_PROBLEMATIC(uv) ((uv) >= UNICODE_SURROGATE_FIRST)
+#define isUTF8_POSSIBLY_PROBLEMATIC(c) \
+    (NATIVE_UTF8_TO_I8(c) >= UTF_START_BYTE(UNICODE_SURROGATE_FIRST, 16))
 
 /* Perl extends Unicode so that it is possible to encode (as extended UTF-8 or
  * UTF-EBCDIC) any 64-bit value. No standard known to khw ever encoded higher |