diff options
author | Karl Williamson <khw@cpan.org> | 2021-06-14 06:13:41 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2021-08-07 05:14:43 -0600 |
commit | 99904f65f9315ad1e8da23f22b561d878bcf524c (patch) | |
tree | 4694bebf035d2b554b0137b1b398434c9a59693a | |
parent | 28ca3ab57366a041138756872c2020aca0b98ec8 (diff) | |
download | perl-99904f65f9315ad1e8da23f22b561d878bcf524c.tar.gz |
utf8.h: Remove an EBCDIC dependency
A symbol introduced in a previous commit allows this internal macro to
only need a single version, suitable for either EBCDIC or ASCII.
-rw-r--r-- | utf8.h | 21 | ||||
-rw-r--r-- | utfebcdic.h | 2 |
2 files changed, 19 insertions, 4 deletions
@@ -274,8 +274,6 @@ are in the character. */
 #define isUTF8_POSSIBLY_PROBLEMATIC(c) (__ASSERT_(FITS_IN_8_BITS(c)) \
                                        (U8) c >= 0xED)
 
-#define UNICODE_IS_PERL_EXTENDED(uv) UNLIKELY((UV) (uv) > 0x7FFFFFFF)
-
 #endif /* EBCDIC vs ASCII */
 
 /* It turns out that in a number of cases, that handling ASCII vs EBCDIC is a
@@ -764,6 +762,25 @@ case any call to string overloading updates the internal UTF-8 encoding flag.
               && (! IN_BYTES))
 
+/* Perl extends Unicode so that it is possible to encode (as extended UTF-8 or
+ * UTF-EBCDIC) any 64-bit value. No standard known to khw ever encoded higher
+ * than a 31 bit value. On ASCII platforms this just meant arbitrarily saying
+ * nothing could be higher than this. On these the start byte FD gets you to
+ * 31 bits, and FE and FF are forbidden as start bytes. On EBCDIC platforms,
+ * FD gets you only to 26 bits; adding FE to mean 7 total bytes gets you to 30
+ * bits. To get to 31 bits, they treated an initial FF byte idiosyncratically.
+ * It was considered to be the start byte FE meaning it had 7 total bytes, and
+ * the final 1 was treated as an information bit, getting you to 31 bits.
+ *
+ * Perl used to accept this idiosyncratic interpretation of FF, but now rejects
+ * it in order to get to being able to encode 64 bits. The bottom line is that
+ * it is a Perl extension to use the start bytes FE and FF on ASCII platforms,
+ * and the start byte FF on EBCDIC ones. That translates into that it is a
+ * Perl extension to represent anything occupying more than 31 bits on ASCII
+ * platforms; 30 bits on EBCDIC. */
+#define UNICODE_IS_PERL_EXTENDED(uv) \
+          UNLIKELY((UV) (uv) > nBIT_UMAX(31 - ONE_IF_EBCDIC_ZERO_IF_NOT))
+
 
 #define UTF8_ALLOW_EMPTY 0x0001 /* Allow a zero length string */
 #define UTF8_GOT_EMPTY UTF8_ALLOW_EMPTY
 
diff --git a/utfebcdic.h b/utfebcdic.h
index 1b9b35acf1..a9691bb8ef 100644
--- a/utfebcdic.h
+++ b/utfebcdic.h
@@ -222,8 +222,6 @@ explicitly forbidden, and the shortest possible encoding should always be used
  * for more */
 #define QUESTION_MARK_CTRL LATIN1_TO_NATIVE(0x9F)
 
-#define UNICODE_IS_PERL_EXTENDED(uv) UNLIKELY((UV) (uv) > 0x3FFFFFFF)
-
 /*
  * ex: set ts=8 sts=4 sw=4 et:
  */