diff options
author | Karl Williamson <khw@cpan.org> | 2019-10-02 20:37:17 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2019-10-06 11:07:09 -0600 |
commit | ab2e28c2f2b8f2edf930448a1c0182a8bd4f469f (patch) | |
tree | 16c1c3220b88f335af9a3d93b43815dcfe1c43ac | |
parent | 2dc97505e86018c7ceba8c96fd84f477c8dd45d3 (diff) | |
download | perl-ab2e28c2f2b8f2edf930448a1c0182a8bd4f469f.tar.gz |
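Make the definition of UVCHR_IS_INVARIANT common
The macro can be derived from other values (OFFUNI_IS_INVARIANT applied to NATIVE_TO_UNI(cp)), removing the separate EBCDIC-specific definition and thus an EBCDIC dependency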
-rw-r--r-- | utf8.h | 26 | ||||
-rw-r--r-- | utfebcdic.h | 12 |
2 files changed, 14 insertions, 24 deletions
@@ -272,19 +272,7 @@ Perl's extended UTF-8 means we can have start bytes up through FF, though any beginning with FF yields a code point that is too large for 32-bit ASCII platforms. FF signals to use 13 bytes for the encoded character. This breaks the paradigm that the number of leading bits gives how many total bytes there -are in the character. - -=for apidoc Am|bool|UVCHR_IS_INVARIANT|UV cp - -Evaluates to 1 if the representation of code point C<cp> is the same whether or -not it is encoded in UTF-8; otherwise evaluates to 0. UTF-8 invariant -characters can be copied as-is when converting to/from UTF-8, saving time. -C<cp> is Unicode if above 255; otherwise is platform-native. - -=cut - */ - -#define UVCHR_IS_INVARIANT(cp) OFFUNI_IS_INVARIANT(cp) +are in the character. */ /* Misleadingly named: is the UTF8-encoded byte 'c' part of a variant sequence * in UTF-8? This is the inverse of UTF8_IS_INVARIANT. The |0 makes sure this @@ -339,6 +327,18 @@ C<cp> is Unicode if above 255; otherwise is platform-native. * UTF-8,EBCDIC */ #define OFFUNI_IS_INVARIANT(c) (((WIDEST_UTYPE)(c)) < UTF_CONTINUATION_MARK) +/* +=for apidoc Am|bool|UVCHR_IS_INVARIANT|UV cp + +Evaluates to 1 if the representation of code point C<cp> is the same whether or +not it is encoded in UTF-8; otherwise evaluates to 0. UTF-8 invariant +characters can be copied as-is when converting to/from UTF-8, saving time. +C<cp> is Unicode if above 255; otherwise is platform-native. + +=cut + */ +#define UVCHR_IS_INVARIANT(cp) (OFFUNI_IS_INVARIANT(NATIVE_TO_UNI(cp))) + /* Internal macro to be used only in this file to aid in constructing other * publicly accessible macros. 
* The number of bytes required to express this uv in UTF-8, for just those diff --git a/utfebcdic.h b/utfebcdic.h index 99a5bad5c3..d52d54a43f 100644 --- a/utfebcdic.h +++ b/utfebcdic.h @@ -202,17 +202,7 @@ possible to UTF-8-encode a single code point in different ways, but that is explicitly forbidden, and the shortest possible encoding should always be used (and that is what Perl does). */ -/* It turns out that on EBCDIC platforms, the invariants are the characters - * that have ASCII equivalents, plus the C1 controls. Since the C0 controls - * and DELETE are ASCII, this is the same as: (isASCII(uv) || isCNTRL_L1(uv)) - * */ -#define UVCHR_IS_INVARIANT(uv) cBOOL(FITS_IN_8_BITS(uv) \ - && (PL_charclass[(U8) (uv)] & (_CC_mask(_CC_ASCII) | _CC_mask(_CC_CNTRL)))) - -/* UTF-EBCDIC semantic macros - We used to transform back into I8 and then - * compare, but now only have to do a single lookup by using a bit in - * l1_char_class_tab.h. - * Comments as to the meaning of each are given at their corresponding utf8.h +/* Comments as to the meaning of each are given at their corresponding utf8.h * definitions. */ /* Equivalent to ! UVCHR_IS_INVARIANT(c) */ #define UTF8_IS_CONTINUED(c) cBOOL(FITS_IN_8_BITS(c) \ |