diff options
author | Karl Williamson <khw@cpan.org> | 2019-10-02 17:07:50 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2019-10-06 11:07:08 -0600 |
commit | 38f458ffd56c0eb9f5df18cb6693ca326a4b1374 (patch) | |
tree | ff10eba5cd482d48aae5bd1ff93a3ba92a7bb5e0 | |
parent | 9f3cfb7a26dab519dbc83ef02bd3fbf084cb6fc3 (diff) | |
download | perl-38f458ffd56c0eb9f5df18cb6693ca326a4b1374.tar.gz |
Make the definition of UTF_CONTINUATION_MARK common

This can be derived from other values, removing an EBCDIC dependency.
-rw-r--r-- | utf8.h | 10 | ||||
-rw-r--r-- | utfebcdic.h | 2 |
2 files changed, 6 insertions, 6 deletions
@@ -294,10 +294,6 @@ C<cp> is Unicode if above 255; otherwise is platform-native.
 #define UVCHR_IS_INVARIANT(cp) OFFUNI_IS_INVARIANT(cp)
-/* This defines the bits that are to be in the continuation bytes of a multi-byte
- * UTF-8 encoded character that mark it is a continuation byte. */
-#define UTF_CONTINUATION_MARK 0x80
-
 /* Misleadingly named: is the UTF8-encoded byte 'c' part of a variant sequence
  * in UTF-8? This is the inverse of UTF8_IS_INVARIANT. The |0 makes sure this
  * isn't mistakenly called with a ptr argument */
@@ -361,6 +357,12 @@ C<cp> is Unicode if above 255; otherwise is platform-native.
  * E0 in UTF-EBCDIC */
 #define UTF_IS_CONTINUATION_MASK ((U8) (0xFF << UTF_ACCUMULATION_SHIFT))
+/* This defines the bits that are to be in the continuation bytes of a
+ * multi-byte UTF-8 encoded character that mark it is a continuation byte.
+ * This turns out to be 0x80 in UTF-8, 0xA0 in UTF-EBCDIC. (khw doesn't know
+ * the underlying reason that B0 works here) */
+#define UTF_CONTINUATION_MARK (UTF_IS_CONTINUATION_MASK & 0xB0)
+
 /* Internal macro to be used only in this file to aid in constructing other
  * publicly accessible macros.
  * The number of bytes required to express this uv in UTF-8, for just those
diff --git a/utfebcdic.h b/utfebcdic.h
index f13f555f49..7200599532 100644
--- a/utfebcdic.h
+++ b/utfebcdic.h
@@ -245,9 +245,7 @@ explicitly forbidden, and the shortest possible encoding should always be used
 #define isUTF8_POSSIBLY_PROBLEMATIC(c) \
     _generic_isCC(c, _CC_UTF8_START_BYTE_IS_FOR_AT_LEAST_SURROGATE)
-#define UTF_CONTINUATION_MARK 0xA0
 #define UTF_ACCUMULATION_SHIFT 5
-
 /* ^? is defined to be APC on EBCDIC systems. See the definition of toCTRL()
  * for more */
 #define QUESTION_MARK_CTRL LATIN1_TO_NATIVE(0x9F)