summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Karl Williamson <khw@cpan.org> 2019-10-02 18:03:26 -0600
committer Karl Williamson <khw@cpan.org> 2019-10-06 11:07:09 -0600
commit 7c88d61e18cab1244ecd155556e1f0b3563a7e4a (patch)
tree 2b1cde8c673ba4aecc210007eaed7192dc81c6ce
parent 1df634280fbf565fc9e9ada123c12a82404aa817 (diff)
download perl-7c88d61e18cab1244ecd155556e1f0b3563a7e4a.tar.gz
Make defn of UTF8_IS_DOWNGRADEABLE_START common
This can be derived from other values, removing an EBCDIC dependency
-rw-r--r-- utf8.h 15
-rw-r--r-- utfebcdic.h 3
2 files changed, 7 insertions, 11 deletions
diff --git a/utf8.h b/utf8.h
index d288905ce1..2402c6fa72 100644
--- a/utf8.h
+++ b/utf8.h
@@ -300,14 +300,6 @@ C<cp> is Unicode if above 255; otherwise is platform-native.
#define UTF8_IS_CONTINUED(c) (__ASSERT_(FITS_IN_8_BITS(c)) \
((U8)((c) | 0)) & UTF_CONTINUATION_MARK)
-/* Is the UTF8-encoded byte 'c' the first byte of a two byte sequence? Use
- * UTF8_IS_NEXT_CHAR_DOWNGRADEABLE() instead if the input isn't known to
- * be well-formed. Masking with 0xfe allows the low bit to be 0 or 1; thus
- * this matches 0xc[23]. The |0 makes sure this isn't mistakenly called with a
- * ptr argument */
-#define UTF8_IS_DOWNGRADEABLE_START(c) (__ASSERT_(FITS_IN_8_BITS(c)) \
- (((U8)((c) | 0)) & 0xfe) == 0xc2)
-
/* This is the number of low-order bits a continuation byte in a UTF-8 encoded
* sequence contributes to the specification of the code point. In the bit
* maps above, you see that the first 2 bits are a constant '10', leaving 6 of
@@ -426,6 +418,13 @@ encoded as UTF-8. C<cp> is a native (ASCII or EBCDIC) code point if less than
#define UTF8_IS_ABOVE_LATIN1(c) (__ASSERT_(FITS_IN_8_BITS(c)) \
(NATIVE_UTF8_TO_I8(c) >= UTF_MIN_ABOVE_LATIN1_BYTE))
+/* Is the UTF8-encoded byte 'c' the first byte of a two byte sequence? Use
+ * UTF8_IS_NEXT_CHAR_DOWNGRADEABLE() instead if the input isn't known to
+ * be well-formed. */
+#define UTF8_IS_DOWNGRADEABLE_START(c) (__ASSERT_(FITS_IN_8_BITS(c)) \
+ inRANGE(NATIVE_UTF8_TO_I8(c), \
+ UTF_MIN_START_BYTE, UTF_MIN_ABOVE_LATIN1_BYTE - 1))
+
/* The largest code point representable by two UTF-8 bytes on this platform.
* As explained in the comments for __COMMON_UNI_SKIP, 32 start bytes with
* UTF_ACCUMULATION_SHIFT bits of information each */
diff --git a/utfebcdic.h b/utfebcdic.h
index 751fa0a9bd..d0cf139ff3 100644
--- a/utfebcdic.h
+++ b/utfebcdic.h
@@ -221,9 +221,6 @@ explicitly forbidden, and the shortest possible encoding should always be used
#define UTF8_IS_CONTINUED(c) cBOOL(FITS_IN_8_BITS(c) \
&& ! (PL_charclass[(U8) (c)] & (_CC_mask(_CC_ASCII) | _CC_mask(_CC_CNTRL))))
-#define UTF8_IS_DOWNGRADEABLE_START(c) _generic_isCC(c, \
- _CC_UTF8_IS_DOWNGRADEABLE_START)
-
#define isUTF8_POSSIBLY_PROBLEMATIC(c) \
_generic_isCC(c, _CC_UTF8_START_BYTE_IS_FOR_AT_LEAST_SURROGATE)