Make defn of UTF8_IS_START common

This can be derived from other values, removing an EBCDIC dependency
author: Karl Williamson <khw@cpan.org> 2019-10-06 10:50:12 -0600
committer: Karl Williamson <khw@cpan.org> 2019-10-06 11:07:09 -0600
commit: 4bab39bc1904f776c12d31a54ff5abe06fc9c103 (patch)
tree: b6a1486f499a9d3e15b8884b508f4ca606ac46d0
parent: f4225fa0e24724a97c2ff1d4e608353ca1537506 (diff)
download: perl-4bab39bc1904f776c12d31a54ff5abe06fc9c103.tar.gz
2 files changed, 10 insertions, 10 deletions
diff --git a/utf8.h b/utf8.h
index acc76fcdef..ead46229db 100644
--- a/utf8.h
+++ b/utf8.h
@@ -300,13 +300,6 @@ C<cp> is Unicode if above 255; otherwise is platform-native.
 #define UTF8_IS_CONTINUED(c)  (__ASSERT_(FITS_IN_8_BITS(c))                 \
                                ((U8)((c) | 0)) &  UTF_CONTINUATION_MARK)
 
-/* Is the byte 'c' the first byte of a multi-byte UTF8-8 encoded sequence?
- * This doesn't catch invariants (they are single-byte).  It also excludes the
- * illegal overlong sequences that begin with C0 and C1.  The |0 makes sure
- * this isn't mistakenly called with a ptr argument */
-#define UTF8_IS_START(c)      (__ASSERT_(FITS_IN_8_BITS(c))                 \
-                               ((U8)((c) | 0)) >= 0xc2)
-
 /* Is the UTF8-encoded byte 'c' the first byte of a two byte sequence?  Use
  * UTF8_IS_NEXT_CHAR_DOWNGRADEABLE() instead if the input isn't known to
  * be well-formed.  Masking with 0xfe allows the low bit to be 0 or 1; thus
@@ -421,6 +414,16 @@ encoded as UTF-8.  C<cp> is a native (ASCII or EBCDIC) code point if less than
  */
 #define UVCHR_SKIP(uv) ( UVCHR_IS_INVARIANT(uv) ? 1 : __BASE_UNI_SKIP(uv))
 
+#define UTF_MIN_START_BYTE                                                  \
+     ((UTF_CONTINUATION_MARK >> UTF_ACCUMULATION_SHIFT) | UTF_START_MARK(2))
+
+/* Is the byte 'c' the first byte of a multi-byte UTF-8 encoded sequence?
+ * This doesn't catch invariants (they are single-byte).  It also excludes the
+ * illegal overlong sequences that begin with C0 and C1 on ASCII platforms, and
+ * C0-C4 I8 start bytes on EBCDIC ones */
+#define UTF8_IS_START(c)    (__ASSERT_(FITS_IN_8_BITS(c))                   \
+                             (NATIVE_UTF8_TO_I8(c) >= UTF_MIN_START_BYTE))
+
 /* The largest code point representable by two UTF-8 bytes on this platform.
  * As explained in the comments for __COMMON_UNI_SKIP, 32 start bytes with
  * UTF_ACCUMULATION_SHIFT bits of information each */
diff --git a/utfebcdic.h b/utfebcdic.h
index ad4df4544f..8fe4bdc143 100644
--- a/utfebcdic.h
+++ b/utfebcdic.h
@@ -217,9 +217,6 @@ explicitly forbidden, and the shortest possible encoding should always be used
  * l1_char_class_tab.h.
  * Comments as to the meaning of each are given at their corresponding utf8.h
  * definitions. */
-
-#define UTF8_IS_START(c)		_generic_isCC(c, _CC_UTF8_IS_START)
-
 /* Equivalent to ! UVCHR_IS_INVARIANT(c) */
 #define UTF8_IS_CONTINUED(c) 		cBOOL(FITS_IN_8_BITS(c)                 \
    && ! (PL_charclass[(U8) (c)] & (_CC_mask(_CC_ASCII) | _CC_mask(_CC_CNTRL))))
author	Karl Williamson <khw@cpan.org>	2019-10-06 10:50:12 -0600
committer	Karl Williamson <khw@cpan.org>	2019-10-06 11:07:09 -0600
commit	4bab39bc1904f776c12d31a54ff5abe06fc9c103 (patch)
tree	b6a1486f499a9d3e15b8884b508f4ca606ac46d0
parent	f4225fa0e24724a97c2ff1d4e608353ca1537506 (diff)
download	perl-4bab39bc1904f776c12d31a54ff5abe06fc9c103.tar.gz