Make definition of UTF8_IS_ABOVE_LATIN1 common

The macro can be derived from other values (UTF_ACCUMULATION_SHIFT and UTF_START_MARK), which removes the need for a separate EBCDIC-specific definition.
author: Karl Williamson <khw@cpan.org> 2019-10-02 17:56:01 -0600
committer: Karl Williamson <khw@cpan.org> 2019-10-06 11:07:09 -0600
commit: 1df634280fbf565fc9e9ada123c12a82404aa817 (patch)
tree: 76c4eef4e7b829a89307e76fa6c3c57468f76cdc
parent: 4bab39bc1904f776c12d31a54ff5abe06fc9c103 (diff)
download: perl-1df634280fbf565fc9e9ada123c12a82404aa817.tar.gz
2 files changed, 8 insertions, 12 deletions
diff --git a/utf8.h b/utf8.h
index ead46229db..d288905ce1 100644
--- a/utf8.h
+++ b/utf8.h
@@ -308,12 +308,6 @@ C<cp> is Unicode if above 255; otherwise is platform-native.
 #define UTF8_IS_DOWNGRADEABLE_START(c)	(__ASSERT_(FITS_IN_8_BITS(c))       \
                                          (((U8)((c) | 0)) & 0xfe) == 0xc2)
 
-/* Is the UTF8-encoded byte 'c' the first byte of a sequence of bytes that
- * represent a code point > 255?  The |0 makes sure this isn't mistakenly
- * called with a ptr argument */
-#define UTF8_IS_ABOVE_LATIN1(c)     (__ASSERT_(FITS_IN_8_BITS(c))           \
-                                     ((U8)((c) | 0)) >= 0xc4)
-
 /* This is the number of low-order bits a continuation byte in a UTF-8 encoded
  * sequence contributes to the specification of the code point.  In the bit
  * maps above, you see that the first 2 bits are a constant '10', leaving 6 of
@@ -424,6 +418,14 @@ encoded as UTF-8.  C<cp> is a native (ASCII or EBCDIC) code point if less than
 #define UTF8_IS_START(c)    (__ASSERT_(FITS_IN_8_BITS(c))                   \
                              (NATIVE_UTF8_TO_I8(c) >= UTF_MIN_START_BYTE))
 
+#define UTF_MIN_ABOVE_LATIN1_BYTE                                           \
+                    ((0x100 >> UTF_ACCUMULATION_SHIFT) | UTF_START_MARK(2))
+
+/* Is the UTF8-encoded byte 'c' the first byte of a sequence of bytes that
+ * represent a code point > 255? */
+#define UTF8_IS_ABOVE_LATIN1(c)     (__ASSERT_(FITS_IN_8_BITS(c))           \
+                        (NATIVE_UTF8_TO_I8(c) >= UTF_MIN_ABOVE_LATIN1_BYTE))
+
 /* The largest code point representable by two UTF-8 bytes on this platform.
  * As explained in the comments for __COMMON_UNI_SKIP, 32 start bytes with
  * UTF_ACCUMULATION_SHIFT bits of information each */
diff --git a/utfebcdic.h b/utfebcdic.h
index 8fe4bdc143..751fa0a9bd 100644
--- a/utfebcdic.h
+++ b/utfebcdic.h
@@ -224,12 +224,6 @@ explicitly forbidden, and the shortest possible encoding should always be used
 #define UTF8_IS_DOWNGRADEABLE_START(c)   _generic_isCC(c,                       \
                                               _CC_UTF8_IS_DOWNGRADEABLE_START)
 
-/* Equivalent to (UTF8_IS_START(c) && ! UTF8_IS_DOWNGRADEABLE_START(c))
- * Makes sure that the START bit is set and the DOWNGRADEABLE bit isn't */
-#define UTF8_IS_ABOVE_LATIN1(c) cBOOL(FITS_IN_8_BITS(c)                         \
-  && ((PL_charclass[(U8) (c)] & ( _CC_mask(_CC_UTF8_IS_START)                   \
-                                 |_CC_mask(_CC_UTF8_IS_DOWNGRADEABLE_START)))   \
-                        == _CC_mask(_CC_UTF8_IS_START)))
 
 #define isUTF8_POSSIBLY_PROBLEMATIC(c)                                          \
                 _generic_isCC(c, _CC_UTF8_START_BYTE_IS_FOR_AT_LEAST_SURROGATE)
author	Karl Williamson <khw@cpan.org>	2019-10-02 17:56:01 -0600
committer	Karl Williamson <khw@cpan.org>	2019-10-06 11:07:09 -0600
commit	1df634280fbf565fc9e9ada123c12a82404aa817 (patch)
tree	76c4eef4e7b829a89307e76fa6c3c57468f76cdc
parent	4bab39bc1904f776c12d31a54ff5abe06fc9c103 (diff)
download	perl-1df634280fbf565fc9e9ada123c12a82404aa817.tar.gz