utf8.h: Remove EBCDIC dependency

By generalizing a macro, we can make it serve both ASCII and EBCDIC.
author: Karl Williamson <khw@cpan.org> 2021-06-16 20:31:07 -0600
committer: Karl Williamson <khw@cpan.org> 2021-08-07 05:14:44 -0600
commit: 6f8b1f9311454d2c11cb8a196b1367e9b3933cee (patch)
tree: f44c0148da70b7bde53b53ff7361b98d82cef89a /utf8.h
parent: bdcc1e93b12b67f35d05618013410ca92713eaf3 (diff)
download: perl-6f8b1f9311454d2c11cb8a196b1367e9b3933cee.tar.gz
1 files changed, 10 insertions, 7 deletions
diff --git a/utf8.h b/utf8.h
index 9deae74b23..76198ef91d 100644
--- a/utf8.h
+++ b/utf8.h
@@ -255,13 +255,6 @@ are in the character. */
  * for more */
 #define QUESTION_MARK_CTRL  DEL_NATIVE
 
-/* Surrogates, non-character code points and above-Unicode code points are
- * problematic in some contexts.  This allows code that needs to check for
- * those to quickly exclude the vast majority of code points it will
- * encounter */
-#define isUTF8_POSSIBLY_PROBLEMATIC(c) (__ASSERT_(FITS_IN_8_BITS(c))        \
-                                        (U8) c >= 0xED)
-
 #endif /* EBCDIC vs ASCII */
 
 /* It turns out that in a number of cases, that handling ASCII vs EBCDIC is a
@@ -858,6 +851,16 @@ case any call to string overloading updates the internal UTF-8 encoding flag.
                        && _is_in_locale_category(FALSE, -1)))           \
               && (! IN_BYTES))
 
+/* Surrogates, non-character code points and above-Unicode code points are
+ * problematic in some contexts.  These macros allow code that needs to check
+ * for those to quickly exclude the vast majority of code points it will
+ * encounter: the first tests a code point, the second a UTF-8 byte.
+ *
+ * The lowest such code point is the smallest surrogate, U+D800, so the byte
+ * test compares against the start byte of U+D800, which occupies 16 bits. */
+#define isUNICODE_POSSIBLY_PROBLEMATIC(uv) ((uv) >= UNICODE_SURROGATE_FIRST)
+#define isUTF8_POSSIBLY_PROBLEMATIC(c)                                      \
+    (NATIVE_UTF8_TO_I8(c) >= UTF_START_BYTE(UNICODE_SURROGATE_FIRST, 16))
 
 /* Perl extends Unicode so that it is possible to encode (as extended UTF-8 or
  * UTF-EBCDIC) any 64-bit value.  No standard known to khw ever encoded higher
author	Karl Williamson <khw@cpan.org>	2021-06-16 20:31:07 -0600
committer	Karl Williamson <khw@cpan.org>	2021-08-07 05:14:44 -0600
commit	6f8b1f9311454d2c11cb8a196b1367e9b3933cee (patch)
tree	f44c0148da70b7bde53b53ff7361b98d82cef89a /utf8.h
parent	bdcc1e93b12b67f35d05618013410ca92713eaf3 (diff)
download	perl-6f8b1f9311454d2c11cb8a196b1367e9b3933cee.tar.gz