summary refs log tree commit diff
diff options
context:
space:
mode:
author Karl Williamson <khw@cpan.org> 2021-06-14 06:13:41 -0600
committer Karl Williamson <khw@cpan.org> 2021-08-07 05:14:43 -0600
commit 99904f65f9315ad1e8da23f22b561d878bcf524c (patch)
tree 4694bebf035d2b554b0137b1b398434c9a59693a
parent 28ca3ab57366a041138756872c2020aca0b98ec8 (diff)
download perl-99904f65f9315ad1e8da23f22b561d878bcf524c.tar.gz
utf8.h: Remove an EBCDIC dependency
A symbol introduced in a previous commit allows this internal macro to only need a single version, suitable for either EBCDIC or ASCII.
-rw-r--r-- utf8.h 21
-rw-r--r-- utfebcdic.h 2
2 files changed, 19 insertions, 4 deletions
diff --git a/utf8.h b/utf8.h
index 1cb0b6855e..86340adc4a 100644
--- a/utf8.h
+++ b/utf8.h
@@ -274,8 +274,6 @@ are in the character. */
#define isUTF8_POSSIBLY_PROBLEMATIC(c) (__ASSERT_(FITS_IN_8_BITS(c)) \
(U8) c >= 0xED)
-#define UNICODE_IS_PERL_EXTENDED(uv) UNLIKELY((UV) (uv) > 0x7FFFFFFF)
-
#endif /* EBCDIC vs ASCII */
/* It turns out that in a number of cases, that handling ASCII vs EBCDIC is a
@@ -764,6 +762,25 @@ case any call to string overloading updates the internal UTF-8 encoding flag.
&& (! IN_BYTES))
+/* Perl extends Unicode so that it is possible to encode (as extended UTF-8 or
+ * UTF-EBCDIC) any 64-bit value. No standard known to khw ever encoded higher
+ * than a 31 bit value. On ASCII platforms this just meant arbitrarily saying
+ * nothing could be higher than this. On these the start byte FD gets you to
+ * 31 bits, and FE and FF are forbidden as start bytes. On EBCDIC platforms,
+ * FD gets you only to 26 bits; adding FE to mean 7 total bytes gets you to 30
+ * bits. To get to 31 bits, they treated an initial FF byte idiosyncratically.
+ * It was considered to be the start byte FE meaning it had 7 total bytes, and
+ * the final 1 was treated as an information bit, getting you to 31 bits.
+ *
+ * Perl used to accept this idiosyncratic interpretation of FF, but now rejects
+ * it in order to get to being able to encode 64 bits. The bottom line is that
+ * it is a Perl extension to use the start bytes FE and FF on ASCII platforms,
+ * and the start byte FF on EBCDIC ones. Put another way, it is a
+ * Perl extension to represent anything occupying more than 31 bits on ASCII
+ * platforms, or more than 30 bits on EBCDIC ones. */
+#define UNICODE_IS_PERL_EXTENDED(uv) \
+ UNLIKELY((UV) (uv) > nBIT_UMAX(31 - ONE_IF_EBCDIC_ZERO_IF_NOT))
+
#define UTF8_ALLOW_EMPTY 0x0001 /* Allow a zero length string */
#define UTF8_GOT_EMPTY UTF8_ALLOW_EMPTY
diff --git a/utfebcdic.h b/utfebcdic.h
index 1b9b35acf1..a9691bb8ef 100644
--- a/utfebcdic.h
+++ b/utfebcdic.h
@@ -222,8 +222,6 @@ explicitly forbidden, and the shortest possible encoding should always be used
* for more */
#define QUESTION_MARK_CTRL LATIN1_TO_NATIVE(0x9F)
-#define UNICODE_IS_PERL_EXTENDED(uv) UNLIKELY((UV) (uv) > 0x3FFFFFFF)
-
/*
* ex: set ts=8 sts=4 sw=4 et:
*/