utf8.h: Remove an EBCDIC dependency

A symbol introduced in a previous commit allows this internal macro to only need a single version, suitable for either EBCDIC or ASCII.
author: Karl Williamson <khw@cpan.org> 2021-06-14 06:13:41 -0600
committer: Karl Williamson <khw@cpan.org> 2021-08-07 05:14:43 -0600
commit: 99904f65f9315ad1e8da23f22b561d878bcf524c (patch)
tree: 4694bebf035d2b554b0137b1b398434c9a59693a
parent: 28ca3ab57366a041138756872c2020aca0b98ec8 (diff)
download: perl-99904f65f9315ad1e8da23f22b561d878bcf524c.tar.gz
2 files changed, 19 insertions, 4 deletions
diff --git a/utf8.h b/utf8.h
index 1cb0b6855e..86340adc4a 100644
--- a/utf8.h
+++ b/utf8.h
@@ -274,8 +274,6 @@ are in the character. */
 #define isUTF8_POSSIBLY_PROBLEMATIC(c) (__ASSERT_(FITS_IN_8_BITS(c))        \
                                         (U8) c >= 0xED)
 
-#define UNICODE_IS_PERL_EXTENDED(uv)    UNLIKELY((UV) (uv) > 0x7FFFFFFF)
-
 #endif /* EBCDIC vs ASCII */
 
 /* It turns out that in a number of cases, that handling ASCII vs EBCDIC is a
@@ -764,6 +762,25 @@ case any call to string overloading updates the internal UTF-8 encoding flag.
               && (! IN_BYTES))
 
 
+/* Perl extends Unicode so that it is possible to encode (as extended UTF-8 or
+ * UTF-EBCDIC) any 64-bit value.  No standard known to khw ever encoded higher
+ * than a 31-bit value.  On ASCII platforms this just meant arbitrarily saying
+ * nothing could be higher than this.  On these the start byte FD gets you to
+ * 31 bits, and FE and FF are forbidden as start bytes.  On EBCDIC platforms,
+ * FD gets you only to 26 bits; adding FE to mean 7 total bytes gets you to 30
+ * bits.  To get to 31 bits, they treated an initial FF byte idiosyncratically:
+ * it was considered to be the start byte FE, meaning it had 7 total bytes, and
+ * the final 1 was treated as an information bit, getting you to 31 bits.
+ *
+ * Perl used to accept this idiosyncratic interpretation of FF, but now rejects
+ * it in order to be able to encode 64 bits.  The bottom line is that it is a
+ * Perl extension to use the start bytes FE and FF on ASCII platforms, and the
+ * start byte FF on EBCDIC ones.  In other words, it is a Perl extension to
+ * represent anything occupying more than 31 bits on ASCII platforms, or more
+ * than 30 bits on EBCDIC ones.
+ *
+ * The macro is therefore true when 'uv', cast to UV, exceeds the largest
+ * value representable in that many bits: 0x7FFFFFFF on ASCII platforms;
+ * 0x3FFFFFFF on EBCDIC ones. */
+#define UNICODE_IS_PERL_EXTENDED(uv)                                        \
+          UNLIKELY((UV) (uv) > nBIT_UMAX(31 - ONE_IF_EBCDIC_ZERO_IF_NOT))
+
 #define UTF8_ALLOW_EMPTY		0x0001	/* Allow a zero length string */
 #define UTF8_GOT_EMPTY                  UTF8_ALLOW_EMPTY
 
diff --git a/utfebcdic.h b/utfebcdic.h
index 1b9b35acf1..a9691bb8ef 100644
--- a/utfebcdic.h
+++ b/utfebcdic.h
@@ -222,8 +222,6 @@ explicitly forbidden, and the shortest possible encoding should always be used
  * for more */
 #define QUESTION_MARK_CTRL   LATIN1_TO_NATIVE(0x9F)
 
-#define UNICODE_IS_PERL_EXTENDED(uv)    UNLIKELY((UV) (uv) > 0x3FFFFFFF)
-
 /*
  * ex: set ts=8 sts=4 sw=4 et:
  */
author	Karl Williamson <khw@cpan.org>	2021-06-14 06:13:41 -0600
committer	Karl Williamson <khw@cpan.org>	2021-08-07 05:14:43 -0600
commit	99904f65f9315ad1e8da23f22b561d878bcf524c (patch)
tree	4694bebf035d2b554b0137b1b398434c9a59693a
parent	28ca3ab57366a041138756872c2020aca0b98ec8 (diff)
download	perl-99904f65f9315ad1e8da23f22b561d878bcf524c.tar.gz