1 files changed, 42 insertions, 26 deletions
diff --git a/utf8.h b/utf8.h
index e70559e84c..1c8e06b59e 100644
--- a/utf8.h
+++ b/utf8.h
@@ -27,7 +27,7 @@
 
 #include "utfebcdic.h"
 
-#else
+#else	/* ! EBCDIC */
 START_EXTERN_C
 
 #ifdef DOINIT
@@ -47,11 +47,9 @@ EXTCONST unsigned char PL_utf8skip[];
 #endif
 
 END_EXTERN_C
-#define UTF8SKIP(s) PL_utf8skip[*(const U8*)(s)]
 
 /* Native character to iso-8859-1 */
 #define NATIVE_TO_ASCII(ch)      (ch)
-#define NATIVE8_TO_UNI(ch)        (ch)
 #define ASCII_TO_NATIVE(ch)      (ch)
 /* Transform after encoding */
 #define NATIVE_TO_UTF(ch)        (ch)
@@ -63,7 +61,7 @@ END_EXTERN_C
 #define NATIVE_TO_NEED(enc,ch)   (ch)
 #define ASCII_TO_NEED(enc,ch)    (ch)
 
-/* As there are no translations avoid the function wrapper */
+/* As there are no translations, avoid the function wrapper */
 #define utf8n_to_uvchr utf8n_to_uvuni
 #define uvchr_to_utf8  uvuni_to_utf8
 
@@ -111,8 +109,8 @@ encoded character.
 
 
 #define UNI_IS_INVARIANT(c)		(((UV)c) <  0x80)
-#define UTF8_IS_INVARIANT(c)		UNI_IS_INVARIANT(NATIVE_TO_UTF(c))
-#define NATIVE_IS_INVARIANT(c)		UNI_IS_INVARIANT(NATIVE8_TO_UNI(c))
+/* Note that 0xC0 and 0xC1 are invalid in legal UTF-8, so the lower bound of
+ * the macro below arguably ought to be 0xC2 */
 #define UTF8_IS_START(c)		(((U8)c) >= 0xc0 && (((U8)c) <= 0xfd))
 #define UTF8_IS_CONTINUATION(c)		(((U8)c) >= 0x80 && (((U8)c) <= 0xbf))
 #define UTF8_IS_CONTINUED(c) 		(((U8)c) &  0x80)
@@ -124,10 +122,6 @@ encoded character.
 #define UTF_CONTINUATION_MARK		0x80
 #define UTF_ACCUMULATION_SHIFT		6
 #define UTF_CONTINUATION_MASK		((U8)0x3f)
-#define UTF8_ACCUMULATE(old, new)	(((old) << UTF_ACCUMULATION_SHIFT) | (((U8)new) & UTF_CONTINUATION_MASK))
-
-#define UTF8_EIGHT_BIT_HI(c)	((((U8)(c))>>UTF_ACCUMULATION_SHIFT)|UTF_START_MARK(2))
-#define UTF8_EIGHT_BIT_LO(c)	(((((U8)(c)))&UTF_CONTINUATION_MASK)|UTF_CONTINUATION_MARK)
 
 #ifdef HAS_QUAD
 #define UNISKIP(uv) ( (uv) < 0x80           ? 1 : \
@@ -147,24 +141,51 @@ encoded character.
 		      (uv) < 0x80000000     ? 6 : 7 )
 #endif
 
+#endif /* EBCDIC vs ASCII */
+
+/* Rest of these are attributes of Unicode and perl's internals rather than the
+ * encoding, or happen to be the same in both ASCII and EBCDIC (at least at
+ * this level; the macros that some of these call may have different
+ * definitions in the two encodings) */
+
+#define NATIVE8_TO_UNI(ch)     NATIVE_TO_ASCII(ch)	/* a clearer synonym */
+
+#define UTF8_ACCUMULATE(old, new)	(((old) << UTF_ACCUMULATION_SHIFT) | (((U8)new) & UTF_CONTINUATION_MASK))
+
+#define UTF8SKIP(s) PL_utf8skip[*(const U8*)(s)]
+
+#define UTF8_IS_INVARIANT(c)		UNI_IS_INVARIANT(NATIVE_TO_UTF(c))
+#define NATIVE_IS_INVARIANT(c)		UNI_IS_INVARIANT(NATIVE8_TO_UNI(c))
+
+#define MAX_PORTABLE_UTF8_TWO_BYTE 0x3FF    /* constrained by EBCDIC */
+
+/* The macros in the next sets are used to generate the two utf8 or utfebcdic
+ * bytes from an ordinal that is known to fit into two bytes; it must not exceed
+ * MAX_PORTABLE_UTF8_TWO_BYTE (0x3FF) to work across both encodings. */
+/* Nocast allows these to be used in the case label of a switch statement */
+#define UTF8_TWO_BYTE_HI_nocast(c)	UTF_TO_NATIVE(((c)>>UTF_ACCUMULATION_SHIFT)|UTF_START_MARK(2))
+#define UTF8_TWO_BYTE_LO_nocast(c)	UTF_TO_NATIVE(((c)&UTF_CONTINUATION_MASK)|UTF_CONTINUATION_MARK)
+
+#define UTF8_TWO_BYTE_HI(c)	((U8) (UTF8_TWO_BYTE_HI_nocast(c)))
+#define UTF8_TWO_BYTE_LO(c)	((U8) (UTF8_TWO_BYTE_LO_nocast(c)))
+
+/* This name is used when the source is a single byte */
+#define UTF8_EIGHT_BIT_HI(c)	UTF8_TWO_BYTE_HI((U8)(c))
+#define UTF8_EIGHT_BIT_LO(c)	UTF8_TWO_BYTE_LO((U8)(c))
+
 /*
  * Note: we try to be careful never to call the isXXX_utf8() functions
- * unless we're pretty sure we've seen the beginning of a UTF-8 character
- * (that is, the two high bits are set).  Otherwise we risk loading in the
- * heavy-duty swash_init and swash_fetch routines unnecessarily.
+ * unless we're pretty sure we've seen the beginning of a UTF-8 or UTFEBCDIC
+ * character.  Otherwise we risk loading in the heavy-duty swash_init and
+ * swash_fetch routines unnecessarily.
  */
-#define isIDFIRST_lazy_if(p,c) ((IN_BYTES || (!c || (*((const U8*)p) < 0xc0))) \
+#define isIDFIRST_lazy_if(p,c) ((IN_BYTES || (!c || ! UTF8_IS_START(*((const U8*)p)))) \
 				? isIDFIRST(*(p)) \
 				: isIDFIRST_utf8((const U8*)p))
-#define isALNUM_lazy_if(p,c)   ((IN_BYTES || (!c || (*((const U8*)p) < 0xc0))) \
+#define isALNUM_lazy_if(p,c)   ((IN_BYTES || (!c || ! UTF8_IS_START(*((const U8*)p)))) \
 				? isALNUM(*(p)) \
 				: isALNUM_utf8((const U8*)p))
 
-
-#endif /* EBCDIC vs ASCII */
-
-/* Rest of these are attributes of Unicode and perl's internals rather than the encoding */
-
 #define isIDFIRST_lazy(p)	isIDFIRST_lazy_if(p,1)
 #define isALNUM_lazy(p)		isALNUM_lazy_if(p,1)
 
@@ -176,16 +197,11 @@ encoded character.
  * as a way to encode non-negative integers in a binary format. */
 #define UTF8_MAXLEN UTF8_MAXBYTES
 
-#define UTF8_MAXLEN_UCLC 3		/* Obsolete, do not use. */
-#define UTF8_MAXLEN_UCLC_MULT 39	/* Obsolete, do not use. */
-#define UTF8_MAXLEN_FOLD 3		/* Obsolete, do not use. */
-#define UTF8_MAXLEN_FOLD_MULT 39	/* Obsolete, do not use. */
-
 /* The maximum number of UTF-8 bytes a single Unicode character can
  * uppercase/lowercase/fold into; this number depends on the Unicode
  * version.  An example of maximal expansion is the U+03B0 which
  * uppercases to U+03C5 U+0308 U+0301.  The Unicode databases that
- * tell these things are UnicodeDatabase.txt, CaseFolding.txt, and
+ * tell these things are UnicodeData.txt, CaseFolding.txt, and
  * SpecialCasing.txt. */
 #define UTF8_MAXBYTES_CASE	6