summaryrefslogtreecommitdiff
path: root/utf8.h
diff options
context:
space:
mode:
Diffstat (limited to 'utf8.h')
-rw-r--r--utf8.h68
1 files changed, 42 insertions, 26 deletions
diff --git a/utf8.h b/utf8.h
index e70559e84c..1c8e06b59e 100644
--- a/utf8.h
+++ b/utf8.h
@@ -27,7 +27,7 @@
#include "utfebcdic.h"
-#else
+#else /* ! EBCDIC */
START_EXTERN_C
#ifdef DOINIT
@@ -47,11 +47,9 @@ EXTCONST unsigned char PL_utf8skip[];
#endif
END_EXTERN_C
-#define UTF8SKIP(s) PL_utf8skip[*(const U8*)(s)]
/* Native character to iso-8859-1 */
#define NATIVE_TO_ASCII(ch) (ch)
-#define NATIVE8_TO_UNI(ch) (ch)
#define ASCII_TO_NATIVE(ch) (ch)
/* Transform after encoding */
#define NATIVE_TO_UTF(ch) (ch)
@@ -63,7 +61,7 @@ END_EXTERN_C
#define NATIVE_TO_NEED(enc,ch) (ch)
#define ASCII_TO_NEED(enc,ch) (ch)
-/* As there are no translations avoid the function wrapper */
+/* As there are no translations, avoid the function wrapper */
#define utf8n_to_uvchr utf8n_to_uvuni
#define uvchr_to_utf8 uvuni_to_utf8
@@ -111,8 +109,8 @@ encoded character.
#define UNI_IS_INVARIANT(c) (((UV)c) < 0x80)
-#define UTF8_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE_TO_UTF(c))
-#define NATIVE_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE8_TO_UNI(c))
+/* Note that C0 and C1 are invalid in legal UTF8, so the lower bound of
+ * UTF8_IS_START below should arguably be 0xC2 */
#define UTF8_IS_START(c) (((U8)c) >= 0xc0 && (((U8)c) <= 0xfd))
#define UTF8_IS_CONTINUATION(c) (((U8)c) >= 0x80 && (((U8)c) <= 0xbf))
#define UTF8_IS_CONTINUED(c) (((U8)c) & 0x80)
@@ -124,10 +122,6 @@ encoded character.
#define UTF_CONTINUATION_MARK 0x80
#define UTF_ACCUMULATION_SHIFT 6
#define UTF_CONTINUATION_MASK ((U8)0x3f)
-#define UTF8_ACCUMULATE(old, new) (((old) << UTF_ACCUMULATION_SHIFT) | (((U8)new) & UTF_CONTINUATION_MASK))
-
-#define UTF8_EIGHT_BIT_HI(c) ((((U8)(c))>>UTF_ACCUMULATION_SHIFT)|UTF_START_MARK(2))
-#define UTF8_EIGHT_BIT_LO(c) (((((U8)(c)))&UTF_CONTINUATION_MASK)|UTF_CONTINUATION_MARK)
#ifdef HAS_QUAD
#define UNISKIP(uv) ( (uv) < 0x80 ? 1 : \
@@ -147,24 +141,51 @@ encoded character.
(uv) < 0x80000000 ? 6 : 7 )
#endif
+#endif /* EBCDIC vs ASCII */
+
+/* Rest of these are attributes of Unicode and perl's internals rather than the
+ * encoding, or happen to be the same in both ASCII and EBCDIC (at least at
+ * this level; the macros that some of these call may have different
+ * definitions in the two encodings) */
+
+#define NATIVE8_TO_UNI(ch) NATIVE_TO_ASCII(ch) /* a clearer synonym */
+
+#define UTF8_ACCUMULATE(old, new) (((old) << UTF_ACCUMULATION_SHIFT) | (((U8)new) & UTF_CONTINUATION_MASK))
+
+#define UTF8SKIP(s) PL_utf8skip[*(const U8*)(s)]
+
+#define UTF8_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE_TO_UTF(c))
+#define NATIVE_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE8_TO_UNI(c))
+
+#define MAX_PORTABLE_UTF8_TWO_BYTE 0x3FF /* constrained by EBCDIC */
+
+/* The macros in the next sets are used to generate the two utf8 or utfebcdic
+ * bytes from an ordinal that is known to fit into two bytes; it must be no
+ * larger than MAX_PORTABLE_UTF8_TWO_BYTE to work across both encodings. */
+/* Nocast allows these to be used in the case label of a switch statement */
+#define UTF8_TWO_BYTE_HI_nocast(c) UTF_TO_NATIVE(((c)>>UTF_ACCUMULATION_SHIFT)|UTF_START_MARK(2))
+#define UTF8_TWO_BYTE_LO_nocast(c) UTF_TO_NATIVE(((c)&UTF_CONTINUATION_MASK)|UTF_CONTINUATION_MARK)
+
+#define UTF8_TWO_BYTE_HI(c) ((U8) (UTF8_TWO_BYTE_HI_nocast(c)))
+#define UTF8_TWO_BYTE_LO(c) ((U8) (UTF8_TWO_BYTE_LO_nocast(c)))
+
+/* This name is used when the source is a single byte */
+#define UTF8_EIGHT_BIT_HI(c) UTF8_TWO_BYTE_HI((U8)(c))
+#define UTF8_EIGHT_BIT_LO(c) UTF8_TWO_BYTE_LO((U8)(c))
+
/*
* Note: we try to be careful never to call the isXXX_utf8() functions
- * unless we're pretty sure we've seen the beginning of a UTF-8 character
- * (that is, the two high bits are set). Otherwise we risk loading in the
- * heavy-duty swash_init and swash_fetch routines unnecessarily.
+ * unless we're pretty sure we've seen the beginning of a UTF-8 or UTFEBCDIC
+ * character. Otherwise we risk loading in the heavy-duty swash_init and
+ * swash_fetch routines unnecessarily.
*/
-#define isIDFIRST_lazy_if(p,c) ((IN_BYTES || (!c || (*((const U8*)p) < 0xc0))) \
+#define isIDFIRST_lazy_if(p,c) ((IN_BYTES || (!c || ! UTF8_IS_START(*((const U8*)p)))) \
? isIDFIRST(*(p)) \
: isIDFIRST_utf8((const U8*)p))
-#define isALNUM_lazy_if(p,c) ((IN_BYTES || (!c || (*((const U8*)p) < 0xc0))) \
+#define isALNUM_lazy_if(p,c) ((IN_BYTES || (!c || ! UTF8_IS_START(*((const U8*)p)))) \
? isALNUM(*(p)) \
: isALNUM_utf8((const U8*)p))
-
-#endif /* EBCDIC vs ASCII */
-
-/* Rest of these are attributes of Unicode and perl's internals rather than the encoding */
-
#define isIDFIRST_lazy(p) isIDFIRST_lazy_if(p,1)
#define isALNUM_lazy(p) isALNUM_lazy_if(p,1)
@@ -176,16 +197,11 @@ encoded character.
* as a way to encode non-negative integers in a binary format. */
#define UTF8_MAXLEN UTF8_MAXBYTES
-#define UTF8_MAXLEN_UCLC 3 /* Obsolete, do not use. */
-#define UTF8_MAXLEN_UCLC_MULT 39 /* Obsolete, do not use. */
-#define UTF8_MAXLEN_FOLD 3 /* Obsolete, do not use. */
-#define UTF8_MAXLEN_FOLD_MULT 39 /* Obsolete, do not use. */
-
/* The maximum number of UTF-8 bytes a single Unicode character can
* uppercase/lowercase/fold into; this number depends on the Unicode
* version. An example of maximal expansion is the U+03B0 which
* uppercases to U+03C5 U+0308 U+0301. The Unicode databases that
- * tell these things are UnicodeDatabase.txt, CaseFolding.txt, and
+ * tell these things are UnicodeData.txt, CaseFolding.txt, and
* SpecialCasing.txt. */
#define UTF8_MAXBYTES_CASE 6