summaryrefslogtreecommitdiff
path: root/utf8.h
diff options
context:
space:
mode:
author Nick Ing-Simmons <nik@tiuk.ti.com> 2001-03-17 18:29:50 +0000
committer Nick Ing-Simmons <nik@tiuk.ti.com> 2001-03-17 18:29:50 +0000
commit 1d72bdf6104ef56ab17c3abedf522be0125851c7 (patch)
tree ee4fd5e4b221b1cb3519855686b6cc7d9077c9d6 /utf8.h
parent d742c382eed38a7010c93d369ad6896d826c21d6 (diff)
download perl-1d72bdf6104ef56ab17c3abedf522be0125851c7.tar.gz
Infrastructure to use UTF-EBCDIC rather than UTF-8 as the internal
encoding on EBCDIC platforms. This has the property that U+0000..U+009F, i.e. a superset of ASCII, are invariant under the encoding. This is EBCDIC-friendly, as an encoded string can be looked at as being EBCDIC by the lexer, sprintf("%d",...) etc., in the same manner that a UTF-8 string can be considered ASCII on ASCII machines. - Re-arrange utf8.h to get the ASCII-specific vs Unicode-generic bits separate. - Add some more macros to comprehend different shift amounts and possible swizzle in UTF-EBCDIC vs UTF-8. Change utf8.c to use them. - Add utfebcdic.h, which provides UTF-EBCDIC versions of the macros, and conditionally #include it. EBCDIC build as yet untested. ASCII still fails the one test. p4raw-id: //depot/perlio@9185
Diffstat (limited to 'utf8.h')
-rw-r--r--utf8.h137
1 files changed, 69 insertions, 68 deletions
diff --git a/utf8.h b/utf8.h
index 67b3d53487..a60639720f 100644
--- a/utf8.h
+++ b/utf8.h
@@ -7,6 +7,13 @@
*
*/
+#ifdef EBCDIC
+/* The equivalent of these macros but implementing UTF-EBCDIC
+ are in the following header file:
+ */
+
+#include "utfebcdic.h"
+#else
START_EXTERN_C
#ifdef DOINIT
@@ -26,47 +33,20 @@ EXTCONST unsigned char PL_utf8skip[];
#endif
END_EXTERN_C
-
-#define UTF8_MAXLEN 13 /* how wide can a single UTF8 encoded character become */
-
-/* #define IN_UTF8 (PL_curcop->op_private & HINT_UTF8) */
-#define IN_BYTE (PL_curcop->op_private & HINT_BYTE)
-#ifdef USE_BYTES_DOWNGRADES
-#define DO_UTF8(sv) (SvUTF8(sv) && !(IN_BYTE && sv_utf8_downgrade(sv,0)))
-#else
-#define DO_UTF8(sv) (SvUTF8(sv) && !IN_BYTE)
-#endif
-
-#define UTF8_ALLOW_EMPTY 0x0001
-#define UTF8_ALLOW_CONTINUATION 0x0002
-#define UTF8_ALLOW_NON_CONTINUATION 0x0004
-#define UTF8_ALLOW_FE_FF 0x0008
-#define UTF8_ALLOW_SHORT 0x0010
-#define UTF8_ALLOW_SURROGATE 0x0020
-#define UTF8_ALLOW_BOM 0x0040
-#define UTF8_ALLOW_FFFF 0x0080
-#define UTF8_ALLOW_LONG 0x0100
-#define UTF8_ALLOW_ANYUV (UTF8_ALLOW_EMPTY|UTF8_ALLOW_FE_FF|\
- UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|\
- UTF8_ALLOW_FFFF|UTF8_ALLOW_LONG)
-#define UTF8_ALLOW_ANY 0x00ff
-#define UTF8_CHECK_ONLY 0x0100
-
-#define UNICODE_SURROGATE_FIRST 0xd800
-#define UNICODE_SURROGATE_LAST 0xdfff
-#define UNICODE_REPLACEMENT 0xfffd
-#define UNICODE_BYTER_ORDER_MARK 0xfffe
-#define UNICODE_ILLEGAL 0xffff
-
-#define UNICODE_IS_SURROGATE(c) ((c) >= UNICODE_SURROGATE_FIRST && \
- (c) <= UNICODE_SURROGATE_LAST)
-#define UNICODE_IS_REPLACEMENT(c) ((c) == UNICODE_REPLACMENT)
-#define UNICODE_IS_BYTE_ORDER_MARK(c) ((c) == UNICODE_BYTER_ORDER_MARK)
-#define UNICODE_IS_ILLEGAL(c) ((c) == UNICODE_ILLEGAL)
-
#define UTF8SKIP(s) PL_utf8skip[*(U8*)s]
-#define UTF8_QUAD_MAX UINT64_C(0x1000000000)
+/* Native character to iso-8859-1 */
+#define NATIVE_TO_ASCII(ch) (ch)
+#define ASCII_TO_NATIVE(ch) (ch)
+/* Transform after encoding */
+#define NATIVE_TO_UTF(ch) (ch)
+#define UTF_TO_NATIVE(ch) (ch)
+/* Transforms in wide UV chars */
+#define UNI_TO_NATIVE(ch) (ch)
+#define NATIVE_TO_UNI(ch) (ch)
+/* Transforms in invariant space */
+#define NATIVE_TO_NEED(enc,ch) (ch)
+#define ASCII_TO_NEED(enc,ch) (ch)
/*
@@ -84,18 +64,22 @@ END_EXTERN_C
*/
-#define UTF8_IS_ASCII(c) (((U8)c) < 0x80)
+#define UTF8_IS_INVARIANT(c) (((UV)c) < 0x80)
#define UTF8_IS_START(c) (((U8)c) >= 0xc0 && (((U8)c) <= 0xfd))
#define UTF8_IS_CONTINUATION(c) (((U8)c) >= 0x80 && (((U8)c) <= 0xbf))
#define UTF8_IS_CONTINUED(c) (((U8)c) & 0x80)
#define UTF8_IS_DOWNGRADEABLE_START(c) (((U8)c & 0xfc) == 0xc0)
-#define UTF8_CONTINUATION_MASK ((U8)0x3f)
-#define UTF8_ACCUMULATION_SHIFT 6
-#define UTF8_ACCUMULATE(old, new) ((old) << UTF8_ACCUMULATION_SHIFT | (((U8)new) & UTF8_CONTINUATION_MASK))
+#define UTF_START_MARK(len) ((len > 7) ? 0xFF : (0xFE << (7-len)))
+#define UTF_START_MASK(len) ((len >= 7) ? 0x00 : (0x1F >> (len-2)))
+
+#define UTF_CONTINUATION_MARK 0x80
+#define UTF_ACCUMULATION_SHIFT 6
+#define UTF_CONTINUATION_MASK ((U8)0x3f)
+#define UTF8_ACCUMULATE(old, new) ((old) << UTF_ACCUMULATION_SHIFT | (((U8)new) & UTF_CONTINUATION_MASK))
-#define UTF8_EIGHT_BIT_HI(c) ( (((U8)(c))>>6) |0xc0)
-#define UTF8_EIGHT_BIT_LO(c) (((((U8)(c)) )&0x3f)|0x80)
+#define UTF8_EIGHT_BIT_HI(c) ((((U8)(c))>>UTF_ACCUMULATION_SHIFT)|UTF_START_MARK(2))
+#define UTF8_EIGHT_BIT_LO(c) (((((U8)(c)))&UTF_CONTINUATION_MASK)|UTF_CONTINUATION_MARK)
#ifdef HAS_QUAD
#define UNISKIP(uv) ( (uv) < 0x80 ? 1 : \
@@ -115,43 +99,60 @@ END_EXTERN_C
(uv) < 0x80000000 ? 6 : 7 )
#endif
-
/*
* Note: we try to be careful never to call the isXXX_utf8() functions
* unless we're pretty sure we've seen the beginning of a UTF-8 character
* (that is, the two high bits are set). Otherwise we risk loading in the
* heavy-duty SWASHINIT and SWASHGET routines unnecessarily.
*/
-#ifdef EBCDIC
-#define isIDFIRST_lazy_if(p,c) isIDFIRST(*(p))
-#define isALNUM_lazy_if(p,c) isALNUM(*(p))
-#else
#define isIDFIRST_lazy_if(p,c) ((IN_BYTE || (!c || (*((U8*)p) < 0xc0))) \
? isIDFIRST(*(p)) \
: isIDFIRST_utf8((U8*)p))
#define isALNUM_lazy_if(p,c) ((IN_BYTE || (!c || (*((U8*)p) < 0xc0))) \
? isALNUM(*(p)) \
: isALNUM_utf8((U8*)p))
-#endif
+
+
+#endif /* EBCDIC vs ASCII */
+
+/* Rest of these are attributes of Unicode and perl's internals rather than the encoding */
+
#define isIDFIRST_lazy(p) isIDFIRST_lazy_if(p,1)
#define isALNUM_lazy(p) isALNUM_lazy_if(p,1)
-/* EBCDIC-happy ways of converting native code to UTF8 */
+#define UTF8_MAXLEN 13 /* how wide can a single UTF8 encoded character become */
-#ifdef EBCDIC
-#define NATIVE_TO_ASCII(ch) PL_e2a[(ch)&255]
-#define ASCII_TO_NATIVE(ch) PL_a2e[(ch)&255]
-#define NATIVE_TO_UNI(ch) (((ch) > 255) ? (ch) : (UV) PL_e2a[(ch)])
-#define UNI_TO_NATIVE(ch) (((ch) > 255) ? (ch) : (UV) PL_a2e[(ch)])
-#define NATIVE_TO_NEED(enc,ch) ((enc) ? NATIVE_TO_ASCII(ch) : (ch))
-#define ASCII_TO_NEED(enc,ch) ((enc) ? (ch) : ASCII_TO_NATIVE(ch))
-#else
-#define NATIVE_TO_ASCII(ch) (ch)
-#define ASCII_TO_NATIVE(ch) (ch)
-#define UNI_TO_NATIVE(ch) (ch)
-#define NATIVE_TO_UNI(ch) (ch)
-#define NATIVE_TO_NEED(enc,ch) (ch)
-#define ASCII_TO_NEED(enc,ch) (ch)
-#endif
+/* #define IN_UTF8 (PL_curcop->op_private & HINT_UTF8) */
+#define IN_BYTE (PL_curcop->op_private & HINT_BYTE)
+#define DO_UTF8(sv) (SvUTF8(sv) && !IN_BYTE)
+
+#define UTF8_ALLOW_EMPTY 0x0001
+#define UTF8_ALLOW_CONTINUATION 0x0002
+#define UTF8_ALLOW_NON_CONTINUATION 0x0004
+#define UTF8_ALLOW_FE_FF 0x0008
+#define UTF8_ALLOW_SHORT 0x0010
+#define UTF8_ALLOW_SURROGATE 0x0020
+#define UTF8_ALLOW_BOM 0x0040
+#define UTF8_ALLOW_FFFF 0x0080
+#define UTF8_ALLOW_LONG 0x0100
+#define UTF8_ALLOW_ANYUV (UTF8_ALLOW_EMPTY|UTF8_ALLOW_FE_FF|\
+ UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|\
+ UTF8_ALLOW_FFFF|UTF8_ALLOW_LONG)
+#define UTF8_ALLOW_ANY 0x00ff
+#define UTF8_CHECK_ONLY 0x0100
+
+#define UNICODE_SURROGATE_FIRST 0xd800
+#define UNICODE_SURROGATE_LAST 0xdfff
+#define UNICODE_REPLACEMENT 0xfffd
+#define UNICODE_BYTER_ORDER_MARK 0xfffe
+#define UNICODE_ILLEGAL 0xffff
+
+#define UNICODE_IS_SURROGATE(c) ((c) >= UNICODE_SURROGATE_FIRST && \
+ (c) <= UNICODE_SURROGATE_LAST)
+#define UNICODE_IS_REPLACEMENT(c) ((c) == UNICODE_REPLACMENT)
+#define UNICODE_IS_BYTE_ORDER_MARK(c) ((c) == UNICODE_BYTER_ORDER_MARK)
+#define UNICODE_IS_ILLEGAL(c) ((c) == UNICODE_ILLEGAL)
+
+#define UTF8_QUAD_MAX UINT64_C(0x1000000000)
-#define UTF8_IS_INVARIANT(c) UTF8_IS_ASCII(c)
+#define UTF8_IS_ASCII(c) UTF8_IS_INVARIANT(c)