diff options
Diffstat (limited to 'utf8.h')
-rw-r--r-- | utf8.h | 68 |
1 files changed, 42 insertions, 26 deletions
@@ -27,7 +27,7 @@ #include "utfebcdic.h" -#else +#else /* ! EBCDIC */ START_EXTERN_C #ifdef DOINIT @@ -47,11 +47,9 @@ EXTCONST unsigned char PL_utf8skip[]; #endif END_EXTERN_C -#define UTF8SKIP(s) PL_utf8skip[*(const U8*)(s)] /* Native character to iso-8859-1 */ #define NATIVE_TO_ASCII(ch) (ch) -#define NATIVE8_TO_UNI(ch) (ch) #define ASCII_TO_NATIVE(ch) (ch) /* Transform after encoding */ #define NATIVE_TO_UTF(ch) (ch) @@ -63,7 +61,7 @@ END_EXTERN_C #define NATIVE_TO_NEED(enc,ch) (ch) #define ASCII_TO_NEED(enc,ch) (ch) -/* As there are no translations avoid the function wrapper */ +/* As there are no translations, avoid the function wrapper */ #define utf8n_to_uvchr utf8n_to_uvuni #define uvchr_to_utf8 uvuni_to_utf8 @@ -111,8 +109,8 @@ encoded character. #define UNI_IS_INVARIANT(c) (((UV)c) < 0x80) -#define UTF8_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE_TO_UTF(c)) -#define NATIVE_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE8_TO_UNI(c)) +/* Note that C0 and C1 are invalid in legal UTF8, so the lower bound of the + * below might ought to be C2 */ #define UTF8_IS_START(c) (((U8)c) >= 0xc0 && (((U8)c) <= 0xfd)) #define UTF8_IS_CONTINUATION(c) (((U8)c) >= 0x80 && (((U8)c) <= 0xbf)) #define UTF8_IS_CONTINUED(c) (((U8)c) & 0x80) @@ -124,10 +122,6 @@ encoded character. #define UTF_CONTINUATION_MARK 0x80 #define UTF_ACCUMULATION_SHIFT 6 #define UTF_CONTINUATION_MASK ((U8)0x3f) -#define UTF8_ACCUMULATE(old, new) (((old) << UTF_ACCUMULATION_SHIFT) | (((U8)new) & UTF_CONTINUATION_MASK)) - -#define UTF8_EIGHT_BIT_HI(c) ((((U8)(c))>>UTF_ACCUMULATION_SHIFT)|UTF_START_MARK(2)) -#define UTF8_EIGHT_BIT_LO(c) (((((U8)(c)))&UTF_CONTINUATION_MASK)|UTF_CONTINUATION_MARK) #ifdef HAS_QUAD #define UNISKIP(uv) ( (uv) < 0x80 ? 1 : \ @@ -147,24 +141,51 @@ encoded character. (uv) < 0x80000000 ? 
6 : 7 ) #endif +#endif /* EBCDIC vs ASCII */ + +/* Rest of these are attributes of Unicode and perl's internals rather than the + * encoding, or happen to be the same in both ASCII and EBCDIC (at least at + * this level; the macros that some of these call may have different + * definitions in the two encodings) */ + +#define NATIVE8_TO_UNI(ch) NATIVE_TO_ASCII(ch) /* a clearer synonym */ + +#define UTF8_ACCUMULATE(old, new) (((old) << UTF_ACCUMULATION_SHIFT) | (((U8)new) & UTF_CONTINUATION_MASK)) + +#define UTF8SKIP(s) PL_utf8skip[*(const U8*)(s)] + +#define UTF8_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE_TO_UTF(c)) +#define NATIVE_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE8_TO_UNI(c)) + +#define MAX_PORTABLE_UTF8_TWO_BYTE 0x3FF /* constrained by EBCDIC */ + +/* The macros in the next sets are used to generate the two utf8 or utfebcdic + * bytes from an ordinal that is known to fit into two bytes; it must be no + * larger than 0x3FF to work across both encodings. */ +/* Nocast allows these to be used in the case label of a switch statement */ +#define UTF8_TWO_BYTE_HI_nocast(c) UTF_TO_NATIVE(((c)>>UTF_ACCUMULATION_SHIFT)|UTF_START_MARK(2)) +#define UTF8_TWO_BYTE_LO_nocast(c) UTF_TO_NATIVE(((c)&UTF_CONTINUATION_MASK)|UTF_CONTINUATION_MARK) + +#define UTF8_TWO_BYTE_HI(c) ((U8) (UTF8_TWO_BYTE_HI_nocast(c))) +#define UTF8_TWO_BYTE_LO(c) ((U8) (UTF8_TWO_BYTE_LO_nocast(c))) + +/* This name is used when the source is a single byte */ +#define UTF8_EIGHT_BIT_HI(c) UTF8_TWO_BYTE_HI((U8)(c)) +#define UTF8_EIGHT_BIT_LO(c) UTF8_TWO_BYTE_LO((U8)(c)) + /* * Note: we try to be careful never to call the isXXX_utf8() functions - * unless we're pretty sure we've seen the beginning of a UTF-8 character - * (that is, the two high bits are set). Otherwise we risk loading in the - * heavy-duty swash_init and swash_fetch routines unnecessarily. + * unless we're pretty sure we've seen the beginning of a UTF-8 or UTFEBCDIC + * character. 
Otherwise we risk loading in the heavy-duty swash_init and + * swash_fetch routines unnecessarily. */ -#define isIDFIRST_lazy_if(p,c) ((IN_BYTES || (!c || (*((const U8*)p) < 0xc0))) \ +#define isIDFIRST_lazy_if(p,c) ((IN_BYTES || (!c || ! UTF8_IS_START(*((const U8*)p)))) \ ? isIDFIRST(*(p)) \ : isIDFIRST_utf8((const U8*)p)) -#define isALNUM_lazy_if(p,c) ((IN_BYTES || (!c || (*((const U8*)p) < 0xc0))) \ +#define isALNUM_lazy_if(p,c) ((IN_BYTES || (!c || ! UTF8_IS_START(*((const U8*)p)))) \ ? isALNUM(*(p)) \ : isALNUM_utf8((const U8*)p)) - -#endif /* EBCDIC vs ASCII */ - -/* Rest of these are attributes of Unicode and perl's internals rather than the encoding */ - #define isIDFIRST_lazy(p) isIDFIRST_lazy_if(p,1) #define isALNUM_lazy(p) isALNUM_lazy_if(p,1) @@ -176,16 +197,11 @@ encoded character. * as a way to encode non-negative integers in a binary format. */ #define UTF8_MAXLEN UTF8_MAXBYTES -#define UTF8_MAXLEN_UCLC 3 /* Obsolete, do not use. */ -#define UTF8_MAXLEN_UCLC_MULT 39 /* Obsolete, do not use. */ -#define UTF8_MAXLEN_FOLD 3 /* Obsolete, do not use. */ -#define UTF8_MAXLEN_FOLD_MULT 39 /* Obsolete, do not use. */ - /* The maximum number of UTF-8 bytes a single Unicode character can * uppercase/lowercase/fold into; this number depends on the Unicode * version. An example of maximal expansion is the U+03B0 which * uppercases to U+03C5 U+0308 U+0301. The Unicode databases that - * tell these things are UnicodeDatabase.txt, CaseFolding.txt, and + * tell these things are UnicodeData.txt, CaseFolding.txt, and * SpecialCasing.txt. */ #define UTF8_MAXBYTES_CASE 6 |