diff options
-rw-r--r-- | utf8.h | 13 | ||||
-rw-r--r-- | utfebcdic.h | 7 |
2 files changed, 13 insertions, 7 deletions
@@ -356,8 +356,9 @@ encoded as UTF-8. C<cp> is a native (ASCII or EBCDIC) code point if less than #define NATIVE8_TO_UNI(ch) NATIVE_TO_LATIN1(ch) /* This defines the 1-bits that are to be in the first byte of a multi-byte - * UTF-8 encoded character that give the number of bytes that comprise the - * character. 'len' is the number of bytes in the multi-byte sequence. */ + * UTF-8 encoded character that mark it as a start byte and give the number of + * bytes that comprise the character. 'len' is the number of bytes in the + * multi-byte sequence. */ #define UTF_START_MARK(len) (((len) > 7) ? 0xFF : (0xFF & (0xFE << (7-(len))))) /* Masks out the initial one bits in a start byte, leaving the real data ones. @@ -509,11 +510,11 @@ only) byte is pointed to by C<s>. * beginning of a utf8 character. Now that foo_utf8() determines that itself, * no need to do it again here */ -#define isIDFIRST_lazy_if(p,UTF) ((IN_BYTES || !UTF ) \ - ? isIDFIRST(*(p)) \ +#define isIDFIRST_lazy_if(p,UTF) ((IN_BYTES || !UTF) \ + ? isIDFIRST(*(p)) \ : isIDFIRST_utf8((const U8*)p)) -#define isWORDCHAR_lazy_if(p,UTF) ((IN_BYTES || (!UTF )) \ - ? isWORDCHAR(*(p)) \ +#define isWORDCHAR_lazy_if(p,UTF) ((IN_BYTES || (!UTF)) \ + ? isWORDCHAR(*(p)) \ : isWORDCHAR_utf8((const U8*)p)) #define isALNUM_lazy_if(p,UTF) isWORDCHAR_lazy_if(p,UTF) diff --git a/utfebcdic.h b/utfebcdic.h index 97c0c9d1c6..3a4fcc28af 100644 --- a/utfebcdic.h +++ b/utfebcdic.h @@ -41,7 +41,11 @@ * PL_utf2e, with its inverse being PL_e2utf. They are constructed so that * all EBCDIC invariants remain invariant, but no others do, and the first * byte of a variant will always have its upper bit set. But note that - * the upper bit of some invariants is also 1. + * the upper bit of some invariants is also 1. The table also is designed + * so that lexically comparing two UTF-EBCDIC-variant characters yields + * the Unicode code point order. (To get native code point order, one has + * to convert the latin1-range characters to their native code point + * value.) * * For example, the ordinal value of 'A' is 193 in EBCDIC, and also is 193 in * UTF-EBCDIC. Step 1) converts it to 65, Step 2 leaves it at 65, and Step 3 @@ -184,6 +188,7 @@ information, so that with 13 continuation bytes, we can handle 65 bits, just above what a 64 bit word can hold */ +/* This is a fundamental property of UTF-EBCDIC */ #define OFFUNI_IS_INVARIANT(c) (((UV)(c)) < 0xA0) /* It turns out that on EBCDIC platforms, the invariants are the characters |