diff options
author | Karl Williamson <khw@cpan.org> | 2016-08-26 16:07:22 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2016-08-31 20:32:36 -0600 |
commit | a14e0a36c312aa5bbf90f3971ff3d9ab65c4cda9 (patch) | |
tree | e781cf0490f993a04008826a02696a7a6f7ae328 | |
parent | 5ff889fb14e2876aef87fbed6e39779692ad2aa4 (diff) | |
download | perl-a14e0a36c312aa5bbf90f3971ff3d9ab65c4cda9.tar.gz |
utf8.h, utfebcdic.h: Add comments, align white space
-rw-r--r-- | utf8.h | 46 | ||||
-rw-r--r-- | utfebcdic.h | 30 |
2 files changed, 56 insertions, 20 deletions
@@ -176,24 +176,30 @@ END_EXTERN_C /* - The following table is from Unicode 3.2. + The following table is from Unicode 3.2, plus the Perl extensions for above + U+10FFFF - Code Points 1st Byte 2nd Byte 3rd Byte 4th Byte + Code Points 1st Byte 2nd Byte 3rd 4th 5th 6th 7th 8th-13th U+0000..U+007F 00..7F U+0080..U+07FF * C2..DF 80..BF - U+0800..U+0FFF E0 * A0..BF 80..BF - U+1000..U+CFFF E1..EC 80..BF 80..BF - U+D000..U+D7FF ED 80..9F 80..BF - U+D800..U+DFFF ED A0..BF 80..BF (surrogates) - U+E000..U+FFFF EE..EF 80..BF 80..BF - U+10000..U+3FFFF F0 * 90..BF 80..BF 80..BF - U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF - U+100000..U+10FFFF F4 80..8F 80..BF 80..BF - Below are non-Unicode code points - U+110000..U+13FFFF F4 90..BF 80..BF 80..BF - U+110000..U+1FFFFF F5..F7 80..BF 80..BF 80..BF - U+200000..: F8.. * 88..BF 80..BF 80..BF 80..BF + U+0800..U+0FFF E0 * A0..BF 80..BF + U+1000..U+CFFF E1..EC 80..BF 80..BF + U+D000..U+D7FF ED 80..9F 80..BF + U+D800..U+DFFF ED A0..BF 80..BF (surrogates) + U+E000..U+FFFF EE..EF 80..BF 80..BF + U+10000..U+3FFFF F0 * 90..BF 80..BF 80..BF + U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF + U+100000..U+10FFFF F4 80..8F 80..BF 80..BF + Below are above-Unicode code points + U+110000..U+13FFFF F4 90..BF 80..BF 80..BF + U+140000..U+1FFFFF F5..F7 80..BF 80..BF 80..BF + U+200000..U+FFFFFF F8 * 88..BF 80..BF 80..BF 80..BF +U+1000000..U+3FFFFFF F9..FB 80..BF 80..BF 80..BF 80..BF +U+4000000..U+3FFFFFFF FC * 84..BF 80..BF 80..BF 80..BF 80..BF +U+40000000..U+7FFFFFFF FD 80..BF 80..BF 80..BF 80..BF 80..BF +U+80000000..U+FFFFFFFFF FE * 82..BF 80..BF 80..BF 80..BF 80..BF 80..BF +U+1000000000.. FF 80..BF 80..BF 80..BF 80..BF 80..BF * 81..BF 80..BF Note the gaps before several of the byte entries above marked by '*'. These are caused by legal UTF-8 avoiding non-shortest encodings: it is technically @@ -654,12 +660,14 @@ case any call to string overloading updates the internal UTF-8 encoding flag. 
* BE AWARE that this test doesn't rule out malformed code points, in * particular overlongs */ #ifdef EBCDIC /* Both versions assume well-formed UTF8 */ -# define UTF8_IS_SUPER(s, e) (NATIVE_UTF8_TO_I8(* (U8*) (s)) >= 0xF9 \ - && (NATIVE_UTF8_TO_I8(* (U8*) (s)) > 0xF9 \ - || (NATIVE_UTF8_TO_I8(* ((U8*) (s) + 1)) >= 0xA2))) +# define UTF8_IS_SUPER(s, e) \ + ( NATIVE_UTF8_TO_I8(* (U8*) (s)) >= 0xF9 \ + && ( NATIVE_UTF8_TO_I8(* (U8*) (s)) > 0xF9 \ + || (NATIVE_UTF8_TO_I8(* ((U8*) (s) + 1)) >= 0xA2))) #else -# define UTF8_IS_SUPER(s, e) (*(U8*) (s) >= 0xF4 \ - && (*(U8*) (s) > 0xF4 || (*((U8*) (s) + 1) >= 0x90))) +# define UTF8_IS_SUPER(s, e) \ + ( *(U8*) (s) >= 0xF4 \ + && (*(U8*) (s) > 0xF4 || (*((U8*) (s) + 1) >= 0x90))) #endif /* These are now machine generated, and the 'given' clause is no longer diff --git a/utfebcdic.h b/utfebcdic.h index 10b666afe2..a6ba4fa6a3 100644 --- a/utfebcdic.h +++ b/utfebcdic.h @@ -185,8 +185,36 @@ U+40000000..U+FFFFFFFF ttuuuuuvvvvvwwwwwzzzzzyyyyyxxxxx 11111111 10100000 101000 For 32-bit words, the 2nd through 7th bytes effectively function as leading zeros. 
Above 32 bits, these fill up, with each byte yielding 5 bits of information, so that with 13 continuation bytes, we can handle 65 bits, just -above what a 64 bit word can hold */ +above what a 64 bit word can hold + The following table gives the I8: + + I8 Code Points 1st Byte 2nd Byte 3rd 4th 5th 6th 7th 8th 9th-14th + + 0x0000..0x009F 00..9F + 0x00A0..0x00FF * C5..C7 A0..BF + U+0100..U+03FF C8..DF A0..BF + U+0400..U+3FFF * E1..EF A0..BF A0..BF + U+4000..U+7FFF F0 * B0..BF A0..BF A0..BF + U+8000..U+D7FF F1 A0..B5 A0..BF A0..BF + U+D800..U+DFFF F1 B6..B7 A0..BF A0..BF (surrogates) + U+E000..U+FFFF F1 B8..BF A0..BF A0..BF + U+10000..U+3FFFF F2..F7 A0..BF A0..BF A0..BF + U+40000..U+FFFFF F8 * A8..BF A0..BF A0..BF A0..BF + U+100000..U+10FFFF F9 A0..A1 A0..BF A0..BF A0..BF + Below are above-Unicode code points + U+110000..U+1FFFFF F9 * A2..BF A0..BF A0..BF A0..BF + U+200000..U+3FFFFF FA..FB A0..BF A0..BF A0..BF A0..BF + U+400000..U+1FFFFFF FC * A4..BF A0..BF A0..BF A0..BF A0..BF +U+2000000..U+3FFFFFF FD A0..BF A0..BF A0..BF A0..BF A0..BF +U+4000000..U+3FFFFFFF FE * A2..BF A0..BF A0..BF A0..BF A0..BF A0..BF +U+40000000.. FF A0..BF A0..BF A0..BF A0..BF A0..BF A0..BF * A1..BF A0..BF + +Note the gaps before several of the byte entries above marked by '*'. These are +caused by legal UTF-8 avoiding non-shortest encodings: it is technically +possible to UTF-8-encode a single code point in different ways, but that is +explicitly forbidden, and the shortest possible encoding should always be used +(and that is what Perl does). */ /* This is a fundamental property of UTF-EBCDIC */ #define OFFUNI_IS_INVARIANT(c) (((UV)(c)) < 0xA0) |