diff options
author | Karl Williamson <public@khwilliamson.com> | 2013-02-14 10:54:32 -0700 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2013-02-25 14:57:50 -0700 |
commit | 537124e4032962cd7c5f3bd4f0ee7995cd79e8ec (patch) | |
tree | e37330ba1f315956dc962f0c1f3217c6327c81d9 | |
parent | 2550367793db9f9f86124a38dc944f949b315d84 (diff) | |
download | perl-537124e4032962cd7c5f3bd4f0ee7995cd79e8ec.tar.gz |
Add, fix comments
-rw-r--r-- | lib/unicore/mktables | 4 |
-rw-r--r-- | regcomp.c | 7 |
-rw-r--r-- | toke.c | 3 |
-rw-r--r-- | utf8.c | 15 |
-rw-r--r-- | utf8.h | 7 |
-rw-r--r-- | utfebcdic.h | 12 |
6 files changed, 30 insertions, 18 deletions
diff --git a/lib/unicore/mktables b/lib/unicore/mktables index c364c838cc..808760d002 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -2042,10 +2042,10 @@ package Input_file; # basically be a while(next_line()) {...} loop. # # You can also set up handlers to -# 1) call before the first line is read for pre processing +# 1) call before the first line is read, for pre processing # 2) call to adjust each line of the input before the main handler gets them # 3) call upon EOF before the main handler exits its loop -# 4) call at the end for post processing +# 4) call at the end, for post processing # # $_ is used to store the input line, and is to be filtered by the # each_line_handler()s. So, if the format of the line is not in the desired @@ -15299,8 +15299,11 @@ S_put_byte(pTHX_ SV *sv, int c) EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all ones (binary 1111 1111, hexadecimal FF). It is similar, but not - identical, to the ASCII delete (DEL) or rubout control character. - ) So the old condition can be simplified to !isPRINT(c) */ + identical, to the ASCII delete (DEL) or rubout control character. ... 
+ it is typically mapped to hexadecimal code 9F, in order to provide a + unique character mapping in both directions) + + So the old condition can be simplified to !isPRINT(c) */ if (!isPRINT(c)) { if (c < 256) { Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c); @@ -2863,7 +2863,8 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e) In patterns: expand: - \N{ABC} => \N{U+41.42.43} + \N{FOO} => \N{U+hex_for_character_FOO} + (if FOO expands to multiple characters, expands to \N{U+xx.XX.yy ...}) pass through: all other \-char, including \N and \N{ apart from \N{ABC} @@ -90,7 +90,7 @@ Perl_is_ascii_string(const U8 *s, STRLEN len) /* =for apidoc uvuni_to_utf8_flags -Adds the UTF-8 representation of the code point C<uv> to the end +Adds the UTF-8 representation of the Unicode code point C<uv> to the end of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free bytes available. The return value is the pointer to the byte after the end of the new character. In other words, @@ -109,6 +109,10 @@ This is the recommended Unicode-aware way of saying *(d++) = uv; +where uv is a code point expressed in Latin-1 or above, not the platform's +native character set. B<Almost all code should instead use L</uvchr_to_utf8> +or L</uvchr_to_utf8_flags>>. + This function will convert to UTF-8 (and not warn) even code points that aren't legal Unicode or are problematic, unless C<flags> contains one or more of the following flags: @@ -119,8 +123,9 @@ UNICODE_DISALLOW_SURROGATE is set, the function will fail and return NULL. If both flags are set, the function will both warn and return NULL. The UNICODE_WARN_NONCHAR and UNICODE_DISALLOW_NONCHAR flags correspondingly -affect how the function handles a Unicode non-character. And, likewise for the -UNICODE_WARN_SUPER and UNICODE_DISALLOW_SUPER flags, and code points that are +affect how the function handles a Unicode non-character. 
And likewise, the +UNICODE_WARN_SUPER and UNICODE_DISALLOW_SUPER flags, affect the handling of +code points that are above the Unicode maximum of 0x10FFFF. Code points above 0x7FFF_FFFF (which are even less portable) can be warned and/or disallowed even if other above-Unicode code points are accepted by the UNICODE_WARN_FE_FF and UNICODE_DISALLOW_FE_FF @@ -258,7 +263,7 @@ Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) return d; } #endif -#endif /* Loop style */ +#endif /* Non loop style */ } /* @@ -275,7 +280,7 @@ or less you should use the IS_UTF8_CHAR(), for lengths of five or more you should use the _slow(). In practice this means that the _slow() will be used very rarely, since the maximum Unicode code point (as of Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only -the "Perl extended UTF-8" (the infamous 'v-strings') will encode into +the "Perl extended UTF-8" (e.g, the infamous 'v-strings') will encode into five bytes or more. =cut */ @@ -136,7 +136,7 @@ END_EXTERN_C U+0800..U+0FFF E0 * A0..BF 80..BF U+1000..U+CFFF E1..EC 80..BF 80..BF U+D000..U+D7FF ED 80..9F 80..BF - U+D800..U+DFFF +++++++ utf16 surrogates, not legal utf8 +++++++ + U+D800..U+DFFF ED A0..BF 80..BF (surrogates) U+E000..U+FFFF EE..EF 80..BF 80..BF U+10000..U+3FFFF F0 * 90..BF 80..BF 80..BF U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF @@ -144,7 +144,7 @@ END_EXTERN_C Below are non-Unicode code points U+110000..U+13FFFF F4 90..BF 80..BF 80..BF U+110000..U+1FFFFF F5..F7 80..BF 80..BF 80..BF - U+200000: F8.. * 88..BF 80..BF 80..BF 80..BF + U+200000..: F8.. * 88..BF 80..BF 80..BF 80..BF Note the gaps before several of the byte entries above marked by '*'. These are caused by legal UTF-8 avoiding non-shortest encodings: it is technically @@ -275,6 +275,9 @@ Perl's extended UTF-8 means we can have start bytes up to FF. 
#define NATIVE8_TO_UNI(ch) NATIVE_TO_ASCII(ch) /* a clearer synonym */ +/* Adds a UTF8 continuation byte 'new' of information to a running total code + * point 'old' of all the continuation bytes so far. This is designed to be + * used in a loop to convert from UTF-8 to the code point represented */ #define UTF8_ACCUMULATE(old, new) (((old) << UTF_ACCUMULATION_SHIFT) \ | (((U8)new) & UTF_CONTINUATION_MASK)) diff --git a/utfebcdic.h b/utfebcdic.h index 5705b969d4..c6001b2f37 100644 --- a/utfebcdic.h +++ b/utfebcdic.h @@ -152,7 +152,7 @@ unsigned char PL_utf8skip[] = { * remains 'A' */ #if '^' == 95 /* if defined(__MVS__) || defined(??) (VM/ESA?) 1047 */ -EXTCONST unsigned char PL_utf2e[] = { /* I8 to EBCDIC (IBM-1047) */ +EXTCONST unsigned char PL_utf2e[] = { /* I8 to UTFEBCDIC (IBM-1047) */ 0x00, 0x01, 0x02, 0x03, 0x37, 0x2D, 0x2E, 0x2F, 0x16, 0x05, 0x15, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x3C, 0x3D, 0x32, 0x26, 0x18, 0x19, 0x3F, 0x27, 0x1C, 0x1D, 0x1E, 0x1F, 0x40, 0x5A, 0x7F, 0x7B, 0x5B, 0x6C, 0x50, 0x7D, 0x4D, 0x5D, 0x5C, 0x4E, 0x6B, 0x60, 0x4B, 0x61, @@ -171,7 +171,7 @@ EXTCONST unsigned char PL_utf2e[] = { /* I8 to EBCDIC (IBM-1047) */ 0xDC, 0xDD, 0xDE, 0xDF, 0xE1, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE }; -EXTCONST unsigned char PL_e2utf[] = { /* EBCDIC (IBM-1047) to I8 */ +EXTCONST unsigned char PL_e2utf[] = { /* UTFEBCDIC (IBM-1047) to I8 */ 0x00, 0x01, 0x02, 0x03, 0x9C, 0x09, 0x86, 0x7F, 0x97, 0x8D, 0x8E, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x9D, 0x0A, 0x08, 0x87, 0x18, 0x19, 0x92, 0x8F, 0x1C, 0x1D, 0x1E, 0x1F, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x17, 0x1B, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x05, 0x06, 0x07, @@ -192,7 +192,7 @@ EXTCONST unsigned char PL_e2utf[] = { /* EBCDIC (IBM-1047) to I8 */ #endif /* 1047 */ #if '^' == 106 /* if defined(_OSD_POSIX) POSIX-BC */ -unsigned char PL_utf2e[] = { /* I8 to EBCDIC (POSIX-BC) */ +unsigned char PL_utf2e[] = { /* I8 to UTFEBCDIC (POSIX-BC) */ 0x00, 0x01, 
0x02, 0x03, 0x37, 0x2D, 0x2E, 0x2F, 0x16, 0x05, 0x15, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x3C, 0x3D, 0x32, 0x26, 0x18, 0x19, 0x3F, 0x27, 0x1C, 0x1D, 0x1E, 0x1F, 0x40, 0x5A, 0x7F, 0x7B, 0x5B, 0x6C, 0x50, 0x7D, 0x4D, 0x5D, 0x5C, 0x4E, 0x6B, 0x60, 0x4B, 0x61, @@ -211,7 +211,7 @@ unsigned char PL_utf2e[] = { /* I8 to EBCDIC (POSIX-BC) */ 0xDC, 0xC0, 0xDE, 0xDF, 0xE1, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 0xFA, 0xDD, 0xFC, 0xE0, 0xFE }; -unsigned char PL_e2utf[] = { /* EBCDIC (POSIX-BC) to I8 */ +unsigned char PL_e2utf[] = { /* UTFEBCDIC (POSIX-BC) to I8 */ 0x00, 0x01, 0x02, 0x03, 0x9C, 0x09, 0x86, 0x7F, 0x97, 0x8D, 0x8E, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x9D, 0x0A, 0x08, 0x87, 0x18, 0x19, 0x92, 0x8F, 0x1C, 0x1D, 0x1E, 0x1F, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x17, 0x1B, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x05, 0x06, 0x07, @@ -232,7 +232,7 @@ unsigned char PL_e2utf[] = { /* EBCDIC (POSIX-BC) to I8 */ #endif /* POSIX-BC */ #if '^' == 176 /* if defined(??) (OS/400?) 
037 */ -unsigned char PL_utf2e[] = { /* I8 to EBCDIC (IBM-037) */ +unsigned char PL_utf2e[] = { /* I8 to UTFEBCDIC (IBM-037) */ 0x00, 0x01, 0x02, 0x03, 0x37, 0x2D, 0x2E, 0x2F, 0x16, 0x05, 0x25, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x3C, 0x3D, 0x32, 0x26, 0x18, 0x19, 0x3F, 0x27, 0x1C, 0x1D, 0x1E, 0x1F, 0x40, 0x5A, 0x7F, 0x7B, 0x5B, 0x6C, 0x50, 0x7D, 0x4D, 0x5D, 0x5C, 0x4E, 0x6B, 0x60, 0x4B, 0x61, @@ -251,7 +251,7 @@ unsigned char PL_utf2e[] = { /* I8 to EBCDIC (IBM-037) */ 0xDC, 0xDD, 0xDE, 0xDF, 0xE1, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE }; -unsigned char PL_e2utf[] = { /* EBCDIC (IBM-037) to I8 */ +unsigned char PL_e2utf[] = { /* UTFEBCDIC (IBM-037) to I8 */ 0x00, 0x01, 0x02, 0x03, 0x9C, 0x09, 0x86, 0x7F, 0x97, 0x8D, 0x8E, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x9D, 0x85, 0x08, 0x87, 0x18, 0x19, 0x92, 0x8F, 0x1C, 0x1D, 0x1E, 0x1F, 0x80, 0x81, 0x82, 0x83, 0x84, 0x0A, 0x17, 0x1B, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x05, 0x06, 0x07, |