diff options
author | Karl Williamson <khw@cpan.org> | 2015-11-25 20:41:39 -0700 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2015-11-28 17:19:26 -0700 |
commit | d35f2ca5c7434d42bc5c28288976153ea974bef0 (patch) | |
tree | 01e7339753b9ee90a1f6264dc26b4ee2f12d23de | |
parent | 6d8b7216aef81fd37c04d2454465e80010c63968 (diff) | |
download | perl-d35f2ca5c7434d42bc5c28288976153ea974bef0.tar.gz |
utf8.h: Add clearer #define synonyms
These names have long caused me consternation, as they are named after
the internal ASCII-platform UTF-8 representation, which is not the same
for EBCDIC platforms, nor do they convey meaning to someone who isn't
currently steeped in the UTF-8 internals. I've added synonyms that are
platform-independent in meaning and make more sense to someone coming at
this cold. The old names are retained for back compat.
-rw-r--r-- | utf8.c | 30 | ||||
-rw-r--r-- | utf8.h | 42 |
2 files changed, 38 insertions, 34 deletions
@@ -124,18 +124,18 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) } else if (UNICODE_IS_SUPER(uv)) { if ( (flags & UNICODE_WARN_SUPER) - || (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_WARN_FE_FF))) + || (UNICODE_IS_ABOVE_31_BIT(uv) && (flags & UNICODE_WARN_ABOVE_31_BIT))) { Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE), /* Choose the more dire applicable warning */ - (UNICODE_IS_FE_FF(uv)) + (UNICODE_IS_ABOVE_31_BIT(uv)) ? "Code point 0x%"UVXf" is not Unicode, and not portable" : "Code point 0x%"UVXf" is not Unicode, may not be portable", uv); } if (flags & UNICODE_DISALLOW_SUPER - || (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_DISALLOW_FE_FF))) + || (UNICODE_IS_ABOVE_31_BIT(uv) && (flags & UNICODE_DISALLOW_ABOVE_31_BIT))) { return NULL; } @@ -294,8 +294,8 @@ C<UNICODE_WARN_SUPER> and C<UNICODE_DISALLOW_SUPER> flags affect the handling of code points that are above the Unicode maximum of 0x10FFFF. Code points above 0x7FFF_FFFF (which are even less portable) can be warned and/or disallowed even if other above-Unicode -code points are accepted, by the C<UNICODE_WARN_FE_FF> and -C<UNICODE_DISALLOW_FE_FF> flags. +code points are accepted, by the C<UNICODE_WARN_ABOVE_31_BIT> and +C<UNICODE_DISALLOW_ABOVE_31_BIT> flags. And finally, the flag C<UNICODE_WARN_ILLEGAL_INTERCHANGE> selects all four of the above WARN flags; and C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> selects all @@ -463,11 +463,12 @@ imposed later). (The smaller ones, those that fit into 32 bits, are representable by a UV on ASCII platforms, but not by an IV, which means that the number of operations that can be performed on them is quite restricted.) The UTF-8 encoding on ASCII platforms for these large code points begins with a -byte containing 0xFE or 0xFF. The C<UTF8_DISALLOW_FE_FF> flag will cause them to -be treated as malformations, while allowing smaller above-Unicode code points. +byte containing 0xFE or 0xFF. The C<UTF8_DISALLOW_ABOVE_31_BIT> flag will +cause them to be treated as malformations, while allowing smaller above-Unicode +code points. (Of course C<UTF8_DISALLOW_SUPER> will treat all above-Unicode code points, including these, as malformations.) -Similarly, C<UTF8_WARN_FE_FF> acts just like +Similarly, C<UTF8_WARN_ABOVE_31_BIT> acts just like the other WARN flags, but applies just to these code points. All other code points corresponding to Unicode characters, including private @@ -713,10 +714,8 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) * very well may not be understood by other applications (including * earlier perl versions on EBCDIC platforms). On ASCII platforms, * these code points are indicated by the first UTF-8 byte being - * 0xFE or 0xFF, hence names like 'UTF8_WARN_FE_FF'. These names - * are ASCII-centric, because the criteria is different On EBCDIC - * platforms. We test for these after the regular SUPER ones, and - * before possibly bailing out, so that the slightly more dire + * 0xFE or 0xFF. We test for these after the regular SUPER ones, + * and before possibly bailing out, so that the slightly more dire * warning will override the regular one. */ if ( #ifndef EBCDIC @@ -740,10 +739,11 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) || s0[6] > 0x41 || s0[7] > 0x42) #endif - && (flags & (UTF8_WARN_FE_FF|UTF8_WARN_SUPER|UTF8_DISALLOW_FE_FF))) + && (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_WARN_SUPER + |UTF8_DISALLOW_ABOVE_31_BIT))) { if ( ! (flags & UTF8_CHECK_ONLY) - && (flags & (UTF8_WARN_FE_FF|UTF8_WARN_SUPER)) + && (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_WARN_SUPER)) && ckWARN_d(WARN_UTF8)) { sv = sv_2mortal(Perl_newSVpvf(aTHX_ @@ -751,7 +751,7 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) uv)); pack_warn = packWARN(WARN_UTF8); } - if (flags & UTF8_DISALLOW_FE_FF) { + if (flags & UTF8_DISALLOW_ABOVE_31_BIT) { goto disallowed; } } @@ -538,12 +538,16 @@ case any call to string overloading updates the internal UTF-8 encoding flag. #define UTF8_DISALLOW_SUPER 0x0200 /* Super-set of Unicode: code */ #define UTF8_WARN_SUPER 0x0400 /* points above the legal max */ -/* Code points which never were part of the original UTF-8 standard, the first - * byte of which is a FE or FF on ASCII platforms. If the first byte is FF, it - * will overflow a 32-bit word. If the first byte is FE, it will overflow a - * signed 32-bit word. */ -#define UTF8_DISALLOW_FE_FF 0x0800 -#define UTF8_WARN_FE_FF 0x1000 +/* Code points which never were part of the original UTF-8 standard, which only + * went up to 2 ** 31 - 1. Note that these all overflow a signed 32-bit word, + * The first byte of these code points is FE or FF on ASCII platforms. If the + * first byte is FF, it will overflow a 32-bit word. */ +#define UTF8_DISALLOW_ABOVE_31_BIT 0x0800 +#define UTF8_WARN_ABOVE_31_BIT 0x1000 + +/* For back compat, these old names are misleading for UTF_EBCDIC */ +#define UTF8_DISALLOW_FE_FF UTF8_DISALLOW_ABOVE_31_BIT +#define UTF8_WARN_FE_FF UTF8_WARN_ABOVE_31_BIT #define UTF8_CHECK_ONLY 0x2000 @@ -553,11 +557,11 @@ case any call to string overloading updates the internal UTF-8 encoding flag. #define UTF8_ALLOW_FFFF 0 #define UTF8_ALLOW_SURROGATE 0 -#define UTF8_DISALLOW_ILLEGAL_INTERCHANGE \ - (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_NONCHAR \ - |UTF8_DISALLOW_SURROGATE|UTF8_DISALLOW_FE_FF) +#define UTF8_DISALLOW_ILLEGAL_INTERCHANGE \ + (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_NONCHAR \ + |UTF8_DISALLOW_SURROGATE|UTF8_DISALLOW_ABOVE_31_BIT) #define UTF8_WARN_ILLEGAL_INTERCHANGE \ - (UTF8_WARN_SUPER|UTF8_WARN_NONCHAR|UTF8_WARN_SURROGATE|UTF8_WARN_FE_FF) + (UTF8_WARN_SUPER|UTF8_WARN_NONCHAR|UTF8_WARN_SURROGATE|UTF8_WARN_ABOVE_31_BIT) #define UTF8_ALLOW_ANY \ (~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE|UTF8_WARN_ILLEGAL_INTERCHANGE)) #define UTF8_ALLOW_ANYUV \ @@ -605,14 +609,14 @@ case any call to string overloading updates the internal UTF-8 encoding flag. * let's be conservative and do as Unicode says. */ #define PERL_UNICODE_MAX 0x10FFFF -#define UNICODE_WARN_SURROGATE 0x0001 /* UTF-16 surrogates */ -#define UNICODE_WARN_NONCHAR 0x0002 /* Non-char code points */ -#define UNICODE_WARN_SUPER 0x0004 /* Above 0x10FFFF */ -#define UNICODE_WARN_FE_FF 0x0008 /* Above 0x10FFFF */ -#define UNICODE_DISALLOW_SURROGATE 0x0010 -#define UNICODE_DISALLOW_NONCHAR 0x0020 -#define UNICODE_DISALLOW_SUPER 0x0040 -#define UNICODE_DISALLOW_FE_FF 0x0080 +#define UNICODE_WARN_SURROGATE 0x0001 /* UTF-16 surrogates */ +#define UNICODE_WARN_NONCHAR 0x0002 /* Non-char code points */ +#define UNICODE_WARN_SUPER 0x0004 /* Above 0x10FFFF */ +#define UNICODE_WARN_ABOVE_31_BIT 0x0008 /* Above 0x7FFF_FFFF */ +#define UNICODE_DISALLOW_SURROGATE 0x0010 +#define UNICODE_DISALLOW_NONCHAR 0x0020 +#define UNICODE_DISALLOW_SUPER 0x0040 +#define UNICODE_DISALLOW_ABOVE_31_BIT 0x0080 #define UNICODE_WARN_ILLEGAL_INTERCHANGE \ (UNICODE_WARN_SURROGATE|UNICODE_WARN_NONCHAR|UNICODE_WARN_SUPER) #define UNICODE_DISALLOW_ILLEGAL_INTERCHANGE \ @@ -635,7 +639,7 @@ case any call to string overloading updates the internal UTF-8 encoding flag. * characters at all */ \ || ((((c & 0xFFFE) == 0xFFFE)) && ! UNICODE_IS_SUPER(c))) #define UNICODE_IS_SUPER(c) ((c) > PERL_UNICODE_MAX) -#define UNICODE_IS_FE_FF(c) ((c) > 0x7FFFFFFF) +#define UNICODE_IS_ABOVE_31_BIT(uv) ((UV) (uv) > 0x7FFFFFFF) #define LATIN_SMALL_LETTER_SHARP_S LATIN_SMALL_LETTER_SHARP_S_NATIVE #define LATIN_SMALL_LETTER_Y_WITH_DIAERESIS \ |