summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Karl Williamson <khw@cpan.org> 2015-11-25 20:41:39 -0700
committer Karl Williamson <khw@cpan.org> 2015-11-28 17:19:26 -0700
commit d35f2ca5c7434d42bc5c28288976153ea974bef0 (patch)
tree 01e7339753b9ee90a1f6264dc26b4ee2f12d23de
parent 6d8b7216aef81fd37c04d2454465e80010c63968 (diff)
download perl-d35f2ca5c7434d42bc5c28288976153ea974bef0.tar.gz
utf8.h: Add clearer #define synonyms
These names have long caused me consternation, as they are named after the internal ASCII-platform UTF-8 representation, which is not the same for EBCDIC platforms, nor do they convey meaning to someone who isn't currently steeped in the UTF-8 internals. I've added synonyms that are platform-independent in meaning and make more sense to someone coming at this cold. The old names are retained for back compat.
-rw-r--r-- utf8.c 30
-rw-r--r-- utf8.h 42
2 files changed, 38 insertions, 34 deletions
diff --git a/utf8.c b/utf8.c
index 52b6b986cd..7faecad0a1 100644
--- a/utf8.c
+++ b/utf8.c
@@ -124,18 +124,18 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
}
else if (UNICODE_IS_SUPER(uv)) {
if ( (flags & UNICODE_WARN_SUPER)
- || (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_WARN_FE_FF)))
+ || (UNICODE_IS_ABOVE_31_BIT(uv) && (flags & UNICODE_WARN_ABOVE_31_BIT)))
{
Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE),
/* Choose the more dire applicable warning */
- (UNICODE_IS_FE_FF(uv))
+ (UNICODE_IS_ABOVE_31_BIT(uv))
? "Code point 0x%"UVXf" is not Unicode, and not portable"
: "Code point 0x%"UVXf" is not Unicode, may not be portable",
uv);
}
if (flags & UNICODE_DISALLOW_SUPER
- || (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_DISALLOW_FE_FF)))
+ || (UNICODE_IS_ABOVE_31_BIT(uv) && (flags & UNICODE_DISALLOW_ABOVE_31_BIT)))
{
return NULL;
}
@@ -294,8 +294,8 @@ C<UNICODE_WARN_SUPER> and C<UNICODE_DISALLOW_SUPER> flags affect the handling of
code points that are
above the Unicode maximum of 0x10FFFF. Code points above 0x7FFF_FFFF (which are
even less portable) can be warned and/or disallowed even if other above-Unicode
-code points are accepted, by the C<UNICODE_WARN_FE_FF> and
-C<UNICODE_DISALLOW_FE_FF> flags.
+code points are accepted, by the C<UNICODE_WARN_ABOVE_31_BIT> and
+C<UNICODE_DISALLOW_ABOVE_31_BIT> flags.
And finally, the flag C<UNICODE_WARN_ILLEGAL_INTERCHANGE> selects all four of
the above WARN flags; and C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> selects all
@@ -463,11 +463,12 @@ imposed later). (The smaller ones, those that fit into 32 bits, are
representable by a UV on ASCII platforms, but not by an IV, which means that
the number of operations that can be performed on them is quite restricted.)
The UTF-8 encoding on ASCII platforms for these large code points begins with a
-byte containing 0xFE or 0xFF. The C<UTF8_DISALLOW_FE_FF> flag will cause them to
-be treated as malformations, while allowing smaller above-Unicode code points.
+byte containing 0xFE or 0xFF. The C<UTF8_DISALLOW_ABOVE_31_BIT> flag will
+cause them to be treated as malformations, while allowing smaller above-Unicode
+code points.
(Of course C<UTF8_DISALLOW_SUPER> will treat all above-Unicode code points,
including these, as malformations.)
-Similarly, C<UTF8_WARN_FE_FF> acts just like
+Similarly, C<UTF8_WARN_ABOVE_31_BIT> acts just like
the other WARN flags, but applies just to these code points.
All other code points corresponding to Unicode characters, including private
@@ -713,10 +714,8 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
* very well may not be understood by other applications (including
* earlier perl versions on EBCDIC platforms). On ASCII platforms,
* these code points are indicated by the first UTF-8 byte being
- * 0xFE or 0xFF, hence names like 'UTF8_WARN_FE_FF'. These names
- * are ASCII-centric, because the criteria is different On EBCDIC
- * platforms. We test for these after the regular SUPER ones, and
- * before possibly bailing out, so that the slightly more dire
+ * 0xFE or 0xFF. We test for these after the regular SUPER ones,
+ * and before possibly bailing out, so that the slightly more dire
* warning will override the regular one. */
if (
#ifndef EBCDIC
@@ -740,10 +739,11 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
|| s0[6] > 0x41
|| s0[7] > 0x42)
#endif
- && (flags & (UTF8_WARN_FE_FF|UTF8_WARN_SUPER|UTF8_DISALLOW_FE_FF)))
+ && (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_WARN_SUPER
+ |UTF8_DISALLOW_ABOVE_31_BIT)))
{
if ( ! (flags & UTF8_CHECK_ONLY)
- && (flags & (UTF8_WARN_FE_FF|UTF8_WARN_SUPER))
+ && (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_WARN_SUPER))
&& ckWARN_d(WARN_UTF8))
{
sv = sv_2mortal(Perl_newSVpvf(aTHX_
@@ -751,7 +751,7 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
uv));
pack_warn = packWARN(WARN_UTF8);
}
- if (flags & UTF8_DISALLOW_FE_FF) {
+ if (flags & UTF8_DISALLOW_ABOVE_31_BIT) {
goto disallowed;
}
}
diff --git a/utf8.h b/utf8.h
index c3704de749..36c385202a 100644
--- a/utf8.h
+++ b/utf8.h
@@ -538,12 +538,16 @@ case any call to string overloading updates the internal UTF-8 encoding flag.
#define UTF8_DISALLOW_SUPER 0x0200 /* Super-set of Unicode: code */
#define UTF8_WARN_SUPER 0x0400 /* points above the legal max */
-/* Code points which never were part of the original UTF-8 standard, the first
- * byte of which is a FE or FF on ASCII platforms. If the first byte is FF, it
- * will overflow a 32-bit word. If the first byte is FE, it will overflow a
- * signed 32-bit word. */
-#define UTF8_DISALLOW_FE_FF 0x0800
-#define UTF8_WARN_FE_FF 0x1000
+/* Code points which never were part of the original UTF-8 standard, which only
+ * went up to 2 ** 31 - 1. Note that these all overflow a signed 32-bit word.
+ * The first byte of these code points is FE or FF on ASCII platforms. If the
+ * first byte is FF, it will overflow a 32-bit word. */
+#define UTF8_DISALLOW_ABOVE_31_BIT 0x0800
+#define UTF8_WARN_ABOVE_31_BIT 0x1000
+
+/* Old names, kept for back compat; they are misleading for UTF_EBCDIC */
+#define UTF8_DISALLOW_FE_FF UTF8_DISALLOW_ABOVE_31_BIT
+#define UTF8_WARN_FE_FF UTF8_WARN_ABOVE_31_BIT
#define UTF8_CHECK_ONLY 0x2000
@@ -553,11 +557,11 @@ case any call to string overloading updates the internal UTF-8 encoding flag.
#define UTF8_ALLOW_FFFF 0
#define UTF8_ALLOW_SURROGATE 0
-#define UTF8_DISALLOW_ILLEGAL_INTERCHANGE \
- (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_NONCHAR \
- |UTF8_DISALLOW_SURROGATE|UTF8_DISALLOW_FE_FF)
+#define UTF8_DISALLOW_ILLEGAL_INTERCHANGE \
+ (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_NONCHAR \
+ |UTF8_DISALLOW_SURROGATE|UTF8_DISALLOW_ABOVE_31_BIT)
#define UTF8_WARN_ILLEGAL_INTERCHANGE \
- (UTF8_WARN_SUPER|UTF8_WARN_NONCHAR|UTF8_WARN_SURROGATE|UTF8_WARN_FE_FF)
+ (UTF8_WARN_SUPER|UTF8_WARN_NONCHAR|UTF8_WARN_SURROGATE|UTF8_WARN_ABOVE_31_BIT)
#define UTF8_ALLOW_ANY \
(~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE|UTF8_WARN_ILLEGAL_INTERCHANGE))
#define UTF8_ALLOW_ANYUV \
@@ -605,14 +609,14 @@ case any call to string overloading updates the internal UTF-8 encoding flag.
* let's be conservative and do as Unicode says. */
#define PERL_UNICODE_MAX 0x10FFFF
-#define UNICODE_WARN_SURROGATE 0x0001 /* UTF-16 surrogates */
-#define UNICODE_WARN_NONCHAR 0x0002 /* Non-char code points */
-#define UNICODE_WARN_SUPER 0x0004 /* Above 0x10FFFF */
-#define UNICODE_WARN_FE_FF 0x0008 /* Above 0x10FFFF */
-#define UNICODE_DISALLOW_SURROGATE 0x0010
-#define UNICODE_DISALLOW_NONCHAR 0x0020
-#define UNICODE_DISALLOW_SUPER 0x0040
-#define UNICODE_DISALLOW_FE_FF 0x0080
+#define UNICODE_WARN_SURROGATE 0x0001 /* UTF-16 surrogates */
+#define UNICODE_WARN_NONCHAR 0x0002 /* Non-char code points */
+#define UNICODE_WARN_SUPER 0x0004 /* Above 0x10FFFF */
+#define UNICODE_WARN_ABOVE_31_BIT 0x0008 /* Above 0x7FFF_FFFF */
+#define UNICODE_DISALLOW_SURROGATE 0x0010
+#define UNICODE_DISALLOW_NONCHAR 0x0020
+#define UNICODE_DISALLOW_SUPER 0x0040
+#define UNICODE_DISALLOW_ABOVE_31_BIT 0x0080
#define UNICODE_WARN_ILLEGAL_INTERCHANGE \
(UNICODE_WARN_SURROGATE|UNICODE_WARN_NONCHAR|UNICODE_WARN_SUPER)
#define UNICODE_DISALLOW_ILLEGAL_INTERCHANGE \
@@ -635,7 +639,7 @@ case any call to string overloading updates the internal UTF-8 encoding flag.
* characters at all */ \
|| ((((c & 0xFFFE) == 0xFFFE)) && ! UNICODE_IS_SUPER(c)))
#define UNICODE_IS_SUPER(c) ((c) > PERL_UNICODE_MAX)
-#define UNICODE_IS_FE_FF(c) ((c) > 0x7FFFFFFF)
+#define UNICODE_IS_ABOVE_31_BIT(uv) ((UV) (uv) > 0x7FFFFFFF)
#define LATIN_SMALL_LETTER_SHARP_S LATIN_SMALL_LETTER_SHARP_S_NATIVE
#define LATIN_SMALL_LETTER_Y_WITH_DIAERESIS \