summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Karl Williamson <khw@cpan.org> 2015-08-02 21:20:44 -0600
committer Karl Williamson <khw@cpan.org> 2015-09-18 13:26:12 -0600
commit 2d1545e53e75b1c3ae16ad055ae011e2e015e0c3 (patch)
tree b1046aff7670fc62d2bd7e563bd4220bd3261358
parent 6916a94cde40f03bd33b3b63bf26ad8d48b399fd (diff)
download perl-2d1545e53e75b1c3ae16ad055ae011e2e015e0c3.tar.gz
Change meaning of UNI_IS_INVARIANT on EBCDIC platforms
This should make more CPAN and other code work without change. Code that says UNI_IS_INVARIANT usually means, without its author realizing it, to use the native platform code values for code points below 256, so this commit acquiesces to that expected meaning and makes the macro correspond. Since the native values on ASCII machines are the same as Unicode, this change doesn't affect code running on them. A new macro, OFFUNI_IS_INVARIANT, is created for those few places that really do want a Unicode value. There are just a few such places in the Perl core, which this commit changes.
-rw-r--r-- toke.c 2
-rw-r--r-- utf8.c 4
-rw-r--r-- utf8.h 8
-rw-r--r-- utfebcdic.h 3
4 files changed, 10 insertions, 7 deletions
diff --git a/toke.c b/toke.c
index 1b7860acdc..b1bdfad9c8 100644
--- a/toke.c
+++ b/toke.c
@@ -3525,7 +3525,7 @@ S_scan_const(pTHX_ char *start)
}
/* Add the (Unicode) code point to the output. */
- if (UNI_IS_INVARIANT(uv)) {
+ if (OFFUNI_IS_INVARIANT(uv)) {
*d++ = (char) LATIN1_TO_NATIVE(uv);
}
else {
diff --git a/utf8.c b/utf8.c
index 1cb3f6d4e3..571c45141e 100644
--- a/utf8.c
+++ b/utf8.c
@@ -104,7 +104,7 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
{
PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
- if (UNI_IS_INVARIANT(uv)) {
+ if (OFFUNI_IS_INVARIANT(uv)) {
*d++ = (U8) LATIN1_TO_NATIVE(uv);
return d;
}
@@ -1265,7 +1265,7 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
while (p < pend) {
UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
p += 2;
- if (UNI_IS_INVARIANT(uv)) {
+ if (OFFUNI_IS_INVARIANT(uv)) {
*d++ = LATIN1_TO_NATIVE((U8) uv);
continue;
}
diff --git a/utf8.h b/utf8.h
index 17f0e82615..ce537c0de6 100644
--- a/utf8.h
+++ b/utf8.h
@@ -192,12 +192,12 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
/* Is the representation of the Unicode code point 'cp' the same regardless of
* being encoded in UTF-8 or not? */
-#define UNI_IS_INVARIANT(cp) isASCII(cp)
+#define OFFUNI_IS_INVARIANT(cp) isASCII(cp)
/* Is the representation of the code point 'cp' the same regardless of
* being encoded in UTF-8 or not? 'cp' is native if < 256; Unicode otherwise
* */
-#define UVCHR_IS_INVARIANT(uv) UNI_IS_INVARIANT(uv)
+#define UVCHR_IS_INVARIANT(uv) OFFUNI_IS_INVARIANT(uv)
/* Misleadingly named: is the UTF8-encoded byte 'c' part of a variant sequence
* in UTF-8? This is the inverse of UTF8_IS_INVARIANT */
@@ -401,6 +401,10 @@ only) byte is pointed to by C<s>.
#define UTF8SKIP(s) PL_utf8skip[*(const U8*)(s)]
#define UTF8_SKIP(s) UTF8SKIP(s)
+/* Most code that says 'UNI_' really means the native value for code points up
+ * through 255 */
+#define UNI_IS_INVARIANT(cp) UVCHR_IS_INVARIANT(cp)
+
/* Is the byte 'c' the same character when encoded in UTF-8 as when not. This
* works on both UTF-8 encoded strings and non-encoded, as it returns TRUE in
* each for the exact same set of bit patterns. It is valid on a subset of
diff --git a/utfebcdic.h b/utfebcdic.h
index c852946f44..5912b3a142 100644
--- a/utfebcdic.h
+++ b/utfebcdic.h
@@ -171,8 +171,7 @@ END_EXTERN_C
(uv) < 0x400000 ? 5 : \
(uv) < 0x4000000 ? 6 : 7 )
-
-#define UNI_IS_INVARIANT(c) (((UV)(c)) < 0xA0)
+#define OFFUNI_IS_INVARIANT(c) (((UV)(c)) < 0xA0)
/* It turns out that on EBCDIC platforms, the invariants are the characters
* that have ASCII equivalents, plus the C1 controls. Since the C0 controls