diff options
author | Karl Williamson <khw@cpan.org> | 2015-08-02 21:20:44 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2015-09-18 13:26:12 -0600 |
commit | 2d1545e53e75b1c3ae16ad055ae011e2e015e0c3 (patch) | |
tree | b1046aff7670fc62d2bd7e563bd4220bd3261358 | |
parent | 6916a94cde40f03bd33b3b63bf26ad8d48b399fd (diff) | |
download | perl-2d1545e53e75b1c3ae16ad055ae011e2e015e0c3.tar.gz |
Change meaning of UNI_IS_INVARIANT on EBCDIC platforms
This should make more CPAN and other code work without change. Code that
says UNI_IS_INVARIANT usually means, often unwittingly, to use the native
platform code values for code points below 256, so acquiesce to that
expected meaning and make the macro correspond. Since the native values
on ASCII machines are the same as Unicode, this change doesn't affect
code running on them.
A new macro, OFFUNI_IS_INVARIANT, is created for those few places that
really do want a Unicode value. There are just a few places in the Perl
core like that, which this commit changes.
-rw-r--r-- | toke.c | 2 | ||||
-rw-r--r-- | utf8.c | 4 | ||||
-rw-r--r-- | utf8.h | 8 | ||||
-rw-r--r-- | utfebcdic.h | 3 |
4 files changed, 10 insertions, 7 deletions
@@ -3525,7 +3525,7 @@ S_scan_const(pTHX_ char *start)
                 }

                 /* Add the (Unicode) code point to the output. */
-                if (UNI_IS_INVARIANT(uv)) {
+                if (OFFUNI_IS_INVARIANT(uv)) {
                     *d++ = (char) LATIN1_TO_NATIVE(uv);
                 }
                 else {
@@ -104,7 +104,7 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
 {
     PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;

-    if (UNI_IS_INVARIANT(uv)) {
+    if (OFFUNI_IS_INVARIANT(uv)) {
         *d++ = (U8) LATIN1_TO_NATIVE(uv);
         return d;
     }
@@ -1265,7 +1265,7 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
     while (p < pend) {
         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
         p += 2;
-        if (UNI_IS_INVARIANT(uv)) {
+        if (OFFUNI_IS_INVARIANT(uv)) {
             *d++ = LATIN1_TO_NATIVE((U8) uv);
             continue;
         }
@@ -192,12 +192,12 @@ Perl's extended UTF-8 means we can have start bytes up to FF.

 /* Is the representation of the Unicode code point 'cp' the same regardless of
  * being encoded in UTF-8 or not? */
-#define UNI_IS_INVARIANT(cp) isASCII(cp)
+#define OFFUNI_IS_INVARIANT(cp) isASCII(cp)

 /* Is the representation of the code point 'cp' the same regardless of
  * being encoded in UTF-8 or not? 'cp' is native if < 256; Unicode otherwise
  * */
-#define UVCHR_IS_INVARIANT(uv) UNI_IS_INVARIANT(uv)
+#define UVCHR_IS_INVARIANT(uv) OFFUNI_IS_INVARIANT(uv)

 /* Misleadingly named: is the UTF8-encoded byte 'c' part of a variant sequence
  * in UTF-8? This is the inverse of UTF8_IS_INVARIANT */
@@ -401,6 +401,10 @@ only) byte is pointed to by C<s>.
 #define UTF8SKIP(s) PL_utf8skip[*(const U8*)(s)]
 #define UTF8_SKIP(s) UTF8SKIP(s)

+/* Most code that says 'UNI_' really means the native value for code points up
+ * through 255 */
+#define UNI_IS_INVARIANT(cp) UVCHR_IS_INVARIANT(cp)
+
 /* Is the byte 'c' the same character when encoded in UTF-8 as when not. This
  * works on both UTF-8 encoded strings and non-encoded, as it returns TRUE in
  * each for the exact same set of bit patterns. It is valid on a subset of
diff --git a/utfebcdic.h b/utfebcdic.h
index c852946f44..5912b3a142 100644
--- a/utfebcdic.h
+++ b/utfebcdic.h
@@ -171,8 +171,7 @@ END_EXTERN_C
                       (uv) < 0x400000 ? 5 : \
                       (uv) < 0x4000000 ? 6 : 7 )

-
-#define UNI_IS_INVARIANT(c) (((UV)(c)) < 0xA0)
+#define OFFUNI_IS_INVARIANT(c) (((UV)(c)) < 0xA0)

 /* It turns out that on EBCDIC platforms, the invariants are the characters
  * that have ASCII equivalents, plus the C1 controls. Since the C0 controls