diff options
-rw-r--r-- | op.c | 10 | ||||
-rw-r--r-- | toke.c | 4 | ||||
-rw-r--r-- | utf8.c | 16 | ||||
-rw-r--r-- | utf8.h | 10 | ||||
-rw-r--r-- | utfebcdic.h | 27 |
5 files changed, 37 insertions, 30 deletions
@@ -4133,7 +4133,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) while (t < tend) { cp[2*i] = utf8n_to_uvuni(t, tend-t, &ulen, flags); t += ulen; - if (t < tend && NATIVE_TO_UTF(*t) == 0xff) { + if (t < tend && NATIVE_UTF8_TO_I8(*t) == 0xff) { t++; cp[2*i+1] = utf8n_to_uvuni(t, tend-t, &ulen, flags); t += ulen; @@ -4151,7 +4151,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) t = uvuni_to_utf8(tmpbuf,nextmin); sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf); if (diff > 1) { - U8 range_mark = UTF_TO_NATIVE(0xff); + U8 range_mark = I8_TO_NATIVE_UTF8(0xff); t = uvuni_to_utf8(tmpbuf, val - 1); sv_catpvn(transv, (char *)&range_mark, 1); sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf); @@ -4164,7 +4164,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) t = uvuni_to_utf8(tmpbuf,nextmin); sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf); { - U8 range_mark = UTF_TO_NATIVE(0xff); + U8 range_mark = I8_TO_NATIVE_UTF8(0xff); sv_catpvn(transv, (char *)&range_mark, 1); } t = uvuni_to_utf8(tmpbuf, 0x7fffffff); @@ -4190,7 +4190,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) if (tfirst > tlast) { tfirst = (I32)utf8n_to_uvuni(t, tend - t, &ulen, flags); t += ulen; - if (t < tend && NATIVE_TO_UTF(*t) == 0xff) { /* illegal utf8 val indicates range */ + if (t < tend && NATIVE_UTF8_TO_I8(*t) == 0xff) { /* illegal utf8 val indicates range */ t++; tlast = (I32)utf8n_to_uvuni(t, tend - t, &ulen, flags); t += ulen; @@ -4204,7 +4204,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) if (r < rend) { rfirst = (I32)utf8n_to_uvuni(r, rend - r, &ulen, flags); r += ulen; - if (r < rend && NATIVE_TO_UTF(*r) == 0xff) { /* illegal utf8 val indicates range */ + if (r < rend && NATIVE_UTF8_TO_I8(*r) == 0xff) { /* illegal utf8 val indicates range */ r++; rlast = (I32)utf8n_to_uvuni(r, rend - r, &ulen, flags); r += ulen; @@ -3105,7 +3105,7 @@ S_scan_const(pTHX_ char *start) char *e = d++; while (e-- > c) *(e + 1) = *e; - *c = (char)UTF_TO_NATIVE(0xff); + *c = (char)I8_TO_NATIVE_UTF8(0xff); /* mark the range as done, and continue */ dorange = FALSE; didrange = TRUE; @@ -3217,7 +3217,7 @@ S_scan_const(pTHX_ char *start) && !native_range #endif ) { - *d++ = (char)UTF_TO_NATIVE(0xff); /* use illegal utf8 byte--see pmtrans */ + *d++ = (char)I8_TO_NATIVE_UTF8(0xff); /* use illegal utf8 byte--see pmtrans */ s++; continue; } @@ -182,7 +182,7 @@ Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) } } if (UNI_IS_INVARIANT(uv)) { - *d++ = (U8)UTF_TO_NATIVE(uv); + *d++ = (U8) I8_TO_NATIVE_UTF8(uv); return d; } #if defined(EBCDIC) @@ -190,10 +190,10 @@ Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) STRLEN len = UNISKIP(uv); U8 *p = d+len-1; while (p > d) { - *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK); + *p-- = (U8) I8_TO_NATIVE_UTF8((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK); uv >>= UTF_ACCUMULATION_SHIFT; } - *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len)); + *p = (U8) I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) | UTF_START_MARK(len)); return d+len; } #else /* Non loop style */ @@ -623,7 +623,7 @@ Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) /* An invariant is trivially well-formed */ if (UTF8_IS_INVARIANT(uv)) { - return (UV) (NATIVE_TO_UTF(*s)); + return (UV) (NATIVE_UTF8_TO_I8(*s)); } /* A continuation character can't start a valid sequence */ @@ -643,7 +643,7 @@ Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) } #ifdef EBCDIC - uv = NATIVE_TO_UTF(uv); + uv = NATIVE_UTF8_TO_I8(uv); #endif /* Here is not a continuation byte, nor an invariant. The only thing left @@ -1013,7 +1013,7 @@ Perl_valid_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen) { UV expectlen = UTF8SKIP(s); const U8* send = s + expectlen; - UV uv = NATIVE_TO_UTF(*s); + UV uv = NATIVE_UTF8_TO_I8(*s); PERL_ARGS_ASSERT_VALID_UTF8_TO_UVUNI; @@ -3231,12 +3231,12 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8) * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK */ needents = UTF_CONTINUATION_MARK; - off = NATIVE_TO_UTF(ptr[klen]); + off = NATIVE_UTF8_TO_I8(ptr[klen]); } else { /* If char is encoded then swatch is for the prefix */ needents = (1 << UTF_ACCUMULATION_SHIFT); - off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK; + off = NATIVE_UTF8_TO_I8(ptr[klen]) & UTF_CONTINUATION_MASK; } /* @@ -319,9 +319,9 @@ Perl's extended UTF-8 means we can have start bytes up to FF. * works on both UTF-8 encoded strings and non-encoded, as it returns TRUE in * each for the exact same set of bit patterns. (And it works on any byte in a * UTF-8 encoded string) */ -#define UTF8_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE_TO_UTF(c)) +#define UTF8_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE_UTF8_TO_I8(c)) -#define NATIVE_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE8_TO_UNI(c)) +#define NATIVE_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE_TO_LATIN1(c)) #define MAX_PORTABLE_UTF8_TWO_BYTE 0x3FF /* constrained by EBCDIC */ @@ -431,9 +431,9 @@ Perl's extended UTF-8 means we can have start bytes up to FF. * U+110001: \xF4\x90\x80\x81 \xF9\xA2\xA0\xA0\xA1 */ #ifdef EBCDIC /* Both versions assume well-formed UTF8 */ -# define UTF8_IS_SUPER(s) (NATIVE_TO_I8(* (U8*) (s)) >= 0xF9 \ - && (NATIVE_TO_I8(* (U8*) (s)) > 0xF9 \ - || (NATIVE_TO_I8(* (U8*) ((s)) + 1 >= 0xA2)))) +# define UTF8_IS_SUPER(s) (NATIVE_UTF8_TO_I8(* (U8*) (s)) >= 0xF9 \ + && (NATIVE_UTF8_TO_I8(* (U8*) (s)) > 0xF9 \ + || (NATIVE_UTF8_TO_I8(* (U8*) ((s)) + 1 >= 0xA2)))) #else # define UTF8_IS_SUPER(s) (*(U8*) (s) >= 0xF4 \ && (*(U8*) (s) > 0xF4 || (*((U8*) (s) + 1) >= 0x90))) diff --git a/utfebcdic.h b/utfebcdic.h index 9f2bfa6d9f..ec342b5623 100644 --- a/utfebcdic.h +++ b/utfebcdic.h @@ -527,12 +527,17 @@ END_EXTERN_C #define NATIVE_UTF8_TO_I8(ch) (ch) PL_e2utf[(U8)(ch)] #define I8_TO_NATIVE_UTF8(ch) (ch) PL_utf2e[(U8)(ch)] -/* Transform in wide UV char space */ -#define NATIVE_TO_UNI(ch) (((ch) > 255) ? (ch) : NATIVE_TO_ASCII(ch)) -#define UNI_TO_NATIVE(ch) (((ch) > 255) ? (ch) : ASCII_TO_NATIVE(ch)) +/* Transforms in wide UV chars */ +#define NATIVE_TO_UNI(ch) (((ch) > 255) ? (ch) : NATIVE_TO_LATIN1(ch)) +#define UNI_TO_NATIVE(ch) (((ch) > 255) ? (ch) : LATIN1_TO_NATIVE(ch)) + /* Transform in invariant..byte space */ -#define NATIVE_TO_NEED(enc,ch) ((enc) ? UTF_TO_NATIVE(NATIVE_TO_ASCII(ch)) : (ch)) -#define ASCII_TO_NEED(enc,ch) ((enc) ? UTF_TO_NATIVE(ch) : ASCII_TO_NATIVE(ch)) +#define NATIVE_TO_NEED(enc,ch) ((enc) \ + ? I8_TO_NATIVE_UTF8(NATIVE_TO_LATIN1(ch)) \ + : (ch)) +#define ASCII_TO_NEED(enc,ch) ((enc) \ + ? I8_TO_NATIVE_UTF8(ch) \ + : LATIN1_TO_NATIVE(ch)) /* The following table is adapted from tr16, it shows I8 encoding of Unicode code points. @@ -565,11 +570,13 @@ END_EXTERN_C * Comments as to the meaning of each are given at their corresponding utf8.h * definitions */ -#define UTF8_IS_START(c) (NATIVE_TO_UTF(c) >= 0xC5 && NATIVE_TO_UTF(c) != 0xE0) -#define UTF8_IS_CONTINUATION(c) ((NATIVE_TO_UTF(c) & 0xE0) == 0xA0) -#define UTF8_IS_CONTINUED(c) (NATIVE_TO_UTF(c) >= 0xA0) -#define UTF8_IS_DOWNGRADEABLE_START(c) (NATIVE_TO_UTF(c) >= 0xC5 && NATIVE_TO_UTF(c) <= 0xC7) -#define UTF8_IS_ABOVE_LATIN1(c) (NATIVE_TO_I8(c) >= 0xC8) +#define UTF8_IS_START(c) (NATIVE_UTF8_TO_I8(c) >= 0xC5 \ + && NATIVE_UTF8_TO_I8(c) != 0xE0) +#define UTF8_IS_CONTINUATION(c) ((NATIVE_UTF8_TO_I8(c) & 0xE0) == 0xA0) +#define UTF8_IS_CONTINUED(c) (NATIVE_UTF8_TO_I8(c) >= 0xA0) +#define UTF8_IS_DOWNGRADEABLE_START(c) (NATIVE_UTF8_TO_I8(c) >= 0xC5 \ + && NATIVE_UTF8_TO_I8(c) <= 0xC7) +#define UTF8_IS_ABOVE_LATIN1(c) (NATIVE_UTF8_TO_I8(c) >= 0xC8) #define UTF_START_MARK(len) (((len) > 7) ? 0xFF : ((U8)(0xFE << (7-(len))))) #define UTF_START_MASK(len) (((len) >= 6) ? 0x01 : (0x1F >> ((len)-2))) |