diff options
-rw-r--r-- | toke.c | 2 | ||||
-rw-r--r-- | utf8.c | 6 | ||||
-rw-r--r-- | utf8.h | 14 | ||||
-rw-r--r-- | utfebcdic.h | 3 |
4 files changed, 18 insertions, 7 deletions
@@ -3772,7 +3772,7 @@ S_scan_const(pTHX_ char *start) const UV nextuv = (this_utf8) ? utf8n_to_uvchr((U8*)s, send - s, &len, 0) : (UV) ((U8) *s); - const STRLEN need = UNISKIP(NATIVE_TO_UNI(nextuv)); + const STRLEN need = UNISKIP(nextuv); if (!has_utf8) { SvCUR_set(sv, d - SvPVX_const(sv)); SvPOK_on(sv); @@ -184,7 +184,7 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) } #if defined(EBCDIC) else { - STRLEN len = UNISKIP(uv); + STRLEN len = OFFUNISKIP(uv); U8 *p = d+len-1; while (p > d) { *p-- = (U8) I8_TO_NATIVE_UTF8((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK); @@ -772,7 +772,7 @@ Perl_utf8n_to_uvoffuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 fla #endif if (do_overlong_test - && expectlen > (STRLEN)UNISKIP(uv) + && expectlen > (STRLEN) OFFUNISKIP(uv) && ! (flags & UTF8_ALLOW_LONG)) { /* The overlong malformation has lower precedence than the others. @@ -780,7 +780,7 @@ Perl_utf8n_to_uvoffuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 fla * value, instead of the replacement character. This is because this * value is actually well-defined. */ if (! (flags & UTF8_CHECK_ONLY)) { - sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), *s0)); + sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)expectlen, expectlen == 1 ? "": "s", OFFUNISKIP(uv), *s0)); } goto malformed; } @@ -231,7 +231,8 @@ Perl's extended UTF-8 means we can have start bytes up to FF. - UTF_ACCUMULATION_SHIFT)) #ifdef HAS_QUAD -#define UNISKIP(uv) ( (uv) < 0x80 ? 1 : \ +/* Input is a true Unicode (not-native) code point */ +#define OFFUNISKIP(uv) ( (uv) < 0x80 ? 1 : \ (uv) < 0x800 ? 2 : \ (uv) < 0x10000 ? 3 : \ (uv) < 0x200000 ? 4 : \ @@ -240,7 +241,7 @@ Perl's extended UTF-8 means we can have start bytes up to FF. (uv) < UTF8_QUAD_MAX ? 7 : 13 ) #else /* No, I'm not even going to *TRY* putting #ifdef inside a #define */ -#define UNISKIP(uv) ( (uv) < 0x80 ? 1 : \ +#define OFFUNISKIP(uv) ( (uv) < 0x80 ? 1 : \ (uv) < 0x800 ? 2 : \ (uv) < 0x10000 ? 3 : \ (uv) < 0x200000 ? 4 : \ @@ -297,6 +298,15 @@ Perl's extended UTF-8 means we can have start bytes up to FF. && ( (e) - (s) > 1) \ && UTF8_IS_CONTINUATION(*((s)+1))) +/* Number of bytes a code point occupies in UTF-8. */ +#define NATIVE_SKIP(uv) OFFUNISKIP(NATIVE_TO_UNI(uv)) + +/* Most code which says UNISKIP is really thinking in terms of native code + * points (0-255) plus all those beyond. This is an imprecise term, but having + * it means existing code continues to work. For precision, use NATIVE_SKIP + * and OFFUNISKIP */ +#define UNISKIP(uv) NATIVE_SKIP(uv) + /* Convert a two (not one) byte utf8 character to a native code point value. * Needs just one iteration of accumulate. Should not be used unless it is * known that the two bytes are legal: 1) two-byte start, and 2) continuation. diff --git a/utfebcdic.h b/utfebcdic.h index 09d59fa5de..856bcd7d07 100644 --- a/utfebcdic.h +++ b/utfebcdic.h @@ -701,7 +701,8 @@ END_EXTERN_C */ -#define UNISKIP(uv) ( (uv) < 0xA0 ? 1 : \ +/* Input is a true Unicode (not-native) code point */ +#define OFFUNISKIP(uv) ( (uv) < 0xA0 ? 1 : \ (uv) < 0x400 ? 2 : \ (uv) < 0x4000 ? 3 : \ (uv) < 0x40000 ? 4 : \ |