summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- toke.c 2
-rw-r--r-- utf8.c 6
-rw-r--r-- utf8.h 14
-rw-r--r-- utfebcdic.h 3
4 files changed, 18 insertions, 7 deletions
diff --git a/toke.c b/toke.c
index 73f35909b2..735e0db579 100644
--- a/toke.c
+++ b/toke.c
@@ -3772,7 +3772,7 @@ S_scan_const(pTHX_ char *start)
const UV nextuv = (this_utf8)
? utf8n_to_uvchr((U8*)s, send - s, &len, 0)
: (UV) ((U8) *s);
- const STRLEN need = UNISKIP(NATIVE_TO_UNI(nextuv));
+ const STRLEN need = UNISKIP(nextuv);
if (!has_utf8) {
SvCUR_set(sv, d - SvPVX_const(sv));
SvPOK_on(sv);
diff --git a/utf8.c b/utf8.c
index 945e31c7bf..cc64ee6cd0 100644
--- a/utf8.c
+++ b/utf8.c
@@ -184,7 +184,7 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
}
#if defined(EBCDIC)
else {
- STRLEN len = UNISKIP(uv);
+ STRLEN len = OFFUNISKIP(uv);
U8 *p = d+len-1;
while (p > d) {
*p-- = (U8) I8_TO_NATIVE_UTF8((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
@@ -772,7 +772,7 @@ Perl_utf8n_to_uvoffuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 fla
#endif
if (do_overlong_test
- && expectlen > (STRLEN)UNISKIP(uv)
+ && expectlen > (STRLEN) OFFUNISKIP(uv)
&& ! (flags & UTF8_ALLOW_LONG))
{
/* The overlong malformation has lower precedence than the others.
@@ -780,7 +780,7 @@ Perl_utf8n_to_uvoffuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 fla
* value, instead of the replacement character. This is because this
* value is actually well-defined. */
if (! (flags & UTF8_CHECK_ONLY)) {
- sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), *s0));
+ sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)expectlen, expectlen == 1 ? "": "s", OFFUNISKIP(uv), *s0));
}
goto malformed;
}
diff --git a/utf8.h b/utf8.h
index b3bf997efb..1ecb3b82d5 100644
--- a/utf8.h
+++ b/utf8.h
@@ -231,7 +231,8 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
- UTF_ACCUMULATION_SHIFT))
#ifdef HAS_QUAD
-#define UNISKIP(uv) ( (uv) < 0x80 ? 1 : \
+/* Input is a true Unicode (not-native) code point */
+#define OFFUNISKIP(uv) ( (uv) < 0x80 ? 1 : \
(uv) < 0x800 ? 2 : \
(uv) < 0x10000 ? 3 : \
(uv) < 0x200000 ? 4 : \
@@ -240,7 +241,7 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
(uv) < UTF8_QUAD_MAX ? 7 : 13 )
#else
/* No, I'm not even going to *TRY* putting #ifdef inside a #define */
-#define UNISKIP(uv) ( (uv) < 0x80 ? 1 : \
+#define OFFUNISKIP(uv) ( (uv) < 0x80 ? 1 : \
(uv) < 0x800 ? 2 : \
(uv) < 0x10000 ? 3 : \
(uv) < 0x200000 ? 4 : \
@@ -297,6 +298,15 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
&& ( (e) - (s) > 1) \
&& UTF8_IS_CONTINUATION(*((s)+1)))
+/* Bytes needed to encode native code point 'uv' (mapped via NATIVE_TO_UNI). */
+#define NATIVE_SKIP(uv) OFFUNISKIP(NATIVE_TO_UNI(uv))
+
+/* Most code that says UNISKIP is really thinking in terms of native code
+ * points (0-255) plus all those beyond.  The name is imprecise, but keeping
+ * it means existing code continues to work.  For precision, use NATIVE_SKIP
+ * (native input) or OFFUNISKIP (true Unicode, not-native, input). */
+#define UNISKIP(uv) NATIVE_SKIP(uv)
+
/* Convert a two (not one) byte utf8 character to a native code point value.
* Needs just one iteration of accumulate. Should not be used unless it is
* known that the two bytes are legal: 1) two-byte start, and 2) continuation.
diff --git a/utfebcdic.h b/utfebcdic.h
index 09d59fa5de..856bcd7d07 100644
--- a/utfebcdic.h
+++ b/utfebcdic.h
@@ -701,7 +701,8 @@ END_EXTERN_C
*/
-#define UNISKIP(uv) ( (uv) < 0xA0 ? 1 : \
+/* Input is a true Unicode (not-native) code point */
+#define OFFUNISKIP(uv) ( (uv) < 0xA0 ? 1 : \
(uv) < 0x400 ? 2 : \
(uv) < 0x4000 ? 3 : \
(uv) < 0x40000 ? 4 : \