summary refs log tree commit diff
diff options
context:
space:
mode:
author Karl Williamson <public@khwilliamson.com> 2012-03-05 20:56:52 -0700
committer Karl Williamson <public@khwilliamson.com> 2012-03-06 07:11:46 -0700
commit b9933c15464c85c20ff18d826f910218855216b0 (patch)
tree 454bf10a8d754303fbebdbad0a07156417d64cd1
parent fab2782b37b5570d7f8f8065fd7d18621117ed49 (diff)
download perl-smoke-me/khw-mac.tar.gz
some macro cleanup (smoke-me/khw-mac)
-rw-r--r-- utf8.c 40
-rw-r--r-- utf8.h 9
2 files changed, 11 insertions, 38 deletions
diff --git a/utf8.c b/utf8.c
index 2b1e99b23a..f7c28da5e3 100644
--- a/utf8.c
+++ b/utf8.c
@@ -276,43 +276,15 @@ five bytes or more.
STATIC STRLEN
S_is_utf8_char_slow(const U8 *s, const STRLEN len)
{
- U8 u = *s;
- STRLEN slen;
- UV uv, ouv;
-
- PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW;
-
- if (UTF8_IS_INVARIANT(u))
- return len == 1;
+ dTHX; /* The function called below requires thread context */
- if (!UTF8_IS_START(u))
- return 0;
-
- if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
- return 0;
+ STRLEN actual_len;
- slen = len - 1;
- s++;
-#ifdef EBCDIC
- u = NATIVE_TO_UTF(u);
-#endif
- u &= UTF_START_MASK(len);
- uv = u;
- ouv = uv;
- while (slen--) {
- if (!UTF8_IS_CONTINUATION(*s))
- return 0;
- uv = UTF8_ACCUMULATE(uv, *s);
- if (uv < ouv)
- return 0;
- ouv = uv;
- s++;
- }
+ PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW;
- if ((STRLEN)UNISKIP(uv) < len)
- return 0;
+ utf8n_to_uvuni(s, len, &actual_len, UTF8_CHECK_ONLY);
- return len;
+ return (actual_len == (STRLEN) -1) ? 0 : actual_len;
}
/*
@@ -608,7 +580,7 @@ Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
goto malformed;
}
- if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
+ if (LAX_UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
warning = UTF8_WARN_NON_CONTINUATION;
goto malformed;
diff --git a/utf8.h b/utf8.h
index e558bb68d5..ecabb20c30 100644
--- a/utf8.h
+++ b/utf8.h
@@ -139,12 +139,13 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
*/
#define UNI_IS_INVARIANT(c) (((UV)c) < 0x80)
-/* Note that C0 and C1 are invalid in legal UTF8, so the lower bound of the
- * below might ought to be C2 */
-#define UTF8_IS_START(c) (((U8)c) >= 0xc0)
+#define UTF8_IS_START(c) (((U8)c) >= 0xc2)
+#define LAX_UTF8_IS_START(c) (((U8)c) >= 0xc0) /* Allows overlong */
#define UTF8_IS_CONTINUATION(c) (((U8)c) >= 0x80 && (((U8)c) <= 0xbf))
#define UTF8_IS_CONTINUED(c) (((U8)c) & 0x80)
-#define UTF8_IS_DOWNGRADEABLE_START(c) (((U8)c & 0xfc) == 0xc0)
+
+/* Masking with 0xfe allows low bit to be 0 or 1; thus this matches 0xc[23] */
+#define UTF8_IS_DOWNGRADEABLE_START(c) (((U8)c & 0xfe) == 0xc2)
#define UTF_START_MARK(len) (((len) > 7) ? 0xFF : (0xFE << (7-(len))))
#define UTF_START_MASK(len) (((len) >= 7) ? 0x00 : (0x1F >> ((len)-2)))