diff options
author | Karl Williamson <public@khwilliamson.com> | 2012-03-05 20:56:52 -0700 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2012-03-06 07:11:46 -0700 |
commit | b9933c15464c85c20ff18d826f910218855216b0 (patch) | |
tree | 454bf10a8d754303fbebdbad0a07156417d64cd1 | |
parent | fab2782b37b5570d7f8f8065fd7d18621117ed49 (diff) | |
download | perl-smoke-me/khw-mac.tar.gz |
some macro cleanup (branch: smoke-me/khw-mac)
-rw-r--r-- | utf8.c | 40 | ||||
-rw-r--r-- | utf8.h | 9 |
2 files changed, 11 insertions, 38 deletions
--- a/utf8.c
+++ b/utf8.c
@@ -276,43 +276,15 @@ five bytes or more.
 STATIC STRLEN
 S_is_utf8_char_slow(const U8 *s, const STRLEN len)
 {
-    U8 u = *s;
-    STRLEN slen;
-    UV uv, ouv;
-
-    PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW;
-
-    if (UTF8_IS_INVARIANT(u))
-        return len == 1;
+    dTHX;   /* The function called below requires thread context */
 
-    if (!UTF8_IS_START(u))
-        return 0;
-
-    if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
-        return 0;
+    STRLEN actual_len;
 
-    slen = len - 1;
-    s++;
-#ifdef EBCDIC
-    u = NATIVE_TO_UTF(u);
-#endif
-    u &= UTF_START_MASK(len);
-    uv = u;
-    ouv = uv;
-    while (slen--) {
-        if (!UTF8_IS_CONTINUATION(*s))
-            return 0;
-        uv = UTF8_ACCUMULATE(uv, *s);
-        if (uv < ouv)
-            return 0;
-        ouv = uv;
-        s++;
-    }
+    PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW;
 
-    if ((STRLEN)UNISKIP(uv) < len)
-        return 0;
+    utf8n_to_uvuni(s, len, &actual_len, UTF8_CHECK_ONLY);
 
-    return len;
+    return (actual_len == (STRLEN) -1) ? 0 : actual_len;
 }
 
 /*
@@ -608,7 +580,7 @@ Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
         goto malformed;
     }
 
-    if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
+    if (LAX_UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
         warning = UTF8_WARN_NON_CONTINUATION;
         goto malformed;
--- a/utf8.h
+++ b/utf8.h
@@ -139,12 +139,13 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
  */
 
 #define UNI_IS_INVARIANT(c) (((UV)c) < 0x80)
-/* Note that C0 and C1 are invalid in legal UTF8, so the lower bound of the
- * below might ought to be C2 */
-#define UTF8_IS_START(c) (((U8)c) >= 0xc0)
+#define UTF8_IS_START(c) (((U8)c) >= 0xc2)
+#define LAX_UTF8_IS_START(c) (((U8)c) >= 0xc0) /* Allows overlong */
 #define UTF8_IS_CONTINUATION(c) (((U8)c) >= 0x80 && (((U8)c) <= 0xbf))
 #define UTF8_IS_CONTINUED(c) (((U8)c) & 0x80)
-#define UTF8_IS_DOWNGRADEABLE_START(c) (((U8)c & 0xfc) == 0xc0)
+
+/* Masking with 0xfe allows low bit to be 0 or 1; thus this matches 0xc[23] */
+#define UTF8_IS_DOWNGRADEABLE_START(c) (((U8)c & 0xfe) == 0xc2)
 
 #define UTF_START_MARK(len) (((len) > 7) ? 0xFF : (0xFE << (7-(len))))
 #define UTF_START_MASK(len) (((len) >= 7) ? 0x00 : (0x1F >> ((len)-2)))