diff options
author | Jarkko Hietaniemi <jhi@iki.fi> | 2000-12-08 01:19:08 +0000 |
---|---|---|
committer | Jarkko Hietaniemi <jhi@iki.fi> | 2000-12-08 01:19:08 +0000 |
commit | 421a8bf2e4d7253d8eb0dc22451e55b15fc6c1e2 (patch) | |
tree | 6b9c0ff622d3926302e9c26b03c5d7c90ed63526 /utf8.c | |
parent | 4dffa63e352fd05b59c46f19323b72952b04b8ce (diff) | |
download | perl-421a8bf2e4d7253d8eb0dc22451e55b15fc6c1e2.tar.gz |
Introduce macros for UTF8 decoding.
p4raw-id: //depot/perl@8028
Diffstat (limited to 'utf8.c')
-rw-r--r-- | utf8.c | 29 |
1 files changed, 15 insertions, 14 deletions
@@ -190,10 +190,10 @@ If C<s> does not point to a well-formed UTF8 character, the behaviour is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY, it is assumed that the caller will raise a warning, and this function will set C<retlen> to C<-1> and return zero. If the C<flags> does not -contain UTF8_CHECK_ONLY, the UNICODE_REPLACEMENT_CHARACTER (0xFFFD) -will be returned, and C<retlen> will be set to the expected length of -the UTF-8 character in bytes. The C<flags> can also contain various -flags to allow deviations from the strict UTF-8 encoding (see F<utf8.h>). +contain UTF8_CHECK_ONLY, the UNICODE_REPLACEMENT (0xFFFD) will be +returned, and C<retlen> will be set to the expected length of the +UTF-8 character in bytes. The C<flags> can also contain various flags +to allow deviations from the strict UTF-8 encoding (see F<utf8.h>). =cut */ @@ -216,13 +216,13 @@ Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags) goto malformed; } - if (uv <= 0x7f) { /* Pure ASCII. */ + if (UTF8_IS_ASCII(uv)) { if (retlen) *retlen = 1; return *s; } - if ((uv >= 0x80 && uv <= 0xbf) && + if (UTF8_IS_CONTINUATION(uv) && !(flags & UTF8_ALLOW_CONTINUATION)) { if (dowarn) Perl_warner(aTHX_ WARN_UTF8, @@ -231,11 +231,11 @@ Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags) goto malformed; } - if ((uv >= 0xc0 && uv <= 0xfd && curlen > 1 && s[1] < 0x80) && + if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) && !(flags & UTF8_ALLOW_NON_CONTINUATION)) { if (dowarn) Perl_warner(aTHX_ WARN_UTF8, - "Malformed UTF-8 character (unexpected non-continuation byte 0x%02"UVxf" after byte 0x%02"UVxf")", + "Malformed UTF-8 character (unexpected non-continuation byte 0x%02"UVxf" after start byte 0x%02"UVxf")", (UV)s[1], uv); goto malformed; } @@ -276,10 +276,11 @@ Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags) ouv = uv; while (len--) { - if ((*s & 0xc0) != 0x80) { + if (!UTF8_IS_CONTINUATION(*s) && + !(flags & UTF8_ALLOW_NON_CONTINUATION)) { if (dowarn) Perl_warner(aTHX_ WARN_UTF8, - "Malformed UTF-8 character (unexpected continuation byte 0x%02x)", + "Malformed UTF-8 character (unexpected non-continuation byte 0x%02x)", *s); goto malformed; } @@ -297,14 +298,14 @@ Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags) ouv = uv; } - if ((uv >= 0xd800 && uv <= 0xdfff) && + if (UNICODE_IS_SURROGATE(uv) && !(flags & UTF8_ALLOW_SURROGATE)) { if (dowarn) Perl_warner(aTHX_ WARN_UTF8, "Malformed UTF-8 character (UTF-16 surrogate 0x%04"UVxf")", uv); goto malformed; - } else if ((uv == 0xfffe) && + } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) && !(flags & UTF8_ALLOW_BOM)) { if (dowarn) Perl_warner(aTHX_ WARN_UTF8, @@ -318,7 +319,7 @@ Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags) "Malformed UTF-8 character (%d byte%s, need %d)", expectlen, expectlen == 1 ? "": "s", UNISKIP(uv)); goto malformed; - } else if ((uv == 0xffff) && + } else if (UNICODE_IS_ILLEGAL(uv) && !(flags & UTF8_ALLOW_FFFF)) { if (dowarn) Perl_warner(aTHX_ WARN_UTF8, @@ -340,7 +341,7 @@ malformed: if (retlen) *retlen = expectlen; - return UNICODE_REPLACEMENT_CHARACTER; + return UNICODE_REPLACEMENT; } /* |