diff options
-rw-r--r-- | pod/perlapi.pod | 15 | ||||
-rw-r--r-- | utf8.c | 29 | ||||
-rw-r--r-- | utf8.h | 5 |
3 files changed, 33 insertions, 16 deletions
diff --git a/pod/perlapi.pod b/pod/perlapi.pod index 50ac40c650..7fdc55e63d 100644 --- a/pod/perlapi.pod +++ b/pod/perlapi.pod @@ -3234,11 +3234,12 @@ Found in file utf8.c =item utf8_hop -Move the C<s> pointing to UTF-8 data by C<off> characters, either forward -or backward. +Return the UTF-8 pointer C<s> displaced by C<off> characters, either +forward or backward. WARNING: do not use the following unless you *know* C<off> is within -the UTF-8 buffer pointed to by C<s>. +the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned +on the first byte of character or just after the last byte of a character. U8* utf8_hop(U8 *s, I32 off) @@ -3279,10 +3280,10 @@ If C<s> does not point to a well-formed UTF8 character, the behaviour is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY, it is assumed that the caller will raise a warning, and this function will set C<retlen> to C<-1> and return zero. If the C<flags> does not -contain UTF8_CHECK_ONLY, the UNICODE_REPLACEMENT_CHARACTER (0xFFFD) -will be returned, and C<retlen> will be set to the expected length of -the UTF-8 character in bytes. The C<flags> can also contain various -flags to allow deviations from the strict UTF-8 encoding (see F<utf8.h>). +contain UTF8_CHECK_ONLY, the UNICODE_REPLACEMENT (0xFFFD) will be +returned, and C<retlen> will be set to the expected length of the +UTF-8 character in bytes. The C<flags> can also contain various flags +to allow deviations from the strict UTF-8 encoding (see F<utf8.h>). U8* s utf8_to_uv(STRLEN curlen, STRLEN *retlen, U32 flags) @@ -137,7 +137,7 @@ Perl_is_utf8_char(pTHX_ U8 *s) while (slen--) { if ((*s & 0xc0) != 0x80) return 0; - uv = (uv << 6) | (*s & 0x3f); + uv = UTF8_ACCUMULATE(uv, *s); if (uv < ouv) return 0; ouv = uv; @@ -285,7 +285,7 @@ Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags) goto malformed; } else - uv = (uv << 6) | (*s & 0x3f); + uv = UTF8_ACCUMULATE(uv, *s); if (uv < ouv) { /* This cannot be allowed. */ if (dowarn) @@ -379,6 +379,10 @@ Perl_utf8_length(pTHX_ U8* s, U8* e) { STRLEN len = 0; + /* Note: cannot use UTF8_IS_...() too eagerly here since e.g. + * the bitops (especially ~) can create illegal UTF-8. + * In other words: in Perl UTF-8 is not just for Unicode. */ + if (e < s) Perl_croak(aTHX_ "panic: utf8_length: unexpected end"); while (s < e) { @@ -409,6 +413,10 @@ Perl_utf8_distance(pTHX_ U8 *a, U8 *b) { IV off = 0; + /* Note: cannot use UTF8_IS_...() too eagerly here since e.g. + * the bitops (especially ~) can create illegal UTF-8. + * In other words: in Perl UTF-8 is not just for Unicode. */ + if (a < b) { while (a < b) { U8 c = UTF8SKIP(a); @@ -436,17 +444,22 @@ Perl_utf8_distance(pTHX_ U8 *a, U8 *b) /* =for apidoc Am|U8*|utf8_hop|U8 *s|I32 off -Move the C<s> pointing to UTF-8 data by C<off> characters, either forward -or backward. +Return the UTF-8 pointer C<s> displaced by C<off> characters, either +forward or backward. WARNING: do not use the following unless you *know* C<off> is within -the UTF-8 buffer pointed to by C<s>. +the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned +on the first byte of character or just after the last byte of a character. =cut */ U8 * Perl_utf8_hop(pTHX_ U8 *s, I32 off) { + /* Note: cannot use UTF8_IS_...() too eagerly here since e.g + * the bitops (especially ~) can create illegal UTF-8. + * In other words: in Perl UTF-8 is not just for Unicode. */ + if (off >= 0) { while (off--) s += UTF8SKIP(s); @@ -454,10 +467,8 @@ Perl_utf8_hop(pTHX_ U8 *s, I32 off) else { while (off++) { s--; - if (*s & 0x80) { - while ((*s & 0xc0) == 0x80) - s--; - } + while (UTF8_IS_CONTINUATION(*s)) + s--; } } return s; @@ -65,6 +65,11 @@ END_EXTERN_C #define UTF8_IS_ASCII(c) ((c) < 0x80) #define UTF8_IS_START(c) ((c) >= 0xc0 && ((c) <= 0xfd)) #define UTF8_IS_CONTINUATION(c) ((c) >= 0x80 && ((c) <= 0xbf)) +#define UTF8_IS_CONTINUED(c) ((c) & 0x80) + +#define UTF8_CONTINUATION_MASK 0x3f +#define UTF8_ACCUMULATION_SHIFT 6 +#define UTF8_ACCUMULATE(old, new) ((old) << UTF8_ACCUMULATION_SHIFT | ((new) & UTF8_CONTINUATION_MASK)) #ifdef HAS_QUAD #define UNISKIP(uv) ( (uv) < 0x80 ? 1 : \ |