diff options
-rw-r--r-- | ext/XS-APItest/t/utf8.t | 7 |
-rw-r--r-- | inline.h | 42 |
2 files changed, 37 insertions, 12 deletions
diff --git a/ext/XS-APItest/t/utf8.t b/ext/XS-APItest/t/utf8.t index a05194cfcc..f4af4c4298 100644 --- a/ext/XS-APItest/t/utf8.t +++ b/ext/XS-APItest/t/utf8.t @@ -1207,9 +1207,10 @@ SKIP: [ $utf, $utf_ch_len * 5, -4, $utf_ch_len, "utf in range b, backward" ], [ $utf, $utf_ch_len * 5, 6, length($utf), "utf out of range, forward" ], [ $utf, $utf_ch_len * 5, -6, 0, "utf out of range, backward" ], - [ $bad_start, 0, 1, 1, "bad start, forward 1 from 0" ], - [ $bad_start, 0, $utf_ch_len-1, $utf_ch_len-1, "bad start, forward ch_len-1 from 0" ], - [ $bad_start, 0, $utf_ch_len, $utf_ch_len*2-1, "bad start, forward ch_len from 0" ], + [ $bad_start, 0, 1, $utf_ch_len-1, "bad start, forward 1 from 0" ], + [ $bad_start, 0, 5, 5 * $utf_ch_len-1, "bad start, forward 5 chars from 0" ], + [ $bad_start, 0, 9, length($bad_start)-$utf_ch_len, "bad start, forward 9 chars from 0" ], + [ $bad_start, 0, 10, length $bad_start, "bad start, forward 10 chars from 0" ], [ $bad_start, $utf_ch_len-1, -1, 0, "bad start, back 1 from first start byte" ], [ $bad_start, $utf_ch_len-2, -1, 0, "bad start, back 1 from before first start byte" ], [ $bad_start, 0, -1, 0, "bad start, back 1 from 0" ], @@ -1985,7 +1985,10 @@ Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b) =for apidoc utf8_hop Return the UTF-8 pointer C<s> displaced by C<off> characters, either -forward or backward. +forward (if C<off> is positive) or backward (if negative). C<s> does not need +to be pointing to the starting byte of a character. If it isn't, one count of +C<off> will be used up to get to the start of the next character for forward +hops, and to the start of the current character for negative ones. WARNING: Prefer L</utf8_hop_safe> to this one. @@ -1993,10 +1996,6 @@ Do NOT use this function unless you B<know> C<off> is within the UTF-8 data pointed to by C<s> B<and> that on entry C<s> is aligned on the first byte of a character or just after the last byte of a character. 
-If <off> is negative, C<s> does not need to be pointing to the starting byte of -a character. If it isn't, one count of C<off> will be used up to get to that -start. - =cut */ @@ -2006,10 +2005,20 @@ Perl_utf8_hop(const U8 *s, SSize_t off) PERL_ARGS_ASSERT_UTF8_HOP; /* Note: cannot use UTF8_IS_...() too eagerly here since e.g - * the bitops (especially ~) can create illegal UTF-8. + * the XXX bitops (especially ~) can create illegal UTF-8. * In other words: in Perl UTF-8 is not just for Unicode. */ - if (off >= 0) { + if (off > 0) { + + /* Get to next non-continuation byte */ + if (UNLIKELY(UTF8_IS_CONTINUATION(*s))) { + do { + s++; + } + while (UTF8_IS_CONTINUATION(*s)); + off--; + } + while (off--) s += UTF8SKIP(s); } @@ -2020,6 +2029,7 @@ Perl_utf8_hop(const U8 *s, SSize_t off) s--; } } + GCC_DIAG_IGNORE(-Wcast-qual) return (U8 *)s; GCC_DIAG_RESTORE @@ -2029,7 +2039,9 @@ Perl_utf8_hop(const U8 *s, SSize_t off) =for apidoc utf8_hop_forward Return the UTF-8 pointer C<s> displaced by up to C<off> characters, -forward. +forward. C<s> does not need to be pointing to the starting byte of a +character. If it isn't, one count of C<off> will be used up to get to the +start of the next character. C<off> must be non-negative. @@ -2054,6 +2066,15 @@ Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end) assert(s <= end); assert(off >= 0); + if (off && UNLIKELY(UTF8_IS_CONTINUATION(*s))) { + /* Get to next non-continuation byte */ + do { + s++; + } + while (UTF8_IS_CONTINUATION(*s)); + off--; + } + while (off--) { STRLEN skip = UTF8SKIP(s); if ((STRLEN)(end - s) <= skip) { @@ -2122,7 +2143,10 @@ Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start) =for apidoc utf8_hop_safe Return the UTF-8 pointer C<s> displaced by up to C<off> characters, -either forward or backward. +either forward or backward. C<s> does not need to be pointing to the starting +byte of a character. 
If it isn't, one count of C<off> will be used up to get +to the start of the next character for forward hops, and to the start of the +current character for negative ones. When moving backward it will not move before C<start>. |