summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- ext/XS-APItest/t/utf8.t 7
-rw-r--r-- inline.h 42
2 files changed, 37 insertions, 12 deletions
diff --git a/ext/XS-APItest/t/utf8.t b/ext/XS-APItest/t/utf8.t
index a05194cfcc..f4af4c4298 100644
--- a/ext/XS-APItest/t/utf8.t
+++ b/ext/XS-APItest/t/utf8.t
@@ -1207,9 +1207,10 @@ SKIP:
[ $utf, $utf_ch_len * 5, -4, $utf_ch_len, "utf in range b, backward" ],
[ $utf, $utf_ch_len * 5, 6, length($utf), "utf out of range, forward" ],
[ $utf, $utf_ch_len * 5, -6, 0, "utf out of range, backward" ],
- [ $bad_start, 0, 1, 1, "bad start, forward 1 from 0" ],
- [ $bad_start, 0, $utf_ch_len-1, $utf_ch_len-1, "bad start, forward ch_len-1 from 0" ],
- [ $bad_start, 0, $utf_ch_len, $utf_ch_len*2-1, "bad start, forward ch_len from 0" ],
+ [ $bad_start, 0, 1, $utf_ch_len-1, "bad start, forward 1 from 0" ],
+ [ $bad_start, 0, 5, 5 * $utf_ch_len-1, "bad start, forward 5 chars from 0" ],
+ [ $bad_start, 0, 9, length($bad_start)-$utf_ch_len, "bad start, forward 9 chars from 0" ],
+ [ $bad_start, 0, 10, length $bad_start, "bad start, forward 10 chars from 0" ],
[ $bad_start, $utf_ch_len-1, -1, 0, "bad start, back 1 from first start byte" ],
[ $bad_start, $utf_ch_len-2, -1, 0, "bad start, back 1 from before first start byte" ],
[ $bad_start, 0, -1, 0, "bad start, back 1 from 0" ],
diff --git a/inline.h b/inline.h
index 2d1e1ddbd5..1a1715f044 100644
--- a/inline.h
+++ b/inline.h
@@ -1985,7 +1985,10 @@ Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
=for apidoc utf8_hop
Return the UTF-8 pointer C<s> displaced by C<off> characters, either
-forward or backward.
+forward (if C<off> is positive) or backward (if negative). C<s> does not need
+to be pointing to the starting byte of a character. If it isn't, one count of
+C<off> will be used up to get to the start of the next character for forward
+hops, and to the start of the current character for backward ones.
WARNING: Prefer L</utf8_hop_safe> to this one.
@@ -1993,10 +1996,6 @@ Do NOT use this function unless you B<know> C<off> is within
the UTF-8 data pointed to by C<s> B<and> that on entry C<s> is aligned
on the first byte of a character or just after the last byte of a character.
-If <off> is negative, C<s> does not need to be pointing to the starting byte of
-a character. If it isn't, one count of C<off> will be used up to get to that
-start.
-
=cut
*/
@@ -2006,10 +2005,20 @@ Perl_utf8_hop(const U8 *s, SSize_t off)
PERL_ARGS_ASSERT_UTF8_HOP;
/* Note: cannot use UTF8_IS_...() too eagerly here since e.g
- * the bitops (especially ~) can create illegal UTF-8.
+ * the XXX bitops (especially ~) can create illegal UTF-8.
* In other words: in Perl UTF-8 is not just for Unicode. */
- if (off >= 0) {
+ if (off > 0) {
+
+ /* Get to next non-continuation byte */
+ if (UNLIKELY(UTF8_IS_CONTINUATION(*s))) {
+ do {
+ s++;
+ }
+ while (UTF8_IS_CONTINUATION(*s));
+ off--;
+ }
+
while (off--)
s += UTF8SKIP(s);
}
@@ -2020,6 +2029,7 @@ Perl_utf8_hop(const U8 *s, SSize_t off)
s--;
}
}
+
GCC_DIAG_IGNORE(-Wcast-qual)
return (U8 *)s;
GCC_DIAG_RESTORE
@@ -2029,7 +2039,9 @@ Perl_utf8_hop(const U8 *s, SSize_t off)
=for apidoc utf8_hop_forward
Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
-forward.
+forward. C<s> does not need to be pointing to the starting byte of a
+character. If it isn't, one count of C<off> will be used up to get to the
+start of the next character.
C<off> must be non-negative.
@@ -2054,6 +2066,15 @@ Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end)
assert(s <= end);
assert(off >= 0);
+ if (off && UNLIKELY(UTF8_IS_CONTINUATION(*s))) {
+ /* Get to next non-continuation byte */
+ do {
+ s++;
+ }
+ while (UTF8_IS_CONTINUATION(*s));
+ off--;
+ }
+
while (off--) {
STRLEN skip = UTF8SKIP(s);
if ((STRLEN)(end - s) <= skip) {
@@ -2122,7 +2143,10 @@ Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
=for apidoc utf8_hop_safe
Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
-either forward or backward.
+either forward or backward. C<s> does not need to be pointing to the starting
+byte of a character. If it isn't, one count of C<off> will be used up to get
+to the start of the next character for forward hops, and to the start of the
+current character for backward ones.
When moving backward it will not move before C<start>.