utf8_hop forwards: Change continuation start behavior

Prior to this commit, when hopping forwards from an initial position that is a continuation byte, the code treated that byte and each successive continuation byte as a single character until it reached a start byte, at which point it switched into normal mode. In contrast, when hopping backwards, all the consecutive continuation bytes are considered to be part of a single character (as they indeed are). Thus there is a discrepancy between forward and backward hopping, and the forward version seems wrong to me. This commit removes the discrepancy. There is no change in behavior if the starting position is at the beginning of a character. All calls in the core, except for the API test, are of this form. But if the initial position is in the middle of a character, the hop now moves to the beginning of the next character, subtracting just 1 from the count of characters to hop (instead of subtracting however many continuation bytes there are). This is how I would have expected it to work all along. Succinctly, getting to the next character now consumes one hop count, regardless of the direction and of which byte in the character is the starting position.
author: Karl Williamson <khw@cpan.org> 2022-07-10 10:06:17 -0600
committer: Karl Williamson <khw@cpan.org> 2022-12-07 09:16:58 -0700
commit: 76062242c5bacb046859572260d73cd6bc6f2004 (patch)
tree: 86846e053927a3f55e3f0624da52c8e2edefdd97 /inline.h
parent: f0cb6a0886fcc9d9d34b3d80ffb90829db33e738 (diff)
download: perl-76062242c5bacb046859572260d73cd6bc6f2004.tar.gz
1 files changed, 33 insertions, 9 deletions
diff --git a/inline.h b/inline.h
index 2d1e1ddbd5..1a1715f044 100644
--- a/inline.h
+++ b/inline.h
@@ -1985,7 +1985,10 @@ Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
 =for apidoc utf8_hop
 
 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
-forward or backward.
+forward (if C<off> is positive) or backward (if negative).  C<s> does not need
+to be pointing to the starting byte of a character.  If it isn't, one count of
+C<off> will be used up to get to the start of the next character for forward
+hops, and to the start of the current character for negative ones.
 
 WARNING: Prefer L</utf8_hop_safe> to this one.
 
@@ -1993,10 +1996,6 @@ Do NOT use this function unless you B<know> C<off> is within
 the UTF-8 data pointed to by C<s> B<and> that on entry C<s> is aligned
 on the first byte of a character or just after the last byte of a character.
 
-If <off> is negative, C<s> does not need to be pointing to the starting byte of
-a character.  If it isn't, one count of C<off> will be used up to get to that
-start.
-
 =cut
 */
 
@@ -2006,10 +2005,20 @@ Perl_utf8_hop(const U8 *s, SSize_t off)
     PERL_ARGS_ASSERT_UTF8_HOP;
 
     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
-     * the bitops (especially ~) can create illegal UTF-8.
+     * the XXX bitops (especially ~) can create illegal UTF-8.
      * In other words: in Perl UTF-8 is not just for Unicode. */
 
-    if (off >= 0) {
+    if (off > 0) {
+
+        /* Get to next non-continuation byte */
+        if (UNLIKELY(UTF8_IS_CONTINUATION(*s))) {
+            do {
+                s++;
+            }
+            while (UTF8_IS_CONTINUATION(*s));
+            off--;
+        }
+
         while (off--)
             s += UTF8SKIP(s);
     }
@@ -2020,6 +2029,7 @@ Perl_utf8_hop(const U8 *s, SSize_t off)
                 s--;
         }
     }
+
     GCC_DIAG_IGNORE(-Wcast-qual)
     return (U8 *)s;
     GCC_DIAG_RESTORE
@@ -2029,7 +2039,9 @@ Perl_utf8_hop(const U8 *s, SSize_t off)
 =for apidoc utf8_hop_forward
 
 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
-forward.
+forward.  C<s> does not need to be pointing to the starting byte of a
+character.  If it isn't, one count of C<off> will be used up to get to the
+start of the next character.
 
 C<off> must be non-negative.
 
@@ -2054,6 +2066,15 @@ Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end)
     assert(s <= end);
     assert(off >= 0);
 
+    if (off && UNLIKELY(UTF8_IS_CONTINUATION(*s))) {
+        /* Get to next non-continuation byte */
+        do {
+            s++;
+        }
+        while (UTF8_IS_CONTINUATION(*s));
+        off--;
+    }
+
     while (off--) {
         STRLEN skip = UTF8SKIP(s);
         if ((STRLEN)(end - s) <= skip) {
@@ -2122,7 +2143,10 @@ Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
 =for apidoc utf8_hop_safe
 
 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
-either forward or backward.
+either forward or backward.  C<s> does not need to be pointing to the starting
+byte of a character.  If it isn't, one count of C<off> will be used up to get
+to the start of the next character for forward hops, and to the start of the
+current character for negative ones.
 
 When moving backward it will not move before C<start>.
author	Karl Williamson <khw@cpan.org>	2022-07-10 10:06:17 -0600
committer	Karl Williamson <khw@cpan.org>	2022-12-07 09:16:58 -0700
commit	76062242c5bacb046859572260d73cd6bc6f2004 (patch)
tree	86846e053927a3f55e3f0624da52c8e2edefdd97 /inline.h
parent	f0cb6a0886fcc9d9d34b3d80ffb90829db33e738 (diff)
download	perl-76062242c5bacb046859572260d73cd6bc6f2004.tar.gz