(perl #129000) create a safer utf8_hop()

Unlike utf8_hop(), utf8_hop_safe() won't navigate before the beginning or after the end of the supplied buffer. The original version of this put all of the logic into utf8_hop_safe(), but in many cases a caller specifically needs to go forward or backward, and supplying the other limit made the function less usable, so I split the function into forward and backward cases. This split may also make inlining these functions more efficient or more likely.
author: Tony Cook <tony@develop-help.com> 2016-10-31 14:28:34 +1100
committer: Tony Cook <tony@develop-help.com> 2016-11-09 13:29:46 +1100
commit: 65df57a84b55413fcde1e64b86e3d740485536d3 (patch)
tree: 848c157d38e8a30b2ccf0bf95d4865174daf28ca /inline.h
parent: a7ea90b1451006596c4574b1e65894f0bda1bafc (diff)
download: perl-65df57a84b55413fcde1e64b86e3d740485536d3.tar.gz
1 files changed, 111 insertions, 0 deletions
diff --git a/inline.h b/inline.h
index 66ba348714..adcd85d24b 100644
--- a/inline.h
+++ b/inline.h
@@ -920,6 +920,117 @@ Perl_utf8_hop(const U8 *s, SSize_t off)
 }
 
 /*
+=for apidoc utf8_hop_forward
+
+Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
+forward.
+
+C<off> must be non-negative.
+
+C<s> must be before or equal to C<end>.
+
+When moving forward it will not move beyond C<end>.
+
+Will not exceed this limit even if the string is not valid "UTF-8".
+
+=cut
+*/
+
+PERL_STATIC_INLINE U8 *
+Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end)
+{
+    PERL_ARGS_ASSERT_UTF8_HOP_FORWARD;
+
+    /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
+     * the bitops (especially ~) can create illegal UTF-8.
+     * In other words: in Perl UTF-8 is not just for Unicode. */
+
+    assert(s <= end);
+    assert(off >= 0);
+
+    while (off--) {
+        STRLEN skip = UTF8SKIP(s);
+        if ((STRLEN)(end - s) <= skip)
+            return (U8 *)end;
+        s += skip;
+    }
+
+    return (U8 *)s;
+}
+
+/*
+=for apidoc utf8_hop_back
+
+Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
+backward.
+
+C<off> must be non-positive.
+
+C<s> must be after or equal to C<start>.
+
+When moving backward it will not move before C<start>.
+
+Will not exceed this limit even if the string is not valid "UTF-8".
+
+=cut
+*/
+
+PERL_STATIC_INLINE U8 *
+Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
+{
+    PERL_ARGS_ASSERT_UTF8_HOP_BACK;
+
+    /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
+     * the bitops (especially ~) can create illegal UTF-8.
+     * In other words: in Perl UTF-8 is not just for Unicode. */
+
+    assert(start <= s);
+    assert(off <= 0);
+
+    while (off++ && s > start) {
+        s--;
+        while (UTF8_IS_CONTINUATION(*s) && s > start)
+            s--;
+    }
+    
+    return (U8 *)s;
+}
+
+/*
+=for apidoc utf8_hop_safe
+
+Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
+either forward or backward.
+
+When moving backward it will not move before C<start>.
+
+When moving forward it will not move beyond C<end>.
+
+Will not exceed those limits even if the string is not valid "UTF-8".
+
+=cut
+*/
+
+PERL_STATIC_INLINE U8 *
+Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end)
+{
+    PERL_ARGS_ASSERT_UTF8_HOP_SAFE;
+
+    /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
+     * the bitops (especially ~) can create illegal UTF-8.
+     * In other words: in Perl UTF-8 is not just for Unicode. */
+
+    assert(start <= s && s <= end);
+
+    if (off >= 0) {
+        return utf8_hop_forward(s, off, end);
+    }
+    else {
+        return utf8_hop_back(s, off, start);
+    }
+}
+
+/*
 
 =for apidoc is_utf8_valid_partial_char
author	Tony Cook <tony@develop-help.com>	2016-10-31 14:28:34 +1100
committer	Tony Cook <tony@develop-help.com>	2016-11-09 13:29:46 +1100
commit	65df57a84b55413fcde1e64b86e3d740485536d3 (patch)
tree	848c157d38e8a30b2ccf0bf95d4865174daf28ca /inline.h
parent	a7ea90b1451006596c4574b1e65894f0bda1bafc (diff)
download	perl-65df57a84b55413fcde1e64b86e3d740485536d3.tar.gz