Rewrite delimcpy to use memchr and Copy, not per-byte

Prior to this commit delimcpy() parsed its input byte-by-byte, looking for a particular character, and copied the input to the output stopping just before the first such occurrence. memchr() is much faster for finding a single character. The complication is that if the character is preceded by a backslash, it doesn't count as that character, it is considered to be escaped, and parsing continues to the first unescaped occurrence, if any. Each escaping backslash is not copied. The prior code also failed to account for the possibility of the delimiter being a backslash, the same as the escape. The new routine looks for the character with memchr, sees if it is escaped. If not, Copy does the whole copy at once. If it is escaped, it uses Copy up to that backslash, and repeats the process.
author: Karl Williamson <khw@cpan.org> 2020-10-16 13:27:55 -0600
committer: Karl Williamson <khw@cpan.org> 2020-10-31 11:04:19 -0600
commit: cc448ceab170493a35039184f4f85054a8ddf971 (patch)
tree: 23ad961bdeae2eb6fe5b5bc4d8be4494d9a1d180
parent: f30cbf5aa3835c9fb81f5212c3c56439f940ae19 (diff)
download: perl-cc448ceab170493a35039184f4f85054a8ddf971.tar.gz
4 files changed, 175 insertions, 46 deletions
diff --git a/embed.fnc b/embed.fnc
index be253fe72f..e088cd844c 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -753,8 +753,9 @@ Ap	|I32	|debop		|NN const OP* o
 Ap	|I32	|debstack
 Ap	|I32	|debstackptrs
 pR	|SV *	|defelem_target	|NN SV *sv|NULLOK MAGIC *mg
-ATp	|char*	|delimcpy	|NN char* to|NN const char* toend|NN const char* from \
-				|NN const char* fromend|int delim|NN I32* retlen
+ATpd	|char*	|delimcpy|NN char* to|NN const char* to_end		\
+			 |NN const char* from|NN const char* from_end	\
+			 |const int delim|NN I32* retlen
 Tpd	|char*	|delimcpy_no_escape|NN char* to|NN const char* toend	\
 				   |NN const char* from			\
 				   |NN const char* fromend|int delim	\
diff --git a/ext/XS-APItest/t/delimcpy.t b/ext/XS-APItest/t/delimcpy.t
index e72c9903b4..9d2c7d133e 100644
--- a/ext/XS-APItest/t/delimcpy.t
+++ b/ext/XS-APItest/t/delimcpy.t
@@ -132,16 +132,17 @@ foreach my $d ("x", "\0") {     # Try both printable and NUL delimiters
     } while ($trunc_dest_len > 0);
 }
 
-TODO: {
+{
     # Repeat a few of the tests with a backslash delimiter, which means there
     # is no possibiliby of an escape
 
-        local $TODO = 'current code is broken when the delimiter is a backslash';
         my $d = "\\";
         my $source = $ib;
         my $source_len = 1;
         my $should_be = $source;
 
+        pass 'delimiter is a backslash for the rest of the tests';
+
         $ret = test_delimcpy($source, $source_len, $d, $source_len, $source_len, $poison);
         is($ret->[0], expected($source, $source_len, $poison, $source_len),
            "delimcpy works when there is no delimiter at all");
diff --git a/proto.h b/proto.h
index 8a211e84ab..4abb9769b2 100644
--- a/proto.h
+++ b/proto.h
@@ -837,9 +837,9 @@ PERL_CALLCONV SV *	Perl_defelem_target(pTHX_ SV *sv, MAGIC *mg)
 
 PERL_CALLCONV void	Perl_delete_eval_scope(pTHX);
 #define PERL_ARGS_ASSERT_DELETE_EVAL_SCOPE
-PERL_CALLCONV char*	Perl_delimcpy(char* to, const char* toend, const char* from, const char* fromend, int delim, I32* retlen);
+PERL_CALLCONV char*	Perl_delimcpy(char* to, const char* to_end, const char* from, const char* from_end, const int delim, I32* retlen);
 #define PERL_ARGS_ASSERT_DELIMCPY	\
-	assert(to); assert(toend); assert(from); assert(fromend); assert(retlen)
+	assert(to); assert(to_end); assert(from); assert(from_end); assert(retlen)
 PERL_CALLCONV char*	Perl_delimcpy_no_escape(char* to, const char* toend, const char* from, const char* fromend, int delim, I32* retlen);
 #define PERL_ARGS_ASSERT_DELIMCPY_NO_ESCAPE	\
 	assert(to); assert(toend); assert(from); assert(fromend); assert(retlen)
diff --git a/util.c b/util.c
index df43cdd237..5d2d1ba17c 100644
--- a/util.c
+++ b/util.c
@@ -533,44 +533,13 @@ Free_t   Perl_mfree (Malloc_t where)
 
 #endif
 
-/* copy a string up to some (non-backslashed) delimiter, if any.
- * With allow_escape, converts \<delimiter> to <delimiter>, while leaves
- * \<non-delimiter> as-is.
- * Returns the position in the src string of the closing delimiter, if
- * any, or returns fromend otherwise.
- * This is the internal implementation for Perl_delimcpy and
- * Perl_delimcpy_no_escape.
- */
-
-static char *
-S_delimcpy_intern(char *to, const char *toend, const char *from,
-	   const char *fromend, int delim, I32 *retlen,
-	   const bool allow_escape)
-{
-    I32 tolen;
-
-    PERL_ARGS_ASSERT_DELIMCPY;
-
-    for (tolen = 0; from < fromend; from++, tolen++) {
-	if (allow_escape && *from == '\\' && from + 1 < fromend) {
-	    if (from[1] != delim) {
-		if (to < toend)
-		    *to++ = *from;
-		tolen++;
-	    }
-	    from++;
-	}
-	else if (*from == delim)
-	    break;
-	if (to < toend)
-	    *to++ = *from;
-    }
-    if (to < toend)
-	*to = '\0';
-    *retlen = tolen;
-    return (char *)from;
-}
-
+/* This is the value stored in *retlen in the two delimcpy routines below when
+ * there wasn't enough room in the destination to store everything it was asked
+ * to.  The value is deliberately very large so that hopefully if code uses it
+ * unquestioninly to access memory, it will likely segfault.  And it is small
+ * enough that if the caller does some arithmetic on it before accessing, it
+ * won't overflow into a small legal number. */
+#define DELIMCPY_OUT_OF_BOUNDS_RET  I32_MAX
 
 /*
 =for apidoc_section String Handling
@@ -627,12 +596,170 @@ Perl_delimcpy_no_escape(char *to, const char *toend, const char *from,
     return (char *) from + copy_len;
 }
 
+/*
+=for apidoc delimcpy
+
+Copy a source buffer to a destination buffer, stopping at (but not including)
+the first occurrence in the source of an unescaped (defined below) delimiter
+byte, C<delim>.  The source is the bytes between S<C<from> and C<from_end> -
+1>.  Similarly, the dest is C<to> up to C<to_end>.
+
+The number of bytes copied is written to C<*retlen>.
+
+Returns the position of the first uncopied C<delim> in the C<from> buffer, but
+if there is no such occurrence before C<from_end>, then C<from_end> is returned,
+and the entire buffer S<C<from> .. C<from_end> - 1> is copied.
+
+If there is room in the destination available after the copy, an extra
+terminating safety C<NUL> byte is appended (not included in the returned
+length).
+
+The error case is if the destination buffer is not large enough to accommodate
+everything that should be copied.  In this situation, a value larger than
+S<C<to_end> - C<to>> is written to C<*retlen>, and as much of the source as
+fits will be written to the destination.  Not having room for the safety C<NUL>
+is not considered an error.
+
+In the following examples, let C<x> be the delimiter, and C<0> represent a C<NUL>
+byte (B<NOT> the digit C<0>).  Then we would have
+
+  Source     Destination
+ abcxdef        abc0
+
+provided the destination buffer is at least 4 bytes long.
+
+An escaped delimiter is one which is immediately preceded by a single
+backslash.  Escaped delimiters are copied, and the copy continues past the
+delimiter; the backslash is not copied:
+
+  Source       Destination
+ abc\xdef       abcxdef0
+
+(provided the destination buffer is at least 8 bytes long).
+
+It's actually somewhat more complicated than that. A sequence of any odd number
+of backslashes escapes the following delimiter, and the copy continues with
+exactly one of the backslashes stripped.
+
+     Source         Destination
+     abc\xdef          abcxdef0
+   abc\\\xdef        abc\\xdef0
+ abc\\\\\xdef      abc\\\\xdef0
+
+(as always, if the destination is large enough)
+
+An even number of preceding backslashes does not escape the delimiter, so that
+the copy stops just before it, and includes all the backslashes (no stripping;
+zero is considered even):
+
+      Source         Destination
+      abcxdef          abc0
+    abc\\xdef          abc\\0
+  abc\\\\xdef          abc\\\\0
+
+=cut
+*/
+
 char *
-Perl_delimcpy(char *to, const char *toend, const char *from, const char *fromend, int delim, I32 *retlen)
+Perl_delimcpy(char *to, const char *to_end,
+              const char *from, const char *from_end,
+              const int delim, I32 *retlen)
 {
+    const char * const orig_to = to;
+    Ptrdiff_t copy_len = 0;
+    bool stopped_early = FALSE;     /* Ran out of room to copy to */
+
     PERL_ARGS_ASSERT_DELIMCPY;
+    assert(from_end >= from);
+    assert(to_end >= to);
+
+    /* Don't use the loop for the trivial case of the first character being the
+     * delimiter; otherwise would have to worry inside the loop about backing
+     * up before the start of 'from' */
+    if (LIKELY(from_end > from && *from != delim)) {
+        while ((copy_len = from_end - from) > 0) {
+            const char * backslash_pos;
+            const char * delim_pos;
+
+            /* Look for the next delimiter in the remaining portion of the
+             * source. A loop invariant is that we already know that the copy
+             * should include *from; this comes from the conditional before the
+             * loop, and how we set things up at the end of each iteration */
+            delim_pos = (const char *) memchr(from + 1, delim, copy_len - 1);
+
+            /* If didn't find it, done looking; set up so copies all of the
+             * source */
+            if (! delim_pos) {
+                copy_len = from_end - from;
+                break;
+            }
+
+            /* Look for a backslash immediately before the delimiter */
+            backslash_pos = delim_pos - 1;
+
+            /* If the delimiter is not escaped, this ends the copy */
+            if (*backslash_pos != '\\') {
+                copy_len = delim_pos - from;
+                break;
+            }
+
+            /* Here there is a backslash just before the delimiter, but it
+             * could be the final backslash in a sequence of them.  Backup to
+             * find the first one in it. */
+            do {
+                backslash_pos--;
+            }
+            while (backslash_pos >= from && *backslash_pos == '\\');
+
+            /* If the number of backslashes is even, they just escape one
+             * another, leaving the delimiter unescaped, and stopping the copy.
+             * */
+            if (! ((delim_pos - (backslash_pos + 1)) & 1)) {
+                copy_len = delim_pos - from;  /* even, copy up to delimiter */
+                break;
+            }
+
+            /* Here is odd, so the delimiter is escaped.  We will try to copy
+             * all but the final backslash in the sequence */
+            copy_len = delim_pos - 1 - from;
 
-    return S_delimcpy_intern(to, toend, from, fromend, delim, retlen, 1);
+            /* Do the copy, but not beyond the end of the destination */
+            if (copy_len >= to_end - to) {
+                Copy(from, to, to_end - to, char);
+                stopped_early = TRUE;
+                to = (char *) to_end;
+            }
+            else {
+                Copy(from, to, copy_len, char);
+                to += copy_len;
+            }
+
+            /* Set up so next iteration will include the delimiter */
+            from = delim_pos;
+        }
+    }
+
+    /* Here, have found the final segment to copy.  Copy that, but not beyond
+     * the size of the destination.  If not enough room, copy as much as can
+     * fit, and set error return */
+    if (stopped_early || copy_len > to_end - to) {
+        Copy(from, to, to_end - to, char);
+        *retlen = DELIMCPY_OUT_OF_BOUNDS_RET;
+    }
+    else {
+        Copy(from, to, copy_len, char);
+
+        to += copy_len;
+
+        /* If there is extra space available, add a trailing NUL */
+        if (to < to_end) {
+            *to = '\0';
+        }
+
+        *retlen = to - orig_to;
+    }
+
+    return (char *) from + copy_len;
 }
 
 /*
author	Karl Williamson <khw@cpan.org>	2020-10-16 13:27:55 -0600
committer	Karl Williamson <khw@cpan.org>	2020-10-31 11:04:19 -0600
commit	cc448ceab170493a35039184f4f85054a8ddf971 (patch)
tree	23ad961bdeae2eb6fe5b5bc4d8be4494d9a1d180
parent	f30cbf5aa3835c9fb81f5212c3c56439f940ae19 (diff)
download	perl-cc448ceab170493a35039184f4f85054a8ddf971.tar.gz