diff options
author | Karl Williamson <khw@cpan.org> | 2020-10-16 13:27:55 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2020-10-31 11:04:19 -0600 |
commit | cc448ceab170493a35039184f4f85054a8ddf971 (patch) | |
tree | 23ad961bdeae2eb6fe5b5bc4d8be4494d9a1d180 | |
parent | f30cbf5aa3835c9fb81f5212c3c56439f940ae19 (diff) | |
download | perl-cc448ceab170493a35039184f4f85054a8ddf971.tar.gz |
Rewrite delimcpy to use memchr and Copy, not per-byte
Prior to this commit delimcpy() parsed its input byte-by-byte, looking
for a particular character, and copied the input to the output stopping
just before the first such occurrence.
memchr() is much faster for finding a single character.
The complication is that if the character is preceded by a backslash,
it doesn't count as that character, it is considered to be escaped, and
parsing continues to the first unescaped occurrence, if any. Each
escaping backslash is not copied.
The prior code also failed to account for the possibility of the delimiter
being a backslash, the same as the escape.
The new routine looks for the character with memchr, sees if it is
escaped. If not, Copy does the whole copy at once. If it is escaped,
it uses Copy up to that backslash, and repeats the process.
-rw-r--r-- | embed.fnc | 5 | ||||
-rw-r--r-- | ext/XS-APItest/t/delimcpy.t | 5 | ||||
-rw-r--r-- | proto.h | 4 | ||||
-rw-r--r-- | util.c | 207 |
4 files changed, 175 insertions, 46 deletions
@@ -753,8 +753,9 @@ Ap |I32 |debop |NN const OP* o Ap |I32 |debstack Ap |I32 |debstackptrs pR |SV * |defelem_target |NN SV *sv|NULLOK MAGIC *mg -ATp |char* |delimcpy |NN char* to|NN const char* toend|NN const char* from \ - |NN const char* fromend|int delim|NN I32* retlen +ATpd |char* |delimcpy|NN char* to|NN const char* to_end \ + |NN const char* from|NN const char* from_end \ + |const int delim|NN I32* retlen Tpd |char* |delimcpy_no_escape|NN char* to|NN const char* toend \ |NN const char* from \ |NN const char* fromend|int delim \ diff --git a/ext/XS-APItest/t/delimcpy.t b/ext/XS-APItest/t/delimcpy.t index e72c9903b4..9d2c7d133e 100644 --- a/ext/XS-APItest/t/delimcpy.t +++ b/ext/XS-APItest/t/delimcpy.t @@ -132,16 +132,17 @@ foreach my $d ("x", "\0") { # Try both printable and NUL delimiters } while ($trunc_dest_len > 0); } -TODO: { +{ # Repeat a few of the tests with a backslash delimiter, which means there # is no possibiliby of an escape - local $TODO = 'current code is broken when the delimiter is a backslash'; my $d = "\\"; my $source = $ib; my $source_len = 1; my $should_be = $source; + pass 'delimiter is a backslash for the rest of the tests'; + $ret = test_delimcpy($source, $source_len, $d, $source_len, $source_len, $poison); is($ret->[0], expected($source, $source_len, $poison, $source_len), "delimcpy works when there is no delimiter at all"); @@ -837,9 +837,9 @@ PERL_CALLCONV SV * Perl_defelem_target(pTHX_ SV *sv, MAGIC *mg) PERL_CALLCONV void Perl_delete_eval_scope(pTHX); #define PERL_ARGS_ASSERT_DELETE_EVAL_SCOPE -PERL_CALLCONV char* Perl_delimcpy(char* to, const char* toend, const char* from, const char* fromend, int delim, I32* retlen); +PERL_CALLCONV char* Perl_delimcpy(char* to, const char* to_end, const char* from, const char* from_end, const int delim, I32* retlen); #define PERL_ARGS_ASSERT_DELIMCPY \ - assert(to); assert(toend); assert(from); assert(fromend); assert(retlen) + assert(to); assert(to_end); assert(from); assert(from_end); assert(retlen) PERL_CALLCONV char* Perl_delimcpy_no_escape(char* to, const char* toend, const char* from, const char* fromend, int delim, I32* retlen); #define PERL_ARGS_ASSERT_DELIMCPY_NO_ESCAPE \ assert(to); assert(toend); assert(from); assert(fromend); assert(retlen) @@ -533,44 +533,13 @@ Free_t Perl_mfree (Malloc_t where) #endif -/* copy a string up to some (non-backslashed) delimiter, if any. - * With allow_escape, converts \<delimiter> to <delimiter>, while leaves - * \<non-delimiter> as-is. - * Returns the position in the src string of the closing delimiter, if - * any, or returns fromend otherwise. - * This is the internal implementation for Perl_delimcpy and - * Perl_delimcpy_no_escape. - */ - -static char * -S_delimcpy_intern(char *to, const char *toend, const char *from, - const char *fromend, int delim, I32 *retlen, - const bool allow_escape) -{ - I32 tolen; - - PERL_ARGS_ASSERT_DELIMCPY; - - for (tolen = 0; from < fromend; from++, tolen++) { - if (allow_escape && *from == '\\' && from + 1 < fromend) { - if (from[1] != delim) { - if (to < toend) - *to++ = *from; - tolen++; - } - from++; - } - else if (*from == delim) - break; - if (to < toend) - *to++ = *from; - } - if (to < toend) - *to = '\0'; - *retlen = tolen; - return (char *)from; -} - +/* This is the value stored in *retlen in the two delimcpy routines below when + * there wasn't enough room in the destination to store everything it was asked + * to. The value is deliberately very large so that hopefully if code uses it + * unquestioninly to access memory, it will likely segfault. And it is small + * enough that if the caller does some arithmetic on it before accessing, it + * won't overflow into a small legal number. */ +#define DELIMCPY_OUT_OF_BOUNDS_RET I32_MAX /* =for apidoc_section String Handling @@ -627,12 +596,170 @@ Perl_delimcpy_no_escape(char *to, const char *toend, const char *from, return (char *) from + copy_len; } +/* +=for apidoc delimcpy + +Copy a source buffer to a destination buffer, stopping at (but not including) +the first occurrence in the source of an unescaped (defined below) delimiter +byte, C<delim>. The source is the bytes between S<C<from> and C<from_end> - +1>. Similarly, the dest is C<to> up to C<to_end>. + +The number of bytes copied is written to C<*retlen>. + +Returns the position of the first uncopied C<delim> in the C<from> buffer, but +if there is no such occurrence before C<from_end>, then C<from_end> is returned, +and the entire buffer S<C<from> .. C<from_end> - 1> is copied. + +If there is room in the destination available after the copy, an extra +terminating safety C<NUL> byte is appended (not included in the returned +length). + +The error case is if the destination buffer is not large enough to accommodate +everything that should be copied. In this situation, a value larger than +S<C<to_end> - C<to>> is written to C<*retlen>, and as much of the source as +fits will be written to the destination. Not having room for the safety C<NUL> +is not considered an error. + +In the following examples, let C<x> be the delimiter, and C<0> represent a C<NUL> +byte (B<NOT> the digit C<0>). Then we would have + + Source Destination + abcxdef abc0 + +provided the destination buffer is at least 4 bytes long. + +An escaped delimiter is one which is immediately preceded by a single +backslash. Escaped delimiters are copied, and the copy continues past the +delimiter; the backslash is not copied: + + Source Destination + abc\xdef abcxdef0 + +(provided the destination buffer is at least 8 bytes long). + +It's actually somewhat more complicated than that. A sequence of any odd number +of backslashes escapes the following delimiter, and the copy continues with +exactly one of the backslashes stripped. + + Source Destination + abc\xdef abcxdef0 + abc\\\xdef abc\\xdef0 + abc\\\\\xdef abc\\\\xdef0 + +(as always, if the destination is large enough) + +An even number of preceding backslashes does not escape the delimiter, so that +the copy stops just before it, and includes all the backslashes (no stripping; +zero is considered even): + + Source Destination + abcxdef abc0 + abc\\xdef abc\\0 + abc\\\\xdef abc\\\\0 + +=cut +*/ + char * -Perl_delimcpy(char *to, const char *toend, const char *from, const char *fromend, int delim, I32 *retlen) +Perl_delimcpy(char *to, const char *to_end, + const char *from, const char *from_end, + const int delim, I32 *retlen) { + const char * const orig_to = to; + Ptrdiff_t copy_len = 0; + bool stopped_early = FALSE; /* Ran out of room to copy to */ + PERL_ARGS_ASSERT_DELIMCPY; + assert(from_end >= from); + assert(to_end >= to); + + /* Don't use the loop for the trivial case of the first character being the + * delimiter; otherwise would have to worry inside the loop about backing + * up before the start of 'from' */ + if (LIKELY(from_end > from && *from != delim)) { + while ((copy_len = from_end - from) > 0) { + const char * backslash_pos; + const char * delim_pos; + + /* Look for the next delimiter in the remaining portion of the + * source. A loop invariant is that we already know that the copy + * should include *from; this comes from the conditional before the + * loop, and how we set things up at the end of each iteration */ + delim_pos = (const char *) memchr(from + 1, delim, copy_len - 1); + + /* If didn't find it, done looking; set up so copies all of the + * source */ + if (! delim_pos) { + copy_len = from_end - from; + break; + } + + /* Look for a backslash immediately before the delimiter */ + backslash_pos = delim_pos - 1; + + /* If the delimiter is not escaped, this ends the copy */ + if (*backslash_pos != '\\') { + copy_len = delim_pos - from; + break; + } + + /* Here there is a backslash just before the delimiter, but it + * could be the final backslash in a sequence of them. Backup to + * find the first one in it. */ + do { + backslash_pos--; + } + while (backslash_pos >= from && *backslash_pos == '\\'); + + /* If the number of backslashes is even, they just escape one + * another, leaving the delimiter unescaped, and stopping the copy. + * */ + if (! ((delim_pos - (backslash_pos + 1)) & 1)) { + copy_len = delim_pos - from; /* even, copy up to delimiter */ + break; + } + + /* Here is odd, so the delimiter is escaped. We will try to copy + * all but the final backslash in the sequence */ + copy_len = delim_pos - 1 - from; - return S_delimcpy_intern(to, toend, from, fromend, delim, retlen, 1); + /* Do the copy, but not beyond the end of the destination */ + if (copy_len >= to_end - to) { + Copy(from, to, to_end - to, char); + stopped_early = TRUE; + to = (char *) to_end; + } + else { + Copy(from, to, copy_len, char); + to += copy_len; + } + + /* Set up so next iteration will include the delimiter */ + from = delim_pos; + } + } + + /* Here, have found the final segment to copy. Copy that, but not beyond + * the size of the destination. If not enough room, copy as much as can + * fit, and set error return */ + if (stopped_early || copy_len > to_end - to) { + Copy(from, to, to_end - to, char); + *retlen = DELIMCPY_OUT_OF_BOUNDS_RET; + } + else { + Copy(from, to, copy_len, char); + + to += copy_len; + + /* If there is extra space available, add a trailing NUL */ + if (to < to_end) { + *to = '\0'; + } + + *retlen = to - orig_to; + } + + return (char *) from + copy_len; } /* |