diff options
author | David Mitchell <davem@iabyn.com> | 2012-07-26 16:04:09 +0100 |
---|---|---|
committer | David Mitchell <davem@iabyn.com> | 2012-09-08 15:42:06 +0100 |
commit | 6502e08109cd003b2cdf39bc94ef35e52203240b (patch) | |
tree | ae4071332e6a7fd61354d33941476643066d5f56 /regexec.c | |
parent | 2c7b5d7698f52b86acffe19a7ec15e85c99337fe (diff) | |
download | perl-6502e08109cd003b2cdf39bc94ef35e52203240b.tar.gz |
Don't copy all of the match string buffer
When a pattern matches, and that pattern contains captures (or $`, $&, $'
or /p are present), a copy is made of the whole original string, so
that $1 et al continue to hold the correct value even if the original
string is subsequently modified. This can have severe performance
penalties; for example, this code causes a 1Mb buffer to be allocated,
copied and freed a million times:
$&;
$x = 'x' x 1_000_000;
1 while $x =~ /(.)/g;
This commit changes this so that, where possible, only the needed
substring of the original string is copied: in the above case, only a
1-byte buffer is copied each time. Also, it now reuses or reallocs the
buffer, rather than freeing and mallocing each time.
Now that PL_sawampersand is a 3-bit flag indicating separately whether
$`, $& and $' have been seen, they each contribute only their own
individual penalty; which ones have been seen will limit the extent to
which we can avoid copying the whole buffer.
Note that the above code *without* the $& is not currently slow, but only
because the copying is artificially disabled to avoid the performance hit.
The next but one commit will remove that hack, meaning that it will still
be fast, but will now be correct in the presence of a modified original
string.
We achieve this by by adding suboffset and subcoffset fields to the
existing subbeg and sublen fields of a regex, to indicate how many bytes
and characters have been skipped from the logical start of the string till
the physical start of the buffer. To avoid copying stuff at the end, we
just reduce sublen. For example, in this:
"abcdefgh" =~ /(c)d/
subbeg points to a malloced buffer containing "c\0"; sublen == 1,
and suboffset == 2 (as does subcoffset).
while if $& has been seen,
subbeg points to a malloced buffer containing "cd\0"; sublen == 2,
and suboffset == 2.
If in addition $' has been seen, then
subbeg points to a malloced buffer containing "cdefgh\0"; sublen == 6,
and suboffset == 2.
The regex engine won't do this by default; there are two new flag bits,
REXEC_COPY_SKIP_PRE and REXEC_COPY_SKIP_POST, which in conjunction with
REXEC_COPY_STR, request that the engine skip the start or end of the
buffer (it will still copy in the presence of the relevant $`, $&, $',
/p).
Only pp_match has been enhanced to use these extra flags; substitution
can't easily benefit, since the usual action of s///g is to copy the
whole string first time round, then perform subsequent matching iterations
against the copy, without further copying. So you still need to copy most
of the buffer.
Diffstat (limited to 'regexec.c')
-rw-r--r-- | regexec.c | 108 |
1 files changed, 101 insertions, 7 deletions
@@ -2566,9 +2566,7 @@ got_it: /* make sure $`, $&, $', and $digit will work later */ if ( !(flags & REXEC_NOT_FIRST) ) { - RX_MATCH_COPY_FREE(rx); if (flags & REXEC_COPY_STR) { - const I32 i = PL_regeol - strbeg; #ifdef PERL_OLD_COPY_ON_WRITE if ((SvIsCOW(sv) || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) { @@ -2580,17 +2578,105 @@ got_it: prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv); prog->subbeg = (char *)SvPVX_const(prog->saved_copy); assert (SvPOKp(prog->saved_copy)); + prog->sublen = PL_regeol - strbeg; + prog->suboffset = 0; + prog->subcoffset = 0; } else #endif { - RX_MATCH_COPIED_on(rx); - s = savepvn(strbeg, i); - prog->subbeg = s; - } - prog->sublen = i; + I32 min = 0; + I32 max = PL_regeol - strbeg; + I32 sublen; + + if ( (flags & REXEC_COPY_SKIP_POST) + && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */ + && !(PL_sawampersand & SAWAMPERSAND_RIGHT) + ) { /* don't copy $' part of string */ + U32 n = (PL_sawampersand & SAWAMPERSAND_MIDDLE) ? 0 : 1; + max = -1; + /* calculate the right-most part of the string covered + * by a capture. Due to look-ahead, this may be to + * the right of $&, so we have to scan all captures */ + while (n <= prog->lastparen) { + if (prog->offs[n].end > max) + max = prog->offs[n].end; + n++; + } + if (max == -1) + max = (PL_sawampersand & SAWAMPERSAND_LEFT) + ? prog->offs[0].start + : 0; + assert(max >= 0 && max <= PL_regeol - strbeg); + } + + if ( (flags & REXEC_COPY_SKIP_PRE) + && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */ + && !(PL_sawampersand & SAWAMPERSAND_LEFT) + ) { /* don't copy $` part of string */ + U32 n = (PL_sawampersand & SAWAMPERSAND_MIDDLE) ? 0 : 1; + min = max; + /* calculate the left-most part of the string covered + * by a capture. Due to look-behind, this may be to + * the left of $&, so we have to scan all captures */ + while (min && n <= prog->lastparen) { + if ( prog->offs[n].start != -1 + && prog->offs[n].start < min) + { + min = prog->offs[n].start; + } + n++; + } + if ((PL_sawampersand & SAWAMPERSAND_RIGHT) + && min > prog->offs[0].end + ) + min = prog->offs[0].end; + + } + + assert(min >= 0 && min <= max && min <= PL_regeol - strbeg); + sublen = max - min; + + if (RX_MATCH_COPIED(rx)) { + if (sublen > prog->sublen) + prog->subbeg = + (char*)saferealloc(prog->subbeg, sublen+1); + } + else + prog->subbeg = (char*)safemalloc(sublen+1); + Copy(strbeg + min, prog->subbeg, sublen, char); + prog->subbeg[sublen] = '\0'; + prog->suboffset = min; + prog->sublen = sublen; + } + RX_MATCH_COPIED_on(rx); + prog->subcoffset = prog->suboffset; + if (prog->suboffset && utf8_target) { + /* Convert byte offset to chars. + * XXX ideally should only compute this if @-/@+ + * has been seen, a la PL_sawampersand ??? */ + + /* If there's a direct correspondence between the + * string which we're matching and the original SV, + * then we can use the utf8 len cache associated with + * the SV. In particular, it means that under //g, + * sv_pos_b2u() will use the previously cached + * position to speed up working out the new length of + * subcoffset, rather than counting from the start of + * the string each time. This stops + * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g; + * from going quadratic */ + if (SvPOKp(sv) && SvPVX(sv) == strbeg) + sv_pos_b2u(sv, &(prog->subcoffset)); + else + prog->subcoffset = utf8_length((U8*)strbeg, + (U8*)(strbeg+prog->suboffset)); + } } else { + RX_MATCH_COPY_FREE(rx); prog->subbeg = strbeg; + prog->suboffset = 0; + prog->subcoffset = 0; prog->sublen = PL_regeol - strbeg; /* strend may have been modified */ } } @@ -2695,6 +2781,8 @@ S_regtry(pTHX_ regmatch_info *reginfo, char **startpos) $` inside (?{}) could fail... */ PL_reg_oldsaved = prog->subbeg; PL_reg_oldsavedlen = prog->sublen; + PL_reg_oldsavedoffset = prog->suboffset; + PL_reg_oldsavedcoffset = prog->suboffset; #ifdef PERL_OLD_COPY_ON_WRITE PL_nrs = prog->saved_copy; #endif @@ -2703,6 +2791,8 @@ S_regtry(pTHX_ regmatch_info *reginfo, char **startpos) else PL_reg_oldsaved = NULL; prog->subbeg = PL_bostr; + prog->suboffset = 0; + prog->subcoffset = 0; prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */ } #ifdef DEBUGGING @@ -4535,6 +4625,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) RXp_MATCH_COPIED_off(re); re->subbeg = rex->subbeg; re->sublen = rex->sublen; + re->suboffset = rex->suboffset; + re->subcoffset = rex->subcoffset; rei = RXi_GET(re); DEBUG_EXECUTE_r( debug_start_match(re_sv, utf8_target, locinput, PL_regeol, @@ -7167,6 +7259,8 @@ restore_pos(pTHX_ void *arg) if (PL_reg_oldsaved) { rex->subbeg = PL_reg_oldsaved; rex->sublen = PL_reg_oldsavedlen; + rex->suboffset = PL_reg_oldsavedoffset; + rex->subcoffset = PL_reg_oldsavedcoffset; #ifdef PERL_OLD_COPY_ON_WRITE rex->saved_copy = PL_nrs; #endif |