diff options
author | David Mitchell <davem@iabyn.com> | 2013-06-15 17:54:10 +0100 |
---|---|---|
committer | David Mitchell <davem@iabyn.com> | 2013-07-28 10:33:34 +0100 |
commit | 60165aa4bc7ef12661c54168e5dd07abb3881564 (patch) | |
tree | 2a9f0928bc9ca82ce666abf8967eba4be1a0a0c1 /regexec.c | |
parent | 8088f39e7b6276147727ae77eecaa0bb20cfbf2d (diff) | |
download | perl-60165aa4bc7ef12661c54168e5dd07abb3881564.tar.gz |
add Perl_reg_set_capture_string() function
Cut and paste into a separate function, the block of code in
regexec_flags() that is responsible (on successful match) for setting
RX_SAVED_COPY, RX_SUBBEG etc, ready for use by capture vars like $1, $&.
Although this function is currently only called from one place, we will
shortly use it elsewhere too.
This should contain no functional changes.
Diffstat (limited to 'regexec.c')
-rw-r--r-- | regexec.c | 253 |
1 files changed, 136 insertions, 117 deletions
@@ -2047,6 +2047,138 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, return s; } +/* set RX_SAVED_COPY, RX_SUBBEG etc. + * flags have same meanings as with regexec_flags() */ + +void +Perl_reg_set_capture_string(pTHX_ REGEXP * const rx, + char *strbeg, + char *strend, + SV *sv, + U32 flags, + bool utf8_target) +{ + struct regexp *const prog = ReANY(rx); + + PERL_ARGS_ASSERT_REG_SET_CAPTURE_STRING; + + if (flags & REXEC_COPY_STR) { +#ifdef PERL_ANY_COW + if (SvCANCOW(sv)) { + if (DEBUG_C_TEST) { + PerlIO_printf(Perl_debug_log, + "Copy on write: regexp capture, type %d\n", + (int) SvTYPE(sv)); + } + RX_MATCH_COPY_FREE(rx); + prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv); + prog->subbeg = (char *)SvPVX_const(prog->saved_copy); + assert (SvPOKp(prog->saved_copy)); + prog->sublen = strend - strbeg; + prog->suboffset = 0; + prog->subcoffset = 0; + } else +#endif + { + I32 min = 0; + I32 max = strend - strbeg; + I32 sublen; + + if ( (flags & REXEC_COPY_SKIP_POST) + && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */ + && !(PL_sawampersand & SAWAMPERSAND_RIGHT) + ) { /* don't copy $' part of string */ + U32 n = 0; + max = -1; + /* calculate the right-most part of the string covered + * by a capture. Due to look-ahead, this may be to + * the right of $&, so we have to scan all captures */ + while (n <= prog->lastparen) { + if (prog->offs[n].end > max) + max = prog->offs[n].end; + n++; + } + if (max == -1) + max = (PL_sawampersand & SAWAMPERSAND_LEFT) + ? prog->offs[0].start + : 0; + assert(max >= 0 && max <= strend - strbeg); + } + + if ( (flags & REXEC_COPY_SKIP_PRE) + && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */ + && !(PL_sawampersand & SAWAMPERSAND_LEFT) + ) { /* don't copy $` part of string */ + U32 n = 0; + min = max; + /* calculate the left-most part of the string covered + * by a capture. Due to look-behind, this may be to + * the left of $&, so we have to scan all captures */ + while (min && n <= prog->lastparen) { + if ( prog->offs[n].start != -1 + && prog->offs[n].start < min) + { + min = prog->offs[n].start; + } + n++; + } + if ((PL_sawampersand & SAWAMPERSAND_RIGHT) + && min > prog->offs[0].end + ) + min = prog->offs[0].end; + + } + + assert(min >= 0 && min <= max && min <= strend - strbeg); + sublen = max - min; + + if (RX_MATCH_COPIED(rx)) { + if (sublen > prog->sublen) + prog->subbeg = + (char*)saferealloc(prog->subbeg, sublen+1); + } + else + prog->subbeg = (char*)safemalloc(sublen+1); + Copy(strbeg + min, prog->subbeg, sublen, char); + prog->subbeg[sublen] = '\0'; + prog->suboffset = min; + prog->sublen = sublen; + RX_MATCH_COPIED_on(rx); + } + prog->subcoffset = prog->suboffset; + if (prog->suboffset && utf8_target) { + /* Convert byte offset to chars. + * XXX ideally should only compute this if @-/@+ + * has been seen, a la PL_sawampersand ??? */ + + /* If there's a direct correspondence between the + * string which we're matching and the original SV, + * then we can use the utf8 len cache associated with + * the SV. In particular, it means that under //g, + * sv_pos_b2u() will use the previously cached + * position to speed up working out the new length of + * subcoffset, rather than counting from the start of + * the string each time. This stops + * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g; + * from going quadratic */ + if (SvPOKp(sv) && SvPVX(sv) == strbeg) + sv_pos_b2u(sv, &(prog->subcoffset)); + else + prog->subcoffset = utf8_length((U8*)strbeg, + (U8*)(strbeg+prog->suboffset)); + } + } + else { + RX_MATCH_COPY_FREE(rx); + prog->subbeg = strbeg; + prog->suboffset = 0; + prog->subcoffset = 0; + prog->sublen = strend - strbeg; + } +} + + + /* - regexec_flags - match a regexp against a string @@ -2680,123 +2812,10 @@ got_it: RX_MATCH_UTF8_set(rx, utf8_target); /* make sure $`, $&, $', and $digit will work later */ - if ( !(flags & REXEC_NOT_FIRST) ) { - if (flags & REXEC_COPY_STR) { -#ifdef PERL_ANY_COW - if (SvCANCOW(sv)) { - if (DEBUG_C_TEST) { - PerlIO_printf(Perl_debug_log, - "Copy on write: regexp capture, type %d\n", - (int) SvTYPE(sv)); - } - RX_MATCH_COPY_FREE(rx); - prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv); - prog->subbeg = (char *)SvPVX_const(prog->saved_copy); - assert (SvPOKp(prog->saved_copy)); - prog->sublen = reginfo->strend - strbeg; - prog->suboffset = 0; - prog->subcoffset = 0; - } else -#endif - { - I32 min = 0; - I32 max = reginfo->strend - strbeg; - I32 sublen; - - if ( (flags & REXEC_COPY_SKIP_POST) - && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */ - && !(PL_sawampersand & SAWAMPERSAND_RIGHT) - ) { /* don't copy $' part of string */ - U32 n = 0; - max = -1; - /* calculate the right-most part of the string covered - * by a capture. Due to look-ahead, this may be to - * the right of $&, so we have to scan all captures */ - while (n <= prog->lastparen) { - if (prog->offs[n].end > max) - max = prog->offs[n].end; - n++; - } - if (max == -1) - max = (PL_sawampersand & SAWAMPERSAND_LEFT) - ? prog->offs[0].start - : 0; - assert(max >= 0 && max <= reginfo->strend - strbeg); - } - - if ( (flags & REXEC_COPY_SKIP_PRE) - && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */ - && !(PL_sawampersand & SAWAMPERSAND_LEFT) - ) { /* don't copy $` part of string */ - U32 n = 0; - min = max; - /* calculate the left-most part of the string covered - * by a capture. Due to look-behind, this may be to - * the left of $&, so we have to scan all captures */ - while (min && n <= prog->lastparen) { - if ( prog->offs[n].start != -1 - && prog->offs[n].start < min) - { - min = prog->offs[n].start; - } - n++; - } - if ((PL_sawampersand & SAWAMPERSAND_RIGHT) - && min > prog->offs[0].end - ) - min = prog->offs[0].end; - - } - - assert(min >= 0 && min <= max - && min <= reginfo->strend - strbeg); - sublen = max - min; - - if (RX_MATCH_COPIED(rx)) { - if (sublen > prog->sublen) - prog->subbeg = - (char*)saferealloc(prog->subbeg, sublen+1); - } - else - prog->subbeg = (char*)safemalloc(sublen+1); - Copy(strbeg + min, prog->subbeg, sublen, char); - prog->subbeg[sublen] = '\0'; - prog->suboffset = min; - prog->sublen = sublen; - RX_MATCH_COPIED_on(rx); - } - prog->subcoffset = prog->suboffset; - if (prog->suboffset && utf8_target) { - /* Convert byte offset to chars. - * XXX ideally should only compute this if @-/@+ - * has been seen, a la PL_sawampersand ??? */ - - /* If there's a direct correspondence between the - * string which we're matching and the original SV, - * then we can use the utf8 len cache associated with - * the SV. In particular, it means that under //g, - * sv_pos_b2u() will use the previously cached - * position to speed up working out the new length of - * subcoffset, rather than counting from the start of - * the string each time. This stops - * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g; - * from going quadratic */ - if (SvPOKp(sv) && SvPVX(sv) == strbeg) - sv_pos_b2u(sv, &(prog->subcoffset)); - else - prog->subcoffset = utf8_length((U8*)strbeg, - (U8*)(strbeg+prog->suboffset)); - } - } - else { - RX_MATCH_COPY_FREE(rx); - prog->subbeg = strbeg; - prog->suboffset = 0; - prog->subcoffset = 0; - /* use reginfo->strend, as strend may have been modified */ - prog->sublen = reginfo->strend - strbeg; - } - } + if ( !(flags & REXEC_NOT_FIRST) ) + Perl_reg_set_capture_string(aTHX_ rx, + strbeg, reginfo->strend, + sv, flags, utf8_target); return 1; |