From acababb42be12ff2986b73c1bfa963b70bb5d54e Mon Sep 17 00:00:00 2001 From: Yves Orton Date: Mon, 9 Jan 2023 22:34:13 +0100 Subject: regexec.c - teach BRANCH and BRANCHJ nodes to reset capture buffers In /((a)(b)|(a))+/ we should not end up with $2 and $4 being set at the same time. When a branch fails it should reset any capture buffers that might be touched by its branch. We change BRANCH and BRANCHJ to store the number of parens before the branch, and the number of parens after the branch was completed. When a BRANCH operation fails, we clear the buffers it contains before we continue on. It is a bit more complex than it should be because we have BRANCHJ and BRANCH. (One of these days we should merge them together.) This is also made somewhat more complex because TRIE nodes are actually branches, and may need to track capture buffers also, at two levels. The overall TRIE op, and for jump tries especially where we emulate the behavior of branches. So we have to do the same clearing logic if a trie branch fails as well. --- regexec.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 51 insertions(+), 4 deletions(-) (limited to 'regexec.c') diff --git a/regexec.c b/regexec.c index 6d7f52cb33..b112e077d5 100644 --- a/regexec.c +++ b/regexec.c @@ -350,6 +350,29 @@ S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH) rex->lastparen = n; \ rex->lastcloseparen = lcp; +#define CAPTURE_CLEAR(from_ix, to_ix, str) \ +STMT_START { \ + U16 my_ix; \ + if (from_ix) { \ + for ( my_ix = from_ix; my_ix <= to_ix; my_ix++ ) { \ + DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ \ + "CAPTURE_CLEAR %s \\%" IVdf ": " \ + "%" IVdf "(%" IVdf ") .. %" IVdf \ + " => " \ + "%" IVdf "(%" IVdf ") .. %" IVdf \ + "\n", \ + depth, str, (IV)my_ix, \ + (IV)rex->offs[my_ix].start, \ + (IV)rex->offs[my_ix].start_tmp, \ + (IV)rex->offs[my_ix].end, \ + (IV)-1, (IV)-1, (IV)-1)); \ + rex->offs[my_ix].start = -1; \ + rex->offs[my_ix].start_tmp = -1; \ + rex->offs[my_ix].end = -1; \ + } \ + } \ +} STMT_END + STATIC void S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p _pDEPTH) @@ -6640,6 +6663,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) /* what trie are we using right now */ reg_trie_data * const trie = (reg_trie_data*)rexi->data->data[ ARG( scan ) ]; + ST.before_paren = trie->before_paren; + ST.after_paren = trie->after_paren; + assert(ST.before_paren<=rex->nparens); + assert(ST.after_paren<=rex->nparens); + HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]); U32 state = trie->startstate; @@ -6689,6 +6717,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) U32 accepted = 0; /* have we seen any accepting states? */ ST.jump = trie->jump; + ST.j_before_paren = trie->j_before_paren; + ST.j_after_paren= trie->j_after_paren; ST.me = scan; ST.firstpos = NULL; ST.longfold = FALSE; /* char longer if folded => it's harder */ @@ -6800,6 +6830,10 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) * rest of the branch */ REGCP_UNWIND(ST.cp); UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); + if (ST.after_paren) { + assert(ST.before_paren<=rex->nparens && ST.after_paren<=rex->nparens); + CAPTURE_CLEAR(ST.before_paren+1, ST.after_paren,"TRIE_next_fail"); + } } if (!--ST.accepted) { DEBUG_EXECUTE_r({ @@ -6889,10 +6923,16 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) uc += chars; } } + if (ST.jump && ST.jump[ST.nextword]) { + scan = ST.me + ST.jump[ST.nextword]; + ST.before_paren = ST.j_before_paren[ST.nextword]; + assert(ST.before_paren <= rex->nparens); + ST.after_paren = ST.j_after_paren[ST.nextword]; + assert(ST.after_paren <= rex->nparens); + } else { + scan = ST.me + NEXT_OFF(ST.me); + } - scan = ST.me + ((ST.jump && ST.jump[ST.nextword]) - ? ST.jump[ST.nextword] - : NEXT_OFF(ST.me)); DEBUG_EXECUTE_r({ Perl_re_exec_indentf( aTHX_ "%sTRIE matched word #%d, continuing%s\n", @@ -8957,9 +8997,15 @@ NULL next = scan + ARG(scan); if (next == scan) next = NULL; - /* FALLTHROUGH */ + ST.before_paren = ARG2La(scan); + ST.after_paren = ARG2Lb(scan); + goto branch_logic; + NOT_REACHED; /* NOTREACHED */ case BRANCH: /* /(...|A|...)/ */ + ST.before_paren = ARGa(scan); + ST.after_paren = ARGb(scan); + branch_logic: scan = REGNODE_AFTER_opcode(scan,state_num); /* scan now points to inner node */ assert(scan); ST.lastparen = rex->lastparen; @@ -9004,6 +9050,7 @@ NULL } REGCP_UNWIND(ST.cp); UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); + CAPTURE_CLEAR(ST.before_paren+1,ST.after_paren,"BRANCH_next_fail"); scan = ST.next_branch; /* no more branches? */ if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) { -- cgit v1.2.1