diff options
-rw-r--r-- | embed.fnc | 8 | ||||
-rw-r--r-- | embed.h | 1 | ||||
-rw-r--r-- | mg.c | 52 | ||||
-rw-r--r-- | proto.h | 9 | ||||
-rw-r--r-- | regcomp.c | 172 | ||||
-rw-r--r-- | regcomp_debug.c | 28 | ||||
-rw-r--r-- | regcomp_internal.h | 48 | ||||
-rw-r--r-- | regexec.c | 101 | ||||
-rw-r--r-- | regexp.h | 21 | ||||
-rw-r--r-- | t/re/pat_advanced.t | 40 | ||||
-rw-r--r-- | t/re/re_tests | 15 |
11 files changed, 413 insertions, 82 deletions
@@ -2811,8 +2811,14 @@ Cp |SV * |reg_named_buff_all \ |const U32 flags : FIXME - is anything in re using this now? +EXp |void |reg_numbered_buff_fetch_flags \ + |NN REGEXP * const re \ + |const I32 paren \ + |NULLOK SV * const sv \ + |U32 flags +: FIXME - is anything in re using this now? EXp |void |reg_numbered_buff_fetch \ - |NN REGEXP * const rx \ + |NN REGEXP * const re \ |const I32 paren \ |NULLOK SV * const sv : FIXME - is anything in re using this now? @@ -1688,6 +1688,7 @@ # define reg_named_buff(a,b,c,d) Perl_reg_named_buff(aTHX_ a,b,c,d) # define reg_named_buff_iter(a,b,c) Perl_reg_named_buff_iter(aTHX_ a,b,c) # define reg_numbered_buff_fetch(a,b,c) Perl_reg_numbered_buff_fetch(aTHX_ a,b,c) +# define reg_numbered_buff_fetch_flags(a,b,c,d) Perl_reg_numbered_buff_fetch_flags(aTHX_ a,b,c,d) # define reg_numbered_buff_length(a,b,c) Perl_reg_numbered_buff_length(aTHX_ a,b,c) # define reg_numbered_buff_store(a,b,c) Perl_reg_numbered_buff_store(aTHX_ a,b,c) # define reg_qr_package(a) Perl_reg_qr_package(aTHX_ a) @@ -638,13 +638,15 @@ Perl_magic_regdata_cnt(pTHX_ SV *sv, MAGIC *mg) const SSize_t n = (SSize_t)mg->mg_obj; if (n == '+') { /* @+ */ /* return the number possible */ - return RX_NPARENS(rx); + return RX_LOGICAL_NPARENS(rx); } else { /* @- @^CAPTURE @{^CAPTURE} */ I32 paren = RX_LASTPAREN(rx); /* return the last filled */ while ( paren >= 0 && !RX_OFFS_VALID(rx,paren) ) paren--; + if (paren && RX_PARNO_TO_LOGICAL(rx)) + paren = RX_PARNO_TO_LOGICAL(rx)[paren]; if (n == '-') { /* @- */ return (U32)paren; @@ -665,21 +667,28 @@ int Perl_magic_regdatum_get(pTHX_ SV *sv, MAGIC *mg) { PERL_ARGS_ASSERT_MAGIC_REGDATUM_GET; - - if (PL_curpm) { - REGEXP * const rx = PM_GETRE(PL_curpm); - if (rx) { - const SSize_t n = (SSize_t)mg->mg_obj; - /* @{^CAPTURE} does not contain $&, so we need to increment by 1 */ - const I32 paren = mg->mg_len - + (n == '\003' ? 
1 : 0); - SSize_t s; - SSize_t t; - if (paren < 0) - return 0; - if (paren <= (I32)RX_NPARENS(rx) && - ((s = RX_OFFS_START(rx,paren)) != -1) && - ((t = RX_OFFS_END(rx,paren)) != -1)) + REGEXP * const rx = PL_curpm ? PM_GETRE(PL_curpm) : NULL; + + if (rx) { + const SSize_t n = (SSize_t)mg->mg_obj; + /* @{^CAPTURE} does not contain $&, so we need to increment by 1 */ + const I32 paren = mg->mg_len + + (n == '\003' ? 1 : 0); + SSize_t s; + SSize_t t; + if (paren < 0) + return 0; + if (n != '+' && n != '-') { + CALLREG_NUMBUF_FETCH(rx,paren,sv); + return 0; + } + if (paren <= (I32)RX_LOGICAL_NPARENS(rx)) { + I32 true_paren = RX_LOGICAL_TO_PARNO(rx) + ? RX_LOGICAL_TO_PARNO(rx)[paren] + : paren; + do { + if (((s = RX_OFFS_START(rx,true_paren)) != -1) && + ((t = RX_OFFS_END(rx,true_paren)) != -1)) { SSize_t i; @@ -687,10 +696,6 @@ Perl_magic_regdatum_get(pTHX_ SV *sv, MAGIC *mg) i = t; else if (n == '-') /* @- */ i = s; - else { /* @^CAPTURE @{^CAPTURE} */ - CALLREG_NUMBUF_FETCH(rx,paren,sv); - return 0; - } if (RX_MATCH_UTF8(rx)) { const char * const b = RX_SUBBEG(rx); @@ -703,6 +708,11 @@ Perl_magic_regdatum_get(pTHX_ SV *sv, MAGIC *mg) sv_setuv(sv, i); return 0; } + if (RX_PARNO_TO_LOGICAL_NEXT(rx)) + true_paren = RX_PARNO_TO_LOGICAL_NEXT(rx)[true_paren]; + else + break; + } while (true_paren); } } sv_set_undef(sv); @@ -1095,6 +1105,8 @@ Perl_magic_get(pTHX_ SV *sv, MAGIC *mg) case '\016': /* ^N */ if (PL_curpm && (rx = PM_GETRE(PL_curpm))) { paren = RX_LASTCLOSEPAREN(rx); + if (RX_PARNO_TO_LOGICAL(rx)) + paren = RX_PARNO_TO_LOGICAL(rx)[paren]; if (paren) goto do_numbuf_fetch; } @@ -3722,9 +3722,14 @@ Perl_reg_named_buff_scalar(pTHX_ REGEXP * const rx, const U32 flags); assert(rx) PERL_CALLCONV void -Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const rx, const I32 paren, SV * const sv); +Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const re, const I32 paren, SV * const sv); #define PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH \ - assert(rx) + assert(re) + +PERL_CALLCONV void 
+Perl_reg_numbered_buff_fetch_flags(pTHX_ REGEXP * const re, const I32 paren, SV * const sv, U32 flags); +#define PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH_FLAGS \ + assert(re) PERL_CALLCONV I32 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const rx, const SV * const sv, const I32 paren); @@ -1427,7 +1427,10 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_use_BRANCHJ = 0; RExC_warned_WARN_EXPERIMENTAL__VLB = 0; RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS = 0; + RExC_logical_total_parens = 0; RExC_total_parens = 0; + RExC_logical_to_parno = NULL; + RExC_parno_to_logical = NULL; RExC_open_parens = NULL; RExC_close_parens = NULL; RExC_paren_names = NULL; @@ -1612,6 +1615,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_naughty = 0; RExC_npar = 1; + RExC_logical_npar = 1; RExC_parens_buf_size = 0; RExC_emit_start = RExC_rxi->program; pRExC_state->code_index = 0; @@ -1630,6 +1634,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, /* We have that number in RExC_npar */ RExC_total_parens = RExC_npar; + RExC_logical_total_parens = RExC_logical_npar; } else if (! MUST_RESTART(flags)) { ReREFCNT_dec(Rx); @@ -1674,6 +1679,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, Renew(RExC_close_parens, RExC_total_parens, regnode_offset); Zero(RExC_close_parens, RExC_total_parens, regnode_offset); + /* we do NOT reinitialize RExC_logical_to_parno and + * RExC_parno_to_logical here. We need their data on the second + * pass */ } else { /* Parse did not complete. 
Reinitialize the parentheses structures */ @@ -1686,6 +1694,14 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, Safefree(RExC_close_parens); RExC_close_parens = NULL; } + if (RExC_logical_to_parno) { + Safefree(RExC_logical_to_parno); + RExC_logical_to_parno = NULL; + } + if (RExC_parno_to_logical) { + Safefree(RExC_parno_to_logical); + RExC_parno_to_logical = NULL; + } } /* Clean up what we did in this parse */ @@ -1702,6 +1718,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, set_regex_pv(pRExC_state, Rx); RExC_rx->nparens = RExC_total_parens - 1; + RExC_rx->logical_nparens = RExC_logical_total_parens - 1; /* Uses the upper 4 bits of the FLAGS field, so keep within that size */ if (RExC_whilem_seen > 15) @@ -2245,6 +2262,31 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, assert(scan && OP(scan) == GOSUB); ARG2L_SET( scan, RExC_open_parens[ARG(scan)] - REGNODE_OFFSET(scan)); } + if (RExC_logical_total_parens != RExC_total_parens) { + Newxz(RExC_parno_to_logical_next, RExC_total_parens, I32); + /* we rebuild this below */ + Zero(RExC_logical_to_parno, RExC_total_parens, I32); + for( int parno = RExC_total_parens-1 ; parno > 0 ; parno-- ) { + int logical_parno= RExC_parno_to_logical[parno]; + assert(logical_parno); + RExC_parno_to_logical_next[parno]= RExC_logical_to_parno[logical_parno]; + RExC_logical_to_parno[logical_parno] = parno; + } + if (0) + for( int parno = 1; parno < RExC_total_parens ; parno++ ) + PerlIO_printf(Perl_debug_log,"%d -> %d -> %d\n", + parno, RExC_parno_to_logical[parno], RExC_parno_to_logical_next[parno]); + RExC_rx->logical_to_parno = RExC_logical_to_parno; + RExC_rx->parno_to_logical = RExC_parno_to_logical; + RExC_rx->parno_to_logical_next = RExC_parno_to_logical_next; + RExC_logical_to_parno = NULL; + RExC_parno_to_logical = NULL; + RExC_parno_to_logical_next = NULL; + } else { + RExC_rx->logical_to_parno = NULL; + RExC_rx->parno_to_logical = NULL; + RExC_rx->parno_to_logical_next = NULL; 
+ } Newxz(RExC_rx->offs, RExC_total_parens, regexp_paren_pair); /* assume we don't need to swap parens around before we match */ @@ -2266,6 +2308,14 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, Safefree(RExC_close_parens); RExC_close_parens = NULL; } + if (RExC_logical_to_parno) { + Safefree(RExC_logical_to_parno); + RExC_logical_to_parno = NULL; + } + if (RExC_parno_to_logical) { + Safefree(RExC_parno_to_logical); + RExC_parno_to_logical = NULL; + } #ifdef USE_ITHREADS /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated @@ -2883,6 +2933,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) regnode_offset br; regnode_offset lastbr; regnode_offset ender = 0; + I32 logical_parno = 0; I32 parno = 0; I32 flags; U32 oregflags = RExC_flags; @@ -3419,7 +3470,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) /* branch reset, behave like a (?:...) except that buffers in alternations share the same numbers */ paren = ':'; - after_freeze = freeze_paren = RExC_npar; + after_freeze = freeze_paren = RExC_logical_npar; /* XXX This construct currently requires an extra pass. 
* Investigation would be required to see if that could be @@ -3508,7 +3559,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) if (*RExC_parse!=')') vFAIL("Expecting close bracket"); - gen_recurse_regop: if (paren == '-' || paren == '+') { /* Don't overflow */ @@ -3545,7 +3595,26 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) vFAIL(non_existent_group_msg); } } + else + if (num && num < RExC_logical_npar) { + num = RExC_logical_to_parno[num]; + } + else + if (ALL_PARENS_COUNTED) { + if (num < RExC_logical_total_parens) { + num = RExC_logical_to_parno[num]; + } + else { + RExC_parse_inc_by(1); + vFAIL(non_existent_group_msg); + } + } + else { + REQUIRE_PARENS_PASS; + } + + gen_recurse_regop: if (num >= RExC_npar) { /* It might be a forward reference; we can't fail until we @@ -3906,6 +3975,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) capturing_parens: parno = RExC_npar; RExC_npar++; + logical_parno = RExC_logical_npar; + RExC_logical_npar++; if (! 
ALL_PARENS_COUNTED) { /* If we are in our first pass through (and maybe only pass), * we need to allocate memory for the capturing parentheses @@ -3932,6 +4003,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) /* we don't know where end op starts yet, so we don't need to * set RExC_close_parens[0] like we do RExC_open_parens[0] * above */ + + Newxz(RExC_logical_to_parno, RExC_parens_buf_size, I32); + Newxz(RExC_parno_to_logical, RExC_parens_buf_size, I32); } else if (RExC_npar > RExC_parens_buf_size) { I32 old_size = RExC_parens_buf_size; @@ -3947,6 +4021,14 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) regnode_offset); Zero(RExC_close_parens + old_size, RExC_parens_buf_size - old_size, regnode_offset); + + Renew(RExC_logical_to_parno, RExC_parens_buf_size, I32); + Zero(RExC_logical_to_parno + old_size, + RExC_parens_buf_size - old_size, I32); + + Renew(RExC_parno_to_logical, RExC_parens_buf_size, I32); + Zero(RExC_parno_to_logical + old_size, + RExC_parens_buf_size - old_size, I32); } } @@ -3961,7 +4043,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) (IV)parno, ret)); RExC_open_parens[parno]= ret; } - + if (RExC_parno_to_logical) { + RExC_parno_to_logical[parno] = logical_parno; + if (RExC_logical_to_parno && !RExC_logical_to_parno[logical_parno]) + RExC_logical_to_parno[logical_parno] = parno; + } is_open = 1; } else { /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) 
*/ @@ -4018,9 +4104,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) } nextchar(pRExC_state); if (freeze_paren) { - if (RExC_npar > after_freeze) - after_freeze = RExC_npar; - RExC_npar = freeze_paren; + if (RExC_logical_npar > after_freeze) + after_freeze = RExC_logical_npar; + RExC_logical_npar = freeze_paren; } br = regbranch(pRExC_state, &flags, 0, depth+1); @@ -4221,8 +4307,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) NOT_REACHED; /* NOTREACHED */ } - if (after_freeze > RExC_npar) - RExC_npar = after_freeze; + if (after_freeze > RExC_logical_npar) + RExC_logical_npar = after_freeze; RExC_in_lookaround = was_in_lookaround; @@ -5787,6 +5873,21 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) if (num < 1) vFAIL("Reference to nonexistent or unclosed group"); } + else + if (num < RExC_logical_npar) { + num = RExC_logical_to_parno[num]; + } + else + if (ALL_PARENS_COUNTED) { + if (num < RExC_logical_total_parens) + num = RExC_logical_to_parno[num]; + else { + num = -1; + } + } + else{ + REQUIRE_PARENS_PASS; + } } else { num = S_backref_value(RExC_parse, RExC_end); @@ -5800,7 +5901,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) if ( /* any numeric escape < 10 is always a backref */ num > 9 /* any numeric escape < RExC_npar is a backref */ - && num >= RExC_npar + && num >= RExC_logical_npar /* cannot be an octal escape if it starts with [89] * */ && ! 
inRANGE(*RExC_parse, '8', '9') @@ -5812,6 +5913,19 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) RExC_parse_set(atom_parse_start); goto defchar; } + if (num < RExC_logical_npar) { + num = RExC_logical_to_parno[num]; + } + else + if (ALL_PARENS_COUNTED) { + if (num < RExC_logical_total_parens) { + num = RExC_logical_to_parno[num]; + } else { + num = -1; + } + } else { + REQUIRE_PARENS_PASS; + } } /* At this point RExC_parse points at a numeric escape like @@ -5828,9 +5942,10 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) else while (isDIGIT(*RExC_parse)) { RExC_parse_inc_by(1); } + if (num < 0) + vFAIL("Reference to nonexistent group"); if (num >= (I32)RExC_npar) { - /* It might be a forward reference; we can't fail until we * know, by completing the parse to get all the groups, and * then reparsing */ @@ -12946,6 +13061,12 @@ Perl_pregfree2(pTHX_ REGEXP *rx) SvREFCNT_dec(r->saved_copy); #endif Safefree(r->offs); + if (r->logical_to_parno) { + Safefree(r->logical_to_parno); + Safefree(r->parno_to_logical); + Safefree(r->parno_to_logical_next); + } + SvREFCNT_dec(r->qr_anoncv); if (r->recurse_locinput) Safefree(r->recurse_locinput); @@ -13043,6 +13164,7 @@ Perl_reg_temp_copy(pTHX_ REGEXP *dsv, REGEXP *ssv) */ memcpy(&(drx->xpv_cur), &(srx->xpv_cur), sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur)); + if (!islv) SvLEN_set(dsv, 0); if (srx->offs) { @@ -13063,6 +13185,23 @@ Perl_reg_temp_copy(pTHX_ REGEXP *dsv, REGEXP *ssv) /* check_substr and check_utf8, if non-NULL, point to either their anchored or float namesakes, and don't hold a second reference. 
*/ } + if (srx->logical_to_parno) { + NewCopy(srx->logical_to_parno, + drx->logical_to_parno, + srx->nparens, I32); + NewCopy(srx->parno_to_logical, + drx->parno_to_logical, + srx->nparens, I32); + NewCopy(srx->parno_to_logical_next, + drx->parno_to_logical_next, + srx->nparens, I32); + } else { + drx->logical_to_parno = NULL; + drx->parno_to_logical = NULL; + drx->parno_to_logical_next = NULL; + } + drx->logical_nparens = srx->logical_nparens; + RX_MATCH_COPIED_off(dsv); #ifdef PERL_ANY_COW drx->saved_copy = NULL; @@ -13296,6 +13435,19 @@ Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param) ret->saved_copy = NULL; #endif + if (r->logical_to_parno) { + /* we use total_parens for all three just for symmetry */ + ret->logical_to_parno = (I32*)SAVEPVN((char*)(r->logical_to_parno), r->nparens * sizeof(I32)); + ret->parno_to_logical = (I32*)SAVEPVN((char*)(r->parno_to_logical), r->nparens * sizeof(I32)); + ret->parno_to_logical_next = (I32*)SAVEPVN((char*)(r->parno_to_logical_next), r->nparens * sizeof(I32)); + } else { + ret->logical_to_parno = NULL; + ret->parno_to_logical = NULL; + ret->parno_to_logical_next = NULL; + } + + ret->logical_nparens = r->logical_nparens; + /* Whether mother_re be set or no, we need to copy the string. We cannot refrain from copying it when the storage points directly to our mother regexp, because that's diff --git a/regcomp_debug.c b/regcomp_debug.c index 0b2cc06170..74f080a9d9 100644 --- a/regcomp_debug.c +++ b/regcomp_debug.c @@ -478,7 +478,6 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ U32 parno= (op == ACCEPT) ? (U32)ARG2L(o) : (op == OPEN || op == CLOSE) ? 
(U32)PARNO(o) : (U32)ARG(o); - Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)parno); /* Parenth number */ if ( RXp_PAREN_NAMES(prog) ) { name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]); } else if ( pRExC_state ) { @@ -486,6 +485,14 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ } if ( name_list ) { if ( k != REF || (op < REFN)) { + UV logical_parno = parno; + if (prog->parno_to_logical) + logical_parno = prog->parno_to_logical[parno]; + + Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)logical_parno); /* Parenth number */ + if (parno != logical_parno) + Perl_sv_catpvf(aTHX_ sv, "/%" UVuf, (UV)parno); /* Parenth number */ + SV **name= av_fetch_simple(name_list, parno, 0 ); if (name) Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name)); @@ -511,6 +518,15 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name)); } } + } else if (parno>0) { + UV logical_parno = parno; + if (prog->parno_to_logical) + logical_parno = prog->parno_to_logical[parno]; + + Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)logical_parno); /* Parenth number */ + if (logical_parno != parno) + Perl_sv_catpvf(aTHX_ sv, "/%" UVuf, (UV)parno); /* Parenth number */ + } if ( k == REF && reginfo) { U32 n = ARG(o); /* which paren pair */ @@ -528,6 +544,10 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ } } else if (k == GOSUB) { AV *name_list= NULL; + IV parno = ARG(o); + IV logical_parno = (parno && prog->parno_to_logical) + ? 
prog->parno_to_logical[parno] + : parno; if ( RXp_PAREN_NAMES(prog) ) { name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]); } else if ( pRExC_state ) { @@ -535,7 +555,11 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ } /* Paren and offset */ - Perl_sv_catpvf(aTHX_ sv, "%d[%+d:%d]", (int)ARG(o),(int)ARG2L(o), + Perl_sv_catpvf(aTHX_ sv, "%" IVdf, logical_parno); + if (logical_parno != parno) + Perl_sv_catpvf(aTHX_ sv, "/%" IVdf, parno); + + Perl_sv_catpvf(aTHX_ sv, "[%+d:%d]", (int)ARG2L(o), (int)((o + (int)ARG2L(o)) - progi->program) ); if (name_list) { SV **name= av_fetch_simple(name_list, ARG(o), 0 ); diff --git a/regcomp_internal.h b/regcomp_internal.h index f1b81625a0..a895452511 100644 --- a/regcomp_internal.h +++ b/regcomp_internal.h @@ -66,15 +66,57 @@ struct RExC_state_t { * independent warning is raised for any given spot */ Size_t latest_warn_offset; + /* Branch reset /(?|...|...)/ gives us two concepts of capture buffer id. + * "Logical Parno" is the user visible view with branch reset taken into + * account. "Parno" (or physical parno) is the actual capture buffers in + * the pattern *NOT* taking into account branch reset. We also maintain + * a map of "next" pointers which allow us to skip to the next physical + * capture buffer with the same logical id, with 0 representing "none". + * + * As we compile we keep track of the two different counts using the + * 'logical_npar' and 'npar' members, and we keep track of the upper bound + * of both in 'total_par' and 'logical_total_par', we also populate + * the 'logical_to_parno' map, which gives us the first physical parno + * for a given logical parno, and the `parno_to_logical` array which gives + * us the logical id for each physical parno. When compilation is + * completed we construct the 'parno_to_logical_next' array from the + * 'parno_to_logical' array. 
(We do not bother constructing it during + * compilation as we do not need it, and we can construct it in O(N) time + * once we are done, but would need more complicated logic during the + * compile, because we want the next pointers to go from smallest to + * largest, e.g., left to right.) + * + * Logical: $1 $2 $3 $4 $2 $3 $2 $5 + * Physical: 1 2 3 4 5 6 7 8 + * Next: 0 5 6 0 7 0 0 0 + * Pattern /(a) (?| (b) (c) (d) | (e) (f) | (g) ) (h)/ + * + * As much as possible the internals use and store the physical id of + * capture buffers. We decode the physical to the logical only when + * we need to, for instance when someone uses $2. + * + * Note that when branch reset is not used logical and physical are the + * same and the next data would be all zero. So when branch reset is not + * used we do not need to populate this data into the final regexp. + * + */ + I32 *logical_to_parno; /* logical_parno to parno */ + I32 *parno_to_logical; /* parno to logical_parno */ + I32 *parno_to_logical_next; /* parno to next (greater value) + parno with the same + logical_parno as parno.*/ + I32 npar; /* Capture buffer count so far in the parse, (OPEN) plus one. ("par" 0 is the whole pattern)*/ + I32 logical_npar; /* Logical version of npar */ I32 total_par; /* During initial parse, is either 0, or -1; the latter indicating a reparse is needed. After that pass, it is what 'npar' became after the pass. 
Hence, it being > 0 indicates we are in a reparse situation */ + I32 logical_total_par; /* Logical version to total par */ I32 nestroot; /* root parens we are in - used by accept */ I32 seen_zerolen; @@ -157,6 +199,11 @@ struct RExC_state_t { #define RExC_seen (pRExC_state->seen) #define RExC_size (pRExC_state->size) #define RExC_maxlen (pRExC_state->maxlen) +#define RExC_logical_npar (pRExC_state->logical_npar) +#define RExC_logical_total_parens (pRExC_state->logical_total_par) +#define RExC_logical_to_parno (pRExC_state->logical_to_parno) +#define RExC_parno_to_logical (pRExC_state->parno_to_logical) +#define RExC_parno_to_logical_next (pRExC_state->parno_to_logical_next) #define RExC_npar (pRExC_state->npar) #define RExC_total_parens (pRExC_state->total_par) #define RExC_parens_buf_size (pRExC_state->parens_buf_size) @@ -1194,4 +1241,5 @@ static const scan_data_t zero_scan_data = { CLEAR_OPTSTART; \ node = dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1); + #endif /* REGCOMP_INTERNAL_H */ @@ -7920,6 +7920,18 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) do_ref: type = OP(scan); n = ARG(scan); /* which paren pair */ + if (rex->logical_to_parno) { + n = rex->logical_to_parno[n]; + do { + if (rex->lastparen < n || rex->offs[n].start == -1 || rex->offs[n].end == -1) { + n = rex->parno_to_logical_next[n]; + } + else { + break; + } + } while(n); + if (!n) sayNO; + } do_nref_ref_common: ln = rex->offs[n].start; @@ -11818,7 +11830,7 @@ Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv, && RXp_OFFS_VALID(rx,nums[i])) { ret = newSVpvs(""); - CALLREG_NUMBUF_FETCH(r, nums[i], ret); + Perl_reg_numbered_buff_fetch_flags(aTHX_ r, nums[i], ret, REG_FETCH_ABSOLUTE); if (!retarray) return ret; } else { @@ -11971,16 +11983,24 @@ Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags) } void -Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, +Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const re, const 
I32 paren, SV * const sv) { - struct regexp *const rx = ReANY(r); + PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH; + Perl_reg_numbered_buff_fetch_flags(aTHX_ re, paren, sv, 0); +} + +void +Perl_reg_numbered_buff_fetch_flags(pTHX_ REGEXP * const re, const I32 paren, + SV * const sv, U32 flags) +{ + struct regexp *const rx = ReANY(re); char *s = NULL; SSize_t i,t = 0; SSize_t s1, t1; I32 n = paren; - PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH; + PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH_FLAGS; if ( n == RX_BUFF_IDX_CARET_PREMATCH || n == RX_BUFF_IDX_CARET_FULLMATCH @@ -11993,7 +12013,7 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, * $r = qr/.../; * /$qr/p; * the KEEPCOPY is set on the PMOP rather than the regex */ - if (PL_curpm && r == PM_GETRE(PL_curpm)) + if (PL_curpm && re == PM_GETRE(PL_curpm)) keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY); } if (!keepcopy) @@ -12021,18 +12041,32 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, s = rx->subbeg - rx->suboffset + t; i = rx->sublen + rx->suboffset - t; } - else - if (inRANGE(n, 0, (I32)rx->nparens) && - ((s1 = RXp_OFFS_START(rx,n)) != -1 && - (t1 = RXp_OFFS_END(rx,n)) != -1)) - { - /* $&, ${^MATCH}, $1 ... */ - i = t1 - s1; - s = rx->subbeg + s1 - rx->suboffset; + else /* when flags is true we do an absolute lookup, and compare against rx->nparens */ + if (inRANGE(n, 0, flags ? (I32)rx->nparens : (I32)rx->logical_nparens)) { + I32 *map = (!flags && n) ? rx->logical_to_parno : NULL; + I32 true_parno = map ? map[n] : n; + do { + if (((s1 = RXp_OFFS_START(rx,true_parno)) != -1) && + ((t1 = RXp_OFFS_END(rx,true_parno)) != -1)) + { + /* $&, ${^MATCH}, $1 ... 
*/ + i = t1 - s1; + s = rx->subbeg + s1 - rx->suboffset; + goto found_it; + } + else if (map) { + true_parno = rx->parno_to_logical_next[true_parno]; + } + else { + break; + } + } while (true_parno); + goto ret_undef; } else { goto ret_undef; } + found_it: assert(s >= rx->subbeg); assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) ); if (i >= 0) { @@ -12120,7 +12154,7 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, switch (paren) { case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */ case RX_BUFF_IDX_PREMATCH: /* $` */ - if ( (i= RXp_OFFS_START(rx,0)) != -1) { + if ( (i = RXp_OFFS_START(rx,0)) != -1) { if (i > 0) { s1 = 0; t1 = i; @@ -12131,29 +12165,38 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */ case RX_BUFF_IDX_POSTMATCH: /* $' */ - if ( (i = RXp_OFFS_END(rx,0)) != -1) { + if ( (i = RXp_OFFS_END(rx,0)) != -1 ) { i = rx->sublen - i; if (i > 0) { - s1 = RXp_OFFS_END(rx,0); - t1 = rx->sublen; - goto getlen; + s1 = rx->offs[0].end; + t1 = rx->sublen; + goto getlen; } } return 0; default: /* $& / ${^MATCH}, $1, $2, ... */ - if (paren <= (I32)rx->nparens && - (s1 = RXp_OFFS_START(rx,paren)) != -1 && - (t1 = RXp_OFFS_END(rx,paren)) != -1) - { - i = t1 - s1; - goto getlen; - } else { - warn_undef: - if (ckWARN(WARN_UNINITIALIZED)) - report_uninit((const SV *)sv); - return 0; + if (paren <= (I32)rx->logical_nparens) { + I32 true_paren = rx->logical_to_parno + ? 
rx->logical_to_parno[paren] + : paren; + do { + if (((s1 = RXp_OFFS_START(rx,true_paren)) != -1) && + ((t1 = RXp_OFFS_END(rx,true_paren)) != -1)) + { + i = t1 - s1; + goto getlen; + } else if (rx->parno_to_logical_next) { + true_paren = rx->parno_to_logical_next[true_paren]; + } else { + break; + } + } while(true_paren); } + warn_undef: + if (ckWARN(WARN_UNINITIALIZED)) + report_uninit((const SV *)sv); + return 0; } getlen: if (i > 0 && RXp_MATCH_UTF8(rx)) { @@ -125,13 +125,22 @@ typedef struct regexp { * Information about the match that the perl core uses to manage things */ + /* see comment in regcomp_internal.h about branch reset to understand + the distinction between physical and logical capture buffers */ + U32 nparens; /* physical number of capture buffers */ + U32 logical_nparens; /* logical number of capture buffers */ + I32 *logical_to_parno; /* map logical parno to first physical */ + I32 *parno_to_logical; /* map every physical parno to logical */ + I32 *parno_to_logical_next; /* map every physical parno to the next + physical with the same logical id */ + U32 extflags; /* Flags used both externally and internally */ - U32 nparens; /* number of capture buffers */ SSize_t minlen; /* minimum possible number of chars in string to match */ SSize_t minlenret; /* minimum possible number of chars in $& */ STRLEN gofs; /* chars left of pos that we search from */ /* substring data about strings that must appear in * the final match, used for optimisations */ + struct reg_substr_data *substrs; /* private engine specific data */ @@ -149,6 +158,7 @@ typedef struct regexp { char **recurse_locinput; /* used to detect infinite recursion, XXX: move to internal */ U32 lastcloseparen; /* last close paren matched ($^N) */ + /*---------------------------------------------------------------------- */ /* offset from wrapped to the start of precomp */ @@ -551,6 +561,14 @@ and check for NULL. 
# define RX_SUBCOFFSET(rx_sv) (ReANY(rx_sv)->subcoffset) # define RXp_OFFSp(prog) (prog->offs) # define RX_OFFSp(rx_sv) (RXp_OFFSp(ReANY(rx_sv))) +# define RXp_LOGICAL_NPARENS(prog) (prog->logical_nparens) +# define RX_LOGICAL_NPARENS(rx_sv) (RXp_LOGICAL_NPARENS(ReANY(rx_sv))) +# define RXp_LOGICAL_TO_PARNO(prog) (prog->logical_to_parno) +# define RX_LOGICAL_TO_PARNO(rx_sv) (RXp_LOGICAL_TO_PARNO(ReANY(rx_sv))) +# define RXp_PARNO_TO_LOGICAL(prog) (prog->parno_to_logical) +# define RX_PARNO_TO_LOGICAL(rx_sv) (RXp_PARNO_TO_LOGICAL(ReANY(rx_sv))) +# define RXp_PARNO_TO_LOGICAL_NEXT(prog) (prog->parno_to_logical_next) +# define RX_PARNO_TO_LOGICAL_NEXT(rx_sv) (RXp_PARNO_TO_LOGICAL_NEXT(ReANY(rx_sv))) # define RXp_NPARENS(prog) (prog->nparens) # define RX_NPARENS(rx_sv) (RXp_NPARENS(ReANY(rx_sv))) # define RX_SUBLEN(rx_sv) (ReANY(rx_sv)->sublen) @@ -952,6 +970,7 @@ typedef struct regmatch_slab { } regmatch_slab; +#define REG_FETCH_ABSOLUTE 1 /* * ex: set ts=8 sts=4 sw=4 et: diff --git a/t/re/pat_advanced.t b/t/re/pat_advanced.t index 445854dfd7..d836525c15 100644 --- a/t/re/pat_advanced.t +++ b/t/re/pat_advanced.t @@ -1508,19 +1508,25 @@ sub run_tests { } { - my $res=""; + no warnings 'uninitialized'; + my $res = ""; if ('1' =~ /(?|(?<digit>1)|(?<digit>2))/) { $res = "@{$- {digit}}"; } - is($res, "1", - "Check that (?|...) doesnt cause dupe entries in the names array"); + is($res, "1 ", + "Check that repeated named captures in branch reset (?|...) work as expected"); + if ('2' =~ /(?|(?<digit>1)|(?<digit>2))/) { + $res = "@{$- {digit}}"; + } + is($res, " 2", + "Check that repeated named captures in branch reset (?|...) work as expected"); $res = ""; if ('11' =~ /(?|(?<digit>1)|(?<digit>2))(?&digit)/) { $res = "@{$- {digit}}"; } - is($res, "1", + is($res, "1 ", "Check that (?&..) to a buffer inside a (?|...) 
goes to the leftmost"); } @@ -2586,12 +2592,13 @@ Starting parse and generation | | atom <> | 8| tail~ OPEN1 'b' (4) -> REFN | | Setting close paren #1 to 8 - | 10| lsbr~ tying lastbr REFN0 (6) to ender CLOSE1 'b' (8) offset 2 - | | tail~ REFN0 (6) -> CLOSE + | 10| lsbr~ tying lastbr REFN (6) to ender CLOSE1 'b' (8) offset 2 + | | tail~ REFN (6) -> CLOSE Unmatched ( in regex; marked by <-- HERE in m/(?{a})( <-- HERE ?<b>\g{c}/ at - line 1. Freeing REx: "(?{a})(?<b>\g{c}" EOF_DEBUG_OUT - {}, "Github Issue #19350, assert fail in " + {rtrim_result=>1}, + "Github Issue #19350, assert fail in " . "Debug => 'ALL' from malformed qr// (heisenbug try $try)"); } { # Related to GH $19350 but segfaults instead of asserts, and does so reliably, not randomly. @@ -2612,8 +2619,8 @@ Starting parse and generation | | piec | | atom <)(?<c>x)(?&b)> | 5| tail~ OPEN1 'b' (1) -> REFN - | 7| lsbr~ tying lastbr REFN0 (3) to ender CLOSE1 'b' (5) offset 2 - | | tail~ REFN0 (3) -> CLOSE + | 7| lsbr~ tying lastbr REFN (3) to ender CLOSE1 'b' (5) offset 2 + | | tail~ REFN (3) -> CLOSE <(?<c>x)(?&b)> | | piec | | atom <?<c>x)(?&b)> | | reg @@ -2624,7 +2631,7 @@ Starting parse and generation | 13| lsbr~ tying lastbr EXACT <x> (9) to ender CLOSE2 'c' (11) offset 2 | | tail~ EXACT <x> (9) -> CLOSE <(?&b)> | | tail~ OPEN1 'b' (1) - | | ~ REFN0 (3) + | | ~ REFN (3) | | ~ CLOSE1 'b' (5) -> OPEN | | piec | | atom @@ -2634,7 +2641,7 @@ Starting parse and generation | | ~ CLOSE2 'c' (11) -> GOSUB | 17| lsbr~ tying lastbr OPEN1 'b' (1) to ender END (16) offset 15 | | tail~ OPEN1 'b' (1) - | | ~ REFN0 (3) + | | ~ REFN (3) | | ~ CLOSE1 'b' (5) | | ~ OPEN2 'c' (7) | | ~ EXACT <x> (9) @@ -2652,8 +2659,8 @@ Starting parse and generation | | piec | | atom <)(?<c>x)(?&b)> | 5| tail~ OPEN1 'b' (1) -> REFN - | 7| lsbr~ tying lastbr REFN12 'c' (3) to ender CLOSE1 'b' (5) offset 2 - | | tail~ REFN12 'c' (3) -> CLOSE + | 7| lsbr~ tying lastbr REFN2 'c' (3) to ender CLOSE1 'b' (5) offset 2 + | | tail~ REFN2 'c' (3) 
-> CLOSE <(?<c>x)(?&b)> | | piec | | atom <?<c>x)(?&b)> | | reg @@ -2664,7 +2671,7 @@ Starting parse and generation | 13| lsbr~ tying lastbr EXACT <x> (9) to ender CLOSE2 'c' (11) offset 2 | | tail~ EXACT <x> (9) -> CLOSE <(?&b)> | | tail~ OPEN1 'b' (1) - | | ~ REFN12 'c' (3) + | | ~ REFN2 'c' (3) | | ~ CLOSE1 'b' (5) -> OPEN | | piec | | atom @@ -2674,7 +2681,7 @@ Starting parse and generation | | ~ CLOSE2 'c' (11) -> GOSUB | 17| lsbr~ tying lastbr OPEN1 'b' (1) to ender END (16) offset 15 | | tail~ OPEN1 'b' (1) - | | ~ REFN12 'c' (3) + | | ~ REFN2 'c' (3) | | ~ CLOSE1 'b' (5) | | ~ OPEN2 'c' (7) | | ~ EXACT <x> (9) @@ -2684,7 +2691,8 @@ Required size 16 nodes first at 3 Freeing REx: "(?<b>\g{c})(?<c>x)(?&b)" EOF_DEBUG_OUT - {}, "Related to Github Issue #19350, forward \\g{x} pattern segv under use re Debug => 'PARSE'"); + {rtrim_result=>1}, + "Related to Github Issue #19350, forward \\g{x} pattern segv under use re Debug => 'PARSE'"); } { # GH 20009 diff --git a/t/re/re_tests b/t/re/re_tests index 1e79abbad7..e899a843e2 100644 --- a/t/re/re_tests +++ b/t/re/re_tests @@ -2112,7 +2112,20 @@ AB\s+\x{100} AB \x{100}X y - - \p{nv=-0} \x{660} y $& \x{660} (?:a|xx){0,4}?b aaaaab y $& aaaab # Bug is GH #8369; test is GH #19781 - +(?|(?<a>a)|(?<b>b))\1(?&a)(?&b) bbab y $& bbab # GH 20653 +(?|(?<a>a)|(?<b>b))(?(<a>)x|y)\1 byb y $& byb # GH 20653 +(?|(?<a>a)|(?<b>b))(?(<a>)x|y)\1 bxb n - - # GH 20653 +(?|(?<a>a)|(?<b>b))(?(<a>)x|y)\1 axa y $& axa # GH 20653 +(?|(?<a>a)|(?<b>b))(?(<a>)x|y)\1 aya n - - # GH 20653 +(?|(?<a>a)|(?<b>b)) a y $1-$+{a}-$+{b} a-a- # GH 20653 +(?|(?<a>a)|(?<b>b)) b y $1-$+{a}-$+{b} b--b # GH 20653 +(?<pre>pre)(?|(?<a>a)(?<b>b)(?<c>c)|(?<d>d)(?<e>e)|(?<f>f))(?<post>post) preabcpost y $2-$3-$4 a-b-c # GH 20653 +(?<pre>pre)(?|(?<a>a)(?<b>b)(?<c>c)|(?<d>d)(?<e>e)|(?<f>f))(?<post>post) predepost y $2-$3-$4 d-e- # GH 20653 +(?<pre>pre)(?|(?<a>a)(?<b>b)(?<c>c)|(?<d>d)(?<e>e)|(?<f>f))(?<post>post) prefpost y $2-$3-$4 f-- # GH 20653 
+(?<pre>pre)(?|(?<a>a)(?<b>b)(?<c>c)|(?<d>d)(?<e>e)|(?<f>f))(?<post>post) preabcpost y $+{a}-$+{b}-$+{c} a-b-c # GH 20653 +((?|(?<a>a)(?-1)|(?<b>b)(?-1)|(?<c>c)(?-1))) aa y $1 aa # GH 20653 +((?|(?<a>a)(?-1)|(?<b>b)(?-1)|(?<c>c)(?-1))) bb y $1 bb # GH 20653 +((?|(?<a>a)(?-1)|(?<b>b)(?-1)|(?<c>c)(?-1))) cc y $1 cc # GH 20653 # Keep these lines at the end of the file # pat string y/n/etc expr expected-expr skip-reason comment # vim: softtabstop=0 noexpandtab |