diff options
author | Yves Orton <demerphq@gmail.com> | 2006-10-04 17:45:15 +0200 |
---|---|---|
committer | Rafael Garcia-Suarez <rgarciasuarez@gmail.com> | 2006-10-05 10:23:08 +0000 |
commit | 6bda09f9db748451f9bb2b0d8c798ce595a6609c (patch) | |
tree | 1ddc57ee0bf52f91d840b31da4dea86d20ede672 | |
parent | 87fbace95be9589b7b2c6e7ed7bd681adeae2cf4 (diff) | |
download | perl-6bda09f9db748451f9bb2b0d8c798ce595a6609c.tar.gz |
Re: [PATCH] Add recursive regexes similar to PCRE
Date: Wed, 4 Oct 2006 15:45:15 +0200
Message-ID: <9b18b3110610040645s563220a2id6f235494b497e90@mail.gmail.com>
Subject: Re: [PATCH] Add recursive regexes similar to PCRE
From: demerphq <demerphq@gmail.com>
Date: Wed, 4 Oct 2006 21:05:10 +0200
Message-ID: <9b18b3110610041205m2660eb43m1315cf4b0653db96@mail.gmail.com>
p4raw-id: //depot/perl@28939
-rw-r--r-- | embed.fnc | 2 | ||||
-rw-r--r-- | embed.h | 2 | ||||
-rw-r--r-- | pod/perlre.pod | 95 | ||||
-rw-r--r-- | pod/perltodo.pod | 67 | ||||
-rw-r--r-- | proto.h | 2 | ||||
-rw-r--r-- | regcomp.c | 126 | ||||
-rw-r--r-- | regcomp.h | 18 | ||||
-rw-r--r-- | regcomp.pl | 8 | ||||
-rw-r--r-- | regcomp.sym | 4 | ||||
-rw-r--r-- | regexec.c | 98 | ||||
-rw-r--r-- | regexp.h | 4 | ||||
-rw-r--r-- | regnodes.h | 130 | ||||
-rwxr-xr-x | t/op/pat.t | 34 | ||||
-rw-r--r-- | t/op/re_tests | 4 |
14 files changed, 482 insertions, 112 deletions
@@ -1312,7 +1312,7 @@ ERsn |I32 |regcurly |NN const char * Es |regnode*|reg_node |NN struct RExC_state_t *state|U8 op Es |regnode*|regpiece |NN struct RExC_state_t *state|NN I32 *flagp|U32 depth Es |regnode*|reg_namedseq |NN struct RExC_state_t *state|NULLOK UV *valuep -Es |void |reginsert |NN struct RExC_state_t *state|U8 op|NN regnode *opnd +Es |void |reginsert |NN struct RExC_state_t *state|U8 op|NN regnode *opnd|U32 depth Es |void |regtail |NN struct RExC_state_t *state|NN regnode *p|NN const regnode *val|U32 depth Es |U32 |join_exact |NN struct RExC_state_t *state|NN regnode *scan|NN I32 *min|U32 flags|NULLOK regnode *val|U32 depth EsRn |char* |regwhite |NN char *p|NN const char *e @@ -3514,7 +3514,7 @@ #define reg_node(a,b) S_reg_node(aTHX_ a,b) #define regpiece(a,b,c) S_regpiece(aTHX_ a,b,c) #define reg_namedseq(a,b) S_reg_namedseq(aTHX_ a,b) -#define reginsert(a,b,c) S_reginsert(aTHX_ a,b,c) +#define reginsert(a,b,c,d) S_reginsert(aTHX_ a,b,c,d) #define regtail(a,b,c,d) S_regtail(aTHX_ a,b,c,d) #define join_exact(a,b,c,d,e,f) S_join_exact(aTHX_ a,b,c,d,e,f) #define regwhite S_regwhite diff --git a/pod/perlre.pod b/pod/perlre.pod index 61720db906..0e26b11f70 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -674,6 +674,13 @@ The assignment to C<$^R> above is properly localized, so the old value of C<$^R> is restored if the assertion is backtracked; compare L<"Backtracking">. +Due to an unfortunate implementation issue the perl code contained in these +blocks is treated as a compile time closure, which can have seemingly bizarre +consequences when used with lexically scoped variables inside of subroutines +or loops. There are various workarounds for this, including simply using +global variables instead. If you are using this construct and strange results +occur then check for the use of lexically scoped variables. + For reasons of security, this construct is forbidden if the regular expression involves run-time interpolation of variables, unless the perilous C<use re 'eval'> pragma has been used (see L<re>), or the @@ -702,7 +709,6 @@ or indirectly with functions such as C<split>. =item C<(??{ code })> X<(??{})> X<regex, postponed> X<regexp, postponed> X<regular expression, postponed> -X<regex, recursive> X<regexp, recursive> X<regular expression, recursive> B<WARNING>: This extended regular expression feature is considered highly experimental, and may be changed or deleted without notice. @@ -712,7 +718,15 @@ used idioms. This is a "postponed" regular subexpression. The C<code> is evaluated at run time, at the moment this subexpression may match. The result of evaluation is considered as a regular expression and matched as -if it were inserted instead of this construct. +if it were inserted instead of this construct. Note that this means +that the contents of capture buffers defined inside an eval'ed pattern +are not available outside of the pattern, and vice versa, there is no +way for the inner pattern to refer to a capture buffer defined outside. +Thus, + + ('a' x 100)=~/(??{'(.)' x 100})/ + +B<will> match, it will B<not> set $1. The C<code> is not interpolated. As before, the rules to determine where the C<code> ends are currently somewhat convoluted. @@ -729,12 +743,80 @@ The following pattern matches a parenthesized group: \) }x; +See also C<(?PARNO)> for a different, more efficient way to accomplish +the same task. + Because perl's regex engine is not currently re-entrant, delayed code may not invoke the regex engine either directly with C<m//> or C<s///>), or indirectly with functions such as C<split>. +Recursing deeper than 50 times without consuming any input string will +result in a fatal error. The maximum depth is compiled into perl, so +changing it requires a custom build. + +=item C<(?PARNO)> C<(?R)> + +X<(?PARNO)> X<(?1)> +X<regex, recursive> X<regexp, recursive> X<regular expression, recursive> + +B<WARNING>: This extended regular expression feature is considered +highly experimental, and may be changed or deleted without notice. + +Similar to C<(??{ code })> except it does not involve compiling any code, +instead it treats the contents of a capture buffer as an independent +pattern that must match at the current position. Capture buffers +contained by the pattern will have the value as determined by the +outermost recursion. + +PARNO is a sequence of digits not starting with 0 whose value +reflects the paren-number of the capture buffer to recurse to. +C<(?R)> curses to the beginning of the pattern. + +The following pattern matches a function foo() which may contain +balanced parenthesis as the argument. + + $re = qr{ ( # paren group 1 (full function) + foo + ( # paren group 2 (parens) + \( + ( # paren group 3 (contents of parens) + (?: + (?> [^()]+ ) # Non-parens without backtracking + | + (?2) # Recurse to start of paren group 2 + )* + ) + \) + ) + ) + }x; + +If the pattern was used as follows + + 'foo(bar(baz)+baz(bop))'=~/$re/ + and print "\$1 = $1\n", + "\$2 = $2\n", + "\$3 = $3\n"; + +the output produced should be the following: + + $1 = foo(bar(baz)+baz(bop)) + $2 = (bar(baz)+baz(bop)) + $3 = bar(baz)+baz(bop) + +If there is no corresponding capture buffer defined, then it is a +fatal error. Recursing deeper than 50 times without consuming any input +string will also result in a fatal error. The maximum depth is compiled +into perl, so changing it requires a custom build. + +B<Note> that this pattern does not behave the same way as the equivalent +PCRE or Python construct of the same form. In perl you can backtrack into +a recursed group, in PCRE and Python the recursed into group is treated +as atomic. Also, constructs like (?i:(?1)) or (?:(?i)(?1)) do not affect +the pattern being recursed into. + =item C<< (?>pattern) >> -X<backtrack> X<backtracking> +X<backtrack> X<backtracking> X<atomic> X<possessive> B<WARNING>: This extended regular expression feature is considered highly experimental, and may be changed or deleted without notice. @@ -827,6 +909,9 @@ one of these: Which one you pick depends on which of these expressions better reflects the above specification of comments. +In some literature this construct is called "atomic matching" or +"possessive matching". + =item C<(?(condition)yes-pattern|no-pattern)> X<(?()> @@ -1320,10 +1405,10 @@ else in the whole regular expression.) For this grouping operator there is no need to describe the ordering, since only whether or not C<S> can match is important. -=item C<(??{ EXPR })> +=item C<(??{ EXPR })>, C<(?PARNO)> The ordering is the same as for the regular expression which is -the result of EXPR. +the result of EXPR, or the pattern contained by capture buffer PARNO. =item C<(?(condition)yes-pattern|no-pattern)> diff --git a/pod/perltodo.pod b/pod/perltodo.pod index 6bf9d1f4cb..50a79d9429 100644 --- a/pod/perltodo.pod +++ b/pod/perltodo.pod @@ -628,3 +628,70 @@ Fix (or rewrite) the implementation of the C</(?{...})/> closures. This will allow the use of a regex from inside (?{ }), (??{ }) and (?(?{ })|) constructs. + +=head2 Add named capture to regexp engine + +Named capture is supported by .NET, PCRE and Python. Its embarrassing +Perl doesn't support it yet. + +Jeffrey Friedl notes that "the most glaring omission [in perl's regexp +engine] offered by other implementations is named capture". + +demerphq is working on this. + +=head2 Add possessive quantifiers to regexp engine + +Possessive quantifiers are a syntactic sugar that affords a more +elegant way to express (?>A+). They are also provided by many other +regex engines. Most importantly they allow various patterns to be +optimised more efficiently than (?>...) allows, and allow various data +driven optimisations to be implemented (such as auto-possesification of +quantifiers followed by contrary suffixes). Common syntax for them is + + ++ possessive 1 or more + *+ possessive 0 or more + {n,m}+ possessive n..m + +A possessive quantifier basically absorbs as much as it can and doesn't +give any back. + +Jeffrey Friedl documents possessive quantifiers in Mastering Regular +Expressions 2nd edition and explicitly pleads for them to be added to +perl. We should oblige him, lest he leaves us out of a future edition. +;-) + +demerphq has this on his todo list + +=head2 Add (?YES) (?NO) to regexp enigne + +YES/NO would allow a subpattern to be passed/failed but allow backtracking. +Basically a more efficient (?=), (?!). + +demerphq has this on his todo list + +=head2 Add (?SUCCEED) (?FAIL) to regexp engine + +SUCCEED/FAIL would allow a pattern to be passed/failed but without backtracking. +Thus you could signal that a pattern has matched or not, and return (regardless +that there is more pattern following). + +demerphq has this on his todo list + +=head2 Add (?CUT) (?COMMIT) to regexp engine + +CUT would allow a pattern to say "do not backtrack beyond here". +COMMIT would say match from here or don't, but don't try the pattern from +another starting pattern. + +These correspond to the \v and \V that Jeffrey Friedl mentions in +Mastering Regular Expressions 2nd edition. + +demerphq has this on his todo list + +=head2 Add class set operations to regexp engine + +Apparently these are quite useful. Anyway, Jeffery Friedl wants them. + +demerphq has this on his todo list, but right at the bottom. + + @@ -3574,7 +3574,7 @@ STATIC regnode* S_regpiece(pTHX_ struct RExC_state_t *state, I32 *flagp, U32 dep STATIC regnode* S_reg_namedseq(pTHX_ struct RExC_state_t *state, UV *valuep) __attribute__nonnull__(pTHX_1); -STATIC void S_reginsert(pTHX_ struct RExC_state_t *state, U8 op, regnode *opnd) +STATIC void S_reginsert(pTHX_ struct RExC_state_t *state, U8 op, regnode *opnd, U32 depth) __attribute__nonnull__(pTHX_1) __attribute__nonnull__(pTHX_3); @@ -117,8 +117,9 @@ typedef struct RExC_state_t { I32 extralen; I32 seen_zerolen; I32 seen_evals; + regnode **parens; /* offsets of each paren */ I32 utf8; - HV *charnames; /* cache of named sequences */ + HV *charnames; /* cache of named sequences */ #if ADD_TO_REGEXEC char *starttry; /* -Dr: where regtry was called. */ #define RExC_starttry (pRExC_state->starttry) @@ -151,6 +152,7 @@ typedef struct RExC_state_t { #define RExC_seen_evals (pRExC_state->seen_evals) #define RExC_utf8 (pRExC_state->utf8) #define RExC_charnames (pRExC_state->charnames) +#define RExC_parens (pRExC_state->parens) #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?') #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \ @@ -2709,6 +2711,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } flags &= ~SCF_DO_STCLASS; } + else if (OP(scan)==RECURSE) { + ARG2L_SET( scan, RExC_parens[ARG(scan)-1] - scan ); + } else if (strchr((const char*)PL_varies,OP(scan))) { I32 mincount, maxcount, minnext, deltanext, fl = 0; I32 f = flags, pos_before = 0; @@ -3766,6 +3771,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) RExC_emit = &PL_regdummy; RExC_whilem_seen = 0; RExC_charnames = NULL; + RExC_parens= NULL; #if 0 /* REGC() is (currently) a NOP at the first pass. * Clever compilers notice this and complain. --jhi */ @@ -3820,8 +3826,14 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) r->substrs = 0; /* Useful during FAIL. */ r->startp = 0; /* Useful during FAIL. */ - r->endp = 0; /* Useful during FAIL. */ + r->endp = 0; + if (RExC_seen & REG_SEEN_RECURSE) { + Newx(RExC_parens, RExC_npar,regnode *); + SAVEFREEPV(RExC_parens); + } + + /* Useful during FAIL. */ Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */ if (r->offsets) { r->offsets[0] = RExC_size; @@ -3847,6 +3859,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) r->data = 0; if (reg(pRExC_state, 0, &flags,1) == NULL) return(NULL); + /* XXXX To minimize changes to RE engine we always allocate 3-units-long substrs field. */ Newx(r->substrs, 1, struct reg_substr_data); @@ -4242,10 +4255,6 @@ reStudy: Newxz(r->startp, RExC_npar, I32); Newxz(r->endp, RExC_npar, I32); - - if (RExC_charnames) - SvREFCNT_dec((SV*)(RExC_charnames)); - DEBUG_r( RX_DEBUG_on(r) ); DEBUG_DUMP_r({ PerlIO_printf(Perl_debug_log,"Final program:\n"); @@ -4312,6 +4321,10 @@ reStudy: DEBUG_PARSE_MSG((funcname)); \ PerlIO_printf(Perl_debug_log,"%4s","\n"); \ }) +#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \ + DEBUG_PARSE_MSG((funcname)); \ + PerlIO_printf(Perl_debug_log,fmt "\n",args); \ +}) /* - reg - regular expression, i.e. main body or parenthesized thing * @@ -4399,6 +4412,41 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) nextchar(pRExC_state); *flagp = TRYAGAIN; return NULL; + case 'R' : + if (*RExC_parse != ')') + FAIL("Sequence (?R) not terminated"); + reg_node(pRExC_state, SRECURSE); + break; + case '1': case '2': case '3': case '4': /* (?1) */ + case '5': case '6': case '7': case '8': case '9': + RExC_parse--; + { + const I32 num = atoi(RExC_parse); + char * const parse_start = RExC_parse - 1; /* MJD */ + while (isDIGIT(*RExC_parse)) + RExC_parse++; + if (*RExC_parse!=')') + vFAIL("Expecting close bracket"); + ret = reganode(pRExC_state, RECURSE, num); + if (!SIZE_ONLY) { + if (num > (I32)RExC_rx->nparens) { + RExC_parse++; + vFAIL("Reference to nonexistent group"); + } + ARG2L_SET( ret, 0); + RExC_emit++; + DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, + "Recurse #%d to %d\n", ARG(ret), ARG2L(ret))); + } else{ + RExC_size++; + RExC_seen|=REG_SEEN_RECURSE; + } + Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */ + Set_Node_Offset(ret, RExC_parse); /* MJD */ + + nextchar(pRExC_state); + return ret; + } case 'p': /* (?p...) */ if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP)) vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})"); @@ -4612,6 +4660,12 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) parno = RExC_npar; RExC_npar++; ret = reganode(pRExC_state, OPEN, parno); + if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) { + DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, "Setting paren #%d to %d\n", + parno,REG_NODE_NUM(ret))); + RExC_parens[parno-1]= ret; + + } Set_Node_Length(ret, 1); /* MJD */ Set_Node_Offset(ret, RExC_parse); /* MJD */ is_open = 1; @@ -4629,10 +4683,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) return(NULL); if (*RExC_parse == '|') { if (!SIZE_ONLY && RExC_extralen) { - reginsert(pRExC_state, BRANCHJ, br); + reginsert(pRExC_state, BRANCHJ, br, depth+1); } else { /* MJD */ - reginsert(pRExC_state, BRANCH, br); + reginsert(pRExC_state, BRANCH, br, depth+1); Set_Node_Length(br, paren != 0); Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start); } @@ -4719,7 +4773,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (paren == '>') node = SUSPEND, flag = 0; - reginsert(pRExC_state, node,ret); + reginsert(pRExC_state, node,ret, depth+1); Set_Node_Cur_Length(ret); Set_Node_Offset(ret, parse_start + 1); ret->flags = flag; @@ -4880,7 +4934,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) do_curly: if ((flags&SIMPLE)) { RExC_naughty += 2 + RExC_naughty / 2; - reginsert(pRExC_state, CURLY, ret); + reginsert(pRExC_state, CURLY, ret, depth+1); Set_Node_Offset(ret, parse_start+1); /* MJD */ Set_Node_Cur_Length(ret); } @@ -4890,11 +4944,11 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) w->flags = 0; REGTAIL(pRExC_state, ret, w); if (!SIZE_ONLY && RExC_extralen) { - reginsert(pRExC_state, LONGJMP,ret); - reginsert(pRExC_state, NOTHING,ret); + reginsert(pRExC_state, LONGJMP,ret, depth+1); + reginsert(pRExC_state, NOTHING,ret, depth+1); NEXT_OFF(ret) = 3; /* Go over LONGJMP. */ } - reginsert(pRExC_state, CURLYX,ret); + reginsert(pRExC_state, CURLYX,ret, depth+1); /* MJD hk */ Set_Node_Offset(ret, parse_start+1); Set_Node_Length(ret, @@ -4928,6 +4982,10 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) *flagp = flags; return(ret); } + /* else if (OP(ret)==RECURSE) { + RExC_parse++; + vFAIL("Illegal quantifier on recursion group"); + } */ #if 0 /* Now runtime fix should be reliable. */ @@ -4951,7 +5009,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH); if (op == '*' && (flags&SIMPLE)) { - reginsert(pRExC_state, STAR, ret); + reginsert(pRExC_state, STAR, ret, depth+1); ret->flags = 0; RExC_naughty += 4; } @@ -4960,7 +5018,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) goto do_curly; } else if (op == '+' && (flags&SIMPLE)) { - reginsert(pRExC_state, PLUS, ret); + reginsert(pRExC_state, PLUS, ret, depth+1); ret->flags = 0; RExC_naughty += 3; } @@ -4982,7 +5040,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) if (*RExC_parse == '?') { nextchar(pRExC_state); - reginsert(pRExC_state, MINMOD, ret); + reginsert(pRExC_state, MINMOD, ret, depth+1); REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE); } if (ISMULT2(RExC_parse)) { @@ -5098,6 +5156,7 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep) if (!RExC_charnames) { /* make sure our cache is allocated */ RExC_charnames = newHV(); + sv_2mortal((SV*)RExC_charnames); } /* see if we have looked this one up before */ he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 ); @@ -6944,6 +7003,20 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) if (SIZE_ONLY) { SIZE_ALIGN(RExC_size); RExC_size += 2; + /* + We can't do this: + + assert(2==regarglen[op]+1); + + Anything larger than this has to allocate the extra amount. + If we changed this to be: + + RExC_size += (1 + regarglen[op]); + + then it wouldn't matter. Its not clear what side effect + might come from that so its not done so far. + -- dmq + */ return(ret); } @@ -6984,24 +7057,33 @@ S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s) * Means relocating the operand. */ STATIC void -S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd) +S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) { dVAR; register regnode *src; register regnode *dst; register regnode *place; const int offset = regarglen[(U8)op]; + const int size = NODE_STEP_REGNODE + offset; GET_RE_DEBUG_FLAGS_DECL; /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */ - + DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]); if (SIZE_ONLY) { - RExC_size += NODE_STEP_REGNODE + offset; + RExC_size += size; return; } src = RExC_emit; - RExC_emit += NODE_STEP_REGNODE + offset; + RExC_emit += size; dst = RExC_emit; + if (RExC_parens) { + int paren; + for ( paren=0 ; paren < RExC_npar ; paren++ ) { + if ( RExC_parens[paren] >= src ) + RExC_parens[paren] += size; + } + } + while (src > opnd) { StructCopy(--src, --dst, regnode); if (RExC_offsets) { /* MJD 20010112 */ @@ -7374,8 +7456,10 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) } else if (k == WHILEM && o->flags) /* Ordinal/of */ Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4); - else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP ) + else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP) Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */ + else if (k == RECURSE) + Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */ else if (k == LOGICAL) Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */ else if (k == ANYOF) { @@ -86,6 +86,8 @@ struct regnode_string { char string[1]; }; +/* Argument bearing node - workhorse, + arg1 is often for the data field */ struct regnode_1 { U8 flags; U8 type; @@ -93,6 +95,16 @@ struct regnode_1 { U32 arg1; }; +/* Similar to a regnode_1 but with an extra signed argument */ +struct regnode_2L { + U8 flags; + U8 type; + U16 next_off; + U32 arg1; + I32 arg2; +}; + +/* 'Two field' -- Two 16 bit unsigned args */ struct regnode_2 { U8 flags; U8 type; @@ -101,6 +113,7 @@ struct regnode_2 { U16 arg2; }; + #define ANYOF_BITMAP_SIZE 32 /* 256 b/(8 b/B) */ #define ANYOF_CLASSBITMAP_SIZE 4 /* up to 32 (8*4) named classes */ @@ -154,10 +167,12 @@ struct regnode_charclass_class { /* has [[:blah:]] classes */ #define ARG(p) ARG_VALUE(ARG_LOC(p)) #define ARG1(p) ARG_VALUE(ARG1_LOC(p)) #define ARG2(p) ARG_VALUE(ARG2_LOC(p)) +#define ARG2L(p) ARG_VALUE(ARG2L_LOC(p)) #define ARG_SET(p, val) ARG__SET(ARG_LOC(p), (val)) #define ARG1_SET(p, val) ARG__SET(ARG1_LOC(p), (val)) #define ARG2_SET(p, val) ARG__SET(ARG2_LOC(p), (val)) +#define ARG2L_SET(p, val) ARG__SET(ARG2L_LOC(p), (val)) #undef NEXT_OFF #undef NODE_ALIGN @@ -190,7 +205,7 @@ struct regnode_charclass_class { /* has [[:blah:]] classes */ #define ARG_LOC(p) (((struct regnode_1 *)p)->arg1) #define ARG1_LOC(p) (((struct regnode_2 *)p)->arg1) #define ARG2_LOC(p) (((struct regnode_2 *)p)->arg2) - +#define ARG2L_LOC(p) (((struct regnode_2L *)p)->arg2) #define NODE_STEP_REGNODE 1 /* sizeof(regnode)/sizeof(regnode) */ #define EXTRA_STEP_2ARGS EXTRA_SIZE(struct regnode_2) @@ -328,6 +343,7 @@ struct regnode_charclass_class { /* has [[:blah:]] classes */ #define REG_SEEN_EVAL 0x00000008 #define REG_SEEN_CANY 0x00000010 #define REG_SEEN_SANY REG_SEEN_CANY /* src bckwrd cmpt */ +#define REG_SEEN_RECURSE 0x00000020 START_EXTERN_C diff --git a/regcomp.pl b/regcomp.pl index ed270e8967..2e84604b5f 100644 --- a/regcomp.pl +++ b/regcomp.pl @@ -82,6 +82,8 @@ printf OUT <<EOP, Any changes made here will be lost! */ +/* Regops and State definitions */ + #define %*s\t%d #define %*s\t%d @@ -101,6 +103,7 @@ while (++$ind <= $tot) { print OUT <<EOP; +/* PL_regkind[] What type of regop or state is this. */ #ifndef DOINIT EXTCONST U8 PL_regkind[]; @@ -120,6 +123,7 @@ print OUT <<EOP; }; #endif +/* regarglen[] - How large is the argument part of the node (in regnodes) */ #ifdef REG_COMP_C static const U8 regarglen[] = { @@ -137,6 +141,8 @@ while (++$ind <= $lastregop) { print OUT <<EOP; }; +/* reg_off_by_arg[] - Which argument holds the offset to the next node */ + static const char reg_off_by_arg[] = { EOP @@ -151,6 +157,8 @@ while (++$ind <= $lastregop) { print OUT <<EOP; }; +/* reg_name[] - Opcode/state names in string form, for debugging */ + #ifdef DEBUGGING const char * reg_name[] = { EOP diff --git a/regcomp.sym b/regcomp.sym index bc6f8e3164..4365eb5897 100644 --- a/regcomp.sym +++ b/regcomp.sym @@ -153,7 +153,9 @@ TRIEC TRIE, trie charclass Same as TRIE, but with embedded charclass data AHOCORASICK TRIE, trie 1 Aho Corasick stclass. flags==type AHOCORASICKC TRIE, trie charclass Same as AHOCORASICK, but with embedded charclass data - +#*Recursion (65) +RECURSE RECURSE, num/ofs 2L recurse to paren arg1 at (signed) ofs arg2 +SRECURSE RECURSE, no recurse to start of pattern # NEW STUFF ABOVE THIS LINE -- Please update counts below. @@ -165,7 +165,7 @@ S_regcppush(pTHX_ I32 parenfloor) if (paren_elems_to_push < 0) Perl_croak(aTHX_ "panic: paren_elems_to_push < 0"); -#define REGCP_OTHER_ELEMS 6 +#define REGCP_OTHER_ELEMS 8 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS); for (p = PL_regsize; p > parenfloor; p--) { /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */ @@ -181,6 +181,8 @@ S_regcppush(pTHX_ I32 parenfloor) )); } /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */ + SSPUSHPTR(PL_regstartp); + SSPUSHPTR(PL_regendp); SSPUSHINT(PL_regsize); SSPUSHINT(*PL_reglastparen); SSPUSHINT(*PL_reglastcloseparen); @@ -227,7 +229,10 @@ S_regcppop(pTHX_ const regexp *rex) *PL_reglastcloseparen = SSPOPINT; *PL_reglastparen = SSPOPINT; PL_regsize = SSPOPINT; + PL_regendp=(I32 *) SSPOPPTR; + PL_regstartp=(I32 *) SSPOPPTR; + /* Now restore the parentheses context. */ for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS); i > 0; i -= REGCP_PAREN_ELEMS) { @@ -488,7 +493,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, srch_end_shift -= ((strbeg - s) - srch_start_shift); srch_start_shift = strbeg - s; } - DEBUG_OPTIMISE_r({ + DEBUG_OPTIMISE_MORE_r({ PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n", (IV)prog->check_offset_min, (IV)srch_start_shift, @@ -524,7 +529,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend); end_point= HOP3(strend, -srch_end_shift, strbeg); } - DEBUG_OPTIMISE_r({ + DEBUG_OPTIMISE_MORE_r({ PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n", (int)(end_point - start_point), (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point), @@ -719,7 +724,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos); - DEBUG_OPTIMISE_r( + DEBUG_OPTIMISE_MORE_r( PerlIO_printf(Perl_debug_log, "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n", (IV)prog->check_offset_min, @@ -1979,9 +1984,10 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * } } if (last == NULL) { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, - "%sCan't trim the tail, match fails (should not happen)%s\n", - PL_colors[4], PL_colors[5])); + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%sCan't trim the tail, match fails (should not happen)%s\n", + PL_colors[4], PL_colors[5])); goto phooey; /* Should not happen! */ } dontbother = strend - last + prog->float_min_offset; @@ -2063,6 +2069,7 @@ phooey: return 0; } + /* - regtry - try match at specific point */ @@ -2146,16 +2153,16 @@ S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos) prog->subbeg = PL_bostr; prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */ } + DEBUG_EXECUTE_r(PL_reg_starttry = startpos); prog->startp[0] = startpos - PL_bostr; PL_reginput = startpos; - PL_regstartp = prog->startp; - PL_regendp = prog->endp; PL_reglastparen = &prog->lastparen; PL_reglastcloseparen = &prog->lastcloseparen; prog->lastparen = 0; prog->lastcloseparen = 0; PL_regsize = 0; - DEBUG_EXECUTE_r(PL_reg_starttry = startpos); + PL_regstartp = prog->startp; + PL_regendp = prog->endp; if (PL_reg_start_tmpl <= prog->nparens) { PL_reg_start_tmpl = prog->nparens*3/2 + 3; if(PL_reg_start_tmp) @@ -2508,6 +2515,7 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog) register I32 nextchr; /* is always set to UCHARAT(locinput) */ bool result = 0; /* return value of S_regmatch */ int depth = 0; /* depth of recursion */ + int nochange_depth = 0; /* depth of RECURSE recursion with nochange*/ regmatch_state *yes_state = NULL; /* state to pop to on success of subpattern */ regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */ @@ -3325,10 +3333,39 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog) #undef ST #define ST st->u.eval - - case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */ { SV *ret; + regexp *re; + regnode *startpoint; + + case SRECURSE: + case RECURSE: /* /(...(?1))/ */ + if (cur_eval && cur_eval->locinput==locinput) { + if (cur_eval->u.eval.close_paren == ARG(scan)) + Perl_croak(aTHX_ "Infinite recursion in RECURSE in regexp"); + if ( ++nochange_depth > MAX_RECURSE_EVAL_NOCHANGE_DEPTH ) + Perl_croak(aTHX_ "RECURSE without pos change exceeded limit in regexp"); + } else { + nochange_depth = 0; + } + re = rex; + (void)ReREFCNT_inc(rex); + if (OP(scan)==RECURSE) { + startpoint = scan + ARG2L(scan); + ST.close_paren = ARG(scan); + } else { + startpoint = re->program+1; + ST.close_paren = 0; + } + goto eval_recurse_doit; + /* NOTREACHED */ + case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */ + if (cur_eval && cur_eval->locinput==locinput) { + if ( ++nochange_depth > MAX_RECURSE_EVAL_NOCHANGE_DEPTH ) + Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regexp"); + } else { + nochange_depth = 0; + } { /* execute the code in the {...} */ dSP; @@ -3362,7 +3399,7 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog) } } if (st->logical == 2) { /* Postponed subexpression: /(??{...})/ */ - regexp *re; + { /* extract RE object from returned value; compiling if * necessary */ @@ -3399,10 +3436,29 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog) PL_regsize = osize; } } + DEBUG_EXECUTE_r( + debug_start_match(re, do_utf8, locinput, PL_regeol, + "Matching embedded"); + ); + startpoint = re->program + 1; + ST.close_paren = 0; /* only used for RECURSE */ + /* borrowed from regtry */ + if (PL_reg_start_tmpl <= re->nparens) { + PL_reg_start_tmpl = re->nparens*3/2 + 3; + if(PL_reg_start_tmp) + Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*); + else + Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*); + } + eval_recurse_doit: /* Share code with RECURSE below this line */ /* run the pattern returned from (??{...}) */ ST.cp = regcppush(0); /* Save *all* the positions. */ REGCP_SET(ST.lastcp); + + PL_regstartp = re->startp; /* essentially NOOP on RECURSE */ + PL_regendp = re->endp; /* essentially NOOP on RECURSE */ + *PL_reglastparen = 0; *PL_reglastcloseparen = 0; PL_reginput = locinput; @@ -3425,13 +3481,8 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog) ST.B = next; ST.prev_eval = cur_eval; cur_eval = st; - - DEBUG_EXECUTE_r( - debug_start_match(re, do_utf8, locinput, PL_regeol, - "Matching embedded"); - ); /* now continue from first node in postoned RE */ - PUSH_YES_STATE_GOTO(EVAL_AB, re->program + 1); + PUSH_YES_STATE_GOTO(EVAL_AB, startpoint); /* NOTREACHED */ } /* /(?(?{...})X|Y)/ */ @@ -3466,7 +3517,6 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog) /* XXXX This is too dramatic a measure... */ PL_reg_maxiter = 0; sayNO_SILENT; - #undef ST case OPEN: @@ -3482,6 +3532,9 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog) if (n > (I32)*PL_reglastparen) *PL_reglastparen = n; *PL_reglastcloseparen = n; + if (cur_eval && cur_eval->u.eval.close_paren == (U32)n) { + goto fake_end; + } break; case GROUPP: n = ARG(scan); /* which paren pair */ @@ -4318,6 +4371,7 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog) case END: + fake_end: if (cur_eval) { /* we've just finished A in /(??{A})B/; now continue with B */ I32 tmpix; @@ -4345,8 +4399,8 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog) st->u.eval.prev_eval = cur_eval; cur_eval = cur_eval->u.eval.prev_eval; DEBUG_EXECUTE_r( - PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ...\n", - REPORT_CODE_OFF+depth*2, "");); + PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %x\n", + REPORT_CODE_OFF+depth*2, "",(int)cur_eval);); PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B); /* match B */ } @@ -96,6 +96,7 @@ typedef struct regexp_engine { #define ROPT_CANY_SEEN 0x00000800 #define ROPT_SANY_SEEN ROPT_CANY_SEEN /* src bckwrd cmpt */ #define ROPT_GPOS_CHECK (ROPT_GPOS_SEEN|ROPT_ANCH_GPOS) +#define ROPT_RECURSE_SEEN 0x00001000 /* 0xf800 of reganch is used by PMf_COMPILETIME */ @@ -205,6 +206,8 @@ typedef struct { /* structures for holding and saving the state maintained by regmatch() */ +#define MAX_RECURSE_EVAL_NOCHANGE_DEPTH 50 + typedef I32 CHECKPOINT; typedef struct regmatch_state { @@ -255,6 +258,7 @@ typedef struct regmatch_state { CHECKPOINT cp; /* remember current savestack indexes */ CHECKPOINT lastcp; regnode *B; /* the node following us */ + U32 close_paren; /* which close bracket is our end */ } eval; struct { diff --git a/regnodes.h b/regnodes.h index 01a53f99d4..ec6011166c 100644 --- a/regnodes.h +++ b/regnodes.h @@ -4,8 +4,10 @@ Any changes made here will be lost! */ -#define REGNODE_MAX 66 -#define REGMATCH_STATE_MAX 91 +/* Regops and State definitions */ + +#define REGNODE_MAX 68 +#define REGMATCH_STATE_MAX 93 #define END 0 /* 0000 End of program. */ #define SUCCEED 1 /* 0x01 Return from a subroutine, basically. */ @@ -72,37 +74,40 @@ #define TRIEC 62 /* 0x3e Same as TRIE, but with embedded charclass data */ #define AHOCORASICK 63 /* 0x3f Aho Corasick stclass. flags==type */ #define AHOCORASICKC 64 /* 0x40 Same as AHOCORASICK, but with embedded charclass data */ -#define OPTIMIZED 65 /* 0x41 Placeholder for dump. */ -#define PSEUDO 66 /* 0x42 Pseudo opcode for internal use. */ +#define RECURSE 65 /* 0x41 recurse to paren arg1 at (signed) ofs arg2 */ +#define SRECURSE 66 /* 0x42 recurse to start of pattern */ +#define OPTIMIZED 67 /* 0x43 Placeholder for dump. */ +#define PSEUDO 68 /* 0x44 Pseudo opcode for internal use. */ /* ------------ States ------------- */ -#define TRIE_next 67 /* 0x43 Regmatch state for TRIE */ -#define TRIE_next_fail 68 /* 0x44 Regmatch state for TRIE */ -#define EVAL_AB 69 /* 0x45 Regmatch state for EVAL */ -#define EVAL_AB_fail 70 /* 0x46 Regmatch state for EVAL */ -#define resume_CURLYX 71 /* 0x47 Regmatch state for CURLYX */ -#define resume_WHILEM1 72 /* 0x48 Regmatch state for WHILEM */ -#define resume_WHILEM2 73 /* 0x49 Regmatch state for WHILEM */ -#define resume_WHILEM3 74 /* 0x4a Regmatch state for WHILEM */ -#define resume_WHILEM4 75 /* 0x4b Regmatch state for WHILEM */ -#define resume_WHILEM5 76 /* 0x4c Regmatch state for WHILEM */ -#define resume_WHILEM6 77 /* 0x4d Regmatch state for WHILEM */ -#define BRANCH_next 78 /* 0x4e Regmatch state for BRANCH */ -#define BRANCH_next_fail 79 /* 0x4f Regmatch state for BRANCH */ -#define CURLYM_A 80 /* 0x50 Regmatch state for CURLYM */ -#define CURLYM_A_fail 81 /* 0x51 Regmatch state for CURLYM */ -#define CURLYM_B 82 /* 0x52 Regmatch state for CURLYM */ -#define CURLYM_B_fail 83 /* 0x53 Regmatch state for CURLYM */ -#define IFMATCH_A 84 /* 0x54 Regmatch state for IFMATCH */ -#define IFMATCH_A_fail 85 /* 0x55 Regmatch state for IFMATCH */ -#define CURLY_B_min_known 86 /* 0x56 Regmatch state for CURLY */ -#define CURLY_B_min_known_fail 87 /* 0x57 Regmatch state for CURLY */ -#define CURLY_B_min 88 /* 0x58 Regmatch state for CURLY */ -#define CURLY_B_min_fail 89 /* 0x59 Regmatch state for CURLY */ -#define CURLY_B_max 90 /* 0x5a Regmatch state for CURLY */ -#define CURLY_B_max_fail 91 /* 0x5b Regmatch state for CURLY */ +#define TRIE_next 69 /* 0x45 Regmatch state for TRIE */ +#define TRIE_next_fail 70 /* 0x46 Regmatch state for TRIE */ +#define EVAL_AB 71 /* 0x47 Regmatch state for EVAL */ +#define EVAL_AB_fail 72 /* 0x48 Regmatch state for EVAL */ +#define resume_CURLYX 73 /* 0x49 Regmatch state for CURLYX */ +#define resume_WHILEM1 74 /* 0x4a Regmatch state for WHILEM */ +#define resume_WHILEM2 75 /* 0x4b Regmatch state for WHILEM */ +#define resume_WHILEM3 76 /* 0x4c Regmatch state for WHILEM */ +#define resume_WHILEM4 77 /* 0x4d Regmatch state for WHILEM */ +#define resume_WHILEM5 78 /* 0x4e Regmatch state for WHILEM */ +#define resume_WHILEM6 79 /* 0x4f Regmatch state for WHILEM */ +#define BRANCH_next 80 /* 0x50 Regmatch state for BRANCH */ +#define BRANCH_next_fail 81 /* 0x51 Regmatch state for BRANCH */ +#define CURLYM_A 82 /* 0x52 Regmatch state for CURLYM */ +#define CURLYM_A_fail 83 /* 0x53 Regmatch state for CURLYM */ +#define CURLYM_B 84 /* 0x54 Regmatch state for CURLYM */ +#define CURLYM_B_fail 85 /* 0x55 Regmatch state for CURLYM */ +#define IFMATCH_A 86 /* 0x56 Regmatch state for IFMATCH */ +#define IFMATCH_A_fail 87 /* 0x57 Regmatch state for IFMATCH */ +#define CURLY_B_min_known 88 /* 0x58 Regmatch state for CURLY */ +#define CURLY_B_min_known_fail 89 /* 0x59 Regmatch state for CURLY */ +#define CURLY_B_min 90 /* 0x5a Regmatch state for CURLY */ +#define CURLY_B_min_fail 91 /* 0x5b Regmatch state for CURLY */ +#define CURLY_B_max 92 /* 0x5c Regmatch state for CURLY */ +#define CURLY_B_max_fail 93 /* 0x5d Regmatch state for CURLY */ +/* PL_regkind[] What type of regop or state is this. */ #ifndef DOINIT EXTCONST U8 PL_regkind[]; @@ -173,6 +178,8 @@ EXTCONST U8 PL_regkind[] = { TRIE, /* TRIEC */ TRIE, /* AHOCORASICK */ TRIE, /* AHOCORASICKC */ + RECURSE, /* RECURSE */ + RECURSE, /* SRECURSE */ NOTHING, /* OPTIMIZED */ PSEUDO, /* PSEUDO */ /* ------------ States ------------- */ @@ -204,6 +211,7 @@ EXTCONST U8 PL_regkind[] = { }; #endif +/* regarglen[] - How large is the argument part of the node (in regnodes) */ #ifdef REG_COMP_C static const U8 regarglen[] = { @@ -272,10 +280,14 @@ static const U8 regarglen[] = { EXTRA_SIZE(struct regnode_charclass), /* TRIEC */ EXTRA_SIZE(struct regnode_1), /* AHOCORASICK */ EXTRA_SIZE(struct regnode_charclass), /* AHOCORASICKC */ + EXTRA_SIZE(struct regnode_2L), /* RECURSE */ + 0, /* SRECURSE */ 0, /* OPTIMIZED */ 0, /* PSEUDO */ }; +/* reg_off_by_arg[] - Which argument holds the offset to the next node */ + static const char reg_off_by_arg[] = { 0, /* END */ 0, /* SUCCEED */ @@ -342,10 +354,14 @@ static const char reg_off_by_arg[] = { 0, /* TRIEC */ 0, /* AHOCORASICK */ 0, /* AHOCORASICKC */ + 0, /* RECURSE */ + 0, /* SRECURSE */ 0, /* OPTIMIZED */ 0, /* PSEUDO */ }; +/* reg_name[] - Opcode/state names in string form, for debugging */ + #ifdef DEBUGGING const char * reg_name[] = { "END", /* 0000 */ @@ -413,34 +429,36 @@ const char * reg_name[] = { "TRIEC", /* 0x3e */ "AHOCORASICK", /* 0x3f */ "AHOCORASICKC", /* 0x40 */ - "OPTIMIZED", /* 0x41 */ - "PSEUDO", /* 0x42 */ + "RECURSE", /* 0x41 */ + "SRECURSE", /* 0x42 */ + "OPTIMIZED", /* 0x43 */ + "PSEUDO", /* 0x44 */ /* ------------ States ------------- */ - "TRIE_next", /* 0x43 */ - "TRIE_next_fail", /* 0x44 */ - "EVAL_AB", /* 0x45 */ - "EVAL_AB_fail", /* 0x46 */ - "resume_CURLYX", /* 0x47 */ - "resume_WHILEM1", /* 0x48 */ - "resume_WHILEM2", /* 0x49 */ - "resume_WHILEM3", /* 0x4a */ - "resume_WHILEM4", /* 0x4b */ - "resume_WHILEM5", /* 0x4c */ - "resume_WHILEM6", /* 0x4d */ - "BRANCH_next", /* 0x4e */ - "BRANCH_next_fail", /* 0x4f */ - "CURLYM_A", /* 0x50 */ - "CURLYM_A_fail", /* 0x51 */ - "CURLYM_B", /* 0x52 */ - "CURLYM_B_fail", /* 0x53 */ - "IFMATCH_A", /* 0x54 */ - "IFMATCH_A_fail", /* 0x55 */ - "CURLY_B_min_known", /* 0x56 */ - "CURLY_B_min_known_fail", /* 0x57 */ - "CURLY_B_min", /* 0x58 */ - "CURLY_B_min_fail", /* 0x59 */ - "CURLY_B_max", /* 0x5a */ - "CURLY_B_max_fail", /* 0x5b */ + "TRIE_next", /* 0x45 */ + "TRIE_next_fail", /* 0x46 */ + "EVAL_AB", /* 0x47 */ + "EVAL_AB_fail", /* 0x48 */ + "resume_CURLYX", /* 0x49 */ + "resume_WHILEM1", /* 0x4a */ + "resume_WHILEM2", /* 0x4b */ + "resume_WHILEM3", /* 0x4c */ + "resume_WHILEM4", /* 0x4d */ + "resume_WHILEM5", /* 0x4e */ + "resume_WHILEM6", /* 0x4f */ + "BRANCH_next", /* 0x50 */ + "BRANCH_next_fail", /* 0x51 */ + "CURLYM_A", /* 0x52 */ + "CURLYM_A_fail", /* 0x53 */ + "CURLYM_B", /* 0x54 */ + "CURLYM_B_fail", /* 0x55 */ + "IFMATCH_A", /* 0x56 */ + "IFMATCH_A_fail", /* 0x57 */ + "CURLY_B_min_known", /* 0x58 */ + "CURLY_B_min_known_fail", /* 0x59 */ + "CURLY_B_min", /* 0x5a */ + "CURLY_B_min_fail", /* 0x5b */ + "CURLY_B_max", /* 0x5c */ + "CURLY_B_max_fail", /* 0x5d */ }; #endif /* DEBUGGING */ #else diff --git a/t/op/pat.t b/t/op/pat.t index 59499b196b..c1d8e2dc33 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -3632,7 +3632,31 @@ $brackets = qr{ }x; ok("{b{c}d" !~ m/^((??{ $brackets }))/, "bracket mismatch"); - +SKIP:{ + our @stack=(); + my @expect=qw( + stuff1 + stuff2 + <stuff1>and<stuff2> + right + <right> + <<right>> + <<<right>>> + <<stuff1>and<stuff2>><<<<right>>>> + ); + + local $_='<<<stuff1>and<stuff2>><<<<right>>>>>'; + ok(/^(<((?:(?>[^<>]+)|(?1))*)>(?{push @stack, $2 }))$/, + "Recursion should match"); + ok(@stack==@expect) + or skip("Won't test individual results as count isn't equal", + 0+@expect); + foreach my $idx (@expect) { + ok($expect[$idx] eq $stack[$idx], + "Expecting '$expect' at stack pos #$idx"); + } + +} # stress test CURLYX/WHILEM. # # This test includes varying levels of nesting, and according to @@ -3734,11 +3758,15 @@ ok("{b{c}d" !~ m/^((??{ $brackets }))/, "bracket mismatch"); } -# Keep the following test last -- it may crash perl +# Keep the following tests last -- they may crash perl ok(("a" x (2**15 - 10)) =~ /^()(a|bb)*$/, "Recursive stack cracker: #24274") or print "# Unexpected outcome: should pass or crash perl\n"; +ok((q(a)x 100) =~ /^(??{'(.)'x 100})/, + "Regexp /^(??{'(.)'x 100})/ crashes older perls") + or print "# Unexpected outcome: should pass or crash perl\n"; + # Don't forget to update this! -BEGIN{print "1..1253\n"}; +BEGIN{print "1..1264\n"}; diff --git a/t/op/re_tests b/t/op/re_tests index 3ff5a73f9e..6759f34c54 100644 --- a/t/op/re_tests +++ b/t/op/re_tests @@ -1016,3 +1016,7 @@ X(?<=foo.)[YZ] ..XfooXY.. y pos 8 ^(.)((??{"(.)(cz+)"})|.) abcd y $1-$2 a-b ^a(?>(??{q(b)}))(??{q(c)})d abcd y - - ^x(??{""})+$ x y $& x +^(<(?:[^<>]+|(?3)|(?1))*>)()(!>!>!>)$ <<!>!>!>><>>!>!>!> y $1 <<!>!>!>><>> +^(<(?:[^<>]+|(?1))*>)$ <<><<<><>>>> y $1 <<><<<><>>>> +((?2)*)([fF]o+) fooFoFoo y $1-$2 fooFo-Foo +(<(?:[^<>]+|(?R))*>) <<><<<><>>>> y $1 <<><<<><>>>> |