summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorYves Orton <demerphq@gmail.com>2006-10-04 17:45:15 +0200
committerRafael Garcia-Suarez <rgarciasuarez@gmail.com>2006-10-05 10:23:08 +0000
commit6bda09f9db748451f9bb2b0d8c798ce595a6609c (patch)
tree1ddc57ee0bf52f91d840b31da4dea86d20ede672
parent87fbace95be9589b7b2c6e7ed7bd681adeae2cf4 (diff)
downloadperl-6bda09f9db748451f9bb2b0d8c798ce595a6609c.tar.gz
Re: [PATCH] Add recursive regexes similar to PCRE
Date: Wed, 4 Oct 2006 15:45:15 +0200 Message-ID: <9b18b3110610040645s563220a2id6f235494b497e90@mail.gmail.com> Subject: Re: [PATCH] Add recursive regexes similar to PCRE From: demerphq <demerphq@gmail.com> Date: Wed, 4 Oct 2006 21:05:10 +0200 Message-ID: <9b18b3110610041205m2660eb43m1315cf4b0653db96@mail.gmail.com> p4raw-id: //depot/perl@28939
-rw-r--r--embed.fnc2
-rw-r--r--embed.h2
-rw-r--r--pod/perlre.pod95
-rw-r--r--pod/perltodo.pod67
-rw-r--r--proto.h2
-rw-r--r--regcomp.c126
-rw-r--r--regcomp.h18
-rw-r--r--regcomp.pl8
-rw-r--r--regcomp.sym4
-rw-r--r--regexec.c98
-rw-r--r--regexp.h4
-rw-r--r--regnodes.h130
-rwxr-xr-xt/op/pat.t34
-rw-r--r--t/op/re_tests4
14 files changed, 482 insertions, 112 deletions
diff --git a/embed.fnc b/embed.fnc
index 0f0fc7dd60..6723d92de1 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1312,7 +1312,7 @@ ERsn |I32 |regcurly |NN const char *
Es |regnode*|reg_node |NN struct RExC_state_t *state|U8 op
Es |regnode*|regpiece |NN struct RExC_state_t *state|NN I32 *flagp|U32 depth
Es |regnode*|reg_namedseq |NN struct RExC_state_t *state|NULLOK UV *valuep
-Es |void |reginsert |NN struct RExC_state_t *state|U8 op|NN regnode *opnd
+Es |void |reginsert |NN struct RExC_state_t *state|U8 op|NN regnode *opnd|U32 depth
Es |void |regtail |NN struct RExC_state_t *state|NN regnode *p|NN const regnode *val|U32 depth
Es |U32 |join_exact |NN struct RExC_state_t *state|NN regnode *scan|NN I32 *min|U32 flags|NULLOK regnode *val|U32 depth
EsRn |char* |regwhite |NN char *p|NN const char *e
diff --git a/embed.h b/embed.h
index 02f91a82d5..0e06d49632 100644
--- a/embed.h
+++ b/embed.h
@@ -3514,7 +3514,7 @@
#define reg_node(a,b) S_reg_node(aTHX_ a,b)
#define regpiece(a,b,c) S_regpiece(aTHX_ a,b,c)
#define reg_namedseq(a,b) S_reg_namedseq(aTHX_ a,b)
-#define reginsert(a,b,c) S_reginsert(aTHX_ a,b,c)
+#define reginsert(a,b,c,d) S_reginsert(aTHX_ a,b,c,d)
#define regtail(a,b,c,d) S_regtail(aTHX_ a,b,c,d)
#define join_exact(a,b,c,d,e,f) S_join_exact(aTHX_ a,b,c,d,e,f)
#define regwhite S_regwhite
diff --git a/pod/perlre.pod b/pod/perlre.pod
index 61720db906..0e26b11f70 100644
--- a/pod/perlre.pod
+++ b/pod/perlre.pod
@@ -674,6 +674,13 @@ The assignment to C<$^R> above is properly localized, so the old
value of C<$^R> is restored if the assertion is backtracked; compare
L<"Backtracking">.
+Due to an unfortunate implementation issue the perl code contained in these
+blocks is treated as a compile time closure, which can have seemingly bizarre
+consequences when used with lexically scoped variables inside of subroutines
+or loops. There are various workarounds for this, including simply using
+global variables instead. If you are using this construct and strange results
+occur then check for the use of lexically scoped variables.
+
For reasons of security, this construct is forbidden if the regular
expression involves run-time interpolation of variables, unless the
perilous C<use re 'eval'> pragma has been used (see L<re>), or the
@@ -702,7 +709,6 @@ or indirectly with functions such as C<split>.
=item C<(??{ code })>
X<(??{})>
X<regex, postponed> X<regexp, postponed> X<regular expression, postponed>
-X<regex, recursive> X<regexp, recursive> X<regular expression, recursive>
B<WARNING>: This extended regular expression feature is considered
highly experimental, and may be changed or deleted without notice.
@@ -712,7 +718,15 @@ used idioms.
This is a "postponed" regular subexpression. The C<code> is evaluated
at run time, at the moment this subexpression may match. The result
of evaluation is considered as a regular expression and matched as
-if it were inserted instead of this construct.
+if it were inserted instead of this construct. Note that this means
+that the contents of capture buffers defined inside an eval'ed pattern
+are not available outside of the pattern, and vice versa, there is no
+way for the inner pattern to refer to a capture buffer defined outside.
+Thus,
+
+ ('a' x 100)=~/(??{'(.)' x 100})/
+
+B<will> match, it will B<not> set $1.
The C<code> is not interpolated. As before, the rules to determine
where the C<code> ends are currently somewhat convoluted.
@@ -729,12 +743,80 @@ The following pattern matches a parenthesized group:
\)
}x;
+See also C<(?PARNO)> for a different, more efficient way to accomplish
+the same task.
+
Because perl's regex engine is not currently re-entrant, delayed
code may not invoke the regex engine either directly with C<m//> or C<s///>),
or indirectly with functions such as C<split>.
+Recursing deeper than 50 times without consuming any input string will
+result in a fatal error. The maximum depth is compiled into perl, so
+changing it requires a custom build.
+
+=item C<(?PARNO)> C<(?R)>
+
+X<(?PARNO)> X<(?1)>
+X<regex, recursive> X<regexp, recursive> X<regular expression, recursive>
+
+B<WARNING>: This extended regular expression feature is considered
+highly experimental, and may be changed or deleted without notice.
+
+Similar to C<(??{ code })> except it does not involve compiling any code,
+instead it treats the contents of a capture buffer as an independent
+pattern that must match at the current position. Capture buffers
+contained by the pattern will have the value as determined by the
+outermost recursion.
+
+PARNO is a sequence of digits not starting with 0 whose value
+reflects the paren-number of the capture buffer to recurse to.
+C<(?R)> curses to the beginning of the pattern.
+
+The following pattern matches a function foo() which may contain
+balanced parenthesis as the argument.
+
+ $re = qr{ ( # paren group 1 (full function)
+ foo
+ ( # paren group 2 (parens)
+ \(
+ ( # paren group 3 (contents of parens)
+ (?:
+ (?> [^()]+ ) # Non-parens without backtracking
+ |
+ (?2) # Recurse to start of paren group 2
+ )*
+ )
+ \)
+ )
+ )
+ }x;
+
+If the pattern was used as follows
+
+ 'foo(bar(baz)+baz(bop))'=~/$re/
+ and print "\$1 = $1\n",
+ "\$2 = $2\n",
+ "\$3 = $3\n";
+
+the output produced should be the following:
+
+ $1 = foo(bar(baz)+baz(bop))
+ $2 = (bar(baz)+baz(bop))
+ $3 = bar(baz)+baz(bop)
+
+If there is no corresponding capture buffer defined, then it is a
+fatal error. Recursing deeper than 50 times without consuming any input
+string will also result in a fatal error. The maximum depth is compiled
+into perl, so changing it requires a custom build.
+
+B<Note> that this pattern does not behave the same way as the equivalent
+PCRE or Python construct of the same form. In perl you can backtrack into
+a recursed group, in PCRE and Python the recursed into group is treated
+as atomic. Also, constructs like (?i:(?1)) or (?:(?i)(?1)) do not affect
+the pattern being recursed into.
+
=item C<< (?>pattern) >>
-X<backtrack> X<backtracking>
+X<backtrack> X<backtracking> X<atomic> X<possessive>
B<WARNING>: This extended regular expression feature is considered
highly experimental, and may be changed or deleted without notice.
@@ -827,6 +909,9 @@ one of these:
Which one you pick depends on which of these expressions better reflects
the above specification of comments.
+In some literature this construct is called "atomic matching" or
+"possessive matching".
+
=item C<(?(condition)yes-pattern|no-pattern)>
X<(?()>
@@ -1320,10 +1405,10 @@ else in the whole regular expression.)
For this grouping operator there is no need to describe the ordering, since
only whether or not C<S> can match is important.
-=item C<(??{ EXPR })>
+=item C<(??{ EXPR })>, C<(?PARNO)>
The ordering is the same as for the regular expression which is
-the result of EXPR.
+the result of EXPR, or the pattern contained by capture buffer PARNO.
=item C<(?(condition)yes-pattern|no-pattern)>
diff --git a/pod/perltodo.pod b/pod/perltodo.pod
index 6bf9d1f4cb..50a79d9429 100644
--- a/pod/perltodo.pod
+++ b/pod/perltodo.pod
@@ -628,3 +628,70 @@ Fix (or rewrite) the implementation of the C</(?{...})/> closures.
This will allow the use of a regex from inside (?{ }), (??{ }) and
(?(?{ })|) constructs.
+
+=head2 Add named capture to regexp engine
+
+Named capture is supported by .NET, PCRE and Python. Its embarrassing
+Perl doesn't support it yet.
+
+Jeffrey Friedl notes that "the most glaring omission [in perl's regexp
+engine] offered by other implementations is named capture".
+
+demerphq is working on this.
+
+=head2 Add possessive quantifiers to regexp engine
+
+Possessive quantifiers are a syntactic sugar that affords a more
+elegant way to express (?>A+). They are also provided by many other
+regex engines. Most importantly they allow various patterns to be
+optimised more efficiently than (?>...) allows, and allow various data
+driven optimisations to be implemented (such as auto-possesification of
+quantifiers followed by contrary suffixes). Common syntax for them is
+
+ ++ possessive 1 or more
+ *+ possessive 0 or more
+ {n,m}+ possessive n..m
+
+A possessive quantifier basically absorbs as much as it can and doesn't
+give any back.
+
+Jeffrey Friedl documents possessive quantifiers in Mastering Regular
+Expressions 2nd edition and explicitly pleads for them to be added to
+perl. We should oblige him, lest he leaves us out of a future edition.
+;-)
+
+demerphq has this on his todo list
+
+=head2 Add (?YES) (?NO) to regexp enigne
+
+YES/NO would allow a subpattern to be passed/failed but allow backtracking.
+Basically a more efficient (?=), (?!).
+
+demerphq has this on his todo list
+
+=head2 Add (?SUCCEED) (?FAIL) to regexp engine
+
+SUCCEED/FAIL would allow a pattern to be passed/failed but without backtracking.
+Thus you could signal that a pattern has matched or not, and return (regardless
+that there is more pattern following).
+
+demerphq has this on his todo list
+
+=head2 Add (?CUT) (?COMMIT) to regexp engine
+
+CUT would allow a pattern to say "do not backtrack beyond here".
+COMMIT would say match from here or don't, but don't try the pattern from
+another starting pattern.
+
+These correspond to the \v and \V that Jeffrey Friedl mentions in
+Mastering Regular Expressions 2nd edition.
+
+demerphq has this on his todo list
+
+=head2 Add class set operations to regexp engine
+
+Apparently these are quite useful. Anyway, Jeffery Friedl wants them.
+
+demerphq has this on his todo list, but right at the bottom.
+
+
diff --git a/proto.h b/proto.h
index 078b1d5ac9..e10c8eb909 100644
--- a/proto.h
+++ b/proto.h
@@ -3574,7 +3574,7 @@ STATIC regnode* S_regpiece(pTHX_ struct RExC_state_t *state, I32 *flagp, U32 dep
STATIC regnode* S_reg_namedseq(pTHX_ struct RExC_state_t *state, UV *valuep)
__attribute__nonnull__(pTHX_1);
-STATIC void S_reginsert(pTHX_ struct RExC_state_t *state, U8 op, regnode *opnd)
+STATIC void S_reginsert(pTHX_ struct RExC_state_t *state, U8 op, regnode *opnd, U32 depth)
__attribute__nonnull__(pTHX_1)
__attribute__nonnull__(pTHX_3);
diff --git a/regcomp.c b/regcomp.c
index 2b38a41b93..3090dbe844 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -117,8 +117,9 @@ typedef struct RExC_state_t {
I32 extralen;
I32 seen_zerolen;
I32 seen_evals;
+ regnode **parens; /* offsets of each paren */
I32 utf8;
- HV *charnames; /* cache of named sequences */
+ HV *charnames; /* cache of named sequences */
#if ADD_TO_REGEXEC
char *starttry; /* -Dr: where regtry was called. */
#define RExC_starttry (pRExC_state->starttry)
@@ -151,6 +152,7 @@ typedef struct RExC_state_t {
#define RExC_seen_evals (pRExC_state->seen_evals)
#define RExC_utf8 (pRExC_state->utf8)
#define RExC_charnames (pRExC_state->charnames)
+#define RExC_parens (pRExC_state->parens)
#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
@@ -2709,6 +2711,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
}
flags &= ~SCF_DO_STCLASS;
}
+ else if (OP(scan)==RECURSE) {
+ ARG2L_SET( scan, RExC_parens[ARG(scan)-1] - scan );
+ }
else if (strchr((const char*)PL_varies,OP(scan))) {
I32 mincount, maxcount, minnext, deltanext, fl = 0;
I32 f = flags, pos_before = 0;
@@ -3766,6 +3771,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
RExC_emit = &PL_regdummy;
RExC_whilem_seen = 0;
RExC_charnames = NULL;
+ RExC_parens= NULL;
#if 0 /* REGC() is (currently) a NOP at the first pass.
* Clever compilers notice this and complain. --jhi */
@@ -3820,8 +3826,14 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
r->substrs = 0; /* Useful during FAIL. */
r->startp = 0; /* Useful during FAIL. */
- r->endp = 0; /* Useful during FAIL. */
+ r->endp = 0;
+ if (RExC_seen & REG_SEEN_RECURSE) {
+ Newx(RExC_parens, RExC_npar,regnode *);
+ SAVEFREEPV(RExC_parens);
+ }
+
+ /* Useful during FAIL. */
Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
if (r->offsets) {
r->offsets[0] = RExC_size;
@@ -3847,6 +3859,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
r->data = 0;
if (reg(pRExC_state, 0, &flags,1) == NULL)
return(NULL);
+
/* XXXX To minimize changes to RE engine we always allocate
3-units-long substrs field. */
Newx(r->substrs, 1, struct reg_substr_data);
@@ -4242,10 +4255,6 @@ reStudy:
Newxz(r->startp, RExC_npar, I32);
Newxz(r->endp, RExC_npar, I32);
-
- if (RExC_charnames)
- SvREFCNT_dec((SV*)(RExC_charnames));
-
DEBUG_r( RX_DEBUG_on(r) );
DEBUG_DUMP_r({
PerlIO_printf(Perl_debug_log,"Final program:\n");
@@ -4312,6 +4321,10 @@ reStudy:
DEBUG_PARSE_MSG((funcname)); \
PerlIO_printf(Perl_debug_log,"%4s","\n"); \
})
+#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
+ DEBUG_PARSE_MSG((funcname)); \
+ PerlIO_printf(Perl_debug_log,fmt "\n",args); \
+})
/*
- reg - regular expression, i.e. main body or parenthesized thing
*
@@ -4399,6 +4412,41 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
nextchar(pRExC_state);
*flagp = TRYAGAIN;
return NULL;
+ case 'R' :
+ if (*RExC_parse != ')')
+ FAIL("Sequence (?R) not terminated");
+ reg_node(pRExC_state, SRECURSE);
+ break;
+ case '1': case '2': case '3': case '4': /* (?1) */
+ case '5': case '6': case '7': case '8': case '9':
+ RExC_parse--;
+ {
+ const I32 num = atoi(RExC_parse);
+ char * const parse_start = RExC_parse - 1; /* MJD */
+ while (isDIGIT(*RExC_parse))
+ RExC_parse++;
+ if (*RExC_parse!=')')
+ vFAIL("Expecting close bracket");
+ ret = reganode(pRExC_state, RECURSE, num);
+ if (!SIZE_ONLY) {
+ if (num > (I32)RExC_rx->nparens) {
+ RExC_parse++;
+ vFAIL("Reference to nonexistent group");
+ }
+ ARG2L_SET( ret, 0);
+ RExC_emit++;
+ DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
+ "Recurse #%d to %d\n", ARG(ret), ARG2L(ret)));
+ } else{
+ RExC_size++;
+ RExC_seen|=REG_SEEN_RECURSE;
+ }
+ Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
+ Set_Node_Offset(ret, RExC_parse); /* MJD */
+
+ nextchar(pRExC_state);
+ return ret;
+ }
case 'p': /* (?p...) */
if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
@@ -4612,6 +4660,12 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
parno = RExC_npar;
RExC_npar++;
ret = reganode(pRExC_state, OPEN, parno);
+ if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
+ DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, "Setting paren #%d to %d\n",
+ parno,REG_NODE_NUM(ret)));
+ RExC_parens[parno-1]= ret;
+
+ }
Set_Node_Length(ret, 1); /* MJD */
Set_Node_Offset(ret, RExC_parse); /* MJD */
is_open = 1;
@@ -4629,10 +4683,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
return(NULL);
if (*RExC_parse == '|') {
if (!SIZE_ONLY && RExC_extralen) {
- reginsert(pRExC_state, BRANCHJ, br);
+ reginsert(pRExC_state, BRANCHJ, br, depth+1);
}
else { /* MJD */
- reginsert(pRExC_state, BRANCH, br);
+ reginsert(pRExC_state, BRANCH, br, depth+1);
Set_Node_Length(br, paren != 0);
Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
}
@@ -4719,7 +4773,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
if (paren == '>')
node = SUSPEND, flag = 0;
- reginsert(pRExC_state, node,ret);
+ reginsert(pRExC_state, node,ret, depth+1);
Set_Node_Cur_Length(ret);
Set_Node_Offset(ret, parse_start + 1);
ret->flags = flag;
@@ -4880,7 +4934,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
do_curly:
if ((flags&SIMPLE)) {
RExC_naughty += 2 + RExC_naughty / 2;
- reginsert(pRExC_state, CURLY, ret);
+ reginsert(pRExC_state, CURLY, ret, depth+1);
Set_Node_Offset(ret, parse_start+1); /* MJD */
Set_Node_Cur_Length(ret);
}
@@ -4890,11 +4944,11 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
w->flags = 0;
REGTAIL(pRExC_state, ret, w);
if (!SIZE_ONLY && RExC_extralen) {
- reginsert(pRExC_state, LONGJMP,ret);
- reginsert(pRExC_state, NOTHING,ret);
+ reginsert(pRExC_state, LONGJMP,ret, depth+1);
+ reginsert(pRExC_state, NOTHING,ret, depth+1);
NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
}
- reginsert(pRExC_state, CURLYX,ret);
+ reginsert(pRExC_state, CURLYX,ret, depth+1);
/* MJD hk */
Set_Node_Offset(ret, parse_start+1);
Set_Node_Length(ret,
@@ -4928,6 +4982,10 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
*flagp = flags;
return(ret);
}
+ /* else if (OP(ret)==RECURSE) {
+ RExC_parse++;
+ vFAIL("Illegal quantifier on recursion group");
+ } */
#if 0 /* Now runtime fix should be reliable. */
@@ -4951,7 +5009,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
*flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
if (op == '*' && (flags&SIMPLE)) {
- reginsert(pRExC_state, STAR, ret);
+ reginsert(pRExC_state, STAR, ret, depth+1);
ret->flags = 0;
RExC_naughty += 4;
}
@@ -4960,7 +5018,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
goto do_curly;
}
else if (op == '+' && (flags&SIMPLE)) {
- reginsert(pRExC_state, PLUS, ret);
+ reginsert(pRExC_state, PLUS, ret, depth+1);
ret->flags = 0;
RExC_naughty += 3;
}
@@ -4982,7 +5040,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
if (*RExC_parse == '?') {
nextchar(pRExC_state);
- reginsert(pRExC_state, MINMOD, ret);
+ reginsert(pRExC_state, MINMOD, ret, depth+1);
REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
}
if (ISMULT2(RExC_parse)) {
@@ -5098,6 +5156,7 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
if (!RExC_charnames) {
/* make sure our cache is allocated */
RExC_charnames = newHV();
+ sv_2mortal((SV*)RExC_charnames);
}
/* see if we have looked this one up before */
he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
@@ -6944,6 +7003,20 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
if (SIZE_ONLY) {
SIZE_ALIGN(RExC_size);
RExC_size += 2;
+ /*
+ We can't do this:
+
+ assert(2==regarglen[op]+1);
+
+ Anything larger than this has to allocate the extra amount.
+ If we changed this to be:
+
+ RExC_size += (1 + regarglen[op]);
+
+ then it wouldn't matter. Its not clear what side effect
+ might come from that so its not done so far.
+ -- dmq
+ */
return(ret);
}
@@ -6984,24 +7057,33 @@ S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
* Means relocating the operand.
*/
STATIC void
-S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
+S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
{
dVAR;
register regnode *src;
register regnode *dst;
register regnode *place;
const int offset = regarglen[(U8)op];
+ const int size = NODE_STEP_REGNODE + offset;
GET_RE_DEBUG_FLAGS_DECL;
/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
-
+ DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]);
if (SIZE_ONLY) {
- RExC_size += NODE_STEP_REGNODE + offset;
+ RExC_size += size;
return;
}
src = RExC_emit;
- RExC_emit += NODE_STEP_REGNODE + offset;
+ RExC_emit += size;
dst = RExC_emit;
+ if (RExC_parens) {
+ int paren;
+ for ( paren=0 ; paren < RExC_npar ; paren++ ) {
+ if ( RExC_parens[paren] >= src )
+ RExC_parens[paren] += size;
+ }
+ }
+
while (src > opnd) {
StructCopy(--src, --dst, regnode);
if (RExC_offsets) { /* MJD 20010112 */
@@ -7374,8 +7456,10 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
}
else if (k == WHILEM && o->flags) /* Ordinal/of */
Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
- else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
+ else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP)
Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
+ else if (k == RECURSE)
+ Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
else if (k == LOGICAL)
Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
else if (k == ANYOF) {
diff --git a/regcomp.h b/regcomp.h
index 3213fc8e29..183420f065 100644
--- a/regcomp.h
+++ b/regcomp.h
@@ -86,6 +86,8 @@ struct regnode_string {
char string[1];
};
+/* Argument bearing node - workhorse,
+ arg1 is often for the data field */
struct regnode_1 {
U8 flags;
U8 type;
@@ -93,6 +95,16 @@ struct regnode_1 {
U32 arg1;
};
+/* Similar to a regnode_1 but with an extra signed argument */
+struct regnode_2L {
+ U8 flags;
+ U8 type;
+ U16 next_off;
+ U32 arg1;
+ I32 arg2;
+};
+
+/* 'Two field' -- Two 16 bit unsigned args */
struct regnode_2 {
U8 flags;
U8 type;
@@ -101,6 +113,7 @@ struct regnode_2 {
U16 arg2;
};
+
#define ANYOF_BITMAP_SIZE 32 /* 256 b/(8 b/B) */
#define ANYOF_CLASSBITMAP_SIZE 4 /* up to 32 (8*4) named classes */
@@ -154,10 +167,12 @@ struct regnode_charclass_class { /* has [[:blah:]] classes */
#define ARG(p) ARG_VALUE(ARG_LOC(p))
#define ARG1(p) ARG_VALUE(ARG1_LOC(p))
#define ARG2(p) ARG_VALUE(ARG2_LOC(p))
+#define ARG2L(p) ARG_VALUE(ARG2L_LOC(p))
#define ARG_SET(p, val) ARG__SET(ARG_LOC(p), (val))
#define ARG1_SET(p, val) ARG__SET(ARG1_LOC(p), (val))
#define ARG2_SET(p, val) ARG__SET(ARG2_LOC(p), (val))
+#define ARG2L_SET(p, val) ARG__SET(ARG2L_LOC(p), (val))
#undef NEXT_OFF
#undef NODE_ALIGN
@@ -190,7 +205,7 @@ struct regnode_charclass_class { /* has [[:blah:]] classes */
#define ARG_LOC(p) (((struct regnode_1 *)p)->arg1)
#define ARG1_LOC(p) (((struct regnode_2 *)p)->arg1)
#define ARG2_LOC(p) (((struct regnode_2 *)p)->arg2)
-
+#define ARG2L_LOC(p) (((struct regnode_2L *)p)->arg2)
#define NODE_STEP_REGNODE 1 /* sizeof(regnode)/sizeof(regnode) */
#define EXTRA_STEP_2ARGS EXTRA_SIZE(struct regnode_2)
@@ -328,6 +343,7 @@ struct regnode_charclass_class { /* has [[:blah:]] classes */
#define REG_SEEN_EVAL 0x00000008
#define REG_SEEN_CANY 0x00000010
#define REG_SEEN_SANY REG_SEEN_CANY /* src bckwrd cmpt */
+#define REG_SEEN_RECURSE 0x00000020
START_EXTERN_C
diff --git a/regcomp.pl b/regcomp.pl
index ed270e8967..2e84604b5f 100644
--- a/regcomp.pl
+++ b/regcomp.pl
@@ -82,6 +82,8 @@ printf OUT <<EOP,
Any changes made here will be lost!
*/
+/* Regops and State definitions */
+
#define %*s\t%d
#define %*s\t%d
@@ -101,6 +103,7 @@ while (++$ind <= $tot) {
print OUT <<EOP;
+/* PL_regkind[] What type of regop or state is this. */
#ifndef DOINIT
EXTCONST U8 PL_regkind[];
@@ -120,6 +123,7 @@ print OUT <<EOP;
};
#endif
+/* regarglen[] - How large is the argument part of the node (in regnodes) */
#ifdef REG_COMP_C
static const U8 regarglen[] = {
@@ -137,6 +141,8 @@ while (++$ind <= $lastregop) {
print OUT <<EOP;
};
+/* reg_off_by_arg[] - Which argument holds the offset to the next node */
+
static const char reg_off_by_arg[] = {
EOP
@@ -151,6 +157,8 @@ while (++$ind <= $lastregop) {
print OUT <<EOP;
};
+/* reg_name[] - Opcode/state names in string form, for debugging */
+
#ifdef DEBUGGING
const char * reg_name[] = {
EOP
diff --git a/regcomp.sym b/regcomp.sym
index bc6f8e3164..4365eb5897 100644
--- a/regcomp.sym
+++ b/regcomp.sym
@@ -153,7 +153,9 @@ TRIEC TRIE, trie charclass Same as TRIE, but with embedded charclass data
AHOCORASICK TRIE, trie 1 Aho Corasick stclass. flags==type
AHOCORASICKC TRIE, trie charclass Same as AHOCORASICK, but with embedded charclass data
-
+#*Recursion (65)
+RECURSE RECURSE, num/ofs 2L recurse to paren arg1 at (signed) ofs arg2
+SRECURSE RECURSE, no recurse to start of pattern
# NEW STUFF ABOVE THIS LINE -- Please update counts below.
diff --git a/regexec.c b/regexec.c
index f7f0d2b857..c283b2eec3 100644
--- a/regexec.c
+++ b/regexec.c
@@ -165,7 +165,7 @@ S_regcppush(pTHX_ I32 parenfloor)
if (paren_elems_to_push < 0)
Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
-#define REGCP_OTHER_ELEMS 6
+#define REGCP_OTHER_ELEMS 8
SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
for (p = PL_regsize; p > parenfloor; p--) {
/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
@@ -181,6 +181,8 @@ S_regcppush(pTHX_ I32 parenfloor)
));
}
/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
+ SSPUSHPTR(PL_regstartp);
+ SSPUSHPTR(PL_regendp);
SSPUSHINT(PL_regsize);
SSPUSHINT(*PL_reglastparen);
SSPUSHINT(*PL_reglastcloseparen);
@@ -227,7 +229,10 @@ S_regcppop(pTHX_ const regexp *rex)
*PL_reglastcloseparen = SSPOPINT;
*PL_reglastparen = SSPOPINT;
PL_regsize = SSPOPINT;
+ PL_regendp=(I32 *) SSPOPPTR;
+ PL_regstartp=(I32 *) SSPOPPTR;
+
/* Now restore the parentheses context. */
for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
i > 0; i -= REGCP_PAREN_ELEMS) {
@@ -488,7 +493,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
srch_end_shift -= ((strbeg - s) - srch_start_shift);
srch_start_shift = strbeg - s;
}
- DEBUG_OPTIMISE_r({
+ DEBUG_OPTIMISE_MORE_r({
PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
(IV)prog->check_offset_min,
(IV)srch_start_shift,
@@ -524,7 +529,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
end_point= HOP3(strend, -srch_end_shift, strbeg);
}
- DEBUG_OPTIMISE_r({
+ DEBUG_OPTIMISE_MORE_r({
PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
(int)(end_point - start_point),
(int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
@@ -719,7 +724,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
- DEBUG_OPTIMISE_r(
+ DEBUG_OPTIMISE_MORE_r(
PerlIO_printf(Perl_debug_log,
"Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
(IV)prog->check_offset_min,
@@ -1979,9 +1984,10 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
}
}
if (last == NULL) {
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
- "%sCan't trim the tail, match fails (should not happen)%s\n",
- PL_colors[4], PL_colors[5]));
+ DEBUG_EXECUTE_r(
+ PerlIO_printf(Perl_debug_log,
+ "%sCan't trim the tail, match fails (should not happen)%s\n",
+ PL_colors[4], PL_colors[5]));
goto phooey; /* Should not happen! */
}
dontbother = strend - last + prog->float_min_offset;
@@ -2063,6 +2069,7 @@ phooey:
return 0;
}
+
/*
- regtry - try match at specific point
*/
@@ -2146,16 +2153,16 @@ S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
prog->subbeg = PL_bostr;
prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
}
+ DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
prog->startp[0] = startpos - PL_bostr;
PL_reginput = startpos;
- PL_regstartp = prog->startp;
- PL_regendp = prog->endp;
PL_reglastparen = &prog->lastparen;
PL_reglastcloseparen = &prog->lastcloseparen;
prog->lastparen = 0;
prog->lastcloseparen = 0;
PL_regsize = 0;
- DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
+ PL_regstartp = prog->startp;
+ PL_regendp = prog->endp;
if (PL_reg_start_tmpl <= prog->nparens) {
PL_reg_start_tmpl = prog->nparens*3/2 + 3;
if(PL_reg_start_tmp)
@@ -2508,6 +2515,7 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
register I32 nextchr; /* is always set to UCHARAT(locinput) */
bool result = 0; /* return value of S_regmatch */
int depth = 0; /* depth of recursion */
+ int nochange_depth = 0; /* depth of RECURSE recursion with nochange*/
regmatch_state *yes_state = NULL; /* state to pop to on success of
subpattern */
regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
@@ -3325,10 +3333,39 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
#undef ST
#define ST st->u.eval
-
- case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
{
SV *ret;
+ regexp *re;
+ regnode *startpoint;
+
+ case SRECURSE:
+ case RECURSE: /* /(...(?1))/ */
+ if (cur_eval && cur_eval->locinput==locinput) {
+ if (cur_eval->u.eval.close_paren == ARG(scan))
+ Perl_croak(aTHX_ "Infinite recursion in RECURSE in regexp");
+ if ( ++nochange_depth > MAX_RECURSE_EVAL_NOCHANGE_DEPTH )
+ Perl_croak(aTHX_ "RECURSE without pos change exceeded limit in regexp");
+ } else {
+ nochange_depth = 0;
+ }
+ re = rex;
+ (void)ReREFCNT_inc(rex);
+ if (OP(scan)==RECURSE) {
+ startpoint = scan + ARG2L(scan);
+ ST.close_paren = ARG(scan);
+ } else {
+ startpoint = re->program+1;
+ ST.close_paren = 0;
+ }
+ goto eval_recurse_doit;
+ /* NOTREACHED */
+ case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
+ if (cur_eval && cur_eval->locinput==locinput) {
+ if ( ++nochange_depth > MAX_RECURSE_EVAL_NOCHANGE_DEPTH )
+ Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regexp");
+ } else {
+ nochange_depth = 0;
+ }
{
/* execute the code in the {...} */
dSP;
@@ -3362,7 +3399,7 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
}
}
if (st->logical == 2) { /* Postponed subexpression: /(??{...})/ */
- regexp *re;
+
{
/* extract RE object from returned value; compiling if
* necessary */
@@ -3399,10 +3436,29 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
PL_regsize = osize;
}
}
+ DEBUG_EXECUTE_r(
+ debug_start_match(re, do_utf8, locinput, PL_regeol,
+ "Matching embedded");
+ );
+ startpoint = re->program + 1;
+ ST.close_paren = 0; /* only used for RECURSE */
+ /* borrowed from regtry */
+ if (PL_reg_start_tmpl <= re->nparens) {
+ PL_reg_start_tmpl = re->nparens*3/2 + 3;
+ if(PL_reg_start_tmp)
+ Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
+ else
+ Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
+ }
+ eval_recurse_doit: /* Share code with RECURSE below this line */
/* run the pattern returned from (??{...}) */
ST.cp = regcppush(0); /* Save *all* the positions. */
REGCP_SET(ST.lastcp);
+
+ PL_regstartp = re->startp; /* essentially NOOP on RECURSE */
+ PL_regendp = re->endp; /* essentially NOOP on RECURSE */
+
*PL_reglastparen = 0;
*PL_reglastcloseparen = 0;
PL_reginput = locinput;
@@ -3425,13 +3481,8 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
ST.B = next;
ST.prev_eval = cur_eval;
cur_eval = st;
-
- DEBUG_EXECUTE_r(
- debug_start_match(re, do_utf8, locinput, PL_regeol,
- "Matching embedded");
- );
/* now continue from first node in postoned RE */
- PUSH_YES_STATE_GOTO(EVAL_AB, re->program + 1);
+ PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
/* NOTREACHED */
}
/* /(?(?{...})X|Y)/ */
@@ -3466,7 +3517,6 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
/* XXXX This is too dramatic a measure... */
PL_reg_maxiter = 0;
sayNO_SILENT;
-
#undef ST
case OPEN:
@@ -3482,6 +3532,9 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
if (n > (I32)*PL_reglastparen)
*PL_reglastparen = n;
*PL_reglastcloseparen = n;
+ if (cur_eval && cur_eval->u.eval.close_paren == (U32)n) {
+ goto fake_end;
+ }
break;
case GROUPP:
n = ARG(scan); /* which paren pair */
@@ -4318,6 +4371,7 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
case END:
+ fake_end:
if (cur_eval) {
/* we've just finished A in /(??{A})B/; now continue with B */
I32 tmpix;
@@ -4345,8 +4399,8 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
st->u.eval.prev_eval = cur_eval;
cur_eval = cur_eval->u.eval.prev_eval;
DEBUG_EXECUTE_r(
- PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ...\n",
- REPORT_CODE_OFF+depth*2, ""););
+ PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %x\n",
+ REPORT_CODE_OFF+depth*2, "",(int)cur_eval););
PUSH_YES_STATE_GOTO(EVAL_AB,
st->u.eval.prev_eval->u.eval.B); /* match B */
}
diff --git a/regexp.h b/regexp.h
index 934580e7cd..63e0c1af7a 100644
--- a/regexp.h
+++ b/regexp.h
@@ -96,6 +96,7 @@ typedef struct regexp_engine {
#define ROPT_CANY_SEEN 0x00000800
#define ROPT_SANY_SEEN ROPT_CANY_SEEN /* src bckwrd cmpt */
#define ROPT_GPOS_CHECK (ROPT_GPOS_SEEN|ROPT_ANCH_GPOS)
+#define ROPT_RECURSE_SEEN 0x00001000
/* 0xf800 of reganch is used by PMf_COMPILETIME */
@@ -205,6 +206,8 @@ typedef struct {
/* structures for holding and saving the state maintained by regmatch() */
+#define MAX_RECURSE_EVAL_NOCHANGE_DEPTH 50
+
typedef I32 CHECKPOINT;
typedef struct regmatch_state {
@@ -255,6 +258,7 @@ typedef struct regmatch_state {
CHECKPOINT cp; /* remember current savestack indexes */
CHECKPOINT lastcp;
regnode *B; /* the node following us */
+ U32 close_paren; /* which close bracket is our end */
} eval;
struct {
diff --git a/regnodes.h b/regnodes.h
index 01a53f99d4..ec6011166c 100644
--- a/regnodes.h
+++ b/regnodes.h
@@ -4,8 +4,10 @@
Any changes made here will be lost!
*/
-#define REGNODE_MAX 66
-#define REGMATCH_STATE_MAX 91
+/* Regops and State definitions */
+
+#define REGNODE_MAX 68
+#define REGMATCH_STATE_MAX 93
#define END 0 /* 0000 End of program. */
#define SUCCEED 1 /* 0x01 Return from a subroutine, basically. */
@@ -72,37 +74,40 @@
#define TRIEC 62 /* 0x3e Same as TRIE, but with embedded charclass data */
#define AHOCORASICK 63 /* 0x3f Aho Corasick stclass. flags==type */
#define AHOCORASICKC 64 /* 0x40 Same as AHOCORASICK, but with embedded charclass data */
-#define OPTIMIZED 65 /* 0x41 Placeholder for dump. */
-#define PSEUDO 66 /* 0x42 Pseudo opcode for internal use. */
+#define RECURSE 65 /* 0x41 recurse to paren arg1 at (signed) ofs arg2 */
+#define SRECURSE 66 /* 0x42 recurse to start of pattern */
+#define OPTIMIZED 67 /* 0x43 Placeholder for dump. */
+#define PSEUDO 68 /* 0x44 Pseudo opcode for internal use. */
/* ------------ States ------------- */
-#define TRIE_next 67 /* 0x43 Regmatch state for TRIE */
-#define TRIE_next_fail 68 /* 0x44 Regmatch state for TRIE */
-#define EVAL_AB 69 /* 0x45 Regmatch state for EVAL */
-#define EVAL_AB_fail 70 /* 0x46 Regmatch state for EVAL */
-#define resume_CURLYX 71 /* 0x47 Regmatch state for CURLYX */
-#define resume_WHILEM1 72 /* 0x48 Regmatch state for WHILEM */
-#define resume_WHILEM2 73 /* 0x49 Regmatch state for WHILEM */
-#define resume_WHILEM3 74 /* 0x4a Regmatch state for WHILEM */
-#define resume_WHILEM4 75 /* 0x4b Regmatch state for WHILEM */
-#define resume_WHILEM5 76 /* 0x4c Regmatch state for WHILEM */
-#define resume_WHILEM6 77 /* 0x4d Regmatch state for WHILEM */
-#define BRANCH_next 78 /* 0x4e Regmatch state for BRANCH */
-#define BRANCH_next_fail 79 /* 0x4f Regmatch state for BRANCH */
-#define CURLYM_A 80 /* 0x50 Regmatch state for CURLYM */
-#define CURLYM_A_fail 81 /* 0x51 Regmatch state for CURLYM */
-#define CURLYM_B 82 /* 0x52 Regmatch state for CURLYM */
-#define CURLYM_B_fail 83 /* 0x53 Regmatch state for CURLYM */
-#define IFMATCH_A 84 /* 0x54 Regmatch state for IFMATCH */
-#define IFMATCH_A_fail 85 /* 0x55 Regmatch state for IFMATCH */
-#define CURLY_B_min_known 86 /* 0x56 Regmatch state for CURLY */
-#define CURLY_B_min_known_fail 87 /* 0x57 Regmatch state for CURLY */
-#define CURLY_B_min 88 /* 0x58 Regmatch state for CURLY */
-#define CURLY_B_min_fail 89 /* 0x59 Regmatch state for CURLY */
-#define CURLY_B_max 90 /* 0x5a Regmatch state for CURLY */
-#define CURLY_B_max_fail 91 /* 0x5b Regmatch state for CURLY */
+#define TRIE_next 69 /* 0x45 Regmatch state for TRIE */
+#define TRIE_next_fail 70 /* 0x46 Regmatch state for TRIE */
+#define EVAL_AB 71 /* 0x47 Regmatch state for EVAL */
+#define EVAL_AB_fail 72 /* 0x48 Regmatch state for EVAL */
+#define resume_CURLYX 73 /* 0x49 Regmatch state for CURLYX */
+#define resume_WHILEM1 74 /* 0x4a Regmatch state for WHILEM */
+#define resume_WHILEM2 75 /* 0x4b Regmatch state for WHILEM */
+#define resume_WHILEM3 76 /* 0x4c Regmatch state for WHILEM */
+#define resume_WHILEM4 77 /* 0x4d Regmatch state for WHILEM */
+#define resume_WHILEM5 78 /* 0x4e Regmatch state for WHILEM */
+#define resume_WHILEM6 79 /* 0x4f Regmatch state for WHILEM */
+#define BRANCH_next 80 /* 0x50 Regmatch state for BRANCH */
+#define BRANCH_next_fail 81 /* 0x51 Regmatch state for BRANCH */
+#define CURLYM_A 82 /* 0x52 Regmatch state for CURLYM */
+#define CURLYM_A_fail 83 /* 0x53 Regmatch state for CURLYM */
+#define CURLYM_B 84 /* 0x54 Regmatch state for CURLYM */
+#define CURLYM_B_fail 85 /* 0x55 Regmatch state for CURLYM */
+#define IFMATCH_A 86 /* 0x56 Regmatch state for IFMATCH */
+#define IFMATCH_A_fail 87 /* 0x57 Regmatch state for IFMATCH */
+#define CURLY_B_min_known 88 /* 0x58 Regmatch state for CURLY */
+#define CURLY_B_min_known_fail 89 /* 0x59 Regmatch state for CURLY */
+#define CURLY_B_min 90 /* 0x5a Regmatch state for CURLY */
+#define CURLY_B_min_fail 91 /* 0x5b Regmatch state for CURLY */
+#define CURLY_B_max 92 /* 0x5c Regmatch state for CURLY */
+#define CURLY_B_max_fail 93 /* 0x5d Regmatch state for CURLY */
+/* PL_regkind[] What type of regop or state is this. */
#ifndef DOINIT
EXTCONST U8 PL_regkind[];
@@ -173,6 +178,8 @@ EXTCONST U8 PL_regkind[] = {
TRIE, /* TRIEC */
TRIE, /* AHOCORASICK */
TRIE, /* AHOCORASICKC */
+ RECURSE, /* RECURSE */
+ RECURSE, /* SRECURSE */
NOTHING, /* OPTIMIZED */
PSEUDO, /* PSEUDO */
/* ------------ States ------------- */
@@ -204,6 +211,7 @@ EXTCONST U8 PL_regkind[] = {
};
#endif
+/* regarglen[] - How large is the argument part of the node (in regnodes) */
#ifdef REG_COMP_C
static const U8 regarglen[] = {
@@ -272,10 +280,14 @@ static const U8 regarglen[] = {
EXTRA_SIZE(struct regnode_charclass), /* TRIEC */
EXTRA_SIZE(struct regnode_1), /* AHOCORASICK */
EXTRA_SIZE(struct regnode_charclass), /* AHOCORASICKC */
+ EXTRA_SIZE(struct regnode_2L), /* RECURSE */
+ 0, /* SRECURSE */
0, /* OPTIMIZED */
0, /* PSEUDO */
};
+/* reg_off_by_arg[] - Which argument holds the offset to the next node */
+
static const char reg_off_by_arg[] = {
0, /* END */
0, /* SUCCEED */
@@ -342,10 +354,14 @@ static const char reg_off_by_arg[] = {
0, /* TRIEC */
0, /* AHOCORASICK */
0, /* AHOCORASICKC */
+ 0, /* RECURSE */
+ 0, /* SRECURSE */
0, /* OPTIMIZED */
0, /* PSEUDO */
};
+/* reg_name[] - Opcode/state names in string form, for debugging */
+
#ifdef DEBUGGING
const char * reg_name[] = {
"END", /* 0000 */
@@ -413,34 +429,36 @@ const char * reg_name[] = {
"TRIEC", /* 0x3e */
"AHOCORASICK", /* 0x3f */
"AHOCORASICKC", /* 0x40 */
- "OPTIMIZED", /* 0x41 */
- "PSEUDO", /* 0x42 */
+ "RECURSE", /* 0x41 */
+ "SRECURSE", /* 0x42 */
+ "OPTIMIZED", /* 0x43 */
+ "PSEUDO", /* 0x44 */
/* ------------ States ------------- */
- "TRIE_next", /* 0x43 */
- "TRIE_next_fail", /* 0x44 */
- "EVAL_AB", /* 0x45 */
- "EVAL_AB_fail", /* 0x46 */
- "resume_CURLYX", /* 0x47 */
- "resume_WHILEM1", /* 0x48 */
- "resume_WHILEM2", /* 0x49 */
- "resume_WHILEM3", /* 0x4a */
- "resume_WHILEM4", /* 0x4b */
- "resume_WHILEM5", /* 0x4c */
- "resume_WHILEM6", /* 0x4d */
- "BRANCH_next", /* 0x4e */
- "BRANCH_next_fail", /* 0x4f */
- "CURLYM_A", /* 0x50 */
- "CURLYM_A_fail", /* 0x51 */
- "CURLYM_B", /* 0x52 */
- "CURLYM_B_fail", /* 0x53 */
- "IFMATCH_A", /* 0x54 */
- "IFMATCH_A_fail", /* 0x55 */
- "CURLY_B_min_known", /* 0x56 */
- "CURLY_B_min_known_fail", /* 0x57 */
- "CURLY_B_min", /* 0x58 */
- "CURLY_B_min_fail", /* 0x59 */
- "CURLY_B_max", /* 0x5a */
- "CURLY_B_max_fail", /* 0x5b */
+ "TRIE_next", /* 0x45 */
+ "TRIE_next_fail", /* 0x46 */
+ "EVAL_AB", /* 0x47 */
+ "EVAL_AB_fail", /* 0x48 */
+ "resume_CURLYX", /* 0x49 */
+ "resume_WHILEM1", /* 0x4a */
+ "resume_WHILEM2", /* 0x4b */
+ "resume_WHILEM3", /* 0x4c */
+ "resume_WHILEM4", /* 0x4d */
+ "resume_WHILEM5", /* 0x4e */
+ "resume_WHILEM6", /* 0x4f */
+ "BRANCH_next", /* 0x50 */
+ "BRANCH_next_fail", /* 0x51 */
+ "CURLYM_A", /* 0x52 */
+ "CURLYM_A_fail", /* 0x53 */
+ "CURLYM_B", /* 0x54 */
+ "CURLYM_B_fail", /* 0x55 */
+ "IFMATCH_A", /* 0x56 */
+ "IFMATCH_A_fail", /* 0x57 */
+ "CURLY_B_min_known", /* 0x58 */
+ "CURLY_B_min_known_fail", /* 0x59 */
+ "CURLY_B_min", /* 0x5a */
+ "CURLY_B_min_fail", /* 0x5b */
+ "CURLY_B_max", /* 0x5c */
+ "CURLY_B_max_fail", /* 0x5d */
};
#endif /* DEBUGGING */
#else
diff --git a/t/op/pat.t b/t/op/pat.t
index 59499b196b..c1d8e2dc33 100755
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -3632,7 +3632,31 @@ $brackets = qr{
}x;
ok("{b{c}d" !~ m/^((??{ $brackets }))/, "bracket mismatch");
-
+SKIP:{
+ our @stack=();
+ my @expect=qw(
+ stuff1
+ stuff2
+ <stuff1>and<stuff2>
+ right
+ <right>
+ <<right>>
+ <<<right>>>
+ <<stuff1>and<stuff2>><<<<right>>>>
+ );
+
+ local $_='<<<stuff1>and<stuff2>><<<<right>>>>>';
+ ok(/^(<((?:(?>[^<>]+)|(?1))*)>(?{push @stack, $2 }))$/,
+ "Recursion should match");
+ ok(@stack==@expect)
+ or skip("Won't test individual results as count isn't equal",
+ 0+@expect);
+ foreach my $idx (@expect) {
+ ok($expect[$idx] eq $stack[$idx],
+ "Expecting '$expect' at stack pos #$idx");
+ }
+
+}
# stress test CURLYX/WHILEM.
#
# This test includes varying levels of nesting, and according to
@@ -3734,11 +3758,15 @@ ok("{b{c}d" !~ m/^((??{ $brackets }))/, "bracket mismatch");
}
-# Keep the following test last -- it may crash perl
+# Keep the following tests last -- they may crash perl
ok(("a" x (2**15 - 10)) =~ /^()(a|bb)*$/, "Recursive stack cracker: #24274")
or print "# Unexpected outcome: should pass or crash perl\n";
+ok((q(a)x 100) =~ /^(??{'(.)'x 100})/,
+ "Regexp /^(??{'(.)'x 100})/ crashes older perls")
+ or print "# Unexpected outcome: should pass or crash perl\n";
+
# Don't forget to update this!
-BEGIN{print "1..1253\n"};
+BEGIN{print "1..1264\n"};
diff --git a/t/op/re_tests b/t/op/re_tests
index 3ff5a73f9e..6759f34c54 100644
--- a/t/op/re_tests
+++ b/t/op/re_tests
@@ -1016,3 +1016,7 @@ X(?<=foo.)[YZ] ..XfooXY.. y pos 8
^(.)((??{"(.)(cz+)"})|.) abcd y $1-$2 a-b
^a(?>(??{q(b)}))(??{q(c)})d abcd y - -
^x(??{""})+$ x y $& x
+^(<(?:[^<>]+|(?3)|(?1))*>)()(!>!>!>)$ <<!>!>!>><>>!>!>!> y $1 <<!>!>!>><>>
+^(<(?:[^<>]+|(?1))*>)$ <<><<<><>>>> y $1 <<><<<><>>>>
+((?2)*)([fF]o+) fooFoFoo y $1-$2 fooFo-Foo
+(<(?:[^<>]+|(?R))*>) <<><<<><>>>> y $1 <<><<<><>>>>