summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorYves Orton <demerphq@gmail.com>2006-10-30 21:15:13 +0200
committerRafael Garcia-Suarez <rgarciasuarez@gmail.com>2006-10-30 17:36:18 +0000
commit40d049e43280582bb162c511cc362c246f19862a (patch)
treec9814e23bdc818ced498f5ecc2141867f3525af8
parentee4d86fe475b15ddee109f44751faf957c1e66af (diff)
downloadperl-40d049e43280582bb162c511cc362c246f19862a.tar.gz
The first patch from:
Subject: [PATCH] regex engine optimiser should grok subroutine patterns, and, name subroutine regops more intuitively Message-ID: <9b18b3110610300915x3abf6cddu9c2071a70bea48e1@mail.gmail.com> p4raw-id: //depot/perl@29161
-rw-r--r--embed.fnc2
-rw-r--r--embed.h2
-rw-r--r--proto.h2
-rw-r--r--regcomp.c276
-rw-r--r--regcomp.sym7
-rw-r--r--regexec.c14
-rw-r--r--regnodes.h6
7 files changed, 234 insertions, 75 deletions
diff --git a/embed.fnc b/embed.fnc
index ea5450dc4d..d3a1e27646 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1331,6 +1331,8 @@ Esn |void |cl_or |NN const struct RExC_state_t* state|NN struct regnode_charcla
Es |I32 |study_chunk |NN struct RExC_state_t* state|NN regnode **scanp \
|NN I32 *minlenp|NN I32 *deltap \
|NN regnode *last|NULLOK struct scan_data_t *data \
+ |I32 stopparen|NULLOK U8* recursed \
+ |NULLOK struct regnode_charclass_class *and_withp \
|U32 flags|U32 depth
EsRn |I32 |add_data |NN struct RExC_state_t* state|I32 n|NN const char *s
rs |void |re_croak2 |NN const char* pat1|NN const char* pat2|...
diff --git a/embed.h b/embed.h
index bc884d7958..e6b1af3100 100644
--- a/embed.h
+++ b/embed.h
@@ -3531,7 +3531,7 @@
#define cl_init_zero S_cl_init_zero
#define cl_and S_cl_and
#define cl_or S_cl_or
-#define study_chunk(a,b,c,d,e,f,g,h) S_study_chunk(aTHX_ a,b,c,d,e,f,g,h)
+#define study_chunk(a,b,c,d,e,f,g,h,i,j,k) S_study_chunk(aTHX_ a,b,c,d,e,f,g,h,i,j,k)
#define add_data S_add_data
#endif
#ifdef PERL_CORE
diff --git a/proto.h b/proto.h
index 520c86e19d..951ee4d74e 100644
--- a/proto.h
+++ b/proto.h
@@ -3632,7 +3632,7 @@ STATIC void S_cl_or(const struct RExC_state_t* state, struct regnode_charclass_c
__attribute__nonnull__(2)
__attribute__nonnull__(3);
-STATIC I32 S_study_chunk(pTHX_ struct RExC_state_t* state, regnode **scanp, I32 *minlenp, I32 *deltap, regnode *last, struct scan_data_t *data, U32 flags, U32 depth)
+STATIC I32 S_study_chunk(pTHX_ struct RExC_state_t* state, regnode **scanp, I32 *minlenp, I32 *deltap, regnode *last, struct scan_data_t *data, I32 stopparen, U8* recursed, struct regnode_charclass_class *and_withp, U32 flags, U32 depth)
__attribute__nonnull__(pTHX_1)
__attribute__nonnull__(pTHX_2)
__attribute__nonnull__(pTHX_3)
diff --git a/regcomp.c b/regcomp.c
index be8be1ba7d..8534b0c412 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -117,10 +117,14 @@ typedef struct RExC_state_t {
I32 extralen;
I32 seen_zerolen;
I32 seen_evals;
- regnode **parens; /* offsets of each paren */
+ regnode **open_parens; /* pointers to open parens */
+ regnode **close_parens; /* pointers to close parens */
+ regnode *opend; /* END node in program */
I32 utf8;
HV *charnames; /* cache of named sequences */
HV *paren_names; /* Paren names */
+ regnode **recurse; /* Recurse regops */
+ I32 recurse_count; /* Number of recurse regops */
#if ADD_TO_REGEXEC
char *starttry; /* -Dr: where regtry was called. */
#define RExC_starttry (pRExC_state->starttry)
@@ -153,8 +157,12 @@ typedef struct RExC_state_t {
#define RExC_seen_evals (pRExC_state->seen_evals)
#define RExC_utf8 (pRExC_state->utf8)
#define RExC_charnames (pRExC_state->charnames)
-#define RExC_parens (pRExC_state->parens)
+#define RExC_open_parens (pRExC_state->open_parens)
+#define RExC_close_parens (pRExC_state->close_parens)
+#define RExC_opend (pRExC_state->opend)
#define RExC_paren_names (pRExC_state->paren_names)
+#define RExC_recurse (pRExC_state->recurse)
+#define RExC_recurse_count (pRExC_state->recurse_count)
#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
@@ -182,6 +190,14 @@ typedef struct RExC_state_t {
#endif
+
+#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
+#define PBITVAL(paren) (1 << ((paren) & 7))
+#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
+#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
+#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
+
+
/* About scan_data_t.
During optimisation we recurse through the regexp program performing
@@ -668,6 +684,8 @@ STATIC void
S_cl_and(struct regnode_charclass_class *cl,
const struct regnode_charclass_class *and_with)
{
+
+ assert(and_with->type == ANYOF);
if (!(and_with->flags & ANYOF_CLASS)
&& !(cl->flags & ANYOF_CLASS)
&& (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
@@ -2261,15 +2279,27 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags
/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
to the position after last scanned or to NULL. */
-
+#define INIT_AND_WITHP \
+ assert(!and_withp); \
+ Newx(and_withp,1,struct regnode_charclass_class); \
+ SAVEFREEPV(and_withp)
STATIC I32
-S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
+S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
I32 *minlenp, I32 *deltap,
- regnode *last, scan_data_t *data, U32 flags, U32 depth)
+ regnode *last,
+ scan_data_t *data,
+ I32 stopparen,
+ U8* recursed,
+ struct regnode_charclass_class *and_withp,
+ U32 flags, U32 depth)
/* scanp: Start here (read-write). */
/* deltap: Write maxlen-minlen here. */
/* last: Stop before this one. */
+ /* data: string data about the pattern */
+ /* stopparen: treat close N as END */
+ /* recursed: which subroutines have we recursed into */
+ /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
{
dVAR;
I32 min = 0, pars = 0, code;
@@ -2279,17 +2309,15 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
int is_inf_internal = 0; /* The studied chunk is infinite */
I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
scan_data_t data_fake;
- struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
SV *re_trie_maxbuff = NULL;
regnode *first_non_open = scan;
-
-
GET_RE_DEBUG_FLAGS_DECL;
#ifdef DEBUGGING
- StructCopy(&zero_scan_data, &data_fake, scan_data_t);
+ StructCopy(&zero_scan_data, &data_fake, scan_data_t);
#endif
+
if ( depth == 0 ) {
- while (first_non_open && OP(first_non_open) == OPEN)
+ while (first_non_open && OP(first_non_open) == OPEN)
first_non_open=regnext(first_non_open);
}
@@ -2352,7 +2380,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
num++;
data_fake.flags = 0;
- if (data) {
+ if (data) {
data_fake.whilem_c = data->whilem_c;
data_fake.last_closep = data->last_closep;
}
@@ -2372,7 +2400,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
/* we suppose the run is continuous, last=next...*/
minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
- next, &data_fake, f,depth+1);
+ next, &data_fake,
+ stopparen, recursed, NULL, f,depth+1);
if (min1 > minnext)
min1 = minnext;
if (max1 < minnext + deltanext)
@@ -2405,7 +2434,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
if (flags & SCF_DO_STCLASS_OR) {
cl_or(pRExC_state, data->start_class, &accum);
if (min1) {
- cl_and(data->start_class, &and_with);
+ cl_and(data->start_class, and_withp);
flags &= ~SCF_DO_STCLASS;
}
}
@@ -2417,7 +2446,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
else {
/* Switch to OR mode: cache the old value of
* data->start_class */
- StructCopy(data->start_class, &and_with,
+ INIT_AND_WITHP;
+ StructCopy(data->start_class, and_withp,
struct regnode_charclass_class);
flags &= ~SCF_DO_STCLASS_AND;
StructCopy(&accum, data->start_class,
@@ -2683,7 +2713,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
else
data->start_class->flags |= ANYOF_UNICODE_ALL;
data->start_class->flags &= ~ANYOF_EOS;
- cl_and(data->start_class, &and_with);
+ cl_and(data->start_class, and_withp);
}
flags &= ~SCF_DO_STCLASS;
}
@@ -2731,7 +2761,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
ANYOF_BITMAP_SET(data->start_class, uc);
data->start_class->flags &= ~ANYOF_EOS;
}
- cl_and(data->start_class, &and_with);
+ cl_and(data->start_class, and_withp);
}
flags &= ~SCF_DO_STCLASS;
}
@@ -2778,8 +2808,15 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
}
goto optimize_curly_tail;
case CURLY:
- mincount = ARG1(scan);
- maxcount = ARG2(scan);
+ if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
+ && (scan->flags == stopparen))
+ {
+ mincount = 1;
+ maxcount = 1;
+ } else {
+ mincount = ARG1(scan);
+ maxcount = ARG2(scan);
+ }
next = regnext(scan);
if (OP(scan) == CURLYX) {
I32 lp = (data ? *(data->last_closep) : 0);
@@ -2815,7 +2852,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
f &= ~SCF_WHILEM_VISITED_POS;
/* This will finish on WHILEM, setting scan, or on NULL: */
- minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, last, data,
+ minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
+ last, data, stopparen, recursed, NULL,
(mincount == 0
? (f & ~SCF_DO_SUBSTR) : f),depth+1);
@@ -2828,7 +2866,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
else if (flags & SCF_DO_STCLASS_AND) {
/* Switch to OR mode: cache the old value of
* data->start_class */
- StructCopy(data->start_class, &and_with,
+ INIT_AND_WITHP;
+ StructCopy(data->start_class, and_withp,
struct regnode_charclass_class);
flags &= ~SCF_DO_STCLASS_AND;
StructCopy(&this_class, data->start_class,
@@ -2839,7 +2878,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
} else { /* Non-zero len */
if (flags & SCF_DO_STCLASS_OR) {
cl_or(pRExC_state, data->start_class, &this_class);
- cl_and(data->start_class, &and_with);
+ cl_and(data->start_class, and_withp);
}
else if (flags & SCF_DO_STCLASS_AND)
cl_and(data->start_class, &this_class);
@@ -2889,10 +2928,15 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
nxt = regnext(nxt);
if (OP(nxt) != CLOSE)
goto nogo;
+ if (RExC_open_parens) {
+ RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
+ RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
+ }
/* Now we know that nxt2 is the only contents: */
oscan->flags = (U8)ARG(nxt);
OP(oscan) = CURLYN;
OP(nxt1) = NOTHING; /* was OPEN. */
+
#ifdef DEBUGGING
OP(nxt1 + 1) = OPTIMIZED; /* was count. */
NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
@@ -2929,8 +2973,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
if (OP(nxt) != CLOSE)
FAIL("Panic opt close");
oscan->flags = (U8)ARG(nxt);
+ if (RExC_open_parens) {
+ RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
+ RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
+ }
OP(nxt1) = OPTIMIZED; /* was OPEN. */
OP(nxt) = OPTIMIZED; /* was CLOSE. */
+
#ifdef DEBUGGING
OP(nxt1 + 1) = OPTIMIZED; /* was count. */
OP(nxt + 1) = OPTIMIZED; /* was count. */
@@ -2954,7 +3003,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
#endif
/* Optimize again: */
study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
- NULL, 0,depth+1);
+ NULL, stopparen, recursed, NULL, 0,depth+1);
}
else
oscan->flags = 0;
@@ -3286,7 +3335,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
break;
}
if (flags & SCF_DO_STCLASS_OR)
- cl_and(data->start_class, &and_with);
+ cl_and(data->start_class, and_withp);
flags &= ~SCF_DO_STCLASS;
}
}
@@ -3328,7 +3377,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
f |= SCF_WHILEM_VISITED_POS;
next = regnext(scan);
nscan = NEXTOPER(NEXTOPER(scan));
- minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext, last, &data_fake, f,depth+1);
+ minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
+ last, &data_fake, stopparen, recursed, NULL, f, depth+1);
if (scan->flags) {
if (deltanext) {
vFAIL("Variable length lookbehind not implemented");
@@ -3400,8 +3450,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
f |= SCF_WHILEM_VISITED_POS;
next = regnext(scan);
nscan = NEXTOPER(NEXTOPER(scan));
-
- *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext, last, &data_fake, f,depth+1);
+
+ *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
+ last, &data_fake, stopparen, recursed, NULL, f,depth+1);
if (scan->flags) {
if (deltanext) {
vFAIL("Variable length lookbehind not implemented");
@@ -3411,17 +3462,16 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
}
scan->flags = (U8)*minnextp;
}
-
+
*minnextp += min;
-
-
+
if (f & SCF_DO_STCLASS_AND) {
const int was = (data->start_class->flags & ANYOF_EOS);
cl_and(data->start_class, &intrnl);
if (was)
data->start_class->flags |= ANYOF_EOS;
- }
+ }
if (data) {
if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
pars++;
@@ -3455,9 +3505,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
#endif
}
else if (OP(scan) == OPEN) {
- pars++;
+ if (stopparen != (I32)ARG(scan))
+ pars++;
}
else if (OP(scan) == CLOSE) {
+ if (stopparen == (I32)ARG(scan)) {
+ break;
+ }
if ((I32)ARG(scan) == is_par) {
next = regnext(scan);
@@ -3467,16 +3521,73 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
if (data)
*(data->last_closep) = ARG(scan);
}
+ else if (OP(scan) == RECURSE || OP(scan) == SRECURSE) {
+ /* set the pointer */
+ I32 paren;
+ regnode *start;
+ regnode *end;
+ if (OP(scan) == RECURSE) {
+ paren = ARG(scan);
+ RExC_recurse[ARG2L(scan)] = scan;
+ start = RExC_open_parens[paren-1];
+ end = RExC_close_parens[paren-1];
+ } else {
+ paren = 0;
+ start = RExC_rx->program + 1;
+ end = RExC_opend;
+ }
+ assert(start);
+ assert(end);
+ if (!recursed) {
+ Newxz(recursed, (((RExC_npar)>>3) +1), U8);
+ SAVEFREEPV(recursed);
+ }
+ if (!PAREN_TEST(recursed,paren+1)) {
+ I32 deltanext = 0;
+ PAREN_SET(recursed,paren+1);
+
+ DEBUG_PEEP("goto",start,depth);
+ min += study_chunk(
+ pRExC_state,
+ &start,
+ minlenp,
+ &deltanext,
+ end+1,
+ data,
+ paren,
+ recursed,
+ and_withp,
+ flags,depth+1);
+ delta+=deltanext;
+ if (deltanext == I32_MAX) {
+ is_inf = is_inf_internal = 1;
+ delta=deltanext;
+ }
+ DEBUG_PEEP("rtrn",end,depth);
+ PAREN_UNSET(recursed,paren+1);
+ } else {
+ if (flags & SCF_DO_SUBSTR) {
+ scan_commit(pRExC_state,data,minlenp);
+ data->longest = &(data->longest_float);
+ }
+ is_inf = is_inf_internal = 1;
+ if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
+ cl_anything(pRExC_state, data->start_class);
+ flags &= ~SCF_DO_STCLASS;
+ }
+ }
else if (OP(scan) == EVAL) {
if (data)
data->flags |= SF_HAS_EVAL;
}
- else if ( (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
- || OP(scan)==RECURSE) /* recursion */
- {
- if (OP(scan)==RECURSE) {
- ARG2L_SET( scan, RExC_parens[ARG(scan)-1] - scan );
- }
+ else if ( OP(scan)==OPFAIL ) {
+ if (flags & SCF_DO_SUBSTR) {
+ scan_commit(pRExC_state,data,minlenp);
+ flags &= ~SCF_DO_SUBSTR;
+ }
+ }
+ else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
+ {
if (flags & SCF_DO_SUBSTR) {
scan_commit(pRExC_state,data,minlenp);
data->longest = &(data->longest_float);
@@ -3487,10 +3598,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
flags &= ~SCF_DO_STCLASS;
}
#ifdef TRIE_STUDY_OPT
-#ifdef FULL_TRIE_STUDY
+#ifdef FULL_TRIE_STUDY
else if (PL_regkind[OP(scan)] == TRIE) {
/* NOTE - There is similar code to this block above for handling
- BRANCH nodes on the initial study. If you change stuff here
+ BRANCH nodes on the initial study. If you change stuff here
check there too. */
regnode *trie_node= scan;
regnode *tail= regnext(scan);
@@ -3539,8 +3650,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
it. Note this means we need the vestigal unused branches
even though they arent otherwise used.
*/
- minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
- (regnode *)nextbranch, &data_fake, f,depth+1);
+ minnext = study_chunk(pRExC_state, &scan, minlenp,
+ &deltanext, (regnode *)nextbranch, &data_fake,
+ stopparen, recursed, NULL, f,depth+1);
}
if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
nextbranch= regnext((regnode*)nextbranch);
@@ -3575,7 +3687,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
if (flags & SCF_DO_STCLASS_OR) {
cl_or(pRExC_state, data->start_class, &accum);
if (min1) {
- cl_and(data->start_class, &and_with);
+ cl_and(data->start_class, and_withp);
flags &= ~SCF_DO_STCLASS;
}
}
@@ -3587,7 +3699,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
else {
/* Switch to OR mode: cache the old value of
* data->start_class */
- StructCopy(data->start_class, &and_with,
+ INIT_AND_WITHP;
+ StructCopy(data->start_class, and_withp,
struct regnode_charclass_class);
flags &= ~SCF_DO_STCLASS_AND;
StructCopy(&accum, data->start_class,
@@ -3639,7 +3752,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
data->flags &= ~SF_IN_PAR;
}
if (flags & SCF_DO_STCLASS_OR)
- cl_and(data->start_class, &and_with);
+ cl_and(data->start_class, and_withp);
if (flags & SCF_TRIE_RESTUDY)
data->flags |= SCF_TRIE_RESTUDY;
@@ -3806,8 +3919,12 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
RExC_emit = &PL_regdummy;
RExC_whilem_seen = 0;
RExC_charnames = NULL;
- RExC_parens = NULL;
+ RExC_open_parens = NULL;
+ RExC_close_parens = NULL;
+ RExC_opend = NULL;
RExC_paren_names = NULL;
+ RExC_recurse = NULL;
+ RExC_recurse_count = 0;
#if 0 /* REGC() is (currently) a NOP at the first pass.
* Clever compilers notice this and complain. --jhi */
@@ -3865,8 +3982,10 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
r->paren_names = 0;
if (RExC_seen & REG_SEEN_RECURSE) {
- Newxz(RExC_parens, RExC_npar,regnode *);
- SAVEFREEPV(RExC_parens);
+ Newxz(RExC_open_parens, RExC_npar,regnode *);
+ SAVEFREEPV(RExC_open_parens);
+ Newxz(RExC_close_parens,RExC_npar,regnode *);
+ SAVEFREEPV(RExC_close_parens);
}
/* Useful during FAIL. */
@@ -3899,11 +4018,14 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
/* XXXX To minimize changes to RE engine we always allocate
3-units-long substrs field. */
Newx(r->substrs, 1, struct reg_substr_data);
+ if (RExC_recurse_count) {
+ Newxz(RExC_recurse,RExC_recurse_count,regnode *);
+ SAVEFREEPV(RExC_recurse);
+ }
reStudy:
r->minlen = minlen = sawplus = sawopen = 0;
Zero(r->substrs, 1, struct reg_substr_data);
- StructCopy(&zero_scan_data, &data, scan_data_t);
#ifdef TRIE_STUDY_OPT
if ( restudied ) {
@@ -3914,9 +4036,13 @@ reStudy:
SvREFCNT_dec(data.longest_float);
SvREFCNT_dec(data.last_found);
}
+ StructCopy(&zero_scan_data, &data, scan_data_t);
} else {
+ StructCopy(&zero_scan_data, &data, scan_data_t);
copyRExC_state=RExC_state;
}
+#else
+ StructCopy(&zero_scan_data, &data, scan_data_t);
#endif
/* Dig out information for optimizations. */
@@ -4073,7 +4199,8 @@ reStudy:
data.last_closep = &last_close;
minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
- &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
+ &data, -1, NULL, NULL,
+ SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
CHECK_RESTUDY_GOTO;
@@ -4249,7 +4376,7 @@ reStudy:
data.last_closep = &last_close;
minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
- &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
+ &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
CHECK_RESTUDY_GOTO;
@@ -4293,6 +4420,12 @@ reStudy:
else
r->paren_names = NULL;
+ if (RExC_recurse_count) {
+ for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
+ const regnode *scan = RExC_recurse[RExC_recurse_count-1];
+ ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
+ }
+ }
Newxz(r->startp, RExC_npar, I32);
Newxz(r->endp, RExC_npar, I32);
@@ -4645,7 +4778,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
RExC_parse++;
vFAIL("Reference to nonexistent group");
}
- ARG2L_SET( ret, 0);
+ ARG2L_SET( ret, RExC_recurse_count++);
RExC_emit++;
DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
"Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
@@ -4930,9 +5063,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
ret = reganode(pRExC_state, OPEN, parno);
if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
- "Setting paren #%"IVdf" to %d\n", (IV)parno, REG_NODE_NUM(ret)));
- RExC_parens[parno-1]= ret;
-
+ "Setting open paren #%"IVdf" to %d\n",
+ (IV)parno, REG_NODE_NUM(ret)));
+ RExC_open_parens[parno-1]= ret;
}
Set_Node_Length(ret, 1); /* MJD */
Set_Node_Offset(ret, RExC_parse); /* MJD */
@@ -4999,6 +5132,12 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
break;
case 1:
ender = reganode(pRExC_state, CLOSE, parno);
+ if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
+ DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
+ "Setting close paren #%"IVdf" to %d\n",
+ (IV)parno, REG_NODE_NUM(ender)));
+ RExC_close_parens[parno-1]= ender;
+ }
Set_Node_Offset(ender,RExC_parse+1); /* MJD */
Set_Node_Length(ender,1); /* MJD */
break;
@@ -5013,6 +5152,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
break;
case 0:
ender = reg_node(pRExC_state, END);
+ if (!SIZE_ONLY) {
+ assert(!RExC_opend); /* there can only be one! */
+ RExC_opend = ender;
+ }
break;
}
REGTAIL(pRExC_state, lastbr, ender);
@@ -7399,14 +7542,25 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
src = RExC_emit;
RExC_emit += size;
dst = RExC_emit;
- if (RExC_parens) {
+ if (RExC_open_parens) {
int paren;
+ DEBUG_PARSE_FMT("inst"," - %d",RExC_npar);
for ( paren=0 ; paren < RExC_npar ; paren++ ) {
- if ( RExC_parens[paren] >= src )
- RExC_parens[paren] += size;
- }
+ if ( RExC_open_parens[paren] >= opnd ) {
+ DEBUG_PARSE_FMT("open"," - %d",size);
+ RExC_open_parens[paren] += size;
+ } else {
+ DEBUG_PARSE_FMT("open"," - %s","ok");
+ }
+ if ( RExC_close_parens[paren] >= opnd ) {
+ DEBUG_PARSE_FMT("close"," - %d",size);
+ RExC_close_parens[paren] += size;
+ } else {
+ DEBUG_PARSE_FMT("close"," - %s","ok");
+ }
+ }
}
-
+
while (src > opnd) {
StructCopy(--src, --dst, regnode);
if (RExC_offsets) { /* MJD 20010112 */
@@ -8494,9 +8648,9 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
: "???"
);
if (trie->jump) {
- U16 dist = trie->jump[word_idx+1];
+ U16 dist= trie->jump[word_idx+1];
PerlIO_printf(Perl_debug_log, "(%u)\n",
- (dist ? this_trie + dist : next) - start);
+ (dist ? this_trie + dist : next) - start);
if (dist) {
if (!nextbranch)
nextbranch = this_trie + trie->jump[0];
diff --git a/regcomp.sym b/regcomp.sym
index 73e27a80ed..a61f172a35 100644
--- a/regcomp.sym
+++ b/regcomp.sym
@@ -98,9 +98,8 @@ STAR STAR, node Match this (simple) thing 0 or more times.
PLUS PLUS, node Match this (simple) thing 1 or more times.
CURLY CURLY, sv 2 Match this simple thing {n,m} times.
-CURLYN CURLY, no 2 Match next-after-this simple thing
-# {n,m} times, set parenths.
-CURLYM CURLY, no 2 Match this medium-complex thing {n,m} times.
+CURLYN CURLY, no 2 Capture next-after-this simple thing
+CURLYM CURLY, no 2 Capture this medium-complex thing {n,m} times.
CURLYX CURLY, sv 2 Match this complex thing {n,m} times.
# This terminator creates a loop structure for CURLYX
@@ -156,7 +155,7 @@ AHOCORASICKC TRIE, trie charclass Same as AHOCORASICK, but with embedded charc
#*Recursion (65..66)
RECURSE RECURSE, num/ofs 2L recurse to paren arg1 at (signed) ofs arg2
-SRECURSE RECURSE, no recurse to start of pattern
+SRECURSE SRECURSE, no recurse to start of pattern
#*Named references (67..69)
NREF NREF, no-sv 1 Match some already matched string
diff --git a/regexec.c b/regexec.c
index 17d0e01906..d57fd35bc5 100644
--- a/regexec.c
+++ b/regexec.c
@@ -331,7 +331,11 @@ Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *stren
RExen without fixed substrings. Similarly, it is assumed that
lengths of all the strings are no more than minlen, thus they
cannot come from lookahead.
- (Or minlen should take into account lookahead.) */
+ (Or minlen should take into account lookahead.)
+ NOTE: Some of this comment is not correct. minlen does now take account
+ of lookahead/behind. Further research is required. -- demerphq
+
+*/
/* A failure to find a constant substring means that there is no need to make
an expensive call to REx engine, thus we celebrate a failure. Similarly,
@@ -4450,6 +4454,10 @@ NULL
curly_try_B_max:
/* a successful greedy match: now try to match B */
+ if (cur_eval && cur_eval->u.eval.close_paren &&
+ cur_eval->u.eval.close_paren == ST.paren) {
+ goto fake_end;
+ }
{
UV c = 0;
if (ST.c1 != CHRTEST_VOID)
@@ -4459,10 +4467,6 @@ NULL
/* If it could work, try it. */
if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
CURLY_SETPAREN(ST.paren, ST.count);
- if (cur_eval && cur_eval->u.eval.close_paren &&
- cur_eval->u.eval.close_paren == ST.paren) {
- goto fake_end;
- }
PUSH_STATE_GOTO(CURLY_B_max, ST.B);
/* NOTREACHED */
}
diff --git a/regnodes.h b/regnodes.h
index d6842b52c3..e3d18378f2 100644
--- a/regnodes.h
+++ b/regnodes.h
@@ -50,8 +50,8 @@
#define STAR 38 /* 0x26 Match this (simple) thing 0 or more times. */
#define PLUS 39 /* 0x27 Match this (simple) thing 1 or more times. */
#define CURLY 40 /* 0x28 Match this simple thing {n,m} times. */
-#define CURLYN 41 /* 0x29 Match next-after-this simple thing */
-#define CURLYM 42 /* 0x2a Match this medium-complex thing {n,m} times. */
+#define CURLYN 41 /* 0x29 Capture next-after-this simple thing */
+#define CURLYM 42 /* 0x2a Capture this medium-complex thing {n,m} times. */
#define CURLYX 43 /* 0x2b Match this complex thing {n,m} times. */
#define WHILEM 44 /* 0x2c Do curly processing and see if rest matches. */
#define OPEN 45 /* 0x2d Mark this point in input as start of */
@@ -191,7 +191,7 @@ EXTCONST U8 PL_regkind[] = {
TRIE, /* AHOCORASICK */
TRIE, /* AHOCORASICKC */
RECURSE, /* RECURSE */
- RECURSE, /* SRECURSE */
+ SRECURSE, /* SRECURSE */
NREF, /* NREF */
NREF, /* NREFF */
NREF, /* NREFFL */