summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDave Mitchell <davem@fdisolutions.com>2006-06-16 23:25:51 +0000
committerDave Mitchell <davem@fdisolutions.com>2006-06-16 23:25:51 +0000
commit40a824489101168f94fce98aa2824baf40bad402 (patch)
treec5d3ebdd2a3b693c44ef8712a9965a4fa38d0330
parent0aec9d3674a357d2c5a029483ff5cdc65c715a57 (diff)
downloadperl-40a824489101168f94fce98aa2824baf40bad402.tar.gz
start turning regmatch() main loop into a FSM
also make BRANCH use the state stack rather than its own unwind struct p4raw-id: //depot/perl@28398
-rw-r--r--regcomp.pl1
-rw-r--r--regexec.c818
-rw-r--r--regexp.h42
-rw-r--r--regnodes.h1
-rw-r--r--t/op/re_tests1
5 files changed, 408 insertions, 455 deletions
diff --git a/regcomp.pl b/regcomp.pl
index 144cb0cf3c..febd550552 100644
--- a/regcomp.pl
+++ b/regcomp.pl
@@ -45,6 +45,7 @@ EOP
}
print OUT <<EOP;
+#define REGNODE_MAX $oind
#ifndef DOINIT
EXTCONST U8 PL_regkind[];
diff --git a/regexec.c b/regexec.c
index 6ac241e40d..872a3bc889 100644
--- a/regexec.c
+++ b/regexec.c
@@ -161,6 +161,7 @@ S_regcppush(pTHX_ I32 parenfloor)
#define REGCP_PAREN_ELEMS 4
const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
int p;
+ GET_RE_DEBUG_FLAGS_DECL;
if (paren_elems_to_push < 0)
Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
@@ -173,6 +174,12 @@ S_regcppush(pTHX_ I32 parenfloor)
SSPUSHINT(PL_regstartp[p]);
SSPUSHPTR(PL_reg_start_tmp[p]);
SSPUSHINT(p);
+ DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+ " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
+ (UV)p, (IV)PL_regstartp[p],
+ (IV)(PL_reg_start_tmp[p] - PL_bostr),
+ (IV)PL_regendp[p]
+ ));
}
/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
SSPUSHINT(PL_regsize);
@@ -2372,36 +2379,6 @@ S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
return 0;
}
-#define RE_UNWIND_BRANCH 1
-#define RE_UNWIND_BRANCHJ 2
-
-union re_unwind_t;
-
-typedef struct { /* XX: makes sense to enlarge it... */
- I32 type;
- I32 prev;
- CHECKPOINT lastcp;
-} re_unwind_generic_t;
-
-typedef struct {
- I32 type;
- I32 prev;
- CHECKPOINT lastcp;
- I32 lastparen;
- regnode *next;
- char *locinput;
- I32 nextchr;
- int minmod;
-#ifdef DEBUGGING
- int regindent;
-#endif
-} re_unwind_branch_t;
-
-typedef union re_unwind_t {
- I32 type;
- re_unwind_generic_t generic;
- re_unwind_branch_t branch;
-} re_unwind_t;
#define sayYES goto yes
#define sayNO goto no
@@ -2471,8 +2448,8 @@ typedef union re_unwind_t {
/* Make sure there is a test for this +1 options in re_tests */
#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
-/* this value indiciates that the c1/c2 "next char" test should be skipped */
-#define CHRTEST_VOID -1000
+#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
+#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
#define SLAB_FIRST(s) (&(s)->states[0])
#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
@@ -2505,43 +2482,21 @@ S_push_slab(pTHX)
goto start_recurse; \
resume_point_##where:
+/* push a new state then goto it */
+
+#define PUSH_STATE_GOTO(state, node) \
+ scan = node; \
+ st->resume_state = state; \
+ goto push_state;
+
+/* push a new state with success backtracking, then goto it */
+
+#define PUSH_YES_STATE_GOTO(state, node) \
+ scan = node; \
+ st->resume_state = state; \
+ goto push_yes_state;
+
-/* push a new regex state. Set newst to point to it */
-
-#define PUSH_STATE(newst, resume) \
- depth++; \
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "PUSH STATE(%d)\n", depth)); \
- st->scan = scan; \
- st->next = next; \
- st->n = n; \
- st->locinput = locinput; \
- st->resume_state = resume; \
- newst = st+1; \
- if (newst > SLAB_LAST(PL_regmatch_slab)) \
- newst = S_push_slab(aTHX); \
- PL_regmatch_state = newst; \
- newst->cc = 0; \
- newst->minmod = 0; \
- newst->sw = 0; \
- newst->logical = 0; \
- newst->unwind = 0; \
- locinput = PL_reginput; \
- nextchr = UCHARAT(locinput);
-
-#define POP_STATE \
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth)); \
- depth--; \
- st--; \
- if (st < SLAB_FIRST(PL_regmatch_slab)) { \
- PL_regmatch_slab = PL_regmatch_slab->prev; \
- st = SLAB_LAST(PL_regmatch_slab); \
- } \
- PL_regmatch_state = st; \
- scan = st->scan; \
- next = st->next; \
- n = st->n; \
- locinput = st->locinput; \
- nextchr = UCHARAT(locinput);
/*
- regmatch - main matching routine
@@ -2648,6 +2603,33 @@ S_push_slab(pTHX)
* allocated since entry are freed.
*/
+/* *** every FOO_fail should = FOO+1 */
+#define resume_TRIE1 (REGNODE_MAX+1)
+#define resume_TRIE2 (REGNODE_MAX+2)
+#define EVAL_A (REGNODE_MAX+3)
+#define EVAL_A_fail (REGNODE_MAX+4)
+#define resume_CURLYX (REGNODE_MAX+5)
+#define resume_WHILEM1 (REGNODE_MAX+6)
+#define resume_WHILEM2 (REGNODE_MAX+7)
+#define resume_WHILEM3 (REGNODE_MAX+8)
+#define resume_WHILEM4 (REGNODE_MAX+9)
+#define resume_WHILEM5 (REGNODE_MAX+10)
+#define resume_WHILEM6 (REGNODE_MAX+11)
+#define BRANCH_next (REGNODE_MAX+12)
+#define BRANCH_next_fail (REGNODE_MAX+13)
+#define CURLYM_A (REGNODE_MAX+14)
+#define CURLYM_A_fail (REGNODE_MAX+15)
+#define CURLYM_B (REGNODE_MAX+16)
+#define CURLYM_B_fail (REGNODE_MAX+17)
+#define IFMATCH_A (REGNODE_MAX+18)
+#define IFMATCH_A_fail (REGNODE_MAX+19)
+#define resume_PLUS1 (REGNODE_MAX+20)
+#define resume_PLUS2 (REGNODE_MAX+21)
+#define resume_PLUS3 (REGNODE_MAX+22)
+#define resume_PLUS4 (REGNODE_MAX+23)
+
+
+
#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
#ifdef DEBUGGING
@@ -2743,11 +2725,10 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
/* these variables are NOT saved during a recusive RFEGMATCH: */
register I32 nextchr; /* is always set to UCHARAT(locinput) */
bool result; /* return value of S_regmatch */
- regnode *inner; /* Next node in internal branch. */
int depth = 0; /* depth of recursion */
- regmatch_state *newst; /* when pushing a state, this is the new one */
regmatch_state *yes_state = NULL; /* state to pop to on success of
subpattern */
+ U32 state_num;
#ifdef DEBUGGING
GET_RE_DEBUG_FLAGS_DECL;
@@ -2775,7 +2756,6 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
st->minmod = 0;
st->sw = 0;
st->logical = 0;
- st->unwind = 0;
st->cc = NULL;
/* Note that nextchr is a byte even in UTF */
nextchr = UCHARAT(locinput);
@@ -2797,8 +2777,10 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
next = scan + NEXT_OFF(scan);
if (next == scan)
next = NULL;
+ state_num = OP(scan);
- switch (OP(scan)) {
+ reenter_switch:
+ switch (state_num) {
case BOL:
if (locinput == PL_bostr)
{
@@ -3529,7 +3511,11 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
break;
case BACK:
break;
- case EVAL:
+
+#undef ST
+#define ST st->u.eval
+
+ case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
{
SV *ret;
{
@@ -3614,8 +3600,8 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
(strlen(re->precomp) > 60 ? "..." : ""))
);
- st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
- REGCP_SET(st->u.eval.lastcp);
+ ST.cp = regcppush(0); /* Save *all* the positions. */
+ REGCP_SET(ST.lastcp);
*PL_reglastparen = 0;
*PL_reglastcloseparen = 0;
PL_reginput = locinput;
@@ -3624,21 +3610,15 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
PL_reg_maxiter = 0;
st->logical = 0;
- st->u.eval.toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
+ ST.toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
((re->reganch & ROPT_UTF8) != 0);
- if (st->u.eval.toggleutf) PL_reg_flags ^= RF_utf8;
- st->u.eval.prev_rex = rex;
+ if (ST.toggleutf) PL_reg_flags ^= RF_utf8;
+ ST.prev_rex = rex;
rex = re;
- /* resume to current state on success */
- st->u.yes.prev_yes_state = yes_state;
- yes_state = st;
- PUSH_STATE(newst, resume_EVAL);
- st = newst;
-
+ ST.B = next;
/* now continue from first node in postoned RE */
- next = re->program + 1;
- break;
+ PUSH_YES_STATE_GOTO(EVAL_A, re->program + 1);
/* NOTREACHED */
}
/* /(?(?{...})X|Y)/ */
@@ -3646,6 +3626,44 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
st->logical = 0;
break;
}
+
+ case EVAL_A: /* successfully ran inner rex (??{rex}) */
+ if (ST.toggleutf)
+ PL_reg_flags ^= RF_utf8;
+ ReREFCNT_dec(rex);
+ rex = ST.prev_rex;
+ /* XXXX This is too dramatic a measure... */
+ PL_reg_maxiter = 0;
+ /* Restore parens of the caller without popping the
+ * savestack */
+ {
+ const I32 tmp = PL_savestack_ix;
+ PL_savestack_ix = ST.lastcp;
+ regcppop(rex);
+ PL_savestack_ix = tmp;
+ }
+ PL_reginput = locinput;
+ /* continue at the node following the (??{...}) */
+ scan = ST.B;
+ continue;
+
+ case EVAL_A_fail: /* unsuccessfully ran inner rex (??{rex}) */
+ /* Restore state to the outer re then re-throw the failure */
+ if (ST.toggleutf)
+ PL_reg_flags ^= RF_utf8;
+ ReREFCNT_dec(rex);
+ rex = ST.prev_rex;
+
+ /* XXXX This is too dramatic a measure... */
+ PL_reg_maxiter = 0;
+
+ PL_reginput = locinput;
+ REGCP_UNWIND(ST.lastcp);
+ regcppop(rex);
+ sayNO_SILENT;
+
+#undef ST
+
case OPEN:
n = ARG(scan); /* which paren pair */
PL_reg_start_tmp[n] = locinput;
@@ -4026,198 +4044,192 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
CACHEsayNO;
}
/* NOTREACHED */
- case BRANCHJ:
+
+#undef ST
+#define ST st->u.branch
+
+ case BRANCHJ: /* /(...|A|...)/ with long next pointer */
next = scan + ARG(scan);
if (next == scan)
next = NULL;
- inner = NEXTOPER(NEXTOPER(scan));
- goto do_branch;
- case BRANCH:
- inner = NEXTOPER(scan);
- do_branch:
- {
- const I32 type = OP(scan);
- if (!next || OP(next) != type) /* No choice. */
- next = inner; /* Avoid recursion. */
- else {
- const I32 lastparen = *PL_reglastparen;
- /* Put unwinding data on stack */
- const I32 unwind1 = SSNEWt(1,re_unwind_branch_t);
- re_unwind_branch_t * const uw = SSPTRt(unwind1,re_unwind_branch_t);
-
- uw->prev = st->unwind;
- st->unwind = unwind1;
- uw->type = ((type == BRANCH)
- ? RE_UNWIND_BRANCH
- : RE_UNWIND_BRANCHJ);
- uw->lastparen = lastparen;
- uw->next = next;
- uw->locinput = locinput;
- uw->nextchr = nextchr;
- uw->minmod = st->minmod;
-#ifdef DEBUGGING
- uw->regindent = ++PL_regindent;
-#endif
+ scan = NEXTOPER(scan);
+ /* FALL THROUGH */
- REGCP_SET(uw->lastcp);
+ case BRANCH: /* /(...|A|...)/ */
+ scan = NEXTOPER(scan); /* scan now points to inner node */
+ if (!next || (OP(next) != BRANCH && OP(next) != BRANCHJ))
+ /* last branch; skip state push and jump direct to node */
+ continue;
+ ST.lastparen = *PL_reglastparen;
+ ST.next_branch = next;
+ REGCP_SET(ST.cp);
+ PL_reginput = locinput;
- /* Now go into the first branch */
- next = inner;
- }
- }
- break;
+ /* Now go into the branch */
+ PUSH_STATE_GOTO(BRANCH_next, scan);
+ /* NOTREACHED */
+
+ case BRANCH_next_fail: /* that branch failed; try the next, if any */
+ REGCP_UNWIND(ST.cp);
+ for (n = *PL_reglastparen; n > ST.lastparen; n--)
+ PL_regendp[n] = -1;
+ *PL_reglastparen = n;
+ scan = ST.next_branch;
+ /* no more branches? */
+ if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ))
+ sayNO;
+ continue; /* execute next BRANCH[J] op */
+ /* NOTREACHED */
+
case MINMOD:
st->minmod = 1;
break;
- case CURLYM:
- {
- st->u.curlym.l = st->u.curlym.matches = 0;
-
- /* We suppose that the next guy does not need
- backtracking: in particular, it is of constant non-zero length,
- and has no parenths to influence future backrefs. */
- st->ln = ARG1(scan); /* min to match */
- n = ARG2(scan); /* max to match */
- st->u.curlym.paren = scan->flags;
- if (st->u.curlym.paren) {
- if (st->u.curlym.paren > PL_regsize)
- PL_regsize = st->u.curlym.paren;
- if (st->u.curlym.paren > (I32)*PL_reglastparen)
- *PL_reglastparen = st->u.curlym.paren;
- }
+
+#undef ST
+#define ST st->u.curlym
+
+ case CURLYM: /* /A{m,n}B/ where A is fixed-length */
+
+ /* This is an optimisation of CURLYX that enables us to push
+ * only a single backtracking state, no matter now many matches
+ * there are in {m,n}. It relies on the pattern being constant
+ * length, with no parens to influence future backrefs
+ */
+
+ ST.me = scan;
scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
- if (st->u.curlym.paren)
+
+ /* if paren positive, emulate an OPEN/CLOSE around A */
+ if (ST.me->flags) {
+ I32 paren = ST.me->flags;
+ if (paren > PL_regsize)
+ PL_regsize = paren;
+ if (paren > (I32)*PL_reglastparen)
+ *PL_reglastparen = paren;
scan += NEXT_OFF(scan); /* Skip former OPEN. */
- PL_reginput = locinput;
- st->u.curlym.maxwanted = st->minmod ? st->ln : n;
- while (PL_reginput < PL_regeol && st->u.curlym.matches < st->u.curlym.maxwanted) {
- /* resume to current state on success */
- st->u.yes.prev_yes_state = yes_state;
- yes_state = st;
- REGMATCH(scan, CURLYM1);
- yes_state = st->u.yes.prev_yes_state;
- /*** all unsaved local vars undefined at this point */
- if (!result)
- break;
- /* on first match, determine length, u.curlym.l */
- if (!st->u.curlym.matches++) {
- if (PL_reg_match_utf8) {
- char *s = locinput;
- while (s < PL_reginput) {
- st->u.curlym.l++;
- s += UTF8SKIP(s);
- }
- }
- else {
- st->u.curlym.l = PL_reginput - locinput;
- }
- if (st->u.curlym.l == 0) {
- st->u.curlym.matches = st->u.curlym.maxwanted;
- break;
- }
- }
- locinput = PL_reginput;
}
+ ST.A = scan;
+ ST.B = next;
+ ST.alen = 0;
+ ST.count = 0;
+ ST.minmod = st->minmod;
+ st->minmod = 0;
+ ST.c1 = CHRTEST_UNINIT;
+ REGCP_SET(ST.cp);
+ if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
+ goto curlym_do_B;
+
+ curlym_do_A: /* execute the A in /A{m,n}B/ */
PL_reginput = locinput;
- if (st->u.curlym.matches < st->ln) {
- st->minmod = 0;
- sayNO;
- }
+ PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
+ /* NOTREACHED */
+ case CURLYM_A: /* we've just matched an A */
+ locinput = st->locinput;
+ nextchr = UCHARAT(locinput);
+
+ ST.count++;
+ /* after first match, determine A's length: u.curlym.alen */
+ if (ST.count == 1) {
+ if (PL_reg_match_utf8) {
+ char *s = locinput;
+ while (s < PL_reginput) {
+ ST.alen++;
+ s += UTF8SKIP(s);
+ }
+ }
+ else {
+ ST.alen = PL_reginput - locinput;
+ }
+ if (ST.alen == 0)
+ ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
+ }
DEBUG_EXECUTE_r(
PerlIO_printf(Perl_debug_log,
- "%*s matched %"IVdf" times, len=%"IVdf"...\n",
+ "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
(int)(REPORT_CODE_OFF+PL_regindent*2), "",
- (IV) st->u.curlym.matches, (IV)st->u.curlym.l)
+ (IV) ST.count, (IV)ST.alen)
);
- /* calculate c1 and c1 for possible match of 1st char
- * following curly */
- st->u.curlym.c1 = st->u.curlym.c2 = CHRTEST_VOID;
- if (HAS_TEXT(next) || JUMPABLE(next)) {
- regnode *text_node = next;
- if (! HAS_TEXT(text_node))
- FIND_NEXT_IMPT(text_node);
- if (HAS_TEXT(text_node)
- && PL_regkind[OP(text_node)] != REF)
- {
- st->u.curlym.c1 = (U8)*STRING(text_node);
- st->u.curlym.c2 =
- (OP(text_node) == EXACTF || OP(text_node) == REFF)
- ? PL_fold[st->u.curlym.c1]
- : (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
- ? PL_fold_locale[st->u.curlym.c1]
- : st->u.curlym.c1;
- }
- }
+ locinput = PL_reginput;
+ if (ST.count < (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)))
+ goto curlym_do_A; /* try to match another A */
+ goto curlym_do_B; /* try to match B */
- REGCP_SET(st->u.curlym.lastcp);
+ case CURLYM_A_fail: /* just failed to match an A */
+ REGCP_UNWIND(ST.cp);
+ if (ST.minmod || ST.count < ARG1(ST.me) /* min*/ )
+ sayNO;
- st->u.curlym.minmod = st->minmod;
- st->minmod = 0;
- while (st->u.curlym.matches >= st->ln
- && (st->u.curlym.matches <= n
- /* for REG_INFTY, ln could overflow to negative */
- || (n == REG_INFTY && st->u.curlym.matches >= 0)))
- {
- /* If it could work, try it. */
- if (st->u.curlym.c1 == CHRTEST_VOID ||
- UCHARAT(PL_reginput) == st->u.curlym.c1 ||
- UCHARAT(PL_reginput) == st->u.curlym.c2)
- {
- DEBUG_EXECUTE_r(
- PerlIO_printf(Perl_debug_log,
- "%*s trying tail with matches=%"IVdf"...\n",
- (int)(REPORT_CODE_OFF+PL_regindent*2),
- "", (IV)st->u.curlym.matches)
- );
- if (st->u.curlym.paren) {
- if (st->u.curlym.matches) {
- PL_regstartp[st->u.curlym.paren]
- = HOPc(PL_reginput, -st->u.curlym.l) - PL_bostr;
- PL_regendp[st->u.curlym.paren] = PL_reginput - PL_bostr;
- }
- else
- PL_regendp[st->u.curlym.paren] = -1;
- }
- /* resume to current state on success */
- st->u.yes.prev_yes_state = yes_state;
- yes_state = st;
- REGMATCH(next, CURLYM2);
- yes_state = st->u.yes.prev_yes_state;
- /*** all unsaved local vars undefined at this point */
- if (result)
- /* XXX tmp sayYES; */
- sayYES_FINAL;
- REGCP_UNWIND(st->u.curlym.lastcp);
- }
- /* Couldn't or didn't -- move forward/backward. */
- if (st->u.curlym.minmod) {
- PL_reginput = locinput;
- /* resume to current state on success */
- st->u.yes.prev_yes_state = yes_state;
- yes_state = st;
- REGMATCH(scan, CURLYM3);
- yes_state = st->u.yes.prev_yes_state;
- /*** all unsaved local vars undefined at this point */
- if (result) {
- st->u.curlym.matches++;
- locinput = PL_reginput;
+ curlym_do_B: /* execute the B in /A{m,n}B/ */
+ PL_reginput = locinput;
+ if (ST.c1 == CHRTEST_UNINIT) {
+ /* calculate c1 and c2 for possible match of 1st char
+ * following curly */
+ ST.c1 = ST.c2 = CHRTEST_VOID;
+ if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
+ regnode *text_node = ST.B;
+ if (! HAS_TEXT(text_node))
+ FIND_NEXT_IMPT(text_node);
+ if (HAS_TEXT(text_node)
+ && PL_regkind[OP(text_node)] != REF)
+ {
+ ST.c1 = (U8)*STRING(text_node);
+ ST.c2 =
+ (OP(text_node) == EXACTF || OP(text_node) == REFF)
+ ? PL_fold[ST.c1]
+ : (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
+ ? PL_fold_locale[ST.c1]
+ : ST.c1;
}
- else
- sayNO;
}
- else {
- st->u.curlym.matches--;
- locinput = HOPc(locinput, -st->u.curlym.l);
- PL_reginput = locinput;
+ }
+
+ DEBUG_EXECUTE_r(
+ PerlIO_printf(Perl_debug_log,
+ "%*s CURLYM trying tail with matches=%"IVdf"...\n",
+ (int)(REPORT_CODE_OFF+PL_regindent*2),
+ "", (IV)ST.count)
+ );
+ if (ST.c1 != CHRTEST_VOID
+ && UCHARAT(PL_reginput) != ST.c1
+ && UCHARAT(PL_reginput) != ST.c2)
+ {
+ /* simulate B failing */
+ state_num = CURLYM_B_fail;
+ goto reenter_switch;
+ }
+
+ if (ST.me->flags) {
+ /* mark current A as captured */
+ I32 paren = ST.me->flags;
+ if (ST.count) {
+ PL_regstartp[paren]
+ = HOPc(PL_reginput, -ST.alen) - PL_bostr;
+ PL_regendp[paren] = PL_reginput - PL_bostr;
}
+ else
+ PL_regendp[paren] = -1;
}
- sayNO;
+ PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
/* NOTREACHED */
- break;
- }
+
+ case CURLYM_B_fail: /* just failed to match a B */
+ REGCP_UNWIND(ST.cp);
+ if (ST.minmod) {
+ if (ST.count == ARG2(ST.me) /* max */)
+ sayNO;
+ goto curlym_do_A; /* try to match a further A */
+ }
+ /* backtrack one A */
+ if (ST.count == ARG1(ST.me) /* min */)
+ sayNO;
+ ST.count--;
+ locinput = HOPc(locinput, -ST.alen);
+ goto curlym_do_B; /* try to match B */
+
+
case CURLYN:
st->u.plus.paren = scan->flags; /* Which paren to set */
if (st->u.plus.paren > PL_regsize)
@@ -4490,17 +4502,20 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
PL_reginput = locinput; /* put where regtry can find it */
sayYES_FINAL; /* Success! */
- case SUSPEND: /* (?>FOO) */
- st->u.ifmatch.wanted = 1;
+#undef ST
+#define ST st->u.ifmatch
+
+ case SUSPEND: /* (?>A) */
+ ST.wanted = 1;
PL_reginput = locinput;
goto do_ifmatch;
- case UNLESSM: /* -ve lookaround: (?!FOO), or with flags, (?<!foo) */
- st->u.ifmatch.wanted = 0;
+ case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
+ ST.wanted = 0;
goto ifmatch_trivial_fail_test;
- case IFMATCH: /* +ve lookaround: (?=FOO), or with flags, (?<=foo) */
- st->u.ifmatch.wanted = 1;
+ case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
+ ST.wanted = 1;
ifmatch_trivial_fail_test:
if (scan->flags) {
char * const s = HOPBACKc(locinput, scan->flags);
@@ -4508,9 +4523,9 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
/* trivial fail */
if (st->logical) {
st->logical = 0;
- st->sw = 1 - (bool)st->u.ifmatch.wanted;
+ st->sw = 1 - (bool)ST.wanted;
}
- else if (st->u.ifmatch.wanted)
+ else if (ST.wanted)
sayNO;
next = scan + ARG(scan);
if (next == scan)
@@ -4523,13 +4538,35 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
PL_reginput = locinput;
do_ifmatch:
- /* resume to current state on success */
- st->u.yes.prev_yes_state = yes_state;
- yes_state = st;
- PUSH_STATE(newst, resume_IFMATCH);
- st = newst;
- next = NEXTOPER(NEXTOPER(scan));
- break;
+ ST.me = scan;
+ /* execute body of (?...A) */
+ PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
+ /* NOTREACHED */
+
+ case IFMATCH_A_fail: /* body of (?...A) failed */
+ ST.wanted = !ST.wanted;
+ /* FALL THROUGH */
+
+ case IFMATCH_A: /* body of (?...A) succeeded */
+ if (st->logical) {
+ st->logical = 0;
+ st->sw = (bool)ST.wanted;
+ }
+ else if (!ST.wanted)
+ sayNO;
+
+ if (OP(ST.me) == SUSPEND)
+ locinput = PL_reginput;
+ else {
+ locinput = PL_reginput = st->locinput;
+ nextchr = UCHARAT(locinput);
+ }
+ scan = ST.me + ARG(ST.me);
+ if (scan == ST.me)
+ scan = NULL;
+ continue; /* execute B */
+
+#undef ST
case LONGJMP:
next = scan + ARG(scan);
@@ -4542,11 +4579,41 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
Perl_croak(aTHX_ "regexp memory corruption");
}
- reenter:
scan = next;
continue;
/* NOTREACHED */
+ push_yes_state:
+ /* push a state that backtracks on success */
+ st->u.yes.prev_yes_state = yes_state;
+ yes_state = st;
+ /* FALL THROUGH */
+ push_state:
+ /* push a new regex state, then continue at scan */
+ {
+ regmatch_state *newst;
+
+ depth++;
+ DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+ "PUSH STATE(%d)\n", depth));
+ st->locinput = locinput;
+ newst = st+1;
+ if (newst > SLAB_LAST(PL_regmatch_slab))
+ newst = S_push_slab(aTHX);
+ PL_regmatch_state = newst;
+ newst->cc = st->cc;
+ /* XXX probably don't need to initialise these */
+ newst->minmod = 0;
+ newst->sw = 0;
+ newst->logical = 0;
+
+ locinput = PL_reginput;
+ nextchr = UCHARAT(locinput);
+ st = newst;
+ continue;
+ /* NOTREACHED */
+ }
+
/* simulate recursively calling regmatch(), but without actually
* recursing - ie save the current state on the heap rather than on
* the stack, then re-enter the loop. This avoids complex regexes
@@ -4558,6 +4625,7 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
regmatch_state *oldst = st;
depth++;
+ DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "PUSH RECURSE STATE(%d)\n", depth));
/* grab the next free state slot */
st++;
@@ -4575,7 +4643,6 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
st->minmod = 0;
st->sw = 0;
st->logical = 0;
- st->unwind = 0;
#ifdef DEBUGGING
PL_regindent++;
#endif
@@ -4598,12 +4665,6 @@ yes_final:
/* we have successfully completed a subexpression, but we must now
* pop to the state marked by yes_state and continue from there */
- /*XXX tmp for CURLYM*/
- regmatch_slab * const oslab = PL_regmatch_slab;
- regmatch_state * const ost = st;
- regmatch_state * const oys = yes_state;
- int odepth = depth;
-
assert(st != yes_state);
while (yes_state < SLAB_FIRST(PL_regmatch_slab)
|| yes_state > SLAB_LAST(PL_regmatch_slab))
@@ -4614,65 +4675,23 @@ yes_final:
st = SLAB_LAST(PL_regmatch_slab);
}
depth -= (st - yes_state);
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE TO (%d)\n", depth));
+ DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATES (%d..%d)\n",
+ depth+1, depth+(st - yes_state)));
st = yes_state;
yes_state = st->u.yes.prev_yes_state;
PL_regmatch_state = st;
switch (st->resume_state) {
- case resume_EVAL:
- if (st->u.eval.toggleutf)
- PL_reg_flags ^= RF_utf8;
- ReREFCNT_dec(rex);
- rex = st->u.eval.prev_rex;
- /* XXXX This is too dramatic a measure... */
- PL_reg_maxiter = 0;
- /* Restore parens of the caller without popping the
- * savestack */
- {
- const I32 tmp = PL_savestack_ix;
- PL_savestack_ix = st->u.eval.lastcp;
- regcppop(rex);
- PL_savestack_ix = tmp;
- }
- PL_reginput = locinput;
- /* continue at the node following the (??{...}) */
- next = st->next;
- goto reenter;
-
- case resume_IFMATCH:
- if (st->logical) {
- st->logical = 0;
- st->sw = (bool)st->u.ifmatch.wanted;
- }
- else if (!st->u.ifmatch.wanted)
- sayNO;
-
- if (OP(st->scan) == SUSPEND)
- locinput = PL_reginput;
- else {
- locinput = PL_reginput = st->locinput;
- nextchr = UCHARAT(locinput);
- }
- next = st->scan + ARG(st->scan);
- if (next == st->scan)
- next = NULL;
- goto reenter;
-
- /* XXX tmp don't handle yes_state yet */
- case resume_CURLYM1:
- case resume_CURLYM2:
- case resume_CURLYM3:
- PL_regmatch_slab =oslab;
- st = ost;
- PL_regmatch_state = st;
- depth = odepth;
- yes_state = oys;
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "XXX revering a CURLYM\n"));
- goto yes;
-
+ case IFMATCH_A:
+ case CURLYM_A:
+ case BRANCH_next:
+ case EVAL_A:
+ state_num = st->resume_state;
+ goto reenter_switch;
+
+ case CURLYM_B:
default:
- Perl_croak(aTHX_ "unexpected yes reume state");
+ Perl_croak(aTHX_ "unexpected yes resume state");
}
}
@@ -4685,11 +4704,22 @@ yes:
result = 1;
/* XXX this is duplicate(ish) code to that in the do_no section.
- * eventually a yes should just pop the stack back to the current
- * yes_state */
+ * will disappear when REGFMATCH goes */
if (depth) {
/* restore previous state and re-enter */
- POP_STATE;
+ DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth));
+ depth--;
+ st--;
+ if (st < SLAB_FIRST(PL_regmatch_slab)) {
+ PL_regmatch_slab = PL_regmatch_slab->prev;
+ st = SLAB_LAST(PL_regmatch_slab);
+ }
+ PL_regmatch_state = st;
+ scan = st->scan;
+ next = st->next;
+ n = st->n;
+ locinput= st->locinput;
+ nextchr = UCHARAT(locinput);
switch (st->resume_state) {
case resume_TRIE1:
@@ -4710,12 +4740,6 @@ yes:
goto resume_point_WHILEM5;
case resume_WHILEM6:
goto resume_point_WHILEM6;
- case resume_CURLYM1:
- goto resume_point_CURLYM1;
- case resume_CURLYM2:
- goto resume_point_CURLYM2;
- case resume_CURLYM3:
- goto resume_point_CURLYM3;
case resume_PLUS1:
goto resume_point_PLUS1;
case resume_PLUS2:
@@ -4725,8 +4749,13 @@ yes:
case resume_PLUS4:
goto resume_point_PLUS4;
- case resume_IFMATCH:
- case resume_EVAL:
+ case CURLYM_A:
+ case CURLYM_B:
+ case EVAL_A:
+ case IFMATCH_A:
+ case BRANCH_next:
+ break;
+
default:
Perl_croak(aTHX_ "regexp resume memory corruption");
}
@@ -4739,58 +4768,8 @@ no:
"%*s %sfailed...%s\n",
REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
);
- goto do_no;
no_final:
do_no:
- if (st->unwind) {
- re_unwind_t * const uw = SSPTRt(st->unwind,re_unwind_t);
-
- switch (uw->type) {
- case RE_UNWIND_BRANCH:
- case RE_UNWIND_BRANCHJ:
- {
- re_unwind_branch_t * const uwb = &(uw->branch);
- const I32 lastparen = uwb->lastparen;
-
- REGCP_UNWIND(uwb->lastcp);
- for (n = *PL_reglastparen; n > lastparen; n--)
- PL_regendp[n] = -1;
- *PL_reglastparen = n;
- scan = next = uwb->next;
- st->minmod = uwb->minmod;
- if ( !scan ||
- OP(scan) != (uwb->type == RE_UNWIND_BRANCH
- ? BRANCH : BRANCHJ) ) { /* Failure */
- st->unwind = uwb->prev;
-#ifdef DEBUGGING
- PL_regindent--;
-#endif
- goto do_no;
- }
- /* Have more choice yet. Reuse the same uwb. */
- if ((n = (uwb->type == RE_UNWIND_BRANCH
- ? NEXT_OFF(next) : ARG(next))))
- next += n;
- else
- next = NULL; /* XXXX Needn't unwinding in this case... */
- uwb->next = next;
- next = NEXTOPER(scan);
- if (uwb->type == RE_UNWIND_BRANCHJ)
- next = NEXTOPER(next);
- locinput = uwb->locinput;
- nextchr = uwb->nextchr;
-#ifdef DEBUGGING
- PL_regindent = uwb->regindent;
-#endif
-
- goto reenter;
- }
- /* NOTREACHED */
- default:
- Perl_croak(aTHX_ "regexp unwind memory corruption");
- }
- /* NOTREACHED */
- }
#ifdef DEBUGGING
PL_regindent--;
@@ -4799,28 +4778,25 @@ do_no:
if (depth) {
/* there's a previous state to backtrack to */
- POP_STATE;
+ DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth));
+ depth--;
+ st--;
+ if (st < SLAB_FIRST(PL_regmatch_slab)) {
+ PL_regmatch_slab = PL_regmatch_slab->prev;
+ st = SLAB_LAST(PL_regmatch_slab);
+ }
+ PL_regmatch_state = st;
+ scan = st->scan;
+ next = st->next;
+ n = st->n;
+ locinput= st->locinput;
+ nextchr = UCHARAT(locinput);
+
switch (st->resume_state) {
case resume_TRIE1:
goto resume_point_TRIE1;
case resume_TRIE2:
goto resume_point_TRIE2;
- case resume_EVAL:
- /* we have failed an (??{...}). Restore state to the outer re
- * then re-throw the failure */
- if (st->u.eval.toggleutf)
- PL_reg_flags ^= RF_utf8;
- ReREFCNT_dec(rex);
- rex = st->u.eval.prev_rex;
- yes_state = st->u.yes.prev_yes_state;
-
- /* XXXX This is too dramatic a measure... */
- PL_reg_maxiter = 0;
-
- PL_reginput = locinput;
- REGCP_UNWIND(st->u.eval.lastcp);
- regcppop(rex);
- goto do_no;
case resume_CURLYX:
goto resume_point_CURLYX;
@@ -4836,28 +4812,16 @@ do_no:
goto resume_point_WHILEM5;
case resume_WHILEM6:
goto resume_point_WHILEM6;
- case resume_CURLYM1:
- goto resume_point_CURLYM1;
- case resume_CURLYM2:
- goto resume_point_CURLYM2;
- case resume_CURLYM3:
- goto resume_point_CURLYM3;
- case resume_IFMATCH:
- yes_state = st->u.yes.prev_yes_state;
- if (st->logical) {
- st->logical = 0;
- st->sw = !st->u.ifmatch.wanted;
- }
- else if (st->u.ifmatch.wanted)
- sayNO;
- assert(OP(scan) != SUSPEND); /* XXX DAPM tmp */
- locinput = PL_reginput = st->locinput;
- nextchr = UCHARAT(locinput);
- next = scan + ARG(scan);
- if (next == scan)
- next = NULL;
- goto reenter;
+ case EVAL_A:
+ case BRANCH_next:
+ case CURLYM_A:
+ case CURLYM_B:
+ case IFMATCH_A:
+ if (yes_state == st)
+ yes_state = st->u.yes.prev_yes_state;
+ state_num = st->resume_state + 1; /* failure = success + 1 */
+ goto reenter_switch;
case resume_PLUS1:
goto resume_point_PLUS1;
diff --git a/regexp.h b/regexp.h
index 4ea7c694e9..c48574a43f 100644
--- a/regexp.h
+++ b/regexp.h
@@ -180,34 +180,12 @@ typedef struct {
typedef I32 CHECKPOINT;
-typedef enum {
- resume_TRIE1,
- resume_TRIE2,
- resume_EVAL,
- resume_CURLYX,
- resume_WHILEM1,
- resume_WHILEM2,
- resume_WHILEM3,
- resume_WHILEM4,
- resume_WHILEM5,
- resume_WHILEM6,
- resume_CURLYM1,
- resume_CURLYM2,
- resume_CURLYM3,
- resume_IFMATCH,
- resume_PLUS1,
- resume_PLUS2,
- resume_PLUS3,
- resume_PLUS4
-} regmatch_resume_states;
-
-
typedef struct regmatch_state {
/* these vars contain state that needs to be maintained
* across the main while loop ... */
- regmatch_resume_states resume_state; /* where to jump to on return */
+ int resume_state; /* where to jump to on return */
regnode *scan; /* Current node. */
regnode *next; /* Next node. */
bool minmod; /* the next "{n,m}" is a "{n,m}?" */
@@ -243,6 +221,7 @@ typedef struct regmatch_state {
int toggleutf;
CHECKPOINT cp; /* remember current savestack indexes */
CHECKPOINT lastcp;
+ regnode *B; /* the node following us */
} eval;
struct {
@@ -269,15 +248,21 @@ typedef struct regmatch_state {
} whilem;
struct {
+ I32 lastparen;
+ regnode *next_branch; /* next branch node */
+ CHECKPOINT cp;
+ } branch;
+
+ struct {
/* this first element must match u.yes */
struct regmatch_state *prev_yes_state;
- I32 paren;
I32 c1, c2; /* case fold search */
- CHECKPOINT lastcp;
- I32 l;
- I32 matches;
- I32 maxwanted;
+ CHECKPOINT cp;
+ I32 alen; /* length of first-matched A string */
+ I32 count;
bool minmod;
+ regnode *A, *B; /* the nodes corresponding to /A*B/ */
+ regnode *me; /* the curlym node */
} curlym;
struct {
@@ -293,6 +278,7 @@ typedef struct regmatch_state {
/* this first element must match u.yes */
struct regmatch_state *prev_yes_state;
I32 wanted;
+ regnode *me; /* the IFMATCH/SUSPEND/UNLESSM node */
} ifmatch; /* and SUSPEND/UNLESSM */
} u;
} regmatch_state;
diff --git a/regnodes.h b/regnodes.h
index 312530d1a4..3da3badf4c 100644
--- a/regnodes.h
+++ b/regnodes.h
@@ -69,6 +69,7 @@
#define TRIE 62 /* 0x3e Match many EXACT(FL?)? at once. flags==type */
#define TRIEC 63 /* 0x3f Trie + charclass. (unused at present) */
#define PSEUDO 64 /* 0x40 Pseudo opcode for internal use. */
+#define REGNODE_MAX 64
#ifndef DOINIT
EXTCONST U8 PL_regkind[];
diff --git a/t/op/re_tests b/t/op/re_tests
index 394a66591f..3e11a91fe6 100644
--- a/t/op/re_tests
+++ b/t/op/re_tests
@@ -971,3 +971,4 @@ x(?# x c - Sequence (?#... not terminated
^((?:aa)*)(?:X+((?:\d+|-)(?:X+(.+))?))?$ aaaaX5 y $1 aaaa
X(A|B||C|D)Y XXXYYY y $& XY # Trie w/ NOTHING
(?i:X([A]|[B]|y[Y]y|[D]|)Y) XXXYYYB y $& XY # Trie w/ NOTHING
+^([a]{1})*$ aa y $1 a