summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorYves Orton <demerphq@gmail.com>2006-10-31 23:49:57 +0100
committerRafael Garcia-Suarez <rgarciasuarez@gmail.com>2006-11-01 17:05:03 +0000
commit5461259206276a3618e115d5d68776273bb41ca6 (patch)
tree62be0b5e283fa9c04597fe6ec054cc83064e0f87
parenta983b08dc7c658d412bac21b8ceac00c24ae5c11 (diff)
downloadperl-5461259206276a3618e115d5d68776273bb41ca6.tar.gz
Add a commit verb to regex engine to allow fine tuning of backtracking control.
Message-ID: <9b18b3110610311349n5947cc8fsf0b2e6ddd9a7ee01@mail.gmail.com> p4raw-id: //depot/perl@29183
-rw-r--r--pod/perlre.pod48
-rw-r--r--regcomp.c16
-rw-r--r--regcomp.sym3
-rw-r--r--regexec.c24
-rw-r--r--regnodes.h143
-rwxr-xr-xt/op/pat.t12
6 files changed, 174 insertions, 72 deletions
diff --git a/pod/perlre.pod b/pod/perlre.pod
index b46b3811d0..4e683a37f4 100644
--- a/pod/perlre.pod
+++ b/pod/perlre.pod
@@ -1046,6 +1046,54 @@ to inside of one of these constructs. The following equivalences apply:
PAT?+ (?>PAT?)
PAT{min,max}+ (?>PAT{min,max})
+=item C<(?COMMIT)>
+X<(?COMMIT)>
+
+This zero-width pattern commits the match at the current point, preventing
+the engine from back-tracking on failure to the left of the commit point.
+Consider the pattern C<A (?COMMIT) B>, where A and B are complex patterns.
+Until the C<(?COMMIT)> is reached, A may backtrack as necessary to match.
+Once it is reached, matching continues in B, which may also backtrack as
+necessary; however, should B not match, then no further backtracking will
+take place, and the pattern will fail outright at that starting position.
+
+The following example counts all the possible matching strings in a
+pattern (without actually matching any of them).
+
+ 'aaab'=~/a+b?(?{print "$&\n"; $count++})(?FAIL)/;
+ print "Count=$count\n";
+
+which produces:
+
+ aaab
+ aaa
+ aa
+ a
+ aab
+ aa
+ a
+ ab
+ a
+ Count=9
+
+If we add a C<(?COMMIT)> before the count like the following
+
+ 'aaab'=~/a+b?(?COMMIT)(?{print "$&\n"; $count++})(?FAIL)/;
+ print "Count=$count\n";
+
+we prevent backtracking and find the count of the longest matching
+at each matching startpoint like so:
+
+ aaab
+ aab
+ ab
+ Count=3
+
+Any number of C<(?COMMIT)> assertions may be used in a pattern.
+
+See also C<< (?>pattern) >> and possessive quantifiers for other
+ways to control backtracking.
+
=item C<(?(condition)yes-pattern|no-pattern)>
X<(?()>
diff --git a/regcomp.c b/regcomp.c
index 4e933a99a4..6938954305 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -4717,6 +4717,22 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
case ':': /* (?:...) */
case '>': /* (?>...) */
break;
+ case 'C':
+ if (RExC_parse[0] == 'O' &&
+ RExC_parse[1] == 'M' &&
+ RExC_parse[2] == 'M' &&
+ RExC_parse[3] == 'I' &&
+ RExC_parse[4] == 'T' &&
+ RExC_parse[5] == ')')
+ {
+ RExC_parse+=5;
+ ret = reg_node(pRExC_state, COMMIT);
+ } else {
+ vFAIL("Sequence (?C... not terminated");
+ }
+ nextchar(pRExC_state);
+ return ret;
+ break;
case 'F':
if (RExC_parse[0] == 'A' &&
RExC_parse[1] == 'I' &&
diff --git a/regcomp.sym b/regcomp.sym
index 072b96973b..f60368cb8b 100644
--- a/regcomp.sym
+++ b/regcomp.sym
@@ -170,6 +170,7 @@ DEFINEP DEFINEP, none 1 Never execute directly.
#*Bactracking
OPFAIL OPFAIL, none Same as (?!)
+COMMIT COMMIT, node Pattern fails if backtracking through this
# NEW STUFF ABOVE THIS LINE -- Please update counts below.
@@ -206,4 +207,4 @@ BRANCH next:FAIL
CURLYM A,B:FAIL
IFMATCH A:FAIL
CURLY B_min_known,B_min,B_max:FAIL
-
+COMMIT next:FAIL
diff --git a/regexec.c b/regexec.c
index 9f28e3eb3d..55a90d0d9b 100644
--- a/regexec.c
+++ b/regexec.c
@@ -2571,6 +2571,7 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
U32 state_num;
+ bool no_final = 0; /* if true then we dont backtrack on failure */
/* these three flags are set by various ops to signal information to
* the very next op. They have a useful lifetime of exactly one loop
@@ -4614,6 +4615,12 @@ NULL
if (next == scan)
next = NULL;
break;
+ case COMMIT:
+ PUSH_STATE_GOTO(COMMIT_next,next);
+ /* NOTREACHED */
+ case COMMIT_next_fail:
+ no_final = 1;
+ /* FALLTHROUGH */
case OPFAIL:
sayNO;
default:
@@ -4672,7 +4679,13 @@ yes:
PL_regmatch_slab = PL_regmatch_slab->prev;
st = SLAB_LAST(PL_regmatch_slab);
}
- DEBUG_STATE_pp("pop (yes)");
+ DEBUG_STATE_r(
+ if (no_final) {
+ DEBUG_STATE_pp("pop (no final)");
+ } else {
+ DEBUG_STATE_pp("pop (yes)");
+ }
+ );
depth--;
}
#else
@@ -4690,7 +4703,7 @@ yes:
yes_state = st->u.yes.prev_yes_state;
PL_regmatch_state = st;
- state_num = st->resume_state;
+ state_num = st->resume_state + no_final;
goto reenter_switch;
}
@@ -4709,6 +4722,13 @@ no:
);
no_silent:
+ if (no_final) {
+ if (yes_state) {
+ goto yes;
+ } else {
+ goto final_exit;
+ }
+ }
if (depth) {
/* there's a previous state to backtrack to */
st--;
diff --git a/regnodes.h b/regnodes.h
index 766dcff1ad..c42fcf89af 100644
--- a/regnodes.h
+++ b/regnodes.h
@@ -6,8 +6,8 @@
/* Regops and State definitions */
-#define REGNODE_MAX 75
-#define REGMATCH_STATE_MAX 105
+#define REGNODE_MAX 76
+#define REGMATCH_STATE_MAX 108
#define END 0 /* 0000 End of program. */
#define SUCCEED 1 /* 0x01 Return from a subroutine, basically. */
@@ -83,41 +83,44 @@
#define INSUBP 71 /* 0x47 Whether we are in a specific recurse. */
#define DEFINEP 72 /* 0x48 Never execute directly. */
#define OPFAIL 73 /* 0x49 Same as (?!) */
-#define OPTIMIZED 74 /* 0x4a Placeholder for dump. */
-#define PSEUDO 75 /* 0x4b Pseudo opcode for internal use. */
+#define COMMIT 74 /* 0x4a Pattern fails if backtracking through this */
+#define OPTIMIZED 75 /* 0x4b Placeholder for dump. */
+#define PSEUDO 76 /* 0x4c Pseudo opcode for internal use. */
/* ------------ States ------------- */
-#define TRIE_next 76 /* 0x4c Regmatch state for TRIE */
-#define TRIE_next_fail 77 /* 0x4d Regmatch state for TRIE */
-#define EVAL_AB 78 /* 0x4e Regmatch state for EVAL */
-#define EVAL_AB_fail 79 /* 0x4f Regmatch state for EVAL */
-#define CURLYX_end 80 /* 0x50 Regmatch state for CURLYX */
-#define CURLYX_end_fail 81 /* 0x51 Regmatch state for CURLYX */
-#define WHILEM_A_pre 82 /* 0x52 Regmatch state for WHILEM */
-#define WHILEM_A_pre_fail 83 /* 0x53 Regmatch state for WHILEM */
-#define WHILEM_A_min 84 /* 0x54 Regmatch state for WHILEM */
-#define WHILEM_A_min_fail 85 /* 0x55 Regmatch state for WHILEM */
-#define WHILEM_A_max 86 /* 0x56 Regmatch state for WHILEM */
-#define WHILEM_A_max_fail 87 /* 0x57 Regmatch state for WHILEM */
-#define WHILEM_B_min 88 /* 0x58 Regmatch state for WHILEM */
-#define WHILEM_B_min_fail 89 /* 0x59 Regmatch state for WHILEM */
-#define WHILEM_B_max 90 /* 0x5a Regmatch state for WHILEM */
-#define WHILEM_B_max_fail 91 /* 0x5b Regmatch state for WHILEM */
-#define BRANCH_next 92 /* 0x5c Regmatch state for BRANCH */
-#define BRANCH_next_fail 93 /* 0x5d Regmatch state for BRANCH */
-#define CURLYM_A 94 /* 0x5e Regmatch state for CURLYM */
-#define CURLYM_A_fail 95 /* 0x5f Regmatch state for CURLYM */
-#define CURLYM_B 96 /* 0x60 Regmatch state for CURLYM */
-#define CURLYM_B_fail 97 /* 0x61 Regmatch state for CURLYM */
-#define IFMATCH_A 98 /* 0x62 Regmatch state for IFMATCH */
-#define IFMATCH_A_fail 99 /* 0x63 Regmatch state for IFMATCH */
-#define CURLY_B_min_known 100 /* 0x64 Regmatch state for CURLY */
-#define CURLY_B_min_known_fail 101 /* 0x65 Regmatch state for CURLY */
-#define CURLY_B_min 102 /* 0x66 Regmatch state for CURLY */
-#define CURLY_B_min_fail 103 /* 0x67 Regmatch state for CURLY */
-#define CURLY_B_max 104 /* 0x68 Regmatch state for CURLY */
-#define CURLY_B_max_fail 105 /* 0x69 Regmatch state for CURLY */
+#define TRIE_next 77 /* 0x4d Regmatch state for TRIE */
+#define TRIE_next_fail 78 /* 0x4e Regmatch state for TRIE */
+#define EVAL_AB 79 /* 0x4f Regmatch state for EVAL */
+#define EVAL_AB_fail 80 /* 0x50 Regmatch state for EVAL */
+#define CURLYX_end 81 /* 0x51 Regmatch state for CURLYX */
+#define CURLYX_end_fail 82 /* 0x52 Regmatch state for CURLYX */
+#define WHILEM_A_pre 83 /* 0x53 Regmatch state for WHILEM */
+#define WHILEM_A_pre_fail 84 /* 0x54 Regmatch state for WHILEM */
+#define WHILEM_A_min 85 /* 0x55 Regmatch state for WHILEM */
+#define WHILEM_A_min_fail 86 /* 0x56 Regmatch state for WHILEM */
+#define WHILEM_A_max 87 /* 0x57 Regmatch state for WHILEM */
+#define WHILEM_A_max_fail 88 /* 0x58 Regmatch state for WHILEM */
+#define WHILEM_B_min 89 /* 0x59 Regmatch state for WHILEM */
+#define WHILEM_B_min_fail 90 /* 0x5a Regmatch state for WHILEM */
+#define WHILEM_B_max 91 /* 0x5b Regmatch state for WHILEM */
+#define WHILEM_B_max_fail 92 /* 0x5c Regmatch state for WHILEM */
+#define BRANCH_next 93 /* 0x5d Regmatch state for BRANCH */
+#define BRANCH_next_fail 94 /* 0x5e Regmatch state for BRANCH */
+#define CURLYM_A 95 /* 0x5f Regmatch state for CURLYM */
+#define CURLYM_A_fail 96 /* 0x60 Regmatch state for CURLYM */
+#define CURLYM_B 97 /* 0x61 Regmatch state for CURLYM */
+#define CURLYM_B_fail 98 /* 0x62 Regmatch state for CURLYM */
+#define IFMATCH_A 99 /* 0x63 Regmatch state for IFMATCH */
+#define IFMATCH_A_fail 100 /* 0x64 Regmatch state for IFMATCH */
+#define CURLY_B_min_known 101 /* 0x65 Regmatch state for CURLY */
+#define CURLY_B_min_known_fail 102 /* 0x66 Regmatch state for CURLY */
+#define CURLY_B_min 103 /* 0x67 Regmatch state for CURLY */
+#define CURLY_B_min_fail 104 /* 0x68 Regmatch state for CURLY */
+#define CURLY_B_max 105 /* 0x69 Regmatch state for CURLY */
+#define CURLY_B_max_fail 106 /* 0x6a Regmatch state for CURLY */
+#define COMMIT_next 107 /* 0x6b Regmatch state for COMMIT */
+#define COMMIT_next_fail 108 /* 0x6c Regmatch state for COMMIT */
/* PL_regkind[] What type of regop or state is this. */
@@ -199,6 +202,7 @@ EXTCONST U8 PL_regkind[] = {
INSUBP, /* INSUBP */
DEFINEP, /* DEFINEP */
OPFAIL, /* OPFAIL */
+ COMMIT, /* COMMIT */
NOTHING, /* OPTIMIZED */
PSEUDO, /* PSEUDO */
/* ------------ States ------------- */
@@ -232,6 +236,8 @@ EXTCONST U8 PL_regkind[] = {
CURLY, /* CURLY_B_min_fail */
CURLY, /* CURLY_B_max */
CURLY, /* CURLY_B_max_fail */
+ COMMIT, /* COMMIT_next */
+ COMMIT, /* COMMIT_next_fail */
};
#endif
@@ -313,6 +319,7 @@ static const U8 regarglen[] = {
EXTRA_SIZE(struct regnode_1), /* INSUBP */
EXTRA_SIZE(struct regnode_1), /* DEFINEP */
0, /* OPFAIL */
+ 0, /* COMMIT */
0, /* OPTIMIZED */
0, /* PSEUDO */
};
@@ -394,6 +401,7 @@ static const char reg_off_by_arg[] = {
0, /* INSUBP */
0, /* DEFINEP */
0, /* OPFAIL */
+ 0, /* COMMIT */
0, /* OPTIMIZED */
0, /* PSEUDO */
};
@@ -476,39 +484,42 @@ const char * reg_name[] = {
"INSUBP", /* 0x47 */
"DEFINEP", /* 0x48 */
"OPFAIL", /* 0x49 */
- "OPTIMIZED", /* 0x4a */
- "PSEUDO", /* 0x4b */
+ "COMMIT", /* 0x4a */
+ "OPTIMIZED", /* 0x4b */
+ "PSEUDO", /* 0x4c */
/* ------------ States ------------- */
- "TRIE_next", /* 0x4c */
- "TRIE_next_fail", /* 0x4d */
- "EVAL_AB", /* 0x4e */
- "EVAL_AB_fail", /* 0x4f */
- "CURLYX_end", /* 0x50 */
- "CURLYX_end_fail", /* 0x51 */
- "WHILEM_A_pre", /* 0x52 */
- "WHILEM_A_pre_fail", /* 0x53 */
- "WHILEM_A_min", /* 0x54 */
- "WHILEM_A_min_fail", /* 0x55 */
- "WHILEM_A_max", /* 0x56 */
- "WHILEM_A_max_fail", /* 0x57 */
- "WHILEM_B_min", /* 0x58 */
- "WHILEM_B_min_fail", /* 0x59 */
- "WHILEM_B_max", /* 0x5a */
- "WHILEM_B_max_fail", /* 0x5b */
- "BRANCH_next", /* 0x5c */
- "BRANCH_next_fail", /* 0x5d */
- "CURLYM_A", /* 0x5e */
- "CURLYM_A_fail", /* 0x5f */
- "CURLYM_B", /* 0x60 */
- "CURLYM_B_fail", /* 0x61 */
- "IFMATCH_A", /* 0x62 */
- "IFMATCH_A_fail", /* 0x63 */
- "CURLY_B_min_known", /* 0x64 */
- "CURLY_B_min_known_fail", /* 0x65 */
- "CURLY_B_min", /* 0x66 */
- "CURLY_B_min_fail", /* 0x67 */
- "CURLY_B_max", /* 0x68 */
- "CURLY_B_max_fail", /* 0x69 */
+ "TRIE_next", /* 0x4d */
+ "TRIE_next_fail", /* 0x4e */
+ "EVAL_AB", /* 0x4f */
+ "EVAL_AB_fail", /* 0x50 */
+ "CURLYX_end", /* 0x51 */
+ "CURLYX_end_fail", /* 0x52 */
+ "WHILEM_A_pre", /* 0x53 */
+ "WHILEM_A_pre_fail", /* 0x54 */
+ "WHILEM_A_min", /* 0x55 */
+ "WHILEM_A_min_fail", /* 0x56 */
+ "WHILEM_A_max", /* 0x57 */
+ "WHILEM_A_max_fail", /* 0x58 */
+ "WHILEM_B_min", /* 0x59 */
+ "WHILEM_B_min_fail", /* 0x5a */
+ "WHILEM_B_max", /* 0x5b */
+ "WHILEM_B_max_fail", /* 0x5c */
+ "BRANCH_next", /* 0x5d */
+ "BRANCH_next_fail", /* 0x5e */
+ "CURLYM_A", /* 0x5f */
+ "CURLYM_A_fail", /* 0x60 */
+ "CURLYM_B", /* 0x61 */
+ "CURLYM_B_fail", /* 0x62 */
+ "IFMATCH_A", /* 0x63 */
+ "IFMATCH_A_fail", /* 0x64 */
+ "CURLY_B_min_known", /* 0x65 */
+ "CURLY_B_min_known_fail", /* 0x66 */
+ "CURLY_B_min", /* 0x67 */
+ "CURLY_B_min_fail", /* 0x68 */
+ "CURLY_B_max", /* 0x69 */
+ "CURLY_B_max_fail", /* 0x6a */
+ "COMMIT_next", /* 0x6b */
+ "COMMIT_next_fail", /* 0x6c */
};
#endif /* DEBUGGING */
#else
diff --git a/t/op/pat.t b/t/op/pat.t
index 3cb8e8f660..16862343b9 100755
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -3719,8 +3719,14 @@ sub iseq($$;$) {
';
ok(!$@,'lvalue $+{...} should not throw an exception');
}
-
-
+{
+ our $count = 0;
+ 'aaab'=~/a+b?(?{$count++})(?FAIL)/;
+ iseq($count,9,"expect 9 for no (?COMMIT)");
+ $count = 0;
+ 'aaab'=~/a+b?(?COMMIT)(?{$count++})(?FAIL)/;
+ iseq($count,3,"expect 3 with (?COMMIT)");
+}
# stress test CURLYX/WHILEM.
#
# This test includes varying levels of nesting, and according to
@@ -3860,5 +3866,5 @@ ok((q(a)x 100) =~ /^(??{'(.)'x 100})/,
or print "# Unexpected outcome: should pass or crash perl\n";
# Don't forget to update this!
-BEGIN{print "1..1287\n"};
+BEGIN{print "1..1289\n"};