summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>2011-09-20 15:45:06 +0000
committerph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>2011-09-20 15:45:06 +0000
commit5eab43195462708923e7ef6672fa8feb4493bf33 (patch)
treeb5778f7d9dedeabeba8c70f706a1f38ca02628e9
parent394e5f3cffccee4d0e00248171e2e539b298ccad (diff)
downloadpcre-5eab43195462708923e7ef6672fa8feb4493bf33.tar.gz
Restore tail-recursion optimizations when no (*THEN) in pattern.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@702 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r--ChangeLog8
-rw-r--r--pcre_compile.c1
-rw-r--r--pcre_exec.c59
-rw-r--r--pcre_internal.h2
-rw-r--r--testdata/testinput23
-rw-r--r--testdata/testoutput220
6 files changed, 71 insertions, 22 deletions
diff --git a/ChangeLog b/ChangeLog
index cd04cd9..f2bbdc9 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -59,6 +59,12 @@ Version 8.20 12-Sep-2011
10. A pathological pattern such as /(*ACCEPT)a/ was miscompiled, thinking that
the first byte in a match must be "a".
+
+11. Change 17 for 8.13 increased the recursion depth for patterns like
+ /a(?:.)*?a/ drastically. I've improved things by remembering whether a
+ pattern contains any instances of (*THEN). If it does not, the old
+ optimizations are restored. It would be nice to do this on a per-group
+ basis, but at the moment that is not feasible.
Version 8.13 16-Aug-2011
@@ -158,7 +164,7 @@ Version 8.13 16-Aug-2011
tail recursion to cut down on stack usage. Unfortunately, now that there is
the possibility of (*THEN) occurring in these branches, tail recursion is
no longer possible because the return has to be checked for (*THEN). These
- two optimizations have therefore been removed.
+ two optimizations have therefore been removed. [But see 8.20/11 above.]
18. If a pattern containing \R was studied, it was assumed that \R always
matched two bytes, thus causing the minimum subject length to be
diff --git a/pcre_compile.c b/pcre_compile.c
index b6d3637..3519aa8 100644
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -5064,6 +5064,7 @@ for (;; ptr++)
{
PUT(code, 0, code - bcptr->current_branch - 1);
code += LINK_SIZE;
+ cd->external_flags |= PCRE_HASTHEN;
}
}
diff --git a/pcre_exec.c b/pcre_exec.c
index 38447a9..8d73a70 100644
--- a/pcre_exec.c
+++ b/pcre_exec.c
@@ -870,12 +870,17 @@ for (;;)
/* VVVVVVVVVVVVVVVVVVVVVVVVV */
/* Non-capturing or atomic group, except for possessive with unlimited
- repeat. Loop for all the alternatives. When we get to the final alternative
- within the brackets, we used to return the result of a recursive call to
- match() whatever happened so it was possible to reduce stack usage by
- turning this into a tail recursion, except in the case of a possibly empty
- group. However, now that there is the possiblity of (*THEN) occurring in
- the final alternative, this optimization is no longer possible.
+ repeat. Loop for all the alternatives.
+
+ When we get to the final alternative within the brackets, we used to return
+ the result of a recursive call to match() whatever happened so it was
+ possible to reduce stack usage by turning this into a tail recursion,
+ except in the case of a possibly empty group. However, now that there is
+ the possiblity of (*THEN) occurring in the final alternative, this
+ optimization is no longer always possible.
+
+ We can optimize if we know there are no (*THEN)s in the pattern; at present
+ this is the best that can be done.
MATCH_ONCE is returned when the end of an atomic group is successfully
reached, but subsequent matching fails. It passes back up the tree (causing
@@ -892,7 +897,20 @@ for (;;)
for (;;)
{
if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
- RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
+
+ /* If this is not a possibly empty group, and there are no (*THEN)s in
+ the pattern, and this is the final alternative, optimize as described
+ above. */
+
+ else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
+ {
+ ecode += _pcre_OP_lengths[*ecode];
+ goto TAIL_RECURSE;
+ }
+
+ /* In all other cases, we have to make another call to match(). */
+
+ RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
RM2);
if (rrc != MATCH_NOMATCH &&
(rrc != MATCH_THEN || md->start_match_ptr != ecode))
@@ -1264,16 +1282,25 @@ for (;;)
}
/* We are now at the branch that is to be obeyed. As there is only one,
- we used to use tail recursion to avoid using another stack frame, except
- when there was unlimited repeat of a possibly empty group. However, that
- strategy no longer works because of the possibilty of (*THEN) being
- encountered in the branch. A recursive call to match() is always required,
- unless the second alternative doesn't exist, in which case we can just
- plough on. */
+ we used always to use tail recursion to avoid using another stack frame,
+ except when there was unlimited repeat of a possibly empty group. However,
+ that strategy no longer works because of the possibilty of (*THEN) being
+ encountered in the branch. However, we can still use tail recursion if
+ there are no (*THEN)s in the pattern. Otherwise, a recursive call to
+ match() is always required, unless the second alternative doesn't exist, in
+ which case we can just plough on. */
if (condition || *ecode == OP_ALT)
{
if (op == OP_SCOND) md->match_function_type = MATCH_CBEGROUP;
+ else if (!md->hasthen)
+ {
+ ecode += 1 + LINK_SIZE;
+ goto TAIL_RECURSE;
+ }
+
+ /* A call to match() is required. */
+
RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
/* If the result is THEN from within the "true" branch of the condition,
@@ -1292,7 +1319,10 @@ for (;;)
}
RRETURN(rrc);
}
- else /* Condition false & no alternative */
+
+ /* Condition false & no alternative; continue after the group. */
+
+ else
{
ecode += 1 + LINK_SIZE;
}
@@ -5945,6 +5975,7 @@ md->hitend = FALSE;
md->mark = NULL; /* In case never set */
md->recursive = NULL; /* No recursion at top level */
+md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
md->lcc = tables + lcc_offset;
md->ctypes = tables + ctypes_offset;
diff --git a/pcre_internal.h b/pcre_internal.h
index be50ca2..54d3c26 100644
--- a/pcre_internal.h
+++ b/pcre_internal.h
@@ -594,6 +594,7 @@ compatibility. */
#define PCRE_STARTLINE 0x0008 /* start after \n for multiline */
#define PCRE_JCHANGED 0x0010 /* j option used in regex */
#define PCRE_HASCRORLF 0x0020 /* explicit \r or \n in pattern */
+#define PCRE_HASTHEN 0x0040 /* pattern contains (*THEN) */
/* Flags for the "extra" block produced by pcre_study(). */
@@ -1820,6 +1821,7 @@ typedef struct match_data {
BOOL notempty_atstart; /* Empty string match at start not wanted */
BOOL hitend; /* Hit the end of the subject at some point */
BOOL bsr_anycrlf; /* \R is just any CRLF, not full Unicode */
+ BOOL hasthen; /* Pattern contains (*THEN) */
const uschar *start_code; /* For use when recursing */
USPTR start_subject; /* Start of the subject string */
USPTR end_subject; /* End of the subject string */
diff --git a/testdata/testinput2 b/testdata/testinput2
index c43cc80..ba768ce 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -3880,4 +3880,7 @@ with \Y. ---/
/z(*ACCEPT)a/+I
baxzbx
+/a(?:.)*?a/ims
+ \Mabbbbbbbbbbbbbbbbbbbbba
+
/-- End of testinput2 --/
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index 6895e44..d4d6979 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -4430,12 +4430,12 @@ No first char
Need char = 'z'
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazzbbbbbb\M
Minimum match() limit = 8
-Minimum match() recursion limit = 7
+Minimum match() recursion limit = 6
0: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazz
1: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaz\M
Minimum match() limit = 32768
-Minimum match() recursion limit = 43
+Minimum match() recursion limit = 42
No match
/(aaa(?C1)bbb|ab)/I
@@ -6666,7 +6666,7 @@ No first char
No need char
/* this is a C style comment */\M
Minimum match() limit = 120
-Minimum match() recursion limit = 35
+Minimum match() recursion limit = 6
0: /* this is a C style comment */
1: /* this is a C style comment */
@@ -11951,22 +11951,22 @@ No set of starting bytes
/^(?>a)++/
aa\M
Minimum match() limit = 5
-Minimum match() recursion limit = 3
+Minimum match() recursion limit = 2
0: aa
aaaaaaaaa\M
Minimum match() limit = 12
-Minimum match() recursion limit = 3
+Minimum match() recursion limit = 2
0: aaaaaaaaa
/(a)(?1)++/
aa\M
Minimum match() limit = 7
-Minimum match() recursion limit = 5
+Minimum match() recursion limit = 4
0: aa
1: a
aaaaaaaaa\M
Minimum match() limit = 21
-Minimum match() recursion limit = 5
+Minimum match() recursion limit = 4
0: aaaaaaaaa
1: a
@@ -12322,4 +12322,10 @@ No need char
0: z
0+ bx
+/a(?:.)*?a/ims
+ \Mabbbbbbbbbbbbbbbbbbbbba
+Minimum match() limit = 65
+Minimum match() recursion limit = 2
+ 0: abbbbbbbbbbbbbbbbbbbbba
+
/-- End of testinput2 --/