diff options
author | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2011-09-20 15:45:06 +0000 |
---|---|---|
committer | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2011-09-20 15:45:06 +0000 |
commit | 5eab43195462708923e7ef6672fa8feb4493bf33 (patch) | |
tree | b5778f7d9dedeabeba8c70f706a1f38ca02628e9 | |
parent | 394e5f3cffccee4d0e00248171e2e539b298ccad (diff) | |
download | pcre-5eab43195462708923e7ef6672fa8feb4493bf33.tar.gz |
Restore tail-recursion optimizations when no (*THEN) in pattern.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@702 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- | ChangeLog | 8 | ||||
-rw-r--r-- | pcre_compile.c | 1 | ||||
-rw-r--r-- | pcre_exec.c | 59 | ||||
-rw-r--r-- | pcre_internal.h | 2 | ||||
-rw-r--r-- | testdata/testinput2 | 3 | ||||
-rw-r--r-- | testdata/testoutput2 | 20 |
6 files changed, 71 insertions, 22 deletions
@@ -59,6 +59,12 @@ Version 8.20 12-Sep-2011 10. A pathological pattern such as /(*ACCEPT)a/ was miscompiled, thinking that the first byte in a match must be "a". + +11. Change 17 for 8.13 increased the recursion depth for patterns like + /a(?:.)*?a/ drastically. I've improved things by remembering whether a + pattern contains any instances of (*THEN). If it does not, the old + optimizations are restored. It would be nice to do this on a per-group + basis, but at the moment that is not feasible. Version 8.13 16-Aug-2011 @@ -158,7 +164,7 @@ Version 8.13 16-Aug-2011 tail recursion to cut down on stack usage. Unfortunately, now that there is the possibility of (*THEN) occurring in these branches, tail recursion is no longer possible because the return has to be checked for (*THEN). These - two optimizations have therefore been removed. + two optimizations have therefore been removed. [But see 8.20/11 above.] 18. If a pattern containing \R was studied, it was assumed that \R always matched two bytes, thus causing the minimum subject length to be diff --git a/pcre_compile.c b/pcre_compile.c index b6d3637..3519aa8 100644 --- a/pcre_compile.c +++ b/pcre_compile.c @@ -5064,6 +5064,7 @@ for (;; ptr++) { PUT(code, 0, code - bcptr->current_branch - 1); code += LINK_SIZE; + cd->external_flags |= PCRE_HASTHEN; } } diff --git a/pcre_exec.c b/pcre_exec.c index 38447a9..8d73a70 100644 --- a/pcre_exec.c +++ b/pcre_exec.c @@ -870,12 +870,17 @@ for (;;) /* VVVVVVVVVVVVVVVVVVVVVVVVV */ /* Non-capturing or atomic group, except for possessive with unlimited - repeat. Loop for all the alternatives. When we get to the final alternative - within the brackets, we used to return the result of a recursive call to - match() whatever happened so it was possible to reduce stack usage by - turning this into a tail recursion, except in the case of a possibly empty - group. However, now that there is the possiblity of (*THEN) occurring in - the final alternative, this optimization is no longer possible. + repeat. Loop for all the alternatives. + + When we get to the final alternative within the brackets, we used to return + the result of a recursive call to match() whatever happened so it was + possible to reduce stack usage by turning this into a tail recursion, + except in the case of a possibly empty group. However, now that there is + the possiblity of (*THEN) occurring in the final alternative, this + optimization is no longer always possible. + + We can optimize if we know there are no (*THEN)s in the pattern; at present + this is the best that can be done. MATCH_ONCE is returned when the end of an atomic group is successfully reached, but subsequent matching fails. It passes back up the tree (causing @@ -892,7 +897,20 @@ for (;;) for (;;) { if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP; - RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb, + + /* If this is not a possibly empty group, and there are no (*THEN)s in + the pattern, and this is the final alternative, optimize as described + above. */ + + else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT) + { + ecode += _pcre_OP_lengths[*ecode]; + goto TAIL_RECURSE; + } + + /* In all other cases, we have to make another call to match(). */ + + RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb, RM2); if (rrc != MATCH_NOMATCH && (rrc != MATCH_THEN || md->start_match_ptr != ecode)) @@ -1264,16 +1282,25 @@ for (;;) } /* We are now at the branch that is to be obeyed. As there is only one, - we used to use tail recursion to avoid using another stack frame, except - when there was unlimited repeat of a possibly empty group. However, that - strategy no longer works because of the possibilty of (*THEN) being - encountered in the branch. A recursive call to match() is always required, - unless the second alternative doesn't exist, in which case we can just - plough on. */ + we used always to use tail recursion to avoid using another stack frame, + except when there was unlimited repeat of a possibly empty group. However, + that strategy no longer works because of the possibilty of (*THEN) being + encountered in the branch. However, we can still use tail recursion if + there are no (*THEN)s in the pattern. Otherwise, a recursive call to + match() is always required, unless the second alternative doesn't exist, in + which case we can just plough on. */ if (condition || *ecode == OP_ALT) { if (op == OP_SCOND) md->match_function_type = MATCH_CBEGROUP; + else if (!md->hasthen) + { + ecode += 1 + LINK_SIZE; + goto TAIL_RECURSE; + } + + /* A call to match() is required. */ + RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49); /* If the result is THEN from within the "true" branch of the condition, @@ -1292,7 +1319,10 @@ for (;;) } RRETURN(rrc); } - else /* Condition false & no alternative */ + + /* Condition false & no alternative; continue after the group. */ + + else { ecode += 1 + LINK_SIZE; } @@ -5945,6 +5975,7 @@ md->hitend = FALSE; md->mark = NULL; /* In case never set */ md->recursive = NULL; /* No recursion at top level */ +md->hasthen = (re->flags & PCRE_HASTHEN) != 0; md->lcc = tables + lcc_offset; md->ctypes = tables + ctypes_offset; diff --git a/pcre_internal.h b/pcre_internal.h index be50ca2..54d3c26 100644 --- a/pcre_internal.h +++ b/pcre_internal.h @@ -594,6 +594,7 @@ compatibility. */ #define PCRE_STARTLINE 0x0008 /* start after \n for multiline */ #define PCRE_JCHANGED 0x0010 /* j option used in regex */ #define PCRE_HASCRORLF 0x0020 /* explicit \r or \n in pattern */ +#define PCRE_HASTHEN 0x0040 /* pattern contains (*THEN) */ /* Flags for the "extra" block produced by pcre_study(). */ @@ -1820,6 +1821,7 @@ typedef struct match_data { BOOL notempty_atstart; /* Empty string match at start not wanted */ BOOL hitend; /* Hit the end of the subject at some point */ BOOL bsr_anycrlf; /* \R is just any CRLF, not full Unicode */ + BOOL hasthen; /* Pattern contains (*THEN) */ const uschar *start_code; /* For use when recursing */ USPTR start_subject; /* Start of the subject string */ USPTR end_subject; /* End of the subject string */ diff --git a/testdata/testinput2 b/testdata/testinput2 index c43cc80..ba768ce 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -3880,4 +3880,7 @@ with \Y. ---/ /z(*ACCEPT)a/+I baxzbx +/a(?:.)*?a/ims + \Mabbbbbbbbbbbbbbbbbbbbba + /-- End of testinput2 --/ diff --git a/testdata/testoutput2 b/testdata/testoutput2 index 6895e44..d4d6979 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -4430,12 +4430,12 @@ No first char Need char = 'z' aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazzbbbbbb\M Minimum match() limit = 8 -Minimum match() recursion limit = 7 +Minimum match() recursion limit = 6 0: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazz 1: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa aaaaaaaaaaaaaz\M Minimum match() limit = 32768 -Minimum match() recursion limit = 43 +Minimum match() recursion limit = 42 No match /(aaa(?C1)bbb|ab)/I @@ -6666,7 +6666,7 @@ No first char No need char /* this is a C style comment */\M Minimum match() limit = 120 -Minimum match() recursion limit = 35 +Minimum match() recursion limit = 6 0: /* this is a C style comment */ 1: /* this is a C style comment */ @@ -11951,22 +11951,22 @@ No set of starting bytes /^(?>a)++/ aa\M Minimum match() limit = 5 -Minimum match() recursion limit = 3 +Minimum match() recursion limit = 2 0: aa aaaaaaaaa\M Minimum match() limit = 12 -Minimum match() recursion limit = 3 +Minimum match() recursion limit = 2 0: aaaaaaaaa /(a)(?1)++/ aa\M Minimum match() limit = 7 -Minimum match() recursion limit = 5 +Minimum match() recursion limit = 4 0: aa 1: a aaaaaaaaa\M Minimum match() limit = 21 -Minimum match() recursion limit = 5 +Minimum match() recursion limit = 4 0: aaaaaaaaa 1: a @@ -12322,4 +12322,10 @@ No need char 0: z 0+ bx +/a(?:.)*?a/ims + \Mabbbbbbbbbbbbbbbbbbbbba +Minimum match() limit = 65 +Minimum match() recursion limit = 2 + 0: abbbbbbbbbbbbbbbbbbbbba + /-- End of testinput2 --/ |