diff options
author | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2008-04-20 17:10:13 +0000 |
---|---|---|
committer | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2008-04-20 17:10:13 +0000 |
commit | 83ba2e28b44dd516ba5e7734a2f9972482a9214c (patch) | |
tree | 0ba1bda6db5b8852aa708d489c3eab201c450ba0 | |
parent | c66bac83039352a3a193c9db0b156fe8faa71296 (diff) | |
download | pcre-83ba2e28b44dd516ba5e7734a2f9972482a9214c.tar.gz |
Slight performance improvement by compiling the new OP_ALLANY opcode for the
metacharacter "." when DOTALL is set. Also, some tidy-ups that follow from the
introduction of that opcode.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@342 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- | ChangeLog | 7 | ||||
-rw-r--r-- | HACKING | 3 | ||||
-rw-r--r-- | pcre_compile.c | 14 | ||||
-rw-r--r-- | pcre_dfa_exec.c | 27 | ||||
-rw-r--r-- | pcre_exec.c | 77 | ||||
-rw-r--r-- | pcre_study.c | 1 | ||||
-rw-r--r-- | testdata/testoutput10 | 4 | ||||
-rw-r--r-- | testdata/testoutput2 | 12 |
8 files changed, 54 insertions, 91 deletions
@@ -86,6 +86,13 @@ Version 7.7 05-Mar-08 it was being rejected as not supported by pcre_dfa_exec(), even though other assertions are supported. I have made pcre_dfa_exec() support (*FAIL). + +16. The implementation of 13c above involved the invention of a new opcode, + OP_ALLANY, which is like OP_ANY but doesn't check the /s flag. Since /s + cannot be changed at match time, I realized I could make a small + improvement to matching performance by compiling OP_ALLANY instead of + OP_ANY for "." when DOTALL was set, and then removing the runtime tests + on the OP_ANY path. Version 7.6 28-Jan-08 @@ -125,7 +125,8 @@ Opcodes with no following data These items are all just one byte long OP_END end of pattern - OP_ANY match any character + OP_ANY match any one character other than newline + OP_ALLANY match any one character, including newline OP_ANYBYTE match any single byte, even in UTF-8 mode OP_SOD match start of data: \A OP_SOM, start of match (subject + offset): \G diff --git a/pcre_compile.c b/pcre_compile.c index e4dd87b..cd58e4b 100644 --- a/pcre_compile.c +++ b/pcre_compile.c @@ -1301,6 +1301,7 @@ for (;;) case OP_NOT_WORDCHAR: case OP_WORDCHAR: case OP_ANY: + case OP_ALLANY: branchlength++; cc++; break; @@ -1679,6 +1680,7 @@ for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE case OP_NOT_WORDCHAR: case OP_WORDCHAR: case OP_ANY: + case OP_ALLANY: case OP_ANYBYTE: case OP_CHAR: case OP_CHARNC: @@ -2665,7 +2667,7 @@ for (;; ptr++) zerofirstbyte = firstbyte; zeroreqbyte = reqbyte; previous = code; - *code++ = OP_ANY; + *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY; break; @@ -5753,14 +5755,14 @@ do { if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE; } - /* .* is not anchored unless DOTALL is set and it isn't in brackets that - are or may be referenced. */ + /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and + it isn't in brackets that are or may be referenced. 
*/ else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR || - op == OP_TYPEPOSSTAR) && - (*options & PCRE_DOTALL) != 0) + op == OP_TYPEPOSSTAR)) { - if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE; + if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0) + return FALSE; } /* Check for explicit anchoring */ diff --git a/pcre_dfa_exec.c b/pcre_dfa_exec.c index 0c8f219..5e6b01d 100644 --- a/pcre_dfa_exec.c +++ b/pcre_dfa_exec.c @@ -739,7 +739,7 @@ for (;;) /*-----------------------------------------------------------------*/ case OP_ANY: - if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || !IS_NEWLINE(ptr))) + if (clen > 0 && !IS_NEWLINE(ptr)) { ADD_NEW(state_offset + 1, 0); } break; @@ -877,10 +877,7 @@ for (;;) { if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || (c < 256 && - (d != OP_ANY || - (ims & PCRE_DOTALL) != 0 || - !IS_NEWLINE(ptr) - ) && + (d != OP_ANY || !IS_NEWLINE(ptr)) && ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) { if (count > 0 && codevalue == OP_TYPEPOSPLUS) @@ -903,10 +900,7 @@ for (;;) { if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || (c < 256 && - (d != OP_ANY || - (ims & PCRE_DOTALL) != 0 || - !IS_NEWLINE(ptr) - ) && + (d != OP_ANY || !IS_NEWLINE(ptr)) && ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) { if (codevalue == OP_TYPEPOSQUERY) @@ -928,10 +922,7 @@ for (;;) { if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || (c < 256 && - (d != OP_ANY || - (ims & PCRE_DOTALL) != 0 || - !IS_NEWLINE(ptr) - ) && + (d != OP_ANY || !IS_NEWLINE(ptr)) && ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) { if (codevalue == OP_TYPEPOSSTAR) @@ -951,10 +942,7 @@ for (;;) { if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || (c < 256 && - (d != OP_ANY || - (ims & PCRE_DOTALL) != 0 || - !IS_NEWLINE(ptr) - ) && + (d != OP_ANY || !IS_NEWLINE(ptr)) && ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) { if (++count >= 
GET2(code, 1)) @@ -975,10 +963,7 @@ for (;;) { if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || (c < 256 && - (d != OP_ANY || - (ims & PCRE_DOTALL) != 0 || - !IS_NEWLINE(ptr) - ) && + (d != OP_ANY || !IS_NEWLINE(ptr)) && ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) { if (codevalue == OP_TYPEPOSUPTO) diff --git a/pcre_exec.c b/pcre_exec.c index 526658a..328060f 100644 --- a/pcre_exec.c +++ b/pcre_exec.c @@ -1429,16 +1429,12 @@ for (;;) /* Match a single character type; inline for speed */ case OP_ANY: - if ((ims & PCRE_DOTALL) == 0) - { - if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); - } + if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); /* Fall through */ case OP_ALLANY: if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH); - if (utf8) - while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; + if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; ecode++; break; @@ -2955,8 +2951,7 @@ for (;;) case OP_ANY: for (i = 1; i <= min; i++) { - if (eptr >= md->end_subject || - ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr))) + if (eptr >= md->end_subject || IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); eptr++; while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; @@ -3180,15 +3175,11 @@ for (;;) switch(ctype) { case OP_ANY: - if ((ims & PCRE_DOTALL) == 0) + for (i = 1; i <= min; i++) { - for (i = 1; i <= min; i++) - { - if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); - eptr++; - } + if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); + eptr++; } - else eptr += min; break; case OP_ALLANY: @@ -3449,14 +3440,13 @@ for (;;) RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max || eptr >= md->end_subject || - (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 && - IS_NEWLINE(eptr))) + (ctype == OP_ANY && IS_NEWLINE(eptr))) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); switch(ctype) { - case OP_ANY: /* This is the DOTALL case */ + case OP_ANY: /* This is the non-NL case 
*/ case OP_ALLANY: case OP_ANYBYTE: break; @@ -3609,13 +3599,13 @@ for (;;) RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max || eptr >= md->end_subject || - ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr))) + (ctype == OP_ANY && IS_NEWLINE(eptr))) RRETURN(MATCH_NOMATCH); c = *eptr++; switch(ctype) { - case OP_ANY: /* This is the DOTALL case */ + case OP_ANY: /* This is the non-NL case */ case OP_ALLANY: case OP_ANYBYTE: break; @@ -3870,23 +3860,11 @@ for (;;) case OP_ANY: if (max < INT_MAX) { - if ((ims & PCRE_DOTALL) == 0) - { - for (i = min; i < max; i++) - { - if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break; - eptr++; - while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; - } - } - else + for (i = min; i < max; i++) { - for (i = min; i < max; i++) - { - if (eptr >= md->end_subject) break; - eptr++; - while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; - } + if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break; + eptr++; + while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; } } @@ -3894,18 +3872,11 @@ for (;;) else { - if ((ims & PCRE_DOTALL) == 0) - { - for (i = min; i < max; i++) - { - if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break; - eptr++; - while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; - } - } - else + for (i = min; i < max; i++) { - eptr = md->end_subject; + if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break; + eptr++; + while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; } } break; @@ -4108,16 +4079,12 @@ for (;;) switch(ctype) { case OP_ANY: - if ((ims & PCRE_DOTALL) == 0) + for (i = min; i < max; i++) { - for (i = min; i < max; i++) - { - if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break; - eptr++; - } - break; + if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break; + eptr++; } - /* For DOTALL case, fall through */ + break; case OP_ALLANY: case OP_ANYBYTE: diff --git a/pcre_study.c b/pcre_study.c 
index 216c889..1041a00 100644 --- a/pcre_study.c +++ b/pcre_study.c @@ -348,6 +348,7 @@ do switch(tcode[1]) { case OP_ANY: + case OP_ALLANY: return SSB_FAIL; case OP_NOT_DIGIT: diff --git a/testdata/testoutput10 b/testdata/testoutput10 index dbd5924..4eaaa39 100644 --- a/testdata/testoutput10 +++ b/testdata/testoutput10 @@ -21,7 +21,7 @@ Memory allocation (code space): 25 ------------------------------------------------------------------ 0 21 Bra 3 9 CBra 1 - 8 Any* + 8 AllAny* 10 X 12 6 Alt 15 ^ @@ -37,7 +37,7 @@ Memory allocation (code space): 29 0 25 Bra 3 9 Bra 6 04 Opt - 8 Any* + 8 AllAny* 10 X 12 8 Alt 15 04 Opt diff --git a/testdata/testoutput2 b/testdata/testoutput2 index fe32975..4d6d688 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -1126,7 +1126,7 @@ Need char = 'X' /.*X/IDZs ------------------------------------------------------------------ Bra - Any* + AllAny* X Ket End @@ -1160,7 +1160,7 @@ No need char ------------------------------------------------------------------ Bra CBra 1 - Any* + AllAny* X Alt ^ @@ -1179,7 +1179,7 @@ No need char ------------------------------------------------------------------ Bra CBra 1 - Any* + AllAny* X Alt ^ @@ -1199,7 +1199,7 @@ No need char Bra Bra 04 Opt - Any* + AllAny* X Alt 04 Opt @@ -1212,8 +1212,8 @@ No need char ------------------------------------------------------------------ Capturing subpattern count = 0 Partial matching not supported -No options -First char at start or follows newline +Options: anchored +No first char No need char /\Biss\B/I+ |