summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2008-04-20 17:10:13 +0000
committer: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2008-04-20 17:10:13 +0000
commit: 83ba2e28b44dd516ba5e7734a2f9972482a9214c (patch)
tree: 0ba1bda6db5b8852aa708d489c3eab201c450ba0
parent: c66bac83039352a3a193c9db0b156fe8faa71296 (diff)
download: pcre-83ba2e28b44dd516ba5e7734a2f9972482a9214c.tar.gz
Slight performance improvement by using the new OP_ALLANY opcode for cases of
the metacharacter "." when DOTALL is set. Also, some tidies consequent upon
its invention.

git-svn-id: svn://vcs.exim.org/pcre/code/trunk@342 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r--  ChangeLog              7
-rw-r--r--  HACKING                3
-rw-r--r--  pcre_compile.c        14
-rw-r--r--  pcre_dfa_exec.c       27
-rw-r--r--  pcre_exec.c           77
-rw-r--r--  pcre_study.c           1
-rw-r--r--  testdata/testoutput10  4
-rw-r--r--  testdata/testoutput2  12
8 files changed, 54 insertions, 91 deletions
diff --git a/ChangeLog b/ChangeLog
index 64d7d23..b5dfda8 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -86,6 +86,13 @@ Version 7.7 05-Mar-08
it was being rejected as not supported by pcre_dfa_exec(), even though
other assertions are supported. I have made pcre_dfa_exec() support
(*FAIL).
+
+16. The implementation of 13c above involved the invention of a new opcode,
+ OP_ALLANY, which is like OP_ANY but doesn't check the /s flag. Since /s
+ cannot be changed at match time, I realized I could make a small
+ improvement to matching performance by compiling OP_ALLANY instead of
+ OP_ANY for "." when DOTALL was set, and then removing the runtime tests
+ on the OP_ANY path.
Version 7.6 28-Jan-08
diff --git a/HACKING b/HACKING
index e76341f..1f30d4c 100644
--- a/HACKING
+++ b/HACKING
@@ -125,7 +125,8 @@ Opcodes with no following data
These items are all just one byte long
OP_END end of pattern
- OP_ANY match any character
+ OP_ANY match any one character other than newline
+ OP_ALLANY match any one character, including newline
OP_ANYBYTE match any single byte, even in UTF-8 mode
OP_SOD match start of data: \A
OP_SOM, start of match (subject + offset): \G
diff --git a/pcre_compile.c b/pcre_compile.c
index e4dd87b..cd58e4b 100644
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -1301,6 +1301,7 @@ for (;;)
case OP_NOT_WORDCHAR:
case OP_WORDCHAR:
case OP_ANY:
+ case OP_ALLANY:
branchlength++;
cc++;
break;
@@ -1679,6 +1680,7 @@ for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE
case OP_NOT_WORDCHAR:
case OP_WORDCHAR:
case OP_ANY:
+ case OP_ALLANY:
case OP_ANYBYTE:
case OP_CHAR:
case OP_CHARNC:
@@ -2665,7 +2667,7 @@ for (;; ptr++)
zerofirstbyte = firstbyte;
zeroreqbyte = reqbyte;
previous = code;
- *code++ = OP_ANY;
+ *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
break;
@@ -5753,14 +5755,14 @@ do {
if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
}
- /* .* is not anchored unless DOTALL is set and it isn't in brackets that
- are or may be referenced. */
+ /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
+ it isn't in brackets that are or may be referenced. */
else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
- op == OP_TYPEPOSSTAR) &&
- (*options & PCRE_DOTALL) != 0)
+ op == OP_TYPEPOSSTAR))
{
- if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
+ if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0)
+ return FALSE;
}
/* Check for explicit anchoring */
diff --git a/pcre_dfa_exec.c b/pcre_dfa_exec.c
index 0c8f219..5e6b01d 100644
--- a/pcre_dfa_exec.c
+++ b/pcre_dfa_exec.c
@@ -739,7 +739,7 @@ for (;;)
/*-----------------------------------------------------------------*/
case OP_ANY:
- if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || !IS_NEWLINE(ptr)))
+ if (clen > 0 && !IS_NEWLINE(ptr))
{ ADD_NEW(state_offset + 1, 0); }
break;
@@ -877,10 +877,7 @@ for (;;)
{
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
(c < 256 &&
- (d != OP_ANY ||
- (ims & PCRE_DOTALL) != 0 ||
- !IS_NEWLINE(ptr)
- ) &&
+ (d != OP_ANY || !IS_NEWLINE(ptr)) &&
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
{
if (count > 0 && codevalue == OP_TYPEPOSPLUS)
@@ -903,10 +900,7 @@ for (;;)
{
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
(c < 256 &&
- (d != OP_ANY ||
- (ims & PCRE_DOTALL) != 0 ||
- !IS_NEWLINE(ptr)
- ) &&
+ (d != OP_ANY || !IS_NEWLINE(ptr)) &&
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
{
if (codevalue == OP_TYPEPOSQUERY)
@@ -928,10 +922,7 @@ for (;;)
{
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
(c < 256 &&
- (d != OP_ANY ||
- (ims & PCRE_DOTALL) != 0 ||
- !IS_NEWLINE(ptr)
- ) &&
+ (d != OP_ANY || !IS_NEWLINE(ptr)) &&
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
{
if (codevalue == OP_TYPEPOSSTAR)
@@ -951,10 +942,7 @@ for (;;)
{
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
(c < 256 &&
- (d != OP_ANY ||
- (ims & PCRE_DOTALL) != 0 ||
- !IS_NEWLINE(ptr)
- ) &&
+ (d != OP_ANY || !IS_NEWLINE(ptr)) &&
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
{
if (++count >= GET2(code, 1))
@@ -975,10 +963,7 @@ for (;;)
{
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
(c < 256 &&
- (d != OP_ANY ||
- (ims & PCRE_DOTALL) != 0 ||
- !IS_NEWLINE(ptr)
- ) &&
+ (d != OP_ANY || !IS_NEWLINE(ptr)) &&
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
{
if (codevalue == OP_TYPEPOSUPTO)
diff --git a/pcre_exec.c b/pcre_exec.c
index 526658a..328060f 100644
--- a/pcre_exec.c
+++ b/pcre_exec.c
@@ -1429,16 +1429,12 @@ for (;;)
/* Match a single character type; inline for speed */
case OP_ANY:
- if ((ims & PCRE_DOTALL) == 0)
- {
- if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
- }
+ if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
/* Fall through */
case OP_ALLANY:
if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
- if (utf8)
- while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+ if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
ecode++;
break;
@@ -2955,8 +2951,7 @@ for (;;)
case OP_ANY:
for (i = 1; i <= min; i++)
{
- if (eptr >= md->end_subject ||
- ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
+ if (eptr >= md->end_subject || IS_NEWLINE(eptr))
RRETURN(MATCH_NOMATCH);
eptr++;
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
@@ -3180,15 +3175,11 @@ for (;;)
switch(ctype)
{
case OP_ANY:
- if ((ims & PCRE_DOTALL) == 0)
+ for (i = 1; i <= min; i++)
{
- for (i = 1; i <= min; i++)
- {
- if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
- eptr++;
- }
+ if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
+ eptr++;
}
- else eptr += min;
break;
case OP_ALLANY:
@@ -3449,14 +3440,13 @@ for (;;)
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (fi >= max || eptr >= md->end_subject ||
- (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 &&
- IS_NEWLINE(eptr)))
+ (ctype == OP_ANY && IS_NEWLINE(eptr)))
RRETURN(MATCH_NOMATCH);
GETCHARINC(c, eptr);
switch(ctype)
{
- case OP_ANY: /* This is the DOTALL case */
+ case OP_ANY: /* This is the non-NL case */
case OP_ALLANY:
case OP_ANYBYTE:
break;
@@ -3609,13 +3599,13 @@ for (;;)
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (fi >= max || eptr >= md->end_subject ||
- ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
+ (ctype == OP_ANY && IS_NEWLINE(eptr)))
RRETURN(MATCH_NOMATCH);
c = *eptr++;
switch(ctype)
{
- case OP_ANY: /* This is the DOTALL case */
+ case OP_ANY: /* This is the non-NL case */
case OP_ALLANY:
case OP_ANYBYTE:
break;
@@ -3870,23 +3860,11 @@ for (;;)
case OP_ANY:
if (max < INT_MAX)
{
- if ((ims & PCRE_DOTALL) == 0)
- {
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
- eptr++;
- while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
- }
- }
- else
+ for (i = min; i < max; i++)
{
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject) break;
- eptr++;
- while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
- }
+ if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
+ eptr++;
+ while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
}
}
@@ -3894,18 +3872,11 @@ for (;;)
else
{
- if ((ims & PCRE_DOTALL) == 0)
- {
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
- eptr++;
- while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
- }
- }
- else
+ for (i = min; i < max; i++)
{
- eptr = md->end_subject;
+ if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
+ eptr++;
+ while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
}
}
break;
@@ -4108,16 +4079,12 @@ for (;;)
switch(ctype)
{
case OP_ANY:
- if ((ims & PCRE_DOTALL) == 0)
+ for (i = min; i < max; i++)
{
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
- eptr++;
- }
- break;
+ if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
+ eptr++;
}
- /* For DOTALL case, fall through */
+ break;
case OP_ALLANY:
case OP_ANYBYTE:
diff --git a/pcre_study.c b/pcre_study.c
index 216c889..1041a00 100644
--- a/pcre_study.c
+++ b/pcre_study.c
@@ -348,6 +348,7 @@ do
switch(tcode[1])
{
case OP_ANY:
+ case OP_ALLANY:
return SSB_FAIL;
case OP_NOT_DIGIT:
diff --git a/testdata/testoutput10 b/testdata/testoutput10
index dbd5924..4eaaa39 100644
--- a/testdata/testoutput10
+++ b/testdata/testoutput10
@@ -21,7 +21,7 @@ Memory allocation (code space): 25
------------------------------------------------------------------
0 21 Bra
3 9 CBra 1
- 8 Any*
+ 8 AllAny*
10 X
12 6 Alt
15 ^
@@ -37,7 +37,7 @@ Memory allocation (code space): 29
0 25 Bra
3 9 Bra
6 04 Opt
- 8 Any*
+ 8 AllAny*
10 X
12 8 Alt
15 04 Opt
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index fe32975..4d6d688 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -1126,7 +1126,7 @@ Need char = 'X'
/.*X/IDZs
------------------------------------------------------------------
Bra
- Any*
+ AllAny*
X
Ket
End
@@ -1160,7 +1160,7 @@ No need char
------------------------------------------------------------------
Bra
CBra 1
- Any*
+ AllAny*
X
Alt
^
@@ -1179,7 +1179,7 @@ No need char
------------------------------------------------------------------
Bra
CBra 1
- Any*
+ AllAny*
X
Alt
^
@@ -1199,7 +1199,7 @@ No need char
Bra
Bra
04 Opt
- Any*
+ AllAny*
X
Alt
04 Opt
@@ -1212,8 +1212,8 @@ No need char
------------------------------------------------------------------
Capturing subpattern count = 0
Partial matching not supported
-No options
-First char at start or follows newline
+Options: anchored
+No first char
No need char
/\Biss\B/I+