summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>2015-11-11 09:42:26 +0000
committerph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>2015-11-11 09:42:26 +0000
commit75b4cc53f55fe53bd792736a232feab89e2632b2 (patch)
treef90f02018b6cbf0c8a01897e2d3812faf10892f4
parent3f268483a9dd0dc16814f0acbf745d6e7d5a3ca2 (diff)
downloadpcre2-75b4cc53f55fe53bd792736a232feab89e2632b2.tar.gz
Small optimizations in pcre2_study.c
git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@424 6239d852-aaf2-0410-a92c-79f79f948069
-rw-r--r--ChangeLog2
-rw-r--r--doc/pcre2api.311
-rw-r--r--src/pcre2_study.c53
-rw-r--r--testdata/testoutput156
-rw-r--r--testdata/testoutput176
-rw-r--r--testdata/testoutput22
6 files changed, 46 insertions, 34 deletions
diff --git a/ChangeLog b/ChangeLog
index c7b2a96..5ad577b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -282,6 +282,8 @@ a factor of the size of the compiling workspace (it currently is).
81. Check for integer overflow in minimum length calculation and cap it at
65535.
+82. Small optimizations in code for finding the minimum matching length.
+
Version 10.20 30-June-2015
--------------------------
diff --git a/doc/pcre2api.3 b/doc/pcre2api.3
index 4c15471..9d71c93 100644
--- a/doc/pcre2api.3
+++ b/doc/pcre2api.3
@@ -1,4 +1,4 @@
-.TH PCRE2API 3 "05 November 2015" "PCRE2 10.21"
+.TH PCRE2API 3 "10 November 2015" "PCRE2 10.21"
.SH NAME
PCRE2 - Perl-compatible regular expressions (revised API)
.sp
@@ -1684,8 +1684,11 @@ value, 0 is returned.
.sp
PCRE2_INFO_MATCHEMPTY
.sp
-Return 1 if the pattern can match an empty string, otherwise 0. The third
-argument should point to an \fBuint32_t\fP variable.
+Return 1 if the pattern might match an empty string, otherwise 0. The third
+argument should point to an \fBuint32_t\fP variable. When a pattern contains
+recursive subroutine calls it is not always possible to determine whether or
+not it can match an empty string. PCRE2 takes a cautious approach and returns 1
+in such cases.
.sp
PCRE2_INFO_MATCHLIMIT
.sp
@@ -3084,6 +3087,6 @@ Cambridge, England.
.rs
.sp
.nf
-Last updated: 05 November 2015
+Last updated: 10 November 2015
Copyright (c) 1997-2015 University of Cambridge.
.fi
diff --git a/src/pcre2_study.c b/src/pcre2_study.c
index de5ea92..3c61012 100644
--- a/src/pcre2_study.c
+++ b/src/pcre2_study.c
@@ -104,18 +104,21 @@ recurse_check this_recurse;
register int branchlength = 0;
register PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE;
-/* A large and/or complex regex can take too long to process. */
+/* If this is a "could be empty" group, its minimum length is 0. */
-if ((*countptr)++ > 1000) return -1;
+if (*code >= OP_SBRA && *code <= OP_SCOND) return 0;
/* Skip over capturing bracket number */
-if (*code == OP_CBRA || *code == OP_SCBRA ||
- *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE;
+if (*code == OP_CBRA || *code == OP_CBRAPOS) cc += IMM2_SIZE;
-/* Scan along the opcodes for this branch. If we get to the end of the
-branch, check the length against that of the other branches. If the accumulated
-length passes 16-bits, stop and return it. */
+/* A large and/or complex regex can take too long to process. */
+
+if ((*countptr)++ > 1000) return -1;
+
+/* Scan along the opcodes for this branch. If we get to the end of the branch,
+check the length against that of the other branches. If the accumulated length
+passes 16-bits, stop. */
for (;;)
{
@@ -1543,24 +1546,28 @@ if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
if (rc == SSB_DONE) re->flags |= PCRE2_FIRSTMAPSET;
}
-/* Find the minimum length of subject string. */
+/* Find the minimum length of subject string. If it can match an empty string,
+the minimum length is already known. */
-switch(min = find_minlength(re, code, code, utf, NULL, &count))
+if ((re->flags & PCRE2_MATCH_EMPTY) == 0)
{
- case -1: /* \C in UTF mode or (*ACCEPT) or over-complex regex */
- break; /* Leave minlength unchanged (will be zero) */
-
- case -2:
- return 2; /* missing capturing bracket */
-
- case -3:
- return 3; /* unrecognized opcode */
-
- default:
- if (min > UINT16_MAX) min = UINT16_MAX;
- re->minlength = min;
- break;
- }
+ switch(min = find_minlength(re, code, code, utf, NULL, &count))
+ {
+ case -1: /* \C in UTF mode or (*ACCEPT) or over-complex regex */
+ break; /* Leave minlength unchanged (will be zero) */
+
+ case -2:
+ return 2; /* missing capturing bracket */
+
+ case -3:
+ return 3; /* unrecognized opcode */
+
+ default:
+ if (min > UINT16_MAX) min = UINT16_MAX;
+ re->minlength = min;
+ break;
+ }
+ }
return 0;
}
diff --git a/testdata/testoutput15 b/testdata/testoutput15
index 0dad75a..bb29a49 100644
--- a/testdata/testoutput15
+++ b/testdata/testoutput15
@@ -252,7 +252,7 @@ Failed: error -52: nested recursion at the same subject position
/(a|(?R))/I
Capturing subpattern count = 1
May match empty string
-Subject length lower bound = 1
+Subject length lower bound = 0
abcd
0: a
1: a
@@ -262,7 +262,7 @@ Failed: error -52: nested recursion at the same subject position
/(ab|(bc|(de|(?R))))/I
Capturing subpattern count = 3
May match empty string
-Subject length lower bound = 2
+Subject length lower bound = 0
abcd
0: ab
1: ab
@@ -272,7 +272,7 @@ Failed: error -52: nested recursion at the same subject position
/(ab|(bc|(de|(?1))))/I
Capturing subpattern count = 3
May match empty string
-Subject length lower bound = 2
+Subject length lower bound = 0
abcd
0: ab
1: ab
diff --git a/testdata/testoutput17 b/testdata/testoutput17
index 53c55b8..9227ee6 100644
--- a/testdata/testoutput17
+++ b/testdata/testoutput17
@@ -416,7 +416,7 @@ Failed: error -46: JIT stack limit reached
/(a|(?R))/I
Capturing subpattern count = 1
May match empty string
-Subject length lower bound = 1
+Subject length lower bound = 0
JIT compilation was successful
abcd
0: a (JIT)
@@ -427,7 +427,7 @@ Failed: error -46: JIT stack limit reached
/(ab|(bc|(de|(?R))))/I
Capturing subpattern count = 3
May match empty string
-Subject length lower bound = 2
+Subject length lower bound = 0
JIT compilation was successful
abcd
0: ab (JIT)
@@ -438,7 +438,7 @@ Failed: error -46: JIT stack limit reached
/(ab|(bc|(de|(?1))))/I
Capturing subpattern count = 3
May match empty string
-Subject length lower bound = 2
+Subject length lower bound = 0
JIT compilation was successful
abcd
0: ab (JIT)
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index c149674..2807628 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -3960,7 +3960,7 @@ Subject length lower bound = 3
Capturing subpattern count = 2
Compile options: <none>
Overall options: anchored
-Subject length lower bound = 3
+Subject length lower bound = 2
a=a
0: a=a
1: a