summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>2019-06-17 16:26:44 +0000
committerph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>2019-06-17 16:26:44 +0000
commit42a19af33b46e32374f99578a2e4eb454aece59c (patch)
treeddd53c1e23d06e7c1b095dda77850d64b2e183a0
parent508eef833c4a634d331e0d54e505fd18a300de6a (diff)
downloadpcre2-42a19af33b46e32374f99578a2e4eb454aece59c.tar.gz
Another extension to minimum length calculation.
git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@1108 6239d852-aaf2-0410-a92c-79f79f948069
-rw-r--r--ChangeLog4
-rw-r--r--src/pcre2_study.c92
-rw-r--r--testdata/testinput102
-rw-r--r--testdata/testinput122
-rw-r--r--testdata/testinput216
-rw-r--r--testdata/testoutput109
-rw-r--r--testdata/testoutput12-168
-rw-r--r--testdata/testoutput12-328
-rw-r--r--testdata/testoutput263
9 files changed, 163 insertions, 41 deletions
diff --git a/ChangeLog b/ChangeLog
index b3debf4..52f22f7 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -42,6 +42,10 @@ minimum is potentially useful.
such references as not adding to the minimum length (which it should have
done all along).
+ * Furthermore, the above action now happens only if the back reference is to
+ a group that exists more than once in a pattern instead of any back
+ reference in a pattern with duplicate numbers.
+
10. A (*MARK) value inside a successful condition was not being returned by the
interpretive matcher (it was returned by JIT). This bug has been mended.
diff --git a/src/pcre2_study.c b/src/pcre2_study.c
index 496c8dc..a6790be 100644
--- a/src/pcre2_study.c
+++ b/src/pcre2_study.c
@@ -134,7 +134,7 @@ for (;;)
int d, min, recno;
PCRE2_UCHAR *cs, *ce;
PCRE2_UCHAR op = *cc;
-
+
if (branchlength >= UINT16_MAX) return UINT16_MAX;
switch (op)
@@ -448,11 +448,13 @@ for (;;)
If PCRE2_MATCH_UNSET_BACKREF is set, a backreference to an unset bracket
matches an empty string (by default it causes a matching failure), so in
- that case we must set the minimum length to zero. */
+ that case we must set the minimum length to zero.
- /* Duplicate named pattern back reference. We cannot reliably find a length
- for this if duplicate numbers are present in the pattern, so we set the
- length to zero here also. */
+ For backreferenes, if duplicate numbers are present in the pattern we check
+ for a reference to a duplicate. If it is, we don't know which version will
+ be referenced, so we have to set the minimum length to zero. */
+
+ /* Duplicate named pattern back reference. */
case OP_DNREF:
case OP_DNREFI:
@@ -479,30 +481,34 @@ for (;;)
ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno);
if (cs == NULL) return -2;
do ce += GET(ce, 1); while (*ce == OP_ALT);
- if (cc > cs && cc < ce) /* Simple recursion */
+
+ dd = 0;
+ if (!dupcapused ||
+ (PCRE2_UCHAR *)PRIV(find_bracket)(ce, utf, recno) == NULL)
{
- dd = 0;
- had_recurse = TRUE;
- }
- else
- {
- recurse_check *r = recurses;
- for (r = recurses; r != NULL; r = r->prev)
- if (r->group == cs) break;
- if (r != NULL) /* Mutual recursion */
+ if (cc > cs && cc < ce) /* Simple recursion */
{
- dd = 0;
had_recurse = TRUE;
}
else
{
- this_recurse.prev = recurses;
- this_recurse.group = cs;
- dd = find_minlength(re, cs, startcode, utf, &this_recurse,
- countptr, backref_cache);
- if (dd < 0) return dd;
+ recurse_check *r = recurses;
+ for (r = recurses; r != NULL; r = r->prev)
+ if (r->group == cs) break;
+ if (r != NULL) /* Mutual recursion */
+ {
+ had_recurse = TRUE;
+ }
+ else
+ {
+ this_recurse.prev = recurses; /* No recursion */
+ this_recurse.group = cs;
+ dd = find_minlength(re, cs, startcode, utf, &this_recurse,
+ countptr, backref_cache);
+ if (dd < 0) return dd;
+ }
}
- }
+ }
backref_cache[recno] = dd;
for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
@@ -518,8 +524,8 @@ for (;;)
cc += 1 + 2*IMM2_SIZE;
goto REPEAT_BACK_REFERENCE;
- /* Single back reference. We cannot find a length for this if duplicate
- numbers are present in the pattern. */
+ /* Single back reference by number. References by name are converted to by
+ number when there is no duplication. */
case OP_REF:
case OP_REFI:
@@ -529,36 +535,40 @@ for (;;)
else
{
int i;
- if (!dupcapused && (re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
+ d = 0;
+
+ if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
{
ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno);
if (cs == NULL) return -2;
do ce += GET(ce, 1); while (*ce == OP_ALT);
- if (cc > cs && cc < ce) /* Simple recursion */
- {
- d = 0;
- had_recurse = TRUE;
- }
- else
+
+ if (!dupcapused ||
+ (PCRE2_UCHAR *)PRIV(find_bracket)(ce, utf, recno) == NULL)
{
- recurse_check *r = recurses;
- for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
- if (r != NULL) /* Mutual recursion */
+ if (cc > cs && cc < ce) /* Simple recursion */
{
- d = 0;
had_recurse = TRUE;
}
else
{
- this_recurse.prev = recurses;
- this_recurse.group = cs;
- d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr,
- backref_cache);
- if (d < 0) return d;
+ recurse_check *r = recurses;
+ for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
+ if (r != NULL) /* Mutual recursion */
+ {
+ had_recurse = TRUE;
+ }
+ else /* No recursion */
+ {
+ this_recurse.prev = recurses;
+ this_recurse.group = cs;
+ d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr,
+ backref_cache);
+ if (d < 0) return d;
+ }
}
}
}
- else d = 0;
backref_cache[recno] = d;
for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
diff --git a/testdata/testinput10 b/testdata/testinput10
index 2029c44..5e8458f 100644
--- a/testdata/testinput10
+++ b/testdata/testinput10
@@ -557,4 +557,6 @@
# -------------------------------------
+/(*UTF)(?=\x{123})/I
+
# End of testinput10
diff --git a/testdata/testinput12 b/testdata/testinput12
index b1af30a..1f68732 100644
--- a/testdata/testinput12
+++ b/testdata/testinput12
@@ -447,4 +447,6 @@
# ----------------------------------------------------
+/(*UTF)(?=\x{123})/I
+
# End of testinput12
diff --git a/testdata/testinput2 b/testdata/testinput2
index bfec80c..3c1e589 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -5607,4 +5607,20 @@ a)"xI
/(*:\Q \E){5}/alt_verbnames
+/(?=abc)/I
+
+/(?|(X)|(XY))\1abc/I
+
+/(?|(a)|(bcde))(c)\2/I
+
+/(?|(a)|(bcde))(c)\1/I
+
+/(?|(?'A'a)|(?'A'bcde))(?'B'c)\k'B'(?'A')/I,dupnames
+
+/(?|(?'A'a)|(?'A'bcde))(?'B'c)\k'A'(?'A')/I,dupnames
+
+/((a|)+)+Z/I
+
+/((?=a))[abcd]/I
+
# End of testinput2
diff --git a/testdata/testoutput10 b/testdata/testoutput10
index 0a336d8..e5b7c13 100644
--- a/testdata/testoutput10
+++ b/testdata/testoutput10
@@ -1757,4 +1757,13 @@ No match
# -------------------------------------
+/(*UTF)(?=\x{123})/I
+Capture group count = 0
+May match empty string
+Compile options: <none>
+Overall options: utf
+First code unit = \xc4
+Last code unit = \xa3
+Subject length lower bound = 1
+
# End of testinput10
diff --git a/testdata/testoutput12-16 b/testdata/testoutput12-16
index fd6eef7..9b11b62 100644
--- a/testdata/testoutput12-16
+++ b/testdata/testoutput12-16
@@ -1579,4 +1579,12 @@ No match
# ----------------------------------------------------
+/(*UTF)(?=\x{123})/I
+Capture group count = 0
+May match empty string
+Compile options: <none>
+Overall options: utf
+First code unit = \x{123}
+Subject length lower bound = 1
+
# End of testinput12
diff --git a/testdata/testoutput12-32 b/testdata/testoutput12-32
index 768fd7e..9b1b38b 100644
--- a/testdata/testoutput12-32
+++ b/testdata/testoutput12-32
@@ -1577,4 +1577,12 @@ No match
# ----------------------------------------------------
+/(*UTF)(?=\x{123})/I
+Capture group count = 0
+May match empty string
+Compile options: <none>
+Overall options: utf
+First code unit = \x{123}
+Subject length lower bound = 1
+
# End of testinput12
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index 9c1dc50..6a93525 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -16963,6 +16963,69 @@ Failed: error 109 at offset 5: quantifier does not follow a repeatable item
/(*:\Q \E){5}/alt_verbnames
Failed: error 109 at offset 11: quantifier does not follow a repeatable item
+/(?=abc)/I
+Capture group count = 0
+May match empty string
+First code unit = 'a'
+Last code unit = 'c'
+Subject length lower bound = 2
+
+/(?|(X)|(XY))\1abc/I
+Capture group count = 1
+Max back reference = 1
+First code unit = 'X'
+Last code unit = 'c'
+Subject length lower bound = 4
+
+/(?|(a)|(bcde))(c)\2/I
+Capture group count = 2
+Max back reference = 2
+Starting code units: a b
+Last code unit = 'c'
+Subject length lower bound = 3
+
+/(?|(a)|(bcde))(c)\1/I
+Capture group count = 2
+Max back reference = 1
+Starting code units: a b
+Last code unit = 'c'
+Subject length lower bound = 2
+
+/(?|(?'A'a)|(?'A'bcde))(?'B'c)\k'B'(?'A')/I,dupnames
+Capture group count = 3
+Max back reference = 2
+Named capture groups:
+ A 1
+ A 3
+ B 2
+Options: dupnames
+Starting code units: a b
+Last code unit = 'c'
+Subject length lower bound = 3
+
+/(?|(?'A'a)|(?'A'bcde))(?'B'c)\k'A'(?'A')/I,dupnames
+Capture group count = 3
+Max back reference = 3
+Named capture groups:
+ A 1
+ A 3
+ B 2
+Options: dupnames
+Starting code units: a b
+Last code unit = 'c'
+Subject length lower bound = 2
+
+/((a|)+)+Z/I
+Capture group count = 2
+Starting code units: Z a
+Last code unit = 'Z'
+Subject length lower bound = 1
+
+/((?=a))[abcd]/I
+Capture group count = 1
+First code unit = 'a'
+Subject length lower bound = 1
+
# End of testinput2
Error -70: PCRE2_ERROR_BADDATA (unknown error number)
Error -62: bad serialized data