summaryrefslogtreecommitdiff
path: root/src/pcre2_study.c
diff options
context:
space:
mode:
authorph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>2017-02-10 16:33:15 +0000
committerph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>2017-02-10 16:33:15 +0000
commite7d14672b4b35f2d091da344471cd5cf27cb9a62 (patch)
tree0077e0502712781b7dbc6c7db75f00ba344e8f9b /src/pcre2_study.c
parent5c535d8b83e776056f4fc43e1eee81262df249de (diff)
downloadpcre2-e7d14672b4b35f2d091da344471cd5cf27cb9a62.tar.gz
Cache group minima to speed up studying of pathological patterns. Fixes
oss-fuzz #557. git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@660 6239d852-aaf2-0410-a92c-79f79f948069
Diffstat (limited to 'src/pcre2_study.c')
-rw-r--r--src/pcre2_study.c141
1 files changed, 94 insertions, 47 deletions
diff --git a/src/pcre2_study.c b/src/pcre2_study.c
index 4c08bc5..89715ab 100644
--- a/src/pcre2_study.c
+++ b/src/pcre2_study.c
@@ -50,6 +50,10 @@ collecting data (e.g. minimum matching length). */
#include "pcre2_internal.h"
+/* The maximum remembered capturing brackets minimum. */
+
+#define MAX_CACHE_BACKREF 128
+
/* Set a bit in the starting code unit bit map. */
#define SET_BIT(c) re->start_bitmap[(c)/8] |= (1 << ((c)&7))
@@ -71,6 +75,12 @@ length is 16-bits long (on the grounds that anything longer than that is
pathological), so we give up when we reach that amount. This also means that
integer overflow for really crazy patterns cannot happen.
+Backreference minimum lengths are cached to speed up multiple references. This
+function is called only when the highest back reference in the pattern is less
+than or equal to MAX_CACHE_BACKREF, which is one less than the size of the
+caching vector. The zeroth element contains the number of the highest set
+value.
+
Arguments:
re compiled pattern block
code pointer to start of group (the bracket)
@@ -78,6 +88,7 @@ Arguments:
utf UTF flag
recurses chain of recurse_check to catch mutual recursion
countptr pointer to call count (to catch over complexity)
+ backref_cache vector for caching back references.
Returns: the minimum length
-1 \C in UTF-8 mode
@@ -90,7 +101,8 @@ Returns: the minimum length
static int
find_minlength(const pcre2_real_code *re, PCRE2_SPTR code,
- PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr)
+ PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr,
+ int *backref_cache)
{
int length = -1;
int prev_cap_recno = -1;
@@ -166,7 +178,8 @@ for (;;)
case OP_BRAPOS:
case OP_SBRAPOS:
PROCESS_NON_CAPTURE:
- d = find_minlength(re, cc, startcode, utf, recurses, countptr);
+ d = find_minlength(re, cc, startcode, utf, recurses, countptr,
+ backref_cache);
if (d < 0) return d;
branchlength += d;
do cc += GET(cc, 1); while (*cc == OP_ALT);
@@ -186,7 +199,8 @@ for (;;)
if (recno != prev_cap_recno)
{
prev_cap_recno = recno;
- prev_cap_d = find_minlength(re, cc, startcode, utf, recurses, countptr);
+ prev_cap_d = find_minlength(re, cc, startcode, utf, recurses, countptr,
+ backref_cache);
if (prev_cap_d < 0) return prev_cap_d;
}
branchlength += prev_cap_d;
@@ -456,39 +470,52 @@ for (;;)
d = INT_MAX;
- /* Scan all groups with the same name */
+ /* Scan all groups with the same name; find the shortest. */
while (count-- > 0)
{
- ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, GET2(slot, 0));
- if (cs == NULL) return -2;
- do ce += GET(ce, 1); while (*ce == OP_ALT);
- if (cc > cs && cc < ce) /* Simple recursion */
- {
- d = 0;
- had_recurse = TRUE;
- break;
- }
+ int dd, i;
+ recno = GET2(slot, 0);
+
+ if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
+ dd = backref_cache[recno];
else
{
- recurse_check *r = recurses;
- for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
- if (r != NULL) /* Mutual recursion */
+ ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno);
+ if (cs == NULL) return -2;
+ do ce += GET(ce, 1); while (*ce == OP_ALT);
+ if (cc > cs && cc < ce) /* Simple recursion */
{
- d = 0;
+ dd = 0;
had_recurse = TRUE;
- break;
}
else
{
- int dd;
- this_recurse.prev = recurses;
- this_recurse.group = cs;
- dd = find_minlength(re, cs, startcode, utf, &this_recurse, countptr);
- if (dd < 0) return dd;
- if (dd < d) d = dd;
+ recurse_check *r = recurses;
+ for (r = recurses; r != NULL; r = r->prev)
+ if (r->group == cs) break;
+ if (r != NULL) /* Mutual recursion */
+ {
+ dd = 0;
+ had_recurse = TRUE;
+ }
+ else
+ {
+ this_recurse.prev = recurses;
+ this_recurse.group = cs;
+ dd = find_minlength(re, cs, startcode, utf, &this_recurse,
+ countptr, backref_cache);
+ if (dd < 0) return dd;
+ }
}
+
+ backref_cache[recno] = dd;
+ for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
+ backref_cache[0] = recno;
}
+
+ if (dd < d) d = dd;
+ if (d <= 0) break; /* No point looking at any more */
slot += re->name_entry_size;
}
}
@@ -502,35 +529,48 @@ for (;;)
case OP_REF:
case OP_REFI:
if (dupcapused) return -1;
- if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
+ recno = GET2(cc, 1);
+ if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
+ d = backref_cache[recno];
+ else
{
- ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
- if (cs == NULL) return -2;
- do ce += GET(ce, 1); while (*ce == OP_ALT);
- if (cc > cs && cc < ce) /* Simple recursion */
+ int i;
+ if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
{
- d = 0;
- had_recurse = TRUE;
- }
- else
- {
- recurse_check *r = recurses;
- for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
- if (r != NULL) /* Mutual recursion */
+ ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno);
+ if (cs == NULL) return -2;
+ do ce += GET(ce, 1); while (*ce == OP_ALT);
+ if (cc > cs && cc < ce) /* Simple recursion */
{
d = 0;
had_recurse = TRUE;
}
else
{
- this_recurse.prev = recurses;
- this_recurse.group = cs;
- d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr);
- if (d < 0) return d;
+ recurse_check *r = recurses;
+ for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
+ if (r != NULL) /* Mutual recursion */
+ {
+ d = 0;
+ had_recurse = TRUE;
+ }
+ else
+ {
+ this_recurse.prev = recurses;
+ this_recurse.group = cs;
+ d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr,
+ backref_cache);
+ if (d < 0) return d;
+ }
}
}
+ else d = 0;
+
+ backref_cache[recno] = d;
+ for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
+ backref_cache[0] = recno;
}
- else d = 0;
+
cc += 1 + IMM2_SIZE;
/* Handle repeated back references */
@@ -603,7 +643,7 @@ for (;;)
this_recurse.prev = recurses;
this_recurse.group = cs;
prev_recurse_d = find_minlength(re, cs, startcode, utf, &this_recurse,
- countptr);
+ countptr, backref_cache);
if (prev_recurse_d < 0) return prev_recurse_d;
prev_recurse_recno = recno;
branchlength += prev_recurse_d;
@@ -1548,12 +1588,19 @@ if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
if (rc == SSB_DONE) re->flags |= PCRE2_FIRSTMAPSET;
}
-/* Find the minimum length of subject string. If it can match an empty string,
-the minimum length is already known. */
+/* Find the minimum length of subject string. If the pattern can match an empty
+string, the minimum length is already known. If there are more back references
+than the size of the vector we are going to cache them in, do nothing. A
+pattern that complicated will probably take a long time to analyze and may in
+any case turn out to be too complicated. Note that back reference minima are
+held as 16-bit numbers. */
-if ((re->flags & PCRE2_MATCH_EMPTY) == 0)
+if ((re->flags & PCRE2_MATCH_EMPTY) == 0 &&
+ re->top_backref <= MAX_CACHE_BACKREF)
{
- min = find_minlength(re, code, code, utf, NULL, &count);
+ int backref_cache[MAX_CACHE_BACKREF+1];
+ backref_cache[0] = 0; /* Highest one that is set */
+ min = find_minlength(re, code, code, utf, NULL, &count, backref_cache);
switch(min)
{
case -1: /* \C in UTF mode or (*ACCEPT) or over-complex regex */