summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>2011-11-30 18:10:27 +0000
committerph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>2011-11-30 18:10:27 +0000
commit130cbf55fe68014f8e7226d31d7ed9827bd1a226 (patch)
tree68fd1776aafffe8e58f6e2c5b81bcd5316d7df4a
parent66dd8c7559f678189f73cc8fd4293a47c2f2a752 (diff)
downloadpcre-130cbf55fe68014f8e7226d31d7ed9827bd1a226.tar.gz
Expand compile workspace for very many forward references. This ups the limit
by a factor of 100. git-svn-id: svn://vcs.exim.org/pcre/code/trunk@773 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r--ChangeLog11
-rw-r--r--doc/pcrelimits.37
-rw-r--r--pcre_compile.c150
-rw-r--r--pcre_internal.h1
-rw-r--r--pcretest.c13
-rw-r--r--testdata/testoutput21
6 files changed, 141 insertions, 42 deletions
diff --git a/ChangeLog b/ChangeLog
index 8a1528b..3fb9b9c 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -56,8 +56,13 @@ Version 8.21
also gives an error.
15. If a forward reference was repeated with an upper limit of around 2000,
- it caused the error "internal error: overran compiling workspace". This
- is now checked, and causes "too many forward references" instead.
+ it caused the error "internal error: overran compiling workspace". The
+ maximum number of forward references (including repeats) was limited by the
+ internal workspace, and dependent on the LINK_SIZE. The code has been
+ rewritten so that the workspace expands (via pcre_malloc) if necessary, and
+ the default depends on LINK_SIZE. There is a new upper limit (for safety)
+ of around 200,000 forward references. While doing this, I also speeded up
+ the filling in of repeated forward references.
16. A repeated forward reference in a pattern such as (a)(?2){2}(.) was
incorrectly expecting the subject to contain another "a" after the start.
@@ -78,7 +83,7 @@ Version 8.21
Perl-compatible have been moved into the Perl-compatible test files. The
refactoring has had the pleasing side effect of removing one argument from
the match() function, thus reducing its stack requirements.
-
+
Version 8.20 21-Oct-2011
------------------------
diff --git a/doc/pcrelimits.3 b/doc/pcrelimits.3
index 1fceec9..028cf55 100644
--- a/doc/pcrelimits.3
+++ b/doc/pcrelimits.3
@@ -23,6 +23,11 @@ All values in repeating quantifiers must be less than 65536.
There is no limit to the number of parenthesized subpatterns, but there can be
no more than 65535 capturing subpatterns.
.P
+There is a limit to the number of forward references to subsequent subpatterns
+of around 200,000. Repeated forward references with fixed upper limits, for
+example, (?2){0,100} when subpattern number 2 is to the right, are included in
+the count. There is no limit to the number of backward references.
+.P
The maximum length of name for a named subpattern is 32 characters, and the
maximum number of named subpatterns is 10000.
.P
@@ -52,6 +57,6 @@ Cambridge CB2 3QH, England.
.rs
.sp
.nf
-Last updated: 24 August 2011
+Last updated: 30 November 2011
Copyright (c) 1997-2011 University of Cambridge.
.fi
diff --git a/pcre_compile.c b/pcre_compile.c
index e7033cb..6641121 100644
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -88,14 +88,21 @@ so this number is very generous.
The same workspace is used during the second, actual compile phase for
remembering forward references to groups so that they can be filled in at the
end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
-is 4 there is plenty of room. */
+is 4 there is plenty of room for most patterns. However, the memory can get
+filled up by repetitions of forward references, for example patterns like
+/(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
+that the workspace is expanded using malloc() in this situation. The value
+below is therefore a minimum, and we put a maximum on it for safety. The
+minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
+kicks in at the same number of forward references in all cases. */
-#define COMPILE_WORK_SIZE (4096)
+#define COMPILE_WORK_SIZE (2048*LINK_SIZE)
+#define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)
/* The overrun tests check for a slightly smaller size so that they detect the
overrun before it actually does run off the end of the data block. */
-#define WORK_SIZE_CHECK (COMPILE_WORK_SIZE - 100)
+#define WORK_SIZE_SAFETY_MARGIN (100)
/* Table for handling escaped characters in the range '0'-'z'. Positive returns
@@ -582,6 +589,44 @@ return s;
/*************************************************
+* Expand the workspace *
+*************************************************/
+
+/* This function is called during the second compiling phase, if the number of
+forward references fills the existing workspace, which is originally a block on
+the stack. A larger block is obtained from malloc() unless the ultimate limit
+has been reached or the increase will be rather small.
+
+Argument: pointer to the compile data block
+Returns: 0 if all went well, else an error number
+*/
+
+static int
+expand_workspace(compile_data *cd)
+{
+uschar *newspace;
+int newsize = cd->workspace_size * 2;
+
+if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
+if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
+ newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
+ return ERR72;
+
+newspace = (pcre_malloc)(newsize);
+if (newspace == NULL) return ERR21;
+
+memcpy(newspace, cd->start_workspace, cd->workspace_size);
+cd->hwm = (uschar *)newspace + (cd->hwm - cd->start_workspace);
+if (cd->workspace_size > COMPILE_WORK_SIZE)
+ (pcre_free)((void *)cd->start_workspace);
+cd->start_workspace = newspace;
+cd->workspace_size = newsize;
+return 0;
+}
+
+
+
+/*************************************************
* Check for counted repeat *
*************************************************/
@@ -3332,7 +3377,8 @@ for (;; ptr++)
#ifdef PCRE_DEBUG
if (code > cd->hwm) cd->hwm = code; /* High water info */
#endif
- if (code > cd->start_workspace + WORK_SIZE_CHECK) /* Check for overrun */
+ if (code > cd->start_workspace + cd->workspace_size -
+ WORK_SIZE_SAFETY_MARGIN) /* Check for overrun */
{
*errorcodeptr = ERR52;
goto FAILED;
@@ -3382,7 +3428,8 @@ for (;; ptr++)
/* In the real compile phase, just check the workspace used by the forward
reference list. */
- else if (cd->hwm > cd->start_workspace + WORK_SIZE_CHECK)
+ else if (cd->hwm > cd->start_workspace + cd->workspace_size -
+ WORK_SIZE_SAFETY_MARGIN)
{
*errorcodeptr = ERR52;
goto FAILED;
@@ -4885,23 +4932,33 @@ for (;; ptr++)
}
/* This is compiling for real. If there is a set first byte for
- the group, and we have not yet set a "required byte", set it. */
+ the group, and we have not yet set a "required byte", set it. Make
+ sure there is enough workspace for copying forward references before
+ doing the copy. */
else
{
if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
+
for (i = 1; i < repeat_min; i++)
{
uschar *hc;
uschar *this_hwm = cd->hwm;
memcpy(code, previous, len);
+
+ while (cd->hwm > cd->start_workspace + cd->workspace_size -
+ WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
+ {
+ int save_offset = save_hwm - cd->start_workspace;
+ int this_offset = this_hwm - cd->start_workspace;
+ *errorcodeptr = expand_workspace(cd);
+ if (*errorcodeptr != 0) goto FAILED;
+ save_hwm = (uschar *)cd->start_workspace + save_offset;
+ this_hwm = (uschar *)cd->start_workspace + this_offset;
+ }
+
for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
{
- if (cd->hwm >= cd->start_workspace + WORK_SIZE_CHECK)
- {
- *errorcodeptr = ERR72;
- goto FAILED;
- }
PUT(cd->hwm, 0, GET(hc, 0) + len);
cd->hwm += LINK_SIZE;
}
@@ -4967,13 +5024,23 @@ for (;; ptr++)
}
memcpy(code, previous, len);
+
+ /* Ensure there is enough workspace for forward references before
+ copying them. */
+
+ while (cd->hwm > cd->start_workspace + cd->workspace_size -
+ WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
+ {
+ int save_offset = save_hwm - cd->start_workspace;
+ int this_offset = this_hwm - cd->start_workspace;
+ *errorcodeptr = expand_workspace(cd);
+ if (*errorcodeptr != 0) goto FAILED;
+ save_hwm = (uschar *)cd->start_workspace + save_offset;
+ this_hwm = (uschar *)cd->start_workspace + this_offset;
+ }
+
for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
{
- if (cd->hwm >= cd->start_workspace + WORK_SIZE_CHECK)
- {
- *errorcodeptr = ERR72;
- goto FAILED;
- }
PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
cd->hwm += LINK_SIZE;
}
@@ -5991,10 +6058,11 @@ for (;; ptr++)
of the group. Then remember the forward reference. */
called = cd->start_code + recno;
- if (cd->hwm >= cd->start_workspace + WORK_SIZE_CHECK)
+ if (cd->hwm >= cd->start_workspace + cd->workspace_size -
+ WORK_SIZE_SAFETY_MARGIN)
{
- *errorcodeptr = ERR72;
- goto FAILED;
+ *errorcodeptr = expand_workspace(cd);
+ if (*errorcodeptr != 0) goto FAILED;
}
PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));
}
@@ -7246,7 +7314,8 @@ compile_data *cd = &compile_block;
computing the amount of memory that is needed. Compiled items are thrown away
as soon as possible, so that a fairly large buffer should be sufficient for
this purpose. The same space is used in the second phase for remembering where
-to fill in forward references to subpatterns. */
+to fill in forward references to subpatterns. That may overflow, in which case
+new memory is obtained from malloc(). */
uschar cworkspace[COMPILE_WORK_SIZE];
@@ -7436,9 +7505,10 @@ cd->bracount = cd->final_bracount = 0;
cd->names_found = 0;
cd->name_entry_size = 0;
cd->name_table = NULL;
-cd->start_workspace = cworkspace;
cd->start_code = cworkspace;
cd->hwm = cworkspace;
+cd->start_workspace = cworkspace;
+cd->workspace_size = COMPILE_WORK_SIZE;
cd->start_pattern = (const uschar *)pattern;
cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
cd->req_varyopt = 0;
@@ -7516,7 +7586,7 @@ cd->names_found = 0;
cd->name_table = (uschar *)re + re->name_table_offset;
codestart = cd->name_table + re->name_entry_size * re->name_count;
cd->start_code = codestart;
-cd->hwm = cworkspace;
+cd->hwm = (uschar *)(cd->start_workspace);
cd->req_varyopt = 0;
cd->had_accept = FALSE;
cd->check_lookbehind = FALSE;
@@ -7550,19 +7620,33 @@ if debugging, leave the test till after things are printed out. */
if (code - codestart > length) errorcode = ERR23;
#endif
-/* Fill in any forward references that are required. */
+/* Fill in any forward references that are required. There may be repeated
+references; optimize for them, as searching a large regex takes time. */
-while (errorcode == 0 && cd->hwm > cworkspace)
+if (cd->hwm > cd->start_workspace)
{
- int offset, recno;
- const uschar *groupptr;
- cd->hwm -= LINK_SIZE;
- offset = GET(cd->hwm, 0);
- recno = GET(codestart, offset);
- groupptr = _pcre_find_bracket(codestart, utf8, recno);
- if (groupptr == NULL) errorcode = ERR53;
- else PUT(((uschar *)codestart), offset, (int)(groupptr - codestart));
- }
+ int prev_recno = -1;
+ const uschar *groupptr = NULL;
+ while (errorcode == 0 && cd->hwm > cd->start_workspace)
+ {
+ int offset, recno;
+ cd->hwm -= LINK_SIZE;
+ offset = GET(cd->hwm, 0);
+ recno = GET(codestart, offset);
+ if (recno != prev_recno)
+ {
+ groupptr = _pcre_find_bracket(codestart, utf8, recno);
+ prev_recno = recno;
+ }
+ if (groupptr == NULL) errorcode = ERR53;
+ else PUT(((uschar *)codestart), offset, (int)(groupptr - codestart));
+ }
+ }
+
+/* If the workspace had to be expanded, free the new memory. */
+
+if (cd->workspace_size > COMPILE_WORK_SIZE)
+ (pcre_free)((void *)cd->start_workspace);
/* Give an error if there's back reference to a non-existent capturing
subpattern. */
diff --git a/pcre_internal.h b/pcre_internal.h
index f3758ed..5e0c67c 100644
--- a/pcre_internal.h
+++ b/pcre_internal.h
@@ -1741,6 +1741,7 @@ typedef struct compile_data {
uschar *name_table; /* The name/number table */
int names_found; /* Number of entries so far */
int name_entry_size; /* Size of each entry */
+ int workspace_size; /* Size of workspace */
int bracount; /* Count of capturing parens as we compile */
int final_bracount; /* Saved value after first pass */
int top_backref; /* Maximum back reference */
diff --git a/pcretest.c b/pcretest.c
index 9282e80..b6ab8e1 100644
--- a/pcretest.c
+++ b/pcretest.c
@@ -191,6 +191,7 @@ static int locale_set = 0;
static int show_malloc;
static int use_utf8;
static size_t gotten_store;
+static size_t first_gotten_store = 0;
static const unsigned char *last_callout_mark = NULL;
/* The buffers grow automatically if very long input lines are encountered. */
@@ -999,12 +1000,14 @@ return (cb->callout_number != callout_fail_id)? 0 :
*************************************************/
/* Alternative malloc function, to test functionality and save the size of a
-compiled re. The show_malloc variable is set only during matching. */
+compiled re, which is the first store request that pcre_compile() makes. The
+show_malloc variable is set only during matching. */
static void *new_malloc(size_t size)
{
void *block = malloc(size);
gotten_store = size;
+if (first_gotten_store == 0) first_gotten_store = size;
if (show_malloc)
fprintf(outfile, "malloc %3d %p\n", (int)size, block);
return block;
@@ -1520,7 +1523,7 @@ while (!done)
(sbuf[4] << 24) | (sbuf[5] << 16) | (sbuf[6] << 8) | sbuf[7];
re = (real_pcre *)new_malloc(true_size);
- regex_gotten_store = gotten_store;
+ regex_gotten_store = first_gotten_store;
if (fread(re, 1, true_size, f) != true_size) goto FAIL_READ;
@@ -1777,6 +1780,7 @@ while (!done)
if ((options & PCRE_UCP) != 0) cflags |= REG_UCP;
if ((options & PCRE_UNGREEDY) != 0) cflags |= REG_UNGREEDY;
+ first_gotten_store = 0;
rc = regcomp(&preg, (char *)p, cflags);
/* Compilation failed; go back for another re, skipping to blank line
@@ -1814,6 +1818,7 @@ while (!done)
(double)CLOCKS_PER_SEC);
}
+ first_gotten_store = 0;
re = pcre_compile((char *)p, options, &error, &erroroffset, tables);
/* Compilation failed; go back for another re, skipping to blank line
@@ -1854,7 +1859,7 @@ while (!done)
if (log_store)
fprintf(outfile, "Memory allocation (code space): %d\n",
- (int)(gotten_store -
+ (int)(first_gotten_store -
sizeof(real_pcre) -
((real_pcre *)re)->name_count * ((real_pcre *)re)->name_entry_size));
@@ -1862,7 +1867,7 @@ while (!done)
and remember the store that was got. */
true_size = ((real_pcre *)re)->size;
- regex_gotten_store = gotten_store;
+ regex_gotten_store = first_gotten_store;
/* If -s or /S was present, study the regex to generate additional info to
help with the matching, unless the pattern has the SS option, which
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index 30a6e8f..d279fb4 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -12318,7 +12318,6 @@ Failed: \N is not supported in a class at offset 3
Failed: \N is not supported in a class at offset 5
/(a)(?2){0,1999}?(b)/
-Failed: too many forward references at offset 15
/(a)(?(DEFINE)(b))(?2){0,1999}?(?2)/