summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authornigel <nigel@2f5784b3-3f2a-0410-8824-cb99058d5e15>2007-02-24 21:38:57 +0000
committernigel <nigel@2f5784b3-3f2a-0410-8824-cb99058d5e15>2007-02-24 21:38:57 +0000
commit685a411841b5e69517e02508446b317764cd6d70 (patch)
tree13da4537772c74f22f6d56e303c0781009c11a12
parent7703eae0f55edaff9f482fa8d23a6910d5d18577 (diff)
downloadpcre-685a411841b5e69517e02508446b317764cd6d70.tar.gz
Load pcre-2.04 into code/trunk.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@31 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r--ChangeLog20
-rw-r--r--README9
-rw-r--r--Tech.Notes8
-rw-r--r--internal.h2
-rw-r--r--pcre.c166
-rw-r--r--pcretest.c11
-rw-r--r--testinput58
-rw-r--r--testinput24
-rw-r--r--testinput39
-rw-r--r--testoutput118
-rw-r--r--testoutput210
-rw-r--r--testoutput314
-rw-r--r--testoutput42
13 files changed, 407 insertions, 24 deletions
diff --git a/ChangeLog b/ChangeLog
index a057a22..56f17fa 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -2,12 +2,30 @@ ChangeLog for PCRE
------------------
+Version 2.04 18-Feb-99
+----------------------
+
+1. For parenthesized subpatterns with repeats whose minimum was zero, the
+computation of the store needed to hold the pattern was incorrect (too large).
+If such patterns were nested a few deep, this could multiply and become a real
+problem.
+
+2. Added /M option to pcretest to show the memory requirement of a specific
+pattern. Made -m a synonym of -s (which does this globally) for compatibility.
+
+3. Subpatterns of the form (regex){n,m} (i.e. limited maximum) were being
+compiled in such a way that the backtracking after subsequent failure was
+pessimal. Something like (a){0,3} was compiled as (a)?(a)?(a)? instead of
+((a)((a)(a)?)?)? with disastrous performance if the maximum was of any size.
+
+
Version 2.03 02-Feb-99
----------------------
1. Fixed typo and small mistake in man page.
-2. Added 4th condition (GPL supersedes) and created separate LICENCE file.
+2. Added 4th condition (GPL supersedes if conflict) and created separate
+LICENCE file containing the conditions.
3. Updated pcretest so that patterns such as /abc\/def/ work like they do in
Perl, that is the internal \ allows the delimiter to be included in the
diff --git a/README b/README
index 29fc714..02803f6 100644
--- a/README
+++ b/README
@@ -216,6 +216,9 @@ internal form of compiled regular expressions to be output after compilation.
The /S option causes pcre_study() to be called after the expression has been
compiled, and the results used when the expression is matched.
+The /M option causes information about the size of memory block used to hold
+the compile pattern to be output.
+
Finally, the /P option causes pcretest to call PCRE via the POSIX wrapper API
rather than its native API. When this is done, all other options except /i and
/m are ignored. REG_ICASE is set if /i is present, and REG_NEWLINE is set if /m
@@ -291,8 +294,10 @@ If the option -i is given to pcretest, it is equivalent to adding /I to each
regular expression: information about the compiled pattern is given after
compilation.
-If the option -s is given to pcretest, it outputs the size of each compiled
-pattern after it has been compiled.
+If the option -m is given to pcretest, it outputs the size of each compiled
+pattern after it has been compiled. It is equivalent to adding /M to each
+regular expression. For compatibility with earlier versions of pcretest, -s is
+a synonym for -m.
If the -t option is given, each compile, study, and match is run 20000 times
while being timed, and the resulting time per compile or match is output in
diff --git a/Tech.Notes b/Tech.Notes
index d564c7a..d485a4e 100644
--- a/Tech.Notes
+++ b/Tech.Notes
@@ -185,9 +185,11 @@ compiled data its minimum number of times (or once with a BRAZERO if the
minimum is zero), with the final copy terminating with a KETRMIN or KETRMAX as
appropriate.
-A subpattern with a bounded maximum repetition is replicated up to the maximum
-number of times, with BRAZERO or BRAMINZERO before each replication after the
-minimum. In effect, (abc){2,5} becomes (abc)(abc)(abc)?(abc)?(abc)?.
+A subpattern with a bounded maximum repetition is replicated in a nested
+fashion up to the maximum number of times, with BRAZERO or BRAMINZERO before
+each replication after the minimum, so that, for example, (abc){2,5} is
+compiled as (abc)(abc)((abc)((abc)(abc)?)?)?. The 200-bracket limit does not
+apply to these internally generated brackets.
Assertions
diff --git a/internal.h b/internal.h
index 1b9ffe2..a955393 100644
--- a/internal.h
+++ b/internal.h
@@ -3,7 +3,7 @@
*************************************************/
-#define PCRE_VERSION "2.03 12-Feb-1999"
+#define PCRE_VERSION "2.04 19-Feb-1999"
/* This is a library of functions to support regular expressions whose syntax
diff --git a/pcre.c b/pcre.c
index 8da2134..d4af5f8 100644
--- a/pcre.c
+++ b/pcre.c
@@ -1091,8 +1091,10 @@ for (;; ptr++)
else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
(int)*previous == OP_COND)
{
- int i, ketoffset = 0;
+ register int i;
+ int ketoffset = 0;
int len = code - previous;
+ uschar *bralink = NULL;
/* If the maximum repeat count is unlimited, find the end of the bracket
by scanning through from the start, and compute the offset back to it
@@ -1107,6 +1109,130 @@ for (;; ptr++)
ketoffset = code - ket;
}
+ /* The case of a zero minimum is special because of the need to stick
+ OP_BRAZERO in front of it, and because the group appears once in the
+ data, whereas in other cases it appears the minimum number of times. For
+ this reason, it is simplest to treat this case separately, as otherwise
+ the code gets far too mess. There are several special subcases when the
+ minimum is zero. */
+
+ if (repeat_min == 0)
+ {
+ /* If the maximum is also zero, we just omit the group from the output
+ altogether. */
+
+ if (repeat_max == 0)
+ {
+ code = previous;
+ previous = NULL;
+ break;
+ }
+
+ /* If the maximum is 1 or unlimited, we just have to stick in the
+ BRAZERO and do no more at this point. */
+
+ if (repeat_max <= 1)
+ {
+ memmove(previous+1, previous, len);
+ code++;
+ *previous++ = OP_BRAZERO + repeat_type;
+ }
+
+ /* If the maximum is greater than 1 and limited, we have to replicate
+ in a nested fashion, sticking OP_BRAZERO before each set of brackets.
+ The first one has to be handled carefully because it's the original
+ copy, which has to be moved up. The remainder can be handled by code
+ that is common with the non-zero minimum case below. We just have to
+ adjust the value or repeat_max, since one less copy is required. */
+
+ else
+ {
+ int offset;
+ memmove(previous+4, previous, len);
+ code += 4;
+ *previous++ = OP_BRAZERO + repeat_type;
+ *previous++ = OP_BRA;
+
+ /* We chain together the bracket offset fields that have to be
+ filled in later when the ends of the brackets are reached. */
+
+ offset = (bralink == NULL)? 0 : previous - bralink;
+ bralink = previous;
+ *previous++ = offset >> 8;
+ *previous++ = offset & 255;
+ }
+
+ repeat_max--;
+ }
+
+ /* If the minimum is greater than zero, replicate the group as many
+ times as necessary, and adjust the maximum to the number of subsequent
+ copies that we need. */
+
+ else
+ {
+ for (i = 1; i < repeat_min; i++)
+ {
+ memcpy(code, previous, len);
+ code += len;
+ }
+ if (repeat_max > 0) repeat_max -= repeat_min;
+ }
+
+ /* This code is common to both the zero and non-zero minimum cases. If
+ the maximum is limited, it replicates the group in a nested fashion,
+ remembering the bracket starts on a stack. In the case of a zero minimum,
+ the first one was set up above. In all cases the repeat_max now specifies
+ the number of additional copies needed. */
+
+ if (repeat_max >= 0)
+ {
+ for (i = repeat_max - 1; i >= 0; i--)
+ {
+ *code++ = OP_BRAZERO + repeat_type;
+
+ /* All but the final copy start a new nesting, maintaining the
+ chain of brackets outstanding. */
+
+ if (i != 0)
+ {
+ int offset;
+ *code++ = OP_BRA;
+ offset = (bralink == NULL)? 0 : code - bralink;
+ bralink = code;
+ *code++ = offset >> 8;
+ *code++ = offset & 255;
+ }
+
+ memcpy(code, previous, len);
+ code += len;
+ }
+
+ /* Now chain through the pending brackets, and fill in their length
+ fields (which are holding the chain links pro tem). */
+
+ while (bralink != NULL)
+ {
+ int oldlinkoffset;
+ int offset = code - bralink + 1;
+ uschar *bra = code - offset;
+ oldlinkoffset = (bra[1] << 8) + bra[2];
+ bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
+ *code++ = OP_KET;
+ *code++ = bra[1] = offset >> 8;
+ *code++ = bra[2] = (offset & 255);
+ }
+ }
+
+ /* If the maximum is unlimited, set a repeater in the final copy. We
+ can't just offset backwards from the current code point, because we
+ don't know if there's been an options resetting after the ket. The
+ correct offset was computed above. */
+
+ else code[-ketoffset] = OP_KETRMAX + repeat_type;
+
+
+#ifdef NEVER
/* If the minimum is greater than zero, and the maximum is unlimited or
equal to the minimum, the first copy remains where it is, and is
replicated up to the minimum number of times. This case includes the +
@@ -1154,6 +1280,9 @@ for (;; ptr++)
correct offset was computed above. */
if (repeat_max == -1) code[-ketoffset] = OP_KETRMAX + repeat_type;
+#endif
+
+
}
/* Else there's some kind of shambles */
@@ -2272,14 +2401,29 @@ while ((c = *(++ptr)) != 0)
else if (c == '+') { maxval = -1; ptr++; }
else if (c == '?') { minval = 0; ptr++; }
- /* If there is a minimum > 1 we have to replicate up to minval-1 times;
- if there is a limited maximum we have to replicate up to maxval-1 times
- and allow for a BRAZERO item before each optional copy, as we also have
- to do before the first copy if the minimum is zero. */
+ /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
+ group, and if the maximum is greater than zero, we have to replicate
+ maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
+ bracket set - hence the 7. */
+
+ if (minval == 0)
+ {
+ length++;
+ if (maxval > 0) length += (maxval - 1) * (duplength + 7);
+ }
- if (minval == 0) length++;
- else if (minval > 1) length += (minval - 1) * duplength;
- if (maxval > minval) length += (maxval - minval) * (duplength + 1);
+ /* When the minimum is greater than zero, 1 we have to replicate up to
+ minval-1 times, with no additions required in the copies. Then, if
+ there is a limited maximum we have to replicate up to maxval-1 times
+ allowing for a BRAZERO item before each optional copy and nesting
+ brackets for all but one of the optional copies. */
+
+ else
+ {
+ length += (minval - 1) * duplength;
+ if (maxval > minval) /* Need this test as maxval=-1 means no limit */
+ length += (maxval - minval) * (duplength + 7) - 6;
+ }
}
continue;
@@ -2775,7 +2919,11 @@ for (;;)
int number = op - OP_BRA;
int offset = number << 1;
- DPRINTF(("start bracket %d\n", number));
+#ifdef DEBUG
+ printf("start bracket %d subject=", number);
+ pchars(eptr, 16, TRUE, md);
+ printf("\n");
+#endif
if (offset < md->offset_max)
{
diff --git a/pcretest.c b/pcretest.c
index 4b58895..da736a2 100644
--- a/pcretest.c
+++ b/pcretest.c
@@ -274,7 +274,9 @@ compiled re. */
static void *new_malloc(size_t size)
{
-if (log_store) fprintf(outfile, "Store size request: %d\n", (int)size);
+if (log_store)
+ fprintf(outfile, "Memory allocation request: %d (code space %d)\n",
+ (int)size, (int)size - offsetof(real_pcre, code[0]));
return malloc(size);
}
@@ -292,6 +294,7 @@ int study_options = 0;
int op = 1;
int timeit = 0;
int showinfo = 0;
+int showstore = 0;
int posix = 0;
int debug = 0;
int done = 0;
@@ -306,7 +309,8 @@ outfile = stdout;
while (argc > 1 && argv[op][0] == '-')
{
- if (strcmp(argv[op], "-s") == 0) log_store = 1;
+ if (strcmp(argv[op], "-s") == 0 || strcmp(argv[op], "-m") == 0)
+ showstore = 1;
else if (strcmp(argv[op], "-t") == 0) timeit = 1;
else if (strcmp(argv[op], "-i") == 0) showinfo = 1;
else if (strcmp(argv[op], "-d") == 0) showinfo = debug = 1;
@@ -434,6 +438,8 @@ while (!done)
options = 0;
study_options = 0;
+ log_store = showstore; /* default from command line */
+
while (*pp != 0)
{
switch (*pp++)
@@ -447,6 +453,7 @@ while (!done)
case 'D': do_debug = do_showinfo = 1; break;
case 'E': options |= PCRE_DOLLAR_ENDONLY; break;
case 'I': do_showinfo = 1; break;
+ case 'M': log_store = 1; break;
case 'P': do_posix = 1; break;
case 'S': do_study = 1; break;
case 'U': options |= PCRE_UNGREEDY; break;
diff --git a/testinput b/testinput
index ffd2bb4..d5d7eb3 100644
--- a/testinput
+++ b/testinput
@@ -1682,4 +1682,62 @@
/\d\d\/\d\d\/\d\d\d\d/
01/01/2000
+/word (?:[a-zA-Z0-9]+ ){0,10}otherword/
+ word cat dog elephant mussel cow horse canary baboon snake shark otherword
+ word cat dog elephant mussel cow horse canary baboon snake shark
+
+/word (?:[a-zA-Z0-9]+ ){0,300}otherword/
+ word cat dog elephant mussel cow horse canary baboon snake shark the quick brown fox and the lazy dog and several other words getting close to thirty by now I hope
+
+/^(a){0,0}/
+ bcd
+ abc
+ aab
+
+/^(a){0,1}/
+ bcd
+ abc
+ aab
+
+/^(a){0,2}/
+ bcd
+ abc
+ aab
+
+/^(a){0,3}/
+ bcd
+ abc
+ aab
+ aaa
+
+/^(a){0,}/
+ bcd
+ abc
+ aab
+ aaa
+ aaaaaaaa
+
+/^(a){1,1}/
+ bcd
+ abc
+ aab
+
+/^(a){1,2}/
+ bcd
+ abc
+ aab
+
+/^(a){1,3}/
+ bcd
+ abc
+ aab
+ aaa
+
+/^(a){1,}/
+ bcd
+ abc
+ aab
+ aaa
+ aaaaaaaa
+
/ End of test input /
diff --git a/testinput2 b/testinput2
index f9b4427..e8ca8f8 100644
--- a/testinput2
+++ b/testinput2
@@ -425,5 +425,9 @@
/^abc\00def/
abc\00def\L\C0
+
+/word ((?:[a-zA-Z0-9]+ )((?:[a-zA-Z0-9]+ )((?:[a-zA-Z0-9]+ )((?:[a-zA-Z0-9]+
+)((?:[a-zA-Z0-9]+ )((?:[a-zA-Z0-9]+ )((?:[a-zA-Z0-9]+ )((?:[a-zA-Z0-9]+
+)?)?)?)?)?)?)?)?)?otherword/M
/ End of test input /
diff --git a/testinput3 b/testinput3
index e4b356c..2e686b3 100644
--- a/testinput3
+++ b/testinput3
@@ -1630,5 +1630,12 @@
endingwxyz
*** Failers
a rather long string that doesn't end with one of them
-
+
+/word (?>(?:(?!otherword)[a-zA-Z0-9]+ ){0,30})otherword/
+ word cat dog elephant mussel cow horse canary baboon snake shark otherword
+ word cat dog elephant mussel cow horse canary baboon snake shark
+
+/word (?>[a-zA-Z0-9]+ ){0,30}otherword/
+ word cat dog elephant mussel cow horse canary baboon snake shark the quick brown fox and the lazy dog and several other words getting close to thirty by now I hope
+
/ End of test input /
diff --git a/testoutput b/testoutput
index 408588b..3f677c0 100644
--- a/testoutput
+++ b/testoutput
@@ -1,4 +1,4 @@
-PCRE version 2.03 12-Feb-1999
+PCRE version 2.04 19-Feb-1999
/the quick brown fox/
the quick brown fox
@@ -2531,5 +2531,121 @@ No match
01/01/2000
0: 01/01/2000
+/word (?:[a-zA-Z0-9]+ ){0,10}otherword/
+ word cat dog elephant mussel cow horse canary baboon snake shark otherword
+ 0: word cat dog elephant mussel cow horse canary baboon snake shark otherword
+ word cat dog elephant mussel cow horse canary baboon snake shark
+No match
+
+/word (?:[a-zA-Z0-9]+ ){0,300}otherword/
+ word cat dog elephant mussel cow horse canary baboon snake shark the quick brown fox and the lazy dog and several other words getting close to thirty by now I hope
+No match
+
+/^(a){0,0}/
+ bcd
+ 0:
+ abc
+ 0:
+ aab
+ 0:
+
+/^(a){0,1}/
+ bcd
+ 0:
+ abc
+ 0: a
+ 1: a
+ aab
+ 0: a
+ 1: a
+
+/^(a){0,2}/
+ bcd
+ 0:
+ abc
+ 0: a
+ 1: a
+ aab
+ 0: aa
+ 1: a
+
+/^(a){0,3}/
+ bcd
+ 0:
+ abc
+ 0: a
+ 1: a
+ aab
+ 0: aa
+ 1: a
+ aaa
+ 0: aaa
+ 1: a
+
+/^(a){0,}/
+ bcd
+ 0:
+ abc
+ 0: a
+ 1: a
+ aab
+ 0: aa
+ 1: a
+ aaa
+ 0: aaa
+ 1: a
+ aaaaaaaa
+ 0: aaaaaaaa
+ 1: a
+
+/^(a){1,1}/
+ bcd
+No match
+ abc
+ 0: a
+ 1: a
+ aab
+ 0: a
+ 1: a
+
+/^(a){1,2}/
+ bcd
+No match
+ abc
+ 0: a
+ 1: a
+ aab
+ 0: aa
+ 1: a
+
+/^(a){1,3}/
+ bcd
+No match
+ abc
+ 0: a
+ 1: a
+ aab
+ 0: aa
+ 1: a
+ aaa
+ 0: aaa
+ 1: a
+
+/^(a){1,}/
+ bcd
+No match
+ abc
+ 0: a
+ 1: a
+ aab
+ 0: aa
+ 1: a
+ aaa
+ 0: aaa
+ 1: a
+ aaaaaaaa
+ 0: aaaaaaaa
+ 1: a
+
/ End of test input /
diff --git a/testoutput2 b/testoutput2
index 6416c79..af2309e 100644
--- a/testoutput2
+++ b/testoutput2
@@ -1,4 +1,4 @@
-PCRE version 2.03 12-Feb-1999
+PCRE version 2.04 19-Feb-1999
/(a)b|/
Identifying subpattern count = 1
@@ -977,6 +977,14 @@ No first char
0: abc\x00def
0C abc (7)
0L abc
+
+/word ((?:[a-zA-Z0-9]+ )((?:[a-zA-Z0-9]+ )((?:[a-zA-Z0-9]+ )((?:[a-zA-Z0-9]+
+)((?:[a-zA-Z0-9]+ )((?:[a-zA-Z0-9]+ )((?:[a-zA-Z0-9]+ )((?:[a-zA-Z0-9]+
+)?)?)?)?)?)?)?)?)?otherword/M
+Memory allocation request: 441 (code space 428)
+Identifying subpattern count = 8
+No options
+First char = 'w'
/ End of test input /
Identifying subpattern count = 0
diff --git a/testoutput3 b/testoutput3
index 0be0787..ffff396 100644
--- a/testoutput3
+++ b/testoutput3
@@ -1,4 +1,4 @@
-PCRE version 2.03 12-Feb-1999
+PCRE version 2.04 19-Feb-1999
/(?<!bar)foo/
foo
@@ -2817,6 +2817,16 @@ No match
No match
a rather long string that doesn't end with one of them
No match
-
+
+/word (?>(?:(?!otherword)[a-zA-Z0-9]+ ){0,30})otherword/
+ word cat dog elephant mussel cow horse canary baboon snake shark otherword
+ 0: word cat dog elephant mussel cow horse canary baboon snake shark otherword
+ word cat dog elephant mussel cow horse canary baboon snake shark
+No match
+
+/word (?>[a-zA-Z0-9]+ ){0,30}otherword/
+ word cat dog elephant mussel cow horse canary baboon snake shark the quick brown fox and the lazy dog and several other words getting close to thirty by now I hope
+No match
+
/ End of test input /
diff --git a/testoutput4 b/testoutput4
index fde2a46..d301506 100644
--- a/testoutput4
+++ b/testoutput4
@@ -1,4 +1,4 @@
-PCRE version 2.03 12-Feb-1999
+PCRE version 2.04 19-Feb-1999
/^[\w]+/
*** Failers