summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>2010-11-23 15:34:55 +0000
committerph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>2010-11-23 15:34:55 +0000
commit1c4a198f0a69223930a4b118a35a618342d20898 (patch)
treec5d001906700e6bd5605b02ef767d4fcbe55c5b6
parent5f27c5877073e20515e674f9749251fb6710937d (diff)
downloadpcre-1c4a198f0a69223930a4b118a35a618342d20898.tar.gz
Fix internal error for recursive named back references.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@578 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r--ChangeLog6
-rw-r--r--pcre_compile.c52
-rw-r--r--testdata/testinput113
-rw-r--r--testdata/testinput210
-rw-r--r--testdata/testoutput115
-rw-r--r--testdata/testoutput247
6 files changed, 114 insertions, 9 deletions
diff --git a/ChangeLog b/ChangeLog
index b18a0d5..86fad7c 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -120,6 +120,12 @@ Version 8.11 10-Oct-2010
to pcregrep and other applications that have no direct access to PCRE
options. The new /Y option in pcretest sets this option when calling
pcre_compile().
+
+21. Change 18 of release 8.01 broke the use of named subpatterns for recursive
+ back references. Groups containing recursive back references were forced to
+ be atomic by that change, but in the case of named groups, the amount of
+ memory required was incorrectly computed, leading to "Failed: internal
+ error: code overflow". This has been fixed.
Version 8.10 25-Jun-2010
diff --git a/pcre_compile.c b/pcre_compile.c
index 5cb069a..14c252c 100644
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -1105,10 +1105,21 @@ top-level call starts at the beginning of the pattern. All other calls must
start at a parenthesis. It scans along a pattern's text looking for capturing
subpatterns, and counting them. If it finds a named pattern that matches the
name it is given, it returns its number. Alternatively, if the name is NULL, it
-returns when it reaches a given numbered subpattern. We know that if (?P< is
-encountered, the name will be terminated by '>' because that is checked in the
-first pass. Recursion is used to keep track of subpatterns that reset the
-capturing group numbers - the (?| feature.
+returns when it reaches a given numbered subpattern. Recursion is used to keep
+track of subpatterns that reset the capturing group numbers - the (?| feature.
+
+This function was originally called only from the second pass, in which we know
+that if (?< or (?' or (?P< is encountered, the name will be correctly
+terminated because that is checked in the first pass. There is now one call to
+this function in the first pass, to check for a recursive back reference by
+name (so that we can make the whole group atomic). In this case, we need check
+only up to the current position in the pattern, and that is still OK because
+any previous occurrences will have been checked. To make this work, the test
+for "end of pattern" is a check against cd->end_pattern in the main loop,
+instead of looking for a binary zero. This means that the special first-pass
+call can adjust cd->end_pattern temporarily. (Checks for binary zero while
+processing items within the loop are OK, because afterwards the main loop will
+terminate.)
Arguments:
ptrptr address of the current character pointer (updated)
@@ -1209,9 +1220,11 @@ if (ptr[0] == CHAR_LEFT_PARENTHESIS)
}
/* Past any initial parenthesis handling, scan for parentheses or vertical
-bars. */
+bars. Stop if we get to cd->end_pattern. Note that this is important for the
+first-pass call when this value is temporarily adjusted to stop at the current
+position. So DO NOT change this to a test for binary zero. */
-for (; *ptr != 0; ptr++)
+for (; ptr < cd->end_pattern; ptr++)
{
/* Skip over backslashed characters and also entire \Q...\E */
@@ -5373,11 +5386,17 @@ for (;; ptr++)
while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
namelen = (int)(ptr - name);
- /* In the pre-compile phase, do a syntax check and set a dummy
- reference number. */
+ /* In the pre-compile phase, do a syntax check. We used to just set
+ a dummy reference number, because it was not used in the first pass.
+ However, with the change of recursive back references to be atomic,
+ we have to look for the number so that this state can be identified, as
+ otherwise the incorrect length is computed. If it's not a backwards
+ reference, the dummy number will do. */
if (lengthptr != NULL)
{
+ const uschar *temp;
+
if (namelen == 0)
{
*errorcodeptr = ERR62;
@@ -5393,7 +5412,22 @@ for (;; ptr++)
*errorcodeptr = ERR48;
goto FAILED;
}
- recno = 0;
+
+ /* The name table does not exist in the first pass, so we cannot
+ do a simple search as in the code below. Instead, we have to scan the
+ pattern to find the number. It is important that we scan it only as
+ far as we have got because the syntax of named subpatterns has not
+ been checked for the rest of the pattern, and find_parens() assumes
+ correct syntax. In any case, it's a waste of resources to scan
+ further. We stop the scan at the current point by temporarily
+ adjusting the value of cd->end_pattern. */
+
+ temp = cd->end_pattern;
+ cd->end_pattern = ptr;
+ recno = find_parens(cd, name, namelen,
+ (options & PCRE_EXTENDED) != 0, utf8);
+ cd->end_pattern = temp;
+ if (recno < 0) recno = 0; /* Forward ref; set dummy number */
}
/* In the real compile, seek the name in the table. We check the name
diff --git a/testdata/testinput11 b/testdata/testinput11
index f115a64..79ebd08 100644
--- a/testdata/testinput11
+++ b/testdata/testinput11
@@ -504,4 +504,7 @@ however, we need the complication for Perl. ---/
/(*SKIP)b/
a
+/(?P<abn>(?P=abn)xxx|)+/
+ xxx
+
/-- End of testinput11 --/
diff --git a/testdata/testinput2 b/testdata/testinput2
index 8ac500e..7cc8761 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -3560,4 +3560,14 @@ with \Y. ---/
/^\cģ/
+/(?P<abn>(?P=abn)xxx)/BZ
+
+/(a\1z)/BZ
+
+/(?P<abn>(?P=abn)(?<badstufxxx)/BZ
+
+/(?P<abn>(?P=axn)xxx)/BZ
+
+/(?P<abn>(?P=axn)xxx)(?<axn>yy)/BZ
+
/-- End of testinput2 --/
diff --git a/testdata/testoutput11 b/testdata/testoutput11
index ee178c1..dbd22e7 100644
--- a/testdata/testoutput11
+++ b/testdata/testoutput11
@@ -970,4 +970,9 @@ No match
a
No match
+/(?P<abn>(?P=abn)xxx|)+/
+ xxx
+ 0:
+ 1:
+
/-- End of testinput11 --/
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index 5c599a4..531d617 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -11258,4 +11258,51 @@ Error -24
/^\cģ/
Failed: \c must be followed by an ASCII character at offset 3
+/(?P<abn>(?P=abn)xxx)/BZ
+------------------------------------------------------------------
+ Bra
+ Once
+ CBra 1
+ \1
+ xxx
+ Ket
+ Ket
+ Ket
+ End
+------------------------------------------------------------------
+
+/(a\1z)/BZ
+------------------------------------------------------------------
+ Bra
+ Once
+ CBra 1
+ a
+ \1
+ z
+ Ket
+ Ket
+ Ket
+ End
+------------------------------------------------------------------
+
+/(?P<abn>(?P=abn)(?<badstufxxx)/BZ
+Failed: syntax error in subpattern name (missing terminator) at offset 29
+
+/(?P<abn>(?P=axn)xxx)/BZ
+Failed: reference to non-existent subpattern at offset 15
+
+/(?P<abn>(?P=axn)xxx)(?<axn>yy)/BZ
+------------------------------------------------------------------
+ Bra
+ CBra 1
+ \2
+ xxx
+ Ket
+ CBra 2
+ yy
+ Ket
+ Ket
+ End
+------------------------------------------------------------------
+
/-- End of testinput2 --/