summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>2016-11-23 17:17:57 +0000
committerph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>2016-11-23 17:17:57 +0000
commit0986cdb1a2b5aa6331a545d018863983d5951459 (patch)
tree7a91f8093fc436bf3b81041f36fbc8998d2f860f
parentb55d9179309bb3b6ba08c92d279555b2f897be69 (diff)
downloadpcre2-0986cdb1a2b5aa6331a545d018863983d5951459.tar.gz
Fix bad behaviour for subroutine call in lookbehind when the called subroutine
contained an option setting such as (?s) and PCRE2_ANCHORED was set. git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@606 6239d852-aaf2-0410-a92c-79f79f948069
-rw-r--r--ChangeLog6
-rw-r--r--HACKING10
-rwxr-xr-xRunTest2
-rw-r--r--src/pcre2_compile.c141
-rw-r--r--src/pcre2_error.c1
-rw-r--r--testdata/testinput22
-rw-r--r--testdata/testoutput25
7 files changed, 146 insertions, 21 deletions
diff --git a/ChangeLog b/ChangeLog
index 8ed6e83..8dd69cf 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -89,6 +89,12 @@ copied).
(k) If parsing a pattern exactly filled the buffer, the internal test for
overrun did not check when the final META_END item was added.
+
+ (l) If a lookbehind contained a subroutine call, and the called group
+ contained an option setting such as (?s), and the PCRE2_ANCHORED option
+ was set, unpredictable behaviour could occur. The underlying bug was
+ incorrect code and insufficient checking while searching for the end of
+ the called subroutine in the parsed pattern.
4. Back references are now permitted in lookbehind assertions when there are
no duplicated group numbers (that is, (?| has not been used), and, if the
diff --git a/HACKING b/HACKING
index 85b52c9..5c77601 100644
--- a/HACKING
+++ b/HACKING
@@ -240,8 +240,8 @@ META_RECURSE is always followed by an offset, for use in error messages.
META_ESCAPE has an ESC_xxx value as its data. For ESC_P and ESC_p, the next
element contains the 16-bit type and data property values, packed together.
ESC_g and ESC_k are used only for named references - numerical ones are turned
-into META_RECURSE or META_BACKREF as appropriate. They are followed by a length
-and an offset into the pattern to specify the name.
+into META_RECURSE or META_BACKREF as appropriate. ESC_g and ESC_k are followed
+by a length and an offset into the pattern to specify the name.
The following have one data item that follows in the next vector element:
@@ -279,10 +279,6 @@ The following is followed just by an offset, for use in error messages:
META_COND_DEFINE (?(DEFINE)
-In fact, META_COND_ASSERT is used for any group starting (?( that does not
-match any of the other META_COND cases. The check that this group is an
-assertion (optionally preceded by a callout) happens at compile time.
-
The following are also followed just by an offset, but also the lower 16 bits
of the main word contain the length of the first branch of the lookbehind
group; this is used when generating OP_REVERSE for that branch.
@@ -799,4 +795,4 @@ not a real opcode, but is used to check that tables indexed by opcode are the
correct length, in order to catch updating errors.
Philip Hazel
-October 2016
+November 2016
diff --git a/RunTest b/RunTest
index 38f0078..b53d404 100755
--- a/RunTest
+++ b/RunTest
@@ -502,7 +502,7 @@ for bmode in "$test8" "$test16" "$test32"; do
for opt in "" $jitopt; do
$sim $valgrind ${opt:+$vjs} ./pcre2test -q $test2stack $bmode $opt $testdata/testinput2 testtry
if [ $? = 0 ] ; then
- $sim $valgrind ${opt:+$vjs} ./pcre2test -q $bmode $opt -error -63,-62,-2,-1,0,100,188,189,190 >>testtry
+ $sim $valgrind ${opt:+$vjs} ./pcre2test -q $bmode $opt -error -63,-62,-2,-1,0,100,188,189,190,191 >>testtry
checkresult $? 2 "$opt"
else
echo " "
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
index 4f72872..2703791 100644
--- a/src/pcre2_compile.c
+++ b/src/pcre2_compile.c
@@ -198,7 +198,10 @@ don't have to check them every time. */
unsigned ints. Values less than META_END are literal data values. The coding
for identifying the item is in the top 16-bits, leaving 16 bits for the
additional data that some of them need. The META_CODE, META_DATA, and META_DIFF
-macros are used to manipulate parsed pattern elements. */
+macros are used to manipulate parsed pattern elements.
+
+NOTE: When these definitions are changed, the table of extra lengths for each
+code (meta_extra_lengths, just below) must be updated to remain in step. */
#define META_END 0x80000000u /* End of pattern */
@@ -277,6 +280,73 @@ versions. */
#define META_FIRST_QUANTIFIER META_ASTERISK
#define META_LAST_QUANTIFIER META_MINMAX_QUERY
+/* Table of extra lengths for each of the meta codes. Must be kept in step with
+the definitions above. For some items these values are a basic length to which
+a variable amount has to be added. */
+
+static unsigned char meta_extra_lengths[] = {
+ 0, /* META_END */
+ 0, /* META_ALT */
+ 0, /* META_ATOMIC */
+ 0, /* META_BACKREF - more if group is >= 10 */
+ 1+SIZEOFFSET, /* META_BACKREF_BYNAME */
+ 1, /* META_BIGVALUE */
+ 3, /* META_CALLOUT_NUMBER */
+ 3+SIZEOFFSET, /* META_CALLOUT_STRING */
+ 0, /* META_CAPTURE */
+ 0, /* META_CIRCUMFLEX */
+ 0, /* META_CLASS */
+ 0, /* META_CLASS_EMPTY */
+ 0, /* META_CLASS_EMPTY_NOT */
+ 0, /* META_CLASS_END */
+ 0, /* META_CLASS_NOT */
+ 0, /* META_COND_ASSERT */
+ SIZEOFFSET, /* META_COND_DEFINE */
+ 1+SIZEOFFSET, /* META_COND_NAME */
+ 1+SIZEOFFSET, /* META_COND_NUMBER */
+ 1+SIZEOFFSET, /* META_COND_RNAME */
+ 1+SIZEOFFSET, /* META_COND_RNUMBER */
+ 3, /* META_COND_VERSION */
+ 0, /* META_DOLLAR */
+ 0, /* META_DOT */
+ 0, /* META_ESCAPE - more for ESC_P, ESC_p, ESC_g, ESC_k */
+ 0, /* META_KET */
+ 0, /* META_NOCAPTURE */
+ 1, /* META_OPTIONS */
+ 1, /* META_POSIX */
+ 1, /* META_POSIX_NEG */
+ 0, /* META_RANGE_ESCAPED */
+ 0, /* META_RANGE_LITERAL */
+ SIZEOFFSET, /* META_RECURSE */
+ 1+SIZEOFFSET, /* META_RECURSE_BYNAME */
+ 0, /* META_LOOKAHEAD */
+ 0, /* META_LOOKAHEADNOT */
+ SIZEOFFSET, /* META_LOOKBEHIND */
+ SIZEOFFSET, /* META_LOOKBEHINDNOT */
+ 1, /* META_MARK - plus the string length */
+ 0, /* META_ACCEPT */
+ 0, /* META_COMMIT */
+ 0, /* META_FAIL */
+ 0, /* META_PRUNE */
+ 1, /* META_PRUNE_ARG - plus the string length */
+ 0, /* META_SKIP */
+ 1, /* META_SKIP_ARG - plus the string length */
+ 0, /* META_THEN */
+ 1, /* META_THEN_ARG - plus the string length */
+ 0, /* META_ASTERISK */
+ 0, /* META_ASTERISK_PLUS */
+ 0, /* META_ASTERISK_QUERY */
+ 0, /* META_PLUS */
+ 0, /* META_PLUS_PLUS */
+ 0, /* META_PLUS_QUERY */
+ 0, /* META_QUERY */
+ 0, /* META_QUERY_PLUS */
+ 0, /* META_QUERY_QUERY */
+ 2, /* META_MINMAX */
+ 2, /* META_MINMAX_PLUS */
+ 2 /* META_MINMAX_QUERY */
+};
+
/* Types for skipping parts of a parsed pattern. */
enum { PSKIP_ALT, PSKIP_CLASS, PSKIP_KET };
@@ -318,8 +388,8 @@ comparison). */
locale, and may mark arbitrary characters as digits. We want to recognize only
0-9, a-z, and A-Z as hex digits, which is why we have a private table here. It
costs 256 bytes, but it is a lot faster than doing character value tests (at
-least in some simple cases I timed), and in some applications one wants PCRE to
-compile efficiently as well as match efficiently. The value in the table is
+least in some simple cases I timed), and in some applications one wants PCRE2
+to compile efficiently as well as match efficiently. The value in the table is
the binary hex digit value, or 0xff for non-hex digits. */
/* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
@@ -646,7 +716,7 @@ enum { ERR0 = COMPILE_ERROR_BASE,
ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
- ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88, ERR89 };
+ ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88, ERR89, ERR90 };
/* This is a table of start-of-pattern options such as (*UTF) and settings such
as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
@@ -1036,7 +1106,7 @@ if ((code->flags & PCRE2_DEREF_TABLES) != 0)
ref_count = (PCRE2_SIZE *)(code->tables + tables_length);
(*ref_count)++;
}
-
+
return newcode;
}
@@ -5325,7 +5395,7 @@ for (;; pptr++)
/* Any other non-literal must be an escape */
- if (meta > META_END)
+ if (meta >= META_END)
{
if (META_CODE(meta) != META_ESCAPE)
{
@@ -8240,6 +8310,7 @@ Arguments:
PSKIP_KET when only META_KET ends the skip
Returns: new value of pptr
+ NULL if META_END is reached - should never occur
*/
static uint32_t *
@@ -8249,12 +8320,47 @@ uint32_t nestlevel = 0;
for (pptr += 1;; pptr++)
{
- switch(META_CODE(*pptr))
+ uint32_t meta = META_CODE(*pptr);
+ switch(meta)
{
- case META_BIGVALUE:
- pptr += 1;
+ default: /* Just skip over most items */
+ break;
+
+ /* This should never occur. */
+
+ case META_END:
+ return NULL;
+
+ /* The data for these items is variable in length. */
+
+ case META_BACKREF: /* Offset is present only if group >= 10 */
+ if (META_DATA(*pptr) >= 10) pptr += SIZEOFFSET;
+ break;
+
+ case META_ESCAPE: /* A few escapes are followed by data items. */
+ switch (META_DATA(*pptr))
+ {
+ case ESC_P:
+ case ESC_p:
+ pptr += 1;
+ break;
+
+ case ESC_g:
+ case ESC_k:
+ pptr += 1 + SIZEOFFSET;
+ break;
+ }
break;
+ case META_MARK: /* Add the length of the name. */
+ case META_PRUNE_ARG:
+ case META_SKIP_ARG:
+ case META_THEN_ARG:
+ pptr += pptr[1];
+ break;
+
+ /* These are the "active" items in this loop. */
+
case META_CLASS_END:
if (skiptype == PSKIP_CLASS) return pptr;
break;
@@ -8284,10 +8390,13 @@ for (pptr += 1;; pptr++)
if (nestlevel == 0) return pptr;
nestlevel--;
break;
-
- default:
- break;
}
+
+ /* The extra data item length for each meta is in a table. */
+
+ meta = (meta & 0x0fff0000u) >> 16;
+ if (meta >= sizeof(meta_extra_lengths)) return NULL;
+ pptr += meta_extra_lengths[meta];
}
/* Control never reaches here */
return pptr;
@@ -8427,6 +8536,7 @@ for (;; pptr++)
case META_ACCEPT:
case META_FAIL:
pptr = parsed_skip(pptr, PSKIP_ALT);
+ if (pptr == NULL) goto PARSED_SKIP_FAILED;
goto EXIT;
case META_MARK:
@@ -8457,6 +8567,7 @@ for (;; pptr++)
case META_CLASS_NOT:
itemlength = 1;
pptr = parsed_skip(pptr, PSKIP_CLASS);
+ if (pptr == NULL) goto PARSED_SKIP_FAILED;
break;
case META_CLASS_EMPTY_NOT:
@@ -8498,6 +8609,7 @@ for (;; pptr++)
case META_LOOKAHEAD:
case META_LOOKAHEADNOT:
pptr = parsed_skip(pptr, PSKIP_KET);
+ if (pptr == NULL) goto PARSED_SKIP_FAILED;
break;
/* Lookbehinds can be ignored, but must themselves be checked. */
@@ -8595,6 +8707,7 @@ for (;; pptr++)
}
gptrend = parsed_skip(gptr, PSKIP_KET);
+ if (gptrend == NULL) goto PARSED_SKIP_FAILED;
if (pptr > gptr && pptr < gptrend) goto ISNOTFIXED; /* Local recursion */
for (r = recurses; r != NULL; r = r->prev) if (r->groupptr == gptr) break;
if (r != NULL) goto ISNOTFIXED; /* Mutual recursion */
@@ -8685,6 +8798,10 @@ EXIT:
*pptrptr = pptr;
if (branchlength > cb->max_lookbehind) cb->max_lookbehind = branchlength;
return branchlength;
+
+PARSED_SKIP_FAILED:
+*errcodeptr = ERR90;
+return -1;
}
diff --git a/src/pcre2_error.c b/src/pcre2_error.c
index 83c647d..50ad2fd 100644
--- a/src/pcre2_error.c
+++ b/src/pcre2_error.c
@@ -175,6 +175,7 @@ static const unsigned char compile_error_texts[] =
"pattern string is longer than the limit set by the application\0"
"internal error: unknown code in parsed pattern\0"
/* 90 */
+ "internal error: bad code value in parsed_skip()\0"
;
/* Match-time and UTF error texts are in the same format. */
diff --git a/testdata/testinput2 b/testdata/testinput2
index 159ab99..b2c66a1 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -4916,4 +4916,6 @@ a)"xI
/(?(?(?(?(?(?))))))/
+/(?<=(?1))((?s))/anchored
+
# End of testinput2
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index 9a0a4dc..d980c76 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -15367,6 +15367,8 @@ Failed: error 128 at offset 6: assertion expected after (?( or (?(?C)
/(?(?(?(?(?(?))))))/
Failed: error 128 at offset 2: assertion expected after (?( or (?(?C)
+/(?<=(?1))((?s))/anchored
+
# End of testinput2
Error -63: PCRE2_ERROR_BADDATA (unknown error number)
Error -62: bad serialized data
@@ -15376,4 +15378,5 @@ Error 0: PCRE2_ERROR_BADDATA (unknown error number)
Error 100: no error
Error 188: pattern string is longer than the limit set by the application
Error 189: internal error: unknown code in parsed pattern
-Error 190: PCRE2_ERROR_BADDATA (unknown error number)
+Error 190: internal error: bad code value in parsed_skip()
+Error 191: PCRE2_ERROR_BADDATA (unknown error number)