summary | refs | log | tree | commit | diff
path: root/ext/pcre/pcrelib/pcre_study.c
diff options
context:
space:
mode:
author: Stanislav Malyshev <stas@php.net> 2015-04-27 23:15:27 -0700
committer: Stanislav Malyshev <stas@php.net> 2015-04-27 23:16:54 -0700
commit: 23917b451bf4029e78082b4e0a56bc4c6f117990 (patch)
tree: 8f5f061cc3073e0428d0c92b49780fab4c4d69eb /ext/pcre/pcrelib/pcre_study.c
parent: 983f155e1ca47ac6eedd68922b86248b03b45096 (diff)
download: php-git-23917b451bf4029e78082b4e0a56bc4c6f117990.tar.gz
Upgrade PCRE to 8.36, it fixes some crashes
We probably will need to go to 8.37 once it is released.
Diffstat (limited to 'ext/pcre/pcrelib/pcre_study.c')
-rw-r--r-- ext/pcre/pcrelib/pcre_study.c 187
1 files changed, 134 insertions, 53 deletions
diff --git a/ext/pcre/pcrelib/pcre_study.c b/ext/pcre/pcrelib/pcre_study.c
index 12d2a66817..f19d9fbb90 100644
--- a/ext/pcre/pcrelib/pcre_study.c
+++ b/ext/pcre/pcrelib/pcre_study.c
@@ -66,8 +66,9 @@ string of that length that matches. In UTF8 mode, the result is in characters
rather than bytes.
Arguments:
+ re compiled pattern block
code pointer to start of group (the bracket)
- startcode pointer to start of the whole pattern
+ startcode pointer to start of the whole pattern's code
options the compiling options
int RECURSE depth
@@ -78,8 +79,8 @@ Returns: the minimum length
*/
static int
-find_minlength(const pcre_uchar *code, const pcre_uchar *startcode, int options,
- int recurse_depth)
+find_minlength(const REAL_PCRE *re, const pcre_uchar *code,
+ const pcre_uchar *startcode, int options, int recurse_depth)
{
int length = -1;
/* PCRE_UTF16 has the same value as PCRE_UTF8. */
@@ -129,7 +130,7 @@ for (;;)
case OP_SBRAPOS:
case OP_ONCE:
case OP_ONCE_NC:
- d = find_minlength(cc, startcode, options, recurse_depth);
+ d = find_minlength(re, cc, startcode, options, recurse_depth);
if (d < 0) return d;
branchlength += d;
do cc += GET(cc, 1); while (*cc == OP_ALT);
@@ -175,9 +176,9 @@ for (;;)
case OP_REVERSE:
case OP_CREF:
- case OP_NCREF:
+ case OP_DNCREF:
case OP_RREF:
- case OP_NRREF:
+ case OP_DNRREF:
case OP_DEF:
case OP_CALLOUT:
case OP_SOD:
@@ -341,6 +342,7 @@ for (;;)
{
case OP_CRPLUS:
case OP_CRMINPLUS:
+ case OP_CRPOSPLUS:
branchlength++;
/* Fall through */
@@ -348,11 +350,14 @@ for (;;)
case OP_CRMINSTAR:
case OP_CRQUERY:
case OP_CRMINQUERY:
+ case OP_CRPOSSTAR:
+ case OP_CRPOSQUERY:
cc++;
break;
case OP_CRRANGE:
case OP_CRMINRANGE:
+ case OP_CRPOSRANGE:
branchlength += GET2(cc,1);
cc += 1 + 2 * IMM2_SIZE;
break;
@@ -375,7 +380,38 @@ for (;;)
matches an empty string (by default it causes a matching failure), so in
that case we must set the minimum length to zero. */
- case OP_REF:
+ case OP_DNREF: /* Duplicate named pattern back reference */
+ case OP_DNREFI:
+ if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
+ {
+ int count = GET2(cc, 1+IMM2_SIZE);
+ pcre_uchar *slot = (pcre_uchar *)re +
+ re->name_table_offset + GET2(cc, 1) * re->name_entry_size;
+ d = INT_MAX;
+ while (count-- > 0)
+ {
+ ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(slot, 0));
+ if (cs == NULL) return -2;
+ do ce += GET(ce, 1); while (*ce == OP_ALT);
+ if (cc > cs && cc < ce)
+ {
+ d = 0;
+ had_recurse = TRUE;
+ break;
+ }
+ else
+ {
+ int dd = find_minlength(re, cs, startcode, options, recurse_depth);
+ if (dd < d) d = dd;
+ }
+ slot += re->name_entry_size;
+ }
+ }
+ else d = 0;
+ cc += 1 + 2*IMM2_SIZE;
+ goto REPEAT_BACK_REFERENCE;
+
+ case OP_REF: /* Single back reference */
case OP_REFI:
if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
{
@@ -389,7 +425,7 @@ for (;;)
}
else
{
- d = find_minlength(cs, startcode, options, recurse_depth);
+ d = find_minlength(re, cs, startcode, options, recurse_depth);
}
}
else d = 0;
@@ -397,24 +433,29 @@ for (;;)
/* Handle repeated back references */
+ REPEAT_BACK_REFERENCE:
switch (*cc)
{
case OP_CRSTAR:
case OP_CRMINSTAR:
case OP_CRQUERY:
case OP_CRMINQUERY:
+ case OP_CRPOSSTAR:
+ case OP_CRPOSQUERY:
min = 0;
cc++;
break;
case OP_CRPLUS:
case OP_CRMINPLUS:
+ case OP_CRPOSPLUS:
min = 1;
cc++;
break;
case OP_CRRANGE:
case OP_CRMINRANGE:
+ case OP_CRPOSRANGE:
min = GET2(cc, 1);
cc += 1 + 2 * IMM2_SIZE;
break;
@@ -437,7 +478,8 @@ for (;;)
had_recurse = TRUE;
else
{
- branchlength += find_minlength(cs, startcode, options, recurse_depth + 1);
+ branchlength += find_minlength(re, cs, startcode, options,
+ recurse_depth + 1);
}
cc += 1 + LINK_SIZE;
break;
@@ -778,6 +820,10 @@ do
case OP_COND:
case OP_CREF:
case OP_DEF:
+ case OP_DNCREF:
+ case OP_DNREF:
+ case OP_DNREFI:
+ case OP_DNRREF:
case OP_DOLL:
case OP_DOLLM:
case OP_END:
@@ -786,7 +832,6 @@ do
case OP_EXTUNI:
case OP_FAIL:
case OP_MARK:
- case OP_NCREF:
case OP_NOT:
case OP_NOTEXACT:
case OP_NOTEXACTI:
@@ -818,8 +863,6 @@ do
case OP_NOTUPTOI:
case OP_NOT_HSPACE:
case OP_NOT_VSPACE:
- case OP_NRREF:
- case OP_PROP:
case OP_PRUNE:
case OP_PRUNE_ARG:
case OP_RECURSE:
@@ -835,11 +878,33 @@ do
case OP_SOM:
case OP_THEN:
case OP_THEN_ARG:
-#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
- case OP_XCLASS:
-#endif
return SSB_FAIL;
+ /* A "real" property test implies no starting bits, but the fake property
+ PT_CLIST identifies a list of characters. These lists are short, as they
+ are used for characters with more than one "other case", so there is no
+ point in recognizing them for OP_NOTPROP. */
+
+ case OP_PROP:
+ if (tcode[1] != PT_CLIST) return SSB_FAIL;
+ {
+ const pcre_uint32 *p = PRIV(ucd_caseless_sets) + tcode[2];
+ while ((c = *p++) < NOTACHAR)
+ {
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
+ if (utf)
+ {
+ pcre_uchar buff[6];
+ (void)PRIV(ord2utf)(c, buff);
+ c = buff[0];
+ }
+#endif
+ if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
+ }
+ }
+ try_next = FALSE;
+ break;
+
/* We can ignore word boundary tests. */
case OP_WORD_BOUNDARY:
@@ -1065,24 +1130,17 @@ do
try_next = FALSE;
break;
- /* The cbit_space table has vertical tab as whitespace; we have to
- ensure it is set as not whitespace. Luckily, the code value is the same
- (0x0b) in ASCII and EBCDIC, so we can just adjust the appropriate bit. */
+ /* The cbit_space table has vertical tab as whitespace; we no longer
+ have to play fancy tricks because Perl added VT to its whitespace at
+ release 5.18. PCRE added it at release 8.34. */
case OP_NOT_WHITESPACE:
set_nottype_bits(start_bits, cbit_space, table_limit, cd);
- start_bits[1] |= 0x08;
try_next = FALSE;
break;
- /* The cbit_space table has vertical tab as whitespace; we have to not
- set it from the table. Luckily, the code value is the same (0x0b) in
- ASCII and EBCDIC, so we can just adjust the appropriate bit. */
-
case OP_WHITESPACE:
- c = start_bits[1]; /* Save in case it was already set */
set_type_bits(start_bits, cbit_space, table_limit, cd);
- start_bits[1] = (start_bits[1] & ~0x08) | c;
try_next = FALSE;
break;
@@ -1183,24 +1241,16 @@ do
set_type_bits(start_bits, cbit_digit, table_limit, cd);
break;
- /* The cbit_space table has vertical tab as whitespace; we have to
- ensure it gets set as not whitespace. Luckily, the code value is the
- same (0x0b) in ASCII and EBCDIC, so we can just adjust the appropriate
- bit. */
+ /* The cbit_space table has vertical tab as whitespace; we no longer
+ have to play fancy tricks because Perl added VT to its whitespace at
+ release 5.18. PCRE added it at release 8.34. */
case OP_NOT_WHITESPACE:
set_nottype_bits(start_bits, cbit_space, table_limit, cd);
- start_bits[1] |= 0x08;
break;
- /* The cbit_space table has vertical tab as whitespace; we have to
- avoid setting it. Luckily, the code value is the same (0x0b) in ASCII
- and EBCDIC, so we can just adjust the appropriate bit. */
-
case OP_WHITESPACE:
- c = start_bits[1]; /* Save in case it was already set */
set_type_bits(start_bits, cbit_space, table_limit, cd);
- start_bits[1] = (start_bits[1] & ~0x08) | c;
break;
case OP_NOT_WORDCHAR:
@@ -1221,6 +1271,16 @@ do
with a value >= 0xc4 is a potentially valid starter because it starts a
character with a value > 255. */
+#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
+ case OP_XCLASS:
+ if ((tcode[1 + LINK_SIZE] & XCL_HASPROP) != 0)
+ return SSB_FAIL;
+ /* All bits are set. */
+ if ((tcode[1 + LINK_SIZE] & XCL_MAP) == 0 && (tcode[1 + LINK_SIZE] & XCL_NOT) != 0)
+ return SSB_FAIL;
+#endif
+ /* Fall through */
+
case OP_NCLASS:
#if defined SUPPORT_UTF && defined COMPILE_PCRE8
if (utf)
@@ -1237,8 +1297,21 @@ do
case OP_CLASS:
{
pcre_uint8 *map;
- tcode++;
- map = (pcre_uint8 *)tcode;
+#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
+ map = NULL;
+ if (*tcode == OP_XCLASS)
+ {
+ if ((tcode[1 + LINK_SIZE] & XCL_MAP) != 0)
+ map = (pcre_uint8 *)(tcode + 1 + LINK_SIZE + 1);
+ tcode += GET(tcode, 1);
+ }
+ else
+#endif
+ {
+ tcode++;
+ map = (pcre_uint8 *)tcode;
+ tcode += 32 / sizeof(pcre_uchar);
+ }
/* In UTF-8 mode, the bits in a bit map correspond to character
values, not to byte values. However, the bit map we are constructing is
@@ -1246,42 +1319,49 @@ do
value is > 127. In fact, there are only two possible starting bytes for
characters in the range 128 - 255. */
-#if defined SUPPORT_UTF && defined COMPILE_PCRE8
- if (utf)
+#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
+ if (map != NULL)
+#endif
{
- for (c = 0; c < 16; c++) start_bits[c] |= map[c];
- for (c = 128; c < 256; c++)
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
+ if (utf)
{
- if ((map[c/8] && (1 << (c&7))) != 0)
+ for (c = 0; c < 16; c++) start_bits[c] |= map[c];
+ for (c = 128; c < 256; c++)
{
- int d = (c >> 6) | 0xc0; /* Set bit for this starter */
- start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
- c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
+ if ((map[c/8] && (1 << (c&7))) != 0)
+ {
+ int d = (c >> 6) | 0xc0; /* Set bit for this starter */
+ start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
+ c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
+ }
}
}
- }
- else
+ else
#endif
- {
- /* In non-UTF-8 mode, the two bit maps are completely compatible. */
- for (c = 0; c < 32; c++) start_bits[c] |= map[c];
+ {
+ /* In non-UTF-8 mode, the two bit maps are completely compatible. */
+ for (c = 0; c < 32; c++) start_bits[c] |= map[c];
+ }
}
/* Advance past the bit map, and act on what follows. For a zero
minimum repeat, continue; otherwise stop processing. */
- tcode += 32 / sizeof(pcre_uchar);
switch (*tcode)
{
case OP_CRSTAR:
case OP_CRMINSTAR:
case OP_CRQUERY:
case OP_CRMINQUERY:
+ case OP_CRPOSSTAR:
+ case OP_CRPOSQUERY:
tcode++;
break;
case OP_CRRANGE:
case OP_CRMINRANGE:
+ case OP_CRPOSRANGE:
if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
else try_next = FALSE;
break;
@@ -1346,6 +1426,7 @@ pcre_uchar *code;
compile_data compile_block;
const REAL_PCRE *re = (const REAL_PCRE *)external_re;
+
*errorptr = NULL;
if (re == NULL || re->magic_number != MAGIC_NUMBER)
@@ -1422,7 +1503,7 @@ if ((re->options & PCRE_ANCHORED) == 0 &&
/* Find the minimum length of subject string. */
-switch(min = find_minlength(code, code, re->options, 0))
+switch(min = find_minlength(re, code, code, re->options, 0))
{
case -2: *errorptr = "internal error: missing capturing bracket"; return NULL;
case -3: *errorptr = "internal error: opcode not recognized"; return NULL;