summaryrefslogtreecommitdiff
path: root/ext/pcre/pcrelib/pcre_study.c
diff options
context:
space:
mode:
Diffstat (limited to 'ext/pcre/pcrelib/pcre_study.c')
-rw-r--r--ext/pcre/pcrelib/pcre_study.c725
1 files changed, 565 insertions, 160 deletions
diff --git a/ext/pcre/pcrelib/pcre_study.c b/ext/pcre/pcrelib/pcre_study.c
index 431ad9d0dd..9d893d5e20 100644
--- a/ext/pcre/pcrelib/pcre_study.c
+++ b/ext/pcre/pcrelib/pcre_study.c
@@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
- Copyright (c) 1997-2010 University of Cambridge
+ Copyright (c) 1997-2012 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -50,7 +50,7 @@ supporting functions. */
/* Returns from set_start_bits() */
-enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
+enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
@@ -64,25 +64,30 @@ string of that length that matches. In UTF8 mode, the result is in characters
rather than bytes.
Arguments:
- code pointer to start of group (the bracket)
- startcode pointer to start of the whole pattern
- options the compiling options
+ code pointer to start of group (the bracket)
+ startcode pointer to start of the whole pattern
+ options the compiling options
+ int RECURSE depth
Returns: the minimum length
- -1 if \C was encountered
+ -1 if \C in UTF-8 mode or (*ACCEPT) was encountered
-2 internal error (missing capturing bracket)
+ -3 internal error (opcode not listed)
*/
static int
-find_minlength(const uschar *code, const uschar *startcode, int options)
+find_minlength(const pcre_uchar *code, const pcre_uchar *startcode, int options,
+ int recurse_depth)
{
int length = -1;
-BOOL utf8 = (options & PCRE_UTF8) != 0;
+/* PCRE_UTF16 has the same value as PCRE_UTF8. */
+BOOL utf = (options & PCRE_UTF8) != 0;
BOOL had_recurse = FALSE;
register int branchlength = 0;
-register uschar *cc = (uschar *)code + 1 + LINK_SIZE;
+register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE;
-if (*code == OP_CBRA || *code == OP_SCBRA) cc += 2;
+if (*code == OP_CBRA || *code == OP_SCBRA ||
+ *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE;
/* Scan along the opcodes for this branch. If we get to the end of the
branch, check the length against that of the other branches. */
@@ -90,7 +95,7 @@ branch, check the length against that of the other branches. */
for (;;)
{
int d, min;
- uschar *cs, *ce;
+ pcre_uchar *cs, *ce;
register int op = *cc;
switch (op)
@@ -116,26 +121,40 @@ for (;;)
case OP_SCBRA:
case OP_BRA:
case OP_SBRA:
+ case OP_CBRAPOS:
+ case OP_SCBRAPOS:
+ case OP_BRAPOS:
+ case OP_SBRAPOS:
case OP_ONCE:
- d = find_minlength(cc, startcode, options);
+ case OP_ONCE_NC:
+ d = find_minlength(cc, startcode, options, recurse_depth);
if (d < 0) return d;
branchlength += d;
do cc += GET(cc, 1); while (*cc == OP_ALT);
cc += 1 + LINK_SIZE;
break;
+ /* ACCEPT makes things far too complicated; we have to give up. */
+
+ case OP_ACCEPT:
+ case OP_ASSERT_ACCEPT:
+ return -1;
+
/* Reached end of a branch; if it's a ket it is the end of a nested
- call. If it's ALT it is an alternation in a nested call. If it is
- END it's the end of the outer call. All can be handled by the same code. */
+ call. If it's ALT it is an alternation in a nested call. If it is END it's
+ the end of the outer call. All can be handled by the same code. If an
+ ACCEPT was previously encountered, use the length that was in force at that
+ time, and pass back the shortest ACCEPT length. */
case OP_ALT:
case OP_KET:
case OP_KETRMAX:
case OP_KETRMIN:
+ case OP_KETRPOS:
case OP_END:
if (length < 0 || (!had_recurse && branchlength < length))
length = branchlength;
- if (*cc != OP_ALT) return length;
+ if (op != OP_ALT) return length;
cc += 1 + LINK_SIZE;
branchlength = 0;
had_recurse = FALSE;
@@ -158,25 +177,27 @@ for (;;)
case OP_RREF:
case OP_NRREF:
case OP_DEF:
- case OP_OPT:
case OP_CALLOUT:
case OP_SOD:
case OP_SOM:
case OP_EOD:
case OP_EODN:
case OP_CIRC:
+ case OP_CIRCM:
case OP_DOLL:
+ case OP_DOLLM:
case OP_NOT_WORD_BOUNDARY:
case OP_WORD_BOUNDARY:
- cc += _pcre_OP_lengths[*cc];
+ cc += PRIV(OP_lengths)[*cc];
break;
/* Skip over a subpattern that has a {0} or {0,x} quantifier */
case OP_BRAZERO:
case OP_BRAMINZERO:
+ case OP_BRAPOSZERO:
case OP_SKIPZERO:
- cc += _pcre_OP_lengths[*cc];
+ cc += PRIV(OP_lengths)[*cc];
do cc += GET(cc, 1); while (*cc == OP_ALT);
cc += 1 + LINK_SIZE;
break;
@@ -184,18 +205,25 @@ for (;;)
/* Handle literal characters and + repetitions */
case OP_CHAR:
- case OP_CHARNC:
+ case OP_CHARI:
case OP_NOT:
+ case OP_NOTI:
case OP_PLUS:
+ case OP_PLUSI:
case OP_MINPLUS:
+ case OP_MINPLUSI:
case OP_POSPLUS:
+ case OP_POSPLUSI:
case OP_NOTPLUS:
+ case OP_NOTPLUSI:
case OP_NOTMINPLUS:
+ case OP_NOTMINPLUSI:
case OP_NOTPOSPLUS:
+ case OP_NOTPOSPLUSI:
branchlength++;
cc += 2;
-#ifdef SUPPORT_UTF8
- if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
+#ifdef SUPPORT_UTF
+ if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
#endif
break;
@@ -210,17 +238,20 @@ for (;;)
need to skip over a multibyte character in UTF8 mode. */
case OP_EXACT:
+ case OP_EXACTI:
case OP_NOTEXACT:
+ case OP_NOTEXACTI:
branchlength += GET2(cc,1);
- cc += 4;
-#ifdef SUPPORT_UTF8
- if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
+ cc += 2 + IMM2_SIZE;
+#ifdef SUPPORT_UTF
+ if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
#endif
break;
case OP_TYPEEXACT:
branchlength += GET2(cc,1);
- cc += (cc[3] == OP_PROP || cc[3] == OP_NOTPROP)? 6 : 4;
+ cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
+ || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
break;
/* Handle single-char non-literal matchers */
@@ -247,18 +278,21 @@ for (;;)
cc++;
break;
- /* "Any newline" might match two characters */
+ /* "Any newline" might match two characters, but it also might match just
+ one. */
case OP_ANYNL:
- branchlength += 2;
+ branchlength += 1;
cc++;
break;
- /* The single-byte matcher means we can't proceed in UTF-8 mode */
+ /* The single-byte matcher means we can't proceed in UTF-8 mode. (In
+ non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever
+ appear, but leave the code, just in case.) */
case OP_ANYBYTE:
-#ifdef SUPPORT_UTF8
- if (utf8) return -1;
+#ifdef SUPPORT_UTF
+ if (utf) return -1;
#endif
branchlength++;
cc++;
@@ -274,27 +308,28 @@ for (;;)
case OP_TYPEPOSSTAR:
case OP_TYPEPOSQUERY:
if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
- cc += _pcre_OP_lengths[op];
+ cc += PRIV(OP_lengths)[op];
break;
case OP_TYPEUPTO:
case OP_TYPEMINUPTO:
case OP_TYPEPOSUPTO:
- if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
- cc += _pcre_OP_lengths[op];
+ if (cc[1 + IMM2_SIZE] == OP_PROP
+ || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
+ cc += PRIV(OP_lengths)[op];
break;
/* Check a class for variable quantification */
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
case OP_XCLASS:
- cc += GET(cc, 1) - 33;
+ cc += GET(cc, 1) - PRIV(OP_lengths)[OP_CLASS];
/* Fall through */
#endif
case OP_CLASS:
case OP_NCLASS:
- cc += 33;
+ cc += PRIV(OP_lengths)[OP_CLASS];
switch (*cc)
{
@@ -313,7 +348,7 @@ for (;;)
case OP_CRRANGE:
case OP_CRMINRANGE:
branchlength += GET2(cc,1);
- cc += 5;
+ cc += 1 + 2 * IMM2_SIZE;
break;
default:
@@ -335,9 +370,10 @@ for (;;)
that case we must set the minimum length to zero. */
case OP_REF:
+ case OP_REFI:
if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
{
- ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));
+ ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
if (cs == NULL) return -2;
do ce += GET(ce, 1); while (*ce == OP_ALT);
if (cc > cs && cc < ce)
@@ -345,10 +381,13 @@ for (;;)
d = 0;
had_recurse = TRUE;
}
- else d = find_minlength(cs, startcode, options);
+ else
+ {
+ d = find_minlength(cs, startcode, options, recurse_depth);
+ }
}
else d = 0;
- cc += 3;
+ cc += 1 + IMM2_SIZE;
/* Handle repeated back references */
@@ -362,10 +401,16 @@ for (;;)
cc++;
break;
+ case OP_CRPLUS:
+ case OP_CRMINPLUS:
+ min = 1;
+ cc++;
+ break;
+
case OP_CRRANGE:
case OP_CRMINRANGE:
min = GET2(cc, 1);
- cc += 5;
+ cc += 1 + 2 * IMM2_SIZE;
break;
default:
@@ -376,39 +421,71 @@ for (;;)
branchlength += min * d;
break;
+ /* We can easily detect direct recursion, but not mutual recursion. This is
+ caught by a recursion depth count. */
+
case OP_RECURSE:
- cs = ce = (uschar *)startcode + GET(cc, 1);
- if (cs == NULL) return -2;
+ cs = ce = (pcre_uchar *)startcode + GET(cc, 1);
do ce += GET(ce, 1); while (*ce == OP_ALT);
- if (cc > cs && cc < ce)
+ if ((cc > cs && cc < ce) || recurse_depth > 10)
had_recurse = TRUE;
else
- branchlength += find_minlength(cs, startcode, options);
+ {
+ branchlength += find_minlength(cs, startcode, options, recurse_depth + 1);
+ }
cc += 1 + LINK_SIZE;
break;
/* Anything else does not or need not match a character. We can get the
item's length from the table, but for those that can match zero occurrences
- of a character, we must take special action for UTF-8 characters. */
+ of a character, we must take special action for UTF-8 characters. As it
+ happens, the "NOT" versions of these opcodes are used at present only for
+ ASCII characters, so they could be omitted from this list. However, in
+ future that may change, so we include them here so as not to leave a
+ gotcha for a future maintainer. */
case OP_UPTO:
+ case OP_UPTOI:
case OP_NOTUPTO:
+ case OP_NOTUPTOI:
case OP_MINUPTO:
+ case OP_MINUPTOI:
case OP_NOTMINUPTO:
+ case OP_NOTMINUPTOI:
case OP_POSUPTO:
+ case OP_POSUPTOI:
+ case OP_NOTPOSUPTO:
+ case OP_NOTPOSUPTOI:
+
case OP_STAR:
+ case OP_STARI:
+ case OP_NOTSTAR:
+ case OP_NOTSTARI:
case OP_MINSTAR:
+ case OP_MINSTARI:
case OP_NOTMINSTAR:
+ case OP_NOTMINSTARI:
case OP_POSSTAR:
+ case OP_POSSTARI:
case OP_NOTPOSSTAR:
+ case OP_NOTPOSSTARI:
+
case OP_QUERY:
+ case OP_QUERYI:
+ case OP_NOTQUERY:
+ case OP_NOTQUERYI:
case OP_MINQUERY:
+ case OP_MINQUERYI:
case OP_NOTMINQUERY:
+ case OP_NOTMINQUERYI:
case OP_POSQUERY:
+ case OP_POSQUERYI:
case OP_NOTPOSQUERY:
- cc += _pcre_OP_lengths[op];
-#ifdef SUPPORT_UTF8
- if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
+ case OP_NOTPOSQUERYI:
+
+ cc += PRIV(OP_lengths)[op];
+#ifdef SUPPORT_UTF
+ if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
#endif
break;
@@ -417,20 +494,27 @@ for (;;)
case OP_MARK:
case OP_PRUNE_ARG:
case OP_SKIP_ARG:
- cc += _pcre_OP_lengths[op] + cc[1];
+ case OP_THEN_ARG:
+ cc += PRIV(OP_lengths)[op] + cc[1];
break;
- case OP_THEN_ARG:
- cc += _pcre_OP_lengths[op] + cc[1+LINK_SIZE];
+ /* The remaining opcodes are just skipped over. */
+
+ case OP_CLOSE:
+ case OP_COMMIT:
+ case OP_FAIL:
+ case OP_PRUNE:
+ case OP_SET_SOM:
+ case OP_SKIP:
+ case OP_THEN:
+ cc += PRIV(OP_lengths)[op];
break;
- /* For the record, these are the opcodes that are matched by "default":
- OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,
- OP_THEN. */
+ /* This should not occur: we list all opcodes explicitly so that when
+ new ones get added they are properly considered. */
default:
- cc += _pcre_OP_lengths[op];
- break;
+ return -3;
}
}
/* Control never gets here */
@@ -452,29 +536,30 @@ Arguments:
p points to the character
caseless the caseless flag
cd the block with char table pointers
- utf8 TRUE for UTF-8 mode
+ utf TRUE for UTF-8 / UTF-16 mode
Returns: pointer after the character
*/
-static const uschar *
-set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless,
- compile_data *cd, BOOL utf8)
+static const pcre_uchar *
+set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless,
+ compile_data *cd, BOOL utf)
{
unsigned int c = *p;
+#ifdef COMPILE_PCRE8
SET_BIT(c);
-#ifdef SUPPORT_UTF8
-if (utf8 && c > 127)
+#ifdef SUPPORT_UTF
+if (utf && c > 127)
{
GETCHARINC(c, p);
#ifdef SUPPORT_UCP
if (caseless)
{
- uschar buff[8];
+ pcre_uchar buff[6];
c = UCD_OTHERCASE(c);
- (void)_pcre_ord2utf8(c, buff);
+ (void)PRIV(ord2utf)(c, buff);
SET_BIT(buff[0]);
}
#endif
@@ -486,6 +571,36 @@ if (utf8 && c > 127)
if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
return p + 1;
+#endif
+
+#ifdef COMPILE_PCRE16
+if (c > 0xff)
+ {
+ c = 0xff;
+ caseless = FALSE;
+ }
+SET_BIT(c);
+
+#ifdef SUPPORT_UTF
+if (utf && c > 127)
+ {
+ GETCHARINC(c, p);
+#ifdef SUPPORT_UCP
+ if (caseless)
+ {
+ c = UCD_OTHERCASE(c);
+ if (c > 0xff)
+ c = 0xff;
+ SET_BIT(c);
+ }
+#endif
+ return p;
+ }
+#endif
+
+if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
+return p + 1;
+#endif
}
@@ -511,21 +626,23 @@ Returns: nothing
*/
static void
-set_type_bits(uschar *start_bits, int cbit_type, int table_limit,
+set_type_bits(pcre_uint8 *start_bits, int cbit_type, int table_limit,
compile_data *cd)
{
register int c;
for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
if (table_limit == 32) return;
for (c = 128; c < 256; c++)
{
if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
{
- uschar buff[8];
- (void)_pcre_ord2utf8(c, buff);
+ pcre_uchar buff[6];
+ (void)PRIV(ord2utf)(c, buff);
SET_BIT(buff[0]);
}
}
+#endif
}
@@ -551,12 +668,14 @@ Returns: nothing
*/
static void
-set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,
+set_nottype_bits(pcre_uint8 *start_bits, int cbit_type, int table_limit,
compile_data *cd)
{
register int c;
for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
+#endif
}
@@ -576,22 +695,26 @@ function fails unless the result is SSB_DONE.
Arguments:
code points to an expression
start_bits points to a 32-byte table, initialized to 0
- caseless the current state of the caseless flag
- utf8 TRUE if in UTF-8 mode
+ utf TRUE if in UTF-8 / UTF-16 mode
cd the block with char table pointers
Returns: SSB_FAIL => Failed to find any starting bytes
SSB_DONE => Found mandatory starting bytes
SSB_CONTINUE => Found optional starting bytes
+ SSB_UNKNOWN => Hit an unrecognized opcode
*/
static int
-set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
- BOOL utf8, compile_data *cd)
+set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf,
+ compile_data *cd)
{
register int c;
int yield = SSB_DONE;
-int table_limit = utf8? 16:32;
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
+int table_limit = utf? 16:32;
+#else
+int table_limit = 32;
+#endif
#if 0
/* ========================================================================= */
@@ -612,19 +735,108 @@ volatile int dummy;
do
{
- const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;
BOOL try_next = TRUE;
+ const pcre_uchar *tcode = code + 1 + LINK_SIZE;
+
+ if (*code == OP_CBRA || *code == OP_SCBRA ||
+ *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
while (try_next) /* Loop for items in this branch */
{
int rc;
+
switch(*tcode)
{
- /* Fail if we reach something we don't understand */
+ /* If we reach something we don't understand, it means a new opcode has
+ been created that hasn't been added to this code. Hopefully this problem
+ will be discovered during testing. */
default:
+ return SSB_UNKNOWN;
+
+ /* Fail for a valid opcode that implies no starting bits. */
+
+ case OP_ACCEPT:
+ case OP_ASSERT_ACCEPT:
+ case OP_ALLANY:
+ case OP_ANY:
+ case OP_ANYBYTE:
+ case OP_CIRC:
+ case OP_CIRCM:
+ case OP_CLOSE:
+ case OP_COMMIT:
+ case OP_COND:
+ case OP_CREF:
+ case OP_DEF:
+ case OP_DOLL:
+ case OP_DOLLM:
+ case OP_END:
+ case OP_EOD:
+ case OP_EODN:
+ case OP_EXTUNI:
+ case OP_FAIL:
+ case OP_MARK:
+ case OP_NCREF:
+ case OP_NOT:
+ case OP_NOTEXACT:
+ case OP_NOTEXACTI:
+ case OP_NOTI:
+ case OP_NOTMINPLUS:
+ case OP_NOTMINPLUSI:
+ case OP_NOTMINQUERY:
+ case OP_NOTMINQUERYI:
+ case OP_NOTMINSTAR:
+ case OP_NOTMINSTARI:
+ case OP_NOTMINUPTO:
+ case OP_NOTMINUPTOI:
+ case OP_NOTPLUS:
+ case OP_NOTPLUSI:
+ case OP_NOTPOSPLUS:
+ case OP_NOTPOSPLUSI:
+ case OP_NOTPOSQUERY:
+ case OP_NOTPOSQUERYI:
+ case OP_NOTPOSSTAR:
+ case OP_NOTPOSSTARI:
+ case OP_NOTPOSUPTO:
+ case OP_NOTPOSUPTOI:
+ case OP_NOTPROP:
+ case OP_NOTQUERY:
+ case OP_NOTQUERYI:
+ case OP_NOTSTAR:
+ case OP_NOTSTARI:
+ case OP_NOTUPTO:
+ case OP_NOTUPTOI:
+ case OP_NOT_HSPACE:
+ case OP_NOT_VSPACE:
+ case OP_NRREF:
+ case OP_PROP:
+ case OP_PRUNE:
+ case OP_PRUNE_ARG:
+ case OP_RECURSE:
+ case OP_REF:
+ case OP_REFI:
+ case OP_REVERSE:
+ case OP_RREF:
+ case OP_SCOND:
+ case OP_SET_SOM:
+ case OP_SKIP:
+ case OP_SKIP_ARG:
+ case OP_SOD:
+ case OP_SOM:
+ case OP_THEN:
+ case OP_THEN_ARG:
+#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
+ case OP_XCLASS:
+#endif
return SSB_FAIL;
+ /* We can ignore word boundary tests. */
+
+ case OP_WORD_BOUNDARY:
+ case OP_NOT_WORD_BOUNDARY:
+ tcode++;
+ break;
+
/* If we hit a bracket or a positive lookahead assertion, recurse to set
bits from within the subpattern. If it can't find anything, we have to
give up. If it finds some mandatory character(s), we are done for this
@@ -634,10 +846,15 @@ do
case OP_SBRA:
case OP_CBRA:
case OP_SCBRA:
+ case OP_BRAPOS:
+ case OP_SBRAPOS:
+ case OP_CBRAPOS:
+ case OP_SCBRAPOS:
case OP_ONCE:
+ case OP_ONCE_NC:
case OP_ASSERT:
- rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);
- if (rc == SSB_FAIL) return SSB_FAIL;
+ rc = set_start_bits(tcode, start_bits, utf, cd);
+ if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
if (rc == SSB_DONE) try_next = FALSE; else
{
do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
@@ -660,6 +877,7 @@ do
case OP_KET:
case OP_KETRMAX:
case OP_KETRMIN:
+ case OP_KETRPOS:
return SSB_CONTINUE;
/* Skip over callout */
@@ -677,19 +895,13 @@ do
tcode += 1 + LINK_SIZE;
break;
- /* Skip over an option setting, changing the caseless flag */
-
- case OP_OPT:
- caseless = (tcode[1] & PCRE_CASELESS) != 0;
- tcode += 2;
- break;
-
/* BRAZERO does the bracket, but carries on. */
case OP_BRAZERO:
case OP_BRAMINZERO:
- if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)
- return SSB_FAIL;
+ case OP_BRAPOSZERO:
+ rc = set_start_bits(++tcode, start_bits, utf, cd);
+ if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
/* =========================================================================
See the comment at the head of this function concerning the next line,
which was an old fudge for the benefit of OS/2.
@@ -715,7 +927,16 @@ do
case OP_QUERY:
case OP_MINQUERY:
case OP_POSQUERY:
- tcode = set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
+ tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
+ break;
+
+ case OP_STARI:
+ case OP_MINSTARI:
+ case OP_POSSTARI:
+ case OP_QUERYI:
+ case OP_MINQUERYI:
+ case OP_POSQUERYI:
+ tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
break;
/* Single-char upto sets the bit and tries the next */
@@ -723,20 +944,36 @@ do
case OP_UPTO:
case OP_MINUPTO:
case OP_POSUPTO:
- tcode = set_table_bit(start_bits, tcode + 3, caseless, cd, utf8);
+ tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf);
break;
- /* At least one single char sets the bit and stops */
+ case OP_UPTOI:
+ case OP_MINUPTOI:
+ case OP_POSUPTOI:
+ tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf);
+ break;
- case OP_EXACT: /* Fall through */
- tcode += 2;
+ /* At least one single char sets the bit and stops */
+ case OP_EXACT:
+ tcode += IMM2_SIZE;
+ /* Fall through */
case OP_CHAR:
- case OP_CHARNC:
case OP_PLUS:
case OP_MINPLUS:
case OP_POSPLUS:
- (void)set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
+ (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
+ try_next = FALSE;
+ break;
+
+ case OP_EXACTI:
+ tcode += IMM2_SIZE;
+ /* Fall through */
+ case OP_CHARI:
+ case OP_PLUSI:
+ case OP_MINPLUSI:
+ case OP_POSPLUSI:
+ (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
try_next = FALSE;
break;
@@ -749,14 +986,28 @@ do
case OP_HSPACE:
SET_BIT(0x09);
SET_BIT(0x20);
- if (utf8)
+#ifdef SUPPORT_UTF
+ if (utf)
{
+#ifdef COMPILE_PCRE8
SET_BIT(0xC2); /* For U+00A0 */
SET_BIT(0xE1); /* For U+1680, U+180E */
SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
SET_BIT(0xE3); /* For U+3000 */
+#endif
+#ifdef COMPILE_PCRE16
+ SET_BIT(0xA0);
+ SET_BIT(0xFF); /* For characters > 255 */
+#endif
+ }
+ else
+#endif /* SUPPORT_UTF */
+ {
+ SET_BIT(0xA0);
+#ifdef COMPILE_PCRE16
+ SET_BIT(0xFF); /* For characters > 255 */
+#endif
}
- else SET_BIT(0xA0);
try_next = FALSE;
break;
@@ -766,12 +1017,26 @@ do
SET_BIT(0x0B);
SET_BIT(0x0C);
SET_BIT(0x0D);
- if (utf8)
+#ifdef SUPPORT_UTF
+ if (utf)
{
+#ifdef COMPILE_PCRE8
SET_BIT(0xC2); /* For U+0085 */
SET_BIT(0xE2); /* For U+2028, U+2029 */
+#endif
+#ifdef COMPILE_PCRE16
+ SET_BIT(0x85);
+ SET_BIT(0xFF); /* For characters > 255 */
+#endif
+ }
+ else
+#endif /* SUPPORT_UTF */
+ {
+ SET_BIT(0x85);
+#ifdef COMPILE_PCRE16
+ SET_BIT(0xFF); /* For characters > 255 */
+#endif
}
- else SET_BIT(0x85);
try_next = FALSE;
break;
@@ -829,7 +1094,7 @@ do
break;
case OP_TYPEEXACT:
- tcode += 3;
+ tcode += 1 + IMM2_SIZE;
break;
/* Zero or more repeats of character types set the bits and then
@@ -838,7 +1103,7 @@ do
case OP_TYPEUPTO:
case OP_TYPEMINUPTO:
case OP_TYPEPOSUPTO:
- tcode += 2; /* Fall through */
+ tcode += IMM2_SIZE; /* Fall through */
case OP_TYPESTAR:
case OP_TYPEMINSTAR:
@@ -856,14 +1121,23 @@ do
case OP_HSPACE:
SET_BIT(0x09);
SET_BIT(0x20);
- if (utf8)
+#ifdef SUPPORT_UTF
+ if (utf)
{
+#ifdef COMPILE_PCRE8
SET_BIT(0xC2); /* For U+00A0 */
SET_BIT(0xE1); /* For U+1680, U+180E */
SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
SET_BIT(0xE3); /* For U+3000 */
+#endif
+#ifdef COMPILE_PCRE16
+ SET_BIT(0xA0);
+ SET_BIT(0xFF); /* For characters > 255 */
+#endif
}
- else SET_BIT(0xA0);
+ else
+#endif /* SUPPORT_UTF */
+ SET_BIT(0xA0);
break;
case OP_ANYNL:
@@ -872,12 +1146,21 @@ do
SET_BIT(0x0B);
SET_BIT(0x0C);
SET_BIT(0x0D);
- if (utf8)
+#ifdef SUPPORT_UTF
+ if (utf)
{
+#ifdef COMPILE_PCRE8
SET_BIT(0xC2); /* For U+0085 */
SET_BIT(0xE2); /* For U+2028, U+2029 */
+#endif
+#ifdef COMPILE_PCRE16
+ SET_BIT(0x85);
+ SET_BIT(0xFF); /* For characters > 255 */
+#endif
}
- else SET_BIT(0x85);
+ else
+#endif /* SUPPORT_UTF */
+ SET_BIT(0x85);
break;
case OP_NOT_DIGIT:
@@ -924,18 +1207,23 @@ do
character with a value > 255. */
case OP_NCLASS:
-#ifdef SUPPORT_UTF8
- if (utf8)
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
+ if (utf)
{
start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
}
#endif
+#ifdef COMPILE_PCRE16
+ SET_BIT(0xFF); /* For characters > 255 */
+#endif
/* Fall through */
case OP_CLASS:
{
+ pcre_uint8 *map;
tcode++;
+ map = (pcre_uint8 *)tcode;
/* In UTF-8 mode, the bits in a bit map correspond to character
values, not to byte values. However, the bit map we are constructing is
@@ -943,13 +1231,13 @@ do
value is > 127. In fact, there are only two possible starting bytes for
characters in the range 128 - 255. */
-#ifdef SUPPORT_UTF8
- if (utf8)
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
+ if (utf)
{
- for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
+ for (c = 0; c < 16; c++) start_bits[c] |= map[c];
for (c = 128; c < 256; c++)
{
- if ((tcode[c/8] && (1 << (c&7))) != 0)
+ if ((map[c/8] && (1 << (c&7))) != 0)
{
int d = (c >> 6) | 0xc0; /* Set bit for this starter */
start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
@@ -957,18 +1245,17 @@ do
}
}
}
-
- /* In non-UTF-8 mode, the two bit maps are completely compatible. */
-
else
#endif
{
- for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
+ /* In non-UTF-8 mode, the two bit maps are completely compatible. */
+ for (c = 0; c < 32; c++) start_bits[c] |= map[c];
}
- /* Advance past the bit map, and act on what follows */
+ /* Advance past the bit map, and act on what follows. For a zero
+ minimum repeat, continue; otherwise stop processing. */
- tcode += 32;
+ tcode += 32 / sizeof(pcre_uchar);
switch (*tcode)
{
case OP_CRSTAR:
@@ -980,7 +1267,7 @@ do
case OP_CRRANGE:
case OP_CRMINRANGE:
- if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
+ if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
else try_next = FALSE;
break;
@@ -1002,12 +1289,14 @@ return yield;
+
+
/*************************************************
* Study a compiled expression *
*************************************************/
/* This function is handed a compiled expression that it must study to produce
-information that will speed up the matching. It returns a pcre_extra block
+information that will speed up the matching. It returns a pcre[16]_extra block
which then gets handed back to pcre_exec().
Arguments:
@@ -1016,23 +1305,28 @@ Arguments:
errorptr points to where to place error messages;
set NULL unless error
-Returns: pointer to a pcre_extra block, with study_data filled in and the
- appropriate flags set;
+Returns: pointer to a pcre[16]_extra block, with study_data filled in and
+ the appropriate flags set;
NULL on error or if no optimization possible
*/
+#ifdef COMPILE_PCRE8
PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
pcre_study(const pcre *external_re, int options, const char **errorptr)
+#else
+PCRE_EXP_DEFN pcre16_extra * PCRE_CALL_CONVENTION
+pcre16_study(const pcre16 *external_re, int options, const char **errorptr)
+#endif
{
int min;
BOOL bits_set = FALSE;
-uschar start_bits[32];
-pcre_extra *extra;
+pcre_uint8 start_bits[32];
+PUBL(extra) *extra = NULL;
pcre_study_data *study;
-const uschar *tables;
-uschar *code;
+const pcre_uint8 *tables;
+pcre_uchar *code;
compile_data compile_block;
-const real_pcre *re = (const real_pcre *)external_re;
+const REAL_PCRE *re = (const REAL_PCRE *)external_re;
*errorptr = NULL;
@@ -1042,13 +1336,23 @@ if (re == NULL || re->magic_number != MAGIC_NUMBER)
return NULL;
}
+if ((re->flags & PCRE_MODE) == 0)
+ {
+#ifdef COMPILE_PCRE8
+ *errorptr = "argument is compiled in 16 bit mode";
+#else
+ *errorptr = "argument is compiled in 8 bit mode";
+#endif
+ return NULL;
+ }
+
if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
{
*errorptr = "unknown or incorrect option bit(s) set";
return NULL;
}
-code = (uschar *)re + re->name_table_offset +
+code = (pcre_uchar *)re + re->name_table_offset +
(re->name_count * re->name_entry_size);
/* For an anchored pattern, or an unanchored pattern that has a first char, or
@@ -1058,12 +1362,21 @@ seeking a list of starting bytes. */
if ((re->options & PCRE_ANCHORED) == 0 &&
(re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
{
+ int rc;
+
/* Set the character tables in the block that is passed around */
tables = re->tables;
+
+#ifdef COMPILE_PCRE8
if (tables == NULL)
(void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
(void *)(&tables));
+#else
+ if (tables == NULL)
+ (void)pcre16_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
+ (void *)(&tables));
+#endif
compile_block.lcc = tables + lcc_offset;
compile_block.fcc = tables + fcc_offset;
@@ -1072,56 +1385,148 @@ if ((re->options & PCRE_ANCHORED) == 0 &&
/* See if we can find a fixed set of initial characters for the pattern. */
- memset(start_bits, 0, 32 * sizeof(uschar));
- bits_set = set_start_bits(code, start_bits,
- (re->options & PCRE_CASELESS) != 0, (re->options & PCRE_UTF8) != 0,
- &compile_block) == SSB_DONE;
+ memset(start_bits, 0, 32 * sizeof(pcre_uint8));
+ rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
+ &compile_block);
+ bits_set = rc == SSB_DONE;
+ if (rc == SSB_UNKNOWN)
+ {
+ *errorptr = "internal error: opcode not recognized";
+ return NULL;
+ }
}
/* Find the minimum length of subject string. */
-min = find_minlength(code, code, re->options);
+switch(min = find_minlength(code, code, re->options, 0))
+ {
+ case -2: *errorptr = "internal error: missing capturing bracket"; return NULL;
+ case -3: *errorptr = "internal error: opcode not recognized"; return NULL;
+ default: break;
+ }
-/* Return NULL if no optimization is possible. */
+/* If a set of starting bytes has been identified, or if the minimum length is
+greater than zero, or if JIT optimization has been requested, get a
+pcre[16]_extra block and a pcre_study_data block. The study data is put in the
+latter, which is pointed to by the former, which may also get additional data
+set later by the calling program. At the moment, the size of pcre_study_data
+is fixed. We nevertheless save it in a field for returning via the
+pcre_fullinfo() function so that if it becomes variable in the future,
+we don't have to change that code. */
+
+if (bits_set || min > 0
+#ifdef SUPPORT_JIT
+ || (options & (PCRE_STUDY_JIT_COMPILE | PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE
+ | PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE)) != 0
+#endif
+ )
+ {
+ extra = (PUBL(extra) *)(PUBL(malloc))
+ (sizeof(PUBL(extra)) + sizeof(pcre_study_data));
+ if (extra == NULL)
+ {
+ *errorptr = "failed to get memory";
+ return NULL;
+ }
-if (!bits_set && min < 0) return NULL;
+ study = (pcre_study_data *)((char *)extra + sizeof(PUBL(extra)));
+ extra->flags = PCRE_EXTRA_STUDY_DATA;
+ extra->study_data = study;
-/* Get a pcre_extra block and a pcre_study_data block. The study data is put in
-the latter, which is pointed to by the former, which may also get additional
-data set later by the calling program. At the moment, the size of
-pcre_study_data is fixed. We nevertheless save it in a field for returning via
-the pcre_fullinfo() function so that if it becomes variable in the future, we
-don't have to change that code. */
+ study->size = sizeof(pcre_study_data);
+ study->flags = 0;
-extra = (pcre_extra *)(pcre_malloc)
- (sizeof(pcre_extra) + sizeof(pcre_study_data));
+ /* Set the start bits always, to avoid unset memory errors if the
+ study data is written to a file, but set the flag only if any of the bits
+ are set, to save time looking when none are. */
-if (extra == NULL)
- {
- *errorptr = "failed to get memory";
- return NULL;
- }
+ if (bits_set)
+ {
+ study->flags |= PCRE_STUDY_MAPPED;
+ memcpy(study->start_bits, start_bits, sizeof(start_bits));
+ }
+ else memset(study->start_bits, 0, 32 * sizeof(pcre_uint8));
-study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
-extra->flags = PCRE_EXTRA_STUDY_DATA;
-extra->study_data = study;
+#ifdef PCRE_DEBUG
+ if (bits_set)
+ {
+ pcre_uint8 *ptr = start_bits;
+ int i;
-study->size = sizeof(pcre_study_data);
-study->flags = 0;
+ printf("Start bits:\n");
+ for (i = 0; i < 32; i++)
+ printf("%3d: %02x%s", i * 8, *ptr++, ((i + 1) & 0x7) != 0? " " : "\n");
+ }
+#endif
-if (bits_set)
- {
- study->flags |= PCRE_STUDY_MAPPED;
- memcpy(study->start_bits, start_bits, sizeof(start_bits));
- }
+ /* Always set the minlength value in the block, because the JIT compiler
+ makes use of it. However, don't set the bit unless the length is greater than
+ zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time
+ checking the zero case. */
-if (min >= 0)
- {
- study->flags |= PCRE_STUDY_MINLEN;
- study->minlength = min;
+ if (min > 0)
+ {
+ study->flags |= PCRE_STUDY_MINLEN;
+ study->minlength = min;
+ }
+ else study->minlength = 0;
+
+ /* If JIT support was compiled and requested, attempt the JIT compilation.
+ If no starting bytes were found, and the minimum length is zero, and JIT
+ compilation fails, abandon the extra block and return NULL. */
+
+#ifdef SUPPORT_JIT
+ extra->executable_jit = NULL;
+ if ((options & PCRE_STUDY_JIT_COMPILE) != 0)
+ PRIV(jit_compile)(re, extra, JIT_COMPILE);
+ if ((options & PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE) != 0)
+ PRIV(jit_compile)(re, extra, JIT_PARTIAL_SOFT_COMPILE);
+ if ((options & PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE) != 0)
+ PRIV(jit_compile)(re, extra, JIT_PARTIAL_HARD_COMPILE);
+
+ if (study->flags == 0 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == 0)
+ {
+#ifdef COMPILE_PCRE8
+ pcre_free_study(extra);
+#endif
+#ifdef COMPILE_PCRE16
+ pcre16_free_study(extra);
+#endif
+ extra = NULL;
+ }
+#endif
}
return extra;
}
+
+/*************************************************
+* Free the study data *
+*************************************************/
+
+/* This function frees the memory that was obtained by pcre_study().
+
+Argument: a pointer to the pcre[16]_extra block
+Returns: nothing
+*/
+
+#ifdef COMPILE_PCRE8
+PCRE_EXP_DEFN void
+pcre_free_study(pcre_extra *extra)
+#else
+PCRE_EXP_DEFN void
+pcre16_free_study(pcre16_extra *extra)
+#endif
+{
+if (extra == NULL)
+ return;
+#ifdef SUPPORT_JIT
+if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
+ extra->executable_jit != NULL)
+ PRIV(jit_free)(extra->executable_jit);
+#endif
+PUBL(free)(extra);
+}
+
/* End of pcre_study.c */