summaryrefslogtreecommitdiff
path: root/pcre.c
diff options
context:
space:
mode:
Diffstat (limited to 'pcre.c')
-rw-r--r--pcre.c299
1 files changed, 215 insertions, 84 deletions
diff --git a/pcre.c b/pcre.c
index 58adcef..1bca847 100644
--- a/pcre.c
+++ b/pcre.c
@@ -111,7 +111,7 @@ static const short int escapes[] = {
static BOOL
compile_regex(int, int, int *, uschar **, const uschar **, const char **,
- BOOL, int, compile_data *);
+ BOOL, int, int *, int *, compile_data *);
@@ -162,7 +162,10 @@ return PCRE_VERSION;
*************************************************/
/* This function picks potentially useful data out of the private
-structure.
+structure. The public options are passed back in an int - though the
+re->options field has been expanded to a long int, all the public options
+at the low end of it, and so even on 16-bit systems this will still be OK.
+Therefore, I haven't changed the API for pcre_info().
Arguments:
external_re points to compiled code
@@ -181,7 +184,7 @@ pcre_info(const pcre *external_re, int *optptr, int *first_char)
const real_pcre *re = (const real_pcre *)external_re;
if (re == NULL) return PCRE_ERROR_NULL;
if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
-if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
+if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
if (first_char != NULL)
*first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
@@ -532,6 +535,7 @@ for (;;)
case OP_REVERSE:
cc++;
+ /* Fall through */
case OP_CREF:
case OP_OPT:
@@ -627,6 +631,8 @@ Arguments:
ptrptr points to the current pattern pointer
errorptr points to pointer to error message
optchanged set to the value of the last OP_OPT item compiled
+ reqchar set to the last literal character required, else -1
+ countlits set to count of mandatory literal characters
cd contains pointers to tables
Returns: TRUE on success
@@ -636,12 +642,15 @@ Returns: TRUE on success
static BOOL
compile_branch(int options, int *brackets, uschar **codeptr,
const uschar **ptrptr, const char **errorptr, int *optchanged,
- compile_data *cd)
+ int *reqchar, int *countlits, compile_data *cd)
{
int repeat_type, op_type;
int repeat_min, repeat_max;
int bravalue, length;
int greedy_default, greedy_non_default;
+int prevreqchar;
+int condcount = 0;
+int subcountlits = 0;
register int c;
register uschar *code = *codeptr;
uschar *tempcode;
@@ -655,6 +664,11 @@ uschar class[32];
greedy_default = ((options & PCRE_UNGREEDY) != 0);
greedy_non_default = greedy_default ^ 1;
+/* Initialize no required char, and count of literals */
+
+*reqchar = prevreqchar = -1;
+*countlits = 0;
+
/* Switch on next character until the end of the branch */
for (;; ptr++)
@@ -664,6 +678,7 @@ for (;; ptr++)
int class_lastchar;
int newoptions;
int condref;
+ int subreqchar;
c = *ptr;
if ((options & PCRE_EXTENDED) != 0)
@@ -937,18 +952,19 @@ for (;; ptr++)
{ repeat_type = greedy_non_default; ptr++; }
else repeat_type = greedy_default;
- /* If the maximum is zero then the minimum must also be zero; Perl allows
- this case, so we do too - by simply omitting the item altogether. */
-
- if (repeat_max == 0) code = previous;
-
/* If previous was a string of characters, chop off the last one and use it
as the subject of the repeat. If there was only one character, we can
- abolish the previous item altogether. */
+ abolish the previous item altogether. A repeat with a zero minimum wipes
+ out any reqchar setting, backing up to the previous value. We must also
+ adjust the countlits value. */
- else if (*previous == OP_CHARS)
+ if (*previous == OP_CHARS)
{
int len = previous[1];
+
+ if (repeat_min == 0) *reqchar = prevreqchar;
+ *countlits += repeat_min - 1;
+
if (len == 1)
{
c = previous[2];
@@ -987,7 +1003,15 @@ for (;; ptr++)
code = previous;
OUTPUT_SINGLE_REPEAT:
- repeat_type += op_type; /* Combine both values for many cases */
+
+ /* If the maximum is zero then the minimum must also be zero; Perl allows
+ this case, so we do too - by simply omitting the item altogether. */
+
+ if (repeat_max == 0) goto END_REPEAT;
+
+ /* Combine the op_type with the repeat_type */
+
+ repeat_type += op_type;
/* A minimum of zero is handled either as the special case * or ?, or as
an UPTO, with the maximum given. */
@@ -1064,10 +1088,15 @@ for (;; ptr++)
}
/* If previous was a character class or a back reference, we put the repeat
- stuff after it. */
+ stuff after it, but just skip the item if the repeat was {0,0}. */
else if (*previous == OP_CLASS || *previous == OP_REF)
{
+ if (repeat_max == 0)
+ {
+ code = previous;
+ goto END_REPEAT;
+ }
if (repeat_min == 0 && repeat_max == -1)
*code++ = OP_CRSTAR + repeat_type;
else if (repeat_min == 1 && repeat_max == -1)
@@ -1118,14 +1147,22 @@ for (;; ptr++)
if (repeat_min == 0)
{
+ /* If we set up a required char from the bracket, we must back off
+ to the previous value and reset the countlits value too. */
+
+ if (subcountlits > 0)
+ {
+ *reqchar = prevreqchar;
+ *countlits -= subcountlits;
+ }
+
/* If the maximum is also zero, we just omit the group from the output
altogether. */
if (repeat_max == 0)
{
code = previous;
- previous = NULL;
- break;
+ goto END_REPEAT;
}
/* If the maximum is 1 or unlimited, we just have to stick in the
@@ -1230,59 +1267,6 @@ for (;; ptr++)
correct offset was computed above. */
else code[-ketoffset] = OP_KETRMAX + repeat_type;
-
-
-#ifdef NEVER
- /* If the minimum is greater than zero, and the maximum is unlimited or
- equal to the minimum, the first copy remains where it is, and is
- replicated up to the minimum number of times. This case includes the +
- repeat, but of course no replication is needed in that case. */
-
- if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
- {
- for (i = 1; i < repeat_min; i++)
- {
- memcpy(code, previous, len);
- code += len;
- }
- }
-
- /* If the minimum is zero, stick BRAZERO in front of the first copy.
- Then, if there is a fixed upper limit, replicated up to that many times,
- sticking BRAZERO in front of all the optional ones. */
-
- else
- {
- if (repeat_min == 0)
- {
- memmove(previous+1, previous, len);
- code++;
- *previous++ = OP_BRAZERO + repeat_type;
- }
-
- for (i = 1; i < repeat_min; i++)
- {
- memcpy(code, previous, len);
- code += len;
- }
-
- for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
- {
- *code++ = OP_BRAZERO + repeat_type;
- memcpy(code, previous, len);
- code += len;
- }
- }
-
- /* If the maximum is unlimited, set a repeater in the final copy. We
- can't just offset backwards from the current code point, because we
- don't know if there's been an options resetting after the ket. The
- correct offset was computed above. */
-
- if (repeat_max == -1) code[-ketoffset] = OP_KETRMAX + repeat_type;
-#endif
-
-
}
/* Else there's some kind of shambles */
@@ -1295,6 +1279,7 @@ for (;; ptr++)
/* In all case we no longer have a previous item. */
+ END_REPEAT:
previous = NULL;
break;
@@ -1463,6 +1448,8 @@ for (;; ptr++)
(bravalue == OP_ASSERTBACK ||
bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
condref, /* Condition reference number */
+ &subreqchar, /* For possible last char */
+ &subcountlits, /* For literal count */
cd)) /* Tables block */
goto FAILED;
@@ -1476,22 +1463,39 @@ for (;; ptr++)
if (bravalue == OP_COND)
{
- int branchcount = 0;
uschar *tc = code;
+ condcount = 0;
do {
- branchcount++;
+ condcount++;
tc += (tc[1] << 8) | tc[2];
}
while (*tc != OP_KET);
- if (branchcount > 2)
+ if (condcount > 2)
{
*errorptr = ERR27;
goto FAILED;
}
}
+ /* Handle updating of the required character. If the subpattern didn't
+ set one, leave it as it was. Otherwise, update it for normal brackets of
+ all kinds, forward assertions, and conditions with two branches. Don't
+ update the literal count for forward assertions, however. If the bracket
+ is followed by a quantifier with zero repeat, we have to back off. Hence
+ the definition of prevreqchar and subcountlits outside the main loop so
+ that they can be accessed for the back off. */
+
+ if (subreqchar > 0 &&
+ (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_ASSERT ||
+ (bravalue == OP_COND && condcount == 2)))
+ {
+ prevreqchar = *reqchar;
+ *reqchar = subreqchar;
+ if (bravalue != OP_ASSERT) *countlits += subcountlits;
+ }
+
/* Now update the main code pointer to the end of the group. */
code = tempcode;
@@ -1586,6 +1590,12 @@ for (;; ptr++)
while (length < 255 && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
+ /* Update the last character and the count of literals */
+
+ prevreqchar = (length > 1)? code[-2] : *reqchar;
+ *reqchar = code[-1];
+ *countlits += length;
+
/* Compute the length and set it in the data vector, and advance to
the next state. */
@@ -1629,6 +1639,8 @@ Argument:
errorptr -> pointer to error message
lookbehind TRUE if this is a lookbehind assertion
condref > 0 for OPT_CREF setting at start of conditional group
+ reqchar -> place to put the last required character, or a negative number
+ countlits -> place to put the shortest literal count of any branch
cd points to the data block with tables pointers
Returns: TRUE on success
@@ -1637,7 +1649,7 @@ Returns: TRUE on success
static BOOL
compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,
- compile_data *cd)
+ int *reqchar, int *countlits, compile_data *cd)
{
const uschar *ptr = *ptrptr;
uschar *code = *codeptr;
@@ -1645,7 +1657,10 @@ uschar *last_branch = code;
uschar *start_bracket = code;
uschar *reverse_count = NULL;
int oldoptions = options & PCRE_IMS;
+int branchreqchar, branchcountlits;
+*reqchar = -1;
+*countlits = INT_MAX;
code += 3;
/* At the start of a reference-based conditional group, insert the reference
@@ -1684,7 +1699,8 @@ for (;;)
/* Now compile the branch */
- if (!compile_branch(options,brackets,&code,&ptr,errorptr,&optchanged,cd))
+ if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged,
+ &branchreqchar, &branchcountlits, cd))
{
*ptrptr = ptr;
return FALSE;
@@ -1696,6 +1712,25 @@ for (;;)
last_branch[1] = length >> 8;
last_branch[2] = length & 255;
+ /* Save the last required character if all branches have the same; a current
+ value of -1 means unset, while -2 means "previous branch had no last required
+ char". */
+
+ if (*reqchar != -2)
+ {
+ if (branchreqchar >= 0)
+ {
+ if (*reqchar == -1) *reqchar = branchreqchar;
+ else if (*reqchar != branchreqchar) *reqchar = -2;
+ }
+ else *reqchar = -2;
+ }
+
+ /* Keep the shortest literal count */
+
+ if (branchcountlits < *countlits) *countlits = branchcountlits;
+ DPRINTF(("literal count = %d min=%d\n", branchcountlits, *countlits));
+
/* If lookbehind, check that this branch matches a fixed-length string,
and put the length into the OP_REVERSE item. Temporarily mark the end of
the branch with OP_END. */
@@ -1977,7 +2012,7 @@ pcre_compile(const char *pattern, int options, const char **errorptr,
real_pcre *re;
int length = 3; /* For initial BRA plus length */
int runlength;
-int c, size;
+int c, size, reqchar, countlits;
int bracount = 0;
int top_backref = 0;
int branch_extra = 0;
@@ -2317,13 +2352,16 @@ while ((c = *(++ptr)) != 0)
will lead to an over-estimate on the length, but this shouldn't
matter very much. We also have to allow for resetting options at
the start of any alternations, which we do by setting
- branch_newextra to 2. */
+ branch_newextra to 2. Finally, we record whether the case-dependent
+ flag ever changes within the regex. This is used by the "required
+ character" code. */
case ':':
if (((set|unset) & PCRE_IMS) != 0)
{
length += 4;
branch_newextra = 2;
+ if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
}
goto END_OPTIONS;
@@ -2524,7 +2562,7 @@ code = re->code;
*code = OP_BRA;
bracount = 0;
(void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,
- &compile_block);
+ &reqchar, &countlits, &compile_block);
re->top_bracket = bracount;
re->top_backref = top_backref;
@@ -2584,6 +2622,15 @@ if ((options & PCRE_ANCHORED) == 0)
}
}
+/* Save the last required character if there are at least two literal
+characters on all paths, or if there is no first character setting. */
+
+if (reqchar >= 0 && (countlits > 1 || (re->options & PCRE_FIRSTSET) == 0))
+ {
+ re->req_char = reqchar;
+ re->options |= PCRE_REQCHSET;
+ }
+
/* Print out the compiled data for debugging */
#ifdef DEBUG
@@ -2593,9 +2640,10 @@ printf("Length = %d top_bracket = %d top_backref = %d\n",
if (re->options != 0)
{
- printf("%s%s%s%s%s%s%s%s\n",
+ printf("%s%s%s%s%s%s%s%s%s\n",
((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
+ ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
@@ -2610,6 +2658,12 @@ if ((re->options & PCRE_FIRSTSET) != 0)
else printf("First char = \\x%02x\n", re->first_char);
}
+if ((re->options & PCRE_REQCHSET) != 0)
+ {
+ if (isprint(re->req_char)) printf("Req char = %c\n", re->req_char);
+ else printf("Req char = \\x%02x\n", re->req_char);
+ }
+
code_end = code;
code_base = code = re->code;
@@ -2843,7 +2897,7 @@ Returns: TRUE if matched
static BOOL
match_ref(int offset, register const uschar *eptr, int length, match_data *md,
- int ims)
+ unsigned long int ims)
{
const uschar *p = md->start_subject + md->offset_vector[offset];
@@ -2902,9 +2956,10 @@ Returns: TRUE if matched
static BOOL
match(register const uschar *eptr, register const uschar *ecode,
- int offset_top, match_data *md, int ims, BOOL condassert, const uschar *eptrb)
+ int offset_top, match_data *md, unsigned long int ims, BOOL condassert,
+ const uschar *eptrb)
{
-int original_ims = ims; /* Save for resetting on ')' */
+unsigned long int original_ims = ims; /* Save for resetting on ')' */
for (;;)
{
@@ -3019,9 +3074,11 @@ for (;;)
ecode += 2;
break;
- /* End of the pattern */
+ /* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched
+ an empty string - recursion will then try other alternatives, if any. */
case OP_END:
+ if (md->notempty && eptr == md->start_match) return FALSE;
md->end_match_ptr = eptr; /* Record where we ended */
md->end_offset_top = offset_top; /* and how many extracts were taken */
return TRUE;
@@ -4136,11 +4193,14 @@ pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
{
int resetcount, ocount;
int first_char = -1;
-int ims = 0;
+int req_char = -1;
+int req_char2 = -1;
+unsigned long int ims = 0;
match_data match_block;
const uschar *start_bits = NULL;
const uschar *start_match = (const uschar *)subject + start_offset;
const uschar *end_subject;
+const uschar *req_char_ptr = start_match - 1;
const real_pcre *re = (const real_pcre *)external_re;
const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
BOOL using_temporary_offsets = FALSE;
@@ -4161,6 +4221,7 @@ match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
match_block.notbol = (options & PCRE_NOTBOL) != 0;
match_block.noteol = (options & PCRE_NOTEOL) != 0;
+match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
@@ -4231,7 +4292,23 @@ if (!anchored)
start_bits = extra->start_bits;
}
-/* Loop for unanchored matches; for anchored regexs the loop runs just once. */
+/* For anchored or unanchored matches, there may be a "last known required
+character" set. If the PCRE_CASELESS is set, implying that the match starts
+caselessly, or if there are any changes of this flag within the regex, set up
+both cases of the character. Otherwise set the two values the same, which will
+avoid duplicate testing (which takes significant time). This covers the vast
+majority of cases. It will be suboptimal when the case flag changes in a regex
+and the required character in fact is caseful. */
+
+if ((re->options & PCRE_REQCHSET) != 0)
+ {
+ req_char = re->req_char;
+ req_char2 = ((re->options & (PCRE_CASELESS | PCRE_ICHANGED)) != 0)?
+ (re->tables + fcc_offset)[req_char] : req_char;
+ }
+
+/* Loop for handling unanchored repeated matching attempts; for anchored regexs
+the loop runs just once. */
do
{
@@ -4267,7 +4344,7 @@ do
}
}
- /* Or to a non-unique first char */
+ /* Or to a non-unique first char after study */
else if (start_bits != NULL)
{
@@ -4284,6 +4361,59 @@ do
printf("\n");
#endif
+ /* If req_char is set, we know that that character must appear in the subject
+ for the match to succeed. If the first character is set, req_char must be
+ later in the subject; otherwise the test starts at the match point. This
+ optimization can save a huge amount of backtracking in patterns with nested
+ unlimited repeats that aren't going to match. We don't know what the state of
+ case matching may be when this character is hit, so test for it in both its
+ cases if necessary. However, the different cased versions will not be set up
+ unless PCRE_CASELESS was given or the casing state changes within the regex.
+ Writing separate code makes it go faster, as does using an autoincrement and
+ backing off on a match. */
+
+ if (req_char >= 0)
+ {
+ register const uschar *p = start_match + ((first_char >= 0)? 1 : 0);
+
+ /* We don't need to repeat the search if we haven't yet reached the
+ place we found it at last time. */
+
+ if (p > req_char_ptr)
+ {
+ /* Do a single test if no case difference is set up */
+
+ if (req_char == req_char2)
+ {
+ while (p < end_subject)
+ {
+ if (*p++ == req_char) { p--; break; }
+ }
+ }
+
+ /* Otherwise test for either case */
+
+ else
+ {
+ while (p < end_subject)
+ {
+ register int pp = *p++;
+ if (pp == req_char || pp == req_char2) { p--; break; }
+ }
+ }
+
+ /* If we can't find the required character, break the matching loop */
+
+ if (p >= end_subject) break;
+
+ /* If we have found the required character, save the point where we
+ found it, so that we don't search again next time round the loop if
+ the start hasn't passed this character yet. */
+
+ req_char_ptr = p;
+ }
+ }
+
/* When a match occurs, substrings will be set for all internal extractions;
we just need to set up the whole thing as substring 0 before returning. If
there were too many extractions, set the return code to zero. In the case
@@ -4291,6 +4421,7 @@ do
those back references that we can. In this case there need not be overflow
if certain parts of the pattern were not used. */
+ match_block.start_match = start_match;
if (!match(start_match, re->code, 2, &match_block, ims, FALSE, start_match))
continue;