diff options
Diffstat (limited to 'ext/pcre/pcrelib/pcre_exec.c')
-rw-r--r-- | ext/pcre/pcrelib/pcre_exec.c | 233 |
1 files changed, 142 insertions, 91 deletions
diff --git a/ext/pcre/pcrelib/pcre_exec.c b/ext/pcre/pcrelib/pcre_exec.c index 91b5047731..073cf2410a 100644 --- a/ext/pcre/pcrelib/pcre_exec.c +++ b/ext/pcre/pcrelib/pcre_exec.c @@ -320,9 +320,9 @@ typedef struct heapframe { /* Function arguments that may change */ - const uschar *Xeptr; + USPTR Xeptr; const uschar *Xecode; - const uschar *Xmstart; + USPTR Xmstart; int Xoffset_top; long int Xims; eptrblock *Xeptrb; @@ -331,13 +331,15 @@ typedef struct heapframe { /* Function local variables */ - const uschar *Xcallpat; - const uschar *Xcharptr; - const uschar *Xdata; - const uschar *Xnext; - const uschar *Xpp; - const uschar *Xprev; - const uschar *Xsaved_eptr; + USPTR Xcallpat; +#ifdef SUPPORT_UTF8 + USPTR Xcharptr; +#endif + USPTR Xdata; + USPTR Xnext; + USPTR Xpp; + USPTR Xprev; + USPTR Xsaved_eptr; recursion_info Xnew_recursive; @@ -358,6 +360,7 @@ typedef struct heapframe { uschar Xocchars[8]; #endif + int Xcodelink; int Xctype; unsigned int Xfc; int Xfi; @@ -423,7 +426,7 @@ Returns: MATCH_MATCH if matched ) these values are >= 0 */ static int -match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart, +match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart, int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb, int flags, unsigned int rdepth) { @@ -437,6 +440,7 @@ register unsigned int c; /* Character values not kept over RMATCH() calls */ register BOOL utf8; /* Local copy of UTF-8 flag for speed */ BOOL minimize, possessive; /* Quantifier options */ +int condcode; /* When recursion is not being used, all "local" variables that have to be preserved over calls to RMATCH() are part of a "frame" which is obtained from @@ -479,6 +483,7 @@ HEAP_RECURSE: #define charptr frame->Xcharptr #endif #define callpat frame->Xcallpat +#define codelink frame->Xcodelink #define data frame->Xdata #define next frame->Xnext #define pp frame->Xpp @@ -559,6 +564,7 @@ int oclength; uschar occhars[8]; #endif +int codelink; int 
ctype; int length; int max; @@ -785,7 +791,39 @@ for (;;) case OP_COND: case OP_SCOND: - if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */ + codelink= GET(ecode, 1); + + /* Because of the way auto-callout works during compile, a callout item is + inserted between OP_COND and an assertion condition. */ + + if (ecode[LINK_SIZE+1] == OP_CALLOUT) + { + if (pcre_callout != NULL) + { + pcre_callout_block cb; + cb.version = 1; /* Version 1 of the callout block */ + cb.callout_number = ecode[LINK_SIZE+2]; + cb.offset_vector = md->offset_vector; + cb.subject = (PCRE_SPTR)md->start_subject; + cb.subject_length = md->end_subject - md->start_subject; + cb.start_match = mstart - md->start_subject; + cb.current_position = eptr - md->start_subject; + cb.pattern_position = GET(ecode, LINK_SIZE + 3); + cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE); + cb.capture_top = offset_top/2; + cb.capture_last = md->capture_last; + cb.callout_data = md->callout_data; + if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH); + if (rrc < 0) RRETURN(rrc); + } + ecode += _pcre_OP_lengths[OP_CALLOUT]; + } + + condcode = ecode[LINK_SIZE+1]; + + /* Now see what the actual condition is */ + + if (condcode == OP_RREF) /* Recursion test */ { offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/ condition = md->recursive != NULL && @@ -793,14 +831,14 @@ for (;;) ecode += condition? 3 : GET(ecode, 1); } - else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */ + else if (condcode == OP_CREF) /* Group used test */ { offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */ condition = offset < offset_top && md->offset_vector[offset] >= 0; ecode += condition? 
3 : GET(ecode, 1); } - else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */ + else if (condcode == OP_DEF) /* DEFINE - always false */ { condition = FALSE; ecode += GET(ecode, 1); @@ -827,7 +865,7 @@ for (;;) else { condition = FALSE; - ecode += GET(ecode, 1); + ecode += codelink; } } @@ -850,7 +888,7 @@ for (;;) goto TAIL_RECURSE; } } - else /* Condition false & no 2nd alternative */ + else /* Condition false & no alternative */ { ecode += 1 + LINK_SIZE; } @@ -1073,6 +1111,8 @@ for (;;) else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) { DPRINTF(("Recursion gave error %d\n", rrc)); + if (new_recursive.offset_save != stacksave) + (pcre_free)(new_recursive.offset_save); RRETURN(rrc); } @@ -1419,7 +1459,7 @@ for (;;) { if (eptr == md->start_subject) prev_is_word = FALSE; else { - const uschar *lastptr = eptr - 1; + USPTR lastptr = eptr - 1; while((*lastptr & 0xc0) == 0x80) lastptr--; GETCHAR(c, lastptr); prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0; @@ -1677,7 +1717,7 @@ for (;;) if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); { - const ucd_record * prop = GET_UCD(c); + const ucd_record *prop = GET_UCD(c); switch(ecode[1]) { @@ -2045,7 +2085,8 @@ for (;;) /* Match an extended character class. This opcode is encountered only - in UTF-8 mode, because that's the only time it is compiled. */ + when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8 + mode, because Unicode properties are supported in non-UTF-8 mode. 
*/ #ifdef SUPPORT_UTF8 case OP_XCLASS: @@ -2087,7 +2128,7 @@ for (;;) for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - GETCHARINC(c, eptr); + GETCHARINCTEST(c, eptr); if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH); } @@ -2106,7 +2147,7 @@ for (;;) RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - GETCHARINC(c, eptr); + GETCHARINCTEST(c, eptr); if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH); } /* Control never gets here */ @@ -2121,7 +2162,7 @@ for (;;) { int len = 1; if (eptr >= md->end_subject) break; - GETCHARLEN(c, eptr, len); + GETCHARLENTEST(c, eptr, len); if (!_pcre_xclass(c, data)) break; eptr += len; } @@ -4531,10 +4572,10 @@ switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) & PCRE_NEWLINE_BITS) { case 0: newline = NEWLINE; break; /* Compile-time default */ - case PCRE_NEWLINE_CR: newline = '\r'; break; - case PCRE_NEWLINE_LF: newline = '\n'; break; + case PCRE_NEWLINE_CR: newline = CHAR_CR; break; + case PCRE_NEWLINE_LF: newline = CHAR_NL; break; case PCRE_NEWLINE_CR+ - PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break; + PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break; case PCRE_NEWLINE_ANY: newline = -1; break; case PCRE_NEWLINE_ANYCRLF: newline = -2; break; default: return PCRE_ERROR_BADNEWLINE; @@ -4576,11 +4617,11 @@ back the character offset. */ #ifdef SUPPORT_UTF8 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0) { - if (_pcre_valid_utf8((uschar *)subject, length) >= 0) + if (_pcre_valid_utf8((USPTR)subject, length) >= 0) return PCRE_ERROR_BADUTF8; if (start_offset > 0 && start_offset < length) { - int tb = ((uschar *)subject)[start_offset]; + int tb = ((USPTR)subject)[start_offset]; if (tb > 127) { tb &= 0xc0; @@ -4686,11 +4727,11 @@ for(;;) while (iptr < iend) *iptr++ = -1; } - /* Advance to a unique first char if possible. 
If firstline is TRUE, the - start of the match is constrained to the first line of a multiline string. - That is, the match must be before or at the first newline. Implement this by - temporarily adjusting end_subject so that we stop scanning at a newline. If - the match fails at the newline, later code breaks this loop. */ + /* If firstline is TRUE, the start of the match is constrained to the first + line of a multiline string. That is, the match must be before or at the first + newline. Implement this by temporarily adjusting end_subject so that we stop + scanning at a newline. If the match fails at the newline, later code breaks + this loop. */ if (firstline) { @@ -4710,62 +4751,70 @@ for(;;) end_subject = t; } - /* Now advance to a unique first byte if there is one. */ + /* There are some optimizations that avoid running the match if a known + starting point is not found, or if a known later character is not present. + However, there is an option that disables these, for testing and for ensuring + that all callouts do actually occur. */ - if (first_byte >= 0) + if ((options & PCRE_NO_START_OPTIMIZE) == 0) { - if (first_byte_caseless) - while (start_match < end_subject && md->lcc[*start_match] != first_byte) - start_match++; - else - while (start_match < end_subject && *start_match != first_byte) - start_match++; - } + /* Advance to a unique first byte if there is one. 
*/ - /* Or to just after a linebreak for a multiline match */ + if (first_byte >= 0) + { + if (first_byte_caseless) + while (start_match < end_subject && md->lcc[*start_match] != first_byte) + start_match++; + else + while (start_match < end_subject && *start_match != first_byte) + start_match++; + } - else if (startline) - { - if (start_match > md->start_subject + start_offset) + /* Or to just after a linebreak for a multiline match */ + + else if (startline) { -#ifdef SUPPORT_UTF8 - if (utf8) + if (start_match > md->start_subject + start_offset) { - while (start_match < end_subject && !WAS_NEWLINE(start_match)) +#ifdef SUPPORT_UTF8 + if (utf8) { - start_match++; - while(start_match < end_subject && (*start_match & 0xc0) == 0x80) + while (start_match < end_subject && !WAS_NEWLINE(start_match)) + { start_match++; + while(start_match < end_subject && (*start_match & 0xc0) == 0x80) + start_match++; + } } - } - else + else #endif - while (start_match < end_subject && !WAS_NEWLINE(start_match)) - start_match++; - - /* If we have just passed a CR and the newline option is ANY or ANYCRLF, - and we are now at a LF, advance the match position by one more character. - */ - - if (start_match[-1] == '\r' && - (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) && - start_match < end_subject && - *start_match == '\n') - start_match++; + while (start_match < end_subject && !WAS_NEWLINE(start_match)) + start_match++; + + /* If we have just passed a CR and the newline option is ANY or ANYCRLF, + and we are now at a LF, advance the match position by one more character. 
+ */ + + if (start_match[-1] == CHAR_CR && + (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) && + start_match < end_subject && + *start_match == CHAR_NL) + start_match++; + } } - } - /* Or to a non-unique first byte after study */ + /* Or to a non-unique first byte after study */ - else if (start_bits != NULL) - { - while (start_match < end_subject) + else if (start_bits != NULL) { - register unsigned int c = *start_match; - if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; - else break; + while (start_match < end_subject) + { + register unsigned int c = *start_match; + if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; + else break; + } } - } + } /* Starting optimizations */ /* Restore fudged end_subject */ @@ -4777,23 +4826,25 @@ for(;;) printf("\n"); #endif - /* If req_byte is set, we know that that character must appear in the subject - for the match to succeed. If the first character is set, req_byte must be - later in the subject; otherwise the test starts at the match point. This - optimization can save a huge amount of backtracking in patterns with nested - unlimited repeats that aren't going to match. Writing separate code for - cased/caseless versions makes it go faster, as does using an autoincrement - and backing off on a match. - - HOWEVER: when the subject string is very, very long, searching to its end can - take a long time, and give bad performance on quite ordinary patterns. This - showed up when somebody was matching something like /^\d+C/ on a 32-megabyte - string... so we don't do this when the string is sufficiently long. - - ALSO: this processing is disabled when partial matching is requested. - */ - - if (req_byte >= 0 && + /* If req_byte is set, we know that that character must appear in the + subject for the match to succeed. If the first character is set, req_byte + must be later in the subject; otherwise the test starts at the match point. 
+ This optimization can save a huge amount of backtracking in patterns with + nested unlimited repeats that aren't going to match. Writing separate code + for cased/caseless versions makes it go faster, as does using an + autoincrement and backing off on a match. + + HOWEVER: when the subject string is very, very long, searching to its end + can take a long time, and give bad performance on quite ordinary patterns. + This showed up when somebody was matching something like /^\d+C/ on a + 32-megabyte string... so we don't do this when the string is sufficiently + long. + + ALSO: this processing is disabled when partial matching is requested, or if + disabling is explicitly requested. */ + + if ((options & PCRE_NO_START_OPTIMIZE) == 0 && + req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX && !md->partial) { @@ -4901,9 +4952,9 @@ for(;;) not contain any explicit matches for \r or \n, and the newline option is CRLF or ANY or ANYCRLF, advance the match position by one more character. */ - if (start_match[-1] == '\r' && + if (start_match[-1] == CHAR_CR && start_match < end_subject && - *start_match == '\n' && + *start_match == CHAR_NL && (re->flags & PCRE_HASCRORLF) == 0 && (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF || |