diff options
author | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2009-09-11 10:21:02 +0000 |
---|---|---|
committer | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2009-09-11 10:21:02 +0000 |
commit | ce9acaefd260bb9b3285c97b80f966f8be08983e (patch) | |
tree | b06cff980512a020f5455bc0debb57557a6cd044 | |
parent | 4f7977841d1bab49257fed652dbd05a90a503f84 (diff) | |
download | pcre-ce9acaefd260bb9b3285c97b80f966f8be08983e.tar.gz |
Added PCRE_NOTEMPTY_ATSTART to fix /g bug when \K is present.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@442 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- | ChangeLog | 5 | ||||
-rw-r--r-- | doc/pcre_dfa_exec.3 | 44 | ||||
-rw-r--r-- | doc/pcre_exec.3 | 40 | ||||
-rw-r--r-- | doc/pcreapi.3 | 38 | ||||
-rw-r--r-- | doc/pcrecompat.3 | 6 | ||||
-rw-r--r-- | doc/pcretest.1 | 15 | ||||
-rw-r--r-- | pcre.h.in | 1 | ||||
-rw-r--r-- | pcre_dfa_exec.c | 9 | ||||
-rw-r--r-- | pcre_exec.c | 16 | ||||
-rw-r--r-- | pcre_internal.h | 19 | ||||
-rw-r--r-- | pcredemo.c | 14 | ||||
-rw-r--r-- | pcretest.c | 13 | ||||
-rw-r--r-- | testdata/testinput2 | 46 | ||||
-rw-r--r-- | testdata/testinput7 | 3 | ||||
-rw-r--r-- | testdata/testoutput2 | 100 | ||||
-rw-r--r-- | testdata/testoutput7 | 7 |
16 files changed, 286 insertions, 90 deletions
@@ -113,6 +113,11 @@ Version 8.00 ??-???-?? over the character class, thus treating the ] as data rather than terminating the class. This meant it could skip too much.] +20. Added PCRE_NOTEMPTY_ATSTART in order to be able to correctly implement the + /g option in pcretest when the pattern contains \K, which makes it possible + to have an empty string match not at the start, even when the pattern is + anchored. Updated pcretest and pcredemo to use this option. + Version 7.9 11-Apr-09 --------------------- diff --git a/doc/pcre_dfa_exec.3 b/doc/pcre_dfa_exec.3 index ebcb273..4f4bb91 100644 --- a/doc/pcre_dfa_exec.3 +++ b/doc/pcre_dfa_exec.3 @@ -38,27 +38,29 @@ matching function is \fBpcre_exec()\fP. The arguments for this function are: .sp The options are: .sp - PCRE_ANCHORED Match only at the first position - PCRE_BSR_ANYCRLF \eR matches only CR, LF, or CRLF - PCRE_BSR_UNICODE \eR matches all Unicode line endings - PCRE_NEWLINE_ANY Recognize any Unicode newline sequence - PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline sequences - PCRE_NEWLINE_CR Set CR as the newline sequence - PCRE_NEWLINE_CRLF Set CRLF as the newline sequence - PCRE_NEWLINE_LF Set LF as the newline sequence - PCRE_NOTBOL Subject is not the beginning of a line - PCRE_NOTEOL Subject is not the end of a line - PCRE_NOTEMPTY An empty string is not a valid match - PCRE_NO_START_OPTIMIZE Do not do "start-match" optimizations - PCRE_NO_UTF8_CHECK Do not check the subject for UTF-8 - validity (only relevant if PCRE_UTF8 - was set at compile time) - PCRE_PARTIAL ) Return PCRE_ERROR_PARTIAL for a partial match - PCRE_PARTIAL_SOFT ) if no full matches are found - PCRE_PARTIAL_HARD Return PCRE_ERROR_PARTIAL for a partial match - even if there is a full match as well - PCRE_DFA_SHORTEST Return only the shortest match - PCRE_DFA_RESTART This is a restart after a partial match + PCRE_ANCHORED Match only at the first position + PCRE_BSR_ANYCRLF \eR matches only CR, LF, or CRLF + PCRE_BSR_UNICODE \eR 
matches all Unicode line endings + PCRE_NEWLINE_ANY Recognize any Unicode newline sequence + PCRE_NEWLINE_ANYCRLF Recognize CR, LF, & CRLF as newline sequences + PCRE_NEWLINE_CR Recognize CR as the only newline sequence + PCRE_NEWLINE_CRLF Recognize CRLF as the only newline sequence + PCRE_NEWLINE_LF Recognize LF as the only newline sequence + PCRE_NOTBOL Subject is not the beginning of a line + PCRE_NOTEOL Subject is not the end of a line + PCRE_NOTEMPTY An empty string is not a valid match + PCRE_NOTEMPTY_ATSTART An empty string at the start of the subject + is not a valid match + PCRE_NO_START_OPTIMIZE Do not do "start-match" optimizations + PCRE_NO_UTF8_CHECK Do not check the subject for UTF-8 + validity (only relevant if PCRE_UTF8 + was set at compile time) + PCRE_PARTIAL ) Return PCRE_ERROR_PARTIAL for a partial + PCRE_PARTIAL_SOFT ) match if no full matches are found + PCRE_PARTIAL_HARD Return PCRE_ERROR_PARTIAL for a partial match + even if there is a full match as well + PCRE_DFA_SHORTEST Return only the shortest match + PCRE_DFA_RESTART Restart after a partial match .sp There are restrictions on what may appear in a pattern when using this matching function. Details are given in the diff --git a/doc/pcre_exec.3 b/doc/pcre_exec.3 index 98f4c44..d5689eb 100644 --- a/doc/pcre_exec.3 +++ b/doc/pcre_exec.3 @@ -33,25 +33,27 @@ offsets to captured substrings. 
Its arguments are: .sp The options are: .sp - PCRE_ANCHORED Match only at the first position - PCRE_BSR_ANYCRLF \eR matches only CR, LF, or CRLF - PCRE_BSR_UNICODE \eR matches all Unicode line endings - PCRE_NEWLINE_ANY Recognize any Unicode newline sequence - PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline sequences - PCRE_NEWLINE_CR Set CR as the newline sequence - PCRE_NEWLINE_CRLF Set CRLF as the newline sequence - PCRE_NEWLINE_LF Set LF as the newline sequence - PCRE_NOTBOL Subject is not the beginning of a line - PCRE_NOTEOL Subject is not the end of a line - PCRE_NOTEMPTY An empty string is not a valid match - PCRE_NO_START_OPTIMIZE Do not do "start-match" optimizations - PCRE_NO_UTF8_CHECK Do not check the subject for UTF-8 - validity (only relevant if PCRE_UTF8 - was set at compile time) - PCRE_PARTIAL ) Return PCRE_ERROR_PARTIAL for a partial match - PCRE_PARTIAL_SOFT ) if no full matches are found - PCRE_PARTIAL_HARD Return PCRE_ERROR_PARTIAL for a partial match - even if there is a full match as well + PCRE_ANCHORED Match only at the first position + PCRE_BSR_ANYCRLF \eR matches only CR, LF, or CRLF + PCRE_BSR_UNICODE \eR matches all Unicode line endings + PCRE_NEWLINE_ANY Recognize any Unicode newline sequence + PCRE_NEWLINE_ANYCRLF Recognize CR, LF, & CRLF as newline sequences + PCRE_NEWLINE_CR Recognize CR as the only newline sequence + PCRE_NEWLINE_CRLF Recognize CRLF as the only newline sequence + PCRE_NEWLINE_LF Recognize LF as the only newline sequence + PCRE_NOTBOL Subject string is not the beginning of a line + PCRE_NOTEOL Subject string is not the end of a line + PCRE_NOTEMPTY An empty string is not a valid match + PCRE_NOTEMPTY_ATSTART An empty string at the start of the subject + is not a valid match + PCRE_NO_START_OPTIMIZE Do not do "start-match" optimizations + PCRE_NO_UTF8_CHECK Do not check the subject for UTF-8 + validity (only relevant if PCRE_UTF8 + was set at compile time) + PCRE_PARTIAL ) Return PCRE_ERROR_PARTIAL for a 
partial + PCRE_PARTIAL_SOFT ) match if no full matches are found + PCRE_PARTIAL_HARD Return PCRE_ERROR_PARTIAL for a partial match + even if there is a full match as well .sp For details of partial matching, see the .\" HREF diff --git a/doc/pcreapi.3 b/doc/pcreapi.3 index d1ef930..9175820 100644 --- a/doc/pcreapi.3 +++ b/doc/pcreapi.3 @@ -1246,8 +1246,9 @@ documentation for a discussion of saving compiled patterns for later use. .sp The unused bits of the \fIoptions\fP argument for \fBpcre_exec()\fP must be zero. The only bits that may be set are PCRE_ANCHORED, PCRE_NEWLINE_\fIxxx\fP, -PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, PCRE_NO_START_OPTIMIZE, -PCRE_NO_UTF8_CHECK, PCRE_PARTIAL_SOFT, and PCRE_PARTIAL_HARD. +PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, PCRE_NOTEMPTY_ATSTART, +PCRE_NO_START_OPTIMIZE, PCRE_NO_UTF8_CHECK, PCRE_PARTIAL_SOFT, and +PCRE_PARTIAL_HARD. .sp PCRE_ANCHORED .sp @@ -1322,17 +1323,24 @@ match the empty string, the entire match fails. For example, if the pattern .sp a?b? .sp -is applied to a string not beginning with "a" or "b", it matches the empty +is applied to a string not beginning with "a" or "b", it matches an empty string at the start of the subject. With PCRE_NOTEMPTY set, this match is not valid, so PCRE searches further into the string for occurrences of "a" or "b". +.sp + PCRE_NOTEMPTY_ATSTART +.sp +This is like PCRE_NOTEMPTY, except that an empty string match that is not at +the start of the subject is permitted. If the pattern is anchored, such a match +can occur only if the pattern contains \eK. .P -Perl has no direct equivalent of PCRE_NOTEMPTY, but it does make a special case -of a pattern match of the empty string within its \fBsplit()\fP function, and -when using the /g modifier. 
It is possible to emulate Perl's behaviour after -matching a null string by first trying the match again at the same offset with -PCRE_NOTEMPTY and PCRE_ANCHORED, and then if that fails by advancing the -starting offset (see below) and trying an ordinary match again. There is some -code that demonstrates how to do this in the +Perl has no direct equivalent of PCRE_NOTEMPTY or PCRE_NOTEMPTY_ATSTART, but it +does make a special case of a pattern match of the empty string within its +\fBsplit()\fP function, and when using the /g modifier. It is possible to +emulate Perl's behaviour after matching a null string by first trying the match +again at the same offset with PCRE_NOTEMPTY_ATSTART and PCRE_ANCHORED, and then +if that fails, by advancing the starting offset (see below) and trying an +ordinary match again. There is some code that demonstrates how to do this in +the .\" HREF \fBpcredemo\fP .\" @@ -1875,10 +1883,10 @@ Here is an example of a simple call to \fBpcre_dfa_exec()\fP: .sp The unused bits of the \fIoptions\fP argument for \fBpcre_dfa_exec()\fP must be zero. The only bits that may be set are PCRE_ANCHORED, PCRE_NEWLINE_\fIxxx\fP, -PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, PCRE_NO_UTF8_CHECK, PCRE_PARTIAL_HARD, -PCRE_PARTIAL_SOFT, PCRE_DFA_SHORTEST, and PCRE_DFA_RESTART. All but the last -four of these are exactly the same as for \fBpcre_exec()\fP, so their -description is not repeated here. +PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, PCRE_NOTEMPTY_ATSTART, +PCRE_NO_UTF8_CHECK, PCRE_PARTIAL_HARD, PCRE_PARTIAL_SOFT, PCRE_DFA_SHORTEST, +and PCRE_DFA_RESTART. All but the last four of these are exactly the same as +for \fBpcre_exec()\fP, so their description is not repeated here. .sp PCRE_PARTIAL_HARD PCRE_PARTIAL_SOFT @@ -2012,6 +2020,6 @@ Cambridge CB2 3QH, England. .rs .sp .nf -Last updated: 09 September 2009 +Last updated: 11 September 2009 Copyright (c) 1997-2009 University of Cambridge. 
.fi diff --git a/doc/pcrecompat.3 b/doc/pcrecompat.3 index 47e2fd8..c9e594b 100644 --- a/doc/pcrecompat.3 +++ b/doc/pcrecompat.3 @@ -109,8 +109,8 @@ question mark they are. (e) PCRE_ANCHORED can be used at matching time to force a pattern to be tried only at the first matching position in the subject string. .sp -(f) The PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, and PCRE_NO_AUTO_CAPTURE -options for \fBpcre_exec()\fP have no Perl equivalents. +(f) The PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, PCRE_NOTEMPTY_ATSTART, and +PCRE_NO_AUTO_CAPTURE options for \fBpcre_exec()\fP have no Perl equivalents. .sp (g) The \eR escape sequence can be restricted to match only CR, LF, or CRLF by the PCRE_BSR_ANYCRLF option. @@ -143,6 +143,6 @@ Cambridge CB2 3QH, England. .rs .sp .nf -Last updated: 25 August 2009 +Last updated: 11 September 2009 Copyright (c) 1997-2009 University of Cambridge. .fi diff --git a/doc/pcretest.1 b/doc/pcretest.1 index f51aefe..1bfafae 100644 --- a/doc/pcretest.1 +++ b/doc/pcretest.1 @@ -211,11 +211,11 @@ substring. This makes a difference to the matching process if the pattern begins with a lookbehind assertion (including \eb or \eB). .P If any call to \fBpcre_exec()\fP in a \fB/g\fP or \fB/G\fP sequence matches an -empty string, the next call is done with the PCRE_NOTEMPTY and PCRE_ANCHORED -flags set in order to search for another, non-empty, match at the same point. -If this second match fails, the start offset is advanced by one, and the normal -match is retried. This imitates the way Perl handles such cases when using the -\fB/g\fP modifier or the \fBsplit()\fP function. +empty string, the next call is done with the PCRE_NOTEMPTY_ATSTART and +PCRE_ANCHORED flags set in order to search for another, non-empty, match at the +same point. If this second match fails, the start offset is advanced by one +character, and the normal match is retried. This imitates the way Perl handles +such cases when using the \fB/g\fP modifier or the \fBsplit()\fP function. . 
. .SS "Other modifiers" @@ -356,7 +356,8 @@ recognized: MATCH_LIMIT_RECURSION settings .\" JOIN \eN pass the PCRE_NOTEMPTY option to \fBpcre_exec()\fP - or \fBpcre_dfa_exec()\fP + or \fBpcre_dfa_exec()\fP; if used twice, pass the + PCRE_NOTEMPTY_ATSTART option .\" JOIN \eOdd set the size of the output vector passed to \fBpcre_exec()\fP to dd (any number of digits) @@ -727,6 +728,6 @@ Cambridge CB2 3QH, England. .rs .sp .nf -Last updated: 05 September 2009 +Last updated: 11 September 2009 Copyright (c) 1997-2009 University of Cambridge. .fi @@ -130,6 +130,7 @@ both, so we keep them all distinct. */ #define PCRE_NO_START_OPTIMIZE 0x04000000 #define PCRE_NO_START_OPTIMISE 0x04000000 #define PCRE_PARTIAL_HARD 0x08000000 +#define PCRE_NOTEMPTY_ATSTART 0x10000000 /* Exec-time and get/set-time error codes */ diff --git a/pcre_dfa_exec.c b/pcre_dfa_exec.c index 208cf2d..ca32e51 100644 --- a/pcre_dfa_exec.c +++ b/pcre_dfa_exec.c @@ -647,7 +647,8 @@ for (;;) /* ========================================================================== */ /* Reached a closing bracket. If not at the end of the pattern, carry on with the next opcode. Otherwise, unless we have an empty string and - PCRE_NOTEMPTY is set, save the match data, shifting up all previous + PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the + start of the subject, save the match data, shifting up all previous matches so we always have the longest first. */ case OP_KET: @@ -664,7 +665,10 @@ for (;;) else { reached_end++; /* Count branches that reach the end */ - if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0) + if (ptr > current_subject || + ((md->moptions & PCRE_NOTEMPTY) == 0 && + ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 || + current_subject > start_subject + md->start_offset))) { if (match_count < 0) match_count = (offsetcount >= 2)? 
1 : 0; else if (match_count > 0 && ++match_count * 2 >= offsetcount) @@ -2681,6 +2685,7 @@ md->start_code = (const uschar *)argument_re + re->name_table_offset + re->name_count * re->name_entry_size; md->start_subject = (const unsigned char *)subject; md->end_subject = end_subject; +md->start_offset = start_offset; md->moptions = options; md->poptions = re->options; diff --git a/pcre_exec.c b/pcre_exec.c index 553f09e..c72577e 100644 --- a/pcre_exec.c +++ b/pcre_exec.c @@ -930,10 +930,19 @@ for (;;) break; } - /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty - string - backtracking will then try other alternatives, if any. */ + /* Otherwise, if we have matched an empty string, fail if PCRE_NOTEMPTY is + set, or if PCRE_NOTEMPTY_ATSTART is set and we have matched at the start of + the subject. In both cases, backtracking will then try other alternatives, + if any. */ + + if (eptr == mstart && + (md->notempty || + (md->notempty_atstart && + mstart == md->start_subject + md->start_offset))) + RRETURN(MATCH_NOMATCH); + + /* Otherwise, we have a match. */ - if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH); md->end_match_ptr = eptr; /* Record where we ended */ md->end_offset_top = offset_top; /* and how many extracts were taken */ md->start_match_ptr = mstart; /* and the start (\K can modify) */ @@ -4920,6 +4929,7 @@ md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0; md->notbol = (options & PCRE_NOTBOL) != 0; md->noteol = (options & PCRE_NOTEOL) != 0; md->notempty = (options & PCRE_NOTEMPTY) != 0; +md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0; md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 : ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0; md->hitend = FALSE; diff --git a/pcre_internal.h b/pcre_internal.h index c48f248..ba3c018 100644 --- a/pcre_internal.h +++ b/pcre_internal.h @@ -564,14 +564,15 @@ time, run time, or study time, respectively. 
*/ PCRE_JAVASCRIPT_COMPAT) #define PUBLIC_EXEC_OPTIONS \ - (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \ - PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF| \ - PCRE_BSR_UNICODE|PCRE_NO_START_OPTIMIZE) + (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NOTEMPTY_ATSTART| \ + PCRE_NO_UTF8_CHECK|PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT|PCRE_NEWLINE_BITS| \ + PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE|PCRE_NO_START_OPTIMIZE) #define PUBLIC_DFA_EXEC_OPTIONS \ - (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \ - PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT|PCRE_DFA_SHORTEST|PCRE_DFA_RESTART| \ - PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE|PCRE_NO_START_OPTIMIZE) + (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NOTEMPTY_ATSTART| \ + PCRE_NO_UTF8_CHECK|PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT|PCRE_DFA_SHORTEST| \ + PCRE_DFA_RESTART|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE| \ + PCRE_NO_START_OPTIMIZE) #define PUBLIC_STUDY_OPTIONS 0 /* None defined */ @@ -1601,6 +1602,7 @@ typedef struct match_data { BOOL jscript_compat; /* JAVASCRIPT_COMPAT flag */ BOOL endonly; /* Dollar not before final \n */ BOOL notempty; /* Empty string match not wanted */ + BOOL notempty_atstart; /* Empty string match at start not wanted */ BOOL hitend; /* Hit the end of the subject at some point */ BOOL bsr_anycrlf; /* \R is just any CRLF, not full Unicode */ const uschar *start_code; /* For use when recursing */ @@ -1608,7 +1610,7 @@ typedef struct match_data { USPTR end_subject; /* End of the subject string */ USPTR start_match_ptr; /* Start of matched string */ USPTR end_match_ptr; /* Subject position at end match */ - USPTR start_used_ptr; /* Earliest consulted character */ + USPTR start_used_ptr; /* Earliest consulted character */ int partial; /* PARTIAL options */ int end_offset_top; /* Highwater mark at end of match */ int capture_last; /* Most recent capture number */ @@ -1626,8 +1628,9 @@ typedef struct 
dfa_match_data { const uschar *start_code; /* Start of the compiled pattern */ const uschar *start_subject; /* Start of the subject string */ const uschar *end_subject; /* End of subject string */ - const uschar *start_used_ptr; /* Earliest consulted character */ + const uschar *start_used_ptr; /* Earliest consulted character */ const uschar *tables; /* Character tables */ + int start_offset; /* The start offset value */ int moptions; /* Match options */ int poptions; /* Pattern options */ int nltype; /* Newline type */ @@ -223,12 +223,12 @@ if (namecount <= 0) printf("No named substrings\n"); else * * * If the previous match WAS for an empty string, we can't do that, as it * * would lead to an infinite loop. Instead, a special call of pcre_exec() * -* is made with the PCRE_NOTEMPTY and PCRE_ANCHORED flags set. The first * -* of these tells PCRE that an empty string is not a valid match; other * -* possibilities must be tried. The second flag restricts PCRE to one * -* match attempt at the initial string position. If this match succeeds, * -* an alternative to the empty string match has been found, and we can * -* proceed round the loop. * +* is made with the PCRE_NOTEMPTY_ATSTART and PCRE_ANCHORED flags set. * +* The first of these tells PCRE that an empty string at the start of the * +* subject is not a valid match; other possibilities must be tried. The * +* second flag restricts PCRE to one match attempt at the initial string * +* position. If this match succeeds, an alternative to the empty string * +* match has been found, and we can proceed round the loop. 
* *************************************************************************/ if (!find_all) @@ -251,7 +251,7 @@ for (;;) if (ovector[0] == ovector[1]) { if (ovector[0] == subject_length) break; - options = PCRE_NOTEMPTY | PCRE_ANCHORED; + options = PCRE_NOTEMPTY_ATSTART | PCRE_ANCHORED; } /* Run the next matching operation */ @@ -1970,7 +1970,10 @@ while (!done) continue; case 'N': - options |= PCRE_NOTEMPTY; + if ((options & PCRE_NOTEMPTY) != 0) + options = (options & ~PCRE_NOTEMPTY) | PCRE_NOTEMPTY_ATSTART; + else + options |= PCRE_NOTEMPTY; continue; case 'O': @@ -2443,9 +2446,9 @@ while (!done) if (!do_g && !do_G) break; /* If we have matched an empty string, first check to see if we are at - the end of the subject. If so, the /g loop is over. Otherwise, mimic - what Perl's /g options does. This turns out to be rather cunning. First - we set PCRE_NOTEMPTY and PCRE_ANCHORED and try the match again at the + the end of the subject. If so, the /g loop is over. Otherwise, mimic what + Perl's /g option does. This turns out to be rather cunning. First we set + PCRE_NOTEMPTY_ATSTART and PCRE_ANCHORED and try the match again at the same point. If this fails (picked up above) we advance to the next character. */ @@ -2454,7 +2457,7 @@ while (!done) if (use_offsets[0] == use_offsets[1]) { if (use_offsets[0] == len) break; - g_notempty = PCRE_NOTEMPTY | PCRE_ANCHORED; + g_notempty = PCRE_NOTEMPTY_ATSTART | PCRE_ANCHORED; } /* For /g, update the start offset, leaving the rest alone */ diff --git a/testdata/testinput2 b/testdata/testinput2 index 0688785..5961e14 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -2961,4 +2961,50 @@ a random value. 
/Ix /(?&word)(?&element)(?(DEFINE)(?<element><[^\d][^>]>[^<])(?<word>\w*+))/BZ +/abc\K|def\K/g+ + Xabcdefghi + +/ab\Kc|de\Kf/g+ + Xabcdefghi + +/(?=C)/g+ + ABCDECBA + +/^abc\K/+ + abcdef + ** Failers + defabcxyz + +/abc\K/+ + abcdef + abcdef\N\N + xyzabcdef\N\N + ** Failers + abcdef\N + xyzabcdef\N + +/^(?:(?=abc)|abc\K)/+ + abcdef + abcdef\N\N + ** Failers + abcdef\N + +/a?b?/+ + xyz + xyzabc + xyzabc\N + xyzabc\N\N + xyz\N\N + ** Failers + xyz\N + +/^a?b?/+ + xyz + xyzabc + ** Failers + xyzabc\N + xyzabc\N\N + xyz\N\N + xyz\N + / End of testinput2 / diff --git a/testdata/testinput7 b/testdata/testinput7 index 237914e..c32d336 100644 --- a/testdata/testinput7 +++ b/testdata/testinput7 @@ -4483,4 +4483,7 @@ +++ab\P +++ab\P\P +/(?=C)/g+ + ABCDECBA + / End of testinput7 / diff --git a/testdata/testoutput2 b/testdata/testoutput2 index 725cc5b..f84c3ca 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -10048,4 +10048,104 @@ Partial match: +ab End ------------------------------------------------------------------ +/abc\K|def\K/g+ + Xabcdefghi + 0: + 0+ defghi + 0: + 0+ ghi + +/ab\Kc|de\Kf/g+ + Xabcdefghi + 0: c + 0+ defghi + 0: f + 0+ ghi + +/(?=C)/g+ + ABCDECBA + 0: + 0+ CDECBA + 0: + 0+ CBA + +/^abc\K/+ + abcdef + 0: + 0+ def + ** Failers +No match + defabcxyz +No match + +/abc\K/+ + abcdef + 0: + 0+ def + abcdef\N\N + 0: + 0+ def + xyzabcdef\N\N + 0: + 0+ def + ** Failers +No match + abcdef\N +No match + xyzabcdef\N +No match + +/^(?:(?=abc)|abc\K)/+ + abcdef + 0: + 0+ abcdef + abcdef\N\N + 0: + 0+ def + ** Failers +No match + abcdef\N +No match + +/a?b?/+ + xyz + 0: + 0+ xyz + xyzabc + 0: + 0+ xyzabc + xyzabc\N + 0: ab + 0+ c + xyzabc\N\N + 0: + 0+ yzabc + xyz\N\N + 0: + 0+ yz + ** Failers + 0: + 0+ ** Failers + xyz\N +No match + +/^a?b?/+ + xyz + 0: + 0+ xyz + xyzabc + 0: + 0+ xyzabc + ** Failers + 0: + 0+ ** Failers + xyzabc\N +No match + xyzabc\N\N +No match + xyz\N\N +No match + xyz\N +No match + / End of testinput2 / diff --git 
a/testdata/testoutput7 b/testdata/testoutput7 index 552ebc0..cc8b91d 100644 --- a/testdata/testoutput7 +++ b/testdata/testoutput7 @@ -7462,4 +7462,11 @@ Partial match: +ab +++ab\P\P Partial match: +ab +/(?=C)/g+ + ABCDECBA + 0: + 0+ CDECBA + 0: + 0+ CBA + / End of testinput7 / |