diff options
author | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2007-04-02 10:08:14 +0000 |
---|---|---|
committer | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2007-04-02 10:08:14 +0000 |
commit | f8735bf7b631d53067a040bd9f2c04b8f80c6dae (patch) | |
tree | aa7ebef3c4d31210c14705eaeed55a0716a37f5a | |
parent | d0fc62ee8e85255467ef8541458df6e7f4e01cef (diff) | |
download | pcre-f8735bf7b631d53067a040bd9f2c04b8f80c6dae.tar.gz |
Previous fix for pcretest was buggy. Try again.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@143 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- | ChangeLog | 3 |
-rw-r--r-- | pcretest.c | 47 |
-rw-r--r-- | testdata/testinput2 | 6 |
-rw-r--r-- | testdata/testinput7 | 6 |
-rw-r--r-- | testdata/testoutput2 | 16 |
-rw-r--r-- | testdata/testoutput7 | 15 |
6 files changed, 68 insertions, 25 deletions
@@ -133,7 +133,8 @@ Version 7.1 12-Mar-07 19. In pcretest, if the pattern /(?m)^$/g<any> was matched against the string "abc\r\n\r\n", it found an unwanted second match after the second \r. This was because its rules for how to advance for /g after matching an empty - string did not allow for this case. They now check for it specially. + string at the end of a line did not allow for this case. They now check for + it specially. Version 7.0 19-Dec-06 @@ -1972,7 +1972,6 @@ while (!done) for (;; gmatched++) /* Loop for /g or /G */ { - int gany_fudge; if (timeitm > 0) { register int i; @@ -2212,11 +2211,18 @@ while (!done) } /* Failed to match. If this is a /g or /G loop and we previously set - g_notempty after a null match, this is not necessarily the end. - We want to advance the start offset, and continue. In the case of UTF-8 - matching, the advance must be one character, not one byte. Fudge the - offset values to achieve this. We won't be at the end of the string - - that was checked before setting g_notempty. */ + g_notempty after a null match, this is not necessarily the end. We want + to advance the start offset, and continue. We won't be at the end of the + string - that was checked before setting g_notempty. + + Complication arises in the case when the newline option is "any". + If the previous match was at the end of a line terminated by CRLF, an + advance of one character just passes the \r, whereas we should prefer the + longer newline sequence, as does the code in pcre_exec(). Fudge the + offset value to achieve this. + + Otherwise, in the case of UTF-8 matching, the advance must be one + character, not one byte. 
*/ else { @@ -2224,7 +2230,13 @@ while (!done) { int onechar = 1; use_offsets[0] = start_offset; - if (use_utf8) + if ((((real_pcre *)re)->options & PCRE_NEWLINE_BITS) == + PCRE_NEWLINE_ANY && + start_offset < len - 1 && + bptr[start_offset] == '\r' && + bptr[start_offset+1] == '\n') + onechar++; + else if (use_utf8) { while (start_offset + onechar < len) { @@ -2256,39 +2268,26 @@ while (!done) what Perl's /g options does. This turns out to be rather cunning. First we set PCRE_NOTEMPTY and PCRE_ANCHORED and try the match again at the same point. If this fails (picked up above) we advance to the next - character. - - Yet more complication arises in the case when the newline option is - "any" and a pattern in multiline mode has to match at the start of a - line. If a previous match was at the end of a line, and advance of one - character just passes the \r, whereas we should prefer the longer newline - sequence, as does the code in pcre_exec(). So we fudge it. */ + character. */ g_notempty = 0; - gany_fudge = 0; if (use_offsets[0] == use_offsets[1]) { if (use_offsets[0] == len) break; g_notempty = PCRE_NOTEMPTY | PCRE_ANCHORED; - if ((((real_pcre *)re)->options & PCRE_STARTLINE) != 0 && - (((real_pcre *)re)->options & PCRE_NEWLINE_BITS) == PCRE_NEWLINE_ANY && - use_offsets[0] < len - 1 && - bptr[use_offsets[0]] == '\r' && - bptr[use_offsets[0]+1] == '\n') - gany_fudge = 1; } /* For /g, update the start offset, leaving the rest alone */ - if (do_g) start_offset = use_offsets[1] + gany_fudge; + if (do_g) start_offset = use_offsets[1]; /* For /G, update the pointer and length */ else { - bptr += use_offsets[1] + gany_fudge; - len -= use_offsets[1] + gany_fudge; + bptr += use_offsets[1]; + len -= use_offsets[1]; } } /* End of loop for /g and /G */ diff --git a/testdata/testinput2 b/testdata/testinput2 index 1de01cd..dcfa77f 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -2139,4 +2139,10 @@ a random value. 
/Ix /(?m)^$/<any>g+ abc\r\n\r\n +/(?m)^$|^\r\n/<any>g+ + abc\r\n\r\n + +/(?m)$/<any>g+ + abc\r\n\r\n + / End of testinput2 / diff --git a/testdata/testinput7 b/testdata/testinput7 index 4f1dbc5..5c2dd6f 100644 --- a/testdata/testinput7 +++ b/testdata/testinput7 @@ -4243,4 +4243,10 @@ /(?m)^$/<any>g+ abc\r\n\r\n +/(?m)^$|^\r\n/<any>g+ + abc\r\n\r\n + +/(?m)$/<any>g+ + abc\r\n\r\n + / End of testinput7 / diff --git a/testdata/testoutput2 b/testdata/testoutput2 index 56806b6..222b8ef 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -8141,4 +8141,20 @@ No match 0: 0+ \x0d\x0a +/(?m)^$|^\r\n/<any>g+ + abc\r\n\r\n + 0: + 0+ \x0d\x0a + 0: \x0d\x0a + 0+ + +/(?m)$/<any>g+ + abc\r\n\r\n + 0: + 0+ \x0d\x0a\x0d\x0a + 0: + 0+ \x0d\x0a + 0: + 0+ + / End of testinput2 / diff --git a/testdata/testoutput7 b/testdata/testoutput7 index f4abe04..6860b66 100644 --- a/testdata/testoutput7 +++ b/testdata/testoutput7 @@ -6975,4 +6975,19 @@ No match 0: 0+ \x0d\x0a +/(?m)^$|^\r\n/<any>g+ + abc\r\n\r\n + 0: \x0d\x0a + 0+ + 1: + +/(?m)$/<any>g+ + abc\r\n\r\n + 0: + 0+ \x0d\x0a\x0d\x0a + 0: + 0+ \x0d\x0a + 0: + 0+ + / End of testinput7 / |