diff options
author | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2010-11-03 18:32:55 +0000 |
---|---|---|
committer | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2010-11-03 18:32:55 +0000 |
commit | ed44c1dfe4d6a49f32fbb2927444306ccf4e0acb (patch) | |
tree | 5d80119550dc232d4ed295fd0d2907d4627a11a6 | |
parent | 1baf641ddd67f78693280553a81a05b69cbb3fff (diff) | |
download | pcre-ed44c1dfe4d6a49f32fbb2927444306ccf4e0acb.tar.gz |
Fix broken /g code in pcretest and -g code in pcredemo.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@566 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- | ChangeLog | 16 | ||||
-rw-r--r-- | doc/pcreapi.3 | 7 | ||||
-rw-r--r-- | doc/pcretest.1 | 11 | ||||
-rw-r--r-- | pcredemo.c | 76 | ||||
-rw-r--r-- | pcretest.c | 19 | ||||
-rw-r--r-- | testdata/testinput2 | 9 | ||||
-rw-r--r-- | testdata/testinput4 | 3 | ||||
-rw-r--r-- | testdata/testoutput2 | 21 | ||||
-rw-r--r-- | testdata/testoutput4 | 9 |
9 files changed, 149 insertions, 22 deletions
@@ -66,6 +66,22 @@ Version 8.11 10-Oct-2010 12. Added an optional parentheses number to the -o and --only-matching options of pcregrep. + +13. Imitating Perl's /g action for multiple matches is tricky when the pattern + can match an empty string. The code to do it in pcretest and pcredemo + needed fixing: + + (a) When the newline convention was "crlf", pcretest got it wrong, skipping + only one byte after an empty string match just before CRLF (this case + just got forgotten; "any" and "anycrlf" were OK). + + (b) The pcretest code also had a bug, causing it to loop forever in UTF-8 + mode when an empty string match preceded an ASCII character followed by + a non-ASCII character. (The code for advancing by one character rather + than one byte was nonsense.) + + (c) The pcredemo.c sample program did not have any code at all to handle + the cases when CRLF is a valid newline sequence. Version 8.10 25-Jun-2010 diff --git a/doc/pcreapi.3 b/doc/pcreapi.3 index e94fc62..b56ee7e 100644 --- a/doc/pcreapi.3 +++ b/doc/pcreapi.3 @@ -1450,7 +1450,10 @@ the .\" HREF \fBpcredemo\fP .\" -sample program. +sample program. In the most general case, you have to check to see if the +newline convention recognizes CRLF as a newline, and if so, and the current +character is CR followed by LF, advance the starting offset by two characters +instead of one. .sp PCRE_NO_START_OPTIMIZE .sp @@ -2193,6 +2196,6 @@ Cambridge CB2 3QH, England. .rs .sp .nf -Last updated: 22 October 2010 +Last updated: 01 November 2010 Copyright (c) 1997-2010 University of Cambridge. .fi diff --git a/doc/pcretest.1 b/doc/pcretest.1 index 75c2611..37b970c 100644 --- a/doc/pcretest.1 +++ b/doc/pcretest.1 @@ -219,9 +219,12 @@ begins with a lookbehind assertion (including \eb or \eB). 
If any call to \fBpcre_exec()\fP in a \fB/g\fP or \fB/G\fP sequence matches an empty string, the next call is done with the PCRE_NOTEMPTY_ATSTART and PCRE_ANCHORED flags set in order to search for another, non-empty, match at the -same point. If this second match fails, the start offset is advanced by one -character, and the normal match is retried. This imitates the way Perl handles -such cases when using the \fB/g\fP modifier or the \fBsplit()\fP function. +same point. If this second match fails, the start offset is advanced, and the +normal match is retried. This imitates the way Perl handles such cases when +using the \fB/g\fP modifier or the \fBsplit()\fP function. Normally, the start +offset is advanced by one character, but if the newline convention recognizes +CRLF as a newline, and the current character is CR followed by LF, an advance +of two is used. . . .SS "Other modifiers" @@ -767,6 +770,6 @@ Cambridge CB2 3QH, England. .rs .sp .nf -Last updated: 22 October 2010 +Last updated: 01 November 2010 Copyright (c) 1997-2010 University of Cambridge. .fi @@ -50,13 +50,16 @@ const char *error; char *pattern; char *subject; unsigned char *name_table; +unsigned int option_bits; int erroffset; int find_all; +int crlf_is_newline; int namecount; int name_entry_size; int ovector[OVECCOUNT]; int subject_length; int rc, i; +int utf8; /************************************************************************** @@ -238,15 +241,56 @@ if (namecount <= 0) printf("No named substrings\n"); else * subject is not a valid match; other possibilities must be tried. The * * second flag restricts PCRE to one match attempt at the initial string * * position. If this match succeeds, an alternative to the empty string * -* match has been found, and we can proceed round the loop. * +* match has been found, and we can print it and proceed round the loop, * +* advancing by the length of whatever was found. 
If this match does not * +* succeed, we still stay in the loop, advancing by just one character. * +* In UTF-8 mode, which can be set by (*UTF8) in the pattern, this may be * +* more than one byte. * +* * +* However, there is a complication concerned with newlines. When the * +* newline convention is such that CRLF is a valid newline, we must * +* advance by two characters rather than one. The newline convention can * +* be set in the regex by (*CR), etc.; if not, we must find the default. * *************************************************************************/ -if (!find_all) +if (!find_all) /* Check for -g */ { pcre_free(re); /* Release the memory used for the compiled pattern */ return 0; /* Finish unless -g was given */ } +/* Before running the loop, check for UTF-8 and whether CRLF is a valid newline +sequence. First, find the options with which the regex was compiled; extract +the UTF-8 state, and mask off all but the newline options. */ + +(void)pcre_fullinfo(re, NULL, PCRE_INFO_OPTIONS, &option_bits); +utf8 = option_bits & PCRE_UTF8; +option_bits &= PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|PCRE_NEWLINE_CRLF| + PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF; + +/* If no newline options were set, find the default newline convention from the +build configuration. */ + +if (option_bits == 0) + { + int d; + (void)pcre_config(PCRE_CONFIG_NEWLINE, &d); + /* Note that these values are always the ASCII ones, even in + EBCDIC environments. CR = 13, NL = 10. */ + option_bits = (d == 13)? PCRE_NEWLINE_CR : + (d == 10)? PCRE_NEWLINE_LF : + (d == (13<<8 | 10))? PCRE_NEWLINE_CRLF : + (d == -2)? PCRE_NEWLINE_ANYCRLF : + (d == -1)? PCRE_NEWLINE_ANY : 0; + } + +/* See if CRLF is a valid newline sequence. 
*/ + +crlf_is_newline = + option_bits == PCRE_NEWLINE_ANY || + option_bits == PCRE_NEWLINE_CRLF || + option_bits == PCRE_NEWLINE_ANYCRLF; + /* Loop for second and subsequent matches */ for (;;) @@ -280,14 +324,32 @@ for (;;) is zero, it just means we have found all possible matches, so the loop ends. Otherwise, it means we have failed to find a non-empty-string match at a point where there was a previous empty-string match. In this case, we do what - Perl does: advance the matching position by one, and continue. We do this by - setting the "end of previous match" offset, because that is picked up at the - top of the loop as the point at which to start again. */ + Perl does: advance the matching position by one character, and continue. We + do this by setting the "end of previous match" offset, because that is picked + up at the top of the loop as the point at which to start again. + + There are two complications: (a) When CRLF is a valid newline sequence, and + the current position is just before it, advance by an extra byte. (b) + Otherwise we must ensure that we skip an entire UTF-8 character if we are in + UTF-8 mode. */ if (rc == PCRE_ERROR_NOMATCH) { - if (options == 0) break; - ovector[1] = start_offset + 1; + if (options == 0) break; /* All matches found */ + ovector[1] = start_offset + 1; /* Advance one byte */ + if (crlf_is_newline && /* If CRLF is newline & */ + start_offset < subject_length - 1 && /* we are at CRLF, */ + subject[start_offset] == '\r' && + subject[start_offset + 1] == '\n') + ovector[1] += 1; /* Advance by one more. */ + else if (utf8) /* Otherwise, ensure we */ + { /* advance a whole UTF-8 */ + while (ovector[1] < subject_length) /* character. */ + { + if ((subject[ovector[1]] & 0xc0) != 0x80) break; + ovector[1] += 1; + } + } continue; /* Go round the loop again */ } @@ -2791,11 +2791,13 @@ while (!done) to advance the start offset, and continue. We won't be at the end of the string - that was checked before setting g_notempty. 
- Complication arises in the case when the newline option is "any" or - "anycrlf". If the previous match was at the end of a line terminated by - CRLF, an advance of one character just passes the \r, whereas we should - prefer the longer newline sequence, as does the code in pcre_exec(). - Fudge the offset value to achieve this. + Complication arises in the case when the newline convention is "any", + "crlf", or "anycrlf". If the previous match was at the end of a line + terminated by CRLF, an advance of one character just passes the \r, + whereas we should prefer the longer newline sequence, as does the code in + pcre_exec(). Fudge the offset value to achieve this. We check for a + newline setting in the pattern; if none was set, use pcre_config() to + find the default. Otherwise, in the case of UTF-8 matching, the advance must be one character, not one byte. */ @@ -2820,6 +2822,7 @@ while (!done) (d == -1)? PCRE_NEWLINE_ANY : 0; } if (((obits & PCRE_NEWLINE_BITS) == PCRE_NEWLINE_ANY || + (obits & PCRE_NEWLINE_BITS) == PCRE_NEWLINE_CRLF || (obits & PCRE_NEWLINE_BITS) == PCRE_NEWLINE_ANYCRLF) && start_offset < len - 1 && @@ -2830,10 +2833,8 @@ while (!done) { while (start_offset + onechar < len) { - int tb = bptr[start_offset+onechar]; - if (tb <= 127) break; - tb &= 0xc0; - if (tb != 0 && tb != 0xc0) onechar++; + if ((bptr[start_offset+onechar] & 0xc0) != 0x80) break; + onechar++; } } use_offsets[1] = start_offset + onechar; diff --git a/testdata/testinput2 b/testdata/testinput2 index a16b4d6..8967ba2 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -2346,6 +2346,15 @@ a random value. 
/Ix a\nb a\r\nb a\x85b + +/(*ANY).*/g + abc\r\ndef + +/(*ANYCRLF).*/g + abc\r\ndef + +/(*CRLF).*/g + abc\r\ndef /a\Rb/I<bsr_anycrlf> a\rb diff --git a/testdata/testinput4 b/testdata/testinput4 index a629387..e2bae42 100644 --- a/testdata/testinput4 +++ b/testdata/testinput4 @@ -641,4 +641,7 @@ a\x{c0}aaaa/ a\x{c0}a\x{c0}aaa/ +/A*/g8 + AAB\x{123}BAA + /-- End of testinput4 --/ diff --git a/testdata/testoutput2 b/testdata/testoutput2 index 6545cfd..33419e5 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -8787,6 +8787,27 @@ No match No match a\x85b No match + +/(*ANY).*/g + abc\r\ndef + 0: abc + 0: + 0: def + 0: + +/(*ANYCRLF).*/g + abc\r\ndef + 0: abc + 0: + 0: def + 0: + +/(*CRLF).*/g + abc\r\ndef + 0: abc + 0: + 0: def + 0: /a\Rb/I<bsr_anycrlf> Capturing subpattern count = 0 diff --git a/testdata/testoutput4 b/testdata/testoutput4 index ad331c7..4591026 100644 --- a/testdata/testoutput4 +++ b/testdata/testoutput4 @@ -1119,4 +1119,13 @@ No match 0: a\x{c0}a\x{c0} 1: a\x{c0} +/A*/g8 + AAB\x{123}BAA + 0: AA + 0: + 0: + 0: + 0: AA + 0: + /-- End of testinput4 --/ |