diff options
author | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2008-07-11 14:53:41 +0000 |
---|---|---|
committer | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2008-07-11 14:53:41 +0000 |
commit | 4299955a938be0e20bea4fae5fc346b6501d3997 (patch) | |
tree | a4ed201cc3623cd6963e6ab4e49e6e68aded98b9 | |
parent | a16000d8f2c04ad3c448033d27256d9b3fe53b34 (diff) | |
download | pcre-4299955a938be0e20bea4fae5fc346b6501d3997.tar.gz |
Fix several bugs concerned with skipping over UTF-8 characters at the start of
matching (ChangeLog entries 8.0/13 and 8.0/14).
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@364 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- | ChangeLog | 7 |
-rw-r--r-- | pcre_exec.c | 92 |
-rw-r--r-- | pcre_internal.h | 8 |
-rw-r--r-- | testdata/testinput5 | 3 |
-rw-r--r-- | testdata/testoutput5 | 4 |
5 files changed, 93 insertions, 21 deletions
@@ -55,6 +55,13 @@ Version 8.0 02 Jul-08 pcre_dfa_exec() could read past the end of the passed subject if there was no match. To help with detecting such bugs (e.g. with valgrind), I modified pcretest so that it places the subject at the end of its malloc-ed buffer. + +13. The change to pcretest in 12 above threw up a couple more cases when pcre_ + exec() might read past the end of the data buffer in UTF-8 mode. + +14. A similar bug to 7.3/2 existed when the PCRE_FIRSTLINE option was set and + the data contained the byte 0x85 as part of a UTF-8 character within its + first line. Version 7.7 07-May-08 diff --git a/pcre_exec.c b/pcre_exec.c index 11c742f..83ee29a 100644 --- a/pcre_exec.c +++ b/pcre_exec.c @@ -4695,32 +4695,82 @@ for(;;) if (firstline) { - USPTR t = start_match; + USPTR *t = start_match; +#ifdef SUPPORT_UTF8 + if (utf8) + { + while (t < md->end_subject && !IS_NEWLINE(t)) + { + t++; + while (t < end_subject && (*t & 0xc0) == 0x80) t++; + } + } + else +#endif while (t < md->end_subject && !IS_NEWLINE(t)) t++; end_subject = t; } - /* Now test for a unique first byte */ + /* Now advance to a unique first byte if there is one. 
*/ if (first_byte >= 0) { if (first_byte_caseless) - while (start_match < end_subject && - md->lcc[*start_match] != first_byte) - { NEXTCHAR(start_match); } - else + { +#ifdef SUPPORT_UTF8 + if (utf8) + { + while (start_match < end_subject && md->lcc[*start_match] != first_byte) + { + start_match++; + while(start_match < end_subject && (*start_match & 0xc0) == 0x80) + start_match++; + } + } + else +#endif + while (start_match < end_subject && md->lcc[*start_match] != first_byte) + start_match++; + } + else /* Caseful case */ + { +#ifdef SUPPORT_UTF8 + if (utf8) + { + while (start_match < end_subject && *start_match != first_byte) + { + start_match++; + while(start_match < end_subject && (*start_match & 0xc0) == 0x80) + start_match++; + } + } + else +#endif while (start_match < end_subject && *start_match != first_byte) - { NEXTCHAR(start_match); } + start_match++; + } } - /* Or to just after a linebreak for a multiline match if possible */ + /* Or to just after a linebreak for a multiline match */ else if (startline) { if (start_match > md->start_subject + start_offset) { +#ifdef SUPPORT_UTF8 + if (utf8) + { + while (start_match < end_subject && !WAS_NEWLINE(start_match)) + { + start_match++; + while(start_match < end_subject && (*start_match & 0xc0) == 0x80) + start_match++; + } + } + else +#endif while (start_match < end_subject && !WAS_NEWLINE(start_match)) - { NEXTCHAR(start_match); } + start_match++; /* If we have just passed a CR and the newline option is ANY or ANYCRLF, and we are now at a LF, advance the match position by one more character. 
@@ -4734,16 +4784,32 @@ for(;;) } } - /* Or to a non-unique first char after study */ + /* Or to a non-unique first byte after study */ else if (start_bits != NULL) { +#ifdef SUPPORT_UTF8 + if (utf8) + { + while (start_match < end_subject) + { + register unsigned int c = *start_match; + if ((start_bits[c/8] & (1 << (c&7))) == 0) + { + start_match++; + while(start_match < end_subject && (*start_match & 0xc0) == 0x80) + start_match++; + } + else break; + } + } + else +#endif while (start_match < end_subject) { register unsigned int c = *start_match; - if ((start_bits[c/8] & (1 << (c&7))) == 0) - { NEXTCHAR(start_match); } - else break; + if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; + else break; } } diff --git a/pcre_internal.h b/pcre_internal.h index 7547053..97c62d9 100644 --- a/pcre_internal.h +++ b/pcre_internal.h @@ -381,7 +381,6 @@ never be called in byte mode. To make sure it can never even appear when UTF-8 support is omitted, we don't even define it. */ #ifndef SUPPORT_UTF8 -#define NEXTCHAR(p) p++; #define GETCHAR(c, eptr) c = *eptr; #define GETCHARTEST(c, eptr) c = *eptr; #define GETCHARINC(c, eptr) c = *eptr++; @@ -391,13 +390,6 @@ support is omitted, we don't even define it. */ #else /* SUPPORT_UTF8 */ -/* Advance a character pointer one byte in non-UTF-8 mode and by one character -in UTF-8 mode. */ - -#define NEXTCHAR(p) \ - p++; \ - if (utf8) { while((*p & 0xc0) == 0x80) p++; } - /* Get the next UTF-8 character, not advancing the pointer. This is called when we know we are in UTF-8 mode. */ diff --git a/testdata/testinput5 b/testdata/testinput5 index 62e9d1e..7d64b43 100644 --- a/testdata/testinput5 +++ b/testdata/testinput5 @@ -477,4 +477,7 @@ can't tell the difference.) 
--/ \x{de}\x{de} \x{123} +/X/8f<any> + A\x{1ec5}ABCXYZ + / End of testinput5 / diff --git a/testdata/testoutput5 b/testdata/testoutput5 index d36a246..9567233 100644 --- a/testdata/testoutput5 +++ b/testdata/testoutput5 @@ -1637,4 +1637,8 @@ No match ** Truncation will probably give the wrong result. No match +/X/8f<any> + A\x{1ec5}ABCXYZ + 0: X + / End of testinput5 / |