summary refs log tree commit diff
diff options
context:
space:
mode:
author ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2008-07-11 14:53:41 +0000
committer ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2008-07-11 14:53:41 +0000
commit 4299955a938be0e20bea4fae5fc346b6501d3997 (patch)
tree a4ed201cc3623cd6963e6ab4e49e6e68aded98b9
parent a16000d8f2c04ad3c448033d27256d9b3fe53b34 (diff)
download pcre-4299955a938be0e20bea4fae5fc346b6501d3997.tar.gz
Several bugs concerned with skipping over UTF-8 characters at the start of
matching (8.0/13, 8.0/14).

git-svn-id: svn://vcs.exim.org/pcre/code/trunk@364 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- ChangeLog 7
-rw-r--r-- pcre_exec.c 92
-rw-r--r-- pcre_internal.h 8
-rw-r--r-- testdata/testinput5 3
-rw-r--r-- testdata/testoutput5 4
5 files changed, 93 insertions, 21 deletions
diff --git a/ChangeLog b/ChangeLog
index 095447b..0d9d133 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -55,6 +55,13 @@ Version 8.0 02 Jul-08
pcre_dfa_exec() could read past the end of the passed subject if there was
no match. To help with detecting such bugs (e.g. with valgrind), I modified
pcretest so that it places the subject at the end of its malloc-ed buffer.
+
+13. The change to pcretest in 12 above threw up a couple more cases when
+ pcre_exec() might read past the end of the data buffer in UTF-8 mode.
+
+14. A similar bug to 7.3/2 existed when the PCRE_FIRSTLINE option was set and
+ the data contained the byte 0x85 as part of a UTF-8 character within its
+ first line.
Version 7.7 07-May-08
diff --git a/pcre_exec.c b/pcre_exec.c
index 11c742f..83ee29a 100644
--- a/pcre_exec.c
+++ b/pcre_exec.c
@@ -4695,32 +4695,82 @@ for(;;)
if (firstline)
{
USPTR t = start_match;
+#ifdef SUPPORT_UTF8
+ if (utf8)
+ {
+ while (t < md->end_subject && !IS_NEWLINE(t))
+ {
+ t++;
+ while (t < end_subject && (*t & 0xc0) == 0x80) t++;
+ }
+ }
+ else
+#endif
while (t < md->end_subject && !IS_NEWLINE(t)) t++;
end_subject = t;
}
- /* Now test for a unique first byte */
+ /* Now advance to a unique first byte if there is one. */
if (first_byte >= 0)
{
if (first_byte_caseless)
- while (start_match < end_subject &&
- md->lcc[*start_match] != first_byte)
- { NEXTCHAR(start_match); }
- else
+ {
+#ifdef SUPPORT_UTF8
+ if (utf8)
+ {
+ while (start_match < end_subject && md->lcc[*start_match] != first_byte)
+ {
+ start_match++;
+ while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
+ start_match++;
+ }
+ }
+ else
+#endif
+ while (start_match < end_subject && md->lcc[*start_match] != first_byte)
+ start_match++;
+ }
+ else /* Caseful case */
+ {
+#ifdef SUPPORT_UTF8
+ if (utf8)
+ {
+ while (start_match < end_subject && *start_match != first_byte)
+ {
+ start_match++;
+ while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
+ start_match++;
+ }
+ }
+ else
+#endif
while (start_match < end_subject && *start_match != first_byte)
- { NEXTCHAR(start_match); }
+ start_match++;
+ }
}
- /* Or to just after a linebreak for a multiline match if possible */
+ /* Or to just after a linebreak for a multiline match */
else if (startline)
{
if (start_match > md->start_subject + start_offset)
{
+#ifdef SUPPORT_UTF8
+ if (utf8)
+ {
+ while (start_match < end_subject && !WAS_NEWLINE(start_match))
+ {
+ start_match++;
+ while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
+ start_match++;
+ }
+ }
+ else
+#endif
while (start_match < end_subject && !WAS_NEWLINE(start_match))
- { NEXTCHAR(start_match); }
+ start_match++;
/* If we have just passed a CR and the newline option is ANY or ANYCRLF,
and we are now at a LF, advance the match position by one more character.
@@ -4734,16 +4784,32 @@ for(;;)
}
}
- /* Or to a non-unique first char after study */
+ /* Or to a non-unique first byte after study */
else if (start_bits != NULL)
{
+#ifdef SUPPORT_UTF8
+ if (utf8)
+ {
+ while (start_match < end_subject)
+ {
+ register unsigned int c = *start_match;
+ if ((start_bits[c/8] & (1 << (c&7))) == 0)
+ {
+ start_match++;
+ while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
+ start_match++;
+ }
+ else break;
+ }
+ }
+ else
+#endif
while (start_match < end_subject)
{
register unsigned int c = *start_match;
- if ((start_bits[c/8] & (1 << (c&7))) == 0)
- { NEXTCHAR(start_match); }
- else break;
+ if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
+ else break;
}
}
diff --git a/pcre_internal.h b/pcre_internal.h
index 7547053..97c62d9 100644
--- a/pcre_internal.h
+++ b/pcre_internal.h
@@ -381,7 +381,6 @@ never be called in byte mode. To make sure it can never even appear when UTF-8
support is omitted, we don't even define it. */
#ifndef SUPPORT_UTF8
-#define NEXTCHAR(p) p++;
#define GETCHAR(c, eptr) c = *eptr;
#define GETCHARTEST(c, eptr) c = *eptr;
#define GETCHARINC(c, eptr) c = *eptr++;
@@ -391,13 +390,6 @@ support is omitted, we don't even define it. */
#else /* SUPPORT_UTF8 */
-/* Advance a character pointer one byte in non-UTF-8 mode and by one character
-in UTF-8 mode. */
-
-#define NEXTCHAR(p) \
- p++; \
- if (utf8) { while((*p & 0xc0) == 0x80) p++; }
-
/* Get the next UTF-8 character, not advancing the pointer. This is called when
we know we are in UTF-8 mode. */
diff --git a/testdata/testinput5 b/testdata/testinput5
index 62e9d1e..7d64b43 100644
--- a/testdata/testinput5
+++ b/testdata/testinput5
@@ -477,4 +477,7 @@ can't tell the difference.) --/
\x{de}\x{de}
\x{123}
+/X/8f<any>
+ A\x{1ec5}ABCXYZ
+
/ End of testinput5 /
diff --git a/testdata/testoutput5 b/testdata/testoutput5
index d36a246..9567233 100644
--- a/testdata/testoutput5
+++ b/testdata/testoutput5
@@ -1637,4 +1637,8 @@ No match
** Truncation will probably give the wrong result.
No match
+/X/8f<any>
+ A\x{1ec5}ABCXYZ
+ 0: X
+
/ End of testinput5 /