Several bugs concerned with skipping over UTF-8 characters at the start of
matching (8.0/13, 8.0/14).

git-svn-id: svn://vcs.exim.org/pcre/code/trunk@364 2f5784b3-3f2a-0410-8824-cb99058d5e15
author: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2008-07-11 14:53:41 +0000
committer: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2008-07-11 14:53:41 +0000
commit: 4299955a938be0e20bea4fae5fc346b6501d3997 (patch)
tree: a4ed201cc3623cd6963e6ab4e49e6e68aded98b9
parent: a16000d8f2c04ad3c448033d27256d9b3fe53b34 (diff)
download: pcre-4299955a938be0e20bea4fae5fc346b6501d3997.tar.gz
5 files changed, 93 insertions, 21 deletions
diff --git a/ChangeLog b/ChangeLog
index 095447b..0d9d133 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -55,6 +55,13 @@ Version 8.0 02 Jul-08
     pcre_dfa_exec() could read past the end of the passed subject if there was 
     no match. To help with detecting such bugs (e.g. with valgrind), I modified
     pcretest so that it places the subject at the end of its malloc-ed buffer.
+    
+13. The change to pcretest in 12 above threw up a couple more cases when pcre_
+    exec() might read past the end of the data buffer in UTF-8 mode. 
+    
+14. A similar bug to 7.3/2 existed when the PCRE_FIRSTLINE option was set and
+    the data contained the byte 0x85 as part of a UTF-8 character within its 
+    first line.  
 
 
 Version 7.7 07-May-08
diff --git a/pcre_exec.c b/pcre_exec.c
index 11c742f..83ee29a 100644
--- a/pcre_exec.c
+++ b/pcre_exec.c
@@ -4695,32 +4695,82 @@ for(;;)
 
   if (firstline)
     {
-    USPTR t = start_match;
+    USPTR t = start_match;
+#ifdef SUPPORT_UTF8
+    if (utf8)
+      {     
+      while (t < md->end_subject && !IS_NEWLINE(t)) 
+        {
+        t++;
+        while (t < end_subject && (*t & 0xc0) == 0x80) t++;
+        } 
+      }
+    else
+#endif        
     while (t < md->end_subject && !IS_NEWLINE(t)) t++;
     end_subject = t;
     }
 
-  /* Now test for a unique first byte */
+  /* Now advance to a unique first byte if there is one. */
 
   if (first_byte >= 0)
     {
     if (first_byte_caseless)
-      while (start_match < end_subject &&
-             md->lcc[*start_match] != first_byte)
-        { NEXTCHAR(start_match); }
-    else
+      {
+#ifdef SUPPORT_UTF8
+      if (utf8)
+        {
+        while (start_match < end_subject && md->lcc[*start_match] != first_byte)
+          {
+          start_match++;       
+          while(start_match < end_subject && (*start_match & 0xc0) == 0x80) 
+            start_match++;
+          } 
+        }
+      else
+#endif                  
+      while (start_match < end_subject && md->lcc[*start_match] != first_byte)
+        start_match++;
+      }   
+    else    /* Caseful case */
+      { 
+#ifdef SUPPORT_UTF8
+      if (utf8)
+        {
+        while (start_match < end_subject && *start_match != first_byte)
+          {
+          start_match++;       
+          while(start_match < end_subject && (*start_match & 0xc0) == 0x80) 
+            start_match++;
+          } 
+        }
+      else
+#endif                  
       while (start_match < end_subject && *start_match != first_byte)
-        { NEXTCHAR(start_match); }
+        start_match++;
+      }   
     }
 
-  /* Or to just after a linebreak for a multiline match if possible */
+  /* Or to just after a linebreak for a multiline match */
 
   else if (startline)
     {
     if (start_match > md->start_subject + start_offset)
       {
+#ifdef SUPPORT_UTF8
+      if (utf8)
+        {
+        while (start_match < end_subject && !WAS_NEWLINE(start_match))
+          {
+          start_match++;       
+          while(start_match < end_subject && (*start_match & 0xc0) == 0x80) 
+            start_match++;
+          } 
+        }
+      else
+#endif                  
       while (start_match < end_subject && !WAS_NEWLINE(start_match))
-        { NEXTCHAR(start_match); }
+        start_match++;
         
       /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
       and we are now at a LF, advance the match position by one more character.
@@ -4734,16 +4784,32 @@ for(;;)
       }
     }
 
-  /* Or to a non-unique first char after study */
+  /* Or to a non-unique first byte after study */
 
   else if (start_bits != NULL)
     {
+#ifdef SUPPORT_UTF8    
+    if (utf8)
+      { 
+      while (start_match < end_subject)
+        {
+        register unsigned int c = *start_match;
+        if ((start_bits[c/8] & (1 << (c&7))) == 0)
+          { 
+          start_match++;       
+          while(start_match < end_subject && (*start_match & 0xc0) == 0x80) 
+            start_match++;
+          } 
+        else break;
+        }
+      }
+    else
+#endif           
     while (start_match < end_subject)
       {
       register unsigned int c = *start_match;
-      if ((start_bits[c/8] & (1 << (c&7))) == 0)
-        { NEXTCHAR(start_match); }
-      else break;
+      if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
+        else break;
       }
     }
 
diff --git a/pcre_internal.h b/pcre_internal.h
index 7547053..97c62d9 100644
--- a/pcre_internal.h
+++ b/pcre_internal.h
@@ -381,7 +381,6 @@ never be called in byte mode. To make sure it can never even appear when UTF-8
 support is omitted, we don't even define it. */
 
 #ifndef SUPPORT_UTF8
-#define NEXTCHAR(p) p++;
 #define GETCHAR(c, eptr) c = *eptr;
 #define GETCHARTEST(c, eptr) c = *eptr;
 #define GETCHARINC(c, eptr) c = *eptr++;
@@ -391,13 +390,6 @@ support is omitted, we don't even define it. */
 
 #else   /* SUPPORT_UTF8 */
 
-/* Advance a character pointer one byte in non-UTF-8 mode and by one character
-in UTF-8 mode. */
-
-#define NEXTCHAR(p) \
-  p++; \
-  if (utf8) { while((*p & 0xc0) == 0x80) p++; }
-
 /* Get the next UTF-8 character, not advancing the pointer. This is called when
 we know we are in UTF-8 mode. */
 
diff --git a/testdata/testinput5 b/testdata/testinput5
index 62e9d1e..7d64b43 100644
--- a/testdata/testinput5
+++ b/testdata/testinput5
@@ -477,4 +477,7 @@ can't tell the difference.) --/
     \x{de}\x{de}
     \x{123} 
 
+/X/8f<any> 
+    A\x{1ec5}ABCXYZ
+
 / End of testinput5 /
diff --git a/testdata/testoutput5 b/testdata/testoutput5
index d36a246..9567233 100644
--- a/testdata/testoutput5
+++ b/testdata/testoutput5
@@ -1637,4 +1637,8 @@ No match
 ** Truncation will probably give the wrong result.
 No match
 
+/X/8f<any> 
+    A\x{1ec5}ABCXYZ
+ 0: X
+
 / End of testinput5 /
author	ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2008-07-11 14:53:41 +0000
committer	ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2008-07-11 14:53:41 +0000
commit	4299955a938be0e20bea4fae5fc346b6501d3997 (patch)
tree	a4ed201cc3623cd6963e6ab4e49e6e68aded98b9
parent	a16000d8f2c04ad3c448033d27256d9b3fe53b34 (diff)
download	pcre-4299955a938be0e20bea4fae5fc346b6501d3997.tar.gz