Further fixes for bumpalong processing in UTF-8 mode.

git-svn-id: svn://vcs.exim.org/pcre/code/trunk@365 2f5784b3-3f2a-0410-8824-cb99058d5e15
author: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2008-07-11 17:06:55 +0000
committer: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2008-07-11 17:06:55 +0000
commit: 1b526c19fd97a3aff7d5432dea74ff5f87d7f0b4 (patch)
tree: 422150d0913b5149dc27befb19155bc657074d2f
parent: 4299955a938be0e20bea4fae5fc346b6501d3997 (diff)
download: pcre-1b526c19fd97a3aff7d5432dea74ff5f87d7f0b4.tar.gz
5 files changed, 36 insertions, 50 deletions
diff --git a/ChangeLog b/ChangeLog
index 0d9d133..26033ba 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -61,7 +61,7 @@ Version 8.0 02 Jul-08
     
 14. A similar bug to 7.3/2 existed when the PCRE_FIRSTLINE option was set and
     the data contained the byte 0x85 as part of a UTF-8 character within its 
-    first line.  
+    first line. This applied both to normal and DFA matching. 
 
 
 Version 7.7 07-May-08
diff --git a/pcre_dfa_exec.c b/pcre_dfa_exec.c
index c0ed8eb..11a03e6 100644
--- a/pcre_dfa_exec.c
+++ b/pcre_dfa_exec.c
@@ -2733,7 +2733,18 @@ for (;;)
 
     if (firstline)
       {
-      const uschar *t = current_subject;
+      USPTR t = current_subject;
+#ifdef SUPPORT_UTF8
+      if (utf8)
+        {     
+        while (t < md->end_subject && !IS_NEWLINE(t)) 
+          {
+          t++;
+          while (t < end_subject && (*t & 0xc0) == 0x80) t++;
+          } 
+        }
+      else
+#endif        
       while (t < md->end_subject && !IS_NEWLINE(t)) t++;
       end_subject = t;
       }
@@ -2755,9 +2766,22 @@ for (;;)
       {
       if (current_subject > md->start_subject + start_offset)
         {
+#ifdef SUPPORT_UTF8
+        if (utf8)
+          {
+          while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
+            {
+            current_subject++;       
+            while(current_subject < end_subject && 
+                  (*current_subject & 0xc0) == 0x80) 
+              current_subject++;
+            } 
+          }
+        else
+#endif                  
         while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
           current_subject++;
-
+          
         /* If we have just passed a CR and the newline option is ANY or
         ANYCRLF, and we are now at a LF, advance the match position by one more
         character. */
diff --git a/pcre_exec.c b/pcre_exec.c
index 83ee29a..212acbe 100644
--- a/pcre_exec.c
+++ b/pcre_exec.c
@@ -4695,7 +4695,7 @@ for(;;)
 
   if (firstline)
     {
-    USPTR *t = start_match;
+    USPTR t = start_match;
 #ifdef SUPPORT_UTF8
     if (utf8)
       {     
@@ -4716,39 +4716,11 @@ for(;;)
   if (first_byte >= 0)
     {
     if (first_byte_caseless)
-      {
-#ifdef SUPPORT_UTF8
-      if (utf8)
-        {
-        while (start_match < end_subject && md->lcc[*start_match] != first_byte)
-          {
-          start_match++;       
-          while(start_match < end_subject && (*start_match & 0xc0) == 0x80) 
-            start_match++;
-          } 
-        }
-      else
-#endif                  
       while (start_match < end_subject && md->lcc[*start_match] != first_byte)
         start_match++;
-      }   
-    else    /* Caseful case */
-      { 
-#ifdef SUPPORT_UTF8
-      if (utf8)
-        {
-        while (start_match < end_subject && *start_match != first_byte)
-          {
-          start_match++;       
-          while(start_match < end_subject && (*start_match & 0xc0) == 0x80) 
-            start_match++;
-          } 
-        }
-      else
-#endif                  
+    else
       while (start_match < end_subject && *start_match != first_byte)
         start_match++;
-      }   
     }
 
   /* Or to just after a linebreak for a multiline match */
@@ -4788,23 +4760,6 @@ for(;;)
 
   else if (start_bits != NULL)
     {
-#ifdef SUPPORT_UTF8    
-    if (utf8)
-      { 
-      while (start_match < end_subject)
-        {
-        register unsigned int c = *start_match;
-        if ((start_bits[c/8] & (1 << (c&7))) == 0)
-          { 
-          start_match++;       
-          while(start_match < end_subject && (*start_match & 0xc0) == 0x80) 
-            start_match++;
-          } 
-        else break;
-        }
-      }
-    else
-#endif           
     while (start_match < end_subject)
       {
       register unsigned int c = *start_match;
diff --git a/testdata/testinput8 b/testdata/testinput8
index 5bcfab5..11884ad 100644
--- a/testdata/testinput8
+++ b/testdata/testinput8
@@ -664,4 +664,7 @@
     a\x{85}b\<bsr_anycrlf>
     a\x0bb\<bsr_anycrlf>
  
+/X/8f<any> 
+    A\x{1ec5}ABCXYZ
+
 / End of testinput 8 / 
diff --git a/testdata/testoutput8 b/testdata/testoutput8
index 631e5b8..af10c4a 100644
--- a/testdata/testoutput8
+++ b/testdata/testoutput8
@@ -1284,4 +1284,8 @@ No match
     a\x0bb\<bsr_anycrlf>
 No match
  
+/X/8f<any> 
+    A\x{1ec5}ABCXYZ
+ 0: X
+
 / End of testinput 8 /
author	ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2008-07-11 17:06:55 +0000
committer	ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2008-07-11 17:06:55 +0000
commit	1b526c19fd97a3aff7d5432dea74ff5f87d7f0b4 (patch)
tree	422150d0913b5149dc27befb19155bc657074d2f
parent	4299955a938be0e20bea4fae5fc346b6501d3997 (diff)
download	pcre-1b526c19fd97a3aff7d5432dea74ff5f87d7f0b4.tar.gz