Fix DFA bug (3 cases) when UTF code was being obeyed in non-UTF mode.

git-svn-id: svn://vcs.exim.org/pcre/code/trunk@979 2f5784b3-3f2a-0410-8824-cb99058d5e15
author: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2012-06-17 19:08:41 +0000
committer: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2012-06-17 19:08:41 +0000
commit: 37b4efccc0529613ee2c93004d33ef681c410a9f (patch)
tree: 488ac9efcee13934e6990762bd969181d192484e
parent: 9a61463ee5099a08610fb08ada15b3c8fc3e0ad5 (diff)
download: pcre-37b4efccc0529613ee2c93004d33ef681c410a9f.tar.gz
4 files changed, 57 insertions, 11 deletions
diff --git a/ChangeLog b/ChangeLog
index a29548d..f167ea3 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -136,6 +136,15 @@ Version 8.31 02-June-2012
     the same checks as \x{...} characters in non-JavaScript mode. Specifically, 
     codepoints that are too big for the mode are faulted, and in a UTF mode, 
     disallowed codepoints are also faulted. 
+    
+39. If PCRE was compiled with UTF support, in three places in the DFA
+    matcher there was code that should only have been obeyed in UTF mode, but
+    was being obeyed unconditionally. In 8-bit mode this could cause incorrect
+    processing when bytes with values greater than 127 were present. In 16-bit
+    mode the bug would be provoked by values in the range 0xdc00 to 0xdfff. In
+    both cases the values are those that cannot be the first data item in a UTF
+    character. The three items that might have provoked this were recursions,
+    possessively repeated groups, and atomic groups.
 
 
 Version 8.30 04-February-2012
diff --git a/pcre_dfa_exec.c b/pcre_dfa_exec.c
index 9565d46..140d85f 100644
--- a/pcre_dfa_exec.c
+++ b/pcre_dfa_exec.c
@@ -38,7 +38,6 @@ POSSIBILITY OF SUCH DAMAGE.
 -----------------------------------------------------------------------------
 */
 
-
 /* This module contains the external function pcre_dfa_exec(), which is an
 alternative matching function that uses a sort of DFA algorithm (not a true
 FSM). This is NOT Perl-compatible, but it has advantages in certain
@@ -382,7 +381,8 @@ for the current character, one for the following character). */
     next_new_state->count  = (y); \
     next_new_state->data   = (z); \
     next_new_state++; \
-    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
+    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d) line %d\n", rlevel*2-2, SP, \
+      (x), (y), (z), __LINE__)); \
     } \
   else return PCRE_ERROR_DFA_WSSIZE
 
@@ -611,7 +611,7 @@ for (;;)
 
   if (ptr < end_subject)
     {
-    clen = 1;        /* Number of bytes in the character */
+    clen = 1;        /* Number of data items in the character */
 #ifdef SUPPORT_UTF
     if (utf) { GETCHARLEN(c, ptr, clen); } else
 #endif  /* SUPPORT_UTF */
@@ -789,7 +789,7 @@ for (;;)
             offsets[0] = (int)(current_subject - start_subject);
             offsets[1] = (int)(ptr - start_subject);
             DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
-              offsets[1] - offsets[0], current_subject));
+              offsets[1] - offsets[0], (char *)current_subject));
             }
           if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
             {
@@ -2797,9 +2797,12 @@ for (;;)
             {
             int charcount = local_offsets[rc+1] - local_offsets[rc];
 #ifdef SUPPORT_UTF
-            const pcre_uchar *p = start_subject + local_offsets[rc];
-            const pcre_uchar *pp = start_subject + local_offsets[rc+1];
-            while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
+            if (utf)
+              { 
+              const pcre_uchar *p = start_subject + local_offsets[rc];
+              const pcre_uchar *pp = start_subject + local_offsets[rc+1];
+              while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
+              } 
 #endif
             if (charcount > 0)
               {
@@ -2898,7 +2901,7 @@ for (;;)
             const pcre_uchar *pp = local_ptr;
             charcount = (int)(pp - p);
 #ifdef SUPPORT_UTF
-            while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
+            if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
 #endif
             ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
             }
@@ -2980,9 +2983,12 @@ for (;;)
           else
             {
 #ifdef SUPPORT_UTF
-            const pcre_uchar *p = start_subject + local_offsets[0];
-            const pcre_uchar *pp = start_subject + local_offsets[1];
-            while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
+            if (utf)
+              { 
+              const pcre_uchar *p = start_subject + local_offsets[0];
+              const pcre_uchar *pp = start_subject + local_offsets[1];
+              while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
+              } 
 #endif
             ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
             if (repeat_state_offset >= 0)
diff --git a/testdata/testinput8 b/testdata/testinput8
index 2b628a3..e235445 100644
--- a/testdata/testinput8
+++ b/testdata/testinput8
@@ -4787,4 +4787,15 @@
 /abcdef/
    abc\R
 
+/<H((?(?!<H|F>)(.)|(?R))++)*F>/
+    text <H more text <H texting more  hexA0-"\xA0"    hex above 7F-"\xBC" F> text xxxxx <H text F> text F> text2 <H text sample F> more text.
+
+/^(?>.{4})abc|^\w\w.xabcd/
+    xxxxabcd
+    xx\xa0xabcd 
+
+/^(.{4}){2}+abc|^\w\w.x\w\w\w\wabcd/
+    xxxxxxxxabcd
+    xx\xa0xxxxxabcd 
+
 /-- End of testinput8 --/
diff --git a/testdata/testoutput8 b/testdata/testoutput8
index c5f01e1..73e0eae 100644
--- a/testdata/testoutput8
+++ b/testdata/testoutput8
@@ -7996,4 +7996,24 @@ Partial match: \x0d\x0d\x0d
    abc\R
 Error -30 (invalid data in workspace for DFA restart)
 
+/<H((?(?!<H|F>)(.)|(?R))++)*F>/
+    text <H more text <H texting more  hexA0-"\xA0"    hex above 7F-"\xBC" F> text xxxxx <H text F> text F> text2 <H text sample F> more text.
+ 0: <H more text <H texting more  hexA0-"\xa0"    hex above 7F-"\xbc" F> text xxxxx <H text F> text F>
+
+/^(?>.{4})abc|^\w\w.xabcd/
+    xxxxabcd
+ 0: xxxxabcd
+ 1: xxxxabc
+    xx\xa0xabcd 
+ 0: xx\xa0xabcd
+ 1: xx\xa0xabc
+
+/^(.{4}){2}+abc|^\w\w.x\w\w\w\wabcd/
+    xxxxxxxxabcd
+ 0: xxxxxxxxabcd
+ 1: xxxxxxxxabc
+    xx\xa0xxxxxabcd 
+ 0: xx\xa0xxxxxabcd
+ 1: xx\xa0xxxxxabc
+
 /-- End of testinput8 --/
author	ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2012-06-17 19:08:41 +0000
committer	ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2012-06-17 19:08:41 +0000
commit	37b4efccc0529613ee2c93004d33ef681c410a9f (patch)
tree	488ac9efcee13934e6990762bd969181d192484e
parent	9a61463ee5099a08610fb08ada15b3c8fc3e0ad5 (diff)
download	pcre-37b4efccc0529613ee2c93004d33ef681c410a9f.tar.gz