Fix #-comment bugs in UTF-8 mode with PCRE_NEWLINE_ANY.

git-svn-id: svn://vcs.exim.org/pcre/code/trunk@556 2f5784b3-3f2a-0410-8824-cb99058d5e15
author: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2010-10-26 11:06:44 +0000
committer: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2010-10-26 11:06:44 +0000
commit: 32ebaf1e2edf0e1a456e0fa9ccf8f8c71c56ef34 (patch)
tree: c115236dcc3c169a9be96de8b61dc2f2e72fd5c7
parent: 6408a103fd1a51242975a7e80a309a0c0c3187d9 (diff)
download: pcre-32ebaf1e2edf0e1a456e0fa9ccf8f8c71c56ef34.tar.gz
5 files changed, 150 insertions, 12 deletions
diff --git a/ChangeLog b/ChangeLog
index 5b8d840..e03961b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -39,6 +39,19 @@ Version 8.11 10-Oct-2010
     /t\b/ matched against "cat" with PCRE_PARTIAL_HARD set did return a partial
     match rather than a full match, which was wrong by the old rules, but is 
     now correct.] 
+    
+6.  There was a bug in the handling of #-introduced comments, recognized when
+    PCRE_EXTENDED is set, when PCRE_NEWLINE_ANY and PCRE_UTF8 were also set.
+    If a UTF-8 multi-byte character included the byte 0x85 (e.g. U+0445, whose
+    UTF-8 encoding is 0xd1,0x85), this was misinterpreted as a newline when
+    scanning for the end of the comment. (*Character* 0x85 is an "any" newline,
+    but *byte* 0x85 is not, in UTF-8 mode). This bug was present in several 
+    places in pcre_compile().
+    
+7.  Related to (6) above, when pcre_compile() was skipping #-introduced 
+    comments when looking ahead for named forward references to subpatterns, 
+    the only newline sequence it recognized was NL. It now handles newlines 
+    according to the set newline convention.
 
 
 Version 8.10 25-Jun-2010
diff --git a/doc/pcrepattern.3 b/doc/pcrepattern.3
index 963703c..9f9f1ef 100644
--- a/doc/pcrepattern.3
+++ b/doc/pcrepattern.3
@@ -66,6 +66,7 @@ discussed in the
 page.
 .
 .
+.\" HTML <a name="newlines"></a>
 .SH "NEWLINE CONVENTIONS"
 .rs
 .sp
@@ -2109,7 +2110,25 @@ that make up a comment play no part in the pattern matching at all.
 .P
 If the PCRE_EXTENDED option is set, an unescaped # character outside a
 character class introduces a comment that continues to immediately after the
-next newline in the pattern.
+next newline character or character sequence in the pattern. Which characters 
+are interpreted as newlines is controlled by the options passed to 
+\fBpcre_compile()\fP or by a special sequence at the start of the pattern, as
+described in the section entitled
+.\" HTML <a href="#newlines">
+.\" </a>
+"Newline conventions"
+.\"
+above. Note that the end of a comment is a literal newline sequence in the pattern;
+escape sequences that happen to represent a newline do not terminate a comment. 
+For example, consider this pattern when PCRE_EXTENDED is set, and the default 
+newline convention is in force:
+.sp
+  abc #comment \en still comment
+.sp
+On encountering the # character, \fBpcre_compile()\fP skips along, looking for 
+a newline in the pattern. The sequence \en is still literal at this stage, so
+it does not terminate the comment. Only an actual character with the code value
+0x0a does so.
 .
 .
 .\" HTML <a name="recursion"></a>
diff --git a/pcre_compile.c b/pcre_compile.c
index 9aa9246..0115eb3 100644
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -1110,6 +1110,7 @@ Arguments:
   name         name to seek, or NULL if seeking a numbered subpattern
   lorn         name length, or subpattern number if name is NULL
   xmode        TRUE if we are in /x mode
+  utf8         TRUE if we are in UTF-8 mode 
   count        pointer to the current capturing subpattern number (updated)
 
 Returns:       the number of the named subpattern, or -1 if not found
@@ -1117,7 +1118,7 @@ Returns:       the number of the named subpattern, or -1 if not found
 
 static int
 find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
-  BOOL xmode, int *count)
+  BOOL xmode, BOOL utf8, int *count)
 {
 uschar *ptr = *ptrptr;
 int start_count = *count;
@@ -1278,7 +1279,15 @@ for (; *ptr != 0; ptr++)
 
   if (xmode && *ptr == CHAR_NUMBER_SIGN)
     {
-    while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
+    ptr++; 
+    while (*ptr != 0)
+      {
+      if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
+      ptr++;
+#ifdef SUPPORT_UTF8         
+      if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
+#endif
+      }
     if (*ptr == 0) goto FAIL_EXIT;
     continue;
     }
@@ -1287,7 +1296,7 @@ for (; *ptr != 0; ptr++)
 
   if (*ptr == CHAR_LEFT_PARENTHESIS)
     {
-    int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);
+    int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, count);
     if (rc > 0) return rc;
     if (*ptr == 0) goto FAIL_EXIT;
     }
@@ -1333,12 +1342,14 @@ Arguments:
   name         name to seek, or NULL if seeking a numbered subpattern
   lorn         name length, or subpattern number if name is NULL
   xmode        TRUE if we are in /x mode
+  utf8         TRUE if we are in UTF-8 mode 
 
 Returns:       the number of the found subpattern, or -1 if not found
 */
 
 static int
-find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)
+find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode,
+  BOOL utf8)
 {
 uschar *ptr = (uschar *)cd->start_pattern;
 int count = 0;
@@ -1351,7 +1362,7 @@ matching closing parens. That is why we have to have a loop. */
 
 for (;;)
   {
-  rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);
+  rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, &count);
   if (rc > 0 || *ptr++ == 0) break;
   }
 
@@ -2515,8 +2526,15 @@ if ((options & PCRE_EXTENDED) != 0)
     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
     if (*ptr == CHAR_NUMBER_SIGN)
       {
-      while (*(++ptr) != 0)
+      ptr++; 
+      while (*ptr != 0)
+        {
         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
+        ptr++;
+#ifdef SUPPORT_UTF8         
+        if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
+#endif
+        }
       }
     else break;
     }
@@ -2552,8 +2570,15 @@ if ((options & PCRE_EXTENDED) != 0)
     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
     if (*ptr == CHAR_NUMBER_SIGN)
       {
-      while (*(++ptr) != 0)
+      ptr++; 
+      while (*ptr != 0)
+        {
         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
+        ptr++;
+#ifdef SUPPORT_UTF8         
+        if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
+#endif
+        }
       }
     else break;
     }
@@ -3126,9 +3151,14 @@ for (;; ptr++)
     if ((cd->ctypes[c] & ctype_space) != 0) continue;
     if (c == CHAR_NUMBER_SIGN)
       {
-      while (*(++ptr) != 0)
+      ptr++; 
+      while (*ptr != 0)
         {
         if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
+        ptr++;
+#ifdef SUPPORT_UTF8         
+        if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
+#endif
         }
       if (*ptr != 0) continue;
 
@@ -5036,7 +5066,7 @@ for (;; ptr++)
         /* Search the pattern for a forward reference */
 
         else if ((i = find_parens(cd, name, namelen,
-                        (options & PCRE_EXTENDED) != 0)) > 0)
+                        (options & PCRE_EXTENDED) != 0, utf8)) > 0)
           {
           PUT2(code, 2+LINK_SIZE, i);
           code[1+LINK_SIZE]++;
@@ -5382,7 +5412,7 @@ for (;; ptr++)
             }
           else if ((recno =                /* Forward back reference */
                     find_parens(cd, name, namelen,
-                      (options & PCRE_EXTENDED) != 0)) <= 0)
+                      (options & PCRE_EXTENDED) != 0, utf8)) <= 0)
             {
             *errorcodeptr = ERR15;
             goto FAILED;
@@ -5493,7 +5523,7 @@ for (;; ptr++)
             if (called == NULL)
               {
               if (find_parens(cd, NULL, recno,
-                    (options & PCRE_EXTENDED) != 0) < 0)
+                    (options & PCRE_EXTENDED) != 0, utf8) < 0)
                 {
                 *errorcodeptr = ERR15;
                 goto FAILED;
diff --git a/testdata/testinput5 b/testdata/testinput5
index 5e200b3..32cfc65 100644
--- a/testdata/testinput5
+++ b/testdata/testinput5
@@ -794,4 +794,19 @@ can't tell the difference.) --/
     \x{a2} \x{84} 
     A Z 
 
+'A#хц'8x<any>BZ
+
+'A#хц
+  PQ'8x<any>BZ
+  
+/a+#хaa
+  z#XX?/8x<any>BZ 
+
+/a+#хaa
+  z#х?/8x<any>BZ 
+
+/\g{A}xxx#bXX(?'A'123)
+(?'A'456)/8x<any>BZ
+
+/\g{A}xxx#bх(?'A'123)
+(?'A'456)/8x<any>BZ
+
 /-- End of testinput5 --/
diff --git a/testdata/testoutput5 b/testdata/testoutput5
index b3ce48a..8784ebe 100644
--- a/testdata/testoutput5
+++ b/testdata/testoutput5
@@ -2222,4 +2222,65 @@ Starting byte set: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x0b \x0e
     A Z 
  0: A Z
 
+'A#хц'8x<any>BZ
+------------------------------------------------------------------
+        Bra
+        A
+        Ket
+        End
+------------------------------------------------------------------
+
+'A#хц
+  PQ'8x<any>BZ
+------------------------------------------------------------------
+        Bra
+        APQ
+        Ket
+        End
+------------------------------------------------------------------
+  
+/a+#хaa
+  z#XX?/8x<any>BZ 
+------------------------------------------------------------------
+        Bra
+        a++
+        z
+        Ket
+        End
+------------------------------------------------------------------
+
+/a+#хaa
+  z#х?/8x<any>BZ 
+------------------------------------------------------------------
+        Bra
+        a++
+        z
+        Ket
+        End
+------------------------------------------------------------------
+
+/\g{A}xxx#bXX(?'A'123)
+(?'A'456)/8x<any>BZ
+------------------------------------------------------------------
+        Bra
+        \1
+        xxx
+        CBra 1
+        456
+        Ket
+        Ket
+        End
+------------------------------------------------------------------
+
+/\g{A}xxx#bх(?'A'123)
+(?'A'456)/8x<any>BZ
+------------------------------------------------------------------
+        Bra
+        \1
+        xxx
+        CBra 1
+        456
+        Ket
+        Ket
+        End
+------------------------------------------------------------------
+
 /-- End of testinput5 --/
author	ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2010-10-26 11:06:44 +0000
committer	ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2010-10-26 11:06:44 +0000
commit	32ebaf1e2edf0e1a456e0fa9ccf8f8c71c56ef34 (patch)
tree	c115236dcc3c169a9be96de8b61dc2f2e72fd5c7
parent	6408a103fd1a51242975a7e80a309a0c0c3187d9 (diff)
download	pcre-32ebaf1e2edf0e1a456e0fa9ccf8f8c71c56ef34.tar.gz