summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>2010-10-26 11:06:44 +0000
committerph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>2010-10-26 11:06:44 +0000
commit32ebaf1e2edf0e1a456e0fa9ccf8f8c71c56ef34 (patch)
treec115236dcc3c169a9be96de8b61dc2f2e72fd5c7
parent6408a103fd1a51242975a7e80a309a0c0c3187d9 (diff)
downloadpcre-32ebaf1e2edf0e1a456e0fa9ccf8f8c71c56ef34.tar.gz
Fix #-comment bugs in UTF-8 mode with PCRE_NEWLINE_ANY.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@556 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r--ChangeLog13
-rw-r--r--doc/pcrepattern.321
-rw-r--r--pcre_compile.c52
-rw-r--r--testdata/testinput515
-rw-r--r--testdata/testoutput561
5 files changed, 150 insertions, 12 deletions
diff --git a/ChangeLog b/ChangeLog
index 5b8d840..e03961b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -39,6 +39,19 @@ Version 8.11 10-Oct-2010
/t\b/ matched against "cat" with PCRE_PARTIAL_HARD set did return a partial
match rather than a full match, which was wrong by the old rules, but is
now correct.]
+
+6. There was a bug in the handling of #-introduced comments, recognized when
+ PCRE_EXTENDED is set, when PCRE_NEWLINE_ANY and PCRE_UTF8 were also set.
+ If a UTF-8 multi-byte character included the byte 0x85 (e.g. U+0445, whose
+ UTF-8 encoding is 0xd1,0x85), this was misinterpreted as a newline when
+ scanning for the end of the comment. (*Character* 0x85 is an "any" newline,
+ but *byte* 0x85 is not, in UTF-8 mode). This bug was present in several
+ places in pcre_compile().
+
+7. Related to (6) above, when pcre_compile() was skipping #-introduced
+ comments when looking ahead for named forward references to subpatterns,
+ the only newline sequence it recognized was NL. It now handles newlines
+ according to the set newline convention.
Version 8.10 25-Jun-2010
diff --git a/doc/pcrepattern.3 b/doc/pcrepattern.3
index 963703c..9f9f1ef 100644
--- a/doc/pcrepattern.3
+++ b/doc/pcrepattern.3
@@ -66,6 +66,7 @@ discussed in the
page.
.
.
+.\" HTML <a name="newlines"></a>
.SH "NEWLINE CONVENTIONS"
.rs
.sp
@@ -2109,7 +2110,25 @@ that make up a comment play no part in the pattern matching at all.
.P
If the PCRE_EXTENDED option is set, an unescaped # character outside a
character class introduces a comment that continues to immediately after the
-next newline in the pattern.
+next newline character or character sequence in the pattern. Which characters
+are interpreted as newlines is controlled by the options passed to
+\fBpcre_compile()\fP or by a special sequence at the start of the pattern, as
+described in the section entitled
+.\" HTML <a href="#newlines">
+.\" </a>
+"Newline conventions"
+.\"
+above. Note that the end of a comment is a literal newline sequence in the pattern;
+escape sequences that happen to represent a newline do not terminate a comment.
+For example, consider this pattern when PCRE_EXTENDED is set, and the default
+newline convention is in force:
+.sp
+ abc #comment \en still comment
+.sp
+On encountering the # character, \fBpcre_compile()\fP skips along, looking for
+a newline in the pattern. The sequence \en is still literal at this stage, so
+it does not terminate the comment. Only an actual character with the code value
+0x0a does so.
.
.
.\" HTML <a name="recursion"></a>
diff --git a/pcre_compile.c b/pcre_compile.c
index 9aa9246..0115eb3 100644
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -1110,6 +1110,7 @@ Arguments:
name name to seek, or NULL if seeking a numbered subpattern
lorn name length, or subpattern number if name is NULL
xmode TRUE if we are in /x mode
+ utf8 TRUE if we are in UTF-8 mode
count pointer to the current capturing subpattern number (updated)
Returns: the number of the named subpattern, or -1 if not found
@@ -1117,7 +1118,7 @@ Returns: the number of the named subpattern, or -1 if not found
static int
find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
- BOOL xmode, int *count)
+ BOOL xmode, BOOL utf8, int *count)
{
uschar *ptr = *ptrptr;
int start_count = *count;
@@ -1278,7 +1279,15 @@ for (; *ptr != 0; ptr++)
if (xmode && *ptr == CHAR_NUMBER_SIGN)
{
- while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
+ ptr++;
+ while (*ptr != 0)
+ {
+ if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
+ ptr++;
+#ifdef SUPPORT_UTF8
+ if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
+#endif
+ }
if (*ptr == 0) goto FAIL_EXIT;
continue;
}
@@ -1287,7 +1296,7 @@ for (; *ptr != 0; ptr++)
if (*ptr == CHAR_LEFT_PARENTHESIS)
{
- int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);
+ int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, count);
if (rc > 0) return rc;
if (*ptr == 0) goto FAIL_EXIT;
}
@@ -1333,12 +1342,14 @@ Arguments:
name name to seek, or NULL if seeking a numbered subpattern
lorn name length, or subpattern number if name is NULL
xmode TRUE if we are in /x mode
+ utf8 TRUE if we are in UTF-8 mode
Returns: the number of the found subpattern, or -1 if not found
*/
static int
-find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)
+find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode,
+ BOOL utf8)
{
uschar *ptr = (uschar *)cd->start_pattern;
int count = 0;
@@ -1351,7 +1362,7 @@ matching closing parens. That is why we have to have a loop. */
for (;;)
{
- rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);
+ rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, &count);
if (rc > 0 || *ptr++ == 0) break;
}
@@ -2515,8 +2526,15 @@ if ((options & PCRE_EXTENDED) != 0)
while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
if (*ptr == CHAR_NUMBER_SIGN)
{
- while (*(++ptr) != 0)
+ ptr++;
+ while (*ptr != 0)
+ {
if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
+ ptr++;
+#ifdef SUPPORT_UTF8
+ if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
+#endif
+ }
}
else break;
}
@@ -2552,8 +2570,15 @@ if ((options & PCRE_EXTENDED) != 0)
while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
if (*ptr == CHAR_NUMBER_SIGN)
{
- while (*(++ptr) != 0)
+ ptr++;
+ while (*ptr != 0)
+ {
if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
+ ptr++;
+#ifdef SUPPORT_UTF8
+ if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
+#endif
+ }
}
else break;
}
@@ -3126,9 +3151,14 @@ for (;; ptr++)
if ((cd->ctypes[c] & ctype_space) != 0) continue;
if (c == CHAR_NUMBER_SIGN)
{
- while (*(++ptr) != 0)
+ ptr++;
+ while (*ptr != 0)
{
if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
+ ptr++;
+#ifdef SUPPORT_UTF8
+ if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
+#endif
}
if (*ptr != 0) continue;
@@ -5036,7 +5066,7 @@ for (;; ptr++)
/* Search the pattern for a forward reference */
else if ((i = find_parens(cd, name, namelen,
- (options & PCRE_EXTENDED) != 0)) > 0)
+ (options & PCRE_EXTENDED) != 0, utf8)) > 0)
{
PUT2(code, 2+LINK_SIZE, i);
code[1+LINK_SIZE]++;
@@ -5382,7 +5412,7 @@ for (;; ptr++)
}
else if ((recno = /* Forward back reference */
find_parens(cd, name, namelen,
- (options & PCRE_EXTENDED) != 0)) <= 0)
+ (options & PCRE_EXTENDED) != 0, utf8)) <= 0)
{
*errorcodeptr = ERR15;
goto FAILED;
@@ -5493,7 +5523,7 @@ for (;; ptr++)
if (called == NULL)
{
if (find_parens(cd, NULL, recno,
- (options & PCRE_EXTENDED) != 0) < 0)
+ (options & PCRE_EXTENDED) != 0, utf8) < 0)
{
*errorcodeptr = ERR15;
goto FAILED;
diff --git a/testdata/testinput5 b/testdata/testinput5
index 5e200b3..32cfc65 100644
--- a/testdata/testinput5
+++ b/testdata/testinput5
@@ -794,4 +794,19 @@ can't tell the difference.) --/
\x{a2} \x{84}
A Z
+'A#хц'8x<any>BZ
+
+'A#хц
+ PQ'8x<any>BZ
+
+/a+#хaa
+ z#XX?/8x<any>BZ
+
+/a+#хaa
+ z#х?/8x<any>BZ
+
+/\g{A}xxx#bXX(?'A'123) (?'A'456)/8x<any>BZ
+
+/\g{A}xxx#bх(?'A'123) (?'A'456)/8x<any>BZ
+
/-- End of testinput5 --/
diff --git a/testdata/testoutput5 b/testdata/testoutput5
index b3ce48a..8784ebe 100644
--- a/testdata/testoutput5
+++ b/testdata/testoutput5
@@ -2222,4 +2222,65 @@ Starting byte set: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x0b \x0e
A Z
0: A Z
+'A#хц'8x<any>BZ
+------------------------------------------------------------------
+ Bra
+ A
+ Ket
+ End
+------------------------------------------------------------------
+
+'A#хц
+ PQ'8x<any>BZ
+------------------------------------------------------------------
+ Bra
+ APQ
+ Ket
+ End
+------------------------------------------------------------------
+
+/a+#хaa
+ z#XX?/8x<any>BZ
+------------------------------------------------------------------
+ Bra
+ a++
+ z
+ Ket
+ End
+------------------------------------------------------------------
+
+/a+#хaa
+ z#х?/8x<any>BZ
+------------------------------------------------------------------
+ Bra
+ a++
+ z
+ Ket
+ End
+------------------------------------------------------------------
+
+/\g{A}xxx#bXX(?'A'123) (?'A'456)/8x<any>BZ
+------------------------------------------------------------------
+ Bra
+ \1
+ xxx
+ CBra 1
+ 456
+ Ket
+ Ket
+ End
+------------------------------------------------------------------
+
+/\g{A}xxx#bх(?'A'123) (?'A'456)/8x<any>BZ
+------------------------------------------------------------------
+ Bra
+ \1
+ xxx
+ CBra 1
+ 456
+ Ket
+ Ket
+ End
+------------------------------------------------------------------
+
/-- End of testinput5 --/