diff options
author | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2010-10-26 11:06:44 +0000 |
---|---|---|
committer | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2010-10-26 11:06:44 +0000 |
commit | 32ebaf1e2edf0e1a456e0fa9ccf8f8c71c56ef34 (patch) | |
tree | c115236dcc3c169a9be96de8b61dc2f2e72fd5c7 | |
parent | 6408a103fd1a51242975a7e80a309a0c0c3187d9 (diff) | |
download | pcre-32ebaf1e2edf0e1a456e0fa9ccf8f8c71c56ef34.tar.gz |
Fix #-comment bugs in UTF-8 mode with PCRE_NEWLINE_ANY.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@556 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- | ChangeLog | 13 | ||||
-rw-r--r-- | doc/pcrepattern.3 | 21 | ||||
-rw-r--r-- | pcre_compile.c | 52 | ||||
-rw-r--r-- | testdata/testinput5 | 15 | ||||
-rw-r--r-- | testdata/testoutput5 | 61 |
5 files changed, 150 insertions, 12 deletions
@@ -39,6 +39,19 @@ Version 8.11 10-Oct-2010 /t\b/ matched against "cat" with PCRE_PARTIAL_HARD set did return a partial match rather than a full match, which was wrong by the old rules, but is now correct.] + +6. There was a bug in the handling of #-introduced comments, recognized when + PCRE_EXTENDED is set, when PCRE_NEWLINE_ANY and PCRE_UTF8 were also set. + If a UTF-8 multi-byte character included the byte 0x85 (e.g. U+0445, whose + UTF-8 encoding is 0xd1,0x85), this was misinterpreted as a newline when + scanning for the end of the comment. (*Character* 0x85 is an "any" newline, + but *byte* 0x85 is not, in UTF-8 mode). This bug was present in several + places in pcre_compile(). + +7. Related to (6) above, when pcre_compile() was skipping #-introduced + comments when looking ahead for named forward references to subpatterns, + the only newline sequence it recognized was NL. It now handles newlines + according to the set newline convention. Version 8.10 25-Jun-2010 diff --git a/doc/pcrepattern.3 b/doc/pcrepattern.3 index 963703c..9f9f1ef 100644 --- a/doc/pcrepattern.3 +++ b/doc/pcrepattern.3 @@ -66,6 +66,7 @@ discussed in the page. . . +.\" HTML <a name="newlines"></a> .SH "NEWLINE CONVENTIONS" .rs .sp @@ -2109,7 +2110,25 @@ that make up a comment play no part in the pattern matching at all. .P If the PCRE_EXTENDED option is set, an unescaped # character outside a character class introduces a comment that continues to immediately after the -next newline in the pattern. +next newline character or character sequence in the pattern. Which characters +are interpreted as newlines is controlled by the options passed to +\fBpcre_compile()\fP or by a special sequence at the start of the pattern, as +described in the section entitled +.\" HTML <a href="#newlines"> +.\" </a> +"Newline conventions" +.\" +above. Note that the end of a comment is a literal newline sequence in the pattern; +escape sequences that happen to represent a newline do not terminate a comment. 
+For example, consider this pattern when PCRE_EXTENDED is set, and the default +newline convention is in force: +.sp + abc #comment \en still comment +.sp +On encountering the # character, \fBpcre_compile()\fP skips along, looking for +a newline in the pattern. The sequence \en is still literal at this stage, so +it does not terminate the comment. Only an actual character with the code value +0x0a does so. . . .\" HTML <a name="recursion"></a> diff --git a/pcre_compile.c b/pcre_compile.c index 9aa9246..0115eb3 100644 --- a/pcre_compile.c +++ b/pcre_compile.c @@ -1110,6 +1110,7 @@ Arguments: name name to seek, or NULL if seeking a numbered subpattern lorn name length, or subpattern number if name is NULL xmode TRUE if we are in /x mode + utf8 TRUE if we are in UTF-8 mode count pointer to the current capturing subpattern number (updated) Returns: the number of the named subpattern, or -1 if not found @@ -1117,7 +1118,7 @@ Returns: the number of the named subpattern, or -1 if not found static int find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn, - BOOL xmode, int *count) + BOOL xmode, BOOL utf8, int *count) { uschar *ptr = *ptrptr; int start_count = *count; @@ -1278,7 +1279,15 @@ for (; *ptr != 0; ptr++) if (xmode && *ptr == CHAR_NUMBER_SIGN) { - while (*(++ptr) != 0 && *ptr != CHAR_NL) {}; + ptr++; + while (*ptr != 0) + { + if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; } + ptr++; +#ifdef SUPPORT_UTF8 + if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++; +#endif + } if (*ptr == 0) goto FAIL_EXIT; continue; } @@ -1287,7 +1296,7 @@ for (; *ptr != 0; ptr++) if (*ptr == CHAR_LEFT_PARENTHESIS) { - int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count); + int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, count); if (rc > 0) return rc; if (*ptr == 0) goto FAIL_EXIT; } @@ -1333,12 +1342,14 @@ Arguments: name name to seek, or NULL if seeking a numbered subpattern lorn name length, or subpattern number if name is NULL xmode TRUE 
if we are in /x mode + utf8 TRUE if we are in UTF-8 mode Returns: the number of the found subpattern, or -1 if not found */ static int -find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode) +find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode, + BOOL utf8) { uschar *ptr = (uschar *)cd->start_pattern; int count = 0; @@ -1351,7 +1362,7 @@ matching closing parens. That is why we have to have a loop. */ for (;;) { - rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count); + rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, &count); if (rc > 0 || *ptr++ == 0) break; } @@ -2515,8 +2526,15 @@ if ((options & PCRE_EXTENDED) != 0) while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++; if (*ptr == CHAR_NUMBER_SIGN) { - while (*(++ptr) != 0) + ptr++; + while (*ptr != 0) + { if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; } + ptr++; +#ifdef SUPPORT_UTF8 + if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++; +#endif + } } else break; } @@ -2552,8 +2570,15 @@ if ((options & PCRE_EXTENDED) != 0) while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++; if (*ptr == CHAR_NUMBER_SIGN) { - while (*(++ptr) != 0) + ptr++; + while (*ptr != 0) + { if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; } + ptr++; +#ifdef SUPPORT_UTF8 + if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++; +#endif + } } else break; } @@ -3126,9 +3151,14 @@ for (;; ptr++) if ((cd->ctypes[c] & ctype_space) != 0) continue; if (c == CHAR_NUMBER_SIGN) { - while (*(++ptr) != 0) + ptr++; + while (*ptr != 0) { if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; } + ptr++; +#ifdef SUPPORT_UTF8 + if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++; +#endif } if (*ptr != 0) continue; @@ -5036,7 +5066,7 @@ for (;; ptr++) /* Search the pattern for a forward reference */ else if ((i = find_parens(cd, name, namelen, - (options & PCRE_EXTENDED) != 0)) > 0) + (options & PCRE_EXTENDED) != 0, utf8)) > 0) { PUT2(code, 2+LINK_SIZE, i); code[1+LINK_SIZE]++; @@ -5382,7 +5412,7 @@ for (;; ptr++) } else if ((recno = /* 
Forward back reference */ find_parens(cd, name, namelen, - (options & PCRE_EXTENDED) != 0)) <= 0) + (options & PCRE_EXTENDED) != 0, utf8)) <= 0) { *errorcodeptr = ERR15; goto FAILED; @@ -5493,7 +5523,7 @@ for (;; ptr++) if (called == NULL) { if (find_parens(cd, NULL, recno, - (options & PCRE_EXTENDED) != 0) < 0) + (options & PCRE_EXTENDED) != 0, utf8) < 0) { *errorcodeptr = ERR15; goto FAILED; diff --git a/testdata/testinput5 b/testdata/testinput5 index 5e200b3..32cfc65 100644 --- a/testdata/testinput5 +++ b/testdata/testinput5 @@ -794,4 +794,19 @@ can't tell the difference.) --/ \x{a2} \x{84} A Z +'A#хц'8x<any>BZ + +'A#хц + PQ'8x<any>BZ + +/a+#хaa + z#XX?/8x<any>BZ + +/a+#хaa + z#х?/8x<any>BZ + +/\g{A}xxx#bXX(?'A'123)
(?'A'456)/8x<any>BZ + +/\g{A}xxx#bх(?'A'123)
(?'A'456)/8x<any>BZ + /-- End of testinput5 --/ diff --git a/testdata/testoutput5 b/testdata/testoutput5 index b3ce48a..8784ebe 100644 --- a/testdata/testoutput5 +++ b/testdata/testoutput5 @@ -2222,4 +2222,65 @@ Starting byte set: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x0b \x0e A Z 0: A Z +'A#хц'8x<any>BZ +------------------------------------------------------------------ + Bra + A + Ket + End +------------------------------------------------------------------ + +'A#хц + PQ'8x<any>BZ +------------------------------------------------------------------ + Bra + APQ + Ket + End +------------------------------------------------------------------ + +/a+#хaa + z#XX?/8x<any>BZ +------------------------------------------------------------------ + Bra + a++ + z + Ket + End +------------------------------------------------------------------ + +/a+#хaa + z#х?/8x<any>BZ +------------------------------------------------------------------ + Bra + a++ + z + Ket + End +------------------------------------------------------------------ + +/\g{A}xxx#bXX(?'A'123)
(?'A'456)/8x<any>BZ +------------------------------------------------------------------ + Bra + \1 + xxx + CBra 1 + 456 + Ket + Ket + End +------------------------------------------------------------------ + +/\g{A}xxx#bх(?'A'123)
(?'A'456)/8x<any>BZ +------------------------------------------------------------------ + Bra + \1 + xxx + CBra 1 + 456 + Ket + Ket + End +------------------------------------------------------------------ + /-- End of testinput5 --/ |