diff options
author | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2007-11-13 11:07:16 +0000 |
---|---|---|
committer | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2007-11-13 11:07:16 +0000 |
commit | 215cbb51a382717bf585b7f0196f3a72239a8659 (patch) | |
tree | 9068ed7a90e7052a2b77a932e5a52eb42a764754 | |
parent | 68565d28dbbdfa6d073328347815306dd96b37a6 (diff) | |
download | pcre-215cbb51a382717bf585b7f0196f3a72239a8659.tar.gz |
Fix [\S] etc. bug in UTF-8 mode with characters > 255.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@264 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- | ChangeLog | 4 | ||||
-rw-r--r-- | pcre_compile.c | 32 | ||||
-rw-r--r-- | testdata/testinput4 | 30 | ||||
-rw-r--r-- | testdata/testoutput2 | 2 | ||||
-rw-r--r-- | testdata/testoutput4 | 62 |
5 files changed, 119 insertions, 11 deletions
@@ -6,6 +6,10 @@ Version 7.5 12-Nov-07 1. Applied a patch from Craig: "This patch makes it possible to 'ignore' values in parens when parsing an RE using the C++ wrapper." + +2. Negative specials like \S did not work in character classes in UTF-8 mode. + Characters greater than 255 were excluded from the class instead of being + included. Version 7.4 21-Sep-07 diff --git a/pcre_compile.c b/pcre_compile.c index 3994781..29a6c3d 100644 --- a/pcre_compile.c +++ b/pcre_compile.c @@ -2383,6 +2383,7 @@ req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0; for (;; ptr++) { BOOL negate_class; + BOOL should_flip_negation; BOOL possessive_quantifier; BOOL is_quantifier; BOOL is_recurse; @@ -2631,6 +2632,12 @@ for (;; ptr++) else break; } + /* If a class contains a negative special such as \S, we need to flip the + negation flag at the end, so that support for characters > 255 works + correctly (they are all included in the class). */ + + should_flip_negation = FALSE; + /* Keep a count of chars with values < 256 so that we can optimize the case of just a single character (as long as it's < 256). However, For higher valued UTF-8 characters, we don't yet do any optimization. */ @@ -2805,6 +2812,7 @@ for (;; ptr++) continue; case ESC_D: + should_flip_negation = TRUE; for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit]; continue; @@ -2813,6 +2821,7 @@ for (;; ptr++) continue; case ESC_W: + should_flip_negation = TRUE; for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word]; continue; @@ -2822,6 +2831,7 @@ for (;; ptr++) continue; case ESC_S: + should_flip_negation = TRUE; for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space]; classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */ continue; @@ -3327,11 +3337,14 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ zeroreqbyte = reqbyte; /* If there are characters with values > 255, we have to compile an - extended class, with its own opcode. If there are no characters < 256, - we can omit the bitmap in the actual compiled code. */ + extended class, with its own opcode, unless there was a negated special + such as \S in the class, because in that case all characters > 255 are in + the class, so any that were explicitly given as well can be ignored. If + (when there are explicit characters > 255 that must be listed) there are no + characters < 256, we can omit the bitmap in the actual compiled code. */ #ifdef SUPPORT_UTF8 - if (class_utf8) + if (class_utf8 && !should_flip_negation) { *class_utf8data++ = XCL_END; /* Marks the end of extra data */ *code++ = OP_XCLASS; @@ -3357,20 +3370,19 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ } #endif - /* If there are no characters > 255, negate the 32-byte map if necessary, - and copy it into the code vector. If this is the first thing in the branch, - there can be no first char setting, whatever the repeat count. Any reqbyte - setting must remain unchanged after any kind of repeat. */ - + /* If there are no characters > 255, set the opcode to OP_CLASS or + OP_NCLASS, depending on whether the whole class was negated and whether + there were negative specials such as \S in the class. Then copy the 32-byte + map into the code vector, negating it if necessary. */ + + *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS; if (negate_class) { - *code++ = OP_NCLASS; if (lengthptr == NULL) /* Save time in the pre-compile phase */ for (c = 0; c < 32; c++) code[c] = ~classbits[c]; } else { - *code++ = OP_CLASS; memcpy(code, classbits, 32); } code += 32; diff --git a/testdata/testinput4 b/testdata/testinput4 index 630fb1d..f9306de 100644 --- a/testdata/testinput4 +++ b/testdata/testinput4 @@ -535,4 +535,34 @@ /\W{2}/8g +\x{a3}== +/\S/8g + \x{442}\x{435}\x{441}\x{442} + +/[\S]/8g + \x{442}\x{435}\x{441}\x{442} + +/\D/8g + \x{442}\x{435}\x{441}\x{442} + +/[\D]/8g + \x{442}\x{435}\x{441}\x{442} + +/\W/8g + \x{2442}\x{2435}\x{2441}\x{2442} + +/[\W]/8g + \x{2442}\x{2435}\x{2441}\x{2442} + +/[\S\s]*/8 + abc\n\r\x{442}\x{435}\x{441}\x{442}xyz + +/[\x{41f}\S]/8g + \x{442}\x{435}\x{441}\x{442} + +/.[^\S]./8g + abc def\x{442}\x{443}xyz\npqr + +/.[^\S\n]./8g + abc def\x{442}\x{443}xyz\npqr + / End of testinput4 / diff --git a/testdata/testoutput2 b/testdata/testoutput2 index 11fcb1e..4b6302c 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -2736,7 +2736,7 @@ No need char /[\S]/DZ ------------------------------------------------------------------ Bra - [\x00-\x08\x0b\x0e-\x1f!-\xff] + [\x00-\x08\x0b\x0e-\x1f!-\xff] (neg) Ket End ------------------------------------------------------------------ diff --git a/testdata/testoutput4 b/testdata/testoutput4 index b49d4f9..ebb298a 100644 --- a/testdata/testoutput4 +++ b/testdata/testoutput4 @@ -938,4 +938,66 @@ No match 0: +\x{a3} 0: == +/\S/8g + \x{442}\x{435}\x{441}\x{442} + 0: \x{442} + 0: \x{435} + 0: \x{441} + 0: \x{442} + +/[\S]/8g + \x{442}\x{435}\x{441}\x{442} + 0: \x{442} + 0: \x{435} + 0: \x{441} + 0: \x{442} + +/\D/8g + \x{442}\x{435}\x{441}\x{442} + 0: \x{442} + 0: \x{435} + 0: \x{441} + 0: \x{442} + +/[\D]/8g + \x{442}\x{435}\x{441}\x{442} + 0: \x{442} + 0: \x{435} + 0: \x{441} + 0: \x{442} + +/\W/8g + \x{2442}\x{2435}\x{2441}\x{2442} + 0: \x{2442} + 0: \x{2435} + 0: \x{2441} + 0: \x{2442} + +/[\W]/8g + \x{2442}\x{2435}\x{2441}\x{2442} + 0: \x{2442} + 0: \x{2435} + 0: \x{2441} + 0: \x{2442} + +/[\S\s]*/8 + abc\n\r\x{442}\x{435}\x{441}\x{442}xyz + 0: abc\x{0a}\x{0d}\x{442}\x{435}\x{441}\x{442}xyz + +/[\x{41f}\S]/8g + \x{442}\x{435}\x{441}\x{442} + 0: \x{442} + 0: \x{435} + 0: \x{441} + 0: \x{442} + +/.[^\S]./8g + abc def\x{442}\x{443}xyz\npqr + 0: c d + 0: z\x{0a}p + +/.[^\S\n]./8g + abc def\x{442}\x{443}xyz\npqr + 0: c d + / End of testinput4 / |