From cc97428733afa4374b180ae8261852f60cb61de2 Mon Sep 17 00:00:00 2001 From: ph10 Date: Fri, 20 Nov 2015 17:34:16 +0000 Subject: Fix wide character bug for some negative POSIX classes. git-svn-id: svn://vcs.exim.org/pcre/code/trunk@1608 2f5784b3-3f2a-0410-8824-cb99058d5e15 --- ChangeLog | 10 +++++-- pcre_compile.c | 23 ++++++++++++-- testdata/testinput6 | 44 +++++++++++++++++++++++++++ testdata/testoutput11-16 | 16 +++++----- testdata/testoutput11-32 | 16 +++++----- testdata/testoutput11-8 | 16 +++++----- testdata/testoutput6 | 78 ++++++++++++++++++++++++++++++++++++++++++++++++ testdata/testoutput7 | 4 +-- 8 files changed, 176 insertions(+), 31 deletions(-) diff --git a/ChangeLog b/ChangeLog index e0bb2ee..8073df1 100644 --- a/ChangeLog +++ b/ChangeLog @@ -159,8 +159,8 @@ Version 8.38 27-October-2015 42. In a character class such as [\W\p{Any}] where both a negative-type escape ("not a word character") and a property escape were present, the property escape was being ignored. - -43. Fix crash caused by very long (*MARK) or (*THEN) names. + +43. Fix crash caused by very long (*MARK) or (*THEN) names. 44. A sequence such as [[:punct:]b] that is, a POSIX character class followed by a single ASCII character in a class item, was incorrectly compiled in @@ -170,6 +170,12 @@ Version 8.38 27-October-2015 45. [:punct:] in UCP mode was matching some characters in the range 128-255 that should not have been matched. +46. If [:^ascii:] or [:^xdigit:] or [:^cntrl:] are present in a non-negated + class, all characters with code points greater than 255 are in the class. + When a Unicode property was also in the class (if PCRE_UCP is set, escapes + such as \w are turned into Unicode properties), wide characters were not + correctly handled, and could fail to match. 
+ Version 8.37 28-April-2015 -------------------------- diff --git a/pcre_compile.c b/pcre_compile.c index c253f79..b16e641 100644 --- a/pcre_compile.c +++ b/pcre_compile.c @@ -4940,9 +4940,10 @@ for (;; ptr++) (which is on the stack). We have to remember that there was XCLASS data, however. */ + if (class_uchardata > class_uchardata_base) xclass = TRUE; + if (lengthptr != NULL && class_uchardata > class_uchardata_base) { - xclass = TRUE; *lengthptr += (int)(class_uchardata - class_uchardata_base); class_uchardata = class_uchardata_base; } @@ -5045,10 +5046,26 @@ for (;; ptr++) ptr = tempptr + 1; continue; - /* For all other POSIX classes, no special action is taken in UCP - mode. Fall through to the non_UCP case. */ + /* For the other POSIX classes (ascii, cntrl, xdigit) we are going to fall + through to the non-UCP case and build a bit map for characters with + code points less than 256. If we are in a negated POSIX class + within a non-negated overall class, characters with code points + greater than 255 must all match. In the special case where we have + not yet generated any xclass data, and this is the final item in + the overall class, we need do nothing: later on, the opcode + OP_NCLASS will be used to indicate that characters greater than 255 + are acceptable. If we have already seen an xclass item or one may + follow (we have to assume that it might if this is not the end of + the class), explicitly match all wide codepoints. 
*/ default: + if (!negate_class && local_negate && + (xclass || tempptr[2] != CHAR_RIGHT_SQUARE_BRACKET)) + { + *class_uchardata++ = XCL_RANGE; + class_uchardata += PRIV(ord2utf)(0x100, class_uchardata); + class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata); + } break; } } diff --git a/testdata/testinput6 b/testdata/testinput6 index 8aee0d0..aeb62a0 100644 --- a/testdata/testinput6 +++ b/testdata/testinput6 @@ -1509,4 +1509,48 @@ \xc2\xb4 \x{b4} +/[[:^ascii:]]/8W + \x{100} + \x{200} + \x{300} + \x{37e} + a + 9 + g + +/[[:^ascii:]\w]/8W + a + 9 + g + \x{100} + \x{200} + \x{300} + \x{37e} + +/[\w[:^ascii:]]/8W + a + 9 + g + \x{100} + \x{200} + \x{300} + \x{37e} + +/[^[:ascii:]\W]/8W + a + 9 + g + \x{100} + \x{200} + \x{300} + \x{37e} + +/[[:^ascii:]a]/8W + a + 9 + g + \x{100} + \x{200} + \x{37e} + /-- End of testinput6 --/ diff --git a/testdata/testoutput11-16 b/testdata/testoutput11-16 index 87acadd..9a0a12d 100644 --- a/testdata/testoutput11-16 +++ b/testdata/testoutput11-16 @@ -650,18 +650,18 @@ Memory allocation (code space): 14 /[[:^alpha:][:^cntrl:]]+/8WB ------------------------------------------------------------------ - 0 26 Bra - 2 [ -~\x80-\xff\P{L}]++ - 26 26 Ket - 28 End + 0 30 Bra + 2 [ -~\x80-\xff\P{L}\x{100}-\x{10ffff}]++ + 30 30 Ket + 32 End ------------------------------------------------------------------ /[[:^cntrl:][:^alpha:]]+/8WB ------------------------------------------------------------------ - 0 26 Bra - 2 [ -~\x80-\xff\P{L}]++ - 26 26 Ket - 28 End + 0 30 Bra + 2 [ -~\x80-\xff\x{100}-\x{10ffff}\P{L}]++ + 30 30 Ket + 32 End ------------------------------------------------------------------ /[[:alpha:]]+/8WB diff --git a/testdata/testoutput11-32 b/testdata/testoutput11-32 index 325fedb..57e5da0 100644 --- a/testdata/testoutput11-32 +++ b/testdata/testoutput11-32 @@ -650,18 +650,18 @@ Memory allocation (code space): 28 /[[:^alpha:][:^cntrl:]]+/8WB ------------------------------------------------------------------ - 0 18 Bra - 2 [ 
-~\x80-\xff\P{L}]++ - 18 18 Ket - 20 End + 0 21 Bra + 2 [ -~\x80-\xff\P{L}\x{100}-\x{10ffff}]++ + 21 21 Ket + 23 End ------------------------------------------------------------------ /[[:^cntrl:][:^alpha:]]+/8WB ------------------------------------------------------------------ - 0 18 Bra - 2 [ -~\x80-\xff\P{L}]++ - 18 18 Ket - 20 End + 0 21 Bra + 2 [ -~\x80-\xff\x{100}-\x{10ffff}\P{L}]++ + 21 21 Ket + 23 End ------------------------------------------------------------------ /[[:alpha:]]+/8WB diff --git a/testdata/testoutput11-8 b/testdata/testoutput11-8 index b1a4a90..748548a 100644 --- a/testdata/testoutput11-8 +++ b/testdata/testoutput11-8 @@ -650,18 +650,18 @@ Memory allocation (code space): 10 /[[:^alpha:][:^cntrl:]]+/8WB ------------------------------------------------------------------ - 0 44 Bra - 3 [ -~\x80-\xff\P{L}]++ - 44 44 Ket - 47 End + 0 51 Bra + 3 [ -~\x80-\xff\P{L}\x{100}-\x{10ffff}]++ + 51 51 Ket + 54 End ------------------------------------------------------------------ /[[:^cntrl:][:^alpha:]]+/8WB ------------------------------------------------------------------ - 0 44 Bra - 3 [ -~\x80-\xff\P{L}]++ - 44 44 Ket - 47 End + 0 51 Bra + 3 [ -~\x80-\xff\x{100}-\x{10ffff}\P{L}]++ + 51 51 Ket + 54 End ------------------------------------------------------------------ /[[:alpha:]]+/8WB diff --git a/testdata/testoutput6 b/testdata/testoutput6 index 196f993..beb85aa 100644 --- a/testdata/testoutput6 +++ b/testdata/testoutput6 @@ -2479,4 +2479,82 @@ No match \x{b4} No match +/[[:^ascii:]]/8W + \x{100} + 0: \x{100} + \x{200} + 0: \x{200} + \x{300} + 0: \x{300} + \x{37e} + 0: \x{37e} + a +No match + 9 +No match + g +No match + +/[[:^ascii:]\w]/8W + a + 0: a + 9 + 0: 9 + g + 0: g + \x{100} + 0: \x{100} + \x{200} + 0: \x{200} + \x{300} + 0: \x{300} + \x{37e} + 0: \x{37e} + +/[\w[:^ascii:]]/8W + a + 0: a + 9 + 0: 9 + g + 0: g + \x{100} + 0: \x{100} + \x{200} + 0: \x{200} + \x{300} + 0: \x{300} + \x{37e} + 0: \x{37e} + +/[^[:ascii:]\W]/8W + a +No match + 9 +No 
match + g +No match + \x{100} + 0: \x{100} + \x{200} + 0: \x{200} + \x{300} +No match + \x{37e} +No match + +/[[:^ascii:]a]/8W + a + 0: a + 9 +No match + g +No match + \x{100} + 0: \x{100} + \x{200} + 0: \x{200} + \x{37e} + 0: \x{37e} + /-- End of testinput6 --/ diff --git a/testdata/testoutput7 b/testdata/testoutput7 index a05b381..cc9ebdd 100644 --- a/testdata/testoutput7 +++ b/testdata/testoutput7 @@ -949,7 +949,7 @@ No match /[[:^alpha:][:^cntrl:]]+/8WBZ ------------------------------------------------------------------ Bra - [ -~\x80-\xff\P{L}]++ + [ -~\x80-\xff\P{L}\x{100}-\x{10ffff}]++ Ket End ------------------------------------------------------------------ @@ -961,7 +961,7 @@ No match /[[:^cntrl:][:^alpha:]]+/8WBZ ------------------------------------------------------------------ Bra - [ -~\x80-\xff\P{L}]++ + [ -~\x80-\xff\x{100}-\x{10ffff}\P{L}]++ Ket End ------------------------------------------------------------------ -- cgit v1.2.1