diff options
author | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2012-06-17 19:08:41 +0000 |
---|---|---|
committer | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2012-06-17 19:08:41 +0000 |
commit | 37b4efccc0529613ee2c93004d33ef681c410a9f (patch) | |
tree | 488ac9efcee13934e6990762bd969181d192484e | |
parent | 9a61463ee5099a08610fb08ada15b3c8fc3e0ad5 (diff) | |
download | pcre-37b4efccc0529613ee2c93004d33ef681c410a9f.tar.gz |
Fix DFA bug (3 cases) when UTF code was being obeyed in non-UTF mode.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@979 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- | ChangeLog | 9 | ||||
-rw-r--r-- | pcre_dfa_exec.c | 28 | ||||
-rw-r--r-- | testdata/testinput8 | 11 | ||||
-rw-r--r-- | testdata/testoutput8 | 20 |
4 files changed, 57 insertions, 11 deletions
@@ -136,6 +136,15 @@ Version 8.31 02-June-2012 the same checks as \x{...} characters in non-JavaScript mode. Specifically, codepoints that are too big for the mode are faulted, and in a UTF mode, disallowed codepoints are also faulted. + +39. If PCRE was compiled with UTF support, in three places in the DFA + matcher there was code that should only have been obeyed in UTF mode, but + was being obeyed unconditionally. In 8-bit mode this could cause incorrect + processing when bytes with values greater than 127 were present. In 16-bit + mode the bug would be provoked by values in the range 0xdc00 to 0xdfff. In + both cases the values are those that cannot be the first data item in a UTF + character. The three items that might have provoked this were recursions, + possessively repeated groups, and atomic groups. Version 8.30 04-February-2012 diff --git a/pcre_dfa_exec.c b/pcre_dfa_exec.c index 9565d46..140d85f 100644 --- a/pcre_dfa_exec.c +++ b/pcre_dfa_exec.c @@ -38,7 +38,6 @@ POSSIBILITY OF SUCH DAMAGE. ----------------------------------------------------------------------------- */ - /* This module contains the external function pcre_dfa_exec(), which is an alternative matching function that uses a sort of DFA algorithm (not a true FSM). This is NOT Perl-compatible, but it has advantages in certain @@ -382,7 +381,8 @@ for the current character, one for the following character). 
*/ next_new_state->count = (y); \ next_new_state->data = (z); \ next_new_state++; \ - DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \ + DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d) line %d\n", rlevel*2-2, SP, \ + (x), (y), (z), __LINE__)); \ } \ else return PCRE_ERROR_DFA_WSSIZE @@ -611,7 +611,7 @@ for (;;) if (ptr < end_subject) { - clen = 1; /* Number of bytes in the character */ + clen = 1; /* Number of data items in the character */ #ifdef SUPPORT_UTF if (utf) { GETCHARLEN(c, ptr, clen); } else #endif /* SUPPORT_UTF */ @@ -789,7 +789,7 @@ for (;;) offsets[0] = (int)(current_subject - start_subject); offsets[1] = (int)(ptr - start_subject); DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP, - offsets[1] - offsets[0], current_subject)); + offsets[1] - offsets[0], (char *)current_subject)); } if ((md->moptions & PCRE_DFA_SHORTEST) != 0) { @@ -2797,9 +2797,12 @@ for (;;) { int charcount = local_offsets[rc+1] - local_offsets[rc]; #ifdef SUPPORT_UTF - const pcre_uchar *p = start_subject + local_offsets[rc]; - const pcre_uchar *pp = start_subject + local_offsets[rc+1]; - while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--; + if (utf) + { + const pcre_uchar *p = start_subject + local_offsets[rc]; + const pcre_uchar *pp = start_subject + local_offsets[rc+1]; + while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--; + } #endif if (charcount > 0) { @@ -2898,7 +2901,7 @@ for (;;) const pcre_uchar *pp = local_ptr; charcount = (int)(pp - p); #ifdef SUPPORT_UTF - while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--; + if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--; #endif ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1)); } @@ -2980,9 +2983,12 @@ for (;;) else { #ifdef SUPPORT_UTF - const pcre_uchar *p = start_subject + local_offsets[0]; - const pcre_uchar *pp = start_subject + local_offsets[1]; - while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--; + if (utf) + { + const pcre_uchar *p = start_subject + local_offsets[0]; + const 
pcre_uchar *pp = start_subject + local_offsets[1]; + while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--; + } #endif ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1)); if (repeat_state_offset >= 0) diff --git a/testdata/testinput8 b/testdata/testinput8 index 2b628a3..e235445 100644 --- a/testdata/testinput8 +++ b/testdata/testinput8 @@ -4787,4 +4787,15 @@ /abcdef/ abc\R +/<H((?(?!<H|F>)(.)|(?R))++)*F>/ + text <H more text <H texting more hexA0-"\xA0" hex above 7F-"\xBC" F> text xxxxx <H text F> text F> text2 <H text sample F> more text. + +/^(?>.{4})abc|^\w\w.xabcd/ + xxxxabcd + xx\xa0xabcd + +/^(.{4}){2}+abc|^\w\w.x\w\w\w\wabcd/ + xxxxxxxxabcd + xx\xa0xxxxxabcd + /-- End of testinput8 --/ diff --git a/testdata/testoutput8 b/testdata/testoutput8 index c5f01e1..73e0eae 100644 --- a/testdata/testoutput8 +++ b/testdata/testoutput8 @@ -7996,4 +7996,24 @@ Partial match: \x0d\x0d\x0d abc\R Error -30 (invalid data in workspace for DFA restart) +/<H((?(?!<H|F>)(.)|(?R))++)*F>/ + text <H more text <H texting more hexA0-"\xA0" hex above 7F-"\xBC" F> text xxxxx <H text F> text F> text2 <H text sample F> more text. + 0: <H more text <H texting more hexA0-"\xa0" hex above 7F-"\xbc" F> text xxxxx <H text F> text F> + +/^(?>.{4})abc|^\w\w.xabcd/ + xxxxabcd + 0: xxxxabcd + 1: xxxxabc + xx\xa0xabcd + 0: xx\xa0xabcd + 1: xx\xa0xabc + +/^(.{4}){2}+abc|^\w\w.x\w\w\w\wabcd/ + xxxxxxxxabcd + 0: xxxxxxxxabcd + 1: xxxxxxxxabc + xx\xa0xxxxxabcd + 0: xx\xa0xxxxxabcd + 1: xx\xa0xxxxxabc + /-- End of testinput8 --/ |