summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2012-06-17 19:08:41 +0000
committer: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2012-06-17 19:08:41 +0000
commit: 37b4efccc0529613ee2c93004d33ef681c410a9f (patch)
tree: 488ac9efcee13934e6990762bd969181d192484e
parent: 9a61463ee5099a08610fb08ada15b3c8fc3e0ad5 (diff)
download: pcre-37b4efccc0529613ee2c93004d33ef681c410a9f.tar.gz
Fix DFA bug (3 cases) when UTF code was being obeyed in non-UTF mode.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@979 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- ChangeLog 9
-rw-r--r-- pcre_dfa_exec.c 28
-rw-r--r-- testdata/testinput8 11
-rw-r--r-- testdata/testoutput8 20
4 files changed, 57 insertions, 11 deletions
diff --git a/ChangeLog b/ChangeLog
index a29548d..f167ea3 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -136,6 +136,15 @@ Version 8.31 02-June-2012
the same checks as \x{...} characters in non-JavaScript mode. Specifically,
codepoints that are too big for the mode are faulted, and in a UTF mode,
disallowed codepoints are also faulted.
+
+39. If PCRE was compiled with UTF support, in three places in the DFA
+ matcher there was code that should only have been obeyed in UTF mode, but
+ was being obeyed unconditionally. In 8-bit mode this could cause incorrect
+ processing when bytes with values greater than 127 were present. In 16-bit
+ mode the bug would be provoked by values in the range 0xdc00 to 0xdfff. In
+ both cases the values are those that cannot be the first data item in a UTF
+ character. The three items that might have provoked this were recursions,
+ possessively repeated groups, and atomic groups.
Version 8.30 04-February-2012
diff --git a/pcre_dfa_exec.c b/pcre_dfa_exec.c
index 9565d46..140d85f 100644
--- a/pcre_dfa_exec.c
+++ b/pcre_dfa_exec.c
@@ -38,7 +38,6 @@ POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
-
/* This module contains the external function pcre_dfa_exec(), which is an
alternative matching function that uses a sort of DFA algorithm (not a true
FSM). This is NOT Perl-compatible, but it has advantages in certain
@@ -382,7 +381,8 @@ for the current character, one for the following character). */
next_new_state->count = (y); \
next_new_state->data = (z); \
next_new_state++; \
- DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
+ DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d) line %d\n", rlevel*2-2, SP, \
+ (x), (y), (z), __LINE__)); \
} \
else return PCRE_ERROR_DFA_WSSIZE
@@ -611,7 +611,7 @@ for (;;)
if (ptr < end_subject)
{
- clen = 1; /* Number of bytes in the character */
+ clen = 1; /* Number of data items in the character */
#ifdef SUPPORT_UTF
if (utf) { GETCHARLEN(c, ptr, clen); } else
#endif /* SUPPORT_UTF */
@@ -789,7 +789,7 @@ for (;;)
offsets[0] = (int)(current_subject - start_subject);
offsets[1] = (int)(ptr - start_subject);
DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
- offsets[1] - offsets[0], current_subject));
+ offsets[1] - offsets[0], (char *)current_subject));
}
if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
{
@@ -2797,9 +2797,12 @@ for (;;)
{
int charcount = local_offsets[rc+1] - local_offsets[rc];
#ifdef SUPPORT_UTF
- const pcre_uchar *p = start_subject + local_offsets[rc];
- const pcre_uchar *pp = start_subject + local_offsets[rc+1];
- while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
+ if (utf)
+ {
+ const pcre_uchar *p = start_subject + local_offsets[rc];
+ const pcre_uchar *pp = start_subject + local_offsets[rc+1];
+ while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
+ }
#endif
if (charcount > 0)
{
@@ -2898,7 +2901,7 @@ for (;;)
const pcre_uchar *pp = local_ptr;
charcount = (int)(pp - p);
#ifdef SUPPORT_UTF
- while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
+ if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
#endif
ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
}
@@ -2980,9 +2983,12 @@ for (;;)
else
{
#ifdef SUPPORT_UTF
- const pcre_uchar *p = start_subject + local_offsets[0];
- const pcre_uchar *pp = start_subject + local_offsets[1];
- while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
+ if (utf)
+ {
+ const pcre_uchar *p = start_subject + local_offsets[0];
+ const pcre_uchar *pp = start_subject + local_offsets[1];
+ while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
+ }
#endif
ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
if (repeat_state_offset >= 0)
diff --git a/testdata/testinput8 b/testdata/testinput8
index 2b628a3..e235445 100644
--- a/testdata/testinput8
+++ b/testdata/testinput8
@@ -4787,4 +4787,15 @@
/abcdef/
abc\R
+/<H((?(?!<H|F>)(.)|(?R))++)*F>/
+ text <H more text <H texting more hexA0-"\xA0" hex above 7F-"\xBC" F> text xxxxx <H text F> text F> text2 <H text sample F> more text.
+
+/^(?>.{4})abc|^\w\w.xabcd/
+ xxxxabcd
+ xx\xa0xabcd
+
+/^(.{4}){2}+abc|^\w\w.x\w\w\w\wabcd/
+ xxxxxxxxabcd
+ xx\xa0xxxxxabcd
+
/-- End of testinput8 --/
diff --git a/testdata/testoutput8 b/testdata/testoutput8
index c5f01e1..73e0eae 100644
--- a/testdata/testoutput8
+++ b/testdata/testoutput8
@@ -7996,4 +7996,24 @@ Partial match: \x0d\x0d\x0d
abc\R
Error -30 (invalid data in workspace for DFA restart)
+/<H((?(?!<H|F>)(.)|(?R))++)*F>/
+ text <H more text <H texting more hexA0-"\xA0" hex above 7F-"\xBC" F> text xxxxx <H text F> text F> text2 <H text sample F> more text.
+ 0: <H more text <H texting more hexA0-"\xa0" hex above 7F-"\xbc" F> text xxxxx <H text F> text F>
+
+/^(?>.{4})abc|^\w\w.xabcd/
+ xxxxabcd
+ 0: xxxxabcd
+ 1: xxxxabc
+ xx\xa0xabcd
+ 0: xx\xa0xabcd
+ 1: xx\xa0xabc
+
+/^(.{4}){2}+abc|^\w\w.x\w\w\w\wabcd/
+ xxxxxxxxabcd
+ 0: xxxxxxxxabcd
+ 1: xxxxxxxxabc
+ xx\xa0xxxxxabcd
+ 0: xx\xa0xxxxxabcd
+ 1: xx\xa0xxxxxabc
+
/-- End of testinput8 --/