diff options
author | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2008-07-11 17:06:55 +0000 |
---|---|---|
committer | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2008-07-11 17:06:55 +0000 |
commit | 1b526c19fd97a3aff7d5432dea74ff5f87d7f0b4 (patch) | |
tree | 422150d0913b5149dc27befb19155bc657074d2f | |
parent | 4299955a938be0e20bea4fae5fc346b6501d3997 (diff) | |
download | pcre-1b526c19fd97a3aff7d5432dea74ff5f87d7f0b4.tar.gz |
Further fixes for bumpalong processing in UTF-8 mode.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@365 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- | ChangeLog | 2 | ||||
-rw-r--r-- | pcre_dfa_exec.c | 28 | ||||
-rw-r--r-- | pcre_exec.c | 49 | ||||
-rw-r--r-- | testdata/testinput8 | 3 | ||||
-rw-r--r-- | testdata/testoutput8 | 4 |
5 files changed, 36 insertions, 50 deletions
```diff
@@ -61,7 +61,7 @@ Version 8.0 02 Jul-08
 
 14. A similar bug to 7.3/2 existed when the PCRE_FIRSTLINE option was set and
     the data contained the byte 0x85 as part of a UTF-8 character within its
-    first line.
+    first line. This applied both to normal and DFA matching.
 
 
 Version 7.7 07-May-08
diff --git a/pcre_dfa_exec.c b/pcre_dfa_exec.c
index c0ed8eb..11a03e6 100644
--- a/pcre_dfa_exec.c
+++ b/pcre_dfa_exec.c
@@ -2733,7 +2733,18 @@ for (;;)
 
     if (firstline)
       {
-      const uschar *t = current_subject;
+      USPTR t = current_subject;
+#ifdef SUPPORT_UTF8
+      if (utf8)
+        {
+        while (t < md->end_subject && !IS_NEWLINE(t))
+          {
+          t++;
+          while (t < end_subject && (*t & 0xc0) == 0x80) t++;
+          }
+        }
+      else
+#endif
       while (t < md->end_subject && !IS_NEWLINE(t)) t++;
       end_subject = t;
       }
@@ -2755,9 +2766,22 @@ for (;;)
       {
       if (current_subject > md->start_subject + start_offset)
         {
+#ifdef SUPPORT_UTF8
+        if (utf8)
+          {
+          while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
+            {
+            current_subject++;
+            while(current_subject < end_subject &&
+                  (*current_subject & 0xc0) == 0x80)
+              current_subject++;
+            }
+          }
+        else
+#endif
         while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
           current_subject++;
-        
+
         /* If we have just passed a CR and the newline option is ANY or
         ANYCRLF, and we are now at a LF, advance the match position by one more
         character. */
diff --git a/pcre_exec.c b/pcre_exec.c
index 83ee29a..212acbe 100644
--- a/pcre_exec.c
+++ b/pcre_exec.c
@@ -4695,7 +4695,7 @@ for(;;)
 
   if (firstline)
     {
-    USPTR *t = start_match;
+    USPTR t = start_match;
 #ifdef SUPPORT_UTF8
     if (utf8)
       {
@@ -4716,39 +4716,11 @@ for(;;)
   if (first_byte >= 0)
     {
     if (first_byte_caseless)
-      {
-#ifdef SUPPORT_UTF8
-      if (utf8)
-        {
-        while (start_match < end_subject && md->lcc[*start_match] != first_byte)
-          {
-          start_match++;
-          while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
-            start_match++;
-          }
-        }
-      else
-#endif
       while (start_match < end_subject && md->lcc[*start_match] != first_byte)
         start_match++;
-      }
-    else /* Caseful case */
-      {
-#ifdef SUPPORT_UTF8
-      if (utf8)
-        {
-        while (start_match < end_subject && *start_match != first_byte)
-          {
-          start_match++;
-          while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
-            start_match++;
-          }
-        }
-      else
-#endif
+    else
       while (start_match < end_subject && *start_match != first_byte)
         start_match++;
-      }
     }
 
   /* Or to just after a linebreak for a multiline match */
@@ -4788,23 +4760,6 @@ for(;;)
 
   else if (start_bits != NULL)
     {
-#ifdef SUPPORT_UTF8
-    if (utf8)
-      {
-      while (start_match < end_subject)
-        {
-        register unsigned int c = *start_match;
-        if ((start_bits[c/8] & (1 << (c&7))) == 0)
-          {
-          start_match++;
-          while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
-            start_match++;
-          }
-        else break;
-        }
-      }
-    else
-#endif
     while (start_match < end_subject)
       {
       register unsigned int c = *start_match;
diff --git a/testdata/testinput8 b/testdata/testinput8
index 5bcfab5..11884ad 100644
--- a/testdata/testinput8
+++ b/testdata/testinput8
@@ -664,4 +664,7 @@
     a\x{85}b\<bsr_anycrlf>
     a\x0bb\<bsr_anycrlf>
 
+/X/8f<any>
+    A\x{1ec5}ABCXYZ
+
 / End of testinput 8 /
diff --git a/testdata/testoutput8 b/testdata/testoutput8
index 631e5b8..af10c4a 100644
--- a/testdata/testoutput8
+++ b/testdata/testoutput8
@@ -1284,4 +1284,8 @@ No match
     a\x0bb\<bsr_anycrlf>
 No match
 
+/X/8f<any>
+    A\x{1ec5}ABCXYZ
+ 0: X
+
 / End of testinput 8 /
```