summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2008-07-11 17:06:55 +0000
committer: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2008-07-11 17:06:55 +0000
commit: 1b526c19fd97a3aff7d5432dea74ff5f87d7f0b4 (patch)
tree: 422150d0913b5149dc27befb19155bc657074d2f
parent: 4299955a938be0e20bea4fae5fc346b6501d3997 (diff)
download: pcre-1b526c19fd97a3aff7d5432dea74ff5f87d7f0b4.tar.gz
Further fixes for bumpalong processing in UTF-8 mode.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@365 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- ChangeLog 2
-rw-r--r-- pcre_dfa_exec.c 28
-rw-r--r-- pcre_exec.c 49
-rw-r--r-- testdata/testinput8 3
-rw-r--r-- testdata/testoutput8 4
5 files changed, 36 insertions, 50 deletions
diff --git a/ChangeLog b/ChangeLog
index 0d9d133..26033ba 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -61,7 +61,7 @@ Version 8.0 02 Jul-08
14. A similar bug to 7.3/2 existed when the PCRE_FIRSTLINE option was set and
the data contained the byte 0x85 as part of a UTF-8 character within its
- first line.
+ first line. This applied both to normal and DFA matching.
Version 7.7 07-May-08
diff --git a/pcre_dfa_exec.c b/pcre_dfa_exec.c
index c0ed8eb..11a03e6 100644
--- a/pcre_dfa_exec.c
+++ b/pcre_dfa_exec.c
@@ -2733,7 +2733,18 @@ for (;;)
if (firstline)
{
- const uschar *t = current_subject;
+ USPTR t = current_subject;
+#ifdef SUPPORT_UTF8
+ if (utf8)
+ {
+ while (t < md->end_subject && !IS_NEWLINE(t))
+ {
+ t++;
+ while (t < end_subject && (*t & 0xc0) == 0x80) t++;
+ }
+ }
+ else
+#endif
while (t < md->end_subject && !IS_NEWLINE(t)) t++;
end_subject = t;
}
@@ -2755,9 +2766,22 @@ for (;;)
{
if (current_subject > md->start_subject + start_offset)
{
+#ifdef SUPPORT_UTF8
+ if (utf8)
+ {
+ while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
+ {
+ current_subject++;
+ while(current_subject < end_subject &&
+ (*current_subject & 0xc0) == 0x80)
+ current_subject++;
+ }
+ }
+ else
+#endif
while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
current_subject++;
-
+
/* If we have just passed a CR and the newline option is ANY or
ANYCRLF, and we are now at a LF, advance the match position by one more
character. */
diff --git a/pcre_exec.c b/pcre_exec.c
index 83ee29a..212acbe 100644
--- a/pcre_exec.c
+++ b/pcre_exec.c
@@ -4695,7 +4695,7 @@ for(;;)
if (firstline)
{
- USPTR *t = start_match;
+ USPTR t = start_match;
#ifdef SUPPORT_UTF8
if (utf8)
{
@@ -4716,39 +4716,11 @@ for(;;)
if (first_byte >= 0)
{
if (first_byte_caseless)
- {
-#ifdef SUPPORT_UTF8
- if (utf8)
- {
- while (start_match < end_subject && md->lcc[*start_match] != first_byte)
- {
- start_match++;
- while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
- start_match++;
- }
- }
- else
-#endif
while (start_match < end_subject && md->lcc[*start_match] != first_byte)
start_match++;
- }
- else /* Caseful case */
- {
-#ifdef SUPPORT_UTF8
- if (utf8)
- {
- while (start_match < end_subject && *start_match != first_byte)
- {
- start_match++;
- while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
- start_match++;
- }
- }
- else
-#endif
+ else
while (start_match < end_subject && *start_match != first_byte)
start_match++;
- }
}
/* Or to just after a linebreak for a multiline match */
@@ -4788,23 +4760,6 @@ for(;;)
else if (start_bits != NULL)
{
-#ifdef SUPPORT_UTF8
- if (utf8)
- {
- while (start_match < end_subject)
- {
- register unsigned int c = *start_match;
- if ((start_bits[c/8] & (1 << (c&7))) == 0)
- {
- start_match++;
- while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
- start_match++;
- }
- else break;
- }
- }
- else
-#endif
while (start_match < end_subject)
{
register unsigned int c = *start_match;
diff --git a/testdata/testinput8 b/testdata/testinput8
index 5bcfab5..11884ad 100644
--- a/testdata/testinput8
+++ b/testdata/testinput8
@@ -664,4 +664,7 @@
a\x{85}b\<bsr_anycrlf>
a\x0bb\<bsr_anycrlf>
+/X/8f<any>
+ A\x{1ec5}ABCXYZ
+
/ End of testinput 8 /
diff --git a/testdata/testoutput8 b/testdata/testoutput8
index 631e5b8..af10c4a 100644
--- a/testdata/testoutput8
+++ b/testdata/testoutput8
@@ -1284,4 +1284,8 @@ No match
a\x0bb\<bsr_anycrlf>
No match
+/X/8f<any>
+ A\x{1ec5}ABCXYZ
+ 0: X
+
/ End of testinput 8 /