summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2008-07-07 16:30:33 +0000
committer ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2008-07-07 16:30:33 +0000
commit 798472a8be18bffaa324fa1c6cc052895a063e47 (patch)
tree 705a42ce28f65a5b1fc978bccad3831fdf1a9bae
parent b17a62a5663fbb1eda289040153ed59188551e15 (diff)
download pcre-798472a8be18bffaa324fa1c6cc052895a063e47.tar.gz
Fix caseless backreferences for non-ASCII characters.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@354 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- ChangeLog 4
-rw-r--r-- pcre_exec.c 30
-rw-r--r-- testdata/testinput6 18
-rw-r--r-- testdata/testoutput6 42
4 files changed, 92 insertions, 2 deletions
diff --git a/ChangeLog b/ChangeLog
index 4c97765..3e5ccdf 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -17,6 +17,10 @@ Version 8.0 02 Jul-08
3. Change 12 for 7.7 introduced a bug in pcre_study() when a pattern contained
a group with a zero quantifier. The result of the study could be incorrect,
or the function might crash, depending on the pattern.
+
+4. Caseless matching was not working for non-ASCII characters in back
+ references. For example, /(\x{de})\1/8i was not matching \x{de}\x{fe}.
+ It now works when Unicode Property Support is available.
Version 7.7 07-May-08
diff --git a/pcre_exec.c b/pcre_exec.c
index 428916b..008b950 100644
--- a/pcre_exec.c
+++ b/pcre_exec.c
@@ -158,13 +158,39 @@ printf("\n");
if (length > md->end_subject - eptr) return FALSE;
-/* Separate the caselesss case for speed */
+/* Separate the caseless case for speed. In UTF-8 mode we can only do this
+properly if Unicode properties are supported. Otherwise, we can check only
+ASCII characters. */
if ((ims & PCRE_CASELESS) != 0)
{
+#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UCP
+ if (md->utf8)
+ {
+ USPTR endptr = eptr + length;
+ while (eptr < endptr)
+ {
+ int c, d;
+ GETCHARINC(c, eptr);
+ GETCHARINC(d, p);
+ if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
+ }
+ }
+ else
+#endif
+#endif
+
+ /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
+ is no UCP support. */
+
while (length-- > 0)
- if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
+ { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
}
+
+/* In the caseful case, we can just compare the bytes, whether or not we
+are in UTF-8 mode. */
+
else
{ while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
diff --git a/testdata/testinput6 b/testdata/testinput6
index a386e9a..a8640f9 100644
--- a/testdata/testinput6
+++ b/testdata/testinput6
@@ -925,4 +925,22 @@ was broken in all cases./
** Failers
\x{1d79}\x{a77d}
+/(A)\1/8i
+ AA
+ Aa
+ aa
+ aA
+
+/(\x{de})\1/8i
+ \x{de}\x{de}
+ \x{de}\x{fe}
+ \x{fe}\x{fe}
+ \x{fe}\x{de}
+
+/(\x{10a})\1/8i
+ \x{10a}\x{10a}
+ \x{10a}\x{10b}
+ \x{10b}\x{10b}
+ \x{10b}\x{10a}
+
/ End of testinput6 /
diff --git a/testdata/testoutput6 b/testdata/testoutput6
index c3b4c81..caba466 100644
--- a/testdata/testoutput6
+++ b/testdata/testoutput6
@@ -1705,4 +1705,46 @@ No match
\x{1d79}\x{a77d}
No match
+/(A)\1/8i
+ AA
+ 0: AA
+ 1: A
+ Aa
+ 0: Aa
+ 1: A
+ aa
+ 0: aa
+ 1: a
+ aA
+ 0: aA
+ 1: a
+
+/(\x{de})\1/8i
+ \x{de}\x{de}
+ 0: \x{de}\x{de}
+ 1: \x{de}
+ \x{de}\x{fe}
+ 0: \x{de}\x{fe}
+ 1: \x{de}
+ \x{fe}\x{fe}
+ 0: \x{fe}\x{fe}
+ 1: \x{fe}
+ \x{fe}\x{de}
+ 0: \x{fe}\x{de}
+ 1: \x{fe}
+
+/(\x{10a})\1/8i
+ \x{10a}\x{10a}
+ 0: \x{10a}\x{10a}
+ 1: \x{10a}
+ \x{10a}\x{10b}
+ 0: \x{10a}\x{10b}
+ 1: \x{10a}
+ \x{10b}\x{10b}
+ 0: \x{10b}\x{10b}
+ 1: \x{10b}
+ \x{10b}\x{10a}
+ 0: \x{10b}\x{10a}
+ 1: \x{10b}
+
/ End of testinput6 /