Fix caseless backreferences for non-ASCII characters.

git-svn-id: svn://vcs.exim.org/pcre/code/trunk@354 2f5784b3-3f2a-0410-8824-cb99058d5e15
author: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2008-07-07 16:30:33 +0000
committer: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2008-07-07 16:30:33 +0000
commit: 798472a8be18bffaa324fa1c6cc052895a063e47 (patch)
tree: 705a42ce28f65a5b1fc978bccad3831fdf1a9bae
parent: b17a62a5663fbb1eda289040153ed59188551e15 (diff)
download: pcre-798472a8be18bffaa324fa1c6cc052895a063e47.tar.gz
4 files changed, 92 insertions, 2 deletions
diff --git a/ChangeLog b/ChangeLog
index 4c97765..3e5ccdf 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -17,6 +17,10 @@ Version 8.0 02 Jul-08
 3.  Change 12 for 7.7 introduced a bug in pcre_study() when a pattern contained
     a group with a zero qualifier. The result of the study could be incorrect,
     or the function might crash, depending on the pattern. 
+    
+4.  In UTF-8 mode, caseless matching was not working for non-ASCII characters
+    in back references. For example, /(\x{de})\1/8i was not matching
+    \x{de}\x{fe}. It now works when Unicode Property Support is available.
 
 
 Version 7.7 07-May-08
diff --git a/pcre_exec.c b/pcre_exec.c
index 428916b..008b950 100644
--- a/pcre_exec.c
+++ b/pcre_exec.c
@@ -158,13 +158,39 @@ printf("\n");
 
 if (length > md->end_subject - eptr) return FALSE;
 
-/* Separate the caselesss case for speed */
+/* Separate the caseless case for speed. In UTF-8 mode we can only do this
+properly if Unicode properties are supported. Otherwise, we can check only
+ASCII characters. */
 
 if ((ims & PCRE_CASELESS) != 0)
   {
+#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UCP
+  if (md->utf8)
+    {
+    USPTR endptr = eptr + length; 
+    while (eptr < endptr)
+      {
+      int c, d; 
+      GETCHARINC(c, eptr);
+      GETCHARINC(d, p);
+      if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
+      }  
+    }  
+  else
+#endif
+#endif
+
+  /* This byte-wise loop handles non-UTF-8 mode, and also UTF-8 mode without
+  UCP support, where only ASCII characters can be matched caselessly. */
+   
   while (length-- > 0)
-    if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
+    { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
   }
+  
+/* In the caseful case, we can just compare the bytes, whether or not we
+are in UTF-8 mode. */
+ 
 else
   { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
 
diff --git a/testdata/testinput6 b/testdata/testinput6
index a386e9a..a8640f9 100644
--- a/testdata/testinput6
+++ b/testdata/testinput6
@@ -925,4 +925,22 @@ was broken in all cases./
     ** Failers 
     \x{1d79}\x{a77d} 
 
+/(A)\1/8i
+    AA
+    Aa
+    aa
+    aA
+
+/(\x{de})\1/8i
+    \x{de}\x{de}
+    \x{de}\x{fe}
+    \x{fe}\x{fe}
+    \x{fe}\x{de}
+
+/(\x{10a})\1/8i
+    \x{10a}\x{10a}
+    \x{10a}\x{10b}
+    \x{10b}\x{10b}
+    \x{10b}\x{10a}
+
 / End of testinput6 /
diff --git a/testdata/testoutput6 b/testdata/testoutput6
index c3b4c81..caba466 100644
--- a/testdata/testoutput6
+++ b/testdata/testoutput6
@@ -1705,4 +1705,46 @@ No match
     \x{1d79}\x{a77d} 
 No match
 
+/(A)\1/8i
+    AA
+ 0: AA
+ 1: A
+    Aa
+ 0: Aa
+ 1: A
+    aa
+ 0: aa
+ 1: a
+    aA
+ 0: aA
+ 1: a
+
+/(\x{de})\1/8i
+    \x{de}\x{de}
+ 0: \x{de}\x{de}
+ 1: \x{de}
+    \x{de}\x{fe}
+ 0: \x{de}\x{fe}
+ 1: \x{de}
+    \x{fe}\x{fe}
+ 0: \x{fe}\x{fe}
+ 1: \x{fe}
+    \x{fe}\x{de}
+ 0: \x{fe}\x{de}
+ 1: \x{fe}
+
+/(\x{10a})\1/8i
+    \x{10a}\x{10a}
+ 0: \x{10a}\x{10a}
+ 1: \x{10a}
+    \x{10a}\x{10b}
+ 0: \x{10a}\x{10b}
+ 1: \x{10a}
+    \x{10b}\x{10b}
+ 0: \x{10b}\x{10b}
+ 1: \x{10b}
+    \x{10b}\x{10a}
+ 0: \x{10b}\x{10a}
+ 1: \x{10b}
+
 / End of testinput6 /
author	ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2008-07-07 16:30:33 +0000
committer	ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2008-07-07 16:30:33 +0000
commit	798472a8be18bffaa324fa1c6cc052895a063e47 (patch)
tree	705a42ce28f65a5b1fc978bccad3831fdf1a9bae
parent	b17a62a5663fbb1eda289040153ed59188551e15 (diff)
download	pcre-798472a8be18bffaa324fa1c6cc052895a063e47.tar.gz