Fix bugs in new UCP casing code for back references and characters with more than 2 cases.

git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@1231 6239d852-aaf2-0410-a92c-79f79f948069
author: ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069> 2020-02-26 16:53:39 +0000
committer: ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069> 2020-02-26 16:53:39 +0000
commit: 4ea71d47b6608e27759ebc39359c980d788db68a (patch)
tree: 99e2f55a345ce3a3a73ff6959d5adcf867534d93
parent: dd37c27fef3592a02fef0a8d9b98a78268717fe9 (diff)
download: pcre2-4ea71d47b6608e27759ebc39359c980d788db68a.tar.gz
8 files changed, 88 insertions, 15 deletions
diff --git a/ChangeLog b/ChangeLog
index 95e0123..1e4e778 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -69,7 +69,7 @@ character is decoded in JIT.
 18. Changes in many areas of the code so that when Unicode is supported and 
 PCRE2_UCP is set without PCRE2_UTF, Unicode character properties are used for 
 upper/lower case computations on characters whose code points are greater than 
-127. Documentation is not yet updated. JIT is not yet updated.
+127.
 
 19. The function for checking UTF-16 validity was returning an incorrect offset
 for the start of the error when a high surrogate was not followed by a valid
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
index 13769a0..9f05d19 100644
--- a/src/pcre2_compile.c
+++ b/src/pcre2_compile.c
@@ -5565,12 +5565,12 @@ for (;; pptr++)
       zerofirstcu = firstcu;
       zerofirstcuflags = firstcuflags;
 
-      /* For caseless UTF mode, check whether this character has more than
-      one other case. If so, generate a special OP_NOTPROP item instead of
+      /* For caseless UTF or UCP mode, check whether this character has more
+      than one other case. If so, generate a special OP_NOTPROP item instead of
       OP_NOTI. */
 
 #ifdef SUPPORT_UNICODE
-      if (utf && (options & PCRE2_CASELESS) != 0 &&
+      if ((utf||ucp) && (options & PCRE2_CASELESS) != 0 &&
           (d = UCD_CASESET(c)) != 0)
         {
         *code++ = OP_NOTPROP;
@@ -7824,11 +7824,12 @@ for (;; pptr++)
     NORMAL_CHAR_SET:  /* Character is already in meta */
     matched_char = TRUE;
 
-    /* For caseless UTF mode, check whether this character has more than one
-    other case. If so, generate a special OP_PROP item instead of OP_CHARI. */
+    /* For caseless UTF or UCP mode, check whether this character has more than
+    one other case. If so, generate a special OP_PROP item instead of OP_CHARI.
+    */
 
 #ifdef SUPPORT_UNICODE
-    if (utf && (options & PCRE2_CASELESS) != 0)
+    if ((utf||ucp) && (options & PCRE2_CASELESS) != 0)
       {
       uint32_t caseset = UCD_CASESET(meta);
       if (caseset != 0)
diff --git a/src/pcre2_match.c b/src/pcre2_match.c
index 77c98f5..4b86134 100644
--- a/src/pcre2_match.c
+++ b/src/pcre2_match.c
@@ -381,8 +381,12 @@ length = Fovector[offset+1] - Fovector[offset];
 if (caseless)
   {
 #if defined SUPPORT_UNICODE
-  if ((mb->poptions & PCRE2_UTF) != 0)
+  BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
+   
+  if (utf || (mb->poptions & PCRE2_UCP) != 0)
     {
+    PCRE2_SPTR endptr = p + length;
+
     /* Match characters up to the end of the reference. NOTE: the number of
     code units matched may differ, because in UTF-8 there are some characters
     whose upper and lower case codes have different numbers of bytes. For
@@ -390,16 +394,25 @@ if (caseless)
     bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
     sequence of two of the latter. It is important, therefore, to check the
     length along the reference, not along the subject (earlier code did this
-    wrong). */
-
-    PCRE2_SPTR endptr = p + length;
+    wrong). UCP without UTF uses Unicode properties, but not UTF encoding. */
+     
     while (p < endptr)
       {
       uint32_t c, d;
       const ucd_record *ur;
       if (eptr >= mb->end_subject) return 1;   /* Partial match */
-      GETCHARINC(c, eptr);
-      GETCHARINC(d, p);
+      
+      if (utf)
+        { 
+        GETCHARINC(c, eptr);
+        GETCHARINC(d, p);
+        }
+      else
+        {
+        c = *eptr++;
+        d = *p++;  
+        }
+ 
       ur = GET_UCD(d);
       if (c != d && c != (uint32_t)((int)d + ur->other_case))
         {
@@ -415,7 +428,7 @@ if (caseless)
   else
 #endif
 
-  /* Not in UTF mode */
+  /* Not in UTF or UCP mode */
     {
     for (; length > 0; length--)
       {
@@ -432,7 +445,8 @@ if (caseless)
   }
 
 /* In the caseful case, we can just compare the code units, whether or not we
-are in UTF mode. When partial matching, we have to do this unit-by-unit. */
+are in UTF and/or UCP mode. When partial matching, we have to do this unit by
+unit. */
 
 else
   {
diff --git a/testdata/testinput12 b/testdata/testinput12
index fbfacc5..9b4f8d3 100644
--- a/testdata/testinput12
+++ b/testdata/testinput12
@@ -530,6 +530,20 @@
 /X(\x{121})Y/ucp,replace=>\U$1<,substitute_extended
     X\x{121}Y
 
+/s/i,ucp
+    \x{17f} 
+
+/s/i,utf
+    \x{17f} 
+
+/[^s]/i,ucp
+\= Expect no match
+    \x{17f} 
+
+/[^s]/i,utf
+\= Expect no match
+    \x{17f} 
+
 # ---------------------------------------------------- 
 
 # End of testinput12
diff --git a/testdata/testinput5 b/testdata/testinput5
index b3fcfef..ecac178 100644
--- a/testdata/testinput5
+++ b/testdata/testinput5
@@ -2181,4 +2181,7 @@
 
 /(|ß)7/caseless,ucp
 
+/(\xc1)\1/i,ucp
+    \xc1\xe1\=no_jit
+    
 # End of testinput5
diff --git a/testdata/testoutput12-16 b/testdata/testoutput12-16
index 9689ab1..84c4858 100644
--- a/testdata/testoutput12-16
+++ b/testdata/testoutput12-16
@@ -1761,6 +1761,24 @@ Subject length lower bound = 1
     X\x{121}Y
  1: >\x{120}<
 
+/s/i,ucp
+    \x{17f} 
+ 0: \x{17f}
+
+/s/i,utf
+    \x{17f} 
+ 0: \x{17f}
+
+/[^s]/i,ucp
+\= Expect no match
+    \x{17f} 
+No match
+
+/[^s]/i,utf
+\= Expect no match
+    \x{17f} 
+No match
+
 # ---------------------------------------------------- 
 
 # End of testinput12
diff --git a/testdata/testoutput12-32 b/testdata/testoutput12-32
index c51c517..03b6e39 100644
--- a/testdata/testoutput12-32
+++ b/testdata/testoutput12-32
@@ -1759,6 +1759,24 @@ Subject length lower bound = 1
     X\x{121}Y
  1: >\x{120}<
 
+/s/i,ucp
+    \x{17f} 
+ 0: \x{17f}
+
+/s/i,utf
+    \x{17f} 
+ 0: \x{17f}
+
+/[^s]/i,ucp
+\= Expect no match
+    \x{17f} 
+No match
+
+/[^s]/i,utf
+\= Expect no match
+    \x{17f} 
+No match
+
 # ---------------------------------------------------- 
 
 # End of testinput12
diff --git a/testdata/testoutput5 b/testdata/testoutput5
index 5bdf873..2ff8516 100644
--- a/testdata/testoutput5
+++ b/testdata/testoutput5
@@ -4940,4 +4940,9 @@ Subject length lower bound = 3
 
 /(|ß)7/caseless,ucp
 
+/(\xc1)\1/i,ucp
+    \xc1\xe1\=no_jit
+ 0: \xc1\xe1
+ 1: \xc1
+    
 # End of testinput5
author	ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>	2020-02-26 16:53:39 +0000
committer	ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>	2020-02-26 16:53:39 +0000
commit	4ea71d47b6608e27759ebc39359c980d788db68a (patch)
tree	99e2f55a345ce3a3a73ff6959d5adcf867534d93
parent	dd37c27fef3592a02fef0a8d9b98a78268717fe9 (diff)
download	pcre2-4ea71d47b6608e27759ebc39359c980d788db68a.tar.gz