Extend auto-possessify to handle some Unicode properties.

git-svn-id: svn://vcs.exim.org/pcre/code/trunk@532 2f5784b3-3f2a-0410-8824-cb99058d5e15
author: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2010-06-01 16:21:42 +0000
committer: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2010-06-01 16:21:42 +0000
commit: 1d63547e1bfd241769214b9f80fde16de397f51f (patch)
tree: e31edef7db79dc7de47c9d9a18f436cedf86dcbc
parent: ae22c8e3671a04fd2a4b1c823de25aa472f651d0 (diff)
download: pcre-1d63547e1bfd241769214b9f80fde16de397f51f.tar.gz
4 files changed, 295 insertions, 46 deletions
diff --git a/ChangeLog b/ChangeLog
index f1fdc6d..3656f65 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -58,7 +58,9 @@ Version 8.10 03 May-2010
 14. pcre_study() now recognizes \h, \v, and \R when constructing a bit map of 
     possible starting bytes for non-anchored patterns. 
     
-15. The "auto-possessify" feature of pcre_compile() now recognizes \R. 
+15. Extended the "auto-possessify" feature of pcre_compile(). It now recognizes
+    \R, and also a number of cases that involve Unicode properties, both 
+    explicit and implicit when PCRE_UCP is set.
 
 16. If a repeated Unicode property match (e.g. \p{Lu}*) was used with non-UTF-8
     input, it could crash or give wrong results if characters with values 
@@ -70,7 +72,7 @@ Version 8.10 03 May-2010
     
 18. Added a check for running out of memory when PCRE is compiled with
     --disable-stack-for-recursion. 
- 
+    
 
 
 Version 8.02 19-Mar-2010
diff --git a/pcre_compile.c b/pcre_compile.c
index c0486b7..f8c673d 100644
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -2392,6 +2392,69 @@ for (++c; c <= d; c++)
 
 return TRUE;
 }
+
+
+
+/*************************************************
+*        Check a character and a property        *
+*************************************************/
+
+/* This function is called by check_auto_possessive() when a property item
+is adjacent to a fixed character. It returns TRUE when the character cannot
+match the property item; an unrecognized property type yields FALSE.
+Arguments:
+  c            the character
+  ptype        the property type (a PT_xxx value)
+  pdata        the data for the type (ignored by types that do not need it)
+  negated      TRUE if it's a negated property (\P or \p{^)
+  
+Returns:       TRUE if auto-possessifying is OK
+*/    
+
+static BOOL
+check_char_prop(int c, int ptype, int pdata, BOOL negated)
+{
+const ucd_record *prop = GET_UCD(c);  /* Unicode data record for c */
+switch(ptype)
+  {
+  case PT_LAMP:     /* Cased letter: chartype is Lu, Ll, or Lt */
+  return (prop->chartype == ucp_Lu ||
+          prop->chartype == ucp_Ll ||
+          prop->chartype == ucp_Lt) == negated;
+
+  case PT_GC:       /* pdata is compared with the general category of c */
+  return (pdata == _pcre_ucp_gentype[prop->chartype]) == negated;
+
+  case PT_PC:       /* pdata is compared with the exact chartype of c */
+  return (pdata == prop->chartype) == negated;
+
+  case PT_SC:       /* pdata is compared with the script of c */
+  return (pdata == prop->script) == negated;
+
+  /* These are specials: composite tests that do not use pdata */
+
+  case PT_ALNUM:    /* General category L or N */
+  return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
+          _pcre_ucp_gentype[prop->chartype] == ucp_N) == negated;
+
+  case PT_SPACE:    /* Perl space: category Z, or HT, NL, FF, CR (not VT) */
+  return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
+          c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
+          == negated;
+
+  case PT_PXSPACE:  /* POSIX space: category Z, or HT, NL, VT, FF, CR */
+  return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
+          c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
+          c == CHAR_FF || c == CHAR_CR)
+          == negated;
+
+  case PT_WORD:     /* General category L or N, or an underscore */
+  return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
+          _pcre_ucp_gentype[prop->chartype] == ucp_N ||
+          c == CHAR_UNDERSCORE) == negated;
+  }
+return FALSE;  /* Unrecognized property type: do not possessify */
+}
 #endif  /* SUPPORT_UCP */
 
 
@@ -2405,10 +2468,8 @@ whether the next thing could possibly match the repeated item. If not, it makes
 sense to automatically possessify the repeated item.
 
 Arguments:
-  op_code       the repeated op code
-  this          data for this item, depends on the opcode
+  previous      pointer to the repeated opcode
   utf8          TRUE in UTF-8 mode
-  utf8_char     used for utf8 character bytes, NULL if not relevant
   ptr           next character in pattern
   options       options bits
   cd            contains pointers to tables etc.
@@ -2417,10 +2478,11 @@ Returns:        TRUE if possessifying is wanted
 */
 
 static BOOL
-check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
-  const uschar *ptr, int options, compile_data *cd)
+check_auto_possessive(const uschar *previous, BOOL utf8, const uschar *ptr, 
+  int options, compile_data *cd)
 {
-int next;
+int c, next;
+int op_code = *previous++;
 
 /* Skip whitespace and comments in extended mode */
 
@@ -2481,33 +2543,30 @@ if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
   strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
     return FALSE;
 
-/* Now compare the next item with the previous opcode. If the previous is a
-positive single character match, "item" either contains the character or, if
-"item" is greater than 127 in utf8 mode, the character's bytes are in
-utf8_char. */
-
-
-/* Handle cases when the next item is a character. */
+/* Now compare the next item with the previous opcode. First, handle cases when
+the next item is a character. */
 
 if (next >= 0) switch(op_code)
   {
   case OP_CHAR:
-#ifdef SUPPORT_UTF8
-  if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
+#ifdef SUPPORT_UTF8  
+  GETCHARTEST(c, previous);
 #else
-  (void)(utf8_char);  /* Keep compiler happy by referencing function argument */
-#endif
-  return item != next;
+  c = *previous;
+#endif      
+  return c != next; 
 
   /* For CHARNC (caseless character) we must check the other case. If we have
   Unicode property support, we can use it to test the other case of
   high-valued characters. */
 
   case OP_CHARNC:
-#ifdef SUPPORT_UTF8
-  if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
-#endif
-  if (item == next) return FALSE;
+#ifdef SUPPORT_UTF8  
+  GETCHARTEST(c, previous);
+#else
+  c = *previous;
+#endif      
+  if (c == next) return FALSE;
 #ifdef SUPPORT_UTF8
   if (utf8)
     {
@@ -2518,16 +2577,16 @@ if (next >= 0) switch(op_code)
 #else
     othercase = NOTACHAR;
 #endif
-    return (unsigned int)item != othercase;
+    return (unsigned int)c != othercase;
     }
   else
 #endif  /* SUPPORT_UTF8 */
-  return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
+  return (c != cd->fcc[next]);  /* Non-UTF-8 mode */
 
-  /* For OP_NOT, "item" must be a single-byte character. */
+  /* For OP_NOT, its data is always a single-byte character. */
 
   case OP_NOT:
-  if (item == next) return TRUE;
+  if ((c = *previous) == next) return TRUE;
   if ((options & PCRE_CASELESS) == 0) return FALSE;
 #ifdef SUPPORT_UTF8
   if (utf8)
@@ -2539,11 +2598,11 @@ if (next >= 0) switch(op_code)
 #else
     othercase = NOTACHAR;
 #endif
-    return (unsigned int)item == othercase;
+    return (unsigned int)c == othercase;
     }
   else
 #endif  /* SUPPORT_UTF8 */
-  return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
+  return (c == cd->fcc[next]);  /* Non-UTF-8 mode */
   
   /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set. 
   When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */ 
@@ -2611,6 +2670,14 @@ if (next >= 0) switch(op_code)
     return op_code != OP_NOT_VSPACE;
     }
 
+#ifdef SUPPORT_UCP
+  case OP_PROP:
+  return check_char_prop(next, previous[0], previous[1], FALSE);
+ 
+  case OP_NOTPROP:
+  return check_char_prop(next, previous[0], previous[1], TRUE);
+#endif
+
   default:
   return FALSE;
   }
@@ -2619,38 +2686,41 @@ if (next >= 0) switch(op_code)
 /* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP 
 is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are 
 generated only when PCRE_UCP is *not* set, that is, when only ASCII 
-characteristics are recognized. */
+characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are 
+replaced by OP_PROP codes when PCRE_UCP is set. */
 
 switch(op_code)
   {
   case OP_CHAR:
   case OP_CHARNC:
-#ifdef SUPPORT_UTF8
-  if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
-#endif
+#ifdef SUPPORT_UTF8  
+  GETCHARTEST(c, previous);
+#else
+  c = *previous;
+#endif      
   switch(-next)
     {
     case ESC_d:
-    return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
+    return c > 127 || (cd->ctypes[c] & ctype_digit) == 0;
 
     case ESC_D:
-    return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
+    return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0;
 
     case ESC_s:
-    return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
+    return c > 127 || (cd->ctypes[c] & ctype_space) == 0;
 
     case ESC_S:
-    return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
+    return c <= 127 && (cd->ctypes[c] & ctype_space) != 0;
 
     case ESC_w:
-    return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
+    return c > 127 || (cd->ctypes[c] & ctype_word) == 0;
 
     case ESC_W:
-    return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
+    return c <= 127 && (cd->ctypes[c] & ctype_word) != 0;
 
     case ESC_h:
     case ESC_H:
-    switch(item)
+    switch(c)
       {
       case 0x09:
       case 0x20:
@@ -2678,7 +2748,7 @@ switch(op_code)
 
     case ESC_v:
     case ESC_V:
-    switch(item)
+    switch(c)
       {
       case 0x0a:
       case 0x0b:
@@ -2691,10 +2761,61 @@ switch(op_code)
       default:
       return -next == ESC_v;
       }
+      
+    /* When PCRE_UCP is set, these values get generated for \d etc. Find 
+    their substitutions and process them. The result will always be either 
+    -ESC_p or -ESC_P. Then fall through to process those values. */
+  
+#ifdef SUPPORT_UCP
+    case ESC_du:
+    case ESC_DU:
+    case ESC_wu:
+    case ESC_WU:
+    case ESC_su:
+    case ESC_SU:
+      {
+      int temperrorcode = 0;
+      ptr = substitutes[-next - ESC_DU];
+      next = check_escape(&ptr, &temperrorcode, 0, options, FALSE);
+      if (temperrorcode != 0) return FALSE;
+      ptr++;    /* For compatibility */
+      }
+    /* Fall through */   
+
+    case ESC_p:
+    case ESC_P:
+      {
+      int ptype, pdata, errorcodeptr;
+      BOOL negated;  
+        
+      ptr--;      /* Make ptr point at the p or P */
+      ptype = get_ucp(&ptr, &negated, &pdata, &errorcodeptr);
+      if (ptype < 0) return FALSE;
+      ptr++;      /* Point past the final curly ket */
+      
+      /* If the property item is optional, we have to give up. (When generated
+      from \d etc by PCRE_UCP, this test will have been applied much earlier,
+      to the original \d etc. At this point, ptr will point to a zero byte.) */
+      
+      if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
+        strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
+          return FALSE;
+      
+      /* Do the property check. */
+      
+      return check_char_prop(c, ptype, pdata, (next == -ESC_P) != negated);
+      } 
+#endif
 
     default:
     return FALSE;
     }
+    
+  /* In principle, support for Unicode properties should be integrated here as 
+  well. It means re-organizing the above code so as to get hold of the property 
+  values before switching on the op-code. However, I wonder how many patterns 
+  combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set, 
+  these op-codes are never generated.) */ 
 
   case OP_DIGIT:
   return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
@@ -3998,8 +4119,7 @@ for (;; ptr++)
 
       if (!possessive_quantifier &&
           repeat_max < 0 &&
-          check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
-            options, cd))
+          check_auto_possessive(previous, utf8, ptr + 1, options, cd))
         {
         repeat_type = 0;    /* Force greedy */
         possessive_quantifier = TRUE;
@@ -4020,7 +4140,7 @@ for (;; ptr++)
       c = previous[1];
       if (!possessive_quantifier &&
           repeat_max < 0 &&
-          check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
+          check_auto_possessive(previous, utf8, ptr + 1, options, cd))
         {
         repeat_type = 0;    /* Force greedy */
         possessive_quantifier = TRUE;
@@ -4044,7 +4164,7 @@ for (;; ptr++)
 
       if (!possessive_quantifier &&
           repeat_max < 0 &&
-          check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
+          check_auto_possessive(previous, utf8, ptr + 1, options, cd))
         {
         repeat_type = 0;    /* Force greedy */
         possessive_quantifier = TRUE;
diff --git a/testdata/testinput12 b/testdata/testinput12
index 42efabe..78ecf64 100644
--- a/testdata/testinput12
+++ b/testdata/testinput12
@@ -485,4 +485,22 @@ of case for anything other than the ASCII letters. --/
 
 /\p{Xps}*/SI
 
+/\p{Lu}+9\p{Lu}+B\p{Lu}+b/BZ
+
+/\p{^Lu}+9\p{^Lu}+B\p{^Lu}+b/BZ
+
+/\P{Lu}+9\P{Lu}+B\P{Lu}+b/BZ
+
+/\p{Han}+X\p{Greek}+\x{370}/BZ8
+
+/\p{Xan}+!\p{Xan}+A/BZ
+
+/\p{Xsp}+!\p{Xsp}\t/BZ
+
+/\p{Xps}+!\p{Xps}\t/BZ
+
+/\p{Xwd}+!\p{Xwd}_/BZ
+
+/A+\p{N}A+\dB+\p{N}*B+\d*/WBZ
+
 /-- End of testinput12 --/
diff --git a/testdata/testoutput12 b/testdata/testoutput12
index 5bd83ab..ab9dbfd 100644
--- a/testdata/testoutput12
+++ b/testdata/testoutput12
@@ -1067,4 +1067,113 @@ No need char
 Subject length lower bound = 0
 No set of starting bytes
 
+/\p{Lu}+9\p{Lu}+B\p{Lu}+b/BZ
+------------------------------------------------------------------
+        Bra
+        prop Lu ++
+        9
+        prop Lu +
+        B
+        prop Lu ++
+        b
+        Ket
+        End
+------------------------------------------------------------------
+
+/\p{^Lu}+9\p{^Lu}+B\p{^Lu}+b/BZ
+------------------------------------------------------------------
+        Bra
+        notprop Lu +
+        9
+        notprop Lu ++
+        B
+        notprop Lu +
+        b
+        Ket
+        End
+------------------------------------------------------------------
+
+/\P{Lu}+9\P{Lu}+B\P{Lu}+b/BZ
+------------------------------------------------------------------
+        Bra
+        notprop Lu +
+        9
+        notprop Lu ++
+        B
+        notprop Lu +
+        b
+        Ket
+        End
+------------------------------------------------------------------
+
+/\p{Han}+X\p{Greek}+\x{370}/BZ8
+------------------------------------------------------------------
+        Bra
+        prop Han ++
+        X
+        prop Greek +
+        \x{370}
+        Ket
+        End
+------------------------------------------------------------------
+
+/\p{Xan}+!\p{Xan}+A/BZ
+------------------------------------------------------------------
+        Bra
+        prop Xan ++
+        !
+        prop Xan +
+        A
+        Ket
+        End
+------------------------------------------------------------------
+
+/\p{Xsp}+!\p{Xsp}\t/BZ
+------------------------------------------------------------------
+        Bra
+        prop Xsp ++
+        !
+        prop Xsp
+        \x09
+        Ket
+        End
+------------------------------------------------------------------
+
+/\p{Xps}+!\p{Xps}\t/BZ
+------------------------------------------------------------------
+        Bra
+        prop Xps ++
+        !
+        prop Xps
+        \x09
+        Ket
+        End
+------------------------------------------------------------------
+
+/\p{Xwd}+!\p{Xwd}_/BZ
+------------------------------------------------------------------
+        Bra
+        prop Xwd ++
+        !
+        prop Xwd
+        _
+        Ket
+        End
+------------------------------------------------------------------
+
+/A+\p{N}A+\dB+\p{N}*B+\d*/WBZ
+------------------------------------------------------------------
+        Bra
+        A++
+        prop N
+        A++
+        prop Nd
+        B+
+        prop N *+
+        B+
+        prop Nd *
+        Ket
+        End
+------------------------------------------------------------------
+
 /-- End of testinput12 --/
author	ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2010-06-01 16:21:42 +0000
committer	ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2010-06-01 16:21:42 +0000
commit	1d63547e1bfd241769214b9f80fde16de397f51f (patch)
tree	e31edef7db79dc7de47c9d9a18f436cedf86dcbc
parent	ae22c8e3671a04fd2a4b1c823de25aa472f651d0 (diff)
download	pcre-1d63547e1bfd241769214b9f80fde16de397f51f.tar.gz