Add U+0085 and U+180E to what \s matches in UCP mode, to match Perl.

git-svn-id: svn://vcs.exim.org/pcre/code/trunk@1376 2f5784b3-3f2a-0410-8824-cb99058d5e15
author: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2013-10-12 18:02:11 +0000
committer: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2013-10-12 18:02:11 +0000
commit: d588bf144394bc85610f20333985554b9919d735 (patch)
tree: 8bae0bea9f9079050cb6b429b95c3a09f2e899f8
parent: dfde99fb13ffbdac43d4c284cd91114d962c5978 (diff)
download: pcre-d588bf144394bc85610f20333985554b9919d735.tar.gz
10 files changed, 210 insertions, 87 deletions
diff --git a/ChangeLog b/ChangeLog
index 070bd56..11a8fcd 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -126,7 +126,12 @@ Version 8.34 xx-xxxx-201x
 25. If PCRE_AUTO_CALLOUT and PCRE_UCP were set for a pattern that contained 
     character types such as \d or \w, too many callouts were inserted, and the 
     data that they returned was rubbish.  
-
+    
+26. In UCP mode, \s was not matching two of the characters that Perl matches,
+    namely NEL (U+0085) and MONGOLIAN VOWEL SEPARATOR (U+180E), though they
+    were matched by \v and \h respectively. The code has been refactored so
+    that the lists of horizontal and vertical whitespace characters used for
+    \h and \v (which are defined in only one place) are now also used for \s.
  
 
 Version 8.33 28-May-2013
diff --git a/doc/pcrepattern.3 b/doc/pcrepattern.3
index 676e61c..3019a22 100644
--- a/doc/pcrepattern.3
+++ b/doc/pcrepattern.3
@@ -1,4 +1,4 @@
-.TH PCREPATTERN 3 "08 October 2013" "PCRE 8.34"
+.TH PCREPATTERN 3 "12 October 2013" "PCRE 8.34"
 .SH NAME
 PCRE - Perl-compatible regular expressions
 .SH "PCRE REGULAR EXPRESSION DETAILS"
@@ -543,9 +543,9 @@ efficiency reasons. However, if PCRE is compiled with Unicode property support,
 and the PCRE_UCP option is set, the behaviour is changed so that Unicode
 properties are used to determine character types, as follows:
 .sp
-  \ed  any character that \ep{Nd} matches (decimal digit)
-  \es  any character that \ep{Z} matches, plus HT, LF, FF, CR
-  \ew  any character that \ep{L} or \ep{N} matches, plus underscore
+  \ed  any character that matches \ep{Nd} (decimal digit)
+  \es  any character that matches \ep{Z} or \eh or \ev
+  \ew  any character that matches \ep{L} or \ep{N}, plus underscore
 .sp
 The upper case escapes match the inverse sets of characters. Note that \ed
 matches only decimal digits, whereas \ew matches any Unicode digit, as well as
@@ -1309,7 +1309,7 @@ are:
   lower    lower case letters
   print    printing characters, including space
   punct    printing characters, excluding letters and digits and space
-  space    white space (not quite the same as \es)
+  space    white space (the same as \es from PCRE 8.34)
   upper    upper case letters
   word     "word" characters (same as \ew)
   xdigit   hexadecimal digits
@@ -3176,6 +3176,6 @@ Cambridge CB2 3QH, England.
 .rs
 .sp
 .nf
-Last updated: 08 October 2013
+Last updated: 12 October 2013
 Copyright (c) 1997-2013 University of Cambridge.
 .fi
diff --git a/pcre_compile.c b/pcre_compile.c
index 35aee65..d56b7f8 100644
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -2709,10 +2709,16 @@ switch(ptype)
 
   case PT_SPACE:    /* Perl space */
   case PT_PXSPACE:  /* POSIX space */
-  return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
-          c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
-          c == CHAR_FF || c == CHAR_CR)
-          == negated;
+  switch(c)
+    {
+    HSPACE_CASES:
+    VSPACE_CASES:
+    return negated;
+    
+    default:
+    return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated;
+    }       
+  break;  /* Control never reaches here */
 
   case PT_WORD:
   return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
diff --git a/pcre_dfa_exec.c b/pcre_dfa_exec.c
index c64f7c3..19fba5b 100644
--- a/pcre_dfa_exec.c
+++ b/pcre_dfa_exec.c
@@ -1104,9 +1104,17 @@ for (;;)
     
           case PT_SPACE:    /* Perl space */
           case PT_PXSPACE:  /* POSIX space */
-          OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
-               c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
-               c == CHAR_FF || c == CHAR_CR;
+          switch(c)
+            {
+            HSPACE_CASES:
+            VSPACE_CASES:
+            OK = TRUE;
+            break;
+             
+            default:      
+            OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
+            break;
+            }  
           break;
 
           case PT_WORD:
@@ -1354,9 +1362,17 @@ for (;;)
     
           case PT_SPACE:    /* Perl space */
           case PT_PXSPACE:  /* POSIX space */
-          OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
-               c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
-               c == CHAR_FF || c == CHAR_CR;
+          switch(c)
+            {
+            HSPACE_CASES:
+            VSPACE_CASES:
+            OK = TRUE;
+            break;
+             
+            default:      
+            OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
+            break;
+            }  
           break;
 
           case PT_WORD:
@@ -1598,9 +1614,17 @@ for (;;)
     
           case PT_SPACE:    /* Perl space */
           case PT_PXSPACE:  /* POSIX space */
-          OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
-               c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
-               c == CHAR_FF || c == CHAR_CR;
+          switch(c)
+            {
+            HSPACE_CASES:
+            VSPACE_CASES:
+            OK = TRUE;
+            break;
+             
+            default:      
+            OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
+            break;
+            }  
           break;
 
           case PT_WORD:
@@ -1867,9 +1891,17 @@ for (;;)
     
           case PT_SPACE:    /* Perl space */
           case PT_PXSPACE:  /* POSIX space */
-          OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
-               c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
-               c == CHAR_FF || c == CHAR_CR;
+          switch(c)
+            {
+            HSPACE_CASES:
+            VSPACE_CASES:
+            OK = TRUE;
+            break;
+             
+            default:      
+            OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
+            break;
+            }  
           break;
 
           case PT_WORD:
diff --git a/pcre_exec.c b/pcre_exec.c
index 36f0a7a..7311aac 100644
--- a/pcre_exec.c
+++ b/pcre_exec.c
@@ -1281,12 +1281,12 @@ for (;;)
 
     case OP_COND:
     case OP_SCOND:
-    
-    /* The variable codelink will be added to ecode when the condition is 
-    false, to get to the second branch. Setting it to the offset to the ALT 
-    or KET, then incrementing ecode achieves this effect. We now have ecode 
+
+    /* The variable codelink will be added to ecode when the condition is
+    false, to get to the second branch. Setting it to the offset to the ALT
+    or KET, then incrementing ecode achieves this effect. We now have ecode
     pointing to the condition or callout. */
- 
+
     codelink = GET(ecode, 1);   /* Offset to the second branch */
     ecode += 1 + LINK_SIZE;     /* From this opcode */
 
@@ -1322,10 +1322,10 @@ for (;;)
         if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
         if (rrc < 0) RRETURN(rrc);
         }
-        
-      /* Advance ecode past the callout, so it now points to the condition. We 
+
+      /* Advance ecode past the callout, so it now points to the condition. We
       must adjust codelink so that the value of ecode+codelink is unchanged. */
-       
+
       ecode += PRIV(OP_lengths)[OP_CALLOUT];
       codelink -= PRIV(OP_lengths)[OP_CALLOUT];
       }
@@ -1334,7 +1334,7 @@ for (;;)
 
     condition = FALSE;
     switch(condcode = *ecode)
-      { 
+      {
       case OP_RREF:         /* Numbered group recursion test */
       if (md->recursive != NULL)     /* Not recursing => FALSE */
         {
@@ -1345,8 +1345,8 @@ for (;;)
 
       case OP_DNRREF:       /* Duplicate named group recursion test */
       if (md->recursive != NULL)
-        {       
-        int count = GET2(ecode, 1 + IMM2_SIZE); 
+        {
+        int count = GET2(ecode, 1 + IMM2_SIZE);
         pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
         while (count-- > 0)
           {
@@ -1355,7 +1355,7 @@ for (;;)
           if (condition) break;
           slot += md->name_entry_size;
           }
-        }   
+        }
       break;
 
       case OP_CREF:         /* Numbered group used test */
@@ -1365,7 +1365,7 @@ for (;;)
 
       case OP_DNCREF:      /* Duplicate named group used test */
         {
-        int count = GET2(ecode, 1 + IMM2_SIZE); 
+        int count = GET2(ecode, 1 + IMM2_SIZE);
         pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
         while (count-- > 0)
           {
@@ -1375,7 +1375,7 @@ for (;;)
           slot += md->name_entry_size;
           }
         }
-      break;   
+      break;
 
       case OP_DEF:     /* DEFINE - always false */
       break;
@@ -1383,8 +1383,8 @@ for (;;)
       /* The condition is an assertion. Call match() to evaluate it - setting
       md->match_function_type to MATCH_CONDASSERT causes it to stop at the end
       of an assertion. */
-      
-      default: 
+
+      default:
       md->match_function_type = MATCH_CONDASSERT;
       RMATCH(eptr, ecode, offset_top, md, NULL, RM3);
       if (rrc == MATCH_MATCH)
@@ -1392,30 +1392,30 @@ for (;;)
         if (md->end_offset_top > offset_top)
           offset_top = md->end_offset_top;  /* Captures may have happened */
         condition = TRUE;
-         
-        /* Advance ecode past the assertion to the start of the first branch, 
+
+        /* Advance ecode past the assertion to the start of the first branch,
         but adjust it so that the general choosing code below works. */
- 
+
         ecode += GET(ecode, 1);
         while (*ecode == OP_ALT) ecode += GET(ecode, 1);
-        ecode += 1 + LINK_SIZE - PRIV(OP_lengths)[condcode]; 
+        ecode += 1 + LINK_SIZE - PRIV(OP_lengths)[condcode];
         }
 
       /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
-      assertion; it is therefore treated as NOMATCH. Any other return is an 
+      assertion; it is therefore treated as NOMATCH. Any other return is an
       error. */
 
       else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
         {
         RRETURN(rrc);         /* Need braces because of following else */
         }
-      break;   
+      break;
       }
-      
+
     /* Choose branch according to the condition */
-      
+
     ecode += condition? PRIV(OP_lengths)[condcode] : codelink;
- 
+
     /* We are now at the branch that is to be obeyed. As there is only one, we
     can use tail recursion to avoid using another stack frame, except when
     there is unlimited repeat of a possibly empty group. In the latter case, a
@@ -1425,7 +1425,7 @@ for (;;)
     creating two alternatives. If a THEN is encountered in the branch, it
     propagates out to the enclosing alternative (unless nested in a deeper set
     of alternatives, of course). */
-    
+
     if (condition || ecode[-(1+LINK_SIZE)] == OP_ALT)
       {
       if (op != OP_SCOND)
@@ -2577,14 +2577,21 @@ for (;;)
         /* Perl space used to exclude VT, but from Perl 5.18 it is included,
         which means that Perl space and POSIX space are now identical. PCRE
         was changed at release 8.34. */
-    
+
         case PT_SPACE:    /* Perl space */
         case PT_PXSPACE:  /* POSIX space */
-        if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
-             c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
-             c == CHAR_FF || c == CHAR_CR)
-               == (op == OP_NOTPROP))
-          RRETURN(MATCH_NOMATCH);
+        switch(c)
+          {
+          HSPACE_CASES:
+          VSPACE_CASES:
+          if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
+          break;
+
+          default:
+          if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) ==
+            (op == OP_NOTPROP)) RRETURN(MATCH_NOMATCH);
+          break;
+          }
         break;
 
         case PT_WORD:
@@ -2669,27 +2676,27 @@ for (;;)
 
     Otherwise, set the length to the length of what was matched by the
     referenced subpattern.
-    
-    The OP_REF and OP_REFI opcodes are used for a reference to a numbered group 
-    or to a non-duplicated named group. For a duplicated named group, OP_DNREF 
-    and OP_DNREFI are used. In this case we must scan the list of groups to 
+
+    The OP_REF and OP_REFI opcodes are used for a reference to a numbered group
+    or to a non-duplicated named group. For a duplicated named group, OP_DNREF
+    and OP_DNREFI are used. In this case we must scan the list of groups to
     which the name refers, and use the first one that is set. */
-    
+
     case OP_DNREF:
     case OP_DNREFI:
     caseless = op == OP_DNREFI;
       {
-      int count = GET2(ecode, 1+IMM2_SIZE); 
+      int count = GET2(ecode, 1+IMM2_SIZE);
       pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
       ecode += 1 + 2*IMM2_SIZE;
-      
+
       while (count-- > 0)
         {
         offset = GET2(slot, 0) << 1;
         if (offset < offset_top && md->offset_vector[offset] >= 0) break;
         slot += md->name_entry_size;
         }
-      if (count < 0)     
+      if (count < 0)
         length = (md->jscript_compat)? 0 : -1;
       else
         length = md->offset_vector[offset+1] - md->offset_vector[offset];
@@ -4200,7 +4207,7 @@ for (;;)
           /* Perl space used to exclude VT, but from Perl 5.18 it is included,
           which means that Perl space and POSIX space are now identical. PCRE
           was changed at release 8.34. */
-    
+
           case PT_SPACE:    /* Perl space */
           case PT_PXSPACE:  /* POSIX space */
           for (i = 1; i <= min; i++)
@@ -4211,10 +4218,18 @@ for (;;)
               RRETURN(MATCH_NOMATCH);
               }
             GETCHARINCTEST(c, eptr);
-            if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
-                 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
-                   == prop_fail_result)
-              RRETURN(MATCH_NOMATCH);
+            switch(c)
+              {
+              HSPACE_CASES:
+              VSPACE_CASES:
+              if (prop_fail_result) RRETURN(MATCH_NOMATCH);
+              break;
+
+              default:
+              if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
+                RRETURN(MATCH_NOMATCH);
+              break;
+              }
             }
           break;
 
@@ -4937,7 +4952,7 @@ for (;;)
           /* Perl space used to exclude VT, but from Perl 5.18 it is included,
           which means that Perl space and POSIX space are now identical. PCRE
           was changed at release 8.34. */
-    
+
           case PT_SPACE:    /* Perl space */
           case PT_PXSPACE:  /* POSIX space */
           for (fi = min;; fi++)
@@ -4951,10 +4966,18 @@ for (;;)
               RRETURN(MATCH_NOMATCH);
               }
             GETCHARINCTEST(c, eptr);
-            if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
-                 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
-                   == prop_fail_result)
-              RRETURN(MATCH_NOMATCH);
+            switch(c)
+              {
+              HSPACE_CASES:
+              VSPACE_CASES:
+              if (prop_fail_result) RRETURN(MATCH_NOMATCH);
+              break;
+
+              default:
+              if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
+                RRETURN(MATCH_NOMATCH);
+              break;
+              }
             }
           /* Control never gets here */
 
@@ -5441,7 +5464,7 @@ for (;;)
           /* Perl space used to exclude VT, but from Perl 5.18 it is included,
           which means that Perl space and POSIX space are now identical. PCRE
           was changed at release 8.34. */
-    
+
           case PT_SPACE:    /* Perl space */
           case PT_PXSPACE:  /* POSIX space */
           for (i = min; i < max; i++)
@@ -5453,12 +5476,21 @@ for (;;)
               break;
               }
             GETCHARLENTEST(c, eptr, len);
-            if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
-                 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
-                 == prop_fail_result)
+            switch(c)
+              {
+              HSPACE_CASES:
+              VSPACE_CASES:
+              if (prop_fail_result) goto ENDLOOP99;  /* Break the loop */
+              break;
+
+              default:
+              if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
+                goto ENDLOOP99;   /* Break the loop */
               break;
+              }
             eptr+= len;
             }
+          ENDLOOP99:
           break;
 
           case PT_WORD:
@@ -5572,12 +5604,12 @@ for (;;)
         /* eptr is now past the end of the maximum run */
 
         if (possessive) continue;    /* No backtracking */
-         
+
         for(;;)
           {
-          int lgb, rgb; 
+          int lgb, rgb;
           PCRE_PUCHAR fptr;
-            
+
           if (eptr == pp) goto TAIL_RECURSE;   /* At start of char run */
           RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
@@ -5585,7 +5617,7 @@ for (;;)
           /* Backtracking over an extended grapheme cluster involves inspecting
           the previous two characters (if present) to see if a break is
           permitted between them. */
- 
+
           eptr--;
           if (!utf) c = *eptr; else
             {
@@ -5603,7 +5635,7 @@ for (;;)
               BACKCHAR(fptr);
               GETCHAR(c, fptr);
               }
-            lgb = UCD_GRAPHBREAK(c);        
+            lgb = UCD_GRAPHBREAK(c);
             if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
             eptr = fptr;
             rgb = lgb;
@@ -6127,7 +6159,7 @@ for (;;)
               eptr[-1] == CHAR_CR) eptr--;
           }
         }
-        
+
       /* Control never gets here */
       }
 
diff --git a/pcre_xclass.c b/pcre_xclass.c
index 335b6aa..9b85166 100644
--- a/pcre_xclass.c
+++ b/pcre_xclass.c
@@ -165,10 +165,18 @@ while ((t = *data++) != XCL_END)
     
       case PT_SPACE:    /* Perl space */
       case PT_PXSPACE:  /* POSIX space */
-      if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
-           c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
-           c == CHAR_FF || c == CHAR_CR) == (t == XCL_PROP))
-        return !negated;
+      switch(c)
+        {
+        HSPACE_CASES:
+        VSPACE_CASES:
+        if (t == XCL_PROP) return !negated; 
+        break;
+        
+        default:
+        if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == (t == XCL_PROP))
+          return !negated;
+        break;
+        }
       break;
 
       case PT_WORD:
diff --git a/testdata/testinput10 b/testdata/testinput10
index 38ae260..93ddb3a 100644
--- a/testdata/testinput10
+++ b/testdata/testinput10
@@ -1408,4 +1408,12 @@
     `abc
     \x{1234}abc
 
+/^A\s+Z/8W
+    A\x{2005}Z
+    A\x{85}\x{180e}\x{2005}Z
+
+/^A[\s]+Z/8W
+    A\x{2005}Z
+    A\x{85}\x{180e}\x{2005}Z
+
 /-- End of testinput10 --/ 
diff --git a/testdata/testinput6 b/testdata/testinput6
index 503af5e..19ec56f 100644
--- a/testdata/testinput6
+++ b/testdata/testinput6
@@ -1328,4 +1328,12 @@
     1234
     123  
     
+/^A\s+Z/8W
+    A\x{2005}Z
+    A\x{85}\x{180e}\x{2005}Z
+
+/^A[\s]+Z/8W
+    A\x{2005}Z
+    A\x{85}\x{180e}\x{2005}Z
+
 /-- End of testinput6 --/
diff --git a/testdata/testoutput10 b/testdata/testoutput10
index 196b0c8..3dd8b68 100644
--- a/testdata/testoutput10
+++ b/testdata/testoutput10
@@ -2597,4 +2597,16 @@ No match
     \x{1234}abc
 No match
 
+/^A\s+Z/8W
+    A\x{2005}Z
+ 0: A\x{2005}Z
+    A\x{85}\x{180e}\x{2005}Z
+ 0: A\x{85}\x{180e}\x{2005}Z
+
+/^A[\s]+Z/8W
+    A\x{2005}Z
+ 0: A\x{2005}Z
+    A\x{85}\x{180e}\x{2005}Z
+ 0: A\x{85}\x{180e}\x{2005}Z
+
 /-- End of testinput10 --/ 
diff --git a/testdata/testoutput6 b/testdata/testoutput6
index 2e9f82b..dd3910d 100644
--- a/testdata/testoutput6
+++ b/testdata/testoutput6
@@ -2151,4 +2151,16 @@ No match
     123  
 No match
     
+/^A\s+Z/8W
+    A\x{2005}Z
+ 0: A\x{2005}Z
+    A\x{85}\x{180e}\x{2005}Z
+ 0: A\x{85}\x{180e}\x{2005}Z
+
+/^A[\s]+Z/8W
+    A\x{2005}Z
+ 0: A\x{2005}Z
+    A\x{85}\x{180e}\x{2005}Z
+ 0: A\x{85}\x{180e}\x{2005}Z
+
 /-- End of testinput6 --/
author	ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2013-10-12 18:02:11 +0000
committer	ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2013-10-12 18:02:11 +0000
commit	d588bf144394bc85610f20333985554b9919d735 (patch)
tree	8bae0bea9f9079050cb6b429b95c3a09f2e899f8
parent	dfde99fb13ffbdac43d4c284cd91114d962c5978 (diff)
download	pcre-d588bf144394bc85610f20333985554b9919d735.tar.gz