Make POSIX character class parsing more like Perl.

git-svn-id: svn://vcs.exim.org/pcre/code/trunk@295 2f5784b3-3f2a-0410-8824-cb99058d5e15
author: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2007-12-31 17:00:24 +0000
committer: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2007-12-31 17:00:24 +0000
commit: 737057052f43b1189428dad91d1a18e726481e8b (patch)
tree: 0480304774c584c944dc806c5ab60080bd8d9e1e
parent: ee516e9c1ac3d79f58e350f6fc8b5a9dbca0c6f4 (diff)
download: pcre-737057052f43b1189428dad91d1a18e726481e8b.tar.gz
8 files changed, 126 insertions, 15 deletions
diff --git a/ChangeLog b/ChangeLog
index cecb0fe..a712d43 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,7 +1,7 @@
 ChangeLog for PCRE
 ------------------
 
-Version 7.5 29-Dec-07
+Version 7.5 31-Dec-07
 ---------------------
 
 1.  Applied a patch from Craig: "This patch makes it possible to 'ignore'
@@ -118,6 +118,19 @@ Version 7.5 29-Dec-07
     newline. The bug was in the code for advancing after a failed match and 
     checking that the new position followed a newline. It was not taking 
     account of UTF-8 characters correctly.
+    
+23. PCRE was behaving differently from Perl in the way it recognized POSIX 
+    character classes. PCRE was not treating the sequence [:...:] as a 
+    character class unless the ... were all letters. Perl, however, seems to 
+    allow any characters between [: and :], though of course it rejects as 
+    unknown any "names" that contain non-letters, because all the known class 
+    names consist only of letters. Thus, Perl gives an error for [[:1234:]], 
+    for example, whereas PCRE did not - it did not recognize a POSIX character 
+    class. This seemed a bit dangerous, so the code has been changed to be 
+    closer to Perl. The behaviour is not identical to Perl, because PCRE will 
+    diagnose an unknown class for [[:l\ower:]], for example, where Perl will 
+    treat it as [[:lower:]]. However, PCRE does now give "unknown" errors where 
+    Perl does, and where it didn't before. 
 
 
 Version 7.4 21-Sep-07
diff --git a/pcre_compile.c b/pcre_compile.c
index e962d19..33b2c48 100644
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -1737,30 +1737,49 @@ return TRUE;
 *************************************************/
 
 /* This function is called when the sequence "[:" or "[." or "[=" is
-encountered in a character class. It checks whether this is followed by an
-optional ^ and then a sequence of letters, terminated by a matching ":]" or
-".]" or "=]".
+encountered in a character class. It checks whether this is followed by a
+sequence of characters terminated by a matching ":]" or ".]" or "=]". If we 
+reach an unescaped ']' without the special preceding character, return FALSE.
+
+Originally, this function only recognized a sequence of letters between the 
+terminators, but it seems that Perl recognizes any sequence of characters, 
+though of course unknown POSIX names are subsequently rejected. Perl gives an 
+"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE 
+didn't consider this to be a POSIX class. Likewise for [:1234:]. 
+
+The problem in trying to be exactly like Perl is in the handling of escapes. We 
+have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX 
+class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code 
+below handles the special case of \], but does not try to do any other escape 
+processing. This makes it different from Perl for cases such as [:l\ower:] 
+where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
+"l\ower". This is a lesser evil than not diagnosing bad classes when Perl does, 
+I think.
 
-Argument:
+Arguments:
   ptr      pointer to the initial [
   endptr   where to return the end pointer
-  cd       pointer to compile data
 
 Returns:   TRUE or FALSE
 */
 
 static BOOL
-check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
+check_posix_syntax(const uschar *ptr, const uschar **endptr)
 {
 int terminator;          /* Don't combine these lines; the Solaris cc */
 terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
-if (*(++ptr) == '^') ptr++;
-while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
-if (*ptr == terminator && ptr[1] == ']')
+for (++ptr; *ptr != 0; ptr++)
   {
-  *endptr = ptr;
-  return TRUE;
-  }
+  if (*ptr == '\\' && ptr[1] == ']') ptr++; else
+    { 
+    if (*ptr == ']') return FALSE; 
+    if (*ptr == terminator && ptr[1] == ']')
+      {
+      *endptr = ptr;
+      return TRUE;
+      }     
+    }   
+  } 
 return FALSE;
 }
 
@@ -2620,7 +2639,7 @@ for (;; ptr++)
     they are encountered at the top level, so we'll do that too. */
 
     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
-        check_posix_syntax(ptr, &tempptr, cd))
+        check_posix_syntax(ptr, &tempptr))
       {
       *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
       goto FAILED;
@@ -2706,7 +2725,7 @@ for (;; ptr++)
 
       if (c == '[' &&
           (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
-          check_posix_syntax(ptr, &tempptr, cd))
+          check_posix_syntax(ptr, &tempptr))
         {
         BOOL local_negate = FALSE;
         int posix_class, taboffset, tabopt;
diff --git a/testdata/testinput1 b/testdata/testinput1
index c7f264a..91554ff 100644
--- a/testdata/testinput1
+++ b/testdata/testinput1
@@ -4025,4 +4025,15 @@
 /(  (?(1)0|)*   )/x
     abcd
 
+/[[:abcd:xyz]]/
+    a]
+    :] 
+    
+/[abc[:x\]pqr]/
+    a
+    [
+    :
+    ]
+    p    
+
 / End of testinput1 /
diff --git a/testdata/testinput2 b/testdata/testinput2
index 32c5ef2..1a13fa8 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -2571,4 +2571,22 @@ a random value. /Ix
 
 /(?<=\w)(?R)/
 
+/[[:foo:]]/
+
+/[[:1234:]]/
+
+/[[:f\oo:]]/
+
+/[[: :]]/
+
+/[[:...:]]/
+
+/[[:l\ower:]]/
+
+/[[:abc\:]]/
+
+/[abc[:x\]pqr:]]/
+
+/[[:a\dz:]]/
+
 / End of testinput2 /
diff --git a/testdata/testinput5 b/testdata/testinput5
index f5f61cd..75a4857 100644
--- a/testdata/testinput5
+++ b/testdata/testinput5
@@ -459,4 +459,6 @@ can't tell the difference.) --/
     a\x{2029}b
     \x61\xe2\x80\xa9\x62 
 
+/[[:a\x{100}b:]]/8
+
 / End of testinput5 /
diff --git a/testdata/testoutput1 b/testdata/testoutput1
index 83682f7..9b8b268 100644
--- a/testdata/testoutput1
+++ b/testdata/testoutput1
@@ -6587,4 +6587,22 @@ No match
  0: 
  1: 
 
+/[[:abcd:xyz]]/
+    a]
+ 0: a]
+    :] 
+ 0: :]
+    
+/[abc[:x\]pqr]/
+    a
+ 0: a
+    [
+ 0: [
+    :
+ 0: :
+    ]
+ 0: ]
+    p    
+ 0: p
+
 / End of testinput1 /
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index 72ac0af..dba227f 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -9358,4 +9358,31 @@ Failed: recursive call could loop indefinitely at offset 10
 /(?<=\w)(?R)/
 Failed: recursive call could loop indefinitely at offset 10
 
+/[[:foo:]]/
+Failed: unknown POSIX class name at offset 3
+
+/[[:1234:]]/
+Failed: unknown POSIX class name at offset 3
+
+/[[:f\oo:]]/
+Failed: unknown POSIX class name at offset 3
+
+/[[: :]]/
+Failed: unknown POSIX class name at offset 3
+
+/[[:...:]]/
+Failed: unknown POSIX class name at offset 3
+
+/[[:l\ower:]]/
+Failed: unknown POSIX class name at offset 3
+
+/[[:abc\:]]/
+Failed: unknown POSIX class name at offset 3
+
+/[abc[:x\]pqr:]]/
+Failed: unknown POSIX class name at offset 6
+
+/[[:a\dz:]]/
+Failed: unknown POSIX class name at offset 3
+
 / End of testinput2 /
diff --git a/testdata/testoutput5 b/testdata/testoutput5
index 122cada..abbe1c8 100644
--- a/testdata/testoutput5
+++ b/testdata/testoutput5
@@ -1605,4 +1605,7 @@ No match
     \x61\xe2\x80\xa9\x62 
 No match
 
+/[[:a\x{100}b:]]/8
+Failed: unknown POSIX class name at offset 3
+
 / End of testinput5 /
author	ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2007-12-31 17:00:24 +0000
committer	ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2007-12-31 17:00:24 +0000
commit	737057052f43b1189428dad91d1a18e726481e8b (patch)
tree	0480304774c584c944dc806c5ab60080bd8d9e1e
parent	ee516e9c1ac3d79f58e350f6fc8b5a9dbca0c6f4 (diff)
download	pcre-737057052f43b1189428dad91d1a18e726481e8b.tar.gz