summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2007-12-31 17:00:24 +0000
committer ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2007-12-31 17:00:24 +0000
commit 737057052f43b1189428dad91d1a18e726481e8b (patch)
tree 0480304774c584c944dc806c5ab60080bd8d9e1e
parent ee516e9c1ac3d79f58e350f6fc8b5a9dbca0c6f4 (diff)
download pcre-737057052f43b1189428dad91d1a18e726481e8b.tar.gz
Make POSIX character class parsing more like Perl.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@295 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- ChangeLog 15
-rw-r--r-- pcre_compile.c 47
-rw-r--r-- testdata/testinput1 11
-rw-r--r-- testdata/testinput2 18
-rw-r--r-- testdata/testinput5 2
-rw-r--r-- testdata/testoutput1 18
-rw-r--r-- testdata/testoutput2 27
-rw-r--r-- testdata/testoutput5 3
8 files changed, 126 insertions, 15 deletions
diff --git a/ChangeLog b/ChangeLog
index cecb0fe..a712d43 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,7 +1,7 @@
ChangeLog for PCRE
------------------
-Version 7.5 29-Dec-07
+Version 7.5 31-Dec-07
---------------------
1. Applied a patch from Craig: "This patch makes it possible to 'ignore'
@@ -118,6 +118,19 @@ Version 7.5 29-Dec-07
newline. The bug was in the code for advancing after a failed match and
checking that the new position followed a newline. It was not taking
account of UTF-8 characters correctly.
+
+23. PCRE was behaving differently from Perl in the way it recognized POSIX
+ character classes. PCRE was not treating the sequence [:...:] as a
+ character class unless the ... were all letters. Perl, however, seems to
+ allow any characters between [: and :], though of course it rejects as
+ unknown any "names" that contain non-letters, because all the known class
+ names consist only of letters. Thus, Perl gives an error for [[:1234:]],
+ for example, whereas PCRE did not - it did not recognize a POSIX character
+ class. This seemed a bit dangerous, so the code has been changed to be
+ closer to Perl. The behaviour is not identical to Perl, because PCRE will
+ diagnose an unknown class for, for example, [[:l\ower:]] where Perl will
+ treat it as [[:lower:]]. However, PCRE does now give "unknown" errors where
+ Perl does, and where it didn't before.
Version 7.4 21-Sep-07
diff --git a/pcre_compile.c b/pcre_compile.c
index e962d19..33b2c48 100644
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -1737,30 +1737,49 @@ return TRUE;
*************************************************/
/* This function is called when the sequence "[:" or "[." or "[=" is
-encountered in a character class. It checks whether this is followed by an
-optional ^ and then a sequence of letters, terminated by a matching ":]" or
-".]" or "=]".
+encountered in a character class. It checks whether this is followed by a
+sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
+reach an unescaped ']' without the special preceding character, return FALSE.
+
+Originally, this function only recognized a sequence of letters between the
+terminators, but it seems that Perl recognizes any sequence of characters,
+though of course unknown POSIX names are subsequently rejected. Perl gives an
+"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
+didn't consider this to be a POSIX class. Likewise for [:1234:].
+
+The problem in trying to be exactly like Perl is in the handling of escapes. We
+have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
+class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
+below handles the special case of \], but does not try to do any other escape
+processing. This makes it different from Perl for cases such as [:l\ower:]
+where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
+"l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
+I think.
-Argument:
+Arguments:
ptr pointer to the initial [
endptr where to return the end pointer
- cd pointer to compile data
Returns: TRUE or FALSE
*/
static BOOL
-check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
+check_posix_syntax(const uschar *ptr, const uschar **endptr)
{
int terminator; /* Don't combine these lines; the Solaris cc */
terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
-if (*(++ptr) == '^') ptr++;
-while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
-if (*ptr == terminator && ptr[1] == ']')
+for (++ptr; *ptr != 0; ptr++)
{
- *endptr = ptr;
- return TRUE;
- }
+ if (*ptr == '\\' && ptr[1] == ']') ptr++; else
+ {
+ if (*ptr == ']') return FALSE;
+ if (*ptr == terminator && ptr[1] == ']')
+ {
+ *endptr = ptr;
+ return TRUE;
+ }
+ }
+ }
return FALSE;
}
@@ -2620,7 +2639,7 @@ for (;; ptr++)
they are encountered at the top level, so we'll do that too. */
if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
- check_posix_syntax(ptr, &tempptr, cd))
+ check_posix_syntax(ptr, &tempptr))
{
*errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
goto FAILED;
@@ -2706,7 +2725,7 @@ for (;; ptr++)
if (c == '[' &&
(ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
- check_posix_syntax(ptr, &tempptr, cd))
+ check_posix_syntax(ptr, &tempptr))
{
BOOL local_negate = FALSE;
int posix_class, taboffset, tabopt;
diff --git a/testdata/testinput1 b/testdata/testinput1
index c7f264a..91554ff 100644
--- a/testdata/testinput1
+++ b/testdata/testinput1
@@ -4025,4 +4025,15 @@
/( (?(1)0|)* )/x
abcd
+/[[:abcd:xyz]]/
+ a]
+ :]
+
+/[abc[:x\]pqr]/
+ a
+ [
+ :
+ ]
+ p
+
/ End of testinput1 /
diff --git a/testdata/testinput2 b/testdata/testinput2
index 32c5ef2..1a13fa8 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -2571,4 +2571,22 @@ a random value. /Ix
/(?<=\w)(?R)/
+/[[:foo:]]/
+
+/[[:1234:]]/
+
+/[[:f\oo:]]/
+
+/[[: :]]/
+
+/[[:...:]]/
+
+/[[:l\ower:]]/
+
+/[[:abc\:]]/
+
+/[abc[:x\]pqr:]]/
+
+/[[:a\dz:]]/
+
/ End of testinput2 /
diff --git a/testdata/testinput5 b/testdata/testinput5
index f5f61cd..75a4857 100644
--- a/testdata/testinput5
+++ b/testdata/testinput5
@@ -459,4 +459,6 @@ can't tell the difference.) --/
a\x{2029}b
\x61\xe2\x80\xa9\x62
+/[[:a\x{100}b:]]/8
+
/ End of testinput5 /
diff --git a/testdata/testoutput1 b/testdata/testoutput1
index 83682f7..9b8b268 100644
--- a/testdata/testoutput1
+++ b/testdata/testoutput1
@@ -6587,4 +6587,22 @@ No match
0:
1:
+/[[:abcd:xyz]]/
+ a]
+ 0: a]
+ :]
+ 0: :]
+
+/[abc[:x\]pqr]/
+ a
+ 0: a
+ [
+ 0: [
+ :
+ 0: :
+ ]
+ 0: ]
+ p
+ 0: p
+
/ End of testinput1 /
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index 72ac0af..dba227f 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -9358,4 +9358,31 @@ Failed: recursive call could loop indefinitely at offset 10
/(?<=\w)(?R)/
Failed: recursive call could loop indefinitely at offset 10
+/[[:foo:]]/
+Failed: unknown POSIX class name at offset 3
+
+/[[:1234:]]/
+Failed: unknown POSIX class name at offset 3
+
+/[[:f\oo:]]/
+Failed: unknown POSIX class name at offset 3
+
+/[[: :]]/
+Failed: unknown POSIX class name at offset 3
+
+/[[:...:]]/
+Failed: unknown POSIX class name at offset 3
+
+/[[:l\ower:]]/
+Failed: unknown POSIX class name at offset 3
+
+/[[:abc\:]]/
+Failed: unknown POSIX class name at offset 3
+
+/[abc[:x\]pqr:]]/
+Failed: unknown POSIX class name at offset 6
+
+/[[:a\dz:]]/
+Failed: unknown POSIX class name at offset 3
+
/ End of testinput2 /
diff --git a/testdata/testoutput5 b/testdata/testoutput5
index 122cada..abbe1c8 100644
--- a/testdata/testoutput5
+++ b/testdata/testoutput5
@@ -1605,4 +1605,7 @@ No match
\x61\xe2\x80\xa9\x62
No match
+/[[:a\x{100}b:]]/8
+Failed: unknown POSIX class name at offset 3
+
/ End of testinput5 /