diff options
author | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2007-12-31 17:00:24 +0000 |
---|---|---|
committer | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2007-12-31 17:00:24 +0000 |
commit | 737057052f43b1189428dad91d1a18e726481e8b (patch) | |
tree | 0480304774c584c944dc806c5ab60080bd8d9e1e | |
parent | ee516e9c1ac3d79f58e350f6fc8b5a9dbca0c6f4 (diff) | |
download | pcre-737057052f43b1189428dad91d1a18e726481e8b.tar.gz |
Make POSIX character class parsing more like Perl.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@295 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- | ChangeLog | 15 | ||||
-rw-r--r-- | pcre_compile.c | 47 | ||||
-rw-r--r-- | testdata/testinput1 | 11 | ||||
-rw-r--r-- | testdata/testinput2 | 18 | ||||
-rw-r--r-- | testdata/testinput5 | 2 | ||||
-rw-r--r-- | testdata/testoutput1 | 18 | ||||
-rw-r--r-- | testdata/testoutput2 | 27 | ||||
-rw-r--r-- | testdata/testoutput5 | 3 |
8 files changed, 126 insertions, 15 deletions
@@ -1,7 +1,7 @@ ChangeLog for PCRE ------------------ -Version 7.5 29-Dec-07 +Version 7.5 31-Dec-07 --------------------- 1. Applied a patch from Craig: "This patch makes it possible to 'ignore' @@ -118,6 +118,19 @@ Version 7.5 29-Dec-07 newline. The bug was in the code for advancing after a failed match and checking that the new position followed a newline. It was not taking account of UTF-8 characters correctly. + +23. PCRE was behaving differently from Perl in the way it recognized POSIX + character classes. PCRE was not treating the sequence [:...:] as a + character class unless the ... were all letters. Perl, however, seems to + allow any characters between [: and :], though of course it rejects as + unknown any "names" that contain non-letters, because all the known class + names consist only of letters. Thus, Perl gives an error for [[:1234:]], + for example, whereas PCRE did not - it did not recognize a POSIX character + class. This seemed a bit dangerous, so the code has been changed to be + closer to Perl. The behaviour is not identical to Perl, because PCRE will + diagnose an unknown class for, for example, [[:l\ower:]] where Perl will + treat it as [[:lower:]]. However, PCRE does now give "unknown" errors where + Perl does, and where it didn't before. Version 7.4 21-Sep-07 diff --git a/pcre_compile.c b/pcre_compile.c index e962d19..33b2c48 100644 --- a/pcre_compile.c +++ b/pcre_compile.c @@ -1737,30 +1737,49 @@ return TRUE; *************************************************/ /* This function is called when the sequence "[:" or "[." or "[=" is -encountered in a character class. It checks whether this is followed by an -optional ^ and then a sequence of letters, terminated by a matching ":]" or -".]" or "=]". +encountered in a character class. It checks whether this is followed by a +sequence of characters terminated by a matching ":]" or ".]" or "=]". If we +reach an unescaped ']' without the special preceding character, return FALSE. 
+ +Originally, this function only recognized a sequence of letters between the +terminators, but it seems that Perl recognizes any sequence of characters, +though of course unknown POSIX names are subsequently rejected. Perl gives an +"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE +didn't consider this to be a POSIX class. Likewise for [:1234:]. + +The problem in trying to be exactly like Perl is in the handling of escapes. We +have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX +class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code +below handles the special case of \], but does not try to do any other escape +processing. This makes it different from Perl for cases such as [:l\ower:] +where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize +"l\ower". This is a lesser evil than not diagnosing bad classes when Perl does, +I think. -Argument: +Arguments: ptr pointer to the initial [ endptr where to return the end pointer - cd pointer to compile data Returns: TRUE or FALSE */ static BOOL -check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd) +check_posix_syntax(const uschar *ptr, const uschar **endptr) { int terminator; /* Don't combine these lines; the Solaris cc */ terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */ -if (*(++ptr) == '^') ptr++; -while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++; -if (*ptr == terminator && ptr[1] == ']') +for (++ptr; *ptr != 0; ptr++) { - *endptr = ptr; - return TRUE; - } + if (*ptr == '\\' && ptr[1] == ']') ptr++; else + { + if (*ptr == ']') return FALSE; + if (*ptr == terminator && ptr[1] == ']') + { + *endptr = ptr; + return TRUE; + } + } + } return FALSE; } @@ -2620,7 +2639,7 @@ for (;; ptr++) they are encountered at the top level, so we'll do that too. */ if ((ptr[1] == ':' || ptr[1] == '.' 
|| ptr[1] == '=') && - check_posix_syntax(ptr, &tempptr, cd)) + check_posix_syntax(ptr, &tempptr)) { *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31; goto FAILED; @@ -2706,7 +2725,7 @@ for (;; ptr++) if (c == '[' && (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') && - check_posix_syntax(ptr, &tempptr, cd)) + check_posix_syntax(ptr, &tempptr)) { BOOL local_negate = FALSE; int posix_class, taboffset, tabopt; diff --git a/testdata/testinput1 b/testdata/testinput1 index c7f264a..91554ff 100644 --- a/testdata/testinput1 +++ b/testdata/testinput1 @@ -4025,4 +4025,15 @@ /( (?(1)0|)* )/x abcd +/[[:abcd:xyz]]/ + a] + :] + +/[abc[:x\]pqr]/ + a + [ + : + ] + p + / End of testinput1 / diff --git a/testdata/testinput2 b/testdata/testinput2 index 32c5ef2..1a13fa8 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -2571,4 +2571,22 @@ a random value. /Ix /(?<=\w)(?R)/ +/[[:foo:]]/ + +/[[:1234:]]/ + +/[[:f\oo:]]/ + +/[[: :]]/ + +/[[:...:]]/ + +/[[:l\ower:]]/ + +/[[:abc\:]]/ + +/[abc[:x\]pqr:]]/ + +/[[:a\dz:]]/ + / End of testinput2 / diff --git a/testdata/testinput5 b/testdata/testinput5 index f5f61cd..75a4857 100644 --- a/testdata/testinput5 +++ b/testdata/testinput5 @@ -459,4 +459,6 @@ can't tell the difference.) 
--/ a\x{2029}b \x61\xe2\x80\xa9\x62 +/[[:a\x{100}b:]]/8 + / End of testinput5 / diff --git a/testdata/testoutput1 b/testdata/testoutput1 index 83682f7..9b8b268 100644 --- a/testdata/testoutput1 +++ b/testdata/testoutput1 @@ -6587,4 +6587,22 @@ No match 0: 1: +/[[:abcd:xyz]]/ + a] + 0: a] + :] + 0: :] + +/[abc[:x\]pqr]/ + a + 0: a + [ + 0: [ + : + 0: : + ] + 0: ] + p + 0: p + / End of testinput1 / diff --git a/testdata/testoutput2 b/testdata/testoutput2 index 72ac0af..dba227f 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -9358,4 +9358,31 @@ Failed: recursive call could loop indefinitely at offset 10 /(?<=\w)(?R)/ Failed: recursive call could loop indefinitely at offset 10 +/[[:foo:]]/ +Failed: unknown POSIX class name at offset 3 + +/[[:1234:]]/ +Failed: unknown POSIX class name at offset 3 + +/[[:f\oo:]]/ +Failed: unknown POSIX class name at offset 3 + +/[[: :]]/ +Failed: unknown POSIX class name at offset 3 + +/[[:...:]]/ +Failed: unknown POSIX class name at offset 3 + +/[[:l\ower:]]/ +Failed: unknown POSIX class name at offset 3 + +/[[:abc\:]]/ +Failed: unknown POSIX class name at offset 3 + +/[abc[:x\]pqr:]]/ +Failed: unknown POSIX class name at offset 6 + +/[[:a\dz:]]/ +Failed: unknown POSIX class name at offset 3 + / End of testinput2 / diff --git a/testdata/testoutput5 b/testdata/testoutput5 index 122cada..abbe1c8 100644 --- a/testdata/testoutput5 +++ b/testdata/testoutput5 @@ -1605,4 +1605,7 @@ No match \x61\xe2\x80\xa9\x62 No match +/[[:a\x{100}b:]]/8 +Failed: unknown POSIX class name at offset 3 + / End of testinput5 / |