summary refs log tree commit diff
diff options
context:
space:
mode:
author ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2013-11-06 18:00:09 +0000
committer ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2013-11-06 18:00:09 +0000
commit 912f34d2b213b26d217473769a99fade935c989b (patch)
tree 9b452650ac16bfb4e0a41fb33c674347f6a68f20
parent 66ee048b1fbddd62e2079aea5e5a9090f3b5b164 (diff)
download pcre-912f34d2b213b26d217473769a99fade935c989b.tar.gz
Give errors for [A-\d] and [a-[:digit:]] etc.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@1392 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- ChangeLog 4
-rw-r--r-- doc/pcreapi.3 2
-rw-r--r-- doc/pcrecompat.3 13
-rw-r--r-- doc/pcrepattern.3 9
-rw-r--r-- pcre_compile.c 56
-rw-r--r-- pcre_internal.h 2
-rw-r--r-- pcreposix.c 3
-rw-r--r-- testdata/testinput1 7
-rw-r--r-- testdata/testinput2 17
-rw-r--r-- testdata/testinput8 7
-rw-r--r-- testdata/testoutput1 12
-rw-r--r-- testdata/testoutput2 27
-rw-r--r-- testdata/testoutput8 12
13 files changed, 106 insertions, 65 deletions
diff --git a/ChangeLog b/ChangeLog
index 6d61196..3582685 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -159,6 +159,10 @@ Version 8.34 xx-xxxx-201x
This limit is imposed to control the amount of system stack used at compile
time. It can be changed at build time by --with-parens-nest-limit=xxx or
the equivalent in CMake.
+
+34. Character classes such as [A-\d] or [a-[:digit:]] now cause compile-time
+ errors. Perl warns for these when in warning mode, but PCRE has no facility
+ for giving warnings.
Version 8.33 28-May-2013
diff --git a/doc/pcreapi.3 b/doc/pcreapi.3
index ed71b77..feabfb3 100644
--- a/doc/pcreapi.3
+++ b/doc/pcreapi.3
@@ -978,6 +978,8 @@ have fallen out of use. To avoid confusion, they have not been re-used.
79 non-hex character in \ex{} (closing brace missing?)
80 non-octal character in \eo{} (closing brace missing?)
81 missing opening brace after \eo
+ 82 parentheses are too deeply nested
+ 83 invalid range in character class
.sp
The numbers 32 and 10000 in errors 48 and 49 are defaults; different values may
be used if the limits were changed when PCRE was built.
diff --git a/doc/pcrecompat.3 b/doc/pcrecompat.3
index 2c109ea..1f12cd3 100644
--- a/doc/pcrecompat.3
+++ b/doc/pcrecompat.3
@@ -1,4 +1,4 @@
-.TH PCRECOMPAT 3 "19 March 2013" "PCRE 8.33"
+.TH PCRECOMPAT 3 "05 November 2013" "PCRE 8.34"
.SH NAME
PCRE - Perl-compatible regular expressions
.SH "DIFFERENCES BETWEEN PCRE AND PERL"
@@ -125,13 +125,18 @@ between the ( and ? at the start of a subpattern. If the /x modifier is set,
Perl allows white space between ( and ? but PCRE never does, even if the
PCRE_EXTENDED option is set.
.P
-16. In PCRE, the upper/lower case character properties Lu and Ll are not
+16. Perl, when in warning mode, gives warnings for character classes such as
+[A-\ed] or [a-[:digit:]]. It then treats the hyphens as literals. PCRE has no
+warning features, so it gives an error in these cases because they are almost
+certainly user mistakes.
+.P
+17. In PCRE, the upper/lower case character properties Lu and Ll are not
affected when case-independent matching is specified. For example, \ep{Lu}
always matches an upper case letter. I think Perl has changed in this respect;
in the release at the time of writing (5.16), \ep{Lu} and \ep{Ll} match all
letters, regardless of case, when case independence is specified.
.P
-17. PCRE provides some extensions to the Perl regular expression facilities.
+18. PCRE provides some extensions to the Perl regular expression facilities.
Perl 5.10 includes new features that are not in earlier versions of Perl, some
of which (such as named parentheses) have been in PCRE for some time. This list
is with respect to Perl 5.10:
@@ -190,6 +195,6 @@ Cambridge CB2 3QH, England.
.rs
.sp
.nf
-Last updated: 19 March 2013
+Last updated: 05 November 2013
Copyright (c) 1997-2013 University of Cambridge.
.fi
diff --git a/doc/pcrepattern.3 b/doc/pcrepattern.3
index 9eca02c..550ea15 100644
--- a/doc/pcrepattern.3
+++ b/doc/pcrepattern.3
@@ -1235,7 +1235,9 @@ The minus (hyphen) character can be used to specify a range of characters in a
character class. For example, [d-m] matches any letter between d and m,
inclusive. If a minus character is required in a class, it must be escaped with
a backslash or appear in a position where it cannot be interpreted as
-indicating a range, typically as the first or last character in the class.
+indicating a range, typically as the first or last character in the class, or
+immediately after a range. For example, [b-d-z] matches letters in the range b
+to d, a hyphen character, or z.
.P
It is not possible to have the literal character "]" as the end character of a
range. A pattern such as [W-]46] is interpreted as a class of two characters
@@ -1245,6 +1247,11 @@ the end of range, so [W-\e]46] is interpreted as a class containing a range
followed by two other characters. The octal or hexadecimal representation of
"]" can also be used to end a range.
.P
+An error is generated if a POSIX character class (see below) or an escape
+sequence other than one that defines a single character appears at a point
+where a range ending character is expected. For example, [z-\exff] is valid,
+but [A-\ed] and [A-[:digit:]] are not.
+.P
Ranges operate in the collating sequence of character values. They can also be
used for characters specified numerically, for example [\e000-\e037]. Ranges
can include any characters that are valid for the current mode.
diff --git a/pcre_compile.c b/pcre_compile.c
index 4e93386..8f8a79c 100644
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -532,6 +532,7 @@ static const char error_texts[] =
"non-octal character in \\o{} (closing brace missing?)\0"
"missing opening brace after \\o\0"
"parentheses are too deeply nested\0"
+ "invalid range in character class\0"
;
/* Table to identify digits and hex digits. This is used when compiling
@@ -3793,7 +3794,7 @@ class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
below handles the special case of \], but does not try to do any other escape
processing. This makes it different from Perl for cases such as [:l\ower:]
where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
-"l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,
+"l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
I think.
A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
@@ -5143,28 +5144,45 @@ for (;; ptr++)
else
#endif
d = *ptr; /* Not UTF-8 mode */
-
- /* The second part of a range can be a single-character escape, but
- not any of the other escapes. Perl 5.6 treats a hyphen as a literal
- in such circumstances. */
-
- if (!inescq && d == CHAR_BACKSLASH)
- {
- int descape;
- descape = check_escape(&ptr, &d, errorcodeptr, cd->bracount, options, TRUE);
- if (*errorcodeptr != 0) goto FAILED;
-
- /* \b is backspace; any other special means the '-' was literal. */
-
- if (descape != 0)
+
+ /* The second part of a range can be a single-character escape
+ sequence, but not any of the other escapes. Perl treats a hyphen as a
+ literal in such circumstances. However, in Perl's warning mode, a
+ warning is given, so PCRE now faults it as it is almost certainly a
+ mistake on the user's part. */
+
+ if (!inescq)
+ {
+ if (d == CHAR_BACKSLASH)
{
- if (descape == ESC_b) d = CHAR_BS; else
+ int descape;
+ descape = check_escape(&ptr, &d, errorcodeptr, cd->bracount, options, TRUE);
+ if (*errorcodeptr != 0) goto FAILED;
+
+ /* 0 means a character was put into d; \b is backspace; any other
+ special causes an error. */
+
+ if (descape != 0)
{
- ptr = oldptr;
- goto CLASS_SINGLE_CHARACTER; /* A few lines below */
+ if (descape == ESC_b) d = CHAR_BS; else
+ {
+ *errorcodeptr = ERR83;
+ goto FAILED;
+ }
}
}
- }
+
+ /* A hyphen followed by a POSIX class is treated in the same way. */
+
+ else if (d == CHAR_LEFT_SQUARE_BRACKET &&
+ (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
+ ptr[1] == CHAR_EQUALS_SIGN) &&
+ check_posix_syntax(ptr, &tempptr))
+ {
+ *errorcodeptr = ERR83;
+ goto FAILED;
+ }
+ }
/* Check that the two values are in the correct order. Optimize
one-character ranges. */
diff --git a/pcre_internal.h b/pcre_internal.h
index 9ded34d..515c023 100644
--- a/pcre_internal.h
+++ b/pcre_internal.h
@@ -2335,7 +2335,7 @@ enum { ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9,
ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59,
ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69,
ERR70, ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79,
- ERR80, ERR81, ERR82, ERRCOUNT };
+ ERR80, ERR81, ERR82, ERR83, ERRCOUNT };
/* JIT compiling modes. The function list is indexed by them. */
diff --git a/pcreposix.c b/pcreposix.c
index 2f27c01..2f7bf98 100644
--- a/pcreposix.c
+++ b/pcreposix.c
@@ -168,7 +168,8 @@ static const int eint[] = {
/* 80 */
REG_BADPAT, /* non-octal character in \o{} (closing brace missing?) */
REG_BADPAT, /* missing opening brace after \o */
- REG_BADPAT /* parentheses too deeply nested */
+ REG_BADPAT, /* parentheses too deeply nested */
+ REG_BADPAT /* invalid range in character class */
};
/* Table of texts corresponding to POSIX error codes */
diff --git a/testdata/testinput1 b/testdata/testinput1
index 2f40648..59024eb 100644
--- a/testdata/testinput1
+++ b/testdata/testinput1
@@ -3661,13 +3661,6 @@
/a*/g
abbab
-/^[a-\d]/
- abcde
- -things
- 0digit
- *** Failers
- bcdef
-
/^[\d-a]/
abcde
-things
diff --git a/testdata/testinput2 b/testdata/testinput2
index a0e5405..c4b0d22 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -3492,6 +3492,8 @@ with \Y. ---/
/a[B-\Nc]/
+/a[B\Nc]/
+
/(a)(?2){0,1999}?(b)/
/(a)(?(DEFINE)(b))(?2){0,1999}?(?2)/
@@ -3977,4 +3979,19 @@ backtracking verbs. --/
/a{4}+/BZOi
+/[a-[:digit:]]+/
+
+/[A-[:digit:]]+/
+
+/[a-[.xxx.]]+/
+
+/[a-[=xxx=]]+/
+
+/[a-[!xxx!]]+/
+
+/[A-[!xxx!]]+/
+ A]]]
+
+/[a-\d]+/
+
/-- End of testinput2 --/
diff --git a/testdata/testinput8 b/testdata/testinput8
index fb8e536..bb2747b 100644
--- a/testdata/testinput8
+++ b/testdata/testinput8
@@ -3830,13 +3830,6 @@
/a*/g
abbab
-/^[a-\d]/
- abcde
- -things
- 0digit
- *** Failers
- bcdef
-
/^[\d-a]/
abcde
-things
diff --git a/testdata/testoutput1 b/testdata/testoutput1
index fb8ccc8..976d8a7 100644
--- a/testdata/testoutput1
+++ b/testdata/testoutput1
@@ -5991,18 +5991,6 @@ No match
0:
0:
-/^[a-\d]/
- abcde
- 0: a
- -things
- 0: -
- 0digit
- 0: 0
- *** Failers
-No match
- bcdef
-No match
-
/^[\d-a]/
abcde
0: a
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index 35fda7d..978430e 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -11944,7 +11944,10 @@ No match
Failed: \N is not supported in a class at offset 3
/a[B-\Nc]/
-Failed: \N is not supported in a class at offset 5
+Failed: invalid range in character class at offset 5
+
+/a[B\Nc]/
+Failed: \N is not supported in a class at offset 4
/(a)(?2){0,1999}?(b)/
@@ -13987,4 +13990,26 @@ Failed: non-hex character in \x{} (closing brace missing?) at offset 5
End
------------------------------------------------------------------
+/[a-[:digit:]]+/
+Failed: invalid range in character class at offset 3
+
+/[A-[:digit:]]+/
+Failed: invalid range in character class at offset 3
+
+/[a-[.xxx.]]+/
+Failed: invalid range in character class at offset 3
+
+/[a-[=xxx=]]+/
+Failed: invalid range in character class at offset 3
+
+/[a-[!xxx!]]+/
+Failed: range out of order in character class at offset 3
+
+/[A-[!xxx!]]+/
+ A]]]
+ 0: A]]]
+
+/[a-\d]+/
+Failed: invalid range in character class at offset 4
+
/-- End of testinput2 --/
diff --git a/testdata/testoutput8 b/testdata/testoutput8
index d97d4f8..bb68d3e 100644
--- a/testdata/testoutput8
+++ b/testdata/testoutput8
@@ -6000,18 +6000,6 @@ No match
0:
0:
-/^[a-\d]/
- abcde
- 0: a
- -things
- 0: -
- 0digit
- 0: 0
- *** Failers
-No match
- bcdef
-No match
-
/^[\d-a]/
abcde
0: a