summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--pod/perldiag.pod3
-rw-r--r--pod/perlre.pod14
-rw-r--r--regcomp.c52
-rw-r--r--t/op/re_tests4
4 files changed, 46 insertions, 27 deletions
diff --git a/pod/perldiag.pod b/pod/perldiag.pod
index 0ff865d06d..8e686bab29 100644
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -1595,7 +1595,8 @@ rebuild Perl.
=item invalid [] range in regexp
(F) The range specified in a character class had a minimum character
-greater than the maximum character. See L<perlre>.
+greater than the maximum character, or the range didn't start/end with
+a literal character. See L<perlre>.
=item Invalid conversion in %s: "%s"
diff --git a/pod/perlre.pod b/pod/perlre.pod
index 85b2a949c2..a1a118f371 100644
--- a/pod/perlre.pod
+++ b/pod/perlre.pod
@@ -217,7 +217,7 @@ character class. For example:
matches one, zero, any alphabetic character, and the percentage sign.
The exact meanings of the above classes depend from many things:
-if the C<utf8> pragma is used, the following equivalenced to Unicode
+if the C<utf8> pragma is used, the following equivalences to Unicode
\p{} constructs hold:
alpha IsAlpha
@@ -238,7 +238,7 @@ For example, [:lower:] and \p{IsLower} are equivalent.
If the C<utf8> pragma is not used but the C<locale> pragma is, the
classes correlate with the isalpha(3) interface (except for `word',
-which is a Perl extension).
+which is a Perl extension, mirroring \w).
The assumedly non-obviously named classes are:
@@ -249,6 +249,8 @@ The assumedly non-obviously named classes are:
Any control character. Usually characters that don't produce
output as such but instead control the terminal somehow:
for example newline and backspace are control characters.
+ All characters with ord() less than 32 are most often control
+ classified as characters.
=item graph
@@ -275,9 +277,11 @@ The assumedly non-obviously named classes are:
You can negate the [::] character classes by prefixing the class name
with a '^'. This is a Perl extension. For example:
- ^digit \D \P{IsDigit}
- ^space \S \P{IsSpace}
- ^word \W \P{IsWord}
+ POSIX trad. Perl utf8 Perl
+
+ [:^digit:] \D \P{IsDigit}
+ [:^space:] \S \P{IsSpace}
+ [:^word:] \W \P{IsWord}
The POSIX character classes [.cc.] and [=cc=] are B<not> supported
and trying to use them will cause an error.
diff --git a/regcomp.c b/regcomp.c
index df2fc0c796..9846022186 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -192,6 +192,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#define OOB_CHAR8 1234
#define OOB_UTF8 123456
+#define OOB_NAMEDCLASS -1
#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
@@ -2246,7 +2247,7 @@ S_regpposixcc(pTHX_ I32 value)
}
break;
}
- if ((namedclass == -1 ||
+ if ((namedclass == OOB_NAMEDCLASS ||
!(posixcc + skip + 2 < PL_regxend &&
(posixcc[skip] == ':' &&
posixcc[skip + 1] == ']'))))
@@ -2275,7 +2276,7 @@ S_checkposixcc(pTHX)
*PL_regcomp_parse == '=' ||
*PL_regcomp_parse == '.')) {
char *s = PL_regcomp_parse;
- char c = *s++;
+ char c = *s++;
while(*s && isALNUM(*s))
s++;
@@ -2329,7 +2330,7 @@ S_regclass(pTHX)
goto skipcond; /* allow 1st char to be ] or - */
while (PL_regcomp_parse < PL_regxend && *PL_regcomp_parse != ']') {
skipcond:
- namedclass = -1;
+ namedclass = OOB_NAMEDCLASS;
value = UCHARAT(PL_regcomp_parse++);
if (value == '[')
namedclass = regpposixcc(value);
@@ -2364,7 +2365,9 @@ S_regclass(pTHX)
break;
}
}
- if (!SIZE_ONLY && namedclass > -1) {
+ if (!SIZE_ONLY && namedclass > OOB_NAMEDCLASS) {
+ if (range)
+ FAIL("invalid [] range in regexp"); /* [a-\w], [a-[:word:]] */
switch (namedclass) {
case ANYOF_ALNUM:
if (LOC)
@@ -2605,25 +2608,27 @@ S_regclass(pTHX)
}
if (LOC)
ANYOF_FLAGS(opnd) |= ANYOF_CLASS;
- lastvalue = OOB_CHAR8;
+ continue;
}
- else
if (range) {
if (lastvalue > value)
- FAIL("invalid [] range in regexp");
+ FAIL("invalid [] range in regexp"); /* [b-a] */
range = 0;
}
else {
lastvalue = value;
if (*PL_regcomp_parse == '-' && PL_regcomp_parse+1 < PL_regxend &&
- PL_regcomp_parse[1] != ']') {
+ PL_regcomp_parse[1] != ']') {
+ if (namedclass > OOB_NAMEDCLASS)
+ FAIL("invalid [] range in regexp"); /* [\w-a] */
PL_regcomp_parse++;
range = 1;
continue; /* do it next time */
}
}
+ /* now is the next time */
if (!SIZE_ONLY) {
-#ifndef ASCIIish
+#ifndef ASCIIish /* EBCDIC, for example. */
if ((isLOWER(lastvalue) && isLOWER(value)) ||
(isUPPER(lastvalue) && isUPPER(value)))
{
@@ -2643,7 +2648,7 @@ S_regclass(pTHX)
for ( ; lastvalue <= value; lastvalue++)
ANYOF_BITMAP_SET(opnd, lastvalue);
}
- lastvalue = value;
+ range = 0;
}
/* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
if (!SIZE_ONLY &&
@@ -2701,7 +2706,7 @@ S_regclassutf8(pTHX)
while (PL_regcomp_parse < PL_regxend && *PL_regcomp_parse != ']') {
skipcond:
- namedclass = -1;
+ namedclass = OOB_NAMEDCLASS;
value = utf8_to_uv((U8*)PL_regcomp_parse, &numlen);
PL_regcomp_parse += numlen;
@@ -2773,7 +2778,9 @@ S_regclassutf8(pTHX)
break;
}
}
- if (!SIZE_ONLY && namedclass > -1) {
+ if (!SIZE_ONLY && namedclass > OOB_NAMEDCLASS) {
+ if (range)
+ FAIL("invalid [] range in regexp"); /* [a-\w], [a-[:word:]] */
switch (namedclass) {
case ANYOF_ALNUM:
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n"); break;
@@ -2828,11 +2835,11 @@ S_regclassutf8(pTHX)
case ANYOF_NXDIGIT:
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n"); break;
}
+ continue;
}
- else
if (range) {
if (lastvalue > value)
- FAIL("invalid [] range in regexp");
+ FAIL("invalid [] range in regexp"); /* [b-a] */
#ifdef UV_IS_QUAD
if (!SIZE_ONLY)
Perl_sv_catpvf(aTHX_ listsv, "%04" PERL_PRIx64 "\t%04" PERL_PRIx64 "\n", (UV)lastvalue, (UV)value);
@@ -2840,25 +2847,28 @@ S_regclassutf8(pTHX)
if (!SIZE_ONLY)
Perl_sv_catpvf(aTHX_ listsv, "%04x\t%04x\n", lastvalue, value);
#endif
- lastvalue = value;
range = 0;
}
else {
lastvalue = value;
if (*PL_regcomp_parse == '-' && PL_regcomp_parse+1 < PL_regxend &&
- PL_regcomp_parse[1] != ']') {
+ PL_regcomp_parse[1] != ']') {
+ if (namedclass > OOB_NAMEDCLASS)
+ FAIL("invalid [] range in regexp"); /* [\w-a] */
PL_regcomp_parse++;
range = 1;
continue; /* do it next time */
}
+ }
+ /* now is the next time */
#ifdef UV_IS_QUAD
- if (!SIZE_ONLY)
- Perl_sv_catpvf(aTHX_ listsv, "%04" PERL_PRIx64 "\n", (UV)value);
+ if (!SIZE_ONLY)
+ Perl_sv_catpvf(aTHX_ listsv, "%04" PERL_PRIx64 "\n", (UV)value);
#else
- if (!SIZE_ONLY)
- Perl_sv_catpvf(aTHX_ listsv, "%04x\n", value);
+ if (!SIZE_ONLY)
+ Perl_sv_catpvf(aTHX_ listsv, "%04x\n", value);
#endif
- }
+ range = 0;
}
ret = reganode(ANYOFUTF8, 0);
diff --git a/t/op/re_tests b/t/op/re_tests
index 899b35ee83..b35e964dc1 100644
--- a/t/op/re_tests
+++ b/t/op/re_tests
@@ -735,3 +735,7 @@ foo.bart foo.bart y - -
.[X](.+)+[X][X] bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa n - -
.[X][X](.+)+[X] bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa n - -
tt+$ xxxtt y - -
+[a-\w] - c - /[a-\w]/: invalid [] range in regexp
+[\w-z] - c - /[\w-z]/: invalid [] range in regexp
+[0-[:digit:]] - c - /[0-[:digit:]]/: invalid [] range in regexp
+[[:digit:]-9] - c - /[[:digit:]-9]/: invalid [] range in regexp