summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--dquote_static.c13
-rw-r--r--embed.fnc6
-rw-r--r--embed.h4
-rw-r--r--handy.h3
-rw-r--r--l1_char_class_tab.h14
-rw-r--r--pod/perldelta.pod28
-rw-r--r--pod/perldiag.pod25
-rw-r--r--pod/perlre.pod22
-rw-r--r--proto.h4
-rw-r--r--regcomp.c8
-rw-r--r--regen/mk_PL_charclass.pl8
-rw-r--r--t/lib/warnings/toke27
-rw-r--r--t/re/re_tests1
-rw-r--r--toke.c92
14 files changed, 204 insertions, 51 deletions
diff --git a/dquote_static.c b/dquote_static.c
index 5a22993ac0..da1b5b950a 100644
--- a/dquote_static.c
+++ b/dquote_static.c
@@ -15,7 +15,11 @@
Pulled from regcomp.c.
*/
PERL_STATIC_INLINE I32
-S_regcurly(pTHX_ const char *s)
+S_regcurly(pTHX_ const char *s,
+ const bool rbrace_must_be_escaped /* Should the terminating '} be
+ preceded by a backslash? This
+ is an abnormal case */
+ )
{
PERL_ARGS_ASSERT_REGCURLY;
@@ -30,9 +34,10 @@ S_regcurly(pTHX_ const char *s)
while (isDIGIT(*s))
s++;
}
- if (*s != '}')
- return FALSE;
- return TRUE;
+
+ return (rbrace_must_be_escaped)
+ ? *s == '\\' && *(s+1) == '}'
+ : *s == '}';
}
/* XXX Add documentation after final interface and behavior is decided */
diff --git a/embed.fnc b/embed.fnc
index 53c582d5be..01343575de 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1113,7 +1113,8 @@ Ap |char* |re_intuit_start|NN REGEXP * const rx|NULLOK SV* sv|NN char* strpos \
|NULLOK re_scream_pos_data *data
Ap |SV* |re_intuit_string|NN REGEXP *const r
#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C)
-EiPR |I32 |regcurly |NN const char *s
+EiPR |I32 |regcurly |NN const char *s \
+ |const bool rbrace_must_be_escaped
#endif
Ap |I32 |regexec_flags |NN REGEXP *const rx|NN char *stringarg \
|NN char *strend|NN char *strbeg|I32 minend \
@@ -2197,7 +2198,8 @@ s |char* |scan_ident |NN char *s|NN const char *send|NN char *dest \
sR |char* |scan_inputsymbol|NN char *start
sR |char* |scan_pat |NN char *start|I32 type
sR |char* |scan_str |NN char *start|int keep_quoted \
- |int keep_delims|int re_reparse
+ |int keep_delims|int re_reparse \
+ |bool deprecate_escaped_matching
sR |char* |scan_subst |NN char *start
sR |char* |scan_trans |NN char *start
s |char* |scan_word |NN char *s|NN char *dest|STRLEN destlen \
diff --git a/embed.h b/embed.h
index 1df6ab4b1f..b2da778a9d 100644
--- a/embed.h
+++ b/embed.h
@@ -965,7 +965,7 @@
#define grok_bslash_c(a,b,c) S_grok_bslash_c(aTHX_ a,b,c)
#define grok_bslash_o(a,b,c,d,e,f,g) S_grok_bslash_o(aTHX_ a,b,c,d,e,f,g)
#define grok_bslash_x(a,b,c,d,e,f,g) S_grok_bslash_x(aTHX_ a,b,c,d,e,f,g)
-#define regcurly(a) S_regcurly(aTHX_ a)
+#define regcurly(a,b) S_regcurly(aTHX_ a,b)
# endif
# if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_UTF8_C)
#define _add_range_to_invlist(a,b,c) Perl__add_range_to_invlist(aTHX_ a,b,c)
@@ -1615,7 +1615,7 @@
#define scan_ident(a,b,c,d,e) S_scan_ident(aTHX_ a,b,c,d,e)
#define scan_inputsymbol(a) S_scan_inputsymbol(aTHX_ a)
#define scan_pat(a,b) S_scan_pat(aTHX_ a,b)
-#define scan_str(a,b,c,d) S_scan_str(aTHX_ a,b,c,d)
+#define scan_str(a,b,c,d,e) S_scan_str(aTHX_ a,b,c,d,e)
#define scan_subst(a) S_scan_subst(aTHX_ a)
#define scan_trans(a) S_scan_trans(aTHX_ a)
#define scan_word(a,b,c,d,e) S_scan_word(aTHX_ a,b,c,d,e)
diff --git a/handy.h b/handy.h
index 298383e25b..509837975f 100644
--- a/handy.h
+++ b/handy.h
@@ -794,7 +794,8 @@ patched there. The file as of this writing is cpan/Devel-PPPort/parts/inc/misc
# define _CC_QUOTEMETA 20
# define _CC_NON_FINAL_FOLD 21
# define _CC_IS_IN_SOME_FOLD 22
-/* Unused: 23-31
+# define _CC_BACKSLASH_FOO_LBRACE_IS_META 31 /* temp, see mk_PL_charclass.pl */
+/* Unused: 23-30
* If more bits are needed, one could add a second word for non-64bit
* QUAD_IS_INT systems, using some #ifdefs to distinguish between having a 2nd
* word or not. The IS_IN_SOME_FOLD bit is the most easily expendable, as it
diff --git a/l1_char_class_tab.h b/l1_char_class_tab.h
index 709c97e8b4..b5bf444a6a 100644
--- a/l1_char_class_tab.h
+++ b/l1_char_class_tab.h
@@ -82,9 +82,9 @@
/* U+4B 'K' */ (1U<<_CC_NONLATIN1_FOLD)|(1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_PRINT)|(1U<<_CC_UPPER)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD),
/* U+4C 'L' */ (1U<<_CC_NONLATIN1_FOLD)|(1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_PRINT)|(1U<<_CC_UPPER)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD),
/* U+4D 'M' */ (1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_PRINT)|(1U<<_CC_UPPER)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD),
-/* U+4E 'N' */ (1U<<_CC_NONLATIN1_FOLD)|(1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_PRINT)|(1U<<_CC_UPPER)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD),
+/* U+4E 'N' */ (1U<<_CC_NONLATIN1_FOLD)|(1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_PRINT)|(1U<<_CC_UPPER)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD)|(1U<<_CC_BACKSLASH_FOO_LBRACE_IS_META),
/* U+4F 'O' */ (1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_PRINT)|(1U<<_CC_UPPER)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD),
-/* U+50 'P' */ (1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_PRINT)|(1U<<_CC_UPPER)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD),
+/* U+50 'P' */ (1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_PRINT)|(1U<<_CC_UPPER)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD)|(1U<<_CC_BACKSLASH_FOO_LBRACE_IS_META),
/* U+51 'Q' */ (1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_PRINT)|(1U<<_CC_UPPER)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD),
/* U+52 'R' */ (1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_PRINT)|(1U<<_CC_UPPER)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD),
/* U+53 'S' */ (1U<<_CC_NONLATIN1_FOLD)|(1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_NON_FINAL_FOLD)|(1U<<_CC_PRINT)|(1U<<_CC_UPPER)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD),
@@ -107,16 +107,16 @@
/* U+64 'd' */ (1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_XDIGIT)|(1U<<_CC_IS_IN_SOME_FOLD),
/* U+65 'e' */ (1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_XDIGIT)|(1U<<_CC_IS_IN_SOME_FOLD),
/* U+66 'f' */ (1U<<_CC_NONLATIN1_FOLD)|(1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_NON_FINAL_FOLD)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_XDIGIT)|(1U<<_CC_IS_IN_SOME_FOLD),
-/* U+67 'g' */ (1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD),
+/* U+67 'g' */ (1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD)|(1U<<_CC_BACKSLASH_FOO_LBRACE_IS_META),
/* U+68 'h' */ (1U<<_CC_NONLATIN1_FOLD)|(1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_NON_FINAL_FOLD)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD),
/* U+69 'i' */ (1U<<_CC_NONLATIN1_FOLD)|(1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_NON_FINAL_FOLD)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD),
/* U+6A 'j' */ (1U<<_CC_NONLATIN1_FOLD)|(1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_NON_FINAL_FOLD)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD),
-/* U+6B 'k' */ (1U<<_CC_NONLATIN1_FOLD)|(1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD),
+/* U+6B 'k' */ (1U<<_CC_NONLATIN1_FOLD)|(1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD)|(1U<<_CC_BACKSLASH_FOO_LBRACE_IS_META),
/* U+6C 'l' */ (1U<<_CC_NONLATIN1_FOLD)|(1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD),
/* U+6D 'm' */ (1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD),
/* U+6E 'n' */ (1U<<_CC_NONLATIN1_FOLD)|(1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD),
-/* U+6F 'o' */ (1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD),
-/* U+70 'p' */ (1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD),
+/* U+6F 'o' */ (1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD)|(1U<<_CC_BACKSLASH_FOO_LBRACE_IS_META),
+/* U+70 'p' */ (1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD)|(1U<<_CC_BACKSLASH_FOO_LBRACE_IS_META),
/* U+71 'q' */ (1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD),
/* U+72 'r' */ (1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD),
/* U+73 's' */ (1U<<_CC_NONLATIN1_FOLD)|(1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_NON_FINAL_FOLD)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD),
@@ -124,7 +124,7 @@
/* U+75 'u' */ (1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD),
/* U+76 'v' */ (1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD),
/* U+77 'w' */ (1U<<_CC_NONLATIN1_FOLD)|(1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_NON_FINAL_FOLD)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD),
-/* U+78 'x' */ (1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD),
+/* U+78 'x' */ (1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD)|(1U<<_CC_BACKSLASH_FOO_LBRACE_IS_META),
/* U+79 'y' */ (1U<<_CC_NONLATIN1_FOLD)|(1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_NON_FINAL_FOLD)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD),
/* U+7A 'z' */ (1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD),
/* U+7B '{' */ (1U<<_CC_ASCII)|(1U<<_CC_GRAPH)|(1U<<_CC_PRINT)|(1U<<_CC_PUNCT)|(1U<<_CC_QUOTEMETA),
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index e9f0e122a2..f608decb5d 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -94,6 +94,34 @@ In addition these three functions that have never worked properly are
deprecated:
C<to_uni_lower_lc>, C<to_uni_title_lc>, and C<to_uni_upper_lc>.
+=head2 Certain rare uses of backslashes within regexes are now deprectated
+
+There are three pairs of characters that Perl recognizes as
+metacharacters in regular expression patterns: C<{}>, C<[]>, and C<()>.
+These can be used as well to delimit patterns, as in:
+
+ m{foo}
+ s(foo)(bar)
+
+Since they are metacharacters, they have special meaning to regular
+expression patterns, and it turns out that you can't turn off that
+special meaning by the normal means of preceding them with a backslash,
+if you use them, paired, within a pattern delimitted by them. For
+example, in
+
+ m{foo\{1,3\}}
+
+the backslashes do not change the behavior, and this matches
+S<C<"f o">> followed by one to three more occurrences of C<"o">.
+
+Usages like this, where they are interpreted as metacharacters, are
+exceedingly rare; we think there are none, for example, in all of CPAN.
+Hence, this deprecation should affect very little code. It does give
+notice, however, that any such code needs to change, which will in turn
+allow us to change the behavior in future Perl versions so that the
+backslashes do have an effect, and without fear that we are silently
+breaking any existing code.
+
=head1 Performance Enhancements
XXX Changes which enhance performance without changing behaviour go here.
diff --git a/pod/perldiag.pod b/pod/perldiag.pod
index 8df73326ca..19aaa55bf7 100644
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -5456,6 +5456,31 @@ discovered. See L<perlre>.
same length as the replacelist. See L<perlop> for more information
about the /d modifier.
+=item Useless use of '\'; doesn't escape metacharacter '%c'
+
+(D deprecated) You wrote a regular expression pattern something like
+one of these:
+
+ m{ \x\{FF\} }x
+ m{foo\{1,3\}}
+ qr(foo\(bar\))
+ s[foo\[a-z\]bar][baz]
+
+The interior braces, square brackets, and parentheses are treated as
+metacharacters even though they are backslashed; instead write:
+
+ m{ \x{FF} }x
+ m{foo{1,3}}
+ qr(foo(bar))
+ s[foo[a-z]bar][baz]
+
+The backslashes have no effect when a regular expression pattern is
+delimitted by C<{}>, C<[]>, or C<()>, which ordinarily are
+metacharacters, and the delimiters are also used, paired, within the
+interior of the pattern. It is planned that a future Perl release will
+change the meaning of constructs like these so that the backslashes
+will have an effect, so remove them from your code.
+
=item Useless use of \E
(W misc) You have a \E in a double-quotish string without a C<\U>,
diff --git a/pod/perlre.pod b/pod/perlre.pod
index b4b7bf27f9..e6d579306e 100644
--- a/pod/perlre.pod
+++ b/pod/perlre.pod
@@ -505,22 +505,22 @@ X<metacharacter> X<quantifier> X<*> X<+> X<?> X<{n}> X<{n,}> X<{n,m}>
{n,m} Match at least n but not more than m times
(If a curly bracket occurs in any other context and does not form part of
-a backslashed sequence like C<\x{...}>, it is treated
-as a regular character. In particular, the lower quantifier bound
-is not optional. However, in Perl v5.18, it is planned to issue a
-deprecation warning for all such occurrences, and in Perl v5.20 to
-require literal uses of a curly bracket to be escaped, say by preceding
-them with a backslash or enclosing them within square brackets, (C<"\{">
-or C<"[{]">). This change will allow for future syntax extensions (like
-making the lower bound of a quantifier optional), and better error
-checking of quantifiers. Now, a typo in a quantifier silently causes
-it to be treated as the literal characters. For example,
+a backslashed sequence like C<\x{...}>, it is treated as a regular
+character. In particular, the lower quantifier bound is not optional,
+and a typo in a quantifier silently causes it to be treated as the
+literal characters. For example,
/o{4,3}/
looks like a quantifier that matches 0 times, since 4 is greater than 3,
but it really means to match the sequence of six characters
-S<C<"o { 4 , 3 }">>.)
+S<C<"o { 4 , 3 }">>. It is planned to eventually require literal uses
+of curly brackets to be escaped, say by preceding them with a backslash
+or enclosing them within square brackets, (C<"\{"> or C<"[{]">). This
+change will allow for future syntax extensions (like making the lower
+bound of a quantifier optional), and better error checking. In the
+meantime, you should get in the habit of escaping all instances where
+you mean a literal "{".)
The "*" quantifier is equivalent to C<{0,}>, the "+"
quantifier to C<{1,}>, and the "?" quantifier to C<{0,1}>. n and m are limited
diff --git a/proto.h b/proto.h
index 0d0078d107..feae8a2ecb 100644
--- a/proto.h
+++ b/proto.h
@@ -6816,7 +6816,7 @@ PERL_STATIC_INLINE bool S_grok_bslash_x(pTHX_ char** s, UV* uv, const char** err
#define PERL_ARGS_ASSERT_GROK_BSLASH_X \
assert(s); assert(uv); assert(error_msg)
-PERL_STATIC_INLINE I32 S_regcurly(pTHX_ const char *s)
+PERL_STATIC_INLINE I32 S_regcurly(pTHX_ const char *s, const bool rbrace_must_be_escaped)
__attribute__warn_unused_result__
__attribute__pure__
__attribute__nonnull__(pTHX_1);
@@ -7295,7 +7295,7 @@ STATIC char* S_scan_pat(pTHX_ char *start, I32 type)
#define PERL_ARGS_ASSERT_SCAN_PAT \
assert(start)
-STATIC char* S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims, int re_reparse)
+STATIC char* S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims, int re_reparse, bool deprecate_escaped_matching)
__attribute__warn_unused_result__
__attribute__nonnull__(pTHX_1);
#define PERL_ARGS_ASSERT_SCAN_STR \
diff --git a/regcomp.c b/regcomp.c
index 1cf4a84510..2084f5343e 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -214,7 +214,7 @@ typedef struct RExC_state_t {
#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
- ((*s) == '{' && regcurly(s)))
+ ((*s) == '{' && regcurly(s, FALSE)))
#ifdef SPSTART
#undef SPSTART /* dratted cpp namespace... */
@@ -9441,7 +9441,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
op = *RExC_parse;
- if (op == '{' && regcurly(RExC_parse)) {
+ if (op == '{' && regcurly(RExC_parse, FALSE)) {
maxpos = NULL;
#ifdef RE_TRACK_PATTERN_OFFSETS
parse_start = RExC_parse; /* MJD */
@@ -9705,7 +9705,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I
/* Disambiguate between \N meaning a named character versus \N meaning
* [^\n]. The former is assumed when it can't be the latter. */
- if (*p != '{' || regcurly(p)) {
+ if (*p != '{' || regcurly(p, FALSE)) {
RExC_parse = p;
if (! node_p) {
/* no bare \N in a charclass */
@@ -10166,7 +10166,7 @@ tryagain:
/* Supposed to be caught earlier. */
break;
case '{':
- if (!regcurly(RExC_parse)) {
+ if (!regcurly(RExC_parse, FALSE)) {
RExC_parse++;
goto defchar;
}
diff --git a/regen/mk_PL_charclass.pl b/regen/mk_PL_charclass.pl
index 5d328c8341..63c06bc9f6 100644
--- a/regen/mk_PL_charclass.pl
+++ b/regen/mk_PL_charclass.pl
@@ -45,6 +45,7 @@ my @properties = qw(
XDIGIT
VERTSPACE
IS_IN_SOME_FOLD
+ BACKSLASH_FOO_LBRACE_IS_META
);
# Read in the case fold mappings.
@@ -204,6 +205,13 @@ for my $ord (0..255) {
$re = qr/\p{Is_Non_Final_Fold}/;
} elsif ($name eq 'IS_IN_SOME_FOLD') {
$re = qr/\p{_Perl_Any_Folds}/;
+ } elsif ($name eq 'BACKSLASH_FOO_LBRACE_IS_META') {
+
+ # This is true for FOO where FOO is the varying character in:
+ # \a{, \b{, \c{, ...
+ # and the sequence has non-literal meaning to Perl; so it is true
+ # for 'x' because \x{ is special, but not 'a' because \a{ isn't.
+ $re = qr/[gkNopPx]/;
} else { # The remainder have the same name and values as Unicode
$re = eval "qr/\\p{$name}/";
use Carp;
diff --git a/t/lib/warnings/toke b/t/lib/warnings/toke
index 7d66ab6d25..1817d86e18 100644
--- a/t/lib/warnings/toke
+++ b/t/lib/warnings/toke
@@ -1306,3 +1306,30 @@ sub { # do not actually call require
require a::b + 1; # ambiguity warnings.
}
EXPECT
+########
+# toke.c
+# [perl #XXX] Erroneous ambiguity warnings
+print "aa" =~ m{^a\{1,2\}$}, "\n";
+print "aa" =~ m{^a\x\{61\}$}, "\n";
+print "aa" =~ m{^a{1,2}$}, "\n";
+print "aq" =~ m[^a\[a-z\]$], "\n";
+print "aq" =~ m(^a\(q\)$), "\n";
+no warnings 'deprecated';
+print "aa" =~ m{^a\{1,2\}$}, "\n";
+print "aa" =~ m{^a\x\{61\}$}, "\n";
+print "aq" =~ m[^a\[a-z\]$], "\n";
+print "aq" =~ m(^a\(q\)$), "\n";
+EXPECT
+Useless use of '\'; doesn't escape metacharacter '{' at - line 3.
+Useless use of '\'; doesn't escape metacharacter '{' at - line 4.
+Useless use of '\'; doesn't escape metacharacter '[' at - line 6.
+Useless use of '\'; doesn't escape metacharacter '(' at - line 7.
+1
+1
+1
+1
+q
+1
+1
+1
+q
diff --git a/t/re/re_tests b/t/re/re_tests
index e2a7e890c4..c41d529f8f 100644
--- a/t/re/re_tests
+++ b/t/re/re_tests
@@ -1730,5 +1730,6 @@ ab[c\\\](??{"x"})]{3}d ab\\](d y - -
\Vn \xFFn/ y $& \xFFn
/(?l:a?\w)/ b y $& b
+m?^xy\?$? xy? y $& xy?
# vim: softtabstop=0 noexpandtab
diff --git a/toke.c b/toke.c
index 24e794d2d7..efcdb25aef 100644
--- a/toke.c
+++ b/toke.c
@@ -3248,7 +3248,7 @@ S_scan_const(pTHX_ char *start)
else if (PL_lex_inpat
&& (*s != 'N'
|| s[1] != '{'
- || regcurly(s + 1)))
+ || regcurly(s + 1, FALSE)))
{
*d++ = NATIVE_TO_NEED(has_utf8,'\\');
goto default_action;
@@ -3818,7 +3818,7 @@ S_intuit_more(pTHX_ char *s)
/* In a pattern, so maybe we have {n,m}. */
if (*s == '{') {
- if (regcurly(s)) {
+ if (regcurly(s, FALSE)) {
return FALSE;
}
return TRUE;
@@ -5772,7 +5772,7 @@ Perl_yylex(pTHX)
}
sv = newSVpvn_flags(s, len, UTF ? SVf_UTF8 : 0);
if (*d == '(') {
- d = scan_str(d,TRUE,TRUE,FALSE);
+ d = scan_str(d,TRUE,TRUE,FALSE, FALSE);
if (!d) {
/* MUST advance bufptr here to avoid bogus
"at end of line" context messages from yyerror().
@@ -6677,7 +6677,7 @@ Perl_yylex(pTHX)
TERM(THING);
case '\'':
- s = scan_str(s,!!PL_madskills,FALSE,FALSE);
+ s = scan_str(s,!!PL_madskills,FALSE,FALSE, FALSE);
DEBUG_T( { printbuf("### Saw string before %s\n", s); } );
if (PL_expect == XOPERATOR) {
if (PL_lex_formbrack && PL_lex_brackets == PL_lex_formbrack) {
@@ -6692,7 +6692,7 @@ Perl_yylex(pTHX)
TERM(sublex_start());
case '"':
- s = scan_str(s,!!PL_madskills,FALSE,FALSE);
+ s = scan_str(s,!!PL_madskills,FALSE,FALSE, FALSE);
DEBUG_T( { printbuf("### Saw string before %s\n", s); } );
if (PL_expect == XOPERATOR) {
if (PL_lex_formbrack && PL_lex_brackets == PL_lex_formbrack) {
@@ -6715,7 +6715,7 @@ Perl_yylex(pTHX)
TERM(sublex_start());
case '`':
- s = scan_str(s,!!PL_madskills,FALSE,FALSE);
+ s = scan_str(s,!!PL_madskills,FALSE,FALSE, FALSE);
DEBUG_T( { printbuf("### Saw backtick string before %s\n", s); } );
if (PL_expect == XOPERATOR)
no_op("Backticks",s);
@@ -8174,7 +8174,7 @@ Perl_yylex(pTHX)
LOP(OP_PIPE_OP,XTERM);
case KEY_q:
- s = scan_str(s,!!PL_madskills,FALSE,FALSE);
+ s = scan_str(s,!!PL_madskills,FALSE,FALSE, FALSE);
if (!s)
missingterm(NULL);
pl_yylval.ival = OP_CONST;
@@ -8185,7 +8185,7 @@ Perl_yylex(pTHX)
case KEY_qw: {
OP *words = NULL;
- s = scan_str(s,!!PL_madskills,FALSE,FALSE);
+ s = scan_str(s,!!PL_madskills,FALSE,FALSE, FALSE);
if (!s)
missingterm(NULL);
PL_expect = XOPERATOR;
@@ -8235,7 +8235,7 @@ Perl_yylex(pTHX)
}
case KEY_qq:
- s = scan_str(s,!!PL_madskills,FALSE,FALSE);
+ s = scan_str(s,!!PL_madskills,FALSE,FALSE, FALSE);
if (!s)
missingterm(NULL);
pl_yylval.ival = OP_STRINGIFY;
@@ -8248,7 +8248,7 @@ Perl_yylex(pTHX)
TERM(sublex_start());
case KEY_qx:
- s = scan_str(s,!!PL_madskills,FALSE,FALSE);
+ s = scan_str(s,!!PL_madskills,FALSE,FALSE, FALSE);
if (!s)
missingterm(NULL);
readpipe_override();
@@ -8569,7 +8569,7 @@ Perl_yylex(pTHX)
const bool warnillegalproto = ckWARN(WARN_ILLEGALPROTO);
STRLEN tmplen;
- s = scan_str(s,!!PL_madskills,FALSE,FALSE);
+ s = scan_str(s,!!PL_madskills,FALSE,FALSE, FALSE);
if (!s)
Perl_croak(aTHX_ "Prototype not terminated");
/* strip spaces and check for bad characters */
@@ -9508,7 +9508,8 @@ S_scan_pat(pTHX_ char *start, I32 type)
{
dVAR;
PMOP *pm;
- char *s = scan_str(start,!!PL_madskills,FALSE, PL_reg_state.re_reparsing);
+ char *s = scan_str(start,!!PL_madskills,FALSE, PL_reg_state.re_reparsing,
+ TRUE /* look for escaped bracketed metas */ );
const char * const valid_flags =
(const char *)((type == OP_QR) ? QR_PAT_MODS : M_PAT_MODS);
char charset = '\0'; /* character set modifier */
@@ -9611,7 +9612,8 @@ S_scan_subst(pTHX_ char *start)
pl_yylval.ival = OP_NULL;
- s = scan_str(start,!!PL_madskills,FALSE,FALSE);
+ s = scan_str(start,!!PL_madskills,FALSE,FALSE,
+ TRUE /* look for escaped bracketed metas */ );
if (!s)
Perl_croak(aTHX_ "Substitution pattern not terminated");
@@ -9629,7 +9631,7 @@ S_scan_subst(pTHX_ char *start)
#endif
first_start = PL_multi_start;
- s = scan_str(s,!!PL_madskills,FALSE,FALSE);
+ s = scan_str(s,!!PL_madskills,FALSE,FALSE, FALSE);
if (!s) {
if (PL_lex_stuff) {
SvREFCNT_dec(PL_lex_stuff);
@@ -9715,7 +9717,7 @@ S_scan_trans(pTHX_ char *start)
pl_yylval.ival = OP_NULL;
- s = scan_str(start,!!PL_madskills,FALSE,FALSE);
+ s = scan_str(start,!!PL_madskills,FALSE,FALSE, FALSE);
if (!s)
Perl_croak(aTHX_ "Transliteration pattern not terminated");
@@ -9731,7 +9733,7 @@ S_scan_trans(pTHX_ char *start)
}
#endif
- s = scan_str(s,!!PL_madskills,FALSE,FALSE);
+ s = scan_str(s,!!PL_madskills,FALSE,FALSE, FALSE);
if (!s) {
if (PL_lex_stuff) {
SvREFCNT_dec(PL_lex_stuff);
@@ -10180,7 +10182,7 @@ S_scan_inputsymbol(pTHX_ char *start)
if (d - PL_tokenbuf != len) {
pl_yylval.ival = OP_GLOB;
- s = scan_str(start,!!PL_madskills,FALSE,FALSE);
+ s = scan_str(start,!!PL_madskills,FALSE,FALSE, FALSE);
if (!s)
Perl_croak(aTHX_ "Glob not terminated");
return s;
@@ -10322,7 +10324,11 @@ intro_sym:
*/
STATIC char *
-S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims, int re_reparse)
+S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims, int re_reparse,
+ bool deprecate_escaped_meta /* Should we issue a deprecation warning
+ for certain paired metacharacters that
+ appear escaped within it */
+ )
{
dVAR;
SV *sv; /* scalar value: string */
@@ -10336,6 +10342,7 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims, int re_reparse)
U8 termstr[UTF8_MAXBYTES]; /* terminating string */
STRLEN termlen; /* length of terminating string */
int last_off = 0; /* last position for nesting bracket */
+ char *escaped_open = NULL;
#ifdef PERL_MAD
int stuffstart;
char *tstart;
@@ -10382,6 +10389,18 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims, int re_reparse)
PL_multi_close = term;
+ /* A warning is raised if the input parameter requires it for escaped (by a
+ * backslash) paired metacharacters {} [] and () when the delimiters are
+ * those same characters, and the backslash is ineffective. This doesn't
+ * happen for <>, as they aren't metas. */
+ if (deprecate_escaped_meta
+ && (PL_multi_open == PL_multi_close
+ || ! ckWARN_d(WARN_DEPRECATED)
+ || PL_multi_open == '<'))
+ {
+ deprecate_escaped_meta = FALSE;
+ }
+
/* create a new SV to hold the contents. 79 is the SV's initial length.
What a random number. */
sv = newSV_type(SVt_PVIV);
@@ -10520,7 +10539,44 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims, int re_reparse)
if (*s == '\\' && s+1 < PL_bufend) {
if (!keep_quoted &&
((s[1] == PL_multi_open) || (s[1] == PL_multi_close)))
+ {
s++;
+
+ /* Here, 'deprecate_escaped_meta' is true iff the
+ * delimiters are paired metacharacters, and 's' points
+ * to an occurrence of one of them within the string,
+ * which was preceded by a backslash. If this is a
+ * context where the delimiter is also a metacharacter,
+ * the backslash is useless, and deprecated. () and []
+ * are meta in any context. {} are meta only when
+ * appearing in a quantifier or in things like '\p{'.
+ * They also aren't meta unless there is a matching
+ * closed, escaped char later on within the string.
+ * If 's' points to an open, set a flag; if to a close,
+ * test that flag, and raise a warning if it was set */
+
+ if (deprecate_escaped_meta) {
+ if (*s == PL_multi_open) {
+ if (*s != '{') {
+ escaped_open = s;
+ }
+ else if (regcurly(s,
+ TRUE /* Look for a closing
+ '\}' */)
+ || (s - start > 2 /* Look for e.g.
+ '\x{' */
+ && _generic_isCC(*(s-2), _CC_BACKSLASH_FOO_LBRACE_IS_META)))
+ {
+ escaped_open = s;
+ }
+ }
+ else if (escaped_open) {
+ Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
+ "Useless use of '\\'; doesn't escape metacharacter '%c'", PL_multi_open);
+ escaped_open = NULL;
+ }
+ }
+ }
else
*to++ = *s++;
}