diff options
-rw-r--r-- | dquote_static.c | 13 | ||||
-rw-r--r-- | embed.fnc | 6 | ||||
-rw-r--r-- | embed.h | 4 | ||||
-rw-r--r-- | handy.h | 3 | ||||
-rw-r--r-- | l1_char_class_tab.h | 14 | ||||
-rw-r--r-- | pod/perldelta.pod | 28 | ||||
-rw-r--r-- | pod/perldiag.pod | 25 | ||||
-rw-r--r-- | pod/perlre.pod | 22 | ||||
-rw-r--r-- | proto.h | 4 | ||||
-rw-r--r-- | regcomp.c | 8 | ||||
-rw-r--r-- | regen/mk_PL_charclass.pl | 8 | ||||
-rw-r--r-- | t/lib/warnings/toke | 27 | ||||
-rw-r--r-- | t/re/re_tests | 1 | ||||
-rw-r--r-- | toke.c | 92 |
14 files changed, 204 insertions, 51 deletions
diff --git a/dquote_static.c b/dquote_static.c index 5a22993ac0..da1b5b950a 100644 --- a/dquote_static.c +++ b/dquote_static.c @@ -15,7 +15,11 @@ Pulled from regcomp.c. */ PERL_STATIC_INLINE I32 -S_regcurly(pTHX_ const char *s) +S_regcurly(pTHX_ const char *s, + const bool rbrace_must_be_escaped /* Should the terminating '} be + preceded by a backslash? This + is an abnormal case */ + ) { PERL_ARGS_ASSERT_REGCURLY; @@ -30,9 +34,10 @@ S_regcurly(pTHX_ const char *s) while (isDIGIT(*s)) s++; } - if (*s != '}') - return FALSE; - return TRUE; + + return (rbrace_must_be_escaped) + ? *s == '\\' && *(s+1) == '}' + : *s == '}'; } /* XXX Add documentation after final interface and behavior is decided */ @@ -1113,7 +1113,8 @@ Ap |char* |re_intuit_start|NN REGEXP * const rx|NULLOK SV* sv|NN char* strpos \ |NULLOK re_scream_pos_data *data Ap |SV* |re_intuit_string|NN REGEXP *const r #if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C) -EiPR |I32 |regcurly |NN const char *s +EiPR |I32 |regcurly |NN const char *s \ + |const bool rbrace_must_be_escaped #endif Ap |I32 |regexec_flags |NN REGEXP *const rx|NN char *stringarg \ |NN char *strend|NN char *strbeg|I32 minend \ @@ -2197,7 +2198,8 @@ s |char* |scan_ident |NN char *s|NN const char *send|NN char *dest \ sR |char* |scan_inputsymbol|NN char *start sR |char* |scan_pat |NN char *start|I32 type sR |char* |scan_str |NN char *start|int keep_quoted \ - |int keep_delims|int re_reparse + |int keep_delims|int re_reparse \ + |bool deprecate_escaped_matching sR |char* |scan_subst |NN char *start sR |char* |scan_trans |NN char *start s |char* |scan_word |NN char *s|NN char *dest|STRLEN destlen \ @@ -965,7 +965,7 @@ #define grok_bslash_c(a,b,c) S_grok_bslash_c(aTHX_ a,b,c) #define grok_bslash_o(a,b,c,d,e,f,g) S_grok_bslash_o(aTHX_ a,b,c,d,e,f,g) #define grok_bslash_x(a,b,c,d,e,f,g) S_grok_bslash_x(aTHX_ a,b,c,d,e,f,g) -#define regcurly(a) S_regcurly(aTHX_ a) +#define regcurly(a,b) S_regcurly(aTHX_ a,b) # endif # if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_UTF8_C) #define _add_range_to_invlist(a,b,c) Perl__add_range_to_invlist(aTHX_ a,b,c) @@ -1615,7 +1615,7 @@ #define scan_ident(a,b,c,d,e) S_scan_ident(aTHX_ a,b,c,d,e) #define scan_inputsymbol(a) S_scan_inputsymbol(aTHX_ a) #define scan_pat(a,b) S_scan_pat(aTHX_ a,b) -#define scan_str(a,b,c,d) S_scan_str(aTHX_ a,b,c,d) +#define scan_str(a,b,c,d,e) S_scan_str(aTHX_ a,b,c,d,e) #define scan_subst(a) S_scan_subst(aTHX_ a) #define scan_trans(a) S_scan_trans(aTHX_ a) #define scan_word(a,b,c,d,e) S_scan_word(aTHX_ a,b,c,d,e) @@ -794,7 +794,8 @@ patched there. The file as of this writing is cpan/Devel-PPPort/parts/inc/misc # define _CC_QUOTEMETA 20 # define _CC_NON_FINAL_FOLD 21 # define _CC_IS_IN_SOME_FOLD 22 -/* Unused: 23-31 +# define _CC_BACKSLASH_FOO_LBRACE_IS_META 31 /* temp, see mk_PL_charclass.pl */ +/* Unused: 23-30 * If more bits are needed, one could add a second word for non-64bit * QUAD_IS_INT systems, using some #ifdefs to distinguish between having a 2nd * word or not. The IS_IN_SOME_FOLD bit is the most easily expendable, as it diff --git a/l1_char_class_tab.h b/l1_char_class_tab.h index 709c97e8b4..b5bf444a6a 100644 --- a/l1_char_class_tab.h +++ b/l1_char_class_tab.h @@ -82,9 +82,9 @@ /* U+4B 'K' */ (1U<<_CC_NONLATIN1_FOLD)|(1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_PRINT)|(1U<<_CC_UPPER)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD), /* U+4C 'L' */ (1U<<_CC_NONLATIN1_FOLD)|(1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_PRINT)|(1U<<_CC_UPPER)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD), /* U+4D 'M' */ (1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_PRINT)|(1U<<_CC_UPPER)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD), -/* U+4E 'N' */ (1U<<_CC_NONLATIN1_FOLD)|(1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_PRINT)|(1U<<_CC_UPPER)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD), +/* U+4E 'N' */ (1U<<_CC_NONLATIN1_FOLD)|(1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_PRINT)|(1U<<_CC_UPPER)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD)|(1U<<_CC_BACKSLASH_FOO_LBRACE_IS_META), /* U+4F 'O' */ (1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_PRINT)|(1U<<_CC_UPPER)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD), -/* U+50 'P' */ (1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_PRINT)|(1U<<_CC_UPPER)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD), +/* U+50 'P' */ (1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_PRINT)|(1U<<_CC_UPPER)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD)|(1U<<_CC_BACKSLASH_FOO_LBRACE_IS_META), /* U+51 'Q' */ (1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_PRINT)|(1U<<_CC_UPPER)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD), /* U+52 'R' */ (1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_PRINT)|(1U<<_CC_UPPER)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD), /* U+53 'S' */ (1U<<_CC_NONLATIN1_FOLD)|(1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_NON_FINAL_FOLD)|(1U<<_CC_PRINT)|(1U<<_CC_UPPER)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD), @@ -107,16 +107,16 @@ /* U+64 'd' */ (1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_XDIGIT)|(1U<<_CC_IS_IN_SOME_FOLD), /* U+65 'e' */ (1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_XDIGIT)|(1U<<_CC_IS_IN_SOME_FOLD), /* U+66 'f' */ (1U<<_CC_NONLATIN1_FOLD)|(1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_NON_FINAL_FOLD)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_XDIGIT)|(1U<<_CC_IS_IN_SOME_FOLD), -/* U+67 'g' */ (1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD), +/* U+67 'g' */ (1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD)|(1U<<_CC_BACKSLASH_FOO_LBRACE_IS_META), /* U+68 'h' */ (1U<<_CC_NONLATIN1_FOLD)|(1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_NON_FINAL_FOLD)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD), /* U+69 'i' */ (1U<<_CC_NONLATIN1_FOLD)|(1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_NON_FINAL_FOLD)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD), /* U+6A 'j' */ (1U<<_CC_NONLATIN1_FOLD)|(1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_NON_FINAL_FOLD)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD), -/* U+6B 'k' */ (1U<<_CC_NONLATIN1_FOLD)|(1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD), +/* U+6B 'k' */ (1U<<_CC_NONLATIN1_FOLD)|(1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD)|(1U<<_CC_BACKSLASH_FOO_LBRACE_IS_META), /* U+6C 'l' */ (1U<<_CC_NONLATIN1_FOLD)|(1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD), /* U+6D 'm' */ (1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD), /* U+6E 'n' */ (1U<<_CC_NONLATIN1_FOLD)|(1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD), -/* U+6F 'o' */ (1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD), -/* U+70 'p' */ (1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD), +/* U+6F 'o' */ (1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD)|(1U<<_CC_BACKSLASH_FOO_LBRACE_IS_META), +/* U+70 'p' */ (1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD)|(1U<<_CC_BACKSLASH_FOO_LBRACE_IS_META), /* U+71 'q' */ (1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD), /* U+72 'r' */ (1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD), /* U+73 's' */ (1U<<_CC_NONLATIN1_FOLD)|(1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_NON_FINAL_FOLD)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD), @@ -124,7 +124,7 @@ /* U+75 'u' */ (1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD), /* U+76 'v' */ (1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD), /* U+77 'w' */ (1U<<_CC_NONLATIN1_FOLD)|(1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_NON_FINAL_FOLD)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD), -/* U+78 'x' */ (1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD), +/* U+78 'x' */ (1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD)|(1U<<_CC_BACKSLASH_FOO_LBRACE_IS_META), /* U+79 'y' */ (1U<<_CC_NONLATIN1_FOLD)|(1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_NON_FINAL_FOLD)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD), /* U+7A 'z' */ (1U<<_CC_ALPHANUMERIC)|(1U<<_CC_ALPHA)|(1U<<_CC_ASCII)|(1U<<_CC_CASED)|(1U<<_CC_CHARNAME_CONT)|(1U<<_CC_GRAPH)|(1U<<_CC_IDFIRST)|(1U<<_CC_LOWER)|(1U<<_CC_PRINT)|(1U<<_CC_WORDCHAR)|(1U<<_CC_IS_IN_SOME_FOLD), /* U+7B '{' */ (1U<<_CC_ASCII)|(1U<<_CC_GRAPH)|(1U<<_CC_PRINT)|(1U<<_CC_PUNCT)|(1U<<_CC_QUOTEMETA), diff --git a/pod/perldelta.pod b/pod/perldelta.pod index e9f0e122a2..f608decb5d 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -94,6 +94,34 @@ In addition these three functions that have never worked properly are deprecated: C<to_uni_lower_lc>, C<to_uni_title_lc>, and C<to_uni_upper_lc>. +=head2 Certain rare uses of backslashes within regexes are now deprectated + +There are three pairs of characters that Perl recognizes as +metacharacters in regular expression patterns: C<{}>, C<[]>, and C<()>. +These can be used as well to delimit patterns, as in: + + m{foo} + s(foo)(bar) + +Since they are metacharacters, they have special meaning to regular +expression patterns, and it turns out that you can't turn off that +special meaning by the normal means of preceding them with a backslash, +if you use them, paired, within a pattern delimitted by them. For +example, in + + m{foo\{1,3\}} + +the backslashes do not change the behavior, and this matches +S<C<"f o">> followed by one to three more occurrences of C<"o">. + +Usages like this, where they are interpreted as metacharacters, are +exceedingly rare; we think there are none, for example, in all of CPAN. +Hence, this deprecation should affect very little code. It does give +notice, however, that any such code needs to change, which will in turn +allow us to change the behavior in future Perl versions so that the +backslashes do have an effect, and without fear that we are silently +breaking any existing code. + =head1 Performance Enhancements XXX Changes which enhance performance without changing behaviour go here. diff --git a/pod/perldiag.pod b/pod/perldiag.pod index 8df73326ca..19aaa55bf7 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -5456,6 +5456,31 @@ discovered. See L<perlre>. same length as the replacelist. See L<perlop> for more information about the /d modifier. +=item Useless use of '\'; doesn't escape metacharacter '%c' + +(D deprecated) You wrote a regular expression pattern something like +one of these: + + m{ \x\{FF\} }x + m{foo\{1,3\}} + qr(foo\(bar\)) + s[foo\[a-z\]bar][baz] + +The interior braces, square brackets, and parentheses are treated as +metacharacters even though they are backslashed; instead write: + + m{ \x{FF} }x + m{foo{1,3}} + qr(foo(bar)) + s[foo[a-z]bar][baz] + +The backslashes have no effect when a regular expression pattern is +delimitted by C<{}>, C<[]>, or C<()>, which ordinarily are +metacharacters, and the delimiters are also used, paired, within the +interior of the pattern. It is planned that a future Perl release will +change the meaning of constructs like these so that the backslashes +will have an effect, so remove them from your code. + =item Useless use of \E (W misc) You have a \E in a double-quotish string without a C<\U>, diff --git a/pod/perlre.pod b/pod/perlre.pod index b4b7bf27f9..e6d579306e 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -505,22 +505,22 @@ X<metacharacter> X<quantifier> X<*> X<+> X<?> X<{n}> X<{n,}> X<{n,m}> {n,m} Match at least n but not more than m times (If a curly bracket occurs in any other context and does not form part of -a backslashed sequence like C<\x{...}>, it is treated -as a regular character. In particular, the lower quantifier bound -is not optional. However, in Perl v5.18, it is planned to issue a -deprecation warning for all such occurrences, and in Perl v5.20 to -require literal uses of a curly bracket to be escaped, say by preceding -them with a backslash or enclosing them within square brackets, (C<"\{"> -or C<"[{]">). This change will allow for future syntax extensions (like -making the lower bound of a quantifier optional), and better error -checking of quantifiers. Now, a typo in a quantifier silently causes -it to be treated as the literal characters. For example, +a backslashed sequence like C<\x{...}>, it is treated as a regular +character. In particular, the lower quantifier bound is not optional, +and a typo in a quantifier silently causes it to be treated as the +literal characters. For example, /o{4,3}/ looks like a quantifier that matches 0 times, since 4 is greater than 3, but it really means to match the sequence of six characters -S<C<"o { 4 , 3 }">>.) +S<C<"o { 4 , 3 }">>. It is planned to eventually require literal uses +of curly brackets to be escaped, say by preceding them with a backslash +or enclosing them within square brackets, (C<"\{"> or C<"[{]">). This +change will allow for future syntax extensions (like making the lower +bound of a quantifier optional), and better error checking. In the +meantime, you should get in the habit of escaping all instances where +you mean a literal "{".) The "*" quantifier is equivalent to C<{0,}>, the "+" quantifier to C<{1,}>, and the "?" quantifier to C<{0,1}>. n and m are limited @@ -6816,7 +6816,7 @@ PERL_STATIC_INLINE bool S_grok_bslash_x(pTHX_ char** s, UV* uv, const char** err #define PERL_ARGS_ASSERT_GROK_BSLASH_X \ assert(s); assert(uv); assert(error_msg) -PERL_STATIC_INLINE I32 S_regcurly(pTHX_ const char *s) +PERL_STATIC_INLINE I32 S_regcurly(pTHX_ const char *s, const bool rbrace_must_be_escaped) __attribute__warn_unused_result__ __attribute__pure__ __attribute__nonnull__(pTHX_1); @@ -7295,7 +7295,7 @@ STATIC char* S_scan_pat(pTHX_ char *start, I32 type) #define PERL_ARGS_ASSERT_SCAN_PAT \ assert(start) -STATIC char* S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims, int re_reparse) +STATIC char* S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims, int re_reparse, bool deprecate_escaped_matching) __attribute__warn_unused_result__ __attribute__nonnull__(pTHX_1); #define PERL_ARGS_ASSERT_SCAN_STR \ @@ -214,7 +214,7 @@ typedef struct RExC_state_t { #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?') #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \ - ((*s) == '{' && regcurly(s))) + ((*s) == '{' && regcurly(s, FALSE))) #ifdef SPSTART #undef SPSTART /* dratted cpp namespace... */ @@ -9441,7 +9441,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) op = *RExC_parse; - if (op == '{' && regcurly(RExC_parse)) { + if (op == '{' && regcurly(RExC_parse, FALSE)) { maxpos = NULL; #ifdef RE_TRACK_PATTERN_OFFSETS parse_start = RExC_parse; /* MJD */ @@ -9705,7 +9705,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I /* Disambiguate between \N meaning a named character versus \N meaning * [^\n]. The former is assumed when it can't be the latter. */ - if (*p != '{' || regcurly(p)) { + if (*p != '{' || regcurly(p, FALSE)) { RExC_parse = p; if (! node_p) { /* no bare \N in a charclass */ @@ -10166,7 +10166,7 @@ tryagain: /* Supposed to be caught earlier. */ break; case '{': - if (!regcurly(RExC_parse)) { + if (!regcurly(RExC_parse, FALSE)) { RExC_parse++; goto defchar; } diff --git a/regen/mk_PL_charclass.pl b/regen/mk_PL_charclass.pl index 5d328c8341..63c06bc9f6 100644 --- a/regen/mk_PL_charclass.pl +++ b/regen/mk_PL_charclass.pl @@ -45,6 +45,7 @@ my @properties = qw( XDIGIT VERTSPACE IS_IN_SOME_FOLD + BACKSLASH_FOO_LBRACE_IS_META ); # Read in the case fold mappings. @@ -204,6 +205,13 @@ for my $ord (0..255) { $re = qr/\p{Is_Non_Final_Fold}/; } elsif ($name eq 'IS_IN_SOME_FOLD') { $re = qr/\p{_Perl_Any_Folds}/; + } elsif ($name eq 'BACKSLASH_FOO_LBRACE_IS_META') { + + # This is true for FOO where FOO is the varying character in: + # \a{, \b{, \c{, ... + # and the sequence has non-literal meaning to Perl; so it is true + # for 'x' because \x{ is special, but not 'a' because \a{ isn't. + $re = qr/[gkNopPx]/; } else { # The remainder have the same name and values as Unicode $re = eval "qr/\\p{$name}/"; use Carp; diff --git a/t/lib/warnings/toke b/t/lib/warnings/toke index 7d66ab6d25..1817d86e18 100644 --- a/t/lib/warnings/toke +++ b/t/lib/warnings/toke @@ -1306,3 +1306,30 @@ sub { # do not actually call require require a::b + 1; # ambiguity warnings. } EXPECT +######## +# toke.c +# [perl #XXX] Erroneous ambiguity warnings +print "aa" =~ m{^a\{1,2\}$}, "\n"; +print "aa" =~ m{^a\x\{61\}$}, "\n"; +print "aa" =~ m{^a{1,2}$}, "\n"; +print "aq" =~ m[^a\[a-z\]$], "\n"; +print "aq" =~ m(^a\(q\)$), "\n"; +no warnings 'deprecated'; +print "aa" =~ m{^a\{1,2\}$}, "\n"; +print "aa" =~ m{^a\x\{61\}$}, "\n"; +print "aq" =~ m[^a\[a-z\]$], "\n"; +print "aq" =~ m(^a\(q\)$), "\n"; +EXPECT +Useless use of '\'; doesn't escape metacharacter '{' at - line 3. +Useless use of '\'; doesn't escape metacharacter '{' at - line 4. +Useless use of '\'; doesn't escape metacharacter '[' at - line 6. +Useless use of '\'; doesn't escape metacharacter '(' at - line 7. +1 +1 +1 +1 +q +1 +1 +1 +q diff --git a/t/re/re_tests b/t/re/re_tests index e2a7e890c4..c41d529f8f 100644 --- a/t/re/re_tests +++ b/t/re/re_tests @@ -1730,5 +1730,6 @@ ab[c\\\](??{"x"})]{3}d ab\\](d y - - \Vn \xFFn/ y $& \xFFn /(?l:a?\w)/ b y $& b +m?^xy\?$? xy? y $& xy? # vim: softtabstop=0 noexpandtab @@ -3248,7 +3248,7 @@ S_scan_const(pTHX_ char *start) else if (PL_lex_inpat && (*s != 'N' || s[1] != '{' - || regcurly(s + 1))) + || regcurly(s + 1, FALSE))) { *d++ = NATIVE_TO_NEED(has_utf8,'\\'); goto default_action; @@ -3818,7 +3818,7 @@ S_intuit_more(pTHX_ char *s) /* In a pattern, so maybe we have {n,m}. */ if (*s == '{') { - if (regcurly(s)) { + if (regcurly(s, FALSE)) { return FALSE; } return TRUE; @@ -5772,7 +5772,7 @@ Perl_yylex(pTHX) } sv = newSVpvn_flags(s, len, UTF ? SVf_UTF8 : 0); if (*d == '(') { - d = scan_str(d,TRUE,TRUE,FALSE); + d = scan_str(d,TRUE,TRUE,FALSE, FALSE); if (!d) { /* MUST advance bufptr here to avoid bogus "at end of line" context messages from yyerror(). @@ -6677,7 +6677,7 @@ Perl_yylex(pTHX) TERM(THING); case '\'': - s = scan_str(s,!!PL_madskills,FALSE,FALSE); + s = scan_str(s,!!PL_madskills,FALSE,FALSE, FALSE); DEBUG_T( { printbuf("### Saw string before %s\n", s); } ); if (PL_expect == XOPERATOR) { if (PL_lex_formbrack && PL_lex_brackets == PL_lex_formbrack) { @@ -6692,7 +6692,7 @@ Perl_yylex(pTHX) TERM(sublex_start()); case '"': - s = scan_str(s,!!PL_madskills,FALSE,FALSE); + s = scan_str(s,!!PL_madskills,FALSE,FALSE, FALSE); DEBUG_T( { printbuf("### Saw string before %s\n", s); } ); if (PL_expect == XOPERATOR) { if (PL_lex_formbrack && PL_lex_brackets == PL_lex_formbrack) { @@ -6715,7 +6715,7 @@ Perl_yylex(pTHX) TERM(sublex_start()); case '`': - s = scan_str(s,!!PL_madskills,FALSE,FALSE); + s = scan_str(s,!!PL_madskills,FALSE,FALSE, FALSE); DEBUG_T( { printbuf("### Saw backtick string before %s\n", s); } ); if (PL_expect == XOPERATOR) no_op("Backticks",s); @@ -8174,7 +8174,7 @@ Perl_yylex(pTHX) LOP(OP_PIPE_OP,XTERM); case KEY_q: - s = scan_str(s,!!PL_madskills,FALSE,FALSE); + s = scan_str(s,!!PL_madskills,FALSE,FALSE, FALSE); if (!s) missingterm(NULL); pl_yylval.ival = OP_CONST; @@ -8185,7 +8185,7 @@ Perl_yylex(pTHX) case KEY_qw: { OP *words = NULL; - s = scan_str(s,!!PL_madskills,FALSE,FALSE); + s = scan_str(s,!!PL_madskills,FALSE,FALSE, FALSE); if (!s) missingterm(NULL); PL_expect = XOPERATOR; @@ -8235,7 +8235,7 @@ Perl_yylex(pTHX) } case KEY_qq: - s = scan_str(s,!!PL_madskills,FALSE,FALSE); + s = scan_str(s,!!PL_madskills,FALSE,FALSE, FALSE); if (!s) missingterm(NULL); pl_yylval.ival = OP_STRINGIFY; @@ -8248,7 +8248,7 @@ Perl_yylex(pTHX) TERM(sublex_start()); case KEY_qx: - s = scan_str(s,!!PL_madskills,FALSE,FALSE); + s = scan_str(s,!!PL_madskills,FALSE,FALSE, FALSE); if (!s) missingterm(NULL); readpipe_override(); @@ -8569,7 +8569,7 @@ Perl_yylex(pTHX) const bool warnillegalproto = ckWARN(WARN_ILLEGALPROTO); STRLEN tmplen; - s = scan_str(s,!!PL_madskills,FALSE,FALSE); + s = scan_str(s,!!PL_madskills,FALSE,FALSE, FALSE); if (!s) Perl_croak(aTHX_ "Prototype not terminated"); /* strip spaces and check for bad characters */ @@ -9508,7 +9508,8 @@ S_scan_pat(pTHX_ char *start, I32 type) { dVAR; PMOP *pm; - char *s = scan_str(start,!!PL_madskills,FALSE, PL_reg_state.re_reparsing); + char *s = scan_str(start,!!PL_madskills,FALSE, PL_reg_state.re_reparsing, + TRUE /* look for escaped bracketed metas */ ); const char * const valid_flags = (const char *)((type == OP_QR) ? QR_PAT_MODS : M_PAT_MODS); char charset = '\0'; /* character set modifier */ @@ -9611,7 +9612,8 @@ S_scan_subst(pTHX_ char *start) pl_yylval.ival = OP_NULL; - s = scan_str(start,!!PL_madskills,FALSE,FALSE); + s = scan_str(start,!!PL_madskills,FALSE,FALSE, + TRUE /* look for escaped bracketed metas */ ); if (!s) Perl_croak(aTHX_ "Substitution pattern not terminated"); @@ -9629,7 +9631,7 @@ S_scan_subst(pTHX_ char *start) #endif first_start = PL_multi_start; - s = scan_str(s,!!PL_madskills,FALSE,FALSE); + s = scan_str(s,!!PL_madskills,FALSE,FALSE, FALSE); if (!s) { if (PL_lex_stuff) { SvREFCNT_dec(PL_lex_stuff); @@ -9715,7 +9717,7 @@ S_scan_trans(pTHX_ char *start) pl_yylval.ival = OP_NULL; - s = scan_str(start,!!PL_madskills,FALSE,FALSE); + s = scan_str(start,!!PL_madskills,FALSE,FALSE, FALSE); if (!s) Perl_croak(aTHX_ "Transliteration pattern not terminated"); @@ -9731,7 +9733,7 @@ S_scan_trans(pTHX_ char *start) } #endif - s = scan_str(s,!!PL_madskills,FALSE,FALSE); + s = scan_str(s,!!PL_madskills,FALSE,FALSE, FALSE); if (!s) { if (PL_lex_stuff) { SvREFCNT_dec(PL_lex_stuff); @@ -10180,7 +10182,7 @@ S_scan_inputsymbol(pTHX_ char *start) if (d - PL_tokenbuf != len) { pl_yylval.ival = OP_GLOB; - s = scan_str(start,!!PL_madskills,FALSE,FALSE); + s = scan_str(start,!!PL_madskills,FALSE,FALSE, FALSE); if (!s) Perl_croak(aTHX_ "Glob not terminated"); return s; @@ -10322,7 +10324,11 @@ intro_sym: */ STATIC char * -S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims, int re_reparse) +S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims, int re_reparse, + bool deprecate_escaped_meta /* Should we issue a deprecation warning + for certain paired metacharacters that + appear escaped within it */ + ) { dVAR; SV *sv; /* scalar value: string */ @@ -10336,6 +10342,7 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims, int re_reparse) U8 termstr[UTF8_MAXBYTES]; /* terminating string */ STRLEN termlen; /* length of terminating string */ int last_off = 0; /* last position for nesting bracket */ + char *escaped_open = NULL; #ifdef PERL_MAD int stuffstart; char *tstart; @@ -10382,6 +10389,18 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims, int re_reparse) PL_multi_close = term; + /* A warning is raised if the input parameter requires it for escaped (by a + * backslash) paired metacharacters {} [] and () when the delimiters are + * those same characters, and the backslash is ineffective. This doesn't + * happen for <>, as they aren't metas. */ + if (deprecate_escaped_meta + && (PL_multi_open == PL_multi_close + || ! ckWARN_d(WARN_DEPRECATED) + || PL_multi_open == '<')) + { + deprecate_escaped_meta = FALSE; + } + /* create a new SV to hold the contents. 79 is the SV's initial length. What a random number. */ sv = newSV_type(SVt_PVIV); @@ -10520,7 +10539,44 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims, int re_reparse) if (*s == '\\' && s+1 < PL_bufend) { if (!keep_quoted && ((s[1] == PL_multi_open) || (s[1] == PL_multi_close))) + { s++; + + /* Here, 'deprecate_escaped_meta' is true iff the + * delimiters are paired metacharacters, and 's' points + * to an occurrence of one of them within the string, + * which was preceded by a backslash. If this is a + * context where the delimiter is also a metacharacter, + * the backslash is useless, and deprecated. () and [] + * are meta in any context. {} are meta only when + * appearing in a quantifier or in things like '\p{'. + * They also aren't meta unless there is a matching + * closed, escaped char later on within the string. + * If 's' points to an open, set a flag; if to a close, + * test that flag, and raise a warning if it was set */ + + if (deprecate_escaped_meta) { + if (*s == PL_multi_open) { + if (*s != '{') { + escaped_open = s; + } + else if (regcurly(s, + TRUE /* Look for a closing + '\}' */) + || (s - start > 2 /* Look for e.g. + '\x{' */ + && _generic_isCC(*(s-2), _CC_BACKSLASH_FOO_LBRACE_IS_META))) + { + escaped_open = s; + } + } + else if (escaped_open) { + Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), + "Useless use of '\\'; doesn't escape metacharacter '%c'", PL_multi_open); + escaped_open = NULL; + } + } + } else *to++ = *s++; } |