diff options
author | Karl Williamson <public@khwilliamson.com> | 2013-01-18 21:58:10 -0700 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2013-01-19 09:57:53 -0700 |
commit | 902994e45aafa5c63ac8bf2219075daf29139b3c (patch) | |
tree | fe0282064d4457174f60becf2c8818e09d575680 | |
parent | 7e62f75cb6326421c6dade1c6ca08206084d7348 (diff) | |
download | perl-902994e45aafa5c63ac8bf2219075daf29139b3c.tar.gz |
Extend strictness for qr/(?[ \N{} ])/
This recently added regex syntax imposes stricter rules on parsing than
normal. However, this did not include parsing \N{} constructs that
occur within it. This commit does that, making fatal the warnings that
come from \N{}
I will add to perldiag the newly added messages along with the others
for (?[ ]) before 5.18 ships
-rw-r--r-- | embed.fnc | 7 | ||||
-rw-r--r-- | embed.h | 2 | ||||
-rw-r--r-- | proto.h | 2 | ||||
-rw-r--r-- | regcomp.c | 32 | ||||
-rw-r--r-- | t/porting/diag.t | 2 | ||||
-rw-r--r-- | t/re/pat_advanced.t | 3 | ||||
-rw-r--r-- | t/re/reg_mesg.t | 2 |
7 files changed, 37 insertions, 13 deletions
@@ -1983,9 +1983,10 @@ Es |regnode*|reg_node |NN struct RExC_state_t *pRExC_state|U8 op Es |UV |reg_recode |const char value|NN SV **encp Es |regnode*|regpiece |NN struct RExC_state_t *pRExC_state \ |NN I32 *flagp|U32 depth -Es |bool |grok_bslash_N |NN struct RExC_state_t *pRExC_state \ - |NULLOK regnode** nodep|NULLOK UV *valuep \ - |NN I32 *flagp|U32 depth|bool in_char_class +Es |bool |grok_bslash_N |NN struct RExC_state_t *pRExC_state \ + |NULLOK regnode** nodep|NULLOK UV *valuep \ + |NN I32 *flagp|U32 depth|bool in_char_class \ + |const bool strict Es |void |reginsert |NN struct RExC_state_t *pRExC_state \ |U8 op|NN regnode *opnd|U32 depth Es |void |regtail |NN struct RExC_state_t *pRExC_state \ @@ -910,7 +910,7 @@ #define get_invlist_previous_index_addr(a) S_get_invlist_previous_index_addr(aTHX_ a) #define get_invlist_version_id_addr(a) S_get_invlist_version_id_addr(aTHX_ a) #define get_invlist_zero_addr(a) S_get_invlist_zero_addr(aTHX_ a) -#define grok_bslash_N(a,b,c,d,e,f) S_grok_bslash_N(aTHX_ a,b,c,d,e,f) +#define grok_bslash_N(a,b,c,d,e,f,g) S_grok_bslash_N(aTHX_ a,b,c,d,e,f,g) #define handle_sets(a,b,c,d) S_handle_sets(aTHX_ a,b,c,d) #define invlist_array(a) S_invlist_array(aTHX_ a) #define invlist_clone(a) S_invlist_clone(aTHX_ a) @@ -6503,7 +6503,7 @@ PERL_STATIC_INLINE UV* S_get_invlist_zero_addr(pTHX_ SV* invlist) #define PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR \ assert(invlist) -STATIC bool S_grok_bslash_N(pTHX_ struct RExC_state_t *pRExC_state, regnode** nodep, UV *valuep, I32 *flagp, U32 depth, bool in_char_class) +STATIC bool S_grok_bslash_N(pTHX_ struct RExC_state_t *pRExC_state, regnode** nodep, UV *valuep, I32 *flagp, U32 depth, bool in_char_class, const bool strict) __attribute__nonnull__(pTHX_1) __attribute__nonnull__(pTHX_4); #define PERL_ARGS_ASSERT_GROK_BSLASH_N \ @@ -9634,7 +9634,9 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } STATIC bool -S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class) +S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class, + const bool strict /* Apply stricter parsing rules? */ + ) { /* This is expected to be called by a parser routine that has recognized '\N' @@ -9749,9 +9751,14 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I } else if (in_char_class) { if (SIZE_ONLY && in_char_class) { - ckWARNreg(RExC_parse, - "Ignoring zero length \\N{} in character class" - ); + if (strict) { + RExC_parse++; /* Position after the "}" */ + vFAIL("Zero length \\N{}"); + } + else { + ckWARNreg(RExC_parse, + "Ignoring zero length \\N{} in character class"); + } } ret = FALSE; } @@ -9803,7 +9810,13 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I } if (in_char_class && has_multiple_chars) { - ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class"); + if (strict) { + RExC_parse = endbrace; + vFAIL("\\N{} in character class restricted to one character"); + } + else { + ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class"); + } } RExC_parse = endbrace + 1; @@ -10339,7 +10352,8 @@ tryagain: * special treatment for quantifiers is not needed for such single * character sequences */ ++RExC_parse; - if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE)) { + if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE, + FALSE /* not strict */ )) { RExC_parse--; goto defchar; } @@ -10603,7 +10617,8 @@ tryagain: * */ RExC_parse = p + 1; if (! grok_bslash_N(pRExC_state, NULL, &ender, - flagp, depth, FALSE)) + flagp, depth, FALSE, + FALSE /* not strict */ )) { RExC_parse = p = oldp; goto loopdone; @@ -11988,7 +12003,8 @@ parseit: from earlier versions, OTOH that behaviour was broken as well. */ if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth, - TRUE /* => charclass */)) + TRUE, /* => charclass */ + strict)) { goto parseit; } diff --git a/t/porting/diag.t b/t/porting/diag.t index 13f1811eb3..7355151dc9 100644 --- a/t/porting/diag.t +++ b/t/porting/diag.t @@ -658,3 +658,5 @@ Operation "%s" returns its argument for UTF-16 surrogate U+%X Unicode surrogate U+%X is illegal in UTF-8 UTF-16 surrogate U+%X False [] range "%s" in regex; marked by <-- HERE in m/%s/ +\N{} in character class restricted to one character in regex; marked by <-- HERE in m/%s/ +Zero length \N{} in regex; marked by <-- HERE in m/%s/ diff --git a/t/re/pat_advanced.t b/t/re/pat_advanced.t index a411220c7a..60ae9d6a42 100644 --- a/t/re/pat_advanced.t +++ b/t/re/pat_advanced.t @@ -1014,6 +1014,9 @@ sub run_tests { ok "\N{LONG-STR}" =~ /^\N{LONG-STR}$/, 'Verify that long string works'; ok "\N{LONG-STR}" =~ /^\N{LONG-STR}$/i, 'Verify under folding that long string works'; + eval '/(?[[\N{EMPTY-STR}]])/'; + ok $@ && $@ =~ /Zero length \\N\{}/; + undef $w; eval q [is("\N{TOO MANY SPACES}", "TOO MANY SPACES", "Multiple spaces in character name works")]; like ($w, qr/A sequence of multiple spaces in a charnames alias definition is deprecated/, "... but returns a deprecation warning"); diff --git a/t/re/reg_mesg.t b/t/re/reg_mesg.t index 14e9aceee2..30bc2d6354 100644 --- a/t/re/reg_mesg.t +++ b/t/re/reg_mesg.t @@ -147,7 +147,9 @@ my @death = 'm/(?[[\w-x]])/' => 'False [] range "\w-" in regex; marked by {#} in m/(?[[\w-{#}x]])/', 'm/(?[[a-\pM]])/' => 'False [] range "a-\pM" in regex; marked by {#} in m/(?[[a-\pM{#}]])/', 'm/(?[[\pM-x]])/' => 'False [] range "\pM-" in regex; marked by {#} in m/(?[[\pM-{#}x]])/', + 'm/(?[[\N{LATIN CAPITAL LETTER A WITH MACRON AND GRAVE}]])/' => '\N{} in character class restricted to one character in regex; marked by {#} in m/(?[[\N{U+100.300{#}}]])/', ); +# Tests involving a user-defined charnames translator are in pat_advanced.t ## ## Key-value pairs of code/error of code that should have non-fatal warnings. |