Extend strictness for qr/(?[ \N{} ])/

This recently added regex syntax imposes stricter rules on parsing than normal. However, this did not include parsing \N{} constructs that occur within it. This commit does that, making fatal the warnings that come from \N{} I will add to perldiag the newly added messages along with the others for (?[ ]) before 5.18 ships
author: Karl Williamson <public@khwilliamson.com> 2013-01-18 21:58:10 -0700
committer: Karl Williamson <public@khwilliamson.com> 2013-01-19 09:57:53 -0700
commit: 902994e45aafa5c63ac8bf2219075daf29139b3c (patch)
tree: fe0282064d4457174f60becf2c8818e09d575680
parent: 7e62f75cb6326421c6dade1c6ca08206084d7348 (diff)
download: perl-902994e45aafa5c63ac8bf2219075daf29139b3c.tar.gz
7 files changed, 37 insertions, 13 deletions
diff --git a/embed.fnc b/embed.fnc
index 1fd1f4ee87..53c582d5be 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1983,9 +1983,10 @@ Es	|regnode*|reg_node	|NN struct RExC_state_t *pRExC_state|U8 op
 Es	|UV	|reg_recode	|const char value|NN SV **encp
 Es	|regnode*|regpiece	|NN struct RExC_state_t *pRExC_state \
 				|NN I32 *flagp|U32 depth
-Es	|bool	|grok_bslash_N	|NN struct RExC_state_t *pRExC_state \
-				|NULLOK regnode** nodep|NULLOK UV *valuep \
-				|NN I32 *flagp|U32 depth|bool in_char_class
+Es	|bool	|grok_bslash_N	|NN struct RExC_state_t *pRExC_state        \
+				|NULLOK regnode** nodep|NULLOK UV *valuep   \
+				|NN I32 *flagp|U32 depth|bool in_char_class \
+				|const bool strict
 Es	|void	|reginsert	|NN struct RExC_state_t *pRExC_state \
 				|U8 op|NN regnode *opnd|U32 depth
 Es	|void	|regtail	|NN struct RExC_state_t *pRExC_state \
diff --git a/embed.h b/embed.h
index 470805da5e..1df6ab4b1f 100644
--- a/embed.h
+++ b/embed.h
@@ -910,7 +910,7 @@
 #define get_invlist_previous_index_addr(a)	S_get_invlist_previous_index_addr(aTHX_ a)
 #define get_invlist_version_id_addr(a)	S_get_invlist_version_id_addr(aTHX_ a)
 #define get_invlist_zero_addr(a)	S_get_invlist_zero_addr(aTHX_ a)
-#define grok_bslash_N(a,b,c,d,e,f)	S_grok_bslash_N(aTHX_ a,b,c,d,e,f)
+#define grok_bslash_N(a,b,c,d,e,f,g)	S_grok_bslash_N(aTHX_ a,b,c,d,e,f,g)
 #define handle_sets(a,b,c,d)	S_handle_sets(aTHX_ a,b,c,d)
 #define invlist_array(a)	S_invlist_array(aTHX_ a)
 #define invlist_clone(a)	S_invlist_clone(aTHX_ a)
diff --git a/proto.h b/proto.h
index e0c3279fd3..0d0078d107 100644
--- a/proto.h
+++ b/proto.h
@@ -6503,7 +6503,7 @@ PERL_STATIC_INLINE UV*	S_get_invlist_zero_addr(pTHX_ SV* invlist)
 #define PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR	\
 	assert(invlist)
 
-STATIC bool	S_grok_bslash_N(pTHX_ struct RExC_state_t *pRExC_state, regnode** nodep, UV *valuep, I32 *flagp, U32 depth, bool in_char_class)
+STATIC bool	S_grok_bslash_N(pTHX_ struct RExC_state_t *pRExC_state, regnode** nodep, UV *valuep, I32 *flagp, U32 depth, bool in_char_class, const bool strict)
 			__attribute__nonnull__(pTHX_1)
 			__attribute__nonnull__(pTHX_4);
 #define PERL_ARGS_ASSERT_GROK_BSLASH_N	\
diff --git a/regcomp.c b/regcomp.c
index 05e9fe55f6..a22f8ff734 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -9634,7 +9634,9 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
 }
 
 STATIC bool
-S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class)
+S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
+        const bool strict   /* Apply stricter parsing rules? */
+    )
 {
    
  /* This is expected to be called by a parser routine that has recognized '\N'
@@ -9749,9 +9751,14 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I
 	}
         else if (in_char_class) {
             if (SIZE_ONLY && in_char_class) {
-                ckWARNreg(RExC_parse,
-                        "Ignoring zero length \\N{} in character class"
-                );
+                if (strict) {
+                    RExC_parse++;   /* Position after the "}" */
+                    vFAIL("Zero length \\N{}");
+                }
+                else {
+                    ckWARNreg(RExC_parse,
+                              "Ignoring zero length \\N{} in character class");
+                }
             }
             ret = FALSE;
 	}
@@ -9803,7 +9810,13 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I
 	}
 
         if (in_char_class && has_multiple_chars) {
-	    ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
+            if (strict) {
+                RExC_parse = endbrace;
+                vFAIL("\\N{} in character class restricted to one character");
+            }
+            else {
+                ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
+            }
         }
 
         RExC_parse = endbrace + 1;
@@ -10339,7 +10352,8 @@ tryagain:
              * special treatment for quantifiers is not needed for such single
              * character sequences */
             ++RExC_parse;
-            if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE)) {
+            if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
+                                FALSE /* not strict */ )) {
                 RExC_parse--;
                 goto defchar;
             }
@@ -10603,7 +10617,8 @@ tryagain:
                          * */
                         RExC_parse = p + 1;
                         if (! grok_bslash_N(pRExC_state, NULL, &ender,
-                                            flagp, depth, FALSE))
+                                            flagp, depth, FALSE,
+                                            FALSE /* not strict */ ))
                         {
                             RExC_parse = p = oldp;
                             goto loopdone;
@@ -11988,7 +12003,8 @@ parseit:
                     from earlier versions, OTOH that behaviour was broken
                     as well. */
                     if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
-                                      TRUE /* => charclass */))
+                                      TRUE, /* => charclass */
+                                      strict))
                     {
                         goto parseit;
                     }
diff --git a/t/porting/diag.t b/t/porting/diag.t
index 13f1811eb3..7355151dc9 100644
--- a/t/porting/diag.t
+++ b/t/porting/diag.t
@@ -658,3 +658,5 @@ Operation "%s" returns its argument for UTF-16 surrogate U+%X
 Unicode surrogate U+%X is illegal in UTF-8
 UTF-16 surrogate U+%X
 False [] range "%s" in regex; marked by <-- HERE in m/%s/
+\N{} in character class restricted to one character in regex; marked by <-- HERE in m/%s/
+Zero length \N{} in regex; marked by <-- HERE in m/%s/
diff --git a/t/re/pat_advanced.t b/t/re/pat_advanced.t
index a411220c7a..60ae9d6a42 100644
--- a/t/re/pat_advanced.t
+++ b/t/re/pat_advanced.t
@@ -1014,6 +1014,9 @@ sub run_tests {
         ok "\N{LONG-STR}" =~ /^\N{LONG-STR}$/, 'Verify that long string works';
         ok "\N{LONG-STR}" =~ /^\N{LONG-STR}$/i, 'Verify under folding that long string works';
 
+        eval '/(?[[\N{EMPTY-STR}]])/';
+        ok $@ && $@ =~ /Zero length \\N\{}/;
+
         undef $w;
         eval q [is("\N{TOO  MANY SPACES}", "TOO  MANY SPACES", "Multiple spaces in character name works")];
         like ($w, qr/A sequence of multiple spaces in a charnames alias definition is deprecated/, "... but returns a deprecation warning");
diff --git a/t/re/reg_mesg.t b/t/re/reg_mesg.t
index 14e9aceee2..30bc2d6354 100644
--- a/t/re/reg_mesg.t
+++ b/t/re/reg_mesg.t
@@ -147,7 +147,9 @@ my @death =
  'm/(?[[\w-x]])/' => 'False [] range "\w-" in regex; marked by {#} in m/(?[[\w-{#}x]])/',
  'm/(?[[a-\pM]])/' => 'False [] range "a-\pM" in regex; marked by {#} in m/(?[[a-\pM{#}]])/',
  'm/(?[[\pM-x]])/' => 'False [] range "\pM-" in regex; marked by {#} in m/(?[[\pM-{#}x]])/',
+ 'm/(?[[\N{LATIN CAPITAL LETTER A WITH MACRON AND GRAVE}]])/' => '\N{} in character class restricted to one character in regex; marked by {#} in m/(?[[\N{U+100.300{#}}]])/',
 );
+# Tests involving a user-defined charnames translator are in pat_advanced.t
 
 ##
 ## Key-value pairs of code/error of code that should have non-fatal warnings.
author	Karl Williamson <public@khwilliamson.com>	2013-01-18 21:58:10 -0700
committer	Karl Williamson <public@khwilliamson.com>	2013-01-19 09:57:53 -0700
commit	902994e45aafa5c63ac8bf2219075daf29139b3c (patch)
tree	fe0282064d4457174f60becf2c8818e09d575680
parent	7e62f75cb6326421c6dade1c6ca08206084d7348 (diff)
download	perl-902994e45aafa5c63ac8bf2219075daf29139b3c.tar.gz