summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKarl Williamson <public@khwilliamson.com>2013-01-18 21:58:10 -0700
committerKarl Williamson <public@khwilliamson.com>2013-01-19 09:57:53 -0700
commit902994e45aafa5c63ac8bf2219075daf29139b3c (patch)
treefe0282064d4457174f60becf2c8818e09d575680
parent7e62f75cb6326421c6dade1c6ca08206084d7348 (diff)
downloadperl-902994e45aafa5c63ac8bf2219075daf29139b3c.tar.gz
Extend strictness for qr/(?[ \N{} ])/
This recently added regex syntax imposes stricter rules on parsing than normal. However, this did not include parsing \N{} constructs that occur within it. This commit does that, making fatal the warnings that come from \N{} I will add to perldiag the newly added messages along with the others for (?[ ]) before 5.18 ships
-rw-r--r--embed.fnc7
-rw-r--r--embed.h2
-rw-r--r--proto.h2
-rw-r--r--regcomp.c32
-rw-r--r--t/porting/diag.t2
-rw-r--r--t/re/pat_advanced.t3
-rw-r--r--t/re/reg_mesg.t2
7 files changed, 37 insertions, 13 deletions
diff --git a/embed.fnc b/embed.fnc
index 1fd1f4ee87..53c582d5be 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1983,9 +1983,10 @@ Es |regnode*|reg_node |NN struct RExC_state_t *pRExC_state|U8 op
Es |UV |reg_recode |const char value|NN SV **encp
Es |regnode*|regpiece |NN struct RExC_state_t *pRExC_state \
|NN I32 *flagp|U32 depth
-Es |bool |grok_bslash_N |NN struct RExC_state_t *pRExC_state \
- |NULLOK regnode** nodep|NULLOK UV *valuep \
- |NN I32 *flagp|U32 depth|bool in_char_class
+Es |bool |grok_bslash_N |NN struct RExC_state_t *pRExC_state \
+ |NULLOK regnode** nodep|NULLOK UV *valuep \
+ |NN I32 *flagp|U32 depth|bool in_char_class \
+ |const bool strict
Es |void |reginsert |NN struct RExC_state_t *pRExC_state \
|U8 op|NN regnode *opnd|U32 depth
Es |void |regtail |NN struct RExC_state_t *pRExC_state \
diff --git a/embed.h b/embed.h
index 470805da5e..1df6ab4b1f 100644
--- a/embed.h
+++ b/embed.h
@@ -910,7 +910,7 @@
#define get_invlist_previous_index_addr(a) S_get_invlist_previous_index_addr(aTHX_ a)
#define get_invlist_version_id_addr(a) S_get_invlist_version_id_addr(aTHX_ a)
#define get_invlist_zero_addr(a) S_get_invlist_zero_addr(aTHX_ a)
-#define grok_bslash_N(a,b,c,d,e,f) S_grok_bslash_N(aTHX_ a,b,c,d,e,f)
+#define grok_bslash_N(a,b,c,d,e,f,g) S_grok_bslash_N(aTHX_ a,b,c,d,e,f,g)
#define handle_sets(a,b,c,d) S_handle_sets(aTHX_ a,b,c,d)
#define invlist_array(a) S_invlist_array(aTHX_ a)
#define invlist_clone(a) S_invlist_clone(aTHX_ a)
diff --git a/proto.h b/proto.h
index e0c3279fd3..0d0078d107 100644
--- a/proto.h
+++ b/proto.h
@@ -6503,7 +6503,7 @@ PERL_STATIC_INLINE UV* S_get_invlist_zero_addr(pTHX_ SV* invlist)
#define PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR \
assert(invlist)
-STATIC bool S_grok_bslash_N(pTHX_ struct RExC_state_t *pRExC_state, regnode** nodep, UV *valuep, I32 *flagp, U32 depth, bool in_char_class)
+STATIC bool S_grok_bslash_N(pTHX_ struct RExC_state_t *pRExC_state, regnode** nodep, UV *valuep, I32 *flagp, U32 depth, bool in_char_class, const bool strict)
__attribute__nonnull__(pTHX_1)
__attribute__nonnull__(pTHX_4);
#define PERL_ARGS_ASSERT_GROK_BSLASH_N \
diff --git a/regcomp.c b/regcomp.c
index 05e9fe55f6..a22f8ff734 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -9634,7 +9634,9 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
}
STATIC bool
-S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class)
+S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
+ const bool strict /* Apply stricter parsing rules? */
+ )
{
/* This is expected to be called by a parser routine that has recognized '\N'
@@ -9749,9 +9751,14 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I
}
else if (in_char_class) {
if (SIZE_ONLY && in_char_class) {
- ckWARNreg(RExC_parse,
- "Ignoring zero length \\N{} in character class"
- );
+ if (strict) {
+ RExC_parse++; /* Position after the "}" */
+ vFAIL("Zero length \\N{}");
+ }
+ else {
+ ckWARNreg(RExC_parse,
+ "Ignoring zero length \\N{} in character class");
+ }
}
ret = FALSE;
}
@@ -9803,7 +9810,13 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I
}
if (in_char_class && has_multiple_chars) {
- ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
+ if (strict) {
+ RExC_parse = endbrace;
+ vFAIL("\\N{} in character class restricted to one character");
+ }
+ else {
+ ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
+ }
}
RExC_parse = endbrace + 1;
@@ -10339,7 +10352,8 @@ tryagain:
* special treatment for quantifiers is not needed for such single
* character sequences */
++RExC_parse;
- if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE)) {
+ if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
+ FALSE /* not strict */ )) {
RExC_parse--;
goto defchar;
}
@@ -10603,7 +10617,8 @@ tryagain:
* */
RExC_parse = p + 1;
if (! grok_bslash_N(pRExC_state, NULL, &ender,
- flagp, depth, FALSE))
+ flagp, depth, FALSE,
+ FALSE /* not strict */ ))
{
RExC_parse = p = oldp;
goto loopdone;
@@ -11988,7 +12003,8 @@ parseit:
from earlier versions, OTOH that behaviour was broken
as well. */
if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
- TRUE /* => charclass */))
+ TRUE, /* => charclass */
+ strict))
{
goto parseit;
}
diff --git a/t/porting/diag.t b/t/porting/diag.t
index 13f1811eb3..7355151dc9 100644
--- a/t/porting/diag.t
+++ b/t/porting/diag.t
@@ -658,3 +658,5 @@ Operation "%s" returns its argument for UTF-16 surrogate U+%X
Unicode surrogate U+%X is illegal in UTF-8
UTF-16 surrogate U+%X
False [] range "%s" in regex; marked by <-- HERE in m/%s/
+\N{} in character class restricted to one character in regex; marked by <-- HERE in m/%s/
+Zero length \N{} in regex; marked by <-- HERE in m/%s/
diff --git a/t/re/pat_advanced.t b/t/re/pat_advanced.t
index a411220c7a..60ae9d6a42 100644
--- a/t/re/pat_advanced.t
+++ b/t/re/pat_advanced.t
@@ -1014,6 +1014,9 @@ sub run_tests {
ok "\N{LONG-STR}" =~ /^\N{LONG-STR}$/, 'Verify that long string works';
ok "\N{LONG-STR}" =~ /^\N{LONG-STR}$/i, 'Verify under folding that long string works';
+ eval '/(?[[\N{EMPTY-STR}]])/';
+ ok $@ && $@ =~ /Zero length \\N\{}/;
+
undef $w;
eval q [is("\N{TOO MANY SPACES}", "TOO MANY SPACES", "Multiple spaces in character name works")];
like ($w, qr/A sequence of multiple spaces in a charnames alias definition is deprecated/, "... but returns a deprecation warning");
diff --git a/t/re/reg_mesg.t b/t/re/reg_mesg.t
index 14e9aceee2..30bc2d6354 100644
--- a/t/re/reg_mesg.t
+++ b/t/re/reg_mesg.t
@@ -147,7 +147,9 @@ my @death =
'm/(?[[\w-x]])/' => 'False [] range "\w-" in regex; marked by {#} in m/(?[[\w-{#}x]])/',
'm/(?[[a-\pM]])/' => 'False [] range "a-\pM" in regex; marked by {#} in m/(?[[a-\pM{#}]])/',
'm/(?[[\pM-x]])/' => 'False [] range "\pM-" in regex; marked by {#} in m/(?[[\pM-{#}x]])/',
+ 'm/(?[[\N{LATIN CAPITAL LETTER A WITH MACRON AND GRAVE}]])/' => '\N{} in character class restricted to one character in regex; marked by {#} in m/(?[[\N{U+100.300{#}}]])/',
);
+# Tests involving a user-defined charnames translator are in pat_advanced.t
##
## Key-value pairs of code/error of code that should have non-fatal warnings.