summaryrefslogtreecommitdiff
path: root/regcomp.c
diff options
context:
space:
mode:
authorKarl Williamson <public@khwilliamson.com>2013-04-23 13:39:35 -0600
committerKarl Williamson <public@khwilliamson.com>2013-05-02 13:39:00 -0600
commit000947ada5b027f394ee63b5166df8c06b64a74e (patch)
treee6b3d6d7078fea713cdde8a314b7ea3adbb25298 /regcomp.c
parenteb5682978c85adb0d49e6aef177310cfe054d2aa (diff)
downloadperl-000947ada5b027f394ee63b5166df8c06b64a74e.tar.gz
Deprecate spaces/comments in some regex tokens
This commit deprecates having space/comments between the first two characters of the regular expression forms '(*VERB:ARG)' and '(?...)'. That is, the '(' should be immediately followed by the '*' or '?'. Previously, things like: qr/((?# This is a comment in the middle of a token)?:foo)/ were silently accepted. The problem is that during regex parsing, the '(' is seen, and the input pointer is advanced, skipping comments and, under /x, white space, without regard to whether the left parenthesis is part of a bigger token or not. S_reg() handles the parsing of what comes next in the input, and it just assumes that the first character it sees is the one that immediately followed the input parenthesis. Since the parenthesis may or may not be a part of a bigger token, and given the current structure of handling things, I don't see an elegant way to fix this. What I did is flag the single call to S_reg() where this could be an issue, and have S_reg() check for adjacency if the parenthesis is part of a bigger token, and if so, warn if not adjacent.
Diffstat (limited to 'regcomp.c')
-rw-r--r--regcomp.c23
1 files changed, 20 insertions, 3 deletions
diff --git a/regcomp.c b/regcomp.c
index 95f895804e..d1bdf44f45 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -8601,7 +8601,10 @@ S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state)
cannot happen. */
STATIC regnode *
S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
- /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
+ /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
+ * 2 is like 1, but indicates that nextchar() has been called to advance
+ * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
+ * this flag alerts us to the need to check for that */
{
dVAR;
regnode *ret; /* Will be the head of the group. */
@@ -8629,6 +8632,13 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
/* Make an OPEN node, if parenthesized. */
if (paren) {
+
+ /* Under /x, space and comments can be gobbled up between the '(' and
+ * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
+ * intervening space, as the sequence is a token, and a token should be
+ * indivisible */
+ bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
+
if ( *RExC_parse == '*') { /* (*VERB:ARG) */
char *start_verb = RExC_parse;
STRLEN verb_len = 0;
@@ -8636,6 +8646,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
unsigned char op = 0;
int argok = 1;
int internal_argval = 0; /* internal_argval is only useful if !argok */
+
+ if (has_intervening_patws && SIZE_ONLY) {
+ ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated");
+ }
while ( *RExC_parse && *RExC_parse != ')' ) {
if ( *RExC_parse == ':' ) {
start_arg = RExC_parse + 1;
@@ -8737,6 +8751,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
if (*RExC_parse == '?') { /* (?...) */
bool is_logical = 0;
const char * const seqstart = RExC_parse;
+ if (has_intervening_patws && SIZE_ONLY) {
+ ckWARNregdep(RExC_parse + 1, "In '(?...)', splitting the initial '(?' is deprecated");
+ }
RExC_parse++;
paren = *RExC_parse++;
@@ -9322,7 +9339,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
case ':':
ender = reg_node(pRExC_state, TAIL);
break;
- case 1:
+ case 1: case 2:
ender = reganode(pRExC_state, CLOSE, parno);
if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
@@ -10312,7 +10329,7 @@ tryagain:
}
case '(':
nextchar(pRExC_state);
- ret = reg(pRExC_state, 1, &flags,depth+1);
+ ret = reg(pRExC_state, 2, &flags,depth+1);
if (ret == NULL) {
if (flags & TRYAGAIN) {
if (RExC_parse == RExC_end) {