summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- pod/perldelta.pod 8
-rw-r--r-- pod/perldiag.pod 12
-rw-r--r-- regcomp.c 142
3 files changed, 110 insertions, 52 deletions
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index d95df70c30..413845e29a 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -206,7 +206,13 @@ and New Warnings
=item *
-XXX L<message|perldiag/"message">
+L<Sequence (?PE<lt>... not terminated in regex; marked by S<<-- HERE> in mE<sol>%sE<sol>
+|perldiag/"Sequence (?PE<lt>... not terminated in regex; marked by <-- HERE in mE<sol>%sE<sol>">
+
+=item *
+
+L<Sequence (?PE<gt>... not terminated in regex; marked by S<<-- HERE> in mE<sol>%sE<sol>
|perldiag/"Sequence (?PE<gt>... not terminated in regex; marked by <-- HERE in mE<sol>%sE<sol>">
=back
diff --git a/pod/perldiag.pod b/pod/perldiag.pod
index 80a125e5b4..35d3edba3d 100644
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -5343,6 +5343,18 @@ m/%s/
closing parenthesis after the name. The S<<-- HERE> shows whereabouts
in the regular expression the problem was discovered.
+=item Sequence (?PE<lt>... not terminated in regex; marked by S<<-- HERE> in m/%s/
+
+(F) A named group of the form C<(?PE<lt>...E<gt>)> was missing the final
+closing angle bracket. The S<<-- HERE> shows whereabouts in the
+regular expression the problem was discovered.
+
+=item Sequence (?PE<gt>... not terminated in regex; marked by S<<-- HERE> in m/%s/
+
+(F) A named reference of the form C<(?PE<gt>...)> was missing the final
+closing parenthesis after the name. The S<<-- HERE> shows whereabouts
+in the regular expression the problem was discovered.
+
=item Sequence (?R) not terminated in regex m/%s/
(F) An C<(?R)> or C<(?0)> sequence in a regular expression was missing the
diff --git a/regcomp.c b/regcomp.c
index c00fdffd93..aa06bc6e8d 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -9871,7 +9871,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
cs = REGEX_UNICODE_CHARSET;
}
- while (*RExC_parse) {
+ while (RExC_parse < RExC_end) {
/* && strchr("iogcmsx", *RExC_parse) */
/* (?g), (?gc) and (?o) are useless here
and must be globally applied -- japhy */
@@ -10029,7 +10029,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
NOT_REACHED; /*NOTREACHED*/
}
- ++RExC_parse;
+ RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
}
vFAIL("Sequence (?... not terminated");
@@ -10061,12 +10061,11 @@ S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state,
regnode *ret;
char* name_start = RExC_parse;
U32 num = 0;
-
- GET_RE_DEBUG_FLAGS_DECL;
-
SV *sv_dat = reg_scan_name(pRExC_state, SIZE_ONLY
? REG_RSN_RETURN_NULL
: REG_RSN_RETURN_DATA);
+ GET_RE_DEBUG_FLAGS_DECL;
+
PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF;
if (RExC_parse == name_start || *RExC_parse != ch) {
@@ -10152,9 +10151,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
* indivisible */
bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
+ assert(RExC_parse < RExC_end);
+
if ( *RExC_parse == '*') { /* (*VERB:ARG) */
- char *start_verb = RExC_parse;
- STRLEN verb_len = 0;
+ char *start_verb = RExC_parse + 1;
+ STRLEN verb_len;
char *start_arg = NULL;
unsigned char op = 0;
int arg_required = 0;
@@ -10164,28 +10165,33 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
RExC_parse++;
vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
}
- while ( *RExC_parse && *RExC_parse != ')' ) {
+ while (RExC_parse < RExC_end && *RExC_parse != ')' ) {
if ( *RExC_parse == ':' ) {
start_arg = RExC_parse + 1;
break;
}
- RExC_parse++;
+ RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
}
- ++start_verb;
verb_len = RExC_parse - start_verb;
if ( start_arg ) {
- RExC_parse++;
- while ( *RExC_parse && *RExC_parse != ')' )
- RExC_parse++;
- if ( *RExC_parse != ')' )
+ if (RExC_parse >= RExC_end) {
+ goto unterminated_verb_pattern;
+ }
+ RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
+ while ( RExC_parse < RExC_end && *RExC_parse != ')' )
+ RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
+ if ( RExC_parse >= RExC_end || *RExC_parse != ')' )
+ unterminated_verb_pattern:
vFAIL("Unterminated verb pattern argument");
if ( RExC_parse == start_arg )
start_arg = NULL;
} else {
- if ( *RExC_parse != ')' )
+ if ( RExC_parse >= RExC_end || *RExC_parse != ')' )
vFAIL("Unterminated verb pattern");
}
+ /* Here, we know that RExC_parse < RExC_end */
+
switch ( *start_verb ) {
case 'A': /* (*ACCEPT) */
if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
@@ -10268,22 +10274,36 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
}
RExC_parse++;
- paren = *RExC_parse++;
+ paren = *RExC_parse; /* might be a trailing NUL, if not
+ well-formed */
+ RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
+ if (RExC_parse > RExC_end) {
+ paren = '\0';
+ }
ret = NULL; /* For lookahead/behind. */
switch (paren) {
case 'P': /* (?P...) variants for those used to PCRE/Python */
- paren = *RExC_parse++;
- if ( paren == '<') /* (?P<...>) named capture */
+ paren = *RExC_parse;
+ if ( paren == '<') { /* (?P<...>) named capture */
+ RExC_parse++;
+ if (RExC_parse >= RExC_end) {
+ vFAIL("Sequence (?P<... not terminated");
+ }
goto named_capture;
+ }
else if (paren == '>') { /* (?P>name) named recursion */
+ RExC_parse++;
+ if (RExC_parse >= RExC_end) {
+ vFAIL("Sequence (?P>... not terminated");
+ }
goto named_recursion;
}
else if (paren == '=') { /* (?P=...) named backref */
+ RExC_parse++;
return handle_named_backref(pRExC_state, flagp,
parse_start, ')');
}
- --RExC_parse;
RExC_parse += SKIP_IF_CHAR(RExC_parse);
/* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
vFAIL3("Sequence (%.*s...) not recognized",
@@ -10304,9 +10324,13 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
SIZE_ONLY /* reverse test from the others */
? REG_RSN_RETURN_NAME
: REG_RSN_RETURN_NULL);
- if (RExC_parse == name_start || *RExC_parse != paren)
+ if ( RExC_parse == name_start
+ || RExC_parse >= RExC_end
+ || *RExC_parse != paren)
+ {
vFAIL2("Sequence (?%c... not terminated",
paren=='>' ? '<' : paren);
+ }
if (SIZE_ONLY) {
HE *he_str;
SV *sv_dat = NULL;
@@ -10374,6 +10398,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
RExC_seen |= REG_LOOKBEHIND_SEEN;
RExC_in_lookbehind++;
RExC_parse++;
+ assert(RExC_parse < RExC_end);
/* FALLTHROUGH */
case '=': /* (?=...) */
RExC_seen_zerolen++;
@@ -10421,7 +10446,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
}
- if (RExC_parse == RExC_end || *RExC_parse != ')')
+ if (RExC_parse >= RExC_end || *RExC_parse != ')')
vFAIL("Sequence (?&... not terminated");
goto gen_recurse_regop;
/* NOTREACHED */
@@ -10440,7 +10465,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
/* FALLTHROUGH */
case '1': case '2': case '3': case '4': /* (?1) */
case '5': case '6': case '7': case '8': case '9':
- RExC_parse--;
+ RExC_parse = (char *) seqstart + 1; /* Point to the digit */
parse_recursion:
{
bool is_neg = FALSE;
@@ -10520,7 +10545,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
NOT_REACHED; /*NOTREACHED*/
}
*flagp |= POSTPONED;
- paren = *RExC_parse++;
+ paren = '{';
+ RExC_parse++;
/* FALLTHROUGH */
case '{': /* (?{...}) */
{
@@ -10587,11 +10613,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
int is_define= 0;
const int DEFINE_len = sizeof("DEFINE") - 1;
if (RExC_parse[0] == '?') { /* (?(?...)) */
- if (
- RExC_parse[1] == '=' ||
- RExC_parse[1] == '!' ||
- RExC_parse[1] == '<' ||
- RExC_parse[1] == '{'
+ if ( RExC_parse < RExC_end - 1
+ && ( RExC_parse[1] == '='
+ || RExC_parse[1] == '!'
+ || RExC_parse[1] == '<'
+ || RExC_parse[1] == '{')
) { /* Lookahead or eval. */
I32 flag;
regnode *tail;
@@ -10619,9 +10645,13 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
U32 num = 0;
SV *sv_dat=reg_scan_name(pRExC_state,
SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
- if (RExC_parse == name_start || *RExC_parse != ch)
+ if ( RExC_parse == name_start
+ || RExC_parse >= RExC_end
+ || *RExC_parse != ch)
+ {
vFAIL2("Sequence (?(%c... not terminated",
(ch == '>' ? '<' : ch));
+ }
RExC_parse++;
if (!SIZE_ONLY) {
num = add_data( pRExC_state, STR_WITH_LEN("S"));
@@ -10725,7 +10755,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
else
lastbr = NULL;
if (c != ')') {
- if (RExC_parse>RExC_end)
+ if (RExC_parse >= RExC_end)
vFAIL("Switch (?(condition)... not terminated");
else
vFAIL("Switch (?(condition)... contains too many branches");
@@ -10754,11 +10784,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
vFAIL("Sequence (? incomplete");
break;
default: /* e.g., (?i) */
- --RExC_parse;
+ RExC_parse = (char *) seqstart + 1;
parse_flags:
parse_lparen_question_flags(pRExC_state);
if (UCHARAT(RExC_parse) != ':') {
- if (*RExC_parse)
+ if (RExC_parse < RExC_end)
nextchar(pRExC_state);
*flagp = TRYAGAIN;
return NULL;
@@ -11205,7 +11235,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
ret = reganode(pRExC_state, OPFAIL, 0);
return ret;
}
- else if (min == max && RExC_parse < RExC_end && *RExC_parse == '?')
+ else if (min == max && *RExC_parse == '?')
{
if (PASS2) {
ckWARN2reg(RExC_parse + 1,
@@ -11327,13 +11357,12 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
(void)ReREFCNT_inc(RExC_rx_sv);
}
- if (RExC_parse < RExC_end && *RExC_parse == '?') {
+ if (*RExC_parse == '?') {
nextchar(pRExC_state);
reginsert(pRExC_state, MINMOD, ret, depth+1);
REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
}
- else
- if (RExC_parse < RExC_end && *RExC_parse == '+') {
+ else if (*RExC_parse == '+') {
regnode *ender;
nextchar(pRExC_state);
ender = reg_node(pRExC_state, SUCCEED);
@@ -11344,7 +11373,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
REGTAIL(pRExC_state, ret, ender);
}
- if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
+ if (ISMULT2(RExC_parse)) {
RExC_parse++;
vFAIL("Nested quantifiers");
}
@@ -12002,6 +12031,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
tryagain:
parse_start = RExC_parse;
+ assert(RExC_parse < RExC_end);
switch ((U8)*RExC_parse) {
case '^':
RExC_seen_zerolen++;
@@ -12062,7 +12092,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
ret = reg(pRExC_state, 2, &flags,depth+1);
if (ret == NULL) {
if (flags & TRYAGAIN) {
- if (RExC_parse == RExC_end) {
+ if (RExC_parse >= RExC_end) {
/* Make parent create an empty node if needed. */
*flagp |= TRYAGAIN;
return(NULL);
@@ -12106,7 +12136,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
required, as the default for this switch is to jump to the
literal text handling code.
*/
- switch ((U8)*++RExC_parse) {
+ RExC_parse++;
+ switch ((U8)*RExC_parse) {
/* Special Escapes */
case 'A':
RExC_seen_zerolen++;
@@ -12174,7 +12205,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
ret = reg_node(pRExC_state, op);
*flagp |= SIMPLE;
- if (*(RExC_parse + 1) != '{') {
+ if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') {
FLAGS(ret) = TRADITIONAL_BOUND;
if (PASS2 && op > BOUNDA) { /* /aa is same as /a */
OP(ret) = BOUNDA;
@@ -12397,8 +12428,12 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
case 'k': /* Handle \k<NAME> and \k'NAME' */
parse_named_seq:
{
- char ch= RExC_parse[1];
- if (ch != '<' && ch != '\'' && ch != '{') {
+ char ch;
+ if ( RExC_parse >= RExC_end - 1
+ || (( ch = RExC_parse[1]) != '<'
+ && ch != '\''
+ && ch != '{'))
+ {
RExC_parse++;
/* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
vFAIL2("Sequence %.2s... not terminated",parse_start);
@@ -12440,6 +12475,9 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
goto parse_named_seq;
}
+ if (RExC_parse >= RExC_end) {
+ goto unterminated_g;
+ }
num = S_backref_value(RExC_parse);
if (num == 0)
vFAIL("Reference to invalid group 0");
@@ -12447,6 +12485,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
if (isDIGIT(*RExC_parse))
vFAIL("Reference to nonexistent group");
else
+ unterminated_g:
vFAIL("Unterminated \\g... pattern");
}
@@ -12850,7 +12889,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
p += numlen;
if (PASS2 /* like \08, \178 */
&& numlen < 3
- && p < RExC_end
&& isDIGIT(*p) && ckWARN(WARN_REGEXP))
{
reg_warn_non_literal_string(
@@ -14367,9 +14405,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
case ']':
if (depth--) break;
RExC_parse++;
- if (RExC_parse < RExC_end
- && *RExC_parse == ')')
- {
+ if (*RExC_parse == ')') {
node = reganode(pRExC_state, ANYOF, 0);
RExC_size += ANYOF_SKIP;
nextchar(pRExC_state);
@@ -14516,7 +14552,8 @@ redo_curchar:
case '(':
- if (RExC_parse < RExC_end && (UCHARAT(RExC_parse + 1) == '?'))
+ if ( RExC_parse < RExC_end - 1
+ && (UCHARAT(RExC_parse + 1) == '?'))
{
/* If is a '(?', could be an embedded '(?flags:(?[...])'.
* This happens when we have some thing like
@@ -14572,12 +14609,12 @@ redo_curchar:
* inversion list, and RExC_parse points to the trailing
* ']'; the next character should be the ')' */
RExC_parse++;
- assert(RExC_parse < RExC_end && UCHARAT(RExC_parse) == ')');
+ assert(UCHARAT(RExC_parse) == ')');
/* Then the ')' matching the original '(' handled by this
* case: statement */
RExC_parse++;
- assert(RExC_parse < RExC_end && UCHARAT(RExC_parse) == ')');
+ assert(UCHARAT(RExC_parse) == ')');
RExC_parse++;
RExC_flags = save_flags;
@@ -15173,8 +15210,7 @@ S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN c
#define SKIP_BRACKETED_WHITE_SPACE(do_skip, p) \
STMT_START { \
if (do_skip) { \
- while ( p < RExC_end \
- && isBLANK_A(UCHARAT(p))) \
+ while (isBLANK_A(UCHARAT(p))) \
{ \
p++; \
} \
@@ -15355,6 +15391,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
+ assert(RExC_parse <= RExC_end);
+
if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
RExC_parse++;
invert = TRUE;
@@ -17599,6 +17637,7 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state)
{
PERL_ARGS_ASSERT_NEXTCHAR;
+ if (RExC_parse < RExC_end) {
assert( ! UTF
|| UTF8_IS_INVARIANT(*RExC_parse)
|| UTF8_IS_START(*RExC_parse));
@@ -17607,6 +17646,7 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state)
skip_to_be_ignored_text(pRExC_state, &RExC_parse,
FALSE /* Don't assume /x */ );
+ }
}
STATIC regnode *