summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2015-02-25 23:19:39 -0700
committerKarl Williamson <khw@cpan.org>2015-03-19 21:10:03 -0600
commit6798c95dd27b33efd71f394c18649af7bbaf42b7 (patch)
treeb8ac326854bc4865f83453c91b768bc30f110391
parent9269c59f17a7bb8e6d97c55ab987b934cfb48034 (diff)
downloadperl-6798c95dd27b33efd71f394c18649af7bbaf42b7.tar.gz
Change /(?[...]) to have normal operator precedence
This experimental feature now has the intersection operator ("&") higher precedence than the other binary operators.
-rw-r--r--embed.fnc1
-rw-r--r--embed.h1
-rw-r--r--pod/perldelta.pod10
-rw-r--r--pod/perlrecharclass.pod13
-rw-r--r--proto.h3
-rw-r--r--regcomp.c602
-rw-r--r--t/re/regex_sets.t12
7 files changed, 436 insertions, 206 deletions
diff --git a/embed.fnc b/embed.fnc
index ce36c6c64a..a909f7d5bc 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -2136,6 +2136,7 @@ Es |regnode*|regclass |NN RExC_state_t *pRExC_state \
Es |void|add_above_Latin1_folds|NN RExC_state_t *pRExC_state|const U8 cp \
|NN SV** invlist
Esn |bool|could_it_be_a_POSIX_class|NN RExC_state_t *pRExC_state
+EsnP |unsigned int|regex_set_precedence|const U8 my_operator
Es |regnode*|handle_regex_sets|NN RExC_state_t *pRExC_state \
|NULLOK SV ** return_invlist \
|NN I32 *flagp|U32 depth \
diff --git a/embed.h b/embed.h
index acbd1ea23a..4d9ca18439 100644
--- a/embed.h
+++ b/embed.h
@@ -992,6 +992,7 @@
#define regatom(a,b,c) S_regatom(aTHX_ a,b,c)
#define regbranch(a,b,c,d) S_regbranch(aTHX_ a,b,c,d)
#define regclass(a,b,c,d,e,f,g,h) S_regclass(aTHX_ a,b,c,d,e,f,g,h)
+#define regex_set_precedence S_regex_set_precedence
#define reginsert(a,b,c,d) S_reginsert(aTHX_ a,b,c,d)
#define regnode_guts(a,b,c,d) S_regnode_guts(aTHX_ a,b,c,d)
#define regpatws S_regpatws
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index 87ac56c019..6af8da5b78 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -45,6 +45,16 @@ XXX For a release on a stable branch, this section aspires to be:
[ List each incompatible change as a =head2 entry ]
+=head2 C<(?[...])> operators now follow standard Perl precedence
+
+This experimental feature allows set operations in regular expression
+patterns. Prior to this, the intersection operator had the same
+precedence as the other binary operators. Now it has higher precedence.
+This could lead to different outcomes than existing code expects (though
+the documentation has always noted that this change might happen,
+recommending fully parenthesizing the expressions). See
+L<perlrecharclass/Extended Bracketed Character Classes>.
+
=head1 Deprecations
XXX Any deprecated features, syntax, modules etc. should be listed here.
diff --git a/pod/perlrecharclass.pod b/pod/perlrecharclass.pod
index 84a9226451..20a439ac36 100644
--- a/pod/perlrecharclass.pod
+++ b/pod/perlrecharclass.pod
@@ -1025,14 +1025,11 @@ There is one unary operator:
! complement
-All the binary operators left associate, and are of equal precedence.
-The unary operator right associates, and has higher precedence. Use
-parentheses to override the default associations. Some feedback we've
-received indicates a desire for intersection to have higher precedence
-than union. This is something that feedback from the field may cause us
-to change in future releases; you may want to parenthesize copiously to
-avoid such changes affecting your code, until this feature is no longer
-considered experimental.
+All the binary operators left associate; C<"&"> is higher precedence
+than the others, which all have equal precedence. The unary operator
+right associates, and has highest precedence. Thus this follows the
+normal Perl precedence rules for logical operators. Use parentheses to
+override the default precedence and associativity.
The main restriction is that everything is a metacharacter. Thus,
you cannot refer to single characters by doing something like this:
diff --git a/proto.h b/proto.h
index 4bc200dae6..f45a4a3617 100644
--- a/proto.h
+++ b/proto.h
@@ -7100,6 +7100,9 @@ STATIC regnode* S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 dept
#define PERL_ARGS_ASSERT_REGCLASS \
assert(pRExC_state); assert(flagp)
+STATIC unsigned int S_regex_set_precedence(const U8 my_operator)
+ __attribute__pure__;
+
STATIC void S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
__attribute__nonnull__(pTHX_1)
__attribute__nonnull__(pTHX_3);
diff --git a/regcomp.c b/regcomp.c
index d736a0131a..51065d58f2 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -13266,6 +13266,34 @@ S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state)
&& first_char == *(p - 1));
}
+STATIC unsigned int
+S_regex_set_precedence(const U8 my_operator) {
+
+ /* Returns the precedence in the (?[...]) construct of the input operator,
+ * specified by its character representation. The precedence follows
+ * general Perl rules, but it extends this so that ')' and ']' have (low)
+ * precedence even though they aren't really operators */
+
+ switch (my_operator) {
+ case '!':
+ return 5;
+ case '&':
+ return 4;
+ case '^':
+ case '|':
+ case '+':
+ case '-':
+ return 3;
+ case ')':
+ return 2;
+ case ']':
+ return 1;
+ }
+
+ NOT_REACHED; /* NOTREACHED */
+ return 0; /* Silence compiler warning */
+}
+
STATIC regnode *
S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
I32 *flagp, U32 depth,
@@ -13273,24 +13301,35 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
{
/* Handle the (?[...]) construct to do set operations */
- U8 curchar;
- UV start, end; /* End points of code point ranges */
- SV* result_string;
- char *save_end, *save_parse;
- SV* final;
- STRLEN len;
- regnode* node;
- AV* stack;
- const bool save_fold = FOLD;
+ U8 curchar; /* Current character being parsed */
+ UV start, end; /* End points of code point ranges */
+ SV* final = NULL; /* The end result inversion list */
+ SV* result_string; /* 'final' stringified */
+ AV* stack; /* stack of operators and operands not yet
+ resolved */
+ AV* fence_stack = NULL; /* A stack containing the positions in
+ 'stack' of where the undealt-with left
+ parens would be if they were actually
+ put there */
+ IV fence = 0; /* Position of where most recent undealt-
+ with left paren in stack is; -1 if none.
+ */
+ STRLEN len; /* Temporary */
+ regnode* node; /* Temporary, and final regnode returned by
+ this function */
+ const bool save_fold = FOLD; /* Temporary */
+ char *save_end, *save_parse; /* Temporaries */
GET_RE_DEBUG_FLAGS_DECL;
PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
- if (LOC) {
+ if (LOC) { /* XXX could make valid in UTF-8 locales */
vFAIL("(?[...]) not valid in locale");
}
- RExC_uni_semantics = 1;
+ RExC_uni_semantics = 1; /* The use of this operator implies /u. This
+ is required so that the compile time values
+ are valid in all runtime cases */
/* This will return only an ANYOF regnode, or (unlikely) something smaller
* (such as EXACT). Thus we can skip most everything if just sizing. We
@@ -13299,16 +13338,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
* upon an unescaped ']' that isn't one ending a regclass. To do both
* these things, we need to realize that something preceded by a backslash
* is escaped, so we have to keep track of backslashes */
- if (PASS2) {
- Perl_ck_warner_d(aTHX_
- packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
- "The regex_sets feature is experimental" REPORT_LOCATION,
- UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
- UTF8fARG(UTF,
- RExC_end - RExC_start - (RExC_parse - RExC_precomp),
- RExC_precomp + (RExC_parse - RExC_precomp)));
- }
- else {
+ if (SIZE_ONLY) {
UV depth = 0; /* how many nested (?[...]) constructs */
while (RExC_parse < RExC_end) {
@@ -13354,8 +13384,8 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
TRUE, /* strict */
&current
))
- FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
- (UV) *flagp);
+ FAIL2("panic: regclass returned NULL to handle_sets, "
+ "flags=%#"UVxf"", (UV) *flagp);
/* function call leaves parse pointing to the ']', except
* if we faked it */
@@ -13389,69 +13419,124 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
FAIL("Syntax error in (?[...])");
}
- /* Pass 2 only after this. Everything in this construct is a
- * metacharacter. Operands begin with either a '\' (for an escape
- * sequence), or a '[' for a bracketed character class. Any other
- * character should be an operator, or parenthesis for grouping. Both
- * types of operands are handled by calling regclass() to parse them. It
- * is called with a parameter to indicate to return the computed inversion
- * list. The parsing here is implemented via a stack. Each entry on the
- * stack is a single character representing one of the operators, or the
- * '('; or else a pointer to an operand inversion list. */
+ /* Pass 2 only after this. */
+ Perl_ck_warner_d(aTHX_
+ packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
+ "The regex_sets feature is experimental" REPORT_LOCATION,
+ UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
+ UTF8fARG(UTF,
+ RExC_end - RExC_start - (RExC_parse - RExC_precomp),
+ RExC_precomp + (RExC_parse - RExC_precomp)));
+
+ /* Everything in this construct is a metacharacter. Operands begin with
+ * either a '\' (for an escape sequence), or a '[' for a bracketed
+ * character class. Any other character should be an operator, or
+ * parenthesis for grouping. Both types of operands are handled by calling
+ * regclass() to parse them. It is called with a parameter to indicate to
+ * return the computed inversion list. The parsing here is implemented via
+ * a stack. Each entry on the stack is a single character representing one
+ * of the operators; or else a pointer to an operand inversion list. */
#define IS_OPERAND(a) (! SvIOK(a))
- /* The stack starts empty. It is a syntax error if the first thing parsed
- * is a binary operator; everything else is pushed on the stack. When an
- * operand is parsed, the top of the stack is examined. If it is a binary
- * operator, the item before it should be an operand, and both are replaced
- * by the result of doing that operation on the new operand and the one on
- * the stack. Thus a sequence of binary operands is reduced to a single
- * one before the next one is parsed.
+ /* The stack is kept in Łukasiewicz order. (That's pronounced similar
+ * to luke-a-shave-itch (or -itz), but people who didn't want to bother
+ * with prounouncing it called it Reverse Polish instead, but now that YOU
+ * know how to prounounce it you can use the correct term, thus giving due
+ * credit to the person who invented it, and impressing your geek friends.
+ * Wikipedia says that the pronounciation of "Ł" has been changing so that
+ * it is now more like an English initial W (as in wonk) than an L.)
+ *
+ * This means that, for example, 'a | b & c' is stored on the stack as
+ *
+ * c [4]
+ * b [3]
+ * & [2]
+ * a [1]
+ * | [0]
+ *
+ * where the numbers in brackets give the stack [array] element number.
+ * In this implementation, parentheses are not stored on the stack.
+ * Instead a '(' creates a "fence" so that the part of the stack below the
+ * fence is invisible except to the corresponding ')' (this allows us to
+ * replace testing for parens, by using instead subtraction of the fence
+ * position). As new operands are processed they are pushed onto the stack
+ * (except as noted in the next paragraph). New operators of higher
+ * precedence than the current final one are inserted on the stack before
+ * the lhs operand (so that when the rhs is pushed next, everything will be
+ * in the correct positions shown above. When an operator of equal or
+ * lower precedence is encountered in parsing, all the stacked operations
+ * of equal or higher precedence are evaluated, leaving the result as the
+ * top entry on the stack. This makes higher precedence operations
+ * evaluate before lower precedence ones, and causes operations of equal
+ * precedence to left associate.
*
- * A unary operator may immediately follow a binary in the input, for
- * example
+ * The only unary operator '!' is immediately pushed onto the stack when
+ * encountered. When an operand is encountered, if the top of the stack is
+ * a '!", the complement is immediately performed, and the '!' popped. The
+ * resulting value is treated as a new operand, and the logic in the
+ * previous paragraph is executed. Thus in the expression
* [a] + ! [b]
- * When an operand is parsed and the top of the stack is a unary operator,
- * the operation is performed, and then the stack is rechecked to see if
- * this new operand is part of a binary operation; if so, it is handled as
- * above.
+ * the stack looks like
+ *
+ * !
+ * a
+ * +
+ *
+ * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
+ * becomes
+ *
+ * !b
+ * a
+ * +
+ *
+ * A ')' is treated as an operator with lower precedence than all the
+ * aforementioned ones, which causes all operations on the stack above the
+ * corresponding '(' to be evaluated down to a single resultant operand.
+ * Then the fence for the '(' is removed, and the operand goes through the
+ * algorithm above, without the fence.
*
- * A '(' is simply pushed on the stack; it is valid only if the stack is
- * empty, or the top element of the stack is an operator or another '('
- * (for which the parenthesized expression will become an operand). By the
- * time the corresponding ')' is parsed everything in between should have
- * been parsed and evaluated to a single operand (or else is a syntax
- * error), and is handled as a regular operand */
+ * A separate stack is kept of the fence positions, so that the position of
+ * the latest so-far unbalanced '(' is at the top of it.
+ *
+ * The ']' ending the construct is treated as the lowest operator of all,
+ * so that everything gets evaluated down to a single operand, which is the
+ * result */
sv_2mortal((SV *)(stack = newAV()));
+ sv_2mortal((SV *)(fence_stack = newAV()));
while (RExC_parse < RExC_end) {
- I32 top_index = av_tindex(stack);
- SV** top_ptr;
- SV* current = NULL;
+ I32 top_index; /* Index of top-most element in 'stack' */
+ SV** top_ptr; /* Pointer to top 'stack' element */
+ SV* current = NULL; /* To contain the current inversion list
+ operand */
+ SV* only_to_avoid_leaks;
/* Skip white space */
RExC_parse = regpatws(pRExC_state, RExC_parse,
- TRUE /* means recognize comments */ );
+ TRUE /* means recognize comments */ );
if (RExC_parse >= RExC_end) {
Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
}
- if ((curchar = UCHARAT(RExC_parse)) == ']') {
- break;
- }
+
+ curchar = UCHARAT(RExC_parse);
+
+redo_curchar:
+
+ top_index = av_tindex(stack);
switch (curchar) {
+ SV** stacked_ptr; /* Ptr to something already on 'stack' */
+ char stacked_operator; /* The topmost operator on the 'stack'. */
+ SV* lhs; /* Operand to the left of the operator */
+ SV* rhs; /* Operand to the right of the operator */
+ SV* fence_ptr; /* Pointer to top element of the fence
+ stack */
+
+ case '(':
- case '?':
- if (av_tindex(stack) >= 0 /* This makes sure that we can
- safely subtract 1 from
- RExC_parse in the next clause.
- If we have something on the
- stack, we have parsed something
- */
- && UCHARAT(RExC_parse - 1) == '('
- && RExC_parse < RExC_end)
+ if (RExC_parse < RExC_end && (UCHARAT(RExC_parse + 1) == '?'))
{
/* If is a '(?', could be an embedded '(?flags:(?[...])'.
* This happens when we have some thing like
@@ -13466,14 +13551,18 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
* interpolated expression evaluates to. We use the flags
* from the interpolated pattern. */
U32 save_flags = RExC_flags;
- const char * const save_parse = ++RExC_parse;
+ const char * save_parse;
+
+ RExC_parse += 2; /* Skip past the '(?' */
+ save_parse = RExC_parse;
+ /* Parse any flags for the '(?' */
parse_lparen_question_flags(pRExC_state);
if (RExC_parse == save_parse /* Makes sure there was at
- least one flag (or this
- embedding wasn't compiled)
- */
+ least one flag (or else
+ this embedding wasn't
+ compiled) */
|| RExC_parse >= RExC_end - 4
|| UCHARAT(RExC_parse) != ':'
|| UCHARAT(++RExC_parse) != '('
@@ -13493,25 +13582,50 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
}
vFAIL("Expecting '(?flags:(?[...'");
}
+
+ /* Recurse, with the meat of the embedded expression */
RExC_parse++;
(void) handle_regex_sets(pRExC_state, &current, flagp,
depth+1, oregcomp_parse);
/* Here, 'current' contains the embedded expression's
* inversion list, and RExC_parse points to the trailing
- * ']'; the next character should be the ')' which will be
- * paired with the '(' that has been put on the stack, so
- * the whole embedded expression reduces to '(operand)' */
+ * ']'; the next character should be the ')' */
+ RExC_parse++;
+ assert(RExC_parse < RExC_end && UCHARAT(RExC_parse) == ')');
+
+ /* Then the ')' matching the original '(' handled by this
+ * case: statement */
RExC_parse++;
+ assert(RExC_parse < RExC_end && UCHARAT(RExC_parse) == ')');
+ RExC_parse++;
RExC_flags = save_flags;
goto handle_operand;
}
- /* FALLTHROUGH */
- default:
- RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
- vFAIL("Unexpected character");
+ /* A regular '('. Look behind for illegal syntax */
+ if (top_index - fence >= 0) {
+ /* If the top entry on the stack is an operator, it had
+ * better be a '!', otherwise the entry below the top
+ * operand should be an operator */
+ if ( ! (top_ptr = av_fetch(stack, top_index, FALSE))
+ || (! IS_OPERAND(*top_ptr) && SvUV(*top_ptr) != '!')
+ || top_index - fence < 1
+ || ! (stacked_ptr = av_fetch(stack,
+ top_index - 1,
+ FALSE))
+ || IS_OPERAND(*stacked_ptr))
+ {
+ RExC_parse++;
+ vFAIL("Unexpected '(' with no preceding operator");
+ }
+ }
+
+ /* Stack the position of this undealt-with left paren */
+ fence = top_index + 1;
+ av_push(fence_stack, newSViv(fence));
+ break;
case '\\':
/* regclass() can only return RESTART_UTF8 if multi-char
@@ -13521,10 +13635,12 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
FALSE, /* don't allow multi-char folds */
FALSE, /* don't silence non-portable warnings. */
TRUE, /* strict */
- &current
- ))
- FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
- (UV) *flagp);
+ &current))
+ {
+ FAIL2("panic: regclass returned NULL to handle_sets, "
+ "flags=%#"UVxf"", (UV) *flagp);
+ }
+
/* regclass() will return with parsing just the \ sequence,
* leaving the parse pointer at the next thing to parse */
RExC_parse--;
@@ -13548,8 +13664,11 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
TRUE, /* strict */
&current
))
- FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
- (UV) *flagp);
+ {
+ FAIL2("panic: regclass returned NULL to handle_sets, "
+ "flags=%#"UVxf"", (UV) *flagp);
+ }
+
/* function call leaves parse pointing to the ']', except if we
* faked it */
if (is_posix_class) {
@@ -13559,147 +13678,237 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
goto handle_operand;
}
+ case ']':
+ if (top_index >= 1) {
+ goto join_operators;
+ }
+
+ /* Only a single operand on the stack: are done */
+ goto done;
+
+ case ')':
+ if (av_tindex(fence_stack) < 0) {
+ RExC_parse++;
+ vFAIL("Unexpected ')'");
+ }
+
+ /* If at least two thing on the stack, treat this as an
+ * operator */
+ if (top_index - fence >= 1) {
+ goto join_operators;
+ }
+
+ /* Here only a single thing on the fenced stack, and there is a
+ * fence. Get rid of it */
+ fence_ptr = av_pop(fence_stack);
+ assert(fence_ptr);
+ fence = SvIV(fence_ptr) - 1;
+ SvREFCNT_dec_NN(fence_ptr);
+ fence_ptr = NULL;
+
+ if (fence < 0) {
+ fence = 0;
+ }
+
+ /* Having gotten rid of the fence, we pop the operand at the
+ * stack top and process it as a newly encountered operand */
+ current = av_pop(stack);
+ assert(IS_OPERAND(current));
+ goto handle_operand;
+
case '&':
case '|':
case '+':
case '-':
case '^':
- if (top_index < 0
+
+ /* These binary operators should have a left operand already
+ * parsed */
+ if ( top_index - fence < 0
+ || top_index - fence == 1
|| ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
|| ! IS_OPERAND(*top_ptr))
{
- RExC_parse++;
- vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
+ goto unexpected_binary;
}
- av_push(stack, newSVuv(curchar));
- break;
- case '!':
- av_push(stack, newSVuv(curchar));
- break;
+ /* If only the one operand is on the part of the stack visible
+ * to us, we just place this operator in the proper position */
+ if (top_index - fence < 2) {
- case '(':
- if (top_index >= 0) {
- top_ptr = av_fetch(stack, top_index, FALSE);
- assert(top_ptr);
- if (IS_OPERAND(*top_ptr)) {
- RExC_parse++;
- vFAIL("Unexpected '(' with no preceding operator");
- }
+ /* Place the operator before the operand */
+
+ SV* lhs = av_pop(stack);
+ av_push(stack, newSVuv(curchar));
+ av_push(stack, lhs);
+ break;
}
- av_push(stack, newSVuv(curchar));
- break;
- case ')':
- {
- SV* lparen;
- if (top_index < 1
- || ! (current = av_pop(stack))
- || ! IS_OPERAND(current)
- || ! (lparen = av_pop(stack))
- || IS_OPERAND(lparen)
- || SvUV(lparen) != '(')
+ /* But if there is something else on the stack, we need to
+ * process it before this new operator if and only if the
+ * stacked operation has equal or higher precedence than the
+ * new one */
+
+ join_operators:
+
+ /* The operator on the stack is supposed to be below both its
+ * operands */
+ if ( ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
+ || IS_OPERAND(*stacked_ptr))
{
- SvREFCNT_dec(current);
+ /* But if not, it's legal and indicates we are completely
+ * done if and only if we're currently processing a ']',
+ * which should be the final thing in the expression */
+ if (curchar == ']') {
+ goto done;
+ }
+
+ unexpected_binary:
RExC_parse++;
- vFAIL("Unexpected ')'");
+ vFAIL2("Unexpected binary operator '%c' with no "
+ "preceding operand", curchar);
}
- top_index -= 2;
- SvREFCNT_dec_NN(lparen);
+ stacked_operator = (char) SvUV(*stacked_ptr);
- /* FALLTHROUGH */
- }
+ if (regex_set_precedence(curchar)
+ > regex_set_precedence(stacked_operator))
+ {
+ /* Here, the new operator has higher precedence than the
+ * stacked one. This means we need to add the new one to
+ * the stack to await its rhs operand (and maybe more
+ * stuff). We put it before the lhs operand, leaving
+ * untouched the stacked operator and everything below it
+ * */
+ lhs = av_pop(stack);
+ assert(IS_OPERAND(lhs));
+
+ av_push(stack, newSVuv(curchar));
+ av_push(stack, lhs);
+ break;
+ }
- handle_operand:
+ /* Here, the new operator has equal or lower precedence than
+ * what's already there. This means the operation already
+ * there should be performed now, before the new one. */
+ rhs = av_pop(stack);
+ lhs = av_pop(stack);
- /* Here, we have an operand to process, in 'current' */
+ assert(IS_OPERAND(rhs));
+ assert(IS_OPERAND(lhs));
- if (top_index < 0) { /* Just push if stack is empty */
- av_push(stack, current);
- }
- else {
- SV* top = av_pop(stack);
- SV *prev = NULL;
- char current_operator;
-
- if (IS_OPERAND(top)) {
- SvREFCNT_dec_NN(top);
- SvREFCNT_dec_NN(current);
- vFAIL("Operand with no preceding operator");
+ switch (stacked_operator) {
+ case '&':
+ _invlist_intersection(lhs, rhs, &rhs);
+ break;
+
+ case '|':
+ case '+':
+ _invlist_union(lhs, rhs, &rhs);
+ break;
+
+ case '-':
+ _invlist_subtract(lhs, rhs, &rhs);
+ break;
+
+ case '^': /* The union minus the intersection */
+ {
+ SV* i = NULL;
+ SV* u = NULL;
+ SV* element;
+
+ _invlist_union(lhs, rhs, &u);
+ _invlist_intersection(lhs, rhs, &i);
+ /* _invlist_subtract will overwrite rhs
+ without freeing what it already contains */
+ element = rhs;
+ _invlist_subtract(u, i, &rhs);
+ SvREFCNT_dec_NN(i);
+ SvREFCNT_dec_NN(u);
+ SvREFCNT_dec_NN(element);
+ break;
}
- current_operator = (char) SvUV(top);
- switch (current_operator) {
- case '(': /* Push the '(' back on followed by the new
- operand */
- av_push(stack, top);
- av_push(stack, current);
- SvREFCNT_inc(top); /* Counters the '_dec' done
- just after the 'break', so
- it doesn't get wrongly freed
- */
- break;
+ }
+ SvREFCNT_dec(lhs);
+
+ /* Here, the higher precedence operation has been done, and the
+ * result is in 'rhs'. We overwrite the stacked operator with
+ * the result. Then we redo this code to either push the new
+ * operator onto the stack or perform any higher precedence
+ * stacked operation */
+ only_to_avoid_leaks = av_pop(stack);
+ SvREFCNT_dec(only_to_avoid_leaks);
+ av_push(stack, rhs);
+ goto redo_curchar;
+
+ case '!': /* Highest priority, right associative, so just push
+ onto stack */
+ av_push(stack, newSVuv(curchar));
+ break;
- case '!':
- _invlist_invert(current);
-
- /* Unlike binary operators, the top of the stack,
- * now that this unary one has been popped off, may
- * legally be an operator, and we now have operand
- * for it. */
- top_index--;
- SvREFCNT_dec_NN(top);
- goto handle_operand;
-
- case '&':
- prev = av_pop(stack);
- _invlist_intersection(prev,
- current,
- &current);
- av_push(stack, current);
- break;
+ default:
+ RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+ vFAIL("Unexpected character");
- case '|':
- case '+':
- prev = av_pop(stack);
- _invlist_union(prev, current, &current);
- av_push(stack, current);
- break;
+ handle_operand:
+
+ /* Here 'current' is the operand. If something is already on the
+ * stack, we have to check if it is a !. */
+ top_index = av_tindex(stack); /* Code above may have altered the
+ * stack in the time since we
+ * earlier set 'top_index'. */
+ if (top_index - fence >= 0) {
+ /* If the top entry on the stack is an operator, it had better
+ * be a '!', otherwise the entry below the top operand should
+ * be an operator */
+ top_ptr = av_fetch(stack, top_index, FALSE);
+ assert(top_ptr);
+ if (! IS_OPERAND(*top_ptr)) {
+
+ /* The only permissible operator at the top of the stack is
+ * '!', which is applied immediately to this operand. */
+ curchar = (char) SvUV(*top_ptr);
+ if (curchar != '!') {
+ SvREFCNT_dec(current);
+ vFAIL2("Unexpected binary operator '%c' with no "
+ "preceding operand", curchar);
+ }
- case '-':
- prev = av_pop(stack);;
- _invlist_subtract(prev, current, &current);
- av_push(stack, current);
- break;
+ _invlist_invert(current);
- case '^': /* The union minus the intersection */
- {
- SV* i = NULL;
- SV* u = NULL;
- SV* element;
-
- prev = av_pop(stack);
- _invlist_union(prev, current, &u);
- _invlist_intersection(prev, current, &i);
- /* _invlist_subtract will overwrite current
- without freeing what it already contains */
- element = current;
- _invlist_subtract(u, i, &current);
- av_push(stack, current);
- SvREFCNT_dec_NN(i);
- SvREFCNT_dec_NN(u);
- SvREFCNT_dec_NN(element);
- break;
- }
+ only_to_avoid_leaks = av_pop(stack);
+ SvREFCNT_dec(only_to_avoid_leaks);
+ top_index = av_tindex(stack);
- default:
- Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
+ /* And we redo with the inverted operand. This allows
+ * handling multiple ! in a row */
+ goto handle_operand;
+ }
+ /* Single operand is ok only for the non-binary ')'
+ * operator */
+ else if ((top_index - fence == 0 && curchar != ')')
+ || (top_index - fence > 0
+ && (! (stacked_ptr = av_fetch(stack,
+ top_index - 1,
+ FALSE))
+ || IS_OPERAND(*stacked_ptr))))
+ {
+ SvREFCNT_dec(current);
+ vFAIL("Operand with no preceding operator");
}
- SvREFCNT_dec_NN(top);
- SvREFCNT_dec(prev);
}
- }
+
+ /* Here there was nothing on the stack or the top element was
+ * another operand. Just add this new one */
+ av_push(stack, current);
+
+ } /* End of switch on next parse token */
RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+ } /* End of loop parsing through the construct */
+
+ done:
+ if (av_tindex(fence_stack) >= 0) {
+ vFAIL("Unmatched (");
}
if (av_tindex(stack) < 0 /* Was empty */
@@ -13707,6 +13916,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
|| ! IS_OPERAND(final)
|| av_tindex(stack) >= 0) /* More left on stack */
{
+ SvREFCNT_dec(final);
vFAIL("Incomplete expression within '(?[ ])'");
}
@@ -13731,6 +13941,8 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
}
}
+ /* About to generate an ANYOF (or similar) node from the inversion list we
+ * have calculated */
save_parse = RExC_parse;
RExC_parse = SvPV(result_string, len);
save_end = RExC_end;
diff --git a/t/re/regex_sets.t b/t/re/regex_sets.t
index c880897f37..48a4f00b8e 100644
--- a/t/re/regex_sets.t
+++ b/t/re/regex_sets.t
@@ -68,13 +68,19 @@ like("\N{LAO DIGIT NINE}", $thai_or_lao_digit, 'embedded qr/(?[ ])/ works');
unlike(chr(ord("\N{LAO DIGIT NINE}") + 1), $thai_or_lao_digit, 'embedded qr/(?[ ])/ works');
my $ascii_word = qr/(?[ \w ])/a;
-my $ascii_digits_plus_all_of_arabic = qr/(?[ \p{Digit} & $ascii_word + \p{Arabic} ])/;
+my $ascii_digits_plus_all_of_arabic = qr/(?[ \p{Arabic} + \p{Digit} & $ascii_word ])/;
like("9", $ascii_digits_plus_all_of_arabic, "/a, then interpolating and intersection works for ASCII in the set");
unlike("A", $ascii_digits_plus_all_of_arabic, "/a, then interpolating and intersection works for ASCII not in the set");
unlike("\N{BENGALI DIGIT ZERO}", $ascii_digits_plus_all_of_arabic, "/a, then interpolating and intersection works for non-ASCII not in either set");
unlike("\N{BENGALI LETTER A}", $ascii_digits_plus_all_of_arabic, "/a, then interpolating and intersection works for non-ASCII in one set");
-like("\N{ARABIC LETTER HAMZA}", $ascii_digits_plus_all_of_arabic, "interpolation and intersection is left-associative");
-like("\N{EXTENDED ARABIC-INDIC DIGIT ZERO}", $ascii_digits_plus_all_of_arabic, "interpolation and intersection is left-associative");
+like("\N{ARABIC LETTER HAMZA}", $ascii_digits_plus_all_of_arabic, "intersection has higher precedence than union");
+like("\N{EXTENDED ARABIC-INDIC DIGIT ZERO}", $ascii_digits_plus_all_of_arabic, "intersection has higher precedence than union");
+
+like("\r", qr/(?[ \p{lb=cr} ])/, '\r matches \p{lb=cr}');
+unlike("\r", qr/(?[ ! \p{lb=cr} ])/, '\r doesnt match ! \p{lb=cr}');
+like("\r", qr/(?[ ! ! \p{lb=cr} ])/, 'Two ! ! are the original');
+unlike("\r", qr/(?[ ! ! ! \p{lb=cr} ])/, 'Three ! ! ! are the complement');
+# left associatve
my $kelvin = qr/(?[ \N{KELVIN SIGN} ])/;
my $fold = qr/(?[ $kelvin ])/i;