Change /(?[...]) to have normal operator precedence

This experimental feature now has the intersection operator ("&") higher precedence than the other binary operators.
author: Karl Williamson <khw@cpan.org> 2015-02-25 23:19:39 -0700
committer: Karl Williamson <khw@cpan.org> 2015-03-19 21:10:03 -0600
commit: 6798c95dd27b33efd71f394c18649af7bbaf42b7 (patch)
tree: b8ac326854bc4865f83453c91b768bc30f110391
parent: 9269c59f17a7bb8e6d97c55ab987b934cfb48034 (diff)
download: perl-6798c95dd27b33efd71f394c18649af7bbaf42b7.tar.gz
7 files changed, 436 insertions, 206 deletions
diff --git a/embed.fnc b/embed.fnc
index ce36c6c64a..a909f7d5bc 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -2136,6 +2136,7 @@ Es	|regnode*|regclass	|NN RExC_state_t *pRExC_state                 \
 Es	|void|add_above_Latin1_folds|NN RExC_state_t *pRExC_state|const U8 cp \
 				|NN SV** invlist
 Esn	|bool|could_it_be_a_POSIX_class|NN RExC_state_t *pRExC_state
+EsnP	|unsigned int|regex_set_precedence|const U8 my_operator
 Es	|regnode*|handle_regex_sets|NN RExC_state_t *pRExC_state \
 				|NULLOK SV ** return_invlist            \
 				|NN I32 *flagp|U32 depth                \
diff --git a/embed.h b/embed.h
index acbd1ea23a..4d9ca18439 100644
--- a/embed.h
+++ b/embed.h
@@ -992,6 +992,7 @@
 #define regatom(a,b,c)		S_regatom(aTHX_ a,b,c)
 #define regbranch(a,b,c,d)	S_regbranch(aTHX_ a,b,c,d)
 #define regclass(a,b,c,d,e,f,g,h)	S_regclass(aTHX_ a,b,c,d,e,f,g,h)
+#define regex_set_precedence	S_regex_set_precedence
 #define reginsert(a,b,c,d)	S_reginsert(aTHX_ a,b,c,d)
 #define regnode_guts(a,b,c,d)	S_regnode_guts(aTHX_ a,b,c,d)
 #define regpatws		S_regpatws
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index 87ac56c019..6af8da5b78 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -45,6 +45,16 @@ XXX For a release on a stable branch, this section aspires to be:
 
 [ List each incompatible change as a =head2 entry ]
 
+=head2 C<(?[...])> operators now follow standard Perl precedence
+
+This experimental feature allows set operations in regular expression
+patterns.  Prior to this, the intersection operator had the same
+precedence as the other binary operators.  Now it has higher precedence.
+This could lead to different outcomes than existing code expects (though
+the documentation has always noted that this change might happen,
+recommending fully parenthesizing the expressions).  See
+L<perlrecharclass/Extended Bracketed Character Classes>.
+
 =head1 Deprecations
 
 XXX Any deprecated features, syntax, modules etc. should be listed here.
diff --git a/pod/perlrecharclass.pod b/pod/perlrecharclass.pod
index 84a9226451..20a439ac36 100644
--- a/pod/perlrecharclass.pod
+++ b/pod/perlrecharclass.pod
@@ -1025,14 +1025,11 @@ There is one unary operator:
 
  !    complement
 
-All the binary operators left associate, and are of equal precedence.
-The unary operator right associates, and has higher precedence.  Use
-parentheses to override the default associations.  Some feedback we've
-received indicates a desire for intersection to have higher precedence
-than union.  This is something that feedback from the field may cause us
-to change in future releases; you may want to parenthesize copiously to
-avoid such changes affecting your code, until this feature is no longer
-considered experimental.
+All the binary operators left associate; C<"&"> is higher precedence
+than the others, which all have equal precedence.  The unary operator
+right associates, and has highest precedence.  Thus this follows the
+normal Perl precedence rules for logical operators.  Use parentheses to
+override the default precedence and associativity.
 
 The main restriction is that everything is a metacharacter.  Thus,
 you cannot refer to single characters by doing something like this:
diff --git a/proto.h b/proto.h
index 4bc200dae6..f45a4a3617 100644
--- a/proto.h
+++ b/proto.h
@@ -7100,6 +7100,9 @@ STATIC regnode*	S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 dept
 #define PERL_ARGS_ASSERT_REGCLASS	\
 	assert(pRExC_state); assert(flagp)
 
+STATIC unsigned int	S_regex_set_precedence(const U8 my_operator)
+			__attribute__pure__;
+
 STATIC void	S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
 			__attribute__nonnull__(pTHX_1)
 			__attribute__nonnull__(pTHX_3);
diff --git a/regcomp.c b/regcomp.c
index d736a0131a..51065d58f2 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -13266,6 +13266,34 @@ S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state)
             && first_char == *(p - 1));
 }
 
+STATIC unsigned  int
+S_regex_set_precedence(const U8 my_operator) {
+
+    /* Returns the precedence in the (?[...]) construct of the input operator,
+     * specified by its character representation.  The precedence follows
+     * general Perl rules, but it extends this so that ')' and ']' have (low)
+     * precedence even though they aren't really operators */
+
+    switch (my_operator) {
+        case '!':
+            return 5;
+        case '&':
+            return 4;
+        case '^':
+        case '|':
+        case '+':
+        case '-':
+            return 3;
+        case ')':
+            return 2;
+        case ']':
+            return 1;
+    }
+
+    NOT_REACHED; /* NOTREACHED */
+    return 0;   /* Silence compiler warning */
+}
+
 STATIC regnode *
 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
                     I32 *flagp, U32 depth,
@@ -13273,24 +13301,35 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
 {
     /* Handle the (?[...]) construct to do set operations */
 
-    U8 curchar;
-    UV start, end;	/* End points of code point ranges */
-    SV* result_string;
-    char *save_end, *save_parse;
-    SV* final;
-    STRLEN len;
-    regnode* node;
-    AV* stack;
-    const bool save_fold = FOLD;
+    U8 curchar;                     /* Current character being parsed */
+    UV start, end;	            /* End points of code point ranges */
+    SV* final = NULL;               /* The end result inversion list */
+    SV* result_string;              /* 'final' stringified */
+    AV* stack;                      /* stack of operators and operands not yet
+                                       resolved */
+    AV* fence_stack = NULL;         /* A stack containing the positions in
+                                       'stack' of where the undealt-with left
+                                       parens would be if they were actually
+                                       put there */
+    IV fence = 0;                   /* Position of where most recent undealt-
+                                       with left paren in stack is; -1 if none.
+                                     */
+    STRLEN len;                     /* Temporary */
+    regnode* node;                  /* Temporary, and final regnode returned by
+                                       this function */
+    const bool save_fold = FOLD;    /* Temporary */
+    char *save_end, *save_parse;    /* Temporaries */
 
     GET_RE_DEBUG_FLAGS_DECL;
 
     PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
 
-    if (LOC) {
+    if (LOC) {  /* XXX could make valid in UTF-8 locales */
         vFAIL("(?[...]) not valid in locale");
     }
-    RExC_uni_semantics = 1;
+    RExC_uni_semantics = 1;     /* The use of this operator implies /u.  This
+                                   is required so that the compile time values
+                                   are valid in all runtime cases */
 
     /* This will return only an ANYOF regnode, or (unlikely) something smaller
      * (such as EXACT).  Thus we can skip most everything if just sizing.  We
@@ -13299,16 +13338,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
      * upon an unescaped ']' that isn't one ending a regclass.  To do both
      * these things, we need to realize that something preceded by a backslash
      * is escaped, so we have to keep track of backslashes */
-    if (PASS2) {
-        Perl_ck_warner_d(aTHX_
-            packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
-            "The regex_sets feature is experimental" REPORT_LOCATION,
-                UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
-                UTF8fARG(UTF,
-                         RExC_end - RExC_start - (RExC_parse - RExC_precomp),
-                         RExC_precomp + (RExC_parse - RExC_precomp)));
-    }
-    else {
+    if (SIZE_ONLY) {
         UV depth = 0; /* how many nested (?[...]) constructs */
 
         while (RExC_parse < RExC_end) {
@@ -13354,8 +13384,8 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
                                   TRUE, /* strict */
                                   &current
                                  ))
-                        FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
-                              (UV) *flagp);
+                        FAIL2("panic: regclass returned NULL to handle_sets, "
+                              "flags=%#"UVxf"", (UV) *flagp);
 
                     /* function call leaves parse pointing to the ']', except
                      * if we faked it */
@@ -13389,69 +13419,124 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
         FAIL("Syntax error in (?[...])");
     }
 
-    /* Pass 2 only after this.  Everything in this construct is a
-     * metacharacter.  Operands begin with either a '\' (for an escape
-     * sequence), or a '[' for a bracketed character class.  Any other
-     * character should be an operator, or parenthesis for grouping.  Both
-     * types of operands are handled by calling regclass() to parse them.  It
-     * is called with a parameter to indicate to return the computed inversion
-     * list.  The parsing here is implemented via a stack.  Each entry on the
-     * stack is a single character representing one of the operators, or the
-     * '('; or else a pointer to an operand inversion list. */
+    /* Pass 2 only after this. */
+    Perl_ck_warner_d(aTHX_
+        packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
+        "The regex_sets feature is experimental" REPORT_LOCATION,
+            UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
+            UTF8fARG(UTF,
+                     RExC_end - RExC_start - (RExC_parse - RExC_precomp),
+                     RExC_precomp + (RExC_parse - RExC_precomp)));
+
+    /* Everything in this construct is a metacharacter.  Operands begin with
+     * either a '\' (for an escape sequence), or a '[' for a bracketed
+     * character class.  Any other character should be an operator, or
+     * parenthesis for grouping.  Both types of operands are handled by calling
+     * regclass() to parse them.  It is called with a parameter to indicate to
+     * return the computed inversion list.  The parsing here is implemented via
+     * a stack.  Each entry on the stack is a single character representing one
+     * of the operators; or else a pointer to an operand inversion list. */
 
 #define IS_OPERAND(a)  (! SvIOK(a))
 
-    /* The stack starts empty.  It is a syntax error if the first thing parsed
-     * is a binary operator; everything else is pushed on the stack.  When an
-     * operand is parsed, the top of the stack is examined.  If it is a binary
-     * operator, the item before it should be an operand, and both are replaced
-     * by the result of doing that operation on the new operand and the one on
-     * the stack.   Thus a sequence of binary operands is reduced to a single
-     * one before the next one is parsed.
+    /* The stack is kept in Łukasiewicz order.  (That's pronounced similar
+     * to luke-a-shave-itch (or -itz), but people who didn't want to bother
+     * with prounouncing it called it Reverse Polish instead, but now that YOU
+     * know how to prounounce it you can use the correct term, thus giving due
+     * credit to the person who invented it, and impressing your geek friends.
+     * Wikipedia says that the pronounciation of "Ł" has been changing so that
+     * it is now more like an English initial W (as in wonk) than an L.)
+     *
+     * This means that, for example, 'a | b & c' is stored on the stack as
+     *
+     * c  [4]
+     * b  [3]
+     * &  [2]
+     * a  [1]
+     * |  [0]
+     *
+     * where the numbers in brackets give the stack [array] element number.
+     * In this implementation, parentheses are not stored on the stack.
+     * Instead a '(' creates a "fence" so that the part of the stack below the
+     * fence is invisible except to the corresponding ')' (this allows us to
+     * replace testing for parens, by using instead subtraction of the fence
+     * position).  As new operands are processed they are pushed onto the stack
+     * (except as noted in the next paragraph).  New operators of higher
+     * precedence than the current final one are inserted on the stack before
+     * the lhs operand (so that when the rhs is pushed next, everything will be
+     * in the correct positions shown above.  When an operator of equal or
+     * lower precedence is encountered in parsing, all the stacked operations
+     * of equal or higher precedence are evaluated, leaving the result as the
+     * top entry on the stack.  This makes higher precedence operations
+     * evaluate before lower precedence ones, and causes operations of equal
+     * precedence to left associate.
      *
-     * A unary operator may immediately follow a binary in the input, for
-     * example
+     * The only unary operator '!' is immediately pushed onto the stack when
+     * encountered.  When an operand is encountered, if the top of the stack is
+     * a '!", the complement is immediately performed, and the '!' popped.  The
+     * resulting value is treated as a new operand, and the logic in the
+     * previous paragraph is executed.  Thus in the expression
      *      [a] + ! [b]
-     * When an operand is parsed and the top of the stack is a unary operator,
-     * the operation is performed, and then the stack is rechecked to see if
-     * this new operand is part of a binary operation; if so, it is handled as
-     * above.
+     * the stack looks like
+     *
+     * !
+     * a
+     * +
+     *
+     * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
+     * becomes
+     *
+     * !b
+     * a
+     * +
+     *
+     * A ')' is treated as an operator with lower precedence than all the
+     * aforementioned ones, which causes all operations on the stack above the
+     * corresponding '(' to be evaluated down to a single resultant operand.
+     * Then the fence for the '(' is removed, and the operand goes through the
+     * algorithm above, without the fence.
      *
-     * A '(' is simply pushed on the stack; it is valid only if the stack is
-     * empty, or the top element of the stack is an operator or another '('
-     * (for which the parenthesized expression will become an operand).  By the
-     * time the corresponding ')' is parsed everything in between should have
-     * been parsed and evaluated to a single operand (or else is a syntax
-     * error), and is handled as a regular operand */
+     * A separate stack is kept of the fence positions, so that the position of
+     * the latest so-far unbalanced '(' is at the top of it.
+     *
+     * The ']' ending the construct is treated as the lowest operator of all,
+     * so that everything gets evaluated down to a single operand, which is the
+     * result */
 
     sv_2mortal((SV *)(stack = newAV()));
+    sv_2mortal((SV *)(fence_stack = newAV()));
 
     while (RExC_parse < RExC_end) {
-        I32 top_index = av_tindex(stack);
-        SV** top_ptr;
-        SV* current = NULL;
+        I32 top_index;              /* Index of top-most element in 'stack' */
+        SV** top_ptr;               /* Pointer to top 'stack' element */
+        SV* current = NULL;         /* To contain the current inversion list
+                                       operand */
+        SV* only_to_avoid_leaks;
 
         /* Skip white space */
         RExC_parse = regpatws(pRExC_state, RExC_parse,
-                                         TRUE /* means recognize comments */ );
+                TRUE /* means recognize comments */ );
         if (RExC_parse >= RExC_end) {
             Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
         }
-        if ((curchar = UCHARAT(RExC_parse)) == ']') {
-            break;
-        }
+
+        curchar = UCHARAT(RExC_parse);
+
+redo_curchar:
+
+        top_index = av_tindex(stack);
 
         switch (curchar) {
+            SV** stacked_ptr;       /* Ptr to something already on 'stack' */
+            char stacked_operator;  /* The topmost operator on the 'stack'. */
+            SV* lhs;                /* Operand to the left of the operator */
+            SV* rhs;                /* Operand to the right of the operator */
+            SV* fence_ptr;          /* Pointer to top element of the fence
+                                       stack */
+
+            case '(':
 
-            case '?':
-                if (av_tindex(stack) >= 0   /* This makes sure that we can
-                                               safely subtract 1 from
-                                               RExC_parse in the next clause.
-                                               If we have something on the
-                                               stack, we have parsed something
-                                             */
-                    && UCHARAT(RExC_parse - 1) == '('
-                    && RExC_parse < RExC_end)
+                if (RExC_parse < RExC_end && (UCHARAT(RExC_parse + 1) == '?'))
                 {
                     /* If is a '(?', could be an embedded '(?flags:(?[...])'.
                      * This happens when we have some thing like
@@ -13466,14 +13551,18 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
                      * interpolated expression evaluates to.  We use the flags
                      * from the interpolated pattern. */
                     U32 save_flags = RExC_flags;
-                    const char * const save_parse = ++RExC_parse;
+                    const char * save_parse;
+
+                    RExC_parse += 2;        /* Skip past the '(?' */
+                    save_parse = RExC_parse;
 
+                    /* Parse any flags for the '(?' */
                     parse_lparen_question_flags(pRExC_state);
 
                     if (RExC_parse == save_parse  /* Makes sure there was at
-                                                     least one flag (or this
-                                                     embedding wasn't compiled)
-                                                   */
+                                                     least one flag (or else
+                                                     this embedding wasn't
+                                                     compiled) */
                         || RExC_parse >= RExC_end - 4
                         || UCHARAT(RExC_parse) != ':'
                         || UCHARAT(++RExC_parse) != '('
@@ -13493,25 +13582,50 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
                         }
                         vFAIL("Expecting '(?flags:(?[...'");
                     }
+
+                    /* Recurse, with the meat of the embedded expression */
                     RExC_parse++;
                     (void) handle_regex_sets(pRExC_state, &current, flagp,
                                                     depth+1, oregcomp_parse);
 
                     /* Here, 'current' contains the embedded expression's
                      * inversion list, and RExC_parse points to the trailing
-                     * ']'; the next character should be the ')' which will be
-                     * paired with the '(' that has been put on the stack, so
-                     * the whole embedded expression reduces to '(operand)' */
+                     * ']'; the next character should be the ')' */
+                    RExC_parse++;
+                    assert(RExC_parse < RExC_end && UCHARAT(RExC_parse) == ')');
+
+                    /* Then the ')' matching the original '(' handled by this
+                     * case: statement */
                     RExC_parse++;
+                    assert(RExC_parse < RExC_end && UCHARAT(RExC_parse) == ')');
 
+                    RExC_parse++;
                     RExC_flags = save_flags;
                     goto handle_operand;
                 }
-                /* FALLTHROUGH */
 
-            default:
-                RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
-                vFAIL("Unexpected character");
+                /* A regular '('.  Look behind for illegal syntax */
+                if (top_index - fence >= 0) {
+                    /* If the top entry on the stack is an operator, it had
+                     * better be a '!', otherwise the entry below the top
+                     * operand should be an operator */
+                    if ( ! (top_ptr = av_fetch(stack, top_index, FALSE))
+                        || (! IS_OPERAND(*top_ptr) && SvUV(*top_ptr) != '!')
+                        || top_index - fence < 1
+                        || ! (stacked_ptr = av_fetch(stack,
+                                                     top_index - 1,
+                                                     FALSE))
+                        || IS_OPERAND(*stacked_ptr))
+                    {
+                        RExC_parse++;
+                        vFAIL("Unexpected '(' with no preceding operator");
+                    }
+                }
+
+                /* Stack the position of this undealt-with left paren */
+                fence = top_index + 1;
+                av_push(fence_stack, newSViv(fence));
+                break;
 
             case '\\':
                 /* regclass() can only return RESTART_UTF8 if multi-char
@@ -13521,10 +13635,12 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
                               FALSE, /* don't allow multi-char folds */
                               FALSE, /* don't silence non-portable warnings.  */
                               TRUE,  /* strict */
-                              &current
-                             ))
-                    FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
-                          (UV) *flagp);
+                              &current))
+                {
+                    FAIL2("panic: regclass returned NULL to handle_sets, "
+                          "flags=%#"UVxf"", (UV) *flagp);
+                }
+
                 /* regclass() will return with parsing just the \ sequence,
                  * leaving the parse pointer at the next thing to parse */
                 RExC_parse--;
@@ -13548,8 +13664,11 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
                              TRUE,   /* strict */
                              &current
                             ))
-                    FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
-                          (UV) *flagp);
+                {
+                    FAIL2("panic: regclass returned NULL to handle_sets, "
+                          "flags=%#"UVxf"", (UV) *flagp);
+                }
+
                 /* function call leaves parse pointing to the ']', except if we
                  * faked it */
                 if (is_posix_class) {
@@ -13559,147 +13678,237 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
                 goto handle_operand;
             }
 
+            case ']':
+                if (top_index >= 1) {
+                    goto join_operators;
+                }
+
+                /* Only a single operand on the stack: are done */
+                goto done;
+
+            case ')':
+                if (av_tindex(fence_stack) < 0) {
+                    RExC_parse++;
+                    vFAIL("Unexpected ')'");
+                }
+
+                 /* If at least two thing on the stack, treat this as an
+                  * operator */
+                if (top_index - fence >= 1) {
+                    goto join_operators;
+                }
+
+                /* Here only a single thing on the fenced stack, and there is a
+                 * fence.  Get rid of it */
+                fence_ptr = av_pop(fence_stack);
+                assert(fence_ptr);
+                fence = SvIV(fence_ptr) - 1;
+                SvREFCNT_dec_NN(fence_ptr);
+                fence_ptr = NULL;
+
+                if (fence < 0) {
+                    fence = 0;
+                }
+
+                /* Having gotten rid of the fence, we pop the operand at the
+                 * stack top and process it as a newly encountered operand */
+                current = av_pop(stack);
+                assert(IS_OPERAND(current));
+                goto handle_operand;
+
             case '&':
             case '|':
             case '+':
             case '-':
             case '^':
-                if (top_index < 0
+
+                /* These binary operators should have a left operand already
+                 * parsed */
+                if (   top_index - fence < 0
+                    || top_index - fence == 1
                     || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
                     || ! IS_OPERAND(*top_ptr))
                 {
-                    RExC_parse++;
-                    vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
+                    goto unexpected_binary;
                 }
-                av_push(stack, newSVuv(curchar));
-                break;
 
-            case '!':
-                av_push(stack, newSVuv(curchar));
-                break;
+                /* If only the one operand is on the part of the stack visible
+                 * to us, we just place this operator in the proper position */
+                if (top_index - fence < 2) {
 
-            case '(':
-                if (top_index >= 0) {
-                    top_ptr = av_fetch(stack, top_index, FALSE);
-                    assert(top_ptr);
-                    if (IS_OPERAND(*top_ptr)) {
-                        RExC_parse++;
-                        vFAIL("Unexpected '(' with no preceding operator");
-                    }
+                    /* Place the operator before the operand */
+
+                    SV* lhs = av_pop(stack);
+                    av_push(stack, newSVuv(curchar));
+                    av_push(stack, lhs);
+                    break;
                 }
-                av_push(stack, newSVuv(curchar));
-                break;
 
-            case ')':
-            {
-                SV* lparen;
-                if (top_index < 1
-                    || ! (current = av_pop(stack))
-                    || ! IS_OPERAND(current)
-                    || ! (lparen = av_pop(stack))
-                    || IS_OPERAND(lparen)
-                    || SvUV(lparen) != '(')
+                /* But if there is something else on the stack, we need to
+                 * process it before this new operator if and only if the
+                 * stacked operation has equal or higher precedence than the
+                 * new one */
+
+             join_operators:
+
+                /* The operator on the stack is supposed to be below both its
+                 * operands */
+                if (   ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
+                    || IS_OPERAND(*stacked_ptr))
                 {
-                    SvREFCNT_dec(current);
+                    /* But if not, it's legal and indicates we are completely
+                     * done if and only if we're currently processing a ']',
+                     * which should be the final thing in the expression */
+                    if (curchar == ']') {
+                        goto done;
+                    }
+
+                  unexpected_binary:
                     RExC_parse++;
-                    vFAIL("Unexpected ')'");
+                    vFAIL2("Unexpected binary operator '%c' with no "
+                           "preceding operand", curchar);
                 }
-                top_index -= 2;
-                SvREFCNT_dec_NN(lparen);
+                stacked_operator = (char) SvUV(*stacked_ptr);
 
-                /* FALLTHROUGH */
-            }
+                if (regex_set_precedence(curchar)
+                    > regex_set_precedence(stacked_operator))
+                {
+                    /* Here, the new operator has higher precedence than the
+                     * stacked one.  This means we need to add the new one to
+                     * the stack to await its rhs operand (and maybe more
+                     * stuff).  We put it before the lhs operand, leaving
+                     * untouched the stacked operator and everything below it
+                     * */
+                    lhs = av_pop(stack);
+                    assert(IS_OPERAND(lhs));
+
+                    av_push(stack, newSVuv(curchar));
+                    av_push(stack, lhs);
+                    break;
+                }
 
-              handle_operand:
+                /* Here, the new operator has equal or lower precedence than
+                 * what's already there.  This means the operation already
+                 * there should be performed now, before the new one. */
+                rhs = av_pop(stack);
+                lhs = av_pop(stack);
 
-                /* Here, we have an operand to process, in 'current' */
+                assert(IS_OPERAND(rhs));
+                assert(IS_OPERAND(lhs));
 
-                if (top_index < 0) {    /* Just push if stack is empty */
-                    av_push(stack, current);
-                }
-                else {
-                    SV* top = av_pop(stack);
-                    SV *prev = NULL;
-                    char current_operator;
-
-                    if (IS_OPERAND(top)) {
-                        SvREFCNT_dec_NN(top);
-                        SvREFCNT_dec_NN(current);
-                        vFAIL("Operand with no preceding operator");
+                switch (stacked_operator) {
+                    case '&':
+                        _invlist_intersection(lhs, rhs, &rhs);
+                        break;
+
+                    case '|':
+                    case '+':
+                        _invlist_union(lhs, rhs, &rhs);
+                        break;
+
+                    case '-':
+                        _invlist_subtract(lhs, rhs, &rhs);
+                        break;
+
+                    case '^':   /* The union minus the intersection */
+                    {
+                        SV* i = NULL;
+                        SV* u = NULL;
+                        SV* element;
+
+                        _invlist_union(lhs, rhs, &u);
+                        _invlist_intersection(lhs, rhs, &i);
+                        /* _invlist_subtract will overwrite rhs
+                            without freeing what it already contains */
+                        element = rhs;
+                        _invlist_subtract(u, i, &rhs);
+                        SvREFCNT_dec_NN(i);
+                        SvREFCNT_dec_NN(u);
+                        SvREFCNT_dec_NN(element);
+                        break;
                     }
-                    current_operator = (char) SvUV(top);
-                    switch (current_operator) {
-                        case '(':   /* Push the '(' back on followed by the new
-                                       operand */
-                            av_push(stack, top);
-                            av_push(stack, current);
-                            SvREFCNT_inc(top);  /* Counters the '_dec' done
-                                                   just after the 'break', so
-                                                   it doesn't get wrongly freed
-                                                 */
-                            break;
+                }
+                SvREFCNT_dec(lhs);
+
+                /* Here, the higher precedence operation has been done, and the
+                 * result is in 'rhs'.  We overwrite the stacked operator with
+                 * the result.  Then we redo this code to either push the new
+                 * operator onto the stack or perform any higher precedence
+                 * stacked operation */
+                only_to_avoid_leaks = av_pop(stack);
+                SvREFCNT_dec(only_to_avoid_leaks);
+                av_push(stack, rhs);
+                goto redo_curchar;
+
+            case '!':   /* Highest priority, right associative, so just push
+                           onto stack */
+                av_push(stack, newSVuv(curchar));
+                break;
 
-                        case '!':
-                            _invlist_invert(current);
-
-                            /* Unlike binary operators, the top of the stack,
-                             * now that this unary one has been popped off, may
-                             * legally be an operator, and we now have operand
-                             * for it. */
-                            top_index--;
-                            SvREFCNT_dec_NN(top);
-                            goto handle_operand;
-
-                        case '&':
-                            prev = av_pop(stack);
-                            _invlist_intersection(prev,
-                                                   current,
-                                                   &current);
-                            av_push(stack, current);
-                            break;
+            default:
+                RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+                vFAIL("Unexpected character");
 
-                        case '|':
-                        case '+':
-                            prev = av_pop(stack);
-                            _invlist_union(prev, current, &current);
-                            av_push(stack, current);
-                            break;
+          handle_operand:
+
+            /* Here 'current' is the operand.  If something is already on the
+             * stack, we have to check if it is a !. */
+            top_index = av_tindex(stack);   /* Code above may have altered the
+                                             * stack in the time since we
+                                             * earlier set 'top_index'. */
+            if (top_index - fence >= 0) {
+                /* If the top entry on the stack is an operator, it had better
+                 * be a '!', otherwise the entry below the top operand should
+                 * be an operator */
+                top_ptr = av_fetch(stack, top_index, FALSE);
+                assert(top_ptr);
+                if (! IS_OPERAND(*top_ptr)) {
+
+                    /* The only permissible operator at the top of the stack is
+                     * '!', which is applied immediately to this operand. */
+                    curchar = (char) SvUV(*top_ptr);
+                    if (curchar != '!') {
+                        SvREFCNT_dec(current);
+                        vFAIL2("Unexpected binary operator '%c' with no "
+                                "preceding operand", curchar);
+                    }
 
-                        case '-':
-                            prev = av_pop(stack);;
-                            _invlist_subtract(prev, current, &current);
-                            av_push(stack, current);
-                            break;
+                    _invlist_invert(current);
 
-                        case '^':   /* The union minus the intersection */
-                        {
-                            SV* i = NULL;
-                            SV* u = NULL;
-                            SV* element;
-
-                            prev = av_pop(stack);
-                            _invlist_union(prev, current, &u);
-                            _invlist_intersection(prev, current, &i);
-                            /* _invlist_subtract will overwrite current
-                                without freeing what it already contains */
-                            element = current;
-                            _invlist_subtract(u, i, &current);
-                            av_push(stack, current);
-                            SvREFCNT_dec_NN(i);
-                            SvREFCNT_dec_NN(u);
-                            SvREFCNT_dec_NN(element);
-                            break;
-                        }
+                    only_to_avoid_leaks = av_pop(stack);
+                    SvREFCNT_dec(only_to_avoid_leaks);
+                    top_index = av_tindex(stack);
 
-                        default:
-                            Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
+                    /* And we redo with the inverted operand.  This allows
+                     * handling multiple ! in a row */
+                    goto handle_operand;
+                }
+                          /* Single operand is ok only for the non-binary ')'
+                           * operator */
+                else if ((top_index - fence == 0 && curchar != ')')
+                         || (top_index - fence > 0
+                             && (! (stacked_ptr = av_fetch(stack,
+                                                           top_index - 1,
+                                                           FALSE))
+                                 || IS_OPERAND(*stacked_ptr))))
+                {
+                    SvREFCNT_dec(current);
+                    vFAIL("Operand with no preceding operator");
                 }
-                SvREFCNT_dec_NN(top);
-                SvREFCNT_dec(prev);
             }
-        }
+
+            /* Here there was nothing on the stack or the top element was
+             * another operand.  Just add this new one */
+            av_push(stack, current);
+
+        } /* End of switch on next parse token */
 
         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+    } /* End of loop parsing through the construct */
+
+  done:
+    if (av_tindex(fence_stack) >= 0) {
+        vFAIL("Unmatched (");
     }
 
     if (av_tindex(stack) < 0   /* Was empty */
@@ -13707,6 +13916,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
         || ! IS_OPERAND(final)
         || av_tindex(stack) >= 0)  /* More left on stack */
     {
+        SvREFCNT_dec(final);
         vFAIL("Incomplete expression within '(?[ ])'");
     }
 
@@ -13731,6 +13941,8 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
         }
     }
 
+    /* About to generate an ANYOF (or similar) node from the inversion list we
+     * have calculated */
     save_parse = RExC_parse;
     RExC_parse = SvPV(result_string, len);
     save_end = RExC_end;
diff --git a/t/re/regex_sets.t b/t/re/regex_sets.t
index c880897f37..48a4f00b8e 100644
--- a/t/re/regex_sets.t
+++ b/t/re/regex_sets.t
@@ -68,13 +68,19 @@ like("\N{LAO DIGIT NINE}", $thai_or_lao_digit, 'embedded qr/(?[ ])/ works');
 unlike(chr(ord("\N{LAO DIGIT NINE}") + 1), $thai_or_lao_digit, 'embedded qr/(?[ ])/ works');
 
 my $ascii_word = qr/(?[ \w ])/a;
-my $ascii_digits_plus_all_of_arabic = qr/(?[ \p{Digit} & $ascii_word + \p{Arabic} ])/;
+my $ascii_digits_plus_all_of_arabic = qr/(?[ \p{Arabic} + \p{Digit} & $ascii_word ])/;
 like("9", $ascii_digits_plus_all_of_arabic, "/a, then interpolating and intersection works for ASCII in the set");
 unlike("A", $ascii_digits_plus_all_of_arabic, "/a, then interpolating and intersection works for ASCII not in the set");
 unlike("\N{BENGALI DIGIT ZERO}", $ascii_digits_plus_all_of_arabic, "/a, then interpolating and intersection works for non-ASCII not in either set");
 unlike("\N{BENGALI LETTER A}", $ascii_digits_plus_all_of_arabic, "/a, then interpolating and intersection works for non-ASCII in one set");
-like("\N{ARABIC LETTER HAMZA}", $ascii_digits_plus_all_of_arabic, "interpolation and intersection is left-associative");
-like("\N{EXTENDED ARABIC-INDIC DIGIT ZERO}", $ascii_digits_plus_all_of_arabic, "interpolation and intersection is left-associative");
+like("\N{ARABIC LETTER HAMZA}", $ascii_digits_plus_all_of_arabic, "intersection has higher precedence than union");
+like("\N{EXTENDED ARABIC-INDIC DIGIT ZERO}", $ascii_digits_plus_all_of_arabic, "intersection has higher precedence than union");
+
+like("\r", qr/(?[ \p{lb=cr} ])/, '\r matches \p{lb=cr}');
+unlike("\r", qr/(?[ ! \p{lb=cr} ])/, '\r doesnt match ! \p{lb=cr}');
+like("\r", qr/(?[ ! ! \p{lb=cr} ])/, 'Two ! ! are the original');
+unlike("\r", qr/(?[ ! ! ! \p{lb=cr} ])/, 'Three ! ! ! are the complement');
+# left associatve
 
 my $kelvin = qr/(?[ \N{KELVIN SIGN} ])/;
 my $fold = qr/(?[ $kelvin ])/i;
author	Karl Williamson <khw@cpan.org>	2015-02-25 23:19:39 -0700
committer	Karl Williamson <khw@cpan.org>	2015-03-19 21:10:03 -0600
commit	6798c95dd27b33efd71f394c18649af7bbaf42b7 (patch)
tree	b8ac326854bc4865f83453c91b768bc30f110391
parent	9269c59f17a7bb8e6d97c55ab987b934cfb48034 (diff)
download	perl-6798c95dd27b33efd71f394c18649af7bbaf42b7.tar.gz