diff options
author | Karl Williamson <khw@cpan.org> | 2015-02-25 23:19:39 -0700 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2015-03-19 21:10:03 -0600 |
commit | 6798c95dd27b33efd71f394c18649af7bbaf42b7 (patch) | |
tree | b8ac326854bc4865f83453c91b768bc30f110391 | |
parent | 9269c59f17a7bb8e6d97c55ab987b934cfb48034 (diff) | |
download | perl-6798c95dd27b33efd71f394c18649af7bbaf42b7.tar.gz |
Change /(?[...]) to have normal operator precedence
This experimental feature now has the intersection operator ("&") higher
precedence than the other binary operators.
-rw-r--r-- | embed.fnc | 1 | ||||
-rw-r--r-- | embed.h | 1 | ||||
-rw-r--r-- | pod/perldelta.pod | 10 | ||||
-rw-r--r-- | pod/perlrecharclass.pod | 13 | ||||
-rw-r--r-- | proto.h | 3 | ||||
-rw-r--r-- | regcomp.c | 602 | ||||
-rw-r--r-- | t/re/regex_sets.t | 12 |
7 files changed, 436 insertions, 206 deletions
@@ -2136,6 +2136,7 @@ Es |regnode*|regclass |NN RExC_state_t *pRExC_state \ Es |void|add_above_Latin1_folds|NN RExC_state_t *pRExC_state|const U8 cp \ |NN SV** invlist Esn |bool|could_it_be_a_POSIX_class|NN RExC_state_t *pRExC_state +EsnP |unsigned int|regex_set_precedence|const U8 my_operator Es |regnode*|handle_regex_sets|NN RExC_state_t *pRExC_state \ |NULLOK SV ** return_invlist \ |NN I32 *flagp|U32 depth \ @@ -992,6 +992,7 @@ #define regatom(a,b,c) S_regatom(aTHX_ a,b,c) #define regbranch(a,b,c,d) S_regbranch(aTHX_ a,b,c,d) #define regclass(a,b,c,d,e,f,g,h) S_regclass(aTHX_ a,b,c,d,e,f,g,h) +#define regex_set_precedence S_regex_set_precedence #define reginsert(a,b,c,d) S_reginsert(aTHX_ a,b,c,d) #define regnode_guts(a,b,c,d) S_regnode_guts(aTHX_ a,b,c,d) #define regpatws S_regpatws diff --git a/pod/perldelta.pod b/pod/perldelta.pod index 87ac56c019..6af8da5b78 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -45,6 +45,16 @@ XXX For a release on a stable branch, this section aspires to be: [ List each incompatible change as a =head2 entry ] +=head2 C<(?[...])> operators now follow standard Perl precedence + +This experimental feature allows set operations in regular expression +patterns. Prior to this, the intersection operator had the same +precedence as the other binary operators. Now it has higher precedence. +This could lead to different outcomes than existing code expects (though +the documentation has always noted that this change might happen, +recommending fully parenthesizing the expressions). See +L<perlrecharclass/Extended Bracketed Character Classes>. + =head1 Deprecations XXX Any deprecated features, syntax, modules etc. should be listed here. diff --git a/pod/perlrecharclass.pod b/pod/perlrecharclass.pod index 84a9226451..20a439ac36 100644 --- a/pod/perlrecharclass.pod +++ b/pod/perlrecharclass.pod @@ -1025,14 +1025,11 @@ There is one unary operator: ! complement -All the binary operators left associate, and are of equal precedence. -The unary operator right associates, and has higher precedence. Use -parentheses to override the default associations. Some feedback we've -received indicates a desire for intersection to have higher precedence -than union. This is something that feedback from the field may cause us -to change in future releases; you may want to parenthesize copiously to -avoid such changes affecting your code, until this feature is no longer -considered experimental. +All the binary operators left associate; C<"&"> is higher precedence +than the others, which all have equal precedence. The unary operator +right associates, and has highest precedence. Thus this follows the +normal Perl precedence rules for logical operators. Use parentheses to +override the default precedence and associativity. The main restriction is that everything is a metacharacter. Thus, you cannot refer to single characters by doing something like this: @@ -7100,6 +7100,9 @@ STATIC regnode* S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 dept #define PERL_ARGS_ASSERT_REGCLASS \ assert(pRExC_state); assert(flagp) +STATIC unsigned int S_regex_set_precedence(const U8 my_operator) + __attribute__pure__; + STATIC void S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) __attribute__nonnull__(pTHX_1) __attribute__nonnull__(pTHX_3); @@ -13266,6 +13266,34 @@ S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state) && first_char == *(p - 1)); } +STATIC unsigned int +S_regex_set_precedence(const U8 my_operator) { + + /* Returns the precedence in the (?[...]) construct of the input operator, + * specified by its character representation. The precedence follows + * general Perl rules, but it extends this so that ')' and ']' have (low) + * precedence even though they aren't really operators */ + + switch (my_operator) { + case '!': + return 5; + case '&': + return 4; + case '^': + case '|': + case '+': + case '-': + return 3; + case ')': + return 2; + case ']': + return 1; + } + + NOT_REACHED; /* NOTREACHED */ + return 0; /* Silence compiler warning */ +} + STATIC regnode * S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *flagp, U32 depth, @@ -13273,24 +13301,35 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, { /* Handle the (?[...]) construct to do set operations */ - U8 curchar; - UV start, end; /* End points of code point ranges */ - SV* result_string; - char *save_end, *save_parse; - SV* final; - STRLEN len; - regnode* node; - AV* stack; - const bool save_fold = FOLD; + U8 curchar; /* Current character being parsed */ + UV start, end; /* End points of code point ranges */ + SV* final = NULL; /* The end result inversion list */ + SV* result_string; /* 'final' stringified */ + AV* stack; /* stack of operators and operands not yet + resolved */ + AV* fence_stack = NULL; /* A stack containing the positions in + 'stack' of where the undealt-with left + parens would be if they were actually + put there */ + IV fence = 0; /* Position of where most recent undealt- + with left paren in stack is; -1 if none. + */ + STRLEN len; /* Temporary */ + regnode* node; /* Temporary, and final regnode returned by + this function */ + const bool save_fold = FOLD; /* Temporary */ + char *save_end, *save_parse; /* Temporaries */ GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_HANDLE_REGEX_SETS; - if (LOC) { + if (LOC) { /* XXX could make valid in UTF-8 locales */ vFAIL("(?[...]) not valid in locale"); } - RExC_uni_semantics = 1; + RExC_uni_semantics = 1; /* The use of this operator implies /u. This + is required so that the compile time values + are valid in all runtime cases */ /* This will return only an ANYOF regnode, or (unlikely) something smaller * (such as EXACT). Thus we can skip most everything if just sizing. We @@ -13299,16 +13338,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, * upon an unescaped ']' that isn't one ending a regclass. To do both * these things, we need to realize that something preceded by a backslash * is escaped, so we have to keep track of backslashes */ - if (PASS2) { - Perl_ck_warner_d(aTHX_ - packWARN(WARN_EXPERIMENTAL__REGEX_SETS), - "The regex_sets feature is experimental" REPORT_LOCATION, - UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp), - UTF8fARG(UTF, - RExC_end - RExC_start - (RExC_parse - RExC_precomp), - RExC_precomp + (RExC_parse - RExC_precomp))); - } - else { + if (SIZE_ONLY) { UV depth = 0; /* how many nested (?[...]) constructs */ while (RExC_parse < RExC_end) { @@ -13354,8 +13384,8 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, TRUE, /* strict */ ¤t )) - FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"", - (UV) *flagp); + FAIL2("panic: regclass returned NULL to handle_sets, " + "flags=%#"UVxf"", (UV) *flagp); /* function call leaves parse pointing to the ']', except * if we faked it */ @@ -13389,69 +13419,124 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, FAIL("Syntax error in (?[...])"); } - /* Pass 2 only after this. Everything in this construct is a - * metacharacter. Operands begin with either a '\' (for an escape - * sequence), or a '[' for a bracketed character class. Any other - * character should be an operator, or parenthesis for grouping. Both - * types of operands are handled by calling regclass() to parse them. It - * is called with a parameter to indicate to return the computed inversion - * list. The parsing here is implemented via a stack. Each entry on the - * stack is a single character representing one of the operators, or the - * '('; or else a pointer to an operand inversion list. */ + /* Pass 2 only after this. */ + Perl_ck_warner_d(aTHX_ + packWARN(WARN_EXPERIMENTAL__REGEX_SETS), + "The regex_sets feature is experimental" REPORT_LOCATION, + UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp), + UTF8fARG(UTF, + RExC_end - RExC_start - (RExC_parse - RExC_precomp), + RExC_precomp + (RExC_parse - RExC_precomp))); + + /* Everything in this construct is a metacharacter. Operands begin with + * either a '\' (for an escape sequence), or a '[' for a bracketed + * character class. Any other character should be an operator, or + * parenthesis for grouping. Both types of operands are handled by calling + * regclass() to parse them. It is called with a parameter to indicate to + * return the computed inversion list. The parsing here is implemented via + * a stack. Each entry on the stack is a single character representing one + * of the operators; or else a pointer to an operand inversion list. */ #define IS_OPERAND(a) (! SvIOK(a)) - /* The stack starts empty. It is a syntax error if the first thing parsed - * is a binary operator; everything else is pushed on the stack. When an - * operand is parsed, the top of the stack is examined. If it is a binary - * operator, the item before it should be an operand, and both are replaced - * by the result of doing that operation on the new operand and the one on - * the stack. Thus a sequence of binary operands is reduced to a single - * one before the next one is parsed. + /* The stack is kept in Łukasiewicz order. (That's pronounced similar + * to luke-a-shave-itch (or -itz), but people who didn't want to bother + * with prounouncing it called it Reverse Polish instead, but now that YOU + * know how to prounounce it you can use the correct term, thus giving due + * credit to the person who invented it, and impressing your geek friends. + * Wikipedia says that the pronounciation of "Ł" has been changing so that + * it is now more like an English initial W (as in wonk) than an L.) + * + * This means that, for example, 'a | b & c' is stored on the stack as + * + * c [4] + * b [3] + * & [2] + * a [1] + * | [0] + * + * where the numbers in brackets give the stack [array] element number. + * In this implementation, parentheses are not stored on the stack. + * Instead a '(' creates a "fence" so that the part of the stack below the + * fence is invisible except to the corresponding ')' (this allows us to + * replace testing for parens, by using instead subtraction of the fence + * position). As new operands are processed they are pushed onto the stack + * (except as noted in the next paragraph). New operators of higher + * precedence than the current final one are inserted on the stack before + * the lhs operand (so that when the rhs is pushed next, everything will be + * in the correct positions shown above. When an operator of equal or + * lower precedence is encountered in parsing, all the stacked operations + * of equal or higher precedence are evaluated, leaving the result as the + * top entry on the stack. This makes higher precedence operations + * evaluate before lower precedence ones, and causes operations of equal + * precedence to left associate. * - * A unary operator may immediately follow a binary in the input, for - * example + * The only unary operator '!' is immediately pushed onto the stack when + * encountered. When an operand is encountered, if the top of the stack is + * a '!", the complement is immediately performed, and the '!' popped. The + * resulting value is treated as a new operand, and the logic in the + * previous paragraph is executed. Thus in the expression * [a] + ! [b] - * When an operand is parsed and the top of the stack is a unary operator, - * the operation is performed, and then the stack is rechecked to see if - * this new operand is part of a binary operation; if so, it is handled as - * above. + * the stack looks like + * + * ! + * a + * + + * + * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack + * becomes + * + * !b + * a + * + + * + * A ')' is treated as an operator with lower precedence than all the + * aforementioned ones, which causes all operations on the stack above the + * corresponding '(' to be evaluated down to a single resultant operand. + * Then the fence for the '(' is removed, and the operand goes through the + * algorithm above, without the fence. * - * A '(' is simply pushed on the stack; it is valid only if the stack is - * empty, or the top element of the stack is an operator or another '(' - * (for which the parenthesized expression will become an operand). By the - * time the corresponding ')' is parsed everything in between should have - * been parsed and evaluated to a single operand (or else is a syntax - * error), and is handled as a regular operand */ + * A separate stack is kept of the fence positions, so that the position of + * the latest so-far unbalanced '(' is at the top of it. + * + * The ']' ending the construct is treated as the lowest operator of all, + * so that everything gets evaluated down to a single operand, which is the + * result */ sv_2mortal((SV *)(stack = newAV())); + sv_2mortal((SV *)(fence_stack = newAV())); while (RExC_parse < RExC_end) { - I32 top_index = av_tindex(stack); - SV** top_ptr; - SV* current = NULL; + I32 top_index; /* Index of top-most element in 'stack' */ + SV** top_ptr; /* Pointer to top 'stack' element */ + SV* current = NULL; /* To contain the current inversion list + operand */ + SV* only_to_avoid_leaks; /* Skip white space */ RExC_parse = regpatws(pRExC_state, RExC_parse, - TRUE /* means recognize comments */ ); + TRUE /* means recognize comments */ ); if (RExC_parse >= RExC_end) { Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'"); } - if ((curchar = UCHARAT(RExC_parse)) == ']') { - break; - } + + curchar = UCHARAT(RExC_parse); + +redo_curchar: + + top_index = av_tindex(stack); switch (curchar) { + SV** stacked_ptr; /* Ptr to something already on 'stack' */ + char stacked_operator; /* The topmost operator on the 'stack'. */ + SV* lhs; /* Operand to the left of the operator */ + SV* rhs; /* Operand to the right of the operator */ + SV* fence_ptr; /* Pointer to top element of the fence + stack */ + + case '(': - case '?': - if (av_tindex(stack) >= 0 /* This makes sure that we can - safely subtract 1 from - RExC_parse in the next clause. - If we have something on the - stack, we have parsed something - */ - && UCHARAT(RExC_parse - 1) == '(' - && RExC_parse < RExC_end) + if (RExC_parse < RExC_end && (UCHARAT(RExC_parse + 1) == '?')) { /* If is a '(?', could be an embedded '(?flags:(?[...])'. * This happens when we have some thing like @@ -13466,14 +13551,18 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, * interpolated expression evaluates to. We use the flags * from the interpolated pattern. */ U32 save_flags = RExC_flags; - const char * const save_parse = ++RExC_parse; + const char * save_parse; + + RExC_parse += 2; /* Skip past the '(?' */ + save_parse = RExC_parse; + /* Parse any flags for the '(?' */ parse_lparen_question_flags(pRExC_state); if (RExC_parse == save_parse /* Makes sure there was at - least one flag (or this - embedding wasn't compiled) - */ + least one flag (or else + this embedding wasn't + compiled) */ || RExC_parse >= RExC_end - 4 || UCHARAT(RExC_parse) != ':' || UCHARAT(++RExC_parse) != '(' @@ -13493,25 +13582,50 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, } vFAIL("Expecting '(?flags:(?[...'"); } + + /* Recurse, with the meat of the embedded expression */ RExC_parse++; (void) handle_regex_sets(pRExC_state, ¤t, flagp, depth+1, oregcomp_parse); /* Here, 'current' contains the embedded expression's * inversion list, and RExC_parse points to the trailing - * ']'; the next character should be the ')' which will be - * paired with the '(' that has been put on the stack, so - * the whole embedded expression reduces to '(operand)' */ + * ']'; the next character should be the ')' */ + RExC_parse++; + assert(RExC_parse < RExC_end && UCHARAT(RExC_parse) == ')'); + + /* Then the ')' matching the original '(' handled by this + * case: statement */ RExC_parse++; + assert(RExC_parse < RExC_end && UCHARAT(RExC_parse) == ')'); + RExC_parse++; RExC_flags = save_flags; goto handle_operand; } - /* FALLTHROUGH */ - default: - RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; - vFAIL("Unexpected character"); + /* A regular '('. Look behind for illegal syntax */ + if (top_index - fence >= 0) { + /* If the top entry on the stack is an operator, it had + * better be a '!', otherwise the entry below the top + * operand should be an operator */ + if ( ! (top_ptr = av_fetch(stack, top_index, FALSE)) + || (! IS_OPERAND(*top_ptr) && SvUV(*top_ptr) != '!') + || top_index - fence < 1 + || ! (stacked_ptr = av_fetch(stack, + top_index - 1, + FALSE)) + || IS_OPERAND(*stacked_ptr)) + { + RExC_parse++; + vFAIL("Unexpected '(' with no preceding operator"); + } + } + + /* Stack the position of this undealt-with left paren */ + fence = top_index + 1; + av_push(fence_stack, newSViv(fence)); + break; case '\\': /* regclass() can only return RESTART_UTF8 if multi-char @@ -13521,10 +13635,12 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, FALSE, /* don't allow multi-char folds */ FALSE, /* don't silence non-portable warnings. */ TRUE, /* strict */ - ¤t - )) - FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"", - (UV) *flagp); + ¤t)) + { + FAIL2("panic: regclass returned NULL to handle_sets, " + "flags=%#"UVxf"", (UV) *flagp); + } + /* regclass() will return with parsing just the \ sequence, * leaving the parse pointer at the next thing to parse */ RExC_parse--; @@ -13548,8 +13664,11 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, TRUE, /* strict */ ¤t )) - FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"", - (UV) *flagp); + { + FAIL2("panic: regclass returned NULL to handle_sets, " + "flags=%#"UVxf"", (UV) *flagp); + } + /* function call leaves parse pointing to the ']', except if we * faked it */ if (is_posix_class) { @@ -13559,147 +13678,237 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, goto handle_operand; } + case ']': + if (top_index >= 1) { + goto join_operators; + } + + /* Only a single operand on the stack: are done */ + goto done; + + case ')': + if (av_tindex(fence_stack) < 0) { + RExC_parse++; + vFAIL("Unexpected ')'"); + } + + /* If at least two thing on the stack, treat this as an + * operator */ + if (top_index - fence >= 1) { + goto join_operators; + } + + /* Here only a single thing on the fenced stack, and there is a + * fence. Get rid of it */ + fence_ptr = av_pop(fence_stack); + assert(fence_ptr); + fence = SvIV(fence_ptr) - 1; + SvREFCNT_dec_NN(fence_ptr); + fence_ptr = NULL; + + if (fence < 0) { + fence = 0; + } + + /* Having gotten rid of the fence, we pop the operand at the + * stack top and process it as a newly encountered operand */ + current = av_pop(stack); + assert(IS_OPERAND(current)); + goto handle_operand; + case '&': case '|': case '+': case '-': case '^': - if (top_index < 0 + + /* These binary operators should have a left operand already + * parsed */ + if ( top_index - fence < 0 + || top_index - fence == 1 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE))) || ! IS_OPERAND(*top_ptr)) { - RExC_parse++; - vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar); + goto unexpected_binary; } - av_push(stack, newSVuv(curchar)); - break; - case '!': - av_push(stack, newSVuv(curchar)); - break; + /* If only the one operand is on the part of the stack visible + * to us, we just place this operator in the proper position */ + if (top_index - fence < 2) { - case '(': - if (top_index >= 0) { - top_ptr = av_fetch(stack, top_index, FALSE); - assert(top_ptr); - if (IS_OPERAND(*top_ptr)) { - RExC_parse++; - vFAIL("Unexpected '(' with no preceding operator"); - } + /* Place the operator before the operand */ + + SV* lhs = av_pop(stack); + av_push(stack, newSVuv(curchar)); + av_push(stack, lhs); + break; } - av_push(stack, newSVuv(curchar)); - break; - case ')': - { - SV* lparen; - if (top_index < 1 - || ! (current = av_pop(stack)) - || ! IS_OPERAND(current) - || ! (lparen = av_pop(stack)) - || IS_OPERAND(lparen) - || SvUV(lparen) != '(') + /* But if there is something else on the stack, we need to + * process it before this new operator if and only if the + * stacked operation has equal or higher precedence than the + * new one */ + + join_operators: + + /* The operator on the stack is supposed to be below both its + * operands */ + if ( ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE)) + || IS_OPERAND(*stacked_ptr)) { - SvREFCNT_dec(current); + /* But if not, it's legal and indicates we are completely + * done if and only if we're currently processing a ']', + * which should be the final thing in the expression */ + if (curchar == ']') { + goto done; + } + + unexpected_binary: RExC_parse++; - vFAIL("Unexpected ')'"); + vFAIL2("Unexpected binary operator '%c' with no " + "preceding operand", curchar); } - top_index -= 2; - SvREFCNT_dec_NN(lparen); + stacked_operator = (char) SvUV(*stacked_ptr); - /* FALLTHROUGH */ - } + if (regex_set_precedence(curchar) + > regex_set_precedence(stacked_operator)) + { + /* Here, the new operator has higher precedence than the + * stacked one. This means we need to add the new one to + * the stack to await its rhs operand (and maybe more + * stuff). We put it before the lhs operand, leaving + * untouched the stacked operator and everything below it + * */ + lhs = av_pop(stack); + assert(IS_OPERAND(lhs)); + + av_push(stack, newSVuv(curchar)); + av_push(stack, lhs); + break; + } - handle_operand: + /* Here, the new operator has equal or lower precedence than + * what's already there. This means the operation already + * there should be performed now, before the new one. */ + rhs = av_pop(stack); + lhs = av_pop(stack); - /* Here, we have an operand to process, in 'current' */ + assert(IS_OPERAND(rhs)); + assert(IS_OPERAND(lhs)); - if (top_index < 0) { /* Just push if stack is empty */ - av_push(stack, current); - } - else { - SV* top = av_pop(stack); - SV *prev = NULL; - char current_operator; - - if (IS_OPERAND(top)) { - SvREFCNT_dec_NN(top); - SvREFCNT_dec_NN(current); - vFAIL("Operand with no preceding operator"); + switch (stacked_operator) { + case '&': + _invlist_intersection(lhs, rhs, &rhs); + break; + + case '|': + case '+': + _invlist_union(lhs, rhs, &rhs); + break; + + case '-': + _invlist_subtract(lhs, rhs, &rhs); + break; + + case '^': /* The union minus the intersection */ + { + SV* i = NULL; + SV* u = NULL; + SV* element; + + _invlist_union(lhs, rhs, &u); + _invlist_intersection(lhs, rhs, &i); + /* _invlist_subtract will overwrite rhs + without freeing what it already contains */ + element = rhs; + _invlist_subtract(u, i, &rhs); + SvREFCNT_dec_NN(i); + SvREFCNT_dec_NN(u); + SvREFCNT_dec_NN(element); + break; } - current_operator = (char) SvUV(top); - switch (current_operator) { - case '(': /* Push the '(' back on followed by the new - operand */ - av_push(stack, top); - av_push(stack, current); - SvREFCNT_inc(top); /* Counters the '_dec' done - just after the 'break', so - it doesn't get wrongly freed - */ - break; + } + SvREFCNT_dec(lhs); + + /* Here, the higher precedence operation has been done, and the + * result is in 'rhs'. We overwrite the stacked operator with + * the result. Then we redo this code to either push the new + * operator onto the stack or perform any higher precedence + * stacked operation */ + only_to_avoid_leaks = av_pop(stack); + SvREFCNT_dec(only_to_avoid_leaks); + av_push(stack, rhs); + goto redo_curchar; + + case '!': /* Highest priority, right associative, so just push + onto stack */ + av_push(stack, newSVuv(curchar)); + break; - case '!': - _invlist_invert(current); - - /* Unlike binary operators, the top of the stack, - * now that this unary one has been popped off, may - * legally be an operator, and we now have operand - * for it. */ - top_index--; - SvREFCNT_dec_NN(top); - goto handle_operand; - - case '&': - prev = av_pop(stack); - _invlist_intersection(prev, - current, - ¤t); - av_push(stack, current); - break; + default: + RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; + vFAIL("Unexpected character"); - case '|': - case '+': - prev = av_pop(stack); - _invlist_union(prev, current, ¤t); - av_push(stack, current); - break; + handle_operand: + + /* Here 'current' is the operand. If something is already on the + * stack, we have to check if it is a !. */ + top_index = av_tindex(stack); /* Code above may have altered the + * stack in the time since we + * earlier set 'top_index'. */ + if (top_index - fence >= 0) { + /* If the top entry on the stack is an operator, it had better + * be a '!', otherwise the entry below the top operand should + * be an operator */ + top_ptr = av_fetch(stack, top_index, FALSE); + assert(top_ptr); + if (! IS_OPERAND(*top_ptr)) { + + /* The only permissible operator at the top of the stack is + * '!', which is applied immediately to this operand. */ + curchar = (char) SvUV(*top_ptr); + if (curchar != '!') { + SvREFCNT_dec(current); + vFAIL2("Unexpected binary operator '%c' with no " + "preceding operand", curchar); + } - case '-': - prev = av_pop(stack);; - _invlist_subtract(prev, current, ¤t); - av_push(stack, current); - break; + _invlist_invert(current); - case '^': /* The union minus the intersection */ - { - SV* i = NULL; - SV* u = NULL; - SV* element; - - prev = av_pop(stack); - _invlist_union(prev, current, &u); - _invlist_intersection(prev, current, &i); - /* _invlist_subtract will overwrite current - without freeing what it already contains */ - element = current; - _invlist_subtract(u, i, ¤t); - av_push(stack, current); - SvREFCNT_dec_NN(i); - SvREFCNT_dec_NN(u); - SvREFCNT_dec_NN(element); - break; - } + only_to_avoid_leaks = av_pop(stack); + SvREFCNT_dec(only_to_avoid_leaks); + top_index = av_tindex(stack); - default: - Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack"); + /* And we redo with the inverted operand. This allows + * handling multiple ! in a row */ + goto handle_operand; + } + /* Single operand is ok only for the non-binary ')' + * operator */ + else if ((top_index - fence == 0 && curchar != ')') + || (top_index - fence > 0 + && (! (stacked_ptr = av_fetch(stack, + top_index - 1, + FALSE)) + || IS_OPERAND(*stacked_ptr)))) + { + SvREFCNT_dec(current); + vFAIL("Operand with no preceding operator"); } - SvREFCNT_dec_NN(top); - SvREFCNT_dec(prev); } - } + + /* Here there was nothing on the stack or the top element was + * another operand. Just add this new one */ + av_push(stack, current); + + } /* End of switch on next parse token */ RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; + } /* End of loop parsing through the construct */ + + done: + if (av_tindex(fence_stack) >= 0) { + vFAIL("Unmatched ("); } if (av_tindex(stack) < 0 /* Was empty */ @@ -13707,6 +13916,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, || ! IS_OPERAND(final) || av_tindex(stack) >= 0) /* More left on stack */ { + SvREFCNT_dec(final); vFAIL("Incomplete expression within '(?[ ])'"); } @@ -13731,6 +13941,8 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, } } + /* About to generate an ANYOF (or similar) node from the inversion list we + * have calculated */ save_parse = RExC_parse; RExC_parse = SvPV(result_string, len); save_end = RExC_end; diff --git a/t/re/regex_sets.t b/t/re/regex_sets.t index c880897f37..48a4f00b8e 100644 --- a/t/re/regex_sets.t +++ b/t/re/regex_sets.t @@ -68,13 +68,19 @@ like("\N{LAO DIGIT NINE}", $thai_or_lao_digit, 'embedded qr/(?[ ])/ works'); unlike(chr(ord("\N{LAO DIGIT NINE}") + 1), $thai_or_lao_digit, 'embedded qr/(?[ ])/ works'); my $ascii_word = qr/(?[ \w ])/a; -my $ascii_digits_plus_all_of_arabic = qr/(?[ \p{Digit} & $ascii_word + \p{Arabic} ])/; +my $ascii_digits_plus_all_of_arabic = qr/(?[ \p{Arabic} + \p{Digit} & $ascii_word ])/; like("9", $ascii_digits_plus_all_of_arabic, "/a, then interpolating and intersection works for ASCII in the set"); unlike("A", $ascii_digits_plus_all_of_arabic, "/a, then interpolating and intersection works for ASCII not in the set"); unlike("\N{BENGALI DIGIT ZERO}", $ascii_digits_plus_all_of_arabic, "/a, then interpolating and intersection works for non-ASCII not in either set"); unlike("\N{BENGALI LETTER A}", $ascii_digits_plus_all_of_arabic, "/a, then interpolating and intersection works for non-ASCII in one set"); -like("\N{ARABIC LETTER HAMZA}", $ascii_digits_plus_all_of_arabic, "interpolation and intersection is left-associative"); -like("\N{EXTENDED ARABIC-INDIC DIGIT ZERO}", $ascii_digits_plus_all_of_arabic, "interpolation and intersection is left-associative"); +like("\N{ARABIC LETTER HAMZA}", $ascii_digits_plus_all_of_arabic, "intersection has higher precedence than union"); +like("\N{EXTENDED ARABIC-INDIC DIGIT ZERO}", $ascii_digits_plus_all_of_arabic, "intersection has higher precedence than union"); + +like("\r", qr/(?[ \p{lb=cr} ])/, '\r matches \p{lb=cr}'); +unlike("\r", qr/(?[ ! \p{lb=cr} ])/, '\r doesnt match ! \p{lb=cr}'); +like("\r", qr/(?[ ! ! \p{lb=cr} ])/, 'Two ! ! are the original'); +unlike("\r", qr/(?[ ! ! ! \p{lb=cr} ])/, 'Three ! ! ! are the complement'); +# left associatve my $kelvin = qr/(?[ \N{KELVIN SIGN} ])/; my $fold = qr/(?[ $kelvin ])/i; |