diff options
-rw-r--r-- | ChangeLog | 3 | ||||
-rw-r--r-- | NEWS | 6 | ||||
-rw-r--r-- | doc/pcrepattern.3 | 37 | ||||
-rw-r--r-- | pcre_compile.c | 40 | ||||
-rw-r--r-- | testdata/testinput2 | 25 | ||||
-rw-r--r-- | testdata/testinput7 | 18 | ||||
-rw-r--r-- | testdata/testoutput2 | 117 | ||||
-rw-r--r-- | testdata/testoutput7 | 28 |
8 files changed, 266 insertions, 8 deletions
@@ -40,6 +40,9 @@ Version 7.2 05-June-07 (f) \g{name} is another synonym - part of Perl 5.10's unification of reference syntax. + + (g) (?| introduces a group in which the numbering of parentheses in each + alternative starts with the same number. 7. Added two new calls to pcre_fullinfo(): PCRE_INFO_OKPARTIAL and PCRE_INFO_JCHANGED. @@ -24,7 +24,11 @@ Some more features from Perl 5.10 have been added: (?(-n) and (?(+n) relative references as conditions. \K to reset the start of the matched string; for example, (foo)\Kbar - matches bar preceded by foo, but only sets bar as the matched string + matches bar preceded by foo, but only sets bar as the matched string. + + (?| introduces a group where the capturing parentheses in each alternative + start from the same number; for example, (?|(abc)|(xyz)) sets capturing + parentheses number 1 in both cases. Release 7.1 24-Apr-07 diff --git a/doc/pcrepattern.3 b/doc/pcrepattern.3 index 6dcd161..e760623 100644 --- a/doc/pcrepattern.3 +++ b/doc/pcrepattern.3 @@ -958,6 +958,38 @@ is reached, an option setting in one branch does affect subsequent branches, so the above patterns match "SUNDAY" as well as "Saturday". . . +.SH "DUPLICATE SUBPATTERN NUMBERS" +.rs +.sp +Perl 5.10 introduced a feature whereby each alternative in a subpattern uses +the same numbers for its capturing parentheses. Such a subpattern starts with +(?| and is itself a non-capturing subpattern. For example, consider this +pattern: +.sp + (?|(Sat)ur|(Sun))day +.sp +Because the two alternatives are inside a (?| group, both sets of capturing +parentheses are numbered one. Thus, when the pattern matches, you can look +at captured substring number one, whichever alternative matched. This construct +is useful when you want to capture part, but not all, of one of a number of +alternatives. Inside a (?| group, parentheses are numbered as usual, but the +number is reset at the start of each branch. 
The numbers of any capturing +buffers that follow the subpattern start after the highest number used in any +branch. The following example is taken from the Perl documentation. +The numbers underneath show in which buffer the captured content will be +stored. +.sp + # before ---------------branch-reset----------- after + / ( a ) (?| x ( y ) z | (p (q) r) | (t) u (v) ) ( z ) /x + # 1 2 2 3 2 3 4 +.sp +A backreference to a numbered subpattern uses the value most recently set for that number, +whereas a recursive call always refers to the first subpattern in the pattern with that number. +.P +An alternative approach to using this "branch reset" feature is to use +duplicate named subpatterns, as described in the next section. +. +. .SH "NAMED SUBPATTERNS" .rs .sp @@ -1007,6 +1039,9 @@ abbreviation. This pattern (ignoring the line breaks) does the job: (?<DN>Sat)(?:urday)? .sp There are five capturing substrings, but only one is ever set after a match. +(An alternative way of solving this problem is to use a "branch reset" +subpattern, as described in the previous section.) +.P The convenience function for extracting the data by name returns the substring for the first (and in this example, the only) subpattern of that name that matched. This saves searching to find which numbered subpattern it was. If you @@ -1898,6 +1933,6 @@ Cambridge CB2 3QH, England. .rs .sp .nf -Last updated: 29 May 2007 +Last updated: 11 June 2007 Copyright (c) 1997-2007 University of Cambridge. 
.fi diff --git a/pcre_compile.c b/pcre_compile.c index 92b52df..2deb2d1 100644 --- a/pcre_compile.c +++ b/pcre_compile.c @@ -374,8 +374,8 @@ static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */ /* Definition to allow mutual recursion */ static BOOL - compile_regex(int, int, uschar **, const uschar **, int *, BOOL, int, int *, - int *, branch_chain *, compile_data *, int *); + compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int, + int *, int *, branch_chain *, compile_data *, int *); @@ -2110,6 +2110,7 @@ for (;; ptr++) BOOL possessive_quantifier; BOOL is_quantifier; BOOL is_recurse; + BOOL reset_bracount; int class_charcount; int class_lastchar; int newoptions; @@ -3584,6 +3585,7 @@ for (;; ptr++) skipbytes = 0; bravalue = OP_CBRA; save_hwm = cd->hwm; + reset_bracount = FALSE; if (*(++ptr) == '?') { @@ -3606,6 +3608,11 @@ for (;; ptr++) /* ------------------------------------------------------------ */ + case '|': /* Reset capture count for each branch */ + reset_bracount = TRUE; + /* Fall through */ + + /* ------------------------------------------------------------ */ case ':': /* Non-capturing bracket */ bravalue = OP_BRA; ptr++; @@ -4304,6 +4311,7 @@ for (;; ptr++) errorcodeptr, /* Where to put an error message */ (bravalue == OP_ASSERTBACK || bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */ + reset_bracount, /* True if (?| group */ skipbytes, /* Skip over bracket number */ &subfirstbyte, /* For possible first char */ &subreqbyte, /* For possible last char */ @@ -4663,6 +4671,7 @@ Arguments: ptrptr -> the address of the current pattern pointer errorcodeptr -> pointer to error code variable lookbehind TRUE if this is a lookbehind assertion + reset_bracount TRUE to reset the count for each branch skipbytes skip this many bytes at start (for brackets and OP_COND) firstbyteptr place to put the first required character, or a negative number reqbyteptr place to put the last required character, or a negative 
number @@ -4676,8 +4685,9 @@ Returns: TRUE on success static BOOL compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr, - int *errorcodeptr, BOOL lookbehind, int skipbytes, int *firstbyteptr, - int *reqbyteptr, branch_chain *bcptr, compile_data *cd, int *lengthptr) + int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes, + int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd, + int *lengthptr) { const uschar *ptr = *ptrptr; uschar *code = *codeptr; @@ -4687,6 +4697,8 @@ uschar *reverse_count = NULL; int firstbyte, reqbyte; int branchfirstbyte, branchreqbyte; int length; +int orig_bracount; +int max_bracount; branch_chain bc; bc.outer = bcptr; @@ -4715,8 +4727,14 @@ code += 1 + LINK_SIZE + skipbytes; /* Loop for each alternative branch */ +orig_bracount = max_bracount = cd->bracount; for (;;) { + /* For a (?| group, reset the capturing bracket count so that each branch + uses the same numbers. */ + + if (reset_bracount) cd->bracount = orig_bracount; + /* Handle a change of ims options at the start of the branch */ if ((options & PCRE_IMS) != oldims) @@ -4745,6 +4763,11 @@ for (;;) *ptrptr = ptr; return FALSE; } + + /* Keep the highest bracket count in case (?| was used and some branch + has fewer than the rest. */ + + if (cd->bracount > max_bracount) max_bracount = cd->bracount; /* In the real compile phase, there is some post-processing to be done. */ @@ -4847,6 +4870,10 @@ for (;;) *code++ = oldims; length += 2; } + + /* Retain the highest bracket number, in case resetting was used. */ + + cd->bracount = max_bracount; /* Set values to pass back */ @@ -5322,7 +5349,8 @@ outside can help speed up starting point checks. 
*/ code = cworkspace; *code = OP_BRA; (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS, - &code, &ptr, &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, &length); + &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, + &length); if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN; DPRINTF(("end pre-compile: length=%d workspace=%d\n", length, @@ -5390,7 +5418,7 @@ ptr = (const uschar *)pattern; code = (uschar *)codestart; *code = OP_BRA; (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr, - &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL); + &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL); re->top_bracket = cd->bracount; re->top_backref = cd->top_backref; diff --git a/testdata/testinput2 b/testdata/testinput2 index 8b027c7..357b1a9 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -2215,4 +2215,29 @@ a random value. /Ix /\g{A/ +/(?|(abc)|(xyz))/BZ + >abc< + >xyz< + +/(x)(?|(abc)|(xyz))(x)/BZ + xabcx + xxyzx + +/(x)(?|(abc)(pqr)|(xyz))(x)/BZ + xabcpqrx + xxyzx + +/(?|(abc)|(xyz))\1/ + abcabc + xyzxyz + ** Failers + abcxyz + xyzabc + +/(?|(abc)|(xyz))(?1)/ + abcabc + xyzabc + ** Failers + xyzxyz + / End of testinput2 / diff --git a/testdata/testinput7 b/testdata/testinput7 index 5c2dd6f..55ed629 100644 --- a/testdata/testinput7 +++ b/testdata/testinput7 @@ -4249,4 +4249,22 @@ /(?m)$/<any>g+ abc\r\n\r\n +/(?|(abc)|(xyz))/ + >abc< + >xyz< + +/(x)(?|(abc)|(xyz))(x)/ + xabcx + xxyzx + +/(x)(?|(abc)(pqr)|(xyz))(x)/ + xabcpqrx + xxyzx + +/(?|(abc)|(xyz))(?1)/ + abcabc + xyzabc + ** Failers + xyzxyz + / End of testinput7 / diff --git a/testdata/testoutput2 b/testdata/testoutput2 index d51f0c8..d975fa1 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -8357,4 +8357,121 @@ No match /\g{A/ Failed: syntax error in subpattern name (missing terminator) at offset 4 +/(?|(abc)|(xyz))/BZ +------------------------------------------------------------------ + Bra 0 + 
Bra 0 + Bra 1 + abc + Ket + Alt + Bra 1 + xyz + Ket + Ket + Ket + End +------------------------------------------------------------------ + >abc< + 0: abc + 1: abc + >xyz< + 0: xyz + 1: xyz + +/(x)(?|(abc)|(xyz))(x)/BZ +------------------------------------------------------------------ + Bra 0 + Bra 1 + x + Ket + Bra 0 + Bra 2 + abc + Ket + Alt + Bra 2 + xyz + Ket + Ket + Bra 3 + x + Ket + Ket + End +------------------------------------------------------------------ + xabcx + 0: xabcx + 1: x + 2: abc + 3: x + xxyzx + 0: xxyzx + 1: x + 2: xyz + 3: x + +/(x)(?|(abc)(pqr)|(xyz))(x)/BZ +------------------------------------------------------------------ + Bra 0 + Bra 1 + x + Ket + Bra 0 + Bra 2 + abc + Ket + Bra 3 + pqr + Ket + Alt + Bra 2 + xyz + Ket + Ket + Bra 4 + x + Ket + Ket + End +------------------------------------------------------------------ + xabcpqrx + 0: xabcpqrx + 1: x + 2: abc + 3: pqr + 4: x + xxyzx + 0: xxyzx + 1: x + 2: xyz + 3: <unset> + 4: x + +/(?|(abc)|(xyz))\1/ + abcabc + 0: abcabc + 1: abc + xyzxyz + 0: xyzxyz + 1: xyz + ** Failers +No match + abcxyz +No match + xyzabc +No match + +/(?|(abc)|(xyz))(?1)/ + abcabc + 0: abcabc + 1: abc + xyzabc + 0: xyzabc + 1: xyz + ** Failers +No match + xyzxyz +No match + / End of testinput2 / diff --git a/testdata/testoutput7 b/testdata/testoutput7 index 6860b66..9127526 100644 --- a/testdata/testoutput7 +++ b/testdata/testoutput7 @@ -6990,4 +6990,32 @@ No match 0: 0+ +/(?|(abc)|(xyz))/ + >abc< + 0: abc + >xyz< + 0: xyz + +/(x)(?|(abc)|(xyz))(x)/ + xabcx + 0: xabcx + xxyzx + 0: xxyzx + +/(x)(?|(abc)(pqr)|(xyz))(x)/ + xabcpqrx + 0: xabcpqrx + xxyzx + 0: xxyzx + +/(?|(abc)|(xyz))(?1)/ + abcabc + 0: abcabc + xyzabc + 0: xyzabc + ** Failers +No match + xyzxyz +No match + / End of testinput7 / |