diff options
author | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2009-09-15 18:17:54 +0000 |
---|---|---|
committer | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2009-09-15 18:17:54 +0000 |
commit | 912ae74971cb3b32d007d9fa83295c38fc871b31 (patch) | |
tree | 315555d53b3c1d918f3acaed403bec7d9fbeb682 | |
parent | efd42072fc347bc5f9e2af584f79286553c3efb2 (diff) | |
download | pcre-912ae74971cb3b32d007d9fa83295c38fc871b31.tar.gz |
Capture data when (*ACCEPT) is inside capturing parentheses.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@447 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- | ChangeLog | 5 |
-rw-r--r-- | doc/pcrecompat.3 | 5 |
-rw-r--r-- | doc/pcrepattern.3 | 13 |
-rw-r--r-- | pcre_compile.c | 34 |
-rw-r--r-- | pcre_exec.c | 24 |
-rw-r--r-- | pcre_internal.h | 17 |
-rw-r--r-- | pcre_printint.src | 4 |
7 files changed, 87 insertions, 15 deletions
@@ -123,6 +123,11 @@ Version 8.00 ??-???-?? with unset values at the outer level. The correct (outer level) value is now given. +22. If (*ACCEPT) appeared inside capturing parentheses, previous releases of + PCRE did not set those parentheses (unlike Perl). I have now found a way to + make it do so. The string so far is captured, making this feature + compatible with Perl. + Version 7.9 11-Apr-09 --------------------- diff --git a/doc/pcrecompat.3 b/doc/pcrecompat.3 index c9e594b..68b5b9c 100644 --- a/doc/pcrecompat.3 +++ b/doc/pcrecompat.3 @@ -83,8 +83,7 @@ the pattern /^(a(b)?)+$/ in Perl leaves $2 unset, but in PCRE it is set to "b". .P 11. PCRE does support Perl 5.10's backtracking verbs (*ACCEPT), (*FAIL), (*F), (*COMMIT), (*PRUNE), (*SKIP), and (*THEN), but only in the forms without an -argument. PCRE does not support (*MARK). If (*ACCEPT) is within capturing -parentheses, PCRE does not set that capture group; this is different to Perl. +argument. PCRE does not support (*MARK). .P 12. PCRE provides some extensions to the Perl regular expression facilities. Perl 5.10 will include new features that are not in earlier versions, some of @@ -143,6 +142,6 @@ Cambridge CB2 3QH, England. .rs .sp .nf -Last updated: 11 September 2009 +Last updated: 15 September 2009 Copyright (c) 1997-2009 University of Cambridge. .fi diff --git a/doc/pcrepattern.3 b/doc/pcrepattern.3 index 98e4c06..ff9ce53 100644 --- a/doc/pcrepattern.3 +++ b/doc/pcrepattern.3 @@ -2155,14 +2155,13 @@ The following verbs act as soon as they are encountered: .sp This verb causes the match to end successfully, skipping the remainder of the pattern. When inside a recursion, only the innermost pattern is ended -immediately. PCRE differs from Perl in what happens if the (*ACCEPT) is inside -capturing parentheses. In Perl, the data so far is captured: in PCRE no data is -captured. For example: +immediately. If the (*ACCEPT) is inside capturing parentheses, the data so far +is captured. 
(This feature was added to PCRE at release 8.00.) For example: .sp - A(A|B(*ACCEPT)|C)D + A((?:A|B(*ACCEPT)|C)D) .sp -This matches "AB", "AAD", or "ACD", but when it matches "AB", no data is -captured. +This matches "AB", "AAD", or "ACD"; when it matches "AB", "B" is captured by +the outer parentheses. .sp (*FAIL) or (*F) .sp @@ -2259,6 +2258,6 @@ Cambridge CB2 3QH, England. .rs .sp .nf -Last updated: 13 September 2009 +Last updated: 15 September 2009 Copyright (c) 1997-2009 University of Cambridge. .fi diff --git a/pcre_compile.c b/pcre_compile.c index 7678495..69fa428 100644 --- a/pcre_compile.c +++ b/pcre_compile.c @@ -4440,8 +4440,19 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ if (namelen == verbs[i].len && strncmp((char *)name, vn, namelen) == 0) { - *code = verbs[i].op; - if (*code++ == OP_ACCEPT) cd->had_accept = TRUE; + /* Check for open captures before ACCEPT */ + + if (verbs[i].op == OP_ACCEPT) + { + open_capitem *oc; + cd->had_accept = TRUE; + for (oc = cd->open_caps; oc != NULL; oc = oc->next) + { + *code++ = OP_CLOSE; + PUT2INC(code, 0, oc->number); + } + } + *code++ = verbs[i].op; break; } vn += verbs[i].len + 1; @@ -5669,6 +5680,8 @@ uschar *code = *codeptr; uschar *last_branch = code; uschar *start_bracket = code; uschar *reverse_count = NULL; +open_capitem capitem; +int capnumber = 0; int firstbyte, reqbyte; int branchfirstbyte, branchreqbyte; int length; @@ -5695,6 +5708,17 @@ the code that abstracts option settings at the start of the pattern and makes them global. It tests the value of length for (2 + 2*LINK_SIZE) in the pre-compile phase to find out whether anything has yet been compiled or not. */ +/* If this is a capturing subpattern, add to the chain of open capturing items +so that we can detect them if (*ACCEPT) is encountered. 
*/ + +if (*code == OP_CBRA) + { + capnumber = GET2(code, 1 + LINK_SIZE); + capitem.number = capnumber; + capitem.next = cd->open_caps; + cd->open_caps = &capitem; + } + /* Offset is set zero to mark that this bracket is still open */ PUT(code, 1, 0); @@ -5830,6 +5854,10 @@ for (;;) } while (branch_length > 0); } + + /* If it was a capturing subpattern, remove it from the chain. */ + + if (capnumber > 0) cd->open_caps = cd->open_caps->next; /* Fill in the ket */ @@ -6398,6 +6426,7 @@ cd->end_pattern = (const uschar *)(pattern + strlen(pattern)); cd->req_varyopt = 0; cd->external_options = options; cd->external_flags = 0; +cd->open_caps = NULL; /* Now do the pre-compile. On error, errorcode will be set non-zero, so we don't need to look at the result of the function here. The initial options have @@ -6472,6 +6501,7 @@ cd->start_code = codestart; cd->hwm = cworkspace; cd->req_varyopt = 0; cd->had_accept = FALSE; +cd->open_caps = NULL; /* Set up a starting, non-extracting bracket, then compile the expression. On error, errorcode will be set non-zero, so we don't need to look at the result diff --git a/pcre_exec.c b/pcre_exec.c index 7107426..fe741fa 100644 --- a/pcre_exec.c +++ b/pcre_exec.c @@ -909,6 +909,30 @@ for (;;) ecode += 1 + LINK_SIZE; } break; + + + /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes, + to close any currently open capturing brackets. */ + + case OP_CLOSE: + number = GET2(ecode, 1); + offset = number << 1; + +#ifdef DEBUG + printf("end bracket %d at *ACCEPT", number); + printf("\n"); +#endif + + md->capture_last = number; + if (offset >= md->offset_max) md->offset_overflow = TRUE; else + { + md->offset_vector[offset] = + md->offset_vector[md->offset_end - number]; + md->offset_vector[offset+1] = eptr - md->start_subject; + if (offset_top <= offset) offset_top = offset + 2; + } + ecode += 3; + break; /* End of the pattern, either real or forced. 
If we are in a top-level diff --git a/pcre_internal.h b/pcre_internal.h index c6a1870..f64c809 100644 --- a/pcre_internal.h +++ b/pcre_internal.h @@ -1364,10 +1364,11 @@ enum { OP_FAIL, /* 109 */ OP_ACCEPT, /* 110 */ + OP_CLOSE, /* 111 Used before OP_ACCEPT to close open captures */ /* This is used to skip a subpattern with a {0} quantifier */ - OP_SKIPZERO /* 111 */ + OP_SKIPZERO /* 112 */ }; @@ -1393,7 +1394,7 @@ for debugging. The macro is referenced only in pcre_printint.c. */ "Once", "Bra", "CBra", "Cond", "SBra", "SCBra", "SCond", \ "Cond ref", "Cond rec", "Cond def", "Brazero", "Braminzero", \ "*PRUNE", "*SKIP", "*THEN", "*COMMIT", "*FAIL", "*ACCEPT", \ - "Skip zero" + "Close", "Skip zero" /* This macro defines the length of fixed length operations in the compiled @@ -1458,7 +1459,7 @@ in UTF-8 mode. The code that uses this table must know about such things. */ 1, /* DEF */ \ 1, 1, /* BRAZERO, BRAMINZERO */ \ 1, 1, 1, 1, /* PRUNE, SKIP, THEN, COMMIT, */ \ - 1, 1, 1 /* FAIL, ACCEPT, SKIPZERO */ + 1, 1, 3, 1 /* FAIL, ACCEPT, CLOSE, SKIPZERO */ /* A magic value for OP_RREF to indicate the "any recursion" condition. */ @@ -1521,6 +1522,15 @@ typedef struct pcre_study_data { uschar start_bits[32]; } pcre_study_data; +/* Structure for building a chain of open capturing subpatterns during +compiling, so that instructions to close them can be compiled when (*ACCEPT) is +encountered. */ + +typedef struct open_capitem { + struct open_capitem *next; /* Chain link */ + pcre_uint16 number; /* Capture number */ +} open_capitem; + /* Structure for passing "static" information around between the functions doing the compiling, so that they are thread-safe. 
*/ @@ -1533,6 +1543,7 @@ typedef struct compile_data { const uschar *start_code; /* The start of the compiled code */ const uschar *start_pattern; /* The start of the pattern */ const uschar *end_pattern; /* The end of the pattern */ + open_capitem *open_caps; /* Chain of open capture items */ uschar *hwm; /* High watermark of workspace */ uschar *name_table; /* The name/number table */ int names_found; /* Number of entries so far */ diff --git a/pcre_printint.src b/pcre_printint.src index 5f45fc1..e096f6d 100644 --- a/pcre_printint.src +++ b/pcre_printint.src @@ -245,6 +245,10 @@ for(;;) else fprintf(f, " "); fprintf(f, "%s", OP_names[*code]); break; + + case OP_CLOSE: + fprintf(f, " %s %d", OP_names[*code], GET2(code, 1)); + break; case OP_CREF: fprintf(f, "%3d %s", GET2(code,1), OP_names[*code]); |