summary refs log tree commit diff
diff options
context:
space:
mode:
author ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2009-09-15 18:17:54 +0000
committer ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2009-09-15 18:17:54 +0000
commit 912ae74971cb3b32d007d9fa83295c38fc871b31 (patch)
tree 315555d53b3c1d918f3acaed403bec7d9fbeb682
parent efd42072fc347bc5f9e2af584f79286553c3efb2 (diff)
download pcre-912ae74971cb3b32d007d9fa83295c38fc871b31.tar.gz
Capture data when (*ACCEPT) is inside capturing parentheses.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@447 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- ChangeLog 5
-rw-r--r-- doc/pcrecompat.3 5
-rw-r--r-- doc/pcrepattern.3 13
-rw-r--r-- pcre_compile.c 34
-rw-r--r-- pcre_exec.c 24
-rw-r--r-- pcre_internal.h 17
-rw-r--r-- pcre_printint.src 4
7 files changed, 87 insertions, 15 deletions
diff --git a/ChangeLog b/ChangeLog
index 3e4d702..14d61c0 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -123,6 +123,11 @@ Version 8.00 ??-???-??
with unset values at the outer level. The correct (outer level) value is
now given.
+22. If (*ACCEPT) appeared inside capturing parentheses, previous releases of
+ PCRE did not set those parentheses (unlike Perl). I have now found a way to
+ make it do so. The string so far is captured, making this feature
+ compatible with Perl.
+
Version 7.9 11-Apr-09
---------------------
diff --git a/doc/pcrecompat.3 b/doc/pcrecompat.3
index c9e594b..68b5b9c 100644
--- a/doc/pcrecompat.3
+++ b/doc/pcrecompat.3
@@ -83,8 +83,7 @@ the pattern /^(a(b)?)+$/ in Perl leaves $2 unset, but in PCRE it is set to "b".
.P
11. PCRE does support Perl 5.10's backtracking verbs (*ACCEPT), (*FAIL), (*F),
(*COMMIT), (*PRUNE), (*SKIP), and (*THEN), but only in the forms without an
-argument. PCRE does not support (*MARK). If (*ACCEPT) is within capturing
-parentheses, PCRE does not set that capture group; this is different to Perl.
+argument. PCRE does not support (*MARK).
.P
12. PCRE provides some extensions to the Perl regular expression facilities.
Perl 5.10 will include new features that are not in earlier versions, some of
@@ -143,6 +142,6 @@ Cambridge CB2 3QH, England.
.rs
.sp
.nf
-Last updated: 11 September 2009
+Last updated: 15 September 2009
Copyright (c) 1997-2009 University of Cambridge.
.fi
diff --git a/doc/pcrepattern.3 b/doc/pcrepattern.3
index 98e4c06..ff9ce53 100644
--- a/doc/pcrepattern.3
+++ b/doc/pcrepattern.3
@@ -2155,14 +2155,13 @@ The following verbs act as soon as they are encountered:
.sp
This verb causes the match to end successfully, skipping the remainder of the
pattern. When inside a recursion, only the innermost pattern is ended
-immediately. PCRE differs from Perl in what happens if the (*ACCEPT) is inside
-capturing parentheses. In Perl, the data so far is captured: in PCRE no data is
-captured. For example:
+immediately. If the (*ACCEPT) is inside capturing parentheses, the data so far
+is captured. (This feature was added to PCRE at release 8.00.) For example:
.sp
- A(A|B(*ACCEPT)|C)D
+ A((?:A|B(*ACCEPT)|C)D)
.sp
-This matches "AB", "AAD", or "ACD", but when it matches "AB", no data is
-captured.
+This matches "AB", "AAD", or "ACD"; when it matches "AB", "B" is captured by
+the outer parentheses.
.sp
(*FAIL) or (*F)
.sp
@@ -2259,6 +2258,6 @@ Cambridge CB2 3QH, England.
.rs
.sp
.nf
-Last updated: 13 September 2009
+Last updated: 15 September 2009
Copyright (c) 1997-2009 University of Cambridge.
.fi
diff --git a/pcre_compile.c b/pcre_compile.c
index 7678495..69fa428 100644
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -4440,8 +4440,19 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
if (namelen == verbs[i].len &&
strncmp((char *)name, vn, namelen) == 0)
{
- *code = verbs[i].op;
- if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
+ /* Check for open captures before ACCEPT */
+
+ if (verbs[i].op == OP_ACCEPT)
+ {
+ open_capitem *oc;
+ cd->had_accept = TRUE;
+ for (oc = cd->open_caps; oc != NULL; oc = oc->next)
+ {
+ *code++ = OP_CLOSE;
+ PUT2INC(code, 0, oc->number);
+ }
+ }
+ *code++ = verbs[i].op;
break;
}
vn += verbs[i].len + 1;
@@ -5669,6 +5680,8 @@ uschar *code = *codeptr;
uschar *last_branch = code;
uschar *start_bracket = code;
uschar *reverse_count = NULL;
+open_capitem capitem;
+int capnumber = 0;
int firstbyte, reqbyte;
int branchfirstbyte, branchreqbyte;
int length;
@@ -5695,6 +5708,17 @@ the code that abstracts option settings at the start of the pattern and makes
them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
pre-compile phase to find out whether anything has yet been compiled or not. */
+/* If this is a capturing subpattern, add to the chain of open capturing items
+so that we can detect them if (*ACCEPT) is encountered. */
+
+if (*code == OP_CBRA)
+ {
+ capnumber = GET2(code, 1 + LINK_SIZE);
+ capitem.number = capnumber;
+ capitem.next = cd->open_caps;
+ cd->open_caps = &capitem;
+ }
+
/* Offset is set zero to mark that this bracket is still open */
PUT(code, 1, 0);
@@ -5830,6 +5854,10 @@ for (;;)
}
while (branch_length > 0);
}
+
+ /* If it was a capturing subpattern, remove it from the chain. */
+
+ if (capnumber > 0) cd->open_caps = cd->open_caps->next;
/* Fill in the ket */
@@ -6398,6 +6426,7 @@ cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
cd->req_varyopt = 0;
cd->external_options = options;
cd->external_flags = 0;
+cd->open_caps = NULL;
/* Now do the pre-compile. On error, errorcode will be set non-zero, so we
don't need to look at the result of the function here. The initial options have
@@ -6472,6 +6501,7 @@ cd->start_code = codestart;
cd->hwm = cworkspace;
cd->req_varyopt = 0;
cd->had_accept = FALSE;
+cd->open_caps = NULL;
/* Set up a starting, non-extracting bracket, then compile the expression. On
error, errorcode will be set non-zero, so we don't need to look at the result
diff --git a/pcre_exec.c b/pcre_exec.c
index 7107426..fe741fa 100644
--- a/pcre_exec.c
+++ b/pcre_exec.c
@@ -909,6 +909,30 @@ for (;;)
ecode += 1 + LINK_SIZE;
}
break;
+
+
+ /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
+ to close any currently open capturing brackets. */
+
+ case OP_CLOSE:
+ number = GET2(ecode, 1);
+ offset = number << 1;
+
+#ifdef DEBUG
+ printf("end bracket %d at *ACCEPT", number);
+ printf("\n");
+#endif
+
+ md->capture_last = number;
+ if (offset >= md->offset_max) md->offset_overflow = TRUE; else
+ {
+ md->offset_vector[offset] =
+ md->offset_vector[md->offset_end - number];
+ md->offset_vector[offset+1] = eptr - md->start_subject;
+ if (offset_top <= offset) offset_top = offset + 2;
+ }
+ ecode += 3;
+ break;
/* End of the pattern, either real or forced. If we are in a top-level
diff --git a/pcre_internal.h b/pcre_internal.h
index c6a1870..f64c809 100644
--- a/pcre_internal.h
+++ b/pcre_internal.h
@@ -1364,10 +1364,11 @@ enum {
OP_FAIL, /* 109 */
OP_ACCEPT, /* 110 */
+ OP_CLOSE, /* 111 Used before OP_ACCEPT to close open captures */
/* This is used to skip a subpattern with a {0} quantifier */
- OP_SKIPZERO /* 111 */
+ OP_SKIPZERO /* 112 */
};
@@ -1393,7 +1394,7 @@ for debugging. The macro is referenced only in pcre_printint.c. */
"Once", "Bra", "CBra", "Cond", "SBra", "SCBra", "SCond", \
"Cond ref", "Cond rec", "Cond def", "Brazero", "Braminzero", \
"*PRUNE", "*SKIP", "*THEN", "*COMMIT", "*FAIL", "*ACCEPT", \
- "Skip zero"
+ "Close", "Skip zero"
/* This macro defines the length of fixed length operations in the compiled
@@ -1458,7 +1459,7 @@ in UTF-8 mode. The code that uses this table must know about such things. */
1, /* DEF */ \
1, 1, /* BRAZERO, BRAMINZERO */ \
1, 1, 1, 1, /* PRUNE, SKIP, THEN, COMMIT, */ \
- 1, 1, 1 /* FAIL, ACCEPT, SKIPZERO */
+ 1, 1, 3, 1 /* FAIL, ACCEPT, CLOSE, SKIPZERO */
/* A magic value for OP_RREF to indicate the "any recursion" condition. */
@@ -1521,6 +1522,15 @@ typedef struct pcre_study_data {
uschar start_bits[32];
} pcre_study_data;
+/* Structure for building a chain of open capturing subpatterns during
+compiling, so that instructions to close them can be compiled when (*ACCEPT) is
+encountered. */
+
+typedef struct open_capitem {
+ struct open_capitem *next; /* Next item in the chain; NULL marks the end */
+ pcre_uint16 number; /* Number of the open capturing subpattern */
+} open_capitem;
+
/* Structure for passing "static" information around between the functions
doing the compiling, so that they are thread-safe. */
@@ -1533,6 +1543,7 @@ typedef struct compile_data {
const uschar *start_code; /* The start of the compiled code */
const uschar *start_pattern; /* The start of the pattern */
const uschar *end_pattern; /* The end of the pattern */
+ open_capitem *open_caps; /* Chain of open capture items */
uschar *hwm; /* High watermark of workspace */
uschar *name_table; /* The name/number table */
int names_found; /* Number of entries so far */
diff --git a/pcre_printint.src b/pcre_printint.src
index 5f45fc1..e096f6d 100644
--- a/pcre_printint.src
+++ b/pcre_printint.src
@@ -245,6 +245,10 @@ for(;;)
else fprintf(f, " ");
fprintf(f, "%s", OP_names[*code]);
break;
+
+ case OP_CLOSE:
+ fprintf(f, " %s %d", OP_names[*code], GET2(code, 1));
+ break;
case OP_CREF:
fprintf(f, "%3d %s", GET2(code,1), OP_names[*code]);