summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2008-04-12 14:36:14 +0000
committer ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2008-04-12 14:36:14 +0000
commit bd39c50b17337e4e0f4f77370c0794046e7d2768 (patch)
tree 6af19fd983f194e306480c5c9af1e25705bdc3a9
parent f5fb7cc67b4338d6eadac91effd3ff4f30c33dde (diff)
download pcre-bd39c50b17337e4e0f4f77370c0794046e7d2768.tar.gz
Do not discard subpatterns with {0} quantifiers, as they may be called as
subroutines.

git-svn-id: svn://vcs.exim.org/pcre/code/trunk@335 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- ChangeLog 7
-rw-r--r-- HACKING 11
-rw-r--r-- doc/pcrepattern.3 11
-rw-r--r-- pcre_compile.c 59
-rw-r--r-- pcre_dfa_exec.c 7
-rw-r--r-- pcre_exec.c 18
-rw-r--r-- pcre_internal.h 11
-rw-r--r-- pcre_study.c 7
-rw-r--r-- testdata/testinput2 6
-rw-r--r-- testdata/testoutput2 12
10 files changed, 111 insertions, 38 deletions
diff --git a/ChangeLog b/ChangeLog
index d9baa07..9c93ffe 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -50,6 +50,13 @@ Version 7.7 05-Mar-08
which, however, unlike Perl's \g{...}, are subroutine calls, not back
references. PCRE supports relative numbers with this syntax (I don't think
Oniguruma does).
+
+12. Previously, a group with a zero repeat such as (...){0} was completely
+ omitted from the compiled regex. However, this means that if the group
+ was called as a subroutine from elsewhere in the pattern, things went wrong
+ (an internal error was given). Such groups are now left in the compiled
+ pattern, with a new opcode that causes them to be skipped at execution
+ time.
Version 7.6 28-Jan-08
diff --git a/HACKING b/HACKING
index c946cd2..e76341f 100644
--- a/HACKING
+++ b/HACKING
@@ -318,9 +318,12 @@ maximally respectively. All three are followed by LINK_SIZE bytes giving (as a
positive number) the offset back to the matching bracket opcode.
If a subpattern is quantified such that it is permitted to match zero times, it
-is preceded by one of OP_BRAZERO or OP_BRAMINZERO. These are single-byte
-opcodes which tell the matcher that skipping this subpattern entirely is a
-valid branch.
+is preceded by one of OP_BRAZERO, OP_BRAMINZERO, or OP_SKIPZERO. These are
+single-byte opcodes that tell the matcher that skipping the following
+subpattern entirely is a valid branch. In the case of the first two, not
+skipping the subpattern is also valid (greedy and non-greedy, respectively).
+The third is used when a subpattern has the quantifier {0,0}. Such a
+subpattern cannot be entirely discarded, because it may be called as a
+subroutine from elsewhere in the regex.
A subpattern with an indefinite maximum repetition is replicated in the
compiled data its minimum number of times (or once with OP_BRAZERO if the
@@ -411,4 +414,4 @@ at compile time, and so does not cause anything to be put into the compiled
data.
Philip Hazel
-August 2007
+April 2008
diff --git a/doc/pcrepattern.3 b/doc/pcrepattern.3
index b2b4024..2727b86 100644
--- a/doc/pcrepattern.3
+++ b/doc/pcrepattern.3
@@ -1259,7 +1259,14 @@ support is available, \eX{3} matches three Unicode extended sequences, each of
which may be several bytes long (and they may be of different lengths).
.P
The quantifier {0} is permitted, causing the expression to behave as if the
-previous item and the quantifier were not present.
+previous item and the quantifier were not present. This may be useful for
+subpatterns that are referenced as
+.\" HTML <a href="#subpatternsassubroutines">
+.\" </a>
+subroutines
+.\"
+from elsewhere in the pattern. Items other than subpatterns that have a {0}
+quantifier are omitted from the compiled pattern.
.P
For convenience, the three most common quantifiers have single-character
abbreviations:
@@ -2232,6 +2239,6 @@ Cambridge CB2 3QH, England.
.rs
.sp
.nf
-Last updated: 10 April 2008
+Last updated: 12 April 2008
Copyright (c) 1997-2008 University of Cambridge.
.fi
diff --git a/pcre_compile.c b/pcre_compile.c
index f7be568..492222a 100644
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -1567,7 +1567,7 @@ for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE
/* Groups with zero repeats can of course be empty; skip them. */
- if (c == OP_BRAZERO || c == OP_BRAMINZERO)
+ if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
{
code += _pcre_OP_lengths[c];
do code += GET(code, 1); while (*code == OP_ALT);
@@ -1847,11 +1847,12 @@ return -1;
that is referenced. This means that groups can be replicated for fixed
repetition simply by copying (because the recursion is allowed to refer to
earlier groups that are outside the current group). However, when a group is
-optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
-it, after it has been compiled. This means that any OP_RECURSE items within it
-that refer to the group itself or any contained groups have to have their
-offsets adjusted. That one of the jobs of this function. Before it is called,
-the partially compiled regex must be temporarily terminated with OP_END.
+optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
+inserted before it, after it has been compiled. This means that any OP_RECURSE
+items within it that refer to the group itself or any contained groups have to
+have their offsets adjusted. That is one of the jobs of this function. Before it
+is called, the partially compiled regex must be temporarily terminated with
+OP_END.
This function has been extended with the possibility of forward references for
recursions and subroutine calls. It must also check the list of such references
@@ -3842,28 +3843,38 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
if (repeat_min == 0)
{
- /* If the maximum is also zero, we just omit the group from the output
- altogether. */
-
- if (repeat_max == 0)
- {
- code = previous;
- goto END_REPEAT;
- }
-
- /* If the maximum is 1 or unlimited, we just have to stick in the
- BRAZERO and do no more at this point. However, we do need to adjust
- any OP_RECURSE calls inside the group that refer to the group itself or
- any internal or forward referenced group, because the offset is from
- the start of the whole regex. Temporarily terminate the pattern while
- doing this. */
-
- if (repeat_max <= 1)
+ /* If the maximum is also zero, we used to just omit the group from the
+ output altogether, like this:
+
+ ** if (repeat_max == 0)
+ ** {
+ ** code = previous;
+ ** goto END_REPEAT;
+ ** }
+
+ However, that fails when a group is referenced as a subroutine from
+ elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
+ so that it is skipped on execution. As we don't have a list of which
+ groups are referenced, we cannot do this selectively.
+
+ If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
+ and do no more at this point. However, we do need to adjust any
+ OP_RECURSE calls inside the group that refer to the group itself or any
+ internal or forward referenced group, because the offset is from the
+ start of the whole regex. Temporarily terminate the pattern while doing
+ this. */
+
+ if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
{
*code = OP_END;
adjust_recurse(previous, 1, utf8, cd, save_hwm);
memmove(previous+1, previous, len);
code++;
+ if (repeat_max == 0)
+ {
+ *previous++ = OP_SKIPZERO;
+ goto END_REPEAT;
+ }
*previous++ = OP_BRAZERO + repeat_type;
}
@@ -6198,7 +6209,7 @@ while (errorcode == 0 && cd->hwm > cworkspace)
if (groupptr == NULL) errorcode = ERR53;
else PUT(((uschar *)codestart), offset, groupptr - codestart);
}
-
+
/* Give an error if there's back reference to a non-existent capturing
subpattern. */
diff --git a/pcre_dfa_exec.c b/pcre_dfa_exec.c
index cfa3b2a..427c46f 100644
--- a/pcre_dfa_exec.c
+++ b/pcre_dfa_exec.c
@@ -694,6 +694,13 @@ for (;;)
break;
/*-----------------------------------------------------------------*/
+ case OP_SKIPZERO:
+ code += 1 + GET(code, 2);
+ while (*code == OP_ALT) code += GET(code, 1);
+ ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
+ break;
+
+ /*-----------------------------------------------------------------*/
case OP_CIRC:
if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
((ims & PCRE_MULTILINE) != 0 &&
diff --git a/pcre_exec.c b/pcre_exec.c
index 5673991..89fe6c2 100644
--- a/pcre_exec.c
+++ b/pcre_exec.c
@@ -1148,11 +1148,11 @@ for (;;)
do ecode += GET(ecode,1); while (*ecode == OP_ALT);
break;
- /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
- that it may occur zero times. It may repeat infinitely, or not at all -
- i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
- repeat limits are compiled as a number of copies, with the optional ones
- preceded by BRAZERO or BRAMINZERO. */
+ /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
+ indicating that it may occur zero times. It may repeat infinitely, or not
+ at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
+ with fixed upper repeat limits are compiled as a number of copies, with the
+ optional ones preceded by BRAZERO or BRAMINZERO. */
case OP_BRAZERO:
{
@@ -1174,6 +1174,14 @@ for (;;)
}
break;
+ case OP_SKIPZERO:
+ {
+ next = ecode+1;
+ do next += GET(next,1); while (*next == OP_ALT);
+ ecode = next + 1 + LINK_SIZE;
+ }
+ break;
+
/* End of a group, repeated or non-repeating. */
case OP_KET:
diff --git a/pcre_internal.h b/pcre_internal.h
index bca9564..a2a30f4 100644
--- a/pcre_internal.h
+++ b/pcre_internal.h
@@ -773,7 +773,11 @@ enum {
/* These are forced failure and success verbs */
OP_FAIL, /* 108 */
- OP_ACCEPT /* 109 */
+ OP_ACCEPT, /* 109 */
+
+ /* This is used to skip a subpattern with a {0} quantifier */
+
+ OP_SKIPZERO /* 110 */
};
@@ -798,7 +802,8 @@ for debugging. The macro is referenced only in pcre_printint.c. */
"AssertB", "AssertB not", "Reverse", \
"Once", "Bra", "CBra", "Cond", "SBra", "SCBra", "SCond", \
"Cond ref", "Cond rec", "Cond def", "Brazero", "Braminzero", \
- "*PRUNE", "*SKIP", "*THEN", "*COMMIT", "*FAIL", "*ACCEPT"
+ "*PRUNE", "*SKIP", "*THEN", "*COMMIT", "*FAIL", "*ACCEPT", \
+ "Skip zero"
/* This macro defines the length of fixed length operations in the compiled
@@ -863,7 +868,7 @@ in UTF-8 mode. The code that uses this table must know about such things. */
1, /* DEF */ \
1, 1, /* BRAZERO, BRAMINZERO */ \
1, 1, 1, 1, /* PRUNE, SKIP, THEN, COMMIT, */ \
- 1, 1 /* FAIL, ACCEPT */
+ 1, 1, 1 /* FAIL, ACCEPT, SKIPZERO */
/* A magic value for OP_RREF to indicate the "any recursion" condition. */
diff --git a/pcre_study.c b/pcre_study.c
index ff1f260..216c889 100644
--- a/pcre_study.c
+++ b/pcre_study.c
@@ -217,6 +217,13 @@ do
tcode += 1 + LINK_SIZE;
break;
+ /* SKIPZERO skips the bracket. */
+
+ case OP_SKIPZERO:
+ do tcode += GET(tcode,1); while (*tcode == OP_ALT);
+ tcode += 1 + LINK_SIZE;
+ break;
+
/* Single-char * or ? sets the bit and tries the next item */
case OP_STAR:
diff --git a/testdata/testinput2 b/testdata/testinput2
index a4a0326..16e712a 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -2649,4 +2649,10 @@ a random value. /Ix
abc
accccbbb
+/^(?+1)(?<a>x|y){0}z/
+ xzxx
+ yzyy
+ ** Failers
+ xxz
+
/ End of testinput2 /
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index c2e41f7..1987cf8 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -9515,4 +9515,16 @@ No match
0: accccbbb
1: a
+/^(?+1)(?<a>x|y){0}z/
+ xzxx
+ 0: xz
+ 1: <unset>
+ yzyy
+ 0: yz
+ 1: <unset>
+ ** Failers
+No match
+ xxz
+No match
+
/ End of testinput2 /