summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--ChangeLog5
-rw-r--r--doc/pcrepattern.326
-rw-r--r--pcre_compile.c104
-rw-r--r--pcre_dfa_exec.c4
-rw-r--r--pcre_exec.c46
-rw-r--r--pcre_internal.h98
-rw-r--r--pcre_printint.c14
-rw-r--r--pcre_study.c53
-rw-r--r--testdata/saved16bin86 -> 86 bytes
-rw-r--r--testdata/saved16BE-1bin410 -> 410 bytes
-rw-r--r--testdata/saved16BE-2bin344 -> 344 bytes
-rw-r--r--testdata/saved16LE-1bin410 -> 410 bytes
-rw-r--r--testdata/saved16LE-2bin344 -> 344 bytes
-rw-r--r--testdata/saved32bin108 -> 108 bytes
-rw-r--r--testdata/saved32BE-1bin552 -> 552 bytes
-rw-r--r--testdata/saved32BE-2bin456 -> 456 bytes
-rw-r--r--testdata/saved32LE-1bin552 -> 552 bytes
-rw-r--r--testdata/saved32LE-2bin456 -> 456 bytes
-rw-r--r--testdata/saved8bin77 -> 77 bytes
-rw-r--r--testdata/testinput111
-rw-r--r--testdata/testinput24
-rw-r--r--testdata/testinput216
-rw-r--r--testdata/testinput2215
-rw-r--r--testdata/testoutput126
-rw-r--r--testdata/testoutput234
-rw-r--r--testdata/testoutput21-1610
-rw-r--r--testdata/testoutput21-3210
-rw-r--r--testdata/testoutput22-1619
-rw-r--r--testdata/testoutput22-3219
29 files changed, 392 insertions, 112 deletions
diff --git a/ChangeLog b/ChangeLog
index 213d7c1..68541e1 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -74,6 +74,11 @@ Version 8.34 xx-xxxx-201x
compile happens. This has simplified the code (it is now nearly 150 lines
shorter) and prepared the way for better handling of references to groups
with duplicate names.
+
+15. A back reference to a named subpattern when there is more than one of the
+ same name now checks them in the order in which they appear in the pattern.
+ The first one that is set is used for the reference. Previously only the
+ first one was inspected. This change makes PCRE more compatible with Perl.
Version 8.33 28-May-2013
diff --git a/doc/pcrepattern.3 b/doc/pcrepattern.3
index d3b126e..07365f5 100644
--- a/doc/pcrepattern.3
+++ b/doc/pcrepattern.3
@@ -1,4 +1,4 @@
-.TH PCREPATTERN 3 "26 April 2013" "PCRE 8.33"
+.TH PCREPATTERN 3 "06 September 2013" "PCRE 8.34"
.SH NAME
PCRE - Perl-compatible regular expressions
.SH "PCRE REGULAR EXPRESSION DETAILS"
@@ -1577,9 +1577,20 @@ for the first (and in this example, the only) subpattern of that name that
matched. This saves searching to find which numbered subpattern it was.
.P
If you make a back reference to a non-unique named subpattern from elsewhere in
-the pattern, the one that corresponds to the first occurrence of the name is
-used. In the absence of duplicate numbers (see the previous section) this is
-the one with the lowest number. If you use a named reference in a condition
+the pattern, the subpatterns to which the name refers are checked in the order
+in which they appear in the overall pattern. The first one that is set is used
+for the reference. For example, this pattern matches both "foofoo" and
+"barbar" but not "foobar" or "barfoo":
+.sp
+ (?:(?<n>foo)|(?<n>bar))\k<n>
+.sp
+.P
+If you make a subroutine call to a non-unique named subpattern, the one that
+corresponds to the first occurrence of the name is used. In the absence of
+duplicate numbers (see the previous section) this is the one with the lowest
+number.
+.P
+If you use a named reference in a condition
test (see the
.\"
.\" HTML <a href="#conditions">
@@ -1599,8 +1610,9 @@ documentation.
\fBWarning:\fP You cannot use different names to distinguish between two
subpatterns with the same number because PCRE uses only the numbers when
matching. For this reason, an error is given at compile time if different names
-are given to subpatterns with the same number. However, you can give the same
-name to subpatterns with the same number, even when PCRE_DUPNAMES is not set.
+are given to subpatterns with the same number. However, you can always give the
+same name to subpatterns with the same number, even when PCRE_DUPNAMES is not
+set.
.
.
.SH REPETITION
@@ -3145,6 +3157,6 @@ Cambridge CB2 3QH, England.
.rs
.sp
.nf
-Last updated: 26 April 2013
+Last updated: 06 September 2013
Copyright (c) 1997-2013 University of Cambridge.
.fi
diff --git a/pcre_compile.c b/pcre_compile.c
index 0de5565..45937c1 100644
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -1722,6 +1722,8 @@ for (;;)
case OP_QUERYI:
case OP_REF:
case OP_REFI:
+ case OP_DNREF:
+ case OP_DNREFI:
case OP_SBRA:
case OP_SBRAPOS:
case OP_SCBRA:
@@ -4826,13 +4828,12 @@ for (;; ptr++)
/* If previous was a character class or a back reference, we put the repeat
stuff after it, but just skip the item if the repeat was {0,0}. */
- else if (*previous == OP_CLASS ||
- *previous == OP_NCLASS ||
+ else if (*previous == OP_CLASS || *previous == OP_NCLASS ||
#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
*previous == OP_XCLASS ||
#endif
- *previous == OP_REF ||
- *previous == OP_REFI)
+ *previous == OP_REF || *previous == OP_REFI ||
+ *previous == OP_DNREF || *previous == OP_DNREFI)
{
if (repeat_max == 0)
{
@@ -5886,7 +5887,8 @@ for (;; ptr++)
{
*errorcodeptr = ERR43;
goto FAILED;
- }
+ }
+ cd->dupnames = TRUE; /* Duplicate names exist */
}
else if (ng->number == number)
{
@@ -5987,6 +5989,10 @@ for (;; ptr++)
break;
}
recno = (i < cd->names_found)? ng->number : 0;
+
+ /* Count named back references. */
+
+ if (!is_recurse) cd->namedrefcount++;
}
/* In the real compile, search the name table. We check the name
@@ -6016,12 +6022,66 @@ for (;; ptr++)
}
}
- /* In both phases, we can now go to the code than handles numerical
- recursion or backreferences. */
+ /* In both phases, for recursions, we can now go to the code that
+ handles numerical recursion. */
if (is_recurse) goto HANDLE_RECURSION;
- else goto HANDLE_REFERENCE;
+
+ /* In the second pass we must see if the name is duplicated. If so, we
+ generate a different opcode. */
+
+ if (lengthptr == NULL && cd->dupnames)
+ {
+ int count = 1;
+ unsigned int index = i;
+ pcre_uchar *cslot = slot + cd->name_entry_size;
+
+ for (i++; i < cd->names_found; i++)
+ {
+ if (STRCMP_UC_UC(slot + IMM2_SIZE, cslot + IMM2_SIZE) != 0) break;
+ count++;
+ cslot += cd->name_entry_size;
+ }
+ if (count > 1)
+ {
+ if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
+ previous = code;
+ *code++ = ((options & PCRE_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
+ PUT2INC(code, 0, index);
+ PUT2INC(code, 0, count);
+
+ /* Process each potentially referenced group. */
+
+ for (; slot < cslot; slot += cd->name_entry_size)
+ {
+ open_capitem *oc;
+ recno = GET2(slot, 0);
+ cd->backref_map |= (recno < 32)? (1 << recno) : 1;
+ if (recno > cd->top_backref) cd->top_backref = recno;
+
+ /* Check to see if this back reference is recursive, that is, it
+ is inside the group that it references. A flag is set so that the
+ group can be made atomic. */
+
+ for (oc = cd->open_caps; oc != NULL; oc = oc->next)
+ {
+ if (oc->number == recno)
+ {
+ oc->flag = TRUE;
+ break;
+ }
+ }
+ }
+
+ continue; /* End of back ref handling */
+ }
+ }
+
+ /* First pass, or a non-duplicated name. */
+
+ goto HANDLE_REFERENCE;
+
/* ------------------------------------------------------------ */
case CHAR_R: /* Recursion */
@@ -6602,8 +6662,11 @@ for (;; ptr++)
{
open_capitem *oc;
recno = -escape;
+
+ /* Come here from named backref handling when the reference is to a
+ single group (i.e. not to a duplicated name). */
- HANDLE_REFERENCE: /* Come here from named backref handling */
+ HANDLE_REFERENCE:
if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
previous = code;
*code++ = ((options & PCRE_CASELESS) != 0)? OP_REFI : OP_REF;
@@ -7872,6 +7935,8 @@ cd->bracount = cd->final_bracount = 0;
cd->names_found = 0;
cd->name_entry_size = 0;
cd->name_table = NULL;
+cd->dupnames = FALSE;
+cd->namedrefcount = 0;
cd->start_code = cworkspace;
cd->hwm = cworkspace;
cd->start_workspace = cworkspace;
@@ -7909,14 +7974,23 @@ if (length > MAX_PATTERN_SIZE)
goto PCRE_EARLY_ERROR_RETURN;
}
-/* Compute the size of data block needed and get it, either from malloc or
-externally provided function. Integer overflow should no longer be possible
-because nowadays we limit the maximum value of cd->names_found and
-cd->name_entry_size. */
+/* If there are groups with duplicate names and there are also references by
+name, we must allow for the possibility of named references to duplicated
+groups. These require an extra data item each. */
-size = sizeof(REAL_PCRE) + (length + cd->names_found * cd->name_entry_size) * sizeof(pcre_uchar);
-re = (REAL_PCRE *)(PUBL(malloc))(size);
+if (cd->dupnames && cd->namedrefcount > 0)
+ length += cd->namedrefcount * IMM2_SIZE * sizeof(pcre_uchar);
+
+/* Compute the size of the data block for storing the compiled pattern. Integer
+overflow should no longer be possible because nowadays we limit the maximum
+value of cd->names_found and cd->name_entry_size. */
+size = sizeof(REAL_PCRE) +
+ (length + cd->names_found * cd->name_entry_size) * sizeof(pcre_uchar);
+
+/* Get the memory. */
+
+re = (REAL_PCRE *)(PUBL(malloc))(size);
if (re == NULL)
{
errorcode = ERR21;
diff --git a/pcre_dfa_exec.c b/pcre_dfa_exec.c
index 02bd3f0..bd5eb44 100644
--- a/pcre_dfa_exec.c
+++ b/pcre_dfa_exec.c
@@ -156,6 +156,8 @@ static const pcre_uint8 coptable[] = {
0, /* XCLASS - variable length */
0, /* REF */
0, /* REFI */
+ 0, /* DNREF */
+ 0, /* DNREFI */
0, /* RECURSE */
0, /* CALLOUT */
0, /* Alt */
@@ -225,6 +227,8 @@ static const pcre_uint8 poptable[] = {
1, /* XCLASS - variable length */
0, /* REF */
0, /* REFI */
+ 0, /* DNREF */
+ 0, /* DNREFI */
0, /* RECURSE */
0, /* CALLOUT */
0, /* Alt */
diff --git a/pcre_exec.c b/pcre_exec.c
index c5d0566..0f8526f 100644
--- a/pcre_exec.c
+++ b/pcre_exec.c
@@ -2742,15 +2742,7 @@ for (;;)
similar code to character type repeats - written out again for speed.
However, if the referenced string is the empty string, always treat
it as matched, any number of times (otherwise there could be infinite
- loops). */
-
- case OP_REF:
- case OP_REFI:
- caseless = op == OP_REFI;
- offset = GET2(ecode, 1) << 1; /* Doubled ref number */
- ecode += 1 + IMM2_SIZE;
-
- /* If the reference is unset, there are two possibilities:
+ loops). If the reference is unset, there are two possibilities:
(a) In the default, Perl-compatible state, set the length negative;
this ensures that every attempt at a match fails. We can't just fail
@@ -2760,7 +2752,40 @@ for (;;)
so that the back reference matches an empty string.
Otherwise, set the length to the length of what was matched by the
- referenced subpattern. */
+ referenced subpattern.
+
+ The OP_REF and OP_REFI opcodes are used for a reference to a numbered group
+ or to a non-duplicated named group. For a duplicated named group, OP_DNREF
+ and OP_DNREFI are used. In this case we must scan the list of groups to
+ which the name refers, and use the first one that is set. */
+
+ case OP_DNREF:
+ case OP_DNREFI:
+ caseless = op == OP_DNREFI;
+ {
+ int count = GET2(ecode, 1+IMM2_SIZE);
+ pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
+ ecode += 1 + 2*IMM2_SIZE;
+
+ while (count-- > 0)
+ {
+ offset = GET2(slot, 0) << 1;
+ if (offset < offset_top && md->offset_vector[offset] >= 0) break;
+ slot += md->name_entry_size;
+ }
+ if (count < 0)
+ length = (md->jscript_compat)? 0 : -1;
+ else
+ length = md->offset_vector[offset+1] - md->offset_vector[offset];
+ }
+ goto REF_REPEAT;
+
+ case OP_REF:
+ case OP_REFI:
+ caseless = op == OP_REFI;
+ offset = GET2(ecode, 1) << 1; /* Doubled ref number */
+ ecode += 1 + IMM2_SIZE;
+
if (offset >= offset_top || md->offset_vector[offset] < 0)
length = (md->jscript_compat)? 0 : -1;
@@ -2769,6 +2794,7 @@ for (;;)
/* Set up for repetition, or handle the non-repeated case */
+ REF_REPEAT:
switch (*ecode)
{
case OP_CRSTAR:
diff --git a/pcre_internal.h b/pcre_internal.h
index ec9fca6..fa3af98 100644
--- a/pcre_internal.h
+++ b/pcre_internal.h
@@ -2055,79 +2055,81 @@ enum {
class. This does both positive and negative. */
OP_REF, /* 109 Match a back reference, casefully */
OP_REFI, /* 110 Match a back reference, caselessly */
- OP_RECURSE, /* 111 Match a numbered subpattern (possibly recursive) */
- OP_CALLOUT, /* 112 Call out to external function if provided */
+ OP_DNREF, /* 111 Match a duplicate name backref, casefully */
+ OP_DNREFI, /* 112 Match a duplicate name backref, caselessly */
+ OP_RECURSE, /* 113 Match a numbered subpattern (possibly recursive) */
+ OP_CALLOUT, /* 114 Call out to external function if provided */
- OP_ALT, /* 113 Start of alternation */
- OP_KET, /* 114 End of group that doesn't have an unbounded repeat */
- OP_KETRMAX, /* 115 These two must remain together and in this */
- OP_KETRMIN, /* 116 order. They are for groups the repeat for ever. */
- OP_KETRPOS, /* 117 Possessive unlimited repeat. */
+ OP_ALT, /* 115 Start of alternation */
+ OP_KET, /* 116 End of group that doesn't have an unbounded repeat */
+ OP_KETRMAX, /* 117 These two must remain together and in this */
+ OP_KETRMIN, /* 118 order. They are for groups that repeat for ever. */
+ OP_KETRPOS, /* 119 Possessive unlimited repeat. */
/* The assertions must come before BRA, CBRA, ONCE, and COND, and the four
asserts must remain in order. */
- OP_REVERSE, /* 118 Move pointer back - used in lookbehind assertions */
- OP_ASSERT, /* 119 Positive lookahead */
- OP_ASSERT_NOT, /* 120 Negative lookahead */
- OP_ASSERTBACK, /* 121 Positive lookbehind */
- OP_ASSERTBACK_NOT, /* 122 Negative lookbehind */
+ OP_REVERSE, /* 120 Move pointer back - used in lookbehind assertions */
+ OP_ASSERT, /* 121 Positive lookahead */
+ OP_ASSERT_NOT, /* 122 Negative lookahead */
+ OP_ASSERTBACK, /* 123 Positive lookbehind */
+ OP_ASSERTBACK_NOT, /* 124 Negative lookbehind */
/* ONCE, ONCE_NC, BRA, BRAPOS, CBRA, CBRAPOS, and COND must come immediately
after the assertions, with ONCE first, as there's a test for >= ONCE for a
subpattern that isn't an assertion. The POS versions must immediately follow
the non-POS versions in each case. */
- OP_ONCE, /* 123 Atomic group, contains captures */
- OP_ONCE_NC, /* 124 Atomic group containing no captures */
- OP_BRA, /* 125 Start of non-capturing bracket */
- OP_BRAPOS, /* 126 Ditto, with unlimited, possessive repeat */
- OP_CBRA, /* 127 Start of capturing bracket */
- OP_CBRAPOS, /* 128 Ditto, with unlimited, possessive repeat */
- OP_COND, /* 129 Conditional group */
+ OP_ONCE, /* 125 Atomic group, contains captures */
+ OP_ONCE_NC, /* 126 Atomic group containing no captures */
+ OP_BRA, /* 127 Start of non-capturing bracket */
+ OP_BRAPOS, /* 128 Ditto, with unlimited, possessive repeat */
+ OP_CBRA, /* 129 Start of capturing bracket */
+ OP_CBRAPOS, /* 130 Ditto, with unlimited, possessive repeat */
+ OP_COND, /* 131 Conditional group */
/* These five must follow the previous five, in the same order. There's a
check for >= SBRA to distinguish the two sets. */
- OP_SBRA, /* 130 Start of non-capturing bracket, check empty */
- OP_SBRAPOS, /* 131 Ditto, with unlimited, possessive repeat */
- OP_SCBRA, /* 132 Start of capturing bracket, check empty */
- OP_SCBRAPOS, /* 133 Ditto, with unlimited, possessive repeat */
- OP_SCOND, /* 134 Conditional group, check empty */
+ OP_SBRA, /* 132 Start of non-capturing bracket, check empty */
+ OP_SBRAPOS, /* 133 Ditto, with unlimited, possessive repeat */
+ OP_SCBRA, /* 134 Start of capturing bracket, check empty */
+ OP_SCBRAPOS, /* 135 Ditto, with unlimited, possessive repeat */
+ OP_SCOND, /* 136 Conditional group, check empty */
/* The next two pairs must (respectively) be kept together. */
- OP_CREF, /* 135 Used to hold a capture number as condition */
- OP_NCREF, /* 136 Same, but generated by a name reference*/
- OP_RREF, /* 137 Used to hold a recursion number as condition */
- OP_NRREF, /* 138 Same, but generated by a name reference*/
- OP_DEF, /* 139 The DEFINE condition */
+ OP_CREF, /* 137 Used to hold a capture number as condition */
+ OP_NCREF, /* 138 Same, but generated by a name reference*/
+ OP_RREF, /* 139 Used to hold a recursion number as condition */
+ OP_NRREF, /* 140 Same, but generated by a name reference*/
+ OP_DEF, /* 141 The DEFINE condition */
- OP_BRAZERO, /* 140 These two must remain together and in this */
- OP_BRAMINZERO, /* 141 order. */
- OP_BRAPOSZERO, /* 142 */
+ OP_BRAZERO, /* 142 These two must remain together and in this */
+ OP_BRAMINZERO, /* 143 order. */
+ OP_BRAPOSZERO, /* 144 */
/* These are backtracking control verbs */
- OP_MARK, /* 143 always has an argument */
- OP_PRUNE, /* 144 */
- OP_PRUNE_ARG, /* 145 same, but with argument */
- OP_SKIP, /* 146 */
- OP_SKIP_ARG, /* 147 same, but with argument */
- OP_THEN, /* 148 */
- OP_THEN_ARG, /* 149 same, but with argument */
- OP_COMMIT, /* 150 */
+ OP_MARK, /* 145 always has an argument */
+ OP_PRUNE, /* 146 */
+ OP_PRUNE_ARG, /* 147 same, but with argument */
+ OP_SKIP, /* 148 */
+ OP_SKIP_ARG, /* 149 same, but with argument */
+ OP_THEN, /* 150 */
+ OP_THEN_ARG, /* 151 same, but with argument */
+ OP_COMMIT, /* 152 */
/* These are forced failure and success verbs */
- OP_FAIL, /* 151 */
- OP_ACCEPT, /* 152 */
- OP_ASSERT_ACCEPT, /* 153 Used inside assertions */
- OP_CLOSE, /* 154 Used before OP_ACCEPT to close open captures */
+ OP_FAIL, /* 153 */
+ OP_ACCEPT, /* 154 */
+ OP_ASSERT_ACCEPT, /* 155 Used inside assertions */
+ OP_CLOSE, /* 156 Used before OP_ACCEPT to close open captures */
/* This is used to skip a subpattern with a {0} quantifier */
- OP_SKIPZERO, /* 155 */
+ OP_SKIPZERO, /* 157 */
/* This is not an opcode, but is used to check that tables indexed by opcode
are the correct length, in order to catch updating errors - there have been
@@ -2167,7 +2169,7 @@ some cases doesn't actually use these names at all). */
"*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
"*+","++", "?+", "{", \
"*", "*?", "+", "+?", "?", "??", "{", "{", \
- "class", "nclass", "xclass", "Ref", "Refi", \
+ "class", "nclass", "xclass", "Ref", "Refi", "DnRef", "DnRefi", \
"Recurse", "Callout", \
"Alt", "Ket", "KetRmax", "KetRmin", "KetRpos", \
"Reverse", "Assert", "Assert not", "AssertB", "AssertB not", \
@@ -2237,6 +2239,8 @@ in UTF-8 mode. The code that uses this table must know about such things. */
0, /* XCLASS - variable length */ \
1+IMM2_SIZE, /* REF */ \
1+IMM2_SIZE, /* REFI */ \
+ 1+2*IMM2_SIZE, /* DNREF */ \
+ 1+2*IMM2_SIZE, /* DNREFI */ \
1+LINK_SIZE, /* RECURSE */ \
2+2*LINK_SIZE, /* CALLOUT */ \
1+LINK_SIZE, /* Alt */ \
@@ -2441,6 +2445,7 @@ typedef struct compile_data {
int max_lookbehind; /* Maximum lookbehind (characters) */
int top_backref; /* Maximum back reference */
unsigned int backref_map; /* Bitmap of low back refs */
+ unsigned int namedrefcount; /* Number of backreferences by name */
int assert_depth; /* Depth of nested assertions */
pcre_uint32 external_options; /* External (initial) options */
pcre_uint32 external_flags; /* External flag bits to be set */
@@ -2448,6 +2453,7 @@ typedef struct compile_data {
BOOL had_accept; /* (*ACCEPT) encountered */
BOOL had_pruneorskip; /* (*PRUNE) or (*SKIP) encountered */
BOOL check_lookbehind; /* Lookbehinds need later checking */
+ BOOL dupnames; /* Duplicate names exist */
int nltype; /* Newline type */
int nllen; /* Newline string length */
pcre_uchar nl[4]; /* Newline string when fixed length */
diff --git a/pcre_printint.c b/pcre_printint.c
index 10b5754..437fe31 100644
--- a/pcre_printint.c
+++ b/pcre_printint.c
@@ -598,6 +598,20 @@ for(;;)
ccode = code + priv_OP_lengths[*code];
goto CLASS_REF_REPEAT;
+ case OP_DNREFI:
+ flag = "/i";
+ /* Fall through */
+ case OP_DNREF:
+ {
+ pcre_uchar *entry = (pcre_uchar *)re + offset + (GET2(code, 1) * size) +
+ IMM2_SIZE;
+ fprintf(f, " %s \\k<", flag);
+ print_puchar(f, entry);
+ fprintf(f, ">%d", GET2(code, 1 + IMM2_SIZE));
+ }
+ ccode = code + priv_OP_lengths[*code];
+ goto CLASS_REF_REPEAT;
+
case OP_CALLOUT:
fprintf(f, " %s %d %d %d", priv_OP_names[*code], code[1], GET(code,2),
GET(code, 2 + LINK_SIZE));
diff --git a/pcre_study.c b/pcre_study.c
index 12d2a66..de85ede 100644
--- a/pcre_study.c
+++ b/pcre_study.c
@@ -66,8 +66,9 @@ string of that length that matches. In UTF8 mode, the result is in characters
rather than bytes.
Arguments:
+ re compiled pattern block
code pointer to start of group (the bracket)
- startcode pointer to start of the whole pattern
+ startcode pointer to start of the whole pattern's code
options the compiling options
recurse_depth RECURSE depth
@@ -78,8 +79,8 @@ Returns: the minimum length
*/
static int
-find_minlength(const pcre_uchar *code, const pcre_uchar *startcode, int options,
- int recurse_depth)
+find_minlength(const REAL_PCRE *re, const pcre_uchar *code,
+ const pcre_uchar *startcode, int options, int recurse_depth)
{
int length = -1;
/* PCRE_UTF16 has the same value as PCRE_UTF8. */
@@ -129,7 +130,7 @@ for (;;)
case OP_SBRAPOS:
case OP_ONCE:
case OP_ONCE_NC:
- d = find_minlength(cc, startcode, options, recurse_depth);
+ d = find_minlength(re, cc, startcode, options, recurse_depth);
if (d < 0) return d;
branchlength += d;
do cc += GET(cc, 1); while (*cc == OP_ALT);
@@ -374,8 +375,39 @@ for (;;)
If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
matches an empty string (by default it causes a matching failure), so in
that case we must set the minimum length to zero. */
+
+ case OP_DNREF: /* Duplicate named pattern back reference */
+ case OP_DNREFI:
+ if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
+ {
+ int count = GET2(cc, 1+IMM2_SIZE);
+ pcre_uchar *slot = (pcre_uchar *)re +
+ re->name_table_offset + GET2(cc, 1) * re->name_entry_size;
+ d = INT_MAX;
+ while (count-- > 0)
+ {
+ ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(slot, 0));
+ if (cs == NULL) return -2;
+ do ce += GET(ce, 1); while (*ce == OP_ALT);
+ if (cc > cs && cc < ce)
+ {
+ d = 0;
+ had_recurse = TRUE;
+ break;
+ }
+ else
+ {
+ int dd = find_minlength(re, cs, startcode, options, recurse_depth);
+ if (dd < d) d = dd;
+ }
+ slot += re->name_entry_size;
+ }
+ }
+ else d = 0;
+ cc += 1 + 2*IMM2_SIZE;
+ goto REPEAT_BACK_REFERENCE;
- case OP_REF:
+ case OP_REF: /* Single back reference */
case OP_REFI:
if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
{
@@ -389,7 +421,7 @@ for (;;)
}
else
{
- d = find_minlength(cs, startcode, options, recurse_depth);
+ d = find_minlength(re, cs, startcode, options, recurse_depth);
}
}
else d = 0;
@@ -397,6 +429,7 @@ for (;;)
/* Handle repeated back references */
+ REPEAT_BACK_REFERENCE:
switch (*cc)
{
case OP_CRSTAR:
@@ -437,7 +470,8 @@ for (;;)
had_recurse = TRUE;
else
{
- branchlength += find_minlength(cs, startcode, options, recurse_depth + 1);
+ branchlength += find_minlength(re, cs, startcode, options,
+ recurse_depth + 1);
}
cc += 1 + LINK_SIZE;
break;
@@ -825,6 +859,8 @@ do
case OP_RECURSE:
case OP_REF:
case OP_REFI:
+ case OP_DNREF:
+ case OP_DNREFI:
case OP_REVERSE:
case OP_RREF:
case OP_SCOND:
@@ -1346,6 +1382,7 @@ pcre_uchar *code;
compile_data compile_block;
const REAL_PCRE *re = (const REAL_PCRE *)external_re;
+
*errorptr = NULL;
if (re == NULL || re->magic_number != MAGIC_NUMBER)
@@ -1422,7 +1459,7 @@ if ((re->options & PCRE_ANCHORED) == 0 &&
/* Find the minimum length of subject string. */
-switch(min = find_minlength(code, code, re->options, 0))
+switch(min = find_minlength(re, code, code, re->options, 0))
{
case -2: *errorptr = "internal error: missing capturing bracket"; return NULL;
case -3: *errorptr = "internal error: opcode not recognized"; return NULL;
diff --git a/testdata/saved16 b/testdata/saved16
index 583c434..f86326c 100644
--- a/testdata/saved16
+++ b/testdata/saved16
Binary files differ
diff --git a/testdata/saved16BE-1 b/testdata/saved16BE-1
index e2e807d..669f586 100644
--- a/testdata/saved16BE-1
+++ b/testdata/saved16BE-1
Binary files differ
diff --git a/testdata/saved16BE-2 b/testdata/saved16BE-2
index cc2718a..063d6bc 100644
--- a/testdata/saved16BE-2
+++ b/testdata/saved16BE-2
Binary files differ
diff --git a/testdata/saved16LE-1 b/testdata/saved16LE-1
index b037d49..22e2c6a 100644
--- a/testdata/saved16LE-1
+++ b/testdata/saved16LE-1
Binary files differ
diff --git a/testdata/saved16LE-2 b/testdata/saved16LE-2
index d7034f7..b74d9a0 100644
--- a/testdata/saved16LE-2
+++ b/testdata/saved16LE-2
Binary files differ
diff --git a/testdata/saved32 b/testdata/saved32
index 5b6fe34..a4e2704 100644
--- a/testdata/saved32
+++ b/testdata/saved32
Binary files differ
diff --git a/testdata/saved32BE-1 b/testdata/saved32BE-1
index ebe62ca..514d4d7 100644
--- a/testdata/saved32BE-1
+++ b/testdata/saved32BE-1
Binary files differ
diff --git a/testdata/saved32BE-2 b/testdata/saved32BE-2
index 8168343..ae6c18f 100644
--- a/testdata/saved32BE-2
+++ b/testdata/saved32BE-2
Binary files differ
diff --git a/testdata/saved32LE-1 b/testdata/saved32LE-1
index e008f3a..0504d0e 100644
--- a/testdata/saved32LE-1
+++ b/testdata/saved32LE-1
Binary files differ
diff --git a/testdata/saved32LE-2 b/testdata/saved32LE-2
index cf3bd73..d260260 100644
--- a/testdata/saved32LE-2
+++ b/testdata/saved32LE-2
Binary files differ
diff --git a/testdata/saved8 b/testdata/saved8
index 37d733e..8cf0c13 100644
--- a/testdata/saved8
+++ b/testdata/saved8
Binary files differ
diff --git a/testdata/testinput1 b/testdata/testinput1
index f2194ff..1763c5a 100644
--- a/testdata/testinput1
+++ b/testdata/testinput1
@@ -5609,4 +5609,15 @@ AbcdCBefgBhiBqz
ca
cd
+/(?:(?<n>foo)|(?<n>bar))\k<n>/J
+ foofoo
+ barbar
+
+/(?<n>A)(?:(?<n>foo)|(?<n>bar))\k<n>/J
+ AfooA
+ AbarA
+ ** Failers
+ Afoofoo
+ Abarbar
+
/-- End of testinput1 --/
diff --git a/testdata/testinput2 b/testdata/testinput2
index a19730e..fd93065 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -3844,4 +3844,8 @@ backtracking verbs. --/
/^(?=(a)){0}b(?1)/
backgammon
+/(?|(?<n>f)|(?<n>b))/JI
+
+/(?<a>abc)(?<a>z)\k<a>()/JDZS
+
/-- End of testinput2 --/
diff --git a/testdata/testinput21 b/testdata/testinput21
index b285d63..30895ee 100644
--- a/testdata/testinput21
+++ b/testdata/testinput21
@@ -4,7 +4,11 @@ typical). The others require the link size to be 2. */x
<!testsaved8
-%-- Generated from: ^[aL](?P<name>(?:[AaLl]+)[^xX-]*?)(?P<other>[\x{150}-\x{250}\x{300}]|[^\x{800}aAs-uS-U\x{d800}-\x{dfff}])++[^#\b\x{500}\x{1000}]{3,5}$
+%-- Generated from:
+ /^[aL](?P<name>(?:[AaLl]+)[^xX-]*?)(?P<other>[\x{150}-\x{250}\x{300}]|
+ [^\x{800}aAs-uS-U\x{d800}-\x{dfff}])++[^#\b\x{500}\x{1000}]{3,5}$
+ /x
+
In 16-bit mode with options: S>testdata/saved16LE-1
FS>testdata/saved16BE-1
In 32-bit mode with options: S>testdata/saved32LE-1
diff --git a/testdata/testinput22 b/testdata/testinput22
index 58239f1..ca408db 100644
--- a/testdata/testinput22
+++ b/testdata/testinput22
@@ -1,10 +1,15 @@
/-- Tests for reloading pre-compile patterns with UTF-16 or UTF-32 support. */
-%-- Generated from: (?P<cbra1>[aZ\x{400}-\x{10ffff}]{4,}[\x{f123}\x{10039}\x{20000}-\x{21234}]?|[A-Cx-z\x{100000}-\x{1000a7}\x{101234}])(?<cb2>[^az])
- In 16-bit mode with options: S8>testdata/saved16LE-1
- FS8>testdata/saved16BE-1
- In 32-bit mode with options: S8>testdata/saved32LE-1
- FS8testdata/saved32BE-1
+%-- Generated from:
+ /(?P<cbra1>[aZ\x{400}-\x{10ffff}]{4,}
+ [\x{f123}\x{10039}\x{20000}-\x{21234}]?|
+ [A-Cx-z\x{100000}-\x{1000a7}\x{101234}])
+ (?<cb2>[^az])/x
+
+ In 16-bit mode with options: S8>testdata/saved16LE-2
+ FS8>testdata/saved16BE-2
+ In 32-bit mode with options: S8>testdata/saved32LE-2
+ FS8>testdata/saved32BE-2
--%8x
<!testsaved16LE-2
diff --git a/testdata/testoutput1 b/testdata/testoutput1
index 73fb1f1..ed5755d 100644
--- a/testdata/testoutput1
+++ b/testdata/testoutput1
@@ -9204,4 +9204,30 @@ No match
0:
0+ cd
+/(?:(?<n>foo)|(?<n>bar))\k<n>/J
+ foofoo
+ 0: foofoo
+ 1: foo
+ barbar
+ 0: barbar
+ 1: <unset>
+ 2: bar
+
+/(?<n>A)(?:(?<n>foo)|(?<n>bar))\k<n>/J
+ AfooA
+ 0: AfooA
+ 1: A
+ 2: foo
+ AbarA
+ 0: AbarA
+ 1: A
+ 2: <unset>
+ 3: bar
+ ** Failers
+No match
+ Afoofoo
+No match
+ Abarbar
+No match
+
/-- End of testinput1 --/
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index fad6159..8e13a6e 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -12734,4 +12734,38 @@ Error -21 (recursion limit exceeded)
backgammon
0: ba
+/(?|(?<n>f)|(?<n>b))/JI
+Capturing subpattern count = 1
+Named capturing subpatterns:
+ n 1
+Options: dupnames
+No first char
+No need char
+
+/(?<a>abc)(?<a>z)\k<a>()/JDZS
+------------------------------------------------------------------
+ Bra
+ CBra 1
+ abc
+ Ket
+ CBra 2
+ z
+ Ket
+ \k<a>2
+ CBra 3
+ Ket
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 3
+Max back reference = 2
+Named capturing subpatterns:
+ a 1
+ a 2
+Options: dupnames
+First char = 'a'
+Need char = 'z'
+Subject length lower bound = 5
+No set of starting bytes
+
/-- End of testinput2 --/
diff --git a/testdata/testoutput21-16 b/testdata/testoutput21-16
index e831888..0e21350 100644
--- a/testdata/testoutput21-16
+++ b/testdata/testoutput21-16
@@ -8,7 +8,11 @@ No study data
Error -28 from pcre16_fullinfo(0)
Running in 16-bit mode but pattern was compiled in 8-bit mode
-%-- Generated from: ^[aL](?P<name>(?:[AaLl]+)[^xX-]*?)(?P<other>[\x{150}-\x{250}\x{300}]|[^\x{800}aAs-uS-U\x{d800}-\x{dfff}])++[^#\b\x{500}\x{1000}]{3,5}$
+%-- Generated from:
+ /^[aL](?P<name>(?:[AaLl]+)[^xX-]*?)(?P<other>[\x{150}-\x{250}\x{300}]|
+ [^\x{800}aAs-uS-U\x{d800}-\x{dfff}])++[^#\b\x{500}\x{1000}]{3,5}$
+ /x
+
In 16-bit mode with options: S>testdata/saved16LE-1
FS>testdata/saved16BE-1
In 32-bit mode with options: S>testdata/saved32LE-1
@@ -42,7 +46,7 @@ Capturing subpattern count = 2
Named capturing subpatterns:
name 1
other 2
-Options: anchored
+Options: anchored extended
No first char
No need char
Subject length lower bound = 6
@@ -75,7 +79,7 @@ Capturing subpattern count = 2
Named capturing subpatterns:
name 1
other 2
-Options: anchored
+Options: anchored extended
No first char
No need char
Subject length lower bound = 6
diff --git a/testdata/testoutput21-32 b/testdata/testoutput21-32
index c6e8f6c..183487a 100644
--- a/testdata/testoutput21-32
+++ b/testdata/testoutput21-32
@@ -8,7 +8,11 @@ No study data
Error -28 from pcre32_fullinfo(0)
Running in 32-bit mode but pattern was compiled in 8-bit mode
-%-- Generated from: ^[aL](?P<name>(?:[AaLl]+)[^xX-]*?)(?P<other>[\x{150}-\x{250}\x{300}]|[^\x{800}aAs-uS-U\x{d800}-\x{dfff}])++[^#\b\x{500}\x{1000}]{3,5}$
+%-- Generated from:
+ /^[aL](?P<name>(?:[AaLl]+)[^xX-]*?)(?P<other>[\x{150}-\x{250}\x{300}]|
+ [^\x{800}aAs-uS-U\x{d800}-\x{dfff}])++[^#\b\x{500}\x{1000}]{3,5}$
+ /x
+
In 16-bit mode with options: S>testdata/saved16LE-1
FS>testdata/saved16BE-1
In 32-bit mode with options: S>testdata/saved32LE-1
@@ -54,7 +58,7 @@ Capturing subpattern count = 2
Named capturing subpatterns:
name 1
other 2
-Options: anchored
+Options: anchored extended
No first char
No need char
Subject length lower bound = 6
@@ -87,7 +91,7 @@ Capturing subpattern count = 2
Named capturing subpatterns:
name 1
other 2
-Options: anchored
+Options: anchored extended
No first char
No need char
Subject length lower bound = 6
diff --git a/testdata/testoutput22-16 b/testdata/testoutput22-16
index b2c673d..f896b13 100644
--- a/testdata/testoutput22-16
+++ b/testdata/testoutput22-16
@@ -1,10 +1,15 @@
/-- Tests for reloading pre-compile patterns with UTF-16 or UTF-32 support. */
-%-- Generated from: (?P<cbra1>[aZ\x{400}-\x{10ffff}]{4,}[\x{f123}\x{10039}\x{20000}-\x{21234}]?|[A-Cx-z\x{100000}-\x{1000a7}\x{101234}])(?<cb2>[^az])
- In 16-bit mode with options: S8>testdata/saved16LE-1
- FS8>testdata/saved16BE-1
- In 32-bit mode with options: S8>testdata/saved32LE-1
- FS8testdata/saved32BE-1
+%-- Generated from:
+ /(?P<cbra1>[aZ\x{400}-\x{10ffff}]{4,}
+ [\x{f123}\x{10039}\x{20000}-\x{21234}]?|
+ [A-Cx-z\x{100000}-\x{1000a7}\x{101234}])
+ (?<cb2>[^az])/x
+
+ In 16-bit mode with options: S8>testdata/saved16LE-2
+ FS8>testdata/saved16BE-2
+ In 32-bit mode with options: S8>testdata/saved32LE-2
+ FS8>testdata/saved32BE-2
--%8x
<!testsaved16LE-2
@@ -28,7 +33,7 @@ Capturing subpattern count = 2
Named capturing subpatterns:
cb2 2
cbra1 1
-Options: utf
+Options: extended utf
No first char
No need char
Subject length lower bound = 2
@@ -55,7 +60,7 @@ Capturing subpattern count = 2
Named capturing subpatterns:
cb2 2
cbra1 1
-Options: utf
+Options: extended utf
No first char
No need char
Subject length lower bound = 2
diff --git a/testdata/testoutput22-32 b/testdata/testoutput22-32
index 5a1d0da..783926b 100644
--- a/testdata/testoutput22-32
+++ b/testdata/testoutput22-32
@@ -1,10 +1,15 @@
/-- Tests for reloading pre-compile patterns with UTF-16 or UTF-32 support. */
-%-- Generated from: (?P<cbra1>[aZ\x{400}-\x{10ffff}]{4,}[\x{f123}\x{10039}\x{20000}-\x{21234}]?|[A-Cx-z\x{100000}-\x{1000a7}\x{101234}])(?<cb2>[^az])
- In 16-bit mode with options: S8>testdata/saved16LE-1
- FS8>testdata/saved16BE-1
- In 32-bit mode with options: S8>testdata/saved32LE-1
- FS8testdata/saved32BE-1
+%-- Generated from:
+ /(?P<cbra1>[aZ\x{400}-\x{10ffff}]{4,}
+ [\x{f123}\x{10039}\x{20000}-\x{21234}]?|
+ [A-Cx-z\x{100000}-\x{1000a7}\x{101234}])
+ (?<cb2>[^az])/x
+
+ In 16-bit mode with options: S8>testdata/saved16LE-2
+ FS8>testdata/saved16BE-2
+ In 32-bit mode with options: S8>testdata/saved32LE-2
+ FS8>testdata/saved32BE-2
--%8x
<!testsaved16LE-2
@@ -40,7 +45,7 @@ Capturing subpattern count = 2
Named capturing subpatterns:
cb2 2
cbra1 1
-Options: utf
+Options: extended utf
No first char
No need char
Subject length lower bound = 2
@@ -67,7 +72,7 @@ Capturing subpattern count = 2
Named capturing subpatterns:
cb2 2
cbra1 1
-Options: utf
+Options: extended utf
No first char
No need char
Subject length lower bound = 2