summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2011-11-19 18:32:18 +0000
committer ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2011-11-19 18:32:18 +0000
commit 8f0ed27a10ee3efb8b11a044637144fb8fc6641f (patch)
tree c74caa3f756e12f475c840392d507a89bcfe8bc8
parent 8e93f278ded1ac082d15af60dcf24fc6a8d2a672 (diff)
download pcre-8f0ed27a10ee3efb8b11a044637144fb8fc6641f.tar.gz
Support \C in lookbehinds and DFA matching when not in UTF-8 mode.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@754 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- ChangeLog 2
-rw-r--r-- doc/pcrematching.3 8
-rw-r--r-- doc/pcrepattern.3 14
-rw-r--r-- pcre_compile.c 16
-rw-r--r-- pcre_internal.h 4
-rw-r--r-- pcre_study.c 4
-rw-r--r-- testdata/testinput1 13
-rw-r--r-- testdata/testinput2 2
-rw-r--r-- testdata/testinput4 3
-rw-r--r-- testdata/testinput7 6
-rw-r--r-- testdata/testinput8 5
-rw-r--r-- testdata/testoutput1 18
-rw-r--r-- testdata/testoutput2 3
-rw-r--r-- testdata/testoutput4 4
-rw-r--r-- testdata/testoutput7 8
-rw-r--r-- testdata/testoutput8 7
16 files changed, 97 insertions, 20 deletions
diff --git a/ChangeLog b/ChangeLog
index 7792ca7..1cc0d94 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -49,6 +49,8 @@ Version 8.21
12. Updated pcre-config so that it no longer shows -L/usr/lib, which seems
best practice nowadays, and helps with cross-compiling. (If the exec_prefix
is anything other than /usr, -L is still shown).
+
+13. In non-UTF-8 mode, \C is now supported in lookbehinds and DFA matching.
Version 8.20 21-Oct-2011
diff --git a/doc/pcrematching.3 b/doc/pcrematching.3
index 4c88322..4ca806b 100644
--- a/doc/pcrematching.3
+++ b/doc/pcrematching.3
@@ -132,9 +132,9 @@ and not on others), is not supported. It causes an error if encountered.
always 1, and the value of the \fIcapture_last\fP field is always -1.
.P
7. The \eC escape sequence, which (in the standard algorithm) matches a single
-byte, even in UTF-8 mode, is not supported because the alternative algorithm
-moves through the subject string one character at a time, for all active paths
-through the tree.
+byte, even in UTF-8 mode, is not supported in UTF-8 mode, because the
+alternative algorithm moves through the subject string one character at a time,
+for all active paths through the tree.
.P
8. Except for (*FAIL), the backtracking control verbs such as (*PRUNE) are not
supported. (*FAIL) is supported, and behaves like a failing negative assertion.
@@ -191,6 +191,6 @@ Cambridge CB2 3QH, England.
.rs
.sp
.nf
-Last updated: 17 November 2010
+Last updated: 19 November 2011
Copyright (c) 1997-2010 University of Cambridge.
.fi
diff --git a/doc/pcrepattern.3 b/doc/pcrepattern.3
index e35bf71..18def50 100644
--- a/doc/pcrepattern.3
+++ b/doc/pcrepattern.3
@@ -1003,9 +1003,9 @@ processing unless the PCRE_NO_UTF8_CHECK option is used).
PCRE does not allow \eC to appear in lookbehind assertions
.\" HTML <a href="#lookbehind">
.\" </a>
-(described below),
+(described below)
.\"
-because in UTF-8 mode this would make it impossible to calculate the length of
+in UTF-8 mode, because this would make it impossible to calculate the length of
the lookbehind.
.P
In general, the \eC escape sequence is best avoided in UTF-8 mode. However, one
@@ -1970,10 +1970,10 @@ temporarily move the current position back by the fixed length and then try to
match. If there are insufficient characters before the current position, the
assertion fails.
.P
-PCRE does not allow the \eC escape (which matches a single byte in UTF-8 mode)
-to appear in lookbehind assertions, because it makes it impossible to calculate
-the length of the lookbehind. The \eX and \eR escapes, which can match
-different numbers of bytes, are also not permitted.
+In UTF-8 mode, PCRE does not allow the \eC escape (which matches a single byte,
+even in UTF-8 mode) to appear in lookbehind assertions, because it makes it
+impossible to calculate the length of the lookbehind. The \eX and \eR escapes,
+which can match different numbers of bytes, are also not permitted.
.P
.\" HTML <a href="#subpatternsassubroutines">
.\" </a>
@@ -2874,6 +2874,6 @@ Cambridge CB2 3QH, England.
.rs
.sp
.nf
-Last updated: 14 November 2011
+Last updated: 19 November 2011
Copyright (c) 1997-2011 University of Cambridge.
.fi
diff --git a/pcre_compile.c b/pcre_compile.c
index 45705ca..0252261 100644
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -1528,7 +1528,7 @@ Arguments:
Returns: the fixed length,
or -1 if there is no fixed length,
- or -2 if \C was encountered
+ or -2 if \C was encountered (in UTF-8 mode only)
or -3 if an OP_RECURSE item was encountered and atend is FALSE
or -4 if an unknown opcode was encountered (internal error)
*/
@@ -1702,7 +1702,8 @@ for (;;)
cc++;
break;
- /* The single-byte matcher isn't allowed */
+ /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
+ otherwise \C is coded as OP_ALLANY. */
case OP_ANYBYTE:
return -2;
@@ -5600,8 +5601,8 @@ for (;; ptr++)
/* ------------------------------------------------------------ */
case CHAR_C: /* Callout - may be followed by digits; */
- previous_callout = code; /* Save for later completion */
- after_manual_callout = 1; /* Skip one item before completing */
+ previous_callout = code; /* Save for later completion */
+ after_manual_callout = 1; /* Skip one item before completing */
*code++ = OP_CALLOUT;
{
int n = 0;
@@ -6478,9 +6479,12 @@ for (;; ptr++)
}
else
#endif
- {
+ /* In non-UTF-8 mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE
+ so that it works in DFA mode and in lookbehinds. */
+
+ {
previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
- *code++ = -c;
+ *code++ = (!utf8 && c == -ESC_C)? OP_ALLANY : -c;
}
}
continue;
diff --git a/pcre_internal.h b/pcre_internal.h
index 2d02e5d..0c5d676 100644
--- a/pcre_internal.h
+++ b/pcre_internal.h
@@ -1252,8 +1252,8 @@ value such as \n. They must have non-zero values, as check_escape() returns
their negation. Also, they must appear in the same order as in the opcode
definitions below, up to ESC_z. There's a dummy for OP_ALLANY because it
corresponds to "." in DOTALL mode rather than an escape sequence. It is also
-used for [^] in JavaScript compatibility mode. In non-DOTALL mode, "." behaves
-like \N.
+used for [^] in JavaScript compatibility mode, and for \C in non-UTF-8 mode. In
+non-DOTALL mode, "." behaves like \N.
The special values ESC_DU, ESC_du, etc. are used instead of ESC_D, ESC_d, etc.
when PCRE_UCP is set, when replacement of \d etc by \p sequences is required.
diff --git a/pcre_study.c b/pcre_study.c
index d4b2100..9da92bf 100644
--- a/pcre_study.c
+++ b/pcre_study.c
@@ -286,7 +286,9 @@ for (;;)
cc++;
break;
- /* The single-byte matcher means we can't proceed in UTF-8 mode */
+ /* The single-byte matcher means we can't proceed in UTF-8 mode. (In
+ non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever
+ appear, but leave the code, just in case.) */
case OP_ANYBYTE:
#ifdef SUPPORT_UTF8
diff --git a/testdata/testinput1 b/testdata/testinput1
index 61fd20b..36d7028 100644
--- a/testdata/testinput1
+++ b/testdata/testinput1
@@ -4305,4 +4305,17 @@
** Failers
aaaaaa
+/ab\Cde/
+ abXde
+
+/(?<=ab\Cde)X/
+ abZdeX
+
+/a[\CD]b/
+ aCb
+ aDb
+
+/a[\C-X]b/
+ aJb
+
/-- End of testinput1 --/
diff --git a/testdata/testinput2 b/testdata/testinput2
index 61bbeba..b673fef 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -4007,4 +4007,6 @@ AbcdCBefgBhiBqz
/(?(?=c)c|d)*+Y/BZ
+/(?<=ab\Cde)X/8
+
/-- End of testinput2 --/
diff --git a/testdata/testinput4 b/testdata/testinput4
index 6a04f6a..b339f71 100644
--- a/testdata/testinput4
+++ b/testdata/testinput4
@@ -650,4 +650,7 @@
/(abc)\1/8
abc
+/ab\Cde/8
+ abXde
+
/-- End of testinput4 --/
diff --git a/testdata/testinput7 b/testdata/testinput7
index fd6a942..3ad201e 100644
--- a/testdata/testinput7
+++ b/testdata/testinput7
@@ -4703,4 +4703,10 @@
\O6aaaa
\O8aaaa
+/ab\Cde/
+ abXde
+
+/(?<=ab\Cde)X/
+ abZdeX
+
/-- End of testinput7 --/
diff --git a/testdata/testinput8 b/testdata/testinput8
index 55d2fd3..fc05761 100644
--- a/testdata/testinput8
+++ b/testdata/testinput8
@@ -700,4 +700,9 @@
a\x{123}aa\>5
a\x{123}aa\>6
+/ab\Cde/8
+ abXde
+
+/(?<=ab\Cde)X/8
+
/-- End of testinput8 --/
diff --git a/testdata/testoutput1 b/testdata/testoutput1
index 2557fe2..bc237ab 100644
--- a/testdata/testoutput1
+++ b/testdata/testoutput1
@@ -7035,4 +7035,22 @@ No match
aaaaaa
No match
+/ab\Cde/
+ abXde
+ 0: abXde
+
+/(?<=ab\Cde)X/
+ abZdeX
+ 0: X
+
+/a[\CD]b/
+ aCb
+ 0: aCb
+ aDb
+ 0: aDb
+
+/a[\C-X]b/
+ aJb
+ 0: aJb
+
/-- End of testinput1 --/
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index 58874a7..417225a 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -12591,4 +12591,7 @@ No match
End
------------------------------------------------------------------
+/(?<=ab\Cde)X/8
+Failed: \C not allowed in lookbehind assertion at offset 10
+
/-- End of testinput2 --/
diff --git a/testdata/testoutput4 b/testdata/testoutput4
index 00f3673..2f1b4fd 100644
--- a/testdata/testoutput4
+++ b/testdata/testoutput4
@@ -1136,4 +1136,8 @@ No match
abc
No match
+/ab\Cde/8
+ abXde
+ 0: abXde
+
/-- End of testinput4 --/
diff --git a/testdata/testoutput7 b/testdata/testoutput7
index 21107d4..f8eb18d 100644
--- a/testdata/testoutput7
+++ b/testdata/testoutput7
@@ -7858,4 +7858,12 @@ Matched, but too many subsidiary matches
2: aa
3: a
+/ab\Cde/
+ abXde
+ 0: abXde
+
+/(?<=ab\Cde)X/
+ abZdeX
+ 0: X
+
/-- End of testinput7 --/
diff --git a/testdata/testoutput8 b/testdata/testoutput8
index 737226f..0c569b3 100644
--- a/testdata/testoutput8
+++ b/testdata/testoutput8
@@ -1348,4 +1348,11 @@ No match
a\x{123}aa\>6
Error -24 (bad offset value)
+/ab\Cde/8
+ abXde
+Error -16 (item unsupported for DFA matching)
+
+/(?<=ab\Cde)X/8
+Failed: \C not allowed in lookbehind assertion at offset 10
+
/-- End of testinput8 --/