diff options
author | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2011-11-19 18:32:18 +0000 |
---|---|---|
committer | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2011-11-19 18:32:18 +0000 |
commit | 8f0ed27a10ee3efb8b11a044637144fb8fc6641f (patch) | |
tree | c74caa3f756e12f475c840392d507a89bcfe8bc8 | |
parent | 8e93f278ded1ac082d15af60dcf24fc6a8d2a672 (diff) | |
download | pcre-8f0ed27a10ee3efb8b11a044637144fb8fc6641f.tar.gz |
Support \C in lookbehinds and DFA matching when not in UTF-8 mode.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@754 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- | ChangeLog | 2 | ||||
-rw-r--r-- | doc/pcrematching.3 | 8 | ||||
-rw-r--r-- | doc/pcrepattern.3 | 14 | ||||
-rw-r--r-- | pcre_compile.c | 16 | ||||
-rw-r--r-- | pcre_internal.h | 4 | ||||
-rw-r--r-- | pcre_study.c | 4 | ||||
-rw-r--r-- | testdata/testinput1 | 13 | ||||
-rw-r--r-- | testdata/testinput2 | 2 | ||||
-rw-r--r-- | testdata/testinput4 | 3 | ||||
-rw-r--r-- | testdata/testinput7 | 6 | ||||
-rw-r--r-- | testdata/testinput8 | 5 | ||||
-rw-r--r-- | testdata/testoutput1 | 18 | ||||
-rw-r--r-- | testdata/testoutput2 | 3 | ||||
-rw-r--r-- | testdata/testoutput4 | 4 | ||||
-rw-r--r-- | testdata/testoutput7 | 8 | ||||
-rw-r--r-- | testdata/testoutput8 | 7 |
16 files changed, 97 insertions, 20 deletions
@@ -49,6 +49,8 @@ Version 8.21 12. Updated pcre-config so that it no longer shows -L/usr/lib, which seems best practice nowadays, and helps with cross-compiling. (If the exec_prefix is anything other than /usr, -L is still shown). + +13. In non-UTF-8 mode, \C is now supported in lookbehinds and DFA matching. Version 8.20 21-Oct-2011 diff --git a/doc/pcrematching.3 b/doc/pcrematching.3 index 4c88322..4ca806b 100644 --- a/doc/pcrematching.3 +++ b/doc/pcrematching.3 @@ -132,9 +132,9 @@ and not on others), is not supported. It causes an error if encountered. always 1, and the value of the \fIcapture_last\fP field is always -1. .P 7. The \eC escape sequence, which (in the standard algorithm) matches a single -byte, even in UTF-8 mode, is not supported because the alternative algorithm -moves through the subject string one character at a time, for all active paths -through the tree. +byte, even in UTF-8 mode, is not supported in UTF-8 mode, because the +alternative algorithm moves through the subject string one character at a time, +for all active paths through the tree. .P 8. Except for (*FAIL), the backtracking control verbs such as (*PRUNE) are not supported. (*FAIL) is supported, and behaves like a failing negative assertion. @@ -191,6 +191,6 @@ Cambridge CB2 3QH, England. .rs .sp .nf -Last updated: 17 November 2010 +Last updated: 19 November 2011 Copyright (c) 1997-2010 University of Cambridge. .fi diff --git a/doc/pcrepattern.3 b/doc/pcrepattern.3 index e35bf71..18def50 100644 --- a/doc/pcrepattern.3 +++ b/doc/pcrepattern.3 @@ -1003,9 +1003,9 @@ processing unless the PCRE_NO_UTF8_CHECK option is used). PCRE does not allow \eC to appear in lookbehind assertions .\" HTML <a href="#lookbehind"> .\" </a> -(described below), +(described below) .\" -because in UTF-8 mode this would make it impossible to calculate the length of +in UTF-8 mode, because this would make it impossible to calculate the length of the lookbehind. .P In general, the \eC escape sequence is best avoided in UTF-8 mode. However, one @@ -1970,10 +1970,10 @@ temporarily move the current position back by the fixed length and then try to match. If there are insufficient characters before the current position, the assertion fails. .P -PCRE does not allow the \eC escape (which matches a single byte in UTF-8 mode) -to appear in lookbehind assertions, because it makes it impossible to calculate -the length of the lookbehind. The \eX and \eR escapes, which can match -different numbers of bytes, are also not permitted. +In UTF-8 mode, PCRE does not allow the \eC escape (which matches a single byte, +even in UTF-8 mode) to appear in lookbehind assertions, because it makes it +impossible to calculate the length of the lookbehind. The \eX and \eR escapes, +which can match different numbers of bytes, are also not permitted. .P .\" HTML <a href="#subpatternsassubroutines"> .\" </a> @@ -2874,6 +2874,6 @@ Cambridge CB2 3QH, England. .rs .sp .nf -Last updated: 14 November 2011 +Last updated: 19 November 2011 Copyright (c) 1997-2011 University of Cambridge. .fi diff --git a/pcre_compile.c b/pcre_compile.c index 45705ca..0252261 100644 --- a/pcre_compile.c +++ b/pcre_compile.c @@ -1528,7 +1528,7 @@ Arguments: Returns: the fixed length, or -1 if there is no fixed length, - or -2 if \C was encountered + or -2 if \C was encountered (in UTF-8 mode only) or -3 if an OP_RECURSE item was encountered and atend is FALSE or -4 if an unknown opcode was encountered (internal error) */ @@ -1702,7 +1702,8 @@ for (;;) cc++; break; - /* The single-byte matcher isn't allowed */ + /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode; + otherwise \C is coded as OP_ALLANY. */ case OP_ANYBYTE: return -2; @@ -5600,8 +5601,8 @@ for (;; ptr++) /* ------------------------------------------------------------ */ case CHAR_C: /* Callout - may be followed by digits; */ - previous_callout = code; /* Save for later completion */ - after_manual_callout = 1; /* Skip one item before completing */ + previous_callout = code; /* Save for later completion */ + after_manual_callout = 1; /* Skip one item before completing */ *code++ = OP_CALLOUT; { int n = 0; @@ -6478,9 +6479,12 @@ for (;; ptr++) } else #endif - { + /* In non-UTF-8 mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE + so that it works in DFA mode and in lookbehinds. */ + + { previous = (-c > ESC_b && -c < ESC_Z)? code : NULL; - *code++ = -c; + *code++ = (!utf8 && c == -ESC_C)? OP_ALLANY : -c; } } continue; diff --git a/pcre_internal.h b/pcre_internal.h index 2d02e5d..0c5d676 100644 --- a/pcre_internal.h +++ b/pcre_internal.h @@ -1252,8 +1252,8 @@ value such as \n. They must have non-zero values, as check_escape() returns their negation. Also, they must appear in the same order as in the opcode definitions below, up to ESC_z. There's a dummy for OP_ALLANY because it corresponds to "." in DOTALL mode rather than an escape sequence. It is also -used for [^] in JavaScript compatibility mode. In non-DOTALL mode, "." behaves -like \N. +used for [^] in JavaScript compatibility mode, and for \C in non-utf8 mode. In +non-DOTALL mode, "." behaves like \N. The special values ESC_DU, ESC_du, etc. are used instead of ESC_D, ESC_d, etc. when PCRE_UCP is set, when replacement of \d etc by \p sequences is required. diff --git a/pcre_study.c b/pcre_study.c index d4b2100..9da92bf 100644 --- a/pcre_study.c +++ b/pcre_study.c @@ -286,7 +286,9 @@ for (;;) cc++; break; - /* The single-byte matcher means we can't proceed in UTF-8 mode */ + /* The single-byte matcher means we can't proceed in UTF-8 mode. (In + non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever + appear, but leave the code, just in case.) */ case OP_ANYBYTE: #ifdef SUPPORT_UTF8 diff --git a/testdata/testinput1 b/testdata/testinput1 index 61fd20b..36d7028 100644 --- a/testdata/testinput1 +++ b/testdata/testinput1 @@ -4305,4 +4305,17 @@ ** Failers aaaaaa +/ab\Cde/ + abXde + +/(?<=ab\Cde)X/ + abZdeX + +/a[\CD]b/ + aCb + aDb + +/a[\C-X]b/ + aJb + /-- End of testinput1 --/ diff --git a/testdata/testinput2 b/testdata/testinput2 index 61bbeba..b673fef 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -4007,4 +4007,6 @@ AbcdCBefgBhiBqz /(?(?=c)c|d)*+Y/BZ +/(?<=ab\Cde)X/8 + /-- End of testinput2 --/ diff --git a/testdata/testinput4 b/testdata/testinput4 index 6a04f6a..b339f71 100644 --- a/testdata/testinput4 +++ b/testdata/testinput4 @@ -650,4 +650,7 @@ /(abc)\1/8 abc +/ab\Cde/8 + abXde + /-- End of testinput4 --/ diff --git a/testdata/testinput7 b/testdata/testinput7 index fd6a942..3ad201e 100644 --- a/testdata/testinput7 +++ b/testdata/testinput7 @@ -4703,4 +4703,10 @@ \O6aaaa \O8aaaa +/ab\Cde/ + abXde + +/(?<=ab\Cde)X/ + abZdeX + /-- End of testinput7 --/ diff --git a/testdata/testinput8 b/testdata/testinput8 index 55d2fd3..fc05761 100644 --- a/testdata/testinput8 +++ b/testdata/testinput8 @@ -700,4 +700,9 @@ a\x{123}aa\>5 a\x{123}aa\>6 +/ab\Cde/8 + abXde + +/(?<=ab\Cde)X/8 + /-- End of testinput8 --/ diff --git a/testdata/testoutput1 b/testdata/testoutput1 index 2557fe2..bc237ab 100644 --- a/testdata/testoutput1 +++ b/testdata/testoutput1 @@ -7035,4 +7035,22 @@ No match aaaaaa No match +/ab\Cde/ + abXde + 0: abXde + +/(?<=ab\Cde)X/ + abZdeX + 0: X + +/a[\CD]b/ + aCb + 0: aCb + aDb + 0: aDb + +/a[\C-X]b/ + aJb + 0: aJb + /-- End of testinput1 --/ diff --git a/testdata/testoutput2 b/testdata/testoutput2 index 58874a7..417225a 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -12591,4 +12591,7 @@ No match End ------------------------------------------------------------------ +/(?<=ab\Cde)X/8 +Failed: \C not allowed in lookbehind assertion at offset 10 + /-- End of testinput2 --/ diff --git a/testdata/testoutput4 b/testdata/testoutput4 index 00f3673..2f1b4fd 100644 --- a/testdata/testoutput4 +++ b/testdata/testoutput4 @@ -1136,4 +1136,8 @@ No match abc No match +/ab\Cde/8 + abXde + 0: abXde + /-- End of testinput4 --/ diff --git a/testdata/testoutput7 b/testdata/testoutput7 index 21107d4..f8eb18d 100644 --- a/testdata/testoutput7 +++ b/testdata/testoutput7 @@ -7858,4 +7858,12 @@ Matched, but too many subsidiary matches 2: aa 3: a +/ab\Cde/ + abXde + 0: abXde + +/(?<=ab\Cde)X/ + abZdeX + 0: X + /-- End of testinput7 --/ diff --git a/testdata/testoutput8 b/testdata/testoutput8 index 737226f..0c569b3 100644 --- a/testdata/testoutput8 +++ b/testdata/testoutput8 @@ -1348,4 +1348,11 @@ No match a\x{123}aa\>6 Error -24 (bad offset value) +/ab\Cde/8 + abXde +Error -16 (item unsupported for DFA matching) + +/(?<=ab\Cde)X/8 +Failed: \C not allowed in lookbehind assertion at offset 10 + /-- End of testinput8 --/ |