Support \C in lookbehinds and DFA matching when not in UTF-8 mode.

git-svn-id: svn://vcs.exim.org/pcre/code/trunk@754 2f5784b3-3f2a-0410-8824-cb99058d5e15
author: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2011-11-19 18:32:18 +0000
committer: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2011-11-19 18:32:18 +0000
commit: 8f0ed27a10ee3efb8b11a044637144fb8fc6641f (patch)
tree: c74caa3f756e12f475c840392d507a89bcfe8bc8
parent: 8e93f278ded1ac082d15af60dcf24fc6a8d2a672 (diff)
download: pcre-8f0ed27a10ee3efb8b11a044637144fb8fc6641f.tar.gz
16 files changed, 97 insertions, 20 deletions
diff --git a/ChangeLog b/ChangeLog
index 7792ca7..1cc0d94 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -49,6 +49,8 @@ Version 8.21
 12. Updated pcre-config so that it no longer shows -L/usr/lib, which seems
     best practice nowadays, and helps with cross-compiling. (If the exec_prefix 
     is anything other than /usr, -L is still shown). 
+    
+13. In non-UTF-8 mode, \C is now supported in lookbehinds and DFA matching.
 
 
 Version 8.20 21-Oct-2011
diff --git a/doc/pcrematching.3 b/doc/pcrematching.3
index 4c88322..4ca806b 100644
--- a/doc/pcrematching.3
+++ b/doc/pcrematching.3
@@ -132,9 +132,9 @@ and not on others), is not supported. It causes an error if encountered.
 always 1, and the value of the \fIcapture_last\fP field is always -1.
 .P
 7. The \eC escape sequence, which (in the standard algorithm) matches a single
-byte, even in UTF-8 mode, is not supported because the alternative algorithm
-moves through the subject string one character at a time, for all active paths
-through the tree.
+byte, even in UTF-8 mode, is not supported in that mode, because the
+alternative algorithm moves through the subject string one character at a time,
+for all active paths through the tree.
 .P
 8. Except for (*FAIL), the backtracking control verbs such as (*PRUNE) are not
 supported. (*FAIL) is supported, and behaves like a failing negative assertion.
@@ -191,6 +191,6 @@ Cambridge CB2 3QH, England.
 .rs
 .sp
 .nf
-Last updated: 17 November 2010
+Last updated: 19 November 2011
 Copyright (c) 1997-2010 University of Cambridge.
 .fi
diff --git a/doc/pcrepattern.3 b/doc/pcrepattern.3
index e35bf71..18def50 100644
--- a/doc/pcrepattern.3
+++ b/doc/pcrepattern.3
@@ -1003,9 +1003,9 @@ processing unless the PCRE_NO_UTF8_CHECK option is used).
 PCRE does not allow \eC to appear in lookbehind assertions
 .\" HTML <a href="#lookbehind">
 .\" </a>
-(described below),
+(described below)
 .\"
-because in UTF-8 mode this would make it impossible to calculate the length of
+in UTF-8 mode, because this would make it impossible to calculate the length of
 the lookbehind.
 .P
 In general, the \eC escape sequence is best avoided in UTF-8 mode. However, one
@@ -1970,10 +1970,10 @@ temporarily move the current position back by the fixed length and then try to
 match. If there are insufficient characters before the current position, the
 assertion fails.
 .P
-PCRE does not allow the \eC escape (which matches a single byte in UTF-8 mode)
-to appear in lookbehind assertions, because it makes it impossible to calculate
-the length of the lookbehind. The \eX and \eR escapes, which can match
-different numbers of bytes, are also not permitted.
+In UTF-8 mode, PCRE does not allow the \eC escape (which matches a single byte,
+even in UTF-8 mode) to appear in lookbehind assertions, because it makes it
+impossible to calculate the length of the lookbehind. The \eX and \eR escapes,
+which can match different numbers of bytes, are also not permitted.
 .P
 .\" HTML <a href="#subpatternsassubroutines">
 .\" </a>
@@ -2874,6 +2874,6 @@ Cambridge CB2 3QH, England.
 .rs
 .sp
 .nf
-Last updated: 14 November 2011
+Last updated: 19 November 2011
 Copyright (c) 1997-2011 University of Cambridge.
 .fi
diff --git a/pcre_compile.c b/pcre_compile.c
index 45705ca..0252261 100644
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -1528,7 +1528,7 @@ Arguments:
 
 Returns:   the fixed length,
              or -1 if there is no fixed length,
-             or -2 if \C was encountered
+             or -2 if \C was encountered (in UTF-8 mode only)
              or -3 if an OP_RECURSE item was encountered and atend is FALSE
              or -4 if an unknown opcode was encountered (internal error)
 */
@@ -1702,7 +1702,8 @@ for (;;)
     cc++;
     break;
 
-    /* The single-byte matcher isn't allowed */
+    /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode; 
+    otherwise \C is coded as OP_ALLANY. */
 
     case OP_ANYBYTE:
     return -2;
@@ -5600,8 +5601,8 @@ for (;; ptr++)
 
         /* ------------------------------------------------------------ */
         case CHAR_C:                 /* Callout - may be followed by digits; */
-        previous_callout = code;  /* Save for later completion */
-        after_manual_callout = 1; /* Skip one item before completing */
+        previous_callout = code;     /* Save for later completion */
+        after_manual_callout = 1;    /* Skip one item before completing */
         *code++ = OP_CALLOUT;
           {
           int n = 0;
@@ -6478,9 +6479,12 @@ for (;; ptr++)
           }
         else
 #endif
-          {
+        /* In non-UTF-8 mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE
+        so that it works in DFA mode and in lookbehinds. */
+         
+          {  
           previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
-          *code++ = -c;
+          *code++ = (!utf8 && c == -ESC_C)? OP_ALLANY : -c;
           }
         }
       continue;
diff --git a/pcre_internal.h b/pcre_internal.h
index 2d02e5d..0c5d676 100644
--- a/pcre_internal.h
+++ b/pcre_internal.h
@@ -1252,8 +1252,8 @@ value such as \n. They must have non-zero values, as check_escape() returns
 their negation. Also, they must appear in the same order as in the opcode
 definitions below, up to ESC_z. There's a dummy for OP_ALLANY because it
 corresponds to "." in DOTALL mode rather than an escape sequence. It is also
-used for [^] in JavaScript compatibility mode. In non-DOTALL mode, "." behaves
-like \N.
+used for [^] in JavaScript compatibility mode, and for \C in non-UTF-8 mode. In
+non-DOTALL mode, "." behaves like \N.
 
 The special values ESC_DU, ESC_du, etc. are used instead of ESC_D, ESC_d, etc.
 when PCRE_UCP is set, when replacement of \d etc by \p sequences is required.
diff --git a/pcre_study.c b/pcre_study.c
index d4b2100..9da92bf 100644
--- a/pcre_study.c
+++ b/pcre_study.c
@@ -286,7 +286,9 @@ for (;;)
     cc++;
     break;
 
-    /* The single-byte matcher means we can't proceed in UTF-8 mode */
+    /* The single-byte matcher means we can't proceed in UTF-8 mode. (In 
+    non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever 
+    appear, but leave the code, just in case.) */
 
     case OP_ANYBYTE:
 #ifdef SUPPORT_UTF8
diff --git a/testdata/testinput1 b/testdata/testinput1
index 61fd20b..36d7028 100644
--- a/testdata/testinput1
+++ b/testdata/testinput1
@@ -4305,4 +4305,17 @@
     ** Failers
     aaaaaa
 
+/ab\Cde/
+    abXde
+    
+/(?<=ab\Cde)X/
+    abZdeX
+
+/a[\CD]b/
+    aCb
+    aDb 
+
+/a[\C-X]b/
+    aJb
+
 /-- End of testinput1 --/
diff --git a/testdata/testinput2 b/testdata/testinput2
index 61bbeba..b673fef 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -4007,4 +4007,6 @@ AbcdCBefgBhiBqz
 
 /(?(?=c)c|d)*+Y/BZ
 
+/(?<=ab\Cde)X/8
+
 /-- End of testinput2 --/
diff --git a/testdata/testinput4 b/testdata/testinput4
index 6a04f6a..b339f71 100644
--- a/testdata/testinput4
+++ b/testdata/testinput4
@@ -650,4 +650,7 @@
 /(abc)\1/8
    abc
 
+/ab\Cde/8
+    abXde
+
 /-- End of testinput4 --/
diff --git a/testdata/testinput7 b/testdata/testinput7
index fd6a942..3ad201e 100644
--- a/testdata/testinput7
+++ b/testdata/testinput7
@@ -4703,4 +4703,10 @@
     \O6aaaa
     \O8aaaa
 
+/ab\Cde/
+    abXde
+    
+/(?<=ab\Cde)X/
+    abZdeX
+
 /-- End of testinput7 --/
diff --git a/testdata/testinput8 b/testdata/testinput8
index 55d2fd3..fc05761 100644
--- a/testdata/testinput8
+++ b/testdata/testinput8
@@ -700,4 +700,9 @@
     a\x{123}aa\>5
     a\x{123}aa\>6
 
+/ab\Cde/8
+    abXde
+
+/(?<=ab\Cde)X/8
+
 /-- End of testinput8 --/ 
diff --git a/testdata/testoutput1 b/testdata/testoutput1
index 2557fe2..bc237ab 100644
--- a/testdata/testoutput1
+++ b/testdata/testoutput1
@@ -7035,4 +7035,22 @@ No match
     aaaaaa
 No match
 
+/ab\Cde/
+    abXde
+ 0: abXde
+    
+/(?<=ab\Cde)X/
+    abZdeX
+ 0: X
+
+/a[\CD]b/
+    aCb
+ 0: aCb
+    aDb 
+ 0: aDb
+
+/a[\C-X]b/
+    aJb
+ 0: aJb
+
 /-- End of testinput1 --/
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index 58874a7..417225a 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -12591,4 +12591,7 @@ No match
         End
 ------------------------------------------------------------------
 
+/(?<=ab\Cde)X/8
+Failed: \C not allowed in lookbehind assertion at offset 10
+
 /-- End of testinput2 --/
diff --git a/testdata/testoutput4 b/testdata/testoutput4
index 00f3673..2f1b4fd 100644
--- a/testdata/testoutput4
+++ b/testdata/testoutput4
@@ -1136,4 +1136,8 @@ No match
    abc
 No match
 
+/ab\Cde/8
+    abXde
+ 0: abXde
+
 /-- End of testinput4 --/
diff --git a/testdata/testoutput7 b/testdata/testoutput7
index 21107d4..f8eb18d 100644
--- a/testdata/testoutput7
+++ b/testdata/testoutput7
@@ -7858,4 +7858,12 @@ Matched, but too many subsidiary matches
  2: aa
  3: a
 
+/ab\Cde/
+    abXde
+ 0: abXde
+    
+/(?<=ab\Cde)X/
+    abZdeX
+ 0: X
+
 /-- End of testinput7 --/
diff --git a/testdata/testoutput8 b/testdata/testoutput8
index 737226f..0c569b3 100644
--- a/testdata/testoutput8
+++ b/testdata/testoutput8
@@ -1348,4 +1348,11 @@ No match
     a\x{123}aa\>6
 Error -24 (bad offset value)
 
+/ab\Cde/8
+    abXde
+Error -16 (item unsupported for DFA matching)
+
+/(?<=ab\Cde)X/8
+Failed: \C not allowed in lookbehind assertion at offset 10
+
 /-- End of testinput8 --/
author	ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2011-11-19 18:32:18 +0000
committer	ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2011-11-19 18:32:18 +0000
commit	8f0ed27a10ee3efb8b11a044637144fb8fc6641f (patch)
tree	c74caa3f756e12f475c840392d507a89bcfe8bc8
parent	8e93f278ded1ac082d15af60dcf24fc6a8d2a672 (diff)
download	pcre-8f0ed27a10ee3efb8b11a044637144fb8fc6641f.tar.gz