Give error if \c is followed by a byte > 127 (in ASCII/UTF-8 modes).

git-svn-id: svn://vcs.exim.org/pcre/code/trunk@574 2f5784b3-3f2a-0410-8824-cb99058d5e15
author: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2010-11-20 17:47:27 +0000
committer: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2010-11-20 17:47:27 +0000
commit: b363a6392769a2cadb9175b9a0cc79bd7ecdbd99 (patch)
tree: 31f238229845c20395ad1c7adae0472535cacca1
parent: df3e1f5efe8ad07dd34302fa929a13a320d8f76a (diff)
download: pcre-b363a6392769a2cadb9175b9a0cc79bd7ecdbd99.tar.gz
11 files changed, 50 insertions, 12 deletions
diff --git a/ChangeLog b/ChangeLog
index ad9f6c5..47dd4a3 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -108,6 +108,11 @@ Version 8.11 10-Oct-2010
     loops, in order to improve performance in some environments. At the same 
     time, I abstracted some of the common code into auxiliary macros to save 
     repetition (this should not affect the compiled code).
+    
+19. If \c was followed by a multibyte UTF-8 character, bad things happened. A 
+    compile-time error is now given if \c is not followed by an ASCII 
+    character, that is, a byte less than 128. (In EBCDIC mode, the code is 
+    different, and any byte value is allowed.)
 
 
 Version 8.10 25-Jun-2010
diff --git a/doc/pcrepattern.3 b/doc/pcrepattern.3
index a8a9081..c5de618 100644
--- a/doc/pcrepattern.3
+++ b/doc/pcrepattern.3
@@ -182,9 +182,9 @@ The following sections describe the use of each of the metacharacters.
 .rs
 .sp
 The backslash character has several uses. Firstly, if it is followed by a
-non-alphanumeric character, it takes away any special meaning that character
-may have. This use of backslash as an escape character applies both inside and
-outside character classes.
+character that is not a digit or a letter, it takes away any special meaning
+that character may have. This use of backslash as an escape character applies
+both inside and outside character classes. 
 .P
 For example, if you want to match a * character, you write \e* in the pattern.
 This escaping action applies whether or not the following character would
@@ -192,6 +192,10 @@ otherwise be interpreted as a metacharacter, so it is always safe to precede a
 non-alphanumeric with backslash to specify that it stands for itself. In
 particular, if you want to match a backslash, you write \e\e.
 .P
+In UTF-8 mode, only ASCII digits and letters have any special meaning after a
+backslash. All other characters (in particular, those whose codepoints are 
+greater than 127) are treated as literals.
+.P
 If a pattern is compiled with the PCRE_EXTENDED option, whitespace in the
 pattern (other than in a character class) and characters between a # outside
 a character class and the next newline are ignored. An escaping backslash can
@@ -225,7 +229,7 @@ but when a pattern is being prepared by text editing, it is often easier to use
 one of the following escape sequences than the binary character it represents:
 .sp
   \ea        alarm, that is, the BEL character (hex 07)
-  \ecx       "control-x", where x is any character
+  \ecx       "control-x", where x is any ASCII character
   \ee        escape (hex 1B)
   \ef        formfeed (hex 0C)
   \en        linefeed (hex 0A)
@@ -237,8 +241,12 @@ one of the following escape sequences than the binary character it represents:
 .sp
 The precise effect of \ecx is as follows: if x is a lower case letter, it
 is converted to upper case. Then bit 6 of the character (hex 40) is inverted.
-Thus \ecz becomes hex 1A, but \ec{ becomes hex 3B, while \ec; becomes hex
-7B.
+Thus \ecz becomes hex 1A (z is 7A), but \ec{ becomes hex 3B ({ is 7B), while
+\ec; becomes hex 7B (; is 3B). If the byte following \ec has a value greater 
+than 127, a compile-time error occurs. This locks out non-ASCII characters in 
+both byte mode and UTF-8 mode. (When PCRE is compiled in EBCDIC mode, all byte 
+values are valid. A lower case letter is converted to upper case, and then the 
+0xc0 bits are flipped.)
 .P
 After \ex, from zero to two hexadecimal digits are read (letters can be in
 upper or lower case). Any number of hexadecimal digits may appear between \ex{
@@ -2718,6 +2726,6 @@ Cambridge CB2 3QH, England.
 .rs
 .sp
 .nf
-Last updated: 17 November 2010
+Last updated: 20 November 2010
 Copyright (c) 1997-2010 University of Cambridge.
 .fi
diff --git a/pcre_compile.c b/pcre_compile.c
index 0115eb3..c269eaa 100644
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -408,6 +408,7 @@ static const char error_texts[] =
   "different names for subpatterns of the same number are not allowed\0"
   "(*MARK) must have an argument\0"
   "this version of PCRE is not compiled with PCRE_UCP support\0"
+  "\\c must be followed by an ASCII character\0" 
   ;
 
 /* Table to identify digits and hex digits. This is used when compiling
@@ -841,7 +842,8 @@ else
     break;
 
     /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
-    This coding is ASCII-specific, but then the whole concept of \cx is
+    An error is given if the byte following \c is not an ASCII character. This
+    coding is ASCII-specific, but then the whole concept of \cx is
     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
 
     case CHAR_c:
@@ -851,11 +853,15 @@ else
       *errorcodeptr = ERR2;
       break;
       }
-
-#ifndef EBCDIC  /* ASCII/UTF-8 coding */
+#ifndef EBCDIC    /* ASCII/UTF-8 coding */
+    if (c > 127)  /* Excludes all non-ASCII in either mode */
+      {
+      *errorcodeptr = ERR68;
+      break;  
+      }      
     if (c >= CHAR_a && c <= CHAR_z) c -= 32;
     c ^= 0x40;
-#else           /* EBCDIC coding */
+#else             /* EBCDIC coding */
     if (c >= CHAR_a && c <= CHAR_z) c += 64;
     c ^= 0xC0;
 #endif
diff --git a/pcre_internal.h b/pcre_internal.h
index e42ac28..dcf7223 100644
--- a/pcre_internal.h
+++ b/pcre_internal.h
@@ -1569,7 +1569,8 @@ enum { ERR0,  ERR1,  ERR2,  ERR3,  ERR4,  ERR5,  ERR6,  ERR7,  ERR8,  ERR9,
        ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39,
        ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49,
        ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59,
-       ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERRCOUNT };
+       ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, 
+       ERRCOUNT };
 
 /* The real format of the start of the pcre block; the index of names and the
 code vector run on as long as necessary after the end. We store an explicit
diff --git a/pcreposix.c b/pcreposix.c
index c5f3d71..15ae19d 100644
--- a/pcreposix.c
+++ b/pcreposix.c
@@ -151,6 +151,7 @@ static const int eint[] = {
   REG_BADPAT,  /* different names for subpatterns of the same number are not allowed */
   REG_BADPAT,  /* (*MARK) must have an argument */
   REG_INVARG,  /* this version of PCRE is not compiled with PCRE_UCP support */
+  REG_BADPAT,  /* \c must be followed by an ASCII character */ 
 };
 
 /* Table of texts corresponding to POSIX error codes */
diff --git a/testdata/testinput1 b/testdata/testinput1
index a55cc73..97e5c36 100644
--- a/testdata/testinput1
+++ b/testdata/testinput1
@@ -4076,4 +4076,7 @@
 /[\x00-\xff\s]+/
     \x0a\x0b\x0c\x0d
 
+/^\c/
+    ?
+
 /-- End of testinput1 --/
diff --git a/testdata/testinput2 b/testdata/testinput2
index caa664b..6528138 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -3552,4 +3552,6 @@ with \Y. ---/
     abc\>4
     abc\>-4 
 
+/^\cģ/
+
 /-- End of testinput2 --/
diff --git a/testdata/testinput5 b/testdata/testinput5
index ba11836..5cebd7e 100644
--- a/testdata/testinput5
+++ b/testdata/testinput5
@@ -827,4 +827,6 @@ correctly, but that messes up comparisons). --/
     a\x{123}aa\>5
     a\x{123}aa\>6
 
+/^\cģ/8
+
 /-- End of testinput5 --/
diff --git a/testdata/testoutput1 b/testdata/testoutput1
index 6f96558..6ab67a7 100644
--- a/testdata/testoutput1
+++ b/testdata/testoutput1
@@ -6662,4 +6662,8 @@ No match
     \x0a\x0b\x0c\x0d
  0: \x0a\x0b\x0c\x0d
 
+/^\c/
+    ?
+ 0: ?
+
 /-- End of testinput1 --/
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index 2cd7bd9..9bbc784 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -11231,4 +11231,7 @@ Error -24
     abc\>-4 
 Error -24
 
+/^\cģ/
+Failed: \c must be followed by an ASCII character at offset 3
+
 /-- End of testinput2 --/
diff --git a/testdata/testoutput5 b/testdata/testoutput5
index 9d2c439..ed617cc 100644
--- a/testdata/testoutput5
+++ b/testdata/testoutput5
@@ -2312,4 +2312,7 @@ No match
     a\x{123}aa\>6
 Error -24
 
+/^\cģ/8
+Failed: \c must be followed by an ASCII character at offset 3
+
 /-- End of testinput5 --/
author	ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2010-11-20 17:47:27 +0000
committer	ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2010-11-20 17:47:27 +0000
commit	b363a6392769a2cadb9175b9a0cc79bd7ecdbd99 (patch)
tree	31f238229845c20395ad1c7adae0472535cacca1
parent	df3e1f5efe8ad07dd34302fa929a13a320d8f76a (diff)
download	pcre-b363a6392769a2cadb9175b9a0cc79bd7ecdbd99.tar.gz