Make \c in EBCDIC environments compatible with Perl.

git-svn-id: svn://vcs.exim.org/pcre/code/trunk@1568 2f5784b3-3f2a-0410-8824-cb99058d5e15
author: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2015-06-14 15:53:41 +0000
committer: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2015-06-14 15:53:41 +0000
commit: 68e2e67656942f34537a41f75b8548764283686d (patch)
tree: d1f4e0d2f483924a23cc43318c819dcf16f81b33
parent: 2d606295bb68dcb32222489cd97a2a01f2fb4670 (diff)
download: pcre-68e2e67656942f34537a41f75b8548764283686d.tar.gz
3 files changed, 57 insertions, 22 deletions
diff --git a/ChangeLog b/ChangeLog
index c39ab9a..6ac0419 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -57,6 +57,9 @@ Version 8.38 xx-xxx-xxxx
     
 13. In an EBCDIC environment, \a in a pattern was converted to the ASCII 
     instead of the EBCDIC value. 
+    
+14. The handling of \c in an EBCDIC environment has been revised so that it is
+    now compatible with the specification in Perl's perlebcdic page.
  
 
 Version 8.37 28-April-2015
diff --git a/doc/pcrepattern.3 b/doc/pcrepattern.3
index d0c6eeb..953f05c 100644
--- a/doc/pcrepattern.3
+++ b/doc/pcrepattern.3
@@ -1,4 +1,4 @@
-.TH PCREPATTERN 3 "08 January 2014" "PCRE 8.35"
+.TH PCREPATTERN 3 "14 June 2015" "PCRE 8.38"
 .SH NAME
 PCRE - Perl-compatible regular expressions
 .SH "PCRE REGULAR EXPRESSION DETAILS"
@@ -308,7 +308,8 @@ A second use of backslash provides a way of encoding non-printing characters
 in patterns in a visible manner. There is no restriction on the appearance of
 non-printing characters, apart from the binary zero that terminates a pattern,
 but when a pattern is being prepared by text editing, it is often easier to use
-one of the following escape sequences than the binary character it represents:
+one of the following escape sequences than the binary character it represents.
+In an ASCII or Unicode environment, these escapes are as follows:
 .sp
   \ea        alarm, that is, the BEL character (hex 07)
   \ecx       "control-x", where x is any ASCII character
@@ -330,19 +331,31 @@ case letter, it is converted to upper case. Then bit 6 of the character (hex
 but \ec{ becomes hex 3B ({ is 7B), and \ec; becomes hex 7B (; is 3B). If the
 data item (byte or 16-bit value) following \ec has a value greater than 127, a
 compile-time error occurs. This locks out non-ASCII characters in all modes.
-.P
-The \ec facility was designed for use with ASCII characters, but with the
-extension to Unicode it is even less useful than it once was. It is, however,
-recognized when PCRE is compiled in EBCDIC mode, where data items are always
-bytes. In this mode, all values are valid after \ec. If the next character is a
-lower case letter, it is converted to upper case. Then the 0xc0 bits of the
-byte are inverted. Thus \ecA becomes hex 01, as in ASCII (A is C1), but because
-the EBCDIC letters are disjoint, \ecZ becomes hex 29 (Z is E9), and other
-characters also generate different values.
+.P                                                    
+When PCRE is compiled in EBCDIC mode, \ea, \ee, \ef, \en, \er, and \et
+generate the appropriate EBCDIC code values. The \ec escape is processed
+as specified for Perl in the \fBperlebcdic\fP document. The only characters
+that are allowed after \ec are A-Z, a-z, or one of @, [, \e, ], ^, _, or ?. Any
+other character provokes a compile-time error. The sequence \ec@ encodes
+character code 0; after \ec the letters (in either case) encode characters 1-26 (hex 01
+to hex 1A); [, \e, ], ^, and _ encode characters 27-31 (hex 1B to hex 1F), and
+\ec? becomes either 255 (hex FF) or 95 (hex 5F).
+.P
+Thus, apart from \ec?, these escapes generate the same character code values as
+they do in an ASCII environment, though the meanings of the values mostly
+differ. For example, \ecG always generates code value 7, which is BEL in ASCII
+but DEL in EBCDIC.
+.P
+The sequence \ec? generates DEL (127, hex 7F) in an ASCII environment, but
+because 127 is not a control character in EBCDIC, Perl makes it generate the
+APC character. Unfortunately, there are several variants of EBCDIC. In most of
+them the APC character has the value 255 (hex FF), but in the one Perl calls
+POSIX-BC its value is 95 (hex 5F). If certain other characters have POSIX-BC
+values, PCRE makes \ec? generate 95; otherwise it generates 255.
 .P
 After \e0 up to two further octal digits are read. If there are fewer than two
-digits, just those that are present are used. Thus the sequence \e0\ex\e07
-specifies two binary zeros followed by a BEL character (code value 7). Make
+digits, just those that are present are used. Thus the sequence \e0\ex\e015
+specifies two binary zeros followed by a CR character (code value 13). Make
 sure you supply two digits after the initial zero if the pattern character that
 follows is itself an octal digit.
 .P
@@ -3283,6 +3296,6 @@ Cambridge CB2 3QH, England.
 .rs
 .sp
 .nf
-Last updated: 08 January 2014
-Copyright (c) 1997-2014 University of Cambridge.
+Last updated: 14 June 2015
+Copyright (c) 1997-2015 University of Cambridge.
 .fi
diff --git a/pcre_compile.c b/pcre_compile.c
index be10086..575ee7a 100644
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -219,6 +219,12 @@ static const short int escapes[] = {
 /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
 /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
 };
+
+/* We also need a table of characters that may follow \c in an EBCDIC
+environment for characters 0-31. */
+
+static unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
+
 #endif
 
 
@@ -527,7 +533,11 @@ static const char error_texts[] =
   "different names for subpatterns of the same number are not allowed\0"
   "(*MARK) must have an argument\0"
   "this version of PCRE is not compiled with Unicode property support\0"
+#ifndef EBCDIC
   "\\c must be followed by an ASCII character\0"
+#else
+  "\\c must be followed by a letter or one of [\\]^_?\0"
+#endif
   "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
   /* 70 */
   "internal error: unknown opcode in find_fixedlength()\0"
@@ -1425,7 +1435,16 @@ else
     c ^= 0x40;
 #else             /* EBCDIC coding */
     if (c >= CHAR_a && c <= CHAR_z) c += 64;
-    c ^= 0xC0;
+    if (c == CHAR_QUESTION_MARK)
+      c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
+    else
+      {
+      for (i = 0; i < 32; i++)
+        {
+        if (c == ebcdic_escape_c[i]) break;
+        }
+      if (i < 32) c = i; else *errorcodeptr = ERR68;
+      }
 #endif
     break;
 
@@ -7354,14 +7373,14 @@ for (;; ptr++)
           recno = 0;
           while(IS_DIGIT(*ptr))
             {
-            if (recno > INT_MAX / 10 - 1) /* Integer overflow */            
-              {                                                             
-              while (IS_DIGIT(*ptr)) ptr++;                                 
-              *errorcodeptr = ERR61;                                        
-              goto FAILED;                                                  
+            if (recno > INT_MAX / 10 - 1) /* Integer overflow */
+              {
+              while (IS_DIGIT(*ptr)) ptr++;
+              *errorcodeptr = ERR61;
+              goto FAILED;
               }
             recno = recno * 10 + *ptr++ - CHAR_0;
-            } 
+            }
 
           if (*ptr != (pcre_uchar)terminator)
             {
author	ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2015-06-14 15:53:41 +0000
committer	ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2015-06-14 15:53:41 +0000
commit	68e2e67656942f34537a41f75b8548764283686d (patch)
tree	d1f4e0d2f483924a23cc43318c819dcf16f81b33
parent	2d606295bb68dcb32222489cd97a2a01f2fb4670 (diff)
download	pcre-68e2e67656942f34537a41f75b8548764283686d.tar.gz