summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>2015-06-14 15:53:41 +0000
committerph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>2015-06-14 15:53:41 +0000
commit68e2e67656942f34537a41f75b8548764283686d (patch)
treed1f4e0d2f483924a23cc43318c819dcf16f81b33
parent2d606295bb68dcb32222489cd97a2a01f2fb4670 (diff)
downloadpcre-68e2e67656942f34537a41f75b8548764283686d.tar.gz
Make \c in EBCDIC environments compatible with Perl.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@1568 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r--ChangeLog3
-rw-r--r--doc/pcrepattern.343
-rw-r--r--pcre_compile.c33
3 files changed, 57 insertions, 22 deletions
diff --git a/ChangeLog b/ChangeLog
index c39ab9a..6ac0419 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -57,6 +57,9 @@ Version 8.38 xx-xxx-xxxx
13. In an EBCDIC environment, \a in a pattern was converted to the ASCII
instead of the EBCDIC value.
+
+14. The handling of \c in an EBCDIC environment has been revised so that it is
+ now compatible with the specification in Perl's perlebcdic page.
Version 8.37 28-April-2015
diff --git a/doc/pcrepattern.3 b/doc/pcrepattern.3
index d0c6eeb..953f05c 100644
--- a/doc/pcrepattern.3
+++ b/doc/pcrepattern.3
@@ -1,4 +1,4 @@
-.TH PCREPATTERN 3 "08 January 2014" "PCRE 8.35"
+.TH PCREPATTERN 3 "14 June 2015" "PCRE 8.38"
.SH NAME
PCRE - Perl-compatible regular expressions
.SH "PCRE REGULAR EXPRESSION DETAILS"
@@ -308,7 +308,8 @@ A second use of backslash provides a way of encoding non-printing characters
in patterns in a visible manner. There is no restriction on the appearance of
non-printing characters, apart from the binary zero that terminates a pattern,
but when a pattern is being prepared by text editing, it is often easier to use
-one of the following escape sequences than the binary character it represents:
+one of the following escape sequences than the binary character it represents.
+In an ASCII or Unicode environment, these escapes are as follows:
.sp
\ea alarm, that is, the BEL character (hex 07)
\ecx "control-x", where x is any ASCII character
@@ -330,19 +331,31 @@ case letter, it is converted to upper case. Then bit 6 of the character (hex
but \ec{ becomes hex 3B ({ is 7B), and \ec; becomes hex 7B (; is 3B). If the
data item (byte or 16-bit value) following \ec has a value greater than 127, a
compile-time error occurs. This locks out non-ASCII characters in all modes.
-.P
-The \ec facility was designed for use with ASCII characters, but with the
-extension to Unicode it is even less useful than it once was. It is, however,
-recognized when PCRE is compiled in EBCDIC mode, where data items are always
-bytes. In this mode, all values are valid after \ec. If the next character is a
-lower case letter, it is converted to upper case. Then the 0xc0 bits of the
-byte are inverted. Thus \ecA becomes hex 01, as in ASCII (A is C1), but because
-the EBCDIC letters are disjoint, \ecZ becomes hex 29 (Z is E9), and other
-characters also generate different values.
+.P
+When PCRE is compiled in EBCDIC mode, \ea, \ee, \ef, \en, \er, and \et
+generate the appropriate EBCDIC code values. The \ec escape is processed
+as specified for Perl in the \fBperlebcdic\fP document. The only characters
+that are allowed after \ec are A-Z, a-z, or one of @, [, \e, ], ^, _, or ?. Any
+other character provokes a compile-time error. The sequence \ec@ encodes
+character code 0; the letters (in either case) encode characters 1-26 (hex 01
+to hex 1A); [, \e, ], ^, and _ encode characters 27-31 (hex 1B to hex 1F), and
+\ec? becomes either 255 (hex FF) or 95 (hex 5F).
+.P
+Thus, apart from \ec?, these escapes generate the same character code values as
+they do in an ASCII environment, though the meanings of the values mostly
+differ. For example, \ecG always generates code value 7, which is BEL in ASCII
+but DEL in EBCDIC.
+.P
+The sequence \ec? generates DEL (127, hex 7F) in an ASCII environment, but
+because 127 is not a control character in EBCDIC, Perl makes it generate the
+APC character. Unfortunately, there are several variants of EBCDIC. In most of
+them the APC character has the value 255 (hex FF), but in the one Perl calls
+POSIX-BC its value is 95 (hex 5F). If certain other characters have POSIX-BC
+values, PCRE makes \ec? generate 95; otherwise it generates 255.
.P
After \e0 up to two further octal digits are read. If there are fewer than two
-digits, just those that are present are used. Thus the sequence \e0\ex\e07
-specifies two binary zeros followed by a BEL character (code value 7). Make
+digits, just those that are present are used. Thus the sequence \e0\ex\e015
+specifies two binary zeros followed by a CR character (code value 13). Make
sure you supply two digits after the initial zero if the pattern character that
follows is itself an octal digit.
.P
@@ -3283,6 +3296,6 @@ Cambridge CB2 3QH, England.
.rs
.sp
.nf
-Last updated: 08 January 2014
-Copyright (c) 1997-2014 University of Cambridge.
+Last updated: 14 June 2015
+Copyright (c) 1997-2015 University of Cambridge.
.fi
diff --git a/pcre_compile.c b/pcre_compile.c
index be10086..575ee7a 100644
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -219,6 +219,12 @@ static const short int escapes[] = {
/* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
/* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
};
+
+/* Characters that may follow \c in an EBCDIC environment, ordered so that a
+character's index in this table (0-31) is the code value that \c generates. */
+
+static unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
+
#endif
@@ -527,7 +533,11 @@ static const char error_texts[] =
"different names for subpatterns of the same number are not allowed\0"
"(*MARK) must have an argument\0"
"this version of PCRE is not compiled with Unicode property support\0"
+#ifndef EBCDIC
"\\c must be followed by an ASCII character\0"
+#else
+ "\\c must be followed by a letter or one of [\\]^_?\0"
+#endif
"\\k is not followed by a braced, angle-bracketed, or quoted name\0"
/* 70 */
"internal error: unknown opcode in find_fixedlength()\0"
@@ -1425,7 +1435,16 @@ else
c ^= 0x40;
#else /* EBCDIC coding */
if (c >= CHAR_a && c <= CHAR_z) c += 64;
- c ^= 0xC0;
+ if (c == CHAR_QUESTION_MARK)
+ c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
+ else
+ {
+ for (i = 0; i < 32; i++)
+ {
+ if (c == ebcdic_escape_c[i]) break;
+ }
+ if (i < 32) c = i; else *errorcodeptr = ERR68;
+ }
#endif
break;
@@ -7354,14 +7373,14 @@ for (;; ptr++)
recno = 0;
while(IS_DIGIT(*ptr))
{
- if (recno > INT_MAX / 10 - 1) /* Integer overflow */
- {
- while (IS_DIGIT(*ptr)) ptr++;
- *errorcodeptr = ERR61;
- goto FAILED;
+ if (recno > INT_MAX / 10 - 1) /* Integer overflow */
+ {
+ while (IS_DIGIT(*ptr)) ptr++;
+ *errorcodeptr = ERR61;
+ goto FAILED;
}
recno = recno * 10 + *ptr++ - CHAR_0;
- }
+ }
if (*ptr != (pcre_uchar)terminator)
{