diff options
author | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2015-06-14 15:53:41 +0000 |
---|---|---|
committer | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2015-06-14 15:53:41 +0000 |
commit | 68e2e67656942f34537a41f75b8548764283686d (patch) | |
tree | d1f4e0d2f483924a23cc43318c819dcf16f81b33 | |
parent | 2d606295bb68dcb32222489cd97a2a01f2fb4670 (diff) | |
download | pcre-68e2e67656942f34537a41f75b8548764283686d.tar.gz |
Make \c in EBCDIC environments compatible with Perl.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@1568 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- | ChangeLog | 3 | ||||
-rw-r--r-- | doc/pcrepattern.3 | 43 | ||||
-rw-r--r-- | pcre_compile.c | 33 |
3 files changed, 57 insertions, 22 deletions
@@ -57,6 +57,9 @@ Version 8.38 xx-xxx-xxxx 13. In an EBCDIC environment, \a in a pattern was converted to the ASCII instead of the EBCDIC value. + +14. The handling of \c in an EBCDIC environment has been revised so that it is + now compatible with the specification in Perl's perlebcdic page. Version 8.37 28-April-2015 diff --git a/doc/pcrepattern.3 b/doc/pcrepattern.3 index d0c6eeb..953f05c 100644 --- a/doc/pcrepattern.3 +++ b/doc/pcrepattern.3 @@ -1,4 +1,4 @@ -.TH PCREPATTERN 3 "08 January 2014" "PCRE 8.35" +.TH PCREPATTERN 3 "14 June 2015" "PCRE 8.38" .SH NAME PCRE - Perl-compatible regular expressions .SH "PCRE REGULAR EXPRESSION DETAILS" @@ -308,7 +308,8 @@ A second use of backslash provides a way of encoding non-printing characters in patterns in a visible manner. There is no restriction on the appearance of non-printing characters, apart from the binary zero that terminates a pattern, but when a pattern is being prepared by text editing, it is often easier to use -one of the following escape sequences than the binary character it represents: +one of the following escape sequences than the binary character it represents. +In an ASCII or Unicode environment, these escapes are as follows: .sp \ea alarm, that is, the BEL character (hex 07) \ecx "control-x", where x is any ASCII character @@ -330,19 +331,31 @@ case letter, it is converted to upper case. Then bit 6 of the character (hex but \ec{ becomes hex 3B ({ is 7B), and \ec; becomes hex 7B (; is 3B). If the data item (byte or 16-bit value) following \ec has a value greater than 127, a compile-time error occurs. This locks out non-ASCII characters in all modes. -.P -The \ec facility was designed for use with ASCII characters, but with the -extension to Unicode it is even less useful than it once was. It is, however, -recognized when PCRE is compiled in EBCDIC mode, where data items are always -bytes. In this mode, all values are valid after \ec. 
If the next character is a -lower case letter, it is converted to upper case. Then the 0xc0 bits of the -byte are inverted. Thus \ecA becomes hex 01, as in ASCII (A is C1), but because -the EBCDIC letters are disjoint, \ecZ becomes hex 29 (Z is E9), and other -characters also generate different values. +.P +When PCRE is compiled in EBCDIC mode, \ea, \ee, \ef, \en, \er, and \et +generate the appropriate EBCDIC code values. The \ec escape is processed +as specified for Perl in the \fBperlebcdic\fP document. The only characters +that are allowed after \ec are A-Z, a-z, or one of @, [, \e, ], ^, _, or ?. Any +other character provokes a compile-time error. The sequence \ec@ encodes +character code 0; the letters (in either case) encode characters 1-26 (hex 01 +to hex 1A); [, \e, ], ^, and _ encode characters 27-31 (hex 1B to hex 1F), and +\ec? becomes either 255 (hex FF) or 95 (hex 5F). +.P +Thus, apart from \ec?, these escapes generate the same character code values as +they do in an ASCII environment, though the meanings of the values mostly +differ. For example, \ecG always generates code value 7, which is BEL in ASCII +but DEL in EBCDIC. +.P +The sequence \ec? generates DEL (127, hex 7F) in an ASCII environment, but +because 127 is not a control character in EBCDIC, Perl makes it generate the +APC character. Unfortunately, there are several variants of EBCDIC. In most of +them the APC character has the value 255 (hex FF), but in the one Perl calls +POSIX-BC its value is 95 (hex 5F). If certain other characters have POSIX-BC +values, PCRE makes \ec? generate 95; otherwise it generates 255. .P After \e0 up to two further octal digits are read. If there are fewer than two -digits, just those that are present are used. Thus the sequence \e0\ex\e07 -specifies two binary zeros followed by a BEL character (code value 7). Make +digits, just those that are present are used. Thus the sequence \e0\ex\e015 +specifies two binary zeros followed by a CR character (code value 13). 
Make sure you supply two digits after the initial zero if the pattern character that follows is itself an octal digit. .P @@ -3283,6 +3296,6 @@ Cambridge CB2 3QH, England. .rs .sp .nf -Last updated: 08 January 2014 -Copyright (c) 1997-2014 University of Cambridge. +Last updated: 14 June 2015 +Copyright (c) 1997-2015 University of Cambridge. .fi diff --git a/pcre_compile.c b/pcre_compile.c index be10086..575ee7a 100644 --- a/pcre_compile.c +++ b/pcre_compile.c @@ -219,6 +219,12 @@ static const short int escapes[] = { /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0, /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0 }; + +/* We also need a table of characters that may follow \c in an EBCDIC +environment for characters 0-31. */ + +static unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_"; + #endif @@ -527,7 +533,11 @@ static const char error_texts[] = "different names for subpatterns of the same number are not allowed\0" "(*MARK) must have an argument\0" "this version of PCRE is not compiled with Unicode property support\0" +#ifndef EBCDIC "\\c must be followed by an ASCII character\0" +#else + "\\c must be followed by a letter or one of [\\]^_?\0" +#endif "\\k is not followed by a braced, angle-bracketed, or quoted name\0" /* 70 */ "internal error: unknown opcode in find_fixedlength()\0" @@ -1425,7 +1435,16 @@ else c ^= 0x40; #else /* EBCDIC coding */ if (c >= CHAR_a && c <= CHAR_z) c += 64; - c ^= 0xC0; + if (c == CHAR_QUESTION_MARK) + c = ('\\' == 188 && '`' == 74)? 
0x5f : 0xff; + else + { + for (i = 0; i < 32; i++) + { + if (c == ebcdic_escape_c[i]) break; + } + if (i < 32) c = i; else *errorcodeptr = ERR68; + } #endif break; @@ -7354,14 +7373,14 @@ for (;; ptr++) recno = 0; while(IS_DIGIT(*ptr)) { - if (recno > INT_MAX / 10 - 1) /* Integer overflow */ - { - while (IS_DIGIT(*ptr)) ptr++; - *errorcodeptr = ERR61; - goto FAILED; + if (recno > INT_MAX / 10 - 1) /* Integer overflow */ + { + while (IS_DIGIT(*ptr)) ptr++; + *errorcodeptr = ERR61; + goto FAILED; } recno = recno * 10 + *ptr++ - CHAR_0; - } + } if (*ptr != (pcre_uchar)terminator) { |