summaryrefslogtreecommitdiff
path: root/pcre_compile.c
diff options
context:
space:
mode:
authorph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>2013-10-09 10:18:26 +0000
committerph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>2013-10-09 10:18:26 +0000
commitaa623d620b555facb974220d9feb5167804436b6 (patch)
tree2abe7a2e0252844f3f525b97561d100b1d9cfb95 /pcre_compile.c
parente53ac621ef11427dd1c9fd6def13349cc196fd8c (diff)
downloadpcre-aa623d620b555facb974220d9feb5167804436b6.tar.gz
Add \o{} and tidy up \x{} handling. Minor update to RunTest.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@1370 2f5784b3-3f2a-0410-8824-cb99058d5e15
Diffstat (limited to 'pcre_compile.c')
-rw-r--r--pcre_compile.c168
1 files changed, 110 insertions, 58 deletions
diff --git a/pcre_compile.c b/pcre_compile.c
index c3cc028..3ac3395 100644
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -462,7 +462,7 @@ static const char error_texts[] =
"POSIX collating elements are not supported\0"
"this version of PCRE is compiled without UTF support\0"
"spare error\0" /** DEAD **/
- "character value in \\x{...} sequence is too large\0"
+ "character value in \\x{} or \\o{} is too large\0"
/* 35 */
"invalid condition (?(0)\0"
"\\C not allowed in lookbehind assertion\0"
@@ -516,6 +516,10 @@ static const char error_texts[] =
"character value in \\u.... sequence is too large\0"
"invalid UTF-32 string\0"
"setting UTF is disabled by the application\0"
+ "non-hex character in \\x{} (closing brace missing?)\0"
+ /* 80 */
+ "non-octal character in \\o{} (closing brace missing?)\0"
+ "missing opening brace after \\o\0"
;
/* Table to identify digits and hex digits. This is used when compiling
@@ -1162,16 +1166,51 @@ else
if (!utf && c > 0xff) *errorcodeptr = ERR51;
#endif
break;
+
+ /* \o is a relatively new Perl feature, supporting a more general way of
+ specifying character codes in octal. The only supported form is \o{ddd}. */
+
+ case CHAR_o:
+ if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else
+ {
+ ptr += 2;
+ c = 0;
+ overflow = FALSE;
+ while (*ptr >= CHAR_0 && *ptr <= CHAR_7)
+ {
+ register pcre_uint32 cc = *ptr++;
+ if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
+#ifdef COMPILE_PCRE32
+ if (c >= 0x10000000l) { overflow = TRUE; break; }
+#endif
+ c = (c << 3) + cc - CHAR_0 ;
+#if defined COMPILE_PCRE8
+ if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
+#elif defined COMPILE_PCRE16
+ if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
+#elif defined COMPILE_PCRE32
+ if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
+#endif
+ }
+ if (overflow)
+ {
+ while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
+ *errorcodeptr = ERR34;
+ }
+ else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
+ {
+ if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
+ }
+ else *errorcodeptr = ERR80;
+ }
+ break;
- /* \x is complicated. \x{ddd} is a character number which can be greater
- than 0xff in utf or non-8bit mode, but only if the ddd are hex digits.
- If not, { is treated as a data character. */
+ /* \x is complicated. In JavaScript, \x must be followed by two hexadecimal
+ numbers. Otherwise it is a lowercase x letter. */
case CHAR_x:
if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
{
- /* In JavaScript, \x must be followed by two hexadecimal numbers.
- Otherwise it is a lowercase x letter. */
if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
&& MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
{
@@ -1188,73 +1227,86 @@ else
#endif
}
}
- break;
- }
-
- if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
- {
- const pcre_uchar *pt = ptr + 2;
-
- c = 0;
- overflow = FALSE;
- while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0)
+ } /* End JavaScript handling */
+
+ /* Handle \x in Perl's style. \x{ddd} is a character number which can be
+ greater than 0xff in utf or non-8bit mode, but only if the ddd are hex
+ digits. If not, { used to be treated as a data character. However, Perl
+ seems to read hex digits up to the first non-such, and ignore the rest, so
+ that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
+ now gives an error. */
+
+ else
+ {
+ if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
{
- register pcre_uint32 cc = *pt++;
- if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
-
+ ptr += 2;
+ c = 0;
+ overflow = FALSE;
+ while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0)
+ {
+ register pcre_uint32 cc = *ptr++;
+ if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
+
#ifdef COMPILE_PCRE32
- if (c >= 0x10000000l) { overflow = TRUE; break; }
+ if (c >= 0x10000000l) { overflow = TRUE; break; }
#endif
-
+
#ifndef EBCDIC /* ASCII/UTF-8 coding */
- if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
- c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
+ if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
+ c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
#else /* EBCDIC coding */
- if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
- c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
+ if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
+ c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
#endif
-
+
#if defined COMPILE_PCRE8
- if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
+ if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
#elif defined COMPILE_PCRE16
- if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
+ if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
#elif defined COMPILE_PCRE32
- if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
+ if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
#endif
- }
-
- if (overflow)
- {
- while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0) pt++;
- *errorcodeptr = ERR34;
- }
-
- if (*pt == CHAR_RIGHT_CURLY_BRACKET)
- {
- if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
- ptr = pt;
- break;
- }
-
- /* If the sequence of hex digits does not end with '}', then we don't
- recognize this construct; fall through to the normal \x handling. */
- }
+ }
+
+ if (overflow)
+ {
+ while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0) ptr++;
+ *errorcodeptr = ERR34;
+ }
+
+ else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
+ {
+ if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
+ }
+
+ /* If the sequence of hex digits does not end with '}', give an error.
+ We used just to recognize this construct and fall through to the normal
+ \x handling, but nowadays Perl gives an error, which seems much more
+ sensible, so we do too. */
+
+ else *errorcodeptr = ERR79;
+ } /* End of \x{} processing */
- /* Read just a single-byte hex-defined char */
+ /* Read a single-byte hex-defined char (up to two hex digits after \x) */
- c = 0;
- while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
- {
- pcre_uint32 cc; /* Some compilers don't like */
- cc = *(++ptr); /* ++ in initializers */
+ else
+ {
+ c = 0;
+ while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
+ {
+ pcre_uint32 cc; /* Some compilers don't like */
+ cc = *(++ptr); /* ++ in initializers */
#ifndef EBCDIC /* ASCII/UTF-8 coding */
- if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
- c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
+ if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
+ c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
#else /* EBCDIC coding */
- if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
- c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
+ if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
+ c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
#endif
- }
+ }
+ } /* End of \xdd handling */
+ } /* End of Perl-style \x handling */
break;
/* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.