diff options
author | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2013-10-09 10:18:26 +0000 |
---|---|---|
committer | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2013-10-09 10:18:26 +0000 |
commit | aa623d620b555facb974220d9feb5167804436b6 (patch) | |
tree | 2abe7a2e0252844f3f525b97561d100b1d9cfb95 /pcre_compile.c | |
parent | e53ac621ef11427dd1c9fd6def13349cc196fd8c (diff) | |
download | pcre-aa623d620b555facb974220d9feb5167804436b6.tar.gz |
Add \o{} and tidy up \x{} handling. Minor update to RunTest.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@1370 2f5784b3-3f2a-0410-8824-cb99058d5e15
Diffstat (limited to 'pcre_compile.c')
-rw-r--r-- | pcre_compile.c | 168 |
1 files changed, 110 insertions, 58 deletions
diff --git a/pcre_compile.c b/pcre_compile.c index c3cc028..3ac3395 100644 --- a/pcre_compile.c +++ b/pcre_compile.c @@ -462,7 +462,7 @@ static const char error_texts[] = "POSIX collating elements are not supported\0" "this version of PCRE is compiled without UTF support\0" "spare error\0" /** DEAD **/ - "character value in \\x{...} sequence is too large\0" + "character value in \\x{} or \\o{} is too large\0" /* 35 */ "invalid condition (?(0)\0" "\\C not allowed in lookbehind assertion\0" @@ -516,6 +516,10 @@ static const char error_texts[] = "character value in \\u.... sequence is too large\0" "invalid UTF-32 string\0" "setting UTF is disabled by the application\0" + "non-hex character in \\x{} (closing brace missing?)\0" + /* 80 */ + "non-octal character in \\o{} (closing brace missing?)\0" + "missing opening brace after \\o\0" ; /* Table to identify digits and hex digits. This is used when compiling @@ -1162,16 +1166,51 @@ else if (!utf && c > 0xff) *errorcodeptr = ERR51; #endif break; + + /* \o is a relatively new Perl feature, supporting a more general way of + specifying character codes in octal. The only supported form is \o{ddd}. */ + + case CHAR_o: + if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else + { + ptr += 2; + c = 0; + overflow = FALSE; + while (*ptr >= CHAR_0 && *ptr <= CHAR_7) + { + register pcre_uint32 cc = *ptr++; + if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */ +#ifdef COMPILE_PCRE32 + if (c >= 0x10000000l) { overflow = TRUE; break; } +#endif + c = (c << 3) + cc - CHAR_0 ; +#if defined COMPILE_PCRE8 + if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; } +#elif defined COMPILE_PCRE16 + if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; } +#elif defined COMPILE_PCRE32 + if (utf && c > 0x10ffffU) { overflow = TRUE; break; } +#endif + } + if (overflow) + { + while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++; + *errorcodeptr = ERR34; + } + else if (*ptr == CHAR_RIGHT_CURLY_BRACKET) + { + if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73; + } + else *errorcodeptr = ERR80; + } + break; - /* \x is complicated. \x{ddd} is a character number which can be greater - than 0xff in utf or non-8bit mode, but only if the ddd are hex digits. - If not, { is treated as a data character. */ + /* \x is complicated. In JavaScript, \x must be followed by two hexadecimal + numbers. Otherwise it is a lowercase x letter. */ case CHAR_x: if ((options & PCRE_JAVASCRIPT_COMPAT) != 0) { - /* In JavaScript, \x must be followed by two hexadecimal numbers. - Otherwise it is a lowercase x letter. */ if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0 && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0) { @@ -1188,73 +1227,86 @@ else #endif } } - break; - } - - if (ptr[1] == CHAR_LEFT_CURLY_BRACKET) - { - const pcre_uchar *pt = ptr + 2; - - c = 0; - overflow = FALSE; - while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0) + } /* End JavaScript handling */ + + /* Handle \x in Perl's style. \x{ddd} is a character number which can be + greater than 0xff in utf or non-8bit mode, but only if the ddd are hex + digits. If not, { used to be treated as a data character. However, Perl + seems to read hex digits up to the first non-such, and ignore the rest, so + that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE + now gives an error. */ + + else + { + if (ptr[1] == CHAR_LEFT_CURLY_BRACKET) { - register pcre_uint32 cc = *pt++; - if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */ - + ptr += 2; + c = 0; + overflow = FALSE; + while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0) + { + register pcre_uint32 cc = *ptr++; + if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */ + #ifdef COMPILE_PCRE32 - if (c >= 0x10000000l) { overflow = TRUE; break; } + if (c >= 0x10000000l) { overflow = TRUE; break; } #endif - + #ifndef EBCDIC /* ASCII/UTF-8 coding */ - if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */ - c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10)); + if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */ + c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10)); #else /* EBCDIC coding */ - if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */ - c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10)); + if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */ + c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10)); #endif - + #if defined COMPILE_PCRE8 - if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; } + if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; } #elif defined COMPILE_PCRE16 - if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; } + if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; } #elif defined COMPILE_PCRE32 - if (utf && c > 0x10ffffU) { overflow = TRUE; break; } + if (utf && c > 0x10ffffU) { overflow = TRUE; break; } #endif - } - - if (overflow) - { - while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0) pt++; - *errorcodeptr = ERR34; - } - - if (*pt == CHAR_RIGHT_CURLY_BRACKET) - { - if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73; - ptr = pt; - break; - } - - /* If the sequence of hex digits does not end with '}', then we don't - recognize this construct; fall through to the normal \x handling. */ - } + } + + if (overflow) + { + while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0) ptr++; + *errorcodeptr = ERR34; + } + + else if (*ptr == CHAR_RIGHT_CURLY_BRACKET) + { + if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73; + } + + /* If the sequence of hex digits does not end with '}', give an error. + We used just to recognize this construct and fall through to the normal + \x handling, but nowadays Perl gives an error, which seems much more + sensible, so we do too. */ + + else *errorcodeptr = ERR79; + } /* End of \x{} processing */ - /* Read just a single-byte hex-defined char */ + /* Read a single-byte hex-defined char (up to two hex digits after \x) */ - c = 0; - while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0) - { - pcre_uint32 cc; /* Some compilers don't like */ - cc = *(++ptr); /* ++ in initializers */ + else + { + c = 0; + while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0) + { + pcre_uint32 cc; /* Some compilers don't like */ + cc = *(++ptr); /* ++ in initializers */ #ifndef EBCDIC /* ASCII/UTF-8 coding */ - if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */ - c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10)); + if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */ + c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10)); #else /* EBCDIC coding */ - if (cc <= CHAR_z) cc += 64; /* Convert to upper case */ - c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10)); + if (cc <= CHAR_z) cc += 64; /* Convert to upper case */ + c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10)); #endif - } + } + } /* End of \xdd handling */ + } /* End of Perl-style \x handling */ break; /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped. |