summary | refs | log | tree | commit | diff
path: root/src/pcre2_compile.c
diff options
context:
space:
mode:
author: ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069> 2019-02-12 17:50:19 +0000
committer: ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069> 2019-02-12 17:50:19 +0000
commit: 4e01f37e73ba7afa29fbfbe45a5f923efb0a1c68 (patch)
tree: 2f92e9bdf9f05dbe278c16ae6162b5dd725f2749 /src/pcre2_compile.c
parent: 5a5285b1066d191d22eb858cbc9862b6e044ca9e (diff)
download: pcre2-4e01f37e73ba7afa29fbfbe45a5f923efb0a1c68.tar.gz
Implement PCRE2_EXTRA_ALT_BSUX to support ECMAscript 6's \u{hhh..} syntax.
git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@1070 6239d852-aaf2-0410-a92c-79f79f948069
Diffstat (limited to 'src/pcre2_compile.c')
-rw-r--r-- src/pcre2_compile.c 137
1 files changed, 89 insertions, 48 deletions
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
index d72d9bd..1edcf1a 100644
--- a/src/pcre2_compile.c
+++ b/src/pcre2_compile.c
@@ -764,7 +764,7 @@ are allowed. */
#define PUBLIC_COMPILE_EXTRA_OPTIONS \
(PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS| \
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES|PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL| \
- PCRE2_EXTRA_ESCAPED_CR_IS_LF)
+ PCRE2_EXTRA_ESCAPED_CR_IS_LF|PCRE2_EXTRA_ALT_BSUX)
/* Compile time error code numbers. They are given names so that they can more
easily be tracked. When a new number is added, the tables called eint1 and
@@ -1459,7 +1459,8 @@ Returns: zero => a data character
int
PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr,
- int *errorcodeptr, uint32_t options, BOOL isclass, compile_block *cb)
+ int *errorcodeptr, uint32_t options, uint32_t extra_options, BOOL isclass,
+ compile_block *cb)
{
BOOL utf = (options & PCRE2_UTF) != 0;
PCRE2_SPTR ptr = *ptrptr;
@@ -1495,8 +1496,7 @@ else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
if (i > 0)
{
c = (uint32_t)i;
- if (cb != NULL && c == CHAR_CR &&
- (cb->cx->extra_options & PCRE2_EXTRA_ESCAPED_CR_IS_LF) != 0)
+ if (c == CHAR_CR && (extra_options & PCRE2_EXTRA_ESCAPED_CR_IS_LF) != 0)
c = CHAR_LF;
}
else /* Negative table entry */
@@ -1551,22 +1551,28 @@ else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
/* Escapes that need further processing, including those that are unknown, have
a zero entry in the lookup table. When called from pcre2_substitute(), only \c,
-\o, and \x are recognized (and \u when BSUX is set). */
+\o, and \x are recognized (\u and \U can never appear as they are used for case
+forcing). */
else
{
+ int s;
PCRE2_SPTR oldptr;
BOOL overflow;
- int s;
+ BOOL alt_bsux =
+ ((options & PCRE2_ALT_BSUX) | (extra_options & PCRE2_EXTRA_ALT_BSUX)) != 0;
/* Filter calls from pcre2_substitute(). */
- if (cb == NULL && c != CHAR_c && c != CHAR_o && c != CHAR_x &&
- (c != CHAR_u || (options & PCRE2_ALT_BSUX) != 0))
+ if (cb == NULL)
{
- *errorcodeptr = ERR3;
- return 0;
- }
+ if (c != CHAR_c && c != CHAR_o && c != CHAR_x)
+ {
+ *errorcodeptr = ERR3;
+ return 0;
+ }
+ alt_bsux = FALSE; /* Do not modify \x handling */
+ }
switch (c)
{
@@ -1579,40 +1585,74 @@ else
*errorcodeptr = ERR37;
break;
- /* \u is unrecognized when PCRE2_ALT_BSUX is not set. When it is treated
- specially, \u must be followed by four hex digits. Otherwise it is a
- lowercase u letter. */
+ /* \u is unrecognized when neither PCRE2_ALT_BSUX nor PCRE2_EXTRA_ALT_BSUX
+ is set. Otherwise, \u must be followed by exactly four hex digits or, if
+ PCRE2_EXTRA_ALT_BSUX is set, by any number of hex digits in braces.
+ Otherwise it is a lowercase u letter. This gives some compatibility with
+ ECMAScript (aka JavaScript). */
case CHAR_u:
- if ((options & PCRE2_ALT_BSUX) == 0) *errorcodeptr = ERR37; else
+ if (!alt_bsux) *errorcodeptr = ERR37; else
{
uint32_t xc;
- if (ptrend - ptr < 4) break; /* Less than 4 chars */
- if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */
- if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */
- cc = (cc << 4) | xc;
- if ((xc = XDIGIT(ptr[2])) == 0xff) break; /* Not a hex digit */
- cc = (cc << 4) | xc;
- if ((xc = XDIGIT(ptr[3])) == 0xff) break; /* Not a hex digit */
- c = (cc << 4) | xc;
- ptr += 4;
+
+ if (*ptr == CHAR_LEFT_CURLY_BRACKET &&
+ (extra_options & PCRE2_EXTRA_ALT_BSUX) != 0)
+ {
+ PCRE2_SPTR hptr = ptr + 1;
+ cc = 0;
+
+ while (hptr < ptrend && (xc = XDIGIT(*hptr)) != 0xff)
+ {
+ if ((cc & 0xf0000000) != 0) /* Test for 32-bit overflow */
+ {
+ *errorcodeptr = ERR77;
+ ptr = hptr; /* Show where */
+ break; /* *hptr != } will cause another break below */
+ }
+ cc = (cc << 4) | xc;
+ hptr++;
+ }
+
+ if (hptr == ptr + 1 || /* No hex digits */
+ hptr >= ptrend || /* Hit end of input */
+ *hptr != CHAR_RIGHT_CURLY_BRACKET) /* No } terminator */
+ break; /* Hex escape not recognized */
+
+ c = cc; /* Accept the code point */
+ ptr = hptr + 1;
+ }
+
+ else /* Must be exactly 4 hex digits */
+ {
+ if (ptrend - ptr < 4) break; /* Less than 4 chars */
+ if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */
+ if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */
+ cc = (cc << 4) | xc;
+ if ((xc = XDIGIT(ptr[2])) == 0xff) break; /* Not a hex digit */
+ cc = (cc << 4) | xc;
+ if ((xc = XDIGIT(ptr[3])) == 0xff) break; /* Not a hex digit */
+ c = (cc << 4) | xc;
+ ptr += 4;
+ }
+
if (utf)
{
if (c > 0x10ffffU) *errorcodeptr = ERR77;
else
if (c >= 0xd800 && c <= 0xdfff &&
- (cb->cx->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
- *errorcodeptr = ERR73;
+ (extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
+ *errorcodeptr = ERR73;
}
else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77;
}
break;
- /* \U is unrecognized unless PCRE2_ALT_BSUX is set, in which case it is an
- upper case letter. */
+ /* \U is unrecognized unless PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set,
+ in which case it is an upper case letter. */
case CHAR_U:
- if ((options & PCRE2_ALT_BSUX) == 0) *errorcodeptr = ERR37;
+ if (!alt_bsux) *errorcodeptr = ERR37;
break;
/* In a character class, \g is just a literal "g". Outside a character
@@ -1791,8 +1831,8 @@ else
}
else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
{
- if (utf && c >= 0xd800 && c <= 0xdfff && (cb == NULL ||
- (cb->cx->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0))
+ if (utf && c >= 0xd800 && c <= 0xdfff &&
+ (extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
{
ptr--;
*errorcodeptr = ERR73;
@@ -1806,11 +1846,11 @@ else
}
break;
- /* \x is complicated. When PCRE2_ALT_BSUX is set, \x must be followed by
- two hexadecimal digits. Otherwise it is a lowercase x letter. */
+ /* When PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set, \x must be followed
+ by two hexadecimal digits. Otherwise it is a lowercase x letter. */
case CHAR_x:
- if ((options & PCRE2_ALT_BSUX) != 0)
+ if (alt_bsux)
{
uint32_t xc;
if (ptrend - ptr < 2) break; /* Less than 2 characters */
@@ -1818,9 +1858,9 @@ else
if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */
c = (cc << 4) | xc;
ptr += 2;
- } /* End PCRE2_ALT_BSUX handling */
+ }
- /* Handle \x in Perl's style. \x{ddd} is a character number which can be
+ /* Handle \x in Perl's style. \x{ddd} is a character code which can be
greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex
digits. If not, { used to be treated as a data character. However, Perl
seems to read hex digits up to the first non-such, and ignore the rest, so
@@ -1864,8 +1904,8 @@ else
}
else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
{
- if (utf && c >= 0xd800 && c <= 0xdfff && (cb == NULL ||
- (cb->cx->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0))
+ if (utf && c >= 0xd800 && c <= 0xdfff &&
+ (extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
{
ptr--;
*errorcodeptr = ERR73;
@@ -2438,6 +2478,7 @@ uint32_t *parsed_pattern = cb->parsed_pattern;
uint32_t *parsed_pattern_end = cb->parsed_pattern_end;
uint32_t meta_quantifier = 0;
uint32_t add_after_mark = 0;
+uint32_t extra_options = cb->cx->extra_options;
uint16_t nest_depth = 0;
int after_manual_callout = 0;
int expect_cond_assert = 0;
@@ -2461,12 +2502,12 @@ nest_save *top_nest, *end_nests;
/* Insert leading items for word and line matching (features provided for the
benefit of pcre2grep). */
-if ((cb->cx->extra_options & PCRE2_EXTRA_MATCH_LINE) != 0)
+if ((extra_options & PCRE2_EXTRA_MATCH_LINE) != 0)
{
*parsed_pattern++ = META_CIRCUMFLEX;
*parsed_pattern++ = META_NOCAPTURE;
}
-else if ((cb->cx->extra_options & PCRE2_EXTRA_MATCH_WORD) != 0)
+else if ((extra_options & PCRE2_EXTRA_MATCH_WORD) != 0)
{
*parsed_pattern++ = META_ESCAPE + ESC_b;
*parsed_pattern++ = META_NOCAPTURE;
@@ -2631,7 +2672,7 @@ while (ptr < ptrend)
if ((options & PCRE2_ALT_VERBNAMES) != 0)
{
escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
- FALSE, cb);
+ cb->cx->extra_options, FALSE, cb);
if (errorcode != 0) goto FAILED;
}
else escape = 0; /* Treat all as literal */
@@ -2821,11 +2862,11 @@ while (ptr < ptrend)
case CHAR_BACKSLASH:
tempptr = ptr;
escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
- FALSE, cb);
+ cb->cx->extra_options, FALSE, cb);
if (errorcode != 0)
{
ESCAPE_FAILED:
- if ((cb->cx->extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
+ if ((extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
goto FAILED;
ptr = tempptr;
if (ptr >= ptrend) c = CHAR_BACKSLASH; else
@@ -3382,12 +3423,12 @@ while (ptr < ptrend)
else
{
tempptr = ptr;
- escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode,
- options, TRUE, cb);
+ escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
+ cb->cx->extra_options, TRUE, cb);
if (errorcode != 0)
{
- if ((cb->cx->extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
+ if ((extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
goto FAILED;
ptr = tempptr;
if (ptr >= ptrend) c = CHAR_BACKSLASH; else
@@ -4545,12 +4586,12 @@ parsed_pattern = manage_callouts(ptr, &previous_callout, auto_callout,
/* Insert trailing items for word and line matching (features provided for the
benefit of pcre2grep). */
-if ((cb->cx->extra_options & PCRE2_EXTRA_MATCH_LINE) != 0)
+if ((extra_options & PCRE2_EXTRA_MATCH_LINE) != 0)
{
*parsed_pattern++ = META_KET;
*parsed_pattern++ = META_DOLLAR;
}
-else if ((cb->cx->extra_options & PCRE2_EXTRA_MATCH_WORD) != 0)
+else if ((extra_options & PCRE2_EXTRA_MATCH_WORD) != 0)
{
*parsed_pattern++ = META_KET;
*parsed_pattern++ = META_ESCAPE + ESC_b;