diff options
author | Nikita Popov <nikita.ppv@gmail.com> | 2020-06-30 18:24:48 +0200 |
---|---|---|
committer | Nikita Popov <nikita.ppv@gmail.com> | 2020-06-30 18:24:48 +0200 |
commit | e2a407c2fe788e685055e81d672c8cc6cd7ceaa5 (patch) | |
tree | eff567b3b57d2ec369e6a8f8615d754a2655cac0 /ext/pcre/pcre2lib/pcre2_compile.c | |
parent | af4ff75c989374d2f850ae397d62d5a537532c40 (diff) | |
download | php-git-e2a407c2fe788e685055e81d672c8cc6cd7ceaa5.tar.gz |
Revert "Update to PCRE2 10.35"
This reverts commit b419f96c626d1f9cbbba42698e947e32a0af9c4f.
This breaks the GCC build with -fcf-protection (default on Ubuntu
at least).
Diffstat (limited to 'ext/pcre/pcre2lib/pcre2_compile.c')
-rw-r--r-- | ext/pcre/pcre2lib/pcre2_compile.c | 194 |
1 files changed, 102 insertions, 92 deletions
diff --git a/ext/pcre/pcre2lib/pcre2_compile.c b/ext/pcre/pcre2lib/pcre2_compile.c index 62393bea74..f2e6b6b5bd 100644 --- a/ext/pcre/pcre2lib/pcre2_compile.c +++ b/ext/pcre/pcre2lib/pcre2_compile.c @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2016-2020 University of Cambridge + New API code Copyright (c) 2016-2019 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -1202,7 +1202,7 @@ in the decoded tables. */ if ((code->flags & PCRE2_DEREF_TABLES) != 0) { - ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH); + ref_count = (PCRE2_SIZE *)(code->tables + tables_length); (*ref_count)++; } @@ -1232,15 +1232,15 @@ if (newcode == NULL) return NULL; memcpy(newcode, code, code->blocksize); newcode->executable_jit = NULL; -newtables = code->memctl.malloc(TABLES_LENGTH + sizeof(PCRE2_SIZE), +newtables = code->memctl.malloc(tables_length + sizeof(PCRE2_SIZE), code->memctl.memory_data); if (newtables == NULL) { code->memctl.free((void *)newcode, code->memctl.memory_data); return NULL; } -memcpy(newtables, code->tables, TABLES_LENGTH); -ref_count = (PCRE2_SIZE *)(newtables + TABLES_LENGTH); +memcpy(newtables, code->tables, tables_length); +ref_count = (PCRE2_SIZE *)(newtables + tables_length); *ref_count = 1; newcode->tables = newtables; @@ -1270,7 +1270,7 @@ if (code != NULL) be freed when there are no more references to them. The *ref_count should always be > 0. */ - ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH); + ref_count = (PCRE2_SIZE *)(code->tables + tables_length); if (*ref_count > 0) { (*ref_count)--; @@ -3653,7 +3653,7 @@ while (ptr < ptrend) if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS; /* If ( is not followed by ? it is either a capture or a special verb or an - alpha assertion or a positive non-atomic lookahead. */ + alpha assertion. */ if (*ptr != CHAR_QUESTION_MARK) { @@ -3685,10 +3685,10 @@ while (ptr < ptrend) break; /* Handle "alpha assertions" such as (*pla:...). Most of these are - synonyms for the historical symbolic assertions, but the script run and - non-atomic lookaround ones are new. They are distinguished by starting - with a lower case letter. Checking both ends of the alphabet makes this - work in all character codes. */ + synonyms for the historical symbolic assertions, but the script run ones + are new. They are distinguished by starting with a lower case letter. + Checking both ends of the alphabet makes this work in all character + codes. */ else if (CHMAX_255(c) && (cb->ctypes[c] & ctype_lcletter) != 0) { @@ -3747,7 +3747,9 @@ while (ptr < ptrend) goto POSITIVE_LOOK_AHEAD; case META_LOOKAHEAD_NA: - goto POSITIVE_NONATOMIC_LOOK_AHEAD; + *parsed_pattern++ = meta; + ptr++; + goto POST_ASSERTION; case META_LOOKAHEADNOT: goto NEGATIVE_LOOK_AHEAD; @@ -4436,12 +4438,6 @@ while (ptr < ptrend) ptr++; goto POST_ASSERTION; - case CHAR_ASTERISK: - POSITIVE_NONATOMIC_LOOK_AHEAD: /* Come from (?* */ - *parsed_pattern++ = META_LOOKAHEAD_NA; - ptr++; - goto POST_ASSERTION; - case CHAR_EXCLAMATION_MARK: NEGATIVE_LOOK_AHEAD: /* Come from (*nla: */ *parsed_pattern++ = META_LOOKAHEADNOT; @@ -4451,23 +4447,20 @@ while (ptr < ptrend) /* ---- Lookbehind assertions ---- */ - /* (?< followed by = or ! or * is a lookbehind assertion. Otherwise (?< - is the start of the name of a capturing group. */ + /* (?< followed by = or ! is a lookbehind assertion. Otherwise (?< is the + start of the name of a capturing group. */ case CHAR_LESS_THAN_SIGN: if (ptrend - ptr <= 1 || - (ptr[1] != CHAR_EQUALS_SIGN && - ptr[1] != CHAR_EXCLAMATION_MARK && - ptr[1] != CHAR_ASTERISK)) + (ptr[1] != CHAR_EQUALS_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK)) { terminator = CHAR_GREATER_THAN_SIGN; goto DEFINE_NAME; } *parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)? - META_LOOKBEHIND : (ptr[1] == CHAR_EXCLAMATION_MARK)? - META_LOOKBEHINDNOT : META_LOOKBEHIND_NA; + META_LOOKBEHIND : META_LOOKBEHINDNOT; - POST_LOOKBEHIND: /* Come from (*plb: (*naplb: and (*nlb: */ + POST_LOOKBEHIND: /* Come from (*plb: (*naplb: and (*nlb: */ *has_lookbehind = TRUE; offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2); PUTOFFSET(offset, parsed_pattern); @@ -4640,6 +4633,8 @@ while (ptr < ptrend) *parsed_pattern++ = META_KET; } + + if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL; else top_nest--; } @@ -4904,7 +4899,7 @@ range. */ if ((options & PCRE2_CASELESS) != 0) { #ifdef SUPPORT_UNICODE - if ((options & (PCRE2_UTF|PCRE2_UCP)) != 0) + if ((options & PCRE2_UTF) != 0) { int rc; uint32_t oc, od; @@ -5319,8 +5314,7 @@ dynamically as we process the pattern. */ #ifdef SUPPORT_UNICODE BOOL utf = (options & PCRE2_UTF) != 0; -BOOL ucp = (options & PCRE2_UCP) != 0; -#else /* No Unicode support */ +#else /* No UTF support */ BOOL utf = FALSE; #endif @@ -5565,12 +5559,12 @@ for (;; pptr++) zerofirstcu = firstcu; zerofirstcuflags = firstcuflags; - /* For caseless UTF or UCP mode, check whether this character has more - than one other case. If so, generate a special OP_NOTPROP item instead of + /* For caseless UTF mode, check whether this character has more than + one other case. If so, generate a special OP_NOTPROP item instead of OP_NOTI. */ #ifdef SUPPORT_UNICODE - if ((utf||ucp) && (options & PCRE2_CASELESS) != 0 && + if (utf && (options & PCRE2_CASELESS) != 0 && (d = UCD_CASESET(c)) != 0) { *code++ = OP_NOTPROP; @@ -5603,7 +5597,7 @@ for (;; pptr++) uint32_t d; #ifdef SUPPORT_UNICODE - if ((utf || ucp) && c > 127) d = UCD_OTHERCASE(c); else + if (utf && c > 127) d = UCD_OTHERCASE(c); else #endif { #if PCRE2_CODE_UNIT_WIDTH != 8 @@ -6677,11 +6671,23 @@ for (;; pptr++) } /* For a back reference, update the back reference map and the - maximum back reference. */ + maximum back reference. Then, for each group, we must check to + see if it is recursive, that is, it is inside the group that it + references. A flag is set so that the group can be made atomic. + */ cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1; if (groupnumber > cb->top_backref) cb->top_backref = groupnumber; + + for (oc = cb->open_caps; oc != NULL; oc = oc->next) + { + if (oc->number == groupnumber) + { + oc->flag = TRUE; + break; + } + } } } @@ -7075,18 +7081,15 @@ for (;; pptr++) previous[GET(previous, 1)] != OP_ALT) goto END_REPEAT; - /* Perl allows all assertions to be quantified, and when they contain - capturing parentheses and/or are optional there are potential uses for - this feature. PCRE2 used to force the maximum quantifier to 1 on the - invalid grounds that further repetition was never useful. This was - always a bit pointless, since an assertion could be wrapped with a - repeated group to achieve the effect. General repetition is now - permitted, but if the maximum is unlimited it is set to one more than - the minimum. */ + /* There is no sense in actually repeating assertions. The only + potential use of repetition is in cases when the assertion is optional. + Therefore, if the minimum is greater than zero, just ignore the repeat. + If the maximum is not zero or one, set it to 1. */ if (op_previous < OP_ONCE) /* Assertion */ { - if (repeat_max == REPEAT_UNLIMITED) repeat_max = repeat_min + 1; + if (repeat_min > 0) goto END_REPEAT; + if (repeat_max > 1) repeat_max = 1; } /* The case of a zero minimum is special because of the need to stick @@ -7679,6 +7682,19 @@ for (;; pptr++) cb->backref_map |= (meta_arg < 32)? (1u << meta_arg) : 1; if (meta_arg > cb->top_backref) cb->top_backref = meta_arg; + + /* Check to see if this back reference is recursive, that it, it + is inside the group that it references. A flag is set so that the + group can be made atomic. */ + + for (oc = cb->open_caps; oc != NULL; oc = oc->next) + { + if (oc->number == meta_arg) + { + oc->flag = TRUE; + break; + } + } break; @@ -7824,12 +7840,11 @@ for (;; pptr++) NORMAL_CHAR_SET: /* Character is already in meta */ matched_char = TRUE; - /* For caseless UTF or UCP mode, check whether this character has more than - one other case. If so, generate a special OP_PROP item instead of OP_CHARI. - */ + /* For caseless UTF mode, check whether this character has more than one + other case. If so, generate a special OP_PROP item instead of OP_CHARI. */ #ifdef SUPPORT_UNICODE - if ((utf||ucp) && (options & PCRE2_CASELESS) != 0) + if (utf && (options & PCRE2_CASELESS) != 0) { uint32_t caseset = UCD_CASESET(meta); if (caseset != 0) @@ -8038,6 +8053,7 @@ if (*code == OP_CBRA) capnumber = GET2(code, 1 + LINK_SIZE); capitem.number = capnumber; capitem.next = cb->open_caps; + capitem.flag = FALSE; capitem.assert_depth = cb->assert_depth; cb->open_caps = &capitem; } @@ -8166,9 +8182,26 @@ for (;;) PUT(code, 1, (int)(code - start_bracket)); code += 1 + LINK_SIZE; - /* If it was a capturing subpattern, remove the block from the chain. */ + /* If it was a capturing subpattern, check to see if it contained any + recursive back references. If so, we must wrap it in atomic brackets. In + any event, remove the block from the chain. */ - if (capnumber > 0) cb->open_caps = cb->open_caps->next; + if (capnumber > 0) + { + if (cb->open_caps->flag) + { + (void)memmove(start_bracket + 1 + LINK_SIZE, start_bracket, + CU2BYTES(code - start_bracket)); + *start_bracket = OP_ONCE; + code += 1 + LINK_SIZE; + PUT(start_bracket, 1, (int)(code - start_bracket)); + *code = OP_KET; + PUT(code, 1, (int)(code - start_bracket)); + code += 1 + LINK_SIZE; + length += 2 + 2*LINK_SIZE; + } + cb->open_caps = cb->open_caps->next; + } /* Set values to pass back */ @@ -8803,10 +8836,9 @@ memset(slot + IMM2_SIZE + length, 0, /* This function is called to skip parts of the parsed pattern when finding the length of a lookbehind branch. It is called after (*ACCEPT) and (*FAIL) to find -the end of the branch, it is called to skip over an internal lookaround or -(DEFINE) group, and it is also called to skip to the end of a class, during -which it will never encounter nested groups (but there's no need to have -special code for that). +the end of the branch, it is called to skip over an internal lookaround, and it +is also called to skip to the end of a class, during which it will never +encounter nested groups (but there's no need to have special code for that). When called to find the end of a branch or group, pptr must point to the first meta code inside the branch, not the branch-starting code. In other cases it @@ -9284,21 +9316,14 @@ for (;; pptr++) itemlength = grouplength; break; - /* A (DEFINE) group is never obeyed inline and so it does not contribute to - the length of this branch. Skip from the following item to the next - unpaired ket. */ - - case META_COND_DEFINE: - pptr = parsed_skip(pptr + 1, PSKIP_KET); - break; - - /* Check other nested groups - advance past the initial data for each type - and then seek a fixed length with get_grouplength(). */ + /* Check nested groups - advance past the initial data for each type and + then seek a fixed length with get_grouplength(). */ case META_COND_NAME: case META_COND_NUMBER: case META_COND_RNAME: case META_COND_RNUMBER: + case META_COND_DEFINE: pptr += 2 + SIZEOFFSET; goto CHECK_GROUP; @@ -9555,10 +9580,6 @@ for (; *pptr != META_END; pptr++) break; case META_COND_DEFINE: - pptr += SIZEOFFSET; - nestlevel++; - break; - case META_COND_NAME: case META_COND_NUMBER: case META_COND_RNAME: @@ -9639,7 +9660,6 @@ pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options, int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext) { BOOL utf; /* Set TRUE for UTF mode */ -BOOL ucp; /* Set TRUE for UCP mode */ BOOL has_lookbehind = FALSE; /* Set TRUE if a lookbehind is found */ BOOL zero_terminated; /* Set TRUE for zero-terminated pattern */ pcre2_real_code *re = NULL; /* What we will return */ @@ -9927,8 +9947,8 @@ if (utf) /* Check UCP lockout. */ -ucp = (cb.external_options & PCRE2_UCP) != 0; -if (ucp && (cb.external_options & PCRE2_NEVER_UCP) != 0) +if ((cb.external_options & (PCRE2_UCP|PCRE2_NEVER_UCP)) == + (PCRE2_UCP|PCRE2_NEVER_UCP)) { errorcode = ERR75; goto HAD_EARLY_ERROR; @@ -10304,7 +10324,7 @@ function call. */ if (errorcode == 0 && (re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0) { PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart; - if (PRIV(auto_possessify)(temp, &cb) != 0) errorcode = ERR80; + if (PRIV(auto_possessify)(temp, utf, &cb) != 0) errorcode = ERR80; } /* Failed to compile, or error while post-processing. */ @@ -10352,25 +10372,21 @@ if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0) if ((firstcuflags & REQ_CASELESS) != 0) { - if (firstcu < 128 || (!utf && !ucp && firstcu < 255)) + if (firstcu < 128 || (!utf && firstcu < 255)) { if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS; } - /* The first code unit is > 128 in UTF or UCP mode, or > 255 otherwise. - In 8-bit UTF mode, codepoints in the range 128-255 are introductory code - points and cannot have another case, but if UCP is set they may do. */ + /* The first code unit is > 128 in UTF mode, or > 255 otherwise. In + 8-bit UTF mode, codepoints in the range 128-255 are introductory code + points and cannot have another case. In 16-bit and 32-bit modes, we can + check wide characters when UTF (and therefore UCP) is supported. */ -#ifdef SUPPORT_UNICODE -#if PCRE2_CODE_UNIT_WIDTH == 8 - else if (ucp && !utf && UCD_OTHERCASE(firstcu) != firstcu) - re->flags |= PCRE2_FIRSTCASELESS; -#else - else if ((utf || ucp) && firstcu <= MAX_UTF_CODE_POINT && +#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 + else if (firstcu <= MAX_UTF_CODE_POINT && UCD_OTHERCASE(firstcu) != firstcu) re->flags |= PCRE2_FIRSTCASELESS; #endif -#endif /* SUPPORT_UNICODE */ } } @@ -10419,20 +10435,14 @@ if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0) if ((reqcuflags & REQ_CASELESS) != 0) { - if (reqcu < 128 || (!utf && !ucp && reqcu < 255)) + if (reqcu < 128 || (!utf && reqcu < 255)) { if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS; } -#ifdef SUPPORT_UNICODE -#if PCRE2_CODE_UNIT_WIDTH == 8 - else if (ucp && !utf && UCD_OTHERCASE(reqcu) != reqcu) - re->flags |= PCRE2_LASTCASELESS; -#else - else if ((utf || ucp) && reqcu <= MAX_UTF_CODE_POINT && - UCD_OTHERCASE(reqcu) != reqcu) - re->flags |= PCRE2_LASTCASELESS; +#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 + else if (reqcu <= MAX_UTF_CODE_POINT && UCD_OTHERCASE(reqcu) != reqcu) + re->flags |= PCRE2_LASTCASELESS; #endif -#endif /* SUPPORT_UNICODE */ } } } |