diff options
Diffstat (limited to 'ext/pcre/pcre2lib/pcre2_match.c')
-rw-r--r-- | ext/pcre/pcre2lib/pcre2_match.c | 160 |
1 files changed, 38 insertions, 122 deletions
diff --git a/ext/pcre/pcre2lib/pcre2_match.c b/ext/pcre/pcre2lib/pcre2_match.c index 11289d575d..48e7b9dbb2 100644 --- a/ext/pcre/pcre2lib/pcre2_match.c +++ b/ext/pcre/pcre2lib/pcre2_match.c @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2015-2020 University of Cambridge + New API code Copyright (c) 2015-2019 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -381,12 +381,8 @@ length = Fovector[offset+1] - Fovector[offset]; if (caseless) { #if defined SUPPORT_UNICODE - BOOL utf = (mb->poptions & PCRE2_UTF) != 0; - - if (utf || (mb->poptions & PCRE2_UCP) != 0) + if ((mb->poptions & PCRE2_UTF) != 0) { - PCRE2_SPTR endptr = p + length; - /* Match characters up to the end of the reference. NOTE: the number of code units matched may differ, because in UTF-8 there are some characters whose upper and lower case codes have different numbers of bytes. For @@ -394,25 +390,16 @@ if (caseless) bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a sequence of two of the latter. It is important, therefore, to check the length along the reference, not along the subject (earlier code did this - wrong). UCP without uses Unicode properties but without UTF encoding. */ + wrong). */ + PCRE2_SPTR endptr = p + length; while (p < endptr) { uint32_t c, d; const ucd_record *ur; if (eptr >= mb->end_subject) return 1; /* Partial match */ - - if (utf) - { - GETCHARINC(c, eptr); - GETCHARINC(d, p); - } - else - { - c = *eptr++; - d = *p++; - } - + GETCHARINC(c, eptr); + GETCHARINC(d, p); ur = GET_UCD(d); if (c != d && c != (uint32_t)((int)d + ur->other_case)) { @@ -428,7 +415,7 @@ if (caseless) else #endif - /* Not in UTF or UCP mode */ + /* Not in UTF mode */ { for (; length > 0; length--) { @@ -445,8 +432,7 @@ if (caseless) } /* In the caseful case, we can just compare the code units, whether or not we -are in UTF and/or UCP mode. When partial matching, we have to do this unit by -unit. */ +are in UTF mode. When partial matching, we have to do this unit-by-unit. */ else { @@ -588,8 +574,8 @@ match(PCRE2_SPTR start_eptr, PCRE2_SPTR start_ecode, PCRE2_SIZE *ovector, heapframe *F; /* Current frame pointer */ heapframe *N = NULL; /* Temporary frame pointers */ heapframe *P = NULL; -heapframe *assert_accept_frame = NULL; /* For passing back a frame with captures */ -PCRE2_SIZE frame_copy_size; /* Amount to copy when creating a new frame */ +heapframe *assert_accept_frame; /* For passing back the frame with captures */ +PCRE2_SIZE frame_copy_size; /* Amount to copy when creating a new frame */ /* Local variables that do not need to be preserved over calls to RRMATCH(). */ @@ -612,13 +598,12 @@ BOOL condition; /* Used in conditional groups */ BOOL cur_is_word; /* Used in "word" tests */ BOOL prev_is_word; /* Used in "word" tests */ -/* UTF and UCP flags */ +/* UTF flag */ #ifdef SUPPORT_UNICODE BOOL utf = (mb->poptions & PCRE2_UTF) != 0; -BOOL ucp = (mb->poptions & PCRE2_UCP) != 0; #else -BOOL utf = FALSE; /* Required for convenience even when no Unicode support */ +BOOL utf = FALSE; #endif /* This is the length of the last part of a backtracking frame that must be @@ -943,7 +928,6 @@ fprintf(stderr, "++ op=%d\n", *Fecode); } else #endif - /* Not UTF mode */ { if (mb->end_subject - Feptr < 1) @@ -1003,30 +987,10 @@ fprintf(stderr, "++ op=%d\n", *Fecode); if (dc != fc && dc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH); } } - - /* If UCP is set without UTF we must do the same as above, but with one - character per code unit. */ - - else if (ucp) - { - uint32_t cc = UCHAR21(Feptr); - fc = Fecode[1]; - if (fc < 128) - { - if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH); - } - else - { - if (cc != fc && cc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH); - } - Feptr++; - Fecode += 2; - } - else #endif /* SUPPORT_UNICODE */ - /* Not UTF or UCP mode; use the table for characters < 256. */ + /* Not UTF mode; use the table for characters < 256. */ { if (TABLE_GET(Fecode[1], mb->lcc, Fecode[1]) != TABLE_GET(*Feptr, mb->lcc, *Feptr)) RRETURN(MATCH_NOMATCH); @@ -1046,7 +1010,6 @@ fprintf(stderr, "++ op=%d\n", *Fecode); SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - #ifdef SUPPORT_UNICODE if (utf) { @@ -1063,42 +1026,15 @@ fprintf(stderr, "++ op=%d\n", *Fecode); if (ch > 127) ch = UCD_OTHERCASE(ch); else - ch = (mb->fcc)[ch]; - if (ch == fc) RRETURN(MATCH_NOMATCH); - } - } - - /* UCP without UTF is as above, but with one character per code unit. */ - - else if (ucp) - { - uint32_t ch; - fc = UCHAR21INC(Feptr); - ch = Fecode[1]; - Fecode += 2; - - if (ch == fc) - { - RRETURN(MATCH_NOMATCH); /* Caseful match */ - } - else if (Fop == OP_NOTI) /* If caseless */ - { - if (ch > 127) - ch = UCD_OTHERCASE(ch); - else - ch = (mb->fcc)[ch]; + ch = TABLE_GET(ch, mb->fcc, ch); if (ch == fc) RRETURN(MATCH_NOMATCH); } } - else #endif /* SUPPORT_UNICODE */ - - /* Neither UTF nor UCP is set */ - { uint32_t ch = Fecode[1]; - fc = UCHAR21INC(Feptr); + fc = *Feptr++; if (ch == fc || (Fop == OP_NOTI && TABLE_GET(ch, mb->fcc, ch) == fc)) RRETURN(MATCH_NOMATCH); Fecode += 2; @@ -1308,7 +1244,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode); #endif /* SUPPORT_UNICODE */ /* When not in UTF mode, load a single-code-unit character. Then proceed as - above, using Unicode casing if either UTF or UCP is set. */ + above. */ Lc = *Fecode++; @@ -1317,15 +1253,11 @@ fprintf(stderr, "++ op=%d\n", *Fecode); if (Fop >= OP_STARI) { #if PCRE2_CODE_UNIT_WIDTH == 8 -#ifdef SUPPORT_UNICODE - if (ucp && !utf && Lc > 127) Loc = UCD_OTHERCASE(Lc); - else -#endif /* SUPPORT_UNICODE */ - /* Lc will be < 128 in UTF-8 mode. */ + /* Lc must be < 128 in UTF-8 mode. */ Loc = mb->fcc[Lc]; #else /* 16-bit & 32-bit */ #ifdef SUPPORT_UNICODE - if ((utf || ucp) && Lc > 127) Loc = UCD_OTHERCASE(Lc); + if (utf && Lc > 127) Loc = UCD_OTHERCASE(Lc); else #endif /* SUPPORT_UNICODE */ Loc = TABLE_GET(Lc, mb->fcc, Lc); @@ -1558,7 +1490,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode); if (Fop >= OP_NOTSTARI) /* Caseless */ { #ifdef SUPPORT_UNICODE - if ((utf || ucp) && Lc > 127) + if (utf && Lc > 127) Loc = UCD_OTHERCASE(Lc); else #endif /* SUPPORT_UNICODE */ @@ -6113,6 +6045,7 @@ BOOL firstline; BOOL has_first_cu = FALSE; BOOL has_req_cu = FALSE; BOOL startline; +BOOL utf; #if PCRE2_CODE_UNIT_WIDTH == 8 BOOL memchr_not_found_first_cu = FALSE; @@ -6136,19 +6069,13 @@ PCRE2_SPTR match_partial; BOOL use_jit; #endif -/* This flag is needed even when Unicode is not supported for convenience -(it is used by the IS_NEWLINE macro). */ - -BOOL utf = FALSE; - #ifdef SUPPORT_UNICODE -BOOL ucp = FALSE; BOOL allow_invalid; uint32_t fragment_options = 0; #ifdef SUPPORT_JIT BOOL jit_checked_utf = FALSE; #endif -#endif /* SUPPORT_UNICODE */ +#endif PCRE2_SIZE frame_size; @@ -6164,8 +6091,7 @@ proves to be too small, it is replaced by a larger one on the heap. To get a vector of the size required that is aligned for pointers, allocate it as a vector of pointers. */ -PCRE2_SPTR stack_frames_vector[START_FRAMES_SIZE/sizeof(PCRE2_SPTR)] - PCRE2_KEEP_UNINITIALIZED; +PCRE2_SPTR stack_frames_vector[START_FRAMES_SIZE/sizeof(PCRE2_SPTR)]; mb->stack_frames = (heapframe *)stack_frames_vector; /* A length equal to PCRE2_ZERO_TERMINATED implies a zero-terminated @@ -6221,13 +6147,12 @@ use_jit = (re->executable_jit != NULL && (options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0); #endif -/* Initialize UTF/UCP parameters. */ +/* Initialize UTF parameters. */ -#ifdef SUPPORT_UNICODE utf = (re->overall_options & PCRE2_UTF) != 0; +#ifdef SUPPORT_UNICODE allow_invalid = (re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0; -ucp = (re->overall_options & PCRE2_UCP) != 0; -#endif /* SUPPORT_UNICODE */ +#endif /* Convert the partial matching flags into an integer. */ @@ -6664,13 +6589,9 @@ if ((re->flags & PCRE2_FIRSTSET) != 0) if ((re->flags & PCRE2_FIRSTCASELESS) != 0) { first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu); -#ifdef SUPPORT_UNICODE -#if PCRE2_CODE_UNIT_WIDTH == 8 - if (first_cu > 127 && ucp && !utf) first_cu2 = UCD_OTHERCASE(first_cu); -#else - if (first_cu > 127 && (utf || ucp)) first_cu2 = UCD_OTHERCASE(first_cu); +#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 + if (utf && first_cu > 127) first_cu2 = UCD_OTHERCASE(first_cu); #endif -#endif /* SUPPORT_UNICODE */ } } else @@ -6686,13 +6607,9 @@ if ((re->flags & PCRE2_LASTSET) != 0) if ((re->flags & PCRE2_LASTCASELESS) != 0) { req_cu2 = TABLE_GET(req_cu, mb->fcc, req_cu); -#ifdef SUPPORT_UNICODE -#if PCRE2_CODE_UNIT_WIDTH == 8 - if (req_cu > 127 && ucp && !utf) req_cu2 = UCD_OTHERCASE(req_cu); -#else - if (req_cu > 127 && (utf || ucp)) req_cu2 = UCD_OTHERCASE(req_cu); +#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 + if (utf && req_cu > 127) req_cu2 = UCD_OTHERCASE(req_cu); #endif -#endif /* SUPPORT_UNICODE */ } } @@ -6839,16 +6756,15 @@ for(;;) #endif } - /* If we can't find the required first code unit, having reached the - true end of the subject, break the bumpalong loop, to force a match - failure, except when doing partial matching, when we let the next cycle - run at the end of the subject. To see why, consider the pattern - /(?<=abc)def/, which partially matches "abc", even though the string - does not contain the starting character "d". If we have not reached the - true end of the subject (PCRE2_FIRSTLINE caused end_subject to be - temporarily modified) we also let the cycle run, because the matching - string is legitimately allowed to start with the first code unit of a - newline. */ + /* If we can't find the required code unit, having reached the true end + of the subject, break the bumpalong loop, to force a match failure, + except when doing partial matching, when we let the next cycle run at + the end of the subject. To see why, consider the pattern /(?<=abc)def/, + which partially matches "abc", even though the string does not contain + the starting character "d". If we have not reached the true end of the + subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified) + we also let the cycle run, because the matching string is legitimately + allowed to start with the first code unit of a newline. */ if (mb->partial == 0 && start_match >= mb->end_subject) { |