diff options
author | Karl Williamson <khw@cpan.org> | 2015-10-19 12:41:10 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2015-10-19 12:41:10 -0600 |
commit | f0bd363c36d925d8d3dfe3b68715763c850b171a (patch) | |
tree | 5595b776fafd491cf69eaa693e5d68af667ea597 | |
parent | 9dfbfb6e46f94347d97400b65ab59a8bd120948d (diff) | |
parent | 139a998acd6eae73587ff4f048925394f73682d9 (diff) | |
download | perl-f0bd363c36d925d8d3dfe3b68715763c850b171a.tar.gz |
PATCH: [perl #126319] Seg fault
This is a merge into blead of a branch that fixes several errors in the
\b{gcb}, \b{wb}, and \b{sb} constructs (and their corresponding \B{} forms)
added in v5.22.
Finding and fixing the bug in the ticket exposed several other bugs, so
that fixing just that one caused other tests to fail.
-rw-r--r-- | regexec.c | 170 | ||||
-rw-r--r-- | t/lib/warnings/regexec | 1 | ||||
-rw-r--r-- | t/re/re_tests | 7 | ||||
-rw-r--r-- | t/re/subst.t | 21 |
4 files changed, 118 insertions, 81 deletions
@@ -2065,14 +2065,17 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8); break; case GCB_BOUND: - if (s == reginfo->strbeg) { /* GCB always matches at begin and - end */ - if (to_complement ^ cBOOL(reginfo->intuit - || regtry(reginfo, &s))) + if (s == reginfo->strbeg) { + if (reginfo->intuit || regtry(reginfo, &s)) { goto got_it; } + + /* Didn't match. Try at the next position (if there is one) */ s += (utf8_target) ? UTF8SKIP(s) : 1; + if (UNLIKELY(s >= reginfo->strend)) { + break; + } } if (utf8_target) { @@ -2083,46 +2086,44 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, while (s < strend) { GCB_enum after = getGCB_VAL_UTF8((U8*) s, (U8*) reginfo->strend); - if (to_complement ^ isGCB(before, after)) { - if (reginfo->intuit || regtry(reginfo, &s)) { - goto got_it; - } - before = after; + if ( (to_complement ^ isGCB(before, after)) + && (reginfo->intuit || regtry(reginfo, &s))) + { + goto got_it; } + before = after; s += UTF8SKIP(s); } } else { /* Not utf8. Everything is a GCB except between CR and LF */ while (s < strend) { - if (to_complement ^ (UCHARAT(s - 1) != '\r' - || UCHARAT(s) != '\n')) + if ((to_complement ^ ( UCHARAT(s - 1) != '\r' + || UCHARAT(s) != '\n')) + && (reginfo->intuit || regtry(reginfo, &s))) { - if (reginfo->intuit || regtry(reginfo, &s)) { - goto got_it; - } - s++; + goto got_it; } + s++; } } /* And, since this is a bound, it can match after the final * character in the string */ - if (to_complement ^ cBOOL(reginfo->intuit || regtry(reginfo, &s))) { + if ((reginfo->intuit || regtry(reginfo, &s))) { goto got_it; } break; case SB_BOUND: - if (s == reginfo->strbeg) { /* SB always matches at beginning */ - if (to_complement - ^ cBOOL(reginfo->intuit || regtry(reginfo, &s))) - { + if (s == reginfo->strbeg) { + if (reginfo->intuit || regtry(reginfo, &s)) { goto got_it; } - - /* Didn't match. Go try at the next position */ s += (utf8_target) ? 
UTF8SKIP(s) : 1; + if (UNLIKELY(s >= reginfo->strend)) { + break; + } } if (utf8_target) { @@ -2133,18 +2134,17 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, while (s < strend) { SB_enum after = getSB_VAL_UTF8((U8*) s, (U8*) reginfo->strend); - if (to_complement ^ isSB(before, - after, - (U8*) reginfo->strbeg, - (U8*) s, - (U8*) reginfo->strend, - utf8_target)) + if ((to_complement ^ isSB(before, + after, + (U8*) reginfo->strbeg, + (U8*) s, + (U8*) reginfo->strend, + utf8_target)) + && (reginfo->intuit || regtry(reginfo, &s))) { - if (reginfo->intuit || regtry(reginfo, &s)) { - goto got_it; - } - before = after; + goto got_it; } + before = after; s += UTF8SKIP(s); } } @@ -2152,18 +2152,17 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, SB_enum before = getSB_VAL_CP((U8) *(s -1)); while (s < strend) { SB_enum after = getSB_VAL_CP((U8) *s); - if (to_complement ^ isSB(before, - after, - (U8*) reginfo->strbeg, - (U8*) s, - (U8*) reginfo->strend, - utf8_target)) + if ((to_complement ^ isSB(before, + after, + (U8*) reginfo->strbeg, + (U8*) s, + (U8*) reginfo->strend, + utf8_target)) + && (reginfo->intuit || regtry(reginfo, &s))) { - if (reginfo->intuit || regtry(reginfo, &s)) { - goto got_it; - } - before = after; + goto got_it; } + before = after; s++; } } @@ -2171,9 +2170,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, /* Here are at the final position in the target string. The SB * value is always true here, so matches, depending on other * constraints */ - if (to_complement ^ cBOOL(reginfo->intuit - || regtry(reginfo, &s))) - { + if (reginfo->intuit || regtry(reginfo, &s)) { goto got_it; } @@ -2181,12 +2178,13 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, case WB_BOUND: if (s == reginfo->strbeg) { - if (to_complement ^ cBOOL(reginfo->intuit - || regtry(reginfo, &s))) - { + if (reginfo->intuit || regtry(reginfo, &s)) { goto got_it; } s += (utf8_target) ? 
UTF8SKIP(s) : 1; + if (UNLIKELY(s >= reginfo->strend)) { + break; + } } if (utf8_target) { @@ -2204,20 +2202,19 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, while (s < strend) { WB_enum after = getWB_VAL_UTF8((U8*) s, (U8*) reginfo->strend); - if (to_complement ^ isWB(previous, - before, - after, - (U8*) reginfo->strbeg, - (U8*) s, - (U8*) reginfo->strend, - utf8_target)) + if ((to_complement ^ isWB(previous, + before, + after, + (U8*) reginfo->strbeg, + (U8*) s, + (U8*) reginfo->strend, + utf8_target)) + && (reginfo->intuit || regtry(reginfo, &s))) { - if (reginfo->intuit || regtry(reginfo, &s)) { - goto got_it; - } - previous = before; - before = after; + goto got_it; } + previous = before; + before = after; s += UTF8SKIP(s); } } @@ -2226,27 +2223,24 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, WB_enum before = getWB_VAL_CP((U8) *(s -1)); while (s < strend) { WB_enum after = getWB_VAL_CP((U8) *s); - if (to_complement ^ isWB(previous, - before, - after, - (U8*) reginfo->strbeg, - (U8*) s, - (U8*) reginfo->strend, - utf8_target)) + if ((to_complement ^ isWB(previous, + before, + after, + (U8*) reginfo->strbeg, + (U8*) s, + (U8*) reginfo->strend, + utf8_target)) + && (reginfo->intuit || regtry(reginfo, &s))) { - if (reginfo->intuit || regtry(reginfo, &s)) { - goto got_it; - } - previous = before; - before = after; + goto got_it; } + previous = before; + before = after; s++; } } - if (to_complement ^ cBOOL(reginfo->intuit - || regtry(reginfo, &s))) - { + if (reginfo->intuit || regtry(reginfo, &s)) { goto got_it; } } @@ -4743,10 +4737,24 @@ S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, * to look it up */ if (*previous != WB_UNKNOWN) { wb = *previous; - *previous = WB_UNKNOWN; - /* XXX Note that doesn't change curpos, and maybe should */ - /* But we always back up over these two types */ + /* But we need to move backwards by one */ + if (utf8_target) { + *curpos = reghopmaybe3(*curpos, -1, 
strbeg); + if (! *curpos) { + *previous = WB_EDGE; + *curpos = (U8 *) strbeg; + } + else { + *previous = WB_UNKNOWN; + } + } + else { + (*curpos)--; + *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN; + } + + /* And we always back up over these two types */ if (wb != WB_Extend && wb != WB_Format) { return wb; } @@ -5610,8 +5618,10 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) case BOUNDU: /* /\b/u */ boundu: - if (utf8_target) { - + if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) { + match = FALSE; + } + else if (utf8_target) { bound_utf8: switch((bound_type) FLAGS(scan)) { case TRADITIONAL_BOUND: diff --git a/t/lib/warnings/regexec b/t/lib/warnings/regexec index 1f3b65b167..900dd6ee7f 100644 --- a/t/lib/warnings/regexec +++ b/t/lib/warnings/regexec @@ -212,6 +212,7 @@ Use of \b{} or \B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - Use of \b{} or \B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 16. Use of \b{} or \B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 17. Use of \b{} or \B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 17. +Use of \b{} or \B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 17. 
######## # NAME (?[ ]) in non-UTF-8 locale eval { require POSIX; POSIX->import("locale_h") }; diff --git a/t/re/re_tests b/t/re/re_tests index 0dba2495a6..67ac57c08d 100644 --- a/t/re/re_tests +++ b/t/re/re_tests @@ -134,7 +134,14 @@ a[^]b]c adc y $& adc \By\b xy y - - \by\B yz y - - \By\B xyz y - - +\b n - - +\b{gcb} n - - +\b{sb} n - - +\b{wb} n - - \B y - - +\B{gcb} y - - +\B{sb} y - - +\B{wb} y - - \w a y - - \w - n - - \W a n - - diff --git a/t/re/subst.t b/t/re/subst.t index 59e11b56dc..f2bf0a2b54 100644 --- a/t/re/subst.t +++ b/t/re/subst.t @@ -9,7 +9,7 @@ BEGIN { require './loc_tools.pl'; } -plan( tests => 261 ); +plan( tests => 267 ); $_ = 'david'; $a = s/david/rules/r; @@ -1035,6 +1035,25 @@ SKIP: { is("$division$division$division" =~ s/\B/!/ugr, "!$division!$division!$division!", '\\B matches Latin1 before string, mid, and end, /u'); is("\x{2028}\x{2028}\x{2028}" =~ s/\B/!/ugr, "!\x{2028}!\x{2028}!\x{2028}!", '\\B matches above-Latin1 before string, mid, and end, /u'); + fresh_perl_like( '$_=""; /\b{gcb}/; s///g', qr/^$/, {}, + '[perl #126319: Segmentation fault in Perl_sv_catpvn_flags with \b{gcb}' + ); + fresh_perl_like( '$_=""; /\B{gcb}/; s///g', qr/^$/, {}, + '[perl #126319: Segmentation fault in Perl_sv_catpvn_flags with \b{gcb}' + ); + fresh_perl_like( '$_=""; /\b{wb}/; s///g', qr/^$/, {}, + '[perl #126319: Segmentation fault in Perl_sv_catpvn_flags with \b{wb}' + ); + fresh_perl_like( '$_=""; /\B{wb}/; s///g', qr/^$/, {}, + '[perl #126319: Segmentation fault in Perl_sv_catpvn_flags with \b{wb}' + ); + fresh_perl_like( '$_=""; /\b{sb}/; s///g', qr/^$/, {}, + '[perl #126319: Segmentation fault in Perl_sv_catpvn_flags with \b{sb}' + ); + fresh_perl_like( '$_=""; /\B{sb}/; s///g', qr/^$/, {}, + '[perl #126319: Segmentation fault in Perl_sv_catpvn_flags with \b{sb}' + ); + SKIP: { if (! locales_enabled('LC_ALL')) { skip "Can't test locale (maybe you are missing POSIX)", 6; |