summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Karl Williamson <khw@cpan.org> 2015-10-19 12:41:10 -0600
committer: Karl Williamson <khw@cpan.org> 2015-10-19 12:41:10 -0600
commit: f0bd363c36d925d8d3dfe3b68715763c850b171a (patch)
tree: 5595b776fafd491cf69eaa693e5d68af667ea597
parent: 9dfbfb6e46f94347d97400b65ab59a8bd120948d (diff)
parent: 139a998acd6eae73587ff4f048925394f73682d9 (diff)
download: perl-f0bd363c36d925d8d3dfe3b68715763c850b171a.tar.gz
PATCH: [perl #126319] Seg fault
This merges into blead a branch that fixes several errors in the \b{gcb}, \b{wb}, and \b{sb} constructs (and their \B{} complements) added in v5.22. Finding and fixing the bug in the ticket exposed several other bugs, so fixing that one alone caused other tests to fail; the fixes are therefore merged together.
-rw-r--r--  regexec.c               170
-rw-r--r--  t/lib/warnings/regexec    1
-rw-r--r--  t/re/re_tests             7
-rw-r--r--  t/re/subst.t             21
4 files changed, 118 insertions, 81 deletions
diff --git a/regexec.c b/regexec.c
index 9f4d395fa7..85c31a69ba 100644
--- a/regexec.c
+++ b/regexec.c
@@ -2065,14 +2065,17 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
break;
case GCB_BOUND:
- if (s == reginfo->strbeg) { /* GCB always matches at begin and
- end */
- if (to_complement ^ cBOOL(reginfo->intuit
- || regtry(reginfo, &s)))
+ if (s == reginfo->strbeg) {
+ if (reginfo->intuit || regtry(reginfo, &s))
{
goto got_it;
}
+
+ /* Didn't match. Try at the next position (if there is one) */
s += (utf8_target) ? UTF8SKIP(s) : 1;
+ if (UNLIKELY(s >= reginfo->strend)) {
+ break;
+ }
}
if (utf8_target) {
@@ -2083,46 +2086,44 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
while (s < strend) {
GCB_enum after = getGCB_VAL_UTF8((U8*) s,
(U8*) reginfo->strend);
- if (to_complement ^ isGCB(before, after)) {
- if (reginfo->intuit || regtry(reginfo, &s)) {
- goto got_it;
- }
- before = after;
+ if ( (to_complement ^ isGCB(before, after))
+ && (reginfo->intuit || regtry(reginfo, &s)))
+ {
+ goto got_it;
}
+ before = after;
s += UTF8SKIP(s);
}
}
else { /* Not utf8. Everything is a GCB except between CR and
LF */
while (s < strend) {
- if (to_complement ^ (UCHARAT(s - 1) != '\r'
- || UCHARAT(s) != '\n'))
+ if ((to_complement ^ ( UCHARAT(s - 1) != '\r'
+ || UCHARAT(s) != '\n'))
+ && (reginfo->intuit || regtry(reginfo, &s)))
{
- if (reginfo->intuit || regtry(reginfo, &s)) {
- goto got_it;
- }
- s++;
+ goto got_it;
}
+ s++;
}
}
/* And, since this is a bound, it can match after the final
* character in the string */
- if (to_complement ^ cBOOL(reginfo->intuit || regtry(reginfo, &s))) {
+ if ((reginfo->intuit || regtry(reginfo, &s))) {
goto got_it;
}
break;
case SB_BOUND:
- if (s == reginfo->strbeg) { /* SB always matches at beginning */
- if (to_complement
- ^ cBOOL(reginfo->intuit || regtry(reginfo, &s)))
- {
+ if (s == reginfo->strbeg) {
+ if (reginfo->intuit || regtry(reginfo, &s)) {
goto got_it;
}
-
- /* Didn't match. Go try at the next position */
s += (utf8_target) ? UTF8SKIP(s) : 1;
+ if (UNLIKELY(s >= reginfo->strend)) {
+ break;
+ }
}
if (utf8_target) {
@@ -2133,18 +2134,17 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
while (s < strend) {
SB_enum after = getSB_VAL_UTF8((U8*) s,
(U8*) reginfo->strend);
- if (to_complement ^ isSB(before,
- after,
- (U8*) reginfo->strbeg,
- (U8*) s,
- (U8*) reginfo->strend,
- utf8_target))
+ if ((to_complement ^ isSB(before,
+ after,
+ (U8*) reginfo->strbeg,
+ (U8*) s,
+ (U8*) reginfo->strend,
+ utf8_target))
+ && (reginfo->intuit || regtry(reginfo, &s)))
{
- if (reginfo->intuit || regtry(reginfo, &s)) {
- goto got_it;
- }
- before = after;
+ goto got_it;
}
+ before = after;
s += UTF8SKIP(s);
}
}
@@ -2152,18 +2152,17 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
SB_enum before = getSB_VAL_CP((U8) *(s -1));
while (s < strend) {
SB_enum after = getSB_VAL_CP((U8) *s);
- if (to_complement ^ isSB(before,
- after,
- (U8*) reginfo->strbeg,
- (U8*) s,
- (U8*) reginfo->strend,
- utf8_target))
+ if ((to_complement ^ isSB(before,
+ after,
+ (U8*) reginfo->strbeg,
+ (U8*) s,
+ (U8*) reginfo->strend,
+ utf8_target))
+ && (reginfo->intuit || regtry(reginfo, &s)))
{
- if (reginfo->intuit || regtry(reginfo, &s)) {
- goto got_it;
- }
- before = after;
+ goto got_it;
}
+ before = after;
s++;
}
}
@@ -2171,9 +2170,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
/* Here are at the final position in the target string. The SB
* value is always true here, so matches, depending on other
* constraints */
- if (to_complement ^ cBOOL(reginfo->intuit
- || regtry(reginfo, &s)))
- {
+ if (reginfo->intuit || regtry(reginfo, &s)) {
goto got_it;
}
@@ -2181,12 +2178,13 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
case WB_BOUND:
if (s == reginfo->strbeg) {
- if (to_complement ^ cBOOL(reginfo->intuit
- || regtry(reginfo, &s)))
- {
+ if (reginfo->intuit || regtry(reginfo, &s)) {
goto got_it;
}
s += (utf8_target) ? UTF8SKIP(s) : 1;
+ if (UNLIKELY(s >= reginfo->strend)) {
+ break;
+ }
}
if (utf8_target) {
@@ -2204,20 +2202,19 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
while (s < strend) {
WB_enum after = getWB_VAL_UTF8((U8*) s,
(U8*) reginfo->strend);
- if (to_complement ^ isWB(previous,
- before,
- after,
- (U8*) reginfo->strbeg,
- (U8*) s,
- (U8*) reginfo->strend,
- utf8_target))
+ if ((to_complement ^ isWB(previous,
+ before,
+ after,
+ (U8*) reginfo->strbeg,
+ (U8*) s,
+ (U8*) reginfo->strend,
+ utf8_target))
+ && (reginfo->intuit || regtry(reginfo, &s)))
{
- if (reginfo->intuit || regtry(reginfo, &s)) {
- goto got_it;
- }
- previous = before;
- before = after;
+ goto got_it;
}
+ previous = before;
+ before = after;
s += UTF8SKIP(s);
}
}
@@ -2226,27 +2223,24 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
WB_enum before = getWB_VAL_CP((U8) *(s -1));
while (s < strend) {
WB_enum after = getWB_VAL_CP((U8) *s);
- if (to_complement ^ isWB(previous,
- before,
- after,
- (U8*) reginfo->strbeg,
- (U8*) s,
- (U8*) reginfo->strend,
- utf8_target))
+ if ((to_complement ^ isWB(previous,
+ before,
+ after,
+ (U8*) reginfo->strbeg,
+ (U8*) s,
+ (U8*) reginfo->strend,
+ utf8_target))
+ && (reginfo->intuit || regtry(reginfo, &s)))
{
- if (reginfo->intuit || regtry(reginfo, &s)) {
- goto got_it;
- }
- previous = before;
- before = after;
+ goto got_it;
}
+ previous = before;
+ before = after;
s++;
}
}
- if (to_complement ^ cBOOL(reginfo->intuit
- || regtry(reginfo, &s)))
- {
+ if (reginfo->intuit || regtry(reginfo, &s)) {
goto got_it;
}
}
@@ -4743,10 +4737,24 @@ S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos,
* to look it up */
if (*previous != WB_UNKNOWN) {
wb = *previous;
- *previous = WB_UNKNOWN;
- /* XXX Note that doesn't change curpos, and maybe should */
- /* But we always back up over these two types */
+ /* But we need to move backwards by one */
+ if (utf8_target) {
+ *curpos = reghopmaybe3(*curpos, -1, strbeg);
+ if (! *curpos) {
+ *previous = WB_EDGE;
+ *curpos = (U8 *) strbeg;
+ }
+ else {
+ *previous = WB_UNKNOWN;
+ }
+ }
+ else {
+ (*curpos)--;
+ *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN;
+ }
+
+ /* And we always back up over these two types */
if (wb != WB_Extend && wb != WB_Format) {
return wb;
}
@@ -5610,8 +5618,10 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
case BOUNDU: /* /\b/u */
boundu:
- if (utf8_target) {
-
+ if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) {
+ match = FALSE;
+ }
+ else if (utf8_target) {
bound_utf8:
switch((bound_type) FLAGS(scan)) {
case TRADITIONAL_BOUND:
diff --git a/t/lib/warnings/regexec b/t/lib/warnings/regexec
index 1f3b65b167..900dd6ee7f 100644
--- a/t/lib/warnings/regexec
+++ b/t/lib/warnings/regexec
@@ -212,6 +212,7 @@ Use of \b{} or \B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at -
Use of \b{} or \B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 16.
Use of \b{} or \B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 17.
Use of \b{} or \B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 17.
+Use of \b{} or \B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 17.
########
# NAME (?[ ]) in non-UTF-8 locale
eval { require POSIX; POSIX->import("locale_h") };
diff --git a/t/re/re_tests b/t/re/re_tests
index 0dba2495a6..67ac57c08d 100644
--- a/t/re/re_tests
+++ b/t/re/re_tests
@@ -134,7 +134,14 @@ a[^]b]c adc y $& adc
\By\b xy y - -
\by\B yz y - -
\By\B xyz y - -
+\b n - -
+\b{gcb} n - -
+\b{sb} n - -
+\b{wb} n - -
\B y - -
+\B{gcb} y - -
+\B{sb} y - -
+\B{wb} y - -
\w a y - -
\w - n - -
\W a n - -
diff --git a/t/re/subst.t b/t/re/subst.t
index 59e11b56dc..f2bf0a2b54 100644
--- a/t/re/subst.t
+++ b/t/re/subst.t
@@ -9,7 +9,7 @@ BEGIN {
require './loc_tools.pl';
}
-plan( tests => 261 );
+plan( tests => 267 );
$_ = 'david';
$a = s/david/rules/r;
@@ -1035,6 +1035,25 @@ SKIP: {
is("$division$division$division" =~ s/\B/!/ugr, "!$division!$division!$division!", '\\B matches Latin1 before string, mid, and end, /u');
is("\x{2028}\x{2028}\x{2028}" =~ s/\B/!/ugr, "!\x{2028}!\x{2028}!\x{2028}!", '\\B matches above-Latin1 before string, mid, and end, /u');
+ fresh_perl_like( '$_=""; /\b{gcb}/; s///g', qr/^$/, {},
+ '[perl #126319: Segmentation fault in Perl_sv_catpvn_flags with \b{gcb}'
+ );
+ fresh_perl_like( '$_=""; /\B{gcb}/; s///g', qr/^$/, {},
+ '[perl #126319: Segmentation fault in Perl_sv_catpvn_flags with \b{gcb}'
+ );
+ fresh_perl_like( '$_=""; /\b{wb}/; s///g', qr/^$/, {},
+ '[perl #126319: Segmentation fault in Perl_sv_catpvn_flags with \b{wb}'
+ );
+ fresh_perl_like( '$_=""; /\B{wb}/; s///g', qr/^$/, {},
+ '[perl #126319: Segmentation fault in Perl_sv_catpvn_flags with \b{wb}'
+ );
+ fresh_perl_like( '$_=""; /\b{sb}/; s///g', qr/^$/, {},
+ '[perl #126319: Segmentation fault in Perl_sv_catpvn_flags with \b{sb}'
+ );
+ fresh_perl_like( '$_=""; /\B{sb}/; s///g', qr/^$/, {},
+ '[perl #126319: Segmentation fault in Perl_sv_catpvn_flags with \b{sb}'
+ );
+
SKIP: {
if (! locales_enabled('LC_ALL')) {
skip "Can't test locale (maybe you are missing POSIX)", 6;