PATCH: [perl #126319] Seg fault

This is a merge into blead of a branch that fixes several errors in the \b{gcb}, \b{wb}, and \b{sb} constructs (and their corresponding \B{} forms) added in v5.22. Finding and fixing the bug in the ticket exposed several other bugs, so fixing just that one caused other tests to fail.
author: Karl Williamson <khw@cpan.org> 2015-10-19 12:41:10 -0600
committer: Karl Williamson <khw@cpan.org> 2015-10-19 12:41:10 -0600
commit: f0bd363c36d925d8d3dfe3b68715763c850b171a (patch)
tree: 5595b776fafd491cf69eaa693e5d68af667ea597
parent: 9dfbfb6e46f94347d97400b65ab59a8bd120948d (diff)
parent: 139a998acd6eae73587ff4f048925394f73682d9 (diff)
download: perl-f0bd363c36d925d8d3dfe3b68715763c850b171a.tar.gz
4 files changed, 118 insertions, 81 deletions
diff --git a/regexec.c b/regexec.c
index 9f4d395fa7..85c31a69ba 100644
--- a/regexec.c
+++ b/regexec.c
@@ -2065,14 +2065,17 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                 FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
                 break;
             case GCB_BOUND:
-                if (s == reginfo->strbeg) { /* GCB always matches at begin and
-                                               end */
-                    if (to_complement ^ cBOOL(reginfo->intuit
-                                                      || regtry(reginfo, &s)))
+                if (s == reginfo->strbeg) {
+                    if (reginfo->intuit || regtry(reginfo, &s))
                     {
                         goto got_it;
                     }
+
+                    /* Didn't match.  Try at the next position (if there is one) */
                     s += (utf8_target) ? UTF8SKIP(s) : 1;
+                    if (UNLIKELY(s >= reginfo->strend)) {
+                        break;
+                    }
                 }
 
                 if (utf8_target) {
@@ -2083,46 +2086,44 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                     while (s < strend) {
                         GCB_enum after = getGCB_VAL_UTF8((U8*) s,
                                                         (U8*) reginfo->strend);
-                        if (to_complement ^ isGCB(before, after)) {
-                            if (reginfo->intuit || regtry(reginfo, &s)) {
-                                goto got_it;
-                            }
-                            before = after;
+                        if (   (to_complement ^ isGCB(before, after))
+                            && (reginfo->intuit || regtry(reginfo, &s)))
+                        {
+                            goto got_it;
                         }
+                        before = after;
                         s += UTF8SKIP(s);
                     }
                 }
                 else {  /* Not utf8.  Everything is a GCB except between CR and
                            LF */
                     while (s < strend) {
-                        if (to_complement ^ (UCHARAT(s - 1) != '\r'
-                                             || UCHARAT(s) != '\n'))
+                        if ((to_complement ^ (   UCHARAT(s - 1) != '\r'
+                                              || UCHARAT(s) != '\n'))
+                            && (reginfo->intuit || regtry(reginfo, &s)))
                         {
-                            if (reginfo->intuit || regtry(reginfo, &s)) {
-                                goto got_it;
-                            }
-                            s++;
+                            goto got_it;
                         }
+                        s++;
                     }
                 }
 
                 /* And, since this is a bound, it can match after the final
                  * character in the string */
-                if (to_complement ^ cBOOL(reginfo->intuit || regtry(reginfo, &s))) {
+                if ((reginfo->intuit || regtry(reginfo, &s))) {
                     goto got_it;
                 }
                 break;
 
             case SB_BOUND:
-                if (s == reginfo->strbeg) { /* SB always matches at beginning */
-                    if (to_complement
-                                ^ cBOOL(reginfo->intuit || regtry(reginfo, &s)))
-                    {
+                if (s == reginfo->strbeg) {
+                    if (reginfo->intuit || regtry(reginfo, &s)) {
                         goto got_it;
                     }
-
-                    /* Didn't match.  Go try at the next position */
                     s += (utf8_target) ? UTF8SKIP(s) : 1;
+                    if (UNLIKELY(s >= reginfo->strend)) {
+                        break;
+                    }
                 }
 
                 if (utf8_target) {
@@ -2133,18 +2134,17 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                     while (s < strend) {
                         SB_enum after = getSB_VAL_UTF8((U8*) s,
                                                          (U8*) reginfo->strend);
-                        if (to_complement ^ isSB(before,
-                                                 after,
-                                                 (U8*) reginfo->strbeg,
-                                                 (U8*) s,
-                                                 (U8*) reginfo->strend,
-                                                 utf8_target))
+                        if ((to_complement ^ isSB(before,
+                                                  after,
+                                                  (U8*) reginfo->strbeg,
+                                                  (U8*) s,
+                                                  (U8*) reginfo->strend,
+                                                  utf8_target))
+                            && (reginfo->intuit || regtry(reginfo, &s)))
                         {
-                            if (reginfo->intuit || regtry(reginfo, &s)) {
-                                goto got_it;
-                            }
-                            before = after;
+                            goto got_it;
                         }
+                        before = after;
                         s += UTF8SKIP(s);
                     }
                 }
@@ -2152,18 +2152,17 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                     SB_enum before = getSB_VAL_CP((U8) *(s -1));
                     while (s < strend) {
                         SB_enum after = getSB_VAL_CP((U8) *s);
-                        if (to_complement ^ isSB(before,
-                                                 after,
-                                                 (U8*) reginfo->strbeg,
-                                                 (U8*) s,
-                                                 (U8*) reginfo->strend,
-                                                 utf8_target))
+                        if ((to_complement ^ isSB(before,
+                                                  after,
+                                                  (U8*) reginfo->strbeg,
+                                                  (U8*) s,
+                                                  (U8*) reginfo->strend,
+                                                  utf8_target))
+                            && (reginfo->intuit || regtry(reginfo, &s)))
                         {
-                            if (reginfo->intuit || regtry(reginfo, &s)) {
-                                goto got_it;
-                            }
-                            before = after;
+                            goto got_it;
                         }
+                        before = after;
                         s++;
                     }
                 }
@@ -2171,9 +2170,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                 /* Here are at the final position in the target string.  The SB
                  * value is always true here, so matches, depending on other
                  * constraints */
-                if (to_complement ^ cBOOL(reginfo->intuit
-                                                      || regtry(reginfo, &s)))
-                {
+                if (reginfo->intuit || regtry(reginfo, &s)) {
                     goto got_it;
                 }
 
@@ -2181,12 +2178,13 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
 
             case WB_BOUND:
                 if (s == reginfo->strbeg) {
-                    if (to_complement ^ cBOOL(reginfo->intuit
-                                              || regtry(reginfo, &s)))
-                    {
+                    if (reginfo->intuit || regtry(reginfo, &s)) {
                         goto got_it;
                     }
                     s += (utf8_target) ? UTF8SKIP(s) : 1;
+                    if (UNLIKELY(s >= reginfo->strend)) {
+                        break;
+                    }
                 }
 
                 if (utf8_target) {
@@ -2204,20 +2202,19 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                     while (s < strend) {
                         WB_enum after = getWB_VAL_UTF8((U8*) s,
                                                         (U8*) reginfo->strend);
-                        if (to_complement ^ isWB(previous,
-                                                 before,
-                                                 after,
-                                                 (U8*) reginfo->strbeg,
-                                                 (U8*) s,
-                                                 (U8*) reginfo->strend,
-                                                 utf8_target))
+                        if ((to_complement ^ isWB(previous,
+                                                  before,
+                                                  after,
+                                                  (U8*) reginfo->strbeg,
+                                                  (U8*) s,
+                                                  (U8*) reginfo->strend,
+                                                  utf8_target))
+                            && (reginfo->intuit || regtry(reginfo, &s)))
                         {
-                            if (reginfo->intuit || regtry(reginfo, &s)) {
-                                goto got_it;
-                            }
-                            previous = before;
-                            before = after;
+                            goto got_it;
                         }
+                        previous = before;
+                        before = after;
                         s += UTF8SKIP(s);
                     }
                 }
@@ -2226,27 +2223,24 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                     WB_enum before = getWB_VAL_CP((U8) *(s -1));
                     while (s < strend) {
                         WB_enum after = getWB_VAL_CP((U8) *s);
-                        if (to_complement ^ isWB(previous,
-                                                 before,
-                                                 after,
-                                                 (U8*) reginfo->strbeg,
-                                                 (U8*) s,
-                                                 (U8*) reginfo->strend,
-                                                 utf8_target))
+                        if ((to_complement ^ isWB(previous,
+                                                  before,
+                                                  after,
+                                                  (U8*) reginfo->strbeg,
+                                                  (U8*) s,
+                                                  (U8*) reginfo->strend,
+                                                  utf8_target))
+                            && (reginfo->intuit || regtry(reginfo, &s)))
                         {
-                            if (reginfo->intuit || regtry(reginfo, &s)) {
-                                goto got_it;
-                            }
-                            previous = before;
-                            before = after;
+                            goto got_it;
                         }
+                        previous = before;
+                        before = after;
                         s++;
                     }
                 }
 
-                if (to_complement ^ cBOOL(reginfo->intuit
-                                          || regtry(reginfo, &s)))
-                {
+                if (reginfo->intuit || regtry(reginfo, &s)) {
                     goto got_it;
                 }
         }
@@ -4743,10 +4737,24 @@ S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos,
         * to look it up */
     if (*previous != WB_UNKNOWN) {
         wb = *previous;
-        *previous = WB_UNKNOWN;
-        /* XXX Note that doesn't change curpos, and maybe should */
 
-        /* But we always back up over these two types */
+        /* But we need to move backwards by one */
+        if (utf8_target) {
+            *curpos = reghopmaybe3(*curpos, -1, strbeg);
+            if (! *curpos) {
+                *previous = WB_EDGE;
+                *curpos = (U8 *) strbeg;
+            }
+            else {
+                *previous = WB_UNKNOWN;
+            }
+        }
+        else {
+            (*curpos)--;
+            *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN;
+        }
+
+        /* And we always back up over these two types */
         if (wb != WB_Extend && wb != WB_Format) {
             return wb;
         }
@@ -5610,8 +5618,10 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 	case BOUNDU:  /*  /\b/u  */
 
           boundu:
-	    if (utf8_target) {
-
+            if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) {
+                match = FALSE;
+            }
+            else if (utf8_target) {
               bound_utf8:
                 switch((bound_type) FLAGS(scan)) {
                     case TRADITIONAL_BOUND:
diff --git a/t/lib/warnings/regexec b/t/lib/warnings/regexec
index 1f3b65b167..900dd6ee7f 100644
--- a/t/lib/warnings/regexec
+++ b/t/lib/warnings/regexec
@@ -212,6 +212,7 @@ Use of \b{} or \B{} for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale at -
 Use of \b{} or \B{} for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale at - line 16.
 Use of \b{} or \B{} for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale at - line 17.
 Use of \b{} or \B{} for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale at - line 17.
+Use of \b{} or \B{} for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale at - line 17.
 ########
 # NAME (?[ ]) in non-UTF-8 locale
 eval { require POSIX; POSIX->import("locale_h") };
diff --git a/t/re/re_tests b/t/re/re_tests
index 0dba2495a6..67ac57c08d 100644
--- a/t/re/re_tests
+++ b/t/re/re_tests
@@ -134,7 +134,14 @@ a[^]b]c	adc	y	$&	adc
 \By\b	xy	y	-	-
 \by\B	yz	y	-	-
 \By\B	xyz	y	-	-
+\b		n	-	-
+\b{gcb}		n	-	-
+\b{sb}		n	-	-
+\b{wb}		n	-	-
 \B		y	-	-
+\B{gcb}		y	-	-
+\B{sb}		y	-	-
+\B{wb}		y	-	-
 \w	a	y	-	-
 \w	-	n	-	-
 \W	a	n	-	-
diff --git a/t/re/subst.t b/t/re/subst.t
index 59e11b56dc..f2bf0a2b54 100644
--- a/t/re/subst.t
+++ b/t/re/subst.t
@@ -9,7 +9,7 @@ BEGIN {
     require './loc_tools.pl';
 }
 
-plan( tests => 261 );
+plan( tests => 267 );
 
 $_ = 'david';
 $a = s/david/rules/r;
@@ -1035,6 +1035,25 @@ SKIP: {
     is("$division$division$division" =~ s/\B/!/ugr, "!$division!$division!$division!", '\\B matches Latin1 before string, mid, and end, /u');
     is("\x{2028}\x{2028}\x{2028}" =~ s/\B/!/ugr, "!\x{2028}!\x{2028}!\x{2028}!", '\\B matches above-Latin1 before string, mid, and end, /u');
 
+    fresh_perl_like( '$_=""; /\b{gcb}/;  s///g', qr/^$/, {},
+        '[perl #126319: Segmentation fault in Perl_sv_catpvn_flags with \b{gcb}'
+    );
+    fresh_perl_like( '$_=""; /\B{gcb}/;  s///g', qr/^$/, {},
+        '[perl #126319: Segmentation fault in Perl_sv_catpvn_flags with \b{gcb}'
+    );
+    fresh_perl_like( '$_=""; /\b{wb}/;  s///g', qr/^$/, {},
+        '[perl #126319: Segmentation fault in Perl_sv_catpvn_flags with \b{wb}'
+    );
+    fresh_perl_like( '$_=""; /\B{wb}/;  s///g', qr/^$/, {},
+        '[perl #126319: Segmentation fault in Perl_sv_catpvn_flags with \b{wb}'
+    );
+    fresh_perl_like( '$_=""; /\b{sb}/;  s///g', qr/^$/, {},
+        '[perl #126319: Segmentation fault in Perl_sv_catpvn_flags with \b{sb}'
+    );
+    fresh_perl_like( '$_=""; /\B{sb}/;  s///g', qr/^$/, {},
+        '[perl #126319: Segmentation fault in Perl_sv_catpvn_flags with \b{sb}'
+    );
+
 SKIP: {
     if (! locales_enabled('LC_ALL')) {
         skip "Can't test locale (maybe you are missing POSIX)", 6;
author	Karl Williamson <khw@cpan.org>	2015-10-19 12:41:10 -0600
committer	Karl Williamson <khw@cpan.org>	2015-10-19 12:41:10 -0600
commit	f0bd363c36d925d8d3dfe3b68715763c850b171a (patch)
tree	5595b776fafd491cf69eaa693e5d68af667ea597
parent	9dfbfb6e46f94347d97400b65ab59a8bd120948d (diff)
parent	139a998acd6eae73587ff4f048925394f73682d9 (diff)
download	perl-f0bd363c36d925d8d3dfe3b68715763c850b171a.tar.gz