"" =~ /\b{gcb}/ should fail; same \b{wb}, \b{sb}

The Unicode standard indicates that these breaks should succeed at the beginning and end of text. It appears to me to be an oversight on their part not to make an exception when there is no actual text, i.e. when the string being matched is empty. (Their test suite does not cover this case.) I blindly implemented their algorithm for Perl 5.22, but having these boundaries match in an empty string really is the wrong thing to do.
author: Karl Williamson <khw@cpan.org> 2015-10-19 12:14:36 -0600
committer: Karl Williamson <khw@cpan.org> 2015-10-19 12:40:24 -0600
commit: a7a8bd1ed56dbdb7e63735924945bbb66b7e2e5c (patch)
tree: 533c7a08e31e28aa24888065b383635469d30617 /regexec.c
parent: 67481c39e5c4241caaadaabb962ba281af64d9aa (diff)
download: perl-a7a8bd1ed56dbdb7e63735924945bbb66b7e2e5c.tar.gz
1 files changed, 17 insertions, 7 deletions
diff --git a/regexec.c b/regexec.c
index 2af7653c60..85c31a69ba 100644
--- a/regexec.c
+++ b/regexec.c
@@ -2065,13 +2065,17 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                 FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
                 break;
             case GCB_BOUND:
-                if (s == reginfo->strbeg) { /* GCB always matches at begin and
-                                               end */
+                if (s == reginfo->strbeg) {
                     if (reginfo->intuit || regtry(reginfo, &s))
                     {
                         goto got_it;
                     }
+
+                    /* Didn't match.  Try at the next position (if there is one) */
                     s += (utf8_target) ? UTF8SKIP(s) : 1;
+                    if (UNLIKELY(s >= reginfo->strend)) {
+                        break;
+                    }
                 }
 
                 if (utf8_target) {
@@ -2112,13 +2116,14 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                 break;
 
             case SB_BOUND:
-                if (s == reginfo->strbeg) { /* SB always matches at beginning */
+                if (s == reginfo->strbeg) {
                     if (reginfo->intuit || regtry(reginfo, &s)) {
                         goto got_it;
                     }
-
-                    /* Didn't match.  Go try at the next position */
                     s += (utf8_target) ? UTF8SKIP(s) : 1;
+                    if (UNLIKELY(s >= reginfo->strend)) {
+                        break;
+                    }
                 }
 
                 if (utf8_target) {
@@ -2177,6 +2182,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                         goto got_it;
                     }
                     s += (utf8_target) ? UTF8SKIP(s) : 1;
+                    if (UNLIKELY(s >= reginfo->strend)) {
+                        break;
+                    }
                 }
 
                 if (utf8_target) {
@@ -5610,8 +5618,10 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 	case BOUNDU:  /*  /\b/u  */
 
           boundu:
-	    if (utf8_target) {
-
+            if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) {
+                match = FALSE;
+            }
+            else if (utf8_target) {
               bound_utf8:
                 switch((bound_type) FLAGS(scan)) {
                     case TRADITIONAL_BOUND:
author	Karl Williamson <khw@cpan.org>	2015-10-19 12:14:36 -0600
committer	Karl Williamson <khw@cpan.org>	2015-10-19 12:40:24 -0600
commit	a7a8bd1ed56dbdb7e63735924945bbb66b7e2e5c (patch)
tree	533c7a08e31e28aa24888065b383635469d30617 /regexec.c
parent	67481c39e5c4241caaadaabb962ba281af64d9aa (diff)
download	perl-a7a8bd1ed56dbdb7e63735924945bbb66b7e2e5c.tar.gz