fix rt75680 - when working with utf8 strings one must always use s+=UTF8SKIP(s) to move to the next char

Most of the places in the regex code where we do the two types of increments are wrapped up in macros. Unfortunately these macros aren't suitable in this case because we use goto to jump inside the loop in some situations, and since this is a one-off case I figured the modest copy-and-paste involved was better than creating a new macro just for this case. There is still a possible bug here, marked by an XXX, which will need to be fixed once I find out the correct way to simulate strptr-- on a utf8 string. Additionally, I haven't found a test case that actually exposes this form of the bug. Moral of the story: utf8 makes string scanning awkward... and slow...
author: Yves Orton <demerphq@gmail.com> 2010-08-23 14:36:37 +0200
committer: Yves Orton <demerphq@gmail.com> 2010-08-23 14:42:20 +0200
commit: 92f3d4829170316374b610b3fc665389803d93f8 (patch)
tree: 75ec84dc6954d249d557a2b23893c9a87f910edf /regexec.c
parent: d67eb5f4852e0f62e6cd359eeceb88a45e6164ea (diff)
download: perl-92f3d4829170316374b610b3fc665389803d93f8.tar.gz
1 files changed, 56 insertions, 21 deletions
diff --git a/regexec.c b/regexec.c
index dd4ec41f95..35ef8d4b12 100644
--- a/regexec.c
+++ b/regexec.c
@@ -2018,33 +2018,68 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre
 	    end = HOP3c(strend, -dontbother, strbeg) - 1;
 	    /* for multiline we only have to try after newlines */
 	    if (prog->check_substr || prog->check_utf8) {
-		if (s == startpos)
-		    goto after_try;
-		while (1) {
-		    if (regtry(&reginfo, &s))
-			goto got_it;
-		  after_try:
-		    if (s > end)
-			goto phooey;
-		    if (prog->extflags & RXf_USE_INTUIT) {
-			s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
-			if (!s)
-			    goto phooey;
-		    }
-		    else
-			s++;
-		}		
-	    } else {
-		if (s > startpos)
+                /* because of the goto we can not easily reuse the macros for bifurcating the
+                   unicode/non-unicode match modes here like we do elsewhere - demerphq */
+                if (utf8_target) {
+                    if (s == startpos)
+                        goto after_try_utf8;
+                    while (1) {
+                        if (regtry(&reginfo, &s)) {
+                            goto got_it;
+                        }
+                      after_try_utf8:
+                        if (s > end) {
+                            goto phooey;
+                        }
+                        if (prog->extflags & RXf_USE_INTUIT) {
+                            s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
+                            if (!s) {
+                                goto phooey;
+                            }
+                        }
+                        else {
+                            s += UTF8SKIP(s);
+                        }
+                    }
+                } /* end search for check string in unicode */
+                else {
+                    if (s == startpos) {
+                        goto after_try_latin;
+                    }
+                    while (1) {
+                        if (regtry(&reginfo, &s)) {
+                            goto got_it;
+                        }
+                      after_try_latin:
+                        if (s > end) {
+                            goto phooey;
+                        }
+                        if (prog->extflags & RXf_USE_INTUIT) {
+                            s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
+                            if (!s) {
+                                goto phooey;
+                            }
+                        }
+                        else {
+                            s++;
+                        }
+                    }
+                } /* end search for check string in latin*/
+	    } /* end search for check string */
+	    else { /* search for newline */
+		if (s > startpos) {
+                    /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
 		    s--;
+		}
+                /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
 		while (s < end) {
 		    if (*s++ == '\n') {	/* don't need PL_utf8skip here */
 			if (regtry(&reginfo, &s))
 			    goto got_it;
 		    }
-		}		
-	    }
-	}
+		}
+	    } /* end search for newline */
+	} /* end anchored/multiline check string search */
 	goto phooey;
     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK)) 
     {
author	Yves Orton <demerphq@gmail.com>	2010-08-23 14:36:37 +0200
committer	Yves Orton <demerphq@gmail.com>	2010-08-23 14:42:20 +0200
commit	92f3d4829170316374b610b3fc665389803d93f8 (patch)
tree	75ec84dc6954d249d557a2b23893c9a87f910edf /regexec.c
parent	d67eb5f4852e0f62e6cd359eeceb88a45e6164ea (diff)
download	perl-92f3d4829170316374b610b3fc665389803d93f8.tar.gz