Fix some UTF-8 bugs in qr// compiling

Prior to this commit, nextchar() blindly advanced the parse by one byte, and then looked to see if that starts a character that should be skipped. But if the character being parsed is multi-byte, it's going to advance to an interior byte, which is incorrect. So change the code so it advances across the whole current character, not just the first byte. And several places in the code that wanted to advance the parse from the current place to the end of any (?#...) comments, or white space under /x, would move the parse pointer back one byte, and call nextchar(). This sort of worked (but took up unnecessary CPU) as long as nextchar() always advanced by exactly one byte. But the previous paragraph showed why this is wrong. So change those calls to use the new function created in the previous commit, skip_to_be_ignored_text(), which advances from where we are now. I didn't come up with tests that consistently fail, but valgrind on various .t files in the test suite shows bounds errors fixed by this patch.
author: Karl Williamson <khw@cpan.org> 2015-09-20 22:49:08 -0600
committer: Karl Williamson <khw@cpan.org> 2015-10-11 10:48:31 -0600
commit: 5de9b4f7517b91856df72e5131b59b631ea72e5b (patch)
tree: 914dcfba455ab01f8cdd02bc1046a8bde2340845
parent: 3e699fa01bec7dca385966fb2e0bd8150ad039b6 (diff)
download: perl-5de9b4f7517b91856df72e5131b59b631ea72e5b.tar.gz
1 files changed, 19 insertions, 12 deletions
diff --git a/regcomp.c b/regcomp.c
index 91d91bc6cb..6de8920190 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -10117,8 +10117,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
 	    case '!':           /* (?!...) */
 		RExC_seen_zerolen++;
 		/* check if we're really just a "FAIL" assertion */
-		--RExC_parse;
-		nextchar(pRExC_state);
+                skip_to_be_ignored_text(pRExC_state, &RExC_parse,
+                                        FALSE /* Don't force to /x */ );
 	        if (*RExC_parse == ')') {
                     ret=reganode(pRExC_state, OPFAIL, 0);
 	            nextchar(pRExC_state);
@@ -10790,8 +10790,8 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
 
     *flagp = WORST;			/* Tentatively. */
 
-    RExC_parse--;
-    nextchar(pRExC_state);
+    skip_to_be_ignored_text(pRExC_state, &RExC_parse,
+                            FALSE /* Don't force to /x */ );
     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
 	flags &= ~TRYAGAIN;
         latest = regpiece(pRExC_state, &flags,depth+1);
@@ -11213,9 +11213,8 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
 	if (! node_p) {
             return FALSE;
         }
-        RExC_parse--;   /* Need to back off so nextchar() doesn't skip the
-                           current char */
-	nextchar(pRExC_state);
+	skip_to_be_ignored_text(pRExC_state, &RExC_parse,
+                               FALSE /* Don't force to /x */ );
 	*node_p = reg_node(pRExC_state, REG_ANY);
 	*flagp |= HASWIDTH|SIMPLE;
 	MARK_NAUGHTY(1);
@@ -12284,8 +12283,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                     /* override incorrect value set in reganode MJD */
                     Set_Node_Offset(ret, parse_start+1);
                     Set_Node_Cur_Length(ret, parse_start);
-		    RExC_parse--;
-		    nextchar(pRExC_state);
+                    skip_to_be_ignored_text(pRExC_state, &RExC_parse,
+                                            FALSE /* Don't force to /x */ );
 		}
 	    }
 	    break;
@@ -13049,7 +13048,9 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
 
 	    RExC_parse = p - 1;
             Set_Node_Cur_Length(ret, parse_start);
-	    nextchar(pRExC_state);
+	    RExC_parse = p;
+            skip_to_be_ignored_text(pRExC_state, &RExC_parse,
+                                    FALSE /* Don't force to /x */ );
 	    {
 		/* len is STRLEN which is unsigned, need to copy to signed */
 		IV iv = len;
@@ -16420,6 +16421,8 @@ S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state,
 
     PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT;
 
+    assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p));
+
     for (;;) {
 	if (RExC_end - (*p) >= 3
 	    && *(*p)     == '('
@@ -16457,7 +16460,7 @@ S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state,
    those two cases, the parse position is advanced beyond all such comments and
    white space.
 
-   This is the (?#...) and /x friendly way of saying RExC_parse++.
+   This is the UTF, (?#...), and /x friendly way of saying RExC_parse++.
 */
 
 STATIC void
@@ -16465,7 +16468,11 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state)
 {
     PERL_ARGS_ASSERT_NEXTCHAR;
 
-    RExC_parse++;
+    assert(   ! UTF
+           || UTF8_IS_INVARIANT(*RExC_parse)
+           || UTF8_IS_START(*RExC_parse));
+
+    RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
 
     skip_to_be_ignored_text(pRExC_state, &RExC_parse,
                             FALSE /* Don't assume /x */ );
author	Karl Williamson <khw@cpan.org>	2015-09-20 22:49:08 -0600
committer	Karl Williamson <khw@cpan.org>	2015-10-11 10:48:31 -0600
commit	5de9b4f7517b91856df72e5131b59b631ea72e5b (patch)
tree	914dcfba455ab01f8cdd02bc1046a8bde2340845
parent	3e699fa01bec7dca385966fb2e0bd8150ad039b6 (diff)
download	perl-5de9b4f7517b91856df72e5131b59b631ea72e5b.tar.gz