diff options
author | Karl Williamson <khw@cpan.org> | 2015-09-20 22:49:08 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2015-10-11 10:48:31 -0600 |
commit | 5de9b4f7517b91856df72e5131b59b631ea72e5b (patch) | |
tree | 914dcfba455ab01f8cdd02bc1046a8bde2340845 | |
parent | 3e699fa01bec7dca385966fb2e0bd8150ad039b6 (diff) | |
download | perl-5de9b4f7517b91856df72e5131b59b631ea72e5b.tar.gz |
Fix some UTF-8 bugs in qr// compiling
Prior to this commit, nextchar() blindly advanced the parse by one byte,
and then looked to see if that starts a character that should be skipped.
But if the character being parsed is multi-byte, it's going to advance
to an interior byte, which is incorrect. So change the code so it
advances across the whole current character, not just the first byte.
And several places in the code that wanted to advance the parse from the
current place to the end of any (#...) comments, or white space under
/x, would move the parse pointer back one byte, and call nextchar().
This sort of worked (but used unnecessary CPU) as long as nextchar()
always advanced by exactly one byte. But the previous paragraph showed
why this is wrong. So change those calls to use the new function that
advances from where we are now, created in the previous commit.
I didn't come up with tests that consistently fail, but valgrind on
various .t files in the test suite shows bounds errors fixed by this
patch.
-rw-r--r-- | regcomp.c | 31 |
1 files changed, 19 insertions, 12 deletions
@@ -10117,8 +10117,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) case '!': /* (?!...) */ RExC_seen_zerolen++; /* check if we're really just a "FAIL" assertion */ - --RExC_parse; - nextchar(pRExC_state); + skip_to_be_ignored_text(pRExC_state, &RExC_parse, + FALSE /* Don't force to /x */ ); if (*RExC_parse == ')') { ret=reganode(pRExC_state, OPFAIL, 0); nextchar(pRExC_state); @@ -10790,8 +10790,8 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) *flagp = WORST; /* Tentatively. */ - RExC_parse--; - nextchar(pRExC_state); + skip_to_be_ignored_text(pRExC_state, &RExC_parse, + FALSE /* Don't force to /x */ ); while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') { flags &= ~TRYAGAIN; latest = regpiece(pRExC_state, &flags,depth+1); @@ -11213,9 +11213,8 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, if (! node_p) { return FALSE; } - RExC_parse--; /* Need to back off so nextchar() doesn't skip the - current char */ - nextchar(pRExC_state); + skip_to_be_ignored_text(pRExC_state, &RExC_parse, + FALSE /* Don't force to /x */ ); *node_p = reg_node(pRExC_state, REG_ANY); *flagp |= HASWIDTH|SIMPLE; MARK_NAUGHTY(1); @@ -12284,8 +12283,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) /* override incorrect value set in reganode MJD */ Set_Node_Offset(ret, parse_start+1); Set_Node_Cur_Length(ret, parse_start); - RExC_parse--; - nextchar(pRExC_state); + skip_to_be_ignored_text(pRExC_state, &RExC_parse, + FALSE /* Don't force to /x */ ); } } break; @@ -13049,7 +13048,9 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) RExC_parse = p - 1; Set_Node_Cur_Length(ret, parse_start); - nextchar(pRExC_state); + RExC_parse = p; + skip_to_be_ignored_text(pRExC_state, &RExC_parse, + FALSE /* Don't force to /x */ ); { /* len is STRLEN which is unsigned, need to copy to signed */ IV iv = len; @@ -16420,6 +16421,8 @@ S_skip_to_be_ignored_text(pTHX_ RExC_state_t 
*pRExC_state, PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT; + assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p)); + for (;;) { if (RExC_end - (*p) >= 3 && *(*p) == '(' @@ -16457,7 +16460,7 @@ S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state, those two cases, the parse position is advanced beyond all such comments and white space. - This is the (?#...) and /x friendly way of saying RExC_parse++. + This is the UTF, (?#...), and /x friendly way of saying RExC_parse++. */ STATIC void @@ -16465,7 +16468,11 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state) { PERL_ARGS_ASSERT_NEXTCHAR; - RExC_parse++; + assert( ! UTF + || UTF8_IS_INVARIANT(*RExC_parse) + || UTF8_IS_START(*RExC_parse)); + + RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; skip_to_be_ignored_text(pRExC_state, &RExC_parse, FALSE /* Don't assume /x */ ); |