diff options
author | Karl Williamson <khw@cpan.org> | 2015-09-19 21:13:54 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2015-10-11 10:48:31 -0600 |
commit | f400bbf00eccb191120b1fd72ccece07f2f3e1c1 (patch) | |
tree | f02d91c539b4b06828f63031f5d2007fe44f98ae | |
parent | 652f02bd26f1b9e1efe1a3059bc460ba39e083d7 (diff) | |
download | perl-f400bbf00eccb191120b1fd72ccece07f2f3e1c1.tar.gz |
regcomp.c: refactor a static function
nextchar() advances the parse to the next byte beyond any ignorable
bytes, returning the parse pointer before the advancement.
I find this confusing, as
foo = nextchar();
reads as if foo should point to the next character, instead of the
character where the parse already is at. This functionality is hard for
a reader to grok, even if the name weren't misleading, as the place the
variable gets set in the source is far away from the call. It's clearer
to say
foo = current;
nextchar();
This has confused others as well, as in one place several commits have
been required to get it so it works properly, and games have been played
to back up the parse if it turns out it shouldn't have been advanced,
whereas it's better to check first, then advance if it is the right
thing to do. Ready-Fire-Aim is not a best practice.
This commit makes nextchar() return void, and changes the few places
where the en-passant value was used.
The new scheme is still buggy, as nextchar() only advances a single
byte, which may be the wrong thing to do when the pattern is UTF-8
encoded. More work is needed to be in a position to fix this. We have
only gotten away with this so far because apparently no one is using
non-ASCII white space under /x, and our meta characters are all ASCII,
and there are likely other things that reposition things to a character
boundary before problems have arisen.
-rw-r--r-- | embed.fnc | 2 | ||||
-rw-r--r-- | proto.h | 2 | ||||
-rw-r--r-- | regcomp.c | 25 |
3 files changed, 14 insertions, 15 deletions
@@ -2168,7 +2168,7 @@ Ei |void |alloc_maybe_populate_EXACT|NN RExC_state_t *pRExC_state \ |NN regnode *node|NN I32 *flagp|STRLEN len \ |UV code_point|bool downgradable Ein |U8 |compute_EXACTish|NN RExC_state_t *pRExC_state -Es |char * |nextchar |NN RExC_state_t *pRExC_state +Es |void |nextchar |NN RExC_state_t *pRExC_state Ein |char * |reg_skipcomment|NN RExC_state_t *pRExC_state|NN char * p Es |void |scan_commit |NN const RExC_state_t *pRExC_state \ |NN struct scan_data_t *data \ @@ -4730,7 +4730,7 @@ STATIC U32 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_ STATIC I32 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth); #define PERL_ARGS_ASSERT_MAKE_TRIE \ assert(pRExC_state); assert(startbranch); assert(first); assert(last); assert(tail) -STATIC char * S_nextchar(pTHX_ RExC_state_t *pRExC_state); +STATIC void S_nextchar(pTHX_ RExC_state_t *pRExC_state); #define PERL_ARGS_ASSERT_NEXTCHAR \ assert(pRExC_state) STATIC void S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state); @@ -10402,7 +10402,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) { /* (?(1)...) */ char c; - char *tmp; UV uv; if (grok_atoUV(RExC_parse, &uv, &endptr) && uv <= I32_MAX @@ -10416,14 +10415,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) ret = reganode(pRExC_state, GROUPP, parno); insert_if_check_paren: - if (*(tmp = nextchar(pRExC_state)) != ')') { - /* nextchar also skips comments, so undo its work - * and skip over the the next character. - */ - RExC_parse = tmp; + if (UCHARAT(RExC_parse) != ')') { RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; vFAIL("Switch condition not recognized"); } + nextchar(pRExC_state); insert_if: REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0)); br = regbranch(pRExC_state, &flags, 1,depth+1); @@ -10437,7 +10433,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } else REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0)); - c = *nextchar(pRExC_state); + c = UCHARAT(RExC_parse); + nextchar(pRExC_state); if (flags&HASWIDTH) *flagp |= HASWIDTH; if (c == '|') { @@ -10458,7 +10455,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) REGTAIL(pRExC_state, ret, lastbr); if (flags&HASWIDTH) *flagp |= HASWIDTH; - c = *nextchar(pRExC_state); + c = UCHARAT(RExC_parse); + nextchar(pRExC_state); } else lastbr = NULL; @@ -10731,10 +10729,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (DEPENDS_SEMANTICS && RExC_uni_semantics) { set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); } - if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') { + if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') { RExC_parse = oregcomp_parse; vFAIL("Unmatched ("); } + nextchar(pRExC_state); } else if (!paren && RExC_parse < RExC_end) { if (*RExC_parse == ')') { @@ -16416,13 +16415,13 @@ S_reg_skipcomment(RExC_state_t *pRExC_state, char* p) This is the (?#...) and /x friendly way of saying RExC_parse++. */ -STATIC char* +STATIC void S_nextchar(pTHX_ RExC_state_t *pRExC_state) { - char* const retval = RExC_parse++; - PERL_ARGS_ASSERT_NEXTCHAR; + RExC_parse++; + for (;;) { if (RExC_end - RExC_parse >= 3 && *RExC_parse == '(' @@ -16445,7 +16444,7 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state) continue; } } - return retval; + break; } } |