regcomp.c: refactor a static function

nextchar() advances the parse to the next byte beyond any ignorable bytes, returning the parse pointer before the advancement. I find this confusing, as foo = nextchar(); reads as if foo should point to the next character, instead of the character where the parse already is at. This functionality is hard for a reader to grok, even if the name weren't misleading, as the place the variable gets set in the source is far away from the call. It's clearer to say foo = current; nextchar(); This has confused others as well, as in one place several commits have been required to get it so it works properly, and games have been played to back up the parse if it turns out it shouldn't have been advanced, whereas it's better to check first, then advance if it is the right thing to do. Ready-Fire-Aim is not a best practice. This commit makes nextchar() return void, and changes the few places where the en-passant value was used. The new scheme is still buggy, as nextchar() only advances a single byte, which may be the wrong thing to do when the pattern is UTF-8 encoded. More work is needed to be in a position to fix this. We have only gotten away with this so far because apparently no one is using non-ASCII white space under /x, and our meta characters are all ASCII, and there are likely other things that reposition things to a character boundary before problems have arisen.
author: Karl Williamson <khw@cpan.org> 2015-09-19 21:13:54 -0600
committer: Karl Williamson <khw@cpan.org> 2015-10-11 10:48:31 -0600
commit: f400bbf00eccb191120b1fd72ccece07f2f3e1c1 (patch)
tree: f02d91c539b4b06828f63031f5d2007fe44f98ae
parent: 652f02bd26f1b9e1efe1a3059bc460ba39e083d7 (diff)
download: perl-f400bbf00eccb191120b1fd72ccece07f2f3e1c1.tar.gz
3 files changed, 14 insertions, 15 deletions
diff --git a/embed.fnc b/embed.fnc
index 6cc7e2bd0c..64837f7e47 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -2168,7 +2168,7 @@ Ei	|void   |alloc_maybe_populate_EXACT|NN RExC_state_t *pRExC_state \
 				|NN regnode *node|NN I32 *flagp|STRLEN len \
 				|UV code_point|bool downgradable
 Ein	|U8   |compute_EXACTish|NN RExC_state_t *pRExC_state
-Es	|char *	|nextchar	|NN RExC_state_t *pRExC_state
+Es	|void	|nextchar	|NN RExC_state_t *pRExC_state
 Ein	|char *	|reg_skipcomment|NN RExC_state_t *pRExC_state|NN char * p
 Es	|void	|scan_commit	|NN const RExC_state_t *pRExC_state \
 				|NN struct scan_data_t *data        \
diff --git a/proto.h b/proto.h
index d8d29e38a8..57ed19b302 100644
--- a/proto.h
+++ b/proto.h
@@ -4730,7 +4730,7 @@ STATIC U32	S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_
 STATIC I32	S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth);
 #define PERL_ARGS_ASSERT_MAKE_TRIE	\
 	assert(pRExC_state); assert(startbranch); assert(first); assert(last); assert(tail)
-STATIC char *	S_nextchar(pTHX_ RExC_state_t *pRExC_state);
+STATIC void	S_nextchar(pTHX_ RExC_state_t *pRExC_state);
 #define PERL_ARGS_ASSERT_NEXTCHAR	\
 	assert(pRExC_state)
 STATIC void	S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state);
diff --git a/regcomp.c b/regcomp.c
index 52334996f5..855c26b142 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -10402,7 +10402,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
 		else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
                     /* (?(1)...) */
 		    char c;
-		    char *tmp;
                     UV uv;
                     if (grok_atoUV(RExC_parse, &uv, &endptr)
                         && uv <= I32_MAX
@@ -10416,14 +10415,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                     ret = reganode(pRExC_state, GROUPP, parno);
 
                  insert_if_check_paren:
-		    if (*(tmp = nextchar(pRExC_state)) != ')') {
-                        /* nextchar also skips comments, so undo its work
-                         * and skip over the the next character.
-                         */
-                        RExC_parse = tmp;
+		    if (UCHARAT(RExC_parse) != ')') {
                         RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
 			vFAIL("Switch condition not recognized");
 		    }
+		    nextchar(pRExC_state);
 		  insert_if:
                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
                     br = regbranch(pRExC_state, &flags, 1,depth+1);
@@ -10437,7 +10433,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                     } else
                         REGTAIL(pRExC_state, br, reganode(pRExC_state,
                                                           LONGJMP, 0));
-		    c = *nextchar(pRExC_state);
+		    c = UCHARAT(RExC_parse);
+                    nextchar(pRExC_state);
 		    if (flags&HASWIDTH)
 			*flagp |= HASWIDTH;
 		    if (c == '|') {
@@ -10458,7 +10455,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                         REGTAIL(pRExC_state, ret, lastbr);
 		 	if (flags&HASWIDTH)
 			    *flagp |= HASWIDTH;
-			c = *nextchar(pRExC_state);
+                        c = UCHARAT(RExC_parse);
+                        nextchar(pRExC_state);
 		    }
 		    else
 			lastbr = NULL;
@@ -10731,10 +10729,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
         if (DEPENDS_SEMANTICS && RExC_uni_semantics) {
             set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
         }
-	if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
+	if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
 	    RExC_parse = oregcomp_parse;
 	    vFAIL("Unmatched (");
 	}
+	nextchar(pRExC_state);
     }
     else if (!paren && RExC_parse < RExC_end) {
 	if (*RExC_parse == ')') {
@@ -16416,13 +16415,13 @@ S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
    This is the (?#...) and /x friendly way of saying RExC_parse++.
 */
 
-STATIC char*
+STATIC void
 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
 {
-    char* const retval = RExC_parse++;
-
     PERL_ARGS_ASSERT_NEXTCHAR;
 
+    RExC_parse++;
+
     for (;;) {
 	if (RExC_end - RExC_parse >= 3
 	    && *RExC_parse == '('
@@ -16445,7 +16444,7 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state)
                 continue;
             }
 	}
-        return retval;
+        break;
     }
 }
author	Karl Williamson <khw@cpan.org>	2015-09-19 21:13:54 -0600
committer	Karl Williamson <khw@cpan.org>	2015-10-11 10:48:31 -0600
commit	f400bbf00eccb191120b1fd72ccece07f2f3e1c1 (patch)
tree	f02d91c539b4b06828f63031f5d2007fe44f98ae
parent	652f02bd26f1b9e1efe1a3059bc460ba39e083d7 (diff)
download	perl-f400bbf00eccb191120b1fd72ccece07f2f3e1c1.tar.gz