regexec.c: Collapse some macros

By adding a utf8ness parameter, these 4 macros can be collapsed into 2 with no increase in run time: the parameter is always a compile-time constant, so modern compilers will optimize away the conditional.
author: Karl Williamson <khw@cpan.org> 2018-01-28 19:15:25 -0700
committer: Karl Williamson <khw@cpan.org> 2018-01-29 16:29:44 -0700
commit: da10aa09bd687402764bf887b625b8081dd8cf0a (patch)
tree: 54f4ae846f75bc8d04a8ae33e6091b7fb1f0f2f7 /regexec.c
parent: e4eb64812fa316ef6a2f62a20180e4f106fbd8b4 (diff)
download: perl-da10aa09bd687402764bf887b625b8081dd8cf0a.tar.gz
1 files changed, 34 insertions, 52 deletions
diff --git a/regexec.c b/regexec.c
index 6e8c83fd08..c28d245263 100644
--- a/regexec.c
+++ b/regexec.c
@@ -1715,23 +1715,16 @@ STMT_START {                                              \
     }                                                     \
 } STMT_END
 
-#define REXEC_FBC_UTF8_SCAN(CODE)                     \
-STMT_START {                                          \
-    while (s < strend) {                              \
-	CODE                                          \
-	s += UTF8SKIP(s);                             \
-    }                                                 \
-} STMT_END
+#define REXEC_FBC_SCAN(UTF8, CODE)                          \
+    STMT_START {                                            \
+        while (s < strend) {                                \
+            CODE                                            \
+            s += ((UTF8) ? UTF8SKIP(s) : 1);                \
+        }                                                   \
+    } STMT_END
 
-#define REXEC_FBC_SCAN(CODE)                          \
-STMT_START {                                          \
-    while (s < strend) {                              \
-	CODE                                          \
-	s++;                                          \
-    }                                                 \
-} STMT_END
 
-/* In the next few macros, 'try_it' is a bool indicating whether to actually
+/* In the next macro, 'try_it' is a bool indicating whether to actually
  * try the match or not.  It is used for when the flags indicate that only the
  * first occurrence of 'x' in a string of them should be considered for
  * matching.  try_it is initialized to 1, and set to 1 on every failure of the
@@ -1741,20 +1734,8 @@ STMT_START {                                          \
  * the first in a string; otherwise TRUE, so try_it will be 0 when the previous
  * thing was 'x' and we only want the first 'x' */
 
-#define REXEC_FBC_UTF8_CLASS_SCAN(COND)                        \
-REXEC_FBC_UTF8_SCAN( /* Loops while (s < strend) */            \
-    if (COND) {                                                \
-	if (try_it && (reginfo->intuit || regtry(reginfo, &s)))\
-	    goto got_it;                                       \
-	else                                                   \
-	    try_it = doevery;                                  \
-    }                                                          \
-    else                                                       \
-	try_it = 1;                                            \
-)
-
-#define REXEC_FBC_CLASS_SCAN(COND)                             \
-REXEC_FBC_SCAN( /* Loops while (s < strend) */                 \
+#define REXEC_FBC_CLASS_SCAN(UTF8, COND)                       \
+REXEC_FBC_SCAN(UTF8, /* Loops while (s < strend) */            \
     if (COND) {                                                \
 	if (try_it && (reginfo->intuit || regtry(reginfo, &s)))\
 	    goto got_it;                                       \
@@ -1767,10 +1748,10 @@ REXEC_FBC_SCAN( /* Loops while (s < strend) */                 \
 
 #define REXEC_FBC_CSCAN(CONDUTF8,COND)                         \
     if (utf8_target) {                                         \
-	REXEC_FBC_UTF8_CLASS_SCAN(CONDUTF8);                   \
+	REXEC_FBC_CLASS_SCAN(1, CONDUTF8);                     \
     }                                                          \
     else {                                                     \
-	REXEC_FBC_CLASS_SCAN(COND);                            \
+	REXEC_FBC_CLASS_SCAN(0, COND);                         \
     }
 
 /* The three macros below are slightly different versions of the same logic.
@@ -1801,7 +1782,7 @@ REXEC_FBC_SCAN( /* Loops while (s < strend) */                 \
  * here.  And vice-versa if we are looking for a non-boundary.
  *
  * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and
- * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of
+ * REXEC_FBC_SCAN(1,...) loops is a loop invariant, a bool giving the return of
  * TEST_NON_UTF8(s-1).  To see this, note that that's what it is defined to be
  * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
  * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
@@ -1812,7 +1793,7 @@ REXEC_FBC_SCAN( /* Loops while (s < strend) */                 \
 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)                         \
     tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                      \
     tmp = TEST_NON_UTF8(tmp);                                                  \
-    REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */                     \
+    REXEC_FBC_SCAN(1,  /* 1=>is-utf8; advances s while s < strend */           \
         if (tmp == ! TEST_NON_UTF8((U8) *s)) {                                 \
             tmp = !tmp;                                                        \
             IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */     \
@@ -1836,7 +1817,7 @@ REXEC_FBC_SCAN( /* Loops while (s < strend) */                 \
     }                                                                          \
     tmp = TEST_UV(tmp);                                                        \
     LOAD_UTF8_CHARCLASS_ALNUM();                                               \
-    REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */                     \
+    REXEC_FBC_SCAN(1,  /* 1=>is-utf8; advances s while s < strend */           \
         if (tmp == ! (TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) {          \
             tmp = !tmp;                                                        \
             IF_SUCCESS;                                                        \
@@ -1856,7 +1837,7 @@ REXEC_FBC_SCAN( /* Loops while (s < strend) */                 \
     else {  /* Not utf8 */                                                     \
 	tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                  \
 	tmp = TEST_NON_UTF8(tmp);                                              \
-	REXEC_FBC_SCAN( /* advances s while s < strend */                      \
+	REXEC_FBC_SCAN(0, /* 0=>not-utf8; advances s while s < strend */       \
 	    if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
 		IF_SUCCESS;                                                    \
 		tmp = !tmp;                                                    \
@@ -2039,14 +2020,14 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
     case ANYOFD:
     case ANYOF:
         if (utf8_target) {
-            REXEC_FBC_UTF8_CLASS_SCAN(
+            REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
                       reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
         }
         else if (ANYOF_FLAGS(c)) {
-            REXEC_FBC_CLASS_SCAN(reginclass(prog,c, (U8*)s, (U8*)s+1, 0));
+            REXEC_FBC_CLASS_SCAN(0, reginclass(prog,c, (U8*)s, (U8*)s+1, 0));
         }
         else {
-            REXEC_FBC_CLASS_SCAN(ANYOF_BITMAP_TEST(c, *((U8*)s)));
+            REXEC_FBC_CLASS_SCAN(0, ANYOF_BITMAP_TEST(c, *((U8*)s)));
         }
         break;
 
@@ -2558,8 +2539,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
         if (utf8_target) {
             /* The complement of something that matches only ASCII matches all
              * non-ASCII, plus everything in ASCII that isn't in the class. */
-            REXEC_FBC_UTF8_CLASS_SCAN(   ! isASCII_utf8_safe(s, strend)
-                                      || ! _generic_isCC_A(*s, FLAGS(c)));
+            REXEC_FBC_CLASS_SCAN(1,   ! isASCII_utf8_safe(s, strend)
+                                   || ! _generic_isCC_A(*s, FLAGS(c)));
             break;
         }
 
@@ -2572,12 +2553,12 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
          * as otherwise we would have to examine all the continuation
          * characters */
         if (utf8_target) {
-            REXEC_FBC_UTF8_CLASS_SCAN(_generic_isCC_A(*s, FLAGS(c)));
+            REXEC_FBC_CLASS_SCAN(1, _generic_isCC_A(*s, FLAGS(c)));
             break;
         }
 
       posixa:
-        REXEC_FBC_CLASS_SCAN(
+        REXEC_FBC_CLASS_SCAN(0, /* 0=>not-utf8 */
                         to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
         break;
 
@@ -2587,7 +2568,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
 
     case POSIXU:
         if (! utf8_target) {
-            REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
+            REXEC_FBC_CLASS_SCAN(0, /* 0=>not-utf8 */
+                                 to_complement ^ cBOOL(_generic_isCC(*s,
                                                                     FLAGS(c))));
         }
         else {
@@ -2600,7 +2582,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                     /* We avoid loading in the swash as long as possible, but
                      * should we have to, we jump to a separate loop.  This
                      * extra 'if' statement is what keeps this code from being
-                     * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
+                     * just a call to REXEC_FBC_CLASS_SCAN() */
                     if (UTF8_IS_ABOVE_LATIN1(*s)) {
                         goto found_above_latin1;
                     }
@@ -2628,27 +2610,27 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
             else switch (classnum) {    /* These classes are implemented as
                                            macros */
                 case _CC_ENUM_SPACE:
-                    REXEC_FBC_UTF8_CLASS_SCAN(
+                    REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
                         to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend)));
                     break;
 
                 case _CC_ENUM_BLANK:
-                    REXEC_FBC_UTF8_CLASS_SCAN(
+                    REXEC_FBC_CLASS_SCAN(1,
                         to_complement ^ cBOOL(isBLANK_utf8_safe(s, strend)));
                     break;
 
                 case _CC_ENUM_XDIGIT:
-                    REXEC_FBC_UTF8_CLASS_SCAN(
+                    REXEC_FBC_CLASS_SCAN(1,
                        to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend)));
                     break;
 
                 case _CC_ENUM_VERTSPACE:
-                    REXEC_FBC_UTF8_CLASS_SCAN(
+                    REXEC_FBC_CLASS_SCAN(1,
                        to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend)));
                     break;
 
                 case _CC_ENUM_CNTRL:
-                    REXEC_FBC_UTF8_CLASS_SCAN(
+                    REXEC_FBC_CLASS_SCAN(1,
                         to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend)));
                     break;
 
@@ -2673,7 +2655,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
         /* This is a copy of the loop above for swash classes, though using the
          * FBC macro instead of being expanded out.  Since we've loaded the
          * swash, we don't have to check for that each time through the loop */
-        REXEC_FBC_UTF8_CLASS_SCAN(
+        REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
                 to_complement ^ cBOOL(_generic_utf8_safe(
                                       classnum,
                                       s,
@@ -3400,7 +3382,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
                 to_utf8_substr(prog);
             }
             ch = SvPVX_const(prog->anchored_utf8)[0];
-	    REXEC_FBC_SCAN(
+	    REXEC_FBC_SCAN(0,   /* 0=>not-utf8 */
 		if (*s == ch) {
 		    DEBUG_EXECUTE_r( did_match = 1 );
 		    if (regtry(reginfo, &s)) goto got_it;
@@ -3418,7 +3400,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
                 }
             }
             ch = SvPVX_const(prog->anchored_substr)[0];
-	    REXEC_FBC_SCAN(
+	    REXEC_FBC_SCAN(0,   /* 0=>not-utf8 */
 		if (*s == ch) {
 		    DEBUG_EXECUTE_r( did_match = 1 );
 		    if (regtry(reginfo, &s)) goto got_it;
author	Karl Williamson <khw@cpan.org>	2018-01-28 19:15:25 -0700
committer	Karl Williamson <khw@cpan.org>	2018-01-29 16:29:44 -0700
commit	da10aa09bd687402764bf887b625b8081dd8cf0a (patch)
tree	54f4ae846f75bc8d04a8ae33e6091b7fb1f0f2f7 /regexec.c
parent	e4eb64812fa316ef6a2f62a20180e4f106fbd8b4 (diff)
download	perl-da10aa09bd687402764bf887b625b8081dd8cf0a.tar.gz