regexec.c: More cleaning of FBC macro/code interface

The definition of \w is now compiled into the Perl core. This allows the complicated swash_fetch function call to be replaced by isWORDCHAR_utf8, which takes a single parameter, so the interface can be simplified [1]. This macro will execute faster on Latin1-range inputs, as it doesn't do a swash_fetch on them, but slower on other code points due to function call overhead and some error checking, currently in place, that wasn't done previously. This overhead could be removed by using inline functions, and perhaps a different interface for known non-malformed input (though I'm actually not sure the input is known to be well-formed in this case). These macros still depend on and modify outside variables. That could be cleaned up by adding additional parameters to them, but I'm not going to do it now. I don't like these kinds of code-generating macros, and have been tempted to rewrite these as inline functions, but it's not a trivial task to do. [1] I hadn't realized it before, but the interface could have been cleaned up instead by introducing a macro that makes it look like a single parameter is used, uniformly with the existing macros, looking like: #define FBC_BOUND_SWASH_FETCH(s) \ cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], s, utf8_target)) But it seems better to me to use isWORDCHAR_utf8, as it is faster for Western European languages, and can be made nearly the same speed as the alternative if experience tells us that this is a slow spot that should be sped up.
author: Karl Williamson <khw@cpan.org> 2014-06-22 19:41:25 -0600
committer: Karl Williamson <khw@cpan.org> 2014-06-26 18:09:19 -0600
commit: 236d82fd88ad0b9cb2804ba94207e47da102e6c5 (patch)
tree: f4c2f85793adfc8f5cf3f97dcff09c92cd79cd2e /regexec.c
parent: 5b8bb145554d90d7a985c271cf673ab2b63413d1 (diff)
download: perl-236d82fd88ad0b9cb2804ba94207e47da102e6c5.tar.gz
1 files changed, 18 insertions, 35 deletions
diff --git a/regexec.c b/regexec.c
index 0dbef02fc6..06d90d3bae 100644
--- a/regexec.c
+++ b/regexec.c
@@ -1583,7 +1583,7 @@ if ((reginfo->intuit || regtry(reginfo, &s))) \
 	    }                                                                  \
 	);                                                                     \
 
-#define FBC_UTF8(TEST_UV, TEST2_UTF8, IF_SUCCESS, IF_FAIL)                     \
+#define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL)                      \
 	if (s == reginfo->strbeg) {                                            \
 	    tmp = '\n';                                                        \
 	}                                                                      \
@@ -1595,7 +1595,7 @@ if ((reginfo->intuit || regtry(reginfo, &s))) \
 	tmp = TEST_UV(tmp);                                                    \
 	LOAD_UTF8_CHARCLASS_ALNUM();                                           \
 	REXEC_FBC_UTF8_SCAN(                                                   \
-	    if (tmp == ! (TEST2_UTF8)) {                                       \
+	    if (tmp == ! (TEST_UTF8((U8 *) s))) {                              \
 		tmp = !tmp;                                                    \
 		IF_SUCCESS;                                                    \
 	    }                                                                  \
@@ -1608,21 +1608,19 @@ if ((reginfo->intuit || regtry(reginfo, &s))) \
  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
  * NBOUND.  This is accomplished by passing it in either the if or else clause,
  * with the other one being empty */
-#define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST2_UTF8) \
-    FBC_BOUND_COMMON(FBC_UTF8(TEST_UV, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
+#define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
+    FBC_BOUND_COMMON(FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
 
-#define FBC_BOUND_A(TEST_NON_UTF8, TEST_UV, TEST2_UTF8) \
+#define FBC_BOUND_A(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
     FBC_BOUND_COMMON(FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
 
-#define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST2_UTF8) \
-    FBC_BOUND_COMMON(FBC_UTF8(TEST_UV, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
+#define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
+    FBC_BOUND_COMMON(FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
 
-#define FBC_NBOUND_A(TEST_NON_UTF8, TEST_UV, TEST2_UTF8) \
+#define FBC_NBOUND_A(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
     FBC_BOUND_COMMON(FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
 
-/* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
- * be passed in completely with the variable name being tested, which isn't
- * such a clean interface, but this is easier to read than it was before.  We
+/* Common to the BOUND and NBOUND cases.  We
  * are looking for the boundary (or non-boundary between a word and non-word
  * character.  The utf8 and non-utf8 cases have the same logic, but the details
  * must be different.  Find the "wordness" of the character just prior to this
@@ -1842,45 +1840,30 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
         }
         break;
     }
+
     case BOUNDL:
-        FBC_BOUND(isWORDCHAR_LC,
-                  isWORDCHAR_LC_uvchr,
-                  isWORDCHAR_LC_utf8((U8*)s));
+        FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
         break;
     case NBOUNDL:
-        FBC_NBOUND(isWORDCHAR_LC,
-                   isWORDCHAR_LC_uvchr,
-                   isWORDCHAR_LC_utf8((U8*)s));
+        FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
         break;
     case BOUND:
-        FBC_BOUND(isWORDCHAR,
-                  isWORDCHAR_uni,
-                  cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
+        FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
         break;
     case BOUNDA:
-        FBC_BOUND_A(isWORDCHAR_A,
-                    isWORDCHAR_A,
-                    isWORDCHAR_A((U8*)s));
+        FBC_BOUND_A(isWORDCHAR_A, isWORDCHAR_A, isWORDCHAR_A);
         break;
     case NBOUND:
-        FBC_NBOUND(isWORDCHAR,
-                   isWORDCHAR_uni,
-                   cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
+        FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
         break;
     case NBOUNDA:
-        FBC_NBOUND_A(isWORDCHAR_A,
-                     isWORDCHAR_A,
-                     isWORDCHAR_A((U8*)s));
+        FBC_NBOUND_A(isWORDCHAR_A, isWORDCHAR_A, isWORDCHAR_A);
         break;
     case BOUNDU:
-        FBC_BOUND(isWORDCHAR_L1,
-                  isWORDCHAR_uni,
-                  cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
+        FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
         break;
     case NBOUNDU:
-        FBC_NBOUND(isWORDCHAR_L1,
-                   isWORDCHAR_uni,
-                   cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
+        FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
         break;
     case LNBREAK:
         REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
author	Karl Williamson <khw@cpan.org>	2014-06-22 19:41:25 -0600
committer	Karl Williamson <khw@cpan.org>	2014-06-26 18:09:19 -0600
commit	236d82fd88ad0b9cb2804ba94207e47da102e6c5 (patch)
tree	f4c2f85793adfc8f5cf3f97dcff09c92cd79cd2e /regexec.c
parent	5b8bb145554d90d7a985c271cf673ab2b63413d1 (diff)
download	perl-236d82fd88ad0b9cb2804ba94207e47da102e6c5.tar.gz