summaryrefslogtreecommitdiff
path: root/regcomp.c
diff options
context:
space:
mode:
authorKarl Williamson <public@khwilliamson.com>2012-10-11 12:15:53 -0600
committerKarl Williamson <public@khwilliamson.com>2012-10-11 20:37:50 -0600
commit3465e1f03c6c748e8f8a6bf8bfdfaf1fc58a4810 (patch)
treec096c1b4462c2c70501f6cf54569be38c0cf0997 /regcomp.c
parent5e4a1da18f8fd71f2e5f0b98b0d41e3da257281a (diff)
downloadperl-3465e1f03c6c748e8f8a6bf8bfdfaf1fc58a4810.tar.gz
regcomp.c: Optimize EXACTFish nodes without folds to EXACT
Often, case folding will be applied to the entire regular expression (such as by using "/i"), but there will be components in it that are the same, folded or not. These components could be represented as EXACT nodes with no loss of information. The regex optimizer is then able to apply more optimizations to them than it could otherwise, and pattern matching will execute faster. This commit turns any EXACTFish node (except those under locale rules, whose folding rules are not known until runtime) that consists entirely of unfoldable characters into the equivalent EXACT node. This optimization brings up the idea of possibly splitting an EXACTFish node that contains a sufficiently long contiguous string of non-folding characters into the portions that have folding and the portions that don't. That might or might not be beneficial; I'm not undertaking the experiments to check that out.
Diffstat (limited to 'regcomp.c')
-rw-r--r--regcomp.c62
1 files changed, 55 insertions, 7 deletions
diff --git a/regcomp.c b/regcomp.c
index 128bbbbdb2..55aa218b1b 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -10495,6 +10495,11 @@ tryagain:
bool next_is_quantifier;
char * oldp = NULL;
+ /* If a folding node contains only code points that don't
+ * participate in folds, it can be changed into an EXACT node,
+ * which allows the optimizer more things to look for */
+ bool maybe_exact;
+
ender = 0;
node_type = compute_EXACTish(pRExC_state);
ret = reg_node(pRExC_state, node_type);
@@ -10507,6 +10512,11 @@ tryagain:
reparse:
+ /* We do the EXACTFish to EXACT node conversion only if folding, and
+ * not if in locale, as whether a character folds or not isn't known
+ * until runtime */
+ maybe_exact = FOLD && ! LOC;
+
/* XXX The node can hold up to 255 bytes, yet this only goes to
* 127. I (khw) do not know why. Keeping it somewhat less than
* 255 allows us to not have to worry about overflow due to
@@ -10788,13 +10798,44 @@ tryagain:
}
}
else {
- ender = _to_uni_fold_flags(ender, (U8 *) s, &foldlen,
- FOLD_FLAGS_FULL
- | ((LOC) ? FOLD_FLAGS_LOCALE
- : (ASCII_FOLD_RESTRICTED)
- ? FOLD_FLAGS_NOMIX_ASCII
- : 0)
- );
+ UV folded = _to_uni_fold_flags(
+ ender,
+ (U8 *) s,
+ &foldlen,
+ FOLD_FLAGS_FULL
+ | ((LOC) ? FOLD_FLAGS_LOCALE
+ : (ASCII_FOLD_RESTRICTED)
+ ? FOLD_FLAGS_NOMIX_ASCII
+ : 0)
+ );
+
+ /* If this node only contains non-folding code
+ * points so far, see if this new one is also
+ * non-folding */
+ if (maybe_exact) {
+ if (folded != ender) {
+ maybe_exact = FALSE;
+ }
+ else {
+ /* Here the fold is the original; we have
+ * to check further to see if anything
+ * folds to it */
+ if (! PL_utf8_foldable) {
+ SV* swash = swash_init("utf8",
+ "_Perl_Any_Folds",
+ &PL_sv_undef, 1, 0);
+ PL_utf8_foldable =
+ _get_swash_invlist(swash);
+ SvREFCNT_dec(swash);
+ }
+ if (_invlist_contains_cp(PL_utf8_foldable,
+ ender))
+ {
+ maybe_exact = FALSE;
+ }
+ }
+ }
+ ender = folded;
}
s += foldlen;
@@ -10808,6 +10849,7 @@ tryagain:
}
else {
*(s++) = ender;
+ maybe_exact &= ! isALPHA_L1(ender);
}
}
else if (UTF) {
@@ -10997,6 +11039,12 @@ tryagain:
loopdone: /* Jumped to when encounters something that shouldn't be in
the node */
+ /* If 'maybe_exact' is still set here, it means there are no
+ * code points in the node that participate in folds */
+ if (FOLD && maybe_exact) {
+ OP(ret) = EXACT;
+ }
+
/* I (khw) don't know if you can get here with zero length, but the
* old code handled this situation by creating a zero-length EXACT
* node. Might as well be NOTHING instead */