summary refs log tree commit diff
diff options
context:
space:
mode:
author Karl Williamson <khw@cpan.org> 2018-12-08 15:39:24 -0700
committer Karl Williamson <khw@cpan.org> 2018-12-26 12:50:38 -0700
commit 6f0fba9bd70f6f3481c9d325e460600f33289639 (patch)
tree bb2b9dbf6ab053616b2b8d9c099ccca3a30c8f80
parent d11f185e6a90b33ef850c63d279285d422708be6 (diff)
download perl-6f0fba9bd70f6f3481c9d325e460600f33289639.tar.gz
Change length-1 ASCII fold pairs to ANYOFM regnodes
A node that matches only 'A' and 'a', for example, can be turned into an ANYOFM node, which is faster to execute. The conversion is done only after adjacent EXACTFish nodes have been joined, because longer nodes are better than shorter ones; among other reasons, they reduce the number of bugs in which a multi-character fold fails to match because it straddles a node boundary. But if a length-1 node still remains after joining, ANYOFM is better.
-rw-r--r-- regcomp.c 38
-rw-r--r-- t/re/anyof.t 10
2 files changed, 43 insertions, 5 deletions
diff --git a/regcomp.c b/regcomp.c
index 8be6cbe274..0de0afd9be 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -4335,6 +4335,23 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
}
#endif
}
+
+ if ( STR_LEN(scan) == 1
+ && isALPHA_A(* STRING(scan))
+ && ( OP(scan) == EXACTFAA
+ || ( OP(scan) == EXACTFU
+ && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(scan)))))
+ {
+ U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */
+
+ /* Replace a length 1 ASCII fold pair node with an ANYOFM node,
+ * with the mask set to the complement of the bit that differs
+ * between upper and lower case, and the lowest code point of the
+ * pair (which the '&' forces) */
+ OP(scan) = ANYOFM;
+ ARG_SET(scan, *STRING(scan) & mask);
+ FLAGS(scan) = mask;
+ }
}
#ifdef DEBUGGING
@@ -5275,6 +5292,27 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
OP(next) = EXACTFU;
}
+ if ( STR_LEN(next) == 1
+ && isALPHA_A(* STRING(next))
+ && ( OP(next) == EXACTFAA
+ || ( OP(next) == EXACTFU
+ && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next)))))
+ {
+ /* These differ in just one bit */
+ U8 mask = ~ ('A' ^ 'a');
+
+ assert(isALPHA_A(* STRING(next)));
+
+ /* Then replace it by an ANYOFM node, with
+ * the mask set to the complement of the
+ * bit that differs between upper and lower
+ * case, and the lowest code point of the
+ * pair (which the '&' forces) */
+ OP(next) = ANYOFM;
+ ARG_SET(next, *STRING(next) & mask);
+ FLAGS(next) = mask;
+ }
+
if (flags & SCF_DO_STCLASS) {
mincount = 0;
maxcount = REG_INFTY;
diff --git a/t/re/anyof.t b/t/re/anyof.t
index f08116b9be..ad0a2d9ada 100644
--- a/t/re/anyof.t
+++ b/t/re/anyof.t
@@ -141,13 +141,13 @@ my @tests = (
'(?il)[\x{212A}]' => 'ANYOFL{i}[{utf8 locale}Kk][212A]',
'(?il)(?[\x{212A}])' => 'ANYOFL{utf8-locale-reqd}[Kk][212A]',
- '(?i)b[s]\xe0' => 'EXACTFU <b>', # The s goes into a 2nd node
+ '(?i)b[s]\xe0' => 'ANYOFM[Bb]', # The s goes into a 2nd node
- 'ebcdic_ok_below_this_marker',
+ '[aA]' => 'ANYOFM[Aa]',
+ '[bB]' => 'ANYOFM[Bb]',
+ '[kK]' => 'ANYOFM[Kk]',
- '[aA]' => 'EXACTFAA <a>',
- '[bB]' => 'EXACTFU <b>',
- '[kK]' => 'EXACTFAA <k>',
+ 'ebcdic_ok_below_this_marker',
'(?i:[^:])' => 'NANYOFM[:]',