diff options
author | Karl Williamson <khw@cpan.org> | 2018-12-08 15:39:24 -0700 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2018-12-26 12:50:38 -0700 |
commit | 6f0fba9bd70f6f3481c9d325e460600f33289639 (patch) | |
tree | bb2b9dbf6ab053616b2b8d9c099ccca3a30c8f80 | |
parent | d11f185e6a90b33ef850c63d279285d422708be6 (diff) | |
download | perl-6f0fba9bd70f6f3481c9d325e460600f33289639.tar.gz |
Change length-1 ASCII fold pairs to ANYOFM regnodes
A node that matches only 'A' and 'a', for example, can be turned into an
ANYOFM node, which is faster to execute. This is done only after adjacent
EXACTFish nodes have been joined, because longer nodes are better than
shorter ones; among other reasons, they reduce the number of bugs in which
a multi-char fold fails to match because of node boundaries.
But if a length-1 node still remains after joining, ANYOFM is better.
-rw-r--r-- | regcomp.c | 38 | ||||
-rw-r--r-- | t/re/anyof.t | 10 |
2 files changed, 43 insertions, 5 deletions
@@ -4335,6 +4335,23 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, } #endif } + + if ( STR_LEN(scan) == 1 + && isALPHA_A(* STRING(scan)) + && ( OP(scan) == EXACTFAA + || ( OP(scan) == EXACTFU + && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(scan))))) + { + U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */ + + /* Replace a length 1 ASCII fold pair node with an ANYOFM node, + * with the mask set to the complement of the bit that differs + * between upper and lower case, and the lowest code point of the + * pair (which the '&' forces) */ + OP(scan) = ANYOFM; + ARG_SET(scan, *STRING(scan) & mask); + FLAGS(scan) = mask; + } } #ifdef DEBUGGING @@ -5275,6 +5292,27 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, OP(next) = EXACTFU; } + if ( STR_LEN(next) == 1 + && isALPHA_A(* STRING(next)) + && ( OP(next) == EXACTFAA + || ( OP(next) == EXACTFU + && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next))))) + { + /* These differ in just one bit */ + U8 mask = ~ ('A' ^ 'a'); + + assert(isALPHA_A(* STRING(next))); + + /* Then replace it by an ANYOFM node, with + * the mask set to the complement of the + * bit that differs between upper and lower + * case, and the lowest code point of the + * pair (which the '&' forces) */ + OP(next) = ANYOFM; + ARG_SET(next, *STRING(next) & mask); + FLAGS(next) = mask; + } + if (flags & SCF_DO_STCLASS) { mincount = 0; maxcount = REG_INFTY; diff --git a/t/re/anyof.t b/t/re/anyof.t index f08116b9be..ad0a2d9ada 100644 --- a/t/re/anyof.t +++ b/t/re/anyof.t @@ -141,13 +141,13 @@ my @tests = ( '(?il)[\x{212A}]' => 'ANYOFL{i}[{utf8 locale}Kk][212A]', '(?il)(?[\x{212A}])' => 'ANYOFL{utf8-locale-reqd}[Kk][212A]', - '(?i)b[s]\xe0' => 'EXACTFU <b>', # The s goes into a 2nd node + '(?i)b[s]\xe0' => 'ANYOFM[Bb]', # The s goes into a 2nd node - 'ebcdic_ok_below_this_marker', + '[aA]' => 'ANYOFM[Aa]', + '[bB]' => 'ANYOFM[Bb]', + '[kK]' => 'ANYOFM[Kk]', - '[aA]' => 'EXACTFAA <a>', - '[bB]' => 'EXACTFU <b>', - '[kK]' 
=> 'EXACTFAA <k>', + 'ebcdic_ok_below_this_marker', '(?i:[^:])' => 'NANYOFM[:]', |