diff options
author | Karl Williamson <public@khwilliamson.com> | 2010-12-15 18:11:44 -0700 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2010-12-15 18:16:08 -0700 |
commit | 6da63e10e37727ea553bda1a68547e3053b6bc0d (patch) | |
tree | dcafb944d45196d1e590f65729ae68010c1c51ab /regcomp.c | |
parent | 1e6ade674296a6205f357980325166c942b708fd (diff) | |
download | perl-6da63e10e37727ea553bda1a68547e3053b6bc0d.tar.gz |
regcomp.c: Optimize [cC] char class to EXACTF
A two character character class where the two elements are the folds of
each other can be optimized into an EXACTF regnode. This should not
change the speed of execution appreciably, except that EXACTF regnodes
are candidates for further optimization by combining with adjacent nodes
of the same type.
This commit brings the optimization level up to somewhat better than
when I started mucking around with ANYOF nodes.
Diffstat (limited to 'regcomp.c')
-rw-r--r-- | regcomp.c | 66 |
1 files changed, 46 insertions, 20 deletions
@@ -8936,31 +8936,64 @@ parseit: * character class, which means that it can't be an inversion into a * many-character class, and there must be no possibility of there being * things outside the bitmap. 'stored' (only) for locales doesn't include - * \w, etc, so have to make a special test that they aren't present */ + * \w, etc, so have to make a special test that they aren't present + * + * Similarly A 2-character class of the very special form like [bB] can be + * optimized into an EXACTFish node, but only for non-locales, and for + * characters which only have the two folds; so things like 'fF' and 'Ii' + * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE + * FI'. */ if (! (ANYOF_FLAGS(ret) & (ANYOF_NONBITMAP|ANYOF_INVERT|ANYOF_UNICODE_ALL)) - && ((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE)) - || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))) + && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE)) + || (! ANYOF_CLASS_TEST_ANY_SET(ret))))) + || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE)) + && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value)) + /* If the latest code point has a fold whose + * bit is set, it must be the only other one */ + && ((prevvalue = PL_fold_latin1[value]) != value) + && ANYOF_BITMAP_TEST(ret, prevvalue))))) { /* Note that the information needed to decide to do this optimization * is not currently available until the 2nd pass, and that the actually - * used EXACT node takes less space than the calculated ANYOF node, and - * hence the amount of space calculated in the first pass is larger + * used EXACTish node takes less space than the calculated ANYOF node, + * and hence the amount of space calculated in the first pass is larger * than actually used, so this optimization doesn't gain us any space. * But an EXACT node is faster than an ANYOF node, and can be combined * with any adjacent EXACT nodes later by the optimizer for further - * gains. */ + * gains. The speed of executing an EXACTF is similar to an ANYOF + * node, so the optimization advantage comes from the ability to join + * it to adjacent EXACT nodes */ const char * cur_parse= RExC_parse; + U8 op; RExC_emit = (regnode *)orig_emit; RExC_parse = (char *)orig_parse; - /* (A locale node can have 1 point and be folded; all the other folds - * will include the fold, hence will have 2 points, so we won't get - * here with ANYOF_FOLD set unless it is also locale) */ - ret = reg_node(pRExC_state, (U8) (! (ANYOF_FLAGS(ret) & ANYOF_FOLD)) - ? EXACT - : EXACTFL - ); + if (stored == 1) { + + /* A locale node with one point can be folded; all the other cases + * with folding will have two points, since we calculate them above + */ + if (ANYOF_FLAGS(ret) & ANYOF_FOLD) { + op = EXACTFL; + } + else { + op = EXACT; + } + } /* else 2 chars in the bit map: the folds of each other */ + else if (UNI_SEMANTICS || !isASCII(value)) { + + /* To join adjacent nodes, they must be the exact EXACTish type. + * Try to use the most likely type, by using EXACTFU if the regex + * calls for them, or is required because the character is + * non-ASCII */ + op = EXACTFU; + } + else { /* Otherwise, more likely to be EXACTF type */ + op = EXACTF; + } + + ret = reg_node(pRExC_state, op); RExC_parse = (char *)cur_parse; if (UTF && ! NATIVE_IS_INVARIANT(value)) { *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value); @@ -8975,13 +9008,6 @@ parseit: } SvREFCNT_dec(listsv); return ret; - - /* (A 2-character class of the very special form like [aA] could be - * optimized into an EXACTFish node, but only for non-locales, and for - * characters which only have the two folds; so things like 'fF' and - * 'Ii' wouldn't work because of the fold of 'LATIN SMALL LIGATURE FI'. - * Since we don't have that information currently conveniently - * available, skip the optimization) */ } { |