diff options
author | Karl Williamson <public@khwilliamson.com> | 2010-12-07 22:51:34 -0700 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2010-12-11 15:58:18 -0700 |
commit | 2786be71c69a3e244009b94145ca66f2326aadb9 (patch) | |
tree | 37df79d93e6ea56e936b1b1a70c37f69c07a8220 /regcomp.c | |
parent | 3a15e693385b1ab6186ad77f2fc208db1d0e05ea (diff) | |
download | perl-2786be71c69a3e244009b94145ca66f2326aadb9.tar.gz |
regcomp.c: Clean up optimization for 1-char []
A single-character character class can be optimized into an EXACT node.
Changes made elsewhere mean that this no longer has to be restricted to
ASCII-only characters when the pattern isn't UTF-8. Also, the optimization
shouldn't have happened for FOLDED characters that participate in
multi-char folds, as explained in the comments; so that case is removed.
Also, a locale node with folded characters can be optimized.
Diffstat (limited to 'regcomp.c')
-rw-r--r-- | regcomp.c | 63 |
1 files changed, 47 insertions, 16 deletions
@@ -8928,28 +8928,59 @@ parseit: ANYOF_FLAGS(ret) = ANYOF_UTF8|ANYOF_UNICODE_ALL; } - if( stored == 1 && (value < 128 || (value < 256 && !UTF)) - && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) ) - ) { - /* optimize single char class to an EXACT node but *only* when its not - * a UTF/high char. Note that the information needed to decide to do - * this optimization is not currently available until the 2nd pass, and - * that the actually used EXACT node takes less space than the - * calculated ANYOF node, and hence the amount of space calculated in - * the first pass is larger than actually used. Currently we don't - * keep track of enough information to do this for nodes which contain - * matches outside the bitmap */ + /* A single character class can be "optimized" into an EXACTish node. + * Note that since we don't currently count how many characters there are + * outside the bitmap, we are XXX missing optimization possibilities for + * them. This optimization can't happen unless this is a truly single + * character class, which means that it can't be an inversion into a + * many-character class, and there must be no possibility of there being + * things outside the bitmap. 'stored' (only) for locales doesn't include + * \w, etc, so have to make a special test that they aren't present */ + if (! (ANYOF_FLAGS(ret) & (ANYOF_NONBITMAP|ANYOF_INVERT|ANYOF_UNICODE_ALL)) + && ((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE)) + || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))) + { + /* Note that the information needed to decide to do this optimization + * is not currently available until the 2nd pass, and that the actually + * used EXACT node takes less space than the calculated ANYOF node, and + * hence the amount of space calculated in the first pass is larger + * than actually used, so this optimization doesn't gain us any space. 
+ * But an EXACT node is faster than an ANYOF node, and can be combined + * with any adjacent EXACT nodes later by the optimizer for further + * gains. */ + const char * cur_parse= RExC_parse; RExC_emit = (regnode *)orig_emit; RExC_parse = (char *)orig_parse; - ret = reg_node(pRExC_state, - (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT)); + + /* (A locale node can have 1 point and be folded; all the other folds + * will include the fold, hence will have 2 points, so we won't get + * here with FOLD set unless it is also locale) */ + ret = reg_node(pRExC_state, (U8) (! FOLD) + ? EXACT + : EXACTFL + ); RExC_parse = (char *)cur_parse; - *STRING(ret)= (char)value; - STR_LEN(ret)= 1; - RExC_emit += STR_SZ(1); + if (UTF && ! NATIVE_IS_INVARIANT(value)) { + *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value); + *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value); + STR_LEN(ret)= 2; + RExC_emit += STR_SZ(2); + } + else { + *STRING(ret)= (char)value; + STR_LEN(ret)= 1; + RExC_emit += STR_SZ(1); + } SvREFCNT_dec(listsv); return ret; + + /* (A 2-character class of the very special form like [aA] could be + * optimized into an EXACTFish node, but only for non-locales, and for + * characters which only have the two folds; so things like 'fF' and + * 'Ii' wouldn't work because of the fold of 'LATIN SMALL LIGATURE FI'. + * Since we don't have that information currently conveniently + * available, skip the optimization) */ } { |