diff options
author | Karl Williamson <public@khwilliamson.com> | 2014-02-15 14:45:03 -0700 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2014-02-19 08:32:59 -0700 |
commit | 0ba8faef1d393a5e5eec58121e95c78331a76dda (patch) | |
tree | 5a99cabc67c0ca8855d474227f2564967ea405c3 | |
parent | 8a50cd03a18f63853c30d77231f3eed37cdf7efa (diff) | |
download | perl-0ba8faef1d393a5e5eec58121e95c78331a76dda.tar.gz |
Convert more EXACTFish nodes to EXACT when possible
Under /i matching, many characters, such as punctuation, match only
themselves. If a node contains only such characters, it can be an EXACT
node. The optimizer gets better hints when dealing with EXACT nodes
than with ones that fold.
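This changes the alloc_maybe_populate_EXACT() function, via a new
'downgradable' parameter, to look for input that does not participate in
any fold, and to downgrade the node to EXACT in that case.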
-rw-r--r-- | embed.fnc | 2 | ||||
-rw-r--r-- | embed.h | 2 | ||||
-rw-r--r-- | proto.h | 2 | ||||
-rw-r--r-- | regcomp.c | 73 |
4 files changed, 62 insertions, 17 deletions
@@ -2071,7 +2071,7 @@ EsRn |char * |regpatws |NN RExC_state_t *pRExC_state \ |NN char *p|const bool recognize_comment Ei |void |alloc_maybe_populate_EXACT|NN RExC_state_t *pRExC_state \ |NN regnode *node|NN I32 *flagp|STRLEN len \ - |UV code_point + |UV code_point|const bool downgradable Ei |U8 |compute_EXACTish|NN RExC_state_t *pRExC_state Es |char * |nextchar |NN RExC_state_t *pRExC_state Es |bool |reg_skipcomment|NN RExC_state_t *pRExC_state @@ -906,7 +906,7 @@ #define _invlist_array_init(a,b) S__invlist_array_init(aTHX_ a,b) #define add_cp_to_invlist(a,b) S_add_cp_to_invlist(aTHX_ a,b) #define add_data S_add_data -#define alloc_maybe_populate_EXACT(a,b,c,d,e) S_alloc_maybe_populate_EXACT(aTHX_ a,b,c,d,e) +#define alloc_maybe_populate_EXACT(a,b,c,d,e,f) S_alloc_maybe_populate_EXACT(aTHX_ a,b,c,d,e,f) #define compute_EXACTish(a) S_compute_EXACTish(aTHX_ a) #define could_it_be_a_POSIX_class(a) S_could_it_be_a_POSIX_class(aTHX_ a) #define get_ANYOF_cp_list_for_ssc(a,b) S_get_ANYOF_cp_list_for_ssc(aTHX_ a,b) @@ -6607,7 +6607,7 @@ STATIC U32 S_add_data(RExC_state_t* const pRExC_state, const char* const s, cons #define PERL_ARGS_ASSERT_ADD_DATA \ assert(pRExC_state); assert(s) -PERL_STATIC_INLINE void S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32 *flagp, STRLEN len, UV code_point) +PERL_STATIC_INLINE void S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32 *flagp, STRLEN len, UV code_point, const bool downgradable) __attribute__nonnull__(pTHX_1) __attribute__nonnull__(pTHX_2) __attribute__nonnull__(pTHX_3); @@ -10907,7 +10907,8 @@ S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state) PERL_STATIC_INLINE void S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, - regnode *node, I32* flagp, STRLEN len, UV code_point) + regnode *node, I32* flagp, STRLEN len, UV code_point, + const bool downgradable) { /* This knows the details about sizing an EXACTish node, setting flags for * it (by setting <*flagp>, 
and potentially populating it with a single @@ -10929,7 +10930,12 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, * * It knows that under FOLD, the Latin Sharp S and UTF characters above * 255, must be folded (the former only when the rules indicate it can - * match 'ss') */ + * match 'ss') + * + * When it does the populating, it looks at the flag 'downgradable'. If + * true with a node that folds, it checks if the single code point + * participates in a fold, and if not downgrades the node to an EXACT. + * This helps the optimizer */ bool len_passed_in = cBOOL(len != 0); U8 character[UTF8_MAXBYTES_CASE+1]; @@ -10947,18 +10953,31 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, EBCDIC, but it works there, as the extra invariants fold to themselves) */ *character = toFOLD((U8) code_point); + if (downgradable + && *character == code_point + && ! HAS_NONLATIN1_FOLD_CLOSURE(code_point)) + { + OP(node) = EXACT; + } } len = 1; } else if (FOLD && (! LOC || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point))) { /* Folding, and ok to do so now */ - _to_uni_fold_flags(code_point, + UV folded = _to_uni_fold_flags( + code_point, character, &len, FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED) ? FOLD_FLAGS_NOMIX_ASCII : 0)); + if (downgradable + && folded == code_point + && ! _invlist_contains_cp(PL_utf8_foldable, code_point)) + { + OP(node) = EXACT; + } } else if (code_point <= MAX_UTF8_TWO_BYTE) { @@ -10971,19 +10990,36 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, uvchr_to_utf8( character, code_point); len = UTF8SKIP(character); } - } /* Else pattern isn't UTF8. We only fold the sharp s, when - appropriate */ - else if (UNLIKELY(code_point == LATIN_SMALL_LETTER_SHARP_S) - && FOLD - && AT_LEAST_UNI_SEMANTICS - && ! ASCII_FOLD_RESTRICTED) - { + } /* Else pattern isn't UTF8. */ + else if (! 
FOLD) { + *character = (U8) code_point; + len = 1; + } /* Else is folded non-UTF8 */ + else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) { + + /* We don't fold any non-UTF8 except possibly the Sharp s (see + * comments at join_exact()); */ + *character = (U8) code_point; + len = 1; + + /* Can turn into an EXACT node if we know the fold at compile time, + * and it folds to itself and doesn't particpate in other folds */ + if (downgradable + && ! LOC + && PL_fold_latin1[code_point] == code_point + && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point) + || (isASCII(code_point) && ASCII_FOLD_RESTRICTED))) + { + OP(node) = EXACT; + } + } /* else is Sharp s. May need to fold it */ + else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) { *character = 's'; *(character + 1) = 's'; len = 2; } else { - *character = (U8) code_point; + *character = LATIN_SMALL_LETTER_SHARP_S; len = 1; } } @@ -12222,7 +12258,12 @@ tryagain: OP(ret) = EXACTFU; } } - alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender); + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender, + FALSE /* Don't look to see if could + be turned into an EXACT + node, as we have already + computed that */ + ); } RExC_parse = p - 1; @@ -14214,7 +14255,9 @@ parseit: *flagp |= HASWIDTH|SIMPLE; } else if (PL_regkind[op] == EXACT) { - alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value); + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value, + TRUE /* downgradable to EXACT */ + ); } RExC_parse = (char *) cur_parse; @@ -14743,7 +14786,9 @@ parseit: RExC_parse = (char *)cur_parse; if (PL_regkind[op] == EXACT) { - alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value); + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value, + TRUE /* downgradable to EXACT */ + ); } SvREFCNT_dec_NN(cp_list); |