diff options
author | Karl Williamson <public@khwilliamson.com> | 2012-06-27 14:43:41 -0600 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2012-06-29 22:22:42 -0600 |
commit | 3172e3fd885a9c54105d3b6156f18dc761fe29e5 (patch) | |
tree | d3b2ef63938ee83c1873a6a4b0f845b522bc63e4 | |
parent | 693fefec6759ebf0a9ec40a0f59346d86831349c (diff) | |
download | perl-3172e3fd885a9c54105d3b6156f18dc761fe29e5.tar.gz |
regcomp.c: Optimize e.g., /[^\w]/, /[[^:word:]]/ into /\W/
This optimizes character classes that have a single element that is one
of the ops that have the same meaning outside (namely \d, \h, \s, \w,
\v, :word:, :digit: and their complements) to that op. Those
ops take less space than a character class and run faster. An initial
'^' for complementing the class is also handled.
-rw-r--r-- | regcomp.c | 136 | ||||
-rw-r--r-- | regcomp.sym | 3 |
2 files changed, 138 insertions, 1 deletions
@@ -10978,6 +10978,13 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) Optimizations may be possible if this is tiny */ UV n; + /* Certain named classes have equivalents that can appear outside a + * character class, e.g. \w. These flags are set for these classes. The + * first flag indicates the op depends on the character set modifier, like + * /d, /u.... The second is for those that don't have this dependency. */ + bool has_special_charset_op = FALSE; + bool has_special_non_charset_op = FALSE; + /* Unicode properties are stored in a swash; this holds the current one * being parsed. If this swash is the only above-latin1 component of the * character class, an optimization is to pass it directly on to the @@ -11371,14 +11378,44 @@ parseit: } range = 0; /* this was not a true range */ + element_count += 2; /* So counts for three values */ } - if (!SIZE_ONLY) { + if (SIZE_ONLY) { + + /* In the first pass, do a little extra work so below can + * possibly optimize the whole node to one of the nodes that + * correspond to the classes given below */ + + /* The optimization will only take place if there is a single + * element in the class, so can skip if there is more than one + */ + if (element_count == 1) { /* Possible truncation here but in some 64-bit environments * the compiler gets heartburn about switch on 64-bit values. * A similar issue a little earlier when switching on value. * --jhi */ + switch ((I32)namedclass) { + case ANYOF_ALNUM: + case ANYOF_NALNUM: + case ANYOF_DIGIT: + case ANYOF_NDIGIT: + case ANYOF_SPACE: + case ANYOF_NSPACE: + has_special_charset_op = TRUE; + break; + + case ANYOF_HORIZWS: + case ANYOF_NHORIZWS: + case ANYOF_VERTWS: + case ANYOF_NVERTWS: + has_special_non_charset_op = TRUE; + break; + } + } + } + else { switch ((I32)namedclass) { case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */ @@ -11439,10 +11476,12 @@ parseit: * them */ DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, properties, PL_PosixDigit, "XPosixDigit", listsv); + has_special_charset_op = TRUE; break; case ANYOF_NDIGIT: DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv); + has_special_charset_op = TRUE; break; case ANYOF_GRAPH: DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, @@ -11459,10 +11498,12 @@ parseit: * cp_list is subject to folding. It turns out that \h * is just a synonym for XPosixBlank */ _invlist_union(cp_list, PL_XPosixBlank, &cp_list); + has_special_non_charset_op = TRUE; break; case ANYOF_NHORIZWS: _invlist_union_complement_2nd(cp_list, PL_XPosixBlank, &cp_list); + has_special_non_charset_op = TRUE; break; case ANYOF_LOWER: case ANYOF_NLOWER: @@ -11521,10 +11562,12 @@ parseit: case ANYOF_SPACE: DO_POSIX(ret, namedclass, properties, PL_PerlSpace, PL_XPerlSpace); + has_special_charset_op = TRUE; break; case ANYOF_NSPACE: DO_N_POSIX(ret, namedclass, properties, PL_PerlSpace, PL_XPerlSpace); + has_special_charset_op = TRUE; break; case ANYOF_UPPER: /* Same as LOWER, above */ case ANYOF_NUPPER: @@ -11556,10 +11599,12 @@ parseit: case ANYOF_ALNUM: /* Really is 'Word' */ DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv); + has_special_charset_op = TRUE; break; case ANYOF_NALNUM: DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv); + has_special_charset_op = TRUE; break; case ANYOF_VERTWS: /* For these, we use the cp_list, as /d doesn't make a @@ -11567,10 +11612,12 @@ parseit: * if these characters had folds other than themselves, as * cp_list is subject to folding */ _invlist_union(cp_list, PL_VertSpace, &cp_list); + has_special_non_charset_op = TRUE; break; case ANYOF_NVERTWS: _invlist_union_complement_2nd(cp_list, PL_VertSpace, &cp_list); + has_special_non_charset_op = TRUE; break; case ANYOF_XDIGIT: DO_POSIX(ret, namedclass, properties, @@ -11662,6 +11709,93 @@ parseit: range = 0; /* this range (if it was one) is done now */ } + /* [\w] can be optimized into \w, but not if there is anything else in the + * brackets (except for an initial '^' which indictes omplementing) */ + if (element_count == 1 && (has_special_charset_op || has_special_non_charset_op)) { + U8 op; + bool invert = ANYOF_FLAGS(ret) & ANYOF_INVERT; + const char * cur_parse = RExC_parse; + + if (has_special_charset_op) { + U8 offset = get_regex_charset(RExC_flags); + + /* /aa is the same as /a for these */ + if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) { + offset = REGEX_ASCII_RESTRICTED_CHARSET; + } + switch ((I32)namedclass) { + case ANYOF_NALNUM: + invert = ! invert; + /* FALLTHROUGH */ + case ANYOF_ALNUM: + op = ALNUM; + break; + case ANYOF_NSPACE: + invert = ! invert; + /* FALLTHROUGH */ + case ANYOF_SPACE: + op = SPACE; + break; + case ANYOF_NDIGIT: + invert = ! invert; + /* FALLTHROUGH */ + case ANYOF_DIGIT: + op = DIGIT; + + /* There is no DIGITU */ + if (offset == REGEX_UNICODE_CHARSET) { + offset = REGEX_DEPENDS_CHARSET; + } + break; + default: + Perl_croak(aTHX_ "panic: Named character class %"IVdf" is not expected to have a non-[...] version", namedclass); + } + + /* The number of varieties of each of these is the same, hence, so + * is the delta between the normal and complemented nodes */ + if (invert) { + offset += NALNUM - ALNUM; + } + + op += offset; + } + else if (has_special_non_charset_op) { + switch ((I32)namedclass) { + case ANYOF_NHORIZWS: + invert = ! invert; + /* FALLTHROUGH */ + case ANYOF_HORIZWS: + op = HORIZWS; + break; + case ANYOF_NVERTWS: + invert = ! invert; + /* FALLTHROUGH */ + case ANYOF_VERTWS: + op = VERTWS; + break; + default: + Perl_croak(aTHX_ "panic: Named character class %"IVdf" is not expected to have a non-[...] version", namedclass); + } + + /* The complement version of each of these nodes is adjacently next + * */ + if (invert) { + op++; + } + } + + /* Throw away this ANYOF regnode, and emit the calculated one, which + * should correspond to the beginning, not current, state of the parse + */ + RExC_parse = (char *)orig_parse; + RExC_emit = (regnode *)orig_emit; + ret = reg_node(pRExC_state, op); + RExC_parse = (char *) cur_parse; + + SvREFCNT_dec(listsv); + return ret; + } + if (SIZE_ONLY) return ret; /****** !SIZE_ONLY AFTER HERE *********/ diff --git a/regcomp.sym b/regcomp.sym index c36a7fc2cd..0865a73ed5 100644 --- a/regcomp.sym +++ b/regcomp.sym @@ -227,6 +227,9 @@ KEEPS KEEPS, no ; $& begins here. #*New charclass like patterns LNBREAK LNBREAK, none ; generic newline pattern + +# regcomp.c expects the node number of the complement to be one greater than +# the non-complement VERTWS VERTWS, none 0 S ; vertical whitespace (Perl 6) NVERTWS NVERTWS, none 0 S ; not vertical whitespace (Perl 6) HORIZWS HORIZWS, none 0 S ; horizontal whitespace (Perl 6) |