summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKarl Williamson <public@khwilliamson.com>2012-06-27 14:43:41 -0600
committerKarl Williamson <public@khwilliamson.com>2012-06-29 22:22:42 -0600
commit3172e3fd885a9c54105d3b6156f18dc761fe29e5 (patch)
treed3b2ef63938ee83c1873a6a4b0f845b522bc63e4
parent693fefec6759ebf0a9ec40a0f59346d86831349c (diff)
downloadperl-3172e3fd885a9c54105d3b6156f18dc761fe29e5.tar.gz
regcomp.c: Optimize e.g., /[^\w]/, /[[^:word:]]/ into /\W/
This optimizes character classes that have a single element that is one of the ops that have the same meaning outside (namely \d, \h, \s, \w, \v, :word:, :digit: and their complements) to that op. Those ops take less space than a character class and run faster. An initial '^' for complementing the class is also handled.
-rw-r--r--regcomp.c136
-rw-r--r--regcomp.sym3
2 files changed, 138 insertions, 1 deletions
diff --git a/regcomp.c b/regcomp.c
index 155ef098fb..22a220f1b0 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -10978,6 +10978,13 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
Optimizations may be possible if this is tiny */
UV n;
+ /* Certain named classes have equivalents that can appear outside a
+ * character class, e.g. \w. These flags are set for these classes. The
+ * first flag indicates the op depends on the character set modifier, like
+ * /d, /u.... The second is for those that don't have this dependency. */
+ bool has_special_charset_op = FALSE;
+ bool has_special_non_charset_op = FALSE;
+
/* Unicode properties are stored in a swash; this holds the current one
* being parsed. If this swash is the only above-latin1 component of the
* character class, an optimization is to pass it directly on to the
@@ -11371,14 +11378,44 @@ parseit:
}
range = 0; /* this was not a true range */
+ element_count += 2; /* So counts for three values */
}
- if (!SIZE_ONLY) {
+ if (SIZE_ONLY) {
+
+ /* In the first pass, do a little extra work so below can
+ * possibly optimize the whole node to one of the nodes that
+ * correspond to the classes given below */
+
+ /* The optimization will only take place if there is a single
+ * element in the class, so can skip if there is more than one
+ */
+ if (element_count == 1) {
/* Possible truncation here but in some 64-bit environments
* the compiler gets heartburn about switch on 64-bit values.
* A similar issue a little earlier when switching on value.
* --jhi */
+ switch ((I32)namedclass) {
+ case ANYOF_ALNUM:
+ case ANYOF_NALNUM:
+ case ANYOF_DIGIT:
+ case ANYOF_NDIGIT:
+ case ANYOF_SPACE:
+ case ANYOF_NSPACE:
+ has_special_charset_op = TRUE;
+ break;
+
+ case ANYOF_HORIZWS:
+ case ANYOF_NHORIZWS:
+ case ANYOF_VERTWS:
+ case ANYOF_NVERTWS:
+ has_special_non_charset_op = TRUE;
+ break;
+ }
+ }
+ }
+ else {
switch ((I32)namedclass) {
case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
@@ -11439,10 +11476,12 @@ parseit:
* them */
DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, properties,
PL_PosixDigit, "XPosixDigit", listsv);
+ has_special_charset_op = TRUE;
break;
case ANYOF_NDIGIT:
DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
+ has_special_charset_op = TRUE;
break;
case ANYOF_GRAPH:
DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
@@ -11459,10 +11498,12 @@ parseit:
* cp_list is subject to folding. It turns out that \h
* is just a synonym for XPosixBlank */
_invlist_union(cp_list, PL_XPosixBlank, &cp_list);
+ has_special_non_charset_op = TRUE;
break;
case ANYOF_NHORIZWS:
_invlist_union_complement_2nd(cp_list,
PL_XPosixBlank, &cp_list);
+ has_special_non_charset_op = TRUE;
break;
case ANYOF_LOWER:
case ANYOF_NLOWER:
@@ -11521,10 +11562,12 @@ parseit:
case ANYOF_SPACE:
DO_POSIX(ret, namedclass, properties,
PL_PerlSpace, PL_XPerlSpace);
+ has_special_charset_op = TRUE;
break;
case ANYOF_NSPACE:
DO_N_POSIX(ret, namedclass, properties,
PL_PerlSpace, PL_XPerlSpace);
+ has_special_charset_op = TRUE;
break;
case ANYOF_UPPER: /* Same as LOWER, above */
case ANYOF_NUPPER:
@@ -11556,10 +11599,12 @@ parseit:
case ANYOF_ALNUM: /* Really is 'Word' */
DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
+ has_special_charset_op = TRUE;
break;
case ANYOF_NALNUM:
DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
+ has_special_charset_op = TRUE;
break;
case ANYOF_VERTWS:
/* For these, we use the cp_list, as /d doesn't make a
@@ -11567,10 +11612,12 @@ parseit:
* if these characters had folds other than themselves, as
* cp_list is subject to folding */
_invlist_union(cp_list, PL_VertSpace, &cp_list);
+ has_special_non_charset_op = TRUE;
break;
case ANYOF_NVERTWS:
_invlist_union_complement_2nd(cp_list,
PL_VertSpace, &cp_list);
+ has_special_non_charset_op = TRUE;
break;
case ANYOF_XDIGIT:
DO_POSIX(ret, namedclass, properties,
@@ -11662,6 +11709,93 @@ parseit:
range = 0; /* this range (if it was one) is done now */
}
+ /* [\w] can be optimized into \w, but not if there is anything else in the
+ * brackets (except for an initial '^' which indictes omplementing) */
+ if (element_count == 1 && (has_special_charset_op || has_special_non_charset_op)) {
+ U8 op;
+ bool invert = ANYOF_FLAGS(ret) & ANYOF_INVERT;
+ const char * cur_parse = RExC_parse;
+
+ if (has_special_charset_op) {
+ U8 offset = get_regex_charset(RExC_flags);
+
+ /* /aa is the same as /a for these */
+ if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
+ offset = REGEX_ASCII_RESTRICTED_CHARSET;
+ }
+ switch ((I32)namedclass) {
+ case ANYOF_NALNUM:
+ invert = ! invert;
+ /* FALLTHROUGH */
+ case ANYOF_ALNUM:
+ op = ALNUM;
+ break;
+ case ANYOF_NSPACE:
+ invert = ! invert;
+ /* FALLTHROUGH */
+ case ANYOF_SPACE:
+ op = SPACE;
+ break;
+ case ANYOF_NDIGIT:
+ invert = ! invert;
+ /* FALLTHROUGH */
+ case ANYOF_DIGIT:
+ op = DIGIT;
+
+ /* There is no DIGITU */
+ if (offset == REGEX_UNICODE_CHARSET) {
+ offset = REGEX_DEPENDS_CHARSET;
+ }
+ break;
+ default:
+ Perl_croak(aTHX_ "panic: Named character class %"IVdf" is not expected to have a non-[...] version", namedclass);
+ }
+
+ /* The number of varieties of each of these is the same, hence, so
+ * is the delta between the normal and complemented nodes */
+ if (invert) {
+ offset += NALNUM - ALNUM;
+ }
+
+ op += offset;
+ }
+ else if (has_special_non_charset_op) {
+ switch ((I32)namedclass) {
+ case ANYOF_NHORIZWS:
+ invert = ! invert;
+ /* FALLTHROUGH */
+ case ANYOF_HORIZWS:
+ op = HORIZWS;
+ break;
+ case ANYOF_NVERTWS:
+ invert = ! invert;
+ /* FALLTHROUGH */
+ case ANYOF_VERTWS:
+ op = VERTWS;
+ break;
+ default:
+ Perl_croak(aTHX_ "panic: Named character class %"IVdf" is not expected to have a non-[...] version", namedclass);
+ }
+
+ /* The complement version of each of these nodes is adjacently next
+ * */
+ if (invert) {
+ op++;
+ }
+ }
+
+ /* Throw away this ANYOF regnode, and emit the calculated one, which
+ * should correspond to the beginning, not current, state of the parse
+ */
+ RExC_parse = (char *)orig_parse;
+ RExC_emit = (regnode *)orig_emit;
+ ret = reg_node(pRExC_state, op);
+ RExC_parse = (char *) cur_parse;
+
+ SvREFCNT_dec(listsv);
+ return ret;
+ }
+
if (SIZE_ONLY)
return ret;
/****** !SIZE_ONLY AFTER HERE *********/
diff --git a/regcomp.sym b/regcomp.sym
index c36a7fc2cd..0865a73ed5 100644
--- a/regcomp.sym
+++ b/regcomp.sym
@@ -227,6 +227,9 @@ KEEPS KEEPS, no ; $& begins here.
#*New charclass like patterns
LNBREAK LNBREAK, none ; generic newline pattern
+
+# regcomp.c expects the node number of the complement to be one greater than
+# the non-complement
VERTWS VERTWS, none 0 S ; vertical whitespace (Perl 6)
NVERTWS NVERTWS, none 0 S ; not vertical whitespace (Perl 6)
HORIZWS HORIZWS, none 0 S ; horizontal whitespace (Perl 6)