regcomp.c: Optimize e.g., /[^\w]/, /[[^:word:]]/ into /\W/

This optimizes character classes that have a single element that is one of the ops that have the same meaning outside (namely \d, \h, \s, \w, \v, :word:, :digit: and their complements) to that op. Those ops take less space than a character class and run faster. An initial '^' for complementing the class is also handled.
author: Karl Williamson <public@khwilliamson.com> 2012-06-27 14:43:41 -0600
committer: Karl Williamson <public@khwilliamson.com> 2012-06-29 22:22:42 -0600
commit: 3172e3fd885a9c54105d3b6156f18dc761fe29e5 (patch)
tree: d3b2ef63938ee83c1873a6a4b0f845b522bc63e4
parent: 693fefec6759ebf0a9ec40a0f59346d86831349c (diff)
download: perl-3172e3fd885a9c54105d3b6156f18dc761fe29e5.tar.gz
2 files changed, 138 insertions, 1 deletions
diff --git a/regcomp.c b/regcomp.c
index 155ef098fb..22a220f1b0 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -10978,6 +10978,13 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
 			       Optimizations may be possible if this is tiny */
     UV n;
 
+    /* Certain named classes have equivalents that can appear outside a
+     * character class, e.g. \w.  These flags are set for these classes.  The
+     * first flag indicates the op depends on the character set modifier, like
+     * /d, /u....  The second is for those that don't have this dependency. */
+    bool has_special_charset_op = FALSE;
+    bool has_special_non_charset_op = FALSE;
+
     /* Unicode properties are stored in a swash; this holds the current one
      * being parsed.  If this swash is the only above-latin1 component of the
      * character class, an optimization is to pass it directly on to the
@@ -11371,14 +11378,44 @@ parseit:
 		}
 
 		range = 0; /* this was not a true range */
+                element_count += 2; /* So counts for three values */
 	    }
 
-	    if (!SIZE_ONLY) {
+	    if (SIZE_ONLY) {
+
+                /* In the first pass, do a little extra work so below can
+                 * possibly optimize the whole node to one of the nodes that
+                 * correspond to the classes given below */
+
+                /* The optimization will only take place if there is a single
+                 * element in the class, so can skip if there is more than one
+                 */
+                if (element_count == 1) {
 
 		/* Possible truncation here but in some 64-bit environments
 		 * the compiler gets heartburn about switch on 64-bit values.
 		 * A similar issue a little earlier when switching on value.
 		 * --jhi */
+                    switch ((I32)namedclass) {
+                        case ANYOF_ALNUM:
+                        case ANYOF_NALNUM:
+                        case ANYOF_DIGIT:
+                        case ANYOF_NDIGIT:
+                        case ANYOF_SPACE:
+                        case ANYOF_NSPACE:
+                            has_special_charset_op = TRUE;
+                            break;
+
+                        case ANYOF_HORIZWS:
+                        case ANYOF_NHORIZWS:
+                        case ANYOF_VERTWS:
+                        case ANYOF_NVERTWS:
+                            has_special_non_charset_op = TRUE;
+                            break;
+                    }
+                }
+            }
+            else {
 		switch ((I32)namedclass) {
 
 		case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
@@ -11439,10 +11476,12 @@ parseit:
 		     * them */
 		    DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, properties,
                         PL_PosixDigit, "XPosixDigit", listsv);
+                    has_special_charset_op = TRUE;
 		    break;
 		case ANYOF_NDIGIT:
 		    DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
                         PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
+                    has_special_charset_op = TRUE;
 		    break;
 		case ANYOF_GRAPH:
 		    DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
@@ -11459,10 +11498,12 @@ parseit:
 		     * cp_list is subject to folding.  It turns out that \h
 		     * is just a synonym for XPosixBlank */
 		    _invlist_union(cp_list, PL_XPosixBlank, &cp_list);
+                    has_special_non_charset_op = TRUE;
 		    break;
 		case ANYOF_NHORIZWS:
                     _invlist_union_complement_2nd(cp_list,
                                                  PL_XPosixBlank, &cp_list);
+                    has_special_non_charset_op = TRUE;
 		    break;
 		case ANYOF_LOWER:
 		case ANYOF_NLOWER:
@@ -11521,10 +11562,12 @@ parseit:
 		case ANYOF_SPACE:
                     DO_POSIX(ret, namedclass, properties,
                                             PL_PerlSpace, PL_XPerlSpace);
+                    has_special_charset_op = TRUE;
 		    break;
 		case ANYOF_NSPACE:
                     DO_N_POSIX(ret, namedclass, properties,
                                             PL_PerlSpace, PL_XPerlSpace);
+                    has_special_charset_op = TRUE;
 		    break;
 		case ANYOF_UPPER:   /* Same as LOWER, above */
 		case ANYOF_NUPPER:
@@ -11556,10 +11599,12 @@ parseit:
 		case ANYOF_ALNUM:   /* Really is 'Word' */
 		    DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
                             PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
+                    has_special_charset_op = TRUE;
 		    break;
 		case ANYOF_NALNUM:
 		    DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
                             PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
+                    has_special_charset_op = TRUE;
 		    break;
 		case ANYOF_VERTWS:
 		    /* For these, we use the cp_list, as /d doesn't make a
@@ -11567,10 +11612,12 @@ parseit:
 		     * if these characters had folds other than themselves, as
 		     * cp_list is subject to folding */
 		    _invlist_union(cp_list, PL_VertSpace, &cp_list);
+                    has_special_non_charset_op = TRUE;
 		    break;
 		case ANYOF_NVERTWS:
                     _invlist_union_complement_2nd(cp_list,
                                                     PL_VertSpace, &cp_list);
+                    has_special_non_charset_op = TRUE;
 		    break;
 		case ANYOF_XDIGIT:
                     DO_POSIX(ret, namedclass, properties,
@@ -11662,6 +11709,93 @@ parseit:
 	range = 0; /* this range (if it was one) is done now */
     }
 
+    /* [\w] can be optimized into \w, but not if there is anything else in the
+     * brackets (except for an initial '^' which indictes omplementing) */
+    if (element_count == 1 && (has_special_charset_op || has_special_non_charset_op)) {
+        U8 op;
+        bool invert = ANYOF_FLAGS(ret) & ANYOF_INVERT;
+        const char * cur_parse = RExC_parse;
+
+        if (has_special_charset_op) {
+            U8 offset = get_regex_charset(RExC_flags);
+
+            /* /aa is the same as /a for these */
+            if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
+                offset = REGEX_ASCII_RESTRICTED_CHARSET;
+            }
+            switch ((I32)namedclass) {
+                case ANYOF_NALNUM:
+                    invert = ! invert;
+                    /* FALLTHROUGH */
+                case ANYOF_ALNUM:
+                    op = ALNUM;
+                    break;
+                case ANYOF_NSPACE:
+                    invert = ! invert;
+                    /* FALLTHROUGH */
+                case ANYOF_SPACE:
+                    op = SPACE;
+                    break;
+                case ANYOF_NDIGIT:
+                    invert = ! invert;
+                    /* FALLTHROUGH */
+                case ANYOF_DIGIT:
+                    op = DIGIT;
+
+                    /* There is no DIGITU */
+                    if (offset == REGEX_UNICODE_CHARSET) {
+                        offset = REGEX_DEPENDS_CHARSET;
+                    }
+                    break;
+                default:
+                    Perl_croak(aTHX_ "panic: Named character class %"IVdf" is not expected to have a non-[...] version", namedclass);
+            }
+
+            /* The number of varieties of each of these is the same, hence, so
+             * is the delta between the normal and complemented nodes */
+            if (invert) {
+                offset += NALNUM - ALNUM;
+            }
+
+            op += offset;
+        }
+        else if (has_special_non_charset_op) {
+            switch ((I32)namedclass) {
+                case ANYOF_NHORIZWS:
+                    invert = ! invert;
+                    /* FALLTHROUGH */
+                case ANYOF_HORIZWS:
+                    op = HORIZWS;
+                    break;
+                case ANYOF_NVERTWS:
+                    invert = ! invert;
+                    /* FALLTHROUGH */
+                case ANYOF_VERTWS:
+                    op = VERTWS;
+                    break;
+                default:
+                    Perl_croak(aTHX_ "panic: Named character class %"IVdf" is not expected to have a non-[...] version", namedclass);
+            }
+
+            /* The complement version of each of these nodes is adjacently next
+             * */
+            if (invert) {
+                op++;
+            }
+        }
+
+        /* Throw away this ANYOF regnode, and emit the calculated one, which
+         * should correspond to the beginning, not current, state of the parse
+         */
+        RExC_parse = (char *)orig_parse;
+        RExC_emit = (regnode *)orig_emit;
+        ret = reg_node(pRExC_state, op);
+        RExC_parse = (char *) cur_parse;
+
+        SvREFCNT_dec(listsv);
+        return ret;
+    }
+
     if (SIZE_ONLY)
         return ret;
     /****** !SIZE_ONLY AFTER HERE *********/
diff --git a/regcomp.sym b/regcomp.sym
index c36a7fc2cd..0865a73ed5 100644
--- a/regcomp.sym
+++ b/regcomp.sym
@@ -227,6 +227,9 @@ KEEPS       KEEPS,      no        ; $& begins here.
 
 #*New charclass like patterns
 LNBREAK     LNBREAK,    none      ; generic newline pattern
+
+# regcomp.c expects the node number of the complement to be one greater than
+# the non-complement
 VERTWS      VERTWS,     none 0 S  ; vertical whitespace         (Perl 6)
 NVERTWS     NVERTWS,    none 0 S  ; not vertical whitespace     (Perl 6)
 HORIZWS     HORIZWS,    none 0 S  ; horizontal whitespace       (Perl 6)
author	Karl Williamson <public@khwilliamson.com>	2012-06-27 14:43:41 -0600
committer	Karl Williamson <public@khwilliamson.com>	2012-06-29 22:22:42 -0600
commit	3172e3fd885a9c54105d3b6156f18dc761fe29e5 (patch)
tree	d3b2ef63938ee83c1873a6a4b0f845b522bc63e4
parent	693fefec6759ebf0a9ec40a0f59346d86831349c (diff)
download	perl-3172e3fd885a9c54105d3b6156f18dc761fe29e5.tar.gz