regcomp.c: Use POSIXA, NPOSIXA

This commit optimizes character classes that are matched under /a or /aa and consist of a single POSIX class into POSIXA or NPOSIXA regop types; for example, /[[:word:]]/a. Since [:ascii:] is always ASCII-restricted regardless of the charset modifier, it is always optimized. These nodes should execute somewhat faster than a generic ANYOF node and are significantly smaller, taking 2 bytes instead of 12. The flags field of the node structure holds an enum indicating which of the 15 POSIX classes is being matched.
author: Karl Williamson <public@khwilliamson.com> 2012-07-20 10:23:14 -0600
committer: Karl Williamson <public@khwilliamson.com> 2012-07-24 21:13:49 -0600
commit: 0658cddeb9feb16c427ac50f4000b008516b9958 (patch)
tree: 6107b53e26cbdce9e42393b8aadf84dd14600772 /regexec.c
parent: 3615ea5819e869b314f723b0ce28dfb1d80017ef (diff)
download: perl-0658cddeb9feb16c427ac50f4000b008516b9958.tar.gz
1 files changed, 52 insertions, 0 deletions
diff --git a/regexec.c b/regexec.c
index af64a69f31..dca278b406 100644
--- a/regexec.c
+++ b/regexec.c
@@ -1814,6 +1814,20 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
 		!is_HORIZWS_latin1(s)
 	    );	    
 	    break;
+	case POSIXA:
+	    /* Don't need to worry about utf8, as it can match only a single
+            * byte invariant character.  The flag in this node type is the
+            * class number to pass to _generic_isCC_A() to build a mask for
+            * searching in PL_charclass[] */
+	    REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c)));
+	    break;
+	case NPOSIXA:
+	    REXEC_FBC_CSCAN(
+		!_generic_isCC_A(*s, FLAGS(c)),
+		!_generic_isCC_A(*s, FLAGS(c))
+	    );
+	    break;
+
 	case AHOCORASICKC:
 	case AHOCORASICK: 
 	    {
@@ -3881,6 +3895,26 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
 		DIGITA, NDIGITA, isDIGIT_A,
 		digit, "0");
 
+        case POSIXA:
+            if (locinput >= PL_regeol || ! _generic_isCC_A(nextchr, FLAGS(scan))) {
+                sayNO;
+            }
+            /* Matched a utf8-invariant, so don't have to worry about utf8 */
+            nextchr = UCHARAT(++locinput);
+            break;
+        case NPOSIXA:
+            if (locinput >= PL_regeol || _generic_isCC_A(nextchr, FLAGS(scan))) {
+                sayNO;
+            }
+            if (utf8_target) {
+                locinput += PL_utf8skip[nextchr];
+                nextchr = UCHARAT(locinput);
+            }
+            else {
+                nextchr = UCHARAT(++locinput);
+            }
+            break;
+
 	case CLUMP: /* Match \X: logical Unicode character.  This is defined as
 		       a Unicode extended Grapheme Cluster */
 	    /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
@@ -6298,6 +6332,24 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
 	    scan++;
 	}
 	break;
+
+    case POSIXA:
+       while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
+	    scan++;
+	}
+	break;
+    case NPOSIXA:
+	if (utf8_target) {
+	    while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
+		scan += UTF8SKIP(scan);
+	    }
+	}
+	else {
+	    while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
+		scan++;
+	    }
+	}
+	break;
     case NALNUMA:
 	if (utf8_target) {
 	    while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
author	Karl Williamson <public@khwilliamson.com>	2012-07-20 10:23:14 -0600
committer	Karl Williamson <public@khwilliamson.com>	2012-07-24 21:13:49 -0600
commit	0658cddeb9feb16c427ac50f4000b008516b9958 (patch)
tree	6107b53e26cbdce9e42393b8aadf84dd14600772 /regexec.c
parent	3615ea5819e869b314f723b0ce28dfb1d80017ef (diff)
download	perl-0658cddeb9feb16c427ac50f4000b008516b9958.tar.gz