ANYOF regnode often overallocates space

The ANYOF regnode is used for [bracketed character classes]. It is normally a struct regnode_charclass, but sometimes it is a struct regnode_charclass_class, which adds an extra 4 bytes at the end to store which of things like \d, \w, [:posix:] within the outer bracketed class are to be matched at run time, like [\d\s[:upper:]]. However these are currently only stored the if the match is to be locale based. I haven't looked in the history, but the comments seem to indicate that in the past those things were stored there all the time. But, the space is allocated even if nothing gets stored there. This patch only allocates the space if there is one of these classes occurring in a [bracketed class] and something is stored there, namely only if the regex is being compiled under 'use locale'. The previous code was not noting that these classes matched code points (in the 'stored' variable), but was relying on ANYOF_LARGE getting set which thus avoided it thinking it was a single character class that could be optimized to an EXACT node. Thus, this patch has to explicitly set 'stored' to > 1, as the function already does elsewhere for similar reasons.
author: Karl Williamson <public@khwilliamson.com> 2010-10-26 13:14:12 -0600
committer: Father Chrysostomos <sprout@cpan.org> 2010-10-26 18:10:39 -0700
commit: 2c63ecadcb2533b7a53efa836736d86cb484d969 (patch)
tree: 1bd78a70454feca4da3d354ebf01a416d8d91663
parent: 021897e0cddffa1615b83551abd2136c5f0fbeee (diff)
download: perl-2c63ecadcb2533b7a53efa836736d86cb484d969.tar.gz
1 files changed, 19 insertions, 14 deletions
diff --git a/regcomp.c b/regcomp.c
index 831d579593..0d469c1e97 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -8177,7 +8177,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
 #ifdef EBCDIC
     UV literal_endpoint = 0;
 #endif
-    UV stored = 0;  /* number of chars stored in the class */
+    UV stored = 0;  /* 0, 1, or more than 1 chars stored in the class */
 
     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
         case we need to change the emitted regop to an EXACT. */
@@ -8415,10 +8415,23 @@ parseit:
 
 	if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
 
-	    if (!SIZE_ONLY && !need_class)
-		ANYOF_CLASS_ZERO(ret);
-
-	    need_class = 1;
+	    /* What matches in a locale is not known until runtime, so need to
+	     * (one time per class) allocate extra space to pass to regexec.
+	     * The space will contain a bit for each named class that is to be
+	     * matched against.  This isn't needed for \p{} and pseudo-classes,
+	     * as they are not affected by locale, and hence are dealt with
+	     * separately */
+	    if (LOC && namedclass < ANYOF_MAX && ! need_class) {
+		need_class = 1;
+		if (SIZE_ONLY) {
+		    RExC_size += ANYOF_CLASS_ADD_SKIP;
+		}
+		else {
+		    RExC_emit += ANYOF_CLASS_ADD_SKIP;
+		    ANYOF_CLASS_ZERO(ret);
+		}
+		    ANYOF_FLAGS(ret) |= ANYOF_CLASS|ANYOF_LARGE;
+	    }
 
 	    /* a bad range like a-\d, a-[:digit:] ? */
 	    if (range) {
@@ -8549,8 +8562,7 @@ parseit:
 		    /* Strings such as "+utf8::isWord\n" */
 		    Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
 		}
-		if (LOC)
-		    ANYOF_FLAGS(ret) |= ANYOF_CLASS;
+		stored+=2; /* can't optimize this class */
 		continue;
 	    }
 	} /* end of namedclass \blah */
@@ -8709,13 +8721,6 @@ parseit:
 	range = 0; /* this range (if it was one) is done now */
     }
 
-    if (need_class) {
-	ANYOF_FLAGS(ret) |= ANYOF_LARGE;
-	if (SIZE_ONLY)
-	    RExC_size += ANYOF_CLASS_ADD_SKIP;
-	else
-	    RExC_emit += ANYOF_CLASS_ADD_SKIP;
-    }
 
 
     if (SIZE_ONLY)
author	Karl Williamson <public@khwilliamson.com>	2010-10-26 13:14:12 -0600
committer	Father Chrysostomos <sprout@cpan.org>	2010-10-26 18:10:39 -0700
commit	2c63ecadcb2533b7a53efa836736d86cb484d969 (patch)
tree	1bd78a70454feca4da3d354ebf01a416d8d91663
parent	021897e0cddffa1615b83551abd2136c5f0fbeee (diff)
download	perl-2c63ecadcb2533b7a53efa836736d86cb484d969.tar.gz