summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Karl Williamson <public@khwilliamson.com> 2012-07-15 16:11:45 -0600
committer: Karl Williamson <public@khwilliamson.com> 2012-07-19 09:39:06 -0600
commit: 8b27d3db700fc2fce268e3d78e221a16ccaca2e8 (patch)
tree: 5c0eb906bbd3391361d7e06c6a9798533c5df1cb
parent: c8c10c2ff3acbb5b933bbbd2eb5b4f3c49608e42 (diff)
download: perl-8b27d3db700fc2fce268e3d78e221a16ccaca2e8.tar.gz
regcomp.h: Free up bit; downside is makes locale ANYOF nodes large
There have been two flavors of ANYOF nodes under /l (locale) (for bracketed character classes). If a class didn't try to match things like [:word:], it was smaller by 4 bytes than one that did. A flag bit was used to indicate which size it was. By making all such nodes the larger size, whether needed or not, that bit can be freed to be used for other purposes. This only affects ANYOF nodes compiled under locale rules. The hope is to eventually get rid of these nodes anyway, by taking the suggestion of Yves Orton to compile regular expressions using the current locale, and automatically recompile the next time they are used after the locale changes. This commit is somewhat experimental, done early in the development cycle to see if there are any breakages. There are other ways to free up a bit, as explained in the comments. Best would be to split off nodes that match everything outside Latin1, freeing up the ANYOF_UNICODE_ALL bit. However, there currently would need to be two flavors of this, one also for ANYOFV. I'm currently working to eliminate the need for ANYOFV nodes (which aren't sufficient, [perl #89774]), so it's easiest to wait for this work to be done before doing the split, after which we can revert this change in order to gain back the space, but in the meantime, this will have had the opportunity to smoke out issues that I would like to know about.
-rw-r--r-- regcomp.c 27
-rw-r--r-- regcomp.h 29
2 files changed, 34 insertions, 22 deletions
diff --git a/regcomp.c b/regcomp.c
index 3d5be65a00..22e2cd888c 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -10968,7 +10968,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
register regnode *ret;
STRLEN numlen;
- IV namedclass;
+ IV namedclass = OOB_NAMEDCLASS;
char *rangebegin = NULL;
bool need_class = 0;
bool allow_full_fold = TRUE; /* Assume wants multi-char folding */
@@ -11348,15 +11348,20 @@ parseit:
literal_endpoint++;
#endif
- if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
-
- /* What matches in a locale is not known until runtime, so need to
- * (one time per class) allocate extra space to pass to regexec.
- * The space will contain a bit for each named class that is to be
- * matched against. This isn't needed for \p{} and pseudo-classes,
- * as they are not affected by locale, and hence are dealt with
- * separately */
- if (LOC && namedclass < ANYOF_MAX && ! need_class) {
+ /* What matches in a locale is not known until runtime. This
+ * includes what the Posix classes (like \w, [:space:]) match.
+ * Room must be reserved (one time per class) to store such
+ * classes, either if Perl is compiled so that locale nodes always
+ * should have this space, or if there is such class info to be
+ * stored. The space will contain a bit for each named class that
+ * is to be matched against. This isn't needed for \p{} and
+ * pseudo-classes, as they are not affected by locale, and hence
+ * are dealt with separately */
+ if (LOC
+ && ! need_class
+ && (ANYOF_LOCALE == ANYOF_CLASS
+ || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX)))
+ {
need_class = 1;
if (SIZE_ONLY) {
RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
@@ -11368,6 +11373,8 @@ parseit:
ANYOF_FLAGS(ret) |= ANYOF_CLASS;
}
+ if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
+
/* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
* literal, as is the character that began the false range, i.e.
* the 'a' in the examples */
diff --git a/regcomp.h b/regcomp.h
index 8fd6b3617f..0267d2a586 100644
--- a/regcomp.h
+++ b/regcomp.h
@@ -308,16 +308,20 @@ struct regnode_charclass_class {
#define ANYOF_NONBITMAP(node) (ARG(node) != ANYOF_NONBITMAP_EMPTY)
/* Flags for node->flags of ANYOF. These are in short supply, so some games
- * are done to share them, as described below. If necessary, the ANYOF_LOCALE
- * and ANYOF_CLASS bits could be shared with a space penalty for locale nodes,
- * but this isn't quite so easy, as the optimizer also uses ANYOF_CLASS.
- * Another option would be to push them into new nodes. E.g. there could be an
- * ANYOF_LOCALE node that would be in place of the flag of the same name.
- * The UNICODE_ALL bit could be freed up by resorting to creating a swash with
- * everything above 255 in it. This introduces a performance penalty.
- * If flags need to be added that are applicable to the synthetic start class
- * only, with some work, they could be put in the next-node field, or in an
- * unused bit of the classflags field. */
+ * are done to share them, as described below. Already, the ANYOF_LOCALE and
+ * ANYOF_CLASS bits are shared, making a space penalty for all locale nodes.
+ * An option would be to push them into new nodes. E.g. there could be an
+ * ANYOF_LOCALE node that would be in place of the flag of the same name. But
+ * there are better options. The UNICODE_ALL bit could be freed up by
+ * resorting to creating a swash containing everything above 255. This
+ * introduces a performance penalty. Better would be to split it off into a
+ * separate node, which actually would improve performance by allowing adding a
+ * case statement to regexec.c to use the bit map for code points under 256, and
+ * to match everything above. If flags need to be added that are applicable to
+ * the synthetic start class only, with some work, they could be put in the
+ * next-node field, or in an unused bit of the classflags field. This could be
+ * done with the current EOS flag, and a new node type created that is just for
+ * the scc, freeing up that bit */
#define ANYOF_LOCALE 0x01 /* /l modifier */
@@ -335,8 +339,9 @@ struct regnode_charclass_class {
/* Set if this is a struct regnode_charclass_class vs a regnode_charclass. This
* is used for runtime \d, \w, [:posix:], ..., which are used only in locale
* and the optimizer's synthetic start class. Non-locale \d, etc are resolved
- * at compile-time */
-#define ANYOF_CLASS 0x08
+ * at compile-time. Now shared with ANYOF_LOCALE, forcing all locale nodes to
+ * be large */
+#define ANYOF_CLASS ANYOF_LOCALE
#define ANYOF_LARGE ANYOF_CLASS /* Same; name retained for back compat */
/* EOS, meaning that it can match an empty string too, is used for the