summaryrefslogtreecommitdiff
path: root/regcomp.h
diff options
context:
space:
mode:
author: Karl Williamson <public@khwilliamson.com> 2014-01-29 20:42:33 -0700
committer: Karl Williamson <public@khwilliamson.com> 2014-02-15 21:55:32 -0700
commit: 34fdef848b1687b91892ba55e9e0c3430e0770f6 (patch)
tree: 0c42f2e84076a6040b8b29fe47ad88c92228640b /regcomp.h
parent: 56feebade29d8842a38364ccb13c5ff09284d0d7 (diff)
download: perl-34fdef848b1687b91892ba55e9e0c3430e0770f6.tar.gz
Free up bit for regex ANYOF nodes
This commit frees up a bit by using an extra regnode to pass the information to the regex engine instead of the flag. I originally thought that if this was needed, it should be the ANYOF_ABOVE_LATIN1_ALL bit, as that might speed some things up. But if we need to do this again by adding another node to get another bit, we want one that is mutually exclusive of the first one we did, for otherwise we start having to make 3 nodes instead of two to get the combinations (1 0), (0 1), and (1 1). This combinatorial problem is avoided by using bits that are mutually exclusive, which ABOVE_LATIN1_ALL isn't; but the one freed by this commit, ANYOF_NON_UTF8_NON_ASCII_ALL, is only set under /d matching, and there are other bits that are set only under /l, so if we need to do this again, we should use one of those. I wrote this code when I thought I really needed a bit. But since then, I have figured out a better way to get the bit needed now. But I don't want to lose this code to posterity, so this commit is being made just long enough to get the commit number; then it will be reverted, adding comments referring to the commit number, so that it can easily be reconstructed when necessary.
Diffstat (limited to 'regcomp.h')
-rw-r--r-- regcomp.h 26
1 files changed, 10 insertions, 16 deletions
diff --git a/regcomp.h b/regcomp.h
index 84aa85d0f2..69bd852e31 100644
--- a/regcomp.h
+++ b/regcomp.h
@@ -358,17 +358,15 @@ struct regnode_ssc {
* ANYOF_NONBITMAP_NON_UTF8 bit is also set. */
#define ANYOF_NONBITMAP(node) (ARG(node) != ANYOF_NONBITMAP_EMPTY)
-/* Flags for node->flags of ANYOF. These are in short supply, with none
- * currently available. If more are needed, the ANYOF_LOCALE and
- * ANYOF_POSIXL bits could be shared, making a space penalty for all locale
- * nodes. Also, the ABOVE_LATIN1_ALL bit could be freed up by resorting to
- * creating a swash containing everything above 255. This introduces a
- * performance penalty. Better would be to split it off into a separate node,
- * which actually would improve performance a bit by allowing regexec.c to test
- * for a UTF-8 character being above 255 without having to call a function nor
- * calculate its code point value. Several flags are not used in synthetic
- * start class (SSC) nodes, so could be shared should new flags be needed for
- * SSCs. */
+/* Flags for node->flags of ANYOF. These are in short supply, with one
+ * currently available. If more are needed, the ABOVE_LATIN1_ALL bit could be
+ * freed up by resorting to creating a swash containing everything above 255.
+ * This introduces a performance penalty. An option that wouldn't slow things
+ * down would be to split one of the LOC flags out into a separate node, like
+ * what has been done with ANYOF_NON_UTF8_NON_ASCII_ALL. One of these is only
+ * for /l nodes; the other only for /d, so there are no combinatorial issues.
+ * Several flags are not used in synthetic start class (SSC) nodes, so could be
+ * shared should new flags be needed for SSCs, like ANYOF_EMPTY_STRING now. */
/* regexec.c is expecting this to be in the low bit */
#define ANYOF_INVERT 0x01
@@ -406,11 +404,7 @@ struct regnode_ssc {
#define ANYOF_ABOVE_LATIN1_ALL 0x40
#define ANYOF_UNICODE_ALL ANYOF_ABOVE_LATIN1_ALL
-/* Match all Latin1 characters that aren't ASCII when the target string is not
- * in utf8. */
-#define ANYOF_NON_UTF8_NON_ASCII_ALL 0x80
-
-#define ANYOF_FLAGS_ALL (0xff)
+#define ANYOF_FLAGS_ALL (0x7F)
#define ANYOF_LOCALE_FLAGS (ANYOF_LOCALE \
|ANYOF_LOC_FOLD \