summaryrefslogtreecommitdiff
path: root/regcomp.h
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2019-11-19 19:15:38 -0700
committerKarl Williamson <khw@cpan.org>2019-11-20 14:09:21 -0700
commit34924db0919c191e271602c82cb2de7784fc63a4 (patch)
tree420a27c457c5c44f6089f07fc2657813531d4920 /regcomp.h
parent21c3fd9dd0a7a389c901af03acc1907666ee1870 (diff)
downloadperl-34924db0919c191e271602c82cb2de7784fc63a4.tar.gz
Add ANYOFHs regnode
This node is like ANYOFHb, but is used when more than one leading byte is the same in all the matched code points. ANYOFHb is used to avoid having to convert from UTF-8 to code point for something that won't match. It checks that the first byte in the UTF-8 encoded target is the desired one, thus ruling out most of the possible code points. But for higher code points that require longer UTF-8 sequences, a great many non-matching code points pass this filter. For code points above 0xFFFF, there are almost 200K for which it is ineffective. This commit creates a new node type that addresses this problem. Instead of a single byte, it stores as many leading bytes as are the same for all code points that match the class. For many classes, that will cut down the number of possible false positives by a huge amount before having to convert to code point to make the final determination. This regnode adds a UTF-8 string at the end. It is still much smaller, even in the rare worst case, than a plain ANYOF node because the maximum string length, 15 bytes, is still shorter than the 32-byte bitmap that is present in a plain ANYOF. Most of the time the added string will instead be at most 4 bytes.
Diffstat (limited to 'regcomp.h')
-rw-r--r--regcomp.h8
1 files changed, 8 insertions, 0 deletions
diff --git a/regcomp.h b/regcomp.h
index ba609d82f7..3f7dd31391 100644
--- a/regcomp.h
+++ b/regcomp.h
@@ -164,6 +164,14 @@ struct regnode_lstring { /* Constructed this way to keep the string aligned. */
char string[1];
};
+struct regnode_anyofhs { /* Node layout for ANYOFHs. Constructed this way to keep the string aligned. */
+ U8 flags;
+ U8 type;
+ U16 next_off;
+ U32 arg1; /* set by set_ANYOF_arg() */
+ char string[1]; /* UTF-8 leading bytes shared by every code point the class matches; NOTE(review): the [1] bound is presumably a placeholder for a longer trailing string (commit message cites a 15-byte maximum) -- confirm against the allocator */
+};
+
/* Argument bearing node - workhorse,
arg1 is often for the data field */
struct regnode_1 {