summaryrefslogtreecommitdiff
path: root/regcomp.h
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2022-07-05 09:41:12 -0600
committerKarl Williamson <khw@cpan.org>2022-07-12 05:14:35 -0600
commit4c8c99df3c8fddf745e5971c2f6b56d2027f2be8 (patch)
treebf66d4668df70961fb5a208021bb6b1874417b51 /regcomp.h
parent4bc69097efa904296afd72d1c6ca0d95f3fbb028 (diff)
downloadperl-4c8c99df3c8fddf745e5971c2f6b56d2027f2be8.tar.gz
regex: Add optimizing regnode
It turns out that any character class whose UTF-8 representation is two bytes long, and where all elements share the same first byte can be represented by a compact, fast regnode designed for the purpose. This commit adds that regnode, ANYOFHbbm. ANYOFHb already exists for classes where all elements have the same first byte, and this just changes the two-byte ones to use a bitmap instead of an inversion list. The advantages of this are that no conversion to code point is required (the continuation byte is just looked up in the bitmap) and no inversion list is needed. The inversion list would occupy more space, from 4 to 34 extra 64-bit words, plus an AV and SV, depending on what elements the class matches. Many characters in the Latin, Greek, Cyrillic, Greek, Hebrew, Arabic, and several other (lesser-known) scripts are of this form. It would be possible to extend this technique to larger bitmaps, but this commit is a start.
Diffstat (limited to 'regcomp.h')
-rw-r--r--regcomp.h14
1 files changed, 14 insertions, 0 deletions
diff --git a/regcomp.h b/regcomp.h
index 10bd9fc47f..5919d16bc5 100644
--- a/regcomp.h
+++ b/regcomp.h
@@ -171,6 +171,20 @@ struct regnode_2 {
U16 arg2;
};
+#define REGNODE_BBM_BITMAP_LEN \
+ /* 6 info bits requires 64 bits; 5 => 32 */ \
+ ((1 << (UTF_CONTINUATION_BYTE_INFO_BITS)) / CHARBITS)
+
+/* Used for matching any two-byte UTF-8 character whose start byte is known.
+ * The array is a bitmap capable of representing any possible continuation
+ * byte. */
+struct regnode_bbm {
+ U8 first_byte;
+ U8 type;
+ U16 next_off;
+ U8 bitmap[REGNODE_BBM_BITMAP_LEN];
+};
+
#define ANYOF_BITMAP_SIZE (NUM_ANYOF_CODE_POINTS / CHARBITS)
/* Note that these form structs which are supersets of the next smaller one, by