summaryrefslogtreecommitdiff
path: root/regcomp.c
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2022-07-05 09:41:12 -0600
committerKarl Williamson <khw@cpan.org>2022-07-12 05:14:35 -0600
commit4c8c99df3c8fddf745e5971c2f6b56d2027f2be8 (patch)
treebf66d4668df70961fb5a208021bb6b1874417b51 /regcomp.c
parent4bc69097efa904296afd72d1c6ca0d95f3fbb028 (diff)
downloadperl-4c8c99df3c8fddf745e5971c2f6b56d2027f2be8.tar.gz
regex: Add optimizing regnode
It turns out that any character class whose UTF-8 representation is two bytes long, and where all elements share the same first byte can be represented by a compact, fast regnode designed for the purpose. This commit adds that regnode, ANYOFHbbm. ANYOFHb already exists for classes where all elements have the same first byte, and this just changes the two-byte ones to use a bitmap instead of an inversion list. The advantages of this are that no conversion to code point is required (the continuation byte is just looked up in the bitmap) and no inversion list is needed. The inversion list would occupy more space, from 4 to 34 extra 64-bit words, plus an AV and SV, depending on what elements the class matches. Many characters in the Latin, Greek, Cyrillic, Greek, Hebrew, Arabic, and several other (lesser-known) scripts are of this form. It would be possible to extend this technique to larger bitmaps, but this commit is a start.
Diffstat (limited to 'regcomp.c')
-rw-r--r--regcomp.c79
1 files changed, 77 insertions, 2 deletions
diff --git a/regcomp.c b/regcomp.c
index d8ad1b985d..b32ca22a47 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -6188,6 +6188,21 @@ S_study_chunk(pTHX_
(regnode_charclass *) scan);
break;
+ case ANYOFHbbm:
+ {
+ SV* cp_list = get_ANYOFHbbm_contents(scan);
+
+ if (flags & SCF_DO_STCLASS_OR) {
+ ssc_union(data->start_class, cp_list, invert);
+ }
+ else if (flags & SCF_DO_STCLASS_AND) {
+ ssc_intersection(data->start_class, cp_list, invert);
+ }
+
+ SvREFCNT_dec_NN(cp_list);
+ break;
+ }
+
case NANYOFM: /* NANYOFM already contains the inversion of the
input ANYOF data, so, unlike things like
NPOSIXA, don't change 'invert' to TRUE */
@@ -20519,13 +20534,46 @@ S_optimize_regclass(pTHX_
Size_t len = find_first_differing_byte_pos(low_utf8,
high_utf8,
MIN(low_len, high_len));
-
if (len == 1) {
/* No need to convert to I8 for EBCDIC as this is an exact
* match */
*anyof_flags = low_utf8[0];
- op = ANYOFHb;
+
+ if (high_len == 2) {
+ /* If the elements matched all have a 2-byte UTF-8
+ * representation, with the first byte being the same,
+ * we can use a compact, fast regnode. capable of
+ * matching any combination of continuation byte
+ * patterns.
+ *
+ * (A similar regnode could be created for the Latin1
+ * range; the complication being that it could match
+ * non-UTF8 targets. The internal bitmap would serve
+ * both cases; with some extra code in regexec.c) */
+ op = ANYOFHbbm;
+ *ret = REGNODE_GUTS(pRExC_state, op, regarglen[op]);
+ FILL_NODE(*ret, op);
+ ((struct regnode_bbm *) REGNODE_p(*ret))->first_byte = low_utf8[0],
+
+ /* The 64 bit (or 32 on EBCCDIC) map can be looked up
+ * directly based on the continuation byte, without
+ * needing to convert to code point */
+ populate_bitmap_from_invlist(
+ cp_list,
+
+ /* The base code point is from the start byte */
+ TWO_BYTE_UTF8_TO_NATIVE(low_utf8[0],
+ UTF_CONTINUATION_MARK | 0),
+
+ ((struct regnode_bbm *) REGNODE_p(*ret))->bitmap,
+ REGNODE_BBM_BITMAP_LEN);
+ RExC_emit += NODE_STEP_REGNODE + regarglen[op];
+ return op;
+ }
+ else {
+ op = ANYOFHb;
+ }
}
else {
op = ANYOFHs;
@@ -21429,6 +21477,22 @@ S_get_ANYOFM_contents(pTHX_ const regnode * n) {
return cp_list;
}
+STATIC SV *
+S_get_ANYOFHbbm_contents(pTHX_ const regnode * n) {
+ PERL_ARGS_ASSERT_GET_ANYOFHBBM_CONTENTS;
+
+ SV * cp_list = NULL;
+ populate_invlist_from_bitmap(
+ ((struct regnode_bbm *) n)->bitmap,
+ REGNODE_BBM_BITMAP_LEN * CHARBITS,
+ &cp_list,
+
+ /* The base cp is from the start byte plus a zero continuation */
+ TWO_BYTE_UTF8_TO_NATIVE(((struct regnode_bbm *) n)->first_byte,
+ UTF_CONTINUATION_MARK | 0));
+ return cp_list;
+}
+
/*
- regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
*/
@@ -22054,6 +22118,17 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
SvREFCNT_dec(cp_list);
}
+ else if (k == ANYOFHbbm) {
+ SV * cp_list = get_ANYOFHbbm_contents(o);
+ Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
+
+ Perl_sv_catsv(aTHX_ sv, invlist_contents(cp_list,
+ FALSE /* output suitable for catsv */
+ ));
+ Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
+
+ SvREFCNT_dec(cp_list);
+ }
else if (k == POSIXD || k == NPOSIXD) {
U8 index = FLAGS(o) * 2;
if (index < C_ARRAY_LENGTH(anyofs)) {