regex: Add optimizing regnode

It turns out that any character class whose UTF-8 representation is two bytes long, and where all elements share the same first byte can be represented by a compact, fast regnode designed for the purpose. This commit adds that regnode, ANYOFHbbm. ANYOFHb already exists for classes where all elements have the same first byte, and this just changes the two-byte ones to use a bitmap instead of an inversion list. The advantages of this are that no conversion to code point is required (the continuation byte is just looked up in the bitmap) and no inversion list is needed. The inversion list would occupy more space, from 4 to 34 extra 64-bit words, plus an AV and SV, depending on what elements the class matches. Many characters in the Latin, Greek, Cyrillic, Greek, Hebrew, Arabic, and several other (lesser-known) scripts are of this form. It would be possible to extend this technique to larger bitmaps, but this commit is a start.
author: Karl Williamson <khw@cpan.org> 2022-07-05 09:41:12 -0600
committer: Karl Williamson <khw@cpan.org> 2022-07-12 05:14:35 -0600
commit: 4c8c99df3c8fddf745e5971c2f6b56d2027f2be8 (patch)
tree: bf66d4668df70961fb5a208021bb6b1874417b51 /regcomp.c
parent: 4bc69097efa904296afd72d1c6ca0d95f3fbb028 (diff)
download: perl-4c8c99df3c8fddf745e5971c2f6b56d2027f2be8.tar.gz
1 files changed, 77 insertions, 2 deletions
diff --git a/regcomp.c b/regcomp.c
index d8ad1b985d..b32ca22a47 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -6188,6 +6188,21 @@ S_study_chunk(pTHX_
                                                           (regnode_charclass *) scan);
                     break;
 
+                case ANYOFHbbm:
+                  {
+                    SV* cp_list = get_ANYOFHbbm_contents(scan);
+
+                    if (flags & SCF_DO_STCLASS_OR) {
+                        ssc_union(data->start_class, cp_list, invert);
+                    }
+                    else if (flags & SCF_DO_STCLASS_AND) {
+                        ssc_intersection(data->start_class, cp_list, invert);
+                    }
+
+                    SvREFCNT_dec_NN(cp_list);
+                    break;
+                  }
+
                 case NANYOFM: /* NANYOFM already contains the inversion of the
                                  input ANYOF data, so, unlike things like
                                  NPOSIXA, don't change 'invert' to TRUE */
@@ -20519,13 +20534,46 @@ S_optimize_regclass(pTHX_
                 Size_t len = find_first_differing_byte_pos(low_utf8,
                                                            high_utf8,
                                                    MIN(low_len, high_len));
-
                 if (len == 1) {
 
                     /* No need to convert to I8 for EBCDIC as this is an exact
                      * match */
                     *anyof_flags = low_utf8[0];
-                    op = ANYOFHb;
+
+                    if (high_len == 2) {
+                        /* If the elements matched all have a 2-byte UTF-8
+                         * representation, with the first byte being the same,
+                         * we can use a compact, fast regnode. capable of
+                         * matching any combination of continuation byte
+                         * patterns.
+                         *
+                         * (A similar regnode could be created for the Latin1
+                         * range; the complication being that it could match
+                         * non-UTF8 targets.  The internal bitmap would serve
+                         * both cases; with some extra code in regexec.c) */
+                        op = ANYOFHbbm;
+                        *ret = REGNODE_GUTS(pRExC_state, op, regarglen[op]);
+                        FILL_NODE(*ret, op);
+                        ((struct regnode_bbm *) REGNODE_p(*ret))->first_byte = low_utf8[0],
+
+                        /* The 64 bit (or 32 on EBCCDIC) map can be looked up
+                         * directly based on the continuation byte, without
+                         * needing to convert to code point */
+                        populate_bitmap_from_invlist(
+                            cp_list,
+
+                            /* The base code point is from the start byte */
+                            TWO_BYTE_UTF8_TO_NATIVE(low_utf8[0],
+                                                    UTF_CONTINUATION_MARK | 0),
+
+                            ((struct regnode_bbm *) REGNODE_p(*ret))->bitmap,
+                            REGNODE_BBM_BITMAP_LEN);
+                        RExC_emit += NODE_STEP_REGNODE + regarglen[op];
+                        return op;
+                    }
+                    else {
+                        op = ANYOFHb;
+                    }
                 }
                 else {
                     op = ANYOFHs;
@@ -21429,6 +21477,22 @@ S_get_ANYOFM_contents(pTHX_ const regnode * n) {
     return cp_list;
 }
 
+STATIC SV *
+S_get_ANYOFHbbm_contents(pTHX_ const regnode * n) {
+    PERL_ARGS_ASSERT_GET_ANYOFHBBM_CONTENTS;
+
+    SV * cp_list = NULL;
+    populate_invlist_from_bitmap(
+              ((struct regnode_bbm *) n)->bitmap,
+              REGNODE_BBM_BITMAP_LEN * CHARBITS,
+              &cp_list,
+
+              /* The base cp is from the start byte plus a zero continuation */
+              TWO_BYTE_UTF8_TO_NATIVE(((struct regnode_bbm *) n)->first_byte,
+                                      UTF_CONTINUATION_MARK | 0));
+    return cp_list;
+}
+
 /*
  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
  */
@@ -22054,6 +22118,17 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
 
         SvREFCNT_dec(cp_list);
     }
+    else if (k == ANYOFHbbm) {
+        SV * cp_list = get_ANYOFHbbm_contents(o);
+        Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
+
+        Perl_sv_catsv(aTHX_ sv, invlist_contents(cp_list,
+                                      FALSE /* output suitable for catsv */
+                                     ));
+        Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
+
+        SvREFCNT_dec(cp_list);
+    }
     else if (k == POSIXD || k == NPOSIXD) {
         U8 index = FLAGS(o) * 2;
         if (index < C_ARRAY_LENGTH(anyofs)) {
author	Karl Williamson <khw@cpan.org>	2022-07-05 09:41:12 -0600
committer	Karl Williamson <khw@cpan.org>	2022-07-12 05:14:35 -0600
commit	4c8c99df3c8fddf745e5971c2f6b56d2027f2be8 (patch)
tree	bf66d4668df70961fb5a208021bb6b1874417b51 /regcomp.c
parent	4bc69097efa904296afd72d1c6ca0d95f3fbb028 (diff)
download	perl-4c8c99df3c8fddf745e5971c2f6b56d2027f2be8.tar.gz