diff options
author | Karl Williamson <khw@cpan.org> | 2022-07-05 09:41:12 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2022-07-12 05:14:35 -0600 |
commit | 4c8c99df3c8fddf745e5971c2f6b56d2027f2be8 (patch) | |
tree | bf66d4668df70961fb5a208021bb6b1874417b51 /regcomp.c | |
parent | 4bc69097efa904296afd72d1c6ca0d95f3fbb028 (diff) | |
download | perl-4c8c99df3c8fddf745e5971c2f6b56d2027f2be8.tar.gz |
regex: Add optimizing regnode
It turns out that any character class whose UTF-8 representation is two
bytes long, and where all elements share the same first byte can be
represented by a compact, fast regnode designed for the purpose.
This commit adds that regnode, ANYOFHbbm. ANYOFHb already exists for
classes where all elements have the same first byte, and this just
changes the two-byte ones to use a bitmap instead of an inversion list.
The advantages of this are that no conversion to code point is required
(the continuation byte is just looked up in the bitmap) and no inversion
list is needed. The inversion list would occupy more space, from 4 to
34 extra 64-bit words, plus an AV and SV, depending on what elements the
class matches.
Many characters in the Latin, Greek, Cyrillic, Greek, Hebrew, Arabic,
and several other (lesser-known) scripts are of this form.
It would be possible to extend this technique to larger bitmaps, but
this commit is a start.
Diffstat (limited to 'regcomp.c')
-rw-r--r-- | regcomp.c | 79 |
1 files changed, 77 insertions, 2 deletions
@@ -6188,6 +6188,21 @@ S_study_chunk(pTHX_ (regnode_charclass *) scan); break; + case ANYOFHbbm: + { + SV* cp_list = get_ANYOFHbbm_contents(scan); + + if (flags & SCF_DO_STCLASS_OR) { + ssc_union(data->start_class, cp_list, invert); + } + else if (flags & SCF_DO_STCLASS_AND) { + ssc_intersection(data->start_class, cp_list, invert); + } + + SvREFCNT_dec_NN(cp_list); + break; + } + case NANYOFM: /* NANYOFM already contains the inversion of the input ANYOF data, so, unlike things like NPOSIXA, don't change 'invert' to TRUE */ @@ -20519,13 +20534,46 @@ S_optimize_regclass(pTHX_ Size_t len = find_first_differing_byte_pos(low_utf8, high_utf8, MIN(low_len, high_len)); - if (len == 1) { /* No need to convert to I8 for EBCDIC as this is an exact * match */ *anyof_flags = low_utf8[0]; - op = ANYOFHb; + + if (high_len == 2) { + /* If the elements matched all have a 2-byte UTF-8 + * representation, with the first byte being the same, + * we can use a compact, fast regnode. capable of + * matching any combination of continuation byte + * patterns. + * + * (A similar regnode could be created for the Latin1 + * range; the complication being that it could match + * non-UTF8 targets. The internal bitmap would serve + * both cases; with some extra code in regexec.c) */ + op = ANYOFHbbm; + *ret = REGNODE_GUTS(pRExC_state, op, regarglen[op]); + FILL_NODE(*ret, op); + ((struct regnode_bbm *) REGNODE_p(*ret))->first_byte = low_utf8[0], + + /* The 64 bit (or 32 on EBCCDIC) map can be looked up + * directly based on the continuation byte, without + * needing to convert to code point */ + populate_bitmap_from_invlist( + cp_list, + + /* The base code point is from the start byte */ + TWO_BYTE_UTF8_TO_NATIVE(low_utf8[0], + UTF_CONTINUATION_MARK | 0), + + ((struct regnode_bbm *) REGNODE_p(*ret))->bitmap, + REGNODE_BBM_BITMAP_LEN); + RExC_emit += NODE_STEP_REGNODE + regarglen[op]; + return op; + } + else { + op = ANYOFHb; + } } else { op = ANYOFHs; @@ -21429,6 +21477,22 @@ S_get_ANYOFM_contents(pTHX_ const regnode * n) { return cp_list; } +STATIC SV * +S_get_ANYOFHbbm_contents(pTHX_ const regnode * n) { + PERL_ARGS_ASSERT_GET_ANYOFHBBM_CONTENTS; + + SV * cp_list = NULL; + populate_invlist_from_bitmap( + ((struct regnode_bbm *) n)->bitmap, + REGNODE_BBM_BITMAP_LEN * CHARBITS, + &cp_list, + + /* The base cp is from the start byte plus a zero continuation */ + TWO_BYTE_UTF8_TO_NATIVE(((struct regnode_bbm *) n)->first_byte, + UTF_CONTINUATION_MARK | 0)); + return cp_list; +} + /* - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form */ @@ -22054,6 +22118,17 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ SvREFCNT_dec(cp_list); } + else if (k == ANYOFHbbm) { + SV * cp_list = get_ANYOFHbbm_contents(o); + Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); + + Perl_sv_catsv(aTHX_ sv, invlist_contents(cp_list, + FALSE /* output suitable for catsv */ + )); + Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); + + SvREFCNT_dec(cp_list); + } else if (k == POSIXD || k == NPOSIXD) { U8 index = FLAGS(o) * 2; if (index < C_ARRAY_LENGTH(anyofs)) { |