diff options
author | Karl Williamson <khw@cpan.org> | 2020-05-21 10:07:50 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2022-07-01 11:07:55 -0600 |
commit | d62feba66bf43f35d092bb026694f927e9f94d38 (patch) | |
tree | acdb47a399994d59f65b1cb1d2f58e5a611b13aa /regcomp.c | |
parent | c5321c9615e8ff5e65c601de5fd409878da8568b (diff) | |
download | perl-d62feba66bf43f35d092bb026694f927e9f94d38.tar.gz |
regex: Add POSIXA1R node
Several of the POSIXA classes are a single range on ASCII platforms, and
[:digit:] is a single range on both ASCII and EBCDIC. This regnode was
designed to replace the POSIXA regnode for such classes to get a bit of
performance by not needing to do an array lookup. Instead it encodes
some bits in the flags field that with shifting and masking get the
right values for the single range's bounds for any such node.
However, performance tests conducted by Sergey Aleynikov showed this was
actually slower than what it intended to replace. Rather than
completely drop this work, I'm adding it to blead, and immediately
reverting it, so that should parts of it ever become useful, it would be
available.
A few tests fail; those are skipped for the purposes of this commit so
that it doesn't interfere with bisecting.
The code also isn't completely commented.
One could add a regnode for each posix class it was decided should have
the expected performance boost. But regnodes are a finite resource, and
the boost is probably not large enough to justify doing so.
Diffstat (limited to 'regcomp.c')
-rw-r--r-- | regcomp.c | 103 |
1 files changed, 96 insertions, 7 deletions
@@ -6226,6 +6226,23 @@ S_study_chunk(pTHX_ my_invlist = invlist_clone(PL_Posix_ptrs[FLAGS(scan)], NULL); goto join_posix_and_ascii; + case NPOSIXA1R: + invert = 1; + /* FALLTHROUGH */ + case POSIXA1R: + if (POSIXA1Rmasked(scan, 'a') == 'A') { + my_invlist = _add_range_to_invlist(NULL, 'A', 'Z'); + my_invlist = _add_range_to_invlist(my_invlist, 'a', + 'z'); + } + else { + my_invlist = _add_range_to_invlist(NULL, + POSIXA1Rbase(scan), + POSIXA1Rbase(scan) + + POSIXA1Rdelta(scan)); + } + goto join_posix_and_ascii; + case NPOSIXD: case NPOSIXU: invert = 1; @@ -20370,11 +20387,57 @@ S_optimize_regclass(pTHX_ try_inverted)) { /* Here, they precisely match. Optimize this ANYOF - * node into its equivalent POSIX one of the correct - * type, possibly inverted */ - op = (try_inverted) - ? type + NPOSIXA - POSIXA - : type; + * node. */ + + /* If it's a single range it is optimizable into + * POSIXA1R */ + bool single_range_matchable = single_range; + + U32 lowest_matchable = invlist_lowest(*official_code_points); + U32 highest_matchable = invlist_highest(*official_code_points); + + /* But if not a single range, it might be the + * complement of one, or [:alpha:]; both can go to + * POSIXA1R */ + if ( ! single_range_matchable + && ( _invlist_len(*official_code_points) == 2 + || (lowest_matchable == 'A' && highest_matchable == 'z'))) + { + single_range_matchable = true; + } + + if (! single_range_matchable) { + op = (try_inverted) + ? type + NPOSIXA - POSIXA + : type; + } + else { + + posix_class = 0; + + if (lowest_matchable != 'A' || highest_matchable != 'z') { + posix_class |= POSIXA1R_ALPHA_BIT; + } + + if (lowest_matchable % 2) { + posix_class |= POSIXA1R_1x_BIT; + } + + posix_class |= NATIVE_TO_LATIN1(lowest_matchable) / 16; + + if (highest_matchable != '9') { + posix_class |= POSIXA1R_16L_BIT; + if (highest_matchable == '~') { + posix_class |= POSIXA1R_68L_BIT; + if (lowest_matchable == ' ') { + posix_class |= POSIXA1R_1L_BIT; + } + } + } + + op = POSIXA1R + try_inverted; + } + *ret = reg_node(pRExC_state, op); FLAGS(REGNODE_p(*ret)) = posix_class; SvREFCNT_dec(d_invlist); @@ -20384,6 +20447,7 @@ S_optimize_regclass(pTHX_ } } } + SvREFCNT_dec(d_invlist); SvREFCNT_dec(intersection); } @@ -21979,8 +22043,33 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ SvREFCNT_dec(cp_list); } - else if (k == POSIXD || k == NPOSIXD) { - U8 index = FLAGS(o) * 2; + else if (k == POSIXD || k == NPOSIXD || k == POSIXA1R) { + U8 index; + + if (k == POSIXA1R) { + if (POSIXA1Rmasked(o, 'a') == 'A') { + index = CC_ALPHA_ * 2; + } + else if (POSIXA1Rbase(o) == ' ') { + index = CC_PRINT_ * 2; + } + else if (POSIXA1Rbase(o) == '!') { + index = CC_GRAPH_ * 2; + } + else if (POSIXA1Rbase(o) == '0') { + index = CC_DIGIT_ * 2; + } + else if (POSIXA1Rbase(o) == 'A') { + index = CC_UPPER_ * 2; + } + else { + assert(POSIXA1Rbase(o) == 'a'); + index = CC_LOWER_ * 2; + } + } + else { + index = FLAGS(o) * 2; + } if (index < C_ARRAY_LENGTH(anyofs)) { if (*anyofs[index] != '[') { sv_catpvs(sv, "["); |