summaryrefslogtreecommitdiff
path: root/regcomp.c
diff options
context:
space:
mode:
author Karl Williamson <khw@cpan.org> 2020-05-21 10:07:50 -0600
committer Karl Williamson <khw@cpan.org> 2022-07-01 11:07:55 -0600
commit d62feba66bf43f35d092bb026694f927e9f94d38 (patch)
tree acdb47a399994d59f65b1cb1d2f58e5a611b13aa /regcomp.c
parent c5321c9615e8ff5e65c601de5fd409878da8568b (diff)
download perl-d62feba66bf43f35d092bb026694f927e9f94d38.tar.gz
regex: Add POSIXA1R node
Several of the POSIXA classes are a single range on ASCII platforms, and [:digit:] is a single range on both ASCII and EBCDIC. This regnode was designed to replace the POSIXA regnode for such classes to get a bit of performance by not needing to do an array lookup. Instead, it encodes some bits in the flags field that, with shifting and masking, yield the right values for the single range's bounds for any such node. However, performance tests conducted by Sergey Aleynikov showed this was actually slower than what it was intended to replace. Rather than completely drop this work, I'm adding it to blead and immediately reverting it, so that should parts of it ever become useful, they would be available. A few tests fail; those are skipped for the purposes of this commit so that it doesn't interfere with bisecting. The code also isn't completely commented. One could instead add a separate regnode for each POSIX class that is judged likely to get the expected performance boost. But regnodes are a finite resource, and the boost is probably not large enough to justify doing so.
Diffstat (limited to 'regcomp.c')
-rw-r--r--regcomp.c103
1 files changed, 96 insertions, 7 deletions
diff --git a/regcomp.c b/regcomp.c
index cc05c2ae85..ffb6481bcf 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -6226,6 +6226,23 @@ S_study_chunk(pTHX_
my_invlist = invlist_clone(PL_Posix_ptrs[FLAGS(scan)], NULL);
goto join_posix_and_ascii;
+ case NPOSIXA1R:
+ invert = 1;
+ /* FALLTHROUGH */
+ case POSIXA1R:
+ if (POSIXA1Rmasked(scan, 'a') == 'A') {
+ my_invlist = _add_range_to_invlist(NULL, 'A', 'Z');
+ my_invlist = _add_range_to_invlist(my_invlist, 'a',
+ 'z');
+ }
+ else {
+ my_invlist = _add_range_to_invlist(NULL,
+ POSIXA1Rbase(scan),
+ POSIXA1Rbase(scan)
+ + POSIXA1Rdelta(scan));
+ }
+ goto join_posix_and_ascii;
+
case NPOSIXD:
case NPOSIXU:
invert = 1;
@@ -20370,11 +20387,57 @@ S_optimize_regclass(pTHX_
try_inverted))
{
/* Here, they precisely match. Optimize this ANYOF
- * node into its equivalent POSIX one of the correct
- * type, possibly inverted */
- op = (try_inverted)
- ? type + NPOSIXA - POSIXA
- : type;
+ * node. */
+
+ /* If it's a single range it is optimizable into
+ * POSIXA1R */
+ bool single_range_matchable = single_range;
+
+ U32 lowest_matchable = invlist_lowest(*official_code_points);
+ U32 highest_matchable = invlist_highest(*official_code_points);
+
+ /* But if not a single range, it might be the
+ * complement of one, or [:alpha:]; both can go to
+ * POSIXA1R */
+ if ( ! single_range_matchable
+ && ( _invlist_len(*official_code_points) == 2
+ || (lowest_matchable == 'A' && highest_matchable == 'z')))
+ {
+ single_range_matchable = true;
+ }
+
+ if (! single_range_matchable) {
+ op = (try_inverted)
+ ? type + NPOSIXA - POSIXA
+ : type;
+ }
+ else {
+
+ posix_class = 0;
+
+ if (lowest_matchable != 'A' || highest_matchable != 'z') {
+ posix_class |= POSIXA1R_ALPHA_BIT;
+ }
+
+ if (lowest_matchable % 2) {
+ posix_class |= POSIXA1R_1x_BIT;
+ }
+
+ posix_class |= NATIVE_TO_LATIN1(lowest_matchable) / 16;
+
+ if (highest_matchable != '9') {
+ posix_class |= POSIXA1R_16L_BIT;
+ if (highest_matchable == '~') {
+ posix_class |= POSIXA1R_68L_BIT;
+ if (lowest_matchable == ' ') {
+ posix_class |= POSIXA1R_1L_BIT;
+ }
+ }
+ }
+
+ op = POSIXA1R + try_inverted;
+ }
+
*ret = reg_node(pRExC_state, op);
FLAGS(REGNODE_p(*ret)) = posix_class;
SvREFCNT_dec(d_invlist);
@@ -20384,6 +20447,7 @@ S_optimize_regclass(pTHX_
}
}
}
+
SvREFCNT_dec(d_invlist);
SvREFCNT_dec(intersection);
}
@@ -21979,8 +22043,33 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
SvREFCNT_dec(cp_list);
}
- else if (k == POSIXD || k == NPOSIXD) {
- U8 index = FLAGS(o) * 2;
+ else if (k == POSIXD || k == NPOSIXD || k == POSIXA1R) {
+ U8 index;
+
+ if (k == POSIXA1R) {
+ if (POSIXA1Rmasked(o, 'a') == 'A') {
+ index = CC_ALPHA_ * 2;
+ }
+ else if (POSIXA1Rbase(o) == ' ') {
+ index = CC_PRINT_ * 2;
+ }
+ else if (POSIXA1Rbase(o) == '!') {
+ index = CC_GRAPH_ * 2;
+ }
+ else if (POSIXA1Rbase(o) == '0') {
+ index = CC_DIGIT_ * 2;
+ }
+ else if (POSIXA1Rbase(o) == 'A') {
+ index = CC_UPPER_ * 2;
+ }
+ else {
+ assert(POSIXA1Rbase(o) == 'a');
+ index = CC_LOWER_ * 2;
+ }
+ }
+ else {
+ index = FLAGS(o) * 2;
+ }
if (index < C_ARRAY_LENGTH(anyofs)) {
if (*anyofs[index] != '[') {
sv_catpvs(sv, "[");