summaryrefslogtreecommitdiff
path: root/regexec.c
diff options
context:
space:
mode:
author Karl Williamson <khw@cpan.org> 2018-12-06 17:18:25 -0700
committer Karl Williamson <khw@cpan.org> 2018-12-26 12:50:38 -0700
commit c316b824875fdd5ce52338f301fb0255d843dfec (patch)
tree c713f631b889cca26e22cadb97523733b14b7c8b /regexec.c
parent b2296192536090829ba6d2cb367456f4e346dcc6 (diff)
download perl-c316b824875fdd5ce52338f301fb0255d843dfec.tar.gz
Add new regnode: ANYOFH, without a bitmap
This commit adds a regnode for the case where nothing in the bitmap matches. This allows the bitmap to be omitted, saving 32 bytes of otherwise wasted space per node. Many non-Latin Unicode properties have this characteristic. Further, since this node applies only to code points above 255, which are representable only in UTF-8, we can trivially fail a match when the target string isn't in UTF-8. Time savings also accrue from skipping the bitmap lookup. When swashes are removed, even more time will be saved.
Diffstat (limited to 'regexec.c')
-rw-r--r--regexec.c29
1 files changed, 28 insertions, 1 deletions
diff --git a/regexec.c b/regexec.c
index b8adce258a..e425adcc24 100644
--- a/regexec.c
+++ b/regexec.c
@@ -2149,6 +2149,11 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
(U8) ARG(c), FLAGS(c)));
break;
+ case ANYOFH:
+ if (utf8_target) REXEC_FBC_CLASS_SCAN(TRUE,
+ reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
+ break;
+
case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */
assert(! is_utf8_pat);
/* FALLTHROUGH */
@@ -6679,6 +6684,17 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
goto increment_locinput;
break;
+ case ANYOFH:
+ if ( ! utf8_target
+ || NEXTCHR_IS_EOS
+ || ! reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend,
+ utf8_target))
+ {
+ sayNO;
+ }
+ goto increment_locinput;
+ break;
+
/* The argument (FLAGS) to all the POSIX node types is the class number
* */
@@ -9339,6 +9355,17 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
}
break;
+ case ANYOFH:
+ if (utf8_target) while ( hardcount < max
+ && scan < loceol
+ && reginclass(prog, p, (U8*)scan, (U8*) loceol,
+ TRUE))
+ {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ break;
+
/* The argument (FLAGS) to all the POSIX node types is the class number */
case NPOSIXL:
@@ -9631,7 +9658,7 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const
}
/* If this character is potentially in the bitmap, check it */
- if (c < NUM_ANYOF_CODE_POINTS) {
+ if (c < NUM_ANYOF_CODE_POINTS && OP(n) != ANYOFH) {
if (ANYOF_BITMAP_TEST(n, c))
match = TRUE;
else if ((flags