diff options
author | Karl Williamson <khw@cpan.org> | 2018-12-06 17:18:25 -0700 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2018-12-26 12:50:38 -0700 |
commit | c316b824875fdd5ce52338f301fb0255d843dfec (patch) | |
tree | c713f631b889cca26e22cadb97523733b14b7c8b /regexec.c | |
parent | b2296192536090829ba6d2cb367456f4e346dcc6 (diff) | |
download | perl-c316b824875fdd5ce52338f301fb0255d843dfec.tar.gz |
Add new regnode: ANYOFH, without a bitmap
This commit adds a regnode for the case where nothing in the bitmap
matches. This allows the bitmap to be omitted, saving 32 bytes of
otherwise wasted space per node. Many non-Latin Unicode properties have
this characteristic. Further, since this node applies only to code
points above 255, which are representable only in UTF-8, we can
trivially fail a match where the target string isn't in UTF-8. Time
savings also accrue from skipping the bitmap look-up. When swashes are
removed, even more time will be saved.
Diffstat (limited to 'regexec.c')
-rw-r--r-- | regexec.c | 29 |
1 files changed, 28 insertions, 1 deletions
@@ -2149,6 +2149,11 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, (U8) ARG(c), FLAGS(c))); break; + case ANYOFH: + if (utf8_target) REXEC_FBC_CLASS_SCAN(TRUE, + reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)); + break; + case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */ assert(! is_utf8_pat); /* FALLTHROUGH */ @@ -6679,6 +6684,17 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) goto increment_locinput; break; + case ANYOFH: + if ( ! utf8_target + || NEXTCHR_IS_EOS + || ! reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend, + utf8_target)) + { + sayNO; + } + goto increment_locinput; + break; + /* The argument (FLAGS) to all the POSIX node types is the class number * */ @@ -9339,6 +9355,17 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, } break; + case ANYOFH: + if (utf8_target) while ( hardcount < max + && scan < loceol + && reginclass(prog, p, (U8*)scan, (U8*) loceol, + TRUE)) + { + scan += UTF8SKIP(scan); + hardcount++; + } + break; + /* The argument (FLAGS) to all the POSIX node types is the class number */ case NPOSIXL: @@ -9631,7 +9658,7 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const } /* If this character is potentially in the bitmap, check it */ - if (c < NUM_ANYOF_CODE_POINTS) { + if (c < NUM_ANYOF_CODE_POINTS && OP(n) != ANYOFH) { if (ANYOF_BITMAP_TEST(n, c)) match = TRUE; else if ((flags |