diff options
author | Karl Williamson <public@khwilliamson.com> | 2012-07-20 10:23:14 -0600 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2012-07-24 21:13:49 -0600 |
commit | 0658cddeb9feb16c427ac50f4000b008516b9958 (patch) | |
tree | 6107b53e26cbdce9e42393b8aadf84dd14600772 /regexec.c | |
parent | 3615ea5819e869b314f723b0ce28dfb1d80017ef (diff) | |
download | perl-0658cddeb9feb16c427ac50f4000b008516b9958.tar.gz |
regcomp.c: Use POSIXA, NPOSIXA
This commit optimizes character classes that are matched under /a or
/aa and consist of a single Posix class into POSIXA or NPOSIXA regop
types, for example /[[:word:]]/a. Since [:ascii:] is always
ASCII-restricted no matter what the charset modifier is, it is always
optimized.
These nodes should execute somewhat faster than a generic ANYOF node,
and are significantly smaller, taking 2 bytes instead of 12.
The flags field of the node structure holds an enum value indicating
which of the 15 Posix classes is being matched.
Diffstat (limited to 'regexec.c')
-rw-r--r-- | regexec.c | 52 |
1 files changed, 52 insertions, 0 deletions
@@ -1814,6 +1814,20 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, !is_HORIZWS_latin1(s) ); break; + case POSIXA: + /* Don't need to worry about utf8, as it can match only a single + * byte invariant character. The flag in this node type is the + * class number to pass to _generic_isCC() to build a mask for + * searching in PL_charclass[] */ + REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c))); + break; + case NPOSIXA: + REXEC_FBC_CSCAN( + !_generic_isCC_A(*s, FLAGS(c)), + !_generic_isCC_A(*s, FLAGS(c)) + ); + break; + case AHOCORASICKC: case AHOCORASICK: { @@ -3881,6 +3895,26 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) DIGITA, NDIGITA, isDIGIT_A, digit, "0"); + case POSIXA: + if (locinput >= PL_regeol || ! _generic_isCC_A(nextchr, FLAGS(scan))) { + sayNO; + } + /* Matched a utf8-invariant, so don't have to worry about utf8 */ + nextchr = UCHARAT(++locinput); + break; + case NPOSIXA: + if (locinput >= PL_regeol || _generic_isCC_A(nextchr, FLAGS(scan))) { + sayNO; + } + if (utf8_target) { + locinput += PL_utf8skip[nextchr]; + nextchr = UCHARAT(locinput); + } + else { + nextchr = UCHARAT(++locinput); + } + break; + case CLUMP: /* Match \X: logical Unicode character. This is defined as a Unicode extended Grapheme Cluster */ /* From http://www.unicode.org/reports/tr29 (5.2 version). An @@ -6298,6 +6332,24 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) scan++; } break; + + case POSIXA: + while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) { + scan++; + } + break; + case NPOSIXA: + if (utf8_target) { + while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) { + scan += UTF8SKIP(scan); + } + } + else { + while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) { + scan++; + } + } + break; case NALNUMA: if (utf8_target) { while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) { |