summaryrefslogtreecommitdiff
path: root/regexec.c
diff options
context:
space:
mode:
author: Karl Williamson <public@khwilliamson.com> 2012-07-20 10:23:14 -0600
committer: Karl Williamson <public@khwilliamson.com> 2012-07-24 21:13:49 -0600
commit: 0658cddeb9feb16c427ac50f4000b008516b9958 (patch)
tree: 6107b53e26cbdce9e42393b8aadf84dd14600772 /regexec.c
parent: 3615ea5819e869b314f723b0ce28dfb1d80017ef (diff)
download: perl-0658cddeb9feb16c427ac50f4000b008516b9958.tar.gz
regcomp.c: Use POSIXA, NPOSIXA
This commit optimizes character classes that are matched under /a or /aa and consist of a single POSIX class into POSIXA or NPOSIXA regop types; for example, /[[:word:]]/a. Because [:ascii:] is always ASCII-restricted regardless of the charset modifier, it is always optimized. These nodes should execute somewhat faster than a generic ANYOF node and are significantly smaller, taking 2 bytes instead of 12. The flags field of the node structure holds an enum indicating which of the 15 POSIX classes is being matched.
Diffstat (limited to 'regexec.c')
-rw-r--r--regexec.c52
1 files changed, 52 insertions, 0 deletions
diff --git a/regexec.c b/regexec.c
index af64a69f31..dca278b406 100644
--- a/regexec.c
+++ b/regexec.c
@@ -1814,6 +1814,20 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
!is_HORIZWS_latin1(s)
);
break;
+ case POSIXA:
+ /* Don't need to worry about utf8, as it can match only a single
+ * byte invariant character. The flag in this node type is the
+ * class number to pass to _generic_isCC() to build a mask for
+ * searching in PL_charclass[] */
+ REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c)));
+ break;
+ case NPOSIXA:
+ REXEC_FBC_CSCAN(
+ !_generic_isCC_A(*s, FLAGS(c)),
+ !_generic_isCC_A(*s, FLAGS(c))
+ );
+ break;
+
case AHOCORASICKC:
case AHOCORASICK:
{
@@ -3881,6 +3895,26 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
DIGITA, NDIGITA, isDIGIT_A,
digit, "0");
+ case POSIXA:
+ if (locinput >= PL_regeol || ! _generic_isCC_A(nextchr, FLAGS(scan))) {
+ sayNO;
+ }
+ /* Matched a utf8-invariant, so don't have to worry about utf8 */
+ nextchr = UCHARAT(++locinput);
+ break;
+ case NPOSIXA:
+ if (locinput >= PL_regeol || _generic_isCC_A(nextchr, FLAGS(scan))) {
+ sayNO;
+ }
+ if (utf8_target) {
+ locinput += PL_utf8skip[nextchr];
+ nextchr = UCHARAT(locinput);
+ }
+ else {
+ nextchr = UCHARAT(++locinput);
+ }
+ break;
+
case CLUMP: /* Match \X: logical Unicode character. This is defined as
a Unicode extended Grapheme Cluster */
/* From http://www.unicode.org/reports/tr29 (5.2 version). An
@@ -6298,6 +6332,24 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
scan++;
}
break;
+
+ case POSIXA:
+ while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
+ scan++;
+ }
+ break;
+ case NPOSIXA:
+ if (utf8_target) {
+ while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
+ scan += UTF8SKIP(scan);
+ }
+ }
+ else {
+ while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
+ scan++;
+ }
+ }
+ break;
case NALNUMA:
if (utf8_target) {
while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {