summaryrefslogtreecommitdiff
path: root/regexp.h
diff options
context:
space:
mode:
authorKarl Williamson <public@khwilliamson.com>2012-10-16 10:17:01 -0600
committerKarl Williamson <public@khwilliamson.com>2012-10-16 21:48:37 -0600
commit79a2a0e89816b80870df1f9b9e7bb5fb1edcd556 (patch)
treef530af448db6076a9fc00479d2d4a3bb64427eee /regexp.h
parent57f0e7e230d864f5b78d28bb89545ef671c101a0 (diff)
downloadperl-79a2a0e89816b80870df1f9b9e7bb5fb1edcd556.tar.gz
regexec: Do less work on quantified UTF-8
Consider the regexes /A*B/ and /A*?B/ where A and B are arbitrary, except that B begins with an EXACTish node. Prior to this patch, as a shortcut, the loop for accumulating A* would look for the first character of B to help it decide if B is a possibility for the next thing. It did not test for all of B unless testing showed that the next thing could be the beginning of B. If the target string was UTF-8, it converted each new sequence of bytes to the code point they represented, and then did the comparison. This is a relatively expensive process. This commit avoids that conversion by just doing a memEQ at the current input position. To do this, it revamps S_setup_EXACTISH_ST_c1_c2() to output the UTF-8 sequences to compare against. The function also has been tightened up so that there are fewer false positives.
Diffstat (limited to 'regexp.h')
-rw-r--r--regexp.h10
1 files changed, 8 insertions, 2 deletions
diff --git a/regexp.h b/regexp.h
index 0e3517d5f1..e1d5906c22 100644
--- a/regexp.h
+++ b/regexp.h
@@ -18,6 +18,8 @@
/* we don't want to include this stuff if we are inside of
an external regex engine based on the core one - like re 'debug'*/
+#include "utf8.h"
+
struct regnode {
U8 flags;
U8 type;
@@ -740,7 +742,7 @@ typedef struct regmatch_state {
struct {
/* this first element must match u.yes */
struct regmatch_state *prev_yes_state;
- I32 c1, c2; /* case fold search */
+ int c1, c2; /* case fold search */
CHECKPOINT cp;
U32 lastparen;
U32 lastcloseparen;
@@ -749,6 +751,8 @@ typedef struct regmatch_state {
bool minmod;
regnode *A, *B; /* the nodes corresponding to /A*B/ */
regnode *me; /* the curlym node */
+ U8 c1_utf8[UTF8_MAXBYTES+1]; /* */
+ U8 c2_utf8[UTF8_MAXBYTES+1];
} curlym;
struct {
@@ -756,12 +760,14 @@ typedef struct regmatch_state {
CHECKPOINT cp;
U32 lastparen;
U32 lastcloseparen;
- I32 c1, c2; /* case fold search */
+ int c1, c2; /* case fold search */
char *maxpos; /* highest possible point in string to match */
char *oldloc; /* the previous locinput */
int count;
int min, max; /* {m,n} */
regnode *A, *B; /* the nodes corresponding to /A*B/ */
+ U8 c1_utf8[UTF8_MAXBYTES+1]; /* */
+ U8 c2_utf8[UTF8_MAXBYTES+1];
} curly; /* and CURLYN/PLUS/STAR */
} u;