diff options
author | Karl Williamson <public@khwilliamson.com> | 2012-10-16 10:17:01 -0600 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2012-10-16 21:48:37 -0600 |
commit | 79a2a0e89816b80870df1f9b9e7bb5fb1edcd556 (patch) | |
tree | f530af448db6076a9fc00479d2d4a3bb64427eee /regexp.h | |
parent | 57f0e7e230d864f5b78d28bb89545ef671c101a0 (diff) | |
download | perl-79a2a0e89816b80870df1f9b9e7bb5fb1edcd556.tar.gz |
regexec: Do less work on quantified UTF-8
Consider the regexes /A*B/ and /A*?B/ where A and B are arbitrary,
except that B begins with an EXACTish node. Prior to this patch, as a
shortcut, the loop for accumulating A* would look for the first character
of B to help it decide if B is a possibility for the next thing. It did
not test for all of B unless testing showed that the next thing could be
the beginning of B. If the target string was UTF-8, it converted each
new sequence of bytes to the code point they represented, and then did
the comparison. This is a relatively expensive process.
This commit avoids that conversion by just doing a memEQ at the current
input position. To do this, it revamps S_setup_EXACTISH_ST_c1_c2() to
output the UTF-8 sequences to compare against. The function also has
been tightened up so that there are fewer false positives.
Diffstat (limited to 'regexp.h')
-rw-r--r-- | regexp.h | 10 |
1 files changed, 8 insertions, 2 deletions
@@ -18,6 +18,8 @@ /* we don't want to include this stuff if we are inside of an external regex engine based on the core one - like re 'debug'*/ +#include "utf8.h" + struct regnode { U8 flags; U8 type; @@ -740,7 +742,7 @@ typedef struct regmatch_state { struct { /* this first element must match u.yes */ struct regmatch_state *prev_yes_state; - I32 c1, c2; /* case fold search */ + int c1, c2; /* case fold search */ CHECKPOINT cp; U32 lastparen; U32 lastcloseparen; @@ -749,6 +751,8 @@ typedef struct regmatch_state { bool minmod; regnode *A, *B; /* the nodes corresponding to /A*B/ */ regnode *me; /* the curlym node */ + U8 c1_utf8[UTF8_MAXBYTES+1]; /* */ + U8 c2_utf8[UTF8_MAXBYTES+1]; } curlym; struct { @@ -756,12 +760,14 @@ typedef struct regmatch_state { CHECKPOINT cp; U32 lastparen; U32 lastcloseparen; - I32 c1, c2; /* case fold search */ + int c1, c2; /* case fold search */ char *maxpos; /* highest possible point in string to match */ char *oldloc; /* the previous locinput */ int count; int min, max; /* {m,n} */ regnode *A, *B; /* the nodes corresponding to /A*B/ */ + U8 c1_utf8[UTF8_MAXBYTES+1]; /* */ + U8 c2_utf8[UTF8_MAXBYTES+1]; } curly; /* and CURLYN/PLUS/STAR */ } u; |