diff options
author | Karl Williamson <khw@cpan.org> | 2020-11-13 09:38:21 -0700 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2020-12-19 21:36:46 -0700 |
commit | bb3825626ed2b1217a2ac184eff66d0d4ed6e070 (patch) | |
tree | 1c042b7d1931da43845c4dca7fe5ac62ad78417f /regexp.h | |
parent | 954dc197ae9570855eb54ab9467b24c2f1b95eba (diff) | |
download | perl-bb3825626ed2b1217a2ac184eff66d0d4ed6e070.tar.gz |
regexec.c: Revamp S_setup_EXACTISH_ST() loop end conditions
Consider the pattern /A*B/ where A and B are arbitrary. The
pattern-matching code tries to make a tight loop to match the span of A's.
The logic for this was not really updated when UTF-8 was added. I
revamped it some releases ago to fix some bugs and to at least consider
UTF-8.
This commit changes it so that Unicode is now a first-class citizen.
Some details are listed in the ticket GH #18414.
Diffstat (limited to 'regexp.h')
-rw-r--r-- | regexp.h | 34 |
1 files changed, 28 insertions, 6 deletions
@@ -706,6 +706,32 @@ typedef struct { # define MAX_RECURSE_EVAL_NOCHANGE_DEPTH 10 #endif +/* The +3 is based on the current Unicode standards needs, and is unlikely to + * change. An assertion should fail in regexec.c if it is too low. It is + * needed for certain edge cases involving multi-character folds when the first + * component also participates in a fold individually. */ +#define MAX_MATCHES (MAX_FOLD_FROMS + 3) + +struct next_matchable_info { + U8 first_byte_mask; + U8 first_byte_anded; + U32 mask32; + U32 anded32; + PERL_INT_FAST8_T count; /* Negative means not initialized */ + PERL_UINT_FAST8_T min_length; + PERL_UINT_FAST8_T max_length; + PERL_UINT_FAST8_T initial_definitive; + PERL_UINT_FAST8_T initial_exact; + PERL_UINT_FAST8_T lengths[MAX_MATCHES]; + + /* The size is from trial and error, and could change with new Unicode + * standards, in which case there is an assertion that should start + * failing. This size could be calculated in one of the regen scripts + * dealing with Unicode, but khw thinks the likelihood of it changing is + * low enough that it isn't worth the effort. 
*/ + U8 matches[18]; +}; + typedef I32 CHECKPOINT; typedef struct regmatch_state { @@ -854,7 +880,6 @@ typedef struct regmatch_state { struct { /* this first element must match u.yes */ struct regmatch_state *prev_yes_state; - int c1, c2; /* case fold search */ CHECKPOINT cp; U32 lastparen; U32 lastcloseparen; @@ -863,8 +888,7 @@ typedef struct regmatch_state { bool minmod; regnode *A, *B; /* the nodes corresponding to /A*B/ */ regnode *me; /* the curlym node */ - U8 c1_utf8[UTF8_MAXBYTES+1]; /* */ - U8 c2_utf8[UTF8_MAXBYTES+1]; + struct next_matchable_info Binfo; } curlym; struct { @@ -872,14 +896,12 @@ typedef struct regmatch_state { CHECKPOINT cp; U32 lastparen; U32 lastcloseparen; - int c1, c2; /* case fold search */ char *maxpos; /* highest possible point in string to match */ char *oldloc; /* the previous locinput */ int count; int min, max; /* {m,n} */ regnode *A, *B; /* the nodes corresponding to /A*B/ */ - U8 c1_utf8[UTF8_MAXBYTES+1]; /* */ - U8 c2_utf8[UTF8_MAXBYTES+1]; + struct next_matchable_info Binfo; } curly; /* and CURLYN/PLUS/STAR */ } u; |