summary | refs | log | tree | commit | diff
path: root/regexp.h
diff options
context:
space:
mode:
author: Karl Williamson <khw@cpan.org> 2020-11-13 09:38:21 -0700
committer: Karl Williamson <khw@cpan.org> 2020-12-19 21:36:46 -0700
commit: bb3825626ed2b1217a2ac184eff66d0d4ed6e070 (patch)
tree: 1c042b7d1931da43845c4dca7fe5ac62ad78417f /regexp.h
parent: 954dc197ae9570855eb54ab9467b24c2f1b95eba (diff)
download: perl-bb3825626ed2b1217a2ac184eff66d0d4ed6e070.tar.gz
regexec.c: Revamp S_setup_EXACTISH_ST() loop end conditions
Consider the pattern /A*B/ where A and B are arbitrary. The pattern matching code tries to make a tight loop to match the span of A's. The logic of this was not really updated when UTF-8 was added. I did revamp it some releases ago to fix some bugs and to at least consider UTF-8. This commit changes it so that Unicode is now a first-class citizen. Some details are listed in the ticket GH #18414.
Diffstat (limited to 'regexp.h')
-rw-r--r-- regexp.h 34
1 files changed, 28 insertions, 6 deletions
diff --git a/regexp.h b/regexp.h
index d9f1a40909..cfb8d443ce 100644
--- a/regexp.h
+++ b/regexp.h
@@ -706,6 +706,32 @@ typedef struct {
# define MAX_RECURSE_EVAL_NOCHANGE_DEPTH 10
#endif
+/* The +3 is based on the needs of the current Unicode Standard, and is
+ * unlikely to change. An assertion should fail in regexec.c if it is too low.
+ * It is needed for certain edge cases involving multi-character folds when the
+ * first component also participates in a fold individually. */
+#define MAX_MATCHES (MAX_FOLD_FROMS + 3)
+
+struct next_matchable_info { /* Embedded as the 'Binfo' member of the curlym and curly match states */
+ U8 first_byte_mask; /* NOTE(review): presumably paired with first_byte_anded as a mask-and-compare test on one byte -- confirm in regexec.c */
+ U8 first_byte_anded;
+ U32 mask32; /* NOTE(review): presumably the 32-bit counterpart of the byte mask/anded pair -- confirm in regexec.c */
+ U32 anded32;
+ PERL_INT_FAST8_T count; /* Negative means not initialized */
+ PERL_UINT_FAST8_T min_length; /* NOTE(review): presumably the smallest value in lengths[] -- confirm */
+ PERL_UINT_FAST8_T max_length; /* NOTE(review): presumably the largest value in lengths[] -- confirm */
+ PERL_UINT_FAST8_T initial_definitive; /* NOTE(review): meaning not evident from this header; see regexec.c */
+ PERL_UINT_FAST8_T initial_exact; /* NOTE(review): meaning not evident from this header; see regexec.c */
+ PERL_UINT_FAST8_T lengths[MAX_MATCHES]; /* Capacity is MAX_MATCHES, i.e. MAX_FOLD_FROMS + 3 */
+
+ /* The size is from trial and error, and could change with new Unicode
+ * standards, in which case there is an assertion that should start
+ * failing. This size could be calculated in one of the regen scripts
+ * dealing with Unicode, but khw thinks the likelihood of it changing is
+ * low enough that it isn't worth the effort. */
+ U8 matches[18]; /* Fixed capacity of 18 bytes; see the sizing note above */
+};
+
typedef I32 CHECKPOINT;
typedef struct regmatch_state {
@@ -854,7 +880,6 @@ typedef struct regmatch_state {
struct {
/* this first element must match u.yes */
struct regmatch_state *prev_yes_state;
- int c1, c2; /* case fold search */
CHECKPOINT cp;
U32 lastparen;
U32 lastcloseparen;
@@ -863,8 +888,7 @@ typedef struct regmatch_state {
bool minmod;
regnode *A, *B; /* the nodes corresponding to /A*B/ */
regnode *me; /* the curlym node */
- U8 c1_utf8[UTF8_MAXBYTES+1]; /* */
- U8 c2_utf8[UTF8_MAXBYTES+1];
+ struct next_matchable_info Binfo;
} curlym;
struct {
@@ -872,14 +896,12 @@ typedef struct regmatch_state {
CHECKPOINT cp;
U32 lastparen;
U32 lastcloseparen;
- int c1, c2; /* case fold search */
char *maxpos; /* highest possible point in string to match */
char *oldloc; /* the previous locinput */
int count;
int min, max; /* {m,n} */
regnode *A, *B; /* the nodes corresponding to /A*B/ */
- U8 c1_utf8[UTF8_MAXBYTES+1]; /* */
- U8 c2_utf8[UTF8_MAXBYTES+1];
+ struct next_matchable_info Binfo;
} curly; /* and CURLYN/PLUS/STAR */
} u;