regexec.c: Revamp S_setup_EXACTISH_ST() loop end conditions

Consider the pattern /A*B/ where A and B are arbitrary. The pattern matching code tries to make a tight loop to match the span of A's. The logic of this was not really updated when UTF-8 was added. I did revamp it some releases ago to fix some bugs and to at least consider UTF-8. This commit changes it so that Unicode is now a first-class citizen. Some details are listed in the ticket GH #18414.
author: Karl Williamson <khw@cpan.org> 2020-11-13 09:38:21 -0700
committer: Karl Williamson <khw@cpan.org> 2020-12-19 21:36:46 -0700
commit: bb3825626ed2b1217a2ac184eff66d0d4ed6e070 (patch)
tree: 1c042b7d1931da43845c4dca7fe5ac62ad78417f /regexp.h
parent: 954dc197ae9570855eb54ab9467b24c2f1b95eba (diff)
download: perl-bb3825626ed2b1217a2ac184eff66d0d4ed6e070.tar.gz
1 file changed, 28 insertions, 6 deletions
diff --git a/regexp.h b/regexp.h
index d9f1a40909..cfb8d443ce 100644
--- a/regexp.h
+++ b/regexp.h
@@ -706,6 +706,32 @@ typedef struct {
 #  define MAX_RECURSE_EVAL_NOCHANGE_DEPTH 10
 #endif
 
+/* The +3 is based on the current Unicode standards needs, and is unlikely to
+ * change.  An assertion should fail in regexec.c if it is too low.  It is
+ * needed for certain edge cases involving multi-character folds when the first
+ * component also participates in a fold individually. */
+#define MAX_MATCHES (MAX_FOLD_FROMS + 3)
+
+struct next_matchable_info {
+    U8     first_byte_mask;
+    U8     first_byte_anded;
+    U32    mask32;
+    U32    anded32;
+    PERL_INT_FAST8_T count; /* Negative means not initialized */
+    PERL_UINT_FAST8_T min_length;
+    PERL_UINT_FAST8_T max_length;
+    PERL_UINT_FAST8_T initial_definitive;
+    PERL_UINT_FAST8_T initial_exact;
+    PERL_UINT_FAST8_T lengths[MAX_MATCHES];
+
+    /* The size is from trial and error, and could change with new Unicode
+     * standards, in which case there is an assertion that should start
+     * failing.  This size could be calculated in one of the regen scripts
+     * dealing with Unicode, but khw thinks the likelihood of it changing is
+     * low enough that it isn't worth the effort. */
+    U8 matches[18];
+};
+
 typedef I32 CHECKPOINT;
 
 typedef struct regmatch_state {
@@ -854,7 +880,6 @@ typedef struct regmatch_state {
 	struct {
 	    /* this first element must match u.yes */
 	    struct regmatch_state *prev_yes_state;
-	    int c1, c2;		/* case fold search */
 	    CHECKPOINT cp;
 	    U32 lastparen;
 	    U32 lastcloseparen;
@@ -863,8 +888,7 @@ typedef struct regmatch_state {
 	    bool minmod;
 	    regnode *A, *B;	/* the nodes corresponding to /A*B/  */
 	    regnode *me;	/* the curlym node */
-            U8 c1_utf8[UTF8_MAXBYTES+1];  /* */
-            U8 c2_utf8[UTF8_MAXBYTES+1];
+            struct next_matchable_info Binfo;
 	} curlym;
 
 	struct {
@@ -872,14 +896,12 @@ typedef struct regmatch_state {
 	    CHECKPOINT cp;
 	    U32 lastparen;
 	    U32 lastcloseparen;
-	    int c1, c2;		/* case fold search */
 	    char *maxpos;	/* highest possible point in string to match */
 	    char *oldloc;	/* the previous locinput */
 	    int count;
 	    int min, max;	/* {m,n} */
 	    regnode *A, *B;	/* the nodes corresponding to /A*B/  */
-            U8 c1_utf8[UTF8_MAXBYTES+1];  /* */
-            U8 c2_utf8[UTF8_MAXBYTES+1];
+            struct next_matchable_info Binfo;
 	} curly; /* and CURLYN/PLUS/STAR */
 
     } u;
author	Karl Williamson <khw@cpan.org>	2020-11-13 09:38:21 -0700
committer	Karl Williamson <khw@cpan.org>	2020-12-19 21:36:46 -0700
commit	bb3825626ed2b1217a2ac184eff66d0d4ed6e070 (patch)
tree	1c042b7d1931da43845c4dca7fe5ac62ad78417f /regexp.h
parent	954dc197ae9570855eb54ab9467b24c2f1b95eba (diff)
download	perl-bb3825626ed2b1217a2ac184eff66d0d4ed6e070.tar.gz