6 files changed, 208 insertions, 12 deletions
diff --git a/embedvar.h b/embedvar.h
index 01f3db139f..0a3c7fa2d9 100644
--- a/embedvar.h
+++ b/embedvar.h
@@ -67,6 +67,7 @@
 #define PL_Mem			(vTHX->IMem)
 #define PL_MemParse		(vTHX->IMemParse)
 #define PL_MemShared		(vTHX->IMemShared)
+#define PL_NonL1NonFinalFold	(vTHX->INonL1NonFinalFold)
 #define PL_PerlSpace		(vTHX->IPerlSpace)
 #define PL_PosixAlnum		(vTHX->IPosixAlnum)
 #define PL_PosixAlpha		(vTHX->IPosixAlpha)
diff --git a/handy.h b/handy.h
index 5e478cb540..db018d74cc 100644
--- a/handy.h
+++ b/handy.h
@@ -675,6 +675,8 @@ EXTCONST U32 PL_charclass[];
 #   define _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c) ((! cBOOL(FITS_IN_8_BITS(c))) || (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_mask(_CC_NONLATIN1_FOLD)))
 
 #   define _isQUOTEMETA(c) _generic_isCC(c, _CC_QUOTEMETA)
+#   define _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) \
+                                            _generic_isCC(c, _CC_NON_FINAL_FOLD)
 #else   /* No perl.h. */
 #   ifdef EBCDIC
 #       define isALNUMC_A(c)   (isASCII(c) && isALNUMC(c))
diff --git a/intrpvar.h b/intrpvar.h
index f2be894537..c27e33893f 100644
--- a/intrpvar.h
+++ b/intrpvar.h
@@ -611,6 +611,8 @@ PERLVAR(I, XPosixXDigit, SV *)
 
 PERLVAR(I, VertSpace,   SV *)
 
+PERLVAR(I, NonL1NonFinalFold,   SV *)
+
 /* utf8 character class swashes */
 PERLVAR(I, utf8_alnum,	SV *)
 PERLVAR(I, utf8_alpha,	SV *)
diff --git a/regcomp.c b/regcomp.c
index 488ebebf0a..0369055485 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -87,11 +87,10 @@ extern const struct regexp_engine my_reg_engine;
 #endif
 
 #include "dquote_static.c"
-#ifndef PERL_IN_XSUB_RE
-#  include "charclass_invlists.h"
-#endif
+#include "charclass_invlists.h"
 
 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
+#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 
 #ifdef op
 #undef op
@@ -10394,15 +10393,18 @@ tryagain:
 	    RExC_parse++;
 
 	defchar: {
-	    register STRLEN len;
+	    register STRLEN len = 0;
 	    register UV ender;
 	    register char *p;
 	    char *s;
 #define MAX_NODE_STRING_SIZE 127
-	    char foldbuf[MAX_NODE_STRING_SIZE];
+	    char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
+	    char *s0;
+	    U8 upper_parse = MAX_NODE_STRING_SIZE;
 	    STRLEN foldlen;
             U8 node_type;
             bool next_is_quantifier;
+            char * oldp;
 
 	    ender = 0;
             node_type = compute_EXACTish(pRExC_state);
@@ -10412,6 +10414,10 @@ tryagain:
              * actual node, as the node doesn't exist yet */
 	    s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
 
+            s0 = s;
+
+	reparse:
+
 	    /* XXX The node can hold up to 255 bytes, yet this only goes to
              * 127.  I (khw) do not know why.  Keeping it somewhat less than
              * 255 allows us to not have to worry about overflow due to
@@ -10430,11 +10436,11 @@ tryagain:
              * could back off to end with only a code point that isn't such a
              * non-final, but it is possible for there not to be any in the
              * entire node. */
-	    for (len = 0, p = RExC_parse - 1;
-	         len < MAX_NODE_STRING_SIZE && p < RExC_end;
+	    for (p = RExC_parse - 1;
+	         len < upper_parse && p < RExC_end;
 	         len++)
 	    {
-		char * const oldp = p;
+		oldp = p;
 
 		if (RExC_flags & RXf_PMf_EXTENDED)
 		    p = regwhite( pRExC_state, p );
@@ -10633,6 +10639,10 @@ tryagain:
 		    break;
 		} /* End of switch on the literal */
 
+		/* Here, have looked at the literal character and <ender>
+		 * contains its ordinal, <p> points to the character after it
+		 */
+
 		if ( RExC_flags & RXf_PMf_EXTENDED)
 		    p = regwhite( pRExC_state, p );
 
@@ -10693,7 +10703,7 @@ tryagain:
 			len += foldlen - 1;
                     }
                     else {
-                        REGC((char)ender, s++);
+                        *(s++) = ender;
                     }
 		}
 		else if (UTF) {
@@ -10722,6 +10732,164 @@ tryagain:
 
 	    } /* End of loop through literal characters */
 
+            /* Here we have either exhausted the input or ran out of room in
+             * the node.  (If we encountered a character that can't be in the
+             * node, transfer is made directly to <loopdone>, and so we
+             * wouldn't have fallen off the end of the loop.)  In the latter
+             * case, we artificially have to split the node into two, because
+             * we just don't have enough space to hold everything.  This
+             * creates a problem if the final character participates in a
+             * multi-character fold in the non-final position, as a match that
+             * should have occurred won't, due to the way nodes are matched,
+             * and our artificial boundary.  So back off until we find a non-
+             * problematic character -- one that isn't at the beginning or
+             * middle of such a fold.  (Either it doesn't participate in any
+             * folds, or appears only in the final position of all the folds it
+             * does participate in.)  A better solution with far fewer false
+             * positives, and that would fill the nodes more completely, would
+             * be to actually have available all the multi-character folds to
+             * test against, and to back-off only far enough to be sure that
+             * this node isn't ending with a partial one.  <upper_parse> is set
+             * further below (if we need to reparse the node) to include just
+             * up through that final non-problematic character that this code
+             * identifies, so when it is set to less than the full node, we can
+             * skip the rest of this */
+            if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
+
+                const STRLEN full_len = len;
+
+		assert(len >= MAX_NODE_STRING_SIZE);
+
+                /* Here, <s> points to the final byte of the final character.
+                 * Look backwards through the string until find a non-
+                 * problematic character */
+
+		if (! UTF) {
+
+                    /* These two have no multi-char folds to non-UTF characters
+                     */
+                    if (ASCII_FOLD_RESTRICTED || LOC) {
+                        goto loopdone;
+                    }
+
+                    while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
+                    len = s - s0 + 1;
+		}
+                else {
+                    if (!  PL_NonL1NonFinalFold) {
+                        PL_NonL1NonFinalFold = _new_invlist_C_array(
+                                        NonL1_Perl_Non_Final_Folds_invlist);
+                    }
+
+                    /* Point to the first byte of the final character */
+                    s = (char *) utf8_hop((U8 *) s, -1);
+
+                    while (s >= s0) {   /* Search backwards until find
+                                           non-problematic char */
+                        if (UTF8_IS_INVARIANT(*s)) {
+
+                            /* There are no ascii characters that participate
+                             * in multi-char folds under /aa.  In EBCDIC, the
+                             * non-ascii invariants are all control characters,
+                             * so don't ever participate in any folds. */
+                            if (ASCII_FOLD_RESTRICTED
+                                || ! IS_NON_FINAL_FOLD(*s))
+                            {
+                                break;
+                            }
+                        }
+                        else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
+
+                            /* No Latin1 characters participate in multi-char
+                             * folds under /l */
+                            if (LOC
+                                || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI(
+                                                                *s, *(s+1))))
+                            {
+                                break;
+                            }
+                        }
+                        else if (! _invlist_contains_cp(
+                                        PL_NonL1NonFinalFold,
+                                        valid_utf8_to_uvchr((U8 *) s, NULL)))
+                        {
+                            break;
+                        }
+
+                        /* Here, the current character is problematic in that
+                         * it does occur in the non-final position of some
+                         * fold, so try the character before it, but have to
+                         * special case the very first byte in the string, so
+                         * we don't read outside the string */
+                        s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
+                    } /* End of loop backwards through the string */
+
+                    /* If there were only problematic characters in the string,
+                     * <s> will point to before s0, in which case the length
+                     * should be 0, otherwise include the length of the
+                     * non-problematic character just found */
+                    len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
+		}
+
+                /* Here, have found the final character, if any, that is
+                 * non-problematic as far as ending the node without splitting
+                 * it across a potential multi-char fold.  <len> contains the
+                 * number of bytes in the node up-to and including that
+                 * character, or is 0 if there is no such character, meaning
+                 * the whole node contains only problematic characters.  In
+                 * this case, give up and just take the node as-is.  We can't
+                 * do any better */
+                if (len == 0) {
+                    len = full_len;
+                } else {
+
+                    /* Here, the node does contain some characters that aren't
+                     * problematic.  If one such is the final character in the
+                     * node, we are done */
+                    if (len == full_len) {
+                        goto loopdone;
+                    }
+                    else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
+
+                        /* If the final character is problematic, but the
+                         * penultimate is not, back-off that last character to
+                         * later start a new node with it */
+                        p = oldp;
+                        goto loopdone;
+                    }
+
+                    /* Here, the final non-problematic character is earlier
+                     * in the input than the penultimate character.  What we do
+                     * is reparse from the beginning, going up only as far as
+                     * this final ok one, thus guaranteeing that the node ends
+                     * in an acceptable character.  The reason we reparse is
+                     * that we know how far in the character is, but we don't
+                     * know how to correlate its position with the input parse.
+                     * An alternate implementation would be to build that
+                     * correlation as we go along during the original parse,
+                     * but that would entail extra work for every node, whereas
+                     * this code gets executed only when the string is too
+                     * large for the node, and the final two characters are
+                     * problematic, an infrequent occurrence.  Yet another
+                     * possible strategy would be to save the tail of the
+                     * string, and the next time regatom is called, initialize
+                     * with that.  The problem with this is that unless you
+                     * back off one more character, you won't be guaranteed
+                     * regatom will get called again, unless regbranch,
+                     * regpiece ... are also changed.  If you do back off that
+                     * extra character, so that there is input guaranteed to
+                     * force calling regatom, you can't handle the case where
+                     * just the first character in the node is acceptable.  I
+                     * (khw) decided to try this method which doesn't have that
+                     * pitfall; if performance issues are found, we can do a
+                     * combination of the current approach plus that one */
+                    upper_parse = len;
+                    len = 0;
+                    s = s0;
+                    goto reparse;
+                }
+	    }   /* End of verifying node ends with an appropriate char */
+
 	loopdone:   /* Jumped to when encounters something that shouldn't be in
 		       the node */
 	    RExC_parse = p - 1;
diff --git a/sv.c b/sv.c
index 4ab14e362d..4ad53cdf80 100644
--- a/sv.c
+++ b/sv.c
@@ -13370,6 +13370,8 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags,
 
     PL_VertSpace	= sv_dup_inc(proto_perl->IVertSpace, param);
 
+    PL_NonL1NonFinalFold = sv_dup_inc(proto_perl->INonL1NonFinalFold, param);
+
     /* utf8 character class swashes */
     PL_utf8_alnum	= sv_dup_inc(proto_perl->Iutf8_alnum, param);
     PL_utf8_alpha	= sv_dup_inc(proto_perl->Iutf8_alpha, param);
diff --git a/t/re/pat_advanced.t b/t/re/pat_advanced.t
index 526b9e2000..a3540fd185 100644
--- a/t/re/pat_advanced.t
+++ b/t/re/pat_advanced.t
@@ -873,16 +873,16 @@ sub run_tests {
     }
 
     {
-        for (120 .. 130) {
+        for (120 .. 130, 240 .. 260) {
             my $head = 'x' x $_;
             my $message = q [Don't misparse \x{...} in regexp ] .
-                             q [near 127 char EXACT limit];
+                             q [near EXACT char count limit];
             for my $tail ('\x{0061}', '\x{1234}', '\x61') {
                 eval qq{like("$head$tail", qr/$head$tail/, \$message)};
 		is($@, '', $message);
             }
             $message = q [Don't misparse \N{...} in regexp ] .
-                             q [near 127 char EXACT limit];
+                             q [near EXACT char count limit];
             for my $tail ('\N{SNOWFLAKE}') {
                 eval qq {use charnames ':full';
                          like("$head$tail", qr/$head$tail/, \$message)};
@@ -2064,6 +2064,27 @@ EOP
     ok "x" =~ /\A(?>(?:(?:)A|B|C?x))\z/,
         "Check TRIE does not overwrite EXACT following NOTHING at start - RT #111842";
 
+    {
+        my $single = ":";
+        my $upper = "\x{390}";  # Fold is 3 chars.
+        my $multi = CORE::fc($upper);
+
+        my $failed = 0;
+
+        # Try forcing a node to be split, with a multi-char fold at the
+        # boundary
+        for my $repeat (1 .. 300) {
+            my $string = $single x $repeat;
+            my $lhs = $string . $upper;
+            if ($lhs !~ m/$string$multi/i) {
+                $failed = $repeat;
+                last;
+            }
+        }
+        ok(! $failed, "Matched multi-char fold across EXACTFish node boundaries; if failed, was at count $failed");
+
+    }
+
     #
     # Keep the following tests last -- they may crash perl
     #