regexec: Do less work on quantified UTF-8

Consider the regexes /A*B/ and /A*?B/ where A and B are arbitrary, except that B begins with an EXACTish node. Prior to this patch, as a shortcut, the loop for accumulating A* would look for the first character of B to help it decide if B is a possiblity for the next thing. It did not test for all of B unless testing showed that the next thing could be the beginning of B. If the target string was UTF-8, it converted each new sequence of bytes to the code point they represented, and then did the comparision. This is a relative expensive process. This commit avoids that conversion by just doing a memEQ at the current input position. To do this, it revamps S_setup_EXACTISH_ST_c1_c2() to output the UTF-8 sequences to compare against. The function also has been tightened up so that there are fewer false positives.
author: Karl Williamson <public@khwilliamson.com> 2012-10-16 10:17:01 -0600
committer: Karl Williamson <public@khwilliamson.com> 2012-10-16 21:48:37 -0600
commit: 79a2a0e89816b80870df1f9b9e7bb5fb1edcd556 (patch)
tree: f530af448db6076a9fc00479d2d4a3bb64427eee
parent: 57f0e7e230d864f5b78d28bb89545ef671c101a0 (diff)
download: perl-79a2a0e89816b80870df1f9b9e7bb5fb1edcd556.tar.gz
3 files changed, 272 insertions, 171 deletions
diff --git a/regexec.c b/regexec.c
index 00ab29f6e1..73160b76c3 100644
--- a/regexec.c
+++ b/regexec.c
@@ -2936,6 +2936,8 @@ S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
 
 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
+#define CHRTEST_NOT_A_CP_1 -999
+#define CHRTEST_NOT_A_CP_2 -998
 
 #define SLAB_FIRST(s) (&(s)->states[0])
 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
@@ -3273,175 +3275,252 @@ S_clear_backtrack_stack(pTHX_ void *p)
     }
 }
 static bool
-S_setup_EXACTISH_ST_c1_c2(pTHX_ regnode *text_node, I32 *c1, I32 *c2)
+S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, U8* c1_utf8, int *c2p, U8* c2_utf8)
 {
-    /* This sets up a relatively quick check for the initial part of what must
-     * match after a CURLY-type operation condition (the "B" in A*B), where B
-     * starts with an EXACTish node, <text_node>.  If this check is not met,
-     * the caller knows that it should continue with the loop.  If the check is
-     * met, the caller must see if all of B is met, before making the decision.
+    /* This function determines if there are one or two characters that match
+     * the first character of the passed-in EXACTish node <text_node>, and if
+     * so, returns them in the passed-in pointers.
      *
-     * This function sets *<c1> and *<c2> to be the first code point of B.  If
-     * there are two possible such code points (as when the text_node is
-     * folded), *<c2> is set to the second.  If there are more than two (which
-     * happens for some folds), or there is some other complication, these
-     * parameters are set to CHRTEST_VOID, to indicate not to do a quick check:
-     * just try all of B after every time through the loop.
+     * If it determines that no possible character in the target string can
+     * match, it returns FALSE; otherwise TRUE.  (The FALSE situation occurs if
+     * the first character in <text_node> requires UTF-8 to represent, and the
+     * target string isn't in UTF-8.)
      *
-     * If the routine determines that there is no possible way for there to be
-     * a match, it returns FALSE.
-     * */
+     * If there are more than two characters that could match the beginning of
+     * <text_node>, or if more context is required to determine a match or not,
+     * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
+     *
+     * The motiviation behind this function is to allow the caller to set up
+     * tight loops for matching.  If <text_node> is of type EXACT, there is
+     * only one possible character that can match its first character, and so
+     * the situation is quite simple.  But things get much more complicated if
+     * folding is involved.  It may be that the first character of an EXACTFish
+     * node doesn't participate in any possible fold, e.g., punctuation, so it
+     * can be matched only by itself.  The vast majority of characters that are
+     * in folds match just two things, their lower and upper-case equivalents.
+     * But not all are like that; some have multiple possible matches, or match
+     * sequences of more than one character.  This function sorts all that out.
+     *
+     * Consider the patterns A*B or A*?B where A and B are arbitrary.  In a
+     * loop of trying to match A*, we know we can't exit where the thing
+     * following it isn't a B.  And something can't be a B unless it is the
+     * beginning of B.  By putting a quick test for that beginning in a tight
+     * loop, we can rule out things that can't possibly be B without having to
+     * break out of the loop, thus avoiding work.  Similarly, if A is a single
+     * character, we can make a tight loop matching A*, using the outputs of
+     * this function.
+     *
+     * If the target string to match isn't in UTF-8, and there aren't
+     * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
+     * the one or two possible octets (which are characters in this situation)
+     * that can match.  In all cases, if there is only one character that can
+     * match, *<c1p> and *<c2p> will be identical.
+     *
+     * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
+     * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
+     * can match the beginning of <text_node>.  They should be declared with at
+     * least length UTF8_MAXBYTES+1.  (If the target string isn't in UTF-8, it is
+     * undefined what these contain.)  If one or both of the buffers are
+     * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
+     * corresponding invariant.  If variant, the corresponding *<c1p> and/or
+     * *<c2p> will be set to a negative number(s) that shouldn't match any code
+     * point (unless inappropriately coerced to unsigned).   *<c1p> will equal
+     * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
 
     const bool utf8_target = PL_reg_match_utf8;
-    const U32 uniflags = UTF8_ALLOW_DEFAULT;
+
+    UV c1, c2;
+    bool use_chrtest_void = FALSE;
+
+    /* Used when we have both utf8 input and utf8 output, to avoid converting
+     * to/from code points */
+    bool utf8_has_been_setup = FALSE;
+
     dVAR;
 
-    /* First byte from the EXACTish node */
     U8 *pat = (U8*)STRING(text_node);
 
-    if (! UTF_PATTERN) {    /* Not UTF-8: the code point is the byte */
-        *c1 = *pat;
-        if (OP(text_node) == EXACT) {
-            *c2 = *c1;
+    if (OP(text_node) == EXACT) {
+
+        /* In an exact node, only one thing can be matched, that first
+         * character.  If both the pat and the target are UTF-8, we can just
+         * copy the input to the output, avoiding finding the code point of
+         * that character */
+        if (! UTF_PATTERN) {
+            c2 = c1 = *pat;
+        }
+        else if (utf8_target) {
+            Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
+            Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
+            utf8_has_been_setup = TRUE;
+        }
+        else {
+            c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
+        }
+    }
+    else /* an EXACTFish node */
+         if ((UTF_PATTERN
+                    && is_MULTI_CHAR_FOLD_utf8_safe(pat,
+                                                    pat + STR_LEN(text_node)))
+             || (! UTF_PATTERN
+                    && is_MULTI_CHAR_FOLD_latin1_safe(pat,
+                                                    pat + STR_LEN(text_node))))
+    {
+        /* Multi-character folds require more context to sort out.  Also
+         * PL_utf8_foldclosures used below doesn't handle them, so have to be
+         * handled outside this routine */
+        use_chrtest_void = TRUE;
+    }
+    else { /* an EXACTFish node which doesn't begin with a multi-char fold */
+        c1 = (UTF_PATTERN) ? valid_utf8_to_uvchr(pat, NULL) : *pat;
+        if (c1 > 256) {
+            /* Load the folds hash, if not already done */
+            SV** listp;
+            if (! PL_utf8_foldclosures) {
+                if (! PL_utf8_tofold) {
+                    U8 dummy[UTF8_MAXBYTES+1];
+
+                    /* Force loading this by folding an above-Latin1 char */
+                    to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
+                    assert(PL_utf8_tofold); /* Verify that worked */
+                }
+                PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
+            }
+
+            /* The fold closures data structure is a hash with the keys being
+             * the UTF-8 of every character that is folded to, like 'k', and
+             * the values each an array of all code points that fold to its
+             * key.  e.g. [ 'k', 'K', KELVIN_SIGN ].  Multi-character folds are
+             * not included */
+            if ((! (listp = hv_fetch(PL_utf8_foldclosures,
+                                     (char *) pat,
+                                     UTF8SKIP(pat),
+                                     FALSE))))
+            {
+                /* Not found in the hash, therefore there are no folds
+                 * containing it, so there is only a single character that
+                 * could match */
+                c2 = c1;
+            }
+            else {  /* Does participate in folds */
+                AV* list = (AV*) *listp;
+                if (av_len(list) != 1) {
+
+                    /* If there aren't exactly two folds to this, it is outside
+                     * the scope of this function */
+                    use_chrtest_void = TRUE;
+                }
+                else {  /* There are two.  Get them */
+                    SV** c_p = av_fetch(list, 0, FALSE);
+                    if (c_p == NULL) {
+                        Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
+                    }
+                    c1 = SvUV(*c_p);
+
+                    c_p = av_fetch(list, 1, FALSE);
+                    if (c_p == NULL) {
+                        Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
+                    }
+                    c2 = SvUV(*c_p);
+
+                    /* Folds that cross the 255/256 boundary are forbidden if
+                     * EXACTFL, or EXACTFA and one is ASCIII.  Since the
+                     * pattern character is above 256, and its only other match
+                     * is below 256, the only legal match will be to itself.
+                     * We have thrown away the original, so have to compute
+                     * which is the one above 255 */
+                    if ((c1 < 256) != (c2 < 256)) {
+                        if (OP(text_node) == EXACTFL
+                            || (OP(text_node) == EXACTFA
+                                && (isASCII(c1) || isASCII(c2))))
+                        {
+                            if (c1 < 256) {
+                                c1 = c2;
+                            }
+                            else {
+                                c2 = c1;
+                            }
+                        }
+                    }
+                }
+            }
         }
-        else if (utf8_target
-                 && HAS_NONLATIN1_FOLD_CLOSURE(*c1)
-                 && (OP(text_node) != EXACTFA || ! isASCII(*c1)))
+        else /* Here, c1 is < 255 */
+             if (utf8_target
+                 && HAS_NONLATIN1_FOLD_CLOSURE(c1)
+                 && OP(text_node) != EXACTFL
+                 && (OP(text_node) != EXACTFA || ! isASCII(c1)))
         {
             /* Here, there could be something above Latin1 in the target which
-             * folds to this character in the pattern, which means there are
-             * more than two possible beginnings of B. */
-            *c1 = *c2 = CHRTEST_VOID;
+             * folds to this character in the pattern.  All such cases except
+             * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
+             * involved in their folds, so are outside the scope of this
+             * function */
+            if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
+                c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
+            }
+            else {
+                use_chrtest_void = TRUE;
+            }
         }
         else { /* Here nothing above Latin1 can fold to the pattern character */
             switch (OP(text_node)) {
 
                 case EXACTFL:   /* /l rules */
-                    *c2 = PL_fold_locale[*c1];
-                    break;
-
-                case EXACTFU_SS: /* This requires special handling: Don't
-                                    shortcut */
-                    *c1 = *c2 = CHRTEST_VOID;
+                    c2 = PL_fold_locale[c1];
                     break;
 
                 case EXACTF:
                     if (! utf8_target) {    /* /d rules */
-                        *c2 = PL_fold[*c1];
+                        c2 = PL_fold[c1];
                         break;
                     }
                     /* FALLTHROUGH */
                     /* /u rules for all these.  This happens to work for
-                     * EXACTFA in the ASCII range as nothing in Latin1 folds to
-                     * ASCII */
+                     * EXACTFA as nothing in Latin1 folds to ASCII */
                 case EXACTFA:
                 case EXACTFU_TRICKYFOLD:
+                case EXACTFU_SS:
                 case EXACTFU:
-                    *c2 = PL_fold_latin1[*c1];
+                    c2 = PL_fold_latin1[c1];
                     break;
 
 		default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
             }
         }
     }
-    else { /* UTF_PATTERN */
-        if (OP(text_node) == EXACT) {
-            *c2 = *c1 = utf8n_to_uvchr(pat, UTF8_MAXBYTES, 0, uniflags);
-            if (*c1 < 0) {  /* Overflowed what we can handle */
-                *c1 = *c2 = CHRTEST_VOID;
-            }
-            else if (*c1 > 255 && ! utf8_target) {
-                return FALSE; /* Can't possibly match */
-            }
+
+    /* Here have figured things out.  Set up the returns */
+    if (use_chrtest_void) {
+        *c2p = *c1p = CHRTEST_VOID;
+    }
+    else if (utf8_target) {
+        if (! utf8_has_been_setup) {    /* Don't have the utf8; must get it */
+            uvchr_to_utf8(c1_utf8, c1);
+            uvchr_to_utf8(c2_utf8, c2);
         }
-        else {
-            if (UTF8_IS_ABOVE_LATIN1(*pat)) {
 
-                /* A multi-character fold is complicated, probably has more
-                 * than two possibilities */
-                if (is_MULTI_CHAR_FOLD_utf8_safe((char*) pat,
-                                        (char*) pat + STR_LEN(text_node)))
-                {
-                    *c1 = *c2 = CHRTEST_VOID;
-                }
-                else {  /* Not a multi-char fold */
-
-                    /* Load the folds hash, if not already done */
-                    SV** listp;
-                    if (! PL_utf8_foldclosures) {
-                        if (! PL_utf8_tofold) {
-                            U8 dummy[UTF8_MAXBYTES+1];
-                            STRLEN dummy_len;
-
-                            /* Force loading this by folding an above-Latin1
-                             * char */
-                            to_utf8_fold((U8*) HYPHEN_UTF8, dummy, &dummy_len);
-                            assert(PL_utf8_tofold); /* Verify that worked */
-                        }
-                        PL_utf8_foldclosures =
-                                         _swash_inversion_hash(PL_utf8_tofold);
-                    }
+        /* Invariants are stored in both the utf8 and byte outputs; Use
+         * negative numbers otherwise for the byte ones.  Make sure that the
+         * byte ones are the same iff the utf8 ones are the same */
+        *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
+        *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
+                ? *c2_utf8
+                : (c1 == c2)
+                  ? CHRTEST_NOT_A_CP_1
+                  : CHRTEST_NOT_A_CP_2;
+    }
+    else if (c1 > 255) {
+       if (c2 > 255) {  /* both possibilities are above what a non-utf8 string
+                           can represent */
+           return FALSE;
+       }
 
-                    /* The fold closures data structure is a hash with the keys
-                    * being every character that is folded to, like 'k', and
-                    * the values each an array of everything that folds to its
-                    * key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
-                    if ((! (listp = hv_fetch(PL_utf8_foldclosures,
-                                             (char *) pat,
-                                             UTF8SKIP(pat),
-                                             FALSE))))
-                    {
-                        /* Not found in the hash, therefore there are no folds
-                         * containing it, so there is only a single char
-                         * possible for beginning B */
-                        *c2 = *c1 = utf8n_to_uvchr(pat, STR_LEN(text_node),
-                                                                  0, uniflags);
-                        if (*c1 < 0) {  /* Overflowed what we can handle */
-                            *c1 = *c2 = CHRTEST_VOID;
-                        }
-                    }
-                    else {
-                        AV* list = (AV*) *listp;
-                        if (av_len(list) != 1) {    /* If there aren't exactly
-                                                       two folds to this, have
-                                                       to test B completely */
-                            *c1 = *c2 = CHRTEST_VOID;
-                        }
-                        else {  /* There are two.  Set *c1 and *c2 to them */
-                            SV** c_p = av_fetch(list, 0, FALSE);
-                            if (c_p == NULL) {
-                                Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
-                            }
-                            *c1 = SvUV(*c_p);
-                            c_p = av_fetch(list, 1, FALSE);
-                            if (c_p == NULL) {
-                                Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
-                            }
-                            *c2 = SvUV(*c_p);
-                        }
-                    }
-                }
-            }
-            else {
-                /* Get the character represented by the UTF-8-encoded byte */
-                U8 c = (UTF8_IS_INVARIANT(*pat))
-                        ? *pat
-                        : TWO_BYTE_UTF8_TO_UNI(*pat, *(pat+1));
-
-                if (HAS_NONLATIN1_FOLD_CLOSURE(c)
-                    && (OP(text_node) != EXACTFA || ! isASCII(c)))
-                {   /* Something above Latin1 folds to this; hence there are
-                       more than 2 possibilities for B to begin with */
-                    *c1 = *c2 = CHRTEST_VOID;
-                }
-                else {
-                    *c1 = c;
-                    *c2 = (OP(text_node) == EXACTFL)
-                           ? PL_fold_locale[*c1]
-                           : PL_fold_latin1[*c1];
-                }
-            }
-        }
+       *c1p = *c2p = c2;    /* c2 is the only representable value */
+    }
+    else {  /* c1 is representable; see about c2 */
+       *c1p = c1;
+       *c2p = (c2 < 256) ? c2 : c1;
     }
 
     return TRUE;
@@ -5574,8 +5653,8 @@ NULL
 	            	IS_TEXT and friends need to change.
 	             */
 		    if (PL_regkind[OP(text_node)] == EXACT) {
-                        if (! S_setup_EXACTISH_ST_c1_c2(aTHX_ text_node,
-                                                              &ST.c1, &ST.c2))
+                        if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
+                           text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8))
                         {
                             sayNO;
                         }
@@ -5590,19 +5669,31 @@ NULL
 		    "", (IV)ST.count)
 		);
 	    if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
-                const UV c = (utf8_target)
-                              ? utf8n_to_uvchr((U8*)locinput,
-                                               UTF8_MAXBYTES, NULL,
-                                               uniflags)
-                              : nextchr;
-                if (c != (UV) ST.c1 && c != (UV) ST.c2) {
+                if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
+                    if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
+                        && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
+                    {
+                        /* simulate B failing */
+                        DEBUG_OPTIMISE_r(
+                            PerlIO_printf(Perl_debug_log,
+                                "%*s  CURLYM Fast bail next target=U+%"UVXf" c1=U+%"UVXf" c2=U+%"UVXf"\n",
+                                (int)(REPORT_CODE_OFF+(depth*2)),"",
+                                valid_utf8_to_uvchr((U8 *) locinput, NULL),
+                                valid_utf8_to_uvchr(ST.c1_utf8, NULL),
+                                valid_utf8_to_uvchr(ST.c2_utf8, NULL))
+                        );
+                        state_num = CURLYM_B_fail;
+                        goto reenter_switch;
+                    }
+                }
+                else if (nextchr != ST.c1 && nextchr != ST.c2) {
                     /* simulate B failing */
                     DEBUG_OPTIMISE_r(
                         PerlIO_printf(Perl_debug_log,
-                            "%*s  CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
+                            "%*s  CURLYM Fast bail next target=U+%X c1=U+%X c2=U+%X\n",
                             (int)(REPORT_CODE_OFF+(depth*2)),"",
-                            (IV)ST.c1,(IV)ST.c2
-                    ));
+                            (int) nextchr, ST.c1, ST.c2)
+                    );
                     state_num = CURLYM_B_fail;
                     goto reenter_switch;
                 }
@@ -5738,8 +5829,8 @@ NULL
                     
                         if this changes back then the macro for IS_TEXT and 
                         friends need to change. */
-                        if (! S_setup_EXACTISH_ST_c1_c2(aTHX_ text_node,
-                                                              &ST.c1, &ST.c2))
+                        if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
+                           text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8))
                         {
                             sayNO;
                         }
@@ -5831,26 +5922,21 @@ NULL
 		if (utf8_target) {
 		    n = (ST.oldloc == locinput) ? 0 : 1;
 		    if (ST.c1 == ST.c2) {
-			STRLEN len;
 			/* set n to utf8_distance(oldloc, locinput) */
-			while (locinput <= ST.maxpos &&
-			       utf8n_to_uvchr((U8*)locinput,
-					      UTF8_MAXBYTES, &len,
-					      uniflags) != (UV)ST.c1) {
-			    locinput += len;
+			while (locinput <= ST.maxpos
+                              && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
+                        {
+			    locinput += UTF8SKIP(locinput);
 			    n++;
 			}
 		    }
 		    else {
 			/* set n to utf8_distance(oldloc, locinput) */
-			while (locinput <= ST.maxpos) {
-			    STRLEN len;
-			    const UV c = utf8n_to_uvchr((U8*)locinput,
-						  UTF8_MAXBYTES, &len,
-						  uniflags);
-			    if (c == (UV)ST.c1 || c == (UV)ST.c2)
-				break;
-			    locinput += len;
+			while (locinput <= ST.maxpos
+                              && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
+                              && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
+                        {
+			    locinput += UTF8SKIP(locinput);
 			    n++;
 			}
 		    }
@@ -5931,16 +6017,25 @@ NULL
                 goto fake_end;
             }
 	    {
-		UV c = 0;
-		if (ST.c1 != CHRTEST_VOID && locinput < PL_regeol)
-		    c = utf8_target ? utf8n_to_uvchr((U8*)locinput,
-					   UTF8_MAXBYTES, 0, uniflags)
-				: (UV) UCHARAT(locinput);
+		bool could_match = locinput < PL_regeol;
+
 		/* If it could work, try it. */
-                if (ST.c1 == CHRTEST_VOID
-                    || (locinput < PL_regeol &&
-                        (c == (UV)ST.c1 || c == (UV)ST.c2)))
-                {
+                if (ST.c1 != CHRTEST_VOID && could_match) {
+                    if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
+                    {
+                        could_match = memEQ(locinput,
+                                            ST.c1_utf8,
+                                            UTF8SKIP(locinput))
+                                    || memEQ(locinput,
+                                             ST.c2_utf8,
+                                             UTF8SKIP(locinput));
+                    }
+                    else {
+                        could_match = UCHARAT(locinput) == ST.c1
+                                      || UCHARAT(locinput) == ST.c2;
+                    }
+                }
+                if (ST.c1 == CHRTEST_VOID || could_match) {
 		    CURLY_SETPAREN(ST.paren, ST.count);
 		    PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
 		    assert(0); /* NOTREACHED */
diff --git a/regexp.h b/regexp.h
index 0e3517d5f1..e1d5906c22 100644
--- a/regexp.h
+++ b/regexp.h
@@ -18,6 +18,8 @@
 /* we don't want to include this stuff if we are inside of
    an external regex engine based on the core one - like re 'debug'*/
 
+#include "utf8.h"
+
 struct regnode {
     U8	flags;
     U8  type;
@@ -740,7 +742,7 @@ typedef struct regmatch_state {
 	struct {
 	    /* this first element must match u.yes */
 	    struct regmatch_state *prev_yes_state;
-	    I32 c1, c2;		/* case fold search */
+	    int c1, c2;		/* case fold search */
 	    CHECKPOINT cp;
 	    U32 lastparen;
 	    U32 lastcloseparen;
@@ -749,6 +751,8 @@ typedef struct regmatch_state {
 	    bool minmod;
 	    regnode *A, *B;	/* the nodes corresponding to /A*B/  */
 	    regnode *me;	/* the curlym node */
+            U8 c1_utf8[UTF8_MAXBYTES+1];  /* */
+            U8 c2_utf8[UTF8_MAXBYTES+1];
 	} curlym;
 
 	struct {
@@ -756,12 +760,14 @@ typedef struct regmatch_state {
 	    CHECKPOINT cp;
 	    U32 lastparen;
 	    U32 lastcloseparen;
-	    I32 c1, c2;		/* case fold search */
+	    int c1, c2;		/* case fold search */
 	    char *maxpos;	/* highest possible point in string to match */
 	    char *oldloc;	/* the previous locinput */
 	    int count;
 	    int min, max;	/* {m,n} */
 	    regnode *A, *B;	/* the nodes corresponding to /A*B/  */
+            U8 c1_utf8[UTF8_MAXBYTES+1];  /* */
+            U8 c2_utf8[UTF8_MAXBYTES+1];
 	} curly; /* and CURLYN/PLUS/STAR */
 
     } u;
diff --git a/utf8.c b/utf8.c
index 13fb689075..8ad0478358 100644
--- a/utf8.c
+++ b/utf8.c
@@ -3606,7 +3606,7 @@ HV*
 Perl__swash_inversion_hash(pTHX_ SV* const swash)
 {
 
-   /* Subject to change or removal.  For use only in one place in regcomp.c.
+   /* Subject to change or removal.  For use only in regcomp.c and regexec.c
     * Can't be used on a property that is subject to user override, as it
     * relies on the value of SPECIALS in the swash which would be set by
     * utf8_heavy.pl to the hash in the non-overriden file, and hence is not set
author	Karl Williamson <public@khwilliamson.com>	2012-10-16 10:17:01 -0600
committer	Karl Williamson <public@khwilliamson.com>	2012-10-16 21:48:37 -0600
commit	79a2a0e89816b80870df1f9b9e7bb5fb1edcd556 (patch)
tree	f530af448db6076a9fc00479d2d4a3bb64427eee
parent	57f0e7e230d864f5b78d28bb89545ef671c101a0 (diff)
download	perl-79a2a0e89816b80870df1f9b9e7bb5fb1edcd556.tar.gz