temp commit for smokes (smoke-me/khw-tricky)

author: Karl Williamson <public@khwilliamson.com> 2011-12-22 21:55:09 -0700
committer: Karl Williamson <public@khwilliamson.com> 2011-12-22 21:55:09 -0700
commit: e17746f4b72a1a3dbaa579c15d3feaf7d58232de (patch)
tree: 34185f4c91afce18385a99dce24cf70f5ffb625d
parent: 9b29c3f73ae0922b17ad298dde855b933a4bfee0 (diff)
download: perl-smoke-me/khw-tricky.tar.gz
9 files changed, 391 insertions, 479 deletions
diff --git a/embed.fnc b/embed.fnc
index 3b81d3fc28..f3e7cf63e0 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1889,7 +1889,7 @@ Es	|void	|regtail	|NN struct RExC_state_t *pRExC_state \
 Es	|SV *	|reg_scan_name	|NN struct RExC_state_t *pRExC_state \
 				|U32 flags
 Es	|U32	|join_exact	|NN struct RExC_state_t *pRExC_state \
-				|NN regnode *scan|NN I32 *min|U32 flags|NULLOK regnode *val|U32 depth
+				|NN regnode *scan|NN I32 *min_change|U32 flags|NULLOK regnode *val|U32 depth
 EsRn	|char *	|regwhite	|NN struct RExC_state_t *pRExC_state \
 				|NN char *p
 Es	|char *	|nextchar	|NN struct RExC_state_t *pRExC_state
diff --git a/pp_hot.c b/pp_hot.c
index a2d6f9140e..a3edfa9891 100644
--- a/pp_hot.c
+++ b/pp_hot.c
@@ -1290,8 +1290,10 @@ PP(pp_match)
 	rx = PM_GETRE(pm);
     }
 
-    if (RX_MINLEN(rx) > (I32)len)
+    if (RX_MINLEN(rx) > (I32)len) {
+	DEBUG_r(PerlIO_printf(Perl_debug_log, "Regex match must fail due to min length, so not tried\n"));
 	goto failure;
+    }
 
     truebase = t = s;
 
@@ -1330,8 +1332,10 @@ PP(pp_match)
   play_it_again:
     if (global && RX_OFFS(rx)[0].start != -1) {
 	t = s = RX_OFFS(rx)[0].end + truebase - RX_GOFS(rx);
-	if ((s + RX_MINLEN(rx)) > strend || s < truebase)
+	if ((s + RX_MINLEN(rx)) > strend || s < truebase) {
+	    DEBUG_r(PerlIO_printf(Perl_debug_log, "Regex match must fail, so not tried\n"));
 	    goto nope;
+	}
 	if (update_minmatch++)
 	    minmatch = had_zerolen;
     }
diff --git a/proto.h b/proto.h
index 60f191aa73..e0db5ca2ab 100644
--- a/proto.h
+++ b/proto.h
@@ -6350,12 +6350,12 @@ PERL_STATIC_INLINE void	S_invlist_trim(pTHX_ SV* const invlist)
 #define PERL_ARGS_ASSERT_INVLIST_TRIM	\
 	assert(invlist)
 
-STATIC U32	S_join_exact(pTHX_ struct RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags, regnode *val, U32 depth)
+STATIC U32	S_join_exact(pTHX_ struct RExC_state_t *pRExC_state, regnode *scan, I32 *min_change, U32 flags, regnode *val, U32 depth)
 			__attribute__nonnull__(pTHX_1)
 			__attribute__nonnull__(pTHX_2)
 			__attribute__nonnull__(pTHX_3);
 #define PERL_ARGS_ASSERT_JOIN_EXACT	\
-	assert(pRExC_state); assert(scan); assert(min)
+	assert(pRExC_state); assert(scan); assert(min_change)
 
 STATIC I32	S_make_trie(pTHX_ struct RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
 			__attribute__nonnull__(pTHX_1)
diff --git a/regcomp.c b/regcomp.c
index 68b9e04d06..80286fb8e6 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -2505,21 +2505,39 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode
    }});
 
 
+/* The below joins as many adjacent EXACTish nodes as possible into a single
+ * one, and looks for problematic sequences of characters whose folds vs.
+ * non-folds have sufficiently different lengths, that the optimizer would be
+ * fooled into rejecting legitimate matches of them, and the trie construction
+ * code can't cope with them.  The joining is only done if:
+ * 1) there is room in the current conglomerated node to entirely contain the
+ *    next one.
+ * 2) they are the exact same node type
+ *
+ * The adjacent nodes actually may be separated by NOTHING kind nodes.
+ *
+ * If there are problematic code sequences, *min_change is set to the delta
+ * that the minimum size of the node can be off from its actual size.
+ *
+ * And, the node type of the result is changed to reflect that it contains
+ * these sequences
+ */
 
-
-
-#define JOIN_EXACT(scan,min,flags) \
+#define JOIN_EXACT(scan,min_change,flags) \
     if (PL_regkind[OP(scan)] == EXACT) \
-        join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
+        join_exact(pRExC_state,(scan),(min_change),(flags),NULL,depth+1)
 
 STATIC U32
-S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
+S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min_change, U32 flags,regnode *val, U32 depth) {
     /* Merge several consecutive EXACTish nodes into one. */
     regnode *n = regnext(scan);
     U32 stringok = 1;
     regnode *next = scan + NODE_SZ_STR(scan);
     U32 merged = 0;
     U32 stopnow = 0;
+    char *s, *t;
+    char * const s0 = STRING(scan);
+    char * const s_end = s0 + STR_LEN(scan);
 #ifdef DEBUGGING
     regnode *stop = scan;
     GET_RE_DEBUG_FLAGS_DECL;
@@ -2533,13 +2551,20 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags
     PERL_UNUSED_ARG(val);
 #endif
     DEBUG_PEEP("join",scan,depth);
+
+    /* These opcodes should only be on output from this routine, never on input
+     */
+    assert(OP(scan) != EXACTFU_NO_TRIE);
+    assert(OP(scan) != EXACTFU_SS);
     
-    /* Skip NOTHING, merge EXACT*. */
-    while (n &&
-           ( PL_regkind[OP(n)] == NOTHING ||
-             (stringok && (OP(n) == OP(scan))))
+    /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
+     * EXACT ones that are mergeable to the current one. */
+    while (n
+           && (PL_regkind[OP(n)] == NOTHING
+               || (stringok && OP(n) == OP(scan)))
            && NEXT_OFF(n)
-           && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
+           && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
+    {
         
         if (OP(n) == TAIL || n > next)
             stringok = 0;
@@ -2563,7 +2588,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags
             DEBUG_PEEP("merg",n,depth);
             merged++;
 
-            NEXT_OFF(scan) += NEXT_OFF(n);
+	    NEXT_OFF(scan) += NEXT_OFF(n);
             STR_LEN(scan) += STR_LEN(n);
             next = n + NODE_SZ_STR(n);
             /* Now we can overwrite *n : */
@@ -2588,65 +2613,136 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags
 	}
 #endif
     }
-#define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS   0x0390
-#define IOTA_D_T	GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
-#define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS	0x03B0
-#define UPSILON_D_T	GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
 
-    if (UTF
-	&& ( OP(scan) == EXACTF || OP(scan) == EXACTFU || OP(scan) == EXACTFA)
-	&& ( STR_LEN(scan) >= 6 ) )
-    {
-    /*
-    Two problematic code points in Unicode casefolding of EXACT nodes:
-    
-    U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
-    U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
-    
-    which casefold to
-    
-    Unicode                      UTF-8
-    
-    U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
-    U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
-    
-    This means that in case-insensitive matching (or "loose matching",
-    as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
-    length of the above casefolded versions) can match a target string
-    of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
-    This would rather mess up the minimum length computation.
-    
-    What we'll do is to look for the tail four bytes, and then peek
-    at the preceding two bytes to see whether we need to decrease
-    the minimum length by four (six minus two).
-    
-    Thanks to the design of UTF-8, there cannot be false matches:
-    A sequence of valid UTF-8 bytes cannot be a subsequence of
-    another valid sequence of UTF-8 bytes.
-    
-    */
-         char * const s0 = STRING(scan), *s, *t;
-         char * const s1 = s0 + STR_LEN(scan) - 1;
-         char * const s2 = s1 - 4;
+    *min_change = 0;
+
+    /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
+     * can now analyze for sequences of problematic code points.  (Prior to
+     * this final joining, sequences could have been split over boundaries, and
+     * hence missed).  The sequences only happen in folding */
+    if (OP(scan) != EXACT) {
+
+	/* There are three code points in Unicode whose folded lengths differ so
+	* much from the un-folded lengths that it causes problems for the
+	* optimizer and trie construction.  Why only these are problematic, and
+	* not others is something I (khw) do not understand.  And new versions of
+	* Unicode might add more such code points.  Hopefully the logic in
+	* fold_grind.t that figures out what to test (in part by verifying that
+	* each size-combination gets tested) will catch any that do come along, so
+	* they can be added to the special handling below.  The chances of this
+	* are actually rather small, as most, if not all, of the scripts that have
+	* casefolding have already been encoded by Unicode, as well as those from
+	* pre-existing standards that Unicode has encoded for backwards
+	* compatibility, which would be the new ones that might have enough
+	* weirdness to qualify for this */
+
+	/* First we look at the sequences that can occur only in UTF-8 strings.
+	 * The sequences are of length 6 */
+	if (UTF && STR_LEN(scan) >= 6) {
+
+	    /* Two problematic code points in Unicode casefolding of EXACT
+	     * nodes:
+	     *
+	     * U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
+	     * U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
+	     *  
+	     * which casefold to
+	     *  
+	     * Unicode                      UTF-8
+	     *  
+	     * U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
+	     * U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
+             * 
+	     * This means that in case-insensitive matching (or "loose
+	     * matching", as Unicode calls it), an EXACTF of length six (the
+	     * UTF-8 encoded byte length of the above casefolded versions) can
+	     * match a target string of length two (the byte length of UTF-8
+	     * encoded U+0390 or U+03B0).  This would rather mess up the
+	     * minimum length computation.  (there are other code points that
+	     * also fold to these two sequences, but the delta is smaller)
+	     * 
+	     * What we'll do is to look for the tail four bytes, and then peek
+	     * at the preceding two bytes to see whether we need to decrease
+	     * the minimum length by four (six minus two).
+	     *  
+	     * Thanks to the design of UTF-8, there cannot be false matches:
+	     * A sequence of valid UTF-8 bytes cannot be a subsequence of
+	     * another valid sequence of UTF-8 bytes. */
+
 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
-	 const char t0[] = "\xaf\x49\xaf\x42";
-#else
-         const char t0[] = "\xcc\x88\xcc\x81";
-#endif
-         const char * const t1 = t0 + 3;
-    
-         for (s = s0 + 2;
-              s < s2 && (t = ninstr(s, s1, t0, t1));
-              s = t + 4) {
-#ifdef EBCDIC
-	      if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
-		  ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
+	    const char U390_first_byte = '\xb4';
+	    const char U390_2nd_byte = '\x68';
+	    const char U3B0_first_byte = '\xb5';
+	    const char U3B0_2nd_byte = '\x46';
+	    const char tail[] = "\xaf\x49\xaf\x42";
 #else
-              if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
-                  ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
+	    const char U390_first_byte = '\xce';
+	    const char U390_2nd_byte = '\xb9';
+	    const char U3B0_first_byte = '\xcf';
+	    const char U3B0_2nd_byte = '\x85';
+	    const char tail[] = "\xcc\x88\xcc\x81";
 #endif
-                   *min -= 4;
-         }
+	    const STRLEN tail_len = sizeof(tail) - 1;
+	    for (s = s0 + 2;
+		 s <= s_end - tail_len
+		   && (t = ninstr(s, s_end, tail, tail + tail_len));
+		 s = t + tail_len)
+	    {
+		if ((t[-1] == U390_2nd_byte && t[-2] == U390_first_byte)
+		    || (t[-1] == U3B0_2nd_byte && t[-2] == U3B0_first_byte))
+		{
+		    *min_change -= 4;
+
+		    /* This can't currently be handled by tries, so change the
+		     * node type to indicate this. */
+		    if (OP(scan) == EXACTFU) {
+			OP(scan) = EXACTFU_NO_TRIE;
+		    }
+		}
+	    }
+	}
+
+	/* The third problematic sequence is 'ss', which can match just the
+	 * single byte LATIN SMALL LETTER SHARP S, and it can do it in both
+	 * non- and UTF-8.  Code elsewhere in this file makes sure, however,
+	 * that the sharp s gets folded to 'ss' under Unicode rules even if not
+	 * UTF-8. */
+	if (STR_LEN(scan) >= 2
+	    && (OP(scan) == EXACTFU
+		|| OP(scan) == EXACTFU_NO_TRIE	/* The code above could have
+						   set to this node type */
+	        || OP(scan) == EXACTF))
+	{
+	    /* The string will be folded to 'ss' if it's in UTF-8, but it could
+	     * be 'Ss', etc when not.  We could have different code to handle
+	     * the two cases, but this is not necessary since both S and s are
+	     * invariants under UTF-8; and not worth it, especially because we
+	     * can use just one test each time through the loop (plus a mask)
+	     * This is because on both EBCDIC and ASCII machines, an 'S' and 's'
+	     * differ by a single bit.  On ASCII they are 32 apart; on EBCDIC,
+	     * they are 64.  This uses an exclusive 'or' to find that bit and
+	     * then inverts it to form a mask, with just a single 0, in the bit
+	     * position where 'S' and 's' differ. */
+	    const char S_or_s_mask = ~ ('S' ^ 's');
+	    const char s_masked = 's' & S_or_s_mask;
+
+	    for (s = s0; s < s_end - 1; s++) {
+		if (((*s & S_or_s_mask) == s_masked)
+		    && ((*(s+1) & S_or_s_mask) == s_masked))
+		{
+		    s++;
+		    *min_change -= 1;
+
+		    /* EXACTFU_SS also isn't trie'able, so don't have to
+		     * preserve EXACTFU_NO_TRIE.  EXACTF is also not trie'able,
+		     * and because we essentially punt the optimizations in its
+		     * case, we don't need to indicate that it has an ss */
+		    if (OP(scan) == EXACTFU || OP(scan) == EXACTFU_NO_TRIE) {
+			OP(scan) = EXACTFU_SS;
+		    }
+		}
+	    }
+	}
     }
     
 #ifdef DEBUGGING
@@ -2762,10 +2858,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
 
   fake_study_recurse:
     while ( scan && OP(scan) != END && scan < last ){
+        I32 min_change;
 	/* Peephole optimizer: */
 	DEBUG_STUDYDATA("Peep:", data,depth);
 	DEBUG_PEEP("Peep",scan,depth);
-        JOIN_EXACT(scan,&min,0);
+        JOIN_EXACT(scan,&min_change,0);
 
 	/* Follow the next-chain of the current node and optimize
 	   away all the NOTHINGs from it.  */
@@ -3059,8 +3156,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
     in below to fully enable trie logic.
 
 #define TRIE_TYPE_IS_SAFE 1
-
 */
+
 #define TRIE_TYPE_IS_SAFE ((UTF && optype == EXACTFU) || optype==EXACT)
 
                                 if ( last && TRIE_TYPE_IS_SAFE ) {
@@ -3279,9 +3376,23 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
 		l = utf8_length(s, s + l);
 		uc = utf8_to_uvchr(s, NULL);
 	    }
-	    min += l;
-	    if (flags & SCF_DO_SUBSTR)
+	    else if (OP(scan) == EXACTF) {
+		if (memchr(STRING(scan), LATIN_SMALL_LETTER_SHARP_S, l)) {
+		    RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
+		}
+	    }
+	    min += l + min_change;
+            if (min < 0) {
+                min = 0;
+            }
+            delta += abs(min_change);
+	    if (flags & SCF_DO_SUBSTR) {
 		data->pos_min += l;
+                data->pos_delta += abs(min_change);
+		if (min_change) {
+		    data->longest = &(data->longest_float);
+		}
+	    }
 	    if (flags & SCF_DO_STCLASS_AND) {
 		/* Check whether it is compatible with what we know already! */
 		int compat = 1;
@@ -3311,6 +3422,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
 			 * the full latin1 fold.  (Can't do this for locale,
 			 * because not known until runtime */
 			ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
+			if (uc == 's' || uc == 'S') {
+			    ANYOF_BITMAP_SET(data->start_class, LATIN_SMALL_LETTER_SHARP_S);
+			}
+			else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
+			    ANYOF_BITMAP_SET(data->start_class, 's');
+			    ANYOF_BITMAP_SET(data->start_class, 'S');
+			}
 		    }
 		}
 		else if (uc >= 0x100) {
@@ -3335,6 +3453,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                              * run-time */
                             ANYOF_BITMAP_SET(data->start_class,
 					     PL_fold_latin1[uc]);
+			    if (uc == 's' || uc == 'S') {
+				ANYOF_BITMAP_SET(data->start_class, LATIN_SMALL_LETTER_SHARP_S);
+			    }
+			    else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
+				ANYOF_BITMAP_SET(data->start_class, 's');
+				ANYOF_BITMAP_SET(data->start_class, 'S');
+			    }
                         }
 		    }
 		    data->start_class->flags &= ~ANYOF_EOS;
@@ -3740,18 +3865,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
 		data->longest = &(data->longest_float);
     	    }
 	}
-	else if (OP(scan) == FOLDCHAR) {
-	    int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2;
-	    flags &= ~SCF_DO_STCLASS;
-            min += 1;
-            delta += d;
-            if (flags & SCF_DO_SUBSTR) {
-	        SCAN_COMMIT(pRExC_state,data,minlenp);	/* Cannot expect anything... */
-	        data->pos_min += 1;
-	        data->pos_delta += d;
-		data->longest = &(data->longest_float);
-	    }
-	}
 	else if (REGNODE_SIMPLE(OP(scan))) {
 	    int value = 0;
 
@@ -5067,9 +5180,10 @@ reStudy:
         {
             I32 t,ml;
 
-	    if (SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
-		&& data.offset_fixed == data.offset_float_min
-		&& SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
+	    if ((RExC_seen & REG_SEEN_EXACTF_SHARP_S)
+		|| (SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
+		    && data.offset_fixed == data.offset_float_min
+		    && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
 		    goto remove_float;		/* As in (a)+. */
 
             /* copy the information about the longest float from the reg_scan_data
@@ -5112,10 +5226,11 @@ reStudy:
            Be careful. 
          */
 	longest_fixed_length = CHR_SVLEN(data.longest_fixed);
-	if (longest_fixed_length
-	    || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
-		&& (!(data.flags & SF_FIX_BEFORE_MEOL)
-		    || (RExC_flags & RXf_PMf_MULTILINE)))) 
+	if (! (RExC_seen & REG_SEEN_EXACTF_SHARP_S)
+	    && (longest_fixed_length
+	        || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
+		    && (!(data.flags & SF_FIX_BEFORE_MEOL)
+		        || (RExC_flags & RXf_PMf_MULTILINE)))) )
         {
             I32 t,ml;
 
@@ -8820,15 +8935,6 @@ tryagain:
 	    RExC_parse++;
 
 	defchar: {
-	    typedef enum {
-		generic_char = 0,
-		char_s,
-		upsilon_1,
-		upsilon_2,
-		iota_1,
-		iota_2,
-	    } char_state;
-	    char_state latest_char_state = generic_char;
 	    register STRLEN len;
 	    register UV ender;
 	    register char *p;
@@ -8836,22 +8942,25 @@ tryagain:
 	    STRLEN foldlen;
 	    U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
 	    regnode * orig_emit;
+            int node_type;
+            bool is_exactfu_sharp_s;
 
 	    ender = 0;
 	    orig_emit = RExC_emit; /* Save the original output node position in
 				      case we need to output a different node
 				      type */
-	    ret = reg_node(pRExC_state,
-			   (U8) ((! FOLD) ? EXACT
+            node_type =    (U8) ((! FOLD) ? EXACT
 					  : (LOC)
 					     ? EXACTFL
 					     : (MORE_ASCII_RESTRICTED)
 					       ? EXACTFA
 					       : (AT_LEAST_UNI_SEMANTICS)
 					         ? EXACTFU
-					         : EXACTF)
-		    );
+					         : EXACTF);
+	    ret = reg_node(pRExC_state, node_type);
 	    s = STRING(ret);
+
+	    /* By going only up to 127 when the maximum storable is 255, we don't have to worry about expansion, not being in the last character in the fold */
 	    for (len = 0, p = RExC_parse - 1;
 	      len < 127 && p < RExC_end;
 	      len++)
@@ -9047,219 +9156,10 @@ tryagain:
 		    break;
 		} /* End of switch on the literal */
 
-		/* Certain characters are problematic because their folded
-		 * length is so different from their original length that it
-		 * isn't handleable by the optimizer.  They are therefore not
-		 * placed in an EXACTish node; and are here handled specially.
-		 * (Even if the optimizer handled LATIN_SMALL_LETTER_SHARP_S,
-		 * putting it in a special node keeps regexec from having to
-		 * deal with a non-utf8 multi-char fold */
-		if (FOLD
-		    && (ender > 255 || (! MORE_ASCII_RESTRICTED && ! LOC)))
-		{
-		    /* We look for either side of the fold.  For example \xDF
-		     * folds to 'ss'.  We look for both the single character
-		     * \xDF and the sequence 'ss'.  When we find something that
-		     * could be one of those, we stop and flush whatever we
-		     * have output so far into the EXACTish node that was being
-		     * built.  Then restore the input pointer to what it was.
-		     * regatom will return that EXACT node, and will be called
-		     * again, positioned so the first character is the one in
-		     * question, which we return in a different node type.
-		     * The multi-char folds are a sequence, so the occurrence
-		     * of the first character in that sequence doesn't
-		     * necessarily mean that what follows is the rest of the
-		     * sequence.  We keep track of that with a state machine,
-		     * with the state being set to the latest character
-		     * processed before the current one.  Most characters will
-		     * set the state to 0, but if one occurs that is part of a
-		     * potential tricky fold sequence, the state is set to that
-		     * character, and the next loop iteration sees if the state
-		     * should progress towards the final folded-from character,
-		     * or if it was a false alarm.  If it turns out to be a
-		     * false alarm, the character(s) will be output in a new
-		     * EXACTish node, and join_exact() will later combine them.
-		     * In the case of the 'ss' sequence, which is more common
-		     * and more easily checked, some look-ahead is done to
-		     * save time by ruling-out some false alarms */
-		    switch (ender) {
-			default:
-			    latest_char_state = generic_char;
-			    break;
-			case 's':
-			case 'S':
-			case 0x17F: /* LATIN SMALL LETTER LONG S */
-			     if (AT_LEAST_UNI_SEMANTICS) {
-				if (latest_char_state == char_s) {  /* 'ss' */
-				    ender = LATIN_SMALL_LETTER_SHARP_S;
-				    goto do_tricky;
-				}
-				else if (p < RExC_end) {
-
-				    /* Look-ahead at the next character.  If it
-				     * is also an s, we handle as a sharp s
-				     * tricky regnode.  */
-				    if (*p == 's' || *p == 'S') {
-
-					/* But first flush anything in the
-					 * EXACTish buffer */
-					if (len != 0) {
-					    p = oldp;
-					    goto loopdone;
-					}
-					p++;	/* Account for swallowing this
-						   's' up */
-					ender = LATIN_SMALL_LETTER_SHARP_S;
-					goto do_tricky;
-				    }
-					/* Here, the next character is not a
-					 * literal 's', but still could
-					 * evaluate to one if part of a \o{},
-					 * \x or \OCTAL-DIGIT.  The minimum
-					 * length required for that is 4, eg
-					 * \x53 or \123 */
-				    else if (*p == '\\'
-					     && p < RExC_end - 4
-					     && (isDIGIT(*(p + 1))
-						 || *(p + 1) == 'x'
-						 || *(p + 1) == 'o' ))
-				    {
-
-					/* Here, it could be an 's', too much
-					 * bother to figure it out here.  Flush
-					 * the buffer if any; when come back
-					 * here, set the state so know that the
-					 * previous char was an 's' */
-					if (len != 0) {
-					    latest_char_state = generic_char;
-					    p = oldp;
-					    goto loopdone;
-					}
-					latest_char_state = char_s;
-					break;
-				    }
-				}
-			    }
-
-			    /* Here, can't be an 'ss' sequence, or at least not
-			     * one that could fold to/from the sharp ss */
-			    latest_char_state = generic_char;
-			    break;
-			case 0x03C5:	/* First char in upsilon series */
-			case 0x03A5:	/* Also capital UPSILON, which folds to
-					   03C5, and hence exhibits the same
-					   problem */
-			    if (p < RExC_end - 4) { /* Need >= 4 bytes left */
-				latest_char_state = upsilon_1;
-				if (len != 0) {
-				    p = oldp;
-				    goto loopdone;
-				}
-			    }
-			    else {
-				latest_char_state = generic_char;
-			    }
-			    break;
-			case 0x03B9:	/* First char in iota series */
-			case 0x0399:	/* Also capital IOTA */
-			case 0x1FBE:	/* GREEK PROSGEGRAMMENI folds to 3B9 */
-			case 0x0345:	/* COMBINING GREEK YPOGEGRAMMENI folds
-					   to 3B9 */
-			    if (p < RExC_end - 4) {
-				latest_char_state = iota_1;
-				if (len != 0) {
-				    p = oldp;
-				    goto loopdone;
-				}
-			    }
-			    else {
-				latest_char_state = generic_char;
-			    }
-			    break;
-			case 0x0308:
-			    if (latest_char_state == upsilon_1) {
-				latest_char_state = upsilon_2;
-			    }
-			    else if (latest_char_state == iota_1) {
-				latest_char_state = iota_2;
-			    }
-			    else {
-				latest_char_state = generic_char;
-			    }
-			    break;
-			case 0x301:
-			    if (latest_char_state == upsilon_2) {
-				ender = GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS;
-				goto do_tricky;
-			    }
-			    else if (latest_char_state == iota_2) {
-				ender = GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS;
-				goto do_tricky;
-			    }
-			    latest_char_state = generic_char;
-			    break;
-
-			/* These are the tricky fold characters.  Flush any
-			 * buffer first. (When adding to this list, also should
-			 * add them to fold_grind.t to make sure get tested) */
-			case GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS:
-			case GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS:
-			case LATIN_SMALL_LETTER_SHARP_S:
-			case LATIN_CAPITAL_LETTER_SHARP_S:
-			case 0x1FD3: /* GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA */
-			case 0x1FE3: /* GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA */
-			    if (len != 0) {
-				p = oldp;
-				goto loopdone;
-			    }
-			    /* FALL THROUGH */
-			do_tricky: {
-			    char* const oldregxend = RExC_end;
-			    U8 tmpbuf[UTF8_MAXBYTES+1];
-
-			    /* Here, we know we need to generate a special
-			     * regnode, and 'ender' contains the tricky
-			     * character.  What's done is to pretend it's in a
-			     * [bracketed] class, and let the code that deals
-			     * with those handle it, as that code has all the
-			     * intelligence necessary.  First save the current
-			     * parse state, get rid of the already allocated
-			     * but empty EXACT node that the ANYOFV node will
-			     * replace, and point the parse to a buffer which
-			     * we fill with the character we want the regclass
-			     * code to think is being parsed */
-			    RExC_emit = orig_emit;
-			    RExC_parse = (char *) tmpbuf;
-			    if (UTF) {
-				U8 *d = uvchr_to_utf8(tmpbuf, ender);
-				*d = '\0';
-				RExC_end = (char *) d;
-			    }
-			    else {  /* ender above 255 already excluded */
-				tmpbuf[0] = (U8) ender;
-				tmpbuf[1] = '\0';
-				RExC_end = RExC_parse + 1;
-			    }
-
-			    ret = regclass(pRExC_state,depth+1);
-
-			    /* Here, have parsed the buffer.  Reset the parse to
-			     * the actual input, and return */
-			    RExC_end = oldregxend;
-			    RExC_parse = p - 1;
-
-			    Set_Node_Offset(ret, RExC_parse);
-			    Set_Node_Cur_Length(ret);
-			    nextchar(pRExC_state);
-			    *flagp |= HASWIDTH|SIMPLE;
-			    return ret;
-			}
-		    }
-		}
-
+                is_exactfu_sharp_s = (node_type == EXACTFU && ender == LATIN_SMALL_LETTER_SHARP_S);
 		if ( RExC_flags & RXf_PMf_EXTENDED)
 		    p = regwhite( pRExC_state, p );
-		if (UTF && FOLD) {
+		if ((UTF && FOLD) || is_exactfu_sharp_s) {
 		    /* Prime the casefolded buffer.  Locale rules, which apply
 		     * only to code points < 256, aren't known until execution,
 		     * so for them, just output the original character using
@@ -9322,7 +9222,7 @@ tryagain:
 		if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
 		    if (len)
 			p = oldp;
-		    else if (UTF) {
+		    else if (UTF || is_exactfu_sharp_s) {
 			 if (FOLD) {
 			      /* Emit all the Unicode characters. */
 			      STRLEN numlen;
@@ -9358,7 +9258,7 @@ tryagain:
 		    }
 		    break;
 		}
-		if (UTF) {
+                if (UTF || is_exactfu_sharp_s) {
 		     if (FOLD) {
 		          /* Emit all the Unicode characters. */
 			  STRLEN numlen;
@@ -11188,6 +11088,7 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,
         regnode * const temp = regnext(scan);
 #ifdef EXPERIMENTAL_INPLACESCAN
         if (PL_regkind[OP(scan)] == EXACT)
+
             if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
                 return EXACT;
 #endif
@@ -11197,6 +11098,8 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,
                 case EXACTF:
                 case EXACTFA:
                 case EXACTFU:
+                case EXACTFU_SS:
+                case EXACTFU_NO_TRIE:
                 case EXACTFL:
                         if( exact == PSEUDO )
                             exact= OP(scan);
@@ -11521,8 +11424,6 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
 			   SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
     } else if (k == LOGICAL)
 	Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);	/* 2: embedded, otherwise 1 */
-    else if (k == FOLDCHAR)
-	Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
     else if (k == ANYOF) {
 	int i, rangestart = -1;
 	const U8 flags = ANYOF_FLAGS(o);
diff --git a/regcomp.h b/regcomp.h
index 81c8a5ddd7..502674c088 100644
--- a/regcomp.h
+++ b/regcomp.h
@@ -492,6 +492,7 @@ struct regnode_charclass_class {
 #define REG_SEEN_VERBARG        0x00000080
 #define REG_SEEN_CUTGROUP       0x00000100
 #define REG_SEEN_RUN_ON_COMMENT 0x00000200
+#define REG_SEEN_EXACTF_SHARP_S 0x00000400
 
 START_EXTERN_C
 
diff --git a/regcomp.sym b/regcomp.sym
index 23b9ef2181..69366d7e87 100644
--- a/regcomp.sym
+++ b/regcomp.sym
@@ -97,7 +97,9 @@ BACK        BACK,       no 0 V    ; Match "", "next" ptr points backward.
 EXACT       EXACT,      str       ; Match this string (preceded by length).
 EXACTF      EXACT,      str       ; Match this non-UTF-8 string (not guaranteed to be folded) using /id rules (w/len).
 EXACTFL     EXACT,      str       ; Match this string (not guaranteed to be folded) using /il rules (w/len).
-EXACTFU     EXACT,      str	  ; Match this string (folded iff in UTF-8) using /iu rules (w/len).
+EXACTFU     EXACT,      str	  ; Match this string (folded iff in UTF-8, length in folding doesn't change if not in UTF-8) using /iu rules (w/len).
+EXACTFU_SS  EXACT,      str	  ; Match this string (folded iff in UTF-8, length in folding may change even if not in UTF-8) using /iu rules (w/len).
+EXACTFU_NO_TRIE EXACT,  str	  ; Match this folded UTF-8 string using /iu rules, but don't generate a trie for it
 EXACTFA     EXACT,      str	  ; Match this string (not guaranteed to be folded) using /iaa rules (w/len).
 
 #*Do nothing types
@@ -214,10 +216,8 @@ VERTWS      VERTWS,     none 0 S  ; vertical whitespace         (Perl 6)
 NVERTWS     NVERTWS,    none 0 S  ; not vertical whitespace     (Perl 6)
 HORIZWS     HORIZWS,    none 0 S  ; horizontal whitespace       (Perl 6)
 NHORIZWS    NHORIZWS,   none 0 S  ; not horizontal whitespace   (Perl 6)
-
 FOLDCHAR    FOLDCHAR,   codepoint 1 ; codepoint with tricky case folding properties.
 
-
 # NEW STUFF SOMEWHERE ABOVE THIS LINE
 
 ################################################################################
diff --git a/regexec.c b/regexec.c
index bde7027ede..0392f1b20d 100644
--- a/regexec.c
+++ b/regexec.c
@@ -303,13 +303,13 @@
 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
    we don't need this definition. */
 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
-#define IS_TEXTF(rn)  ( (OP(rn)==EXACTFU || OP(rn)==EXACTFA ||  OP(rn)==EXACTF)  || OP(rn)==REFF  || OP(rn)==NREFF )
+#define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_NO_TRIE || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 
 #else
 /* ... so we use this as its faster. */
 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
-#define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn) == EXACTFA)
+#define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_NO_TRIE || OP(rn) == EXACTFA)
 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 
@@ -1483,6 +1483,13 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
 	    folder = foldEQ_locale;
 	    goto do_exactf_non_utf8;
 
+	case EXACTFU_SS:
+	    if (UTF_PATTERN) {
+		utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
+	    }
+	    goto do_exactf_utf8;
+
+	case EXACTFU_NO_TRIE:
 	case EXACTFU:
 	    if (UTF_PATTERN || utf8_target) {
 		utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
@@ -3662,6 +3669,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
 	    fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
 	    goto do_exactf;
 
+	case EXACTFU_SS:
+	case EXACTFU_NO_TRIE:
 	case EXACTFU:
 	    folder = foldEQ_latin1;
 	    fold_array = PL_fold_latin1;
@@ -3683,8 +3692,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
 	    s = STRING(scan);
 	    ln = STR_LEN(scan);
 
-	    if (utf8_target || UTF_PATTERN) {
-	      /* Either target or the pattern are utf8. */
+	    if (utf8_target || UTF_PATTERN || state_num == EXACTFU_SS) {
+	      /* Either the target or the pattern is UTF-8, or this is an
+	       * EXACTFU_SS node, whose length may change under folding. */
 		const char * const l = locinput;
 		char *e = PL_regeol;
 
@@ -5072,6 +5082,8 @@ NULL
 			switch (OP(text_node)) {
 			    case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
 			    case EXACTFA:
+			    case EXACTFU_SS:
+			    case EXACTFU_NO_TRIE:
 			    case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
 			    case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
 			    default: ST.c2 = ST.c1;
@@ -5226,6 +5238,8 @@ NULL
 			switch (OP(text_node)) {
 			    case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
 			    case EXACTFA:
+			    case EXACTFU_SS:
+			    case EXACTFU_NO_TRIE:
 			    case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
 			    case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
 			    default: ST.c2 = ST.c1; break;
@@ -5694,27 +5708,6 @@ NULL
             sayNO;
             /* NOTREACHED */
 #undef ST
-        case FOLDCHAR:
-            n = ARG(scan);
-            if ( n == (U32)what_len_TRICKYFOLD(locinput,utf8_target,ln) ) {
-                locinput += ln;
-            } else if ( LATIN_SMALL_LETTER_SHARP_S == n && !utf8_target && !UTF_PATTERN ) {
-                sayNO;
-            } else  {
-                U8 folded[UTF8_MAXBYTES_CASE+1];
-                STRLEN foldlen;
-                const char * const l = locinput;
-                char *e = PL_regeol;
-                to_uni_fold(n, folded, &foldlen);
-
-		if (! foldEQ_utf8((const char*) folded, 0,  foldlen, 1,
-                	       l, &e, 0,  utf8_target)) {
-                        sayNO;
-                }
-                locinput = e;
-            } 
-            nextchr = UCHARAT(locinput);  
-            break;
         case LNBREAK:
             if ((n=is_LNBREAK(locinput,utf8_target))) {
                 locinput += n;
@@ -6039,6 +6032,8 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
 	    utf8_flags = 0;
 	    goto do_exactf;
 
+    case EXACTFU_SS:
+    case EXACTFU_NO_TRIE:
     case EXACTFU:
 	utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
 
@@ -6049,7 +6044,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
 	c = (U8)*STRING(p);
 	assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
 
-	if (utf8_target) { /* Use full Unicode fold matching */
+	if (utf8_target || OP(p) == EXACTFU_SS) { /* Use full Unicode fold matching */
 	    char *tmpeol = loceol;
 	    while (hardcount < max
 		    && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
@@ -6080,6 +6075,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
 	    switch (OP(p)) {
 		case EXACTF: folded = PL_fold[c]; break;
 		case EXACTFA:
+		case EXACTFU_NO_TRIE:
 		case EXACTFU: folded = PL_fold_latin1[c]; break;
 		case EXACTFL: folded = PL_fold_locale[c]; break;
 		default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
diff --git a/regnodes.h b/regnodes.h
index dccf2b7a99..785ff1c9ca 100644
--- a/regnodes.h
+++ b/regnodes.h
@@ -6,8 +6,8 @@
 
 /* Regops and State definitions */
 
-#define REGNODE_MAX           	111
-#define REGMATCH_STATE_MAX    	151
+#define REGNODE_MAX           	113
+#define REGMATCH_STATE_MAX    	153
 
 #define	END                   	0	/* 0000 End of program. */
 #define	SUCCEED               	1	/* 0x01 Return from a subroutine, basically. */
@@ -60,67 +60,69 @@
 #define	EXACT                 	48	/* 0x30 Match this string (preceded by length). */
 #define	EXACTF                	49	/* 0x31 Match this non-UTF-8 string (not guaranteed to be folded) using /id rules (w/len). */
 #define	EXACTFL               	50	/* 0x32 Match this string (not guaranteed to be folded) using /il rules (w/len). */
-#define	EXACTFU               	51	/* 0x33 Match this string (folded iff in UTF-8) using /iu rules (w/len). */
-#define	EXACTFA               	52	/* 0x34 Match this string (not guaranteed to be folded) using /iaa rules (w/len). */
-#define	NOTHING               	53	/* 0x35 Match empty string. */
-#define	TAIL                  	54	/* 0x36 Match empty string. Can jump here from outside. */
-#define	STAR                  	55	/* 0x37 Match this (simple) thing 0 or more times. */
-#define	PLUS                  	56	/* 0x38 Match this (simple) thing 1 or more times. */
-#define	CURLY                 	57	/* 0x39 Match this simple thing {n,m} times. */
-#define	CURLYN                	58	/* 0x3a Capture next-after-this simple thing */
-#define	CURLYM                	59	/* 0x3b Capture this medium-complex thing {n,m} times. */
-#define	CURLYX                	60	/* 0x3c Match this complex thing {n,m} times. */
-#define	WHILEM                	61	/* 0x3d Do curly processing and see if rest matches. */
-#define	OPEN                  	62	/* 0x3e Mark this point in input as start of */
-#define	CLOSE                 	63	/* 0x3f Analogous to OPEN. */
-#define	REF                   	64	/* 0x40 Match some already matched string */
-#define	REFF                  	65	/* 0x41 Match already matched string, folded using native charset semantics for non-utf8 */
-#define	REFFL                 	66	/* 0x42 Match already matched string, folded in loc. */
-#define	REFFU                 	67	/* 0x43 Match already matched string, folded using unicode semantics for non-utf8 */
-#define	REFFA                 	68	/* 0x44 Match already matched string, folded using unicode semantics for non-utf8, no mixing ASCII, non-ASCII */
-#define	NREF                  	69	/* 0x45 Match some already matched string */
-#define	NREFF                 	70	/* 0x46 Match already matched string, folded using native charset semantics for non-utf8 */
-#define	NREFFL                	71	/* 0x47 Match already matched string, folded in loc. */
-#define	NREFFU                	72	/* 0x48 Match already matched string, folded using unicode semantics for non-utf8 */
-#define	NREFFA                	73	/* 0x49 Match already matched string, folded using unicode semantics for non-utf8, no mixing ASCII, non-ASCII */
-#define	IFMATCH               	74	/* 0x4a Succeeds if the following matches. */
-#define	UNLESSM               	75	/* 0x4b Fails if the following matches. */
-#define	SUSPEND               	76	/* 0x4c "Independent" sub-RE. */
-#define	IFTHEN                	77	/* 0x4d Switch, should be preceded by switcher . */
-#define	GROUPP                	78	/* 0x4e Whether the group matched. */
-#define	LONGJMP               	79	/* 0x4f Jump far away. */
-#define	BRANCHJ               	80	/* 0x50 BRANCH with long offset. */
-#define	EVAL                  	81	/* 0x51 Execute some Perl code. */
-#define	MINMOD                	82	/* 0x52 Next operator is not greedy. */
-#define	LOGICAL               	83	/* 0x53 Next opcode should set the flag only. */
-#define	RENUM                 	84	/* 0x54 Group with independently numbered parens. */
-#define	TRIE                  	85	/* 0x55 Match many EXACT(F[ALU]?)? at once. flags==type */
-#define	TRIEC                 	86	/* 0x56 Same as TRIE, but with embedded charclass data */
-#define	AHOCORASICK           	87	/* 0x57 Aho Corasick stclass. flags==type */
-#define	AHOCORASICKC          	88	/* 0x58 Same as AHOCORASICK, but with embedded charclass data */
-#define	GOSUB                 	89	/* 0x59 recurse to paren arg1 at (signed) ofs arg2 */
-#define	GOSTART               	90	/* 0x5a recurse to start of pattern */
-#define	NGROUPP               	91	/* 0x5b Whether the group matched. */
-#define	INSUBP                	92	/* 0x5c Whether we are in a specific recurse. */
-#define	DEFINEP               	93	/* 0x5d Never execute directly. */
-#define	ENDLIKE               	94	/* 0x5e Used only for the type field of verbs */
-#define	OPFAIL                	95	/* 0x5f Same as (?!) */
-#define	ACCEPT                	96	/* 0x60 Accepts the current matched string. */
-#define	VERB                  	97	/* 0x61 Used only for the type field of verbs */
-#define	PRUNE                 	98	/* 0x62 Pattern fails at this startpoint if no-backtracking through this */
-#define	MARKPOINT             	99	/* 0x63 Push the current location for rollback by cut. */
-#define	SKIP                  	100	/* 0x64 On failure skip forward (to the mark) before retrying */
-#define	COMMIT                	101	/* 0x65 Pattern fails outright if backtracking through this */
-#define	CUTGROUP              	102	/* 0x66 On failure go to the next alternation in the group */
-#define	KEEPS                 	103	/* 0x67 $& begins here. */
-#define	LNBREAK               	104	/* 0x68 generic newline pattern */
-#define	VERTWS                	105	/* 0x69 vertical whitespace         (Perl 6) */
-#define	NVERTWS               	106	/* 0x6a not vertical whitespace     (Perl 6) */
-#define	HORIZWS               	107	/* 0x6b horizontal whitespace       (Perl 6) */
-#define	NHORIZWS              	108	/* 0x6c not horizontal whitespace   (Perl 6) */
-#define	FOLDCHAR              	109	/* 0x6d codepoint with tricky case folding properties. */
-#define	OPTIMIZED             	110	/* 0x6e Placeholder for dump. */
-#define	PSEUDO                	111	/* 0x6f Pseudo opcode for internal use. */
+#define	EXACTFU               	51	/* 0x33 Match this string (folded iff in UTF-8, length in folding doesn't change if not in UTF-8) using /iu rules (w/len). */
+#define	EXACTFU_SS            	52	/* 0x34 Match this string (folded iff in UTF-8, length in folding may change even if not in UTF-8) using /iu rules (w/len). */
+#define	EXACTFU_NO_TRIE       	53	/* 0x35 Match this folded UTF-8 string using /iu rules, but don't generate a trie for it */
+#define	EXACTFA               	54	/* 0x36 Match this string (not guaranteed to be folded) using /iaa rules (w/len). */
+#define	NOTHING               	55	/* 0x37 Match empty string. */
+#define	TAIL                  	56	/* 0x38 Match empty string. Can jump here from outside. */
+#define	STAR                  	57	/* 0x39 Match this (simple) thing 0 or more times. */
+#define	PLUS                  	58	/* 0x3a Match this (simple) thing 1 or more times. */
+#define	CURLY                 	59	/* 0x3b Match this simple thing {n,m} times. */
+#define	CURLYN                	60	/* 0x3c Capture next-after-this simple thing */
+#define	CURLYM                	61	/* 0x3d Capture this medium-complex thing {n,m} times. */
+#define	CURLYX                	62	/* 0x3e Match this complex thing {n,m} times. */
+#define	WHILEM                	63	/* 0x3f Do curly processing and see if rest matches. */
+#define	OPEN                  	64	/* 0x40 Mark this point in input as start of */
+#define	CLOSE                 	65	/* 0x41 Analogous to OPEN. */
+#define	REF                   	66	/* 0x42 Match some already matched string */
+#define	REFF                  	67	/* 0x43 Match already matched string, folded using native charset semantics for non-utf8 */
+#define	REFFL                 	68	/* 0x44 Match already matched string, folded in loc. */
+#define	REFFU                 	69	/* 0x45 Match already matched string, folded using unicode semantics for non-utf8 */
+#define	REFFA                 	70	/* 0x46 Match already matched string, folded using unicode semantics for non-utf8, no mixing ASCII, non-ASCII */
+#define	NREF                  	71	/* 0x47 Match some already matched string */
+#define	NREFF                 	72	/* 0x48 Match already matched string, folded using native charset semantics for non-utf8 */
+#define	NREFFL                	73	/* 0x49 Match already matched string, folded in loc. */
+#define	NREFFU                	74	/* 0x4a Match already matched string, folded using unicode semantics for non-utf8 */
+#define	NREFFA                	75	/* 0x4b Match already matched string, folded using unicode semantics for non-utf8, no mixing ASCII, non-ASCII */
+#define	IFMATCH               	76	/* 0x4c Succeeds if the following matches. */
+#define	UNLESSM               	77	/* 0x4d Fails if the following matches. */
+#define	SUSPEND               	78	/* 0x4e "Independent" sub-RE. */
+#define	IFTHEN                	79	/* 0x4f Switch, should be preceded by switcher . */
+#define	GROUPP                	80	/* 0x50 Whether the group matched. */
+#define	LONGJMP               	81	/* 0x51 Jump far away. */
+#define	BRANCHJ               	82	/* 0x52 BRANCH with long offset. */
+#define	EVAL                  	83	/* 0x53 Execute some Perl code. */
+#define	MINMOD                	84	/* 0x54 Next operator is not greedy. */
+#define	LOGICAL               	85	/* 0x55 Next opcode should set the flag only. */
+#define	RENUM                 	86	/* 0x56 Group with independently numbered parens. */
+#define	TRIE                  	87	/* 0x57 Match many EXACT(F[ALU]?)? at once. flags==type */
+#define	TRIEC                 	88	/* 0x58 Same as TRIE, but with embedded charclass data */
+#define	AHOCORASICK           	89	/* 0x59 Aho Corasick stclass. flags==type */
+#define	AHOCORASICKC          	90	/* 0x5a Same as AHOCORASICK, but with embedded charclass data */
+#define	GOSUB                 	91	/* 0x5b recurse to paren arg1 at (signed) ofs arg2 */
+#define	GOSTART               	92	/* 0x5c recurse to start of pattern */
+#define	NGROUPP               	93	/* 0x5d Whether the group matched. */
+#define	INSUBP                	94	/* 0x5e Whether we are in a specific recurse. */
+#define	DEFINEP               	95	/* 0x5f Never execute directly. */
+#define	ENDLIKE               	96	/* 0x60 Used only for the type field of verbs */
+#define	OPFAIL                	97	/* 0x61 Same as (?!) */
+#define	ACCEPT                	98	/* 0x62 Accepts the current matched string. */
+#define	VERB                  	99	/* 0x63 Used only for the type field of verbs */
+#define	PRUNE                 	100	/* 0x64 Pattern fails at this startpoint if no-backtracking through this */
+#define	MARKPOINT             	101	/* 0x65 Push the current location for rollback by cut. */
+#define	SKIP                  	102	/* 0x66 On failure skip forward (to the mark) before retrying */
+#define	COMMIT                	103	/* 0x67 Pattern fails outright if backtracking through this */
+#define	CUTGROUP              	104	/* 0x68 On failure go to the next alternation in the group */
+#define	KEEPS                 	105	/* 0x69 $& begins here. */
+#define	LNBREAK               	106	/* 0x6a generic newline pattern */
+#define	VERTWS                	107	/* 0x6b vertical whitespace         (Perl 6) */
+#define	NVERTWS               	108	/* 0x6c not vertical whitespace     (Perl 6) */
+#define	HORIZWS               	109	/* 0x6d horizontal whitespace       (Perl 6) */
+#define	NHORIZWS              	110	/* 0x6e not horizontal whitespace   (Perl 6) */
+#define	FOLDCHAR              	111	/* 0x6f codepoint with tricky case folding properties. */
+#define	OPTIMIZED             	112	/* 0x70 Placeholder for dump. */
+#define	PSEUDO                	113	/* 0x71 Pseudo opcode for internal use. */
 	/* ------------ States ------------- */
 #define	TRIE_next             	(REGNODE_MAX + 1)	/* state for TRIE */
 #define	TRIE_next_fail        	(REGNODE_MAX + 2)	/* state for TRIE */
@@ -221,6 +223,8 @@ EXTCONST U8 PL_regkind[] = {
 	EXACT,    	/* EXACTF                 */
 	EXACT,    	/* EXACTFL                */
 	EXACT,    	/* EXACTFU                */
+	EXACT,    	/* EXACTFU_SS             */
+	EXACT,    	/* EXACTFU_NO_TRIE        */
 	EXACT,    	/* EXACTFA                */
 	NOTHING,  	/* NOTHING                */
 	NOTHING,  	/* TAIL                   */
@@ -381,6 +385,8 @@ static const U8 regarglen[] = {
 	0,                                   	/* EXACTF       */
 	0,                                   	/* EXACTFL      */
 	0,                                   	/* EXACTFU      */
+	0,                                   	/* EXACTFU_SS   */
+	0,                                   	/* EXACTFU_NO_TRIE */
 	0,                                   	/* EXACTFA      */
 	0,                                   	/* NOTHING      */
 	0,                                   	/* TAIL         */
@@ -498,6 +504,8 @@ static const char reg_off_by_arg[] = {
 	0,	/* EXACTF       */
 	0,	/* EXACTFL      */
 	0,	/* EXACTFU      */
+	0,	/* EXACTFU_SS   */
+	0,	/* EXACTFU_NO_TRIE */
 	0,	/* EXACTFA      */
 	0,	/* NOTHING      */
 	0,	/* TAIL         */
@@ -620,66 +628,68 @@ EXTCONST char * const PL_reg_name[] = {
 	"EXACTF",                	/* 0x31 */
 	"EXACTFL",               	/* 0x32 */
 	"EXACTFU",               	/* 0x33 */
-	"EXACTFA",               	/* 0x34 */
-	"NOTHING",               	/* 0x35 */
-	"TAIL",                  	/* 0x36 */
-	"STAR",                  	/* 0x37 */
-	"PLUS",                  	/* 0x38 */
-	"CURLY",                 	/* 0x39 */
-	"CURLYN",                	/* 0x3a */
-	"CURLYM",                	/* 0x3b */
-	"CURLYX",                	/* 0x3c */
-	"WHILEM",                	/* 0x3d */
-	"OPEN",                  	/* 0x3e */
-	"CLOSE",                 	/* 0x3f */
-	"REF",                   	/* 0x40 */
-	"REFF",                  	/* 0x41 */
-	"REFFL",                 	/* 0x42 */
-	"REFFU",                 	/* 0x43 */
-	"REFFA",                 	/* 0x44 */
-	"NREF",                  	/* 0x45 */
-	"NREFF",                 	/* 0x46 */
-	"NREFFL",                	/* 0x47 */
-	"NREFFU",                	/* 0x48 */
-	"NREFFA",                	/* 0x49 */
-	"IFMATCH",               	/* 0x4a */
-	"UNLESSM",               	/* 0x4b */
-	"SUSPEND",               	/* 0x4c */
-	"IFTHEN",                	/* 0x4d */
-	"GROUPP",                	/* 0x4e */
-	"LONGJMP",               	/* 0x4f */
-	"BRANCHJ",               	/* 0x50 */
-	"EVAL",                  	/* 0x51 */
-	"MINMOD",                	/* 0x52 */
-	"LOGICAL",               	/* 0x53 */
-	"RENUM",                 	/* 0x54 */
-	"TRIE",                  	/* 0x55 */
-	"TRIEC",                 	/* 0x56 */
-	"AHOCORASICK",           	/* 0x57 */
-	"AHOCORASICKC",          	/* 0x58 */
-	"GOSUB",                 	/* 0x59 */
-	"GOSTART",               	/* 0x5a */
-	"NGROUPP",               	/* 0x5b */
-	"INSUBP",                	/* 0x5c */
-	"DEFINEP",               	/* 0x5d */
-	"ENDLIKE",               	/* 0x5e */
-	"OPFAIL",                	/* 0x5f */
-	"ACCEPT",                	/* 0x60 */
-	"VERB",                  	/* 0x61 */
-	"PRUNE",                 	/* 0x62 */
-	"MARKPOINT",             	/* 0x63 */
-	"SKIP",                  	/* 0x64 */
-	"COMMIT",                	/* 0x65 */
-	"CUTGROUP",              	/* 0x66 */
-	"KEEPS",                 	/* 0x67 */
-	"LNBREAK",               	/* 0x68 */
-	"VERTWS",                	/* 0x69 */
-	"NVERTWS",               	/* 0x6a */
-	"HORIZWS",               	/* 0x6b */
-	"NHORIZWS",              	/* 0x6c */
-	"FOLDCHAR",              	/* 0x6d */
-	"OPTIMIZED",             	/* 0x6e */
-	"PSEUDO",                	/* 0x6f */
+	"EXACTFU_SS",            	/* 0x34 */
+	"EXACTFU_NO_TRIE",       	/* 0x35 */
+	"EXACTFA",               	/* 0x36 */
+	"NOTHING",               	/* 0x37 */
+	"TAIL",                  	/* 0x38 */
+	"STAR",                  	/* 0x39 */
+	"PLUS",                  	/* 0x3a */
+	"CURLY",                 	/* 0x3b */
+	"CURLYN",                	/* 0x3c */
+	"CURLYM",                	/* 0x3d */
+	"CURLYX",                	/* 0x3e */
+	"WHILEM",                	/* 0x3f */
+	"OPEN",                  	/* 0x40 */
+	"CLOSE",                 	/* 0x41 */
+	"REF",                   	/* 0x42 */
+	"REFF",                  	/* 0x43 */
+	"REFFL",                 	/* 0x44 */
+	"REFFU",                 	/* 0x45 */
+	"REFFA",                 	/* 0x46 */
+	"NREF",                  	/* 0x47 */
+	"NREFF",                 	/* 0x48 */
+	"NREFFL",                	/* 0x49 */
+	"NREFFU",                	/* 0x4a */
+	"NREFFA",                	/* 0x4b */
+	"IFMATCH",               	/* 0x4c */
+	"UNLESSM",               	/* 0x4d */
+	"SUSPEND",               	/* 0x4e */
+	"IFTHEN",                	/* 0x4f */
+	"GROUPP",                	/* 0x50 */
+	"LONGJMP",               	/* 0x51 */
+	"BRANCHJ",               	/* 0x52 */
+	"EVAL",                  	/* 0x53 */
+	"MINMOD",                	/* 0x54 */
+	"LOGICAL",               	/* 0x55 */
+	"RENUM",                 	/* 0x56 */
+	"TRIE",                  	/* 0x57 */
+	"TRIEC",                 	/* 0x58 */
+	"AHOCORASICK",           	/* 0x59 */
+	"AHOCORASICKC",          	/* 0x5a */
+	"GOSUB",                 	/* 0x5b */
+	"GOSTART",               	/* 0x5c */
+	"NGROUPP",               	/* 0x5d */
+	"INSUBP",                	/* 0x5e */
+	"DEFINEP",               	/* 0x5f */
+	"ENDLIKE",               	/* 0x60 */
+	"OPFAIL",                	/* 0x61 */
+	"ACCEPT",                	/* 0x62 */
+	"VERB",                  	/* 0x63 */
+	"PRUNE",                 	/* 0x64 */
+	"MARKPOINT",             	/* 0x65 */
+	"SKIP",                  	/* 0x66 */
+	"COMMIT",                	/* 0x67 */
+	"CUTGROUP",              	/* 0x68 */
+	"KEEPS",                 	/* 0x69 */
+	"LNBREAK",               	/* 0x6a */
+	"VERTWS",                	/* 0x6b */
+	"NVERTWS",               	/* 0x6c */
+	"HORIZWS",               	/* 0x6d */
+	"NHORIZWS",              	/* 0x6e */
+	"FOLDCHAR",              	/* 0x6f */
+	"OPTIMIZED",             	/* 0x70 */
+	"PSEUDO",                	/* 0x71 */
 	/* ------------ States ------------- */
 	"TRIE_next",             	/* REGNODE_MAX +0x01 */
 	"TRIE_next_fail",        	/* REGNODE_MAX +0x02 */
@@ -784,7 +794,7 @@ EXTCONST U8 PL_varies[] __attribute__deprecated__ = {
 EXTCONST U8 PL_varies_bitmask[];
 #else
 EXTCONST U8 PL_varies_bitmask[] = {
-    0x00, 0x00, 0x40, 0x00, 0x00, 0xE0, 0x80, 0x3F, 0xFF, 0x33, 0x01, 0x00, 0x00, 0x00
+    0x00, 0x00, 0x40, 0x00, 0x00, 0xE0, 0x00, 0xFE, 0xFC, 0xCF, 0x04, 0x00, 0x00, 0x00, 0x00
 };
 #endif /* DOINIT */
 
@@ -808,7 +818,7 @@ EXTCONST U8 PL_simple[] __attribute__deprecated__ = {
 EXTCONST U8 PL_simple_bitmask[];
 #else
 EXTCONST U8 PL_simple_bitmask[] = {
-    0x00, 0x00, 0xBC, 0xFF, 0xFF, 0x1F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1E
+    0x00, 0x00, 0xBC, 0xFF, 0xFF, 0x1F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x78, 0x00
 };
 #endif /* DOINIT */
 
diff --git a/t/re/re_tests b/t/re/re_tests
index 84791cf15a..33a2fee148 100644
--- a/t/re/re_tests
+++ b/t/re/re_tests
@@ -1560,8 +1560,8 @@ abc\N{def	-	c	-	\\N{NAME} must be resolved by the lexer
 
 # Was matching 'ss' only and failing the entire match, not seeing the
 # alternative that would succeed
-/s\xDF/ui	\xDFs	yT	$&	\xDFs
-/sst/i	s\N{LATIN SMALL LIGATURE ST}	yT	$&	s\N{LATIN SMALL LIGATURE ST}
-/sst/i	s\N{LATIN SMALL LIGATURE LONG S T}	yT	$&	s\N{LATIN SMALL LIGATURE LONG S T}
+/s\xDF/ui	\xDFs	y	$&	\xDFs
+/sst/i	s\N{LATIN SMALL LIGATURE ST}	y	$&	s\N{LATIN SMALL LIGATURE ST}
+/sst/i	s\N{LATIN SMALL LIGATURE LONG S T}	y	$&	s\N{LATIN SMALL LIGATURE LONG S T}
 
 # vim: softtabstop=0 noexpandtab
author	Karl Williamson <public@khwilliamson.com>	2011-12-22 21:55:09 -0700
committer	Karl Williamson <public@khwilliamson.com>	2011-12-22 21:55:09 -0700
commit	e17746f4b72a1a3dbaa579c15d3feaf7d58232de (patch)
tree	34185f4c91afce18385a99dce24cf70f5ffb625d
parent	9b29c3f73ae0922b17ad298dde855b933a4bfee0 (diff)
download	perl-smoke-me/khw-tricky.tar.gz