summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKarl Williamson <public@khwilliamson.com>2011-12-22 21:55:09 -0700
committerKarl Williamson <public@khwilliamson.com>2011-12-22 21:55:09 -0700
commite17746f4b72a1a3dbaa579c15d3feaf7d58232de (patch)
tree34185f4c91afce18385a99dce24cf70f5ffb625d
parent9b29c3f73ae0922b17ad298dde855b933a4bfee0 (diff)
downloadperl-smoke-me/khw-tricky.tar.gz
temp commit for smokessmoke-me/khw-tricky
-rw-r--r--embed.fnc2
-rw-r--r--pp_hot.c8
-rw-r--r--proto.h4
-rw-r--r--regcomp.c535
-rw-r--r--regcomp.h1
-rw-r--r--regcomp.sym6
-rw-r--r--regexec.c48
-rw-r--r--regnodes.h260
-rw-r--r--t/re/re_tests6
9 files changed, 391 insertions, 479 deletions
diff --git a/embed.fnc b/embed.fnc
index 3b81d3fc28..f3e7cf63e0 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1889,7 +1889,7 @@ Es |void |regtail |NN struct RExC_state_t *pRExC_state \
Es |SV * |reg_scan_name |NN struct RExC_state_t *pRExC_state \
|U32 flags
Es |U32 |join_exact |NN struct RExC_state_t *pRExC_state \
- |NN regnode *scan|NN I32 *min|U32 flags|NULLOK regnode *val|U32 depth
+ |NN regnode *scan|NN I32 *min_change|U32 flags|NULLOK regnode *val|U32 depth
EsRn |char * |regwhite |NN struct RExC_state_t *pRExC_state \
|NN char *p
Es |char * |nextchar |NN struct RExC_state_t *pRExC_state
diff --git a/pp_hot.c b/pp_hot.c
index a2d6f9140e..a3edfa9891 100644
--- a/pp_hot.c
+++ b/pp_hot.c
@@ -1290,8 +1290,10 @@ PP(pp_match)
rx = PM_GETRE(pm);
}
- if (RX_MINLEN(rx) > (I32)len)
+ if (RX_MINLEN(rx) > (I32)len) {
+ DEBUG_r(PerlIO_printf(Perl_debug_log, "Regex match must fail due to min length, so not tried\n"));
goto failure;
+ }
truebase = t = s;
@@ -1330,8 +1332,10 @@ PP(pp_match)
play_it_again:
if (global && RX_OFFS(rx)[0].start != -1) {
t = s = RX_OFFS(rx)[0].end + truebase - RX_GOFS(rx);
- if ((s + RX_MINLEN(rx)) > strend || s < truebase)
+ if ((s + RX_MINLEN(rx)) > strend || s < truebase) {
+ DEBUG_r(PerlIO_printf(Perl_debug_log, "Regex match must fail, so not tried\n"));
goto nope;
+ }
if (update_minmatch++)
minmatch = had_zerolen;
}
diff --git a/proto.h b/proto.h
index 60f191aa73..e0db5ca2ab 100644
--- a/proto.h
+++ b/proto.h
@@ -6350,12 +6350,12 @@ PERL_STATIC_INLINE void S_invlist_trim(pTHX_ SV* const invlist)
#define PERL_ARGS_ASSERT_INVLIST_TRIM \
assert(invlist)
-STATIC U32 S_join_exact(pTHX_ struct RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags, regnode *val, U32 depth)
+STATIC U32 S_join_exact(pTHX_ struct RExC_state_t *pRExC_state, regnode *scan, I32 *min_change, U32 flags, regnode *val, U32 depth)
__attribute__nonnull__(pTHX_1)
__attribute__nonnull__(pTHX_2)
__attribute__nonnull__(pTHX_3);
#define PERL_ARGS_ASSERT_JOIN_EXACT \
- assert(pRExC_state); assert(scan); assert(min)
+ assert(pRExC_state); assert(scan); assert(min_change)
STATIC I32 S_make_trie(pTHX_ struct RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
__attribute__nonnull__(pTHX_1)
diff --git a/regcomp.c b/regcomp.c
index 68b9e04d06..80286fb8e6 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -2505,21 +2505,39 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode
}});
+/* The below joins as many adjacent EXACTish nodes as possible into a single
+ * one, and looks for problematic sequences of characters whose folds vs.
+ * non-folds have sufficiently different lengths, that the optimizer would be
+ * fooled into rejecting legitimate matches of them, and the trie construction
+ * code can't cope with them. The joining is only done if:
+ * 1) there is room in the current conglomerated node to entirely contain the
+ * next one.
+ * 2) they are the exact same node type
+ *
+ * The adjacent nodes actually may be separated by NOTHING kind nodes.
+ *
+ * If there are problematic code sequences, *min_change is set to the delta
+ * that the minimum size of the node can be off from its actual size.
+ *
+ * And, the node type of the result is changed to reflect that it contains
+ * these sequences
+ */
-
-
-#define JOIN_EXACT(scan,min,flags) \
+#define JOIN_EXACT(scan,min_change,flags) \
if (PL_regkind[OP(scan)] == EXACT) \
- join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
+ join_exact(pRExC_state,(scan),(min_change),(flags),NULL,depth+1)
STATIC U32
-S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
+S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min_change, U32 flags,regnode *val, U32 depth) {
/* Merge several consecutive EXACTish nodes into one. */
regnode *n = regnext(scan);
U32 stringok = 1;
regnode *next = scan + NODE_SZ_STR(scan);
U32 merged = 0;
U32 stopnow = 0;
+ char *s, *t;
+ char * const s0 = STRING(scan);
+ char * const s_end = s0 + STR_LEN(scan);
#ifdef DEBUGGING
regnode *stop = scan;
GET_RE_DEBUG_FLAGS_DECL;
@@ -2533,13 +2551,20 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags
PERL_UNUSED_ARG(val);
#endif
DEBUG_PEEP("join",scan,depth);
+
+ /* These opcodes should only be on output from this routine, never on input
+ */
+ assert(OP(scan) != EXACTFU_NO_TRIE);
+ assert(OP(scan) != EXACTFU_SS);
- /* Skip NOTHING, merge EXACT*. */
- while (n &&
- ( PL_regkind[OP(n)] == NOTHING ||
- (stringok && (OP(n) == OP(scan))))
+ /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
+ * EXACT ones that are mergeable to the current one. */
+ while (n
+ && (PL_regkind[OP(n)] == NOTHING
+ || (stringok && OP(n) == OP(scan)))
&& NEXT_OFF(n)
- && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
+ && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
+ {
if (OP(n) == TAIL || n > next)
stringok = 0;
@@ -2563,7 +2588,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags
DEBUG_PEEP("merg",n,depth);
merged++;
- NEXT_OFF(scan) += NEXT_OFF(n);
+ NEXT_OFF(scan) += NEXT_OFF(n);
STR_LEN(scan) += STR_LEN(n);
next = n + NODE_SZ_STR(n);
/* Now we can overwrite *n : */
@@ -2588,65 +2613,136 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags
}
#endif
}
-#define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS 0x0390
-#define IOTA_D_T GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
-#define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS 0x03B0
-#define UPSILON_D_T GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
- if (UTF
- && ( OP(scan) == EXACTF || OP(scan) == EXACTFU || OP(scan) == EXACTFA)
- && ( STR_LEN(scan) >= 6 ) )
- {
- /*
- Two problematic code points in Unicode casefolding of EXACT nodes:
-
- U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
- U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
-
- which casefold to
-
- Unicode UTF-8
-
- U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
- U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
-
- This means that in case-insensitive matching (or "loose matching",
- as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
- length of the above casefolded versions) can match a target string
- of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
- This would rather mess up the minimum length computation.
-
- What we'll do is to look for the tail four bytes, and then peek
- at the preceding two bytes to see whether we need to decrease
- the minimum length by four (six minus two).
-
- Thanks to the design of UTF-8, there cannot be false matches:
- A sequence of valid UTF-8 bytes cannot be a subsequence of
- another valid sequence of UTF-8 bytes.
-
- */
- char * const s0 = STRING(scan), *s, *t;
- char * const s1 = s0 + STR_LEN(scan) - 1;
- char * const s2 = s1 - 4;
+ *min_change = 0;
+
+ /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
+ * can now analyze for sequences of problematic code points. (Prior to
+ * this final joining, sequences could have been split over boundaries, and
+ * hence missed). The sequences only happen in folding */
+ if (OP(scan) != EXACT) {
+
+ /* There are three code points in Unicode whose folded lengths differ so
+ * much from the un-folded lengths that it causes problems for the
+ * optimizer and trie construction. Why only these are problematic, and
+ * not others is something I (khw) do not understand. And new versions of
+ * Unicode might add more such code points. Hopefully the logic in
+ * fold_grind.t that figures out what to test (in part by verifying that
+ * each size-combination gets tested) will catch any that do come along, so
+ * they can be added to the special handling below. The chances of this
+ * are actually rather small, as most, if not all, of the scripts that have
+ * casefolding have already been encoded by Unicode, as well as those from
+ * pre-existing standards that Unicode has encoded for backwards
+ * compatibility, which would be the new ones that might have enough
+ * weirdness to qualify for this */
+
+ /* First we look at the sequences that can occur only in UTF-8 strings.
+ * The sequences are of length 6 */
+ if (UTF && STR_LEN(scan) >= 6) {
+
+ /* Two problematic code points in Unicode casefolding of EXACT
+ * nodes:
+ *
+ * U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
+ * U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
+ *
+ * which casefold to
+ *
+ * Unicode UTF-8
+ *
+ * U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
+ * U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
+ *
+ * This means that in case-insensitive matching (or "loose
+ * matching", as Unicode calls it), an EXACTF of length six (the
+ * UTF-8 encoded byte length of the above casefolded versions) can
+ * match a target string of length two (the byte length of UTF-8
+ * encoded U+0390 or U+03B0). This would rather mess up the
+ * minimum length computation. (there are other code points that
+ * also fold to these two sequences, but the delta is smaller)
+ *
+ * What we'll do is to look for the tail four bytes, and then peek
+ * at the preceding two bytes to see whether we need to decrease
+ * the minimum length by four (six minus two).
+ *
+ * Thanks to the design of UTF-8, there cannot be false matches:
+ * A sequence of valid UTF-8 bytes cannot be a subsequence of
+ * another valid sequence of UTF-8 bytes. */
+
#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
- const char t0[] = "\xaf\x49\xaf\x42";
-#else
- const char t0[] = "\xcc\x88\xcc\x81";
-#endif
- const char * const t1 = t0 + 3;
-
- for (s = s0 + 2;
- s < s2 && (t = ninstr(s, s1, t0, t1));
- s = t + 4) {
-#ifdef EBCDIC
- if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
- ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
+ const char U390_first_byte = '\xb4';
+ const char U390_2nd_byte = '\x68';
+ const char U3B0_first_byte = '\xb5';
+ const char U3B0_2nd_byte = '\x46';
+ const char tail[] = "\xaf\x49\xaf\x42";
#else
- if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
- ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
+ const char U390_first_byte = '\xce';
+ const char U390_2nd_byte = '\xb9';
+ const char U3B0_first_byte = '\xcf';
+ const char U3B0_2nd_byte = '\x85';
+ const char tail[] = "\xcc\x88\xcc\x81";
#endif
- *min -= 4;
- }
+ const STRLEN tail_len = sizeof(tail) - 1;
+ for (s = s0 + 2;
+ s <= s_end - tail_len
+ && (t = ninstr(s, s_end, tail, tail + tail_len));
+ s = t + tail_len)
+ {
+ if ((t[-1] == U390_2nd_byte && t[-2] == U390_first_byte)
+ || (t[-1] == U3B0_2nd_byte && t[-2] == U3B0_first_byte))
+ {
+ *min_change -= 4;
+
+ /* This can't currently be handled by tries, so change the
+ * node type to indicate this. */
+ if (OP(scan) == EXACTFU) {
+ OP(scan) = EXACTFU_NO_TRIE;
+ }
+ }
+ }
+ }
+
+ /* The third problematic sequence is 'ss', which can match just the
+ * single byte LATIN SMALL LETTER SHARP S, and it can do it in both
+ * non- and UTF-8. Code elsewhere in this file makes sure, however,
+ * that the sharp s gets folded to 'ss' under Unicode rules even if not
+ * UTF-8. */
+ if (STR_LEN(scan) >= 2
+ && (OP(scan) == EXACTFU
+ || OP(scan) == EXACTFU_NO_TRIE /* The code above could have
+ set to this node type */
+ || OP(scan) == EXACTF))
+ {
+ /* The string will be folded to 'ss' if it's in UTF-8, but it could
+ * be 'Ss', etc when not. We could have different code to handle
+ * the two cases, but this is not necessary since both S and s are
+ * invariants under UTF-8; and not worth it, especially because we
+ * can use just one test each time through the loop (plus a mask).
+ * This is because on both EBCDIC and ASCII machines, an 'S' and 's'
+ * differ by a single bit. On ASCII they are 32 apart; on EBCDIC,
+ * they are 64. This uses an exclusive 'or' to find that bit and
+ * then inverts it to form a mask, with just a single 0, in the bit
+ * position where 'S' and 's' differ. */
+ const char S_or_s_mask = ~ ('S' ^ 's');
+ const char s_masked = 's' & S_or_s_mask;
+
+ for (s = s0; s < s_end - 1; s++) {
+ if (((*s & S_or_s_mask) == s_masked)
+ && ((*(s+1) & S_or_s_mask) == s_masked))
+ {
+ s++;
+ *min_change -= 1;
+
+ /* EXACTFU_SS also isn't trie'able, so don't have to
+ * preserve EXACTFU_NO_TRIE. EXACTF is also not trie'able,
+ * and because we essentially punt the optimizations in its
+ * case, we don't need to indicate that it has an ss */
+ if (OP(scan) == EXACTFU || OP(scan) == EXACTFU_NO_TRIE) {
+ OP(scan) = EXACTFU_SS;
+ }
+ }
+ }
+ }
}
#ifdef DEBUGGING
@@ -2762,10 +2858,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
fake_study_recurse:
while ( scan && OP(scan) != END && scan < last ){
+ I32 min_change;
/* Peephole optimizer: */
DEBUG_STUDYDATA("Peep:", data,depth);
DEBUG_PEEP("Peep",scan,depth);
- JOIN_EXACT(scan,&min,0);
+ JOIN_EXACT(scan,&min_change,0);
/* Follow the next-chain of the current node and optimize
away all the NOTHINGs from it. */
@@ -3059,8 +3156,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
in below to fully enable trie logic.
#define TRIE_TYPE_IS_SAFE 1
-
*/
+
#define TRIE_TYPE_IS_SAFE ((UTF && optype == EXACTFU) || optype==EXACT)
if ( last && TRIE_TYPE_IS_SAFE ) {
@@ -3279,9 +3376,23 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
l = utf8_length(s, s + l);
uc = utf8_to_uvchr(s, NULL);
}
- min += l;
- if (flags & SCF_DO_SUBSTR)
+ else if (OP(scan) == EXACTF) {
+ if (memchr(STRING(scan), LATIN_SMALL_LETTER_SHARP_S, l)) {
+ RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
+ }
+ }
+ min += l + min_change;
+ if (min < 0) {
+ min = 0;
+ }
+ delta += abs(min_change);
+ if (flags & SCF_DO_SUBSTR) {
data->pos_min += l;
+ data->pos_delta += abs(min_change);
+ if (min_change) {
+ data->longest = &(data->longest_float);
+ }
+ }
if (flags & SCF_DO_STCLASS_AND) {
/* Check whether it is compatible with what we know already! */
int compat = 1;
@@ -3311,6 +3422,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
* the full latin1 fold. (Can't do this for locale,
* because not known until runtime */
ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
+ if (uc == 's' || uc == 'S') {
+ ANYOF_BITMAP_SET(data->start_class, LATIN_SMALL_LETTER_SHARP_S);
+ }
+ else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
+ ANYOF_BITMAP_SET(data->start_class, 's');
+ ANYOF_BITMAP_SET(data->start_class, 'S');
+ }
}
}
else if (uc >= 0x100) {
@@ -3335,6 +3453,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
* run-time */
ANYOF_BITMAP_SET(data->start_class,
PL_fold_latin1[uc]);
+ if (uc == 's' || uc == 'S') {
+ ANYOF_BITMAP_SET(data->start_class, LATIN_SMALL_LETTER_SHARP_S);
+ }
+ else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
+ ANYOF_BITMAP_SET(data->start_class, 's');
+ ANYOF_BITMAP_SET(data->start_class, 'S');
+ }
}
}
data->start_class->flags &= ~ANYOF_EOS;
@@ -3740,18 +3865,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
data->longest = &(data->longest_float);
}
}
- else if (OP(scan) == FOLDCHAR) {
- int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2;
- flags &= ~SCF_DO_STCLASS;
- min += 1;
- delta += d;
- if (flags & SCF_DO_SUBSTR) {
- SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
- data->pos_min += 1;
- data->pos_delta += d;
- data->longest = &(data->longest_float);
- }
- }
else if (REGNODE_SIMPLE(OP(scan))) {
int value = 0;
@@ -5067,9 +5180,10 @@ reStudy:
{
I32 t,ml;
- if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
- && data.offset_fixed == data.offset_float_min
- && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
+ if ((RExC_seen & REG_SEEN_EXACTF_SHARP_S)
+ || (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
+ && data.offset_fixed == data.offset_float_min
+ && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
goto remove_float; /* As in (a)+. */
/* copy the information about the longest float from the reg_scan_data
@@ -5112,10 +5226,11 @@ reStudy:
Be careful.
*/
longest_fixed_length = CHR_SVLEN(data.longest_fixed);
- if (longest_fixed_length
- || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
- && (!(data.flags & SF_FIX_BEFORE_MEOL)
- || (RExC_flags & RXf_PMf_MULTILINE))))
+ if (! (RExC_seen & REG_SEEN_EXACTF_SHARP_S)
+ && (longest_fixed_length
+ || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
+ && (!(data.flags & SF_FIX_BEFORE_MEOL)
+ || (RExC_flags & RXf_PMf_MULTILINE)))) )
{
I32 t,ml;
@@ -8820,15 +8935,6 @@ tryagain:
RExC_parse++;
defchar: {
- typedef enum {
- generic_char = 0,
- char_s,
- upsilon_1,
- upsilon_2,
- iota_1,
- iota_2,
- } char_state;
- char_state latest_char_state = generic_char;
register STRLEN len;
register UV ender;
register char *p;
@@ -8836,22 +8942,25 @@ tryagain:
STRLEN foldlen;
U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
regnode * orig_emit;
+ int node_type;
+ bool is_exactfu_sharp_s;
ender = 0;
orig_emit = RExC_emit; /* Save the original output node position in
case we need to output a different node
type */
- ret = reg_node(pRExC_state,
- (U8) ((! FOLD) ? EXACT
+ node_type = (U8) ((! FOLD) ? EXACT
: (LOC)
? EXACTFL
: (MORE_ASCII_RESTRICTED)
? EXACTFA
: (AT_LEAST_UNI_SEMANTICS)
? EXACTFU
- : EXACTF)
- );
+ : EXACTF);
+ ret = reg_node(pRExC_state, node_type);
s = STRING(ret);
+
+ /* By going only up to 127 when the maximum storable is 255, we don't have to worry about expansion, not being in the last character in the fold */
for (len = 0, p = RExC_parse - 1;
len < 127 && p < RExC_end;
len++)
@@ -9047,219 +9156,10 @@ tryagain:
break;
} /* End of switch on the literal */
- /* Certain characters are problematic because their folded
- * length is so different from their original length that it
- * isn't handleable by the optimizer. They are therefore not
- * placed in an EXACTish node; and are here handled specially.
- * (Even if the optimizer handled LATIN_SMALL_LETTER_SHARP_S,
- * putting it in a special node keeps regexec from having to
- * deal with a non-utf8 multi-char fold */
- if (FOLD
- && (ender > 255 || (! MORE_ASCII_RESTRICTED && ! LOC)))
- {
- /* We look for either side of the fold. For example \xDF
- * folds to 'ss'. We look for both the single character
- * \xDF and the sequence 'ss'. When we find something that
- * could be one of those, we stop and flush whatever we
- * have output so far into the EXACTish node that was being
- * built. Then restore the input pointer to what it was.
- * regatom will return that EXACT node, and will be called
- * again, positioned so the first character is the one in
- * question, which we return in a different node type.
- * The multi-char folds are a sequence, so the occurrence
- * of the first character in that sequence doesn't
- * necessarily mean that what follows is the rest of the
- * sequence. We keep track of that with a state machine,
- * with the state being set to the latest character
- * processed before the current one. Most characters will
- * set the state to 0, but if one occurs that is part of a
- * potential tricky fold sequence, the state is set to that
- * character, and the next loop iteration sees if the state
- * should progress towards the final folded-from character,
- * or if it was a false alarm. If it turns out to be a
- * false alarm, the character(s) will be output in a new
- * EXACTish node, and join_exact() will later combine them.
- * In the case of the 'ss' sequence, which is more common
- * and more easily checked, some look-ahead is done to
- * save time by ruling-out some false alarms */
- switch (ender) {
- default:
- latest_char_state = generic_char;
- break;
- case 's':
- case 'S':
- case 0x17F: /* LATIN SMALL LETTER LONG S */
- if (AT_LEAST_UNI_SEMANTICS) {
- if (latest_char_state == char_s) { /* 'ss' */
- ender = LATIN_SMALL_LETTER_SHARP_S;
- goto do_tricky;
- }
- else if (p < RExC_end) {
-
- /* Look-ahead at the next character. If it
- * is also an s, we handle as a sharp s
- * tricky regnode. */
- if (*p == 's' || *p == 'S') {
-
- /* But first flush anything in the
- * EXACTish buffer */
- if (len != 0) {
- p = oldp;
- goto loopdone;
- }
- p++; /* Account for swallowing this
- 's' up */
- ender = LATIN_SMALL_LETTER_SHARP_S;
- goto do_tricky;
- }
- /* Here, the next character is not a
- * literal 's', but still could
- * evaluate to one if part of a \o{},
- * \x or \OCTAL-DIGIT. The minimum
- * length required for that is 4, eg
- * \x53 or \123 */
- else if (*p == '\\'
- && p < RExC_end - 4
- && (isDIGIT(*(p + 1))
- || *(p + 1) == 'x'
- || *(p + 1) == 'o' ))
- {
-
- /* Here, it could be an 's', too much
- * bother to figure it out here. Flush
- * the buffer if any; when come back
- * here, set the state so know that the
- * previous char was an 's' */
- if (len != 0) {
- latest_char_state = generic_char;
- p = oldp;
- goto loopdone;
- }
- latest_char_state = char_s;
- break;
- }
- }
- }
-
- /* Here, can't be an 'ss' sequence, or at least not
- * one that could fold to/from the sharp ss */
- latest_char_state = generic_char;
- break;
- case 0x03C5: /* First char in upsilon series */
- case 0x03A5: /* Also capital UPSILON, which folds to
- 03C5, and hence exhibits the same
- problem */
- if (p < RExC_end - 4) { /* Need >= 4 bytes left */
- latest_char_state = upsilon_1;
- if (len != 0) {
- p = oldp;
- goto loopdone;
- }
- }
- else {
- latest_char_state = generic_char;
- }
- break;
- case 0x03B9: /* First char in iota series */
- case 0x0399: /* Also capital IOTA */
- case 0x1FBE: /* GREEK PROSGEGRAMMENI folds to 3B9 */
- case 0x0345: /* COMBINING GREEK YPOGEGRAMMENI folds
- to 3B9 */
- if (p < RExC_end - 4) {
- latest_char_state = iota_1;
- if (len != 0) {
- p = oldp;
- goto loopdone;
- }
- }
- else {
- latest_char_state = generic_char;
- }
- break;
- case 0x0308:
- if (latest_char_state == upsilon_1) {
- latest_char_state = upsilon_2;
- }
- else if (latest_char_state == iota_1) {
- latest_char_state = iota_2;
- }
- else {
- latest_char_state = generic_char;
- }
- break;
- case 0x301:
- if (latest_char_state == upsilon_2) {
- ender = GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS;
- goto do_tricky;
- }
- else if (latest_char_state == iota_2) {
- ender = GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS;
- goto do_tricky;
- }
- latest_char_state = generic_char;
- break;
-
- /* These are the tricky fold characters. Flush any
- * buffer first. (When adding to this list, also should
- * add them to fold_grind.t to make sure get tested) */
- case GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS:
- case GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS:
- case LATIN_SMALL_LETTER_SHARP_S:
- case LATIN_CAPITAL_LETTER_SHARP_S:
- case 0x1FD3: /* GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA */
- case 0x1FE3: /* GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA */
- if (len != 0) {
- p = oldp;
- goto loopdone;
- }
- /* FALL THROUGH */
- do_tricky: {
- char* const oldregxend = RExC_end;
- U8 tmpbuf[UTF8_MAXBYTES+1];
-
- /* Here, we know we need to generate a special
- * regnode, and 'ender' contains the tricky
- * character. What's done is to pretend it's in a
- * [bracketed] class, and let the code that deals
- * with those handle it, as that code has all the
- * intelligence necessary. First save the current
- * parse state, get rid of the already allocated
- * but empty EXACT node that the ANYOFV node will
- * replace, and point the parse to a buffer which
- * we fill with the character we want the regclass
- * code to think is being parsed */
- RExC_emit = orig_emit;
- RExC_parse = (char *) tmpbuf;
- if (UTF) {
- U8 *d = uvchr_to_utf8(tmpbuf, ender);
- *d = '\0';
- RExC_end = (char *) d;
- }
- else { /* ender above 255 already excluded */
- tmpbuf[0] = (U8) ender;
- tmpbuf[1] = '\0';
- RExC_end = RExC_parse + 1;
- }
-
- ret = regclass(pRExC_state,depth+1);
-
- /* Here, have parsed the buffer. Reset the parse to
- * the actual input, and return */
- RExC_end = oldregxend;
- RExC_parse = p - 1;
-
- Set_Node_Offset(ret, RExC_parse);
- Set_Node_Cur_Length(ret);
- nextchar(pRExC_state);
- *flagp |= HASWIDTH|SIMPLE;
- return ret;
- }
- }
- }
-
+ is_exactfu_sharp_s = (node_type == EXACTFU && ender == LATIN_SMALL_LETTER_SHARP_S);
if ( RExC_flags & RXf_PMf_EXTENDED)
p = regwhite( pRExC_state, p );
- if (UTF && FOLD) {
+ if ((UTF && FOLD) || is_exactfu_sharp_s) {
/* Prime the casefolded buffer. Locale rules, which apply
* only to code points < 256, aren't known until execution,
* so for them, just output the original character using
@@ -9322,7 +9222,7 @@ tryagain:
if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
if (len)
p = oldp;
- else if (UTF) {
+ else if (UTF || is_exactfu_sharp_s) {
if (FOLD) {
/* Emit all the Unicode characters. */
STRLEN numlen;
@@ -9358,7 +9258,7 @@ tryagain:
}
break;
}
- if (UTF) {
+ if (UTF || is_exactfu_sharp_s) {
if (FOLD) {
/* Emit all the Unicode characters. */
STRLEN numlen;
@@ -11188,6 +11088,7 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,
regnode * const temp = regnext(scan);
#ifdef EXPERIMENTAL_INPLACESCAN
if (PL_regkind[OP(scan)] == EXACT)
+
if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
return EXACT;
#endif
@@ -11197,6 +11098,8 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,
case EXACTF:
case EXACTFA:
case EXACTFU:
+ case EXACTFU_SS:
+ case EXACTFU_NO_TRIE:
case EXACTFL:
if( exact == PSEUDO )
exact= OP(scan);
@@ -11521,8 +11424,6 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
} else if (k == LOGICAL)
Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
- else if (k == FOLDCHAR)
- Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
else if (k == ANYOF) {
int i, rangestart = -1;
const U8 flags = ANYOF_FLAGS(o);
diff --git a/regcomp.h b/regcomp.h
index 81c8a5ddd7..502674c088 100644
--- a/regcomp.h
+++ b/regcomp.h
@@ -492,6 +492,7 @@ struct regnode_charclass_class {
#define REG_SEEN_VERBARG 0x00000080
#define REG_SEEN_CUTGROUP 0x00000100
#define REG_SEEN_RUN_ON_COMMENT 0x00000200
+#define REG_SEEN_EXACTF_SHARP_S 0x00000400
START_EXTERN_C
diff --git a/regcomp.sym b/regcomp.sym
index 23b9ef2181..69366d7e87 100644
--- a/regcomp.sym
+++ b/regcomp.sym
@@ -97,7 +97,9 @@ BACK BACK, no 0 V ; Match "", "next" ptr points backward.
EXACT EXACT, str ; Match this string (preceded by length).
EXACTF EXACT, str ; Match this non-UTF-8 string (not guaranteed to be folded) using /id rules (w/len).
EXACTFL EXACT, str ; Match this string (not guaranteed to be folded) using /il rules (w/len).
-EXACTFU EXACT, str ; Match this string (folded iff in UTF-8) using /iu rules (w/len).
+EXACTFU EXACT, str ; Match this string (folded iff in UTF-8, length in folding doesn't change if not in UTF-8) using /iu rules (w/len).
+EXACTFU_SS EXACT, str ; Match this string (folded iff in UTF-8, length in folding may change even if not in UTF-8) using /iu rules (w/len).
+EXACTFU_NO_TRIE EXACT, str ; Match this folded UTF-8 string using /iu rules, but don't generate a trie for it
EXACTFA EXACT, str ; Match this string (not guaranteed to be folded) using /iaa rules (w/len).
#*Do nothing types
@@ -214,10 +216,8 @@ VERTWS VERTWS, none 0 S ; vertical whitespace (Perl 6)
NVERTWS NVERTWS, none 0 S ; not vertical whitespace (Perl 6)
HORIZWS HORIZWS, none 0 S ; horizontal whitespace (Perl 6)
NHORIZWS NHORIZWS, none 0 S ; not horizontal whitespace (Perl 6)
-
FOLDCHAR FOLDCHAR, codepoint 1 ; codepoint with tricky case folding properties.
-
# NEW STUFF SOMEWHERE ABOVE THIS LINE
################################################################################
diff --git a/regexec.c b/regexec.c
index bde7027ede..0392f1b20d 100644
--- a/regexec.c
+++ b/regexec.c
@@ -303,13 +303,13 @@
/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
we don't need this definition. */
#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
-#define IS_TEXTF(rn) ( (OP(rn)==EXACTFU || OP(rn)==EXACTFA || OP(rn)==EXACTF) || OP(rn)==REFF || OP(rn)==NREFF )
+#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_NO_TRIE || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
#else
/* ... so we use this as its faster. */
#define IS_TEXT(rn) ( OP(rn)==EXACT )
-#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn) == EXACTFA)
+#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_NO_TRIE || OP(rn) == EXACTFA)
#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
@@ -1483,6 +1483,13 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
folder = foldEQ_locale;
goto do_exactf_non_utf8;
+ case EXACTFU_SS:
+ if (UTF_PATTERN) {
+ utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
+ }
+ goto do_exactf_utf8;
+
+ case EXACTFU_NO_TRIE:
case EXACTFU:
if (UTF_PATTERN || utf8_target) {
utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
@@ -3662,6 +3669,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
goto do_exactf;
+ case EXACTFU_SS:
+ case EXACTFU_NO_TRIE:
case EXACTFU:
folder = foldEQ_latin1;
fold_array = PL_fold_latin1;
@@ -3683,8 +3692,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
s = STRING(scan);
ln = STR_LEN(scan);
- if (utf8_target || UTF_PATTERN) {
- /* Either target or the pattern are utf8. */
+ if (utf8_target || UTF_PATTERN || state_num == EXACTFU_SS) {
+ /* Either target or the pattern are utf8, or has the issue where
+ * the fold lengths may differ. */
const char * const l = locinput;
char *e = PL_regeol;
@@ -5072,6 +5082,8 @@ NULL
switch (OP(text_node)) {
case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
case EXACTFA:
+ case EXACTFU_SS:
+ case EXACTFU_NO_TRIE:
case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
default: ST.c2 = ST.c1;
@@ -5226,6 +5238,8 @@ NULL
switch (OP(text_node)) {
case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
case EXACTFA:
+ case EXACTFU_SS:
+ case EXACTFU_NO_TRIE:
case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
default: ST.c2 = ST.c1; break;
@@ -5694,27 +5708,6 @@ NULL
sayNO;
/* NOTREACHED */
#undef ST
- case FOLDCHAR:
- n = ARG(scan);
- if ( n == (U32)what_len_TRICKYFOLD(locinput,utf8_target,ln) ) {
- locinput += ln;
- } else if ( LATIN_SMALL_LETTER_SHARP_S == n && !utf8_target && !UTF_PATTERN ) {
- sayNO;
- } else {
- U8 folded[UTF8_MAXBYTES_CASE+1];
- STRLEN foldlen;
- const char * const l = locinput;
- char *e = PL_regeol;
- to_uni_fold(n, folded, &foldlen);
-
- if (! foldEQ_utf8((const char*) folded, 0, foldlen, 1,
- l, &e, 0, utf8_target)) {
- sayNO;
- }
- locinput = e;
- }
- nextchr = UCHARAT(locinput);
- break;
case LNBREAK:
if ((n=is_LNBREAK(locinput,utf8_target))) {
locinput += n;
@@ -6039,6 +6032,8 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
utf8_flags = 0;
goto do_exactf;
+ case EXACTFU_SS:
+ case EXACTFU_NO_TRIE:
case EXACTFU:
utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
@@ -6049,7 +6044,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
c = (U8)*STRING(p);
assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
- if (utf8_target) { /* Use full Unicode fold matching */
+ if (utf8_target || OP(p) == EXACTFU_SS) { /* Use full Unicode fold matching */
char *tmpeol = loceol;
while (hardcount < max
&& foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
@@ -6080,6 +6075,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
switch (OP(p)) {
case EXACTF: folded = PL_fold[c]; break;
case EXACTFA:
+ case EXACTFU_NO_TRIE:
case EXACTFU: folded = PL_fold_latin1[c]; break;
case EXACTFL: folded = PL_fold_locale[c]; break;
default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
diff --git a/regnodes.h b/regnodes.h
index dccf2b7a99..785ff1c9ca 100644
--- a/regnodes.h
+++ b/regnodes.h
@@ -6,8 +6,8 @@
/* Regops and State definitions */
-#define REGNODE_MAX 111
-#define REGMATCH_STATE_MAX 151
+#define REGNODE_MAX 113
+#define REGMATCH_STATE_MAX 153
#define END 0 /* 0000 End of program. */
#define SUCCEED 1 /* 0x01 Return from a subroutine, basically. */
@@ -60,67 +60,69 @@
#define EXACT 48 /* 0x30 Match this string (preceded by length). */
#define EXACTF 49 /* 0x31 Match this non-UTF-8 string (not guaranteed to be folded) using /id rules (w/len). */
#define EXACTFL 50 /* 0x32 Match this string (not guaranteed to be folded) using /il rules (w/len). */
-#define EXACTFU 51 /* 0x33 Match this string (folded iff in UTF-8) using /iu rules (w/len). */
-#define EXACTFA 52 /* 0x34 Match this string (not guaranteed to be folded) using /iaa rules (w/len). */
-#define NOTHING 53 /* 0x35 Match empty string. */
-#define TAIL 54 /* 0x36 Match empty string. Can jump here from outside. */
-#define STAR 55 /* 0x37 Match this (simple) thing 0 or more times. */
-#define PLUS 56 /* 0x38 Match this (simple) thing 1 or more times. */
-#define CURLY 57 /* 0x39 Match this simple thing {n,m} times. */
-#define CURLYN 58 /* 0x3a Capture next-after-this simple thing */
-#define CURLYM 59 /* 0x3b Capture this medium-complex thing {n,m} times. */
-#define CURLYX 60 /* 0x3c Match this complex thing {n,m} times. */
-#define WHILEM 61 /* 0x3d Do curly processing and see if rest matches. */
-#define OPEN 62 /* 0x3e Mark this point in input as start of */
-#define CLOSE 63 /* 0x3f Analogous to OPEN. */
-#define REF 64 /* 0x40 Match some already matched string */
-#define REFF 65 /* 0x41 Match already matched string, folded using native charset semantics for non-utf8 */
-#define REFFL 66 /* 0x42 Match already matched string, folded in loc. */
-#define REFFU 67 /* 0x43 Match already matched string, folded using unicode semantics for non-utf8 */
-#define REFFA 68 /* 0x44 Match already matched string, folded using unicode semantics for non-utf8, no mixing ASCII, non-ASCII */
-#define NREF 69 /* 0x45 Match some already matched string */
-#define NREFF 70 /* 0x46 Match already matched string, folded using native charset semantics for non-utf8 */
-#define NREFFL 71 /* 0x47 Match already matched string, folded in loc. */
-#define NREFFU 72 /* 0x48 Match already matched string, folded using unicode semantics for non-utf8 */
-#define NREFFA 73 /* 0x49 Match already matched string, folded using unicode semantics for non-utf8, no mixing ASCII, non-ASCII */
-#define IFMATCH 74 /* 0x4a Succeeds if the following matches. */
-#define UNLESSM 75 /* 0x4b Fails if the following matches. */
-#define SUSPEND 76 /* 0x4c "Independent" sub-RE. */
-#define IFTHEN 77 /* 0x4d Switch, should be preceded by switcher . */
-#define GROUPP 78 /* 0x4e Whether the group matched. */
-#define LONGJMP 79 /* 0x4f Jump far away. */
-#define BRANCHJ 80 /* 0x50 BRANCH with long offset. */
-#define EVAL 81 /* 0x51 Execute some Perl code. */
-#define MINMOD 82 /* 0x52 Next operator is not greedy. */
-#define LOGICAL 83 /* 0x53 Next opcode should set the flag only. */
-#define RENUM 84 /* 0x54 Group with independently numbered parens. */
-#define TRIE 85 /* 0x55 Match many EXACT(F[ALU]?)? at once. flags==type */
-#define TRIEC 86 /* 0x56 Same as TRIE, but with embedded charclass data */
-#define AHOCORASICK 87 /* 0x57 Aho Corasick stclass. flags==type */
-#define AHOCORASICKC 88 /* 0x58 Same as AHOCORASICK, but with embedded charclass data */
-#define GOSUB 89 /* 0x59 recurse to paren arg1 at (signed) ofs arg2 */
-#define GOSTART 90 /* 0x5a recurse to start of pattern */
-#define NGROUPP 91 /* 0x5b Whether the group matched. */
-#define INSUBP 92 /* 0x5c Whether we are in a specific recurse. */
-#define DEFINEP 93 /* 0x5d Never execute directly. */
-#define ENDLIKE 94 /* 0x5e Used only for the type field of verbs */
-#define OPFAIL 95 /* 0x5f Same as (?!) */
-#define ACCEPT 96 /* 0x60 Accepts the current matched string. */
-#define VERB 97 /* 0x61 Used only for the type field of verbs */
-#define PRUNE 98 /* 0x62 Pattern fails at this startpoint if no-backtracking through this */
-#define MARKPOINT 99 /* 0x63 Push the current location for rollback by cut. */
-#define SKIP 100 /* 0x64 On failure skip forward (to the mark) before retrying */
-#define COMMIT 101 /* 0x65 Pattern fails outright if backtracking through this */
-#define CUTGROUP 102 /* 0x66 On failure go to the next alternation in the group */
-#define KEEPS 103 /* 0x67 $& begins here. */
-#define LNBREAK 104 /* 0x68 generic newline pattern */
-#define VERTWS 105 /* 0x69 vertical whitespace (Perl 6) */
-#define NVERTWS 106 /* 0x6a not vertical whitespace (Perl 6) */
-#define HORIZWS 107 /* 0x6b horizontal whitespace (Perl 6) */
-#define NHORIZWS 108 /* 0x6c not horizontal whitespace (Perl 6) */
-#define FOLDCHAR 109 /* 0x6d codepoint with tricky case folding properties. */
-#define OPTIMIZED 110 /* 0x6e Placeholder for dump. */
-#define PSEUDO 111 /* 0x6f Pseudo opcode for internal use. */
+#define EXACTFU 51 /* 0x33 Match this string (folded iff in UTF-8, length in folding doesn't change if not in UTF-8) using /iu rules (w/len). */
+#define EXACTFU_SS 52 /* 0x34 Match this string (folded iff in UTF-8, length in folding may change even if not in UTF-8) using /iu rules (w/len). */
+#define EXACTFU_NO_TRIE 53 /* 0x35 Match this folded UTF-8 string using /iu rules, but don't generate a trie for it */
+#define EXACTFA 54 /* 0x36 Match this string (not guaranteed to be folded) using /iaa rules (w/len). */
+#define NOTHING 55 /* 0x37 Match empty string. */
+#define TAIL 56 /* 0x38 Match empty string. Can jump here from outside. */
+#define STAR 57 /* 0x39 Match this (simple) thing 0 or more times. */
+#define PLUS 58 /* 0x3a Match this (simple) thing 1 or more times. */
+#define CURLY 59 /* 0x3b Match this simple thing {n,m} times. */
+#define CURLYN 60 /* 0x3c Capture next-after-this simple thing */
+#define CURLYM 61 /* 0x3d Capture this medium-complex thing {n,m} times. */
+#define CURLYX 62 /* 0x3e Match this complex thing {n,m} times. */
+#define WHILEM 63 /* 0x3f Do curly processing and see if rest matches. */
+#define OPEN 64 /* 0x40 Mark this point in input as start of */
+#define CLOSE 65 /* 0x41 Analogous to OPEN. */
+#define REF 66 /* 0x42 Match some already matched string */
+#define REFF 67 /* 0x43 Match already matched string, folded using native charset semantics for non-utf8 */
+#define REFFL 68 /* 0x44 Match already matched string, folded in loc. */
+#define REFFU 69 /* 0x45 Match already matched string, folded using unicode semantics for non-utf8 */
+#define REFFA 70 /* 0x46 Match already matched string, folded using unicode semantics for non-utf8, no mixing ASCII, non-ASCII */
+#define NREF 71 /* 0x47 Match some already matched string */
+#define NREFF 72 /* 0x48 Match already matched string, folded using native charset semantics for non-utf8 */
+#define NREFFL 73 /* 0x49 Match already matched string, folded in loc. */
+#define NREFFU 74 /* 0x4a Match already matched string, folded using unicode semantics for non-utf8 */
+#define NREFFA 75 /* 0x4b Match already matched string, folded using unicode semantics for non-utf8, no mixing ASCII, non-ASCII */
+#define IFMATCH 76 /* 0x4c Succeeds if the following matches. */
+#define UNLESSM 77 /* 0x4d Fails if the following matches. */
+#define SUSPEND 78 /* 0x4e "Independent" sub-RE. */
+#define IFTHEN 79 /* 0x4f Switch, should be preceded by switcher . */
+#define GROUPP 80 /* 0x50 Whether the group matched. */
+#define LONGJMP 81 /* 0x51 Jump far away. */
+#define BRANCHJ 82 /* 0x52 BRANCH with long offset. */
+#define EVAL 83 /* 0x53 Execute some Perl code. */
+#define MINMOD 84 /* 0x54 Next operator is not greedy. */
+#define LOGICAL 85 /* 0x55 Next opcode should set the flag only. */
+#define RENUM 86 /* 0x56 Group with independently numbered parens. */
+#define TRIE 87 /* 0x57 Match many EXACT(F[ALU]?)? at once. flags==type */
+#define TRIEC 88 /* 0x58 Same as TRIE, but with embedded charclass data */
+#define AHOCORASICK 89 /* 0x59 Aho Corasick stclass. flags==type */
+#define AHOCORASICKC 90 /* 0x5a Same as AHOCORASICK, but with embedded charclass data */
+#define GOSUB 91 /* 0x5b recurse to paren arg1 at (signed) ofs arg2 */
+#define GOSTART 92 /* 0x5c recurse to start of pattern */
+#define NGROUPP 93 /* 0x5d Whether the group matched. */
+#define INSUBP 94 /* 0x5e Whether we are in a specific recurse. */
+#define DEFINEP 95 /* 0x5f Never execute directly. */
+#define ENDLIKE 96 /* 0x60 Used only for the type field of verbs */
+#define OPFAIL 97 /* 0x61 Same as (?!) */
+#define ACCEPT 98 /* 0x62 Accepts the current matched string. */
+#define VERB 99 /* 0x63 Used only for the type field of verbs */
+#define PRUNE 100 /* 0x64 Pattern fails at this startpoint if no-backtracking through this */
+#define MARKPOINT 101 /* 0x65 Push the current location for rollback by cut. */
+#define SKIP 102 /* 0x66 On failure skip forward (to the mark) before retrying */
+#define COMMIT 103 /* 0x67 Pattern fails outright if backtracking through this */
+#define CUTGROUP 104 /* 0x68 On failure go to the next alternation in the group */
+#define KEEPS 105 /* 0x69 $& begins here. */
+#define LNBREAK 106 /* 0x6a generic newline pattern */
+#define VERTWS 107 /* 0x6b vertical whitespace (Perl 6) */
+#define NVERTWS 108 /* 0x6c not vertical whitespace (Perl 6) */
+#define HORIZWS 109 /* 0x6d horizontal whitespace (Perl 6) */
+#define NHORIZWS 110 /* 0x6e not horizontal whitespace (Perl 6) */
+#define FOLDCHAR 111 /* 0x6f codepoint with tricky case folding properties. */
+#define OPTIMIZED 112 /* 0x70 Placeholder for dump. */
+#define PSEUDO 113 /* 0x71 Pseudo opcode for internal use. */
/* ------------ States ------------- */
#define TRIE_next (REGNODE_MAX + 1) /* state for TRIE */
#define TRIE_next_fail (REGNODE_MAX + 2) /* state for TRIE */
@@ -221,6 +223,8 @@ EXTCONST U8 PL_regkind[] = {
EXACT, /* EXACTF */
EXACT, /* EXACTFL */
EXACT, /* EXACTFU */
+ EXACT, /* EXACTFU_SS */
+ EXACT, /* EXACTFU_NO_TRIE */
EXACT, /* EXACTFA */
NOTHING, /* NOTHING */
NOTHING, /* TAIL */
@@ -381,6 +385,8 @@ static const U8 regarglen[] = {
0, /* EXACTF */
0, /* EXACTFL */
0, /* EXACTFU */
+ 0, /* EXACTFU_SS */
+ 0, /* EXACTFU_NO_TRIE */
0, /* EXACTFA */
0, /* NOTHING */
0, /* TAIL */
@@ -498,6 +504,8 @@ static const char reg_off_by_arg[] = {
0, /* EXACTF */
0, /* EXACTFL */
0, /* EXACTFU */
+ 0, /* EXACTFU_SS */
+ 0, /* EXACTFU_NO_TRIE */
0, /* EXACTFA */
0, /* NOTHING */
0, /* TAIL */
@@ -620,66 +628,68 @@ EXTCONST char * const PL_reg_name[] = {
"EXACTF", /* 0x31 */
"EXACTFL", /* 0x32 */
"EXACTFU", /* 0x33 */
- "EXACTFA", /* 0x34 */
- "NOTHING", /* 0x35 */
- "TAIL", /* 0x36 */
- "STAR", /* 0x37 */
- "PLUS", /* 0x38 */
- "CURLY", /* 0x39 */
- "CURLYN", /* 0x3a */
- "CURLYM", /* 0x3b */
- "CURLYX", /* 0x3c */
- "WHILEM", /* 0x3d */
- "OPEN", /* 0x3e */
- "CLOSE", /* 0x3f */
- "REF", /* 0x40 */
- "REFF", /* 0x41 */
- "REFFL", /* 0x42 */
- "REFFU", /* 0x43 */
- "REFFA", /* 0x44 */
- "NREF", /* 0x45 */
- "NREFF", /* 0x46 */
- "NREFFL", /* 0x47 */
- "NREFFU", /* 0x48 */
- "NREFFA", /* 0x49 */
- "IFMATCH", /* 0x4a */
- "UNLESSM", /* 0x4b */
- "SUSPEND", /* 0x4c */
- "IFTHEN", /* 0x4d */
- "GROUPP", /* 0x4e */
- "LONGJMP", /* 0x4f */
- "BRANCHJ", /* 0x50 */
- "EVAL", /* 0x51 */
- "MINMOD", /* 0x52 */
- "LOGICAL", /* 0x53 */
- "RENUM", /* 0x54 */
- "TRIE", /* 0x55 */
- "TRIEC", /* 0x56 */
- "AHOCORASICK", /* 0x57 */
- "AHOCORASICKC", /* 0x58 */
- "GOSUB", /* 0x59 */
- "GOSTART", /* 0x5a */
- "NGROUPP", /* 0x5b */
- "INSUBP", /* 0x5c */
- "DEFINEP", /* 0x5d */
- "ENDLIKE", /* 0x5e */
- "OPFAIL", /* 0x5f */
- "ACCEPT", /* 0x60 */
- "VERB", /* 0x61 */
- "PRUNE", /* 0x62 */
- "MARKPOINT", /* 0x63 */
- "SKIP", /* 0x64 */
- "COMMIT", /* 0x65 */
- "CUTGROUP", /* 0x66 */
- "KEEPS", /* 0x67 */
- "LNBREAK", /* 0x68 */
- "VERTWS", /* 0x69 */
- "NVERTWS", /* 0x6a */
- "HORIZWS", /* 0x6b */
- "NHORIZWS", /* 0x6c */
- "FOLDCHAR", /* 0x6d */
- "OPTIMIZED", /* 0x6e */
- "PSEUDO", /* 0x6f */
+ "EXACTFU_SS", /* 0x34 */
+ "EXACTFU_NO_TRIE", /* 0x35 */
+ "EXACTFA", /* 0x36 */
+ "NOTHING", /* 0x37 */
+ "TAIL", /* 0x38 */
+ "STAR", /* 0x39 */
+ "PLUS", /* 0x3a */
+ "CURLY", /* 0x3b */
+ "CURLYN", /* 0x3c */
+ "CURLYM", /* 0x3d */
+ "CURLYX", /* 0x3e */
+ "WHILEM", /* 0x3f */
+ "OPEN", /* 0x40 */
+ "CLOSE", /* 0x41 */
+ "REF", /* 0x42 */
+ "REFF", /* 0x43 */
+ "REFFL", /* 0x44 */
+ "REFFU", /* 0x45 */
+ "REFFA", /* 0x46 */
+ "NREF", /* 0x47 */
+ "NREFF", /* 0x48 */
+ "NREFFL", /* 0x49 */
+ "NREFFU", /* 0x4a */
+ "NREFFA", /* 0x4b */
+ "IFMATCH", /* 0x4c */
+ "UNLESSM", /* 0x4d */
+ "SUSPEND", /* 0x4e */
+ "IFTHEN", /* 0x4f */
+ "GROUPP", /* 0x50 */
+ "LONGJMP", /* 0x51 */
+ "BRANCHJ", /* 0x52 */
+ "EVAL", /* 0x53 */
+ "MINMOD", /* 0x54 */
+ "LOGICAL", /* 0x55 */
+ "RENUM", /* 0x56 */
+ "TRIE", /* 0x57 */
+ "TRIEC", /* 0x58 */
+ "AHOCORASICK", /* 0x59 */
+ "AHOCORASICKC", /* 0x5a */
+ "GOSUB", /* 0x5b */
+ "GOSTART", /* 0x5c */
+ "NGROUPP", /* 0x5d */
+ "INSUBP", /* 0x5e */
+ "DEFINEP", /* 0x5f */
+ "ENDLIKE", /* 0x60 */
+ "OPFAIL", /* 0x61 */
+ "ACCEPT", /* 0x62 */
+ "VERB", /* 0x63 */
+ "PRUNE", /* 0x64 */
+ "MARKPOINT", /* 0x65 */
+ "SKIP", /* 0x66 */
+ "COMMIT", /* 0x67 */
+ "CUTGROUP", /* 0x68 */
+ "KEEPS", /* 0x69 */
+ "LNBREAK", /* 0x6a */
+ "VERTWS", /* 0x6b */
+ "NVERTWS", /* 0x6c */
+ "HORIZWS", /* 0x6d */
+ "NHORIZWS", /* 0x6e */
+ "FOLDCHAR", /* 0x6f */
+ "OPTIMIZED", /* 0x70 */
+ "PSEUDO", /* 0x71 */
/* ------------ States ------------- */
"TRIE_next", /* REGNODE_MAX +0x01 */
"TRIE_next_fail", /* REGNODE_MAX +0x02 */
@@ -784,7 +794,7 @@ EXTCONST U8 PL_varies[] __attribute__deprecated__ = {
EXTCONST U8 PL_varies_bitmask[];
#else
EXTCONST U8 PL_varies_bitmask[] = {
- 0x00, 0x00, 0x40, 0x00, 0x00, 0xE0, 0x80, 0x3F, 0xFF, 0x33, 0x01, 0x00, 0x00, 0x00
+ 0x00, 0x00, 0x40, 0x00, 0x00, 0xE0, 0x00, 0xFE, 0xFC, 0xCF, 0x04, 0x00, 0x00, 0x00, 0x00
};
#endif /* DOINIT */
@@ -808,7 +818,7 @@ EXTCONST U8 PL_simple[] __attribute__deprecated__ = {
EXTCONST U8 PL_simple_bitmask[];
#else
EXTCONST U8 PL_simple_bitmask[] = {
- 0x00, 0x00, 0xBC, 0xFF, 0xFF, 0x1F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1E
+ 0x00, 0x00, 0xBC, 0xFF, 0xFF, 0x1F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x78, 0x00
};
#endif /* DOINIT */
diff --git a/t/re/re_tests b/t/re/re_tests
index 84791cf15a..33a2fee148 100644
--- a/t/re/re_tests
+++ b/t/re/re_tests
@@ -1560,8 +1560,8 @@ abc\N{def - c - \\N{NAME} must be resolved by the lexer
# Was matching 'ss' only and failing the entire match, not seeing the
# alternative that would succeed
-/s\xDF/ui \xDFs yT $& \xDFs
-/sst/i s\N{LATIN SMALL LIGATURE ST} yT $& s\N{LATIN SMALL LIGATURE ST}
-/sst/i s\N{LATIN SMALL LIGATURE LONG S T} yT $& s\N{LATIN SMALL LIGATURE LONG S T}
+/s\xDF/ui \xDFs y $& \xDFs
+/sst/i s\N{LATIN SMALL LIGATURE ST} y $& s\N{LATIN SMALL LIGATURE ST}
+/sst/i s\N{LATIN SMALL LIGATURE LONG S T} y $& s\N{LATIN SMALL LIGATURE LONG S T}
# vim: softtabstop=0 noexpandtab