summaryrefslogtreecommitdiff
path: root/regcomp.c
diff options
context:
space:
mode:
Diffstat (limited to 'regcomp.c')
-rw-r--r--regcomp.c535
1 files changed, 218 insertions, 317 deletions
diff --git a/regcomp.c b/regcomp.c
index 68b9e04d06..80286fb8e6 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -2505,21 +2505,39 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode
}});
+/* The below joins as many adjacent EXACTish nodes as possible into a single
+ * one, and looks for problematic sequences of characters whose folds vs.
+ * non-folds have sufficiently different lengths, that the optimizer would be
+ * fooled into rejecting legitimate matches of them, and the trie construction
+ * code can't cope with them. The joining is only done if:
+ * 1) there is room in the current conglomerated node to entirely contain the
+ * next one.
+ * 2) they are the exact same node type
+ *
+ * The adjacent nodes actually may be separated by NOTHING kind nodes.
+ *
+ * If there are problematic code sequences, *min_change is set to the delta
+ * that the minimum size of the node can be off from its actual size.
+ *
+ * And, the node type of the result is changed to reflect that it contains
+ * these sequences
+ */
-
-
-#define JOIN_EXACT(scan,min,flags) \
+#define JOIN_EXACT(scan,min_change,flags) \
if (PL_regkind[OP(scan)] == EXACT) \
- join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
+ join_exact(pRExC_state,(scan),(min_change),(flags),NULL,depth+1)
STATIC U32
-S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
+S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min_change, U32 flags,regnode *val, U32 depth) {
/* Merge several consecutive EXACTish nodes into one. */
regnode *n = regnext(scan);
U32 stringok = 1;
regnode *next = scan + NODE_SZ_STR(scan);
U32 merged = 0;
U32 stopnow = 0;
+ char *s, *t;
+ char * const s0 = STRING(scan);
+ char * const s_end = s0 + STR_LEN(scan);
#ifdef DEBUGGING
regnode *stop = scan;
GET_RE_DEBUG_FLAGS_DECL;
@@ -2533,13 +2551,20 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags
PERL_UNUSED_ARG(val);
#endif
DEBUG_PEEP("join",scan,depth);
+
+ /* These opcodes should only be on output from this routine, never on input
+ */
+ assert(OP(scan) != EXACTFU_NO_TRIE);
+ assert(OP(scan) != EXACTFU_SS);
- /* Skip NOTHING, merge EXACT*. */
- while (n &&
- ( PL_regkind[OP(n)] == NOTHING ||
- (stringok && (OP(n) == OP(scan))))
+ /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
+ * EXACT ones that are mergeable to the current one. */
+ while (n
+ && (PL_regkind[OP(n)] == NOTHING
+ || (stringok && OP(n) == OP(scan)))
&& NEXT_OFF(n)
- && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
+ && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
+ {
if (OP(n) == TAIL || n > next)
stringok = 0;
@@ -2563,7 +2588,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags
DEBUG_PEEP("merg",n,depth);
merged++;
- NEXT_OFF(scan) += NEXT_OFF(n);
+ NEXT_OFF(scan) += NEXT_OFF(n);
STR_LEN(scan) += STR_LEN(n);
next = n + NODE_SZ_STR(n);
/* Now we can overwrite *n : */
@@ -2588,65 +2613,136 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags
}
#endif
}
-#define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS 0x0390
-#define IOTA_D_T GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
-#define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS 0x03B0
-#define UPSILON_D_T GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
- if (UTF
- && ( OP(scan) == EXACTF || OP(scan) == EXACTFU || OP(scan) == EXACTFA)
- && ( STR_LEN(scan) >= 6 ) )
- {
- /*
- Two problematic code points in Unicode casefolding of EXACT nodes:
-
- U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
- U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
-
- which casefold to
-
- Unicode UTF-8
-
- U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
- U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
-
- This means that in case-insensitive matching (or "loose matching",
- as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
- length of the above casefolded versions) can match a target string
- of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
- This would rather mess up the minimum length computation.
-
- What we'll do is to look for the tail four bytes, and then peek
- at the preceding two bytes to see whether we need to decrease
- the minimum length by four (six minus two).
-
- Thanks to the design of UTF-8, there cannot be false matches:
- A sequence of valid UTF-8 bytes cannot be a subsequence of
- another valid sequence of UTF-8 bytes.
-
- */
- char * const s0 = STRING(scan), *s, *t;
- char * const s1 = s0 + STR_LEN(scan) - 1;
- char * const s2 = s1 - 4;
+ *min_change = 0;
+
+ /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
+ * can now analyze for sequences of problematic code points. (Prior to
+ * this final joining, sequences could have been split over boundaries, and
+ * hence missed). The sequences only happen in folding */
+ if (OP(scan) != EXACT) {
+
+ /* There are three code points in Unicode whose folded lengths differ so
+ * much from the un-folded lengths that it causes problems for the
+ * optimizer and trie construction. Why only these are problematic, and
+ * not others is something I (khw) do not understand. And new versions of
+ * Unicode might add more such code points. Hopefully the logic in
+ * fold_grind.t that figures out what to test (in part by verifying that
+ * each size-combination gets tested) will catch any that do come along, so
+ * they can be added to the special handling below. The chances of this
+ * are actually rather small, as most, if not all, of the scripts that have
+ * casefolding have already been encoded by Unicode, as well as those from
+ * pre-existing standards that Unicode has encoded for backwards
+ * compatibility, which would be the new ones that might have enough
+ * weirdness to qualify for this */
+
+ /* First we look at the sequences that can occur only in UTF-8 strings.
+ * The sequences are of length 6 */
+ if (UTF && STR_LEN(scan) >= 6) {
+
+ /* Two problematic code points in Unicode casefolding of EXACT
+ * nodes:
+ *
+ * U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
+ * U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
+ *
+ * which casefold to
+ *
+ * Unicode UTF-8
+ *
+ * U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
+ * U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
+ *
+ * This means that in case-insensitive matching (or "loose
+ * matching", as Unicode calls it), an EXACTF of length six (the
+ * UTF-8 encoded byte length of the above casefolded versions) can
+ * match a target string of length two (the byte length of UTF-8
+ * encoded U+0390 or U+03B0). This would rather mess up the
+ * minimum length computation. (there are other code points that
+ * also fold to these two sequences, but the delta is smaller)
+ *
+ * What we'll do is to look for the tail four bytes, and then peek
+ * at the preceding two bytes to see whether we need to decrease
+ * the minimum length by four (six minus two).
+ *
+ * Thanks to the design of UTF-8, there cannot be false matches:
+ * A sequence of valid UTF-8 bytes cannot be a subsequence of
+ * another valid sequence of UTF-8 bytes. */
+
#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
- const char t0[] = "\xaf\x49\xaf\x42";
-#else
- const char t0[] = "\xcc\x88\xcc\x81";
-#endif
- const char * const t1 = t0 + 3;
-
- for (s = s0 + 2;
- s < s2 && (t = ninstr(s, s1, t0, t1));
- s = t + 4) {
-#ifdef EBCDIC
- if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
- ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
+ const char U390_first_byte = '\xb4';
+ const char U390_2nd_byte = '\x68';
+ const char U3B0_first_byte = '\xb5';
+ const char U3B0_2nd_byte = '\x46';
+ const char tail[] = "\xaf\x49\xaf\x42";
#else
- if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
- ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
+ const char U390_first_byte = '\xce';
+ const char U390_2nd_byte = '\xb9';
+ const char U3B0_first_byte = '\xcf';
+ const char U3B0_2nd_byte = '\x85';
+ const char tail[] = "\xcc\x88\xcc\x81";
#endif
- *min -= 4;
- }
+ const STRLEN tail_len = sizeof(tail) - 1;
+ for (s = s0 + 2;
+ s <= s_end - tail_len
+ && (t = ninstr(s, s_end, tail, tail + tail_len));
+ s = t + tail_len)
+ {
+ if ((t[-1] == U390_2nd_byte && t[-2] == U390_first_byte)
+ || (t[-1] == U3B0_2nd_byte && t[-2] == U3B0_first_byte))
+ {
+ *min_change -= 4;
+
+ /* This can't currently be handled by tries, so change the
+ * node type to indicate this. */
+ if (OP(scan) == EXACTFU) {
+ OP(scan) = EXACTFU_NO_TRIE;
+ }
+ }
+ }
+ }
+
+ /* The third problematic sequence is 'ss', which can match just the
+ * single byte LATIN SMALL LETTER SHARP S, and it can do it in both
+ * non- and UTF-8. Code elsewhere in this file makes sure, however,
+ * that the sharp s gets folded to 'ss' under Unicode rules even if not
+ * UTF-8. */
+ if (STR_LEN(scan) >= 2
+ && (OP(scan) == EXACTFU
+ || OP(scan) == EXACTFU_NO_TRIE /* The code above could have
+ set to this node type */
+ || OP(scan) == EXACTF))
+ {
+ /* The string will be folded to 'ss' if it's in UTF-8, but it could
+ * be 'Ss', etc when not. We could have different code to handle
+ * the two cases, but this is not necessary since both S and s are
+ * invariants under UTF-8; and not worth it, especially because we
+ * can use just one test each time through the loop (plus a mask)
+ * This is because on both EBCDIC and ASCII machines, an 'S' and 's'
+ * differ by a single bit. On ASCII they are 32 apart; on EBCDIC,
+ * they are 64. This uses an exclusive 'or' to find that bit and
+ * then inverts it to form a mask, with just a single 0, in the bit
+ * position where 'S' and 's' differ. */
+ const char S_or_s_mask = ~ ('S' ^ 's');
+ const char s_masked = 's' & S_or_s_mask;
+
+ for (s = s0; s < s_end - 1; s++) {
+ if (((*s & S_or_s_mask) == s_masked)
+ && ((*(s+1) & S_or_s_mask) == s_masked))
+ {
+ s++;
+ *min_change -= 1;
+
+ /* EXACTFU_SS also isn't trie'able, so don't have to
+ * preserve EXACTFU_NO_TRIE. EXACTF is also not trie'able,
+ * and because we essentially punt the optimizations in its
+ * case, we don't need to indicate that it has an ss */
+ if (OP(scan) == EXACTFU || OP(scan) == EXACTFU_NO_TRIE) {
+ OP(scan) = EXACTFU_SS;
+ }
+ }
+ }
+ }
}
#ifdef DEBUGGING
@@ -2762,10 +2858,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
fake_study_recurse:
while ( scan && OP(scan) != END && scan < last ){
+ I32 min_change;
/* Peephole optimizer: */
DEBUG_STUDYDATA("Peep:", data,depth);
DEBUG_PEEP("Peep",scan,depth);
- JOIN_EXACT(scan,&min,0);
+ JOIN_EXACT(scan,&min_change,0);
/* Follow the next-chain of the current node and optimize
away all the NOTHINGs from it. */
@@ -3059,8 +3156,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
in below to fully enable trie logic.
#define TRIE_TYPE_IS_SAFE 1
-
*/
+
#define TRIE_TYPE_IS_SAFE ((UTF && optype == EXACTFU) || optype==EXACT)
if ( last && TRIE_TYPE_IS_SAFE ) {
@@ -3279,9 +3376,23 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
l = utf8_length(s, s + l);
uc = utf8_to_uvchr(s, NULL);
}
- min += l;
- if (flags & SCF_DO_SUBSTR)
+ else if (OP(scan) == EXACTF) {
+ if (memchr(STRING(scan), LATIN_SMALL_LETTER_SHARP_S, l)) {
+ RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
+ }
+ }
+ min += l + min_change;
+ if (min < 0) {
+ min = 0;
+ }
+ delta += abs(min_change);
+ if (flags & SCF_DO_SUBSTR) {
data->pos_min += l;
+ data->pos_delta += abs(min_change);
+ if (min_change) {
+ data->longest = &(data->longest_float);
+ }
+ }
if (flags & SCF_DO_STCLASS_AND) {
/* Check whether it is compatible with what we know already! */
int compat = 1;
@@ -3311,6 +3422,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
* the full latin1 fold. (Can't do this for locale,
* because not known until runtime */
ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
+ if (uc == 's' || uc == 'S') {
+ ANYOF_BITMAP_SET(data->start_class, LATIN_SMALL_LETTER_SHARP_S);
+ }
+ else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
+ ANYOF_BITMAP_SET(data->start_class, 's');
+ ANYOF_BITMAP_SET(data->start_class, 'S');
+ }
}
}
else if (uc >= 0x100) {
@@ -3335,6 +3453,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
* run-time */
ANYOF_BITMAP_SET(data->start_class,
PL_fold_latin1[uc]);
+ if (uc == 's' || uc == 'S') {
+ ANYOF_BITMAP_SET(data->start_class, LATIN_SMALL_LETTER_SHARP_S);
+ }
+ else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
+ ANYOF_BITMAP_SET(data->start_class, 's');
+ ANYOF_BITMAP_SET(data->start_class, 'S');
+ }
}
}
data->start_class->flags &= ~ANYOF_EOS;
@@ -3740,18 +3865,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
data->longest = &(data->longest_float);
}
}
- else if (OP(scan) == FOLDCHAR) {
- int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2;
- flags &= ~SCF_DO_STCLASS;
- min += 1;
- delta += d;
- if (flags & SCF_DO_SUBSTR) {
- SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
- data->pos_min += 1;
- data->pos_delta += d;
- data->longest = &(data->longest_float);
- }
- }
else if (REGNODE_SIMPLE(OP(scan))) {
int value = 0;
@@ -5067,9 +5180,10 @@ reStudy:
{
I32 t,ml;
- if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
- && data.offset_fixed == data.offset_float_min
- && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
+ if ((RExC_seen & REG_SEEN_EXACTF_SHARP_S)
+ || (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
+ && data.offset_fixed == data.offset_float_min
+ && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
goto remove_float; /* As in (a)+. */
/* copy the information about the longest float from the reg_scan_data
@@ -5112,10 +5226,11 @@ reStudy:
Be careful.
*/
longest_fixed_length = CHR_SVLEN(data.longest_fixed);
- if (longest_fixed_length
- || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
- && (!(data.flags & SF_FIX_BEFORE_MEOL)
- || (RExC_flags & RXf_PMf_MULTILINE))))
+ if (! (RExC_seen & REG_SEEN_EXACTF_SHARP_S)
+ && (longest_fixed_length
+ || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
+ && (!(data.flags & SF_FIX_BEFORE_MEOL)
+ || (RExC_flags & RXf_PMf_MULTILINE)))) )
{
I32 t,ml;
@@ -8820,15 +8935,6 @@ tryagain:
RExC_parse++;
defchar: {
- typedef enum {
- generic_char = 0,
- char_s,
- upsilon_1,
- upsilon_2,
- iota_1,
- iota_2,
- } char_state;
- char_state latest_char_state = generic_char;
register STRLEN len;
register UV ender;
register char *p;
@@ -8836,22 +8942,25 @@ tryagain:
STRLEN foldlen;
U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
regnode * orig_emit;
+ int node_type;
+ bool is_exactfu_sharp_s;
ender = 0;
orig_emit = RExC_emit; /* Save the original output node position in
case we need to output a different node
type */
- ret = reg_node(pRExC_state,
- (U8) ((! FOLD) ? EXACT
+ node_type = (U8) ((! FOLD) ? EXACT
: (LOC)
? EXACTFL
: (MORE_ASCII_RESTRICTED)
? EXACTFA
: (AT_LEAST_UNI_SEMANTICS)
? EXACTFU
- : EXACTF)
- );
+ : EXACTF);
+ ret = reg_node(pRExC_state, node_type);
s = STRING(ret);
+
+ /* By going only up to 127 when the maximum storable is 255, we don't have to worry about expansion, not being in the last character in the fold */
for (len = 0, p = RExC_parse - 1;
len < 127 && p < RExC_end;
len++)
@@ -9047,219 +9156,10 @@ tryagain:
break;
} /* End of switch on the literal */
- /* Certain characters are problematic because their folded
- * length is so different from their original length that it
- * isn't handleable by the optimizer. They are therefore not
- * placed in an EXACTish node; and are here handled specially.
- * (Even if the optimizer handled LATIN_SMALL_LETTER_SHARP_S,
- * putting it in a special node keeps regexec from having to
- * deal with a non-utf8 multi-char fold */
- if (FOLD
- && (ender > 255 || (! MORE_ASCII_RESTRICTED && ! LOC)))
- {
- /* We look for either side of the fold. For example \xDF
- * folds to 'ss'. We look for both the single character
- * \xDF and the sequence 'ss'. When we find something that
- * could be one of those, we stop and flush whatever we
- * have output so far into the EXACTish node that was being
- * built. Then restore the input pointer to what it was.
- * regatom will return that EXACT node, and will be called
- * again, positioned so the first character is the one in
- * question, which we return in a different node type.
- * The multi-char folds are a sequence, so the occurrence
- * of the first character in that sequence doesn't
- * necessarily mean that what follows is the rest of the
- * sequence. We keep track of that with a state machine,
- * with the state being set to the latest character
- * processed before the current one. Most characters will
- * set the state to 0, but if one occurs that is part of a
- * potential tricky fold sequence, the state is set to that
- * character, and the next loop iteration sees if the state
- * should progress towards the final folded-from character,
- * or if it was a false alarm. If it turns out to be a
- * false alarm, the character(s) will be output in a new
- * EXACTish node, and join_exact() will later combine them.
- * In the case of the 'ss' sequence, which is more common
- * and more easily checked, some look-ahead is done to
- * save time by ruling-out some false alarms */
- switch (ender) {
- default:
- latest_char_state = generic_char;
- break;
- case 's':
- case 'S':
- case 0x17F: /* LATIN SMALL LETTER LONG S */
- if (AT_LEAST_UNI_SEMANTICS) {
- if (latest_char_state == char_s) { /* 'ss' */
- ender = LATIN_SMALL_LETTER_SHARP_S;
- goto do_tricky;
- }
- else if (p < RExC_end) {
-
- /* Look-ahead at the next character. If it
- * is also an s, we handle as a sharp s
- * tricky regnode. */
- if (*p == 's' || *p == 'S') {
-
- /* But first flush anything in the
- * EXACTish buffer */
- if (len != 0) {
- p = oldp;
- goto loopdone;
- }
- p++; /* Account for swallowing this
- 's' up */
- ender = LATIN_SMALL_LETTER_SHARP_S;
- goto do_tricky;
- }
- /* Here, the next character is not a
- * literal 's', but still could
- * evaluate to one if part of a \o{},
- * \x or \OCTAL-DIGIT. The minimum
- * length required for that is 4, eg
- * \x53 or \123 */
- else if (*p == '\\'
- && p < RExC_end - 4
- && (isDIGIT(*(p + 1))
- || *(p + 1) == 'x'
- || *(p + 1) == 'o' ))
- {
-
- /* Here, it could be an 's', too much
- * bother to figure it out here. Flush
- * the buffer if any; when come back
- * here, set the state so know that the
- * previous char was an 's' */
- if (len != 0) {
- latest_char_state = generic_char;
- p = oldp;
- goto loopdone;
- }
- latest_char_state = char_s;
- break;
- }
- }
- }
-
- /* Here, can't be an 'ss' sequence, or at least not
- * one that could fold to/from the sharp ss */
- latest_char_state = generic_char;
- break;
- case 0x03C5: /* First char in upsilon series */
- case 0x03A5: /* Also capital UPSILON, which folds to
- 03C5, and hence exhibits the same
- problem */
- if (p < RExC_end - 4) { /* Need >= 4 bytes left */
- latest_char_state = upsilon_1;
- if (len != 0) {
- p = oldp;
- goto loopdone;
- }
- }
- else {
- latest_char_state = generic_char;
- }
- break;
- case 0x03B9: /* First char in iota series */
- case 0x0399: /* Also capital IOTA */
- case 0x1FBE: /* GREEK PROSGEGRAMMENI folds to 3B9 */
- case 0x0345: /* COMBINING GREEK YPOGEGRAMMENI folds
- to 3B9 */
- if (p < RExC_end - 4) {
- latest_char_state = iota_1;
- if (len != 0) {
- p = oldp;
- goto loopdone;
- }
- }
- else {
- latest_char_state = generic_char;
- }
- break;
- case 0x0308:
- if (latest_char_state == upsilon_1) {
- latest_char_state = upsilon_2;
- }
- else if (latest_char_state == iota_1) {
- latest_char_state = iota_2;
- }
- else {
- latest_char_state = generic_char;
- }
- break;
- case 0x301:
- if (latest_char_state == upsilon_2) {
- ender = GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS;
- goto do_tricky;
- }
- else if (latest_char_state == iota_2) {
- ender = GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS;
- goto do_tricky;
- }
- latest_char_state = generic_char;
- break;
-
- /* These are the tricky fold characters. Flush any
- * buffer first. (When adding to this list, also should
- * add them to fold_grind.t to make sure get tested) */
- case GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS:
- case GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS:
- case LATIN_SMALL_LETTER_SHARP_S:
- case LATIN_CAPITAL_LETTER_SHARP_S:
- case 0x1FD3: /* GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA */
- case 0x1FE3: /* GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA */
- if (len != 0) {
- p = oldp;
- goto loopdone;
- }
- /* FALL THROUGH */
- do_tricky: {
- char* const oldregxend = RExC_end;
- U8 tmpbuf[UTF8_MAXBYTES+1];
-
- /* Here, we know we need to generate a special
- * regnode, and 'ender' contains the tricky
- * character. What's done is to pretend it's in a
- * [bracketed] class, and let the code that deals
- * with those handle it, as that code has all the
- * intelligence necessary. First save the current
- * parse state, get rid of the already allocated
- * but empty EXACT node that the ANYOFV node will
- * replace, and point the parse to a buffer which
- * we fill with the character we want the regclass
- * code to think is being parsed */
- RExC_emit = orig_emit;
- RExC_parse = (char *) tmpbuf;
- if (UTF) {
- U8 *d = uvchr_to_utf8(tmpbuf, ender);
- *d = '\0';
- RExC_end = (char *) d;
- }
- else { /* ender above 255 already excluded */
- tmpbuf[0] = (U8) ender;
- tmpbuf[1] = '\0';
- RExC_end = RExC_parse + 1;
- }
-
- ret = regclass(pRExC_state,depth+1);
-
- /* Here, have parsed the buffer. Reset the parse to
- * the actual input, and return */
- RExC_end = oldregxend;
- RExC_parse = p - 1;
-
- Set_Node_Offset(ret, RExC_parse);
- Set_Node_Cur_Length(ret);
- nextchar(pRExC_state);
- *flagp |= HASWIDTH|SIMPLE;
- return ret;
- }
- }
- }
-
+ is_exactfu_sharp_s = (node_type == EXACTFU && ender == LATIN_SMALL_LETTER_SHARP_S);
if ( RExC_flags & RXf_PMf_EXTENDED)
p = regwhite( pRExC_state, p );
- if (UTF && FOLD) {
+ if ((UTF && FOLD) || is_exactfu_sharp_s) {
/* Prime the casefolded buffer. Locale rules, which apply
* only to code points < 256, aren't known until execution,
* so for them, just output the original character using
@@ -9322,7 +9222,7 @@ tryagain:
if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
if (len)
p = oldp;
- else if (UTF) {
+ else if (UTF || is_exactfu_sharp_s) {
if (FOLD) {
/* Emit all the Unicode characters. */
STRLEN numlen;
@@ -9358,7 +9258,7 @@ tryagain:
}
break;
}
- if (UTF) {
+ if (UTF || is_exactfu_sharp_s) {
if (FOLD) {
/* Emit all the Unicode characters. */
STRLEN numlen;
@@ -11188,6 +11088,7 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,
regnode * const temp = regnext(scan);
#ifdef EXPERIMENTAL_INPLACESCAN
if (PL_regkind[OP(scan)] == EXACT)
+
if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
return EXACT;
#endif
@@ -11197,6 +11098,8 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,
case EXACTF:
case EXACTFA:
case EXACTFU:
+ case EXACTFU_SS:
+ case EXACTFU_NO_TRIE:
case EXACTFL:
if( exact == PSEUDO )
exact= OP(scan);
@@ -11521,8 +11424,6 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
} else if (k == LOGICAL)
Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
- else if (k == FOLDCHAR)
- Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
else if (k == ANYOF) {
int i, rangestart = -1;
const U8 flags = ANYOF_FLAGS(o);