diff options
Diffstat (limited to 'regcomp.c')
-rw-r--r-- | regcomp.c | 535 |
1 files changed, 218 insertions, 317 deletions
@@ -2505,21 +2505,39 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode }}); +/* The below joins as many adjacent EXACTish nodes as possible into a single + * one, and looks for problematic sequences of characters whose folds vs. + * non-folds have sufficiently different lengths, that the optimizer would be + * fooled into rejecting legitimate matches of them, and the trie construction + * code can't cope with them. The joining is only done if: + * 1) there is room in the current conglomerated node to entirely contain the + * next one. + * 2) they are the exact same node type + * + * The adjacent nodes actually may be separated by NOTHING kind nodes. + * + * If there are problematic code sequences, *min_change is set to the delta + * that the minimum size of the node can off from its actual size. + * + * And, the node type of the result is changed to reflect that it contains + * these sequences + */ - - -#define JOIN_EXACT(scan,min,flags) \ +#define JOIN_EXACT(scan,min_change,flags) \ if (PL_regkind[OP(scan)] == EXACT) \ - join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1) + join_exact(pRExC_state,(scan),(min_change),(flags),NULL,depth+1) STATIC U32 -S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) { +S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min_change, U32 flags,regnode *val, U32 depth) { /* Merge several consecutive EXACTish nodes into one. */ regnode *n = regnext(scan); U32 stringok = 1; regnode *next = scan + NODE_SZ_STR(scan); U32 merged = 0; U32 stopnow = 0; + char *s, *t; + char * const s0 = STRING(scan); + char * const s_end = s0 + STR_LEN(scan); #ifdef DEBUGGING regnode *stop = scan; GET_RE_DEBUG_FLAGS_DECL; @@ -2533,13 +2551,20 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags PERL_UNUSED_ARG(val); #endif DEBUG_PEEP("join",scan,depth); + + /* These opcode should only be on output from this routine, never on input + */ + assert(OP(scan) != EXACTFU_NO_TRIE); + assert(OP(scan) != EXACTFU_SS); - /* Skip NOTHING, merge EXACT*. */ - while (n && - ( PL_regkind[OP(n)] == NOTHING || - (stringok && (OP(n) == OP(scan)))) + /* Look through the subsequent nodes in the chain. Skip NOTHING, merge + * EXACT ones that are mergeable to the current one. */ + while (n + && (PL_regkind[OP(n)] == NOTHING + || (stringok && OP(n) == OP(scan))) && NEXT_OFF(n) - && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) { + && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) + { if (OP(n) == TAIL || n > next) stringok = 0; @@ -2563,7 +2588,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags DEBUG_PEEP("merg",n,depth); merged++; - NEXT_OFF(scan) += NEXT_OFF(n); + NEXT_OFF(scan) += NEXT_OFF(n); STR_LEN(scan) += STR_LEN(n); next = n + NODE_SZ_STR(n); /* Now we can overwrite *n : */ @@ -2588,65 +2613,136 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags } #endif } -#define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS 0x0390 -#define IOTA_D_T GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS -#define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS 0x03B0 -#define UPSILON_D_T GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS - if (UTF - && ( OP(scan) == EXACTF || OP(scan) == EXACTFU || OP(scan) == EXACTFA) - && ( STR_LEN(scan) >= 6 ) ) - { - /* - Two problematic code points in Unicode casefolding of EXACT nodes: - - U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS - U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS - - which casefold to - - Unicode UTF-8 - - U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81 - U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81 - - This means that in case-insensitive matching (or "loose matching", - as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte - length of the above casefolded versions) can match a target string - of length two (the byte length of UTF-8 encoded U+0390 or U+03B0). - This would rather mess up the minimum length computation. - - What we'll do is to look for the tail four bytes, and then peek - at the preceding two bytes to see whether we need to decrease - the minimum length by four (six minus two). - - Thanks to the design of UTF-8, there cannot be false matches: - A sequence of valid UTF-8 bytes cannot be a subsequence of - another valid sequence of UTF-8 bytes. - - */ - char * const s0 = STRING(scan), *s, *t; - char * const s1 = s0 + STR_LEN(scan) - 1; - char * const s2 = s1 - 4; + *min_change = 0; + + /* Here, all the adjacent mergeable EXACTish nodes have been merged. We + * can now analyze for sequences of problematic code points. (Prior to + * this final joining, sequences could have been split over boundaries, and + * hence missed). The sequences only happen in folding */ + if (OP(scan) != EXACT) { + + /* There are three code points in Unicode whose folded lengths differ so + * much from the un-folded lengths that it causes problems for the + * optimizer and trie construction. Why only these are problematic, and + * not others is something I (khw) do not understand. And new versions of + * Unicode might add more such code points. Hopefully the logic in + * fold_grind.t that figures out what to test (in part by veriying that + * each size-combination gets tested) will catch any that do come along, so + * they can be added to the special handling below. The chances of this + * are actually rather small, as most, if not all, of the scripts that have + * casefolding have already been encoded by Unicode, as well as those from + * pre-existing standards that Unicode has encoded for backwards + * compatibility, which would be the new ones that might have enough + * weirdness to qualify for this */ + + /* First we look at the sequences that can occur only in UTF-8 strings. + * The sequences are of length 6 */ + if (UTF && STR_LEN(scan) >= 6) { + + /* Two problematic code points in Unicode casefolding of EXACT + * nodes: + * + * U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS + * U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS + * + * which casefold to + * + * Unicode UTF-8 + * + * U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81 + * U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81 + * + * This means that in case-insensitive matching (or "loose + * matching", as Unicode calls it), an EXACTF of length six (the + * UTF-8 encoded byte length of the above casefolded versions) can + * match a target string of length two (the byte length of UTF-8 + * encoded U+0390 or U+03B0). This would rather mess up the + * minimum length computation. (there are other code points that + * also fold to these two sequences, but the delta is smaller) + * + * What we'll do is to look for the tail four bytes, and then peek + * at the preceding two bytes to see whether we need to decrease + * the minimum length by four (six minus two). + * + * Thanks to the design of UTF-8, there cannot be false matches: + * A sequence of valid UTF-8 bytes cannot be a subsequence of + * another valid sequence of UTF-8 bytes. */ + #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */ - const char t0[] = "\xaf\x49\xaf\x42"; -#else - const char t0[] = "\xcc\x88\xcc\x81"; -#endif - const char * const t1 = t0 + 3; - - for (s = s0 + 2; - s < s2 && (t = ninstr(s, s1, t0, t1)); - s = t + 4) { -#ifdef EBCDIC - if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) || - ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5)) + const char U390_first_byte = '\xb4'; + const char U390_2nd_byte = '\x68'; + const char U3B0_first_byte = '\xb5'; + const char U3B0_2nd_byte = '\x46'; + const char tail[] = "\xaf\x49\xaf\x42"; #else - if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) || - ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF)) + const char U390_first_byte = '\xce'; + const char U390_2nd_byte = '\xb9'; + const char U3B0_first_byte = '\xcf'; + const char U3B0_2nd_byte = '\x85'; + const char tail[] = "\xcc\x88\xcc\x81"; #endif - *min -= 4; - } + const STRLEN tail_len = sizeof(tail) - 1; + for (s = s0 + 2; + s <= s_end - tail_len + && (t = ninstr(s, s_end, tail, tail + tail_len)); + s = t + tail_len) + { + if ((t[-1] == U390_2nd_byte && t[-2] == U390_first_byte) + || (t[-1] == U3B0_2nd_byte && t[-2] == U3B0_first_byte)) + { + *min_change -= 4; + + /* This can't currently be handled by tries, so change the + * node type to indicate this. */ + if (OP(scan) == EXACTFU) { + OP(scan) = EXACTFU_NO_TRIE; + } + } + } + } + + /* The third problematic sequence is 'ss', which can match just the + * single byte LATIN SMALL LETTER SHARP S, and it can do it in both + * non- and UTF-8. Code elsewhere in this file makes sure, however, + * that the sharp s gets folded to 'ss' under Unicode rules even if not + * UTF-8. */ + if (STR_LEN(scan) >= 2 + && (OP(scan) == EXACTFU + || OP(scan) == EXACTFU_NO_TRIE /* The code above could have + set to this node type */ + || OP(scan) == EXACTF)) + { + /* The string will be folded to 'ss' if it's in UTF-8, but it could + * be 'Ss', etc when not. We could have different code to handle + * the two cases, but this is not necessary since both S and s are + * invariants under UTF-8; and not worth it, especially because we + * can use just one test each time through the loop (plus a mask) + * Ths is because on both EBCDIC and ASCII machines, an 'S' and 's' + * differ by a single bit. On ASCII they are 32 apart; on EBCDIC, + * they are 64. This uses an exclusive 'or' to find that bit and + * then inverts it to form a mask, with just a single 0, in the bit + * position where 'S' and 's' differ. */ + const char S_or_s_mask = ~ ('S' ^ 's'); + const char s_masked = 's' & S_or_s_mask; + + for (s = s0; s < s_end - 1; s++) { + if (((*s & S_or_s_mask) == s_masked) + && ((*(s+1) & S_or_s_mask) == s_masked)) + { + s++; + *min_change -= 1; + + /* EXACTFU_SS also isn't trie'able, so don't have to + * preserve EXACTFU_NO_TRIE. EXACTF is also not trie'able, + * and because we essentially punt the optimizations in its + * case, we don't need to indicate that it has an ss */ + if (OP(scan) == EXACTFU || OP(scan) == EXACTFU_NO_TRIE) { + OP(scan) = EXACTFU_SS; + } + } + } + } } #ifdef DEBUGGING @@ -2762,10 +2858,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, fake_study_recurse: while ( scan && OP(scan) != END && scan < last ){ + I32 min_change; /* Peephole optimizer: */ DEBUG_STUDYDATA("Peep:", data,depth); DEBUG_PEEP("Peep",scan,depth); - JOIN_EXACT(scan,&min,0); + JOIN_EXACT(scan,&min_change,0); /* Follow the next-chain of the current node and optimize away all the NOTHINGs from it. */ @@ -3059,8 +3156,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, in below to fully enable trie logic. #define TRIE_TYPE_IS_SAFE 1 - */ + #define TRIE_TYPE_IS_SAFE ((UTF && optype == EXACTFU) || optype==EXACT) if ( last && TRIE_TYPE_IS_SAFE ) { @@ -3279,9 +3376,23 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, l = utf8_length(s, s + l); uc = utf8_to_uvchr(s, NULL); } - min += l; - if (flags & SCF_DO_SUBSTR) + else if (OP(scan) == EXACTF) { + if (memchr(STRING(scan), LATIN_SMALL_LETTER_SHARP_S, l)) { + RExC_seen |= REG_SEEN_EXACTF_SHARP_S; + } + } + min += l + min_change; + if (min < 0) { + min = 0; + } + delta += abs(min_change); + if (flags & SCF_DO_SUBSTR) { data->pos_min += l; + data->pos_delta += abs(min_change); + if (min_change) { + data->longest = &(data->longest_float); + } + } if (flags & SCF_DO_STCLASS_AND) { /* Check whether it is compatible with what we know already! */ int compat = 1; @@ -3311,6 +3422,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, * the full latin1 fold. (Can't do this for locale, * because not known until runtime */ ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]); + if (uc == 's' || uc == 'S') { + ANYOF_BITMAP_SET(data->start_class, LATIN_SMALL_LETTER_SHARP_S); + } + else if (uc == LATIN_SMALL_LETTER_SHARP_S) { + ANYOF_BITMAP_SET(data->start_class, 's'); + ANYOF_BITMAP_SET(data->start_class, 'S'); + } } } else if (uc >= 0x100) { @@ -3335,6 +3453,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, * run-time */ ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]); + if (uc == 's' || uc == 'S') { + ANYOF_BITMAP_SET(data->start_class, LATIN_SMALL_LETTER_SHARP_S); + } + else if (uc == LATIN_SMALL_LETTER_SHARP_S) { + ANYOF_BITMAP_SET(data->start_class, 's'); + ANYOF_BITMAP_SET(data->start_class, 'S'); + } } } data->start_class->flags &= ~ANYOF_EOS; @@ -3740,18 +3865,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->longest = &(data->longest_float); } } - else if (OP(scan) == FOLDCHAR) { - int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2; - flags &= ~SCF_DO_STCLASS; - min += 1; - delta += d; - if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ - data->pos_min += 1; - data->pos_delta += d; - data->longest = &(data->longest_float); - } - } else if (REGNODE_SIMPLE(OP(scan))) { int value = 0; @@ -5067,9 +5180,10 @@ reStudy: { I32 t,ml; - if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */ - && data.offset_fixed == data.offset_float_min - && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)) + if ((RExC_seen & REG_SEEN_EXACTF_SHARP_S) + || (SvCUR(data.longest_fixed) /* ok to leave SvCUR */ + && data.offset_fixed == data.offset_float_min + && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))) goto remove_float; /* As in (a)+. */ /* copy the information about the longest float from the reg_scan_data @@ -5112,10 +5226,11 @@ reStudy: Be careful. */ longest_fixed_length = CHR_SVLEN(data.longest_fixed); - if (longest_fixed_length - || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */ - && (!(data.flags & SF_FIX_BEFORE_MEOL) - || (RExC_flags & RXf_PMf_MULTILINE)))) + if (! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) + && (longest_fixed_length + || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */ + && (!(data.flags & SF_FIX_BEFORE_MEOL) + || (RExC_flags & RXf_PMf_MULTILINE)))) ) { I32 t,ml; @@ -8820,15 +8935,6 @@ tryagain: RExC_parse++; defchar: { - typedef enum { - generic_char = 0, - char_s, - upsilon_1, - upsilon_2, - iota_1, - iota_2, - } char_state; - char_state latest_char_state = generic_char; register STRLEN len; register UV ender; register char *p; @@ -8836,22 +8942,25 @@ tryagain: STRLEN foldlen; U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf; regnode * orig_emit; + int node_type; + bool is_exactfu_sharp_s; ender = 0; orig_emit = RExC_emit; /* Save the original output node position in case we need to output a different node type */ - ret = reg_node(pRExC_state, - (U8) ((! FOLD) ? EXACT + node_type = (U8) ((! FOLD) ? EXACT : (LOC) ? EXACTFL : (MORE_ASCII_RESTRICTED) ? EXACTFA : (AT_LEAST_UNI_SEMANTICS) ? EXACTFU - : EXACTF) - ); + : EXACTF); + ret = reg_node(pRExC_state, node_type); s = STRING(ret); + + /* By going only up to 127 when the maximum storable is 255, we don't have to worry about expansion, not being in the last character in the fold */ for (len = 0, p = RExC_parse - 1; len < 127 && p < RExC_end; len++) @@ -9047,219 +9156,10 @@ tryagain: break; } /* End of switch on the literal */ - /* Certain characters are problematic because their folded - * length is so different from their original length that it - * isn't handleable by the optimizer. They are therefore not - * placed in an EXACTish node; and are here handled specially. - * (Even if the optimizer handled LATIN_SMALL_LETTER_SHARP_S, - * putting it in a special node keeps regexec from having to - * deal with a non-utf8 multi-char fold */ - if (FOLD - && (ender > 255 || (! MORE_ASCII_RESTRICTED && ! LOC))) - { - /* We look for either side of the fold. For example \xDF - * folds to 'ss'. We look for both the single character - * \xDF and the sequence 'ss'. When we find something that - * could be one of those, we stop and flush whatever we - * have output so far into the EXACTish node that was being - * built. Then restore the input pointer to what it was. - * regatom will return that EXACT node, and will be called - * again, positioned so the first character is the one in - * question, which we return in a different node type. - * The multi-char folds are a sequence, so the occurrence - * of the first character in that sequence doesn't - * necessarily mean that what follows is the rest of the - * sequence. We keep track of that with a state machine, - * with the state being set to the latest character - * processed before the current one. Most characters will - * set the state to 0, but if one occurs that is part of a - * potential tricky fold sequence, the state is set to that - * character, and the next loop iteration sees if the state - * should progress towards the final folded-from character, - * or if it was a false alarm. If it turns out to be a - * false alarm, the character(s) will be output in a new - * EXACTish node, and join_exact() will later combine them. - * In the case of the 'ss' sequence, which is more common - * and more easily checked, some look-ahead is done to - * save time by ruling-out some false alarms */ - switch (ender) { - default: - latest_char_state = generic_char; - break; - case 's': - case 'S': - case 0x17F: /* LATIN SMALL LETTER LONG S */ - if (AT_LEAST_UNI_SEMANTICS) { - if (latest_char_state == char_s) { /* 'ss' */ - ender = LATIN_SMALL_LETTER_SHARP_S; - goto do_tricky; - } - else if (p < RExC_end) { - - /* Look-ahead at the next character. If it - * is also an s, we handle as a sharp s - * tricky regnode. */ - if (*p == 's' || *p == 'S') { - - /* But first flush anything in the - * EXACTish buffer */ - if (len != 0) { - p = oldp; - goto loopdone; - } - p++; /* Account for swallowing this - 's' up */ - ender = LATIN_SMALL_LETTER_SHARP_S; - goto do_tricky; - } - /* Here, the next character is not a - * literal 's', but still could - * evaluate to one if part of a \o{}, - * \x or \OCTAL-DIGIT. The minimum - * length required for that is 4, eg - * \x53 or \123 */ - else if (*p == '\\' - && p < RExC_end - 4 - && (isDIGIT(*(p + 1)) - || *(p + 1) == 'x' - || *(p + 1) == 'o' )) - { - - /* Here, it could be an 's', too much - * bother to figure it out here. Flush - * the buffer if any; when come back - * here, set the state so know that the - * previous char was an 's' */ - if (len != 0) { - latest_char_state = generic_char; - p = oldp; - goto loopdone; - } - latest_char_state = char_s; - break; - } - } - } - - /* Here, can't be an 'ss' sequence, or at least not - * one that could fold to/from the sharp ss */ - latest_char_state = generic_char; - break; - case 0x03C5: /* First char in upsilon series */ - case 0x03A5: /* Also capital UPSILON, which folds to - 03C5, and hence exhibits the same - problem */ - if (p < RExC_end - 4) { /* Need >= 4 bytes left */ - latest_char_state = upsilon_1; - if (len != 0) { - p = oldp; - goto loopdone; - } - } - else { - latest_char_state = generic_char; - } - break; - case 0x03B9: /* First char in iota series */ - case 0x0399: /* Also capital IOTA */ - case 0x1FBE: /* GREEK PROSGEGRAMMENI folds to 3B9 */ - case 0x0345: /* COMBINING GREEK YPOGEGRAMMENI folds - to 3B9 */ - if (p < RExC_end - 4) { - latest_char_state = iota_1; - if (len != 0) { - p = oldp; - goto loopdone; - } - } - else { - latest_char_state = generic_char; - } - break; - case 0x0308: - if (latest_char_state == upsilon_1) { - latest_char_state = upsilon_2; - } - else if (latest_char_state == iota_1) { - latest_char_state = iota_2; - } - else { - latest_char_state = generic_char; - } - break; - case 0x301: - if (latest_char_state == upsilon_2) { - ender = GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS; - goto do_tricky; - } - else if (latest_char_state == iota_2) { - ender = GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS; - goto do_tricky; - } - latest_char_state = generic_char; - break; - - /* These are the tricky fold characters. Flush any - * buffer first. (When adding to this list, also should - * add them to fold_grind.t to make sure get tested) */ - case GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS: - case GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS: - case LATIN_SMALL_LETTER_SHARP_S: - case LATIN_CAPITAL_LETTER_SHARP_S: - case 0x1FD3: /* GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA */ - case 0x1FE3: /* GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA */ - if (len != 0) { - p = oldp; - goto loopdone; - } - /* FALL THROUGH */ - do_tricky: { - char* const oldregxend = RExC_end; - U8 tmpbuf[UTF8_MAXBYTES+1]; - - /* Here, we know we need to generate a special - * regnode, and 'ender' contains the tricky - * character. What's done is to pretend it's in a - * [bracketed] class, and let the code that deals - * with those handle it, as that code has all the - * intelligence necessary. First save the current - * parse state, get rid of the already allocated - * but empty EXACT node that the ANYOFV node will - * replace, and point the parse to a buffer which - * we fill with the character we want the regclass - * code to think is being parsed */ - RExC_emit = orig_emit; - RExC_parse = (char *) tmpbuf; - if (UTF) { - U8 *d = uvchr_to_utf8(tmpbuf, ender); - *d = '\0'; - RExC_end = (char *) d; - } - else { /* ender above 255 already excluded */ - tmpbuf[0] = (U8) ender; - tmpbuf[1] = '\0'; - RExC_end = RExC_parse + 1; - } - - ret = regclass(pRExC_state,depth+1); - - /* Here, have parsed the buffer. Reset the parse to - * the actual input, and return */ - RExC_end = oldregxend; - RExC_parse = p - 1; - - Set_Node_Offset(ret, RExC_parse); - Set_Node_Cur_Length(ret); - nextchar(pRExC_state); - *flagp |= HASWIDTH|SIMPLE; - return ret; - } - } - } - + is_exactfu_sharp_s = (node_type == EXACTFU && ender == LATIN_SMALL_LETTER_SHARP_S); if ( RExC_flags & RXf_PMf_EXTENDED) p = regwhite( pRExC_state, p ); - if (UTF && FOLD) { + if ((UTF && FOLD) || is_exactfu_sharp_s) { /* Prime the casefolded buffer. Locale rules, which apply * only to code points < 256, aren't known until execution, * so for them, just output the original character using @@ -9322,7 +9222,7 @@ tryagain: if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */ if (len) p = oldp; - else if (UTF) { + else if (UTF || is_exactfu_sharp_s) { if (FOLD) { /* Emit all the Unicode characters. */ STRLEN numlen; @@ -9358,7 +9258,7 @@ tryagain: } break; } - if (UTF) { + if (UTF || is_exactfu_sharp_s) { if (FOLD) { /* Emit all the Unicode characters. */ STRLEN numlen; @@ -11188,6 +11088,7 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, regnode * const temp = regnext(scan); #ifdef EXPERIMENTAL_INPLACESCAN if (PL_regkind[OP(scan)] == EXACT) + if (join_exact(pRExC_state,scan,&min,1,val,depth+1)) return EXACT; #endif @@ -11197,6 +11098,8 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, case EXACTF: case EXACTFA: case EXACTFU: + case EXACTFU_SS: + case EXACTFU_NO_TRIE: case EXACTFL: if( exact == PSEUDO ) exact= OP(scan); @@ -11521,8 +11424,6 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ])))); } else if (k == LOGICAL) Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */ - else if (k == FOLDCHAR) - Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) ); else if (k == ANYOF) { int i, rangestart = -1; const U8 flags = ANYOF_FLAGS(o); |