diff options
-rw-r--r-- | embed.fnc | 2 | ||||
-rw-r--r-- | pp_hot.c | 8 | ||||
-rw-r--r-- | proto.h | 4 | ||||
-rw-r--r-- | regcomp.c | 535 | ||||
-rw-r--r-- | regcomp.h | 1 | ||||
-rw-r--r-- | regcomp.sym | 6 | ||||
-rw-r--r-- | regexec.c | 48 | ||||
-rw-r--r-- | regnodes.h | 260 | ||||
-rw-r--r-- | t/re/re_tests | 6 |
9 files changed, 391 insertions, 479 deletions
@@ -1889,7 +1889,7 @@ Es |void |regtail |NN struct RExC_state_t *pRExC_state \ Es |SV * |reg_scan_name |NN struct RExC_state_t *pRExC_state \ |U32 flags Es |U32 |join_exact |NN struct RExC_state_t *pRExC_state \ - |NN regnode *scan|NN I32 *min|U32 flags|NULLOK regnode *val|U32 depth + |NN regnode *scan|NN I32 *min_change|U32 flags|NULLOK regnode *val|U32 depth EsRn |char * |regwhite |NN struct RExC_state_t *pRExC_state \ |NN char *p Es |char * |nextchar |NN struct RExC_state_t *pRExC_state @@ -1290,8 +1290,10 @@ PP(pp_match) rx = PM_GETRE(pm); } - if (RX_MINLEN(rx) > (I32)len) + if (RX_MINLEN(rx) > (I32)len) { + DEBUG_r(PerlIO_printf(Perl_debug_log, "Regex match must fail due to min length, so not tried\n")); goto failure; + } truebase = t = s; @@ -1330,8 +1332,10 @@ PP(pp_match) play_it_again: if (global && RX_OFFS(rx)[0].start != -1) { t = s = RX_OFFS(rx)[0].end + truebase - RX_GOFS(rx); - if ((s + RX_MINLEN(rx)) > strend || s < truebase) + if ((s + RX_MINLEN(rx)) > strend || s < truebase) { + DEBUG_r(PerlIO_printf(Perl_debug_log, "Regex match must fail, so not tried\n")); goto nope; + } if (update_minmatch++) minmatch = had_zerolen; } @@ -6350,12 +6350,12 @@ PERL_STATIC_INLINE void S_invlist_trim(pTHX_ SV* const invlist) #define PERL_ARGS_ASSERT_INVLIST_TRIM \ assert(invlist) -STATIC U32 S_join_exact(pTHX_ struct RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags, regnode *val, U32 depth) +STATIC U32 S_join_exact(pTHX_ struct RExC_state_t *pRExC_state, regnode *scan, I32 *min_change, U32 flags, regnode *val, U32 depth) __attribute__nonnull__(pTHX_1) __attribute__nonnull__(pTHX_2) __attribute__nonnull__(pTHX_3); #define PERL_ARGS_ASSERT_JOIN_EXACT \ - assert(pRExC_state); assert(scan); assert(min) + assert(pRExC_state); assert(scan); assert(min_change) STATIC I32 S_make_trie(pTHX_ struct RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth) __attribute__nonnull__(pTHX_1) @@ 
-2505,21 +2505,39 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode }}); +/* The below joins as many adjacent EXACTish nodes as possible into a single + * one, and looks for problematic sequences of characters whose folds vs. + * non-folds have sufficiently different lengths, that the optimizer would be + * fooled into rejecting legitimate matches of them, and the trie construction + * code can't cope with them. The joining is only done if: + * 1) there is room in the current conglomerated node to entirely contain the + * next one. + * 2) they are the exact same node type + * + * The adjacent nodes actually may be separated by NOTHING kind nodes. + * + * If there are problematic code sequences, *min_change is set to the delta + * that the minimum size of the node can be off from its actual size. + * + * And, the node type of the result is changed to reflect that it contains + * these sequences + */ - - -#define JOIN_EXACT(scan,min,flags) \ +#define JOIN_EXACT(scan,min_change,flags) \ if (PL_regkind[OP(scan)] == EXACT) \ - join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1) + join_exact(pRExC_state,(scan),(min_change),(flags),NULL,depth+1) STATIC U32 -S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) { +S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min_change, U32 flags,regnode *val, U32 depth) { /* Merge several consecutive EXACTish nodes into one. 
*/ regnode *n = regnext(scan); U32 stringok = 1; regnode *next = scan + NODE_SZ_STR(scan); U32 merged = 0; U32 stopnow = 0; + char *s, *t; + char * const s0 = STRING(scan); + char * const s_end = s0 + STR_LEN(scan); #ifdef DEBUGGING regnode *stop = scan; GET_RE_DEBUG_FLAGS_DECL; @@ -2533,13 +2551,20 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags PERL_UNUSED_ARG(val); #endif DEBUG_PEEP("join",scan,depth); + + /* These opcode should only be on output from this routine, never on input + */ + assert(OP(scan) != EXACTFU_NO_TRIE); + assert(OP(scan) != EXACTFU_SS); - /* Skip NOTHING, merge EXACT*. */ - while (n && - ( PL_regkind[OP(n)] == NOTHING || - (stringok && (OP(n) == OP(scan)))) + /* Look through the subsequent nodes in the chain. Skip NOTHING, merge + * EXACT ones that are mergeable to the current one. */ + while (n + && (PL_regkind[OP(n)] == NOTHING + || (stringok && OP(n) == OP(scan))) && NEXT_OFF(n) - && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) { + && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) + { if (OP(n) == TAIL || n > next) stringok = 0; @@ -2563,7 +2588,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags DEBUG_PEEP("merg",n,depth); merged++; - NEXT_OFF(scan) += NEXT_OFF(n); + NEXT_OFF(scan) += NEXT_OFF(n); STR_LEN(scan) += STR_LEN(n); next = n + NODE_SZ_STR(n); /* Now we can overwrite *n : */ @@ -2588,65 +2613,136 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags } #endif } -#define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS 0x0390 -#define IOTA_D_T GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS -#define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS 0x03B0 -#define UPSILON_D_T GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS - if (UTF - && ( OP(scan) == EXACTF || OP(scan) == EXACTFU || OP(scan) == EXACTFA) - && ( STR_LEN(scan) >= 6 ) ) - { - /* - Two problematic code points in Unicode casefolding of EXACT nodes: - - U+0390 - GREEK SMALL 
LETTER IOTA WITH DIALYTIKA AND TONOS - U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS - - which casefold to - - Unicode UTF-8 - - U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81 - U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81 - - This means that in case-insensitive matching (or "loose matching", - as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte - length of the above casefolded versions) can match a target string - of length two (the byte length of UTF-8 encoded U+0390 or U+03B0). - This would rather mess up the minimum length computation. - - What we'll do is to look for the tail four bytes, and then peek - at the preceding two bytes to see whether we need to decrease - the minimum length by four (six minus two). - - Thanks to the design of UTF-8, there cannot be false matches: - A sequence of valid UTF-8 bytes cannot be a subsequence of - another valid sequence of UTF-8 bytes. - - */ - char * const s0 = STRING(scan), *s, *t; - char * const s1 = s0 + STR_LEN(scan) - 1; - char * const s2 = s1 - 4; + *min_change = 0; + + /* Here, all the adjacent mergeable EXACTish nodes have been merged. We + * can now analyze for sequences of problematic code points. (Prior to + * this final joining, sequences could have been split over boundaries, and + * hence missed). The sequences only happen in folding */ + if (OP(scan) != EXACT) { + + /* There are three code points in Unicode whose folded lengths differ so + * much from the un-folded lengths that it causes problems for the + * optimizer and trie construction. Why only these are problematic, and + * not others is something I (khw) do not understand. And new versions of + * Unicode might add more such code points. Hopefully the logic in + * fold_grind.t that figures out what to test (in part by verifying that + * each size-combination gets tested) will catch any that do come along, so + * they can be added to the special handling below. 
The chances of this + * are actually rather small, as most, if not all, of the scripts that have + * casefolding have already been encoded by Unicode, as well as those from + * pre-existing standards that Unicode has encoded for backwards + * compatibility, which would be the new ones that might have enough + * weirdness to qualify for this */ + + /* First we look at the sequences that can occur only in UTF-8 strings. + * The sequences are of length 6 */ + if (UTF && STR_LEN(scan) >= 6) { + + /* Two problematic code points in Unicode casefolding of EXACT + * nodes: + * + * U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS + * U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS + * + * which casefold to + * + * Unicode UTF-8 + * + * U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81 + * U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81 + * + * This means that in case-insensitive matching (or "loose + * matching", as Unicode calls it), an EXACTF of length six (the + * UTF-8 encoded byte length of the above casefolded versions) can + * match a target string of length two (the byte length of UTF-8 + * encoded U+0390 or U+03B0). This would rather mess up the + * minimum length computation. (there are other code points that + * also fold to these two sequences, but the delta is smaller) + * + * What we'll do is to look for the tail four bytes, and then peek + * at the preceding two bytes to see whether we need to decrease + * the minimum length by four (six minus two). + * + * Thanks to the design of UTF-8, there cannot be false matches: + * A sequence of valid UTF-8 bytes cannot be a subsequence of + * another valid sequence of UTF-8 bytes. 
*/ + #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */ - const char t0[] = "\xaf\x49\xaf\x42"; -#else - const char t0[] = "\xcc\x88\xcc\x81"; -#endif - const char * const t1 = t0 + 3; - - for (s = s0 + 2; - s < s2 && (t = ninstr(s, s1, t0, t1)); - s = t + 4) { -#ifdef EBCDIC - if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) || - ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5)) + const char U390_first_byte = '\xb4'; + const char U390_2nd_byte = '\x68'; + const char U3B0_first_byte = '\xb5'; + const char U3B0_2nd_byte = '\x46'; + const char tail[] = "\xaf\x49\xaf\x42"; #else - if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) || - ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF)) + const char U390_first_byte = '\xce'; + const char U390_2nd_byte = '\xb9'; + const char U3B0_first_byte = '\xcf'; + const char U3B0_2nd_byte = '\x85'; + const char tail[] = "\xcc\x88\xcc\x81"; #endif - *min -= 4; - } + const STRLEN tail_len = sizeof(tail) - 1; + for (s = s0 + 2; + s <= s_end - tail_len + && (t = ninstr(s, s_end, tail, tail + tail_len)); + s = t + tail_len) + { + if ((t[-1] == U390_2nd_byte && t[-2] == U390_first_byte) + || (t[-1] == U3B0_2nd_byte && t[-2] == U3B0_first_byte)) + { + *min_change -= 4; + + /* This can't currently be handled by tries, so change the + * node type to indicate this. */ + if (OP(scan) == EXACTFU) { + OP(scan) = EXACTFU_NO_TRIE; + } + } + } + } + + /* The third problematic sequence is 'ss', which can match just the + * single byte LATIN SMALL LETTER SHARP S, and it can do it in both + * non- and UTF-8. Code elsewhere in this file makes sure, however, + * that the sharp s gets folded to 'ss' under Unicode rules even if not + * UTF-8. */ + if (STR_LEN(scan) >= 2 + && (OP(scan) == EXACTFU + || OP(scan) == EXACTFU_NO_TRIE /* The code above could have + set to this node type */ + || OP(scan) == EXACTF)) + { + /* The string will be folded to 'ss' if it's in UTF-8, but it could + * be 'Ss', etc when not. 
We could have different code to handle + * the two cases, but this is not necessary since both S and s are + * invariants under UTF-8; and not worth it, especially because we + * can use just one test each time through the loop (plus a mask) + * This is because on both EBCDIC and ASCII machines, an 'S' and 's' + * differ by a single bit. On ASCII they are 32 apart; on EBCDIC, + * they are 64. This uses an exclusive 'or' to find that bit and + * then inverts it to form a mask, with just a single 0, in the bit + * position where 'S' and 's' differ. */ + const char S_or_s_mask = ~ ('S' ^ 's'); + const char s_masked = 's' & S_or_s_mask; + + for (s = s0; s < s_end - 1; s++) { + if (((*s & S_or_s_mask) == s_masked) + && ((*(s+1) & S_or_s_mask) == s_masked)) + { + s++; + *min_change -= 1; + + /* EXACTFU_SS also isn't trie'able, so don't have to + * preserve EXACTFU_NO_TRIE. EXACTF is also not trie'able, + * and because we essentially punt the optimizations in its + * case, we don't need to indicate that it has an ss */ + if (OP(scan) == EXACTFU || OP(scan) == EXACTFU_NO_TRIE) { + OP(scan) = EXACTFU_SS; + } + } + } + } } #ifdef DEBUGGING @@ -2762,10 +2858,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, fake_study_recurse: while ( scan && OP(scan) != END && scan < last ){ + I32 min_change; /* Peephole optimizer: */ DEBUG_STUDYDATA("Peep:", data,depth); DEBUG_PEEP("Peep",scan,depth); - JOIN_EXACT(scan,&min,0); + JOIN_EXACT(scan,&min_change,0); /* Follow the next-chain of the current node and optimize away all the NOTHINGs from it. */ @@ -3059,8 +3156,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, in below to fully enable trie logic. 
#define TRIE_TYPE_IS_SAFE 1 - */ + #define TRIE_TYPE_IS_SAFE ((UTF && optype == EXACTFU) || optype==EXACT) if ( last && TRIE_TYPE_IS_SAFE ) { @@ -3279,9 +3376,23 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, l = utf8_length(s, s + l); uc = utf8_to_uvchr(s, NULL); } - min += l; - if (flags & SCF_DO_SUBSTR) + else if (OP(scan) == EXACTF) { + if (memchr(STRING(scan), LATIN_SMALL_LETTER_SHARP_S, l)) { + RExC_seen |= REG_SEEN_EXACTF_SHARP_S; + } + } + min += l + min_change; + if (min < 0) { + min = 0; + } + delta += abs(min_change); + if (flags & SCF_DO_SUBSTR) { data->pos_min += l; + data->pos_delta += abs(min_change); + if (min_change) { + data->longest = &(data->longest_float); + } + } if (flags & SCF_DO_STCLASS_AND) { /* Check whether it is compatible with what we know already! */ int compat = 1; @@ -3311,6 +3422,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, * the full latin1 fold. (Can't do this for locale, * because not known until runtime */ ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]); + if (uc == 's' || uc == 'S') { + ANYOF_BITMAP_SET(data->start_class, LATIN_SMALL_LETTER_SHARP_S); + } + else if (uc == LATIN_SMALL_LETTER_SHARP_S) { + ANYOF_BITMAP_SET(data->start_class, 's'); + ANYOF_BITMAP_SET(data->start_class, 'S'); + } } } else if (uc >= 0x100) { @@ -3335,6 +3453,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, * run-time */ ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]); + if (uc == 's' || uc == 'S') { + ANYOF_BITMAP_SET(data->start_class, LATIN_SMALL_LETTER_SHARP_S); + } + else if (uc == LATIN_SMALL_LETTER_SHARP_S) { + ANYOF_BITMAP_SET(data->start_class, 's'); + ANYOF_BITMAP_SET(data->start_class, 'S'); + } } } data->start_class->flags &= ~ANYOF_EOS; @@ -3740,18 +3865,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->longest = &(data->longest_float); } } - else if (OP(scan) == FOLDCHAR) { - int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 
1 : 2; - flags &= ~SCF_DO_STCLASS; - min += 1; - delta += d; - if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ - data->pos_min += 1; - data->pos_delta += d; - data->longest = &(data->longest_float); - } - } else if (REGNODE_SIMPLE(OP(scan))) { int value = 0; @@ -5067,9 +5180,10 @@ reStudy: { I32 t,ml; - if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */ - && data.offset_fixed == data.offset_float_min - && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)) + if ((RExC_seen & REG_SEEN_EXACTF_SHARP_S) + || (SvCUR(data.longest_fixed) /* ok to leave SvCUR */ + && data.offset_fixed == data.offset_float_min + && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))) goto remove_float; /* As in (a)+. */ /* copy the information about the longest float from the reg_scan_data @@ -5112,10 +5226,11 @@ reStudy: Be careful. */ longest_fixed_length = CHR_SVLEN(data.longest_fixed); - if (longest_fixed_length - || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */ - && (!(data.flags & SF_FIX_BEFORE_MEOL) - || (RExC_flags & RXf_PMf_MULTILINE)))) + if (! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) + && (longest_fixed_length + || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */ + && (!(data.flags & SF_FIX_BEFORE_MEOL) + || (RExC_flags & RXf_PMf_MULTILINE)))) ) { I32 t,ml; @@ -8820,15 +8935,6 @@ tryagain: RExC_parse++; defchar: { - typedef enum { - generic_char = 0, - char_s, - upsilon_1, - upsilon_2, - iota_1, - iota_2, - } char_state; - char_state latest_char_state = generic_char; register STRLEN len; register UV ender; register char *p; @@ -8836,22 +8942,25 @@ tryagain: STRLEN foldlen; U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf; regnode * orig_emit; + int node_type; + bool is_exactfu_sharp_s; ender = 0; orig_emit = RExC_emit; /* Save the original output node position in case we need to output a different node type */ - ret = reg_node(pRExC_state, - (U8) ((! FOLD) ? EXACT + node_type = (U8) ((! 
FOLD) ? EXACT : (LOC) ? EXACTFL : (MORE_ASCII_RESTRICTED) ? EXACTFA : (AT_LEAST_UNI_SEMANTICS) ? EXACTFU - : EXACTF) - ); + : EXACTF); + ret = reg_node(pRExC_state, node_type); s = STRING(ret); + + /* By going only up to 127 when the maximum storable is 255, we don't have to worry about expansion, not being in the last character in the fold */ for (len = 0, p = RExC_parse - 1; len < 127 && p < RExC_end; len++) @@ -9047,219 +9156,10 @@ tryagain: break; } /* End of switch on the literal */ - /* Certain characters are problematic because their folded - * length is so different from their original length that it - * isn't handleable by the optimizer. They are therefore not - * placed in an EXACTish node; and are here handled specially. - * (Even if the optimizer handled LATIN_SMALL_LETTER_SHARP_S, - * putting it in a special node keeps regexec from having to - * deal with a non-utf8 multi-char fold */ - if (FOLD - && (ender > 255 || (! MORE_ASCII_RESTRICTED && ! LOC))) - { - /* We look for either side of the fold. For example \xDF - * folds to 'ss'. We look for both the single character - * \xDF and the sequence 'ss'. When we find something that - * could be one of those, we stop and flush whatever we - * have output so far into the EXACTish node that was being - * built. Then restore the input pointer to what it was. - * regatom will return that EXACT node, and will be called - * again, positioned so the first character is the one in - * question, which we return in a different node type. - * The multi-char folds are a sequence, so the occurrence - * of the first character in that sequence doesn't - * necessarily mean that what follows is the rest of the - * sequence. We keep track of that with a state machine, - * with the state being set to the latest character - * processed before the current one. 
Most characters will - * set the state to 0, but if one occurs that is part of a - * potential tricky fold sequence, the state is set to that - * character, and the next loop iteration sees if the state - * should progress towards the final folded-from character, - * or if it was a false alarm. If it turns out to be a - * false alarm, the character(s) will be output in a new - * EXACTish node, and join_exact() will later combine them. - * In the case of the 'ss' sequence, which is more common - * and more easily checked, some look-ahead is done to - * save time by ruling-out some false alarms */ - switch (ender) { - default: - latest_char_state = generic_char; - break; - case 's': - case 'S': - case 0x17F: /* LATIN SMALL LETTER LONG S */ - if (AT_LEAST_UNI_SEMANTICS) { - if (latest_char_state == char_s) { /* 'ss' */ - ender = LATIN_SMALL_LETTER_SHARP_S; - goto do_tricky; - } - else if (p < RExC_end) { - - /* Look-ahead at the next character. If it - * is also an s, we handle as a sharp s - * tricky regnode. */ - if (*p == 's' || *p == 'S') { - - /* But first flush anything in the - * EXACTish buffer */ - if (len != 0) { - p = oldp; - goto loopdone; - } - p++; /* Account for swallowing this - 's' up */ - ender = LATIN_SMALL_LETTER_SHARP_S; - goto do_tricky; - } - /* Here, the next character is not a - * literal 's', but still could - * evaluate to one if part of a \o{}, - * \x or \OCTAL-DIGIT. The minimum - * length required for that is 4, eg - * \x53 or \123 */ - else if (*p == '\\' - && p < RExC_end - 4 - && (isDIGIT(*(p + 1)) - || *(p + 1) == 'x' - || *(p + 1) == 'o' )) - { - - /* Here, it could be an 's', too much - * bother to figure it out here. 
Flush - * the buffer if any; when come back - * here, set the state so know that the - * previous char was an 's' */ - if (len != 0) { - latest_char_state = generic_char; - p = oldp; - goto loopdone; - } - latest_char_state = char_s; - break; - } - } - } - - /* Here, can't be an 'ss' sequence, or at least not - * one that could fold to/from the sharp ss */ - latest_char_state = generic_char; - break; - case 0x03C5: /* First char in upsilon series */ - case 0x03A5: /* Also capital UPSILON, which folds to - 03C5, and hence exhibits the same - problem */ - if (p < RExC_end - 4) { /* Need >= 4 bytes left */ - latest_char_state = upsilon_1; - if (len != 0) { - p = oldp; - goto loopdone; - } - } - else { - latest_char_state = generic_char; - } - break; - case 0x03B9: /* First char in iota series */ - case 0x0399: /* Also capital IOTA */ - case 0x1FBE: /* GREEK PROSGEGRAMMENI folds to 3B9 */ - case 0x0345: /* COMBINING GREEK YPOGEGRAMMENI folds - to 3B9 */ - if (p < RExC_end - 4) { - latest_char_state = iota_1; - if (len != 0) { - p = oldp; - goto loopdone; - } - } - else { - latest_char_state = generic_char; - } - break; - case 0x0308: - if (latest_char_state == upsilon_1) { - latest_char_state = upsilon_2; - } - else if (latest_char_state == iota_1) { - latest_char_state = iota_2; - } - else { - latest_char_state = generic_char; - } - break; - case 0x301: - if (latest_char_state == upsilon_2) { - ender = GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS; - goto do_tricky; - } - else if (latest_char_state == iota_2) { - ender = GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS; - goto do_tricky; - } - latest_char_state = generic_char; - break; - - /* These are the tricky fold characters. Flush any - * buffer first. 
(When adding to this list, also should - * add them to fold_grind.t to make sure get tested) */ - case GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS: - case GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS: - case LATIN_SMALL_LETTER_SHARP_S: - case LATIN_CAPITAL_LETTER_SHARP_S: - case 0x1FD3: /* GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA */ - case 0x1FE3: /* GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA */ - if (len != 0) { - p = oldp; - goto loopdone; - } - /* FALL THROUGH */ - do_tricky: { - char* const oldregxend = RExC_end; - U8 tmpbuf[UTF8_MAXBYTES+1]; - - /* Here, we know we need to generate a special - * regnode, and 'ender' contains the tricky - * character. What's done is to pretend it's in a - * [bracketed] class, and let the code that deals - * with those handle it, as that code has all the - * intelligence necessary. First save the current - * parse state, get rid of the already allocated - * but empty EXACT node that the ANYOFV node will - * replace, and point the parse to a buffer which - * we fill with the character we want the regclass - * code to think is being parsed */ - RExC_emit = orig_emit; - RExC_parse = (char *) tmpbuf; - if (UTF) { - U8 *d = uvchr_to_utf8(tmpbuf, ender); - *d = '\0'; - RExC_end = (char *) d; - } - else { /* ender above 255 already excluded */ - tmpbuf[0] = (U8) ender; - tmpbuf[1] = '\0'; - RExC_end = RExC_parse + 1; - } - - ret = regclass(pRExC_state,depth+1); - - /* Here, have parsed the buffer. Reset the parse to - * the actual input, and return */ - RExC_end = oldregxend; - RExC_parse = p - 1; - - Set_Node_Offset(ret, RExC_parse); - Set_Node_Cur_Length(ret); - nextchar(pRExC_state); - *flagp |= HASWIDTH|SIMPLE; - return ret; - } - } - } - + is_exactfu_sharp_s = (node_type == EXACTFU && ender == LATIN_SMALL_LETTER_SHARP_S); if ( RExC_flags & RXf_PMf_EXTENDED) p = regwhite( pRExC_state, p ); - if (UTF && FOLD) { + if ((UTF && FOLD) || is_exactfu_sharp_s) { /* Prime the casefolded buffer. 
Locale rules, which apply * only to code points < 256, aren't known until execution, * so for them, just output the original character using @@ -9322,7 +9222,7 @@ tryagain: if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */ if (len) p = oldp; - else if (UTF) { + else if (UTF || is_exactfu_sharp_s) { if (FOLD) { /* Emit all the Unicode characters. */ STRLEN numlen; @@ -9358,7 +9258,7 @@ tryagain: } break; } - if (UTF) { + if (UTF || is_exactfu_sharp_s) { if (FOLD) { /* Emit all the Unicode characters. */ STRLEN numlen; @@ -11188,6 +11088,7 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, regnode * const temp = regnext(scan); #ifdef EXPERIMENTAL_INPLACESCAN if (PL_regkind[OP(scan)] == EXACT) + if (join_exact(pRExC_state,scan,&min,1,val,depth+1)) return EXACT; #endif @@ -11197,6 +11098,8 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, case EXACTF: case EXACTFA: case EXACTFU: + case EXACTFU_SS: + case EXACTFU_NO_TRIE: case EXACTFL: if( exact == PSEUDO ) exact= OP(scan); @@ -11521,8 +11424,6 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ])))); } else if (k == LOGICAL) Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */ - else if (k == FOLDCHAR) - Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) ); else if (k == ANYOF) { int i, rangestart = -1; const U8 flags = ANYOF_FLAGS(o); @@ -492,6 +492,7 @@ struct regnode_charclass_class { #define REG_SEEN_VERBARG 0x00000080 #define REG_SEEN_CUTGROUP 0x00000100 #define REG_SEEN_RUN_ON_COMMENT 0x00000200 +#define REG_SEEN_EXACTF_SHARP_S 0x00000400 START_EXTERN_C diff --git a/regcomp.sym b/regcomp.sym index 23b9ef2181..69366d7e87 100644 --- a/regcomp.sym +++ b/regcomp.sym @@ -97,7 +97,9 @@ BACK BACK, no 0 V ; Match "", "next" ptr points backward. EXACT EXACT, str ; Match this string (preceded by length). 
EXACTF EXACT, str ; Match this non-UTF-8 string (not guaranteed to be folded) using /id rules (w/len). EXACTFL EXACT, str ; Match this string (not guaranteed to be folded) using /il rules (w/len). -EXACTFU EXACT, str ; Match this string (folded iff in UTF-8) using /iu rules (w/len). +EXACTFU EXACT, str ; Match this string (folded iff in UTF-8, length in folding doesn't change if not in UTF-8) using /iu rules (w/len). +EXACTFU_SS EXACT, str ; Match this string (folded iff in UTF-8, length in folding may change even if not in UTF-8) using /iu rules (w/len). +EXACTFU_NO_TRIE EXACT, str ; Match this folded UTF-8 string using /iu rules, but don't generate a trie for it EXACTFA EXACT, str ; Match this string (not guaranteed to be folded) using /iaa rules (w/len). #*Do nothing types @@ -214,10 +216,8 @@ VERTWS VERTWS, none 0 S ; vertical whitespace (Perl 6) NVERTWS NVERTWS, none 0 S ; not vertical whitespace (Perl 6) HORIZWS HORIZWS, none 0 S ; horizontal whitespace (Perl 6) NHORIZWS NHORIZWS, none 0 S ; not horizontal whitespace (Perl 6) - FOLDCHAR FOLDCHAR, codepoint 1 ; codepoint with tricky case folding properties. - # NEW STUFF SOMEWHERE ABOVE THIS LINE ################################################################################ @@ -303,13 +303,13 @@ /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so we don't need this definition. */ #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF ) -#define IS_TEXTF(rn) ( (OP(rn)==EXACTFU || OP(rn)==EXACTFA || OP(rn)==EXACTF) || OP(rn)==REFF || OP(rn)==NREFF ) +#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_NO_TRIE || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF ) #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL ) #else /* ... so we use this as its faster. 
*/ #define IS_TEXT(rn) ( OP(rn)==EXACT ) -#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn) == EXACTFA) +#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_NO_TRIE || OP(rn) == EXACTFA) #define IS_TEXTF(rn) ( OP(rn)==EXACTF ) #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL ) @@ -1483,6 +1483,13 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, folder = foldEQ_locale; goto do_exactf_non_utf8; + case EXACTFU_SS: + if (UTF_PATTERN) { + utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED; + } + goto do_exactf_utf8; + + case EXACTFU_NO_TRIE: case EXACTFU: if (UTF_PATTERN || utf8_target) { utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0; @@ -3662,6 +3669,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) fold_utf8_flags = FOLDEQ_UTF8_LOCALE; goto do_exactf; + case EXACTFU_SS: + case EXACTFU_NO_TRIE: case EXACTFU: folder = foldEQ_latin1; fold_array = PL_fold_latin1; @@ -3683,8 +3692,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) s = STRING(scan); ln = STR_LEN(scan); - if (utf8_target || UTF_PATTERN) { - /* Either target or the pattern are utf8. */ + if (utf8_target || UTF_PATTERN || state_num == EXACTFU_SS) { + /* Either target or the pattern are utf8, or has the issue where + * the fold lengths may differ. 
*/ const char * const l = locinput; char *e = PL_regeol; @@ -5072,6 +5082,8 @@ NULL switch (OP(text_node)) { case EXACTF: ST.c2 = PL_fold[ST.c1]; break; case EXACTFA: + case EXACTFU_SS: + case EXACTFU_NO_TRIE: case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break; case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break; default: ST.c2 = ST.c1; @@ -5226,6 +5238,8 @@ NULL switch (OP(text_node)) { case EXACTF: ST.c2 = PL_fold[ST.c1]; break; case EXACTFA: + case EXACTFU_SS: + case EXACTFU_NO_TRIE: case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break; case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break; default: ST.c2 = ST.c1; break; @@ -5694,27 +5708,6 @@ NULL sayNO; /* NOTREACHED */ #undef ST - case FOLDCHAR: - n = ARG(scan); - if ( n == (U32)what_len_TRICKYFOLD(locinput,utf8_target,ln) ) { - locinput += ln; - } else if ( LATIN_SMALL_LETTER_SHARP_S == n && !utf8_target && !UTF_PATTERN ) { - sayNO; - } else { - U8 folded[UTF8_MAXBYTES_CASE+1]; - STRLEN foldlen; - const char * const l = locinput; - char *e = PL_regeol; - to_uni_fold(n, folded, &foldlen); - - if (! foldEQ_utf8((const char*) folded, 0, foldlen, 1, - l, &e, 0, utf8_target)) { - sayNO; - } - locinput = e; - } - nextchr = UCHARAT(locinput); - break; case LNBREAK: if ((n=is_LNBREAK(locinput,utf8_target))) { locinput += n; @@ -6039,6 +6032,8 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) utf8_flags = 0; goto do_exactf; + case EXACTFU_SS: + case EXACTFU_NO_TRIE: case EXACTFU: utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0; @@ -6049,7 +6044,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) c = (U8)*STRING(p); assert(! 
UTF_PATTERN || UNI_IS_INVARIANT(c)); - if (utf8_target) { /* Use full Unicode fold matching */ + if (utf8_target || OP(p) == EXACTFU_SS) { /* Use full Unicode fold matching */ char *tmpeol = loceol; while (hardcount < max && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target, @@ -6080,6 +6075,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) switch (OP(p)) { case EXACTF: folded = PL_fold[c]; break; case EXACTFA: + case EXACTFU_NO_TRIE: case EXACTFU: folded = PL_fold_latin1[c]; break; case EXACTFL: folded = PL_fold_locale[c]; break; default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p)); diff --git a/regnodes.h b/regnodes.h index dccf2b7a99..785ff1c9ca 100644 --- a/regnodes.h +++ b/regnodes.h @@ -6,8 +6,8 @@ /* Regops and State definitions */ -#define REGNODE_MAX 111 -#define REGMATCH_STATE_MAX 151 +#define REGNODE_MAX 113 +#define REGMATCH_STATE_MAX 153 #define END 0 /* 0000 End of program. */ #define SUCCEED 1 /* 0x01 Return from a subroutine, basically. */ @@ -60,67 +60,69 @@ #define EXACT 48 /* 0x30 Match this string (preceded by length). */ #define EXACTF 49 /* 0x31 Match this non-UTF-8 string (not guaranteed to be folded) using /id rules (w/len). */ #define EXACTFL 50 /* 0x32 Match this string (not guaranteed to be folded) using /il rules (w/len). */ -#define EXACTFU 51 /* 0x33 Match this string (folded iff in UTF-8) using /iu rules (w/len). */ -#define EXACTFA 52 /* 0x34 Match this string (not guaranteed to be folded) using /iaa rules (w/len). */ -#define NOTHING 53 /* 0x35 Match empty string. */ -#define TAIL 54 /* 0x36 Match empty string. Can jump here from outside. */ -#define STAR 55 /* 0x37 Match this (simple) thing 0 or more times. */ -#define PLUS 56 /* 0x38 Match this (simple) thing 1 or more times. */ -#define CURLY 57 /* 0x39 Match this simple thing {n,m} times. */ -#define CURLYN 58 /* 0x3a Capture next-after-this simple thing */ -#define CURLYM 59 /* 0x3b Capture this medium-complex thing {n,m} times. 
*/ -#define CURLYX 60 /* 0x3c Match this complex thing {n,m} times. */ -#define WHILEM 61 /* 0x3d Do curly processing and see if rest matches. */ -#define OPEN 62 /* 0x3e Mark this point in input as start of */ -#define CLOSE 63 /* 0x3f Analogous to OPEN. */ -#define REF 64 /* 0x40 Match some already matched string */ -#define REFF 65 /* 0x41 Match already matched string, folded using native charset semantics for non-utf8 */ -#define REFFL 66 /* 0x42 Match already matched string, folded in loc. */ -#define REFFU 67 /* 0x43 Match already matched string, folded using unicode semantics for non-utf8 */ -#define REFFA 68 /* 0x44 Match already matched string, folded using unicode semantics for non-utf8, no mixing ASCII, non-ASCII */ -#define NREF 69 /* 0x45 Match some already matched string */ -#define NREFF 70 /* 0x46 Match already matched string, folded using native charset semantics for non-utf8 */ -#define NREFFL 71 /* 0x47 Match already matched string, folded in loc. */ -#define NREFFU 72 /* 0x48 Match already matched string, folded using unicode semantics for non-utf8 */ -#define NREFFA 73 /* 0x49 Match already matched string, folded using unicode semantics for non-utf8, no mixing ASCII, non-ASCII */ -#define IFMATCH 74 /* 0x4a Succeeds if the following matches. */ -#define UNLESSM 75 /* 0x4b Fails if the following matches. */ -#define SUSPEND 76 /* 0x4c "Independent" sub-RE. */ -#define IFTHEN 77 /* 0x4d Switch, should be preceded by switcher . */ -#define GROUPP 78 /* 0x4e Whether the group matched. */ -#define LONGJMP 79 /* 0x4f Jump far away. */ -#define BRANCHJ 80 /* 0x50 BRANCH with long offset. */ -#define EVAL 81 /* 0x51 Execute some Perl code. */ -#define MINMOD 82 /* 0x52 Next operator is not greedy. */ -#define LOGICAL 83 /* 0x53 Next opcode should set the flag only. */ -#define RENUM 84 /* 0x54 Group with independently numbered parens. */ -#define TRIE 85 /* 0x55 Match many EXACT(F[ALU]?)? at once. 
flags==type */ -#define TRIEC 86 /* 0x56 Same as TRIE, but with embedded charclass data */ -#define AHOCORASICK 87 /* 0x57 Aho Corasick stclass. flags==type */ -#define AHOCORASICKC 88 /* 0x58 Same as AHOCORASICK, but with embedded charclass data */ -#define GOSUB 89 /* 0x59 recurse to paren arg1 at (signed) ofs arg2 */ -#define GOSTART 90 /* 0x5a recurse to start of pattern */ -#define NGROUPP 91 /* 0x5b Whether the group matched. */ -#define INSUBP 92 /* 0x5c Whether we are in a specific recurse. */ -#define DEFINEP 93 /* 0x5d Never execute directly. */ -#define ENDLIKE 94 /* 0x5e Used only for the type field of verbs */ -#define OPFAIL 95 /* 0x5f Same as (?!) */ -#define ACCEPT 96 /* 0x60 Accepts the current matched string. */ -#define VERB 97 /* 0x61 Used only for the type field of verbs */ -#define PRUNE 98 /* 0x62 Pattern fails at this startpoint if no-backtracking through this */ -#define MARKPOINT 99 /* 0x63 Push the current location for rollback by cut. */ -#define SKIP 100 /* 0x64 On failure skip forward (to the mark) before retrying */ -#define COMMIT 101 /* 0x65 Pattern fails outright if backtracking through this */ -#define CUTGROUP 102 /* 0x66 On failure go to the next alternation in the group */ -#define KEEPS 103 /* 0x67 $& begins here. */ -#define LNBREAK 104 /* 0x68 generic newline pattern */ -#define VERTWS 105 /* 0x69 vertical whitespace (Perl 6) */ -#define NVERTWS 106 /* 0x6a not vertical whitespace (Perl 6) */ -#define HORIZWS 107 /* 0x6b horizontal whitespace (Perl 6) */ -#define NHORIZWS 108 /* 0x6c not horizontal whitespace (Perl 6) */ -#define FOLDCHAR 109 /* 0x6d codepoint with tricky case folding properties. */ -#define OPTIMIZED 110 /* 0x6e Placeholder for dump. */ -#define PSEUDO 111 /* 0x6f Pseudo opcode for internal use. */ +#define EXACTFU 51 /* 0x33 Match this string (folded iff in UTF-8, length in folding doesn't change if not in UTF-8) using /iu rules (w/len). 
*/ +#define EXACTFU_SS 52 /* 0x34 Match this string (folded iff in UTF-8, length in folding may change even if not in UTF-8) using /iu rules (w/len). */ +#define EXACTFU_NO_TRIE 53 /* 0x35 Match this folded UTF-8 string using /iu rules, but don't generate a trie for it */ +#define EXACTFA 54 /* 0x36 Match this string (not guaranteed to be folded) using /iaa rules (w/len). */ +#define NOTHING 55 /* 0x37 Match empty string. */ +#define TAIL 56 /* 0x38 Match empty string. Can jump here from outside. */ +#define STAR 57 /* 0x39 Match this (simple) thing 0 or more times. */ +#define PLUS 58 /* 0x3a Match this (simple) thing 1 or more times. */ +#define CURLY 59 /* 0x3b Match this simple thing {n,m} times. */ +#define CURLYN 60 /* 0x3c Capture next-after-this simple thing */ +#define CURLYM 61 /* 0x3d Capture this medium-complex thing {n,m} times. */ +#define CURLYX 62 /* 0x3e Match this complex thing {n,m} times. */ +#define WHILEM 63 /* 0x3f Do curly processing and see if rest matches. */ +#define OPEN 64 /* 0x40 Mark this point in input as start of */ +#define CLOSE 65 /* 0x41 Analogous to OPEN. */ +#define REF 66 /* 0x42 Match some already matched string */ +#define REFF 67 /* 0x43 Match already matched string, folded using native charset semantics for non-utf8 */ +#define REFFL 68 /* 0x44 Match already matched string, folded in loc. */ +#define REFFU 69 /* 0x45 Match already matched string, folded using unicode semantics for non-utf8 */ +#define REFFA 70 /* 0x46 Match already matched string, folded using unicode semantics for non-utf8, no mixing ASCII, non-ASCII */ +#define NREF 71 /* 0x47 Match some already matched string */ +#define NREFF 72 /* 0x48 Match already matched string, folded using native charset semantics for non-utf8 */ +#define NREFFL 73 /* 0x49 Match already matched string, folded in loc. 
*/ +#define NREFFU 74 /* 0x4a Match already matched string, folded using unicode semantics for non-utf8 */ +#define NREFFA 75 /* 0x4b Match already matched string, folded using unicode semantics for non-utf8, no mixing ASCII, non-ASCII */ +#define IFMATCH 76 /* 0x4c Succeeds if the following matches. */ +#define UNLESSM 77 /* 0x4d Fails if the following matches. */ +#define SUSPEND 78 /* 0x4e "Independent" sub-RE. */ +#define IFTHEN 79 /* 0x4f Switch, should be preceded by switcher . */ +#define GROUPP 80 /* 0x50 Whether the group matched. */ +#define LONGJMP 81 /* 0x51 Jump far away. */ +#define BRANCHJ 82 /* 0x52 BRANCH with long offset. */ +#define EVAL 83 /* 0x53 Execute some Perl code. */ +#define MINMOD 84 /* 0x54 Next operator is not greedy. */ +#define LOGICAL 85 /* 0x55 Next opcode should set the flag only. */ +#define RENUM 86 /* 0x56 Group with independently numbered parens. */ +#define TRIE 87 /* 0x57 Match many EXACT(F[ALU]?)? at once. flags==type */ +#define TRIEC 88 /* 0x58 Same as TRIE, but with embedded charclass data */ +#define AHOCORASICK 89 /* 0x59 Aho Corasick stclass. flags==type */ +#define AHOCORASICKC 90 /* 0x5a Same as AHOCORASICK, but with embedded charclass data */ +#define GOSUB 91 /* 0x5b recurse to paren arg1 at (signed) ofs arg2 */ +#define GOSTART 92 /* 0x5c recurse to start of pattern */ +#define NGROUPP 93 /* 0x5d Whether the group matched. */ +#define INSUBP 94 /* 0x5e Whether we are in a specific recurse. */ +#define DEFINEP 95 /* 0x5f Never execute directly. */ +#define ENDLIKE 96 /* 0x60 Used only for the type field of verbs */ +#define OPFAIL 97 /* 0x61 Same as (?!) */ +#define ACCEPT 98 /* 0x62 Accepts the current matched string. */ +#define VERB 99 /* 0x63 Used only for the type field of verbs */ +#define PRUNE 100 /* 0x64 Pattern fails at this startpoint if no-backtracking through this */ +#define MARKPOINT 101 /* 0x65 Push the current location for rollback by cut. 
*/ +#define SKIP 102 /* 0x66 On failure skip forward (to the mark) before retrying */ +#define COMMIT 103 /* 0x67 Pattern fails outright if backtracking through this */ +#define CUTGROUP 104 /* 0x68 On failure go to the next alternation in the group */ +#define KEEPS 105 /* 0x69 $& begins here. */ +#define LNBREAK 106 /* 0x6a generic newline pattern */ +#define VERTWS 107 /* 0x6b vertical whitespace (Perl 6) */ +#define NVERTWS 108 /* 0x6c not vertical whitespace (Perl 6) */ +#define HORIZWS 109 /* 0x6d horizontal whitespace (Perl 6) */ +#define NHORIZWS 110 /* 0x6e not horizontal whitespace (Perl 6) */ +#define FOLDCHAR 111 /* 0x6f codepoint with tricky case folding properties. */ +#define OPTIMIZED 112 /* 0x70 Placeholder for dump. */ +#define PSEUDO 113 /* 0x71 Pseudo opcode for internal use. */ /* ------------ States ------------- */ #define TRIE_next (REGNODE_MAX + 1) /* state for TRIE */ #define TRIE_next_fail (REGNODE_MAX + 2) /* state for TRIE */ @@ -221,6 +223,8 @@ EXTCONST U8 PL_regkind[] = { EXACT, /* EXACTF */ EXACT, /* EXACTFL */ EXACT, /* EXACTFU */ + EXACT, /* EXACTFU_SS */ + EXACT, /* EXACTFU_NO_TRIE */ EXACT, /* EXACTFA */ NOTHING, /* NOTHING */ NOTHING, /* TAIL */ @@ -381,6 +385,8 @@ static const U8 regarglen[] = { 0, /* EXACTF */ 0, /* EXACTFL */ 0, /* EXACTFU */ + 0, /* EXACTFU_SS */ + 0, /* EXACTFU_NO_TRIE */ 0, /* EXACTFA */ 0, /* NOTHING */ 0, /* TAIL */ @@ -498,6 +504,8 @@ static const char reg_off_by_arg[] = { 0, /* EXACTF */ 0, /* EXACTFL */ 0, /* EXACTFU */ + 0, /* EXACTFU_SS */ + 0, /* EXACTFU_NO_TRIE */ 0, /* EXACTFA */ 0, /* NOTHING */ 0, /* TAIL */ @@ -620,66 +628,68 @@ EXTCONST char * const PL_reg_name[] = { "EXACTF", /* 0x31 */ "EXACTFL", /* 0x32 */ "EXACTFU", /* 0x33 */ - "EXACTFA", /* 0x34 */ - "NOTHING", /* 0x35 */ - "TAIL", /* 0x36 */ - "STAR", /* 0x37 */ - "PLUS", /* 0x38 */ - "CURLY", /* 0x39 */ - "CURLYN", /* 0x3a */ - "CURLYM", /* 0x3b */ - "CURLYX", /* 0x3c */ - "WHILEM", /* 0x3d */ - "OPEN", /* 0x3e */ - "CLOSE", /* 0x3f 
*/ - "REF", /* 0x40 */ - "REFF", /* 0x41 */ - "REFFL", /* 0x42 */ - "REFFU", /* 0x43 */ - "REFFA", /* 0x44 */ - "NREF", /* 0x45 */ - "NREFF", /* 0x46 */ - "NREFFL", /* 0x47 */ - "NREFFU", /* 0x48 */ - "NREFFA", /* 0x49 */ - "IFMATCH", /* 0x4a */ - "UNLESSM", /* 0x4b */ - "SUSPEND", /* 0x4c */ - "IFTHEN", /* 0x4d */ - "GROUPP", /* 0x4e */ - "LONGJMP", /* 0x4f */ - "BRANCHJ", /* 0x50 */ - "EVAL", /* 0x51 */ - "MINMOD", /* 0x52 */ - "LOGICAL", /* 0x53 */ - "RENUM", /* 0x54 */ - "TRIE", /* 0x55 */ - "TRIEC", /* 0x56 */ - "AHOCORASICK", /* 0x57 */ - "AHOCORASICKC", /* 0x58 */ - "GOSUB", /* 0x59 */ - "GOSTART", /* 0x5a */ - "NGROUPP", /* 0x5b */ - "INSUBP", /* 0x5c */ - "DEFINEP", /* 0x5d */ - "ENDLIKE", /* 0x5e */ - "OPFAIL", /* 0x5f */ - "ACCEPT", /* 0x60 */ - "VERB", /* 0x61 */ - "PRUNE", /* 0x62 */ - "MARKPOINT", /* 0x63 */ - "SKIP", /* 0x64 */ - "COMMIT", /* 0x65 */ - "CUTGROUP", /* 0x66 */ - "KEEPS", /* 0x67 */ - "LNBREAK", /* 0x68 */ - "VERTWS", /* 0x69 */ - "NVERTWS", /* 0x6a */ - "HORIZWS", /* 0x6b */ - "NHORIZWS", /* 0x6c */ - "FOLDCHAR", /* 0x6d */ - "OPTIMIZED", /* 0x6e */ - "PSEUDO", /* 0x6f */ + "EXACTFU_SS", /* 0x34 */ + "EXACTFU_NO_TRIE", /* 0x35 */ + "EXACTFA", /* 0x36 */ + "NOTHING", /* 0x37 */ + "TAIL", /* 0x38 */ + "STAR", /* 0x39 */ + "PLUS", /* 0x3a */ + "CURLY", /* 0x3b */ + "CURLYN", /* 0x3c */ + "CURLYM", /* 0x3d */ + "CURLYX", /* 0x3e */ + "WHILEM", /* 0x3f */ + "OPEN", /* 0x40 */ + "CLOSE", /* 0x41 */ + "REF", /* 0x42 */ + "REFF", /* 0x43 */ + "REFFL", /* 0x44 */ + "REFFU", /* 0x45 */ + "REFFA", /* 0x46 */ + "NREF", /* 0x47 */ + "NREFF", /* 0x48 */ + "NREFFL", /* 0x49 */ + "NREFFU", /* 0x4a */ + "NREFFA", /* 0x4b */ + "IFMATCH", /* 0x4c */ + "UNLESSM", /* 0x4d */ + "SUSPEND", /* 0x4e */ + "IFTHEN", /* 0x4f */ + "GROUPP", /* 0x50 */ + "LONGJMP", /* 0x51 */ + "BRANCHJ", /* 0x52 */ + "EVAL", /* 0x53 */ + "MINMOD", /* 0x54 */ + "LOGICAL", /* 0x55 */ + "RENUM", /* 0x56 */ + "TRIE", /* 0x57 */ + "TRIEC", /* 0x58 */ + "AHOCORASICK", /* 0x59 */ + 
"AHOCORASICKC", /* 0x5a */ + "GOSUB", /* 0x5b */ + "GOSTART", /* 0x5c */ + "NGROUPP", /* 0x5d */ + "INSUBP", /* 0x5e */ + "DEFINEP", /* 0x5f */ + "ENDLIKE", /* 0x60 */ + "OPFAIL", /* 0x61 */ + "ACCEPT", /* 0x62 */ + "VERB", /* 0x63 */ + "PRUNE", /* 0x64 */ + "MARKPOINT", /* 0x65 */ + "SKIP", /* 0x66 */ + "COMMIT", /* 0x67 */ + "CUTGROUP", /* 0x68 */ + "KEEPS", /* 0x69 */ + "LNBREAK", /* 0x6a */ + "VERTWS", /* 0x6b */ + "NVERTWS", /* 0x6c */ + "HORIZWS", /* 0x6d */ + "NHORIZWS", /* 0x6e */ + "FOLDCHAR", /* 0x6f */ + "OPTIMIZED", /* 0x70 */ + "PSEUDO", /* 0x71 */ /* ------------ States ------------- */ "TRIE_next", /* REGNODE_MAX +0x01 */ "TRIE_next_fail", /* REGNODE_MAX +0x02 */ @@ -784,7 +794,7 @@ EXTCONST U8 PL_varies[] __attribute__deprecated__ = { EXTCONST U8 PL_varies_bitmask[]; #else EXTCONST U8 PL_varies_bitmask[] = { - 0x00, 0x00, 0x40, 0x00, 0x00, 0xE0, 0x80, 0x3F, 0xFF, 0x33, 0x01, 0x00, 0x00, 0x00 + 0x00, 0x00, 0x40, 0x00, 0x00, 0xE0, 0x00, 0xFE, 0xFC, 0xCF, 0x04, 0x00, 0x00, 0x00, 0x00 }; #endif /* DOINIT */ @@ -808,7 +818,7 @@ EXTCONST U8 PL_simple[] __attribute__deprecated__ = { EXTCONST U8 PL_simple_bitmask[]; #else EXTCONST U8 PL_simple_bitmask[] = { - 0x00, 0x00, 0xBC, 0xFF, 0xFF, 0x1F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1E + 0x00, 0x00, 0xBC, 0xFF, 0xFF, 0x1F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x78, 0x00 }; #endif /* DOINIT */ diff --git a/t/re/re_tests b/t/re/re_tests index 84791cf15a..33a2fee148 100644 --- a/t/re/re_tests +++ b/t/re/re_tests @@ -1560,8 +1560,8 @@ abc\N{def - c - \\N{NAME} must be resolved by the lexer # Was matching 'ss' only and failing the entire match, not seeing the # alternative that would succeed -/s\xDF/ui \xDFs yT $& \xDFs -/sst/i s\N{LATIN SMALL LIGATURE ST} yT $& s\N{LATIN SMALL LIGATURE ST} -/sst/i s\N{LATIN SMALL LIGATURE LONG S T} yT $& s\N{LATIN SMALL LIGATURE LONG S T} +/s\xDF/ui \xDFs y $& \xDFs +/sst/i s\N{LATIN SMALL LIGATURE ST} y $& s\N{LATIN SMALL LIGATURE ST} +/sst/i s\N{LATIN SMALL LIGATURE 
LONG S T} y $& s\N{LATIN SMALL LIGATURE LONG S T} # vim: softtabstop=0 noexpandtab |