diff options
author | Karl Williamson <khw@cpan.org> | 2019-11-16 15:44:42 -0700 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2019-11-21 12:55:47 -0700 |
commit | 95815e9df4dca7d5909d369abff1a1c4a779de0b (patch) | |
tree | fa6b2d6dc566e056fc2ae14477b41fbdd8d8f2d4 | |
parent | e6769f93f413fd6d2ccd494444da8113892c2531 (diff) | |
download | perl-95815e9df4dca7d5909d369abff1a1c4a779de0b.tar.gz |
Properly handle filled /il regnodes and multi-char folds
Previously we were ignoring this possibility. Suppose a pattern being
compiled under /il contains 'SS', and that it so happens that a regnode
becomes filled with the first 'S', so that the next regnode would begin
with the second one. If at runtime, the locale is UTF-8, the pattern
should match a LATIN SHARP S. Until this commit, it wouldn't.
The commit just extends to /l patterns the mechanism already used in this
situation (of a filled regnode) for non-/l patterns.
If the locale isn't a UTF-8 one, the 'SS' sequence shouldn't match the
SHARP S, and it won't, but we have to construct the node so that it can
handle the UTF-8 case.
-rw-r--r-- | regcomp.c | 176 | ||||
-rw-r--r-- | t/re/pat.t | 11 |
2 files changed, 165 insertions, 22 deletions
@@ -14570,13 +14570,11 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) goto continue_parse; } - else if (! LOC) { /* XXX shouldn't /l assume could be a UTF-8 - locale, and prepare for that? */ + else if (FOLD) { bool splittable = FALSE; bool backed_up = FALSE; - char * e = s; - - assert(FOLD); + char * e; + char * s_start; /* Here is /i. Running out of room creates a problem if we are * folding, and the split happens in the middle of a @@ -14613,6 +14611,132 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) * oldp points to the beginning byte in the input of * 'ender'. * + * In the case of /il, we haven't folded anything that could be + * affected by the locale. That means only above-Latin1 + * characters that fold to other above-latin1 characters get + * folded at compile time. To check where a good place to + * split nodes is, everything in it will have to be folded. + * The boolean 'maybe_exactfu' keeps track in /il if there are + * any unfolded characters in the node. */ + bool need_to_fold_loc = LOC && ! maybe_exactfu; + + /* If we do need to fold the node, we need a place to store the + * folded copy, and a way to map back to the unfolded original + * */ + char * locfold_buf; + Size_t * loc_correspondence; + + if (! need_to_fold_loc) { /* The normal case. Just + initialize to the actual node */ + e = s; + s_start = s0; + s = old_old_s; /* Point to the beginning of the final char + that fits in the node */ + } + else { + + /* Here, we have filled a /il node, and there are unfolded + * characters in it. If the runtime locale turns out to be + * UTF-8, there are possible multi-character folds, just + * like when not under /l. The node hence can't terminate + * in the middle of such a fold. To determine this, we + * have to create a folded copy of this node. That means + * reparsing the node, folding everything assuming a UTF-8 + * locale. 
(If at runtime it isn't such a locale, the + * actions here wouldn't have been necessary, but we have + * to assume the worst case.) If we find we need to back + * off the folded string, we do so, and then map that + * position back to the original unfolded node, which then + * gets output, truncated at that spot */ + + char * redo_p = RExC_parse; + char * redo_e; + char * old_redo_e; + + /* Allow enough space assuming a single byte input folds to + * a single byte output, plus assume that the two unparsed + * characters (that we may need) fold to the largest number + * of bytes possible, plus extra for one more worst case + * scenario. In the loop below, if we start eating into + * that final spare space, we enlarge this initial space */ + Size_t size = max_string_len + (3 * UTF8_MAXBYTES_CASE) + 1; + + Newxz(locfold_buf, size, char); + Newxz(loc_correspondence, size, Size_t); + + /* Redo this node's parse, folding into 'locfold_buf' */ + redo_p = RExC_parse; + redo_e = locfold_buf; + while (redo_p <= oldp) { + + old_redo_e = redo_e; + loc_correspondence[redo_e - locfold_buf] + = redo_p - RExC_parse; + + if (UTF) { + Size_t added_len; + + (void) _to_utf8_fold_flags((U8 *) redo_p, + (U8 *) RExC_end, + (U8 *) redo_e, + &added_len, + FOLD_FLAGS_FULL); + redo_e += added_len; + redo_p += UTF8SKIP(redo_p); + } + else { + + /* Note that if this code is run on some ancient + * Unicode versions, SHARP S doesn't fold to 'ss', + * but rather than clutter the code with #ifdef's, + * as is done above, we ignore that possibility. 
+ * This is ok because this code doesn't affect what + * gets matched, but merely where the node gets + * split */ + if (UCHARAT(redo_p) != LATIN_SMALL_LETTER_SHARP_S) { + *redo_e++ = toLOWER_L1(UCHARAT(redo_p)); + } + else { + *redo_e++ = 's'; + *redo_e++ = 's'; + } + redo_p++; + } + + + /* If we're getting so close to the end that a + * worst-case fold in the next character would cause us + * to overflow, increase, assuming one byte output byte + * per one byte input one, plus room for another worst + * case fold */ + if ( redo_p <= oldp + && redo_e > locfold_buf + size + - (UTF8_MAXBYTES_CASE + 1)) + { + Size_t new_size = size + + (oldp - redo_p) + + UTF8_MAXBYTES_CASE + 1; + Ptrdiff_t e_offset = redo_e - locfold_buf; + + Renew(locfold_buf, new_size, char); + Renew(loc_correspondence, new_size, Size_t); + size = new_size; + + redo_e = locfold_buf + e_offset; + } + } + + /* Set so that things are in terms of the folded, temporary + * string */ + s = old_redo_e; + s_start = locfold_buf; + e = redo_e; + + } + + /* Here, we have 's', 's_start' and 'e' set up to point to the + * input that goes into the node, folded. + * * If the final character of the node and the fold of ender * form the first two characters of a three character fold, we * need to peek ahead at the next (unparsed) character in the @@ -14652,11 +14776,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) * and try again. * * Otherwise, the node can be split at the current position. 
- */ - s = old_old_s; /* Point to the beginning of the final char - that fits in the node */ - - /* The same logic is used for UTF-8 patterns and not */ + * + * The same logic is used for UTF-8 patterns and not */ if (UTF) { Size_t added_len; @@ -14695,7 +14816,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) * drop down to try at that position */ if (isPUNCT(*p)) { s = (char *) utf8_hop_back((U8 *) s, -1, - (U8 *) s0); + (U8 *) s_start); backed_up = TRUE; } else { @@ -14727,7 +14848,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) * either case would break apart a fold */ do { char *prev_s = (char *) utf8_hop_back((U8 *) s, -1, - (U8 *) s0); + (U8 *) s_start); /* If is a multi-char fold, can't split here. Backup * one char and try again */ @@ -14741,11 +14862,11 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) * three character fold starting at the character * before s, we can't split either before or after s. * Backup two chars and try again */ - if ( LIKELY(s > s0) + if ( LIKELY(s > s_start) && UNLIKELY(is_THREE_CHAR_FOLD_utf8_safe(prev_s, e))) { s = prev_s; - s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s0); + s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s_start); backed_up = TRUE; continue; } @@ -14755,7 +14876,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) splittable = TRUE; break; - } while (s > s0); /* End of loops backing up through the node */ + } while (s > s_start); /* End of loops backing up through the node */ /* Here we either couldn't find a place to split the node, * or else we broke out of the loop setting 'splittable' to @@ -14804,7 +14925,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) continue; } - if ( LIKELY(s > s0) + if ( LIKELY(s > s_start) && UNLIKELY(is_THREE_CHAR_FOLD_latin1_safe(s - 1, e))) { s -= 2; @@ -14815,7 +14936,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) splittable = TRUE; break; - } 
while (s > s0); + } while (s > s_start); if (splittable) { s++; @@ -14829,9 +14950,28 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) /* If we did find a place to split, reparse the entire node * stopping where we have calculated. */ if (splittable) { - upper_fill = s - s0; + + /* If we created a temporary folded string under /l, we + * have to map that back to the original */ + if (need_to_fold_loc) { + upper_fill = loc_correspondence[s - s_start]; + Safefree(locfold_buf); + Safefree(loc_correspondence); + + if (upper_fill == 0) { + FAIL2("panic: loc_correspondence[%d] is 0", + (int) (s - s_start)); + } + } + else { + upper_fill = s - s0; + } goto reparse; } + else if (need_to_fold_loc) { + Safefree(locfold_buf); + Safefree(loc_correspondence); + } /* Here the node consists entirely of non-final multi-char * folds. (Likely it is all 'f's or all 's's.) There's no diff --git a/t/re/pat.t b/t/re/pat.t index de8f2afbca..ccf494c302 100644 --- a/t/re/pat.t +++ b/t/re/pat.t @@ -25,7 +25,7 @@ BEGIN { skip_all('no re module') unless defined &DynaLoader::boot_DynaLoader; skip_all_without_unicode_tables(); -plan tests => 973; # Update this when adding/deleting tests. +plan tests => 1005; # Update this when adding/deleting tests. run_tests() unless caller; @@ -1430,12 +1430,15 @@ EOP { # Test that it avoids spllitting a multi-char fold across nodes. # These all fold to things that are like 'ss', which, if split across # nodes could fail to match a single character that folds to the - # combination. + # combination. 1F0 byte expands when folded; my $utf8_locale = find_utf8_ctype_locale(); - for my $char('F', $sharp_s, "\x{FB00}") { + for my $char('F', $sharp_s, "\x{1F0}", "\x{FB00}") { my $length = 260; # Long enough to overflow an EXACTFish regnode my $p = $char x $length; - my $s = ($char eq $sharp_s) ? 'ss' : 'ff'; + my $s = ($char eq $sharp_s) ? 'ss' + : $char eq "\x{1F0}" + ? 
"j\x{30c}" + : 'ff'; $s = $s x $length; for my $charset (qw(u d l aa)) { for my $utf8 (0..1) { |