summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2019-11-16 15:44:42 -0700
committerKarl Williamson <khw@cpan.org>2019-11-21 12:55:47 -0700
commit95815e9df4dca7d5909d369abff1a1c4a779de0b (patch)
treefa6b2d6dc566e056fc2ae14477b41fbdd8d8f2d4
parente6769f93f413fd6d2ccd494444da8113892c2531 (diff)
downloadperl-95815e9df4dca7d5909d369abff1a1c4a779de0b.tar.gz
Properly handle filled /il regnodes and multi-char folds
Previously we were ignoring this possibility. Suppose a pattern being compiled under /il contains 'SS', and that it so happens that a regnode becomes filled with the first 'S', so that the next regnode would begin with the second one. If at runtime, the locale is UTF-8, the pattern should match a LATIN SHARP S. Until this commit, it wouldn't. The commit just extends the current mechanism used in this situation (of a filled regnode) for non-/l patterns. If the locale isn't a UTF-8 one, the 'SS' sequence shouldn't match the SHARP S, and it won't, but we have to construct the node so that it can handle the UTF-8 case.
-rw-r--r--regcomp.c176
-rw-r--r--t/re/pat.t11
2 files changed, 165 insertions, 22 deletions
diff --git a/regcomp.c b/regcomp.c
index 4116dd3d31..2c5cbfef7b 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -14570,13 +14570,11 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
goto continue_parse;
}
- else if (! LOC) { /* XXX shouldn't /l assume could be a UTF-8
- locale, and prepare for that? */
+ else if (FOLD) {
bool splittable = FALSE;
bool backed_up = FALSE;
- char * e = s;
-
- assert(FOLD);
+ char * e;
+ char * s_start;
/* Here is /i. Running out of room creates a problem if we are
* folding, and the split happens in the middle of a
@@ -14613,6 +14611,132 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
* oldp points to the beginning byte in the input of
* 'ender'.
*
+ * In the case of /il, we haven't folded anything that could be
+ * affected by the locale. That means only above-Latin1
+ * characters that fold to other above-latin1 characters get
+ * folded at compile time. To check where a good place to
+ * split nodes is, everything in it will have to be folded.
+ * The boolean 'maybe_exactfu' keeps track in /il if there are
+ * any unfolded characters in the node. */
+ bool need_to_fold_loc = LOC && ! maybe_exactfu;
+
+ /* If we do need to fold the node, we need a place to store the
+ * folded copy, and a way to map back to the unfolded original
+ * */
+ char * locfold_buf;
+ Size_t * loc_correspondence;
+
+ if (! need_to_fold_loc) { /* The normal case. Just
+ initialize to the actual node */
+ e = s;
+ s_start = s0;
+ s = old_old_s; /* Point to the beginning of the final char
+ that fits in the node */
+ }
+ else {
+
+ /* Here, we have filled a /il node, and there are unfolded
+ * characters in it. If the runtime locale turns out to be
+ * UTF-8, there are possible multi-character folds, just
+ * like when not under /l. The node hence can't terminate
+ * in the middle of such a fold. To determine this, we
+ * have to create a folded copy of this node. That means
+ * reparsing the node, folding everything assuming a UTF-8
+ * locale. (If at runtime it isn't such a locale, the
+ * actions here wouldn't have been necessary, but we have
+ * to assume the worst case.) If we find we need to back
+ * off the folded string, we do so, and then map that
+ * position back to the original unfolded node, which then
+ * gets output, truncated at that spot */
+
+ char * redo_p = RExC_parse;
+ char * redo_e;
+ char * old_redo_e;
+
+ /* Allow enough space assuming a single byte input folds to
+ * a single byte output, plus assume that the two unparsed
+ * characters (that we may need) fold to the largest number
+ * of bytes possible, plus extra for one more worst case
+ * scenario. In the loop below, if we start eating into
+ * that final spare space, we enlarge this initial space */
+ Size_t size = max_string_len + (3 * UTF8_MAXBYTES_CASE) + 1;
+
+ Newxz(locfold_buf, size, char);
+ Newxz(loc_correspondence, size, Size_t);
+
+ /* Redo this node's parse, folding into 'locfold_buf' */
+ redo_p = RExC_parse;
+ redo_e = locfold_buf;
+ while (redo_p <= oldp) {
+
+ old_redo_e = redo_e;
+ loc_correspondence[redo_e - locfold_buf]
+ = redo_p - RExC_parse;
+
+ if (UTF) {
+ Size_t added_len;
+
+ (void) _to_utf8_fold_flags((U8 *) redo_p,
+ (U8 *) RExC_end,
+ (U8 *) redo_e,
+ &added_len,
+ FOLD_FLAGS_FULL);
+ redo_e += added_len;
+ redo_p += UTF8SKIP(redo_p);
+ }
+ else {
+
+ /* Note that if this code is run on some ancient
+ * Unicode versions, SHARP S doesn't fold to 'ss',
+ * but rather than clutter the code with #ifdef's,
+ * as is done above, we ignore that possibility.
+ * This is ok because this code doesn't affect what
+ * gets matched, but merely where the node gets
+ * split */
+ if (UCHARAT(redo_p) != LATIN_SMALL_LETTER_SHARP_S) {
+ *redo_e++ = toLOWER_L1(UCHARAT(redo_p));
+ }
+ else {
+ *redo_e++ = 's';
+ *redo_e++ = 's';
+ }
+ redo_p++;
+ }
+
+
+ /* If we're getting so close to the end that a
+ * worst-case fold in the next character would cause us
+ * to overflow, increase, assuming one byte output byte
+ * per one byte input one, plus room for another worst
+ * case fold */
+ if ( redo_p <= oldp
+ && redo_e > locfold_buf + size
+ - (UTF8_MAXBYTES_CASE + 1))
+ {
+ Size_t new_size = size
+ + (oldp - redo_p)
+ + UTF8_MAXBYTES_CASE + 1;
+ Ptrdiff_t e_offset = redo_e - locfold_buf;
+
+ Renew(locfold_buf, new_size, char);
+ Renew(loc_correspondence, new_size, Size_t);
+ size = new_size;
+
+ redo_e = locfold_buf + e_offset;
+ }
+ }
+
+ /* Set so that things are in terms of the folded, temporary
+ * string */
+ s = old_redo_e;
+ s_start = locfold_buf;
+ e = redo_e;
+
+ }
+
+ /* Here, we have 's', 's_start' and 'e' set up to point to the
+ * input that goes into the node, folded.
+ *
* If the final character of the node and the fold of ender
* form the first two characters of a three character fold, we
* need to peek ahead at the next (unparsed) character in the
@@ -14652,11 +14776,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
* and try again.
*
* Otherwise, the node can be split at the current position.
- */
- s = old_old_s; /* Point to the beginning of the final char
- that fits in the node */
-
- /* The same logic is used for UTF-8 patterns and not */
+ *
+ * The same logic is used for UTF-8 patterns and not */
if (UTF) {
Size_t added_len;
@@ -14695,7 +14816,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
* drop down to try at that position */
if (isPUNCT(*p)) {
s = (char *) utf8_hop_back((U8 *) s, -1,
- (U8 *) s0);
+ (U8 *) s_start);
backed_up = TRUE;
}
else {
@@ -14727,7 +14848,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
* either case would break apart a fold */
do {
char *prev_s = (char *) utf8_hop_back((U8 *) s, -1,
- (U8 *) s0);
+ (U8 *) s_start);
/* If is a multi-char fold, can't split here. Backup
* one char and try again */
@@ -14741,11 +14862,11 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
* three character fold starting at the character
* before s, we can't split either before or after s.
* Backup two chars and try again */
- if ( LIKELY(s > s0)
+ if ( LIKELY(s > s_start)
&& UNLIKELY(is_THREE_CHAR_FOLD_utf8_safe(prev_s, e)))
{
s = prev_s;
- s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s0);
+ s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s_start);
backed_up = TRUE;
continue;
}
@@ -14755,7 +14876,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
splittable = TRUE;
break;
- } while (s > s0); /* End of loops backing up through the node */
+ } while (s > s_start); /* End of loops backing up through the node */
/* Here we either couldn't find a place to split the node,
* or else we broke out of the loop setting 'splittable' to
@@ -14804,7 +14925,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
continue;
}
- if ( LIKELY(s > s0)
+ if ( LIKELY(s > s_start)
&& UNLIKELY(is_THREE_CHAR_FOLD_latin1_safe(s - 1, e)))
{
s -= 2;
@@ -14815,7 +14936,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
splittable = TRUE;
break;
- } while (s > s0);
+ } while (s > s_start);
if (splittable) {
s++;
@@ -14829,9 +14950,28 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
/* If we did find a place to split, reparse the entire node
* stopping where we have calculated. */
if (splittable) {
- upper_fill = s - s0;
+
+ /* If we created a temporary folded string under /l, we
+ * have to map that back to the original */
+ if (need_to_fold_loc) {
+ upper_fill = loc_correspondence[s - s_start];
+ Safefree(locfold_buf);
+ Safefree(loc_correspondence);
+
+ if (upper_fill == 0) {
+ FAIL2("panic: loc_correspondence[%d] is 0",
+ (int) (s - s_start));
+ }
+ }
+ else {
+ upper_fill = s - s0;
+ }
goto reparse;
}
+ else if (need_to_fold_loc) {
+ Safefree(locfold_buf);
+ Safefree(loc_correspondence);
+ }
/* Here the node consists entirely of non-final multi-char
* folds. (Likely it is all 'f's or all 's's.) There's no
diff --git a/t/re/pat.t b/t/re/pat.t
index de8f2afbca..ccf494c302 100644
--- a/t/re/pat.t
+++ b/t/re/pat.t
@@ -25,7 +25,7 @@ BEGIN {
skip_all('no re module') unless defined &DynaLoader::boot_DynaLoader;
skip_all_without_unicode_tables();
-plan tests => 973; # Update this when adding/deleting tests.
+plan tests => 1005; # Update this when adding/deleting tests.
run_tests() unless caller;
@@ -1430,12 +1430,15 @@ EOP
{ # Test that it avoids spllitting a multi-char fold across nodes.
# These all fold to things that are like 'ss', which, if split across
# nodes could fail to match a single character that folds to the
- # combination.
+ # combination. 1F0 byte expands when folded;
my $utf8_locale = find_utf8_ctype_locale();
- for my $char('F', $sharp_s, "\x{FB00}") {
+ for my $char('F', $sharp_s, "\x{1F0}", "\x{FB00}") {
my $length = 260; # Long enough to overflow an EXACTFish regnode
my $p = $char x $length;
- my $s = ($char eq $sharp_s) ? 'ss' : 'ff';
+ my $s = ($char eq $sharp_s) ? 'ss'
+ : $char eq "\x{1F0}"
+ ? "j\x{30c}"
+ : 'ff';
$s = $s x $length;
for my $charset (qw(u d l aa)) {
for my $utf8 (0..1) {