summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--embedvar.h1
-rw-r--r--handy.h2
-rw-r--r--intrpvar.h2
-rw-r--r--regcomp.c186
-rw-r--r--sv.c2
-rw-r--r--t/re/pat_advanced.t27
6 files changed, 208 insertions, 12 deletions
diff --git a/embedvar.h b/embedvar.h
index 01f3db139f..0a3c7fa2d9 100644
--- a/embedvar.h
+++ b/embedvar.h
@@ -67,6 +67,7 @@
#define PL_Mem (vTHX->IMem)
#define PL_MemParse (vTHX->IMemParse)
#define PL_MemShared (vTHX->IMemShared)
+#define PL_NonL1NonFinalFold (vTHX->INonL1NonFinalFold)
#define PL_PerlSpace (vTHX->IPerlSpace)
#define PL_PosixAlnum (vTHX->IPosixAlnum)
#define PL_PosixAlpha (vTHX->IPosixAlpha)
diff --git a/handy.h b/handy.h
index 5e478cb540..db018d74cc 100644
--- a/handy.h
+++ b/handy.h
@@ -675,6 +675,8 @@ EXTCONST U32 PL_charclass[];
# define _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c) ((! cBOOL(FITS_IN_8_BITS(c))) || (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_mask(_CC_NONLATIN1_FOLD)))
# define _isQUOTEMETA(c) _generic_isCC(c, _CC_QUOTEMETA)
+# define _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) \
+ _generic_isCC(c, _CC_NON_FINAL_FOLD)
#else /* No perl.h. */
# ifdef EBCDIC
# define isALNUMC_A(c) (isASCII(c) && isALNUMC(c))
diff --git a/intrpvar.h b/intrpvar.h
index f2be894537..c27e33893f 100644
--- a/intrpvar.h
+++ b/intrpvar.h
@@ -611,6 +611,8 @@ PERLVAR(I, XPosixXDigit, SV *)
PERLVAR(I, VertSpace, SV *)
+PERLVAR(I, NonL1NonFinalFold, SV *)
+
/* utf8 character class swashes */
PERLVAR(I, utf8_alnum, SV *)
PERLVAR(I, utf8_alpha, SV *)
diff --git a/regcomp.c b/regcomp.c
index 488ebebf0a..0369055485 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -87,11 +87,10 @@ extern const struct regexp_engine my_reg_engine;
#endif
#include "dquote_static.c"
-#ifndef PERL_IN_XSUB_RE
-# include "charclass_invlists.h"
-#endif
+#include "charclass_invlists.h"
#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
+#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
#ifdef op
#undef op
@@ -10394,15 +10393,18 @@ tryagain:
RExC_parse++;
defchar: {
- register STRLEN len;
+ register STRLEN len = 0;
register UV ender;
register char *p;
char *s;
#define MAX_NODE_STRING_SIZE 127
- char foldbuf[MAX_NODE_STRING_SIZE];
+ char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
+ char *s0;
+ U8 upper_parse = MAX_NODE_STRING_SIZE;
STRLEN foldlen;
U8 node_type;
bool next_is_quantifier;
+ char * oldp;
ender = 0;
node_type = compute_EXACTish(pRExC_state);
@@ -10412,6 +10414,10 @@ tryagain:
* actual node, as the node doesn't exist yet */
s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
+ s0 = s;
+
+ reparse:
+
/* XXX The node can hold up to 255 bytes, yet this only goes to
* 127. I (khw) do not know why. Keeping it somewhat less than
* 255 allows us to not have to worry about overflow due to
@@ -10430,11 +10436,11 @@ tryagain:
* could back off to end with only a code point that isn't such a
* non-final, but it is possible for there not to be any in the
* entire node. */
- for (len = 0, p = RExC_parse - 1;
- len < MAX_NODE_STRING_SIZE && p < RExC_end;
+ for (p = RExC_parse - 1;
+ len < upper_parse && p < RExC_end;
len++)
{
- char * const oldp = p;
+ oldp = p;
if (RExC_flags & RXf_PMf_EXTENDED)
p = regwhite( pRExC_state, p );
@@ -10633,6 +10639,10 @@ tryagain:
break;
} /* End of switch on the literal */
+ /* Here, have looked at the literal character and <ender>
+ * contains its ordinal, <p> points to the character after it
+ */
+
if ( RExC_flags & RXf_PMf_EXTENDED)
p = regwhite( pRExC_state, p );
@@ -10693,7 +10703,7 @@ tryagain:
len += foldlen - 1;
}
else {
- REGC((char)ender, s++);
+ *(s++) = ender;
}
}
else if (UTF) {
@@ -10722,6 +10732,164 @@ tryagain:
} /* End of loop through literal characters */
+ /* Here we have either exhausted the input or ran out of room in
+ * the node. (If we encountered a character that can't be in the
+ * node, transfer is made directly to <loopdone>, and so we
+ * wouldn't have fallen off the end of the loop.) In the latter
+ * case, we artificially have to split the node into two, because
+ * we just don't have enough space to hold everything. This
+ * creates a problem if the final character participates in a
+ * multi-character fold in the non-final position, as a match that
+ * should have occurred won't, due to the way nodes are matched,
+ * and our artificial boundary. So back off until we find a non-
+ * problematic character -- one that isn't at the beginning or
+ * middle of such a fold. (Either it doesn't participate in any
+ * folds, or appears only in the final position of all the folds it
+ * does participate in.) A better solution with far fewer false
+ * positives, and that would fill the nodes more completely, would
+ * be to actually have available all the multi-character folds to
+ * test against, and to back-off only far enough to be sure that
+ * this node isn't ending with a partial one. <upper_parse> is set
+ * further below (if we need to reparse the node) to include just
+ * up through that final non-problematic character that this code
+ * identifies, so when it is set to less than the full node, we can
+ * skip the rest of this */
+ if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
+
+ const STRLEN full_len = len;
+
+ assert(len >= MAX_NODE_STRING_SIZE);
+
+ /* Here, <s> points to the final byte of the final character.
+ * Look backwards through the string until find a non-
+ * problematic character */
+
+ if (! UTF) {
+
+ /* These two have no multi-char folds to non-UTF characters
+ */
+ if (ASCII_FOLD_RESTRICTED || LOC) {
+ goto loopdone;
+ }
+
+ while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
+ len = s - s0 + 1;
+ }
+ else {
+ if (! PL_NonL1NonFinalFold) {
+ PL_NonL1NonFinalFold = _new_invlist_C_array(
+ NonL1_Perl_Non_Final_Folds_invlist);
+ }
+
+ /* Point to the first byte of the final character */
+ s = (char *) utf8_hop((U8 *) s, -1);
+
+ while (s >= s0) { /* Search backwards until find
+ non-problematic char */
+ if (UTF8_IS_INVARIANT(*s)) {
+
+ /* There are no ascii characters that participate
+ * in multi-char folds under /aa. In EBCDIC, the
+ * non-ascii invariants are all control characters,
+ * so don't ever participate in any folds. */
+ if (ASCII_FOLD_RESTRICTED
+ || ! IS_NON_FINAL_FOLD(*s))
+ {
+ break;
+ }
+ }
+ else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
+
+ /* No Latin1 characters participate in multi-char
+ * folds under /l */
+ if (LOC
+ || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI(
+ *s, *(s+1))))
+ {
+ break;
+ }
+ }
+ else if (! _invlist_contains_cp(
+ PL_NonL1NonFinalFold,
+ valid_utf8_to_uvchr((U8 *) s, NULL)))
+ {
+ break;
+ }
+
+ /* Here, the current character is problematic in that
+ * it does occur in the non-final position of some
+ * fold, so try the character before it, but have to
+ * special case the very first byte in the string, so
+ * we don't read outside the string */
+ s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
+ } /* End of loop backwards through the string */
+
+ /* If there were only problematic characters in the string,
+ * <s> will point to before s0, in which case the length
+ * should be 0, otherwise include the length of the
+ * non-problematic character just found */
+ len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
+ }
+
+ /* Here, have found the final character, if any, that is
+ * non-problematic as far as ending the node without splitting
+ * it across a potential multi-char fold. <len> contains the
+ * number of bytes in the node up-to and including that
+ * character, or is 0 if there is no such character, meaning
+ * the whole node contains only problematic characters. In
+ * this case, give up and just take the node as-is. We can't
+ * do any better */
+ if (len == 0) {
+ len = full_len;
+ } else {
+
+ /* Here, the node does contain some characters that aren't
+ * problematic. If one such is the final character in the
+ * node, we are done */
+ if (len == full_len) {
+ goto loopdone;
+ }
+ else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
+
+ /* If the final character is problematic, but the
+ * penultimate is not, back-off that last character to
+ * later start a new node with it */
+ p = oldp;
+ goto loopdone;
+ }
+
+ /* Here, the final non-problematic character is earlier
+ * in the input than the penultimate character. What we do
+ * is reparse from the beginning, going up only as far as
+ * this final ok one, thus guaranteeing that the node ends
+ * in an acceptable character. The reason we reparse is
+ * that we know how far in the character is, but we don't
+ * know how to correlate its position with the input parse.
+ * An alternate implementation would be to build that
+ * correlation as we go along during the original parse,
+ * but that would entail extra work for every node, whereas
+ * this code gets executed only when the string is too
+ * large for the node, and the final two characters are
+ * problematic, an infrequent occurrence. Yet another
+ * possible strategy would be to save the tail of the
+ * string, and the next time regatom is called, initialize
+ * with that. The problem with this is that unless you
+ * back off one more character, you won't be guaranteed
+ * regatom will get called again, unless regbranch,
+ * regpiece ... are also changed. If you do back off that
+ * extra character, so that there is input guaranteed to
+ * force calling regatom, you can't handle the case where
+ * just the first character in the node is acceptable. I
+ * (khw) decided to try this method which doesn't have that
+ * pitfall; if performance issues are found, we can do a
+ * combination of the current approach plus that one */
+ upper_parse = len;
+ len = 0;
+ s = s0;
+ goto reparse;
+ }
+ } /* End of verifying node ends with an appropriate char */
+
loopdone: /* Jumped to when encounters something that shouldn't be in
the node */
RExC_parse = p - 1;
diff --git a/sv.c b/sv.c
index 4ab14e362d..4ad53cdf80 100644
--- a/sv.c
+++ b/sv.c
@@ -13370,6 +13370,8 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags,
PL_VertSpace = sv_dup_inc(proto_perl->IVertSpace, param);
+ PL_NonL1NonFinalFold = sv_dup_inc(proto_perl->INonL1NonFinalFold, param);
+
/* utf8 character class swashes */
PL_utf8_alnum = sv_dup_inc(proto_perl->Iutf8_alnum, param);
PL_utf8_alpha = sv_dup_inc(proto_perl->Iutf8_alpha, param);
diff --git a/t/re/pat_advanced.t b/t/re/pat_advanced.t
index 526b9e2000..a3540fd185 100644
--- a/t/re/pat_advanced.t
+++ b/t/re/pat_advanced.t
@@ -873,16 +873,16 @@ sub run_tests {
}
{
- for (120 .. 130) {
+ for (120 .. 130, 240 .. 260) {
my $head = 'x' x $_;
my $message = q [Don't misparse \x{...} in regexp ] .
- q [near 127 char EXACT limit];
+ q [near EXACT char count limit];
for my $tail ('\x{0061}', '\x{1234}', '\x61') {
eval qq{like("$head$tail", qr/$head$tail/, \$message)};
is($@, '', $message);
}
$message = q [Don't misparse \N{...} in regexp ] .
- q [near 127 char EXACT limit];
+ q [near EXACT char count limit];
for my $tail ('\N{SNOWFLAKE}') {
eval qq {use charnames ':full';
like("$head$tail", qr/$head$tail/, \$message)};
@@ -2064,6 +2064,27 @@ EOP
ok "x" =~ /\A(?>(?:(?:)A|B|C?x))\z/,
"Check TRIE does not overwrite EXACT following NOTHING at start - RT #111842";
+ {
+ my $single = ":";
+ my $upper = "\x{390}"; # Fold is 3 chars.
+ my $multi = CORE::fc($upper);
+
+ my $failed = 0;
+
+ # Try forcing a node to be split, with a multi-char fold at the
+ # boundary
+ for my $repeat (1 .. 300) {
+ my $string = $single x $repeat;
+ my $lhs = $string . $upper;
+ if ($lhs !~ m/$string$multi/i) {
+ $failed = $repeat;
+ last;
+ }
+ }
+ ok(! $failed, "Matched multi-char fold across EXACTFish node boundaries; if failed, was at count $failed");
+
+ }
+
#
# Keep the following tests last -- they may crash perl
#