diff options
author | Yves Orton <demerphq@gmail.com> | 2017-04-25 15:17:06 +0200 |
---|---|---|
committer | Sawyer X <xsawyerx@cpan.org> | 2017-06-01 10:53:31 +0200 |
commit | 0db967b2e6a4093a6a5f649190159767e5d005e0 (patch) | |
tree | 3ba46188e235582c4bf9b9d49fc805f75e466aa8 /ext/File-Glob | |
parent | 1b51016637278b8990d8aadc7f8cbf4ba92406d6 (diff) | |
download | perl-0db967b2e6a4093a6a5f649190159767e5d005e0.tar.gz |
[perl #131211] fixup File::Glob degenerate matching
The old code would go quadratic with recursion and backtracking
when doing patterns like "a*a*a*a*a*a*a*x" on a file like
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa".
This patch changes the code to not recurse, and to not backtrack,
as per this article from Russ Cox: https://research.swtch.com/glob
It also adds a micro-optimisation for M_ONE and M_SET under the new code.
Thanks to Avar and Russ Cox for helping with this patch, along with
Jilles Tjoelker and the rest of the FreeBSD community.
Diffstat (limited to 'ext/File-Glob')
-rw-r--r-- | ext/File-Glob/bsd_glob.c | 64 | ||||
-rw-r--r-- | ext/File-Glob/t/rt131211.t | 94 |
2 files changed, 143 insertions, 15 deletions
diff --git a/ext/File-Glob/bsd_glob.c b/ext/File-Glob/bsd_glob.c index 821ef200ad..e96fb7356a 100644 --- a/ext/File-Glob/bsd_glob.c +++ b/ext/File-Glob/bsd_glob.c @@ -563,8 +563,12 @@ glob0(const Char *pattern, glob_t *pglob) break; case BG_STAR: pglob->gl_flags |= GLOB_MAGCHAR; - /* collapse adjacent stars to one, - * to avoid exponential behavior + /* Collapse adjacent stars to one. + * This is required to ensure that a pattern like + * "a**" matches a name like "a", as without this + * check when the first star matched everything it would + * cause the second star to return a match fail. + * As long ** is folded here this does not happen. */ if (bufnext == patbuf || bufnext[-1] != M_ALL) *bufnext++ = M_ALL; @@ -909,35 +913,56 @@ globextend(const Char *path, glob_t *pglob, size_t *limitp) /* - * pattern matching function for filenames. Each occurrence of the * - * pattern causes a recursion level. + * pattern matching function for filenames using state machine to avoid + * recursion. We maintain a "nextp" and "nextn" to allow us to backtrack + * without additional callframes, and to do cleanly prune the backtracking + * state when multiple '*' (start) matches are included in the patter. + * + * Thanks to Russ Cox for the improved state machine logic to avoid quadratic + * matching on failure. + * + * https://research.swtch.com/glob + * + * An example would be a pattern + * ("a*" x 100) . "y" + * against a file name like + * ("a" x 100) . "x" + * */ static int match(Char *name, Char *pat, Char *patend, int nocase) { int ok, negate_range; Char c, k; + Char *nextp = NULL; + Char *nextn = NULL; + loop: while (pat < patend) { c = *pat++; switch (c & M_MASK) { case M_ALL: if (pat == patend) return(1); - do - if (match(name, pat, patend, nocase)) - return(1); - while (*name++ != BG_EOS) - ; - return(0); + if (*name == BG_EOS) + return 0; + nextn = name + 1; + nextp = pat - 1; + break; case M_ONE: + /* since * matches leftmost-shortest first * + * if we encounter the EOS then backtracking * + * will not help, so we can exit early here. */ if (*name++ == BG_EOS) - return(0); + return 0; break; case M_SET: ok = 0; + /* since * matches leftmost-shortest first * + * if we encounter the EOS then backtracking * + * will not help, so we can exit early here. */ if ((k = *name++) == BG_EOS) - return(0); + return 0; if ((negate_range = ((*pat & M_MASK) == M_NOT)) != BG_EOS) ++pat; while (((c = *pat++) & M_MASK) != M_END) @@ -953,16 +978,25 @@ match(Char *name, Char *pat, Char *patend, int nocase) } else if (nocase ? (tolower(c) == tolower(k)) : (c == k)) ok = 1; if (ok == negate_range) - return(0); + goto fail; break; default: k = *name++; if (nocase ? (tolower(k) != tolower(c)) : (k != c)) - return(0); + goto fail; break; } } - return(*name == BG_EOS); + if (*name == BG_EOS) + return 1; + + fail: + if (nextn) { + pat = nextp; + name = nextn; + goto loop; + } + return 0; } /* Free allocated data belonging to a glob_t structure. */ diff --git a/ext/File-Glob/t/rt131211.t b/ext/File-Glob/t/rt131211.t new file mode 100644 index 0000000000..c1bcbe04e8 --- /dev/null +++ b/ext/File-Glob/t/rt131211.t @@ -0,0 +1,94 @@ +use strict; +use warnings; +use v5.16.0; +use File::Temp 'tempdir'; +use File::Spec::Functions; +use Test::More; +use Time::HiRes qw(time); + +plan tests => 13; + +my $path = tempdir uc cleanup => 1; +my @files= ( + "x".("a" x 50)."b", # 0 + "abbbbbbbbbbbbc", # 1 + "abbbbbbbbbbbbd", # 2 + "aaabaaaabaaaabc", # 3 + "pq", # 4 + "r", # 5 + "rttiiiiiii", # 6 + "wewewewewewe", # 7 + "weeeweeeweee", # 8 + "weewweewweew", # 9 + "wewewewewewewewewewewewewewewewewq", # 10 + "wtttttttetttttttwr", # 11 +); + + +foreach (@files) { + open(my $f, ">", catfile $path, $_); +} + +my $elapsed_fail= 0; +my $elapsed_match= 0; +my @got_files; +my @no_files; +my $count = 0; + +while (++$count < 10) { + $elapsed_match -= time; + @got_files= glob catfile $path, "x".("a*" x $count) . "b"; + $elapsed_match += time; + + $elapsed_fail -= time; + @no_files= glob catfile $path, "x".("a*" x $count) . "c"; + $elapsed_fail += time; + last if $elapsed_fail > $elapsed_match * 100; +} + +is $count,10, + "tried all the patterns without bailing out"; + +cmp_ok $elapsed_fail/$elapsed_match,"<",2, + "time to fail less than twice the time to match"; +is "@got_files", catfile($path, $files[0]), + "only got the expected file for xa*..b"; +is "@no_files", "", "shouldnt have files for xa*..c"; + + +@got_files= glob catfile $path, "a*b*b*b*bc"; +is "@got_files", catfile($path, $files[1]), + "only got the expected file for a*b*b*b*bc"; + +@got_files= sort glob catfile $path, "a*b*b*bc"; +is "@got_files", catfile($path, $files[3])." ".catfile($path,$files[1]), + "got the expected two files for a*b*b*bc"; + +@got_files= sort glob catfile $path, "p*"; +is "@got_files", catfile($path, $files[4]), + "p* matches pq"; + +@got_files= sort glob catfile $path, "r*???????"; +is "@got_files", catfile($path, $files[6]), + "r*??????? works as expected"; + +@got_files= sort glob catfile $path, "w*e*w??e"; +is "@got_files", join(" ", sort map { catfile($path, $files[$_]) } (7,8)), + "w*e*w??e works as expected"; + +@got_files= sort glob catfile $path, "w*e*we??"; +is "@got_files", join(" ", sort map { catfile($path, $files[$_]) } (7,8,9,10)), + "w*e*we?? works as expected"; + +@got_files= sort glob catfile $path, "w**e**w"; +is "@got_files", join(" ", sort map { catfile($path, $files[$_]) } (9)), + "w**e**w works as expected"; + +@got_files= sort glob catfile $path, "*wee*"; +is "@got_files", join(" ", sort map { catfile($path, $files[$_]) } (8,9)), + "*wee* works as expected"; + +@got_files= sort glob catfile $path, "we*"; +is "@got_files", join(" ", sort map { catfile($path, $files[$_]) } (7,8,9,10)), + "we* works as expected"; + |