summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPaul Eggert <eggert@cs.ucla.edu>2016-10-25 21:57:56 -0700
committerPaul Eggert <eggert@cs.ucla.edu>2016-10-25 21:58:31 -0700
commit68b82f6f8419a815cfcf962b3061352d414dc606 (patch)
tree2b40c7be3eecaeed747b5247965b76b4eac11151
parent571f01c069dfc7f860e5500a5d08ebfdaf9c3068 (diff)
downloaddiffutils-68b82f6f8419a815cfcf962b3061352d414dc606.tar.gz
diff: fix big performance degradation in 3.4
* NEWS, doc/diffutils.texi (Overview): Document this. * src/analyze.c (diff_2_files): Restore too_expensive heuristic, but this time with a floor that is 16 times the old floor. This should fix Bug#16848, by generating good-quality output for its test case, while not introducing Bug#24715, by running nearly as fast as diff-3.3 for that test case.
-rw-r--r--NEWS5
-rw-r--r--doc/diffutils.texi5
-rw-r--r--src/analyze.c11
3 files changed, 19 insertions, 2 deletions
diff --git a/NEWS b/NEWS
index 473332e..88f5d2b 100644
--- a/NEWS
+++ b/NEWS
@@ -6,6 +6,11 @@ GNU diffutils NEWS -*- outline -*-
diff no longer mishandles line numbers exceeding 2**31 on Mingw-w64.
+** Performance changes
+
+ diff's default algorithm has been tweaked to deal better with larger
+ files, reversing some of the changes made in diffutils-3.4.
+
* Noteworthy changes in release 3.5 (2016-08-20) [stable]
diff --git a/doc/diffutils.texi b/doc/diffutils.texi
index b478380..ceaf557 100644
--- a/doc/diffutils.texi
+++ b/doc/diffutils.texi
@@ -161,7 +161,10 @@ The algorithm was independently discovered as described by Esko Ukkonen in
@c Date: Wed, 29 Sep 1993 08:27:55 MST
@c Ukkonen should be given credit for also discovering the algorithm used
@c in GNU diff.
-Related algorithms are surveyed by Alfred V. Aho in
+Unless the @option{--minimal} option is used, @command{diff} uses a
+heuristic by Paul Eggert that limits the cost to @math{O(N^1.5 log N)}
+at the price of producing suboptimal output for large inputs with many
+differences. Related algorithms are surveyed by Alfred V. Aho in
section 6.3 of ``Algorithms for Finding Patterns in Strings'',
@cite{Handbook of Theoretical Computer Science} (Jan Van Leeuwen,
ed.), Vol.@: A, @cite{Algorithms and Complexity}, Elsevier/MIT Press,
diff --git a/src/analyze.c b/src/analyze.c
index 3981872..847b8b0 100644
--- a/src/analyze.c
+++ b/src/analyze.c
@@ -534,6 +534,7 @@ diff_2_files (struct comparison *cmp)
{
struct context ctxt;
lin diags;
+ lin too_expensive;
/* Allocate vectors for the results of comparison:
a flag for each line of each file, saying whether that line
@@ -565,11 +566,19 @@ diff_2_files (struct comparison *cmp)
ctxt.heuristic = speed_large_files;
+ /* Set TOO_EXPENSIVE to be the approximate square root of the
+ input size, bounded below by 4096. 4096 seems to be good for
+ circa-2016 CPUs; see Bug#16848 and Bug#24715. */
+ too_expensive = 1;
+ for (; diags != 0; diags >>= 2)
+ too_expensive <<= 1;
+ ctxt.too_expensive = MAX (4096, too_expensive);
+
files[0] = cmp->file[0];
files[1] = cmp->file[1];
compareseq (0, cmp->file[0].nondiscarded_lines,
- 0, cmp->file[1].nondiscarded_lines, &ctxt);
+ 0, cmp->file[1].nondiscarded_lines, minimal, &ctxt);
free (ctxt.fdiag - (cmp->file[1].nondiscarded_lines + 1));