-rw-r--r--  CHANGES.rst                 5
-rw-r--r--  coverage/data.py           71
-rw-r--r--  coverage/sqldata.py         3
-rw-r--r--  doc/dbschema.rst            3
-rw-r--r--  tests/test_api.py           2
-rw-r--r--  tests/test_concurrency.py   6
6 files changed, 57 insertions, 33 deletions
diff --git a/CHANGES.rst b/CHANGES.rst
index 1f622b72..96926021 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -29,6 +29,11 @@ Unreleased
 - Using ``--format=total`` will write a single total number to the
   output. This can be useful for making badges or writing status updates.
 
+- Combining data files with ``coverage combine`` now quickly hashes the data
+  files to skip files that provide no new information. This can reduce the
+  time needed. For coverage.py's own test suite, combining was about 17%
+  faster.
+
 - An empty file has a coverage total of 100%, but used to fail with
   ``--fail-under``. This has been fixed, closing `issue 1470`_.
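
A minimal sketch of the deduplication idea described in the changelog entry
above (the hashing itself appears in coverage/data.py below); the
``unique_files`` helper is illustrative, not part of coverage.py's API::

    import hashlib

    def unique_files(paths):
        """Yield each path whose byte content has not been seen before."""
        seen = set()
        for path in paths:
            with open(path, "rb") as fobj:
                digest = hashlib.new("sha3_256", fobj.read()).digest()
            if digest in seen:
                continue  # identical bytes: nothing new to combine
            seen.add(digest)
            yield path
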
diff --git a/coverage/data.py b/coverage/data.py
index 4bdfe301..798d167f 100644
--- a/coverage/data.py
+++ b/coverage/data.py
@@ -11,6 +11,7 @@ imports working.
 """
 
 import glob
+import hashlib
 import os.path
 
 from coverage.exceptions import CoverageException, NoDataError
@@ -110,7 +111,9 @@ def combine_parallel_data(
     if strict and not files_to_combine:
         raise NoDataError("No data to combine")
 
-    files_combined = 0
+    file_hashes = set()
+    combined_any = False
+
     for f in files_to_combine:
         if f == data.data_filename():
             # Sometimes we are combining into a file which is one of the
@@ -118,34 +121,50 @@
             if data._debug.should('dataio'):
                 data._debug.write(f"Skipping combining ourself: {f!r}")
             continue
-        if data._debug.should('dataio'):
-            data._debug.write(f"Combining data file {f!r}")
+
         try:
-            new_data = CoverageData(f, debug=data._debug)
-            new_data.read()
-        except CoverageException as exc:
-            if data._warn:
-                # The CoverageException has the file name in it, so just
-                # use the message as the warning.
-                data._warn(str(exc))
+            rel_file_name = os.path.relpath(f)
+        except ValueError:
+            # ValueError can be raised under Windows when os.getcwd() returns a
+            # folder from a different drive than the drive of f, in which case
+            # we print the original value of f instead of its relative path
+            rel_file_name = f
+
+        with open(f, "rb") as fobj:
+            hasher = hashlib.new("sha3_256")
+            hasher.update(fobj.read())
+            sha = hasher.digest()
+        combine_this_one = sha not in file_hashes
+
+        delete_this_one = not keep
+        if combine_this_one:
+            if data._debug.should('dataio'):
+                data._debug.write(f"Combining data file {f!r}")
+            file_hashes.add(sha)
+            try:
+                new_data = CoverageData(f, debug=data._debug)
+                new_data.read()
+            except CoverageException as exc:
+                if data._warn:
+                    # The CoverageException has the file name in it, so just
+                    # use the message as the warning.
+                    data._warn(str(exc))
+                delete_this_one = False
+            else:
+                data.update(new_data, aliases=aliases)
+                combined_any = True
+                if message:
+                    message(f"Combined data file {rel_file_name}")
         else:
-            data.update(new_data, aliases=aliases)
-            files_combined += 1
             if message:
-                try:
-                    file_name = os.path.relpath(f)
-                except ValueError:
-                    # ValueError can be raised under Windows when os.getcwd() returns a
-                    # folder from a different drive than the drive of f, in which case
-                    # we print the original value of f instead of its relative path
-                    file_name = f
-                message(f"Combined data file {file_name}")
-        if not keep:
-            if data._debug.should('dataio'):
-                data._debug.write(f"Deleting combined data file {f!r}")
-            file_be_gone(f)
-
-    if strict and not files_combined:
+                message(f"Skipping duplicate data {rel_file_name}")
+
+        if delete_this_one:
+            if data._debug.should('dataio'):
+                data._debug.write(f"Deleting data file {f!r}")
+            file_be_gone(f)
+
+    if strict and not combined_any:
         raise NoDataError("No usable data files")
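
The ``os.path.relpath`` fallback added above guards against a Windows-only
``ValueError``; the same pattern in isolation (``display_path`` is a made-up
name for illustration)::

    import os.path

    def display_path(path):
        """Return a path suitable for user-facing messages.

        On Windows, os.path.relpath raises ValueError when the current
        working directory and `path` are on different drives, so fall back
        to the original value.
        """
        try:
            return os.path.relpath(path)
        except ValueError:
            return path
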
diff --git a/coverage/sqldata.py b/coverage/sqldata.py
index 2b773053..2fbc53f5 100644
--- a/coverage/sqldata.py
+++ b/coverage/sqldata.py
@@ -4,7 +4,6 @@
 """SQLite coverage data."""
 
 import collections
-import datetime
 import functools
 import glob
 import itertools
@@ -56,7 +55,6 @@ CREATE TABLE meta (
     -- 'has_arcs' boolean -- Is this data recording branches?
     -- 'sys_argv' text -- The coverage command line that recorded the data.
     -- 'version' text -- The version of coverage.py that made the file.
-    -- 'when' text -- Datetime when the file was created.
 );
 
 CREATE TABLE file (
@@ -305,7 +303,6 @@ class CoverageData(SimpleReprMixin):
             [
                 ("sys_argv", str(getattr(sys, "argv", None))),
                 ("version", __version__),
-                ("when", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")),
             ]
         )
diff --git a/doc/dbschema.rst b/doc/dbschema.rst
index 34e0a55d..42e616d9 100644
--- a/doc/dbschema.rst
+++ b/doc/dbschema.rst
@@ -70,7 +70,6 @@ This is the database schema:
         -- 'has_arcs' boolean -- Is this data recording branches?
         -- 'sys_argv' text -- The coverage command line that recorded the data.
         -- 'version' text -- The version of coverage.py that made the file.
-        -- 'when' text -- Datetime when the file was created.
     );
 
     CREATE TABLE file (
@@ -116,7 +115,7 @@ This is the database schema:
         foreign key (file_id) references file (id)
     );
 
-.. [[[end]]] (checksum: cfce1df016afbb43a5ff94306db56657)
+.. [[[end]]] (checksum: 9d87794485a9aa6d9064b735972a3447)
 
 .. _numbits:
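
With the ``when`` row gone, the ``meta`` table records only the keys listed
above. A quick way to inspect it with the standard ``sqlite3`` module,
assuming the table's columns are ``key`` and ``value`` and the data file is
named ``.coverage``::

    import sqlite3

    con = sqlite3.connect(".coverage")
    for key, value in con.execute("select key, value from meta"):
        print(f"{key} = {value}")  # e.g. has_arcs, sys_argv, version
    con.close()
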
diff --git a/tests/test_api.py b/tests/test_api.py
index ce44b9b1..19545232 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -1362,7 +1362,7 @@ class CombiningTest(CoverageTest):
         # Make bogus data files.
         self.make_file(".coverage.bad1", "This isn't a coverage data file.")
-        self.make_file(".coverage.bad2", "This isn't a coverage data file.")
+        self.make_file(".coverage.bad2", "This isn't a coverage data file either.")
 
         # Combine the parallel coverage data files into .coverage, but nothing is readable.
         cov = coverage.Coverage()
diff --git a/tests/test_concurrency.py b/tests/test_concurrency.py
index 0a51d4d9..2c827760 100644
--- a/tests/test_concurrency.py
+++ b/tests/test_concurrency.py
@@ -484,9 +484,13 @@ class MultiprocessingTest(CoverageTest):
         out_lines = out.splitlines()
         assert len(out_lines) == nprocs + 1
         assert all(
-            re.fullmatch(r"Combined data file \.coverage\..*\.\d+\.\d+", line)
+            re.fullmatch(
+                r"(Combined data file|Skipping duplicate data) \.coverage\..*\.\d+\.\d+",
+                line
+            )
             for line in out_lines
         )
+        assert len(glob.glob(".coverage.*")) == 0
 
         out = self.run_command("coverage report -m")
         last_line = self.squeezed_lines(out)[-1]
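
The loosened assertion accepts either combine message for each data file, and
the new ``glob`` check confirms the parallel files were deleted afterwards. A
standalone illustration of the message pattern, with made-up file names::

    import re

    pattern = r"(Combined data file|Skipping duplicate data) \.coverage\..*\.\d+\.\d+"
    lines = [
        "Combined data file .coverage.myhost.12345.678901",
        "Skipping duplicate data .coverage.myhost.12346.678902",
    ]
    assert all(re.fullmatch(pattern, line) for line in lines)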