perf: hash data files during combining to avoid unneeded work. #1483

When generating many parallel data files, often some data files will be exact copies of each other. Checking the hashes, we can avoid combining the duplicates, speeding the process. On a coverage.py metacov, we had 651 duplicates out of 2189 files (29%). The time to combine was reduced by 17%.
author: Ned Batchelder <ned@nedbatchelder.com> 2022-11-07 16:11:26 -0500
committer: Ned Batchelder <ned@nedbatchelder.com> 2022-11-08 06:36:47 -0500
commit: 09bc2d6ab0f951c58546dab234edeaa9de7d4c44 (patch)
tree: d53ba990b4a90d264a83ee11f89cef9800db3317
parent: bc630b58b5d1c58cc8108e584a480f3a1cd5ab70 (diff)
download: python-coveragepy-git-09bc2d6ab0f951c58546dab234edeaa9de7d4c44.tar.gz
6 files changed, 57 insertions, 33 deletions
diff --git a/CHANGES.rst b/CHANGES.rst
index 1f622b72..96926021 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -29,6 +29,11 @@ Unreleased
   - Using ``--format=total`` will write a single total number to the
     output.  This can be useful for making badges or writing status updates.
 
+- Combining data files with ``coverage combine`` now quickly hashes the data
+  files to skip files that provide no new information.  This can reduce the
+  time needed.  For coverage.py's own test suite, combining was about 17%
+  faster.
+
 - An empty file has a coverage total of 100%, but used to fail with
   ``--fail-under``.  This has been fixed, closing `issue 1470`_.
 
diff --git a/coverage/data.py b/coverage/data.py
index 4bdfe301..798d167f 100644
--- a/coverage/data.py
+++ b/coverage/data.py
@@ -11,6 +11,7 @@ imports working.
 """
 
 import glob
+import hashlib
 import os.path
 
 from coverage.exceptions import CoverageException, NoDataError
@@ -110,7 +111,9 @@ def combine_parallel_data(
     if strict and not files_to_combine:
         raise NoDataError("No data to combine")
 
-    files_combined = 0
+    file_hashes = set()
+    combined_any = False
+
     for f in files_to_combine:
         if f == data.data_filename():
             # Sometimes we are combining into a file which is one of the
@@ -118,34 +121,50 @@ def combine_parallel_data(
             if data._debug.should('dataio'):
                 data._debug.write(f"Skipping combining ourself: {f!r}")
             continue
-        if data._debug.should('dataio'):
-            data._debug.write(f"Combining data file {f!r}")
+
         try:
-            new_data = CoverageData(f, debug=data._debug)
-            new_data.read()
-        except CoverageException as exc:
-            if data._warn:
-                # The CoverageException has the file name in it, so just
-                # use the message as the warning.
-                data._warn(str(exc))
+            rel_file_name = os.path.relpath(f)
+        except ValueError:
+            # ValueError can be raised under Windows when os.getcwd() returns a
+            # folder from a different drive than the drive of f, in which case
+            # we print the original value of f instead of its relative path
+            rel_file_name = f
+
+        with open(f, "rb") as fobj:
+            hasher = hashlib.new("sha3_256")
+            hasher.update(fobj.read())
+            sha = hasher.digest()
+            combine_this_one = sha not in file_hashes
+
+        delete_this_one = not keep
+        if combine_this_one:
+            if data._debug.should('dataio'):
+                data._debug.write(f"Combining data file {f!r}")
+            file_hashes.add(sha)
+            try:
+                new_data = CoverageData(f, debug=data._debug)
+                new_data.read()
+            except CoverageException as exc:
+                if data._warn:
+                    # The CoverageException has the file name in it, so just
+                    # use the message as the warning.
+                    data._warn(str(exc))
+                delete_this_one = False
+            else:
+                data.update(new_data, aliases=aliases)
+                combined_any = True
+                if message:
+                    message(f"Combined data file {rel_file_name}")
         else:
-            data.update(new_data, aliases=aliases)
-            files_combined += 1
             if message:
-                try:
-                    file_name = os.path.relpath(f)
-                except ValueError:
-                    # ValueError can be raised under Windows when os.getcwd() returns a
-                    # folder from a different drive than the drive of f, in which case
-                    # we print the original value of f instead of its relative path
-                    file_name = f
-                message(f"Combined data file {file_name}")
-            if not keep:
-                if data._debug.should('dataio'):
-                    data._debug.write(f"Deleting combined data file {f!r}")
-                file_be_gone(f)
-
-    if strict and not files_combined:
+                message(f"Skipping duplicate data {rel_file_name}")
+
+        if delete_this_one:
+            if data._debug.should('dataio'):
+                data._debug.write(f"Deleting data file {f!r}")
+            file_be_gone(f)
+
+    if strict and not combined_any:
         raise NoDataError("No usable data files")
 
 
diff --git a/coverage/sqldata.py b/coverage/sqldata.py
index 2b773053..2fbc53f5 100644
--- a/coverage/sqldata.py
+++ b/coverage/sqldata.py
@@ -4,7 +4,6 @@
 """SQLite coverage data."""
 
 import collections
-import datetime
 import functools
 import glob
 import itertools
@@ -56,7 +55,6 @@ CREATE TABLE meta (
     --  'has_arcs' boolean      -- Is this data recording branches?
     --  'sys_argv' text         -- The coverage command line that recorded the data.
     --  'version' text          -- The version of coverage.py that made the file.
-    --  'when' text             -- Datetime when the file was created.
 );
 
 CREATE TABLE file (
@@ -305,7 +303,6 @@ class CoverageData(SimpleReprMixin):
             [
                 ("sys_argv", str(getattr(sys, "argv", None))),
                 ("version", __version__),
-                ("when", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")),
             ]
         )
 
diff --git a/doc/dbschema.rst b/doc/dbschema.rst
index 34e0a55d..42e616d9 100644
--- a/doc/dbschema.rst
+++ b/doc/dbschema.rst
@@ -70,7 +70,6 @@ This is the database schema:
         --  'has_arcs' boolean      -- Is this data recording branches?
         --  'sys_argv' text         -- The coverage command line that recorded the data.
         --  'version' text          -- The version of coverage.py that made the file.
-        --  'when' text             -- Datetime when the file was created.
     );
 
     CREATE TABLE file (
@@ -116,7 +115,7 @@ This is the database schema:
         foreign key (file_id) references file (id)
     );
 
-.. [[[end]]] (checksum: cfce1df016afbb43a5ff94306db56657)
+.. [[[end]]] (checksum: 9d87794485a9aa6d9064b735972a3447)
 
 
 .. _numbits:
diff --git a/tests/test_api.py b/tests/test_api.py
index ce44b9b1..19545232 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -1362,7 +1362,7 @@ class CombiningTest(CoverageTest):
 
         # Make bogus data files.
         self.make_file(".coverage.bad1", "This isn't a coverage data file.")
-        self.make_file(".coverage.bad2", "This isn't a coverage data file.")
+        self.make_file(".coverage.bad2", "This isn't a coverage data file either.")
 
         # Combine the parallel coverage data files into .coverage, but nothing is readable.
         cov = coverage.Coverage()
diff --git a/tests/test_concurrency.py b/tests/test_concurrency.py
index 0a51d4d9..2c827760 100644
--- a/tests/test_concurrency.py
+++ b/tests/test_concurrency.py
@@ -484,9 +484,13 @@ class MultiprocessingTest(CoverageTest):
             out_lines = out.splitlines()
             assert len(out_lines) == nprocs + 1
             assert all(
-                re.fullmatch(r"Combined data file \.coverage\..*\.\d+\.\d+", line)
+                re.fullmatch(
+                    r"(Combined data file|Skipping duplicate data) \.coverage\..*\.\d+\.\d+",
+                    line
+                )
                 for line in out_lines
             )
+            assert len(glob.glob(".coverage.*")) == 0
             out = self.run_command("coverage report -m")
 
             last_line = self.squeezed_lines(out)[-1]
author	Ned Batchelder <ned@nedbatchelder.com>	2022-11-07 16:11:26 -0500
committer	Ned Batchelder <ned@nedbatchelder.com>	2022-11-08 06:36:47 -0500
commit	09bc2d6ab0f951c58546dab234edeaa9de7d4c44 (patch)
tree	d53ba990b4a90d264a83ee11f89cef9800db3317
parent	bc630b58b5d1c58cc8108e584a480f3a1cd5ab70 (diff)
download	python-coveragepy-git-09bc2d6ab0f951c58546dab234edeaa9de7d4c44.tar.gz