-rw-r--r--  CHANGES.rst                 5
-rw-r--r--  coverage/data.py           71
-rw-r--r--  coverage/sqldata.py         3
-rw-r--r--  doc/dbschema.rst            3
-rw-r--r--  tests/test_api.py           2
-rw-r--r--  tests/test_concurrency.py   6
6 files changed, 57 insertions, 33 deletions
diff --git a/CHANGES.rst b/CHANGES.rst
index 1f622b72..96926021 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -29,6 +29,11 @@ Unreleased
 - Using ``--format=total`` will write a single total number to the
   output. This can be useful for making badges or writing status updates.
 
+- Combining data files with ``coverage combine`` now quickly hashes the data
+  files to skip files that provide no new information. This can reduce the
+  time needed. For coverage.py's own test suite, combining was about 17%
+  faster.
+
 - An empty file has a coverage total of 100%, but used to fail with
   ``--fail-under``. This has been fixed, closing `issue 1470`_.
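
A minimal sketch of the deduplication idea described in the changelog entry
above (the hashing itself appears in coverage/data.py below); the
``unique_files`` helper is illustrative, not part of coverage.py's API::

    import hashlib

    def unique_files(paths):
        """Yield each path whose byte content has not been seen before."""
        seen = set()
        for path in paths:
            with open(path, "rb") as fobj:
                digest = hashlib.new("sha3_256", fobj.read()).digest()
            if digest in seen:
                continue  # identical bytes: nothing new to combine
            seen.add(digest)
            yield path
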
diff --git a/coverage/data.py b/coverage/data.py
index 4bdfe301..798d167f 100644
--- a/coverage/data.py
+++ b/coverage/data.py
@@ -11,6 +11,7 @@ imports working.
 """
 
 import glob
+import hashlib
 import os.path
 
 from coverage.exceptions import CoverageException, NoDataError
@@ -110,7 +111,9 @@ def combine_parallel_data(
     if strict and not files_to_combine:
         raise NoDataError("No data to combine")
 
-    files_combined = 0
+    file_hashes = set()
+    combined_any = False
+
     for f in files_to_combine:
         if f == data.data_filename():
             # Sometimes we are combining into a file which is one of the
@@ -118,34 +121,50 @@
             if data._debug.should('dataio'):
                 data._debug.write(f"Skipping combining ourself: {f!r}")
             continue
-        if data._debug.should('dataio'):
-            data._debug.write(f"Combining data file {f!r}")
+
         try:
-            new_data = CoverageData(f, debug=data._debug)
-            new_data.read()
-        except CoverageException as exc:
-            if data._warn:
-                # The CoverageException has the file name in it, so just
-                # use the message as the warning.
-                data._warn(str(exc))
+            rel_file_name = os.path.relpath(f)
+        except ValueError:
+            # ValueError can be raised under Windows when os.getcwd() returns a
+            # folder from a different drive than the drive of f, in which case
+            # we print the original value of f instead of its relative path
+            rel_file_name = f
+
+        with open(f, "rb") as fobj:
+            hasher = hashlib.new("sha3_256")
+            hasher.update(fobj.read())
+            sha = hasher.digest()
+        combine_this_one = sha not in file_hashes
+
+        delete_this_one = not keep
+        if combine_this_one:
+            if data._debug.should('dataio'):
+                data._debug.write(f"Combining data file {f!r}")
+            file_hashes.add(sha)
+            try:
+                new_data = CoverageData(f, debug=data._debug)
+                new_data.read()
+            except CoverageException as exc:
+                if data._warn:
+                    # The CoverageException has the file name in it, so just
+                    # use the message as the warning.
+                    data._warn(str(exc))
+                delete_this_one = False
+            else:
+                data.update(new_data, aliases=aliases)
+                combined_any = True
+                if message:
+                    message(f"Combined data file {rel_file_name}")
         else:
-            data.update(new_data, aliases=aliases)
-            files_combined += 1
             if message:
-                try:
-                    file_name = os.path.relpath(f)
-                except ValueError:
-                    # ValueError can be raised under Windows when os.getcwd() returns a
-                    # folder from a different drive than the drive of f, in which case
-                    # we print the original value of f instead of its relative path
-                    file_name = f
-                message(f"Combined data file {file_name}")
-        if not keep:
-            if data._debug.should('dataio'):
-                data._debug.write(f"Deleting combined data file {f!r}")
-            file_be_gone(f)
-
-    if strict and not files_combined:
+                message(f"Skipping duplicate data {rel_file_name}")
+
+        if delete_this_one:
+            if data._debug.should('dataio'):
+                data._debug.write(f"Deleting data file {f!r}")
+            file_be_gone(f)
+
+    if strict and not combined_any:
         raise NoDataError("No usable data files")
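
The ``os.path.relpath`` fallback added above guards against a Windows-only
``ValueError``; the same pattern in isolation (``display_path`` is a made-up
name for illustration)::

    import os.path

    def display_path(path):
        """Return a path suitable for user-facing messages.

        On Windows, os.path.relpath raises ValueError when the current
        working directory and `path` are on different drives, so fall back
        to the original value.
        """
        try:
            return os.path.relpath(path)
        except ValueError:
            return path
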
diff --git a/coverage/sqldata.py b/coverage/sqldata.py
index 2b773053..2fbc53f5 100644
--- a/coverage/sqldata.py
+++ b/coverage/sqldata.py
@@ -4,7 +4,6 @@
 """SQLite coverage data."""
 
 import collections
-import datetime
 import functools
 import glob
 import itertools
@@ -56,7 +55,6 @@ CREATE TABLE meta (
     -- 'has_arcs' boolean -- Is this data recording branches?
     -- 'sys_argv' text -- The coverage command line that recorded the data.
     -- 'version' text -- The version of coverage.py that made the file.
-    -- 'when' text -- Datetime when the file was created.
 );
 
 CREATE TABLE file (
@@ -305,7 +303,6 @@ class CoverageData(SimpleReprMixin):
             [
                 ("sys_argv", str(getattr(sys, "argv", None))),
                 ("version", __version__),
-                ("when", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")),
             ]
         )
diff --git a/doc/dbschema.rst b/doc/dbschema.rst
index 34e0a55d..42e616d9 100644
--- a/doc/dbschema.rst
+++ b/doc/dbschema.rst
@@ -70,7 +70,6 @@ This is the database schema:
         -- 'has_arcs' boolean -- Is this data recording branches?
         -- 'sys_argv' text -- The coverage command line that recorded the data.
         -- 'version' text -- The version of coverage.py that made the file.
-        -- 'when' text -- Datetime when the file was created.
     );
 
     CREATE TABLE file (
@@ -116,7 +115,7 @@ This is the database schema:
         foreign key (file_id) references file (id)
     );
 
-.. [[[end]]] (checksum: cfce1df016afbb43a5ff94306db56657)
+.. [[[end]]] (checksum: 9d87794485a9aa6d9064b735972a3447)
 
 .. _numbits:
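
With the ``when`` row gone, the ``meta`` table records only the keys listed
above. A quick way to inspect it with the standard ``sqlite3`` module,
assuming the table's columns are ``key`` and ``value`` and the data file is
named ``.coverage``::

    import sqlite3

    con = sqlite3.connect(".coverage")
    for key, value in con.execute("select key, value from meta"):
        print(f"{key} = {value}")  # e.g. has_arcs, sys_argv, version
    con.close()
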
diff --git a/tests/test_api.py b/tests/test_api.py
index ce44b9b1..19545232 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -1362,7 +1362,7 @@ class CombiningTest(CoverageTest):
         # Make bogus data files.
         self.make_file(".coverage.bad1", "This isn't a coverage data file.")
-        self.make_file(".coverage.bad2", "This isn't a coverage data file.")
+        self.make_file(".coverage.bad2", "This isn't a coverage data file either.")
 
         # Combine the parallel coverage data files into .coverage, but nothing is readable.
         cov = coverage.Coverage()
diff --git a/tests/test_concurrency.py b/tests/test_concurrency.py
index 0a51d4d9..2c827760 100644
--- a/tests/test_concurrency.py
+++ b/tests/test_concurrency.py
@@ -484,9 +484,13 @@ class MultiprocessingTest(CoverageTest):
         out_lines = out.splitlines()
         assert len(out_lines) == nprocs + 1
         assert all(
-            re.fullmatch(r"Combined data file \.coverage\..*\.\d+\.\d+", line)
+            re.fullmatch(
+                r"(Combined data file|Skipping duplicate data) \.coverage\..*\.\d+\.\d+",
+                line
+            )
             for line in out_lines
         )
+        assert len(glob.glob(".coverage.*")) == 0
 
         out = self.run_command("coverage report -m")
         last_line = self.squeezed_lines(out)[-1]
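
The loosened assertion accepts either combine message for each data file, and
the new ``glob`` check confirms the parallel files were deleted afterwards. A
standalone illustration of the message pattern, with made-up file names::

    import re

    pattern = r"(Combined data file|Skipping duplicate data) \.coverage\..*\.\d+\.\d+"
    lines = [
        "Combined data file .coverage.myhost.12345.678901",
        "Skipping duplicate data .coverage.myhost.12346.678902",
    ]
    assert all(re.fullmatch(pattern, line) for line in lines)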