-rw-r--r--  CHANGES.rst               |  5
-rw-r--r--  coverage/data.py          | 71
-rw-r--r--  coverage/sqldata.py       |  3
-rw-r--r--  doc/dbschema.rst          |  3
-rw-r--r--  tests/test_api.py         |  2
-rw-r--r--  tests/test_concurrency.py |  6
6 files changed, 57 insertions, 33 deletions
diff --git a/CHANGES.rst b/CHANGES.rst
index 1f622b72..96926021 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -29,6 +29,11 @@ Unreleased
 - Using ``--format=total`` will write a single total number to the output.
   This can be useful for making badges or writing status updates.
 
+- Combining data files with ``coverage combine`` now quickly hashes the data
+  files to skip files that provide no new information.  This can reduce the
+  time needed.  For coverage.py's own test suite, combining was about 17%
+  faster.
+
 - An empty file has a coverage total of 100%, but used to fail with
   ``--fail-under``.  This has been fixed, closing `issue 1470`_.
 
diff --git a/coverage/data.py b/coverage/data.py
index 4bdfe301..798d167f 100644
--- a/coverage/data.py
+++ b/coverage/data.py
@@ -11,6 +11,7 @@ imports working.
 """
 
 import glob
+import hashlib
 import os.path
 
 from coverage.exceptions import CoverageException, NoDataError
@@ -110,7 +111,9 @@ def combine_parallel_data(
     if strict and not files_to_combine:
         raise NoDataError("No data to combine")
 
-    files_combined = 0
+    file_hashes = set()
+    combined_any = False
+
     for f in files_to_combine:
         if f == data.data_filename():
             # Sometimes we are combining into a file which is one of the
@@ -118,34 +121,50 @@
             if data._debug.should('dataio'):
                 data._debug.write(f"Skipping combining ourself: {f!r}")
             continue
-        if data._debug.should('dataio'):
-            data._debug.write(f"Combining data file {f!r}")
+
         try:
-            new_data = CoverageData(f, debug=data._debug)
-            new_data.read()
-        except CoverageException as exc:
-            if data._warn:
-                # The CoverageException has the file name in it, so just
-                # use the message as the warning.
-                data._warn(str(exc))
+            rel_file_name = os.path.relpath(f)
+        except ValueError:
+            # ValueError can be raised under Windows when os.getcwd() returns a
+            # folder from a different drive than the drive of f, in which case
+            # we print the original value of f instead of its relative path
+            rel_file_name = f
+
+        with open(f, "rb") as fobj:
+            hasher = hashlib.new("sha3_256")
+            hasher.update(fobj.read())
+            sha = hasher.digest()
+            combine_this_one = sha not in file_hashes
+
+        delete_this_one = not keep
+        if combine_this_one:
+            if data._debug.should('dataio'):
+                data._debug.write(f"Combining data file {f!r}")
+            file_hashes.add(sha)
+            try:
+                new_data = CoverageData(f, debug=data._debug)
+                new_data.read()
+            except CoverageException as exc:
+                if data._warn:
+                    # The CoverageException has the file name in it, so just
+                    # use the message as the warning.
+                    data._warn(str(exc))
+                delete_this_one = False
+            else:
+                data.update(new_data, aliases=aliases)
+                combined_any = True
+                if message:
+                    message(f"Combined data file {rel_file_name}")
         else:
-            data.update(new_data, aliases=aliases)
-            files_combined += 1
             if message:
-                try:
-                    file_name = os.path.relpath(f)
-                except ValueError:
-                    # ValueError can be raised under Windows when os.getcwd() returns a
-                    # folder from a different drive than the drive of f, in which case
-                    # we print the original value of f instead of its relative path
-                    file_name = f
-                message(f"Combined data file {file_name}")
-            if not keep:
-                if data._debug.should('dataio'):
-                    data._debug.write(f"Deleting combined data file {f!r}")
-                file_be_gone(f)
-
-        if strict and not files_combined:
+                message(f"Skipping duplicate data {rel_file_name}")
+
+        if delete_this_one:
+            if data._debug.should('dataio'):
+                data._debug.write(f"Deleting data file {f!r}")
+            file_be_gone(f)
+
+    if strict and not combined_any:
         raise NoDataError("No usable data files")
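The core of the change, isolated from the diff above: each candidate data file is read once, hashed with SHA3-256, and skipped when a byte-identical file has already been combined. A minimal standalone sketch of that dedupe-by-digest pattern (the unique_files helper is illustrative only, not part of coverage.py's API):

    import hashlib

    def unique_files(paths):
        """Yield only paths whose byte contents haven't been seen before."""
        seen = set()
        for path in paths:
            with open(path, "rb") as f:
                digest = hashlib.sha3_256(f.read()).digest()
            if digest not in seen:  # first time we've seen these bytes
                seen.add(digest)
                yield path

Hashing a whole file is still much cheaper than opening it as a SQLite database and merging its rows, which is why skipping duplicates pays off.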
diff --git a/coverage/sqldata.py b/coverage/sqldata.py
index 2b773053..2fbc53f5 100644
--- a/coverage/sqldata.py
+++ b/coverage/sqldata.py
@@ -4,7 +4,6 @@
 """SQLite coverage data."""
 
 import collections
-import datetime
 import functools
 import glob
 import itertools
@@ -56,7 +55,6 @@ CREATE TABLE meta (
     -- 'has_arcs' boolean      -- Is this data recording branches?
     -- 'sys_argv' text         -- The coverage command line that recorded the data.
     -- 'version' text          -- The version of coverage.py that made the file.
-    -- 'when' text             -- Datetime when the file was created.
 );
 
 CREATE TABLE file (
@@ -305,7 +303,6 @@ class CoverageData(SimpleReprMixin):
                 [
                     ("sys_argv", str(getattr(sys, "argv", None))),
                     ("version", __version__),
-                    ("when", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")),
                 ]
             )
 
diff --git a/doc/dbschema.rst b/doc/dbschema.rst
index 34e0a55d..42e616d9 100644
--- a/doc/dbschema.rst
+++ b/doc/dbschema.rst
@@ -70,7 +70,6 @@ This is the database schema:
     -- 'has_arcs' boolean      -- Is this data recording branches?
     -- 'sys_argv' text         -- The coverage command line that recorded the data.
     -- 'version' text          -- The version of coverage.py that made the file.
-    -- 'when' text             -- Datetime when the file was created.
 );
 
 CREATE TABLE file (
@@ -116,7 +115,7 @@ This is the database schema:
         foreign key (file_id) references file (id)
     );
 
-.. [[[end]]] (checksum: cfce1df016afbb43a5ff94306db56657)
+.. [[[end]]] (checksum: 9d87794485a9aa6d9064b735972a3447)
 
 .. _numbits:
diff --git a/tests/test_api.py b/tests/test_api.py
index ce44b9b1..19545232 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -1362,7 +1362,7 @@ class CombiningTest(CoverageTest):
 
         # Make bogus data files.
         self.make_file(".coverage.bad1", "This isn't a coverage data file.")
-        self.make_file(".coverage.bad2", "This isn't a coverage data file.")
+        self.make_file(".coverage.bad2", "This isn't a coverage data file either.")
 
         # Combine the parallel coverage data files into .coverage, but nothing is readable.
         cov = coverage.Coverage()
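Dropping the 'when' timestamp from the meta table is presumably what makes byte-identical duplicates possible in the first place: two data files recording the same coverage would otherwise always differ in their creation time and never hash equal. After this change the remaining meta rows can be inspected with plain sqlite3; a small sketch, assuming the meta table's key/value columns and a hypothetical ".coverage.sample" data file:

    import sqlite3

    con = sqlite3.connect(".coverage.sample")  # any coverage data file
    for key, value in con.execute("select key, value from meta"):
        print(f"{key} = {value}")              # has_arcs, sys_argv, version
    con.close()

The test_api.py tweak looks related: giving the second bogus file different contents presumably keeps the new dedupe logic from skipping it as a duplicate of the first, so the test still sees both warnings.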
diff --git a/tests/test_concurrency.py b/tests/test_concurrency.py
index 0a51d4d9..2c827760 100644
--- a/tests/test_concurrency.py
+++ b/tests/test_concurrency.py
@@ -484,9 +484,13 @@ class MultiprocessingTest(CoverageTest):
         out_lines = out.splitlines()
         assert len(out_lines) == nprocs + 1
         assert all(
-            re.fullmatch(r"Combined data file \.coverage\..*\.\d+\.\d+", line)
+            re.fullmatch(
+                r"(Combined data file|Skipping duplicate data) \.coverage\..*\.\d+\.\d+",
+                line
+            )
             for line in out_lines
         )
+        assert len(glob.glob(".coverage.*")) == 0
 
         out = self.run_command("coverage report -m")
         last_line = self.squeezed_lines(out)[-1]
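The widened regex accepts both message forms, since combine can now report either outcome per file. A quick standalone check (the sample message lines are made up):

    import re

    pat = r"(Combined data file|Skipping duplicate data) \.coverage\..*\.\d+\.\d+"
    samples = [
        "Combined data file .coverage.myhost.12345.987654",       # hypothetical
        "Skipping duplicate data .coverage.myhost.12346.987655",  # hypothetical
    ]
    assert all(re.fullmatch(pat, line) for line in samples)

The added glob assertion verifies that combine deleted every parallel data file afterward, including the duplicates it skipped: in the new code, delete_this_one defaults to "not keep" whether or not a file was combined.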