From 9d8c35bfa8a717c656a78c636c5a4600e5837ae7 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Tue, 4 Aug 2020 21:26:28 +0100 Subject: lorry: Prune unreachable commits from hg-fast-export marks file By default, hg-fast-export refuses to export a Mercurial repository if it has unnamed heads. lorry passes the --force option that overrides this since we still want to convert the named heads, but doing this sets us up for failure later. If the unnamed heads also have no tags pointing to them, the corresponding git commits may be deleted by 'git gc'. However they will still be listed in the 'marks' file used to record state for incremental conversions, so a later run of hg-fast-export may assume they exist and refer to them by hash. Before running hg-fast-export, delete any lines in its marks file that refer to unreachable or non-existent commits. Closes #7. --- lorry | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) (limited to 'lorry') diff --git a/lorry b/lorry index 955b46b..c2019d4 100755 --- a/lorry +++ b/lorry @@ -31,6 +31,8 @@ import email.message import email.utils import ftplib import re +import subprocess +import tempfile import yaml @@ -621,7 +623,13 @@ class Lorry(cliapp.Application): if not os.path.exists(gitdir): self.needs_aggressive = True self.run_program(['git', 'init', '--bare', gitdir]) - + + # Since there are marks files in existing deployments that + # have broken references, fix up the marks file before rather + # than after running hg-fast-export + self.prune_unreachable_marks(gitdir, + os.path.join(gitdir, 'hg2git-marks')) + self.progress('.. fast-exporting into git') self.run_program(['hg-fast-export', '-r', '../hg', '--quiet', '--force'], cwd=gitdir) @@ -718,6 +726,49 @@ class Lorry(cliapp.Application): if self.settings['verbose']: self.output.write('%s\n' % msg) + def prune_unreachable_marks(self, gitdir, marks_name): + if not os.path.exists(marks_name): + return + + # Find reachable commits + reachable = set() + with subprocess.Popen(['git', 'rev-list', '--all'], + cwd=gitdir, stdin=subprocess.DEVNULL, + stdout=subprocess.PIPE, + universal_newlines=True) as rev_list_proc: + for line in rev_list_proc.stdout: + reachable.add(line.rstrip('\n')) + + # Filter marks file to temporary file + mark_re = re.compile(r':(\S+) ([0-9a-f]{40,})\n') + marks_temp_fd, marks_temp_name = \ + tempfile.mkstemp(dir=os.path.dirname(marks_name)) + try: + with open(marks_temp_fd, 'w') as marks_out, \ + open(marks_name, 'r') as marks_in: + for line in marks_in: + match = mark_re.match(line) + if not match: + msg = ('%s: failed to parse line "%s"' + % (marks_name, line.rstrip('\n'))) + logging.warning(msg) + self.output.write('%s\n' % msg) + # We don't know whether it should be kept; err + # on the side of caution + marks_out.write(line) + elif match.group(2) in reachable: + marks_out.write(line) + else: + self.progress('%s: pruning unreachable commit %s' + % (marks_name, match.group(2))) + + # On success, replace marks file with temporary file + os.rename(marks_temp_name, marks_name) + except: + # On failure, delete temporary file + os.unlink(marks_temp_name) + raise + if __name__ == '__main__': Lorry(version=__version__).run() -- cgit v1.2.1 From acc048b4ee9aea7dc90395a091b0de5968047268 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Wed, 5 Aug 2020 20:14:38 +0100 Subject: lorry: Only log number of commits pruned from marks files Logging each pruned commit can result in very long logs for some repositories. This is a problem for Lorry Controller because job status updates result in copying the entire log and not just the new log lines. There's not much value in listing all the commit hashes, so log the number of commits instead. Update the test case accordingly. --- lorry | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'lorry') diff --git a/lorry b/lorry index c2019d4..9a10045 100755 --- a/lorry +++ b/lorry @@ -743,6 +743,7 @@ class Lorry(cliapp.Application): mark_re = re.compile(r':(\S+) ([0-9a-f]{40,})\n') marks_temp_fd, marks_temp_name = \ tempfile.mkstemp(dir=os.path.dirname(marks_name)) + n_pruned = 0 try: with open(marks_temp_fd, 'w') as marks_out, \ open(marks_name, 'r') as marks_in: @@ -759,11 +760,14 @@ class Lorry(cliapp.Application): elif match.group(2) in reachable: marks_out.write(line) else: - self.progress('%s: pruning unreachable commit %s' - % (marks_name, match.group(2))) + n_pruned += 1 # On success, replace marks file with temporary file os.rename(marks_temp_name, marks_name) + + if n_pruned: + self.progress('%s: pruned %d unreachable commit(s)' + % (marks_name, n_pruned)) except: # On failure, delete temporary file os.unlink(marks_temp_name) -- cgit v1.2.1