diff options
Diffstat (limited to 'bzrlib/multiparent.py')
-rw-r--r-- | bzrlib/multiparent.py | 681 |
1 files changed, 681 insertions, 0 deletions
diff --git a/bzrlib/multiparent.py b/bzrlib/multiparent.py new file mode 100644 index 0000000..c352cbc --- /dev/null +++ b/bzrlib/multiparent.py @@ -0,0 +1,681 @@ +# Copyright (C) 2007-2011 Canonical Ltd +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +from __future__ import absolute_import + +from bzrlib.lazy_import import lazy_import + +lazy_import(globals(), """ +import errno +import gzip +import itertools +import os +from StringIO import StringIO + +from bzrlib import ( + bencode, + errors, + patiencediff, + ui, + ) +""") + + +def topo_iter_keys(vf, keys=None): + if keys is None: + keys = vf.keys() + parents = vf.get_parent_map(keys) + return _topo_iter(parents, keys) + +def topo_iter(vf, versions=None): + if versions is None: + versions = vf.versions() + parents = vf.get_parent_map(versions) + return _topo_iter(parents, versions) + +def _topo_iter(parents, versions): + seen = set() + descendants = {} + def pending_parents(version): + if parents[version] is None: + return [] + return [v for v in parents[version] if v in versions and + v not in seen] + for version_id in versions: + if parents[version_id] is None: + # parentless + continue + for parent_id in parents[version_id]: + descendants.setdefault(parent_id, []).append(version_id) + cur = [v for v in versions if len(pending_parents(v)) == 0] + while len(cur) > 0: + next = [] + for version_id in cur: + if version_id in seen: + continue + if len(pending_parents(version_id)) != 0: + continue + next.extend(descendants.get(version_id, [])) + yield version_id + seen.add(version_id) + cur = next + + +class MultiParent(object): + """A multi-parent diff""" + + __slots__ = ['hunks'] + + def __init__(self, hunks=None): + if hunks is not None: + self.hunks = hunks + else: + self.hunks = [] + + def __repr__(self): + return "MultiParent(%r)" % self.hunks + + def __eq__(self, other): + if self.__class__ is not other.__class__: + return False + return (self.hunks == other.hunks) + + @staticmethod + def from_lines(text, parents=(), left_blocks=None): + """Produce a MultiParent from a list of lines and parents""" + def compare(parent): + matcher = patiencediff.PatienceSequenceMatcher(None, parent, + text) + return matcher.get_matching_blocks() + if len(parents) > 0: + if left_blocks is None: + left_blocks = compare(parents[0]) + parent_comparisons = [left_blocks] + [compare(p) for p in + parents[1:]] + else: + parent_comparisons = [] + cur_line = 0 + new_text = NewText([]) + parent_text = [] + block_iter = [iter(i) for i in parent_comparisons] + diff = MultiParent([]) + def next_block(p): + try: + return block_iter[p].next() + except StopIteration: + return None + cur_block = [next_block(p) for p, i in enumerate(block_iter)] + while cur_line < len(text): + best_match = None + for p, block in enumerate(cur_block): + if block is None: + continue + i, j, n = block + while j + n <= cur_line: + block = cur_block[p] = next_block(p) + if block is None: + break + i, j, n = block + if block is None: + continue + if j > cur_line: + continue + offset = cur_line - j + i += offset + j = cur_line + n -= offset + if n == 0: + continue + if best_match is None or n > best_match.num_lines: + best_match = ParentText(p, i, j, n) + if best_match is None: + new_text.lines.append(text[cur_line]) + cur_line += 1 + else: + if len(new_text.lines) > 0: + diff.hunks.append(new_text) + new_text = NewText([]) + diff.hunks.append(best_match) + cur_line += best_match.num_lines + if len(new_text.lines) > 0: + diff.hunks.append(new_text) + return diff + + def get_matching_blocks(self, parent, parent_len): + for hunk in self.hunks: + if not isinstance(hunk, ParentText) or hunk.parent != parent: + continue + yield (hunk.parent_pos, hunk.child_pos, hunk.num_lines) + yield parent_len, self.num_lines(), 0 + + def to_lines(self, parents=()): + """Contruct a fulltext from this diff and its parents""" + mpvf = MultiMemoryVersionedFile() + for num, parent in enumerate(parents): + mpvf.add_version(StringIO(parent).readlines(), num, []) + mpvf.add_diff(self, 'a', range(len(parents))) + return mpvf.get_line_list(['a'])[0] + + @classmethod + def from_texts(cls, text, parents=()): + """Produce a MultiParent from a text and list of parent text""" + return cls.from_lines(StringIO(text).readlines(), + [StringIO(p).readlines() for p in parents]) + + def to_patch(self): + """Yield text lines for a patch""" + for hunk in self.hunks: + for line in hunk.to_patch(): + yield line + + def patch_len(self): + return len(''.join(self.to_patch())) + + def zipped_patch_len(self): + return len(gzip_string(self.to_patch())) + + @classmethod + def from_patch(cls, text): + """Create a MultiParent from its string form""" + return cls._from_patch(StringIO(text)) + + @staticmethod + def _from_patch(lines): + """This is private because it is essential to split lines on \n only""" + line_iter = iter(lines) + hunks = [] + cur_line = None + while(True): + try: + cur_line = line_iter.next() + except StopIteration: + break + if cur_line[0] == 'i': + num_lines = int(cur_line.split(' ')[1]) + hunk_lines = [line_iter.next() for x in xrange(num_lines)] + hunk_lines[-1] = hunk_lines[-1][:-1] + hunks.append(NewText(hunk_lines)) + elif cur_line[0] == '\n': + hunks[-1].lines[-1] += '\n' + else: + if not (cur_line[0] == 'c'): + raise AssertionError(cur_line[0]) + parent, parent_pos, child_pos, num_lines =\ + [int(v) for v in cur_line.split(' ')[1:]] + hunks.append(ParentText(parent, parent_pos, child_pos, + num_lines)) + return MultiParent(hunks) + + def range_iterator(self): + """Iterate through the hunks, with range indicated + + kind is "new" or "parent". + for "new", data is a list of lines. + for "parent", data is (parent, parent_start, parent_end) + :return: a generator of (start, end, kind, data) + """ + start = 0 + for hunk in self.hunks: + if isinstance(hunk, NewText): + kind = 'new' + end = start + len(hunk.lines) + data = hunk.lines + else: + kind = 'parent' + start = hunk.child_pos + end = start + hunk.num_lines + data = (hunk.parent, hunk.parent_pos, hunk.parent_pos + + hunk.num_lines) + yield start, end, kind, data + start = end + + def num_lines(self): + """The number of lines in the output text""" + extra_n = 0 + for hunk in reversed(self.hunks): + if isinstance(hunk, ParentText): + return hunk.child_pos + hunk.num_lines + extra_n + extra_n += len(hunk.lines) + return extra_n + + def is_snapshot(self): + """Return true of this hunk is effectively a fulltext""" + if len(self.hunks) != 1: + return False + return (isinstance(self.hunks[0], NewText)) + + +class NewText(object): + """The contents of text that is introduced by this text""" + + __slots__ = ['lines'] + + def __init__(self, lines): + self.lines = lines + + def __eq__(self, other): + if self.__class__ is not other.__class__: + return False + return (other.lines == self.lines) + + def __repr__(self): + return 'NewText(%r)' % self.lines + + def to_patch(self): + yield 'i %d\n' % len(self.lines) + for line in self.lines: + yield line + yield '\n' + + +class ParentText(object): + """A reference to text present in a parent text""" + + __slots__ = ['parent', 'parent_pos', 'child_pos', 'num_lines'] + + def __init__(self, parent, parent_pos, child_pos, num_lines): + self.parent = parent + self.parent_pos = parent_pos + self.child_pos = child_pos + self.num_lines = num_lines + + def _as_dict(self): + return dict(parent=self.parent, parent_pos=self.parent_pos, + child_pos=self.child_pos, num_lines=self.num_lines) + + def __repr__(self): + return ('ParentText(%(parent)r, %(parent_pos)r, %(child_pos)r,' + ' %(num_lines)r)' % self._as_dict()) + + def __eq__(self, other): + if self.__class__ is not other.__class__: + return False + return self._as_dict() == other._as_dict() + + def to_patch(self): + yield ('c %(parent)d %(parent_pos)d %(child_pos)d %(num_lines)d\n' + % self._as_dict()) + + +class BaseVersionedFile(object): + """Pseudo-VersionedFile skeleton for MultiParent""" + + def __init__(self, snapshot_interval=25, max_snapshots=None): + self._lines = {} + self._parents = {} + self._snapshots = set() + self.snapshot_interval = snapshot_interval + self.max_snapshots = max_snapshots + + def versions(self): + return iter(self._parents) + + def has_version(self, version): + return version in self._parents + + def do_snapshot(self, version_id, parent_ids): + """Determine whether to perform a snapshot for this version""" + if self.snapshot_interval is None: + return False + if self.max_snapshots is not None and\ + len(self._snapshots) == self.max_snapshots: + return False + if len(parent_ids) == 0: + return True + for ignored in xrange(self.snapshot_interval): + if len(parent_ids) == 0: + return False + version_ids = parent_ids + parent_ids = [] + for version_id in version_ids: + if version_id not in self._snapshots: + parent_ids.extend(self._parents[version_id]) + else: + return True + + def add_version(self, lines, version_id, parent_ids, + force_snapshot=None, single_parent=False): + """Add a version to the versionedfile + + :param lines: The list of lines to add. Must be split on '\n'. + :param version_id: The version_id of the version to add + :param force_snapshot: If true, force this version to be added as a + snapshot version. If false, force this version to be added as a + diff. If none, determine this automatically. + :param single_parent: If true, use a single parent, rather than + multiple parents. + """ + if force_snapshot is None: + do_snapshot = self.do_snapshot(version_id, parent_ids) + else: + do_snapshot = force_snapshot + if do_snapshot: + self._snapshots.add(version_id) + diff = MultiParent([NewText(lines)]) + else: + if single_parent: + parent_lines = self.get_line_list(parent_ids[:1]) + else: + parent_lines = self.get_line_list(parent_ids) + diff = MultiParent.from_lines(lines, parent_lines) + if diff.is_snapshot(): + self._snapshots.add(version_id) + self.add_diff(diff, version_id, parent_ids) + self._lines[version_id] = lines + + def get_parents(self, version_id): + return self._parents[version_id] + + def make_snapshot(self, version_id): + snapdiff = MultiParent([NewText(self.cache_version(version_id))]) + self.add_diff(snapdiff, version_id, self._parents[version_id]) + self._snapshots.add(version_id) + + def import_versionedfile(self, vf, snapshots, no_cache=True, + single_parent=False, verify=False): + """Import all revisions of a versionedfile + + :param vf: The versionedfile to import + :param snapshots: If provided, the revisions to make snapshots of. + Otherwise, this will be auto-determined + :param no_cache: If true, clear the cache after every add. + :param single_parent: If true, omit all but one parent text, (but + retain parent metadata). + """ + if not (no_cache or not verify): + raise ValueError() + revisions = set(vf.versions()) + total = len(revisions) + pb = ui.ui_factory.nested_progress_bar() + try: + while len(revisions) > 0: + added = set() + for revision in revisions: + parents = vf.get_parents(revision) + if [p for p in parents if p not in self._parents] != []: + continue + lines = [a + ' ' + l for a, l in + vf.annotate(revision)] + if snapshots is None: + force_snapshot = None + else: + force_snapshot = (revision in snapshots) + self.add_version(lines, revision, parents, force_snapshot, + single_parent) + added.add(revision) + if no_cache: + self.clear_cache() + vf.clear_cache() + if verify: + if not (lines == self.get_line_list([revision])[0]): + raise AssertionError() + self.clear_cache() + pb.update(gettext('Importing revisions'), + (total - len(revisions)) + len(added), total) + revisions = [r for r in revisions if r not in added] + finally: + pb.finished() + + def select_snapshots(self, vf): + """Determine which versions to add as snapshots""" + build_ancestors = {} + descendants = {} + snapshots = set() + for version_id in topo_iter(vf): + potential_build_ancestors = set(vf.get_parents(version_id)) + parents = vf.get_parents(version_id) + if len(parents) == 0: + snapshots.add(version_id) + build_ancestors[version_id] = set() + else: + for parent in vf.get_parents(version_id): + potential_build_ancestors.update(build_ancestors[parent]) + if len(potential_build_ancestors) > self.snapshot_interval: + snapshots.add(version_id) + build_ancestors[version_id] = set() + else: + build_ancestors[version_id] = potential_build_ancestors + return snapshots + + def select_by_size(self, num): + """Select snapshots for minimum output size""" + num -= len(self._snapshots) + new_snapshots = self.get_size_ranking()[-num:] + return [v for n, v in new_snapshots] + + def get_size_ranking(self): + """Get versions ranked by size""" + versions = [] + new_snapshots = set() + for version_id in self.versions(): + if version_id in self._snapshots: + continue + diff_len = self.get_diff(version_id).patch_len() + snapshot_len = MultiParent([NewText( + self.cache_version(version_id))]).patch_len() + versions.append((snapshot_len - diff_len, version_id)) + versions.sort() + return versions + + def import_diffs(self, vf): + """Import the diffs from another pseudo-versionedfile""" + for version_id in vf.versions(): + self.add_diff(vf.get_diff(version_id), version_id, + vf._parents[version_id]) + + def get_build_ranking(self): + """Return revisions sorted by how much they reduce build complexity""" + could_avoid = {} + referenced_by = {} + for version_id in topo_iter(self): + could_avoid[version_id] = set() + if version_id not in self._snapshots: + for parent_id in self._parents[version_id]: + could_avoid[version_id].update(could_avoid[parent_id]) + could_avoid[version_id].update(self._parents) + could_avoid[version_id].discard(version_id) + for avoid_id in could_avoid[version_id]: + referenced_by.setdefault(avoid_id, set()).add(version_id) + available_versions = list(self.versions()) + ranking = [] + while len(available_versions) > 0: + available_versions.sort(key=lambda x: + len(could_avoid[x]) * + len(referenced_by.get(x, []))) + selected = available_versions.pop() + ranking.append(selected) + for version_id in referenced_by[selected]: + could_avoid[version_id].difference_update( + could_avoid[selected]) + for version_id in could_avoid[selected]: + referenced_by[version_id].difference_update( + referenced_by[selected] + ) + return ranking + + def clear_cache(self): + self._lines.clear() + + def get_line_list(self, version_ids): + return [self.cache_version(v) for v in version_ids] + + def cache_version(self, version_id): + try: + return self._lines[version_id] + except KeyError: + pass + diff = self.get_diff(version_id) + lines = [] + reconstructor = _Reconstructor(self, self._lines, self._parents) + reconstructor.reconstruct_version(lines, version_id) + self._lines[version_id] = lines + return lines + + +class MultiMemoryVersionedFile(BaseVersionedFile): + """Memory-backed pseudo-versionedfile""" + + def __init__(self, snapshot_interval=25, max_snapshots=None): + BaseVersionedFile.__init__(self, snapshot_interval, max_snapshots) + self._diffs = {} + + def add_diff(self, diff, version_id, parent_ids): + self._diffs[version_id] = diff + self._parents[version_id] = parent_ids + + def get_diff(self, version_id): + try: + return self._diffs[version_id] + except KeyError: + raise errors.RevisionNotPresent(version_id, self) + + def destroy(self): + self._diffs = {} + + +class MultiVersionedFile(BaseVersionedFile): + """Disk-backed pseudo-versionedfile""" + + def __init__(self, filename, snapshot_interval=25, max_snapshots=None): + BaseVersionedFile.__init__(self, snapshot_interval, max_snapshots) + self._filename = filename + self._diff_offset = {} + + def get_diff(self, version_id): + start, count = self._diff_offset[version_id] + infile = open(self._filename + '.mpknit', 'rb') + try: + infile.seek(start) + sio = StringIO(infile.read(count)) + finally: + infile.close() + zip_file = gzip.GzipFile(None, mode='rb', fileobj=sio) + try: + file_version_id = zip_file.readline() + content = zip_file.read() + return MultiParent.from_patch(content) + finally: + zip_file.close() + + def add_diff(self, diff, version_id, parent_ids): + outfile = open(self._filename + '.mpknit', 'ab') + try: + outfile.seek(0, 2) # workaround for windows bug: + # .tell() for files opened in 'ab' mode + # before any write returns 0 + start = outfile.tell() + try: + zipfile = gzip.GzipFile(None, mode='ab', fileobj=outfile) + zipfile.writelines(itertools.chain( + ['version %s\n' % version_id], diff.to_patch())) + finally: + zipfile.close() + end = outfile.tell() + finally: + outfile.close() + self._diff_offset[version_id] = (start, end-start) + self._parents[version_id] = parent_ids + + def destroy(self): + try: + os.unlink(self._filename + '.mpknit') + except OSError, e: + if e.errno != errno.ENOENT: + raise + try: + os.unlink(self._filename + '.mpidx') + except OSError, e: + if e.errno != errno.ENOENT: + raise + + def save(self): + open(self._filename + '.mpidx', 'wb').write(bencode.bencode( + (self._parents, list(self._snapshots), self._diff_offset))) + + def load(self): + self._parents, snapshots, self._diff_offset = bencode.bdecode( + open(self._filename + '.mpidx', 'rb').read()) + self._snapshots = set(snapshots) + + +class _Reconstructor(object): + """Build a text from the diffs, ancestry graph and cached lines""" + + def __init__(self, diffs, lines, parents): + self.diffs = diffs + self.lines = lines + self.parents = parents + self.cursor = {} + + def reconstruct(self, lines, parent_text, version_id): + """Append the lines referred to by a ParentText to lines""" + parent_id = self.parents[version_id][parent_text.parent] + end = parent_text.parent_pos + parent_text.num_lines + return self._reconstruct(lines, parent_id, parent_text.parent_pos, + end) + + def _reconstruct(self, lines, req_version_id, req_start, req_end): + """Append lines for the requested version_id range""" + # stack of pending range requests + if req_start == req_end: + return + pending_reqs = [(req_version_id, req_start, req_end)] + while len(pending_reqs) > 0: + req_version_id, req_start, req_end = pending_reqs.pop() + # lazily allocate cursors for versions + if req_version_id in self.lines: + lines.extend(self.lines[req_version_id][req_start:req_end]) + continue + try: + start, end, kind, data, iterator = self.cursor[req_version_id] + except KeyError: + iterator = self.diffs.get_diff(req_version_id).range_iterator() + start, end, kind, data = iterator.next() + if start > req_start: + iterator = self.diffs.get_diff(req_version_id).range_iterator() + start, end, kind, data = iterator.next() + + # find the first hunk relevant to the request + while end <= req_start: + start, end, kind, data = iterator.next() + self.cursor[req_version_id] = start, end, kind, data, iterator + # if the hunk can't satisfy the whole request, split it in two, + # and leave the second half for later. + if req_end > end: + pending_reqs.append((req_version_id, end, req_end)) + req_end = end + if kind == 'new': + lines.extend(data[req_start - start: (req_end - start)]) + else: + # If the hunk is a ParentText, rewrite it as a range request + # for the parent, and make it the next pending request. + parent, parent_start, parent_end = data + new_version_id = self.parents[req_version_id][parent] + new_start = parent_start + req_start - start + new_end = parent_end + req_end - end + pending_reqs.append((new_version_id, new_start, new_end)) + + def reconstruct_version(self, lines, version_id): + length = self.diffs.get_diff(version_id).num_lines() + return self._reconstruct(lines, version_id, 0, length) + + +def gzip_string(lines): + sio = StringIO() + data_file = gzip.GzipFile(None, mode='wb', fileobj=sio) + data_file.writelines(lines) + data_file.close() + return sio.getvalue() |