diff options
author | Jim MacArthur <jim.macarthur@codethink.co.uk> | 2018-07-13 16:41:18 +0100 |
---|---|---|
committer | Jim MacArthur <jim.macarthur@codethink.co.uk> | 2018-07-24 14:39:03 +0100 |
commit | 302028197f77fb23803d4d6916f4fbc8ea6684d0 (patch) | |
tree | b03c6bc28261da614dd0258838802922e344739d | |
parent | d39392062bb22208aed1c4aae09a8e0fe079d3a9 (diff) | |
download | buildstream-302028197f77fb23803d4d6916f4fbc8ea6684d0.tar.gz |
WIP: Fixes to cas-to-cas import. Terrible mess
-rw-r--r-- | buildstream/storage/_casbaseddirectory.py | 184 |
1 files changed, 162 insertions, 22 deletions
diff --git a/buildstream/storage/_casbaseddirectory.py b/buildstream/storage/_casbaseddirectory.py index e0748b5bd..61ee976d2 100644 --- a/buildstream/storage/_casbaseddirectory.py +++ b/buildstream/storage/_casbaseddirectory.py @@ -330,6 +330,7 @@ class CasBasedDirectory(Directory): def _import_files_from_directory(self, source_directory, files, path_prefix=""): result = FileListResult() for entry in sorted(files): + if entry == ".": continue split_path = entry.split(os.path.sep) # The actual file on the FS we're importing import_file = os.path.join(source_directory, entry) @@ -341,8 +342,9 @@ class CasBasedDirectory(Directory): # a better way would be to hand off all the files in this subdir at once. subdir_result = self._import_directory_recursively(directory_name, source_directory, split_path[1:], path_prefix) + result.combine(subdir_result) - elif os.path.islink(import_file): + elif os.path.islink(import_file): # careful about ordering here, as some cases overlap if self._check_replacement(entry, path_prefix, result): self._add_new_link(source_directory, entry) result.files_written.append(relative_pathname) @@ -370,29 +372,153 @@ class CasBasedDirectory(Directory): with open(refname, "wb") as f: f.write(self.ref.SerializeToString()) - def _import_cas_into_cas(self, source_directory): + def find_updated_files(self, modified_directory, prefix=""): + """Find the list of written and overwritten files that would result + from importing 'modified_directory' into this one. This does + not change either directory. The reason this exists is for + direct imports of cas directories into other ones, which can + be done by simply replacing a hash, but we still need the file + lists. + + """ + result = FileListResult() + for entry in modified_directory.pb2_directory.directories: + existing_dir = self.find_pb2_entry(entry.name) + if existing_dir: + updates_files = existing_dir.find_updated_files(modified_directory.descend(entry.name), + os.path.join(prefix, entry.name)) + result.combine(updated_files) + else: + for f in source_directory.descend(entry.name).list_relative_paths(): + result.files_written.append(os.path.join(prefix, f)) + # None of these can overwrite anything, since the original files don't exist + for entry in modified_directory.pb2_directory.files + modified_directory.pb2_directory.symlinks: + if self.find_pb2_entry(entry.name): + result.files_overwritten.apppend(os.path.join(prefix, entry.name)) + result.file_written.apppend(os.path.join(prefix, entry.name)) + return result + + def filelist_iterator(sorted_files): + """ An interator which just returns the top level directories from sorted_files. It will return any directories listed as a single return value. The return value is a tuple consisting + of a path component and a list of sub-components. If the list is empty, this indicates that the return value is a file. """ + last_directory = None + for f in sorted_files: + components = f.split(os.path.sep) + if len(components)>1: + if components[0] != last_directory: + if last_directory is not None: + yield (components[0], subcomponents) + subcomponents = [] + subcomponents.append(os.path.sep.join(components[1:])) + else: + if last_directory is not None: + yield (last_directory, subcomponents) + yield(f, []) + if last_directory is not None: + yield (last_directory, subcomponents) + + def files_in_subdir(sorted_files, dirname): + """ Filters sorted_files and returns only the ones which have 'dirname' as a prefix, with that prefix removed. """ + if not dirname.endswith(os.path.sep): + dirname += os.path.sep + return [f.lstrip(dirname) for f in sorted_files if f.startswith(dirname) ] + + def _partial_import_cas_into_cas(self, source_directory, files, path_prefix="", file_list_required=True): + result = FileListResult() + sorted_files = sorted(files) # Check if this is necessary + processed_entries = set() + for f in sorted_files: + if f == ".": continue + components = f.split(os.path.sep) + if len(components)>1 or isinstance(source_directory.index[components[0]].buildstream_object, CasBasedDirectory): + # Then we are importing a directory + dirname = components[0] + if dirname not in processed_entries: + subcomponents = CasBasedDirectory.files_in_subdir(sorted_files, dirname) + dest_subdir = self.descend(dirname, create=True) + src_subdir = source_directory.descend(dirname) + import_result = dest_subdir._partial_import_cas_into_cas(src_subdir, subcomponents, path_prefix=os.path.join(path_prefix,f), file_list_required=file_list_required) + result.combine(import_result) + processed_entries.add(dirname) + else: + self._check_replacement(f, path_prefix, result) + fullname = os.path.join(path_prefix, f) + item = source_directory.index[f] + if isinstance(item.pb2_object, remote_execution_pb2.FileNode): + filenode = self.pb2_directory.files.add(digest=item.pb2_object.digest) + filenode.name = f + is_executable = item.pb2_object.is_executable + filenode.is_executable = is_executable + self.index[f] = IndexEntry(filenode, modified=(fullname in result.overwritten)) + else: # Must be a symlink + symlinknode = self.pb2_directory.symlinks.add() + symlinknode.name = fullname + # A symlink node has no digest. + symlinknode.target = item.pb2_object.target + self.index[filename] = IndexEntry(symlinknode, modified=(fullname in result.overwritten)) + return result + + def _full_import_cas_into_cas(self, source_directory, path_prefix="", file_list_required=True): + result = FileListResult() for entry in source_directory.pb2_directory.directories: - # Create a cloned CasBasedDirectory, to avoid modifying the old one - buildStreamDirectory = CasBasedDirectory(self.context, ref=entry.digest, - parent=self, filename=entry.name) - new_pb2_dirnode = self.pb2_directory.directories.add(digest=entry.digest, name=entry.name) + # Create a cloned CasBasedDirectory, since we may import more files + # into a subdirectory of it and we don't want to affect the original. + existing_dir = self.find_pb2_entry(entry.name) + if existing_dir: + existing_dir.digest = entry.digest + else: + new_pb2_dirnode = self.pb2_directory.directories.add(digest=entry.digest, name=entry.name) + buildStreamDirectory = CasBasedDirectory(self.context, ref=entry.digest, + parent=self, filename=entry.name) self.index[entry.name] = IndexEntry(entry, buildstream_object=buildStreamDirectory) + + if file_list_required: + if existing_dir: + updated_files = existing_dir.find_updated_files(source_directory.descend(entry.name), entry.name) + result.combine(updated_files) + else: + for i in source_directory.descend(entry.name).list_relative_paths(): + result.files_written.append(i) + for entry in source_directory.pb2_directory.files: - filenode = self.pb2_directory.files.add(name=entry.name, digest=entry.digest) + existing_file = self.find_pb2_entry(entry.name) + if existing_file: + result.files_overwritten.append(relative_pathname) + filenode = existing_file + filenode.digest = entry.digest + else: + filenode = self.pb2_directory.files.add(name=entry.name, digest=entry.digest) is_executable = entry.is_executable filenode.is_executable = is_executable - self.index[entry.name] = IndexEntry(filenode, modified=(entry.name in self.index)) - for entry in self.pb2_directory.symlinks: + self.index[entry.name] = IndexEntry(filenode, modified=(existing_file is not None)) + relative_pathname = os.path.join(path_prefix, entry.name) + result.files_written.append(relative_pathname) + + for entry in source_directory.pb2_directory.symlinks: + print(" Importing symlink {} into this directory.".format(entry.name)) existing_link = self.find_pb2_entry(entry.name) + relative_pathname = os.path.join(path_prefix, entry.name) if existing_link: symlinknode = existing_link + result.files_overwritten.append(relative_pathname) else: symlinknode = self.pb2_directory.symlinks.add() symlinknode.name = entry.name # A symlink node has no digest. symlinknode.target = entry.target self.index[entry.name] = IndexEntry(symlinknode, modified=(existing_link is not None)) - # TODO: Return a list of things imported. + result.files_written.append(relative_pathname) + return result + + def _import_cas_into_cas(self, source_directory, files=None): + """ A full import is significantly easier than a partial import """ + if files is None: + print("Performing FULL import of {} into {}".format(source_directory, self)) + return self._full_import_cas_into_cas(source_directory) + else: + print("Performing PARTIAL import of {} into {}".format(source_directory, self)) + print("File list is: {}".format(files)) + return self._partial_import_cas_into_cas(source_directory, files) def import_files(self, external_pathspec: any, files: List[str] = None, report_written: bool = True, update_utimes: bool = False, @@ -415,22 +541,33 @@ class CasBasedDirectory(Directory): can_link (bool): Ignored, since hard links do not have any meaning within CAS. """ - if isinstance(external_pathspec, FileBasedDirectory): - source_directory = external_pathspec.get_underlying_directory() - elif isinstance(external_pathspec, CasBasedDirectory): - result = self._import_cas_into_cas(external_pathspec) - # TODO: No result is returned? - return result - else: - source_directory = external_pathspec - if files is None: - files = list_relative_paths(source_directory) + duplicate_cas = None + if isinstance(external_pathspec, CasBasedDirectory): + result = self._import_cas_into_cas(external_pathspec, files=files) + + # Duplicate the current directory and do an import that way. + duplicate_cas = CasBasedDirectory(self.context, ref=self.ref) + with tempfile.TemporaryDirectory(prefix="roundtrip") as tmpdir: + external_pathspec.export_files(tmpdir) + if files is None: + files = list_relative_paths(tmpdir) + duplicate_cas._import_files_from_directory(tmpdir, files=files) + duplicate_cas._recalculate_recursing_down() + if duplicate_cas.parent: + duplicate_cas.parent._recalculate_recursing_up(self) + else: + if isinstance(external_pathspec, FileBasedDirectory): + source_directory = external_pathspec.get_underlying_directory() + else: + source_directory = external_pathspec + if files is None: + files = list_relative_paths(external_pathspec) + result = self._import_files_from_directory(source_directory, files=files) # TODO: No notice is taken of report_written, update_utimes or can_link. # Current behaviour is to fully populate the report, which is inefficient, # but still correct. - result = self._import_files_from_directory(source_directory, files=files) # We need to recalculate and store the hashes of all directories both # up and down the tree; we have changed our directory by importing files @@ -440,6 +577,10 @@ class CasBasedDirectory(Directory): self._recalculate_recursing_down() if self.parent: self.parent._recalculate_recursing_up(self) + if duplicate_cas: + if duplicate_cas.ref.hash != self.ref.hash: + print("Mismatch between file-imported result {} and cas-to-cas imported result {}.".format(duplicate_cas.ref.hash,self.ref.hash)) + return result def set_deterministic_mtime(self) -> None: @@ -466,7 +607,6 @@ class CasBasedDirectory(Directory): instead of copying. """ - if not os.path.exists(to_directory): os.mkdir(to_directory) |