WIP: Fixes to cas-to-cas import. Terrible mess

author: Jim MacArthur <jim.macarthur@codethink.co.uk> 2018-07-13 16:41:18 +0100
committer: Jim MacArthur <jim.macarthur@codethink.co.uk> 2018-07-24 14:39:03 +0100
commit: 302028197f77fb23803d4d6916f4fbc8ea6684d0 (patch)
tree: b03c6bc28261da614dd0258838802922e344739d
parent: d39392062bb22208aed1c4aae09a8e0fe079d3a9 (diff)
download: buildstream-302028197f77fb23803d4d6916f4fbc8ea6684d0.tar.gz
1 files changed, 162 insertions, 22 deletions
diff --git a/buildstream/storage/_casbaseddirectory.py b/buildstream/storage/_casbaseddirectory.py
index e0748b5bd..61ee976d2 100644
--- a/buildstream/storage/_casbaseddirectory.py
+++ b/buildstream/storage/_casbaseddirectory.py
@@ -330,6 +330,7 @@ class CasBasedDirectory(Directory):
     def _import_files_from_directory(self, source_directory, files, path_prefix=""):
         result = FileListResult()
         for entry in sorted(files):
+            if entry == ".": continue
             split_path = entry.split(os.path.sep)
             # The actual file on the FS we're importing
             import_file = os.path.join(source_directory, entry)
@@ -341,8 +342,9 @@ class CasBasedDirectory(Directory):
                 # a better way would be to hand off all the files in this subdir at once.
                 subdir_result = self._import_directory_recursively(directory_name, source_directory,
                                                                    split_path[1:], path_prefix)
+
                 result.combine(subdir_result)
-            elif os.path.islink(import_file):
+            elif os.path.islink(import_file):  # careful about ordering here, as some cases overlap
                 if self._check_replacement(entry, path_prefix, result):
                     self._add_new_link(source_directory, entry)
                     result.files_written.append(relative_pathname)
@@ -370,29 +372,153 @@ class CasBasedDirectory(Directory):
         with open(refname, "wb") as f:
             f.write(self.ref.SerializeToString())
 
-    def _import_cas_into_cas(self, source_directory):
+    def find_updated_files(self, modified_directory, prefix=""):
+        """Find the list of written and overwritten files that would result
+        from importing 'modified_directory' into this one.  This does
+        not change either directory. The reason this exists is for
+        direct imports of cas directories into other ones, which can
+        be done by simply replacing a hash, but we still need the file
+        lists.
+
+        """
+        result = FileListResult()
+        for entry in modified_directory.pb2_directory.directories:
+            existing_dir = self.find_pb2_entry(entry.name)
+            if existing_dir:
+                updates_files = existing_dir.find_updated_files(modified_directory.descend(entry.name),
+                                                                os.path.join(prefix, entry.name))
+                result.combine(updated_files)
+            else:
+                for f in source_directory.descend(entry.name).list_relative_paths():
+                    result.files_written.append(os.path.join(prefix, f))
+                    # None of these can overwrite anything, since the original files don't exist
+        for entry in modified_directory.pb2_directory.files + modified_directory.pb2_directory.symlinks:
+            if self.find_pb2_entry(entry.name):
+                result.files_overwritten.apppend(os.path.join(prefix, entry.name))
+            result.file_written.apppend(os.path.join(prefix, entry.name))
+        return result
+
+    def filelist_iterator(sorted_files):
+        """ An interator which just returns the top level directories from sorted_files. It will return any directories listed as a single return value. The return value is a tuple consisting
+         of a path component and a list of sub-components. If the list is empty, this indicates that the return value is a file. """
+        last_directory = None
+        for f in sorted_files:
+            components = f.split(os.path.sep)
+            if len(components)>1:
+                if components[0] != last_directory:
+                    if last_directory is not None:
+                        yield (components[0], subcomponents)
+                    subcomponents = []
+                subcomponents.append(os.path.sep.join(components[1:]))
+            else:
+                if last_directory is not None:
+                    yield (last_directory, subcomponents)
+                yield(f, [])
+        if last_directory is not None:
+            yield (last_directory, subcomponents)
+
+    def files_in_subdir(sorted_files, dirname):
+        """ Filters sorted_files and returns only the ones which have 'dirname' as a prefix, with that prefix removed. """
+        if not dirname.endswith(os.path.sep):
+            dirname += os.path.sep
+        return [f.lstrip(dirname) for f in sorted_files if f.startswith(dirname) ]
+
+    def _partial_import_cas_into_cas(self, source_directory, files, path_prefix="", file_list_required=True):
+        result = FileListResult()
+        sorted_files = sorted(files) # Check if this is necessary
+        processed_entries = set()
+        for f in sorted_files:
+            if f == ".": continue
+            components = f.split(os.path.sep)
+            if len(components)>1 or isinstance(source_directory.index[components[0]].buildstream_object, CasBasedDirectory):
+                # Then we are importing a directory
+                dirname = components[0]
+                if dirname not in processed_entries:
+                    subcomponents = CasBasedDirectory.files_in_subdir(sorted_files, dirname)
+                    dest_subdir = self.descend(dirname, create=True)
+                    src_subdir = source_directory.descend(dirname)
+                    import_result = dest_subdir._partial_import_cas_into_cas(src_subdir, subcomponents, path_prefix=os.path.join(path_prefix,f), file_list_required=file_list_required)
+                    result.combine(import_result)
+                processed_entries.add(dirname)
+            else:
+                self._check_replacement(f, path_prefix, result)
+                fullname = os.path.join(path_prefix, f)
+                item = source_directory.index[f]
+                if isinstance(item.pb2_object, remote_execution_pb2.FileNode):
+                    filenode = self.pb2_directory.files.add(digest=item.pb2_object.digest)
+                    filenode.name = f
+                    is_executable = item.pb2_object.is_executable
+                    filenode.is_executable = is_executable
+                    self.index[f] = IndexEntry(filenode, modified=(fullname in result.overwritten))
+                else: # Must be a symlink
+                    symlinknode = self.pb2_directory.symlinks.add()
+                    symlinknode.name = fullname
+                    # A symlink node has no digest.
+                    symlinknode.target = item.pb2_object.target
+                    self.index[filename] = IndexEntry(symlinknode, modified=(fullname in result.overwritten))
+        return result
+
+    def _full_import_cas_into_cas(self, source_directory, path_prefix="", file_list_required=True):
+        result = FileListResult()
         for entry in source_directory.pb2_directory.directories:
-            # Create a cloned CasBasedDirectory, to avoid modifying the old one
-            buildStreamDirectory = CasBasedDirectory(self.context, ref=entry.digest,
-                                                     parent=self, filename=entry.name)
-            new_pb2_dirnode = self.pb2_directory.directories.add(digest=entry.digest, name=entry.name)
+            # Create a cloned CasBasedDirectory, since we may import more files
+            # into a subdirectory of it and we don't want to affect the original.
+            existing_dir = self.find_pb2_entry(entry.name)
+            if existing_dir:
+                existing_dir.digest = entry.digest
+            else:
+                new_pb2_dirnode = self.pb2_directory.directories.add(digest=entry.digest, name=entry.name)
+            buildStreamDirectory = CasBasedDirectory(self.context, ref=entry.digest, 
+                                                    parent=self, filename=entry.name)
             self.index[entry.name] = IndexEntry(entry, buildstream_object=buildStreamDirectory)
+
+            if file_list_required:
+                if existing_dir:
+                    updated_files = existing_dir.find_updated_files(source_directory.descend(entry.name), entry.name)
+                    result.combine(updated_files)
+                else:
+                    for i in source_directory.descend(entry.name).list_relative_paths():
+                        result.files_written.append(i)
+
         for entry in source_directory.pb2_directory.files:
-            filenode = self.pb2_directory.files.add(name=entry.name, digest=entry.digest)
+            existing_file = self.find_pb2_entry(entry.name)
+            if existing_file:
+                result.files_overwritten.append(relative_pathname)
+                filenode = existing_file
+                filenode.digest = entry.digest
+            else:
+                filenode = self.pb2_directory.files.add(name=entry.name, digest=entry.digest)
             is_executable = entry.is_executable
             filenode.is_executable = is_executable
-            self.index[entry.name] = IndexEntry(filenode, modified=(entry.name in self.index))
-        for entry in self.pb2_directory.symlinks:
+            self.index[entry.name] = IndexEntry(filenode, modified=(existing_file is not None))
+            relative_pathname = os.path.join(path_prefix, entry.name)
+            result.files_written.append(relative_pathname)
+
+        for entry in source_directory.pb2_directory.symlinks:
+            print("  Importing symlink {} into this directory.".format(entry.name))
             existing_link = self.find_pb2_entry(entry.name)
+            relative_pathname = os.path.join(path_prefix, entry.name)
             if existing_link:
                 symlinknode = existing_link
+                result.files_overwritten.append(relative_pathname)
             else:
                 symlinknode = self.pb2_directory.symlinks.add()
             symlinknode.name = entry.name
             # A symlink node has no digest.
             symlinknode.target = entry.target
             self.index[entry.name] = IndexEntry(symlinknode, modified=(existing_link is not None))
-        # TODO: Return a list of things imported.
+            result.files_written.append(relative_pathname)
+        return result
+
+    def _import_cas_into_cas(self, source_directory, files=None):
+        """ A full import is significantly easier than a partial import """
+        if files is None:
+            print("Performing FULL import of {} into {}".format(source_directory, self))
+            return self._full_import_cas_into_cas(source_directory)
+        else:
+            print("Performing PARTIAL import of {} into {}".format(source_directory, self))
+            print("File list is: {}".format(files))
+            return self._partial_import_cas_into_cas(source_directory, files)
 
     def import_files(self, external_pathspec: any, files: List[str] = None,
                      report_written: bool = True, update_utimes: bool = False,
@@ -415,22 +541,33 @@ class CasBasedDirectory(Directory):
 
         can_link (bool): Ignored, since hard links do not have any meaning within CAS.
         """
-        if isinstance(external_pathspec, FileBasedDirectory):
-            source_directory = external_pathspec.get_underlying_directory()
-        elif isinstance(external_pathspec, CasBasedDirectory):
-            result = self._import_cas_into_cas(external_pathspec)
-            # TODO: No result is returned?
-            return result
-        else:
-            source_directory = external_pathspec
 
-        if files is None:
-            files = list_relative_paths(source_directory)
+        duplicate_cas = None
+        if isinstance(external_pathspec, CasBasedDirectory):
+            result = self._import_cas_into_cas(external_pathspec, files=files)
+
+            # Duplicate the current directory and do an import that way.
+            duplicate_cas = CasBasedDirectory(self.context, ref=self.ref)
+            with tempfile.TemporaryDirectory(prefix="roundtrip") as tmpdir:
+                external_pathspec.export_files(tmpdir)
+                if files is None:
+                    files = list_relative_paths(tmpdir)
+                duplicate_cas._import_files_from_directory(tmpdir, files=files)
+                duplicate_cas._recalculate_recursing_down()
+                if duplicate_cas.parent:
+                    duplicate_cas.parent._recalculate_recursing_up(self)
+        else:
+            if isinstance(external_pathspec, FileBasedDirectory):
+                source_directory = external_pathspec.get_underlying_directory()
+            else:
+                source_directory = external_pathspec
+            if files is None:
+                files = list_relative_paths(external_pathspec)
+            result = self._import_files_from_directory(source_directory, files=files)
 
         # TODO: No notice is taken of report_written, update_utimes or can_link.
         # Current behaviour is to fully populate the report, which is inefficient,
         # but still correct.
-        result = self._import_files_from_directory(source_directory, files=files)
 
         # We need to recalculate and store the hashes of all directories both
         # up and down the tree; we have changed our directory by importing files
@@ -440,6 +577,10 @@ class CasBasedDirectory(Directory):
         self._recalculate_recursing_down()
         if self.parent:
             self.parent._recalculate_recursing_up(self)
+        if duplicate_cas:
+            if duplicate_cas.ref.hash != self.ref.hash:
+                print("Mismatch between file-imported result {} and cas-to-cas imported result {}.".format(duplicate_cas.ref.hash,self.ref.hash))
+
         return result
 
     def set_deterministic_mtime(self) -> None:
@@ -466,7 +607,6 @@ class CasBasedDirectory(Directory):
         instead of copying.
 
         """
-
         if not os.path.exists(to_directory):
             os.mkdir(to_directory)
author	Jim MacArthur <jim.macarthur@codethink.co.uk>	2018-07-13 16:41:18 +0100
committer	Jim MacArthur <jim.macarthur@codethink.co.uk>	2018-07-24 14:39:03 +0100
commit	302028197f77fb23803d4d6916f4fbc8ea6684d0 (patch)
tree	b03c6bc28261da614dd0258838802922e344739d
parent	d39392062bb22208aed1c4aae09a8e0fe079d3a9 (diff)
download	buildstream-302028197f77fb23803d4d6916f4fbc8ea6684d0.tar.gz