summaryrefslogtreecommitdiff
path: root/src/libostree/ostree-repo-static-delta-compilation-analysis.c
diff options
context:
space:
mode:
authorColin Walters <walters@verbum.org>2015-02-11 03:29:14 -0500
committerColin Walters <walters@verbum.org>2015-02-16 10:10:35 -0500
commitc54df8977113e4cb327a61c9a48fd75f68e20237 (patch)
tree7e44173db6aafee9cf34a2bddfd20207d09efeb5 /src/libostree/ostree-repo-static-delta-compilation-analysis.c
parent0f74ed62b72a659191bf070911133d731ecd59da (diff)
downloadostree-c54df8977113e4cb327a61c9a48fd75f68e20237.tar.gz
deltas: Search for similar objects (possibly renamed across directories)
The previous diff algorithm was file tree based, and only looked at modified files that lived at the same path. However, components like the Linux kernel have versioned subdirectories, e.g. /usr/lib/modules/$kver/.../ext4.ko. We want to be able to detect these "modified renames" so that we can compute diffs (rollsum, bsdiff).
Diffstat (limited to 'src/libostree/ostree-repo-static-delta-compilation-analysis.c')
-rw-r--r--src/libostree/ostree-repo-static-delta-compilation-analysis.c305
1 files changed, 305 insertions, 0 deletions
diff --git a/src/libostree/ostree-repo-static-delta-compilation-analysis.c b/src/libostree/ostree-repo-static-delta-compilation-analysis.c
new file mode 100644
index 00000000..39c818fe
--- /dev/null
+++ b/src/libostree/ostree-repo-static-delta-compilation-analysis.c
@@ -0,0 +1,305 @@
+/* -*- mode: C; c-file-style: "gnu"; indent-tabs-mode: nil; -*-
+ *
+ * Copyright (C) 2015 Colin Walters <walters@verbum.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include "config.h"
+
+#include <string.h>
+#include <gio/gunixoutputstream.h>
+
+#include "ostree-core-private.h"
+#include "ostree-repo-private.h"
+#include "ostree-lzma-compressor.h"
+#include "ostree-repo-static-delta-private.h"
+#include "ostree-diff.h"
+#include "ostree-rollsum.h"
+#include "otutil.h"
+#include "ostree-varint.h"
+
+void
+_ostree_delta_content_sizenames_free (gpointer v)
+{
+ OstreeDeltaContentSizeNames *ce = v;
+ g_free (ce->checksum);
+ g_ptr_array_unref (ce->basenames);
+ g_free (ce);
+}
+
+static gboolean
+build_content_sizenames_recurse (OstreeRepo *repo,
+ OstreeRepoCommitTraverseIter *iter,
+ GHashTable *sizenames_map,
+ GHashTable *include_only_objects,
+ GCancellable *cancellable,
+ GError **error)
+{
+ gboolean ret = FALSE;
+
+ while (TRUE)
+ {
+ OstreeRepoCommitIterResult iterres =
+ ostree_repo_commit_traverse_iter_next (iter, cancellable, error);
+
+ if (iterres == OSTREE_REPO_COMMIT_ITER_RESULT_ERROR)
+ goto out;
+ else if (iterres == OSTREE_REPO_COMMIT_ITER_RESULT_END)
+ break;
+ else if (iterres == OSTREE_REPO_COMMIT_ITER_RESULT_FILE)
+ {
+ char *name;
+ char *checksum;
+ OstreeDeltaContentSizeNames *csizenames;
+
+ ostree_repo_commit_traverse_iter_get_file (iter, &name, &checksum);
+
+ if (include_only_objects && !g_hash_table_contains (include_only_objects, checksum))
+ continue;
+
+ csizenames = g_hash_table_lookup (sizenames_map, checksum);
+ if (!csizenames)
+ {
+ gs_unref_object GFileInfo *finfo = NULL;
+
+ csizenames = g_new0 (OstreeDeltaContentSizeNames, 1);
+ csizenames->checksum = g_strdup (checksum);
+
+ /* Transfer ownership so things get cleaned up if we
+ * throw an exception below.
+ */
+ g_hash_table_replace (sizenames_map, csizenames->checksum, csizenames);
+
+ if (!ostree_repo_load_file (repo, checksum,
+ NULL, &finfo, NULL,
+ cancellable, error))
+ goto out;
+
+ csizenames->size = g_file_info_get_size (finfo);
+ }
+
+ if (!csizenames->basenames)
+ csizenames->basenames = g_ptr_array_new_with_free_func (g_free);
+ g_ptr_array_add (csizenames->basenames, g_strdup (name));
+ }
+ else if (iterres == OSTREE_REPO_COMMIT_ITER_RESULT_DIR)
+ {
+ char *name;
+ char *content_checksum;
+ char *meta_checksum;
+ gs_unref_variant GVariant *dirtree = NULL;
+ ostree_cleanup_repo_commit_traverse_iter
+ OstreeRepoCommitTraverseIter subiter = { 0, };
+
+ ostree_repo_commit_traverse_iter_get_dir (iter, &name, &content_checksum, &meta_checksum);
+
+ if (!ostree_repo_load_variant (repo, OSTREE_OBJECT_TYPE_DIR_TREE,
+ content_checksum, &dirtree,
+ error))
+ goto out;
+
+ if (!ostree_repo_commit_traverse_iter_init_dirtree (&subiter, repo, dirtree,
+ OSTREE_REPO_COMMIT_TRAVERSE_FLAG_NONE,
+ error))
+ goto out;
+
+ if (!build_content_sizenames_recurse (repo, &subiter,
+ sizenames_map, include_only_objects,
+ cancellable, error))
+ goto out;
+ }
+ else
+ g_assert_not_reached ();
+ }
+ ret = TRUE;
+ out:
+ return ret;
+}
+
+static int
+compare_sizenames (const void *a,
+ const void *b)
+{
+ OstreeDeltaContentSizeNames *sn_a = *(OstreeDeltaContentSizeNames**)(void*)a;
+ OstreeDeltaContentSizeNames *sn_b = *(OstreeDeltaContentSizeNames**)(void*)b;
+
+ return sn_a->size - sn_b->size;
+}
+
+/**
+ * Generate a sorted array of [(checksum: str, size: uint64, names: array[string]), ...]
+ * for regular file content.
+ */
+static gboolean
+build_content_sizenames_filtered (OstreeRepo *repo,
+ GVariant *commit,
+ GHashTable *include_only_objects,
+ GPtrArray **out_sizenames,
+ GCancellable *cancellable,
+ GError **error)
+{
+ gboolean ret = FALSE;
+ gs_unref_ptrarray GPtrArray *ret_sizenames =
+ g_ptr_array_new_with_free_func (_ostree_delta_content_sizenames_free);
+ gs_unref_hashtable GHashTable *sizenames_map =
+ g_hash_table_new_full (g_str_hash, g_str_equal, NULL, _ostree_delta_content_sizenames_free);
+ ostree_cleanup_repo_commit_traverse_iter
+ OstreeRepoCommitTraverseIter iter = { 0, };
+
+ if (!ostree_repo_commit_traverse_iter_init_commit (&iter, repo, commit,
+ OSTREE_REPO_COMMIT_TRAVERSE_FLAG_NONE,
+ error))
+ goto out;
+
+ if (!build_content_sizenames_recurse (repo, &iter, sizenames_map, include_only_objects,
+ cancellable, error))
+ goto out;
+
+ { GHashTableIter hashiter;
+ gpointer hkey, hvalue;
+
+ g_hash_table_iter_init (&hashiter, sizenames_map);
+ while (g_hash_table_iter_next (&hashiter, &hkey, &hvalue))
+ {
+ g_hash_table_iter_steal (&hashiter);
+ g_ptr_array_add (ret_sizenames, hvalue);
+ }
+ }
+
+ g_ptr_array_sort (ret_sizenames, compare_sizenames);
+
+ ret = TRUE;
+ gs_transfer_out_value (out_sizenames, &ret_sizenames);
+ out:
+ return ret;
+}
+
+static gboolean
+string_array_nonempty_intersection (GPtrArray *a,
+ GPtrArray *b)
+{
+ guint i;
+ for (i = 0; i < a->len; i++)
+ {
+ guint j;
+ const char *a_str = a->pdata[i];
+ for (j = 0; j < b->len; j++)
+ {
+ const char *b_str = b->pdata[j];
+ if (strcmp (a_str, b_str) == 0)
+ return TRUE;
+ }
+ }
+ return FALSE;
+}
+
+/*
+ * Build up a map of files with matching basenames and similar size,
+ * and use it to find apparently similar objects.
+ *
+ * @new_reachable_regfile_content is a Set<checksum> of new regular
+ * file objects.
+ *
+ * Currently, @out_modified_regfile_content will be a Map<to checksum,from checksum>;
+ * however in the future it would be easy to have this function return
+ * multiple candidate matches. The hard part would be changing
+ * the delta compiler to iterate over all matches, determine
+ * a cost for each one, then pick the best.
+ */
+gboolean
+_ostree_delta_compute_similar_objects (OstreeRepo *repo,
+ GVariant *from_commit,
+ GVariant *to_commit,
+ GHashTable *new_reachable_regfile_content,
+ guint similarity_percent_threshold,
+ GHashTable **out_modified_regfile_content,
+ GCancellable *cancellable,
+ GError **error)
+{
+ gboolean ret = FALSE;
+ gs_unref_hashtable GHashTable *ret_modified_regfile_content =
+ g_hash_table_new_full (g_str_hash, g_str_equal, g_free, (GDestroyNotify)g_ptr_array_unref);
+ gs_unref_ptrarray GPtrArray *from_sizes = NULL;
+ gs_unref_ptrarray GPtrArray *to_sizes = NULL;
+ guint i, j;
+ guint lower;
+ guint upper;
+
+ if (!build_content_sizenames_filtered (repo, from_commit, NULL,
+ &from_sizes,
+ cancellable, error))
+ goto out;
+
+ if (!build_content_sizenames_filtered (repo, to_commit, new_reachable_regfile_content,
+ &to_sizes,
+ cancellable, error))
+ goto out;
+
+ /* Iterate over all newly added objects, find objects which have
+ * similar basename and sizes.
+ *
+ * Because the arrays are sorted by size, we can maintain a `lower`
+ * bound on the original (from) objects to start searching.
+ */
+ lower = 0;
+ upper = from_sizes->len;
+ for (i = 0; i < to_sizes->len; i++)
+ {
+ OstreeDeltaContentSizeNames *to_sizenames = to_sizes->pdata[i];
+ const guint64 min_threshold = to_sizenames->size *
+ (1.0-similarity_percent_threshold/100.0);
+ const guint64 max_threshold = to_sizenames->size *
+ (1.0+similarity_percent_threshold/100.0);
+
+ /* Don't build candidates for the empty object */
+ if (to_sizenames->size == 0)
+ continue;
+
+ for (j = lower; j < upper; j++)
+ {
+ OstreeDeltaContentSizeNames *from_sizenames = from_sizes->pdata[j];
+
+ /* Don't build candidates for the empty object */
+ if (from_sizenames->size == 0)
+ continue;
+
+ if (from_sizenames->size < min_threshold)
+ {
+ lower++;
+ continue;
+ }
+
+ if (from_sizenames->size > max_threshold)
+ break;
+
+ if (!string_array_nonempty_intersection (from_sizenames->basenames, to_sizenames->basenames))
+ continue;
+
+ /* Only one candidate right now */
+ g_hash_table_insert (ret_modified_regfile_content,
+ g_strdup (to_sizenames->checksum),
+ g_strdup (from_sizenames->checksum));
+ break;
+ }
+ }
+
+ ret = TRUE;
+ gs_transfer_out_value (out_modified_regfile_content, &ret_modified_regfile_content);
+ out:
+ return ret;
+}
+