diff options
-rw-r--r-- | builtin/pack-objects.c | 84 | ||||
-rw-r--r-- | pack-objects.h | 9 | ||||
-rwxr-xr-x | t/t5314-pack-cycle-detection.sh | 113 |
3 files changed, 206 insertions, 0 deletions
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index c4c2a3c79d..ed77355c6e 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -1494,6 +1494,83 @@ static int pack_offset_sort(const void *_a, const void *_b) (a->in_pack_offset > b->in_pack_offset); } +/* + * Drop an on-disk delta we were planning to reuse. Naively, this would + * just involve blanking out the "delta" field, but we have to deal + * with some extra book-keeping: + * + * 1. Removing ourselves from the delta_sibling linked list. + * + * 2. Updating our size/type to the non-delta representation. These were + * either not recorded initially (size) or overwritten with the delta type + * (type) when check_object() decided to reuse the delta. + */ +static void drop_reused_delta(struct object_entry *entry) +{ + struct object_entry **p = &entry->delta->delta_child; + struct object_info oi = OBJECT_INFO_INIT; + + while (*p) { + if (*p == entry) + *p = (*p)->delta_sibling; + else + p = &(*p)->delta_sibling; + } + entry->delta = NULL; + + oi.sizep = &entry->size; + oi.typep = &entry->type; + if (packed_object_info(entry->in_pack, entry->in_pack_offset, &oi) < 0) { + /* + * We failed to get the info from this pack for some reason; + * fall back to sha1_object_info, which may find another copy. + * And if that fails, the error will be recorded in entry->type + * and dealt with in prepare_pack(). + */ + entry->type = sha1_object_info(entry->idx.sha1, &entry->size); + } +} + +/* + * Follow the chain of deltas from this entry onward, throwing away any links + * that cause us to hit a cycle (as determined by the DFS state flags in + * the entries). + */ +static void break_delta_chains(struct object_entry *entry) +{ + /* If it's not a delta, it can't be part of a cycle. */ + if (!entry->delta) { + entry->dfs_state = DFS_DONE; + return; + } + + switch (entry->dfs_state) { + case DFS_NONE: + /* + * This is the first time we've seen the object. We mark it as + * part of the active potential cycle and recurse. + */ + entry->dfs_state = DFS_ACTIVE; + break_delta_chains(entry->delta); + entry->dfs_state = DFS_DONE; + break; + + case DFS_DONE: + /* object already examined, and not part of a cycle */ + break; + + case DFS_ACTIVE: + /* + * We found a cycle that needs broken. It would be correct to + * break any link in the chain, but it's convenient to + * break this one. + */ + drop_reused_delta(entry); + entry->dfs_state = DFS_DONE; + break; + } +} + static void get_object_details(void) { uint32_t i; @@ -1511,6 +1588,13 @@ static void get_object_details(void) entry->no_try_delta = 1; } + /* + * This must happen in a second pass, since we rely on the delta + * information for the whole list being completed. + */ + for (i = 0; i < to_pack.nr_objects; i++) + break_delta_chains(&to_pack.objects[i]); + free(sorted_by_offset); } diff --git a/pack-objects.h b/pack-objects.h index d1b98b30ff..cc9b9a9b90 100644 --- a/pack-objects.h +++ b/pack-objects.h @@ -27,6 +27,15 @@ struct object_entry { unsigned no_try_delta:1; unsigned tagged:1; /* near the very tip of refs */ unsigned filled:1; /* assigned write-order */ + + /* + * State flags for depth-first search used for analyzing delta cycles. + */ + enum { + DFS_NONE = 0, + DFS_ACTIVE, + DFS_DONE + } dfs_state; }; struct packing_data { diff --git a/t/t5314-pack-cycle-detection.sh b/t/t5314-pack-cycle-detection.sh new file mode 100755 index 0000000000..f7dbdfb412 --- /dev/null +++ b/t/t5314-pack-cycle-detection.sh @@ -0,0 +1,113 @@ +#!/bin/sh + +test_description='test handling of inter-pack delta cycles during repack + +The goal here is to create a situation where we have two blobs, A and B, with A +as a delta against B in one pack, and vice versa in the other. Then if we can +persuade a full repack to find A from one pack and B from the other, that will +give us a cycle when we attempt to reuse those deltas. + +The trick is in the "persuade" step, as it depends on the internals of how +pack-objects picks which pack to reuse the deltas from. But we can assume +that it does so in one of two general strategies: + + 1. Using a static ordering of packs. In this case, no inter-pack cycles can + happen. Any objects with a delta relationship must be present in the same + pack (i.e., no "--thin" packs on disk), so we will find all related objects + from that pack. So assuming there are no cycles within a single pack (and + we avoid generating them via pack-objects or importing them via + index-pack), then our result will have no cycles. + + So this case should pass the tests no matter how we arrange things. + + 2. Picking the next pack to examine based on locality (i.e., where we found + something else recently). + + In this case, we want to make sure that we find the delta versions of A and + B and not their base versions. We can do this by putting two blobs in each + pack. The first is a "dummy" blob that can only be found in the pack in + question. And then the second is the actual delta we want to find. + + The two blobs must be present in the same tree, not present in other trees, + and the dummy pathname must sort before the delta path. + +The setup below focuses on case 2. We have two commits HEAD and HEAD^, each +which has two files: "dummy" and "file". Then we can make two packs which +contain: + + [pack one] + HEAD:dummy + HEAD:file (as delta against HEAD^:file) + HEAD^:file (as base) + + [pack two] + HEAD^:dummy + HEAD^:file (as delta against HEAD:file) + HEAD:file (as base) + +Then no matter which order we start looking at the packs in, we know that we +will always find a delta for "file", because its lookup will always come +immediately after the lookup for "dummy". +' +. ./test-lib.sh + + + +# Create a pack containing the the tree $1 and blob $1:file, with +# the latter stored as a delta against $2:file. +# +# We convince pack-objects to make the delta in the direction of our choosing +# by marking $2 as a preferred-base edge. That results in $1:file as a thin +# delta, and index-pack completes it by adding $2:file as a base. +# +# Note that the two variants of "file" must be similar enough to convince git +# to create the delta. +make_pack () { + { + printf '%s\n' "-$(git rev-parse $2)" + printf '%s dummy\n' "$(git rev-parse $1:dummy)" + printf '%s file\n' "$(git rev-parse $1:file)" + } | + git pack-objects --stdout | + git index-pack --stdin --fix-thin +} + +test_expect_success 'setup' ' + test-genrandom base 4096 >base && + for i in one two + do + # we want shared content here to encourage deltas... + cp base file && + echo $i >>file && + + # ...whereas dummy should be short, because we do not want + # deltas that would create duplicates when we --fix-thin + echo $i >dummy && + + git add file dummy && + test_tick && + git commit -m $i || + return 1 + done && + + make_pack HEAD^ HEAD && + make_pack HEAD HEAD^ +' + +test_expect_success 'repack' ' + # We first want to check that we do not have any internal errors, + # and also that we do not hit the last-ditch cycle-breaking code + # in write_object(), which will issue a warning to stderr. + >expect && + git repack -ad 2>stderr && + test_cmp expect stderr && + + # And then double-check that the resulting pack is usable (i.e., + # we did not fail to notice any cycles). We know we are accessing + # the objects via the new pack here, because "repack -d" will have + # removed the others. + git cat-file blob HEAD:file >/dev/null && + git cat-file blob HEAD^:file >/dev/null +' + +test_done |