diff options
author | Michael Schubert <schu@schu.io> | 2012-08-19 22:26:32 +0200 |
---|---|---|
committer | Michael Schubert <schu@schu.io> | 2012-10-09 21:28:31 +0200 |
commit | 0a32dca5ecbd4c56b02ba78af4174ac7f65a786c (patch) | |
tree | 733f2d3113d96524c1e4c05d7e98c03baf07614f | |
parent | ec1d42b7d5eb5a25b504277bee5b0cc97931e1a7 (diff) | |
download | libgit2-0a32dca5ecbd4c56b02ba78af4174ac7f65a786c.tar.gz |
gsoc-pack-objects WIP
-rw-r--r-- | include/git2.h | 1 | ||||
-rw-r--r-- | include/git2/errors.h | 1 | ||||
-rw-r--r-- | include/git2/pack.h | 89 | ||||
-rw-r--r-- | include/git2/types.h | 3 | ||||
-rw-r--r-- | src/pack-objects.c | 1339 | ||||
-rw-r--r-- | src/pack-objects.h | 85 | ||||
-rw-r--r-- | tests-clar/pack/packbuilder.c | 67 |
7 files changed, 1585 insertions, 0 deletions
diff --git a/include/git2.h b/include/git2.h index 805044abb..d55543986 100644 --- a/include/git2.h +++ b/include/git2.h @@ -51,5 +51,6 @@ #include "git2/notes.h" #include "git2/reset.h" #include "git2/message.h" +#include "git2/pack.h" #endif diff --git a/include/git2/errors.h b/include/git2/errors.h index f6d9bf2e3..1c4e910a6 100644 --- a/include/git2/errors.h +++ b/include/git2/errors.h @@ -56,6 +56,7 @@ typedef enum { GITERR_INDEXER, GITERR_SSL, GITERR_SUBMODULE, + GITERR_THREAD, } git_error_t; /** diff --git a/include/git2/pack.h b/include/git2/pack.h new file mode 100644 index 000000000..748ad2e11 --- /dev/null +++ b/include/git2/pack.h @@ -0,0 +1,89 @@ +/* + * Copyright (C) 2009-2012 the libgit2 contributors + * + * This file is part of libgit2, distributed under the GNU GPL v2 with + * a Linking Exception. For full terms see the included COPYING file. + */ +#ifndef INCLUDE_git_pack_h__ +#define INCLUDE_git_pack_h__ + +#include "common.h" +#include "oid.h" + +/** + * @file git2/pack.h + * @brief Git pack management routines + * @defgroup git_pack Git pack management routines + * @ingroup Git + * @{ + */ +GIT_BEGIN_DECL + +/** + * Initialize a new packbuilder + * + * @param out The new packbuilder object + * @param repo The repository + * + * @return 0 or an error code + */ +GIT_EXTERN(int) git_packbuilder_new(git_packbuilder **out, git_repository *repo); + +/** + * Set number of threads to spawn + * + * By default, libgit2 won't spawn any threads at all; + * when set to 0, libgit2 will autodetect the number of + * CPUs. + * + * @param pb The packbuilder + * @param n Number of threads to spawn + */ +GIT_EXTERN(void) git_packbuilder_set_threads(git_packbuilder *pb, unsigned int n); + +/** + * Insert a single object + * + * For an optimal pack it's mandatory to insert objects in recency order, + * commits followed by trees and blobs. + * + * @param pb The packbuilder + * @param oid The oid of the commit + * @param oid The name; might be NULL + * + * @return 0 or an error code + */ +GIT_EXTERN(int) git_packbuilder_insert(git_packbuilder *pb, const git_oid *oid, const char *name); + +/** + * Insert a root tree object + * + * This will add the tree as well as all referenced trees and blobs. + * + * @param pb The packbuilder + * @param oid The oid of the root tree + * + * @return 0 or an error code + */ +GIT_EXTERN(int) git_packbuilder_insert_tree(git_packbuilder *pb, const git_oid *oid); + +/** + * Write the new pack and the corresponding index to path + * + * @param pb The packbuilder + * @param path Directory to store the new pack and index + * + * @return 0 or an error code + */ +GIT_EXTERN(int) git_packbuilder_write(git_packbuilder *pb, const char *file); + +/** + * Free the packbuilder and all associated data + * + * @param pb The packbuilder + */ +GIT_EXTERN(void) git_packbuilder_free(git_packbuilder *pb); + +/** @} */ +GIT_END_DECL +#endif diff --git a/include/git2/types.h b/include/git2/types.h index 26e9c57e7..01ddbf3d6 100644 --- a/include/git2/types.h +++ b/include/git2/types.h @@ -137,6 +137,9 @@ typedef struct git_reflog git_reflog; /** Representation of a git note */ typedef struct git_note git_note; +/** Representation of a git packbuilder */ +typedef struct git_packbuilder git_packbuilder; + /** Time in a signature */ typedef struct git_time { git_time_t time; /** time in seconds from epoch */ diff --git a/src/pack-objects.c b/src/pack-objects.c new file mode 100644 index 000000000..bb0dfb6c8 --- /dev/null +++ b/src/pack-objects.c @@ -0,0 +1,1339 @@ +/* + * Copyright (C) 2009-2012 the libgit2 contributors + * + * This file is part of libgit2, distributed under the GNU GPL v2 with + * a Linking Exception. For full terms see the included COPYING file. + */ + +#include "pack-objects.h" + +#include "compress.h" +#include "delta.h" +#include "iterator.h" +#include "netops.h" +#include "pack.h" +#include "thread-utils.h" +#include "tree.h" + +#include "git2/pack.h" +#include "git2/commit.h" +#include "git2/tag.h" +#include "git2/indexer.h" +#include "git2/config.h" + +struct unpacked { + git_pobject *object; + void *data; + struct git_delta_index *index; + unsigned int depth; +}; + +static int locate_object_entry_hash(git_packbuilder *pb, const git_oid *oid) +{ + int i; + unsigned int ui; + memcpy(&ui, oid->id, sizeof(unsigned int)); + i = ui % pb->object_ix_hashsz; + while (0 < pb->object_ix[i]) { + if (!git_oid_cmp(oid, &pb->object_list[pb->object_ix[i]-1].id)) + return i; + if (++i == pb->object_ix_hashsz) + i = 0; + } + return -1 - i; +} + +static git_pobject *locate_object_entry(git_packbuilder *pb, const git_oid *oid) +{ + int i; + + if (!pb->object_ix_hashsz) + return NULL; + + i = locate_object_entry_hash(pb, oid); + if (0 <= i) + return &pb->object_list[pb->object_ix[i]-1]; + return NULL; +} + +static void rehash_objects(git_packbuilder *pb) +{ + git_pobject *po; + uint32_t i; + + pb->object_ix_hashsz = pb->nr_objects * 3; + if (pb->object_ix_hashsz < 1024) + pb->object_ix_hashsz = 1024; + pb->object_ix = git__realloc(pb->object_ix, sizeof(int) * pb->object_ix_hashsz); + memset(pb->object_ix, 0, sizeof(int) * pb->object_ix_hashsz); + for (i = 0, po = pb->object_list; i < pb->nr_objects; i++, po++) { + int ix = locate_object_entry_hash(pb, &po->id); + if (0 <= ix) + continue; + ix = -1 - ix; + pb->object_ix[ix] = i + 1; + } +} + +static unsigned name_hash(const char *name) +{ + unsigned c, hash = 0; + + if (!name) + return 0; + + /* + * This effectively just creates a sortable number from the + * last sixteen non-whitespace characters. Last characters + * count "most", so things that end in ".c" sort together. + */ + while ((c = *name++) != 0) { + if (git__isspace(c)) + continue; + hash = (hash >> 2) + (c << 24); + } + return hash; +} + +static int packbuilder_config(git_packbuilder *pb) +{ + git_config *config; + int ret; + + if (git_repository_config__weakptr(&config, pb->repo) < 0) + return -1; + +#define config_get(key, dst, default) \ + ret = git_config_get_int64((int64_t *)&dst, config, key); \ + if (ret == GIT_ENOTFOUND) \ + dst = default; \ + else if (ret < 0) \ + return -1; + + config_get("pack.deltaCacheSize", pb->max_delta_cache_size, + GIT_PACK_DELTA_CACHE_SIZE); + config_get("pack.deltaCacheLimit", pb->cache_max_small_delta_size, + GIT_PACK_DELTA_CACHE_LIMIT); + config_get("pack.deltaCacheSize", pb->big_file_threshold, + GIT_PACK_BIG_FILE_THRESHOLD); + config_get("pack.windowMemory", pb->window_memory_limit, 0); + +#undef config_get + + return 0; +} + +int git_packbuilder_new(git_packbuilder **out, git_repository *repo) +{ + git_packbuilder *pb; + + *out = NULL; + + pb = git__malloc(sizeof(*pb)); + GITERR_CHECK_ALLOC(pb); + + memset(pb, 0x0, sizeof(*pb)); + + pb->repo = repo; + pb->nr_threads = 1; /* do not spawn any thread by default */ + pb->ctx = git_hash_new_ctx(); + + if (git_repository_odb(&pb->odb, repo) < 0) + goto on_error; + + if (packbuilder_config(pb) < 0) + goto on_error; + + *out = pb; + return 0; + +on_error: + git__free(pb); + return -1; +} + +void git_packbuilder_set_threads(git_packbuilder *pb, unsigned int n) +{ + assert(pb); + pb->nr_threads = n; +} + +int git_packbuilder_insert(git_packbuilder *pb, const git_oid *oid, + const char *name) +{ + git_pobject *po; + git_odb_object *obj; + int ix; + + assert(pb && oid); + + ix = pb->nr_objects ? locate_object_entry_hash(pb, oid) : -1; + if (ix >=0) + return 0; + + if (pb->nr_objects >= pb->nr_alloc) { + pb->nr_alloc = (pb->nr_alloc + 1024) * 3 / 2; + pb->object_list = git__realloc(pb->object_list, + pb->nr_alloc * sizeof(*po)); + GITERR_CHECK_ALLOC(pb->object_list); + } + + if (git_odb_read(&obj, pb->odb, oid) < 0) + return -1; + + po = pb->object_list + pb->nr_objects++; + memset(po, 0x0, sizeof(*po)); + + git_oid_cpy(&po->id, oid); + po->type = git_odb_object_type(obj); + po->size = git_odb_object_size(obj); + po->hash = name_hash(name); + + git_odb_object_free(obj); + + if ((uint32_t)pb->object_ix_hashsz * 3 <= pb->nr_objects * 4) + rehash_objects(pb); + else + pb->object_ix[-1 - ix] = pb->nr_objects; + + pb->done = false; + return 0; +} + +/* + * The per-object header is a pretty dense thing, which is + * - first byte: low four bits are "size", + * then three bits of "type", + * with the high bit being "size continues". + * - each byte afterwards: low seven bits are size continuation, + * with the high bit being "size continues" + */ +static int gen_pack_object_header( + unsigned char *hdr, + unsigned long size, + git_otype type) +{ + unsigned char *hdr_base; + unsigned char c; + + assert(type >= GIT_OBJ_COMMIT && type <= GIT_OBJ_REF_DELTA); + + /* TODO: add support for chunked objects; see git.git 6c0d19b1 */ + + c = (unsigned char)((type << 4) | (size & 15)); + size >>= 4; + hdr_base = hdr; + + while (size) { + *hdr++ = c | 0x80; + c = size & 0x7f; + size >>= 7; + } + *hdr++ = c; + + return hdr - hdr_base; +} + +static int get_delta(void **out, git_odb *odb, git_pobject *po) +{ + git_odb_object *src = NULL, *trg = NULL; + unsigned long delta_size; + void *delta_buf; + + *out = NULL; + + if (git_odb_read(&src, odb, &po->delta->id) < 0 || + git_odb_read(&trg, odb, &po->id) < 0) + goto on_error; + + delta_buf = git_delta(git_odb_object_data(src), git_odb_object_size(src), + git_odb_object_data(trg), git_odb_object_size(trg), + &delta_size, 0); + + if (!delta_buf || delta_size != po->delta_size) { + giterr_set(GITERR_INVALID, "Delta size changed"); + goto on_error; + } + + *out = delta_buf; + + git_odb_object_free(src); + git_odb_object_free(trg); + return 0; + +on_error: + git_odb_object_free(src); + git_odb_object_free(trg); + return -1; +} + +static int write_object(git_buf *buf, git_packbuilder *pb, git_pobject *po) +{ + git_odb_object *obj = NULL; + git_buf zbuf = GIT_BUF_INIT; + git_otype type; + unsigned char hdr[10]; + unsigned int hdr_len; + unsigned long size; + void *data; + + if (po->delta) { + if (po->delta_data) + data = po->delta_data; + else if (get_delta(&data, pb->odb, po) < 0) + goto on_error; + size = po->delta_size; + type = GIT_OBJ_REF_DELTA; + } else { + if (git_odb_read(&obj, pb->odb, &po->id)) + goto on_error; + + data = (void *)git_odb_object_data(obj); + size = git_odb_object_size(obj); + type = git_odb_object_type(obj); + } + + /* Write header */ + hdr_len = gen_pack_object_header(hdr, size, type); + + if (git_buf_put(buf, (char *)hdr, hdr_len) < 0) + goto on_error; + + git_hash_update(pb->ctx, hdr, hdr_len); + + if (type == GIT_OBJ_REF_DELTA) { + if (git_buf_put(buf, (char *)po->delta->id.id, + GIT_OID_RAWSZ) < 0) + goto on_error; + + git_hash_update(pb->ctx, po->delta->id.id, GIT_OID_RAWSZ); + } + + /* Write data */ + if (po->z_delta_size) + size = po->z_delta_size; + else if (git__compress(&zbuf, data, size) < 0) + goto on_error; + else { + if (po->delta) + git__free(data); + data = zbuf.ptr; + size = zbuf.size; + } + + if (git_buf_put(buf, data, size) < 0) + goto on_error; + + git_hash_update(pb->ctx, data, size); + + if (po->delta_data) + git__free(po->delta_data); + + git_odb_object_free(obj); + git_buf_free(&zbuf); + + pb->nr_written++; + return 0; + +on_error: + git_odb_object_free(obj); + git_buf_free(&zbuf); + return -1; +} + +enum write_one_status { + WRITE_ONE_SKIP = -1, /* already written */ + WRITE_ONE_BREAK = 0, /* writing this will bust the limit; not written */ + WRITE_ONE_WRITTEN = 1, /* normal */ + WRITE_ONE_RECURSIVE = 2 /* already scheduled to be written */ +}; + +static int write_one(git_buf *buf, git_packbuilder *pb, git_pobject *po, + enum write_one_status *status) +{ + if (po->recursing) { + *status = WRITE_ONE_RECURSIVE; + return 0; + } else if (po->written) { + *status = WRITE_ONE_SKIP; + return 0; + } + + if (po->delta) { + po->recursing = 1; + if (write_one(buf, pb, po->delta, status) < 0) + return -1; + switch (*status) { + case WRITE_ONE_RECURSIVE: + /* we cannot depend on this one */ + po->delta = NULL; + break; + default: + break; + } + } + + po->written = 1; + po->recursing = 0; + return write_object(buf, pb, po); +} + +GIT_INLINE(void) add_to_write_order(git_pobject **wo, unsigned int *endp, + git_pobject *po) +{ + if (po->filled) + return; + wo[(*endp)++] = po; + po->filled = 1; +} + +static void add_descendants_to_write_order(git_pobject **wo, unsigned int *endp, + git_pobject *po) +{ + int add_to_order = 1; + while (po) { + if (add_to_order) { + git_pobject *s; + /* add this node... */ + add_to_write_order(wo, endp, po); + /* all its siblings... */ + for (s = po->delta_sibling; s; s = s->delta_sibling) { + add_to_write_order(wo, endp, s); + } + } + /* drop down a level to add left subtree nodes if possible */ + if (po->delta_child) { + add_to_order = 1; + po = po->delta_child; + } else { + add_to_order = 0; + /* our sibling might have some children, it is next */ + if (po->delta_sibling) { + po = po->delta_sibling; + continue; + } + /* go back to our parent node */ + po = po->delta; + while (po && !po->delta_sibling) { + /* we're on the right side of a subtree, keep + * going up until we can go right again */ + po = po->delta; + } + if (!po) { + /* done- we hit our original root node */ + return; + } + /* pass it off to sibling at this level */ + po = po->delta_sibling; + } + }; +} + +static void add_family_to_write_order(git_pobject **wo, unsigned int *endp, + git_pobject *po) +{ + git_pobject *root; + + for (root = po; root->delta; root = root->delta) + ; /* nothing */ + add_descendants_to_write_order(wo, endp, root); +} + +static int cb_tag_foreach(const char *name, git_oid *oid, void *data) +{ + git_packbuilder *pb = data; + git_pobject *po = locate_object_entry(pb, oid); + + GIT_UNUSED(name); + + if (po) + po->tagged = 1; + /* TODO: peel objects */ + return 0; +} + +static git_pobject **compute_write_order(git_packbuilder *pb) +{ + unsigned int i, wo_end, last_untagged; + + git_pobject **wo = git__malloc(sizeof(*wo) * pb->nr_objects); + + for (i = 0; i < pb->nr_objects; i++) { + git_pobject *po = pb->object_list + i; + po->tagged = 0; + po->filled = 0; + po->delta_child = NULL; + po->delta_sibling = NULL; + } + + /* + * Fully connect delta_child/delta_sibling network. + * Make sure delta_sibling is sorted in the original + * recency order. + */ + for (i = pb->nr_objects; i > 0;) { + git_pobject *po = &pb->object_list[--i]; + if (!po->delta) + continue; + /* Mark me as the first child */ + po->delta_sibling = po->delta->delta_child; + po->delta->delta_child = po; + } + + /* + * Mark objects that are at the tip of tags. + */ + if (git_tag_foreach(pb->repo, &cb_tag_foreach, pb) < 0) + return NULL; + + /* + * Give the objects in the original recency order until + * we see a tagged tip. + */ + for (i = wo_end = 0; i < pb->nr_objects; i++) { + git_pobject *po = pb->object_list + i; + if (po->tagged) + break; + add_to_write_order(wo, &wo_end, po); + } + last_untagged = i; + + /* + * Then fill all the tagged tips. + */ + for (; i < pb->nr_objects; i++) { + git_pobject *po = pb->object_list + i; + if (po->tagged) + add_to_write_order(wo, &wo_end, po); + } + + /* + * And then all remaining commits and tags. + */ + for (i = last_untagged; i < pb->nr_objects; i++) { + git_pobject *po = pb->object_list + i; + if (po->type != GIT_OBJ_COMMIT && + po->type != GIT_OBJ_TAG) + continue; + add_to_write_order(wo, &wo_end, po); + } + + /* + * And then all the trees. + */ + for (i = last_untagged; i < pb->nr_objects; i++) { + git_pobject *po = pb->object_list + i; + if (po->type != GIT_OBJ_TREE) + continue; + add_to_write_order(wo, &wo_end, po); + } + + /* + * Finally all the rest in really tight order + */ + for (i = last_untagged; i < pb->nr_objects; i++) { + git_pobject *po = pb->object_list + i; + if (!po->filled) + add_family_to_write_order(wo, &wo_end, po); + } + + if (wo_end != pb->nr_objects) { + giterr_set(GITERR_INVALID, "invalid write order"); + return NULL; + } + + return wo; +} + +static int write_pack(git_packbuilder *pb, + int (*cb)(void *buf, size_t size, void *data), + void *data) +{ + git_pobject **write_order; + git_pobject *po; + git_buf buf = GIT_BUF_INIT; + enum write_one_status status; + struct git_pack_header ph; + unsigned int i = 0; + + write_order = compute_write_order(pb); + if (write_order == NULL) + goto on_error; + + /* Write pack header */ + ph.hdr_signature = htonl(PACK_SIGNATURE); + ph.hdr_version = htonl(PACK_VERSION); + ph.hdr_entries = htonl(pb->nr_objects); + + if (cb(&ph, sizeof(ph), data) < 0) + goto on_error; + + git_hash_update(pb->ctx, &ph, sizeof(ph)); + + pb->nr_remaining = pb->nr_objects; + do { + pb->nr_written = 0; + for ( ; i < pb->nr_objects; ++i) { + po = write_order[i]; + if (write_one(&buf, pb, po, &status) < 0) + goto on_error; + if (cb(buf.ptr, buf.size, data) < 0) + goto on_error; + git_buf_clear(&buf); + } + + pb->nr_remaining -= pb->nr_written; + } while (pb->nr_remaining && i < pb->nr_objects); + + git__free(write_order); + git_buf_free(&buf); + git_hash_final(&pb->pack_oid, pb->ctx); + + return cb(pb->pack_oid.id, GIT_OID_RAWSZ, data); + +on_error: + git__free(write_order); + git_buf_free(&buf); + return -1; +} + +static int send_pack_file(void *buf, size_t size, void *data) +{ + git_transport *t = (git_transport *)data; + return gitno_send(t, buf, size, 0); +} + +static int write_pack_buf(void *buf, size_t size, void *data) +{ + git_buf *b = (git_buf *)data; + return git_buf_put(b, buf, size); +} + +static int write_pack_to_file(void *buf, size_t size, void *data) +{ + git_filebuf *file = (git_filebuf *)data; + return git_filebuf_write(file, buf, size); +} + +static int write_pack_file(git_packbuilder *pb, const char *path) +{ + git_filebuf file = GIT_FILEBUF_INIT; + + if (git_filebuf_open(&file, path, 0) < 0 || + write_pack(pb, &write_pack_to_file, &file) < 0 || + git_filebuf_commit(&file, GIT_PACK_FILE_MODE) < 0) { + git_filebuf_cleanup(&file); + return -1; + } + + return 0; +} + +static int type_size_sort(const void *_a, const void *_b) +{ + const git_pobject *a = (git_pobject *)_a; + const git_pobject *b = (git_pobject *)_b; + + if (a->type > b->type) + return -1; + if (a->type < b->type) + return 1; + if (a->hash > b->hash) + return -1; + if (a->hash < b->hash) + return 1; + /* + * TODO + * + if (a->preferred_base > b->preferred_base) + return -1; + if (a->preferred_base < b->preferred_base) + return 1; + */ + if (a->size > b->size) + return -1; + if (a->size < b->size) + return 1; + return a < b ? -1 : (a > b); /* newest first */ +} + +static int delta_cacheable(git_packbuilder *pb, unsigned long src_size, + unsigned long trg_size, unsigned long delta_size) +{ + if (pb->max_delta_cache_size && + pb->delta_cache_size + delta_size > pb->max_delta_cache_size) + return 0; + + if (delta_size < pb->cache_max_small_delta_size) + return 1; + + /* cache delta, if objects are large enough compared to delta size */ + if ((src_size >> 20) + (trg_size >> 21) > (delta_size >> 10)) + return 1; + + return 0; +} + +#ifdef GIT_THREADS +static git_mutex cache_mutex; +#define cache_lock() git_mutex_lock(&cache_mutex); +#define cache_unlock() git_mutex_unlock(&cache_mutex); + +static git_mutex progress_mutex; +#define progress_lock() git_mutex_lock(&progress_mutex); +#define progress_unlock() git_mutex_unlock(&progress_mutex); + +static git_cond progress_cond; +#else + +#define cache_lock() (void)0; +#define cache_unlock() (void)0; +#define progress_lock() (void)0; +#define progress_unlock() (void)0; +#endif + +static int try_delta(git_packbuilder *pb, struct unpacked *trg, + struct unpacked *src, unsigned int max_depth, + unsigned long *mem_usage, int *ret) +{ + git_pobject *trg_object = trg->object; + git_pobject *src_object = src->object; + git_odb_object *obj; + unsigned long trg_size, src_size, delta_size, + sizediff, max_size, sz; + unsigned int ref_depth; + void *delta_buf; + + /* Don't bother doing diffs between different types */ + if (trg_object->type != src_object->type) { + *ret = -1; + return 0; + } + + *ret = 0; + + /* TODO: support reuse-delta */ + + /* Let's not bust the allowed depth. */ + if (src->depth >= max_depth) + return 0; + + /* Now some size filtering heuristics. */ + trg_size = trg_object->size; + if (!trg_object->delta) { + max_size = trg_size/2 - 20; + ref_depth = 1; + } else { + max_size = trg_object->delta_size; + ref_depth = trg->depth; + } + + max_size = (uint64_t)max_size * (max_depth - src->depth) / + (max_depth - ref_depth + 1); + if (max_size == 0) + return 0; + + src_size = src_object->size; + sizediff = src_size < trg_size ? trg_size - src_size : 0; + if (sizediff >= max_size) + return 0; + if (trg_size < src_size / 32) + return 0; + + /* Load data if not already done */ + if (!trg->data) { + if (git_odb_read(&obj, pb->odb, &trg_object->id) < 0) + return -1; + + sz = git_odb_object_size(obj); + trg->data = git__malloc(sz); + GITERR_CHECK_ALLOC(trg->data); + memcpy(trg->data, git_odb_object_data(obj), sz); + + git_odb_object_free(obj); + + if (sz != trg_size) { + giterr_set(GITERR_INVALID, + "Inconsistent target object length"); + return -1; + } + + *mem_usage += sz; + } + if (!src->data) { + if (git_odb_read(&obj, pb->odb, &src_object->id) < 0) + return -1; + + sz = git_odb_object_size(obj); + src->data = git__malloc(sz); + GITERR_CHECK_ALLOC(src->data); + memcpy(src->data, git_odb_object_data(obj), sz); + + git_odb_object_free(obj); + + if (sz != src_size) { + giterr_set(GITERR_INVALID, + "Inconsistent source object length"); + return -1; + } + + *mem_usage += sz; + } + if (!src->index) { + src->index = git_delta_create_index(src->data, src_size); + if (!src->index) + return 0; /* suboptimal pack - out of memory */ + + *mem_usage += git_delta_sizeof_index(src->index); + } + + delta_buf = git_delta_create(src->index, trg->data, trg_size, + &delta_size, max_size); + if (!delta_buf) + return 0; + + if (trg_object->delta) { + /* Prefer only shallower same-sized deltas. */ + if (delta_size == trg_object->delta_size && + src->depth + 1 >= trg->depth) { + git__free(delta_buf); + return 0; + } + } + + cache_lock(); + if (trg_object->delta_data) { + git__free(trg_object->delta_data); + pb->delta_cache_size -= trg_object->delta_size; + trg_object->delta_data = NULL; + } + if (delta_cacheable(pb, src_size, trg_size, delta_size)) { + pb->delta_cache_size += delta_size; + cache_unlock(); + + trg_object->delta_data = git__realloc(delta_buf, delta_size); + GITERR_CHECK_ALLOC(trg_object->delta_data); + } else { + /* create delta when writing the pack */ + cache_unlock(); + git__free(delta_buf); + } + + trg_object->delta = src_object; + trg_object->delta_size = delta_size; + trg->depth = src->depth + 1; + + *ret = 1; + return 0; +} + +static unsigned int check_delta_limit(git_pobject *me, unsigned int n) +{ + git_pobject *child = me->delta_child; + unsigned int m = n; + + while (child) { + unsigned int c = check_delta_limit(child, n + 1); + if (m < c) + m = c; + child = child->delta_sibling; + } + return m; +} + +static unsigned long free_unpacked(struct unpacked *n) +{ + unsigned long freed_mem = git_delta_sizeof_index(n->index); + git_delta_free_index(n->index); + n->index = NULL; + if (n->data) { + freed_mem += n->object->size; + git__free(n->data); + n->data = NULL; + } + n->object = NULL; + n->depth = 0; + return freed_mem; +} + +static int find_deltas(git_packbuilder *pb, git_pobject **list, + unsigned int *list_size, unsigned int window, + unsigned int depth) +{ + git_pobject *po; + git_buf zbuf = GIT_BUF_INIT; + struct unpacked *array; + uint32_t idx = 0, count = 0; + unsigned long mem_usage = 0; + unsigned int i; + int error = -1; + + array = git__calloc(window, sizeof(struct unpacked)); + GITERR_CHECK_ALLOC(array); + + for (;;) { + struct unpacked *n = array + idx; + unsigned int max_depth; + int j, best_base = -1; + + progress_lock(); + if (!*list_size) { + progress_unlock(); + break; + } + + po = *list++; + (*list_size)--; + progress_unlock(); + + mem_usage -= free_unpacked(n); + n->object = po; + + while (pb->window_memory_limit && + mem_usage > pb->window_memory_limit && + count > 1) { + uint32_t tail = (idx + window - count) % window; + mem_usage -= free_unpacked(array + tail); + count--; + } + + /* + * If the current object is at pack edge, take the depth the + * objects that depend on the current object into account + * otherwise they would become too deep. + */ + max_depth = depth; + if (po->delta_child) { + max_depth -= check_delta_limit(po, 0); + if (max_depth <= 0) + goto next; + } + + j = window; + while (--j > 0) { + int ret; + uint32_t other_idx = idx + j; + struct unpacked *m; + + if (other_idx >= window) + other_idx -= window; + + m = array + other_idx; + if (!m->object) + break; + + if (try_delta(pb, n, m, max_depth, &mem_usage, &ret) < 0) + goto on_error; + if (ret < 0) + break; + else if (ret > 0) + best_base = other_idx; + } + + /* + * If we decided to cache the delta data, then it is best + * to compress it right away. First because we have to do + * it anyway, and doing it here while we're threaded will + * save a lot of time in the non threaded write phase, + * as well as allow for caching more deltas within + * the same cache size limit. + * ... + * But only if not writing to stdout, since in that case + * the network is most likely throttling writes anyway, + * and therefore it is best to go to the write phase ASAP + * instead, as we can afford spending more time compressing + * between writes at that moment. + */ + if (po->delta_data) { + if (git__compress(&zbuf, po->delta_data, po->delta_size) < 0) + goto on_error; + + git__free(po->delta_data); + po->delta_data = git__malloc(zbuf.size); + GITERR_CHECK_ALLOC(po->delta_data); + + memcpy(po->delta_data, zbuf.ptr, zbuf.size); + po->z_delta_size = zbuf.size; + git_buf_clear(&zbuf); + + cache_lock(); + pb->delta_cache_size -= po->delta_size; + pb->delta_cache_size += po->z_delta_size; + cache_unlock(); + } + + /* + * If we made n a delta, and if n is already at max + * depth, leaving it in the window is pointless. we + * should evict it first. + */ + if (po->delta && max_depth <= n->depth) + continue; + + /* + * Move the best delta base up in the window, after the + * currently deltified object, to keep it longer. It will + * be the first base object to be attempted next. + */ + if (po->delta) { + struct unpacked swap = array[best_base]; + int dist = (window + idx - best_base) % window; + int dst = best_base; + while (dist--) { + int src = (dst + 1) % window; + array[dst] = array[src]; + dst = src; + } + array[dst] = swap; + } + + next: + idx++; + if (count + 1 < window) + count++; + if (idx >= window) + idx = 0; + } + error = 0; + +on_error: + for (i = 0; i < window; ++i) { + git__free(array[i].index); + git__free(array[i].data); + } + git__free(array); + git_buf_free(&zbuf); + + return error; +} + +#ifdef GIT_THREADS + +struct thread_params { + git_thread thread; + git_packbuilder *pb; + + git_pobject **list; + + git_cond cond; + git_mutex mutex; + + unsigned int list_size; + unsigned int remaining; + + int window; + int depth; + int working; + int data_ready; +}; + +static void init_threaded_search(void) +{ + git_mutex_init(&cache_mutex); + git_mutex_init(&progress_mutex); + + git_cond_init(&progress_cond); +} + +static void cleanup_threaded_search(void) +{ + git_cond_free(&progress_cond); + + git_mutex_free(&cache_mutex); + git_mutex_free(&progress_mutex); +} + +static void *threaded_find_deltas(void *arg) +{ + struct thread_params *me = arg; + + while (me->remaining) { + if (find_deltas(me->pb, me->list, &me->remaining, + me->window, me->depth) < 0) { + ; /* TODO */ + } + + progress_lock(); + me->working = 0; + git_cond_signal(&progress_cond); + progress_unlock(); + + /* + * We must not set ->data_ready before we wait on the + * condition because the main thread may have set it to 1 + * before we get here. In order to be sure that new + * work is available if we see 1 in ->data_ready, it + * was initialized to 0 before this thread was spawned + * and we reset it to 0 right away. + */ + git_mutex_lock(&me->mutex); + while (!me->data_ready) + git_cond_wait(&me->cond, &me->mutex); + me->data_ready = 0; + git_mutex_unlock(&me->mutex); + } + /* leave ->working 1 so that this doesn't get more work assigned */ + return NULL; +} + +static int ll_find_deltas(git_packbuilder *pb, git_pobject **list, + unsigned int list_size, unsigned int window, + unsigned int depth) +{ + struct thread_params *p; + int i, ret, active_threads = 0; + + init_threaded_search(); + + if (!pb->nr_threads) + pb->nr_threads = git_online_cpus(); + if (pb->nr_threads <= 1) { + find_deltas(pb, list, &list_size, window, depth); + cleanup_threaded_search(); + return 0; + } + + p = git__malloc(pb->nr_threads * sizeof(*p)); + GITERR_CHECK_ALLOC(p); + + /* Partition the work among the threads */ + for (i = 0; i < pb->nr_threads; ++i) { + unsigned sub_size = list_size / (pb->nr_threads - i); + + /* don't use too small segments or no deltas will be found */ + if (sub_size < 2*window && i+1 < pb->nr_threads) + sub_size = 0; + + p[i].pb = pb; + p[i].window = window; + p[i].depth = depth; + p[i].working = 1; + p[i].data_ready = 0; + + /* try to split chunks on "path" boundaries */ + while (sub_size && sub_size < list_size && + list[sub_size]->hash && + list[sub_size]->hash == list[sub_size-1]->hash) + sub_size++; + + p[i].list = list; + p[i].list_size = sub_size; + p[i].remaining = sub_size; + + list += sub_size; + list_size -= sub_size; + } + + /* Start work threads */ + for (i = 0; i < pb->nr_threads; ++i) { + if (!p[i].list_size) + continue; + + git_mutex_init(&p[i].mutex); + git_cond_init(&p[i].cond); + + ret = git_thread_create(&p[i].thread, NULL, + threaded_find_deltas, &p[i]); + if (ret) { + giterr_set(GITERR_THREAD, "unable to create thread"); + return -1; + } + active_threads++; + } + + /* + * Now let's wait for work completion. Each time a thread is done + * with its work, we steal half of the remaining work from the + * thread with the largest number of unprocessed objects and give + * it to that newly idle thread. This ensure good load balancing + * until the remaining object list segments are simply too short + * to be worth splitting anymore. + */ + while (active_threads) { + struct thread_params *target = NULL; + struct thread_params *victim = NULL; + unsigned sub_size = 0; + + progress_lock(); + for (;;) { + for (i = 0; !target && i < pb->nr_threads; i++) + if (!p[i].working) + target = &p[i]; + if (target) + break; + git_cond_wait(&progress_cond, &progress_mutex); + } + + for (i = 0; i < pb->nr_threads; i++) + if (p[i].remaining > 2*window && + (!victim || victim->remaining < p[i].remaining)) + victim = &p[i]; + if (victim) { + sub_size = victim->remaining / 2; + list = victim->list + victim->list_size - sub_size; + while (sub_size && list[0]->hash && + list[0]->hash == list[-1]->hash) { + list++; + sub_size--; + } + if (!sub_size) { + /* + * It is possible for some "paths" to have + * so many objects that no hash boundary + * might be found. Let's just steal the + * exact half in that case. + */ + sub_size = victim->remaining / 2; + list -= sub_size; + } + target->list = list; + victim->list_size -= sub_size; + victim->remaining -= sub_size; + } + target->list_size = sub_size; + target->remaining = sub_size; + target->working = 1; + progress_unlock(); + + git_mutex_lock(&target->mutex); + target->data_ready = 1; + git_cond_signal(&target->cond); + git_mutex_unlock(&target->mutex); + + if (!sub_size) { + git_thread_join(target->thread, NULL); + git_cond_free(&target->cond); + git_mutex_free(&target->mutex); + active_threads--; + } + } + + cleanup_threaded_search(); + git__free(p); + return 0; +} + +#else +#define ll_find_deltas(pb, l, ls, w, d) find_deltas(pb, l, &ls, w, d) +#endif + +static void get_object_details(git_packbuilder *pb) +{ + git_pobject *po; + unsigned int i; + + for (i = 0; i < pb->nr_objects; ++i) { + po = &pb->object_list[i]; + if (pb->big_file_threshold < po->size) + po->no_try_delta = 1; + } +} + +static int prepare_pack(git_packbuilder *pb) +{ + git_pobject **delta_list; + unsigned int i, n = 0; + + if (pb->nr_objects == 0 || pb->done) + return 0; /* nothing to do */ + + get_object_details(pb); + + delta_list = git__malloc(pb->nr_objects * sizeof(*delta_list)); + GITERR_CHECK_ALLOC(delta_list); + + for (i = 0; i < pb->nr_objects; ++i) { + git_pobject *po = pb->object_list + i; + + if (po->size < 50) + continue; + + if (po->no_try_delta) + continue; + + delta_list[n++] = po; + } + + if (n > 1) { + git__tsort((void **)delta_list, n, type_size_sort); + if (ll_find_deltas(pb, delta_list, n, + GIT_PACK_WINDOW + 1, + GIT_PACK_DEPTH) < 0) { + git__free(delta_list); + return -1; + } + } + + pb->done = true; + git__free(delta_list); + return 0; +} + +#define PREPARE_PACK if (prepare_pack(pb) < 0) { return -1; } + +int git_packbuilder_send(git_packbuilder *pb, git_transport *t) +{ + PREPARE_PACK; + return write_pack(pb, &send_pack_file, t); +} + +int git_packbuilder_write_buf(git_buf *buf, git_packbuilder *pb) +{ + PREPARE_PACK; + return write_pack(pb, &write_pack_buf, buf); +} + +int git_packbuilder_write(git_packbuilder *pb, const char *path) +{ + PREPARE_PACK; + return write_pack_file(pb, path); +} + +#undef PREPARE_PACK + +static int cb_tree_walk(const char *root, const git_tree_entry *entry, void *payload) +{ + git_packbuilder *pb = payload; + git_buf buf = GIT_BUF_INIT; + + git_buf_puts(&buf, root); + git_buf_puts(&buf, git_tree_entry_name(entry)); + + if (git_packbuilder_insert(pb, git_tree_entry_id(entry), + git_buf_cstr(&buf)) < 0) { + git_buf_free(&buf); + return -1; + } + + git_buf_free(&buf); + return 0; +} + +int git_packbuilder_insert_tree(git_packbuilder *pb, const git_oid *oid) +{ + git_tree *tree; + + if (git_tree_lookup(&tree, pb->repo, oid) < 0 || + git_packbuilder_insert(pb, oid, NULL) < 0) + return -1; + + if (git_tree_walk(tree, cb_tree_walk, GIT_TREEWALK_PRE, pb) < 0) { + git_tree_free(tree); + return -1; + } + + git_tree_free(tree); + return 0; +} + +void git_packbuilder_free(git_packbuilder *pb) +{ + if (pb == NULL) + return; + + git_odb_free(pb->odb); + git_hash_free_ctx(pb->ctx); + git__free(pb->object_ix); + git__free(pb->object_list); + git__free(pb); +} diff --git a/src/pack-objects.h b/src/pack-objects.h new file mode 100644 index 000000000..971b30217 --- /dev/null +++ b/src/pack-objects.h @@ -0,0 +1,85 @@ +/* + * Copyright (C) 2009-2012 the libgit2 contributors + * + * This file is part of libgit2, distributed under the GNU GPL v2 with + * a Linking Exception. For full terms see the included COPYING file. + */ + +#ifndef INCLUDE_pack_objects_h__ +#define INCLUDE_pack_objects_h__ + +#include "common.h" + +#include "buffer.h" +#include "hash.h" + +#include "git2/oid.h" + +#define GIT_PACK_WINDOW 10 /* number of objects to possibly delta against */ +#define GIT_PACK_DEPTH 50 /* max delta depth */ +#define GIT_PACK_DELTA_CACHE_SIZE (256 * 1024 * 1024) +#define GIT_PACK_DELTA_CACHE_LIMIT 1000 +#define GIT_PACK_BIG_FILE_THRESHOLD (512 * 1024 * 1024) + +typedef struct git_pobject { + git_oid id; + git_otype type; + git_off_t offset; + + size_t size; + + unsigned int hash; /* name hint hash */ + + struct git_pobject *delta; /* delta base object */ + struct git_pobject *delta_child; /* deltified objects who bases me */ + struct git_pobject *delta_sibling; /* other deltified objects + * who uses the same base as + * me */ + + void *delta_data; + unsigned long delta_size; + unsigned long z_delta_size; + + int written:1, + recursing:1, + no_try_delta:1, + tagged:1, + filled:1; +} git_pobject; + +struct git_packbuilder { + git_repository *repo; /* associated repository */ + git_odb *odb; /* associated object database */ + + git_hash_ctx *ctx; + + uint32_t nr_objects, + nr_alloc, + nr_written, + nr_remaining; + + git_pobject *object_list; + + int *object_ix; + int object_ix_hashsz; + + + git_oid pack_oid; /* hash of written pack */ + + /* configs */ + unsigned long delta_cache_size; + unsigned long max_delta_cache_size; + unsigned long cache_max_small_delta_size; + unsigned long big_file_threshold; + unsigned long window_memory_limit; + + int nr_threads; /* nr of threads to use */ + + bool done; +}; + + +int git_packbuilder_send(git_packbuilder *pb, git_transport *t); +int git_packbuilder_write_buf(git_buf *buf, git_packbuilder *pb); + +#endif diff --git a/tests-clar/pack/packbuilder.c b/tests-clar/pack/packbuilder.c new file mode 100644 index 000000000..fa7bec14e --- /dev/null +++ b/tests-clar/pack/packbuilder.c @@ -0,0 +1,67 @@ +#include "clar_libgit2.h" +#include "iterator.h" +#include "vector.h" + +static git_repository *_repo; +static git_revwalk *_revwalker; +static git_packbuilder *_packbuilder; +static git_indexer *_indexer; +static git_vector _commits; + +void test_pack_packbuilder__initialize(void) +{ + cl_git_pass(git_repository_open(&_repo, cl_fixture("testrepo.git"))); + cl_git_pass(git_revwalk_new(&_revwalker, _repo)); + cl_git_pass(git_packbuilder_new(&_packbuilder, _repo)); + cl_git_pass(git_vector_init(&_commits, 0, NULL)); +} + +void test_pack_packbuilder__cleanup(void) +{ + git_oid *o; + unsigned int i; + + git_vector_foreach(&_commits, i, o) { + git__free(o); + } + git_vector_free(&_commits); + git_packbuilder_free(_packbuilder); + git_revwalk_free(_revwalker); + git_indexer_free(_indexer); + git_repository_free(_repo); +} + +void test_pack_packbuilder__create_pack(void) +{ + git_indexer_stats stats; + git_oid oid, *o; + unsigned int i; + + git_revwalk_sorting(_revwalker, GIT_SORT_TIME); + cl_git_pass(git_revwalk_push_ref(_revwalker, "HEAD")); + + while (git_revwalk_next(&oid, _revwalker) == 0) { + o = git__malloc(GIT_OID_RAWSZ); + cl_assert(o != NULL); + git_oid_cpy(o, &oid); + cl_git_pass(git_vector_insert(&_commits, o)); + } + + git_vector_foreach(&_commits, i, o) { + cl_git_pass(git_packbuilder_insert(_packbuilder, o, NULL)); + } + + git_vector_foreach(&_commits, i, o) { + git_object *obj; + cl_git_pass(git_object_lookup(&obj, _repo, o, GIT_OBJ_COMMIT)); + cl_git_pass(git_packbuilder_insert_tree(_packbuilder, + git_commit_tree_oid((git_commit *)obj))); + git_object_free(obj); + } + + cl_git_pass(git_packbuilder_write(_packbuilder, "testpack.pack")); + + cl_git_pass(git_indexer_new(&_indexer, "testpack.pack")); + cl_git_pass(git_indexer_run(_indexer, &stats)); + cl_git_pass(git_indexer_write(_indexer)); +} |