diff options
author | Edward Thomson <ethomson@edwardthomson.com> | 2021-11-14 08:47:40 -0500 |
---|---|---|
committer | Edward Thomson <ethomson@edwardthomson.com> | 2022-02-22 22:07:44 -0500 |
commit | ef4ab2988320005cbcb3db920e6b41f10b3c60cf (patch) | |
tree | 6609f213ad9d607e8df1f543919d3154c056c3ef /src/libgit2/indexer.c | |
parent | 49e180c862dc7c6d1f62a53bf8756e25b3417968 (diff) | |
download | libgit2-ef4ab2988320005cbcb3db920e6b41f10b3c60cf.tar.gz |
refactor: `src` is now `src/libgit2`
Diffstat (limited to 'src/libgit2/indexer.c')
-rw-r--r-- | src/libgit2/indexer.c | 1413 |
1 files changed, 1413 insertions, 0 deletions
diff --git a/src/libgit2/indexer.c b/src/libgit2/indexer.c new file mode 100644 index 000000000..f9a32e7ac --- /dev/null +++ b/src/libgit2/indexer.c @@ -0,0 +1,1413 @@ +/* + * Copyright (C) the libgit2 contributors. All rights reserved. + * + * This file is part of libgit2, distributed under the GNU GPL v2 with + * a Linking Exception. For full terms see the included COPYING file. + */ + +#include "indexer.h" + +#include "git2/indexer.h" +#include "git2/object.h" + +#include "commit.h" +#include "tree.h" +#include "tag.h" +#include "pack.h" +#include "mwindow.h" +#include "posix.h" +#include "pack.h" +#include "filebuf.h" +#include "oid.h" +#include "oidarray.h" +#include "oidmap.h" +#include "zstream.h" +#include "object.h" + +size_t git_indexer__max_objects = UINT32_MAX; + +#define UINT31_MAX (0x7FFFFFFF) + +struct entry { + git_oid oid; + uint32_t crc; + uint32_t offset; + uint64_t offset_long; +}; + +struct git_indexer { + unsigned int parsed_header :1, + pack_committed :1, + have_stream :1, + have_delta :1, + do_fsync :1, + do_verify :1; + struct git_pack_header hdr; + struct git_pack_file *pack; + unsigned int mode; + off64_t off; + off64_t entry_start; + git_object_t entry_type; + git_str entry_data; + git_packfile_stream stream; + size_t nr_objects; + git_vector objects; + git_vector deltas; + unsigned int fanout[256]; + git_hash_ctx hash_ctx; + unsigned char checksum[GIT_HASH_SHA1_SIZE]; + char name[(GIT_HASH_SHA1_SIZE * 2) + 1]; + git_indexer_progress_cb progress_cb; + void *progress_payload; + char objbuf[8*1024]; + + /* OIDs referenced from pack objects. Used for verification. */ + git_oidmap *expected_oids; + + /* Needed to look up objects which we want to inject to fix a thin pack */ + git_odb *odb; + + /* Fields for calculating the packfile trailer (hash of everything before it) */ + char inbuf[GIT_OID_RAWSZ]; + size_t inbuf_len; + git_hash_ctx trailer; +}; + +struct delta_info { + off64_t delta_off; +}; + +#ifndef GIT_DEPRECATE_HARD +const git_oid *git_indexer_hash(const git_indexer *idx) +{ + return (git_oid *)idx->checksum; +} +#endif + +const char *git_indexer_name(const git_indexer *idx) +{ + return idx->name; +} + +static int parse_header(struct git_pack_header *hdr, struct git_pack_file *pack) +{ + int error; + git_map map; + + if ((error = p_mmap(&map, sizeof(*hdr), GIT_PROT_READ, GIT_MAP_SHARED, pack->mwf.fd, 0)) < 0) + return error; + + memcpy(hdr, map.data, sizeof(*hdr)); + p_munmap(&map); + + /* Verify we recognize this pack file format. */ + if (hdr->hdr_signature != ntohl(PACK_SIGNATURE)) { + git_error_set(GIT_ERROR_INDEXER, "wrong pack signature"); + return -1; + } + + if (!pack_version_ok(hdr->hdr_version)) { + git_error_set(GIT_ERROR_INDEXER, "wrong pack version"); + return -1; + } + + return 0; +} + +static int objects_cmp(const void *a, const void *b) +{ + const struct entry *entrya = a; + const struct entry *entryb = b; + + return git_oid__cmp(&entrya->oid, &entryb->oid); +} + +int git_indexer_options_init(git_indexer_options *opts, unsigned int version) +{ + GIT_INIT_STRUCTURE_FROM_TEMPLATE( + opts, version, git_indexer_options, GIT_INDEXER_OPTIONS_INIT); + return 0; +} + +#ifndef GIT_DEPRECATE_HARD +int git_indexer_init_options(git_indexer_options *opts, unsigned int version) +{ + return git_indexer_options_init(opts, version); +} +#endif + +int git_indexer_new( + git_indexer **out, + const char *prefix, + unsigned int mode, + git_odb *odb, + git_indexer_options *in_opts) +{ + git_indexer_options opts = GIT_INDEXER_OPTIONS_INIT; + git_indexer *idx; + git_str path = GIT_STR_INIT, tmp_path = GIT_STR_INIT; + static const char suff[] = "/pack"; + int error, fd = -1; + + if (in_opts) + memcpy(&opts, in_opts, sizeof(opts)); + + idx = git__calloc(1, sizeof(git_indexer)); + GIT_ERROR_CHECK_ALLOC(idx); + idx->odb = odb; + idx->progress_cb = opts.progress_cb; + idx->progress_payload = opts.progress_cb_payload; + idx->mode = mode ? mode : GIT_PACK_FILE_MODE; + git_str_init(&idx->entry_data, 0); + + if ((error = git_hash_ctx_init(&idx->hash_ctx, GIT_HASH_ALGORITHM_SHA1)) < 0 || + (error = git_hash_ctx_init(&idx->trailer, GIT_HASH_ALGORITHM_SHA1)) < 0 || + (error = git_oidmap_new(&idx->expected_oids)) < 0) + goto cleanup; + + idx->do_verify = opts.verify; + + if (git_repository__fsync_gitdir) + idx->do_fsync = 1; + + error = git_str_joinpath(&path, prefix, suff); + if (error < 0) + goto cleanup; + + fd = git_futils_mktmp(&tmp_path, git_str_cstr(&path), idx->mode); + git_str_dispose(&path); + if (fd < 0) + goto cleanup; + + error = git_packfile_alloc(&idx->pack, git_str_cstr(&tmp_path)); + git_str_dispose(&tmp_path); + + if (error < 0) + goto cleanup; + + idx->pack->mwf.fd = fd; + if ((error = git_mwindow_file_register(&idx->pack->mwf)) < 0) + goto cleanup; + + *out = idx; + return 0; + +cleanup: + if (fd != -1) + p_close(fd); + + if (git_str_len(&tmp_path) > 0) + p_unlink(git_str_cstr(&tmp_path)); + + if (idx->pack != NULL) + p_unlink(idx->pack->pack_name); + + git_str_dispose(&path); + git_str_dispose(&tmp_path); + git__free(idx); + return -1; +} + +void git_indexer__set_fsync(git_indexer *idx, int do_fsync) +{ + idx->do_fsync = !!do_fsync; +} + +/* Try to store the delta so we can try to resolve it later */ +static int store_delta(git_indexer *idx) +{ + struct delta_info *delta; + + delta = git__calloc(1, sizeof(struct delta_info)); + GIT_ERROR_CHECK_ALLOC(delta); + delta->delta_off = idx->entry_start; + + if (git_vector_insert(&idx->deltas, delta) < 0) + return -1; + + return 0; +} + +static int hash_header(git_hash_ctx *ctx, off64_t len, git_object_t type) +{ + char buffer[64]; + size_t hdrlen; + int error; + + if ((error = git_odb__format_object_header(&hdrlen, + buffer, sizeof(buffer), (size_t)len, type)) < 0) + return error; + + return git_hash_update(ctx, buffer, hdrlen); +} + +static int hash_object_stream(git_indexer*idx, git_packfile_stream *stream) +{ + ssize_t read; + + GIT_ASSERT_ARG(idx); + GIT_ASSERT_ARG(stream); + + do { + if ((read = git_packfile_stream_read(stream, idx->objbuf, sizeof(idx->objbuf))) < 0) + break; + + if (idx->do_verify) + git_str_put(&idx->entry_data, idx->objbuf, read); + + git_hash_update(&idx->hash_ctx, idx->objbuf, read); + } while (read > 0); + + if (read < 0) + return (int)read; + + return 0; +} + +/* In order to create the packfile stream, we need to skip over the delta base description */ +static int advance_delta_offset(git_indexer *idx, git_object_t type) +{ + git_mwindow *w = NULL; + + GIT_ASSERT_ARG(type == GIT_OBJECT_REF_DELTA || type == GIT_OBJECT_OFS_DELTA); + + if (type == GIT_OBJECT_REF_DELTA) { + idx->off += GIT_OID_RAWSZ; + } else { + off64_t base_off; + int error = get_delta_base(&base_off, idx->pack, &w, &idx->off, type, idx->entry_start); + git_mwindow_close(&w); + if (error < 0) + return error; + } + + return 0; +} + +/* Read from the stream and discard any output */ +static int read_object_stream(git_indexer *idx, git_packfile_stream *stream) +{ + ssize_t read; + + GIT_ASSERT_ARG(stream); + + do { + read = git_packfile_stream_read(stream, idx->objbuf, sizeof(idx->objbuf)); + } while (read > 0); + + if (read < 0) + return (int)read; + + return 0; +} + +static int crc_object(uint32_t *crc_out, git_mwindow_file *mwf, off64_t start, off64_t size) +{ + void *ptr; + uint32_t crc; + unsigned int left, len; + git_mwindow *w = NULL; + + crc = crc32(0L, Z_NULL, 0); + while (size) { + ptr = git_mwindow_open(mwf, &w, start, (size_t)size, &left); + if (ptr == NULL) + return -1; + + len = min(left, (unsigned int)size); + crc = crc32(crc, ptr, len); + size -= len; + start += len; + git_mwindow_close(&w); + } + + *crc_out = htonl(crc); + return 0; +} + +static int add_expected_oid(git_indexer *idx, const git_oid *oid) +{ + /* + * If we know about that object because it is stored in our ODB or + * because we have already processed it as part of our pack file, we do + * not have to expect it. + */ + if ((!idx->odb || !git_odb_exists(idx->odb, oid)) && + !git_oidmap_exists(idx->pack->idx_cache, oid) && + !git_oidmap_exists(idx->expected_oids, oid)) { + git_oid *dup = git__malloc(sizeof(*oid)); + GIT_ERROR_CHECK_ALLOC(dup); + git_oid_cpy(dup, oid); + return git_oidmap_set(idx->expected_oids, dup, dup); + } + + return 0; +} + +static int check_object_connectivity(git_indexer *idx, const git_rawobj *obj) +{ + git_object *object; + git_oid *expected; + int error = 0; + + if (obj->type != GIT_OBJECT_BLOB && + obj->type != GIT_OBJECT_TREE && + obj->type != GIT_OBJECT_COMMIT && + obj->type != GIT_OBJECT_TAG) + return 0; + + if (git_object__from_raw(&object, obj->data, obj->len, obj->type) < 0) { + /* + * parse_raw returns EINVALID on invalid data; downgrade + * that to a normal -1 error code. + */ + error = -1; + goto out; + } + + if ((expected = git_oidmap_get(idx->expected_oids, &object->cached.oid)) != NULL) { + git_oidmap_delete(idx->expected_oids, &object->cached.oid); + git__free(expected); + } + + /* + * Check whether this is a known object. If so, we can just continue as + * we assume that the ODB has a complete graph. + */ + if (idx->odb && git_odb_exists(idx->odb, &object->cached.oid)) + return 0; + + switch (obj->type) { + case GIT_OBJECT_TREE: + { + git_tree *tree = (git_tree *) object; + git_tree_entry *entry; + size_t i; + + git_array_foreach(tree->entries, i, entry) + if (add_expected_oid(idx, entry->oid) < 0) + goto out; + + break; + } + case GIT_OBJECT_COMMIT: + { + git_commit *commit = (git_commit *) object; + git_oid *parent_oid; + size_t i; + + git_array_foreach(commit->parent_ids, i, parent_oid) + if (add_expected_oid(idx, parent_oid) < 0) + goto out; + + if (add_expected_oid(idx, &commit->tree_id) < 0) + goto out; + + break; + } + case GIT_OBJECT_TAG: + { + git_tag *tag = (git_tag *) object; + + if (add_expected_oid(idx, &tag->target) < 0) + goto out; + + break; + } + case GIT_OBJECT_BLOB: + default: + break; + } + +out: + git_object_free(object); + + return error; +} + +static int store_object(git_indexer *idx) +{ + int i, error; + git_oid oid; + struct entry *entry; + off64_t entry_size; + struct git_pack_entry *pentry; + off64_t entry_start = idx->entry_start; + + entry = git__calloc(1, sizeof(*entry)); + GIT_ERROR_CHECK_ALLOC(entry); + + pentry = git__calloc(1, sizeof(struct git_pack_entry)); + GIT_ERROR_CHECK_ALLOC(pentry); + + if (git_hash_final(oid.id, &idx->hash_ctx)) { + git__free(pentry); + goto on_error; + } + entry_size = idx->off - entry_start; + if (entry_start > UINT31_MAX) { + entry->offset = UINT32_MAX; + entry->offset_long = entry_start; + } else { + entry->offset = (uint32_t)entry_start; + } + + if (idx->do_verify) { + git_rawobj rawobj = { + idx->entry_data.ptr, + idx->entry_data.size, + idx->entry_type + }; + + if ((error = check_object_connectivity(idx, &rawobj)) < 0) + goto on_error; + } + + git_oid_cpy(&pentry->sha1, &oid); + pentry->offset = entry_start; + + if (git_oidmap_exists(idx->pack->idx_cache, &pentry->sha1)) { + git_error_set(GIT_ERROR_INDEXER, "duplicate object %s found in pack", git_oid_tostr_s(&pentry->sha1)); + git__free(pentry); + goto on_error; + } + + if ((error = git_oidmap_set(idx->pack->idx_cache, &pentry->sha1, pentry)) < 0) { + git__free(pentry); + git_error_set_oom(); + goto on_error; + } + + git_oid_cpy(&entry->oid, &oid); + + if (crc_object(&entry->crc, &idx->pack->mwf, entry_start, entry_size) < 0) + goto on_error; + + /* Add the object to the list */ + if (git_vector_insert(&idx->objects, entry) < 0) + goto on_error; + + for (i = oid.id[0]; i < 256; ++i) { + idx->fanout[i]++; + } + + return 0; + +on_error: + git__free(entry); + + return -1; +} + +GIT_INLINE(bool) has_entry(git_indexer *idx, git_oid *id) +{ + return git_oidmap_exists(idx->pack->idx_cache, id); +} + +static int save_entry(git_indexer *idx, struct entry *entry, struct git_pack_entry *pentry, off64_t entry_start) +{ + int i; + + if (entry_start > UINT31_MAX) { + entry->offset = UINT32_MAX; + entry->offset_long = entry_start; + } else { + entry->offset = (uint32_t)entry_start; + } + + pentry->offset = entry_start; + + if (git_oidmap_exists(idx->pack->idx_cache, &pentry->sha1) || + git_oidmap_set(idx->pack->idx_cache, &pentry->sha1, pentry) < 0) { + git_error_set(GIT_ERROR_INDEXER, "cannot insert object into pack"); + return -1; + } + + /* Add the object to the list */ + if (git_vector_insert(&idx->objects, entry) < 0) + return -1; + + for (i = entry->oid.id[0]; i < 256; ++i) { + idx->fanout[i]++; + } + + return 0; +} + +static int hash_and_save(git_indexer *idx, git_rawobj *obj, off64_t entry_start) +{ + git_oid oid; + size_t entry_size; + struct entry *entry; + struct git_pack_entry *pentry = NULL; + + entry = git__calloc(1, sizeof(*entry)); + GIT_ERROR_CHECK_ALLOC(entry); + + if (git_odb__hashobj(&oid, obj) < 0) { + git_error_set(GIT_ERROR_INDEXER, "failed to hash object"); + goto on_error; + } + + pentry = git__calloc(1, sizeof(struct git_pack_entry)); + GIT_ERROR_CHECK_ALLOC(pentry); + + git_oid_cpy(&pentry->sha1, &oid); + git_oid_cpy(&entry->oid, &oid); + entry->crc = crc32(0L, Z_NULL, 0); + + entry_size = (size_t)(idx->off - entry_start); + if (crc_object(&entry->crc, &idx->pack->mwf, entry_start, entry_size) < 0) + goto on_error; + + return save_entry(idx, entry, pentry, entry_start); + +on_error: + git__free(pentry); + git__free(entry); + git__free(obj->data); + return -1; +} + +static int do_progress_callback(git_indexer *idx, git_indexer_progress *stats) +{ + if (idx->progress_cb) + return git_error_set_after_callback_function( + idx->progress_cb(stats, idx->progress_payload), + "indexer progress"); + return 0; +} + +/* Hash everything but the last 20B of input */ +static void hash_partially(git_indexer *idx, const uint8_t *data, size_t size) +{ + size_t to_expell, to_keep; + + if (size == 0) + return; + + /* Easy case, dump the buffer and the data minus the last 20 bytes */ + if (size >= GIT_OID_RAWSZ) { + git_hash_update(&idx->trailer, idx->inbuf, idx->inbuf_len); + git_hash_update(&idx->trailer, data, size - GIT_OID_RAWSZ); + + data += size - GIT_OID_RAWSZ; + memcpy(idx->inbuf, data, GIT_OID_RAWSZ); + idx->inbuf_len = GIT_OID_RAWSZ; + return; + } + + /* We can just append */ + if (idx->inbuf_len + size <= GIT_OID_RAWSZ) { + memcpy(idx->inbuf + idx->inbuf_len, data, size); + idx->inbuf_len += size; + return; + } + + /* We need to partially drain the buffer and then append */ + to_keep = GIT_OID_RAWSZ - size; + to_expell = idx->inbuf_len - to_keep; + + git_hash_update(&idx->trailer, idx->inbuf, to_expell); + + memmove(idx->inbuf, idx->inbuf + to_expell, to_keep); + memcpy(idx->inbuf + to_keep, data, size); + idx->inbuf_len += size - to_expell; +} + +#if defined(NO_MMAP) || !defined(GIT_WIN32) + +static int write_at(git_indexer *idx, const void *data, off64_t offset, size_t size) +{ + size_t remaining_size = size; + const char *ptr = (const char *)data; + + /* Handle data size larger that ssize_t */ + while (remaining_size > 0) { + ssize_t nb; + HANDLE_EINTR(nb, p_pwrite(idx->pack->mwf.fd, (void *)ptr, + remaining_size, offset)); + if (nb <= 0) + return -1; + + ptr += nb; + offset += nb; + remaining_size -= nb; + } + + return 0; +} + +static int append_to_pack(git_indexer *idx, const void *data, size_t size) +{ + if (write_at(idx, data, idx->pack->mwf.size, size) < 0) { + git_error_set(GIT_ERROR_OS, "cannot extend packfile '%s'", idx->pack->pack_name); + return -1; + } + + return 0; +} + +#else + +/* + * Windows may keep different views to a networked file for the mmap- and + * open-accessed versions of a file, so any writes done through + * `write(2)`/`pwrite(2)` may not be reflected on the data that `mmap(2)` is + * able to read. + */ + +static int write_at(git_indexer *idx, const void *data, off64_t offset, size_t size) +{ + git_file fd = idx->pack->mwf.fd; + size_t mmap_alignment; + size_t page_offset; + off64_t page_start; + unsigned char *map_data; + git_map map; + int error; + + GIT_ASSERT_ARG(data); + GIT_ASSERT_ARG(size); + + if ((error = git__mmap_alignment(&mmap_alignment)) < 0) + return error; + + /* the offset needs to be at the mmap boundary for the platform */ + page_offset = offset % mmap_alignment; + page_start = offset - page_offset; + + if ((error = p_mmap(&map, page_offset + size, GIT_PROT_WRITE, GIT_MAP_SHARED, fd, page_start)) < 0) + return error; + + map_data = (unsigned char *)map.data; + memcpy(map_data + page_offset, data, size); + p_munmap(&map); + + return 0; +} + +static int append_to_pack(git_indexer *idx, const void *data, size_t size) +{ + off64_t new_size; + size_t mmap_alignment; + size_t page_offset; + off64_t page_start; + off64_t current_size = idx->pack->mwf.size; + int error; + + if (!size) + return 0; + + if ((error = git__mmap_alignment(&mmap_alignment)) < 0) + return error; + + /* Write a single byte to force the file system to allocate space now or + * report an error, since we can't report errors when writing using mmap. + * Round the size up to the nearest page so that we only need to perform file + * I/O when we add a page, instead of whenever we write even a single byte. */ + new_size = current_size + size; + page_offset = new_size % mmap_alignment; + page_start = new_size - page_offset; + + if (p_pwrite(idx->pack->mwf.fd, data, 1, page_start + mmap_alignment - 1) < 0) { + git_error_set(GIT_ERROR_OS, "cannot extend packfile '%s'", idx->pack->pack_name); + return -1; + } + + return write_at(idx, data, idx->pack->mwf.size, size); +} + +#endif + +static int read_stream_object(git_indexer *idx, git_indexer_progress *stats) +{ + git_packfile_stream *stream = &idx->stream; + off64_t entry_start = idx->off; + size_t entry_size; + git_object_t type; + git_mwindow *w = NULL; + int error; + + if (idx->pack->mwf.size <= idx->off + 20) + return GIT_EBUFS; + + if (!idx->have_stream) { + error = git_packfile_unpack_header(&entry_size, &type, idx->pack, &w, &idx->off); + if (error == GIT_EBUFS) { + idx->off = entry_start; + return error; + } + if (error < 0) + return error; + + git_mwindow_close(&w); + idx->entry_start = entry_start; + git_hash_init(&idx->hash_ctx); + git_str_clear(&idx->entry_data); + + if (type == GIT_OBJECT_REF_DELTA || type == GIT_OBJECT_OFS_DELTA) { + error = advance_delta_offset(idx, type); + if (error == GIT_EBUFS) { + idx->off = entry_start; + return error; + } + if (error < 0) + return error; + + idx->have_delta = 1; + } else { + idx->have_delta = 0; + + error = hash_header(&idx->hash_ctx, entry_size, type); + if (error < 0) + return error; + } + + idx->have_stream = 1; + idx->entry_type = type; + + error = git_packfile_stream_open(stream, idx->pack, idx->off); + if (error < 0) + return error; + } + + if (idx->have_delta) { + error = read_object_stream(idx, stream); + } else { + error = hash_object_stream(idx, stream); + } + + idx->off = stream->curpos; + if (error == GIT_EBUFS) + return error; + + /* We want to free the stream reasorces no matter what here */ + idx->have_stream = 0; + git_packfile_stream_dispose(stream); + + if (error < 0) + return error; + + if (idx->have_delta) { + error = store_delta(idx); + } else { + error = store_object(idx); + } + + if (error < 0) + return error; + + if (!idx->have_delta) { + stats->indexed_objects++; + } + stats->received_objects++; + + if ((error = do_progress_callback(idx, stats)) != 0) + return error; + + return 0; +} + +int git_indexer_append(git_indexer *idx, const void *data, size_t size, git_indexer_progress *stats) +{ + int error = -1; + struct git_pack_header *hdr = &idx->hdr; + git_mwindow_file *mwf = &idx->pack->mwf; + + GIT_ASSERT_ARG(idx); + GIT_ASSERT_ARG(data); + GIT_ASSERT_ARG(stats); + + if ((error = append_to_pack(idx, data, size)) < 0) + return error; + + hash_partially(idx, data, (int)size); + + /* Make sure we set the new size of the pack */ + idx->pack->mwf.size += size; + + if (!idx->parsed_header) { + unsigned int total_objects; + + if ((unsigned)idx->pack->mwf.size < sizeof(struct git_pack_header)) + return 0; + + if ((error = parse_header(&idx->hdr, idx->pack)) < 0) + return error; + + idx->parsed_header = 1; + idx->nr_objects = ntohl(hdr->hdr_entries); + idx->off = sizeof(struct git_pack_header); + + if (idx->nr_objects <= git_indexer__max_objects) { + total_objects = (unsigned int)idx->nr_objects; + } else { + git_error_set(GIT_ERROR_INDEXER, "too many objects"); + return -1; + } + + if (git_oidmap_new(&idx->pack->idx_cache) < 0) + return -1; + + idx->pack->has_cache = 1; + if (git_vector_init(&idx->objects, total_objects, objects_cmp) < 0) + return -1; + + if (git_vector_init(&idx->deltas, total_objects / 2, NULL) < 0) + return -1; + + stats->received_objects = 0; + stats->local_objects = 0; + stats->total_deltas = 0; + stats->indexed_deltas = 0; + stats->indexed_objects = 0; + stats->total_objects = total_objects; + + if ((error = do_progress_callback(idx, stats)) != 0) + return error; + } + + /* Now that we have data in the pack, let's try to parse it */ + + /* As the file grows any windows we try to use will be out of date */ + if ((error = git_mwindow_free_all(mwf)) < 0) + goto on_error; + + while (stats->indexed_objects < idx->nr_objects) { + if ((error = read_stream_object(idx, stats)) != 0) { + if (error == GIT_EBUFS) + break; + else + goto on_error; + } + } + + return 0; + +on_error: + git_mwindow_free_all(mwf); + return error; +} + +static int index_path(git_str *path, git_indexer *idx, const char *suffix) +{ + const char prefix[] = "pack-"; + size_t slash = (size_t)path->size; + + /* search backwards for '/' */ + while (slash > 0 && path->ptr[slash - 1] != '/') + slash--; + + if (git_str_grow(path, slash + 1 + strlen(prefix) + + GIT_OID_HEXSZ + strlen(suffix) + 1) < 0) + return -1; + + git_str_truncate(path, slash); + git_str_puts(path, prefix); + git_str_puts(path, idx->name); + git_str_puts(path, suffix); + + return git_str_oom(path) ? -1 : 0; +} + +/** + * Rewind the packfile by the trailer, as we might need to fix the + * packfile by injecting objects at the tail and must overwrite it. + */ +static int seek_back_trailer(git_indexer *idx) +{ + idx->pack->mwf.size -= GIT_OID_RAWSZ; + return git_mwindow_free_all(&idx->pack->mwf); +} + +static int inject_object(git_indexer *idx, git_oid *id) +{ + git_odb_object *obj = NULL; + struct entry *entry = NULL; + struct git_pack_entry *pentry = NULL; + unsigned char empty_checksum[GIT_HASH_SHA1_SIZE] = {0}; + unsigned char hdr[64]; + git_str buf = GIT_STR_INIT; + off64_t entry_start; + const void *data; + size_t len, hdr_len; + size_t checksum_size = GIT_HASH_SHA1_SIZE; + int error; + + if ((error = seek_back_trailer(idx)) < 0) + goto cleanup; + + entry_start = idx->pack->mwf.size; + + if ((error = git_odb_read(&obj, idx->odb, id)) < 0) { + git_error_set(GIT_ERROR_INDEXER, "missing delta bases"); + goto cleanup; + } + + data = git_odb_object_data(obj); + len = git_odb_object_size(obj); + + entry = git__calloc(1, sizeof(*entry)); + GIT_ERROR_CHECK_ALLOC(entry); + + entry->crc = crc32(0L, Z_NULL, 0); + + /* Write out the object header */ + if ((error = git_packfile__object_header(&hdr_len, hdr, len, git_odb_object_type(obj))) < 0 || + (error = append_to_pack(idx, hdr, hdr_len)) < 0) + goto cleanup; + + idx->pack->mwf.size += hdr_len; + entry->crc = crc32(entry->crc, hdr, (uInt)hdr_len); + + if ((error = git_zstream_deflatebuf(&buf, data, len)) < 0) + goto cleanup; + + /* And then the compressed object */ + if ((error = append_to_pack(idx, buf.ptr, buf.size)) < 0) + goto cleanup; + + idx->pack->mwf.size += buf.size; + entry->crc = htonl(crc32(entry->crc, (unsigned char *)buf.ptr, (uInt)buf.size)); + git_str_dispose(&buf); + + /* Write a fake trailer so the pack functions play ball */ + + if ((error = append_to_pack(idx, empty_checksum, checksum_size)) < 0) + goto cleanup; + + idx->pack->mwf.size += GIT_OID_RAWSZ; + + pentry = git__calloc(1, sizeof(struct git_pack_entry)); + GIT_ERROR_CHECK_ALLOC(pentry); + + git_oid_cpy(&pentry->sha1, id); + git_oid_cpy(&entry->oid, id); + idx->off = entry_start + hdr_len + len; + + error = save_entry(idx, entry, pentry, entry_start); + +cleanup: + if (error) { + git__free(entry); + git__free(pentry); + } + + git_odb_object_free(obj); + return error; +} + +static int fix_thin_pack(git_indexer *idx, git_indexer_progress *stats) +{ + int error, found_ref_delta = 0; + unsigned int i; + struct delta_info *delta; + size_t size; + git_object_t type; + git_mwindow *w = NULL; + off64_t curpos = 0; + unsigned char *base_info; + unsigned int left = 0; + git_oid base; + + GIT_ASSERT(git_vector_length(&idx->deltas) > 0); + + if (idx->odb == NULL) { + git_error_set(GIT_ERROR_INDEXER, "cannot fix a thin pack without an ODB"); + return -1; + } + + /* Loop until we find the first REF delta */ + git_vector_foreach(&idx->deltas, i, delta) { + if (!delta) + continue; + + curpos = delta->delta_off; + error = git_packfile_unpack_header(&size, &type, idx->pack, &w, &curpos); + if (error < 0) + return error; + + if (type == GIT_OBJECT_REF_DELTA) { + found_ref_delta = 1; + break; + } + } + + if (!found_ref_delta) { + git_error_set(GIT_ERROR_INDEXER, "no REF_DELTA found, cannot inject object"); + return -1; + } + + /* curpos now points to the base information, which is an OID */ + base_info = git_mwindow_open(&idx->pack->mwf, &w, curpos, GIT_OID_RAWSZ, &left); + if (base_info == NULL) { + git_error_set(GIT_ERROR_INDEXER, "failed to map delta information"); + return -1; + } + + git_oid_fromraw(&base, base_info); + git_mwindow_close(&w); + + if (has_entry(idx, &base)) + return 0; + + if (inject_object(idx, &base) < 0) + return -1; + + stats->local_objects++; + + return 0; +} + +static int resolve_deltas(git_indexer *idx, git_indexer_progress *stats) +{ + unsigned int i; + int error; + struct delta_info *delta; + int progressed = 0, non_null = 0, progress_cb_result; + + while (idx->deltas.length > 0) { + progressed = 0; + non_null = 0; + git_vector_foreach(&idx->deltas, i, delta) { + git_rawobj obj = {0}; + + if (!delta) + continue; + + non_null = 1; + idx->off = delta->delta_off; + if ((error = git_packfile_unpack(&obj, idx->pack, &idx->off)) < 0) { + if (error == GIT_PASSTHROUGH) { + /* We have not seen the base object, we'll try again later. */ + continue; + } + return -1; + } + + if (idx->do_verify && check_object_connectivity(idx, &obj) < 0) + /* TODO: error? continue? */ + continue; + + if (hash_and_save(idx, &obj, delta->delta_off) < 0) + continue; + + git__free(obj.data); + stats->indexed_objects++; + stats->indexed_deltas++; + progressed = 1; + if ((progress_cb_result = do_progress_callback(idx, stats)) < 0) + return progress_cb_result; + + /* remove from the list */ + git_vector_set(NULL, &idx->deltas, i, NULL); + git__free(delta); + } + + /* if none were actually set, we're done */ + if (!non_null) + break; + + if (!progressed && (fix_thin_pack(idx, stats) < 0)) { + return -1; + } + } + + return 0; +} + +static int update_header_and_rehash(git_indexer *idx, git_indexer_progress *stats) +{ + void *ptr; + size_t chunk = 1024*1024; + off64_t hashed = 0; + git_mwindow *w = NULL; + git_mwindow_file *mwf; + unsigned int left; + + mwf = &idx->pack->mwf; + + git_hash_init(&idx->trailer); + + + /* Update the header to include the number of local objects we injected */ + idx->hdr.hdr_entries = htonl(stats->total_objects + stats->local_objects); + if (write_at(idx, &idx->hdr, 0, sizeof(struct git_pack_header)) < 0) + return -1; + + /* + * We now use the same technique as before to determine the + * hash. We keep reading up to the end and let + * hash_partially() keep the existing trailer out of the + * calculation. + */ + if (git_mwindow_free_all(mwf) < 0) + return -1; + + idx->inbuf_len = 0; + while (hashed < mwf->size) { + ptr = git_mwindow_open(mwf, &w, hashed, chunk, &left); + if (ptr == NULL) + return -1; + + hash_partially(idx, ptr, left); + hashed += left; + + git_mwindow_close(&w); + } + + return 0; +} + +int git_indexer_commit(git_indexer *idx, git_indexer_progress *stats) +{ + git_mwindow *w = NULL; + unsigned int i, long_offsets = 0, left; + int error; + struct git_pack_idx_header hdr; + git_str filename = GIT_STR_INIT; + struct entry *entry; + unsigned char checksum[GIT_HASH_SHA1_SIZE]; + git_filebuf index_file = {0}; + void *packfile_trailer; + size_t checksum_size = GIT_HASH_SHA1_SIZE; + bool mismatch; + + if (!idx->parsed_header) { + git_error_set(GIT_ERROR_INDEXER, "incomplete pack header"); + return -1; + } + + /* Test for this before resolve_deltas(), as it plays with idx->off */ + if (idx->off + (ssize_t)checksum_size < idx->pack->mwf.size) { + git_error_set(GIT_ERROR_INDEXER, "unexpected data at the end of the pack"); + return -1; + } + if (idx->off + (ssize_t)checksum_size > idx->pack->mwf.size) { + git_error_set(GIT_ERROR_INDEXER, "missing trailer at the end of the pack"); + return -1; + } + + packfile_trailer = git_mwindow_open(&idx->pack->mwf, &w, idx->pack->mwf.size - checksum_size, checksum_size, &left); + if (packfile_trailer == NULL) { + git_mwindow_close(&w); + goto on_error; + } + + /* Compare the packfile trailer as it was sent to us and what we calculated */ + git_hash_final(checksum, &idx->trailer); + mismatch = !!memcmp(checksum, packfile_trailer, checksum_size); + git_mwindow_close(&w); + + if (mismatch) { + git_error_set(GIT_ERROR_INDEXER, "packfile trailer mismatch"); + return -1; + } + + /* Freeze the number of deltas */ + stats->total_deltas = stats->total_objects - stats->indexed_objects; + + if ((error = resolve_deltas(idx, stats)) < 0) + return error; + + if (stats->indexed_objects != stats->total_objects) { + git_error_set(GIT_ERROR_INDEXER, "early EOF"); + return -1; + } + + if (stats->local_objects > 0) { + if (update_header_and_rehash(idx, stats) < 0) + return -1; + + git_hash_final(checksum, &idx->trailer); + write_at(idx, checksum, idx->pack->mwf.size - checksum_size, checksum_size); + } + + /* + * Is the resulting graph fully connected or are we still + * missing some objects? In the second case, we can + * bail out due to an incomplete and thus corrupt + * packfile. + */ + if (git_oidmap_size(idx->expected_oids) > 0) { + git_error_set(GIT_ERROR_INDEXER, "packfile is missing %"PRIuZ" objects", + git_oidmap_size(idx->expected_oids)); + return -1; + } + + git_vector_sort(&idx->objects); + + /* Use the trailer hash as the pack file name to ensure + * files with different contents have different names */ + memcpy(idx->checksum, checksum, checksum_size); + if (git_hash_fmt(idx->name, checksum, checksum_size) < 0) + return -1; + + git_str_sets(&filename, idx->pack->pack_name); + git_str_shorten(&filename, strlen("pack")); + git_str_puts(&filename, "idx"); + if (git_str_oom(&filename)) + return -1; + + if (git_filebuf_open(&index_file, filename.ptr, + GIT_FILEBUF_HASH_CONTENTS | + (idx->do_fsync ? GIT_FILEBUF_FSYNC : 0), + idx->mode) < 0) + goto on_error; + + /* Write out the header */ + hdr.idx_signature = htonl(PACK_IDX_SIGNATURE); + hdr.idx_version = htonl(2); + git_filebuf_write(&index_file, &hdr, sizeof(hdr)); + + /* Write out the fanout table */ + for (i = 0; i < 256; ++i) { + uint32_t n = htonl(idx->fanout[i]); + git_filebuf_write(&index_file, &n, sizeof(n)); + } + + /* Write out the object names (SHA-1 hashes) */ + git_vector_foreach(&idx->objects, i, entry) { + git_filebuf_write(&index_file, &entry->oid, sizeof(git_oid)); + } + + /* Write out the CRC32 values */ + git_vector_foreach(&idx->objects, i, entry) { + git_filebuf_write(&index_file, &entry->crc, sizeof(uint32_t)); + } + + /* Write out the offsets */ + git_vector_foreach(&idx->objects, i, entry) { + uint32_t n; + + if (entry->offset == UINT32_MAX) + n = htonl(0x80000000 | long_offsets++); + else + n = htonl(entry->offset); + + git_filebuf_write(&index_file, &n, sizeof(uint32_t)); + } + + /* Write out the long offsets */ + git_vector_foreach(&idx->objects, i, entry) { + uint32_t split[2]; + + if (entry->offset != UINT32_MAX) + continue; + + split[0] = htonl(entry->offset_long >> 32); + split[1] = htonl(entry->offset_long & 0xffffffff); + + git_filebuf_write(&index_file, &split, sizeof(uint32_t) * 2); + } + + /* Write out the packfile trailer to the index */ + if (git_filebuf_write(&index_file, checksum, checksum_size) < 0) + goto on_error; + + /* Write out the hash of the idx */ + if (git_filebuf_hash(checksum, &index_file) < 0) + goto on_error; + + git_filebuf_write(&index_file, checksum, checksum_size); + + /* Figure out what the final name should be */ + if (index_path(&filename, idx, ".idx") < 0) + goto on_error; + + /* Commit file */ + if (git_filebuf_commit_at(&index_file, filename.ptr) < 0) + goto on_error; + + if (git_mwindow_free_all(&idx->pack->mwf) < 0) + goto on_error; + +#if !defined(NO_MMAP) && defined(GIT_WIN32) + /* + * Some non-Windows remote filesystems fail when truncating files if the + * file permissions change after opening the file (done by p_mkstemp). + * + * Truncation is only needed when mmap is used to undo rounding up to next + * page_size in append_to_pack. + */ + if (p_ftruncate(idx->pack->mwf.fd, idx->pack->mwf.size) < 0) { + git_error_set(GIT_ERROR_OS, "failed to truncate pack file '%s'", idx->pack->pack_name); + return -1; + } +#endif + + if (idx->do_fsync && p_fsync(idx->pack->mwf.fd) < 0) { + git_error_set(GIT_ERROR_OS, "failed to fsync packfile"); + goto on_error; + } + + /* We need to close the descriptor here so Windows doesn't choke on commit_at */ + if (p_close(idx->pack->mwf.fd) < 0) { + git_error_set(GIT_ERROR_OS, "failed to close packfile"); + goto on_error; + } + + idx->pack->mwf.fd = -1; + + if (index_path(&filename, idx, ".pack") < 0) + goto on_error; + + /* And don't forget to rename the packfile to its new place. */ + if (p_rename(idx->pack->pack_name, git_str_cstr(&filename)) < 0) + goto on_error; + + /* And fsync the parent directory if we're asked to. */ + if (idx->do_fsync && + git_futils_fsync_parent(git_str_cstr(&filename)) < 0) + goto on_error; + + idx->pack_committed = 1; + + git_str_dispose(&filename); + return 0; + +on_error: + git_mwindow_free_all(&idx->pack->mwf); + git_filebuf_cleanup(&index_file); + git_str_dispose(&filename); + return -1; +} + +void git_indexer_free(git_indexer *idx) +{ + const git_oid *key; + git_oid *value; + size_t iter; + + if (idx == NULL) + return; + + if (idx->have_stream) + git_packfile_stream_dispose(&idx->stream); + + git_vector_free_deep(&idx->objects); + + if (idx->pack->idx_cache) { + struct git_pack_entry *pentry; + git_oidmap_foreach_value(idx->pack->idx_cache, pentry, { + git__free(pentry); + }); + + git_oidmap_free(idx->pack->idx_cache); + } + + git_vector_free_deep(&idx->deltas); + + git_packfile_free(idx->pack, !idx->pack_committed); + + iter = 0; + while (git_oidmap_iterate((void **) &value, idx->expected_oids, &iter, &key) == 0) + git__free(value); + + git_hash_ctx_cleanup(&idx->trailer); + git_hash_ctx_cleanup(&idx->hash_ctx); + git_str_dispose(&idx->entry_data); + git_oidmap_free(idx->expected_oids); + git__free(idx); +} |