diff options
-rw-r--r-- | src/diff.c | 33 | ||||
-rw-r--r-- | src/diff.h | 2 | ||||
-rw-r--r-- | src/diff_output.c | 214 | ||||
-rw-r--r-- | src/fileops.c | 62 | ||||
-rw-r--r-- | src/fileops.h | 1 | ||||
-rw-r--r-- | src/odb.c | 33 | ||||
-rw-r--r-- | src/odb.h | 17 | ||||
-rw-r--r-- | tests-clar/status/worktree.c | 4 |
8 files changed, 260 insertions, 106 deletions
diff --git a/src/diff.c b/src/diff.c index f8a01086c..499b95b44 100644 --- a/src/diff.c +++ b/src/diff.c @@ -11,6 +11,7 @@ #include "fileops.h" #include "config.h" #include "attr_file.h" +#include "filter.h" static char *diff_prefix_from_pathspec(const git_strarray *pathspec) { @@ -63,8 +64,8 @@ static bool diff_path_matches_pathspec(git_diff_list *diff, const char *path) git_vector_foreach(&diff->pathspec, i, match) { int result = strcmp(match->pattern, path) ? FNM_NOMATCH : 0; - - if (((diff->opts.flags & GIT_DIFF_DISABLE_PATHSPEC_MATCH) == 0) && + + if (((diff->opts.flags & GIT_DIFF_DISABLE_PATHSPEC_MATCH) == 0) && result == FNM_NOMATCH) result = p_fnmatch(match->pattern, path, 0); @@ -262,12 +263,14 @@ static int diff_delta__from_two( delta = diff_delta__alloc(diff, status, old_entry->path); GITERR_CHECK_ALLOC(delta); - delta->old_file.mode = old_mode; git_oid_cpy(&delta->old_file.oid, &old_entry->oid); + delta->old_file.size = old_entry->file_size; + delta->old_file.mode = old_mode; delta->old_file.flags |= GIT_DIFF_FILE_VALID_OID; - delta->new_file.mode = new_mode; git_oid_cpy(&delta->new_file.oid, new_oid ? new_oid : &new_entry->oid); + delta->new_file.size = new_entry->file_size; + delta->new_file.mode = new_mode; if (new_oid || !git_oid_iszero(&new_entry->oid)) delta->new_file.flags |= GIT_DIFF_FILE_VALID_OID; @@ -440,14 +443,22 @@ static int oid_for_workdir_item( giterr_set(GITERR_OS, "File size overflow for 32-bit systems"); result = -1; } else { - int fd = git_futils_open_ro(full_path.ptr); - if (fd < 0) - result = fd; - else { - result = git_odb__hashfd( - oid, fd, (size_t)item->file_size, GIT_OBJ_BLOB); - p_close(fd); + git_vector filters = GIT_VECTOR_INIT; + + result = git_filters_load( + &filters, repo, item->path, GIT_FILTER_TO_ODB); + if (result >= 0) { + int fd = git_futils_open_ro(full_path.ptr); + if (fd < 0) + result = fd; + else { + result = git_odb__hashfd_filtered( + oid, fd, (size_t)item->file_size, GIT_OBJ_BLOB, &filters); + p_close(fd); + } } + + git_filters_free(&filters); } git_buf_free(&full_path); diff --git a/src/diff.h b/src/diff.h index 2785fa425..def746323 100644 --- a/src/diff.h +++ b/src/diff.h @@ -25,6 +25,8 @@ enum { GIT_DIFFCAPS_USE_DEV = (1 << 4), /* use st_dev? */ }; +#define MAX_DIFF_FILESIZE 0x20000000 + struct git_diff_list { git_refcount rc; git_repository *repo; diff --git a/src/diff_output.c b/src/diff_output.c index 2c64b92ee..e2ca8cf3e 100644 --- a/src/diff_output.c +++ b/src/diff_output.c @@ -22,7 +22,18 @@ * git_diff_foreach() call it is an emphemeral structure that is filled * in to execute each diff. In the case of a git_diff_iterator, it holds * most of the information for the diff in progress. - */ + * + * As each delta is processed, it goes through 3 phases: prep, load, exec. + * + * - In the prep phase, we just set the delta and quickly check the file + * attributes to see if it should be treated as binary. + * - In the load phase, we actually load the file content into memory. + * At this point, if we had deferred calculating OIDs, we might have to + * correct the delta to be UNMODIFIED. + * - In the exec phase, we actually run the diff and execute the callbacks. + * For foreach, this is just a pass-through to the user's callbacks. For + * iterators, we record the hunks and data spans into memory. + */ typedef struct { git_repository *repo; git_diff_options *opts; @@ -263,18 +274,40 @@ static void setup_xdiff_options( static int get_blob_content( git_repository *repo, - const git_oid *oid, + git_diff_file *file, git_map *map, git_blob **blob) { - if (git_oid_iszero(oid)) + int error; + git_odb *odb; + size_t len; + git_otype type; + + if (git_oid_iszero(&file->oid)) return 0; - if (git_blob_lookup(blob, repo, oid) < 0) - return -1; + /* peek at object header to avoid loading if too large */ + if ((error = git_repository_odb__weakptr(&odb, repo)) < 0 || + (error = git_odb_read_header(&len, &type, odb, &file->oid)) < 0) + return error; + + assert(type == GIT_OBJ_BLOB); + + /* if blob is too large to diff, mark as binary */ + if (len > MAX_DIFF_FILESIZE) { + file->flags |= GIT_DIFF_FILE_BINARY; + return 0; + } + + if (!file->size) + file->size = len; + + if ((error = git_blob_lookup(blob, repo, &file->oid)) < 0) + return error; map->data = (void *)git_blob_rawcontent(*blob); map->len = git_blob_rawsize(*blob); + return 0; } @@ -307,13 +340,66 @@ static int get_workdir_content( if (read_len < 0) { giterr_set(GITERR_OS, "Failed to read symlink '%s'", file->path); error = -1; - } else - map->len = read_len; + goto cleanup; + } + + map->len = read_len; } else { - error = git_futils_mmap_ro_file(map, path.ptr); - file->flags |= GIT_DIFF_FILE_UNMAP_DATA; + git_file fd = git_futils_open_ro(path.ptr); + git_vector filters = GIT_VECTOR_INIT; + + if (fd < 0) { + error = fd; + goto cleanup; + } + + if (!file->size) + file->size = git_futils_filesize(fd); + + /* if file is too large to diff, mark as binary */ + if (file->size > MAX_DIFF_FILESIZE) { + file->flags |= GIT_DIFF_FILE_BINARY; + goto close_and_cleanup; + } + + error = git_filters_load(&filters, repo, file->path, GIT_FILTER_TO_ODB); + if (error < 0) + goto close_and_cleanup; + + if (error == 0) { /* note: git_filters_load returns filter count */ + error = git_futils_mmap_ro(map, fd, 0, (size_t)file->size); + file->flags |= GIT_DIFF_FILE_UNMAP_DATA; + } else { + git_buf raw = GIT_BUF_INIT, filtered = GIT_BUF_INIT; + + if (!(error = git_futils_readbuffer_fd(&raw, fd, (size_t)file->size)) && + !(error = git_filters_apply(&filtered, &raw, &filters))) + { + map->len = git_buf_len(&filtered); + map->data = git_buf_detach(&filtered); + + file->flags |= GIT_DIFF_FILE_FREE_DATA; + } + + git_buf_free(&raw); + git_buf_free(&filtered); + } + +close_and_cleanup: + git_filters_free(&filters); + p_close(fd); + } + + /* once data is loaded, update OID if we didn't have it previously */ + if (!error && (file->flags & GIT_DIFF_FILE_VALID_OID) == 0) { + error = git_odb_hash( + &file->oid, map->data, map->len, GIT_OBJ_BLOB); + if (!error) + file->flags |= GIT_DIFF_FILE_VALID_OID; } + +cleanup: git_buf_free(&path); return error; } @@ -393,7 +479,9 @@ static int diff_delta_prep(diff_delta_context *ctxt) static int diff_delta_load(diff_delta_context *ctxt) { int error = 0; + git_repository *repo = ctxt->repo; git_diff_delta *delta = ctxt->delta; + bool load_old = false, load_new = false, check_if_unmodified = false; if (ctxt->loaded || !ctxt->delta) return 0; @@ -405,75 +493,77 @@ static int diff_delta_load(diff_delta_context *ctxt) ctxt->old_data.len = 0; ctxt->old_blob = NULL; - if (!error && delta->binary != 1 && - (delta->status == GIT_DELTA_DELETED || - delta->status == GIT_DELTA_MODIFIED)) - { - if (ctxt->old_src == GIT_ITERATOR_WORKDIR) - error = get_workdir_content( - ctxt->repo, &delta->old_file, &ctxt->old_data); - else { - error = get_blob_content( - ctxt->repo, &delta->old_file.oid, - &ctxt->old_data, &ctxt->old_blob); - - if (ctxt->new_src == GIT_ITERATOR_WORKDIR) { - /* TODO: convert crlf of blob content */ - } - } - } - ctxt->new_data.data = ""; ctxt->new_data.len = 0; ctxt->new_blob = NULL; - if (!error && delta->binary != 1 && - (delta->status == GIT_DELTA_ADDED || - delta->status == GIT_DELTA_MODIFIED)) - { - if (ctxt->new_src == GIT_ITERATOR_WORKDIR) - error = get_workdir_content( - ctxt->repo, &delta->new_file, &ctxt->new_data); - else { - error = get_blob_content( - ctxt->repo, &delta->new_file.oid, - &ctxt->new_data, &ctxt->new_blob); - - if (ctxt->old_src == GIT_ITERATOR_WORKDIR) { - /* TODO: convert crlf of blob content */ - } - } + if (delta->binary == 1) + goto cleanup; - if (!error && !(delta->new_file.flags & GIT_DIFF_FILE_VALID_OID)) { - error = git_odb_hash( - &delta->new_file.oid, ctxt->new_data.data, - ctxt->new_data.len, GIT_OBJ_BLOB); - if (error < 0) - goto cleanup; + switch (delta->status) { + case GIT_DELTA_ADDED: load_new = true; break; + case GIT_DELTA_DELETED: load_old = true; break; + case GIT_DELTA_MODIFIED: load_new = load_old = true; break; + default: break; + } - delta->new_file.flags |= GIT_DIFF_FILE_VALID_OID; + check_if_unmodified = + (load_old && (delta->old_file.flags & GIT_DIFF_FILE_VALID_OID) == 0) || + (load_new && (delta->new_file.flags & GIT_DIFF_FILE_VALID_OID) == 0); - /* since we did not have the definitive oid, we may have - * incorrect status and need to skip this item. - */ - if (delta->old_file.mode == delta->new_file.mode && - !git_oid_cmp(&delta->old_file.oid, &delta->new_file.oid)) - { - delta->status = GIT_DELTA_UNMODIFIED; + /* Always try to load workdir content first, since it may need to be + * filtered (and hence use 2x memory) and we want to minimize the max + * memory footprint during diff. + */ - if ((ctxt->opts->flags & GIT_DIFF_INCLUDE_UNMODIFIED) == 0) - goto cleanup; - } - } + if (load_old && ctxt->old_src == GIT_ITERATOR_WORKDIR) { + if ((error = get_workdir_content( + repo, &delta->old_file, &ctxt->old_data)) < 0) + goto cleanup; + + if ((delta->old_file.flags & GIT_DIFF_FILE_BINARY) != 0) + goto cleanup; + } + + if (load_new && ctxt->new_src == GIT_ITERATOR_WORKDIR) { + if ((error = get_workdir_content( + repo, &delta->new_file, &ctxt->new_data)) < 0) + goto cleanup; + + if ((delta->new_file.flags & GIT_DIFF_FILE_BINARY) != 0) + goto cleanup; } + if (load_old && ctxt->old_src != GIT_ITERATOR_WORKDIR && + (error = get_blob_content( + repo, &delta->old_file, &ctxt->old_data, &ctxt->old_blob)) < 0) + goto cleanup; + + if (load_new && ctxt->new_src != GIT_ITERATOR_WORKDIR && + (error = get_blob_content( + repo, &delta->new_file, &ctxt->new_data, &ctxt->new_blob)) < 0) + goto cleanup; + + /* if we did not previously have the definitive oid, we may have + * incorrect status and need to switch this to UNMODIFIED. + */ + if (check_if_unmodified && + delta->old_file.mode == delta->new_file.mode && + !git_oid_cmp(&delta->old_file.oid, &delta->new_file.oid)) + { + delta->status = GIT_DELTA_UNMODIFIED; + + if ((ctxt->opts->flags & GIT_DIFF_INCLUDE_UNMODIFIED) == 0) + goto cleanup; + } + +cleanup: /* if we have not already decided whether file is binary, * check the first 4K for nul bytes to decide... */ if (!error && delta->binary == -1) error = diff_delta_is_binary_by_content(ctxt); -cleanup: ctxt->loaded = !error; /* flag if we would want to diff the contents of these files */ diff --git a/src/fileops.c b/src/fileops.c index 95eacb5f1..d4def1a9a 100644 --- a/src/fileops.c +++ b/src/fileops.c @@ -115,10 +115,47 @@ mode_t git_futils_canonical_mode(mode_t raw_mode) return 0; } -int git_futils_readbuffer_updated(git_buf *buf, const char *path, time_t *mtime, int *updated) +#define MAX_READ_STALLS 10 + +int git_futils_readbuffer_fd(git_buf *buf, git_file fd, size_t len) +{ + int stalls = MAX_READ_STALLS; + + git_buf_clear(buf); + + if (git_buf_grow(buf, len + 1) < 0) + return -1; + + buf->ptr[len] = '\0'; + + while (len > 0) { + ssize_t read_size = p_read(fd, buf->ptr + buf->size, len); + + if (read_size < 0) { + giterr_set(GITERR_OS, "Failed to read descriptor"); + return -1; + } + + if (read_size == 0) { + stalls--; + + if (!stalls) { + giterr_set(GITERR_OS, "Too many stalls reading descriptor"); + return -1; + } + } + + len -= read_size; + buf->size += read_size; + } + + return 0; +} + +int git_futils_readbuffer_updated( + git_buf *buf, const char *path, time_t *mtime, int *updated) { git_file fd; - size_t len; struct stat st; assert(buf && path && *path); @@ -147,30 +184,11 @@ int git_futils_readbuffer_updated(git_buf *buf, const char *path, time_t *mtime, if (mtime != NULL) *mtime = st.st_mtime; - len = (size_t) st.st_size; - - git_buf_clear(buf); - - if (git_buf_grow(buf, len + 1) < 0) { + if (git_futils_readbuffer_fd(buf, fd, (size_t)st.st_size) < 0) { p_close(fd); return -1; } - buf->ptr[len] = '\0'; - - while (len > 0) { - ssize_t read_size = p_read(fd, buf->ptr, len); - - if (read_size < 0) { - p_close(fd); - giterr_set(GITERR_OS, "Failed to read descriptor for '%s'", path); - return -1; - } - - len -= read_size; - buf->size += read_size; - } - p_close(fd); if (updated != NULL) diff --git a/src/fileops.h b/src/fileops.h index 5c23ce30b..d2944f460 100644 --- a/src/fileops.h +++ b/src/fileops.h @@ -19,6 +19,7 @@ */ extern int git_futils_readbuffer(git_buf *obj, const char *path); extern int git_futils_readbuffer_updated(git_buf *obj, const char *path, time_t *mtime, int *updated); +extern int git_futils_readbuffer_fd(git_buf *obj, git_file fd, size_t len); /** * File utils @@ -12,6 +12,7 @@ #include "hash.h" #include "odb.h" #include "delta-apply.h" +#include "filter.h" #include "git2/odb_backend.h" #include "git2/oid.h" @@ -118,11 +119,12 @@ int git_odb__hashfd(git_oid *out, git_file fd, size_t size, git_otype type) hdr_len = format_object_header(hdr, sizeof(hdr), size, type); ctx = git_hash_new_ctx(); + GITERR_CHECK_ALLOC(ctx); git_hash_update(ctx, hdr, hdr_len); while (size > 0) { - ssize_t read_len = read(fd, buffer, sizeof(buffer)); + ssize_t read_len = p_read(fd, buffer, sizeof(buffer)); if (read_len < 0) { git_hash_free_ctx(ctx); @@ -140,6 +142,33 @@ int git_odb__hashfd(git_oid *out, git_file fd, size_t size, git_otype type) return 0; } +int git_odb__hashfd_filtered( + git_oid *out, git_file fd, size_t size, git_otype type, git_vector *filters) +{ + int error; + git_buf raw = GIT_BUF_INIT; + git_buf filtered = GIT_BUF_INIT; + + if (!filters || !filters->length) + return git_odb__hashfd(out, fd, size, type); + + /* size of data is used in header, so we have to read the whole file + * into memory to apply filters before beginning to calculate the hash + */ + + if (!(error = git_futils_readbuffer_fd(&raw, fd, size))) + error = git_filters_apply(&filtered, &raw, filters); + + git_buf_free(&raw); + + if (!error) + error = git_odb_hash(out, filtered.ptr, filtered.size, type); + + git_buf_free(&filtered); + + return error; +} + int git_odb__hashlink(git_oid *out, const char *path) { struct stat st; @@ -171,7 +200,7 @@ int git_odb__hashlink(git_oid *out, const char *path) result = git_odb_hash(out, link_data, (size_t)size, GIT_OBJ_BLOB); git__free(link_data); - } else { + } else { int fd = git_futils_open_ro(path); if (fd < 0) return -1; @@ -58,12 +58,19 @@ int git_odb__hashobj(git_oid *id, git_rawobj *obj); int git_odb__hashfd(git_oid *out, git_file fd, size_t size, git_otype type); /* - * Hash a `path`, assuming it could be a POSIX symlink: if the path is a symlink, - * then the raw contents of the symlink will be hashed. Otherwise, this will - * fallback to `git_odb__hashfd`. + * Hash an open file descriptor applying an array of filters + * Acts just like git_odb__hashfd with the addition of filters... + */ +int git_odb__hashfd_filtered( + git_oid *out, git_file fd, size_t len, git_otype type, git_vector *filters); + +/* + * Hash a `path`, assuming it could be a POSIX symlink: if the path is a + * symlink, then the raw contents of the symlink will be hashed. Otherwise, + * this will fallback to `git_odb__hashfd`. * - * The hash type for this call is always `GIT_OBJ_BLOB` because symlinks may only - * point to blobs. + * The hash type for this call is always `GIT_OBJ_BLOB` because symlinks may + * only point to blobs. */ int git_odb__hashlink(git_oid *out, const char *path); diff --git a/tests-clar/status/worktree.c b/tests-clar/status/worktree.c index c0412ef96..05e396e1f 100644 --- a/tests-clar/status/worktree.c +++ b/tests-clar/status/worktree.c @@ -839,9 +839,5 @@ void test_status_worktree__line_endings_dont_count_as_changes_with_autocrlf(void cl_git_pass(git_status_file(&status, repo, "current_file")); -#ifdef GIT_WIN32 cl_assert_equal_i(GIT_STATUS_CURRENT, status); -#else - cl_assert_equal_i(GIT_STATUS_WT_MODIFIED, status); -#endif } |