From 60b9d3fcef04a6beb0ad4df225ada058afabf0b9 Mon Sep 17 00:00:00 2001 From: Russell Belfer Date: Wed, 5 Sep 2012 15:00:40 -0700 Subject: Implement filters for status/diff blobs This adds support to diff and status for running filters (a la crlf) on blobs in the workdir before computing SHAs and before generating text diffs. This ended up being a bit more code change than I had thought since I had to reorganize some of the diff logic to minimize peak memory use when filtering blobs in a diff. This also adds a cap on the maximum size of data that will be loaded to diff. I set it at 512Mb which should match core git. Right now it is a #define in src/diff.h but it could be moved into the public API if desired. --- src/diff_output.c | 214 ++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 152 insertions(+), 62 deletions(-) (limited to 'src/diff_output.c') diff --git a/src/diff_output.c b/src/diff_output.c index 2c64b92ee..e2ca8cf3e 100644 --- a/src/diff_output.c +++ b/src/diff_output.c @@ -22,7 +22,18 @@ * git_diff_foreach() call it is an emphemeral structure that is filled * in to execute each diff. In the case of a git_diff_iterator, it holds * most of the information for the diff in progress. - */ + * + * As each delta is processed, it goes through 3 phases: prep, load, exec. + * + * - In the prep phase, we just set the delta and quickly check the file + * attributes to see if it should be treated as binary. + * - In the load phase, we actually load the file content into memory. + * At this point, if we had deferred calculating OIDs, we might have to + * correct the delta to be UNMODIFIED. + * - In the exec phase, we actually run the diff and execute the callbacks. + * For foreach, this is just a pass-through to the user's callbacks. For + * iterators, we record the hunks and data spans into memory. + */ typedef struct { git_repository *repo; git_diff_options *opts; @@ -263,18 +274,40 @@ static void setup_xdiff_options( static int get_blob_content( git_repository *repo, - const git_oid *oid, + git_diff_file *file, git_map *map, git_blob **blob) { - if (git_oid_iszero(oid)) + int error; + git_odb *odb; + size_t len; + git_otype type; + + if (git_oid_iszero(&file->oid)) return 0; - if (git_blob_lookup(blob, repo, oid) < 0) - return -1; + /* peek at object header to avoid loading if too large */ + if ((error = git_repository_odb__weakptr(&odb, repo)) < 0 || + (error = git_odb_read_header(&len, &type, odb, &file->oid)) < 0) + return error; + + assert(type == GIT_OBJ_BLOB); + + /* if blob is too large to diff, mark as binary */ + if (len > MAX_DIFF_FILESIZE) { + file->flags |= GIT_DIFF_FILE_BINARY; + return 0; + } + + if (!file->size) + file->size = len; + + if ((error = git_blob_lookup(blob, repo, &file->oid)) < 0) + return error; map->data = (void *)git_blob_rawcontent(*blob); map->len = git_blob_rawsize(*blob); + return 0; } @@ -307,13 +340,66 @@ static int get_workdir_content( if (read_len < 0) { giterr_set(GITERR_OS, "Failed to read symlink '%s'", file->path); error = -1; - } else - map->len = read_len; + goto cleanup; + } + + map->len = read_len; } else { - error = git_futils_mmap_ro_file(map, path.ptr); - file->flags |= GIT_DIFF_FILE_UNMAP_DATA; + git_file fd = git_futils_open_ro(path.ptr); + git_vector filters = GIT_VECTOR_INIT; + + if (fd < 0) { + error = fd; + goto cleanup; + } + + if (!file->size) + file->size = git_futils_filesize(fd); + + /* if file is too large to diff, mark as binary */ + if (file->size > MAX_DIFF_FILESIZE) { + file->flags |= GIT_DIFF_FILE_BINARY; + goto close_and_cleanup; + } + + error = git_filters_load(&filters, repo, file->path, GIT_FILTER_TO_ODB); + if (error < 0) + goto close_and_cleanup; + + if (error == 0) { /* note: git_filters_load returns filter count */ + error = git_futils_mmap_ro(map, fd, 0, (size_t)file->size); + file->flags |= GIT_DIFF_FILE_UNMAP_DATA; + } else { + git_buf raw = GIT_BUF_INIT, filtered = GIT_BUF_INIT; + + if (!(error = git_futils_readbuffer_fd(&raw, fd, (size_t)file->size)) && + !(error = git_filters_apply(&filtered, &raw, &filters))) + { + map->len = git_buf_len(&filtered); + map->data = git_buf_detach(&filtered); + + file->flags |= GIT_DIFF_FILE_FREE_DATA; + } + + git_buf_free(&raw); + git_buf_free(&filtered); + } + +close_and_cleanup: + git_filters_free(&filters); + p_close(fd); + } + + /* once data is loaded, update OID if we didn't have it previously */ + if (!error && (file->flags & GIT_DIFF_FILE_VALID_OID) == 0) { + error = git_odb_hash( + &file->oid, map->data, map->len, GIT_OBJ_BLOB); + if (!error) + file->flags |= GIT_DIFF_FILE_VALID_OID; } + +cleanup: git_buf_free(&path); return error; } @@ -393,7 +479,9 @@ static int diff_delta_prep(diff_delta_context *ctxt) static int diff_delta_load(diff_delta_context *ctxt) { int error = 0; + git_repository *repo = ctxt->repo; git_diff_delta *delta = ctxt->delta; + bool load_old = false, load_new = false, check_if_unmodified = false; if (ctxt->loaded || !ctxt->delta) return 0; @@ -405,75 +493,77 @@ static int diff_delta_load(diff_delta_context *ctxt) ctxt->old_data.len = 0; ctxt->old_blob = NULL; - if (!error && delta->binary != 1 && - (delta->status == GIT_DELTA_DELETED || - delta->status == GIT_DELTA_MODIFIED)) - { - if (ctxt->old_src == GIT_ITERATOR_WORKDIR) - error = get_workdir_content( - ctxt->repo, &delta->old_file, &ctxt->old_data); - else { - error = get_blob_content( - ctxt->repo, &delta->old_file.oid, - &ctxt->old_data, &ctxt->old_blob); - - if (ctxt->new_src == GIT_ITERATOR_WORKDIR) { - /* TODO: convert crlf of blob content */ - } - } - } - ctxt->new_data.data = ""; ctxt->new_data.len = 0; ctxt->new_blob = NULL; - if (!error && delta->binary != 1 && - (delta->status == GIT_DELTA_ADDED || - delta->status == GIT_DELTA_MODIFIED)) - { - if (ctxt->new_src == GIT_ITERATOR_WORKDIR) - error = get_workdir_content( - ctxt->repo, &delta->new_file, &ctxt->new_data); - else { - error = get_blob_content( - ctxt->repo, &delta->new_file.oid, - &ctxt->new_data, &ctxt->new_blob); - - if (ctxt->old_src == GIT_ITERATOR_WORKDIR) { - /* TODO: convert crlf of blob content */ - } - } + if (delta->binary == 1) + goto cleanup; - if (!error && !(delta->new_file.flags & GIT_DIFF_FILE_VALID_OID)) { - error = git_odb_hash( - &delta->new_file.oid, ctxt->new_data.data, - ctxt->new_data.len, GIT_OBJ_BLOB); - if (error < 0) - goto cleanup; + switch (delta->status) { + case GIT_DELTA_ADDED: load_new = true; break; + case GIT_DELTA_DELETED: load_old = true; break; + case GIT_DELTA_MODIFIED: load_new = load_old = true; break; + default: break; + } - delta->new_file.flags |= GIT_DIFF_FILE_VALID_OID; + check_if_unmodified = + (load_old && (delta->old_file.flags & GIT_DIFF_FILE_VALID_OID) == 0) || + (load_new && (delta->new_file.flags & GIT_DIFF_FILE_VALID_OID) == 0); - /* since we did not have the definitive oid, we may have - * incorrect status and need to skip this item. - */ - if (delta->old_file.mode == delta->new_file.mode && - !git_oid_cmp(&delta->old_file.oid, &delta->new_file.oid)) - { - delta->status = GIT_DELTA_UNMODIFIED; + /* Always try to load workdir content first, since it may need to be + * filtered (and hence use 2x memory) and we want to minimize the max + * memory footprint during diff. + */ - if ((ctxt->opts->flags & GIT_DIFF_INCLUDE_UNMODIFIED) == 0) - goto cleanup; - } - } + if (load_old && ctxt->old_src == GIT_ITERATOR_WORKDIR) { + if ((error = get_workdir_content( + repo, &delta->old_file, &ctxt->old_data)) < 0) + goto cleanup; + + if ((delta->old_file.flags & GIT_DIFF_FILE_BINARY) != 0) + goto cleanup; + } + + if (load_new && ctxt->new_src == GIT_ITERATOR_WORKDIR) { + if ((error = get_workdir_content( + repo, &delta->new_file, &ctxt->new_data)) < 0) + goto cleanup; + + if ((delta->new_file.flags & GIT_DIFF_FILE_BINARY) != 0) + goto cleanup; } + if (load_old && ctxt->old_src != GIT_ITERATOR_WORKDIR && + (error = get_blob_content( + repo, &delta->old_file, &ctxt->old_data, &ctxt->old_blob)) < 0) + goto cleanup; + + if (load_new && ctxt->new_src != GIT_ITERATOR_WORKDIR && + (error = get_blob_content( + repo, &delta->new_file, &ctxt->new_data, &ctxt->new_blob)) < 0) + goto cleanup; + + /* if we did not previously have the definitive oid, we may have + * incorrect status and need to switch this to UNMODIFIED. + */ + if (check_if_unmodified && + delta->old_file.mode == delta->new_file.mode && + !git_oid_cmp(&delta->old_file.oid, &delta->new_file.oid)) + { + delta->status = GIT_DELTA_UNMODIFIED; + + if ((ctxt->opts->flags & GIT_DIFF_INCLUDE_UNMODIFIED) == 0) + goto cleanup; + } + +cleanup: /* if we have not already decided whether file is binary, * check the first 4K for nul bytes to decide... */ if (!error && delta->binary == -1) error = diff_delta_is_binary_by_content(ctxt); -cleanup: ctxt->loaded = !error; /* flag if we would want to diff the contents of these files */ -- cgit v1.2.1 From 3a3deea80bb6555706f58006bdee8e878b0fd651 Mon Sep 17 00:00:00 2001 From: Russell Belfer Date: Thu, 6 Sep 2012 15:45:50 -0700 Subject: Clean up blob diff path Previously when diffing blobs, the diff code just ran with a NULL repository object. Of course, that's not necessary and the test for a NULL repo was confusing. This makes the blob diff run with the repo that contains the blobs and clarifies the test that it is possible to be diffing data where the path is unknown. --- src/diff_output.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'src/diff_output.c') diff --git a/src/diff_output.c b/src/diff_output.c index e2ca8cf3e..6ff880e95 100644 --- a/src/diff_output.c +++ b/src/diff_output.c @@ -151,7 +151,8 @@ static int update_file_is_binary_by_attr( { const char *value; - if (!repo) + /* because of blob diffs, cannot assume path is set */ + if (!file->path || !strlen(file->path)) return 0; if (git_attr_get(&value, repo, 0, file->path, "diff") < 0) @@ -1028,6 +1029,7 @@ int git_diff_blobs( diff_delta_context ctxt; git_diff_delta delta; git_blob *new, *old; + git_repository *repo; new = new_blob; old = old_blob; @@ -1038,8 +1040,15 @@ int git_diff_blobs( new = swap; } + if (new) + repo = git_object_owner((git_object *)new); + else if (old) + repo = git_object_owner((git_object *)old); + else + repo = NULL; + diff_delta_init_context( - &ctxt, NULL, options, GIT_ITERATOR_TREE, GIT_ITERATOR_TREE); + &ctxt, repo, options, GIT_ITERATOR_TREE, GIT_ITERATOR_TREE); /* populate a "fake" delta record */ -- cgit v1.2.1 From b36effa22e015871948daeea250b4996c663e11a Mon Sep 17 00:00:00 2001 From: Russell Belfer Date: Mon, 10 Sep 2012 09:59:14 -0700 Subject: Replace git_diff_iterator_num_files with progress The `git_diff_iterator_num_files` API was problematic, since we don't actually know the exact number of files to be iterated over until we load those files into memory. This replaces it with a new `git_diff_iterator_progress` API that goes from 0 to 1, and moves and renamed the old API for the internal places that can tolerate a max value instead of an exact value. --- src/diff_output.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) (limited to 'src/diff_output.c') diff --git a/src/diff_output.c b/src/diff_output.c index 6ff880e95..f65d0057f 100644 --- a/src/diff_output.c +++ b/src/diff_output.c @@ -1115,7 +1115,6 @@ struct git_diff_iterator { diff_delta_context ctxt; size_t file_index; size_t next_index; - size_t file_count; git_pool hunks; size_t hunk_count; diffiter_hunk *hunk_head; @@ -1239,8 +1238,6 @@ int git_diff_iterator_new( git_diff_iterator **iterator_ptr, git_diff_list *diff) { - size_t i; - git_diff_delta *delta; git_diff_iterator *iter; assert(diff && iterator_ptr); @@ -1261,12 +1258,6 @@ int git_diff_iterator_new( git_pool_init(&iter->lines, sizeof(diffiter_line), 0) < 0) goto fail; - git_vector_foreach(&diff->deltas, i, delta) { - if (diff_delta_should_skip(iter->ctxt.opts, delta)) - continue; - iter->file_count++; - } - *iterator_ptr = iter; return 0; @@ -1284,9 +1275,14 @@ void git_diff_iterator_free(git_diff_iterator *iter) git__free(iter); } -int git_diff_iterator_num_files(git_diff_iterator *iter) +float git_diff_iterator_progress(git_diff_iterator *iter) +{ + return (float)iter->next_index / (float)iter->diff->deltas.length; +} + +int git_diff_iterator__max_files(git_diff_iterator *iter) { - return (int)iter->file_count; + return (int)iter->diff->deltas.length; } int git_diff_iterator_num_hunks_in_file(git_diff_iterator *iter) -- cgit v1.2.1 From e597b1890ebd43e398d84b7bf4ca366365b24d27 Mon Sep 17 00:00:00 2001 From: Russell Belfer Date: Mon, 10 Sep 2012 11:49:12 -0700 Subject: Move diff max_size to public API This commit adds a max_size value in the public `git_diff_options` structure so that the user can automatically flag blobs over a certain size as binary regardless of other properties. Also, and perhaps more importantly, this moves binary detection to be as early as possible in the diff traversal inner loop and makes sure that we stop loading objects as soon as we decide that they are binary. --- src/diff_output.c | 148 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 83 insertions(+), 65 deletions(-) (limited to 'src/diff_output.c') diff --git a/src/diff_output.c b/src/diff_output.c index f65d0057f..8873a4dc7 100644 --- a/src/diff_output.c +++ b/src/diff_output.c @@ -172,7 +172,7 @@ static void update_delta_is_binary(git_diff_delta *delta) if ((delta->old_file.flags & GIT_DIFF_FILE_BINARY) != 0 || (delta->new_file.flags & GIT_DIFF_FILE_BINARY) != 0) delta->binary = 1; - else if ((delta->old_file.flags & GIT_DIFF_FILE_NOT_BINARY) != 0 || + else if ((delta->old_file.flags & GIT_DIFF_FILE_NOT_BINARY) != 0 && (delta->new_file.flags & GIT_DIFF_FILE_NOT_BINARY) != 0) delta->binary = 0; /* otherwise leave delta->binary value untouched */ @@ -219,34 +219,46 @@ static int diff_delta_is_binary_by_attr(diff_delta_context *ctxt) return error; } -static int diff_delta_is_binary_by_content(diff_delta_context *ctxt) +static int diff_delta_is_binary_by_content( + diff_delta_context *ctxt, git_diff_file *file, git_map *map) { - git_diff_delta *delta = ctxt->delta; git_buf search; - if ((delta->old_file.flags & BINARY_DIFF_FLAGS) == 0) { - search.ptr = ctxt->old_data.data; - search.size = min(ctxt->old_data.len, 4000); + if ((file->flags & BINARY_DIFF_FLAGS) == 0) { + search.ptr = map->data; + search.size = min(map->len, 4000); if (git_buf_is_binary(&search)) - delta->old_file.flags |= GIT_DIFF_FILE_BINARY; + file->flags |= GIT_DIFF_FILE_BINARY; else - delta->old_file.flags |= GIT_DIFF_FILE_NOT_BINARY; + file->flags |= GIT_DIFF_FILE_NOT_BINARY; } - if ((delta->new_file.flags & BINARY_DIFF_FLAGS) == 0) { - search.ptr = ctxt->new_data.data; - search.size = min(ctxt->new_data.len, 4000); + update_delta_is_binary(ctxt->delta); - if (git_buf_is_binary(&search)) - delta->new_file.flags |= GIT_DIFF_FILE_BINARY; - else - delta->new_file.flags |= GIT_DIFF_FILE_NOT_BINARY; + return 0; +} + +static int diff_delta_is_binary_by_size( + diff_delta_context *ctxt, git_diff_file *file) +{ + git_off_t threshold = MAX_DIFF_FILESIZE; + + if ((file->flags & BINARY_DIFF_FLAGS) != 0) + return 0; + + if (ctxt && ctxt->opts) { + if (ctxt->opts->max_size < 0) + return 0; + + if (ctxt->opts->max_size > 0) + threshold = ctxt->opts->max_size; } - update_delta_is_binary(delta); + if (file->size > threshold) + file->flags |= GIT_DIFF_FILE_BINARY; - /* TODO: if value != NULL, implement diff drivers */ + update_delta_is_binary(ctxt->delta); return 0; } @@ -274,53 +286,56 @@ static void setup_xdiff_options( } static int get_blob_content( - git_repository *repo, + diff_delta_context *ctxt, git_diff_file *file, git_map *map, git_blob **blob) { int error; - git_odb *odb; - size_t len; - git_otype type; if (git_oid_iszero(&file->oid)) return 0; - /* peek at object header to avoid loading if too large */ - if ((error = git_repository_odb__weakptr(&odb, repo)) < 0 || - (error = git_odb_read_header(&len, &type, odb, &file->oid)) < 0) - return error; + if (!file->size) { + git_odb *odb; + size_t len; + git_otype type; - assert(type == GIT_OBJ_BLOB); + /* peek at object header to avoid loading if too large */ + if ((error = git_repository_odb__weakptr(&odb, ctxt->repo)) < 0 || + (error = git_odb_read_header(&len, &type, odb, &file->oid)) < 0) + return error; - /* if blob is too large to diff, mark as binary */ - if (len > MAX_DIFF_FILESIZE) { - file->flags |= GIT_DIFF_FILE_BINARY; - return 0; - } + assert(type == GIT_OBJ_BLOB); - if (!file->size) file->size = len; + } - if ((error = git_blob_lookup(blob, repo, &file->oid)) < 0) + /* if blob is too large to diff, mark as binary */ + if ((error = diff_delta_is_binary_by_size(ctxt, file)) < 0) + return error; + if (ctxt->delta->binary == 1) + return 0; + + if ((error = git_blob_lookup(blob, ctxt->repo, &file->oid)) < 0) return error; map->data = (void *)git_blob_rawcontent(*blob); map->len = git_blob_rawsize(*blob); - return 0; + return diff_delta_is_binary_by_content(ctxt, file, map); } static int get_workdir_content( - git_repository *repo, + diff_delta_context *ctxt, git_diff_file *file, git_map *map) { int error = 0; git_buf path = GIT_BUF_INIT; + const char *wd = git_repository_workdir(ctxt->repo); - if (git_buf_joinpath(&path, git_repository_workdir(repo), file->path) < 0) + if (git_buf_joinpath(&path, wd, file->path) < 0) return -1; if (S_ISLNK(file->mode)) { @@ -358,13 +373,12 @@ static int get_workdir_content( if (!file->size) file->size = git_futils_filesize(fd); - /* if file is too large to diff, mark as binary */ - if (file->size > MAX_DIFF_FILESIZE) { - file->flags |= GIT_DIFF_FILE_BINARY; + if ((error = diff_delta_is_binary_by_size(ctxt, file)) < 0 || + ctxt->delta->binary == 1) goto close_and_cleanup; - } - error = git_filters_load(&filters, repo, file->path, GIT_FILTER_TO_ODB); + error = git_filters_load( + &filters, ctxt->repo, file->path, GIT_FILTER_TO_ODB); if (error < 0) goto close_and_cleanup; @@ -400,6 +414,9 @@ close_and_cleanup: file->flags |= GIT_DIFF_FILE_VALID_OID; } + if (!error) + error = diff_delta_is_binary_by_content(ctxt, file, map); + cleanup: git_buf_free(&path); return error; @@ -480,7 +497,6 @@ static int diff_delta_prep(diff_delta_context *ctxt) static int diff_delta_load(diff_delta_context *ctxt) { int error = 0; - git_repository *repo = ctxt->repo; git_diff_delta *delta = ctxt->delta; bool load_old = false, load_new = false, check_if_unmodified = false; @@ -519,31 +535,35 @@ static int diff_delta_load(diff_delta_context *ctxt) if (load_old && ctxt->old_src == GIT_ITERATOR_WORKDIR) { if ((error = get_workdir_content( - repo, &delta->old_file, &ctxt->old_data)) < 0) + ctxt, &delta->old_file, &ctxt->old_data)) < 0) goto cleanup; - - if ((delta->old_file.flags & GIT_DIFF_FILE_BINARY) != 0) + if (delta->binary == 1) goto cleanup; } if (load_new && ctxt->new_src == GIT_ITERATOR_WORKDIR) { if ((error = get_workdir_content( - repo, &delta->new_file, &ctxt->new_data)) < 0) + ctxt, &delta->new_file, &ctxt->new_data)) < 0) goto cleanup; - - if ((delta->new_file.flags & GIT_DIFF_FILE_BINARY) != 0) + if (delta->binary == 1) goto cleanup; } - if (load_old && ctxt->old_src != GIT_ITERATOR_WORKDIR && - (error = get_blob_content( - repo, &delta->old_file, &ctxt->old_data, &ctxt->old_blob)) < 0) - goto cleanup; + if (load_old && ctxt->old_src != GIT_ITERATOR_WORKDIR) { + if ((error = get_blob_content( + ctxt, &delta->old_file, &ctxt->old_data, &ctxt->old_blob)) < 0) + goto cleanup; + if (delta->binary == 1) + goto cleanup; + } - if (load_new && ctxt->new_src != GIT_ITERATOR_WORKDIR && - (error = get_blob_content( - repo, &delta->new_file, &ctxt->new_data, &ctxt->new_blob)) < 0) - goto cleanup; + if (load_new && ctxt->new_src != GIT_ITERATOR_WORKDIR) { + if ((error = get_blob_content( + ctxt, &delta->new_file, &ctxt->new_data, &ctxt->new_blob)) < 0) + goto cleanup; + if (delta->binary == 1) + goto cleanup; + } /* if we did not previously have the definitive oid, we may have * incorrect status and need to switch this to UNMODIFIED. @@ -559,12 +579,6 @@ static int diff_delta_load(diff_delta_context *ctxt) } cleanup: - /* if we have not already decided whether file is binary, - * check the first 4K for nul bytes to decide... - */ - if (!error && delta->binary == -1) - error = diff_delta_is_binary_by_content(ctxt); - ctxt->loaded = !error; /* flag if we would want to diff the contents of these files */ @@ -1069,9 +1083,13 @@ int git_diff_blobs( if ((error = diff_delta_prep(&ctxt)) < 0) goto cleanup; - if (delta.binary == -1 && - (error = diff_delta_is_binary_by_content(&ctxt)) < 0) - goto cleanup; + if (delta.binary == -1) { + if ((error = diff_delta_is_binary_by_content( + &ctxt, &delta.old_file, &ctxt.old_data)) < 0 || + (error = diff_delta_is_binary_by_content( + &ctxt, &delta.new_file, &ctxt.new_data)) < 0) + goto cleanup; + } ctxt.loaded = 1; ctxt.diffable = (delta.binary != 1 && delta.status != GIT_DELTA_UNMODIFIED); -- cgit v1.2.1 From c6ac28fdc57d04a9a5eba129cfd267c7adde43b3 Mon Sep 17 00:00:00 2001 From: Russell Belfer Date: Mon, 10 Sep 2012 12:24:05 -0700 Subject: Reorg internal odb read header and object lookup Often `git_odb_read_header` will "fail" and have to read the entire object into memory instead of just the header. When this happens, the object is loaded and then disposed of immediately, which makes it difficult to efficiently use the header information to decide if the object should be loaded (since attempting to do so will often result in loading the object twice). This commit takes the existing code and reorganizes it to have two new functions: - `git_odb__read_header_or_object` which acts just like the old read header function except that it returns the object, too, if it was forced to load the whole thing. It then becomes the callers responsibility to free the `git_odb_object`. - `git_object__from_odb_object` which was extracted from the old `git_object_lookup` and creates a subclass of `git_object` from an existing `git_odb_object` (separating the ODB lookup from the `git_object` creation). This allows you to use the first header reading function efficiently without instantiating the `git_odb_object` twice. There is no net change to the behavior of any of the existing functions, but this allows internal code to tap into the ODB lookup and object creation to be more efficient. --- src/diff_output.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'src/diff_output.c') diff --git a/src/diff_output.c b/src/diff_output.c index 8873a4dc7..dbef7ddc1 100644 --- a/src/diff_output.c +++ b/src/diff_output.c @@ -292,6 +292,7 @@ static int get_blob_content( git_blob **blob) { int error; + git_odb_object *odb_obj = NULL; if (git_oid_iszero(&file->oid)) return 0; @@ -303,7 +304,8 @@ static int get_blob_content( /* peek at object header to avoid loading if too large */ if ((error = git_repository_odb__weakptr(&odb, ctxt->repo)) < 0 || - (error = git_odb_read_header(&len, &type, odb, &file->oid)) < 0) + (error = git_odb__read_header_or_object( + &odb_obj, &len, &type, odb, &file->oid)) < 0) return error; assert(type == GIT_OBJ_BLOB); @@ -317,7 +319,14 @@ static int get_blob_content( if (ctxt->delta->binary == 1) return 0; - if ((error = git_blob_lookup(blob, ctxt->repo, &file->oid)) < 0) + if (odb_obj != NULL) { + error = git_object__from_odb_object( + (git_object **)blob, ctxt->repo, odb_obj, GIT_OBJ_BLOB); + git_odb_object_free(odb_obj); + } else + error = git_blob_lookup(blob, ctxt->repo, &file->oid); + + if (error) return error; map->data = (void *)git_blob_rawcontent(*blob); -- cgit v1.2.1 From 1f35e89dbf6e0be8952cc4324a45fd600be5ca05 Mon Sep 17 00:00:00 2001 From: Russell Belfer Date: Tue, 11 Sep 2012 12:03:33 -0700 Subject: Fix diff binary file detection In the process of adding tests for the max file size threshold (which treats files over a certain size as binary) there seem to be a number of problems in the new code with detecting binaries. This should fix those up, as well as add a test for the file size threshold stuff. Also, this un-deprecates `GIT_DIFF_LINE_ADD_EOFNL`, since I finally found a legitimate situation where it would be returned. --- src/diff_output.c | 56 +++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 16 deletions(-) (limited to 'src/diff_output.c') diff --git a/src/diff_output.c b/src/diff_output.c index dbef7ddc1..ea40c3355 100644 --- a/src/diff_output.c +++ b/src/diff_output.c @@ -172,9 +172,13 @@ static void update_delta_is_binary(git_diff_delta *delta) if ((delta->old_file.flags & GIT_DIFF_FILE_BINARY) != 0 || (delta->new_file.flags & GIT_DIFF_FILE_BINARY) != 0) delta->binary = 1; - else if ((delta->old_file.flags & GIT_DIFF_FILE_NOT_BINARY) != 0 && - (delta->new_file.flags & GIT_DIFF_FILE_NOT_BINARY) != 0) + +#define NOT_BINARY_FLAGS (GIT_DIFF_FILE_NOT_BINARY|GIT_DIFF_FILE_NO_DATA) + + else if ((delta->old_file.flags & NOT_BINARY_FLAGS) != 0 && + (delta->new_file.flags & NOT_BINARY_FLAGS) != 0) delta->binary = 0; + /* otherwise leave delta->binary value untouched */ } @@ -507,7 +511,7 @@ static int diff_delta_load(diff_delta_context *ctxt) { int error = 0; git_diff_delta *delta = ctxt->delta; - bool load_old = false, load_new = false, check_if_unmodified = false; + bool check_if_unmodified = false; if (ctxt->loaded || !ctxt->delta) return 0; @@ -527,22 +531,33 @@ static int diff_delta_load(diff_delta_context *ctxt) goto cleanup; switch (delta->status) { - case GIT_DELTA_ADDED: load_new = true; break; - case GIT_DELTA_DELETED: load_old = true; break; - case GIT_DELTA_MODIFIED: load_new = load_old = true; break; - default: break; + case GIT_DELTA_ADDED: + delta->old_file.flags |= GIT_DIFF_FILE_NO_DATA; + break; + case GIT_DELTA_DELETED: + delta->new_file.flags |= GIT_DIFF_FILE_NO_DATA; + break; + case GIT_DELTA_MODIFIED: + break; + default: + delta->new_file.flags |= GIT_DIFF_FILE_NO_DATA; + delta->old_file.flags |= GIT_DIFF_FILE_NO_DATA; + break; } +#define CHECK_UNMODIFIED (GIT_DIFF_FILE_NO_DATA | GIT_DIFF_FILE_VALID_OID) + check_if_unmodified = - (load_old && (delta->old_file.flags & GIT_DIFF_FILE_VALID_OID) == 0) || - (load_new && (delta->new_file.flags & GIT_DIFF_FILE_VALID_OID) == 0); + (delta->old_file.flags & CHECK_UNMODIFIED) == 0 && + (delta->new_file.flags & CHECK_UNMODIFIED) == 0; /* Always try to load workdir content first, since it may need to be * filtered (and hence use 2x memory) and we want to minimize the max * memory footprint during diff. */ - if (load_old && ctxt->old_src == GIT_ITERATOR_WORKDIR) { + if ((delta->old_file.flags & GIT_DIFF_FILE_NO_DATA) == 0 && + ctxt->old_src == GIT_ITERATOR_WORKDIR) { if ((error = get_workdir_content( ctxt, &delta->old_file, &ctxt->old_data)) < 0) goto cleanup; @@ -550,7 +565,8 @@ static int diff_delta_load(diff_delta_context *ctxt) goto cleanup; } - if (load_new && ctxt->new_src == GIT_ITERATOR_WORKDIR) { + if ((delta->new_file.flags & GIT_DIFF_FILE_NO_DATA) == 0 && + ctxt->new_src == GIT_ITERATOR_WORKDIR) { if ((error = get_workdir_content( ctxt, &delta->new_file, &ctxt->new_data)) < 0) goto cleanup; @@ -558,7 +574,8 @@ static int diff_delta_load(diff_delta_context *ctxt) goto cleanup; } - if (load_old && ctxt->old_src != GIT_ITERATOR_WORKDIR) { + if ((delta->old_file.flags & GIT_DIFF_FILE_NO_DATA) == 0 && + ctxt->old_src != GIT_ITERATOR_WORKDIR) { if ((error = get_blob_content( ctxt, &delta->old_file, &ctxt->old_data, &ctxt->old_blob)) < 0) goto cleanup; @@ -566,7 +583,8 @@ static int diff_delta_load(diff_delta_context *ctxt) goto cleanup; } - if (load_new && ctxt->new_src != GIT_ITERATOR_WORKDIR) { + if ((delta->new_file.flags & GIT_DIFF_FILE_NO_DATA) == 0 && + ctxt->new_src != GIT_ITERATOR_WORKDIR) { if ((error = get_blob_content( ctxt, &delta->new_file, &ctxt->new_data, &ctxt->new_blob)) < 0) goto cleanup; @@ -588,6 +606,10 @@ static int diff_delta_load(diff_delta_context *ctxt) } cleanup: + /* last change to update binary flag */ + if (delta->binary == -1) + update_delta_is_binary(delta); + ctxt->loaded = !error; /* flag if we would want to diff the contents of these files */ @@ -629,9 +651,10 @@ static int diff_delta_cb(void *priv, mmbuffer_t *bufs, int len) } if (len == 3 && !ctxt->cb_error) { - /* This should only happen if we are adding a line that does not - * have a newline at the end and the old code did. In that case, - * we have a ADD with a DEL_EOFNL as a pair. + /* If we have a '+' and a third buf, then we have added a line + * without a newline and the old code had one, so DEL_EOFNL. + * If we have a '-' and a third buf, then we have removed a line + * with out a newline but added a blank line, so ADD_EOFNL. */ char origin = (*bufs[0].ptr == '+') ? GIT_DIFF_LINE_DEL_EOFNL : @@ -1036,6 +1059,7 @@ static void set_data_from_blob( } else { map->data = ""; file->size = map->len = 0; + file->flags |= GIT_DIFF_FILE_NO_DATA; } } -- cgit v1.2.1