summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRussell Belfer <rb@github.com>2012-09-11 12:03:33 -0700
committerRussell Belfer <rb@github.com>2012-09-11 12:03:33 -0700
commit1f35e89dbf6e0be8952cc4324a45fd600be5ca05 (patch)
tree0489cf363f6bb7484c5de0f998b1acf3f831881b
parentc6ac28fdc57d04a9a5eba129cfd267c7adde43b3 (diff)
downloadlibgit2-1f35e89dbf6e0be8952cc4324a45fd600be5ca05.tar.gz
Fix diff binary file detection
In the process of adding tests for the max file size threshold (which treats files over a certain size as binary) there seem to be a number of problems in the new code with detecting binaries. This should fix those up, as well as add a test for the file size threshold stuff. Also, this un-deprecates `GIT_DIFF_LINE_ADD_EOFNL`, since I finally found a legitimate situation where it would be returned.
-rw-r--r--include/git2/diff.h19
-rw-r--r--src/diff_output.c56
-rw-r--r--src/fileops.c32
-rw-r--r--tests-clar/diff/diff_helpers.c3
-rw-r--r--tests-clar/diff/diffiter.c85
5 files changed, 153 insertions, 42 deletions
diff --git a/include/git2/diff.h b/include/git2/diff.h
index 4b4591a9e..05825c50d 100644
--- a/include/git2/diff.h
+++ b/include/git2/diff.h
@@ -79,13 +79,28 @@ typedef struct {
*/
typedef struct git_diff_list git_diff_list;
+/**
+ * Flags that can be set for the file on side of a diff.
+ *
+ * Most of the flags are just for internal consumption by libgit2,
+ * but some of them may be interesting to external users. They are:
+ *
+ * - VALID_OID - the `oid` value is computed and correct
+ * - FREE_PATH - the `path` string is separated allocated memory
+ * - BINARY - this file should be considered binary data
+ * - NOT_BINARY - this file should be considered text data
+ * - FREE_DATA - the internal file data is kept in allocated memory
+ * - UNMAP_DATA - the internal file data is kept in mmap'ed memory
+ * - NO_DATA - this side of the diff should not be loaded
+ */
enum {
GIT_DIFF_FILE_VALID_OID = (1 << 0),
GIT_DIFF_FILE_FREE_PATH = (1 << 1),
GIT_DIFF_FILE_BINARY = (1 << 2),
GIT_DIFF_FILE_NOT_BINARY = (1 << 3),
GIT_DIFF_FILE_FREE_DATA = (1 << 4),
- GIT_DIFF_FILE_UNMAP_DATA = (1 << 5)
+ GIT_DIFF_FILE_UNMAP_DATA = (1 << 5),
+ GIT_DIFF_FILE_NO_DATA = (1 << 6),
};
/**
@@ -176,7 +191,7 @@ enum {
GIT_DIFF_LINE_CONTEXT = ' ',
GIT_DIFF_LINE_ADDITION = '+',
GIT_DIFF_LINE_DELETION = '-',
- GIT_DIFF_LINE_ADD_EOFNL = '\n', /**< DEPRECATED - will not be returned */
+ GIT_DIFF_LINE_ADD_EOFNL = '\n', /**< Removed line w/o LF & added one with */
GIT_DIFF_LINE_DEL_EOFNL = '\0', /**< LF was removed at end of file */
/* The following values will only be sent to a `git_diff_data_fn` when
diff --git a/src/diff_output.c b/src/diff_output.c
index dbef7ddc1..ea40c3355 100644
--- a/src/diff_output.c
+++ b/src/diff_output.c
@@ -172,9 +172,13 @@ static void update_delta_is_binary(git_diff_delta *delta)
if ((delta->old_file.flags & GIT_DIFF_FILE_BINARY) != 0 ||
(delta->new_file.flags & GIT_DIFF_FILE_BINARY) != 0)
delta->binary = 1;
- else if ((delta->old_file.flags & GIT_DIFF_FILE_NOT_BINARY) != 0 &&
- (delta->new_file.flags & GIT_DIFF_FILE_NOT_BINARY) != 0)
+
+#define NOT_BINARY_FLAGS (GIT_DIFF_FILE_NOT_BINARY|GIT_DIFF_FILE_NO_DATA)
+
+ else if ((delta->old_file.flags & NOT_BINARY_FLAGS) != 0 &&
+ (delta->new_file.flags & NOT_BINARY_FLAGS) != 0)
delta->binary = 0;
+
/* otherwise leave delta->binary value untouched */
}
@@ -507,7 +511,7 @@ static int diff_delta_load(diff_delta_context *ctxt)
{
int error = 0;
git_diff_delta *delta = ctxt->delta;
- bool load_old = false, load_new = false, check_if_unmodified = false;
+ bool check_if_unmodified = false;
if (ctxt->loaded || !ctxt->delta)
return 0;
@@ -527,22 +531,33 @@ static int diff_delta_load(diff_delta_context *ctxt)
goto cleanup;
switch (delta->status) {
- case GIT_DELTA_ADDED: load_new = true; break;
- case GIT_DELTA_DELETED: load_old = true; break;
- case GIT_DELTA_MODIFIED: load_new = load_old = true; break;
- default: break;
+ case GIT_DELTA_ADDED:
+ delta->old_file.flags |= GIT_DIFF_FILE_NO_DATA;
+ break;
+ case GIT_DELTA_DELETED:
+ delta->new_file.flags |= GIT_DIFF_FILE_NO_DATA;
+ break;
+ case GIT_DELTA_MODIFIED:
+ break;
+ default:
+ delta->new_file.flags |= GIT_DIFF_FILE_NO_DATA;
+ delta->old_file.flags |= GIT_DIFF_FILE_NO_DATA;
+ break;
}
+#define CHECK_UNMODIFIED (GIT_DIFF_FILE_NO_DATA | GIT_DIFF_FILE_VALID_OID)
+
check_if_unmodified =
- (load_old && (delta->old_file.flags & GIT_DIFF_FILE_VALID_OID) == 0) ||
- (load_new && (delta->new_file.flags & GIT_DIFF_FILE_VALID_OID) == 0);
+ (delta->old_file.flags & CHECK_UNMODIFIED) == 0 &&
+ (delta->new_file.flags & CHECK_UNMODIFIED) == 0;
/* Always try to load workdir content first, since it may need to be
* filtered (and hence use 2x memory) and we want to minimize the max
* memory footprint during diff.
*/
- if (load_old && ctxt->old_src == GIT_ITERATOR_WORKDIR) {
+ if ((delta->old_file.flags & GIT_DIFF_FILE_NO_DATA) == 0 &&
+ ctxt->old_src == GIT_ITERATOR_WORKDIR) {
if ((error = get_workdir_content(
ctxt, &delta->old_file, &ctxt->old_data)) < 0)
goto cleanup;
@@ -550,7 +565,8 @@ static int diff_delta_load(diff_delta_context *ctxt)
goto cleanup;
}
- if (load_new && ctxt->new_src == GIT_ITERATOR_WORKDIR) {
+ if ((delta->new_file.flags & GIT_DIFF_FILE_NO_DATA) == 0 &&
+ ctxt->new_src == GIT_ITERATOR_WORKDIR) {
if ((error = get_workdir_content(
ctxt, &delta->new_file, &ctxt->new_data)) < 0)
goto cleanup;
@@ -558,7 +574,8 @@ static int diff_delta_load(diff_delta_context *ctxt)
goto cleanup;
}
- if (load_old && ctxt->old_src != GIT_ITERATOR_WORKDIR) {
+ if ((delta->old_file.flags & GIT_DIFF_FILE_NO_DATA) == 0 &&
+ ctxt->old_src != GIT_ITERATOR_WORKDIR) {
if ((error = get_blob_content(
ctxt, &delta->old_file, &ctxt->old_data, &ctxt->old_blob)) < 0)
goto cleanup;
@@ -566,7 +583,8 @@ static int diff_delta_load(diff_delta_context *ctxt)
goto cleanup;
}
- if (load_new && ctxt->new_src != GIT_ITERATOR_WORKDIR) {
+ if ((delta->new_file.flags & GIT_DIFF_FILE_NO_DATA) == 0 &&
+ ctxt->new_src != GIT_ITERATOR_WORKDIR) {
if ((error = get_blob_content(
ctxt, &delta->new_file, &ctxt->new_data, &ctxt->new_blob)) < 0)
goto cleanup;
@@ -588,6 +606,10 @@ static int diff_delta_load(diff_delta_context *ctxt)
}
cleanup:
+ /* last change to update binary flag */
+ if (delta->binary == -1)
+ update_delta_is_binary(delta);
+
ctxt->loaded = !error;
/* flag if we would want to diff the contents of these files */
@@ -629,9 +651,10 @@ static int diff_delta_cb(void *priv, mmbuffer_t *bufs, int len)
}
if (len == 3 && !ctxt->cb_error) {
- /* This should only happen if we are adding a line that does not
- * have a newline at the end and the old code did. In that case,
- * we have a ADD with a DEL_EOFNL as a pair.
+ /* If we have a '+' and a third buf, then we have added a line
+ * without a newline and the old code had one, so DEL_EOFNL.
+ * If we have a '-' and a third buf, then we have removed a line
+ * with out a newline but added a blank line, so ADD_EOFNL.
*/
char origin =
(*bufs[0].ptr == '+') ? GIT_DIFF_LINE_DEL_EOFNL :
@@ -1036,6 +1059,7 @@ static void set_data_from_blob(
} else {
map->data = "";
file->size = map->len = 0;
+ file->flags |= GIT_DIFF_FILE_NO_DATA;
}
}
diff --git a/src/fileops.c b/src/fileops.c
index d4def1a9a..cbe3d4782 100644
--- a/src/fileops.c
+++ b/src/fileops.c
@@ -115,40 +115,26 @@ mode_t git_futils_canonical_mode(mode_t raw_mode)
return 0;
}
-#define MAX_READ_STALLS 10
-
int git_futils_readbuffer_fd(git_buf *buf, git_file fd, size_t len)
{
- int stalls = MAX_READ_STALLS;
+ ssize_t read_size;
git_buf_clear(buf);
if (git_buf_grow(buf, len + 1) < 0)
return -1;
- buf->ptr[len] = '\0';
-
- while (len > 0) {
- ssize_t read_size = p_read(fd, buf->ptr + buf->size, len);
-
- if (read_size < 0) {
- giterr_set(GITERR_OS, "Failed to read descriptor");
- return -1;
- }
-
- if (read_size == 0) {
- stalls--;
+ /* p_read loops internally to read len bytes */
+ read_size = p_read(fd, buf->ptr, len);
- if (!stalls) {
- giterr_set(GITERR_OS, "Too many stalls reading descriptor");
- return -1;
- }
- }
-
- len -= read_size;
- buf->size += read_size;
+ if (read_size < 0) {
+ giterr_set(GITERR_OS, "Failed to read descriptor");
+ return -1;
}
+ buf->ptr[read_size] = '\0';
+ buf->size = read_size;
+
return 0;
}
diff --git a/tests-clar/diff/diff_helpers.c b/tests-clar/diff/diff_helpers.c
index ef59b686f..767b34392 100644
--- a/tests-clar/diff/diff_helpers.c
+++ b/tests-clar/diff/diff_helpers.c
@@ -89,7 +89,8 @@ int diff_line_fn(
e->line_adds++;
break;
case GIT_DIFF_LINE_ADD_EOFNL:
- assert(0);
+ /* technically not a line add, but we'll count it as such */
+ e->line_adds++;
break;
case GIT_DIFF_LINE_DELETION:
e->line_dels++;
diff --git a/tests-clar/diff/diffiter.c b/tests-clar/diff/diffiter.c
index 56c254741..23071e48b 100644
--- a/tests-clar/diff/diffiter.c
+++ b/tests-clar/diff/diffiter.c
@@ -114,3 +114,88 @@ void test_diff_diffiter__iterate_files_and_hunks(void)
git_diff_iterator_free(iter);
git_diff_list_free(diff);
}
+
+void test_diff_diffiter__max_size_threshold(void)
+{
+ git_repository *repo = cl_git_sandbox_init("status");
+ git_diff_options opts = {0};
+ git_diff_list *diff = NULL;
+ git_diff_iterator *iter;
+ git_diff_delta *delta;
+ int error, file_count = 0, binary_count = 0, hunk_count = 0;
+
+ opts.context_lines = 3;
+ opts.interhunk_lines = 1;
+ opts.flags |= GIT_DIFF_INCLUDE_IGNORED | GIT_DIFF_INCLUDE_UNTRACKED;
+
+ cl_git_pass(git_diff_workdir_to_index(repo, &opts, &diff));
+ cl_git_pass(git_diff_iterator_new(&iter, diff));
+
+ while ((error = git_diff_iterator_next_file(&delta, iter)) != GIT_ITEROVER) {
+ cl_assert_equal_i(0, error);
+ cl_assert(delta);
+
+ file_count++;
+
+ hunk_count += git_diff_iterator_num_hunks_in_file(iter);
+
+ assert(delta->binary == 0 || delta->binary == 1);
+
+ binary_count += delta->binary;
+ }
+
+ cl_assert_equal_i(GIT_ITEROVER, error);
+ cl_assert(delta == NULL);
+
+ cl_assert_equal_i(13, file_count);
+ cl_assert_equal_i(0, binary_count);
+ cl_assert_equal_i(8, hunk_count);
+
+ git_diff_iterator_free(iter);
+ git_diff_list_free(diff);
+
+ /* try again with low file size threshold */
+
+ file_count = 0;
+ binary_count = 0;
+ hunk_count = 0;
+
+ opts.context_lines = 3;
+ opts.interhunk_lines = 1;
+ opts.flags |= GIT_DIFF_INCLUDE_IGNORED | GIT_DIFF_INCLUDE_UNTRACKED;
+ opts.max_size = 50; /* treat anything over 50 bytes as binary! */
+
+ cl_git_pass(git_diff_workdir_to_index(repo, &opts, &diff));
+ cl_git_pass(git_diff_iterator_new(&iter, diff));
+
+ while ((error = git_diff_iterator_next_file(&delta, iter)) != GIT_ITEROVER) {
+ cl_assert_equal_i(0, error);
+ cl_assert(delta);
+
+ file_count++;
+
+ hunk_count += git_diff_iterator_num_hunks_in_file(iter);
+
+ assert(delta->binary == 0 || delta->binary == 1);
+
+ binary_count += delta->binary;
+ }
+
+ cl_assert_equal_i(GIT_ITEROVER, error);
+ cl_assert(delta == NULL);
+
+ cl_assert_equal_i(13, file_count);
+
+ /* Three files are over the 50 byte threshold:
+ * - staged_changes_file_deleted
+ * - staged_changes_modified_file
+ * - staged_new_file_modified_file
+ */
+ cl_assert_equal_i(3, binary_count);
+
+ cl_assert_equal_i(5, hunk_count);
+
+ git_diff_iterator_free(iter);
+ git_diff_list_free(diff);
+
+}