summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRussell Belfer <rb@github.com>2013-01-11 11:24:26 -0800
committerRussell Belfer <rb@github.com>2013-01-11 11:24:26 -0800
commit0d65acade84a5ff2421cd52de82a8963f004d481 (patch)
tree5c1f0ba04710e32642e32a4cadf7a8e14dc0aba6
parentd0b14cea0e1ff09af83a801c1a9cf1a431d46d0c (diff)
downloadlibgit2-0d65acade84a5ff2421cd52de82a8963f004d481.tar.gz
Match binary file check of core git in diff
Core git just looks for NUL bytes in files when deciding about is-binary inside diff (although it uses a better algorithm in checkout, when deciding if CRLF conversion should be done). Libgit2 was using the better algorithm in both places, but that is causing some confusion. For now, this makes diff just look for NUL bytes to decide if a file is binary by content in diff.
-rw-r--r--src/buf_text.c5
-rw-r--r--src/buf_text.h8
-rw-r--r--src/diff_output.c7
-rw-r--r--tests-clar/core/buffer.c23
4 files changed, 42 insertions, 1 deletions
diff --git a/src/buf_text.c b/src/buf_text.c
index a7122dc0c..0104a9057 100644
--- a/src/buf_text.c
+++ b/src/buf_text.c
@@ -109,6 +109,11 @@ bool git_buf_text_is_binary(const git_buf *buf)
return ((printable >> 7) < nonprintable);
}
+bool git_buf_text_contains_nul(const git_buf *buf)
+{
+ return (strnlen(buf->ptr, buf->size) != buf->size);
+}
+
int git_buf_text_detect_bom(git_bom_t *bom, const git_buf *buf, size_t offset)
{
const char *ptr;
diff --git a/src/buf_text.h b/src/buf_text.h
index ae5e6ca30..458ee33c9 100644
--- a/src/buf_text.h
+++ b/src/buf_text.h
@@ -71,6 +71,14 @@ extern int git_buf_text_common_prefix(git_buf *buf, const git_strarray *strs);
extern bool git_buf_text_is_binary(const git_buf *buf);
/**
+ * Check quickly if buffer contains a NUL byte
+ *
+ * @param buf Buffer to check
+ * @return true if buffer contains a NUL byte
+ */
+extern bool git_buf_text_contains_nul(const git_buf *buf);
+
+/**
* Check if a buffer begins with a UTF BOM
*
* @param bom Set to the type of BOM detected or GIT_BOM_NONE
diff --git a/src/diff_output.c b/src/diff_output.c
index f98665dfb..dcff78871 100644
--- a/src/diff_output.c
+++ b/src/diff_output.c
@@ -142,7 +142,12 @@ static int diff_delta_is_binary_by_content(
GIT_UNUSED(ctxt);
if ((file->flags & KNOWN_BINARY_FLAGS) == 0) {
- if (git_buf_text_is_binary(&search))
+ /* TODO: provide encoding / binary detection callbacks that can
+ * be UTF-8 aware, etc. For now, instead of trying to be smart,
+ * let's just use the simple NUL-byte detection that core git uses.
+ */
+ /* previously was: if (git_buf_text_is_binary(&search)) */
+ if (git_buf_text_contains_nul(&search))
file->flags |= GIT_DIFF_FILE_BINARY;
else
file->flags |= GIT_DIFF_FILE_NOT_BINARY;
diff --git a/tests-clar/core/buffer.c b/tests-clar/core/buffer.c
index 40fc4c571..5d9b7850c 100644
--- a/tests-clar/core/buffer.c
+++ b/tests-clar/core/buffer.c
@@ -704,3 +704,26 @@ void test_core_buffer__base64(void)
git_buf_free(&buf);
}
+
+void test_core_buffer__classify_with_utf8(void)
+{
+ char *data0 = "Simple text\n";
+ size_t data0len = 12;
+ char *data1 = "Is that UTF-8 data I seeā€¦\nYep!\n";
+ size_t data1len = 31;
+ char *data2 = "Internal NUL!!!\000\n\nI see you!\n";
+ size_t data2len = 29;
+ git_buf b;
+
+ b.ptr = data0; b.size = b.asize = data0len;
+ cl_assert(!git_buf_text_is_binary(&b));
+ cl_assert(!git_buf_text_contains_nul(&b));
+
+ b.ptr = data1; b.size = b.asize = data1len;
+ cl_assert(git_buf_text_is_binary(&b));
+ cl_assert(!git_buf_text_contains_nul(&b));
+
+ b.ptr = data2; b.size = b.asize = data2len;
+ cl_assert(git_buf_text_is_binary(&b));
+ cl_assert(git_buf_text_contains_nul(&b));
+}