Commit 1f35e89d by Russell Belfer

Fix diff binary file detection

In the process of adding tests for the max file size threshold
(which treats files over a certain size as binary) there seem to
be a number of problems in the new code with detecting binaries.
This should fix those up, as well as add a test for the file
size threshold stuff.

Also, this un-deprecates `GIT_DIFF_LINE_ADD_EOFNL`, since I
finally found a legitimate situation where it would be returned.
parent c6ac28fd
......@@ -79,13 +79,28 @@ typedef struct {
*/
typedef struct git_diff_list git_diff_list;
/**
* Flags that can be set for the file on side of a diff.
*
* Most of the flags are just for internal consumption by libgit2,
* but some of them may be interesting to external users. They are:
*
* - VALID_OID - the `oid` value is computed and correct
* - FREE_PATH - the `path` string is separated allocated memory
* - BINARY - this file should be considered binary data
* - NOT_BINARY - this file should be considered text data
* - FREE_DATA - the internal file data is kept in allocated memory
* - UNMAP_DATA - the internal file data is kept in mmap'ed memory
* - NO_DATA - this side of the diff should not be loaded
*/
enum {
GIT_DIFF_FILE_VALID_OID = (1 << 0),
GIT_DIFF_FILE_FREE_PATH = (1 << 1),
GIT_DIFF_FILE_BINARY = (1 << 2),
GIT_DIFF_FILE_NOT_BINARY = (1 << 3),
GIT_DIFF_FILE_FREE_DATA = (1 << 4),
GIT_DIFF_FILE_UNMAP_DATA = (1 << 5)
GIT_DIFF_FILE_UNMAP_DATA = (1 << 5),
GIT_DIFF_FILE_NO_DATA = (1 << 6),
};
/**
......@@ -176,7 +191,7 @@ enum {
GIT_DIFF_LINE_CONTEXT = ' ',
GIT_DIFF_LINE_ADDITION = '+',
GIT_DIFF_LINE_DELETION = '-',
GIT_DIFF_LINE_ADD_EOFNL = '\n', /**< DEPRECATED - will not be returned */
GIT_DIFF_LINE_ADD_EOFNL = '\n', /**< Removed line w/o LF & added one with */
GIT_DIFF_LINE_DEL_EOFNL = '\0', /**< LF was removed at end of file */
/* The following values will only be sent to a `git_diff_data_fn` when
......
......@@ -172,9 +172,13 @@ static void update_delta_is_binary(git_diff_delta *delta)
if ((delta->old_file.flags & GIT_DIFF_FILE_BINARY) != 0 ||
(delta->new_file.flags & GIT_DIFF_FILE_BINARY) != 0)
delta->binary = 1;
else if ((delta->old_file.flags & GIT_DIFF_FILE_NOT_BINARY) != 0 &&
(delta->new_file.flags & GIT_DIFF_FILE_NOT_BINARY) != 0)
#define NOT_BINARY_FLAGS (GIT_DIFF_FILE_NOT_BINARY|GIT_DIFF_FILE_NO_DATA)
else if ((delta->old_file.flags & NOT_BINARY_FLAGS) != 0 &&
(delta->new_file.flags & NOT_BINARY_FLAGS) != 0)
delta->binary = 0;
/* otherwise leave delta->binary value untouched */
}
......@@ -507,7 +511,7 @@ static int diff_delta_load(diff_delta_context *ctxt)
{
int error = 0;
git_diff_delta *delta = ctxt->delta;
bool load_old = false, load_new = false, check_if_unmodified = false;
bool check_if_unmodified = false;
if (ctxt->loaded || !ctxt->delta)
return 0;
......@@ -527,22 +531,33 @@ static int diff_delta_load(diff_delta_context *ctxt)
goto cleanup;
switch (delta->status) {
case GIT_DELTA_ADDED: load_new = true; break;
case GIT_DELTA_DELETED: load_old = true; break;
case GIT_DELTA_MODIFIED: load_new = load_old = true; break;
default: break;
case GIT_DELTA_ADDED:
delta->old_file.flags |= GIT_DIFF_FILE_NO_DATA;
break;
case GIT_DELTA_DELETED:
delta->new_file.flags |= GIT_DIFF_FILE_NO_DATA;
break;
case GIT_DELTA_MODIFIED:
break;
default:
delta->new_file.flags |= GIT_DIFF_FILE_NO_DATA;
delta->old_file.flags |= GIT_DIFF_FILE_NO_DATA;
break;
}
#define CHECK_UNMODIFIED (GIT_DIFF_FILE_NO_DATA | GIT_DIFF_FILE_VALID_OID)
check_if_unmodified =
(load_old && (delta->old_file.flags & GIT_DIFF_FILE_VALID_OID) == 0) ||
(load_new && (delta->new_file.flags & GIT_DIFF_FILE_VALID_OID) == 0);
(delta->old_file.flags & CHECK_UNMODIFIED) == 0 &&
(delta->new_file.flags & CHECK_UNMODIFIED) == 0;
/* Always try to load workdir content first, since it may need to be
* filtered (and hence use 2x memory) and we want to minimize the max
* memory footprint during diff.
*/
if (load_old && ctxt->old_src == GIT_ITERATOR_WORKDIR) {
if ((delta->old_file.flags & GIT_DIFF_FILE_NO_DATA) == 0 &&
ctxt->old_src == GIT_ITERATOR_WORKDIR) {
if ((error = get_workdir_content(
ctxt, &delta->old_file, &ctxt->old_data)) < 0)
goto cleanup;
......@@ -550,7 +565,8 @@ static int diff_delta_load(diff_delta_context *ctxt)
goto cleanup;
}
if (load_new && ctxt->new_src == GIT_ITERATOR_WORKDIR) {
if ((delta->new_file.flags & GIT_DIFF_FILE_NO_DATA) == 0 &&
ctxt->new_src == GIT_ITERATOR_WORKDIR) {
if ((error = get_workdir_content(
ctxt, &delta->new_file, &ctxt->new_data)) < 0)
goto cleanup;
......@@ -558,7 +574,8 @@ static int diff_delta_load(diff_delta_context *ctxt)
goto cleanup;
}
if (load_old && ctxt->old_src != GIT_ITERATOR_WORKDIR) {
if ((delta->old_file.flags & GIT_DIFF_FILE_NO_DATA) == 0 &&
ctxt->old_src != GIT_ITERATOR_WORKDIR) {
if ((error = get_blob_content(
ctxt, &delta->old_file, &ctxt->old_data, &ctxt->old_blob)) < 0)
goto cleanup;
......@@ -566,7 +583,8 @@ static int diff_delta_load(diff_delta_context *ctxt)
goto cleanup;
}
if (load_new && ctxt->new_src != GIT_ITERATOR_WORKDIR) {
if ((delta->new_file.flags & GIT_DIFF_FILE_NO_DATA) == 0 &&
ctxt->new_src != GIT_ITERATOR_WORKDIR) {
if ((error = get_blob_content(
ctxt, &delta->new_file, &ctxt->new_data, &ctxt->new_blob)) < 0)
goto cleanup;
......@@ -588,6 +606,10 @@ static int diff_delta_load(diff_delta_context *ctxt)
}
cleanup:
/* last change to update binary flag */
if (delta->binary == -1)
update_delta_is_binary(delta);
ctxt->loaded = !error;
/* flag if we would want to diff the contents of these files */
......@@ -629,9 +651,10 @@ static int diff_delta_cb(void *priv, mmbuffer_t *bufs, int len)
}
if (len == 3 && !ctxt->cb_error) {
/* This should only happen if we are adding a line that does not
* have a newline at the end and the old code did. In that case,
* we have a ADD with a DEL_EOFNL as a pair.
/* If we have a '+' and a third buf, then we have added a line
* without a newline and the old code had one, so DEL_EOFNL.
* If we have a '-' and a third buf, then we have removed a line
* with out a newline but added a blank line, so ADD_EOFNL.
*/
char origin =
(*bufs[0].ptr == '+') ? GIT_DIFF_LINE_DEL_EOFNL :
......@@ -1036,6 +1059,7 @@ static void set_data_from_blob(
} else {
map->data = "";
file->size = map->len = 0;
file->flags |= GIT_DIFF_FILE_NO_DATA;
}
}
......
......@@ -115,40 +115,26 @@ mode_t git_futils_canonical_mode(mode_t raw_mode)
return 0;
}
#define MAX_READ_STALLS 10
int git_futils_readbuffer_fd(git_buf *buf, git_file fd, size_t len)
{
int stalls = MAX_READ_STALLS;
ssize_t read_size;
git_buf_clear(buf);
if (git_buf_grow(buf, len + 1) < 0)
return -1;
buf->ptr[len] = '\0';
while (len > 0) {
ssize_t read_size = p_read(fd, buf->ptr + buf->size, len);
if (read_size < 0) {
giterr_set(GITERR_OS, "Failed to read descriptor");
return -1;
}
if (read_size == 0) {
stalls--;
/* p_read loops internally to read len bytes */
read_size = p_read(fd, buf->ptr, len);
if (!stalls) {
giterr_set(GITERR_OS, "Too many stalls reading descriptor");
return -1;
}
}
len -= read_size;
buf->size += read_size;
if (read_size < 0) {
giterr_set(GITERR_OS, "Failed to read descriptor");
return -1;
}
buf->ptr[read_size] = '\0';
buf->size = read_size;
return 0;
}
......
......@@ -89,7 +89,8 @@ int diff_line_fn(
e->line_adds++;
break;
case GIT_DIFF_LINE_ADD_EOFNL:
assert(0);
/* technically not a line add, but we'll count it as such */
e->line_adds++;
break;
case GIT_DIFF_LINE_DELETION:
e->line_dels++;
......
......@@ -114,3 +114,88 @@ void test_diff_diffiter__iterate_files_and_hunks(void)
git_diff_iterator_free(iter);
git_diff_list_free(diff);
}
void test_diff_diffiter__max_size_threshold(void)
{
git_repository *repo = cl_git_sandbox_init("status");
git_diff_options opts = {0};
git_diff_list *diff = NULL;
git_diff_iterator *iter;
git_diff_delta *delta;
int error, file_count = 0, binary_count = 0, hunk_count = 0;
opts.context_lines = 3;
opts.interhunk_lines = 1;
opts.flags |= GIT_DIFF_INCLUDE_IGNORED | GIT_DIFF_INCLUDE_UNTRACKED;
cl_git_pass(git_diff_workdir_to_index(repo, &opts, &diff));
cl_git_pass(git_diff_iterator_new(&iter, diff));
while ((error = git_diff_iterator_next_file(&delta, iter)) != GIT_ITEROVER) {
cl_assert_equal_i(0, error);
cl_assert(delta);
file_count++;
hunk_count += git_diff_iterator_num_hunks_in_file(iter);
assert(delta->binary == 0 || delta->binary == 1);
binary_count += delta->binary;
}
cl_assert_equal_i(GIT_ITEROVER, error);
cl_assert(delta == NULL);
cl_assert_equal_i(13, file_count);
cl_assert_equal_i(0, binary_count);
cl_assert_equal_i(8, hunk_count);
git_diff_iterator_free(iter);
git_diff_list_free(diff);
/* try again with low file size threshold */
file_count = 0;
binary_count = 0;
hunk_count = 0;
opts.context_lines = 3;
opts.interhunk_lines = 1;
opts.flags |= GIT_DIFF_INCLUDE_IGNORED | GIT_DIFF_INCLUDE_UNTRACKED;
opts.max_size = 50; /* treat anything over 50 bytes as binary! */
cl_git_pass(git_diff_workdir_to_index(repo, &opts, &diff));
cl_git_pass(git_diff_iterator_new(&iter, diff));
while ((error = git_diff_iterator_next_file(&delta, iter)) != GIT_ITEROVER) {
cl_assert_equal_i(0, error);
cl_assert(delta);
file_count++;
hunk_count += git_diff_iterator_num_hunks_in_file(iter);
assert(delta->binary == 0 || delta->binary == 1);
binary_count += delta->binary;
}
cl_assert_equal_i(GIT_ITEROVER, error);
cl_assert(delta == NULL);
cl_assert_equal_i(13, file_count);
/* Three files are over the 50 byte threshold:
* - staged_changes_file_deleted
* - staged_changes_modified_file
* - staged_new_file_modified_file
*/
cl_assert_equal_i(3, binary_count);
cl_assert_equal_i(5, hunk_count);
git_diff_iterator_free(iter);
git_diff_list_free(diff);
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment