Commit a5140f4d by Russell Belfer

Fix rename detection for tree-to-tree diffs

The performance improvements I introduced for rename detection
were not able to run successfully for tree-to-tree diffs because
the blob size was not known early enough and so the file signature
always had to be calculated nonetheless.

This change separates loading blobs into memory from calculating
the signature.  I can't avoid having to load the large blobs into
memory, but by moving it forward, I'm able to avoid the signature
calculation if the blob won't come into play for renames.
parent f5c4d022
...@@ -408,54 +408,90 @@ GIT_INLINE(git_diff_file *) similarity_get_file(git_diff_list *diff, size_t idx) ...@@ -408,54 +408,90 @@ GIT_INLINE(git_diff_file *) similarity_get_file(git_diff_list *diff, size_t idx)
return (idx & 1) ? &delta->new_file : &delta->old_file; return (idx & 1) ? &delta->new_file : &delta->old_file;
} }
static int similarity_calc( typedef struct {
git_diff_list *diff, size_t idx;
const git_diff_find_options *opts, git_iterator_type_t src;
size_t file_idx, git_repository *repo;
void **cache) git_diff_file *file;
git_buf data;
git_blob *blob;
int loaded;
} similarity_info;
static void similarity_init(
similarity_info *info, git_diff_list *diff, size_t file_idx)
{
info->idx = file_idx;
info->src = (file_idx & 1) ? diff->new_src : diff->old_src;
info->repo = diff->repo;
info->file = similarity_get_file(diff, file_idx);
info->blob = NULL;
info->loaded = 0;
git_buf_init(&info->data, 0);
}
static int similarity_load(similarity_info *info)
{ {
int error = 0; int error = 0;
git_diff_file *file = similarity_get_file(diff, file_idx); git_diff_file *file = info->file;
git_iterator_type_t src = (file_idx & 1) ? diff->new_src : diff->old_src;
if (src == GIT_ITERATOR_TYPE_WORKDIR) { /* compute hashsig from file */ if (info->src == GIT_ITERATOR_TYPE_WORKDIR) {
git_buf path = GIT_BUF_INIT; error = git_buf_joinpath(
&info->data, git_repository_workdir(info->repo), file->path);
/* TODO: apply wd-to-odb filters to file data if necessary */ /* if path is not a regular file, just skip this item */
if (!error && !git_path_isfile(info->data.ptr))
git_buf_free(&info->data);
} else if (git_blob_lookup(&info->blob, info->repo, &file->oid) < 0) {
/* if lookup fails, just skip this item in similarity calc */
giterr_clear();
} else {
if (!file->size)
file->size = git_blob_rawsize(info->blob);
assert(file->size == git_blob_rawsize(info->blob));
if ((error = git_buf_joinpath( info->data.size = (size_t)(git__is_sizet(file->size) ? file->size : -1);
&path, git_repository_workdir(diff->repo), file->path)) < 0) info->data.ptr = (char *)git_blob_rawcontent(info->blob);
return error; }
/* if path is not a regular file, just skip this item */ info->loaded = 1;
if (git_path_isfile(path.ptr))
error = opts->metric->file_signature(
&cache[file_idx], file, path.ptr, opts->metric->payload);
git_buf_free(&path); return error;
} else { /* compute hashsig from blob buffer */ }
git_blob *blob = NULL;
git_off_t blobsize;
/* TODO: add max size threshold a la diff? */ static void similarity_unload(similarity_info *info)
{
if (info->blob)
git_blob_free(info->blob);
else
git_buf_free(&info->data);
if (git_blob_lookup(&blob, diff->repo, &file->oid) < 0) { info->loaded = 0;
/* if lookup fails, just skip this item in similarity calc */ }
giterr_clear();
return 0;
}
blobsize = git_blob_rawsize(blob); static int similarity_calc(
if (!file->size) similarity_info *info,
file->size = blobsize; const git_diff_find_options *opts,
if (!git__is_sizet(blobsize)) /* ? what to do ? */ void **cache)
blobsize = (size_t)-1; {
int error = 0;
error = opts->metric->buffer_signature( if (!info->loaded && (error = similarity_load(info)) < 0)
&cache[file_idx], file, git_blob_rawcontent(blob), return error;
(size_t)blobsize, opts->metric->payload);
git_blob_free(blob); if (!info->data.size)
return 0;
if (info->src == GIT_ITERATOR_TYPE_WORKDIR) {
/* TODO: apply wd-to-odb filters to file data if necessary */
error = opts->metric->file_signature(
&cache[info->idx], info->file,
info->data.ptr, opts->metric->payload);
} else {
error = opts->metric->buffer_signature(
&cache[info->idx], info->file,
info->data.ptr, info->data.size, opts->metric->payload);
} }
return error; return error;
...@@ -478,6 +514,8 @@ static int similarity_measure( ...@@ -478,6 +514,8 @@ static int similarity_measure(
git_diff_file *a_file = similarity_get_file(diff, a_idx); git_diff_file *a_file = similarity_get_file(diff, a_idx);
git_diff_file *b_file = similarity_get_file(diff, b_idx); git_diff_file *b_file = similarity_get_file(diff, b_idx);
bool exact_match = FLAG_SET(opts, GIT_DIFF_FIND_EXACT_MATCH_ONLY); bool exact_match = FLAG_SET(opts, GIT_DIFF_FIND_EXACT_MATCH_ONLY);
int error = 0;
similarity_info a_info, b_info;
*score = -1; *score = -1;
...@@ -512,26 +550,39 @@ static int similarity_measure( ...@@ -512,26 +550,39 @@ static int similarity_measure(
return 0; return 0;
} }
similarity_init(&a_info, diff, a_idx);
similarity_init(&b_info, diff, b_idx);
if (!a_file->size && (error = similarity_load(&a_info)) < 0)
goto done;
if (!b_file->size && (error = similarity_load(&b_info)) < 0)
goto done;
/* check if file sizes are nowhere near each other */ /* check if file sizes are nowhere near each other */
if (a_file->size > 127 && if (a_file->size > 127 &&
b_file->size > 127 && b_file->size > 127 &&
(a_file->size > (b_file->size << 4) || (a_file->size > (b_file->size << 4) ||
b_file->size > (a_file->size << 4))) b_file->size > (a_file->size << 4)))
return 0; goto done;
/* update signature cache if needed */ /* update signature cache if needed */
if (!cache[a_idx] && similarity_calc(diff, opts, a_idx, cache) < 0) if (!cache[a_idx] && (error = similarity_calc(&a_info, opts, cache)) < 0)
return -1; goto done;
if (!cache[b_idx] && similarity_calc(diff, opts, b_idx, cache) < 0) if (!cache[b_idx] && (error = similarity_calc(&b_info, opts, cache)) < 0)
return -1; goto done;
/* some metrics may not wish to process this file (too big / too small) */ /* calculate similarity provided that the metric choose to process
if (!cache[a_idx] || !cache[b_idx]) * both the a and b files (some may not if file is too big, etc).
return 0; */
if (cache[a_idx] && cache[b_idx])
error = opts->metric->similarity(
score, cache[a_idx], cache[b_idx], opts->metric->payload);
/* compare signatures */ done:
return opts->metric->similarity( similarity_unload(&a_info);
score, cache[a_idx], cache[b_idx], opts->metric->payload); similarity_unload(&b_info);
return error;
} }
static int calc_self_similarity( static int calc_self_similarity(
......
...@@ -1126,7 +1126,7 @@ void test_diff_rename__unmodified_can_be_renamed(void) ...@@ -1126,7 +1126,7 @@ void test_diff_rename__unmodified_can_be_renamed(void)
void test_diff_rename__many_files(void) void test_diff_rename__many_files(void)
{ {
git_index *index; git_index *index;
git_tree *tree; git_tree *tree, *new_tree;
git_diff_list *diff = NULL; git_diff_list *diff = NULL;
diff_expects exp; diff_expects exp;
git_diff_options diffopts = GIT_DIFF_OPTIONS_INIT; git_diff_options diffopts = GIT_DIFF_OPTIONS_INIT;
...@@ -1176,6 +1176,52 @@ void test_diff_rename__many_files(void) ...@@ -1176,6 +1176,52 @@ void test_diff_rename__many_files(void)
cl_assert_equal_i(51, exp.files); cl_assert_equal_i(51, exp.files);
git_diff_list_free(diff); git_diff_list_free(diff);
git_index_free(index);
{
git_object *parent;
git_signature *sig;
git_oid tree_id, commit_id;
git_reference *ref;
cl_git_pass(git_index_write_tree(&tree_id, index));
cl_git_pass(git_tree_lookup(&new_tree, g_repo, &tree_id));
cl_git_pass(git_revparse_ext(&parent, &ref, g_repo, "HEAD"));
cl_git_pass(git_signature_new(
&sig, "Sm Test", "sm@tester.test", 1372350000, 480));
cl_git_pass(git_commit_create_v(
&commit_id, g_repo, git_reference_name(ref), sig, sig,
NULL, "yoyoyo", new_tree, 1, parent));
git_object_free(parent);
git_reference_free(ref);
git_signature_free(sig);
}
cl_git_pass(git_diff_tree_to_tree(
&diff, g_repo, tree, new_tree, &diffopts));
memset(&exp, 0, sizeof(exp));
cl_git_pass(git_diff_foreach(
diff, diff_file_cb, NULL, NULL, &exp));
cl_assert_equal_i(1, exp.file_status[GIT_DELTA_DELETED]);
cl_assert_equal_i(51, exp.file_status[GIT_DELTA_ADDED]);
cl_assert_equal_i(52, exp.files);
opts.flags = GIT_DIFF_FIND_ALL;
cl_git_pass(git_diff_find_similar(diff, &opts));
memset(&exp, 0, sizeof(exp));
cl_git_pass(git_diff_foreach(
diff, diff_file_cb, NULL, NULL, &exp));
cl_assert_equal_i(1, exp.file_status[GIT_DELTA_RENAMED]);
cl_assert_equal_i(50, exp.file_status[GIT_DELTA_ADDED]);
cl_assert_equal_i(51, exp.files);
git_diff_list_free(diff);
git_tree_free(new_tree);
git_tree_free(tree); git_tree_free(tree);
git_index_free(index);
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment