Commit 960a04dd by Russell Belfer

Initial integration of similarity metric to diff

This is the initial integration of the similarity metric into
the `git_diff_find_similar()` code path.  The existing tests all
pass, but the new functionality isn't currently well tested.  The
integration does go through the pluggable metric interface, so it
should be possible to drop in an alternative to the internal
metric that libgit2 implements.

This comes along with a behavior change for an existing interface;
namely, passing two NULLs to git_diff_blobs (or passing NULLs to
git_diff_blob_to_buffer) will now call the file_cb parameter zero
times instead of one time.  I know it's strange that that change
is paired with this other change, but it emerged from some
initialization changes that I ended up making.
parent 71a3d27e
......@@ -379,7 +379,7 @@ typedef struct git_diff_patch git_diff_patch;
typedef enum {
/** look for renames? (`--find-renames`) */
GIT_DIFF_FIND_RENAMES = (1 << 0),
/** consider old size of modified for renames? (`--break-rewrites=N`) */
/** consider old side of modified for renames? (`--break-rewrites=N`) */
GIT_DIFF_FIND_RENAMES_FROM_REWRITES = (1 << 1),
/** look for copies? (a la `--find-copies`) */
......@@ -897,11 +897,12 @@ GIT_EXTERN(int) git_diff_patch_to_str(
*
* NULL is allowed for either `old_blob` or `new_blob` and will be treated
* as an empty blob, with the `oid` set to NULL in the `git_diff_file` data.
* Passing NULL for both blobs is a noop; no callbacks will be made at all.
*
* We do run a binary content check on the two blobs and if either of the
* blobs looks like binary data, the `git_diff_delta` binary attribute will
* be set to 1 and no call to the hunk_cb nor line_cb will be made (unless
* you pass `GIT_DIFF_FORCE_TEXT` of course).
* We do run a binary content check on the blob content and if either blob
* looks like binary data, the `git_diff_delta` binary attribute will be set
* to 1 and no call to the hunk_cb nor line_cb will be made (unless you pass
* `GIT_DIFF_FORCE_TEXT` of course).
*
* @return 0 on success, GIT_EUSER on non-zero callback, or error code
*/
......@@ -921,6 +922,11 @@ GIT_EXTERN(int) git_diff_blobs(
* so the `git_diff_file` parameters to the callbacks will be faked a la the
* rules for `git_diff_blobs()`.
*
* Passing NULL for `old_blob` will be treated as an empty blob (i.e. the
* `file_cb` will be invoked with GIT_DELTA_ADDED and the diff will be the
* entire content of the buffer added). Passing NULL to the buffer will do
* the reverse, with GIT_DELTA_REMOVED and blob content removed.
*
* @return 0 on success, GIT_EUSER on non-zero callback, or error code
*/
GIT_EXTERN(int) git_diff_blob_to_buffer(
......
......@@ -456,7 +456,7 @@ static void release_content(git_diff_file *file, git_map *map, git_blob *blob)
}
static void diff_context_init(
static int diff_context_init(
diff_context *ctxt,
git_diff_list *diff,
git_repository *repo,
......@@ -468,6 +468,12 @@ static void diff_context_init(
{
memset(ctxt, 0, sizeof(diff_context));
if (!repo && diff)
repo = diff->repo;
if (!opts && diff)
opts = &diff->opts;
ctxt->repo = repo;
ctxt->diff = diff;
ctxt->opts = opts;
......@@ -478,6 +484,8 @@ static void diff_context_init(
ctxt->error = 0;
setup_xdiff_options(ctxt->opts, &ctxt->xdiff_config, &ctxt->xdiff_params);
return 0;
}
static int diff_delta_file_callback(
......@@ -922,6 +930,15 @@ static int diff_patch_line_cb(
return 0;
}
static int diff_required(git_diff_list *diff, const char *action)
{
if (!diff) {
giterr_set(GITERR_INVALID, "Must provide valid diff to %s", action);
return -1;
}
return 0;
}
int git_diff_foreach(
git_diff_list *diff,
......@@ -935,9 +952,12 @@ int git_diff_foreach(
size_t idx;
git_diff_patch patch;
diff_context_init(
&ctxt, diff, diff->repo, &diff->opts,
file_cb, hunk_cb, data_cb, payload);
if (diff_required(diff, "git_diff_foreach") < 0)
return -1;
if (diff_context_init(
&ctxt, diff, NULL, NULL, file_cb, hunk_cb, data_cb, payload) < 0)
return -1;
diff_patch_init(&ctxt, &patch);
......@@ -1306,8 +1326,10 @@ static int diff_single_init(
memset(data, 0, sizeof(*data));
diff_context_init(
&data->ctxt, NULL, repo, opts, file_cb, hunk_cb, data_cb, payload);
if (diff_context_init(
&data->ctxt, NULL, repo, opts,
file_cb, hunk_cb, data_cb, payload) < 0)
return -1;
diff_patch_init(&data->ctxt, &data->patch);
......@@ -1374,6 +1396,9 @@ int git_diff_blobs(
new_blob ? git_object_owner((const git_object *)new_blob) :
old_blob ? git_object_owner((const git_object *)old_blob) : NULL;
if (!repo) /* Hmm, given two NULL blobs, silently do no callbacks? */
return 0;
if ((error = diff_single_init(
&d, repo, options, file_cb, hunk_cb, data_cb, payload)) < 0)
return error;
......@@ -1405,6 +1430,9 @@ int git_diff_blob_to_buffer(
git_repository *repo =
old_blob ? git_object_owner((const git_object *)old_blob) : NULL;
if (!repo && !buf) /* Hmm, given NULLs, silently do no callbacks? */
return 0;
if ((error = diff_single_init(
&d, repo, options, file_cb, hunk_cb, data_cb, payload)) < 0)
return error;
......@@ -1453,11 +1481,19 @@ int git_diff_get_patch(
if (patch_ptr)
*patch_ptr = NULL;
if (delta_ptr)
*delta_ptr = NULL;
if (diff_required(diff, "git_diff_get_patch") < 0)
return -1;
if (diff_context_init(
&ctxt, diff, NULL, NULL,
NULL, diff_patch_hunk_cb, diff_patch_line_cb, NULL) < 0)
return -1;
delta = git_vector_get(&diff->deltas, idx);
if (!delta) {
if (delta_ptr)
*delta_ptr = NULL;
giterr_set(GITERR_INVALID, "Index out of range for delta in diff");
return GIT_ENOTFOUND;
}
......@@ -1470,10 +1506,6 @@ int git_diff_get_patch(
(diff->opts.flags & GIT_DIFF_SKIP_BINARY_CHECK) != 0))
return 0;
diff_context_init(
&ctxt, diff, diff->repo, &diff->opts,
NULL, diff_patch_hunk_cb, diff_patch_line_cb, NULL);
if (git_diff_delta__should_skip(ctxt.opts, delta))
return 0;
......
......@@ -7,6 +7,7 @@
#include "common.h"
#include "diff.h"
#include "git2/config.h"
#include "git2/blob.h"
#include "hashsig.h"
static git_diff_delta *diff_delta__dup(
......@@ -362,24 +363,86 @@ on_error:
return -1;
}
typedef struct {
/* array of delta index * 2 + (old_file/new_file) -> file hashes */
git_hashsig *sigs;
} diff_similarity_cache;
GIT_INLINE(git_diff_file *) similarity_get_file(git_diff_list *diff, size_t idx)
{
git_diff_delta *delta = git_vector_get(&diff->deltas, idx / 2);
return (idx & 1) ? &delta->new_file : &delta->old_file;
}
static unsigned int calc_similarity(
void *ref, git_diff_file *old_file, git_diff_file *new_file)
static int similarity_calc(
git_diff_list *diff,
git_diff_find_options *opts,
size_t file_idx,
void **cache)
{
diff_similarity_cache *cache = ref;
int error = 0;
git_diff_file *file = similarity_get_file(diff, file_idx);
git_iterator_type_t src = (file_idx & 1) ? diff->old_src : diff->new_src;
GIT_UNUSED(cache);
if (src == GIT_ITERATOR_TYPE_WORKDIR) { /* compute hashsig from file */
git_buf path = GIT_BUF_INIT;
if (git_oid_cmp(&old_file->oid, &new_file->oid) == 0)
return 100;
/* TODO: apply wd-to-odb filters to file data if necessary */
if (!(error = git_buf_joinpath(
&path, git_repository_workdir(diff->repo), file->path)))
error = opts->metric->file_signature(
&cache[file_idx], file, path.ptr, opts->metric->payload);
git_buf_free(&path);
} else { /* compute hashsig from blob buffer */
git_blob *blob = NULL;
/* TODO: insert actual similarity algo here */
/* TODO: add max size threshold a la diff? */
if ((error = git_blob_lookup(&blob, diff->repo, &file->oid)) < 0)
return error;
error = opts->metric->buffer_signature(
&cache[file_idx], file, git_blob_rawcontent(blob),
git_blob_rawsize(blob), opts->metric->payload);
git_blob_free(blob);
}
return error;
}
static int similarity_measure(
git_diff_list *diff,
git_diff_find_options *opts,
void **cache,
size_t a_idx,
size_t b_idx)
{
int score = 0;
git_diff_file *a_file = similarity_get_file(diff, a_idx);
git_diff_file *b_file = similarity_get_file(diff, b_idx);
if (GIT_MODE_TYPE(a_file->mode) != GIT_MODE_TYPE(b_file->mode))
return 0;
if (git_oid_cmp(&a_file->oid, &b_file->oid) == 0)
return 100;
/* update signature cache if needed */
if (!cache[a_idx] && similarity_calc(diff, opts, a_idx, cache) < 0)
return -1;
if (!cache[b_idx] && similarity_calc(diff, opts, b_idx, cache) < 0)
return -1;
/* compare signatures */
if (opts->metric->similarity(
&score, cache[a_idx], cache[b_idx], opts->metric->payload) < 0)
return -1;
/* clip score */
if (score < 0)
score = 0;
else if (score > 100)
score = 100;
return score;
}
#define FLAG_SET(opts,flag_name) ((opts.flags & flag_name) != 0)
......@@ -388,67 +451,85 @@ int git_diff_find_similar(
git_diff_list *diff,
git_diff_find_options *given_opts)
{
unsigned int i, j, similarity;
size_t i, j, cache_size, *matches;
int error = 0, similarity;
git_diff_delta *from, *to;
git_diff_find_options opts;
unsigned int tried_targets, num_changes = 0;
git_vector matches = GIT_VECTOR_INIT;
size_t tried_targets, num_rewrites = 0;
void **cache;
if (normalize_find_opts(diff, &opts, given_opts) < 0)
return -1;
if ((error = normalize_find_opts(diff, &opts, given_opts)) < 0)
return error;
/* TODO: maybe abort if deltas.length > target_limit ??? */
cache_size = diff->deltas.length * 2; /* must store b/c length may change */
cache = git__calloc(cache_size, sizeof(void *));
GITERR_CHECK_ALLOC(cache);
/* first do splits if requested */
matches = git__calloc(diff->deltas.length, sizeof(size_t));
GITERR_CHECK_ALLOC(matches);
/* first break MODIFIED records that are too different (if requested) */
if (FLAG_SET(opts, GIT_DIFF_FIND_AND_BREAK_REWRITES)) {
git_vector_foreach(&diff->deltas, i, from) {
if (from->status != GIT_DELTA_MODIFIED)
continue;
/* Right now, this doesn't work right because the similarity
* algorithm isn't actually implemented...
*/
similarity = 100;
/* calc_similarity(NULL, &from->old_file, from->new_file); */
similarity = similarity_measure(
diff, &opts, cache, 2 * i, 2 * i + 1);
if (similarity < 0) {
error = similarity;
goto cleanup;
}
if (similarity < opts.break_rewrite_threshold) {
if ((unsigned int)similarity < opts.break_rewrite_threshold) {
from->flags |= GIT_DIFF_FLAG__TO_SPLIT;
num_changes++;
num_rewrites++;
}
}
/* apply splits as needed */
if (num_changes > 0 &&
apply_splits_and_deletes(
diff, diff->deltas.length + num_changes) < 0)
return -1;
}
/* next find the most similar delta for each rename / copy candidate */
if (git_vector_init(&matches, diff->deltas.length, git_diff_delta__cmp) < 0)
return -1;
git_vector_foreach(&diff->deltas, i, from) {
tried_targets = 0;
/* skip things that aren't blobs */
if (GIT_MODE_TYPE(from->old_file.mode) !=
GIT_MODE_TYPE(GIT_FILEMODE_BLOB))
continue;
git_vector_foreach(&diff->deltas, j, to) {
if (i == j)
continue;
/* skip things that aren't blobs */
if (GIT_MODE_TYPE(to->new_file.mode) !=
GIT_MODE_TYPE(GIT_FILEMODE_BLOB))
continue;
switch (to->status) {
case GIT_DELTA_ADDED:
case GIT_DELTA_UNTRACKED:
case GIT_DELTA_RENAMED:
case GIT_DELTA_COPIED:
break;
case GIT_DELTA_MODIFIED:
if ((to->flags & GIT_DIFF_FLAG__TO_SPLIT) == 0)
continue;
break;
default:
/* only the above status values should be checked */
continue;
}
/* skip all but DELETED files unless copy detection is on */
if (from->status != GIT_DELTA_DELETED &&
!FLAG_SET(opts, GIT_DIFF_FIND_COPIES))
if (!FLAG_SET(opts, GIT_DIFF_FIND_COPIES) &&
from->status != GIT_DELTA_DELETED &&
(from->flags & GIT_DIFF_FLAG__TO_SPLIT) == 0)
continue;
/* don't check UNMODIFIED files as source unless given option */
......@@ -463,34 +544,44 @@ int git_diff_find_similar(
/* calculate similarity and see if this pair beats the
* similarity score of the current best pair.
*/
similarity = calc_similarity(NULL, &from->old_file, &to->new_file);
similarity = similarity_measure(
diff, &opts, cache, 2 * i, 2 * j + 1);
if (to->similarity < similarity) {
to->similarity = similarity;
if (git_vector_set(NULL, &matches, j, from) < 0)
return -1;
if (similarity < 0) {
error = similarity;
goto cleanup;
}
if (to->similarity < (unsigned int)similarity) {
to->similarity = (unsigned int)similarity;
matches[j] = i + 1;
}
}
}
/* next rewrite the diffs with renames / copies */
num_changes = 0;
num_rewrites = 0;
git_vector_foreach(&diff->deltas, j, to) {
from = GIT_VECTOR_GET(&matches, j);
if (!from) {
if (!matches[j]) {
assert(to->similarity == 0);
continue;
}
/* three possible outcomes here:
i = matches[j] - 1;
from = GIT_VECTOR_GET(&diff->deltas, i);
assert(from);
/* four possible outcomes here:
* 1. old DELETED and if over rename threshold,
* new becomes RENAMED and old goes away
* 2. old was MODIFIED but FIND_RENAMES_FROM_REWRITES is on and
* 2. old SPLIT and if over rename threshold,
* new becomes RENAMED and old becomes ADDED (clear SPLIT)
* 3. old was MODIFIED but FIND_RENAMES_FROM_REWRITES is on and
* old is more similar to new than it is to itself, in which
* case, new becomes RENAMED and old becomed ADDED
* 3. otherwise if over copy threshold, new becomes COPIED
* 4. otherwise if over copy threshold, new becomes COPIED
*/
if (from->status == GIT_DELTA_DELETED) {
......@@ -503,7 +594,26 @@ int git_diff_find_similar(
memcpy(&to->old_file, &from->old_file, sizeof(to->old_file));
from->flags |= GIT_DIFF_FLAG__TO_DELETE;
num_changes++;
num_rewrites++;
continue;
}
if (from->status == GIT_DELTA_MODIFIED &&
(from->flags & GIT_DIFF_FLAG__TO_SPLIT) != 0)
{
if (to->similarity < opts.rename_threshold) {
to->similarity = 0;
continue;
}
to->status = GIT_DELTA_RENAMED;
memcpy(&to->old_file, &from->old_file, sizeof(to->old_file));
from->status = GIT_DELTA_ADDED;
from->flags &= ~GIT_DIFF_FLAG__TO_SPLIT;
memset(&from->old_file, 0, sizeof(from->old_file));
num_rewrites--;
continue;
}
......@@ -512,10 +622,15 @@ int git_diff_find_similar(
FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES) &&
to->similarity > opts.rename_threshold)
{
similarity = 100;
/* calc_similarity(NULL, &from->old_file, from->new_file); */
similarity = similarity_measure(
diff, &opts, cache, 2 * i, 2 * i + 1);
if (similarity < opts.rename_from_rewrite_threshold) {
if (similarity < 0) {
error = similarity;
goto cleanup;
}
if ((unsigned int)similarity < opts.rename_from_rewrite_threshold) {
to->status = GIT_DELTA_RENAMED;
memcpy(&to->old_file, &from->old_file, sizeof(to->old_file));
......@@ -538,17 +653,23 @@ int git_diff_find_similar(
memcpy(&to->old_file, &from->old_file, sizeof(to->old_file));
}
git_vector_free(&matches);
if (num_rewrites > 0) {
assert(num_rewrites < diff->deltas.length);
if (num_changes > 0) {
assert(num_changes < diff->deltas.length);
error = apply_splits_and_deletes(
diff, diff->deltas.length - num_rewrites);
}
if (apply_splits_and_deletes(
diff, diff->deltas.length - num_changes) < 0)
return -1;
cleanup:
git__free(matches);
for (i = 0; i < cache_size; ++i) {
if (cache[i] != NULL)
opts.metric->free_signature(cache[i], opts.metric->payload);
}
git__free(cache);
return 0;
return error;
}
#undef FLAG_SET
......@@ -196,7 +196,7 @@ void test_diff_blob__can_compare_identical_blobs(void)
NULL, NULL, &opts, diff_file_cb, diff_hunk_cb, diff_line_cb, &expected));
cl_assert_equal_i(0, expected.files_binary);
assert_identical_blobs_comparison(&expected);
cl_assert_equal_i(0, expected.files); /* NULLs mean no callbacks, period */
memset(&expected, 0, sizeof(expected));
cl_git_pass(git_diff_blobs(
......@@ -399,7 +399,7 @@ void test_diff_blob__can_compare_blob_to_buffer(void)
assert_identical_blobs_comparison(&expected);
/* diff from NULL blob to content of b */
/* diff from NULL blob to content of a */
memset(&expected, 0, sizeof(expected));
cl_git_pass(git_diff_blob_to_buffer(
NULL, a_content, strlen(a_content),
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment