Commit 9be5be47 by Russell Belfer

More git_diff_find_similar improvements

- Add new GIT_DIFF_FIND_EXACT_MATCH_ONLY flag to do similarity
  matching without using the similarity metric (i.e. only compare
  the SHA).
- Clean up the similarity measurement code to more rigorously
  distinguish between files that are not similar and files that
  are not comparable (previously, a 0 could either mean that the
  files could not be compared or that they were totally different).
- When splitting a MODIFIED file into a DELETE/ADD pair, actually
  make a DELETED/UNTRACKED pair if the right side of the diff is
  from the working directory.  This prevents an odd mix of ADDED
  and UNTRACKED files on workdir diffs.
parent 5c8f37a3
...@@ -441,6 +441,8 @@ typedef enum { ...@@ -441,6 +441,8 @@ typedef enum {
GIT_DIFF_FIND_IGNORE_WHITESPACE = (1 << 12), GIT_DIFF_FIND_IGNORE_WHITESPACE = (1 << 12),
/** measure similarity including all data */ /** measure similarity including all data */
GIT_DIFF_FIND_DONT_IGNORE_WHITESPACE = (1 << 13), GIT_DIFF_FIND_DONT_IGNORE_WHITESPACE = (1 << 13),
/** measure similarity only by comparing SHAs (fast and cheap) */
GIT_DIFF_FIND_EXACT_MATCH_ONLY = (1 << 14),
} git_diff_find_t; } git_diff_find_t;
/** /**
......
...@@ -255,6 +255,16 @@ static int normalize_find_opts( ...@@ -255,6 +255,16 @@ static int normalize_find_opts(
/* some flags imply others */ /* some flags imply others */
if (opts->flags & GIT_DIFF_FIND_EXACT_MATCH_ONLY) {
/* if we are only looking for exact matches, then don't turn
* MODIFIED items into ADD/DELETE pairs because it's too picky
*/
opts->flags &= ~(GIT_DIFF_FIND_REWRITES | GIT_DIFF_BREAK_REWRITES);
/* similarly, don't look for self-rewrites to split */
opts->flags &= ~GIT_DIFF_FIND_RENAMES_FROM_REWRITES;
}
if (opts->flags & GIT_DIFF_FIND_RENAMES_FROM_REWRITES) if (opts->flags & GIT_DIFF_FIND_RENAMES_FROM_REWRITES)
opts->flags |= GIT_DIFF_FIND_RENAMES; opts->flags |= GIT_DIFF_FIND_RENAMES;
...@@ -373,7 +383,10 @@ static int apply_splits_and_deletes( ...@@ -373,7 +383,10 @@ static int apply_splits_and_deletes(
if (git_vector_insert(&onto, deleted) < 0) if (git_vector_insert(&onto, deleted) < 0)
goto on_error; goto on_error;
delta->status = GIT_DELTA_ADDED; if (diff->new_src == GIT_ITERATOR_TYPE_WORKDIR)
delta->status = GIT_DELTA_UNTRACKED;
else
delta->status = GIT_DELTA_ADDED;
memset(&delta->old_file, 0, sizeof(delta->old_file)); memset(&delta->old_file, 0, sizeof(delta->old_file));
delta->old_file.path = delta->new_file.path; delta->old_file.path = delta->new_file.path;
delta->old_file.flags |= GIT_DIFF_FLAG_VALID_OID; delta->old_file.flags |= GIT_DIFF_FLAG_VALID_OID;
...@@ -460,22 +473,56 @@ static int similarity_calc( ...@@ -460,22 +473,56 @@ static int similarity_calc(
return error; return error;
} }
#define FLAG_SET(opts,flag_name) (((opts).flags & flag_name) != 0)
/* - score < 0 means files cannot be compared
* - score >= 100 means files are exact match
* - score == 0 means files are completely different
*/
static int similarity_measure( static int similarity_measure(
int *score,
git_diff_list *diff, git_diff_list *diff,
git_diff_find_options *opts, git_diff_find_options *opts,
void **cache, void **cache,
size_t a_idx, size_t a_idx,
size_t b_idx) size_t b_idx)
{ {
int score = 0;
git_diff_file *a_file = similarity_get_file(diff, a_idx); git_diff_file *a_file = similarity_get_file(diff, a_idx);
git_diff_file *b_file = similarity_get_file(diff, b_idx); git_diff_file *b_file = similarity_get_file(diff, b_idx);
bool exact_match = FLAG_SET(*opts, GIT_DIFF_FIND_EXACT_MATCH_ONLY);
*score = -1;
/* don't try to compare files of different types */
if (GIT_MODE_TYPE(a_file->mode) != GIT_MODE_TYPE(b_file->mode)) if (GIT_MODE_TYPE(a_file->mode) != GIT_MODE_TYPE(b_file->mode))
return 0; return 0;
if (git_oid__cmp(&a_file->oid, &b_file->oid) == 0) /* if exact match is requested, force calculation of missing OIDs */
return 100; if (exact_match) {
if (git_oid_iszero(&a_file->oid) &&
diff->old_src == GIT_ITERATOR_TYPE_WORKDIR &&
!git_diff__oid_for_file(diff->repo, a_file->path,
a_file->mode, a_file->size, &a_file->oid))
a_file->flags |= GIT_DIFF_FLAG_VALID_OID;
if (git_oid_iszero(&b_file->oid) &&
diff->new_src == GIT_ITERATOR_TYPE_WORKDIR &&
!git_diff__oid_for_file(diff->repo, b_file->path,
b_file->mode, b_file->size, &b_file->oid))
b_file->flags |= GIT_DIFF_FLAG_VALID_OID;
}
/* check OID match as a quick test */
if (git_oid__cmp(&a_file->oid, &b_file->oid) == 0) {
*score = 100;
return 0;
}
/* don't calculate signatures if we are doing exact match */
if (exact_match) {
*score = 0;
return 0;
}
/* update signature cache if needed */ /* update signature cache if needed */
if (!cache[a_idx] && similarity_calc(diff, opts, a_idx, cache) < 0) if (!cache[a_idx] && similarity_calc(diff, opts, a_idx, cache) < 0)
...@@ -488,20 +535,33 @@ static int similarity_measure( ...@@ -488,20 +535,33 @@ static int similarity_measure(
return 0; return 0;
/* compare signatures */ /* compare signatures */
if (opts->metric->similarity( return opts->metric->similarity(
&score, cache[a_idx], cache[b_idx], opts->metric->payload) < 0) score, cache[a_idx], cache[b_idx], opts->metric->payload);
return -1;
/* clip score */
if (score < 1)
score = 1; /* zero means uncomparable, so use 1 for least similar */
else if (score > 100)
score = 100;
return score;
} }
#define FLAG_SET(opts,flag_name) ((opts.flags & flag_name) != 0) static void convert_to_rename_and_add(
git_diff_list *diff,
git_diff_delta *from,
git_diff_delta *to,
int similarity)
{
to->status = GIT_DELTA_RENAMED;
to->flags &= ~GIT_DIFF_FLAG__TO_SPLIT; /* ensure no split */
to->similarity = (uint32_t)similarity;
memcpy(&to->old_file, &from->old_file, sizeof(to->old_file));
validate_delta(to);
if (diff->new_src == GIT_ITERATOR_TYPE_WORKDIR)
from->status = GIT_DELTA_UNTRACKED;
else
from->status = GIT_DELTA_ADDED;
from->flags &= ~GIT_DIFF_FLAG__TO_SPLIT; /* ensure no split */
from->similarity = 0;
memset(&from->old_file, 0, sizeof(from->old_file));
from->old_file.path = from->new_file.path;
from->old_file.flags |= GIT_DIFF_FLAG_VALID_OID;
validate_delta(from);
}
typedef struct { typedef struct {
uint32_t idx; uint32_t idx;
...@@ -542,21 +602,17 @@ int git_diff_find_similar( ...@@ -542,21 +602,17 @@ int git_diff_find_similar(
continue; continue;
/* skip things that aren't plain blobs */ /* skip things that aren't plain blobs */
if (GIT_MODE_TYPE(from->old_file.mode) != if (!GIT_MODE_ISBLOB(from->old_file.mode))
GIT_MODE_TYPE(GIT_FILEMODE_BLOB))
continue; continue;
/* measure similarity from old_file to new_file */ /* measure similarity from old_file to new_file */
similarity = similarity_measure( if ((error = similarity_measure(
diff, &opts, cache, 2 * i, 2 * i + 1); &similarity, diff, &opts, cache, 2 * i, 2 * i + 1)) < 0)
if (similarity < 0) {
error = similarity;
goto cleanup; goto cleanup;
}
if (similarity > 0 && if (similarity < 0)
similarity < (int)opts.break_rewrite_threshold) { continue;
if (similarity < (int)opts.break_rewrite_threshold) {
from->similarity = (uint32_t)similarity; from->similarity = (uint32_t)similarity;
from->flags |= GIT_DIFF_FLAG__TO_SPLIT; from->flags |= GIT_DIFF_FLAG__TO_SPLIT;
num_rewrites++; num_rewrites++;
...@@ -573,8 +629,7 @@ int git_diff_find_similar( ...@@ -573,8 +629,7 @@ int git_diff_find_similar(
matches[i].similarity = 0; matches[i].similarity = 0;
/* skip things that aren't plain blobs */ /* skip things that aren't plain blobs */
if (GIT_MODE_TYPE(from->old_file.mode) != if (!GIT_MODE_ISBLOB(from->old_file.mode))
GIT_MODE_TYPE(GIT_FILEMODE_BLOB))
continue; continue;
/* don't check UNMODIFIED files as source unless given option */ /* don't check UNMODIFIED files as source unless given option */
...@@ -599,8 +654,7 @@ int git_diff_find_similar( ...@@ -599,8 +654,7 @@ int git_diff_find_similar(
continue; continue;
/* skip things that aren't blobs */ /* skip things that aren't blobs */
if (GIT_MODE_TYPE(to->new_file.mode) != if (!GIT_MODE_ISBLOB(to->new_file.mode))
GIT_MODE_TYPE(GIT_FILEMODE_BLOB))
continue; continue;
/* only consider ADDED, RENAMED, COPIED, and split MODIFIED as /* only consider ADDED, RENAMED, COPIED, and split MODIFIED as
...@@ -630,14 +684,13 @@ int git_diff_find_similar( ...@@ -630,14 +684,13 @@ int git_diff_find_similar(
break; break;
/* calculate similarity for this pair and find best match */ /* calculate similarity for this pair and find best match */
similarity = similarity_measure( if ((error = similarity_measure(
diff, &opts, cache, 2 * i, 2 * j + 1); &similarity, diff, &opts, cache, 2 * i, 2 * j + 1)) < 0)
if (similarity < 0) {
error = similarity;
goto cleanup; goto cleanup;
if (similarity < 0) {
--tried_targets;
continue;
} }
if (matches[i].similarity < (uint32_t)similarity) { if (matches[i].similarity < (uint32_t)similarity) {
matches[i].similarity = (uint32_t)similarity; matches[i].similarity = (uint32_t)similarity;
matches[i].idx = j; matches[i].idx = j;
...@@ -687,18 +740,7 @@ int git_diff_find_similar( ...@@ -687,18 +740,7 @@ int git_diff_find_similar(
if (similarity < (int)opts.rename_threshold) if (similarity < (int)opts.rename_threshold)
continue; continue;
to->status = GIT_DELTA_RENAMED; convert_to_rename_and_add(diff, from, to, similarity);
to->similarity = (uint32_t)similarity;
memcpy(&to->old_file, &from->old_file, sizeof(to->old_file));
validate_delta(to);
from->status = GIT_DELTA_ADDED;
from->flags &= ~GIT_DIFF_FLAG__TO_SPLIT;
from->similarity = 0; /* reset self-similarity */
memset(&from->old_file, 0, sizeof(from->old_file));
from->old_file.path = from->new_file.path;
validate_delta(from);
num_rewrites--; num_rewrites--;
num_updates++; num_updates++;
continue; continue;
...@@ -712,28 +754,16 @@ int git_diff_find_similar( ...@@ -712,28 +754,16 @@ int git_diff_find_similar(
FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES) && FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES) &&
similarity > (int)opts.rename_threshold) similarity > (int)opts.rename_threshold)
{ {
int self_similarity = similarity_measure( int self_similarity;
diff, &opts, cache, 2 * i, 2 * i + 1);
if (self_similarity < 0) { if ((error = similarity_measure(&self_similarity,
error = self_similarity; diff, &opts, cache, 2 * i, 2 * i + 1)) < 0)
goto cleanup; goto cleanup;
}
if (self_similarity < (int)opts.rename_from_rewrite_threshold) { if (self_similarity >= 0 &&
to->status = GIT_DELTA_RENAMED; self_similarity < (int)opts.rename_from_rewrite_threshold) {
to->flags &= ~GIT_DIFF_FLAG__TO_SPLIT; /* ensure no split */
to->similarity = (uint32_t)similarity;
memcpy(&to->old_file, &from->old_file, sizeof(to->old_file));
validate_delta(to);
from->status = GIT_DELTA_ADDED;
from->flags &= ~GIT_DIFF_FLAG__TO_SPLIT; /* ensure no split */
from->similarity = 0;
memset(&from->old_file, 0, sizeof(from->old_file));
from->old_file.path = from->new_file.path;
from->old_file.flags |= GIT_DIFF_FLAG_VALID_OID;
validate_delta(from);
convert_to_rename_and_add(diff, from, to, similarity);
num_updates++; num_updates++;
continue; continue;
} }
...@@ -754,13 +784,10 @@ int git_diff_find_similar( ...@@ -754,13 +784,10 @@ int git_diff_find_similar(
num_updates++; num_updates++;
} }
if (num_rewrites > 0) { if (num_rewrites > 0)
assert(num_rewrites < diff->deltas.length);
error = apply_splits_and_deletes( error = apply_splits_and_deletes(
diff, diff->deltas.length - num_rewrites, diff, diff->deltas.length - num_rewrites,
FLAG_SET(opts, GIT_DIFF_BREAK_REWRITES)); FLAG_SET(opts, GIT_DIFF_BREAK_REWRITES));
}
if (num_rewrites > 0 || num_updates > 0) if (num_rewrites > 0 || num_updates > 0)
git_vector_sort(&diff->deltas); git_vector_sort(&diff->deltas);
......
...@@ -223,6 +223,7 @@ extern git_off_t git_futils_filesize(git_file fd); ...@@ -223,6 +223,7 @@ extern git_off_t git_futils_filesize(git_file fd);
#define GIT_MODE_PERMS_MASK 0777 #define GIT_MODE_PERMS_MASK 0777
#define GIT_CANONICAL_PERMS(MODE) (((MODE) & 0100) ? 0755 : 0644) #define GIT_CANONICAL_PERMS(MODE) (((MODE) & 0100) ? 0755 : 0644)
#define GIT_MODE_TYPE(MODE) ((MODE) & ~GIT_MODE_PERMS_MASK) #define GIT_MODE_TYPE(MODE) ((MODE) & ~GIT_MODE_PERMS_MASK)
#define GIT_MODE_ISBLOB(MODE) (GIT_MODE_TYPE(MODE) == GIT_MODE_TYPE(GIT_FILEMODE_BLOB))
/** /**
* Convert a mode_t from the OS to a legal git mode_t value. * Convert a mode_t from the OS to a legal git mode_t value.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment