Commit a21cbb12 by Russell Belfer

Significant rename detection rewrite

This flips rename detection around so instead of creating a
forward mapping from deltas to possible rename targets, instead
it creates a reverse mapping, looking at possible targets and
trying to find a source that they could have been renamed or
copied from.  This is important because each output can only
have a single source, but a given source could map to multiple
outputs (in the form of COPIED records).

Additionally, this makes a couple of tweaks to the public rename
detection APIs, mostly renaming a couple of options that control
the behavior to make more sense and to be more like core Git.

I walked through the tests looking at the exact results and
updated the expectations based on what I saw.  The new code is
different from the old because it cannot give some nonsense
results (like A was renamed to both B and C) which were part of
the outputs previously.
parent 4742148d
...@@ -429,8 +429,8 @@ typedef enum { ...@@ -429,8 +429,8 @@ typedef enum {
GIT_DIFF_FIND_AND_BREAK_REWRITES = GIT_DIFF_FIND_AND_BREAK_REWRITES =
(GIT_DIFF_FIND_REWRITES | GIT_DIFF_BREAK_REWRITES), (GIT_DIFF_FIND_REWRITES | GIT_DIFF_BREAK_REWRITES),
/** consider untracked files as rename/copy targets */ /** find renames/copies for untracked items in working directory */
GIT_DIFF_FIND_FROM_UNTRACKED = (1 << 6), GIT_DIFF_FIND_FOR_UNTRACKED = (1 << 6),
/** turn on all finding features */ /** turn on all finding features */
GIT_DIFF_FIND_ALL = (0x0ff), GIT_DIFF_FIND_ALL = (0x0ff),
...@@ -469,7 +469,10 @@ typedef struct { ...@@ -469,7 +469,10 @@ typedef struct {
* - `copy_threshold` is the same as the -C option with a value * - `copy_threshold` is the same as the -C option with a value
* - `rename_from_rewrite_threshold` matches the top of the -B option * - `rename_from_rewrite_threshold` matches the top of the -B option
* - `break_rewrite_threshold` matches the bottom of the -B option * - `break_rewrite_threshold` matches the bottom of the -B option
* - `target_limit` matches the -l option (approximately) * - `rename_limit` is the maximum number of matches to consider for
* a particular file. This is a little different from the `-l` option
* to regular Git because we will still process up to this many matches
* before abandoning the search.
* *
* The `metric` option allows you to plug in a custom similarity metric. * The `metric` option allows you to plug in a custom similarity metric.
* Set it to NULL for the default internal metric which is based on sampling * Set it to NULL for the default internal metric which is based on sampling
...@@ -492,10 +495,10 @@ typedef struct { ...@@ -492,10 +495,10 @@ typedef struct {
/** Similarity to split modify into delete/add pair (default 60) */ /** Similarity to split modify into delete/add pair (default 60) */
uint16_t break_rewrite_threshold; uint16_t break_rewrite_threshold;
/** Maximum similarity sources to examine (a la diff's `-l` option or /** Maximum similarity sources to examine for a file (somewhat like
* the `diff.renameLimit` config) (default 200) * git-diff's `-l` option or `diff.renameLimit` config) (default 200)
*/ */
size_t target_limit; size_t rename_limit;
/** Pluggable similarity metric; pass NULL to use internal metric */ /** Pluggable similarity metric; pass NULL to use internal metric */
git_diff_similarity_metric *metric; git_diff_similarity_metric *metric;
......
...@@ -34,10 +34,16 @@ enum { ...@@ -34,10 +34,16 @@ enum {
GIT_DIFF_FLAG__FREE_DATA = (1 << 8), /* internal file data is allocated */ GIT_DIFF_FLAG__FREE_DATA = (1 << 8), /* internal file data is allocated */
GIT_DIFF_FLAG__UNMAP_DATA = (1 << 9), /* internal file data is mmap'ed */ GIT_DIFF_FLAG__UNMAP_DATA = (1 << 9), /* internal file data is mmap'ed */
GIT_DIFF_FLAG__NO_DATA = (1 << 10), /* file data should not be loaded */ GIT_DIFF_FLAG__NO_DATA = (1 << 10), /* file data should not be loaded */
GIT_DIFF_FLAG__TO_DELETE = (1 << 11), /* delete entry during rename det. */
GIT_DIFF_FLAG__TO_SPLIT = (1 << 12), /* split entry during rename det. */ GIT_DIFF_FLAG__TO_DELETE = (1 << 16), /* delete entry during rename det. */
GIT_DIFF_FLAG__TO_SPLIT = (1 << 17), /* split entry during rename det. */
GIT_DIFF_FLAG__IS_RENAME_TARGET = (1 << 18),
GIT_DIFF_FLAG__IS_RENAME_SOURCE = (1 << 19),
GIT_DIFF_FLAG__HAS_SELF_SIMILARITY = (1 << 20),
}; };
#define GIT_DIFF_FLAG__CLEAR_INTERNAL(F) (F) = ((F) & 0x00FFFF)
struct git_diff_list { struct git_diff_list {
git_refcount rc; git_refcount rc;
git_repository *repo; git_repository *repo;
......
...@@ -15,6 +15,18 @@ void test_diff_rename__cleanup(void) ...@@ -15,6 +15,18 @@ void test_diff_rename__cleanup(void)
} }
/* /*
static int debug_print(
const git_diff_delta *delta, const git_diff_range *range, char usage,
const char *line, size_t line_len, void *data)
{
GIT_UNUSED(delta); GIT_UNUSED(range); GIT_UNUSED(usage);
GIT_UNUSED(line_len); GIT_UNUSED(data);
fputs(line, stderr);
return 0;
}
*/
/*
* Renames repo has: * Renames repo has:
* *
* commit 31e47d8c1fa36d7f8d537b96158e3f024de0a9f2 - * commit 31e47d8c1fa36d7f8d537b96158e3f024de0a9f2 -
...@@ -72,8 +84,10 @@ void test_diff_rename__match_oid(void) ...@@ -72,8 +84,10 @@ void test_diff_rename__match_oid(void)
/* git diff 31e47d8c1fa36d7f8d537b96158e3f024de0a9f2 \ /* git diff 31e47d8c1fa36d7f8d537b96158e3f024de0a9f2 \
* 2bc7f351d20b53f1c72c16c4b036e491c478c49a * 2bc7f351d20b53f1c72c16c4b036e491c478c49a
* don't use NULL opts to avoid config `diff.renames` contamination
*/ */
cl_git_pass(git_diff_find_similar(diff, NULL)); opts.flags = GIT_DIFF_FIND_RENAMES;
cl_git_pass(git_diff_find_similar(diff, &opts));
memset(&exp, 0, sizeof(exp)); memset(&exp, 0, sizeof(exp));
cl_git_pass(git_diff_foreach( cl_git_pass(git_diff_foreach(
...@@ -243,8 +257,8 @@ void test_diff_rename__not_exact_match(void) ...@@ -243,8 +257,8 @@ void test_diff_rename__not_exact_match(void)
cl_assert_equal_i(5, exp.files); cl_assert_equal_i(5, exp.files);
cl_assert_equal_i(1, exp.file_status[GIT_DELTA_UNMODIFIED]); cl_assert_equal_i(1, exp.file_status[GIT_DELTA_UNMODIFIED]);
cl_assert_equal_i(1, exp.file_status[GIT_DELTA_MODIFIED]); cl_assert_equal_i(1, exp.file_status[GIT_DELTA_MODIFIED]);
cl_assert_equal_i(1, exp.file_status[GIT_DELTA_DELETED]);
cl_assert_equal_i(1, exp.file_status[GIT_DELTA_ADDED]); cl_assert_equal_i(1, exp.file_status[GIT_DELTA_ADDED]);
cl_assert_equal_i(1, exp.file_status[GIT_DELTA_DELETED]);
cl_assert_equal_i(1, exp.file_status[GIT_DELTA_COPIED]); cl_assert_equal_i(1, exp.file_status[GIT_DELTA_COPIED]);
git_diff_list_free(diff); git_diff_list_free(diff);
...@@ -429,8 +443,8 @@ void test_diff_rename__working_directory_changes(void) ...@@ -429,8 +443,8 @@ void test_diff_rename__working_directory_changes(void)
cl_assert_equal_i(6, exp.files); cl_assert_equal_i(6, exp.files);
cl_assert_equal_i(1, exp.file_status[GIT_DELTA_MODIFIED]); cl_assert_equal_i(1, exp.file_status[GIT_DELTA_MODIFIED]);
cl_assert_equal_i(3, exp.file_status[GIT_DELTA_UNTRACKED]);
cl_assert_equal_i(2, exp.file_status[GIT_DELTA_DELETED]); cl_assert_equal_i(2, exp.file_status[GIT_DELTA_DELETED]);
cl_assert_equal_i(3, exp.file_status[GIT_DELTA_UNTRACKED]);
/* git diff -M 2bc7f351d20b53f1c72c16c4b036e491c478c49a */ /* git diff -M 2bc7f351d20b53f1c72c16c4b036e491c478c49a */
opts.flags = GIT_DIFF_FIND_ALL; opts.flags = GIT_DIFF_FIND_ALL;
...@@ -441,7 +455,8 @@ void test_diff_rename__working_directory_changes(void) ...@@ -441,7 +455,8 @@ void test_diff_rename__working_directory_changes(void)
diff, diff_file_cb, diff_hunk_cb, diff_line_cb, &exp)); diff, diff_file_cb, diff_hunk_cb, diff_line_cb, &exp));
cl_assert_equal_i(5, exp.files); cl_assert_equal_i(5, exp.files);
cl_assert_equal_i(3, exp.file_status[GIT_DELTA_RENAMED]); cl_assert_equal_i(2, exp.file_status[GIT_DELTA_RENAMED]);
cl_assert_equal_i(1, exp.file_status[GIT_DELTA_DELETED]);
cl_assert_equal_i(2, exp.file_status[GIT_DELTA_UNTRACKED]); cl_assert_equal_i(2, exp.file_status[GIT_DELTA_UNTRACKED]);
git_diff_list_free(diff); git_diff_list_free(diff);
...@@ -466,7 +481,8 @@ void test_diff_rename__working_directory_changes(void) ...@@ -466,7 +481,8 @@ void test_diff_rename__working_directory_changes(void)
diff, diff_file_cb, diff_hunk_cb, diff_line_cb, &exp)); diff, diff_file_cb, diff_hunk_cb, diff_line_cb, &exp));
cl_assert_equal_i(5, exp.files); cl_assert_equal_i(5, exp.files);
cl_assert_equal_i(3, exp.file_status[GIT_DELTA_RENAMED]); cl_assert_equal_i(2, exp.file_status[GIT_DELTA_RENAMED]);
cl_assert_equal_i(1, exp.file_status[GIT_DELTA_DELETED]);
cl_assert_equal_i(2, exp.file_status[GIT_DELTA_UNTRACKED]); cl_assert_equal_i(2, exp.file_status[GIT_DELTA_UNTRACKED]);
git_diff_list_free(diff); git_diff_list_free(diff);
...@@ -521,13 +537,19 @@ void test_diff_rename__working_directory_changes(void) ...@@ -521,13 +537,19 @@ void test_diff_rename__working_directory_changes(void)
opts.flags = GIT_DIFF_FIND_ALL | GIT_DIFF_FIND_EXACT_MATCH_ONLY; opts.flags = GIT_DIFF_FIND_ALL | GIT_DIFF_FIND_EXACT_MATCH_ONLY;
cl_git_pass(git_diff_find_similar(diff, &opts)); cl_git_pass(git_diff_find_similar(diff, &opts));
/*
fprintf(stderr, "\n\n");
cl_git_pass(git_diff_print_raw(diff, debug_print, NULL));
*/
memset(&exp, 0, sizeof(exp)); memset(&exp, 0, sizeof(exp));
cl_git_pass(git_diff_foreach( cl_git_pass(git_diff_foreach(
diff, diff_file_cb, diff_hunk_cb, diff_line_cb, &exp)); diff, diff_file_cb, diff_hunk_cb, diff_line_cb, &exp));
cl_assert_equal_i(5, exp.files); cl_assert_equal_i(5, exp.files);
cl_assert_equal_i(1, exp.file_status[GIT_DELTA_MODIFIED]); cl_assert_equal_i(1, exp.file_status[GIT_DELTA_MODIFIED]);
cl_assert_equal_i(2, exp.file_status[GIT_DELTA_RENAMED]); cl_assert_equal_i(1, exp.file_status[GIT_DELTA_DELETED]);
cl_assert_equal_i(1, exp.file_status[GIT_DELTA_RENAMED]);
cl_assert_equal_i(2, exp.file_status[GIT_DELTA_UNTRACKED]); cl_assert_equal_i(2, exp.file_status[GIT_DELTA_UNTRACKED]);
git_diff_list_free(diff); git_diff_list_free(diff);
......
...@@ -87,7 +87,6 @@ void test_object_raw_convert__convert_oid_partially(void) ...@@ -87,7 +87,6 @@ void test_object_raw_convert__convert_oid_partially(void)
const char *exp = "16a0123456789abcdef4b775213c23a8bd74f5e0"; const char *exp = "16a0123456789abcdef4b775213c23a8bd74f5e0";
git_oid in; git_oid in;
char big[GIT_OID_HEXSZ + 1 + 3]; /* note + 4 => big buffer */ char big[GIT_OID_HEXSZ + 1 + 3]; /* note + 4 => big buffer */
char *str;
cl_git_pass(git_oid_fromstr(&in, exp)); cl_git_pass(git_oid_fromstr(&in, exp));
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment