diff_tform.c 29.4 KB
Newer Older
1
/*
Edward Thomson committed
2
 * Copyright (C) the libgit2 contributors. All rights reserved.
3 4 5 6
 *
 * This file is part of libgit2, distributed under the GNU GPL v2 with
 * a Linking Exception. For full terms see the included COPYING file.
 */
7 8

#include "diff_tform.h"
9

10
#include "git2/config.h"
11
#include "git2/blob.h"
12
#include "git2/sys/hashsig.h"
13 14

#include "diff.h"
15
#include "diff_generate.h"
16
#include "path.h"
17
#include "futils.h"
18
#include "config.h"
19

20
git_diff_delta *git_diff__delta_dup(
21 22 23 24 25 26 27
	const git_diff_delta *d, git_pool *pool)
{
	git_diff_delta *delta = git__malloc(sizeof(git_diff_delta));
	if (!delta)
		return NULL;

	memcpy(delta, d, sizeof(git_diff_delta));
28
	GIT_DIFF_FLAG__CLEAR_INTERNAL(delta->flags);
29

30 31 32 33 34
	if (d->old_file.path != NULL) {
		delta->old_file.path = git_pool_strdup(pool, d->old_file.path);
		if (delta->old_file.path == NULL)
			goto fail;
	}
35

36
	if (d->new_file.path != d->old_file.path && d->new_file.path != NULL) {
37 38 39 40 41 42 43 44 45 46 47 48 49 50
		delta->new_file.path = git_pool_strdup(pool, d->new_file.path);
		if (delta->new_file.path == NULL)
			goto fail;
	} else {
		delta->new_file.path = delta->old_file.path;
	}

	return delta;

fail:
	git__free(delta);
	return NULL;
}

51
git_diff_delta *git_diff__merge_like_cgit(
52 53 54
	const git_diff_delta *a,
	const git_diff_delta *b,
	git_pool *pool)
55 56 57 58 59 60 61 62 63 64 65 66 67 68 69
{
	git_diff_delta *dup;

	/* Emulate C git for merging two diffs (a la 'git diff <sha>').
	 *
	 * When C git does a diff between the work dir and a tree, it actually
	 * diffs with the index but uses the workdir contents.  This emulates
	 * those choices so we can emulate the type of diff.
	 *
	 * We have three file descriptions here, let's call them:
	 *  f1 = a->old_file
	 *  f2 = a->new_file AND b->old_file
	 *  f3 = b->new_file
	 */

70 71
	/* If one of the diffs is a conflict, just dup it */
	if (b->status == GIT_DELTA_CONFLICTED)
72
		return git_diff__delta_dup(b, pool);
73
	if (a->status == GIT_DELTA_CONFLICTED)
74
		return git_diff__delta_dup(a, pool);
75

76 77
	/* if f2 == f3 or f2 is deleted, then just dup the 'a' diff */
	if (b->status == GIT_DELTA_UNMODIFIED || a->status == GIT_DELTA_DELETED)
78
		return git_diff__delta_dup(a, pool);
79 80

	/* otherwise, base this diff on the 'b' diff */
81
	if ((dup = git_diff__delta_dup(b, pool)) == NULL)
82 83 84
		return NULL;

	/* If 'a' status is uninteresting, then we're done */
85 86 87
	if (a->status == GIT_DELTA_UNMODIFIED ||
		a->status == GIT_DELTA_UNTRACKED ||
		a->status == GIT_DELTA_UNREADABLE)
88 89 90 91 92 93 94 95
		return dup;

	assert(b->status != GIT_DELTA_UNMODIFIED);

	/* A cgit exception is that the diff of a file that is only in the
	 * index (i.e. not in HEAD nor workdir) is given as empty.
	 */
	if (dup->status == GIT_DELTA_DELETED) {
96
		if (a->status == GIT_DELTA_ADDED) {
97
			dup->status = GIT_DELTA_UNMODIFIED;
98 99
			dup->nfiles = 2;
		}
100 101 102
		/* else don't overwrite DELETE status */
	} else {
		dup->status = a->status;
103
		dup->nfiles = a->nfiles;
104 105
	}

106
	git_oid_cpy(&dup->old_file.id, &a->old_file.id);
107 108 109 110 111 112 113
	dup->old_file.mode  = a->old_file.mode;
	dup->old_file.size  = a->old_file.size;
	dup->old_file.flags = a->old_file.flags;

	return dup;
}

114
int git_diff__merge(
115
	git_diff *onto, const git_diff *from, git_diff__merge_cb cb)
116 117 118 119 120
{
	int error = 0;
	git_pool onto_pool;
	git_vector onto_new;
	git_diff_delta *delta;
121
	bool ignore_case, reversed;
122 123 124 125 126 127 128
	unsigned int i, j;

	assert(onto && from);

	if (!from->deltas.length)
		return 0;

129 130 131 132 133
	ignore_case = ((onto->opts.flags & GIT_DIFF_IGNORE_CASE) != 0);
	reversed    = ((onto->opts.flags & GIT_DIFF_REVERSE) != 0);

	if (ignore_case != ((from->opts.flags & GIT_DIFF_IGNORE_CASE) != 0) ||
		reversed    != ((from->opts.flags & GIT_DIFF_REVERSE) != 0)) {
134
		git_error_set(GIT_ERROR_INVALID,
135
			"attempt to merge diffs created with conflicting options");
136 137 138
		return -1;
	}

139
	if (git_vector_init(&onto_new, onto->deltas.length, git_diff_delta__cmp) < 0)
140 141
		return -1;

142 143
	git_pool_init(&onto_pool, 1);

144 145 146
	for (i = 0, j = 0; i < onto->deltas.length || j < from->deltas.length; ) {
		git_diff_delta *o = GIT_VECTOR_GET(&onto->deltas, i);
		const git_diff_delta *f = GIT_VECTOR_GET(&from->deltas, j);
147 148
		int cmp = !f ? -1 : !o ? 1 :
			STRCMP_CASESELECT(ignore_case, o->old_file.path, f->old_file.path);
149 150

		if (cmp < 0) {
151
			delta = git_diff__delta_dup(o, &onto_pool);
152 153
			i++;
		} else if (cmp > 0) {
154
			delta = git_diff__delta_dup(f, &onto_pool);
155 156
			j++;
		} else {
157 158 159
			const git_diff_delta *left = reversed ? f : o;
			const git_diff_delta *right = reversed ? o : f;

160
			delta = cb(left, right, &onto_pool);
161 162 163 164 165 166 167
			i++;
			j++;
		}

		/* the ignore rules for the target may not match the source
		 * or the result of a merged delta could be skippable...
		 */
168
		if (delta && git_diff_delta__should_skip(&onto->opts, delta)) {
169 170 171 172 173 174 175 176 177 178 179
			git__free(delta);
			continue;
		}

		if ((error = !delta ? -1 : git_vector_insert(&onto_new, delta)) < 0)
			break;
	}

	if (!error) {
		git_vector_swap(&onto->deltas, &onto_new);
		git_pool_swap(&onto->pool, &onto_pool);
180 181 182 183 184

		if ((onto->opts.flags & GIT_DIFF_REVERSE) != 0)
			onto->old_src = from->old_src;
		else
			onto->new_src = from->new_src;
185 186 187 188 189 190 191 192

		/* prefix strings also come from old pool, so recreate those.*/
		onto->opts.old_prefix =
			git_pool_strdup_safe(&onto->pool, onto->opts.old_prefix);
		onto->opts.new_prefix =
			git_pool_strdup_safe(&onto->pool, onto->opts.new_prefix);
	}

193
	git_vector_free_deep(&onto_new);
194 195 196 197 198
	git_pool_clear(&onto_pool);

	return error;
}

199 200
int git_diff_merge(git_diff *onto, const git_diff *from)
{
201
	return git_diff__merge(onto, from, git_diff__merge_like_cgit);
202 203
}

Edward Thomson committed
204
int git_diff_find_similar__hashsig_for_file(
205 206
	void **out, const git_diff_file *f, const char *path, void *p)
{
Ben Straub committed
207
	git_hashsig_option_t opt = (git_hashsig_option_t)(intptr_t)p;
208

209
	GIT_UNUSED(f);
210
	return git_hashsig_create_fromfile((git_hashsig **)out, path, opt);
211
}
212

Edward Thomson committed
213
int git_diff_find_similar__hashsig_for_buf(
214 215
	void **out, const git_diff_file *f, const char *buf, size_t len, void *p)
{
Ben Straub committed
216
	git_hashsig_option_t opt = (git_hashsig_option_t)(intptr_t)p;
Edward Thomson committed
217

218
	GIT_UNUSED(f);
219
	return git_hashsig_create((git_hashsig **)out, buf, len, opt);
220
}
221

Edward Thomson committed
222
void git_diff_find_similar__hashsig_free(void *sig, void *payload)
223 224 225 226 227
{
	GIT_UNUSED(payload);
	git_hashsig_free(sig);
}

Edward Thomson committed
228
int git_diff_find_similar__calc_similarity(
229 230
	int *score, void *siga, void *sigb, void *payload)
{
231 232
	int error;

233
	GIT_UNUSED(payload);
234 235 236 237 238
	error = git_hashsig_compare(siga, sigb);
	if (error < 0)
		return error;

	*score = error;
239 240 241
	return 0;
}

242 243
#define DEFAULT_THRESHOLD 50
#define DEFAULT_BREAK_REWRITE_THRESHOLD 60
244
#define DEFAULT_RENAME_LIMIT 200
245 246

static int normalize_find_opts(
247
	git_diff *diff,
248
	git_diff_find_options *opts,
Russell Belfer committed
249
	const git_diff_find_options *given)
250 251
{
	git_config *cfg = NULL;
252
	git_hashsig_option_t hashsig_opts;
253

254
	GIT_ERROR_CHECK_VERSION(given, GIT_DIFF_FIND_OPTIONS_VERSION, "git_diff_find_options");
Ben Straub committed
255

256 257 258 259
	if (diff->repo != NULL &&
		git_repository_config__weakptr(&cfg, diff->repo) < 0)
		return -1;

260
	if (given)
261 262
		memcpy(opts, given, sizeof(*opts));

263 264 265
	if (!given ||
		 (given->flags & GIT_DIFF_FIND_ALL) == GIT_DIFF_FIND_BY_CONFIG)
	{
266
		if (cfg) {
267 268 269 270 271 272 273 274 275 276 277 278 279 280
			char *rule =
				git_config__get_string_force(cfg, "diff.renames", "true");
			int boolval;

			if (!git__parse_bool(&boolval, rule) && !boolval)
				/* don't set FIND_RENAMES if bool value is false */;
			else if (!strcasecmp(rule, "copies") || !strcasecmp(rule, "copy"))
				opts->flags |= GIT_DIFF_FIND_RENAMES | GIT_DIFF_FIND_COPIES;
			else
				opts->flags |= GIT_DIFF_FIND_RENAMES;

			git__free(rule);
		} else {
			/* set default flag */
281
			opts->flags |= GIT_DIFF_FIND_RENAMES;
282
		}
283
	}
284

285 286
	/* some flags imply others */

287 288 289 290 291 292 293 294 295 296
	if (opts->flags & GIT_DIFF_FIND_EXACT_MATCH_ONLY) {
		/* if we are only looking for exact matches, then don't turn
		 * MODIFIED items into ADD/DELETE pairs because it's too picky
		 */
		opts->flags &= ~(GIT_DIFF_FIND_REWRITES | GIT_DIFF_BREAK_REWRITES);

		/* similarly, don't look for self-rewrites to split */
		opts->flags &= ~GIT_DIFF_FIND_RENAMES_FROM_REWRITES;
	}

297 298 299 300 301 302
	if (opts->flags & GIT_DIFF_FIND_RENAMES_FROM_REWRITES)
		opts->flags |= GIT_DIFF_FIND_RENAMES;

	if (opts->flags & GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED)
		opts->flags |= GIT_DIFF_FIND_COPIES;

303 304 305
	if (opts->flags & GIT_DIFF_BREAK_REWRITES)
		opts->flags |= GIT_DIFF_FIND_REWRITES;

306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321
#define USE_DEFAULT(X) ((X) == 0 || (X) > 100)

	if (USE_DEFAULT(opts->rename_threshold))
		opts->rename_threshold = DEFAULT_THRESHOLD;

	if (USE_DEFAULT(opts->rename_from_rewrite_threshold))
		opts->rename_from_rewrite_threshold = DEFAULT_THRESHOLD;

	if (USE_DEFAULT(opts->copy_threshold))
		opts->copy_threshold = DEFAULT_THRESHOLD;

	if (USE_DEFAULT(opts->break_rewrite_threshold))
		opts->break_rewrite_threshold = DEFAULT_BREAK_REWRITE_THRESHOLD;

#undef USE_DEFAULT

322
	if (!opts->rename_limit) {
323 324 325 326
		if (cfg) {
			opts->rename_limit = git_config__get_int_force(
				cfg, "diff.renamelimit", DEFAULT_RENAME_LIMIT);
		}
327

328 329
		if (opts->rename_limit <= 0)
			opts->rename_limit = DEFAULT_RENAME_LIMIT;
330 331
	}

332
	/* assign the internal metric with whitespace flag as payload */
333
	if (!opts->metric) {
334
		opts->metric = git__malloc(sizeof(git_diff_similarity_metric));
335
		GIT_ERROR_CHECK_ALLOC(opts->metric);
336

Edward Thomson committed
337 338 339 340
		opts->metric->file_signature = git_diff_find_similar__hashsig_for_file;
		opts->metric->buffer_signature = git_diff_find_similar__hashsig_for_buf;
		opts->metric->free_signature = git_diff_find_similar__hashsig_free;
		opts->metric->similarity = git_diff_find_similar__calc_similarity;
341

342
		if (opts->flags & GIT_DIFF_FIND_IGNORE_WHITESPACE)
343
			hashsig_opts = GIT_HASHSIG_IGNORE_WHITESPACE;
344
		else if (opts->flags & GIT_DIFF_FIND_DONT_IGNORE_WHITESPACE)
345
			hashsig_opts = GIT_HASHSIG_NORMAL;
346
		else
347 348 349
			hashsig_opts = GIT_HASHSIG_SMART_WHITESPACE;
		hashsig_opts |= GIT_HASHSIG_ALLOW_SMALL_FILES;
		opts->metric->payload = (void *)hashsig_opts;
350 351
	}

352 353 354
	return 0;
}

355 356 357 358
static int insert_delete_side_of_split(
	git_diff *diff, git_vector *onto, const git_diff_delta *delta)
{
	/* make new record for DELETED side of split */
359
	git_diff_delta *deleted = git_diff__delta_dup(delta, &diff->pool);
360
	GIT_ERROR_CHECK_ALLOC(deleted);
361 362 363 364 365

	deleted->status = GIT_DELTA_DELETED;
	deleted->nfiles = 1;
	memset(&deleted->new_file, 0, sizeof(deleted->new_file));
	deleted->new_file.path = deleted->old_file.path;
366
	deleted->new_file.flags |= GIT_DIFF_FLAG_VALID_ID;
367 368 369 370

	return git_vector_insert(onto, deleted);
}

371
static int apply_splits_and_deletes(
372
	git_diff *diff, size_t expected_size, bool actually_split)
373 374 375
{
	git_vector onto = GIT_VECTOR_INIT;
	size_t i;
376
	git_diff_delta *delta;
377 378 379 380 381 382

	if (git_vector_init(&onto, expected_size, git_diff_delta__cmp) < 0)
		return -1;

	/* build new delta list without TO_DELETE and splitting TO_SPLIT */
	git_vector_foreach(&diff->deltas, i, delta) {
383
		if ((delta->flags & GIT_DIFF_FLAG__TO_DELETE) != 0)
384 385
			continue;

386
		if ((delta->flags & GIT_DIFF_FLAG__TO_SPLIT) != 0 && actually_split) {
387 388
			delta->similarity = 0;

389
			if (insert_delete_side_of_split(diff, &onto, delta) < 0)
390
				goto on_error;
391

392 393 394 395
			if (diff->new_src == GIT_ITERATOR_TYPE_WORKDIR)
				delta->status = GIT_DELTA_UNTRACKED;
			else
				delta->status = GIT_DELTA_ADDED;
396
			delta->nfiles = 1;
397 398
			memset(&delta->old_file, 0, sizeof(delta->old_file));
			delta->old_file.path = delta->new_file.path;
399
			delta->old_file.flags |= GIT_DIFF_FLAG_VALID_ID;
400 401
		}

402 403 404 405 406 407 408 409 410
		/* clean up delta before inserting into new list */
		GIT_DIFF_FLAG__CLEAR_INTERNAL(delta->flags);

		if (delta->status != GIT_DELTA_COPIED &&
			delta->status != GIT_DELTA_RENAMED &&
			(delta->status != GIT_DELTA_MODIFIED || actually_split))
			delta->similarity = 0;

		/* insert into new list */
411 412
		if (git_vector_insert(&onto, delta) < 0)
			goto on_error;
413 414
	}

415
	/* cannot return an error past this point */
416 417

	/* free deltas from old list that didn't make it to the new one */
418
	git_vector_foreach(&diff->deltas, i, delta) {
419
		if ((delta->flags & GIT_DIFF_FLAG__TO_DELETE) != 0)
420
			git__free(delta);
421 422
	}

423 424 425
	/* swap new delta list into place */
	git_vector_swap(&diff->deltas, &onto);
	git_vector_free(&onto);
426
	git_vector_sort(&diff->deltas);
427 428

	return 0;
429 430

on_error:
431
	git_vector_free_deep(&onto);
432 433

	return -1;
434 435
}

436
GIT_INLINE(git_diff_file *) similarity_get_file(git_diff *diff, size_t idx)
437 438 439 440
{
	git_diff_delta *delta = git_vector_get(&diff->deltas, idx / 2);
	return (idx & 1) ? &delta->new_file : &delta->old_file;
}
441

442 443 444 445 446 447
typedef struct {
	size_t idx;
	git_iterator_type_t src;
	git_repository *repo;
	git_diff_file *file;
	git_buf data;
448
	git_odb_object *odb_obj;
449 450 451
	git_blob *blob;
} similarity_info;

452
static int similarity_init(
453
	similarity_info *info, git_diff *diff, size_t file_idx)
454 455 456 457 458
{
	info->idx  = file_idx;
	info->src  = (file_idx & 1) ? diff->new_src : diff->old_src;
	info->repo = diff->repo;
	info->file = similarity_get_file(diff, file_idx);
459
	info->odb_obj = NULL;
460 461
	info->blob = NULL;
	git_buf_init(&info->data, 0);
462

463
	if (info->file->size > 0 || info->src == GIT_ITERATOR_TYPE_WORKDIR)
464
		return 0;
465

466 467
	return git_diff_file__resolve_zero_size(
		info->file, &info->odb_obj, info->repo);
468
}
469

470
static int similarity_sig(
471 472 473 474 475
	similarity_info *info,
	const git_diff_find_options *opts,
	void **cache)
{
	int error = 0;
476
	git_diff_file *file = info->file;
477

478 479 480 481
	if (info->src == GIT_ITERATOR_TYPE_WORKDIR) {
		if ((error = git_buf_joinpath(
			&info->data, git_repository_workdir(info->repo), file->path)) < 0)
			return error;
482

483 484 485
		/* if path is not a regular file, just skip this item */
		if (!git_path_isfile(info->data.ptr))
			return 0;
486 487 488 489 490 491 492

		/* TODO: apply wd-to-odb filters to file data if necessary */

		error = opts->metric->file_signature(
			&cache[info->idx], info->file,
			info->data.ptr, opts->metric->payload);
	} else {
493 494 495 496 497 498
		/* if we didn't initially know the size, we might have an odb_obj
		 * around from earlier, so convert that, otherwise load the blob now
		 */
		if (info->odb_obj != NULL)
			error = git_object__from_odb_object(
				(git_object **)&info->blob, info->repo,
499
				info->odb_obj, GIT_OBJECT_BLOB);
500
		else
501
			error = git_blob_lookup(&info->blob, info->repo, &file->id);
502 503 504

		if (error < 0) {
			/* if lookup fails, just skip this item in similarity calc */
505
			git_error_clear();
506
		} else {
507 508 509 510 511 512
			size_t sz;

			/* index size may not be actual blob size if filtered */
			if (file->size != git_blob_rawsize(info->blob))
				file->size = git_blob_rawsize(info->blob);

513
			sz = git__is_sizet(file->size) ? (size_t)file->size : (size_t)-1;
514 515 516 517 518

			error = opts->metric->buffer_signature(
				&cache[info->idx], info->file,
				git_blob_rawcontent(info->blob), sz, opts->metric->payload);
		}
519 520 521 522 523
	}

	return error;
}

524 525 526 527 528 529 530 531
static void similarity_unload(similarity_info *info)
{
	if (info->odb_obj)
		git_odb_object_free(info->odb_obj);

	if (info->blob)
		git_blob_free(info->blob);
	else
532
		git_buf_dispose(&info->data);
533 534
}

535
#define FLAG_SET(opts,flag_name) (((opts)->flags & flag_name) != 0)
536 537 538 539 540

/* - score < 0 means files cannot be compared
 * - score >= 100 means files are exact match
 * - score == 0 means files are completely different
 */
541
static int similarity_measure(
542
	int *score,
543
	git_diff *diff,
544
	const git_diff_find_options *opts,
545 546 547 548 549 550
	void **cache,
	size_t a_idx,
	size_t b_idx)
{
	git_diff_file *a_file = similarity_get_file(diff, a_idx);
	git_diff_file *b_file = similarity_get_file(diff, b_idx);
551
	bool exact_match = FLAG_SET(opts, GIT_DIFF_FIND_EXACT_MATCH_ONLY);
552 553
	int error = 0;
	similarity_info a_info, b_info;
554 555

	*score = -1;
556

557 558
	/* don't try to compare things that aren't files */
	if (!GIT_MODE_ISBLOB(a_file->mode) || !GIT_MODE_ISBLOB(b_file->mode))
559 560
		return 0;

561
	/* if exact match is requested, force calculation of missing OIDs now */
562
	if (exact_match) {
563
		if (git_oid_is_zero(&a_file->id) &&
564
			diff->old_src == GIT_ITERATOR_TYPE_WORKDIR &&
565 566
			!git_diff__oid_for_file(&a_file->id,
				diff, a_file->path, a_file->mode, a_file->size))
567
			a_file->flags |= GIT_DIFF_FLAG_VALID_ID;
568

569
		if (git_oid_is_zero(&b_file->id) &&
570
			diff->new_src == GIT_ITERATOR_TYPE_WORKDIR &&
571 572
			!git_diff__oid_for_file(&b_file->id,
				diff, b_file->path, b_file->mode, b_file->size))
573
			b_file->flags |= GIT_DIFF_FLAG_VALID_ID;
574 575 576
	}

	/* check OID match as a quick test */
577
	if (git_oid__cmp(&a_file->id, &b_file->id) == 0) {
578 579 580 581 582 583 584 585 586
		*score = 100;
		return 0;
	}

	/* don't calculate signatures if we are doing exact match */
	if (exact_match) {
		*score = 0;
		return 0;
	}
587

588 589
	memset(&a_info, 0, sizeof(a_info));
	memset(&b_info, 0, sizeof(b_info));
590

591 592 593 594 595
	/* set up similarity data (will try to update missing file sizes) */
	if (!cache[a_idx] && (error = similarity_init(&a_info, diff, a_idx)) < 0)
		return error;
	if (!cache[b_idx] && (error = similarity_init(&b_info, diff, b_idx)) < 0)
		goto cleanup;
596

Russell Belfer committed
597
	/* check if file sizes are nowhere near each other */
598 599
	if (a_file->size > 127 &&
		b_file->size > 127 &&
600 601
		(a_file->size > (b_file->size << 3) ||
		 b_file->size > (a_file->size << 3)))
602
		goto cleanup;
603

604
	/* update signature cache if needed */
605 606 607 608 609 610 611 612
	if (!cache[a_idx]) {
		if ((error = similarity_sig(&a_info, opts, cache)) < 0)
			goto cleanup;
	}
	if (!cache[b_idx]) {
		if ((error = similarity_sig(&b_info, opts, cache)) < 0)
			goto cleanup;
	}
nulltoken committed
613

614 615 616 617 618 619
	/* calculate similarity provided that the metric choose to process
	 * both the a and b files (some may not if file is too big, etc).
	 */
	if (cache[a_idx] && cache[b_idx])
		error = opts->metric->similarity(
			score, cache[a_idx], cache[b_idx], opts->metric->payload);
620

621
cleanup:
622 623 624 625
	similarity_unload(&a_info);
	similarity_unload(&b_info);

	return error;
626 627
}

628
static int calc_self_similarity(
629
	git_diff *diff,
630 631 632
	const git_diff_find_options *opts,
	size_t delta_idx,
	void **cache)
633
{
634 635 636 637 638 639 640 641 642 643 644 645
	int error, similarity = -1;
	git_diff_delta *delta = GIT_VECTOR_GET(&diff->deltas, delta_idx);

	if ((delta->flags & GIT_DIFF_FLAG__HAS_SELF_SIMILARITY) != 0)
		return 0;

	error = similarity_measure(
		&similarity, diff, opts, cache, 2 * delta_idx, 2 * delta_idx + 1);
	if (error < 0)
		return error;

	if (similarity >= 0) {
646
		delta->similarity = (uint16_t)similarity;
647 648 649 650 651 652 653
		delta->flags |= GIT_DIFF_FLAG__HAS_SELF_SIMILARITY;
	}

	return 0;
}

static bool is_rename_target(
654
	git_diff *diff,
655 656 657 658 659 660 661 662 663 664 665
	const git_diff_find_options *opts,
	size_t delta_idx,
	void **cache)
{
	git_diff_delta *delta = GIT_VECTOR_GET(&diff->deltas, delta_idx);

	/* skip things that aren't plain blobs */
	if (!GIT_MODE_ISBLOB(delta->new_file.mode))
		return false;

	/* only consider ADDED, RENAMED, COPIED, and split MODIFIED as
666
	 * targets; maybe include UNTRACKED if requested.
667 668 669 670
	 */
	switch (delta->status) {
	case GIT_DELTA_UNMODIFIED:
	case GIT_DELTA_DELETED:
671 672
	case GIT_DELTA_IGNORED:
	case GIT_DELTA_CONFLICTED:
673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688
		return false;

	case GIT_DELTA_MODIFIED:
		if (!FLAG_SET(opts, GIT_DIFF_FIND_REWRITES) &&
			!FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES))
			return false;

		if (calc_self_similarity(diff, opts, delta_idx, cache) < 0)
			return false;

		if (FLAG_SET(opts, GIT_DIFF_BREAK_REWRITES) &&
			delta->similarity < opts->break_rewrite_threshold) {
			delta->flags |= GIT_DIFF_FLAG__TO_SPLIT;
			break;
		}
		if (FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES) &&
689 690
			delta->similarity < opts->rename_from_rewrite_threshold) {
			delta->flags |= GIT_DIFF_FLAG__TO_SPLIT;
691
			break;
692
		}
693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709

		return false;

	case GIT_DELTA_UNTRACKED:
		if (!FLAG_SET(opts, GIT_DIFF_FIND_FOR_UNTRACKED))
			return false;
		break;

	default: /* all other status values should be checked */
		break;
	}

	delta->flags |= GIT_DIFF_FLAG__IS_RENAME_TARGET;
	return true;
}

static bool is_rename_source(
710
	git_diff *diff,
711 712 713 714 715 716 717 718 719 720 721 722 723
	const git_diff_find_options *opts,
	size_t delta_idx,
	void **cache)
{
	git_diff_delta *delta = GIT_VECTOR_GET(&diff->deltas, delta_idx);

	/* skip things that aren't blobs */
	if (!GIT_MODE_ISBLOB(delta->old_file.mode))
		return false;

	switch (delta->status) {
	case GIT_DELTA_ADDED:
	case GIT_DELTA_UNTRACKED:
724
	case GIT_DELTA_UNREADABLE:
725
	case GIT_DELTA_IGNORED:
726
	case GIT_DELTA_CONFLICTED:
727 728 729 730 731 732 733 734 735
		return false;

	case GIT_DELTA_DELETED:
	case GIT_DELTA_TYPECHANGE:
		break;

	case GIT_DELTA_UNMODIFIED:
		if (!FLAG_SET(opts, GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED))
			return false;
736
		if (FLAG_SET(opts, GIT_DIFF_FIND_REMOVE_UNMODIFIED))
737
			delta->flags |= GIT_DIFF_FLAG__TO_DELETE;
738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779
		break;

	default: /* MODIFIED, RENAMED, COPIED */
		/* if we're finding copies, this could be a source */
		if (FLAG_SET(opts, GIT_DIFF_FIND_COPIES))
			break;

		/* otherwise, this is only a source if we can split it */
		if (!FLAG_SET(opts, GIT_DIFF_FIND_REWRITES) &&
			!FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES))
			return false;

		if (calc_self_similarity(diff, opts, delta_idx, cache) < 0)
			return false;

		if (FLAG_SET(opts, GIT_DIFF_BREAK_REWRITES) &&
			delta->similarity < opts->break_rewrite_threshold) {
			delta->flags |= GIT_DIFF_FLAG__TO_SPLIT;
			break;
		}

		if (FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES) &&
			delta->similarity < opts->rename_from_rewrite_threshold)
			break;

		return false;
	}

	delta->flags |= GIT_DIFF_FLAG__IS_RENAME_SOURCE;
	return true;
}

GIT_INLINE(bool) delta_is_split(git_diff_delta *delta)
{
	return (delta->status == GIT_DELTA_TYPECHANGE ||
			(delta->flags & GIT_DIFF_FLAG__TO_SPLIT) != 0);
}

GIT_INLINE(bool) delta_is_new_only(git_diff_delta *delta)
{
	return (delta->status == GIT_DELTA_ADDED ||
			delta->status == GIT_DELTA_UNTRACKED ||
780
			delta->status == GIT_DELTA_UNREADABLE ||
781
			delta->status == GIT_DELTA_IGNORED);
782
}
783

784
GIT_INLINE(void) delta_make_rename(
785
	git_diff_delta *to, const git_diff_delta *from, uint16_t similarity)
786 787 788
{
	to->status     = GIT_DELTA_RENAMED;
	to->similarity = similarity;
789
	to->nfiles     = 2;
790 791 792 793
	memcpy(&to->old_file, &from->old_file, sizeof(to->old_file));
	to->flags &= ~GIT_DIFF_FLAG__TO_SPLIT;
}

794
typedef struct {
795 796
	size_t   idx;
	uint16_t similarity;
797 798
} diff_find_match;

799
int git_diff_find_similar(
800
	git_diff *diff,
Russell Belfer committed
801
	const git_diff_find_options *given_opts)
802
{
803
	size_t s, t;
804 805
	int error = 0, result;
	uint16_t similarity;
806
	git_diff_delta *src, *tgt;
807
	git_diff_find_options opts = GIT_DIFF_FIND_OPTIONS_INIT;
808 809
	size_t num_deltas, num_srcs = 0, num_tgts = 0;
	size_t tried_srcs = 0, tried_tgts = 0;
810
	size_t num_rewrites = 0, num_updates = 0, num_bumped = 0;
811
	size_t sigcache_size;
812
	void **sigcache = NULL; /* cache of similarity metric file signatures */
813 814 815 816
	diff_find_match *tgt2src = NULL;
	diff_find_match *src2tgt = NULL;
	diff_find_match *tgt2src_copy = NULL;
	diff_find_match *best_match;
817
	git_diff_file swap;
818

819 820
	assert(diff);

821
	if ((error = normalize_find_opts(diff, &opts, given_opts)) < 0)
822 823
		return error;

824 825
	num_deltas = diff->deltas.length;

826
	/* TODO: maybe abort if deltas.length > rename_limit ??? */
827
	if (!num_deltas || !git__is_uint32(num_deltas))
828 829 830 831 832
		goto cleanup;

	/* No flags set; nothing to do */
	if ((opts.flags & GIT_DIFF_FIND_ALL) == 0)
		goto cleanup;
833

834
	GIT_ERROR_CHECK_ALLOC_MULTIPLY(&sigcache_size, num_deltas, 2);
835
	sigcache = git__calloc(sigcache_size, sizeof(void *));
836
	GIT_ERROR_CHECK_ALLOC(sigcache);
837 838 839 840 841 842

	/* Label rename sources and targets
	 *
	 * This will also set self-similarity scores for MODIFIED files and
	 * mark them for splitting if break-rewrites is enabled
	 */
843 844
	git_vector_foreach(&diff->deltas, t, tgt) {
		if (is_rename_source(diff, &opts, t, sigcache))
845 846
			++num_srcs;

847
		if (is_rename_target(diff, &opts, t, sigcache))
848
			++num_tgts;
849 850 851

		if ((tgt->flags & GIT_DIFF_FLAG__TO_SPLIT) != 0)
			num_rewrites++;
852
	}
853

854 855 856
	/* if there are no candidate srcs or tgts, we're done */
	if (!num_srcs || !num_tgts)
		goto cleanup;
857

858
	src2tgt = git__calloc(num_deltas, sizeof(diff_find_match));
859
	GIT_ERROR_CHECK_ALLOC(src2tgt);
860
	tgt2src = git__calloc(num_deltas, sizeof(diff_find_match));
861
	GIT_ERROR_CHECK_ALLOC(tgt2src);
862 863 864

	if (FLAG_SET(&opts, GIT_DIFF_FIND_COPIES)) {
		tgt2src_copy = git__calloc(num_deltas, sizeof(diff_find_match));
865
		GIT_ERROR_CHECK_ALLOC(tgt2src_copy);
866
	}
867

868 869 870
	/*
	 * Find best-fit matches for rename / copy candidates
	 */
871

872 873
find_best_matches:
	tried_tgts = num_bumped = 0;
874

875
	git_vector_foreach(&diff->deltas, t, tgt) {
876
		/* skip things that are not rename targets */
877
		if ((tgt->flags & GIT_DIFF_FLAG__IS_RENAME_TARGET) == 0)
878 879
			continue;

880
		tried_srcs = 0;
881

882
		git_vector_foreach(&diff->deltas, s, src) {
883
			/* skip things that are not rename sources */
884
			if ((src->flags & GIT_DIFF_FLAG__IS_RENAME_SOURCE) == 0)
885 886
				continue;

887
			/* calculate similarity for this pair and find best match */
888
			if (s == t)
889
				result = -1; /* don't measure self-similarity here */
890
			else if ((error = similarity_measure(
891
				&result, diff, &opts, sigcache, 2 * s, 2 * t + 1)) < 0)
892
				goto cleanup;
893

894
			if (result < 0)
895
				continue;
896
			similarity = (uint16_t)result;
897 898

			/* is this a better rename? */
899 900
			if (tgt2src[t].similarity < similarity &&
				src2tgt[s].similarity < similarity)
901
			{
902 903 904 905 906 907 908 909
				/* eject old mapping */
				if (src2tgt[s].similarity > 0) {
					tgt2src[src2tgt[s].idx].similarity = 0;
					num_bumped++;
				}
				if (tgt2src[t].similarity > 0) {
					src2tgt[tgt2src[t].idx].similarity = 0;
					num_bumped++;
910 911
				}

912
				/* write new mapping */
913 914 915 916
				tgt2src[t].idx = s;
				tgt2src[t].similarity = similarity;
				src2tgt[s].idx = t;
				src2tgt[s].similarity = similarity;
917
			}
918

919 920
			/* keep best absolute match for copies */
			if (tgt2src_copy != NULL &&
921
				tgt2src_copy[t].similarity < similarity)
922
			{
923 924
				tgt2src_copy[t].idx = s;
				tgt2src_copy[t].similarity = similarity;
925
			}
926 927 928 929

			if (++tried_srcs >= num_srcs)
				break;

930
			/* cap on maximum targets we'll examine (per "tgt" file) */
931 932
			if (tried_srcs > opts.rename_limit)
				break;
933
		}
934 935 936

		if (++tried_tgts >= num_tgts)
			break;
937 938
	}

939 940 941 942 943 944 945
	if (num_bumped > 0) /* try again if we bumped some items */
		goto find_best_matches;

	/*
	 * Rewrite the diffs with renames / copies
	 */

946
	git_vector_foreach(&diff->deltas, t, tgt) {
947
		/* skip things that are not rename targets */
948
		if ((tgt->flags & GIT_DIFF_FLAG__IS_RENAME_TARGET) == 0)
949
			continue;
950

951
		/* check if this delta was the target of a similarity */
952 953 954 955 956
		if (tgt2src[t].similarity)
			best_match = &tgt2src[t];
		else if (tgt2src_copy && tgt2src_copy[t].similarity)
			best_match = &tgt2src_copy[t];
		else
957
			continue;
958

959 960
		s = best_match->idx;
		src = GIT_VECTOR_GET(&diff->deltas, s);
961

962 963 964 965 966 967
		/* possible scenarios:
		 * 1. from DELETE to ADD/UNTRACK/IGNORE = RENAME
		 * 2. from DELETE to SPLIT/TYPECHANGE = RENAME + DELETE
		 * 3. from SPLIT/TYPECHANGE to ADD/UNTRACK/IGNORE = ADD + RENAME
		 * 4. from SPLIT/TYPECHANGE to SPLIT/TYPECHANGE = RENAME + SPLIT
		 * 5. from OTHER to ADD/UNTRACK/IGNORE = OTHER + COPY
968 969
		 */

970
		if (src->status == GIT_DELTA_DELETED) {
971

972
			if (delta_is_new_only(tgt)) {
973

974
				if (best_match->similarity < opts.rename_threshold)
975
					continue;
976

977
				delta_make_rename(tgt, src, best_match->similarity);
978

979
				src->flags |= GIT_DIFF_FLAG__TO_DELETE;
980 981
				num_rewrites++;
			} else {
982
				assert(delta_is_split(tgt));
983

984
				if (best_match->similarity < opts.rename_from_rewrite_threshold)
985
					continue;
986

987
				memcpy(&swap, &tgt->old_file, sizeof(swap));
988

989
				delta_make_rename(tgt, src, best_match->similarity);
990 991
				num_rewrites--;

992
				assert(src->status == GIT_DELTA_DELETED);
993 994 995
				memcpy(&src->old_file, &swap, sizeof(src->old_file));
				memset(&src->new_file, 0, sizeof(src->new_file));
				src->new_file.path = src->old_file.path;
996
				src->new_file.flags |= GIT_DIFF_FLAG_VALID_ID;
997

998
				num_updates++;
999 1000 1001

				if (src2tgt[t].similarity > 0 && src2tgt[t].idx > t) {
					/* what used to be at src t is now at src s */
1002
					tgt2src[src2tgt[t].idx].idx = s;
1003
				}
1004 1005 1006
			}
		}

1007
		else if (delta_is_split(src)) {
1008

1009
			if (delta_is_new_only(tgt)) {
1010

1011
				if (best_match->similarity < opts.rename_threshold)
1012
					continue;
1013

1014
				delta_make_rename(tgt, src, best_match->similarity);
1015

1016
				src->status = (diff->new_src == GIT_ITERATOR_TYPE_WORKDIR) ?
1017
					GIT_DELTA_UNTRACKED : GIT_DELTA_ADDED;
1018
				src->nfiles = 1;
1019 1020
				memset(&src->old_file, 0, sizeof(src->old_file));
				src->old_file.path = src->new_file.path;
1021
				src->old_file.flags |= GIT_DIFF_FLAG_VALID_ID;
1022

1023
				src->flags &= ~GIT_DIFF_FLAG__TO_SPLIT;
1024
				num_rewrites--;
1025 1026 1027

				num_updates++;
			} else {
1028
				assert(delta_is_split(src));
1029

1030
				if (best_match->similarity < opts.rename_from_rewrite_threshold)
1031 1032
					continue;

1033
				memcpy(&swap, &tgt->old_file, sizeof(swap));
1034

1035
				delta_make_rename(tgt, src, best_match->similarity);
1036 1037
				num_rewrites--;
				num_updates++;
1038

1039
				memcpy(&src->old_file, &swap, sizeof(src->old_file));
1040

1041 1042
				/* if we've just swapped the new element into the correct
				 * place, clear the SPLIT flag
1043
				 */
1044 1045
				if (tgt2src[s].idx == t &&
					tgt2src[s].similarity >
1046
					opts.rename_from_rewrite_threshold) {
1047 1048 1049 1050
					src->status     = GIT_DELTA_RENAMED;
					src->similarity = tgt2src[s].similarity;
					tgt2src[s].similarity = 0;
					src->flags &= ~GIT_DIFF_FLAG__TO_SPLIT;
1051 1052
					num_rewrites--;
				}
1053
				/* otherwise, if we just overwrote a source, update mapping */
1054
				else if (src2tgt[t].similarity > 0 && src2tgt[t].idx > t) {
1055
					/* what used to be at src t is now at src s */
1056
					tgt2src[src2tgt[t].idx].idx = s;
1057
				}
1058

1059 1060 1061 1062
				num_updates++;
			}
		}

1063
		else if (FLAG_SET(&opts, GIT_DIFF_FIND_COPIES)) {
1064
			if (tgt2src_copy[t].similarity < opts.copy_threshold)
1065 1066
				continue;

1067 1068 1069 1070
			/* always use best possible source for copy */
			best_match = &tgt2src_copy[t];
			src = GIT_VECTOR_GET(&diff->deltas, best_match->idx);

1071 1072 1073 1074 1075 1076 1077 1078 1079 1080
			if (delta_is_split(tgt)) {
				error = insert_delete_side_of_split(diff, &diff->deltas, tgt);
				if (error < 0)
					goto cleanup;
				num_rewrites--;
			}

			if (!delta_is_split(tgt) && !delta_is_new_only(tgt))
				continue;

1081 1082
			tgt->status     = GIT_DELTA_COPIED;
			tgt->similarity = best_match->similarity;
1083
			tgt->nfiles     = 2;
1084
			memcpy(&tgt->old_file, &src->old_file, sizeof(tgt->old_file));
1085
			tgt->flags &= ~GIT_DIFF_FLAG__TO_SPLIT;
1086 1087 1088

			num_updates++;
		}
1089 1090
	}

1091 1092 1093 1094
	/*
	 * Actually split and delete entries as needed
	 */

1095
	if (num_rewrites > 0 || num_updates > 0)
1096
		error = apply_splits_and_deletes(
1097
			diff, diff->deltas.length - num_rewrites,
1098 1099
			FLAG_SET(&opts, GIT_DIFF_BREAK_REWRITES) &&
			!FLAG_SET(&opts, GIT_DIFF_BREAK_REWRITES_FOR_RENAMES_ONLY));
1100

1101
cleanup:
1102 1103 1104
	git__free(tgt2src);
	git__free(src2tgt);
	git__free(tgt2src_copy);
1105

1106 1107 1108 1109 1110 1111
	if (sigcache) {
		for (t = 0; t < num_deltas * 2; ++t) {
			if (sigcache[t] != NULL)
				opts.metric->free_signature(sigcache[t], opts.metric->payload);
		}
		git__free(sigcache);
1112 1113
	}

1114 1115 1116
	if (!given_opts || !given_opts->metric)
		git__free(opts.metric);

1117
	return error;
1118 1119 1120
}

#undef FLAG_SET