diff_tform.c 29.3 KB
Newer Older
1
/*
Edward Thomson committed
2
 * Copyright (C) the libgit2 contributors. All rights reserved.
3 4 5 6
 *
 * This file is part of libgit2, distributed under the GNU GPL v2 with
 * a Linking Exception. For full terms see the included COPYING file.
 */
7 8

#include "diff_tform.h"
9

10
#include "git2/config.h"
11
#include "git2/blob.h"
12
#include "git2/sys/hashsig.h"
13 14

#include "diff.h"
15
#include "diff_generate.h"
16 17
#include "path.h"
#include "fileops.h"
18
#include "config.h"
19

20
git_diff_delta *git_diff__delta_dup(
21 22 23 24 25 26 27
	const git_diff_delta *d, git_pool *pool)
{
	git_diff_delta *delta = git__malloc(sizeof(git_diff_delta));
	if (!delta)
		return NULL;

	memcpy(delta, d, sizeof(git_diff_delta));
28
	GIT_DIFF_FLAG__CLEAR_INTERNAL(delta->flags);
29

30 31 32 33 34
	if (d->old_file.path != NULL) {
		delta->old_file.path = git_pool_strdup(pool, d->old_file.path);
		if (delta->old_file.path == NULL)
			goto fail;
	}
35

36
	if (d->new_file.path != d->old_file.path && d->new_file.path != NULL) {
37 38 39 40 41 42 43 44 45 46 47 48 49 50
		delta->new_file.path = git_pool_strdup(pool, d->new_file.path);
		if (delta->new_file.path == NULL)
			goto fail;
	} else {
		delta->new_file.path = delta->old_file.path;
	}

	return delta;

fail:
	git__free(delta);
	return NULL;
}

51
git_diff_delta *git_diff__merge_like_cgit(
52 53 54
	const git_diff_delta *a,
	const git_diff_delta *b,
	git_pool *pool)
55 56 57 58 59 60 61 62 63 64 65 66 67 68 69
{
	git_diff_delta *dup;

	/* Emulate C git for merging two diffs (a la 'git diff <sha>').
	 *
	 * When C git does a diff between the work dir and a tree, it actually
	 * diffs with the index but uses the workdir contents.  This emulates
	 * those choices so we can emulate the type of diff.
	 *
	 * We have three file descriptions here, let's call them:
	 *  f1 = a->old_file
	 *  f2 = a->new_file AND b->old_file
	 *  f3 = b->new_file
	 */

70 71
	/* If one of the diffs is a conflict, just dup it */
	if (b->status == GIT_DELTA_CONFLICTED)
72
		return git_diff__delta_dup(b, pool);
73
	if (a->status == GIT_DELTA_CONFLICTED)
74
		return git_diff__delta_dup(a, pool);
75

76 77
	/* if f2 == f3 or f2 is deleted, then just dup the 'a' diff */
	if (b->status == GIT_DELTA_UNMODIFIED || a->status == GIT_DELTA_DELETED)
78
		return git_diff__delta_dup(a, pool);
79 80

	/* otherwise, base this diff on the 'b' diff */
81
	if ((dup = git_diff__delta_dup(b, pool)) == NULL)
82 83 84
		return NULL;

	/* If 'a' status is uninteresting, then we're done */
85 86 87
	if (a->status == GIT_DELTA_UNMODIFIED ||
		a->status == GIT_DELTA_UNTRACKED ||
		a->status == GIT_DELTA_UNREADABLE)
88 89 90 91 92 93 94 95
		return dup;

	assert(b->status != GIT_DELTA_UNMODIFIED);

	/* A cgit exception is that the diff of a file that is only in the
	 * index (i.e. not in HEAD nor workdir) is given as empty.
	 */
	if (dup->status == GIT_DELTA_DELETED) {
96
		if (a->status == GIT_DELTA_ADDED) {
97
			dup->status = GIT_DELTA_UNMODIFIED;
98 99
			dup->nfiles = 2;
		}
100 101 102
		/* else don't overwrite DELETE status */
	} else {
		dup->status = a->status;
103
		dup->nfiles = a->nfiles;
104 105
	}

106
	git_oid_cpy(&dup->old_file.id, &a->old_file.id);
107 108 109 110 111 112 113
	dup->old_file.mode  = a->old_file.mode;
	dup->old_file.size  = a->old_file.size;
	dup->old_file.flags = a->old_file.flags;

	return dup;
}

114
int git_diff__merge(
115
	git_diff *onto, const git_diff *from, git_diff__merge_cb cb)
116 117 118 119 120
{
	int error = 0;
	git_pool onto_pool;
	git_vector onto_new;
	git_diff_delta *delta;
121
	bool ignore_case, reversed;
122 123 124 125 126 127 128
	unsigned int i, j;

	assert(onto && from);

	if (!from->deltas.length)
		return 0;

129 130 131 132 133
	ignore_case = ((onto->opts.flags & GIT_DIFF_IGNORE_CASE) != 0);
	reversed    = ((onto->opts.flags & GIT_DIFF_REVERSE) != 0);

	if (ignore_case != ((from->opts.flags & GIT_DIFF_IGNORE_CASE) != 0) ||
		reversed    != ((from->opts.flags & GIT_DIFF_REVERSE) != 0)) {
134
		giterr_set(GITERR_INVALID,
135
			"attempt to merge diffs created with conflicting options");
136 137 138
		return -1;
	}

139
	if (git_vector_init(&onto_new, onto->deltas.length, git_diff_delta__cmp) < 0)
140 141
		return -1;

142 143
	git_pool_init(&onto_pool, 1);

144 145 146
	for (i = 0, j = 0; i < onto->deltas.length || j < from->deltas.length; ) {
		git_diff_delta *o = GIT_VECTOR_GET(&onto->deltas, i);
		const git_diff_delta *f = GIT_VECTOR_GET(&from->deltas, j);
147 148
		int cmp = !f ? -1 : !o ? 1 :
			STRCMP_CASESELECT(ignore_case, o->old_file.path, f->old_file.path);
149 150

		if (cmp < 0) {
151
			delta = git_diff__delta_dup(o, &onto_pool);
152 153
			i++;
		} else if (cmp > 0) {
154
			delta = git_diff__delta_dup(f, &onto_pool);
155 156
			j++;
		} else {
157 158 159
			const git_diff_delta *left = reversed ? f : o;
			const git_diff_delta *right = reversed ? o : f;

160
			delta = cb(left, right, &onto_pool);
161 162 163 164 165 166 167
			i++;
			j++;
		}

		/* the ignore rules for the target may not match the source
		 * or the result of a merged delta could be skippable...
		 */
168
		if (delta && git_diff_delta__should_skip(&onto->opts, delta)) {
169 170 171 172 173 174 175 176 177 178 179
			git__free(delta);
			continue;
		}

		if ((error = !delta ? -1 : git_vector_insert(&onto_new, delta)) < 0)
			break;
	}

	if (!error) {
		git_vector_swap(&onto->deltas, &onto_new);
		git_pool_swap(&onto->pool, &onto_pool);
180 181 182 183 184

		if ((onto->opts.flags & GIT_DIFF_REVERSE) != 0)
			onto->old_src = from->old_src;
		else
			onto->new_src = from->new_src;
185 186 187 188 189 190 191 192

		/* prefix strings also come from old pool, so recreate those.*/
		onto->opts.old_prefix =
			git_pool_strdup_safe(&onto->pool, onto->opts.old_prefix);
		onto->opts.new_prefix =
			git_pool_strdup_safe(&onto->pool, onto->opts.new_prefix);
	}

193
	git_vector_free_deep(&onto_new);
194 195 196 197 198
	git_pool_clear(&onto_pool);

	return error;
}

199 200
int git_diff_merge(git_diff *onto, const git_diff *from)
{
201
	return git_diff__merge(onto, from, git_diff__merge_like_cgit);
202 203
}

Edward Thomson committed
204
int git_diff_find_similar__hashsig_for_file(
205 206
	void **out, const git_diff_file *f, const char *path, void *p)
{
Ben Straub committed
207
	git_hashsig_option_t opt = (git_hashsig_option_t)(intptr_t)p;
208

209
	GIT_UNUSED(f);
210
	return git_hashsig_create_fromfile((git_hashsig **)out, path, opt);
211
}
212

Edward Thomson committed
213
int git_diff_find_similar__hashsig_for_buf(
214 215
	void **out, const git_diff_file *f, const char *buf, size_t len, void *p)
{
Ben Straub committed
216
	git_hashsig_option_t opt = (git_hashsig_option_t)(intptr_t)p;
Edward Thomson committed
217

218
	GIT_UNUSED(f);
219
	return git_hashsig_create((git_hashsig **)out, buf, len, opt);
220
}
221

Edward Thomson committed
222
void git_diff_find_similar__hashsig_free(void *sig, void *payload)
223 224 225 226 227
{
	GIT_UNUSED(payload);
	git_hashsig_free(sig);
}

Edward Thomson committed
228
int git_diff_find_similar__calc_similarity(
229 230
	int *score, void *siga, void *sigb, void *payload)
{
231 232
	int error;

233
	GIT_UNUSED(payload);
234 235 236 237 238
	error = git_hashsig_compare(siga, sigb);
	if (error < 0)
		return error;

	*score = error;
239 240 241
	return 0;
}

242 243
#define DEFAULT_THRESHOLD 50
#define DEFAULT_BREAK_REWRITE_THRESHOLD 60
244
#define DEFAULT_RENAME_LIMIT 200
245 246

static int normalize_find_opts(
247
	git_diff *diff,
248
	git_diff_find_options *opts,
Russell Belfer committed
249
	const git_diff_find_options *given)
250 251
{
	git_config *cfg = NULL;
252
	git_hashsig_option_t hashsig_opts;
253

Ben Straub committed
254 255
	GITERR_CHECK_VERSION(given, GIT_DIFF_FIND_OPTIONS_VERSION, "git_diff_find_options");

256 257 258 259
	if (diff->repo != NULL &&
		git_repository_config__weakptr(&cfg, diff->repo) < 0)
		return -1;

260
	if (given)
261 262
		memcpy(opts, given, sizeof(*opts));

263 264 265
	if (!given ||
		 (given->flags & GIT_DIFF_FIND_ALL) == GIT_DIFF_FIND_BY_CONFIG)
	{
266
		if (cfg) {
267 268 269 270 271 272 273 274 275 276 277 278 279 280
			char *rule =
				git_config__get_string_force(cfg, "diff.renames", "true");
			int boolval;

			if (!git__parse_bool(&boolval, rule) && !boolval)
				/* don't set FIND_RENAMES if bool value is false */;
			else if (!strcasecmp(rule, "copies") || !strcasecmp(rule, "copy"))
				opts->flags |= GIT_DIFF_FIND_RENAMES | GIT_DIFF_FIND_COPIES;
			else
				opts->flags |= GIT_DIFF_FIND_RENAMES;

			git__free(rule);
		} else {
			/* set default flag */
281
			opts->flags |= GIT_DIFF_FIND_RENAMES;
282
		}
283
	}
284

285 286
	/* some flags imply others */

287 288 289 290 291 292 293 294 295 296
	if (opts->flags & GIT_DIFF_FIND_EXACT_MATCH_ONLY) {
		/* if we are only looking for exact matches, then don't turn
		 * MODIFIED items into ADD/DELETE pairs because it's too picky
		 */
		opts->flags &= ~(GIT_DIFF_FIND_REWRITES | GIT_DIFF_BREAK_REWRITES);

		/* similarly, don't look for self-rewrites to split */
		opts->flags &= ~GIT_DIFF_FIND_RENAMES_FROM_REWRITES;
	}

297 298 299 300 301 302
	if (opts->flags & GIT_DIFF_FIND_RENAMES_FROM_REWRITES)
		opts->flags |= GIT_DIFF_FIND_RENAMES;

	if (opts->flags & GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED)
		opts->flags |= GIT_DIFF_FIND_COPIES;

303 304 305
	if (opts->flags & GIT_DIFF_BREAK_REWRITES)
		opts->flags |= GIT_DIFF_FIND_REWRITES;

306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321
#define USE_DEFAULT(X) ((X) == 0 || (X) > 100)

	if (USE_DEFAULT(opts->rename_threshold))
		opts->rename_threshold = DEFAULT_THRESHOLD;

	if (USE_DEFAULT(opts->rename_from_rewrite_threshold))
		opts->rename_from_rewrite_threshold = DEFAULT_THRESHOLD;

	if (USE_DEFAULT(opts->copy_threshold))
		opts->copy_threshold = DEFAULT_THRESHOLD;

	if (USE_DEFAULT(opts->break_rewrite_threshold))
		opts->break_rewrite_threshold = DEFAULT_BREAK_REWRITE_THRESHOLD;

#undef USE_DEFAULT

322
	if (!opts->rename_limit) {
323 324 325 326
		if (cfg) {
			opts->rename_limit = git_config__get_int_force(
				cfg, "diff.renamelimit", DEFAULT_RENAME_LIMIT);
		}
327

328 329
		if (opts->rename_limit <= 0)
			opts->rename_limit = DEFAULT_RENAME_LIMIT;
330 331
	}

332
	/* assign the internal metric with whitespace flag as payload */
333
	if (!opts->metric) {
334 335 336
		opts->metric = git__malloc(sizeof(git_diff_similarity_metric));
		GITERR_CHECK_ALLOC(opts->metric);

Edward Thomson committed
337 338 339 340
		opts->metric->file_signature = git_diff_find_similar__hashsig_for_file;
		opts->metric->buffer_signature = git_diff_find_similar__hashsig_for_buf;
		opts->metric->free_signature = git_diff_find_similar__hashsig_free;
		opts->metric->similarity = git_diff_find_similar__calc_similarity;
341

342
		if (opts->flags & GIT_DIFF_FIND_IGNORE_WHITESPACE)
343
			hashsig_opts = GIT_HASHSIG_IGNORE_WHITESPACE;
344
		else if (opts->flags & GIT_DIFF_FIND_DONT_IGNORE_WHITESPACE)
345
			hashsig_opts = GIT_HASHSIG_NORMAL;
346
		else
347 348 349
			hashsig_opts = GIT_HASHSIG_SMART_WHITESPACE;
		hashsig_opts |= GIT_HASHSIG_ALLOW_SMALL_FILES;
		opts->metric->payload = (void *)hashsig_opts;
350 351
	}

352 353 354
	return 0;
}

355 356 357 358
static int insert_delete_side_of_split(
	git_diff *diff, git_vector *onto, const git_diff_delta *delta)
{
	/* make new record for DELETED side of split */
359
	git_diff_delta *deleted = git_diff__delta_dup(delta, &diff->pool);
360 361 362 363 364 365
	GITERR_CHECK_ALLOC(deleted);

	deleted->status = GIT_DELTA_DELETED;
	deleted->nfiles = 1;
	memset(&deleted->new_file, 0, sizeof(deleted->new_file));
	deleted->new_file.path = deleted->old_file.path;
366
	deleted->new_file.flags |= GIT_DIFF_FLAG_VALID_ID;
367 368 369 370

	return git_vector_insert(onto, deleted);
}

371
static int apply_splits_and_deletes(
372
	git_diff *diff, size_t expected_size, bool actually_split)
373 374 375
{
	git_vector onto = GIT_VECTOR_INIT;
	size_t i;
376
	git_diff_delta *delta;
377 378 379 380 381 382

	if (git_vector_init(&onto, expected_size, git_diff_delta__cmp) < 0)
		return -1;

	/* build new delta list without TO_DELETE and splitting TO_SPLIT */
	git_vector_foreach(&diff->deltas, i, delta) {
383
		if ((delta->flags & GIT_DIFF_FLAG__TO_DELETE) != 0)
384 385
			continue;

386
		if ((delta->flags & GIT_DIFF_FLAG__TO_SPLIT) != 0 && actually_split) {
387 388
			delta->similarity = 0;

389
			if (insert_delete_side_of_split(diff, &onto, delta) < 0)
390
				goto on_error;
391

392 393 394 395
			if (diff->new_src == GIT_ITERATOR_TYPE_WORKDIR)
				delta->status = GIT_DELTA_UNTRACKED;
			else
				delta->status = GIT_DELTA_ADDED;
396
			delta->nfiles = 1;
397 398
			memset(&delta->old_file, 0, sizeof(delta->old_file));
			delta->old_file.path = delta->new_file.path;
399
			delta->old_file.flags |= GIT_DIFF_FLAG_VALID_ID;
400 401
		}

402 403 404 405 406 407 408 409 410
		/* clean up delta before inserting into new list */
		GIT_DIFF_FLAG__CLEAR_INTERNAL(delta->flags);

		if (delta->status != GIT_DELTA_COPIED &&
			delta->status != GIT_DELTA_RENAMED &&
			(delta->status != GIT_DELTA_MODIFIED || actually_split))
			delta->similarity = 0;

		/* insert into new list */
411 412
		if (git_vector_insert(&onto, delta) < 0)
			goto on_error;
413 414
	}

415
	/* cannot return an error past this point */
416 417

	/* free deltas from old list that didn't make it to the new one */
418
	git_vector_foreach(&diff->deltas, i, delta) {
419
		if ((delta->flags & GIT_DIFF_FLAG__TO_DELETE) != 0)
420
			git__free(delta);
421 422
	}

423 424 425
	/* swap new delta list into place */
	git_vector_swap(&diff->deltas, &onto);
	git_vector_free(&onto);
426
	git_vector_sort(&diff->deltas);
427 428

	return 0;
429 430

on_error:
431
	git_vector_free_deep(&onto);
432 433

	return -1;
434 435
}

436
GIT_INLINE(git_diff_file *) similarity_get_file(git_diff *diff, size_t idx)
437 438 439 440
{
	git_diff_delta *delta = git_vector_get(&diff->deltas, idx / 2);
	return (idx & 1) ? &delta->new_file : &delta->old_file;
}
441

442 443 444 445 446 447
typedef struct {
	size_t idx;
	git_iterator_type_t src;
	git_repository *repo;
	git_diff_file *file;
	git_buf data;
448
	git_odb_object *odb_obj;
449 450 451
	git_blob *blob;
} similarity_info;

452
static int similarity_init(
453
	similarity_info *info, git_diff *diff, size_t file_idx)
454 455 456 457 458
{
	info->idx  = file_idx;
	info->src  = (file_idx & 1) ? diff->new_src : diff->old_src;
	info->repo = diff->repo;
	info->file = similarity_get_file(diff, file_idx);
459
	info->odb_obj = NULL;
460 461
	info->blob = NULL;
	git_buf_init(&info->data, 0);
462

463
	if (info->file->size > 0 || info->src == GIT_ITERATOR_TYPE_WORKDIR)
464
		return 0;
465

466 467
	return git_diff_file__resolve_zero_size(
		info->file, &info->odb_obj, info->repo);
468
}
469

470
static int similarity_sig(
471 472 473 474 475
	similarity_info *info,
	const git_diff_find_options *opts,
	void **cache)
{
	int error = 0;
476
	git_diff_file *file = info->file;
477

478 479 480 481
	if (info->src == GIT_ITERATOR_TYPE_WORKDIR) {
		if ((error = git_buf_joinpath(
			&info->data, git_repository_workdir(info->repo), file->path)) < 0)
			return error;
482

483 484 485
		/* if path is not a regular file, just skip this item */
		if (!git_path_isfile(info->data.ptr))
			return 0;
486 487 488 489 490 491 492

		/* TODO: apply wd-to-odb filters to file data if necessary */

		error = opts->metric->file_signature(
			&cache[info->idx], info->file,
			info->data.ptr, opts->metric->payload);
	} else {
493 494 495 496 497 498 499 500
		/* if we didn't initially know the size, we might have an odb_obj
		 * around from earlier, so convert that, otherwise load the blob now
		 */
		if (info->odb_obj != NULL)
			error = git_object__from_odb_object(
				(git_object **)&info->blob, info->repo,
				info->odb_obj, GIT_OBJ_BLOB);
		else
501
			error = git_blob_lookup(&info->blob, info->repo, &file->id);
502 503 504 505 506

		if (error < 0) {
			/* if lookup fails, just skip this item in similarity calc */
			giterr_clear();
		} else {
507 508 509 510 511 512 513
			size_t sz;

			/* index size may not be actual blob size if filtered */
			if (file->size != git_blob_rawsize(info->blob))
				file->size = git_blob_rawsize(info->blob);

			sz = (size_t)(git__is_sizet(file->size) ? file->size : -1);
514 515 516 517 518

			error = opts->metric->buffer_signature(
				&cache[info->idx], info->file,
				git_blob_rawcontent(info->blob), sz, opts->metric->payload);
		}
519 520 521 522 523
	}

	return error;
}

524 525 526 527 528 529 530 531 532 533 534
static void similarity_unload(similarity_info *info)
{
	if (info->odb_obj)
		git_odb_object_free(info->odb_obj);

	if (info->blob)
		git_blob_free(info->blob);
	else
		git_buf_free(&info->data);
}

535
#define FLAG_SET(opts,flag_name) (((opts)->flags & flag_name) != 0)
536 537 538 539 540

/* - score < 0 means files cannot be compared
 * - score >= 100 means files are exact match
 * - score == 0 means files are completely different
 */
541
static int similarity_measure(
542
	int *score,
543
	git_diff *diff,
544
	const git_diff_find_options *opts,
545 546 547 548 549 550
	void **cache,
	size_t a_idx,
	size_t b_idx)
{
	git_diff_file *a_file = similarity_get_file(diff, a_idx);
	git_diff_file *b_file = similarity_get_file(diff, b_idx);
551
	bool exact_match = FLAG_SET(opts, GIT_DIFF_FIND_EXACT_MATCH_ONLY);
552 553
	int error = 0;
	similarity_info a_info, b_info;
554 555

	*score = -1;
556

557 558
	/* don't try to compare things that aren't files */
	if (!GIT_MODE_ISBLOB(a_file->mode) || !GIT_MODE_ISBLOB(b_file->mode))
559 560
		return 0;

561
	/* if exact match is requested, force calculation of missing OIDs now */
562
	if (exact_match) {
563
		if (git_oid_iszero(&a_file->id) &&
564
			diff->old_src == GIT_ITERATOR_TYPE_WORKDIR &&
565 566
			!git_diff__oid_for_file(&a_file->id,
				diff, a_file->path, a_file->mode, a_file->size))
567
			a_file->flags |= GIT_DIFF_FLAG_VALID_ID;
568

569
		if (git_oid_iszero(&b_file->id) &&
570
			diff->new_src == GIT_ITERATOR_TYPE_WORKDIR &&
571 572
			!git_diff__oid_for_file(&b_file->id,
				diff, b_file->path, b_file->mode, b_file->size))
573
			b_file->flags |= GIT_DIFF_FLAG_VALID_ID;
574 575 576
	}

	/* check OID match as a quick test */
577
	if (git_oid__cmp(&a_file->id, &b_file->id) == 0) {
578 579 580 581 582 583 584 585 586
		*score = 100;
		return 0;
	}

	/* don't calculate signatures if we are doing exact match */
	if (exact_match) {
		*score = 0;
		return 0;
	}
587

588 589
	memset(&a_info, 0, sizeof(a_info));
	memset(&b_info, 0, sizeof(b_info));
590

591 592 593 594 595
	/* set up similarity data (will try to update missing file sizes) */
	if (!cache[a_idx] && (error = similarity_init(&a_info, diff, a_idx)) < 0)
		return error;
	if (!cache[b_idx] && (error = similarity_init(&b_info, diff, b_idx)) < 0)
		goto cleanup;
596

Russell Belfer committed
597
	/* check if file sizes are nowhere near each other */
598 599
	if (a_file->size > 127 &&
		b_file->size > 127 &&
600 601
		(a_file->size > (b_file->size << 3) ||
		 b_file->size > (a_file->size << 3)))
602
		goto cleanup;
603

604
	/* update signature cache if needed */
605 606 607 608 609 610 611 612
	if (!cache[a_idx]) {
		if ((error = similarity_sig(&a_info, opts, cache)) < 0)
			goto cleanup;
	}
	if (!cache[b_idx]) {
		if ((error = similarity_sig(&b_info, opts, cache)) < 0)
			goto cleanup;
	}
nulltoken committed
613

614 615 616 617 618 619
	/* calculate similarity provided that the metric choose to process
	 * both the a and b files (some may not if file is too big, etc).
	 */
	if (cache[a_idx] && cache[b_idx])
		error = opts->metric->similarity(
			score, cache[a_idx], cache[b_idx], opts->metric->payload);
620

621
cleanup:
622 623 624 625
	similarity_unload(&a_info);
	similarity_unload(&b_info);

	return error;
626 627
}

628
static int calc_self_similarity(
629
	git_diff *diff,
630 631 632
	const git_diff_find_options *opts,
	size_t delta_idx,
	void **cache)
633
{
634 635 636 637 638 639 640 641 642 643 644 645
	int error, similarity = -1;
	git_diff_delta *delta = GIT_VECTOR_GET(&diff->deltas, delta_idx);

	if ((delta->flags & GIT_DIFF_FLAG__HAS_SELF_SIMILARITY) != 0)
		return 0;

	error = similarity_measure(
		&similarity, diff, opts, cache, 2 * delta_idx, 2 * delta_idx + 1);
	if (error < 0)
		return error;

	if (similarity >= 0) {
646
		delta->similarity = (uint16_t)similarity;
647 648 649 650 651 652 653
		delta->flags |= GIT_DIFF_FLAG__HAS_SELF_SIMILARITY;
	}

	return 0;
}

static bool is_rename_target(
654
	git_diff *diff,
655 656 657 658 659 660 661 662 663 664 665
	const git_diff_find_options *opts,
	size_t delta_idx,
	void **cache)
{
	git_diff_delta *delta = GIT_VECTOR_GET(&diff->deltas, delta_idx);

	/* skip things that aren't plain blobs */
	if (!GIT_MODE_ISBLOB(delta->new_file.mode))
		return false;

	/* only consider ADDED, RENAMED, COPIED, and split MODIFIED as
666
	 * targets; maybe include UNTRACKED if requested.
667 668 669 670
	 */
	switch (delta->status) {
	case GIT_DELTA_UNMODIFIED:
	case GIT_DELTA_DELETED:
671 672
	case GIT_DELTA_IGNORED:
	case GIT_DELTA_CONFLICTED:
673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688
		return false;

	case GIT_DELTA_MODIFIED:
		if (!FLAG_SET(opts, GIT_DIFF_FIND_REWRITES) &&
			!FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES))
			return false;

		if (calc_self_similarity(diff, opts, delta_idx, cache) < 0)
			return false;

		if (FLAG_SET(opts, GIT_DIFF_BREAK_REWRITES) &&
			delta->similarity < opts->break_rewrite_threshold) {
			delta->flags |= GIT_DIFF_FLAG__TO_SPLIT;
			break;
		}
		if (FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES) &&
689 690
			delta->similarity < opts->rename_from_rewrite_threshold) {
			delta->flags |= GIT_DIFF_FLAG__TO_SPLIT;
691
			break;
692
		}
693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709

		return false;

	case GIT_DELTA_UNTRACKED:
		if (!FLAG_SET(opts, GIT_DIFF_FIND_FOR_UNTRACKED))
			return false;
		break;

	default: /* all other status values should be checked */
		break;
	}

	delta->flags |= GIT_DIFF_FLAG__IS_RENAME_TARGET;
	return true;
}

static bool is_rename_source(
710
	git_diff *diff,
711 712 713 714 715 716 717 718 719 720 721 722 723
	const git_diff_find_options *opts,
	size_t delta_idx,
	void **cache)
{
	git_diff_delta *delta = GIT_VECTOR_GET(&diff->deltas, delta_idx);

	/* skip things that aren't blobs */
	if (!GIT_MODE_ISBLOB(delta->old_file.mode))
		return false;

	switch (delta->status) {
	case GIT_DELTA_ADDED:
	case GIT_DELTA_UNTRACKED:
724
	case GIT_DELTA_UNREADABLE:
725
	case GIT_DELTA_IGNORED:
726
	case GIT_DELTA_CONFLICTED:
727 728 729 730 731 732 733 734 735
		return false;

	case GIT_DELTA_DELETED:
	case GIT_DELTA_TYPECHANGE:
		break;

	case GIT_DELTA_UNMODIFIED:
		if (!FLAG_SET(opts, GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED))
			return false;
736
		if (FLAG_SET(opts, GIT_DIFF_FIND_REMOVE_UNMODIFIED))
737
			delta->flags |= GIT_DIFF_FLAG__TO_DELETE;
738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779
		break;

	default: /* MODIFIED, RENAMED, COPIED */
		/* if we're finding copies, this could be a source */
		if (FLAG_SET(opts, GIT_DIFF_FIND_COPIES))
			break;

		/* otherwise, this is only a source if we can split it */
		if (!FLAG_SET(opts, GIT_DIFF_FIND_REWRITES) &&
			!FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES))
			return false;

		if (calc_self_similarity(diff, opts, delta_idx, cache) < 0)
			return false;

		if (FLAG_SET(opts, GIT_DIFF_BREAK_REWRITES) &&
			delta->similarity < opts->break_rewrite_threshold) {
			delta->flags |= GIT_DIFF_FLAG__TO_SPLIT;
			break;
		}

		if (FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES) &&
			delta->similarity < opts->rename_from_rewrite_threshold)
			break;

		return false;
	}

	delta->flags |= GIT_DIFF_FLAG__IS_RENAME_SOURCE;
	return true;
}

GIT_INLINE(bool) delta_is_split(git_diff_delta *delta)
{
	return (delta->status == GIT_DELTA_TYPECHANGE ||
			(delta->flags & GIT_DIFF_FLAG__TO_SPLIT) != 0);
}

GIT_INLINE(bool) delta_is_new_only(git_diff_delta *delta)
{
	return (delta->status == GIT_DELTA_ADDED ||
			delta->status == GIT_DELTA_UNTRACKED ||
780
			delta->status == GIT_DELTA_UNREADABLE ||
781
			delta->status == GIT_DELTA_IGNORED);
782
}
783

784
GIT_INLINE(void) delta_make_rename(
785
	git_diff_delta *to, const git_diff_delta *from, uint16_t similarity)
786 787 788
{
	to->status     = GIT_DELTA_RENAMED;
	to->similarity = similarity;
789
	to->nfiles     = 2;
790 791 792 793
	memcpy(&to->old_file, &from->old_file, sizeof(to->old_file));
	to->flags &= ~GIT_DIFF_FLAG__TO_SPLIT;
}

794
typedef struct {
795 796
	size_t   idx;
	uint16_t similarity;
797 798
} diff_find_match;

799
int git_diff_find_similar(
800
	git_diff *diff,
Russell Belfer committed
801
	const git_diff_find_options *given_opts)
802
{
803
	size_t s, t;
804 805
	int error = 0, result;
	uint16_t similarity;
806
	git_diff_delta *src, *tgt;
807
	git_diff_find_options opts = GIT_DIFF_FIND_OPTIONS_INIT;
808 809
	size_t num_deltas, num_srcs = 0, num_tgts = 0;
	size_t tried_srcs = 0, tried_tgts = 0;
810
	size_t num_rewrites = 0, num_updates = 0, num_bumped = 0;
811
	size_t sigcache_size;
812
	void **sigcache = NULL; /* cache of similarity metric file signatures */
813 814 815 816
	diff_find_match *tgt2src = NULL;
	diff_find_match *src2tgt = NULL;
	diff_find_match *tgt2src_copy = NULL;
	diff_find_match *best_match;
817
	git_diff_file swap;
818

819
	if ((error = normalize_find_opts(diff, &opts, given_opts)) < 0)
820 821
		return error;

822 823
	num_deltas = diff->deltas.length;

824
	/* TODO: maybe abort if deltas.length > rename_limit ??? */
825
	if (!git__is_uint32(num_deltas))
826 827 828 829 830
		goto cleanup;

	/* No flags set; nothing to do */
	if ((opts.flags & GIT_DIFF_FIND_ALL) == 0)
		goto cleanup;
831

832 833
	GITERR_CHECK_ALLOC_MULTIPLY(&sigcache_size, num_deltas, 2);
	sigcache = git__calloc(sigcache_size, sizeof(void *));
834 835 836 837 838 839 840
	GITERR_CHECK_ALLOC(sigcache);

	/* Label rename sources and targets
	 *
	 * This will also set self-similarity scores for MODIFIED files and
	 * mark them for splitting if break-rewrites is enabled
	 */
841 842
	git_vector_foreach(&diff->deltas, t, tgt) {
		if (is_rename_source(diff, &opts, t, sigcache))
843 844
			++num_srcs;

845
		if (is_rename_target(diff, &opts, t, sigcache))
846
			++num_tgts;
847 848 849

		if ((tgt->flags & GIT_DIFF_FLAG__TO_SPLIT) != 0)
			num_rewrites++;
850
	}
851

852 853 854
	/* if there are no candidate srcs or tgts, we're done */
	if (!num_srcs || !num_tgts)
		goto cleanup;
855

856 857 858 859 860 861 862 863 864
	src2tgt = git__calloc(num_deltas, sizeof(diff_find_match));
	GITERR_CHECK_ALLOC(src2tgt);
	tgt2src = git__calloc(num_deltas, sizeof(diff_find_match));
	GITERR_CHECK_ALLOC(tgt2src);

	if (FLAG_SET(&opts, GIT_DIFF_FIND_COPIES)) {
		tgt2src_copy = git__calloc(num_deltas, sizeof(diff_find_match));
		GITERR_CHECK_ALLOC(tgt2src_copy);
	}
865

866 867 868
	/*
	 * Find best-fit matches for rename / copy candidates
	 */
869

870 871
find_best_matches:
	tried_tgts = num_bumped = 0;
872

873
	git_vector_foreach(&diff->deltas, t, tgt) {
874
		/* skip things that are not rename targets */
875
		if ((tgt->flags & GIT_DIFF_FLAG__IS_RENAME_TARGET) == 0)
876 877
			continue;

878
		tried_srcs = 0;
879

880
		git_vector_foreach(&diff->deltas, s, src) {
881
			/* skip things that are not rename sources */
882
			if ((src->flags & GIT_DIFF_FLAG__IS_RENAME_SOURCE) == 0)
883 884
				continue;

885
			/* calculate similarity for this pair and find best match */
886
			if (s == t)
887
				result = -1; /* don't measure self-similarity here */
888
			else if ((error = similarity_measure(
889
				&result, diff, &opts, sigcache, 2 * s, 2 * t + 1)) < 0)
890
				goto cleanup;
891

892
			if (result < 0)
893
				continue;
894
			similarity = (uint16_t)result;
895 896

			/* is this a better rename? */
897 898
			if (tgt2src[t].similarity < similarity &&
				src2tgt[s].similarity < similarity)
899
			{
900 901 902 903 904 905 906 907
				/* eject old mapping */
				if (src2tgt[s].similarity > 0) {
					tgt2src[src2tgt[s].idx].similarity = 0;
					num_bumped++;
				}
				if (tgt2src[t].similarity > 0) {
					src2tgt[tgt2src[t].idx].similarity = 0;
					num_bumped++;
908 909
				}

910
				/* write new mapping */
911 912 913 914
				tgt2src[t].idx = s;
				tgt2src[t].similarity = similarity;
				src2tgt[s].idx = t;
				src2tgt[s].similarity = similarity;
915
			}
916

917 918
			/* keep best absolute match for copies */
			if (tgt2src_copy != NULL &&
919
				tgt2src_copy[t].similarity < similarity)
920
			{
921 922
				tgt2src_copy[t].idx = s;
				tgt2src_copy[t].similarity = similarity;
923
			}
924 925 926 927

			if (++tried_srcs >= num_srcs)
				break;

928
			/* cap on maximum targets we'll examine (per "tgt" file) */
929 930
			if (tried_srcs > opts.rename_limit)
				break;
931
		}
932 933 934

		if (++tried_tgts >= num_tgts)
			break;
935 936
	}

937 938 939 940 941 942 943
	if (num_bumped > 0) /* try again if we bumped some items */
		goto find_best_matches;

	/*
	 * Rewrite the diffs with renames / copies
	 */

944
	git_vector_foreach(&diff->deltas, t, tgt) {
945
		/* skip things that are not rename targets */
946
		if ((tgt->flags & GIT_DIFF_FLAG__IS_RENAME_TARGET) == 0)
947
			continue;
948

949
		/* check if this delta was the target of a similarity */
950 951 952 953 954
		if (tgt2src[t].similarity)
			best_match = &tgt2src[t];
		else if (tgt2src_copy && tgt2src_copy[t].similarity)
			best_match = &tgt2src_copy[t];
		else
955
			continue;
956

957 958
		s = best_match->idx;
		src = GIT_VECTOR_GET(&diff->deltas, s);
959

960 961 962 963 964 965
		/* possible scenarios:
		 * 1. from DELETE to ADD/UNTRACK/IGNORE = RENAME
		 * 2. from DELETE to SPLIT/TYPECHANGE = RENAME + DELETE
		 * 3. from SPLIT/TYPECHANGE to ADD/UNTRACK/IGNORE = ADD + RENAME
		 * 4. from SPLIT/TYPECHANGE to SPLIT/TYPECHANGE = RENAME + SPLIT
		 * 5. from OTHER to ADD/UNTRACK/IGNORE = OTHER + COPY
966 967
		 */

968
		if (src->status == GIT_DELTA_DELETED) {
969

970
			if (delta_is_new_only(tgt)) {
971

972
				if (best_match->similarity < opts.rename_threshold)
973
					continue;
974

975
				delta_make_rename(tgt, src, best_match->similarity);
976

977
				src->flags |= GIT_DIFF_FLAG__TO_DELETE;
978 979
				num_rewrites++;
			} else {
980
				assert(delta_is_split(tgt));
981

982
				if (best_match->similarity < opts.rename_from_rewrite_threshold)
983
					continue;
984

985
				memcpy(&swap, &tgt->old_file, sizeof(swap));
986

987
				delta_make_rename(tgt, src, best_match->similarity);
988 989
				num_rewrites--;

990
				assert(src->status == GIT_DELTA_DELETED);
991 992 993
				memcpy(&src->old_file, &swap, sizeof(src->old_file));
				memset(&src->new_file, 0, sizeof(src->new_file));
				src->new_file.path = src->old_file.path;
994
				src->new_file.flags |= GIT_DIFF_FLAG_VALID_ID;
995

996
				num_updates++;
997 998 999

				if (src2tgt[t].similarity > 0 && src2tgt[t].idx > t) {
					/* what used to be at src t is now at src s */
1000
					tgt2src[src2tgt[t].idx].idx = s;
1001
				}
1002 1003 1004
			}
		}

1005
		else if (delta_is_split(src)) {
1006

1007
			if (delta_is_new_only(tgt)) {
1008

1009
				if (best_match->similarity < opts.rename_threshold)
1010
					continue;
1011

1012
				delta_make_rename(tgt, src, best_match->similarity);
1013

1014
				src->status = (diff->new_src == GIT_ITERATOR_TYPE_WORKDIR) ?
1015
					GIT_DELTA_UNTRACKED : GIT_DELTA_ADDED;
1016
				src->nfiles = 1;
1017 1018
				memset(&src->old_file, 0, sizeof(src->old_file));
				src->old_file.path = src->new_file.path;
1019
				src->old_file.flags |= GIT_DIFF_FLAG_VALID_ID;
1020

1021
				src->flags &= ~GIT_DIFF_FLAG__TO_SPLIT;
1022
				num_rewrites--;
1023 1024 1025

				num_updates++;
			} else {
1026
				assert(delta_is_split(src));
1027

1028
				if (best_match->similarity < opts.rename_from_rewrite_threshold)
1029 1030
					continue;

1031
				memcpy(&swap, &tgt->old_file, sizeof(swap));
1032

1033
				delta_make_rename(tgt, src, best_match->similarity);
1034 1035
				num_rewrites--;
				num_updates++;
1036

1037
				memcpy(&src->old_file, &swap, sizeof(src->old_file));
1038

1039 1040
				/* if we've just swapped the new element into the correct
				 * place, clear the SPLIT flag
1041
				 */
1042 1043
				if (tgt2src[s].idx == t &&
					tgt2src[s].similarity >
1044
					opts.rename_from_rewrite_threshold) {
1045 1046 1047 1048
					src->status     = GIT_DELTA_RENAMED;
					src->similarity = tgt2src[s].similarity;
					tgt2src[s].similarity = 0;
					src->flags &= ~GIT_DIFF_FLAG__TO_SPLIT;
1049 1050
					num_rewrites--;
				}
1051
				/* otherwise, if we just overwrote a source, update mapping */
1052
				else if (src2tgt[t].similarity > 0 && src2tgt[t].idx > t) {
1053
					/* what used to be at src t is now at src s */
1054
					tgt2src[src2tgt[t].idx].idx = s;
1055
				}
1056

1057 1058 1059 1060
				num_updates++;
			}
		}

1061
		else if (FLAG_SET(&opts, GIT_DIFF_FIND_COPIES)) {
1062
			if (tgt2src_copy[t].similarity < opts.copy_threshold)
1063 1064
				continue;

1065 1066 1067 1068
			/* always use best possible source for copy */
			best_match = &tgt2src_copy[t];
			src = GIT_VECTOR_GET(&diff->deltas, best_match->idx);

1069 1070 1071 1072 1073 1074 1075 1076 1077 1078
			if (delta_is_split(tgt)) {
				error = insert_delete_side_of_split(diff, &diff->deltas, tgt);
				if (error < 0)
					goto cleanup;
				num_rewrites--;
			}

			if (!delta_is_split(tgt) && !delta_is_new_only(tgt))
				continue;

1079 1080
			tgt->status     = GIT_DELTA_COPIED;
			tgt->similarity = best_match->similarity;
1081
			tgt->nfiles     = 2;
1082
			memcpy(&tgt->old_file, &src->old_file, sizeof(tgt->old_file));
1083
			tgt->flags &= ~GIT_DIFF_FLAG__TO_SPLIT;
1084 1085 1086

			num_updates++;
		}
1087 1088
	}

1089 1090 1091 1092
	/*
	 * Actually split and delete entries as needed
	 */

1093
	if (num_rewrites > 0 || num_updates > 0)
1094
		error = apply_splits_and_deletes(
1095
			diff, diff->deltas.length - num_rewrites,
1096 1097
			FLAG_SET(&opts, GIT_DIFF_BREAK_REWRITES) &&
			!FLAG_SET(&opts, GIT_DIFF_BREAK_REWRITES_FOR_RENAMES_ONLY));
1098

1099
cleanup:
1100 1101 1102
	git__free(tgt2src);
	git__free(src2tgt);
	git__free(tgt2src_copy);
1103

1104 1105 1106 1107 1108 1109
	if (sigcache) {
		for (t = 0; t < num_deltas * 2; ++t) {
			if (sigcache[t] != NULL)
				opts.metric->free_signature(sigcache[t], opts.metric->payload);
		}
		git__free(sigcache);
1110 1111
	}

1112 1113 1114
	if (!given_opts || !given_opts->metric)
		git__free(opts.metric);

1115
	return error;
1116 1117 1118
}

#undef FLAG_SET