/*
 * Copyright (C) 2012 the libgit2 contributors
 *
 * This file is part of libgit2, distributed under the GNU GPL v2 with
 * a Linking Exception. For full terms see the included COPYING file.
 */
#include "common.h"
#include "diff.h"
#include "git2/config.h"

/*
 * Make a heap copy of delta `d`, duplicating its path strings into `pool`.
 *
 * The struct itself is obtained from git__malloc (release it with
 * git__free); the path strings are copied with git_pool_strdup.  When the
 * source uses one pointer for both the old and new path, the copy shares
 * a single duplicated string as well.
 *
 * Returns NULL if the struct allocation or either string copy fails.
 */
static git_diff_delta *diff_delta__dup(
	const git_diff_delta *d, git_pool *pool)
{
	git_diff_delta *delta = git__malloc(sizeof(git_diff_delta));
	if (!delta)
		return NULL;

	memcpy(delta, d, sizeof(git_diff_delta));

	delta->old_file.path = git_pool_strdup(pool, d->old_file.path);
	if (delta->old_file.path == NULL)
		goto fail;

	if (d->new_file.path != d->old_file.path) {
		delta->new_file.path = git_pool_strdup(pool, d->new_file.path);
		if (delta->new_file.path == NULL)
			goto fail;
	} else {
		delta->new_file.path = delta->old_file.path;
	}

	return delta;

fail:
	git__free(delta);
	return NULL;
}

/*
 * Combine two deltas for the same path (`a` from the older diff, `b` from
 * the newer one) into a newly allocated delta, or return NULL if the
 * duplication fails.
 */
static git_diff_delta *diff_delta__merge_like_cgit(
	const git_diff_delta *a, const git_diff_delta *b, git_pool *pool)
{
	git_diff_delta *dup;

	/* Emulate C git for merging two diffs (a la 'git diff <sha>').
	 *
	 * When C git does a diff between the work dir and a tree, it actually
	 * diffs with the index but uses the workdir contents.  This emulates
	 * those choices so we can emulate the type of diff.
	 *
	 * We have three file descriptions here, let's call them:
	 *  f1 = a->old_file
	 *  f2 = a->new_file AND b->old_file
	 *  f3 = b->new_file
	 */

	/* if f2 == f3 or f2 is deleted, then just dup the 'a' diff */
	if (b->status == GIT_DELTA_UNMODIFIED || a->status == GIT_DELTA_DELETED)
		return diff_delta__dup(a, pool);

	/* otherwise, base this diff on the 'b' diff */
	if ((dup = diff_delta__dup(b, pool)) == NULL)
		return NULL;

	/* If 'a' status is uninteresting, then we're done */
	if (a->status == GIT_DELTA_UNMODIFIED)
		return dup;

	assert(a->status != GIT_DELTA_UNMODIFIED);
	assert(b->status != GIT_DELTA_UNMODIFIED);

	/* A cgit exception is that the diff of a file that is only in the
	 * index (i.e. not in HEAD nor workdir) is given as empty.
	 */
	if (dup->status == GIT_DELTA_DELETED) {
		if (a->status == GIT_DELTA_ADDED)
			dup->status = GIT_DELTA_UNMODIFIED;
		/* else don't overwrite DELETE status */
	} else {
		dup->status = a->status;
	}

	/* the "old" side of the merged delta always comes from 'a' (f1) */
	git_oid_cpy(&dup->old_file.oid, &a->old_file.oid);
	dup->old_file.mode = a->old_file.mode;
	dup->old_file.size = a->old_file.size;
	dup->old_file.flags = a->old_file.flags;

	return dup;
}

/*
 * Merge the deltas of `from` into `onto`.
 *
 * Both delta lists are walked in parallel, ordered by old_file.path
 * (compared case-insensitively when GIT_DIFF_DELTAS_ARE_ICASE is set).
 * Deltas whose paths match are combined with diff_delta__merge_like_cgit;
 * all others are duplicated.  Deltas that `onto`'s options say to skip
 * are dropped.
 *
 * On success `onto` takes over the new delta vector and string pool, its
 * new_src is copied from `from`, and its prefix strings are re-duplicated
 * into the new pool.  On failure `onto` is left untouched.
 *
 * Returns 0 on success (also when `from` has no deltas), or a negative
 * value on failure.
 */
int git_diff_merge(
	git_diff_list *onto,
	const git_diff_list *from)
{
	int error = 0;
	git_pool onto_pool;
	git_vector onto_new;
	git_diff_delta *delta;
	bool ignore_case = false;
	unsigned int i, j;

	assert(onto && from);

	if (!from->deltas.length)
		return 0;

	if (git_vector_init(
			&onto_new, onto->deltas.length, git_diff_delta__cmp) < 0)
		return -1;

	if (git_pool_init(&onto_pool, 1, 0) < 0) {
		/* don't leak the vector storage allocated just above */
		git_vector_free(&onto_new);
		return -1;
	}

	if ((onto->opts.flags & GIT_DIFF_DELTAS_ARE_ICASE) != 0 ||
		(from->opts.flags & GIT_DIFF_DELTAS_ARE_ICASE) != 0)
	{
		ignore_case = true;

		/* This function currently only supports merging diff lists that
		 * are sorted identically. */
		assert((onto->opts.flags & GIT_DIFF_DELTAS_ARE_ICASE) != 0 &&
				(from->opts.flags & GIT_DIFF_DELTAS_ARE_ICASE) != 0);
	}

	for (i = 0, j = 0; i < onto->deltas.length || j < from->deltas.length; ) {
		git_diff_delta *o = GIT_VECTOR_GET(&onto->deltas, i);
		const git_diff_delta *f = GIT_VECTOR_GET(&from->deltas, j);
		/* an exhausted list sorts after everything in the other one */
		int cmp = !f ? -1 : !o ? 1 :
			STRCMP_CASESELECT(ignore_case, o->old_file.path, f->old_file.path);

		if (cmp < 0) {
			delta = diff_delta__dup(o, &onto_pool);
			i++;
		} else if (cmp > 0) {
			delta = diff_delta__dup(f, &onto_pool);
			j++;
		} else {
			delta = diff_delta__merge_like_cgit(o, f, &onto_pool);
			i++;
			j++;
		}

		/* allocation failure: bail out before anyone dereferences delta */
		if (!delta) {
			error = -1;
			break;
		}

		/* the ignore rules for the target may not match the source
		 * or the result of a merged delta could be skippable...
		 */
		if (git_diff_delta__should_skip(&onto->opts, delta)) {
			git__free(delta);
			continue;
		}

		if ((error = git_vector_insert(&onto_new, delta)) < 0) {
			/* delta never made it into the vector, so free it here */
			git__free(delta);
			break;
		}
	}

	if (!error) {
		git_vector_swap(&onto->deltas, &onto_new);
		git_pool_swap(&onto->pool, &onto_pool);
		onto->new_src = from->new_src;

		/* prefix strings also come from old pool, so recreate those.*/
		onto->opts.old_prefix =
			git_pool_strdup_safe(&onto->pool, onto->opts.old_prefix);
		onto->opts.new_prefix =
			git_pool_strdup_safe(&onto->pool, onto->opts.new_prefix);
	}

	/* after a successful swap this releases the OLD deltas and pool;
	 * after a failure it releases the partially built new ones
	 */
	git_vector_foreach(&onto_new, i, delta)
		git__free(delta);
	git_vector_free(&onto_new);
	git_pool_clear(&onto_pool);

	return error;
}

#define DEFAULT_THRESHOLD 50
#define DEFAULT_BREAK_REWRITE_THRESHOLD 60
#define DEFAULT_TARGET_LIMIT 200

/*
 * Fill `opts` with a fully populated set of find options.
 *
 * If `given` is non-NULL it is copied; otherwise defaults are used and
 * the "diff.renames" config value may additionally enable copy detection.
 * Thresholds that are 0 or above 100 are replaced by the defaults, and a
 * zero target_limit is replaced by "diff.renameLimit" (when positive) or
 * DEFAULT_TARGET_LIMIT.
 *
 * Config lookups are skipped when the diff has no repository, since no
 * config object is available in that case.
 *
 * Returns 0 on success, -1 if the repository config cannot be obtained;
 * GITERR_CHECK_VERSION may also return early on a version mismatch.
 */
static int normalize_find_opts(
	git_diff_list *diff,
	git_diff_find_options *opts,
	git_diff_find_options *given)
{
	git_config *cfg = NULL;
	const char *val = NULL;

	if (diff->repo != NULL &&
		git_repository_config__weakptr(&cfg, diff->repo) < 0)
		return -1;

	if (given != NULL)
		memcpy(opts, given, sizeof(*opts));
	else {
		git_diff_find_options init = GIT_DIFF_FIND_OPTIONS_INIT;
		memmove(opts, &init, sizeof(init));

		opts->flags = GIT_DIFF_FIND_RENAMES;

		/* cfg stays NULL for a diff without a repo; don't query it then */
		if (cfg != NULL) {
			if (git_config_get_string(&val, cfg, "diff.renames") < 0)
				giterr_clear();
			else if (val &&
				(!strcasecmp(val, "copies") || !strcasecmp(val, "copy")))
				opts->flags = GIT_DIFF_FIND_RENAMES | GIT_DIFF_FIND_COPIES;
		}
	}

	GITERR_CHECK_VERSION(
		opts, GIT_DIFF_FIND_OPTIONS_VERSION, "git_diff_find_options");

	/* some flags imply others */

	if (opts->flags & GIT_DIFF_FIND_RENAMES_FROM_REWRITES)
		opts->flags |= GIT_DIFF_FIND_RENAMES;

	if (opts->flags & GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED)
		opts->flags |= GIT_DIFF_FIND_COPIES;

	/* thresholds are percentages: 0 means "unset", >100 is out of range */
#define USE_DEFAULT(X) ((X) == 0 || (X) > 100)

	if (USE_DEFAULT(opts->rename_threshold))
		opts->rename_threshold = DEFAULT_THRESHOLD;

	if (USE_DEFAULT(opts->rename_from_rewrite_threshold))
		opts->rename_from_rewrite_threshold = DEFAULT_THRESHOLD;

	if (USE_DEFAULT(opts->copy_threshold))
		opts->copy_threshold = DEFAULT_THRESHOLD;

	if (USE_DEFAULT(opts->break_rewrite_threshold))
		opts->break_rewrite_threshold = DEFAULT_BREAK_REWRITE_THRESHOLD;

#undef USE_DEFAULT

	if (!opts->target_limit) {
		int32_t limit = 0;

		opts->target_limit = DEFAULT_TARGET_LIMIT;

		if (cfg != NULL) {
			if (git_config_get_int32(&limit, cfg, "diff.renameLimit") < 0)
				giterr_clear();
			else if (limit > 0)
				opts->target_limit = limit;
		}
	}

	return 0;
}

/*
 * Rebuild diff->deltas, dropping every delta marked GIT_DELTA__TO_DELETE
 * and turning every delta marked GIT_DELTA__TO_SPLIT into a DELETED delta
 * (a new copy) plus an ADDED delta (the original, modified in place).
 *
 * `expected_size` is the initial capacity requested for the new vector.
 *
 * The work is done in two phases so that a failure leaves `diff` exactly
 * as it was: the new list is fully built first, and only then are the old
 * deltas freed or mutated.
 *
 * Returns 0 on success, -1 on allocation failure.
 */
static int apply_splits_and_deletes(git_diff_list *diff, size_t expected_size)
{
	git_vector onto = GIT_VECTOR_INIT;
	size_t i;
	git_diff_delta *delta, *deleted = NULL;

	if (git_vector_init(&onto, expected_size, git_diff_delta__cmp) < 0)
		return -1;

	/* phase 1 (may fail): build new delta list without TO_DELETE entries,
	 * adding the DELETED half of each TO_SPLIT entry just ahead of it
	 */
	git_vector_foreach(&diff->deltas, i, delta) {
		if (delta->status == GIT_DELTA__TO_DELETE)
			continue;

		if (delta->status == GIT_DELTA__TO_SPLIT) {
			deleted = diff_delta__dup(delta, &diff->pool);
			if (!deleted)
				goto on_error;

			deleted->status = GIT_DELTA_DELETED;
			memset(&deleted->new_file, 0, sizeof(deleted->new_file));
			deleted->new_file.path = deleted->old_file.path;
			deleted->new_file.flags |= GIT_DIFF_FILE_VALID_OID;

			if (git_vector_insert(&onto, deleted) < 0)
				goto on_error;
		}

		if (git_vector_insert(&onto, delta) < 0)
			goto on_error;

		/* both halves are safely in the list now */
		deleted = NULL;
	}

	/* phase 2 (cannot fail): release dropped deltas and convert the
	 * original of each split into its ADDED half
	 */
	git_vector_foreach(&diff->deltas, i, delta) {
		if (delta->status == GIT_DELTA__TO_DELETE) {
			git__free(delta);
			continue;
		}

		if (delta->status == GIT_DELTA__TO_SPLIT) {
			delta->status = GIT_DELTA_ADDED;
			memset(&delta->old_file, 0, sizeof(delta->old_file));
			delta->old_file.path = delta->new_file.path;
			delta->old_file.flags |= GIT_DIFF_FILE_VALID_OID;
		}
	}

	/* swap new delta list into place */
	git_vector_sort(&onto);
	git_vector_swap(&diff->deltas, &onto);
	git_vector_free(&onto);

	return 0;

on_error:
	/* Only the copies made in phase 1 are ours to free; everything else
	 * in 'onto' is still owned by diff->deltas.  A copy always sits right
	 * before its (still TO_SPLIT) original, except for the copy that was
	 * in flight when the failure happened, which 'deleted' still tracks.
	 */
	for (i = 1; i < onto.length; ++i) {
		delta = GIT_VECTOR_GET(&onto, i);
		if (delta->status == GIT_DELTA__TO_SPLIT)
			git__free(GIT_VECTOR_GET(&onto, i - 1));
	}
	git__free(deleted);
	git_vector_free(&onto);

	return -1;
}

/*
 * Score how similar two files are on a 0-100 scale.
 *
 * Currently only exact matches are recognized: identical oids score 100,
 * anything else scores 0.  `cache` is unused.
 */
static unsigned int calc_similarity(
	void *cache, git_diff_file *old_file, git_diff_file *new_file)
{
	GIT_UNUSED(cache);

	if (git_oid_cmp(&old_file->oid, &new_file->oid) == 0)
		return 100;

	/* TODO: insert actual similarity algo here */

	return 0;
}

#define FLAG_SET(opts,flag_name) ((((opts).flags) & (flag_name)) != 0)

/*
 * Rewrite the deltas of `diff` to record renames and copies (and,
 * optionally, to break rewrites into add/delete pairs), according to
 * `given_opts` (NULL selects defaults, see normalize_find_opts).
 *
 * Returns 0 on success, -1 on failure.
 */
int git_diff_find_similar(
	git_diff_list *diff,
	git_diff_find_options *given_opts)
{
	unsigned int i, j, similarity;
	git_diff_delta *from, *to;
	git_diff_find_options opts;
	unsigned int tried_targets, num_changes = 0;
	git_vector matches = GIT_VECTOR_INIT;

	if (normalize_find_opts(diff, &opts, given_opts) < 0)
		return -1;

	/* first do splits if requested */

	if (FLAG_SET(opts, GIT_DIFF_FIND_AND_BREAK_REWRITES)) {
		git_vector_foreach(&diff->deltas, i, from) {
			if (from->status != GIT_DELTA_MODIFIED)
				continue;

			/* Right now, this doesn't work right because the similarity
			 * algorithm isn't actually implemented...
			 */
			similarity = 100;
			/* calc_similarity(NULL, &from->old_file, &from->new_file); */

			if (similarity < opts.break_rewrite_threshold) {
				from->status = GIT_DELTA__TO_SPLIT;
				num_changes++;
			}
		}

		/* apply splits as needed */
		if (num_changes > 0 &&
			apply_splits_and_deletes(
				diff, diff->deltas.length + num_changes) < 0)
			return -1;
	}

	/* next find the most similar delta for each rename / copy candidate */

	if (git_vector_init(&matches, diff->deltas.length, git_diff_delta__cmp) < 0)
		return -1;

	git_vector_foreach(&diff->deltas, i, from) {
		tried_targets = 0;

		git_vector_foreach(&diff->deltas, j, to) {
			if (i == j)
				continue;

			switch (to->status) {
			case GIT_DELTA_ADDED:
			case GIT_DELTA_UNTRACKED:
			case GIT_DELTA_RENAMED:
			case GIT_DELTA_COPIED:
				break;
			default:
				/* only the above status values should be checked */
				continue;
			}

			/* skip all but DELETED files unless copy detection is on */
			if (from->status != GIT_DELTA_DELETED &&
				!FLAG_SET(opts, GIT_DIFF_FIND_COPIES))
				continue;

			/* don't check UNMODIFIED files as source unless given option */
			if (from->status == GIT_DELTA_UNMODIFIED &&
				!FLAG_SET(opts, GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED))
				continue;

			/* cap on maximum files we'll examine */
			if (++tried_targets > opts.target_limit)
				break;

			/* calculate similarity and see if this pair beats the
			 * similarity score of the current best pair.
			 */
			similarity = calc_similarity(NULL, &from->old_file, &to->new_file);

			if (to->similarity < similarity) {
				to->similarity = similarity;
				/* matches[j] remembers the best source found for target j */
				if (git_vector_set(NULL, &matches, j, from) < 0) {
					git_vector_free(&matches);
					return -1;
				}
			}
		}
	}

	/* next rewrite the diffs with renames / copies */

	num_changes = 0;

	git_vector_foreach(&diff->deltas, j, to) {
		from = GIT_VECTOR_GET(&matches, j);
		if (!from) {
			assert(to->similarity == 0);
			continue;
		}

		/* three possible outcomes here:
		 * 1. old DELETED and if over rename threshold,
		 *    new becomes RENAMED and old goes away
		 * 2. old was MODIFIED but FIND_RENAMES_FROM_REWRITES is on and
		 *    old is more similar to new than it is to itself, in which
		 *    case, new becomes RENAMED and old becomes ADDED
		 * 3. otherwise if over copy threshold, new becomes COPIED
		 */

		if (from->status == GIT_DELTA_DELETED) {
			if (to->similarity < opts.rename_threshold) {
				to->similarity = 0;
				continue;
			}

			to->status = GIT_DELTA_RENAMED;
			memcpy(&to->old_file, &from->old_file, sizeof(to->old_file));

			from->status = GIT_DELTA__TO_DELETE;
			num_changes++;

			continue;
		}

		if (from->status == GIT_DELTA_MODIFIED &&
			FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES) &&
			to->similarity > opts.rename_threshold)
		{
			similarity = 100;
			/* calc_similarity(NULL, &from->old_file, &from->new_file); */

			if (similarity < opts.rename_from_rewrite_threshold) {
				to->status = GIT_DELTA_RENAMED;
				memcpy(&to->old_file, &from->old_file, sizeof(to->old_file));

				from->status = GIT_DELTA_ADDED;
				memset(&from->old_file, 0, sizeof(from->old_file));
				from->old_file.path = to->old_file.path;
				from->old_file.flags |= GIT_DIFF_FILE_VALID_OID;

				continue;
			}
		}

		if (to->similarity < opts.copy_threshold) {
			to->similarity = 0;
			continue;
		}

		/* convert "to" to a COPIED record */
		to->status = GIT_DELTA_COPIED;
		memcpy(&to->old_file, &from->old_file, sizeof(to->old_file));
	}

	git_vector_free(&matches);

	/* finally drop the deltas that were absorbed into renames */
	if (num_changes > 0) {
		assert(num_changes < diff->deltas.length);

		if (apply_splits_and_deletes(
				diff, diff->deltas.length - num_changes) < 0)
			return -1;
	}

	return 0;
}

#undef FLAG_SET