Commit 36fc5497 by Pierre-Olivier Latour Committed by Edward Thomson

Added GIT_HASHSIG_ALLOW_SMALL_FILES to allow computing signatures for small files

The implementation of the hashsig API disallows computing a signature on
small files containing only a few lines. This new flag disables this
behavior.

git_diff_find_similar() sets this flag by default which means that rename
/ copy detection of small files will now work. This in turn affects the
behavior of the git_status and git_blame APIs which will now detect rename
of small files assuming the right options are passed.
parent d147900e
......@@ -12,33 +12,52 @@
GIT_BEGIN_DECL
/**
* Similarity signature of line hashes for a buffer
* Similarity signature of arbitrary text content based on line hashes
*/
typedef struct git_hashsig git_hashsig;
/**
* Options for hashsig calculation
* Options for hashsig computation
*
* The options GIT_HASHSIG_NORMAL, GIT_HASHSIG_IGNORE_WHITESPACE,
* GIT_HASHSIG_SMART_WHITESPACE are exclusive and should not be combined.
*/
typedef enum {
GIT_HASHSIG_NORMAL = 0, /* use all data */
GIT_HASHSIG_IGNORE_WHITESPACE = 1, /* ignore whitespace */
GIT_HASHSIG_SMART_WHITESPACE = 2, /* ignore \r and all space after \n */
/**
* Use all data
*/
GIT_HASHSIG_NORMAL = 0,
/**
* Ignore whitespace
*/
GIT_HASHSIG_IGNORE_WHITESPACE = (1 << 0),
/**
* Ignore \r and all space after \n
*/
GIT_HASHSIG_SMART_WHITESPACE = (1 << 1),
/**
* Allow hashing of small files
*/
GIT_HASHSIG_ALLOW_SMALL_FILES = (1 << 2)
} git_hashsig_option_t;
/**
* Build a similarity signature for a buffer
*
* If you have passed a whitespace-ignoring buffer, then the whitespace
* will be removed from the buffer while it is being processed, modifying
* the buffer in place. Sorry about that!
* Compute a similarity signature for a text buffer
*
* This will return an error if the buffer doesn't contain enough data to
* compute a valid signature.
* If you have passed the option GIT_HASHSIG_IGNORE_WHITESPACE, then the
* whitespace will be removed from the buffer while it is being processed,
* modifying the buffer in place. Sorry about that!
*
* @param out The array of hashed runs representing the file content
* @param buf The contents of the file to hash
* @param buflen The length of the data at `buf`
* @param generate_pairwise_hashes Should pairwise runs be hashed
* @param out The computed similarity signature.
* @param buf The input buffer.
* @param buflen The input buffer size.
* @param opts The signature computation options (see above).
* @return 0 on success, GIT_EBUFS if the buffer doesn't contain enough data to
* compute a valid signature (unless GIT_HASHSIG_ALLOW_SMALL_FILES is set), or
* error code.
*/
GIT_EXTERN(int) git_hashsig_create(
git_hashsig **out,
......@@ -47,13 +66,17 @@ GIT_EXTERN(int) git_hashsig_create(
git_hashsig_option_t opts);
/**
* Build a similarity signature from a file
* Compute a similarity signature for a text file
*
* This walks through the file, only loading a maximum of 4K of file data at
* a time. Otherwise, it acts just like `git_hashsig_create`.
* a time. Otherwise, it acts just like `git_hashsig_create`.
*
* This will return an error if the file doesn't contain enough data to
* compute a valid signature.
* @param out The computed similarity signature.
* @param path The path to the input file.
* @param opts The signature computation options (see above).
* @return 0 on success, GIT_EBUFS if the buffer doesn't contain enough data to
* compute a valid signature (unless GIT_HASHSIG_ALLOW_SMALL_FILES is set), or
* error code.
*/
GIT_EXTERN(int) git_hashsig_create_fromfile(
git_hashsig **out,
......@@ -62,13 +85,17 @@ GIT_EXTERN(int) git_hashsig_create_fromfile(
/**
* Release memory for a content similarity signature
*
* @param sig The similarity signature to free.
*/
GIT_EXTERN(void) git_hashsig_free(git_hashsig *sig);
/**
* Measure similarity between two files
* Measure similarity score between two similarity signatures
*
* @return <0 for error, [0 to 100] as similarity score
* @param a The first similarity signature to compare.
* @param b The second similarity signature to compare.
* @return [0 to 100] on success as the similarity score, or error code.
*/
GIT_EXTERN(int) git_hashsig_compare(
const git_hashsig *a,
......
......@@ -219,34 +219,18 @@ int git_diff_find_similar__hashsig_for_file(
void **out, const git_diff_file *f, const char *path, void *p)
{
git_hashsig_option_t opt = (git_hashsig_option_t)(intptr_t)p;
int error = 0;
GIT_UNUSED(f);
error = git_hashsig_create_fromfile((git_hashsig **)out, path, opt);
if (error == GIT_EBUFS) {
error = 0;
giterr_clear();
}
return error;
return git_hashsig_create_fromfile((git_hashsig **)out, path, opt);
}
int git_diff_find_similar__hashsig_for_buf(
void **out, const git_diff_file *f, const char *buf, size_t len, void *p)
{
git_hashsig_option_t opt = (git_hashsig_option_t)(intptr_t)p;
int error = 0;
GIT_UNUSED(f);
error = git_hashsig_create((git_hashsig **)out, buf, len, opt);
if (error == GIT_EBUFS) {
error = 0;
giterr_clear();
}
return error;
return git_hashsig_create((git_hashsig **)out, buf, len, opt);
}
void git_diff_find_similar__hashsig_free(void *sig, void *payload)
......@@ -258,8 +242,14 @@ void git_diff_find_similar__hashsig_free(void *sig, void *payload)
int git_diff_find_similar__calc_similarity(
int *score, void *siga, void *sigb, void *payload)
{
int error;
GIT_UNUSED(payload);
*score = git_hashsig_compare(siga, sigb);
error = git_hashsig_compare(siga, sigb);
if (error < 0)
return error;
*score = error;
return 0;
}
......@@ -273,6 +263,7 @@ static int normalize_find_opts(
const git_diff_find_options *given)
{
git_config *cfg = NULL;
git_hashsig_option_t hashsig_opts;
GITERR_CHECK_VERSION(given, GIT_DIFF_FIND_OPTIONS_VERSION, "git_diff_find_options");
......@@ -354,11 +345,13 @@ static int normalize_find_opts(
opts->metric->similarity = git_diff_find_similar__calc_similarity;
if (opts->flags & GIT_DIFF_FIND_IGNORE_WHITESPACE)
opts->metric->payload = (void *)GIT_HASHSIG_IGNORE_WHITESPACE;
hashsig_opts = GIT_HASHSIG_IGNORE_WHITESPACE;
else if (opts->flags & GIT_DIFF_FIND_DONT_IGNORE_WHITESPACE)
opts->metric->payload = (void *)GIT_HASHSIG_NORMAL;
hashsig_opts = GIT_HASHSIG_NORMAL;
else
opts->metric->payload = (void *)GIT_HASHSIG_SMART_WHITESPACE;
hashsig_opts = GIT_HASHSIG_SMART_WHITESPACE;
hashsig_opts |= GIT_HASHSIG_ALLOW_SMALL_FILES;
opts->metric->payload = (void *)hashsig_opts;
}
return 0;
......
......@@ -35,7 +35,6 @@ struct git_hashsig {
hashsig_heap mins;
hashsig_heap maxs;
git_hashsig_option_t opt;
int considered;
};
#define HEAP_LCHILD_OF(I) (((I)<<1)+1)
......@@ -135,25 +134,23 @@ static void hashsig_in_progress_init(
{
int i;
switch (sig->opt) {
case GIT_HASHSIG_IGNORE_WHITESPACE:
/* no more than one can be set */
assert(!(sig->opt & GIT_HASHSIG_IGNORE_WHITESPACE) ||
!(sig->opt & GIT_HASHSIG_SMART_WHITESPACE));
if (sig->opt & GIT_HASHSIG_IGNORE_WHITESPACE) {
for (i = 0; i < 256; ++i)
prog->ignore_ch[i] = git__isspace_nonlf(i);
prog->use_ignores = 1;
break;
case GIT_HASHSIG_SMART_WHITESPACE:
} else if (sig->opt & GIT_HASHSIG_SMART_WHITESPACE) {
for (i = 0; i < 256; ++i)
prog->ignore_ch[i] = git__isspace(i);
prog->use_ignores = 1;
break;
default:
} else {
memset(prog, 0, sizeof(*prog));
break;
}
}
#define HASHSIG_IN_PROGRESS_INIT { 1 }
static int hashsig_add_hashes(
git_hashsig *sig,
const uint8_t *data,
......@@ -174,12 +171,13 @@ static int hashsig_add_hashes(
if (use_ignores)
for (; scan < end && git__isspace_nonlf(ch); ch = *scan)
++scan;
else if (sig->opt != GIT_HASHSIG_NORMAL)
else if (sig->opt &
(GIT_HASHSIG_IGNORE_WHITESPACE | GIT_HASHSIG_SMART_WHITESPACE))
for (; scan < end && ch == '\r'; ch = *scan)
++scan;
/* peek at next character to decide what to do next */
if (sig->opt == GIT_HASHSIG_SMART_WHITESPACE)
if (sig->opt & GIT_HASHSIG_SMART_WHITESPACE)
use_ignores = (ch == '\n');
if (scan >= end)
......@@ -198,8 +196,6 @@ static int hashsig_add_hashes(
hashsig_heap_insert(&sig->mins, (hashsig_t)state);
hashsig_heap_insert(&sig->maxs, (hashsig_t)state);
sig->considered++;
while (scan < end && (*scan == '\n' || !*scan))
++scan;
}
......@@ -212,7 +208,8 @@ static int hashsig_add_hashes(
static int hashsig_finalize_hashes(git_hashsig *sig)
{
if (sig->mins.size < HASHSIG_HEAP_MIN_SIZE) {
if (sig->mins.size < HASHSIG_HEAP_MIN_SIZE &&
!(sig->opt & GIT_HASHSIG_ALLOW_SMALL_FILES)) {
giterr_set(GITERR_INVALID,
"File too small for similarity signature calculation");
return GIT_EBUFS;
......
......@@ -53,6 +53,11 @@ void cl_git_rewritefile(const char *path, const char *content)
cl_git_write2file(path, content, 0, O_WRONLY | O_CREAT | O_TRUNC, 0644);
}
void cl_git_rmfile(const char *filename)
{
cl_must_pass(p_unlink(filename));
}
#ifdef GIT_WIN32
#include "win32/utf-conv.h"
......
......@@ -112,6 +112,7 @@ void cl_git_append2file(const char *filename, const char *new_content);
void cl_git_rewritefile(const char *filename, const char *new_content);
void cl_git_write2file(const char *path, const char *data,
size_t datalen, int flags, unsigned int mode);
void cl_git_rmfile(const char *filename);
bool cl_toggle_filemode(const char *filename);
bool cl_is_chmod_supported(void);
......
......@@ -381,37 +381,53 @@ void test_diff_rename__not_exact_match(void)
git_tree_free(new_tree);
}
void test_diff_rename__handles_small_files(void)
void test_diff_rename__test_small_files(void)
{
const char *tree_sha = "2bc7f351d20b53f1c72c16c4b036e491c478c49a";
git_index *index;
git_tree *tree;
git_reference *head_reference;
git_commit *head_commit;
git_tree *head_tree;
git_tree *commit_tree;
git_signature *signature;
git_diff *diff;
git_diff_options diffopts = GIT_DIFF_OPTIONS_INIT;
git_diff_find_options opts = GIT_DIFF_FIND_OPTIONS_INIT;
git_oid oid;
const git_diff_delta *delta;
git_diff_options diff_options = GIT_DIFF_OPTIONS_INIT;
git_diff_find_options find_options = GIT_DIFF_FIND_OPTIONS_INIT;
cl_git_pass(git_repository_index(&index, g_repo));
tree = resolve_commit_oid_to_tree(g_repo, tree_sha);
cl_git_mkfile("renames/small.txt", "Hello World!\n");
cl_git_pass(git_index_add_bypath(index, "small.txt"));
cl_git_rewritefile("renames/songof7cities.txt", "single line\n");
cl_git_pass(git_index_add_bypath(index, "songof7cities.txt"));
cl_git_pass(git_repository_head(&head_reference, g_repo));
cl_git_pass(git_reference_peel((git_object**)&head_commit, head_reference, GIT_OBJ_COMMIT));
cl_git_pass(git_commit_tree(&head_tree, head_commit));
cl_git_pass(git_index_write_tree(&oid, index));
cl_git_pass(git_tree_lookup(&commit_tree, g_repo, &oid));
cl_git_pass(git_signature_new(&signature, "Rename", "rename@example.com", 1404157834, 0));
cl_git_pass(git_commit_create(&oid, g_repo, "HEAD", signature, signature, NULL, "Test commit", commit_tree, 1, (const git_commit**)&head_commit));
cl_git_rewritefile("renames/untimely.txt", "untimely\n");
cl_git_pass(git_index_add_bypath(index, "untimely.txt"));
cl_git_mkfile("renames/copy.txt", "Hello World!\n");
cl_git_rmfile("renames/small.txt");
/* Tests that we can invoke find_similar on small files
* and that the GIT_EBUFS (too small) error code is not
* propagated to the caller.
*/
cl_git_pass(git_diff_tree_to_index(&diff, g_repo, tree, index, &diffopts));
diff_options.flags = GIT_DIFF_INCLUDE_UNTRACKED;
cl_git_pass(git_diff_tree_to_workdir(&diff, g_repo, commit_tree, &diff_options));
find_options.flags = GIT_DIFF_FIND_RENAMES | GIT_DIFF_FIND_FOR_UNTRACKED;
cl_git_pass(git_diff_find_similar(diff, &find_options));
opts.flags = GIT_DIFF_FIND_RENAMES | GIT_DIFF_FIND_COPIES |
GIT_DIFF_FIND_AND_BREAK_REWRITES;
cl_git_pass(git_diff_find_similar(diff, &opts));
cl_assert_equal_i(git_diff_num_deltas(diff), 1);
delta = git_diff_get_delta(diff, 0);
cl_assert_equal_i(delta->status, GIT_DELTA_RENAMED);
cl_assert_equal_s(delta->old_file.path, "small.txt");
cl_assert_equal_s(delta->new_file.path, "copy.txt");
git_diff_free(diff);
git_tree_free(tree);
git_signature_free(signature);
git_tree_free(commit_tree);
git_tree_free(head_tree);
git_commit_free(head_commit);
git_reference_free(head_reference);
git_index_free(index);
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment