Commit 36fc5497, authored by Pierre-Olivier Latour, committed by Edward Thomson

Added GIT_HASHSIG_ALLOW_SMALL_FILES to allow computing signatures for small files

The implementation of the hashsig API disallows computing a signature on
small files containing only a few lines. This new flag lifts that
restriction.

git_diff_find_similar() sets this flag by default, which means that
rename/copy detection of small files will now work. This in turn affects the
behavior of the git_status and git_blame APIs, which will now detect renames
of small files, assuming the right options are passed.
parent d147900e
...@@ -12,33 +12,52 @@ ...@@ -12,33 +12,52 @@
GIT_BEGIN_DECL GIT_BEGIN_DECL
/** /**
* Similarity signature of line hashes for a buffer * Similarity signature of arbitrary text content based on line hashes
*/ */
typedef struct git_hashsig git_hashsig; typedef struct git_hashsig git_hashsig;
/** /**
* Options for hashsig calculation * Options for hashsig computation
*
* The options GIT_HASHSIG_NORMAL, GIT_HASHSIG_IGNORE_WHITESPACE,
* GIT_HASHSIG_SMART_WHITESPACE are exclusive and should not be combined.
*/ */
typedef enum { typedef enum {
GIT_HASHSIG_NORMAL = 0, /* use all data */ /**
GIT_HASHSIG_IGNORE_WHITESPACE = 1, /* ignore whitespace */ * Use all data
GIT_HASHSIG_SMART_WHITESPACE = 2, /* ignore \r and all space after \n */ */
GIT_HASHSIG_NORMAL = 0,
/**
* Ignore whitespace
*/
GIT_HASHSIG_IGNORE_WHITESPACE = (1 << 0),
/**
* Ignore \r and all space after \n
*/
GIT_HASHSIG_SMART_WHITESPACE = (1 << 1),
/**
* Allow hashing of small files
*/
GIT_HASHSIG_ALLOW_SMALL_FILES = (1 << 2)
} git_hashsig_option_t; } git_hashsig_option_t;
/** /**
* Build a similarity signature for a buffer * Compute a similarity signature for a text buffer
*
* If you have passed a whitespace-ignoring buffer, then the whitespace
* will be removed from the buffer while it is being processed, modifying
* the buffer in place. Sorry about that!
* *
* This will return an error if the buffer doesn't contain enough data to * If you have passed the option GIT_HASHSIG_IGNORE_WHITESPACE, then the
* compute a valid signature. * whitespace will be removed from the buffer while it is being processed,
* modifying the buffer in place. Sorry about that!
* *
* @param out The array of hashed runs representing the file content * @param out The computed similarity signature.
* @param buf The contents of the file to hash * @param buf The input buffer.
* @param buflen The length of the data at `buf` * @param buflen The input buffer size.
* @param generate_pairwise_hashes Should pairwise runs be hashed * @param opts The signature computation options (see above).
* @return 0 on success, GIT_EBUFS if the buffer doesn't contain enough data to
* compute a valid signature (unless GIT_HASHSIG_ALLOW_SMALL_FILES is set), or
* error code.
*/ */
GIT_EXTERN(int) git_hashsig_create( GIT_EXTERN(int) git_hashsig_create(
git_hashsig **out, git_hashsig **out,
...@@ -47,13 +66,17 @@ GIT_EXTERN(int) git_hashsig_create( ...@@ -47,13 +66,17 @@ GIT_EXTERN(int) git_hashsig_create(
git_hashsig_option_t opts); git_hashsig_option_t opts);
/** /**
* Build a similarity signature from a file * Compute a similarity signature for a text file
* *
* This walks through the file, only loading a maximum of 4K of file data at * This walks through the file, only loading a maximum of 4K of file data at
* a time. Otherwise, it acts just like `git_hashsig_create`. * a time. Otherwise, it acts just like `git_hashsig_create`.
* *
* This will return an error if the file doesn't contain enough data to * @param out The computed similarity signature.
* compute a valid signature. * @param path The path to the input file.
* @param opts The signature computation options (see above).
* @return 0 on success, GIT_EBUFS if the file doesn't contain enough data to
* compute a valid signature (unless GIT_HASHSIG_ALLOW_SMALL_FILES is set), or
* error code.
*/ */
GIT_EXTERN(int) git_hashsig_create_fromfile( GIT_EXTERN(int) git_hashsig_create_fromfile(
git_hashsig **out, git_hashsig **out,
...@@ -62,13 +85,17 @@ GIT_EXTERN(int) git_hashsig_create_fromfile( ...@@ -62,13 +85,17 @@ GIT_EXTERN(int) git_hashsig_create_fromfile(
/** /**
* Release memory for a content similarity signature * Release memory for a content similarity signature
*
* @param sig The similarity signature to free.
*/ */
GIT_EXTERN(void) git_hashsig_free(git_hashsig *sig); GIT_EXTERN(void) git_hashsig_free(git_hashsig *sig);
/** /**
* Measure similarity between two files * Measure similarity score between two similarity signatures
* *
* @return <0 for error, [0 to 100] as similarity score * @param a The first similarity signature to compare.
* @param b The second similarity signature to compare.
* @return [0 to 100] as the similarity score on success, or a negative error code.
*/ */
GIT_EXTERN(int) git_hashsig_compare( GIT_EXTERN(int) git_hashsig_compare(
const git_hashsig *a, const git_hashsig *a,
......
...@@ -219,34 +219,18 @@ int git_diff_find_similar__hashsig_for_file( ...@@ -219,34 +219,18 @@ int git_diff_find_similar__hashsig_for_file(
void **out, const git_diff_file *f, const char *path, void *p) void **out, const git_diff_file *f, const char *path, void *p)
{ {
git_hashsig_option_t opt = (git_hashsig_option_t)(intptr_t)p; git_hashsig_option_t opt = (git_hashsig_option_t)(intptr_t)p;
int error = 0;
GIT_UNUSED(f); GIT_UNUSED(f);
error = git_hashsig_create_fromfile((git_hashsig **)out, path, opt); return git_hashsig_create_fromfile((git_hashsig **)out, path, opt);
if (error == GIT_EBUFS) {
error = 0;
giterr_clear();
}
return error;
} }
int git_diff_find_similar__hashsig_for_buf( int git_diff_find_similar__hashsig_for_buf(
void **out, const git_diff_file *f, const char *buf, size_t len, void *p) void **out, const git_diff_file *f, const char *buf, size_t len, void *p)
{ {
git_hashsig_option_t opt = (git_hashsig_option_t)(intptr_t)p; git_hashsig_option_t opt = (git_hashsig_option_t)(intptr_t)p;
int error = 0;
GIT_UNUSED(f); GIT_UNUSED(f);
error = git_hashsig_create((git_hashsig **)out, buf, len, opt); return git_hashsig_create((git_hashsig **)out, buf, len, opt);
if (error == GIT_EBUFS) {
error = 0;
giterr_clear();
}
return error;
} }
void git_diff_find_similar__hashsig_free(void *sig, void *payload) void git_diff_find_similar__hashsig_free(void *sig, void *payload)
...@@ -258,8 +242,14 @@ void git_diff_find_similar__hashsig_free(void *sig, void *payload) ...@@ -258,8 +242,14 @@ void git_diff_find_similar__hashsig_free(void *sig, void *payload)
int git_diff_find_similar__calc_similarity( int git_diff_find_similar__calc_similarity(
int *score, void *siga, void *sigb, void *payload) int *score, void *siga, void *sigb, void *payload)
{ {
int error;
GIT_UNUSED(payload); GIT_UNUSED(payload);
*score = git_hashsig_compare(siga, sigb); error = git_hashsig_compare(siga, sigb);
if (error < 0)
return error;
*score = error;
return 0; return 0;
} }
...@@ -273,6 +263,7 @@ static int normalize_find_opts( ...@@ -273,6 +263,7 @@ static int normalize_find_opts(
const git_diff_find_options *given) const git_diff_find_options *given)
{ {
git_config *cfg = NULL; git_config *cfg = NULL;
git_hashsig_option_t hashsig_opts;
GITERR_CHECK_VERSION(given, GIT_DIFF_FIND_OPTIONS_VERSION, "git_diff_find_options"); GITERR_CHECK_VERSION(given, GIT_DIFF_FIND_OPTIONS_VERSION, "git_diff_find_options");
...@@ -354,11 +345,13 @@ static int normalize_find_opts( ...@@ -354,11 +345,13 @@ static int normalize_find_opts(
opts->metric->similarity = git_diff_find_similar__calc_similarity; opts->metric->similarity = git_diff_find_similar__calc_similarity;
if (opts->flags & GIT_DIFF_FIND_IGNORE_WHITESPACE) if (opts->flags & GIT_DIFF_FIND_IGNORE_WHITESPACE)
opts->metric->payload = (void *)GIT_HASHSIG_IGNORE_WHITESPACE; hashsig_opts = GIT_HASHSIG_IGNORE_WHITESPACE;
else if (opts->flags & GIT_DIFF_FIND_DONT_IGNORE_WHITESPACE) else if (opts->flags & GIT_DIFF_FIND_DONT_IGNORE_WHITESPACE)
opts->metric->payload = (void *)GIT_HASHSIG_NORMAL; hashsig_opts = GIT_HASHSIG_NORMAL;
else else
opts->metric->payload = (void *)GIT_HASHSIG_SMART_WHITESPACE; hashsig_opts = GIT_HASHSIG_SMART_WHITESPACE;
hashsig_opts |= GIT_HASHSIG_ALLOW_SMALL_FILES;
opts->metric->payload = (void *)hashsig_opts;
} }
return 0; return 0;
......
...@@ -35,7 +35,6 @@ struct git_hashsig { ...@@ -35,7 +35,6 @@ struct git_hashsig {
hashsig_heap mins; hashsig_heap mins;
hashsig_heap maxs; hashsig_heap maxs;
git_hashsig_option_t opt; git_hashsig_option_t opt;
int considered;
}; };
#define HEAP_LCHILD_OF(I) (((I)<<1)+1) #define HEAP_LCHILD_OF(I) (((I)<<1)+1)
...@@ -135,25 +134,23 @@ static void hashsig_in_progress_init( ...@@ -135,25 +134,23 @@ static void hashsig_in_progress_init(
{ {
int i; int i;
switch (sig->opt) { /* no more than one can be set */
case GIT_HASHSIG_IGNORE_WHITESPACE: assert(!(sig->opt & GIT_HASHSIG_IGNORE_WHITESPACE) ||
!(sig->opt & GIT_HASHSIG_SMART_WHITESPACE));
if (sig->opt & GIT_HASHSIG_IGNORE_WHITESPACE) {
for (i = 0; i < 256; ++i) for (i = 0; i < 256; ++i)
prog->ignore_ch[i] = git__isspace_nonlf(i); prog->ignore_ch[i] = git__isspace_nonlf(i);
prog->use_ignores = 1; prog->use_ignores = 1;
break; } else if (sig->opt & GIT_HASHSIG_SMART_WHITESPACE) {
case GIT_HASHSIG_SMART_WHITESPACE:
for (i = 0; i < 256; ++i) for (i = 0; i < 256; ++i)
prog->ignore_ch[i] = git__isspace(i); prog->ignore_ch[i] = git__isspace(i);
prog->use_ignores = 1; prog->use_ignores = 1;
break; } else {
default:
memset(prog, 0, sizeof(*prog)); memset(prog, 0, sizeof(*prog));
break;
} }
} }
#define HASHSIG_IN_PROGRESS_INIT { 1 }
static int hashsig_add_hashes( static int hashsig_add_hashes(
git_hashsig *sig, git_hashsig *sig,
const uint8_t *data, const uint8_t *data,
...@@ -174,12 +171,13 @@ static int hashsig_add_hashes( ...@@ -174,12 +171,13 @@ static int hashsig_add_hashes(
if (use_ignores) if (use_ignores)
for (; scan < end && git__isspace_nonlf(ch); ch = *scan) for (; scan < end && git__isspace_nonlf(ch); ch = *scan)
++scan; ++scan;
else if (sig->opt != GIT_HASHSIG_NORMAL) else if (sig->opt &
(GIT_HASHSIG_IGNORE_WHITESPACE | GIT_HASHSIG_SMART_WHITESPACE))
for (; scan < end && ch == '\r'; ch = *scan) for (; scan < end && ch == '\r'; ch = *scan)
++scan; ++scan;
/* peek at next character to decide what to do next */ /* peek at next character to decide what to do next */
if (sig->opt == GIT_HASHSIG_SMART_WHITESPACE) if (sig->opt & GIT_HASHSIG_SMART_WHITESPACE)
use_ignores = (ch == '\n'); use_ignores = (ch == '\n');
if (scan >= end) if (scan >= end)
...@@ -198,8 +196,6 @@ static int hashsig_add_hashes( ...@@ -198,8 +196,6 @@ static int hashsig_add_hashes(
hashsig_heap_insert(&sig->mins, (hashsig_t)state); hashsig_heap_insert(&sig->mins, (hashsig_t)state);
hashsig_heap_insert(&sig->maxs, (hashsig_t)state); hashsig_heap_insert(&sig->maxs, (hashsig_t)state);
sig->considered++;
while (scan < end && (*scan == '\n' || !*scan)) while (scan < end && (*scan == '\n' || !*scan))
++scan; ++scan;
} }
...@@ -212,7 +208,8 @@ static int hashsig_add_hashes( ...@@ -212,7 +208,8 @@ static int hashsig_add_hashes(
static int hashsig_finalize_hashes(git_hashsig *sig) static int hashsig_finalize_hashes(git_hashsig *sig)
{ {
if (sig->mins.size < HASHSIG_HEAP_MIN_SIZE) { if (sig->mins.size < HASHSIG_HEAP_MIN_SIZE &&
!(sig->opt & GIT_HASHSIG_ALLOW_SMALL_FILES)) {
giterr_set(GITERR_INVALID, giterr_set(GITERR_INVALID,
"File too small for similarity signature calculation"); "File too small for similarity signature calculation");
return GIT_EBUFS; return GIT_EBUFS;
......
...@@ -53,6 +53,11 @@ void cl_git_rewritefile(const char *path, const char *content) ...@@ -53,6 +53,11 @@ void cl_git_rewritefile(const char *path, const char *content)
cl_git_write2file(path, content, 0, O_WRONLY | O_CREAT | O_TRUNC, 0644); cl_git_write2file(path, content, 0, O_WRONLY | O_CREAT | O_TRUNC, 0644);
} }
void cl_git_rmfile(const char *filename)
{
cl_must_pass(p_unlink(filename));
}
#ifdef GIT_WIN32 #ifdef GIT_WIN32
#include "win32/utf-conv.h" #include "win32/utf-conv.h"
......
...@@ -112,6 +112,7 @@ void cl_git_append2file(const char *filename, const char *new_content); ...@@ -112,6 +112,7 @@ void cl_git_append2file(const char *filename, const char *new_content);
void cl_git_rewritefile(const char *filename, const char *new_content); void cl_git_rewritefile(const char *filename, const char *new_content);
void cl_git_write2file(const char *path, const char *data, void cl_git_write2file(const char *path, const char *data,
size_t datalen, int flags, unsigned int mode); size_t datalen, int flags, unsigned int mode);
void cl_git_rmfile(const char *filename);
bool cl_toggle_filemode(const char *filename); bool cl_toggle_filemode(const char *filename);
bool cl_is_chmod_supported(void); bool cl_is_chmod_supported(void);
......
...@@ -381,37 +381,53 @@ void test_diff_rename__not_exact_match(void) ...@@ -381,37 +381,53 @@ void test_diff_rename__not_exact_match(void)
git_tree_free(new_tree); git_tree_free(new_tree);
} }
void test_diff_rename__handles_small_files(void) void test_diff_rename__test_small_files(void)
{ {
const char *tree_sha = "2bc7f351d20b53f1c72c16c4b036e491c478c49a";
git_index *index; git_index *index;
git_tree *tree; git_reference *head_reference;
git_commit *head_commit;
git_tree *head_tree;
git_tree *commit_tree;
git_signature *signature;
git_diff *diff; git_diff *diff;
git_diff_options diffopts = GIT_DIFF_OPTIONS_INIT; git_oid oid;
git_diff_find_options opts = GIT_DIFF_FIND_OPTIONS_INIT; const git_diff_delta *delta;
git_diff_options diff_options = GIT_DIFF_OPTIONS_INIT;
git_diff_find_options find_options = GIT_DIFF_FIND_OPTIONS_INIT;
cl_git_pass(git_repository_index(&index, g_repo)); cl_git_pass(git_repository_index(&index, g_repo));
tree = resolve_commit_oid_to_tree(g_repo, tree_sha); cl_git_mkfile("renames/small.txt", "Hello World!\n");
cl_git_pass(git_index_add_bypath(index, "small.txt"));
cl_git_rewritefile("renames/songof7cities.txt", "single line\n"); cl_git_pass(git_repository_head(&head_reference, g_repo));
cl_git_pass(git_index_add_bypath(index, "songof7cities.txt")); cl_git_pass(git_reference_peel((git_object**)&head_commit, head_reference, GIT_OBJ_COMMIT));
cl_git_pass(git_commit_tree(&head_tree, head_commit));
cl_git_pass(git_index_write_tree(&oid, index));
cl_git_pass(git_tree_lookup(&commit_tree, g_repo, &oid));
cl_git_pass(git_signature_new(&signature, "Rename", "rename@example.com", 1404157834, 0));
cl_git_pass(git_commit_create(&oid, g_repo, "HEAD", signature, signature, NULL, "Test commit", commit_tree, 1, (const git_commit**)&head_commit));
cl_git_rewritefile("renames/untimely.txt", "untimely\n"); cl_git_mkfile("renames/copy.txt", "Hello World!\n");
cl_git_pass(git_index_add_bypath(index, "untimely.txt")); cl_git_rmfile("renames/small.txt");
/* Tests that we can invoke find_similar on small files diff_options.flags = GIT_DIFF_INCLUDE_UNTRACKED;
* and that the GIT_EBUFS (too small) error code is not cl_git_pass(git_diff_tree_to_workdir(&diff, g_repo, commit_tree, &diff_options));
* propagated to the caller. find_options.flags = GIT_DIFF_FIND_RENAMES | GIT_DIFF_FIND_FOR_UNTRACKED;
*/ cl_git_pass(git_diff_find_similar(diff, &find_options));
cl_git_pass(git_diff_tree_to_index(&diff, g_repo, tree, index, &diffopts));
opts.flags = GIT_DIFF_FIND_RENAMES | GIT_DIFF_FIND_COPIES | cl_assert_equal_i(git_diff_num_deltas(diff), 1);
GIT_DIFF_FIND_AND_BREAK_REWRITES; delta = git_diff_get_delta(diff, 0);
cl_git_pass(git_diff_find_similar(diff, &opts)); cl_assert_equal_i(delta->status, GIT_DELTA_RENAMED);
cl_assert_equal_s(delta->old_file.path, "small.txt");
cl_assert_equal_s(delta->new_file.path, "copy.txt");
git_diff_free(diff); git_diff_free(diff);
git_tree_free(tree); git_signature_free(signature);
git_tree_free(commit_tree);
git_tree_free(head_tree);
git_commit_free(head_commit);
git_reference_free(head_reference);
git_index_free(index); git_index_free(index);
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment