Commit f3327cac by Russell Belfer

Some similarity metric adjustments

This makes the text similarity metric treat \r as equivalent
to \n and makes it skip whitespace immediately following a line
terminator, so line indentation will have less effect on the
difference measurement (and so \r\n will be treated as just a
single line terminator).

This also splits the text and binary hash calculators into
two separate functions instead of having more if statements inside
the loop. This should make it easier to have more differentiated
heuristics in the future if we so wish.
parent 9c454b00
......@@ -232,7 +232,7 @@ struct git_buf_text_hashsig {
unsigned int pairs : 1;
};
static int similarity_advance(git_buf_text_hashsig *sig, uint32_t hash)
static int similarity_record_hash(git_buf_text_hashsig *sig, uint32_t hash)
{
if (sig->size >= sig->asize) {
size_t new_asize = sig->asize + 512;
......@@ -248,31 +248,67 @@ static int similarity_advance(git_buf_text_hashsig *sig, uint32_t hash)
return 0;
}
/*
 * Hash a chunk of text into `sig`, one run at a time.
 *
 * Scanning resumes from the running state in *hash_start / *hashlen_start
 * (both must be non-NULL; they are dereferenced unconditionally) and the
 * state of the still-open run is written back to them on return, including
 * when recording a hash fails.
 *
 * A run is closed, and its hash handed to similarity_record_hash(), when
 * the current byte is '\r' or '\n', or when hashlen has already reached
 * SIMILARITY_MAXRUN.  The byte that closes a run is not folded into any
 * hash.  After closing a run, every following byte accepted by
 * git__isspace() is skipped, so leading indentation does not contribute to
 * the next run's hash (and "\r\n" collapses into one terminator, assuming
 * git__isspace() accepts '\n').
 *
 * Returns 0 on success, including when no run was closed in this chunk, or
 * the negative value returned by similarity_record_hash() on failure.
 */
static int similarity_add_hashes_text(
	git_buf_text_hashsig *sig,
	uint32_t *hash_start,
	size_t *hashlen_start,
	const char *ptr,
	size_t len)
{
	/*
	 * Must start at 0: input that is empty, or that never closes a run,
	 * never reaches the assignment below and would otherwise return an
	 * indeterminate value to the caller.
	 */
	int error = 0;
	const char *scan = ptr, *scan_end = ptr + len;
	uint32_t hash = *hash_start;
	size_t hashlen = *hashlen_start;

	while (scan < scan_end) {
		char ch = *scan++;

		if (ch == '\r' || ch == '\n' || hashlen >= SIMILARITY_MAXRUN) {
			if ((error = similarity_record_hash(sig, hash)) < 0)
				break;

			hash = SIMILARITY_HASH_START;
			hashlen = 0;

			/* skip all whitespace immediately after the end of a run */
			while (scan < scan_end && git__isspace(*scan))
				scan++;
		} else {
			hash = SIMILARITY_HASH_UPDATE(hash, ch);
			hashlen++;
		}
	}

	/* hand the open run's state back so the next chunk can continue it */
	*hash_start = hash;
	*hashlen_start = hashlen;

	return error;
}
static int similarity_add_hashes_binary(
git_buf_text_hashsig *sig,
uint32_t *hash_start,
size_t *hashlen_start,
const char *ptr,
size_t len)
{
int error;
const char *scan = ptr, *scan_end = ptr + len;
uint32_t hash = *hash_start;
size_t hashlen = *hashlen_start;
while (scan < scan_end) {
char ch = *scan++;
if (ch == term || hashlen >= SIMILARITY_MAXRUN) {
if ((error = similarity_advance(sig, hash)) < 0)
if (!ch || hashlen >= SIMILARITY_MAXRUN) {
if ((error = similarity_record_hash(sig, hash)) < 0)
break;
hash = SIMILARITY_HASH_START;
hashlen = 0;
/* skip run of terminators */
while (scan < scan_end && *scan == term)
while (scan < scan_end && !*scan)
scan++;
} else {
hash = SIMILARITY_HASH_UPDATE(hash, ch);
......@@ -280,6 +316,28 @@ static int similarity_add_hashes(
}
}
*hash_start = hash;
*hashlen_start = hashlen;
return error;
}
static int similarity_add_hashes(
git_buf_text_hashsig *sig,
uint32_t *hash_start,
size_t *hashlen_start,
const char *ptr,
size_t len)
{
int error = 0;
uint32_t hash = hash_start ? *hash_start : SIMILARITY_HASH_START;
size_t hashlen = hashlen_start ? *hashlen_start : 0;
if (sig->format == SIMILARITY_FORMAT_TEXT)
error = similarity_add_hashes_text(sig, &hash, &hashlen, ptr, len);
else
error = similarity_add_hashes_binary(sig, &hash, &hashlen, ptr, len);
if (hash_start)
*hash_start = hash;
if (hashlen_start)
......@@ -287,7 +345,7 @@ static int similarity_add_hashes(
/* if we're not saving intermediate state, add final hash as needed */
if (!error && !hash_start && hashlen > 0)
error = similarity_advance(sig, hash);
error = similarity_record_hash(sig, hash);
return error;
}
......@@ -436,7 +494,7 @@ int git_buf_text_hashsig_create_fromfile(
p_close(fd);
if (!error && hashlen > 0)
error = similarity_advance(sig, hash);
error = similarity_record_hash(sig, hash);
if (!error)
error = similarity_finalize_hashes(sig, generate_pairs);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment