Change similarity metric to sampled hashes

This moves the similarity metric code out of buf_text and into a new file. Also, this implements a different approach to similarity measurement based on a Rabin-Karp rolling hash where we only keep the top 100 and bottom 100 hashes. In theory, that should be sufficient samples to given a fairly accurate measurement while limiting the amount of data we keep for file signatures no matter how large the file is.

Change similarity metric to sampled hashes
This moves the similarity metric code out of buf_text and into a new file. Also, this implements a different approach to similarity measurement based on a Rabin-Karp rolling hash where we only keep the top 100 and bottom 100 hashes. In theory, that should be sufficient samples to given a fairly accurate measurement while limiting the amount of data we keep for file signatures no matter how large the file is.
5e5848eb · Russell Belfer · 99ba8f23 · 5e5848eb · 5e5848eb · 5e5848eb
Commit 5e5848eb authored Feb 14, 2013 by Russell Belfer
Hide whitespace changes
Inline Side-by-side

Showing with 498 additions and 464 deletions

src/buf_text.c
+0 -360

src/buf_text.h
+0 -48

src/diff_tform.c
+4 -2

src/hashsig.c
+364 -0

src/hashsig.h
+70 -0

tests-clar/core/buffer.c
+60 -54

No files found.
--- a/src/buf_text.c
+++ b/src/buf_text.c
@@ -5,7 +5,6 @@
 * a Linking Exception. For full terms see the included COPYING file.
 */
 #include "buf_text.h"
-#include "fileops.h"

 int git_buf_text_puts_escaped(
 	git_buf *buf,
@@ -213,362 +212,3 @@ bool git_buf_text_gather_stats(
 	return (stats->nul > 0 ||
 		((stats->printable >> 7) < stats->nonprintable));
 }
-
-#define SIMILARITY_MAXRUN 256
-#define SIMILARITY_HASH_START  5381
-#define SIMILARITY_HASH_UPDATE(S,N) (((S) << 5) + (S) + (uint32_t)(N))
-
-enum {
-	SIMILARITY_FORMAT_UNKNOWN = 0,
-	SIMILARITY_FORMAT_TEXT = 1,
-	SIMILARITY_FORMAT_BINARY = 2
-};
-
-struct git_buf_text_hashsig {
-	uint32_t *hashes;
-	size_t size;
-	size_t asize;
-	unsigned int format : 2;
-	unsigned int pairs : 1;
-};
-
-static int similarity_record_hash(git_buf_text_hashsig *sig, uint32_t hash)
-{
-	if (sig->size >= sig->asize) {
-		size_t new_asize = sig->asize + 512;
-		uint32_t *new_hashes =
-			git__realloc(sig->hashes, new_asize * sizeof(uint32_t));
-		GITERR_CHECK_ALLOC(new_hashes);
-
-		sig->hashes = new_hashes;
-		sig->asize  = new_asize;
-	}
-
-	sig->hashes[sig->size++] = hash;
-	return 0;
-}
-
-static int similarity_add_hashes_text(
-	git_buf_text_hashsig *sig,
-	uint32_t *hash_start,
-	size_t *hashlen_start,
-	const char *ptr,
-	size_t len)
-{
-	int error;
-	const char *scan = ptr, *scan_end = ptr + len;
-	uint32_t hash = *hash_start;
-	size_t hashlen = *hashlen_start;
-
-	while (scan < scan_end) {
-		char ch = *scan++;
-
-		if (ch == '\r' || ch == '\n' || hashlen >= SIMILARITY_MAXRUN) {
-			if ((error = similarity_record_hash(sig, hash)) < 0)
-				break;
-
-			hash = SIMILARITY_HASH_START;
-			hashlen = 0;
-
-			/* skip all whitespace immediately after line ending */
-			while (scan < scan_end && git__isspace(*scan))
-				scan++;
-		} else {
-			hash = SIMILARITY_HASH_UPDATE(hash, ch);
-			hashlen++;
-		}
-	}
-
-	*hash_start = hash;
-	*hashlen_start = hashlen;
-
-	return error;
-}
-
-static int similarity_add_hashes_binary(
-	git_buf_text_hashsig *sig,
-	uint32_t *hash_start,
-	size_t *hashlen_start,
-	const char *ptr,
-	size_t len)
-{
-	int error;
-	const char *scan = ptr, *scan_end = ptr + len;
-	uint32_t hash = *hash_start;
-	size_t hashlen = *hashlen_start;
-
-	while (scan < scan_end) {
-		char ch = *scan++;
-
-		if (!ch || hashlen >= SIMILARITY_MAXRUN) {
-			if ((error = similarity_record_hash(sig, hash)) < 0)
-				break;
-
-			hash = SIMILARITY_HASH_START;
-			hashlen = 0;
-
-			/* skip run of terminators */
-			while (scan < scan_end && !*scan)
-				scan++;
-		} else {
-			hash = SIMILARITY_HASH_UPDATE(hash, ch);
-			hashlen++;
-		}
-	}
-
-	*hash_start = hash;
-	*hashlen_start = hashlen;
-
-	return error;
-}
-
-static int similarity_add_hashes(
-	git_buf_text_hashsig *sig,
-	uint32_t *hash_start,
-	size_t *hashlen_start,
-	const char *ptr,
-	size_t len)
-{
-	int error = 0;
-	uint32_t hash = hash_start ? *hash_start : SIMILARITY_HASH_START;
-	size_t hashlen = hashlen_start ? *hashlen_start : 0;
-
-	if (sig->format == SIMILARITY_FORMAT_TEXT)
-		error = similarity_add_hashes_text(sig, &hash, &hashlen, ptr, len);
-	else
-		error = similarity_add_hashes_binary(sig, &hash, &hashlen, ptr, len);
-
-	if (hash_start)
-		*hash_start = hash;
-	if (hashlen_start)
-		*hashlen_start = hashlen;
-
-	/* if we're not saving intermediate state, add final hash as needed */
-	if (!error && !hash_start && hashlen > 0)
-		error = similarity_record_hash(sig, hash);
-
-	return error;
-}
-
-/*
- * Decide if \0 or \n terminated runs are a better choice for hashes
- */
-static void similarity_guess_format(
-	git_buf_text_hashsig *sig, const char *ptr, size_t len)
-{
-	size_t lines = 0, line_length = 0, max_line_length = 0;
-	size_t runs = 0, run_length = 0, max_run_length = 0;
-
-	/* don't process more than 4k of data for this */
-	if (len > 4096)
-		len = 4096;
-
-	/* gather some stats */
-	while (len--) {
-		char ch = *ptr++;
-
-		if (ch == '\0') {
-			runs++;
-			if (max_run_length < run_length)
-				max_run_length = run_length;
-			run_length = 0;
-		} else if (ch == '\n') {
-			lines++;
-			if (max_line_length < line_length)
-				max_line_length = line_length;
-			line_length = 0;
-		} else {
-			run_length++;
-			line_length++;
-		}
-	}
-
-	/* the following heuristic could probably be improved */
-	if (lines > runs)
-		sig->format = SIMILARITY_FORMAT_TEXT;
-	else if (runs > 0)
-		sig->format = SIMILARITY_FORMAT_BINARY;
-	else
-		sig->format = SIMILARITY_FORMAT_UNKNOWN;
-}
-
-static int similarity_compare_score(const void *a, const void *b)
-{
-	uint32_t av = *(uint32_t *)a, bv = *(uint32_t *)b;
-	return (av < bv) ? -1 : (av > bv) ? 1 : 0;
-}
-
-static int similarity_finalize_hashes(
-	git_buf_text_hashsig *sig, bool generate_pairs)
-{
-	if (!sig->size)
-		return 0;
-
-	/* create pairwise hashes if requested */
-
-	if (generate_pairs) {
-		size_t i, needed_size = sig->size * 2 - 1;
-
-		if (needed_size > sig->asize) {
-			uint32_t *new_hashes =
-				git__realloc(sig->hashes, needed_size * sizeof(uint32_t));
-			GITERR_CHECK_ALLOC(new_hashes);
-
-			sig->hashes = new_hashes;
-			sig->asize  = needed_size;
-		}
-
-		for (i = 1; i < sig->size; ++i)
-			sig->hashes[sig->size + i - 1] =
-				SIMILARITY_HASH_UPDATE(sig->hashes[i - 1], sig->hashes[i]);
-
-		sig->pairs = 1;
-	}
-
-	/* sort all hashes */
-
-	qsort(sig->hashes, sig->size, sizeof(uint32_t), similarity_compare_score);
-
-	if (generate_pairs)
-		qsort(&sig->hashes[sig->size], sig->size - 1, sizeof(uint32_t),
-			similarity_compare_score);
-
-	return 0;
-}
-
-int git_buf_text_hashsig_create(
-	git_buf_text_hashsig **out,
-	const git_buf *buf,
-	bool generate_pairs)
-{
-	int error;
-	git_buf_text_hashsig *sig = git__calloc(1, sizeof(git_buf_text_hashsig));
-	GITERR_CHECK_ALLOC(sig);
-
-	similarity_guess_format(sig, buf->ptr, buf->size);
-
-	error = similarity_add_hashes(sig, NULL, NULL, buf->ptr, buf->size);
-
-	if (!error)
-		error = similarity_finalize_hashes(sig, generate_pairs);
-
-	if (!error)
-		*out = sig;
-	else
-		git_buf_text_hashsig_free(sig);
-
-	return error;
-}
-
-int git_buf_text_hashsig_create_fromfile(
-	git_buf_text_hashsig **out,
-	const char *path,
-	bool generate_pairs)
-{
-	char buf[4096];
-	ssize_t buflen = 0;
-	uint32_t hash = SIMILARITY_HASH_START;
-	size_t hashlen = 0;
-	int error = 0, fd;
-	git_buf_text_hashsig *sig = git__calloc(1, sizeof(git_buf_text_hashsig));
-	GITERR_CHECK_ALLOC(sig);
-
-	if ((fd = git_futils_open_ro(path)) < 0) {
-		git__free(sig);
-		return fd;
-	}
-
-	while (!error && (buflen = p_read(fd, buf, sizeof(buf))) > 0) {
-		if (sig->format == SIMILARITY_FORMAT_UNKNOWN)
-			similarity_guess_format(sig, buf, buflen);
-
-		error = similarity_add_hashes(sig, &hash, &hashlen, buf, buflen);
-	}
-
-	if (buflen < 0) {
-		giterr_set(GITERR_OS,
-			"Read error on '%s' while calculating similarity hashes", path);
-		error = (int)buflen;
-	}
-
-	p_close(fd);
-
-	if (!error && hashlen > 0)
-		error = similarity_record_hash(sig, hash);
-
-	if (!error)
-		error = similarity_finalize_hashes(sig, generate_pairs);
-
-	if (!error)
-		*out = sig;
-	else
-		git_buf_text_hashsig_free(sig);
-
-	return error;
-}
-
-void git_buf_text_hashsig_free(git_buf_text_hashsig *sig)
-{
-	if (!sig)
-		return;
-
-	if (sig->hashes) {
-		git__free(sig->hashes);
-		sig->hashes = NULL;
-	}
-
-	git__free(sig);
-}
-
-int git_buf_text_hashsig_compare(
-	const git_buf_text_hashsig *a,
-	const git_buf_text_hashsig *b,
-	int scale)
-{
-	size_t matches = 0, pairs = 0, total = 0, i, j;
-
-	if (a->format != b->format || !a->size || !b->size)
-		return 0;
-
-	if (scale <= 0)
-		scale = 100;
-
-	/* hash lists are sorted - just look for overlap vs total */
-
-	for (i = 0, j = 0; i < a->size && j < b->size; ) {
-		uint32_t av = a->hashes[i];
-		uint32_t bv = b->hashes[j];
-
-		if (av < bv)
-			++i;
-		else if (av > bv)
-			++j;
-		else {
-			++i; ++j;
-			++matches;
-		}
-	}
-
-	total = (a->size + b->size);
-
-	if (a->pairs && b->pairs) {
-		for (i = 0, j = 0; i < a->size - 1 && j < b->size - 1; ) {
-			uint32_t av = a->hashes[i + a->size];
-			uint32_t bv = b->hashes[j + b->size];
-
-			if (av < bv)
-				++i;
-			else if (av > bv)
-				++j;
-			else {
-				++i; ++j;
-				++pairs;
-			}
-		}
-
-		total += (a->size + b->size - 2);
-	}
-
-	return (int)(scale * 2 * (matches + pairs) / total);
-}
-
--- a/src/buf_text.h
+++ b/src/buf_text.h
@@ -105,52 +105,4 @@ extern int git_buf_text_detect_bom(
 extern bool git_buf_text_gather_stats(
 	git_buf_text_stats *stats, const git_buf *buf, bool skip_bom);

-/**
- * Similarity signature of line hashes for a buffer
- */
-typedef struct git_buf_text_hashsig git_buf_text_hashsig;
-
-/**
- * Build a similarity signature for a buffer
- *
- * This can either generate a simple array of hashed lines/runs in the
- * file, or it can also keep hashes of pairs of runs in sequence.  Adding
- * the pairwise runs means the final score will be sensitive to line
- * ordering changes as well as individual line contents.
- *
- * @param out The array of hashed runs representing the file content
- * @param buf The contents of the file to hash
- * @param generate_pairwise_hashes Should pairwise runs be hashed
- */
-extern int git_buf_text_hashsig_create(
-	git_buf_text_hashsig **out,
-	const git_buf *buf,
-	bool generate_pairwise_hashes);
-
-/**
- * Build a similarity signature from a file
- *
- * This walks through the file, only loading a maximum of 4K of file data at
- * a time.  Otherwise, it acts just like `git_buf_text_hashsig_create`.
- */
-extern int git_buf_text_hashsig_create_fromfile(
-	git_buf_text_hashsig **out,
-	const char *path,
-	bool generate_pairwise_hashes);
-
-/**
- * Release memory for a content similarity signature
- */
-extern void git_buf_text_hashsig_free(git_buf_text_hashsig *sig);
-
-/**
- * Measure similarity between two files
- *
- * @return <0 for error, [0 to scale] as similarity score
- */
-extern int git_buf_text_hashsig_compare(
-	const git_buf_text_hashsig *a,
-	const git_buf_text_hashsig *b,
-	int scale);
-
 #endif
--- a/src/diff_tform.c
+++ b/src/diff_tform.c
@@ -7,7 +7,7 @@
 #include "common.h"
 #include "diff.h"
 #include "git2/config.h"
-#include "buf_text.h"
+#include "hashsig.h"

 static git_diff_delta *diff_delta__dup(
 	const git_diff_delta *d, git_pool *pool)
@@ -300,7 +300,7 @@ on_error:

 typedef struct {
 	/* array of delta index * 2 + (old_file/new_file) -> file hashes */
-	git_buf_text_hashsig *sigs;
+	git_hashsig *sigs;
 } diff_similarity_cache;

 static unsigned int calc_similarity(
@@ -308,6 +308,8 @@ static unsigned int calc_similarity(
 {
 	diff_similarity_cache *cache = ref;

+	GIT_UNUSED(cache);
+
 	if (git_oid_cmp(&old_file->oid, &new_file->oid) == 0)
 		return 100;


--- a/src/hashsig.c
+++ b/src/hashsig.c
+/*
+ * Copyright (C) the libgit2 contributors. All rights reserved.
+ *
+ * This file is part of libgit2, distributed under the GNU GPL v2 with
+ * a Linking Exception. For full terms see the included COPYING file.
+ */
+#include "hashsig.h"
+#include "fileops.h"
+
+typedef uint32_t hashsig_t;
+typedef uint64_t hashsig_state;
+
+#define HASHSIG_SCALE 100
+
+#define HASHSIG_HASH_WINDOW 32
+#define HASHSIG_HASH_START	0
+#define HASHSIG_HASH_SHIFT  5
+#define HASHSIG_HASH_MASK   0x7FFFFFFF
+
+#define HASHSIG_HEAP_SIZE ((1 << 7) - 1)
+
+typedef int (*hashsig_cmp)(const void *a, const void *b);
+
+typedef struct {
+	int size, asize;
+	hashsig_cmp cmp;
+	hashsig_t values[HASHSIG_HEAP_SIZE];
+} hashsig_heap;
+
+typedef struct {
+	hashsig_state state, shift_n;
+	char window[HASHSIG_HASH_WINDOW];
+	int win_len, win_pos, saw_lf;
+} hashsig_in_progress;
+
+#define HASHSIG_IN_PROGRESS_INIT { HASHSIG_HASH_START, 1, {0}, 0, 0, 1 }
+
+struct git_hashsig {
+	hashsig_heap mins;
+	hashsig_heap maxs;
+	git_hashsig_option_t opt;
+	int considered;
+};
+
+#define HEAP_LCHILD_OF(I) (((I)*2)+1)
+#define HEAP_RCHILD_OF(I) (((I)*2)+2)
+#define HEAP_PARENT_OF(I) (((I)-1)>>1)
+
+static void hashsig_heap_init(hashsig_heap *h, hashsig_cmp cmp)
+{
+	h->size  = 0;
+	h->asize = HASHSIG_HEAP_SIZE;
+	h->cmp   = cmp;
+}
+
+static int hashsig_cmp_max(const void *a, const void *b)
+{
+	hashsig_t av = *(const hashsig_t *)a, bv = *(const hashsig_t *)b;
+	return (av < bv) ? -1 : (av > bv) ? 1 : 0;
+}
+
+static int hashsig_cmp_min(const void *a, const void *b)
+{
+	hashsig_t av = *(const hashsig_t *)a, bv = *(const hashsig_t *)b;
+	return (av > bv) ? -1 : (av < bv) ? 1 : 0;
+}
+
+static void hashsig_heap_up(hashsig_heap *h, int el)
+{
+	int parent_el = HEAP_PARENT_OF(el);
+
+	while (el > 0 && h->cmp(&h->values[parent_el], &h->values[el]) > 0) {
+		hashsig_t t = h->values[el];
+		h->values[el] = h->values[parent_el];
+		h->values[parent_el] = t;
+
+		el = parent_el;
+		parent_el = HEAP_PARENT_OF(el);
+	}
+}
+
+static void hashsig_heap_down(hashsig_heap *h, int el)
+{
+	hashsig_t v, lv, rv;
+
+	/* 'el < h->size / 2' tests if el is bottom row of heap */
+
+	while (el < h->size / 2) {
+		int lel = HEAP_LCHILD_OF(el), rel = HEAP_RCHILD_OF(el), swapel;
+
+		v  = h->values[el];
+		lv = h->values[lel];
+		rv = h->values[rel];
+
+		if (h->cmp(&v, &lv) < 0 && h->cmp(&v, &rv) < 0)
+			break;
+
+		swapel = (h->cmp(&lv, &rv) < 0) ? lel : rel;
+
+		h->values[el] = h->values[swapel];
+		h->values[swapel] = v;
+
+		el = swapel;
+	}
+}
+
+static void hashsig_heap_sort(hashsig_heap *h)
+{
+	/* only need to do this at the end for signature comparison */
+	qsort(h->values, h->size, sizeof(hashsig_t), h->cmp);
+}
+
+static void hashsig_heap_insert(hashsig_heap *h, hashsig_t val)
+{
+	/* if heap is full, pop top if new element should replace it */
+	if (h->size == h->asize && h->cmp(&val, &h->values[0]) > 0) {
+		h->size--;
+		h->values[0] = h->values[h->size];
+		hashsig_heap_down(h, 0);
+	}
+
+	/* if heap is not full, insert new element */
+	if (h->size < h->asize) {
+		h->values[h->size++] = val;
+		hashsig_heap_up(h, h->size - 1);
+	}
+}
+
+GIT_INLINE(bool) hashsig_include_char(
+	char ch, git_hashsig_option_t opt, int *saw_lf)
+{
+	if ((opt & GIT_HASHSIG_IGNORE_WHITESPACE) && git__isspace(ch))
+		return false;
+
+	if (opt & GIT_HASHSIG_SMART_WHITESPACE) {
+		if (ch == '\r' || (*saw_lf && git__isspace(ch)))
+			return false;
+
+		*saw_lf = (ch == '\n');
+	}
+
+	return true;
+}
+
+static void hashsig_initial_window(
+	git_hashsig *sig,
+	const char **data,
+	size_t size,
+	hashsig_in_progress *prog)
+{
+	hashsig_state state, shift_n;
+	int win_len;
+	const char *scan, *end;
+
+	/* init until we have processed at least HASHSIG_HASH_WINDOW data */
+
+	if (prog->win_len >= HASHSIG_HASH_WINDOW)
+		return;
+
+	state   = prog->state;
+	win_len = prog->win_len;
+	shift_n = prog->shift_n;
+
+	scan = *data;
+	end  = scan + size;
+
+	while (scan < end && win_len < HASHSIG_HASH_WINDOW) {
+		char ch = *scan++;
+
+		if (!hashsig_include_char(ch, sig->opt, &prog->saw_lf))
+			continue;
+
+		state = (state * HASHSIG_HASH_SHIFT + ch) & HASHSIG_HASH_MASK;
+
+		if (!win_len)
+			shift_n = 1;
+		else
+			shift_n = (shift_n * HASHSIG_HASH_SHIFT) & HASHSIG_HASH_MASK;
+
+		prog->window[win_len++] = ch;
+	}
+
+	/* insert initial hash if we just finished */
+
+	if (win_len == HASHSIG_HASH_WINDOW) {
+		hashsig_heap_insert(&sig->mins, state);
+		hashsig_heap_insert(&sig->maxs, state);
+		sig->considered = 1;
+	}
+
+	prog->state   = state;
+	prog->win_len = win_len;
+	prog->shift_n = shift_n;
+
+	*data = scan;
+}
+
+static int hashsig_add_hashes(
+	git_hashsig *sig,
+	const char *data,
+	size_t size,
+	hashsig_in_progress *prog)
+{
+	const char *scan = data, *end = data + size;
+	hashsig_state state, shift_n, rmv;
+
+	if (prog->win_len < HASHSIG_HASH_WINDOW)
+		hashsig_initial_window(sig, &scan, size, prog);
+
+	state   = prog->state;
+	shift_n = prog->shift_n;
+
+	/* advance window, adding new chars and removing old */
+
+	for (; scan < end; ++scan) {
+		char ch = *scan;
+
+		if (!hashsig_include_char(ch, sig->opt, &prog->saw_lf))
+			continue;
+
+		rmv = shift_n * prog->window[prog->win_pos];
+
+		state = (state - rmv) & HASHSIG_HASH_MASK;
+		state = (state * HASHSIG_HASH_SHIFT) & HASHSIG_HASH_MASK;
+		state = (state + ch) & HASHSIG_HASH_MASK;
+
+		hashsig_heap_insert(&sig->mins, state);
+		hashsig_heap_insert(&sig->maxs, state);
+		sig->considered++;
+
+		prog->window[prog->win_pos] = ch;
+		prog->win_pos = (prog->win_pos + 1) % HASHSIG_HASH_WINDOW;
+	}
+
+	prog->state = state;
+
+	return 0;
+}
+
+static int hashsig_finalize_hashes(git_hashsig *sig)
+{
+	if (sig->mins.size < HASHSIG_HEAP_SIZE) {
+		giterr_set(GITERR_INVALID,
+			"File too small for similarity signature calculation");
+		return GIT_EBUFS;
+	}
+
+	hashsig_heap_sort(&sig->mins);
+	hashsig_heap_sort(&sig->maxs);
+
+	return 0;
+}
+
+static git_hashsig *hashsig_alloc(git_hashsig_option_t opts)
+{
+	git_hashsig *sig = git__calloc(1, sizeof(git_hashsig));
+	if (!sig)
+		return NULL;
+
+	hashsig_heap_init(&sig->mins, hashsig_cmp_min);
+	hashsig_heap_init(&sig->maxs, hashsig_cmp_max);
+	sig->opt = opts;
+
+	return sig;
+}
+
+int git_hashsig_create(
+	git_hashsig **out,
+	const git_buf *buf,
+	git_hashsig_option_t opts)
+{
+	int error;
+	hashsig_in_progress prog = HASHSIG_IN_PROGRESS_INIT;
+	git_hashsig *sig = hashsig_alloc(opts);
+	GITERR_CHECK_ALLOC(sig);
+
+	error = hashsig_add_hashes(sig, buf->ptr, buf->size, &prog);
+
+	if (!error)
+		error = hashsig_finalize_hashes(sig);
+
+	if (!error)
+		*out = sig;
+	else
+		git_hashsig_free(sig);
+
+	return error;
+}
+
+int git_hashsig_create_fromfile(
+	git_hashsig **out,
+	const char *path,
+	git_hashsig_option_t opts)
+{
+	char buf[4096];
+	ssize_t buflen = 0;
+	int error = 0, fd;
+	hashsig_in_progress prog = HASHSIG_IN_PROGRESS_INIT;
+	git_hashsig *sig = hashsig_alloc(opts);
+	GITERR_CHECK_ALLOC(sig);
+
+	if ((fd = git_futils_open_ro(path)) < 0) {
+		git__free(sig);
+		return fd;
+	}
+
+	while (!error) {
+		if ((buflen = p_read(fd, buf, sizeof(buf))) <= 0) {
+			if ((error = buflen) < 0)
+				giterr_set(GITERR_OS,
+					"Read error on '%s' calculating similarity hashes", path);
+			break;
+		}
+
+		error = hashsig_add_hashes(sig, buf, buflen, &prog);
+	}
+
+	p_close(fd);
+
+	if (!error)
+		error = hashsig_finalize_hashes(sig);
+
+	if (!error)
+		*out = sig;
+	else
+		git_hashsig_free(sig);
+
+	return error;
+}
+
+void git_hashsig_free(git_hashsig *sig)
+{
+	git__free(sig);
+}
+
+static int hashsig_heap_compare(const hashsig_heap *a, const hashsig_heap *b)
+{
+	int matches = 0, i, j, cmp;
+
+	assert(a->cmp == b->cmp);
+
+	/* hash heaps are sorted - just look for overlap vs total */
+
+	for (i = 0, j = 0; i < a->size && j < b->size; ) {
+		cmp = a->cmp(&a->values[i], &b->values[j]);
+
+		if (cmp < 0)
+			++i;
+		else if (cmp > 0)
+			++j;
+		else {
+			++i; ++j; ++matches;
+		}
+	}
+
+	return HASHSIG_SCALE * (matches * 2) / (a->size + b->size);
+}
+
+int git_hashsig_compare(const git_hashsig *a, const git_hashsig *b)
+{
+	return (hashsig_heap_compare(&a->mins, &b->mins) +
+			hashsig_heap_compare(&a->maxs, &b->maxs)) / 2;
+}
+
--- a/src/hashsig.h
+++ b/src/hashsig.h
+/*
+ * Copyright (C) the libgit2 contributors. All rights reserved.
+ *
+ * This file is part of libgit2, distributed under the GNU GPL v2 with
+ * a Linking Exception. For full terms see the included COPYING file.
+ */
+#ifndef INCLUDE_hashsig_h__
+#define INCLUDE_hashsig_h__
+
+#include "buffer.h"
+
+/**
+ * Similarity signature of line hashes for a buffer
+ */
+typedef struct git_hashsig git_hashsig;
+
+typedef enum {
+	GIT_HASHSIG_NORMAL = 0, /* use all data */
+	GIT_HASHSIG_IGNORE_WHITESPACE = 1, /* ignore whitespace */
+	GIT_HASHSIG_SMART_WHITESPACE = 2, /* ignore \r and all space after \n */
+} git_hashsig_option_t;
+
+/**
+ * Build a similarity signature for a buffer
+ *
+ * If you have passed a whitespace-ignoring buffer, then the whitespace
+ * will be removed from the buffer while it is being processed, modifying
+ * the buffer in place.  Sorry about that!
+ *
+ * This will return an error if the buffer doesn't contain enough data to
+ * compute a valid signature.
+ *
+ * @param out The array of hashed runs representing the file content
+ * @param buf The contents of the file to hash
+ * @param generate_pairwise_hashes Should pairwise runs be hashed
+ */
+extern int git_hashsig_create(
+	git_hashsig **out,
+	const git_buf *buf,
+	git_hashsig_option_t opts);
+
+/**
+ * Build a similarity signature from a file
+ *
+ * This walks through the file, only loading a maximum of 4K of file data at
+ * a time.  Otherwise, it acts just like `git_hashsig_create`.
+ *
+ * This will return an error if the file doesn't contain enough data to
+ * compute a valid signature.
+ */
+extern int git_hashsig_create_fromfile(
+	git_hashsig **out,
+	const char *path,
+	git_hashsig_option_t opts);
+
+/**
+ * Release memory for a content similarity signature
+ */
+extern void git_hashsig_free(git_hashsig *sig);
+
+/**
+ * Measure similarity between two files
+ *
+ * @return <0 for error, [0 to 100] as similarity score
+ */
+extern int git_hashsig_compare(
+	const git_hashsig *a,
+	const git_hashsig *b);
+
+#endif
--- a/tests-clar/core/buffer.c
+++ b/tests-clar/core/buffer.c
 #include "clar_libgit2.h"
 #include "buffer.h"
 #include "buf_text.h"
+#include "hashsig.h"
 #include "fileops.h"

 #define TESTSTR "Have you seen that? Have you seeeen that??"
@@ -732,89 +733,94 @@ void test_core_buffer__classify_with_utf8(void)
 	cl_assert(git_buf_text_contains_nul(&b));
 }

+#define SIMILARITY_TEST_DATA_1 \
+	"test data\nright here\ninline\ntada\nneeds more data\nlots of data\n" \
+	"is this enough?\nthere has to be enough data to fill the hash array!\n" \
+	"Apparently 191 bytes is the minimum amount of data needed.\nHere goes!\n" \
+	"Let's make sure we've got plenty to go with here.\n   smile   \n"
+
 void test_core_buffer__similarity_metric(void)
 {
-	git_buf_text_hashsig *a, *b;
+	git_hashsig *a, *b;
 	git_buf buf = GIT_BUF_INIT;
 	int sim;

 	/* in the first case, we compare data to itself and expect 100% match */

-	cl_git_pass(git_buf_sets(&buf, "test data\nright here\ninline\ntada"));
-	cl_git_pass(git_buf_text_hashsig_create(&a, &buf, true));
-	cl_git_pass(git_buf_text_hashsig_create(&b, &buf, true));
+	cl_git_pass(git_buf_sets(&buf, SIMILARITY_TEST_DATA_1));
+	cl_git_pass(git_hashsig_create(&a, &buf, GIT_HASHSIG_NORMAL));
+	cl_git_pass(git_hashsig_create(&b, &buf, GIT_HASHSIG_NORMAL));

-	cl_assert_equal_i(100, git_buf_text_hashsig_compare(a, b, 100));
+	cl_assert_equal_i(100, git_hashsig_compare(a, b));

-	git_buf_text_hashsig_free(a);
-	git_buf_text_hashsig_free(b);
+	git_hashsig_free(a);
+	git_hashsig_free(b);

-	/* in the second case, half of a is matched and all of b is matched, so
-	 * we'll expect a score of around 66% to be the similarity score
-	 */
+	/* if we change just a single byte, how much does that change magnify? */

-	cl_git_pass(
-		git_buf_sets(&buf, "a\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\no\np\n"));
-	cl_git_pass(git_buf_text_hashsig_create(&a, &buf, true));
+	cl_git_pass(git_buf_sets(&buf, SIMILARITY_TEST_DATA_1));
+	cl_git_pass(git_hashsig_create(&a, &buf, GIT_HASHSIG_NORMAL));
+	cl_git_pass(git_buf_sets(&buf,
+		"Test data\nright here\ninline\ntada\nneeds more data\nlots of data\n"
+		"is this enough?\nthere has to be enough data to fill the hash array!\n"
+		"Apparently 191 bytes is the minimum amount of data needed.\nHere goes!\n"
+		 "Let's make sure we've got plenty to go with here.\n   smile   \n"));
+	cl_git_pass(git_hashsig_create(&b, &buf, GIT_HASHSIG_NORMAL));

-	cl_git_pass(git_buf_sets(&buf, "a\nb\nc\nd\ne\nf\ng\nh"));
-	cl_git_pass(git_buf_text_hashsig_create(&b, &buf, true));
+	sim = git_hashsig_compare(a, b);

-	sim = git_buf_text_hashsig_compare(a, b, 100);
-	cl_assert(sim > 60 && sim < 70);
+	cl_assert(95 < sim && sim < 100); /* expect >95% similarity */

-	git_buf_text_hashsig_free(a);
-	git_buf_text_hashsig_free(b);
+	git_hashsig_free(a);
+	git_hashsig_free(b);

-	/* in the reversed case, 100% of line hashes match, but no pairwise hashes
-	 * match, so we'll expect about a 50% match for a reversed file
-	 */
+	/* let's try comparing data to a superset of itself */

-	cl_git_pass(
-		git_buf_sets(&buf, "a\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\no\np\n"));
-	cl_git_pass(git_buf_text_hashsig_create(&a, &buf, true));
-	cl_git_pass(
-		git_buf_sets(&buf, "p\no\nn\nm\nl\nk\nj\ni\nh\ng\nf\ne\nd\nc\nb\na\n"));
-	cl_git_pass(git_buf_text_hashsig_create(&b, &buf, true));
+	cl_git_pass(git_buf_sets(&buf, SIMILARITY_TEST_DATA_1));
+	cl_git_pass(git_hashsig_create(&a, &buf, GIT_HASHSIG_NORMAL));
+	cl_git_pass(git_buf_sets(&buf, SIMILARITY_TEST_DATA_1
+		"and if I add some more, it should still be pretty similar, yes?\n"));
+	cl_git_pass(git_hashsig_create(&b, &buf, GIT_HASHSIG_NORMAL));

-	sim = git_buf_text_hashsig_compare(a, b, 100);
-	cl_assert(sim > 45 && sim < 55);
+	sim = git_hashsig_compare(a, b);

-	git_buf_text_hashsig_free(a);
-	git_buf_text_hashsig_free(b);
+	cl_assert(70 < sim && sim < 80); /* expect in the 70-80% similarity range */

-	/* if we don't use pairwise signatures, then a reversed file should
-	 * match 100%
-	 */
+	git_hashsig_free(a);
+	git_hashsig_free(b);
+
+	/* what if we keep about half the original data and add half new */
+
+	cl_git_pass(git_buf_sets(&buf, SIMILARITY_TEST_DATA_1));
+	cl_git_pass(git_hashsig_create(&a, &buf, GIT_HASHSIG_NORMAL));
+	cl_git_pass(git_buf_sets(&buf,
+		"test data\nright here\ninline\ntada\nneeds more data\nlots of data\n"
+		"is this enough?\nthere has to be enough data to fill the hash array!\n"
+		"okay, that's half the original\nwhat else can we add?\nmore data\n"
+		 "one more line will complete this\nshort\nlines\ndon't\nmatter\n"));
+	cl_git_pass(git_hashsig_create(&b, &buf, GIT_HASHSIG_NORMAL));

-	cl_git_pass(
-		git_buf_sets(&buf, "a\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\no\np\n"));
-	cl_git_pass(git_buf_text_hashsig_create(&a, &buf, false));
-	cl_git_pass(
-		git_buf_sets(&buf, "p\no\nn\nm\nl\nk\nj\ni\nh\ng\nf\ne\nd\nc\nb\na\n"));
-	cl_git_pass(git_buf_text_hashsig_create(&b, &buf, false));
+	sim = git_hashsig_compare(a, b);

-	sim = git_buf_text_hashsig_compare(a, b, 100);
-	cl_assert_equal_i(100, sim);
+	cl_assert(40 < sim && sim < 60); /* expect in the 40-60% similarity range */

-	git_buf_text_hashsig_free(a);
-	git_buf_text_hashsig_free(b);
+	git_hashsig_free(a);
+	git_hashsig_free(b);

 	/* lastly, let's check that we can hash file content as well */

-	cl_git_pass(
-		git_buf_sets(&buf, "a\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\no\np\n"));
-	cl_git_pass(git_buf_text_hashsig_create(&a, &buf, true));
+	cl_git_pass(git_buf_sets(&buf, SIMILARITY_TEST_DATA_1));
+	cl_git_pass(git_hashsig_create(&a, &buf, GIT_HASHSIG_NORMAL));

 	cl_git_pass(git_futils_mkdir("scratch", NULL, 0755, GIT_MKDIR_PATH));
-	cl_git_mkfile("scratch/testdata",
-		"a\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\no\np\n");
-	cl_git_pass(git_buf_text_hashsig_create_fromfile(&b, "scratch/testdata", true));
+	cl_git_mkfile("scratch/testdata", SIMILARITY_TEST_DATA_1);
+	cl_git_pass(git_hashsig_create_fromfile(
+		&b, "scratch/testdata", GIT_HASHSIG_NORMAL));

-	cl_assert_equal_i(100, git_buf_text_hashsig_compare(a, b, 100));
+	cl_assert_equal_i(100, git_hashsig_compare(a, b));

-	git_buf_text_hashsig_free(a);
-	git_buf_text_hashsig_free(b);
+	git_hashsig_free(a);
+	git_hashsig_free(b);

 	git_buf_free(&buf);
 	git_futils_rmdir_r("scratch", NULL, GIT_RMDIR_REMOVE_FILES);