hashsig.c 8.16 KB
Newer Older
1 2 3 4 5 6
/*
 * Copyright (C) the libgit2 contributors. All rights reserved.
 *
 * This file is part of libgit2, distributed under the GNU GPL v2 with
 * a Linking Exception. For full terms see the included COPYING file.
 */
7 8 9

#include "common.h"

10
#include "git2/sys/hashsig.h"
11
#include "futils.h"
12
#include "util.h"
13 14 15 16 17 18

typedef uint32_t hashsig_t;
typedef uint64_t hashsig_state;

#define HASHSIG_SCALE 100

19
#define HASHSIG_MAX_RUN 80
20
#define HASHSIG_HASH_START	INT64_C(0x012345678ABCDEF0)
21
#define HASHSIG_HASH_SHIFT  5
22 23 24

#define HASHSIG_HASH_MIX(S,CH) \
	(S) = ((S) << HASHSIG_HASH_SHIFT) - (S) + (hashsig_state)(CH)
25 26

#define HASHSIG_HEAP_SIZE ((1 << 7) - 1)
27
#define HASHSIG_HEAP_MIN_SIZE 4
28

29
typedef int (*hashsig_cmp)(const void *a, const void *b, void *);
30 31 32 33 34 35 36 37 38 39

typedef struct {
	int size, asize;
	hashsig_cmp cmp;
	hashsig_t values[HASHSIG_HEAP_SIZE];
} hashsig_heap;

struct git_hashsig {
	hashsig_heap mins;
	hashsig_heap maxs;
40
	size_t lines;
41 42 43
	git_hashsig_option_t opt;
};

44 45
#define HEAP_LCHILD_OF(I) (((I)<<1)+1)
#define HEAP_RCHILD_OF(I) (((I)<<1)+2)
46 47 48 49 50 51 52 53 54
#define HEAP_PARENT_OF(I) (((I)-1)>>1)

static void hashsig_heap_init(hashsig_heap *h, hashsig_cmp cmp)
{
	h->size  = 0;
	h->asize = HASHSIG_HEAP_SIZE;
	h->cmp   = cmp;
}

55
static int hashsig_cmp_max(const void *a, const void *b, void *payload)
56 57
{
	hashsig_t av = *(const hashsig_t *)a, bv = *(const hashsig_t *)b;
58
	GIT_UNUSED(payload);
59 60 61
	return (av < bv) ? -1 : (av > bv) ? 1 : 0;
}

62
static int hashsig_cmp_min(const void *a, const void *b, void *payload)
63 64
{
	hashsig_t av = *(const hashsig_t *)a, bv = *(const hashsig_t *)b;
65
	GIT_UNUSED(payload);
66 67 68 69 70 71 72
	return (av > bv) ? -1 : (av < bv) ? 1 : 0;
}

static void hashsig_heap_up(hashsig_heap *h, int el)
{
	int parent_el = HEAP_PARENT_OF(el);

73
	while (el > 0 && h->cmp(&h->values[parent_el], &h->values[el], NULL) > 0) {
74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
		hashsig_t t = h->values[el];
		h->values[el] = h->values[parent_el];
		h->values[parent_el] = t;

		el = parent_el;
		parent_el = HEAP_PARENT_OF(el);
	}
}

static void hashsig_heap_down(hashsig_heap *h, int el)
{
	hashsig_t v, lv, rv;

	/* 'el < h->size / 2' tests if el is bottom row of heap */

	while (el < h->size / 2) {
		int lel = HEAP_LCHILD_OF(el), rel = HEAP_RCHILD_OF(el), swapel;

		v  = h->values[el];
		lv = h->values[lel];
		rv = h->values[rel];

96
		if (h->cmp(&v, &lv, NULL) < 0 && h->cmp(&v, &rv, NULL) < 0)
97 98
			break;

99
		swapel = (h->cmp(&lv, &rv, NULL) < 0) ? lel : rel;
100 101 102 103 104 105 106 107 108 109 110

		h->values[el] = h->values[swapel];
		h->values[swapel] = v;

		el = swapel;
	}
}

static void hashsig_heap_sort(hashsig_heap *h)
{
	/* only need to do this at the end for signature comparison */
111
	git__qsort_r(h->values, h->size, sizeof(hashsig_t), h->cmp, NULL);
112 113 114 115 116 117 118 119 120 121
}

static void hashsig_heap_insert(hashsig_heap *h, hashsig_t val)
{
	/* if heap is not full, insert new element */
	if (h->size < h->asize) {
		h->values[h->size++] = val;
		hashsig_heap_up(h, h->size - 1);
	}

122 123 124 125 126
	/* if heap is full, pop top if new element should replace it */
	else if (h->cmp(&val, &h->values[0], NULL) > 0) {
		h->size--;
		h->values[0] = h->values[h->size];
		hashsig_heap_down(h, 0);
127 128 129 130
	}

}

131 132 133 134
typedef struct {
	int use_ignores;
	uint8_t ignore_ch[256];
} hashsig_in_progress;
135

136
static int hashsig_in_progress_init(
137 138 139 140
	hashsig_in_progress *prog, git_hashsig *sig)
{
	int i;

141
	/* no more than one can be set */
142
	GIT_ASSERT(!(sig->opt & GIT_HASHSIG_IGNORE_WHITESPACE) ||
143 144 145
		   !(sig->opt & GIT_HASHSIG_SMART_WHITESPACE));

	if (sig->opt & GIT_HASHSIG_IGNORE_WHITESPACE) {
146 147 148
		for (i = 0; i < 256; ++i)
			prog->ignore_ch[i] = git__isspace_nonlf(i);
		prog->use_ignores = 1;
149
	} else if (sig->opt & GIT_HASHSIG_SMART_WHITESPACE) {
150 151 152
		for (i = 0; i < 256; ++i)
			prog->ignore_ch[i] = git__isspace(i);
		prog->use_ignores = 1;
153
	} else {
154
		memset(prog, 0, sizeof(*prog));
155
	}
156 157

	return 0;
158 159 160 161
}

static int hashsig_add_hashes(
	git_hashsig *sig,
162
	const uint8_t *data,
163 164 165
	size_t size,
	hashsig_in_progress *prog)
{
166 167 168 169 170 171 172 173 174 175 176 177 178 179
	const uint8_t *scan = data, *end = data + size;
	hashsig_state state = HASHSIG_HASH_START;
	int use_ignores = prog->use_ignores, len;
	uint8_t ch;

	while (scan < end) {
		state = HASHSIG_HASH_START;

		for (len = 0; scan < end && len < HASHSIG_MAX_RUN; ) {
			ch = *scan;

			if (use_ignores)
				for (; scan < end && git__isspace_nonlf(ch); ch = *scan)
					++scan;
180 181
			else if (sig->opt &
					 (GIT_HASHSIG_IGNORE_WHITESPACE | GIT_HASHSIG_SMART_WHITESPACE))
182 183 184 185
				for (; scan < end && ch == '\r'; ch = *scan)
					++scan;

			/* peek at next character to decide what to do next */
186
			if (sig->opt & GIT_HASHSIG_SMART_WHITESPACE)
187 188 189 190 191 192 193
				use_ignores = (ch == '\n');

			if (scan >= end)
				break;
			++scan;

			/* check run terminator */
194 195
			if (ch == '\n' || ch == '\0') {
				sig->lines++;
196
				break;
197
			}
198 199 200 201

			++len;
			HASHSIG_HASH_MIX(state, ch);
		}
202

203 204 205
		if (len > 0) {
			hashsig_heap_insert(&sig->mins, (hashsig_t)state);
			hashsig_heap_insert(&sig->maxs, (hashsig_t)state);
206

207 208 209
			while (scan < end && (*scan == '\n' || !*scan))
				++scan;
		}
210 211
	}

212
	prog->use_ignores = use_ignores;
213 214 215 216 217 218

	return 0;
}

static int hashsig_finalize_hashes(git_hashsig *sig)
{
219 220
	if (sig->mins.size < HASHSIG_HEAP_MIN_SIZE &&
		!(sig->opt & GIT_HASHSIG_ALLOW_SMALL_FILES)) {
221
		git_error_set(GIT_ERROR_INVALID,
222
			"file too small for similarity signature calculation");
223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246
		return GIT_EBUFS;
	}

	hashsig_heap_sort(&sig->mins);
	hashsig_heap_sort(&sig->maxs);

	return 0;
}

static git_hashsig *hashsig_alloc(git_hashsig_option_t opts)
{
	git_hashsig *sig = git__calloc(1, sizeof(git_hashsig));
	if (!sig)
		return NULL;

	hashsig_heap_init(&sig->mins, hashsig_cmp_min);
	hashsig_heap_init(&sig->maxs, hashsig_cmp_max);
	sig->opt = opts;

	return sig;
}

int git_hashsig_create(
	git_hashsig **out,
247 248
	const char *buf,
	size_t buflen,
249 250 251
	git_hashsig_option_t opts)
{
	int error;
252
	hashsig_in_progress prog;
253
	git_hashsig *sig = hashsig_alloc(opts);
254
	GIT_ERROR_CHECK_ALLOC(sig);
255

256 257
	if ((error = hashsig_in_progress_init(&prog, sig)) < 0)
		return error;
258 259

	error = hashsig_add_hashes(sig, (const uint8_t *)buf, buflen, &prog);
260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276

	if (!error)
		error = hashsig_finalize_hashes(sig);

	if (!error)
		*out = sig;
	else
		git_hashsig_free(sig);

	return error;
}

int git_hashsig_create_fromfile(
	git_hashsig **out,
	const char *path,
	git_hashsig_option_t opts)
{
277
	uint8_t buf[0x1000];
278 279
	ssize_t buflen = 0;
	int error = 0, fd;
280
	hashsig_in_progress prog;
281
	git_hashsig *sig = hashsig_alloc(opts);
282
	GIT_ERROR_CHECK_ALLOC(sig);
283 284 285 286 287 288

	if ((fd = git_futils_open_ro(path)) < 0) {
		git__free(sig);
		return fd;
	}

289 290
	if ((error = hashsig_in_progress_init(&prog, sig)) < 0) {
		p_close(fd);
291
		return error;
292
	}
293

294 295
	while (!error) {
		if ((buflen = p_read(fd, buf, sizeof(buf))) <= 0) {
296
			if ((error = (int)buflen) < 0)
297
				git_error_set(GIT_ERROR_OS,
298
					"read error on '%s' calculating similarity hashes", path);
299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326
			break;
		}

		error = hashsig_add_hashes(sig, buf, buflen, &prog);
	}

	p_close(fd);

	if (!error)
		error = hashsig_finalize_hashes(sig);

	if (!error)
		*out = sig;
	else
		git_hashsig_free(sig);

	return error;
}

void git_hashsig_free(git_hashsig *sig)
{
	git__free(sig);
}

static int hashsig_heap_compare(const hashsig_heap *a, const hashsig_heap *b)
{
	int matches = 0, i, j, cmp;

327
	GIT_ASSERT_WITH_RETVAL(a->cmp == b->cmp, 0);
328 329 330 331

	/* hash heaps are sorted - just look for overlap vs total */

	for (i = 0, j = 0; i < a->size && j < b->size; ) {
332
		cmp = a->cmp(&a->values[i], &b->values[j], NULL);
333 334 335 336 337 338 339 340 341 342 343 344 345 346 347

		if (cmp < 0)
			++i;
		else if (cmp > 0)
			++j;
		else {
			++i; ++j; ++matches;
		}
	}

	return HASHSIG_SCALE * (matches * 2) / (a->size + b->size);
}

int git_hashsig_compare(const git_hashsig *a, const git_hashsig *b)
{
348 349 350 351 352 353 354 355 356 357 358 359
	/* if we have no elements in either file then each file is either
	 * empty or blank.  if we're ignoring whitespace then the files are
	 * similar, otherwise they're dissimilar.
	 */
	if (a->mins.size == 0 && b->mins.size == 0) {
		if ((!a->lines && !b->lines) ||
			(a->opt & GIT_HASHSIG_IGNORE_WHITESPACE))
			return HASHSIG_SCALE;
		else
			return 0;
	}

360 361 362
	/* if we have fewer than the maximum number of elements, then just use
	 * one array since the two arrays will be the same
	 */
363
	if (a->mins.size < HASHSIG_HEAP_SIZE) {
364
		return hashsig_heap_compare(&a->mins, &b->mins);
365 366 367 368 369 370 371 372 373 374
	} else {
		int mins, maxs;

		if ((mins = hashsig_heap_compare(&a->mins, &b->mins)) < 0)
			return mins;
		if ((maxs = hashsig_heap_compare(&a->maxs, &b->maxs)) < 0)
			return maxs;

		return (mins + maxs) / 2;
	}
375
}