/*
 * Copyright (C) the libgit2 contributors. All rights reserved.
 *
 * This file is part of libgit2, distributed under the GNU GPL v2 with
 * a Linking Exception. For full terms see the included COPYING file.
 */

#include <stdio.h>

#include "sha1_lookup.h"
#include "common.h"

/*
 * Conventional binary search loop looks like this:
 *
 *	unsigned lo, hi;
 *	do {
 *		unsigned mi = (lo + hi) / 2;
 *		int cmp = "entry pointed at by mi" minus "target";
 *		if (!cmp)
 *			return (mi is the wanted one)
 *		if (cmp > 0)
 *			hi = mi; "mi is larger than target"
 *		else
 *			lo = mi+1; "mi is smaller than target"
 *	} while (lo < hi);
 *
 * The invariants are:
 *
 * - When entering the loop, lo points at a slot that is never
 *	above the target (it could be at the target), hi points at a
 *	slot that is guaranteed to be above the target (it can never
 *	be at the target).
 *
 * - We find a point 'mi' between lo and hi (mi could be the same
 *	as lo, but can never be the same as hi), and check if it hits
 *	the target. There are three cases:
 *
 *	- if it is a hit, we are happy.
 *
 *	- if it is strictly higher than the target, we set hi to it,
 *		and repeat the search.
 *
 *	- if it is strictly lower than the target, we update lo to
 *		one slot after it, because we allow lo to be at the target.
 *
 *	If the loop exits, there is no matching entry.
 *
 * When choosing 'mi', we do not have to take the "middle"; it can be
 * anywhere in between lo and hi, as long as lo <= mi < hi is
 * satisfied. When we somehow know that the distance between the
 * target and lo is much shorter than that between the target and hi,
 * we could pick a mi that is much closer to lo than the midpoint.
 *
 * Now, we can take advantage of the fact that SHA-1 is a good hash
 * function, and as long as there are enough entries in the table, we
 * can expect uniform distribution. An entry that begins with, for
 * example, "deadbeef..." is much more likely to appear well past
 * the midpoint of the table. It can reasonably be expected to be near
 * 87% (222/256) from the top of the table.
 *
 * However, we do not want to pick "mi" too precisely. If the entry at
 * the 87% in the above example turns out to be higher than the target
 * we are looking for, we would end up narrowing the search space down
 * only by 13%, instead of the 50% we would get if we did a simple
 * binary search. So we want to hedge our bets by being less aggressive.
 *
 * The table at "table" holds at least "nr" entries of "elem_size"
 * bytes each. Each entry has the SHA-1 key at "key_offset". The
 * table is sorted by the SHA-1 key of the entries. The caller wants
 * to find the entry with "key", and knows that the entry at "lo" is
 * not higher than the entry it is looking for, and that the entry at
 * "hi" is higher than the entry it is looking for.
 */
int sha1_entry_pos(const void *table,
Vicent Marti committed
76 77 78 79
			size_t elem_size,
			size_t key_offset,
			unsigned lo, unsigned hi, unsigned nr,
			const unsigned char *key)
80
{
81
	const unsigned char *base = (const unsigned char*)table;
82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
	const unsigned char *hi_key, *lo_key;
	unsigned ofs_0;

	if (!nr || lo >= hi)
		return -1;

	if (nr == hi)
		hi_key = NULL;
	else
		hi_key = base + elem_size * hi + key_offset;
	lo_key = base + elem_size * lo + key_offset;

	ofs_0 = 0;
	do {
		int cmp;
		unsigned ofs, mi, range;
		unsigned lov, hiv, kyv;
		const unsigned char *mi_key;

		range = hi - lo;
		if (hi_key) {
			for (ofs = ofs_0; ofs < 20; ofs++)
				if (lo_key[ofs] != hi_key[ofs])
					break;
			ofs_0 = ofs;
			/*
			 * byte 0 thru (ofs-1) are the same between
			 * lo and hi; ofs is the first byte that is
			 * different.
			 */
			hiv = hi_key[ofs_0];
			if (ofs_0 < 19)
				hiv = (hiv << 8) | hi_key[ofs_0+1];
		} else {
			hiv = 256;
			if (ofs_0 < 19)
				hiv <<= 8;
		}
		lov = lo_key[ofs_0];
		kyv = key[ofs_0];
		if (ofs_0 < 19) {
			lov = (lov << 8) | lo_key[ofs_0+1];
			kyv = (kyv << 8) | key[ofs_0+1];
		}
		assert(lov < hiv);

		if (kyv < lov)
			return -1 - lo;
		if (hiv < kyv)
			return -1 - hi;

		/*
		 * Even if we know the target is much closer to 'hi'
		 * than 'lo', if we pick too precisely and overshoot
		 * (e.g. when we know 'mi' is closer to 'hi' than to
		 * 'lo', pick 'mi' that is higher than the target), we
		 * end up narrowing the search space by a smaller
		 * amount (i.e. the distance between 'mi' and 'hi')
		 * than what we would have (i.e. about half of 'lo'
Vicent Marti committed
141
		 * and 'hi'). Hedge our bets to pick 'mi' less
142 143 144 145 146 147 148 149 150 151 152 153 154 155 156
		 * aggressively, i.e. make 'mi' a bit closer to the
		 * middle than we would otherwise pick.
		 */
		kyv = (kyv * 6 + lov + hiv) / 8;
		if (lov < hiv - 1) {
			if (kyv == lov)
				kyv++;
			else if (kyv == hiv)
				kyv--;
		}
		mi = (range - 1) * (kyv - lov) / (hiv - lov) + lo;

#ifdef INDEX_DEBUG_LOOKUP
		printf("lo %u hi %u rg %u mi %u ", lo, hi, range, mi);
		printf("ofs %u lov %x, hiv %x, kyv %x\n",
Vicent Marti committed
157
				ofs_0, lov, hiv, kyv);
158 159 160
#endif

		if (!(lo <= mi && mi < hi)) {
161 162
			giterr_set(GITERR_INVALID, "Assertion failure. Binary search invariant is false");
			return -1;
163 164 165 166 167 168 169 170 171 172 173 174 175 176
		}

		mi_key = base + elem_size * mi + key_offset;
		cmp = memcmp(mi_key + ofs_0, key + ofs_0, 20 - ofs_0);
		if (!cmp)
			return mi;
		if (cmp > 0) {
			hi = mi;
			hi_key = mi_key;
		} else {
			lo = mi + 1;
			lo_key = mi_key + elem_size;
		}
	} while (lo < hi);
177
	return -((int)lo)-1;
178
}