/*
 * Copyright (C) the libgit2 contributors. All rights reserved.
 *
 * This file is part of libgit2, distributed under the GNU GPL v2 with
 * a Linking Exception. For full terms see the included COPYING file.
 */

#include <stdio.h>

#include "sha1_lookup.h"
#include "common.h"
12
#include "oid.h"
13 14 15 16 17

/*
 * Conventional binary search loop looks like this:
 *
 *	unsigned lo, hi;
 *	do {
 *		unsigned mi = (lo + hi) / 2;
 *		int cmp = "entry pointed at by mi" minus "target";
 *		if (!cmp)
 *			return (mi is the wanted one)
 *		if (cmp > 0)
 *			hi = mi; "mi is larger than target"
 *		else
 *			lo = mi+1; "mi is smaller than target"
 *	} while (lo < hi);
 *
 * The invariants are:
 *
 * - When entering the loop, lo points at a slot that is never
 *	above the target (it could be at the target), hi points at a
 *	slot that is guaranteed to be above the target (it can never
 *	be at the target).
 *
 * - We find a point 'mi' between lo and hi (mi could be the same
 *	as lo, but can never be the same as hi), and check if it hits
 *	the target. There are three cases:
 *
 *	- if it is a hit, we are happy.
 *
 *	- if it is strictly higher than the target, we set it to hi,
 *		and repeat the search.
 *
 *	- if it is strictly lower than the target, we update lo to
 *		one slot after it, because we allow lo to be at the target.
 *
 *	If the loop exits, there is no matching entry.
 *
 * When choosing 'mi', we do not have to take the "middle" but
 * anywhere in between lo and hi, as long as lo <= mi < hi is
 * satisfied. When we somehow know that the distance between the
 * target and lo is much shorter than the target and hi, we could
 * pick mi that is much closer to lo than the midway.
 *
 * Now, we can take advantage of the fact that SHA-1 is a good hash
 * function, and as long as there are enough entries in the table, we
 * can expect uniform distribution. An entry that begins with for
 * example "deadbeef..." is much more likely to appear later than
 * the midway of the table. It can reasonably be expected to be near
 * 87% (222/256) from the top of the table.
 *
 * However, we do not want to pick "mi" too precisely. If the entry at
 * the 87% in the above example turns out to be higher than the target
 * we are looking for, we would end up narrowing the search space down
 * only by 13%, instead of 50% we would get if we did a simple binary
 * search. So we would want to hedge our bets by being less aggressive.
 *
 * The table at "table" holds at least "nr" entries of "elem_size"
 * bytes each. Each entry has the SHA-1 key at "key_offset". The
 * table is sorted by the SHA-1 key of the entries. The caller wants
 * to find the entry with "key", and knows that the entry at "lo" is
 * not higher than the entry it is looking for, and that the entry at
 * "hi" is higher than the entry it is looking for.
 */
int sha1_entry_pos(const void *table,
Vicent Marti committed
77 78 79 80
			size_t elem_size,
			size_t key_offset,
			unsigned lo, unsigned hi, unsigned nr,
			const unsigned char *key)
81
{
82
	const unsigned char *base = (const unsigned char*)table;
83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111
	const unsigned char *hi_key, *lo_key;
	unsigned ofs_0;

	if (!nr || lo >= hi)
		return -1;

	if (nr == hi)
		hi_key = NULL;
	else
		hi_key = base + elem_size * hi + key_offset;
	lo_key = base + elem_size * lo + key_offset;

	ofs_0 = 0;
	do {
		int cmp;
		unsigned ofs, mi, range;
		unsigned lov, hiv, kyv;
		const unsigned char *mi_key;

		range = hi - lo;
		if (hi_key) {
			for (ofs = ofs_0; ofs < 20; ofs++)
				if (lo_key[ofs] != hi_key[ofs])
					break;
			ofs_0 = ofs;
			/*
			 * byte 0 thru (ofs-1) are the same between
			 * lo and hi; ofs is the first byte that is
			 * different.
112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146
			 *
			 * If ofs==20, then no bytes are different,
			 * meaning we have entries with duplicate
			 * keys. We know that we are in a solid run
			 * of this entry (because the entries are
			 * sorted, and our lo and hi are the same,
			 * there can be nothing but this single key
			 * in between). So we can stop the search.
			 * Either one of these entries is it (and
			 * we do not care which), or we do not have
			 * it.
			 *
			 * Furthermore, we know that one of our
			 * endpoints must be the edge of the run of
			 * duplicates. For example, given this
			 * sequence:
			 *
			 *     idx 0 1 2 3 4 5
			 *     key A C C C C D
			 *
			 * If we are searching for "B", we might
			 * hit the duplicate run at lo=1, hi=3
			 * (e.g., by first mi=3, then mi=0). But we
			 * can never have lo > 1, because B < C.
			 * That is, if our key is less than the
			 * run, we know that "lo" is the edge, but
			 * we can say nothing of "hi". Similarly,
			 * if our key is greater than the run, we
			 * know that "hi" is the edge, but we can
			 * say nothing of "lo".
			 *
			 * Therefore if we do not find it, we also
			 * know where it would go if it did exist:
			 * just on the far side of the edge that we
			 * know about.
147
			 */
148 149 150 151 152 153 154 155 156 157 158 159
			if (ofs == 20) {
				mi = lo;
				mi_key = base + elem_size * mi + key_offset;
				cmp = memcmp(mi_key, key, 20);
				if (!cmp)
					return mi;
				if (cmp < 0)
					return -1 - hi;
				else
					return -1 - lo;
			}

160 161 162 163 164 165 166 167 168 169 170 171 172 173
			hiv = hi_key[ofs_0];
			if (ofs_0 < 19)
				hiv = (hiv << 8) | hi_key[ofs_0+1];
		} else {
			hiv = 256;
			if (ofs_0 < 19)
				hiv <<= 8;
		}
		lov = lo_key[ofs_0];
		kyv = key[ofs_0];
		if (ofs_0 < 19) {
			lov = (lov << 8) | lo_key[ofs_0+1];
			kyv = (kyv << 8) | key[ofs_0+1];
		}
174
		assert(lov < hiv);
175 176 177 178 179 180 181 182 183 184 185 186 187 188

		if (kyv < lov)
			return -1 - lo;
		if (hiv < kyv)
			return -1 - hi;

		/*
		 * Even if we know the target is much closer to 'hi'
		 * than 'lo', if we pick too precisely and overshoot
		 * (e.g. when we know 'mi' is closer to 'hi' than to
		 * 'lo', pick 'mi' that is higher than the target), we
		 * end up narrowing the search space by a smaller
		 * amount (i.e. the distance between 'mi' and 'hi')
		 * than what we would have (i.e. about half of 'lo'
Vicent Marti committed
189
		 * and 'hi'). Hedge our bets to pick 'mi' less
190 191 192 193 194 195 196 197 198 199 200 201 202 203 204
		 * aggressively, i.e. make 'mi' a bit closer to the
		 * middle than we would otherwise pick.
		 */
		kyv = (kyv * 6 + lov + hiv) / 8;
		if (lov < hiv - 1) {
			if (kyv == lov)
				kyv++;
			else if (kyv == hiv)
				kyv--;
		}
		mi = (range - 1) * (kyv - lov) / (hiv - lov) + lo;

#ifdef INDEX_DEBUG_LOOKUP
		printf("lo %u hi %u rg %u mi %u ", lo, hi, range, mi);
		printf("ofs %u lov %x, hiv %x, kyv %x\n",
Vicent Marti committed
205
				ofs_0, lov, hiv, kyv);
206 207 208
#endif

		if (!(lo <= mi && mi < hi)) {
209 210
			giterr_set(GITERR_INVALID, "Assertion failure. Binary search invariant is false");
			return -1;
211 212 213 214 215 216 217 218 219 220 221 222 223 224
		}

		mi_key = base + elem_size * mi + key_offset;
		cmp = memcmp(mi_key + ofs_0, key + ofs_0, 20 - ofs_0);
		if (!cmp)
			return mi;
		if (cmp > 0) {
			hi = mi;
			hi_key = mi_key;
		} else {
			lo = mi + 1;
			lo_key = mi_key + elem_size;
		}
	} while (lo < hi);
225
	return -((int)lo)-1;
226
}
227 228 229 230 231 232

int sha1_position(const void *table,
			size_t stride,
			unsigned lo, unsigned hi,
			const unsigned char *key)
{
233 234
	const unsigned char *base = table;

235 236
	do {
		unsigned mi = (lo + hi) / 2;
237
		int cmp = git_oid__hashcmp(base + mi * stride, key);
238 239 240 241 242 243 244 245 246 247 248 249

		if (!cmp)
			return mi;

		if (cmp > 0)
			hi = mi;
		else
			lo = mi+1;
	} while (lo < hi);

	return -((int)lo)-1;
}